aboutsummaryrefslogtreecommitdiffstatshomepage
path: root/arch/x86/kernel/cpu
diff options
context:
space:
mode:
Diffstat (limited to 'arch/x86/kernel/cpu')
-rw-r--r--arch/x86/kernel/cpu/Makefile32
-rw-r--r--arch/x86/kernel/cpu/acrn.c9
-rw-r--r--arch/x86/kernel/cpu/amd.c707
-rw-r--r--arch/x86/kernel/cpu/amd_cache_disable.c301
-rw-r--r--arch/x86/kernel/cpu/aperfmperf.c153
-rw-r--r--arch/x86/kernel/cpu/bugs.c2186
-rw-r--r--arch/x86/kernel/cpu/bus_lock.c432
-rw-r--r--arch/x86/kernel/cpu/cacheinfo.c1282
-rw-r--r--arch/x86/kernel/cpu/centaur.c4
-rw-r--r--arch/x86/kernel/cpu/common.c1417
-rw-r--r--arch/x86/kernel/cpu/cpu.h43
-rw-r--r--arch/x86/kernel/cpu/cpuid-deps.c55
-rw-r--r--arch/x86/kernel/cpu/cpuid_0x2_table.c128
-rw-r--r--arch/x86/kernel/cpu/cyrix.c5
-rw-r--r--arch/x86/kernel/cpu/debugfs.c101
-rw-r--r--arch/x86/kernel/cpu/feat_ctl.c20
-rw-r--r--arch/x86/kernel/cpu/hygon.c176
-rw-r--r--arch/x86/kernel/cpu/intel.c1042
-rw-r--r--arch/x86/kernel/cpu/intel_epb.c21
-rw-r--r--arch/x86/kernel/cpu/intel_pconfig.c82
-rw-r--r--arch/x86/kernel/cpu/match.c69
-rw-r--r--arch/x86/kernel/cpu/mce/amd.c392
-rw-r--r--arch/x86/kernel/cpu/mce/apei.c132
-rw-r--r--arch/x86/kernel/cpu/mce/core.c799
-rw-r--r--arch/x86/kernel/cpu/mce/dev-mcelog.c17
-rw-r--r--arch/x86/kernel/cpu/mce/genpool.c75
-rw-r--r--arch/x86/kernel/cpu/mce/inject.c57
-rw-r--r--arch/x86/kernel/cpu/mce/intel.c397
-rw-r--r--arch/x86/kernel/cpu/mce/internal.h140
-rw-r--r--arch/x86/kernel/cpu/mce/severity.c46
-rw-r--r--arch/x86/kernel/cpu/mce/threshold.c117
-rw-r--r--arch/x86/kernel/cpu/microcode/Makefile4
-rw-r--r--arch/x86/kernel/cpu/microcode/amd.c804
-rw-r--r--arch/x86/kernel/cpu/microcode/amd_shas.c444
-rw-r--r--arch/x86/kernel/cpu/microcode/core.c906
-rw-r--r--arch/x86/kernel/cpu/microcode/intel-ucode-defs.h150
-rw-r--r--arch/x86/kernel/cpu/microcode/intel.c880
-rw-r--r--arch/x86/kernel/cpu/microcode/internal.h125
-rw-r--r--arch/x86/kernel/cpu/mkcapflags.sh3
-rw-r--r--arch/x86/kernel/cpu/mshyperv.c377
-rw-r--r--arch/x86/kernel/cpu/mtrr/Makefile2
-rw-r--r--arch/x86/kernel/cpu/mtrr/amd.c10
-rw-r--r--arch/x86/kernel/cpu/mtrr/centaur.c19
-rw-r--r--arch/x86/kernel/cpu/mtrr/cleanup.c87
-rw-r--r--arch/x86/kernel/cpu/mtrr/cyrix.c44
-rw-r--r--arch/x86/kernel/cpu/mtrr/generic.c804
-rw-r--r--arch/x86/kernel/cpu/mtrr/if.c6
-rw-r--r--arch/x86/kernel/cpu/mtrr/legacy.c90
-rw-r--r--arch/x86/kernel/cpu/mtrr/mtrr.c363
-rw-r--r--arch/x86/kernel/cpu/mtrr/mtrr.h48
-rw-r--r--arch/x86/kernel/cpu/proc.c48
-rw-r--r--arch/x86/kernel/cpu/rdrand.c1
-rw-r--r--arch/x86/kernel/cpu/resctrl/Makefile7
-rw-r--r--arch/x86/kernel/cpu/resctrl/core.c744
-rw-r--r--arch/x86/kernel/cpu/resctrl/ctrlmondata.c530
-rw-r--r--arch/x86/kernel/cpu/resctrl/internal.h488
-rw-r--r--arch/x86/kernel/cpu/resctrl/monitor.c732
-rw-r--r--arch/x86/kernel/cpu/resctrl/pseudo_lock.c1193
-rw-r--r--arch/x86/kernel/cpu/resctrl/pseudo_lock_trace.h (renamed from arch/x86/kernel/cpu/resctrl/pseudo_lock_event.h)10
-rw-r--r--arch/x86/kernel/cpu/resctrl/rdtgroup.c3179
-rw-r--r--arch/x86/kernel/cpu/scattered.c52
-rw-r--r--arch/x86/kernel/cpu/sgx/driver.c14
-rw-r--r--arch/x86/kernel/cpu/sgx/driver.h1
-rw-r--r--arch/x86/kernel/cpu/sgx/encl.c422
-rw-r--r--arch/x86/kernel/cpu/sgx/encl.h18
-rw-r--r--arch/x86/kernel/cpu/sgx/encls.h33
-rw-r--r--arch/x86/kernel/cpu/sgx/ioctl.c691
-rw-r--r--arch/x86/kernel/cpu/sgx/main.c153
-rw-r--r--arch/x86/kernel/cpu/sgx/sgx.h5
-rw-r--r--arch/x86/kernel/cpu/sgx/virt.c5
-rw-r--r--arch/x86/kernel/cpu/topology.c627
-rw-r--r--arch/x86/kernel/cpu/topology.h67
-rw-r--r--arch/x86/kernel/cpu/topology_amd.c221
-rw-r--r--arch/x86/kernel/cpu/topology_common.c255
-rw-r--r--arch/x86/kernel/cpu/topology_ext.c145
-rw-r--r--arch/x86/kernel/cpu/tsx.c60
-rw-r--r--arch/x86/kernel/cpu/umwait.c14
-rw-r--r--arch/x86/kernel/cpu/vmware.c231
-rw-r--r--arch/x86/kernel/cpu/zhaoxin.c19
79 files changed, 12576 insertions, 12992 deletions
diff --git a/arch/x86/kernel/cpu/Makefile b/arch/x86/kernel/cpu/Makefile
index 9661e3e802be..1e26179ff18c 100644
--- a/arch/x86/kernel/cpu/Makefile
+++ b/arch/x86/kernel/cpu/Makefile
@@ -12,31 +12,35 @@ endif
# If these files are instrumented, boot hangs during the first second.
KCOV_INSTRUMENT_common.o := n
KCOV_INSTRUMENT_perf_event.o := n
+KMSAN_SANITIZE_common.o := n
# As above, instrumenting secondary CPU boot code causes boot hangs.
KCSAN_SANITIZE_common.o := n
-# Make sure load_percpu_segment has no stackprotector
-CFLAGS_common.o := -fno-stack-protector
-
-obj-y := cacheinfo.o scattered.o topology.o
+obj-y := cacheinfo.o scattered.o
+obj-y += topology_common.o topology_ext.o topology_amd.o
obj-y += common.o
obj-y += rdrand.o
obj-y += match.o
obj-y += bugs.o
obj-y += aperfmperf.o
-obj-y += cpuid-deps.o
+obj-y += cpuid-deps.o cpuid_0x2_table.o
obj-y += umwait.o
+obj-y += capflags.o powerflags.o
+
+obj-$(CONFIG_X86_LOCAL_APIC) += topology.o
-obj-$(CONFIG_PROC_FS) += proc.o
-obj-$(CONFIG_X86_FEATURE_NAMES) += capflags.o powerflags.o
+obj-$(CONFIG_PROC_FS) += proc.o
-obj-$(CONFIG_IA32_FEAT_CTL) += feat_ctl.o
+obj-$(CONFIG_IA32_FEAT_CTL) += feat_ctl.o
ifdef CONFIG_CPU_SUP_INTEL
-obj-y += intel.o intel_pconfig.o tsx.o
-obj-$(CONFIG_PM) += intel_epb.o
+obj-y += intel.o tsx.o
+obj-$(CONFIG_PM) += intel_epb.o
endif
obj-$(CONFIG_CPU_SUP_AMD) += amd.o
+ifeq ($(CONFIG_AMD_NB)$(CONFIG_SYSFS),yy)
+obj-y += amd_cache_disable.o
+endif
obj-$(CONFIG_CPU_SUP_HYGON) += hygon.o
obj-$(CONFIG_CPU_SUP_CYRIX_32) += cyrix.o
obj-$(CONFIG_CPU_SUP_CENTAUR) += centaur.o
@@ -56,14 +60,16 @@ obj-$(CONFIG_X86_LOCAL_APIC) += perfctr-watchdog.o
obj-$(CONFIG_HYPERVISOR_GUEST) += vmware.o hypervisor.o mshyperv.o
obj-$(CONFIG_ACRN_GUEST) += acrn.o
-ifdef CONFIG_X86_FEATURE_NAMES
+obj-$(CONFIG_DEBUG_FS) += debugfs.o
+
+obj-$(CONFIG_X86_BUS_LOCK_DETECT) += bus_lock.o
+
quiet_cmd_mkcapflags = MKCAP $@
- cmd_mkcapflags = $(CONFIG_SHELL) $(srctree)/$(src)/mkcapflags.sh $@ $^
+ cmd_mkcapflags = $(CONFIG_SHELL) $(src)/mkcapflags.sh $@ $^
cpufeature = $(src)/../../include/asm/cpufeatures.h
vmxfeature = $(src)/../../include/asm/vmxfeatures.h
$(obj)/capflags.c: $(cpufeature) $(vmxfeature) $(src)/mkcapflags.sh FORCE
$(call if_changed,mkcapflags)
-endif
targets += capflags.c
diff --git a/arch/x86/kernel/cpu/acrn.c b/arch/x86/kernel/cpu/acrn.c
index 23f5f27b5a02..2c5b51aad91a 100644
--- a/arch/x86/kernel/cpu/acrn.c
+++ b/arch/x86/kernel/cpu/acrn.c
@@ -26,8 +26,11 @@ static u32 __init acrn_detect(void)
static void __init acrn_init_platform(void)
{
- /* Setup the IDT for ACRN hypervisor callback */
- alloc_intr_gate(HYPERVISOR_CALLBACK_VECTOR, asm_sysvec_acrn_hv_callback);
+ /* Install system interrupt handler for ACRN hypervisor callback */
+ sysvec_install(HYPERVISOR_CALLBACK_VECTOR, sysvec_acrn_hv_callback);
+
+ x86_platform.calibrate_tsc = acrn_get_tsc_khz;
+ x86_platform.calibrate_cpu = acrn_get_tsc_khz;
}
static bool acrn_x2apic_available(void)
@@ -48,7 +51,7 @@ DEFINE_IDTENTRY_SYSVEC(sysvec_acrn_hv_callback)
* will block the interrupt whose vector is lower than
* HYPERVISOR_CALLBACK_VECTOR.
*/
- ack_APIC_irq();
+ apic_eoi();
inc_irq_stat(irq_hv_callback_count);
if (acrn_intr_handler)
diff --git a/arch/x86/kernel/cpu/amd.c b/arch/x86/kernel/cpu/amd.c
index 48276c0e479d..b2ad8d13211a 100644
--- a/arch/x86/kernel/cpu/amd.c
+++ b/arch/x86/kernel/cpu/amd.c
@@ -9,10 +9,12 @@
#include <linux/sched/clock.h>
#include <linux/random.h>
#include <linux/topology.h>
+#include <asm/amd/fch.h>
#include <asm/processor.h>
#include <asm/apic.h>
#include <asm/cacheinfo.h>
#include <asm/cpu.h>
+#include <asm/cpu_device_id.h>
#include <asm/spec-ctrl.h>
#include <asm/smp.h>
#include <asm/numa.h>
@@ -20,6 +22,8 @@
#include <asm/delay.h>
#include <asm/debugreg.h>
#include <asm/resctrl.h>
+#include <asm/msr.h>
+#include <asm/sev.h>
#ifdef CONFIG_X86_64
# include <asm/mmconfig.h>
@@ -27,19 +31,9 @@
#include "cpu.h"
-static const int amd_erratum_383[];
-static const int amd_erratum_400[];
-static const int amd_erratum_1054[];
-static bool cpu_has_amd_erratum(struct cpuinfo_x86 *cpu, const int *erratum);
+u16 invlpgb_count_max __ro_after_init = 1;
-/*
- * nodes_per_socket: Stores the number of nodes per socket.
- * Refer to Fam15h Models 00-0fh BKDG - CPUID Fn8000_001E_ECX
- * Node Identifiers[10:8]
- */
-static u32 nodes_per_socket = 1;
-
-static inline int rdmsrl_amd_safe(unsigned msr, unsigned long long *p)
+static inline int rdmsrq_amd_safe(unsigned msr, u64 *p)
{
u32 gprs[8] = { 0 };
int err;
@@ -57,7 +51,7 @@ static inline int rdmsrl_amd_safe(unsigned msr, unsigned long long *p)
return err;
}
-static inline int wrmsrl_amd_safe(unsigned msr, unsigned long long val)
+static inline int wrmsrq_amd_safe(unsigned msr, u64 val)
{
u32 gprs[8] = { 0 };
@@ -305,111 +299,16 @@ static int nearby_node(int apicid)
}
#endif
-/*
- * Fix up cpu_core_id for pre-F17h systems to be in the
- * [0 .. cores_per_node - 1] range. Not really needed but
- * kept so as not to break existing setups.
- */
-static void legacy_fixup_core_id(struct cpuinfo_x86 *c)
-{
- u32 cus_per_node;
-
- if (c->x86 >= 0x17)
- return;
-
- cus_per_node = c->x86_max_cores / nodes_per_socket;
- c->cpu_core_id %= cus_per_node;
-}
-
-/*
- * Fixup core topology information for
- * (1) AMD multi-node processors
- * Assumption: Number of cores in each internal node is the same.
- * (2) AMD processors supporting compute units
- */
-static void amd_get_topology(struct cpuinfo_x86 *c)
-{
- int cpu = smp_processor_id();
-
- /* get information required for multi-node processors */
- if (boot_cpu_has(X86_FEATURE_TOPOEXT)) {
- int err;
- u32 eax, ebx, ecx, edx;
-
- cpuid(0x8000001e, &eax, &ebx, &ecx, &edx);
-
- c->cpu_die_id = ecx & 0xff;
-
- if (c->x86 == 0x15)
- c->cu_id = ebx & 0xff;
-
- if (c->x86 >= 0x17) {
- c->cpu_core_id = ebx & 0xff;
-
- if (smp_num_siblings > 1)
- c->x86_max_cores /= smp_num_siblings;
- }
-
- /*
- * In case leaf B is available, use it to derive
- * topology information.
- */
- err = detect_extended_topology(c);
- if (!err)
- c->x86_coreid_bits = get_count_order(c->x86_max_cores);
-
- cacheinfo_amd_init_llc_id(c, cpu);
-
- } else if (cpu_has(c, X86_FEATURE_NODEID_MSR)) {
- u64 value;
-
- rdmsrl(MSR_FAM10H_NODE_ID, value);
- c->cpu_die_id = value & 7;
-
- per_cpu(cpu_llc_id, cpu) = c->cpu_die_id;
- } else
- return;
-
- if (nodes_per_socket > 1) {
- set_cpu_cap(c, X86_FEATURE_AMD_DCM);
- legacy_fixup_core_id(c);
- }
-}
-
-/*
- * On a AMD dual core setup the lower bits of the APIC id distinguish the cores.
- * Assumes number of cores is a power of two.
- */
-static void amd_detect_cmp(struct cpuinfo_x86 *c)
-{
- unsigned bits;
- int cpu = smp_processor_id();
-
- bits = c->x86_coreid_bits;
- /* Low order bits define the core id (index of core in socket) */
- c->cpu_core_id = c->initial_apicid & ((1 << bits)-1);
- /* Convert the initial APIC ID into the socket ID */
- c->phys_proc_id = c->initial_apicid >> bits;
- /* use socket ID also for last level cache */
- per_cpu(cpu_llc_id, cpu) = c->cpu_die_id = c->phys_proc_id;
-}
-
-u32 amd_get_nodes_per_socket(void)
-{
- return nodes_per_socket;
-}
-EXPORT_SYMBOL_GPL(amd_get_nodes_per_socket);
-
static void srat_detect_node(struct cpuinfo_x86 *c)
{
#ifdef CONFIG_NUMA
int cpu = smp_processor_id();
int node;
- unsigned apicid = c->apicid;
+ unsigned apicid = c->topo.apicid;
node = numa_cpu_node(cpu);
if (node == NUMA_NO_NODE)
- node = get_llc_id(cpu);
+ node = per_cpu_llc_id(cpu);
/*
* On multi-fabric platform (e.g. Numascale NumaChip) a
@@ -439,7 +338,7 @@ static void srat_detect_node(struct cpuinfo_x86 *c)
* through CPU mapping may alter the outcome, directly
* access __apicid_to_node[].
*/
- int ht_nodeid = c->initial_apicid;
+ int ht_nodeid = c->topo.initial_apicid;
if (__apicid_to_node[ht_nodeid] != NUMA_NO_NODE)
node = __apicid_to_node[ht_nodeid];
@@ -451,29 +350,30 @@ static void srat_detect_node(struct cpuinfo_x86 *c)
#endif
}
-static void early_init_amd_mc(struct cpuinfo_x86 *c)
+static void bsp_determine_snp(struct cpuinfo_x86 *c)
{
-#ifdef CONFIG_SMP
- unsigned bits, ecx;
-
- /* Multi core CPU? */
- if (c->extended_cpuid_level < 0x80000008)
- return;
-
- ecx = cpuid_ecx(0x80000008);
+#ifdef CONFIG_ARCH_HAS_CC_PLATFORM
+ cc_vendor = CC_VENDOR_AMD;
- c->x86_max_cores = (ecx & 0xff) + 1;
-
- /* CPU telling us the core id bits shift? */
- bits = (ecx >> 12) & 0xF;
-
- /* Otherwise recompute */
- if (bits == 0) {
- while ((1 << bits) < c->x86_max_cores)
- bits++;
+ if (cpu_has(c, X86_FEATURE_SEV_SNP)) {
+ /*
+ * RMP table entry format is not architectural and is defined by the
+ * per-processor PPR. Restrict SNP support on the known CPU models
+ * for which the RMP table entry format is currently defined or for
+ * processors which support the architecturally defined RMPREAD
+ * instruction.
+ */
+ if (!cpu_has(c, X86_FEATURE_HYPERVISOR) &&
+ (cpu_feature_enabled(X86_FEATURE_ZEN3) ||
+ cpu_feature_enabled(X86_FEATURE_ZEN4) ||
+ cpu_feature_enabled(X86_FEATURE_RMPREAD)) &&
+ snp_probe_rmptable_info()) {
+ cc_platform_set(CC_ATTR_HOST_SEV_SNP);
+ } else {
+ setup_clear_cpu_cap(X86_FEATURE_SEV_SNP);
+ cc_platform_clear(CC_ATTR_HOST_SEV_SNP);
+ }
}
-
- c->x86_coreid_bits = bits;
#endif
}
@@ -485,7 +385,7 @@ static void bsp_init_amd(struct cpuinfo_x86 *c)
(c->x86 == 0x10 && c->x86_model >= 0x2)) {
u64 val;
- rdmsrl(MSR_K7_HWCR, val);
+ rdmsrq(MSR_K7_HWCR, val);
if (!(val & BIT(24)))
pr_warn(FW_BUG "TSC doesn't count with P0 frequency!\n");
}
@@ -503,24 +403,12 @@ static void bsp_init_amd(struct cpuinfo_x86 *c)
va_align.flags = ALIGN_VA_32 | ALIGN_VA_64;
/* A random value per boot for bit slice [12:upper_bit) */
- va_align.bits = get_random_int() & va_align.mask;
+ va_align.bits = get_random_u32() & va_align.mask;
}
if (cpu_has(c, X86_FEATURE_MWAITX))
use_mwaitx_delay();
- if (boot_cpu_has(X86_FEATURE_TOPOEXT)) {
- u32 ecx;
-
- ecx = cpuid_ecx(0x8000001e);
- __max_die_per_package = nodes_per_socket = ((ecx >> 8) & 7) + 1;
- } else if (boot_cpu_has(X86_FEATURE_NODEID_MSR)) {
- u64 value;
-
- rdmsrl(MSR_FAM10H_NODE_ID, value);
- __max_die_per_package = nodes_per_socket = ((value >> 3) & 7) + 1;
- }
-
if (!boot_cpu_has(X86_FEATURE_AMD_SSBD) &&
!boot_cpu_has(X86_FEATURE_VIRT_SSBD) &&
c->x86 >= 0x15 && c->x86 <= 0x17) {
@@ -536,7 +424,7 @@ static void bsp_init_amd(struct cpuinfo_x86 *c)
* Try to cache the base value so further operations can
* avoid RMW. If that faults, do not enable SSBD.
*/
- if (!rdmsrl_safe(MSR_AMD64_LS_CFG, &x86_amd_ls_cfg_base)) {
+ if (!rdmsrq_safe(MSR_AMD64_LS_CFG, &x86_amd_ls_cfg_base)) {
setup_force_cpu_cap(X86_FEATURE_LS_CFG_SSBD);
setup_force_cpu_cap(X86_FEATURE_SSBD);
x86_amd_ls_cfg_ssbd_mask = 1ULL << bit;
@@ -544,6 +432,67 @@ static void bsp_init_amd(struct cpuinfo_x86 *c)
}
resctrl_cpu_detect(c);
+
+ /* Figure out Zen generations: */
+ switch (c->x86) {
+ case 0x17:
+ switch (c->x86_model) {
+ case 0x00 ... 0x2f:
+ case 0x50 ... 0x5f:
+ setup_force_cpu_cap(X86_FEATURE_ZEN1);
+ break;
+ case 0x30 ... 0x4f:
+ case 0x60 ... 0x7f:
+ case 0x90 ... 0x91:
+ case 0xa0 ... 0xaf:
+ setup_force_cpu_cap(X86_FEATURE_ZEN2);
+ break;
+ default:
+ goto warn;
+ }
+ break;
+
+ case 0x19:
+ switch (c->x86_model) {
+ case 0x00 ... 0x0f:
+ case 0x20 ... 0x5f:
+ setup_force_cpu_cap(X86_FEATURE_ZEN3);
+ break;
+ case 0x10 ... 0x1f:
+ case 0x60 ... 0xaf:
+ setup_force_cpu_cap(X86_FEATURE_ZEN4);
+ break;
+ default:
+ goto warn;
+ }
+ break;
+
+ case 0x1a:
+ switch (c->x86_model) {
+ case 0x00 ... 0x2f:
+ case 0x40 ... 0x4f:
+ case 0x60 ... 0x7f:
+ setup_force_cpu_cap(X86_FEATURE_ZEN5);
+ break;
+ case 0x50 ... 0x5f:
+ case 0x90 ... 0xaf:
+ case 0xc0 ... 0xcf:
+ setup_force_cpu_cap(X86_FEATURE_ZEN6);
+ break;
+ default:
+ goto warn;
+ }
+ break;
+
+ default:
+ break;
+ }
+
+ bsp_determine_snp(c);
+ return;
+
+warn:
+ WARN_ONCE(1, "Family 0x%x, model: 0x%x??\n", c->x86, c->x86_model);
}
static void early_detect_mem_encrypt(struct cpuinfo_x86 *c)
@@ -558,15 +507,15 @@ static void early_detect_mem_encrypt(struct cpuinfo_x86 *c)
* SME feature (set in scattered.c).
* If the kernel has not enabled SME via any means then
* don't advertise the SME feature.
- * For SEV: If BIOS has not enabled SEV then don't advertise the
- * SEV and SEV_ES feature (set in scattered.c).
+ * For SEV: If BIOS has not enabled SEV then don't advertise SEV and
+ * any additional functionality based on it.
*
* In all cases, since support for SME and SEV requires long mode,
* don't advertise the feature under CONFIG_X86_32.
*/
if (cpu_has(c, X86_FEATURE_SME) || cpu_has(c, X86_FEATURE_SEV)) {
/* Check if memory encryption is enabled */
- rdmsrl(MSR_AMD64_SYSCFG, msr);
+ rdmsrq(MSR_AMD64_SYSCFG, msr);
if (!(msr & MSR_AMD64_SYSCFG_MEM_ENCRYPT))
goto clear_all;
@@ -583,7 +532,7 @@ static void early_detect_mem_encrypt(struct cpuinfo_x86 *c)
if (!sme_me_mask)
setup_clear_cpu_cap(X86_FEATURE_SME);
- rdmsrl(MSR_K7_HWCR, msr);
+ rdmsrq(MSR_K7_HWCR, msr);
if (!(msr & MSR_K7_HWCR_SMMLOCK))
goto clear_sev;
@@ -594,16 +543,14 @@ clear_all:
clear_sev:
setup_clear_cpu_cap(X86_FEATURE_SEV);
setup_clear_cpu_cap(X86_FEATURE_SEV_ES);
+ setup_clear_cpu_cap(X86_FEATURE_SEV_SNP);
}
}
static void early_init_amd(struct cpuinfo_x86 *c)
{
- u64 value;
u32 dummy;
- early_init_amd_mc(c);
-
if (c->x86 >= 0xf)
set_cpu_cap(c, X86_FEATURE_K8);
@@ -667,33 +614,16 @@ static void early_init_amd(struct cpuinfo_x86 *c)
if (c->x86 == 0x16 && c->x86_model <= 0xf)
msr_set_bit(MSR_AMD64_LS_CFG, 15);
- /*
- * Check whether the machine is affected by erratum 400. This is
- * used to select the proper idle routine and to enable the check
- * whether the machine is affected in arch_post_acpi_init(), which
- * sets the X86_BUG_AMD_APIC_C1E bug depending on the MSR check.
- */
- if (cpu_has_amd_erratum(c, amd_erratum_400))
- set_cpu_bug(c, X86_BUG_AMD_E400);
-
early_detect_mem_encrypt(c);
- /* Re-enable TopologyExtensions if switched off by BIOS */
- if (c->x86 == 0x15 &&
- (c->x86_model >= 0x10 && c->x86_model <= 0x6f) &&
- !cpu_has(c, X86_FEATURE_TOPOEXT)) {
-
- if (msr_set_bit(0xc0011005, 54) > 0) {
- rdmsrl(0xc0011005, value);
- if (value & BIT_64(54)) {
- set_cpu_cap(c, X86_FEATURE_TOPOEXT);
- pr_info_once(FW_INFO "CPU: Re-enabling disabled Topology Extensions Support.\n");
- }
+ if (!cpu_has(c, X86_FEATURE_HYPERVISOR) && !cpu_has(c, X86_FEATURE_IBPB_BRTYPE)) {
+ if (c->x86 == 0x17 && boot_cpu_has(X86_FEATURE_AMD_IBPB))
+ setup_force_cpu_cap(X86_FEATURE_IBPB_BRTYPE);
+ else if (c->x86 >= 0x19 && !wrmsrq_safe(MSR_IA32_PRED_CMD, PRED_CMD_SBPB)) {
+ setup_force_cpu_cap(X86_FEATURE_IBPB_BRTYPE);
+ setup_force_cpu_cap(X86_FEATURE_SBPB);
}
}
-
- if (cpu_has(c, X86_FEATURE_TOPOEXT))
- smp_num_siblings = ((cpuid_ebx(0x8000001e) >> 8) & 0xff) + 1;
}
static void init_amd_k8(struct cpuinfo_x86 *c)
@@ -711,16 +641,16 @@ static void init_amd_k8(struct cpuinfo_x86 *c)
* (model = 0x14) and later actually support it.
* (AMD Erratum #110, docId: 25759).
*/
- if (c->x86_model < 0x14 && cpu_has(c, X86_FEATURE_LAHF_LM)) {
+ if (c->x86_model < 0x14 && cpu_has(c, X86_FEATURE_LAHF_LM) && !cpu_has(c, X86_FEATURE_HYPERVISOR)) {
clear_cpu_cap(c, X86_FEATURE_LAHF_LM);
- if (!rdmsrl_amd_safe(0xc001100d, &value)) {
+ if (!rdmsrq_amd_safe(0xc001100d, &value)) {
value &= ~BIT_64(32);
- wrmsrl_amd_safe(0xc001100d, value);
+ wrmsrq_amd_safe(0xc001100d, value);
}
}
if (!c->x86_model_id[0])
- strcpy(c->x86_model_id, "Hammer");
+ strscpy(c->x86_model_id, "Hammer");
#ifdef CONFIG_SMP
/*
@@ -733,6 +663,16 @@ static void init_amd_k8(struct cpuinfo_x86 *c)
msr_set_bit(MSR_K7_HWCR, 6);
#endif
set_cpu_bug(c, X86_BUG_SWAPGS_FENCE);
+
+ /*
+ * Check models and steppings affected by erratum 400. This is
+ * used to select the proper idle routine and to enable the
+ * check whether the machine is affected in arch_post_acpi_subsys_init()
+ * which sets the X86_BUG_AMD_APIC_C1E bug depending on the MSR check.
+ */
+ if (c->x86_model > 0x41 ||
+ (c->x86_model == 0x41 && c->x86_stepping >= 0x2))
+ setup_force_cpu_bug(X86_BUG_AMD_E400);
}
static void init_amd_gh(struct cpuinfo_x86 *c)
@@ -766,11 +706,18 @@ static void init_amd_gh(struct cpuinfo_x86 *c)
*/
msr_clear_bit(MSR_AMD64_BU_CFG2, 24);
- if (cpu_has_amd_erratum(c, amd_erratum_383))
- set_cpu_bug(c, X86_BUG_AMD_TLB_MMATCH);
-}
+ set_cpu_bug(c, X86_BUG_AMD_TLB_MMATCH);
-#define MSR_AMD64_DE_CFG 0xC0011029
+ /*
+ * Check models and steppings affected by erratum 400. This is
+ * used to select the proper idle routine and to enable the
+ * check whether the machine is affected in arch_post_acpi_subsys_init()
+ * which sets the X86_BUG_AMD_APIC_C1E bug depending on the MSR check.
+ */
+ if (c->x86_model > 0x2 ||
+ (c->x86_model == 0x2 && c->x86_stepping >= 0x1))
+ setup_force_cpu_bug(X86_BUG_AMD_E400);
+}
static void init_amd_ln(struct cpuinfo_x86 *c)
{
@@ -848,9 +795,9 @@ static void init_amd_bd(struct cpuinfo_x86 *c)
* Disable it on the affected CPUs.
*/
if ((c->x86_model >= 0x02) && (c->x86_model < 0x20)) {
- if (!rdmsrl_safe(MSR_F15H_IC_CFG, &value) && !(value & 0x1E)) {
+ if (!rdmsrq_safe(MSR_F15H_IC_CFG, &value) && !(value & 0x1E)) {
value |= 0x1E;
- wrmsrl_safe(MSR_F15H_IC_CFG, value);
+ wrmsrq_safe(MSR_F15H_IC_CFG, value);
}
}
@@ -862,9 +809,34 @@ static void init_amd_bd(struct cpuinfo_x86 *c)
clear_rdrand_cpuid_bit(c);
}
+static const struct x86_cpu_id erratum_1386_microcode[] = {
+ X86_MATCH_VFM_STEPS(VFM_MAKE(X86_VENDOR_AMD, 0x17, 0x01), 0x2, 0x2, 0x0800126e),
+ X86_MATCH_VFM_STEPS(VFM_MAKE(X86_VENDOR_AMD, 0x17, 0x31), 0x0, 0x0, 0x08301052),
+ {}
+};
+
+static void fix_erratum_1386(struct cpuinfo_x86 *c)
+{
+ /*
+ * Work around Erratum 1386. The XSAVES instruction malfunctions in
+ * certain circumstances on Zen1/2 uarch, and not all parts have had
+ * updated microcode at the time of writing (March 2023).
+ *
+ * Affected parts all have no supervisor XSAVE states, meaning that
+ * the XSAVEC instruction (which works fine) is equivalent.
+ *
+ * Clear the feature flag only on microcode revisions which
+ * don't have the fix.
+ */
+ if (x86_match_min_microcode_rev(erratum_1386_microcode))
+ return;
+
+ clear_cpu_cap(c, X86_FEATURE_XSAVES);
+}
+
void init_spectral_chicken(struct cpuinfo_x86 *c)
{
-#ifdef CONFIG_CPU_UNRET_ENTRY
+#ifdef CONFIG_MITIGATION_UNRET_ENTRY
u64 value;
/*
@@ -872,25 +844,27 @@ void init_spectral_chicken(struct cpuinfo_x86 *c)
*
* This suppresses speculation from the middle of a basic block, i.e. it
* suppresses non-branch predictions.
- *
- * We use STIBP as a heuristic to filter out Zen2 from the rest of F17H
*/
- if (!cpu_has(c, X86_FEATURE_HYPERVISOR) && cpu_has(c, X86_FEATURE_AMD_STIBP)) {
- if (!rdmsrl_safe(MSR_ZEN2_SPECTRAL_CHICKEN, &value)) {
+ if (!cpu_has(c, X86_FEATURE_HYPERVISOR)) {
+ if (!rdmsrq_safe(MSR_ZEN2_SPECTRAL_CHICKEN, &value)) {
value |= MSR_ZEN2_SPECTRAL_CHICKEN_BIT;
- wrmsrl_safe(MSR_ZEN2_SPECTRAL_CHICKEN, value);
+ wrmsrq_safe(MSR_ZEN2_SPECTRAL_CHICKEN, value);
}
}
#endif
}
-static void init_amd_zn(struct cpuinfo_x86 *c)
+static void init_amd_zen_common(void)
{
- set_cpu_cap(c, X86_FEATURE_ZEN);
-
+ setup_force_cpu_cap(X86_FEATURE_ZEN);
#ifdef CONFIG_NUMA
node_reclaim_distance = 32;
#endif
+}
+
+static void init_amd_zen1(struct cpuinfo_x86 *c)
+{
+ fix_erratum_1386(c);
/* Fix up CPUID bits, but only if not virtualised. */
if (!cpu_has(c, X86_FEATURE_HYPERVISOR)) {
@@ -898,19 +872,104 @@ static void init_amd_zn(struct cpuinfo_x86 *c)
/* Erratum 1076: CPB feature bit not being set in CPUID. */
if (!cpu_has(c, X86_FEATURE_CPB))
set_cpu_cap(c, X86_FEATURE_CPB);
+ }
+
+ pr_notice_once("AMD Zen1 DIV0 bug detected. Disable SMT for full protection.\n");
+ setup_force_cpu_bug(X86_BUG_DIV0);
+
+ /*
+ * Turn off the Instructions Retired free counter on machines that are
+ * susceptible to erratum #1054 "Instructions Retired Performance
+ * Counter May Be Inaccurate".
+ */
+ if (c->x86_model < 0x30) {
+ msr_clear_bit(MSR_K7_HWCR, MSR_K7_HWCR_IRPERF_EN_BIT);
+ clear_cpu_cap(c, X86_FEATURE_IRPERF);
+ }
+}
+
+static bool cpu_has_zenbleed_microcode(void)
+{
+ u32 good_rev = 0;
+
+ switch (boot_cpu_data.x86_model) {
+ case 0x30 ... 0x3f: good_rev = 0x0830107b; break;
+ case 0x60 ... 0x67: good_rev = 0x0860010c; break;
+ case 0x68 ... 0x6f: good_rev = 0x08608107; break;
+ case 0x70 ... 0x7f: good_rev = 0x08701033; break;
+ case 0xa0 ... 0xaf: good_rev = 0x08a00009; break;
+
+ default:
+ return false;
+ }
+
+ if (boot_cpu_data.microcode < good_rev)
+ return false;
+
+ return true;
+}
+static void zen2_zenbleed_check(struct cpuinfo_x86 *c)
+{
+ if (cpu_has(c, X86_FEATURE_HYPERVISOR))
+ return;
+
+ if (!cpu_has(c, X86_FEATURE_AVX))
+ return;
+
+ if (!cpu_has_zenbleed_microcode()) {
+ pr_notice_once("Zenbleed: please update your microcode for the most optimal fix\n");
+ msr_set_bit(MSR_AMD64_DE_CFG, MSR_AMD64_DE_CFG_ZEN2_FP_BACKUP_FIX_BIT);
+ } else {
+ msr_clear_bit(MSR_AMD64_DE_CFG, MSR_AMD64_DE_CFG_ZEN2_FP_BACKUP_FIX_BIT);
+ }
+}
+
+static void init_amd_zen2(struct cpuinfo_x86 *c)
+{
+ init_spectral_chicken(c);
+ fix_erratum_1386(c);
+ zen2_zenbleed_check(c);
+}
+
+static void init_amd_zen3(struct cpuinfo_x86 *c)
+{
+ if (!cpu_has(c, X86_FEATURE_HYPERVISOR)) {
/*
* Zen3 (Fam19 model < 0x10) parts are not susceptible to
* Branch Type Confusion, but predate the allocation of the
* BTC_NO bit.
*/
- if (c->x86 == 0x19 && !cpu_has(c, X86_FEATURE_BTC_NO))
+ if (!cpu_has(c, X86_FEATURE_BTC_NO))
set_cpu_cap(c, X86_FEATURE_BTC_NO);
}
}
+static void init_amd_zen4(struct cpuinfo_x86 *c)
+{
+ if (!cpu_has(c, X86_FEATURE_HYPERVISOR))
+ msr_set_bit(MSR_ZEN4_BP_CFG, MSR_ZEN4_BP_CFG_SHARED_BTB_FIX_BIT);
+
+ /*
+ * These Zen4 SoCs advertise support for virtualized VMLOAD/VMSAVE
+ * in some BIOS versions but they can lead to random host reboots.
+ */
+ switch (c->x86_model) {
+ case 0x18 ... 0x1f:
+ case 0x60 ... 0x7f:
+ clear_cpu_cap(c, X86_FEATURE_V_VMSAVE_VMLOAD);
+ break;
+ }
+}
+
+static void init_amd_zen5(struct cpuinfo_x86 *c)
+{
+}
+
static void init_amd(struct cpuinfo_x86 *c)
{
+ u64 vm_cr;
+
early_init_amd(c);
/*
@@ -922,8 +981,9 @@ static void init_amd(struct cpuinfo_x86 *c)
if (c->x86 >= 0x10)
set_cpu_cap(c, X86_FEATURE_REP_GOOD);
- /* get apicid instead of initial apic id from cpuid */
- c->apicid = hard_smp_processor_id();
+ /* AMD FSRM also implies FSRS */
+ if (cpu_has(c, X86_FEATURE_FSRM))
+ set_cpu_cap(c, X86_FEATURE_FSRS);
/* K6s reports MCEs but don't actually have all the MSRs */
if (c->x86 < 6)
@@ -938,12 +998,27 @@ static void init_amd(struct cpuinfo_x86 *c)
case 0x12: init_amd_ln(c); break;
case 0x15: init_amd_bd(c); break;
case 0x16: init_amd_jg(c); break;
- case 0x17: init_spectral_chicken(c);
- fallthrough;
- case 0x19: init_amd_zn(c); break;
}
/*
+ * Save up on some future enablement work and do common Zen
+ * settings.
+ */
+ if (c->x86 >= 0x17)
+ init_amd_zen_common();
+
+ if (boot_cpu_has(X86_FEATURE_ZEN1))
+ init_amd_zen1(c);
+ else if (boot_cpu_has(X86_FEATURE_ZEN2))
+ init_amd_zen2(c);
+ else if (boot_cpu_has(X86_FEATURE_ZEN3))
+ init_amd_zen3(c);
+ else if (boot_cpu_has(X86_FEATURE_ZEN4))
+ init_amd_zen4(c);
+ else if (boot_cpu_has(X86_FEATURE_ZEN5))
+ init_amd_zen5(c);
+
+ /*
* Enable workaround for FXSAVE leak on CPUs
* without a XSaveErPtr feature
*/
@@ -952,21 +1027,27 @@ static void init_amd(struct cpuinfo_x86 *c)
cpu_detect_cache_sizes(c);
- amd_detect_cmp(c);
- amd_get_topology(c);
srat_detect_node(c);
init_amd_cacheinfo(c);
- if (cpu_has(c, X86_FEATURE_XMM2)) {
+ if (cpu_has(c, X86_FEATURE_SVM)) {
+ rdmsrq(MSR_VM_CR, vm_cr);
+ if (vm_cr & SVM_VM_CR_SVM_DIS_MASK) {
+ pr_notice_once("SVM disabled (by BIOS) in MSR_VM_CR\n");
+ clear_cpu_cap(c, X86_FEATURE_SVM);
+ }
+ }
+
+ if (!cpu_has(c, X86_FEATURE_LFENCE_RDTSC) && cpu_has(c, X86_FEATURE_XMM2)) {
/*
* Use LFENCE for execution serialization. On families which
* don't have that MSR, LFENCE is already serializing.
* msr_set_bit() uses the safe accessors, too, even if the MSR
* is not present.
*/
- msr_set_bit(MSR_F10H_DECFG,
- MSR_F10H_DECFG_LFENCE_SERIALIZE_BIT);
+ msr_set_bit(MSR_AMD64_DE_CFG,
+ MSR_AMD64_DE_CFG_LFENCE_SERIALIZE_BIT);
/* A serializing LFENCE stops RDTSC speculation */
set_cpu_cap(c, X86_FEATURE_LFENCE_RDTSC);
@@ -985,19 +1066,32 @@ static void init_amd(struct cpuinfo_x86 *c)
set_cpu_cap(c, X86_FEATURE_3DNOWPREFETCH);
/* AMD CPUs don't reset SS attributes on SYSRET, Xen does. */
- if (!cpu_has(c, X86_FEATURE_XENPV))
+ if (!cpu_feature_enabled(X86_FEATURE_XENPV))
set_cpu_bug(c, X86_BUG_SYSRET_SS_ATTRS);
- /*
- * Turn on the Instructions Retired free counter on machines not
- * susceptible to erratum #1054 "Instructions Retired Performance
- * Counter May Be Inaccurate".
- */
- if (cpu_has(c, X86_FEATURE_IRPERF) &&
- !cpu_has_amd_erratum(c, amd_erratum_1054))
+ /* Enable the Instructions Retired free counter */
+ if (cpu_has(c, X86_FEATURE_IRPERF))
msr_set_bit(MSR_K7_HWCR, MSR_K7_HWCR_IRPERF_EN_BIT);
check_null_seg_clears_base(c);
+
+ /*
+ * Make sure EFER[AIBRSE - Automatic IBRS Enable] is set. The APs are brought up
+ * using the trampoline code and as part of it, MSR_EFER gets prepared there in
+ * order to be replicated onto them. Regardless, set it here again, if not set,
+ * to protect against any future refactoring/code reorganization which might
+ * miss setting this important bit.
+ */
+ if (spectre_v2_in_eibrs_mode(spectre_v2_enabled) &&
+ cpu_has(c, X86_FEATURE_AUTOIBRS))
+ WARN_ON_ONCE(msr_set_bit(MSR_EFER, _EFER_AUTOIBRS) < 0);
+
+ /* AMD CPUs don't need fencing after x2APIC/TSC_DEADLINE MSR writes. */
+ clear_cpu_cap(c, X86_FEATURE_APIC_MSRS_FENCE);
+
+ /* Enable Translation Cache Extension */
+ if (cpu_has(c, X86_FEATURE_TCE))
+ msr_set_bit(MSR_EFER, _EFER_TCE);
}
#ifdef CONFIG_X86_32
@@ -1030,8 +1124,8 @@ static void cpu_detect_tlb_amd(struct cpuinfo_x86 *c)
cpuid(0x80000006, &eax, &ebx, &ecx, &edx);
- tlb_lld_4k[ENTRIES] = (ebx >> 16) & mask;
- tlb_lli_4k[ENTRIES] = ebx & mask;
+ tlb_lld_4k = (ebx >> 16) & mask;
+ tlb_lli_4k = ebx & mask;
/*
* K8 doesn't have 2M/4M entries in the L2 TLB so read out the L1 TLB
@@ -1044,26 +1138,30 @@ static void cpu_detect_tlb_amd(struct cpuinfo_x86 *c)
/* Handle DTLB 2M and 4M sizes, fall back to L1 if L2 is disabled */
if (!((eax >> 16) & mask))
- tlb_lld_2m[ENTRIES] = (cpuid_eax(0x80000005) >> 16) & 0xff;
+ tlb_lld_2m = (cpuid_eax(0x80000005) >> 16) & 0xff;
else
- tlb_lld_2m[ENTRIES] = (eax >> 16) & mask;
+ tlb_lld_2m = (eax >> 16) & mask;
/* a 4M entry uses two 2M entries */
- tlb_lld_4m[ENTRIES] = tlb_lld_2m[ENTRIES] >> 1;
+ tlb_lld_4m = tlb_lld_2m >> 1;
/* Handle ITLB 2M and 4M sizes, fall back to L1 if L2 is disabled */
if (!(eax & mask)) {
/* Erratum 658 */
if (c->x86 == 0x15 && c->x86_model <= 0x1f) {
- tlb_lli_2m[ENTRIES] = 1024;
+ tlb_lli_2m = 1024;
} else {
cpuid(0x80000005, &eax, &ebx, &ecx, &edx);
- tlb_lli_2m[ENTRIES] = eax & 0xff;
+ tlb_lli_2m = eax & 0xff;
}
} else
- tlb_lli_2m[ENTRIES] = eax & mask;
+ tlb_lli_2m = eax & mask;
- tlb_lli_4m[ENTRIES] = tlb_lli_2m[ENTRIES] >> 1;
+ tlb_lli_4m = tlb_lli_2m >> 1;
+
+ /* Max number of pages INVLPGB can invalidate in one shot */
+ if (cpu_has(c, X86_FEATURE_INVLPGB))
+ invlpgb_count_max = (cpuid_edx(0x80000008) & 0xffff) + 1;
}
static const struct cpu_dev amd_cpu_dev = {
@@ -1093,104 +1191,109 @@ static const struct cpu_dev amd_cpu_dev = {
cpu_dev_register(amd_cpu_dev);
-/*
- * AMD errata checking
- *
- * Errata are defined as arrays of ints using the AMD_LEGACY_ERRATUM() or
- * AMD_OSVW_ERRATUM() macros. The latter is intended for newer errata that
- * have an OSVW id assigned, which it takes as first argument. Both take a
- * variable number of family-specific model-stepping ranges created by
- * AMD_MODEL_RANGE().
- *
- * Example:
- *
- * const int amd_erratum_319[] =
- * AMD_LEGACY_ERRATUM(AMD_MODEL_RANGE(0x10, 0x2, 0x1, 0x4, 0x2),
- * AMD_MODEL_RANGE(0x10, 0x8, 0x0, 0x8, 0x0),
- * AMD_MODEL_RANGE(0x10, 0x9, 0x0, 0x9, 0x0));
- */
+static DEFINE_PER_CPU_READ_MOSTLY(unsigned long[4], amd_dr_addr_mask);
-#define AMD_LEGACY_ERRATUM(...) { -1, __VA_ARGS__, 0 }
-#define AMD_OSVW_ERRATUM(osvw_id, ...) { osvw_id, __VA_ARGS__, 0 }
-#define AMD_MODEL_RANGE(f, m_start, s_start, m_end, s_end) \
- ((f << 24) | (m_start << 16) | (s_start << 12) | (m_end << 4) | (s_end))
-#define AMD_MODEL_RANGE_FAMILY(range) (((range) >> 24) & 0xff)
-#define AMD_MODEL_RANGE_START(range) (((range) >> 12) & 0xfff)
-#define AMD_MODEL_RANGE_END(range) ((range) & 0xfff)
+static unsigned int amd_msr_dr_addr_masks[] = {
+ MSR_F16H_DR0_ADDR_MASK,
+ MSR_F16H_DR1_ADDR_MASK,
+ MSR_F16H_DR1_ADDR_MASK + 1,
+ MSR_F16H_DR1_ADDR_MASK + 2
+};
-static const int amd_erratum_400[] =
- AMD_OSVW_ERRATUM(1, AMD_MODEL_RANGE(0xf, 0x41, 0x2, 0xff, 0xf),
- AMD_MODEL_RANGE(0x10, 0x2, 0x1, 0xff, 0xf));
+void amd_set_dr_addr_mask(unsigned long mask, unsigned int dr)
+{
+ int cpu = smp_processor_id();
-static const int amd_erratum_383[] =
- AMD_OSVW_ERRATUM(3, AMD_MODEL_RANGE(0x10, 0, 0, 0xff, 0xf));
+ if (!cpu_feature_enabled(X86_FEATURE_BPEXT))
+ return;
-/* #1054: Instructions Retired Performance Counter May Be Inaccurate */
-static const int amd_erratum_1054[] =
- AMD_LEGACY_ERRATUM(AMD_MODEL_RANGE(0x17, 0, 0, 0x2f, 0xf));
+ if (WARN_ON_ONCE(dr >= ARRAY_SIZE(amd_msr_dr_addr_masks)))
+ return;
-static bool cpu_has_amd_erratum(struct cpuinfo_x86 *cpu, const int *erratum)
-{
- int osvw_id = *erratum++;
- u32 range;
- u32 ms;
+ if (per_cpu(amd_dr_addr_mask, cpu)[dr] == mask)
+ return;
+
+ wrmsrq(amd_msr_dr_addr_masks[dr], mask);
+ per_cpu(amd_dr_addr_mask, cpu)[dr] = mask;
+}
- if (osvw_id >= 0 && osvw_id < 65536 &&
- cpu_has(cpu, X86_FEATURE_OSVW)) {
- u64 osvw_len;
+unsigned long amd_get_dr_addr_mask(unsigned int dr)
+{
+ if (!cpu_feature_enabled(X86_FEATURE_BPEXT))
+ return 0;
- rdmsrl(MSR_AMD64_OSVW_ID_LENGTH, osvw_len);
- if (osvw_id < osvw_len) {
- u64 osvw_bits;
+ if (WARN_ON_ONCE(dr >= ARRAY_SIZE(amd_msr_dr_addr_masks)))
+ return 0;
- rdmsrl(MSR_AMD64_OSVW_STATUS + (osvw_id >> 6),
- osvw_bits);
- return osvw_bits & (1ULL << (osvw_id & 0x3f));
- }
- }
+ return per_cpu(amd_dr_addr_mask[dr], smp_processor_id());
+}
+EXPORT_SYMBOL_GPL(amd_get_dr_addr_mask);
- /* OSVW unavailable or ID unknown, match family-model-stepping range */
- ms = (cpu->x86_model << 4) | cpu->x86_stepping;
- while ((range = *erratum++))
- if ((cpu->x86 == AMD_MODEL_RANGE_FAMILY(range)) &&
- (ms >= AMD_MODEL_RANGE_START(range)) &&
- (ms <= AMD_MODEL_RANGE_END(range)))
- return true;
+static void zenbleed_check_cpu(void *unused)
+{
+ struct cpuinfo_x86 *c = &cpu_data(smp_processor_id());
- return false;
+ zen2_zenbleed_check(c);
}
-void set_dr_addr_mask(unsigned long mask, int dr)
+void amd_check_microcode(void)
{
- if (!boot_cpu_has(X86_FEATURE_BPEXT))
+ if (boot_cpu_data.x86_vendor != X86_VENDOR_AMD)
return;
- switch (dr) {
- case 0:
- wrmsr(MSR_F16H_DR0_ADDR_MASK, mask, 0);
- break;
- case 1:
- case 2:
- case 3:
- wrmsr(MSR_F16H_DR1_ADDR_MASK - 1 + dr, mask, 0);
- break;
- default:
- break;
- }
+ if (cpu_feature_enabled(X86_FEATURE_ZEN2))
+ on_each_cpu(zenbleed_check_cpu, NULL, 1);
}
-u32 amd_get_highest_perf(void)
+static const char * const s5_reset_reason_txt[] = {
+ [0] = "thermal pin BP_THERMTRIP_L was tripped",
+ [1] = "power button was pressed for 4 seconds",
+ [2] = "shutdown pin was tripped",
+ [4] = "remote ASF power off command was received",
+ [9] = "internal CPU thermal limit was tripped",
+ [16] = "system reset pin BP_SYS_RST_L was tripped",
+ [17] = "software issued PCI reset",
+ [18] = "software wrote 0x4 to reset control register 0xCF9",
+ [19] = "software wrote 0x6 to reset control register 0xCF9",
+ [20] = "software wrote 0xE to reset control register 0xCF9",
+ [21] = "ACPI power state transition occurred",
+ [22] = "keyboard reset pin KB_RST_L was tripped",
+ [23] = "internal CPU shutdown event occurred",
+ [24] = "system failed to boot before failed boot timer expired",
+ [25] = "hardware watchdog timer expired",
+ [26] = "remote ASF reset command was received",
+ [27] = "an uncorrected error caused a data fabric sync flood event",
+ [29] = "FCH and MP1 failed warm reset handshake",
+ [30] = "a parity error occurred",
+ [31] = "a software sync flood event occurred",
+};
+
+static __init int print_s5_reset_status_mmio(void)
{
- struct cpuinfo_x86 *c = &boot_cpu_data;
+ unsigned long value;
+ void __iomem *addr;
+ int i;
- if (c->x86 == 0x17 && ((c->x86_model >= 0x30 && c->x86_model < 0x40) ||
- (c->x86_model >= 0x70 && c->x86_model < 0x80)))
- return 166;
+ if (!cpu_feature_enabled(X86_FEATURE_ZEN))
+ return 0;
- if (c->x86 == 0x19 && ((c->x86_model >= 0x20 && c->x86_model < 0x30) ||
- (c->x86_model >= 0x40 && c->x86_model < 0x70)))
- return 166;
+ addr = ioremap(FCH_PM_BASE + FCH_PM_S5_RESET_STATUS, sizeof(value));
+ if (!addr)
+ return 0;
- return 255;
+ value = ioread32(addr);
+ iounmap(addr);
+
+ for (i = 0; i < ARRAY_SIZE(s5_reset_reason_txt); i++) {
+ if (!(value & BIT(i)))
+ continue;
+
+ if (s5_reset_reason_txt[i]) {
+ pr_info("x86/amd: Previous system reset reason [0x%08lx]: %s\n",
+ value, s5_reset_reason_txt[i]);
+ }
+ }
+
+ return 0;
}
-EXPORT_SYMBOL_GPL(amd_get_highest_perf);
+late_initcall(print_s5_reset_status_mmio);
diff --git a/arch/x86/kernel/cpu/amd_cache_disable.c b/arch/x86/kernel/cpu/amd_cache_disable.c
new file mode 100644
index 000000000000..8843b9557aea
--- /dev/null
+++ b/arch/x86/kernel/cpu/amd_cache_disable.c
@@ -0,0 +1,301 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * AMD L3 cache_disable_{0,1} sysfs handling
+ * Documentation/ABI/testing/sysfs-devices-system-cpu
+ */
+
+#include <linux/cacheinfo.h>
+#include <linux/capability.h>
+#include <linux/pci.h>
+#include <linux/sysfs.h>
+
+#include <asm/amd/nb.h>
+
+#include "cpu.h"
+
+/*
+ * L3 cache descriptors
+ */
+static void amd_calc_l3_indices(struct amd_northbridge *nb)
+{
+ struct amd_l3_cache *l3 = &nb->l3_cache;
+ unsigned int sc0, sc1, sc2, sc3;
+ u32 val = 0;
+
+ pci_read_config_dword(nb->misc, 0x1C4, &val);
+
+ /* calculate subcache sizes */
+ l3->subcaches[0] = sc0 = !(val & BIT(0));
+ l3->subcaches[1] = sc1 = !(val & BIT(4));
+
+ if (boot_cpu_data.x86 == 0x15) {
+ l3->subcaches[0] = sc0 += !(val & BIT(1));
+ l3->subcaches[1] = sc1 += !(val & BIT(5));
+ }
+
+ l3->subcaches[2] = sc2 = !(val & BIT(8)) + !(val & BIT(9));
+ l3->subcaches[3] = sc3 = !(val & BIT(12)) + !(val & BIT(13));
+
+ l3->indices = (max(max3(sc0, sc1, sc2), sc3) << 10) - 1;
+}
+
+/*
+ * check whether a slot used for disabling an L3 index is occupied.
+ * @l3: L3 cache descriptor
+ * @slot: slot number (0..1)
+ *
+ * @returns: the disabled index if used or negative value if slot free.
+ */
+static int amd_get_l3_disable_slot(struct amd_northbridge *nb, unsigned int slot)
+{
+ unsigned int reg = 0;
+
+ pci_read_config_dword(nb->misc, 0x1BC + slot * 4, &reg);
+
+ /* check whether this slot is activated already */
+ if (reg & (3UL << 30))
+ return reg & 0xfff;
+
+ return -1;
+}
+
+static ssize_t show_cache_disable(struct cacheinfo *ci, char *buf, unsigned int slot)
+{
+ int index;
+ struct amd_northbridge *nb = ci->priv;
+
+ index = amd_get_l3_disable_slot(nb, slot);
+ if (index >= 0)
+ return sysfs_emit(buf, "%d\n", index);
+
+ return sysfs_emit(buf, "FREE\n");
+}
+
+#define SHOW_CACHE_DISABLE(slot) \
+static ssize_t \
+cache_disable_##slot##_show(struct device *dev, \
+ struct device_attribute *attr, char *buf) \
+{ \
+ struct cacheinfo *ci = dev_get_drvdata(dev); \
+ return show_cache_disable(ci, buf, slot); \
+}
+
+SHOW_CACHE_DISABLE(0)
+SHOW_CACHE_DISABLE(1)
+
+static void amd_l3_disable_index(struct amd_northbridge *nb, int cpu,
+ unsigned int slot, unsigned long idx)
+{
+ int i;
+
+ idx |= BIT(30);
+
+ /*
+ * disable index in all 4 subcaches
+ */
+ for (i = 0; i < 4; i++) {
+ u32 reg = idx | (i << 20);
+
+ if (!nb->l3_cache.subcaches[i])
+ continue;
+
+ pci_write_config_dword(nb->misc, 0x1BC + slot * 4, reg);
+
+ /*
+ * We need to WBINVD on a core on the node containing the L3
+ * cache which indices we disable therefore a simple wbinvd()
+ * is not sufficient.
+ */
+ wbinvd_on_cpu(cpu);
+
+ reg |= BIT(31);
+ pci_write_config_dword(nb->misc, 0x1BC + slot * 4, reg);
+ }
+}
+
+/*
+ * disable a L3 cache index by using a disable-slot
+ *
+ * @l3: L3 cache descriptor
+ * @cpu: A CPU on the node containing the L3 cache
+ * @slot: slot number (0..1)
+ * @index: index to disable
+ *
+ * @return: 0 on success, error status on failure
+ */
+static int amd_set_l3_disable_slot(struct amd_northbridge *nb, int cpu,
+ unsigned int slot, unsigned long index)
+{
+ int ret = 0;
+
+ /* check if @slot is already used or the index is already disabled */
+ ret = amd_get_l3_disable_slot(nb, slot);
+ if (ret >= 0)
+ return -EEXIST;
+
+ if (index > nb->l3_cache.indices)
+ return -EINVAL;
+
+ /* check whether the other slot has disabled the same index already */
+ if (index == amd_get_l3_disable_slot(nb, !slot))
+ return -EEXIST;
+
+ amd_l3_disable_index(nb, cpu, slot, index);
+
+ return 0;
+}
+
+static ssize_t store_cache_disable(struct cacheinfo *ci, const char *buf,
+ size_t count, unsigned int slot)
+{
+ struct amd_northbridge *nb = ci->priv;
+ unsigned long val = 0;
+ int cpu, err = 0;
+
+ if (!capable(CAP_SYS_ADMIN))
+ return -EPERM;
+
+ cpu = cpumask_first(&ci->shared_cpu_map);
+
+ if (kstrtoul(buf, 10, &val) < 0)
+ return -EINVAL;
+
+ err = amd_set_l3_disable_slot(nb, cpu, slot, val);
+ if (err) {
+ if (err == -EEXIST)
+ pr_warn("L3 slot %d in use/index already disabled!\n",
+ slot);
+ return err;
+ }
+ return count;
+}
+
+#define STORE_CACHE_DISABLE(slot) \
+static ssize_t \
+cache_disable_##slot##_store(struct device *dev, \
+ struct device_attribute *attr, \
+ const char *buf, size_t count) \
+{ \
+ struct cacheinfo *ci = dev_get_drvdata(dev); \
+ return store_cache_disable(ci, buf, count, slot); \
+}
+
+STORE_CACHE_DISABLE(0)
+STORE_CACHE_DISABLE(1)
+
+static ssize_t subcaches_show(struct device *dev, struct device_attribute *attr,
+ char *buf)
+{
+ struct cacheinfo *ci = dev_get_drvdata(dev);
+ int cpu = cpumask_first(&ci->shared_cpu_map);
+
+ return sysfs_emit(buf, "%x\n", amd_get_subcaches(cpu));
+}
+
+static ssize_t subcaches_store(struct device *dev,
+ struct device_attribute *attr,
+ const char *buf, size_t count)
+{
+ struct cacheinfo *ci = dev_get_drvdata(dev);
+ int cpu = cpumask_first(&ci->shared_cpu_map);
+ unsigned long val;
+
+ if (!capable(CAP_SYS_ADMIN))
+ return -EPERM;
+
+ if (kstrtoul(buf, 16, &val) < 0)
+ return -EINVAL;
+
+ if (amd_set_subcaches(cpu, val))
+ return -EINVAL;
+
+ return count;
+}
+
+static DEVICE_ATTR_RW(cache_disable_0);
+static DEVICE_ATTR_RW(cache_disable_1);
+static DEVICE_ATTR_RW(subcaches);
+
+static umode_t cache_private_attrs_is_visible(struct kobject *kobj,
+ struct attribute *attr, int unused)
+{
+ struct device *dev = kobj_to_dev(kobj);
+ struct cacheinfo *ci = dev_get_drvdata(dev);
+ umode_t mode = attr->mode;
+
+ if (!ci->priv)
+ return 0;
+
+ if ((attr == &dev_attr_subcaches.attr) &&
+ amd_nb_has_feature(AMD_NB_L3_PARTITIONING))
+ return mode;
+
+ if ((attr == &dev_attr_cache_disable_0.attr ||
+ attr == &dev_attr_cache_disable_1.attr) &&
+ amd_nb_has_feature(AMD_NB_L3_INDEX_DISABLE))
+ return mode;
+
+ return 0;
+}
+
+static struct attribute_group cache_private_group = {
+ .is_visible = cache_private_attrs_is_visible,
+};
+
+static void init_amd_l3_attrs(void)
+{
+ static struct attribute **amd_l3_attrs;
+ int n = 1;
+
+ if (amd_l3_attrs) /* already initialized */
+ return;
+
+ if (amd_nb_has_feature(AMD_NB_L3_INDEX_DISABLE))
+ n += 2;
+ if (amd_nb_has_feature(AMD_NB_L3_PARTITIONING))
+ n += 1;
+
+ amd_l3_attrs = kcalloc(n, sizeof(*amd_l3_attrs), GFP_KERNEL);
+ if (!amd_l3_attrs)
+ return;
+
+ n = 0;
+ if (amd_nb_has_feature(AMD_NB_L3_INDEX_DISABLE)) {
+ amd_l3_attrs[n++] = &dev_attr_cache_disable_0.attr;
+ amd_l3_attrs[n++] = &dev_attr_cache_disable_1.attr;
+ }
+ if (amd_nb_has_feature(AMD_NB_L3_PARTITIONING))
+ amd_l3_attrs[n++] = &dev_attr_subcaches.attr;
+
+ cache_private_group.attrs = amd_l3_attrs;
+}
+
+const struct attribute_group *cache_get_priv_group(struct cacheinfo *ci)
+{
+ struct amd_northbridge *nb = ci->priv;
+
+ if (ci->level < 3 || !nb)
+ return NULL;
+
+ if (nb && nb->l3_cache.indices)
+ init_amd_l3_attrs();
+
+ return &cache_private_group;
+}
+
+struct amd_northbridge *amd_init_l3_cache(int index)
+{
+ struct amd_northbridge *nb;
+ int node;
+
+ /* only for L3, and not in virtualized environments */
+ if (index < 3)
+ return NULL;
+
+ node = topology_amd_node_id(smp_processor_id());
+ nb = node_to_amd_nb(node);
+ if (nb && !nb->l3_cache.indices)
+ amd_calc_l3_indices(nb);
+
+ return nb;
+}
diff --git a/arch/x86/kernel/cpu/aperfmperf.c b/arch/x86/kernel/cpu/aperfmperf.c
index 1f60a2b27936..a315b0627dfb 100644
--- a/arch/x86/kernel/cpu/aperfmperf.c
+++ b/arch/x86/kernel/cpu/aperfmperf.c
@@ -20,6 +20,7 @@
#include <asm/cpu.h>
#include <asm/cpu_device_id.h>
#include <asm/intel-family.h>
+#include <asm/msr.h>
#include "cpu.h"
@@ -40,8 +41,8 @@ static void init_counter_refs(void)
{
u64 aperf, mperf;
- rdmsrl(MSR_IA32_APERF, aperf);
- rdmsrl(MSR_IA32_MPERF, mperf);
+ rdmsrq(MSR_IA32_APERF, aperf);
+ rdmsrq(MSR_IA32_MPERF, mperf);
this_cpu_write(cpu_samples.aperf, aperf);
this_cpu_write(cpu_samples.mperf, mperf);
@@ -99,7 +100,7 @@ static bool __init turbo_disabled(void)
u64 misc_en;
int err;
- err = rdmsrl_safe(MSR_IA32_MISC_ENABLE, &misc_en);
+ err = rdmsrq_safe(MSR_IA32_MISC_ENABLE, &misc_en);
if (err)
return false;
@@ -110,11 +111,11 @@ static bool __init slv_set_max_freq_ratio(u64 *base_freq, u64 *turbo_freq)
{
int err;
- err = rdmsrl_safe(MSR_ATOM_CORE_RATIOS, base_freq);
+ err = rdmsrq_safe(MSR_ATOM_CORE_RATIOS, base_freq);
if (err)
return false;
- err = rdmsrl_safe(MSR_ATOM_CORE_TURBO_RATIOS, turbo_freq);
+ err = rdmsrq_safe(MSR_ATOM_CORE_TURBO_RATIOS, turbo_freq);
if (err)
return false;
@@ -124,25 +125,24 @@ static bool __init slv_set_max_freq_ratio(u64 *base_freq, u64 *turbo_freq)
return true;
}
-#define X86_MATCH(model) \
- X86_MATCH_VENDOR_FAM_MODEL_FEATURE(INTEL, 6, \
- INTEL_FAM6_##model, X86_FEATURE_APERFMPERF, NULL)
+#define X86_MATCH(vfm) \
+ X86_MATCH_VFM_FEATURE(vfm, X86_FEATURE_APERFMPERF, NULL)
static const struct x86_cpu_id has_knl_turbo_ratio_limits[] __initconst = {
- X86_MATCH(XEON_PHI_KNL),
- X86_MATCH(XEON_PHI_KNM),
+ X86_MATCH(INTEL_XEON_PHI_KNL),
+ X86_MATCH(INTEL_XEON_PHI_KNM),
{}
};
static const struct x86_cpu_id has_skx_turbo_ratio_limits[] __initconst = {
- X86_MATCH(SKYLAKE_X),
+ X86_MATCH(INTEL_SKYLAKE_X),
{}
};
static const struct x86_cpu_id has_glm_turbo_ratio_limits[] __initconst = {
- X86_MATCH(ATOM_GOLDMONT),
- X86_MATCH(ATOM_GOLDMONT_D),
- X86_MATCH(ATOM_GOLDMONT_PLUS),
+ X86_MATCH(INTEL_ATOM_GOLDMONT),
+ X86_MATCH(INTEL_ATOM_GOLDMONT_D),
+ X86_MATCH(INTEL_ATOM_GOLDMONT_PLUS),
{}
};
@@ -153,13 +153,13 @@ static bool __init knl_set_max_freq_ratio(u64 *base_freq, u64 *turbo_freq,
int err, i;
u64 msr;
- err = rdmsrl_safe(MSR_PLATFORM_INFO, base_freq);
+ err = rdmsrq_safe(MSR_PLATFORM_INFO, base_freq);
if (err)
return false;
*base_freq = (*base_freq >> 8) & 0xFF; /* max P state */
- err = rdmsrl_safe(MSR_TURBO_RATIO_LIMIT, &msr);
+ err = rdmsrq_safe(MSR_TURBO_RATIO_LIMIT, &msr);
if (err)
return false;
@@ -191,17 +191,17 @@ static bool __init skx_set_max_freq_ratio(u64 *base_freq, u64 *turbo_freq, int s
u32 group_size;
int err, i;
- err = rdmsrl_safe(MSR_PLATFORM_INFO, base_freq);
+ err = rdmsrq_safe(MSR_PLATFORM_INFO, base_freq);
if (err)
return false;
*base_freq = (*base_freq >> 8) & 0xFF; /* max P state */
- err = rdmsrl_safe(MSR_TURBO_RATIO_LIMIT, &ratios);
+ err = rdmsrq_safe(MSR_TURBO_RATIO_LIMIT, &ratios);
if (err)
return false;
- err = rdmsrl_safe(MSR_TURBO_RATIO_LIMIT1, &counts);
+ err = rdmsrq_safe(MSR_TURBO_RATIO_LIMIT1, &counts);
if (err)
return false;
@@ -221,11 +221,11 @@ static bool __init core_set_max_freq_ratio(u64 *base_freq, u64 *turbo_freq)
u64 msr;
int err;
- err = rdmsrl_safe(MSR_PLATFORM_INFO, base_freq);
+ err = rdmsrq_safe(MSR_PLATFORM_INFO, base_freq);
if (err)
return false;
- err = rdmsrl_safe(MSR_TURBO_RATIO_LIMIT, &msr);
+ err = rdmsrq_safe(MSR_TURBO_RATIO_LIMIT, &msr);
if (err)
return false;
@@ -307,7 +307,7 @@ static void freq_invariance_enable(void)
WARN_ON_ONCE(1);
return;
}
- static_branch_enable(&arch_scale_freq_key);
+ static_branch_enable_cpuslocked(&arch_scale_freq_key);
register_freq_invariance_syscore_ops();
pr_info("Estimated ratio of average max frequency by base frequency (times 1024): %llu\n", arch_max_freq_ratio);
}
@@ -324,23 +324,115 @@ static void __init bp_init_freq_invariance(void)
if (boot_cpu_data.x86_vendor != X86_VENDOR_INTEL)
return;
- if (intel_set_max_freq_ratio())
+ if (intel_set_max_freq_ratio()) {
+ guard(cpus_read_lock)();
freq_invariance_enable();
+ }
}
static void disable_freq_invariance_workfn(struct work_struct *work)
{
+ int cpu;
+
static_branch_disable(&arch_scale_freq_key);
+
+ /*
+ * Set arch_freq_scale to a default value on all cpus
+ * This negates the effect of scaling
+ */
+ for_each_possible_cpu(cpu)
+ per_cpu(arch_freq_scale, cpu) = SCHED_CAPACITY_SCALE;
}
static DECLARE_WORK(disable_freq_invariance_work,
disable_freq_invariance_workfn);
DEFINE_PER_CPU(unsigned long, arch_freq_scale) = SCHED_CAPACITY_SCALE;
+EXPORT_PER_CPU_SYMBOL_GPL(arch_freq_scale);
+
+static DEFINE_STATIC_KEY_FALSE(arch_hybrid_cap_scale_key);
+
+struct arch_hybrid_cpu_scale {
+ unsigned long capacity;
+ unsigned long freq_ratio;
+};
+
+static struct arch_hybrid_cpu_scale __percpu *arch_cpu_scale;
+
+/**
+ * arch_enable_hybrid_capacity_scale() - Enable hybrid CPU capacity scaling
+ *
+ * Allocate memory for per-CPU data used by hybrid CPU capacity scaling,
+ * initialize it and set the static key controlling its code paths.
+ *
+ * Must be called before arch_set_cpu_capacity().
+ */
+bool arch_enable_hybrid_capacity_scale(void)
+{
+ int cpu;
+
+ if (static_branch_unlikely(&arch_hybrid_cap_scale_key)) {
+ WARN_ONCE(1, "Hybrid CPU capacity scaling already enabled");
+ return true;
+ }
+
+ arch_cpu_scale = alloc_percpu(struct arch_hybrid_cpu_scale);
+ if (!arch_cpu_scale)
+ return false;
+
+ for_each_possible_cpu(cpu) {
+ per_cpu_ptr(arch_cpu_scale, cpu)->capacity = SCHED_CAPACITY_SCALE;
+ per_cpu_ptr(arch_cpu_scale, cpu)->freq_ratio = arch_max_freq_ratio;
+ }
+
+ static_branch_enable(&arch_hybrid_cap_scale_key);
+
+ pr_info("Hybrid CPU capacity scaling enabled\n");
+
+ return true;
+}
+
+/**
+ * arch_set_cpu_capacity() - Set scale-invariance parameters for a CPU
+ * @cpu: Target CPU.
+ * @cap: Capacity of @cpu at its maximum frequency, relative to @max_cap.
+ * @max_cap: System-wide maximum CPU capacity.
+ * @cap_freq: Frequency of @cpu corresponding to @cap.
+ * @base_freq: Frequency of @cpu at which MPERF counts.
+ *
+ * The units in which @cap and @max_cap are expressed do not matter, so long
+ * as they are consistent, because the former is effectively divided by the
+ * latter. Analogously for @cap_freq and @base_freq.
+ *
+ * After calling this function for all CPUs, call arch_rebuild_sched_domains()
+ * to let the scheduler know that capacity-aware scheduling can be used going
+ * forward.
+ */
+void arch_set_cpu_capacity(int cpu, unsigned long cap, unsigned long max_cap,
+ unsigned long cap_freq, unsigned long base_freq)
+{
+ if (static_branch_likely(&arch_hybrid_cap_scale_key)) {
+ WRITE_ONCE(per_cpu_ptr(arch_cpu_scale, cpu)->capacity,
+ div_u64(cap << SCHED_CAPACITY_SHIFT, max_cap));
+ WRITE_ONCE(per_cpu_ptr(arch_cpu_scale, cpu)->freq_ratio,
+ div_u64(cap_freq << SCHED_CAPACITY_SHIFT, base_freq));
+ } else {
+ WARN_ONCE(1, "Hybrid CPU capacity scaling not enabled");
+ }
+}
+
+unsigned long arch_scale_cpu_capacity(int cpu)
+{
+ if (static_branch_unlikely(&arch_hybrid_cap_scale_key))
+ return READ_ONCE(per_cpu_ptr(arch_cpu_scale, cpu)->capacity);
+
+ return SCHED_CAPACITY_SCALE;
+}
+EXPORT_SYMBOL_GPL(arch_scale_cpu_capacity);
static void scale_freq_tick(u64 acnt, u64 mcnt)
{
- u64 freq_scale;
+ u64 freq_scale, freq_ratio;
if (!arch_scale_freq_invariant())
return;
@@ -348,7 +440,12 @@ static void scale_freq_tick(u64 acnt, u64 mcnt)
if (check_shl_overflow(acnt, 2*SCHED_CAPACITY_SHIFT, &acnt))
goto error;
- if (check_mul_overflow(mcnt, arch_max_freq_ratio, &mcnt) || !mcnt)
+ if (static_branch_unlikely(&arch_hybrid_cap_scale_key))
+ freq_ratio = READ_ONCE(this_cpu_ptr(arch_cpu_scale)->freq_ratio);
+ else
+ freq_ratio = arch_max_freq_ratio;
+
+ if (check_mul_overflow(mcnt, freq_ratio, &mcnt) || !mcnt)
goto error;
freq_scale = div64_u64(acnt, mcnt);
@@ -378,8 +475,8 @@ void arch_scale_freq_tick(void)
if (!cpu_feature_enabled(X86_FEATURE_APERFMPERF))
return;
- rdmsrl(MSR_IA32_APERF, aperf);
- rdmsrl(MSR_IA32_MPERF, mperf);
+ rdmsrq(MSR_IA32_APERF, aperf);
+ rdmsrq(MSR_IA32_MPERF, mperf);
acnt = aperf - s->aperf;
mcnt = mperf - s->mperf;
@@ -402,7 +499,7 @@ void arch_scale_freq_tick(void)
*/
#define MAX_SAMPLE_AGE ((unsigned long)HZ / 50)
-unsigned int arch_freq_get_on_cpu(int cpu)
+int arch_freq_get_on_cpu(int cpu)
{
struct aperfmperf *s = per_cpu_ptr(&cpu_samples, cpu);
unsigned int seq, freq;
diff --git a/arch/x86/kernel/cpu/bugs.c b/arch/x86/kernel/cpu/bugs.c
index 6761668100b9..7f94e6a5497d 100644
--- a/arch/x86/kernel/cpu/bugs.c
+++ b/arch/x86/kernel/cpu/bugs.c
@@ -9,7 +9,6 @@
* - Andrew D. Balsa (code cleanup).
*/
#include <linux/init.h>
-#include <linux/utsname.h>
#include <linux/cpu.h>
#include <linux/module.h>
#include <linux/nospec.h>
@@ -27,28 +26,74 @@
#include <asm/msr.h>
#include <asm/vmx.h>
#include <asm/paravirt.h>
-#include <asm/alternative.h>
-#include <asm/set_memory.h>
-#include <asm/intel-family.h>
+#include <asm/cpu_device_id.h>
#include <asm/e820/api.h>
#include <asm/hypervisor.h>
#include <asm/tlbflush.h>
+#include <asm/cpu.h>
#include "cpu.h"
+/*
+ * Speculation Vulnerability Handling
+ *
+ * Each vulnerability is handled with the following functions:
+ * <vuln>_select_mitigation() -- Selects a mitigation to use. This should
+ * take into account all relevant command line
+ * options.
+ * <vuln>_update_mitigation() -- This is called after all vulnerabilities have
+ * selected a mitigation, in case the selection
+ * may want to change based on other choices
+ * made. This function is optional.
+ * <vuln>_apply_mitigation() -- Enable the selected mitigation.
+ *
+ * The compile-time mitigation in all cases should be AUTO. An explicit
+ * command-line option can override AUTO. If no such option is
+ * provided, <vuln>_select_mitigation() will override AUTO to the best
+ * mitigation option.
+ */
+
static void __init spectre_v1_select_mitigation(void);
+static void __init spectre_v1_apply_mitigation(void);
static void __init spectre_v2_select_mitigation(void);
+static void __init spectre_v2_update_mitigation(void);
+static void __init spectre_v2_apply_mitigation(void);
static void __init retbleed_select_mitigation(void);
+static void __init retbleed_update_mitigation(void);
+static void __init retbleed_apply_mitigation(void);
static void __init spectre_v2_user_select_mitigation(void);
+static void __init spectre_v2_user_update_mitigation(void);
+static void __init spectre_v2_user_apply_mitigation(void);
static void __init ssb_select_mitigation(void);
+static void __init ssb_apply_mitigation(void);
static void __init l1tf_select_mitigation(void);
+static void __init l1tf_apply_mitigation(void);
static void __init mds_select_mitigation(void);
-static void __init md_clear_update_mitigation(void);
-static void __init md_clear_select_mitigation(void);
+static void __init mds_update_mitigation(void);
+static void __init mds_apply_mitigation(void);
static void __init taa_select_mitigation(void);
+static void __init taa_update_mitigation(void);
+static void __init taa_apply_mitigation(void);
static void __init mmio_select_mitigation(void);
+static void __init mmio_update_mitigation(void);
+static void __init mmio_apply_mitigation(void);
+static void __init rfds_select_mitigation(void);
+static void __init rfds_update_mitigation(void);
+static void __init rfds_apply_mitigation(void);
static void __init srbds_select_mitigation(void);
+static void __init srbds_apply_mitigation(void);
static void __init l1d_flush_select_mitigation(void);
+static void __init srso_select_mitigation(void);
+static void __init srso_update_mitigation(void);
+static void __init srso_apply_mitigation(void);
+static void __init gds_select_mitigation(void);
+static void __init gds_apply_mitigation(void);
+static void __init bhi_select_mitigation(void);
+static void __init bhi_update_mitigation(void);
+static void __init bhi_apply_mitigation(void);
+static void __init its_select_mitigation(void);
+static void __init its_update_mitigation(void);
+static void __init its_apply_mitigation(void);
/* The base value of the SPEC_CTRL MSR without task-specific bits set */
u64 x86_spec_ctrl_base;
@@ -56,15 +101,36 @@ EXPORT_SYMBOL_GPL(x86_spec_ctrl_base);
/* The current value of the SPEC_CTRL MSR with task-specific bits set */
DEFINE_PER_CPU(u64, x86_spec_ctrl_current);
-EXPORT_SYMBOL_GPL(x86_spec_ctrl_current);
+EXPORT_PER_CPU_SYMBOL_GPL(x86_spec_ctrl_current);
+
+u64 x86_pred_cmd __ro_after_init = PRED_CMD_IBPB;
+
+static u64 __ro_after_init x86_arch_cap_msr;
static DEFINE_MUTEX(spec_ctrl_mutex);
+void (*x86_return_thunk)(void) __ro_after_init = __x86_return_thunk;
+
+static void __init set_return_thunk(void *thunk)
+{
+ if (x86_return_thunk != __x86_return_thunk)
+ pr_warn("x86/bugs: return thunk changed\n");
+
+ x86_return_thunk = thunk;
+}
+
+/* Update SPEC_CTRL MSR and its cached copy unconditionally */
+static void update_spec_ctrl(u64 val)
+{
+ this_cpu_write(x86_spec_ctrl_current, val);
+ wrmsrq(MSR_IA32_SPEC_CTRL, val);
+}
+
/*
* Keep track of the SPEC_CTRL MSR value for the current task, which may differ
* from x86_spec_ctrl_base due to STIBP/SSB in __speculation_ctrl_update().
*/
-void write_spec_ctrl_current(u64 val, bool force)
+void update_spec_ctrl_cond(u64 val)
{
if (this_cpu_read(x86_spec_ctrl_current) == val)
return;
@@ -75,11 +141,11 @@ void write_spec_ctrl_current(u64 val, bool force)
* When KERNEL_IBRS this MSR is written on return-to-user, unless
* forced the update can be delayed until that time.
*/
- if (force || !cpu_feature_enabled(X86_FEATURE_KERNEL_IBRS))
- wrmsrl(MSR_IA32_SPEC_CTRL, val);
+ if (!cpu_feature_enabled(X86_FEATURE_KERNEL_IBRS))
+ wrmsrq(MSR_IA32_SPEC_CTRL, val);
}
-u64 spec_ctrl_current(void)
+noinstr u64 spec_ctrl_current(void)
{
return this_cpu_read(x86_spec_ctrl_current);
}
@@ -99,9 +165,10 @@ DEFINE_STATIC_KEY_FALSE(switch_mm_cond_ibpb);
/* Control unconditional IBPB in switch_mm() */
DEFINE_STATIC_KEY_FALSE(switch_mm_always_ibpb);
-/* Control MDS CPU buffer clear before returning to user space */
-DEFINE_STATIC_KEY_FALSE(mds_user_clear);
-EXPORT_SYMBOL_GPL(mds_user_clear);
+/* Control IBPB on vCPU load */
+DEFINE_STATIC_KEY_FALSE(switch_vcpu_ibpb);
+EXPORT_SYMBOL_GPL(switch_vcpu_ibpb);
+
/* Control MDS CPU buffer clear before idling (halt, mwait) */
DEFINE_STATIC_KEY_FALSE(mds_idle_clear);
EXPORT_SYMBOL_GPL(mds_idle_clear);
@@ -113,105 +180,110 @@ EXPORT_SYMBOL_GPL(mds_idle_clear);
*/
DEFINE_STATIC_KEY_FALSE(switch_mm_cond_l1d_flush);
-/* Controls CPU Fill buffer clear before KVM guest MMIO accesses */
-DEFINE_STATIC_KEY_FALSE(mmio_stale_data_clear);
-EXPORT_SYMBOL_GPL(mmio_stale_data_clear);
+/*
+ * Controls CPU Fill buffer clear before VMenter. This is a subset of
+ * X86_FEATURE_CLEAR_CPU_BUF, and should only be enabled when KVM-only
+ * mitigation is required.
+ */
+DEFINE_STATIC_KEY_FALSE(cpu_buf_vm_clear);
+EXPORT_SYMBOL_GPL(cpu_buf_vm_clear);
-void __init check_bugs(void)
+void __init cpu_select_mitigations(void)
{
- identify_boot_cpu();
-
- /*
- * identify_boot_cpu() initialized SMT support information, let the
- * core code know.
- */
- cpu_smt_check_topology();
-
- if (!IS_ENABLED(CONFIG_SMP)) {
- pr_info("CPU: ");
- print_cpu_info(&boot_cpu_data);
- }
-
/*
* Read the SPEC_CTRL MSR to account for reserved bits which may
* have unknown values. AMD64_LS_CFG MSR is cached in the early AMD
* init code as it is not enumerated and depends on the family.
*/
- if (boot_cpu_has(X86_FEATURE_MSR_SPEC_CTRL))
- rdmsrl(MSR_IA32_SPEC_CTRL, x86_spec_ctrl_base);
+ if (cpu_feature_enabled(X86_FEATURE_MSR_SPEC_CTRL)) {
+ rdmsrq(MSR_IA32_SPEC_CTRL, x86_spec_ctrl_base);
+
+ /*
+ * Previously running kernel (kexec), may have some controls
+ * turned ON. Clear them and let the mitigations setup below
+ * rediscover them based on configuration.
+ */
+ x86_spec_ctrl_base &= ~SPEC_CTRL_MITIGATIONS_MASK;
+ }
+
+ x86_arch_cap_msr = x86_read_arch_cap_msr();
/* Select the proper CPU mitigations before patching alternatives: */
spectre_v1_select_mitigation();
spectre_v2_select_mitigation();
- /*
- * retbleed_select_mitigation() relies on the state set by
- * spectre_v2_select_mitigation(); specifically it wants to know about
- * spectre_v2=ibrs.
- */
retbleed_select_mitigation();
- /*
- * spectre_v2_user_select_mitigation() relies on the state set by
- * retbleed_select_mitigation(); specifically the STIBP selection is
- * forced for UNRET.
- */
spectre_v2_user_select_mitigation();
ssb_select_mitigation();
l1tf_select_mitigation();
- md_clear_select_mitigation();
+ mds_select_mitigation();
+ taa_select_mitigation();
+ mmio_select_mitigation();
+ rfds_select_mitigation();
srbds_select_mitigation();
l1d_flush_select_mitigation();
+ srso_select_mitigation();
+ gds_select_mitigation();
+ its_select_mitigation();
+ bhi_select_mitigation();
- arch_smt_update();
-
-#ifdef CONFIG_X86_32
/*
- * Check whether we are able to run this kernel safely on SMP.
- *
- * - i386 is no longer supported.
- * - In order to run on anything without a TSC, we need to be
- * compiled for a i486.
+ * After mitigations are selected, some may need to update their
+ * choices.
*/
- if (boot_cpu_data.x86 < 4)
- panic("Kernel requires i486+ for 'invlpg' and other features");
-
- init_utsname()->machine[1] =
- '0' + (boot_cpu_data.x86 > 6 ? 6 : boot_cpu_data.x86);
- alternative_instructions();
-
- fpu__init_check_bugs();
-#else /* CONFIG_X86_64 */
- alternative_instructions();
+ spectre_v2_update_mitigation();
+ /*
+ * retbleed_update_mitigation() relies on the state set by
+ * spectre_v2_update_mitigation(); specifically it wants to know about
+ * spectre_v2=ibrs.
+ */
+ retbleed_update_mitigation();
+ /*
+ * its_update_mitigation() depends on spectre_v2_update_mitigation()
+ * and retbleed_update_mitigation().
+ */
+ its_update_mitigation();
/*
- * Make sure the first 2MB area is not mapped by huge pages
- * There are typically fixed size MTRRs in there and overlapping
- * MTRRs into large pages causes slow downs.
- *
- * Right now we don't do that with gbpages because there seems
- * very little benefit for that case.
+ * spectre_v2_user_update_mitigation() depends on
+ * retbleed_update_mitigation(), specifically the STIBP
+ * selection is forced for UNRET or IBPB.
*/
- if (!direct_gbpages)
- set_memory_4k((unsigned long)__va(0), 1);
-#endif
+ spectre_v2_user_update_mitigation();
+ mds_update_mitigation();
+ taa_update_mitigation();
+ mmio_update_mitigation();
+ rfds_update_mitigation();
+ bhi_update_mitigation();
+ /* srso_update_mitigation() depends on retbleed_update_mitigation(). */
+ srso_update_mitigation();
+
+ spectre_v1_apply_mitigation();
+ spectre_v2_apply_mitigation();
+ retbleed_apply_mitigation();
+ spectre_v2_user_apply_mitigation();
+ ssb_apply_mitigation();
+ l1tf_apply_mitigation();
+ mds_apply_mitigation();
+ taa_apply_mitigation();
+ mmio_apply_mitigation();
+ rfds_apply_mitigation();
+ srbds_apply_mitigation();
+ srso_apply_mitigation();
+ gds_apply_mitigation();
+ its_apply_mitigation();
+ bhi_apply_mitigation();
}
/*
- * NOTE: This function is *only* called for SVM. VMX spec_ctrl handling is
- * done in vmenter.S.
+ * NOTE: This function is *only* called for SVM, since Intel uses
+ * MSR_IA32_SPEC_CTRL for SSBD.
*/
void
-x86_virt_spec_ctrl(u64 guest_spec_ctrl, u64 guest_virt_spec_ctrl, bool setguest)
+x86_virt_spec_ctrl(u64 guest_virt_spec_ctrl, bool setguest)
{
- u64 msrval, guestval = guest_spec_ctrl, hostval = spec_ctrl_current();
+ u64 guestval, hostval;
struct thread_info *ti = current_thread_info();
- if (static_cpu_has(X86_FEATURE_MSR_SPEC_CTRL)) {
- if (hostval != guestval) {
- msrval = setguest ? guestval : hostval;
- wrmsrl(MSR_IA32_SPEC_CTRL, msrval);
- }
- }
-
/*
* If SSBD is not handled in MSR_SPEC_CTRL on AMD, update
* MSR_AMD64_L2_CFG or MSR_VIRT_SPEC_CTRL if supported.
@@ -249,16 +321,17 @@ static void x86_amd_ssb_disable(void)
u64 msrval = x86_amd_ls_cfg_base | x86_amd_ls_cfg_ssbd_mask;
if (boot_cpu_has(X86_FEATURE_VIRT_SSBD))
- wrmsrl(MSR_AMD64_VIRT_SPEC_CTRL, SPEC_CTRL_SSBD);
+ wrmsrq(MSR_AMD64_VIRT_SPEC_CTRL, SPEC_CTRL_SSBD);
else if (boot_cpu_has(X86_FEATURE_LS_CFG_SSBD))
- wrmsrl(MSR_AMD64_LS_CFG, msrval);
+ wrmsrq(MSR_AMD64_LS_CFG, msrval);
}
#undef pr_fmt
#define pr_fmt(fmt) "MDS: " fmt
/* Default mitigation for MDS-affected CPUs */
-static enum mds_mitigations mds_mitigation __ro_after_init = MDS_MITIGATION_FULL;
+static enum mds_mitigations mds_mitigation __ro_after_init =
+ IS_ENABLED(CONFIG_MITIGATION_MDS) ? MDS_MITIGATION_AUTO : MDS_MITIGATION_OFF;
static bool mds_nosmt __ro_after_init = false;
static const char * const mds_strings[] = {
@@ -267,6 +340,46 @@ static const char * const mds_strings[] = {
[MDS_MITIGATION_VMWERV] = "Vulnerable: Clear CPU buffers attempted, no microcode",
};
+enum taa_mitigations {
+ TAA_MITIGATION_OFF,
+ TAA_MITIGATION_AUTO,
+ TAA_MITIGATION_UCODE_NEEDED,
+ TAA_MITIGATION_VERW,
+ TAA_MITIGATION_TSX_DISABLED,
+};
+
+/* Default mitigation for TAA-affected CPUs */
+static enum taa_mitigations taa_mitigation __ro_after_init =
+ IS_ENABLED(CONFIG_MITIGATION_TAA) ? TAA_MITIGATION_AUTO : TAA_MITIGATION_OFF;
+
+enum mmio_mitigations {
+ MMIO_MITIGATION_OFF,
+ MMIO_MITIGATION_AUTO,
+ MMIO_MITIGATION_UCODE_NEEDED,
+ MMIO_MITIGATION_VERW,
+};
+
+/* Default mitigation for Processor MMIO Stale Data vulnerabilities */
+static enum mmio_mitigations mmio_mitigation __ro_after_init =
+ IS_ENABLED(CONFIG_MITIGATION_MMIO_STALE_DATA) ? MMIO_MITIGATION_AUTO : MMIO_MITIGATION_OFF;
+
+enum rfds_mitigations {
+ RFDS_MITIGATION_OFF,
+ RFDS_MITIGATION_AUTO,
+ RFDS_MITIGATION_VERW,
+ RFDS_MITIGATION_UCODE_NEEDED,
+};
+
+/* Default mitigation for Register File Data Sampling */
+static enum rfds_mitigations rfds_mitigation __ro_after_init =
+ IS_ENABLED(CONFIG_MITIGATION_RFDS) ? RFDS_MITIGATION_AUTO : RFDS_MITIGATION_OFF;
+
+/*
+ * Set if any of MDS/TAA/MMIO/RFDS are going to enable VERW clearing
+ * through X86_FEATURE_CLEAR_CPU_BUF on kernel and guest entry.
+ */
+static bool verw_clear_cpu_buf_mitigation_selected __ro_after_init;
+
static void __init mds_select_mitigation(void)
{
if (!boot_cpu_has_bug(X86_BUG_MDS) || cpu_mitigations_off()) {
@@ -274,12 +387,37 @@ static void __init mds_select_mitigation(void)
return;
}
+ if (mds_mitigation == MDS_MITIGATION_AUTO)
+ mds_mitigation = MDS_MITIGATION_FULL;
+
+ if (mds_mitigation == MDS_MITIGATION_OFF)
+ return;
+
+ verw_clear_cpu_buf_mitigation_selected = true;
+}
+
+static void __init mds_update_mitigation(void)
+{
+ if (!boot_cpu_has_bug(X86_BUG_MDS) || cpu_mitigations_off())
+ return;
+
+ /* If TAA, MMIO, or RFDS are being mitigated, MDS gets mitigated too. */
+ if (verw_clear_cpu_buf_mitigation_selected)
+ mds_mitigation = MDS_MITIGATION_FULL;
+
if (mds_mitigation == MDS_MITIGATION_FULL) {
if (!boot_cpu_has(X86_FEATURE_MD_CLEAR))
mds_mitigation = MDS_MITIGATION_VMWERV;
+ }
- static_branch_enable(&mds_user_clear);
+ pr_info("%s\n", mds_strings[mds_mitigation]);
+}
+static void __init mds_apply_mitigation(void)
+{
+ if (mds_mitigation == MDS_MITIGATION_FULL ||
+ mds_mitigation == MDS_MITIGATION_VMWERV) {
+ setup_force_cpu_cap(X86_FEATURE_CLEAR_CPU_BUF);
if (!boot_cpu_has(X86_BUG_MSBDS_ONLY) &&
(mds_nosmt || cpu_mitigations_auto_nosmt()))
cpu_smt_disable(false);
@@ -310,15 +448,6 @@ early_param("mds", mds_cmdline);
#undef pr_fmt
#define pr_fmt(fmt) "TAA: " fmt
-enum taa_mitigations {
- TAA_MITIGATION_OFF,
- TAA_MITIGATION_UCODE_NEEDED,
- TAA_MITIGATION_VERW,
- TAA_MITIGATION_TSX_DISABLED,
-};
-
-/* Default mitigation for TAA-affected CPUs */
-static enum taa_mitigations taa_mitigation __ro_after_init = TAA_MITIGATION_VERW;
static bool taa_nosmt __ro_after_init;
static const char * const taa_strings[] = {
@@ -328,10 +457,13 @@ static const char * const taa_strings[] = {
[TAA_MITIGATION_TSX_DISABLED] = "Mitigation: TSX disabled",
};
-static void __init taa_select_mitigation(void)
+static bool __init taa_vulnerable(void)
{
- u64 ia32_cap;
+ return boot_cpu_has_bug(X86_BUG_TAA) && boot_cpu_has(X86_FEATURE_RTM);
+}
+static void __init taa_select_mitigation(void)
+{
if (!boot_cpu_has_bug(X86_BUG_TAA)) {
taa_mitigation = TAA_MITIGATION_OFF;
return;
@@ -343,49 +475,63 @@ static void __init taa_select_mitigation(void)
return;
}
- if (cpu_mitigations_off()) {
+ if (cpu_mitigations_off())
taa_mitigation = TAA_MITIGATION_OFF;
- return;
- }
- /*
- * TAA mitigation via VERW is turned off if both
- * tsx_async_abort=off and mds=off are specified.
- */
- if (taa_mitigation == TAA_MITIGATION_OFF &&
- mds_mitigation == MDS_MITIGATION_OFF)
+ /* Microcode will be checked in taa_update_mitigation(). */
+ if (taa_mitigation == TAA_MITIGATION_AUTO)
+ taa_mitigation = TAA_MITIGATION_VERW;
+
+ if (taa_mitigation != TAA_MITIGATION_OFF)
+ verw_clear_cpu_buf_mitigation_selected = true;
+}
+
+static void __init taa_update_mitigation(void)
+{
+ if (!taa_vulnerable() || cpu_mitigations_off())
return;
- if (boot_cpu_has(X86_FEATURE_MD_CLEAR))
+ if (verw_clear_cpu_buf_mitigation_selected)
taa_mitigation = TAA_MITIGATION_VERW;
- else
- taa_mitigation = TAA_MITIGATION_UCODE_NEEDED;
- /*
- * VERW doesn't clear the CPU buffers when MD_CLEAR=1 and MDS_NO=1.
- * A microcode update fixes this behavior to clear CPU buffers. It also
- * adds support for MSR_IA32_TSX_CTRL which is enumerated by the
- * ARCH_CAP_TSX_CTRL_MSR bit.
- *
- * On MDS_NO=1 CPUs if ARCH_CAP_TSX_CTRL_MSR is not set, microcode
- * update is required.
- */
- ia32_cap = x86_read_arch_cap_msr();
- if ( (ia32_cap & ARCH_CAP_MDS_NO) &&
- !(ia32_cap & ARCH_CAP_TSX_CTRL_MSR))
- taa_mitigation = TAA_MITIGATION_UCODE_NEEDED;
+ if (taa_mitigation == TAA_MITIGATION_VERW) {
+ /* Check if the requisite ucode is available. */
+ if (!boot_cpu_has(X86_FEATURE_MD_CLEAR))
+ taa_mitigation = TAA_MITIGATION_UCODE_NEEDED;
- /*
- * TSX is enabled, select alternate mitigation for TAA which is
- * the same as MDS. Enable MDS static branch to clear CPU buffers.
- *
- * For guests that can't determine whether the correct microcode is
- * present on host, enable the mitigation for UCODE_NEEDED as well.
- */
- static_branch_enable(&mds_user_clear);
+ /*
+ * VERW doesn't clear the CPU buffers when MD_CLEAR=1 and MDS_NO=1.
+ * A microcode update fixes this behavior to clear CPU buffers. It also
+ * adds support for MSR_IA32_TSX_CTRL which is enumerated by the
+ * ARCH_CAP_TSX_CTRL_MSR bit.
+ *
+ * On MDS_NO=1 CPUs if ARCH_CAP_TSX_CTRL_MSR is not set, microcode
+ * update is required.
+ */
+ if ((x86_arch_cap_msr & ARCH_CAP_MDS_NO) &&
+ !(x86_arch_cap_msr & ARCH_CAP_TSX_CTRL_MSR))
+ taa_mitigation = TAA_MITIGATION_UCODE_NEEDED;
+ }
- if (taa_nosmt || cpu_mitigations_auto_nosmt())
- cpu_smt_disable(false);
+ pr_info("%s\n", taa_strings[taa_mitigation]);
+}
+
+static void __init taa_apply_mitigation(void)
+{
+ if (taa_mitigation == TAA_MITIGATION_VERW ||
+ taa_mitigation == TAA_MITIGATION_UCODE_NEEDED) {
+ /*
+ * TSX is enabled, select alternate mitigation for TAA which is
+ * the same as MDS. Enable MDS static branch to clear CPU buffers.
+ *
+ * For guests that can't determine whether the correct microcode is
+ * present on host, enable the mitigation for UCODE_NEEDED as well.
+ */
+ setup_force_cpu_cap(X86_FEATURE_CLEAR_CPU_BUF);
+
+ if (taa_nosmt || cpu_mitigations_auto_nosmt())
+ cpu_smt_disable(false);
+ }
}
static int __init tsx_async_abort_parse_cmdline(char *str)
@@ -412,14 +558,6 @@ early_param("tsx_async_abort", tsx_async_abort_parse_cmdline);
#undef pr_fmt
#define pr_fmt(fmt) "MMIO Stale Data: " fmt
-enum mmio_mitigations {
- MMIO_MITIGATION_OFF,
- MMIO_MITIGATION_UCODE_NEEDED,
- MMIO_MITIGATION_VERW,
-};
-
-/* Default mitigation for Processor MMIO Stale Data vulnerabilities */
-static enum mmio_mitigations mmio_mitigation __ro_after_init = MMIO_MITIGATION_VERW;
static bool mmio_nosmt __ro_after_init = false;
static const char * const mmio_strings[] = {
@@ -430,52 +568,77 @@ static const char * const mmio_strings[] = {
static void __init mmio_select_mitigation(void)
{
- u64 ia32_cap;
-
if (!boot_cpu_has_bug(X86_BUG_MMIO_STALE_DATA) ||
- cpu_mitigations_off()) {
+ cpu_mitigations_off()) {
mmio_mitigation = MMIO_MITIGATION_OFF;
return;
}
+ /* Microcode will be checked in mmio_update_mitigation(). */
+ if (mmio_mitigation == MMIO_MITIGATION_AUTO)
+ mmio_mitigation = MMIO_MITIGATION_VERW;
+
if (mmio_mitigation == MMIO_MITIGATION_OFF)
return;
- ia32_cap = x86_read_arch_cap_msr();
-
/*
* Enable CPU buffer clear mitigation for host and VMM, if also affected
- * by MDS or TAA. Otherwise, enable mitigation for VMM only.
+ * by MDS or TAA.
*/
- if (boot_cpu_has_bug(X86_BUG_MDS) || (boot_cpu_has_bug(X86_BUG_TAA) &&
- boot_cpu_has(X86_FEATURE_RTM)))
- static_branch_enable(&mds_user_clear);
- else
- static_branch_enable(&mmio_stale_data_clear);
+ if (boot_cpu_has_bug(X86_BUG_MDS) || taa_vulnerable())
+ verw_clear_cpu_buf_mitigation_selected = true;
+}
+
+static void __init mmio_update_mitigation(void)
+{
+ if (!boot_cpu_has_bug(X86_BUG_MMIO_STALE_DATA) || cpu_mitigations_off())
+ return;
+
+ if (verw_clear_cpu_buf_mitigation_selected)
+ mmio_mitigation = MMIO_MITIGATION_VERW;
+
+ if (mmio_mitigation == MMIO_MITIGATION_VERW) {
+ /*
+ * Check if the system has the right microcode.
+ *
+ * CPU Fill buffer clear mitigation is enumerated by either an explicit
+ * FB_CLEAR or by the presence of both MD_CLEAR and L1D_FLUSH on MDS
+ * affected systems.
+ */
+ if (!((x86_arch_cap_msr & ARCH_CAP_FB_CLEAR) ||
+ (boot_cpu_has(X86_FEATURE_MD_CLEAR) &&
+ boot_cpu_has(X86_FEATURE_FLUSH_L1D) &&
+ !(x86_arch_cap_msr & ARCH_CAP_MDS_NO))))
+ mmio_mitigation = MMIO_MITIGATION_UCODE_NEEDED;
+ }
+
+ pr_info("%s\n", mmio_strings[mmio_mitigation]);
+}
+
+static void __init mmio_apply_mitigation(void)
+{
+ if (mmio_mitigation == MMIO_MITIGATION_OFF)
+ return;
+
+ /*
+ * Only enable the VMM mitigation if the CPU buffer clear mitigation is
+ * not being used.
+ */
+ if (verw_clear_cpu_buf_mitigation_selected) {
+ setup_force_cpu_cap(X86_FEATURE_CLEAR_CPU_BUF);
+ static_branch_disable(&cpu_buf_vm_clear);
+ } else {
+ static_branch_enable(&cpu_buf_vm_clear);
+ }
/*
* If Processor-MMIO-Stale-Data bug is present and Fill Buffer data can
* be propagated to uncore buffers, clearing the Fill buffers on idle
* is required irrespective of SMT state.
*/
- if (!(ia32_cap & ARCH_CAP_FBSDP_NO))
+ if (!(x86_arch_cap_msr & ARCH_CAP_FBSDP_NO))
static_branch_enable(&mds_idle_clear);
- /*
- * Check if the system has the right microcode.
- *
- * CPU Fill buffer clear mitigation is enumerated by either an explicit
- * FB_CLEAR or by the presence of both MD_CLEAR and L1D_FLUSH on MDS
- * affected systems.
- */
- if ((ia32_cap & ARCH_CAP_FB_CLEAR) ||
- (boot_cpu_has(X86_FEATURE_MD_CLEAR) &&
- boot_cpu_has(X86_FEATURE_FLUSH_L1D) &&
- !(ia32_cap & ARCH_CAP_MDS_NO)))
- mmio_mitigation = MMIO_MITIGATION_VERW;
- else
- mmio_mitigation = MMIO_MITIGATION_UCODE_NEEDED;
-
if (mmio_nosmt || cpu_mitigations_auto_nosmt())
cpu_smt_disable(false);
}
@@ -502,70 +665,89 @@ static int __init mmio_stale_data_parse_cmdline(char *str)
early_param("mmio_stale_data", mmio_stale_data_parse_cmdline);
#undef pr_fmt
-#define pr_fmt(fmt) "" fmt
+#define pr_fmt(fmt) "Register File Data Sampling: " fmt
-static void __init md_clear_update_mitigation(void)
+static const char * const rfds_strings[] = {
+ [RFDS_MITIGATION_OFF] = "Vulnerable",
+ [RFDS_MITIGATION_VERW] = "Mitigation: Clear Register File",
+ [RFDS_MITIGATION_UCODE_NEEDED] = "Vulnerable: No microcode",
+};
+
+static inline bool __init verw_clears_cpu_reg_file(void)
{
- if (cpu_mitigations_off())
+ return (x86_arch_cap_msr & ARCH_CAP_RFDS_CLEAR);
+}
+
+static void __init rfds_select_mitigation(void)
+{
+ if (!boot_cpu_has_bug(X86_BUG_RFDS) || cpu_mitigations_off()) {
+ rfds_mitigation = RFDS_MITIGATION_OFF;
return;
+ }
- if (!static_key_enabled(&mds_user_clear))
- goto out;
+ if (rfds_mitigation == RFDS_MITIGATION_AUTO)
+ rfds_mitigation = RFDS_MITIGATION_VERW;
- /*
- * mds_user_clear is now enabled. Update MDS, TAA and MMIO Stale Data
- * mitigation, if necessary.
- */
- if (mds_mitigation == MDS_MITIGATION_OFF &&
- boot_cpu_has_bug(X86_BUG_MDS)) {
- mds_mitigation = MDS_MITIGATION_FULL;
- mds_select_mitigation();
- }
- if (taa_mitigation == TAA_MITIGATION_OFF &&
- boot_cpu_has_bug(X86_BUG_TAA)) {
- taa_mitigation = TAA_MITIGATION_VERW;
- taa_select_mitigation();
- }
- if (mmio_mitigation == MMIO_MITIGATION_OFF &&
- boot_cpu_has_bug(X86_BUG_MMIO_STALE_DATA)) {
- mmio_mitigation = MMIO_MITIGATION_VERW;
- mmio_select_mitigation();
+ if (rfds_mitigation == RFDS_MITIGATION_OFF)
+ return;
+
+ if (verw_clears_cpu_reg_file())
+ verw_clear_cpu_buf_mitigation_selected = true;
+}
+
+static void __init rfds_update_mitigation(void)
+{
+ if (!boot_cpu_has_bug(X86_BUG_RFDS) || cpu_mitigations_off())
+ return;
+
+ if (verw_clear_cpu_buf_mitigation_selected)
+ rfds_mitigation = RFDS_MITIGATION_VERW;
+
+ if (rfds_mitigation == RFDS_MITIGATION_VERW) {
+ if (!verw_clears_cpu_reg_file())
+ rfds_mitigation = RFDS_MITIGATION_UCODE_NEEDED;
}
-out:
- if (boot_cpu_has_bug(X86_BUG_MDS))
- pr_info("MDS: %s\n", mds_strings[mds_mitigation]);
- if (boot_cpu_has_bug(X86_BUG_TAA))
- pr_info("TAA: %s\n", taa_strings[taa_mitigation]);
- if (boot_cpu_has_bug(X86_BUG_MMIO_STALE_DATA))
- pr_info("MMIO Stale Data: %s\n", mmio_strings[mmio_mitigation]);
+
+ pr_info("%s\n", rfds_strings[rfds_mitigation]);
}
-static void __init md_clear_select_mitigation(void)
+static void __init rfds_apply_mitigation(void)
{
- mds_select_mitigation();
- taa_select_mitigation();
- mmio_select_mitigation();
+ if (rfds_mitigation == RFDS_MITIGATION_VERW)
+ setup_force_cpu_cap(X86_FEATURE_CLEAR_CPU_BUF);
+}
- /*
- * As MDS, TAA and MMIO Stale Data mitigations are inter-related, update
- * and print their mitigation after MDS, TAA and MMIO Stale Data
- * mitigation selection is done.
- */
- md_clear_update_mitigation();
+static __init int rfds_parse_cmdline(char *str)
+{
+ if (!str)
+ return -EINVAL;
+
+ if (!boot_cpu_has_bug(X86_BUG_RFDS))
+ return 0;
+
+ if (!strcmp(str, "off"))
+ rfds_mitigation = RFDS_MITIGATION_OFF;
+ else if (!strcmp(str, "on"))
+ rfds_mitigation = RFDS_MITIGATION_VERW;
+
+ return 0;
}
+early_param("reg_file_data_sampling", rfds_parse_cmdline);
#undef pr_fmt
#define pr_fmt(fmt) "SRBDS: " fmt
enum srbds_mitigations {
SRBDS_MITIGATION_OFF,
+ SRBDS_MITIGATION_AUTO,
SRBDS_MITIGATION_UCODE_NEEDED,
SRBDS_MITIGATION_FULL,
SRBDS_MITIGATION_TSX_OFF,
SRBDS_MITIGATION_HYPERVISOR,
};
-static enum srbds_mitigations srbds_mitigation __ro_after_init = SRBDS_MITIGATION_FULL;
+static enum srbds_mitigations srbds_mitigation __ro_after_init =
+ IS_ENABLED(CONFIG_MITIGATION_SRBDS) ? SRBDS_MITIGATION_AUTO : SRBDS_MITIGATION_OFF;
static const char * const srbds_strings[] = {
[SRBDS_MITIGATION_OFF] = "Vulnerable",
@@ -597,7 +779,7 @@ void update_srbds_msr(void)
if (!boot_cpu_has(X86_FEATURE_SRBDS_CTRL))
return;
- rdmsrl(MSR_IA32_MCU_OPT_CTRL, mcu_ctrl);
+ rdmsrq(MSR_IA32_MCU_OPT_CTRL, mcu_ctrl);
switch (srbds_mitigation) {
case SRBDS_MITIGATION_OFF:
@@ -611,36 +793,42 @@ void update_srbds_msr(void)
break;
}
- wrmsrl(MSR_IA32_MCU_OPT_CTRL, mcu_ctrl);
+ wrmsrq(MSR_IA32_MCU_OPT_CTRL, mcu_ctrl);
}
static void __init srbds_select_mitigation(void)
{
- u64 ia32_cap;
-
- if (!boot_cpu_has_bug(X86_BUG_SRBDS))
+ if (!boot_cpu_has_bug(X86_BUG_SRBDS) || cpu_mitigations_off()) {
+ srbds_mitigation = SRBDS_MITIGATION_OFF;
return;
+ }
+
+ if (srbds_mitigation == SRBDS_MITIGATION_AUTO)
+ srbds_mitigation = SRBDS_MITIGATION_FULL;
/*
* Check to see if this is one of the MDS_NO systems supporting TSX that
* are only exposed to SRBDS when TSX is enabled or when CPU is affected
* by Processor MMIO Stale Data vulnerability.
*/
- ia32_cap = x86_read_arch_cap_msr();
- if ((ia32_cap & ARCH_CAP_MDS_NO) && !boot_cpu_has(X86_FEATURE_RTM) &&
+ if ((x86_arch_cap_msr & ARCH_CAP_MDS_NO) && !boot_cpu_has(X86_FEATURE_RTM) &&
!boot_cpu_has_bug(X86_BUG_MMIO_STALE_DATA))
srbds_mitigation = SRBDS_MITIGATION_TSX_OFF;
else if (boot_cpu_has(X86_FEATURE_HYPERVISOR))
srbds_mitigation = SRBDS_MITIGATION_HYPERVISOR;
else if (!boot_cpu_has(X86_FEATURE_SRBDS_CTRL))
srbds_mitigation = SRBDS_MITIGATION_UCODE_NEEDED;
- else if (cpu_mitigations_off() || srbds_off)
+ else if (srbds_off)
srbds_mitigation = SRBDS_MITIGATION_OFF;
- update_srbds_msr();
pr_info("%s\n", srbds_strings[srbds_mitigation]);
}
+static void __init srbds_apply_mitigation(void)
+{
+ update_srbds_msr();
+}
+
static int __init srbds_parse_cmdline(char *str)
{
if (!str)
@@ -683,6 +871,159 @@ static int __init l1d_flush_parse_cmdline(char *str)
early_param("l1d_flush", l1d_flush_parse_cmdline);
#undef pr_fmt
+#define pr_fmt(fmt) "GDS: " fmt
+
+enum gds_mitigations {
+ GDS_MITIGATION_OFF,
+ GDS_MITIGATION_AUTO,
+ GDS_MITIGATION_UCODE_NEEDED,
+ GDS_MITIGATION_FORCE,
+ GDS_MITIGATION_FULL,
+ GDS_MITIGATION_FULL_LOCKED,
+ GDS_MITIGATION_HYPERVISOR,
+};
+
+static enum gds_mitigations gds_mitigation __ro_after_init =
+ IS_ENABLED(CONFIG_MITIGATION_GDS) ? GDS_MITIGATION_AUTO : GDS_MITIGATION_OFF;
+
+static const char * const gds_strings[] = {
+ [GDS_MITIGATION_OFF] = "Vulnerable",
+ [GDS_MITIGATION_UCODE_NEEDED] = "Vulnerable: No microcode",
+ [GDS_MITIGATION_FORCE] = "Mitigation: AVX disabled, no microcode",
+ [GDS_MITIGATION_FULL] = "Mitigation: Microcode",
+ [GDS_MITIGATION_FULL_LOCKED] = "Mitigation: Microcode (locked)",
+ [GDS_MITIGATION_HYPERVISOR] = "Unknown: Dependent on hypervisor status",
+};
+
+bool gds_ucode_mitigated(void)
+{
+ return (gds_mitigation == GDS_MITIGATION_FULL ||
+ gds_mitigation == GDS_MITIGATION_FULL_LOCKED);
+}
+EXPORT_SYMBOL_GPL(gds_ucode_mitigated);
+
+void update_gds_msr(void)
+{
+ u64 mcu_ctrl_after;
+ u64 mcu_ctrl;
+
+ switch (gds_mitigation) {
+ case GDS_MITIGATION_OFF:
+ rdmsrq(MSR_IA32_MCU_OPT_CTRL, mcu_ctrl);
+ mcu_ctrl |= GDS_MITG_DIS;
+ break;
+ case GDS_MITIGATION_FULL_LOCKED:
+ /*
+ * The LOCKED state comes from the boot CPU. APs might not have
+ * the same state. Make sure the mitigation is enabled on all
+ * CPUs.
+ */
+ case GDS_MITIGATION_FULL:
+ rdmsrq(MSR_IA32_MCU_OPT_CTRL, mcu_ctrl);
+ mcu_ctrl &= ~GDS_MITG_DIS;
+ break;
+ case GDS_MITIGATION_FORCE:
+ case GDS_MITIGATION_UCODE_NEEDED:
+ case GDS_MITIGATION_HYPERVISOR:
+ case GDS_MITIGATION_AUTO:
+ return;
+ }
+
+ wrmsrq(MSR_IA32_MCU_OPT_CTRL, mcu_ctrl);
+
+ /*
+ * Check to make sure that the WRMSR value was not ignored. Writes to
+ * GDS_MITG_DIS will be ignored if this processor is locked but the boot
+ * processor was not.
+ */
+ rdmsrq(MSR_IA32_MCU_OPT_CTRL, mcu_ctrl_after);
+ WARN_ON_ONCE(mcu_ctrl != mcu_ctrl_after);
+}
+
+static void __init gds_select_mitigation(void)
+{
+ u64 mcu_ctrl;
+
+ if (!boot_cpu_has_bug(X86_BUG_GDS))
+ return;
+
+ if (boot_cpu_has(X86_FEATURE_HYPERVISOR)) {
+ gds_mitigation = GDS_MITIGATION_HYPERVISOR;
+ return;
+ }
+
+ if (cpu_mitigations_off())
+ gds_mitigation = GDS_MITIGATION_OFF;
+ /* Will verify below that mitigation _can_ be disabled */
+
+ if (gds_mitigation == GDS_MITIGATION_AUTO)
+ gds_mitigation = GDS_MITIGATION_FULL;
+
+ /* No microcode */
+ if (!(x86_arch_cap_msr & ARCH_CAP_GDS_CTRL)) {
+ if (gds_mitigation != GDS_MITIGATION_FORCE)
+ gds_mitigation = GDS_MITIGATION_UCODE_NEEDED;
+ return;
+ }
+
+ /* Microcode has mitigation, use it */
+ if (gds_mitigation == GDS_MITIGATION_FORCE)
+ gds_mitigation = GDS_MITIGATION_FULL;
+
+ rdmsrq(MSR_IA32_MCU_OPT_CTRL, mcu_ctrl);
+ if (mcu_ctrl & GDS_MITG_LOCKED) {
+ if (gds_mitigation == GDS_MITIGATION_OFF)
+ pr_warn("Mitigation locked. Disable failed.\n");
+
+ /*
+ * The mitigation is selected from the boot CPU. All other CPUs
+ * _should_ have the same state. If the boot CPU isn't locked
+ * but others are then update_gds_msr() will WARN() of the state
+ * mismatch. If the boot CPU is locked update_gds_msr() will
+ * ensure the other CPUs have the mitigation enabled.
+ */
+ gds_mitigation = GDS_MITIGATION_FULL_LOCKED;
+ }
+}
+
+static void __init gds_apply_mitigation(void)
+{
+ if (!boot_cpu_has_bug(X86_BUG_GDS))
+ return;
+
+ /* Microcode is present */
+ if (x86_arch_cap_msr & ARCH_CAP_GDS_CTRL)
+ update_gds_msr();
+ else if (gds_mitigation == GDS_MITIGATION_FORCE) {
+ /*
+ * This only needs to be done on the boot CPU so do it
+ * here rather than in update_gds_msr()
+ */
+ setup_clear_cpu_cap(X86_FEATURE_AVX);
+ pr_warn("Microcode update needed! Disabling AVX as mitigation.\n");
+ }
+
+ pr_info("%s\n", gds_strings[gds_mitigation]);
+}
+
+static int __init gds_parse_cmdline(char *str)
+{
+ if (!str)
+ return -EINVAL;
+
+ if (!boot_cpu_has_bug(X86_BUG_GDS))
+ return 0;
+
+ if (!strcmp(str, "off"))
+ gds_mitigation = GDS_MITIGATION_OFF;
+ else if (!strcmp(str, "force"))
+ gds_mitigation = GDS_MITIGATION_FORCE;
+
+ return 0;
+}
+early_param("gather_data_sampling", gds_parse_cmdline);
+
+#undef pr_fmt
#define pr_fmt(fmt) "Spectre V1 : " fmt
enum spectre_v1_mitigation {
@@ -691,7 +1032,8 @@ enum spectre_v1_mitigation {
};
static enum spectre_v1_mitigation spectre_v1_mitigation __ro_after_init =
- SPECTRE_V1_MITIGATION_AUTO;
+ IS_ENABLED(CONFIG_MITIGATION_SPECTRE_V1) ?
+ SPECTRE_V1_MITIGATION_AUTO : SPECTRE_V1_MITIGATION_NONE;
static const char * const spectre_v1_strings[] = {
[SPECTRE_V1_MITIGATION_NONE] = "Vulnerable: __user pointer sanitization and usercopy barriers only; no swapgs barriers",
@@ -721,10 +1063,14 @@ static bool smap_works_speculatively(void)
static void __init spectre_v1_select_mitigation(void)
{
- if (!boot_cpu_has_bug(X86_BUG_SPECTRE_V1) || cpu_mitigations_off()) {
+ if (!boot_cpu_has_bug(X86_BUG_SPECTRE_V1) || cpu_mitigations_off())
spectre_v1_mitigation = SPECTRE_V1_MITIGATION_NONE;
+}
+
+static void __init spectre_v1_apply_mitigation(void)
+{
+ if (!boot_cpu_has_bug(X86_BUG_SPECTRE_V1) || cpu_mitigations_off())
return;
- }
if (spectre_v1_mitigation == SPECTRE_V1_MITIGATION_AUTO) {
/*
@@ -772,25 +1118,30 @@ static int __init nospectre_v1_cmdline(char *str)
}
early_param("nospectre_v1", nospectre_v1_cmdline);
-static enum spectre_v2_mitigation spectre_v2_enabled __ro_after_init =
- SPECTRE_V2_NONE;
+enum spectre_v2_mitigation spectre_v2_enabled __ro_after_init = SPECTRE_V2_NONE;
#undef pr_fmt
#define pr_fmt(fmt) "RETBleed: " fmt
+enum its_mitigation {
+ ITS_MITIGATION_OFF,
+ ITS_MITIGATION_AUTO,
+ ITS_MITIGATION_VMEXIT_ONLY,
+ ITS_MITIGATION_ALIGNED_THUNKS,
+ ITS_MITIGATION_RETPOLINE_STUFF,
+};
+
+static enum its_mitigation its_mitigation __ro_after_init =
+ IS_ENABLED(CONFIG_MITIGATION_ITS) ? ITS_MITIGATION_AUTO : ITS_MITIGATION_OFF;
+
enum retbleed_mitigation {
RETBLEED_MITIGATION_NONE,
+ RETBLEED_MITIGATION_AUTO,
RETBLEED_MITIGATION_UNRET,
RETBLEED_MITIGATION_IBPB,
RETBLEED_MITIGATION_IBRS,
RETBLEED_MITIGATION_EIBRS,
-};
-
-enum retbleed_mitigation_cmd {
- RETBLEED_CMD_OFF,
- RETBLEED_CMD_AUTO,
- RETBLEED_CMD_UNRET,
- RETBLEED_CMD_IBPB,
+ RETBLEED_MITIGATION_STUFF,
};
static const char * const retbleed_strings[] = {
@@ -799,12 +1150,11 @@ static const char * const retbleed_strings[] = {
[RETBLEED_MITIGATION_IBPB] = "Mitigation: IBPB",
[RETBLEED_MITIGATION_IBRS] = "Mitigation: IBRS",
[RETBLEED_MITIGATION_EIBRS] = "Mitigation: Enhanced IBRS",
+ [RETBLEED_MITIGATION_STUFF] = "Mitigation: Stuffing",
};
static enum retbleed_mitigation retbleed_mitigation __ro_after_init =
- RETBLEED_MITIGATION_NONE;
-static enum retbleed_mitigation_cmd retbleed_cmd __ro_after_init =
- RETBLEED_CMD_AUTO;
+ IS_ENABLED(CONFIG_MITIGATION_RETBLEED) ? RETBLEED_MITIGATION_AUTO : RETBLEED_MITIGATION_NONE;
static int __ro_after_init retbleed_nosmt = false;
@@ -821,15 +1171,19 @@ static int __init retbleed_parse_cmdline(char *str)
}
if (!strcmp(str, "off")) {
- retbleed_cmd = RETBLEED_CMD_OFF;
+ retbleed_mitigation = RETBLEED_MITIGATION_NONE;
} else if (!strcmp(str, "auto")) {
- retbleed_cmd = RETBLEED_CMD_AUTO;
+ retbleed_mitigation = RETBLEED_MITIGATION_AUTO;
} else if (!strcmp(str, "unret")) {
- retbleed_cmd = RETBLEED_CMD_UNRET;
+ retbleed_mitigation = RETBLEED_MITIGATION_UNRET;
} else if (!strcmp(str, "ibpb")) {
- retbleed_cmd = RETBLEED_CMD_IBPB;
+ retbleed_mitigation = RETBLEED_MITIGATION_IBPB;
+ } else if (!strcmp(str, "stuff")) {
+ retbleed_mitigation = RETBLEED_MITIGATION_STUFF;
} else if (!strcmp(str, "nosmt")) {
retbleed_nosmt = true;
+ } else if (!strcmp(str, "force")) {
+ setup_force_cpu_bug(X86_BUG_RETBLEED);
} else {
pr_err("Ignoring unknown retbleed option (%s).", str);
}
@@ -846,61 +1200,123 @@ early_param("retbleed", retbleed_parse_cmdline);
static void __init retbleed_select_mitigation(void)
{
- bool mitigate_smt = false;
-
- if (!boot_cpu_has_bug(X86_BUG_RETBLEED) || cpu_mitigations_off())
- return;
-
- switch (retbleed_cmd) {
- case RETBLEED_CMD_OFF:
+ if (!boot_cpu_has_bug(X86_BUG_RETBLEED) || cpu_mitigations_off()) {
+ retbleed_mitigation = RETBLEED_MITIGATION_NONE;
return;
+ }
- case RETBLEED_CMD_UNRET:
- if (IS_ENABLED(CONFIG_CPU_UNRET_ENTRY)) {
- retbleed_mitigation = RETBLEED_MITIGATION_UNRET;
- } else {
- pr_err("WARNING: kernel not compiled with CPU_UNRET_ENTRY.\n");
- goto do_cmd_auto;
+ switch (retbleed_mitigation) {
+ case RETBLEED_MITIGATION_UNRET:
+ if (!IS_ENABLED(CONFIG_MITIGATION_UNRET_ENTRY)) {
+ retbleed_mitigation = RETBLEED_MITIGATION_AUTO;
+ pr_err("WARNING: kernel not compiled with MITIGATION_UNRET_ENTRY.\n");
}
break;
-
- case RETBLEED_CMD_IBPB:
+ case RETBLEED_MITIGATION_IBPB:
if (!boot_cpu_has(X86_FEATURE_IBPB)) {
pr_err("WARNING: CPU does not support IBPB.\n");
- goto do_cmd_auto;
- } else if (IS_ENABLED(CONFIG_CPU_IBPB_ENTRY)) {
- retbleed_mitigation = RETBLEED_MITIGATION_IBPB;
- } else {
- pr_err("WARNING: kernel not compiled with CPU_IBPB_ENTRY.\n");
- goto do_cmd_auto;
+ retbleed_mitigation = RETBLEED_MITIGATION_AUTO;
+ } else if (!IS_ENABLED(CONFIG_MITIGATION_IBPB_ENTRY)) {
+ pr_err("WARNING: kernel not compiled with MITIGATION_IBPB_ENTRY.\n");
+ retbleed_mitigation = RETBLEED_MITIGATION_AUTO;
}
break;
-
-do_cmd_auto:
- case RETBLEED_CMD_AUTO:
- default:
- if (boot_cpu_data.x86_vendor == X86_VENDOR_AMD ||
- boot_cpu_data.x86_vendor == X86_VENDOR_HYGON) {
- if (IS_ENABLED(CONFIG_CPU_UNRET_ENTRY))
- retbleed_mitigation = RETBLEED_MITIGATION_UNRET;
- else if (IS_ENABLED(CONFIG_CPU_IBPB_ENTRY) && boot_cpu_has(X86_FEATURE_IBPB))
- retbleed_mitigation = RETBLEED_MITIGATION_IBPB;
+ case RETBLEED_MITIGATION_STUFF:
+ if (!IS_ENABLED(CONFIG_MITIGATION_CALL_DEPTH_TRACKING)) {
+ pr_err("WARNING: kernel not compiled with MITIGATION_CALL_DEPTH_TRACKING.\n");
+ retbleed_mitigation = RETBLEED_MITIGATION_AUTO;
+ } else if (boot_cpu_data.x86_vendor != X86_VENDOR_INTEL) {
+ pr_err("WARNING: retbleed=stuff only supported for Intel CPUs.\n");
+ retbleed_mitigation = RETBLEED_MITIGATION_AUTO;
}
+ break;
+ default:
+ break;
+ }
- /*
- * The Intel mitigation (IBRS or eIBRS) was already selected in
- * spectre_v2_select_mitigation(). 'retbleed_mitigation' will
- * be set accordingly below.
- */
+ if (retbleed_mitigation != RETBLEED_MITIGATION_AUTO)
+ return;
- break;
+ /* Intel mitigation selected in retbleed_update_mitigation() */
+ if (boot_cpu_data.x86_vendor == X86_VENDOR_AMD ||
+ boot_cpu_data.x86_vendor == X86_VENDOR_HYGON) {
+ if (IS_ENABLED(CONFIG_MITIGATION_UNRET_ENTRY))
+ retbleed_mitigation = RETBLEED_MITIGATION_UNRET;
+ else if (IS_ENABLED(CONFIG_MITIGATION_IBPB_ENTRY) &&
+ boot_cpu_has(X86_FEATURE_IBPB))
+ retbleed_mitigation = RETBLEED_MITIGATION_IBPB;
+ else
+ retbleed_mitigation = RETBLEED_MITIGATION_NONE;
+ }
+}
+
+static void __init retbleed_update_mitigation(void)
+{
+ if (!boot_cpu_has_bug(X86_BUG_RETBLEED) || cpu_mitigations_off())
+ return;
+
+ if (retbleed_mitigation == RETBLEED_MITIGATION_NONE)
+ goto out;
+
+ /*
+ * retbleed=stuff is only allowed on Intel. If stuffing can't be used
+ * then a different mitigation will be selected below.
+ *
+ * its=stuff will also attempt to enable stuffing.
+ */
+ if (retbleed_mitigation == RETBLEED_MITIGATION_STUFF ||
+ its_mitigation == ITS_MITIGATION_RETPOLINE_STUFF) {
+ if (spectre_v2_enabled != SPECTRE_V2_RETPOLINE) {
+ pr_err("WARNING: retbleed=stuff depends on spectre_v2=retpoline\n");
+ retbleed_mitigation = RETBLEED_MITIGATION_AUTO;
+ } else {
+ if (retbleed_mitigation != RETBLEED_MITIGATION_STUFF)
+ pr_info("Retbleed mitigation updated to stuffing\n");
+
+ retbleed_mitigation = RETBLEED_MITIGATION_STUFF;
+ }
+ }
+ /*
+ * Let IBRS trump all on Intel without affecting the effects of the
+ * retbleed= cmdline option except for call depth based stuffing
+ */
+ if (boot_cpu_data.x86_vendor == X86_VENDOR_INTEL) {
+ switch (spectre_v2_enabled) {
+ case SPECTRE_V2_IBRS:
+ retbleed_mitigation = RETBLEED_MITIGATION_IBRS;
+ break;
+ case SPECTRE_V2_EIBRS:
+ case SPECTRE_V2_EIBRS_RETPOLINE:
+ case SPECTRE_V2_EIBRS_LFENCE:
+ retbleed_mitigation = RETBLEED_MITIGATION_EIBRS;
+ break;
+ default:
+ if (retbleed_mitigation != RETBLEED_MITIGATION_STUFF)
+ pr_err(RETBLEED_INTEL_MSG);
+ }
+ /* If nothing has set the mitigation yet, default to NONE. */
+ if (retbleed_mitigation == RETBLEED_MITIGATION_AUTO)
+ retbleed_mitigation = RETBLEED_MITIGATION_NONE;
}
+out:
+ pr_info("%s\n", retbleed_strings[retbleed_mitigation]);
+}
+
+
+static void __init retbleed_apply_mitigation(void)
+{
+ bool mitigate_smt = false;
switch (retbleed_mitigation) {
+ case RETBLEED_MITIGATION_NONE:
+ return;
+
case RETBLEED_MITIGATION_UNRET:
setup_force_cpu_cap(X86_FEATURE_RETHUNK);
setup_force_cpu_cap(X86_FEATURE_UNRET);
+ set_return_thunk(retbleed_return_thunk);
+
if (boot_cpu_data.x86_vendor != X86_VENDOR_AMD &&
boot_cpu_data.x86_vendor != X86_VENDOR_HYGON)
pr_err(RETBLEED_UNTRAIN_MSG);
@@ -910,7 +1326,31 @@ do_cmd_auto:
case RETBLEED_MITIGATION_IBPB:
setup_force_cpu_cap(X86_FEATURE_ENTRY_IBPB);
+ setup_force_cpu_cap(X86_FEATURE_IBPB_ON_VMEXIT);
mitigate_smt = true;
+
+ /*
+ * IBPB on entry already obviates the need for
+ * software-based untraining so clear those in case some
+ * other mitigation like SRSO has selected them.
+ */
+ setup_clear_cpu_cap(X86_FEATURE_UNRET);
+ setup_clear_cpu_cap(X86_FEATURE_RETHUNK);
+
+ /*
+ * There is no need for RSB filling: write_ibpb() ensures
+ * all predictions, including the RSB, are invalidated,
+ * regardless of IBPB implementation.
+ */
+ setup_clear_cpu_cap(X86_FEATURE_RSB_VMEXIT);
+
+ break;
+
+ case RETBLEED_MITIGATION_STUFF:
+ setup_force_cpu_cap(X86_FEATURE_RETHUNK);
+ setup_force_cpu_cap(X86_FEATURE_CALL_DEPTH);
+
+ set_return_thunk(call_depth_return_thunk);
break;
default:
@@ -920,27 +1360,131 @@ do_cmd_auto:
if (mitigate_smt && !boot_cpu_has(X86_FEATURE_STIBP) &&
(retbleed_nosmt || cpu_mitigations_auto_nosmt()))
cpu_smt_disable(false);
+}
+
+#undef pr_fmt
+#define pr_fmt(fmt) "ITS: " fmt
+
+static const char * const its_strings[] = {
+ [ITS_MITIGATION_OFF] = "Vulnerable",
+ [ITS_MITIGATION_VMEXIT_ONLY] = "Mitigation: Vulnerable, KVM: Not affected",
+ [ITS_MITIGATION_ALIGNED_THUNKS] = "Mitigation: Aligned branch/return thunks",
+ [ITS_MITIGATION_RETPOLINE_STUFF] = "Mitigation: Retpolines, Stuffing RSB",
+};
+
+static int __init its_parse_cmdline(char *str)
+{
+ if (!str)
+ return -EINVAL;
+
+ if (!IS_ENABLED(CONFIG_MITIGATION_ITS)) {
+ pr_err("Mitigation disabled at compile time, ignoring option (%s)", str);
+ return 0;
+ }
+
+ if (!strcmp(str, "off")) {
+ its_mitigation = ITS_MITIGATION_OFF;
+ } else if (!strcmp(str, "on")) {
+ its_mitigation = ITS_MITIGATION_ALIGNED_THUNKS;
+ } else if (!strcmp(str, "force")) {
+ its_mitigation = ITS_MITIGATION_ALIGNED_THUNKS;
+ setup_force_cpu_bug(X86_BUG_ITS);
+ } else if (!strcmp(str, "vmexit")) {
+ its_mitigation = ITS_MITIGATION_VMEXIT_ONLY;
+ } else if (!strcmp(str, "stuff")) {
+ its_mitigation = ITS_MITIGATION_RETPOLINE_STUFF;
+ } else {
+ pr_err("Ignoring unknown indirect_target_selection option (%s).", str);
+ }
+
+ return 0;
+}
+early_param("indirect_target_selection", its_parse_cmdline);
+
+static void __init its_select_mitigation(void)
+{
+ if (!boot_cpu_has_bug(X86_BUG_ITS) || cpu_mitigations_off()) {
+ its_mitigation = ITS_MITIGATION_OFF;
+ return;
+ }
+
+ if (its_mitigation == ITS_MITIGATION_AUTO)
+ its_mitigation = ITS_MITIGATION_ALIGNED_THUNKS;
+
+ if (its_mitigation == ITS_MITIGATION_OFF)
+ return;
+
+ if (!IS_ENABLED(CONFIG_MITIGATION_RETPOLINE) ||
+ !IS_ENABLED(CONFIG_MITIGATION_RETHUNK)) {
+ pr_err("WARNING: ITS mitigation depends on retpoline and rethunk support\n");
+ its_mitigation = ITS_MITIGATION_OFF;
+ return;
+ }
+
+ if (IS_ENABLED(CONFIG_DEBUG_FORCE_FUNCTION_ALIGN_64B)) {
+ pr_err("WARNING: ITS mitigation is not compatible with CONFIG_DEBUG_FORCE_FUNCTION_ALIGN_64B\n");
+ its_mitigation = ITS_MITIGATION_OFF;
+ return;
+ }
+
+ if (its_mitigation == ITS_MITIGATION_RETPOLINE_STUFF &&
+ !IS_ENABLED(CONFIG_MITIGATION_CALL_DEPTH_TRACKING)) {
+ pr_err("RSB stuff mitigation not supported, using default\n");
+ its_mitigation = ITS_MITIGATION_ALIGNED_THUNKS;
+ }
+
+ if (its_mitigation == ITS_MITIGATION_VMEXIT_ONLY &&
+ !boot_cpu_has_bug(X86_BUG_ITS_NATIVE_ONLY))
+ its_mitigation = ITS_MITIGATION_ALIGNED_THUNKS;
+}
+
+static void __init its_update_mitigation(void)
+{
+ if (!boot_cpu_has_bug(X86_BUG_ITS) || cpu_mitigations_off())
+ return;
+
+ switch (spectre_v2_enabled) {
+ case SPECTRE_V2_NONE:
+ pr_err("WARNING: Spectre-v2 mitigation is off, disabling ITS\n");
+ its_mitigation = ITS_MITIGATION_OFF;
+ break;
+ case SPECTRE_V2_RETPOLINE:
+ /* Retpoline+CDT mitigates ITS */
+ if (retbleed_mitigation == RETBLEED_MITIGATION_STUFF)
+ its_mitigation = ITS_MITIGATION_RETPOLINE_STUFF;
+ break;
+ case SPECTRE_V2_LFENCE:
+ case SPECTRE_V2_EIBRS_LFENCE:
+ pr_err("WARNING: ITS mitigation is not compatible with lfence mitigation\n");
+ its_mitigation = ITS_MITIGATION_OFF;
+ break;
+ default:
+ break;
+ }
/*
- * Let IBRS trump all on Intel without affecting the effects of the
- * retbleed= cmdline option.
+ * retbleed_update_mitigation() will try to do stuffing if its=stuff.
+ * If it can't, such as if spectre_v2!=retpoline, then fall back to
+ * aligned thunks.
*/
- if (boot_cpu_data.x86_vendor == X86_VENDOR_INTEL) {
- switch (spectre_v2_enabled) {
- case SPECTRE_V2_IBRS:
- retbleed_mitigation = RETBLEED_MITIGATION_IBRS;
- break;
- case SPECTRE_V2_EIBRS:
- case SPECTRE_V2_EIBRS_RETPOLINE:
- case SPECTRE_V2_EIBRS_LFENCE:
- retbleed_mitigation = RETBLEED_MITIGATION_EIBRS;
- break;
- default:
- pr_err(RETBLEED_INTEL_MSG);
- }
- }
+ if (its_mitigation == ITS_MITIGATION_RETPOLINE_STUFF &&
+ retbleed_mitigation != RETBLEED_MITIGATION_STUFF)
+ its_mitigation = ITS_MITIGATION_ALIGNED_THUNKS;
- pr_info("%s\n", retbleed_strings[retbleed_mitigation]);
+ pr_info("%s\n", its_strings[its_mitigation]);
+}
+
+static void __init its_apply_mitigation(void)
+{
+ /* its=stuff forces retbleed stuffing and is enabled there. */
+ if (its_mitigation != ITS_MITIGATION_ALIGNED_THUNKS)
+ return;
+
+ if (!boot_cpu_has(X86_FEATURE_RETPOLINE))
+ setup_force_cpu_cap(X86_FEATURE_INDIRECT_THUNK_ITS);
+
+ setup_force_cpu_cap(X86_FEATURE_RETHUNK);
+ set_return_thunk(its_return_thunk);
}
#undef pr_fmt
@@ -951,7 +1495,7 @@ static enum spectre_v2_user_mitigation spectre_v2_user_stibp __ro_after_init =
static enum spectre_v2_user_mitigation spectre_v2_user_ibpb __ro_after_init =
SPECTRE_V2_USER_NONE;
-#ifdef CONFIG_RETPOLINE
+#ifdef CONFIG_MITIGATION_RETPOLINE
static bool spectre_v2_bad_module;
bool retpoline_module_ok(bool has_retpoline)
@@ -1020,6 +1564,8 @@ enum spectre_v2_mitigation_cmd {
SPECTRE_V2_CMD_IBRS,
};
+static enum spectre_v2_mitigation_cmd spectre_v2_cmd __ro_after_init = SPECTRE_V2_CMD_AUTO;
+
enum spectre_v2_user_cmd {
SPECTRE_V2_USER_CMD_NONE,
SPECTRE_V2_USER_CMD_AUTO,
@@ -1058,22 +1604,13 @@ static void __init spec_v2_user_print_cond(const char *reason, bool secure)
pr_info("spectre_v2_user=%s forced on command line.\n", reason);
}
-static __ro_after_init enum spectre_v2_mitigation_cmd spectre_v2_cmd;
-
-static enum spectre_v2_user_cmd __init
-spectre_v2_parse_user_cmdline(void)
+static enum spectre_v2_user_cmd __init spectre_v2_parse_user_cmdline(void)
{
char arg[20];
int ret, i;
- switch (spectre_v2_cmd) {
- case SPECTRE_V2_CMD_NONE:
+ if (cpu_mitigations_off() || !IS_ENABLED(CONFIG_MITIGATION_SPECTRE_V2))
return SPECTRE_V2_USER_CMD_NONE;
- case SPECTRE_V2_CMD_FORCE:
- return SPECTRE_V2_USER_CMD_FORCE;
- default:
- break;
- }
ret = cmdline_find_option(boot_command_line, "spectre_v2_user",
arg, sizeof(arg));
@@ -1088,68 +1625,126 @@ spectre_v2_parse_user_cmdline(void)
}
}
- pr_err("Unknown user space protection option (%s). Switching to AUTO select\n", arg);
+ pr_err("Unknown user space protection option (%s). Switching to default\n", arg);
return SPECTRE_V2_USER_CMD_AUTO;
}
static inline bool spectre_v2_in_ibrs_mode(enum spectre_v2_mitigation mode)
{
- return mode == SPECTRE_V2_IBRS ||
- mode == SPECTRE_V2_EIBRS ||
- mode == SPECTRE_V2_EIBRS_RETPOLINE ||
- mode == SPECTRE_V2_EIBRS_LFENCE;
+ return spectre_v2_in_eibrs_mode(mode) || mode == SPECTRE_V2_IBRS;
}
-static void __init
-spectre_v2_user_select_mitigation(void)
+static void __init spectre_v2_user_select_mitigation(void)
{
- enum spectre_v2_user_mitigation mode = SPECTRE_V2_USER_NONE;
- bool smt_possible = IS_ENABLED(CONFIG_SMP);
- enum spectre_v2_user_cmd cmd;
-
if (!boot_cpu_has(X86_FEATURE_IBPB) && !boot_cpu_has(X86_FEATURE_STIBP))
return;
- if (cpu_smt_control == CPU_SMT_FORCE_DISABLED ||
- cpu_smt_control == CPU_SMT_NOT_SUPPORTED)
- smt_possible = false;
-
- cmd = spectre_v2_parse_user_cmdline();
- switch (cmd) {
+ switch (spectre_v2_parse_user_cmdline()) {
case SPECTRE_V2_USER_CMD_NONE:
- goto set_mode;
+ return;
case SPECTRE_V2_USER_CMD_FORCE:
- mode = SPECTRE_V2_USER_STRICT;
+ spectre_v2_user_ibpb = SPECTRE_V2_USER_STRICT;
+ spectre_v2_user_stibp = SPECTRE_V2_USER_STRICT;
break;
case SPECTRE_V2_USER_CMD_AUTO:
case SPECTRE_V2_USER_CMD_PRCTL:
+ spectre_v2_user_ibpb = SPECTRE_V2_USER_PRCTL;
+ spectre_v2_user_stibp = SPECTRE_V2_USER_PRCTL;
+ break;
case SPECTRE_V2_USER_CMD_PRCTL_IBPB:
- mode = SPECTRE_V2_USER_PRCTL;
+ spectre_v2_user_ibpb = SPECTRE_V2_USER_STRICT;
+ spectre_v2_user_stibp = SPECTRE_V2_USER_PRCTL;
break;
case SPECTRE_V2_USER_CMD_SECCOMP:
+ if (IS_ENABLED(CONFIG_SECCOMP))
+ spectre_v2_user_ibpb = SPECTRE_V2_USER_SECCOMP;
+ else
+ spectre_v2_user_ibpb = SPECTRE_V2_USER_PRCTL;
+ spectre_v2_user_stibp = spectre_v2_user_ibpb;
+ break;
case SPECTRE_V2_USER_CMD_SECCOMP_IBPB:
+ spectre_v2_user_ibpb = SPECTRE_V2_USER_STRICT;
if (IS_ENABLED(CONFIG_SECCOMP))
- mode = SPECTRE_V2_USER_SECCOMP;
+ spectre_v2_user_stibp = SPECTRE_V2_USER_SECCOMP;
else
- mode = SPECTRE_V2_USER_PRCTL;
+ spectre_v2_user_stibp = SPECTRE_V2_USER_PRCTL;
break;
}
+ /*
+ * At this point, an STIBP mode other than "off" has been set.
+ * If STIBP support is not being forced, check if STIBP always-on
+ * is preferred.
+ */
+ if ((spectre_v2_user_stibp == SPECTRE_V2_USER_PRCTL ||
+ spectre_v2_user_stibp == SPECTRE_V2_USER_SECCOMP) &&
+ boot_cpu_has(X86_FEATURE_AMD_STIBP_ALWAYS_ON))
+ spectre_v2_user_stibp = SPECTRE_V2_USER_STRICT_PREFERRED;
+
+ if (!boot_cpu_has(X86_FEATURE_IBPB))
+ spectre_v2_user_ibpb = SPECTRE_V2_USER_NONE;
+
+ if (!boot_cpu_has(X86_FEATURE_STIBP))
+ spectre_v2_user_stibp = SPECTRE_V2_USER_NONE;
+}
+
+static void __init spectre_v2_user_update_mitigation(void)
+{
+ if (!boot_cpu_has(X86_FEATURE_IBPB) && !boot_cpu_has(X86_FEATURE_STIBP))
+ return;
+
+ /* The spectre_v2 cmd line can override spectre_v2_user options */
+ if (spectre_v2_cmd == SPECTRE_V2_CMD_NONE) {
+ spectre_v2_user_ibpb = SPECTRE_V2_USER_NONE;
+ spectre_v2_user_stibp = SPECTRE_V2_USER_NONE;
+ } else if (spectre_v2_cmd == SPECTRE_V2_CMD_FORCE) {
+ spectre_v2_user_ibpb = SPECTRE_V2_USER_STRICT;
+ spectre_v2_user_stibp = SPECTRE_V2_USER_STRICT;
+ }
+
+ /*
+ * If no STIBP, Intel enhanced IBRS is enabled, or SMT impossible, STIBP
+ * is not required.
+ *
+ * Intel's Enhanced IBRS also protects against cross-thread branch target
+ * injection in user-mode as the IBRS bit remains always set which
+ * implicitly enables cross-thread protections. However, in legacy IBRS
+ * mode, the IBRS bit is set only on kernel entry and cleared on return
+ * to userspace. AMD Automatic IBRS also does not protect userspace.
+ * These modes therefore disable the implicit cross-thread protection,
+ * so allow for STIBP to be selected in those cases.
+ */
+ if (!boot_cpu_has(X86_FEATURE_STIBP) ||
+ !cpu_smt_possible() ||
+ (spectre_v2_in_eibrs_mode(spectre_v2_enabled) &&
+ !boot_cpu_has(X86_FEATURE_AUTOIBRS))) {
+ spectre_v2_user_stibp = SPECTRE_V2_USER_NONE;
+ return;
+ }
+
+ if (spectre_v2_user_stibp != SPECTRE_V2_USER_NONE &&
+ (retbleed_mitigation == RETBLEED_MITIGATION_UNRET ||
+ retbleed_mitigation == RETBLEED_MITIGATION_IBPB)) {
+ if (spectre_v2_user_stibp != SPECTRE_V2_USER_STRICT &&
+ spectre_v2_user_stibp != SPECTRE_V2_USER_STRICT_PREFERRED)
+ pr_info("Selecting STIBP always-on mode to complement retbleed mitigation\n");
+ spectre_v2_user_stibp = SPECTRE_V2_USER_STRICT_PREFERRED;
+ }
+ pr_info("%s\n", spectre_v2_user_strings[spectre_v2_user_stibp]);
+}
+
+static void __init spectre_v2_user_apply_mitigation(void)
+{
/* Initialize Indirect Branch Prediction Barrier */
- if (boot_cpu_has(X86_FEATURE_IBPB)) {
- setup_force_cpu_cap(X86_FEATURE_USE_IBPB);
+ if (spectre_v2_user_ibpb != SPECTRE_V2_USER_NONE) {
+ static_branch_enable(&switch_vcpu_ibpb);
- spectre_v2_user_ibpb = mode;
- switch (cmd) {
- case SPECTRE_V2_USER_CMD_FORCE:
- case SPECTRE_V2_USER_CMD_PRCTL_IBPB:
- case SPECTRE_V2_USER_CMD_SECCOMP_IBPB:
+ switch (spectre_v2_user_ibpb) {
+ case SPECTRE_V2_USER_STRICT:
static_branch_enable(&switch_mm_always_ibpb);
- spectre_v2_user_ibpb = SPECTRE_V2_USER_STRICT;
break;
- case SPECTRE_V2_USER_CMD_PRCTL:
- case SPECTRE_V2_USER_CMD_AUTO:
- case SPECTRE_V2_USER_CMD_SECCOMP:
+ case SPECTRE_V2_USER_PRCTL:
+ case SPECTRE_V2_USER_SECCOMP:
static_branch_enable(&switch_mm_cond_ibpb);
break;
default:
@@ -1160,45 +1755,15 @@ spectre_v2_user_select_mitigation(void)
static_key_enabled(&switch_mm_always_ibpb) ?
"always-on" : "conditional");
}
-
- /*
- * If no STIBP, IBRS or enhanced IBRS is enabled, or SMT impossible,
- * STIBP is not required.
- */
- if (!boot_cpu_has(X86_FEATURE_STIBP) ||
- !smt_possible ||
- spectre_v2_in_ibrs_mode(spectre_v2_enabled))
- return;
-
- /*
- * At this point, an STIBP mode other than "off" has been set.
- * If STIBP support is not being forced, check if STIBP always-on
- * is preferred.
- */
- if (mode != SPECTRE_V2_USER_STRICT &&
- boot_cpu_has(X86_FEATURE_AMD_STIBP_ALWAYS_ON))
- mode = SPECTRE_V2_USER_STRICT_PREFERRED;
-
- if (retbleed_mitigation == RETBLEED_MITIGATION_UNRET) {
- if (mode != SPECTRE_V2_USER_STRICT &&
- mode != SPECTRE_V2_USER_STRICT_PREFERRED)
- pr_info("Selecting STIBP always-on mode to complement retbleed mitigation\n");
- mode = SPECTRE_V2_USER_STRICT_PREFERRED;
- }
-
- spectre_v2_user_stibp = mode;
-
-set_mode:
- pr_info("%s\n", spectre_v2_user_strings[mode]);
}
static const char * const spectre_v2_strings[] = {
[SPECTRE_V2_NONE] = "Vulnerable",
[SPECTRE_V2_RETPOLINE] = "Mitigation: Retpolines",
[SPECTRE_V2_LFENCE] = "Mitigation: LFENCE",
- [SPECTRE_V2_EIBRS] = "Mitigation: Enhanced IBRS",
- [SPECTRE_V2_EIBRS_LFENCE] = "Mitigation: Enhanced IBRS + LFENCE",
- [SPECTRE_V2_EIBRS_RETPOLINE] = "Mitigation: Enhanced IBRS + Retpolines",
+ [SPECTRE_V2_EIBRS] = "Mitigation: Enhanced / Automatic IBRS",
+ [SPECTRE_V2_EIBRS_LFENCE] = "Mitigation: Enhanced / Automatic IBRS + LFENCE",
+ [SPECTRE_V2_EIBRS_RETPOLINE] = "Mitigation: Enhanced / Automatic IBRS + Retpolines",
[SPECTRE_V2_IBRS] = "Mitigation: IBRS",
};
@@ -1228,17 +1793,18 @@ static void __init spec_v2_print_cond(const char *reason, bool secure)
static enum spectre_v2_mitigation_cmd __init spectre_v2_parse_cmdline(void)
{
- enum spectre_v2_mitigation_cmd cmd = SPECTRE_V2_CMD_AUTO;
+ enum spectre_v2_mitigation_cmd cmd;
char arg[20];
int ret, i;
+ cmd = IS_ENABLED(CONFIG_MITIGATION_SPECTRE_V2) ? SPECTRE_V2_CMD_AUTO : SPECTRE_V2_CMD_NONE;
if (cmdline_find_option_bool(boot_command_line, "nospectre_v2") ||
cpu_mitigations_off())
return SPECTRE_V2_CMD_NONE;
ret = cmdline_find_option(boot_command_line, "spectre_v2", arg, sizeof(arg));
if (ret < 0)
- return SPECTRE_V2_CMD_AUTO;
+ return cmd;
for (i = 0; i < ARRAY_SIZE(mitigation_options); i++) {
if (!match_option(arg, ret, mitigation_options[i].option))
@@ -1248,8 +1814,8 @@ static enum spectre_v2_mitigation_cmd __init spectre_v2_parse_cmdline(void)
}
if (i >= ARRAY_SIZE(mitigation_options)) {
- pr_err("unknown option (%s). Switching to AUTO select\n", arg);
- return SPECTRE_V2_CMD_AUTO;
+ pr_err("unknown option (%s). Switching to default mode\n", arg);
+ return cmd;
}
if ((cmd == SPECTRE_V2_CMD_RETPOLINE ||
@@ -1257,7 +1823,7 @@ static enum spectre_v2_mitigation_cmd __init spectre_v2_parse_cmdline(void)
cmd == SPECTRE_V2_CMD_RETPOLINE_GENERIC ||
cmd == SPECTRE_V2_CMD_EIBRS_LFENCE ||
cmd == SPECTRE_V2_CMD_EIBRS_RETPOLINE) &&
- !IS_ENABLED(CONFIG_RETPOLINE)) {
+ !IS_ENABLED(CONFIG_MITIGATION_RETPOLINE)) {
pr_err("%s selected but not compiled in. Switching to AUTO select\n",
mitigation_options[i].option);
return SPECTRE_V2_CMD_AUTO;
@@ -1267,7 +1833,7 @@ static enum spectre_v2_mitigation_cmd __init spectre_v2_parse_cmdline(void)
cmd == SPECTRE_V2_CMD_EIBRS_LFENCE ||
cmd == SPECTRE_V2_CMD_EIBRS_RETPOLINE) &&
!boot_cpu_has(X86_FEATURE_IBRS_ENHANCED)) {
- pr_err("%s selected but CPU doesn't have eIBRS. Switching to AUTO select\n",
+ pr_err("%s selected but CPU doesn't have Enhanced or Automatic IBRS. Switching to AUTO select\n",
mitigation_options[i].option);
return SPECTRE_V2_CMD_AUTO;
}
@@ -1280,7 +1846,7 @@ static enum spectre_v2_mitigation_cmd __init spectre_v2_parse_cmdline(void)
return SPECTRE_V2_CMD_AUTO;
}
- if (cmd == SPECTRE_V2_CMD_IBRS && !IS_ENABLED(CONFIG_CPU_IBRS_ENTRY)) {
+ if (cmd == SPECTRE_V2_CMD_IBRS && !IS_ENABLED(CONFIG_MITIGATION_IBRS_ENTRY)) {
pr_err("%s selected but not compiled in. Switching to AUTO select\n",
mitigation_options[i].option);
return SPECTRE_V2_CMD_AUTO;
@@ -1298,7 +1864,7 @@ static enum spectre_v2_mitigation_cmd __init spectre_v2_parse_cmdline(void)
return SPECTRE_V2_CMD_AUTO;
}
- if (cmd == SPECTRE_V2_CMD_IBRS && boot_cpu_has(X86_FEATURE_XENPV)) {
+ if (cmd == SPECTRE_V2_CMD_IBRS && cpu_feature_enabled(X86_FEATURE_XENPV)) {
pr_err("%s selected but running as XenPV guest. Switching to AUTO select\n",
mitigation_options[i].option);
return SPECTRE_V2_CMD_AUTO;
@@ -1311,7 +1877,7 @@ static enum spectre_v2_mitigation_cmd __init spectre_v2_parse_cmdline(void)
static enum spectre_v2_mitigation __init spectre_v2_select_retpoline(void)
{
- if (!IS_ENABLED(CONFIG_RETPOLINE)) {
+ if (!IS_ENABLED(CONFIG_MITIGATION_RETPOLINE)) {
pr_err("Kernel not compiled with retpoline; no mitigation available!");
return SPECTRE_V2_NONE;
}
@@ -1319,98 +1885,260 @@ static enum spectre_v2_mitigation __init spectre_v2_select_retpoline(void)
return SPECTRE_V2_RETPOLINE;
}
+static bool __ro_after_init rrsba_disabled;
+
/* Disable in-kernel use of non-RSB RET predictors */
static void __init spec_ctrl_disable_kernel_rrsba(void)
{
- u64 ia32_cap;
+ if (rrsba_disabled)
+ return;
+
+ if (!(x86_arch_cap_msr & ARCH_CAP_RRSBA)) {
+ rrsba_disabled = true;
+ return;
+ }
if (!boot_cpu_has(X86_FEATURE_RRSBA_CTRL))
return;
- ia32_cap = x86_read_arch_cap_msr();
+ x86_spec_ctrl_base |= SPEC_CTRL_RRSBA_DIS_S;
+ update_spec_ctrl(x86_spec_ctrl_base);
+ rrsba_disabled = true;
+}
+
+static void __init spectre_v2_select_rsb_mitigation(enum spectre_v2_mitigation mode)
+{
+ /*
+ * WARNING! There are many subtleties to consider when changing *any*
+ * code related to RSB-related mitigations. Before doing so, carefully
+ * read the following document, and update if necessary:
+ *
+ * Documentation/admin-guide/hw-vuln/rsb.rst
+ *
+ * In an overly simplified nutshell:
+ *
+ * - User->user RSB attacks are conditionally mitigated during
+ * context switches by cond_mitigation -> write_ibpb().
+ *
+ * - User->kernel and guest->host attacks are mitigated by eIBRS or
+ * RSB filling.
+ *
+ * Though, depending on config, note that other alternative
+ * mitigations may end up getting used instead, e.g., IBPB on
+ * entry/vmexit, call depth tracking, or return thunks.
+ */
+
+ switch (mode) {
+ case SPECTRE_V2_NONE:
+ break;
+
+ case SPECTRE_V2_EIBRS:
+ case SPECTRE_V2_EIBRS_LFENCE:
+ case SPECTRE_V2_EIBRS_RETPOLINE:
+ if (boot_cpu_has_bug(X86_BUG_EIBRS_PBRSB)) {
+ pr_info("Spectre v2 / PBRSB-eIBRS: Retire a single CALL on VMEXIT\n");
+ setup_force_cpu_cap(X86_FEATURE_RSB_VMEXIT_LITE);
+ }
+ break;
+
+ case SPECTRE_V2_RETPOLINE:
+ case SPECTRE_V2_LFENCE:
+ case SPECTRE_V2_IBRS:
+ pr_info("Spectre v2 / SpectreRSB: Filling RSB on context switch and VMEXIT\n");
+ setup_force_cpu_cap(X86_FEATURE_RSB_CTXSW);
+ setup_force_cpu_cap(X86_FEATURE_RSB_VMEXIT);
+ break;
+
+ default:
+ pr_warn_once("Unknown Spectre v2 mode, disabling RSB mitigation\n");
+ dump_stack();
+ break;
+ }
+}
+
+/*
+ * Set BHI_DIS_S to prevent indirect branches in kernel to be influenced by
+ * branch history in userspace. Not needed if BHI_NO is set.
+ */
+static bool __init spec_ctrl_bhi_dis(void)
+{
+ if (!boot_cpu_has(X86_FEATURE_BHI_CTRL))
+ return false;
+
+ x86_spec_ctrl_base |= SPEC_CTRL_BHI_DIS_S;
+ update_spec_ctrl(x86_spec_ctrl_base);
+ setup_force_cpu_cap(X86_FEATURE_CLEAR_BHB_HW);
+
+ return true;
+}
+
+enum bhi_mitigations {
+ BHI_MITIGATION_OFF,
+ BHI_MITIGATION_AUTO,
+ BHI_MITIGATION_ON,
+ BHI_MITIGATION_VMEXIT_ONLY,
+};
+
+static enum bhi_mitigations bhi_mitigation __ro_after_init =
+ IS_ENABLED(CONFIG_MITIGATION_SPECTRE_BHI) ? BHI_MITIGATION_AUTO : BHI_MITIGATION_OFF;
+
+static int __init spectre_bhi_parse_cmdline(char *str)
+{
+ if (!str)
+ return -EINVAL;
+
+ if (!strcmp(str, "off"))
+ bhi_mitigation = BHI_MITIGATION_OFF;
+ else if (!strcmp(str, "on"))
+ bhi_mitigation = BHI_MITIGATION_ON;
+ else if (!strcmp(str, "vmexit"))
+ bhi_mitigation = BHI_MITIGATION_VMEXIT_ONLY;
+ else
+ pr_err("Ignoring unknown spectre_bhi option (%s)", str);
+
+ return 0;
+}
+early_param("spectre_bhi", spectre_bhi_parse_cmdline);
+
+static void __init bhi_select_mitigation(void)
+{
+ if (!boot_cpu_has(X86_BUG_BHI) || cpu_mitigations_off())
+ bhi_mitigation = BHI_MITIGATION_OFF;
+
+ if (bhi_mitigation == BHI_MITIGATION_AUTO)
+ bhi_mitigation = BHI_MITIGATION_ON;
+}
+
+static void __init bhi_update_mitigation(void)
+{
+ if (spectre_v2_cmd == SPECTRE_V2_CMD_NONE)
+ bhi_mitigation = BHI_MITIGATION_OFF;
+
+ if (!boot_cpu_has_bug(X86_BUG_SPECTRE_V2) &&
+ spectre_v2_cmd == SPECTRE_V2_CMD_AUTO)
+ bhi_mitigation = BHI_MITIGATION_OFF;
+}
+
+static void __init bhi_apply_mitigation(void)
+{
+ if (bhi_mitigation == BHI_MITIGATION_OFF)
+ return;
+
+ /* Retpoline mitigates against BHI unless the CPU has RRSBA behavior */
+ if (boot_cpu_has(X86_FEATURE_RETPOLINE) &&
+ !boot_cpu_has(X86_FEATURE_RETPOLINE_LFENCE)) {
+ spec_ctrl_disable_kernel_rrsba();
+ if (rrsba_disabled)
+ return;
+ }
- if (ia32_cap & ARCH_CAP_RRSBA) {
- x86_spec_ctrl_base |= SPEC_CTRL_RRSBA_DIS_S;
- write_spec_ctrl_current(x86_spec_ctrl_base, true);
+ if (!IS_ENABLED(CONFIG_X86_64))
+ return;
+
+ /* Mitigate in hardware if supported */
+ if (spec_ctrl_bhi_dis())
+ return;
+
+ if (bhi_mitigation == BHI_MITIGATION_VMEXIT_ONLY) {
+ pr_info("Spectre BHI mitigation: SW BHB clearing on VM exit only\n");
+ setup_force_cpu_cap(X86_FEATURE_CLEAR_BHB_VMEXIT);
+ return;
}
+
+ pr_info("Spectre BHI mitigation: SW BHB clearing on syscall and VM exit\n");
+ setup_force_cpu_cap(X86_FEATURE_CLEAR_BHB_LOOP);
+ setup_force_cpu_cap(X86_FEATURE_CLEAR_BHB_VMEXIT);
}
static void __init spectre_v2_select_mitigation(void)
{
- enum spectre_v2_mitigation_cmd cmd = spectre_v2_parse_cmdline();
- enum spectre_v2_mitigation mode = SPECTRE_V2_NONE;
+ spectre_v2_cmd = spectre_v2_parse_cmdline();
- /*
- * If the CPU is not affected and the command line mode is NONE or AUTO
- * then nothing to do.
- */
if (!boot_cpu_has_bug(X86_BUG_SPECTRE_V2) &&
- (cmd == SPECTRE_V2_CMD_NONE || cmd == SPECTRE_V2_CMD_AUTO))
+ (spectre_v2_cmd == SPECTRE_V2_CMD_NONE || spectre_v2_cmd == SPECTRE_V2_CMD_AUTO))
return;
- switch (cmd) {
+ switch (spectre_v2_cmd) {
case SPECTRE_V2_CMD_NONE:
return;
case SPECTRE_V2_CMD_FORCE:
case SPECTRE_V2_CMD_AUTO:
if (boot_cpu_has(X86_FEATURE_IBRS_ENHANCED)) {
- mode = SPECTRE_V2_EIBRS;
+ spectre_v2_enabled = SPECTRE_V2_EIBRS;
break;
}
- if (IS_ENABLED(CONFIG_CPU_IBRS_ENTRY) &&
- boot_cpu_has_bug(X86_BUG_RETBLEED) &&
- retbleed_cmd != RETBLEED_CMD_OFF &&
- boot_cpu_has(X86_FEATURE_IBRS) &&
- boot_cpu_data.x86_vendor == X86_VENDOR_INTEL) {
- mode = SPECTRE_V2_IBRS;
- break;
- }
-
- mode = spectre_v2_select_retpoline();
+ spectre_v2_enabled = spectre_v2_select_retpoline();
break;
case SPECTRE_V2_CMD_RETPOLINE_LFENCE:
pr_err(SPECTRE_V2_LFENCE_MSG);
- mode = SPECTRE_V2_LFENCE;
+ spectre_v2_enabled = SPECTRE_V2_LFENCE;
break;
case SPECTRE_V2_CMD_RETPOLINE_GENERIC:
- mode = SPECTRE_V2_RETPOLINE;
+ spectre_v2_enabled = SPECTRE_V2_RETPOLINE;
break;
case SPECTRE_V2_CMD_RETPOLINE:
- mode = spectre_v2_select_retpoline();
+ spectre_v2_enabled = spectre_v2_select_retpoline();
break;
case SPECTRE_V2_CMD_IBRS:
- mode = SPECTRE_V2_IBRS;
+ spectre_v2_enabled = SPECTRE_V2_IBRS;
break;
case SPECTRE_V2_CMD_EIBRS:
- mode = SPECTRE_V2_EIBRS;
+ spectre_v2_enabled = SPECTRE_V2_EIBRS;
break;
case SPECTRE_V2_CMD_EIBRS_LFENCE:
- mode = SPECTRE_V2_EIBRS_LFENCE;
+ spectre_v2_enabled = SPECTRE_V2_EIBRS_LFENCE;
break;
case SPECTRE_V2_CMD_EIBRS_RETPOLINE:
- mode = SPECTRE_V2_EIBRS_RETPOLINE;
+ spectre_v2_enabled = SPECTRE_V2_EIBRS_RETPOLINE;
break;
}
+}
+
+static void __init spectre_v2_update_mitigation(void)
+{
+ if (spectre_v2_cmd == SPECTRE_V2_CMD_AUTO &&
+ !spectre_v2_in_eibrs_mode(spectre_v2_enabled)) {
+ if (IS_ENABLED(CONFIG_MITIGATION_IBRS_ENTRY) &&
+ boot_cpu_has_bug(X86_BUG_RETBLEED) &&
+ retbleed_mitigation != RETBLEED_MITIGATION_NONE &&
+ retbleed_mitigation != RETBLEED_MITIGATION_STUFF &&
+ boot_cpu_has(X86_FEATURE_IBRS) &&
+ boot_cpu_data.x86_vendor == X86_VENDOR_INTEL) {
+ spectre_v2_enabled = SPECTRE_V2_IBRS;
+ }
+ }
- if (mode == SPECTRE_V2_EIBRS && unprivileged_ebpf_enabled())
+ if (boot_cpu_has_bug(X86_BUG_SPECTRE_V2) && !cpu_mitigations_off())
+ pr_info("%s\n", spectre_v2_strings[spectre_v2_enabled]);
+}
+
+static void __init spectre_v2_apply_mitigation(void)
+{
+ if (spectre_v2_enabled == SPECTRE_V2_EIBRS && unprivileged_ebpf_enabled())
pr_err(SPECTRE_V2_EIBRS_EBPF_MSG);
- if (spectre_v2_in_ibrs_mode(mode)) {
- x86_spec_ctrl_base |= SPEC_CTRL_IBRS;
- write_spec_ctrl_current(x86_spec_ctrl_base, true);
+ if (spectre_v2_in_ibrs_mode(spectre_v2_enabled)) {
+ if (boot_cpu_has(X86_FEATURE_AUTOIBRS)) {
+ msr_set_bit(MSR_EFER, _EFER_AUTOIBRS);
+ } else {
+ x86_spec_ctrl_base |= SPEC_CTRL_IBRS;
+ update_spec_ctrl(x86_spec_ctrl_base);
+ }
}
- switch (mode) {
+ switch (spectre_v2_enabled) {
case SPECTRE_V2_NONE:
+ return;
+
case SPECTRE_V2_EIBRS:
break;
@@ -1436,112 +2164,45 @@ static void __init spectre_v2_select_mitigation(void)
* JMPs gets protection against BHI and Intramode-BTI, but RET
* prediction from a non-RSB predictor is still a risk.
*/
- if (mode == SPECTRE_V2_EIBRS_LFENCE ||
- mode == SPECTRE_V2_EIBRS_RETPOLINE ||
- mode == SPECTRE_V2_RETPOLINE)
+ if (spectre_v2_enabled == SPECTRE_V2_EIBRS_LFENCE ||
+ spectre_v2_enabled == SPECTRE_V2_EIBRS_RETPOLINE ||
+ spectre_v2_enabled == SPECTRE_V2_RETPOLINE)
spec_ctrl_disable_kernel_rrsba();
- spectre_v2_enabled = mode;
- pr_info("%s\n", spectre_v2_strings[mode]);
-
- /*
- * If Spectre v2 protection has been enabled, fill the RSB during a
- * context switch. In general there are two types of RSB attacks
- * across context switches, for which the CALLs/RETs may be unbalanced.
- *
- * 1) RSB underflow
- *
- * Some Intel parts have "bottomless RSB". When the RSB is empty,
- * speculated return targets may come from the branch predictor,
- * which could have a user-poisoned BTB or BHB entry.
- *
- * AMD has it even worse: *all* returns are speculated from the BTB,
- * regardless of the state of the RSB.
- *
- * When IBRS or eIBRS is enabled, the "user -> kernel" attack
- * scenario is mitigated by the IBRS branch prediction isolation
- * properties, so the RSB buffer filling wouldn't be necessary to
- * protect against this type of attack.
- *
- * The "user -> user" attack scenario is mitigated by RSB filling.
- *
- * 2) Poisoned RSB entry
- *
- * If the 'next' in-kernel return stack is shorter than 'prev',
- * 'next' could be tricked into speculating with a user-poisoned RSB
- * entry.
- *
- * The "user -> kernel" attack scenario is mitigated by SMEP and
- * eIBRS.
- *
- * The "user -> user" scenario, also known as SpectreBHB, requires
- * RSB clearing.
- *
- * So to mitigate all cases, unconditionally fill RSB on context
- * switches.
- *
- * FIXME: Is this pointless for retbleed-affected AMD?
- */
- setup_force_cpu_cap(X86_FEATURE_RSB_CTXSW);
- pr_info("Spectre v2 / SpectreRSB mitigation: Filling RSB on context switch\n");
-
- /*
- * Similar to context switches, there are two types of RSB attacks
- * after vmexit:
- *
- * 1) RSB underflow
- *
- * 2) Poisoned RSB entry
- *
- * When retpoline is enabled, both are mitigated by filling/clearing
- * the RSB.
- *
- * When IBRS is enabled, while #1 would be mitigated by the IBRS branch
- * prediction isolation protections, RSB still needs to be cleared
- * because of #2. Note that SMEP provides no protection here, unlike
- * user-space-poisoned RSB entries.
- *
- * eIBRS, on the other hand, has RSB-poisoning protections, so it
- * doesn't need RSB clearing after vmexit.
- */
- if (boot_cpu_has(X86_FEATURE_RETPOLINE) ||
- boot_cpu_has(X86_FEATURE_KERNEL_IBRS))
- setup_force_cpu_cap(X86_FEATURE_RSB_VMEXIT);
+ spectre_v2_select_rsb_mitigation(spectre_v2_enabled);
/*
* Retpoline protects the kernel, but doesn't protect firmware. IBRS
* and Enhanced IBRS protect firmware too, so enable IBRS around
- * firmware calls only when IBRS / Enhanced IBRS aren't otherwise
- * enabled.
+ * firmware calls only when IBRS / Enhanced / Automatic IBRS aren't
+ * otherwise enabled.
*
- * Use "mode" to check Enhanced IBRS instead of boot_cpu_has(), because
- * the user might select retpoline on the kernel command line and if
- * the CPU supports Enhanced IBRS, kernel might un-intentionally not
- * enable IBRS around firmware calls.
+ * Use "spectre_v2_enabled" to check Enhanced IBRS instead of
+ * boot_cpu_has(), because the user might select retpoline on the kernel
+ * command line and if the CPU supports Enhanced IBRS, kernel might
+ * un-intentionally not enable IBRS around firmware calls.
*/
if (boot_cpu_has_bug(X86_BUG_RETBLEED) &&
boot_cpu_has(X86_FEATURE_IBPB) &&
(boot_cpu_data.x86_vendor == X86_VENDOR_AMD ||
boot_cpu_data.x86_vendor == X86_VENDOR_HYGON)) {
- if (retbleed_cmd != RETBLEED_CMD_IBPB) {
+ if (retbleed_mitigation != RETBLEED_MITIGATION_IBPB) {
setup_force_cpu_cap(X86_FEATURE_USE_IBPB_FW);
pr_info("Enabling Speculation Barrier for firmware calls\n");
}
- } else if (boot_cpu_has(X86_FEATURE_IBRS) && !spectre_v2_in_ibrs_mode(mode)) {
+ } else if (boot_cpu_has(X86_FEATURE_IBRS) &&
+ !spectre_v2_in_ibrs_mode(spectre_v2_enabled)) {
setup_force_cpu_cap(X86_FEATURE_USE_IBRS_FW);
pr_info("Enabling Restricted Speculation for firmware calls\n");
}
-
- /* Set up IBPB and STIBP depending on the general spectre V2 command */
- spectre_v2_cmd = cmd;
}
static void update_stibp_msr(void * __unused)
{
u64 val = spec_ctrl_current() | (x86_spec_ctrl_base & SPEC_CTRL_STIBP);
- write_spec_ctrl_current(val, true);
+ update_spec_ctrl(val);
}
/* Update x86_spec_ctrl_base in case SMT state changed. */
@@ -1576,8 +2237,6 @@ static void update_indir_branch_cond(void)
/* Update the static key controlling the MDS CPU buffer clear in idle */
static void update_mds_branch_idle(void)
{
- u64 ia32_cap = x86_read_arch_cap_msr();
-
/*
* Enable the idle clearing if SMT is active on CPUs which are
* affected only by MSBDS and not any other MDS variant.
@@ -1592,7 +2251,7 @@ static void update_mds_branch_idle(void)
if (sched_smt_active()) {
static_branch_enable(&mds_idle_clear);
} else if (mmio_mitigation == MMIO_MITIGATION_OFF ||
- (ia32_cap & ARCH_CAP_FBSDP_NO)) {
+ (x86_arch_cap_msr & ARCH_CAP_FBSDP_NO)) {
static_branch_disable(&mds_idle_clear);
}
}
@@ -1624,6 +2283,7 @@ void cpu_bugs_smt_update(void)
switch (mds_mitigation) {
case MDS_MITIGATION_FULL:
+ case MDS_MITIGATION_AUTO:
case MDS_MITIGATION_VMWERV:
if (sched_smt_active() && !boot_cpu_has(X86_BUG_MSBDS_ONLY))
pr_warn_once(MDS_MSG_SMT);
@@ -1635,6 +2295,7 @@ void cpu_bugs_smt_update(void)
switch (taa_mitigation) {
case TAA_MITIGATION_VERW:
+ case TAA_MITIGATION_AUTO:
case TAA_MITIGATION_UCODE_NEEDED:
if (sched_smt_active())
pr_warn_once(TAA_MSG_SMT);
@@ -1646,6 +2307,7 @@ void cpu_bugs_smt_update(void)
switch (mmio_mitigation) {
case MMIO_MITIGATION_VERW:
+ case MMIO_MITIGATION_AUTO:
case MMIO_MITIGATION_UCODE_NEEDED:
if (sched_smt_active())
pr_warn_once(MMIO_MSG_SMT);
@@ -1691,10 +2353,12 @@ static const struct {
static enum ssb_mitigation_cmd __init ssb_parse_cmdline(void)
{
- enum ssb_mitigation_cmd cmd = SPEC_STORE_BYPASS_CMD_AUTO;
+ enum ssb_mitigation_cmd cmd;
char arg[20];
int ret, i;
+ cmd = IS_ENABLED(CONFIG_MITIGATION_SSB) ?
+ SPEC_STORE_BYPASS_CMD_AUTO : SPEC_STORE_BYPASS_CMD_NONE;
if (cmdline_find_option_bool(boot_command_line, "nospec_store_bypass_disable") ||
cpu_mitigations_off()) {
return SPEC_STORE_BYPASS_CMD_NONE;
@@ -1702,7 +2366,7 @@ static enum ssb_mitigation_cmd __init ssb_parse_cmdline(void)
ret = cmdline_find_option(boot_command_line, "spec_store_bypass_disable",
arg, sizeof(arg));
if (ret < 0)
- return SPEC_STORE_BYPASS_CMD_AUTO;
+ return cmd;
for (i = 0; i < ARRAY_SIZE(ssb_mitigation_options); i++) {
if (!match_option(arg, ret, ssb_mitigation_options[i].option))
@@ -1713,27 +2377,26 @@ static enum ssb_mitigation_cmd __init ssb_parse_cmdline(void)
}
if (i >= ARRAY_SIZE(ssb_mitigation_options)) {
- pr_err("unknown option (%s). Switching to AUTO select\n", arg);
- return SPEC_STORE_BYPASS_CMD_AUTO;
+ pr_err("unknown option (%s). Switching to default mode\n", arg);
+ return cmd;
}
}
return cmd;
}
-static enum ssb_mitigation __init __ssb_select_mitigation(void)
+static void __init ssb_select_mitigation(void)
{
- enum ssb_mitigation mode = SPEC_STORE_BYPASS_NONE;
enum ssb_mitigation_cmd cmd;
if (!boot_cpu_has(X86_FEATURE_SSBD))
- return mode;
+ goto out;
cmd = ssb_parse_cmdline();
if (!boot_cpu_has_bug(X86_BUG_SPEC_STORE_BYPASS) &&
(cmd == SPEC_STORE_BYPASS_CMD_NONE ||
cmd == SPEC_STORE_BYPASS_CMD_AUTO))
- return mode;
+ return;
switch (cmd) {
case SPEC_STORE_BYPASS_CMD_SECCOMP:
@@ -1742,28 +2405,35 @@ static enum ssb_mitigation __init __ssb_select_mitigation(void)
* enabled.
*/
if (IS_ENABLED(CONFIG_SECCOMP))
- mode = SPEC_STORE_BYPASS_SECCOMP;
+ ssb_mode = SPEC_STORE_BYPASS_SECCOMP;
else
- mode = SPEC_STORE_BYPASS_PRCTL;
+ ssb_mode = SPEC_STORE_BYPASS_PRCTL;
break;
case SPEC_STORE_BYPASS_CMD_ON:
- mode = SPEC_STORE_BYPASS_DISABLE;
+ ssb_mode = SPEC_STORE_BYPASS_DISABLE;
break;
case SPEC_STORE_BYPASS_CMD_AUTO:
case SPEC_STORE_BYPASS_CMD_PRCTL:
- mode = SPEC_STORE_BYPASS_PRCTL;
+ ssb_mode = SPEC_STORE_BYPASS_PRCTL;
break;
case SPEC_STORE_BYPASS_CMD_NONE:
break;
}
+out:
+ if (boot_cpu_has_bug(X86_BUG_SPEC_STORE_BYPASS))
+ pr_info("%s\n", ssb_strings[ssb_mode]);
+}
+
+static void __init ssb_apply_mitigation(void)
+{
/*
* We have three CPU feature flags that are in play here:
* - X86_BUG_SPEC_STORE_BYPASS - CPU is susceptible.
* - X86_FEATURE_SSBD - CPU is able to turn off speculative store bypass
* - X86_FEATURE_SPEC_STORE_BYPASS_DISABLE - engage the mitigation
*/
- if (mode == SPEC_STORE_BYPASS_DISABLE) {
+ if (ssb_mode == SPEC_STORE_BYPASS_DISABLE) {
setup_force_cpu_cap(X86_FEATURE_SPEC_STORE_BYPASS_DISABLE);
/*
* Intel uses the SPEC CTRL MSR Bit(2) for this, while AMD may
@@ -1774,19 +2444,9 @@ static enum ssb_mitigation __init __ssb_select_mitigation(void)
x86_amd_ssb_disable();
} else {
x86_spec_ctrl_base |= SPEC_CTRL_SSBD;
- write_spec_ctrl_current(x86_spec_ctrl_base, true);
+ update_spec_ctrl(x86_spec_ctrl_base);
}
}
-
- return mode;
-}
-
-static void ssb_select_mitigation(void)
-{
- ssb_mode = __ssb_select_mitigation();
-
- if (boot_cpu_has_bug(X86_BUG_SPEC_STORE_BYPASS))
- pr_info("%s\n", ssb_strings[ssb_mode]);
}
#undef pr_fmt
@@ -1921,6 +2581,8 @@ static int ib_prctl_set(struct task_struct *task, unsigned long ctrl)
if (ctrl == PR_SPEC_FORCE_DISABLE)
task_set_spec_ib_force_disable(task);
task_update_spec_tif(task);
+ if (task == current)
+ indirect_branch_prediction_barrier();
break;
default:
return -ERANGE;
@@ -1968,6 +2630,10 @@ static int l1d_flush_prctl_get(struct task_struct *task)
static int ssb_prctl_get(struct task_struct *task)
{
switch (ssb_mode) {
+ case SPEC_STORE_BYPASS_NONE:
+ if (boot_cpu_has_bug(X86_BUG_SPEC_STORE_BYPASS))
+ return PR_SPEC_ENABLE;
+ return PR_SPEC_NOT_AFFECTED;
case SPEC_STORE_BYPASS_DISABLE:
return PR_SPEC_DISABLE;
case SPEC_STORE_BYPASS_SECCOMP:
@@ -1979,11 +2645,8 @@ static int ssb_prctl_get(struct task_struct *task)
if (task_spec_ssb_disable(task))
return PR_SPEC_PRCTL | PR_SPEC_DISABLE;
return PR_SPEC_PRCTL | PR_SPEC_ENABLE;
- default:
- if (boot_cpu_has_bug(X86_BUG_SPEC_STORE_BYPASS))
- return PR_SPEC_ENABLE;
- return PR_SPEC_NOT_AFFECTED;
}
+ BUG();
}
static int ib_prctl_get(struct task_struct *task)
@@ -2025,7 +2688,7 @@ int arch_prctl_spec_ctrl_get(struct task_struct *task, unsigned long which)
void x86_spec_ctrl_setup_ap(void)
{
if (boot_cpu_has(X86_FEATURE_MSR_SPEC_CTRL))
- write_spec_ctrl_current(x86_spec_ctrl_base, true);
+ update_spec_ctrl(x86_spec_ctrl_base);
if (ssb_mode == SPEC_STORE_BYPASS_DISABLE)
x86_amd_ssb_disable();
@@ -2038,7 +2701,8 @@ EXPORT_SYMBOL_GPL(itlb_multihit_kvm_mitigation);
#define pr_fmt(fmt) "L1TF: " fmt
/* Default mitigation for L1TF-affected CPUs */
-enum l1tf_mitigations l1tf_mitigation __ro_after_init = L1TF_MITIGATION_FLUSH;
+enum l1tf_mitigations l1tf_mitigation __ro_after_init =
+ IS_ENABLED(CONFIG_MITIGATION_L1TF) ? L1TF_MITIGATION_AUTO : L1TF_MITIGATION_OFF;
#if IS_ENABLED(CONFIG_KVM_INTEL)
EXPORT_SYMBOL_GPL(l1tf_mitigation);
#endif
@@ -2064,20 +2728,20 @@ static void override_cache_bits(struct cpuinfo_x86 *c)
if (c->x86 != 6)
return;
- switch (c->x86_model) {
- case INTEL_FAM6_NEHALEM:
- case INTEL_FAM6_WESTMERE:
- case INTEL_FAM6_SANDYBRIDGE:
- case INTEL_FAM6_IVYBRIDGE:
- case INTEL_FAM6_HASWELL:
- case INTEL_FAM6_HASWELL_L:
- case INTEL_FAM6_HASWELL_G:
- case INTEL_FAM6_BROADWELL:
- case INTEL_FAM6_BROADWELL_G:
- case INTEL_FAM6_SKYLAKE_L:
- case INTEL_FAM6_SKYLAKE:
- case INTEL_FAM6_KABYLAKE_L:
- case INTEL_FAM6_KABYLAKE:
+ switch (c->x86_vfm) {
+ case INTEL_NEHALEM:
+ case INTEL_WESTMERE:
+ case INTEL_SANDYBRIDGE:
+ case INTEL_IVYBRIDGE:
+ case INTEL_HASWELL:
+ case INTEL_HASWELL_L:
+ case INTEL_HASWELL_G:
+ case INTEL_BROADWELL:
+ case INTEL_BROADWELL_G:
+ case INTEL_SKYLAKE_L:
+ case INTEL_SKYLAKE:
+ case INTEL_KABYLAKE_L:
+ case INTEL_KABYLAKE:
if (c->x86_cache_bits < 44)
c->x86_cache_bits = 44;
break;
@@ -2086,22 +2750,33 @@ static void override_cache_bits(struct cpuinfo_x86 *c)
static void __init l1tf_select_mitigation(void)
{
+ if (!boot_cpu_has_bug(X86_BUG_L1TF) || cpu_mitigations_off()) {
+ l1tf_mitigation = L1TF_MITIGATION_OFF;
+ return;
+ }
+
+ if (l1tf_mitigation == L1TF_MITIGATION_AUTO) {
+ if (cpu_mitigations_auto_nosmt())
+ l1tf_mitigation = L1TF_MITIGATION_FLUSH_NOSMT;
+ else
+ l1tf_mitigation = L1TF_MITIGATION_FLUSH;
+ }
+}
+
+static void __init l1tf_apply_mitigation(void)
+{
u64 half_pa;
if (!boot_cpu_has_bug(X86_BUG_L1TF))
return;
- if (cpu_mitigations_off())
- l1tf_mitigation = L1TF_MITIGATION_OFF;
- else if (cpu_mitigations_auto_nosmt())
- l1tf_mitigation = L1TF_MITIGATION_FLUSH_NOSMT;
-
override_cache_bits(&boot_cpu_data);
switch (l1tf_mitigation) {
case L1TF_MITIGATION_OFF:
case L1TF_MITIGATION_FLUSH_NOWARN:
case L1TF_MITIGATION_FLUSH:
+ case L1TF_MITIGATION_AUTO:
break;
case L1TF_MITIGATION_FLUSH_NOSMT:
case L1TF_MITIGATION_FULL:
@@ -2157,6 +2832,194 @@ static int __init l1tf_cmdline(char *str)
early_param("l1tf", l1tf_cmdline);
#undef pr_fmt
+#define pr_fmt(fmt) "Speculative Return Stack Overflow: " fmt
+
+enum srso_mitigation {
+ SRSO_MITIGATION_NONE,
+ SRSO_MITIGATION_AUTO,
+ SRSO_MITIGATION_UCODE_NEEDED,
+ SRSO_MITIGATION_SAFE_RET_UCODE_NEEDED,
+ SRSO_MITIGATION_MICROCODE,
+ SRSO_MITIGATION_SAFE_RET,
+ SRSO_MITIGATION_IBPB,
+ SRSO_MITIGATION_IBPB_ON_VMEXIT,
+ SRSO_MITIGATION_BP_SPEC_REDUCE,
+};
+
+static const char * const srso_strings[] = {
+ [SRSO_MITIGATION_NONE] = "Vulnerable",
+ [SRSO_MITIGATION_UCODE_NEEDED] = "Vulnerable: No microcode",
+ [SRSO_MITIGATION_SAFE_RET_UCODE_NEEDED] = "Vulnerable: Safe RET, no microcode",
+ [SRSO_MITIGATION_MICROCODE] = "Vulnerable: Microcode, no safe RET",
+ [SRSO_MITIGATION_SAFE_RET] = "Mitigation: Safe RET",
+ [SRSO_MITIGATION_IBPB] = "Mitigation: IBPB",
+ [SRSO_MITIGATION_IBPB_ON_VMEXIT] = "Mitigation: IBPB on VMEXIT only",
+ [SRSO_MITIGATION_BP_SPEC_REDUCE] = "Mitigation: Reduced Speculation"
+};
+
+static enum srso_mitigation srso_mitigation __ro_after_init = SRSO_MITIGATION_AUTO;
+
+static int __init srso_parse_cmdline(char *str)
+{
+ if (!str)
+ return -EINVAL;
+
+ if (!strcmp(str, "off"))
+ srso_mitigation = SRSO_MITIGATION_NONE;
+ else if (!strcmp(str, "microcode"))
+ srso_mitigation = SRSO_MITIGATION_MICROCODE;
+ else if (!strcmp(str, "safe-ret"))
+ srso_mitigation = SRSO_MITIGATION_SAFE_RET;
+ else if (!strcmp(str, "ibpb"))
+ srso_mitigation = SRSO_MITIGATION_IBPB;
+ else if (!strcmp(str, "ibpb-vmexit"))
+ srso_mitigation = SRSO_MITIGATION_IBPB_ON_VMEXIT;
+ else
+ pr_err("Ignoring unknown SRSO option (%s).", str);
+
+ return 0;
+}
+early_param("spec_rstack_overflow", srso_parse_cmdline);
+
+#define SRSO_NOTICE "WARNING: See https://kernel.org/doc/html/latest/admin-guide/hw-vuln/srso.html for mitigation options."
+
+static void __init srso_select_mitigation(void)
+{
+ bool has_microcode;
+
+ if (!boot_cpu_has_bug(X86_BUG_SRSO) || cpu_mitigations_off())
+ srso_mitigation = SRSO_MITIGATION_NONE;
+
+ if (srso_mitigation == SRSO_MITIGATION_NONE)
+ return;
+
+ if (srso_mitigation == SRSO_MITIGATION_AUTO)
+ srso_mitigation = SRSO_MITIGATION_SAFE_RET;
+
+ has_microcode = boot_cpu_has(X86_FEATURE_IBPB_BRTYPE);
+ if (has_microcode) {
+ /*
+ * Zen1/2 with SMT off aren't vulnerable after the right
+ * IBPB microcode has been applied.
+ */
+ if (boot_cpu_data.x86 < 0x19 && !cpu_smt_possible()) {
+ setup_force_cpu_cap(X86_FEATURE_SRSO_NO);
+ srso_mitigation = SRSO_MITIGATION_NONE;
+ return;
+ }
+ } else {
+ pr_warn("IBPB-extending microcode not applied!\n");
+ pr_warn(SRSO_NOTICE);
+ }
+
+ switch (srso_mitigation) {
+ case SRSO_MITIGATION_SAFE_RET:
+ if (boot_cpu_has(X86_FEATURE_SRSO_USER_KERNEL_NO)) {
+ srso_mitigation = SRSO_MITIGATION_IBPB_ON_VMEXIT;
+ goto ibpb_on_vmexit;
+ }
+
+ if (!IS_ENABLED(CONFIG_MITIGATION_SRSO)) {
+ pr_err("WARNING: kernel not compiled with MITIGATION_SRSO.\n");
+ srso_mitigation = SRSO_MITIGATION_NONE;
+ }
+
+ if (!has_microcode)
+ srso_mitigation = SRSO_MITIGATION_SAFE_RET_UCODE_NEEDED;
+ break;
+ibpb_on_vmexit:
+ case SRSO_MITIGATION_IBPB_ON_VMEXIT:
+ if (boot_cpu_has(X86_FEATURE_SRSO_BP_SPEC_REDUCE)) {
+ pr_notice("Reducing speculation to address VM/HV SRSO attack vector.\n");
+ srso_mitigation = SRSO_MITIGATION_BP_SPEC_REDUCE;
+ break;
+ }
+ fallthrough;
+ case SRSO_MITIGATION_IBPB:
+ if (!IS_ENABLED(CONFIG_MITIGATION_IBPB_ENTRY)) {
+ pr_err("WARNING: kernel not compiled with MITIGATION_IBPB_ENTRY.\n");
+ srso_mitigation = SRSO_MITIGATION_NONE;
+ }
+
+ if (!has_microcode)
+ srso_mitigation = SRSO_MITIGATION_UCODE_NEEDED;
+ break;
+ default:
+ break;
+ }
+}
+
+static void __init srso_update_mitigation(void)
+{
+ /* If retbleed is using IBPB, that works for SRSO as well */
+ if (retbleed_mitigation == RETBLEED_MITIGATION_IBPB &&
+ boot_cpu_has(X86_FEATURE_IBPB_BRTYPE))
+ srso_mitigation = SRSO_MITIGATION_IBPB;
+
+ if (boot_cpu_has_bug(X86_BUG_SRSO) &&
+ !cpu_mitigations_off() &&
+ !boot_cpu_has(X86_FEATURE_SRSO_NO))
+ pr_info("%s\n", srso_strings[srso_mitigation]);
+}
+
+static void __init srso_apply_mitigation(void)
+{
+ /*
+ * Clear the feature flag if this mitigation is not selected as that
+ * feature flag controls the BpSpecReduce MSR bit toggling in KVM.
+ */
+ if (srso_mitigation != SRSO_MITIGATION_BP_SPEC_REDUCE)
+ setup_clear_cpu_cap(X86_FEATURE_SRSO_BP_SPEC_REDUCE);
+
+ if (srso_mitigation == SRSO_MITIGATION_NONE) {
+ if (boot_cpu_has(X86_FEATURE_SBPB))
+ x86_pred_cmd = PRED_CMD_SBPB;
+ return;
+ }
+
+ switch (srso_mitigation) {
+ case SRSO_MITIGATION_SAFE_RET:
+ case SRSO_MITIGATION_SAFE_RET_UCODE_NEEDED:
+ /*
+ * Enable the return thunk for generated code
+ * like ftrace, static_call, etc.
+ */
+ setup_force_cpu_cap(X86_FEATURE_RETHUNK);
+ setup_force_cpu_cap(X86_FEATURE_UNRET);
+
+ if (boot_cpu_data.x86 == 0x19) {
+ setup_force_cpu_cap(X86_FEATURE_SRSO_ALIAS);
+ set_return_thunk(srso_alias_return_thunk);
+ } else {
+ setup_force_cpu_cap(X86_FEATURE_SRSO);
+ set_return_thunk(srso_return_thunk);
+ }
+ break;
+ case SRSO_MITIGATION_IBPB:
+ setup_force_cpu_cap(X86_FEATURE_ENTRY_IBPB);
+ /*
+ * IBPB on entry already obviates the need for
+ * software-based untraining so clear those in case some
+ * other mitigation like Retbleed has selected them.
+ */
+ setup_clear_cpu_cap(X86_FEATURE_UNRET);
+ setup_clear_cpu_cap(X86_FEATURE_RETHUNK);
+ fallthrough;
+ case SRSO_MITIGATION_IBPB_ON_VMEXIT:
+ setup_force_cpu_cap(X86_FEATURE_IBPB_ON_VMEXIT);
+ /*
+ * There is no need for RSB filling: entry_ibpb() ensures
+ * all predictions, including the RSB, are invalidated,
+ * regardless of IBPB implementation.
+ */
+ setup_clear_cpu_cap(X86_FEATURE_RSB_VMEXIT);
+ break;
+ default:
+ break;
+ }
+}
+
+#undef pr_fmt
#define pr_fmt(fmt) fmt
#ifdef CONFIG_SYSFS
@@ -2176,74 +3039,74 @@ static const char * const l1tf_vmx_states[] = {
static ssize_t l1tf_show_state(char *buf)
{
if (l1tf_vmx_mitigation == VMENTER_L1D_FLUSH_AUTO)
- return sprintf(buf, "%s\n", L1TF_DEFAULT_MSG);
+ return sysfs_emit(buf, "%s\n", L1TF_DEFAULT_MSG);
if (l1tf_vmx_mitigation == VMENTER_L1D_FLUSH_EPT_DISABLED ||
(l1tf_vmx_mitigation == VMENTER_L1D_FLUSH_NEVER &&
sched_smt_active())) {
- return sprintf(buf, "%s; VMX: %s\n", L1TF_DEFAULT_MSG,
- l1tf_vmx_states[l1tf_vmx_mitigation]);
+ return sysfs_emit(buf, "%s; VMX: %s\n", L1TF_DEFAULT_MSG,
+ l1tf_vmx_states[l1tf_vmx_mitigation]);
}
- return sprintf(buf, "%s; VMX: %s, SMT %s\n", L1TF_DEFAULT_MSG,
- l1tf_vmx_states[l1tf_vmx_mitigation],
- sched_smt_active() ? "vulnerable" : "disabled");
+ return sysfs_emit(buf, "%s; VMX: %s, SMT %s\n", L1TF_DEFAULT_MSG,
+ l1tf_vmx_states[l1tf_vmx_mitigation],
+ sched_smt_active() ? "vulnerable" : "disabled");
}
static ssize_t itlb_multihit_show_state(char *buf)
{
if (!boot_cpu_has(X86_FEATURE_MSR_IA32_FEAT_CTL) ||
!boot_cpu_has(X86_FEATURE_VMX))
- return sprintf(buf, "KVM: Mitigation: VMX unsupported\n");
+ return sysfs_emit(buf, "KVM: Mitigation: VMX unsupported\n");
else if (!(cr4_read_shadow() & X86_CR4_VMXE))
- return sprintf(buf, "KVM: Mitigation: VMX disabled\n");
+ return sysfs_emit(buf, "KVM: Mitigation: VMX disabled\n");
else if (itlb_multihit_kvm_mitigation)
- return sprintf(buf, "KVM: Mitigation: Split huge pages\n");
+ return sysfs_emit(buf, "KVM: Mitigation: Split huge pages\n");
else
- return sprintf(buf, "KVM: Vulnerable\n");
+ return sysfs_emit(buf, "KVM: Vulnerable\n");
}
#else
static ssize_t l1tf_show_state(char *buf)
{
- return sprintf(buf, "%s\n", L1TF_DEFAULT_MSG);
+ return sysfs_emit(buf, "%s\n", L1TF_DEFAULT_MSG);
}
static ssize_t itlb_multihit_show_state(char *buf)
{
- return sprintf(buf, "Processor vulnerable\n");
+ return sysfs_emit(buf, "Processor vulnerable\n");
}
#endif
static ssize_t mds_show_state(char *buf)
{
if (boot_cpu_has(X86_FEATURE_HYPERVISOR)) {
- return sprintf(buf, "%s; SMT Host state unknown\n",
- mds_strings[mds_mitigation]);
+ return sysfs_emit(buf, "%s; SMT Host state unknown\n",
+ mds_strings[mds_mitigation]);
}
if (boot_cpu_has(X86_BUG_MSBDS_ONLY)) {
- return sprintf(buf, "%s; SMT %s\n", mds_strings[mds_mitigation],
- (mds_mitigation == MDS_MITIGATION_OFF ? "vulnerable" :
- sched_smt_active() ? "mitigated" : "disabled"));
+ return sysfs_emit(buf, "%s; SMT %s\n", mds_strings[mds_mitigation],
+ (mds_mitigation == MDS_MITIGATION_OFF ? "vulnerable" :
+ sched_smt_active() ? "mitigated" : "disabled"));
}
- return sprintf(buf, "%s; SMT %s\n", mds_strings[mds_mitigation],
- sched_smt_active() ? "vulnerable" : "disabled");
+ return sysfs_emit(buf, "%s; SMT %s\n", mds_strings[mds_mitigation],
+ sched_smt_active() ? "vulnerable" : "disabled");
}
static ssize_t tsx_async_abort_show_state(char *buf)
{
if ((taa_mitigation == TAA_MITIGATION_TSX_DISABLED) ||
(taa_mitigation == TAA_MITIGATION_OFF))
- return sprintf(buf, "%s\n", taa_strings[taa_mitigation]);
+ return sysfs_emit(buf, "%s\n", taa_strings[taa_mitigation]);
if (boot_cpu_has(X86_FEATURE_HYPERVISOR)) {
- return sprintf(buf, "%s; SMT Host state unknown\n",
- taa_strings[taa_mitigation]);
+ return sysfs_emit(buf, "%s; SMT Host state unknown\n",
+ taa_strings[taa_mitigation]);
}
- return sprintf(buf, "%s; SMT %s\n", taa_strings[taa_mitigation],
- sched_smt_active() ? "vulnerable" : "disabled");
+ return sysfs_emit(buf, "%s; SMT %s\n", taa_strings[taa_mitigation],
+ sched_smt_active() ? "vulnerable" : "disabled");
}
static ssize_t mmio_stale_data_show_state(char *buf)
@@ -2260,22 +3123,41 @@ static ssize_t mmio_stale_data_show_state(char *buf)
sched_smt_active() ? "vulnerable" : "disabled");
}
+static ssize_t rfds_show_state(char *buf)
+{
+ return sysfs_emit(buf, "%s\n", rfds_strings[rfds_mitigation]);
+}
+
+static ssize_t old_microcode_show_state(char *buf)
+{
+ if (boot_cpu_has(X86_FEATURE_HYPERVISOR))
+ return sysfs_emit(buf, "Unknown: running under hypervisor");
+
+ return sysfs_emit(buf, "Vulnerable\n");
+}
+
+static ssize_t its_show_state(char *buf)
+{
+ return sysfs_emit(buf, "%s\n", its_strings[its_mitigation]);
+}
+
static char *stibp_state(void)
{
- if (spectre_v2_in_ibrs_mode(spectre_v2_enabled))
+ if (spectre_v2_in_eibrs_mode(spectre_v2_enabled) &&
+ !boot_cpu_has(X86_FEATURE_AUTOIBRS))
return "";
switch (spectre_v2_user_stibp) {
case SPECTRE_V2_USER_NONE:
- return ", STIBP: disabled";
+ return "; STIBP: disabled";
case SPECTRE_V2_USER_STRICT:
- return ", STIBP: forced";
+ return "; STIBP: forced";
case SPECTRE_V2_USER_STRICT_PREFERRED:
- return ", STIBP: always-on";
+ return "; STIBP: always-on";
case SPECTRE_V2_USER_PRCTL:
case SPECTRE_V2_USER_SECCOMP:
if (static_key_enabled(&switch_to_cond_stibp))
- return ", STIBP: conditional";
+ return "; STIBP: conditional";
}
return "";
}
@@ -2284,82 +3166,129 @@ static char *ibpb_state(void)
{
if (boot_cpu_has(X86_FEATURE_IBPB)) {
if (static_key_enabled(&switch_mm_always_ibpb))
- return ", IBPB: always-on";
+ return "; IBPB: always-on";
if (static_key_enabled(&switch_mm_cond_ibpb))
- return ", IBPB: conditional";
- return ", IBPB: disabled";
+ return "; IBPB: conditional";
+ return "; IBPB: disabled";
}
return "";
}
+static char *pbrsb_eibrs_state(void)
+{
+ if (boot_cpu_has_bug(X86_BUG_EIBRS_PBRSB)) {
+ if (boot_cpu_has(X86_FEATURE_RSB_VMEXIT_LITE) ||
+ boot_cpu_has(X86_FEATURE_RSB_VMEXIT))
+ return "; PBRSB-eIBRS: SW sequence";
+ else
+ return "; PBRSB-eIBRS: Vulnerable";
+ } else {
+ return "; PBRSB-eIBRS: Not affected";
+ }
+}
+
+static const char *spectre_bhi_state(void)
+{
+ if (!boot_cpu_has_bug(X86_BUG_BHI))
+ return "; BHI: Not affected";
+ else if (boot_cpu_has(X86_FEATURE_CLEAR_BHB_HW))
+ return "; BHI: BHI_DIS_S";
+ else if (boot_cpu_has(X86_FEATURE_CLEAR_BHB_LOOP))
+ return "; BHI: SW loop, KVM: SW loop";
+ else if (boot_cpu_has(X86_FEATURE_RETPOLINE) &&
+ !boot_cpu_has(X86_FEATURE_RETPOLINE_LFENCE) &&
+ rrsba_disabled)
+ return "; BHI: Retpoline";
+ else if (boot_cpu_has(X86_FEATURE_CLEAR_BHB_VMEXIT))
+ return "; BHI: Vulnerable, KVM: SW loop";
+
+ return "; BHI: Vulnerable";
+}
+
static ssize_t spectre_v2_show_state(char *buf)
{
if (spectre_v2_enabled == SPECTRE_V2_LFENCE)
- return sprintf(buf, "Vulnerable: LFENCE\n");
+ return sysfs_emit(buf, "Vulnerable: LFENCE\n");
if (spectre_v2_enabled == SPECTRE_V2_EIBRS && unprivileged_ebpf_enabled())
- return sprintf(buf, "Vulnerable: eIBRS with unprivileged eBPF\n");
+ return sysfs_emit(buf, "Vulnerable: eIBRS with unprivileged eBPF\n");
if (sched_smt_active() && unprivileged_ebpf_enabled() &&
spectre_v2_enabled == SPECTRE_V2_EIBRS_LFENCE)
- return sprintf(buf, "Vulnerable: eIBRS+LFENCE with unprivileged eBPF and SMT\n");
+ return sysfs_emit(buf, "Vulnerable: eIBRS+LFENCE with unprivileged eBPF and SMT\n");
- return sprintf(buf, "%s%s%s%s%s%s\n",
- spectre_v2_strings[spectre_v2_enabled],
- ibpb_state(),
- boot_cpu_has(X86_FEATURE_USE_IBRS_FW) ? ", IBRS_FW" : "",
- stibp_state(),
- boot_cpu_has(X86_FEATURE_RSB_CTXSW) ? ", RSB filling" : "",
- spectre_v2_module_string());
+ return sysfs_emit(buf, "%s%s%s%s%s%s%s%s\n",
+ spectre_v2_strings[spectre_v2_enabled],
+ ibpb_state(),
+ boot_cpu_has(X86_FEATURE_USE_IBRS_FW) ? "; IBRS_FW" : "",
+ stibp_state(),
+ boot_cpu_has(X86_FEATURE_RSB_CTXSW) ? "; RSB filling" : "",
+ pbrsb_eibrs_state(),
+ spectre_bhi_state(),
+ /* this should always be at the end */
+ spectre_v2_module_string());
}
static ssize_t srbds_show_state(char *buf)
{
- return sprintf(buf, "%s\n", srbds_strings[srbds_mitigation]);
+ return sysfs_emit(buf, "%s\n", srbds_strings[srbds_mitigation]);
}
static ssize_t retbleed_show_state(char *buf)
{
- if (retbleed_mitigation == RETBLEED_MITIGATION_UNRET) {
- if (boot_cpu_data.x86_vendor != X86_VENDOR_AMD &&
- boot_cpu_data.x86_vendor != X86_VENDOR_HYGON)
- return sprintf(buf, "Vulnerable: untrained return thunk on non-Zen uarch\n");
+ if (retbleed_mitigation == RETBLEED_MITIGATION_UNRET ||
+ retbleed_mitigation == RETBLEED_MITIGATION_IBPB) {
+ if (boot_cpu_data.x86_vendor != X86_VENDOR_AMD &&
+ boot_cpu_data.x86_vendor != X86_VENDOR_HYGON)
+ return sysfs_emit(buf, "Vulnerable: untrained return thunk / IBPB on non-AMD based uarch\n");
- return sprintf(buf, "%s; SMT %s\n",
- retbleed_strings[retbleed_mitigation],
- !sched_smt_active() ? "disabled" :
- spectre_v2_user_stibp == SPECTRE_V2_USER_STRICT ||
- spectre_v2_user_stibp == SPECTRE_V2_USER_STRICT_PREFERRED ?
- "enabled with STIBP protection" : "vulnerable");
+ return sysfs_emit(buf, "%s; SMT %s\n", retbleed_strings[retbleed_mitigation],
+ !sched_smt_active() ? "disabled" :
+ spectre_v2_user_stibp == SPECTRE_V2_USER_STRICT ||
+ spectre_v2_user_stibp == SPECTRE_V2_USER_STRICT_PREFERRED ?
+ "enabled with STIBP protection" : "vulnerable");
}
- return sprintf(buf, "%s\n", retbleed_strings[retbleed_mitigation]);
+ return sysfs_emit(buf, "%s\n", retbleed_strings[retbleed_mitigation]);
+}
+
+static ssize_t srso_show_state(char *buf)
+{
+ if (boot_cpu_has(X86_FEATURE_SRSO_NO))
+ return sysfs_emit(buf, "Mitigation: SMT disabled\n");
+
+ return sysfs_emit(buf, "%s\n", srso_strings[srso_mitigation]);
+}
+
+static ssize_t gds_show_state(char *buf)
+{
+ return sysfs_emit(buf, "%s\n", gds_strings[gds_mitigation]);
}
static ssize_t cpu_show_common(struct device *dev, struct device_attribute *attr,
char *buf, unsigned int bug)
{
if (!boot_cpu_has_bug(bug))
- return sprintf(buf, "Not affected\n");
+ return sysfs_emit(buf, "Not affected\n");
switch (bug) {
case X86_BUG_CPU_MELTDOWN:
if (boot_cpu_has(X86_FEATURE_PTI))
- return sprintf(buf, "Mitigation: PTI\n");
+ return sysfs_emit(buf, "Mitigation: PTI\n");
if (hypervisor_is_type(X86_HYPER_XEN_PV))
- return sprintf(buf, "Unknown (XEN PV detected, hypervisor mitigation required)\n");
+ return sysfs_emit(buf, "Unknown (XEN PV detected, hypervisor mitigation required)\n");
break;
case X86_BUG_SPECTRE_V1:
- return sprintf(buf, "%s\n", spectre_v1_strings[spectre_v1_mitigation]);
+ return sysfs_emit(buf, "%s\n", spectre_v1_strings[spectre_v1_mitigation]);
case X86_BUG_SPECTRE_V2:
return spectre_v2_show_state(buf);
case X86_BUG_SPEC_STORE_BYPASS:
- return sprintf(buf, "%s\n", ssb_strings[ssb_mode]);
+ return sysfs_emit(buf, "%s\n", ssb_strings[ssb_mode]);
case X86_BUG_L1TF:
if (boot_cpu_has(X86_FEATURE_L1TF_PTEINV))
@@ -2384,11 +3313,26 @@ static ssize_t cpu_show_common(struct device *dev, struct device_attribute *attr
case X86_BUG_RETBLEED:
return retbleed_show_state(buf);
+ case X86_BUG_SRSO:
+ return srso_show_state(buf);
+
+ case X86_BUG_GDS:
+ return gds_show_state(buf);
+
+ case X86_BUG_RFDS:
+ return rfds_show_state(buf);
+
+ case X86_BUG_OLD_MICROCODE:
+ return old_microcode_show_state(buf);
+
+ case X86_BUG_ITS:
+ return its_show_state(buf);
+
default:
break;
}
- return sprintf(buf, "Vulnerable\n");
+ return sysfs_emit(buf, "Vulnerable\n");
}
ssize_t cpu_show_meltdown(struct device *dev, struct device_attribute *attr, char *buf)
@@ -2445,4 +3389,34 @@ ssize_t cpu_show_retbleed(struct device *dev, struct device_attribute *attr, cha
{
return cpu_show_common(dev, attr, buf, X86_BUG_RETBLEED);
}
+
+ssize_t cpu_show_spec_rstack_overflow(struct device *dev, struct device_attribute *attr, char *buf)
+{
+ return cpu_show_common(dev, attr, buf, X86_BUG_SRSO);
+}
+
+ssize_t cpu_show_gds(struct device *dev, struct device_attribute *attr, char *buf)
+{
+ return cpu_show_common(dev, attr, buf, X86_BUG_GDS);
+}
+
+ssize_t cpu_show_reg_file_data_sampling(struct device *dev, struct device_attribute *attr, char *buf)
+{
+ return cpu_show_common(dev, attr, buf, X86_BUG_RFDS);
+}
+
+ssize_t cpu_show_old_microcode(struct device *dev, struct device_attribute *attr, char *buf)
+{
+ return cpu_show_common(dev, attr, buf, X86_BUG_OLD_MICROCODE);
+}
+
+ssize_t cpu_show_indirect_target_selection(struct device *dev, struct device_attribute *attr, char *buf)
+{
+ return cpu_show_common(dev, attr, buf, X86_BUG_ITS);
+}
#endif
+
+void __warn_thunk(void)
+{
+ WARN_ONCE(1, "Unpatched return thunk in use. This should not happen!\n");
+}
diff --git a/arch/x86/kernel/cpu/bus_lock.c b/arch/x86/kernel/cpu/bus_lock.c
new file mode 100644
index 000000000000..981f8b1f0792
--- /dev/null
+++ b/arch/x86/kernel/cpu/bus_lock.c
@@ -0,0 +1,432 @@
+// SPDX-License-Identifier: GPL-2.0
+
+#define pr_fmt(fmt) "x86/split lock detection: " fmt
+
+#include <linux/semaphore.h>
+#include <linux/workqueue.h>
+#include <linux/delay.h>
+#include <linux/cpuhotplug.h>
+#include <asm/cpu_device_id.h>
+#include <asm/cmdline.h>
+#include <asm/traps.h>
+#include <asm/cpu.h>
+#include <asm/msr.h>
+
+enum split_lock_detect_state {
+ sld_off = 0,
+ sld_warn,
+ sld_fatal,
+ sld_ratelimit,
+};
+
+/*
+ * Default to sld_off because most systems do not support split lock detection.
+ * sld_state_setup() will switch this to sld_warn on systems that support
+ * split lock/bus lock detect, unless there is a command line override.
+ */
+static enum split_lock_detect_state sld_state __ro_after_init = sld_off;
+static u64 msr_test_ctrl_cache __ro_after_init;
+
+/*
+ * With a name like MSR_TEST_CTL it should go without saying, but don't touch
+ * MSR_TEST_CTL unless the CPU is one of the whitelisted models. Writing it
+ * on CPUs that do not support SLD can cause fireworks, even when writing '0'.
+ */
+static bool cpu_model_supports_sld __ro_after_init;
+
+static const struct {
+ const char *option;
+ enum split_lock_detect_state state;
+} sld_options[] __initconst = {
+ { "off", sld_off },
+ { "warn", sld_warn },
+ { "fatal", sld_fatal },
+ { "ratelimit:", sld_ratelimit },
+};
+
+static struct ratelimit_state bld_ratelimit;
+
+static unsigned int sysctl_sld_mitigate = 1;
+static DEFINE_SEMAPHORE(buslock_sem, 1);
+
+#ifdef CONFIG_PROC_SYSCTL
+static const struct ctl_table sld_sysctls[] = {
+ {
+ .procname = "split_lock_mitigate",
+ .data = &sysctl_sld_mitigate,
+ .maxlen = sizeof(unsigned int),
+ .mode = 0644,
+ .proc_handler = proc_douintvec_minmax,
+ .extra1 = SYSCTL_ZERO,
+ .extra2 = SYSCTL_ONE,
+ },
+};
+
+static int __init sld_mitigate_sysctl_init(void)
+{
+ register_sysctl_init("kernel", sld_sysctls);
+ return 0;
+}
+
+late_initcall(sld_mitigate_sysctl_init);
+#endif
+
+static inline bool match_option(const char *arg, int arglen, const char *opt)
+{
+ int len = strlen(opt), ratelimit;
+
+ if (strncmp(arg, opt, len))
+ return false;
+
+ /*
+ * Min ratelimit is 1 bus lock/sec.
+ * Max ratelimit is 1000 bus locks/sec.
+ */
+ if (sscanf(arg, "ratelimit:%d", &ratelimit) == 1 &&
+ ratelimit > 0 && ratelimit <= 1000) {
+ ratelimit_state_init(&bld_ratelimit, HZ, ratelimit);
+ ratelimit_set_flags(&bld_ratelimit, RATELIMIT_MSG_ON_RELEASE);
+ return true;
+ }
+
+ return len == arglen;
+}
+
+static bool split_lock_verify_msr(bool on)
+{
+ u64 ctrl, tmp;
+
+ if (rdmsrq_safe(MSR_TEST_CTRL, &ctrl))
+ return false;
+ if (on)
+ ctrl |= MSR_TEST_CTRL_SPLIT_LOCK_DETECT;
+ else
+ ctrl &= ~MSR_TEST_CTRL_SPLIT_LOCK_DETECT;
+ if (wrmsrq_safe(MSR_TEST_CTRL, ctrl))
+ return false;
+ rdmsrq(MSR_TEST_CTRL, tmp);
+ return ctrl == tmp;
+}
+
+static void __init sld_state_setup(void)
+{
+ enum split_lock_detect_state state = sld_warn;
+ char arg[20];
+ int i, ret;
+
+ if (!boot_cpu_has(X86_FEATURE_SPLIT_LOCK_DETECT) &&
+ !boot_cpu_has(X86_FEATURE_BUS_LOCK_DETECT))
+ return;
+
+ ret = cmdline_find_option(boot_command_line, "split_lock_detect",
+ arg, sizeof(arg));
+ if (ret >= 0) {
+ for (i = 0; i < ARRAY_SIZE(sld_options); i++) {
+ if (match_option(arg, ret, sld_options[i].option)) {
+ state = sld_options[i].state;
+ break;
+ }
+ }
+ }
+ sld_state = state;
+}
+
+static void __init __split_lock_setup(void)
+{
+ if (!split_lock_verify_msr(false)) {
+ pr_info("MSR access failed: Disabled\n");
+ return;
+ }
+
+ rdmsrq(MSR_TEST_CTRL, msr_test_ctrl_cache);
+
+ if (!split_lock_verify_msr(true)) {
+ pr_info("MSR access failed: Disabled\n");
+ return;
+ }
+
+ /* Restore the MSR to its cached value. */
+ wrmsrq(MSR_TEST_CTRL, msr_test_ctrl_cache);
+
+ setup_force_cpu_cap(X86_FEATURE_SPLIT_LOCK_DETECT);
+}
+
+/*
+ * MSR_TEST_CTRL is per core, but we treat it like a per CPU MSR. Locking
+ * is not implemented as one thread could undo the setting of the other
+ * thread immediately after dropping the lock anyway.
+ */
+static void sld_update_msr(bool on)
+{
+ u64 test_ctrl_val = msr_test_ctrl_cache;
+
+ if (on)
+ test_ctrl_val |= MSR_TEST_CTRL_SPLIT_LOCK_DETECT;
+
+ wrmsrq(MSR_TEST_CTRL, test_ctrl_val);
+}
+
+void split_lock_init(void)
+{
+ /*
+ * #DB for bus lock handles ratelimit and #AC for split lock is
+ * disabled.
+ */
+ if (sld_state == sld_ratelimit) {
+ split_lock_verify_msr(false);
+ return;
+ }
+
+ if (cpu_model_supports_sld)
+ split_lock_verify_msr(sld_state != sld_off);
+}
+
+static void __split_lock_reenable_unlock(struct work_struct *work)
+{
+ sld_update_msr(true);
+ up(&buslock_sem);
+}
+
+static DECLARE_DELAYED_WORK(sl_reenable_unlock, __split_lock_reenable_unlock);
+
+static void __split_lock_reenable(struct work_struct *work)
+{
+ sld_update_msr(true);
+}
+/*
+ * In order for each CPU to schedule its delayed work independently of the
+ * others, delayed work struct must be per-CPU. This is not required when
+ * sysctl_sld_mitigate is enabled because of the semaphore that limits
+ * the number of simultaneously scheduled delayed works to 1.
+ */
+static DEFINE_PER_CPU(struct delayed_work, sl_reenable);
+
+/*
+ * Per-CPU delayed_work can't be statically initialized properly because
+ * the struct address is unknown. Thus per-CPU delayed_work structures
+ * have to be initialized during kernel initialization and after calling
+ * setup_per_cpu_areas().
+ */
+static int __init setup_split_lock_delayed_work(void)
+{
+ unsigned int cpu;
+
+ for_each_possible_cpu(cpu) {
+ struct delayed_work *work = per_cpu_ptr(&sl_reenable, cpu);
+
+ INIT_DELAYED_WORK(work, __split_lock_reenable);
+ }
+
+ return 0;
+}
+pure_initcall(setup_split_lock_delayed_work);
+
+/*
+ * If a CPU goes offline with pending delayed work to re-enable split lock
+ * detection then the delayed work will be executed on some other CPU. That
+ * handles releasing the buslock_sem, but because it executes on a
+ * different CPU probably won't re-enable split lock detection. This is a
+ * problem on HT systems since the sibling CPU on the same core may then be
+ * left running with split lock detection disabled.
+ *
+ * Unconditionally re-enable detection here.
+ */
+static int splitlock_cpu_offline(unsigned int cpu)
+{
+ sld_update_msr(true);
+
+ return 0;
+}
+
+static void split_lock_warn(unsigned long ip)
+{
+ struct delayed_work *work;
+ int cpu;
+ unsigned int saved_sld_mitigate = READ_ONCE(sysctl_sld_mitigate);
+
+ if (!current->reported_split_lock)
+ pr_warn_ratelimited("#AC: %s/%d took a split_lock trap at address: 0x%lx\n",
+ current->comm, current->pid, ip);
+ current->reported_split_lock = 1;
+
+ if (saved_sld_mitigate) {
+ /*
+ * misery factor #1:
+ * sleep 10ms before trying to execute split lock.
+ */
+ if (msleep_interruptible(10) > 0)
+ return;
+ /*
+ * Misery factor #2:
+ * only allow one buslocked disabled core at a time.
+ */
+ if (down_interruptible(&buslock_sem) == -EINTR)
+ return;
+ }
+
+ cpu = get_cpu();
+ work = saved_sld_mitigate ? &sl_reenable_unlock : per_cpu_ptr(&sl_reenable, cpu);
+ schedule_delayed_work_on(cpu, work, 2);
+
+ /* Disable split lock detection on this CPU to make progress */
+ sld_update_msr(false);
+ put_cpu();
+}
+
+bool handle_guest_split_lock(unsigned long ip)
+{
+ if (sld_state == sld_warn) {
+ split_lock_warn(ip);
+ return true;
+ }
+
+ pr_warn_once("#AC: %s/%d %s split_lock trap at address: 0x%lx\n",
+ current->comm, current->pid,
+ sld_state == sld_fatal ? "fatal" : "bogus", ip);
+
+ current->thread.error_code = 0;
+ current->thread.trap_nr = X86_TRAP_AC;
+ force_sig_fault(SIGBUS, BUS_ADRALN, NULL);
+ return false;
+}
+EXPORT_SYMBOL_GPL(handle_guest_split_lock);
+
+void bus_lock_init(void)
+{
+ u64 val;
+
+ if (!boot_cpu_has(X86_FEATURE_BUS_LOCK_DETECT))
+ return;
+
+ rdmsrq(MSR_IA32_DEBUGCTLMSR, val);
+
+ if ((boot_cpu_has(X86_FEATURE_SPLIT_LOCK_DETECT) &&
+ (sld_state == sld_warn || sld_state == sld_fatal)) ||
+ sld_state == sld_off) {
+ /*
+ * Warn and fatal are handled by #AC for split lock if #AC for
+ * split lock is supported.
+ */
+ val &= ~DEBUGCTLMSR_BUS_LOCK_DETECT;
+ } else {
+ val |= DEBUGCTLMSR_BUS_LOCK_DETECT;
+ }
+
+ wrmsrq(MSR_IA32_DEBUGCTLMSR, val);
+}
+
+bool handle_user_split_lock(struct pt_regs *regs, long error_code)
+{
+ if ((regs->flags & X86_EFLAGS_AC) || sld_state == sld_fatal)
+ return false;
+ split_lock_warn(regs->ip);
+ return true;
+}
+
+void handle_bus_lock(struct pt_regs *regs)
+{
+ switch (sld_state) {
+ case sld_off:
+ break;
+ case sld_ratelimit:
+ /* Enforce no more than bld_ratelimit bus locks/sec. */
+ while (!__ratelimit(&bld_ratelimit))
+ msleep(20);
+ /* Warn on the bus lock. */
+ fallthrough;
+ case sld_warn:
+ pr_warn_ratelimited("#DB: %s/%d took a bus_lock trap at address: 0x%lx\n",
+ current->comm, current->pid, regs->ip);
+ break;
+ case sld_fatal:
+ force_sig_fault(SIGBUS, BUS_ADRALN, NULL);
+ break;
+ }
+}
+
+/*
+ * CPU models that are known to have the per-core split-lock detection
+ * feature even though they do not enumerate IA32_CORE_CAPABILITIES.
+ */
+static const struct x86_cpu_id split_lock_cpu_ids[] __initconst = {
+ X86_MATCH_VFM(INTEL_ICELAKE_X, 0),
+ X86_MATCH_VFM(INTEL_ICELAKE_L, 0),
+ X86_MATCH_VFM(INTEL_ICELAKE_D, 0),
+ {}
+};
+
+static void __init split_lock_setup(struct cpuinfo_x86 *c)
+{
+ const struct x86_cpu_id *m;
+ u64 ia32_core_caps;
+
+ if (boot_cpu_has(X86_FEATURE_HYPERVISOR))
+ return;
+
+ /* Check for CPUs that have support but do not enumerate it: */
+ m = x86_match_cpu(split_lock_cpu_ids);
+ if (m)
+ goto supported;
+
+ if (!cpu_has(c, X86_FEATURE_CORE_CAPABILITIES))
+ return;
+
+ /*
+ * Not all bits in MSR_IA32_CORE_CAPS are architectural, but
+ * MSR_IA32_CORE_CAPS_SPLIT_LOCK_DETECT is. All CPUs that set
+ * it have split lock detection.
+ */
+ rdmsrq(MSR_IA32_CORE_CAPS, ia32_core_caps);
+ if (ia32_core_caps & MSR_IA32_CORE_CAPS_SPLIT_LOCK_DETECT)
+ goto supported;
+
+ /* CPU is not in the model list and does not have the MSR bit: */
+ return;
+
+supported:
+ cpu_model_supports_sld = true;
+ __split_lock_setup();
+}
+
+static void sld_state_show(void)
+{
+ if (!boot_cpu_has(X86_FEATURE_BUS_LOCK_DETECT) &&
+ !boot_cpu_has(X86_FEATURE_SPLIT_LOCK_DETECT))
+ return;
+
+ switch (sld_state) {
+ case sld_off:
+ pr_info("disabled\n");
+ break;
+ case sld_warn:
+ if (boot_cpu_has(X86_FEATURE_SPLIT_LOCK_DETECT)) {
+ pr_info("#AC: crashing the kernel on kernel split_locks and warning on user-space split_locks\n");
+ if (cpuhp_setup_state(CPUHP_AP_ONLINE_DYN,
+ "x86/splitlock", NULL, splitlock_cpu_offline) < 0)
+ pr_warn("No splitlock CPU offline handler\n");
+ } else if (boot_cpu_has(X86_FEATURE_BUS_LOCK_DETECT)) {
+ pr_info("#DB: warning on user-space bus_locks\n");
+ }
+ break;
+ case sld_fatal:
+ if (boot_cpu_has(X86_FEATURE_SPLIT_LOCK_DETECT)) {
+ pr_info("#AC: crashing the kernel on kernel split_locks and sending SIGBUS on user-space split_locks\n");
+ } else if (boot_cpu_has(X86_FEATURE_BUS_LOCK_DETECT)) {
+ pr_info("#DB: sending SIGBUS on user-space bus_locks%s\n",
+ boot_cpu_has(X86_FEATURE_SPLIT_LOCK_DETECT) ?
+ " from non-WB" : "");
+ }
+ break;
+ case sld_ratelimit:
+ if (boot_cpu_has(X86_FEATURE_BUS_LOCK_DETECT))
+ pr_info("#DB: setting system wide bus lock rate limit to %u/sec\n", bld_ratelimit.burst);
+ break;
+ }
+}
+
+void __init sld_setup(struct cpuinfo_x86 *c)
+{
+ split_lock_setup(c);
+ sld_state_setup();
+ sld_state_show();
+}
diff --git a/arch/x86/kernel/cpu/cacheinfo.c b/arch/x86/kernel/cpu/cacheinfo.c
index fe98a1465be6..adfa7e8bb865 100644
--- a/arch/x86/kernel/cpu/cacheinfo.c
+++ b/arch/x86/kernel/cpu/cacheinfo.c
@@ -1,242 +1,160 @@
// SPDX-License-Identifier: GPL-2.0
/*
- * Routines to identify caches on Intel CPU.
+ * x86 CPU caches detection and configuration
*
- * Changes:
- * Venkatesh Pallipadi : Adding cache identification through cpuid(4)
- * Ashok Raj <ashok.raj@intel.com>: Work with CPU hotplug infrastructure.
- * Andi Kleen / Andreas Herrmann : CPUID4 emulation on AMD.
+ * Previous changes
+ * - Venkatesh Pallipadi: Cache identification through CPUID(0x4)
+ * - Ashok Raj <ashok.raj@intel.com>: Work with CPU hotplug infrastructure
+ * - Andi Kleen / Andreas Herrmann: CPUID(0x4) emulation on AMD
*/
-#include <linux/slab.h>
#include <linux/cacheinfo.h>
#include <linux/cpu.h>
-#include <linux/sched.h>
-#include <linux/capability.h>
-#include <linux/sysfs.h>
-#include <linux/pci.h>
+#include <linux/cpuhotplug.h>
+#include <linux/stop_machine.h>
-#include <asm/cpufeature.h>
+#include <asm/amd/nb.h>
#include <asm/cacheinfo.h>
-#include <asm/amd_nb.h>
+#include <asm/cpufeature.h>
+#include <asm/cpuid/api.h>
+#include <asm/mtrr.h>
#include <asm/smp.h>
+#include <asm/tlbflush.h>
#include "cpu.h"
-#define LVL_1_INST 1
-#define LVL_1_DATA 2
-#define LVL_2 3
-#define LVL_3 4
-#define LVL_TRACE 5
+/* Shared last level cache maps */
+DEFINE_PER_CPU_READ_MOSTLY(cpumask_var_t, cpu_llc_shared_map);
-struct _cache_table {
- unsigned char descriptor;
- char cache_type;
- short size;
-};
+/* Shared L2 cache maps */
+DEFINE_PER_CPU_READ_MOSTLY(cpumask_var_t, cpu_l2c_shared_map);
-#define MB(x) ((x) * 1024)
-
-/* All the cache descriptor types we care about (no TLB or
- trace cache entries) */
-
-static const struct _cache_table cache_table[] =
-{
- { 0x06, LVL_1_INST, 8 }, /* 4-way set assoc, 32 byte line size */
- { 0x08, LVL_1_INST, 16 }, /* 4-way set assoc, 32 byte line size */
- { 0x09, LVL_1_INST, 32 }, /* 4-way set assoc, 64 byte line size */
- { 0x0a, LVL_1_DATA, 8 }, /* 2 way set assoc, 32 byte line size */
- { 0x0c, LVL_1_DATA, 16 }, /* 4-way set assoc, 32 byte line size */
- { 0x0d, LVL_1_DATA, 16 }, /* 4-way set assoc, 64 byte line size */
- { 0x0e, LVL_1_DATA, 24 }, /* 6-way set assoc, 64 byte line size */
- { 0x21, LVL_2, 256 }, /* 8-way set assoc, 64 byte line size */
- { 0x22, LVL_3, 512 }, /* 4-way set assoc, sectored cache, 64 byte line size */
- { 0x23, LVL_3, MB(1) }, /* 8-way set assoc, sectored cache, 64 byte line size */
- { 0x25, LVL_3, MB(2) }, /* 8-way set assoc, sectored cache, 64 byte line size */
- { 0x29, LVL_3, MB(4) }, /* 8-way set assoc, sectored cache, 64 byte line size */
- { 0x2c, LVL_1_DATA, 32 }, /* 8-way set assoc, 64 byte line size */
- { 0x30, LVL_1_INST, 32 }, /* 8-way set assoc, 64 byte line size */
- { 0x39, LVL_2, 128 }, /* 4-way set assoc, sectored cache, 64 byte line size */
- { 0x3a, LVL_2, 192 }, /* 6-way set assoc, sectored cache, 64 byte line size */
- { 0x3b, LVL_2, 128 }, /* 2-way set assoc, sectored cache, 64 byte line size */
- { 0x3c, LVL_2, 256 }, /* 4-way set assoc, sectored cache, 64 byte line size */
- { 0x3d, LVL_2, 384 }, /* 6-way set assoc, sectored cache, 64 byte line size */
- { 0x3e, LVL_2, 512 }, /* 4-way set assoc, sectored cache, 64 byte line size */
- { 0x3f, LVL_2, 256 }, /* 2-way set assoc, 64 byte line size */
- { 0x41, LVL_2, 128 }, /* 4-way set assoc, 32 byte line size */
- { 0x42, LVL_2, 256 }, /* 4-way set assoc, 32 byte line size */
- { 0x43, LVL_2, 512 }, /* 4-way set assoc, 32 byte line size */
- { 0x44, LVL_2, MB(1) }, /* 4-way set assoc, 32 byte line size */
- { 0x45, LVL_2, MB(2) }, /* 4-way set assoc, 32 byte line size */
- { 0x46, LVL_3, MB(4) }, /* 4-way set assoc, 64 byte line size */
- { 0x47, LVL_3, MB(8) }, /* 8-way set assoc, 64 byte line size */
- { 0x48, LVL_2, MB(3) }, /* 12-way set assoc, 64 byte line size */
- { 0x49, LVL_3, MB(4) }, /* 16-way set assoc, 64 byte line size */
- { 0x4a, LVL_3, MB(6) }, /* 12-way set assoc, 64 byte line size */
- { 0x4b, LVL_3, MB(8) }, /* 16-way set assoc, 64 byte line size */
- { 0x4c, LVL_3, MB(12) }, /* 12-way set assoc, 64 byte line size */
- { 0x4d, LVL_3, MB(16) }, /* 16-way set assoc, 64 byte line size */
- { 0x4e, LVL_2, MB(6) }, /* 24-way set assoc, 64 byte line size */
- { 0x60, LVL_1_DATA, 16 }, /* 8-way set assoc, sectored cache, 64 byte line size */
- { 0x66, LVL_1_DATA, 8 }, /* 4-way set assoc, sectored cache, 64 byte line size */
- { 0x67, LVL_1_DATA, 16 }, /* 4-way set assoc, sectored cache, 64 byte line size */
- { 0x68, LVL_1_DATA, 32 }, /* 4-way set assoc, sectored cache, 64 byte line size */
- { 0x70, LVL_TRACE, 12 }, /* 8-way set assoc */
- { 0x71, LVL_TRACE, 16 }, /* 8-way set assoc */
- { 0x72, LVL_TRACE, 32 }, /* 8-way set assoc */
- { 0x73, LVL_TRACE, 64 }, /* 8-way set assoc */
- { 0x78, LVL_2, MB(1) }, /* 4-way set assoc, 64 byte line size */
- { 0x79, LVL_2, 128 }, /* 8-way set assoc, sectored cache, 64 byte line size */
- { 0x7a, LVL_2, 256 }, /* 8-way set assoc, sectored cache, 64 byte line size */
- { 0x7b, LVL_2, 512 }, /* 8-way set assoc, sectored cache, 64 byte line size */
- { 0x7c, LVL_2, MB(1) }, /* 8-way set assoc, sectored cache, 64 byte line size */
- { 0x7d, LVL_2, MB(2) }, /* 8-way set assoc, 64 byte line size */
- { 0x7f, LVL_2, 512 }, /* 2-way set assoc, 64 byte line size */
- { 0x80, LVL_2, 512 }, /* 8-way set assoc, 64 byte line size */
- { 0x82, LVL_2, 256 }, /* 8-way set assoc, 32 byte line size */
- { 0x83, LVL_2, 512 }, /* 8-way set assoc, 32 byte line size */
- { 0x84, LVL_2, MB(1) }, /* 8-way set assoc, 32 byte line size */
- { 0x85, LVL_2, MB(2) }, /* 8-way set assoc, 32 byte line size */
- { 0x86, LVL_2, 512 }, /* 4-way set assoc, 64 byte line size */
- { 0x87, LVL_2, MB(1) }, /* 8-way set assoc, 64 byte line size */
- { 0xd0, LVL_3, 512 }, /* 4-way set assoc, 64 byte line size */
- { 0xd1, LVL_3, MB(1) }, /* 4-way set assoc, 64 byte line size */
- { 0xd2, LVL_3, MB(2) }, /* 4-way set assoc, 64 byte line size */
- { 0xd6, LVL_3, MB(1) }, /* 8-way set assoc, 64 byte line size */
- { 0xd7, LVL_3, MB(2) }, /* 8-way set assoc, 64 byte line size */
- { 0xd8, LVL_3, MB(4) }, /* 12-way set assoc, 64 byte line size */
- { 0xdc, LVL_3, MB(2) }, /* 12-way set assoc, 64 byte line size */
- { 0xdd, LVL_3, MB(4) }, /* 12-way set assoc, 64 byte line size */
- { 0xde, LVL_3, MB(8) }, /* 12-way set assoc, 64 byte line size */
- { 0xe2, LVL_3, MB(2) }, /* 16-way set assoc, 64 byte line size */
- { 0xe3, LVL_3, MB(4) }, /* 16-way set assoc, 64 byte line size */
- { 0xe4, LVL_3, MB(8) }, /* 16-way set assoc, 64 byte line size */
- { 0xea, LVL_3, MB(12) }, /* 24-way set assoc, 64 byte line size */
- { 0xeb, LVL_3, MB(18) }, /* 24-way set assoc, 64 byte line size */
- { 0xec, LVL_3, MB(24) }, /* 24-way set assoc, 64 byte line size */
- { 0x00, 0, 0}
-};
+static cpumask_var_t cpu_cacheinfo_mask;
+/* Kernel controls MTRR and/or PAT MSRs. */
+unsigned int memory_caching_control __ro_after_init;
enum _cache_type {
- CTYPE_NULL = 0,
- CTYPE_DATA = 1,
- CTYPE_INST = 2,
- CTYPE_UNIFIED = 3
+ CTYPE_NULL = 0,
+ CTYPE_DATA = 1,
+ CTYPE_INST = 2,
+ CTYPE_UNIFIED = 3
};
union _cpuid4_leaf_eax {
struct {
- enum _cache_type type:5;
- unsigned int level:3;
- unsigned int is_self_initializing:1;
- unsigned int is_fully_associative:1;
- unsigned int reserved:4;
- unsigned int num_threads_sharing:12;
- unsigned int num_cores_on_die:6;
+ enum _cache_type type :5;
+ unsigned int level :3;
+ unsigned int is_self_initializing :1;
+ unsigned int is_fully_associative :1;
+ unsigned int reserved :4;
+ unsigned int num_threads_sharing :12;
+ unsigned int num_cores_on_die :6;
} split;
u32 full;
};
union _cpuid4_leaf_ebx {
struct {
- unsigned int coherency_line_size:12;
- unsigned int physical_line_partition:10;
- unsigned int ways_of_associativity:10;
+ unsigned int coherency_line_size :12;
+ unsigned int physical_line_partition :10;
+ unsigned int ways_of_associativity :10;
} split;
u32 full;
};
union _cpuid4_leaf_ecx {
struct {
- unsigned int number_of_sets:32;
+ unsigned int number_of_sets :32;
} split;
u32 full;
};
-struct _cpuid4_info_regs {
+struct _cpuid4_info {
union _cpuid4_leaf_eax eax;
union _cpuid4_leaf_ebx ebx;
union _cpuid4_leaf_ecx ecx;
unsigned int id;
unsigned long size;
- struct amd_northbridge *nb;
};
-static unsigned short num_cache_leaves;
+/* Map CPUID(0x4) EAX.cache_type to <linux/cacheinfo.h> types */
+static const enum cache_type cache_type_map[] = {
+ [CTYPE_NULL] = CACHE_TYPE_NOCACHE,
+ [CTYPE_DATA] = CACHE_TYPE_DATA,
+ [CTYPE_INST] = CACHE_TYPE_INST,
+ [CTYPE_UNIFIED] = CACHE_TYPE_UNIFIED,
+};
+
+/*
+ * Fallback AMD CPUID(0x4) emulation
+ * AMD CPUs with TOPOEXT can just use CPUID(0x8000001d)
+ *
+ * @AMD_L2_L3_INVALID_ASSOC: cache info for the respective L2/L3 cache should
+ * be determined from CPUID(0x8000001d) instead of CPUID(0x80000006).
+ */
-/* AMD doesn't have CPUID4. Emulate it here to report the same
- information to the user. This makes some assumptions about the machine:
- L2 not shared, no SMT etc. that is currently true on AMD CPUs.
+#define AMD_CPUID4_FULLY_ASSOCIATIVE 0xffff
+#define AMD_L2_L3_INVALID_ASSOC 0x9
- In theory the TLBs could be reported as fake type (they are in "dummy").
- Maybe later */
union l1_cache {
struct {
- unsigned line_size:8;
- unsigned lines_per_tag:8;
- unsigned assoc:8;
- unsigned size_in_kb:8;
+ unsigned line_size :8;
+ unsigned lines_per_tag :8;
+ unsigned assoc :8;
+ unsigned size_in_kb :8;
};
- unsigned val;
+ unsigned int val;
};
union l2_cache {
struct {
- unsigned line_size:8;
- unsigned lines_per_tag:4;
- unsigned assoc:4;
- unsigned size_in_kb:16;
+ unsigned line_size :8;
+ unsigned lines_per_tag :4;
+ unsigned assoc :4;
+ unsigned size_in_kb :16;
};
- unsigned val;
+ unsigned int val;
};
union l3_cache {
struct {
- unsigned line_size:8;
- unsigned lines_per_tag:4;
- unsigned assoc:4;
- unsigned res:2;
- unsigned size_encoded:14;
+ unsigned line_size :8;
+ unsigned lines_per_tag :4;
+ unsigned assoc :4;
+ unsigned res :2;
+ unsigned size_encoded :14;
};
- unsigned val;
+ unsigned int val;
};
+/* L2/L3 associativity mapping */
static const unsigned short assocs[] = {
- [1] = 1,
- [2] = 2,
- [4] = 4,
- [6] = 8,
- [8] = 16,
- [0xa] = 32,
- [0xb] = 48,
- [0xc] = 64,
- [0xd] = 96,
- [0xe] = 128,
- [0xf] = 0xffff /* fully associative - no way to show this currently */
+ [1] = 1,
+ [2] = 2,
+ [3] = 3,
+ [4] = 4,
+ [5] = 6,
+ [6] = 8,
+ [8] = 16,
+ [0xa] = 32,
+ [0xb] = 48,
+ [0xc] = 64,
+ [0xd] = 96,
+ [0xe] = 128,
+ [0xf] = AMD_CPUID4_FULLY_ASSOCIATIVE
};
static const unsigned char levels[] = { 1, 1, 2, 3 };
-static const unsigned char types[] = { 1, 2, 3, 3 };
-
-static const enum cache_type cache_type_map[] = {
- [CTYPE_NULL] = CACHE_TYPE_NOCACHE,
- [CTYPE_DATA] = CACHE_TYPE_DATA,
- [CTYPE_INST] = CACHE_TYPE_INST,
- [CTYPE_UNIFIED] = CACHE_TYPE_UNIFIED,
-};
+static const unsigned char types[] = { 1, 2, 3, 3 };
-static void
-amd_cpuid4(int leaf, union _cpuid4_leaf_eax *eax,
- union _cpuid4_leaf_ebx *ebx,
- union _cpuid4_leaf_ecx *ecx)
+static void legacy_amd_cpuid4(int index, union _cpuid4_leaf_eax *eax,
+ union _cpuid4_leaf_ebx *ebx, union _cpuid4_leaf_ecx *ecx)
{
- unsigned dummy;
- unsigned line_size, lines_per_tag, assoc, size_in_kb;
- union l1_cache l1i, l1d;
+ unsigned int dummy, line_size, lines_per_tag, assoc, size_in_kb;
+ union l1_cache l1i, l1d, *l1;
union l2_cache l2;
union l3_cache l3;
- union l1_cache *l1 = &l1d;
eax->full = 0;
ebx->full = 0;
@@ -245,430 +163,155 @@ amd_cpuid4(int leaf, union _cpuid4_leaf_eax *eax,
cpuid(0x80000005, &dummy, &dummy, &l1d.val, &l1i.val);
cpuid(0x80000006, &dummy, &dummy, &l2.val, &l3.val);
- switch (leaf) {
+ l1 = &l1d;
+ switch (index) {
case 1:
l1 = &l1i;
fallthrough;
case 0:
if (!l1->val)
return;
- assoc = assocs[l1->assoc];
- line_size = l1->line_size;
- lines_per_tag = l1->lines_per_tag;
- size_in_kb = l1->size_in_kb;
+
+ assoc = (l1->assoc == 0xff) ? AMD_CPUID4_FULLY_ASSOCIATIVE : l1->assoc;
+ line_size = l1->line_size;
+ lines_per_tag = l1->lines_per_tag;
+ size_in_kb = l1->size_in_kb;
break;
case 2:
- if (!l2.val)
+ if (!l2.assoc || l2.assoc == AMD_L2_L3_INVALID_ASSOC)
return;
- assoc = assocs[l2.assoc];
- line_size = l2.line_size;
- lines_per_tag = l2.lines_per_tag;
- /* cpu_data has errata corrections for K7 applied */
- size_in_kb = __this_cpu_read(cpu_info.x86_cache_size);
+
+ /* Use x86_cache_size as it might have K7 errata fixes */
+ assoc = assocs[l2.assoc];
+ line_size = l2.line_size;
+ lines_per_tag = l2.lines_per_tag;
+ size_in_kb = __this_cpu_read(cpu_info.x86_cache_size);
break;
case 3:
- if (!l3.val)
+ if (!l3.assoc || l3.assoc == AMD_L2_L3_INVALID_ASSOC)
return;
- assoc = assocs[l3.assoc];
- line_size = l3.line_size;
- lines_per_tag = l3.lines_per_tag;
- size_in_kb = l3.size_encoded * 512;
+
+ assoc = assocs[l3.assoc];
+ line_size = l3.line_size;
+ lines_per_tag = l3.lines_per_tag;
+ size_in_kb = l3.size_encoded * 512;
if (boot_cpu_has(X86_FEATURE_AMD_DCM)) {
- size_in_kb = size_in_kb >> 1;
- assoc = assoc >> 1;
+ size_in_kb = size_in_kb >> 1;
+ assoc = assoc >> 1;
}
break;
default:
return;
}
- eax->split.is_self_initializing = 1;
- eax->split.type = types[leaf];
- eax->split.level = levels[leaf];
- eax->split.num_threads_sharing = 0;
- eax->split.num_cores_on_die = __this_cpu_read(cpu_info.x86_max_cores) - 1;
+ eax->split.is_self_initializing = 1;
+ eax->split.type = types[index];
+ eax->split.level = levels[index];
+ eax->split.num_threads_sharing = 0;
+ eax->split.num_cores_on_die = topology_num_cores_per_package();
-
- if (assoc == 0xffff)
+ if (assoc == AMD_CPUID4_FULLY_ASSOCIATIVE)
eax->split.is_fully_associative = 1;
- ebx->split.coherency_line_size = line_size - 1;
- ebx->split.ways_of_associativity = assoc - 1;
- ebx->split.physical_line_partition = lines_per_tag - 1;
- ecx->split.number_of_sets = (size_in_kb * 1024) / line_size /
- (ebx->split.ways_of_associativity + 1) - 1;
-}
-
-#if defined(CONFIG_AMD_NB) && defined(CONFIG_SYSFS)
-
-/*
- * L3 cache descriptors
- */
-static void amd_calc_l3_indices(struct amd_northbridge *nb)
-{
- struct amd_l3_cache *l3 = &nb->l3_cache;
- unsigned int sc0, sc1, sc2, sc3;
- u32 val = 0;
-
- pci_read_config_dword(nb->misc, 0x1C4, &val);
-
- /* calculate subcache sizes */
- l3->subcaches[0] = sc0 = !(val & BIT(0));
- l3->subcaches[1] = sc1 = !(val & BIT(4));
-
- if (boot_cpu_data.x86 == 0x15) {
- l3->subcaches[0] = sc0 += !(val & BIT(1));
- l3->subcaches[1] = sc1 += !(val & BIT(5));
- }
-
- l3->subcaches[2] = sc2 = !(val & BIT(8)) + !(val & BIT(9));
- l3->subcaches[3] = sc3 = !(val & BIT(12)) + !(val & BIT(13));
-
- l3->indices = (max(max3(sc0, sc1, sc2), sc3) << 10) - 1;
-}
-
-/*
- * check whether a slot used for disabling an L3 index is occupied.
- * @l3: L3 cache descriptor
- * @slot: slot number (0..1)
- *
- * @returns: the disabled index if used or negative value if slot free.
- */
-static int amd_get_l3_disable_slot(struct amd_northbridge *nb, unsigned slot)
-{
- unsigned int reg = 0;
-
- pci_read_config_dword(nb->misc, 0x1BC + slot * 4, &reg);
-
- /* check whether this slot is activated already */
- if (reg & (3UL << 30))
- return reg & 0xfff;
-
- return -1;
-}
-
-static ssize_t show_cache_disable(struct cacheinfo *this_leaf, char *buf,
- unsigned int slot)
-{
- int index;
- struct amd_northbridge *nb = this_leaf->priv;
-
- index = amd_get_l3_disable_slot(nb, slot);
- if (index >= 0)
- return sprintf(buf, "%d\n", index);
-
- return sprintf(buf, "FREE\n");
-}
-
-#define SHOW_CACHE_DISABLE(slot) \
-static ssize_t \
-cache_disable_##slot##_show(struct device *dev, \
- struct device_attribute *attr, char *buf) \
-{ \
- struct cacheinfo *this_leaf = dev_get_drvdata(dev); \
- return show_cache_disable(this_leaf, buf, slot); \
-}
-SHOW_CACHE_DISABLE(0)
-SHOW_CACHE_DISABLE(1)
-
-static void amd_l3_disable_index(struct amd_northbridge *nb, int cpu,
- unsigned slot, unsigned long idx)
-{
- int i;
-
- idx |= BIT(30);
-
- /*
- * disable index in all 4 subcaches
- */
- for (i = 0; i < 4; i++) {
- u32 reg = idx | (i << 20);
-
- if (!nb->l3_cache.subcaches[i])
- continue;
-
- pci_write_config_dword(nb->misc, 0x1BC + slot * 4, reg);
-
- /*
- * We need to WBINVD on a core on the node containing the L3
- * cache which indices we disable therefore a simple wbinvd()
- * is not sufficient.
- */
- wbinvd_on_cpu(cpu);
-
- reg |= BIT(31);
- pci_write_config_dword(nb->misc, 0x1BC + slot * 4, reg);
- }
-}
-
-/*
- * disable a L3 cache index by using a disable-slot
- *
- * @l3: L3 cache descriptor
- * @cpu: A CPU on the node containing the L3 cache
- * @slot: slot number (0..1)
- * @index: index to disable
- *
- * @return: 0 on success, error status on failure
- */
-static int amd_set_l3_disable_slot(struct amd_northbridge *nb, int cpu,
- unsigned slot, unsigned long index)
-{
- int ret = 0;
-
- /* check if @slot is already used or the index is already disabled */
- ret = amd_get_l3_disable_slot(nb, slot);
- if (ret >= 0)
- return -EEXIST;
-
- if (index > nb->l3_cache.indices)
- return -EINVAL;
-
- /* check whether the other slot has disabled the same index already */
- if (index == amd_get_l3_disable_slot(nb, !slot))
- return -EEXIST;
-
- amd_l3_disable_index(nb, cpu, slot, index);
-
- return 0;
-}
-
-static ssize_t store_cache_disable(struct cacheinfo *this_leaf,
- const char *buf, size_t count,
- unsigned int slot)
-{
- unsigned long val = 0;
- int cpu, err = 0;
- struct amd_northbridge *nb = this_leaf->priv;
-
- if (!capable(CAP_SYS_ADMIN))
- return -EPERM;
-
- cpu = cpumask_first(&this_leaf->shared_cpu_map);
-
- if (kstrtoul(buf, 10, &val) < 0)
- return -EINVAL;
-
- err = amd_set_l3_disable_slot(nb, cpu, slot, val);
- if (err) {
- if (err == -EEXIST)
- pr_warn("L3 slot %d in use/index already disabled!\n",
- slot);
- return err;
- }
- return count;
-}
-
-#define STORE_CACHE_DISABLE(slot) \
-static ssize_t \
-cache_disable_##slot##_store(struct device *dev, \
- struct device_attribute *attr, \
- const char *buf, size_t count) \
-{ \
- struct cacheinfo *this_leaf = dev_get_drvdata(dev); \
- return store_cache_disable(this_leaf, buf, count, slot); \
-}
-STORE_CACHE_DISABLE(0)
-STORE_CACHE_DISABLE(1)
-
-static ssize_t subcaches_show(struct device *dev,
- struct device_attribute *attr, char *buf)
-{
- struct cacheinfo *this_leaf = dev_get_drvdata(dev);
- int cpu = cpumask_first(&this_leaf->shared_cpu_map);
-
- return sprintf(buf, "%x\n", amd_get_subcaches(cpu));
-}
-
-static ssize_t subcaches_store(struct device *dev,
- struct device_attribute *attr,
- const char *buf, size_t count)
-{
- struct cacheinfo *this_leaf = dev_get_drvdata(dev);
- int cpu = cpumask_first(&this_leaf->shared_cpu_map);
- unsigned long val;
-
- if (!capable(CAP_SYS_ADMIN))
- return -EPERM;
-
- if (kstrtoul(buf, 16, &val) < 0)
- return -EINVAL;
- if (amd_set_subcaches(cpu, val))
- return -EINVAL;
-
- return count;
+ ebx->split.coherency_line_size = line_size - 1;
+ ebx->split.ways_of_associativity = assoc - 1;
+ ebx->split.physical_line_partition = lines_per_tag - 1;
+ ecx->split.number_of_sets = (size_in_kb * 1024) / line_size /
+ (ebx->split.ways_of_associativity + 1) - 1;
}
-static DEVICE_ATTR_RW(cache_disable_0);
-static DEVICE_ATTR_RW(cache_disable_1);
-static DEVICE_ATTR_RW(subcaches);
-
-static umode_t
-cache_private_attrs_is_visible(struct kobject *kobj,
- struct attribute *attr, int unused)
+static int cpuid4_info_fill_done(struct _cpuid4_info *id4, union _cpuid4_leaf_eax eax,
+ union _cpuid4_leaf_ebx ebx, union _cpuid4_leaf_ecx ecx)
{
- struct device *dev = kobj_to_dev(kobj);
- struct cacheinfo *this_leaf = dev_get_drvdata(dev);
- umode_t mode = attr->mode;
-
- if (!this_leaf->priv)
- return 0;
-
- if ((attr == &dev_attr_subcaches.attr) &&
- amd_nb_has_feature(AMD_NB_L3_PARTITIONING))
- return mode;
+ if (eax.split.type == CTYPE_NULL)
+ return -EIO;
- if ((attr == &dev_attr_cache_disable_0.attr ||
- attr == &dev_attr_cache_disable_1.attr) &&
- amd_nb_has_feature(AMD_NB_L3_INDEX_DISABLE))
- return mode;
+ id4->eax = eax;
+ id4->ebx = ebx;
+ id4->ecx = ecx;
+ id4->size = (ecx.split.number_of_sets + 1) *
+ (ebx.split.coherency_line_size + 1) *
+ (ebx.split.physical_line_partition + 1) *
+ (ebx.split.ways_of_associativity + 1);
return 0;
}
-static struct attribute_group cache_private_group = {
- .is_visible = cache_private_attrs_is_visible,
-};
-
-static void init_amd_l3_attrs(void)
+static int amd_fill_cpuid4_info(int index, struct _cpuid4_info *id4)
{
- int n = 1;
- static struct attribute **amd_l3_attrs;
-
- if (amd_l3_attrs) /* already initialized */
- return;
-
- if (amd_nb_has_feature(AMD_NB_L3_INDEX_DISABLE))
- n += 2;
- if (amd_nb_has_feature(AMD_NB_L3_PARTITIONING))
- n += 1;
-
- amd_l3_attrs = kcalloc(n, sizeof(*amd_l3_attrs), GFP_KERNEL);
- if (!amd_l3_attrs)
- return;
-
- n = 0;
- if (amd_nb_has_feature(AMD_NB_L3_INDEX_DISABLE)) {
- amd_l3_attrs[n++] = &dev_attr_cache_disable_0.attr;
- amd_l3_attrs[n++] = &dev_attr_cache_disable_1.attr;
- }
- if (amd_nb_has_feature(AMD_NB_L3_PARTITIONING))
- amd_l3_attrs[n++] = &dev_attr_subcaches.attr;
-
- cache_private_group.attrs = amd_l3_attrs;
-}
-
-const struct attribute_group *
-cache_get_priv_group(struct cacheinfo *this_leaf)
-{
- struct amd_northbridge *nb = this_leaf->priv;
-
- if (this_leaf->level < 3 || !nb)
- return NULL;
+ union _cpuid4_leaf_eax eax;
+ union _cpuid4_leaf_ebx ebx;
+ union _cpuid4_leaf_ecx ecx;
+ u32 ignored;
- if (nb && nb->l3_cache.indices)
- init_amd_l3_attrs();
+ if (boot_cpu_has(X86_FEATURE_TOPOEXT) || boot_cpu_data.x86_vendor == X86_VENDOR_HYGON)
+ cpuid_count(0x8000001d, index, &eax.full, &ebx.full, &ecx.full, &ignored);
+ else
+ legacy_amd_cpuid4(index, &eax, &ebx, &ecx);
- return &cache_private_group;
+ return cpuid4_info_fill_done(id4, eax, ebx, ecx);
}
-static void amd_init_l3_cache(struct _cpuid4_info_regs *this_leaf, int index)
+static int intel_fill_cpuid4_info(int index, struct _cpuid4_info *id4)
{
- int node;
+ union _cpuid4_leaf_eax eax;
+ union _cpuid4_leaf_ebx ebx;
+ union _cpuid4_leaf_ecx ecx;
+ u32 ignored;
- /* only for L3, and not in virtualized environments */
- if (index < 3)
- return;
+ cpuid_count(4, index, &eax.full, &ebx.full, &ecx.full, &ignored);
- node = topology_die_id(smp_processor_id());
- this_leaf->nb = node_to_amd_nb(node);
- if (this_leaf->nb && !this_leaf->nb->l3_cache.indices)
- amd_calc_l3_indices(this_leaf->nb);
+ return cpuid4_info_fill_done(id4, eax, ebx, ecx);
}
-#else
-#define amd_init_l3_cache(x, y)
-#endif /* CONFIG_AMD_NB && CONFIG_SYSFS */
-static int
-cpuid4_cache_lookup_regs(int index, struct _cpuid4_info_regs *this_leaf)
+static int fill_cpuid4_info(int index, struct _cpuid4_info *id4)
{
- union _cpuid4_leaf_eax eax;
- union _cpuid4_leaf_ebx ebx;
- union _cpuid4_leaf_ecx ecx;
- unsigned edx;
-
- if (boot_cpu_data.x86_vendor == X86_VENDOR_AMD) {
- if (boot_cpu_has(X86_FEATURE_TOPOEXT))
- cpuid_count(0x8000001d, index, &eax.full,
- &ebx.full, &ecx.full, &edx);
- else
- amd_cpuid4(index, &eax, &ebx, &ecx);
- amd_init_l3_cache(this_leaf, index);
- } else if (boot_cpu_data.x86_vendor == X86_VENDOR_HYGON) {
- cpuid_count(0x8000001d, index, &eax.full,
- &ebx.full, &ecx.full, &edx);
- amd_init_l3_cache(this_leaf, index);
- } else {
- cpuid_count(4, index, &eax.full, &ebx.full, &ecx.full, &edx);
- }
+ u8 cpu_vendor = boot_cpu_data.x86_vendor;
- if (eax.split.type == CTYPE_NULL)
- return -EIO; /* better error ? */
-
- this_leaf->eax = eax;
- this_leaf->ebx = ebx;
- this_leaf->ecx = ecx;
- this_leaf->size = (ecx.split.number_of_sets + 1) *
- (ebx.split.coherency_line_size + 1) *
- (ebx.split.physical_line_partition + 1) *
- (ebx.split.ways_of_associativity + 1);
- return 0;
+ return (cpu_vendor == X86_VENDOR_AMD || cpu_vendor == X86_VENDOR_HYGON) ?
+ amd_fill_cpuid4_info(index, id4) :
+ intel_fill_cpuid4_info(index, id4);
}
static int find_num_cache_leaves(struct cpuinfo_x86 *c)
{
- unsigned int eax, ebx, ecx, edx, op;
- union _cpuid4_leaf_eax cache_eax;
- int i = -1;
-
- if (c->x86_vendor == X86_VENDOR_AMD ||
- c->x86_vendor == X86_VENDOR_HYGON)
- op = 0x8000001d;
- else
- op = 4;
+ unsigned int eax, ebx, ecx, edx, op;
+ union _cpuid4_leaf_eax cache_eax;
+ int i = -1;
+ /* Do a CPUID(op) loop to calculate num_cache_leaves */
+ op = (c->x86_vendor == X86_VENDOR_AMD || c->x86_vendor == X86_VENDOR_HYGON) ? 0x8000001d : 4;
do {
++i;
- /* Do cpuid(op) loop to find out num_cache_leaves */
cpuid_count(op, i, &eax, &ebx, &ecx, &edx);
cache_eax.full = eax;
} while (cache_eax.split.type != CTYPE_NULL);
return i;
}
-void cacheinfo_amd_init_llc_id(struct cpuinfo_x86 *c, int cpu)
+/*
+ * AMD/Hygon CPUs may have multiple LLCs if L3 caches exist.
+ */
+
+void cacheinfo_amd_init_llc_id(struct cpuinfo_x86 *c, u16 die_id)
{
- /*
- * We may have multiple LLCs if L3 caches exist, so check if we
- * have an L3 cache by looking at the L3 cache CPUID leaf.
- */
- if (!cpuid_edx(0x80000006))
+ if (!cpuid_amd_hygon_has_l3_cache())
return;
if (c->x86 < 0x17) {
- /* LLC is at the node level. */
- per_cpu(cpu_llc_id, cpu) = c->cpu_die_id;
+ /* Pre-Zen: LLC is at the node level */
+ c->topo.llc_id = die_id;
} else if (c->x86 == 0x17 && c->x86_model <= 0x1F) {
/*
- * LLC is at the core complex level.
- * Core complex ID is ApicId[3] for these processors.
+ * Family 17h up to 1F models: LLC is at the core
+ * complex level. Core complex ID is ApicId[3].
*/
- per_cpu(cpu_llc_id, cpu) = c->apicid >> 3;
+ c->topo.llc_id = c->topo.apicid >> 3;
} else {
/*
- * LLC ID is calculated from the number of threads sharing the
- * cache.
- * */
+ * Newer families: LLC ID is calculated from the number
+ * of threads sharing the L3 cache.
+ */
u32 eax, ebx, ecx, edx, num_sharing_cache = 0;
u32 llc_index = find_num_cache_leaves(c) - 1;
@@ -677,209 +320,167 @@ void cacheinfo_amd_init_llc_id(struct cpuinfo_x86 *c, int cpu)
num_sharing_cache = ((eax >> 14) & 0xfff) + 1;
if (num_sharing_cache) {
- int bits = get_count_order(num_sharing_cache);
+ int index_msb = get_count_order(num_sharing_cache);
- per_cpu(cpu_llc_id, cpu) = c->apicid >> bits;
+ c->topo.llc_id = c->topo.apicid >> index_msb;
}
}
}
-void cacheinfo_hygon_init_llc_id(struct cpuinfo_x86 *c, int cpu)
+void cacheinfo_hygon_init_llc_id(struct cpuinfo_x86 *c)
{
- /*
- * We may have multiple LLCs if L3 caches exist, so check if we
- * have an L3 cache by looking at the L3 cache CPUID leaf.
- */
- if (!cpuid_edx(0x80000006))
+ if (!cpuid_amd_hygon_has_l3_cache())
return;
/*
- * LLC is at the core complex level.
- * Core complex ID is ApicId[3] for these processors.
+ * Hygons are similar to AMD Family 17h up to 1F models: LLC is
+ * at the core complex level. Core complex ID is ApicId[3].
*/
- per_cpu(cpu_llc_id, cpu) = c->apicid >> 3;
+ c->topo.llc_id = c->topo.apicid >> 3;
}
void init_amd_cacheinfo(struct cpuinfo_x86 *c)
{
+ struct cpu_cacheinfo *ci = get_cpu_cacheinfo(c->cpu_index);
- if (boot_cpu_has(X86_FEATURE_TOPOEXT)) {
- num_cache_leaves = find_num_cache_leaves(c);
- } else if (c->extended_cpuid_level >= 0x80000006) {
- if (cpuid_edx(0x80000006) & 0xf000)
- num_cache_leaves = 4;
- else
- num_cache_leaves = 3;
- }
+ if (boot_cpu_has(X86_FEATURE_TOPOEXT))
+ ci->num_leaves = find_num_cache_leaves(c);
+ else if (c->extended_cpuid_level >= 0x80000006)
+ ci->num_leaves = (cpuid_edx(0x80000006) & 0xf000) ? 4 : 3;
}
void init_hygon_cacheinfo(struct cpuinfo_x86 *c)
{
- num_cache_leaves = find_num_cache_leaves(c);
+ struct cpu_cacheinfo *ci = get_cpu_cacheinfo(c->cpu_index);
+
+ ci->num_leaves = find_num_cache_leaves(c);
}
-void init_intel_cacheinfo(struct cpuinfo_x86 *c)
+static void intel_cacheinfo_done(struct cpuinfo_x86 *c, unsigned int l3,
+ unsigned int l2, unsigned int l1i, unsigned int l1d)
{
- /* Cache sizes */
- unsigned int trace = 0, l1i = 0, l1d = 0, l2 = 0, l3 = 0;
- unsigned int new_l1d = 0, new_l1i = 0; /* Cache sizes from cpuid(4) */
- unsigned int new_l2 = 0, new_l3 = 0, i; /* Cache sizes from cpuid(4) */
- unsigned int l2_id = 0, l3_id = 0, num_threads_sharing, index_msb;
-#ifdef CONFIG_SMP
- unsigned int cpu = c->cpu_index;
-#endif
-
- if (c->cpuid_level > 3) {
- static int is_initialized;
-
- if (is_initialized == 0) {
- /* Init num_cache_leaves from boot CPU */
- num_cache_leaves = find_num_cache_leaves(c);
- is_initialized++;
- }
+ /*
+ * If llc_id is still unset, then cpuid_level < 4, which implies
+ * that the only possibility left is SMT. Since CPUID(0x2) doesn't
+ * specify any shared caches and SMT shares all caches, we can
+ * unconditionally set LLC ID to the package ID so that all
+ * threads share it.
+ */
+ if (c->topo.llc_id == BAD_APICID)
+ c->topo.llc_id = c->topo.pkg_id;
- /*
- * Whenever possible use cpuid(4), deterministic cache
- * parameters cpuid leaf to find the cache details
- */
- for (i = 0; i < num_cache_leaves; i++) {
- struct _cpuid4_info_regs this_leaf = {};
- int retval;
+ c->x86_cache_size = l3 ? l3 : (l2 ? l2 : l1i + l1d);
- retval = cpuid4_cache_lookup_regs(i, &this_leaf);
- if (retval < 0)
- continue;
+ if (!l2)
+ cpu_detect_cache_sizes(c);
+}
- switch (this_leaf.eax.split.level) {
- case 1:
- if (this_leaf.eax.split.type == CTYPE_DATA)
- new_l1d = this_leaf.size/1024;
- else if (this_leaf.eax.split.type == CTYPE_INST)
- new_l1i = this_leaf.size/1024;
- break;
- case 2:
- new_l2 = this_leaf.size/1024;
- num_threads_sharing = 1 + this_leaf.eax.split.num_threads_sharing;
- index_msb = get_count_order(num_threads_sharing);
- l2_id = c->apicid & ~((1 << index_msb) - 1);
- break;
- case 3:
- new_l3 = this_leaf.size/1024;
- num_threads_sharing = 1 + this_leaf.eax.split.num_threads_sharing;
- index_msb = get_count_order(num_threads_sharing);
- l3_id = c->apicid & ~((1 << index_msb) - 1);
- break;
- default:
- break;
- }
+/*
+ * Legacy Intel CPUID(0x2) path if CPUID(0x4) is not available.
+ */
+static void intel_cacheinfo_0x2(struct cpuinfo_x86 *c)
+{
+ unsigned int l1i = 0, l1d = 0, l2 = 0, l3 = 0;
+ const struct leaf_0x2_table *desc;
+ union leaf_0x2_regs regs;
+ u8 *ptr;
+
+ if (c->cpuid_level < 2)
+ return;
+
+ cpuid_leaf_0x2(&regs);
+ for_each_cpuid_0x2_desc(regs, ptr, desc) {
+ switch (desc->c_type) {
+ case CACHE_L1_INST: l1i += desc->c_size; break;
+ case CACHE_L1_DATA: l1d += desc->c_size; break;
+ case CACHE_L2: l2 += desc->c_size; break;
+ case CACHE_L3: l3 += desc->c_size; break;
}
}
+
+ intel_cacheinfo_done(c, l3, l2, l1i, l1d);
+}
+
+static unsigned int calc_cache_topo_id(struct cpuinfo_x86 *c, const struct _cpuid4_info *id4)
+{
+ unsigned int num_threads_sharing;
+ int index_msb;
+
+ num_threads_sharing = 1 + id4->eax.split.num_threads_sharing;
+ index_msb = get_count_order(num_threads_sharing);
+ return c->topo.apicid & ~((1 << index_msb) - 1);
+}
+
+static bool intel_cacheinfo_0x4(struct cpuinfo_x86 *c)
+{
+ struct cpu_cacheinfo *ci = get_cpu_cacheinfo(c->cpu_index);
+ unsigned int l2_id = BAD_APICID, l3_id = BAD_APICID;
+ unsigned int l1d = 0, l1i = 0, l2 = 0, l3 = 0;
+
+ if (c->cpuid_level < 4)
+ return false;
+
/*
- * Don't use cpuid2 if cpuid4 is supported. For P4, we use cpuid2 for
- * trace cache
+ * There should be at least one leaf. A non-zero value means
+ * that the number of leaves has been previously initialized.
*/
- if ((num_cache_leaves == 0 || c->x86 == 15) && c->cpuid_level > 1) {
- /* supports eax=2 call */
- int j, n;
- unsigned int regs[4];
- unsigned char *dp = (unsigned char *)regs;
- int only_trace = 0;
-
- if (num_cache_leaves != 0 && c->x86 == 15)
- only_trace = 1;
-
- /* Number of times to iterate */
- n = cpuid_eax(2) & 0xFF;
-
- for (i = 0 ; i < n ; i++) {
- cpuid(2, &regs[0], &regs[1], &regs[2], &regs[3]);
-
- /* If bit 31 is set, this is an unknown format */
- for (j = 0 ; j < 3 ; j++)
- if (regs[j] & (1 << 31))
- regs[j] = 0;
-
- /* Byte 0 is level count, not a descriptor */
- for (j = 1 ; j < 16 ; j++) {
- unsigned char des = dp[j];
- unsigned char k = 0;
-
- /* look up this descriptor in the table */
- while (cache_table[k].descriptor != 0) {
- if (cache_table[k].descriptor == des) {
- if (only_trace && cache_table[k].cache_type != LVL_TRACE)
- break;
- switch (cache_table[k].cache_type) {
- case LVL_1_INST:
- l1i += cache_table[k].size;
- break;
- case LVL_1_DATA:
- l1d += cache_table[k].size;
- break;
- case LVL_2:
- l2 += cache_table[k].size;
- break;
- case LVL_3:
- l3 += cache_table[k].size;
- break;
- case LVL_TRACE:
- trace += cache_table[k].size;
- break;
- }
-
- break;
- }
-
- k++;
- }
- }
- }
- }
+ if (!ci->num_leaves)
+ ci->num_leaves = find_num_cache_leaves(c);
- if (new_l1d)
- l1d = new_l1d;
+ if (!ci->num_leaves)
+ return false;
- if (new_l1i)
- l1i = new_l1i;
+ for (int i = 0; i < ci->num_leaves; i++) {
+ struct _cpuid4_info id4 = {};
+ int ret;
- if (new_l2) {
- l2 = new_l2;
-#ifdef CONFIG_SMP
- per_cpu(cpu_llc_id, cpu) = l2_id;
- per_cpu(cpu_l2c_id, cpu) = l2_id;
-#endif
- }
+ ret = intel_fill_cpuid4_info(i, &id4);
+ if (ret < 0)
+ continue;
- if (new_l3) {
- l3 = new_l3;
-#ifdef CONFIG_SMP
- per_cpu(cpu_llc_id, cpu) = l3_id;
-#endif
+ switch (id4.eax.split.level) {
+ case 1:
+ if (id4.eax.split.type == CTYPE_DATA)
+ l1d = id4.size / 1024;
+ else if (id4.eax.split.type == CTYPE_INST)
+ l1i = id4.size / 1024;
+ break;
+ case 2:
+ l2 = id4.size / 1024;
+ l2_id = calc_cache_topo_id(c, &id4);
+ break;
+ case 3:
+ l3 = id4.size / 1024;
+ l3_id = calc_cache_topo_id(c, &id4);
+ break;
+ default:
+ break;
+ }
}
-#ifdef CONFIG_SMP
- /*
- * If cpu_llc_id is not yet set, this means cpuid_level < 4 which in
- * turns means that the only possibility is SMT (as indicated in
- * cpuid1). Since cpuid2 doesn't specify shared caches, and we know
- * that SMT shares all caches, we can unconditionally set cpu_llc_id to
- * c->phys_proc_id.
- */
- if (per_cpu(cpu_llc_id, cpu) == BAD_APICID)
- per_cpu(cpu_llc_id, cpu) = c->phys_proc_id;
-#endif
+ c->topo.l2c_id = l2_id;
+ c->topo.llc_id = (l3_id == BAD_APICID) ? l2_id : l3_id;
+ intel_cacheinfo_done(c, l3, l2, l1i, l1d);
+ return true;
+}
- c->x86_cache_size = l3 ? l3 : (l2 ? l2 : (l1i+l1d));
+void init_intel_cacheinfo(struct cpuinfo_x86 *c)
+{
+ /* Don't use CPUID(0x2) if CPUID(0x4) is supported. */
+ if (intel_cacheinfo_0x4(c))
+ return;
- if (!l2)
- cpu_detect_cache_sizes(c);
+ intel_cacheinfo_0x2(c);
}
+/*
+ * <linux/cacheinfo.h> shared_cpu_map setup, AMD/Hygon
+ */
static int __cache_amd_cpumap_setup(unsigned int cpu, int index,
- struct _cpuid4_info_regs *base)
+ const struct _cpuid4_info *id4)
{
struct cpu_cacheinfo *this_cpu_ci;
- struct cacheinfo *this_leaf;
+ struct cacheinfo *ci;
int i, sibling;
/*
@@ -891,19 +492,19 @@ static int __cache_amd_cpumap_setup(unsigned int cpu, int index,
this_cpu_ci = get_cpu_cacheinfo(i);
if (!this_cpu_ci->info_list)
continue;
- this_leaf = this_cpu_ci->info_list + index;
+
+ ci = this_cpu_ci->info_list + index;
for_each_cpu(sibling, cpu_llc_shared_mask(cpu)) {
if (!cpu_online(sibling))
continue;
- cpumask_set_cpu(sibling,
- &this_leaf->shared_cpu_map);
+ cpumask_set_cpu(sibling, &ci->shared_cpu_map);
}
}
} else if (boot_cpu_has(X86_FEATURE_TOPOEXT)) {
unsigned int apicid, nshared, first, last;
- nshared = base->eax.split.num_threads_sharing + 1;
- apicid = cpu_data(cpu).apicid;
+ nshared = id4->eax.split.num_threads_sharing + 1;
+ apicid = cpu_data(cpu).topo.apicid;
first = apicid - (apicid % nshared);
last = first + nshared - 1;
@@ -912,18 +513,17 @@ static int __cache_amd_cpumap_setup(unsigned int cpu, int index,
if (!this_cpu_ci->info_list)
continue;
- apicid = cpu_data(i).apicid;
+ apicid = cpu_data(i).topo.apicid;
if ((apicid < first) || (apicid > last))
continue;
- this_leaf = this_cpu_ci->info_list + index;
+ ci = this_cpu_ci->info_list + index;
for_each_online_cpu(sibling) {
- apicid = cpu_data(sibling).apicid;
+ apicid = cpu_data(sibling).topo.apicid;
if ((apicid < first) || (apicid > last))
continue;
- cpumask_set_cpu(sibling,
- &this_leaf->shared_cpu_map);
+ cpumask_set_cpu(sibling, &ci->shared_cpu_map);
}
}
} else
@@ -932,105 +532,295 @@ static int __cache_amd_cpumap_setup(unsigned int cpu, int index,
return 1;
}
+/*
+ * <linux/cacheinfo.h> shared_cpu_map setup, Intel + fallback AMD/Hygon
+ */
static void __cache_cpumap_setup(unsigned int cpu, int index,
- struct _cpuid4_info_regs *base)
+ const struct _cpuid4_info *id4)
{
struct cpu_cacheinfo *this_cpu_ci = get_cpu_cacheinfo(cpu);
- struct cacheinfo *this_leaf, *sibling_leaf;
+ struct cpuinfo_x86 *c = &cpu_data(cpu);
+ struct cacheinfo *ci, *sibling_ci;
unsigned long num_threads_sharing;
int index_msb, i;
- struct cpuinfo_x86 *c = &cpu_data(cpu);
- if (c->x86_vendor == X86_VENDOR_AMD ||
- c->x86_vendor == X86_VENDOR_HYGON) {
- if (__cache_amd_cpumap_setup(cpu, index, base))
+ if (c->x86_vendor == X86_VENDOR_AMD || c->x86_vendor == X86_VENDOR_HYGON) {
+ if (__cache_amd_cpumap_setup(cpu, index, id4))
return;
}
- this_leaf = this_cpu_ci->info_list + index;
- num_threads_sharing = 1 + base->eax.split.num_threads_sharing;
+ ci = this_cpu_ci->info_list + index;
+ num_threads_sharing = 1 + id4->eax.split.num_threads_sharing;
- cpumask_set_cpu(cpu, &this_leaf->shared_cpu_map);
+ cpumask_set_cpu(cpu, &ci->shared_cpu_map);
if (num_threads_sharing == 1)
return;
index_msb = get_count_order(num_threads_sharing);
for_each_online_cpu(i)
- if (cpu_data(i).apicid >> index_msb == c->apicid >> index_msb) {
+ if (cpu_data(i).topo.apicid >> index_msb == c->topo.apicid >> index_msb) {
struct cpu_cacheinfo *sib_cpu_ci = get_cpu_cacheinfo(i);
+ /* Skip if itself or no cacheinfo */
if (i == cpu || !sib_cpu_ci->info_list)
- continue;/* skip if itself or no cacheinfo */
- sibling_leaf = sib_cpu_ci->info_list + index;
- cpumask_set_cpu(i, &this_leaf->shared_cpu_map);
- cpumask_set_cpu(cpu, &sibling_leaf->shared_cpu_map);
+ continue;
+
+ sibling_ci = sib_cpu_ci->info_list + index;
+ cpumask_set_cpu(i, &ci->shared_cpu_map);
+ cpumask_set_cpu(cpu, &sibling_ci->shared_cpu_map);
}
}
-static void ci_leaf_init(struct cacheinfo *this_leaf,
- struct _cpuid4_info_regs *base)
+static void ci_info_init(struct cacheinfo *ci, const struct _cpuid4_info *id4,
+ struct amd_northbridge *nb)
{
- this_leaf->id = base->id;
- this_leaf->attributes = CACHE_ID;
- this_leaf->level = base->eax.split.level;
- this_leaf->type = cache_type_map[base->eax.split.type];
- this_leaf->coherency_line_size =
- base->ebx.split.coherency_line_size + 1;
- this_leaf->ways_of_associativity =
- base->ebx.split.ways_of_associativity + 1;
- this_leaf->size = base->size;
- this_leaf->number_of_sets = base->ecx.split.number_of_sets + 1;
- this_leaf->physical_line_partition =
- base->ebx.split.physical_line_partition + 1;
- this_leaf->priv = base->nb;
+ ci->id = id4->id;
+ ci->attributes = CACHE_ID;
+ ci->level = id4->eax.split.level;
+ ci->type = cache_type_map[id4->eax.split.type];
+ ci->coherency_line_size = id4->ebx.split.coherency_line_size + 1;
+ ci->ways_of_associativity = id4->ebx.split.ways_of_associativity + 1;
+ ci->size = id4->size;
+ ci->number_of_sets = id4->ecx.split.number_of_sets + 1;
+ ci->physical_line_partition = id4->ebx.split.physical_line_partition + 1;
+ ci->priv = nb;
}
int init_cache_level(unsigned int cpu)
{
- struct cpu_cacheinfo *this_cpu_ci = get_cpu_cacheinfo(cpu);
+ struct cpu_cacheinfo *ci = get_cpu_cacheinfo(cpu);
- if (!num_cache_leaves)
+ /* There should be at least one leaf. */
+ if (!ci->num_leaves)
return -ENOENT;
- if (!this_cpu_ci)
- return -EINVAL;
- this_cpu_ci->num_levels = 3;
- this_cpu_ci->num_leaves = num_cache_leaves;
+
return 0;
}
/*
- * The max shared threads number comes from CPUID.4:EAX[25-14] with input
+ * The max shared threads number comes from CPUID(0x4) EAX[25-14] with input
* ECX as cache index. Then right shift apicid by the number's order to get
* cache id for this cache node.
*/
-static void get_cache_id(int cpu, struct _cpuid4_info_regs *id4_regs)
+static void get_cache_id(int cpu, struct _cpuid4_info *id4)
{
struct cpuinfo_x86 *c = &cpu_data(cpu);
unsigned long num_threads_sharing;
int index_msb;
- num_threads_sharing = 1 + id4_regs->eax.split.num_threads_sharing;
+ num_threads_sharing = 1 + id4->eax.split.num_threads_sharing;
index_msb = get_count_order(num_threads_sharing);
- id4_regs->id = c->apicid >> index_msb;
+ id4->id = c->topo.apicid >> index_msb;
}
int populate_cache_leaves(unsigned int cpu)
{
- unsigned int idx, ret;
struct cpu_cacheinfo *this_cpu_ci = get_cpu_cacheinfo(cpu);
- struct cacheinfo *this_leaf = this_cpu_ci->info_list;
- struct _cpuid4_info_regs id4_regs = {};
+ struct cacheinfo *ci = this_cpu_ci->info_list;
+ u8 cpu_vendor = boot_cpu_data.x86_vendor;
+ struct amd_northbridge *nb = NULL;
+ struct _cpuid4_info id4 = {};
+ int idx, ret;
for (idx = 0; idx < this_cpu_ci->num_leaves; idx++) {
- ret = cpuid4_cache_lookup_regs(idx, &id4_regs);
+ ret = fill_cpuid4_info(idx, &id4);
if (ret)
return ret;
- get_cache_id(cpu, &id4_regs);
- ci_leaf_init(this_leaf++, &id4_regs);
- __cache_cpumap_setup(cpu, idx, &id4_regs);
+
+ get_cache_id(cpu, &id4);
+
+ if (cpu_vendor == X86_VENDOR_AMD || cpu_vendor == X86_VENDOR_HYGON)
+ nb = amd_init_l3_cache(idx);
+
+ ci_info_init(ci++, &id4, nb);
+ __cache_cpumap_setup(cpu, idx, &id4);
}
+
this_cpu_ci->cpu_map_populated = true;
+ return 0;
+}
+
+/*
+ * Disable and enable caches. Needed for changing MTRRs and the PAT MSR.
+ *
+ * Since we are disabling the cache don't allow any interrupts,
+ * they would run extremely slow and would only increase the pain.
+ *
+ * The caller must ensure that local interrupts are disabled and
+ * are reenabled after cache_enable() has been called.
+ */
+static unsigned long saved_cr4;
+static DEFINE_RAW_SPINLOCK(cache_disable_lock);
+
+/*
+ * Cache flushing is the most time-consuming step when programming the
+ * MTRRs. On many Intel CPUs without known erratas, it can be skipped
+ * if the CPU declares cache self-snooping support.
+ */
+static void maybe_flush_caches(void)
+{
+ if (!static_cpu_has(X86_FEATURE_SELFSNOOP))
+ wbinvd();
+}
+
+void cache_disable(void) __acquires(cache_disable_lock)
+{
+ unsigned long cr0;
+
+ /*
+ * This is not ideal since the cache is only flushed/disabled
+ * for this CPU while the MTRRs are changed, but changing this
+ * requires more invasive changes to the way the kernel boots.
+ */
+ raw_spin_lock(&cache_disable_lock);
+
+ /* Enter the no-fill (CD=1, NW=0) cache mode and flush caches. */
+ cr0 = read_cr0() | X86_CR0_CD;
+ write_cr0(cr0);
+
+ maybe_flush_caches();
+
+ /* Save value of CR4 and clear Page Global Enable (bit 7) */
+ if (cpu_feature_enabled(X86_FEATURE_PGE)) {
+ saved_cr4 = __read_cr4();
+ __write_cr4(saved_cr4 & ~X86_CR4_PGE);
+ }
+
+ /* Flush all TLBs via a mov %cr3, %reg; mov %reg, %cr3 */
+ count_vm_tlb_event(NR_TLB_LOCAL_FLUSH_ALL);
+ flush_tlb_local();
+
+ if (cpu_feature_enabled(X86_FEATURE_MTRR))
+ mtrr_disable();
+
+ maybe_flush_caches();
+}
+
+void cache_enable(void) __releases(cache_disable_lock)
+{
+ /* Flush TLBs (no need to flush caches - they are disabled) */
+ count_vm_tlb_event(NR_TLB_LOCAL_FLUSH_ALL);
+ flush_tlb_local();
+
+ if (cpu_feature_enabled(X86_FEATURE_MTRR))
+ mtrr_enable();
+
+ /* Enable caches */
+ write_cr0(read_cr0() & ~X86_CR0_CD);
+
+ /* Restore value of CR4 */
+ if (cpu_feature_enabled(X86_FEATURE_PGE))
+ __write_cr4(saved_cr4);
+
+ raw_spin_unlock(&cache_disable_lock);
+}
+
+static void cache_cpu_init(void)
+{
+ unsigned long flags;
+
+ local_irq_save(flags);
+
+ if (memory_caching_control & CACHE_MTRR) {
+ cache_disable();
+ mtrr_generic_set_state();
+ cache_enable();
+ }
+
+ if (memory_caching_control & CACHE_PAT)
+ pat_cpu_init();
+
+ local_irq_restore(flags);
+}
+
+static bool cache_aps_delayed_init = true;
+
+void set_cache_aps_delayed_init(bool val)
+{
+ cache_aps_delayed_init = val;
+}
+
+bool get_cache_aps_delayed_init(void)
+{
+ return cache_aps_delayed_init;
+}
+
+static int cache_rendezvous_handler(void *unused)
+{
+ if (get_cache_aps_delayed_init() || !cpu_online(smp_processor_id()))
+ cache_cpu_init();
+
+ return 0;
+}
+
+void __init cache_bp_init(void)
+{
+ mtrr_bp_init();
+ pat_bp_init();
+
+ if (memory_caching_control)
+ cache_cpu_init();
+}
+
+void cache_bp_restore(void)
+{
+ if (memory_caching_control)
+ cache_cpu_init();
+}
+
+static int cache_ap_online(unsigned int cpu)
+{
+ cpumask_set_cpu(cpu, cpu_cacheinfo_mask);
+
+ if (!memory_caching_control || get_cache_aps_delayed_init())
+ return 0;
+
+ /*
+ * Ideally we should hold mtrr_mutex here to avoid MTRR entries
+ * changed, but this routine will be called in CPU boot time,
+ * holding the lock breaks it.
+ *
+ * This routine is called in two cases:
+ *
+ * 1. very early time of software resume, when there absolutely
+ * isn't MTRR entry changes;
+ *
+ * 2. CPU hotadd time. We let mtrr_add/del_page hold cpuhotplug
+ * lock to prevent MTRR entry changes
+ */
+ stop_machine_from_inactive_cpu(cache_rendezvous_handler, NULL,
+ cpu_cacheinfo_mask);
+
+ return 0;
+}
+
+static int cache_ap_offline(unsigned int cpu)
+{
+ cpumask_clear_cpu(cpu, cpu_cacheinfo_mask);
+ return 0;
+}
+
+/*
+ * Delayed cache initialization for all AP's
+ */
+void cache_aps_init(void)
+{
+ if (!memory_caching_control || !get_cache_aps_delayed_init())
+ return;
+
+ stop_machine(cache_rendezvous_handler, NULL, cpu_online_mask);
+ set_cache_aps_delayed_init(false);
+}
+
+static int __init cache_ap_register(void)
+{
+ zalloc_cpumask_var(&cpu_cacheinfo_mask, GFP_KERNEL);
+ cpumask_set_cpu(smp_processor_id(), cpu_cacheinfo_mask);
+ cpuhp_setup_state_nocalls(CPUHP_AP_CACHECTRL_STARTING,
+ "x86/cachectrl:starting",
+ cache_ap_online, cache_ap_offline);
return 0;
}
+early_initcall(cache_ap_register);
diff --git a/arch/x86/kernel/cpu/centaur.c b/arch/x86/kernel/cpu/centaur.c
index 345f7d905db6..a3b55db35c96 100644
--- a/arch/x86/kernel/cpu/centaur.c
+++ b/arch/x86/kernel/cpu/centaur.c
@@ -128,10 +128,6 @@ static void init_centaur(struct cpuinfo_x86 *c)
#endif
early_init_centaur(c);
init_intel_cacheinfo(c);
- detect_num_cpu_cores(c);
-#ifdef CONFIG_X86_32
- detect_ht(c);
-#endif
if (c->cpuid_level > 9) {
unsigned int eax = cpuid_eax(10);
diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c
index 736262a76a12..8feb8fd2957a 100644
--- a/arch/x86/kernel/cpu/common.c
+++ b/arch/x86/kernel/cpu/common.c
@@ -18,13 +18,18 @@
#include <linux/init.h>
#include <linux/kprobes.h>
#include <linux/kgdb.h>
+#include <linux/mem_encrypt.h>
#include <linux/smp.h>
+#include <linux/cpu.h>
#include <linux/io.h>
#include <linux/syscore_ops.h>
#include <linux/pgtable.h>
+#include <linux/stackprotector.h>
+#include <linux/utsname.h>
+#include <asm/alternative.h>
#include <asm/cmdline.h>
-#include <asm/stackprotector.h>
+#include <asm/cpuid/api.h>
#include <asm/perf_event.h>
#include <asm/mmu_context.h>
#include <asm/doublefault.h>
@@ -52,43 +57,43 @@
#include <asm/cpu.h>
#include <asm/mce.h>
#include <asm/msr.h>
+#include <asm/cacheinfo.h>
#include <asm/memtype.h>
#include <asm/microcode.h>
-#include <asm/microcode_intel.h>
#include <asm/intel-family.h>
#include <asm/cpu_device_id.h>
+#include <asm/fred.h>
#include <asm/uv/uv.h>
-#include <asm/sigframe.h>
+#include <asm/ia32.h>
+#include <asm/set_memory.h>
#include <asm/traps.h>
#include <asm/sev.h>
+#include <asm/tdx.h>
+#include <asm/posted_intr.h>
+#include <asm/runtime-const.h>
#include "cpu.h"
-u32 elf_hwcap2 __read_mostly;
-
-/* all of these masks are initialized in setup_cpu_local_masks() */
-cpumask_var_t cpu_initialized_mask;
-cpumask_var_t cpu_callout_mask;
-cpumask_var_t cpu_callin_mask;
+DEFINE_PER_CPU_READ_MOSTLY(struct cpuinfo_x86, cpu_info);
+EXPORT_PER_CPU_SYMBOL(cpu_info);
-/* representing cpus for which sibling maps can be computed */
-cpumask_var_t cpu_sibling_setup_mask;
+u32 elf_hwcap2 __read_mostly;
/* Number of siblings per CPU package */
-int smp_num_siblings = 1;
-EXPORT_SYMBOL(smp_num_siblings);
+unsigned int __max_threads_per_core __ro_after_init = 1;
+EXPORT_SYMBOL(__max_threads_per_core);
-/* Last level cache ID of each logical CPU */
-DEFINE_PER_CPU_READ_MOSTLY(u16, cpu_llc_id) = BAD_APICID;
+unsigned int __max_dies_per_package __ro_after_init = 1;
+EXPORT_SYMBOL(__max_dies_per_package);
-u16 get_llc_id(unsigned int cpu)
-{
- return per_cpu(cpu_llc_id, cpu);
-}
-EXPORT_SYMBOL_GPL(get_llc_id);
+unsigned int __max_logical_packages __ro_after_init = 1;
+EXPORT_SYMBOL(__max_logical_packages);
+
+unsigned int __num_cores_per_package __ro_after_init = 1;
+EXPORT_SYMBOL(__num_cores_per_package);
-/* L2 cache ID of each logical CPU */
-DEFINE_PER_CPU_READ_MOSTLY(u16, cpu_l2c_id) = BAD_APICID;
+unsigned int __num_threads_per_package __ro_after_init = 1;
+EXPORT_SYMBOL(__num_threads_per_package);
static struct ppin_info {
int feature;
@@ -112,16 +117,17 @@ static const struct x86_cpu_id ppin_cpuids[] = {
X86_MATCH_FEATURE(X86_FEATURE_INTEL_PPIN, &ppin_info[X86_VENDOR_INTEL]),
/* Legacy models without CPUID enumeration */
- X86_MATCH_INTEL_FAM6_MODEL(IVYBRIDGE_X, &ppin_info[X86_VENDOR_INTEL]),
- X86_MATCH_INTEL_FAM6_MODEL(HASWELL_X, &ppin_info[X86_VENDOR_INTEL]),
- X86_MATCH_INTEL_FAM6_MODEL(BROADWELL_D, &ppin_info[X86_VENDOR_INTEL]),
- X86_MATCH_INTEL_FAM6_MODEL(BROADWELL_X, &ppin_info[X86_VENDOR_INTEL]),
- X86_MATCH_INTEL_FAM6_MODEL(SKYLAKE_X, &ppin_info[X86_VENDOR_INTEL]),
- X86_MATCH_INTEL_FAM6_MODEL(ICELAKE_X, &ppin_info[X86_VENDOR_INTEL]),
- X86_MATCH_INTEL_FAM6_MODEL(ICELAKE_D, &ppin_info[X86_VENDOR_INTEL]),
- X86_MATCH_INTEL_FAM6_MODEL(SAPPHIRERAPIDS_X, &ppin_info[X86_VENDOR_INTEL]),
- X86_MATCH_INTEL_FAM6_MODEL(XEON_PHI_KNL, &ppin_info[X86_VENDOR_INTEL]),
- X86_MATCH_INTEL_FAM6_MODEL(XEON_PHI_KNM, &ppin_info[X86_VENDOR_INTEL]),
+ X86_MATCH_VFM(INTEL_IVYBRIDGE_X, &ppin_info[X86_VENDOR_INTEL]),
+ X86_MATCH_VFM(INTEL_HASWELL_X, &ppin_info[X86_VENDOR_INTEL]),
+ X86_MATCH_VFM(INTEL_BROADWELL_D, &ppin_info[X86_VENDOR_INTEL]),
+ X86_MATCH_VFM(INTEL_BROADWELL_X, &ppin_info[X86_VENDOR_INTEL]),
+ X86_MATCH_VFM(INTEL_SKYLAKE_X, &ppin_info[X86_VENDOR_INTEL]),
+ X86_MATCH_VFM(INTEL_ICELAKE_X, &ppin_info[X86_VENDOR_INTEL]),
+ X86_MATCH_VFM(INTEL_ICELAKE_D, &ppin_info[X86_VENDOR_INTEL]),
+ X86_MATCH_VFM(INTEL_SAPPHIRERAPIDS_X, &ppin_info[X86_VENDOR_INTEL]),
+ X86_MATCH_VFM(INTEL_EMERALDRAPIDS_X, &ppin_info[X86_VENDOR_INTEL]),
+ X86_MATCH_VFM(INTEL_XEON_PHI_KNL, &ppin_info[X86_VENDOR_INTEL]),
+ X86_MATCH_VFM(INTEL_XEON_PHI_KNM, &ppin_info[X86_VENDOR_INTEL]),
{}
};
@@ -142,7 +148,7 @@ static void ppin_init(struct cpuinfo_x86 *c)
*/
info = (struct ppin_info *)id->driver_data;
- if (rdmsrl_safe(info->msr_ppin_ctl, &val))
+ if (rdmsrq_safe(info->msr_ppin_ctl, &val))
goto clear_ppin;
if ((val & 3UL) == 1UL) {
@@ -152,28 +158,19 @@ static void ppin_init(struct cpuinfo_x86 *c)
/* If PPIN is disabled, try to enable */
if (!(val & 2UL)) {
- wrmsrl_safe(info->msr_ppin_ctl, val | 2UL);
- rdmsrl_safe(info->msr_ppin_ctl, &val);
+ wrmsrq_safe(info->msr_ppin_ctl, val | 2UL);
+ rdmsrq_safe(info->msr_ppin_ctl, &val);
}
/* Is the enable bit set? */
if (val & 2UL) {
- c->ppin = __rdmsr(info->msr_ppin);
+ c->ppin = native_rdmsrq(info->msr_ppin);
set_cpu_cap(c, info->feature);
return;
}
clear_ppin:
- clear_cpu_cap(c, info->feature);
-}
-
-/* correctly size the local cpu masks */
-void __init setup_cpu_local_masks(void)
-{
- alloc_bootmem_cpumask_var(&cpu_initialized_mask);
- alloc_bootmem_cpumask_var(&cpu_callin_mask);
- alloc_bootmem_cpumask_var(&cpu_callout_mask);
- alloc_bootmem_cpumask_var(&cpu_sibling_setup_mask);
+ setup_clear_cpu_cap(info->feature);
}
static void default_init(struct cpuinfo_x86 *c)
@@ -211,48 +208,41 @@ DEFINE_PER_CPU_PAGE_ALIGNED(struct gdt_page, gdt_page) = { .gdt = {
* TLS descriptors are currently at a different place compared to i386.
* Hopefully nobody expects them at a fixed place (Wine?)
*/
- [GDT_ENTRY_KERNEL32_CS] = GDT_ENTRY_INIT(0xc09b, 0, 0xfffff),
- [GDT_ENTRY_KERNEL_CS] = GDT_ENTRY_INIT(0xa09b, 0, 0xfffff),
- [GDT_ENTRY_KERNEL_DS] = GDT_ENTRY_INIT(0xc093, 0, 0xfffff),
- [GDT_ENTRY_DEFAULT_USER32_CS] = GDT_ENTRY_INIT(0xc0fb, 0, 0xfffff),
- [GDT_ENTRY_DEFAULT_USER_DS] = GDT_ENTRY_INIT(0xc0f3, 0, 0xfffff),
- [GDT_ENTRY_DEFAULT_USER_CS] = GDT_ENTRY_INIT(0xa0fb, 0, 0xfffff),
+ [GDT_ENTRY_KERNEL32_CS] = GDT_ENTRY_INIT(DESC_CODE32, 0, 0xfffff),
+ [GDT_ENTRY_KERNEL_CS] = GDT_ENTRY_INIT(DESC_CODE64, 0, 0xfffff),
+ [GDT_ENTRY_KERNEL_DS] = GDT_ENTRY_INIT(DESC_DATA64, 0, 0xfffff),
+ [GDT_ENTRY_DEFAULT_USER32_CS] = GDT_ENTRY_INIT(DESC_CODE32 | DESC_USER, 0, 0xfffff),
+ [GDT_ENTRY_DEFAULT_USER_DS] = GDT_ENTRY_INIT(DESC_DATA64 | DESC_USER, 0, 0xfffff),
+ [GDT_ENTRY_DEFAULT_USER_CS] = GDT_ENTRY_INIT(DESC_CODE64 | DESC_USER, 0, 0xfffff),
#else
- [GDT_ENTRY_KERNEL_CS] = GDT_ENTRY_INIT(0xc09a, 0, 0xfffff),
- [GDT_ENTRY_KERNEL_DS] = GDT_ENTRY_INIT(0xc092, 0, 0xfffff),
- [GDT_ENTRY_DEFAULT_USER_CS] = GDT_ENTRY_INIT(0xc0fa, 0, 0xfffff),
- [GDT_ENTRY_DEFAULT_USER_DS] = GDT_ENTRY_INIT(0xc0f2, 0, 0xfffff),
+ [GDT_ENTRY_KERNEL_CS] = GDT_ENTRY_INIT(DESC_CODE32, 0, 0xfffff),
+ [GDT_ENTRY_KERNEL_DS] = GDT_ENTRY_INIT(DESC_DATA32, 0, 0xfffff),
+ [GDT_ENTRY_DEFAULT_USER_CS] = GDT_ENTRY_INIT(DESC_CODE32 | DESC_USER, 0, 0xfffff),
+ [GDT_ENTRY_DEFAULT_USER_DS] = GDT_ENTRY_INIT(DESC_DATA32 | DESC_USER, 0, 0xfffff),
/*
* Segments used for calling PnP BIOS have byte granularity.
* They code segments and data segments have fixed 64k limits,
* the transfer segment sizes are set at run time.
*/
- /* 32-bit code */
- [GDT_ENTRY_PNPBIOS_CS32] = GDT_ENTRY_INIT(0x409a, 0, 0xffff),
- /* 16-bit code */
- [GDT_ENTRY_PNPBIOS_CS16] = GDT_ENTRY_INIT(0x009a, 0, 0xffff),
- /* 16-bit data */
- [GDT_ENTRY_PNPBIOS_DS] = GDT_ENTRY_INIT(0x0092, 0, 0xffff),
- /* 16-bit data */
- [GDT_ENTRY_PNPBIOS_TS1] = GDT_ENTRY_INIT(0x0092, 0, 0),
- /* 16-bit data */
- [GDT_ENTRY_PNPBIOS_TS2] = GDT_ENTRY_INIT(0x0092, 0, 0),
+ [GDT_ENTRY_PNPBIOS_CS32] = GDT_ENTRY_INIT(DESC_CODE32_BIOS, 0, 0xffff),
+ [GDT_ENTRY_PNPBIOS_CS16] = GDT_ENTRY_INIT(DESC_CODE16, 0, 0xffff),
+ [GDT_ENTRY_PNPBIOS_DS] = GDT_ENTRY_INIT(DESC_DATA16, 0, 0xffff),
+ [GDT_ENTRY_PNPBIOS_TS1] = GDT_ENTRY_INIT(DESC_DATA16, 0, 0),
+ [GDT_ENTRY_PNPBIOS_TS2] = GDT_ENTRY_INIT(DESC_DATA16, 0, 0),
/*
* The APM segments have byte granularity and their bases
* are set at run time. All have 64k limits.
*/
- /* 32-bit code */
- [GDT_ENTRY_APMBIOS_BASE] = GDT_ENTRY_INIT(0x409a, 0, 0xffff),
- /* 16-bit code */
- [GDT_ENTRY_APMBIOS_BASE+1] = GDT_ENTRY_INIT(0x009a, 0, 0xffff),
- /* data */
- [GDT_ENTRY_APMBIOS_BASE+2] = GDT_ENTRY_INIT(0x4092, 0, 0xffff),
-
- [GDT_ENTRY_ESPFIX_SS] = GDT_ENTRY_INIT(0xc092, 0, 0xfffff),
- [GDT_ENTRY_PERCPU] = GDT_ENTRY_INIT(0xc092, 0, 0xfffff),
+ [GDT_ENTRY_APMBIOS_BASE] = GDT_ENTRY_INIT(DESC_CODE32_BIOS, 0, 0xffff),
+ [GDT_ENTRY_APMBIOS_BASE+1] = GDT_ENTRY_INIT(DESC_CODE16, 0, 0xffff),
+ [GDT_ENTRY_APMBIOS_BASE+2] = GDT_ENTRY_INIT(DESC_DATA32_BIOS, 0, 0xffff),
+
+ [GDT_ENTRY_ESPFIX_SS] = GDT_ENTRY_INIT(DESC_DATA32, 0, 0xfffff),
+ [GDT_ENTRY_PERCPU] = GDT_ENTRY_INIT(DESC_DATA32, 0, 0xfffff),
#endif
} };
EXPORT_PER_CPU_SYMBOL_GPL(gdt_page);
+SYM_PIC_ALIAS(gdt_page);
#ifdef CONFIG_X86_64
static int __init x86_nopcid_setup(char *s)
@@ -288,21 +278,13 @@ static int __init x86_noinvpcid_setup(char *s)
}
early_param("noinvpcid", x86_noinvpcid_setup);
-#ifdef CONFIG_X86_32
-static int cachesize_override = -1;
-static int disable_x86_serial_nr = 1;
-
-static int __init cachesize_setup(char *str)
-{
- get_option(&str, &cachesize_override);
- return 1;
-}
-__setup("cachesize=", cachesize_setup);
-
/* Standard macro to see if a specific flag is changeable */
-static inline int flag_is_changeable_p(u32 flag)
+static inline bool flag_is_changeable_p(unsigned long flag)
{
- u32 f1, f2;
+ unsigned long f1, f2;
+
+ if (!IS_ENABLED(CONFIG_X86_32))
+ return true;
/*
* Cyrix and IDT cpus allow disabling of CPUID
@@ -325,11 +307,22 @@ static inline int flag_is_changeable_p(u32 flag)
: "=&r" (f1), "=&r" (f2)
: "ir" (flag));
- return ((f1^f2) & flag) != 0;
+ return (f1 ^ f2) & flag;
}
+#ifdef CONFIG_X86_32
+static int cachesize_override = -1;
+static int disable_x86_serial_nr = 1;
+
+static int __init cachesize_setup(char *str)
+{
+ get_option(&str, &cachesize_override);
+ return 1;
+}
+__setup("cachesize=", cachesize_setup);
+
/* Probe for the CPUID instruction */
-int have_cpuid_p(void)
+bool cpuid_feature(void)
{
return flag_is_changeable_p(X86_EFLAGS_ID);
}
@@ -361,10 +354,6 @@ static int __init x86_serial_nr_setup(char *s)
}
__setup("serialnumber", x86_serial_nr_setup);
#else
-static inline int flag_is_changeable_p(u32 flag)
-{
- return 1;
-}
static inline void squash_the_stupid_serial_number(struct cpuinfo_x86 *c)
{
}
@@ -412,9 +401,8 @@ out:
}
/* These bits should not change their value after CPU init is finished. */
-static const unsigned long cr4_pinned_mask =
- X86_CR4_SMEP | X86_CR4_SMAP | X86_CR4_UMIP |
- X86_CR4_FSGSBASE | X86_CR4_CET;
+static const unsigned long cr4_pinned_mask = X86_CR4_SMEP | X86_CR4_SMAP | X86_CR4_UMIP |
+ X86_CR4_FSGSBASE | X86_CR4_CET | X86_CR4_FRED;
static DEFINE_STATIC_KEY_FALSE_RO(cr_pinning);
static unsigned long cr4_pinned_bits __ro_after_init;
@@ -566,17 +554,18 @@ static __init int setup_disable_pku(char *arg)
return 1;
}
__setup("nopku", setup_disable_pku);
-#endif /* CONFIG_X86_64 */
+#endif
#ifdef CONFIG_X86_KERNEL_IBT
-__noendbr u64 ibt_save(void)
+__noendbr u64 ibt_save(bool disable)
{
u64 msr = 0;
if (cpu_feature_enabled(X86_FEATURE_IBT)) {
- rdmsrl(MSR_IA32_S_CET, msr);
- wrmsrl(MSR_IA32_S_CET, msr & ~CET_ENDBR_EN);
+ rdmsrq(MSR_IA32_S_CET, msr);
+ if (disable)
+ wrmsrq(MSR_IA32_S_CET, msr & ~CET_ENDBR_EN);
}
return msr;
@@ -587,10 +576,10 @@ __noendbr void ibt_restore(u64 save)
u64 msr;
if (cpu_feature_enabled(X86_FEATURE_IBT)) {
- rdmsrl(MSR_IA32_S_CET, msr);
+ rdmsrq(MSR_IA32_S_CET, msr);
msr &= ~CET_ENDBR_EN;
msr |= (save & CET_ENDBR_EN);
- wrmsrl(MSR_IA32_S_CET, msr);
+ wrmsrq(MSR_IA32_S_CET, msr);
}
}
@@ -598,26 +587,43 @@ __noendbr void ibt_restore(u64 save)
static __always_inline void setup_cet(struct cpuinfo_x86 *c)
{
- u64 msr = CET_ENDBR_EN;
+ bool user_shstk, kernel_ibt;
- if (!HAS_KERNEL_IBT ||
- !cpu_feature_enabled(X86_FEATURE_IBT))
+ if (!IS_ENABLED(CONFIG_X86_CET))
return;
- wrmsrl(MSR_IA32_S_CET, msr);
+ kernel_ibt = HAS_KERNEL_IBT && cpu_feature_enabled(X86_FEATURE_IBT);
+ user_shstk = cpu_feature_enabled(X86_FEATURE_SHSTK) &&
+ IS_ENABLED(CONFIG_X86_USER_SHADOW_STACK);
+
+ if (!kernel_ibt && !user_shstk)
+ return;
+
+ if (user_shstk)
+ set_cpu_cap(c, X86_FEATURE_USER_SHSTK);
+
+ if (kernel_ibt)
+ wrmsrq(MSR_IA32_S_CET, CET_ENDBR_EN);
+ else
+ wrmsrq(MSR_IA32_S_CET, 0);
+
cr4_set_bits(X86_CR4_CET);
- if (!ibt_selftest()) {
+ if (kernel_ibt && ibt_selftest()) {
pr_err("IBT selftest: Failed!\n");
+ wrmsrq(MSR_IA32_S_CET, 0);
setup_clear_cpu_cap(X86_FEATURE_IBT);
- return;
}
}
__noendbr void cet_disable(void)
{
- if (cpu_feature_enabled(X86_FEATURE_IBT))
- wrmsrl(MSR_IA32_S_CET, 0);
+ if (!(cpu_feature_enabled(X86_FEATURE_IBT) ||
+ cpu_feature_enabled(X86_FEATURE_SHSTK)))
+ return;
+
+ wrmsrq(MSR_IA32_S_CET, 0);
+ wrmsrq(MSR_IA32_U_CET, 0);
}
/*
@@ -632,9 +638,9 @@ struct cpuid_dependent_feature {
static const struct cpuid_dependent_feature
cpuid_dependent_features[] = {
- { X86_FEATURE_MWAIT, 0x00000005 },
- { X86_FEATURE_DCA, 0x00000009 },
- { X86_FEATURE_XSAVE, 0x0000000d },
+ { X86_FEATURE_MWAIT, CPUID_LEAF_MWAIT },
+ { X86_FEATURE_DCA, CPUID_LEAF_DCA },
+ { X86_FEATURE_XSAVE, CPUID_LEAF_XSTATE },
{ 0, 0 }
};
@@ -662,8 +668,8 @@ static void filter_cpuid_features(struct cpuinfo_x86 *c, bool warn)
if (!warn)
continue;
- pr_warn("CPU: CPU feature " X86_CAP_FMT " disabled, no CPUID level 0x%x\n",
- x86_cap_flag(df->feature), df->level);
+ pr_warn("CPU: CPU feature %s disabled, no CPUID level 0x%x\n",
+ x86_cap_flags[df->feature], df->level);
}
}
@@ -701,16 +707,6 @@ static const char *table_lookup_model(struct cpuinfo_x86 *c)
__u32 cpu_caps_cleared[NCAPINTS + NBUGINTS] __aligned(sizeof(unsigned long));
__u32 cpu_caps_set[NCAPINTS + NBUGINTS] __aligned(sizeof(unsigned long));
-void load_percpu_segment(int cpu)
-{
-#ifdef CONFIG_X86_32
- loadsegment(fs, __KERNEL_PERCPU);
-#else
- __loadsegment_simple(gs, 0);
- wrmsrl(MSR_GS_BASE, cpu_kernelmode_gs_base(cpu));
-#endif
-}
-
#ifdef CONFIG_X86_32
/* The 32-bit entry code needs to find cpu_entry_area. */
DEFINE_PER_CPU(struct cpu_entry_area *, cpu_entry_area);
@@ -738,16 +734,45 @@ void load_fixmap_gdt(int cpu)
}
EXPORT_SYMBOL_GPL(load_fixmap_gdt);
-/*
- * Current gdt points %fs at the "master" per-cpu area: after this,
- * it's on the real one.
+/**
+ * switch_gdt_and_percpu_base - Switch to direct GDT and runtime per CPU base
+ * @cpu: The CPU number for which this is invoked
+ *
+ * Invoked during early boot to switch from early GDT and early per CPU to
+ * the direct GDT and the runtime per CPU area. On 32-bit the percpu base
+ * switch is implicit by loading the direct GDT. On 64bit this requires
+ * to update GSBASE.
*/
-void switch_to_new_gdt(int cpu)
+void __init switch_gdt_and_percpu_base(int cpu)
{
- /* Load the original GDT */
load_direct_gdt(cpu);
- /* Reload the per-cpu base */
- load_percpu_segment(cpu);
+
+#ifdef CONFIG_X86_64
+ /*
+ * No need to load %gs. It is already correct.
+ *
+ * Writing %gs on 64bit would zero GSBASE which would make any per
+ * CPU operation up to the point of the wrmsrq() fault.
+ *
+ * Set GSBASE to the new offset. Until the wrmsrq() happens the
+ * early mapping is still valid. That means the GSBASE update will
+ * lose any prior per CPU data which was not copied over in
+ * setup_per_cpu_areas().
+ *
+ * This works even with stackprotector enabled because the
+ * per CPU stack canary is 0 in both per CPU areas.
+ */
+ wrmsrq(MSR_GS_BASE, cpu_kernelmode_gs_base(cpu));
+#else
+ /*
+ * %fs is already set to __KERNEL_PERCPU, but after switching GDT
+ * it is required to load FS again so that the 'hidden' part is
+ * updated from the new GDT. Up to this point the early per CPU
+ * translation is active. Any content of the early per CPU data
+ * which was not copied over in setup_per_cpu_areas() is lost.
+ */
+ loadsegment(fs, __KERNEL_PERCPU);
+#endif
}
static const struct cpu_dev *cpu_devs[X86_VENDOR_NUM] = {};
@@ -783,19 +808,6 @@ static void get_model_name(struct cpuinfo_x86 *c)
*(s + 1) = '\0';
}
-void detect_num_cpu_cores(struct cpuinfo_x86 *c)
-{
- unsigned int eax, ebx, ecx, edx;
-
- c->x86_max_cores = 1;
- if (!IS_ENABLED(CONFIG_SMP) || c->cpuid_level < 4)
- return;
-
- cpuid_count(4, 0, &eax, &ebx, &ecx, &edx);
- if (eax & 0x1f)
- c->x86_max_cores = (eax >> 26) + 1;
-}
-
void cpu_detect_cache_sizes(struct cpuinfo_x86 *c)
{
unsigned int n, dummy, ebx, ecx, edx, l2size;
@@ -835,13 +847,13 @@ void cpu_detect_cache_sizes(struct cpuinfo_x86 *c)
c->x86_cache_size = l2size;
}
-u16 __read_mostly tlb_lli_4k[NR_INFO];
-u16 __read_mostly tlb_lli_2m[NR_INFO];
-u16 __read_mostly tlb_lli_4m[NR_INFO];
-u16 __read_mostly tlb_lld_4k[NR_INFO];
-u16 __read_mostly tlb_lld_2m[NR_INFO];
-u16 __read_mostly tlb_lld_4m[NR_INFO];
-u16 __read_mostly tlb_lld_1g[NR_INFO];
+u16 __read_mostly tlb_lli_4k;
+u16 __read_mostly tlb_lli_2m;
+u16 __read_mostly tlb_lli_4m;
+u16 __read_mostly tlb_lld_4k;
+u16 __read_mostly tlb_lld_2m;
+u16 __read_mostly tlb_lld_4m;
+u16 __read_mostly tlb_lld_1g;
static void cpu_detect_tlb(struct cpuinfo_x86 *c)
{
@@ -849,60 +861,13 @@ static void cpu_detect_tlb(struct cpuinfo_x86 *c)
this_cpu->c_detect_tlb(c);
pr_info("Last level iTLB entries: 4KB %d, 2MB %d, 4MB %d\n",
- tlb_lli_4k[ENTRIES], tlb_lli_2m[ENTRIES],
- tlb_lli_4m[ENTRIES]);
+ tlb_lli_4k, tlb_lli_2m, tlb_lli_4m);
pr_info("Last level dTLB entries: 4KB %d, 2MB %d, 4MB %d, 1GB %d\n",
- tlb_lld_4k[ENTRIES], tlb_lld_2m[ENTRIES],
- tlb_lld_4m[ENTRIES], tlb_lld_1g[ENTRIES]);
-}
-
-int detect_ht_early(struct cpuinfo_x86 *c)
-{
-#ifdef CONFIG_SMP
- u32 eax, ebx, ecx, edx;
-
- if (!cpu_has(c, X86_FEATURE_HT))
- return -1;
-
- if (cpu_has(c, X86_FEATURE_CMP_LEGACY))
- return -1;
-
- if (cpu_has(c, X86_FEATURE_XTOPOLOGY))
- return -1;
-
- cpuid(1, &eax, &ebx, &ecx, &edx);
-
- smp_num_siblings = (ebx & 0xff0000) >> 16;
- if (smp_num_siblings == 1)
- pr_info_once("CPU0: Hyper-Threading is disabled\n");
-#endif
- return 0;
-}
-
-void detect_ht(struct cpuinfo_x86 *c)
-{
-#ifdef CONFIG_SMP
- int index_msb, core_bits;
-
- if (detect_ht_early(c) < 0)
- return;
-
- index_msb = get_count_order(smp_num_siblings);
- c->phys_proc_id = apic->phys_pkg_id(c->initial_apicid, index_msb);
-
- smp_num_siblings = smp_num_siblings / c->x86_max_cores;
-
- index_msb = get_count_order(smp_num_siblings);
-
- core_bits = get_count_order(c->x86_max_cores);
-
- c->cpu_core_id = apic->phys_pkg_id(c->initial_apicid, index_msb) &
- ((1 << core_bits) - 1);
-#endif
+ tlb_lld_4k, tlb_lld_2m, tlb_lld_4m, tlb_lld_1g);
}
-static void get_cpu_vendor(struct cpuinfo_x86 *c)
+void get_cpu_vendor(struct cpuinfo_x86 *c)
{
char *v = c->x86_vendor_id;
int i;
@@ -1041,17 +1006,18 @@ void get_cpu_cap(struct cpuinfo_x86 *c)
c->x86_capability[CPUID_D_1_EAX] = eax;
}
- /* AMD-defined flags: level 0x80000001 */
+ /*
+ * Check if extended CPUID leaves are implemented: Max extended
+ * CPUID leaf must be in the 0x80000001-0x8000ffff range.
+ */
eax = cpuid_eax(0x80000000);
- c->extended_cpuid_level = eax;
+ c->extended_cpuid_level = ((eax & 0xffff0000) == 0x80000000) ? eax : 0;
- if ((eax & 0xffff0000) == 0x80000000) {
- if (eax >= 0x80000001) {
- cpuid(0x80000001, &eax, &ebx, &ecx, &edx);
+ if (c->extended_cpuid_level >= 0x80000001) {
+ cpuid(0x80000001, &eax, &ebx, &ecx, &edx);
- c->x86_capability[CPUID_8000_0001_ECX] = ecx;
- c->x86_capability[CPUID_8000_0001_EDX] = edx;
- }
+ c->x86_capability[CPUID_8000_0001_ECX] = ecx;
+ c->x86_capability[CPUID_8000_0001_EDX] = edx;
}
if (c->extended_cpuid_level >= 0x80000007) {
@@ -1072,6 +1038,9 @@ void get_cpu_cap(struct cpuinfo_x86 *c)
if (c->extended_cpuid_level >= 0x8000001f)
c->x86_capability[CPUID_8000_001F_EAX] = cpuid_eax(0x8000001f);
+ if (c->extended_cpuid_level >= 0x80000021)
+ c->x86_capability[CPUID_8000_0021_EAX] = cpuid_eax(0x80000021);
+
init_scattered_cpuid_features(c);
init_speculation_control(c);
@@ -1087,22 +1056,38 @@ void get_cpu_address_sizes(struct cpuinfo_x86 *c)
{
u32 eax, ebx, ecx, edx;
- if (c->extended_cpuid_level >= 0x80000008) {
+ if (!cpu_has(c, X86_FEATURE_CPUID) ||
+ (c->extended_cpuid_level < 0x80000008)) {
+ if (IS_ENABLED(CONFIG_X86_64)) {
+ c->x86_clflush_size = 64;
+ c->x86_phys_bits = 36;
+ c->x86_virt_bits = 48;
+ } else {
+ c->x86_clflush_size = 32;
+ c->x86_virt_bits = 32;
+ c->x86_phys_bits = 32;
+
+ if (cpu_has(c, X86_FEATURE_PAE) ||
+ cpu_has(c, X86_FEATURE_PSE36))
+ c->x86_phys_bits = 36;
+ }
+ } else {
cpuid(0x80000008, &eax, &ebx, &ecx, &edx);
c->x86_virt_bits = (eax >> 8) & 0xff;
c->x86_phys_bits = eax & 0xff;
+
+ /* Provide a sane default if not enumerated: */
+ if (!c->x86_clflush_size)
+ c->x86_clflush_size = 32;
}
-#ifdef CONFIG_X86_32
- else if (cpu_has(c, X86_FEATURE_PAE) || cpu_has(c, X86_FEATURE_PSE36))
- c->x86_phys_bits = 36;
-#endif
+
c->x86_cache_bits = c->x86_phys_bits;
+ c->x86_cache_alignment = c->x86_clflush_size;
}
static void identify_cpu_without_cpuid(struct cpuinfo_x86 *c)
{
-#ifdef CONFIG_X86_32
int i;
/*
@@ -1123,7 +1108,6 @@ static void identify_cpu_without_cpuid(struct cpuinfo_x86 *c)
break;
}
}
-#endif
}
#define NO_SPECULATION BIT(0)
@@ -1135,12 +1119,15 @@ static void identify_cpu_without_cpuid(struct cpuinfo_x86 *c)
#define NO_SWAPGS BIT(6)
#define NO_ITLB_MULTIHIT BIT(7)
#define NO_SPECTRE_V2 BIT(8)
+#define NO_MMIO BIT(9)
+#define NO_EIBRS_PBRSB BIT(10)
+#define NO_BHI BIT(11)
#define VULNWL(vendor, family, model, whitelist) \
X86_MATCH_VENDOR_FAM_MODEL(vendor, family, model, whitelist)
-#define VULNWL_INTEL(model, whitelist) \
- VULNWL(INTEL, 6, INTEL_FAM6_##model, whitelist)
+#define VULNWL_INTEL(vfm, whitelist) \
+ X86_MATCH_VFM(vfm, whitelist)
#define VULNWL_AMD(family, whitelist) \
VULNWL(AMD, family, X86_MODEL_ANY, whitelist)
@@ -1157,27 +1144,32 @@ static const __initconst struct x86_cpu_id cpu_vuln_whitelist[] = {
VULNWL(VORTEX, 6, X86_MODEL_ANY, NO_SPECULATION),
/* Intel Family 6 */
- VULNWL_INTEL(ATOM_SALTWELL, NO_SPECULATION | NO_ITLB_MULTIHIT),
- VULNWL_INTEL(ATOM_SALTWELL_TABLET, NO_SPECULATION | NO_ITLB_MULTIHIT),
- VULNWL_INTEL(ATOM_SALTWELL_MID, NO_SPECULATION | NO_ITLB_MULTIHIT),
- VULNWL_INTEL(ATOM_BONNELL, NO_SPECULATION | NO_ITLB_MULTIHIT),
- VULNWL_INTEL(ATOM_BONNELL_MID, NO_SPECULATION | NO_ITLB_MULTIHIT),
+ VULNWL_INTEL(INTEL_TIGERLAKE, NO_MMIO),
+ VULNWL_INTEL(INTEL_TIGERLAKE_L, NO_MMIO),
+ VULNWL_INTEL(INTEL_ALDERLAKE, NO_MMIO),
+ VULNWL_INTEL(INTEL_ALDERLAKE_L, NO_MMIO),
+
+ VULNWL_INTEL(INTEL_ATOM_SALTWELL, NO_SPECULATION | NO_ITLB_MULTIHIT),
+ VULNWL_INTEL(INTEL_ATOM_SALTWELL_TABLET, NO_SPECULATION | NO_ITLB_MULTIHIT),
+ VULNWL_INTEL(INTEL_ATOM_SALTWELL_MID, NO_SPECULATION | NO_ITLB_MULTIHIT),
+ VULNWL_INTEL(INTEL_ATOM_BONNELL, NO_SPECULATION | NO_ITLB_MULTIHIT),
+ VULNWL_INTEL(INTEL_ATOM_BONNELL_MID, NO_SPECULATION | NO_ITLB_MULTIHIT),
- VULNWL_INTEL(ATOM_SILVERMONT, NO_SSB | NO_L1TF | MSBDS_ONLY | NO_SWAPGS | NO_ITLB_MULTIHIT),
- VULNWL_INTEL(ATOM_SILVERMONT_D, NO_SSB | NO_L1TF | MSBDS_ONLY | NO_SWAPGS | NO_ITLB_MULTIHIT),
- VULNWL_INTEL(ATOM_SILVERMONT_MID, NO_SSB | NO_L1TF | MSBDS_ONLY | NO_SWAPGS | NO_ITLB_MULTIHIT),
- VULNWL_INTEL(ATOM_AIRMONT, NO_SSB | NO_L1TF | MSBDS_ONLY | NO_SWAPGS | NO_ITLB_MULTIHIT),
- VULNWL_INTEL(XEON_PHI_KNL, NO_SSB | NO_L1TF | MSBDS_ONLY | NO_SWAPGS | NO_ITLB_MULTIHIT),
- VULNWL_INTEL(XEON_PHI_KNM, NO_SSB | NO_L1TF | MSBDS_ONLY | NO_SWAPGS | NO_ITLB_MULTIHIT),
+ VULNWL_INTEL(INTEL_ATOM_SILVERMONT, NO_SSB | NO_L1TF | MSBDS_ONLY | NO_SWAPGS | NO_ITLB_MULTIHIT),
+ VULNWL_INTEL(INTEL_ATOM_SILVERMONT_D, NO_SSB | NO_L1TF | MSBDS_ONLY | NO_SWAPGS | NO_ITLB_MULTIHIT),
+ VULNWL_INTEL(INTEL_ATOM_SILVERMONT_MID, NO_SSB | NO_L1TF | MSBDS_ONLY | NO_SWAPGS | NO_ITLB_MULTIHIT),
+ VULNWL_INTEL(INTEL_ATOM_AIRMONT, NO_SSB | NO_L1TF | MSBDS_ONLY | NO_SWAPGS | NO_ITLB_MULTIHIT),
+ VULNWL_INTEL(INTEL_XEON_PHI_KNL, NO_SSB | NO_L1TF | MSBDS_ONLY | NO_SWAPGS | NO_ITLB_MULTIHIT),
+ VULNWL_INTEL(INTEL_XEON_PHI_KNM, NO_SSB | NO_L1TF | MSBDS_ONLY | NO_SWAPGS | NO_ITLB_MULTIHIT),
- VULNWL_INTEL(CORE_YONAH, NO_SSB),
+ VULNWL_INTEL(INTEL_CORE_YONAH, NO_SSB),
- VULNWL_INTEL(ATOM_AIRMONT_MID, NO_L1TF | MSBDS_ONLY | NO_SWAPGS | NO_ITLB_MULTIHIT),
- VULNWL_INTEL(ATOM_AIRMONT_NP, NO_L1TF | NO_SWAPGS | NO_ITLB_MULTIHIT),
+ VULNWL_INTEL(INTEL_ATOM_SILVERMONT_MID2,NO_SSB | NO_L1TF | NO_SWAPGS | NO_ITLB_MULTIHIT | MSBDS_ONLY),
+ VULNWL_INTEL(INTEL_ATOM_AIRMONT_NP, NO_SSB | NO_L1TF | NO_SWAPGS | NO_ITLB_MULTIHIT),
- VULNWL_INTEL(ATOM_GOLDMONT, NO_MDS | NO_L1TF | NO_SWAPGS | NO_ITLB_MULTIHIT),
- VULNWL_INTEL(ATOM_GOLDMONT_D, NO_MDS | NO_L1TF | NO_SWAPGS | NO_ITLB_MULTIHIT),
- VULNWL_INTEL(ATOM_GOLDMONT_PLUS, NO_MDS | NO_L1TF | NO_SWAPGS | NO_ITLB_MULTIHIT),
+ VULNWL_INTEL(INTEL_ATOM_GOLDMONT, NO_MDS | NO_L1TF | NO_SWAPGS | NO_ITLB_MULTIHIT | NO_MMIO),
+ VULNWL_INTEL(INTEL_ATOM_GOLDMONT_D, NO_MDS | NO_L1TF | NO_SWAPGS | NO_ITLB_MULTIHIT | NO_MMIO),
+ VULNWL_INTEL(INTEL_ATOM_GOLDMONT_PLUS, NO_MDS | NO_L1TF | NO_SWAPGS | NO_ITLB_MULTIHIT | NO_MMIO | NO_EIBRS_PBRSB),
/*
* Technically, swapgs isn't serializing on AMD (despite it previously
@@ -1187,31 +1179,34 @@ static const __initconst struct x86_cpu_id cpu_vuln_whitelist[] = {
* good enough for our purposes.
*/
- VULNWL_INTEL(ATOM_TREMONT_D, NO_ITLB_MULTIHIT),
+ VULNWL_INTEL(INTEL_ATOM_TREMONT, NO_EIBRS_PBRSB),
+ VULNWL_INTEL(INTEL_ATOM_TREMONT_L, NO_EIBRS_PBRSB),
+ VULNWL_INTEL(INTEL_ATOM_TREMONT_D, NO_ITLB_MULTIHIT | NO_EIBRS_PBRSB),
/* AMD Family 0xf - 0x12 */
- VULNWL_AMD(0x0f, NO_MELTDOWN | NO_SSB | NO_L1TF | NO_MDS | NO_SWAPGS | NO_ITLB_MULTIHIT),
- VULNWL_AMD(0x10, NO_MELTDOWN | NO_SSB | NO_L1TF | NO_MDS | NO_SWAPGS | NO_ITLB_MULTIHIT),
- VULNWL_AMD(0x11, NO_MELTDOWN | NO_SSB | NO_L1TF | NO_MDS | NO_SWAPGS | NO_ITLB_MULTIHIT),
- VULNWL_AMD(0x12, NO_MELTDOWN | NO_SSB | NO_L1TF | NO_MDS | NO_SWAPGS | NO_ITLB_MULTIHIT),
+ VULNWL_AMD(0x0f, NO_MELTDOWN | NO_SSB | NO_L1TF | NO_MDS | NO_SWAPGS | NO_ITLB_MULTIHIT | NO_MMIO | NO_BHI),
+ VULNWL_AMD(0x10, NO_MELTDOWN | NO_SSB | NO_L1TF | NO_MDS | NO_SWAPGS | NO_ITLB_MULTIHIT | NO_MMIO | NO_BHI),
+ VULNWL_AMD(0x11, NO_MELTDOWN | NO_SSB | NO_L1TF | NO_MDS | NO_SWAPGS | NO_ITLB_MULTIHIT | NO_MMIO | NO_BHI),
+ VULNWL_AMD(0x12, NO_MELTDOWN | NO_SSB | NO_L1TF | NO_MDS | NO_SWAPGS | NO_ITLB_MULTIHIT | NO_MMIO | NO_BHI),
/* FAMILY_ANY must be last, otherwise 0x0f - 0x12 matches won't work */
- VULNWL_AMD(X86_FAMILY_ANY, NO_MELTDOWN | NO_L1TF | NO_MDS | NO_SWAPGS | NO_ITLB_MULTIHIT),
- VULNWL_HYGON(X86_FAMILY_ANY, NO_MELTDOWN | NO_L1TF | NO_MDS | NO_SWAPGS | NO_ITLB_MULTIHIT),
+ VULNWL_AMD(X86_FAMILY_ANY, NO_MELTDOWN | NO_L1TF | NO_MDS | NO_SWAPGS | NO_ITLB_MULTIHIT | NO_MMIO | NO_EIBRS_PBRSB | NO_BHI),
+ VULNWL_HYGON(X86_FAMILY_ANY, NO_MELTDOWN | NO_L1TF | NO_MDS | NO_SWAPGS | NO_ITLB_MULTIHIT | NO_MMIO | NO_EIBRS_PBRSB | NO_BHI),
/* Zhaoxin Family 7 */
- VULNWL(CENTAUR, 7, X86_MODEL_ANY, NO_SPECTRE_V2 | NO_SWAPGS),
- VULNWL(ZHAOXIN, 7, X86_MODEL_ANY, NO_SPECTRE_V2 | NO_SWAPGS),
+ VULNWL(CENTAUR, 7, X86_MODEL_ANY, NO_SPECTRE_V2 | NO_SWAPGS | NO_MMIO | NO_BHI),
+ VULNWL(ZHAOXIN, 7, X86_MODEL_ANY, NO_SPECTRE_V2 | NO_SWAPGS | NO_MMIO | NO_BHI),
{}
};
#define VULNBL(vendor, family, model, blacklist) \
X86_MATCH_VENDOR_FAM_MODEL(vendor, family, model, blacklist)
-#define VULNBL_INTEL_STEPPINGS(model, steppings, issues) \
- X86_MATCH_VENDOR_FAM_MODEL_STEPPINGS_FEATURE(INTEL, 6, \
- INTEL_FAM6_##model, steppings, \
- X86_FEATURE_ANY, issues)
+#define VULNBL_INTEL_STEPS(vfm, max_stepping, issues) \
+ X86_MATCH_VFM_STEPS(vfm, X86_STEP_MIN, max_stepping, issues)
+
+#define VULNBL_INTEL_TYPE(vfm, cpu_type, issues) \
+ X86_MATCH_VFM_CPU_TYPE(vfm, INTEL_CPU_TYPE_##cpu_type, issues)
#define VULNBL_AMD(family, blacklist) \
VULNBL(AMD, family, X86_MODEL_ANY, blacklist)
@@ -1226,39 +1221,67 @@ static const __initconst struct x86_cpu_id cpu_vuln_whitelist[] = {
#define MMIO_SBDS BIT(2)
/* CPU is affected by RETbleed, speculating where you would not expect it */
#define RETBLEED BIT(3)
+/* CPU is affected by SMT (cross-thread) return predictions */
+#define SMT_RSB BIT(4)
+/* CPU is affected by SRSO */
+#define SRSO BIT(5)
+/* CPU is affected by GDS */
+#define GDS BIT(6)
+/* CPU is affected by Register File Data Sampling */
+#define RFDS BIT(7)
+/* CPU is affected by Indirect Target Selection */
+#define ITS BIT(8)
+/* CPU is affected by Indirect Target Selection, but guest-host isolation is not affected */
+#define ITS_NATIVE_ONLY BIT(9)
static const struct x86_cpu_id cpu_vuln_blacklist[] __initconst = {
- VULNBL_INTEL_STEPPINGS(IVYBRIDGE, X86_STEPPING_ANY, SRBDS),
- VULNBL_INTEL_STEPPINGS(HASWELL, X86_STEPPING_ANY, SRBDS),
- VULNBL_INTEL_STEPPINGS(HASWELL_L, X86_STEPPING_ANY, SRBDS),
- VULNBL_INTEL_STEPPINGS(HASWELL_G, X86_STEPPING_ANY, SRBDS),
- VULNBL_INTEL_STEPPINGS(HASWELL_X, X86_STEPPING_ANY, MMIO),
- VULNBL_INTEL_STEPPINGS(BROADWELL_D, X86_STEPPING_ANY, MMIO),
- VULNBL_INTEL_STEPPINGS(BROADWELL_G, X86_STEPPING_ANY, SRBDS),
- VULNBL_INTEL_STEPPINGS(BROADWELL_X, X86_STEPPING_ANY, MMIO),
- VULNBL_INTEL_STEPPINGS(BROADWELL, X86_STEPPING_ANY, SRBDS),
- VULNBL_INTEL_STEPPINGS(SKYLAKE_L, X86_STEPPING_ANY, SRBDS | MMIO | RETBLEED),
- VULNBL_INTEL_STEPPINGS(SKYLAKE_X, X86_STEPPING_ANY, MMIO | RETBLEED),
- VULNBL_INTEL_STEPPINGS(SKYLAKE, X86_STEPPING_ANY, SRBDS | MMIO | RETBLEED),
- VULNBL_INTEL_STEPPINGS(KABYLAKE_L, X86_STEPPING_ANY, SRBDS | MMIO | RETBLEED),
- VULNBL_INTEL_STEPPINGS(KABYLAKE, X86_STEPPING_ANY, SRBDS | MMIO | RETBLEED),
- VULNBL_INTEL_STEPPINGS(CANNONLAKE_L, X86_STEPPING_ANY, RETBLEED),
- VULNBL_INTEL_STEPPINGS(ICELAKE_L, X86_STEPPING_ANY, MMIO | MMIO_SBDS | RETBLEED),
- VULNBL_INTEL_STEPPINGS(ICELAKE_D, X86_STEPPING_ANY, MMIO),
- VULNBL_INTEL_STEPPINGS(ICELAKE_X, X86_STEPPING_ANY, MMIO),
- VULNBL_INTEL_STEPPINGS(COMETLAKE, X86_STEPPING_ANY, MMIO | MMIO_SBDS | RETBLEED),
- VULNBL_INTEL_STEPPINGS(COMETLAKE_L, X86_STEPPINGS(0x0, 0x0), MMIO | RETBLEED),
- VULNBL_INTEL_STEPPINGS(COMETLAKE_L, X86_STEPPING_ANY, MMIO | MMIO_SBDS | RETBLEED),
- VULNBL_INTEL_STEPPINGS(LAKEFIELD, X86_STEPPING_ANY, MMIO | MMIO_SBDS | RETBLEED),
- VULNBL_INTEL_STEPPINGS(ROCKETLAKE, X86_STEPPING_ANY, MMIO | RETBLEED),
- VULNBL_INTEL_STEPPINGS(ATOM_TREMONT, X86_STEPPING_ANY, MMIO | MMIO_SBDS),
- VULNBL_INTEL_STEPPINGS(ATOM_TREMONT_D, X86_STEPPING_ANY, MMIO),
- VULNBL_INTEL_STEPPINGS(ATOM_TREMONT_L, X86_STEPPING_ANY, MMIO | MMIO_SBDS),
+ VULNBL_INTEL_STEPS(INTEL_IVYBRIDGE, X86_STEP_MAX, SRBDS),
+ VULNBL_INTEL_STEPS(INTEL_HASWELL, X86_STEP_MAX, SRBDS),
+ VULNBL_INTEL_STEPS(INTEL_HASWELL_L, X86_STEP_MAX, SRBDS),
+ VULNBL_INTEL_STEPS(INTEL_HASWELL_G, X86_STEP_MAX, SRBDS),
+ VULNBL_INTEL_STEPS(INTEL_HASWELL_X, X86_STEP_MAX, MMIO),
+ VULNBL_INTEL_STEPS(INTEL_BROADWELL_D, X86_STEP_MAX, MMIO),
+ VULNBL_INTEL_STEPS(INTEL_BROADWELL_G, X86_STEP_MAX, SRBDS),
+ VULNBL_INTEL_STEPS(INTEL_BROADWELL_X, X86_STEP_MAX, MMIO),
+ VULNBL_INTEL_STEPS(INTEL_BROADWELL, X86_STEP_MAX, SRBDS),
+ VULNBL_INTEL_STEPS(INTEL_SKYLAKE_X, 0x5, MMIO | RETBLEED | GDS),
+ VULNBL_INTEL_STEPS(INTEL_SKYLAKE_X, X86_STEP_MAX, MMIO | RETBLEED | GDS | ITS),
+ VULNBL_INTEL_STEPS(INTEL_SKYLAKE_L, X86_STEP_MAX, MMIO | RETBLEED | GDS | SRBDS),
+ VULNBL_INTEL_STEPS(INTEL_SKYLAKE, X86_STEP_MAX, MMIO | RETBLEED | GDS | SRBDS),
+ VULNBL_INTEL_STEPS(INTEL_KABYLAKE_L, 0xb, MMIO | RETBLEED | GDS | SRBDS),
+ VULNBL_INTEL_STEPS(INTEL_KABYLAKE_L, X86_STEP_MAX, MMIO | RETBLEED | GDS | SRBDS | ITS),
+ VULNBL_INTEL_STEPS(INTEL_KABYLAKE, 0xc, MMIO | RETBLEED | GDS | SRBDS),
+ VULNBL_INTEL_STEPS(INTEL_KABYLAKE, X86_STEP_MAX, MMIO | RETBLEED | GDS | SRBDS | ITS),
+ VULNBL_INTEL_STEPS(INTEL_CANNONLAKE_L, X86_STEP_MAX, RETBLEED),
+ VULNBL_INTEL_STEPS(INTEL_ICELAKE_L, X86_STEP_MAX, MMIO | MMIO_SBDS | RETBLEED | GDS | ITS | ITS_NATIVE_ONLY),
+ VULNBL_INTEL_STEPS(INTEL_ICELAKE_D, X86_STEP_MAX, MMIO | GDS | ITS | ITS_NATIVE_ONLY),
+ VULNBL_INTEL_STEPS(INTEL_ICELAKE_X, X86_STEP_MAX, MMIO | GDS | ITS | ITS_NATIVE_ONLY),
+ VULNBL_INTEL_STEPS(INTEL_COMETLAKE, X86_STEP_MAX, MMIO | MMIO_SBDS | RETBLEED | GDS | ITS),
+ VULNBL_INTEL_STEPS(INTEL_COMETLAKE_L, 0x0, MMIO | RETBLEED | ITS),
+ VULNBL_INTEL_STEPS(INTEL_COMETLAKE_L, X86_STEP_MAX, MMIO | MMIO_SBDS | RETBLEED | GDS | ITS),
+ VULNBL_INTEL_STEPS(INTEL_TIGERLAKE_L, X86_STEP_MAX, GDS | ITS | ITS_NATIVE_ONLY),
+ VULNBL_INTEL_STEPS(INTEL_TIGERLAKE, X86_STEP_MAX, GDS | ITS | ITS_NATIVE_ONLY),
+ VULNBL_INTEL_STEPS(INTEL_LAKEFIELD, X86_STEP_MAX, MMIO | MMIO_SBDS | RETBLEED),
+ VULNBL_INTEL_STEPS(INTEL_ROCKETLAKE, X86_STEP_MAX, MMIO | RETBLEED | GDS | ITS | ITS_NATIVE_ONLY),
+ VULNBL_INTEL_TYPE(INTEL_ALDERLAKE, ATOM, RFDS),
+ VULNBL_INTEL_STEPS(INTEL_ALDERLAKE_L, X86_STEP_MAX, RFDS),
+ VULNBL_INTEL_TYPE(INTEL_RAPTORLAKE, ATOM, RFDS),
+ VULNBL_INTEL_STEPS(INTEL_RAPTORLAKE_P, X86_STEP_MAX, RFDS),
+ VULNBL_INTEL_STEPS(INTEL_RAPTORLAKE_S, X86_STEP_MAX, RFDS),
+ VULNBL_INTEL_STEPS(INTEL_ATOM_GRACEMONT, X86_STEP_MAX, RFDS),
+ VULNBL_INTEL_STEPS(INTEL_ATOM_TREMONT, X86_STEP_MAX, MMIO | MMIO_SBDS | RFDS),
+ VULNBL_INTEL_STEPS(INTEL_ATOM_TREMONT_D, X86_STEP_MAX, MMIO | RFDS),
+ VULNBL_INTEL_STEPS(INTEL_ATOM_TREMONT_L, X86_STEP_MAX, MMIO | MMIO_SBDS | RFDS),
+ VULNBL_INTEL_STEPS(INTEL_ATOM_GOLDMONT, X86_STEP_MAX, RFDS),
+ VULNBL_INTEL_STEPS(INTEL_ATOM_GOLDMONT_D, X86_STEP_MAX, RFDS),
+ VULNBL_INTEL_STEPS(INTEL_ATOM_GOLDMONT_PLUS, X86_STEP_MAX, RFDS),
VULNBL_AMD(0x15, RETBLEED),
VULNBL_AMD(0x16, RETBLEED),
- VULNBL_AMD(0x17, RETBLEED),
- VULNBL_HYGON(0x18, RETBLEED),
+ VULNBL_AMD(0x17, RETBLEED | SMT_RSB | SRSO),
+ VULNBL_HYGON(0x18, RETBLEED | SMT_RSB | SRSO),
+ VULNBL_AMD(0x19, SRSO),
+ VULNBL_AMD(0x1a, SRSO),
{}
};
@@ -1271,28 +1294,114 @@ static bool __init cpu_matches(const struct x86_cpu_id *table, unsigned long whi
u64 x86_read_arch_cap_msr(void)
{
- u64 ia32_cap = 0;
+ u64 x86_arch_cap_msr = 0;
if (boot_cpu_has(X86_FEATURE_ARCH_CAPABILITIES))
- rdmsrl(MSR_IA32_ARCH_CAPABILITIES, ia32_cap);
+ rdmsrq(MSR_IA32_ARCH_CAPABILITIES, x86_arch_cap_msr);
+
+ return x86_arch_cap_msr;
+}
+
+static bool arch_cap_mmio_immune(u64 x86_arch_cap_msr)
+{
+ return (x86_arch_cap_msr & ARCH_CAP_FBSDP_NO &&
+ x86_arch_cap_msr & ARCH_CAP_PSDP_NO &&
+ x86_arch_cap_msr & ARCH_CAP_SBDR_SSDP_NO);
+}
+
+static bool __init vulnerable_to_rfds(u64 x86_arch_cap_msr)
+{
+ /* The "immunity" bit trumps everything else: */
+ if (x86_arch_cap_msr & ARCH_CAP_RFDS_NO)
+ return false;
- return ia32_cap;
+ /*
+ * VMMs set ARCH_CAP_RFDS_CLEAR for processors not in the blacklist to
+ * indicate that mitigation is needed because guest is running on a
+ * vulnerable hardware or may migrate to such hardware:
+ */
+ if (x86_arch_cap_msr & ARCH_CAP_RFDS_CLEAR)
+ return true;
+
+ /* Only consult the blacklist when there is no enumeration: */
+ return cpu_matches(cpu_vuln_blacklist, RFDS);
}
-static bool arch_cap_mmio_immune(u64 ia32_cap)
+static bool __init vulnerable_to_its(u64 x86_arch_cap_msr)
{
- return (ia32_cap & ARCH_CAP_FBSDP_NO &&
- ia32_cap & ARCH_CAP_PSDP_NO &&
- ia32_cap & ARCH_CAP_SBDR_SSDP_NO);
+ /* The "immunity" bit trumps everything else: */
+ if (x86_arch_cap_msr & ARCH_CAP_ITS_NO)
+ return false;
+ if (boot_cpu_data.x86_vendor != X86_VENDOR_INTEL)
+ return false;
+
+ /* None of the affected CPUs have BHI_CTRL */
+ if (boot_cpu_has(X86_FEATURE_BHI_CTRL))
+ return false;
+
+ /*
+ * If a VMM did not expose ITS_NO, assume that a guest could
+ * be running on a vulnerable hardware or may migrate to such
+ * hardware.
+ */
+ if (boot_cpu_has(X86_FEATURE_HYPERVISOR))
+ return true;
+
+ if (cpu_matches(cpu_vuln_blacklist, ITS))
+ return true;
+
+ return false;
+}
+
+static struct x86_cpu_id cpu_latest_microcode[] = {
+#include "microcode/intel-ucode-defs.h"
+ {}
+};
+
+static bool __init cpu_has_old_microcode(void)
+{
+ const struct x86_cpu_id *m = x86_match_cpu(cpu_latest_microcode);
+
+ /* Give unknown CPUs a pass: */
+ if (!m) {
+ /* Intel CPUs should be in the list. Warn if not: */
+ if (boot_cpu_data.x86_vendor == X86_VENDOR_INTEL)
+ pr_info("x86/CPU: Model not found in latest microcode list\n");
+ return false;
+ }
+
+ /*
+ * Hosts usually lie to guests with a super high microcode
+ * version. Just ignore what hosts tell guests:
+ */
+ if (boot_cpu_has(X86_FEATURE_HYPERVISOR))
+ return false;
+
+ /* Consider all debug microcode to be old: */
+ if (boot_cpu_data.microcode & BIT(31))
+ return true;
+
+ /* Give new microcode a pass: */
+ if (boot_cpu_data.microcode >= m->driver_data)
+ return false;
+
+ /* Uh oh, too old: */
+ return true;
}
static void __init cpu_set_bug_bits(struct cpuinfo_x86 *c)
{
- u64 ia32_cap = x86_read_arch_cap_msr();
+ u64 x86_arch_cap_msr = x86_read_arch_cap_msr();
+
+ if (cpu_has_old_microcode()) {
+ pr_warn("x86/CPU: Running old microcode\n");
+ setup_force_cpu_bug(X86_BUG_OLD_MICROCODE);
+ add_taint(TAINT_CPU_OUT_OF_SPEC, LOCKDEP_STILL_OK);
+ }
/* Set ITLB_MULTIHIT bug if cpu is not in the whitelist and not mitigated */
if (!cpu_matches(cpu_vuln_whitelist, NO_ITLB_MULTIHIT) &&
- !(ia32_cap & ARCH_CAP_PSCHANGE_MC_NO))
+ !(x86_arch_cap_msr & ARCH_CAP_PSCHANGE_MC_NO))
setup_force_cpu_bug(X86_BUG_ITLB_MULTIHIT);
if (cpu_matches(cpu_vuln_whitelist, NO_SPECULATION))
@@ -1300,19 +1409,34 @@ static void __init cpu_set_bug_bits(struct cpuinfo_x86 *c)
setup_force_cpu_bug(X86_BUG_SPECTRE_V1);
- if (!cpu_matches(cpu_vuln_whitelist, NO_SPECTRE_V2))
+ if (!cpu_matches(cpu_vuln_whitelist, NO_SPECTRE_V2)) {
setup_force_cpu_bug(X86_BUG_SPECTRE_V2);
+ setup_force_cpu_bug(X86_BUG_SPECTRE_V2_USER);
+ }
if (!cpu_matches(cpu_vuln_whitelist, NO_SSB) &&
- !(ia32_cap & ARCH_CAP_SSB_NO) &&
+ !(x86_arch_cap_msr & ARCH_CAP_SSB_NO) &&
!cpu_has(c, X86_FEATURE_AMD_SSB_NO))
setup_force_cpu_bug(X86_BUG_SPEC_STORE_BYPASS);
- if (ia32_cap & ARCH_CAP_IBRS_ALL)
+ /*
+ * AMD's AutoIBRS is equivalent to Intel's eIBRS - use the Intel feature
+ * flag and protect from vendor-specific bugs via the whitelist.
+ *
+ * Don't use AutoIBRS when SNP is enabled because it degrades host
+ * userspace indirect branch performance.
+ */
+ if ((x86_arch_cap_msr & ARCH_CAP_IBRS_ALL) ||
+ (cpu_has(c, X86_FEATURE_AUTOIBRS) &&
+ !cpu_feature_enabled(X86_FEATURE_SEV_SNP))) {
setup_force_cpu_cap(X86_FEATURE_IBRS_ENHANCED);
+ if (!cpu_matches(cpu_vuln_whitelist, NO_EIBRS_PBRSB) &&
+ !(x86_arch_cap_msr & ARCH_CAP_PBRSB_NO))
+ setup_force_cpu_bug(X86_BUG_EIBRS_PBRSB);
+ }
if (!cpu_matches(cpu_vuln_whitelist, NO_MDS) &&
- !(ia32_cap & ARCH_CAP_MDS_NO)) {
+ !(x86_arch_cap_msr & ARCH_CAP_MDS_NO)) {
setup_force_cpu_bug(X86_BUG_MDS);
if (cpu_matches(cpu_vuln_whitelist, MSBDS_ONLY))
setup_force_cpu_bug(X86_BUG_MSBDS_ONLY);
@@ -1331,9 +1455,9 @@ static void __init cpu_set_bug_bits(struct cpuinfo_x86 *c)
* TSX_CTRL check alone is not sufficient for cases when the microcode
* update is not present or running as guest that don't get TSX_CTRL.
*/
- if (!(ia32_cap & ARCH_CAP_TAA_NO) &&
+ if (!(x86_arch_cap_msr & ARCH_CAP_TAA_NO) &&
(cpu_has(c, X86_FEATURE_RTM) ||
- (ia32_cap & ARCH_CAP_TSX_CTRL_MSR)))
+ (x86_arch_cap_msr & ARCH_CAP_TSX_CTRL_MSR)))
setup_force_cpu_bug(X86_BUG_TAA);
/*
@@ -1356,20 +1480,61 @@ static void __init cpu_set_bug_bits(struct cpuinfo_x86 *c)
* but for virtualization case check for ARCH_CAP MSR bits also, VMM may
* not want the guest to enumerate the bug.
*/
- if (cpu_matches(cpu_vuln_blacklist, MMIO) &&
- !arch_cap_mmio_immune(ia32_cap))
- setup_force_cpu_bug(X86_BUG_MMIO_STALE_DATA);
+ if (!arch_cap_mmio_immune(x86_arch_cap_msr)) {
+ if (cpu_matches(cpu_vuln_blacklist, MMIO))
+ setup_force_cpu_bug(X86_BUG_MMIO_STALE_DATA);
+ }
if (!cpu_has(c, X86_FEATURE_BTC_NO)) {
- if (cpu_matches(cpu_vuln_blacklist, RETBLEED) || (ia32_cap & ARCH_CAP_RSBA))
+ if (cpu_matches(cpu_vuln_blacklist, RETBLEED) || (x86_arch_cap_msr & ARCH_CAP_RSBA))
setup_force_cpu_bug(X86_BUG_RETBLEED);
}
+ if (cpu_matches(cpu_vuln_blacklist, SMT_RSB))
+ setup_force_cpu_bug(X86_BUG_SMT_RSB);
+
+ if (!cpu_has(c, X86_FEATURE_SRSO_NO)) {
+ if (cpu_matches(cpu_vuln_blacklist, SRSO))
+ setup_force_cpu_bug(X86_BUG_SRSO);
+ }
+
+ /*
+ * Check if CPU is vulnerable to GDS. If running in a virtual machine on
+ * an affected processor, the VMM may have disabled the use of GATHER by
+ * disabling AVX2. The only way to do this in HW is to clear XCR0[2],
+ * which means that AVX will be disabled.
+ */
+ if (cpu_matches(cpu_vuln_blacklist, GDS) && !(x86_arch_cap_msr & ARCH_CAP_GDS_NO) &&
+ boot_cpu_has(X86_FEATURE_AVX))
+ setup_force_cpu_bug(X86_BUG_GDS);
+
+ if (vulnerable_to_rfds(x86_arch_cap_msr))
+ setup_force_cpu_bug(X86_BUG_RFDS);
+
+ /*
+ * Intel parts with eIBRS are vulnerable to BHI attacks. Parts with
+ * BHI_NO still need to use the BHI mitigation to prevent Intra-mode
+ * attacks. When virtualized, eIBRS could be hidden, assume vulnerable.
+ */
+ if (!cpu_matches(cpu_vuln_whitelist, NO_BHI) &&
+ (boot_cpu_has(X86_FEATURE_IBRS_ENHANCED) ||
+ boot_cpu_has(X86_FEATURE_HYPERVISOR)))
+ setup_force_cpu_bug(X86_BUG_BHI);
+
+ if (cpu_has(c, X86_FEATURE_AMD_IBPB) && !cpu_has(c, X86_FEATURE_AMD_IBPB_RET))
+ setup_force_cpu_bug(X86_BUG_IBPB_NO_RET);
+
+ if (vulnerable_to_its(x86_arch_cap_msr)) {
+ setup_force_cpu_bug(X86_BUG_ITS);
+ if (cpu_matches(cpu_vuln_blacklist, ITS_NATIVE_ONLY))
+ setup_force_cpu_bug(X86_BUG_ITS_NATIVE_ONLY);
+ }
+
if (cpu_matches(cpu_vuln_whitelist, NO_MELTDOWN))
return;
/* Rogue Data Cache Load? No! */
- if (ia32_cap & ARCH_CAP_RDCL_NO)
+ if (x86_arch_cap_msr & ARCH_CAP_RDCL_NO)
return;
setup_force_cpu_bug(X86_BUG_CPU_MELTDOWN);
@@ -1398,65 +1563,38 @@ static void detect_nopl(void)
#endif
}
-/*
- * We parse cpu parameters early because fpu__init_system() is executed
- * before parse_early_param().
- */
-static void __init cpu_parse_early_param(void)
+static inline bool parse_set_clear_cpuid(char *arg, bool set)
{
- char arg[128];
- char *argptr = arg, *opt;
- int arglen, taint = 0;
-
-#ifdef CONFIG_X86_32
- if (cmdline_find_option_bool(boot_command_line, "no387"))
-#ifdef CONFIG_MATH_EMULATION
- setup_clear_cpu_cap(X86_FEATURE_FPU);
-#else
- pr_err("Option 'no387' required CONFIG_MATH_EMULATION enabled.\n");
-#endif
-
- if (cmdline_find_option_bool(boot_command_line, "nofxsr"))
- setup_clear_cpu_cap(X86_FEATURE_FXSR);
-#endif
-
- if (cmdline_find_option_bool(boot_command_line, "noxsave"))
- setup_clear_cpu_cap(X86_FEATURE_XSAVE);
-
- if (cmdline_find_option_bool(boot_command_line, "noxsaveopt"))
- setup_clear_cpu_cap(X86_FEATURE_XSAVEOPT);
-
- if (cmdline_find_option_bool(boot_command_line, "noxsaves"))
- setup_clear_cpu_cap(X86_FEATURE_XSAVES);
+ char *opt;
+ int taint = 0;
- arglen = cmdline_find_option(boot_command_line, "clearcpuid", arg, sizeof(arg));
- if (arglen <= 0)
- return;
-
- pr_info("Clearing CPUID bits:");
-
- while (argptr) {
+ while (arg) {
bool found __maybe_unused = false;
unsigned int bit;
- opt = strsep(&argptr, ",");
+ opt = strsep(&arg, ",");
/*
* Handle naked numbers first for feature flags which don't
- * have names.
+ * have names. It doesn't make sense for a bug not to have a
+ * name so don't handle bug flags here.
*/
if (!kstrtouint(opt, 10, &bit)) {
if (bit < NCAPINTS * 32) {
-#ifdef CONFIG_X86_FEATURE_NAMES
+ if (set) {
+ pr_warn("setcpuid: force-enabling CPU feature flag:");
+ setup_force_cpu_cap(bit);
+ } else {
+ pr_warn("clearcpuid: force-disabling CPU feature flag:");
+ setup_clear_cpu_cap(bit);
+ }
/* empty-string, i.e., ""-defined feature flags */
if (!x86_cap_flags[bit])
- pr_cont(" " X86_CAP_FMT_NUM, x86_cap_flag_num(bit));
+ pr_cont(" %d:%d\n", bit >> 5, bit & 31);
else
-#endif
- pr_cont(" " X86_CAP_FMT, x86_cap_flag(bit));
+ pr_cont(" %s\n", x86_cap_flags[bit]);
- setup_clear_cpu_cap(bit);
taint++;
}
/*
@@ -1466,29 +1604,97 @@ static void __init cpu_parse_early_param(void)
continue;
}
-#ifdef CONFIG_X86_FEATURE_NAMES
- for (bit = 0; bit < 32 * NCAPINTS; bit++) {
- if (!x86_cap_flag(bit))
+ for (bit = 0; bit < 32 * (NCAPINTS + NBUGINTS); bit++) {
+ const char *flag;
+ const char *kind;
+
+ if (bit < 32 * NCAPINTS) {
+ flag = x86_cap_flags[bit];
+ kind = "feature";
+ } else {
+ kind = "bug";
+ flag = x86_bug_flags[bit - (32 * NCAPINTS)];
+ }
+
+ if (!flag)
continue;
- if (strcmp(x86_cap_flag(bit), opt))
+ if (strcmp(flag, opt))
continue;
- pr_cont(" %s", opt);
- setup_clear_cpu_cap(bit);
+ if (set) {
+ pr_warn("setcpuid: force-enabling CPU %s flag: %s\n",
+ kind, flag);
+ setup_force_cpu_cap(bit);
+ } else {
+ pr_warn("clearcpuid: force-disabling CPU %s flag: %s\n",
+ kind, flag);
+ setup_clear_cpu_cap(bit);
+ }
taint++;
found = true;
break;
}
if (!found)
- pr_cont(" (unknown: %s)", opt);
-#endif
+ pr_warn("%s: unknown CPU flag: %s", set ? "setcpuid" : "clearcpuid", opt);
}
- pr_cont("\n");
- if (taint)
+ return taint;
+}
+
+
+/*
+ * We parse cpu parameters early because fpu__init_system() is executed
+ * before parse_early_param().
+ */
+static void __init cpu_parse_early_param(void)
+{
+ bool cpuid_taint = false;
+ char arg[128];
+ int arglen;
+
+#ifdef CONFIG_X86_32
+ if (cmdline_find_option_bool(boot_command_line, "no387"))
+#ifdef CONFIG_MATH_EMULATION
+ setup_clear_cpu_cap(X86_FEATURE_FPU);
+#else
+ pr_err("Option 'no387' required CONFIG_MATH_EMULATION enabled.\n");
+#endif
+
+ if (cmdline_find_option_bool(boot_command_line, "nofxsr"))
+ setup_clear_cpu_cap(X86_FEATURE_FXSR);
+#endif
+
+ if (cmdline_find_option_bool(boot_command_line, "noxsave"))
+ setup_clear_cpu_cap(X86_FEATURE_XSAVE);
+
+ if (cmdline_find_option_bool(boot_command_line, "noxsaveopt"))
+ setup_clear_cpu_cap(X86_FEATURE_XSAVEOPT);
+
+ if (cmdline_find_option_bool(boot_command_line, "noxsaves"))
+ setup_clear_cpu_cap(X86_FEATURE_XSAVES);
+
+ if (cmdline_find_option_bool(boot_command_line, "nousershstk"))
+ setup_clear_cpu_cap(X86_FEATURE_USER_SHSTK);
+
+ /* Minimize the gap between FRED is available and available but disabled. */
+ arglen = cmdline_find_option(boot_command_line, "fred", arg, sizeof(arg));
+ if (arglen != 2 || strncmp(arg, "on", 2))
+ setup_clear_cpu_cap(X86_FEATURE_FRED);
+
+ arglen = cmdline_find_option(boot_command_line, "clearcpuid", arg, sizeof(arg));
+ if (arglen > 0)
+ cpuid_taint |= parse_set_clear_cpuid(arg, false);
+
+ arglen = cmdline_find_option(boot_command_line, "setcpuid", arg, sizeof(arg));
+ if (arglen > 0)
+ cpuid_taint |= parse_set_clear_cpuid(arg, true);
+
+ if (cpuid_taint) {
+ pr_warn("!!! setcpuid=/clearcpuid= in use, this is for TESTING ONLY, may break things horribly. Tainting kernel.\n");
add_taint(TAINT_CPU_OUT_OF_SPEC, LOCKDEP_STILL_OK);
+ }
}
/*
@@ -1502,42 +1708,37 @@ static void __init cpu_parse_early_param(void)
*/
static void __init early_identify_cpu(struct cpuinfo_x86 *c)
{
-#ifdef CONFIG_X86_64
- c->x86_clflush_size = 64;
- c->x86_phys_bits = 36;
- c->x86_virt_bits = 48;
-#else
- c->x86_clflush_size = 32;
- c->x86_phys_bits = 32;
- c->x86_virt_bits = 32;
-#endif
- c->x86_cache_alignment = c->x86_clflush_size;
-
memset(&c->x86_capability, 0, sizeof(c->x86_capability));
c->extended_cpuid_level = 0;
- if (!have_cpuid_p())
+ if (!cpuid_feature())
identify_cpu_without_cpuid(c);
/* cyrix could have cpuid enabled via c_identify()*/
- if (have_cpuid_p()) {
+ if (cpuid_feature()) {
cpu_detect(c);
get_cpu_vendor(c);
+ intel_unlock_cpuid_leafs(c);
get_cpu_cap(c);
- get_cpu_address_sizes(c);
setup_force_cpu_cap(X86_FEATURE_CPUID);
+ get_cpu_address_sizes(c);
cpu_parse_early_param();
+ cpu_init_topology(c);
+
if (this_cpu->c_early_init)
this_cpu->c_early_init(c);
c->cpu_index = 0;
filter_cpuid_features(c, false);
+ check_cpufeature_deps(c);
if (this_cpu->c_bsp_init)
this_cpu->c_bsp_init(c);
} else {
setup_clear_cpu_cap(X86_FEATURE_CPUID);
+ get_cpu_address_sizes(c);
+ cpu_init_topology(c);
}
setup_force_cpu_cap(X86_FEATURE_ALWAYS);
@@ -1546,10 +1747,6 @@ static void __init early_identify_cpu(struct cpuinfo_x86 *c)
sld_setup(c);
- fpu__init_system(c);
-
- init_sigframe_size();
-
#ifdef CONFIG_X86_32
/*
* Regardless of whether PCID is enumerated, the SDM says
@@ -1576,15 +1773,11 @@ static void __init early_identify_cpu(struct cpuinfo_x86 *c)
detect_nopl();
}
-void __init early_cpu_init(void)
+void __init init_cpu_devs(void)
{
const struct cpu_dev *const *cdev;
int count = 0;
-#ifdef CONFIG_PROCESSOR_SELECT
- pr_info("KERNEL supported cpus:\n");
-#endif
-
for (cdev = __x86_cpu_dev_start; cdev < __x86_cpu_dev_end; cdev++) {
const struct cpu_dev *cpudev = *cdev;
@@ -1592,20 +1785,30 @@ void __init early_cpu_init(void)
break;
cpu_devs[count] = cpudev;
count++;
+ }
+}
+void __init early_cpu_init(void)
+{
#ifdef CONFIG_PROCESSOR_SELECT
- {
- unsigned int j;
-
- for (j = 0; j < 2; j++) {
- if (!cpudev->c_ident[j])
- continue;
- pr_info(" %s %s\n", cpudev->c_vendor,
- cpudev->c_ident[j]);
- }
- }
+ unsigned int i, j;
+
+ pr_info("KERNEL supported cpus:\n");
#endif
+
+ init_cpu_devs();
+
+#ifdef CONFIG_PROCESSOR_SELECT
+ for (i = 0; i < X86_VENDOR_NUM && cpu_devs[i]; i++) {
+ for (j = 0; j < 2; j++) {
+ if (!cpu_devs[i]->c_ident[j])
+ continue;
+ pr_info(" %s %s\n", cpu_devs[i]->c_vendor,
+ cpu_devs[i]->c_ident[j]);
+ }
}
+#endif
+
early_identify_cpu(&boot_cpu_data);
}
@@ -1627,11 +1830,11 @@ static bool detect_null_seg_behavior(void)
*/
unsigned long old_base, tmp;
- rdmsrl(MSR_FS_BASE, old_base);
- wrmsrl(MSR_FS_BASE, 1);
+ rdmsrq(MSR_FS_BASE, old_base);
+ wrmsrq(MSR_FS_BASE, 1);
loadsegment(fs, 0);
- rdmsrl(MSR_FS_BASE, tmp);
- wrmsrl(MSR_FS_BASE, old_base);
+ rdmsrq(MSR_FS_BASE, tmp);
+ wrmsrq(MSR_FS_BASE, old_base);
return tmp == 0;
}
@@ -1641,9 +1844,7 @@ void check_null_seg_clears_base(struct cpuinfo_x86 *c)
if (!IS_ENABLED(CONFIG_X86_64))
return;
- /* Zen3 CPUs advertise Null Selector Clears Base in CPUID. */
- if (c->extended_cpuid_level >= 0x80000021 &&
- cpuid_eax(0x80000021) & BIT(6))
+ if (cpu_has(c, X86_FEATURE_NULL_SEL_CLR_BASE))
return;
/*
@@ -1674,33 +1875,21 @@ static void generic_identify(struct cpuinfo_x86 *c)
{
c->extended_cpuid_level = 0;
- if (!have_cpuid_p())
+ if (!cpuid_feature())
identify_cpu_without_cpuid(c);
/* cyrix could have cpuid enabled via c_identify()*/
- if (!have_cpuid_p())
+ if (!cpuid_feature())
return;
cpu_detect(c);
get_cpu_vendor(c);
-
+ intel_unlock_cpuid_leafs(c);
get_cpu_cap(c);
get_cpu_address_sizes(c);
- if (c->cpuid_level >= 0x00000001) {
- c->initial_apicid = (cpuid_ebx(1) >> 24) & 0xFF;
-#ifdef CONFIG_X86_32
-# ifdef CONFIG_SMP
- c->apicid = apic->phys_pkg_id(c->initial_apicid, 0);
-# else
- c->apicid = c->initial_apicid;
-# endif
-#endif
- c->phys_proc_id = c->initial_apicid;
- }
-
get_model_name(c); /* Default name */
/*
@@ -1722,28 +1911,6 @@ static void generic_identify(struct cpuinfo_x86 *c)
}
/*
- * Validate that ACPI/mptables have the same information about the
- * effective APIC id and update the package map.
- */
-static void validate_apic_and_package_id(struct cpuinfo_x86 *c)
-{
-#ifdef CONFIG_SMP
- unsigned int apicid, cpu = smp_processor_id();
-
- apicid = apic->cpu_present_to_apicid(cpu);
-
- if (apicid != c->apicid) {
- pr_err(FW_BUG "CPU%u: APIC id mismatch. Firmware: %x APIC: %x\n",
- cpu, apicid, c->initial_apicid);
- }
- BUG_ON(topology_update_package_map(c->phys_proc_id, cpu));
- BUG_ON(topology_update_die_map(c->cpu_die_id, cpu));
-#else
- c->logical_proc_id = 0;
-#endif
-}
-
-/*
* This does the hard work of actually picking apart the CPU stuff...
*/
static void identify_cpu(struct cpuinfo_x86 *c)
@@ -1756,9 +1923,6 @@ static void identify_cpu(struct cpuinfo_x86 *c)
c->x86_model = c->x86_stepping = 0; /* So far unknown... */
c->x86_vendor_id[0] = '\0'; /* Unset */
c->x86_model_id[0] = '\0'; /* Unset */
- c->x86_max_cores = 1;
- c->x86_coreid_bits = 0;
- c->cu_id = 0xff;
#ifdef CONFIG_X86_64
c->x86_clflush_size = 64;
c->x86_phys_bits = 36;
@@ -1777,15 +1941,19 @@ static void identify_cpu(struct cpuinfo_x86 *c)
generic_identify(c);
+ cpu_parse_topology(c);
+
if (this_cpu->c_identify)
this_cpu->c_identify(c);
/* Clear/Set all flags overridden by options, after probe */
apply_forced_caps(c);
-#ifdef CONFIG_X86_64
- c->apicid = apic->phys_pkg_id(c->initial_apicid, 0);
-#endif
+ /*
+ * Set default APIC and TSC_DEADLINE MSR fencing flag. AMD and
+ * Hygon will clear it in ->c_init() below.
+ */
+ set_cpu_cap(c, X86_FEATURE_APIC_MSRS_FENCE);
/*
* Vendor-specific initialization. In this section we
@@ -1800,6 +1968,8 @@ static void identify_cpu(struct cpuinfo_x86 *c)
if (this_cpu->c_init)
this_cpu->c_init(c);
+ bus_lock_init();
+
/* Disable the PN if appropriate */
squash_the_stupid_serial_number(c);
@@ -1822,6 +1992,9 @@ static void identify_cpu(struct cpuinfo_x86 *c)
/* Filter out anything that depends on CPUID levels we don't have */
filter_cpuid_features(c, true);
+ /* Check for unmet dependencies based on the CPUID dependency table */
+ check_cpufeature_deps(c);
+
/* If the model name is still unset, do table lookup. */
if (!c->x86_model_id[0]) {
const char *p;
@@ -1834,10 +2007,6 @@ static void identify_cpu(struct cpuinfo_x86 *c)
c->x86, c->x86_model);
}
-#ifdef CONFIG_X86_64
- detect_ht(c);
-#endif
-
x86_init_rdrand(c);
setup_pku(c);
setup_cet(c);
@@ -1869,11 +2038,7 @@ static void identify_cpu(struct cpuinfo_x86 *c)
/* Init Machine Check Exception if available. */
mcheck_cpu_init(c);
- select_idle_routine(c);
-
-#ifdef CONFIG_NUMA
numa_add_cpu(smp_processor_id());
-#endif
}
/*
@@ -1898,42 +2063,50 @@ void enable_sep_cpu(void)
*/
tss->x86_tss.ss1 = __KERNEL_CS;
- wrmsr(MSR_IA32_SYSENTER_CS, tss->x86_tss.ss1, 0);
- wrmsr(MSR_IA32_SYSENTER_ESP, (unsigned long)(cpu_entry_stack(cpu) + 1), 0);
- wrmsr(MSR_IA32_SYSENTER_EIP, (unsigned long)entry_SYSENTER_32, 0);
+ wrmsrq(MSR_IA32_SYSENTER_CS, tss->x86_tss.ss1);
+ wrmsrq(MSR_IA32_SYSENTER_ESP, (unsigned long)(cpu_entry_stack(cpu) + 1));
+ wrmsrq(MSR_IA32_SYSENTER_EIP, (unsigned long)entry_SYSENTER_32);
put_cpu();
}
#endif
-void __init identify_boot_cpu(void)
+static __init void identify_boot_cpu(void)
{
identify_cpu(&boot_cpu_data);
if (HAS_KERNEL_IBT && cpu_feature_enabled(X86_FEATURE_IBT))
pr_info("CET detected: Indirect Branch Tracking enabled\n");
#ifdef CONFIG_X86_32
- sysenter_setup();
enable_sep_cpu();
#endif
cpu_detect_tlb(&boot_cpu_data);
setup_cr_pinning();
tsx_init();
+ tdx_init();
+ lkgs_init();
}
-void identify_secondary_cpu(struct cpuinfo_x86 *c)
+void identify_secondary_cpu(unsigned int cpu)
{
- BUG_ON(c == &boot_cpu_data);
+ struct cpuinfo_x86 *c = &cpu_data(cpu);
+
+ /* Copy boot_cpu_data only on the first bringup */
+ if (!c->initialized)
+ *c = boot_cpu_data;
+ c->cpu_index = cpu;
+
identify_cpu(c);
#ifdef CONFIG_X86_32
enable_sep_cpu();
#endif
- mtrr_ap_init();
- validate_apic_and_package_id(c);
x86_spec_ctrl_setup_ap();
update_srbds_msr();
+ if (boot_cpu_has_bug(X86_BUG_GDS))
+ update_gds_msr();
tsx_ap_init();
+ c->initialized = true;
}
void print_cpu_info(struct cpuinfo_x86 *c)
@@ -1964,37 +2137,42 @@ void print_cpu_info(struct cpuinfo_x86 *c)
}
/*
- * clearcpuid= was already parsed in cpu_parse_early_param(). This dummy
- * function prevents it from becoming an environment variable for init.
+ * clearcpuid= and setcpuid= were already parsed in cpu_parse_early_param().
+ * These dummy functions prevent them from becoming an environment variable for
+ * init.
*/
+
static __init int setup_clearcpuid(char *arg)
{
return 1;
}
__setup("clearcpuid=", setup_clearcpuid);
-#ifdef CONFIG_X86_64
-DEFINE_PER_CPU_FIRST(struct fixed_percpu_data,
- fixed_percpu_data) __aligned(PAGE_SIZE) __visible;
-EXPORT_PER_CPU_SYMBOL_GPL(fixed_percpu_data);
+static __init int setup_setcpuid(char *arg)
+{
+ return 1;
+}
+__setup("setcpuid=", setup_setcpuid);
-/*
- * The following percpu variables are hot. Align current_task to
- * cacheline size such that they fall in the same cacheline.
- */
-DEFINE_PER_CPU(struct task_struct *, current_task) ____cacheline_aligned =
- &init_task;
+DEFINE_PER_CPU_CACHE_HOT(struct task_struct *, current_task) = &init_task;
EXPORT_PER_CPU_SYMBOL(current_task);
+EXPORT_PER_CPU_SYMBOL(const_current_task);
-DEFINE_PER_CPU(void *, hardirq_stack_ptr);
-DEFINE_PER_CPU(bool, hardirq_stack_inuse);
-
-DEFINE_PER_CPU(int, __preempt_count) = INIT_PREEMPT_COUNT;
+DEFINE_PER_CPU_CACHE_HOT(int, __preempt_count) = INIT_PREEMPT_COUNT;
EXPORT_PER_CPU_SYMBOL(__preempt_count);
-DEFINE_PER_CPU(unsigned long, cpu_current_top_of_stack) = TOP_OF_INIT_STACK;
+DEFINE_PER_CPU_CACHE_HOT(unsigned long, cpu_current_top_of_stack) = TOP_OF_INIT_STACK;
+
+#ifdef CONFIG_X86_64
+/*
+ * Note: Do not make this dependant on CONFIG_MITIGATION_CALL_DEPTH_TRACKING
+ * so that this space is reserved in the hot cache section even when the
+ * mitigation is disabled.
+ */
+DEFINE_PER_CPU_CACHE_HOT(u64, __x86_call_depth);
+EXPORT_PER_CPU_SYMBOL(__x86_call_depth);
-static void wrmsrl_cstar(unsigned long val)
+static void wrmsrq_cstar(unsigned long val)
{
/*
* Intel CPUs do not support 32-bit SYSCALL. Writing to MSR_CSTAR
@@ -2002,39 +2180,37 @@ static void wrmsrl_cstar(unsigned long val)
* guest. Avoid the pointless write on all Intel CPUs.
*/
if (boot_cpu_data.x86_vendor != X86_VENDOR_INTEL)
- wrmsrl(MSR_CSTAR, val);
+ wrmsrq(MSR_CSTAR, val);
}
-/* May not be marked __init: used by software suspend */
-void syscall_init(void)
+static inline void idt_syscall_init(void)
{
- wrmsr(MSR_STAR, 0, (__USER32_CS << 16) | __KERNEL_CS);
- wrmsrl(MSR_LSTAR, (unsigned long)entry_SYSCALL_64);
+ wrmsrq(MSR_LSTAR, (unsigned long)entry_SYSCALL_64);
-#ifdef CONFIG_IA32_EMULATION
- wrmsrl_cstar((unsigned long)entry_SYSCALL_compat);
- /*
- * This only works on Intel CPUs.
- * On AMD CPUs these MSRs are 32-bit, CPU truncates MSR_IA32_SYSENTER_EIP.
- * This does not cause SYSENTER to jump to the wrong location, because
- * AMD doesn't allow SYSENTER in long mode (either 32- or 64-bit).
- */
- wrmsrl_safe(MSR_IA32_SYSENTER_CS, (u64)__KERNEL_CS);
- wrmsrl_safe(MSR_IA32_SYSENTER_ESP,
- (unsigned long)(cpu_entry_stack(smp_processor_id()) + 1));
- wrmsrl_safe(MSR_IA32_SYSENTER_EIP, (u64)entry_SYSENTER_compat);
-#else
- wrmsrl_cstar((unsigned long)ignore_sysret);
- wrmsrl_safe(MSR_IA32_SYSENTER_CS, (u64)GDT_ENTRY_INVALID_SEG);
- wrmsrl_safe(MSR_IA32_SYSENTER_ESP, 0ULL);
- wrmsrl_safe(MSR_IA32_SYSENTER_EIP, 0ULL);
-#endif
+ if (ia32_enabled()) {
+ wrmsrq_cstar((unsigned long)entry_SYSCALL_compat);
+ /*
+ * This only works on Intel CPUs.
+ * On AMD CPUs these MSRs are 32-bit, CPU truncates MSR_IA32_SYSENTER_EIP.
+ * This does not cause SYSENTER to jump to the wrong location, because
+ * AMD doesn't allow SYSENTER in long mode (either 32- or 64-bit).
+ */
+ wrmsrq_safe(MSR_IA32_SYSENTER_CS, (u64)__KERNEL_CS);
+ wrmsrq_safe(MSR_IA32_SYSENTER_ESP,
+ (unsigned long)(cpu_entry_stack(smp_processor_id()) + 1));
+ wrmsrq_safe(MSR_IA32_SYSENTER_EIP, (u64)entry_SYSENTER_compat);
+ } else {
+ wrmsrq_cstar((unsigned long)entry_SYSCALL32_ignore);
+ wrmsrq_safe(MSR_IA32_SYSENTER_CS, (u64)GDT_ENTRY_INVALID_SEG);
+ wrmsrq_safe(MSR_IA32_SYSENTER_ESP, 0ULL);
+ wrmsrq_safe(MSR_IA32_SYSENTER_EIP, 0ULL);
+ }
/*
* Flags to clear on syscall; clear as much as possible
* to minimize user space-kernel interference.
*/
- wrmsrl(MSR_SYSCALL_MASK,
+ wrmsrq(MSR_SYSCALL_MASK,
X86_EFLAGS_CF|X86_EFLAGS_PF|X86_EFLAGS_AF|
X86_EFLAGS_ZF|X86_EFLAGS_SF|X86_EFLAGS_TF|
X86_EFLAGS_IF|X86_EFLAGS_DF|X86_EFLAGS_OF|
@@ -2042,28 +2218,30 @@ void syscall_init(void)
X86_EFLAGS_AC|X86_EFLAGS_ID);
}
-#else /* CONFIG_X86_64 */
-
-DEFINE_PER_CPU(struct task_struct *, current_task) = &init_task;
-EXPORT_PER_CPU_SYMBOL(current_task);
-DEFINE_PER_CPU(int, __preempt_count) = INIT_PREEMPT_COUNT;
-EXPORT_PER_CPU_SYMBOL(__preempt_count);
+/* May not be marked __init: used by software suspend */
+void syscall_init(void)
+{
+ /* The default user and kernel segments */
+ wrmsr(MSR_STAR, 0, (__USER32_CS << 16) | __KERNEL_CS);
-/*
- * On x86_32, vm86 modifies tss.sp0, so sp0 isn't a reliable way to find
- * the top of the kernel stack. Use an extra percpu variable to track the
- * top of the kernel stack directly.
- */
-DEFINE_PER_CPU(unsigned long, cpu_current_top_of_stack) =
- (unsigned long)&init_thread_union + THREAD_SIZE;
-EXPORT_PER_CPU_SYMBOL(cpu_current_top_of_stack);
+ /*
+ * Except the IA32_STAR MSR, there is NO need to setup SYSCALL and
+ * SYSENTER MSRs for FRED, because FRED uses the ring 3 FRED
+ * entrypoint for SYSCALL and SYSENTER, and ERETU is the only legit
+ * instruction to return to ring 3 (both sysexit and sysret cause
+ * #UD when FRED is enabled).
+ */
+ if (!cpu_feature_enabled(X86_FEATURE_FRED))
+ idt_syscall_init();
+}
+#endif /* CONFIG_X86_64 */
#ifdef CONFIG_STACKPROTECTOR
-DEFINE_PER_CPU(unsigned long, __stack_chk_guard);
+DEFINE_PER_CPU_CACHE_HOT(unsigned long, __stack_chk_guard);
+#ifndef CONFIG_SMP
EXPORT_PER_CPU_SYMBOL(__stack_chk_guard);
#endif
-
-#endif /* CONFIG_X86_64 */
+#endif
/*
* Clear all 6 debug registers:
@@ -2095,27 +2273,13 @@ static void dbg_restore_debug_regs(void)
#define dbg_restore_debug_regs()
#endif /* ! CONFIG_KGDB */
-static void wait_for_master_cpu(int cpu)
-{
-#ifdef CONFIG_SMP
- /*
- * wait for ACK from master CPU before continuing
- * with AP initialization
- */
- WARN_ON(cpumask_test_and_set_cpu(cpu, cpu_initialized_mask));
- while (!cpumask_test_cpu(cpu, cpu_callout_mask))
- cpu_relax();
-#endif
-}
-
-#ifdef CONFIG_X86_64
static inline void setup_getcpu(int cpu)
{
unsigned long cpudata = vdso_encode_cpunode(cpu, early_cpu_to_node(cpu));
struct desc_struct d = { };
if (boot_cpu_has(X86_FEATURE_RDTSCP) || boot_cpu_has(X86_FEATURE_RDPID))
- wrmsr(MSR_TSC_AUX, cpudata, 0);
+ wrmsrq(MSR_TSC_AUX, cpudata);
/* Store CPU and node number in limit. */
d.limit0 = cpudata;
@@ -2130,12 +2294,7 @@ static inline void setup_getcpu(int cpu)
write_gdt_entry(get_cpu_gdt_rw(cpu), GDT_ENTRY_CPUNODE, &d, DESCTYPE_S);
}
-static inline void ucode_cpu_init(int cpu)
-{
- if (cpu)
- load_ucode_ap();
-}
-
+#ifdef CONFIG_X86_64
static inline void tss_setup_ist(struct tss_struct *tss)
{
/* Set up the per-CPU TSS IST stacks */
@@ -2146,18 +2305,8 @@ static inline void tss_setup_ist(struct tss_struct *tss)
/* Only mapped when SEV-ES is active */
tss->x86_tss.ist[IST_INDEX_VC] = __this_cpu_ist_top_va(VC);
}
-
#else /* CONFIG_X86_64 */
-
-static inline void setup_getcpu(int cpu) { }
-
-static inline void ucode_cpu_init(int cpu)
-{
- show_ucode_info_early();
-}
-
static inline void tss_setup_ist(struct tss_struct *tss) { }
-
#endif /* !CONFIG_X86_64 */
static inline void tss_setup_io_bitmap(struct tss_struct *tss)
@@ -2180,7 +2329,7 @@ static inline void tss_setup_io_bitmap(struct tss_struct *tss)
* Setup everything needed to handle exceptions from the IDT, including the IST
* exceptions which use paranoid_entry().
*/
-void cpu_init_exception_handling(void)
+void cpu_init_exception_handling(bool boot_cpu)
{
struct tss_struct *tss = this_cpu_ptr(&cpu_tss_rw);
int cpu = raw_smp_processor_id();
@@ -2188,8 +2337,9 @@ void cpu_init_exception_handling(void)
/* paranoid_entry() gets the CPU number from the GDT */
setup_getcpu(cpu);
- /* IST vectors need TSS to be set up. */
- tss_setup_ist(tss);
+ /* For IDT mode, IST vectors need to be set in TSS. */
+ if (!cpu_feature_enabled(X86_FEATURE_FRED))
+ tss_setup_ist(tss);
tss_setup_io_bitmap(tss);
set_tss_desc(cpu, &get_cpu_entry_area(cpu)->tss.x86_tss);
@@ -2198,8 +2348,23 @@ void cpu_init_exception_handling(void)
/* GHCB needs to be setup to handle #VC. */
setup_ghcb();
- /* Finally load the IDT */
- load_current_idt();
+ if (cpu_feature_enabled(X86_FEATURE_FRED)) {
+ /* The boot CPU has enabled FRED during early boot */
+ if (!boot_cpu)
+ cpu_init_fred_exceptions();
+
+ cpu_init_fred_rsps();
+ } else {
+ load_current_idt();
+ }
+}
+
+void __init cpu_init_replace_early_idt(void)
+{
+ if (cpu_feature_enabled(X86_FEATURE_FRED))
+ cpu_init_fred_exceptions();
+ else
+ idt_setup_early_pf();
}
/*
@@ -2213,10 +2378,6 @@ void cpu_init(void)
struct task_struct *cur = current;
int cpu = raw_smp_processor_id();
- wait_for_master_cpu(cpu);
-
- ucode_cpu_init(cpu);
-
#ifdef CONFIG_NUMA
if (this_cpu_read(numa_node) == 0 &&
early_cpu_to_node(cpu) != NUMA_NO_NODE)
@@ -2228,22 +2389,18 @@ void cpu_init(void)
boot_cpu_has(X86_FEATURE_TSC) || boot_cpu_has(X86_FEATURE_DE))
cr4_clear_bits(X86_CR4_VME|X86_CR4_PVI|X86_CR4_TSD|X86_CR4_DE);
- /*
- * Initialize the per-CPU GDT with the boot GDT,
- * and set up the GDT descriptor:
- */
- switch_to_new_gdt(cpu);
-
if (IS_ENABLED(CONFIG_X86_64)) {
loadsegment(fs, 0);
memset(cur->thread.tls_array, 0, GDT_ENTRY_TLS_ENTRIES * 8);
syscall_init();
- wrmsrl(MSR_FS_BASE, 0);
- wrmsrl(MSR_KERNEL_GS_BASE, 0);
+ wrmsrq(MSR_FS_BASE, 0);
+ wrmsrq(MSR_KERNEL_GS_BASE, 0);
barrier();
x2apic_setup();
+
+ intel_posted_msi_init();
}
mmgrab(&init_mm);
@@ -2265,51 +2422,53 @@ void cpu_init(void)
doublefault_init_cpu_tss();
- fpu__init_cpu();
-
if (is_uv_system())
uv_cpu_init();
load_fixmap_gdt(cpu);
}
-#ifdef CONFIG_SMP
-void cpu_init_secondary(void)
+#ifdef CONFIG_MICROCODE_LATE_LOADING
+/**
+ * store_cpu_caps() - Store a snapshot of CPU capabilities
+ * @curr_info: Pointer where to store it
+ *
+ * Returns: None
+ */
+void store_cpu_caps(struct cpuinfo_x86 *curr_info)
{
- /*
- * Relies on the BP having set-up the IDT tables, which are loaded
- * on this CPU in cpu_init_exception_handling().
- */
- cpu_init_exception_handling();
- cpu_init();
+ /* Reload CPUID max function as it might've changed. */
+ curr_info->cpuid_level = cpuid_eax(0);
+
+ /* Copy all capability leafs and pick up the synthetic ones. */
+ memcpy(&curr_info->x86_capability, &boot_cpu_data.x86_capability,
+ sizeof(curr_info->x86_capability));
+
+ /* Get the hardware CPUID leafs */
+ get_cpu_cap(curr_info);
}
-#endif
-#ifdef CONFIG_MICROCODE_LATE_LOADING
-/*
+/**
+ * microcode_check() - Check if any CPU capabilities changed after an update.
+ * @prev_info: CPU capabilities stored before an update.
+ *
* The microcode loader calls this upon late microcode load to recheck features,
- * only when microcode has been updated. Caller holds microcode_mutex and CPU
- * hotplug lock.
+ * only when microcode has been updated. Caller holds and CPU hotplug lock.
+ *
+ * Return: None
*/
-void microcode_check(void)
+void microcode_check(struct cpuinfo_x86 *prev_info)
{
- struct cpuinfo_x86 info;
+ struct cpuinfo_x86 curr_info;
perf_check_microcode();
- /* Reload CPUID max function as it might've changed. */
- info.cpuid_level = cpuid_eax(0);
+ amd_check_microcode();
- /*
- * Copy all capability leafs to pick up the synthetic ones so that
- * memcmp() below doesn't fail on that. The ones coming from CPUID will
- * get overwritten in get_cpu_cap().
- */
- memcpy(&info.x86_capability, &boot_cpu_data.x86_capability, sizeof(info.x86_capability));
-
- get_cpu_cap(&info);
+ store_cpu_caps(&curr_info);
- if (!memcmp(&info.x86_capability, &boot_cpu_data.x86_capability, sizeof(info.x86_capability)))
+ if (!memcmp(&prev_info->x86_capability, &curr_info.x86_capability,
+ sizeof(prev_info->x86_capability)))
return;
pr_warn("x86/CPU: CPU features have changed after loading microcode, but might not take effect.\n");
@@ -2327,3 +2486,89 @@ void arch_smt_update(void)
/* Check whether IPI broadcasting can be enabled */
apic_smt_update();
}
+
+void __init arch_cpu_finalize_init(void)
+{
+ struct cpuinfo_x86 *c = this_cpu_ptr(&cpu_info);
+
+ identify_boot_cpu();
+
+ select_idle_routine();
+
+ /*
+ * identify_boot_cpu() initialized SMT support information, let the
+ * core code know.
+ */
+ cpu_smt_set_num_threads(__max_threads_per_core, __max_threads_per_core);
+
+ if (!IS_ENABLED(CONFIG_SMP)) {
+ pr_info("CPU: ");
+ print_cpu_info(&boot_cpu_data);
+ }
+
+ cpu_select_mitigations();
+
+ arch_smt_update();
+
+ if (IS_ENABLED(CONFIG_X86_32)) {
+ /*
+ * Check whether this is a real i386 which is not longer
+ * supported and fixup the utsname.
+ */
+ if (boot_cpu_data.x86 < 4)
+ panic("Kernel requires i486+ for 'invlpg' and other features");
+
+ init_utsname()->machine[1] =
+ '0' + (boot_cpu_data.x86 > 6 ? 6 : boot_cpu_data.x86);
+ }
+
+ /*
+ * Must be before alternatives because it might set or clear
+ * feature bits.
+ */
+ fpu__init_system();
+ fpu__init_cpu();
+
+ /*
+ * Ensure that access to the per CPU representation has the initial
+ * boot CPU configuration.
+ */
+ *c = boot_cpu_data;
+ c->initialized = true;
+
+ alternative_instructions();
+
+ if (IS_ENABLED(CONFIG_X86_64)) {
+ unsigned long USER_PTR_MAX = TASK_SIZE_MAX;
+
+ /*
+ * Enable this when LAM is gated on LASS support
+ if (cpu_feature_enabled(X86_FEATURE_LAM))
+ USER_PTR_MAX = (1ul << 63) - PAGE_SIZE;
+ */
+ runtime_const_init(ptr, USER_PTR_MAX);
+
+ /*
+ * Make sure the first 2MB area is not mapped by huge pages
+ * There are typically fixed size MTRRs in there and overlapping
+ * MTRRs into large pages causes slow downs.
+ *
+ * Right now we don't do that with gbpages because there seems
+ * very little benefit for that case.
+ */
+ if (!direct_gbpages)
+ set_memory_4k((unsigned long)__va(0), 1);
+ } else {
+ fpu__init_check_bugs();
+ }
+
+ /*
+ * This needs to be called before any devices perform DMA
+ * operations that might use the SWIOTLB bounce buffers. It will
+ * mark the bounce buffers as decrypted so that their usage will
+ * not cause "plain-text" data to be decrypted when accessed. It
+ * must be called after late_time_init() so that Hyper-V x86/x64
+ * hypercalls work when the SWIOTLB bounce buffers are decrypted.
+ */
+ mem_encrypt_init();
+}
diff --git a/arch/x86/kernel/cpu/cpu.h b/arch/x86/kernel/cpu/cpu.h
index 7c9b5893c30a..bc38b2d56f26 100644
--- a/arch/x86/kernel/cpu/cpu.h
+++ b/arch/x86/kernel/cpu/cpu.h
@@ -2,6 +2,11 @@
#ifndef ARCH_X86_CPU_H
#define ARCH_X86_CPU_H
+#include <asm/cpu.h>
+#include <asm/topology.h>
+
+#include "topology.h"
+
/* attempt to consolidate cpu attributes */
struct cpu_dev {
const char *c_vendor;
@@ -28,14 +33,6 @@ struct cpu_dev {
#endif
};
-struct _tlb_table {
- unsigned char descriptor;
- char tlb_type;
- unsigned int entries;
- /* unsigned int ways; */
- char info[128];
-};
-
#define cpu_dev_register(cpu_devX) \
static const struct cpu_dev *const __cpu_dev_##cpu_devX __used \
__section(".x86_cpu_dev.init") = \
@@ -56,9 +53,11 @@ extern __ro_after_init enum tsx_ctrl_states tsx_ctrl_state;
extern void __init tsx_init(void);
void tsx_ap_init(void);
+void intel_unlock_cpuid_leafs(struct cpuinfo_x86 *c);
#else
static inline void tsx_init(void) { }
static inline void tsx_ap_init(void) { }
+static inline void intel_unlock_cpuid_leafs(struct cpuinfo_x86 *c) { }
#endif /* CONFIG_CPU_SUP_INTEL */
extern void init_spectral_chicken(struct cpuinfo_x86 *c);
@@ -71,18 +70,34 @@ extern void init_intel_cacheinfo(struct cpuinfo_x86 *c);
extern void init_amd_cacheinfo(struct cpuinfo_x86 *c);
extern void init_hygon_cacheinfo(struct cpuinfo_x86 *c);
-extern void detect_num_cpu_cores(struct cpuinfo_x86 *c);
-extern int detect_extended_topology_early(struct cpuinfo_x86 *c);
-extern int detect_extended_topology(struct cpuinfo_x86 *c);
-extern int detect_ht_early(struct cpuinfo_x86 *c);
-extern void detect_ht(struct cpuinfo_x86 *c);
extern void check_null_seg_clears_base(struct cpuinfo_x86 *c);
+void cacheinfo_amd_init_llc_id(struct cpuinfo_x86 *c, u16 die_id);
+void cacheinfo_hygon_init_llc_id(struct cpuinfo_x86 *c);
+
+#if defined(CONFIG_AMD_NB) && defined(CONFIG_SYSFS)
+struct amd_northbridge *amd_init_l3_cache(int index);
+#else
+static inline struct amd_northbridge *amd_init_l3_cache(int index)
+{
+ return NULL;
+}
+#endif
+
unsigned int aperfmperf_get_khz(int cpu);
+void cpu_select_mitigations(void);
extern void x86_spec_ctrl_setup_ap(void);
extern void update_srbds_msr(void);
+extern void update_gds_msr(void);
+
+extern enum spectre_v2_mitigation spectre_v2_enabled;
-extern u64 x86_read_arch_cap_msr(void);
+static inline bool spectre_v2_in_eibrs_mode(enum spectre_v2_mitigation mode)
+{
+ return mode == SPECTRE_V2_EIBRS ||
+ mode == SPECTRE_V2_EIBRS_RETPOLINE ||
+ mode == SPECTRE_V2_EIBRS_LFENCE;
+}
#endif /* ARCH_X86_CPU_H */
diff --git a/arch/x86/kernel/cpu/cpuid-deps.c b/arch/x86/kernel/cpu/cpuid-deps.c
index c881bcafba7d..46efcbd6afa4 100644
--- a/arch/x86/kernel/cpu/cpuid-deps.c
+++ b/arch/x86/kernel/cpu/cpuid-deps.c
@@ -28,6 +28,7 @@ static const struct cpuid_dep cpuid_deps[] = {
{ X86_FEATURE_PKU, X86_FEATURE_XSAVE },
{ X86_FEATURE_MPX, X86_FEATURE_XSAVE },
{ X86_FEATURE_XGETBV1, X86_FEATURE_XSAVE },
+ { X86_FEATURE_APX, X86_FEATURE_XSAVE },
{ X86_FEATURE_CMOV, X86_FEATURE_FXSR },
{ X86_FEATURE_MMX, X86_FEATURE_FXSR },
{ X86_FEATURE_MMXEXT, X86_FEATURE_MMX },
@@ -44,7 +45,11 @@ static const struct cpuid_dep cpuid_deps[] = {
{ X86_FEATURE_F16C, X86_FEATURE_XMM2, },
{ X86_FEATURE_AES, X86_FEATURE_XMM2 },
{ X86_FEATURE_SHA_NI, X86_FEATURE_XMM2 },
+ { X86_FEATURE_GFNI, X86_FEATURE_XMM2 },
+ { X86_FEATURE_AVX_VNNI, X86_FEATURE_AVX },
{ X86_FEATURE_FMA, X86_FEATURE_AVX },
+ { X86_FEATURE_VAES, X86_FEATURE_AVX },
+ { X86_FEATURE_VPCLMULQDQ, X86_FEATURE_AVX },
{ X86_FEATURE_AVX2, X86_FEATURE_AVX, },
{ X86_FEATURE_AVX512F, X86_FEATURE_AVX, },
{ X86_FEATURE_AVX512IFMA, X86_FEATURE_AVX512F },
@@ -56,9 +61,6 @@ static const struct cpuid_dep cpuid_deps[] = {
{ X86_FEATURE_AVX512VL, X86_FEATURE_AVX512F },
{ X86_FEATURE_AVX512VBMI, X86_FEATURE_AVX512F },
{ X86_FEATURE_AVX512_VBMI2, X86_FEATURE_AVX512VL },
- { X86_FEATURE_GFNI, X86_FEATURE_AVX512VL },
- { X86_FEATURE_VAES, X86_FEATURE_AVX512VL },
- { X86_FEATURE_VPCLMULQDQ, X86_FEATURE_AVX512VL },
{ X86_FEATURE_AVX512_VNNI, X86_FEATURE_AVX512VL },
{ X86_FEATURE_AVX512_BITALG, X86_FEATURE_AVX512VL },
{ X86_FEATURE_AVX512_4VNNIW, X86_FEATURE_AVX512F },
@@ -68,6 +70,8 @@ static const struct cpuid_dep cpuid_deps[] = {
{ X86_FEATURE_CQM_OCCUP_LLC, X86_FEATURE_CQM_LLC },
{ X86_FEATURE_CQM_MBM_TOTAL, X86_FEATURE_CQM_LLC },
{ X86_FEATURE_CQM_MBM_LOCAL, X86_FEATURE_CQM_LLC },
+ { X86_FEATURE_BMEC, X86_FEATURE_CQM_MBM_TOTAL },
+ { X86_FEATURE_BMEC, X86_FEATURE_CQM_MBM_LOCAL },
{ X86_FEATURE_AVX512_BF16, X86_FEATURE_AVX512VL },
{ X86_FEATURE_AVX512_FP16, X86_FEATURE_AVX512BW },
{ X86_FEATURE_ENQCMD, X86_FEATURE_XSAVES },
@@ -75,9 +79,16 @@ static const struct cpuid_dep cpuid_deps[] = {
{ X86_FEATURE_SGX_LC, X86_FEATURE_SGX },
{ X86_FEATURE_SGX1, X86_FEATURE_SGX },
{ X86_FEATURE_SGX2, X86_FEATURE_SGX1 },
+ { X86_FEATURE_SGX_EDECCSSA, X86_FEATURE_SGX1 },
{ X86_FEATURE_XFD, X86_FEATURE_XSAVES },
{ X86_FEATURE_XFD, X86_FEATURE_XGETBV1 },
{ X86_FEATURE_AMX_TILE, X86_FEATURE_XFD },
+ { X86_FEATURE_AMX_FP16, X86_FEATURE_AMX_TILE },
+ { X86_FEATURE_AMX_BF16, X86_FEATURE_AMX_TILE },
+ { X86_FEATURE_AMX_INT8, X86_FEATURE_AMX_TILE },
+ { X86_FEATURE_SHSTK, X86_FEATURE_XSAVES },
+ { X86_FEATURE_FRED, X86_FEATURE_LKGS },
+ { X86_FEATURE_SPEC_CTRL_SSBD, X86_FEATURE_SPEC_CTRL },
{}
};
@@ -108,6 +119,9 @@ static void do_clear_cpu_cap(struct cpuinfo_x86 *c, unsigned int feature)
if (WARN_ON(feature >= MAX_FEATURE_BITS))
return;
+ if (boot_cpu_has(feature))
+ WARN_ON(alternatives_patched);
+
clear_feature(c, feature);
/* Collect all features to disable, handling dependencies */
@@ -138,3 +152,38 @@ void setup_clear_cpu_cap(unsigned int feature)
{
do_clear_cpu_cap(NULL, feature);
}
+
+/*
+ * Return the feature "name" if available, otherwise return
+ * the X86_FEATURE_* numerals to make it easier to identify
+ * the feature.
+ */
+static const char *x86_feature_name(unsigned int feature, char *buf)
+{
+ if (x86_cap_flags[feature])
+ return x86_cap_flags[feature];
+
+ snprintf(buf, 16, "%d*32+%2d", feature / 32, feature % 32);
+
+ return buf;
+}
+
+void check_cpufeature_deps(struct cpuinfo_x86 *c)
+{
+ char feature_buf[16], depends_buf[16];
+ const struct cpuid_dep *d;
+
+ for (d = cpuid_deps; d->feature; d++) {
+ if (cpu_has(c, d->feature) && !cpu_has(c, d->depends)) {
+ /*
+ * Only warn about the first unmet dependency on the
+ * first CPU where it is encountered to avoid spamming
+ * the kernel log.
+ */
+ pr_warn_once("x86 CPU feature dependency check failure: CPU%d has '%s' enabled but '%s' disabled. Kernel might be fine, but no guarantees.\n",
+ smp_processor_id(),
+ x86_feature_name(d->feature, feature_buf),
+ x86_feature_name(d->depends, depends_buf));
+ }
+ }
+}
diff --git a/arch/x86/kernel/cpu/cpuid_0x2_table.c b/arch/x86/kernel/cpu/cpuid_0x2_table.c
new file mode 100644
index 000000000000..89bc8db5e9c6
--- /dev/null
+++ b/arch/x86/kernel/cpu/cpuid_0x2_table.c
@@ -0,0 +1,128 @@
+// SPDX-License-Identifier: GPL-2.0
+
+#include <linux/sizes.h>
+
+#include <asm/cpuid/types.h>
+
+#include "cpu.h"
+
+#define CACHE_ENTRY(_desc, _type, _size) \
+ [_desc] = { \
+ .c_type = (_type), \
+ .c_size = (_size) / SZ_1K, \
+ }
+
+#define TLB_ENTRY(_desc, _type, _entries) \
+ [_desc] = { \
+ .t_type = (_type), \
+ .entries = (_entries), \
+ }
+
+const struct leaf_0x2_table cpuid_0x2_table[256] = {
+ CACHE_ENTRY(0x06, CACHE_L1_INST, SZ_8K ), /* 4-way set assoc, 32 byte line size */
+ CACHE_ENTRY(0x08, CACHE_L1_INST, SZ_16K ), /* 4-way set assoc, 32 byte line size */
+ CACHE_ENTRY(0x09, CACHE_L1_INST, SZ_32K ), /* 4-way set assoc, 64 byte line size */
+ CACHE_ENTRY(0x0a, CACHE_L1_DATA, SZ_8K ), /* 2 way set assoc, 32 byte line size */
+ CACHE_ENTRY(0x0c, CACHE_L1_DATA, SZ_16K ), /* 4-way set assoc, 32 byte line size */
+ CACHE_ENTRY(0x0d, CACHE_L1_DATA, SZ_16K ), /* 4-way set assoc, 64 byte line size */
+ CACHE_ENTRY(0x0e, CACHE_L1_DATA, SZ_24K ), /* 6-way set assoc, 64 byte line size */
+ CACHE_ENTRY(0x21, CACHE_L2, SZ_256K ), /* 8-way set assoc, 64 byte line size */
+ CACHE_ENTRY(0x22, CACHE_L3, SZ_512K ), /* 4-way set assoc, sectored cache, 64 byte line size */
+ CACHE_ENTRY(0x23, CACHE_L3, SZ_1M ), /* 8-way set assoc, sectored cache, 64 byte line size */
+ CACHE_ENTRY(0x25, CACHE_L3, SZ_2M ), /* 8-way set assoc, sectored cache, 64 byte line size */
+ CACHE_ENTRY(0x29, CACHE_L3, SZ_4M ), /* 8-way set assoc, sectored cache, 64 byte line size */
+ CACHE_ENTRY(0x2c, CACHE_L1_DATA, SZ_32K ), /* 8-way set assoc, 64 byte line size */
+ CACHE_ENTRY(0x30, CACHE_L1_INST, SZ_32K ), /* 8-way set assoc, 64 byte line size */
+ CACHE_ENTRY(0x39, CACHE_L2, SZ_128K ), /* 4-way set assoc, sectored cache, 64 byte line size */
+ CACHE_ENTRY(0x3a, CACHE_L2, SZ_192K ), /* 6-way set assoc, sectored cache, 64 byte line size */
+ CACHE_ENTRY(0x3b, CACHE_L2, SZ_128K ), /* 2-way set assoc, sectored cache, 64 byte line size */
+ CACHE_ENTRY(0x3c, CACHE_L2, SZ_256K ), /* 4-way set assoc, sectored cache, 64 byte line size */
+ CACHE_ENTRY(0x3d, CACHE_L2, SZ_384K ), /* 6-way set assoc, sectored cache, 64 byte line size */
+ CACHE_ENTRY(0x3e, CACHE_L2, SZ_512K ), /* 4-way set assoc, sectored cache, 64 byte line size */
+ CACHE_ENTRY(0x3f, CACHE_L2, SZ_256K ), /* 2-way set assoc, 64 byte line size */
+ CACHE_ENTRY(0x41, CACHE_L2, SZ_128K ), /* 4-way set assoc, 32 byte line size */
+ CACHE_ENTRY(0x42, CACHE_L2, SZ_256K ), /* 4-way set assoc, 32 byte line size */
+ CACHE_ENTRY(0x43, CACHE_L2, SZ_512K ), /* 4-way set assoc, 32 byte line size */
+ CACHE_ENTRY(0x44, CACHE_L2, SZ_1M ), /* 4-way set assoc, 32 byte line size */
+ CACHE_ENTRY(0x45, CACHE_L2, SZ_2M ), /* 4-way set assoc, 32 byte line size */
+ CACHE_ENTRY(0x46, CACHE_L3, SZ_4M ), /* 4-way set assoc, 64 byte line size */
+ CACHE_ENTRY(0x47, CACHE_L3, SZ_8M ), /* 8-way set assoc, 64 byte line size */
+ CACHE_ENTRY(0x48, CACHE_L2, SZ_3M ), /* 12-way set assoc, 64 byte line size */
+ CACHE_ENTRY(0x49, CACHE_L3, SZ_4M ), /* 16-way set assoc, 64 byte line size */
+ CACHE_ENTRY(0x4a, CACHE_L3, SZ_6M ), /* 12-way set assoc, 64 byte line size */
+ CACHE_ENTRY(0x4b, CACHE_L3, SZ_8M ), /* 16-way set assoc, 64 byte line size */
+ CACHE_ENTRY(0x4c, CACHE_L3, SZ_12M ), /* 12-way set assoc, 64 byte line size */
+ CACHE_ENTRY(0x4d, CACHE_L3, SZ_16M ), /* 16-way set assoc, 64 byte line size */
+ CACHE_ENTRY(0x4e, CACHE_L2, SZ_6M ), /* 24-way set assoc, 64 byte line size */
+ CACHE_ENTRY(0x60, CACHE_L1_DATA, SZ_16K ), /* 8-way set assoc, sectored cache, 64 byte line size */
+ CACHE_ENTRY(0x66, CACHE_L1_DATA, SZ_8K ), /* 4-way set assoc, sectored cache, 64 byte line size */
+ CACHE_ENTRY(0x67, CACHE_L1_DATA, SZ_16K ), /* 4-way set assoc, sectored cache, 64 byte line size */
+ CACHE_ENTRY(0x68, CACHE_L1_DATA, SZ_32K ), /* 4-way set assoc, sectored cache, 64 byte line size */
+ CACHE_ENTRY(0x78, CACHE_L2, SZ_1M ), /* 4-way set assoc, 64 byte line size */
+ CACHE_ENTRY(0x79, CACHE_L2, SZ_128K ), /* 8-way set assoc, sectored cache, 64 byte line size */
+ CACHE_ENTRY(0x7a, CACHE_L2, SZ_256K ), /* 8-way set assoc, sectored cache, 64 byte line size */
+ CACHE_ENTRY(0x7b, CACHE_L2, SZ_512K ), /* 8-way set assoc, sectored cache, 64 byte line size */
+ CACHE_ENTRY(0x7c, CACHE_L2, SZ_1M ), /* 8-way set assoc, sectored cache, 64 byte line size */
+ CACHE_ENTRY(0x7d, CACHE_L2, SZ_2M ), /* 8-way set assoc, 64 byte line size */
+ CACHE_ENTRY(0x7f, CACHE_L2, SZ_512K ), /* 2-way set assoc, 64 byte line size */
+ CACHE_ENTRY(0x80, CACHE_L2, SZ_512K ), /* 8-way set assoc, 64 byte line size */
+ CACHE_ENTRY(0x82, CACHE_L2, SZ_256K ), /* 8-way set assoc, 32 byte line size */
+ CACHE_ENTRY(0x83, CACHE_L2, SZ_512K ), /* 8-way set assoc, 32 byte line size */
+ CACHE_ENTRY(0x84, CACHE_L2, SZ_1M ), /* 8-way set assoc, 32 byte line size */
+ CACHE_ENTRY(0x85, CACHE_L2, SZ_2M ), /* 8-way set assoc, 32 byte line size */
+ CACHE_ENTRY(0x86, CACHE_L2, SZ_512K ), /* 4-way set assoc, 64 byte line size */
+ CACHE_ENTRY(0x87, CACHE_L2, SZ_1M ), /* 8-way set assoc, 64 byte line size */
+ CACHE_ENTRY(0xd0, CACHE_L3, SZ_512K ), /* 4-way set assoc, 64 byte line size */
+ CACHE_ENTRY(0xd1, CACHE_L3, SZ_1M ), /* 4-way set assoc, 64 byte line size */
+ CACHE_ENTRY(0xd2, CACHE_L3, SZ_2M ), /* 4-way set assoc, 64 byte line size */
+ CACHE_ENTRY(0xd6, CACHE_L3, SZ_1M ), /* 8-way set assoc, 64 byte line size */
+ CACHE_ENTRY(0xd7, CACHE_L3, SZ_2M ), /* 8-way set assoc, 64 byte line size */
+ CACHE_ENTRY(0xd8, CACHE_L3, SZ_4M ), /* 12-way set assoc, 64 byte line size */
+ CACHE_ENTRY(0xdc, CACHE_L3, SZ_2M ), /* 12-way set assoc, 64 byte line size */
+ CACHE_ENTRY(0xdd, CACHE_L3, SZ_4M ), /* 12-way set assoc, 64 byte line size */
+ CACHE_ENTRY(0xde, CACHE_L3, SZ_8M ), /* 12-way set assoc, 64 byte line size */
+ CACHE_ENTRY(0xe2, CACHE_L3, SZ_2M ), /* 16-way set assoc, 64 byte line size */
+ CACHE_ENTRY(0xe3, CACHE_L3, SZ_4M ), /* 16-way set assoc, 64 byte line size */
+ CACHE_ENTRY(0xe4, CACHE_L3, SZ_8M ), /* 16-way set assoc, 64 byte line size */
+ CACHE_ENTRY(0xea, CACHE_L3, SZ_12M ), /* 24-way set assoc, 64 byte line size */
+ CACHE_ENTRY(0xeb, CACHE_L3, SZ_18M ), /* 24-way set assoc, 64 byte line size */
+ CACHE_ENTRY(0xec, CACHE_L3, SZ_24M ), /* 24-way set assoc, 64 byte line size */
+
+ TLB_ENTRY( 0x01, TLB_INST_4K, 32 ), /* TLB_INST 4 KByte pages, 4-way set associative */
+ TLB_ENTRY( 0x02, TLB_INST_4M, 2 ), /* TLB_INST 4 MByte pages, full associative */
+ TLB_ENTRY( 0x03, TLB_DATA_4K, 64 ), /* TLB_DATA 4 KByte pages, 4-way set associative */
+ TLB_ENTRY( 0x04, TLB_DATA_4M, 8 ), /* TLB_DATA 4 MByte pages, 4-way set associative */
+ TLB_ENTRY( 0x05, TLB_DATA_4M, 32 ), /* TLB_DATA 4 MByte pages, 4-way set associative */
+ TLB_ENTRY( 0x0b, TLB_INST_4M, 4 ), /* TLB_INST 4 MByte pages, 4-way set associative */
+ TLB_ENTRY( 0x4f, TLB_INST_4K, 32 ), /* TLB_INST 4 KByte pages */
+ TLB_ENTRY( 0x50, TLB_INST_ALL, 64 ), /* TLB_INST 4 KByte and 2-MByte or 4-MByte pages */
+ TLB_ENTRY( 0x51, TLB_INST_ALL, 128 ), /* TLB_INST 4 KByte and 2-MByte or 4-MByte pages */
+ TLB_ENTRY( 0x52, TLB_INST_ALL, 256 ), /* TLB_INST 4 KByte and 2-MByte or 4-MByte pages */
+ TLB_ENTRY( 0x55, TLB_INST_2M_4M, 7 ), /* TLB_INST 2-MByte or 4-MByte pages, fully associative */
+ TLB_ENTRY( 0x56, TLB_DATA0_4M, 16 ), /* TLB_DATA0 4 MByte pages, 4-way set associative */
+ TLB_ENTRY( 0x57, TLB_DATA0_4K, 16 ), /* TLB_DATA0 4 KByte pages, 4-way associative */
+ TLB_ENTRY( 0x59, TLB_DATA0_4K, 16 ), /* TLB_DATA0 4 KByte pages, fully associative */
+ TLB_ENTRY( 0x5a, TLB_DATA0_2M_4M, 32 ), /* TLB_DATA0 2-MByte or 4 MByte pages, 4-way set associative */
+ TLB_ENTRY( 0x5b, TLB_DATA_4K_4M, 64 ), /* TLB_DATA 4 KByte and 4 MByte pages */
+ TLB_ENTRY( 0x5c, TLB_DATA_4K_4M, 128 ), /* TLB_DATA 4 KByte and 4 MByte pages */
+ TLB_ENTRY( 0x5d, TLB_DATA_4K_4M, 256 ), /* TLB_DATA 4 KByte and 4 MByte pages */
+ TLB_ENTRY( 0x61, TLB_INST_4K, 48 ), /* TLB_INST 4 KByte pages, full associative */
+ TLB_ENTRY( 0x63, TLB_DATA_1G_2M_4M, 4 ), /* TLB_DATA 1 GByte pages, 4-way set associative
+ * (plus 32 entries TLB_DATA 2 MByte or 4 MByte pages, not encoded here) */
+ TLB_ENTRY( 0x6b, TLB_DATA_4K, 256 ), /* TLB_DATA 4 KByte pages, 8-way associative */
+ TLB_ENTRY( 0x6c, TLB_DATA_2M_4M, 128 ), /* TLB_DATA 2 MByte or 4 MByte pages, 8-way associative */
+ TLB_ENTRY( 0x6d, TLB_DATA_1G, 16 ), /* TLB_DATA 1 GByte pages, fully associative */
+ TLB_ENTRY( 0x76, TLB_INST_2M_4M, 8 ), /* TLB_INST 2-MByte or 4-MByte pages, fully associative */
+ TLB_ENTRY( 0xb0, TLB_INST_4K, 128 ), /* TLB_INST 4 KByte pages, 4-way set associative */
+ TLB_ENTRY( 0xb1, TLB_INST_2M_4M, 4 ), /* TLB_INST 2M pages, 4-way, 8 entries or 4M pages, 4-way entries */
+ TLB_ENTRY( 0xb2, TLB_INST_4K, 64 ), /* TLB_INST 4KByte pages, 4-way set associative */
+ TLB_ENTRY( 0xb3, TLB_DATA_4K, 128 ), /* TLB_DATA 4 KByte pages, 4-way set associative */
+ TLB_ENTRY( 0xb4, TLB_DATA_4K, 256 ), /* TLB_DATA 4 KByte pages, 4-way associative */
+ TLB_ENTRY( 0xb5, TLB_INST_4K, 64 ), /* TLB_INST 4 KByte pages, 8-way set associative */
+ TLB_ENTRY( 0xb6, TLB_INST_4K, 128 ), /* TLB_INST 4 KByte pages, 8-way set associative */
+ TLB_ENTRY( 0xba, TLB_DATA_4K, 64 ), /* TLB_DATA 4 KByte pages, 4-way associative */
+ TLB_ENTRY( 0xc0, TLB_DATA_4K_4M, 8 ), /* TLB_DATA 4 KByte and 4 MByte pages, 4-way associative */
+ TLB_ENTRY( 0xc1, STLB_4K_2M, 1024 ), /* STLB 4 KByte and 2 MByte pages, 8-way associative */
+ TLB_ENTRY( 0xc2, TLB_DATA_2M_4M, 16 ), /* TLB_DATA 2 MByte/4MByte pages, 4-way associative */
+ TLB_ENTRY( 0xca, STLB_4K, 512 ), /* STLB 4 KByte pages, 4-way associative */
+};
diff --git a/arch/x86/kernel/cpu/cyrix.c b/arch/x86/kernel/cpu/cyrix.c
index 7227c15299d0..dfec2c61e354 100644
--- a/arch/x86/kernel/cpu/cyrix.c
+++ b/arch/x86/kernel/cpu/cyrix.c
@@ -1,6 +1,7 @@
// SPDX-License-Identifier: GPL-2.0
#include <linux/bitops.h>
#include <linux/delay.h>
+#include <linux/isa-dma.h>
#include <linux/pci.h>
#include <asm/dma.h>
#include <linux/io.h>
@@ -152,8 +153,8 @@ static void geode_configure(void)
u8 ccr3;
local_irq_save(flags);
- /* Suspend on halt power saving and enable #SUSP pin */
- setCx86(CX86_CCR2, getCx86(CX86_CCR2) | 0x88);
+ /* Suspend on halt power saving */
+ setCx86(CX86_CCR2, getCx86(CX86_CCR2) | 0x08);
ccr3 = getCx86(CX86_CCR3);
setCx86(CX86_CCR3, (ccr3 & 0x0f) | 0x10); /* enable MAPEN */
diff --git a/arch/x86/kernel/cpu/debugfs.c b/arch/x86/kernel/cpu/debugfs.c
new file mode 100644
index 000000000000..1976fef2dfe5
--- /dev/null
+++ b/arch/x86/kernel/cpu/debugfs.c
@@ -0,0 +1,101 @@
+// SPDX-License-Identifier: GPL-2.0
+
+#include <linux/debugfs.h>
+
+#include <asm/apic.h>
+#include <asm/processor.h>
+
+#include "cpu.h"
+
+static int cpu_debug_show(struct seq_file *m, void *p)
+{
+ unsigned long cpu = (unsigned long)m->private;
+ struct cpuinfo_x86 *c = per_cpu_ptr(&cpu_info, cpu);
+
+ seq_printf(m, "online: %d\n", cpu_online(cpu));
+ if (!c->initialized)
+ return 0;
+
+ seq_printf(m, "initial_apicid: 0x%x\n", c->topo.initial_apicid);
+ seq_printf(m, "apicid: 0x%x\n", c->topo.apicid);
+ seq_printf(m, "pkg_id: %u\n", c->topo.pkg_id);
+ seq_printf(m, "die_id: %u\n", c->topo.die_id);
+ seq_printf(m, "cu_id: %u\n", c->topo.cu_id);
+ seq_printf(m, "core_id: %u\n", c->topo.core_id);
+ seq_printf(m, "cpu_type: %s\n", get_topology_cpu_type_name(c));
+ seq_printf(m, "logical_pkg_id: %u\n", c->topo.logical_pkg_id);
+ seq_printf(m, "logical_die_id: %u\n", c->topo.logical_die_id);
+ seq_printf(m, "logical_core_id: %u\n", c->topo.logical_core_id);
+ seq_printf(m, "llc_id: %u\n", c->topo.llc_id);
+ seq_printf(m, "l2c_id: %u\n", c->topo.l2c_id);
+ seq_printf(m, "amd_node_id: %u\n", c->topo.amd_node_id);
+ seq_printf(m, "amd_nodes_per_pkg: %u\n", topology_amd_nodes_per_pkg());
+ seq_printf(m, "num_threads: %u\n", __num_threads_per_package);
+ seq_printf(m, "num_cores: %u\n", __num_cores_per_package);
+ seq_printf(m, "max_dies_per_pkg: %u\n", __max_dies_per_package);
+ seq_printf(m, "max_threads_per_core:%u\n", __max_threads_per_core);
+ return 0;
+}
+
+static int cpu_debug_open(struct inode *inode, struct file *file)
+{
+ return single_open(file, cpu_debug_show, inode->i_private);
+}
+
+static const struct file_operations dfs_cpu_ops = {
+ .open = cpu_debug_open,
+ .read = seq_read,
+ .llseek = seq_lseek,
+ .release = single_release,
+};
+
+static int dom_debug_show(struct seq_file *m, void *p)
+{
+ static const char *domain_names[TOPO_MAX_DOMAIN] = {
+ [TOPO_SMT_DOMAIN] = "Thread",
+ [TOPO_CORE_DOMAIN] = "Core",
+ [TOPO_MODULE_DOMAIN] = "Module",
+ [TOPO_TILE_DOMAIN] = "Tile",
+ [TOPO_DIE_DOMAIN] = "Die",
+ [TOPO_DIEGRP_DOMAIN] = "DieGrp",
+ [TOPO_PKG_DOMAIN] = "Package",
+ };
+ unsigned int dom, nthreads = 1;
+
+ for (dom = 0; dom < TOPO_MAX_DOMAIN; dom++) {
+ nthreads *= x86_topo_system.dom_size[dom];
+ seq_printf(m, "domain: %-10s shift: %u dom_size: %5u max_threads: %5u\n",
+ domain_names[dom], x86_topo_system.dom_shifts[dom],
+ x86_topo_system.dom_size[dom], nthreads);
+ }
+ return 0;
+}
+
+static int dom_debug_open(struct inode *inode, struct file *file)
+{
+ return single_open(file, dom_debug_show, inode->i_private);
+}
+
+static const struct file_operations dfs_dom_ops = {
+ .open = dom_debug_open,
+ .read = seq_read,
+ .llseek = seq_lseek,
+ .release = single_release,
+};
+
+static __init int cpu_init_debugfs(void)
+{
+ struct dentry *dir, *base = debugfs_create_dir("topo", arch_debugfs_dir);
+ unsigned long id;
+ char name[24];
+
+ debugfs_create_file("domains", 0444, base, NULL, &dfs_dom_ops);
+
+ dir = debugfs_create_dir("cpus", base);
+ for_each_possible_cpu(id) {
+ sprintf(name, "%lu", id);
+ debugfs_create_file(name, 0444, dir, (void *)id, &dfs_cpu_ops);
+ }
+ return 0;
+}
+late_initcall(cpu_init_debugfs);
diff --git a/arch/x86/kernel/cpu/feat_ctl.c b/arch/x86/kernel/cpu/feat_ctl.c
index da696eb4821a..d69757246bde 100644
--- a/arch/x86/kernel/cpu/feat_ctl.c
+++ b/arch/x86/kernel/cpu/feat_ctl.c
@@ -1,11 +1,12 @@
// SPDX-License-Identifier: GPL-2.0
#include <linux/tboot.h>
+#include <asm/cpu.h>
#include <asm/cpufeature.h>
#include <asm/msr-index.h>
+#include <asm/msr.h>
#include <asm/processor.h>
#include <asm/vmx.h>
-#include "cpu.h"
#undef pr_fmt
#define pr_fmt(fmt) "x86/cpu: " fmt
@@ -15,6 +16,8 @@ enum vmx_feature_leafs {
MISC_FEATURES = 0,
PRIMARY_CTLS,
SECONDARY_CTLS,
+ TERTIARY_CTLS_LOW,
+ TERTIARY_CTLS_HIGH,
NR_VMX_FEATURE_WORDS,
};
@@ -22,7 +25,7 @@ enum vmx_feature_leafs {
static void init_vmx_capabilities(struct cpuinfo_x86 *c)
{
- u32 supported, funcs, ept, vpid, ign;
+ u32 supported, funcs, ept, vpid, ign, low, high;
BUILD_BUG_ON(NVMXINTS != NR_VMX_FEATURE_WORDS);
@@ -42,6 +45,11 @@ static void init_vmx_capabilities(struct cpuinfo_x86 *c)
rdmsr_safe(MSR_IA32_VMX_PROCBASED_CTLS2, &ign, &supported);
c->vmx_capability[SECONDARY_CTLS] = supported;
+ /* All 64 bits of tertiary controls MSR are allowed-1 settings. */
+ rdmsr_safe(MSR_IA32_VMX_PROCBASED_CTLS3, &low, &high);
+ c->vmx_capability[TERTIARY_CTLS_LOW] = low;
+ c->vmx_capability[TERTIARY_CTLS_HIGH] = high;
+
rdmsr(MSR_IA32_VMX_PINBASED_CTLS, ign, supported);
rdmsr_safe(MSR_IA32_VMX_VMFUNC, &ign, &funcs);
@@ -65,6 +73,8 @@ static void init_vmx_capabilities(struct cpuinfo_x86 *c)
c->vmx_capability[MISC_FEATURES] |= VMX_F(EPT_AD);
if (ept & VMX_EPT_1GB_PAGE_BIT)
c->vmx_capability[MISC_FEATURES] |= VMX_F(EPT_1GB);
+ if (ept & VMX_EPT_PAGE_WALK_5_BIT)
+ c->vmx_capability[MISC_FEATURES] |= VMX_F(EPT_5LEVEL);
/* Synthetic APIC features that are aggregates of multiple features. */
if ((c->vmx_capability[PRIMARY_CTLS] & VMX_F(VIRTUAL_TPR)) &&
@@ -109,7 +119,7 @@ void init_ia32_feat_ctl(struct cpuinfo_x86 *c)
bool enable_vmx;
u64 msr;
- if (rdmsrl_safe(MSR_IA32_FEAT_CTL, &msr)) {
+ if (rdmsrq_safe(MSR_IA32_FEAT_CTL, &msr)) {
clear_cpu_cap(c, X86_FEATURE_VMX);
clear_cpu_cap(c, X86_FEATURE_SGX);
return;
@@ -156,7 +166,7 @@ void init_ia32_feat_ctl(struct cpuinfo_x86 *c)
msr |= FEAT_CTL_SGX_LC_ENABLED;
}
- wrmsrl(MSR_IA32_FEAT_CTL, msr);
+ wrmsrq(MSR_IA32_FEAT_CTL, msr);
update_caps:
set_cpu_cap(c, X86_FEATURE_MSR_IA32_FEAT_CTL);
@@ -179,7 +189,7 @@ update_caps:
update_sgx:
if (!(msr & FEAT_CTL_SGX_ENABLED)) {
if (enable_sgx_kvm || enable_sgx_driver)
- pr_err_once("SGX disabled by BIOS.\n");
+ pr_err_once("SGX disabled or unsupported by BIOS.\n");
clear_cpu_cap(c, X86_FEATURE_SGX);
return;
}
diff --git a/arch/x86/kernel/cpu/hygon.c b/arch/x86/kernel/cpu/hygon.c
index 21fd425088fe..2154f12766fb 100644
--- a/arch/x86/kernel/cpu/hygon.c
+++ b/arch/x86/kernel/cpu/hygon.c
@@ -8,23 +8,17 @@
*/
#include <linux/io.h>
+#include <asm/apic.h>
#include <asm/cpu.h>
#include <asm/smp.h>
#include <asm/numa.h>
#include <asm/cacheinfo.h>
#include <asm/spec-ctrl.h>
#include <asm/delay.h>
+#include <asm/msr.h>
#include "cpu.h"
-#define APICID_SOCKET_ID_BIT 6
-
-/*
- * nodes_per_socket: Stores the number of nodes per socket.
- * Refer to CPUID Fn8000_001E_ECX Node Identifiers[10:8]
- */
-static u32 nodes_per_socket = 1;
-
#ifdef CONFIG_NUMA
/*
* To workaround broken NUMA config. Read the comment in
@@ -48,90 +42,16 @@ static int nearby_node(int apicid)
}
#endif
-static void hygon_get_topology_early(struct cpuinfo_x86 *c)
-{
- if (cpu_has(c, X86_FEATURE_TOPOEXT))
- smp_num_siblings = ((cpuid_ebx(0x8000001e) >> 8) & 0xff) + 1;
-}
-
-/*
- * Fixup core topology information for
- * (1) Hygon multi-node processors
- * Assumption: Number of cores in each internal node is the same.
- * (2) Hygon processors supporting compute units
- */
-static void hygon_get_topology(struct cpuinfo_x86 *c)
-{
- int cpu = smp_processor_id();
-
- /* get information required for multi-node processors */
- if (boot_cpu_has(X86_FEATURE_TOPOEXT)) {
- int err;
- u32 eax, ebx, ecx, edx;
-
- cpuid(0x8000001e, &eax, &ebx, &ecx, &edx);
-
- c->cpu_die_id = ecx & 0xff;
-
- c->cpu_core_id = ebx & 0xff;
-
- if (smp_num_siblings > 1)
- c->x86_max_cores /= smp_num_siblings;
-
- /*
- * In case leaf B is available, use it to derive
- * topology information.
- */
- err = detect_extended_topology(c);
- if (!err)
- c->x86_coreid_bits = get_count_order(c->x86_max_cores);
-
- /* Socket ID is ApicId[6] for these processors. */
- c->phys_proc_id = c->apicid >> APICID_SOCKET_ID_BIT;
-
- cacheinfo_hygon_init_llc_id(c, cpu);
- } else if (cpu_has(c, X86_FEATURE_NODEID_MSR)) {
- u64 value;
-
- rdmsrl(MSR_FAM10H_NODE_ID, value);
- c->cpu_die_id = value & 7;
-
- per_cpu(cpu_llc_id, cpu) = c->cpu_die_id;
- } else
- return;
-
- if (nodes_per_socket > 1)
- set_cpu_cap(c, X86_FEATURE_AMD_DCM);
-}
-
-/*
- * On Hygon setup the lower bits of the APIC id distinguish the cores.
- * Assumes number of cores is a power of two.
- */
-static void hygon_detect_cmp(struct cpuinfo_x86 *c)
-{
- unsigned int bits;
- int cpu = smp_processor_id();
-
- bits = c->x86_coreid_bits;
- /* Low order bits define the core id (index of core in socket) */
- c->cpu_core_id = c->initial_apicid & ((1 << bits)-1);
- /* Convert the initial APIC ID into the socket ID */
- c->phys_proc_id = c->initial_apicid >> bits;
- /* use socket ID also for last level cache */
- per_cpu(cpu_llc_id, cpu) = c->cpu_die_id = c->phys_proc_id;
-}
-
static void srat_detect_node(struct cpuinfo_x86 *c)
{
#ifdef CONFIG_NUMA
int cpu = smp_processor_id();
int node;
- unsigned int apicid = c->apicid;
+ unsigned int apicid = c->topo.apicid;
node = numa_cpu_node(cpu);
if (node == NUMA_NO_NODE)
- node = per_cpu(cpu_llc_id, cpu);
+ node = c->topo.llc_id;
/*
* On multi-fabric platform (e.g. Numascale NumaChip) a
@@ -160,7 +80,7 @@ static void srat_detect_node(struct cpuinfo_x86 *c)
* through CPU mapping may alter the outcome, directly
* access __apicid_to_node[].
*/
- int ht_nodeid = c->initial_apicid;
+ int ht_nodeid = c->topo.initial_apicid;
if (__apicid_to_node[ht_nodeid] != NUMA_NO_NODE)
node = __apicid_to_node[ht_nodeid];
@@ -172,38 +92,12 @@ static void srat_detect_node(struct cpuinfo_x86 *c)
#endif
}
-static void early_init_hygon_mc(struct cpuinfo_x86 *c)
-{
-#ifdef CONFIG_SMP
- unsigned int bits, ecx;
-
- /* Multi core CPU? */
- if (c->extended_cpuid_level < 0x80000008)
- return;
-
- ecx = cpuid_ecx(0x80000008);
-
- c->x86_max_cores = (ecx & 0xff) + 1;
-
- /* CPU telling us the core id bits shift? */
- bits = (ecx >> 12) & 0xF;
-
- /* Otherwise recompute */
- if (bits == 0) {
- while ((1 << bits) < c->x86_max_cores)
- bits++;
- }
-
- c->x86_coreid_bits = bits;
-#endif
-}
-
static void bsp_init_hygon(struct cpuinfo_x86 *c)
{
if (cpu_has(c, X86_FEATURE_CONSTANT_TSC)) {
u64 val;
- rdmsrl(MSR_K7_HWCR, val);
+ rdmsrq(MSR_K7_HWCR, val);
if (!(val & BIT(24)))
pr_warn(FW_BUG "TSC doesn't count with P0 frequency!\n");
}
@@ -211,25 +105,13 @@ static void bsp_init_hygon(struct cpuinfo_x86 *c)
if (cpu_has(c, X86_FEATURE_MWAITX))
use_mwaitx_delay();
- if (boot_cpu_has(X86_FEATURE_TOPOEXT)) {
- u32 ecx;
-
- ecx = cpuid_ecx(0x8000001e);
- __max_die_per_package = nodes_per_socket = ((ecx >> 8) & 7) + 1;
- } else if (boot_cpu_has(X86_FEATURE_NODEID_MSR)) {
- u64 value;
-
- rdmsrl(MSR_FAM10H_NODE_ID, value);
- __max_die_per_package = nodes_per_socket = ((value >> 3) & 7) + 1;
- }
-
if (!boot_cpu_has(X86_FEATURE_AMD_SSBD) &&
!boot_cpu_has(X86_FEATURE_VIRT_SSBD)) {
/*
* Try to cache the base value so further operations can
* avoid RMW. If that faults, do not enable SSBD.
*/
- if (!rdmsrl_safe(MSR_AMD64_LS_CFG, &x86_amd_ls_cfg_base)) {
+ if (!rdmsrq_safe(MSR_AMD64_LS_CFG, &x86_amd_ls_cfg_base)) {
setup_force_cpu_cap(X86_FEATURE_LS_CFG_SSBD);
setup_force_cpu_cap(X86_FEATURE_SSBD);
x86_amd_ls_cfg_ssbd_mask = 1ULL << 10;
@@ -241,8 +123,6 @@ static void early_init_hygon(struct cpuinfo_x86 *c)
{
u32 dummy;
- early_init_hygon_mc(c);
-
set_cpu_cap(c, X86_FEATURE_K8);
rdmsr_safe(MSR_AMD64_PATCH_LEVEL, &c->microcode, &dummy);
@@ -283,12 +163,12 @@ static void early_init_hygon(struct cpuinfo_x86 *c)
* we can set it unconditionally.
*/
set_cpu_cap(c, X86_FEATURE_VMMCALL);
-
- hygon_get_topology_early(c);
}
static void init_hygon(struct cpuinfo_x86 *c)
{
+ u64 vm_cr;
+
early_init_hygon(c);
/*
@@ -299,9 +179,6 @@ static void init_hygon(struct cpuinfo_x86 *c)
set_cpu_cap(c, X86_FEATURE_REP_GOOD);
- /* get apicid instead of initial apic id from cpuid */
- c->apicid = hard_smp_processor_id();
-
/*
* XXX someone from Hygon needs to confirm this DTRT
*
@@ -313,12 +190,18 @@ static void init_hygon(struct cpuinfo_x86 *c)
cpu_detect_cache_sizes(c);
- hygon_detect_cmp(c);
- hygon_get_topology(c);
srat_detect_node(c);
init_hygon_cacheinfo(c);
+ if (cpu_has(c, X86_FEATURE_SVM)) {
+ rdmsrq(MSR_VM_CR, vm_cr);
+ if (vm_cr & SVM_VM_CR_SVM_DIS_MASK) {
+ pr_notice_once("SVM disabled (by BIOS) in MSR_VM_CR\n");
+ clear_cpu_cap(c, X86_FEATURE_SVM);
+ }
+ }
+
if (cpu_has(c, X86_FEATURE_XMM2)) {
/*
* Use LFENCE for execution serialization. On families which
@@ -326,8 +209,8 @@ static void init_hygon(struct cpuinfo_x86 *c)
* msr_set_bit() uses the safe accessors, too, even if the MSR
* is not present.
*/
- msr_set_bit(MSR_F10H_DECFG,
- MSR_F10H_DECFG_LFENCE_SERIALIZE_BIT);
+ msr_set_bit(MSR_AMD64_DE_CFG,
+ MSR_AMD64_DE_CFG_LFENCE_SERIALIZE_BIT);
/* A serializing LFENCE stops RDTSC speculation */
set_cpu_cap(c, X86_FEATURE_LFENCE_RDTSC);
@@ -339,10 +222,13 @@ static void init_hygon(struct cpuinfo_x86 *c)
set_cpu_cap(c, X86_FEATURE_ARAT);
/* Hygon CPUs don't reset SS attributes on SYSRET, Xen does. */
- if (!cpu_has(c, X86_FEATURE_XENPV))
+ if (!cpu_feature_enabled(X86_FEATURE_XENPV))
set_cpu_bug(c, X86_BUG_SYSRET_SS_ATTRS);
check_null_seg_clears_base(c);
+
+ /* Hygon CPUs don't need fencing after x2APIC/TSC_DEADLINE MSR writes. */
+ clear_cpu_cap(c, X86_FEATURE_APIC_MSRS_FENCE);
}
static void cpu_detect_tlb_hygon(struct cpuinfo_x86 *c)
@@ -355,26 +241,26 @@ static void cpu_detect_tlb_hygon(struct cpuinfo_x86 *c)
cpuid(0x80000006, &eax, &ebx, &ecx, &edx);
- tlb_lld_4k[ENTRIES] = (ebx >> 16) & mask;
- tlb_lli_4k[ENTRIES] = ebx & mask;
+ tlb_lld_4k = (ebx >> 16) & mask;
+ tlb_lli_4k = ebx & mask;
/* Handle DTLB 2M and 4M sizes, fall back to L1 if L2 is disabled */
if (!((eax >> 16) & mask))
- tlb_lld_2m[ENTRIES] = (cpuid_eax(0x80000005) >> 16) & 0xff;
+ tlb_lld_2m = (cpuid_eax(0x80000005) >> 16) & 0xff;
else
- tlb_lld_2m[ENTRIES] = (eax >> 16) & mask;
+ tlb_lld_2m = (eax >> 16) & mask;
/* a 4M entry uses two 2M entries */
- tlb_lld_4m[ENTRIES] = tlb_lld_2m[ENTRIES] >> 1;
+ tlb_lld_4m = tlb_lld_2m >> 1;
/* Handle ITLB 2M and 4M sizes, fall back to L1 if L2 is disabled */
if (!(eax & mask)) {
cpuid(0x80000005, &eax, &ebx, &ecx, &edx);
- tlb_lli_2m[ENTRIES] = eax & 0xff;
+ tlb_lli_2m = eax & 0xff;
} else
- tlb_lli_2m[ENTRIES] = eax & mask;
+ tlb_lli_2m = eax & mask;
- tlb_lli_4m[ENTRIES] = tlb_lli_2m[ENTRIES] >> 1;
+ tlb_lli_4m = tlb_lli_2m >> 1;
}
static const struct cpu_dev hygon_cpu_dev = {
diff --git a/arch/x86/kernel/cpu/intel.c b/arch/x86/kernel/cpu/intel.c
index 663f6e6dd288..076eaa41b8c8 100644
--- a/arch/x86/kernel/cpu/intel.c
+++ b/arch/x86/kernel/cpu/intel.c
@@ -1,68 +1,33 @@
// SPDX-License-Identifier: GPL-2.0
-#include <linux/kernel.h>
-#include <linux/pgtable.h>
-#include <linux/string.h>
#include <linux/bitops.h>
-#include <linux/smp.h>
-#include <linux/sched.h>
-#include <linux/sched/clock.h>
-#include <linux/semaphore.h>
-#include <linux/thread_info.h>
#include <linux/init.h>
-#include <linux/uaccess.h>
-#include <linux/workqueue.h>
-#include <linux/delay.h>
-#include <linux/cpuhotplug.h>
+#include <linux/kernel.h>
+#include <linux/minmax.h>
+#include <linux/smp.h>
+#include <linux/string.h>
+#include <linux/types.h>
+
+#ifdef CONFIG_X86_64
+#include <linux/topology.h>
+#endif
-#include <asm/cpufeature.h>
-#include <asm/msr.h>
#include <asm/bugs.h>
+#include <asm/cpu_device_id.h>
+#include <asm/cpufeature.h>
#include <asm/cpu.h>
-#include <asm/intel-family.h>
-#include <asm/microcode_intel.h>
+#include <asm/cpuid/api.h>
#include <asm/hwcap2.h>
-#include <asm/elf.h>
-#include <asm/cpu_device_id.h>
-#include <asm/cmdline.h>
-#include <asm/traps.h>
-#include <asm/resctrl.h>
+#include <asm/intel-family.h>
+#include <asm/microcode.h>
+#include <asm/msr.h>
#include <asm/numa.h>
+#include <asm/resctrl.h>
#include <asm/thermal.h>
-
-#ifdef CONFIG_X86_64
-#include <linux/topology.h>
-#endif
+#include <asm/uaccess.h>
#include "cpu.h"
-#ifdef CONFIG_X86_LOCAL_APIC
-#include <asm/mpspec.h>
-#include <asm/apic.h>
-#endif
-
-enum split_lock_detect_state {
- sld_off = 0,
- sld_warn,
- sld_fatal,
- sld_ratelimit,
-};
-
-/*
- * Default to sld_off because most systems do not support split lock detection.
- * sld_state_setup() will switch this to sld_warn on systems that support
- * split lock/bus lock detect, unless there is a command line override.
- */
-static enum split_lock_detect_state sld_state __ro_after_init = sld_off;
-static u64 msr_test_ctrl_cache __ro_after_init;
-
-/*
- * With a name like MSR_TEST_CTL it should go without saying, but don't touch
- * MSR_TEST_CTL unless the CPU is one of the whitelisted models. Writing it
- * on CPUs that do not support SLD can cause fireworks, even when writing '0'.
- */
-static bool cpu_model_supports_sld __ro_after_init;
-
/*
* Processors which have self-snooping capability can handle conflicting
* memory type across CPUs by snooping its own cache. However, there exists
@@ -72,19 +37,19 @@ static bool cpu_model_supports_sld __ro_after_init;
*/
static void check_memory_type_self_snoop_errata(struct cpuinfo_x86 *c)
{
- switch (c->x86_model) {
- case INTEL_FAM6_CORE_YONAH:
- case INTEL_FAM6_CORE2_MEROM:
- case INTEL_FAM6_CORE2_MEROM_L:
- case INTEL_FAM6_CORE2_PENRYN:
- case INTEL_FAM6_CORE2_DUNNINGTON:
- case INTEL_FAM6_NEHALEM:
- case INTEL_FAM6_NEHALEM_G:
- case INTEL_FAM6_NEHALEM_EP:
- case INTEL_FAM6_NEHALEM_EX:
- case INTEL_FAM6_WESTMERE:
- case INTEL_FAM6_WESTMERE_EP:
- case INTEL_FAM6_SANDYBRIDGE:
+ switch (c->x86_vfm) {
+ case INTEL_CORE_YONAH:
+ case INTEL_CORE2_MEROM:
+ case INTEL_CORE2_MEROM_L:
+ case INTEL_CORE2_PENRYN:
+ case INTEL_CORE2_DUNNINGTON:
+ case INTEL_NEHALEM:
+ case INTEL_NEHALEM_G:
+ case INTEL_NEHALEM_EP:
+ case INTEL_NEHALEM_EX:
+ case INTEL_WESTMERE:
+ case INTEL_WESTMERE_EP:
+ case INTEL_SANDYBRIDGE:
setup_clear_cpu_cap(X86_FEATURE_SELFSNOOP);
}
}
@@ -106,9 +71,9 @@ static void probe_xeon_phi_r3mwait(struct cpuinfo_x86 *c)
*/
if (c->x86 != 6)
return;
- switch (c->x86_model) {
- case INTEL_FAM6_XEON_PHI_KNL:
- case INTEL_FAM6_XEON_PHI_KNM:
+ switch (c->x86_vfm) {
+ case INTEL_XEON_PHI_KNL:
+ case INTEL_XEON_PHI_KNM:
break;
default:
return;
@@ -134,32 +99,32 @@ static void probe_xeon_phi_r3mwait(struct cpuinfo_x86 *c)
* - Release note from 20180108 microcode release
*/
struct sku_microcode {
- u8 model;
+ u32 vfm;
u8 stepping;
u32 microcode;
};
static const struct sku_microcode spectre_bad_microcodes[] = {
- { INTEL_FAM6_KABYLAKE, 0x0B, 0x80 },
- { INTEL_FAM6_KABYLAKE, 0x0A, 0x80 },
- { INTEL_FAM6_KABYLAKE, 0x09, 0x80 },
- { INTEL_FAM6_KABYLAKE_L, 0x0A, 0x80 },
- { INTEL_FAM6_KABYLAKE_L, 0x09, 0x80 },
- { INTEL_FAM6_SKYLAKE_X, 0x03, 0x0100013e },
- { INTEL_FAM6_SKYLAKE_X, 0x04, 0x0200003c },
- { INTEL_FAM6_BROADWELL, 0x04, 0x28 },
- { INTEL_FAM6_BROADWELL_G, 0x01, 0x1b },
- { INTEL_FAM6_BROADWELL_D, 0x02, 0x14 },
- { INTEL_FAM6_BROADWELL_D, 0x03, 0x07000011 },
- { INTEL_FAM6_BROADWELL_X, 0x01, 0x0b000025 },
- { INTEL_FAM6_HASWELL_L, 0x01, 0x21 },
- { INTEL_FAM6_HASWELL_G, 0x01, 0x18 },
- { INTEL_FAM6_HASWELL, 0x03, 0x23 },
- { INTEL_FAM6_HASWELL_X, 0x02, 0x3b },
- { INTEL_FAM6_HASWELL_X, 0x04, 0x10 },
- { INTEL_FAM6_IVYBRIDGE_X, 0x04, 0x42a },
+ { INTEL_KABYLAKE, 0x0B, 0x80 },
+ { INTEL_KABYLAKE, 0x0A, 0x80 },
+ { INTEL_KABYLAKE, 0x09, 0x80 },
+ { INTEL_KABYLAKE_L, 0x0A, 0x80 },
+ { INTEL_KABYLAKE_L, 0x09, 0x80 },
+ { INTEL_SKYLAKE_X, 0x03, 0x0100013e },
+ { INTEL_SKYLAKE_X, 0x04, 0x0200003c },
+ { INTEL_BROADWELL, 0x04, 0x28 },
+ { INTEL_BROADWELL_G, 0x01, 0x1b },
+ { INTEL_BROADWELL_D, 0x02, 0x14 },
+ { INTEL_BROADWELL_D, 0x03, 0x07000011 },
+ { INTEL_BROADWELL_X, 0x01, 0x0b000025 },
+ { INTEL_HASWELL_L, 0x01, 0x21 },
+ { INTEL_HASWELL_G, 0x01, 0x18 },
+ { INTEL_HASWELL, 0x03, 0x23 },
+ { INTEL_HASWELL_X, 0x02, 0x3b },
+ { INTEL_HASWELL_X, 0x04, 0x10 },
+ { INTEL_IVYBRIDGE_X, 0x04, 0x42a },
/* Observed in the wild */
- { INTEL_FAM6_SANDYBRIDGE_X, 0x06, 0x61b },
- { INTEL_FAM6_SANDYBRIDGE_X, 0x07, 0x712 },
+ { INTEL_SANDYBRIDGE_X, 0x06, 0x61b },
+ { INTEL_SANDYBRIDGE_X, 0x07, 0x712 },
};
static bool bad_spectre_microcode(struct cpuinfo_x86 *c)
@@ -173,66 +138,71 @@ static bool bad_spectre_microcode(struct cpuinfo_x86 *c)
if (cpu_has(c, X86_FEATURE_HYPERVISOR))
return false;
- if (c->x86 != 6)
- return false;
-
for (i = 0; i < ARRAY_SIZE(spectre_bad_microcodes); i++) {
- if (c->x86_model == spectre_bad_microcodes[i].model &&
+ if (c->x86_vfm == spectre_bad_microcodes[i].vfm &&
c->x86_stepping == spectre_bad_microcodes[i].stepping)
return (c->microcode <= spectre_bad_microcodes[i].microcode);
}
return false;
}
-int intel_cpu_collect_info(struct ucode_cpu_info *uci)
-{
- unsigned int val[2];
- unsigned int family, model;
- struct cpu_signature csig = { 0 };
- unsigned int eax, ebx, ecx, edx;
+#define MSR_IA32_TME_ACTIVATE 0x982
+
+/* Helpers to access TME_ACTIVATE MSR */
+#define TME_ACTIVATE_LOCKED(x) (x & 0x1)
+#define TME_ACTIVATE_ENABLED(x) (x & 0x2)
- memset(uci, 0, sizeof(*uci));
+#define TME_ACTIVATE_KEYID_BITS(x) ((x >> 32) & 0xf) /* Bits 35:32 */
- eax = 0x00000001;
- ecx = 0;
- native_cpuid(&eax, &ebx, &ecx, &edx);
- csig.sig = eax;
+static void detect_tme_early(struct cpuinfo_x86 *c)
+{
+ u64 tme_activate;
+ int keyid_bits;
- family = x86_family(eax);
- model = x86_model(eax);
+ rdmsrq(MSR_IA32_TME_ACTIVATE, tme_activate);
- if (model >= 5 || family > 6) {
- /* get processor flags from MSR 0x17 */
- native_rdmsr(MSR_IA32_PLATFORM_ID, val[0], val[1]);
- csig.pf = 1 << ((val[1] >> 18) & 7);
+ if (!TME_ACTIVATE_LOCKED(tme_activate) || !TME_ACTIVATE_ENABLED(tme_activate)) {
+ pr_info_once("x86/tme: not enabled by BIOS\n");
+ clear_cpu_cap(c, X86_FEATURE_TME);
+ return;
}
+ pr_info_once("x86/tme: enabled by BIOS\n");
+ keyid_bits = TME_ACTIVATE_KEYID_BITS(tme_activate);
+ if (!keyid_bits)
+ return;
- csig.rev = intel_get_microcode_revision();
+ /*
+ * KeyID bits are set by BIOS and can be present regardless
+ * of whether the kernel is using them. They effectively lower
+ * the number of physical address bits.
+ *
+ * Update cpuinfo_x86::x86_phys_bits accordingly.
+ */
+ c->x86_phys_bits -= keyid_bits;
+ pr_info_once("x86/mktme: BIOS enabled: x86_phys_bits reduced by %d\n",
+ keyid_bits);
+}
+
+void intel_unlock_cpuid_leafs(struct cpuinfo_x86 *c)
+{
+ if (boot_cpu_data.x86_vendor != X86_VENDOR_INTEL)
+ return;
- uci->cpu_sig = csig;
- uci->valid = 1;
+ if (c->x86_vfm < INTEL_PENTIUM_M_DOTHAN)
+ return;
- return 0;
+ /*
+ * The BIOS can have limited CPUID to leaf 2, which breaks feature
+ * enumeration. Unlock it and update the maximum leaf info.
+ */
+ if (msr_clear_bit(MSR_IA32_MISC_ENABLE, MSR_IA32_MISC_ENABLE_LIMIT_CPUID_BIT) > 0)
+ c->cpuid_level = cpuid_eax(0);
}
-EXPORT_SYMBOL_GPL(intel_cpu_collect_info);
static void early_init_intel(struct cpuinfo_x86 *c)
{
u64 misc_enable;
- /* Unmask CPUID levels if masked: */
- if (c->x86 > 6 || (c->x86 == 6 && c->x86_model >= 0xd)) {
- if (msr_clear_bit(MSR_IA32_MISC_ENABLE,
- MSR_IA32_MISC_ENABLE_LIMIT_CPUID_BIT) > 0) {
- c->cpuid_level = cpuid_eax(0);
- get_cpu_cap(c);
- }
- }
-
- if ((c->x86 == 0xf && c->x86_model >= 0x03) ||
- (c->x86 == 0x6 && c->x86_model >= 0x0e))
- set_cpu_cap(c, X86_FEATURE_CONSTANT_TSC);
-
if (c->x86 >= 6 && !cpu_has(c, X86_FEATURE_IA64))
c->microcode = intel_get_microcode_revision();
@@ -260,7 +230,7 @@ static void early_init_intel(struct cpuinfo_x86 *c)
* need the microcode to have already been loaded... so if it is
* not, recommend a BIOS update and disable large pages.
*/
- if (c->x86 == 6 && c->x86_model == 0x1c && c->x86_stepping <= 2 &&
+ if (c->x86_vfm == INTEL_ATOM_BONNELL && c->x86_stepping <= 2 &&
c->microcode < 0x20e) {
pr_warn("Atom PSE erratum detected, BIOS microcode update recommended\n");
clear_cpu_cap(c, X86_FEATURE_PSE);
@@ -275,8 +245,8 @@ static void early_init_intel(struct cpuinfo_x86 *c)
#endif
/* CPUID workaround for 0F33/0F34 CPU */
- if (c->x86 == 0xF && c->x86_model == 0x3
- && (c->x86_stepping == 0x3 || c->x86_stepping == 0x4))
+ if (c->x86_vfm == INTEL_P4_PRESCOTT &&
+ (c->x86_stepping == 0x3 || c->x86_stepping == 0x4))
c->x86_phys_bits = 36;
/*
@@ -285,46 +255,57 @@ static void early_init_intel(struct cpuinfo_x86 *c)
*
* It is also reliable across cores and sockets. (but not across
* cabinets - we turn it off in that case explicitly.)
+ *
+ * Use a model-specific check for some older CPUs that have invariant
+ * TSC but may not report it architecturally via 8000_0007.
*/
if (c->x86_power & (1 << 8)) {
set_cpu_cap(c, X86_FEATURE_CONSTANT_TSC);
set_cpu_cap(c, X86_FEATURE_NONSTOP_TSC);
+ } else if ((c->x86_vfm >= INTEL_P4_PRESCOTT && c->x86_vfm <= INTEL_P4_WILLAMETTE) ||
+ (c->x86_vfm >= INTEL_CORE_YONAH && c->x86_vfm <= INTEL_IVYBRIDGE)) {
+ set_cpu_cap(c, X86_FEATURE_CONSTANT_TSC);
}
/* Penwell and Cloverview have the TSC which doesn't sleep on S3 */
- if (c->x86 == 6) {
- switch (c->x86_model) {
- case INTEL_FAM6_ATOM_SALTWELL_MID:
- case INTEL_FAM6_ATOM_SALTWELL_TABLET:
- case INTEL_FAM6_ATOM_SILVERMONT_MID:
- case INTEL_FAM6_ATOM_AIRMONT_NP:
- set_cpu_cap(c, X86_FEATURE_NONSTOP_TSC_S3);
- break;
- default:
- break;
- }
+ switch (c->x86_vfm) {
+ case INTEL_ATOM_SALTWELL_MID:
+ case INTEL_ATOM_SALTWELL_TABLET:
+ case INTEL_ATOM_SILVERMONT_MID:
+ case INTEL_ATOM_AIRMONT_NP:
+ set_cpu_cap(c, X86_FEATURE_NONSTOP_TSC_S3);
+ break;
}
/*
- * There is a known erratum on Pentium III and Core Solo
- * and Core Duo CPUs.
- * " Page with PAT set to WC while associated MTRR is UC
- * may consolidate to UC "
- * Because of this erratum, it is better to stick with
- * setting WC in MTRR rather than using PAT on these CPUs.
+ * PAT is broken on early family 6 CPUs, the last of which
+ * is "Yonah" where the erratum is named "AN7":
+ *
+ * Page with PAT (Page Attribute Table) Set to USWC
+ * (Uncacheable Speculative Write Combine) While
+ * Associated MTRR (Memory Type Range Register) Is UC
+ * (Uncacheable) May Consolidate to UC
*
- * Enable PAT WC only on P4, Core 2 or later CPUs.
+ * Disable PAT and fall back to MTRR on these CPUs.
*/
- if (c->x86 == 6 && c->x86_model < 15)
+ if (c->x86_vfm >= INTEL_PENTIUM_PRO &&
+ c->x86_vfm <= INTEL_CORE_YONAH)
clear_cpu_cap(c, X86_FEATURE_PAT);
/*
- * If fast string is not enabled in IA32_MISC_ENABLE for any reason,
- * clear the fast string and enhanced fast string CPU capabilities.
+ * Modern CPUs are generally expected to have a sane fast string
+ * implementation. However, BIOSes typically have a knob to tweak
+ * the architectural MISC_ENABLE.FAST_STRING enable bit.
+ *
+ * Adhere to the preference and program the Linux-defined fast
+ * string flag and enhanced fast string capabilities accordingly.
*/
- if (c->x86 > 6 || (c->x86 == 6 && c->x86_model >= 0xd)) {
- rdmsrl(MSR_IA32_MISC_ENABLE, misc_enable);
- if (!(misc_enable & MSR_IA32_MISC_ENABLE_FAST_STRING)) {
+ if (c->x86_vfm >= INTEL_PENTIUM_M_DOTHAN) {
+ rdmsrq(MSR_IA32_MISC_ENABLE, misc_enable);
+ if (misc_enable & MSR_IA32_MISC_ENABLE_FAST_STRING) {
+ /* X86_FEATURE_ERMS is set based on CPUID */
+ set_cpu_cap(c, X86_FEATURE_REP_GOOD);
+ } else {
pr_info("Disabled fast string operations\n");
setup_clear_cpu_cap(X86_FEATURE_REP_GOOD);
setup_clear_cpu_cap(X86_FEATURE_ERMS);
@@ -341,32 +322,19 @@ static void early_init_intel(struct cpuinfo_x86 *c)
* should be false so that __flush_tlb_all() causes CR3 instead of CR4.PGE
* to be modified.
*/
- if (c->x86 == 5 && c->x86_model == 9) {
+ if (c->x86_vfm == INTEL_QUARK_X1000) {
pr_info("Disabling PGE capability bit\n");
setup_clear_cpu_cap(X86_FEATURE_PGE);
}
- if (c->cpuid_level >= 0x00000001) {
- u32 eax, ebx, ecx, edx;
-
- cpuid(0x00000001, &eax, &ebx, &ecx, &edx);
- /*
- * If HTT (EDX[28]) is set EBX[16:23] contain the number of
- * apicids which are reserved per package. Store the resulting
- * shift value for the package management code.
- */
- if (edx & (1U << 28))
- c->x86_coreid_bits = get_count_order((ebx >> 16) & 0xff);
- }
-
check_memory_type_self_snoop_errata(c);
/*
- * Get the number of SMT siblings early from the extended topology
- * leaf, if available. Otherwise try the legacy SMT detection.
+ * Adjust the number of physical bits early because it affects the
+ * valid bits of the MTRR mask registers.
*/
- if (detect_extended_topology_early(c) < 0)
- detect_ht_early(c);
+ if (cpu_has(c, X86_FEATURE_TME))
+ detect_tme_early(c);
}
static void bsp_init_intel(struct cpuinfo_x86 *c)
@@ -384,9 +352,7 @@ static void bsp_init_intel(struct cpuinfo_x86 *c)
int ppro_with_ram_bug(void)
{
/* Uses data from early_cpu_detect now */
- if (boot_cpu_data.x86_vendor == X86_VENDOR_INTEL &&
- boot_cpu_data.x86 == 6 &&
- boot_cpu_data.x86_model == 1 &&
+ if (boot_cpu_data.x86_vfm == INTEL_PENTIUM_PRO &&
boot_cpu_data.x86_stepping < 8) {
pr_info("Pentium Pro with Errata#50 detected. Taking evasive action.\n");
return 1;
@@ -403,9 +369,8 @@ static void intel_smp_check(struct cpuinfo_x86 *c)
/*
* Mask B, Pentium, but not Pentium MMX
*/
- if (c->x86 == 5 &&
- c->x86_stepping >= 1 && c->x86_stepping <= 4 &&
- c->x86_model <= 3) {
+ if (c->x86_vfm >= INTEL_FAM5_START && c->x86_vfm < INTEL_PENTIUM_MMX &&
+ c->x86_stepping >= 1 && c->x86_stepping <= 4) {
/*
* Remember we have B step Pentia with bugs
*/
@@ -432,7 +397,7 @@ static void intel_workarounds(struct cpuinfo_x86 *c)
* The Quark is also family 5, but does not have the same bug.
*/
clear_cpu_bug(c, X86_BUG_F00F);
- if (c->x86 == 5 && c->x86_model < 9) {
+ if (c->x86_vfm >= INTEL_FAM5_START && c->x86_vfm < INTEL_QUARK_X1000) {
static int f00f_workaround_enabled;
set_cpu_bug(c, X86_BUG_F00F);
@@ -447,7 +412,8 @@ static void intel_workarounds(struct cpuinfo_x86 *c)
* SEP CPUID bug: Pentium Pro reports SEP but doesn't have it until
* model 3 mask 3
*/
- if ((c->x86<<8 | c->x86_model<<4 | c->x86_stepping) < 0x633)
+ if ((c->x86_vfm == INTEL_PENTIUM_II_KLAMATH && c->x86_stepping < 3) ||
+ c->x86_vfm < INTEL_PENTIUM_II_KLAMATH)
clear_cpu_cap(c, X86_FEATURE_SEP);
/*
@@ -465,7 +431,7 @@ static void intel_workarounds(struct cpuinfo_x86 *c)
* P4 Xeon erratum 037 workaround.
* Hardware prefetcher may cause stale data to be loaded into the cache.
*/
- if ((c->x86 == 15) && (c->x86_model == 1) && (c->x86_stepping == 1)) {
+ if (c->x86_vfm == INTEL_P4_WILLAMETTE && c->x86_stepping == 1) {
if (msr_set_bit(MSR_IA32_MISC_ENABLE,
MSR_IA32_MISC_ENABLE_PREFETCH_DISABLE_BIT) > 0) {
pr_info("CPU: C0 stepping P4 Xeon detected.\n");
@@ -479,27 +445,20 @@ static void intel_workarounds(struct cpuinfo_x86 *c)
* integrated APIC (see 11AP erratum in "Pentium Processor
* Specification Update").
*/
- if (boot_cpu_has(X86_FEATURE_APIC) && (c->x86<<8 | c->x86_model<<4) == 0x520 &&
+ if (boot_cpu_has(X86_FEATURE_APIC) && c->x86_vfm == INTEL_PENTIUM_75 &&
(c->x86_stepping < 0x6 || c->x86_stepping == 0xb))
set_cpu_bug(c, X86_BUG_11AP);
-
#ifdef CONFIG_X86_INTEL_USERCOPY
/*
- * Set up the preferred alignment for movsl bulk memory moves
+ * MOVSL bulk memory moves can be slow when source and dest are not
+ * both 8-byte aligned. PII/PIII only like MOVSL with 8-byte alignment.
+ *
+ * Set the preferred alignment for Pentium Pro and newer processors, as
+ * it has only been tested on these.
*/
- switch (c->x86) {
- case 4: /* 486: untested */
- break;
- case 5: /* Old Pentia: untested */
- break;
- case 6: /* PII/PIII only like movsl with 8-byte alignment */
- movsl_mask.mask = 7;
- break;
- case 15: /* P4 is OK down to 8-byte alignment */
+ if (c->x86_vfm >= INTEL_PENTIUM_PRO)
movsl_mask.mask = 7;
- break;
- }
#endif
intel_smp_check(c);
@@ -527,95 +486,11 @@ static void srat_detect_node(struct cpuinfo_x86 *c)
#endif
}
-#define MSR_IA32_TME_ACTIVATE 0x982
-
-/* Helpers to access TME_ACTIVATE MSR */
-#define TME_ACTIVATE_LOCKED(x) (x & 0x1)
-#define TME_ACTIVATE_ENABLED(x) (x & 0x2)
-
-#define TME_ACTIVATE_POLICY(x) ((x >> 4) & 0xf) /* Bits 7:4 */
-#define TME_ACTIVATE_POLICY_AES_XTS_128 0
-
-#define TME_ACTIVATE_KEYID_BITS(x) ((x >> 32) & 0xf) /* Bits 35:32 */
-
-#define TME_ACTIVATE_CRYPTO_ALGS(x) ((x >> 48) & 0xffff) /* Bits 63:48 */
-#define TME_ACTIVATE_CRYPTO_AES_XTS_128 1
-
-/* Values for mktme_status (SW only construct) */
-#define MKTME_ENABLED 0
-#define MKTME_DISABLED 1
-#define MKTME_UNINITIALIZED 2
-static int mktme_status = MKTME_UNINITIALIZED;
-
-static void detect_tme(struct cpuinfo_x86 *c)
-{
- u64 tme_activate, tme_policy, tme_crypto_algs;
- int keyid_bits = 0, nr_keyids = 0;
- static u64 tme_activate_cpu0 = 0;
-
- rdmsrl(MSR_IA32_TME_ACTIVATE, tme_activate);
-
- if (mktme_status != MKTME_UNINITIALIZED) {
- if (tme_activate != tme_activate_cpu0) {
- /* Broken BIOS? */
- pr_err_once("x86/tme: configuration is inconsistent between CPUs\n");
- pr_err_once("x86/tme: MKTME is not usable\n");
- mktme_status = MKTME_DISABLED;
-
- /* Proceed. We may need to exclude bits from x86_phys_bits. */
- }
- } else {
- tme_activate_cpu0 = tme_activate;
- }
-
- if (!TME_ACTIVATE_LOCKED(tme_activate) || !TME_ACTIVATE_ENABLED(tme_activate)) {
- pr_info_once("x86/tme: not enabled by BIOS\n");
- mktme_status = MKTME_DISABLED;
- return;
- }
-
- if (mktme_status != MKTME_UNINITIALIZED)
- goto detect_keyid_bits;
-
- pr_info("x86/tme: enabled by BIOS\n");
-
- tme_policy = TME_ACTIVATE_POLICY(tme_activate);
- if (tme_policy != TME_ACTIVATE_POLICY_AES_XTS_128)
- pr_warn("x86/tme: Unknown policy is active: %#llx\n", tme_policy);
-
- tme_crypto_algs = TME_ACTIVATE_CRYPTO_ALGS(tme_activate);
- if (!(tme_crypto_algs & TME_ACTIVATE_CRYPTO_AES_XTS_128)) {
- pr_err("x86/mktme: No known encryption algorithm is supported: %#llx\n",
- tme_crypto_algs);
- mktme_status = MKTME_DISABLED;
- }
-detect_keyid_bits:
- keyid_bits = TME_ACTIVATE_KEYID_BITS(tme_activate);
- nr_keyids = (1UL << keyid_bits) - 1;
- if (nr_keyids) {
- pr_info_once("x86/mktme: enabled by BIOS\n");
- pr_info_once("x86/mktme: %d KeyIDs available\n", nr_keyids);
- } else {
- pr_info_once("x86/mktme: disabled by BIOS\n");
- }
-
- if (mktme_status == MKTME_UNINITIALIZED) {
- /* MKTME is usable */
- mktme_status = MKTME_ENABLED;
- }
-
- /*
- * KeyID bits effectively lower the number of physical address
- * bits. Update cpuinfo_x86::x86_phys_bits accordingly.
- */
- c->x86_phys_bits -= keyid_bits;
-}
-
static void init_cpuid_fault(struct cpuinfo_x86 *c)
{
u64 msr;
- if (!rdmsrl_safe(MSR_PLATFORM_INFO, &msr)) {
+ if (!rdmsrq_safe(MSR_PLATFORM_INFO, &msr)) {
if (msr & MSR_PLATFORM_INFO_CPUID_FAULT)
set_cpu_cap(c, X86_FEATURE_CPUID_FAULT);
}
@@ -625,7 +500,7 @@ static void init_intel_misc_features(struct cpuinfo_x86 *c)
{
u64 msr;
- if (rdmsrl_safe(MSR_MISC_FEATURES_ENABLES, &msr))
+ if (rdmsrq_safe(MSR_MISC_FEATURES_ENABLES, &msr))
return;
/* Clear all MISC features */
@@ -636,11 +511,27 @@ static void init_intel_misc_features(struct cpuinfo_x86 *c)
probe_xeon_phi_r3mwait(c);
msr = this_cpu_read(msr_misc_features_shadow);
- wrmsrl(MSR_MISC_FEATURES_ENABLES, msr);
+ wrmsrq(MSR_MISC_FEATURES_ENABLES, msr);
}
-static void split_lock_init(void);
-static void bus_lock_init(void);
+/*
+ * This is a list of Intel CPUs that are known to suffer from downclocking when
+ * ZMM registers (512-bit vectors) are used. On these CPUs, when the kernel
+ * executes SIMD-optimized code such as cryptography functions or CRCs, it
+ * should prefer 256-bit (YMM) code to 512-bit (ZMM) code.
+ */
+static const struct x86_cpu_id zmm_exclusion_list[] = {
+ X86_MATCH_VFM(INTEL_SKYLAKE_X, 0),
+ X86_MATCH_VFM(INTEL_ICELAKE_X, 0),
+ X86_MATCH_VFM(INTEL_ICELAKE_D, 0),
+ X86_MATCH_VFM(INTEL_ICELAKE, 0),
+ X86_MATCH_VFM(INTEL_ICELAKE_L, 0),
+ X86_MATCH_VFM(INTEL_ICELAKE_NNPI, 0),
+ X86_MATCH_VFM(INTEL_TIGERLAKE_L, 0),
+ X86_MATCH_VFM(INTEL_TIGERLAKE, 0),
+ /* Allow Rocket Lake and later, and Sapphire Rapids and later. */
+ {},
+};
static void init_intel(struct cpuinfo_x86 *c)
{
@@ -648,24 +539,6 @@ static void init_intel(struct cpuinfo_x86 *c)
intel_workarounds(c);
- /*
- * Detect the extended topology information if available. This
- * will reinitialise the initial_apicid which will be used
- * in init_intel_cacheinfo()
- */
- detect_extended_topology(c);
-
- if (!cpu_has(c, X86_FEATURE_XTOPOLOGY)) {
- /*
- * let's use the legacy cpuid vector 0x1 and 0x4 for topology
- * detection.
- */
- detect_num_cpu_cores(c);
-#ifdef CONFIG_X86_32
- detect_ht(c);
-#endif
- }
-
init_intel_cacheinfo(c);
if (c->cpuid_level > 9) {
@@ -688,19 +561,20 @@ static void init_intel(struct cpuinfo_x86 *c)
set_cpu_cap(c, X86_FEATURE_PEBS);
}
- if (c->x86 == 6 && boot_cpu_has(X86_FEATURE_CLFLUSH) &&
- (c->x86_model == 29 || c->x86_model == 46 || c->x86_model == 47))
+ if (boot_cpu_has(X86_FEATURE_CLFLUSH) &&
+ (c->x86_vfm == INTEL_CORE2_DUNNINGTON ||
+ c->x86_vfm == INTEL_NEHALEM_EX ||
+ c->x86_vfm == INTEL_WESTMERE_EX))
set_cpu_bug(c, X86_BUG_CLFLUSH_MONITOR);
- if (c->x86 == 6 && boot_cpu_has(X86_FEATURE_MWAIT) &&
- ((c->x86_model == INTEL_FAM6_ATOM_GOLDMONT)))
+ if (boot_cpu_has(X86_FEATURE_MWAIT) &&
+ (c->x86_vfm == INTEL_ATOM_GOLDMONT ||
+ c->x86_vfm == INTEL_LUNARLAKE_M))
set_cpu_bug(c, X86_BUG_MONITOR);
#ifdef CONFIG_X86_64
if (c->x86 == 15)
c->x86_cache_alignment = c->x86_clflush_size * 2;
- if (c->x86 == 6)
- set_cpu_cap(c, X86_FEATURE_REP_GOOD);
#else
/*
* Names for the Pentium II/Celeron processors
@@ -735,25 +609,19 @@ static void init_intel(struct cpuinfo_x86 *c)
if (p)
strcpy(c->x86_model_id, p);
}
-
- if (c->x86 == 15)
- set_cpu_cap(c, X86_FEATURE_P4);
- if (c->x86 == 6)
- set_cpu_cap(c, X86_FEATURE_P3);
#endif
+ if (x86_match_cpu(zmm_exclusion_list))
+ set_cpu_cap(c, X86_FEATURE_PREFER_YMM);
+
/* Work around errata */
srat_detect_node(c);
init_ia32_feat_ctl(c);
- if (cpu_has(c, X86_FEATURE_TME))
- detect_tme(c);
-
init_intel_misc_features(c);
split_lock_init();
- bus_lock_init();
intel_init_thermal(c);
}
@@ -767,191 +635,90 @@ static unsigned int intel_size_cache(struct cpuinfo_x86 *c, unsigned int size)
* to determine which, so we use a boottime override
* for the 512kb model, and assume 256 otherwise.
*/
- if ((c->x86 == 6) && (c->x86_model == 11) && (size == 0))
+ if (c->x86_vfm == INTEL_PENTIUM_III_TUALATIN && size == 0)
size = 256;
/*
* Intel Quark SoC X1000 contains a 4-way set associative
* 16K cache with a 16 byte cache line and 256 lines per tag
*/
- if ((c->x86 == 5) && (c->x86_model == 9))
+ if (c->x86_vfm == INTEL_QUARK_X1000)
size = 16;
return size;
}
#endif
-#define TLB_INST_4K 0x01
-#define TLB_INST_4M 0x02
-#define TLB_INST_2M_4M 0x03
-
-#define TLB_INST_ALL 0x05
-#define TLB_INST_1G 0x06
-
-#define TLB_DATA_4K 0x11
-#define TLB_DATA_4M 0x12
-#define TLB_DATA_2M_4M 0x13
-#define TLB_DATA_4K_4M 0x14
-
-#define TLB_DATA_1G 0x16
-
-#define TLB_DATA0_4K 0x21
-#define TLB_DATA0_4M 0x22
-#define TLB_DATA0_2M_4M 0x23
-
-#define STLB_4K 0x41
-#define STLB_4K_2M 0x42
-
-static const struct _tlb_table intel_tlb_table[] = {
- { 0x01, TLB_INST_4K, 32, " TLB_INST 4 KByte pages, 4-way set associative" },
- { 0x02, TLB_INST_4M, 2, " TLB_INST 4 MByte pages, full associative" },
- { 0x03, TLB_DATA_4K, 64, " TLB_DATA 4 KByte pages, 4-way set associative" },
- { 0x04, TLB_DATA_4M, 8, " TLB_DATA 4 MByte pages, 4-way set associative" },
- { 0x05, TLB_DATA_4M, 32, " TLB_DATA 4 MByte pages, 4-way set associative" },
- { 0x0b, TLB_INST_4M, 4, " TLB_INST 4 MByte pages, 4-way set associative" },
- { 0x4f, TLB_INST_4K, 32, " TLB_INST 4 KByte pages" },
- { 0x50, TLB_INST_ALL, 64, " TLB_INST 4 KByte and 2-MByte or 4-MByte pages" },
- { 0x51, TLB_INST_ALL, 128, " TLB_INST 4 KByte and 2-MByte or 4-MByte pages" },
- { 0x52, TLB_INST_ALL, 256, " TLB_INST 4 KByte and 2-MByte or 4-MByte pages" },
- { 0x55, TLB_INST_2M_4M, 7, " TLB_INST 2-MByte or 4-MByte pages, fully associative" },
- { 0x56, TLB_DATA0_4M, 16, " TLB_DATA0 4 MByte pages, 4-way set associative" },
- { 0x57, TLB_DATA0_4K, 16, " TLB_DATA0 4 KByte pages, 4-way associative" },
- { 0x59, TLB_DATA0_4K, 16, " TLB_DATA0 4 KByte pages, fully associative" },
- { 0x5a, TLB_DATA0_2M_4M, 32, " TLB_DATA0 2-MByte or 4 MByte pages, 4-way set associative" },
- { 0x5b, TLB_DATA_4K_4M, 64, " TLB_DATA 4 KByte and 4 MByte pages" },
- { 0x5c, TLB_DATA_4K_4M, 128, " TLB_DATA 4 KByte and 4 MByte pages" },
- { 0x5d, TLB_DATA_4K_4M, 256, " TLB_DATA 4 KByte and 4 MByte pages" },
- { 0x61, TLB_INST_4K, 48, " TLB_INST 4 KByte pages, full associative" },
- { 0x63, TLB_DATA_1G, 4, " TLB_DATA 1 GByte pages, 4-way set associative" },
- { 0x6b, TLB_DATA_4K, 256, " TLB_DATA 4 KByte pages, 8-way associative" },
- { 0x6c, TLB_DATA_2M_4M, 128, " TLB_DATA 2 MByte or 4 MByte pages, 8-way associative" },
- { 0x6d, TLB_DATA_1G, 16, " TLB_DATA 1 GByte pages, fully associative" },
- { 0x76, TLB_INST_2M_4M, 8, " TLB_INST 2-MByte or 4-MByte pages, fully associative" },
- { 0xb0, TLB_INST_4K, 128, " TLB_INST 4 KByte pages, 4-way set associative" },
- { 0xb1, TLB_INST_2M_4M, 4, " TLB_INST 2M pages, 4-way, 8 entries or 4M pages, 4-way entries" },
- { 0xb2, TLB_INST_4K, 64, " TLB_INST 4KByte pages, 4-way set associative" },
- { 0xb3, TLB_DATA_4K, 128, " TLB_DATA 4 KByte pages, 4-way set associative" },
- { 0xb4, TLB_DATA_4K, 256, " TLB_DATA 4 KByte pages, 4-way associative" },
- { 0xb5, TLB_INST_4K, 64, " TLB_INST 4 KByte pages, 8-way set associative" },
- { 0xb6, TLB_INST_4K, 128, " TLB_INST 4 KByte pages, 8-way set associative" },
- { 0xba, TLB_DATA_4K, 64, " TLB_DATA 4 KByte pages, 4-way associative" },
- { 0xc0, TLB_DATA_4K_4M, 8, " TLB_DATA 4 KByte and 4 MByte pages, 4-way associative" },
- { 0xc1, STLB_4K_2M, 1024, " STLB 4 KByte and 2 MByte pages, 8-way associative" },
- { 0xc2, TLB_DATA_2M_4M, 16, " TLB_DATA 2 MByte/4MByte pages, 4-way associative" },
- { 0xca, STLB_4K, 512, " STLB 4 KByte pages, 4-way associative" },
- { 0x00, 0, 0 }
-};
-
-static void intel_tlb_lookup(const unsigned char desc)
+static void intel_tlb_lookup(const struct leaf_0x2_table *desc)
{
- unsigned char k;
- if (desc == 0)
- return;
-
- /* look up this descriptor in the table */
- for (k = 0; intel_tlb_table[k].descriptor != desc &&
- intel_tlb_table[k].descriptor != 0; k++)
- ;
-
- if (intel_tlb_table[k].tlb_type == 0)
- return;
+ short entries = desc->entries;
- switch (intel_tlb_table[k].tlb_type) {
+ switch (desc->t_type) {
case STLB_4K:
- if (tlb_lli_4k[ENTRIES] < intel_tlb_table[k].entries)
- tlb_lli_4k[ENTRIES] = intel_tlb_table[k].entries;
- if (tlb_lld_4k[ENTRIES] < intel_tlb_table[k].entries)
- tlb_lld_4k[ENTRIES] = intel_tlb_table[k].entries;
+ tlb_lli_4k = max(tlb_lli_4k, entries);
+ tlb_lld_4k = max(tlb_lld_4k, entries);
break;
case STLB_4K_2M:
- if (tlb_lli_4k[ENTRIES] < intel_tlb_table[k].entries)
- tlb_lli_4k[ENTRIES] = intel_tlb_table[k].entries;
- if (tlb_lld_4k[ENTRIES] < intel_tlb_table[k].entries)
- tlb_lld_4k[ENTRIES] = intel_tlb_table[k].entries;
- if (tlb_lli_2m[ENTRIES] < intel_tlb_table[k].entries)
- tlb_lli_2m[ENTRIES] = intel_tlb_table[k].entries;
- if (tlb_lld_2m[ENTRIES] < intel_tlb_table[k].entries)
- tlb_lld_2m[ENTRIES] = intel_tlb_table[k].entries;
- if (tlb_lli_4m[ENTRIES] < intel_tlb_table[k].entries)
- tlb_lli_4m[ENTRIES] = intel_tlb_table[k].entries;
- if (tlb_lld_4m[ENTRIES] < intel_tlb_table[k].entries)
- tlb_lld_4m[ENTRIES] = intel_tlb_table[k].entries;
+ tlb_lli_4k = max(tlb_lli_4k, entries);
+ tlb_lld_4k = max(tlb_lld_4k, entries);
+ tlb_lli_2m = max(tlb_lli_2m, entries);
+ tlb_lld_2m = max(tlb_lld_2m, entries);
+ tlb_lli_4m = max(tlb_lli_4m, entries);
+ tlb_lld_4m = max(tlb_lld_4m, entries);
break;
case TLB_INST_ALL:
- if (tlb_lli_4k[ENTRIES] < intel_tlb_table[k].entries)
- tlb_lli_4k[ENTRIES] = intel_tlb_table[k].entries;
- if (tlb_lli_2m[ENTRIES] < intel_tlb_table[k].entries)
- tlb_lli_2m[ENTRIES] = intel_tlb_table[k].entries;
- if (tlb_lli_4m[ENTRIES] < intel_tlb_table[k].entries)
- tlb_lli_4m[ENTRIES] = intel_tlb_table[k].entries;
+ tlb_lli_4k = max(tlb_lli_4k, entries);
+ tlb_lli_2m = max(tlb_lli_2m, entries);
+ tlb_lli_4m = max(tlb_lli_4m, entries);
break;
case TLB_INST_4K:
- if (tlb_lli_4k[ENTRIES] < intel_tlb_table[k].entries)
- tlb_lli_4k[ENTRIES] = intel_tlb_table[k].entries;
+ tlb_lli_4k = max(tlb_lli_4k, entries);
break;
case TLB_INST_4M:
- if (tlb_lli_4m[ENTRIES] < intel_tlb_table[k].entries)
- tlb_lli_4m[ENTRIES] = intel_tlb_table[k].entries;
+ tlb_lli_4m = max(tlb_lli_4m, entries);
break;
case TLB_INST_2M_4M:
- if (tlb_lli_2m[ENTRIES] < intel_tlb_table[k].entries)
- tlb_lli_2m[ENTRIES] = intel_tlb_table[k].entries;
- if (tlb_lli_4m[ENTRIES] < intel_tlb_table[k].entries)
- tlb_lli_4m[ENTRIES] = intel_tlb_table[k].entries;
+ tlb_lli_2m = max(tlb_lli_2m, entries);
+ tlb_lli_4m = max(tlb_lli_4m, entries);
break;
case TLB_DATA_4K:
case TLB_DATA0_4K:
- if (tlb_lld_4k[ENTRIES] < intel_tlb_table[k].entries)
- tlb_lld_4k[ENTRIES] = intel_tlb_table[k].entries;
+ tlb_lld_4k = max(tlb_lld_4k, entries);
break;
case TLB_DATA_4M:
case TLB_DATA0_4M:
- if (tlb_lld_4m[ENTRIES] < intel_tlb_table[k].entries)
- tlb_lld_4m[ENTRIES] = intel_tlb_table[k].entries;
+ tlb_lld_4m = max(tlb_lld_4m, entries);
break;
case TLB_DATA_2M_4M:
case TLB_DATA0_2M_4M:
- if (tlb_lld_2m[ENTRIES] < intel_tlb_table[k].entries)
- tlb_lld_2m[ENTRIES] = intel_tlb_table[k].entries;
- if (tlb_lld_4m[ENTRIES] < intel_tlb_table[k].entries)
- tlb_lld_4m[ENTRIES] = intel_tlb_table[k].entries;
+ tlb_lld_2m = max(tlb_lld_2m, entries);
+ tlb_lld_4m = max(tlb_lld_4m, entries);
break;
case TLB_DATA_4K_4M:
- if (tlb_lld_4k[ENTRIES] < intel_tlb_table[k].entries)
- tlb_lld_4k[ENTRIES] = intel_tlb_table[k].entries;
- if (tlb_lld_4m[ENTRIES] < intel_tlb_table[k].entries)
- tlb_lld_4m[ENTRIES] = intel_tlb_table[k].entries;
+ tlb_lld_4k = max(tlb_lld_4k, entries);
+ tlb_lld_4m = max(tlb_lld_4m, entries);
break;
+ case TLB_DATA_1G_2M_4M:
+ tlb_lld_2m = max(tlb_lld_2m, TLB_0x63_2M_4M_ENTRIES);
+ tlb_lld_4m = max(tlb_lld_4m, TLB_0x63_2M_4M_ENTRIES);
+ fallthrough;
case TLB_DATA_1G:
- if (tlb_lld_1g[ENTRIES] < intel_tlb_table[k].entries)
- tlb_lld_1g[ENTRIES] = intel_tlb_table[k].entries;
+ tlb_lld_1g = max(tlb_lld_1g, entries);
break;
}
}
static void intel_detect_tlb(struct cpuinfo_x86 *c)
{
- int i, j, n;
- unsigned int regs[4];
- unsigned char *desc = (unsigned char *)regs;
+ const struct leaf_0x2_table *desc;
+ union leaf_0x2_regs regs;
+ u8 *ptr;
if (c->cpuid_level < 2)
return;
- /* Number of times to iterate */
- n = cpuid_eax(2) & 0xFF;
-
- for (i = 0 ; i < n ; i++) {
- cpuid(2, &regs[0], &regs[1], &regs[2], &regs[3]);
-
- /* If bit 31 is set, this is an unknown format */
- for (j = 0 ; j < 3 ; j++)
- if (regs[j] & (1 << 31))
- regs[j] = 0;
-
- /* Byte 0 is level count, not a descriptor */
- for (j = 1 ; j < 16 ; j++)
- intel_tlb_lookup(desc[j]);
- }
+ cpuid_leaf_0x2(&regs);
+ for_each_cpuid_0x2_desc(regs, ptr, desc)
+ intel_tlb_lookup(desc);
}
static const struct cpu_dev intel_cpu_dev = {
@@ -1018,366 +785,3 @@ static const struct cpu_dev intel_cpu_dev = {
};
cpu_dev_register(intel_cpu_dev);
-
-#undef pr_fmt
-#define pr_fmt(fmt) "x86/split lock detection: " fmt
-
-static const struct {
- const char *option;
- enum split_lock_detect_state state;
-} sld_options[] __initconst = {
- { "off", sld_off },
- { "warn", sld_warn },
- { "fatal", sld_fatal },
- { "ratelimit:", sld_ratelimit },
-};
-
-static struct ratelimit_state bld_ratelimit;
-
-static DEFINE_SEMAPHORE(buslock_sem);
-
-static inline bool match_option(const char *arg, int arglen, const char *opt)
-{
- int len = strlen(opt), ratelimit;
-
- if (strncmp(arg, opt, len))
- return false;
-
- /*
- * Min ratelimit is 1 bus lock/sec.
- * Max ratelimit is 1000 bus locks/sec.
- */
- if (sscanf(arg, "ratelimit:%d", &ratelimit) == 1 &&
- ratelimit > 0 && ratelimit <= 1000) {
- ratelimit_state_init(&bld_ratelimit, HZ, ratelimit);
- ratelimit_set_flags(&bld_ratelimit, RATELIMIT_MSG_ON_RELEASE);
- return true;
- }
-
- return len == arglen;
-}
-
-static bool split_lock_verify_msr(bool on)
-{
- u64 ctrl, tmp;
-
- if (rdmsrl_safe(MSR_TEST_CTRL, &ctrl))
- return false;
- if (on)
- ctrl |= MSR_TEST_CTRL_SPLIT_LOCK_DETECT;
- else
- ctrl &= ~MSR_TEST_CTRL_SPLIT_LOCK_DETECT;
- if (wrmsrl_safe(MSR_TEST_CTRL, ctrl))
- return false;
- rdmsrl(MSR_TEST_CTRL, tmp);
- return ctrl == tmp;
-}
-
-static void __init sld_state_setup(void)
-{
- enum split_lock_detect_state state = sld_warn;
- char arg[20];
- int i, ret;
-
- if (!boot_cpu_has(X86_FEATURE_SPLIT_LOCK_DETECT) &&
- !boot_cpu_has(X86_FEATURE_BUS_LOCK_DETECT))
- return;
-
- ret = cmdline_find_option(boot_command_line, "split_lock_detect",
- arg, sizeof(arg));
- if (ret >= 0) {
- for (i = 0; i < ARRAY_SIZE(sld_options); i++) {
- if (match_option(arg, ret, sld_options[i].option)) {
- state = sld_options[i].state;
- break;
- }
- }
- }
- sld_state = state;
-}
-
-static void __init __split_lock_setup(void)
-{
- if (!split_lock_verify_msr(false)) {
- pr_info("MSR access failed: Disabled\n");
- return;
- }
-
- rdmsrl(MSR_TEST_CTRL, msr_test_ctrl_cache);
-
- if (!split_lock_verify_msr(true)) {
- pr_info("MSR access failed: Disabled\n");
- return;
- }
-
- /* Restore the MSR to its cached value. */
- wrmsrl(MSR_TEST_CTRL, msr_test_ctrl_cache);
-
- setup_force_cpu_cap(X86_FEATURE_SPLIT_LOCK_DETECT);
-}
-
-/*
- * MSR_TEST_CTRL is per core, but we treat it like a per CPU MSR. Locking
- * is not implemented as one thread could undo the setting of the other
- * thread immediately after dropping the lock anyway.
- */
-static void sld_update_msr(bool on)
-{
- u64 test_ctrl_val = msr_test_ctrl_cache;
-
- if (on)
- test_ctrl_val |= MSR_TEST_CTRL_SPLIT_LOCK_DETECT;
-
- wrmsrl(MSR_TEST_CTRL, test_ctrl_val);
-}
-
-static void split_lock_init(void)
-{
- /*
- * #DB for bus lock handles ratelimit and #AC for split lock is
- * disabled.
- */
- if (sld_state == sld_ratelimit) {
- split_lock_verify_msr(false);
- return;
- }
-
- if (cpu_model_supports_sld)
- split_lock_verify_msr(sld_state != sld_off);
-}
-
-static void __split_lock_reenable(struct work_struct *work)
-{
- sld_update_msr(true);
- up(&buslock_sem);
-}
-
-/*
- * If a CPU goes offline with pending delayed work to re-enable split lock
- * detection then the delayed work will be executed on some other CPU. That
- * handles releasing the buslock_sem, but because it executes on a
- * different CPU probably won't re-enable split lock detection. This is a
- * problem on HT systems since the sibling CPU on the same core may then be
- * left running with split lock detection disabled.
- *
- * Unconditionally re-enable detection here.
- */
-static int splitlock_cpu_offline(unsigned int cpu)
-{
- sld_update_msr(true);
-
- return 0;
-}
-
-static DECLARE_DELAYED_WORK(split_lock_reenable, __split_lock_reenable);
-
-static void split_lock_warn(unsigned long ip)
-{
- int cpu;
-
- if (!current->reported_split_lock)
- pr_warn_ratelimited("#AC: %s/%d took a split_lock trap at address: 0x%lx\n",
- current->comm, current->pid, ip);
- current->reported_split_lock = 1;
-
- /* misery factor #1, sleep 10ms before trying to execute split lock */
- if (msleep_interruptible(10) > 0)
- return;
- /* Misery factor #2, only allow one buslocked disabled core at a time */
- if (down_interruptible(&buslock_sem) == -EINTR)
- return;
- cpu = get_cpu();
- schedule_delayed_work_on(cpu, &split_lock_reenable, 2);
-
- /* Disable split lock detection on this CPU to make progress */
- sld_update_msr(false);
- put_cpu();
-}
-
-bool handle_guest_split_lock(unsigned long ip)
-{
- if (sld_state == sld_warn) {
- split_lock_warn(ip);
- return true;
- }
-
- pr_warn_once("#AC: %s/%d %s split_lock trap at address: 0x%lx\n",
- current->comm, current->pid,
- sld_state == sld_fatal ? "fatal" : "bogus", ip);
-
- current->thread.error_code = 0;
- current->thread.trap_nr = X86_TRAP_AC;
- force_sig_fault(SIGBUS, BUS_ADRALN, NULL);
- return false;
-}
-EXPORT_SYMBOL_GPL(handle_guest_split_lock);
-
-static void bus_lock_init(void)
-{
- u64 val;
-
- /*
- * Warn and fatal are handled by #AC for split lock if #AC for
- * split lock is supported.
- */
- if (!boot_cpu_has(X86_FEATURE_BUS_LOCK_DETECT) ||
- (boot_cpu_has(X86_FEATURE_SPLIT_LOCK_DETECT) &&
- (sld_state == sld_warn || sld_state == sld_fatal)) ||
- sld_state == sld_off)
- return;
-
- /*
- * Enable #DB for bus lock. All bus locks are handled in #DB except
- * split locks are handled in #AC in the fatal case.
- */
- rdmsrl(MSR_IA32_DEBUGCTLMSR, val);
- val |= DEBUGCTLMSR_BUS_LOCK_DETECT;
- wrmsrl(MSR_IA32_DEBUGCTLMSR, val);
-}
-
-bool handle_user_split_lock(struct pt_regs *regs, long error_code)
-{
- if ((regs->flags & X86_EFLAGS_AC) || sld_state == sld_fatal)
- return false;
- split_lock_warn(regs->ip);
- return true;
-}
-
-void handle_bus_lock(struct pt_regs *regs)
-{
- switch (sld_state) {
- case sld_off:
- break;
- case sld_ratelimit:
- /* Enforce no more than bld_ratelimit bus locks/sec. */
- while (!__ratelimit(&bld_ratelimit))
- msleep(20);
- /* Warn on the bus lock. */
- fallthrough;
- case sld_warn:
- pr_warn_ratelimited("#DB: %s/%d took a bus_lock trap at address: 0x%lx\n",
- current->comm, current->pid, regs->ip);
- break;
- case sld_fatal:
- force_sig_fault(SIGBUS, BUS_ADRALN, NULL);
- break;
- }
-}
-
-/*
- * Bits in the IA32_CORE_CAPABILITIES are not architectural, so they should
- * only be trusted if it is confirmed that a CPU model implements a
- * specific feature at a particular bit position.
- *
- * The possible driver data field values:
- *
- * - 0: CPU models that are known to have the per-core split-lock detection
- * feature even though they do not enumerate IA32_CORE_CAPABILITIES.
- *
- * - 1: CPU models which may enumerate IA32_CORE_CAPABILITIES and if so use
- * bit 5 to enumerate the per-core split-lock detection feature.
- */
-static const struct x86_cpu_id split_lock_cpu_ids[] __initconst = {
- X86_MATCH_INTEL_FAM6_MODEL(ICELAKE_X, 0),
- X86_MATCH_INTEL_FAM6_MODEL(ICELAKE_L, 0),
- X86_MATCH_INTEL_FAM6_MODEL(ICELAKE_D, 0),
- X86_MATCH_INTEL_FAM6_MODEL(ATOM_TREMONT, 1),
- X86_MATCH_INTEL_FAM6_MODEL(ATOM_TREMONT_D, 1),
- X86_MATCH_INTEL_FAM6_MODEL(ATOM_TREMONT_L, 1),
- X86_MATCH_INTEL_FAM6_MODEL(TIGERLAKE_L, 1),
- X86_MATCH_INTEL_FAM6_MODEL(TIGERLAKE, 1),
- X86_MATCH_INTEL_FAM6_MODEL(SAPPHIRERAPIDS_X, 1),
- X86_MATCH_INTEL_FAM6_MODEL(ALDERLAKE, 1),
- X86_MATCH_INTEL_FAM6_MODEL(ALDERLAKE_L, 1),
- X86_MATCH_INTEL_FAM6_MODEL(RAPTORLAKE, 1),
- {}
-};
-
-static void __init split_lock_setup(struct cpuinfo_x86 *c)
-{
- const struct x86_cpu_id *m;
- u64 ia32_core_caps;
-
- if (boot_cpu_has(X86_FEATURE_HYPERVISOR))
- return;
-
- m = x86_match_cpu(split_lock_cpu_ids);
- if (!m)
- return;
-
- switch (m->driver_data) {
- case 0:
- break;
- case 1:
- if (!cpu_has(c, X86_FEATURE_CORE_CAPABILITIES))
- return;
- rdmsrl(MSR_IA32_CORE_CAPS, ia32_core_caps);
- if (!(ia32_core_caps & MSR_IA32_CORE_CAPS_SPLIT_LOCK_DETECT))
- return;
- break;
- default:
- return;
- }
-
- cpu_model_supports_sld = true;
- __split_lock_setup();
-}
-
-static void sld_state_show(void)
-{
- if (!boot_cpu_has(X86_FEATURE_BUS_LOCK_DETECT) &&
- !boot_cpu_has(X86_FEATURE_SPLIT_LOCK_DETECT))
- return;
-
- switch (sld_state) {
- case sld_off:
- pr_info("disabled\n");
- break;
- case sld_warn:
- if (boot_cpu_has(X86_FEATURE_SPLIT_LOCK_DETECT)) {
- pr_info("#AC: crashing the kernel on kernel split_locks and warning on user-space split_locks\n");
- if (cpuhp_setup_state(CPUHP_AP_ONLINE_DYN,
- "x86/splitlock", NULL, splitlock_cpu_offline) < 0)
- pr_warn("No splitlock CPU offline handler\n");
- } else if (boot_cpu_has(X86_FEATURE_BUS_LOCK_DETECT)) {
- pr_info("#DB: warning on user-space bus_locks\n");
- }
- break;
- case sld_fatal:
- if (boot_cpu_has(X86_FEATURE_SPLIT_LOCK_DETECT)) {
- pr_info("#AC: crashing the kernel on kernel split_locks and sending SIGBUS on user-space split_locks\n");
- } else if (boot_cpu_has(X86_FEATURE_BUS_LOCK_DETECT)) {
- pr_info("#DB: sending SIGBUS on user-space bus_locks%s\n",
- boot_cpu_has(X86_FEATURE_SPLIT_LOCK_DETECT) ?
- " from non-WB" : "");
- }
- break;
- case sld_ratelimit:
- if (boot_cpu_has(X86_FEATURE_BUS_LOCK_DETECT))
- pr_info("#DB: setting system wide bus lock rate limit to %u/sec\n", bld_ratelimit.burst);
- break;
- }
-}
-
-void __init sld_setup(struct cpuinfo_x86 *c)
-{
- split_lock_setup(c);
- sld_state_setup();
- sld_state_show();
-}
-
-#define X86_HYBRID_CPU_TYPE_ID_SHIFT 24
-
-/**
- * get_this_hybrid_cpu_type() - Get the type of this hybrid CPU
- *
- * Returns the CPU type [31:24] (i.e., Atom or Core) of a CPU in
- * a hybrid processor. If the processor is not hybrid, returns 0.
- */
-u8 get_this_hybrid_cpu_type(void)
-{
- if (!cpu_feature_enabled(X86_FEATURE_HYBRID_CPU))
- return 0;
-
- return cpuid_eax(0x0000001a) >> X86_HYBRID_CPU_TYPE_ID_SHIFT;
-}
diff --git a/arch/x86/kernel/cpu/intel_epb.c b/arch/x86/kernel/cpu/intel_epb.c
index fbaf12e43f41..bc7671f920a7 100644
--- a/arch/x86/kernel/cpu/intel_epb.c
+++ b/arch/x86/kernel/cpu/intel_epb.c
@@ -79,7 +79,7 @@ static int intel_epb_save(void)
{
u64 epb;
- rdmsrl(MSR_IA32_ENERGY_PERF_BIAS, epb);
+ rdmsrq(MSR_IA32_ENERGY_PERF_BIAS, epb);
/*
* Ensure that saved_epb will always be nonzero after this write even if
* the EPB value read from the MSR is 0.
@@ -94,7 +94,7 @@ static void intel_epb_restore(void)
u64 val = this_cpu_read(saved_epb);
u64 epb;
- rdmsrl(MSR_IA32_ENERGY_PERF_BIAS, epb);
+ rdmsrq(MSR_IA32_ENERGY_PERF_BIAS, epb);
if (val) {
val &= EPB_MASK;
} else {
@@ -111,7 +111,7 @@ static void intel_epb_restore(void)
pr_warn_once("ENERGY_PERF_BIAS: Set to 'normal', was 'performance'\n");
}
}
- wrmsrl(MSR_IA32_ENERGY_PERF_BIAS, (epb & ~EPB_MASK) | val);
+ wrmsrq(MSR_IA32_ENERGY_PERF_BIAS, (epb & ~EPB_MASK) | val);
}
static struct syscore_ops intel_epb_syscore_ops = {
@@ -135,7 +135,7 @@ static ssize_t energy_perf_bias_show(struct device *dev,
u64 epb;
int ret;
- ret = rdmsrl_on_cpu(cpu, MSR_IA32_ENERGY_PERF_BIAS, &epb);
+ ret = rdmsrq_on_cpu(cpu, MSR_IA32_ENERGY_PERF_BIAS, &epb);
if (ret < 0)
return ret;
@@ -157,11 +157,11 @@ static ssize_t energy_perf_bias_store(struct device *dev,
else if (kstrtou64(buf, 0, &val) || val > MAX_EPB)
return -EINVAL;
- ret = rdmsrl_on_cpu(cpu, MSR_IA32_ENERGY_PERF_BIAS, &epb);
+ ret = rdmsrq_on_cpu(cpu, MSR_IA32_ENERGY_PERF_BIAS, &epb);
if (ret < 0)
return ret;
- ret = wrmsrl_on_cpu(cpu, MSR_IA32_ENERGY_PERF_BIAS,
+ ret = wrmsrq_on_cpu(cpu, MSR_IA32_ENERGY_PERF_BIAS,
(epb & ~EPB_MASK) | val);
if (ret < 0)
return ret;
@@ -204,7 +204,12 @@ static int intel_epb_offline(unsigned int cpu)
}
static const struct x86_cpu_id intel_epb_normal[] = {
- X86_MATCH_INTEL_FAM6_MODEL(ALDERLAKE_L, 7),
+ X86_MATCH_VFM(INTEL_ALDERLAKE_L,
+ ENERGY_PERF_BIAS_NORMAL_POWERSAVE),
+ X86_MATCH_VFM(INTEL_ATOM_GRACEMONT,
+ ENERGY_PERF_BIAS_NORMAL_POWERSAVE),
+ X86_MATCH_VFM(INTEL_RAPTORLAKE_P,
+ ENERGY_PERF_BIAS_NORMAL_POWERSAVE),
{}
};
@@ -232,4 +237,4 @@ err_out_online:
cpuhp_remove_state(CPUHP_AP_X86_INTEL_EPB_ONLINE);
return ret;
}
-subsys_initcall(intel_epb_init);
+late_initcall(intel_epb_init);
diff --git a/arch/x86/kernel/cpu/intel_pconfig.c b/arch/x86/kernel/cpu/intel_pconfig.c
deleted file mode 100644
index 0771a905b286..000000000000
--- a/arch/x86/kernel/cpu/intel_pconfig.c
+++ /dev/null
@@ -1,82 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-/*
- * Intel PCONFIG instruction support.
- *
- * Copyright (C) 2017 Intel Corporation
- *
- * Author:
- * Kirill A. Shutemov <kirill.shutemov@linux.intel.com>
- */
-
-#include <asm/cpufeature.h>
-#include <asm/intel_pconfig.h>
-
-#define PCONFIG_CPUID 0x1b
-
-#define PCONFIG_CPUID_SUBLEAF_MASK ((1 << 12) - 1)
-
-/* Subleaf type (EAX) for PCONFIG CPUID leaf (0x1B) */
-enum {
- PCONFIG_CPUID_SUBLEAF_INVALID = 0,
- PCONFIG_CPUID_SUBLEAF_TARGETID = 1,
-};
-
-/* Bitmask of supported targets */
-static u64 targets_supported __read_mostly;
-
-int pconfig_target_supported(enum pconfig_target target)
-{
- /*
- * We would need to re-think the implementation once we get > 64
- * PCONFIG targets. Spec allows up to 2^32 targets.
- */
- BUILD_BUG_ON(PCONFIG_TARGET_NR >= 64);
-
- if (WARN_ON_ONCE(target >= 64))
- return 0;
- return targets_supported & (1ULL << target);
-}
-
-static int __init intel_pconfig_init(void)
-{
- int subleaf;
-
- if (!boot_cpu_has(X86_FEATURE_PCONFIG))
- return 0;
-
- /*
- * Scan subleafs of PCONFIG CPUID leaf.
- *
- * Subleafs of the same type need not to be consecutive.
- *
- * Stop on the first invalid subleaf type. All subleafs after the first
- * invalid are invalid too.
- */
- for (subleaf = 0; subleaf < INT_MAX; subleaf++) {
- struct cpuid_regs regs;
-
- cpuid_count(PCONFIG_CPUID, subleaf,
- &regs.eax, &regs.ebx, &regs.ecx, &regs.edx);
-
- switch (regs.eax & PCONFIG_CPUID_SUBLEAF_MASK) {
- case PCONFIG_CPUID_SUBLEAF_INVALID:
- /* Stop on the first invalid subleaf */
- goto out;
- case PCONFIG_CPUID_SUBLEAF_TARGETID:
- /* Mark supported PCONFIG targets */
- if (regs.ebx < 64)
- targets_supported |= (1ULL << regs.ebx);
- if (regs.ecx < 64)
- targets_supported |= (1ULL << regs.ecx);
- if (regs.edx < 64)
- targets_supported |= (1ULL << regs.edx);
- break;
- default:
- /* Unknown CPUID.PCONFIG subleaf: ignore */
- break;
- }
- }
-out:
- return 0;
-}
-arch_initcall(intel_pconfig_init);
diff --git a/arch/x86/kernel/cpu/match.c b/arch/x86/kernel/cpu/match.c
index ad6776081e60..6af1e8baeb0f 100644
--- a/arch/x86/kernel/cpu/match.c
+++ b/arch/x86/kernel/cpu/match.c
@@ -6,7 +6,35 @@
#include <linux/slab.h>
/**
- * x86_match_cpu - match current CPU again an array of x86_cpu_ids
+ * x86_match_vendor_cpu_type - helper function to match the hardware defined
+ * cpu-type for a single entry in the x86_cpu_id
+ * table. Note, this function does not match the
+ * generic cpu-types TOPO_CPU_TYPE_EFFICIENCY and
+ * TOPO_CPU_TYPE_PERFORMANCE.
+ * @c: Pointer to the cpuinfo_x86 structure of the CPU to match.
+ * @m: Pointer to the x86_cpu_id entry to match against.
+ *
+ * Return: true if the cpu-type matches, false otherwise.
+ */
+static bool x86_match_vendor_cpu_type(struct cpuinfo_x86 *c, const struct x86_cpu_id *m)
+{
+ if (m->type == X86_CPU_TYPE_ANY)
+ return true;
+
+ /* Hybrid CPUs are special, they are assumed to match all cpu-types */
+ if (cpu_feature_enabled(X86_FEATURE_HYBRID_CPU))
+ return true;
+
+ if (c->x86_vendor == X86_VENDOR_INTEL)
+ return m->type == c->topo.intel_type;
+ if (c->x86_vendor == X86_VENDOR_AMD)
+ return m->type == c->topo.amd_type;
+
+ return false;
+}
+
+/**
+ * x86_match_cpu - match current CPU against an array of x86_cpu_ids
* @match: Pointer to array of x86_cpu_ids. Last entry terminated with
* {}.
*
@@ -17,8 +45,7 @@
*
* A typical table entry would be to match a specific CPU
*
- * X86_MATCH_VENDOR_FAM_MODEL_FEATURE(INTEL, 6, INTEL_FAM6_BROADWELL,
- * X86_FEATURE_ANY, NULL);
+ * X86_MATCH_VFM_FEATURE(INTEL_BROADWELL, X86_FEATURE_ANY, NULL);
*
* Fields can be wildcarded with %X86_VENDOR_ANY, %X86_FAMILY_ANY,
* %X86_MODEL_ANY, %X86_FEATURE_ANY (except for vendor)
@@ -26,7 +53,7 @@
* asm/cpu_device_id.h contains a set of useful macros which are shortcuts
* for various common selections. The above can be shortened to:
*
- * X86_MATCH_INTEL_FAM6_MODEL(BROADWELL, NULL);
+ * X86_MATCH_VFM(INTEL_BROADWELL, NULL);
*
* Arrays used to match for this should also be declared using
* MODULE_DEVICE_TABLE(x86cpu, ...)
@@ -39,9 +66,7 @@ const struct x86_cpu_id *x86_match_cpu(const struct x86_cpu_id *match)
const struct x86_cpu_id *m;
struct cpuinfo_x86 *c = &boot_cpu_data;
- for (m = match;
- m->vendor | m->family | m->model | m->steppings | m->feature;
- m++) {
+ for (m = match; m->flags & X86_CPU_ID_FLAG_ENTRY_VALID; m++) {
if (m->vendor != X86_VENDOR_ANY && c->x86_vendor != m->vendor)
continue;
if (m->family != X86_FAMILY_ANY && c->x86 != m->family)
@@ -53,39 +78,21 @@ const struct x86_cpu_id *x86_match_cpu(const struct x86_cpu_id *match)
continue;
if (m->feature != X86_FEATURE_ANY && !cpu_has(c, m->feature))
continue;
- return m;
- }
- return NULL;
-}
-EXPORT_SYMBOL(x86_match_cpu);
-
-static const struct x86_cpu_desc *
-x86_match_cpu_with_stepping(const struct x86_cpu_desc *match)
-{
- struct cpuinfo_x86 *c = &boot_cpu_data;
- const struct x86_cpu_desc *m;
-
- for (m = match; m->x86_family | m->x86_model; m++) {
- if (c->x86_vendor != m->x86_vendor)
- continue;
- if (c->x86 != m->x86_family)
- continue;
- if (c->x86_model != m->x86_model)
- continue;
- if (c->x86_stepping != m->x86_stepping)
+ if (!x86_match_vendor_cpu_type(c, m))
continue;
return m;
}
return NULL;
}
+EXPORT_SYMBOL(x86_match_cpu);
-bool x86_cpu_has_min_microcode_rev(const struct x86_cpu_desc *table)
+bool x86_match_min_microcode_rev(const struct x86_cpu_id *table)
{
- const struct x86_cpu_desc *res = x86_match_cpu_with_stepping(table);
+ const struct x86_cpu_id *res = x86_match_cpu(table);
- if (!res || res->x86_microcode_rev > boot_cpu_data.microcode)
+ if (!res || res->driver_data > boot_cpu_data.microcode)
return false;
return true;
}
-EXPORT_SYMBOL_GPL(x86_cpu_has_min_microcode_rev);
+EXPORT_SYMBOL_GPL(x86_match_min_microcode_rev);
diff --git a/arch/x86/kernel/cpu/mce/amd.c b/arch/x86/kernel/cpu/mce/amd.c
index 1c87501e0fa3..9d852c3b2cb5 100644
--- a/arch/x86/kernel/cpu/mce/amd.c
+++ b/arch/x86/kernel/cpu/mce/amd.c
@@ -4,8 +4,6 @@
*
* Written by Jacob Shin - AMD, Inc.
* Maintained by: Borislav Petkov <bp@alien8.de>
- *
- * All MC4_MISCi registers are shared between cores on a node.
*/
#include <linux/interrupt.h>
#include <linux/notifier.h>
@@ -20,7 +18,6 @@
#include <linux/smp.h>
#include <linux/string.h>
-#include <asm/amd_nb.h>
#include <asm/traps.h>
#include <asm/apic.h>
#include <asm/mce.h>
@@ -87,42 +84,40 @@ struct smca_bank {
static DEFINE_PER_CPU_READ_MOSTLY(struct smca_bank[MAX_NR_BANKS], smca_banks);
static DEFINE_PER_CPU_READ_MOSTLY(u8[N_SMCA_BANK_TYPES], smca_bank_counts);
-struct smca_bank_name {
- const char *name; /* Short name for sysfs */
- const char *long_name; /* Long name for pretty-printing */
-};
-
-static struct smca_bank_name smca_names[] = {
- [SMCA_LS ... SMCA_LS_V2] = { "load_store", "Load Store Unit" },
- [SMCA_IF] = { "insn_fetch", "Instruction Fetch Unit" },
- [SMCA_L2_CACHE] = { "l2_cache", "L2 Cache" },
- [SMCA_DE] = { "decode_unit", "Decode Unit" },
- [SMCA_RESERVED] = { "reserved", "Reserved" },
- [SMCA_EX] = { "execution_unit", "Execution Unit" },
- [SMCA_FP] = { "floating_point", "Floating Point Unit" },
- [SMCA_L3_CACHE] = { "l3_cache", "L3 Cache" },
- [SMCA_CS ... SMCA_CS_V2] = { "coherent_slave", "Coherent Slave" },
- [SMCA_PIE] = { "pie", "Power, Interrupts, etc." },
+static const char * const smca_names[] = {
+ [SMCA_LS ... SMCA_LS_V2] = "load_store",
+ [SMCA_IF] = "insn_fetch",
+ [SMCA_L2_CACHE] = "l2_cache",
+ [SMCA_DE] = "decode_unit",
+ [SMCA_RESERVED] = "reserved",
+ [SMCA_EX] = "execution_unit",
+ [SMCA_FP] = "floating_point",
+ [SMCA_L3_CACHE] = "l3_cache",
+ [SMCA_CS ... SMCA_CS_V2] = "coherent_slave",
+ [SMCA_PIE] = "pie",
/* UMC v2 is separate because both of them can exist in a single system. */
- [SMCA_UMC] = { "umc", "Unified Memory Controller" },
- [SMCA_UMC_V2] = { "umc_v2", "Unified Memory Controller v2" },
- [SMCA_PB] = { "param_block", "Parameter Block" },
- [SMCA_PSP ... SMCA_PSP_V2] = { "psp", "Platform Security Processor" },
- [SMCA_SMU ... SMCA_SMU_V2] = { "smu", "System Management Unit" },
- [SMCA_MP5] = { "mp5", "Microprocessor 5 Unit" },
- [SMCA_MPDMA] = { "mpdma", "MPDMA Unit" },
- [SMCA_NBIO] = { "nbio", "Northbridge IO Unit" },
- [SMCA_PCIE ... SMCA_PCIE_V2] = { "pcie", "PCI Express Unit" },
- [SMCA_XGMI_PCS] = { "xgmi_pcs", "Ext Global Memory Interconnect PCS Unit" },
- [SMCA_NBIF] = { "nbif", "NBIF Unit" },
- [SMCA_SHUB] = { "shub", "System Hub Unit" },
- [SMCA_SATA] = { "sata", "SATA Unit" },
- [SMCA_USB] = { "usb", "USB Unit" },
- [SMCA_GMI_PCS] = { "gmi_pcs", "Global Memory Interconnect PCS Unit" },
- [SMCA_XGMI_PHY] = { "xgmi_phy", "Ext Global Memory Interconnect PHY Unit" },
- [SMCA_WAFL_PHY] = { "wafl_phy", "WAFL PHY Unit" },
- [SMCA_GMI_PHY] = { "gmi_phy", "Global Memory Interconnect PHY Unit" },
+ [SMCA_UMC] = "umc",
+ [SMCA_UMC_V2] = "umc_v2",
+ [SMCA_MA_LLC] = "ma_llc",
+ [SMCA_PB] = "param_block",
+ [SMCA_PSP ... SMCA_PSP_V2] = "psp",
+ [SMCA_SMU ... SMCA_SMU_V2] = "smu",
+ [SMCA_MP5] = "mp5",
+ [SMCA_MPDMA] = "mpdma",
+ [SMCA_NBIO] = "nbio",
+ [SMCA_PCIE ... SMCA_PCIE_V2] = "pcie",
+ [SMCA_XGMI_PCS] = "xgmi_pcs",
+ [SMCA_NBIF] = "nbif",
+ [SMCA_SHUB] = "shub",
+ [SMCA_SATA] = "sata",
+ [SMCA_USB] = "usb",
+ [SMCA_USR_DP] = "usr_dp",
+ [SMCA_USR_CP] = "usr_cp",
+ [SMCA_GMI_PCS] = "gmi_pcs",
+ [SMCA_XGMI_PHY] = "xgmi_phy",
+ [SMCA_WAFL_PHY] = "wafl_phy",
+ [SMCA_GMI_PHY] = "gmi_phy",
};
static const char *smca_get_name(enum smca_bank_types t)
@@ -130,18 +125,9 @@ static const char *smca_get_name(enum smca_bank_types t)
if (t >= N_SMCA_BANK_TYPES)
return NULL;
- return smca_names[t].name;
+ return smca_names[t];
}
-const char *smca_get_long_name(enum smca_bank_types t)
-{
- if (t >= N_SMCA_BANK_TYPES)
- return NULL;
-
- return smca_names[t].long_name;
-}
-EXPORT_SYMBOL_GPL(smca_get_long_name);
-
enum smca_bank_types smca_get_bank_type(unsigned int cpu, unsigned int bank)
{
struct smca_bank *b;
@@ -178,6 +164,7 @@ static const struct smca_hwid smca_hwid_mcatypes[] = {
{ SMCA_CS, HWID_MCATYPE(0x2E, 0x0) },
{ SMCA_PIE, HWID_MCATYPE(0x2E, 0x1) },
{ SMCA_CS_V2, HWID_MCATYPE(0x2E, 0x2) },
+ { SMCA_MA_LLC, HWID_MCATYPE(0x2E, 0x4) },
/* Unified Memory Controller MCA type */
{ SMCA_UMC, HWID_MCATYPE(0x96, 0x0) },
@@ -212,6 +199,8 @@ static const struct smca_hwid smca_hwid_mcatypes[] = {
{ SMCA_SHUB, HWID_MCATYPE(0x80, 0x0) },
{ SMCA_SATA, HWID_MCATYPE(0xA8, 0x0) },
{ SMCA_USB, HWID_MCATYPE(0xAA, 0x0) },
+ { SMCA_USR_DP, HWID_MCATYPE(0x170, 0x0) },
+ { SMCA_USR_CP, HWID_MCATYPE(0x180, 0x0) },
{ SMCA_GMI_PCS, HWID_MCATYPE(0x241, 0x0) },
{ SMCA_XGMI_PHY, HWID_MCATYPE(0x259, 0x0) },
{ SMCA_WAFL_PHY, HWID_MCATYPE(0x267, 0x0) },
@@ -229,16 +218,42 @@ static const struct smca_hwid smca_hwid_mcatypes[] = {
#define MAX_MCATYPE_NAME_LEN 30
static char buf_mcatype[MAX_MCATYPE_NAME_LEN];
+struct threshold_block {
+ /* This block's number within its bank. */
+ unsigned int block;
+ /* MCA bank number that contains this block. */
+ unsigned int bank;
+ /* CPU which controls this block's MCA bank. */
+ unsigned int cpu;
+ /* MCA_MISC MSR address for this block. */
+ u32 address;
+ /* Enable/Disable APIC interrupt. */
+ bool interrupt_enable;
+ /* Bank can generate an interrupt. */
+ bool interrupt_capable;
+ /* Value upon which threshold interrupt is generated. */
+ u16 threshold_limit;
+ /* sysfs object */
+ struct kobject kobj;
+ /* List of threshold blocks within this block's MCA bank. */
+ struct list_head miscj;
+};
+
+struct threshold_bank {
+ struct kobject *kobj;
+ struct threshold_block *blocks;
+};
+
static DEFINE_PER_CPU(struct threshold_bank **, threshold_banks);
/*
* A list of the banks enabled on each logical CPU. Controls which respective
* descriptors to initialize later in mce_threshold_create_device().
*/
-static DEFINE_PER_CPU(unsigned int, bank_map);
+static DEFINE_PER_CPU(u64, bank_map);
/* Map of banks that have more than MCA_MISC0 available. */
-static DEFINE_PER_CPU(u32, smca_misc_banks_map);
+static DEFINE_PER_CPU(u64, smca_misc_banks_map);
static void amd_threshold_interrupt(void);
static void amd_deferred_error_interrupt(void);
@@ -267,7 +282,7 @@ static void smca_set_misc_banks_map(unsigned int bank, unsigned int cpu)
return;
if (low & MASK_BLKPTR_LO)
- per_cpu(smca_misc_banks_map, cpu) |= BIT(bank);
+ per_cpu(smca_misc_banks_map, cpu) |= BIT_ULL(bank);
}
@@ -306,6 +321,8 @@ static void smca_configure(unsigned int bank, unsigned int cpu)
if ((low & BIT(5)) && !((high >> 5) & 0x3))
high |= BIT(5);
+ this_cpu_ptr(mce_banks_array)[bank].lsb_in_status = !!(low & BIT(8));
+
wrmsr(smca_config, low, high);
}
@@ -339,19 +356,6 @@ struct thresh_restart {
u16 old_limit;
};
-static inline bool is_shared_bank(int bank)
-{
- /*
- * Scalable MCA provides for only one core to have access to the MSRs of
- * a shared bank.
- */
- if (mce_flags.smca)
- return false;
-
- /* Bank 4 is for northbridge reporting and is thus shared */
- return (bank == 4);
-}
-
static const char *bank4_names(const struct threshold_block *b)
{
switch (b->address) {
@@ -387,7 +391,7 @@ static bool lvt_interrupt_supported(unsigned int bank, u32 msr_high_bits)
return msr_high_bits & BIT(28);
}
-static int lvt_off_valid(struct threshold_block *b, int apic, u32 lo, u32 hi)
+static bool lvt_off_valid(struct threshold_block *b, int apic, u32 lo, u32 hi)
{
int msr = (hi & MASK_LVTOFF_HI) >> 20;
@@ -395,7 +399,7 @@ static int lvt_off_valid(struct threshold_block *b, int apic, u32 lo, u32 hi)
pr_err(FW_BUG "cpu %d, failed to setup threshold interrupt "
"for bank %d, block %d (MSR%08X=0x%x%08x)\n", b->cpu,
b->bank, b->block, b->address, hi, lo);
- return 0;
+ return false;
}
if (apic != msr) {
@@ -405,15 +409,15 @@ static int lvt_off_valid(struct threshold_block *b, int apic, u32 lo, u32 hi)
* was set is reserved. Return early here:
*/
if (mce_flags.smca)
- return 0;
+ return false;
pr_err(FW_BUG "cpu %d, invalid threshold interrupt offset %d "
"for bank %d, block %d (MSR%08X=0x%x%08x)\n",
b->cpu, apic, b->bank, b->block, b->address, hi, lo);
- return 0;
+ return false;
}
- return 1;
+ return true;
};
/* Reprogram MCx_MISC MSR behind this threshold bank. */
@@ -528,7 +532,7 @@ static u32 smca_get_block_address(unsigned int bank, unsigned int block,
if (!block)
return MSR_AMD64_SMCA_MCx_MISC(bank);
- if (!(per_cpu(smca_misc_banks_map, cpu) & BIT(bank)))
+ if (!(per_cpu(smca_misc_banks_map, cpu) & BIT_ULL(bank)))
return 0;
return MSR_AMD64_SMCA_MCx_MISCy(bank, block - 1);
@@ -572,7 +576,7 @@ prepare_threshold_block(unsigned int bank, unsigned int block, u32 addr,
int new;
if (!block)
- per_cpu(bank_map, cpu) |= (1 << bank);
+ per_cpu(bank_map, cpu) |= BIT_ULL(bank);
memset(&b, 0, sizeof(b));
b.cpu = cpu;
@@ -658,12 +662,12 @@ static void disable_err_thresholding(struct cpuinfo_x86 *c, unsigned int bank)
return;
}
- rdmsrl(MSR_K7_HWCR, hwcr);
+ rdmsrq(MSR_K7_HWCR, hwcr);
/* McStatusWrEn has to be set */
need_toggle = !(hwcr & BIT(18));
if (need_toggle)
- wrmsrl(MSR_K7_HWCR, hwcr | BIT(18));
+ wrmsrq(MSR_K7_HWCR, hwcr | BIT(18));
/* Clear CntP bit safely */
for (i = 0; i < num_msrs; i++)
@@ -671,7 +675,7 @@ static void disable_err_thresholding(struct cpuinfo_x86 *c, unsigned int bank)
/* restore old settings */
if (need_toggle)
- wrmsrl(MSR_K7_HWCR, hwcr);
+ wrmsrq(MSR_K7_HWCR, hwcr);
}
/* cpu init entry point, called from mce.c with preempt off */
@@ -711,50 +715,106 @@ void mce_amd_feature_init(struct cpuinfo_x86 *c)
deferred_error_interrupt_enable(c);
}
-bool amd_mce_is_memory_error(struct mce *m)
+/*
+ * DRAM ECC errors are reported in the Northbridge (bank 4) with
+ * Extended Error Code 8.
+ */
+static bool legacy_mce_is_memory_error(struct mce *m)
+{
+ return m->bank == 4 && XEC(m->status, 0x1f) == 8;
+}
+
+/*
+ * DRAM ECC errors are reported in Unified Memory Controllers with
+ * Extended Error Code 0.
+ */
+static bool smca_mce_is_memory_error(struct mce *m)
{
- /* ErrCodeExt[20:16] */
- u8 xec = (m->status >> 16) & 0x1f;
+ enum smca_bank_types bank_type;
+ if (XEC(m->status, 0x3f))
+ return false;
+
+ bank_type = smca_get_bank_type(m->extcpu, m->bank);
+
+ return bank_type == SMCA_UMC || bank_type == SMCA_UMC_V2;
+}
+
+bool amd_mce_is_memory_error(struct mce *m)
+{
if (mce_flags.smca)
- return smca_get_bank_type(m->extcpu, m->bank) == SMCA_UMC && xec == 0x0;
+ return smca_mce_is_memory_error(m);
+ else
+ return legacy_mce_is_memory_error(m);
+}
+
+/*
+ * AMD systems do not have an explicit indicator that the value in MCA_ADDR is
+ * a system physical address. Therefore, individual cases need to be detected.
+ * Future cases and checks will be added as needed.
+ *
+ * 1) General case
+ * a) Assume address is not usable.
+ * 2) Poison errors
+ * a) Indicated by MCA_STATUS[43]: poison. Defined for all banks except legacy
+ * northbridge (bank 4).
+ * b) Refers to poison consumption in the core. Does not include "no action",
+ * "action optional", or "deferred" error severities.
+ * c) Will include a usable address so that immediate action can be taken.
+ * 3) Northbridge DRAM ECC errors
+ * a) Reported in legacy bank 4 with extended error code (XEC) 8.
+ * b) MCA_STATUS[43] is *not* defined as poison in legacy bank 4. Therefore,
+ * this bit should not be checked.
+ *
+ * NOTE: SMCA UMC memory errors fall into case #1.
+ */
+bool amd_mce_usable_address(struct mce *m)
+{
+ /* Check special northbridge case 3) first. */
+ if (!mce_flags.smca) {
+ if (legacy_mce_is_memory_error(m))
+ return true;
+ else if (m->bank == 4)
+ return false;
+ }
- return m->bank == 4 && xec == 0x8;
+ /* Check poison bit for all other bank types. */
+ if (m->status & MCI_STATUS_POISON)
+ return true;
+
+ /* Assume address is not usable for all others. */
+ return false;
}
static void __log_error(unsigned int bank, u64 status, u64 addr, u64 misc)
{
- struct mce m;
-
- mce_setup(&m);
+ struct mce_hw_err err;
+ struct mce *m = &err.m;
- m.status = status;
- m.misc = misc;
- m.bank = bank;
- m.tsc = rdtsc();
+ mce_prep_record(&err);
- if (m.status & MCI_STATUS_ADDRV) {
- m.addr = addr;
+ m->status = status;
+ m->misc = misc;
+ m->bank = bank;
+ m->tsc = rdtsc();
- /*
- * Extract [55:<lsb>] where lsb is the least significant
- * *valid* bit of the address bits.
- */
- if (mce_flags.smca) {
- u8 lsb = (m.addr >> 56) & 0x3f;
+ if (m->status & MCI_STATUS_ADDRV) {
+ m->addr = addr;
- m.addr &= GENMASK_ULL(55, lsb);
- }
+ smca_extract_err_addr(m);
}
if (mce_flags.smca) {
- rdmsrl(MSR_AMD64_SMCA_MCx_IPID(bank), m.ipid);
+ rdmsrq(MSR_AMD64_SMCA_MCx_IPID(bank), m->ipid);
- if (m.status & MCI_STATUS_SYNDV)
- rdmsrl(MSR_AMD64_SMCA_MCx_SYND(bank), m.synd);
+ if (m->status & MCI_STATUS_SYNDV) {
+ rdmsrq(MSR_AMD64_SMCA_MCx_SYND(bank), m->synd);
+ rdmsrq(MSR_AMD64_SMCA_MCx_SYND1(bank), err.vendor.amd.synd1);
+ rdmsrq(MSR_AMD64_SMCA_MCx_SYND2(bank), err.vendor.amd.synd2);
+ }
}
- mce_log(&m);
+ mce_log(&err);
}
DEFINE_IDTENTRY_SYSVEC(sysvec_deferred_error)
@@ -763,7 +823,7 @@ DEFINE_IDTENTRY_SYSVEC(sysvec_deferred_error)
inc_irq_stat(irq_deferred_error_count);
deferred_error_int_vector();
trace_deferred_error_apic_exit(DEFERRED_ERROR_VECTOR);
- ack_APIC_irq();
+ apic_eoi();
}
/*
@@ -774,20 +834,38 @@ _log_error_bank(unsigned int bank, u32 msr_stat, u32 msr_addr, u64 misc)
{
u64 status, addr = 0;
- rdmsrl(msr_stat, status);
+ rdmsrq(msr_stat, status);
if (!(status & MCI_STATUS_VAL))
return false;
if (status & MCI_STATUS_ADDRV)
- rdmsrl(msr_addr, addr);
+ rdmsrq(msr_addr, addr);
__log_error(bank, status, addr, misc);
- wrmsrl(msr_stat, 0);
+ wrmsrq(msr_stat, 0);
return status & MCI_STATUS_DEFERRED;
}
+static bool _log_error_deferred(unsigned int bank, u32 misc)
+{
+ if (!_log_error_bank(bank, mca_msr_reg(bank, MCA_STATUS),
+ mca_msr_reg(bank, MCA_ADDR), misc))
+ return false;
+
+ /*
+ * Non-SMCA systems don't have MCA_DESTAT/MCA_DEADDR registers.
+ * Return true here to avoid accessing these registers.
+ */
+ if (!mce_flags.smca)
+ return true;
+
+ /* Clear MCA_DESTAT if the deferred error was logged from MCA_STATUS. */
+ wrmsrq(MSR_AMD64_SMCA_MCx_DESTAT(bank), 0);
+ return true;
+}
+
/*
* We have three scenarios for checking for Deferred errors:
*
@@ -799,20 +877,9 @@ _log_error_bank(unsigned int bank, u32 msr_stat, u32 msr_addr, u64 misc)
*/
static void log_error_deferred(unsigned int bank)
{
- bool defrd;
-
- defrd = _log_error_bank(bank, mca_msr_reg(bank, MCA_STATUS),
- mca_msr_reg(bank, MCA_ADDR), 0);
-
- if (!mce_flags.smca)
+ if (_log_error_deferred(bank, 0))
return;
- /* Clear MCA_DESTAT if we logged the deferred error from MCA_STATUS. */
- if (defrd) {
- wrmsrl(MSR_AMD64_SMCA_MCx_DESTAT(bank), 0);
- return;
- }
-
/*
* Only deferred errors are logged in MCA_DE{STAT,ADDR} so just check
* for a valid error.
@@ -832,7 +899,7 @@ static void amd_deferred_error_interrupt(void)
static void log_error_thresholding(unsigned int bank, u64 misc)
{
- _log_error_bank(bank, mca_msr_reg(bank, MCA_STATUS), mca_msr_reg(bank, MCA_ADDR), misc);
+ _log_error_deferred(bank, misc);
}
static void log_and_reset_block(struct threshold_block *block)
@@ -877,7 +944,7 @@ static void amd_threshold_interrupt(void)
return;
for (bank = 0; bank < this_cpu_read(mce_num_banks); ++bank) {
- if (!(per_cpu(bank_map, cpu) & (1 << bank)))
+ if (!(per_cpu(bank_map, cpu) & BIT_ULL(bank)))
continue;
first_block = bp[bank]->blocks;
@@ -1028,7 +1095,7 @@ static const struct sysfs_ops threshold_ops = {
static void threshold_block_release(struct kobject *kobj);
-static struct kobj_type threshold_ktype = {
+static const struct kobj_type threshold_ktype = {
.sysfs_ops = &threshold_ops,
.default_groups = default_groups,
.release = threshold_block_release,
@@ -1049,7 +1116,7 @@ static const char *get_name(unsigned int cpu, unsigned int bank, struct threshol
if (bank_type >= N_SMCA_BANK_TYPES)
return NULL;
- if (b && bank_type == SMCA_UMC) {
+ if (b && (bank_type == SMCA_UMC || bank_type == SMCA_UMC_V2)) {
if (b->block < ARRAY_SIZE(smca_umc_block_names))
return smca_umc_block_names[b->block];
return NULL;
@@ -1141,35 +1208,10 @@ out_free:
return err;
}
-static int __threshold_add_blocks(struct threshold_bank *b)
-{
- struct list_head *head = &b->blocks->miscj;
- struct threshold_block *pos = NULL;
- struct threshold_block *tmp = NULL;
- int err = 0;
-
- err = kobject_add(&b->blocks->kobj, b->kobj, b->blocks->kobj.name);
- if (err)
- return err;
-
- list_for_each_entry_safe(pos, tmp, head, miscj) {
-
- err = kobject_add(&pos->kobj, b->kobj, pos->kobj.name);
- if (err) {
- list_for_each_entry_safe_reverse(pos, tmp, head, miscj)
- kobject_del(&pos->kobj);
-
- return err;
- }
- }
- return err;
-}
-
static int threshold_create_bank(struct threshold_bank **bp, unsigned int cpu,
unsigned int bank)
{
struct device *dev = this_cpu_read(mce_device);
- struct amd_northbridge *nb = NULL;
struct threshold_bank *b = NULL;
const char *name = get_name(cpu, bank, NULL);
int err = 0;
@@ -1177,26 +1219,6 @@ static int threshold_create_bank(struct threshold_bank **bp, unsigned int cpu,
if (!dev)
return -ENODEV;
- if (is_shared_bank(bank)) {
- nb = node_to_amd_nb(topology_die_id(cpu));
-
- /* threshold descriptor already initialized on this node? */
- if (nb && nb->bank4) {
- /* yes, use it */
- b = nb->bank4;
- err = kobject_add(b->kobj, &dev->kobj, name);
- if (err)
- goto out;
-
- bp[bank] = b;
- refcount_inc(&b->cpus);
-
- err = __threshold_add_blocks(b);
-
- goto out;
- }
- }
-
b = kzalloc(sizeof(struct threshold_bank), GFP_KERNEL);
if (!b) {
err = -ENOMEM;
@@ -1210,17 +1232,6 @@ static int threshold_create_bank(struct threshold_bank **bp, unsigned int cpu,
goto out_free;
}
- if (is_shared_bank(bank)) {
- b->shared = 1;
- refcount_set(&b->cpus, 1);
-
- /* nb is already initialized, see above */
- if (nb) {
- WARN_ON(nb->bank4);
- nb->bank4 = b;
- }
- }
-
err = allocate_threshold_blocks(cpu, b, bank, 0, mca_msr_reg(bank, MCA_MISC));
if (err)
goto out_kobj;
@@ -1253,40 +1264,11 @@ static void deallocate_threshold_blocks(struct threshold_bank *bank)
kobject_put(&bank->blocks->kobj);
}
-static void __threshold_remove_blocks(struct threshold_bank *b)
-{
- struct threshold_block *pos = NULL;
- struct threshold_block *tmp = NULL;
-
- kobject_del(b->kobj);
-
- list_for_each_entry_safe(pos, tmp, &b->blocks->miscj, miscj)
- kobject_del(&pos->kobj);
-}
-
static void threshold_remove_bank(struct threshold_bank *bank)
{
- struct amd_northbridge *nb;
-
if (!bank->blocks)
goto out_free;
- if (!bank->shared)
- goto out_dealloc;
-
- if (!refcount_dec_and_test(&bank->cpus)) {
- __threshold_remove_blocks(bank);
- return;
- } else {
- /*
- * The last CPU on this node using the shared bank is going
- * away, remove that bank now.
- */
- nb = node_to_amd_nb(topology_die_id(smp_processor_id()));
- nb->bank4 = NULL;
- }
-
-out_dealloc:
deallocate_threshold_blocks(bank);
out_free:
@@ -1355,7 +1337,7 @@ int mce_threshold_create_device(unsigned int cpu)
return -ENOMEM;
for (bank = 0; bank < numbanks; ++bank) {
- if (!(this_cpu_read(bank_map) & (1 << bank)))
+ if (!(this_cpu_read(bank_map) & BIT_ULL(bank)))
continue;
err = threshold_create_bank(bp, cpu, bank);
if (err) {
diff --git a/arch/x86/kernel/cpu/mce/apei.c b/arch/x86/kernel/cpu/mce/apei.c
index 717192915f28..0a89947e47bc 100644
--- a/arch/x86/kernel/cpu/mce/apei.c
+++ b/arch/x86/kernel/cpu/mce/apei.c
@@ -28,35 +28,50 @@
void apei_mce_report_mem_error(int severity, struct cper_sec_mem_err *mem_err)
{
- struct mce m;
+ struct mce_hw_err err;
+ struct mce *m;
+ int lsb;
if (!(mem_err->validation_bits & CPER_MEM_VALID_PA))
return;
- mce_setup(&m);
- m.bank = -1;
+ /*
+ * Even if the ->validation_bits are set for address mask,
+ * to be extra safe, check and reject an error radius '0',
+ * and fall back to the default page size.
+ */
+ if (mem_err->validation_bits & CPER_MEM_VALID_PA_MASK)
+ lsb = find_first_bit((void *)&mem_err->physical_addr_mask, PAGE_SHIFT);
+ else
+ lsb = PAGE_SHIFT;
+
+ mce_prep_record(&err);
+ m = &err.m;
+ m->bank = -1;
/* Fake a memory read error with unknown channel */
- m.status = MCI_STATUS_VAL | MCI_STATUS_EN | MCI_STATUS_ADDRV | MCI_STATUS_MISCV | 0x9f;
- m.misc = (MCI_MISC_ADDR_PHYS << 6) | PAGE_SHIFT;
+ m->status = MCI_STATUS_VAL | MCI_STATUS_EN | MCI_STATUS_ADDRV | MCI_STATUS_MISCV | 0x9f;
+ m->misc = (MCI_MISC_ADDR_PHYS << 6) | lsb;
if (severity >= GHES_SEV_RECOVERABLE)
- m.status |= MCI_STATUS_UC;
+ m->status |= MCI_STATUS_UC;
if (severity >= GHES_SEV_PANIC) {
- m.status |= MCI_STATUS_PCC;
- m.tsc = rdtsc();
+ m->status |= MCI_STATUS_PCC;
+ m->tsc = rdtsc();
}
- m.addr = mem_err->physical_addr;
- mce_log(&m);
+ m->addr = mem_err->physical_addr;
+ mce_log(&err);
}
EXPORT_SYMBOL_GPL(apei_mce_report_mem_error);
int apei_smca_report_x86_error(struct cper_ia_proc_ctx *ctx_info, u64 lapic_id)
{
const u64 *i_mce = ((const u64 *) (ctx_info + 1));
- unsigned int cpu;
- struct mce m;
+ unsigned int cpu, num_regs;
+ bool apicid_found = false;
+ struct mce_hw_err err;
+ struct mce *m;
if (!boot_cpu_has(X86_FEATURE_SMCA))
return -EINVAL;
@@ -74,41 +89,86 @@ int apei_smca_report_x86_error(struct cper_ia_proc_ctx *ctx_info, u64 lapic_id)
return -EINVAL;
/*
- * The register array size must be large enough to include all the
- * SMCA registers which need to be extracted.
- *
* The number of registers in the register array is determined by
* Register Array Size/8 as defined in UEFI spec v2.8, sec N.2.4.2.2.
- * The register layout is fixed and currently the raw data in the
- * register array includes 6 SMCA registers which the kernel can
- * extract.
+ * Sanity-check registers array size.
*/
- if (ctx_info->reg_arr_size < 48)
+ num_regs = ctx_info->reg_arr_size >> 3;
+ if (!num_regs)
return -EINVAL;
- mce_setup(&m);
-
- m.extcpu = -1;
- m.socketid = -1;
-
for_each_possible_cpu(cpu) {
- if (cpu_data(cpu).initial_apicid == lapic_id) {
- m.extcpu = cpu;
- m.socketid = cpu_data(m.extcpu).phys_proc_id;
+ if (cpu_data(cpu).topo.initial_apicid == lapic_id) {
+ apicid_found = true;
break;
}
}
- m.apicid = lapic_id;
- m.bank = (ctx_info->msr_addr >> 4) & 0xFF;
- m.status = *i_mce;
- m.addr = *(i_mce + 1);
- m.misc = *(i_mce + 2);
- /* Skipping MCA_CONFIG */
- m.ipid = *(i_mce + 4);
- m.synd = *(i_mce + 5);
+ if (!apicid_found)
+ return -EINVAL;
+
+ m = &err.m;
+ memset(&err, 0, sizeof(struct mce_hw_err));
+ mce_prep_record_common(m);
+ mce_prep_record_per_cpu(cpu, m);
+
+ m->bank = (ctx_info->msr_addr >> 4) & 0xFF;
+
+ /*
+ * The SMCA register layout is fixed and includes 16 registers.
+ * The end of the array may be variable, but the beginning is known.
+ * Cap the number of registers to expected max (15).
+ */
+ if (num_regs > 15)
+ num_regs = 15;
+
+ switch (num_regs) {
+ /* MCA_SYND2 */
+ case 15:
+ err.vendor.amd.synd2 = *(i_mce + 14);
+ fallthrough;
+ /* MCA_SYND1 */
+ case 14:
+ err.vendor.amd.synd1 = *(i_mce + 13);
+ fallthrough;
+ /* MCA_MISC4 */
+ case 13:
+ /* MCA_MISC3 */
+ case 12:
+ /* MCA_MISC2 */
+ case 11:
+ /* MCA_MISC1 */
+ case 10:
+ /* MCA_DEADDR */
+ case 9:
+ /* MCA_DESTAT */
+ case 8:
+ /* reserved */
+ case 7:
+ /* MCA_SYND */
+ case 6:
+ m->synd = *(i_mce + 5);
+ fallthrough;
+ /* MCA_IPID */
+ case 5:
+ m->ipid = *(i_mce + 4);
+ fallthrough;
+ /* MCA_CONFIG */
+ case 4:
+ /* MCA_MISC0 */
+ case 3:
+ m->misc = *(i_mce + 2);
+ fallthrough;
+ /* MCA_ADDR */
+ case 2:
+ m->addr = *(i_mce + 1);
+ fallthrough;
+ /* MCA_STATUS */
+ case 1:
+ m->status = *i_mce;
+ }
- mce_log(&m);
+ mce_log(&err);
return 0;
}
diff --git a/arch/x86/kernel/cpu/mce/core.c b/arch/x86/kernel/cpu/mce/core.c
index 2c8ec5c71712..e9b3c5d4a52e 100644
--- a/arch/x86/kernel/cpu/mce/core.c
+++ b/arch/x86/kernel/cpu/mce/core.c
@@ -44,14 +44,17 @@
#include <linux/sync_core.h>
#include <linux/task_work.h>
#include <linux/hardirq.h>
+#include <linux/kexec.h>
-#include <asm/intel-family.h>
+#include <asm/fred.h>
+#include <asm/cpu_device_id.h>
#include <asm/processor.h>
#include <asm/traps.h>
#include <asm/tlbflush.h>
#include <asm/mce.h>
#include <asm/msr.h>
#include <asm/reboot.h>
+#include <asm/tdx.h>
#include "internal.h"
@@ -67,13 +70,7 @@ DEFINE_PER_CPU(unsigned, mce_exception_count);
DEFINE_PER_CPU_READ_MOSTLY(unsigned int, mce_num_banks);
-struct mce_bank {
- u64 ctl; /* subevents to enable */
-
- __u64 init : 1, /* initialise bank? */
- __reserved_1 : 63;
-};
-static DEFINE_PER_CPU_READ_MOSTLY(struct mce_bank[MAX_NR_BANKS], mce_banks_array);
+DEFINE_PER_CPU_READ_MOSTLY(struct mce_bank[MAX_NR_BANKS], mce_banks_array);
#define ATTR_LEN 16
/* One object for each MCE bank, shared by all CPUs */
@@ -91,7 +88,7 @@ struct mca_config mca_cfg __read_mostly = {
.monarch_timeout = -1
};
-static DEFINE_PER_CPU(struct mce, mces_seen);
+static DEFINE_PER_CPU(struct mce_hw_err, hw_errs_seen);
static unsigned long mce_need_notify;
/*
@@ -120,28 +117,41 @@ static struct irq_work mce_irq_work;
*/
BLOCKING_NOTIFIER_HEAD(x86_mce_decoder_chain);
-/* Do initial initialization of a struct mce */
-void mce_setup(struct mce *m)
+void mce_prep_record_common(struct mce *m)
{
- memset(m, 0, sizeof(struct mce));
- m->cpu = m->extcpu = smp_processor_id();
+ m->cpuid = cpuid_eax(1);
+ m->cpuvendor = boot_cpu_data.x86_vendor;
+ m->mcgcap = native_rdmsrq(MSR_IA32_MCG_CAP);
/* need the internal __ version to avoid deadlocks */
- m->time = __ktime_get_real_seconds();
- m->cpuvendor = boot_cpu_data.x86_vendor;
- m->cpuid = cpuid_eax(1);
- m->socketid = cpu_data(m->extcpu).phys_proc_id;
- m->apicid = cpu_data(m->extcpu).initial_apicid;
- m->mcgcap = __rdmsr(MSR_IA32_MCG_CAP);
- m->ppin = cpu_data(m->extcpu).ppin;
- m->microcode = boot_cpu_data.microcode;
+ m->time = __ktime_get_real_seconds();
+}
+
+void mce_prep_record_per_cpu(unsigned int cpu, struct mce *m)
+{
+ m->cpu = cpu;
+ m->extcpu = cpu;
+ m->apicid = cpu_data(cpu).topo.initial_apicid;
+ m->microcode = cpu_data(cpu).microcode;
+ m->ppin = topology_ppin(cpu);
+ m->socketid = topology_physical_package_id(cpu);
+}
+
+/* Do initial initialization of struct mce_hw_err */
+void mce_prep_record(struct mce_hw_err *err)
+{
+ struct mce *m = &err->m;
+
+ memset(err, 0, sizeof(struct mce_hw_err));
+ mce_prep_record_common(m);
+ mce_prep_record_per_cpu(smp_processor_id(), m);
}
DEFINE_PER_CPU(struct mce, injectm);
EXPORT_PER_CPU_SYMBOL_GPL(injectm);
-void mce_log(struct mce *m)
+void mce_log(struct mce_hw_err *err)
{
- if (!mce_gen_pool_add(m))
+ if (mce_gen_pool_add(err))
irq_work_queue(&mce_irq_work);
}
EXPORT_SYMBOL_GPL(mce_log);
@@ -162,8 +172,10 @@ void mce_unregister_decode_chain(struct notifier_block *nb)
}
EXPORT_SYMBOL_GPL(mce_unregister_decode_chain);
-static void __print_mce(struct mce *m)
+static void __print_mce(struct mce_hw_err *err)
{
+ struct mce *m = &err->m;
+
pr_emerg(HW_ERR "CPU %d: Machine Check%s: %Lx Bank %d: %016Lx\n",
m->extcpu,
(m->mcgstatus & MCG_STATUS_MCIP ? " Exception" : ""),
@@ -190,6 +202,10 @@ static void __print_mce(struct mce *m)
if (mce_flags.smca) {
if (m->synd)
pr_cont("SYND %llx ", m->synd);
+ if (err->vendor.amd.synd1)
+ pr_cont("SYND1 %llx ", err->vendor.amd.synd1);
+ if (err->vendor.amd.synd2)
+ pr_cont("SYND2 %llx ", err->vendor.amd.synd2);
if (m->ipid)
pr_cont("IPID %llx ", m->ipid);
}
@@ -205,9 +221,11 @@ static void __print_mce(struct mce *m)
m->microcode);
}
-static void print_mce(struct mce *m)
+static void print_mce(struct mce_hw_err *err)
{
- __print_mce(m);
+ struct mce *m = &err->m;
+
+ __print_mce(err);
if (m->cpuvendor != X86_VENDOR_AMD && m->cpuvendor != X86_VENDOR_HYGON)
pr_emerg_ratelimited(HW_ERR "Run the above through 'mcelog --ascii'\n");
@@ -234,11 +252,20 @@ static void wait_for_panic(void)
panic("Panicing machine check CPU died");
}
-static noinstr void mce_panic(const char *msg, struct mce *final, char *exp)
+static const char *mce_dump_aux_info(struct mce *m)
+{
+ if (boot_cpu_has_bug(X86_BUG_TDX_PW_MCE))
+ return tdx_dump_mce_info(m);
+
+ return NULL;
+}
+
+static noinstr void mce_panic(const char *msg, struct mce_hw_err *final, char *exp)
{
struct llist_node *pending;
struct mce_evt_llist *l;
int apei_err = 0;
+ const char *memmsg;
/*
* Allow instrumentation around external facilities usage. Not that it
@@ -264,20 +291,22 @@ static noinstr void mce_panic(const char *msg, struct mce *final, char *exp)
pending = mce_gen_pool_prepare_records();
/* First print corrected ones that are still unlogged */
llist_for_each_entry(l, pending, llnode) {
- struct mce *m = &l->mce;
+ struct mce_hw_err *err = &l->err;
+ struct mce *m = &err->m;
if (!(m->status & MCI_STATUS_UC)) {
- print_mce(m);
+ print_mce(err);
if (!apei_err)
apei_err = apei_write_mce(m);
}
}
/* Now print uncorrected but with the final one last */
llist_for_each_entry(l, pending, llnode) {
- struct mce *m = &l->mce;
+ struct mce_hw_err *err = &l->err;
+ struct mce *m = &err->m;
if (!(m->status & MCI_STATUS_UC))
continue;
- if (!final || mce_cmp(m, final)) {
- print_mce(m);
+ if (!final || mce_cmp(m, &final->m)) {
+ print_mce(err);
if (!apei_err)
apei_err = apei_write_mce(m);
}
@@ -285,13 +314,33 @@ static noinstr void mce_panic(const char *msg, struct mce *final, char *exp)
if (final) {
print_mce(final);
if (!apei_err)
- apei_err = apei_write_mce(final);
+ apei_err = apei_write_mce(&final->m);
}
if (exp)
pr_emerg(HW_ERR "Machine check: %s\n", exp);
+
+ memmsg = mce_dump_aux_info(&final->m);
+ if (memmsg)
+ pr_emerg(HW_ERR "Machine check: %s\n", memmsg);
+
if (!fake_panic) {
if (panic_timeout == 0)
panic_timeout = mca_cfg.panic_timeout;
+
+ /*
+ * Kdump skips the poisoned page in order to avoid
+ * touching the error bits again. Poison the page even
+ * if the error is fatal and the machine is about to
+ * panic.
+ */
+ if (kexec_crash_loaded()) {
+ if (final && (final->m.status & MCI_STATUS_ADDRV)) {
+ struct page *p;
+ p = pfn_to_online_page(final->m.addr >> PAGE_SHIFT);
+ if (p)
+ SetPageHWPoison(p);
+ }
+ }
panic(msg);
} else
pr_emerg(HW_ERR "Fake kernel panic: %s\n", msg);
@@ -339,9 +388,9 @@ void ex_handler_msr_mce(struct pt_regs *regs, bool wrmsr)
}
/* MSR access wrappers used for error injection */
-noinstr u64 mce_rdmsrl(u32 msr)
+noinstr u64 mce_rdmsrq(u32 msr)
{
- DECLARE_ARGS(val, low, high);
+ EAX_EDX_DECLARE_ARGS(val, low, high);
if (__this_cpu_read(injectm.finished)) {
int offset;
@@ -374,7 +423,7 @@ noinstr u64 mce_rdmsrl(u32 msr)
return EAX_EDX_VAL(val, low, high);
}
-static noinstr void mce_wrmsrl(u32 msr, u64 v)
+static noinstr void mce_wrmsrq(u32 msr, u64 v)
{
u32 low, high;
@@ -395,7 +444,7 @@ static noinstr void mce_wrmsrl(u32 msr, u64 v)
low = (u32)v;
high = (u32)(v >> 32);
- /* See comment in mce_rdmsrl() */
+ /* See comment in mce_rdmsrq() */
asm volatile("1: wrmsr\n"
"2:\n"
_ASM_EXTABLE_TYPE(1b, 2b, EX_TYPE_WRMSR_IN_MCE)
@@ -407,17 +456,19 @@ static noinstr void mce_wrmsrl(u32 msr, u64 v)
* check into our "mce" struct so that we can use it later to assess
* the severity of the problem as we read per-bank specific details.
*/
-static noinstr void mce_gather_info(struct mce *m, struct pt_regs *regs)
+static noinstr void mce_gather_info(struct mce_hw_err *err, struct pt_regs *regs)
{
+ struct mce *m;
/*
- * Enable instrumentation around mce_setup() which calls external
+ * Enable instrumentation around mce_prep_record() which calls external
* facilities.
*/
instrumentation_begin();
- mce_setup(m);
+ mce_prep_record(err);
instrumentation_end();
- m->mcgstatus = mce_rdmsrl(MSR_IA32_MCG_STATUS);
+ m = &err->m;
+ m->mcgstatus = mce_rdmsrq(MSR_IA32_MCG_STATUS);
if (regs) {
/*
* Get the address of the instruction at the time of
@@ -437,14 +488,14 @@ static noinstr void mce_gather_info(struct mce *m, struct pt_regs *regs)
}
/* Use accurate RIP reporting if available. */
if (mca_cfg.rip_msr)
- m->ip = mce_rdmsrl(mca_cfg.rip_msr);
+ m->ip = mce_rdmsrq(mca_cfg.rip_msr);
}
}
-int mce_available(struct cpuinfo_x86 *c)
+bool mce_available(struct cpuinfo_x86 *c)
{
if (mca_cfg.disabled)
- return 0;
+ return false;
return cpu_has(c, X86_FEATURE_MCE) && cpu_has(c, X86_FEATURE_MCA);
}
@@ -459,32 +510,22 @@ static void mce_irq_work_cb(struct irq_work *entry)
mce_schedule_work();
}
-/*
- * Check if the address reported by the CPU is in a format we can parse.
- * It would be possible to add code for most other cases, but all would
- * be somewhat complicated (e.g. segment offset would require an instruction
- * parser). So only support physical addresses up to page granularity for now.
- */
-int mce_usable_address(struct mce *m)
+bool mce_usable_address(struct mce *m)
{
if (!(m->status & MCI_STATUS_ADDRV))
- return 0;
-
- /* Checks after this one are Intel/Zhaoxin-specific: */
- if (boot_cpu_data.x86_vendor != X86_VENDOR_INTEL &&
- boot_cpu_data.x86_vendor != X86_VENDOR_ZHAOXIN)
- return 1;
-
- if (!(m->status & MCI_STATUS_MISCV))
- return 0;
+ return false;
- if (MCI_MISC_ADDR_LSB(m->misc) > PAGE_SHIFT)
- return 0;
+ switch (m->cpuvendor) {
+ case X86_VENDOR_AMD:
+ return amd_mce_usable_address(m);
- if (MCI_MISC_ADDR_MODE(m->misc) != MCI_MISC_ADDR_PHYS)
- return 0;
+ case X86_VENDOR_INTEL:
+ case X86_VENDOR_ZHAOXIN:
+ return intel_mce_usable_address(m);
- return 1;
+ default:
+ return true;
+ }
}
EXPORT_SYMBOL_GPL(mce_usable_address);
@@ -543,16 +584,38 @@ bool mce_is_correctable(struct mce *m)
}
EXPORT_SYMBOL_GPL(mce_is_correctable);
+/*
+ * Notify the user(s) about new machine check events.
+ * Can be called from interrupt context, but not from machine check/NMI
+ * context.
+ */
+static bool mce_notify_irq(void)
+{
+ /* Not more than two messages every minute */
+ static DEFINE_RATELIMIT_STATE(ratelimit, 60*HZ, 2);
+
+ if (test_and_clear_bit(0, &mce_need_notify)) {
+ mce_work_trigger();
+
+ if (__ratelimit(&ratelimit))
+ pr_info(HW_ERR "Machine check events logged\n");
+
+ return true;
+ }
+
+ return false;
+}
+
static int mce_early_notifier(struct notifier_block *nb, unsigned long val,
void *data)
{
- struct mce *m = (struct mce *)data;
+ struct mce_hw_err *err = to_mce_hw_err(data);
- if (!m)
+ if (!err)
return NOTIFY_DONE;
/* Emit the trace record: */
- trace_mce_record(m);
+ trace_mce_record(err);
set_bit(0, &mce_need_notify);
@@ -579,7 +642,7 @@ static int uc_decode_notifier(struct notifier_block *nb, unsigned long val,
mce->severity != MCE_DEFERRED_SEVERITY)
return NOTIFY_DONE;
- pfn = mce->addr >> PAGE_SHIFT;
+ pfn = (mce->addr & MCI_ADDR_PHYSADDR) >> PAGE_SHIFT;
if (!memory_failure(pfn, 0)) {
set_mce_nospec(pfn);
mce->kflags |= MCE_HANDLED_UC;
@@ -596,13 +659,13 @@ static struct notifier_block mce_uc_nb = {
static int mce_default_notifier(struct notifier_block *nb, unsigned long val,
void *data)
{
- struct mce *m = (struct mce *)data;
+ struct mce_hw_err *err = to_mce_hw_err(data);
- if (!m)
+ if (!err)
return NOTIFY_DONE;
- if (mca_cfg.print_all || !m->kflags)
- __print_mce(m);
+ if (mca_cfg.print_all || !(err->m.kflags))
+ __print_mce(err);
return NOTIFY_DONE;
}
@@ -616,13 +679,15 @@ static struct notifier_block mce_default_nb = {
/*
* Read ADDR and MISC registers.
*/
-static noinstr void mce_read_aux(struct mce *m, int i)
+static noinstr void mce_read_aux(struct mce_hw_err *err, int i)
{
+ struct mce *m = &err->m;
+
if (m->status & MCI_STATUS_MISCV)
- m->misc = mce_rdmsrl(mca_msr_reg(i, MCA_MISC));
+ m->misc = mce_rdmsrq(mca_msr_reg(i, MCA_MISC));
if (m->status & MCI_STATUS_ADDRV) {
- m->addr = mce_rdmsrl(mca_msr_reg(i, MCA_ADDR));
+ m->addr = mce_rdmsrq(mca_msr_reg(i, MCA_ADDR));
/*
* Mask the reported address by the reported granularity.
@@ -633,22 +698,17 @@ static noinstr void mce_read_aux(struct mce *m, int i)
m->addr <<= shift;
}
- /*
- * Extract [55:<lsb>] where lsb is the least significant
- * *valid* bit of the address bits.
- */
- if (mce_flags.smca) {
- u8 lsb = (m->addr >> 56) & 0x3f;
-
- m->addr &= GENMASK_ULL(55, lsb);
- }
+ smca_extract_err_addr(m);
}
if (mce_flags.smca) {
- m->ipid = mce_rdmsrl(MSR_AMD64_SMCA_MCx_IPID(i));
+ m->ipid = mce_rdmsrq(MSR_AMD64_SMCA_MCx_IPID(i));
- if (m->status & MCI_STATUS_SYNDV)
- m->synd = mce_rdmsrl(MSR_AMD64_SMCA_MCx_SYND(i));
+ if (m->status & MCI_STATUS_SYNDV) {
+ m->synd = mce_rdmsrq(MSR_AMD64_SMCA_MCx_SYND(i));
+ err->vendor.amd.synd1 = mce_rdmsrq(MSR_AMD64_SMCA_MCx_SYND1(i));
+ err->vendor.amd.synd2 = mce_rdmsrq(MSR_AMD64_SMCA_MCx_SYND2(i));
+ }
}
}
@@ -669,40 +729,51 @@ DEFINE_PER_CPU(unsigned, mce_poll_count);
* is already totally * confused. In this case it's likely it will
* not fully execute the machine check handler either.
*/
-bool machine_check_poll(enum mcp_flags flags, mce_banks_t *b)
+void machine_check_poll(enum mcp_flags flags, mce_banks_t *b)
{
struct mce_bank *mce_banks = this_cpu_ptr(mce_banks_array);
- bool error_seen = false;
- struct mce m;
+ struct mce_hw_err err;
+ struct mce *m;
int i;
this_cpu_inc(mce_poll_count);
- mce_gather_info(&m, NULL);
+ mce_gather_info(&err, NULL);
+ m = &err.m;
if (flags & MCP_TIMESTAMP)
- m.tsc = rdtsc();
+ m->tsc = rdtsc();
for (i = 0; i < this_cpu_read(mce_num_banks); i++) {
if (!mce_banks[i].ctl || !test_bit(i, *b))
continue;
- m.misc = 0;
- m.addr = 0;
- m.bank = i;
+ m->misc = 0;
+ m->addr = 0;
+ m->bank = i;
barrier();
- m.status = mce_rdmsrl(mca_msr_reg(i, MCA_STATUS));
+ m->status = mce_rdmsrq(mca_msr_reg(i, MCA_STATUS));
+
+ /*
+ * Update storm tracking here, before checking for the
+ * MCI_STATUS_VAL bit. Valid corrected errors count
+ * towards declaring, or maintaining, storm status. No
+ * error in a bank counts towards avoiding, or ending,
+ * storm status.
+ */
+ if (!mca_cfg.cmci_disabled)
+ mce_track_storm(m);
/* If this entry is not valid, ignore it */
- if (!(m.status & MCI_STATUS_VAL))
+ if (!(m->status & MCI_STATUS_VAL))
continue;
/*
* If we are logging everything (at CPU online) or this
* is a corrected error, then we must log it.
*/
- if ((flags & MCP_UC) || !(m.status & MCI_STATUS_UC))
+ if ((flags & MCP_UC) || !(m->status & MCI_STATUS_UC))
goto log_it;
/*
@@ -712,20 +783,20 @@ bool machine_check_poll(enum mcp_flags flags, mce_banks_t *b)
* everything else.
*/
if (!mca_cfg.ser) {
- if (m.status & MCI_STATUS_UC)
+ if (m->status & MCI_STATUS_UC)
continue;
goto log_it;
}
/* Log "not enabled" (speculative) errors */
- if (!(m.status & MCI_STATUS_EN))
+ if (!(m->status & MCI_STATUS_EN))
goto log_it;
/*
* Log UCNA (SDM: 15.6.3 "UCR Error Classification")
* UC == 1 && PCC == 0 && S == 0
*/
- if (!(m.status & MCI_STATUS_PCC) && !(m.status & MCI_STATUS_S))
+ if (!(m->status & MCI_STATUS_PCC) && !(m->status & MCI_STATUS_S))
goto log_it;
/*
@@ -736,31 +807,29 @@ bool machine_check_poll(enum mcp_flags flags, mce_banks_t *b)
continue;
log_it:
- error_seen = true;
-
if (flags & MCP_DONTLOG)
goto clear_it;
- mce_read_aux(&m, i);
- m.severity = mce_severity(&m, NULL, NULL, false);
+ mce_read_aux(&err, i);
+ m->severity = mce_severity(m, NULL, NULL, false);
/*
* Don't get the IP here because it's unlikely to
* have anything to do with the actual error location.
*/
- if (mca_cfg.dont_log_ce && !mce_usable_address(&m))
+ if (mca_cfg.dont_log_ce && !mce_usable_address(m))
goto clear_it;
if (flags & MCP_QUEUE_LOG)
- mce_gen_pool_add(&m);
+ mce_gen_pool_add(&err);
else
- mce_log(&m);
+ mce_log(&err);
clear_it:
/*
* Clear state for this bank.
*/
- mce_wrmsrl(mca_msr_reg(i, MCA_STATUS), 0);
+ mce_wrmsrq(mca_msr_reg(i, MCA_STATUS), 0);
}
/*
@@ -769,8 +838,6 @@ clear_it:
*/
sync_core();
-
- return error_seen;
}
EXPORT_SYMBOL_GPL(machine_check_poll);
@@ -820,8 +887,8 @@ quirk_sandybridge_ifu(int bank, struct mce *m, struct pt_regs *regs)
*/
static noinstr bool quirk_skylake_repmov(void)
{
- u64 mcgstatus = mce_rdmsrl(MSR_IA32_MCG_STATUS);
- u64 misc_enable = mce_rdmsrl(MSR_IA32_MISC_ENABLE);
+ u64 mcgstatus = mce_rdmsrq(MSR_IA32_MCG_STATUS);
+ u64 misc_enable = mce_rdmsrq(MSR_IA32_MISC_ENABLE);
u64 mc1_status;
/*
@@ -832,7 +899,7 @@ static noinstr bool quirk_skylake_repmov(void)
!(misc_enable & MSR_IA32_MISC_ENABLE_FAST_STRING))
return false;
- mc1_status = mce_rdmsrl(MSR_IA32_MCx_STATUS(1));
+ mc1_status = mce_rdmsrq(MSR_IA32_MCx_STATUS(1));
/* Check for a software-recoverable data fetch error. */
if ((mc1_status &
@@ -843,8 +910,8 @@ static noinstr bool quirk_skylake_repmov(void)
MCI_STATUS_ADDRV | MCI_STATUS_MISCV |
MCI_STATUS_AR | MCI_STATUS_S)) {
misc_enable &= ~MSR_IA32_MISC_ENABLE_FAST_STRING;
- mce_wrmsrl(MSR_IA32_MISC_ENABLE, misc_enable);
- mce_wrmsrl(MSR_IA32_MCx_STATUS(1), 0);
+ mce_wrmsrq(MSR_IA32_MISC_ENABLE, misc_enable);
+ mce_wrmsrq(MSR_IA32_MCx_STATUS(1), 0);
instrumentation_begin();
pr_err_once("Erratum detected, disable fast string copy instructions.\n");
@@ -857,17 +924,38 @@ static noinstr bool quirk_skylake_repmov(void)
}
/*
+ * Some Zen-based Instruction Fetch Units set EIPV=RIPV=0 on poison consumption
+ * errors. This means mce_gather_info() will not save the "ip" and "cs" registers.
+ *
+ * However, the context is still valid, so save the "cs" register for later use.
+ *
+ * The "ip" register is truly unknown, so don't save it or fixup EIPV/RIPV.
+ *
+ * The Instruction Fetch Unit is at MCA bank 1 for all affected systems.
+ */
+static __always_inline void quirk_zen_ifu(int bank, struct mce *m, struct pt_regs *regs)
+{
+ if (bank != 1)
+ return;
+ if (!(m->status & MCI_STATUS_POISON))
+ return;
+
+ m->cs = regs->cs;
+}
+
+/*
* Do a quick check if any of the events requires a panic.
* This decides if we keep the events around or clear them.
*/
-static __always_inline int mce_no_way_out(struct mce *m, char **msg, unsigned long *validp,
+static __always_inline int mce_no_way_out(struct mce_hw_err *err, char **msg, unsigned long *validp,
struct pt_regs *regs)
{
+ struct mce *m = &err->m;
char *tmp = *msg;
int i;
for (i = 0; i < this_cpu_read(mce_num_banks); i++) {
- m->status = mce_rdmsrl(mca_msr_reg(i, MCA_STATUS));
+ m->status = mce_rdmsrq(mca_msr_reg(i, MCA_STATUS));
if (!(m->status & MCI_STATUS_VAL))
continue;
@@ -875,9 +963,12 @@ static __always_inline int mce_no_way_out(struct mce *m, char **msg, unsigned lo
if (mce_flags.snb_ifu_quirk)
quirk_sandybridge_ifu(i, m, regs);
+ if (mce_flags.zen_ifu_quirk)
+ quirk_zen_ifu(i, m, regs);
+
m->bank = i;
if (mce_severity(m, regs, &tmp, true) >= MCE_PANIC_SEVERITY) {
- mce_read_aux(m, i);
+ mce_read_aux(err, i);
*msg = tmp;
return 1;
}
@@ -968,10 +1059,11 @@ out:
*/
static void mce_reign(void)
{
- int cpu;
+ struct mce_hw_err *err = NULL;
struct mce *m = NULL;
int global_worst = 0;
char *msg = NULL;
+ int cpu;
/*
* This CPU is the Monarch and the other CPUs have run
@@ -979,11 +1071,13 @@ static void mce_reign(void)
* Grade the severity of the errors of all the CPUs.
*/
for_each_possible_cpu(cpu) {
- struct mce *mtmp = &per_cpu(mces_seen, cpu);
+ struct mce_hw_err *etmp = &per_cpu(hw_errs_seen, cpu);
+ struct mce *mtmp = &etmp->m;
if (mtmp->severity > global_worst) {
global_worst = mtmp->severity;
- m = &per_cpu(mces_seen, cpu);
+ err = &per_cpu(hw_errs_seen, cpu);
+ m = &err->m;
}
}
@@ -995,7 +1089,7 @@ static void mce_reign(void)
if (m && global_worst >= MCE_PANIC_SEVERITY) {
/* call mce_severity() to get "msg" for panic */
mce_severity(m, NULL, &msg, true);
- mce_panic("Fatal machine check", m, msg);
+ mce_panic("Fatal machine check", err, msg);
}
/*
@@ -1012,11 +1106,11 @@ static void mce_reign(void)
mce_panic("Fatal machine check from unknown source", NULL, NULL);
/*
- * Now clear all the mces_seen so that they don't reappear on
+ * Now clear all the hw_errs_seen so that they don't reappear on
* the next mce.
*/
for_each_possible_cpu(cpu)
- memset(&per_cpu(mces_seen, cpu), 0, sizeof(struct mce));
+ memset(&per_cpu(hw_errs_seen, cpu), 0, sizeof(struct mce_hw_err));
}
static atomic_t global_nwo;
@@ -1036,12 +1130,12 @@ static noinstr int mce_start(int *no_way_out)
if (!timeout)
return ret;
- arch_atomic_add(*no_way_out, &global_nwo);
+ raw_atomic_add(*no_way_out, &global_nwo);
/*
* Rely on the implied barrier below, such that global_nwo
* is updated before mce_callin.
*/
- order = arch_atomic_inc_return(&mce_callin);
+ order = raw_atomic_inc_return(&mce_callin);
arch_cpumask_clear_cpu(smp_processor_id(), &mce_missing_cpus);
/* Enable instrumentation around calls to external facilities */
@@ -1050,10 +1144,10 @@ static noinstr int mce_start(int *no_way_out)
/*
* Wait for everyone.
*/
- while (arch_atomic_read(&mce_callin) != num_online_cpus()) {
+ while (raw_atomic_read(&mce_callin) != num_online_cpus()) {
if (mce_timed_out(&timeout,
"Timeout: Not all CPUs entered broadcast exception handler")) {
- arch_atomic_set(&global_nwo, 0);
+ raw_atomic_set(&global_nwo, 0);
goto out;
}
ndelay(SPINUNIT);
@@ -1068,7 +1162,7 @@ static noinstr int mce_start(int *no_way_out)
/*
* Monarch: Starts executing now, the others wait.
*/
- arch_atomic_set(&mce_executing, 1);
+ raw_atomic_set(&mce_executing, 1);
} else {
/*
* Subject: Now start the scanning loop one by one in
@@ -1076,10 +1170,10 @@ static noinstr int mce_start(int *no_way_out)
* This way when there are any shared banks it will be
* only seen by one CPU before cleared, avoiding duplicates.
*/
- while (arch_atomic_read(&mce_executing) < order) {
+ while (raw_atomic_read(&mce_executing) < order) {
if (mce_timed_out(&timeout,
"Timeout: Subject CPUs unable to finish machine check processing")) {
- arch_atomic_set(&global_nwo, 0);
+ raw_atomic_set(&global_nwo, 0);
goto out;
}
ndelay(SPINUNIT);
@@ -1089,7 +1183,7 @@ static noinstr int mce_start(int *no_way_out)
/*
* Cache the global no_way_out state.
*/
- *no_way_out = arch_atomic_read(&global_nwo);
+ *no_way_out = raw_atomic_read(&global_nwo);
ret = order;
@@ -1180,7 +1274,7 @@ static __always_inline void mce_clear_state(unsigned long *toclear)
for (i = 0; i < this_cpu_read(mce_num_banks); i++) {
if (arch_test_bit(i, toclear))
- mce_wrmsrl(mca_msr_reg(i, MCA_STATUS), 0);
+ mce_wrmsrq(mca_msr_reg(i, MCA_STATUS), 0);
}
}
@@ -1204,7 +1298,7 @@ static noinstr bool mce_check_crashing_cpu(void)
(crashing_cpu != -1 && crashing_cpu != cpu)) {
u64 mcgstatus;
- mcgstatus = __rdmsr(MSR_IA32_MCG_STATUS);
+ mcgstatus = native_rdmsrq(MSR_IA32_MCG_STATUS);
if (boot_cpu_data.x86_vendor == X86_VENDOR_ZHAOXIN) {
if (mcgstatus & MCG_STATUS_LMCES)
@@ -1212,7 +1306,7 @@ static noinstr bool mce_check_crashing_cpu(void)
}
if (mcgstatus & MCG_STATUS_RIPV) {
- __wrmsr(MSR_IA32_MCG_STATUS, 0, 0);
+ native_wrmsrq(MSR_IA32_MCG_STATUS, 0);
return true;
}
}
@@ -1220,13 +1314,14 @@ static noinstr bool mce_check_crashing_cpu(void)
}
static __always_inline int
-__mc_scan_banks(struct mce *m, struct pt_regs *regs, struct mce *final,
- unsigned long *toclear, unsigned long *valid_banks, int no_way_out,
- int *worst)
+__mc_scan_banks(struct mce_hw_err *err, struct pt_regs *regs,
+ struct mce_hw_err *final, unsigned long *toclear,
+ unsigned long *valid_banks, int no_way_out, int *worst)
{
struct mce_bank *mce_banks = this_cpu_ptr(mce_banks_array);
struct mca_config *cfg = &mca_cfg;
int severity, i, taint = 0;
+ struct mce *m = &err->m;
for (i = 0; i < this_cpu_read(mce_num_banks); i++) {
arch___clear_bit(i, toclear);
@@ -1240,7 +1335,7 @@ __mc_scan_banks(struct mce *m, struct pt_regs *regs, struct mce *final,
m->addr = 0;
m->bank = i;
- m->status = mce_rdmsrl(mca_msr_reg(i, MCA_STATUS));
+ m->status = mce_rdmsrq(mca_msr_reg(i, MCA_STATUS));
if (!(m->status & MCI_STATUS_VAL))
continue;
@@ -1271,7 +1366,7 @@ __mc_scan_banks(struct mce *m, struct pt_regs *regs, struct mce *final,
if (severity == MCE_NO_SEVERITY)
continue;
- mce_read_aux(m, i);
+ mce_read_aux(err, i);
/* assuming valid severity level != 0 */
m->severity = severity;
@@ -1281,17 +1376,17 @@ __mc_scan_banks(struct mce *m, struct pt_regs *regs, struct mce *final,
* done in #MC context, where instrumentation is disabled.
*/
instrumentation_begin();
- mce_log(m);
+ mce_log(err);
instrumentation_end();
if (severity > *worst) {
- *final = *m;
+ *final = *err;
*worst = severity;
}
}
/* mce_clear_state will clear *final, save locally for use later */
- *m = *final;
+ *err = *final;
return taint;
}
@@ -1308,6 +1403,7 @@ static void kill_me_maybe(struct callback_head *cb)
{
struct task_struct *p = container_of(cb, struct task_struct, mce_kill_me);
int flags = MF_ACTION_REQUIRED;
+ unsigned long pfn;
int ret;
p->mce_count = 0;
@@ -1316,9 +1412,10 @@ static void kill_me_maybe(struct callback_head *cb)
if (!p->mce_ripv)
flags |= MF_MUST_KILL;
- ret = memory_failure(p->mce_addr >> PAGE_SHIFT, flags);
+ pfn = (p->mce_addr & MCI_ADDR_PHYSADDR) >> PAGE_SHIFT;
+ ret = memory_failure(pfn, flags);
if (!ret) {
- set_mce_nospec(p->mce_addr >> PAGE_SHIFT);
+ set_mce_nospec(pfn);
sync_core();
return;
}
@@ -1340,16 +1437,19 @@ static void kill_me_maybe(struct callback_head *cb)
static void kill_me_never(struct callback_head *cb)
{
struct task_struct *p = container_of(cb, struct task_struct, mce_kill_me);
+ unsigned long pfn;
p->mce_count = 0;
pr_err("Kernel accessed poison in user space at %llx\n", p->mce_addr);
- if (!memory_failure(p->mce_addr >> PAGE_SHIFT, 0))
- set_mce_nospec(p->mce_addr >> PAGE_SHIFT);
+ pfn = (p->mce_addr & MCI_ADDR_PHYSADDR) >> PAGE_SHIFT;
+ if (!memory_failure(pfn, 0))
+ set_mce_nospec(pfn);
}
-static void queue_task_work(struct mce *m, char *msg, void (*func)(struct callback_head *))
+static void queue_task_work(struct mce_hw_err *err, char *msg, void (*func)(struct callback_head *))
{
int count = ++current->mce_count;
+ struct mce *m = &err->m;
/* First call, save all the details */
if (count == 1) {
@@ -1362,11 +1462,12 @@ static void queue_task_work(struct mce *m, char *msg, void (*func)(struct callba
/* Ten is likely overkill. Don't expect more than two faults before task_work() */
if (count > 10)
- mce_panic("Too many consecutive machine checks while accessing user data", m, msg);
+ mce_panic("Too many consecutive machine checks while accessing user data",
+ err, msg);
/* Second or later call, make sure page address matches the one from first call */
if (count > 1 && (current->mce_addr >> PAGE_SHIFT) != (m->addr >> PAGE_SHIFT))
- mce_panic("Consecutive machine checks to different user pages", m, msg);
+ mce_panic("Consecutive machine checks to different user pages", err, msg);
/* Do not call task_work_add() more than once */
if (count > 1)
@@ -1415,8 +1516,10 @@ noinstr void do_machine_check(struct pt_regs *regs)
int worst = 0, order, no_way_out, kill_current_task, lmce, taint = 0;
DECLARE_BITMAP(valid_banks, MAX_NR_BANKS) = { 0 };
DECLARE_BITMAP(toclear, MAX_NR_BANKS) = { 0 };
- struct mce m, *final;
+ struct mce_hw_err *final;
+ struct mce_hw_err err;
char *msg = NULL;
+ struct mce *m;
if (unlikely(mce_flags.p5))
return pentium_machine_check(regs);
@@ -1454,13 +1557,14 @@ noinstr void do_machine_check(struct pt_regs *regs)
this_cpu_inc(mce_exception_count);
- mce_gather_info(&m, regs);
- m.tsc = rdtsc();
+ mce_gather_info(&err, regs);
+ m = &err.m;
+ m->tsc = rdtsc();
- final = this_cpu_ptr(&mces_seen);
- *final = m;
+ final = this_cpu_ptr(&hw_errs_seen);
+ *final = err;
- no_way_out = mce_no_way_out(&m, &msg, valid_banks, regs);
+ no_way_out = mce_no_way_out(&err, &msg, valid_banks, regs);
barrier();
@@ -1469,15 +1573,15 @@ noinstr void do_machine_check(struct pt_regs *regs)
* Assume the worst for now, but if we find the
* severity is MCE_AR_SEVERITY we have other options.
*/
- if (!(m.mcgstatus & MCG_STATUS_RIPV))
+ if (!(m->mcgstatus & MCG_STATUS_RIPV))
kill_current_task = 1;
/*
* Check if this MCE is signaled to only this logical processor,
* on Intel, Zhaoxin only.
*/
- if (m.cpuvendor == X86_VENDOR_INTEL ||
- m.cpuvendor == X86_VENDOR_ZHAOXIN)
- lmce = m.mcgstatus & MCG_STATUS_LMCES;
+ if (m->cpuvendor == X86_VENDOR_INTEL ||
+ m->cpuvendor == X86_VENDOR_ZHAOXIN)
+ lmce = m->mcgstatus & MCG_STATUS_LMCES;
/*
* Local machine check may already know that we have to panic.
@@ -1488,12 +1592,12 @@ noinstr void do_machine_check(struct pt_regs *regs)
*/
if (lmce) {
if (no_way_out)
- mce_panic("Fatal local machine check", &m, msg);
+ mce_panic("Fatal local machine check", &err, msg);
} else {
order = mce_start(&no_way_out);
}
- taint = __mc_scan_banks(&m, regs, final, toclear, valid_banks, no_way_out, &worst);
+ taint = __mc_scan_banks(&err, regs, final, toclear, valid_banks, no_way_out, &worst);
if (!no_way_out)
mce_clear_state(toclear);
@@ -1508,7 +1612,7 @@ noinstr void do_machine_check(struct pt_regs *regs)
no_way_out = worst >= MCE_PANIC_SEVERITY;
if (no_way_out)
- mce_panic("Fatal machine check on current CPU", &m, msg);
+ mce_panic("Fatal machine check on current CPU", &err, msg);
}
} else {
/*
@@ -1520,8 +1624,8 @@ noinstr void do_machine_check(struct pt_regs *regs)
* make sure we have the right "msg".
*/
if (worst >= MCE_PANIC_SEVERITY) {
- mce_severity(&m, regs, &msg, true);
- mce_panic("Local fatal machine check!", &m, msg);
+ mce_severity(m, regs, &msg, true);
+ mce_panic("Local fatal machine check!", &err, msg);
}
}
@@ -1539,15 +1643,33 @@ noinstr void do_machine_check(struct pt_regs *regs)
goto out;
/* Fault was in user mode and we need to take some action */
- if ((m.cs & 3) == 3) {
+ if ((m->cs & 3) == 3) {
/* If this triggers there is no way to recover. Die hard. */
BUG_ON(!on_thread_stack() || !user_mode(regs));
- if (kill_current_task)
- queue_task_work(&m, msg, kill_me_now);
+ if (!mce_usable_address(m))
+ queue_task_work(&err, msg, kill_me_now);
else
- queue_task_work(&m, msg, kill_me_maybe);
+ queue_task_work(&err, msg, kill_me_maybe);
+ } else if (m->mcgstatus & MCG_STATUS_SEAM_NR) {
+ /*
+ * Saved RIP on stack makes it look like the machine check
+ * was taken in the kernel on the instruction following
+ * the entry to SEAM mode. But MCG_STATUS_SEAM_NR indicates
+ * that the machine check was taken inside SEAM non-root
+ * mode. CPU core has already marked that guest as dead.
+ * It is OK for the kernel to resume execution at the
+ * apparent point of the machine check as the fault did
+ * not occur there. Mark the page as poisoned so it won't
+ * be added to free list when the guest is terminated.
+ */
+ if (mce_usable_address(m)) {
+ struct page *p = pfn_to_online_page(m->addr >> PAGE_SHIFT);
+
+ if (p)
+ SetPageHWPoison(p);
+ }
} else {
/*
* Handle an MCE which has happened in kernel space but from
@@ -1558,20 +1680,20 @@ noinstr void do_machine_check(struct pt_regs *regs)
* corresponding exception handler which would do that is the
* proper one.
*/
- if (m.kflags & MCE_IN_KERNEL_RECOV) {
+ if (m->kflags & MCE_IN_KERNEL_RECOV) {
if (!fixup_exception(regs, X86_TRAP_MC, 0, 0))
- mce_panic("Failed kernel mode recovery", &m, msg);
+ mce_panic("Failed kernel mode recovery", &err, msg);
}
- if (m.kflags & MCE_IN_KERNEL_COPYIN)
- queue_task_work(&m, msg, kill_me_never);
+ if (m->kflags & MCE_IN_KERNEL_COPYIN)
+ queue_task_work(&err, msg, kill_me_never);
}
out:
instrumentation_end();
clear:
- mce_wrmsrl(MSR_IA32_MCG_STATUS, 0);
+ mce_wrmsrq(MSR_IA32_MCG_STATUS, 0);
}
EXPORT_SYMBOL_GPL(do_machine_check);
@@ -1598,13 +1720,6 @@ static unsigned long check_interval = INITIAL_CHECK_INTERVAL;
static DEFINE_PER_CPU(unsigned long, mce_next_interval); /* in jiffies */
static DEFINE_PER_CPU(struct timer_list, mce_timer);
-static unsigned long mce_adjust_timer_default(unsigned long interval)
-{
- return interval;
-}
-
-static unsigned long (*mce_adjust_timer)(unsigned long interval) = mce_adjust_timer_default;
-
static void __start_timer(struct timer_list *t, unsigned long interval)
{
unsigned long when = jiffies + interval;
@@ -1618,6 +1733,13 @@ static void __start_timer(struct timer_list *t, unsigned long interval)
local_irq_restore(flags);
}
+static void mc_poll_banks_default(void)
+{
+ machine_check_poll(0, this_cpu_ptr(&mce_poll_banks));
+}
+
+void (*mc_poll_banks)(void) = mc_poll_banks_default;
+
static void mce_timer_fn(struct timer_list *t)
{
struct timer_list *cpu_t = this_cpu_ptr(&mce_timer);
@@ -1627,14 +1749,8 @@ static void mce_timer_fn(struct timer_list *t)
iv = __this_cpu_read(mce_next_interval);
- if (mce_available(this_cpu_ptr(&cpu_info))) {
- machine_check_poll(0, this_cpu_ptr(&mce_poll_banks));
-
- if (mce_intel_cmci_poll()) {
- iv = mce_adjust_timer(iv);
- goto done;
- }
- }
+ if (mce_available(this_cpu_ptr(&cpu_info)))
+ mc_poll_banks();
/*
* Alert userspace if needed. If we logged an MCE, reduce the polling
@@ -1645,55 +1761,39 @@ static void mce_timer_fn(struct timer_list *t)
else
iv = min(iv * 2, round_jiffies_relative(check_interval * HZ));
-done:
- __this_cpu_write(mce_next_interval, iv);
- __start_timer(t, iv);
+ if (mce_get_storm_mode()) {
+ __start_timer(t, HZ);
+ } else {
+ __this_cpu_write(mce_next_interval, iv);
+ __start_timer(t, iv);
+ }
}
/*
- * Ensure that the timer is firing in @interval from now.
+ * When a storm starts on any bank on this CPU, switch to polling
+ * once per second. When the storm ends, revert to the default
+ * polling interval.
*/
-void mce_timer_kick(unsigned long interval)
+void mce_timer_kick(bool storm)
{
struct timer_list *t = this_cpu_ptr(&mce_timer);
- unsigned long iv = __this_cpu_read(mce_next_interval);
- __start_timer(t, interval);
+ mce_set_storm_mode(storm);
- if (interval < iv)
- __this_cpu_write(mce_next_interval, interval);
+ if (storm)
+ __start_timer(t, HZ);
+ else
+ __this_cpu_write(mce_next_interval, check_interval * HZ);
}
-/* Must not be called in IRQ context where del_timer_sync() can deadlock */
+/* Must not be called in IRQ context where timer_delete_sync() can deadlock */
static void mce_timer_delete_all(void)
{
int cpu;
for_each_online_cpu(cpu)
- del_timer_sync(&per_cpu(mce_timer, cpu));
-}
-
-/*
- * Notify the user(s) about new machine check events.
- * Can be called from interrupt context, but not from machine check/NMI
- * context.
- */
-int mce_notify_irq(void)
-{
- /* Not more than two messages every minute */
- static DEFINE_RATELIMIT_STATE(ratelimit, 60*HZ, 2);
-
- if (test_and_clear_bit(0, &mce_need_notify)) {
- mce_work_trigger();
-
- if (__ratelimit(&ratelimit))
- pr_info(HW_ERR "Machine check events logged\n");
-
- return 1;
- }
- return 0;
+ timer_delete_sync(&per_cpu(mce_timer, cpu));
}
-EXPORT_SYMBOL_GPL(mce_notify_irq);
static void __mcheck_cpu_mce_banks_init(void)
{
@@ -1722,7 +1822,7 @@ static void __mcheck_cpu_cap_init(void)
u64 cap;
u8 b;
- rdmsrl(MSR_IA32_MCG_CAP, cap);
+ rdmsrq(MSR_IA32_MCG_CAP, cap);
b = cap & MCG_BANKCNT_MASK;
@@ -1763,7 +1863,7 @@ static void __mcheck_cpu_init_generic(void)
cr4_set_bits(X86_CR4_MCE);
- rdmsrl(MSR_IA32_MCG_CAP, cap);
+ rdmsrq(MSR_IA32_MCG_CAP, cap);
if (cap & MCG_CTL_P)
wrmsr(MSR_IA32_MCG_CTL, 0xffffffff, 0xffffffff);
}
@@ -1778,8 +1878,8 @@ static void __mcheck_cpu_init_clear_banks(void)
if (!b->init)
continue;
- wrmsrl(mca_msr_reg(i, MCA_CTL), b->ctl);
- wrmsrl(mca_msr_reg(i, MCA_STATUS), 0);
+ wrmsrq(mca_msr_reg(i, MCA_CTL), b->ctl);
+ wrmsrq(mca_msr_reg(i, MCA_STATUS), 0);
}
}
@@ -1805,103 +1905,125 @@ static void __mcheck_cpu_check_banks(void)
if (!b->init)
continue;
- rdmsrl(mca_msr_reg(i, MCA_CTL), msrval);
+ rdmsrq(mca_msr_reg(i, MCA_CTL), msrval);
b->init = !!msrval;
}
}
-/* Add per CPU specific workarounds here */
-static int __mcheck_cpu_apply_quirks(struct cpuinfo_x86 *c)
+static void apply_quirks_amd(struct cpuinfo_x86 *c)
{
struct mce_bank *mce_banks = this_cpu_ptr(mce_banks_array);
- struct mca_config *cfg = &mca_cfg;
-
- if (c->x86_vendor == X86_VENDOR_UNKNOWN) {
- pr_info("unknown CPU type - not enabling MCE support\n");
- return -EOPNOTSUPP;
- }
/* This should be disabled by the BIOS, but isn't always */
- if (c->x86_vendor == X86_VENDOR_AMD) {
- if (c->x86 == 15 && this_cpu_read(mce_num_banks) > 4) {
- /*
- * disable GART TBL walk error reporting, which
- * trips off incorrectly with the IOMMU & 3ware
- * & Cerberus:
- */
- clear_bit(10, (unsigned long *)&mce_banks[4].ctl);
- }
- if (c->x86 < 0x11 && cfg->bootlog < 0) {
- /*
- * Lots of broken BIOS around that don't clear them
- * by default and leave crap in there. Don't log:
- */
- cfg->bootlog = 0;
- }
+ if (c->x86 == 15 && this_cpu_read(mce_num_banks) > 4) {
/*
- * Various K7s with broken bank 0 around. Always disable
- * by default.
+ * disable GART TBL walk error reporting, which
+ * trips off incorrectly with the IOMMU & 3ware
+ * & Cerberus:
*/
- if (c->x86 == 6 && this_cpu_read(mce_num_banks) > 0)
- mce_banks[0].ctl = 0;
+ clear_bit(10, (unsigned long *)&mce_banks[4].ctl);
+ }
+ if (c->x86 < 0x11 && mca_cfg.bootlog < 0) {
/*
- * overflow_recov is supported for F15h Models 00h-0fh
- * even though we don't have a CPUID bit for it.
+ * Lots of broken BIOS around that don't clear them
+ * by default and leave crap in there. Don't log:
*/
- if (c->x86 == 0x15 && c->x86_model <= 0xf)
- mce_flags.overflow_recov = 1;
-
+ mca_cfg.bootlog = 0;
}
- if (c->x86_vendor == X86_VENDOR_INTEL) {
- /*
- * SDM documents that on family 6 bank 0 should not be written
- * because it aliases to another special BIOS controlled
- * register.
- * But it's not aliased anymore on model 0x1a+
- * Don't ignore bank 0 completely because there could be a
- * valid event later, merely don't write CTL0.
- */
+ /*
+ * Various K7s with broken bank 0 around. Always disable
+ * by default.
+ */
+ if (c->x86 == 6 && this_cpu_read(mce_num_banks))
+ mce_banks[0].ctl = 0;
- if (c->x86 == 6 && c->x86_model < 0x1A && this_cpu_read(mce_num_banks) > 0)
- mce_banks[0].init = false;
+ /*
+ * overflow_recov is supported for F15h Models 00h-0fh
+ * even though we don't have a CPUID bit for it.
+ */
+ if (c->x86 == 0x15 && c->x86_model <= 0xf)
+ mce_flags.overflow_recov = 1;
- /*
- * All newer Intel systems support MCE broadcasting. Enable
- * synchronization with a one second timeout.
- */
- if ((c->x86 > 6 || (c->x86 == 6 && c->x86_model >= 0xe)) &&
- cfg->monarch_timeout < 0)
- cfg->monarch_timeout = USEC_PER_SEC;
+ if (c->x86 >= 0x17 && c->x86 <= 0x1A)
+ mce_flags.zen_ifu_quirk = 1;
+}
- /*
- * There are also broken BIOSes on some Pentium M and
- * earlier systems:
- */
- if (c->x86 == 6 && c->x86_model <= 13 && cfg->bootlog < 0)
- cfg->bootlog = 0;
+static void apply_quirks_intel(struct cpuinfo_x86 *c)
+{
+ struct mce_bank *mce_banks = this_cpu_ptr(mce_banks_array);
- if (c->x86 == 6 && c->x86_model == 45)
- mce_flags.snb_ifu_quirk = 1;
+ /* Older CPUs (prior to family 6) don't need quirks. */
+ if (c->x86_vfm < INTEL_PENTIUM_PRO)
+ return;
- /*
- * Skylake, Cascacde Lake and Cooper Lake require a quirk on
- * rep movs.
- */
- if (c->x86 == 6 && c->x86_model == INTEL_FAM6_SKYLAKE_X)
- mce_flags.skx_repmov_quirk = 1;
+ /*
+ * SDM documents that on family 6 bank 0 should not be written
+ * because it aliases to another special BIOS controlled
+ * register.
+ * But it's not aliased anymore on model 0x1a+
+ * Don't ignore bank 0 completely because there could be a
+ * valid event later, merely don't write CTL0.
+ */
+ if (c->x86_vfm < INTEL_NEHALEM_EP && this_cpu_read(mce_num_banks))
+ mce_banks[0].init = false;
+
+ /*
+ * All newer Intel systems support MCE broadcasting. Enable
+ * synchronization with a one second timeout.
+ */
+ if (c->x86_vfm >= INTEL_CORE_YONAH && mca_cfg.monarch_timeout < 0)
+ mca_cfg.monarch_timeout = USEC_PER_SEC;
+
+ /*
+ * There are also broken BIOSes on some Pentium M and
+ * earlier systems:
+ */
+ if (c->x86_vfm < INTEL_CORE_YONAH && mca_cfg.bootlog < 0)
+ mca_cfg.bootlog = 0;
+
+ if (c->x86_vfm == INTEL_SANDYBRIDGE_X)
+ mce_flags.snb_ifu_quirk = 1;
+
+ /*
+ * Skylake, Cascacde Lake and Cooper Lake require a quirk on
+ * rep movs.
+ */
+ if (c->x86_vfm == INTEL_SKYLAKE_X)
+ mce_flags.skx_repmov_quirk = 1;
+}
+
+static void apply_quirks_zhaoxin(struct cpuinfo_x86 *c)
+{
+ /*
+ * All newer Zhaoxin CPUs support MCE broadcasting. Enable
+ * synchronization with a one second timeout.
+ */
+ if (c->x86 > 6 || (c->x86_model == 0x19 || c->x86_model == 0x1f)) {
+ if (mca_cfg.monarch_timeout < 0)
+ mca_cfg.monarch_timeout = USEC_PER_SEC;
}
+}
- if (c->x86_vendor == X86_VENDOR_ZHAOXIN) {
- /*
- * All newer Zhaoxin CPUs support MCE broadcasting. Enable
- * synchronization with a one second timeout.
- */
- if (c->x86 > 6 || (c->x86_model == 0x19 || c->x86_model == 0x1f)) {
- if (cfg->monarch_timeout < 0)
- cfg->monarch_timeout = USEC_PER_SEC;
- }
+/* Add per CPU specific workarounds here */
+static bool __mcheck_cpu_apply_quirks(struct cpuinfo_x86 *c)
+{
+ struct mca_config *cfg = &mca_cfg;
+
+ switch (c->x86_vendor) {
+ case X86_VENDOR_UNKNOWN:
+ pr_info("unknown CPU type - not enabling MCE support\n");
+ return false;
+ case X86_VENDOR_AMD:
+ apply_quirks_amd(c);
+ break;
+ case X86_VENDOR_INTEL:
+ apply_quirks_intel(c);
+ break;
+ case X86_VENDOR_ZHAOXIN:
+ apply_quirks_zhaoxin(c);
+ break;
}
if (cfg->monarch_timeout < 0)
@@ -1909,28 +2031,28 @@ static int __mcheck_cpu_apply_quirks(struct cpuinfo_x86 *c)
if (cfg->bootlog != 0)
cfg->panic_timeout = 30;
- return 0;
+ return true;
}
-static int __mcheck_cpu_ancient_init(struct cpuinfo_x86 *c)
+static bool __mcheck_cpu_ancient_init(struct cpuinfo_x86 *c)
{
if (c->x86 != 5)
- return 0;
+ return false;
switch (c->x86_vendor) {
case X86_VENDOR_INTEL:
intel_p5_mcheck_init(c);
mce_flags.p5 = 1;
- return 1;
+ return true;
case X86_VENDOR_CENTAUR:
winchip_mcheck_init(c);
mce_flags.winchip = 1;
- return 1;
+ return true;
default:
- return 0;
+ return false;
}
- return 0;
+ return false;
}
/*
@@ -1982,7 +2104,6 @@ static void mce_zhaoxin_feature_init(struct cpuinfo_x86 *c)
intel_init_cmci();
intel_init_lmce();
- mce_adjust_timer = cmci_intel_adjust_timer;
}
static void mce_zhaoxin_feature_clear(struct cpuinfo_x86 *c)
@@ -1995,16 +2116,11 @@ static void __mcheck_cpu_init_vendor(struct cpuinfo_x86 *c)
switch (c->x86_vendor) {
case X86_VENDOR_INTEL:
mce_intel_feature_init(c);
- mce_adjust_timer = cmci_intel_adjust_timer;
- break;
-
- case X86_VENDOR_AMD: {
- mce_amd_feature_init(c);
break;
- }
+ case X86_VENDOR_AMD:
case X86_VENDOR_HYGON:
- mce_hygon_feature_init(c);
+ mce_amd_feature_init(c);
break;
case X86_VENDOR_CENTAUR:
@@ -2121,6 +2237,31 @@ DEFINE_IDTENTRY_MCE_USER(exc_machine_check)
exc_machine_check_user(regs);
local_db_restore(dr7);
}
+
+#ifdef CONFIG_X86_FRED
+/*
+ * When occurred on different ring level, i.e., from user or kernel
+ * context, #MCE needs to be handled on different stack: User #MCE
+ * on current task stack, while kernel #MCE on a dedicated stack.
+ *
+ * This is exactly how FRED event delivery invokes an exception
+ * handler: ring 3 event on level 0 stack, i.e., current task stack;
+ * ring 0 event on the #MCE dedicated stack specified in the
+ * IA32_FRED_STKLVLS MSR. So unlike IDT, the FRED machine check entry
+ * stub doesn't do stack switch.
+ */
+DEFINE_FREDENTRY_MCE(exc_machine_check)
+{
+ unsigned long dr7;
+
+ dr7 = local_db_save();
+ if (user_mode(regs))
+ exc_machine_check_user(regs);
+ else
+ exc_machine_check_kernel(regs);
+ local_db_restore(dr7);
+}
+#endif
#else
/* 32bit unified entry point */
DEFINE_IDTENTRY_RAW(exc_machine_check)
@@ -2153,12 +2294,12 @@ void mcheck_cpu_init(struct cpuinfo_x86 *c)
__mcheck_cpu_cap_init();
- if (__mcheck_cpu_apply_quirks(c) < 0) {
+ if (!__mcheck_cpu_apply_quirks(c)) {
mca_cfg.disabled = 1;
return;
}
- if (mce_gen_pool_init()) {
+ if (!mce_gen_pool_init()) {
mca_cfg.disabled = 1;
pr_emerg("Couldn't allocate MCE records pool!\n");
return;
@@ -2295,7 +2436,7 @@ static void mce_disable_error_reporting(void)
struct mce_bank *b = &mce_banks[i];
if (b->init)
- wrmsrl(mca_msr_reg(i, MCA_CTL), 0);
+ wrmsrq(mca_msr_reg(i, MCA_CTL), 0);
}
return;
}
@@ -2365,6 +2506,7 @@ static void mce_restart(void)
{
mce_timer_delete_all();
on_each_cpu(mce_cpu_restart, NULL, 1);
+ mce_schedule_work();
}
/* Toggle features for corrected errors */
@@ -2385,7 +2527,7 @@ static void mce_enable_ce(void *all)
__mcheck_cpu_init_timer();
}
-static struct bus_type mce_subsys = {
+static const struct bus_type mce_subsys = {
.name = "machinecheck",
.dev_name = "machinecheck",
};
@@ -2428,12 +2570,14 @@ static ssize_t set_bank(struct device *s, struct device_attribute *attr,
return -EINVAL;
b = &per_cpu(mce_banks_array, s->id)[bank];
-
if (!b->init)
return -ENODEV;
b->ctl = new;
+
+ mutex_lock(&mce_sysfs_mutex);
mce_restart();
+ mutex_unlock(&mce_sysfs_mutex);
return size;
}
@@ -2554,9 +2698,6 @@ static int mce_device_create(unsigned int cpu)
int err;
int i, j;
- if (!mce_available(&boot_cpu_data))
- return -EIO;
-
dev = per_cpu(mce_device, cpu);
if (dev)
return 0;
@@ -2645,14 +2786,12 @@ static void mce_reenable_cpu(void)
struct mce_bank *b = &mce_banks[i];
if (b->init)
- wrmsrl(mca_msr_reg(i, MCA_CTL), b->ctl);
+ wrmsrq(mca_msr_reg(i, MCA_CTL), b->ctl);
}
}
static int mce_cpu_dead(unsigned int cpu)
{
- mce_intel_hcpu_update(cpu);
-
/* intentionally ignoring frozen here */
if (!cpuhp_tasks_frozen)
cmci_rediscover();
@@ -2681,7 +2820,7 @@ static int mce_cpu_pre_down(unsigned int cpu)
struct timer_list *t = this_cpu_ptr(&mce_timer);
mce_disable_cpu();
- del_timer_sync(t);
+ timer_delete_sync(t);
mce_threshold_remove_device(cpu);
mce_device_remove(cpu);
return 0;
diff --git a/arch/x86/kernel/cpu/mce/dev-mcelog.c b/arch/x86/kernel/cpu/mce/dev-mcelog.c
index 100fbeebdc72..8d023239ce18 100644
--- a/arch/x86/kernel/cpu/mce/dev-mcelog.c
+++ b/arch/x86/kernel/cpu/mce/dev-mcelog.c
@@ -105,8 +105,7 @@ static ssize_t set_trigger(struct device *s, struct device_attribute *attr,
{
char *p;
- strncpy(mce_helper, buf, sizeof(mce_helper));
- mce_helper[sizeof(mce_helper)-1] = 0;
+ strscpy(mce_helper, buf, sizeof(mce_helper));
p = strchr(mce_helper, '\n');
if (p)
@@ -265,15 +264,8 @@ static long mce_chrdev_ioctl(struct file *f, unsigned int cmd,
return put_user(sizeof(struct mce), p);
case MCE_GET_LOG_LEN:
return put_user(mcelog->len, p);
- case MCE_GETCLEAR_FLAGS: {
- unsigned flags;
-
- do {
- flags = mcelog->flags;
- } while (cmpxchg(&mcelog->flags, flags, 0) != flags);
-
- return put_user(flags, p);
- }
+ case MCE_GETCLEAR_FLAGS:
+ return put_user(xchg(&mcelog->flags, 0), p);
default:
return -ENOTTY;
}
@@ -315,7 +307,7 @@ static ssize_t mce_chrdev_write(struct file *filp, const char __user *ubuf,
/*
* Need to give user space some time to set everything up,
- * so do it a jiffie or two later everywhere.
+ * so do it a jiffy or two later everywhere.
*/
schedule_timeout(2);
@@ -332,7 +324,6 @@ static const struct file_operations mce_chrdev_ops = {
.poll = mce_chrdev_poll,
.unlocked_ioctl = mce_chrdev_ioctl,
.compat_ioctl = compat_ptr_ioctl,
- .llseek = no_llseek,
};
static struct miscdevice mce_chrdev_device = {
diff --git a/arch/x86/kernel/cpu/mce/genpool.c b/arch/x86/kernel/cpu/mce/genpool.c
index fbe8b61c3413..3ca9c007a666 100644
--- a/arch/x86/kernel/cpu/mce/genpool.c
+++ b/arch/x86/kernel/cpu/mce/genpool.c
@@ -16,14 +16,14 @@
* used to save error information organized in a lock-less list.
*
* This memory pool is only to be used to save MCE records in MCE context.
- * MCE events are rare, so a fixed size memory pool should be enough. Use
- * 2 pages to save MCE events for now (~80 MCE records at most).
+ * MCE events are rare, so a fixed size memory pool should be enough.
+ * Allocate on a sliding scale based on number of CPUs.
*/
-#define MCE_POOLSZ (2 * PAGE_SIZE)
+#define MCE_MIN_ENTRIES 80
+#define MCE_PER_CPU 2
static struct gen_pool *mce_evt_pool;
static LLIST_HEAD(mce_event_llist);
-static char gen_pool_buf[MCE_POOLSZ];
/*
* Compare the record "t" with each of the records on list "l" to see if
@@ -31,15 +31,15 @@ static char gen_pool_buf[MCE_POOLSZ];
*/
static bool is_duplicate_mce_record(struct mce_evt_llist *t, struct mce_evt_llist *l)
{
+ struct mce_hw_err *err1, *err2;
struct mce_evt_llist *node;
- struct mce *m1, *m2;
- m1 = &t->mce;
+ err1 = &t->err;
llist_for_each_entry(node, &l->llnode, llnode) {
- m2 = &node->mce;
+ err2 = &node->err;
- if (!mce_cmp(m1, m2))
+ if (!mce_cmp(&err1->m, &err2->m))
return true;
}
return false;
@@ -73,8 +73,8 @@ struct llist_node *mce_gen_pool_prepare_records(void)
void mce_gen_pool_process(struct work_struct *__unused)
{
- struct llist_node *head;
struct mce_evt_llist *node, *tmp;
+ struct llist_node *head;
struct mce *mce;
head = llist_del_all(&mce_event_llist);
@@ -83,7 +83,7 @@ void mce_gen_pool_process(struct work_struct *__unused)
head = llist_reverse_order(head);
llist_for_each_entry_safe(node, tmp, head, llnode) {
- mce = &node->mce;
+ mce = &node->err.m;
blocking_notifier_call_chain(&x86_mce_decoder_chain, 0, mce);
gen_pool_free(mce_evt_pool, (unsigned long)node, sizeof(*node));
}
@@ -94,54 +94,63 @@ bool mce_gen_pool_empty(void)
return llist_empty(&mce_event_llist);
}
-int mce_gen_pool_add(struct mce *mce)
+bool mce_gen_pool_add(struct mce_hw_err *err)
{
struct mce_evt_llist *node;
- if (filter_mce(mce))
- return -EINVAL;
+ if (filter_mce(&err->m))
+ return false;
if (!mce_evt_pool)
- return -EINVAL;
+ return false;
node = (void *)gen_pool_alloc(mce_evt_pool, sizeof(*node));
if (!node) {
pr_warn_ratelimited("MCE records pool full!\n");
- return -ENOMEM;
+ return false;
}
- memcpy(&node->mce, mce, sizeof(*mce));
+ memcpy(&node->err, err, sizeof(*err));
llist_add(&node->llnode, &mce_event_llist);
- return 0;
+ return true;
}
-static int mce_gen_pool_create(void)
+static bool mce_gen_pool_create(void)
{
- struct gen_pool *tmpp;
- int ret = -ENOMEM;
-
- tmpp = gen_pool_create(ilog2(sizeof(struct mce_evt_llist)), -1);
- if (!tmpp)
- goto out;
+ int mce_numrecords, mce_poolsz, order;
+ struct gen_pool *gpool;
+ void *mce_pool;
+
+ order = order_base_2(sizeof(struct mce_evt_llist));
+ gpool = gen_pool_create(order, -1);
+ if (!gpool)
+ return false;
+
+ mce_numrecords = max(MCE_MIN_ENTRIES, num_possible_cpus() * MCE_PER_CPU);
+ mce_poolsz = mce_numrecords * (1 << order);
+ mce_pool = kmalloc(mce_poolsz, GFP_KERNEL);
+ if (!mce_pool) {
+ gen_pool_destroy(gpool);
+ return false;
+ }
- ret = gen_pool_add(tmpp, (unsigned long)gen_pool_buf, MCE_POOLSZ, -1);
- if (ret) {
- gen_pool_destroy(tmpp);
- goto out;
+ if (gen_pool_add(gpool, (unsigned long)mce_pool, mce_poolsz, -1)) {
+ gen_pool_destroy(gpool);
+ kfree(mce_pool);
+ return false;
}
- mce_evt_pool = tmpp;
+ mce_evt_pool = gpool;
-out:
- return ret;
+ return true;
}
-int mce_gen_pool_init(void)
+bool mce_gen_pool_init(void)
{
/* Just init mce_gen_pool once. */
if (mce_evt_pool)
- return 0;
+ return true;
return mce_gen_pool_create();
}
diff --git a/arch/x86/kernel/cpu/mce/inject.c b/arch/x86/kernel/cpu/mce/inject.c
index 12cf2e7ca33c..d02c4f556cd0 100644
--- a/arch/x86/kernel/cpu/mce/inject.c
+++ b/arch/x86/kernel/cpu/mce/inject.c
@@ -24,10 +24,11 @@
#include <linux/pci.h>
#include <linux/uaccess.h>
-#include <asm/amd_nb.h>
+#include <asm/amd/nb.h>
#include <asm/apic.h>
#include <asm/irq_vectors.h>
#include <asm/mce.h>
+#include <asm/msr.h>
#include <asm/nmi.h>
#include <asm/smp.h>
@@ -229,7 +230,6 @@ static int raise_local(void)
} else if (m->status) {
pr_info("Starting machine check poll CPU %d\n", cpu);
raise_poll(m);
- mce_notify_irq();
pr_info("Machine check poll done on CPU %d\n", cpu);
} else
m->finished = 0;
@@ -270,8 +270,7 @@ static void __maybe_unused raise_mce(struct mce *m)
mce_irq_ipi, NULL, 0);
preempt_enable();
} else if (m->inject_flags & MCJ_NMI_BROADCAST)
- apic->send_IPI_mask(mce_inject_cpumask,
- NMI_VECTOR);
+ __apic_send_IPI_mask(mce_inject_cpumask, NMI_VECTOR);
}
start = jiffies;
while (!cpumask_empty(mce_inject_cpumask)) {
@@ -431,11 +430,9 @@ static void trigger_thr_int(void *info)
static u32 get_nbc_for_node(int node_id)
{
- struct cpuinfo_x86 *c = &boot_cpu_data;
u32 cores_per_node;
- cores_per_node = (c->x86_max_cores * smp_num_siblings) / amd_get_nodes_per_socket();
-
+ cores_per_node = topology_num_threads_per_package() / topology_amd_nodes_per_pkg();
return cores_per_node * node_id;
}
@@ -479,30 +476,35 @@ static void prepare_msrs(void *info)
struct mce m = *(struct mce *)info;
u8 b = m.bank;
- wrmsrl(MSR_IA32_MCG_STATUS, m.mcgstatus);
+ wrmsrq(MSR_IA32_MCG_STATUS, m.mcgstatus);
if (boot_cpu_has(X86_FEATURE_SMCA)) {
if (m.inject_flags == DFR_INT_INJ) {
- wrmsrl(MSR_AMD64_SMCA_MCx_DESTAT(b), m.status);
- wrmsrl(MSR_AMD64_SMCA_MCx_DEADDR(b), m.addr);
+ wrmsrq(MSR_AMD64_SMCA_MCx_DESTAT(b), m.status);
+ wrmsrq(MSR_AMD64_SMCA_MCx_DEADDR(b), m.addr);
} else {
- wrmsrl(MSR_AMD64_SMCA_MCx_STATUS(b), m.status);
- wrmsrl(MSR_AMD64_SMCA_MCx_ADDR(b), m.addr);
+ wrmsrq(MSR_AMD64_SMCA_MCx_STATUS(b), m.status);
+ wrmsrq(MSR_AMD64_SMCA_MCx_ADDR(b), m.addr);
}
- wrmsrl(MSR_AMD64_SMCA_MCx_MISC(b), m.misc);
- wrmsrl(MSR_AMD64_SMCA_MCx_SYND(b), m.synd);
+ wrmsrq(MSR_AMD64_SMCA_MCx_SYND(b), m.synd);
+
+ if (m.misc)
+ wrmsrq(MSR_AMD64_SMCA_MCx_MISC(b), m.misc);
} else {
- wrmsrl(MSR_IA32_MCx_STATUS(b), m.status);
- wrmsrl(MSR_IA32_MCx_ADDR(b), m.addr);
- wrmsrl(MSR_IA32_MCx_MISC(b), m.misc);
+ wrmsrq(MSR_IA32_MCx_STATUS(b), m.status);
+ wrmsrq(MSR_IA32_MCx_ADDR(b), m.addr);
+
+ if (m.misc)
+ wrmsrq(MSR_IA32_MCx_MISC(b), m.misc);
}
}
static void do_inject(void)
{
- u64 mcg_status = 0;
unsigned int cpu = i_mce.extcpu;
+ struct mce_hw_err err;
+ u64 mcg_status = 0;
u8 b = i_mce.bank;
i_mce.tsc = rdtsc_ordered();
@@ -516,7 +518,8 @@ static void do_inject(void)
i_mce.status |= MCI_STATUS_SYNDV;
if (inj_type == SW_INJ) {
- mce_log(&i_mce);
+ err.m = i_mce;
+ mce_log(&err);
return;
}
@@ -544,8 +547,8 @@ static void do_inject(void)
if (boot_cpu_has(X86_FEATURE_AMD_DCM) &&
b == 4 &&
boot_cpu_data.x86 < 0x17) {
- toggle_nb_mca_mst_cpu(topology_die_id(cpu));
- cpu = get_nbc_for_node(topology_die_id(cpu));
+ toggle_nb_mca_mst_cpu(topology_amd_node_id(cpu));
+ cpu = get_nbc_for_node(topology_amd_node_id(cpu));
}
cpus_read_lock();
@@ -587,7 +590,7 @@ static int inj_bank_set(void *data, u64 val)
u64 cap;
/* Get bank count on target CPU so we can handle non-uniform values. */
- rdmsrl_on_cpu(m->extcpu, MSR_IA32_MCG_CAP, &cap);
+ rdmsrq_on_cpu(m->extcpu, MSR_IA32_MCG_CAP, &cap);
n_banks = cap & MCG_BANKCNT_MASK;
if (val >= n_banks) {
@@ -611,7 +614,7 @@ static int inj_bank_set(void *data, u64 val)
if (cpu_feature_enabled(X86_FEATURE_SMCA)) {
u64 ipid;
- if (rdmsrl_on_cpu(m->extcpu, MSR_AMD64_SMCA_MCx_IPID(val), &ipid)) {
+ if (rdmsrq_on_cpu(m->extcpu, MSR_AMD64_SMCA_MCx_IPID(val), &ipid)) {
pr_err("Error reading IPID on CPU%d\n", m->extcpu);
return -EINVAL;
}
@@ -739,14 +742,15 @@ static void check_hw_inj_possible(void)
u64 status = MCI_STATUS_VAL, ipid;
/* Check whether bank is populated */
- rdmsrl(MSR_AMD64_SMCA_MCx_IPID(bank), ipid);
+ rdmsrq(MSR_AMD64_SMCA_MCx_IPID(bank), ipid);
if (!ipid)
continue;
toggle_hw_mce_inject(cpu, true);
- wrmsrl_safe(mca_msr_reg(bank, MCA_STATUS), status);
- rdmsrl_safe(mca_msr_reg(bank, MCA_STATUS), &status);
+ wrmsrq_safe(mca_msr_reg(bank, MCA_STATUS), status);
+ rdmsrq_safe(mca_msr_reg(bank, MCA_STATUS), &status);
+ wrmsrq_safe(mca_msr_reg(bank, MCA_STATUS), 0);
if (!status) {
hw_injection_possible = false;
@@ -797,4 +801,5 @@ static void __exit inject_exit(void)
module_init(inject_init);
module_exit(inject_exit);
+MODULE_DESCRIPTION("Machine check injection support");
MODULE_LICENSE("GPL");
diff --git a/arch/x86/kernel/cpu/mce/intel.c b/arch/x86/kernel/cpu/mce/intel.c
index 95275a5e57e0..efcf21e9552e 100644
--- a/arch/x86/kernel/cpu/mce/intel.c
+++ b/arch/x86/kernel/cpu/mce/intel.c
@@ -13,7 +13,7 @@
#include <linux/cpumask.h>
#include <asm/apic.h>
#include <asm/cpufeature.h>
-#include <asm/intel-family.h>
+#include <asm/cpu_device_id.h>
#include <asm/processor.h>
#include <asm/msr.h>
#include <asm/mce.h>
@@ -42,43 +42,45 @@
static DEFINE_PER_CPU(mce_banks_t, mce_banks_owned);
/*
- * CMCI storm detection backoff counter
- *
- * During storm, we reset this counter to INITIAL_CHECK_INTERVAL in case we've
- * encountered an error. If not, we decrement it by one. We signal the end of
- * the CMCI storm when it reaches 0.
- */
-static DEFINE_PER_CPU(int, cmci_backoff_cnt);
-
-/*
* cmci_discover_lock protects against parallel discovery attempts
* which could race against each other.
*/
static DEFINE_RAW_SPINLOCK(cmci_discover_lock);
-#define CMCI_THRESHOLD 1
-#define CMCI_POLL_INTERVAL (30 * HZ)
-#define CMCI_STORM_INTERVAL (HZ)
-#define CMCI_STORM_THRESHOLD 15
+/*
+ * On systems that do support CMCI but it's disabled, polling for MCEs can
+ * cause the same event to be reported multiple times because IA32_MCi_STATUS
+ * is shared by the same package.
+ */
+static DEFINE_SPINLOCK(cmci_poll_lock);
-static DEFINE_PER_CPU(unsigned long, cmci_time_stamp);
-static DEFINE_PER_CPU(unsigned int, cmci_storm_cnt);
-static DEFINE_PER_CPU(unsigned int, cmci_storm_state);
+/* Linux non-storm CMCI threshold (may be overridden by BIOS) */
+#define CMCI_THRESHOLD 1
-enum {
- CMCI_STORM_NONE,
- CMCI_STORM_ACTIVE,
- CMCI_STORM_SUBSIDED,
-};
+/*
+ * MCi_CTL2 threshold for each bank when there is no storm.
+ * Default value for each bank may have been set by BIOS.
+ */
+static u16 cmci_threshold[MAX_NR_BANKS];
-static atomic_t cmci_storm_on_cpus;
+/*
+ * High threshold to limit CMCI rate during storms. Max supported is
+ * 0x7FFF. Use this slightly smaller value so it has a distinctive
+ * signature when some asks "Why am I not seeing all corrected errors?"
+ * A high threshold is used instead of just disabling CMCI for a
+ * bank because both corrected and uncorrected errors may be logged
+ * in the same bank and signalled with CMCI. The threshold only applies
+ * to corrected errors, so keeping CMCI enabled means that uncorrected
+ * errors will still be processed in a timely fashion.
+ */
+#define CMCI_STORM_THRESHOLD 32749
-static int cmci_supported(int *banks)
+static bool cmci_supported(int *banks)
{
u64 cap;
if (mca_cfg.cmci_disabled || mca_cfg.ignore_ce)
- return 0;
+ return false;
/*
* Vendor check is not strictly needed, but the initial
@@ -87,12 +89,13 @@ static int cmci_supported(int *banks)
*/
if (boot_cpu_data.x86_vendor != X86_VENDOR_INTEL &&
boot_cpu_data.x86_vendor != X86_VENDOR_ZHAOXIN)
- return 0;
+ return false;
if (!boot_cpu_has(X86_FEATURE_APIC) || lapic_get_maxlvt() < 6)
- return 0;
- rdmsrl(MSR_IA32_MCG_CAP, cap);
- *banks = min_t(unsigned, MAX_NR_BANKS, cap & 0xff);
+ return false;
+
+ rdmsrq(MSR_IA32_MCG_CAP, cap);
+ *banks = min_t(unsigned, MAX_NR_BANKS, cap & MCG_BANKCNT_MASK);
return !!(cap & MCG_CMCI_P);
}
@@ -103,7 +106,7 @@ static bool lmce_supported(void)
if (mca_cfg.lmce_disabled)
return false;
- rdmsrl(MSR_IA32_MCG_CAP, tmp);
+ rdmsrq(MSR_IA32_MCG_CAP, tmp);
/*
* LMCE depends on recovery support in the processor. Hence both
@@ -120,211 +123,173 @@ static bool lmce_supported(void)
* WARN if the MSR isn't locked as init_ia32_feat_ctl() unconditionally
* locks the MSR in the event that it wasn't already locked by BIOS.
*/
- rdmsrl(MSR_IA32_FEAT_CTL, tmp);
+ rdmsrq(MSR_IA32_FEAT_CTL, tmp);
if (WARN_ON_ONCE(!(tmp & FEAT_CTL_LOCKED)))
return false;
return tmp & FEAT_CTL_LMCE_ENABLED;
}
-bool mce_intel_cmci_poll(void)
+/*
+ * Set a new CMCI threshold value. Preserve the state of the
+ * MCI_CTL2_CMCI_EN bit in case this happens during a
+ * cmci_rediscover() operation.
+ */
+static void cmci_set_threshold(int bank, int thresh)
{
- if (__this_cpu_read(cmci_storm_state) == CMCI_STORM_NONE)
- return false;
-
- /*
- * Reset the counter if we've logged an error in the last poll
- * during the storm.
- */
- if (machine_check_poll(0, this_cpu_ptr(&mce_banks_owned)))
- this_cpu_write(cmci_backoff_cnt, INITIAL_CHECK_INTERVAL);
- else
- this_cpu_dec(cmci_backoff_cnt);
+ unsigned long flags;
+ u64 val;
- return true;
+ raw_spin_lock_irqsave(&cmci_discover_lock, flags);
+ rdmsrq(MSR_IA32_MCx_CTL2(bank), val);
+ val &= ~MCI_CTL2_CMCI_THRESHOLD_MASK;
+ wrmsrq(MSR_IA32_MCx_CTL2(bank), val | thresh);
+ raw_spin_unlock_irqrestore(&cmci_discover_lock, flags);
}
-void mce_intel_hcpu_update(unsigned long cpu)
+void mce_intel_handle_storm(int bank, bool on)
{
- if (per_cpu(cmci_storm_state, cpu) == CMCI_STORM_ACTIVE)
- atomic_dec(&cmci_storm_on_cpus);
+ if (on)
+ cmci_set_threshold(bank, CMCI_STORM_THRESHOLD);
+ else
+ cmci_set_threshold(bank, cmci_threshold[bank]);
+}
- per_cpu(cmci_storm_state, cpu) = CMCI_STORM_NONE;
+/*
+ * The interrupt handler. This is called on every event.
+ * Just call the poller directly to log any events.
+ * This could in theory increase the threshold under high load,
+ * but doesn't for now.
+ */
+static void intel_threshold_interrupt(void)
+{
+ machine_check_poll(MCP_TIMESTAMP, this_cpu_ptr(&mce_banks_owned));
}
-static void cmci_toggle_interrupt_mode(bool on)
+/*
+ * Check all the reasons why current CPU cannot claim
+ * ownership of a bank.
+ * 1: CPU already owns this bank
+ * 2: BIOS owns this bank
+ * 3: Some other CPU owns this bank
+ */
+static bool cmci_skip_bank(int bank, u64 *val)
{
- unsigned long flags, *owned;
- int bank;
- u64 val;
+ unsigned long *owned = (void *)this_cpu_ptr(&mce_banks_owned);
- raw_spin_lock_irqsave(&cmci_discover_lock, flags);
- owned = this_cpu_ptr(mce_banks_owned);
- for_each_set_bit(bank, owned, MAX_NR_BANKS) {
- rdmsrl(MSR_IA32_MCx_CTL2(bank), val);
+ if (test_bit(bank, owned))
+ return true;
- if (on)
- val |= MCI_CTL2_CMCI_EN;
- else
- val &= ~MCI_CTL2_CMCI_EN;
+ /* Skip banks in firmware first mode */
+ if (test_bit(bank, mce_banks_ce_disabled))
+ return true;
- wrmsrl(MSR_IA32_MCx_CTL2(bank), val);
- }
- raw_spin_unlock_irqrestore(&cmci_discover_lock, flags);
-}
+ rdmsrq(MSR_IA32_MCx_CTL2(bank), *val);
-unsigned long cmci_intel_adjust_timer(unsigned long interval)
-{
- if ((this_cpu_read(cmci_backoff_cnt) > 0) &&
- (__this_cpu_read(cmci_storm_state) == CMCI_STORM_ACTIVE)) {
- mce_notify_irq();
- return CMCI_STORM_INTERVAL;
+ /* Already owned by someone else? */
+ if (*val & MCI_CTL2_CMCI_EN) {
+ clear_bit(bank, owned);
+ __clear_bit(bank, this_cpu_ptr(mce_poll_banks));
+ return true;
}
- switch (__this_cpu_read(cmci_storm_state)) {
- case CMCI_STORM_ACTIVE:
-
- /*
- * We switch back to interrupt mode once the poll timer has
- * silenced itself. That means no events recorded and the timer
- * interval is back to our poll interval.
- */
- __this_cpu_write(cmci_storm_state, CMCI_STORM_SUBSIDED);
- if (!atomic_sub_return(1, &cmci_storm_on_cpus))
- pr_notice("CMCI storm subsided: switching to interrupt mode\n");
+ return false;
+}
- fallthrough;
+/*
+ * Decide which CMCI interrupt threshold to use:
+ * 1: If this bank is in storm mode from whichever CPU was
+ * the previous owner, stay in storm mode.
+ * 2: If ignoring any threshold set by BIOS, set Linux default
+ * 3: Try to honor BIOS threshold (unless buggy BIOS set it at zero).
+ */
+static u64 cmci_pick_threshold(u64 val, int *bios_zero_thresh)
+{
+ if ((val & MCI_CTL2_CMCI_THRESHOLD_MASK) == CMCI_STORM_THRESHOLD)
+ return val;
- case CMCI_STORM_SUBSIDED:
+ if (!mca_cfg.bios_cmci_threshold) {
+ val &= ~MCI_CTL2_CMCI_THRESHOLD_MASK;
+ val |= CMCI_THRESHOLD;
+ } else if (!(val & MCI_CTL2_CMCI_THRESHOLD_MASK)) {
/*
- * We wait for all CPUs to go back to SUBSIDED state. When that
- * happens we switch back to interrupt mode.
+ * If bios_cmci_threshold boot option was specified
+ * but the threshold is zero, we'll try to initialize
+ * it to 1.
*/
- if (!atomic_read(&cmci_storm_on_cpus)) {
- __this_cpu_write(cmci_storm_state, CMCI_STORM_NONE);
- cmci_toggle_interrupt_mode(true);
- cmci_recheck();
- }
- return CMCI_POLL_INTERVAL;
- default:
-
- /* We have shiny weather. Let the poll do whatever it thinks. */
- return interval;
+ *bios_zero_thresh = 1;
+ val |= CMCI_THRESHOLD;
}
+
+ return val;
}
-static bool cmci_storm_detect(void)
+/*
+ * Try to claim ownership of a bank.
+ */
+static void cmci_claim_bank(int bank, u64 val, int bios_zero_thresh, int *bios_wrong_thresh)
{
- unsigned int cnt = __this_cpu_read(cmci_storm_cnt);
- unsigned long ts = __this_cpu_read(cmci_time_stamp);
- unsigned long now = jiffies;
- int r;
+ struct mca_storm_desc *storm = this_cpu_ptr(&storm_desc);
- if (__this_cpu_read(cmci_storm_state) != CMCI_STORM_NONE)
- return true;
+ val |= MCI_CTL2_CMCI_EN;
+ wrmsrq(MSR_IA32_MCx_CTL2(bank), val);
+ rdmsrq(MSR_IA32_MCx_CTL2(bank), val);
- if (time_before_eq(now, ts + CMCI_STORM_INTERVAL)) {
- cnt++;
- } else {
- cnt = 1;
- __this_cpu_write(cmci_time_stamp, now);
+ /* If the enable bit did not stick, this bank should be polled. */
+ if (!(val & MCI_CTL2_CMCI_EN)) {
+ WARN_ON(!test_bit(bank, this_cpu_ptr(mce_poll_banks)));
+ storm->banks[bank].poll_only = true;
+ return;
}
- __this_cpu_write(cmci_storm_cnt, cnt);
- if (cnt <= CMCI_STORM_THRESHOLD)
- return false;
+ /* This CPU successfully set the enable bit. */
+ set_bit(bank, (void *)this_cpu_ptr(&mce_banks_owned));
- cmci_toggle_interrupt_mode(false);
- __this_cpu_write(cmci_storm_state, CMCI_STORM_ACTIVE);
- r = atomic_add_return(1, &cmci_storm_on_cpus);
- mce_timer_kick(CMCI_STORM_INTERVAL);
- this_cpu_write(cmci_backoff_cnt, INITIAL_CHECK_INTERVAL);
-
- if (r == 1)
- pr_notice("CMCI storm detected: switching to poll mode\n");
- return true;
-}
+ if ((val & MCI_CTL2_CMCI_THRESHOLD_MASK) == CMCI_STORM_THRESHOLD) {
+ pr_notice("CPU%d BANK%d CMCI inherited storm\n", smp_processor_id(), bank);
+ mce_inherit_storm(bank);
+ cmci_storm_begin(bank);
+ } else {
+ __clear_bit(bank, this_cpu_ptr(mce_poll_banks));
+ }
-/*
- * The interrupt handler. This is called on every event.
- * Just call the poller directly to log any events.
- * This could in theory increase the threshold under high load,
- * but doesn't for now.
- */
-static void intel_threshold_interrupt(void)
-{
- if (cmci_storm_detect())
- return;
+ /*
+ * We are able to set thresholds for some banks that
+ * had a threshold of 0. This means the BIOS has not
+ * set the thresholds properly or does not work with
+ * this boot option. Note down now and report later.
+ */
+ if (mca_cfg.bios_cmci_threshold && bios_zero_thresh &&
+ (val & MCI_CTL2_CMCI_THRESHOLD_MASK))
+ *bios_wrong_thresh = 1;
- machine_check_poll(MCP_TIMESTAMP, this_cpu_ptr(&mce_banks_owned));
+ /* Save default threshold for each bank */
+ if (cmci_threshold[bank] == 0)
+ cmci_threshold[bank] = val & MCI_CTL2_CMCI_THRESHOLD_MASK;
}
/*
* Enable CMCI (Corrected Machine Check Interrupt) for available MCE banks
* on this CPU. Use the algorithm recommended in the SDM to discover shared
- * banks.
+ * banks. Called during initial bootstrap, and also for hotplug CPU operations
+ * to rediscover/reassign machine check banks.
*/
static void cmci_discover(int banks)
{
- unsigned long *owned = (void *)this_cpu_ptr(&mce_banks_owned);
+ int bios_wrong_thresh = 0;
unsigned long flags;
int i;
- int bios_wrong_thresh = 0;
raw_spin_lock_irqsave(&cmci_discover_lock, flags);
for (i = 0; i < banks; i++) {
u64 val;
int bios_zero_thresh = 0;
- if (test_bit(i, owned))
+ if (cmci_skip_bank(i, &val))
continue;
- /* Skip banks in firmware first mode */
- if (test_bit(i, mce_banks_ce_disabled))
- continue;
-
- rdmsrl(MSR_IA32_MCx_CTL2(i), val);
-
- /* Already owned by someone else? */
- if (val & MCI_CTL2_CMCI_EN) {
- clear_bit(i, owned);
- __clear_bit(i, this_cpu_ptr(mce_poll_banks));
- continue;
- }
-
- if (!mca_cfg.bios_cmci_threshold) {
- val &= ~MCI_CTL2_CMCI_THRESHOLD_MASK;
- val |= CMCI_THRESHOLD;
- } else if (!(val & MCI_CTL2_CMCI_THRESHOLD_MASK)) {
- /*
- * If bios_cmci_threshold boot option was specified
- * but the threshold is zero, we'll try to initialize
- * it to 1.
- */
- bios_zero_thresh = 1;
- val |= CMCI_THRESHOLD;
- }
-
- val |= MCI_CTL2_CMCI_EN;
- wrmsrl(MSR_IA32_MCx_CTL2(i), val);
- rdmsrl(MSR_IA32_MCx_CTL2(i), val);
-
- /* Did the enable bit stick? -- the bank supports CMCI */
- if (val & MCI_CTL2_CMCI_EN) {
- set_bit(i, owned);
- __clear_bit(i, this_cpu_ptr(mce_poll_banks));
- /*
- * We are able to set thresholds for some banks that
- * had a threshold of 0. This means the BIOS has not
- * set the thresholds properly or does not work with
- * this boot option. Note down now and report later.
- */
- if (mca_cfg.bios_cmci_threshold && bios_zero_thresh &&
- (val & MCI_CTL2_CMCI_THRESHOLD_MASK))
- bios_wrong_thresh = 1;
- } else {
- WARN_ON(!test_bit(i, this_cpu_ptr(mce_poll_banks)));
- }
+ val = cmci_pick_threshold(val, &bios_zero_thresh);
+ cmci_claim_bank(i, val, bios_zero_thresh, &bios_wrong_thresh);
}
raw_spin_unlock_irqrestore(&cmci_discover_lock, flags);
if (mca_cfg.bios_cmci_threshold && bios_wrong_thresh) {
@@ -359,10 +324,13 @@ static void __cmci_disable_bank(int bank)
if (!test_bit(bank, this_cpu_ptr(mce_banks_owned)))
return;
- rdmsrl(MSR_IA32_MCx_CTL2(bank), val);
+ rdmsrq(MSR_IA32_MCx_CTL2(bank), val);
val &= ~MCI_CTL2_CMCI_EN;
- wrmsrl(MSR_IA32_MCx_CTL2(bank), val);
+ wrmsrq(MSR_IA32_MCx_CTL2(bank), val);
__clear_bit(bank, this_cpu_ptr(mce_banks_owned));
+
+ if ((val & MCI_CTL2_CMCI_THRESHOLD_MASK) == CMCI_STORM_THRESHOLD)
+ cmci_storm_end(bank);
}
/*
@@ -426,12 +394,22 @@ void cmci_disable_bank(int bank)
raw_spin_unlock_irqrestore(&cmci_discover_lock, flags);
}
+/* Bank polling function when CMCI is disabled. */
+static void cmci_mc_poll_banks(void)
+{
+ spin_lock(&cmci_poll_lock);
+ machine_check_poll(0, this_cpu_ptr(&mce_poll_banks));
+ spin_unlock(&cmci_poll_lock);
+}
+
void intel_init_cmci(void)
{
int banks;
- if (!cmci_supported(&banks))
+ if (!cmci_supported(&banks)) {
+ mc_poll_banks = cmci_mc_poll_banks;
return;
+ }
mce_threshold_vector = intel_threshold_interrupt;
cmci_discover(banks);
@@ -452,10 +430,10 @@ void intel_init_lmce(void)
if (!lmce_supported())
return;
- rdmsrl(MSR_IA32_MCG_EXT_CTL, val);
+ rdmsrq(MSR_IA32_MCG_EXT_CTL, val);
if (!(val & MCG_EXT_CTL_LMCE_EN))
- wrmsrl(MSR_IA32_MCG_EXT_CTL, val | MCG_EXT_CTL_LMCE_EN);
+ wrmsrq(MSR_IA32_MCG_EXT_CTL, val | MCG_EXT_CTL_LMCE_EN);
}
void intel_clear_lmce(void)
@@ -465,9 +443,9 @@ void intel_clear_lmce(void)
if (!lmce_supported())
return;
- rdmsrl(MSR_IA32_MCG_EXT_CTL, val);
+ rdmsrq(MSR_IA32_MCG_EXT_CTL, val);
val &= ~MCG_EXT_CTL_LMCE_EN;
- wrmsrl(MSR_IA32_MCG_EXT_CTL, val);
+ wrmsrq(MSR_IA32_MCG_EXT_CTL, val);
}
/*
@@ -478,14 +456,14 @@ static void intel_imc_init(struct cpuinfo_x86 *c)
{
u64 error_control;
- switch (c->x86_model) {
- case INTEL_FAM6_SANDYBRIDGE_X:
- case INTEL_FAM6_IVYBRIDGE_X:
- case INTEL_FAM6_HASWELL_X:
- if (rdmsrl_safe(MSR_ERROR_CONTROL, &error_control))
+ switch (c->x86_vfm) {
+ case INTEL_SANDYBRIDGE_X:
+ case INTEL_IVYBRIDGE_X:
+ case INTEL_HASWELL_X:
+ if (rdmsrq_safe(MSR_ERROR_CONTROL, &error_control))
return;
error_control |= 2;
- wrmsrl_safe(MSR_ERROR_CONTROL, error_control);
+ wrmsrq_safe(MSR_ERROR_CONTROL, error_control);
break;
}
}
@@ -507,15 +485,34 @@ bool intel_filter_mce(struct mce *m)
struct cpuinfo_x86 *c = &boot_cpu_data;
/* MCE errata HSD131, HSM142, HSW131, BDM48, HSM142 and SKX37 */
- if ((c->x86 == 6) &&
- ((c->x86_model == INTEL_FAM6_HASWELL) ||
- (c->x86_model == INTEL_FAM6_HASWELL_L) ||
- (c->x86_model == INTEL_FAM6_BROADWELL) ||
- (c->x86_model == INTEL_FAM6_HASWELL_G) ||
- (c->x86_model == INTEL_FAM6_SKYLAKE_X)) &&
+ if ((c->x86_vfm == INTEL_HASWELL ||
+ c->x86_vfm == INTEL_HASWELL_L ||
+ c->x86_vfm == INTEL_BROADWELL ||
+ c->x86_vfm == INTEL_HASWELL_G ||
+ c->x86_vfm == INTEL_SKYLAKE_X) &&
(m->bank == 0) &&
((m->status & 0xa0000000ffffffff) == 0x80000000000f0005))
return true;
return false;
}
+
+/*
+ * Check if the address reported by the CPU is in a format we can parse.
+ * It would be possible to add code for most other cases, but all would
+ * be somewhat complicated (e.g. segment offset would require an instruction
+ * parser). So only support physical addresses up to page granularity for now.
+ */
+bool intel_mce_usable_address(struct mce *m)
+{
+ if (!(m->status & MCI_STATUS_MISCV))
+ return false;
+
+ if (MCI_MISC_ADDR_LSB(m->misc) > PAGE_SHIFT)
+ return false;
+
+ if (MCI_MISC_ADDR_MODE(m->misc) != MCI_MISC_ADDR_PHYS)
+ return false;
+
+ return true;
+}
diff --git a/arch/x86/kernel/cpu/mce/internal.h b/arch/x86/kernel/cpu/mce/internal.h
index 7e03f5b7f6bd..b5ba598e54cb 100644
--- a/arch/x86/kernel/cpu/mce/internal.h
+++ b/arch/x86/kernel/cpu/mce/internal.h
@@ -26,13 +26,13 @@ extern struct blocking_notifier_head x86_mce_decoder_chain;
struct mce_evt_llist {
struct llist_node llnode;
- struct mce mce;
+ struct mce_hw_err err;
};
void mce_gen_pool_process(struct work_struct *__unused);
bool mce_gen_pool_empty(void);
-int mce_gen_pool_add(struct mce *mce);
-int mce_gen_pool_init(void);
+bool mce_gen_pool_add(struct mce_hw_err *err);
+bool mce_gen_pool_init(void);
struct llist_node *mce_gen_pool_prepare_records(void);
int mce_severity(struct mce *a, struct pt_regs *regs, char **msg, bool is_excp);
@@ -41,26 +41,80 @@ struct dentry *mce_get_debugfs_dir(void);
extern mce_banks_t mce_banks_ce_disabled;
#ifdef CONFIG_X86_MCE_INTEL
-unsigned long cmci_intel_adjust_timer(unsigned long interval);
-bool mce_intel_cmci_poll(void);
-void mce_intel_hcpu_update(unsigned long cpu);
+void mce_intel_handle_storm(int bank, bool on);
void cmci_disable_bank(int bank);
void intel_init_cmci(void);
void intel_init_lmce(void);
void intel_clear_lmce(void);
bool intel_filter_mce(struct mce *m);
+bool intel_mce_usable_address(struct mce *m);
#else
-# define cmci_intel_adjust_timer mce_adjust_timer_default
-static inline bool mce_intel_cmci_poll(void) { return false; }
-static inline void mce_intel_hcpu_update(unsigned long cpu) { }
+static inline void mce_intel_handle_storm(int bank, bool on) { }
static inline void cmci_disable_bank(int bank) { }
static inline void intel_init_cmci(void) { }
static inline void intel_init_lmce(void) { }
static inline void intel_clear_lmce(void) { }
static inline bool intel_filter_mce(struct mce *m) { return false; }
+static inline bool intel_mce_usable_address(struct mce *m) { return false; }
#endif
-void mce_timer_kick(unsigned long interval);
+void mce_timer_kick(bool storm);
+
+#ifdef CONFIG_X86_MCE_THRESHOLD
+void cmci_storm_begin(unsigned int bank);
+void cmci_storm_end(unsigned int bank);
+void mce_track_storm(struct mce *mce);
+void mce_inherit_storm(unsigned int bank);
+bool mce_get_storm_mode(void);
+void mce_set_storm_mode(bool storm);
+#else
+static inline void cmci_storm_begin(unsigned int bank) {}
+static inline void cmci_storm_end(unsigned int bank) {}
+static inline void mce_track_storm(struct mce *mce) {}
+static inline void mce_inherit_storm(unsigned int bank) {}
+static inline bool mce_get_storm_mode(void) { return false; }
+static inline void mce_set_storm_mode(bool storm) {}
+#endif
+
+/*
+ * history: Bitmask tracking errors occurrence. Each set bit
+ * represents an error seen.
+ *
+ * timestamp: Last time (in jiffies) that the bank was polled.
+ * in_storm_mode: Is this bank in storm mode?
+ * poll_only: Bank does not support CMCI, skip storm tracking.
+ */
+struct storm_bank {
+ u64 history;
+ u64 timestamp;
+ bool in_storm_mode;
+ bool poll_only;
+};
+
+#define NUM_HISTORY_BITS (sizeof(u64) * BITS_PER_BYTE)
+
+/* How many errors within the history buffer mark the start of a storm. */
+#define STORM_BEGIN_THRESHOLD 5
+
+/*
+ * How many polls of machine check bank without an error before declaring
+ * the storm is over. Since it is tracked by the bitmasks in the history
+ * field of struct storm_bank the mask is 30 bits [0 ... 29].
+ */
+#define STORM_END_POLL_THRESHOLD 29
+
+/*
+ * banks: per-cpu, per-bank details
+ * stormy_bank_count: count of MC banks in storm state
+ * poll_mode: CPU is in poll mode
+ */
+struct mca_storm_desc {
+ struct storm_bank banks[MAX_NR_BANKS];
+ u8 stormy_bank_count;
+ bool poll_mode;
+};
+
+DECLARE_PER_CPU(struct mca_storm_desc, storm_desc);
#ifdef CONFIG_ACPI_APEI
int apei_write_mce(struct mce *m);
@@ -157,6 +211,9 @@ struct mce_vendor_flags {
*/
smca : 1,
+ /* Zen IFU quirk */
+ zen_ifu_quirk : 1,
+
/* AMD-style error thresholding banks present. */
amd_threshold : 1,
@@ -172,11 +229,29 @@ struct mce_vendor_flags {
/* Skylake, Cascade Lake, Cooper Lake REP;MOVS* quirk */
skx_repmov_quirk : 1,
- __reserved_0 : 56;
+ __reserved_0 : 55;
};
extern struct mce_vendor_flags mce_flags;
+struct mce_bank {
+ /* subevents to enable */
+ u64 ctl;
+
+ /* initialise bank? */
+ __u64 init : 1,
+
+ /*
+ * (AMD) MCA_CONFIG[McaLsbInStatusSupported]: When set, this bit indicates
+ * the LSB field is found in MCA_STATUS and not in MCA_ADDR.
+ */
+ lsb_in_status : 1,
+
+ __reserved_1 : 62;
+};
+
+DECLARE_PER_CPU_READ_MOSTLY(struct mce_bank[MAX_NR_BANKS], mce_banks_array);
+
enum mca_msr {
MCA_CTL,
MCA_STATUS,
@@ -186,11 +261,41 @@ enum mca_msr {
/* Decide whether to add MCE record to MCE event pool or filter it out. */
extern bool filter_mce(struct mce *m);
+void mce_prep_record_common(struct mce *m);
+void mce_prep_record_per_cpu(unsigned int cpu, struct mce *m);
#ifdef CONFIG_X86_MCE_AMD
extern bool amd_filter_mce(struct mce *m);
+bool amd_mce_usable_address(struct mce *m);
+
+/*
+ * If MCA_CONFIG[McaLsbInStatusSupported] is set, extract ErrAddr in bits
+ * [56:0] of MCA_STATUS, else in bits [55:0] of MCA_ADDR.
+ */
+static __always_inline void smca_extract_err_addr(struct mce *m)
+{
+ u8 lsb;
+
+ if (!mce_flags.smca)
+ return;
+
+ if (this_cpu_ptr(mce_banks_array)[m->bank].lsb_in_status) {
+ lsb = (m->status >> 24) & 0x3f;
+
+ m->addr &= GENMASK_ULL(56, lsb);
+
+ return;
+ }
+
+ lsb = (m->addr >> 56) & 0x3f;
+
+ m->addr &= GENMASK_ULL(55, lsb);
+}
+
#else
static inline bool amd_filter_mce(struct mce *m) { return false; }
+static inline bool amd_mce_usable_address(struct mce *m) { return false; }
+static inline void smca_extract_err_addr(struct mce *m) { }
#endif
#ifdef CONFIG_X86_ANCIENT_MCE
@@ -200,14 +305,14 @@ noinstr void pentium_machine_check(struct pt_regs *regs);
noinstr void winchip_machine_check(struct pt_regs *regs);
static inline void enable_p5_mce(void) { mce_p5_enabled = 1; }
#else
-static inline void intel_p5_mcheck_init(struct cpuinfo_x86 *c) {}
-static inline void winchip_mcheck_init(struct cpuinfo_x86 *c) {}
-static inline void enable_p5_mce(void) {}
-static inline void pentium_machine_check(struct pt_regs *regs) {}
-static inline void winchip_machine_check(struct pt_regs *regs) {}
+static __always_inline void intel_p5_mcheck_init(struct cpuinfo_x86 *c) {}
+static __always_inline void winchip_mcheck_init(struct cpuinfo_x86 *c) {}
+static __always_inline void enable_p5_mce(void) {}
+static __always_inline void pentium_machine_check(struct pt_regs *regs) {}
+static __always_inline void winchip_machine_check(struct pt_regs *regs) {}
#endif
-noinstr u64 mce_rdmsrl(u32 msr);
+noinstr u64 mce_rdmsrq(u32 msr);
static __always_inline u32 mca_msr_reg(int bank, enum mca_msr reg)
{
@@ -230,4 +335,5 @@ static __always_inline u32 mca_msr_reg(int bank, enum mca_msr reg)
return 0;
}
+extern void (*mc_poll_banks)(void);
#endif /* __X86_MCE_INTERNAL_H__ */
diff --git a/arch/x86/kernel/cpu/mce/severity.c b/arch/x86/kernel/cpu/mce/severity.c
index 00483d1c27e4..2235a7477436 100644
--- a/arch/x86/kernel/cpu/mce/severity.c
+++ b/arch/x86/kernel/cpu/mce/severity.c
@@ -12,7 +12,7 @@
#include <linux/uaccess.h>
#include <asm/mce.h>
-#include <asm/intel-family.h>
+#include <asm/cpu_device_id.h>
#include <asm/traps.h>
#include <asm/insn.h>
#include <asm/insn-eval.h>
@@ -39,20 +39,20 @@ static struct severity {
u64 mask;
u64 result;
unsigned char sev;
- unsigned char mcgmask;
- unsigned char mcgres;
+ unsigned short mcgmask;
+ unsigned short mcgres;
unsigned char ser;
unsigned char context;
unsigned char excp;
unsigned char covered;
- unsigned char cpu_model;
+ unsigned int cpu_vfm;
unsigned char cpu_minstepping;
unsigned char bank_lo, bank_hi;
char *msg;
} severities[] = {
#define MCESEV(s, m, c...) { .sev = MCE_ ## s ## _SEVERITY, .msg = m, ## c }
#define BANK_RANGE(l, h) .bank_lo = l, .bank_hi = h
-#define MODEL_STEPPING(m, s) .cpu_model = m, .cpu_minstepping = s
+#define VFM_STEPPING(m, s) .cpu_vfm = m, .cpu_minstepping = s
#define KERNEL .context = IN_KERNEL
#define USER .context = IN_USER
#define KERNEL_RECOV .context = IN_KERNEL_RECOV
@@ -128,7 +128,7 @@ static struct severity {
MCESEV(
AO, "Uncorrected Patrol Scrub Error",
SER, MASK(MCI_STATUS_UC|MCI_ADDR|0xffffeff0, MCI_ADDR|0x001000c0),
- MODEL_STEPPING(INTEL_FAM6_SKYLAKE_X, 4), BANK_RANGE(13, 18)
+ VFM_STEPPING(INTEL_SKYLAKE_X, 4), BANK_RANGE(13, 18)
),
/* ignore OVER for UCNA */
@@ -174,6 +174,18 @@ static struct severity {
USER
),
MCESEV(
+ AR, "Data load error in SEAM non-root mode",
+ SER, MASK(MCI_STATUS_OVER|MCI_UC_SAR|MCI_ADDR|MCACOD, MCI_UC_SAR|MCI_ADDR|MCACOD_DATA),
+ MCGMASK(MCG_STATUS_SEAM_NR, MCG_STATUS_SEAM_NR),
+ KERNEL
+ ),
+ MCESEV(
+ AR, "Instruction fetch error in SEAM non-root mode",
+ SER, MASK(MCI_STATUS_OVER|MCI_UC_SAR|MCI_ADDR|MCACOD, MCI_UC_SAR|MCI_ADDR|MCACOD_INSTR),
+ MCGMASK(MCG_STATUS_SEAM_NR, MCG_STATUS_SEAM_NR),
+ KERNEL
+ ),
+ MCESEV(
PANIC, "Data load in unrecoverable area of kernel",
SER, MASK(MCI_STATUS_OVER|MCI_UC_SAR|MCI_ADDR|MCACOD, MCI_UC_SAR|MCI_ADDR|MCACOD_DATA),
KERNEL
@@ -203,6 +215,11 @@ static struct severity {
BITSET(MCI_STATUS_OVER|MCI_STATUS_UC)
),
MCESEV(
+ PANIC, "Uncorrected in kernel",
+ BITSET(MCI_STATUS_UC),
+ KERNEL
+ ),
+ MCESEV(
UC, "Uncorrected",
BITSET(MCI_STATUS_UC)
),
@@ -283,14 +300,12 @@ static noinstr int error_context(struct mce *m, struct pt_regs *regs)
copy_user = is_copy_from_user(regs);
instrumentation_end();
- switch (fixup_type) {
- case EX_TYPE_UACCESS:
- case EX_TYPE_COPY:
- if (!copy_user)
- return IN_KERNEL;
- m->kflags |= MCE_IN_KERNEL_COPYIN;
- fallthrough;
+ if (copy_user) {
+ m->kflags |= MCE_IN_KERNEL_COPYIN | MCE_IN_KERNEL_RECOV;
+ return IN_KERNEL_RECOV;
+ }
+ switch (fixup_type) {
case EX_TYPE_FAULT_MCE_SAFE:
case EX_TYPE_DEFAULT_MCE_SAFE:
m->kflags |= MCE_IN_KERNEL_RECOV;
@@ -381,7 +396,7 @@ static noinstr int mce_severity_intel(struct mce *m, struct pt_regs *regs, char
continue;
if (s->excp && excp != s->excp)
continue;
- if (s->cpu_model && boot_cpu_data.x86_model != s->cpu_model)
+ if (s->cpu_vfm && boot_cpu_data.x86_vfm != s->cpu_vfm)
continue;
if (s->cpu_minstepping && boot_cpu_data.x86_stepping < s->cpu_minstepping)
continue;
@@ -391,9 +406,6 @@ static noinstr int mce_severity_intel(struct mce *m, struct pt_regs *regs, char
*msg = s->msg;
s->covered = 1;
- if (s->sev >= MCE_UC_SEVERITY && ctx == IN_KERNEL)
- return MCE_PANIC_SEVERITY;
-
return s->sev;
}
}
diff --git a/arch/x86/kernel/cpu/mce/threshold.c b/arch/x86/kernel/cpu/mce/threshold.c
index 6a059a035021..f4a007616468 100644
--- a/arch/x86/kernel/cpu/mce/threshold.c
+++ b/arch/x86/kernel/cpu/mce/threshold.c
@@ -27,5 +27,120 @@ DEFINE_IDTENTRY_SYSVEC(sysvec_threshold)
inc_irq_stat(irq_threshold_count);
mce_threshold_vector();
trace_threshold_apic_exit(THRESHOLD_APIC_VECTOR);
- ack_APIC_irq();
+ apic_eoi();
+}
+
+DEFINE_PER_CPU(struct mca_storm_desc, storm_desc);
+
+void mce_inherit_storm(unsigned int bank)
+{
+ struct mca_storm_desc *storm = this_cpu_ptr(&storm_desc);
+
+ /*
+ * Previous CPU owning this bank had put it into storm mode,
+ * but the precise history of that storm is unknown. Assume
+ * the worst (all recent polls of the bank found a valid error
+ * logged). This will avoid the new owner prematurely declaring
+ * the storm has ended.
+ */
+ storm->banks[bank].history = ~0ull;
+ storm->banks[bank].timestamp = jiffies;
+}
+
+bool mce_get_storm_mode(void)
+{
+ return __this_cpu_read(storm_desc.poll_mode);
+}
+
+void mce_set_storm_mode(bool storm)
+{
+ __this_cpu_write(storm_desc.poll_mode, storm);
+}
+
+static void mce_handle_storm(unsigned int bank, bool on)
+{
+ switch (boot_cpu_data.x86_vendor) {
+ case X86_VENDOR_INTEL:
+ mce_intel_handle_storm(bank, on);
+ break;
+ }
+}
+
+void cmci_storm_begin(unsigned int bank)
+{
+ struct mca_storm_desc *storm = this_cpu_ptr(&storm_desc);
+
+ __set_bit(bank, this_cpu_ptr(mce_poll_banks));
+ storm->banks[bank].in_storm_mode = true;
+
+ /*
+ * If this is the first bank on this CPU to enter storm mode
+ * start polling.
+ */
+ if (++storm->stormy_bank_count == 1)
+ mce_timer_kick(true);
+}
+
+void cmci_storm_end(unsigned int bank)
+{
+ struct mca_storm_desc *storm = this_cpu_ptr(&storm_desc);
+
+ __clear_bit(bank, this_cpu_ptr(mce_poll_banks));
+ storm->banks[bank].history = 0;
+ storm->banks[bank].in_storm_mode = false;
+
+ /* If no banks left in storm mode, stop polling. */
+ if (!--storm->stormy_bank_count)
+ mce_timer_kick(false);
+}
+
+void mce_track_storm(struct mce *mce)
+{
+ struct mca_storm_desc *storm = this_cpu_ptr(&storm_desc);
+ unsigned long now = jiffies, delta;
+ unsigned int shift = 1;
+ u64 history = 0;
+
+ /* No tracking needed for banks that do not support CMCI */
+ if (storm->banks[mce->bank].poll_only)
+ return;
+
+ /*
+ * When a bank is in storm mode it is polled once per second and
+ * the history mask will record about the last minute of poll results.
+ * If it is not in storm mode, then the bank is only checked when
+ * there is a CMCI interrupt. Check how long it has been since
+ * this bank was last checked, and adjust the amount of "shift"
+ * to apply to history.
+ */
+ if (!storm->banks[mce->bank].in_storm_mode) {
+ delta = now - storm->banks[mce->bank].timestamp;
+ shift = (delta + HZ) / HZ;
+ }
+
+ /* If it has been a long time since the last poll, clear history. */
+ if (shift < NUM_HISTORY_BITS)
+ history = storm->banks[mce->bank].history << shift;
+
+ storm->banks[mce->bank].timestamp = now;
+
+ /* History keeps track of corrected errors. VAL=1 && UC=0 */
+ if ((mce->status & MCI_STATUS_VAL) && mce_is_correctable(mce))
+ history |= 1;
+
+ storm->banks[mce->bank].history = history;
+
+ if (storm->banks[mce->bank].in_storm_mode) {
+ if (history & GENMASK_ULL(STORM_END_POLL_THRESHOLD, 0))
+ return;
+ printk_deferred(KERN_NOTICE "CPU%d BANK%d CMCI storm subsided\n", smp_processor_id(), mce->bank);
+ mce_handle_storm(mce->bank, false);
+ cmci_storm_end(mce->bank);
+ } else {
+ if (hweight64(history) < STORM_BEGIN_THRESHOLD)
+ return;
+ printk_deferred(KERN_NOTICE "CPU%d BANK%d CMCI storm detected\n", smp_processor_id(), mce->bank);
+ mce_handle_storm(mce->bank, true);
+ cmci_storm_begin(mce->bank);
+ }
}
diff --git a/arch/x86/kernel/cpu/microcode/Makefile b/arch/x86/kernel/cpu/microcode/Makefile
index 34098d48c48f..193d98b33a0a 100644
--- a/arch/x86/kernel/cpu/microcode/Makefile
+++ b/arch/x86/kernel/cpu/microcode/Makefile
@@ -1,5 +1,5 @@
# SPDX-License-Identifier: GPL-2.0-only
microcode-y := core.o
obj-$(CONFIG_MICROCODE) += microcode.o
-microcode-$(CONFIG_MICROCODE_INTEL) += intel.o
-microcode-$(CONFIG_MICROCODE_AMD) += amd.o
+microcode-$(CONFIG_CPU_SUP_INTEL) += intel.o
+microcode-$(CONFIG_CPU_SUP_AMD) += amd.o
diff --git a/arch/x86/kernel/cpu/microcode/amd.c b/arch/x86/kernel/cpu/microcode/amd.c
index 8b2fcdfa6d31..097e39327942 100644
--- a/arch/x86/kernel/cpu/microcode/amd.c
+++ b/arch/x86/kernel/cpu/microcode/amd.c
@@ -23,24 +23,102 @@
#include <linux/earlycpio.h>
#include <linux/firmware.h>
+#include <linux/bsearch.h>
#include <linux/uaccess.h>
#include <linux/vmalloc.h>
#include <linux/initrd.h>
#include <linux/kernel.h>
#include <linux/pci.h>
-#include <asm/microcode_amd.h>
+#include <crypto/sha2.h>
+
#include <asm/microcode.h>
#include <asm/processor.h>
+#include <asm/cmdline.h>
#include <asm/setup.h>
#include <asm/cpu.h>
#include <asm/msr.h>
+#include <asm/tlb.h>
+
+#include "internal.h"
+
+struct ucode_patch {
+ struct list_head plist;
+ void *data;
+ unsigned int size;
+ u32 patch_id;
+ u16 equiv_cpu;
+};
+
+static LIST_HEAD(microcode_cache);
+
+#define UCODE_MAGIC 0x00414d44
+#define UCODE_EQUIV_CPU_TABLE_TYPE 0x00000000
+#define UCODE_UCODE_TYPE 0x00000001
+
+#define SECTION_HDR_SIZE 8
+#define CONTAINER_HDR_SZ 12
+
+struct equiv_cpu_entry {
+ u32 installed_cpu;
+ u32 fixed_errata_mask;
+ u32 fixed_errata_compare;
+ u16 equiv_cpu;
+ u16 res;
+} __packed;
+
+struct microcode_header_amd {
+ u32 data_code;
+ u32 patch_id;
+ u16 mc_patch_data_id;
+ u8 mc_patch_data_len;
+ u8 init_flag;
+ u32 mc_patch_data_checksum;
+ u32 nb_dev_id;
+ u32 sb_dev_id;
+ u16 processor_rev_id;
+ u8 nb_rev_id;
+ u8 sb_rev_id;
+ u8 bios_api_rev;
+ u8 reserved1[3];
+ u32 match_reg[8];
+} __packed;
+
+struct microcode_amd {
+ struct microcode_header_amd hdr;
+ unsigned int mpb[];
+};
static struct equiv_cpu_table {
unsigned int num_entries;
struct equiv_cpu_entry *entry;
} equiv_table;
+union zen_patch_rev {
+ struct {
+ __u32 rev : 8,
+ stepping : 4,
+ model : 4,
+ __reserved : 4,
+ ext_model : 4,
+ ext_fam : 8;
+ };
+ __u32 ucode_rev;
+};
+
+union cpuid_1_eax {
+ struct {
+ __u32 stepping : 4,
+ model : 4,
+ family : 4,
+ __reserved0 : 4,
+ ext_model : 4,
+ ext_fam : 8,
+ __reserved1 : 4;
+ };
+ __u32 full;
+};
+
/*
* This points to the current valid container of microcode patches which we will
* save from the initrd/builtin before jettisoning its contents. @mc is the
@@ -48,26 +126,163 @@ static struct equiv_cpu_table {
*/
struct cont_desc {
struct microcode_amd *mc;
- u32 cpuid_1_eax;
u32 psize;
u8 *data;
size_t size;
};
-static u32 ucode_new_rev;
-static u8 amd_ucode_patch[PATCH_MAX_SIZE];
-
/*
* Microcode patch container file is prepended to the initrd in cpio
- * format. See Documentation/x86/microcode.rst
+ * format. See Documentation/arch/x86/microcode.rst
*/
static const char
ucode_path[] __maybe_unused = "kernel/x86/microcode/AuthenticAMD.bin";
+/*
+ * This is CPUID(1).EAX on the BSP. It is used in two ways:
+ *
+ * 1. To ignore the equivalence table on Zen1 and newer.
+ *
+ * 2. To match which patches to load because the patch revision ID
+ * already contains the f/m/s for which the microcode is destined
+ * for.
+ */
+static u32 bsp_cpuid_1_eax __ro_after_init;
+
+static bool sha_check = true;
+
+struct patch_digest {
+ u32 patch_id;
+ u8 sha256[SHA256_DIGEST_SIZE];
+};
+
+#include "amd_shas.c"
+
+static int cmp_id(const void *key, const void *elem)
+{
+ struct patch_digest *pd = (struct patch_digest *)elem;
+ u32 patch_id = *(u32 *)key;
+
+ if (patch_id == pd->patch_id)
+ return 0;
+ else if (patch_id < pd->patch_id)
+ return -1;
+ else
+ return 1;
+}
+
+static bool need_sha_check(u32 cur_rev)
+{
+ switch (cur_rev >> 8) {
+ case 0x80012: return cur_rev <= 0x800126f; break;
+ case 0x80082: return cur_rev <= 0x800820f; break;
+ case 0x83010: return cur_rev <= 0x830107c; break;
+ case 0x86001: return cur_rev <= 0x860010e; break;
+ case 0x86081: return cur_rev <= 0x8608108; break;
+ case 0x87010: return cur_rev <= 0x8701034; break;
+ case 0x8a000: return cur_rev <= 0x8a0000a; break;
+ case 0xa0010: return cur_rev <= 0xa00107a; break;
+ case 0xa0011: return cur_rev <= 0xa0011da; break;
+ case 0xa0012: return cur_rev <= 0xa001243; break;
+ case 0xa0082: return cur_rev <= 0xa00820e; break;
+ case 0xa1011: return cur_rev <= 0xa101153; break;
+ case 0xa1012: return cur_rev <= 0xa10124e; break;
+ case 0xa1081: return cur_rev <= 0xa108109; break;
+ case 0xa2010: return cur_rev <= 0xa20102f; break;
+ case 0xa2012: return cur_rev <= 0xa201212; break;
+ case 0xa4041: return cur_rev <= 0xa404109; break;
+ case 0xa5000: return cur_rev <= 0xa500013; break;
+ case 0xa6012: return cur_rev <= 0xa60120a; break;
+ case 0xa7041: return cur_rev <= 0xa704109; break;
+ case 0xa7052: return cur_rev <= 0xa705208; break;
+ case 0xa7080: return cur_rev <= 0xa708009; break;
+ case 0xa70c0: return cur_rev <= 0xa70C009; break;
+ case 0xaa001: return cur_rev <= 0xaa00116; break;
+ case 0xaa002: return cur_rev <= 0xaa00218; break;
+ case 0xb0021: return cur_rev <= 0xb002146; break;
+ case 0xb1010: return cur_rev <= 0xb101046; break;
+ case 0xb2040: return cur_rev <= 0xb204031; break;
+ case 0xb4040: return cur_rev <= 0xb404031; break;
+ case 0xb6000: return cur_rev <= 0xb600031; break;
+ case 0xb7000: return cur_rev <= 0xb700031; break;
+ default: break;
+ }
+
+ pr_info("You should not be seeing this. Please send the following couple of lines to x86-<at>-kernel.org\n");
+ pr_info("CPUID(1).EAX: 0x%x, current revision: 0x%x\n", bsp_cpuid_1_eax, cur_rev);
+ return true;
+}
+
+static bool verify_sha256_digest(u32 patch_id, u32 cur_rev, const u8 *data, unsigned int len)
+{
+ struct patch_digest *pd = NULL;
+ u8 digest[SHA256_DIGEST_SIZE];
+ int i;
+
+ if (x86_family(bsp_cpuid_1_eax) < 0x17)
+ return true;
+
+ if (!need_sha_check(cur_rev))
+ return true;
+
+ if (!sha_check)
+ return true;
+
+ pd = bsearch(&patch_id, phashes, ARRAY_SIZE(phashes), sizeof(struct patch_digest), cmp_id);
+ if (!pd) {
+ pr_err("No sha256 digest for patch ID: 0x%x found\n", patch_id);
+ return false;
+ }
+
+ sha256(data, len, digest);
+
+ if (memcmp(digest, pd->sha256, sizeof(digest))) {
+ pr_err("Patch 0x%x SHA256 digest mismatch!\n", patch_id);
+
+ for (i = 0; i < SHA256_DIGEST_SIZE; i++)
+ pr_cont("0x%x ", digest[i]);
+ pr_info("\n");
+
+ return false;
+ }
+
+ return true;
+}
+
+static u32 get_patch_level(void)
+{
+ u32 rev, dummy __always_unused;
+
+ native_rdmsr(MSR_AMD64_PATCH_LEVEL, rev, dummy);
+
+ return rev;
+}
+
+static union cpuid_1_eax ucode_rev_to_cpuid(unsigned int val)
+{
+ union zen_patch_rev p;
+ union cpuid_1_eax c;
+
+ p.ucode_rev = val;
+ c.full = 0;
+
+ c.stepping = p.stepping;
+ c.model = p.model;
+ c.ext_model = p.ext_model;
+ c.family = 0xf;
+ c.ext_fam = p.ext_fam;
+
+ return c;
+}
+
static u16 find_equiv_id(struct equiv_cpu_table *et, u32 sig)
{
unsigned int i;
+ /* Zen and newer do not need an equivalence table. */
+ if (x86_family(bsp_cpuid_1_eax) >= 0x17)
+ return 0;
+
if (!et || !et->num_entries)
return 0;
@@ -76,32 +291,26 @@ static u16 find_equiv_id(struct equiv_cpu_table *et, u32 sig)
if (sig == e->installed_cpu)
return e->equiv_cpu;
-
- e++;
}
return 0;
}
/*
* Check whether there is a valid microcode container file at the beginning
- * of @buf of size @buf_size. Set @early to use this function in the early path.
+ * of @buf of size @buf_size.
*/
-static bool verify_container(const u8 *buf, size_t buf_size, bool early)
+static bool verify_container(const u8 *buf, size_t buf_size)
{
u32 cont_magic;
if (buf_size <= CONTAINER_HDR_SZ) {
- if (!early)
- pr_debug("Truncated microcode container header.\n");
-
+ pr_debug("Truncated microcode container header.\n");
return false;
}
cont_magic = *(const u32 *)buf;
if (cont_magic != UCODE_MAGIC) {
- if (!early)
- pr_debug("Invalid magic value (0x%08x).\n", cont_magic);
-
+ pr_debug("Invalid magic value (0x%08x).\n", cont_magic);
return false;
}
@@ -110,23 +319,24 @@ static bool verify_container(const u8 *buf, size_t buf_size, bool early)
/*
* Check whether there is a valid, non-truncated CPU equivalence table at the
- * beginning of @buf of size @buf_size. Set @early to use this function in the
- * early path.
+ * beginning of @buf of size @buf_size.
*/
-static bool verify_equivalence_table(const u8 *buf, size_t buf_size, bool early)
+static bool verify_equivalence_table(const u8 *buf, size_t buf_size)
{
const u32 *hdr = (const u32 *)buf;
u32 cont_type, equiv_tbl_len;
- if (!verify_container(buf, buf_size, early))
+ if (!verify_container(buf, buf_size))
return false;
+ /* Zen and newer do not need an equivalence table. */
+ if (x86_family(bsp_cpuid_1_eax) >= 0x17)
+ return true;
+
cont_type = hdr[1];
if (cont_type != UCODE_EQUIV_CPU_TABLE_TYPE) {
- if (!early)
- pr_debug("Wrong microcode container equivalence table type: %u.\n",
- cont_type);
-
+ pr_debug("Wrong microcode container equivalence table type: %u.\n",
+ cont_type);
return false;
}
@@ -135,9 +345,7 @@ static bool verify_equivalence_table(const u8 *buf, size_t buf_size, bool early)
equiv_tbl_len = hdr[2];
if (equiv_tbl_len < sizeof(struct equiv_cpu_entry) ||
buf_size < equiv_tbl_len) {
- if (!early)
- pr_debug("Truncated equivalence table.\n");
-
+ pr_debug("Truncated equivalence table.\n");
return false;
}
@@ -146,22 +354,18 @@ static bool verify_equivalence_table(const u8 *buf, size_t buf_size, bool early)
/*
* Check whether there is a valid, non-truncated microcode patch section at the
- * beginning of @buf of size @buf_size. Set @early to use this function in the
- * early path.
+ * beginning of @buf of size @buf_size.
*
* On success, @sh_psize returns the patch size according to the section header,
* to the caller.
*/
-static bool
-__verify_patch_section(const u8 *buf, size_t buf_size, u32 *sh_psize, bool early)
+static bool __verify_patch_section(const u8 *buf, size_t buf_size, u32 *sh_psize)
{
u32 p_type, p_size;
const u32 *hdr;
if (buf_size < SECTION_HDR_SIZE) {
- if (!early)
- pr_debug("Truncated patch section.\n");
-
+ pr_debug("Truncated patch section.\n");
return false;
}
@@ -170,17 +374,13 @@ __verify_patch_section(const u8 *buf, size_t buf_size, u32 *sh_psize, bool early
p_size = hdr[1];
if (p_type != UCODE_UCODE_TYPE) {
- if (!early)
- pr_debug("Invalid type field (0x%x) in container file section header.\n",
- p_type);
-
+ pr_debug("Invalid type field (0x%x) in container file section header.\n",
+ p_type);
return false;
}
if (p_size < sizeof(struct microcode_header_amd)) {
- if (!early)
- pr_debug("Patch of size %u too short.\n", p_size);
-
+ pr_debug("Patch of size %u too short.\n", p_size);
return false;
}
@@ -195,12 +395,13 @@ __verify_patch_section(const u8 *buf, size_t buf_size, u32 *sh_psize, bool early
* exceed the per-family maximum). @sh_psize is the size read from the section
* header.
*/
-static unsigned int __verify_patch_size(u8 family, u32 sh_psize, size_t buf_size)
+static bool __verify_patch_size(u32 sh_psize, size_t buf_size)
{
+ u8 family = x86_family(bsp_cpuid_1_eax);
u32 max_size;
if (family >= 0x15)
- return min_t(u32, sh_psize, buf_size);
+ goto ret;
#define F1XH_MPB_MAX_SIZE 2048
#define F14H_MPB_MAX_SIZE 1824
@@ -214,13 +415,15 @@ static unsigned int __verify_patch_size(u8 family, u32 sh_psize, size_t buf_size
break;
default:
WARN(1, "%s: WTF family: 0x%x\n", __func__, family);
- return 0;
+ return false;
}
- if (sh_psize > min_t(u32, buf_size, max_size))
- return 0;
+ if (sh_psize > max_size)
+ return false;
- return sh_psize;
+ret:
+ /* Working with the whole buffer so < is ok. */
+ return sh_psize <= buf_size;
}
/*
@@ -231,16 +434,15 @@ static unsigned int __verify_patch_size(u8 family, u32 sh_psize, size_t buf_size
* positive: patch is not for this family, skip it
* 0: success
*/
-static int
-verify_patch(u8 family, const u8 *buf, size_t buf_size, u32 *patch_size, bool early)
+static int verify_patch(const u8 *buf, size_t buf_size, u32 *patch_size)
{
+ u8 family = x86_family(bsp_cpuid_1_eax);
struct microcode_header_amd *mc_hdr;
- unsigned int ret;
u32 sh_psize;
u16 proc_id;
u8 patch_fam;
- if (!__verify_patch_section(buf, buf_size, &sh_psize, early))
+ if (!__verify_patch_section(buf, buf_size, &sh_psize))
return -1;
/*
@@ -255,16 +457,12 @@ verify_patch(u8 family, const u8 *buf, size_t buf_size, u32 *patch_size, bool ea
* size sh_psize, as the section claims.
*/
if (buf_size < sh_psize) {
- if (!early)
- pr_debug("Patch of size %u truncated.\n", sh_psize);
-
+ pr_debug("Patch of size %u truncated.\n", sh_psize);
return -1;
}
- ret = __verify_patch_size(family, sh_psize, buf_size);
- if (!ret) {
- if (!early)
- pr_debug("Per-family patch size mismatch.\n");
+ if (!__verify_patch_size(sh_psize, buf_size)) {
+ pr_debug("Per-family patch size mismatch.\n");
return -1;
}
@@ -272,8 +470,7 @@ verify_patch(u8 family, const u8 *buf, size_t buf_size, u32 *patch_size, bool ea
mc_hdr = (struct microcode_header_amd *)(buf + SECTION_HDR_SIZE);
if (mc_hdr->nb_dev_id || mc_hdr->sb_dev_id) {
- if (!early)
- pr_err("Patch-ID 0x%08x: chipset-specific code unsupported.\n", mc_hdr->patch_id);
+ pr_err("Patch-ID 0x%08x: chipset-specific code unsupported.\n", mc_hdr->patch_id);
return -1;
}
@@ -285,10 +482,19 @@ verify_patch(u8 family, const u8 *buf, size_t buf_size, u32 *patch_size, bool ea
return 0;
}
+static bool mc_patch_matches(struct microcode_amd *mc, u16 eq_id)
+{
+ /* Zen and newer do not need an equivalence table. */
+ if (x86_family(bsp_cpuid_1_eax) >= 0x17)
+ return ucode_rev_to_cpuid(mc->hdr.patch_id).full == bsp_cpuid_1_eax;
+ else
+ return eq_id == mc->hdr.processor_rev_id;
+}
+
/*
* This scans the ucode blob for the proper container as we can have multiple
- * containers glued together. Returns the equivalence ID from the equivalence
- * table or 0 if none found.
+ * containers glued together.
+ *
* Returns the amount of bytes consumed while scanning. @desc contains all the
* data we're going to use in later stages of the application.
*/
@@ -300,7 +506,7 @@ static size_t parse_container(u8 *ucode, size_t size, struct cont_desc *desc)
u16 eq_id;
u8 *buf;
- if (!verify_equivalence_table(ucode, size, true))
+ if (!verify_equivalence_table(ucode, size))
return 0;
buf = ucode;
@@ -313,7 +519,7 @@ static size_t parse_container(u8 *ucode, size_t size, struct cont_desc *desc)
* doesn't contain a patch for the CPU, scan through the whole container
* so that it can be skipped in case there are other containers appended.
*/
- eq_id = find_equiv_id(&table, desc->cpuid_1_eax);
+ eq_id = find_equiv_id(&table, bsp_cpuid_1_eax);
buf += hdr[2] + CONTAINER_HDR_SZ;
size -= hdr[2] + CONTAINER_HDR_SZ;
@@ -327,11 +533,12 @@ static size_t parse_container(u8 *ucode, size_t size, struct cont_desc *desc)
u32 patch_size;
int ret;
- ret = verify_patch(x86_family(desc->cpuid_1_eax), buf, size, &patch_size, true);
+ ret = verify_patch(buf, size, &patch_size);
if (ret < 0) {
/*
- * Patch verification failed, skip to the next
- * container, if there's one:
+ * Patch verification failed, skip to the next container, if
+ * there is one. Before exit, check whether that container has
+ * found a patch already. If so, use it.
*/
goto out;
} else if (ret > 0) {
@@ -339,7 +546,7 @@ static size_t parse_container(u8 *ucode, size_t size, struct cont_desc *desc)
}
mc = (struct microcode_amd *)(buf + SECTION_HDR_SIZE);
- if (eq_id == mc->hdr.processor_rev_id) {
+ if (mc_patch_matches(mc, eq_id)) {
desc->psize = patch_size;
desc->mc = mc;
}
@@ -350,6 +557,7 @@ skip:
size -= patch_size + SECTION_HDR_SIZE;
}
+out:
/*
* If we have found a patch (desc->mc), it means we're looking at the
* container which has a patch for this CPU so return 0 to mean, @ucode
@@ -364,7 +572,6 @@ skip:
return 0;
}
-out:
return orig_size - size;
}
@@ -389,74 +596,41 @@ static void scan_containers(u8 *ucode, size_t size, struct cont_desc *desc)
}
}
-static int __apply_microcode_amd(struct microcode_amd *mc)
-{
- u32 rev, dummy;
-
- native_wrmsrl(MSR_AMD64_PATCH_LOADER, (u64)(long)&mc->hdr.data_code);
-
- /* verify patch application was successful */
- native_rdmsr(MSR_AMD64_PATCH_LEVEL, rev, dummy);
- if (rev != mc->hdr.patch_id)
- return -1;
-
- return 0;
-}
-
-/*
- * Early load occurs before we can vmalloc(). So we look for the microcode
- * patch container file in initrd, traverse equivalent cpu table, look for a
- * matching microcode patch, and update, all in initrd memory in place.
- * When vmalloc() is available for use later -- on 64-bit during first AP load,
- * and on 32-bit during save_microcode_in_initrd_amd() -- we can call
- * load_microcode_amd() to save equivalent cpu table and microcode patches in
- * kernel heap memory.
- *
- * Returns true if container found (sets @desc), false otherwise.
- */
-static bool
-apply_microcode_early_amd(u32 cpuid_1_eax, void *ucode, size_t size, bool save_patch)
+static bool __apply_microcode_amd(struct microcode_amd *mc, u32 *cur_rev,
+ unsigned int psize)
{
- struct cont_desc desc = { 0 };
- u8 (*patch)[PATCH_MAX_SIZE];
- struct microcode_amd *mc;
- u32 rev, dummy, *new_rev;
- bool ret = false;
-
-#ifdef CONFIG_X86_32
- new_rev = (u32 *)__pa_nodebug(&ucode_new_rev);
- patch = (u8 (*)[PATCH_MAX_SIZE])__pa_nodebug(&amd_ucode_patch);
-#else
- new_rev = &ucode_new_rev;
- patch = &amd_ucode_patch;
-#endif
+ unsigned long p_addr = (unsigned long)&mc->hdr.data_code;
- desc.cpuid_1_eax = cpuid_1_eax;
-
- scan_containers(ucode, size, &desc);
+ if (!verify_sha256_digest(mc->hdr.patch_id, *cur_rev, (const u8 *)p_addr, psize))
+ return false;
- mc = desc.mc;
- if (!mc)
- return ret;
+ native_wrmsrq(MSR_AMD64_PATCH_LOADER, p_addr);
- native_rdmsr(MSR_AMD64_PATCH_LEVEL, rev, dummy);
- if (rev >= mc->hdr.patch_id)
- return ret;
+ if (x86_family(bsp_cpuid_1_eax) == 0x17) {
+ unsigned long p_addr_end = p_addr + psize - 1;
- if (!__apply_microcode_amd(mc)) {
- *new_rev = mc->hdr.patch_id;
- ret = true;
+ invlpg(p_addr);
- if (save_patch)
- memcpy(patch, mc, min_t(u32, desc.psize, PATCH_MAX_SIZE));
+ /*
+ * Flush next page too if patch image is crossing a page
+ * boundary.
+ */
+ if (p_addr >> PAGE_SHIFT != p_addr_end >> PAGE_SHIFT)
+ invlpg(p_addr_end);
}
- return ret;
+ /* verify patch application was successful */
+ *cur_rev = get_patch_level();
+ if (*cur_rev != mc->hdr.patch_id)
+ return false;
+
+ return true;
}
-static bool get_builtin_microcode(struct cpio_data *cp, unsigned int family)
+static bool get_builtin_microcode(struct cpio_data *cp)
{
char fw_name[36] = "amd-ucode/microcode_amd.bin";
+ u8 family = x86_family(bsp_cpuid_1_eax);
struct firmware fw;
if (IS_ENABLED(CONFIG_X86_32))
@@ -464,7 +638,7 @@ static bool get_builtin_microcode(struct cpio_data *cp, unsigned int family)
if (family >= 0x15)
snprintf(fw_name, sizeof(fw_name),
- "amd-ucode/microcode_amd_fam%.2xh.bin", family);
+ "amd-ucode/microcode_amd_fam%02hhxh.bin", family);
if (firmware_request_builtin(&fw, fw_name)) {
cp->size = fw.size;
@@ -475,142 +649,144 @@ static bool get_builtin_microcode(struct cpio_data *cp, unsigned int family)
return false;
}
-static void __load_ucode_amd(unsigned int cpuid_1_eax, struct cpio_data *ret)
+static bool __init find_blobs_in_containers(struct cpio_data *ret)
{
- struct ucode_cpu_info *uci;
struct cpio_data cp;
- const char *path;
- bool use_pa;
+ bool found;
- if (IS_ENABLED(CONFIG_X86_32)) {
- uci = (struct ucode_cpu_info *)__pa_nodebug(ucode_cpu_info);
- path = (const char *)__pa_nodebug(ucode_path);
- use_pa = true;
- } else {
- uci = ucode_cpu_info;
- path = ucode_path;
- use_pa = false;
- }
+ if (!get_builtin_microcode(&cp))
+ cp = find_microcode_in_initrd(ucode_path);
- if (!get_builtin_microcode(&cp, x86_family(cpuid_1_eax)))
- cp = find_microcode_in_initrd(path, use_pa);
+ found = cp.data && cp.size;
+ if (found)
+ *ret = cp;
- /* Needed in load_microcode_amd() */
- uci->cpu_sig.sig = cpuid_1_eax;
-
- *ret = cp;
+ return found;
}
-void __init load_ucode_amd_bsp(unsigned int cpuid_1_eax)
-{
- struct cpio_data cp = { };
-
- __load_ucode_amd(cpuid_1_eax, &cp);
- if (!(cp.data && cp.size))
- return;
-
- apply_microcode_early_amd(cpuid_1_eax, cp.data, cp.size, true);
-}
-
-void load_ucode_amd_ap(unsigned int cpuid_1_eax)
+/*
+ * Early load occurs before we can vmalloc(). So we look for the microcode
+ * patch container file in initrd, traverse equivalent cpu table, look for a
+ * matching microcode patch, and update, all in initrd memory in place.
+ * When vmalloc() is available for use later -- on 64-bit during first AP load,
+ * and on 32-bit during save_microcode_in_initrd() -- we can call
+ * load_microcode_amd() to save equivalent cpu table and microcode patches in
+ * kernel heap memory.
+ */
+void __init load_ucode_amd_bsp(struct early_load_data *ed, unsigned int cpuid_1_eax)
{
+ struct cont_desc desc = { };
struct microcode_amd *mc;
- struct cpio_data cp;
- u32 *new_rev, rev, dummy;
-
- if (IS_ENABLED(CONFIG_X86_32)) {
- mc = (struct microcode_amd *)__pa_nodebug(amd_ucode_patch);
- new_rev = (u32 *)__pa_nodebug(&ucode_new_rev);
- } else {
- mc = (struct microcode_amd *)amd_ucode_patch;
- new_rev = &ucode_new_rev;
- }
-
- native_rdmsr(MSR_AMD64_PATCH_LEVEL, rev, dummy);
-
- /* Check whether we have saved a new patch already: */
- if (*new_rev && rev < mc->hdr.patch_id) {
- if (!__apply_microcode_amd(mc)) {
- *new_rev = mc->hdr.patch_id;
- return;
+ struct cpio_data cp = { };
+ char buf[4];
+ u32 rev;
+
+ if (cmdline_find_option(boot_command_line, "microcode.amd_sha_check", buf, 4)) {
+ if (!strncmp(buf, "off", 3)) {
+ sha_check = false;
+ pr_warn_once("It is a very very bad idea to disable the blobs SHA check!\n");
+ add_taint(TAINT_CPU_OUT_OF_SPEC, LOCKDEP_STILL_OK);
}
}
- __load_ucode_amd(cpuid_1_eax, &cp);
- if (!(cp.data && cp.size))
- return;
-
- apply_microcode_early_amd(cpuid_1_eax, cp.data, cp.size, false);
-}
+ bsp_cpuid_1_eax = cpuid_1_eax;
-static enum ucode_state
-load_microcode_amd(bool save, u8 family, const u8 *data, size_t size);
+ rev = get_patch_level();
+ ed->old_rev = rev;
-int __init save_microcode_in_initrd_amd(unsigned int cpuid_1_eax)
-{
- struct cont_desc desc = { 0 };
- enum ucode_state ret;
- struct cpio_data cp;
-
- cp = find_microcode_in_initrd(ucode_path, false);
- if (!(cp.data && cp.size))
- return -EINVAL;
+ /* Needed in load_microcode_amd() */
+ ucode_cpu_info[0].cpu_sig.sig = cpuid_1_eax;
- desc.cpuid_1_eax = cpuid_1_eax;
+ if (!find_blobs_in_containers(&cp))
+ return;
scan_containers(cp.data, cp.size, &desc);
- if (!desc.mc)
- return -EINVAL;
- ret = load_microcode_amd(true, x86_family(cpuid_1_eax), desc.data, desc.size);
- if (ret > UCODE_UPDATED)
- return -EINVAL;
+ mc = desc.mc;
+ if (!mc)
+ return;
- return 0;
+ /*
+ * Allow application of the same revision to pick up SMT-specific
+ * changes even if the revision of the other SMT thread is already
+ * up-to-date.
+ */
+ if (ed->old_rev > mc->hdr.patch_id)
+ return;
+
+ if (__apply_microcode_amd(mc, &rev, desc.psize))
+ ed->new_rev = rev;
}
-void reload_ucode_amd(void)
+static inline bool patch_cpus_equivalent(struct ucode_patch *p,
+ struct ucode_patch *n,
+ bool ignore_stepping)
{
- struct microcode_amd *mc;
- u32 rev, dummy __always_unused;
-
- mc = (struct microcode_amd *)amd_ucode_patch;
-
- rdmsr(MSR_AMD64_PATCH_LEVEL, rev, dummy);
-
- if (rev < mc->hdr.patch_id) {
- if (!__apply_microcode_amd(mc)) {
- ucode_new_rev = mc->hdr.patch_id;
- pr_info("reload patch_level=0x%08x\n", ucode_new_rev);
+ /* Zen and newer hardcode the f/m/s in the patch ID */
+ if (x86_family(bsp_cpuid_1_eax) >= 0x17) {
+ union cpuid_1_eax p_cid = ucode_rev_to_cpuid(p->patch_id);
+ union cpuid_1_eax n_cid = ucode_rev_to_cpuid(n->patch_id);
+
+ if (ignore_stepping) {
+ p_cid.stepping = 0;
+ n_cid.stepping = 0;
}
+
+ return p_cid.full == n_cid.full;
+ } else {
+ return p->equiv_cpu == n->equiv_cpu;
}
}
-static u16 __find_equiv_id(unsigned int cpu)
-{
- struct ucode_cpu_info *uci = ucode_cpu_info + cpu;
- return find_equiv_id(&equiv_table, uci->cpu_sig.sig);
-}
/*
* a small, trivial cache of per-family ucode patches
*/
-static struct ucode_patch *cache_find_patch(u16 equiv_cpu)
+static struct ucode_patch *cache_find_patch(struct ucode_cpu_info *uci, u16 equiv_cpu)
{
struct ucode_patch *p;
+ struct ucode_patch n;
+
+ n.equiv_cpu = equiv_cpu;
+ n.patch_id = uci->cpu_sig.rev;
+
+ WARN_ON_ONCE(!n.patch_id);
list_for_each_entry(p, &microcode_cache, plist)
- if (p->equiv_cpu == equiv_cpu)
+ if (patch_cpus_equivalent(p, &n, false))
return p;
+
return NULL;
}
+static inline int patch_newer(struct ucode_patch *p, struct ucode_patch *n)
+{
+ /* Zen and newer hardcode the f/m/s in the patch ID */
+ if (x86_family(bsp_cpuid_1_eax) >= 0x17) {
+ union zen_patch_rev zp, zn;
+
+ zp.ucode_rev = p->patch_id;
+ zn.ucode_rev = n->patch_id;
+
+ if (zn.stepping != zp.stepping)
+ return -1;
+
+ return zn.rev > zp.rev;
+ } else {
+ return n->patch_id > p->patch_id;
+ }
+}
+
static void update_cache(struct ucode_patch *new_patch)
{
struct ucode_patch *p;
+ int ret;
list_for_each_entry(p, &microcode_cache, plist) {
- if (p->equiv_cpu == new_patch->equiv_cpu) {
- if (p->patch_id >= new_patch->patch_id) {
+ if (patch_cpus_equivalent(p, new_patch, true)) {
+ ret = patch_newer(p, new_patch);
+ if (ret < 0)
+ continue;
+ else if (!ret) {
/* we already have the latest patch */
kfree(new_patch->data);
kfree(new_patch);
@@ -640,23 +816,46 @@ static void free_cache(void)
static struct ucode_patch *find_patch(unsigned int cpu)
{
- u16 equiv_id;
+ struct ucode_cpu_info *uci = ucode_cpu_info + cpu;
+ u16 equiv_id = 0;
- equiv_id = __find_equiv_id(cpu);
- if (!equiv_id)
- return NULL;
+ uci->cpu_sig.rev = get_patch_level();
+
+ if (x86_family(bsp_cpuid_1_eax) < 0x17) {
+ equiv_id = find_equiv_id(&equiv_table, uci->cpu_sig.sig);
+ if (!equiv_id)
+ return NULL;
+ }
+
+ return cache_find_patch(uci, equiv_id);
+}
+
+void reload_ucode_amd(unsigned int cpu)
+{
+ u32 rev, dummy __always_unused;
+ struct microcode_amd *mc;
+ struct ucode_patch *p;
+
+ p = find_patch(cpu);
+ if (!p)
+ return;
- return cache_find_patch(equiv_id);
+ mc = p->data;
+
+ rev = get_patch_level();
+ if (rev < mc->hdr.patch_id) {
+ if (__apply_microcode_amd(mc, &rev, p->size))
+ pr_info_once("reload revision: 0x%08x\n", rev);
+ }
}
static int collect_cpu_info_amd(int cpu, struct cpu_signature *csig)
{
- struct cpuinfo_x86 *c = &cpu_data(cpu);
struct ucode_cpu_info *uci = ucode_cpu_info + cpu;
struct ucode_patch *p;
csig->sig = cpuid_eax(0x00000001);
- csig->rev = c->microcode;
+ csig->rev = get_patch_level();
/*
* a patch could have been loaded early, set uci->mc so that
@@ -666,8 +865,6 @@ static int collect_cpu_info_amd(int cpu, struct cpu_signature *csig)
if (p && (p->patch_id == csig->rev))
uci->mc = p->data;
- pr_info("CPU%d: patch_level=0x%08x\n", cpu, csig->rev);
-
return 0;
}
@@ -678,7 +875,7 @@ static enum ucode_state apply_microcode_amd(int cpu)
struct ucode_cpu_info *uci;
struct ucode_patch *p;
enum ucode_state ret;
- u32 rev, dummy __always_unused;
+ u32 rev;
BUG_ON(raw_smp_processor_id() != cpu);
@@ -688,18 +885,18 @@ static enum ucode_state apply_microcode_amd(int cpu)
if (!p)
return UCODE_NFOUND;
+ rev = uci->cpu_sig.rev;
+
mc_amd = p->data;
uci->mc = p->data;
- rdmsr(MSR_AMD64_PATCH_LEVEL, rev, dummy);
-
/* need to apply patch? */
- if (rev >= mc_amd->hdr.patch_id) {
+ if (rev > mc_amd->hdr.patch_id) {
ret = UCODE_OK;
goto out;
}
- if (__apply_microcode_amd(mc_amd)) {
+ if (!__apply_microcode_amd(mc_amd, &rev, p->size)) {
pr_err("CPU%d: update failed for patch_level=0x%08x\n",
cpu, mc_amd->hdr.patch_id);
return UCODE_ERROR;
@@ -708,8 +905,6 @@ static enum ucode_state apply_microcode_amd(int cpu)
rev = mc_amd->hdr.patch_id;
ret = UCODE_UPDATED;
- pr_info("CPU%d: new patch_level=0x%08x\n", cpu, rev);
-
out:
uci->cpu_sig.rev = rev;
c->microcode = rev;
@@ -721,17 +916,29 @@ out:
return ret;
}
+void load_ucode_amd_ap(unsigned int cpuid_1_eax)
+{
+ unsigned int cpu = smp_processor_id();
+
+ ucode_cpu_info[cpu].cpu_sig.sig = cpuid_1_eax;
+ apply_microcode_amd(cpu);
+}
+
static size_t install_equiv_cpu_table(const u8 *buf, size_t buf_size)
{
u32 equiv_tbl_len;
const u32 *hdr;
- if (!verify_equivalence_table(buf, buf_size, false))
+ if (!verify_equivalence_table(buf, buf_size))
return 0;
hdr = (const u32 *)buf;
equiv_tbl_len = hdr[2];
+ /* Zen and newer do not need an equivalence table. */
+ if (x86_family(bsp_cpuid_1_eax) >= 0x17)
+ goto out;
+
equiv_table.entry = vmalloc(equiv_tbl_len);
if (!equiv_table.entry) {
pr_err("failed to allocate equivalent CPU table\n");
@@ -741,12 +948,16 @@ static size_t install_equiv_cpu_table(const u8 *buf, size_t buf_size)
memcpy(equiv_table.entry, buf + CONTAINER_HDR_SZ, equiv_tbl_len);
equiv_table.num_entries = equiv_tbl_len / sizeof(struct equiv_cpu_entry);
+out:
/* add header length */
return equiv_tbl_len + CONTAINER_HDR_SZ;
}
static void free_equiv_cpu_table(void)
{
+ if (x86_family(bsp_cpuid_1_eax) >= 0x17)
+ return;
+
vfree(equiv_table.entry);
memset(&equiv_table, 0, sizeof(equiv_table));
}
@@ -772,7 +983,7 @@ static int verify_and_add_patch(u8 family, u8 *fw, unsigned int leftover,
u16 proc_id;
int ret;
- ret = verify_patch(family, fw, leftover, patch_size, false);
+ ret = verify_patch(fw, leftover, patch_size);
if (ret)
return ret;
@@ -788,6 +999,7 @@ static int verify_and_add_patch(u8 family, u8 *fw, unsigned int leftover,
kfree(patch);
return -EINVAL;
}
+ patch->size = *patch_size;
mc_hdr = (struct microcode_header_amd *)(fw + SECTION_HDR_SIZE);
proc_id = mc_hdr->processor_rev_id;
@@ -796,7 +1008,7 @@ static int verify_and_add_patch(u8 family, u8 *fw, unsigned int leftover,
patch->patch_id = mc_hdr->patch_id;
patch->equiv_cpu = proc_id;
- pr_debug("%s: Added patch_id: 0x%08x, proc_id: 0x%04x\n",
+ pr_debug("%s: Adding patch_id: 0x%08x, proc_id: 0x%04x\n",
__func__, patch->patch_id, proc_id);
/* ... and add to cache. */
@@ -805,8 +1017,8 @@ static int verify_and_add_patch(u8 family, u8 *fw, unsigned int leftover,
return 0;
}
-static enum ucode_state __load_microcode_amd(u8 family, const u8 *data,
- size_t size)
+/* Scan the blob in @data and add microcode patches to the cache. */
+static enum ucode_state __load_microcode_amd(u8 family, const u8 *data, size_t size)
{
u8 *fw = (u8 *)data;
size_t offset;
@@ -839,40 +1051,75 @@ static enum ucode_state __load_microcode_amd(u8 family, const u8 *data,
return UCODE_OK;
}
-static enum ucode_state
-load_microcode_amd(bool save, u8 family, const u8 *data, size_t size)
+static enum ucode_state _load_microcode_amd(u8 family, const u8 *data, size_t size)
{
- struct ucode_patch *p;
enum ucode_state ret;
/* free old equiv table */
free_equiv_cpu_table();
ret = __load_microcode_amd(family, data, size);
- if (ret != UCODE_OK) {
+ if (ret != UCODE_OK)
cleanup();
- return ret;
- }
- p = find_patch(0);
- if (!p) {
+ return ret;
+}
+
+static enum ucode_state load_microcode_amd(u8 family, const u8 *data, size_t size)
+{
+ struct cpuinfo_x86 *c;
+ unsigned int nid, cpu;
+ struct ucode_patch *p;
+ enum ucode_state ret;
+
+ ret = _load_microcode_amd(family, data, size);
+ if (ret != UCODE_OK)
return ret;
- } else {
- if (boot_cpu_data.microcode >= p->patch_id)
- return ret;
+
+ for_each_node_with_cpus(nid) {
+ cpu = cpumask_first(cpumask_of_node(nid));
+ c = &cpu_data(cpu);
+
+ p = find_patch(cpu);
+ if (!p)
+ continue;
+
+ if (c->microcode >= p->patch_id)
+ continue;
ret = UCODE_NEW;
}
- /* save BSP's matching patch for early load */
- if (!save)
- return ret;
+ return ret;
+}
+
+static int __init save_microcode_in_initrd(void)
+{
+ struct cpuinfo_x86 *c = &boot_cpu_data;
+ struct cont_desc desc = { 0 };
+ unsigned int cpuid_1_eax;
+ enum ucode_state ret;
+ struct cpio_data cp;
- memset(amd_ucode_patch, 0, PATCH_MAX_SIZE);
- memcpy(amd_ucode_patch, p->data, min_t(u32, ksize(p->data), PATCH_MAX_SIZE));
+ if (microcode_loader_disabled() || c->x86_vendor != X86_VENDOR_AMD || c->x86 < 0x10)
+ return 0;
- return ret;
+ cpuid_1_eax = native_cpuid_eax(1);
+
+ if (!find_blobs_in_containers(&cp))
+ return -EINVAL;
+
+ scan_containers(cp.data, cp.size, &desc);
+ if (!desc.mc)
+ return -EINVAL;
+
+ ret = _load_microcode_amd(x86_family(cpuid_1_eax), desc.data, desc.size);
+ if (ret > UCODE_UPDATED)
+ return -EINVAL;
+
+ return 0;
}
+early_initcall(save_microcode_in_initrd);
/*
* AMD microcode firmware naming convention, up to family 15h they are in
@@ -890,18 +1137,15 @@ load_microcode_amd(bool save, u8 family, const u8 *data, size_t size)
*
* These might be larger than 2K.
*/
-static enum ucode_state request_microcode_amd(int cpu, struct device *device,
- bool refresh_fw)
+static enum ucode_state request_microcode_amd(int cpu, struct device *device)
{
char fw_name[36] = "amd-ucode/microcode_amd.bin";
struct cpuinfo_x86 *c = &cpu_data(cpu);
- bool bsp = c->cpu_index == boot_cpu_data.cpu_index;
enum ucode_state ret = UCODE_NFOUND;
const struct firmware *fw;
- /* reload ucode container only on the boot cpu */
- if (!refresh_fw || !bsp)
- return UCODE_OK;
+ if (force_minrev)
+ return UCODE_NFOUND;
if (c->x86 >= 0x15)
snprintf(fw_name, sizeof(fw_name), "amd-ucode/microcode_amd_fam%.2xh.bin", c->x86);
@@ -912,10 +1156,10 @@ static enum ucode_state request_microcode_amd(int cpu, struct device *device,
}
ret = UCODE_ERROR;
- if (!verify_container(fw->data, fw->size, false))
+ if (!verify_container(fw->data, fw->size))
goto fw_release;
- ret = load_microcode_amd(bsp, c->x86, fw->data, fw->size);
+ ret = load_microcode_amd(c->x86, fw->data, fw->size);
fw_release:
release_firmware(fw);
@@ -924,12 +1168,6 @@ static enum ucode_state request_microcode_amd(int cpu, struct device *device,
return ret;
}
-static enum ucode_state
-request_microcode_user(int cpu, const void __user *buf, size_t size)
-{
- return UCODE_ERROR;
-}
-
static void microcode_fini_cpu_amd(int cpu)
{
struct ucode_cpu_info *uci = ucode_cpu_info + cpu;
@@ -937,12 +1175,19 @@ static void microcode_fini_cpu_amd(int cpu)
uci->mc = NULL;
}
+static void finalize_late_load_amd(int result)
+{
+ if (result)
+ cleanup();
+}
+
static struct microcode_ops microcode_amd_ops = {
- .request_microcode_user = request_microcode_user,
- .request_microcode_fw = request_microcode_amd,
- .collect_cpu_info = collect_cpu_info_amd,
- .apply_microcode = apply_microcode_amd,
- .microcode_fini_cpu = microcode_fini_cpu_amd,
+ .request_microcode_fw = request_microcode_amd,
+ .collect_cpu_info = collect_cpu_info_amd,
+ .apply_microcode = apply_microcode_amd,
+ .microcode_fini_cpu = microcode_fini_cpu_amd,
+ .finalize_late_load = finalize_late_load_amd,
+ .nmi_safe = true,
};
struct microcode_ops * __init init_amd_microcode(void)
@@ -953,11 +1198,6 @@ struct microcode_ops * __init init_amd_microcode(void)
pr_warn("AMD CPU family 0x%x not supported\n", c->x86);
return NULL;
}
-
- if (ucode_new_rev)
- pr_info_once("microcode updated early to new patch_level=0x%08x\n",
- ucode_new_rev);
-
return &microcode_amd_ops;
}
diff --git a/arch/x86/kernel/cpu/microcode/amd_shas.c b/arch/x86/kernel/cpu/microcode/amd_shas.c
new file mode 100644
index 000000000000..2a1655b1fdd8
--- /dev/null
+++ b/arch/x86/kernel/cpu/microcode/amd_shas.c
@@ -0,0 +1,444 @@
+/* Keep 'em sorted. */
+static const struct patch_digest phashes[] = {
+ { 0x8001227, {
+ 0x99,0xc0,0x9b,0x2b,0xcc,0x9f,0x52,0x1b,
+ 0x1a,0x5f,0x1d,0x83,0xa1,0x6c,0xc4,0x46,
+ 0xe2,0x6c,0xda,0x73,0xfb,0x2d,0x23,0xa8,
+ 0x77,0xdc,0x15,0x31,0x33,0x4a,0x46,0x18,
+ }
+ },
+ { 0x8001250, {
+ 0xc0,0x0b,0x6b,0x19,0xfd,0x5c,0x39,0x60,
+ 0xd5,0xc3,0x57,0x46,0x54,0xe4,0xd1,0xaa,
+ 0xa8,0xf7,0x1f,0xa8,0x6a,0x60,0x3e,0xe3,
+ 0x27,0x39,0x8e,0x53,0x30,0xf8,0x49,0x19,
+ }
+ },
+ { 0x800126e, {
+ 0xf3,0x8b,0x2b,0xb6,0x34,0xe3,0xc8,0x2c,
+ 0xef,0xec,0x63,0x6d,0xc8,0x76,0x77,0xb3,
+ 0x25,0x5a,0xb7,0x52,0x8c,0x83,0x26,0xe6,
+ 0x4c,0xbe,0xbf,0xe9,0x7d,0x22,0x6a,0x43,
+ }
+ },
+ { 0x800126f, {
+ 0x2b,0x5a,0xf2,0x9c,0xdd,0xd2,0x7f,0xec,
+ 0xec,0x96,0x09,0x57,0xb0,0x96,0x29,0x8b,
+ 0x2e,0x26,0x91,0xf0,0x49,0x33,0x42,0x18,
+ 0xdd,0x4b,0x65,0x5a,0xd4,0x15,0x3d,0x33,
+ }
+ },
+ { 0x800820d, {
+ 0x68,0x98,0x83,0xcd,0x22,0x0d,0xdd,0x59,
+ 0x73,0x2c,0x5b,0x37,0x1f,0x84,0x0e,0x67,
+ 0x96,0x43,0x83,0x0c,0x46,0x44,0xab,0x7c,
+ 0x7b,0x65,0x9e,0x57,0xb5,0x90,0x4b,0x0e,
+ }
+ },
+ { 0x8301025, {
+ 0xe4,0x7d,0xdb,0x1e,0x14,0xb4,0x5e,0x36,
+ 0x8f,0x3e,0x48,0x88,0x3c,0x6d,0x76,0xa1,
+ 0x59,0xc6,0xc0,0x72,0x42,0xdf,0x6c,0x30,
+ 0x6f,0x0b,0x28,0x16,0x61,0xfc,0x79,0x77,
+ }
+ },
+ { 0x8301055, {
+ 0x81,0x7b,0x99,0x1b,0xae,0x2d,0x4f,0x9a,
+ 0xef,0x13,0xce,0xb5,0x10,0xaf,0x6a,0xea,
+ 0xe5,0xb0,0x64,0x98,0x10,0x68,0x34,0x3b,
+ 0x9d,0x7a,0xd6,0x22,0x77,0x5f,0xb3,0x5b,
+ }
+ },
+ { 0x8301072, {
+ 0xcf,0x76,0xa7,0x1a,0x49,0xdf,0x2a,0x5e,
+ 0x9e,0x40,0x70,0xe5,0xdd,0x8a,0xa8,0x28,
+ 0x20,0xdc,0x91,0xd8,0x2c,0xa6,0xa0,0xb1,
+ 0x2d,0x22,0x26,0x94,0x4b,0x40,0x85,0x30,
+ }
+ },
+ { 0x830107a, {
+ 0x2a,0x65,0x8c,0x1a,0x5e,0x07,0x21,0x72,
+ 0xdf,0x90,0xa6,0x51,0x37,0xd3,0x4b,0x34,
+ 0xc4,0xda,0x03,0xe1,0x8a,0x6c,0xfb,0x20,
+ 0x04,0xb2,0x81,0x05,0xd4,0x87,0xf4,0x0a,
+ }
+ },
+ { 0x830107b, {
+ 0xb3,0x43,0x13,0x63,0x56,0xc1,0x39,0xad,
+ 0x10,0xa6,0x2b,0xcc,0x02,0xe6,0x76,0x2a,
+ 0x1e,0x39,0x58,0x3e,0x23,0x6e,0xa4,0x04,
+ 0x95,0xea,0xf9,0x6d,0xc2,0x8a,0x13,0x19,
+ }
+ },
+ { 0x830107c, {
+ 0x21,0x64,0xde,0xfb,0x9f,0x68,0x96,0x47,
+ 0x70,0x5c,0xe2,0x8f,0x18,0x52,0x6a,0xac,
+ 0xa4,0xd2,0x2e,0xe0,0xde,0x68,0x66,0xc3,
+ 0xeb,0x1e,0xd3,0x3f,0xbc,0x51,0x1d,0x38,
+ }
+ },
+ { 0x860010d, {
+ 0x86,0xb6,0x15,0x83,0xbc,0x3b,0x9c,0xe0,
+ 0xb3,0xef,0x1d,0x99,0x84,0x35,0x15,0xf7,
+ 0x7c,0x2a,0xc6,0x42,0xdb,0x73,0x07,0x5c,
+ 0x7d,0xc3,0x02,0xb5,0x43,0x06,0x5e,0xf8,
+ }
+ },
+ { 0x8608108, {
+ 0x14,0xfe,0x57,0x86,0x49,0xc8,0x68,0xe2,
+ 0x11,0xa3,0xcb,0x6e,0xff,0x6e,0xd5,0x38,
+ 0xfe,0x89,0x1a,0xe0,0x67,0xbf,0xc4,0xcc,
+ 0x1b,0x9f,0x84,0x77,0x2b,0x9f,0xaa,0xbd,
+ }
+ },
+ { 0x8701034, {
+ 0xc3,0x14,0x09,0xa8,0x9c,0x3f,0x8d,0x83,
+ 0x9b,0x4c,0xa5,0xb7,0x64,0x8b,0x91,0x5d,
+ 0x85,0x6a,0x39,0x26,0x1e,0x14,0x41,0xa8,
+ 0x75,0xea,0xa6,0xf9,0xc9,0xd1,0xea,0x2b,
+ }
+ },
+ { 0x8a00008, {
+ 0xd7,0x2a,0x93,0xdc,0x05,0x2f,0xa5,0x6e,
+ 0x0c,0x61,0x2c,0x07,0x9f,0x38,0xe9,0x8e,
+ 0xef,0x7d,0x2a,0x05,0x4d,0x56,0xaf,0x72,
+ 0xe7,0x56,0x47,0x6e,0x60,0x27,0xd5,0x8c,
+ }
+ },
+ { 0x8a0000a, {
+ 0x73,0x31,0x26,0x22,0xd4,0xf9,0xee,0x3c,
+ 0x07,0x06,0xe7,0xb9,0xad,0xd8,0x72,0x44,
+ 0x33,0x31,0xaa,0x7d,0xc3,0x67,0x0e,0xdb,
+ 0x47,0xb5,0xaa,0xbc,0xf5,0xbb,0xd9,0x20,
+ }
+ },
+ { 0xa00104c, {
+ 0x3c,0x8a,0xfe,0x04,0x62,0xd8,0x6d,0xbe,
+ 0xa7,0x14,0x28,0x64,0x75,0xc0,0xa3,0x76,
+ 0xb7,0x92,0x0b,0x97,0x0a,0x8e,0x9c,0x5b,
+ 0x1b,0xc8,0x9d,0x3a,0x1e,0x81,0x3d,0x3b,
+ }
+ },
+ { 0xa00104e, {
+ 0xc4,0x35,0x82,0x67,0xd2,0x86,0xe5,0xb2,
+ 0xfd,0x69,0x12,0x38,0xc8,0x77,0xba,0xe0,
+ 0x70,0xf9,0x77,0x89,0x10,0xa6,0x74,0x4e,
+ 0x56,0x58,0x13,0xf5,0x84,0x70,0x28,0x0b,
+ }
+ },
+ { 0xa001053, {
+ 0x92,0x0e,0xf4,0x69,0x10,0x3b,0xf9,0x9d,
+ 0x31,0x1b,0xa6,0x99,0x08,0x7d,0xd7,0x25,
+ 0x7e,0x1e,0x89,0xba,0x35,0x8d,0xac,0xcb,
+ 0x3a,0xb4,0xdf,0x58,0x12,0xcf,0xc0,0xc3,
+ }
+ },
+ { 0xa001058, {
+ 0x33,0x7d,0xa9,0xb5,0x4e,0x62,0x13,0x36,
+ 0xef,0x66,0xc9,0xbd,0x0a,0xa6,0x3b,0x19,
+ 0xcb,0xf5,0xc2,0xc3,0x55,0x47,0x20,0xec,
+ 0x1f,0x7b,0xa1,0x44,0x0e,0x8e,0xa4,0xb2,
+ }
+ },
+ { 0xa001075, {
+ 0x39,0x02,0x82,0xd0,0x7c,0x26,0x43,0xe9,
+ 0x26,0xa3,0xd9,0x96,0xf7,0x30,0x13,0x0a,
+ 0x8a,0x0e,0xac,0xe7,0x1d,0xdc,0xe2,0x0f,
+ 0xcb,0x9e,0x8d,0xbc,0xd2,0xa2,0x44,0xe0,
+ }
+ },
+ { 0xa001078, {
+ 0x2d,0x67,0xc7,0x35,0xca,0xef,0x2f,0x25,
+ 0x4c,0x45,0x93,0x3f,0x36,0x01,0x8c,0xce,
+ 0xa8,0x5b,0x07,0xd3,0xc1,0x35,0x3c,0x04,
+ 0x20,0xa2,0xfc,0xdc,0xe6,0xce,0x26,0x3e,
+ }
+ },
+ { 0xa001079, {
+ 0x43,0xe2,0x05,0x9c,0xfd,0xb7,0x5b,0xeb,
+ 0x5b,0xe9,0xeb,0x3b,0x96,0xf4,0xe4,0x93,
+ 0x73,0x45,0x3e,0xac,0x8d,0x3b,0xe4,0xdb,
+ 0x10,0x31,0xc1,0xe4,0xa2,0xd0,0x5a,0x8a,
+ }
+ },
+ { 0xa00107a, {
+ 0x5f,0x92,0xca,0xff,0xc3,0x59,0x22,0x5f,
+ 0x02,0xa0,0x91,0x3b,0x4a,0x45,0x10,0xfd,
+ 0x19,0xe1,0x8a,0x6d,0x9a,0x92,0xc1,0x3f,
+ 0x75,0x78,0xac,0x78,0x03,0x1d,0xdb,0x18,
+ }
+ },
+ { 0xa001143, {
+ 0x56,0xca,0xf7,0x43,0x8a,0x4c,0x46,0x80,
+ 0xec,0xde,0xe5,0x9c,0x50,0x84,0x9a,0x42,
+ 0x27,0xe5,0x51,0x84,0x8f,0x19,0xc0,0x8d,
+ 0x0c,0x25,0xb4,0xb0,0x8f,0x10,0xf3,0xf8,
+ }
+ },
+ { 0xa001144, {
+ 0x42,0xd5,0x9b,0xa7,0xd6,0x15,0x29,0x41,
+ 0x61,0xc4,0x72,0x3f,0xf3,0x06,0x78,0x4b,
+ 0x65,0xf3,0x0e,0xfa,0x9c,0x87,0xde,0x25,
+ 0xbd,0xb3,0x9a,0xf4,0x75,0x13,0x53,0xdc,
+ }
+ },
+ { 0xa00115d, {
+ 0xd4,0xc4,0x49,0x36,0x89,0x0b,0x47,0xdd,
+ 0xfb,0x2f,0x88,0x3b,0x5f,0xf2,0x8e,0x75,
+ 0xc6,0x6c,0x37,0x5a,0x90,0x25,0x94,0x3e,
+ 0x36,0x9c,0xae,0x02,0x38,0x6c,0xf5,0x05,
+ }
+ },
+ { 0xa001173, {
+ 0x28,0xbb,0x9b,0xd1,0xa0,0xa0,0x7e,0x3a,
+ 0x59,0x20,0xc0,0xa9,0xb2,0x5c,0xc3,0x35,
+ 0x53,0x89,0xe1,0x4c,0x93,0x2f,0x1d,0xc3,
+ 0xe5,0xf7,0xf3,0xc8,0x9b,0x61,0xaa,0x9e,
+ }
+ },
+ { 0xa0011a8, {
+ 0x97,0xc6,0x16,0x65,0x99,0xa4,0x85,0x3b,
+ 0xf6,0xce,0xaa,0x49,0x4a,0x3a,0xc5,0xb6,
+ 0x78,0x25,0xbc,0x53,0xaf,0x5d,0xcf,0xf4,
+ 0x23,0x12,0xbb,0xb1,0xbc,0x8a,0x02,0x2e,
+ }
+ },
+ { 0xa0011ce, {
+ 0xcf,0x1c,0x90,0xa3,0x85,0x0a,0xbf,0x71,
+ 0x94,0x0e,0x80,0x86,0x85,0x4f,0xd7,0x86,
+ 0xae,0x38,0x23,0x28,0x2b,0x35,0x9b,0x4e,
+ 0xfe,0xb8,0xcd,0x3d,0x3d,0x39,0xc9,0x6a,
+ }
+ },
+ { 0xa0011d1, {
+ 0xdf,0x0e,0xca,0xde,0xf6,0xce,0x5c,0x1e,
+ 0x4c,0xec,0xd7,0x71,0x83,0xcc,0xa8,0x09,
+ 0xc7,0xc5,0xfe,0xb2,0xf7,0x05,0xd2,0xc5,
+ 0x12,0xdd,0xe4,0xf3,0x92,0x1c,0x3d,0xb8,
+ }
+ },
+ { 0xa0011d3, {
+ 0x91,0xe6,0x10,0xd7,0x57,0xb0,0x95,0x0b,
+ 0x9a,0x24,0xee,0xf7,0xcf,0x56,0xc1,0xa6,
+ 0x4a,0x52,0x7d,0x5f,0x9f,0xdf,0xf6,0x00,
+ 0x65,0xf7,0xea,0xe8,0x2a,0x88,0xe2,0x26,
+ }
+ },
+ { 0xa0011d5, {
+ 0xed,0x69,0x89,0xf4,0xeb,0x64,0xc2,0x13,
+ 0xe0,0x51,0x1f,0x03,0x26,0x52,0x7d,0xb7,
+ 0x93,0x5d,0x65,0xca,0xb8,0x12,0x1d,0x62,
+ 0x0d,0x5b,0x65,0x34,0x69,0xb2,0x62,0x21,
+ }
+ },
+ { 0xa001223, {
+ 0xfb,0x32,0x5f,0xc6,0x83,0x4f,0x8c,0xb8,
+ 0xa4,0x05,0xf9,0x71,0x53,0x01,0x16,0xc4,
+ 0x83,0x75,0x94,0xdd,0xeb,0x7e,0xb7,0x15,
+ 0x8e,0x3b,0x50,0x29,0x8a,0x9c,0xcc,0x45,
+ }
+ },
+ { 0xa001224, {
+ 0x0e,0x0c,0xdf,0xb4,0x89,0xee,0x35,0x25,
+ 0xdd,0x9e,0xdb,0xc0,0x69,0x83,0x0a,0xad,
+ 0x26,0xa9,0xaa,0x9d,0xfc,0x3c,0xea,0xf9,
+ 0x6c,0xdc,0xd5,0x6d,0x8b,0x6e,0x85,0x4a,
+ }
+ },
+ { 0xa001227, {
+ 0xab,0xc6,0x00,0x69,0x4b,0x50,0x87,0xad,
+ 0x5f,0x0e,0x8b,0xea,0x57,0x38,0xce,0x1d,
+ 0x0f,0x75,0x26,0x02,0xf6,0xd6,0x96,0xe9,
+ 0x87,0xb9,0xd6,0x20,0x27,0x7c,0xd2,0xe0,
+ }
+ },
+ { 0xa001229, {
+ 0x7f,0x49,0x49,0x48,0x46,0xa5,0x50,0xa6,
+ 0x28,0x89,0x98,0xe2,0x9e,0xb4,0x7f,0x75,
+ 0x33,0xa7,0x04,0x02,0xe4,0x82,0xbf,0xb4,
+ 0xa5,0x3a,0xba,0x24,0x8d,0x31,0x10,0x1d,
+ }
+ },
+ { 0xa00122e, {
+ 0x56,0x94,0xa9,0x5d,0x06,0x68,0xfe,0xaf,
+ 0xdf,0x7a,0xff,0x2d,0xdf,0x74,0x0f,0x15,
+ 0x66,0xfb,0x00,0xb5,0x51,0x97,0x9b,0xfa,
+ 0xcb,0x79,0x85,0x46,0x25,0xb4,0xd2,0x10,
+ }
+ },
+ { 0xa001231, {
+ 0x0b,0x46,0xa5,0xfc,0x18,0x15,0xa0,0x9e,
+ 0xa6,0xdc,0xb7,0xff,0x17,0xf7,0x30,0x64,
+ 0xd4,0xda,0x9e,0x1b,0xc3,0xfc,0x02,0x3b,
+ 0xe2,0xc6,0x0e,0x41,0x54,0xb5,0x18,0xdd,
+ }
+ },
+ { 0xa001234, {
+ 0x88,0x8d,0xed,0xab,0xb5,0xbd,0x4e,0xf7,
+ 0x7f,0xd4,0x0e,0x95,0x34,0x91,0xff,0xcc,
+ 0xfb,0x2a,0xcd,0xf7,0xd5,0xdb,0x4c,0x9b,
+ 0xd6,0x2e,0x73,0x50,0x8f,0x83,0x79,0x1a,
+ }
+ },
+ { 0xa001236, {
+ 0x3d,0x30,0x00,0xb9,0x71,0xba,0x87,0x78,
+ 0xa8,0x43,0x55,0xc4,0x26,0x59,0xcf,0x9d,
+ 0x93,0xce,0x64,0x0e,0x8b,0x72,0x11,0x8b,
+ 0xa3,0x8f,0x51,0xe9,0xca,0x98,0xaa,0x25,
+ }
+ },
+ { 0xa001238, {
+ 0x72,0xf7,0x4b,0x0c,0x7d,0x58,0x65,0xcc,
+ 0x00,0xcc,0x57,0x16,0x68,0x16,0xf8,0x2a,
+ 0x1b,0xb3,0x8b,0xe1,0xb6,0x83,0x8c,0x7e,
+ 0xc0,0xcd,0x33,0xf2,0x8d,0xf9,0xef,0x59,
+ }
+ },
+ { 0xa00820c, {
+ 0xa8,0x0c,0x81,0xc0,0xa6,0x00,0xe7,0xf3,
+ 0x5f,0x65,0xd3,0xb9,0x6f,0xea,0x93,0x63,
+ 0xf1,0x8c,0x88,0x45,0xd7,0x82,0x80,0xd1,
+ 0xe1,0x3b,0x8d,0xb2,0xf8,0x22,0x03,0xe2,
+ }
+ },
+ { 0xa10113e, {
+ 0x05,0x3c,0x66,0xd7,0xa9,0x5a,0x33,0x10,
+ 0x1b,0xf8,0x9c,0x8f,0xed,0xfc,0xa7,0xa0,
+ 0x15,0xe3,0x3f,0x4b,0x1d,0x0d,0x0a,0xd5,
+ 0xfa,0x90,0xc4,0xed,0x9d,0x90,0xaf,0x53,
+ }
+ },
+ { 0xa101144, {
+ 0xb3,0x0b,0x26,0x9a,0xf8,0x7c,0x02,0x26,
+ 0x35,0x84,0x53,0xa4,0xd3,0x2c,0x7c,0x09,
+ 0x68,0x7b,0x96,0xb6,0x93,0xef,0xde,0xbc,
+ 0xfd,0x4b,0x15,0xd2,0x81,0xd3,0x51,0x47,
+ }
+ },
+ { 0xa101148, {
+ 0x20,0xd5,0x6f,0x40,0x4a,0xf6,0x48,0x90,
+ 0xc2,0x93,0x9a,0xc2,0xfd,0xac,0xef,0x4f,
+ 0xfa,0xc0,0x3d,0x92,0x3c,0x6d,0x01,0x08,
+ 0xf1,0x5e,0xb0,0xde,0xb4,0x98,0xae,0xc4,
+ }
+ },
+ { 0xa10123e, {
+ 0x03,0xb9,0x2c,0x76,0x48,0x93,0xc9,0x18,
+ 0xfb,0x56,0xfd,0xf7,0xe2,0x1d,0xca,0x4d,
+ 0x1d,0x13,0x53,0x63,0xfe,0x42,0x6f,0xfc,
+ 0x19,0x0f,0xf1,0xfc,0xa7,0xdd,0x89,0x1b,
+ }
+ },
+ { 0xa101244, {
+ 0x71,0x56,0xb5,0x9f,0x21,0xbf,0xb3,0x3c,
+ 0x8c,0xd7,0x36,0xd0,0x34,0x52,0x1b,0xb1,
+ 0x46,0x2f,0x04,0xf0,0x37,0xd8,0x1e,0x72,
+ 0x24,0xa2,0x80,0x84,0x83,0x65,0x84,0xc0,
+ }
+ },
+ { 0xa101248, {
+ 0xed,0x3b,0x95,0xa6,0x68,0xa7,0x77,0x3e,
+ 0xfc,0x17,0x26,0xe2,0x7b,0xd5,0x56,0x22,
+ 0x2c,0x1d,0xef,0xeb,0x56,0xdd,0xba,0x6e,
+ 0x1b,0x7d,0x64,0x9d,0x4b,0x53,0x13,0x75,
+ }
+ },
+ { 0xa108108, {
+ 0xed,0xc2,0xec,0xa1,0x15,0xc6,0x65,0xe9,
+ 0xd0,0xef,0x39,0xaa,0x7f,0x55,0x06,0xc6,
+ 0xf5,0xd4,0x3f,0x7b,0x14,0xd5,0x60,0x2c,
+ 0x28,0x1e,0x9c,0x59,0x69,0x99,0x4d,0x16,
+ }
+ },
+ { 0xa20102d, {
+ 0xf9,0x6e,0xf2,0x32,0xd3,0x0f,0x5f,0x11,
+ 0x59,0xa1,0xfe,0xcc,0xcd,0x9b,0x42,0x89,
+ 0x8b,0x89,0x2f,0xb5,0xbb,0x82,0xef,0x23,
+ 0x8c,0xe9,0x19,0x3e,0xcc,0x3f,0x7b,0xb4,
+ }
+ },
+ { 0xa201210, {
+ 0xe8,0x6d,0x51,0x6a,0x8e,0x72,0xf3,0xfe,
+ 0x6e,0x16,0xbc,0x62,0x59,0x40,0x17,0xe9,
+ 0x6d,0x3d,0x0e,0x6b,0xa7,0xac,0xe3,0x68,
+ 0xf7,0x55,0xf0,0x13,0xbb,0x22,0xf6,0x41,
+ }
+ },
+ { 0xa404107, {
+ 0xbb,0x04,0x4e,0x47,0xdd,0x5e,0x26,0x45,
+ 0x1a,0xc9,0x56,0x24,0xa4,0x4c,0x82,0xb0,
+ 0x8b,0x0d,0x9f,0xf9,0x3a,0xdf,0xc6,0x81,
+ 0x13,0xbc,0xc5,0x25,0xe4,0xc5,0xc3,0x99,
+ }
+ },
+ { 0xa500011, {
+ 0x23,0x3d,0x70,0x7d,0x03,0xc3,0xc4,0xf4,
+ 0x2b,0x82,0xc6,0x05,0xda,0x80,0x0a,0xf1,
+ 0xd7,0x5b,0x65,0x3a,0x7d,0xab,0xdf,0xa2,
+ 0x11,0x5e,0x96,0x7e,0x71,0xe9,0xfc,0x74,
+ }
+ },
+ { 0xa601209, {
+ 0x66,0x48,0xd4,0x09,0x05,0xcb,0x29,0x32,
+ 0x66,0xb7,0x9a,0x76,0xcd,0x11,0xf3,0x30,
+ 0x15,0x86,0xcc,0x5d,0x97,0x0f,0xc0,0x46,
+ 0xe8,0x73,0xe2,0xd6,0xdb,0xd2,0x77,0x1d,
+ }
+ },
+ { 0xa704107, {
+ 0xf3,0xc6,0x58,0x26,0xee,0xac,0x3f,0xd6,
+ 0xce,0xa1,0x72,0x47,0x3b,0xba,0x2b,0x93,
+ 0x2a,0xad,0x8e,0x6b,0xea,0x9b,0xb7,0xc2,
+ 0x64,0x39,0x71,0x8c,0xce,0xe7,0x41,0x39,
+ }
+ },
+ { 0xa705206, {
+ 0x8d,0xc0,0x76,0xbd,0x58,0x9f,0x8f,0xa4,
+ 0x12,0x9d,0x21,0xfb,0x48,0x21,0xbc,0xe7,
+ 0x67,0x6f,0x04,0x18,0xae,0x20,0x87,0x4b,
+ 0x03,0x35,0xe9,0xbe,0xfb,0x06,0xdf,0xfc,
+ }
+ },
+ { 0xa708007, {
+ 0x6b,0x76,0xcc,0x78,0xc5,0x8a,0xa3,0xe3,
+ 0x32,0x2d,0x79,0xe4,0xc3,0x80,0xdb,0xb2,
+ 0x07,0xaa,0x3a,0xe0,0x57,0x13,0x72,0x80,
+ 0xdf,0x92,0x73,0x84,0x87,0x3c,0x73,0x93,
+ }
+ },
+ { 0xa70c005, {
+ 0x88,0x5d,0xfb,0x79,0x64,0xd8,0x46,0x3b,
+ 0x4a,0x83,0x8e,0x77,0x7e,0xcf,0xb3,0x0f,
+ 0x1f,0x1f,0xf1,0x97,0xeb,0xfe,0x56,0x55,
+ 0xee,0x49,0xac,0xe1,0x8b,0x13,0xc5,0x13,
+ }
+ },
+ { 0xaa00116, {
+ 0xe8,0x4c,0x2c,0x88,0xa1,0xac,0x24,0x63,
+ 0x65,0xe5,0xaa,0x2d,0x16,0xa9,0xc3,0xf5,
+ 0xfe,0x1d,0x5e,0x65,0xc7,0xaa,0x92,0x4d,
+ 0x91,0xee,0x76,0xbb,0x4c,0x66,0x78,0xc9,
+ }
+ },
+ { 0xaa00212, {
+ 0xbd,0x57,0x5d,0x0a,0x0a,0x30,0xc1,0x75,
+ 0x95,0x58,0x5e,0x93,0x02,0x28,0x43,0x71,
+ 0xed,0x42,0x29,0xc8,0xec,0x34,0x2b,0xb2,
+ 0x1a,0x65,0x4b,0xfe,0x07,0x0f,0x34,0xa1,
+ }
+ },
+ { 0xaa00213, {
+ 0xed,0x58,0xb7,0x76,0x81,0x7f,0xd9,0x3a,
+ 0x1a,0xff,0x8b,0x34,0xb8,0x4a,0x99,0x0f,
+ 0x28,0x49,0x6c,0x56,0x2b,0xdc,0xb7,0xed,
+ 0x96,0xd5,0x9d,0xc1,0x7a,0xd4,0x51,0x9b,
+ }
+ },
+ { 0xaa00215, {
+ 0x55,0xd3,0x28,0xcb,0x87,0xa9,0x32,0xe9,
+ 0x4e,0x85,0x4b,0x7c,0x6b,0xd5,0x7c,0xd4,
+ 0x1b,0x51,0x71,0x3a,0x0e,0x0b,0xdc,0x9b,
+ 0x68,0x2f,0x46,0xee,0xfe,0xc6,0x6d,0xef,
+ }
+ },
+};
diff --git a/arch/x86/kernel/cpu/microcode/core.c b/arch/x86/kernel/cpu/microcode/core.c
index ad57e0e4d674..fe50eb5b7c4a 100644
--- a/arch/x86/kernel/cpu/microcode/core.c
+++ b/arch/x86/kernel/cpu/microcode/core.c
@@ -23,6 +23,7 @@
#include <linux/miscdevice.h>
#include <linux/capability.h>
#include <linux/firmware.h>
+#include <linux/cpumask.h>
#include <linux/kernel.h>
#include <linux/delay.h>
#include <linux/mutex.h>
@@ -31,45 +32,35 @@
#include <linux/fs.h>
#include <linux/mm.h>
-#include <asm/microcode_intel.h>
+#include <asm/apic.h>
#include <asm/cpu_device_id.h>
-#include <asm/microcode_amd.h>
#include <asm/perf_event.h>
-#include <asm/microcode.h>
#include <asm/processor.h>
#include <asm/cmdline.h>
+#include <asm/msr.h>
#include <asm/setup.h>
-#define DRIVER_VERSION "2.2"
+#include "internal.h"
-static struct microcode_ops *microcode_ops;
-static bool dis_ucode_ldr = true;
+static struct microcode_ops *microcode_ops;
+static bool dis_ucode_ldr = false;
-bool initrd_gone;
-
-LIST_HEAD(microcode_cache);
+bool force_minrev = IS_ENABLED(CONFIG_MICROCODE_LATE_FORCE_MINREV);
+module_param(force_minrev, bool, S_IRUSR | S_IWUSR);
/*
* Synchronization.
*
* All non cpu-hotplug-callback call sites use:
*
- * - microcode_mutex to synchronize with each other;
* - cpus_read_lock/unlock() to synchronize with
* the cpu-hotplug-callback call sites.
*
* We guarantee that only a single cpu is being
* updated at any particular moment of time.
*/
-static DEFINE_MUTEX(microcode_mutex);
-
struct ucode_cpu_info ucode_cpu_info[NR_CPUS];
-struct cpu_info_ctx {
- struct cpu_signature *cpu_sig;
- int err;
-};
-
/*
* Those patch levels cannot be updated to newer ones and thus should be final.
*/
@@ -80,6 +71,8 @@ static u32 final_levels[] = {
0, /* T-101 terminator */
};
+struct early_load_data early_data;
+
/*
* Check the current patch level on this CPU.
*
@@ -92,12 +85,12 @@ static bool amd_check_current_patch_level(void)
u32 lvl, dummy, i;
u32 *levels;
+ if (x86_cpuid_vendor() != X86_VENDOR_AMD)
+ return false;
+
native_rdmsr(MSR_AMD64_PATCH_LEVEL, lvl, dummy);
- if (IS_ENABLED(CONFIG_X86_32))
- levels = (u32 *)__pa_nodebug(&final_levels);
- else
- levels = final_levels;
+ levels = final_levels;
for (i = 0; levels[i]; i++) {
if (lvl == levels[i])
@@ -106,38 +99,31 @@ static bool amd_check_current_patch_level(void)
return false;
}
-static bool __init check_loader_disabled_bsp(void)
+bool __init microcode_loader_disabled(void)
{
- static const char *__dis_opt_str = "dis_ucode_ldr";
-
-#ifdef CONFIG_X86_32
- const char *cmdline = (const char *)__pa_nodebug(boot_command_line);
- const char *option = (const char *)__pa_nodebug(__dis_opt_str);
- bool *res = (bool *)__pa_nodebug(&dis_ucode_ldr);
-
-#else /* CONFIG_X86_64 */
- const char *cmdline = boot_command_line;
- const char *option = __dis_opt_str;
- bool *res = &dis_ucode_ldr;
-#endif
+ if (dis_ucode_ldr)
+ return true;
/*
- * CPUID(1).ECX[31]: reserved for hypervisor use. This is still not
- * completely accurate as xen pv guests don't see that CPUID bit set but
- * that's good enough as they don't land on the BSP path anyway.
+ * Disable when:
+ *
+ * 1) The CPU does not support CPUID.
+ *
+ * 2) Bit 31 in CPUID[1]:ECX is clear
+ * The bit is reserved for hypervisor use. This is still not
+ * completely accurate as XEN PV guests don't see that CPUID bit
+ * set, but that's good enough as they don't land on the BSP
+ * path anyway.
+ *
+ * 3) Certain AMD patch levels are not allowed to be
+ * overwritten.
*/
- if (native_cpuid_ecx(1) & BIT(31))
- return *res;
-
- if (x86_cpuid_vendor() == X86_VENDOR_AMD) {
- if (amd_check_current_patch_level())
- return *res;
- }
+ if (!cpuid_feature() ||
+ native_cpuid_ecx(1) & BIT(31) ||
+ amd_check_current_patch_level())
+ dis_ucode_ldr = true;
- if (cmdline_find_option_bool(cmdline, option) <= 0)
- *res = false;
-
- return *res;
+ return dis_ucode_ldr;
}
void __init load_ucode_bsp(void)
@@ -145,7 +131,10 @@ void __init load_ucode_bsp(void)
unsigned int cpuid_1_eax;
bool intel = true;
- if (!have_cpuid_p())
+ if (cmdline_find_option_bool(boot_command_line, "dis_ucode_ldr") > 0)
+ dis_ucode_ldr = true;
+
+ if (microcode_loader_disabled())
return;
cpuid_1_eax = native_cpuid_eax(1);
@@ -166,29 +155,22 @@ void __init load_ucode_bsp(void)
return;
}
- if (check_loader_disabled_bsp())
- return;
-
if (intel)
- load_ucode_intel_bsp();
+ load_ucode_intel_bsp(&early_data);
else
- load_ucode_amd_bsp(cpuid_1_eax);
-}
-
-static bool check_loader_disabled_ap(void)
-{
-#ifdef CONFIG_X86_32
- return *((bool *)__pa_nodebug(&dis_ucode_ldr));
-#else
- return dis_ucode_ldr;
-#endif
+ load_ucode_amd_bsp(&early_data, cpuid_1_eax);
}
void load_ucode_ap(void)
{
unsigned int cpuid_1_eax;
- if (check_loader_disabled_ap())
+ /*
+ * Can't use microcode_loader_disabled() here - .init section
+ * hell. It doesn't have to either - the BSP variant must've
+ * parsed cmdline already anyway.
+ */
+ if (dis_ucode_ldr)
return;
cpuid_1_eax = native_cpuid_eax(1);
@@ -207,90 +189,37 @@ void load_ucode_ap(void)
}
}
-static int __init save_microcode_in_initrd(void)
-{
- struct cpuinfo_x86 *c = &boot_cpu_data;
- int ret = -EINVAL;
-
- switch (c->x86_vendor) {
- case X86_VENDOR_INTEL:
- if (c->x86 >= 6)
- ret = save_microcode_in_initrd_intel();
- break;
- case X86_VENDOR_AMD:
- if (c->x86 >= 0x10)
- ret = save_microcode_in_initrd_amd(cpuid_eax(1));
- break;
- default:
- break;
- }
-
- initrd_gone = true;
-
- return ret;
-}
-
-struct cpio_data find_microcode_in_initrd(const char *path, bool use_pa)
+struct cpio_data __init find_microcode_in_initrd(const char *path)
{
#ifdef CONFIG_BLK_DEV_INITRD
unsigned long start = 0;
size_t size;
#ifdef CONFIG_X86_32
- struct boot_params *params;
-
- if (use_pa)
- params = (struct boot_params *)__pa_nodebug(&boot_params);
- else
- params = &boot_params;
-
- size = params->hdr.ramdisk_size;
-
- /*
- * Set start only if we have an initrd image. We cannot use initrd_start
- * because it is not set that early yet.
- */
+ size = boot_params.hdr.ramdisk_size;
+ /* Early load on BSP has a temporary mapping. */
if (size)
- start = params->hdr.ramdisk_image;
+ start = initrd_start_early;
-# else /* CONFIG_X86_64 */
+#else /* CONFIG_X86_64 */
size = (unsigned long)boot_params.ext_ramdisk_size << 32;
size |= boot_params.hdr.ramdisk_size;
if (size) {
start = (unsigned long)boot_params.ext_ramdisk_image << 32;
start |= boot_params.hdr.ramdisk_image;
-
start += PAGE_OFFSET;
}
-# endif
+#endif
/*
* Fixup the start address: after reserve_initrd() runs, initrd_start
* has the virtual address of the beginning of the initrd. It also
* possibly relocates the ramdisk. In either case, initrd_start contains
* the updated address so use that instead.
- *
- * initrd_gone is for the hotplug case where we've thrown out initrd
- * already.
*/
- if (!use_pa) {
- if (initrd_gone)
- return (struct cpio_data){ NULL, 0, "" };
- if (initrd_start)
- start = initrd_start;
- } else {
- /*
- * The picture with physical addresses is a bit different: we
- * need to get the *physical* address to which the ramdisk was
- * relocated, i.e., relocated_ramdisk (not initrd_start) and
- * since we're running from physical addresses, we need to access
- * relocated_ramdisk through its *physical* address too.
- */
- u64 *rr = (u64 *)__pa_nodebug(&relocated_ramdisk);
- if (*rr)
- start = *rr;
- }
+ if (initrd_start)
+ start = initrd_start;
return find_cpio_data(path, (void *)start, size, NULL);
#else /* !CONFIG_BLK_DEV_INITRD */
@@ -298,7 +227,7 @@ struct cpio_data find_microcode_in_initrd(const char *path, bool use_pa)
#endif
}
-void reload_early_microcode(void)
+static void reload_early_microcode(unsigned int cpu)
{
int vendor, family;
@@ -312,241 +241,485 @@ void reload_early_microcode(void)
break;
case X86_VENDOR_AMD:
if (family >= 0x10)
- reload_ucode_amd();
+ reload_ucode_amd(cpu);
break;
default:
break;
}
}
-static void collect_cpu_info_local(void *arg)
-{
- struct cpu_info_ctx *ctx = arg;
+/* fake device for request_firmware */
+static struct platform_device *microcode_pdev;
- ctx->err = microcode_ops->collect_cpu_info(smp_processor_id(),
- ctx->cpu_sig);
-}
+#ifdef CONFIG_MICROCODE_LATE_LOADING
+/*
+ * Late loading dance. Why the heavy-handed stomp_machine effort?
+ *
+ * - HT siblings must be idle and not execute other code while the other sibling
+ * is loading microcode in order to avoid any negative interactions caused by
+ * the loading.
+ *
+ * - In addition, microcode update on the cores must be serialized until this
+ * requirement can be relaxed in the future. Right now, this is conservative
+ * and good.
+ */
+enum sibling_ctrl {
+ /* Spinwait with timeout */
+ SCTRL_WAIT,
+ /* Invoke the microcode_apply() callback */
+ SCTRL_APPLY,
+ /* Proceed without invoking the microcode_apply() callback */
+ SCTRL_DONE,
+};
+
+struct microcode_ctrl {
+ enum sibling_ctrl ctrl;
+ enum ucode_state result;
+ unsigned int ctrl_cpu;
+ bool nmi_enabled;
+};
-static int collect_cpu_info_on_target(int cpu, struct cpu_signature *cpu_sig)
+DEFINE_STATIC_KEY_FALSE(microcode_nmi_handler_enable);
+static DEFINE_PER_CPU(struct microcode_ctrl, ucode_ctrl);
+static atomic_t late_cpus_in, offline_in_nmi;
+static unsigned int loops_per_usec;
+static cpumask_t cpu_offline_mask;
+
+static noinstr bool wait_for_cpus(atomic_t *cnt)
{
- struct cpu_info_ctx ctx = { .cpu_sig = cpu_sig, .err = 0 };
- int ret;
+ unsigned int timeout, loops;
+
+ WARN_ON_ONCE(raw_atomic_dec_return(cnt) < 0);
- ret = smp_call_function_single(cpu, collect_cpu_info_local, &ctx, 1);
- if (!ret)
- ret = ctx.err;
+ for (timeout = 0; timeout < USEC_PER_SEC; timeout++) {
+ if (!raw_atomic_read(cnt))
+ return true;
- return ret;
+ for (loops = 0; loops < loops_per_usec; loops++)
+ cpu_relax();
+
+ /* If invoked directly, tickle the NMI watchdog */
+ if (!microcode_ops->use_nmi && !(timeout % USEC_PER_MSEC)) {
+ instrumentation_begin();
+ touch_nmi_watchdog();
+ instrumentation_end();
+ }
+ }
+ /* Prevent the late comers from making progress and let them time out */
+ raw_atomic_inc(cnt);
+ return false;
}
-static int collect_cpu_info(int cpu)
+static noinstr bool wait_for_ctrl(void)
{
- struct ucode_cpu_info *uci = ucode_cpu_info + cpu;
- int ret;
+ unsigned int timeout, loops;
- memset(uci, 0, sizeof(*uci));
+ for (timeout = 0; timeout < USEC_PER_SEC; timeout++) {
+ if (raw_cpu_read(ucode_ctrl.ctrl) != SCTRL_WAIT)
+ return true;
- ret = collect_cpu_info_on_target(cpu, &uci->cpu_sig);
- if (!ret)
- uci->valid = 1;
+ for (loops = 0; loops < loops_per_usec; loops++)
+ cpu_relax();
- return ret;
+ /* If invoked directly, tickle the NMI watchdog */
+ if (!microcode_ops->use_nmi && !(timeout % USEC_PER_MSEC)) {
+ instrumentation_begin();
+ touch_nmi_watchdog();
+ instrumentation_end();
+ }
+ }
+ return false;
}
-static void apply_microcode_local(void *arg)
+/*
+ * Protected against instrumentation up to the point where the primary
+ * thread completed the update. See microcode_nmi_handler() for details.
+ */
+static noinstr bool load_secondary_wait(unsigned int ctrl_cpu)
{
- enum ucode_state *err = arg;
+ /* Initial rendezvous to ensure that all CPUs have arrived */
+ if (!wait_for_cpus(&late_cpus_in)) {
+ raw_cpu_write(ucode_ctrl.result, UCODE_TIMEOUT);
+ return false;
+ }
- *err = microcode_ops->apply_microcode(smp_processor_id());
+ /*
+ * Wait for primary threads to complete. If one of them hangs due
+ * to the update, there is no way out. This is non-recoverable
+ * because the CPU might hold locks or resources and confuse the
+ * scheduler, watchdogs etc. There is no way to safely evacuate the
+ * machine.
+ */
+ if (wait_for_ctrl())
+ return true;
+
+ instrumentation_begin();
+ panic("Microcode load: Primary CPU %d timed out\n", ctrl_cpu);
+ instrumentation_end();
}
-static int apply_microcode_on_target(int cpu)
+/*
+ * Protected against instrumentation up to the point where the primary
+ * thread completed the update. See microcode_nmi_handler() for details.
+ */
+static noinstr void load_secondary(unsigned int cpu)
{
- enum ucode_state err;
- int ret;
-
- ret = smp_call_function_single(cpu, apply_microcode_local, &err, 1);
- if (!ret) {
- if (err == UCODE_ERROR)
- ret = 1;
+ unsigned int ctrl_cpu = raw_cpu_read(ucode_ctrl.ctrl_cpu);
+ enum ucode_state ret;
+
+ if (!load_secondary_wait(ctrl_cpu)) {
+ instrumentation_begin();
+ pr_err_once("load: %d CPUs timed out\n",
+ atomic_read(&late_cpus_in) - 1);
+ instrumentation_end();
+ return;
}
- return ret;
-}
-/* fake device for request_firmware */
-static struct platform_device *microcode_pdev;
+ /* Primary thread completed. Allow to invoke instrumentable code */
+ instrumentation_begin();
+ /*
+ * If the primary succeeded then invoke the apply() callback,
+ * otherwise copy the state from the primary thread.
+ */
+ if (this_cpu_read(ucode_ctrl.ctrl) == SCTRL_APPLY)
+ ret = microcode_ops->apply_microcode(cpu);
+ else
+ ret = per_cpu(ucode_ctrl.result, ctrl_cpu);
-#ifdef CONFIG_MICROCODE_LATE_LOADING
-/*
- * Late loading dance. Why the heavy-handed stomp_machine effort?
- *
- * - HT siblings must be idle and not execute other code while the other sibling
- * is loading microcode in order to avoid any negative interactions caused by
- * the loading.
- *
- * - In addition, microcode update on the cores must be serialized until this
- * requirement can be relaxed in the future. Right now, this is conservative
- * and good.
- */
-#define SPINUNIT 100 /* 100 nsec */
+ this_cpu_write(ucode_ctrl.result, ret);
+ this_cpu_write(ucode_ctrl.ctrl, SCTRL_DONE);
+ instrumentation_end();
+}
-static int check_online_cpus(void)
+static void __load_primary(unsigned int cpu)
{
- unsigned int cpu;
+ struct cpumask *secondaries = topology_sibling_cpumask(cpu);
+ enum sibling_ctrl ctrl;
+ enum ucode_state ret;
+ unsigned int sibling;
+
+ /* Initial rendezvous to ensure that all CPUs have arrived */
+ if (!wait_for_cpus(&late_cpus_in)) {
+ this_cpu_write(ucode_ctrl.result, UCODE_TIMEOUT);
+ pr_err_once("load: %d CPUs timed out\n", atomic_read(&late_cpus_in) - 1);
+ return;
+ }
+
+ ret = microcode_ops->apply_microcode(cpu);
+ this_cpu_write(ucode_ctrl.result, ret);
+ this_cpu_write(ucode_ctrl.ctrl, SCTRL_DONE);
/*
- * Make sure all CPUs are online. It's fine for SMT to be disabled if
- * all the primary threads are still online.
+ * If the update was successful, let the siblings run the apply()
+ * callback. If not, tell them it's done. This also covers the
+ * case where the CPU has uniform loading at package or system
+ * scope implemented but does not advertise it.
*/
- for_each_present_cpu(cpu) {
- if (topology_is_primary_thread(cpu) && !cpu_online(cpu)) {
- pr_err("Not all CPUs online, aborting microcode update.\n");
- return -EINVAL;
- }
+ if (ret == UCODE_UPDATED || ret == UCODE_OK)
+ ctrl = SCTRL_APPLY;
+ else
+ ctrl = SCTRL_DONE;
+
+ for_each_cpu(sibling, secondaries) {
+ if (sibling != cpu)
+ per_cpu(ucode_ctrl.ctrl, sibling) = ctrl;
}
+}
- return 0;
+static bool kick_offline_cpus(unsigned int nr_offl)
+{
+ unsigned int cpu, timeout;
+
+ for_each_cpu(cpu, &cpu_offline_mask) {
+ /* Enable the rendezvous handler and send NMI */
+ per_cpu(ucode_ctrl.nmi_enabled, cpu) = true;
+ apic_send_nmi_to_offline_cpu(cpu);
+ }
+
+ /* Wait for them to arrive */
+ for (timeout = 0; timeout < (USEC_PER_SEC / 2); timeout++) {
+ if (atomic_read(&offline_in_nmi) == nr_offl)
+ return true;
+ udelay(1);
+ }
+ /* Let the others time out */
+ return false;
}
-static atomic_t late_cpus_in;
-static atomic_t late_cpus_out;
+static void release_offline_cpus(void)
+{
+ unsigned int cpu;
-static int __wait_for_cpus(atomic_t *t, long long timeout)
+ for_each_cpu(cpu, &cpu_offline_mask)
+ per_cpu(ucode_ctrl.ctrl, cpu) = SCTRL_DONE;
+}
+
+static void load_primary(unsigned int cpu)
{
- int all_cpus = num_online_cpus();
+ unsigned int nr_offl = cpumask_weight(&cpu_offline_mask);
+ bool proceed = true;
- atomic_inc(t);
+ /* Kick soft-offlined SMT siblings if required */
+ if (!cpu && nr_offl)
+ proceed = kick_offline_cpus(nr_offl);
- while (atomic_read(t) < all_cpus) {
- if (timeout < SPINUNIT) {
- pr_err("Timeout while waiting for CPUs rendezvous, remaining: %d\n",
- all_cpus - atomic_read(t));
- return 1;
- }
+ /* If the soft-offlined CPUs did not respond, abort */
+ if (proceed)
+ __load_primary(cpu);
+
+ /* Unconditionally release soft-offlined SMT siblings if required */
+ if (!cpu && nr_offl)
+ release_offline_cpus();
+}
+
+/*
+ * Minimal stub rendezvous handler for soft-offlined CPUs which participate
+ * in the NMI rendezvous to protect against a concurrent NMI on affected
+ * CPUs.
+ */
+void noinstr microcode_offline_nmi_handler(void)
+{
+ if (!raw_cpu_read(ucode_ctrl.nmi_enabled))
+ return;
+ raw_cpu_write(ucode_ctrl.nmi_enabled, false);
+ raw_cpu_write(ucode_ctrl.result, UCODE_OFFLINE);
+ raw_atomic_inc(&offline_in_nmi);
+ wait_for_ctrl();
+}
- ndelay(SPINUNIT);
- timeout -= SPINUNIT;
+static noinstr bool microcode_update_handler(void)
+{
+ unsigned int cpu = raw_smp_processor_id();
- touch_nmi_watchdog();
+ if (raw_cpu_read(ucode_ctrl.ctrl_cpu) == cpu) {
+ instrumentation_begin();
+ load_primary(cpu);
+ instrumentation_end();
+ } else {
+ load_secondary(cpu);
}
- return 0;
+
+ instrumentation_begin();
+ touch_nmi_watchdog();
+ instrumentation_end();
+
+ return true;
}
/*
- * Returns:
- * < 0 - on error
- * 0 - success (no update done or microcode was updated)
+ * Protection against instrumentation is required for CPUs which are not
+ * safe against an NMI which is delivered to the secondary SMT sibling
+ * while the primary thread updates the microcode. Instrumentation can end
+ * up in #INT3, #DB and #PF. The IRET from those exceptions reenables NMI
+ * which is the opposite of what the NMI rendezvous is trying to achieve.
+ *
+ * The primary thread is safe versus instrumentation as the actual
+ * microcode update handles this correctly. It's only the sibling code
+ * path which must be NMI safe until the primary thread completed the
+ * update.
*/
-static int __reload_late(void *info)
+bool noinstr microcode_nmi_handler(void)
{
- int cpu = smp_processor_id();
- enum ucode_state err;
- int ret = 0;
+ if (!raw_cpu_read(ucode_ctrl.nmi_enabled))
+ return false;
- /*
- * Wait for all CPUs to arrive. A load will not be attempted unless all
- * CPUs show up.
- * */
- if (__wait_for_cpus(&late_cpus_in, NSEC_PER_SEC))
- return -1;
+ raw_cpu_write(ucode_ctrl.nmi_enabled, false);
+ return microcode_update_handler();
+}
+
+static int load_cpus_stopped(void *unused)
+{
+ if (microcode_ops->use_nmi) {
+ /* Enable the NMI handler and raise NMI */
+ this_cpu_write(ucode_ctrl.nmi_enabled, true);
+ apic->send_IPI(smp_processor_id(), NMI_VECTOR);
+ } else {
+ /* Just invoke the handler directly */
+ microcode_update_handler();
+ }
+ return 0;
+}
+
+static int load_late_stop_cpus(bool is_safe)
+{
+ unsigned int cpu, updated = 0, failed = 0, timedout = 0, siblings = 0;
+ unsigned int nr_offl, offline = 0;
+ int old_rev = boot_cpu_data.microcode;
+ struct cpuinfo_x86 prev_info;
+
+ if (!is_safe) {
+ pr_err("Late microcode loading without minimal revision check.\n");
+ pr_err("You should switch to early loading, if possible.\n");
+ }
+
+ atomic_set(&late_cpus_in, num_online_cpus());
+ atomic_set(&offline_in_nmi, 0);
+ loops_per_usec = loops_per_jiffy / (TICK_NSEC / 1000);
/*
- * On an SMT system, it suffices to load the microcode on one sibling of
- * the core because the microcode engine is shared between the threads.
- * Synchronization still needs to take place so that no concurrent
- * loading attempts happen on multiple threads of an SMT core. See
- * below.
+ * Take a snapshot before the microcode update in order to compare and
+ * check whether any bits changed after an update.
*/
- if (cpumask_first(topology_sibling_cpumask(cpu)) == cpu)
- apply_microcode_local(&err);
- else
- goto wait_for_siblings;
+ store_cpu_caps(&prev_info);
+
+ if (microcode_ops->use_nmi)
+ static_branch_enable_cpuslocked(&microcode_nmi_handler_enable);
- if (err >= UCODE_NFOUND) {
- if (err == UCODE_ERROR)
- pr_warn("Error reloading microcode on CPU %d\n", cpu);
+ stop_machine_cpuslocked(load_cpus_stopped, NULL, cpu_online_mask);
- ret = -1;
+ if (microcode_ops->use_nmi)
+ static_branch_disable_cpuslocked(&microcode_nmi_handler_enable);
+
+ /* Analyze the results */
+ for_each_cpu_and(cpu, cpu_present_mask, &cpus_booted_once_mask) {
+ switch (per_cpu(ucode_ctrl.result, cpu)) {
+ case UCODE_UPDATED: updated++; break;
+ case UCODE_TIMEOUT: timedout++; break;
+ case UCODE_OK: siblings++; break;
+ case UCODE_OFFLINE: offline++; break;
+ default: failed++; break;
+ }
}
-wait_for_siblings:
- if (__wait_for_cpus(&late_cpus_out, NSEC_PER_SEC))
- panic("Timeout during microcode update!\n");
+ if (microcode_ops->finalize_late_load)
+ microcode_ops->finalize_late_load(!updated);
- /*
- * At least one thread has completed update on each core.
- * For others, simply call the update to make sure the
- * per-cpu cpuinfo can be updated with right microcode
- * revision.
- */
- if (cpumask_first(topology_sibling_cpumask(cpu)) != cpu)
- apply_microcode_local(&err);
+ if (!updated) {
+ /* Nothing changed. */
+ if (!failed && !timedout)
+ return 0;
+
+ nr_offl = cpumask_weight(&cpu_offline_mask);
+ if (offline < nr_offl) {
+ pr_warn("%u offline siblings did not respond.\n",
+ nr_offl - atomic_read(&offline_in_nmi));
+ return -EIO;
+ }
+ pr_err("update failed: %u CPUs failed %u CPUs timed out\n",
+ failed, timedout);
+ return -EIO;
+ }
- return ret;
+ if (!is_safe || failed || timedout)
+ add_taint(TAINT_CPU_OUT_OF_SPEC, LOCKDEP_STILL_OK);
+
+ pr_info("load: updated on %u primary CPUs with %u siblings\n", updated, siblings);
+ if (failed || timedout) {
+ pr_err("load incomplete. %u CPUs timed out or failed\n",
+ num_online_cpus() - (updated + siblings));
+ }
+ pr_info("revision: 0x%x -> 0x%x\n", old_rev, boot_cpu_data.microcode);
+ microcode_check(&prev_info);
+
+ return updated + siblings == num_online_cpus() ? 0 : -EIO;
}
/*
- * Reload microcode late on all CPUs. Wait for a sec until they
- * all gather together.
+ * This function does two things:
+ *
+ * 1) Ensure that all required CPUs which are present and have been booted
+ * once are online.
+ *
+ * To pass this check, all primary threads must be online.
+ *
+ * If the microcode load is not safe against NMI then all SMT threads
+ * must be online as well because they still react to NMIs when they are
+ * soft-offlined and parked in one of the play_dead() variants. So if a
+ * NMI hits while the primary thread updates the microcode the resulting
+ * behaviour is undefined. The default play_dead() implementation on
+ * modern CPUs uses MWAIT, which is also not guaranteed to be safe
+ * against a microcode update which affects MWAIT.
+ *
+ * As soft-offlined CPUs still react on NMIs, the SMT sibling
+ * restriction can be lifted when the vendor driver signals to use NMI
+ * for rendezvous and the APIC provides a mechanism to send an NMI to a
+ * soft-offlined CPU. The soft-offlined CPUs are then able to
+ * participate in the rendezvous in a trivial stub handler.
+ *
+ * 2) Initialize the per CPU control structure and create a cpumask
+ * which contains "offline"; secondary threads, so they can be handled
+ * correctly by a control CPU.
*/
-static int microcode_reload_late(void)
+static bool setup_cpus(void)
{
- int ret;
+ struct microcode_ctrl ctrl = { .ctrl = SCTRL_WAIT, .result = -1, };
+ bool allow_smt_offline;
+ unsigned int cpu;
- pr_err("Attempting late microcode loading - it is dangerous and taints the kernel.\n");
- pr_err("You should switch to early loading, if possible.\n");
+ allow_smt_offline = microcode_ops->nmi_safe ||
+ (microcode_ops->use_nmi && apic->nmi_to_offline_cpu);
- atomic_set(&late_cpus_in, 0);
- atomic_set(&late_cpus_out, 0);
+ cpumask_clear(&cpu_offline_mask);
- ret = stop_machine_cpuslocked(__reload_late, NULL, cpu_online_mask);
- if (ret == 0)
- microcode_check();
+ for_each_cpu_and(cpu, cpu_present_mask, &cpus_booted_once_mask) {
+ /*
+ * Offline CPUs sit in one of the play_dead() functions
+ * with interrupts disabled, but they still react on NMIs
+ * and execute arbitrary code. Also MWAIT being updated
+ * while the offline CPU sits there is not necessarily safe
+ * on all CPU variants.
+ *
+ * Mark them in the offline_cpus mask which will be handled
+ * by CPU0 later in the update process.
+ *
+ * Ensure that the primary thread is online so that it is
+ * guaranteed that all cores are updated.
+ */
+ if (!cpu_online(cpu)) {
+ if (topology_is_primary_thread(cpu) || !allow_smt_offline) {
+ pr_err("CPU %u not online, loading aborted\n", cpu);
+ return false;
+ }
+ cpumask_set_cpu(cpu, &cpu_offline_mask);
+ per_cpu(ucode_ctrl, cpu) = ctrl;
+ continue;
+ }
- pr_info("Reload completed, microcode revision: 0x%x\n", boot_cpu_data.microcode);
+ /*
+ * Initialize the per CPU state. This is core scope for now,
+ * but prepared to take package or system scope into account.
+ */
+ ctrl.ctrl_cpu = cpumask_first(topology_sibling_cpumask(cpu));
+ per_cpu(ucode_ctrl, cpu) = ctrl;
+ }
+ return true;
+}
- return ret;
+static int load_late_locked(void)
+{
+ if (!setup_cpus())
+ return -EBUSY;
+
+ switch (microcode_ops->request_microcode_fw(0, &microcode_pdev->dev)) {
+ case UCODE_NEW:
+ return load_late_stop_cpus(false);
+ case UCODE_NEW_SAFE:
+ return load_late_stop_cpus(true);
+ case UCODE_NFOUND:
+ return -ENOENT;
+ case UCODE_OK:
+ return 0;
+ default:
+ return -EBADFD;
+ }
}
static ssize_t reload_store(struct device *dev,
struct device_attribute *attr,
const char *buf, size_t size)
{
- enum ucode_state tmp_ret = UCODE_OK;
- int bsp = boot_cpu_data.cpu_index;
unsigned long val;
- ssize_t ret = 0;
+ ssize_t ret;
ret = kstrtoul(buf, 0, &val);
- if (ret)
- return ret;
-
- if (val != 1)
- return size;
+ if (ret || val != 1)
+ return -EINVAL;
cpus_read_lock();
-
- ret = check_online_cpus();
- if (ret)
- goto put;
-
- tmp_ret = microcode_ops->request_microcode_fw(bsp, &microcode_pdev->dev, true);
- if (tmp_ret != UCODE_NEW)
- goto put;
-
- mutex_lock(&microcode_mutex);
- ret = microcode_reload_late();
- mutex_unlock(&microcode_mutex);
-
-put:
+ ret = load_late_locked();
cpus_read_unlock();
- if (ret == 0)
- ret = size;
-
- add_taint(TAINT_CPU_OUT_OF_SPEC, LOCKDEP_STILL_OK);
-
- return ret;
+ return ret ? : size;
}
static DEVICE_ATTR_WO(reload);
@@ -560,7 +733,7 @@ static ssize_t version_show(struct device *dev,
return sprintf(buf, "0x%x\n", uci->cpu_sig.rev);
}
-static ssize_t pf_show(struct device *dev,
+static ssize_t processor_flags_show(struct device *dev,
struct device_attribute *attr, char *buf)
{
struct ucode_cpu_info *uci = ucode_cpu_info + dev->id;
@@ -568,8 +741,8 @@ static ssize_t pf_show(struct device *dev,
return sprintf(buf, "0x%x\n", uci->cpu_sig.pf);
}
-static DEVICE_ATTR(version, 0444, version_show, NULL);
-static DEVICE_ATTR(processor_flags, 0444, pf_show, NULL);
+static DEVICE_ATTR_RO(version);
+static DEVICE_ATTR_RO(processor_flags);
static struct attribute *mc_default_attrs[] = {
&dev_attr_version.attr,
@@ -588,91 +761,6 @@ static void microcode_fini_cpu(int cpu)
microcode_ops->microcode_fini_cpu(cpu);
}
-static enum ucode_state microcode_resume_cpu(int cpu)
-{
- if (apply_microcode_on_target(cpu))
- return UCODE_ERROR;
-
- pr_debug("CPU%d updated upon resume\n", cpu);
-
- return UCODE_OK;
-}
-
-static enum ucode_state microcode_init_cpu(int cpu, bool refresh_fw)
-{
- enum ucode_state ustate;
- struct ucode_cpu_info *uci = ucode_cpu_info + cpu;
-
- if (uci->valid)
- return UCODE_OK;
-
- if (collect_cpu_info(cpu))
- return UCODE_ERROR;
-
- /* --dimm. Trigger a delayed update? */
- if (system_state != SYSTEM_RUNNING)
- return UCODE_NFOUND;
-
- ustate = microcode_ops->request_microcode_fw(cpu, &microcode_pdev->dev, refresh_fw);
- if (ustate == UCODE_NEW) {
- pr_debug("CPU%d updated upon init\n", cpu);
- apply_microcode_on_target(cpu);
- }
-
- return ustate;
-}
-
-static enum ucode_state microcode_update_cpu(int cpu)
-{
- struct ucode_cpu_info *uci = ucode_cpu_info + cpu;
-
- /* Refresh CPU microcode revision after resume. */
- collect_cpu_info(cpu);
-
- if (uci->valid)
- return microcode_resume_cpu(cpu);
-
- return microcode_init_cpu(cpu, false);
-}
-
-static int mc_device_add(struct device *dev, struct subsys_interface *sif)
-{
- int err, cpu = dev->id;
-
- if (!cpu_online(cpu))
- return 0;
-
- pr_debug("CPU%d added\n", cpu);
-
- err = sysfs_create_group(&dev->kobj, &mc_attr_group);
- if (err)
- return err;
-
- if (microcode_init_cpu(cpu, true) == UCODE_ERROR)
- return -EINVAL;
-
- return err;
-}
-
-static void mc_device_remove(struct device *dev, struct subsys_interface *sif)
-{
- int cpu = dev->id;
-
- if (!cpu_online(cpu))
- return;
-
- pr_debug("CPU%d removed\n", cpu);
- microcode_fini_cpu(cpu);
- sysfs_remove_group(&dev->kobj, &mc_attr_group);
-}
-
-static struct subsys_interface mc_cpu_interface = {
- .name = "microcode",
- .subsys = &cpu_subsys,
- .add_dev = mc_device_add,
- .remove_dev = mc_device_remove,
-};
-
/**
* microcode_bsp_resume - Update boot CPU microcode during resume.
*/
@@ -681,27 +769,28 @@ void microcode_bsp_resume(void)
int cpu = smp_processor_id();
struct ucode_cpu_info *uci = ucode_cpu_info + cpu;
- if (uci->valid && uci->mc)
+ if (uci->mc)
microcode_ops->apply_microcode(cpu);
- else if (!uci->mc)
- reload_early_microcode();
+ else
+ reload_early_microcode(cpu);
}
static struct syscore_ops mc_syscore_ops = {
- .resume = microcode_bsp_resume,
+ .resume = microcode_bsp_resume,
};
-static int mc_cpu_starting(unsigned int cpu)
-{
- microcode_update_cpu(cpu);
- pr_debug("CPU%d added\n", cpu);
- return 0;
-}
-
static int mc_cpu_online(unsigned int cpu)
{
+ struct ucode_cpu_info *uci = ucode_cpu_info + cpu;
struct device *dev = get_cpu_device(cpu);
+ memset(uci, 0, sizeof(*uci));
+
+ microcode_ops->collect_cpu_info(cpu, &uci->cpu_sig);
+ cpu_data(cpu).microcode = uci->cpu_sig.rev;
+ if (!cpu)
+ boot_cpu_data.microcode = uci->cpu_sig.rev;
+
if (sysfs_create_group(&dev->kobj, &mc_attr_group))
pr_err("Failed to create group for CPU%d\n", cpu);
return 0;
@@ -709,13 +798,10 @@ static int mc_cpu_online(unsigned int cpu)
static int mc_cpu_down_prep(unsigned int cpu)
{
- struct device *dev;
+ struct device *dev = get_cpu_device(cpu);
- dev = get_cpu_device(cpu);
- /* Suspend is in progress, only remove the interface */
+ microcode_fini_cpu(cpu);
sysfs_remove_group(&dev->kobj, &mc_attr_group);
- pr_debug("CPU%d removed\n", cpu);
-
return 0;
}
@@ -733,10 +819,11 @@ static const struct attribute_group cpu_root_microcode_group = {
static int __init microcode_init(void)
{
+ struct device *dev_root;
struct cpuinfo_x86 *c = &boot_cpu_data;
int error;
- if (dis_ucode_ldr)
+ if (microcode_loader_disabled())
return -EINVAL;
if (c->x86_vendor == X86_VENDOR_INTEL)
@@ -749,51 +836,34 @@ static int __init microcode_init(void)
if (!microcode_ops)
return -ENODEV;
- microcode_pdev = platform_device_register_simple("microcode", -1,
- NULL, 0);
- if (IS_ERR(microcode_pdev))
- return PTR_ERR(microcode_pdev);
+ pr_info_once("Current revision: 0x%08x\n", (early_data.new_rev ?: early_data.old_rev));
- cpus_read_lock();
- mutex_lock(&microcode_mutex);
- error = subsys_interface_register(&mc_cpu_interface);
- mutex_unlock(&microcode_mutex);
- cpus_read_unlock();
+ if (early_data.new_rev)
+ pr_info_once("Updated early from: 0x%08x\n", early_data.old_rev);
- if (error)
- goto out_pdev;
-
- error = sysfs_create_group(&cpu_subsys.dev_root->kobj,
- &cpu_root_microcode_group);
+ microcode_pdev = platform_device_register_simple("microcode", -1, NULL, 0);
+ if (IS_ERR(microcode_pdev))
+ return PTR_ERR(microcode_pdev);
- if (error) {
- pr_err("Error creating microcode group!\n");
- goto out_driver;
+ dev_root = bus_get_dev_root(&cpu_subsys);
+ if (dev_root) {
+ error = sysfs_create_group(&dev_root->kobj, &cpu_root_microcode_group);
+ put_device(dev_root);
+ if (error) {
+ pr_err("Error creating microcode group!\n");
+ goto out_pdev;
+ }
}
register_syscore_ops(&mc_syscore_ops);
- cpuhp_setup_state_nocalls(CPUHP_AP_MICROCODE_LOADER, "x86/microcode:starting",
- mc_cpu_starting, NULL);
- cpuhp_setup_state_nocalls(CPUHP_AP_ONLINE_DYN, "x86/microcode:online",
- mc_cpu_online, mc_cpu_down_prep);
-
- pr_info("Microcode Update Driver: v%s.", DRIVER_VERSION);
+ cpuhp_setup_state(CPUHP_AP_ONLINE_DYN, "x86/microcode:online",
+ mc_cpu_online, mc_cpu_down_prep);
return 0;
- out_driver:
- cpus_read_lock();
- mutex_lock(&microcode_mutex);
-
- subsys_interface_unregister(&mc_cpu_interface);
-
- mutex_unlock(&microcode_mutex);
- cpus_read_unlock();
-
out_pdev:
platform_device_unregister(microcode_pdev);
return error;
}
-fs_initcall(save_microcode_in_initrd);
late_initcall(microcode_init);
diff --git a/arch/x86/kernel/cpu/microcode/intel-ucode-defs.h b/arch/x86/kernel/cpu/microcode/intel-ucode-defs.h
new file mode 100644
index 000000000000..cb6e601701ab
--- /dev/null
+++ b/arch/x86/kernel/cpu/microcode/intel-ucode-defs.h
@@ -0,0 +1,150 @@
+{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x03, .steppings = 0x0004, .driver_data = 0x2 },
+{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x05, .steppings = 0x0001, .driver_data = 0x45 },
+{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x05, .steppings = 0x0002, .driver_data = 0x40 },
+{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x05, .steppings = 0x0004, .driver_data = 0x2c },
+{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x05, .steppings = 0x0008, .driver_data = 0x10 },
+{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x06, .steppings = 0x0001, .driver_data = 0xa },
+{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x06, .steppings = 0x0020, .driver_data = 0x3 },
+{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x06, .steppings = 0x0400, .driver_data = 0xd },
+{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x06, .steppings = 0x2000, .driver_data = 0x7 },
+{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x07, .steppings = 0x0002, .driver_data = 0x14 },
+{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x07, .steppings = 0x0004, .driver_data = 0x38 },
+{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x07, .steppings = 0x0008, .driver_data = 0x2e },
+{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x08, .steppings = 0x0002, .driver_data = 0x11 },
+{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x08, .steppings = 0x0008, .driver_data = 0x8 },
+{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x08, .steppings = 0x0040, .driver_data = 0xc },
+{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x08, .steppings = 0x0400, .driver_data = 0x5 },
+{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x09, .steppings = 0x0020, .driver_data = 0x47 },
+{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x0a, .steppings = 0x0001, .driver_data = 0x3 },
+{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x0a, .steppings = 0x0002, .driver_data = 0x1 },
+{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x0b, .steppings = 0x0002, .driver_data = 0x1d },
+{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x0b, .steppings = 0x0010, .driver_data = 0x2 },
+{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x0d, .steppings = 0x0040, .driver_data = 0x18 },
+{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x0e, .steppings = 0x0100, .driver_data = 0x39 },
+{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x0e, .steppings = 0x1000, .driver_data = 0x59 },
+{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x0f, .steppings = 0x0004, .driver_data = 0x5d },
+{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x0f, .steppings = 0x0040, .driver_data = 0xd2 },
+{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x0f, .steppings = 0x0080, .driver_data = 0x6b },
+{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x0f, .steppings = 0x0400, .driver_data = 0x95 },
+{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x0f, .steppings = 0x0800, .driver_data = 0xbc },
+{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x0f, .steppings = 0x2000, .driver_data = 0xa4 },
+{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x16, .steppings = 0x0002, .driver_data = 0x44 },
+{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x17, .steppings = 0x0040, .driver_data = 0x60f },
+{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x17, .steppings = 0x0080, .driver_data = 0x70a },
+{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x17, .steppings = 0x0400, .driver_data = 0xa0b },
+{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x1a, .steppings = 0x0010, .driver_data = 0x12 },
+{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x1a, .steppings = 0x0020, .driver_data = 0x1d },
+{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x1c, .steppings = 0x0004, .driver_data = 0x219 },
+{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x1c, .steppings = 0x0400, .driver_data = 0x107 },
+{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x1d, .steppings = 0x0002, .driver_data = 0x29 },
+{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x1e, .steppings = 0x0020, .driver_data = 0xa },
+{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x25, .steppings = 0x0004, .driver_data = 0x11 },
+{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x25, .steppings = 0x0020, .driver_data = 0x7 },
+{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x26, .steppings = 0x0002, .driver_data = 0x105 },
+{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x2a, .steppings = 0x0080, .driver_data = 0x2f },
+{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x2c, .steppings = 0x0004, .driver_data = 0x1f },
+{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x2d, .steppings = 0x0040, .driver_data = 0x621 },
+{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x2d, .steppings = 0x0080, .driver_data = 0x71a },
+{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x2e, .steppings = 0x0040, .driver_data = 0xd },
+{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x2f, .steppings = 0x0004, .driver_data = 0x3b },
+{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x37, .steppings = 0x0100, .driver_data = 0x838 },
+{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x37, .steppings = 0x0200, .driver_data = 0x90d },
+{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x3a, .steppings = 0x0200, .driver_data = 0x21 },
+{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x3c, .steppings = 0x0008, .driver_data = 0x28 },
+{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x3d, .steppings = 0x0010, .driver_data = 0x2f },
+{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x3e, .steppings = 0x0010, .driver_data = 0x42e },
+{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x3e, .steppings = 0x0040, .driver_data = 0x600 },
+{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x3e, .steppings = 0x0080, .driver_data = 0x715 },
+{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x3f, .steppings = 0x0004, .driver_data = 0x49 },
+{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x3f, .steppings = 0x0010, .driver_data = 0x1a },
+{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x45, .steppings = 0x0002, .driver_data = 0x26 },
+{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x46, .steppings = 0x0002, .driver_data = 0x1c },
+{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x47, .steppings = 0x0002, .driver_data = 0x22 },
+{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x4c, .steppings = 0x0008, .driver_data = 0x368 },
+{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x4c, .steppings = 0x0010, .driver_data = 0x411 },
+{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x4d, .steppings = 0x0100, .driver_data = 0x12d },
+{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x4e, .steppings = 0x0008, .driver_data = 0xf0 },
+{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x55, .steppings = 0x0008, .driver_data = 0x1000191 },
+{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x55, .steppings = 0x0010, .driver_data = 0x2007006 },
+{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x55, .steppings = 0x0020, .driver_data = 0x3000010 },
+{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x55, .steppings = 0x0040, .driver_data = 0x4003605 },
+{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x55, .steppings = 0x0080, .driver_data = 0x5003707 },
+{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x55, .steppings = 0x0800, .driver_data = 0x7002904 },
+{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x56, .steppings = 0x0004, .driver_data = 0x1c },
+{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x56, .steppings = 0x0008, .driver_data = 0x700001c },
+{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x56, .steppings = 0x0010, .driver_data = 0xf00001a },
+{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x56, .steppings = 0x0020, .driver_data = 0xe000015 },
+{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x5c, .steppings = 0x0004, .driver_data = 0x14 },
+{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x5c, .steppings = 0x0200, .driver_data = 0x48 },
+{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x5c, .steppings = 0x0400, .driver_data = 0x28 },
+{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x5e, .steppings = 0x0008, .driver_data = 0xf0 },
+{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x5f, .steppings = 0x0002, .driver_data = 0x3e },
+{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x66, .steppings = 0x0008, .driver_data = 0x2a },
+{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x6a, .steppings = 0x0020, .driver_data = 0xc0002f0 },
+{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x6a, .steppings = 0x0040, .driver_data = 0xd0003e7 },
+{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x6c, .steppings = 0x0002, .driver_data = 0x10002b0 },
+{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x7a, .steppings = 0x0002, .driver_data = 0x42 },
+{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x7a, .steppings = 0x0100, .driver_data = 0x24 },
+{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x7e, .steppings = 0x0020, .driver_data = 0xc6 },
+{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x8a, .steppings = 0x0002, .driver_data = 0x33 },
+{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x8c, .steppings = 0x0002, .driver_data = 0xb8 },
+{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x8c, .steppings = 0x0004, .driver_data = 0x38 },
+{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x8d, .steppings = 0x0002, .driver_data = 0x52 },
+{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x8e, .steppings = 0x0200, .driver_data = 0xf6 },
+{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x8e, .steppings = 0x0400, .driver_data = 0xf6 },
+{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x8e, .steppings = 0x0800, .driver_data = 0xf6 },
+{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x8e, .steppings = 0x1000, .driver_data = 0xfc },
+{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x8f, .steppings = 0x0100, .driver_data = 0x2c000390 },
+{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x8f, .steppings = 0x0080, .driver_data = 0x2b000603 },
+{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x8f, .steppings = 0x0040, .driver_data = 0x2c000390 },
+{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x8f, .steppings = 0x0020, .driver_data = 0x2c000390 },
+{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x8f, .steppings = 0x0010, .driver_data = 0x2c000390 },
+{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x96, .steppings = 0x0002, .driver_data = 0x1a },
+{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x97, .steppings = 0x0004, .driver_data = 0x37 },
+{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x97, .steppings = 0x0020, .driver_data = 0x37 },
+{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0xbf, .steppings = 0x0004, .driver_data = 0x37 },
+{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0xbf, .steppings = 0x0020, .driver_data = 0x37 },
+{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x9a, .steppings = 0x0008, .driver_data = 0x435 },
+{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x9a, .steppings = 0x0010, .driver_data = 0x435 },
+{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x9c, .steppings = 0x0001, .driver_data = 0x24000026 },
+{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x9e, .steppings = 0x0200, .driver_data = 0xf8 },
+{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x9e, .steppings = 0x0400, .driver_data = 0xf8 },
+{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x9e, .steppings = 0x0800, .driver_data = 0xf6 },
+{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x9e, .steppings = 0x1000, .driver_data = 0xf8 },
+{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x9e, .steppings = 0x2000, .driver_data = 0x100 },
+{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0xa5, .steppings = 0x0004, .driver_data = 0xfc },
+{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0xa5, .steppings = 0x0008, .driver_data = 0xfc },
+{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0xa5, .steppings = 0x0020, .driver_data = 0xfc },
+{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0xa6, .steppings = 0x0001, .driver_data = 0xfe },
+{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0xa6, .steppings = 0x0002, .driver_data = 0xfc },
+{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0xa7, .steppings = 0x0002, .driver_data = 0x62 },
+{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0xaa, .steppings = 0x0010, .driver_data = 0x20 },
+{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0xb7, .steppings = 0x0002, .driver_data = 0x12b },
+{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0xba, .steppings = 0x0004, .driver_data = 0x4123 },
+{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0xba, .steppings = 0x0008, .driver_data = 0x4123 },
+{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0xba, .steppings = 0x0100, .driver_data = 0x4123 },
+{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0xbe, .steppings = 0x0001, .driver_data = 0x1a },
+{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0xcf, .steppings = 0x0004, .driver_data = 0x21000283 },
+{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0xcf, .steppings = 0x0002, .driver_data = 0x21000283 },
+{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0xf, .model = 0x00, .steppings = 0x0080, .driver_data = 0x12 },
+{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0xf, .model = 0x00, .steppings = 0x0400, .driver_data = 0x15 },
+{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0xf, .model = 0x01, .steppings = 0x0004, .driver_data = 0x2e },
+{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0xf, .model = 0x02, .steppings = 0x0010, .driver_data = 0x21 },
+{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0xf, .model = 0x02, .steppings = 0x0020, .driver_data = 0x2c },
+{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0xf, .model = 0x02, .steppings = 0x0040, .driver_data = 0x10 },
+{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0xf, .model = 0x02, .steppings = 0x0080, .driver_data = 0x39 },
+{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0xf, .model = 0x02, .steppings = 0x0200, .driver_data = 0x2f },
+{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0xf, .model = 0x03, .steppings = 0x0004, .driver_data = 0xa },
+{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0xf, .model = 0x03, .steppings = 0x0008, .driver_data = 0xc },
+{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0xf, .model = 0x03, .steppings = 0x0010, .driver_data = 0x17 },
+{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0xf, .model = 0x04, .steppings = 0x0002, .driver_data = 0x17 },
+{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0xf, .model = 0x04, .steppings = 0x0008, .driver_data = 0x5 },
+{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0xf, .model = 0x04, .steppings = 0x0010, .driver_data = 0x6 },
+{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0xf, .model = 0x04, .steppings = 0x0080, .driver_data = 0x3 },
+{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0xf, .model = 0x04, .steppings = 0x0100, .driver_data = 0xe },
+{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0xf, .model = 0x04, .steppings = 0x0200, .driver_data = 0x3 },
+{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0xf, .model = 0x04, .steppings = 0x0400, .driver_data = 0x4 },
+{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0xf, .model = 0x06, .steppings = 0x0004, .driver_data = 0xf },
+{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0xf, .model = 0x06, .steppings = 0x0010, .driver_data = 0x4 },
+{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0xf, .model = 0x06, .steppings = 0x0020, .driver_data = 0x8 },
+{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0xf, .model = 0x06, .steppings = 0x0100, .driver_data = 0x9 },
diff --git a/arch/x86/kernel/cpu/microcode/intel.c b/arch/x86/kernel/cpu/microcode/intel.c
index 025c8f0cd948..371ca6eac00e 100644
--- a/arch/x86/kernel/cpu/microcode/intel.c
+++ b/arch/x86/kernel/cpu/microcode/intel.c
@@ -10,19 +10,10 @@
* Copyright (C) 2012 Fenghua Yu <fenghua.yu@intel.com>
* H Peter Anvin" <hpa@zytor.com>
*/
-
-/*
- * This needs to be before all headers so that pr_debug in printk.h doesn't turn
- * printk calls into no_printk().
- *
- *#define DEBUG
- */
#define pr_fmt(fmt) "microcode: " fmt
-
#include <linux/earlycpio.h>
#include <linux/firmware.h>
#include <linux/uaccess.h>
-#include <linux/vmalloc.h>
#include <linux/initrd.h>
#include <linux/kernel.h>
#include <linux/slab.h>
@@ -30,140 +21,120 @@
#include <linux/uio.h>
#include <linux/mm.h>
-#include <asm/microcode_intel.h>
-#include <asm/intel-family.h>
+#include <asm/cpu_device_id.h>
#include <asm/processor.h>
#include <asm/tlbflush.h>
#include <asm/setup.h>
#include <asm/msr.h>
+#include "internal.h"
+
static const char ucode_path[] = "kernel/x86/microcode/GenuineIntel.bin";
+#define UCODE_BSP_LOADED ((struct microcode_intel *)0x1UL)
+
/* Current microcode patch used in early patching on the APs. */
-static struct microcode_intel *intel_ucode_patch;
+static struct microcode_intel *ucode_patch_va __read_mostly;
+static struct microcode_intel *ucode_patch_late __read_mostly;
/* last level cache size per core */
-static int llc_size_per_core;
+static unsigned int llc_size_per_core __ro_after_init;
-/*
- * Returns 1 if update has been found, 0 otherwise.
- */
-static int find_matching_signature(void *mc, unsigned int csig, int cpf)
-{
- struct microcode_header_intel *mc_hdr = mc;
- struct extended_sigtable *ext_hdr;
- struct extended_signature *ext_sig;
- int i;
-
- if (intel_cpu_signatures_match(csig, cpf, mc_hdr->sig, mc_hdr->pf))
- return 1;
+/* microcode format is extended from prescott processors */
+struct extended_signature {
+ unsigned int sig;
+ unsigned int pf;
+ unsigned int cksum;
+};
- /* Look for ext. headers: */
- if (get_totalsize(mc_hdr) <= get_datasize(mc_hdr) + MC_HEADER_SIZE)
- return 0;
+struct extended_sigtable {
+ unsigned int count;
+ unsigned int cksum;
+ unsigned int reserved[3];
+ struct extended_signature sigs[];
+};
- ext_hdr = mc + get_datasize(mc_hdr) + MC_HEADER_SIZE;
- ext_sig = (void *)ext_hdr + EXT_HEADER_SIZE;
+#define DEFAULT_UCODE_TOTALSIZE (DEFAULT_UCODE_DATASIZE + MC_HEADER_SIZE)
+#define EXT_HEADER_SIZE (sizeof(struct extended_sigtable))
+#define EXT_SIGNATURE_SIZE (sizeof(struct extended_signature))
- for (i = 0; i < ext_hdr->count; i++) {
- if (intel_cpu_signatures_match(csig, cpf, ext_sig->sig, ext_sig->pf))
- return 1;
- ext_sig++;
- }
- return 0;
+static inline unsigned int get_totalsize(struct microcode_header_intel *hdr)
+{
+ return hdr->datasize ? hdr->totalsize : DEFAULT_UCODE_TOTALSIZE;
}
-/*
- * Returns 1 if update has been found, 0 otherwise.
- */
-static int has_newer_microcode(void *mc, unsigned int csig, int cpf, int new_rev)
+static inline unsigned int exttable_size(struct extended_sigtable *et)
{
- struct microcode_header_intel *mc_hdr = mc;
-
- if (mc_hdr->rev <= new_rev)
- return 0;
-
- return find_matching_signature(mc, csig, cpf);
+ return et->count * EXT_SIGNATURE_SIZE + EXT_HEADER_SIZE;
}
-static struct ucode_patch *memdup_patch(void *data, unsigned int size)
+void intel_collect_cpu_info(struct cpu_signature *sig)
{
- struct ucode_patch *p;
+ sig->sig = cpuid_eax(1);
+ sig->pf = 0;
+ sig->rev = intel_get_microcode_revision();
- p = kzalloc(sizeof(struct ucode_patch), GFP_KERNEL);
- if (!p)
- return NULL;
+ if (IFM(x86_family(sig->sig), x86_model(sig->sig)) >= INTEL_PENTIUM_III_DESCHUTES) {
+ unsigned int val[2];
- p->data = kmemdup(data, size, GFP_KERNEL);
- if (!p->data) {
- kfree(p);
- return NULL;
+ /* get processor flags from MSR 0x17 */
+ native_rdmsr(MSR_IA32_PLATFORM_ID, val[0], val[1]);
+ sig->pf = 1 << ((val[1] >> 18) & 7);
}
-
- return p;
}
+EXPORT_SYMBOL_GPL(intel_collect_cpu_info);
-static void save_microcode_patch(struct ucode_cpu_info *uci, void *data, unsigned int size)
+static inline bool cpu_signatures_match(struct cpu_signature *s1, unsigned int sig2,
+ unsigned int pf2)
{
- struct microcode_header_intel *mc_hdr, *mc_saved_hdr;
- struct ucode_patch *iter, *tmp, *p = NULL;
- bool prev_found = false;
- unsigned int sig, pf;
+ if (s1->sig != sig2)
+ return false;
- mc_hdr = (struct microcode_header_intel *)data;
+ /* Processor flags are either both 0 or they intersect. */
+ return ((!s1->pf && !pf2) || (s1->pf & pf2));
+}
- list_for_each_entry_safe(iter, tmp, &microcode_cache, plist) {
- mc_saved_hdr = (struct microcode_header_intel *)iter->data;
- sig = mc_saved_hdr->sig;
- pf = mc_saved_hdr->pf;
+bool intel_find_matching_signature(void *mc, struct cpu_signature *sig)
+{
+ struct microcode_header_intel *mc_hdr = mc;
+ struct extended_signature *ext_sig;
+ struct extended_sigtable *ext_hdr;
+ int i;
- if (find_matching_signature(data, sig, pf)) {
- prev_found = true;
+ if (cpu_signatures_match(sig, mc_hdr->sig, mc_hdr->pf))
+ return true;
- if (mc_hdr->rev <= mc_saved_hdr->rev)
- continue;
+ /* Look for ext. headers: */
+ if (get_totalsize(mc_hdr) <= intel_microcode_get_datasize(mc_hdr) + MC_HEADER_SIZE)
+ return false;
- p = memdup_patch(data, size);
- if (!p)
- pr_err("Error allocating buffer %p\n", data);
- else {
- list_replace(&iter->plist, &p->plist);
- kfree(iter->data);
- kfree(iter);
- }
- }
- }
+ ext_hdr = mc + intel_microcode_get_datasize(mc_hdr) + MC_HEADER_SIZE;
+ ext_sig = (void *)ext_hdr + EXT_HEADER_SIZE;
- /*
- * There weren't any previous patches found in the list cache; save the
- * newly found.
- */
- if (!prev_found) {
- p = memdup_patch(data, size);
- if (!p)
- pr_err("Error allocating buffer for %p\n", data);
- else
- list_add_tail(&p->plist, &microcode_cache);
+ for (i = 0; i < ext_hdr->count; i++) {
+ if (cpu_signatures_match(sig, ext_sig->sig, ext_sig->pf))
+ return true;
+ ext_sig++;
}
-
- if (!p)
- return;
-
- if (!find_matching_signature(p->data, uci->cpu_sig.sig, uci->cpu_sig.pf))
- return;
-
- /*
- * Save for early loading. On 32-bit, that needs to be a physical
- * address as the APs are running from physical addresses, before
- * paging has been enabled.
- */
- if (IS_ENABLED(CONFIG_X86_32))
- intel_ucode_patch = (struct microcode_intel *)__pa_nodebug(p->data);
- else
- intel_ucode_patch = p->data;
+ return 0;
}
-
-static int microcode_sanity_check(void *mc, int print_err)
+EXPORT_SYMBOL_GPL(intel_find_matching_signature);
+
+/**
+ * intel_microcode_sanity_check() - Sanity check microcode file.
+ * @mc: Pointer to the microcode file contents.
+ * @print_err: Display failure reason if true, silent if false.
+ * @hdr_type: Type of file, i.e. normal microcode file or In Field Scan file.
+ * Validate if the microcode header type matches with the type
+ * specified here.
+ *
+ * Validate certain header fields and verify if computed checksum matches
+ * with the one specified in the header.
+ *
+ * Return: 0 if the file passes all the checks, -EINVAL if any of the checks
+ * fail.
+ */
+int intel_microcode_sanity_check(void *mc, bool print_err, int hdr_type)
{
unsigned long total_size, data_size, ext_table_size;
struct microcode_header_intel *mc_header = mc;
@@ -172,7 +143,7 @@ static int microcode_sanity_check(void *mc, int print_err)
struct extended_signature *ext_sig;
total_size = get_totalsize(mc_header);
- data_size = get_datasize(mc_header);
+ data_size = intel_microcode_get_datasize(mc_header);
if (data_size + MC_HEADER_SIZE > total_size) {
if (print_err)
@@ -180,9 +151,10 @@ static int microcode_sanity_check(void *mc, int print_err)
return -EINVAL;
}
- if (mc_header->ldrver != 1 || mc_header->hdrver != 1) {
+ if (mc_header->ldrver != 1 || mc_header->hdrver != hdr_type) {
if (print_err)
- pr_err("Error: invalid/unknown microcode update format.\n");
+ pr_err("Error: invalid/unknown microcode update format. Header type %d\n",
+ mc_header->hdrver);
return -EINVAL;
}
@@ -191,8 +163,8 @@ static int microcode_sanity_check(void *mc, int print_err)
u32 ext_table_sum = 0;
u32 *ext_tablep;
- if ((ext_table_size < EXT_HEADER_SIZE)
- || ((ext_table_size - EXT_HEADER_SIZE) % EXT_SIGNATURE_SIZE)) {
+ if (ext_table_size < EXT_HEADER_SIZE ||
+ ((ext_table_size - EXT_HEADER_SIZE) % EXT_SIGNATURE_SIZE)) {
if (print_err)
pr_err("Error: truncated extended signature table.\n");
return -EINVAL;
@@ -260,155 +232,113 @@ static int microcode_sanity_check(void *mc, int print_err)
}
return 0;
}
+EXPORT_SYMBOL_GPL(intel_microcode_sanity_check);
-/*
- * Get microcode matching with BSP's model. Only CPUs with the same model as
- * BSP can stay in the platform.
- */
-static struct microcode_intel *
-scan_microcode(void *data, size_t size, struct ucode_cpu_info *uci, bool save)
+static void update_ucode_pointer(struct microcode_intel *mc)
+{
+ kvfree(ucode_patch_va);
+
+ /*
+ * Save the virtual address for early loading and for eventual free
+ * on late loading.
+ */
+ ucode_patch_va = mc;
+}
+
+static void save_microcode_patch(struct microcode_intel *patch)
+{
+ unsigned int size = get_totalsize(&patch->hdr);
+ struct microcode_intel *mc;
+
+ mc = kvmemdup(patch, size, GFP_KERNEL);
+ if (mc)
+ update_ucode_pointer(mc);
+ else
+ pr_err("Unable to allocate microcode memory size: %u\n", size);
+}
+
+/* Scan blob for microcode matching the boot CPUs family, model, stepping */
+static __init struct microcode_intel *scan_microcode(void *data, size_t size,
+ struct ucode_cpu_info *uci,
+ bool save)
{
struct microcode_header_intel *mc_header;
struct microcode_intel *patch = NULL;
+ u32 cur_rev = uci->cpu_sig.rev;
unsigned int mc_size;
- while (size) {
- if (size < sizeof(struct microcode_header_intel))
- break;
-
+ for (; size >= sizeof(struct microcode_header_intel); size -= mc_size, data += mc_size) {
mc_header = (struct microcode_header_intel *)data;
mc_size = get_totalsize(mc_header);
- if (!mc_size ||
- mc_size > size ||
- microcode_sanity_check(data, 0) < 0)
+ if (!mc_size || mc_size > size ||
+ intel_microcode_sanity_check(data, false, MC_HEADER_TYPE_MICROCODE) < 0)
break;
- size -= mc_size;
-
- if (!find_matching_signature(data, uci->cpu_sig.sig,
- uci->cpu_sig.pf)) {
- data += mc_size;
+ if (!intel_find_matching_signature(data, &uci->cpu_sig))
continue;
- }
+ /*
+ * For saving the early microcode, find the matching revision which
+ * was loaded on the BSP.
+ *
+ * On the BSP during early boot, find a newer revision than
+ * actually loaded in the CPU.
+ */
if (save) {
- save_microcode_patch(uci, data, mc_size);
- goto next;
- }
-
-
- if (!patch) {
- if (!has_newer_microcode(data,
- uci->cpu_sig.sig,
- uci->cpu_sig.pf,
- uci->cpu_sig.rev))
- goto next;
-
- } else {
- struct microcode_header_intel *phdr = &patch->hdr;
-
- if (!has_newer_microcode(data,
- phdr->sig,
- phdr->pf,
- phdr->rev))
- goto next;
+ if (cur_rev != mc_header->rev)
+ continue;
+ } else if (cur_rev >= mc_header->rev) {
+ continue;
}
- /* We have a newer patch, save it. */
patch = data;
-
-next:
- data += mc_size;
+ cur_rev = mc_header->rev;
}
- if (size)
- return NULL;
-
- return patch;
+ return size ? NULL : patch;
}
-static void show_saved_mc(void)
+static enum ucode_state __apply_microcode(struct ucode_cpu_info *uci,
+ struct microcode_intel *mc,
+ u32 *cur_rev)
{
-#ifdef DEBUG
- int i = 0, j;
- unsigned int sig, pf, rev, total_size, data_size, date;
- struct ucode_cpu_info uci;
- struct ucode_patch *p;
-
- if (list_empty(&microcode_cache)) {
- pr_debug("no microcode data saved.\n");
- return;
- }
-
- intel_cpu_collect_info(&uci);
-
- sig = uci.cpu_sig.sig;
- pf = uci.cpu_sig.pf;
- rev = uci.cpu_sig.rev;
- pr_debug("CPU: sig=0x%x, pf=0x%x, rev=0x%x\n", sig, pf, rev);
-
- list_for_each_entry(p, &microcode_cache, plist) {
- struct microcode_header_intel *mc_saved_header;
- struct extended_sigtable *ext_header;
- struct extended_signature *ext_sig;
- int ext_sigcount;
-
- mc_saved_header = (struct microcode_header_intel *)p->data;
-
- sig = mc_saved_header->sig;
- pf = mc_saved_header->pf;
- rev = mc_saved_header->rev;
- date = mc_saved_header->date;
-
- total_size = get_totalsize(mc_saved_header);
- data_size = get_datasize(mc_saved_header);
-
- pr_debug("mc_saved[%d]: sig=0x%x, pf=0x%x, rev=0x%x, total size=0x%x, date = %04x-%02x-%02x\n",
- i++, sig, pf, rev, total_size,
- date & 0xffff,
- date >> 24,
- (date >> 16) & 0xff);
+ u32 rev;
- /* Look for ext. headers: */
- if (total_size <= data_size + MC_HEADER_SIZE)
- continue;
+ if (!mc)
+ return UCODE_NFOUND;
- ext_header = (void *)mc_saved_header + data_size + MC_HEADER_SIZE;
- ext_sigcount = ext_header->count;
- ext_sig = (void *)ext_header + EXT_HEADER_SIZE;
+ /*
+ * Save us the MSR write below - which is a particular expensive
+ * operation - when the other hyperthread has updated the microcode
+ * already.
+ */
+ *cur_rev = intel_get_microcode_revision();
+ if (*cur_rev >= mc->hdr.rev) {
+ uci->cpu_sig.rev = *cur_rev;
+ return UCODE_OK;
+ }
- for (j = 0; j < ext_sigcount; j++) {
- sig = ext_sig->sig;
- pf = ext_sig->pf;
+ /* write microcode via MSR 0x79 */
+ native_wrmsrq(MSR_IA32_UCODE_WRITE, (unsigned long)mc->bits);
- pr_debug("\tExtended[%d]: sig=0x%x, pf=0x%x\n",
- j, sig, pf);
+ rev = intel_get_microcode_revision();
+ if (rev != mc->hdr.rev)
+ return UCODE_ERROR;
- ext_sig++;
- }
- }
-#endif
+ uci->cpu_sig.rev = rev;
+ return UCODE_UPDATED;
}
-/*
- * Save this microcode patch. It will be loaded early when a CPU is
- * hot-added or resumes.
- */
-static void save_mc_for_early(struct ucode_cpu_info *uci, u8 *mc, unsigned int size)
+static enum ucode_state apply_microcode_early(struct ucode_cpu_info *uci)
{
- /* Synchronization during CPU hotplug. */
- static DEFINE_MUTEX(x86_cpu_microcode_mutex);
-
- mutex_lock(&x86_cpu_microcode_mutex);
+ struct microcode_intel *mc = uci->mc;
+ u32 cur_rev;
- save_microcode_patch(uci, mc, size);
- show_saved_mc();
-
- mutex_unlock(&x86_cpu_microcode_mutex);
+ return __apply_microcode(uci, mc, &cur_rev);
}
-static bool load_builtin_intel_microcode(struct cpio_data *cp)
+static __init bool load_builtin_intel_microcode(struct cpio_data *cp)
{
unsigned int eax = 1, ebx, ecx = 0, edx;
struct firmware fw;
@@ -420,372 +350,147 @@ static bool load_builtin_intel_microcode(struct cpio_data *cp)
native_cpuid(&eax, &ebx, &ecx, &edx);
sprintf(name, "intel-ucode/%02x-%02x-%02x",
- x86_family(eax), x86_model(eax), x86_stepping(eax));
+ x86_family(eax), x86_model(eax), x86_stepping(eax));
if (firmware_request_builtin(&fw, name)) {
cp->size = fw.size;
cp->data = (void *)fw.data;
return true;
}
-
return false;
}
-/*
- * Print ucode update info.
- */
-static void
-print_ucode_info(struct ucode_cpu_info *uci, unsigned int date)
-{
- pr_info_once("microcode updated early to revision 0x%x, date = %04x-%02x-%02x\n",
- uci->cpu_sig.rev,
- date & 0xffff,
- date >> 24,
- (date >> 16) & 0xff);
-}
-
-#ifdef CONFIG_X86_32
-
-static int delay_ucode_info;
-static int current_mc_date;
-
-/*
- * Print early updated ucode info after printk works. This is delayed info dump.
- */
-void show_ucode_info_early(void)
-{
- struct ucode_cpu_info uci;
-
- if (delay_ucode_info) {
- intel_cpu_collect_info(&uci);
- print_ucode_info(&uci, current_mc_date);
- delay_ucode_info = 0;
- }
-}
-
-/*
- * At this point, we can not call printk() yet. Delay printing microcode info in
- * show_ucode_info_early() until printk() works.
- */
-static void print_ucode(struct ucode_cpu_info *uci)
-{
- struct microcode_intel *mc;
- int *delay_ucode_info_p;
- int *current_mc_date_p;
-
- mc = uci->mc;
- if (!mc)
- return;
-
- delay_ucode_info_p = (int *)__pa_nodebug(&delay_ucode_info);
- current_mc_date_p = (int *)__pa_nodebug(&current_mc_date);
-
- *delay_ucode_info_p = 1;
- *current_mc_date_p = mc->hdr.date;
-}
-#else
-
-static inline void print_ucode(struct ucode_cpu_info *uci)
-{
- struct microcode_intel *mc;
-
- mc = uci->mc;
- if (!mc)
- return;
-
- print_ucode_info(uci, mc->hdr.date);
-}
-#endif
-
-static int apply_microcode_early(struct ucode_cpu_info *uci, bool early)
-{
- struct microcode_intel *mc;
- u32 rev;
-
- mc = uci->mc;
- if (!mc)
- return 0;
-
- /*
- * Save us the MSR write below - which is a particular expensive
- * operation - when the other hyperthread has updated the microcode
- * already.
- */
- rev = intel_get_microcode_revision();
- if (rev >= mc->hdr.rev) {
- uci->cpu_sig.rev = rev;
- return UCODE_OK;
- }
-
- /*
- * Writeback and invalidate caches before updating microcode to avoid
- * internal issues depending on what the microcode is updating.
- */
- native_wbinvd();
-
- /* write microcode via MSR 0x79 */
- native_wrmsrl(MSR_IA32_UCODE_WRITE, (unsigned long)mc->bits);
-
- rev = intel_get_microcode_revision();
- if (rev != mc->hdr.rev)
- return -1;
-
- uci->cpu_sig.rev = rev;
-
- if (early)
- print_ucode(uci);
- else
- print_ucode_info(uci, mc->hdr.date);
-
- return 0;
-}
-
-int __init save_microcode_in_initrd_intel(void)
+static __init struct microcode_intel *get_microcode_blob(struct ucode_cpu_info *uci, bool save)
{
- struct ucode_cpu_info uci;
struct cpio_data cp;
- /*
- * initrd is going away, clear patch ptr. We will scan the microcode one
- * last time before jettisoning and save a patch, if found. Then we will
- * update that pointer too, with a stable patch address to use when
- * resuming the cores.
- */
- intel_ucode_patch = NULL;
+ intel_collect_cpu_info(&uci->cpu_sig);
if (!load_builtin_intel_microcode(&cp))
- cp = find_microcode_in_initrd(ucode_path, false);
+ cp = find_microcode_in_initrd(ucode_path);
if (!(cp.data && cp.size))
- return 0;
-
- intel_cpu_collect_info(&uci);
-
- scan_microcode(cp.data, cp.size, &uci, true);
-
- show_saved_mc();
+ return NULL;
- return 0;
+ return scan_microcode(cp.data, cp.size, uci, save);
}
/*
- * @res_patch, output: a pointer to the patch we found.
+ * Invoked from an early init call to save the microcode blob which was
+ * selected during early boot when mm was not usable. The microcode must be
+ * saved because initrd is going away. It's an early init call so the APs
+ * just can use the pointer and do not have to scan initrd/builtin firmware
+ * again.
*/
-static struct microcode_intel *__load_ucode_intel(struct ucode_cpu_info *uci)
-{
- static const char *path;
- struct cpio_data cp;
- bool use_pa;
-
- if (IS_ENABLED(CONFIG_X86_32)) {
- path = (const char *)__pa_nodebug(ucode_path);
- use_pa = true;
- } else {
- path = ucode_path;
- use_pa = false;
- }
-
- /* try built-in microcode first */
- if (!load_builtin_intel_microcode(&cp))
- cp = find_microcode_in_initrd(path, use_pa);
-
- if (!(cp.data && cp.size))
- return NULL;
-
- intel_cpu_collect_info(uci);
-
- return scan_microcode(cp.data, cp.size, uci, false);
-}
-
-void __init load_ucode_intel_bsp(void)
+static int __init save_builtin_microcode(void)
{
- struct microcode_intel *patch;
struct ucode_cpu_info uci;
- patch = __load_ucode_intel(&uci);
- if (!patch)
- return;
+ if (xchg(&ucode_patch_va, NULL) != UCODE_BSP_LOADED)
+ return 0;
- uci.mc = patch;
+ if (microcode_loader_disabled() || boot_cpu_data.x86_vendor != X86_VENDOR_INTEL)
+ return 0;
- apply_microcode_early(&uci, true);
+ uci.mc = get_microcode_blob(&uci, true);
+ if (uci.mc)
+ save_microcode_patch(uci.mc);
+ return 0;
}
+early_initcall(save_builtin_microcode);
-void load_ucode_intel_ap(void)
+/* Load microcode on BSP from initrd or builtin blobs */
+void __init load_ucode_intel_bsp(struct early_load_data *ed)
{
- struct microcode_intel *patch, **iup;
struct ucode_cpu_info uci;
- if (IS_ENABLED(CONFIG_X86_32))
- iup = (struct microcode_intel **) __pa_nodebug(&intel_ucode_patch);
- else
- iup = &intel_ucode_patch;
-
-reget:
- if (!*iup) {
- patch = __load_ucode_intel(&uci);
- if (!patch)
- return;
+ uci.mc = get_microcode_blob(&uci, false);
+ ed->old_rev = uci.cpu_sig.rev;
- *iup = patch;
- }
-
- uci.mc = *iup;
-
- if (apply_microcode_early(&uci, true)) {
- /* Mixed-silicon system? Try to refetch the proper patch: */
- *iup = NULL;
-
- goto reget;
+ if (uci.mc && apply_microcode_early(&uci) == UCODE_UPDATED) {
+ ucode_patch_va = UCODE_BSP_LOADED;
+ ed->new_rev = uci.cpu_sig.rev;
}
}
-static struct microcode_intel *find_patch(struct ucode_cpu_info *uci)
+void load_ucode_intel_ap(void)
{
- struct microcode_header_intel *phdr;
- struct ucode_patch *iter, *tmp;
-
- list_for_each_entry_safe(iter, tmp, &microcode_cache, plist) {
-
- phdr = (struct microcode_header_intel *)iter->data;
-
- if (phdr->rev <= uci->cpu_sig.rev)
- continue;
-
- if (!find_matching_signature(phdr,
- uci->cpu_sig.sig,
- uci->cpu_sig.pf))
- continue;
+ struct ucode_cpu_info uci;
- return iter->data;
- }
- return NULL;
+ uci.mc = ucode_patch_va;
+ if (uci.mc)
+ apply_microcode_early(&uci);
}
+/* Reload microcode on resume */
void reload_ucode_intel(void)
{
- struct microcode_intel *p;
- struct ucode_cpu_info uci;
-
- intel_cpu_collect_info(&uci);
-
- p = find_patch(&uci);
- if (!p)
- return;
-
- uci.mc = p;
+ struct ucode_cpu_info uci = { .mc = ucode_patch_va, };
- apply_microcode_early(&uci, false);
+ if (uci.mc)
+ apply_microcode_early(&uci);
}
static int collect_cpu_info(int cpu_num, struct cpu_signature *csig)
{
- static struct cpu_signature prev;
- struct cpuinfo_x86 *c = &cpu_data(cpu_num);
- unsigned int val[2];
-
- memset(csig, 0, sizeof(*csig));
-
- csig->sig = cpuid_eax(0x00000001);
-
- if ((c->x86_model >= 5) || (c->x86 > 6)) {
- /* get processor flags from MSR 0x17 */
- rdmsr(MSR_IA32_PLATFORM_ID, val[0], val[1]);
- csig->pf = 1 << ((val[1] >> 18) & 7);
- }
-
- csig->rev = c->microcode;
-
- /* No extra locking on prev, races are harmless. */
- if (csig->sig != prev.sig || csig->pf != prev.pf || csig->rev != prev.rev) {
- pr_info("sig=0x%x, pf=0x%x, revision=0x%x\n",
- csig->sig, csig->pf, csig->rev);
- prev = *csig;
- }
-
+ intel_collect_cpu_info(csig);
return 0;
}
-static enum ucode_state apply_microcode_intel(int cpu)
+static enum ucode_state apply_microcode_late(int cpu)
{
struct ucode_cpu_info *uci = ucode_cpu_info + cpu;
- struct cpuinfo_x86 *c = &cpu_data(cpu);
- bool bsp = c->cpu_index == boot_cpu_data.cpu_index;
- struct microcode_intel *mc;
+ struct microcode_intel *mc = ucode_patch_late;
enum ucode_state ret;
- static int prev_rev;
- u32 rev;
+ u32 cur_rev;
- /* We should bind the task to the CPU */
- if (WARN_ON(raw_smp_processor_id() != cpu))
+ if (WARN_ON_ONCE(smp_processor_id() != cpu))
return UCODE_ERROR;
- /* Look for a newer patch in our cache: */
- mc = find_patch(uci);
- if (!mc) {
- mc = uci->mc;
- if (!mc)
- return UCODE_NFOUND;
- }
+ ret = __apply_microcode(uci, mc, &cur_rev);
+ if (ret != UCODE_UPDATED && ret != UCODE_OK)
+ return ret;
+
+ cpu_data(cpu).microcode = uci->cpu_sig.rev;
+ if (!cpu)
+ boot_cpu_data.microcode = uci->cpu_sig.rev;
+
+ return ret;
+}
+
+static bool ucode_validate_minrev(struct microcode_header_intel *mc_header)
+{
+ int cur_rev = boot_cpu_data.microcode;
/*
- * Save us the MSR write below - which is a particular expensive
- * operation - when the other hyperthread has updated the microcode
- * already.
+ * When late-loading, ensure the header declares a minimum revision
+ * required to perform a late-load. The previously reserved field
+ * is 0 in older microcode blobs.
*/
- rev = intel_get_microcode_revision();
- if (rev >= mc->hdr.rev) {
- ret = UCODE_OK;
- goto out;
+ if (!mc_header->min_req_ver) {
+ pr_info("Unsafe microcode update: Microcode header does not specify a required min version\n");
+ return false;
}
/*
- * Writeback and invalidate caches before updating microcode to avoid
- * internal issues depending on what the microcode is updating.
+ * Check whether the current revision is either greater or equal to
+ * to the minimum revision specified in the header.
*/
- native_wbinvd();
-
- /* write microcode via MSR 0x79 */
- wrmsrl(MSR_IA32_UCODE_WRITE, (unsigned long)mc->bits);
-
- rev = intel_get_microcode_revision();
-
- if (rev != mc->hdr.rev) {
- pr_err("CPU%d update to revision 0x%x failed\n",
- cpu, mc->hdr.rev);
- return UCODE_ERROR;
- }
-
- if (bsp && rev != prev_rev) {
- pr_info("updated to revision 0x%x, date = %04x-%02x-%02x\n",
- rev,
- mc->hdr.date & 0xffff,
- mc->hdr.date >> 24,
- (mc->hdr.date >> 16) & 0xff);
- prev_rev = rev;
+ if (cur_rev < mc_header->min_req_ver) {
+ pr_info("Unsafe microcode update: Current revision 0x%x too old\n", cur_rev);
+ pr_info("Current should be at 0x%x or higher. Use early loading instead\n", mc_header->min_req_ver);
+ return false;
}
-
- ret = UCODE_UPDATED;
-
-out:
- uci->cpu_sig.rev = rev;
- c->microcode = rev;
-
- /* Update boot_cpu_data's revision too, if we're on the BSP: */
- if (bsp)
- boot_cpu_data.microcode = rev;
-
- return ret;
+ return true;
}
-static enum ucode_state generic_load_microcode(int cpu, struct iov_iter *iter)
+static enum ucode_state parse_microcode_blobs(int cpu, struct iov_iter *iter)
{
struct ucode_cpu_info *uci = ucode_cpu_info + cpu;
- unsigned int curr_mc_size = 0, new_mc_size = 0;
- enum ucode_state ret = UCODE_OK;
- int new_rev = uci->cpu_sig.rev;
+ bool is_safe, new_is_safe = false;
+ int cur_rev = uci->cpu_sig.rev;
+ unsigned int curr_mc_size = 0;
u8 *new_mc = NULL, *mc = NULL;
- unsigned int csig, cpf;
while (iov_iter_count(iter)) {
struct microcode_header_intel mc_header;
@@ -794,72 +499,66 @@ static enum ucode_state generic_load_microcode(int cpu, struct iov_iter *iter)
if (!copy_from_iter_full(&mc_header, sizeof(mc_header), iter)) {
pr_err("error! Truncated or inaccessible header in microcode data file\n");
- break;
+ goto fail;
}
mc_size = get_totalsize(&mc_header);
if (mc_size < sizeof(mc_header)) {
pr_err("error! Bad data in microcode data file (totalsize too small)\n");
- break;
+ goto fail;
}
data_size = mc_size - sizeof(mc_header);
if (data_size > iov_iter_count(iter)) {
pr_err("error! Bad data in microcode data file (truncated file?)\n");
- break;
+ goto fail;
}
/* For performance reasons, reuse mc area when possible */
if (!mc || mc_size > curr_mc_size) {
- vfree(mc);
- mc = vmalloc(mc_size);
+ kvfree(mc);
+ mc = kvmalloc(mc_size, GFP_KERNEL);
if (!mc)
- break;
+ goto fail;
curr_mc_size = mc_size;
}
memcpy(mc, &mc_header, sizeof(mc_header));
data = mc + sizeof(mc_header);
if (!copy_from_iter_full(data, data_size, iter) ||
- microcode_sanity_check(mc, 1) < 0) {
- break;
- }
+ intel_microcode_sanity_check(mc, true, MC_HEADER_TYPE_MICROCODE) < 0)
+ goto fail;
- csig = uci->cpu_sig.sig;
- cpf = uci->cpu_sig.pf;
- if (has_newer_microcode(mc, csig, cpf, new_rev)) {
- vfree(new_mc);
- new_rev = mc_header.rev;
- new_mc = mc;
- new_mc_size = mc_size;
- mc = NULL; /* trigger new vmalloc */
- ret = UCODE_NEW;
- }
- }
+ if (cur_rev >= mc_header.rev)
+ continue;
- vfree(mc);
+ if (!intel_find_matching_signature(mc, &uci->cpu_sig))
+ continue;
- if (iov_iter_count(iter)) {
- vfree(new_mc);
- return UCODE_ERROR;
+ is_safe = ucode_validate_minrev(&mc_header);
+ if (force_minrev && !is_safe)
+ continue;
+
+ kvfree(new_mc);
+ cur_rev = mc_header.rev;
+ new_mc = mc;
+ new_is_safe = is_safe;
+ mc = NULL;
}
+ if (iov_iter_count(iter))
+ goto fail;
+
+ kvfree(mc);
if (!new_mc)
return UCODE_NFOUND;
- vfree(uci->mc);
- uci->mc = (struct microcode_intel *)new_mc;
-
- /*
- * If early loading microcode is supported, save this mc into
- * permanent memory. So it will be loaded early when a CPU is hot added
- * or resumes.
- */
- save_mc_for_early(uci, new_mc, new_mc_size);
-
- pr_debug("CPU%d found a matching microcode update with version 0x%x (current=0x%x)\n",
- cpu, new_rev, uci->cpu_sig.rev);
+ ucode_patch_late = (struct microcode_intel *)new_mc;
+ return new_is_safe ? UCODE_NEW_SAFE : UCODE_NEW;
- return ret;
+fail:
+ kvfree(mc);
+ kvfree(new_mc);
+ return UCODE_ERROR;
}
static bool is_blacklisted(unsigned int cpu)
@@ -869,15 +568,14 @@ static bool is_blacklisted(unsigned int cpu)
/*
* Late loading on model 79 with microcode revision less than 0x0b000021
* and LLC size per core bigger than 2.5MB may result in a system hang.
- * This behavior is documented in item BDF90, #334165 (Intel Xeon
+ * This behavior is documented in item BDX90, #334165 (Intel Xeon
* Processor E7-8800/4800 v4 Product Family).
*/
- if (c->x86 == 6 &&
- c->x86_model == INTEL_FAM6_BROADWELL_X &&
+ if (c->x86_vfm == INTEL_BROADWELL_X &&
c->x86_stepping == 0x01 &&
llc_size_per_core > 2621440 &&
c->microcode < 0x0b000021) {
- pr_err_once("Erratum BDF90: late loading with revision < 0x0b000021 (0x%x) disabled.\n", c->microcode);
+ pr_err_once("Erratum BDX90: late loading with revision < 0x0b000021 (0x%x) disabled.\n", c->microcode);
pr_err_once("Please consider either early loading through initrd/built-in or a potential BIOS update.\n");
return true;
}
@@ -885,8 +583,7 @@ static bool is_blacklisted(unsigned int cpu)
return false;
}
-static enum ucode_state request_microcode_fw(int cpu, struct device *device,
- bool refresh_fw)
+static enum ucode_state request_microcode_fw(int cpu, struct device *device)
{
struct cpuinfo_x86 *c = &cpu_data(cpu);
const struct firmware *firmware;
@@ -908,44 +605,37 @@ static enum ucode_state request_microcode_fw(int cpu, struct device *device,
kvec.iov_base = (void *)firmware->data;
kvec.iov_len = firmware->size;
- iov_iter_kvec(&iter, WRITE, &kvec, 1, firmware->size);
- ret = generic_load_microcode(cpu, &iter);
+ iov_iter_kvec(&iter, ITER_SOURCE, &kvec, 1, firmware->size);
+ ret = parse_microcode_blobs(cpu, &iter);
release_firmware(firmware);
return ret;
}
-static enum ucode_state
-request_microcode_user(int cpu, const void __user *buf, size_t size)
+static void finalize_late_load(int result)
{
- struct iov_iter iter;
- struct iovec iov;
-
- if (is_blacklisted(cpu))
- return UCODE_NFOUND;
-
- iov.iov_base = (void __user *)buf;
- iov.iov_len = size;
- iov_iter_init(&iter, WRITE, &iov, 1, size);
-
- return generic_load_microcode(cpu, &iter);
+ if (!result)
+ update_ucode_pointer(ucode_patch_late);
+ else
+ kvfree(ucode_patch_late);
+ ucode_patch_late = NULL;
}
static struct microcode_ops microcode_intel_ops = {
- .request_microcode_user = request_microcode_user,
- .request_microcode_fw = request_microcode_fw,
- .collect_cpu_info = collect_cpu_info,
- .apply_microcode = apply_microcode_intel,
+ .request_microcode_fw = request_microcode_fw,
+ .collect_cpu_info = collect_cpu_info,
+ .apply_microcode = apply_microcode_late,
+ .finalize_late_load = finalize_late_load,
+ .use_nmi = IS_ENABLED(CONFIG_X86_64),
};
-static int __init calc_llc_size_per_core(struct cpuinfo_x86 *c)
+static __init void calc_llc_size_per_core(struct cpuinfo_x86 *c)
{
u64 llc_size = c->x86_cache_size * 1024ULL;
- do_div(llc_size, c->x86_max_cores);
-
- return (int)llc_size;
+ do_div(llc_size, topology_num_cores_per_package());
+ llc_size_per_core = (unsigned int)llc_size;
}
struct microcode_ops * __init init_intel_microcode(void)
@@ -958,7 +648,7 @@ struct microcode_ops * __init init_intel_microcode(void)
return NULL;
}
- llc_size_per_core = calc_llc_size_per_core(c);
+ calc_llc_size_per_core(c);
return &microcode_intel_ops;
}
diff --git a/arch/x86/kernel/cpu/microcode/internal.h b/arch/x86/kernel/cpu/microcode/internal.h
new file mode 100644
index 000000000000..50a9702ae4e2
--- /dev/null
+++ b/arch/x86/kernel/cpu/microcode/internal.h
@@ -0,0 +1,125 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _X86_MICROCODE_INTERNAL_H
+#define _X86_MICROCODE_INTERNAL_H
+
+#include <linux/earlycpio.h>
+#include <linux/initrd.h>
+
+#include <asm/cpu.h>
+#include <asm/microcode.h>
+
+struct device;
+
+enum ucode_state {
+ UCODE_OK = 0,
+ UCODE_NEW,
+ UCODE_NEW_SAFE,
+ UCODE_UPDATED,
+ UCODE_NFOUND,
+ UCODE_ERROR,
+ UCODE_TIMEOUT,
+ UCODE_OFFLINE,
+};
+
+struct microcode_ops {
+ enum ucode_state (*request_microcode_fw)(int cpu, struct device *dev);
+ void (*microcode_fini_cpu)(int cpu);
+
+ /*
+ * The generic 'microcode_core' part guarantees that the callbacks
+ * below run on a target CPU when they are being called.
+ * See also the "Synchronization" section in microcode_core.c.
+ */
+ enum ucode_state (*apply_microcode)(int cpu);
+ int (*collect_cpu_info)(int cpu, struct cpu_signature *csig);
+ void (*finalize_late_load)(int result);
+ unsigned int nmi_safe : 1,
+ use_nmi : 1;
+};
+
+struct early_load_data {
+ u32 old_rev;
+ u32 new_rev;
+};
+
+extern struct early_load_data early_data;
+extern struct ucode_cpu_info ucode_cpu_info[];
+struct cpio_data find_microcode_in_initrd(const char *path);
+
+#define MAX_UCODE_COUNT 128
+
+#define QCHAR(a, b, c, d) ((a) + ((b) << 8) + ((c) << 16) + ((d) << 24))
+#define CPUID_INTEL1 QCHAR('G', 'e', 'n', 'u')
+#define CPUID_INTEL2 QCHAR('i', 'n', 'e', 'I')
+#define CPUID_INTEL3 QCHAR('n', 't', 'e', 'l')
+#define CPUID_AMD1 QCHAR('A', 'u', 't', 'h')
+#define CPUID_AMD2 QCHAR('e', 'n', 't', 'i')
+#define CPUID_AMD3 QCHAR('c', 'A', 'M', 'D')
+
+#define CPUID_IS(a, b, c, ebx, ecx, edx) \
+ (!(((ebx) ^ (a)) | ((edx) ^ (b)) | ((ecx) ^ (c))))
+
+/*
+ * In early loading microcode phase on BSP, boot_cpu_data is not set up yet.
+ * x86_cpuid_vendor() gets vendor id for BSP.
+ *
+ * In 32 bit AP case, accessing boot_cpu_data needs linear address. To simplify
+ * coding, we still use x86_cpuid_vendor() to get vendor id for AP.
+ *
+ * x86_cpuid_vendor() gets vendor information directly from CPUID.
+ */
+static inline int x86_cpuid_vendor(void)
+{
+ u32 eax = 0x00000000;
+ u32 ebx, ecx = 0, edx;
+
+ native_cpuid(&eax, &ebx, &ecx, &edx);
+
+ if (CPUID_IS(CPUID_INTEL1, CPUID_INTEL2, CPUID_INTEL3, ebx, ecx, edx))
+ return X86_VENDOR_INTEL;
+
+ if (CPUID_IS(CPUID_AMD1, CPUID_AMD2, CPUID_AMD3, ebx, ecx, edx))
+ return X86_VENDOR_AMD;
+
+ return X86_VENDOR_UNKNOWN;
+}
+
+static inline unsigned int x86_cpuid_family(void)
+{
+ u32 eax = 0x00000001;
+ u32 ebx, ecx = 0, edx;
+
+ native_cpuid(&eax, &ebx, &ecx, &edx);
+
+ return x86_family(eax);
+}
+
+extern bool force_minrev;
+
+#ifdef CONFIG_CPU_SUP_AMD
+void load_ucode_amd_bsp(struct early_load_data *ed, unsigned int family);
+void load_ucode_amd_ap(unsigned int family);
+void reload_ucode_amd(unsigned int cpu);
+struct microcode_ops *init_amd_microcode(void);
+void exit_amd_microcode(void);
+#else /* CONFIG_CPU_SUP_AMD */
+static inline void load_ucode_amd_bsp(struct early_load_data *ed, unsigned int family) { }
+static inline void load_ucode_amd_ap(unsigned int family) { }
+static inline void reload_ucode_amd(unsigned int cpu) { }
+static inline struct microcode_ops *init_amd_microcode(void) { return NULL; }
+static inline void exit_amd_microcode(void) { }
+#endif /* !CONFIG_CPU_SUP_AMD */
+
+#ifdef CONFIG_CPU_SUP_INTEL
+void load_ucode_intel_bsp(struct early_load_data *ed);
+void load_ucode_intel_ap(void);
+void reload_ucode_intel(void);
+struct microcode_ops *init_intel_microcode(void);
+#else /* CONFIG_CPU_SUP_INTEL */
+static inline void load_ucode_intel_bsp(struct early_load_data *ed) { }
+static inline void load_ucode_intel_ap(void) { }
+static inline void reload_ucode_intel(void) { }
+static inline struct microcode_ops *init_intel_microcode(void) { return NULL; }
+#endif /* !CONFIG_CPU_SUP_INTEL */
+
+#endif /* _X86_MICROCODE_INTERNAL_H */
diff --git a/arch/x86/kernel/cpu/mkcapflags.sh b/arch/x86/kernel/cpu/mkcapflags.sh
index 1db560ed2ca3..68f537347466 100644
--- a/arch/x86/kernel/cpu/mkcapflags.sh
+++ b/arch/x86/kernel/cpu/mkcapflags.sh
@@ -30,8 +30,7 @@ dump_array()
# If the /* comment */ starts with a quote string, grab that.
VALUE="$(echo "$i" | sed -n 's@.*/\* *\("[^"]*"\).*\*/@\1@p')"
- [ -z "$VALUE" ] && VALUE="\"$NAME\""
- [ "$VALUE" = '""' ] && continue
+ [ ! "$VALUE" ] && continue
# Name is uppercase, VALUE is all lowercase
VALUE="$(echo "$VALUE" | tr A-Z a-z)"
diff --git a/arch/x86/kernel/cpu/mshyperv.c b/arch/x86/kernel/cpu/mshyperv.c
index 831613959a92..c78f860419d6 100644
--- a/arch/x86/kernel/cpu/mshyperv.c
+++ b/arch/x86/kernel/cpu/mshyperv.c
@@ -16,12 +16,10 @@
#include <linux/interrupt.h>
#include <linux/irq.h>
#include <linux/kexec.h>
-#include <linux/i8253.h>
#include <linux/random.h>
-#include <linux/swiotlb.h>
#include <asm/processor.h>
#include <asm/hypervisor.h>
-#include <asm/hyperv-tlfs.h>
+#include <hyperv/hvhdk.h>
#include <asm/mshyperv.h>
#include <asm/desc.h>
#include <asm/idtentry.h>
@@ -32,14 +30,85 @@
#include <asm/reboot.h>
#include <asm/nmi.h>
#include <clocksource/hyperv_timer.h>
+#include <asm/msr.h>
#include <asm/numa.h>
-#include <asm/coco.h>
+#include <asm/svm.h>
-/* Is Linux running as the root partition? */
-bool hv_root_partition;
+/* Is Linux running on nested Microsoft Hypervisor */
+bool hv_nested;
struct ms_hyperv_info ms_hyperv;
+/* Used in modules via hv_do_hypercall(): see arch/x86/include/asm/mshyperv.h */
+bool hyperv_paravisor_present __ro_after_init;
+EXPORT_SYMBOL_GPL(hyperv_paravisor_present);
+
#if IS_ENABLED(CONFIG_HYPERV)
+static inline unsigned int hv_get_nested_msr(unsigned int reg)
+{
+ if (hv_is_sint_msr(reg))
+ return reg - HV_X64_MSR_SINT0 + HV_X64_MSR_NESTED_SINT0;
+
+ switch (reg) {
+ case HV_X64_MSR_SIMP:
+ return HV_X64_MSR_NESTED_SIMP;
+ case HV_X64_MSR_SIEFP:
+ return HV_X64_MSR_NESTED_SIEFP;
+ case HV_X64_MSR_SVERSION:
+ return HV_X64_MSR_NESTED_SVERSION;
+ case HV_X64_MSR_SCONTROL:
+ return HV_X64_MSR_NESTED_SCONTROL;
+ case HV_X64_MSR_EOM:
+ return HV_X64_MSR_NESTED_EOM;
+ default:
+ return reg;
+ }
+}
+
+u64 hv_get_non_nested_msr(unsigned int reg)
+{
+ u64 value;
+
+ if (hv_is_synic_msr(reg) && ms_hyperv.paravisor_present)
+ hv_ivm_msr_read(reg, &value);
+ else
+ rdmsrq(reg, value);
+ return value;
+}
+EXPORT_SYMBOL_GPL(hv_get_non_nested_msr);
+
+void hv_set_non_nested_msr(unsigned int reg, u64 value)
+{
+ if (hv_is_synic_msr(reg) && ms_hyperv.paravisor_present) {
+ hv_ivm_msr_write(reg, value);
+
+ /* Write proxy bit via wrmsl instruction */
+ if (hv_is_sint_msr(reg))
+ wrmsrq(reg, value | 1 << 20);
+ } else {
+ wrmsrq(reg, value);
+ }
+}
+EXPORT_SYMBOL_GPL(hv_set_non_nested_msr);
+
+u64 hv_get_msr(unsigned int reg)
+{
+ if (hv_nested)
+ reg = hv_get_nested_msr(reg);
+
+ return hv_get_non_nested_msr(reg);
+}
+EXPORT_SYMBOL_GPL(hv_get_msr);
+
+void hv_set_msr(unsigned int reg, u64 value)
+{
+ if (hv_nested)
+ reg = hv_get_nested_msr(reg);
+
+ hv_set_non_nested_msr(reg, value);
+}
+EXPORT_SYMBOL_GPL(hv_set_msr);
+
+static void (*mshv_handler)(void);
static void (*vmbus_handler)(void);
static void (*hv_stimer0_handler)(void);
static void (*hv_kexec_handler)(void);
@@ -50,15 +119,23 @@ DEFINE_IDTENTRY_SYSVEC(sysvec_hyperv_callback)
struct pt_regs *old_regs = set_irq_regs(regs);
inc_irq_stat(irq_hv_callback_count);
+ if (mshv_handler)
+ mshv_handler();
+
if (vmbus_handler)
vmbus_handler();
if (ms_hyperv.hints & HV_DEPRECATING_AEOI_RECOMMENDED)
- ack_APIC_irq();
+ apic_eoi();
set_irq_regs(old_regs);
}
+void hv_setup_mshv_handler(void (*handler)(void))
+{
+ mshv_handler = handler;
+}
+
void hv_setup_vmbus_handler(void (*handler)(void))
{
vmbus_handler = handler;
@@ -82,7 +159,7 @@ DEFINE_IDTENTRY_SYSVEC(sysvec_hyperv_stimer0)
if (hv_stimer0_handler)
hv_stimer0_handler();
add_interrupt_randomness(HYPERV_STIMER0_VECTOR);
- ack_APIC_irq();
+ apic_eoi();
set_irq_regs(old_regs);
}
@@ -129,8 +206,8 @@ static void hv_machine_shutdown(void)
* Call hv_cpu_die() on all the CPUs, otherwise later the hypervisor
* corrupts the old VP Assist Pages and can crash the kexec kernel.
*/
- if (kexec_in_progress && hyperv_init_cpuhp > 0)
- cpuhp_remove_state(hyperv_init_cpuhp);
+ if (kexec_in_progress)
+ cpuhp_remove_state(CPUHP_AP_HYPERV_ONLINE);
/* The function calls stop_other_cpus(). */
native_machine_shutdown();
@@ -139,7 +216,9 @@ static void hv_machine_shutdown(void)
if (kexec_in_progress)
hyperv_cleanup();
}
+#endif /* CONFIG_KEXEC_CORE */
+#ifdef CONFIG_CRASH_DUMP
static void hv_machine_crash_shutdown(struct pt_regs *regs)
{
if (hv_crash_handler)
@@ -151,7 +230,64 @@ static void hv_machine_crash_shutdown(struct pt_regs *regs)
/* Disable the hypercall page when there is only 1 active CPU. */
hyperv_cleanup();
}
-#endif /* CONFIG_KEXEC_CORE */
+#endif /* CONFIG_CRASH_DUMP */
+
+static u64 hv_ref_counter_at_suspend;
+static void (*old_save_sched_clock_state)(void);
+static void (*old_restore_sched_clock_state)(void);
+
+/*
+ * Hyper-V clock counter resets during hibernation. Save and restore clock
+ * offset during suspend/resume, while also considering the time passed
+ * before suspend. This is to make sure that sched_clock using hv tsc page
+ * based clocksource, proceeds from where it left off during suspend and
+ * it shows correct time for the timestamps of kernel messages after resume.
+ */
+static void save_hv_clock_tsc_state(void)
+{
+ hv_ref_counter_at_suspend = hv_read_reference_counter();
+}
+
+static void restore_hv_clock_tsc_state(void)
+{
+ /*
+ * Adjust the offsets used by hv tsc clocksource to
+ * account for the time spent before hibernation.
+ * adjusted value = reference counter (time) at suspend
+ * - reference counter (time) now.
+ */
+ hv_adj_sched_clock_offset(hv_ref_counter_at_suspend - hv_read_reference_counter());
+}
+
+/*
+ * Functions to override save_sched_clock_state and restore_sched_clock_state
+ * functions of x86_platform. The Hyper-V clock counter is reset during
+ * suspend-resume and the offset used to measure time needs to be
+ * corrected, post resume.
+ */
+static void hv_save_sched_clock_state(void)
+{
+ old_save_sched_clock_state();
+ save_hv_clock_tsc_state();
+}
+
+static void hv_restore_sched_clock_state(void)
+{
+ restore_hv_clock_tsc_state();
+ old_restore_sched_clock_state();
+}
+
+static void __init x86_setup_ops_for_tsc_pg_clock(void)
+{
+ if (!(ms_hyperv.features & HV_MSR_REFERENCE_TSC_AVAILABLE))
+ return;
+
+ old_save_sched_clock_state = x86_platform.save_sched_clock_state;
+ x86_platform.save_sched_clock_state = hv_save_sched_clock_state;
+
+ old_restore_sched_clock_state = x86_platform.restore_sched_clock_state;
+ x86_platform.restore_sched_clock_state = hv_restore_sched_clock_state;
+}
#endif /* CONFIG_HYPERV */
static uint32_t __init ms_hyperv_platform(void)
@@ -183,11 +319,6 @@ static uint32_t __init ms_hyperv_platform(void)
return HYPERV_CPUID_VENDOR_AND_MAX_FUNCTIONS;
}
-static unsigned char hv_get_nmi_reason(void)
-{
- return 0;
-}
-
#ifdef CONFIG_X86_LOCAL_APIC
/*
* Prior to WS2016 Debug-VM sends NMIs to all CPUs which makes
@@ -197,11 +328,14 @@ static unsigned char hv_get_nmi_reason(void)
static int hv_nmi_unknown(unsigned int val, struct pt_regs *regs)
{
static atomic_t nmi_cpu = ATOMIC_INIT(-1);
+ unsigned int old_cpu, this_cpu;
if (!unknown_nmi_panic)
return NMI_DONE;
- if (atomic_cmpxchg(&nmi_cpu, -1, raw_smp_processor_id()) != -1)
+ old_cpu = -1;
+ this_cpu = raw_smp_processor_id();
+ if (!atomic_try_cmpxchg(&nmi_cpu, &old_cpu, this_cpu))
return NMI_HANDLED;
return NMI_DONE;
@@ -212,7 +346,7 @@ static unsigned long hv_get_tsc_khz(void)
{
unsigned long freq;
- rdmsrl(HV_X64_MSR_TSC_FREQUENCY, freq);
+ rdmsrq(HV_X64_MSR_TSC_FREQUENCY, freq);
return freq / 1000;
}
@@ -235,6 +369,15 @@ static void __init hv_smp_prepare_cpus(unsigned int max_cpus)
native_smp_prepare_cpus(max_cpus);
+ /*
+ * Override wakeup_secondary_cpu_64 callback for SEV-SNP
+ * enlightened guest.
+ */
+ if (!ms_hyperv.paravisor_present && hv_isolation_type_snp()) {
+ apic->wakeup_secondary_cpu_64 = hv_snp_boot_ap;
+ return;
+ }
+
#ifdef CONFIG_X86_64
for_each_present_cpu(i) {
if (i == 0)
@@ -253,13 +396,45 @@ static void __init hv_smp_prepare_cpus(unsigned int max_cpus)
}
#endif
+/*
+ * When a fully enlightened TDX VM runs on Hyper-V, the firmware sets the
+ * HW_REDUCED flag: refer to acpi_tb_create_local_fadt(). Consequently ttyS0
+ * interrupts can't work because request_irq() -> ... -> irq_to_desc() returns
+ * NULL for ttyS0. This happens because mp_config_acpi_legacy_irqs() sees a
+ * nr_legacy_irqs() of 0, so it doesn't initialize the array 'mp_irqs[]', and
+ * later setup_IO_APIC_irqs() -> find_irq_entry() fails to find the legacy irqs
+ * from the array and hence doesn't create the necessary irq description info.
+ *
+ * Clone arch/x86/kernel/acpi/boot.c: acpi_generic_reduced_hw_init() here,
+ * except don't change 'legacy_pic', which keeps its default value
+ * 'default_legacy_pic'. This way, mp_config_acpi_legacy_irqs() sees a non-zero
+ * nr_legacy_irqs() and eventually serial console interrupts works properly.
+ */
+static void __init reduced_hw_init(void)
+{
+ x86_init.timers.timer_init = x86_init_noop;
+ x86_init.irqs.pre_vector_init = x86_init_noop;
+}
+
+int hv_get_hypervisor_version(union hv_hypervisor_version_info *info)
+{
+ unsigned int hv_max_functions;
+
+ hv_max_functions = cpuid_eax(HYPERV_CPUID_VENDOR_AND_MAX_FUNCTIONS);
+ if (hv_max_functions < HYPERV_CPUID_VERSION) {
+ pr_err("%s: Could not detect Hyper-V version\n", __func__);
+ return -ENODEV;
+ }
+
+ cpuid(HYPERV_CPUID_VERSION, &info->eax, &info->ebx, &info->ecx, &info->edx);
+
+ return 0;
+}
+EXPORT_SYMBOL_GPL(hv_get_hypervisor_version);
+
static void __init ms_hyperv_init_platform(void)
{
int hv_max_functions_eax;
- int hv_host_info_eax;
- int hv_host_info_ebx;
- int hv_host_info_ecx;
- int hv_host_info_edx;
#ifdef CONFIG_PARAVIRT
pv_info.name = "Hyper-V";
@@ -270,13 +445,15 @@ static void __init ms_hyperv_init_platform(void)
*/
ms_hyperv.features = cpuid_eax(HYPERV_CPUID_FEATURES);
ms_hyperv.priv_high = cpuid_ebx(HYPERV_CPUID_FEATURES);
+ ms_hyperv.ext_features = cpuid_ecx(HYPERV_CPUID_FEATURES);
ms_hyperv.misc_features = cpuid_edx(HYPERV_CPUID_FEATURES);
ms_hyperv.hints = cpuid_eax(HYPERV_CPUID_ENLIGHTMENT_INFO);
hv_max_functions_eax = cpuid_eax(HYPERV_CPUID_VENDOR_AND_MAX_FUNCTIONS);
- pr_info("Hyper-V: privilege flags low 0x%x, high 0x%x, hints 0x%x, misc 0x%x\n",
- ms_hyperv.features, ms_hyperv.priv_high, ms_hyperv.hints,
+ pr_info("Hyper-V: privilege flags low %#x, high %#x, ext %#x, hints %#x, misc %#x\n",
+ ms_hyperv.features, ms_hyperv.priv_high,
+ ms_hyperv.ext_features, ms_hyperv.hints,
ms_hyperv.misc_features);
ms_hyperv.max_vp_index = cpuid_eax(HYPERV_CPUID_IMPLEMENT_LIMITS);
@@ -285,62 +462,68 @@ static void __init ms_hyperv_init_platform(void)
pr_debug("Hyper-V: max %u virtual processors, %u logical processors\n",
ms_hyperv.max_vp_index, ms_hyperv.max_lp_index);
- /*
- * Check CPU management privilege.
- *
- * To mirror what Windows does we should extract CPU management
- * features and use the ReservedIdentityBit to detect if Linux is the
- * root partition. But that requires negotiating CPU management
- * interface (a process to be finalized).
- *
- * For now, use the privilege flag as the indicator for running as
- * root.
- */
- if (cpuid_ebx(HYPERV_CPUID_FEATURES) & HV_CPU_MANAGEMENT) {
- hv_root_partition = true;
- pr_info("Hyper-V: running as root partition\n");
- }
+ hv_identify_partition_type();
- /*
- * Extract host information.
- */
- if (hv_max_functions_eax >= HYPERV_CPUID_VERSION) {
- hv_host_info_eax = cpuid_eax(HYPERV_CPUID_VERSION);
- hv_host_info_ebx = cpuid_ebx(HYPERV_CPUID_VERSION);
- hv_host_info_ecx = cpuid_ecx(HYPERV_CPUID_VERSION);
- hv_host_info_edx = cpuid_edx(HYPERV_CPUID_VERSION);
-
- pr_info("Hyper-V: Host Build %d.%d.%d.%d-%d-%d\n",
- hv_host_info_ebx >> 16, hv_host_info_ebx & 0xFFFF,
- hv_host_info_eax, hv_host_info_edx & 0xFFFFFF,
- hv_host_info_ecx, hv_host_info_edx >> 24);
+ if (ms_hyperv.hints & HV_X64_HYPERV_NESTED) {
+ hv_nested = true;
+ pr_info("Hyper-V: running on a nested hypervisor\n");
}
if (ms_hyperv.features & HV_ACCESS_FREQUENCY_MSRS &&
ms_hyperv.misc_features & HV_FEATURE_FREQUENCY_MSRS_AVAILABLE) {
x86_platform.calibrate_tsc = hv_get_tsc_khz;
x86_platform.calibrate_cpu = hv_get_tsc_khz;
+ setup_force_cpu_cap(X86_FEATURE_TSC_KNOWN_FREQ);
}
if (ms_hyperv.priv_high & HV_ISOLATION) {
ms_hyperv.isolation_config_a = cpuid_eax(HYPERV_CPUID_ISOLATION_CONFIG);
ms_hyperv.isolation_config_b = cpuid_ebx(HYPERV_CPUID_ISOLATION_CONFIG);
- ms_hyperv.shared_gpa_boundary =
- BIT_ULL(ms_hyperv.shared_gpa_boundary_bits);
+
+ if (ms_hyperv.shared_gpa_boundary_active)
+ ms_hyperv.shared_gpa_boundary =
+ BIT_ULL(ms_hyperv.shared_gpa_boundary_bits);
+
+ hyperv_paravisor_present = !!ms_hyperv.paravisor_present;
pr_info("Hyper-V: Isolation Config: Group A 0x%x, Group B 0x%x\n",
ms_hyperv.isolation_config_a, ms_hyperv.isolation_config_b);
+
if (hv_get_isolation_type() == HV_ISOLATION_TYPE_SNP) {
static_branch_enable(&isolation_type_snp);
-#ifdef CONFIG_SWIOTLB
- swiotlb_unencrypted_base = ms_hyperv.shared_gpa_boundary;
-#endif
- }
- /* Isolation VMs are unenlightened SEV-based VMs, thus this check: */
- if (IS_ENABLED(CONFIG_AMD_MEM_ENCRYPT)) {
- if (hv_get_isolation_type() != HV_ISOLATION_TYPE_NONE)
- cc_set_vendor(CC_VENDOR_HYPERV);
+ } else if (hv_get_isolation_type() == HV_ISOLATION_TYPE_TDX) {
+ static_branch_enable(&isolation_type_tdx);
+
+ /* A TDX VM must use x2APIC and doesn't use lazy EOI. */
+ ms_hyperv.hints &= ~HV_X64_APIC_ACCESS_RECOMMENDED;
+
+ if (!ms_hyperv.paravisor_present) {
+ /*
+ * Mark the Hyper-V TSC page feature as disabled
+ * in a TDX VM without paravisor so that the
+ * Invariant TSC, which is a better clocksource
+ * anyway, is used instead.
+ */
+ ms_hyperv.features &= ~HV_MSR_REFERENCE_TSC_AVAILABLE;
+
+ /*
+ * The Invariant TSC is expected to be available
+ * in a TDX VM without paravisor, but if not,
+ * print a warning message. The slower Hyper-V MSR-based
+ * Ref Counter should end up being the clocksource.
+ */
+ if (!(ms_hyperv.features & HV_ACCESS_TSC_INVARIANT))
+ pr_warn("Hyper-V: Invariant TSC is unavailable\n");
+
+ /* HV_MSR_CRASH_CTL is unsupported. */
+ ms_hyperv.misc_features &= ~HV_FEATURE_GUEST_CRASH_MSR_AVAILABLE;
+
+ /* Don't trust Hyper-V's TLB-flushing hypercalls. */
+ ms_hyperv.hints &= ~HV_X64_REMOTE_TLB_FLUSH_RECOMMENDED;
+
+ x86_init.acpi.reduced_hw_early_init = reduced_hw_init;
+ }
}
}
@@ -359,7 +542,7 @@ static void __init ms_hyperv_init_platform(void)
*/
u64 hv_lapic_frequency;
- rdmsrl(HV_X64_MSR_APIC_FREQUENCY, hv_lapic_frequency);
+ rdmsrq(HV_X64_MSR_APIC_FREQUENCY, hv_lapic_frequency);
hv_lapic_frequency = div_u64(hv_lapic_frequency, HZ);
lapic_timer_period = hv_lapic_frequency;
pr_info("Hyper-V: LAPIC Timer Frequency: %#x\n",
@@ -374,10 +557,14 @@ static void __init ms_hyperv_init_platform(void)
no_timer_check = 1;
#endif
-#if IS_ENABLED(CONFIG_HYPERV) && defined(CONFIG_KEXEC_CORE)
+#if IS_ENABLED(CONFIG_HYPERV)
+#if defined(CONFIG_KEXEC_CORE)
machine_ops.shutdown = hv_machine_shutdown;
+#endif
+#if defined(CONFIG_CRASH_DUMP)
machine_ops.crash_shutdown = hv_machine_crash_shutdown;
#endif
+#endif
if (ms_hyperv.features & HV_ACCESS_TSC_INVARIANT) {
/*
* Writing to synthetic MSR 0x40000118 updates/changes the
@@ -388,7 +575,7 @@ static void __init ms_hyperv_init_platform(void)
* setting of this MSR bit should happen before init_intel()
* is called.
*/
- wrmsrl(HV_X64_MSR_TSC_INVARIANT_CONTROL, 0x1);
+ wrmsrq(HV_X64_MSR_TSC_INVARIANT_CONTROL, HV_EXPOSE_INVARIANT_TSC);
setup_force_cpu_cap(X86_FEATURE_TSC_RELIABLE);
}
@@ -399,40 +586,33 @@ static void __init ms_hyperv_init_platform(void)
if (efi_enabled(EFI_BOOT))
x86_platform.get_nmi_reason = hv_get_nmi_reason;
- /*
- * Hyper-V VMs have a PIT emulation quirk such that zeroing the
- * counter register during PIT shutdown restarts the PIT. So it
- * continues to interrupt @18.2 HZ. Setting i8253_clear_counter
- * to false tells pit_shutdown() not to zero the counter so that
- * the PIT really is shutdown. Generation 2 VMs don't have a PIT,
- * and setting this value has no effect.
- */
- i8253_clear_counter_on_shutdown = false;
-
#if IS_ENABLED(CONFIG_HYPERV)
+ if ((hv_get_isolation_type() == HV_ISOLATION_TYPE_VBS) ||
+ ms_hyperv.paravisor_present)
+ hv_vtom_init();
/*
* Setup the hook to get control post apic initialization.
*/
x86_platform.apic_post_init = hyperv_init;
hyperv_setup_mmu_ops();
- /* Setup the IDT for hypervisor callback */
- alloc_intr_gate(HYPERVISOR_CALLBACK_VECTOR, asm_sysvec_hyperv_callback);
- /* Setup the IDT for reenlightenment notifications */
+ /* Install system interrupt handler for hypervisor callback */
+ sysvec_install(HYPERVISOR_CALLBACK_VECTOR, sysvec_hyperv_callback);
+
+ /* Install system interrupt handler for reenlightenment notifications */
if (ms_hyperv.features & HV_ACCESS_REENLIGHTENMENT) {
- alloc_intr_gate(HYPERV_REENLIGHTENMENT_VECTOR,
- asm_sysvec_hyperv_reenlightenment);
+ sysvec_install(HYPERV_REENLIGHTENMENT_VECTOR, sysvec_hyperv_reenlightenment);
}
- /* Setup the IDT for stimer0 */
+ /* Install system interrupt handler for stimer0 */
if (ms_hyperv.misc_features & HV_STIMER_DIRECT_MODE_AVAILABLE) {
- alloc_intr_gate(HYPERV_STIMER0_VECTOR,
- asm_sysvec_hyperv_stimer0);
+ sysvec_install(HYPERV_STIMER0_VECTOR, sysvec_hyperv_stimer0);
}
# ifdef CONFIG_SMP
smp_ops.smp_prepare_boot_cpu = hv_smp_prepare_boot_cpu;
- if (hv_root_partition)
+ if (hv_root_partition() ||
+ (!ms_hyperv.paravisor_present && hv_isolation_type_snp()))
smp_ops.smp_prepare_cpus = hv_smp_prepare_cpus;
# endif
@@ -449,6 +629,8 @@ static void __init ms_hyperv_init_platform(void)
/* Register Hyper-V specific clocksource */
hv_init_clocksource();
+ x86_setup_ops_for_tsc_pg_clock();
+ hv_vtl_init_platform();
#endif
/*
* TSC should be marked as unstable only after Hyper-V
@@ -475,6 +657,12 @@ static bool __init ms_hyperv_x2apic_available(void)
* (logically) generates MSIs directly to the system APIC irq domain.
* There is no HPET, and PCI MSI/MSI-X interrupts are remapped by the
* pci-hyperv host bridge.
+ *
+ * Note: for a Hyper-V root partition, this will always return false.
+ * The hypervisor doesn't expose these HYPERV_CPUID_VIRT_STACK_* cpuids by
+ * default, they are implemented as intercepts by the Windows Hyper-V stack.
+ * Even a nested root partition (L2 root) will not get them because the
+ * nested (L1) hypervisor filters them out.
*/
static bool __init ms_hyperv_msi_ext_dest_id(void)
{
@@ -488,6 +676,22 @@ static bool __init ms_hyperv_msi_ext_dest_id(void)
return eax & HYPERV_VS_PROPERTIES_EAX_EXTENDED_IOAPIC_RTE;
}
+#ifdef CONFIG_AMD_MEM_ENCRYPT
+static void hv_sev_es_hcall_prepare(struct ghcb *ghcb, struct pt_regs *regs)
+{
+ /* RAX and CPL are already in the GHCB */
+ ghcb_set_rcx(ghcb, regs->cx);
+ ghcb_set_rdx(ghcb, regs->dx);
+ ghcb_set_r8(ghcb, regs->r8);
+}
+
+static bool hv_sev_es_hcall_finish(struct ghcb *ghcb, struct pt_regs *regs)
+{
+ /* No checking of the return state needed */
+ return true;
+}
+#endif
+
const __initconst struct hypervisor_x86 x86_hyper_ms_hyperv = {
.name = "Microsoft Hyper-V",
.detect = ms_hyperv_platform,
@@ -495,4 +699,9 @@ const __initconst struct hypervisor_x86 x86_hyper_ms_hyperv = {
.init.x2apic_available = ms_hyperv_x2apic_available,
.init.msi_ext_dest_id = ms_hyperv_msi_ext_dest_id,
.init.init_platform = ms_hyperv_init_platform,
+ .init.guest_late_init = ms_hyperv_late_init,
+#ifdef CONFIG_AMD_MEM_ENCRYPT
+ .runtime.sev_es_hcall_prepare = hv_sev_es_hcall_prepare,
+ .runtime.sev_es_hcall_finish = hv_sev_es_hcall_finish,
+#endif
};
diff --git a/arch/x86/kernel/cpu/mtrr/Makefile b/arch/x86/kernel/cpu/mtrr/Makefile
index cc4f9f1cb94c..aee4bc5ad496 100644
--- a/arch/x86/kernel/cpu/mtrr/Makefile
+++ b/arch/x86/kernel/cpu/mtrr/Makefile
@@ -1,4 +1,4 @@
# SPDX-License-Identifier: GPL-2.0-only
obj-y := mtrr.o if.o generic.o cleanup.o
-obj-$(CONFIG_X86_32) += amd.o cyrix.o centaur.o
+obj-$(CONFIG_X86_32) += amd.o cyrix.o centaur.o legacy.o
diff --git a/arch/x86/kernel/cpu/mtrr/amd.c b/arch/x86/kernel/cpu/mtrr/amd.c
index a65a0272096d..ef3e8e42b782 100644
--- a/arch/x86/kernel/cpu/mtrr/amd.c
+++ b/arch/x86/kernel/cpu/mtrr/amd.c
@@ -109,17 +109,11 @@ amd_validate_add_page(unsigned long base, unsigned long size, unsigned int type)
return 0;
}
-static const struct mtrr_ops amd_mtrr_ops = {
- .vendor = X86_VENDOR_AMD,
+const struct mtrr_ops amd_mtrr_ops = {
+ .var_regs = 2,
.set = amd_set_mtrr,
.get = amd_get_mtrr,
.get_free_region = generic_get_free_region,
.validate_add_page = amd_validate_add_page,
.have_wrcomb = positive_have_wrcomb,
};
-
-int __init amd_init_mtrr(void)
-{
- set_mtrr_ops(&amd_mtrr_ops);
- return 0;
-}
diff --git a/arch/x86/kernel/cpu/mtrr/centaur.c b/arch/x86/kernel/cpu/mtrr/centaur.c
index f27177816569..6f6c3ae92943 100644
--- a/arch/x86/kernel/cpu/mtrr/centaur.c
+++ b/arch/x86/kernel/cpu/mtrr/centaur.c
@@ -45,15 +45,6 @@ centaur_get_free_region(unsigned long base, unsigned long size, int replace_reg)
return -ENOSPC;
}
-/*
- * Report boot time MCR setups
- */
-void mtrr_centaur_report_mcr(int mcr, u32 lo, u32 hi)
-{
- centaur_mcr[mcr].low = lo;
- centaur_mcr[mcr].high = hi;
-}
-
static void
centaur_get_mcr(unsigned int reg, unsigned long *base,
unsigned long *size, mtrr_type * type)
@@ -111,17 +102,11 @@ centaur_validate_add_page(unsigned long base, unsigned long size, unsigned int t
return 0;
}
-static const struct mtrr_ops centaur_mtrr_ops = {
- .vendor = X86_VENDOR_CENTAUR,
+const struct mtrr_ops centaur_mtrr_ops = {
+ .var_regs = 8,
.set = centaur_set_mcr,
.get = centaur_get_mcr,
.get_free_region = centaur_get_free_region,
.validate_add_page = centaur_validate_add_page,
.have_wrcomb = positive_have_wrcomb,
};
-
-int __init centaur_init_mtrr(void)
-{
- set_mtrr_ops(&centaur_mtrr_ops);
- return 0;
-}
diff --git a/arch/x86/kernel/cpu/mtrr/cleanup.c b/arch/x86/kernel/cpu/mtrr/cleanup.c
index b5f43049fa5f..18cf79d6e2c5 100644
--- a/arch/x86/kernel/cpu/mtrr/cleanup.c
+++ b/arch/x86/kernel/cpu/mtrr/cleanup.c
@@ -55,9 +55,6 @@ static int __initdata nr_range;
static struct var_mtrr_range_state __initdata range_state[RANGE_NUM];
-static int __initdata debug_print;
-#define Dprintk(x...) do { if (debug_print) pr_debug(x); } while (0)
-
#define BIOS_BUG_MSG \
"WARNING: BIOS bug: VAR MTRR %d contains strange UC entry under 1M, check with your system vendor!\n"
@@ -79,12 +76,11 @@ x86_get_mtrr_mem_range(struct range *range, int nr_range,
nr_range = add_range_with_merge(range, RANGE_NUM, nr_range,
base, base + size);
}
- if (debug_print) {
- pr_debug("After WB checking\n");
- for (i = 0; i < nr_range; i++)
- pr_debug("MTRR MAP PFN: %016llx - %016llx\n",
- range[i].start, range[i].end);
- }
+
+ Dprintk("After WB checking\n");
+ for (i = 0; i < nr_range; i++)
+ Dprintk("MTRR MAP PFN: %016llx - %016llx\n",
+ range[i].start, range[i].end);
/* Take out UC ranges: */
for (i = 0; i < num_var_ranges; i++) {
@@ -112,24 +108,22 @@ x86_get_mtrr_mem_range(struct range *range, int nr_range,
subtract_range(range, RANGE_NUM, extra_remove_base,
extra_remove_base + extra_remove_size);
- if (debug_print) {
- pr_debug("After UC checking\n");
- for (i = 0; i < RANGE_NUM; i++) {
- if (!range[i].end)
- continue;
- pr_debug("MTRR MAP PFN: %016llx - %016llx\n",
- range[i].start, range[i].end);
- }
+ Dprintk("After UC checking\n");
+ for (i = 0; i < RANGE_NUM; i++) {
+ if (!range[i].end)
+ continue;
+
+ Dprintk("MTRR MAP PFN: %016llx - %016llx\n",
+ range[i].start, range[i].end);
}
/* sort the ranges */
nr_range = clean_sort_range(range, RANGE_NUM);
- if (debug_print) {
- pr_debug("After sorting\n");
- for (i = 0; i < nr_range; i++)
- pr_debug("MTRR MAP PFN: %016llx - %016llx\n",
- range[i].start, range[i].end);
- }
+
+ Dprintk("After sorting\n");
+ for (i = 0; i < nr_range; i++)
+ Dprintk("MTRR MAP PFN: %016llx - %016llx\n",
+ range[i].start, range[i].end);
return nr_range;
}
@@ -164,16 +158,9 @@ static int __init enable_mtrr_cleanup_setup(char *str)
}
early_param("enable_mtrr_cleanup", enable_mtrr_cleanup_setup);
-static int __init mtrr_cleanup_debug_setup(char *str)
-{
- debug_print = 1;
- return 0;
-}
-early_param("mtrr_cleanup_debug", mtrr_cleanup_debug_setup);
-
static void __init
set_var_mtrr(unsigned int reg, unsigned long basek, unsigned long sizek,
- unsigned char type, unsigned int address_bits)
+ unsigned char type)
{
u32 base_lo, base_hi, mask_lo, mask_hi;
u64 base, mask;
@@ -183,7 +170,7 @@ set_var_mtrr(unsigned int reg, unsigned long basek, unsigned long sizek,
return;
}
- mask = (1ULL << address_bits) - 1;
+ mask = (1ULL << boot_cpu_data.x86_phys_bits) - 1;
mask &= ~((((u64)sizek) << 10) - 1);
base = ((u64)basek) << 10;
@@ -209,7 +196,7 @@ save_var_mtrr(unsigned int reg, unsigned long basek, unsigned long sizek,
range_state[reg].type = type;
}
-static void __init set_var_mtrr_all(unsigned int address_bits)
+static void __init set_var_mtrr_all(void)
{
unsigned long basek, sizek;
unsigned char type;
@@ -220,7 +207,7 @@ static void __init set_var_mtrr_all(unsigned int address_bits)
sizek = range_state[reg].size_pfn << (PAGE_SHIFT - 10);
type = range_state[reg].type;
- set_var_mtrr(reg, basek, sizek, type, address_bits);
+ set_var_mtrr(reg, basek, sizek, type);
}
}
@@ -267,7 +254,7 @@ range_to_mtrr(unsigned int reg, unsigned long range_startk,
align = max_align;
sizek = 1UL << align;
- if (debug_print) {
+ if (mtrr_debug) {
char start_factor = 'K', size_factor = 'K';
unsigned long start_base, size_base;
@@ -542,7 +529,7 @@ static void __init print_out_mtrr_range_state(void)
start_base = to_size_factor(start_base, &start_factor);
type = range_state[i].type;
- pr_debug("reg %d, base: %ld%cB, range: %ld%cB, type %s\n",
+ Dprintk("reg %d, base: %ld%cB, range: %ld%cB, type %s\n",
i, start_base, start_factor,
size_base, size_factor,
(type == MTRR_TYPE_UNCACHABLE) ? "UC" :
@@ -680,7 +667,7 @@ static int __init mtrr_search_optimal_index(void)
return index_good;
}
-int __init mtrr_cleanup(unsigned address_bits)
+int __init mtrr_cleanup(void)
{
unsigned long x_remove_base, x_remove_size;
unsigned long base, size, def, dummy;
@@ -689,7 +676,10 @@ int __init mtrr_cleanup(unsigned address_bits)
int index_good;
int i;
- if (!is_cpu(INTEL) || enable_mtrr_cleanup < 1)
+ if (!mtrr_enabled())
+ return 0;
+
+ if (!cpu_feature_enabled(X86_FEATURE_MTRR) || enable_mtrr_cleanup < 1)
return 0;
rdmsr(MSR_MTRRdefType, def, dummy);
@@ -711,7 +701,7 @@ int __init mtrr_cleanup(unsigned address_bits)
return 0;
/* Print original var MTRRs at first, for debugging: */
- pr_debug("original variable MTRRs\n");
+ Dprintk("original variable MTRRs\n");
print_out_mtrr_range_state();
memset(range, 0, sizeof(range));
@@ -742,8 +732,8 @@ int __init mtrr_cleanup(unsigned address_bits)
mtrr_print_out_one_result(i);
if (!result[i].bad) {
- set_var_mtrr_all(address_bits);
- pr_debug("New variable MTRRs\n");
+ set_var_mtrr_all();
+ Dprintk("New variable MTRRs\n");
print_out_mtrr_range_state();
return 1;
}
@@ -763,7 +753,7 @@ int __init mtrr_cleanup(unsigned address_bits)
mtrr_calc_range_state(chunk_size, gran_size,
x_remove_base, x_remove_size, i);
- if (debug_print) {
+ if (mtrr_debug) {
mtrr_print_out_one_result(i);
pr_info("\n");
}
@@ -786,8 +776,8 @@ int __init mtrr_cleanup(unsigned address_bits)
gran_size = result[i].gran_sizek;
gran_size <<= 10;
x86_setup_var_mtrrs(range, nr_range, chunk_size, gran_size);
- set_var_mtrr_all(address_bits);
- pr_debug("New variable MTRRs\n");
+ set_var_mtrr_all();
+ Dprintk("New variable MTRRs\n");
print_out_mtrr_range_state();
return 1;
} else {
@@ -802,7 +792,7 @@ int __init mtrr_cleanup(unsigned address_bits)
return 0;
}
#else
-int __init mtrr_cleanup(unsigned address_bits)
+int __init mtrr_cleanup(void)
{
return 0;
}
@@ -882,15 +872,18 @@ int __init mtrr_trim_uncached_memory(unsigned long end_pfn)
/* extra one for all 0 */
int num[MTRR_NUM_TYPES + 1];
+ if (!mtrr_enabled())
+ return 0;
+
/*
* Make sure we only trim uncachable memory on machines that
* support the Intel MTRR architecture:
*/
- if (!is_cpu(INTEL) || disable_mtrr_trim)
+ if (!cpu_feature_enabled(X86_FEATURE_MTRR) || disable_mtrr_trim)
return 0;
rdmsr(MSR_MTRRdefType, def, dummy);
- def &= 0xff;
+ def &= MTRR_DEF_TYPE_TYPE;
if (def != MTRR_TYPE_UNCACHABLE)
return 0;
diff --git a/arch/x86/kernel/cpu/mtrr/cyrix.c b/arch/x86/kernel/cpu/mtrr/cyrix.c
index ca670919b561..238dad57d4d6 100644
--- a/arch/x86/kernel/cpu/mtrr/cyrix.c
+++ b/arch/x86/kernel/cpu/mtrr/cyrix.c
@@ -234,51 +234,11 @@ static void cyrix_set_arr(unsigned int reg, unsigned long base,
post_set();
}
-typedef struct {
- unsigned long base;
- unsigned long size;
- mtrr_type type;
-} arr_state_t;
-
-static arr_state_t arr_state[8] = {
- {0UL, 0UL, 0UL}, {0UL, 0UL, 0UL}, {0UL, 0UL, 0UL}, {0UL, 0UL, 0UL},
- {0UL, 0UL, 0UL}, {0UL, 0UL, 0UL}, {0UL, 0UL, 0UL}, {0UL, 0UL, 0UL}
-};
-
-static unsigned char ccr_state[7] = { 0, 0, 0, 0, 0, 0, 0 };
-
-static void cyrix_set_all(void)
-{
- int i;
-
- prepare_set();
-
- /* the CCRs are not contiguous */
- for (i = 0; i < 4; i++)
- setCx86(CX86_CCR0 + i, ccr_state[i]);
- for (; i < 7; i++)
- setCx86(CX86_CCR4 + i, ccr_state[i]);
-
- for (i = 0; i < 8; i++) {
- cyrix_set_arr(i, arr_state[i].base,
- arr_state[i].size, arr_state[i].type);
- }
-
- post_set();
-}
-
-static const struct mtrr_ops cyrix_mtrr_ops = {
- .vendor = X86_VENDOR_CYRIX,
- .set_all = cyrix_set_all,
+const struct mtrr_ops cyrix_mtrr_ops = {
+ .var_regs = 8,
.set = cyrix_set_arr,
.get = cyrix_get_arr,
.get_free_region = cyrix_get_free_region,
.validate_add_page = generic_validate_add_page,
.have_wrcomb = positive_have_wrcomb,
};
-
-int __init cyrix_init_mtrr(void)
-{
- set_mtrr_ops(&cyrix_mtrr_ops);
- return 0;
-}
diff --git a/arch/x86/kernel/cpu/mtrr/generic.c b/arch/x86/kernel/cpu/mtrr/generic.c
index 558108296f3c..8c18327eb10b 100644
--- a/arch/x86/kernel/cpu/mtrr/generic.c
+++ b/arch/x86/kernel/cpu/mtrr/generic.c
@@ -8,9 +8,14 @@
#include <linux/init.h>
#include <linux/io.h>
#include <linux/mm.h>
-
+#include <linux/cc_platform.h>
+#include <linux/string_choices.h>
#include <asm/processor-flags.h>
+#include <asm/cacheinfo.h>
#include <asm/cpufeature.h>
+#include <asm/cpu_device_id.h>
+#include <asm/hypervisor.h>
+#include <asm/mshyperv.h>
#include <asm/tlbflush.h>
#include <asm/mtrr.h>
#include <asm/msr.h>
@@ -30,6 +35,55 @@ static struct fixed_range_block fixed_range_blocks[] = {
{}
};
+struct cache_map {
+ u64 start;
+ u64 end;
+ u64 flags;
+ u64 type:8;
+ u64 fixed:1;
+};
+
+bool mtrr_debug;
+
+static int __init mtrr_param_setup(char *str)
+{
+ int rc = 0;
+
+ if (!str)
+ return -EINVAL;
+ if (!strcmp(str, "debug"))
+ mtrr_debug = true;
+ else
+ rc = -EINVAL;
+
+ return rc;
+}
+early_param("mtrr", mtrr_param_setup);
+
+/*
+ * CACHE_MAP_MAX is the maximum number of memory ranges in cache_map, where
+ * no 2 adjacent ranges have the same cache mode (those would be merged).
+ * The number is based on the worst case:
+ * - no two adjacent fixed MTRRs share the same cache mode
+ * - one variable MTRR is spanning a huge area with mode WB
+ * - 255 variable MTRRs with mode UC all overlap with the WB MTRR, creating 2
+ * additional ranges each (result like "ababababa...aba" with a = WB, b = UC),
+ * accounting for MTRR_MAX_VAR_RANGES * 2 - 1 range entries
+ * - a TOP_MEM2 area (even with overlapping an UC MTRR can't add 2 range entries
+ * to the possible maximum, as it always starts at 4GB, thus it can't be in
+ * the middle of that MTRR, unless that MTRR starts at 0, which would remove
+ * the initial "a" from the "abababa" pattern above)
+ * The map won't contain ranges with no matching MTRR (those fall back to the
+ * default cache mode).
+ */
+#define CACHE_MAP_MAX (MTRR_NUM_FIXED_RANGES + MTRR_MAX_VAR_RANGES * 2)
+
+static struct cache_map init_cache_map[CACHE_MAP_MAX] __initdata;
+static struct cache_map *cache_map __refdata = init_cache_map;
+static unsigned int cache_map_size = CACHE_MAP_MAX;
+static unsigned int cache_map_n;
+static unsigned int cache_map_fixed;
+
static unsigned long smp_changes_mask;
static int mtrr_state_set;
u64 mtrr_tom2;
@@ -37,6 +91,9 @@ u64 mtrr_tom2;
struct mtrr_state_type mtrr_state;
EXPORT_SYMBOL_GPL(mtrr_state);
+/* Reserved bits in the high portion of the MTRRphysBaseN MSR. */
+u32 phys_hi_rsvd;
+
/*
* BIOS is expected to clear MtrrFixDramModEn bit, see for example
* "BIOS and Kernel Developer's Guide for the AMD Athlon 64 and AMD
@@ -53,6 +110,9 @@ static inline void k8_check_syscfg_dram_mod_en(void)
(boot_cpu_data.x86 >= 0x0f)))
return;
+ if (cc_platform_has(CC_ATTR_HOST_SEV_SNP))
+ return;
+
rdmsr(MSR_AMD64_SYSCFG, lo, hi);
if (lo & K8_MTRRFIXRANGE_DRAM_MODIFY) {
pr_err(FW_WARN "MTRR: CPU %u: SYSCFG[MtrrFixDramModEn]"
@@ -68,245 +128,429 @@ static u64 get_mtrr_size(u64 mask)
{
u64 size;
- mask >>= PAGE_SHIFT;
- mask |= size_or_mask;
+ mask |= (u64)phys_hi_rsvd << 32;
size = -mask;
- size <<= PAGE_SHIFT;
+
return size;
}
+static u8 get_var_mtrr_state(unsigned int reg, u64 *start, u64 *size)
+{
+ struct mtrr_var_range *mtrr = mtrr_state.var_ranges + reg;
+
+ if (!(mtrr->mask_lo & MTRR_PHYSMASK_V))
+ return MTRR_TYPE_INVALID;
+
+ *start = (((u64)mtrr->base_hi) << 32) + (mtrr->base_lo & PAGE_MASK);
+ *size = get_mtrr_size((((u64)mtrr->mask_hi) << 32) +
+ (mtrr->mask_lo & PAGE_MASK));
+
+ return mtrr->base_lo & MTRR_PHYSBASE_TYPE;
+}
+
+static u8 get_effective_type(u8 type1, u8 type2)
+{
+ if (type1 == MTRR_TYPE_UNCACHABLE || type2 == MTRR_TYPE_UNCACHABLE)
+ return MTRR_TYPE_UNCACHABLE;
+
+ if ((type1 == MTRR_TYPE_WRBACK && type2 == MTRR_TYPE_WRTHROUGH) ||
+ (type1 == MTRR_TYPE_WRTHROUGH && type2 == MTRR_TYPE_WRBACK))
+ return MTRR_TYPE_WRTHROUGH;
+
+ if (type1 != type2)
+ return MTRR_TYPE_UNCACHABLE;
+
+ return type1;
+}
+
+static void rm_map_entry_at(int idx)
+{
+ cache_map_n--;
+ if (cache_map_n > idx) {
+ memmove(cache_map + idx, cache_map + idx + 1,
+ sizeof(*cache_map) * (cache_map_n - idx));
+ }
+}
+
/*
- * Check and return the effective type for MTRR-MTRR type overlap.
- * Returns 1 if the effective type is UNCACHEABLE, else returns 0
+ * Add an entry into cache_map at a specific index. Merges adjacent entries if
+ * appropriate. Return the number of merges for correcting the scan index
+ * (this is needed as merging will reduce the number of entries, which will
+ * result in skipping entries in future iterations if the scan index isn't
+ * corrected).
+ * Note that the corrected index can never go below -1 (resulting in being 0 in
+ * the next scan iteration), as "2" is returned only if the current index is
+ * larger than zero.
*/
-static int check_type_overlap(u8 *prev, u8 *curr)
+static int add_map_entry_at(u64 start, u64 end, u8 type, int idx)
{
- if (*prev == MTRR_TYPE_UNCACHABLE || *curr == MTRR_TYPE_UNCACHABLE) {
- *prev = MTRR_TYPE_UNCACHABLE;
- *curr = MTRR_TYPE_UNCACHABLE;
- return 1;
+ bool merge_prev = false, merge_next = false;
+
+ if (start >= end)
+ return 0;
+
+ if (idx > 0) {
+ struct cache_map *prev = cache_map + idx - 1;
+
+ if (!prev->fixed && start == prev->end && type == prev->type)
+ merge_prev = true;
}
- if ((*prev == MTRR_TYPE_WRBACK && *curr == MTRR_TYPE_WRTHROUGH) ||
- (*prev == MTRR_TYPE_WRTHROUGH && *curr == MTRR_TYPE_WRBACK)) {
- *prev = MTRR_TYPE_WRTHROUGH;
- *curr = MTRR_TYPE_WRTHROUGH;
+ if (idx < cache_map_n) {
+ struct cache_map *next = cache_map + idx;
+
+ if (!next->fixed && end == next->start && type == next->type)
+ merge_next = true;
}
- if (*prev != *curr) {
- *prev = MTRR_TYPE_UNCACHABLE;
- *curr = MTRR_TYPE_UNCACHABLE;
+ if (merge_prev && merge_next) {
+ cache_map[idx - 1].end = cache_map[idx].end;
+ rm_map_entry_at(idx);
+ return 2;
+ }
+ if (merge_prev) {
+ cache_map[idx - 1].end = end;
return 1;
}
+ if (merge_next) {
+ cache_map[idx].start = start;
+ return 1;
+ }
+
+ /* Sanity check: the array should NEVER be too small! */
+ if (cache_map_n == cache_map_size) {
+ WARN(1, "MTRR cache mode memory map exhausted!\n");
+ cache_map_n = cache_map_fixed;
+ return 0;
+ }
+
+ if (cache_map_n > idx) {
+ memmove(cache_map + idx + 1, cache_map + idx,
+ sizeof(*cache_map) * (cache_map_n - idx));
+ }
+
+ cache_map[idx].start = start;
+ cache_map[idx].end = end;
+ cache_map[idx].type = type;
+ cache_map[idx].fixed = 0;
+ cache_map_n++;
return 0;
}
-/**
- * mtrr_type_lookup_fixed - look up memory type in MTRR fixed entries
- *
- * Return the MTRR fixed memory type of 'start'.
- *
- * MTRR fixed entries are divided into the following ways:
- * 0x00000 - 0x7FFFF : This range is divided into eight 64KB sub-ranges
- * 0x80000 - 0xBFFFF : This range is divided into sixteen 16KB sub-ranges
- * 0xC0000 - 0xFFFFF : This range is divided into sixty-four 4KB sub-ranges
- *
- * Return Values:
- * MTRR_TYPE_(type) - Matched memory type
- * MTRR_TYPE_INVALID - Unmatched
+/* Clear a part of an entry. Return 1 if start of entry is still valid. */
+static int clr_map_range_at(u64 start, u64 end, int idx)
+{
+ int ret = start != cache_map[idx].start;
+ u64 tmp;
+
+ if (start == cache_map[idx].start && end == cache_map[idx].end) {
+ rm_map_entry_at(idx);
+ } else if (start == cache_map[idx].start) {
+ cache_map[idx].start = end;
+ } else if (end == cache_map[idx].end) {
+ cache_map[idx].end = start;
+ } else {
+ tmp = cache_map[idx].end;
+ cache_map[idx].end = start;
+ add_map_entry_at(end, tmp, cache_map[idx].type, idx + 1);
+ }
+
+ return ret;
+}
+
+/*
+ * Add MTRR to the map. The current map is scanned and each part of the MTRR
+ * either overlapping with an existing entry or with a hole in the map is
+ * handled separately.
*/
-static u8 mtrr_type_lookup_fixed(u64 start, u64 end)
+static void add_map_entry(u64 start, u64 end, u8 type)
{
- int idx;
+ u8 new_type, old_type;
+ u64 tmp;
+ int i;
- if (start >= 0x100000)
- return MTRR_TYPE_INVALID;
+ for (i = 0; i < cache_map_n && start < end; i++) {
+ if (start >= cache_map[i].end)
+ continue;
+
+ if (start < cache_map[i].start) {
+ /* Region start has no overlap. */
+ tmp = min(end, cache_map[i].start);
+ i -= add_map_entry_at(start, tmp, type, i);
+ start = tmp;
+ continue;
+ }
+
+ new_type = get_effective_type(type, cache_map[i].type);
+ old_type = cache_map[i].type;
+
+ if (cache_map[i].fixed || new_type == old_type) {
+ /* Cut off start of new entry. */
+ start = cache_map[i].end;
+ continue;
+ }
+
+ /* Handle only overlapping part of region. */
+ tmp = min(end, cache_map[i].end);
+ i += clr_map_range_at(start, tmp, i);
+ i -= add_map_entry_at(start, tmp, new_type, i);
+ start = tmp;
+ }
+
+ /* Add rest of region after last map entry (rest might be empty). */
+ add_map_entry_at(start, end, type, i);
+}
+
+/* Add variable MTRRs to cache map. */
+static void map_add_var(void)
+{
+ u64 start, size;
+ unsigned int i;
+ u8 type;
+
+ /*
+ * Add AMD TOP_MEM2 area. Can't be added in mtrr_build_map(), as it
+ * needs to be added again when rebuilding the map due to potentially
+ * having moved as a result of variable MTRRs for memory below 4GB.
+ */
+ if (mtrr_tom2) {
+ add_map_entry(BIT_ULL(32), mtrr_tom2, MTRR_TYPE_WRBACK);
+ cache_map[cache_map_n - 1].fixed = 1;
+ }
+
+ for (i = 0; i < num_var_ranges; i++) {
+ type = get_var_mtrr_state(i, &start, &size);
+ if (type != MTRR_TYPE_INVALID)
+ add_map_entry(start, start + size, type);
+ }
+}
+
+/*
+ * Rebuild map by replacing variable entries. Needs to be called when MTRR
+ * registers are being changed after boot, as such changes could include
+ * removals of registers, which are complicated to handle without rebuild of
+ * the map.
+ */
+void generic_rebuild_map(void)
+{
+ if (mtrr_if != &generic_mtrr_ops)
+ return;
+
+ cache_map_n = cache_map_fixed;
+
+ map_add_var();
+}
+
+static unsigned int __init get_cache_map_size(void)
+{
+ return cache_map_fixed + 2 * num_var_ranges + (mtrr_tom2 != 0);
+}
+
+/* Build the cache_map containing the cache modes per memory range. */
+void __init mtrr_build_map(void)
+{
+ u64 start, end, size;
+ unsigned int i;
+ u8 type;
- /* 0x0 - 0x7FFFF */
- if (start < 0x80000) {
- idx = 0;
- idx += (start >> 16);
- return mtrr_state.fixed_ranges[idx];
- /* 0x80000 - 0xBFFFF */
- } else if (start < 0xC0000) {
- idx = 1 * 8;
- idx += ((start - 0x80000) >> 14);
- return mtrr_state.fixed_ranges[idx];
+ /* Add fixed MTRRs, optimize for adjacent entries with same type. */
+ if (mtrr_state.enabled & MTRR_STATE_MTRR_FIXED_ENABLED) {
+ /*
+ * Start with 64k size fixed entries, preset 1st one (hence the
+ * loop below is starting with index 1).
+ */
+ start = 0;
+ end = size = 0x10000;
+ type = mtrr_state.fixed_ranges[0];
+
+ for (i = 1; i < MTRR_NUM_FIXED_RANGES; i++) {
+ /* 8 64k entries, then 16 16k ones, rest 4k. */
+ if (i == 8 || i == 24)
+ size >>= 2;
+
+ if (mtrr_state.fixed_ranges[i] != type) {
+ add_map_entry(start, end, type);
+ start = end;
+ type = mtrr_state.fixed_ranges[i];
+ }
+ end += size;
+ }
+ add_map_entry(start, end, type);
}
- /* 0xC0000 - 0xFFFFF */
- idx = 3 * 8;
- idx += ((start - 0xC0000) >> 12);
- return mtrr_state.fixed_ranges[idx];
+ /* Mark fixed, they take precedence. */
+ for (i = 0; i < cache_map_n; i++)
+ cache_map[i].fixed = 1;
+ cache_map_fixed = cache_map_n;
+
+ map_add_var();
+
+ pr_info("MTRR map: %u entries (%u fixed + %u variable; max %u), built from %u variable MTRRs\n",
+ cache_map_n, cache_map_fixed, cache_map_n - cache_map_fixed,
+ get_cache_map_size(), num_var_ranges + (mtrr_tom2 != 0));
+
+ if (mtrr_debug) {
+ for (i = 0; i < cache_map_n; i++) {
+ pr_info("%3u: %016llx-%016llx %s\n", i,
+ cache_map[i].start, cache_map[i].end - 1,
+ mtrr_attrib_to_str(cache_map[i].type));
+ }
+ }
+}
+
+/* Copy the cache_map from __initdata memory to dynamically allocated one. */
+void __init mtrr_copy_map(void)
+{
+ unsigned int new_size = get_cache_map_size();
+
+ if (!mtrr_state.enabled || !new_size) {
+ cache_map = NULL;
+ return;
+ }
+
+ mutex_lock(&mtrr_mutex);
+
+ cache_map = kcalloc(new_size, sizeof(*cache_map), GFP_KERNEL);
+ if (cache_map) {
+ memmove(cache_map, init_cache_map,
+ cache_map_n * sizeof(*cache_map));
+ cache_map_size = new_size;
+ } else {
+ mtrr_state.enabled = 0;
+ pr_err("MTRRs disabled due to allocation failure for lookup map.\n");
+ }
+
+ mutex_unlock(&mtrr_mutex);
}
/**
- * mtrr_type_lookup_variable - look up memory type in MTRR variable entries
- *
- * Return Value:
- * MTRR_TYPE_(type) - Matched memory type or default memory type (unmatched)
+ * guest_force_mtrr_state - set static MTRR state for a guest
*
- * Output Arguments:
- * repeat - Set to 1 when [start:end] spanned across MTRR range and type
- * returned corresponds only to [start:*partial_end]. Caller has
- * to lookup again for [*partial_end:end].
+ * Used to set MTRR state via different means (e.g. with data obtained from
+ * a hypervisor).
+ * Is allowed only for special cases when running virtualized. Must be called
+ * from the x86_init.hyper.init_platform() hook. It can be called only once.
+ * The MTRR state can't be changed afterwards. To ensure that, X86_FEATURE_MTRR
+ * is cleared.
*
- * uniform - Set to 1 when an MTRR covers the region uniformly, i.e. the
- * region is fully covered by a single MTRR entry or the default
- * type.
+ * @var: MTRR variable range array to use
+ * @num_var: length of the @var array
+ * @def_type: default caching type
*/
-static u8 mtrr_type_lookup_variable(u64 start, u64 end, u64 *partial_end,
- int *repeat, u8 *uniform)
+void guest_force_mtrr_state(struct mtrr_var_range *var, unsigned int num_var,
+ mtrr_type def_type)
{
- int i;
- u64 base, mask;
- u8 prev_match, curr_match;
+ unsigned int i;
- *repeat = 0;
- *uniform = 1;
+ /* Only allowed to be called once before mtrr_bp_init(). */
+ if (WARN_ON_ONCE(mtrr_state_set))
+ return;
- prev_match = MTRR_TYPE_INVALID;
- for (i = 0; i < num_var_ranges; ++i) {
- unsigned short start_state, end_state, inclusive;
+ /* Only allowed when running virtualized. */
+ if (!cpu_feature_enabled(X86_FEATURE_HYPERVISOR))
+ return;
- if (!(mtrr_state.var_ranges[i].mask_lo & (1 << 11)))
- continue;
+ /*
+ * Only allowed for special virtualization cases:
+ * - when running as Hyper-V, SEV-SNP guest using vTOM
+ * - when running as Xen PV guest
+ * - when running as SEV-SNP or TDX guest to avoid unnecessary
+ * VMM communication/Virtualization exceptions (#VC, #VE)
+ */
+ if (!cc_platform_has(CC_ATTR_GUEST_SEV_SNP) &&
+ !hv_is_isolation_supported() &&
+ !cpu_feature_enabled(X86_FEATURE_XENPV) &&
+ !cpu_feature_enabled(X86_FEATURE_TDX_GUEST))
+ return;
- base = (((u64)mtrr_state.var_ranges[i].base_hi) << 32) +
- (mtrr_state.var_ranges[i].base_lo & PAGE_MASK);
- mask = (((u64)mtrr_state.var_ranges[i].mask_hi) << 32) +
- (mtrr_state.var_ranges[i].mask_lo & PAGE_MASK);
-
- start_state = ((start & mask) == (base & mask));
- end_state = ((end & mask) == (base & mask));
- inclusive = ((start < base) && (end > base));
-
- if ((start_state != end_state) || inclusive) {
- /*
- * We have start:end spanning across an MTRR.
- * We split the region into either
- *
- * - start_state:1
- * (start:mtrr_end)(mtrr_end:end)
- * - end_state:1
- * (start:mtrr_start)(mtrr_start:end)
- * - inclusive:1
- * (start:mtrr_start)(mtrr_start:mtrr_end)(mtrr_end:end)
- *
- * depending on kind of overlap.
- *
- * Return the type of the first region and a pointer
- * to the start of next region so that caller will be
- * advised to lookup again after having adjusted start
- * and end.
- *
- * Note: This way we handle overlaps with multiple
- * entries and the default type properly.
- */
- if (start_state)
- *partial_end = base + get_mtrr_size(mask);
- else
- *partial_end = base;
-
- if (unlikely(*partial_end <= start)) {
- WARN_ON(1);
- *partial_end = start + PAGE_SIZE;
- }
+ /* Disable MTRR in order to disable MTRR modifications. */
+ setup_clear_cpu_cap(X86_FEATURE_MTRR);
- end = *partial_end - 1; /* end is inclusive */
- *repeat = 1;
- *uniform = 0;
+ if (var) {
+ if (num_var > MTRR_MAX_VAR_RANGES) {
+ pr_warn("Trying to overwrite MTRR state with %u variable entries\n",
+ num_var);
+ num_var = MTRR_MAX_VAR_RANGES;
}
+ for (i = 0; i < num_var; i++)
+ mtrr_state.var_ranges[i] = var[i];
+ num_var_ranges = num_var;
+ }
- if ((start & mask) != (base & mask))
- continue;
+ mtrr_state.def_type = def_type;
+ mtrr_state.enabled |= MTRR_STATE_MTRR_ENABLED;
- curr_match = mtrr_state.var_ranges[i].base_lo & 0xff;
- if (prev_match == MTRR_TYPE_INVALID) {
- prev_match = curr_match;
- continue;
- }
+ mtrr_state_set = 1;
+}
- *uniform = 0;
- if (check_type_overlap(&prev_match, &curr_match))
- return curr_match;
- }
+static u8 type_merge(u8 type, u8 new_type, u8 *uniform)
+{
+ u8 effective_type;
- if (prev_match != MTRR_TYPE_INVALID)
- return prev_match;
+ if (type == MTRR_TYPE_INVALID)
+ return new_type;
- return mtrr_state.def_type;
+ effective_type = get_effective_type(type, new_type);
+ if (type != effective_type)
+ *uniform = 0;
+
+ return effective_type;
}
/**
* mtrr_type_lookup - look up memory type in MTRR
*
+ * @start: Begin of the physical address range
+ * @end: End of the physical address range
+ * @uniform: output argument:
+ * - 1: the returned MTRR type is valid for the whole region
+ * - 0: otherwise
+ *
* Return Values:
* MTRR_TYPE_(type) - The effective MTRR type for the region
* MTRR_TYPE_INVALID - MTRR is disabled
- *
- * Output Argument:
- * uniform - Set to 1 when an MTRR covers the region uniformly, i.e. the
- * region is fully covered by a single MTRR entry or the default
- * type.
*/
u8 mtrr_type_lookup(u64 start, u64 end, u8 *uniform)
{
- u8 type, prev_type, is_uniform = 1, dummy;
- int repeat;
- u64 partial_end;
+ u8 type = MTRR_TYPE_INVALID;
+ unsigned int i;
- /* Make end inclusive instead of exclusive */
- end--;
+ if (!mtrr_state_set) {
+ /* Uniformity is unknown. */
+ *uniform = 0;
+ return MTRR_TYPE_UNCACHABLE;
+ }
- if (!mtrr_state_set)
- return MTRR_TYPE_INVALID;
+ *uniform = 1;
if (!(mtrr_state.enabled & MTRR_STATE_MTRR_ENABLED))
- return MTRR_TYPE_INVALID;
+ return MTRR_TYPE_UNCACHABLE;
- /*
- * Look up the fixed ranges first, which take priority over
- * the variable ranges.
- */
- if ((start < 0x100000) &&
- (mtrr_state.have_fixed) &&
- (mtrr_state.enabled & MTRR_STATE_MTRR_FIXED_ENABLED)) {
- is_uniform = 0;
- type = mtrr_type_lookup_fixed(start, end);
- goto out;
- }
+ for (i = 0; i < cache_map_n && start < end; i++) {
+ /* Region after current map entry? -> continue with next one. */
+ if (start >= cache_map[i].end)
+ continue;
- /*
- * Look up the variable ranges. Look of multiple ranges matching
- * this address and pick type as per MTRR precedence.
- */
- type = mtrr_type_lookup_variable(start, end, &partial_end,
- &repeat, &is_uniform);
+ /* Start of region not covered by current map entry? */
+ if (start < cache_map[i].start) {
+ /* At least some part of region has default type. */
+ type = type_merge(type, mtrr_state.def_type, uniform);
+ /* End of region not covered, too? -> lookup done. */
+ if (end <= cache_map[i].start)
+ return type;
+ }
- /*
- * Common path is with repeat = 0.
- * However, we can have cases where [start:end] spans across some
- * MTRR ranges and/or the default type. Do repeated lookups for
- * that case here.
- */
- while (repeat) {
- prev_type = type;
- start = partial_end;
- is_uniform = 0;
- type = mtrr_type_lookup_variable(start, end, &partial_end,
- &repeat, &dummy);
+ /* At least part of region covered by map entry. */
+ type = type_merge(type, cache_map[i].type, uniform);
- if (check_type_overlap(&prev_type, &type))
- goto out;
+ start = cache_map[i].end;
}
- if (mtrr_tom2 && (start >= (1ULL<<32)) && (end < mtrr_tom2))
- type = MTRR_TYPE_WRBACK;
+ /* End of region past last entry in map? -> use default type. */
+ if (start < end)
+ type = type_merge(type, mtrr_state.def_type, uniform);
-out:
- *uniform = is_uniform;
return type;
}
@@ -349,7 +593,7 @@ static void get_fixed_ranges(mtrr_type *frs)
void mtrr_save_fixed_ranges(void *info)
{
- if (boot_cpu_has(X86_FEATURE_MTRR))
+ if (mtrr_state.have_fixed)
get_fixed_ranges(mtrr_state.fixed_ranges);
}
@@ -362,8 +606,8 @@ static void __init print_fixed_last(void)
if (!last_fixed_end)
return;
- pr_debug(" %05X-%05X %s\n", last_fixed_start,
- last_fixed_end - 1, mtrr_attrib_to_str(last_fixed_type));
+ pr_info(" %05X-%05X %s\n", last_fixed_start,
+ last_fixed_end - 1, mtrr_attrib_to_str(last_fixed_type));
last_fixed_end = 0;
}
@@ -396,21 +640,18 @@ print_fixed(unsigned base, unsigned step, const mtrr_type *types)
}
}
-static void prepare_set(void);
-static void post_set(void);
-
static void __init print_mtrr_state(void)
{
unsigned int i;
int high_width;
- pr_debug("MTRR default type: %s\n",
- mtrr_attrib_to_str(mtrr_state.def_type));
+ pr_info("MTRR default type: %s\n",
+ mtrr_attrib_to_str(mtrr_state.def_type));
if (mtrr_state.have_fixed) {
- pr_debug("MTRR fixed ranges %sabled:\n",
- ((mtrr_state.enabled & MTRR_STATE_MTRR_ENABLED) &&
- (mtrr_state.enabled & MTRR_STATE_MTRR_FIXED_ENABLED)) ?
- "en" : "dis");
+ pr_info("MTRR fixed ranges %s:\n",
+ str_enabled_disabled(
+ (mtrr_state.enabled & MTRR_STATE_MTRR_ENABLED) &&
+ (mtrr_state.enabled & MTRR_STATE_MTRR_FIXED_ENABLED)));
print_fixed(0x00000, 0x10000, mtrr_state.fixed_ranges + 0);
for (i = 0; i < 2; ++i)
print_fixed(0x80000 + i * 0x20000, 0x04000,
@@ -422,40 +663,27 @@ static void __init print_mtrr_state(void)
/* tail */
print_fixed_last();
}
- pr_debug("MTRR variable ranges %sabled:\n",
- mtrr_state.enabled & MTRR_STATE_MTRR_ENABLED ? "en" : "dis");
- high_width = (__ffs64(size_or_mask) - (32 - PAGE_SHIFT) + 3) / 4;
+ pr_info("MTRR variable ranges %s:\n",
+ str_enabled_disabled(mtrr_state.enabled & MTRR_STATE_MTRR_ENABLED));
+ high_width = (boot_cpu_data.x86_phys_bits - (32 - PAGE_SHIFT) + 3) / 4;
for (i = 0; i < num_var_ranges; ++i) {
- if (mtrr_state.var_ranges[i].mask_lo & (1 << 11))
- pr_debug(" %u base %0*X%05X000 mask %0*X%05X000 %s\n",
- i,
- high_width,
- mtrr_state.var_ranges[i].base_hi,
- mtrr_state.var_ranges[i].base_lo >> 12,
- high_width,
- mtrr_state.var_ranges[i].mask_hi,
- mtrr_state.var_ranges[i].mask_lo >> 12,
- mtrr_attrib_to_str(mtrr_state.var_ranges[i].base_lo & 0xff));
+ if (mtrr_state.var_ranges[i].mask_lo & MTRR_PHYSMASK_V)
+ pr_info(" %u base %0*X%05X000 mask %0*X%05X000 %s\n",
+ i,
+ high_width,
+ mtrr_state.var_ranges[i].base_hi,
+ mtrr_state.var_ranges[i].base_lo >> 12,
+ high_width,
+ mtrr_state.var_ranges[i].mask_hi,
+ mtrr_state.var_ranges[i].mask_lo >> 12,
+ mtrr_attrib_to_str(mtrr_state.var_ranges[i].base_lo &
+ MTRR_PHYSBASE_TYPE));
else
- pr_debug(" %u disabled\n", i);
+ pr_info(" %u disabled\n", i);
}
if (mtrr_tom2)
- pr_debug("TOM2: %016llx aka %lldM\n", mtrr_tom2, mtrr_tom2>>20);
-}
-
-/* PAT setup for BP. We need to go through sync steps here */
-void __init mtrr_bp_pat_init(void)
-{
- unsigned long flags;
-
- local_irq_save(flags);
- prepare_set();
-
- pat_init();
-
- post_set();
- local_irq_restore(flags);
+ pr_info("TOM2: %016llx aka %lldM\n", mtrr_tom2, mtrr_tom2>>20);
}
/* Grab all of the MTRR state for this CPU into *state */
@@ -468,7 +696,7 @@ bool __init get_mtrr_state(void)
vrs = mtrr_state.var_ranges;
rdmsr(MSR_MTRRcap, lo, dummy);
- mtrr_state.have_fixed = (lo >> 8) & 1;
+ mtrr_state.have_fixed = lo & MTRR_CAP_FIX;
for (i = 0; i < num_var_ranges; i++)
get_mtrr_var_range(i, &vrs[i]);
@@ -476,8 +704,8 @@ bool __init get_mtrr_state(void)
get_fixed_ranges(mtrr_state.fixed_ranges);
rdmsr(MSR_MTRRdefType, lo, dummy);
- mtrr_state.def_type = (lo & 0xff);
- mtrr_state.enabled = (lo & 0xc00) >> 10;
+ mtrr_state.def_type = lo & MTRR_DEF_TYPE_TYPE;
+ mtrr_state.enabled = (lo & MTRR_DEF_TYPE_ENABLE) >> MTRR_STATE_SHIFT;
if (amd_special_default_mtrr()) {
unsigned low, high;
@@ -490,7 +718,8 @@ bool __init get_mtrr_state(void)
mtrr_tom2 &= 0xffffff800000ULL;
}
- print_mtrr_state();
+ if (mtrr_debug)
+ print_mtrr_state();
mtrr_state_set = 1;
@@ -590,7 +819,7 @@ static void generic_get_mtrr(unsigned int reg, unsigned long *base,
rdmsr(MTRRphysMask_MSR(reg), mask_lo, mask_hi);
- if ((mask_lo & 0x800) == 0) {
+ if (!(mask_lo & MTRR_PHYSMASK_V)) {
/* Invalid (i.e. free) range */
*base = 0;
*size = 0;
@@ -601,8 +830,8 @@ static void generic_get_mtrr(unsigned int reg, unsigned long *base,
rdmsr(MTRRphysBase_MSR(reg), base_lo, base_hi);
/* Work out the shifted address mask: */
- tmp = (u64)mask_hi << (32 - PAGE_SHIFT) | mask_lo >> PAGE_SHIFT;
- mask = size_or_mask | tmp;
+ tmp = (u64)mask_hi << 32 | (mask_lo & PAGE_MASK);
+ mask = (u64)phys_hi_rsvd << 32 | tmp;
/* Expand tmp with high bits to all 1s: */
hi = fls64(tmp);
@@ -620,9 +849,9 @@ static void generic_get_mtrr(unsigned int reg, unsigned long *base,
* This works correctly if size is a power of two, i.e. a
* contiguous range:
*/
- *size = -mask;
+ *size = -mask >> PAGE_SHIFT;
*base = (u64)base_hi << (32 - PAGE_SHIFT) | base_lo >> PAGE_SHIFT;
- *type = base_lo & 0xff;
+ *type = base_lo & MTRR_PHYSBASE_TYPE;
out_put_cpu:
put_cpu();
@@ -660,9 +889,8 @@ static bool set_mtrr_var_ranges(unsigned int index, struct mtrr_var_range *vr)
bool changed = false;
rdmsr(MTRRphysBase_MSR(index), lo, hi);
- if ((vr->base_lo & 0xfffff0ffUL) != (lo & 0xfffff0ffUL)
- || (vr->base_hi & (size_and_mask >> (32 - PAGE_SHIFT))) !=
- (hi & (size_and_mask >> (32 - PAGE_SHIFT)))) {
+ if ((vr->base_lo & ~MTRR_PHYSBASE_RSVD) != (lo & ~MTRR_PHYSBASE_RSVD)
+ || (vr->base_hi & ~phys_hi_rsvd) != (hi & ~phys_hi_rsvd)) {
mtrr_wrmsr(MTRRphysBase_MSR(index), vr->base_lo, vr->base_hi);
changed = true;
@@ -670,9 +898,8 @@ static bool set_mtrr_var_ranges(unsigned int index, struct mtrr_var_range *vr)
rdmsr(MTRRphysMask_MSR(index), lo, hi);
- if ((vr->mask_lo & 0xfffff800UL) != (lo & 0xfffff800UL)
- || (vr->mask_hi & (size_and_mask >> (32 - PAGE_SHIFT))) !=
- (hi & (size_and_mask >> (32 - PAGE_SHIFT)))) {
+ if ((vr->mask_lo & ~MTRR_PHYSMASK_RSVD) != (lo & ~MTRR_PHYSMASK_RSVD)
+ || (vr->mask_hi & ~phys_hi_rsvd) != (hi & ~phys_hi_rsvd)) {
mtrr_wrmsr(MTRRphysMask_MSR(index), vr->mask_lo, vr->mask_hi);
changed = true;
}
@@ -684,7 +911,10 @@ static u32 deftype_lo, deftype_hi;
/**
* set_mtrr_state - Set the MTRR state for this CPU.
*
- * NOTE: The CPU must already be in a safe state for MTRR changes.
+ * NOTE: The CPU must already be in a safe state for MTRR changes, including
+ * measures that only a single CPU can be active in set_mtrr_state() in
+ * order to not be subject to races for usage of deftype_lo. This is
+ * accomplished by taking cache_disable_lock.
* RETURNS: 0 if no changes made, else a mask indicating what was changed.
*/
static unsigned long set_mtrr_state(void)
@@ -704,117 +934,46 @@ static unsigned long set_mtrr_state(void)
* Set_mtrr_restore restores the old value of MTRRdefType,
* so to set it we fiddle with the saved value:
*/
- if ((deftype_lo & 0xff) != mtrr_state.def_type
- || ((deftype_lo & 0xc00) >> 10) != mtrr_state.enabled) {
+ if ((deftype_lo & MTRR_DEF_TYPE_TYPE) != mtrr_state.def_type ||
+ ((deftype_lo & MTRR_DEF_TYPE_ENABLE) >> MTRR_STATE_SHIFT) != mtrr_state.enabled) {
- deftype_lo = (deftype_lo & ~0xcff) | mtrr_state.def_type |
- (mtrr_state.enabled << 10);
+ deftype_lo = (deftype_lo & MTRR_DEF_TYPE_DISABLE) |
+ mtrr_state.def_type |
+ (mtrr_state.enabled << MTRR_STATE_SHIFT);
change_mask |= MTRR_CHANGE_MASK_DEFTYPE;
}
return change_mask;
}
-
-static unsigned long cr4;
-static DEFINE_RAW_SPINLOCK(set_atomicity_lock);
-
-/*
- * Since we are disabling the cache don't allow any interrupts,
- * they would run extremely slow and would only increase the pain.
- *
- * The caller must ensure that local interrupts are disabled and
- * are reenabled after post_set() has been called.
- */
-static void prepare_set(void) __acquires(set_atomicity_lock)
+void mtrr_disable(void)
{
- unsigned long cr0;
-
- /*
- * Note that this is not ideal
- * since the cache is only flushed/disabled for this CPU while the
- * MTRRs are changed, but changing this requires more invasive
- * changes to the way the kernel boots
- */
-
- raw_spin_lock(&set_atomicity_lock);
-
- /* Enter the no-fill (CD=1, NW=0) cache mode and flush caches. */
- cr0 = read_cr0() | X86_CR0_CD;
- write_cr0(cr0);
-
- /*
- * Cache flushing is the most time-consuming step when programming
- * the MTRRs. Fortunately, as per the Intel Software Development
- * Manual, we can skip it if the processor supports cache self-
- * snooping.
- */
- if (!static_cpu_has(X86_FEATURE_SELFSNOOP))
- wbinvd();
-
- /* Save value of CR4 and clear Page Global Enable (bit 7) */
- if (boot_cpu_has(X86_FEATURE_PGE)) {
- cr4 = __read_cr4();
- __write_cr4(cr4 & ~X86_CR4_PGE);
- }
-
- /* Flush all TLBs via a mov %cr3, %reg; mov %reg, %cr3 */
- count_vm_tlb_event(NR_TLB_LOCAL_FLUSH_ALL);
- flush_tlb_local();
-
/* Save MTRR state */
rdmsr(MSR_MTRRdefType, deftype_lo, deftype_hi);
/* Disable MTRRs, and set the default type to uncached */
- mtrr_wrmsr(MSR_MTRRdefType, deftype_lo & ~0xcff, deftype_hi);
-
- /* Again, only flush caches if we have to. */
- if (!static_cpu_has(X86_FEATURE_SELFSNOOP))
- wbinvd();
+ mtrr_wrmsr(MSR_MTRRdefType, deftype_lo & MTRR_DEF_TYPE_DISABLE, deftype_hi);
}
-static void post_set(void) __releases(set_atomicity_lock)
+void mtrr_enable(void)
{
- /* Flush TLBs (no need to flush caches - they are disabled) */
- count_vm_tlb_event(NR_TLB_LOCAL_FLUSH_ALL);
- flush_tlb_local();
-
/* Intel (P6) standard MTRRs */
mtrr_wrmsr(MSR_MTRRdefType, deftype_lo, deftype_hi);
-
- /* Enable caches */
- write_cr0(read_cr0() & ~X86_CR0_CD);
-
- /* Restore value of CR4 */
- if (boot_cpu_has(X86_FEATURE_PGE))
- __write_cr4(cr4);
- raw_spin_unlock(&set_atomicity_lock);
}
-static void generic_set_all(void)
+void mtrr_generic_set_state(void)
{
unsigned long mask, count;
- unsigned long flags;
-
- local_irq_save(flags);
- prepare_set();
/* Actually set the state */
mask = set_mtrr_state();
- /* also set PAT */
- pat_init();
-
- post_set();
- local_irq_restore(flags);
-
/* Use the atomic bitops to update the global mask */
for (count = 0; count < sizeof(mask) * 8; ++count) {
if (mask & 0x01)
set_bit(count, &smp_changes_mask);
mask >>= 1;
}
-
}
/**
@@ -836,7 +995,7 @@ static void generic_set_mtrr(unsigned int reg, unsigned long base,
vr = &mtrr_state.var_ranges[reg];
local_irq_save(flags);
- prepare_set();
+ cache_disable();
if (size == 0) {
/*
@@ -847,15 +1006,15 @@ static void generic_set_mtrr(unsigned int reg, unsigned long base,
memset(vr, 0, sizeof(struct mtrr_var_range));
} else {
vr->base_lo = base << PAGE_SHIFT | type;
- vr->base_hi = (base & size_and_mask) >> (32 - PAGE_SHIFT);
- vr->mask_lo = -size << PAGE_SHIFT | 0x800;
- vr->mask_hi = (-size & size_and_mask) >> (32 - PAGE_SHIFT);
+ vr->base_hi = (base >> (32 - PAGE_SHIFT)) & ~phys_hi_rsvd;
+ vr->mask_lo = -size << PAGE_SHIFT | MTRR_PHYSMASK_V;
+ vr->mask_hi = (-size >> (32 - PAGE_SHIFT)) & ~phys_hi_rsvd;
mtrr_wrmsr(MTRRphysBase_MSR(reg), vr->base_lo, vr->base_hi);
mtrr_wrmsr(MTRRphysMask_MSR(reg), vr->mask_lo, vr->mask_hi);
}
- post_set();
+ cache_enable();
local_irq_restore(flags);
}
@@ -868,8 +1027,7 @@ int generic_validate_add_page(unsigned long base, unsigned long size,
* For Intel PPro stepping <= 7
* must be 4 MiB aligned and not touch 0x70000000 -> 0x7003FFFF
*/
- if (is_cpu(INTEL) && boot_cpu_data.x86 == 6 &&
- boot_cpu_data.x86_model == 1 &&
+ if (mtrr_if == &generic_mtrr_ops && boot_cpu_data.x86_vfm == INTEL_PENTIUM_PRO &&
boot_cpu_data.x86_stepping <= 7) {
if (base & ((1 << (22 - PAGE_SHIFT)) - 1)) {
pr_warn("mtrr: base(0x%lx000) is not 4 MiB aligned\n", base);
@@ -902,7 +1060,7 @@ static int generic_have_wrcomb(void)
{
unsigned long config, dummy;
rdmsr(MSR_MTRRcap, config, dummy);
- return config & (1 << 10);
+ return config & MTRR_CAP_WC;
}
int positive_have_wrcomb(void)
@@ -914,8 +1072,6 @@ int positive_have_wrcomb(void)
* Generic structure...
*/
const struct mtrr_ops generic_mtrr_ops = {
- .use_intel_if = 1,
- .set_all = generic_set_all,
.get = generic_get_mtrr,
.get_free_region = generic_get_free_region,
.set = generic_set_mtrr,
diff --git a/arch/x86/kernel/cpu/mtrr/if.c b/arch/x86/kernel/cpu/mtrr/if.c
index a5c506f6da7f..4049235b1bfe 100644
--- a/arch/x86/kernel/cpu/mtrr/if.c
+++ b/arch/x86/kernel/cpu/mtrr/if.c
@@ -99,7 +99,6 @@ mtrr_write(struct file *file, const char __user *buf, size_t len, loff_t * ppos)
char *ptr;
char line[LINE_SIZE];
int length;
- size_t linelen;
memset(line, 0, LINE_SIZE);
@@ -108,9 +107,8 @@ mtrr_write(struct file *file, const char __user *buf, size_t len, loff_t * ppos)
if (length < 0)
return length;
- linelen = strlen(line);
- ptr = line + linelen - 1;
- if (linelen && *ptr == '\n')
+ ptr = line + length - 1;
+ if (length && *ptr == '\n')
*ptr = '\0';
if (!strncmp(line, "disable=", 8)) {
diff --git a/arch/x86/kernel/cpu/mtrr/legacy.c b/arch/x86/kernel/cpu/mtrr/legacy.c
new file mode 100644
index 000000000000..d25882fcf181
--- /dev/null
+++ b/arch/x86/kernel/cpu/mtrr/legacy.c
@@ -0,0 +1,90 @@
+// SPDX-License-Identifier: GPL-2.0-only
+
+#include <linux/types.h>
+#include <linux/slab.h>
+#include <linux/syscore_ops.h>
+#include <asm/cpufeature.h>
+#include <asm/mtrr.h>
+#include <asm/processor.h>
+#include "mtrr.h"
+
+void mtrr_set_if(void)
+{
+ switch (boot_cpu_data.x86_vendor) {
+ case X86_VENDOR_AMD:
+ /* Pre-Athlon (K6) AMD CPU MTRRs */
+ if (cpu_feature_enabled(X86_FEATURE_K6_MTRR))
+ mtrr_if = &amd_mtrr_ops;
+ break;
+ case X86_VENDOR_CENTAUR:
+ if (cpu_feature_enabled(X86_FEATURE_CENTAUR_MCR))
+ mtrr_if = &centaur_mtrr_ops;
+ break;
+ case X86_VENDOR_CYRIX:
+ if (cpu_feature_enabled(X86_FEATURE_CYRIX_ARR))
+ mtrr_if = &cyrix_mtrr_ops;
+ break;
+ default:
+ break;
+ }
+}
+
+/*
+ * The suspend/resume methods are only for CPUs without MTRR. CPUs using generic
+ * MTRR driver don't require this.
+ */
+struct mtrr_value {
+ mtrr_type ltype;
+ unsigned long lbase;
+ unsigned long lsize;
+};
+
+static struct mtrr_value *mtrr_value;
+
+static int mtrr_save(void)
+{
+ int i;
+
+ if (!mtrr_value)
+ return -ENOMEM;
+
+ for (i = 0; i < num_var_ranges; i++) {
+ mtrr_if->get(i, &mtrr_value[i].lbase,
+ &mtrr_value[i].lsize,
+ &mtrr_value[i].ltype);
+ }
+ return 0;
+}
+
+static void mtrr_restore(void)
+{
+ int i;
+
+ for (i = 0; i < num_var_ranges; i++) {
+ if (mtrr_value[i].lsize) {
+ mtrr_if->set(i, mtrr_value[i].lbase,
+ mtrr_value[i].lsize,
+ mtrr_value[i].ltype);
+ }
+ }
+}
+
+static struct syscore_ops mtrr_syscore_ops = {
+ .suspend = mtrr_save,
+ .resume = mtrr_restore,
+};
+
+void mtrr_register_syscore(void)
+{
+ mtrr_value = kcalloc(num_var_ranges, sizeof(*mtrr_value), GFP_KERNEL);
+
+ /*
+ * The CPU has no MTRR and seems to not support SMP. They have
+ * specific drivers, we use a tricky method to support
+ * suspend/resume for them.
+ *
+ * TBD: is there any system with such CPU which supports
+ * suspend/resume? If no, we should remove the code.
+ */
+ register_syscore_ops(&mtrr_syscore_ops);
+}
diff --git a/arch/x86/kernel/cpu/mtrr/mtrr.c b/arch/x86/kernel/cpu/mtrr/mtrr.c
index 2746cac9d8a9..ecbda0341a8a 100644
--- a/arch/x86/kernel/cpu/mtrr/mtrr.c
+++ b/arch/x86/kernel/cpu/mtrr/mtrr.c
@@ -46,6 +46,7 @@
#include <linux/syscore_ops.h>
#include <linux/rcupdate.h>
+#include <asm/cacheinfo.h>
#include <asm/cpufeature.h>
#include <asm/e820/api.h>
#include <asm/mtrr.h>
@@ -54,36 +55,22 @@
#include "mtrr.h"
+static_assert(X86_MEMTYPE_UC == MTRR_TYPE_UNCACHABLE);
+static_assert(X86_MEMTYPE_WC == MTRR_TYPE_WRCOMB);
+static_assert(X86_MEMTYPE_WT == MTRR_TYPE_WRTHROUGH);
+static_assert(X86_MEMTYPE_WP == MTRR_TYPE_WRPROT);
+static_assert(X86_MEMTYPE_WB == MTRR_TYPE_WRBACK);
+
/* arch_phys_wc_add returns an MTRR register index plus this offset. */
#define MTRR_TO_PHYS_WC_OFFSET 1000
u32 num_var_ranges;
-static bool __mtrr_enabled;
-
-static bool mtrr_enabled(void)
-{
- return __mtrr_enabled;
-}
unsigned int mtrr_usage_table[MTRR_MAX_VAR_RANGES];
-static DEFINE_MUTEX(mtrr_mutex);
-
-u64 size_or_mask, size_and_mask;
-static bool mtrr_aps_delayed_init;
-
-static const struct mtrr_ops *mtrr_ops[X86_VENDOR_NUM] __ro_after_init;
+DEFINE_MUTEX(mtrr_mutex);
const struct mtrr_ops *mtrr_if;
-static void set_mtrr(unsigned int reg, unsigned long base,
- unsigned long size, mtrr_type type);
-
-void __init set_mtrr_ops(const struct mtrr_ops *ops)
-{
- if (ops->vendor && ops->vendor < X86_VENDOR_NUM)
- mtrr_ops[ops->vendor] = ops;
-}
-
/* Returns non-zero if we have the write-combining memory type */
static int have_wrcomb(void)
{
@@ -118,21 +105,6 @@ static int have_wrcomb(void)
return mtrr_if->have_wrcomb ? mtrr_if->have_wrcomb() : 0;
}
-/* This function returns the number of variable MTRRs */
-static void __init set_num_var_ranges(void)
-{
- unsigned long config = 0, dummy;
-
- if (use_intel())
- rdmsr(MSR_MTRRcap, config, dummy);
- else if (is_cpu(AMD) || is_cpu(HYGON))
- config = 2;
- else if (is_cpu(CYRIX) || is_cpu(CENTAUR))
- config = 8;
-
- num_var_ranges = config & 0xff;
-}
-
static void __init init_table(void)
{
int i, max;
@@ -160,25 +132,8 @@ static int mtrr_rendezvous_handler(void *info)
{
struct set_mtrr_data *data = info;
- /*
- * We use this same function to initialize the mtrrs during boot,
- * resume, runtime cpu online and on an explicit request to set a
- * specific MTRR.
- *
- * During boot or suspend, the state of the boot cpu's mtrrs has been
- * saved, and we want to replicate that across all the cpus that come
- * online (either at the end of boot or resume or during a runtime cpu
- * online). If we're doing that, @reg is set to something special and on
- * all the cpu's we do mtrr_if->set_all() (On the logical cpu that
- * started the boot/resume sequence, this might be a duplicate
- * set_all()).
- */
- if (data->smp_reg != ~0U) {
- mtrr_if->set(data->smp_reg, data->smp_base,
- data->smp_size, data->smp_type);
- } else if (mtrr_aps_delayed_init || !cpu_online(smp_processor_id())) {
- mtrr_if->set_all();
- }
+ mtrr_if->set(data->smp_reg, data->smp_base,
+ data->smp_size, data->smp_type);
return 0;
}
@@ -224,20 +179,8 @@ static inline int types_compatible(mtrr_type type1, mtrr_type type2)
* Note that the mechanism is the same for UP systems, too; all the SMP stuff
* becomes nops.
*/
-static void
-set_mtrr(unsigned int reg, unsigned long base, unsigned long size, mtrr_type type)
-{
- struct set_mtrr_data data = { .smp_reg = reg,
- .smp_base = base,
- .smp_size = size,
- .smp_type = type
- };
-
- stop_machine(mtrr_rendezvous_handler, &data, cpu_online_mask);
-}
-
-static void set_mtrr_cpuslocked(unsigned int reg, unsigned long base,
- unsigned long size, mtrr_type type)
+static void set_mtrr(unsigned int reg, unsigned long base, unsigned long size,
+ mtrr_type type)
{
struct set_mtrr_data data = { .smp_reg = reg,
.smp_base = base,
@@ -246,19 +189,8 @@ static void set_mtrr_cpuslocked(unsigned int reg, unsigned long base,
};
stop_machine_cpuslocked(mtrr_rendezvous_handler, &data, cpu_online_mask);
-}
-
-static void set_mtrr_from_inactive_cpu(unsigned int reg, unsigned long base,
- unsigned long size, mtrr_type type)
-{
- struct set_mtrr_data data = { .smp_reg = reg,
- .smp_base = base,
- .smp_size = size,
- .smp_type = type
- };
- stop_machine_from_inactive_cpu(mtrr_rendezvous_handler, &data,
- cpu_callout_mask);
+ generic_rebuild_map();
}
/**
@@ -380,7 +312,7 @@ int mtrr_add_page(unsigned long base, unsigned long size,
/* Search for an empty MTRR */
i = mtrr_if->get_free_region(base, size, replace);
if (i >= 0) {
- set_mtrr_cpuslocked(i, base, size, type);
+ set_mtrr(i, base, size, type);
if (likely(replace < 0)) {
mtrr_usage_table[i] = 1;
} else {
@@ -388,7 +320,7 @@ int mtrr_add_page(unsigned long base, unsigned long size,
if (increment)
mtrr_usage_table[i]++;
if (unlikely(replace != i)) {
- set_mtrr_cpuslocked(replace, 0, 0, 0);
+ set_mtrr(replace, 0, 0, 0);
mtrr_usage_table[replace] = 0;
}
}
@@ -406,7 +338,7 @@ static int mtrr_check(unsigned long base, unsigned long size)
{
if ((base & (PAGE_SIZE - 1)) || (size & (PAGE_SIZE - 1))) {
pr_warn("size and base must be multiples of 4 kiB\n");
- pr_debug("size: 0x%lx base: 0x%lx\n", size, base);
+ Dprintk("size: 0x%lx base: 0x%lx\n", size, base);
dump_stack();
return -1;
}
@@ -497,8 +429,7 @@ int mtrr_del_page(int reg, unsigned long base, unsigned long size)
}
}
if (reg < 0) {
- pr_debug("no MTRR for %lx000,%lx000 found\n",
- base, size);
+ Dprintk("no MTRR for %lx000,%lx000 found\n", base, size);
goto out;
}
}
@@ -516,7 +447,7 @@ int mtrr_del_page(int reg, unsigned long base, unsigned long size)
goto out;
}
if (--mtrr_usage_table[reg] < 1)
- set_mtrr_cpuslocked(reg, 0, 0, 0);
+ set_mtrr(reg, 0, 0, 0);
error = reg;
out:
mutex_unlock(&mtrr_mutex);
@@ -617,195 +548,63 @@ int arch_phys_wc_index(int handle)
}
EXPORT_SYMBOL_GPL(arch_phys_wc_index);
-/*
- * HACK ALERT!
- * These should be called implicitly, but we can't yet until all the initcall
- * stuff is done...
- */
-static void __init init_ifs(void)
-{
-#ifndef CONFIG_X86_64
- amd_init_mtrr();
- cyrix_init_mtrr();
- centaur_init_mtrr();
-#endif
-}
-
-/* The suspend/resume methods are only for CPU without MTRR. CPU using generic
- * MTRR driver doesn't require this
- */
-struct mtrr_value {
- mtrr_type ltype;
- unsigned long lbase;
- unsigned long lsize;
-};
-
-static struct mtrr_value mtrr_value[MTRR_MAX_VAR_RANGES];
-
-static int mtrr_save(void)
-{
- int i;
-
- for (i = 0; i < num_var_ranges; i++) {
- mtrr_if->get(i, &mtrr_value[i].lbase,
- &mtrr_value[i].lsize,
- &mtrr_value[i].ltype);
- }
- return 0;
-}
-
-static void mtrr_restore(void)
-{
- int i;
-
- for (i = 0; i < num_var_ranges; i++) {
- if (mtrr_value[i].lsize) {
- set_mtrr(i, mtrr_value[i].lbase,
- mtrr_value[i].lsize,
- mtrr_value[i].ltype);
- }
- }
-}
-
-
-
-static struct syscore_ops mtrr_syscore_ops = {
- .suspend = mtrr_save,
- .resume = mtrr_restore,
-};
-
int __initdata changed_by_mtrr_cleanup;
-#define SIZE_OR_MASK_BITS(n) (~((1ULL << ((n) - PAGE_SHIFT)) - 1))
/**
- * mtrr_bp_init - initialize mtrrs on the boot CPU
+ * mtrr_bp_init - initialize MTRRs on the boot CPU
*
* This needs to be called early; before any of the other CPUs are
* initialized (i.e. before smp_init()).
- *
*/
void __init mtrr_bp_init(void)
{
- u32 phys_addr;
-
- init_ifs();
+ bool generic_mtrrs = cpu_feature_enabled(X86_FEATURE_MTRR);
+ const char *why = "(not available)";
+ unsigned long config, dummy;
- phys_addr = 32;
-
- if (boot_cpu_has(X86_FEATURE_MTRR)) {
- mtrr_if = &generic_mtrr_ops;
- size_or_mask = SIZE_OR_MASK_BITS(36);
- size_and_mask = 0x00f00000;
- phys_addr = 36;
+ phys_hi_rsvd = GENMASK(31, boot_cpu_data.x86_phys_bits - 32);
+ if (!generic_mtrrs && mtrr_state.enabled) {
/*
- * This is an AMD specific MSR, but we assume(hope?) that
- * Intel will implement it too when they extend the address
- * bus of the Xeon.
+ * Software overwrite of MTRR state, only for generic case.
+ * Note that X86_FEATURE_MTRR has been reset in this case.
*/
- if (cpuid_eax(0x80000000) >= 0x80000008) {
- phys_addr = cpuid_eax(0x80000008) & 0xff;
- /* CPUID workaround for Intel 0F33/0F34 CPU */
- if (boot_cpu_data.x86_vendor == X86_VENDOR_INTEL &&
- boot_cpu_data.x86 == 0xF &&
- boot_cpu_data.x86_model == 0x3 &&
- (boot_cpu_data.x86_stepping == 0x3 ||
- boot_cpu_data.x86_stepping == 0x4))
- phys_addr = 36;
-
- size_or_mask = SIZE_OR_MASK_BITS(phys_addr);
- size_and_mask = ~size_or_mask & 0xfffff00000ULL;
- } else if (boot_cpu_data.x86_vendor == X86_VENDOR_CENTAUR &&
- boot_cpu_data.x86 == 6) {
- /*
- * VIA C* family have Intel style MTRRs,
- * but don't support PAE
- */
- size_or_mask = SIZE_OR_MASK_BITS(32);
- size_and_mask = 0;
- phys_addr = 32;
- }
- } else {
- switch (boot_cpu_data.x86_vendor) {
- case X86_VENDOR_AMD:
- if (cpu_feature_enabled(X86_FEATURE_K6_MTRR)) {
- /* Pre-Athlon (K6) AMD CPU MTRRs */
- mtrr_if = mtrr_ops[X86_VENDOR_AMD];
- size_or_mask = SIZE_OR_MASK_BITS(32);
- size_and_mask = 0;
- }
- break;
- case X86_VENDOR_CENTAUR:
- if (cpu_feature_enabled(X86_FEATURE_CENTAUR_MCR)) {
- mtrr_if = mtrr_ops[X86_VENDOR_CENTAUR];
- size_or_mask = SIZE_OR_MASK_BITS(32);
- size_and_mask = 0;
- }
- break;
- case X86_VENDOR_CYRIX:
- if (cpu_feature_enabled(X86_FEATURE_CYRIX_ARR)) {
- mtrr_if = mtrr_ops[X86_VENDOR_CYRIX];
- size_or_mask = SIZE_OR_MASK_BITS(32);
- size_and_mask = 0;
- }
- break;
- default:
- break;
- }
+ init_table();
+ mtrr_build_map();
+ pr_info("MTRRs set to read-only\n");
+
+ return;
}
- if (mtrr_if) {
- __mtrr_enabled = true;
- set_num_var_ranges();
- init_table();
- if (use_intel()) {
- /* BIOS may override */
- __mtrr_enabled = get_mtrr_state();
+ if (generic_mtrrs)
+ mtrr_if = &generic_mtrr_ops;
+ else
+ mtrr_set_if();
- if (mtrr_enabled())
- mtrr_bp_pat_init();
+ if (mtrr_enabled()) {
+ /* Get the number of variable MTRR ranges. */
+ if (mtrr_if == &generic_mtrr_ops)
+ rdmsr(MSR_MTRRcap, config, dummy);
+ else
+ config = mtrr_if->var_regs;
+ num_var_ranges = config & MTRR_CAP_VCNT;
- if (mtrr_cleanup(phys_addr)) {
- changed_by_mtrr_cleanup = 1;
- mtrr_if->set_all();
+ init_table();
+ if (mtrr_if == &generic_mtrr_ops) {
+ /* BIOS may override */
+ if (get_mtrr_state()) {
+ memory_caching_control |= CACHE_MTRR;
+ changed_by_mtrr_cleanup = mtrr_cleanup();
+ mtrr_build_map();
+ } else {
+ mtrr_if = NULL;
+ why = "by BIOS";
}
}
}
- if (!mtrr_enabled()) {
- pr_info("Disabled\n");
-
- /*
- * PAT initialization relies on MTRR's rendezvous handler.
- * Skip PAT init until the handler can initialize both
- * features independently.
- */
- pat_disable("MTRRs disabled, skipping PAT initialization too.");
- }
-}
-
-void mtrr_ap_init(void)
-{
if (!mtrr_enabled())
- return;
-
- if (!use_intel() || mtrr_aps_delayed_init)
- return;
-
- /*
- * Ideally we should hold mtrr_mutex here to avoid mtrr entries
- * changed, but this routine will be called in cpu boot time,
- * holding the lock breaks it.
- *
- * This routine is called in two cases:
- *
- * 1. very early time of software resume, when there absolutely
- * isn't mtrr entry changes;
- *
- * 2. cpu hotadd time. We let mtrr_add/del_page hold cpuhotplug
- * lock to prevent mtrr entry changes
- */
- set_mtrr_from_inactive_cpu(~0U, 0, 0, 0);
+ pr_info("MTRRs disabled %s\n", why);
}
/**
@@ -816,72 +615,32 @@ void mtrr_save_state(void)
{
int first_cpu;
- if (!mtrr_enabled())
+ if (!mtrr_enabled() || !mtrr_state.have_fixed)
return;
first_cpu = cpumask_first(cpu_online_mask);
smp_call_function_single(first_cpu, mtrr_save_fixed_ranges, NULL, 1);
}
-void set_mtrr_aps_delayed_init(void)
-{
- if (!mtrr_enabled())
- return;
- if (!use_intel())
- return;
-
- mtrr_aps_delayed_init = true;
-}
-
-/*
- * Delayed MTRR initialization for all AP's
- */
-void mtrr_aps_init(void)
+static int __init mtrr_init_finalize(void)
{
- if (!use_intel() || !mtrr_enabled())
- return;
-
/*
- * Check if someone has requested the delay of AP MTRR initialization,
- * by doing set_mtrr_aps_delayed_init(), prior to this point. If not,
- * then we are done.
+ * Map might exist if guest_force_mtrr_state() has been called or if
+ * mtrr_enabled() returns true.
*/
- if (!mtrr_aps_delayed_init)
- return;
-
- set_mtrr(~0U, 0, 0, 0);
- mtrr_aps_delayed_init = false;
-}
-
-void mtrr_bp_restore(void)
-{
- if (!use_intel() || !mtrr_enabled())
- return;
-
- mtrr_if->set_all();
-}
+ mtrr_copy_map();
-static int __init mtrr_init_finialize(void)
-{
if (!mtrr_enabled())
return 0;
- if (use_intel()) {
+ if (memory_caching_control & CACHE_MTRR) {
if (!changed_by_mtrr_cleanup)
mtrr_state_warn();
return 0;
}
- /*
- * The CPU has no MTRR and seems to not support SMP. They have
- * specific drivers, we use a tricky method to support
- * suspend/resume for them.
- *
- * TBD: is there any system with such CPU which supports
- * suspend/resume? If no, we should remove the code.
- */
- register_syscore_ops(&mtrr_syscore_ops);
+ mtrr_register_syscore();
return 0;
}
-subsys_initcall(mtrr_init_finialize);
+subsys_initcall(mtrr_init_finalize);
diff --git a/arch/x86/kernel/cpu/mtrr/mtrr.h b/arch/x86/kernel/cpu/mtrr/mtrr.h
index 2ac99e561181..5655f253d929 100644
--- a/arch/x86/kernel/cpu/mtrr/mtrr.h
+++ b/arch/x86/kernel/cpu/mtrr/mtrr.h
@@ -10,15 +10,15 @@
#define MTRR_CHANGE_MASK_VARIABLE 0x02
#define MTRR_CHANGE_MASK_DEFTYPE 0x04
+extern bool mtrr_debug;
+#define Dprintk(x...) do { if (mtrr_debug) pr_info(x); } while (0)
+
extern unsigned int mtrr_usage_table[MTRR_MAX_VAR_RANGES];
struct mtrr_ops {
- u32 vendor;
- u32 use_intel_if;
+ u32 var_regs;
void (*set)(unsigned int reg, unsigned long base,
unsigned long size, mtrr_type type);
- void (*set_all)(void);
-
void (*get)(unsigned int reg, unsigned long *base,
unsigned long *size, mtrr_type *type);
int (*get_free_region)(unsigned long base, unsigned long size,
@@ -53,28 +53,42 @@ void set_mtrr_prepare_save(struct set_mtrr_context *ctxt);
void fill_mtrr_var_range(unsigned int index,
u32 base_lo, u32 base_hi, u32 mask_lo, u32 mask_hi);
bool get_mtrr_state(void);
-void mtrr_bp_pat_init(void);
-extern void __init set_mtrr_ops(const struct mtrr_ops *ops);
-
-extern u64 size_or_mask, size_and_mask;
extern const struct mtrr_ops *mtrr_if;
-
-#define is_cpu(vnd) (mtrr_if && mtrr_if->vendor == X86_VENDOR_##vnd)
-#define use_intel() (mtrr_if && mtrr_if->use_intel_if == 1)
+extern struct mutex mtrr_mutex;
extern unsigned int num_var_ranges;
extern u64 mtrr_tom2;
extern struct mtrr_state_type mtrr_state;
+extern u32 phys_hi_rsvd;
void mtrr_state_warn(void);
const char *mtrr_attrib_to_str(int x);
void mtrr_wrmsr(unsigned, unsigned, unsigned);
-
-/* CPU specific mtrr init functions */
-int amd_init_mtrr(void);
-int cyrix_init_mtrr(void);
-int centaur_init_mtrr(void);
+#ifdef CONFIG_X86_32
+void mtrr_set_if(void);
+void mtrr_register_syscore(void);
+#else
+static inline void mtrr_set_if(void) { }
+static inline void mtrr_register_syscore(void) { }
+#endif
+void mtrr_build_map(void);
+void mtrr_copy_map(void);
+
+/* CPU specific mtrr_ops vectors. */
+extern const struct mtrr_ops amd_mtrr_ops;
+extern const struct mtrr_ops cyrix_mtrr_ops;
+extern const struct mtrr_ops centaur_mtrr_ops;
extern int changed_by_mtrr_cleanup;
-extern int mtrr_cleanup(unsigned address_bits);
+extern int mtrr_cleanup(void);
+
+/*
+ * Must be used by code which uses mtrr_if to call platform-specific
+ * MTRR manipulation functions.
+ */
+static inline bool mtrr_enabled(void)
+{
+ return !!mtrr_if;
+}
+void generic_rebuild_map(void);
diff --git a/arch/x86/kernel/cpu/proc.c b/arch/x86/kernel/cpu/proc.c
index 099b6f0d96bd..6571d432cbe3 100644
--- a/arch/x86/kernel/cpu/proc.c
+++ b/arch/x86/kernel/cpu/proc.c
@@ -4,6 +4,8 @@
#include <linux/string.h>
#include <linux/seq_file.h>
#include <linux/cpufreq.h>
+#include <asm/prctl.h>
+#include <linux/proc_fs.h>
#include "cpu.h"
@@ -18,13 +20,13 @@ static void show_cpuinfo_core(struct seq_file *m, struct cpuinfo_x86 *c,
unsigned int cpu)
{
#ifdef CONFIG_SMP
- seq_printf(m, "physical id\t: %d\n", c->phys_proc_id);
+ seq_printf(m, "physical id\t: %d\n", c->topo.pkg_id);
seq_printf(m, "siblings\t: %d\n",
cpumask_weight(topology_core_cpumask(cpu)));
- seq_printf(m, "core id\t\t: %d\n", c->cpu_core_id);
+ seq_printf(m, "core id\t\t: %d\n", c->topo.core_id);
seq_printf(m, "cpu cores\t: %d\n", c->booted_cores);
- seq_printf(m, "apicid\t\t: %d\n", c->apicid);
- seq_printf(m, "initial apicid\t: %d\n", c->initial_apicid);
+ seq_printf(m, "apicid\t\t: %d\n", c->topo.apicid);
+ seq_printf(m, "initial apicid\t: %d\n", c->topo.initial_apicid);
#endif
}
@@ -39,11 +41,11 @@ static void show_cpuinfo_misc(struct seq_file *m, struct cpuinfo_x86 *c)
"fpu_exception\t: %s\n"
"cpuid level\t: %d\n"
"wp\t\t: yes\n",
- boot_cpu_has_bug(X86_BUG_FDIV) ? "yes" : "no",
- boot_cpu_has_bug(X86_BUG_F00F) ? "yes" : "no",
- boot_cpu_has_bug(X86_BUG_COMA) ? "yes" : "no",
- boot_cpu_has(X86_FEATURE_FPU) ? "yes" : "no",
- boot_cpu_has(X86_FEATURE_FPU) ? "yes" : "no",
+ str_yes_no(boot_cpu_has_bug(X86_BUG_FDIV)),
+ str_yes_no(boot_cpu_has_bug(X86_BUG_F00F)),
+ str_yes_no(boot_cpu_has_bug(X86_BUG_COMA)),
+ str_yes_no(boot_cpu_has(X86_FEATURE_FPU)),
+ str_yes_no(boot_cpu_has(X86_FEATURE_FPU)),
c->cpuid_level);
}
#else
@@ -84,9 +86,12 @@ static int show_cpuinfo(struct seq_file *m, void *v)
seq_printf(m, "microcode\t: 0x%x\n", c->microcode);
if (cpu_has(c, X86_FEATURE_TSC)) {
- unsigned int freq = arch_freq_get_on_cpu(cpu);
+ int freq = arch_freq_get_on_cpu(cpu);
- seq_printf(m, "cpu MHz\t\t: %u.%03u\n", freq / 1000, (freq % 1000));
+ if (freq < 0)
+ seq_puts(m, "cpu MHz\t\t: Unknown\n");
+ else
+ seq_printf(m, "cpu MHz\t\t: %u.%03u\n", freq / 1000, (freq % 1000));
}
/* Cache size */
@@ -175,3 +180,24 @@ const struct seq_operations cpuinfo_op = {
.stop = c_stop,
.show = show_cpuinfo,
};
+
+#ifdef CONFIG_X86_USER_SHADOW_STACK
+static void dump_x86_features(struct seq_file *m, unsigned long features)
+{
+ if (features & ARCH_SHSTK_SHSTK)
+ seq_puts(m, "shstk ");
+ if (features & ARCH_SHSTK_WRSS)
+ seq_puts(m, "wrss ");
+}
+
+void arch_proc_pid_thread_features(struct seq_file *m, struct task_struct *task)
+{
+ seq_puts(m, "x86_Thread_features:\t");
+ dump_x86_features(m, task->thread.features);
+ seq_putc(m, '\n');
+
+ seq_puts(m, "x86_Thread_features_locked:\t");
+ dump_x86_features(m, task->thread.features_locked);
+ seq_putc(m, '\n');
+}
+#endif /* CONFIG_X86_USER_SHADOW_STACK */
diff --git a/arch/x86/kernel/cpu/rdrand.c b/arch/x86/kernel/cpu/rdrand.c
index 26a427fa84ea..eeac00d20926 100644
--- a/arch/x86/kernel/cpu/rdrand.c
+++ b/arch/x86/kernel/cpu/rdrand.c
@@ -6,6 +6,7 @@
* Authors: Fenghua Yu <fenghua.yu@intel.com>,
* H. Peter Anvin <hpa@linux.intel.com>
*/
+#include <linux/printk.h>
#include <asm/processor.h>
#include <asm/archrandom.h>
diff --git a/arch/x86/kernel/cpu/resctrl/Makefile b/arch/x86/kernel/cpu/resctrl/Makefile
index 4a06c37b9cf1..d8a04b195da2 100644
--- a/arch/x86/kernel/cpu/resctrl/Makefile
+++ b/arch/x86/kernel/cpu/resctrl/Makefile
@@ -1,4 +1,7 @@
# SPDX-License-Identifier: GPL-2.0
-obj-$(CONFIG_X86_CPU_RESCTRL) += core.o rdtgroup.o monitor.o
-obj-$(CONFIG_X86_CPU_RESCTRL) += ctrlmondata.o pseudo_lock.o
+obj-$(CONFIG_X86_CPU_RESCTRL) += core.o rdtgroup.o monitor.o
+obj-$(CONFIG_X86_CPU_RESCTRL) += ctrlmondata.o
+obj-$(CONFIG_RESCTRL_FS_PSEUDO_LOCK) += pseudo_lock.o
+
+# To allow define_trace.h's recursive include:
CFLAGS_pseudo_lock.o = -I$(src)
diff --git a/arch/x86/kernel/cpu/resctrl/core.c b/arch/x86/kernel/cpu/resctrl/core.c
index bb1c3f5f60c8..187d527ef73b 100644
--- a/arch/x86/kernel/cpu/resctrl/core.c
+++ b/arch/x86/kernel/cpu/resctrl/core.c
@@ -16,17 +16,25 @@
#define pr_fmt(fmt) "resctrl: " fmt
+#include <linux/cpu.h>
#include <linux/slab.h>
#include <linux/err.h>
-#include <linux/cacheinfo.h>
#include <linux/cpuhotplug.h>
-#include <asm/intel-family.h>
+#include <asm/cpu_device_id.h>
+#include <asm/msr.h>
#include <asm/resctrl.h>
#include "internal.h"
-/* Mutex to protect rdtgroup access. */
-DEFINE_MUTEX(rdtgroup_mutex);
+/*
+ * rdt_domain structures are kfree()d when their last CPU goes offline,
+ * and allocated when the first CPU in a new domain comes online.
+ * The rdt_resource's domain list is updated when this happens. Readers of
+ * the domain list must either take cpus_read_lock(), or rely on an RCU
+ * read-side critical section, to avoid observing concurrent modification.
+ * All writers take this mutex:
+ */
+static DEFINE_MUTEX(domain_list_lock);
/*
* The cached resctrl_pqr_state is strictly per CPU and can never be
@@ -37,42 +45,28 @@ DEFINE_MUTEX(rdtgroup_mutex);
DEFINE_PER_CPU(struct resctrl_pqr_state, pqr_state);
/*
- * Used to store the max resource name width and max resource data width
- * to display the schemata in a tabular format
- */
-int max_name_width, max_data_width;
-
-/*
* Global boolean for rdt_alloc which is true if any
* resource allocation is enabled.
*/
bool rdt_alloc_capable;
-static void
-mba_wrmsr_intel(struct rdt_domain *d, struct msr_param *m,
- struct rdt_resource *r);
-static void
-cat_wrmsr(struct rdt_domain *d, struct msr_param *m, struct rdt_resource *r);
-static void
-mba_wrmsr_amd(struct rdt_domain *d, struct msr_param *m,
- struct rdt_resource *r);
+static void mba_wrmsr_intel(struct msr_param *m);
+static void cat_wrmsr(struct msr_param *m);
+static void mba_wrmsr_amd(struct msr_param *m);
-#define domain_init(id) LIST_HEAD_INIT(rdt_resources_all[id].r_resctrl.domains)
+#define ctrl_domain_init(id) LIST_HEAD_INIT(rdt_resources_all[id].r_resctrl.ctrl_domains)
+#define mon_domain_init(id) LIST_HEAD_INIT(rdt_resources_all[id].r_resctrl.mon_domains)
-struct rdt_hw_resource rdt_resources_all[] = {
+struct rdt_hw_resource rdt_resources_all[RDT_NUM_RESOURCES] = {
[RDT_RESOURCE_L3] =
{
.r_resctrl = {
- .rid = RDT_RESOURCE_L3,
.name = "L3",
- .cache_level = 3,
- .cache = {
- .min_cbm_bits = 1,
- },
- .domains = domain_init(RDT_RESOURCE_L3),
- .parse_ctrlval = parse_cbm,
- .format_str = "%d=%0*x",
- .fflags = RFTYPE_RES_CACHE,
+ .ctrl_scope = RESCTRL_L3_CACHE,
+ .mon_scope = RESCTRL_L3_CACHE,
+ .ctrl_domains = ctrl_domain_init(RDT_RESOURCE_L3),
+ .mon_domains = mon_domain_init(RDT_RESOURCE_L3),
+ .schema_fmt = RESCTRL_SCHEMA_BITMAP,
},
.msr_base = MSR_IA32_L3_CBM_BASE,
.msr_update = cat_wrmsr,
@@ -80,16 +74,10 @@ struct rdt_hw_resource rdt_resources_all[] = {
[RDT_RESOURCE_L2] =
{
.r_resctrl = {
- .rid = RDT_RESOURCE_L2,
.name = "L2",
- .cache_level = 2,
- .cache = {
- .min_cbm_bits = 1,
- },
- .domains = domain_init(RDT_RESOURCE_L2),
- .parse_ctrlval = parse_cbm,
- .format_str = "%d=%0*x",
- .fflags = RFTYPE_RES_CACHE,
+ .ctrl_scope = RESCTRL_L2_CACHE,
+ .ctrl_domains = ctrl_domain_init(RDT_RESOURCE_L2),
+ .schema_fmt = RESCTRL_SCHEMA_BITMAP,
},
.msr_base = MSR_IA32_L2_CBM_BASE,
.msr_update = cat_wrmsr,
@@ -97,17 +85,39 @@ struct rdt_hw_resource rdt_resources_all[] = {
[RDT_RESOURCE_MBA] =
{
.r_resctrl = {
- .rid = RDT_RESOURCE_MBA,
.name = "MB",
- .cache_level = 3,
- .domains = domain_init(RDT_RESOURCE_MBA),
- .parse_ctrlval = parse_bw,
- .format_str = "%d=%*u",
- .fflags = RFTYPE_RES_MB,
+ .ctrl_scope = RESCTRL_L3_CACHE,
+ .ctrl_domains = ctrl_domain_init(RDT_RESOURCE_MBA),
+ .schema_fmt = RESCTRL_SCHEMA_RANGE,
+ },
+ },
+ [RDT_RESOURCE_SMBA] =
+ {
+ .r_resctrl = {
+ .name = "SMBA",
+ .ctrl_scope = RESCTRL_L3_CACHE,
+ .ctrl_domains = ctrl_domain_init(RDT_RESOURCE_SMBA),
+ .schema_fmt = RESCTRL_SCHEMA_RANGE,
},
},
};
+u32 resctrl_arch_system_num_rmid_idx(void)
+{
+ struct rdt_resource *r = &rdt_resources_all[RDT_RESOURCE_L3].r_resctrl;
+
+ /* RMID are independent numbers for x86. num_rmid_idx == num_rmid */
+ return r->num_rmid;
+}
+
+struct rdt_resource *resctrl_arch_get_resource(enum resctrl_res_level l)
+{
+ if (l >= RDT_NUM_RESOURCES)
+ return NULL;
+
+ return &rdt_resources_all[l].r_resctrl;
+}
+
/*
* cache_alloc_hsw_probe() - Have to probe for Intel haswell server CPUs
* as they do not have CPUID enumeration support for Cache allocation.
@@ -130,36 +140,27 @@ static inline void cache_alloc_hsw_probe(void)
{
struct rdt_hw_resource *hw_res = &rdt_resources_all[RDT_RESOURCE_L3];
struct rdt_resource *r = &hw_res->r_resctrl;
- u32 l, h, max_cbm = BIT_MASK(20) - 1;
+ u64 max_cbm = BIT_ULL_MASK(20) - 1, l3_cbm_0;
- if (wrmsr_safe(MSR_IA32_L3_CBM_BASE, max_cbm, 0))
+ if (wrmsrq_safe(MSR_IA32_L3_CBM_BASE, max_cbm))
return;
- rdmsr(MSR_IA32_L3_CBM_BASE, l, h);
+ rdmsrq(MSR_IA32_L3_CBM_BASE, l3_cbm_0);
/* If all the bits were set in MSR, return success */
- if (l != max_cbm)
+ if (l3_cbm_0 != max_cbm)
return;
hw_res->num_closid = 4;
- r->default_ctrl = max_cbm;
r->cache.cbm_len = 20;
r->cache.shareable_bits = 0xc0000;
r->cache.min_cbm_bits = 2;
+ r->cache.arch_has_sparse_bitmasks = false;
r->alloc_capable = true;
- r->alloc_enabled = true;
rdt_alloc_capable = true;
}
-bool is_mba_sc(struct rdt_resource *r)
-{
- if (!r)
- return rdt_resources_all[RDT_RESOURCE_MBA].r_resctrl.membw.mba_sc;
-
- return r->membw.mba_sc;
-}
-
/*
* rdt_get_mb_table() - get a mapping of bandwidth(b/w) percentage values
* exposed to user interface and the h/w understandable delay values.
@@ -181,7 +182,7 @@ static inline bool rdt_get_mb_table(struct rdt_resource *r)
return false;
}
-static bool __get_mem_config_intel(struct rdt_resource *r)
+static __init bool __get_mem_config_intel(struct rdt_resource *r)
{
struct rdt_hw_resource *hw_res = resctrl_to_arch_res(r);
union cpuid_0x10_3_eax eax;
@@ -191,7 +192,7 @@ static bool __get_mem_config_intel(struct rdt_resource *r)
cpuid_count(0x00000010, 3, &eax.full, &ebx, &ecx, &edx.full);
hw_res->num_closid = edx.split.cos_max + 1;
max_delay = eax.split.max_delay + 1;
- r->default_ctrl = MAX_MBA_BW;
+ r->membw.max_bw = MAX_MBA_BW;
r->membw.arch_needs_linear = true;
if (ecx & MBA_IS_LINEAR) {
r->membw.delay_linear = true;
@@ -202,30 +203,31 @@ static bool __get_mem_config_intel(struct rdt_resource *r)
return false;
r->membw.arch_needs_linear = false;
}
- r->data_width = 3;
if (boot_cpu_has(X86_FEATURE_PER_THREAD_MBA))
r->membw.throttle_mode = THREAD_THROTTLE_PER_THREAD;
else
r->membw.throttle_mode = THREAD_THROTTLE_MAX;
- thread_throttle_mode_init();
r->alloc_capable = true;
- r->alloc_enabled = true;
return true;
}
-static bool __rdt_get_mem_config_amd(struct rdt_resource *r)
+static __init bool __rdt_get_mem_config_amd(struct rdt_resource *r)
{
struct rdt_hw_resource *hw_res = resctrl_to_arch_res(r);
- union cpuid_0x10_3_eax eax;
- union cpuid_0x10_x_edx edx;
- u32 ebx, ecx;
+ u32 eax, ebx, ecx, edx, subleaf;
- cpuid_count(0x80000020, 1, &eax.full, &ebx, &ecx, &edx.full);
- hw_res->num_closid = edx.split.cos_max + 1;
- r->default_ctrl = MAX_MBA_BW_AMD;
+ /*
+ * Query CPUID_Fn80000020_EDX_x01 for MBA and
+ * CPUID_Fn80000020_EDX_x02 for SMBA
+ */
+ subleaf = (r->rid == RDT_RESOURCE_SMBA) ? 2 : 1;
+
+ cpuid_count(0x80000020, subleaf, &eax, &ebx, &ecx, &edx);
+ hw_res->num_closid = edx + 1;
+ r->membw.max_bw = 1 << eax;
/* AMD does not use delay */
r->membw.delay_linear = false;
@@ -238,11 +240,8 @@ static bool __rdt_get_mem_config_amd(struct rdt_resource *r)
r->membw.throttle_mode = THREAD_THROTTLE_UNDEFINED;
r->membw.min_bw = 0;
r->membw.bw_gran = 1;
- /* Max value is 2048, Data width should be 4 in decimal */
- r->data_width = 4;
r->alloc_capable = true;
- r->alloc_enabled = true;
return true;
}
@@ -251,17 +250,18 @@ static void rdt_get_cache_alloc_cfg(int idx, struct rdt_resource *r)
{
struct rdt_hw_resource *hw_res = resctrl_to_arch_res(r);
union cpuid_0x10_1_eax eax;
+ union cpuid_0x10_x_ecx ecx;
union cpuid_0x10_x_edx edx;
- u32 ebx, ecx;
+ u32 ebx, default_ctrl;
- cpuid_count(0x00000010, idx, &eax.full, &ebx, &ecx, &edx.full);
+ cpuid_count(0x00000010, idx, &eax.full, &ebx, &ecx.full, &edx.full);
hw_res->num_closid = edx.split.cos_max + 1;
r->cache.cbm_len = eax.split.cbm_len + 1;
- r->default_ctrl = BIT_MASK(eax.split.cbm_len + 1) - 1;
- r->cache.shareable_bits = ebx & r->default_ctrl;
- r->data_width = (r->cache.cbm_len + 3) / 4;
+ default_ctrl = BIT_MASK(eax.split.cbm_len + 1) - 1;
+ r->cache.shareable_bits = ebx & default_ctrl;
+ if (boot_cpu_data.x86_vendor == X86_VENDOR_INTEL)
+ r->cache.arch_has_sparse_bitmasks = ecx.split.noncont;
r->alloc_capable = true;
- r->alloc_enabled = true;
}
static void rdt_get_cdp_config(int level)
@@ -284,15 +284,14 @@ static void rdt_get_cdp_l2_config(void)
rdt_get_cdp_config(RDT_RESOURCE_L2);
}
-static void
-mba_wrmsr_amd(struct rdt_domain *d, struct msr_param *m, struct rdt_resource *r)
+static void mba_wrmsr_amd(struct msr_param *m)
{
+ struct rdt_hw_ctrl_domain *hw_dom = resctrl_to_arch_ctrl_dom(m->dom);
+ struct rdt_hw_resource *hw_res = resctrl_to_arch_res(m->res);
unsigned int i;
- struct rdt_hw_domain *hw_dom = resctrl_to_arch_dom(d);
- struct rdt_hw_resource *hw_res = resctrl_to_arch_res(r);
for (i = m->low; i < m->high; i++)
- wrmsrl(hw_res->msr_base + i, hw_dom->ctrl_val[i]);
+ wrmsrq(hw_res->msr_base + i, hw_dom->ctrl_val[i]);
}
/*
@@ -300,50 +299,34 @@ mba_wrmsr_amd(struct rdt_domain *d, struct msr_param *m, struct rdt_resource *r)
* that can be written to QOS_MSRs.
* There are currently no SKUs which support non linear delay values.
*/
-u32 delay_bw_map(unsigned long bw, struct rdt_resource *r)
+static u32 delay_bw_map(unsigned long bw, struct rdt_resource *r)
{
if (r->membw.delay_linear)
return MAX_MBA_BW - bw;
pr_warn_once("Non Linear delay-bw map not supported but queried\n");
- return r->default_ctrl;
+ return MAX_MBA_BW;
}
-static void
-mba_wrmsr_intel(struct rdt_domain *d, struct msr_param *m,
- struct rdt_resource *r)
+static void mba_wrmsr_intel(struct msr_param *m)
{
+ struct rdt_hw_ctrl_domain *hw_dom = resctrl_to_arch_ctrl_dom(m->dom);
+ struct rdt_hw_resource *hw_res = resctrl_to_arch_res(m->res);
unsigned int i;
- struct rdt_hw_domain *hw_dom = resctrl_to_arch_dom(d);
- struct rdt_hw_resource *hw_res = resctrl_to_arch_res(r);
/* Write the delay values for mba. */
for (i = m->low; i < m->high; i++)
- wrmsrl(hw_res->msr_base + i, delay_bw_map(hw_dom->ctrl_val[i], r));
+ wrmsrq(hw_res->msr_base + i, delay_bw_map(hw_dom->ctrl_val[i], m->res));
}
-static void
-cat_wrmsr(struct rdt_domain *d, struct msr_param *m, struct rdt_resource *r)
+static void cat_wrmsr(struct msr_param *m)
{
+ struct rdt_hw_ctrl_domain *hw_dom = resctrl_to_arch_ctrl_dom(m->dom);
+ struct rdt_hw_resource *hw_res = resctrl_to_arch_res(m->res);
unsigned int i;
- struct rdt_hw_domain *hw_dom = resctrl_to_arch_dom(d);
- struct rdt_hw_resource *hw_res = resctrl_to_arch_res(r);
for (i = m->low; i < m->high; i++)
- wrmsrl(hw_res->msr_base + i, hw_dom->ctrl_val[i]);
-}
-
-struct rdt_domain *get_domain_from_cpu(int cpu, struct rdt_resource *r)
-{
- struct rdt_domain *d;
-
- list_for_each_entry(d, &r->domains, list) {
- /* Find the domain that contains this CPU */
- if (cpumask_test_cpu(cpu, &d->cpu_mask))
- return d;
- }
-
- return NULL;
+ wrmsrq(hw_res->msr_base + i, hw_dom->ctrl_val[i]);
}
u32 resctrl_arch_get_num_closid(struct rdt_resource *r)
@@ -353,55 +336,14 @@ u32 resctrl_arch_get_num_closid(struct rdt_resource *r)
void rdt_ctrl_update(void *arg)
{
+ struct rdt_hw_resource *hw_res;
struct msr_param *m = arg;
- struct rdt_hw_resource *hw_res = resctrl_to_arch_res(m->res);
- struct rdt_resource *r = m->res;
- int cpu = smp_processor_id();
- struct rdt_domain *d;
- d = get_domain_from_cpu(cpu, r);
- if (d) {
- hw_res->msr_update(d, m, r);
- return;
- }
- pr_warn_once("cpu %d not found in any domain for resource %s\n",
- cpu, r->name);
+ hw_res = resctrl_to_arch_res(m->res);
+ hw_res->msr_update(m);
}
-/*
- * rdt_find_domain - Find a domain in a resource that matches input resource id
- *
- * Search resource r's domain list to find the resource id. If the resource
- * id is found in a domain, return the domain. Otherwise, if requested by
- * caller, return the first domain whose id is bigger than the input id.
- * The domain list is sorted by id in ascending order.
- */
-struct rdt_domain *rdt_find_domain(struct rdt_resource *r, int id,
- struct list_head **pos)
-{
- struct rdt_domain *d;
- struct list_head *l;
-
- if (id < 0)
- return ERR_PTR(-ENODEV);
-
- list_for_each(l, &r->domains) {
- d = list_entry(l, struct rdt_domain, list);
- /* When id is found, return its domain. */
- if (id == d->id)
- return d;
- /* Stop searching when finding id's position in sorted list. */
- if (id < d->id)
- break;
- }
-
- if (pos)
- *pos = l;
-
- return NULL;
-}
-
-void setup_default_ctrlval(struct rdt_resource *r, u32 *dc, u32 *dm)
+static void setup_default_ctrlval(struct rdt_resource *r, u32 *dc)
{
struct rdt_hw_resource *hw_res = resctrl_to_arch_res(r);
int i;
@@ -410,107 +352,114 @@ void setup_default_ctrlval(struct rdt_resource *r, u32 *dc, u32 *dm)
* Initialize the Control MSRs to having no control.
* For Cache Allocation: Set all bits in cbm
* For Memory Allocation: Set b/w requested to 100%
- * and the bandwidth in MBps to U32_MAX
*/
- for (i = 0; i < hw_res->num_closid; i++, dc++, dm++) {
- *dc = r->default_ctrl;
- *dm = MBA_MAX_MBPS;
- }
+ for (i = 0; i < hw_res->num_closid; i++, dc++)
+ *dc = resctrl_get_default_ctrl(r);
+}
+
+static void ctrl_domain_free(struct rdt_hw_ctrl_domain *hw_dom)
+{
+ kfree(hw_dom->ctrl_val);
+ kfree(hw_dom);
+}
+
+static void mon_domain_free(struct rdt_hw_mon_domain *hw_dom)
+{
+ kfree(hw_dom->arch_mbm_total);
+ kfree(hw_dom->arch_mbm_local);
+ kfree(hw_dom);
}
-static int domain_setup_ctrlval(struct rdt_resource *r, struct rdt_domain *d)
+static int domain_setup_ctrlval(struct rdt_resource *r, struct rdt_ctrl_domain *d)
{
+ struct rdt_hw_ctrl_domain *hw_dom = resctrl_to_arch_ctrl_dom(d);
struct rdt_hw_resource *hw_res = resctrl_to_arch_res(r);
- struct rdt_hw_domain *hw_dom = resctrl_to_arch_dom(d);
struct msr_param m;
- u32 *dc, *dm;
+ u32 *dc;
dc = kmalloc_array(hw_res->num_closid, sizeof(*hw_dom->ctrl_val),
GFP_KERNEL);
if (!dc)
return -ENOMEM;
- dm = kmalloc_array(hw_res->num_closid, sizeof(*hw_dom->mbps_val),
- GFP_KERNEL);
- if (!dm) {
- kfree(dc);
- return -ENOMEM;
- }
-
hw_dom->ctrl_val = dc;
- hw_dom->mbps_val = dm;
- setup_default_ctrlval(r, dc, dm);
+ setup_default_ctrlval(r, dc);
+ m.res = r;
+ m.dom = d;
m.low = 0;
m.high = hw_res->num_closid;
- hw_res->msr_update(d, &m, r);
+ hw_res->msr_update(&m);
return 0;
}
-static int domain_setup_mon_state(struct rdt_resource *r, struct rdt_domain *d)
+/**
+ * arch_domain_mbm_alloc() - Allocate arch private storage for the MBM counters
+ * @num_rmid: The size of the MBM counter array
+ * @hw_dom: The domain that owns the allocated arrays
+ */
+static int arch_domain_mbm_alloc(u32 num_rmid, struct rdt_hw_mon_domain *hw_dom)
{
size_t tsize;
- if (is_llc_occupancy_enabled()) {
- d->rmid_busy_llc = bitmap_zalloc(r->num_rmid, GFP_KERNEL);
- if (!d->rmid_busy_llc)
+ if (resctrl_arch_is_mbm_total_enabled()) {
+ tsize = sizeof(*hw_dom->arch_mbm_total);
+ hw_dom->arch_mbm_total = kcalloc(num_rmid, tsize, GFP_KERNEL);
+ if (!hw_dom->arch_mbm_total)
return -ENOMEM;
- INIT_DELAYED_WORK(&d->cqm_limbo, cqm_handle_limbo);
}
- if (is_mbm_total_enabled()) {
- tsize = sizeof(*d->mbm_total);
- d->mbm_total = kcalloc(r->num_rmid, tsize, GFP_KERNEL);
- if (!d->mbm_total) {
- bitmap_free(d->rmid_busy_llc);
- return -ENOMEM;
- }
- }
- if (is_mbm_local_enabled()) {
- tsize = sizeof(*d->mbm_local);
- d->mbm_local = kcalloc(r->num_rmid, tsize, GFP_KERNEL);
- if (!d->mbm_local) {
- bitmap_free(d->rmid_busy_llc);
- kfree(d->mbm_total);
+ if (resctrl_arch_is_mbm_local_enabled()) {
+ tsize = sizeof(*hw_dom->arch_mbm_local);
+ hw_dom->arch_mbm_local = kcalloc(num_rmid, tsize, GFP_KERNEL);
+ if (!hw_dom->arch_mbm_local) {
+ kfree(hw_dom->arch_mbm_total);
+ hw_dom->arch_mbm_total = NULL;
return -ENOMEM;
}
}
- if (is_mbm_enabled()) {
- INIT_DELAYED_WORK(&d->mbm_over, mbm_handle_overflow);
- mbm_setup_overflow_handler(d, MBM_OVERFLOW_INTERVAL);
+ return 0;
+}
+
+static int get_domain_id_from_scope(int cpu, enum resctrl_scope scope)
+{
+ switch (scope) {
+ case RESCTRL_L2_CACHE:
+ case RESCTRL_L3_CACHE:
+ return get_cpu_cacheinfo_id(cpu, scope);
+ case RESCTRL_L3_NODE:
+ return cpu_to_node(cpu);
+ default:
+ break;
}
- return 0;
+ return -EINVAL;
}
-/*
- * domain_add_cpu - Add a cpu to a resource's domain list.
- *
- * If an existing domain in the resource r's domain list matches the cpu's
- * resource id, add the cpu in the domain.
- *
- * Otherwise, a new domain is allocated and inserted into the right position
- * in the domain list sorted by id in ascending order.
- *
- * The order in the domain list is visible to users when we print entries
- * in the schemata file and schemata input is validated to have the same order
- * as this list.
- */
-static void domain_add_cpu(int cpu, struct rdt_resource *r)
+static void domain_add_cpu_ctrl(int cpu, struct rdt_resource *r)
{
- int id = get_cpu_cacheinfo_id(cpu, r->cache_level);
+ int id = get_domain_id_from_scope(cpu, r->ctrl_scope);
+ struct rdt_hw_ctrl_domain *hw_dom;
struct list_head *add_pos = NULL;
- struct rdt_hw_domain *hw_dom;
- struct rdt_domain *d;
+ struct rdt_domain_hdr *hdr;
+ struct rdt_ctrl_domain *d;
+ int err;
- d = rdt_find_domain(r, id, &add_pos);
- if (IS_ERR(d)) {
- pr_warn("Couldn't find cache id for CPU %d\n", cpu);
+ lockdep_assert_held(&domain_list_lock);
+
+ if (id < 0) {
+ pr_warn_once("Can't find control domain id for CPU:%d scope:%d for resource %s\n",
+ cpu, r->ctrl_scope, r->name);
return;
}
- if (d) {
- cpumask_set_cpu(cpu, &d->cpu_mask);
+ hdr = resctrl_find_domain(&r->ctrl_domains, id, &add_pos);
+ if (hdr) {
+ if (WARN_ON_ONCE(hdr->type != RESCTRL_CTRL_DOMAIN))
+ return;
+ d = container_of(hdr, struct rdt_ctrl_domain, hdr);
+
+ cpumask_set_cpu(cpu, &d->hdr.cpu_mask);
if (r->cache.arch_has_per_cpu_cfg)
rdt_domain_reconfigure_cdp(r);
return;
@@ -521,170 +470,232 @@ static void domain_add_cpu(int cpu, struct rdt_resource *r)
return;
d = &hw_dom->d_resctrl;
- d->id = id;
- cpumask_set_cpu(cpu, &d->cpu_mask);
+ d->hdr.id = id;
+ d->hdr.type = RESCTRL_CTRL_DOMAIN;
+ cpumask_set_cpu(cpu, &d->hdr.cpu_mask);
rdt_domain_reconfigure_cdp(r);
- if (r->alloc_capable && domain_setup_ctrlval(r, d)) {
- kfree(hw_dom);
+ if (domain_setup_ctrlval(r, d)) {
+ ctrl_domain_free(hw_dom);
return;
}
- if (r->mon_capable && domain_setup_mon_state(r, d)) {
- kfree(hw_dom->ctrl_val);
- kfree(hw_dom->mbps_val);
- kfree(hw_dom);
+ list_add_tail_rcu(&d->hdr.list, add_pos);
+
+ err = resctrl_online_ctrl_domain(r, d);
+ if (err) {
+ list_del_rcu(&d->hdr.list);
+ synchronize_rcu();
+ ctrl_domain_free(hw_dom);
+ }
+}
+
+static void domain_add_cpu_mon(int cpu, struct rdt_resource *r)
+{
+ int id = get_domain_id_from_scope(cpu, r->mon_scope);
+ struct list_head *add_pos = NULL;
+ struct rdt_hw_mon_domain *hw_dom;
+ struct rdt_domain_hdr *hdr;
+ struct rdt_mon_domain *d;
+ struct cacheinfo *ci;
+ int err;
+
+ lockdep_assert_held(&domain_list_lock);
+
+ if (id < 0) {
+ pr_warn_once("Can't find monitor domain id for CPU:%d scope:%d for resource %s\n",
+ cpu, r->mon_scope, r->name);
return;
}
- list_add_tail(&d->list, add_pos);
+ hdr = resctrl_find_domain(&r->mon_domains, id, &add_pos);
+ if (hdr) {
+ if (WARN_ON_ONCE(hdr->type != RESCTRL_MON_DOMAIN))
+ return;
+ d = container_of(hdr, struct rdt_mon_domain, hdr);
- /*
- * If resctrl is mounted, add
- * per domain monitor data directories.
- */
- if (static_branch_unlikely(&rdt_mon_enable_key))
- mkdir_mondata_subdir_allrdtgrp(r, d);
+ cpumask_set_cpu(cpu, &d->hdr.cpu_mask);
+ return;
+ }
+
+ hw_dom = kzalloc_node(sizeof(*hw_dom), GFP_KERNEL, cpu_to_node(cpu));
+ if (!hw_dom)
+ return;
+
+ d = &hw_dom->d_resctrl;
+ d->hdr.id = id;
+ d->hdr.type = RESCTRL_MON_DOMAIN;
+ ci = get_cpu_cacheinfo_level(cpu, RESCTRL_L3_CACHE);
+ if (!ci) {
+ pr_warn_once("Can't find L3 cache for CPU:%d resource %s\n", cpu, r->name);
+ mon_domain_free(hw_dom);
+ return;
+ }
+ d->ci_id = ci->id;
+ cpumask_set_cpu(cpu, &d->hdr.cpu_mask);
+
+ arch_mon_domain_online(r, d);
+
+ if (arch_domain_mbm_alloc(r->num_rmid, hw_dom)) {
+ mon_domain_free(hw_dom);
+ return;
+ }
+
+ list_add_tail_rcu(&d->hdr.list, add_pos);
+
+ err = resctrl_online_mon_domain(r, d);
+ if (err) {
+ list_del_rcu(&d->hdr.list);
+ synchronize_rcu();
+ mon_domain_free(hw_dom);
+ }
}
-static void domain_remove_cpu(int cpu, struct rdt_resource *r)
+static void domain_add_cpu(int cpu, struct rdt_resource *r)
+{
+ if (r->alloc_capable)
+ domain_add_cpu_ctrl(cpu, r);
+ if (r->mon_capable)
+ domain_add_cpu_mon(cpu, r);
+}
+
+static void domain_remove_cpu_ctrl(int cpu, struct rdt_resource *r)
{
- int id = get_cpu_cacheinfo_id(cpu, r->cache_level);
- struct rdt_hw_domain *hw_dom;
- struct rdt_domain *d;
+ int id = get_domain_id_from_scope(cpu, r->ctrl_scope);
+ struct rdt_hw_ctrl_domain *hw_dom;
+ struct rdt_domain_hdr *hdr;
+ struct rdt_ctrl_domain *d;
+
+ lockdep_assert_held(&domain_list_lock);
- d = rdt_find_domain(r, id, NULL);
- if (IS_ERR_OR_NULL(d)) {
- pr_warn("Couldn't find cache id for CPU %d\n", cpu);
+ if (id < 0) {
+ pr_warn_once("Can't find control domain id for CPU:%d scope:%d for resource %s\n",
+ cpu, r->ctrl_scope, r->name);
return;
}
- hw_dom = resctrl_to_arch_dom(d);
- cpumask_clear_cpu(cpu, &d->cpu_mask);
- if (cpumask_empty(&d->cpu_mask)) {
- /*
- * If resctrl is mounted, remove all the
- * per domain monitor data directories.
- */
- if (static_branch_unlikely(&rdt_mon_enable_key))
- rmdir_mondata_subdir_allrdtgrp(r, d->id);
- list_del(&d->list);
- if (r->mon_capable && is_mbm_enabled())
- cancel_delayed_work(&d->mbm_over);
- if (is_llc_occupancy_enabled() && has_busy_rmid(r, d)) {
- /*
- * When a package is going down, forcefully
- * decrement rmid->ebusy. There is no way to know
- * that the L3 was flushed and hence may lead to
- * incorrect counts in rare scenarios, but leaving
- * the RMID as busy creates RMID leaks if the
- * package never comes back.
- */
- __check_limbo(d, true);
- cancel_delayed_work(&d->cqm_limbo);
- }
+ hdr = resctrl_find_domain(&r->ctrl_domains, id, NULL);
+ if (!hdr) {
+ pr_warn("Can't find control domain for id=%d for CPU %d for resource %s\n",
+ id, cpu, r->name);
+ return;
+ }
+
+ if (WARN_ON_ONCE(hdr->type != RESCTRL_CTRL_DOMAIN))
+ return;
+
+ d = container_of(hdr, struct rdt_ctrl_domain, hdr);
+ hw_dom = resctrl_to_arch_ctrl_dom(d);
+
+ cpumask_clear_cpu(cpu, &d->hdr.cpu_mask);
+ if (cpumask_empty(&d->hdr.cpu_mask)) {
+ resctrl_offline_ctrl_domain(r, d);
+ list_del_rcu(&d->hdr.list);
+ synchronize_rcu();
/*
- * rdt_domain "d" is going to be freed below, so clear
+ * rdt_ctrl_domain "d" is going to be freed below, so clear
* its pointer from pseudo_lock_region struct.
*/
if (d->plr)
d->plr->d = NULL;
+ ctrl_domain_free(hw_dom);
- kfree(hw_dom->ctrl_val);
- kfree(hw_dom->mbps_val);
- bitmap_free(d->rmid_busy_llc);
- kfree(d->mbm_total);
- kfree(d->mbm_local);
- kfree(hw_dom);
return;
}
+}
- if (r == &rdt_resources_all[RDT_RESOURCE_L3].r_resctrl) {
- if (is_mbm_enabled() && cpu == d->mbm_work_cpu) {
- cancel_delayed_work(&d->mbm_over);
- mbm_setup_overflow_handler(d, 0);
- }
- if (is_llc_occupancy_enabled() && cpu == d->cqm_work_cpu &&
- has_busy_rmid(r, d)) {
- cancel_delayed_work(&d->cqm_limbo);
- cqm_setup_limbo_handler(d, 0);
- }
+static void domain_remove_cpu_mon(int cpu, struct rdt_resource *r)
+{
+ int id = get_domain_id_from_scope(cpu, r->mon_scope);
+ struct rdt_hw_mon_domain *hw_dom;
+ struct rdt_domain_hdr *hdr;
+ struct rdt_mon_domain *d;
+
+ lockdep_assert_held(&domain_list_lock);
+
+ if (id < 0) {
+ pr_warn_once("Can't find monitor domain id for CPU:%d scope:%d for resource %s\n",
+ cpu, r->mon_scope, r->name);
+ return;
+ }
+
+ hdr = resctrl_find_domain(&r->mon_domains, id, NULL);
+ if (!hdr) {
+ pr_warn("Can't find monitor domain for id=%d for CPU %d for resource %s\n",
+ id, cpu, r->name);
+ return;
+ }
+
+ if (WARN_ON_ONCE(hdr->type != RESCTRL_MON_DOMAIN))
+ return;
+
+ d = container_of(hdr, struct rdt_mon_domain, hdr);
+ hw_dom = resctrl_to_arch_mon_dom(d);
+
+ cpumask_clear_cpu(cpu, &d->hdr.cpu_mask);
+ if (cpumask_empty(&d->hdr.cpu_mask)) {
+ resctrl_offline_mon_domain(r, d);
+ list_del_rcu(&d->hdr.list);
+ synchronize_rcu();
+ mon_domain_free(hw_dom);
+
+ return;
}
}
+static void domain_remove_cpu(int cpu, struct rdt_resource *r)
+{
+ if (r->alloc_capable)
+ domain_remove_cpu_ctrl(cpu, r);
+ if (r->mon_capable)
+ domain_remove_cpu_mon(cpu, r);
+}
+
static void clear_closid_rmid(int cpu)
{
struct resctrl_pqr_state *state = this_cpu_ptr(&pqr_state);
- state->default_closid = 0;
- state->default_rmid = 0;
- state->cur_closid = 0;
- state->cur_rmid = 0;
- wrmsr(IA32_PQR_ASSOC, 0, 0);
+ state->default_closid = RESCTRL_RESERVED_CLOSID;
+ state->default_rmid = RESCTRL_RESERVED_RMID;
+ state->cur_closid = RESCTRL_RESERVED_CLOSID;
+ state->cur_rmid = RESCTRL_RESERVED_RMID;
+ wrmsr(MSR_IA32_PQR_ASSOC, RESCTRL_RESERVED_RMID,
+ RESCTRL_RESERVED_CLOSID);
}
-static int resctrl_online_cpu(unsigned int cpu)
+static int resctrl_arch_online_cpu(unsigned int cpu)
{
struct rdt_resource *r;
- mutex_lock(&rdtgroup_mutex);
+ mutex_lock(&domain_list_lock);
for_each_capable_rdt_resource(r)
domain_add_cpu(cpu, r);
- /* The cpu is set in default rdtgroup after online. */
- cpumask_set_cpu(cpu, &rdtgroup_default.cpu_mask);
+ mutex_unlock(&domain_list_lock);
+
clear_closid_rmid(cpu);
- mutex_unlock(&rdtgroup_mutex);
+ resctrl_online_cpu(cpu);
return 0;
}
-static void clear_childcpus(struct rdtgroup *r, unsigned int cpu)
-{
- struct rdtgroup *cr;
-
- list_for_each_entry(cr, &r->mon.crdtgrp_list, mon.crdtgrp_list) {
- if (cpumask_test_and_clear_cpu(cpu, &cr->cpu_mask)) {
- break;
- }
- }
-}
-
-static int resctrl_offline_cpu(unsigned int cpu)
+static int resctrl_arch_offline_cpu(unsigned int cpu)
{
- struct rdtgroup *rdtgrp;
struct rdt_resource *r;
- mutex_lock(&rdtgroup_mutex);
+ resctrl_offline_cpu(cpu);
+
+ mutex_lock(&domain_list_lock);
for_each_capable_rdt_resource(r)
domain_remove_cpu(cpu, r);
- list_for_each_entry(rdtgrp, &rdt_all_groups, rdtgroup_list) {
- if (cpumask_test_and_clear_cpu(cpu, &rdtgrp->cpu_mask)) {
- clear_childcpus(rdtgrp, cpu);
- break;
- }
- }
+ mutex_unlock(&domain_list_lock);
+
clear_closid_rmid(cpu);
- mutex_unlock(&rdtgroup_mutex);
return 0;
}
-/*
- * Choose a width for the resource name and resource data based on the
- * resource that has widest name and cbm.
- */
-static __init void rdt_init_padding(void)
-{
- struct rdt_resource *r;
-
- for_each_alloc_capable_rdt_resource(r) {
- if (r->data_width > max_data_width)
- max_data_width = r->data_width;
- }
-}
-
enum {
RDT_FLAG_CMT,
RDT_FLAG_MBM_TOTAL,
@@ -694,6 +705,8 @@ enum {
RDT_FLAG_L2_CAT,
RDT_FLAG_L2_CDP,
RDT_FLAG_MBA,
+ RDT_FLAG_SMBA,
+ RDT_FLAG_BMEC,
};
#define RDT_OPT(idx, n, f) \
@@ -708,7 +721,7 @@ struct rdt_options {
bool force_off, force_on;
};
-static struct rdt_options rdt_options[] __initdata = {
+static struct rdt_options rdt_options[] __ro_after_init = {
RDT_OPT(RDT_FLAG_CMT, "cmt", X86_FEATURE_CQM_OCCUP_LLC),
RDT_OPT(RDT_FLAG_MBM_TOTAL, "mbmtotal", X86_FEATURE_CQM_MBM_TOTAL),
RDT_OPT(RDT_FLAG_MBM_LOCAL, "mbmlocal", X86_FEATURE_CQM_MBM_LOCAL),
@@ -717,6 +730,8 @@ static struct rdt_options rdt_options[] __initdata = {
RDT_OPT(RDT_FLAG_L2_CAT, "l2cat", X86_FEATURE_CAT_L2),
RDT_OPT(RDT_FLAG_L2_CDP, "l2cdp", X86_FEATURE_CDP_L2),
RDT_OPT(RDT_FLAG_MBA, "mba", X86_FEATURE_MBA),
+ RDT_OPT(RDT_FLAG_SMBA, "smba", X86_FEATURE_SMBA),
+ RDT_OPT(RDT_FLAG_BMEC, "bmec", X86_FEATURE_BMEC),
};
#define NUM_RDT_OPTIONS ARRAY_SIZE(rdt_options)
@@ -746,7 +761,7 @@ static int __init set_rdt_options(char *str)
}
__setup("rdt", set_rdt_options);
-static bool __init rdt_cpu_has(int flag)
+bool rdt_cpu_has(int flag)
{
bool ret = boot_cpu_has(flag);
struct rdt_options *o;
@@ -766,6 +781,21 @@ static bool __init rdt_cpu_has(int flag)
return ret;
}
+bool resctrl_arch_is_evt_configurable(enum resctrl_event_id evt)
+{
+ if (!rdt_cpu_has(X86_FEATURE_BMEC))
+ return false;
+
+ switch (evt) {
+ case QOS_L3_MBM_TOTAL_EVENT_ID:
+ return rdt_cpu_has(X86_FEATURE_CQM_MBM_TOTAL);
+ case QOS_L3_MBM_LOCAL_EVENT_ID:
+ return rdt_cpu_has(X86_FEATURE_CQM_MBM_LOCAL);
+ default:
+ return false;
+ }
+}
+
static __init bool get_mem_config(void)
{
struct rdt_hw_resource *hw_res = &rdt_resources_all[RDT_RESOURCE_MBA];
@@ -781,6 +811,19 @@ static __init bool get_mem_config(void)
return false;
}
+static __init bool get_slow_mem_config(void)
+{
+ struct rdt_hw_resource *hw_res = &rdt_resources_all[RDT_RESOURCE_SMBA];
+
+ if (!rdt_cpu_has(X86_FEATURE_SMBA))
+ return false;
+
+ if (boot_cpu_data.x86_vendor == X86_VENDOR_AMD)
+ return __rdt_get_mem_config_amd(&hw_res->r_resctrl);
+
+ return false;
+}
+
static __init bool get_rdt_alloc_resources(void)
{
struct rdt_resource *r;
@@ -811,6 +854,9 @@ static __init bool get_rdt_alloc_resources(void)
if (get_mem_config())
ret = true;
+ if (get_slow_mem_config())
+ ret = true;
+
return ret;
}
@@ -833,18 +879,18 @@ static __init bool get_rdt_mon_resources(void)
static __init void __check_quirks_intel(void)
{
- switch (boot_cpu_data.x86_model) {
- case INTEL_FAM6_HASWELL_X:
+ switch (boot_cpu_data.x86_vfm) {
+ case INTEL_HASWELL_X:
if (!rdt_options[RDT_FLAG_L3_CAT].force_off)
cache_alloc_hsw_probe();
break;
- case INTEL_FAM6_SKYLAKE_X:
+ case INTEL_SKYLAKE_X:
if (boot_cpu_data.x86_stepping <= 4)
set_rdt_options("!cmt,!mbmtotal,!mbmlocal,!l3cat");
else
set_rdt_options("!l3cat");
fallthrough;
- case INTEL_FAM6_BROADWELL_X:
+ case INTEL_BROADWELL_X:
intel_rdt_mbm_apply_quirk();
break;
}
@@ -874,9 +920,8 @@ static __init void rdt_init_res_defs_intel(void)
if (r->rid == RDT_RESOURCE_L3 ||
r->rid == RDT_RESOURCE_L2) {
- r->cache.arch_has_sparse_bitmaps = false;
- r->cache.arch_has_empty_bitmaps = false;
r->cache.arch_has_per_cpu_cfg = false;
+ r->cache.min_cbm_bits = 1;
} else if (r->rid == RDT_RESOURCE_MBA) {
hw_res->msr_base = MSR_IA32_MBA_THRTL_BASE;
hw_res->msr_update = mba_wrmsr_intel;
@@ -894,12 +939,15 @@ static __init void rdt_init_res_defs_amd(void)
if (r->rid == RDT_RESOURCE_L3 ||
r->rid == RDT_RESOURCE_L2) {
- r->cache.arch_has_sparse_bitmaps = true;
- r->cache.arch_has_empty_bitmaps = true;
+ r->cache.arch_has_sparse_bitmasks = true;
r->cache.arch_has_per_cpu_cfg = true;
+ r->cache.min_cbm_bits = 0;
} else if (r->rid == RDT_RESOURCE_MBA) {
hw_res->msr_base = MSR_IA32_MBA_BW_BASE;
hw_res->msr_update = mba_wrmsr_amd;
+ } else if (r->rid == RDT_RESOURCE_SMBA) {
+ hw_res->msr_base = MSR_IA32_SMBA_BW_BASE;
+ hw_res->msr_update = mba_wrmsr_amd;
}
}
}
@@ -944,10 +992,14 @@ void resctrl_cpu_detect(struct cpuinfo_x86 *c)
}
}
-static int __init resctrl_late_init(void)
+static int __init resctrl_arch_late_init(void)
{
struct rdt_resource *r;
- int state, ret;
+ int state, ret, i;
+
+ /* for_each_rdt_resource() requires all rid to be initialised. */
+ for (i = 0; i < RDT_NUM_RESOURCES; i++)
+ rdt_resources_all[i].r_resctrl.rid = i;
/*
* Initialize functions(or definitions) that are different
@@ -960,15 +1012,14 @@ static int __init resctrl_late_init(void)
if (!get_rdt_resources())
return -ENODEV;
- rdt_init_padding();
-
state = cpuhp_setup_state(CPUHP_AP_ONLINE_DYN,
"x86/resctrl/cat:online:",
- resctrl_online_cpu, resctrl_offline_cpu);
+ resctrl_arch_online_cpu,
+ resctrl_arch_offline_cpu);
if (state < 0)
return state;
- ret = rdtgroup_init();
+ ret = resctrl_init();
if (ret) {
cpuhp_remove_state(state);
return ret;
@@ -984,12 +1035,13 @@ static int __init resctrl_late_init(void)
return 0;
}
-late_initcall(resctrl_late_init);
+late_initcall(resctrl_arch_late_init);
-static void __exit resctrl_exit(void)
+static void __exit resctrl_arch_exit(void)
{
cpuhp_remove_state(rdt_online);
- rdtgroup_exit();
+
+ resctrl_exit();
}
-__exitcall(resctrl_exit);
+__exitcall(resctrl_arch_exit);
diff --git a/arch/x86/kernel/cpu/resctrl/ctrlmondata.c b/arch/x86/kernel/cpu/resctrl/ctrlmondata.c
index 87666275eed9..1189c0df4ad7 100644
--- a/arch/x86/kernel/cpu/resctrl/ctrlmondata.c
+++ b/arch/x86/kernel/cpu/resctrl/ctrlmondata.c
@@ -16,546 +16,78 @@
#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
#include <linux/cpu.h>
-#include <linux/kernfs.h>
-#include <linux/seq_file.h>
-#include <linux/slab.h>
-#include "internal.h"
-
-/*
- * Check whether MBA bandwidth percentage value is correct. The value is
- * checked against the minimum and max bandwidth values specified by the
- * hardware. The allocated bandwidth percentage is rounded to the next
- * control step available on the hardware.
- */
-static bool bw_validate(char *buf, unsigned long *data, struct rdt_resource *r)
-{
- unsigned long bw;
- int ret;
-
- /*
- * Only linear delay values is supported for current Intel SKUs.
- */
- if (!r->membw.delay_linear && r->membw.arch_needs_linear) {
- rdt_last_cmd_puts("No support for non-linear MB domains\n");
- return false;
- }
-
- ret = kstrtoul(buf, 10, &bw);
- if (ret) {
- rdt_last_cmd_printf("Non-decimal digit in MB value %s\n", buf);
- return false;
- }
-
- if ((bw < r->membw.min_bw || bw > r->default_ctrl) &&
- !is_mba_sc(r)) {
- rdt_last_cmd_printf("MB value %ld out of range [%d,%d]\n", bw,
- r->membw.min_bw, r->default_ctrl);
- return false;
- }
-
- *data = roundup(bw, (unsigned long)r->membw.bw_gran);
- return true;
-}
-
-int parse_bw(struct rdt_parse_data *data, struct resctrl_schema *s,
- struct rdt_domain *d)
-{
- struct resctrl_staged_config *cfg;
- struct rdt_resource *r = s->res;
- unsigned long bw_val;
-
- cfg = &d->staged_config[s->conf_type];
- if (cfg->have_new_ctrl) {
- rdt_last_cmd_printf("Duplicate domain %d\n", d->id);
- return -EINVAL;
- }
-
- if (!bw_validate(data->buf, &bw_val, r))
- return -EINVAL;
- cfg->new_ctrl = bw_val;
- cfg->have_new_ctrl = true;
-
- return 0;
-}
-
-/*
- * Check whether a cache bit mask is valid.
- * For Intel the SDM says:
- * Please note that all (and only) contiguous '1' combinations
- * are allowed (e.g. FFFFH, 0FF0H, 003CH, etc.).
- * Additionally Haswell requires at least two bits set.
- * AMD allows non-contiguous bitmasks.
- */
-static bool cbm_validate(char *buf, u32 *data, struct rdt_resource *r)
-{
- unsigned long first_bit, zero_bit, val;
- unsigned int cbm_len = r->cache.cbm_len;
- int ret;
-
- ret = kstrtoul(buf, 16, &val);
- if (ret) {
- rdt_last_cmd_printf("Non-hex character in the mask %s\n", buf);
- return false;
- }
-
- if ((!r->cache.arch_has_empty_bitmaps && val == 0) ||
- val > r->default_ctrl) {
- rdt_last_cmd_puts("Mask out of range\n");
- return false;
- }
- first_bit = find_first_bit(&val, cbm_len);
- zero_bit = find_next_zero_bit(&val, cbm_len, first_bit);
-
- /* Are non-contiguous bitmaps allowed? */
- if (!r->cache.arch_has_sparse_bitmaps &&
- (find_next_bit(&val, cbm_len, zero_bit) < cbm_len)) {
- rdt_last_cmd_printf("The mask %lx has non-consecutive 1-bits\n", val);
- return false;
- }
-
- if ((zero_bit - first_bit) < r->cache.min_cbm_bits) {
- rdt_last_cmd_printf("Need at least %d bits in the mask\n",
- r->cache.min_cbm_bits);
- return false;
- }
-
- *data = val;
- return true;
-}
+#include "internal.h"
-/*
- * Read one cache bit mask (hex). Check that it is valid for the current
- * resource type.
- */
-int parse_cbm(struct rdt_parse_data *data, struct resctrl_schema *s,
- struct rdt_domain *d)
+int resctrl_arch_update_one(struct rdt_resource *r, struct rdt_ctrl_domain *d,
+ u32 closid, enum resctrl_conf_type t, u32 cfg_val)
{
- struct rdtgroup *rdtgrp = data->rdtgrp;
- struct resctrl_staged_config *cfg;
- struct rdt_resource *r = s->res;
- u32 cbm_val;
-
- cfg = &d->staged_config[s->conf_type];
- if (cfg->have_new_ctrl) {
- rdt_last_cmd_printf("Duplicate domain %d\n", d->id);
- return -EINVAL;
- }
+ struct rdt_hw_ctrl_domain *hw_dom = resctrl_to_arch_ctrl_dom(d);
+ struct rdt_hw_resource *hw_res = resctrl_to_arch_res(r);
+ u32 idx = resctrl_get_config_index(closid, t);
+ struct msr_param msr_param;
- /*
- * Cannot set up more than one pseudo-locked region in a cache
- * hierarchy.
- */
- if (rdtgrp->mode == RDT_MODE_PSEUDO_LOCKSETUP &&
- rdtgroup_pseudo_locked_in_hierarchy(d)) {
- rdt_last_cmd_puts("Pseudo-locked region in hierarchy\n");
+ if (!cpumask_test_cpu(smp_processor_id(), &d->hdr.cpu_mask))
return -EINVAL;
- }
- if (!cbm_validate(data->buf, &cbm_val, r))
- return -EINVAL;
+ hw_dom->ctrl_val[idx] = cfg_val;
- if ((rdtgrp->mode == RDT_MODE_EXCLUSIVE ||
- rdtgrp->mode == RDT_MODE_SHAREABLE) &&
- rdtgroup_cbm_overlaps_pseudo_locked(d, cbm_val)) {
- rdt_last_cmd_puts("CBM overlaps with pseudo-locked region\n");
- return -EINVAL;
- }
-
- /*
- * The CBM may not overlap with the CBM of another closid if
- * either is exclusive.
- */
- if (rdtgroup_cbm_overlaps(s, d, cbm_val, rdtgrp->closid, true)) {
- rdt_last_cmd_puts("Overlaps with exclusive group\n");
- return -EINVAL;
- }
-
- if (rdtgroup_cbm_overlaps(s, d, cbm_val, rdtgrp->closid, false)) {
- if (rdtgrp->mode == RDT_MODE_EXCLUSIVE ||
- rdtgrp->mode == RDT_MODE_PSEUDO_LOCKSETUP) {
- rdt_last_cmd_puts("Overlaps with other group\n");
- return -EINVAL;
- }
- }
-
- cfg->new_ctrl = cbm_val;
- cfg->have_new_ctrl = true;
+ msr_param.res = r;
+ msr_param.dom = d;
+ msr_param.low = idx;
+ msr_param.high = idx + 1;
+ hw_res->msr_update(&msr_param);
return 0;
}
-/*
- * For each domain in this resource we expect to find a series of:
- * id=mask
- * separated by ";". The "id" is in decimal, and must match one of
- * the "id"s for this resource.
- */
-static int parse_line(char *line, struct resctrl_schema *s,
- struct rdtgroup *rdtgrp)
-{
- enum resctrl_conf_type t = s->conf_type;
- struct resctrl_staged_config *cfg;
- struct rdt_resource *r = s->res;
- struct rdt_parse_data data;
- char *dom = NULL, *id;
- struct rdt_domain *d;
- unsigned long dom_id;
-
- if (rdtgrp->mode == RDT_MODE_PSEUDO_LOCKSETUP &&
- r->rid == RDT_RESOURCE_MBA) {
- rdt_last_cmd_puts("Cannot pseudo-lock MBA resource\n");
- return -EINVAL;
- }
-
-next:
- if (!line || line[0] == '\0')
- return 0;
- dom = strsep(&line, ";");
- id = strsep(&dom, "=");
- if (!dom || kstrtoul(id, 10, &dom_id)) {
- rdt_last_cmd_puts("Missing '=' or non-numeric domain\n");
- return -EINVAL;
- }
- dom = strim(dom);
- list_for_each_entry(d, &r->domains, list) {
- if (d->id == dom_id) {
- data.buf = dom;
- data.rdtgrp = rdtgrp;
- if (r->parse_ctrlval(&data, s, d))
- return -EINVAL;
- if (rdtgrp->mode == RDT_MODE_PSEUDO_LOCKSETUP) {
- cfg = &d->staged_config[t];
- /*
- * In pseudo-locking setup mode and just
- * parsed a valid CBM that should be
- * pseudo-locked. Only one locked region per
- * resource group and domain so just do
- * the required initialization for single
- * region and return.
- */
- rdtgrp->plr->s = s;
- rdtgrp->plr->d = d;
- rdtgrp->plr->cbm = cfg->new_ctrl;
- d->plr = rdtgrp->plr;
- return 0;
- }
- goto next;
- }
- }
- return -EINVAL;
-}
-
-static u32 get_config_index(u32 closid, enum resctrl_conf_type type)
-{
- switch (type) {
- default:
- case CDP_NONE:
- return closid;
- case CDP_CODE:
- return closid * 2 + 1;
- case CDP_DATA:
- return closid * 2;
- }
-}
-
-static bool apply_config(struct rdt_hw_domain *hw_dom,
- struct resctrl_staged_config *cfg, u32 idx,
- cpumask_var_t cpu_mask, bool mba_sc)
-{
- struct rdt_domain *dom = &hw_dom->d_resctrl;
- u32 *dc = !mba_sc ? hw_dom->ctrl_val : hw_dom->mbps_val;
-
- if (cfg->new_ctrl != dc[idx]) {
- cpumask_set_cpu(cpumask_any(&dom->cpu_mask), cpu_mask);
- dc[idx] = cfg->new_ctrl;
-
- return true;
- }
-
- return false;
-}
-
int resctrl_arch_update_domains(struct rdt_resource *r, u32 closid)
{
struct resctrl_staged_config *cfg;
- struct rdt_hw_domain *hw_dom;
+ struct rdt_hw_ctrl_domain *hw_dom;
struct msr_param msr_param;
+ struct rdt_ctrl_domain *d;
enum resctrl_conf_type t;
- cpumask_var_t cpu_mask;
- struct rdt_domain *d;
- bool mba_sc;
- int cpu;
u32 idx;
- if (!zalloc_cpumask_var(&cpu_mask, GFP_KERNEL))
- return -ENOMEM;
+ /* Walking r->domains, ensure it can't race with cpuhp */
+ lockdep_assert_cpus_held();
- mba_sc = is_mba_sc(r);
- msr_param.res = NULL;
- list_for_each_entry(d, &r->domains, list) {
- hw_dom = resctrl_to_arch_dom(d);
+ list_for_each_entry(d, &r->ctrl_domains, hdr.list) {
+ hw_dom = resctrl_to_arch_ctrl_dom(d);
+ msr_param.res = NULL;
for (t = 0; t < CDP_NUM_TYPES; t++) {
cfg = &hw_dom->d_resctrl.staged_config[t];
if (!cfg->have_new_ctrl)
continue;
- idx = get_config_index(closid, t);
- if (!apply_config(hw_dom, cfg, idx, cpu_mask, mba_sc))
+ idx = resctrl_get_config_index(closid, t);
+ if (cfg->new_ctrl == hw_dom->ctrl_val[idx])
continue;
+ hw_dom->ctrl_val[idx] = cfg->new_ctrl;
if (!msr_param.res) {
msr_param.low = idx;
msr_param.high = msr_param.low + 1;
msr_param.res = r;
+ msr_param.dom = d;
} else {
msr_param.low = min(msr_param.low, idx);
msr_param.high = max(msr_param.high, idx + 1);
}
}
+ if (msr_param.res)
+ smp_call_function_any(&d->hdr.cpu_mask, rdt_ctrl_update, &msr_param, 1);
}
- /*
- * Avoid writing the control msr with control values when
- * MBA software controller is enabled
- */
- if (cpumask_empty(cpu_mask) || mba_sc)
- goto done;
- cpu = get_cpu();
- /* Update resource control msr on this CPU if it's in cpu_mask. */
- if (cpumask_test_cpu(cpu, cpu_mask))
- rdt_ctrl_update(&msr_param);
- /* Update resource control msr on other CPUs. */
- smp_call_function_many(cpu_mask, rdt_ctrl_update, &msr_param, 1);
- put_cpu();
-
-done:
- free_cpumask_var(cpu_mask);
-
return 0;
}
-static int rdtgroup_parse_resource(char *resname, char *tok,
- struct rdtgroup *rdtgrp)
-{
- struct resctrl_schema *s;
-
- list_for_each_entry(s, &resctrl_schema_all, list) {
- if (!strcmp(resname, s->name) && rdtgrp->closid < s->num_closid)
- return parse_line(tok, s, rdtgrp);
- }
- rdt_last_cmd_printf("Unknown or unsupported resource name '%s'\n", resname);
- return -EINVAL;
-}
-
-ssize_t rdtgroup_schemata_write(struct kernfs_open_file *of,
- char *buf, size_t nbytes, loff_t off)
-{
- struct resctrl_schema *s;
- struct rdtgroup *rdtgrp;
- struct rdt_domain *dom;
- struct rdt_resource *r;
- char *tok, *resname;
- int ret = 0;
-
- /* Valid input requires a trailing newline */
- if (nbytes == 0 || buf[nbytes - 1] != '\n')
- return -EINVAL;
- buf[nbytes - 1] = '\0';
-
- cpus_read_lock();
- rdtgrp = rdtgroup_kn_lock_live(of->kn);
- if (!rdtgrp) {
- rdtgroup_kn_unlock(of->kn);
- cpus_read_unlock();
- return -ENOENT;
- }
- rdt_last_cmd_clear();
-
- /*
- * No changes to pseudo-locked region allowed. It has to be removed
- * and re-created instead.
- */
- if (rdtgrp->mode == RDT_MODE_PSEUDO_LOCKED) {
- ret = -EINVAL;
- rdt_last_cmd_puts("Resource group is pseudo-locked\n");
- goto out;
- }
-
- list_for_each_entry(s, &resctrl_schema_all, list) {
- list_for_each_entry(dom, &s->res->domains, list)
- memset(dom->staged_config, 0, sizeof(dom->staged_config));
- }
-
- while ((tok = strsep(&buf, "\n")) != NULL) {
- resname = strim(strsep(&tok, ":"));
- if (!tok) {
- rdt_last_cmd_puts("Missing ':'\n");
- ret = -EINVAL;
- goto out;
- }
- if (tok[0] == '\0') {
- rdt_last_cmd_printf("Missing '%s' value\n", resname);
- ret = -EINVAL;
- goto out;
- }
- ret = rdtgroup_parse_resource(resname, tok, rdtgrp);
- if (ret)
- goto out;
- }
-
- list_for_each_entry(s, &resctrl_schema_all, list) {
- r = s->res;
- ret = resctrl_arch_update_domains(r, rdtgrp->closid);
- if (ret)
- goto out;
- }
-
- if (rdtgrp->mode == RDT_MODE_PSEUDO_LOCKSETUP) {
- /*
- * If pseudo-locking fails we keep the resource group in
- * mode RDT_MODE_PSEUDO_LOCKSETUP with its class of service
- * active and updated for just the domain the pseudo-locked
- * region was requested for.
- */
- ret = rdtgroup_pseudo_lock_create(rdtgrp);
- }
-
-out:
- rdtgroup_kn_unlock(of->kn);
- cpus_read_unlock();
- return ret ?: nbytes;
-}
-
-u32 resctrl_arch_get_config(struct rdt_resource *r, struct rdt_domain *d,
+u32 resctrl_arch_get_config(struct rdt_resource *r, struct rdt_ctrl_domain *d,
u32 closid, enum resctrl_conf_type type)
{
- struct rdt_hw_domain *hw_dom = resctrl_to_arch_dom(d);
- u32 idx = get_config_index(closid, type);
-
- if (!is_mba_sc(r))
- return hw_dom->ctrl_val[idx];
- return hw_dom->mbps_val[idx];
-}
-
-static void show_doms(struct seq_file *s, struct resctrl_schema *schema, int closid)
-{
- struct rdt_resource *r = schema->res;
- struct rdt_domain *dom;
- bool sep = false;
- u32 ctrl_val;
-
- seq_printf(s, "%*s:", max_name_width, schema->name);
- list_for_each_entry(dom, &r->domains, list) {
- if (sep)
- seq_puts(s, ";");
-
- ctrl_val = resctrl_arch_get_config(r, dom, closid,
- schema->conf_type);
- seq_printf(s, r->format_str, dom->id, max_data_width,
- ctrl_val);
- sep = true;
- }
- seq_puts(s, "\n");
-}
-
-int rdtgroup_schemata_show(struct kernfs_open_file *of,
- struct seq_file *s, void *v)
-{
- struct resctrl_schema *schema;
- struct rdtgroup *rdtgrp;
- int ret = 0;
- u32 closid;
-
- rdtgrp = rdtgroup_kn_lock_live(of->kn);
- if (rdtgrp) {
- if (rdtgrp->mode == RDT_MODE_PSEUDO_LOCKSETUP) {
- list_for_each_entry(schema, &resctrl_schema_all, list) {
- seq_printf(s, "%s:uninitialized\n", schema->name);
- }
- } else if (rdtgrp->mode == RDT_MODE_PSEUDO_LOCKED) {
- if (!rdtgrp->plr->d) {
- rdt_last_cmd_clear();
- rdt_last_cmd_puts("Cache domain offline\n");
- ret = -ENODEV;
- } else {
- seq_printf(s, "%s:%d=%x\n",
- rdtgrp->plr->s->res->name,
- rdtgrp->plr->d->id,
- rdtgrp->plr->cbm);
- }
- } else {
- closid = rdtgrp->closid;
- list_for_each_entry(schema, &resctrl_schema_all, list) {
- if (closid < schema->num_closid)
- show_doms(s, schema, closid);
- }
- }
- } else {
- ret = -ENOENT;
- }
- rdtgroup_kn_unlock(of->kn);
- return ret;
-}
-
-void mon_event_read(struct rmid_read *rr, struct rdt_resource *r,
- struct rdt_domain *d, struct rdtgroup *rdtgrp,
- int evtid, int first)
-{
- /*
- * setup the parameters to send to the IPI to read the data.
- */
- rr->rgrp = rdtgrp;
- rr->evtid = evtid;
- rr->r = r;
- rr->d = d;
- rr->val = 0;
- rr->first = first;
-
- smp_call_function_any(&d->cpu_mask, mon_event_count, rr, 1);
-}
-
-int rdtgroup_mondata_show(struct seq_file *m, void *arg)
-{
- struct kernfs_open_file *of = m->private;
- struct rdt_hw_resource *hw_res;
- u32 resid, evtid, domid;
- struct rdtgroup *rdtgrp;
- struct rdt_resource *r;
- union mon_data_bits md;
- struct rdt_domain *d;
- struct rmid_read rr;
- int ret = 0;
-
- rdtgrp = rdtgroup_kn_lock_live(of->kn);
- if (!rdtgrp) {
- ret = -ENOENT;
- goto out;
- }
-
- md.priv = of->kn->priv;
- resid = md.u.rid;
- domid = md.u.domid;
- evtid = md.u.evtid;
-
- hw_res = &rdt_resources_all[resid];
- r = &hw_res->r_resctrl;
- d = rdt_find_domain(r, domid, NULL);
- if (IS_ERR_OR_NULL(d)) {
- ret = -ENOENT;
- goto out;
- }
-
- mon_event_read(&rr, r, d, rdtgrp, evtid, false);
-
- if (rr.val & RMID_VAL_ERROR)
- seq_puts(m, "Error\n");
- else if (rr.val & RMID_VAL_UNAVAIL)
- seq_puts(m, "Unavailable\n");
- else
- seq_printf(m, "%llu\n", rr.val * hw_res->mon_scale);
+ struct rdt_hw_ctrl_domain *hw_dom = resctrl_to_arch_ctrl_dom(d);
+ u32 idx = resctrl_get_config_index(closid, type);
-out:
- rdtgroup_kn_unlock(of->kn);
- return ret;
+ return hw_dom->ctrl_val[idx];
}
diff --git a/arch/x86/kernel/cpu/resctrl/internal.h b/arch/x86/kernel/cpu/resctrl/internal.h
index 1d647188a43b..5e3c41b36437 100644
--- a/arch/x86/kernel/cpu/resctrl/internal.h
+++ b/arch/x86/kernel/cpu/resctrl/internal.h
@@ -3,45 +3,21 @@
#define _ASM_X86_RESCTRL_INTERNAL_H
#include <linux/resctrl.h>
-#include <linux/sched.h>
-#include <linux/kernfs.h>
-#include <linux/fs_context.h>
-#include <linux/jump_label.h>
-
-#define MSR_IA32_L3_QOS_CFG 0xc81
-#define MSR_IA32_L2_QOS_CFG 0xc82
-#define MSR_IA32_L3_CBM_BASE 0xc90
-#define MSR_IA32_L2_CBM_BASE 0xd10
-#define MSR_IA32_MBA_THRTL_BASE 0xd50
-#define MSR_IA32_MBA_BW_BASE 0xc0000200
-
-#define MSR_IA32_QM_CTR 0x0c8e
-#define MSR_IA32_QM_EVTSEL 0x0c8d
#define L3_QOS_CDP_ENABLE 0x01ULL
#define L2_QOS_CDP_ENABLE 0x01ULL
-/*
- * Event IDs are used to program IA32_QM_EVTSEL before reading event
- * counter from IA32_QM_CTR
- */
-#define QOS_L3_OCCUP_EVENT_ID 0x01
-#define QOS_L3_MBM_TOTAL_EVENT_ID 0x02
-#define QOS_L3_MBM_LOCAL_EVENT_ID 0x03
-
-#define CQM_LIMBOCHECK_INTERVAL 1000
-
#define MBM_CNTR_WIDTH_BASE 24
-#define MBM_OVERFLOW_INTERVAL 1000
-#define MAX_MBA_BW 100u
+
#define MBA_IS_LINEAR 0x4
-#define MBA_MAX_MBPS U32_MAX
-#define MAX_MBA_BW_AMD 0x800
+
#define MBM_CNTR_WIDTH_OFFSET_AMD 20
#define RMID_VAL_ERROR BIT_ULL(63)
+
#define RMID_VAL_UNAVAIL BIT_ULL(62)
+
/*
* With the above fields in use 62 bits remain in MSR_IA32_QM_CTR for
* data to be returned. The counter width is discovered from the hardware
@@ -49,324 +25,70 @@
*/
#define MBM_CNTR_WIDTH_OFFSET_MAX (62 - MBM_CNTR_WIDTH_BASE)
-
-struct rdt_fs_context {
- struct kernfs_fs_context kfc;
- bool enable_cdpl2;
- bool enable_cdpl3;
- bool enable_mba_mbps;
-};
-
-static inline struct rdt_fs_context *rdt_fc2context(struct fs_context *fc)
-{
- struct kernfs_fs_context *kfc = fc->fs_private;
-
- return container_of(kfc, struct rdt_fs_context, kfc);
-}
-
-DECLARE_STATIC_KEY_FALSE(rdt_enable_key);
-DECLARE_STATIC_KEY_FALSE(rdt_mon_enable_key);
-
-/**
- * struct mon_evt - Entry in the event list of a resource
- * @evtid: event id
- * @name: name of the event
- * @list: entry in &rdt_resource->evt_list
- */
-struct mon_evt {
- u32 evtid;
- char *name;
- struct list_head list;
-};
-
-/**
- * union mon_data_bits - Monitoring details for each event file
- * @priv: Used to store monitoring event data in @u
- * as kernfs private data
- * @rid: Resource id associated with the event file
- * @evtid: Event id associated with the event file
- * @domid: The domain to which the event file belongs
- * @u: Name of the bit fields struct
- */
-union mon_data_bits {
- void *priv;
- struct {
- unsigned int rid : 10;
- unsigned int evtid : 8;
- unsigned int domid : 14;
- } u;
-};
-
-struct rmid_read {
- struct rdtgroup *rgrp;
- struct rdt_resource *r;
- struct rdt_domain *d;
- int evtid;
- bool first;
- u64 val;
-};
-
-extern unsigned int resctrl_cqm_threshold;
-extern bool rdt_alloc_capable;
-extern bool rdt_mon_capable;
-extern unsigned int rdt_mon_features;
-extern struct list_head resctrl_schema_all;
-
-enum rdt_group_type {
- RDTCTRL_GROUP = 0,
- RDTMON_GROUP,
- RDT_NUM_GROUP,
-};
-
-/**
- * enum rdtgrp_mode - Mode of a RDT resource group
- * @RDT_MODE_SHAREABLE: This resource group allows sharing of its allocations
- * @RDT_MODE_EXCLUSIVE: No sharing of this resource group's allocations allowed
- * @RDT_MODE_PSEUDO_LOCKSETUP: Resource group will be used for Pseudo-Locking
- * @RDT_MODE_PSEUDO_LOCKED: No sharing of this resource group's allocations
- * allowed AND the allocations are Cache Pseudo-Locked
- * @RDT_NUM_MODES: Total number of modes
- *
- * The mode of a resource group enables control over the allowed overlap
- * between allocations associated with different resource groups (classes
- * of service). User is able to modify the mode of a resource group by
- * writing to the "mode" resctrl file associated with the resource group.
- *
- * The "shareable", "exclusive", and "pseudo-locksetup" modes are set by
- * writing the appropriate text to the "mode" file. A resource group enters
- * "pseudo-locked" mode after the schemata is written while the resource
- * group is in "pseudo-locksetup" mode.
- */
-enum rdtgrp_mode {
- RDT_MODE_SHAREABLE = 0,
- RDT_MODE_EXCLUSIVE,
- RDT_MODE_PSEUDO_LOCKSETUP,
- RDT_MODE_PSEUDO_LOCKED,
-
- /* Must be last */
- RDT_NUM_MODES,
-};
-
-/**
- * struct mongroup - store mon group's data in resctrl fs.
- * @mon_data_kn: kernfs node for the mon_data directory
- * @parent: parent rdtgrp
- * @crdtgrp_list: child rdtgroup node list
- * @rmid: rmid for this rdtgroup
- */
-struct mongroup {
- struct kernfs_node *mon_data_kn;
- struct rdtgroup *parent;
- struct list_head crdtgrp_list;
- u32 rmid;
-};
-
-/**
- * struct pseudo_lock_region - pseudo-lock region information
- * @s: Resctrl schema for the resource to which this
- * pseudo-locked region belongs
- * @d: RDT domain to which this pseudo-locked region
- * belongs
- * @cbm: bitmask of the pseudo-locked region
- * @lock_thread_wq: waitqueue used to wait on the pseudo-locking thread
- * completion
- * @thread_done: variable used by waitqueue to test if pseudo-locking
- * thread completed
- * @cpu: core associated with the cache on which the setup code
- * will be run
- * @line_size: size of the cache lines
- * @size: size of pseudo-locked region in bytes
- * @kmem: the kernel memory associated with pseudo-locked region
- * @minor: minor number of character device associated with this
- * region
- * @debugfs_dir: pointer to this region's directory in the debugfs
- * filesystem
- * @pm_reqs: Power management QoS requests related to this region
- */
-struct pseudo_lock_region {
- struct resctrl_schema *s;
- struct rdt_domain *d;
- u32 cbm;
- wait_queue_head_t lock_thread_wq;
- int thread_done;
- int cpu;
- unsigned int line_size;
- unsigned int size;
- void *kmem;
- unsigned int minor;
- struct dentry *debugfs_dir;
- struct list_head pm_reqs;
-};
-
-/**
- * struct rdtgroup - store rdtgroup's data in resctrl file system.
- * @kn: kernfs node
- * @rdtgroup_list: linked list for all rdtgroups
- * @closid: closid for this rdtgroup
- * @cpu_mask: CPUs assigned to this rdtgroup
- * @flags: status bits
- * @waitcount: how many cpus expect to find this
- * group when they acquire rdtgroup_mutex
- * @type: indicates type of this rdtgroup - either
- * monitor only or ctrl_mon group
- * @mon: mongroup related data
- * @mode: mode of resource group
- * @plr: pseudo-locked region
- */
-struct rdtgroup {
- struct kernfs_node *kn;
- struct list_head rdtgroup_list;
- u32 closid;
- struct cpumask cpu_mask;
- int flags;
- atomic_t waitcount;
- enum rdt_group_type type;
- struct mongroup mon;
- enum rdtgrp_mode mode;
- struct pseudo_lock_region *plr;
-};
-
-/* rdtgroup.flags */
-#define RDT_DELETED 1
-
-/* rftype.flags */
-#define RFTYPE_FLAGS_CPUS_LIST 1
-
-/*
- * Define the file type flags for base and info directories.
- */
-#define RFTYPE_INFO BIT(0)
-#define RFTYPE_BASE BIT(1)
-#define RF_CTRLSHIFT 4
-#define RF_MONSHIFT 5
-#define RF_TOPSHIFT 6
-#define RFTYPE_CTRL BIT(RF_CTRLSHIFT)
-#define RFTYPE_MON BIT(RF_MONSHIFT)
-#define RFTYPE_TOP BIT(RF_TOPSHIFT)
-#define RFTYPE_RES_CACHE BIT(8)
-#define RFTYPE_RES_MB BIT(9)
-#define RF_CTRL_INFO (RFTYPE_INFO | RFTYPE_CTRL)
-#define RF_MON_INFO (RFTYPE_INFO | RFTYPE_MON)
-#define RF_TOP_INFO (RFTYPE_INFO | RFTYPE_TOP)
-#define RF_CTRL_BASE (RFTYPE_BASE | RFTYPE_CTRL)
-
-/* List of all resource groups */
-extern struct list_head rdt_all_groups;
-
-extern int max_name_width, max_data_width;
-
-int __init rdtgroup_init(void);
-void __exit rdtgroup_exit(void);
-
-/**
- * struct rftype - describe each file in the resctrl file system
- * @name: File name
- * @mode: Access mode
- * @kf_ops: File operations
- * @flags: File specific RFTYPE_FLAGS_* flags
- * @fflags: File specific RF_* or RFTYPE_* flags
- * @seq_show: Show content of the file
- * @write: Write to the file
- */
-struct rftype {
- char *name;
- umode_t mode;
- const struct kernfs_ops *kf_ops;
- unsigned long flags;
- unsigned long fflags;
-
- int (*seq_show)(struct kernfs_open_file *of,
- struct seq_file *sf, void *v);
- /*
- * write() is the generic write callback which maps directly to
- * kernfs write operation and overrides all other operations.
- * Maximum write size is determined by ->max_write_len.
- */
- ssize_t (*write)(struct kernfs_open_file *of,
- char *buf, size_t nbytes, loff_t off);
-};
-
/**
- * struct mbm_state - status for each MBM counter in each domain
+ * struct arch_mbm_state - values used to compute resctrl_arch_rmid_read()s
+ * return value.
* @chunks: Total data moved (multiply by rdt_group.mon_scale to get bytes)
- * @prev_msr: Value of IA32_QM_CTR for this RMID last time we read it
- * @prev_bw_msr:Value of previous IA32_QM_CTR for bandwidth counting
- * @prev_bw: The most recent bandwidth in MBps
- * @delta_bw: Difference between the current and previous bandwidth
- * @delta_comp: Indicates whether to compute the delta_bw
+ * @prev_msr: Value of IA32_QM_CTR last time it was read for the RMID used to
+ * find this struct.
*/
-struct mbm_state {
+struct arch_mbm_state {
u64 chunks;
u64 prev_msr;
- u64 prev_bw_msr;
- u32 prev_bw;
- u32 delta_bw;
- bool delta_comp;
};
/**
- * struct rdt_hw_domain - Arch private attributes of a set of CPUs that share
- * a resource
+ * struct rdt_hw_ctrl_domain - Arch private attributes of a set of CPUs that share
+ * a resource for a control function
* @d_resctrl: Properties exposed to the resctrl file system
* @ctrl_val: array of cache or mem ctrl values (indexed by CLOSID)
- * @mbps_val: When mba_sc is enabled, this holds the bandwidth in MBps
*
* Members of this structure are accessed via helpers that provide abstraction.
*/
-struct rdt_hw_domain {
- struct rdt_domain d_resctrl;
+struct rdt_hw_ctrl_domain {
+ struct rdt_ctrl_domain d_resctrl;
u32 *ctrl_val;
- u32 *mbps_val;
};
-static inline struct rdt_hw_domain *resctrl_to_arch_dom(struct rdt_domain *r)
+/**
+ * struct rdt_hw_mon_domain - Arch private attributes of a set of CPUs that share
+ * a resource for a monitor function
+ * @d_resctrl: Properties exposed to the resctrl file system
+ * @arch_mbm_total: arch private state for MBM total bandwidth
+ * @arch_mbm_local: arch private state for MBM local bandwidth
+ *
+ * Members of this structure are accessed via helpers that provide abstraction.
+ */
+struct rdt_hw_mon_domain {
+ struct rdt_mon_domain d_resctrl;
+ struct arch_mbm_state *arch_mbm_total;
+ struct arch_mbm_state *arch_mbm_local;
+};
+
+static inline struct rdt_hw_ctrl_domain *resctrl_to_arch_ctrl_dom(struct rdt_ctrl_domain *r)
+{
+ return container_of(r, struct rdt_hw_ctrl_domain, d_resctrl);
+}
+
+static inline struct rdt_hw_mon_domain *resctrl_to_arch_mon_dom(struct rdt_mon_domain *r)
{
- return container_of(r, struct rdt_hw_domain, d_resctrl);
+ return container_of(r, struct rdt_hw_mon_domain, d_resctrl);
}
/**
* struct msr_param - set a range of MSRs from a domain
* @res: The resource to use
+ * @dom: The domain to update
* @low: Beginning index from base MSR
* @high: End index
*/
struct msr_param {
struct rdt_resource *res;
+ struct rdt_ctrl_domain *dom;
u32 low;
u32 high;
};
-static inline bool is_llc_occupancy_enabled(void)
-{
- return (rdt_mon_features & (1 << QOS_L3_OCCUP_EVENT_ID));
-}
-
-static inline bool is_mbm_total_enabled(void)
-{
- return (rdt_mon_features & (1 << QOS_L3_MBM_TOTAL_EVENT_ID));
-}
-
-static inline bool is_mbm_local_enabled(void)
-{
- return (rdt_mon_features & (1 << QOS_L3_MBM_LOCAL_EVENT_ID));
-}
-
-static inline bool is_mbm_enabled(void)
-{
- return (is_mbm_total_enabled() || is_mbm_local_enabled());
-}
-
-static inline bool is_mbm_event(int e)
-{
- return (e >= QOS_L3_MBM_TOTAL_EVENT_ID &&
- e <= QOS_L3_MBM_LOCAL_EVENT_ID);
-}
-
-struct rdt_parse_data {
- struct rdtgroup *rdtgrp;
- char *buf;
-};
-
/**
* struct rdt_hw_resource - arch private attributes of a resctrl resource
* @r_resctrl: Attributes of the resource used directly by resctrl.
@@ -389,8 +111,7 @@ struct rdt_hw_resource {
struct rdt_resource r_resctrl;
u32 num_closid;
unsigned int msr_base;
- void (*msr_update) (struct rdt_domain *d, struct msr_param *m,
- struct rdt_resource *r);
+ void (*msr_update)(struct msr_param *m);
unsigned int mon_scale;
unsigned int mbm_width;
bool cdp_enabled;
@@ -401,71 +122,9 @@ static inline struct rdt_hw_resource *resctrl_to_arch_res(struct rdt_resource *r
return container_of(r, struct rdt_hw_resource, r_resctrl);
}
-int parse_cbm(struct rdt_parse_data *data, struct resctrl_schema *s,
- struct rdt_domain *d);
-int parse_bw(struct rdt_parse_data *data, struct resctrl_schema *s,
- struct rdt_domain *d);
-
-extern struct mutex rdtgroup_mutex;
-
extern struct rdt_hw_resource rdt_resources_all[];
-extern struct rdtgroup rdtgroup_default;
-DECLARE_STATIC_KEY_FALSE(rdt_alloc_enable_key);
-
-extern struct dentry *debugfs_resctrl;
-
-enum resctrl_res_level {
- RDT_RESOURCE_L3,
- RDT_RESOURCE_L2,
- RDT_RESOURCE_MBA,
-
- /* Must be the last */
- RDT_NUM_RESOURCES,
-};
-
-static inline struct rdt_resource *resctrl_inc(struct rdt_resource *res)
-{
- struct rdt_hw_resource *hw_res = resctrl_to_arch_res(res);
-
- hw_res++;
- return &hw_res->r_resctrl;
-}
-
-static inline bool resctrl_arch_get_cdp_enabled(enum resctrl_res_level l)
-{
- return rdt_resources_all[l].cdp_enabled;
-}
-
-int resctrl_arch_set_cdp_enabled(enum resctrl_res_level l, bool enable);
-
-/*
- * To return the common struct rdt_resource, which is contained in struct
- * rdt_hw_resource, walk the resctrl member of struct rdt_hw_resource.
- */
-#define for_each_rdt_resource(r) \
- for (r = &rdt_resources_all[0].r_resctrl; \
- r <= &rdt_resources_all[RDT_NUM_RESOURCES - 1].r_resctrl; \
- r = resctrl_inc(r))
-
-#define for_each_capable_rdt_resource(r) \
- for_each_rdt_resource(r) \
- if (r->alloc_capable || r->mon_capable)
-
-#define for_each_alloc_capable_rdt_resource(r) \
- for_each_rdt_resource(r) \
- if (r->alloc_capable)
-
-#define for_each_mon_capable_rdt_resource(r) \
- for_each_rdt_resource(r) \
- if (r->mon_capable)
-#define for_each_alloc_enabled_rdt_resource(r) \
- for_each_rdt_resource(r) \
- if (r->alloc_enabled)
-
-#define for_each_mon_enabled_rdt_resource(r) \
- for_each_rdt_resource(r) \
- if (r->mon_enabled)
+void arch_mon_domain_online(struct rdt_resource *r, struct rdt_mon_domain *d);
/* CPUID.(EAX=10H, ECX=ResID=1).EAX */
union cpuid_0x10_1_eax {
@@ -483,6 +142,15 @@ union cpuid_0x10_3_eax {
unsigned int full;
};
+/* CPUID.(EAX=10H, ECX=ResID).ECX */
+union cpuid_0x10_x_ecx {
+ struct {
+ unsigned int reserved:3;
+ unsigned int noncont:1;
+ } split;
+ unsigned int full;
+};
+
/* CPUID.(EAX=10H, ECX=ResID).EDX */
union cpuid_0x10_x_edx {
struct {
@@ -491,64 +159,14 @@ union cpuid_0x10_x_edx {
unsigned int full;
};
-void rdt_last_cmd_clear(void);
-void rdt_last_cmd_puts(const char *s);
-__printf(1, 2)
-void rdt_last_cmd_printf(const char *fmt, ...);
-
void rdt_ctrl_update(void *arg);
-struct rdtgroup *rdtgroup_kn_lock_live(struct kernfs_node *kn);
-void rdtgroup_kn_unlock(struct kernfs_node *kn);
-int rdtgroup_kn_mode_restrict(struct rdtgroup *r, const char *name);
-int rdtgroup_kn_mode_restore(struct rdtgroup *r, const char *name,
- umode_t mask);
-struct rdt_domain *rdt_find_domain(struct rdt_resource *r, int id,
- struct list_head **pos);
-ssize_t rdtgroup_schemata_write(struct kernfs_open_file *of,
- char *buf, size_t nbytes, loff_t off);
-int rdtgroup_schemata_show(struct kernfs_open_file *of,
- struct seq_file *s, void *v);
-bool rdtgroup_cbm_overlaps(struct resctrl_schema *s, struct rdt_domain *d,
- unsigned long cbm, int closid, bool exclusive);
-unsigned int rdtgroup_cbm_to_size(struct rdt_resource *r, struct rdt_domain *d,
- unsigned long cbm);
-enum rdtgrp_mode rdtgroup_mode_by_closid(int closid);
-int rdtgroup_tasks_assigned(struct rdtgroup *r);
-int rdtgroup_locksetup_enter(struct rdtgroup *rdtgrp);
-int rdtgroup_locksetup_exit(struct rdtgroup *rdtgrp);
-bool rdtgroup_cbm_overlaps_pseudo_locked(struct rdt_domain *d, unsigned long cbm);
-bool rdtgroup_pseudo_locked_in_hierarchy(struct rdt_domain *d);
-int rdt_pseudo_lock_init(void);
-void rdt_pseudo_lock_release(void);
-int rdtgroup_pseudo_lock_create(struct rdtgroup *rdtgrp);
-void rdtgroup_pseudo_lock_remove(struct rdtgroup *rdtgrp);
-struct rdt_domain *get_domain_from_cpu(int cpu, struct rdt_resource *r);
-int closids_supported(void);
-void closid_free(int closid);
-int alloc_rmid(void);
-void free_rmid(u32 rmid);
+
int rdt_get_mon_l3_config(struct rdt_resource *r);
-void mon_event_count(void *info);
-int rdtgroup_mondata_show(struct seq_file *m, void *arg);
-void rmdir_mondata_subdir_allrdtgrp(struct rdt_resource *r,
- unsigned int dom_id);
-void mkdir_mondata_subdir_allrdtgrp(struct rdt_resource *r,
- struct rdt_domain *d);
-void mon_event_read(struct rmid_read *rr, struct rdt_resource *r,
- struct rdt_domain *d, struct rdtgroup *rdtgrp,
- int evtid, int first);
-void mbm_setup_overflow_handler(struct rdt_domain *dom,
- unsigned long delay_ms);
-void mbm_handle_overflow(struct work_struct *work);
+
+bool rdt_cpu_has(int flag);
+
void __init intel_rdt_mbm_apply_quirk(void);
-bool is_mba_sc(struct rdt_resource *r);
-void setup_default_ctrlval(struct rdt_resource *r, u32 *dc, u32 *dm);
-u32 delay_bw_map(unsigned long bw, struct rdt_resource *r);
-void cqm_setup_limbo_handler(struct rdt_domain *dom, unsigned long delay_ms);
-void cqm_handle_limbo(struct work_struct *work);
-bool has_busy_rmid(struct rdt_resource *r, struct rdt_domain *d);
-void __check_limbo(struct rdt_domain *d, bool force_free);
+
void rdt_domain_reconfigure_cdp(struct rdt_resource *r);
-void __init thread_throttle_mode_init(void);
#endif /* _ASM_X86_RESCTRL_INTERNAL_H */
diff --git a/arch/x86/kernel/cpu/resctrl/monitor.c b/arch/x86/kernel/cpu/resctrl/monitor.c
index eaf25a234ff5..c261558276cd 100644
--- a/arch/x86/kernel/cpu/resctrl/monitor.c
+++ b/arch/x86/kernel/cpu/resctrl/monitor.c
@@ -15,37 +15,15 @@
* Software Developer Manual June 2016, volume 3, section 17.17.
*/
-#include <linux/module.h>
-#include <linux/slab.h>
-#include <asm/cpu_device_id.h>
-#include "internal.h"
+#define pr_fmt(fmt) "resctrl: " fmt
-struct rmid_entry {
- u32 rmid;
- int busy;
- struct list_head list;
-};
+#include <linux/cpu.h>
+#include <linux/resctrl.h>
-/**
- * @rmid_free_lru A least recently used list of free RMIDs
- * These RMIDs are guaranteed to have an occupancy less than the
- * threshold occupancy
- */
-static LIST_HEAD(rmid_free_lru);
-
-/**
- * @rmid_limbo_count count of currently unused but (potentially)
- * dirty RMIDs.
- * This counts RMIDs that no one is currently using but that
- * may have a occupancy value > intel_cqm_threshold. User can change
- * the threshold occupancy value.
- */
-static unsigned int rmid_limbo_count;
+#include <asm/cpu_device_id.h>
+#include <asm/msr.h>
-/**
- * @rmid_entry - The entry in the limbo and free lists.
- */
-static struct rmid_entry *rmid_ptrs;
+#include "internal.h"
/*
* Global boolean for rdt_monitor which is true if any
@@ -58,16 +36,12 @@ bool rdt_mon_capable;
*/
unsigned int rdt_mon_features;
-/*
- * This is the threshold cache occupancy at which we will consider an
- * RMID available for re-allocation.
- */
-unsigned int resctrl_cqm_threshold;
-
#define CF(cf) ((unsigned long)(1048576 * (cf) + 0.5))
+static int snc_nodes_per_l3_cache = 1;
+
/*
- * The correction factor table is documented in Documentation/x86/resctrl.rst.
+ * The correction factor table is documented in Documentation/filesystems/resctrl.rst.
* If rmid > rmid threshold, MBM total and local values should be multiplied
* by the correction factor.
*
@@ -116,6 +90,7 @@ static const struct mbm_correction_factor_table {
};
static u32 mbm_cf_rmidthreshold __read_mostly = UINT_MAX;
+
static u64 mbm_cf __read_mostly;
static inline u64 get_corrected_mbm_count(u32 rmid, unsigned long val)
@@ -127,19 +102,45 @@ static inline u64 get_corrected_mbm_count(u32 rmid, unsigned long val)
return val;
}
-static inline struct rmid_entry *__rmid_entry(u32 rmid)
+/*
+ * When Sub-NUMA Cluster (SNC) mode is not enabled (as indicated by
+ * "snc_nodes_per_l3_cache == 1") no translation of the RMID value is
+ * needed. The physical RMID is the same as the logical RMID.
+ *
+ * On a platform with SNC mode enabled, Linux enables RMID sharing mode
+ * via MSR 0xCA0 (see the "RMID Sharing Mode" section in the "Intel
+ * Resource Director Technology Architecture Specification" for a full
+ * description of RMID sharing mode).
+ *
+ * In RMID sharing mode there are fewer "logical RMID" values available
+ * to accumulate data ("physical RMIDs" are divided evenly between SNC
+ * nodes that share an L3 cache). Linux creates an rdt_mon_domain for
+ * each SNC node.
+ *
+ * The value loaded into IA32_PQR_ASSOC is the "logical RMID".
+ *
+ * Data is collected independently on each SNC node and can be retrieved
+ * using the "physical RMID" value computed by this function and loaded
+ * into IA32_QM_EVTSEL. @cpu can be any CPU in the SNC node.
+ *
+ * The scope of the IA32_QM_EVTSEL and IA32_QM_CTR MSRs is at the L3
+ * cache. So a "physical RMID" may be read from any CPU that shares
+ * the L3 cache with the desired SNC node, not just from a CPU in
+ * the specific SNC node.
+ */
+static int logical_rmid_to_physical_rmid(int cpu, int lrmid)
{
- struct rmid_entry *entry;
+ struct rdt_resource *r = &rdt_resources_all[RDT_RESOURCE_L3].r_resctrl;
- entry = &rmid_ptrs[rmid];
- WARN_ON(entry->rmid != rmid);
+ if (snc_nodes_per_l3_cache == 1)
+ return lrmid;
- return entry;
+ return lrmid + (cpu_to_node(cpu) % snc_nodes_per_l3_cache) * r->num_rmid;
}
-static u64 __rmid_read(u32 rmid, u32 eventid)
+static int __rmid_read_phys(u32 prmid, enum resctrl_event_id eventid, u64 *val)
{
- u64 val;
+ u64 msr_val;
/*
* As per the SDM, when IA32_QM_EVTSEL.EvtID (bits 7:0) is configured
@@ -149,132 +150,70 @@ static u64 __rmid_read(u32 rmid, u32 eventid)
* IA32_QM_CTR.Error (bit 63) and IA32_QM_CTR.Unavailable (bit 62)
* are error bits.
*/
- wrmsr(MSR_IA32_QM_EVTSEL, eventid, rmid);
- rdmsrl(MSR_IA32_QM_CTR, val);
-
- return val;
-}
+ wrmsr(MSR_IA32_QM_EVTSEL, eventid, prmid);
+ rdmsrq(MSR_IA32_QM_CTR, msr_val);
-static bool rmid_dirty(struct rmid_entry *entry)
-{
- u64 val = __rmid_read(entry->rmid, QOS_L3_OCCUP_EVENT_ID);
+ if (msr_val & RMID_VAL_ERROR)
+ return -EIO;
+ if (msr_val & RMID_VAL_UNAVAIL)
+ return -EINVAL;
- return val >= resctrl_cqm_threshold;
+ *val = msr_val;
+ return 0;
}
-/*
- * Check the RMIDs that are marked as busy for this domain. If the
- * reported LLC occupancy is below the threshold clear the busy bit and
- * decrement the count. If the busy count gets to zero on an RMID, we
- * free the RMID
- */
-void __check_limbo(struct rdt_domain *d, bool force_free)
+static struct arch_mbm_state *get_arch_mbm_state(struct rdt_hw_mon_domain *hw_dom,
+ u32 rmid,
+ enum resctrl_event_id eventid)
{
- struct rmid_entry *entry;
- struct rdt_resource *r;
- u32 crmid = 1, nrmid;
-
- r = &rdt_resources_all[RDT_RESOURCE_L3].r_resctrl;
-
- /*
- * Skip RMID 0 and start from RMID 1 and check all the RMIDs that
- * are marked as busy for occupancy < threshold. If the occupancy
- * is less than the threshold decrement the busy counter of the
- * RMID and move it to the free list when the counter reaches 0.
- */
- for (;;) {
- nrmid = find_next_bit(d->rmid_busy_llc, r->num_rmid, crmid);
- if (nrmid >= r->num_rmid)
- break;
-
- entry = __rmid_entry(nrmid);
- if (force_free || !rmid_dirty(entry)) {
- clear_bit(entry->rmid, d->rmid_busy_llc);
- if (!--entry->busy) {
- rmid_limbo_count--;
- list_add_tail(&entry->list, &rmid_free_lru);
- }
- }
- crmid = nrmid + 1;
+ switch (eventid) {
+ case QOS_L3_OCCUP_EVENT_ID:
+ return NULL;
+ case QOS_L3_MBM_TOTAL_EVENT_ID:
+ return &hw_dom->arch_mbm_total[rmid];
+ case QOS_L3_MBM_LOCAL_EVENT_ID:
+ return &hw_dom->arch_mbm_local[rmid];
+ default:
+ /* Never expect to get here */
+ WARN_ON_ONCE(1);
+ return NULL;
}
}
-bool has_busy_rmid(struct rdt_resource *r, struct rdt_domain *d)
+void resctrl_arch_reset_rmid(struct rdt_resource *r, struct rdt_mon_domain *d,
+ u32 unused, u32 rmid,
+ enum resctrl_event_id eventid)
{
- return find_first_bit(d->rmid_busy_llc, r->num_rmid) != r->num_rmid;
+ struct rdt_hw_mon_domain *hw_dom = resctrl_to_arch_mon_dom(d);
+ int cpu = cpumask_any(&d->hdr.cpu_mask);
+ struct arch_mbm_state *am;
+ u32 prmid;
+
+ am = get_arch_mbm_state(hw_dom, rmid, eventid);
+ if (am) {
+ memset(am, 0, sizeof(*am));
+
+ prmid = logical_rmid_to_physical_rmid(cpu, rmid);
+ /* Record any initial, non-zero count value. */
+ __rmid_read_phys(prmid, eventid, &am->prev_msr);
+ }
}
/*
- * As of now the RMIDs allocation is global.
- * However we keep track of which packages the RMIDs
- * are used to optimize the limbo list management.
+ * Assumes that hardware counters are also reset and thus that there is
+ * no need to record initial non-zero counts.
*/
-int alloc_rmid(void)
+void resctrl_arch_reset_rmid_all(struct rdt_resource *r, struct rdt_mon_domain *d)
{
- struct rmid_entry *entry;
+ struct rdt_hw_mon_domain *hw_dom = resctrl_to_arch_mon_dom(d);
- lockdep_assert_held(&rdtgroup_mutex);
+ if (resctrl_arch_is_mbm_total_enabled())
+ memset(hw_dom->arch_mbm_total, 0,
+ sizeof(*hw_dom->arch_mbm_total) * r->num_rmid);
- if (list_empty(&rmid_free_lru))
- return rmid_limbo_count ? -EBUSY : -ENOSPC;
-
- entry = list_first_entry(&rmid_free_lru,
- struct rmid_entry, list);
- list_del(&entry->list);
-
- return entry->rmid;
-}
-
-static void add_rmid_to_limbo(struct rmid_entry *entry)
-{
- struct rdt_resource *r;
- struct rdt_domain *d;
- int cpu;
- u64 val;
-
- r = &rdt_resources_all[RDT_RESOURCE_L3].r_resctrl;
-
- entry->busy = 0;
- cpu = get_cpu();
- list_for_each_entry(d, &r->domains, list) {
- if (cpumask_test_cpu(cpu, &d->cpu_mask)) {
- val = __rmid_read(entry->rmid, QOS_L3_OCCUP_EVENT_ID);
- if (val <= resctrl_cqm_threshold)
- continue;
- }
-
- /*
- * For the first limbo RMID in the domain,
- * setup up the limbo worker.
- */
- if (!has_busy_rmid(r, d))
- cqm_setup_limbo_handler(d, CQM_LIMBOCHECK_INTERVAL);
- set_bit(entry->rmid, d->rmid_busy_llc);
- entry->busy++;
- }
- put_cpu();
-
- if (entry->busy)
- rmid_limbo_count++;
- else
- list_add_tail(&entry->list, &rmid_free_lru);
-}
-
-void free_rmid(u32 rmid)
-{
- struct rmid_entry *entry;
-
- if (!rmid)
- return;
-
- lockdep_assert_held(&rdtgroup_mutex);
-
- entry = __rmid_entry(rmid);
-
- if (is_llc_occupancy_enabled())
- add_rmid_to_limbo(entry);
- else
- list_add_tail(&entry->list, &rmid_free_lru);
+ if (resctrl_arch_is_mbm_local_enabled())
+ memset(hw_dom->arch_mbm_local, 0,
+ sizeof(*hw_dom->arch_mbm_local) * r->num_rmid);
}
static u64 mbm_overflow_count(u64 prev_msr, u64 cur_msr, unsigned int width)
@@ -285,412 +224,134 @@ static u64 mbm_overflow_count(u64 prev_msr, u64 cur_msr, unsigned int width)
return chunks >> shift;
}
-static u64 __mon_event_count(u32 rmid, struct rmid_read *rr)
+int resctrl_arch_rmid_read(struct rdt_resource *r, struct rdt_mon_domain *d,
+ u32 unused, u32 rmid, enum resctrl_event_id eventid,
+ u64 *val, void *ignored)
{
- struct rdt_hw_resource *hw_res = resctrl_to_arch_res(rr->r);
- struct mbm_state *m;
- u64 chunks, tval;
+ struct rdt_hw_mon_domain *hw_dom = resctrl_to_arch_mon_dom(d);
+ struct rdt_hw_resource *hw_res = resctrl_to_arch_res(r);
+ int cpu = cpumask_any(&d->hdr.cpu_mask);
+ struct arch_mbm_state *am;
+ u64 msr_val, chunks;
+ u32 prmid;
+ int ret;
- tval = __rmid_read(rmid, rr->evtid);
- if (tval & (RMID_VAL_ERROR | RMID_VAL_UNAVAIL)) {
- return tval;
- }
- switch (rr->evtid) {
- case QOS_L3_OCCUP_EVENT_ID:
- rr->val += tval;
- return 0;
- case QOS_L3_MBM_TOTAL_EVENT_ID:
- m = &rr->d->mbm_total[rmid];
- break;
- case QOS_L3_MBM_LOCAL_EVENT_ID:
- m = &rr->d->mbm_local[rmid];
- break;
- default:
- /*
- * Code would never reach here because an invalid
- * event id would fail the __rmid_read.
- */
- return RMID_VAL_ERROR;
- }
+ resctrl_arch_rmid_read_context_check();
- if (rr->first) {
- memset(m, 0, sizeof(struct mbm_state));
- m->prev_bw_msr = m->prev_msr = tval;
- return 0;
- }
+ prmid = logical_rmid_to_physical_rmid(cpu, rmid);
+ ret = __rmid_read_phys(prmid, eventid, &msr_val);
+ if (ret)
+ return ret;
- chunks = mbm_overflow_count(m->prev_msr, tval, hw_res->mbm_width);
- m->chunks += chunks;
- m->prev_msr = tval;
+ am = get_arch_mbm_state(hw_dom, rmid, eventid);
+ if (am) {
+ am->chunks += mbm_overflow_count(am->prev_msr, msr_val,
+ hw_res->mbm_width);
+ chunks = get_corrected_mbm_count(rmid, am->chunks);
+ am->prev_msr = msr_val;
+ } else {
+ chunks = msr_val;
+ }
- rr->val += get_corrected_mbm_count(rmid, m->chunks);
+ *val = chunks * hw_res->mon_scale;
return 0;
}
/*
- * Supporting function to calculate the memory bandwidth
- * and delta bandwidth in MBps.
- */
-static void mbm_bw_count(u32 rmid, struct rmid_read *rr)
-{
- struct rdt_hw_resource *hw_res = resctrl_to_arch_res(rr->r);
- struct mbm_state *m = &rr->d->mbm_local[rmid];
- u64 tval, cur_bw, chunks;
-
- tval = __rmid_read(rmid, rr->evtid);
- if (tval & (RMID_VAL_ERROR | RMID_VAL_UNAVAIL))
- return;
-
- chunks = mbm_overflow_count(m->prev_bw_msr, tval, hw_res->mbm_width);
- cur_bw = (get_corrected_mbm_count(rmid, chunks) * hw_res->mon_scale) >> 20;
-
- if (m->delta_comp)
- m->delta_bw = abs(cur_bw - m->prev_bw);
- m->delta_comp = false;
- m->prev_bw = cur_bw;
- m->prev_bw_msr = tval;
-}
-
-/*
- * This is called via IPI to read the CQM/MBM counters
- * on a domain.
+ * The power-on reset value of MSR_RMID_SNC_CONFIG is 0x1
+ * which indicates that RMIDs are configured in legacy mode.
+ * This mode is incompatible with Linux resctrl semantics
+ * as RMIDs are partitioned between SNC nodes, which requires
+ * a user to know which RMID is allocated to a task.
+ * Clearing bit 0 reconfigures the RMID counters for use
+ * in RMID sharing mode. This mode is better for Linux.
+ * The RMID space is divided between all SNC nodes with the
+ * RMIDs renumbered to start from zero in each node when
+ * counting operations from tasks. Code to read the counters
+ * must adjust RMID counter numbers based on SNC node. See
+ * logical_rmid_to_physical_rmid() for code that does this.
*/
-void mon_event_count(void *info)
+void arch_mon_domain_online(struct rdt_resource *r, struct rdt_mon_domain *d)
{
- struct rdtgroup *rdtgrp, *entry;
- struct rmid_read *rr = info;
- struct list_head *head;
- u64 ret_val;
-
- rdtgrp = rr->rgrp;
-
- ret_val = __mon_event_count(rdtgrp->mon.rmid, rr);
-
- /*
- * For Ctrl groups read data from child monitor groups and
- * add them together. Count events which are read successfully.
- * Discard the rmid_read's reporting errors.
- */
- head = &rdtgrp->mon.crdtgrp_list;
-
- if (rdtgrp->type == RDTCTRL_GROUP) {
- list_for_each_entry(entry, head, mon.crdtgrp_list) {
- if (__mon_event_count(entry->mon.rmid, rr) == 0)
- ret_val = 0;
- }
- }
-
- /* Report error if none of rmid_reads are successful */
- if (ret_val)
- rr->val = ret_val;
+ if (snc_nodes_per_l3_cache > 1)
+ msr_clear_bit(MSR_RMID_SNC_CONFIG, 0);
}
-/*
- * Feedback loop for MBA software controller (mba_sc)
- *
- * mba_sc is a feedback loop where we periodically read MBM counters and
- * adjust the bandwidth percentage values via the IA32_MBA_THRTL_MSRs so
- * that:
- *
- * current bandwidth(cur_bw) < user specified bandwidth(user_bw)
- *
- * This uses the MBM counters to measure the bandwidth and MBA throttle
- * MSRs to control the bandwidth for a particular rdtgrp. It builds on the
- * fact that resctrl rdtgroups have both monitoring and control.
- *
- * The frequency of the checks is 1s and we just tag along the MBM overflow
- * timer. Having 1s interval makes the calculation of bandwidth simpler.
- *
- * Although MBA's goal is to restrict the bandwidth to a maximum, there may
- * be a need to increase the bandwidth to avoid unnecessarily restricting
- * the L2 <-> L3 traffic.
- *
- * Since MBA controls the L2 external bandwidth where as MBM measures the
- * L3 external bandwidth the following sequence could lead to such a
- * situation.
- *
- * Consider an rdtgroup which had high L3 <-> memory traffic in initial
- * phases -> mba_sc kicks in and reduced bandwidth percentage values -> but
- * after some time rdtgroup has mostly L2 <-> L3 traffic.
- *
- * In this case we may restrict the rdtgroup's L2 <-> L3 traffic as its
- * throttle MSRs already have low percentage values. To avoid
- * unnecessarily restricting such rdtgroups, we also increase the bandwidth.
- */
-static void update_mba_bw(struct rdtgroup *rgrp, struct rdt_domain *dom_mbm)
-{
- u32 closid, rmid, cur_msr, cur_msr_val, new_msr_val;
- struct mbm_state *pmbm_data, *cmbm_data;
- struct rdt_hw_resource *hw_r_mba;
- struct rdt_hw_domain *hw_dom_mba;
- u32 cur_bw, delta_bw, user_bw;
- struct rdt_resource *r_mba;
- struct rdt_domain *dom_mba;
- struct list_head *head;
- struct rdtgroup *entry;
-
- if (!is_mbm_local_enabled())
- return;
-
- hw_r_mba = &rdt_resources_all[RDT_RESOURCE_MBA];
- r_mba = &hw_r_mba->r_resctrl;
- closid = rgrp->closid;
- rmid = rgrp->mon.rmid;
- pmbm_data = &dom_mbm->mbm_local[rmid];
-
- dom_mba = get_domain_from_cpu(smp_processor_id(), r_mba);
- if (!dom_mba) {
- pr_warn_once("Failure to get domain for MBA update\n");
- return;
- }
- hw_dom_mba = resctrl_to_arch_dom(dom_mba);
-
- cur_bw = pmbm_data->prev_bw;
- user_bw = resctrl_arch_get_config(r_mba, dom_mba, closid, CDP_NONE);
- delta_bw = pmbm_data->delta_bw;
- /*
- * resctrl_arch_get_config() chooses the mbps/ctrl value to return
- * based on is_mba_sc(). For now, reach into the hw_dom.
- */
- cur_msr_val = hw_dom_mba->ctrl_val[closid];
-
- /*
- * For Ctrl groups read data from child monitor groups.
- */
- head = &rgrp->mon.crdtgrp_list;
- list_for_each_entry(entry, head, mon.crdtgrp_list) {
- cmbm_data = &dom_mbm->mbm_local[entry->mon.rmid];
- cur_bw += cmbm_data->prev_bw;
- delta_bw += cmbm_data->delta_bw;
- }
-
- /*
- * Scale up/down the bandwidth linearly for the ctrl group. The
- * bandwidth step is the bandwidth granularity specified by the
- * hardware.
- *
- * The delta_bw is used when increasing the bandwidth so that we
- * dont alternately increase and decrease the control values
- * continuously.
- *
- * For ex: consider cur_bw = 90MBps, user_bw = 100MBps and if
- * bandwidth step is 20MBps(> user_bw - cur_bw), we would keep
- * switching between 90 and 110 continuously if we only check
- * cur_bw < user_bw.
- */
- if (cur_msr_val > r_mba->membw.min_bw && user_bw < cur_bw) {
- new_msr_val = cur_msr_val - r_mba->membw.bw_gran;
- } else if (cur_msr_val < MAX_MBA_BW &&
- (user_bw > (cur_bw + delta_bw))) {
- new_msr_val = cur_msr_val + r_mba->membw.bw_gran;
- } else {
- return;
- }
-
- cur_msr = hw_r_mba->msr_base + closid;
- wrmsrl(cur_msr, delay_bw_map(new_msr_val, r_mba));
- hw_dom_mba->ctrl_val[closid] = new_msr_val;
-
- /*
- * Delta values are updated dynamically package wise for each
- * rdtgrp every time the throttle MSR changes value.
- *
- * This is because (1)the increase in bandwidth is not perfectly
- * linear and only "approximately" linear even when the hardware
- * says it is linear.(2)Also since MBA is a core specific
- * mechanism, the delta values vary based on number of cores used
- * by the rdtgrp.
- */
- pmbm_data->delta_comp = true;
- list_for_each_entry(entry, head, mon.crdtgrp_list) {
- cmbm_data = &dom_mbm->mbm_local[entry->mon.rmid];
- cmbm_data->delta_comp = true;
- }
-}
-
-static void mbm_update(struct rdt_resource *r, struct rdt_domain *d, int rmid)
-{
- struct rmid_read rr;
-
- rr.first = false;
- rr.r = r;
- rr.d = d;
-
- /*
- * This is protected from concurrent reads from user
- * as both the user and we hold the global mutex.
- */
- if (is_mbm_total_enabled()) {
- rr.evtid = QOS_L3_MBM_TOTAL_EVENT_ID;
- __mon_event_count(rmid, &rr);
- }
- if (is_mbm_local_enabled()) {
- rr.evtid = QOS_L3_MBM_LOCAL_EVENT_ID;
- __mon_event_count(rmid, &rr);
-
- /*
- * Call the MBA software controller only for the
- * control groups and when user has enabled
- * the software controller explicitly.
- */
- if (is_mba_sc(NULL))
- mbm_bw_count(rmid, &rr);
- }
-}
+/* CPU models that support MSR_RMID_SNC_CONFIG */
+static const struct x86_cpu_id snc_cpu_ids[] __initconst = {
+ X86_MATCH_VFM(INTEL_ICELAKE_X, 0),
+ X86_MATCH_VFM(INTEL_SAPPHIRERAPIDS_X, 0),
+ X86_MATCH_VFM(INTEL_EMERALDRAPIDS_X, 0),
+ X86_MATCH_VFM(INTEL_GRANITERAPIDS_X, 0),
+ X86_MATCH_VFM(INTEL_ATOM_CRESTMONT_X, 0),
+ {}
+};
/*
- * Handler to scan the limbo list and move the RMIDs
- * to free list whose occupancy < threshold_occupancy.
+ * There isn't a simple hardware bit that indicates whether a CPU is running
+ * in Sub-NUMA Cluster (SNC) mode. Infer the state by comparing the
+ * number of CPUs sharing the L3 cache with CPU0 to the number of CPUs in
+ * the same NUMA node as CPU0.
+ * It is not possible to accurately determine SNC state if the system is
+ * booted with a maxcpus=N parameter. That distorts the ratio of SNC nodes
+ * to L3 caches. It will be OK if system is booted with hyperthreading
+ * disabled (since this doesn't affect the ratio).
*/
-void cqm_handle_limbo(struct work_struct *work)
-{
- unsigned long delay = msecs_to_jiffies(CQM_LIMBOCHECK_INTERVAL);
- int cpu = smp_processor_id();
- struct rdt_resource *r;
- struct rdt_domain *d;
-
- mutex_lock(&rdtgroup_mutex);
-
- r = &rdt_resources_all[RDT_RESOURCE_L3].r_resctrl;
- d = container_of(work, struct rdt_domain, cqm_limbo.work);
-
- __check_limbo(d, false);
-
- if (has_busy_rmid(r, d))
- schedule_delayed_work_on(cpu, &d->cqm_limbo, delay);
-
- mutex_unlock(&rdtgroup_mutex);
-}
-
-void cqm_setup_limbo_handler(struct rdt_domain *dom, unsigned long delay_ms)
+static __init int snc_get_config(void)
{
- unsigned long delay = msecs_to_jiffies(delay_ms);
- int cpu;
-
- cpu = cpumask_any(&dom->cpu_mask);
- dom->cqm_work_cpu = cpu;
-
- schedule_delayed_work_on(cpu, &dom->cqm_limbo, delay);
-}
-
-void mbm_handle_overflow(struct work_struct *work)
-{
- unsigned long delay = msecs_to_jiffies(MBM_OVERFLOW_INTERVAL);
- struct rdtgroup *prgrp, *crgrp;
- int cpu = smp_processor_id();
- struct list_head *head;
- struct rdt_resource *r;
- struct rdt_domain *d;
-
- mutex_lock(&rdtgroup_mutex);
-
- if (!static_branch_likely(&rdt_mon_enable_key))
- goto out_unlock;
-
- r = &rdt_resources_all[RDT_RESOURCE_L3].r_resctrl;
- d = container_of(work, struct rdt_domain, mbm_over.work);
-
- list_for_each_entry(prgrp, &rdt_all_groups, rdtgroup_list) {
- mbm_update(r, d, prgrp->mon.rmid);
-
- head = &prgrp->mon.crdtgrp_list;
- list_for_each_entry(crgrp, head, mon.crdtgrp_list)
- mbm_update(r, d, crgrp->mon.rmid);
-
- if (is_mba_sc(NULL))
- update_mba_bw(prgrp, d);
- }
+ struct cacheinfo *ci = get_cpu_cacheinfo_level(0, RESCTRL_L3_CACHE);
+ const cpumask_t *node0_cpumask;
+ int cpus_per_node, cpus_per_l3;
+ int ret;
- schedule_delayed_work_on(cpu, &d->mbm_over, delay);
+ if (!x86_match_cpu(snc_cpu_ids) || !ci)
+ return 1;
-out_unlock:
- mutex_unlock(&rdtgroup_mutex);
-}
+ cpus_read_lock();
+ if (num_online_cpus() != num_present_cpus())
+ pr_warn("Some CPUs offline, SNC detection may be incorrect\n");
+ cpus_read_unlock();
-void mbm_setup_overflow_handler(struct rdt_domain *dom, unsigned long delay_ms)
-{
- unsigned long delay = msecs_to_jiffies(delay_ms);
- int cpu;
+ node0_cpumask = cpumask_of_node(cpu_to_node(0));
- if (!static_branch_likely(&rdt_mon_enable_key))
- return;
- cpu = cpumask_any(&dom->cpu_mask);
- dom->mbm_work_cpu = cpu;
- schedule_delayed_work_on(cpu, &dom->mbm_over, delay);
-}
+ cpus_per_node = cpumask_weight(node0_cpumask);
+ cpus_per_l3 = cpumask_weight(&ci->shared_cpu_map);
-static int dom_data_init(struct rdt_resource *r)
-{
- struct rmid_entry *entry = NULL;
- int i, nr_rmids;
+ if (!cpus_per_node || !cpus_per_l3)
+ return 1;
- nr_rmids = r->num_rmid;
- rmid_ptrs = kcalloc(nr_rmids, sizeof(struct rmid_entry), GFP_KERNEL);
- if (!rmid_ptrs)
- return -ENOMEM;
+ ret = cpus_per_l3 / cpus_per_node;
- for (i = 0; i < nr_rmids; i++) {
- entry = &rmid_ptrs[i];
- INIT_LIST_HEAD(&entry->list);
-
- entry->rmid = i;
- list_add_tail(&entry->list, &rmid_free_lru);
+ /* sanity check: Only valid results are 1, 2, 3, 4, 6 */
+ switch (ret) {
+ case 1:
+ break;
+ case 2 ... 4:
+ case 6:
+ pr_info("Sub-NUMA Cluster mode detected with %d nodes per L3 cache\n", ret);
+ rdt_resources_all[RDT_RESOURCE_L3].r_resctrl.mon_scope = RESCTRL_L3_NODE;
+ break;
+ default:
+ pr_warn("Ignore improbable SNC node count %d\n", ret);
+ ret = 1;
+ break;
}
- /*
- * RMID 0 is special and is always allocated. It's used for all
- * tasks that are not monitored.
- */
- entry = __rmid_entry(0);
- list_del(&entry->list);
-
- return 0;
+ return ret;
}
-static struct mon_evt llc_occupancy_event = {
- .name = "llc_occupancy",
- .evtid = QOS_L3_OCCUP_EVENT_ID,
-};
-
-static struct mon_evt mbm_total_event = {
- .name = "mbm_total_bytes",
- .evtid = QOS_L3_MBM_TOTAL_EVENT_ID,
-};
-
-static struct mon_evt mbm_local_event = {
- .name = "mbm_local_bytes",
- .evtid = QOS_L3_MBM_LOCAL_EVENT_ID,
-};
-
-/*
- * Initialize the event list for the resource.
- *
- * Note that MBM events are also part of RDT_RESOURCE_L3 resource
- * because as per the SDM the total and local memory bandwidth
- * are enumerated as part of L3 monitoring.
- */
-static void l3_mon_evt_init(struct rdt_resource *r)
-{
- INIT_LIST_HEAD(&r->evt_list);
-
- if (is_llc_occupancy_enabled())
- list_add_tail(&llc_occupancy_event.list, &r->evt_list);
- if (is_mbm_total_enabled())
- list_add_tail(&mbm_total_event.list, &r->evt_list);
- if (is_mbm_local_enabled())
- list_add_tail(&mbm_local_event.list, &r->evt_list);
-}
-
-int rdt_get_mon_l3_config(struct rdt_resource *r)
+int __init rdt_get_mon_l3_config(struct rdt_resource *r)
{
unsigned int mbm_offset = boot_cpu_data.x86_cache_mbm_width_offset;
struct rdt_hw_resource *hw_res = resctrl_to_arch_res(r);
- unsigned int cl_size = boot_cpu_data.x86_cache_size;
- int ret;
+ unsigned int threshold;
+
+ snc_nodes_per_l3_cache = snc_get_config();
- hw_res->mon_scale = boot_cpu_data.x86_cache_occ_scale;
- r->num_rmid = boot_cpu_data.x86_cache_max_rmid + 1;
+ resctrl_rmid_realloc_limit = boot_cpu_data.x86_cache_size * 1024;
+ hw_res->mon_scale = boot_cpu_data.x86_cache_occ_scale / snc_nodes_per_l3_cache;
+ r->num_rmid = (boot_cpu_data.x86_cache_max_rmid + 1) / snc_nodes_per_l3_cache;
hw_res->mbm_width = MBM_CNTR_WIDTH_BASE;
if (mbm_offset > 0 && mbm_offset <= MBM_CNTR_WIDTH_OFFSET_MAX)
@@ -705,19 +366,24 @@ int rdt_get_mon_l3_config(struct rdt_resource *r)
*
* For a 35MB LLC and 56 RMIDs, this is ~1.8% of the LLC.
*/
- resctrl_cqm_threshold = cl_size * 1024 / r->num_rmid;
+ threshold = resctrl_rmid_realloc_limit / r->num_rmid;
- /* h/w works in units of "boot_cpu_data.x86_cache_occ_scale" */
- resctrl_cqm_threshold /= hw_res->mon_scale;
+ /*
+ * Because num_rmid may not be a power of two, round the value
+ * to the nearest multiple of hw_res->mon_scale so it matches a
+ * value the hardware will measure. mon_scale may not be a power of 2.
+ */
+ resctrl_rmid_realloc_threshold = resctrl_arch_round_mon_val(threshold);
- ret = dom_data_init(r);
- if (ret)
- return ret;
+ if (rdt_cpu_has(X86_FEATURE_BMEC)) {
+ u32 eax, ebx, ecx, edx;
- l3_mon_evt_init(r);
+ /* Detect list of bandwidth sources that can be tracked */
+ cpuid_count(0x80000020, 3, &eax, &ebx, &ecx, &edx);
+ r->mbm_cfg_mask = ecx & MAX_EVT_CONFIG_BITS;
+ }
r->mon_capable = true;
- r->mon_enabled = true;
return 0;
}
diff --git a/arch/x86/kernel/cpu/resctrl/pseudo_lock.c b/arch/x86/kernel/cpu/resctrl/pseudo_lock.c
index db813f819ad6..de580eca3363 100644
--- a/arch/x86/kernel/cpu/resctrl/pseudo_lock.c
+++ b/arch/x86/kernel/cpu/resctrl/pseudo_lock.c
@@ -11,27 +11,22 @@
#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
-#include <linux/cacheinfo.h>
+#include <linux/cacheflush.h>
#include <linux/cpu.h>
-#include <linux/cpumask.h>
-#include <linux/debugfs.h>
-#include <linux/kthread.h>
-#include <linux/mman.h>
#include <linux/perf_event.h>
#include <linux/pm_qos.h>
-#include <linux/slab.h>
-#include <linux/uaccess.h>
+#include <linux/resctrl.h>
-#include <asm/cacheflush.h>
-#include <asm/intel-family.h>
-#include <asm/resctrl.h>
+#include <asm/cpu_device_id.h>
#include <asm/perf_event.h>
+#include <asm/msr.h>
#include "../../events/perf_event.h" /* For X86_CONFIG() */
#include "internal.h"
#define CREATE_TRACE_POINTS
-#include "pseudo_lock_event.h"
+
+#include "pseudo_lock_trace.h"
/*
* The bits needed to disable hardware prefetching varies based on the
@@ -39,16 +34,9 @@
*/
static u64 prefetch_disable_bits;
-/*
- * Major number assigned to and shared by all devices exposing
- * pseudo-locked regions.
- */
-static unsigned int pseudo_lock_major;
-static unsigned long pseudo_lock_minor_avail = GENMASK(MINORBITS, 0);
-static struct class *pseudo_lock_class;
-
/**
- * get_prefetch_disable_bits - prefetch disable bits of supported platforms
+ * resctrl_arch_get_prefetch_disable_bits - prefetch disable bits of supported
+ * platforms
* @void: It takes no parameters.
*
* Capture the list of platforms that have been validated to support
@@ -62,20 +50,22 @@ static struct class *pseudo_lock_class;
* in the SDM.
*
* When adding a platform here also add support for its cache events to
- * measure_cycles_perf_fn()
+ * resctrl_arch_measure_l*_residency()
*
* Return:
* If platform is supported, the bits to disable hardware prefetchers, 0
* if platform is not supported.
*/
-static u64 get_prefetch_disable_bits(void)
+u64 resctrl_arch_get_prefetch_disable_bits(void)
{
+ prefetch_disable_bits = 0;
+
if (boot_cpu_data.x86_vendor != X86_VENDOR_INTEL ||
boot_cpu_data.x86 != 6)
return 0;
- switch (boot_cpu_data.x86_model) {
- case INTEL_FAM6_BROADWELL_X:
+ switch (boot_cpu_data.x86_vfm) {
+ case INTEL_BROADWELL_X:
/*
* SDM defines bits of MSR_MISC_FEATURE_CONTROL register
* as:
@@ -85,9 +75,10 @@ static u64 get_prefetch_disable_bits(void)
* 3 DCU IP Prefetcher Disable (R/W)
* 63:4 Reserved
*/
- return 0xF;
- case INTEL_FAM6_ATOM_GOLDMONT:
- case INTEL_FAM6_ATOM_GOLDMONT_PLUS:
+ prefetch_disable_bits = 0xF;
+ break;
+ case INTEL_ATOM_GOLDMONT:
+ case INTEL_ATOM_GOLDMONT_PLUS:
/*
* SDM defines bits of MSR_MISC_FEATURE_CONTROL register
* as:
@@ -96,308 +87,16 @@ static u64 get_prefetch_disable_bits(void)
* 2 DCU Hardware Prefetcher Disable (R/W)
* 63:3 Reserved
*/
- return 0x5;
- }
-
- return 0;
-}
-
-/**
- * pseudo_lock_minor_get - Obtain available minor number
- * @minor: Pointer to where new minor number will be stored
- *
- * A bitmask is used to track available minor numbers. Here the next free
- * minor number is marked as unavailable and returned.
- *
- * Return: 0 on success, <0 on failure.
- */
-static int pseudo_lock_minor_get(unsigned int *minor)
-{
- unsigned long first_bit;
-
- first_bit = find_first_bit(&pseudo_lock_minor_avail, MINORBITS);
-
- if (first_bit == MINORBITS)
- return -ENOSPC;
-
- __clear_bit(first_bit, &pseudo_lock_minor_avail);
- *minor = first_bit;
-
- return 0;
-}
-
-/**
- * pseudo_lock_minor_release - Return minor number to available
- * @minor: The minor number made available
- */
-static void pseudo_lock_minor_release(unsigned int minor)
-{
- __set_bit(minor, &pseudo_lock_minor_avail);
-}
-
-/**
- * region_find_by_minor - Locate a pseudo-lock region by inode minor number
- * @minor: The minor number of the device representing pseudo-locked region
- *
- * When the character device is accessed we need to determine which
- * pseudo-locked region it belongs to. This is done by matching the minor
- * number of the device to the pseudo-locked region it belongs.
- *
- * Minor numbers are assigned at the time a pseudo-locked region is associated
- * with a cache instance.
- *
- * Return: On success return pointer to resource group owning the pseudo-locked
- * region, NULL on failure.
- */
-static struct rdtgroup *region_find_by_minor(unsigned int minor)
-{
- struct rdtgroup *rdtgrp, *rdtgrp_match = NULL;
-
- list_for_each_entry(rdtgrp, &rdt_all_groups, rdtgroup_list) {
- if (rdtgrp->plr && rdtgrp->plr->minor == minor) {
- rdtgrp_match = rdtgrp;
- break;
- }
- }
- return rdtgrp_match;
-}
-
-/**
- * struct pseudo_lock_pm_req - A power management QoS request list entry
- * @list: Entry within the @pm_reqs list for a pseudo-locked region
- * @req: PM QoS request
- */
-struct pseudo_lock_pm_req {
- struct list_head list;
- struct dev_pm_qos_request req;
-};
-
-static void pseudo_lock_cstates_relax(struct pseudo_lock_region *plr)
-{
- struct pseudo_lock_pm_req *pm_req, *next;
-
- list_for_each_entry_safe(pm_req, next, &plr->pm_reqs, list) {
- dev_pm_qos_remove_request(&pm_req->req);
- list_del(&pm_req->list);
- kfree(pm_req);
- }
-}
-
-/**
- * pseudo_lock_cstates_constrain - Restrict cores from entering C6
- * @plr: Pseudo-locked region
- *
- * To prevent the cache from being affected by power management entering
- * C6 has to be avoided. This is accomplished by requesting a latency
- * requirement lower than lowest C6 exit latency of all supported
- * platforms as found in the cpuidle state tables in the intel_idle driver.
- * At this time it is possible to do so with a single latency requirement
- * for all supported platforms.
- *
- * Since Goldmont is supported, which is affected by X86_BUG_MONITOR,
- * the ACPI latencies need to be considered while keeping in mind that C2
- * may be set to map to deeper sleep states. In this case the latency
- * requirement needs to prevent entering C2 also.
- *
- * Return: 0 on success, <0 on failure
- */
-static int pseudo_lock_cstates_constrain(struct pseudo_lock_region *plr)
-{
- struct pseudo_lock_pm_req *pm_req;
- int cpu;
- int ret;
-
- for_each_cpu(cpu, &plr->d->cpu_mask) {
- pm_req = kzalloc(sizeof(*pm_req), GFP_KERNEL);
- if (!pm_req) {
- rdt_last_cmd_puts("Failure to allocate memory for PM QoS\n");
- ret = -ENOMEM;
- goto out_err;
- }
- ret = dev_pm_qos_add_request(get_cpu_device(cpu),
- &pm_req->req,
- DEV_PM_QOS_RESUME_LATENCY,
- 30);
- if (ret < 0) {
- rdt_last_cmd_printf("Failed to add latency req CPU%d\n",
- cpu);
- kfree(pm_req);
- ret = -1;
- goto out_err;
- }
- list_add(&pm_req->list, &plr->pm_reqs);
- }
-
- return 0;
-
-out_err:
- pseudo_lock_cstates_relax(plr);
- return ret;
-}
-
-/**
- * pseudo_lock_region_clear - Reset pseudo-lock region data
- * @plr: pseudo-lock region
- *
- * All content of the pseudo-locked region is reset - any memory allocated
- * freed.
- *
- * Return: void
- */
-static void pseudo_lock_region_clear(struct pseudo_lock_region *plr)
-{
- plr->size = 0;
- plr->line_size = 0;
- kfree(plr->kmem);
- plr->kmem = NULL;
- plr->s = NULL;
- if (plr->d)
- plr->d->plr = NULL;
- plr->d = NULL;
- plr->cbm = 0;
- plr->debugfs_dir = NULL;
-}
-
-/**
- * pseudo_lock_region_init - Initialize pseudo-lock region information
- * @plr: pseudo-lock region
- *
- * Called after user provided a schemata to be pseudo-locked. From the
- * schemata the &struct pseudo_lock_region is on entry already initialized
- * with the resource, domain, and capacity bitmask. Here the information
- * required for pseudo-locking is deduced from this data and &struct
- * pseudo_lock_region initialized further. This information includes:
- * - size in bytes of the region to be pseudo-locked
- * - cache line size to know the stride with which data needs to be accessed
- * to be pseudo-locked
- * - a cpu associated with the cache instance on which the pseudo-locking
- * flow can be executed
- *
- * Return: 0 on success, <0 on failure. Descriptive error will be written
- * to last_cmd_status buffer.
- */
-static int pseudo_lock_region_init(struct pseudo_lock_region *plr)
-{
- struct cpu_cacheinfo *ci;
- int ret;
- int i;
-
- /* Pick the first cpu we find that is associated with the cache. */
- plr->cpu = cpumask_first(&plr->d->cpu_mask);
-
- if (!cpu_online(plr->cpu)) {
- rdt_last_cmd_printf("CPU %u associated with cache not online\n",
- plr->cpu);
- ret = -ENODEV;
- goto out_region;
- }
-
- ci = get_cpu_cacheinfo(plr->cpu);
-
- plr->size = rdtgroup_cbm_to_size(plr->s->res, plr->d, plr->cbm);
-
- for (i = 0; i < ci->num_leaves; i++) {
- if (ci->info_list[i].level == plr->s->res->cache_level) {
- plr->line_size = ci->info_list[i].coherency_line_size;
- return 0;
- }
- }
-
- ret = -1;
- rdt_last_cmd_puts("Unable to determine cache line size\n");
-out_region:
- pseudo_lock_region_clear(plr);
- return ret;
-}
-
-/**
- * pseudo_lock_init - Initialize a pseudo-lock region
- * @rdtgrp: resource group to which new pseudo-locked region will belong
- *
- * A pseudo-locked region is associated with a resource group. When this
- * association is created the pseudo-locked region is initialized. The
- * details of the pseudo-locked region are not known at this time so only
- * allocation is done and association established.
- *
- * Return: 0 on success, <0 on failure
- */
-static int pseudo_lock_init(struct rdtgroup *rdtgrp)
-{
- struct pseudo_lock_region *plr;
-
- plr = kzalloc(sizeof(*plr), GFP_KERNEL);
- if (!plr)
- return -ENOMEM;
-
- init_waitqueue_head(&plr->lock_thread_wq);
- INIT_LIST_HEAD(&plr->pm_reqs);
- rdtgrp->plr = plr;
- return 0;
-}
-
-/**
- * pseudo_lock_region_alloc - Allocate kernel memory that will be pseudo-locked
- * @plr: pseudo-lock region
- *
- * Initialize the details required to set up the pseudo-locked region and
- * allocate the contiguous memory that will be pseudo-locked to the cache.
- *
- * Return: 0 on success, <0 on failure. Descriptive error will be written
- * to last_cmd_status buffer.
- */
-static int pseudo_lock_region_alloc(struct pseudo_lock_region *plr)
-{
- int ret;
-
- ret = pseudo_lock_region_init(plr);
- if (ret < 0)
- return ret;
-
- /*
- * We do not yet support contiguous regions larger than
- * KMALLOC_MAX_SIZE.
- */
- if (plr->size > KMALLOC_MAX_SIZE) {
- rdt_last_cmd_puts("Requested region exceeds maximum size\n");
- ret = -E2BIG;
- goto out_region;
- }
-
- plr->kmem = kzalloc(plr->size, GFP_KERNEL);
- if (!plr->kmem) {
- rdt_last_cmd_puts("Unable to allocate memory\n");
- ret = -ENOMEM;
- goto out_region;
+ prefetch_disable_bits = 0x5;
+ break;
}
- ret = 0;
- goto out;
-out_region:
- pseudo_lock_region_clear(plr);
-out:
- return ret;
-}
-
-/**
- * pseudo_lock_free - Free a pseudo-locked region
- * @rdtgrp: resource group to which pseudo-locked region belonged
- *
- * The pseudo-locked region's resources have already been released, or not
- * yet created at this point. Now it can be freed and disassociated from the
- * resource group.
- *
- * Return: void
- */
-static void pseudo_lock_free(struct rdtgroup *rdtgrp)
-{
- pseudo_lock_region_clear(rdtgrp->plr);
- kfree(rdtgrp->plr);
- rdtgrp->plr = NULL;
+ return prefetch_disable_bits;
}
/**
- * pseudo_lock_fn - Load kernel memory into cache
- * @_rdtgrp: resource group to which pseudo-lock region belongs
+ * resctrl_arch_pseudo_lock_fn - Load kernel memory into cache
+ * @_plr: the pseudo-lock region descriptor
*
* This is the core pseudo-locking flow.
*
@@ -414,12 +113,12 @@ static void pseudo_lock_free(struct rdtgroup *rdtgrp)
*
* Return: 0. Waiter on waitqueue will be woken on completion.
*/
-static int pseudo_lock_fn(void *_rdtgrp)
+int resctrl_arch_pseudo_lock_fn(void *_plr)
{
- struct rdtgroup *rdtgrp = _rdtgrp;
- struct pseudo_lock_region *plr = rdtgrp->plr;
+ struct pseudo_lock_region *plr = _plr;
u32 rmid_p, closid_p;
unsigned long i;
+ u64 saved_msr;
#ifdef CONFIG_KASAN
/*
* The registers used for local register variables are also used
@@ -446,7 +145,7 @@ static int pseudo_lock_fn(void *_rdtgrp)
* increase likelihood that allocated cache portion will be filled
* with associated memory.
*/
- native_wbinvd();
+ wbinvd();
/*
* Always called with interrupts enabled. By disabling interrupts
@@ -463,7 +162,8 @@ static int pseudo_lock_fn(void *_rdtgrp)
* the buffer and evict pseudo-locked memory read earlier from the
* cache.
*/
- __wrmsr(MSR_MISC_FEATURE_CONTROL, prefetch_disable_bits, 0x0);
+ saved_msr = native_rdmsrq(MSR_MISC_FEATURE_CONTROL);
+ native_wrmsrq(MSR_MISC_FEATURE_CONTROL, prefetch_disable_bits);
closid_p = this_cpu_read(pqr_state.cur_closid);
rmid_p = this_cpu_read(pqr_state.cur_rmid);
mem_r = plr->kmem;
@@ -475,7 +175,8 @@ static int pseudo_lock_fn(void *_rdtgrp)
* pseudo-locked followed by reading of kernel memory to load it
* into the cache.
*/
- __wrmsr(IA32_PQR_ASSOC, rmid_p, rdtgrp->closid);
+ native_wrmsr(MSR_IA32_PQR_ASSOC, rmid_p, plr->closid);
+
/*
* Cache was flushed earlier. Now access kernel memory to read it
* into cache region associated with just activated plr->closid.
@@ -511,10 +212,10 @@ static int pseudo_lock_fn(void *_rdtgrp)
* Critical section end: restore closid with capacity bitmask that
* does not overlap with pseudo-locked region.
*/
- __wrmsr(IA32_PQR_ASSOC, rmid_p, closid_p);
+ native_wrmsr(MSR_IA32_PQR_ASSOC, rmid_p, closid_p);
/* Re-enable the hardware prefetcher(s) */
- wrmsr(MSR_MISC_FEATURE_CONTROL, 0x0, 0x0);
+ wrmsrq(MSR_MISC_FEATURE_CONTROL, saved_msr);
local_irq_enable();
plr->thread_done = 1;
@@ -523,339 +224,8 @@ static int pseudo_lock_fn(void *_rdtgrp)
}
/**
- * rdtgroup_monitor_in_progress - Test if monitoring in progress
- * @rdtgrp: resource group being queried
- *
- * Return: 1 if monitor groups have been created for this resource
- * group, 0 otherwise.
- */
-static int rdtgroup_monitor_in_progress(struct rdtgroup *rdtgrp)
-{
- return !list_empty(&rdtgrp->mon.crdtgrp_list);
-}
-
-/**
- * rdtgroup_locksetup_user_restrict - Restrict user access to group
- * @rdtgrp: resource group needing access restricted
- *
- * A resource group used for cache pseudo-locking cannot have cpus or tasks
- * assigned to it. This is communicated to the user by restricting access
- * to all the files that can be used to make such changes.
- *
- * Permissions restored with rdtgroup_locksetup_user_restore()
- *
- * Return: 0 on success, <0 on failure. If a failure occurs during the
- * restriction of access an attempt will be made to restore permissions but
- * the state of the mode of these files will be uncertain when a failure
- * occurs.
- */
-static int rdtgroup_locksetup_user_restrict(struct rdtgroup *rdtgrp)
-{
- int ret;
-
- ret = rdtgroup_kn_mode_restrict(rdtgrp, "tasks");
- if (ret)
- return ret;
-
- ret = rdtgroup_kn_mode_restrict(rdtgrp, "cpus");
- if (ret)
- goto err_tasks;
-
- ret = rdtgroup_kn_mode_restrict(rdtgrp, "cpus_list");
- if (ret)
- goto err_cpus;
-
- if (rdt_mon_capable) {
- ret = rdtgroup_kn_mode_restrict(rdtgrp, "mon_groups");
- if (ret)
- goto err_cpus_list;
- }
-
- ret = 0;
- goto out;
-
-err_cpus_list:
- rdtgroup_kn_mode_restore(rdtgrp, "cpus_list", 0777);
-err_cpus:
- rdtgroup_kn_mode_restore(rdtgrp, "cpus", 0777);
-err_tasks:
- rdtgroup_kn_mode_restore(rdtgrp, "tasks", 0777);
-out:
- return ret;
-}
-
-/**
- * rdtgroup_locksetup_user_restore - Restore user access to group
- * @rdtgrp: resource group needing access restored
- *
- * Restore all file access previously removed using
- * rdtgroup_locksetup_user_restrict()
- *
- * Return: 0 on success, <0 on failure. If a failure occurs during the
- * restoration of access an attempt will be made to restrict permissions
- * again but the state of the mode of these files will be uncertain when
- * a failure occurs.
- */
-static int rdtgroup_locksetup_user_restore(struct rdtgroup *rdtgrp)
-{
- int ret;
-
- ret = rdtgroup_kn_mode_restore(rdtgrp, "tasks", 0777);
- if (ret)
- return ret;
-
- ret = rdtgroup_kn_mode_restore(rdtgrp, "cpus", 0777);
- if (ret)
- goto err_tasks;
-
- ret = rdtgroup_kn_mode_restore(rdtgrp, "cpus_list", 0777);
- if (ret)
- goto err_cpus;
-
- if (rdt_mon_capable) {
- ret = rdtgroup_kn_mode_restore(rdtgrp, "mon_groups", 0777);
- if (ret)
- goto err_cpus_list;
- }
-
- ret = 0;
- goto out;
-
-err_cpus_list:
- rdtgroup_kn_mode_restrict(rdtgrp, "cpus_list");
-err_cpus:
- rdtgroup_kn_mode_restrict(rdtgrp, "cpus");
-err_tasks:
- rdtgroup_kn_mode_restrict(rdtgrp, "tasks");
-out:
- return ret;
-}
-
-/**
- * rdtgroup_locksetup_enter - Resource group enters locksetup mode
- * @rdtgrp: resource group requested to enter locksetup mode
- *
- * A resource group enters locksetup mode to reflect that it would be used
- * to represent a pseudo-locked region and is in the process of being set
- * up to do so. A resource group used for a pseudo-locked region would
- * lose the closid associated with it so we cannot allow it to have any
- * tasks or cpus assigned nor permit tasks or cpus to be assigned in the
- * future. Monitoring of a pseudo-locked region is not allowed either.
- *
- * The above and more restrictions on a pseudo-locked region are checked
- * for and enforced before the resource group enters the locksetup mode.
- *
- * Returns: 0 if the resource group successfully entered locksetup mode, <0
- * on failure. On failure the last_cmd_status buffer is updated with text to
- * communicate details of failure to the user.
- */
-int rdtgroup_locksetup_enter(struct rdtgroup *rdtgrp)
-{
- int ret;
-
- /*
- * The default resource group can neither be removed nor lose the
- * default closid associated with it.
- */
- if (rdtgrp == &rdtgroup_default) {
- rdt_last_cmd_puts("Cannot pseudo-lock default group\n");
- return -EINVAL;
- }
-
- /*
- * Cache Pseudo-locking not supported when CDP is enabled.
- *
- * Some things to consider if you would like to enable this
- * support (using L3 CDP as example):
- * - When CDP is enabled two separate resources are exposed,
- * L3DATA and L3CODE, but they are actually on the same cache.
- * The implication for pseudo-locking is that if a
- * pseudo-locked region is created on a domain of one
- * resource (eg. L3CODE), then a pseudo-locked region cannot
- * be created on that same domain of the other resource
- * (eg. L3DATA). This is because the creation of a
- * pseudo-locked region involves a call to wbinvd that will
- * affect all cache allocations on particular domain.
- * - Considering the previous, it may be possible to only
- * expose one of the CDP resources to pseudo-locking and
- * hide the other. For example, we could consider to only
- * expose L3DATA and since the L3 cache is unified it is
- * still possible to place instructions there are execute it.
- * - If only one region is exposed to pseudo-locking we should
- * still keep in mind that availability of a portion of cache
- * for pseudo-locking should take into account both resources.
- * Similarly, if a pseudo-locked region is created in one
- * resource, the portion of cache used by it should be made
- * unavailable to all future allocations from both resources.
- */
- if (resctrl_arch_get_cdp_enabled(RDT_RESOURCE_L3) ||
- resctrl_arch_get_cdp_enabled(RDT_RESOURCE_L2)) {
- rdt_last_cmd_puts("CDP enabled\n");
- return -EINVAL;
- }
-
- /*
- * Not knowing the bits to disable prefetching implies that this
- * platform does not support Cache Pseudo-Locking.
- */
- prefetch_disable_bits = get_prefetch_disable_bits();
- if (prefetch_disable_bits == 0) {
- rdt_last_cmd_puts("Pseudo-locking not supported\n");
- return -EINVAL;
- }
-
- if (rdtgroup_monitor_in_progress(rdtgrp)) {
- rdt_last_cmd_puts("Monitoring in progress\n");
- return -EINVAL;
- }
-
- if (rdtgroup_tasks_assigned(rdtgrp)) {
- rdt_last_cmd_puts("Tasks assigned to resource group\n");
- return -EINVAL;
- }
-
- if (!cpumask_empty(&rdtgrp->cpu_mask)) {
- rdt_last_cmd_puts("CPUs assigned to resource group\n");
- return -EINVAL;
- }
-
- if (rdtgroup_locksetup_user_restrict(rdtgrp)) {
- rdt_last_cmd_puts("Unable to modify resctrl permissions\n");
- return -EIO;
- }
-
- ret = pseudo_lock_init(rdtgrp);
- if (ret) {
- rdt_last_cmd_puts("Unable to init pseudo-lock region\n");
- goto out_release;
- }
-
- /*
- * If this system is capable of monitoring a rmid would have been
- * allocated when the control group was created. This is not needed
- * anymore when this group would be used for pseudo-locking. This
- * is safe to call on platforms not capable of monitoring.
- */
- free_rmid(rdtgrp->mon.rmid);
-
- ret = 0;
- goto out;
-
-out_release:
- rdtgroup_locksetup_user_restore(rdtgrp);
-out:
- return ret;
-}
-
-/**
- * rdtgroup_locksetup_exit - resource group exist locksetup mode
- * @rdtgrp: resource group
- *
- * When a resource group exits locksetup mode the earlier restrictions are
- * lifted.
- *
- * Return: 0 on success, <0 on failure
- */
-int rdtgroup_locksetup_exit(struct rdtgroup *rdtgrp)
-{
- int ret;
-
- if (rdt_mon_capable) {
- ret = alloc_rmid();
- if (ret < 0) {
- rdt_last_cmd_puts("Out of RMIDs\n");
- return ret;
- }
- rdtgrp->mon.rmid = ret;
- }
-
- ret = rdtgroup_locksetup_user_restore(rdtgrp);
- if (ret) {
- free_rmid(rdtgrp->mon.rmid);
- return ret;
- }
-
- pseudo_lock_free(rdtgrp);
- return 0;
-}
-
-/**
- * rdtgroup_cbm_overlaps_pseudo_locked - Test if CBM or portion is pseudo-locked
- * @d: RDT domain
- * @cbm: CBM to test
- *
- * @d represents a cache instance and @cbm a capacity bitmask that is
- * considered for it. Determine if @cbm overlaps with any existing
- * pseudo-locked region on @d.
- *
- * @cbm is unsigned long, even if only 32 bits are used, to make the
- * bitmap functions work correctly.
- *
- * Return: true if @cbm overlaps with pseudo-locked region on @d, false
- * otherwise.
- */
-bool rdtgroup_cbm_overlaps_pseudo_locked(struct rdt_domain *d, unsigned long cbm)
-{
- unsigned int cbm_len;
- unsigned long cbm_b;
-
- if (d->plr) {
- cbm_len = d->plr->s->res->cache.cbm_len;
- cbm_b = d->plr->cbm;
- if (bitmap_intersects(&cbm, &cbm_b, cbm_len))
- return true;
- }
- return false;
-}
-
-/**
- * rdtgroup_pseudo_locked_in_hierarchy - Pseudo-locked region in cache hierarchy
- * @d: RDT domain under test
- *
- * The setup of a pseudo-locked region affects all cache instances within
- * the hierarchy of the region. It is thus essential to know if any
- * pseudo-locked regions exist within a cache hierarchy to prevent any
- * attempts to create new pseudo-locked regions in the same hierarchy.
- *
- * Return: true if a pseudo-locked region exists in the hierarchy of @d or
- * if it is not possible to test due to memory allocation issue,
- * false otherwise.
- */
-bool rdtgroup_pseudo_locked_in_hierarchy(struct rdt_domain *d)
-{
- cpumask_var_t cpu_with_psl;
- struct rdt_resource *r;
- struct rdt_domain *d_i;
- bool ret = false;
-
- if (!zalloc_cpumask_var(&cpu_with_psl, GFP_KERNEL))
- return true;
-
- /*
- * First determine which cpus have pseudo-locked regions
- * associated with them.
- */
- for_each_alloc_enabled_rdt_resource(r) {
- list_for_each_entry(d_i, &r->domains, list) {
- if (d_i->plr)
- cpumask_or(cpu_with_psl, cpu_with_psl,
- &d_i->cpu_mask);
- }
- }
-
- /*
- * Next test if new pseudo-locked region would intersect with
- * existing region.
- */
- if (cpumask_intersects(&d->cpu_mask, cpu_with_psl))
- ret = true;
-
- free_cpumask_var(cpu_with_psl);
- return ret;
-}
-
-/**
- * measure_cycles_lat_fn - Measure cycle latency to read pseudo-locked memory
+ * resctrl_arch_measure_cycles_lat_fn - Measure cycle latency to read
+ * pseudo-locked memory
* @_plr: pseudo-lock region to measure
*
* There is no deterministic way to test if a memory region is cached. One
@@ -868,9 +238,10 @@ bool rdtgroup_pseudo_locked_in_hierarchy(struct rdt_domain *d)
*
* Return: 0. Waiter on waitqueue will be woken on completion.
*/
-static int measure_cycles_lat_fn(void *_plr)
+int resctrl_arch_measure_cycles_lat_fn(void *_plr)
{
struct pseudo_lock_region *plr = _plr;
+ u32 saved_low, saved_high;
unsigned long i;
u64 start, end;
void *mem_r;
@@ -879,7 +250,8 @@ static int measure_cycles_lat_fn(void *_plr)
/*
* Disable hardware prefetchers.
*/
- wrmsr(MSR_MISC_FEATURE_CONTROL, prefetch_disable_bits, 0x0);
+ rdmsr(MSR_MISC_FEATURE_CONTROL, saved_low, saved_high);
+ wrmsrq(MSR_MISC_FEATURE_CONTROL, prefetch_disable_bits);
mem_r = READ_ONCE(plr->kmem);
/*
* Dummy execute of the time measurement to load the needed
@@ -895,7 +267,7 @@ static int measure_cycles_lat_fn(void *_plr)
end = rdtsc_ordered();
trace_pseudo_lock_mem_latency((u32)(end - start));
}
- wrmsr(MSR_MISC_FEATURE_CONTROL, 0x0, 0x0);
+ wrmsr(MSR_MISC_FEATURE_CONTROL, saved_low, saved_high);
local_irq_enable();
plr->thread_done = 1;
wake_up_interruptible(&plr->lock_thread_wq);
@@ -940,6 +312,7 @@ static int measure_residency_fn(struct perf_event_attr *miss_attr,
u64 hits_before = 0, hits_after = 0, miss_before = 0, miss_after = 0;
struct perf_event *miss_event, *hit_event;
int hit_pmcnum, miss_pmcnum;
+ u32 saved_low, saved_high;
unsigned int line_size;
unsigned int size;
unsigned long i;
@@ -973,7 +346,8 @@ static int measure_residency_fn(struct perf_event_attr *miss_attr,
/*
* Disable hardware prefetchers.
*/
- wrmsr(MSR_MISC_FEATURE_CONTROL, prefetch_disable_bits, 0x0);
+ rdmsr(MSR_MISC_FEATURE_CONTROL, saved_low, saved_high);
+ wrmsrq(MSR_MISC_FEATURE_CONTROL, prefetch_disable_bits);
/* Initialize rest of local variables */
/*
@@ -991,8 +365,8 @@ static int measure_residency_fn(struct perf_event_attr *miss_attr,
* used in L1 cache, second to capture accurate value that does not
* include cache misses incurred because of instruction loads.
*/
- rdpmcl(hit_pmcnum, hits_before);
- rdpmcl(miss_pmcnum, miss_before);
+ hits_before = rdpmc(hit_pmcnum);
+ miss_before = rdpmc(miss_pmcnum);
/*
* From SDM: Performing back-to-back fast reads are not guaranteed
* to be monotonic.
@@ -1000,8 +374,8 @@ static int measure_residency_fn(struct perf_event_attr *miss_attr,
* before proceeding.
*/
rmb();
- rdpmcl(hit_pmcnum, hits_before);
- rdpmcl(miss_pmcnum, miss_before);
+ hits_before = rdpmc(hit_pmcnum);
+ miss_before = rdpmc(miss_pmcnum);
/*
* Use LFENCE to ensure all previous instructions are retired
* before proceeding.
@@ -1023,15 +397,15 @@ static int measure_residency_fn(struct perf_event_attr *miss_attr,
* before proceeding.
*/
rmb();
- rdpmcl(hit_pmcnum, hits_after);
- rdpmcl(miss_pmcnum, miss_after);
+ hits_after = rdpmc(hit_pmcnum);
+ miss_after = rdpmc(miss_pmcnum);
/*
* Use LFENCE to ensure all previous instructions are retired
* before proceeding.
*/
rmb();
/* Re-enable hardware prefetchers */
- wrmsr(MSR_MISC_FEATURE_CONTROL, 0x0, 0x0);
+ wrmsr(MSR_MISC_FEATURE_CONTROL, saved_low, saved_high);
local_irq_enable();
out_hit:
perf_event_release_kernel(hit_event);
@@ -1048,7 +422,7 @@ out:
return 0;
}
-static int measure_l2_residency(void *_plr)
+int resctrl_arch_measure_l2_residency(void *_plr)
{
struct pseudo_lock_region *plr = _plr;
struct residency_counts counts = {0};
@@ -1061,9 +435,9 @@ static int measure_l2_residency(void *_plr)
* L2_HIT 02H
* L2_MISS 10H
*/
- switch (boot_cpu_data.x86_model) {
- case INTEL_FAM6_ATOM_GOLDMONT:
- case INTEL_FAM6_ATOM_GOLDMONT_PLUS:
+ switch (boot_cpu_data.x86_vfm) {
+ case INTEL_ATOM_GOLDMONT:
+ case INTEL_ATOM_GOLDMONT_PLUS:
perf_miss_attr.config = X86_CONFIG(.event = 0xd1,
.umask = 0x10);
perf_hit_attr.config = X86_CONFIG(.event = 0xd1,
@@ -1086,7 +460,7 @@ out:
return 0;
}
-static int measure_l3_residency(void *_plr)
+int resctrl_arch_measure_l3_residency(void *_plr)
{
struct pseudo_lock_region *plr = _plr;
struct residency_counts counts = {0};
@@ -1100,8 +474,8 @@ static int measure_l3_residency(void *_plr)
* MISS 41H
*/
- switch (boot_cpu_data.x86_model) {
- case INTEL_FAM6_BROADWELL_X:
+ switch (boot_cpu_data.x86_vfm) {
+ case INTEL_BROADWELL_X:
/* On BDW the hit event counts references, not hits */
perf_hit_attr.config = X86_CONFIG(.event = 0x2e,
.umask = 0x4f);
@@ -1119,7 +493,7 @@ static int measure_l3_residency(void *_plr)
*/
counts.miss_after -= counts.miss_before;
- if (boot_cpu_data.x86_model == INTEL_FAM6_BROADWELL_X) {
+ if (boot_cpu_data.x86_vfm == INTEL_BROADWELL_X) {
/*
* On BDW references and misses are counted, need to adjust.
* Sometimes the "hits" counter is a bit more than the
@@ -1141,454 +515,3 @@ out:
wake_up_interruptible(&plr->lock_thread_wq);
return 0;
}
-
-/**
- * pseudo_lock_measure_cycles - Trigger latency measure to pseudo-locked region
- * @rdtgrp: Resource group to which the pseudo-locked region belongs.
- * @sel: Selector of which measurement to perform on a pseudo-locked region.
- *
- * The measurement of latency to access a pseudo-locked region should be
- * done from a cpu that is associated with that pseudo-locked region.
- * Determine which cpu is associated with this region and start a thread on
- * that cpu to perform the measurement, wait for that thread to complete.
- *
- * Return: 0 on success, <0 on failure
- */
-static int pseudo_lock_measure_cycles(struct rdtgroup *rdtgrp, int sel)
-{
- struct pseudo_lock_region *plr = rdtgrp->plr;
- struct task_struct *thread;
- unsigned int cpu;
- int ret = -1;
-
- cpus_read_lock();
- mutex_lock(&rdtgroup_mutex);
-
- if (rdtgrp->flags & RDT_DELETED) {
- ret = -ENODEV;
- goto out;
- }
-
- if (!plr->d) {
- ret = -ENODEV;
- goto out;
- }
-
- plr->thread_done = 0;
- cpu = cpumask_first(&plr->d->cpu_mask);
- if (!cpu_online(cpu)) {
- ret = -ENODEV;
- goto out;
- }
-
- plr->cpu = cpu;
-
- if (sel == 1)
- thread = kthread_create_on_node(measure_cycles_lat_fn, plr,
- cpu_to_node(cpu),
- "pseudo_lock_measure/%u",
- cpu);
- else if (sel == 2)
- thread = kthread_create_on_node(measure_l2_residency, plr,
- cpu_to_node(cpu),
- "pseudo_lock_measure/%u",
- cpu);
- else if (sel == 3)
- thread = kthread_create_on_node(measure_l3_residency, plr,
- cpu_to_node(cpu),
- "pseudo_lock_measure/%u",
- cpu);
- else
- goto out;
-
- if (IS_ERR(thread)) {
- ret = PTR_ERR(thread);
- goto out;
- }
- kthread_bind(thread, cpu);
- wake_up_process(thread);
-
- ret = wait_event_interruptible(plr->lock_thread_wq,
- plr->thread_done == 1);
- if (ret < 0)
- goto out;
-
- ret = 0;
-
-out:
- mutex_unlock(&rdtgroup_mutex);
- cpus_read_unlock();
- return ret;
-}
-
-static ssize_t pseudo_lock_measure_trigger(struct file *file,
- const char __user *user_buf,
- size_t count, loff_t *ppos)
-{
- struct rdtgroup *rdtgrp = file->private_data;
- size_t buf_size;
- char buf[32];
- int ret;
- int sel;
-
- buf_size = min(count, (sizeof(buf) - 1));
- if (copy_from_user(buf, user_buf, buf_size))
- return -EFAULT;
-
- buf[buf_size] = '\0';
- ret = kstrtoint(buf, 10, &sel);
- if (ret == 0) {
- if (sel != 1 && sel != 2 && sel != 3)
- return -EINVAL;
- ret = debugfs_file_get(file->f_path.dentry);
- if (ret)
- return ret;
- ret = pseudo_lock_measure_cycles(rdtgrp, sel);
- if (ret == 0)
- ret = count;
- debugfs_file_put(file->f_path.dentry);
- }
-
- return ret;
-}
-
-static const struct file_operations pseudo_measure_fops = {
- .write = pseudo_lock_measure_trigger,
- .open = simple_open,
- .llseek = default_llseek,
-};
-
-/**
- * rdtgroup_pseudo_lock_create - Create a pseudo-locked region
- * @rdtgrp: resource group to which pseudo-lock region belongs
- *
- * Called when a resource group in the pseudo-locksetup mode receives a
- * valid schemata that should be pseudo-locked. Since the resource group is
- * in pseudo-locksetup mode the &struct pseudo_lock_region has already been
- * allocated and initialized with the essential information. If a failure
- * occurs the resource group remains in the pseudo-locksetup mode with the
- * &struct pseudo_lock_region associated with it, but cleared from all
- * information and ready for the user to re-attempt pseudo-locking by
- * writing the schemata again.
- *
- * Return: 0 if the pseudo-locked region was successfully pseudo-locked, <0
- * on failure. Descriptive error will be written to last_cmd_status buffer.
- */
-int rdtgroup_pseudo_lock_create(struct rdtgroup *rdtgrp)
-{
- struct pseudo_lock_region *plr = rdtgrp->plr;
- struct task_struct *thread;
- unsigned int new_minor;
- struct device *dev;
- int ret;
-
- ret = pseudo_lock_region_alloc(plr);
- if (ret < 0)
- return ret;
-
- ret = pseudo_lock_cstates_constrain(plr);
- if (ret < 0) {
- ret = -EINVAL;
- goto out_region;
- }
-
- plr->thread_done = 0;
-
- thread = kthread_create_on_node(pseudo_lock_fn, rdtgrp,
- cpu_to_node(plr->cpu),
- "pseudo_lock/%u", plr->cpu);
- if (IS_ERR(thread)) {
- ret = PTR_ERR(thread);
- rdt_last_cmd_printf("Locking thread returned error %d\n", ret);
- goto out_cstates;
- }
-
- kthread_bind(thread, plr->cpu);
- wake_up_process(thread);
-
- ret = wait_event_interruptible(plr->lock_thread_wq,
- plr->thread_done == 1);
- if (ret < 0) {
- /*
- * If the thread does not get on the CPU for whatever
- * reason and the process which sets up the region is
- * interrupted then this will leave the thread in runnable
- * state and once it gets on the CPU it will dereference
- * the cleared, but not freed, plr struct resulting in an
- * empty pseudo-locking loop.
- */
- rdt_last_cmd_puts("Locking thread interrupted\n");
- goto out_cstates;
- }
-
- ret = pseudo_lock_minor_get(&new_minor);
- if (ret < 0) {
- rdt_last_cmd_puts("Unable to obtain a new minor number\n");
- goto out_cstates;
- }
-
- /*
- * Unlock access but do not release the reference. The
- * pseudo-locked region will still be here on return.
- *
- * The mutex has to be released temporarily to avoid a potential
- * deadlock with the mm->mmap_lock which is obtained in the
- * device_create() and debugfs_create_dir() callpath below as well as
- * before the mmap() callback is called.
- */
- mutex_unlock(&rdtgroup_mutex);
-
- if (!IS_ERR_OR_NULL(debugfs_resctrl)) {
- plr->debugfs_dir = debugfs_create_dir(rdtgrp->kn->name,
- debugfs_resctrl);
- if (!IS_ERR_OR_NULL(plr->debugfs_dir))
- debugfs_create_file("pseudo_lock_measure", 0200,
- plr->debugfs_dir, rdtgrp,
- &pseudo_measure_fops);
- }
-
- dev = device_create(pseudo_lock_class, NULL,
- MKDEV(pseudo_lock_major, new_minor),
- rdtgrp, "%s", rdtgrp->kn->name);
-
- mutex_lock(&rdtgroup_mutex);
-
- if (IS_ERR(dev)) {
- ret = PTR_ERR(dev);
- rdt_last_cmd_printf("Failed to create character device: %d\n",
- ret);
- goto out_debugfs;
- }
-
- /* We released the mutex - check if group was removed while we did so */
- if (rdtgrp->flags & RDT_DELETED) {
- ret = -ENODEV;
- goto out_device;
- }
-
- plr->minor = new_minor;
-
- rdtgrp->mode = RDT_MODE_PSEUDO_LOCKED;
- closid_free(rdtgrp->closid);
- rdtgroup_kn_mode_restore(rdtgrp, "cpus", 0444);
- rdtgroup_kn_mode_restore(rdtgrp, "cpus_list", 0444);
-
- ret = 0;
- goto out;
-
-out_device:
- device_destroy(pseudo_lock_class, MKDEV(pseudo_lock_major, new_minor));
-out_debugfs:
- debugfs_remove_recursive(plr->debugfs_dir);
- pseudo_lock_minor_release(new_minor);
-out_cstates:
- pseudo_lock_cstates_relax(plr);
-out_region:
- pseudo_lock_region_clear(plr);
-out:
- return ret;
-}
-
-/**
- * rdtgroup_pseudo_lock_remove - Remove a pseudo-locked region
- * @rdtgrp: resource group to which the pseudo-locked region belongs
- *
- * The removal of a pseudo-locked region can be initiated when the resource
- * group is removed from user space via a "rmdir" from userspace or the
- * unmount of the resctrl filesystem. On removal the resource group does
- * not go back to pseudo-locksetup mode before it is removed, instead it is
- * removed directly. There is thus asymmetry with the creation where the
- * &struct pseudo_lock_region is removed here while it was not created in
- * rdtgroup_pseudo_lock_create().
- *
- * Return: void
- */
-void rdtgroup_pseudo_lock_remove(struct rdtgroup *rdtgrp)
-{
- struct pseudo_lock_region *plr = rdtgrp->plr;
-
- if (rdtgrp->mode == RDT_MODE_PSEUDO_LOCKSETUP) {
- /*
- * Default group cannot be a pseudo-locked region so we can
- * free closid here.
- */
- closid_free(rdtgrp->closid);
- goto free;
- }
-
- pseudo_lock_cstates_relax(plr);
- debugfs_remove_recursive(rdtgrp->plr->debugfs_dir);
- device_destroy(pseudo_lock_class, MKDEV(pseudo_lock_major, plr->minor));
- pseudo_lock_minor_release(plr->minor);
-
-free:
- pseudo_lock_free(rdtgrp);
-}
-
-static int pseudo_lock_dev_open(struct inode *inode, struct file *filp)
-{
- struct rdtgroup *rdtgrp;
-
- mutex_lock(&rdtgroup_mutex);
-
- rdtgrp = region_find_by_minor(iminor(inode));
- if (!rdtgrp) {
- mutex_unlock(&rdtgroup_mutex);
- return -ENODEV;
- }
-
- filp->private_data = rdtgrp;
- atomic_inc(&rdtgrp->waitcount);
- /* Perform a non-seekable open - llseek is not supported */
- filp->f_mode &= ~(FMODE_LSEEK | FMODE_PREAD | FMODE_PWRITE);
-
- mutex_unlock(&rdtgroup_mutex);
-
- return 0;
-}
-
-static int pseudo_lock_dev_release(struct inode *inode, struct file *filp)
-{
- struct rdtgroup *rdtgrp;
-
- mutex_lock(&rdtgroup_mutex);
- rdtgrp = filp->private_data;
- WARN_ON(!rdtgrp);
- if (!rdtgrp) {
- mutex_unlock(&rdtgroup_mutex);
- return -ENODEV;
- }
- filp->private_data = NULL;
- atomic_dec(&rdtgrp->waitcount);
- mutex_unlock(&rdtgroup_mutex);
- return 0;
-}
-
-static int pseudo_lock_dev_mremap(struct vm_area_struct *area)
-{
- /* Not supported */
- return -EINVAL;
-}
-
-static const struct vm_operations_struct pseudo_mmap_ops = {
- .mremap = pseudo_lock_dev_mremap,
-};
-
-static int pseudo_lock_dev_mmap(struct file *filp, struct vm_area_struct *vma)
-{
- unsigned long vsize = vma->vm_end - vma->vm_start;
- unsigned long off = vma->vm_pgoff << PAGE_SHIFT;
- struct pseudo_lock_region *plr;
- struct rdtgroup *rdtgrp;
- unsigned long physical;
- unsigned long psize;
-
- mutex_lock(&rdtgroup_mutex);
-
- rdtgrp = filp->private_data;
- WARN_ON(!rdtgrp);
- if (!rdtgrp) {
- mutex_unlock(&rdtgroup_mutex);
- return -ENODEV;
- }
-
- plr = rdtgrp->plr;
-
- if (!plr->d) {
- mutex_unlock(&rdtgroup_mutex);
- return -ENODEV;
- }
-
- /*
- * Task is required to run with affinity to the cpus associated
- * with the pseudo-locked region. If this is not the case the task
- * may be scheduled elsewhere and invalidate entries in the
- * pseudo-locked region.
- */
- if (!cpumask_subset(current->cpus_ptr, &plr->d->cpu_mask)) {
- mutex_unlock(&rdtgroup_mutex);
- return -EINVAL;
- }
-
- physical = __pa(plr->kmem) >> PAGE_SHIFT;
- psize = plr->size - off;
-
- if (off > plr->size) {
- mutex_unlock(&rdtgroup_mutex);
- return -ENOSPC;
- }
-
- /*
- * Ensure changes are carried directly to the memory being mapped,
- * do not allow copy-on-write mapping.
- */
- if (!(vma->vm_flags & VM_SHARED)) {
- mutex_unlock(&rdtgroup_mutex);
- return -EINVAL;
- }
-
- if (vsize > psize) {
- mutex_unlock(&rdtgroup_mutex);
- return -ENOSPC;
- }
-
- memset(plr->kmem + off, 0, vsize);
-
- if (remap_pfn_range(vma, vma->vm_start, physical + vma->vm_pgoff,
- vsize, vma->vm_page_prot)) {
- mutex_unlock(&rdtgroup_mutex);
- return -EAGAIN;
- }
- vma->vm_ops = &pseudo_mmap_ops;
- mutex_unlock(&rdtgroup_mutex);
- return 0;
-}
-
-static const struct file_operations pseudo_lock_dev_fops = {
- .owner = THIS_MODULE,
- .llseek = no_llseek,
- .read = NULL,
- .write = NULL,
- .open = pseudo_lock_dev_open,
- .release = pseudo_lock_dev_release,
- .mmap = pseudo_lock_dev_mmap,
-};
-
-static char *pseudo_lock_devnode(struct device *dev, umode_t *mode)
-{
- struct rdtgroup *rdtgrp;
-
- rdtgrp = dev_get_drvdata(dev);
- if (mode)
- *mode = 0600;
- return kasprintf(GFP_KERNEL, "pseudo_lock/%s", rdtgrp->kn->name);
-}
-
-int rdt_pseudo_lock_init(void)
-{
- int ret;
-
- ret = register_chrdev(0, "pseudo_lock", &pseudo_lock_dev_fops);
- if (ret < 0)
- return ret;
-
- pseudo_lock_major = ret;
-
- pseudo_lock_class = class_create(THIS_MODULE, "pseudo_lock");
- if (IS_ERR(pseudo_lock_class)) {
- ret = PTR_ERR(pseudo_lock_class);
- unregister_chrdev(pseudo_lock_major, "pseudo_lock");
- return ret;
- }
-
- pseudo_lock_class->devnode = pseudo_lock_devnode;
- return 0;
-}
-
-void rdt_pseudo_lock_release(void)
-{
- class_destroy(pseudo_lock_class);
- pseudo_lock_class = NULL;
- unregister_chrdev(pseudo_lock_major, "pseudo_lock");
- pseudo_lock_major = 0;
-}
diff --git a/arch/x86/kernel/cpu/resctrl/pseudo_lock_event.h b/arch/x86/kernel/cpu/resctrl/pseudo_lock_trace.h
index 428ebbd4270b..7c8aef08010f 100644
--- a/arch/x86/kernel/cpu/resctrl/pseudo_lock_event.h
+++ b/arch/x86/kernel/cpu/resctrl/pseudo_lock_trace.h
@@ -2,8 +2,8 @@
#undef TRACE_SYSTEM
#define TRACE_SYSTEM resctrl
-#if !defined(_TRACE_PSEUDO_LOCK_H) || defined(TRACE_HEADER_MULTI_READ)
-#define _TRACE_PSEUDO_LOCK_H
+#if !defined(_X86_RESCTRL_PSEUDO_LOCK_TRACE_H) || defined(TRACE_HEADER_MULTI_READ)
+#define _X86_RESCTRL_PSEUDO_LOCK_TRACE_H
#include <linux/tracepoint.h>
@@ -35,9 +35,11 @@ TRACE_EVENT(pseudo_lock_l3,
TP_printk("hits=%llu miss=%llu",
__entry->l3_hits, __entry->l3_miss));
-#endif /* _TRACE_PSEUDO_LOCK_H */
+#endif /* _X86_RESCTRL_PSEUDO_LOCK_TRACE_H */
#undef TRACE_INCLUDE_PATH
#define TRACE_INCLUDE_PATH .
-#define TRACE_INCLUDE_FILE pseudo_lock_event
+
+#define TRACE_INCLUDE_FILE pseudo_lock_trace
+
#include <trace/define_trace.h>
diff --git a/arch/x86/kernel/cpu/resctrl/rdtgroup.c b/arch/x86/kernel/cpu/resctrl/rdtgroup.c
index f276aff521e8..885026468440 100644
--- a/arch/x86/kernel/cpu/resctrl/rdtgroup.c
+++ b/arch/x86/kernel/cpu/resctrl/rdtgroup.c
@@ -12,13 +12,13 @@
#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
-#include <linux/cacheinfo.h>
#include <linux/cpu.h>
#include <linux/debugfs.h>
#include <linux/fs.h>
#include <linux/fs_parser.h>
#include <linux/sysfs.h>
#include <linux/kernfs.h>
+#include <linux/resctrl.h>
#include <linux/seq_buf.h>
#include <linux/seq_file.h>
#include <linux/sched/signal.h>
@@ -29,284 +29,28 @@
#include <uapi/linux/magic.h>
-#include <asm/resctrl.h>
+#include <asm/msr.h>
#include "internal.h"
DEFINE_STATIC_KEY_FALSE(rdt_enable_key);
-DEFINE_STATIC_KEY_FALSE(rdt_mon_enable_key);
-DEFINE_STATIC_KEY_FALSE(rdt_alloc_enable_key);
-static struct kernfs_root *rdt_root;
-struct rdtgroup rdtgroup_default;
-LIST_HEAD(rdt_all_groups);
-
-/* list of entries for the schemata file */
-LIST_HEAD(resctrl_schema_all);
-
-/* Kernel fs node for "info" directory under root */
-static struct kernfs_node *kn_info;
-
-/* Kernel fs node for "mon_groups" directory under root */
-static struct kernfs_node *kn_mongrp;
-
-/* Kernel fs node for "mon_data" directory under root */
-static struct kernfs_node *kn_mondata;
-
-static struct seq_buf last_cmd_status;
-static char last_cmd_status_buf[512];
-
-struct dentry *debugfs_resctrl;
-
-void rdt_last_cmd_clear(void)
-{
- lockdep_assert_held(&rdtgroup_mutex);
- seq_buf_clear(&last_cmd_status);
-}
-
-void rdt_last_cmd_puts(const char *s)
-{
- lockdep_assert_held(&rdtgroup_mutex);
- seq_buf_puts(&last_cmd_status, s);
-}
-
-void rdt_last_cmd_printf(const char *fmt, ...)
-{
- va_list ap;
-
- va_start(ap, fmt);
- lockdep_assert_held(&rdtgroup_mutex);
- seq_buf_vprintf(&last_cmd_status, fmt, ap);
- va_end(ap);
-}
-
-/*
- * Trivial allocator for CLOSIDs. Since h/w only supports a small number,
- * we can keep a bitmap of free CLOSIDs in a single integer.
- *
- * Using a global CLOSID across all resources has some advantages and
- * some drawbacks:
- * + We can simply set "current->closid" to assign a task to a resource
- * group.
- * + Context switch code can avoid extra memory references deciding which
- * CLOSID to load into the PQR_ASSOC MSR
- * - We give up some options in configuring resource groups across multi-socket
- * systems.
- * - Our choices on how to configure each resource become progressively more
- * limited as the number of resources grows.
- */
-static int closid_free_map;
-static int closid_free_map_len;
-
-int closids_supported(void)
-{
- return closid_free_map_len;
-}
-
-static void closid_init(void)
-{
- struct resctrl_schema *s;
- u32 rdt_min_closid = 32;
-
- /* Compute rdt_min_closid across all resources */
- list_for_each_entry(s, &resctrl_schema_all, list)
- rdt_min_closid = min(rdt_min_closid, s->num_closid);
-
- closid_free_map = BIT_MASK(rdt_min_closid) - 1;
-
- /* CLOSID 0 is always reserved for the default group */
- closid_free_map &= ~1;
- closid_free_map_len = rdt_min_closid;
-}
-
-static int closid_alloc(void)
-{
- u32 closid = ffs(closid_free_map);
-
- if (closid == 0)
- return -ENOSPC;
- closid--;
- closid_free_map &= ~(1 << closid);
-
- return closid;
-}
-
-void closid_free(int closid)
-{
- closid_free_map |= 1 << closid;
-}
-
-/**
- * closid_allocated - test if provided closid is in use
- * @closid: closid to be tested
- *
- * Return: true if @closid is currently associated with a resource group,
- * false if @closid is free
- */
-static bool closid_allocated(unsigned int closid)
-{
- return (closid_free_map & (1 << closid)) == 0;
-}
-
-/**
- * rdtgroup_mode_by_closid - Return mode of resource group with closid
- * @closid: closid if the resource group
- *
- * Each resource group is associated with a @closid. Here the mode
- * of a resource group can be queried by searching for it using its closid.
- *
- * Return: mode as &enum rdtgrp_mode of resource group with closid @closid
- */
-enum rdtgrp_mode rdtgroup_mode_by_closid(int closid)
-{
- struct rdtgroup *rdtgrp;
-
- list_for_each_entry(rdtgrp, &rdt_all_groups, rdtgroup_list) {
- if (rdtgrp->closid == closid)
- return rdtgrp->mode;
- }
-
- return RDT_NUM_MODES;
-}
-
-static const char * const rdt_mode_str[] = {
- [RDT_MODE_SHAREABLE] = "shareable",
- [RDT_MODE_EXCLUSIVE] = "exclusive",
- [RDT_MODE_PSEUDO_LOCKSETUP] = "pseudo-locksetup",
- [RDT_MODE_PSEUDO_LOCKED] = "pseudo-locked",
-};
-
-/**
- * rdtgroup_mode_str - Return the string representation of mode
- * @mode: the resource group mode as &enum rdtgroup_mode
- *
- * Return: string representation of valid mode, "unknown" otherwise
- */
-static const char *rdtgroup_mode_str(enum rdtgrp_mode mode)
-{
- if (mode < RDT_MODE_SHAREABLE || mode >= RDT_NUM_MODES)
- return "unknown";
-
- return rdt_mode_str[mode];
-}
-
-/* set uid and gid of rdtgroup dirs and files to that of the creator */
-static int rdtgroup_kn_set_ugid(struct kernfs_node *kn)
-{
- struct iattr iattr = { .ia_valid = ATTR_UID | ATTR_GID,
- .ia_uid = current_fsuid(),
- .ia_gid = current_fsgid(), };
-
- if (uid_eq(iattr.ia_uid, GLOBAL_ROOT_UID) &&
- gid_eq(iattr.ia_gid, GLOBAL_ROOT_GID))
- return 0;
-
- return kernfs_setattr(kn, &iattr);
-}
-
-static int rdtgroup_add_file(struct kernfs_node *parent_kn, struct rftype *rft)
-{
- struct kernfs_node *kn;
- int ret;
-
- kn = __kernfs_create_file(parent_kn, rft->name, rft->mode,
- GLOBAL_ROOT_UID, GLOBAL_ROOT_GID,
- 0, rft->kf_ops, rft, NULL, NULL);
- if (IS_ERR(kn))
- return PTR_ERR(kn);
-
- ret = rdtgroup_kn_set_ugid(kn);
- if (ret) {
- kernfs_remove(kn);
- return ret;
- }
-
- return 0;
-}
-
-static int rdtgroup_seqfile_show(struct seq_file *m, void *arg)
-{
- struct kernfs_open_file *of = m->private;
- struct rftype *rft = of->kn->priv;
-
- if (rft->seq_show)
- return rft->seq_show(of, m, arg);
- return 0;
-}
-
-static ssize_t rdtgroup_file_write(struct kernfs_open_file *of, char *buf,
- size_t nbytes, loff_t off)
-{
- struct rftype *rft = of->kn->priv;
-
- if (rft->write)
- return rft->write(of, buf, nbytes, off);
-
- return -EINVAL;
-}
-
-static const struct kernfs_ops rdtgroup_kf_single_ops = {
- .atomic_write_len = PAGE_SIZE,
- .write = rdtgroup_file_write,
- .seq_show = rdtgroup_seqfile_show,
-};
-
-static const struct kernfs_ops kf_mondata_ops = {
- .atomic_write_len = PAGE_SIZE,
- .seq_show = rdtgroup_mondata_show,
-};
-
-static bool is_cpu_list(struct kernfs_open_file *of)
-{
- struct rftype *rft = of->kn->priv;
-
- return rft->flags & RFTYPE_FLAGS_CPUS_LIST;
-}
-
-static int rdtgroup_cpus_show(struct kernfs_open_file *of,
- struct seq_file *s, void *v)
-{
- struct rdtgroup *rdtgrp;
- struct cpumask *mask;
- int ret = 0;
- rdtgrp = rdtgroup_kn_lock_live(of->kn);
-
- if (rdtgrp) {
- if (rdtgrp->mode == RDT_MODE_PSEUDO_LOCKED) {
- if (!rdtgrp->plr->d) {
- rdt_last_cmd_clear();
- rdt_last_cmd_puts("Cache domain offline\n");
- ret = -ENODEV;
- } else {
- mask = &rdtgrp->plr->d->cpu_mask;
- seq_printf(s, is_cpu_list(of) ?
- "%*pbl\n" : "%*pb\n",
- cpumask_pr_args(mask));
- }
- } else {
- seq_printf(s, is_cpu_list(of) ? "%*pbl\n" : "%*pb\n",
- cpumask_pr_args(&rdtgrp->cpu_mask));
- }
- } else {
- ret = -ENOENT;
- }
- rdtgroup_kn_unlock(of->kn);
+DEFINE_STATIC_KEY_FALSE(rdt_mon_enable_key);
- return ret;
-}
+DEFINE_STATIC_KEY_FALSE(rdt_alloc_enable_key);
/*
- * This is safe against resctrl_sched_in() called from __switch_to()
+ * This is safe against resctrl_arch_sched_in() called from __switch_to()
* because __switch_to() is executed with interrupts disabled. A local call
* from update_closid_rmid() is protected against __switch_to() because
* preemption is disabled.
*/
-static void update_cpu_closid_rmid(void *info)
+void resctrl_arch_sync_cpu_closid_rmid(void *info)
{
- struct rdtgroup *r = info;
+ struct resctrl_cpu_defaults *r = info;
if (r) {
this_cpu_write(pqr_state.default_closid, r->closid);
- this_cpu_write(pqr_state.default_rmid, r->mon.rmid);
+ this_cpu_write(pqr_state.default_rmid, r->rmid);
}
/*
@@ -314,1533 +58,88 @@ static void update_cpu_closid_rmid(void *info)
* executing task might have its own closid selected. Just reuse
* the context switch code.
*/
- resctrl_sched_in();
-}
-
-/*
- * Update the PGR_ASSOC MSR on all cpus in @cpu_mask,
- *
- * Per task closids/rmids must have been set up before calling this function.
- */
-static void
-update_closid_rmid(const struct cpumask *cpu_mask, struct rdtgroup *r)
-{
- int cpu = get_cpu();
-
- if (cpumask_test_cpu(cpu, cpu_mask))
- update_cpu_closid_rmid(r);
- smp_call_function_many(cpu_mask, update_cpu_closid_rmid, r, 1);
- put_cpu();
-}
-
-static int cpus_mon_write(struct rdtgroup *rdtgrp, cpumask_var_t newmask,
- cpumask_var_t tmpmask)
-{
- struct rdtgroup *prgrp = rdtgrp->mon.parent, *crgrp;
- struct list_head *head;
-
- /* Check whether cpus belong to parent ctrl group */
- cpumask_andnot(tmpmask, newmask, &prgrp->cpu_mask);
- if (!cpumask_empty(tmpmask)) {
- rdt_last_cmd_puts("Can only add CPUs to mongroup that belong to parent\n");
- return -EINVAL;
- }
-
- /* Check whether cpus are dropped from this group */
- cpumask_andnot(tmpmask, &rdtgrp->cpu_mask, newmask);
- if (!cpumask_empty(tmpmask)) {
- /* Give any dropped cpus to parent rdtgroup */
- cpumask_or(&prgrp->cpu_mask, &prgrp->cpu_mask, tmpmask);
- update_closid_rmid(tmpmask, prgrp);
- }
-
- /*
- * If we added cpus, remove them from previous group that owned them
- * and update per-cpu rmid
- */
- cpumask_andnot(tmpmask, newmask, &rdtgrp->cpu_mask);
- if (!cpumask_empty(tmpmask)) {
- head = &prgrp->mon.crdtgrp_list;
- list_for_each_entry(crgrp, head, mon.crdtgrp_list) {
- if (crgrp == rdtgrp)
- continue;
- cpumask_andnot(&crgrp->cpu_mask, &crgrp->cpu_mask,
- tmpmask);
- }
- update_closid_rmid(tmpmask, rdtgrp);
- }
-
- /* Done pushing/pulling - update this group with new mask */
- cpumask_copy(&rdtgrp->cpu_mask, newmask);
-
- return 0;
-}
-
-static void cpumask_rdtgrp_clear(struct rdtgroup *r, struct cpumask *m)
-{
- struct rdtgroup *crgrp;
-
- cpumask_andnot(&r->cpu_mask, &r->cpu_mask, m);
- /* update the child mon group masks as well*/
- list_for_each_entry(crgrp, &r->mon.crdtgrp_list, mon.crdtgrp_list)
- cpumask_and(&crgrp->cpu_mask, &r->cpu_mask, &crgrp->cpu_mask);
-}
-
-static int cpus_ctrl_write(struct rdtgroup *rdtgrp, cpumask_var_t newmask,
- cpumask_var_t tmpmask, cpumask_var_t tmpmask1)
-{
- struct rdtgroup *r, *crgrp;
- struct list_head *head;
-
- /* Check whether cpus are dropped from this group */
- cpumask_andnot(tmpmask, &rdtgrp->cpu_mask, newmask);
- if (!cpumask_empty(tmpmask)) {
- /* Can't drop from default group */
- if (rdtgrp == &rdtgroup_default) {
- rdt_last_cmd_puts("Can't drop CPUs from default group\n");
- return -EINVAL;
- }
-
- /* Give any dropped cpus to rdtgroup_default */
- cpumask_or(&rdtgroup_default.cpu_mask,
- &rdtgroup_default.cpu_mask, tmpmask);
- update_closid_rmid(tmpmask, &rdtgroup_default);
- }
-
- /*
- * If we added cpus, remove them from previous group and
- * the prev group's child groups that owned them
- * and update per-cpu closid/rmid.
- */
- cpumask_andnot(tmpmask, newmask, &rdtgrp->cpu_mask);
- if (!cpumask_empty(tmpmask)) {
- list_for_each_entry(r, &rdt_all_groups, rdtgroup_list) {
- if (r == rdtgrp)
- continue;
- cpumask_and(tmpmask1, &r->cpu_mask, tmpmask);
- if (!cpumask_empty(tmpmask1))
- cpumask_rdtgrp_clear(r, tmpmask1);
- }
- update_closid_rmid(tmpmask, rdtgrp);
- }
-
- /* Done pushing/pulling - update this group with new mask */
- cpumask_copy(&rdtgrp->cpu_mask, newmask);
-
- /*
- * Clear child mon group masks since there is a new parent mask
- * now and update the rmid for the cpus the child lost.
- */
- head = &rdtgrp->mon.crdtgrp_list;
- list_for_each_entry(crgrp, head, mon.crdtgrp_list) {
- cpumask_and(tmpmask, &rdtgrp->cpu_mask, &crgrp->cpu_mask);
- update_closid_rmid(tmpmask, rdtgrp);
- cpumask_clear(&crgrp->cpu_mask);
- }
-
- return 0;
+ resctrl_arch_sched_in(current);
}
-static ssize_t rdtgroup_cpus_write(struct kernfs_open_file *of,
- char *buf, size_t nbytes, loff_t off)
-{
- cpumask_var_t tmpmask, newmask, tmpmask1;
- struct rdtgroup *rdtgrp;
- int ret;
-
- if (!buf)
- return -EINVAL;
-
- if (!zalloc_cpumask_var(&tmpmask, GFP_KERNEL))
- return -ENOMEM;
- if (!zalloc_cpumask_var(&newmask, GFP_KERNEL)) {
- free_cpumask_var(tmpmask);
- return -ENOMEM;
- }
- if (!zalloc_cpumask_var(&tmpmask1, GFP_KERNEL)) {
- free_cpumask_var(tmpmask);
- free_cpumask_var(newmask);
- return -ENOMEM;
- }
-
- rdtgrp = rdtgroup_kn_lock_live(of->kn);
- if (!rdtgrp) {
- ret = -ENOENT;
- goto unlock;
- }
-
- if (rdtgrp->mode == RDT_MODE_PSEUDO_LOCKED ||
- rdtgrp->mode == RDT_MODE_PSEUDO_LOCKSETUP) {
- ret = -EINVAL;
- rdt_last_cmd_puts("Pseudo-locking in progress\n");
- goto unlock;
- }
-
- if (is_cpu_list(of))
- ret = cpulist_parse(buf, newmask);
- else
- ret = cpumask_parse(buf, newmask);
-
- if (ret) {
- rdt_last_cmd_puts("Bad CPU list/mask\n");
- goto unlock;
- }
-
- /* check that user didn't specify any offline cpus */
- cpumask_andnot(tmpmask, newmask, cpu_online_mask);
- if (!cpumask_empty(tmpmask)) {
- ret = -EINVAL;
- rdt_last_cmd_puts("Can only assign online CPUs\n");
- goto unlock;
- }
-
- if (rdtgrp->type == RDTCTRL_GROUP)
- ret = cpus_ctrl_write(rdtgrp, newmask, tmpmask, tmpmask1);
- else if (rdtgrp->type == RDTMON_GROUP)
- ret = cpus_mon_write(rdtgrp, newmask, tmpmask);
- else
- ret = -EINVAL;
-
-unlock:
- rdtgroup_kn_unlock(of->kn);
- free_cpumask_var(tmpmask);
- free_cpumask_var(newmask);
- free_cpumask_var(tmpmask1);
-
- return ret ?: nbytes;
-}
+#define INVALID_CONFIG_INDEX UINT_MAX
/**
- * rdtgroup_remove - the helper to remove resource group safely
- * @rdtgrp: resource group to remove
- *
- * On resource group creation via a mkdir, an extra kernfs_node reference is
- * taken to ensure that the rdtgroup structure remains accessible for the
- * rdtgroup_kn_unlock() calls where it is removed.
- *
- * Drop the extra reference here, then free the rdtgroup structure.
+ * mon_event_config_index_get - get the hardware index for the
+ * configurable event
+ * @evtid: event id.
*
- * Return: void
+ * Return: 0 for evtid == QOS_L3_MBM_TOTAL_EVENT_ID
+ * 1 for evtid == QOS_L3_MBM_LOCAL_EVENT_ID
+ * INVALID_CONFIG_INDEX for invalid evtid
*/
-static void rdtgroup_remove(struct rdtgroup *rdtgrp)
-{
- kernfs_put(rdtgrp->kn);
- kfree(rdtgrp);
-}
-
-static void _update_task_closid_rmid(void *task)
-{
- /*
- * If the task is still current on this CPU, update PQR_ASSOC MSR.
- * Otherwise, the MSR is updated when the task is scheduled in.
- */
- if (task == current)
- resctrl_sched_in();
-}
-
-static void update_task_closid_rmid(struct task_struct *t)
-{
- if (IS_ENABLED(CONFIG_SMP) && task_curr(t))
- smp_call_function_single(task_cpu(t), _update_task_closid_rmid, t, 1);
- else
- _update_task_closid_rmid(t);
-}
-
-static int __rdtgroup_move_task(struct task_struct *tsk,
- struct rdtgroup *rdtgrp)
+static inline unsigned int mon_event_config_index_get(u32 evtid)
{
- /* If the task is already in rdtgrp, no need to move the task. */
- if ((rdtgrp->type == RDTCTRL_GROUP && tsk->closid == rdtgrp->closid &&
- tsk->rmid == rdtgrp->mon.rmid) ||
- (rdtgrp->type == RDTMON_GROUP && tsk->rmid == rdtgrp->mon.rmid &&
- tsk->closid == rdtgrp->mon.parent->closid))
+ switch (evtid) {
+ case QOS_L3_MBM_TOTAL_EVENT_ID:
return 0;
-
- /*
- * Set the task's closid/rmid before the PQR_ASSOC MSR can be
- * updated by them.
- *
- * For ctrl_mon groups, move both closid and rmid.
- * For monitor groups, can move the tasks only from
- * their parent CTRL group.
- */
-
- if (rdtgrp->type == RDTCTRL_GROUP) {
- WRITE_ONCE(tsk->closid, rdtgrp->closid);
- WRITE_ONCE(tsk->rmid, rdtgrp->mon.rmid);
- } else if (rdtgrp->type == RDTMON_GROUP) {
- if (rdtgrp->mon.parent->closid == tsk->closid) {
- WRITE_ONCE(tsk->rmid, rdtgrp->mon.rmid);
- } else {
- rdt_last_cmd_puts("Can't move task to different control group\n");
- return -EINVAL;
- }
- }
-
- /*
- * Ensure the task's closid and rmid are written before determining if
- * the task is current that will decide if it will be interrupted.
- */
- barrier();
-
- /*
- * By now, the task's closid and rmid are set. If the task is current
- * on a CPU, the PQR_ASSOC MSR needs to be updated to make the resource
- * group go into effect. If the task is not current, the MSR will be
- * updated when the task is scheduled in.
- */
- update_task_closid_rmid(tsk);
-
- return 0;
-}
-
-static bool is_closid_match(struct task_struct *t, struct rdtgroup *r)
-{
- return (rdt_alloc_capable &&
- (r->type == RDTCTRL_GROUP) && (t->closid == r->closid));
-}
-
-static bool is_rmid_match(struct task_struct *t, struct rdtgroup *r)
-{
- return (rdt_mon_capable &&
- (r->type == RDTMON_GROUP) && (t->rmid == r->mon.rmid));
-}
-
-/**
- * rdtgroup_tasks_assigned - Test if tasks have been assigned to resource group
- * @r: Resource group
- *
- * Return: 1 if tasks have been assigned to @r, 0 otherwise
- */
-int rdtgroup_tasks_assigned(struct rdtgroup *r)
-{
- struct task_struct *p, *t;
- int ret = 0;
-
- lockdep_assert_held(&rdtgroup_mutex);
-
- rcu_read_lock();
- for_each_process_thread(p, t) {
- if (is_closid_match(t, r) || is_rmid_match(t, r)) {
- ret = 1;
- break;
- }
- }
- rcu_read_unlock();
-
- return ret;
-}
-
-static int rdtgroup_task_write_permission(struct task_struct *task,
- struct kernfs_open_file *of)
-{
- const struct cred *tcred = get_task_cred(task);
- const struct cred *cred = current_cred();
- int ret = 0;
-
- /*
- * Even if we're attaching all tasks in the thread group, we only
- * need to check permissions on one of them.
- */
- if (!uid_eq(cred->euid, GLOBAL_ROOT_UID) &&
- !uid_eq(cred->euid, tcred->uid) &&
- !uid_eq(cred->euid, tcred->suid)) {
- rdt_last_cmd_printf("No permission to move task %d\n", task->pid);
- ret = -EPERM;
- }
-
- put_cred(tcred);
- return ret;
-}
-
-static int rdtgroup_move_task(pid_t pid, struct rdtgroup *rdtgrp,
- struct kernfs_open_file *of)
-{
- struct task_struct *tsk;
- int ret;
-
- rcu_read_lock();
- if (pid) {
- tsk = find_task_by_vpid(pid);
- if (!tsk) {
- rcu_read_unlock();
- rdt_last_cmd_printf("No task %d\n", pid);
- return -ESRCH;
- }
- } else {
- tsk = current;
- }
-
- get_task_struct(tsk);
- rcu_read_unlock();
-
- ret = rdtgroup_task_write_permission(tsk, of);
- if (!ret)
- ret = __rdtgroup_move_task(tsk, rdtgrp);
-
- put_task_struct(tsk);
- return ret;
-}
-
-static ssize_t rdtgroup_tasks_write(struct kernfs_open_file *of,
- char *buf, size_t nbytes, loff_t off)
-{
- struct rdtgroup *rdtgrp;
- int ret = 0;
- pid_t pid;
-
- if (kstrtoint(strstrip(buf), 0, &pid) || pid < 0)
- return -EINVAL;
- rdtgrp = rdtgroup_kn_lock_live(of->kn);
- if (!rdtgrp) {
- rdtgroup_kn_unlock(of->kn);
- return -ENOENT;
- }
- rdt_last_cmd_clear();
-
- if (rdtgrp->mode == RDT_MODE_PSEUDO_LOCKED ||
- rdtgrp->mode == RDT_MODE_PSEUDO_LOCKSETUP) {
- ret = -EINVAL;
- rdt_last_cmd_puts("Pseudo-locking in progress\n");
- goto unlock;
- }
-
- ret = rdtgroup_move_task(pid, rdtgrp, of);
-
-unlock:
- rdtgroup_kn_unlock(of->kn);
-
- return ret ?: nbytes;
-}
-
-static void show_rdt_tasks(struct rdtgroup *r, struct seq_file *s)
-{
- struct task_struct *p, *t;
-
- rcu_read_lock();
- for_each_process_thread(p, t) {
- if (is_closid_match(t, r) || is_rmid_match(t, r))
- seq_printf(s, "%d\n", t->pid);
- }
- rcu_read_unlock();
-}
-
-static int rdtgroup_tasks_show(struct kernfs_open_file *of,
- struct seq_file *s, void *v)
-{
- struct rdtgroup *rdtgrp;
- int ret = 0;
-
- rdtgrp = rdtgroup_kn_lock_live(of->kn);
- if (rdtgrp)
- show_rdt_tasks(rdtgrp, s);
- else
- ret = -ENOENT;
- rdtgroup_kn_unlock(of->kn);
-
- return ret;
-}
-
-#ifdef CONFIG_PROC_CPU_RESCTRL
-
-/*
- * A task can only be part of one resctrl control group and of one monitor
- * group which is associated to that control group.
- *
- * 1) res:
- * mon:
- *
- * resctrl is not available.
- *
- * 2) res:/
- * mon:
- *
- * Task is part of the root resctrl control group, and it is not associated
- * to any monitor group.
- *
- * 3) res:/
- * mon:mon0
- *
- * Task is part of the root resctrl control group and monitor group mon0.
- *
- * 4) res:group0
- * mon:
- *
- * Task is part of resctrl control group group0, and it is not associated
- * to any monitor group.
- *
- * 5) res:group0
- * mon:mon1
- *
- * Task is part of resctrl control group group0 and monitor group mon1.
- */
-int proc_resctrl_show(struct seq_file *s, struct pid_namespace *ns,
- struct pid *pid, struct task_struct *tsk)
-{
- struct rdtgroup *rdtg;
- int ret = 0;
-
- mutex_lock(&rdtgroup_mutex);
-
- /* Return empty if resctrl has not been mounted. */
- if (!static_branch_unlikely(&rdt_enable_key)) {
- seq_puts(s, "res:\nmon:\n");
- goto unlock;
- }
-
- list_for_each_entry(rdtg, &rdt_all_groups, rdtgroup_list) {
- struct rdtgroup *crg;
-
- /*
- * Task information is only relevant for shareable
- * and exclusive groups.
- */
- if (rdtg->mode != RDT_MODE_SHAREABLE &&
- rdtg->mode != RDT_MODE_EXCLUSIVE)
- continue;
-
- if (rdtg->closid != tsk->closid)
- continue;
-
- seq_printf(s, "res:%s%s\n", (rdtg == &rdtgroup_default) ? "/" : "",
- rdtg->kn->name);
- seq_puts(s, "mon:");
- list_for_each_entry(crg, &rdtg->mon.crdtgrp_list,
- mon.crdtgrp_list) {
- if (tsk->rmid != crg->mon.rmid)
- continue;
- seq_printf(s, "%s", crg->kn->name);
- break;
- }
- seq_putc(s, '\n');
- goto unlock;
- }
- /*
- * The above search should succeed. Otherwise return
- * with an error.
- */
- ret = -ENOENT;
-unlock:
- mutex_unlock(&rdtgroup_mutex);
-
- return ret;
-}
-#endif
-
-static int rdt_last_cmd_status_show(struct kernfs_open_file *of,
- struct seq_file *seq, void *v)
-{
- int len;
-
- mutex_lock(&rdtgroup_mutex);
- len = seq_buf_used(&last_cmd_status);
- if (len)
- seq_printf(seq, "%.*s", len, last_cmd_status_buf);
- else
- seq_puts(seq, "ok\n");
- mutex_unlock(&rdtgroup_mutex);
- return 0;
-}
-
-static int rdt_num_closids_show(struct kernfs_open_file *of,
- struct seq_file *seq, void *v)
-{
- struct resctrl_schema *s = of->kn->parent->priv;
-
- seq_printf(seq, "%u\n", s->num_closid);
- return 0;
-}
-
-static int rdt_default_ctrl_show(struct kernfs_open_file *of,
- struct seq_file *seq, void *v)
-{
- struct resctrl_schema *s = of->kn->parent->priv;
- struct rdt_resource *r = s->res;
-
- seq_printf(seq, "%x\n", r->default_ctrl);
- return 0;
-}
-
-static int rdt_min_cbm_bits_show(struct kernfs_open_file *of,
- struct seq_file *seq, void *v)
-{
- struct resctrl_schema *s = of->kn->parent->priv;
- struct rdt_resource *r = s->res;
-
- seq_printf(seq, "%u\n", r->cache.min_cbm_bits);
- return 0;
-}
-
-static int rdt_shareable_bits_show(struct kernfs_open_file *of,
- struct seq_file *seq, void *v)
-{
- struct resctrl_schema *s = of->kn->parent->priv;
- struct rdt_resource *r = s->res;
-
- seq_printf(seq, "%x\n", r->cache.shareable_bits);
- return 0;
-}
-
-/**
- * rdt_bit_usage_show - Display current usage of resources
- *
- * A domain is a shared resource that can now be allocated differently. Here
- * we display the current regions of the domain as an annotated bitmask.
- * For each domain of this resource its allocation bitmask
- * is annotated as below to indicate the current usage of the corresponding bit:
- * 0 - currently unused
- * X - currently available for sharing and used by software and hardware
- * H - currently used by hardware only but available for software use
- * S - currently used and shareable by software only
- * E - currently used exclusively by one resource group
- * P - currently pseudo-locked by one resource group
- */
-static int rdt_bit_usage_show(struct kernfs_open_file *of,
- struct seq_file *seq, void *v)
-{
- struct resctrl_schema *s = of->kn->parent->priv;
- /*
- * Use unsigned long even though only 32 bits are used to ensure
- * test_bit() is used safely.
- */
- unsigned long sw_shareable = 0, hw_shareable = 0;
- unsigned long exclusive = 0, pseudo_locked = 0;
- struct rdt_resource *r = s->res;
- struct rdt_domain *dom;
- int i, hwb, swb, excl, psl;
- enum rdtgrp_mode mode;
- bool sep = false;
- u32 ctrl_val;
-
- mutex_lock(&rdtgroup_mutex);
- hw_shareable = r->cache.shareable_bits;
- list_for_each_entry(dom, &r->domains, list) {
- if (sep)
- seq_putc(seq, ';');
- sw_shareable = 0;
- exclusive = 0;
- seq_printf(seq, "%d=", dom->id);
- for (i = 0; i < closids_supported(); i++) {
- if (!closid_allocated(i))
- continue;
- ctrl_val = resctrl_arch_get_config(r, dom, i,
- s->conf_type);
- mode = rdtgroup_mode_by_closid(i);
- switch (mode) {
- case RDT_MODE_SHAREABLE:
- sw_shareable |= ctrl_val;
- break;
- case RDT_MODE_EXCLUSIVE:
- exclusive |= ctrl_val;
- break;
- case RDT_MODE_PSEUDO_LOCKSETUP:
- /*
- * RDT_MODE_PSEUDO_LOCKSETUP is possible
- * here but not included since the CBM
- * associated with this CLOSID in this mode
- * is not initialized and no task or cpu can be
- * assigned this CLOSID.
- */
- break;
- case RDT_MODE_PSEUDO_LOCKED:
- case RDT_NUM_MODES:
- WARN(1,
- "invalid mode for closid %d\n", i);
- break;
- }
- }
- for (i = r->cache.cbm_len - 1; i >= 0; i--) {
- pseudo_locked = dom->plr ? dom->plr->cbm : 0;
- hwb = test_bit(i, &hw_shareable);
- swb = test_bit(i, &sw_shareable);
- excl = test_bit(i, &exclusive);
- psl = test_bit(i, &pseudo_locked);
- if (hwb && swb)
- seq_putc(seq, 'X');
- else if (hwb && !swb)
- seq_putc(seq, 'H');
- else if (!hwb && swb)
- seq_putc(seq, 'S');
- else if (excl)
- seq_putc(seq, 'E');
- else if (psl)
- seq_putc(seq, 'P');
- else /* Unused bits remain */
- seq_putc(seq, '0');
- }
- sep = true;
- }
- seq_putc(seq, '\n');
- mutex_unlock(&rdtgroup_mutex);
- return 0;
-}
-
-static int rdt_min_bw_show(struct kernfs_open_file *of,
- struct seq_file *seq, void *v)
-{
- struct resctrl_schema *s = of->kn->parent->priv;
- struct rdt_resource *r = s->res;
-
- seq_printf(seq, "%u\n", r->membw.min_bw);
- return 0;
-}
-
-static int rdt_num_rmids_show(struct kernfs_open_file *of,
- struct seq_file *seq, void *v)
-{
- struct rdt_resource *r = of->kn->parent->priv;
-
- seq_printf(seq, "%d\n", r->num_rmid);
-
- return 0;
-}
-
-static int rdt_mon_features_show(struct kernfs_open_file *of,
- struct seq_file *seq, void *v)
-{
- struct rdt_resource *r = of->kn->parent->priv;
- struct mon_evt *mevt;
-
- list_for_each_entry(mevt, &r->evt_list, list)
- seq_printf(seq, "%s\n", mevt->name);
-
- return 0;
-}
-
-static int rdt_bw_gran_show(struct kernfs_open_file *of,
- struct seq_file *seq, void *v)
-{
- struct resctrl_schema *s = of->kn->parent->priv;
- struct rdt_resource *r = s->res;
-
- seq_printf(seq, "%u\n", r->membw.bw_gran);
- return 0;
-}
-
-static int rdt_delay_linear_show(struct kernfs_open_file *of,
- struct seq_file *seq, void *v)
-{
- struct resctrl_schema *s = of->kn->parent->priv;
- struct rdt_resource *r = s->res;
-
- seq_printf(seq, "%u\n", r->membw.delay_linear);
- return 0;
-}
-
-static int max_threshold_occ_show(struct kernfs_open_file *of,
- struct seq_file *seq, void *v)
-{
- struct rdt_resource *r = of->kn->parent->priv;
- struct rdt_hw_resource *hw_res = resctrl_to_arch_res(r);
-
- seq_printf(seq, "%u\n", resctrl_cqm_threshold * hw_res->mon_scale);
-
- return 0;
-}
-
-static int rdt_thread_throttle_mode_show(struct kernfs_open_file *of,
- struct seq_file *seq, void *v)
-{
- struct resctrl_schema *s = of->kn->parent->priv;
- struct rdt_resource *r = s->res;
-
- if (r->membw.throttle_mode == THREAD_THROTTLE_PER_THREAD)
- seq_puts(seq, "per-thread\n");
- else
- seq_puts(seq, "max\n");
-
- return 0;
-}
-
-static ssize_t max_threshold_occ_write(struct kernfs_open_file *of,
- char *buf, size_t nbytes, loff_t off)
-{
- struct rdt_hw_resource *hw_res;
- unsigned int bytes;
- int ret;
-
- ret = kstrtouint(buf, 0, &bytes);
- if (ret)
- return ret;
-
- if (bytes > (boot_cpu_data.x86_cache_size * 1024))
- return -EINVAL;
-
- hw_res = resctrl_to_arch_res(of->kn->parent->priv);
- resctrl_cqm_threshold = bytes / hw_res->mon_scale;
-
- return nbytes;
-}
-
-/*
- * rdtgroup_mode_show - Display mode of this resource group
- */
-static int rdtgroup_mode_show(struct kernfs_open_file *of,
- struct seq_file *s, void *v)
-{
- struct rdtgroup *rdtgrp;
-
- rdtgrp = rdtgroup_kn_lock_live(of->kn);
- if (!rdtgrp) {
- rdtgroup_kn_unlock(of->kn);
- return -ENOENT;
- }
-
- seq_printf(s, "%s\n", rdtgroup_mode_str(rdtgrp->mode));
-
- rdtgroup_kn_unlock(of->kn);
- return 0;
-}
-
-static enum resctrl_conf_type resctrl_peer_type(enum resctrl_conf_type my_type)
-{
- switch (my_type) {
- case CDP_CODE:
- return CDP_DATA;
- case CDP_DATA:
- return CDP_CODE;
+ case QOS_L3_MBM_LOCAL_EVENT_ID:
+ return 1;
default:
- case CDP_NONE:
- return CDP_NONE;
- }
-}
-
-/**
- * __rdtgroup_cbm_overlaps - Does CBM for intended closid overlap with other
- * @r: Resource to which domain instance @d belongs.
- * @d: The domain instance for which @closid is being tested.
- * @cbm: Capacity bitmask being tested.
- * @closid: Intended closid for @cbm.
- * @exclusive: Only check if overlaps with exclusive resource groups
- *
- * Checks if provided @cbm intended to be used for @closid on domain
- * @d overlaps with any other closids or other hardware usage associated
- * with this domain. If @exclusive is true then only overlaps with
- * resource groups in exclusive mode will be considered. If @exclusive
- * is false then overlaps with any resource group or hardware entities
- * will be considered.
- *
- * @cbm is unsigned long, even if only 32 bits are used, to make the
- * bitmap functions work correctly.
- *
- * Return: false if CBM does not overlap, true if it does.
- */
-static bool __rdtgroup_cbm_overlaps(struct rdt_resource *r, struct rdt_domain *d,
- unsigned long cbm, int closid,
- enum resctrl_conf_type type, bool exclusive)
-{
- enum rdtgrp_mode mode;
- unsigned long ctrl_b;
- int i;
-
- /* Check for any overlap with regions used by hardware directly */
- if (!exclusive) {
- ctrl_b = r->cache.shareable_bits;
- if (bitmap_intersects(&cbm, &ctrl_b, r->cache.cbm_len))
- return true;
- }
-
- /* Check for overlap with other resource groups */
- for (i = 0; i < closids_supported(); i++) {
- ctrl_b = resctrl_arch_get_config(r, d, i, type);
- mode = rdtgroup_mode_by_closid(i);
- if (closid_allocated(i) && i != closid &&
- mode != RDT_MODE_PSEUDO_LOCKSETUP) {
- if (bitmap_intersects(&cbm, &ctrl_b, r->cache.cbm_len)) {
- if (exclusive) {
- if (mode == RDT_MODE_EXCLUSIVE)
- return true;
- continue;
- }
- return true;
- }
- }
- }
-
- return false;
-}
-
-/**
- * rdtgroup_cbm_overlaps - Does CBM overlap with other use of hardware
- * @s: Schema for the resource to which domain instance @d belongs.
- * @d: The domain instance for which @closid is being tested.
- * @cbm: Capacity bitmask being tested.
- * @closid: Intended closid for @cbm.
- * @exclusive: Only check if overlaps with exclusive resource groups
- *
- * Resources that can be allocated using a CBM can use the CBM to control
- * the overlap of these allocations. rdtgroup_cmb_overlaps() is the test
- * for overlap. Overlap test is not limited to the specific resource for
- * which the CBM is intended though - when dealing with CDP resources that
- * share the underlying hardware the overlap check should be performed on
- * the CDP resource sharing the hardware also.
- *
- * Refer to description of __rdtgroup_cbm_overlaps() for the details of the
- * overlap test.
- *
- * Return: true if CBM overlap detected, false if there is no overlap
- */
-bool rdtgroup_cbm_overlaps(struct resctrl_schema *s, struct rdt_domain *d,
- unsigned long cbm, int closid, bool exclusive)
-{
- enum resctrl_conf_type peer_type = resctrl_peer_type(s->conf_type);
- struct rdt_resource *r = s->res;
-
- if (__rdtgroup_cbm_overlaps(r, d, cbm, closid, s->conf_type,
- exclusive))
- return true;
-
- if (!resctrl_arch_get_cdp_enabled(r->rid))
- return false;
- return __rdtgroup_cbm_overlaps(r, d, cbm, closid, peer_type, exclusive);
-}
-
-/**
- * rdtgroup_mode_test_exclusive - Test if this resource group can be exclusive
- *
- * An exclusive resource group implies that there should be no sharing of
- * its allocated resources. At the time this group is considered to be
- * exclusive this test can determine if its current schemata supports this
- * setting by testing for overlap with all other resource groups.
- *
- * Return: true if resource group can be exclusive, false if there is overlap
- * with allocations of other resource groups and thus this resource group
- * cannot be exclusive.
- */
-static bool rdtgroup_mode_test_exclusive(struct rdtgroup *rdtgrp)
-{
- int closid = rdtgrp->closid;
- struct resctrl_schema *s;
- struct rdt_resource *r;
- bool has_cache = false;
- struct rdt_domain *d;
- u32 ctrl;
-
- list_for_each_entry(s, &resctrl_schema_all, list) {
- r = s->res;
- if (r->rid == RDT_RESOURCE_MBA)
- continue;
- has_cache = true;
- list_for_each_entry(d, &r->domains, list) {
- ctrl = resctrl_arch_get_config(r, d, closid,
- s->conf_type);
- if (rdtgroup_cbm_overlaps(s, d, ctrl, closid, false)) {
- rdt_last_cmd_puts("Schemata overlaps\n");
- return false;
- }
- }
- }
-
- if (!has_cache) {
- rdt_last_cmd_puts("Cannot be exclusive without CAT/CDP\n");
- return false;
- }
-
- return true;
-}
-
-/**
- * rdtgroup_mode_write - Modify the resource group's mode
- *
- */
-static ssize_t rdtgroup_mode_write(struct kernfs_open_file *of,
- char *buf, size_t nbytes, loff_t off)
-{
- struct rdtgroup *rdtgrp;
- enum rdtgrp_mode mode;
- int ret = 0;
-
- /* Valid input requires a trailing newline */
- if (nbytes == 0 || buf[nbytes - 1] != '\n')
- return -EINVAL;
- buf[nbytes - 1] = '\0';
-
- rdtgrp = rdtgroup_kn_lock_live(of->kn);
- if (!rdtgrp) {
- rdtgroup_kn_unlock(of->kn);
- return -ENOENT;
- }
-
- rdt_last_cmd_clear();
-
- mode = rdtgrp->mode;
-
- if ((!strcmp(buf, "shareable") && mode == RDT_MODE_SHAREABLE) ||
- (!strcmp(buf, "exclusive") && mode == RDT_MODE_EXCLUSIVE) ||
- (!strcmp(buf, "pseudo-locksetup") &&
- mode == RDT_MODE_PSEUDO_LOCKSETUP) ||
- (!strcmp(buf, "pseudo-locked") && mode == RDT_MODE_PSEUDO_LOCKED))
- goto out;
-
- if (mode == RDT_MODE_PSEUDO_LOCKED) {
- rdt_last_cmd_puts("Cannot change pseudo-locked group\n");
- ret = -EINVAL;
- goto out;
+ /* Should never reach here */
+ return INVALID_CONFIG_INDEX;
}
-
- if (!strcmp(buf, "shareable")) {
- if (rdtgrp->mode == RDT_MODE_PSEUDO_LOCKSETUP) {
- ret = rdtgroup_locksetup_exit(rdtgrp);
- if (ret)
- goto out;
- }
- rdtgrp->mode = RDT_MODE_SHAREABLE;
- } else if (!strcmp(buf, "exclusive")) {
- if (!rdtgroup_mode_test_exclusive(rdtgrp)) {
- ret = -EINVAL;
- goto out;
- }
- if (rdtgrp->mode == RDT_MODE_PSEUDO_LOCKSETUP) {
- ret = rdtgroup_locksetup_exit(rdtgrp);
- if (ret)
- goto out;
- }
- rdtgrp->mode = RDT_MODE_EXCLUSIVE;
- } else if (!strcmp(buf, "pseudo-locksetup")) {
- ret = rdtgroup_locksetup_enter(rdtgrp);
- if (ret)
- goto out;
- rdtgrp->mode = RDT_MODE_PSEUDO_LOCKSETUP;
- } else {
- rdt_last_cmd_puts("Unknown or unsupported mode\n");
- ret = -EINVAL;
- }
-
-out:
- rdtgroup_kn_unlock(of->kn);
- return ret ?: nbytes;
}
-/**
- * rdtgroup_cbm_to_size - Translate CBM to size in bytes
- * @r: RDT resource to which @d belongs.
- * @d: RDT domain instance.
- * @cbm: bitmask for which the size should be computed.
- *
- * The bitmask provided associated with the RDT domain instance @d will be
- * translated into how many bytes it represents. The size in bytes is
- * computed by first dividing the total cache size by the CBM length to
- * determine how many bytes each bit in the bitmask represents. The result
- * is multiplied with the number of bits set in the bitmask.
- *
- * @cbm is unsigned long, even if only 32 bits are used to make the
- * bitmap functions work correctly.
- */
-unsigned int rdtgroup_cbm_to_size(struct rdt_resource *r,
- struct rdt_domain *d, unsigned long cbm)
+void resctrl_arch_mon_event_config_read(void *_config_info)
{
- struct cpu_cacheinfo *ci;
- unsigned int size = 0;
- int num_b, i;
-
- num_b = bitmap_weight(&cbm, r->cache.cbm_len);
- ci = get_cpu_cacheinfo(cpumask_any(&d->cpu_mask));
- for (i = 0; i < ci->num_leaves; i++) {
- if (ci->info_list[i].level == r->cache_level) {
- size = ci->info_list[i].size / r->cache.cbm_len * num_b;
- break;
- }
- }
-
- return size;
-}
+ struct resctrl_mon_config_info *config_info = _config_info;
+ unsigned int index;
+ u64 msrval;
-/**
- * rdtgroup_size_show - Display size in bytes of allocated regions
- *
- * The "size" file mirrors the layout of the "schemata" file, printing the
- * size in bytes of each region instead of the capacity bitmask.
- *
- */
-static int rdtgroup_size_show(struct kernfs_open_file *of,
- struct seq_file *s, void *v)
-{
- struct resctrl_schema *schema;
- struct rdtgroup *rdtgrp;
- struct rdt_resource *r;
- struct rdt_domain *d;
- unsigned int size;
- int ret = 0;
- bool sep;
- u32 ctrl;
-
- rdtgrp = rdtgroup_kn_lock_live(of->kn);
- if (!rdtgrp) {
- rdtgroup_kn_unlock(of->kn);
- return -ENOENT;
- }
-
- if (rdtgrp->mode == RDT_MODE_PSEUDO_LOCKED) {
- if (!rdtgrp->plr->d) {
- rdt_last_cmd_clear();
- rdt_last_cmd_puts("Cache domain offline\n");
- ret = -ENODEV;
- } else {
- seq_printf(s, "%*s:", max_name_width,
- rdtgrp->plr->s->name);
- size = rdtgroup_cbm_to_size(rdtgrp->plr->s->res,
- rdtgrp->plr->d,
- rdtgrp->plr->cbm);
- seq_printf(s, "%d=%u\n", rdtgrp->plr->d->id, size);
- }
- goto out;
- }
-
- list_for_each_entry(schema, &resctrl_schema_all, list) {
- r = schema->res;
- sep = false;
- seq_printf(s, "%*s:", max_name_width, schema->name);
- list_for_each_entry(d, &r->domains, list) {
- if (sep)
- seq_putc(s, ';');
- if (rdtgrp->mode == RDT_MODE_PSEUDO_LOCKSETUP) {
- size = 0;
- } else {
- ctrl = resctrl_arch_get_config(r, d,
- rdtgrp->closid,
- schema->conf_type);
- if (r->rid == RDT_RESOURCE_MBA)
- size = ctrl;
- else
- size = rdtgroup_cbm_to_size(r, d, ctrl);
- }
- seq_printf(s, "%d=%u", d->id, size);
- sep = true;
- }
- seq_putc(s, '\n');
- }
-
-out:
- rdtgroup_kn_unlock(of->kn);
-
- return ret;
-}
-
-/* rdtgroup information files for one cache resource. */
-static struct rftype res_common_files[] = {
- {
- .name = "last_cmd_status",
- .mode = 0444,
- .kf_ops = &rdtgroup_kf_single_ops,
- .seq_show = rdt_last_cmd_status_show,
- .fflags = RF_TOP_INFO,
- },
- {
- .name = "num_closids",
- .mode = 0444,
- .kf_ops = &rdtgroup_kf_single_ops,
- .seq_show = rdt_num_closids_show,
- .fflags = RF_CTRL_INFO,
- },
- {
- .name = "mon_features",
- .mode = 0444,
- .kf_ops = &rdtgroup_kf_single_ops,
- .seq_show = rdt_mon_features_show,
- .fflags = RF_MON_INFO,
- },
- {
- .name = "num_rmids",
- .mode = 0444,
- .kf_ops = &rdtgroup_kf_single_ops,
- .seq_show = rdt_num_rmids_show,
- .fflags = RF_MON_INFO,
- },
- {
- .name = "cbm_mask",
- .mode = 0444,
- .kf_ops = &rdtgroup_kf_single_ops,
- .seq_show = rdt_default_ctrl_show,
- .fflags = RF_CTRL_INFO | RFTYPE_RES_CACHE,
- },
- {
- .name = "min_cbm_bits",
- .mode = 0444,
- .kf_ops = &rdtgroup_kf_single_ops,
- .seq_show = rdt_min_cbm_bits_show,
- .fflags = RF_CTRL_INFO | RFTYPE_RES_CACHE,
- },
- {
- .name = "shareable_bits",
- .mode = 0444,
- .kf_ops = &rdtgroup_kf_single_ops,
- .seq_show = rdt_shareable_bits_show,
- .fflags = RF_CTRL_INFO | RFTYPE_RES_CACHE,
- },
- {
- .name = "bit_usage",
- .mode = 0444,
- .kf_ops = &rdtgroup_kf_single_ops,
- .seq_show = rdt_bit_usage_show,
- .fflags = RF_CTRL_INFO | RFTYPE_RES_CACHE,
- },
- {
- .name = "min_bandwidth",
- .mode = 0444,
- .kf_ops = &rdtgroup_kf_single_ops,
- .seq_show = rdt_min_bw_show,
- .fflags = RF_CTRL_INFO | RFTYPE_RES_MB,
- },
- {
- .name = "bandwidth_gran",
- .mode = 0444,
- .kf_ops = &rdtgroup_kf_single_ops,
- .seq_show = rdt_bw_gran_show,
- .fflags = RF_CTRL_INFO | RFTYPE_RES_MB,
- },
- {
- .name = "delay_linear",
- .mode = 0444,
- .kf_ops = &rdtgroup_kf_single_ops,
- .seq_show = rdt_delay_linear_show,
- .fflags = RF_CTRL_INFO | RFTYPE_RES_MB,
- },
- /*
- * Platform specific which (if any) capabilities are provided by
- * thread_throttle_mode. Defer "fflags" initialization to platform
- * discovery.
- */
- {
- .name = "thread_throttle_mode",
- .mode = 0444,
- .kf_ops = &rdtgroup_kf_single_ops,
- .seq_show = rdt_thread_throttle_mode_show,
- },
- {
- .name = "max_threshold_occupancy",
- .mode = 0644,
- .kf_ops = &rdtgroup_kf_single_ops,
- .write = max_threshold_occ_write,
- .seq_show = max_threshold_occ_show,
- .fflags = RF_MON_INFO | RFTYPE_RES_CACHE,
- },
- {
- .name = "cpus",
- .mode = 0644,
- .kf_ops = &rdtgroup_kf_single_ops,
- .write = rdtgroup_cpus_write,
- .seq_show = rdtgroup_cpus_show,
- .fflags = RFTYPE_BASE,
- },
- {
- .name = "cpus_list",
- .mode = 0644,
- .kf_ops = &rdtgroup_kf_single_ops,
- .write = rdtgroup_cpus_write,
- .seq_show = rdtgroup_cpus_show,
- .flags = RFTYPE_FLAGS_CPUS_LIST,
- .fflags = RFTYPE_BASE,
- },
- {
- .name = "tasks",
- .mode = 0644,
- .kf_ops = &rdtgroup_kf_single_ops,
- .write = rdtgroup_tasks_write,
- .seq_show = rdtgroup_tasks_show,
- .fflags = RFTYPE_BASE,
- },
- {
- .name = "schemata",
- .mode = 0644,
- .kf_ops = &rdtgroup_kf_single_ops,
- .write = rdtgroup_schemata_write,
- .seq_show = rdtgroup_schemata_show,
- .fflags = RF_CTRL_BASE,
- },
- {
- .name = "mode",
- .mode = 0644,
- .kf_ops = &rdtgroup_kf_single_ops,
- .write = rdtgroup_mode_write,
- .seq_show = rdtgroup_mode_show,
- .fflags = RF_CTRL_BASE,
- },
- {
- .name = "size",
- .mode = 0444,
- .kf_ops = &rdtgroup_kf_single_ops,
- .seq_show = rdtgroup_size_show,
- .fflags = RF_CTRL_BASE,
- },
-
-};
-
-static int rdtgroup_add_files(struct kernfs_node *kn, unsigned long fflags)
-{
- struct rftype *rfts, *rft;
- int ret, len;
-
- rfts = res_common_files;
- len = ARRAY_SIZE(res_common_files);
-
- lockdep_assert_held(&rdtgroup_mutex);
-
- for (rft = rfts; rft < rfts + len; rft++) {
- if (rft->fflags && ((fflags & rft->fflags) == rft->fflags)) {
- ret = rdtgroup_add_file(kn, rft);
- if (ret)
- goto error;
- }
- }
-
- return 0;
-error:
- pr_warn("Failed to add %s, err=%d\n", rft->name, ret);
- while (--rft >= rfts) {
- if ((fflags & rft->fflags) == rft->fflags)
- kernfs_remove_by_name(kn, rft->name);
- }
- return ret;
-}
-
-static struct rftype *rdtgroup_get_rftype_by_name(const char *name)
-{
- struct rftype *rfts, *rft;
- int len;
-
- rfts = res_common_files;
- len = ARRAY_SIZE(res_common_files);
-
- for (rft = rfts; rft < rfts + len; rft++) {
- if (!strcmp(rft->name, name))
- return rft;
- }
-
- return NULL;
-}
-
-void __init thread_throttle_mode_init(void)
-{
- struct rftype *rft;
-
- rft = rdtgroup_get_rftype_by_name("thread_throttle_mode");
- if (!rft)
+ index = mon_event_config_index_get(config_info->evtid);
+ if (index == INVALID_CONFIG_INDEX) {
+ pr_warn_once("Invalid event id %d\n", config_info->evtid);
return;
-
- rft->fflags = RF_CTRL_INFO | RFTYPE_RES_MB;
-}
-
-/**
- * rdtgroup_kn_mode_restrict - Restrict user access to named resctrl file
- * @r: The resource group with which the file is associated.
- * @name: Name of the file
- *
- * The permissions of named resctrl file, directory, or link are modified
- * to not allow read, write, or execute by any user.
- *
- * WARNING: This function is intended to communicate to the user that the
- * resctrl file has been locked down - that it is not relevant to the
- * particular state the system finds itself in. It should not be relied
- * on to protect from user access because after the file's permissions
- * are restricted the user can still change the permissions using chmod
- * from the command line.
- *
- * Return: 0 on success, <0 on failure.
- */
-int rdtgroup_kn_mode_restrict(struct rdtgroup *r, const char *name)
-{
- struct iattr iattr = {.ia_valid = ATTR_MODE,};
- struct kernfs_node *kn;
- int ret = 0;
-
- kn = kernfs_find_and_get_ns(r->kn, name, NULL);
- if (!kn)
- return -ENOENT;
-
- switch (kernfs_type(kn)) {
- case KERNFS_DIR:
- iattr.ia_mode = S_IFDIR;
- break;
- case KERNFS_FILE:
- iattr.ia_mode = S_IFREG;
- break;
- case KERNFS_LINK:
- iattr.ia_mode = S_IFLNK;
- break;
}
+ rdmsrq(MSR_IA32_EVT_CFG_BASE + index, msrval);
- ret = kernfs_setattr(kn, &iattr);
- kernfs_put(kn);
- return ret;
+ /* Report only the valid event configuration bits */
+ config_info->mon_config = msrval & MAX_EVT_CONFIG_BITS;
}
-/**
- * rdtgroup_kn_mode_restore - Restore user access to named resctrl file
- * @r: The resource group with which the file is associated.
- * @name: Name of the file
- * @mask: Mask of permissions that should be restored
- *
- * Restore the permissions of the named file. If @name is a directory the
- * permissions of its parent will be used.
- *
- * Return: 0 on success, <0 on failure.
- */
-int rdtgroup_kn_mode_restore(struct rdtgroup *r, const char *name,
- umode_t mask)
+void resctrl_arch_mon_event_config_write(void *_config_info)
{
- struct iattr iattr = {.ia_valid = ATTR_MODE,};
- struct kernfs_node *kn, *parent;
- struct rftype *rfts, *rft;
- int ret, len;
-
- rfts = res_common_files;
- len = ARRAY_SIZE(res_common_files);
+ struct resctrl_mon_config_info *config_info = _config_info;
+ unsigned int index;
- for (rft = rfts; rft < rfts + len; rft++) {
- if (!strcmp(rft->name, name))
- iattr.ia_mode = rft->mode & mask;
- }
-
- kn = kernfs_find_and_get_ns(r->kn, name, NULL);
- if (!kn)
- return -ENOENT;
-
- switch (kernfs_type(kn)) {
- case KERNFS_DIR:
- parent = kernfs_get_parent(kn);
- if (parent) {
- iattr.ia_mode |= parent->mode;
- kernfs_put(parent);
- }
- iattr.ia_mode |= S_IFDIR;
- break;
- case KERNFS_FILE:
- iattr.ia_mode |= S_IFREG;
- break;
- case KERNFS_LINK:
- iattr.ia_mode |= S_IFLNK;
- break;
- }
-
- ret = kernfs_setattr(kn, &iattr);
- kernfs_put(kn);
- return ret;
-}
-
-static int rdtgroup_mkdir_info_resdir(void *priv, char *name,
- unsigned long fflags)
-{
- struct kernfs_node *kn_subdir;
- int ret;
-
- kn_subdir = kernfs_create_dir(kn_info, name,
- kn_info->mode, priv);
- if (IS_ERR(kn_subdir))
- return PTR_ERR(kn_subdir);
-
- ret = rdtgroup_kn_set_ugid(kn_subdir);
- if (ret)
- return ret;
-
- ret = rdtgroup_add_files(kn_subdir, fflags);
- if (!ret)
- kernfs_activate(kn_subdir);
-
- return ret;
-}
-
-static int rdtgroup_create_info_dir(struct kernfs_node *parent_kn)
-{
- struct resctrl_schema *s;
- struct rdt_resource *r;
- unsigned long fflags;
- char name[32];
- int ret;
-
- /* create the directory */
- kn_info = kernfs_create_dir(parent_kn, "info", parent_kn->mode, NULL);
- if (IS_ERR(kn_info))
- return PTR_ERR(kn_info);
-
- ret = rdtgroup_add_files(kn_info, RF_TOP_INFO);
- if (ret)
- goto out_destroy;
-
- /* loop over enabled controls, these are all alloc_enabled */
- list_for_each_entry(s, &resctrl_schema_all, list) {
- r = s->res;
- fflags = r->fflags | RF_CTRL_INFO;
- ret = rdtgroup_mkdir_info_resdir(s, s->name, fflags);
- if (ret)
- goto out_destroy;
- }
-
- for_each_mon_enabled_rdt_resource(r) {
- fflags = r->fflags | RF_MON_INFO;
- sprintf(name, "%s_MON", r->name);
- ret = rdtgroup_mkdir_info_resdir(r, name, fflags);
- if (ret)
- goto out_destroy;
+ index = mon_event_config_index_get(config_info->evtid);
+ if (index == INVALID_CONFIG_INDEX) {
+ pr_warn_once("Invalid event id %d\n", config_info->evtid);
+ return;
}
-
- ret = rdtgroup_kn_set_ugid(kn_info);
- if (ret)
- goto out_destroy;
-
- kernfs_activate(kn_info);
-
- return 0;
-
-out_destroy:
- kernfs_remove(kn_info);
- return ret;
-}
-
-static int
-mongroup_create_dir(struct kernfs_node *parent_kn, struct rdtgroup *prgrp,
- char *name, struct kernfs_node **dest_kn)
-{
- struct kernfs_node *kn;
- int ret;
-
- /* create the directory */
- kn = kernfs_create_dir(parent_kn, name, parent_kn->mode, prgrp);
- if (IS_ERR(kn))
- return PTR_ERR(kn);
-
- if (dest_kn)
- *dest_kn = kn;
-
- ret = rdtgroup_kn_set_ugid(kn);
- if (ret)
- goto out_destroy;
-
- kernfs_activate(kn);
-
- return 0;
-
-out_destroy:
- kernfs_remove(kn);
- return ret;
+ wrmsrq(MSR_IA32_EVT_CFG_BASE + index, config_info->mon_config);
}
static void l3_qos_cfg_update(void *arg)
{
bool *enable = arg;
- wrmsrl(MSR_IA32_L3_QOS_CFG, *enable ? L3_QOS_CDP_ENABLE : 0ULL);
+ wrmsrq(MSR_IA32_L3_QOS_CFG, *enable ? L3_QOS_CDP_ENABLE : 0ULL);
}
static void l2_qos_cfg_update(void *arg)
{
bool *enable = arg;
- wrmsrl(MSR_IA32_L2_QOS_CFG, *enable ? L2_QOS_CDP_ENABLE : 0ULL);
-}
-
-static inline bool is_mba_linear(void)
-{
- return rdt_resources_all[RDT_RESOURCE_MBA].r_resctrl.membw.delay_linear;
+ wrmsrq(MSR_IA32_L2_QOS_CFG, *enable ? L2_QOS_CDP_ENABLE : 0ULL);
}
static int set_cache_qos_cfg(int level, bool enable)
{
void (*update)(void *arg);
+ struct rdt_ctrl_domain *d;
struct rdt_resource *r_l;
cpumask_var_t cpu_mask;
- struct rdt_domain *d;
int cpu;
+ /* Walking r->domains, ensure it can't race with cpuhp */
+ lockdep_assert_cpus_held();
+
if (level == RDT_RESOURCE_L3)
update = l3_qos_cfg_update;
else if (level == RDT_RESOURCE_L2)
@@ -1852,22 +151,18 @@ static int set_cache_qos_cfg(int level, bool enable)
return -ENOMEM;
r_l = &rdt_resources_all[level].r_resctrl;
- list_for_each_entry(d, &r_l->domains, list) {
+ list_for_each_entry(d, &r_l->ctrl_domains, hdr.list) {
if (r_l->cache.arch_has_per_cpu_cfg)
/* Pick all the CPUs in the domain instance */
- for_each_cpu(cpu, &d->cpu_mask)
+ for_each_cpu(cpu, &d->hdr.cpu_mask)
cpumask_set_cpu(cpu, cpu_mask);
else
/* Pick one CPU from each domain instance to update MSR */
- cpumask_set_cpu(cpumask_any(&d->cpu_mask), cpu_mask);
+ cpumask_set_cpu(cpumask_any(&d->hdr.cpu_mask), cpu_mask);
}
- cpu = get_cpu();
- /* Update QOS_CFG MSR on this cpu if it's in cpu_mask. */
- if (cpumask_test_cpu(cpu, cpu_mask))
- update(&enable);
- /* Update QOS_CFG MSR on all other cpus in cpu_mask. */
- smp_call_function_many(cpu_mask, update, &enable, 1);
- put_cpu();
+
+ /* Update QOS_CFG MSR on all the CPUs in cpu_mask */
+ on_each_cpu_mask(cpu_mask, update, &enable, 1);
free_cpumask_var(cpu_mask);
@@ -1889,31 +184,6 @@ void rdt_domain_reconfigure_cdp(struct rdt_resource *r)
l3_qos_cfg_update(&hw_res->cdp_enabled);
}
-/*
- * Enable or disable the MBA software controller
- * which helps user specify bandwidth in MBps.
- * MBA software controller is supported only if
- * MBM is supported and MBA is in linear scale.
- */
-static int set_mba_sc(bool mba_sc)
-{
- struct rdt_resource *r = &rdt_resources_all[RDT_RESOURCE_MBA].r_resctrl;
- struct rdt_hw_domain *hw_dom;
- struct rdt_domain *d;
-
- if (!is_mbm_enabled() || !is_mba_linear() ||
- mba_sc == is_mba_sc(r))
- return -EINVAL;
-
- r->membw.mba_sc = mba_sc;
- list_for_each_entry(d, &r->domains, list) {
- hw_dom = resctrl_to_arch_dom(d);
- setup_default_ctrlval(r, hw_dom->ctrl_val, hw_dom->mbps_val);
- }
-
- return 0;
-}
-
static int cdp_enable(int level)
{
struct rdt_resource *r_l = &rdt_resources_all[level].r_resctrl;
@@ -1954,365 +224,21 @@ int resctrl_arch_set_cdp_enabled(enum resctrl_res_level l, bool enable)
return 0;
}
-static void cdp_disable_all(void)
-{
- if (resctrl_arch_get_cdp_enabled(RDT_RESOURCE_L3))
- resctrl_arch_set_cdp_enabled(RDT_RESOURCE_L3, false);
- if (resctrl_arch_get_cdp_enabled(RDT_RESOURCE_L2))
- resctrl_arch_set_cdp_enabled(RDT_RESOURCE_L2, false);
-}
-
-/*
- * We don't allow rdtgroup directories to be created anywhere
- * except the root directory. Thus when looking for the rdtgroup
- * structure for a kernfs node we are either looking at a directory,
- * in which case the rdtgroup structure is pointed at by the "priv"
- * field, otherwise we have a file, and need only look to the parent
- * to find the rdtgroup.
- */
-static struct rdtgroup *kernfs_to_rdtgroup(struct kernfs_node *kn)
-{
- if (kernfs_type(kn) == KERNFS_DIR) {
- /*
- * All the resource directories use "kn->priv"
- * to point to the "struct rdtgroup" for the
- * resource. "info" and its subdirectories don't
- * have rdtgroup structures, so return NULL here.
- */
- if (kn == kn_info || kn->parent == kn_info)
- return NULL;
- else
- return kn->priv;
- } else {
- return kn->parent->priv;
- }
-}
-
-struct rdtgroup *rdtgroup_kn_lock_live(struct kernfs_node *kn)
-{
- struct rdtgroup *rdtgrp = kernfs_to_rdtgroup(kn);
-
- if (!rdtgrp)
- return NULL;
-
- atomic_inc(&rdtgrp->waitcount);
- kernfs_break_active_protection(kn);
-
- mutex_lock(&rdtgroup_mutex);
-
- /* Was this group deleted while we waited? */
- if (rdtgrp->flags & RDT_DELETED)
- return NULL;
-
- return rdtgrp;
-}
-
-void rdtgroup_kn_unlock(struct kernfs_node *kn)
-{
- struct rdtgroup *rdtgrp = kernfs_to_rdtgroup(kn);
-
- if (!rdtgrp)
- return;
-
- mutex_unlock(&rdtgroup_mutex);
-
- if (atomic_dec_and_test(&rdtgrp->waitcount) &&
- (rdtgrp->flags & RDT_DELETED)) {
- if (rdtgrp->mode == RDT_MODE_PSEUDO_LOCKSETUP ||
- rdtgrp->mode == RDT_MODE_PSEUDO_LOCKED)
- rdtgroup_pseudo_lock_remove(rdtgrp);
- kernfs_unbreak_active_protection(kn);
- rdtgroup_remove(rdtgrp);
- } else {
- kernfs_unbreak_active_protection(kn);
- }
-}
-
-static int mkdir_mondata_all(struct kernfs_node *parent_kn,
- struct rdtgroup *prgrp,
- struct kernfs_node **mon_data_kn);
-
-static int rdt_enable_ctx(struct rdt_fs_context *ctx)
-{
- int ret = 0;
-
- if (ctx->enable_cdpl2)
- ret = resctrl_arch_set_cdp_enabled(RDT_RESOURCE_L2, true);
-
- if (!ret && ctx->enable_cdpl3)
- ret = resctrl_arch_set_cdp_enabled(RDT_RESOURCE_L3, true);
-
- if (!ret && ctx->enable_mba_mbps)
- ret = set_mba_sc(true);
-
- return ret;
-}
-
-static int schemata_list_add(struct rdt_resource *r, enum resctrl_conf_type type)
-{
- struct resctrl_schema *s;
- const char *suffix = "";
- int ret, cl;
-
- s = kzalloc(sizeof(*s), GFP_KERNEL);
- if (!s)
- return -ENOMEM;
-
- s->res = r;
- s->num_closid = resctrl_arch_get_num_closid(r);
- if (resctrl_arch_get_cdp_enabled(r->rid))
- s->num_closid /= 2;
-
- s->conf_type = type;
- switch (type) {
- case CDP_CODE:
- suffix = "CODE";
- break;
- case CDP_DATA:
- suffix = "DATA";
- break;
- case CDP_NONE:
- suffix = "";
- break;
- }
-
- ret = snprintf(s->name, sizeof(s->name), "%s%s", r->name, suffix);
- if (ret >= sizeof(s->name)) {
- kfree(s);
- return -EINVAL;
- }
-
- cl = strlen(s->name);
-
- /*
- * If CDP is supported by this resource, but not enabled,
- * include the suffix. This ensures the tabular format of the
- * schemata file does not change between mounts of the filesystem.
- */
- if (r->cdp_capable && !resctrl_arch_get_cdp_enabled(r->rid))
- cl += 4;
-
- if (cl > max_name_width)
- max_name_width = cl;
-
- INIT_LIST_HEAD(&s->list);
- list_add(&s->list, &resctrl_schema_all);
-
- return 0;
-}
-
-static int schemata_list_create(void)
-{
- struct rdt_resource *r;
- int ret = 0;
-
- for_each_alloc_enabled_rdt_resource(r) {
- if (resctrl_arch_get_cdp_enabled(r->rid)) {
- ret = schemata_list_add(r, CDP_CODE);
- if (ret)
- break;
-
- ret = schemata_list_add(r, CDP_DATA);
- } else {
- ret = schemata_list_add(r, CDP_NONE);
- }
-
- if (ret)
- break;
- }
-
- return ret;
-}
-
-static void schemata_list_destroy(void)
+bool resctrl_arch_get_cdp_enabled(enum resctrl_res_level l)
{
- struct resctrl_schema *s, *tmp;
-
- list_for_each_entry_safe(s, tmp, &resctrl_schema_all, list) {
- list_del(&s->list);
- kfree(s);
- }
+ return rdt_resources_all[l].cdp_enabled;
}
-static int rdt_get_tree(struct fs_context *fc)
-{
- struct rdt_fs_context *ctx = rdt_fc2context(fc);
- struct rdt_domain *dom;
- struct rdt_resource *r;
- int ret;
-
- cpus_read_lock();
- mutex_lock(&rdtgroup_mutex);
- /*
- * resctrl file system can only be mounted once.
- */
- if (static_branch_unlikely(&rdt_enable_key)) {
- ret = -EBUSY;
- goto out;
- }
-
- ret = rdt_enable_ctx(ctx);
- if (ret < 0)
- goto out_cdp;
-
- ret = schemata_list_create();
- if (ret) {
- schemata_list_destroy();
- goto out_mba;
- }
-
- closid_init();
-
- ret = rdtgroup_create_info_dir(rdtgroup_default.kn);
- if (ret < 0)
- goto out_schemata_free;
-
- if (rdt_mon_capable) {
- ret = mongroup_create_dir(rdtgroup_default.kn,
- &rdtgroup_default, "mon_groups",
- &kn_mongrp);
- if (ret < 0)
- goto out_info;
-
- ret = mkdir_mondata_all(rdtgroup_default.kn,
- &rdtgroup_default, &kn_mondata);
- if (ret < 0)
- goto out_mongrp;
- rdtgroup_default.mon.mon_data_kn = kn_mondata;
- }
-
- ret = rdt_pseudo_lock_init();
- if (ret)
- goto out_mondata;
-
- ret = kernfs_get_tree(fc);
- if (ret < 0)
- goto out_psl;
-
- if (rdt_alloc_capable)
- static_branch_enable_cpuslocked(&rdt_alloc_enable_key);
- if (rdt_mon_capable)
- static_branch_enable_cpuslocked(&rdt_mon_enable_key);
-
- if (rdt_alloc_capable || rdt_mon_capable)
- static_branch_enable_cpuslocked(&rdt_enable_key);
-
- if (is_mbm_enabled()) {
- r = &rdt_resources_all[RDT_RESOURCE_L3].r_resctrl;
- list_for_each_entry(dom, &r->domains, list)
- mbm_setup_overflow_handler(dom, MBM_OVERFLOW_INTERVAL);
- }
-
- goto out;
-
-out_psl:
- rdt_pseudo_lock_release();
-out_mondata:
- if (rdt_mon_capable)
- kernfs_remove(kn_mondata);
-out_mongrp:
- if (rdt_mon_capable)
- kernfs_remove(kn_mongrp);
-out_info:
- kernfs_remove(kn_info);
-out_schemata_free:
- schemata_list_destroy();
-out_mba:
- if (ctx->enable_mba_mbps)
- set_mba_sc(false);
-out_cdp:
- cdp_disable_all();
-out:
- rdt_last_cmd_clear();
- mutex_unlock(&rdtgroup_mutex);
- cpus_read_unlock();
- return ret;
-}
-
-enum rdt_param {
- Opt_cdp,
- Opt_cdpl2,
- Opt_mba_mbps,
- nr__rdt_params
-};
-
-static const struct fs_parameter_spec rdt_fs_parameters[] = {
- fsparam_flag("cdp", Opt_cdp),
- fsparam_flag("cdpl2", Opt_cdpl2),
- fsparam_flag("mba_MBps", Opt_mba_mbps),
- {}
-};
-
-static int rdt_parse_param(struct fs_context *fc, struct fs_parameter *param)
-{
- struct rdt_fs_context *ctx = rdt_fc2context(fc);
- struct fs_parse_result result;
- int opt;
-
- opt = fs_parse(fc, rdt_fs_parameters, param, &result);
- if (opt < 0)
- return opt;
-
- switch (opt) {
- case Opt_cdp:
- ctx->enable_cdpl3 = true;
- return 0;
- case Opt_cdpl2:
- ctx->enable_cdpl2 = true;
- return 0;
- case Opt_mba_mbps:
- if (boot_cpu_data.x86_vendor != X86_VENDOR_INTEL)
- return -EINVAL;
- ctx->enable_mba_mbps = true;
- return 0;
- }
-
- return -EINVAL;
-}
-
-static void rdt_fs_context_free(struct fs_context *fc)
-{
- struct rdt_fs_context *ctx = rdt_fc2context(fc);
-
- kernfs_free_fs_context(fc);
- kfree(ctx);
-}
-
-static const struct fs_context_operations rdt_fs_context_ops = {
- .free = rdt_fs_context_free,
- .parse_param = rdt_parse_param,
- .get_tree = rdt_get_tree,
-};
-
-static int rdt_init_fs_context(struct fs_context *fc)
-{
- struct rdt_fs_context *ctx;
-
- ctx = kzalloc(sizeof(struct rdt_fs_context), GFP_KERNEL);
- if (!ctx)
- return -ENOMEM;
-
- ctx->kfc.root = rdt_root;
- ctx->kfc.magic = RDTGROUP_SUPER_MAGIC;
- fc->fs_private = &ctx->kfc;
- fc->ops = &rdt_fs_context_ops;
- put_user_ns(fc->user_ns);
- fc->user_ns = get_user_ns(&init_user_ns);
- fc->global = true;
- return 0;
-}
-
-static int reset_all_ctrls(struct rdt_resource *r)
+void resctrl_arch_reset_all_ctrls(struct rdt_resource *r)
{
struct rdt_hw_resource *hw_res = resctrl_to_arch_res(r);
- struct rdt_hw_domain *hw_dom;
+ struct rdt_hw_ctrl_domain *hw_dom;
struct msr_param msr_param;
- cpumask_var_t cpu_mask;
- struct rdt_domain *d;
- int i, cpu;
+ struct rdt_ctrl_domain *d;
+ int i;
- if (!zalloc_cpumask_var(&cpu_mask, GFP_KERNEL))
- return -ENOMEM;
+ /* Walking r->domains, ensure it can't race with cpuhp */
+ lockdep_assert_cpus_held();
msr_param.res = r;
msr_param.low = 0;
@@ -2320,986 +246,17 @@ static int reset_all_ctrls(struct rdt_resource *r)
/*
* Disable resource control for this resource by setting all
- * CBMs in all domains to the maximum mask value. Pick one CPU
+ * CBMs in all ctrl_domains to the maximum mask value. Pick one CPU
* from each domain to update the MSRs below.
*/
- list_for_each_entry(d, &r->domains, list) {
- hw_dom = resctrl_to_arch_dom(d);
- cpumask_set_cpu(cpumask_any(&d->cpu_mask), cpu_mask);
+ list_for_each_entry(d, &r->ctrl_domains, hdr.list) {
+ hw_dom = resctrl_to_arch_ctrl_dom(d);
for (i = 0; i < hw_res->num_closid; i++)
- hw_dom->ctrl_val[i] = r->default_ctrl;
- }
- cpu = get_cpu();
- /* Update CBM on this cpu if it's in cpu_mask. */
- if (cpumask_test_cpu(cpu, cpu_mask))
- rdt_ctrl_update(&msr_param);
- /* Update CBM on all other cpus in cpu_mask. */
- smp_call_function_many(cpu_mask, rdt_ctrl_update, &msr_param, 1);
- put_cpu();
-
- free_cpumask_var(cpu_mask);
-
- return 0;
-}
-
-/*
- * Move tasks from one to the other group. If @from is NULL, then all tasks
- * in the systems are moved unconditionally (used for teardown).
- *
- * If @mask is not NULL the cpus on which moved tasks are running are set
- * in that mask so the update smp function call is restricted to affected
- * cpus.
- */
-static void rdt_move_group_tasks(struct rdtgroup *from, struct rdtgroup *to,
- struct cpumask *mask)
-{
- struct task_struct *p, *t;
-
- read_lock(&tasklist_lock);
- for_each_process_thread(p, t) {
- if (!from || is_closid_match(t, from) ||
- is_rmid_match(t, from)) {
- WRITE_ONCE(t->closid, to->closid);
- WRITE_ONCE(t->rmid, to->mon.rmid);
-
- /*
- * If the task is on a CPU, set the CPU in the mask.
- * The detection is inaccurate as tasks might move or
- * schedule before the smp function call takes place.
- * In such a case the function call is pointless, but
- * there is no other side effect.
- */
- if (IS_ENABLED(CONFIG_SMP) && mask && task_curr(t))
- cpumask_set_cpu(task_cpu(t), mask);
- }
- }
- read_unlock(&tasklist_lock);
-}
-
-static void free_all_child_rdtgrp(struct rdtgroup *rdtgrp)
-{
- struct rdtgroup *sentry, *stmp;
- struct list_head *head;
-
- head = &rdtgrp->mon.crdtgrp_list;
- list_for_each_entry_safe(sentry, stmp, head, mon.crdtgrp_list) {
- free_rmid(sentry->mon.rmid);
- list_del(&sentry->mon.crdtgrp_list);
-
- if (atomic_read(&sentry->waitcount) != 0)
- sentry->flags = RDT_DELETED;
- else
- rdtgroup_remove(sentry);
- }
-}
-
-/*
- * Forcibly remove all of subdirectories under root.
- */
-static void rmdir_all_sub(void)
-{
- struct rdtgroup *rdtgrp, *tmp;
-
- /* Move all tasks to the default resource group */
- rdt_move_group_tasks(NULL, &rdtgroup_default, NULL);
-
- list_for_each_entry_safe(rdtgrp, tmp, &rdt_all_groups, rdtgroup_list) {
- /* Free any child rmids */
- free_all_child_rdtgrp(rdtgrp);
-
- /* Remove each rdtgroup other than root */
- if (rdtgrp == &rdtgroup_default)
- continue;
-
- if (rdtgrp->mode == RDT_MODE_PSEUDO_LOCKSETUP ||
- rdtgrp->mode == RDT_MODE_PSEUDO_LOCKED)
- rdtgroup_pseudo_lock_remove(rdtgrp);
-
- /*
- * Give any CPUs back to the default group. We cannot copy
- * cpu_online_mask because a CPU might have executed the
- * offline callback already, but is still marked online.
- */
- cpumask_or(&rdtgroup_default.cpu_mask,
- &rdtgroup_default.cpu_mask, &rdtgrp->cpu_mask);
-
- free_rmid(rdtgrp->mon.rmid);
-
- kernfs_remove(rdtgrp->kn);
- list_del(&rdtgrp->rdtgroup_list);
-
- if (atomic_read(&rdtgrp->waitcount) != 0)
- rdtgrp->flags = RDT_DELETED;
- else
- rdtgroup_remove(rdtgrp);
- }
- /* Notify online CPUs to update per cpu storage and PQR_ASSOC MSR */
- update_closid_rmid(cpu_online_mask, &rdtgroup_default);
-
- kernfs_remove(kn_info);
- kernfs_remove(kn_mongrp);
- kernfs_remove(kn_mondata);
-}
-
-static void rdt_kill_sb(struct super_block *sb)
-{
- struct rdt_resource *r;
-
- cpus_read_lock();
- mutex_lock(&rdtgroup_mutex);
-
- set_mba_sc(false);
-
- /*Put everything back to default values. */
- for_each_alloc_enabled_rdt_resource(r)
- reset_all_ctrls(r);
- cdp_disable_all();
- rmdir_all_sub();
- rdt_pseudo_lock_release();
- rdtgroup_default.mode = RDT_MODE_SHAREABLE;
- schemata_list_destroy();
- static_branch_disable_cpuslocked(&rdt_alloc_enable_key);
- static_branch_disable_cpuslocked(&rdt_mon_enable_key);
- static_branch_disable_cpuslocked(&rdt_enable_key);
- kernfs_kill_sb(sb);
- mutex_unlock(&rdtgroup_mutex);
- cpus_read_unlock();
-}
-
-static struct file_system_type rdt_fs_type = {
- .name = "resctrl",
- .init_fs_context = rdt_init_fs_context,
- .parameters = rdt_fs_parameters,
- .kill_sb = rdt_kill_sb,
-};
-
-static int mon_addfile(struct kernfs_node *parent_kn, const char *name,
- void *priv)
-{
- struct kernfs_node *kn;
- int ret = 0;
-
- kn = __kernfs_create_file(parent_kn, name, 0444,
- GLOBAL_ROOT_UID, GLOBAL_ROOT_GID, 0,
- &kf_mondata_ops, priv, NULL, NULL);
- if (IS_ERR(kn))
- return PTR_ERR(kn);
-
- ret = rdtgroup_kn_set_ugid(kn);
- if (ret) {
- kernfs_remove(kn);
- return ret;
- }
-
- return ret;
-}
-
-/*
- * Remove all subdirectories of mon_data of ctrl_mon groups
- * and monitor groups with given domain id.
- */
-void rmdir_mondata_subdir_allrdtgrp(struct rdt_resource *r, unsigned int dom_id)
-{
- struct rdtgroup *prgrp, *crgrp;
- char name[32];
-
- if (!r->mon_enabled)
- return;
-
- list_for_each_entry(prgrp, &rdt_all_groups, rdtgroup_list) {
- sprintf(name, "mon_%s_%02d", r->name, dom_id);
- kernfs_remove_by_name(prgrp->mon.mon_data_kn, name);
-
- list_for_each_entry(crgrp, &prgrp->mon.crdtgrp_list, mon.crdtgrp_list)
- kernfs_remove_by_name(crgrp->mon.mon_data_kn, name);
- }
-}
-
-static int mkdir_mondata_subdir(struct kernfs_node *parent_kn,
- struct rdt_domain *d,
- struct rdt_resource *r, struct rdtgroup *prgrp)
-{
- union mon_data_bits priv;
- struct kernfs_node *kn;
- struct mon_evt *mevt;
- struct rmid_read rr;
- char name[32];
- int ret;
-
- sprintf(name, "mon_%s_%02d", r->name, d->id);
- /* create the directory */
- kn = kernfs_create_dir(parent_kn, name, parent_kn->mode, prgrp);
- if (IS_ERR(kn))
- return PTR_ERR(kn);
-
- ret = rdtgroup_kn_set_ugid(kn);
- if (ret)
- goto out_destroy;
-
- if (WARN_ON(list_empty(&r->evt_list))) {
- ret = -EPERM;
- goto out_destroy;
- }
-
- priv.u.rid = r->rid;
- priv.u.domid = d->id;
- list_for_each_entry(mevt, &r->evt_list, list) {
- priv.u.evtid = mevt->evtid;
- ret = mon_addfile(kn, mevt->name, priv.priv);
- if (ret)
- goto out_destroy;
-
- if (is_mbm_event(mevt->evtid))
- mon_event_read(&rr, r, d, prgrp, mevt->evtid, true);
- }
- kernfs_activate(kn);
- return 0;
-
-out_destroy:
- kernfs_remove(kn);
- return ret;
-}
-
-/*
- * Add all subdirectories of mon_data for "ctrl_mon" groups
- * and "monitor" groups with given domain id.
- */
-void mkdir_mondata_subdir_allrdtgrp(struct rdt_resource *r,
- struct rdt_domain *d)
-{
- struct kernfs_node *parent_kn;
- struct rdtgroup *prgrp, *crgrp;
- struct list_head *head;
-
- if (!r->mon_enabled)
- return;
-
- list_for_each_entry(prgrp, &rdt_all_groups, rdtgroup_list) {
- parent_kn = prgrp->mon.mon_data_kn;
- mkdir_mondata_subdir(parent_kn, d, r, prgrp);
-
- head = &prgrp->mon.crdtgrp_list;
- list_for_each_entry(crgrp, head, mon.crdtgrp_list) {
- parent_kn = crgrp->mon.mon_data_kn;
- mkdir_mondata_subdir(parent_kn, d, r, crgrp);
- }
- }
-}
-
-static int mkdir_mondata_subdir_alldom(struct kernfs_node *parent_kn,
- struct rdt_resource *r,
- struct rdtgroup *prgrp)
-{
- struct rdt_domain *dom;
- int ret;
-
- list_for_each_entry(dom, &r->domains, list) {
- ret = mkdir_mondata_subdir(parent_kn, dom, r, prgrp);
- if (ret)
- return ret;
- }
-
- return 0;
-}
-
-/*
- * This creates a directory mon_data which contains the monitored data.
- *
- * mon_data has one directory for each domain which are named
- * in the format mon_<domain_name>_<domain_id>. For ex: A mon_data
- * with L3 domain looks as below:
- * ./mon_data:
- * mon_L3_00
- * mon_L3_01
- * mon_L3_02
- * ...
- *
- * Each domain directory has one file per event:
- * ./mon_L3_00/:
- * llc_occupancy
- *
- */
-static int mkdir_mondata_all(struct kernfs_node *parent_kn,
- struct rdtgroup *prgrp,
- struct kernfs_node **dest_kn)
-{
- struct rdt_resource *r;
- struct kernfs_node *kn;
- int ret;
-
- /*
- * Create the mon_data directory first.
- */
- ret = mongroup_create_dir(parent_kn, prgrp, "mon_data", &kn);
- if (ret)
- return ret;
-
- if (dest_kn)
- *dest_kn = kn;
-
- /*
- * Create the subdirectories for each domain. Note that all events
- * in a domain like L3 are grouped into a resource whose domain is L3
- */
- for_each_mon_enabled_rdt_resource(r) {
- ret = mkdir_mondata_subdir_alldom(kn, r, prgrp);
- if (ret)
- goto out_destroy;
- }
-
- return 0;
-
-out_destroy:
- kernfs_remove(kn);
- return ret;
-}
-
-/**
- * cbm_ensure_valid - Enforce validity on provided CBM
- * @_val: Candidate CBM
- * @r: RDT resource to which the CBM belongs
- *
- * The provided CBM represents all cache portions available for use. This
- * may be represented by a bitmap that does not consist of contiguous ones
- * and thus be an invalid CBM.
- * Here the provided CBM is forced to be a valid CBM by only considering
- * the first set of contiguous bits as valid and clearing all bits.
- * The intention here is to provide a valid default CBM with which a new
- * resource group is initialized. The user can follow this with a
- * modification to the CBM if the default does not satisfy the
- * requirements.
- */
-static u32 cbm_ensure_valid(u32 _val, struct rdt_resource *r)
-{
- unsigned int cbm_len = r->cache.cbm_len;
- unsigned long first_bit, zero_bit;
- unsigned long val = _val;
-
- if (!val)
- return 0;
-
- first_bit = find_first_bit(&val, cbm_len);
- zero_bit = find_next_zero_bit(&val, cbm_len, first_bit);
-
- /* Clear any remaining bits to ensure contiguous region */
- bitmap_clear(&val, zero_bit, cbm_len - zero_bit);
- return (u32)val;
-}
-
-/*
- * Initialize cache resources per RDT domain
- *
- * Set the RDT domain up to start off with all usable allocations. That is,
- * all shareable and unused bits. All-zero CBM is invalid.
- */
-static int __init_one_rdt_domain(struct rdt_domain *d, struct resctrl_schema *s,
- u32 closid)
-{
- enum resctrl_conf_type peer_type = resctrl_peer_type(s->conf_type);
- enum resctrl_conf_type t = s->conf_type;
- struct resctrl_staged_config *cfg;
- struct rdt_resource *r = s->res;
- u32 used_b = 0, unused_b = 0;
- unsigned long tmp_cbm;
- enum rdtgrp_mode mode;
- u32 peer_ctl, ctrl_val;
- int i;
-
- cfg = &d->staged_config[t];
- cfg->have_new_ctrl = false;
- cfg->new_ctrl = r->cache.shareable_bits;
- used_b = r->cache.shareable_bits;
- for (i = 0; i < closids_supported(); i++) {
- if (closid_allocated(i) && i != closid) {
- mode = rdtgroup_mode_by_closid(i);
- if (mode == RDT_MODE_PSEUDO_LOCKSETUP)
- /*
- * ctrl values for locksetup aren't relevant
- * until the schemata is written, and the mode
- * becomes RDT_MODE_PSEUDO_LOCKED.
- */
- continue;
- /*
- * If CDP is active include peer domain's
- * usage to ensure there is no overlap
- * with an exclusive group.
- */
- if (resctrl_arch_get_cdp_enabled(r->rid))
- peer_ctl = resctrl_arch_get_config(r, d, i,
- peer_type);
- else
- peer_ctl = 0;
- ctrl_val = resctrl_arch_get_config(r, d, i,
- s->conf_type);
- used_b |= ctrl_val | peer_ctl;
- if (mode == RDT_MODE_SHAREABLE)
- cfg->new_ctrl |= ctrl_val | peer_ctl;
- }
- }
- if (d->plr && d->plr->cbm > 0)
- used_b |= d->plr->cbm;
- unused_b = used_b ^ (BIT_MASK(r->cache.cbm_len) - 1);
- unused_b &= BIT_MASK(r->cache.cbm_len) - 1;
- cfg->new_ctrl |= unused_b;
- /*
- * Force the initial CBM to be valid, user can
- * modify the CBM based on system availability.
- */
- cfg->new_ctrl = cbm_ensure_valid(cfg->new_ctrl, r);
- /*
- * Assign the u32 CBM to an unsigned long to ensure that
- * bitmap_weight() does not access out-of-bound memory.
- */
- tmp_cbm = cfg->new_ctrl;
- if (bitmap_weight(&tmp_cbm, r->cache.cbm_len) < r->cache.min_cbm_bits) {
- rdt_last_cmd_printf("No space on %s:%d\n", s->name, d->id);
- return -ENOSPC;
- }
- cfg->have_new_ctrl = true;
-
- return 0;
-}
-
-/*
- * Initialize cache resources with default values.
- *
- * A new RDT group is being created on an allocation capable (CAT)
- * supporting system. Set this group up to start off with all usable
- * allocations.
- *
- * If there are no more shareable bits available on any domain then
- * the entire allocation will fail.
- */
-static int rdtgroup_init_cat(struct resctrl_schema *s, u32 closid)
-{
- struct rdt_domain *d;
- int ret;
-
- list_for_each_entry(d, &s->res->domains, list) {
- ret = __init_one_rdt_domain(d, s, closid);
- if (ret < 0)
- return ret;
- }
-
- return 0;
-}
-
-/* Initialize MBA resource with default values. */
-static void rdtgroup_init_mba(struct rdt_resource *r)
-{
- struct resctrl_staged_config *cfg;
- struct rdt_domain *d;
-
- list_for_each_entry(d, &r->domains, list) {
- cfg = &d->staged_config[CDP_NONE];
- cfg->new_ctrl = is_mba_sc(r) ? MBA_MAX_MBPS : r->default_ctrl;
- cfg->have_new_ctrl = true;
- }
-}
-
-/* Initialize the RDT group's allocations. */
-static int rdtgroup_init_alloc(struct rdtgroup *rdtgrp)
-{
- struct resctrl_schema *s;
- struct rdt_resource *r;
- int ret;
-
- list_for_each_entry(s, &resctrl_schema_all, list) {
- r = s->res;
- if (r->rid == RDT_RESOURCE_MBA) {
- rdtgroup_init_mba(r);
- } else {
- ret = rdtgroup_init_cat(s, rdtgrp->closid);
- if (ret < 0)
- return ret;
- }
-
- ret = resctrl_arch_update_domains(r, rdtgrp->closid);
- if (ret < 0) {
- rdt_last_cmd_puts("Failed to initialize allocations\n");
- return ret;
- }
-
- }
-
- rdtgrp->mode = RDT_MODE_SHAREABLE;
-
- return 0;
-}
-
-static int mkdir_rdt_prepare(struct kernfs_node *parent_kn,
- const char *name, umode_t mode,
- enum rdt_group_type rtype, struct rdtgroup **r)
-{
- struct rdtgroup *prdtgrp, *rdtgrp;
- struct kernfs_node *kn;
- uint files = 0;
- int ret;
-
- prdtgrp = rdtgroup_kn_lock_live(parent_kn);
- if (!prdtgrp) {
- ret = -ENODEV;
- goto out_unlock;
- }
-
- if (rtype == RDTMON_GROUP &&
- (prdtgrp->mode == RDT_MODE_PSEUDO_LOCKSETUP ||
- prdtgrp->mode == RDT_MODE_PSEUDO_LOCKED)) {
- ret = -EINVAL;
- rdt_last_cmd_puts("Pseudo-locking in progress\n");
- goto out_unlock;
- }
-
- /* allocate the rdtgroup. */
- rdtgrp = kzalloc(sizeof(*rdtgrp), GFP_KERNEL);
- if (!rdtgrp) {
- ret = -ENOSPC;
- rdt_last_cmd_puts("Kernel out of memory\n");
- goto out_unlock;
- }
- *r = rdtgrp;
- rdtgrp->mon.parent = prdtgrp;
- rdtgrp->type = rtype;
- INIT_LIST_HEAD(&rdtgrp->mon.crdtgrp_list);
-
- /* kernfs creates the directory for rdtgrp */
- kn = kernfs_create_dir(parent_kn, name, mode, rdtgrp);
- if (IS_ERR(kn)) {
- ret = PTR_ERR(kn);
- rdt_last_cmd_puts("kernfs create error\n");
- goto out_free_rgrp;
+ hw_dom->ctrl_val[i] = resctrl_get_default_ctrl(r);
+ msr_param.dom = d;
+ smp_call_function_any(&d->hdr.cpu_mask, rdt_ctrl_update, &msr_param, 1);
}
- rdtgrp->kn = kn;
- /*
- * kernfs_remove() will drop the reference count on "kn" which
- * will free it. But we still need it to stick around for the
- * rdtgroup_kn_unlock(kn) call. Take one extra reference here,
- * which will be dropped by kernfs_put() in rdtgroup_remove().
- */
- kernfs_get(kn);
-
- ret = rdtgroup_kn_set_ugid(kn);
- if (ret) {
- rdt_last_cmd_puts("kernfs perm error\n");
- goto out_destroy;
- }
-
- files = RFTYPE_BASE | BIT(RF_CTRLSHIFT + rtype);
- ret = rdtgroup_add_files(kn, files);
- if (ret) {
- rdt_last_cmd_puts("kernfs fill error\n");
- goto out_destroy;
- }
-
- if (rdt_mon_capable) {
- ret = alloc_rmid();
- if (ret < 0) {
- rdt_last_cmd_puts("Out of RMIDs\n");
- goto out_destroy;
- }
- rdtgrp->mon.rmid = ret;
-
- ret = mkdir_mondata_all(kn, rdtgrp, &rdtgrp->mon.mon_data_kn);
- if (ret) {
- rdt_last_cmd_puts("kernfs subdir error\n");
- goto out_idfree;
- }
- }
- kernfs_activate(kn);
-
- /*
- * The caller unlocks the parent_kn upon success.
- */
- return 0;
-
-out_idfree:
- free_rmid(rdtgrp->mon.rmid);
-out_destroy:
- kernfs_put(rdtgrp->kn);
- kernfs_remove(rdtgrp->kn);
-out_free_rgrp:
- kfree(rdtgrp);
-out_unlock:
- rdtgroup_kn_unlock(parent_kn);
- return ret;
-}
-
-static void mkdir_rdt_prepare_clean(struct rdtgroup *rgrp)
-{
- kernfs_remove(rgrp->kn);
- free_rmid(rgrp->mon.rmid);
- rdtgroup_remove(rgrp);
-}
-
-/*
- * Create a monitor group under "mon_groups" directory of a control
- * and monitor group(ctrl_mon). This is a resource group
- * to monitor a subset of tasks and cpus in its parent ctrl_mon group.
- */
-static int rdtgroup_mkdir_mon(struct kernfs_node *parent_kn,
- const char *name, umode_t mode)
-{
- struct rdtgroup *rdtgrp, *prgrp;
- int ret;
-
- ret = mkdir_rdt_prepare(parent_kn, name, mode, RDTMON_GROUP, &rdtgrp);
- if (ret)
- return ret;
-
- prgrp = rdtgrp->mon.parent;
- rdtgrp->closid = prgrp->closid;
-
- /*
- * Add the rdtgrp to the list of rdtgrps the parent
- * ctrl_mon group has to track.
- */
- list_add_tail(&rdtgrp->mon.crdtgrp_list, &prgrp->mon.crdtgrp_list);
-
- rdtgroup_kn_unlock(parent_kn);
- return ret;
-}
-
-/*
- * These are rdtgroups created under the root directory. Can be used
- * to allocate and monitor resources.
- */
-static int rdtgroup_mkdir_ctrl_mon(struct kernfs_node *parent_kn,
- const char *name, umode_t mode)
-{
- struct rdtgroup *rdtgrp;
- struct kernfs_node *kn;
- u32 closid;
- int ret;
-
- ret = mkdir_rdt_prepare(parent_kn, name, mode, RDTCTRL_GROUP, &rdtgrp);
- if (ret)
- return ret;
-
- kn = rdtgrp->kn;
- ret = closid_alloc();
- if (ret < 0) {
- rdt_last_cmd_puts("Out of CLOSIDs\n");
- goto out_common_fail;
- }
- closid = ret;
- ret = 0;
-
- rdtgrp->closid = closid;
- ret = rdtgroup_init_alloc(rdtgrp);
- if (ret < 0)
- goto out_id_free;
-
- list_add(&rdtgrp->rdtgroup_list, &rdt_all_groups);
-
- if (rdt_mon_capable) {
- /*
- * Create an empty mon_groups directory to hold the subset
- * of tasks and cpus to monitor.
- */
- ret = mongroup_create_dir(kn, rdtgrp, "mon_groups", NULL);
- if (ret) {
- rdt_last_cmd_puts("kernfs subdir error\n");
- goto out_del_list;
- }
- }
-
- goto out_unlock;
-
-out_del_list:
- list_del(&rdtgrp->rdtgroup_list);
-out_id_free:
- closid_free(closid);
-out_common_fail:
- mkdir_rdt_prepare_clean(rdtgrp);
-out_unlock:
- rdtgroup_kn_unlock(parent_kn);
- return ret;
-}
-
-/*
- * We allow creating mon groups only with in a directory called "mon_groups"
- * which is present in every ctrl_mon group. Check if this is a valid
- * "mon_groups" directory.
- *
- * 1. The directory should be named "mon_groups".
- * 2. The mon group itself should "not" be named "mon_groups".
- * This makes sure "mon_groups" directory always has a ctrl_mon group
- * as parent.
- */
-static bool is_mon_groups(struct kernfs_node *kn, const char *name)
-{
- return (!strcmp(kn->name, "mon_groups") &&
- strcmp(name, "mon_groups"));
-}
-
-static int rdtgroup_mkdir(struct kernfs_node *parent_kn, const char *name,
- umode_t mode)
-{
- /* Do not accept '\n' to avoid unparsable situation. */
- if (strchr(name, '\n'))
- return -EINVAL;
-
- /*
- * If the parent directory is the root directory and RDT
- * allocation is supported, add a control and monitoring
- * subdirectory
- */
- if (rdt_alloc_capable && parent_kn == rdtgroup_default.kn)
- return rdtgroup_mkdir_ctrl_mon(parent_kn, name, mode);
-
- /*
- * If RDT monitoring is supported and the parent directory is a valid
- * "mon_groups" directory, add a monitoring subdirectory.
- */
- if (rdt_mon_capable && is_mon_groups(parent_kn, name))
- return rdtgroup_mkdir_mon(parent_kn, name, mode);
-
- return -EPERM;
-}
-
-static int rdtgroup_rmdir_mon(struct rdtgroup *rdtgrp, cpumask_var_t tmpmask)
-{
- struct rdtgroup *prdtgrp = rdtgrp->mon.parent;
- int cpu;
-
- /* Give any tasks back to the parent group */
- rdt_move_group_tasks(rdtgrp, prdtgrp, tmpmask);
-
- /* Update per cpu rmid of the moved CPUs first */
- for_each_cpu(cpu, &rdtgrp->cpu_mask)
- per_cpu(pqr_state.default_rmid, cpu) = prdtgrp->mon.rmid;
- /*
- * Update the MSR on moved CPUs and CPUs which have moved
- * task running on them.
- */
- cpumask_or(tmpmask, tmpmask, &rdtgrp->cpu_mask);
- update_closid_rmid(tmpmask, NULL);
-
- rdtgrp->flags = RDT_DELETED;
- free_rmid(rdtgrp->mon.rmid);
-
- /*
- * Remove the rdtgrp from the parent ctrl_mon group's list
- */
- WARN_ON(list_empty(&prdtgrp->mon.crdtgrp_list));
- list_del(&rdtgrp->mon.crdtgrp_list);
-
- kernfs_remove(rdtgrp->kn);
-
- return 0;
-}
-
-static int rdtgroup_ctrl_remove(struct rdtgroup *rdtgrp)
-{
- rdtgrp->flags = RDT_DELETED;
- list_del(&rdtgrp->rdtgroup_list);
-
- kernfs_remove(rdtgrp->kn);
- return 0;
-}
-
-static int rdtgroup_rmdir_ctrl(struct rdtgroup *rdtgrp, cpumask_var_t tmpmask)
-{
- int cpu;
-
- /* Give any tasks back to the default group */
- rdt_move_group_tasks(rdtgrp, &rdtgroup_default, tmpmask);
-
- /* Give any CPUs back to the default group */
- cpumask_or(&rdtgroup_default.cpu_mask,
- &rdtgroup_default.cpu_mask, &rdtgrp->cpu_mask);
-
- /* Update per cpu closid and rmid of the moved CPUs first */
- for_each_cpu(cpu, &rdtgrp->cpu_mask) {
- per_cpu(pqr_state.default_closid, cpu) = rdtgroup_default.closid;
- per_cpu(pqr_state.default_rmid, cpu) = rdtgroup_default.mon.rmid;
- }
-
- /*
- * Update the MSR on moved CPUs and CPUs which have moved
- * task running on them.
- */
- cpumask_or(tmpmask, tmpmask, &rdtgrp->cpu_mask);
- update_closid_rmid(tmpmask, NULL);
-
- closid_free(rdtgrp->closid);
- free_rmid(rdtgrp->mon.rmid);
-
- rdtgroup_ctrl_remove(rdtgrp);
-
- /*
- * Free all the child monitor group rmids.
- */
- free_all_child_rdtgrp(rdtgrp);
-
- return 0;
-}
-
-static int rdtgroup_rmdir(struct kernfs_node *kn)
-{
- struct kernfs_node *parent_kn = kn->parent;
- struct rdtgroup *rdtgrp;
- cpumask_var_t tmpmask;
- int ret = 0;
-
- if (!zalloc_cpumask_var(&tmpmask, GFP_KERNEL))
- return -ENOMEM;
-
- rdtgrp = rdtgroup_kn_lock_live(kn);
- if (!rdtgrp) {
- ret = -EPERM;
- goto out;
- }
-
- /*
- * If the rdtgroup is a ctrl_mon group and parent directory
- * is the root directory, remove the ctrl_mon group.
- *
- * If the rdtgroup is a mon group and parent directory
- * is a valid "mon_groups" directory, remove the mon group.
- */
- if (rdtgrp->type == RDTCTRL_GROUP && parent_kn == rdtgroup_default.kn &&
- rdtgrp != &rdtgroup_default) {
- if (rdtgrp->mode == RDT_MODE_PSEUDO_LOCKSETUP ||
- rdtgrp->mode == RDT_MODE_PSEUDO_LOCKED) {
- ret = rdtgroup_ctrl_remove(rdtgrp);
- } else {
- ret = rdtgroup_rmdir_ctrl(rdtgrp, tmpmask);
- }
- } else if (rdtgrp->type == RDTMON_GROUP &&
- is_mon_groups(parent_kn, kn->name)) {
- ret = rdtgroup_rmdir_mon(rdtgrp, tmpmask);
- } else {
- ret = -EPERM;
- }
-
-out:
- rdtgroup_kn_unlock(kn);
- free_cpumask_var(tmpmask);
- return ret;
-}
-
-static int rdtgroup_show_options(struct seq_file *seq, struct kernfs_root *kf)
-{
- if (resctrl_arch_get_cdp_enabled(RDT_RESOURCE_L3))
- seq_puts(seq, ",cdp");
-
- if (resctrl_arch_get_cdp_enabled(RDT_RESOURCE_L2))
- seq_puts(seq, ",cdpl2");
-
- if (is_mba_sc(&rdt_resources_all[RDT_RESOURCE_MBA].r_resctrl))
- seq_puts(seq, ",mba_MBps");
-
- return 0;
-}
-
-static struct kernfs_syscall_ops rdtgroup_kf_syscall_ops = {
- .mkdir = rdtgroup_mkdir,
- .rmdir = rdtgroup_rmdir,
- .show_options = rdtgroup_show_options,
-};
-
-static int __init rdtgroup_setup_root(void)
-{
- int ret;
-
- rdt_root = kernfs_create_root(&rdtgroup_kf_syscall_ops,
- KERNFS_ROOT_CREATE_DEACTIVATED |
- KERNFS_ROOT_EXTRA_OPEN_PERM_CHECK,
- &rdtgroup_default);
- if (IS_ERR(rdt_root))
- return PTR_ERR(rdt_root);
-
- mutex_lock(&rdtgroup_mutex);
-
- rdtgroup_default.closid = 0;
- rdtgroup_default.mon.rmid = 0;
- rdtgroup_default.type = RDTCTRL_GROUP;
- INIT_LIST_HEAD(&rdtgroup_default.mon.crdtgrp_list);
-
- list_add(&rdtgroup_default.rdtgroup_list, &rdt_all_groups);
-
- ret = rdtgroup_add_files(kernfs_root_to_node(rdt_root), RF_CTRL_BASE);
- if (ret) {
- kernfs_destroy_root(rdt_root);
- goto out;
- }
-
- rdtgroup_default.kn = kernfs_root_to_node(rdt_root);
- kernfs_activate(rdtgroup_default.kn);
-
-out:
- mutex_unlock(&rdtgroup_mutex);
-
- return ret;
-}
-
-/*
- * rdtgroup_init - rdtgroup initialization
- *
- * Setup resctrl file system including set up root, create mount point,
- * register rdtgroup filesystem, and initialize files under root directory.
- *
- * Return: 0 on success or -errno
- */
-int __init rdtgroup_init(void)
-{
- int ret = 0;
-
- seq_buf_init(&last_cmd_status, last_cmd_status_buf,
- sizeof(last_cmd_status_buf));
-
- ret = rdtgroup_setup_root();
- if (ret)
- return ret;
-
- ret = sysfs_create_mount_point(fs_kobj, "resctrl");
- if (ret)
- goto cleanup_root;
-
- ret = register_filesystem(&rdt_fs_type);
- if (ret)
- goto cleanup_mountpoint;
-
- /*
- * Adding the resctrl debugfs directory here may not be ideal since
- * it would let the resctrl debugfs directory appear on the debugfs
- * filesystem before the resctrl filesystem is mounted.
- * It may also be ok since that would enable debugging of RDT before
- * resctrl is mounted.
- * The reason why the debugfs directory is created here and not in
- * rdt_get_tree() is because rdt_get_tree() takes rdtgroup_mutex and
- * during the debugfs directory creation also &sb->s_type->i_mutex_key
- * (the lockdep class of inode->i_rwsem). Other filesystem
- * interactions (eg. SyS_getdents) have the lock ordering:
- * &sb->s_type->i_mutex_key --> &mm->mmap_lock
- * During mmap(), called with &mm->mmap_lock, the rdtgroup_mutex
- * is taken, thus creating dependency:
- * &mm->mmap_lock --> rdtgroup_mutex for the latter that can cause
- * issues considering the other two lock dependencies.
- * By creating the debugfs directory here we avoid a dependency
- * that may cause deadlock (even though file operations cannot
- * occur until the filesystem is mounted, but I do not know how to
- * tell lockdep that).
- */
- debugfs_resctrl = debugfs_create_dir("resctrl", NULL);
-
- return 0;
-
-cleanup_mountpoint:
- sysfs_remove_mount_point(fs_kobj, "resctrl");
-cleanup_root:
- kernfs_destroy_root(rdt_root);
-
- return ret;
-}
-
-void __exit rdtgroup_exit(void)
-{
- debugfs_remove_recursive(debugfs_resctrl);
- unregister_filesystem(&rdt_fs_type);
- sysfs_remove_mount_point(fs_kobj, "resctrl");
- kernfs_destroy_root(rdt_root);
+ return;
}
diff --git a/arch/x86/kernel/cpu/scattered.c b/arch/x86/kernel/cpu/scattered.c
index fd44b54c90d5..dbf6d71bdf18 100644
--- a/arch/x86/kernel/cpu/scattered.c
+++ b/arch/x86/kernel/cpu/scattered.c
@@ -24,27 +24,37 @@ struct cpuid_bit {
* levels are different and there is a separate entry for each.
*/
static const struct cpuid_bit cpuid_bits[] = {
- { X86_FEATURE_APERFMPERF, CPUID_ECX, 0, 0x00000006, 0 },
- { X86_FEATURE_EPB, CPUID_ECX, 3, 0x00000006, 0 },
- { X86_FEATURE_INTEL_PPIN, CPUID_EBX, 0, 0x00000007, 1 },
- { X86_FEATURE_RRSBA_CTRL, CPUID_EDX, 2, 0x00000007, 2 },
- { X86_FEATURE_CQM_LLC, CPUID_EDX, 1, 0x0000000f, 0 },
- { X86_FEATURE_CQM_OCCUP_LLC, CPUID_EDX, 0, 0x0000000f, 1 },
- { X86_FEATURE_CQM_MBM_TOTAL, CPUID_EDX, 1, 0x0000000f, 1 },
- { X86_FEATURE_CQM_MBM_LOCAL, CPUID_EDX, 2, 0x0000000f, 1 },
- { X86_FEATURE_CAT_L3, CPUID_EBX, 1, 0x00000010, 0 },
- { X86_FEATURE_CAT_L2, CPUID_EBX, 2, 0x00000010, 0 },
- { X86_FEATURE_CDP_L3, CPUID_ECX, 2, 0x00000010, 1 },
- { X86_FEATURE_CDP_L2, CPUID_ECX, 2, 0x00000010, 2 },
- { X86_FEATURE_MBA, CPUID_EBX, 3, 0x00000010, 0 },
- { X86_FEATURE_PER_THREAD_MBA, CPUID_ECX, 0, 0x00000010, 3 },
- { X86_FEATURE_SGX1, CPUID_EAX, 0, 0x00000012, 0 },
- { X86_FEATURE_SGX2, CPUID_EAX, 1, 0x00000012, 0 },
- { X86_FEATURE_HW_PSTATE, CPUID_EDX, 7, 0x80000007, 0 },
- { X86_FEATURE_CPB, CPUID_EDX, 9, 0x80000007, 0 },
- { X86_FEATURE_PROC_FEEDBACK, CPUID_EDX, 11, 0x80000007, 0 },
- { X86_FEATURE_MBA, CPUID_EBX, 6, 0x80000008, 0 },
- { X86_FEATURE_PERFMON_V2, CPUID_EAX, 0, 0x80000022, 0 },
+ { X86_FEATURE_APERFMPERF, CPUID_ECX, 0, 0x00000006, 0 },
+ { X86_FEATURE_EPB, CPUID_ECX, 3, 0x00000006, 0 },
+ { X86_FEATURE_INTEL_PPIN, CPUID_EBX, 0, 0x00000007, 1 },
+ { X86_FEATURE_APX, CPUID_EDX, 21, 0x00000007, 1 },
+ { X86_FEATURE_RRSBA_CTRL, CPUID_EDX, 2, 0x00000007, 2 },
+ { X86_FEATURE_BHI_CTRL, CPUID_EDX, 4, 0x00000007, 2 },
+ { X86_FEATURE_CQM_LLC, CPUID_EDX, 1, 0x0000000f, 0 },
+ { X86_FEATURE_CQM_OCCUP_LLC, CPUID_EDX, 0, 0x0000000f, 1 },
+ { X86_FEATURE_CQM_MBM_TOTAL, CPUID_EDX, 1, 0x0000000f, 1 },
+ { X86_FEATURE_CQM_MBM_LOCAL, CPUID_EDX, 2, 0x0000000f, 1 },
+ { X86_FEATURE_CAT_L3, CPUID_EBX, 1, 0x00000010, 0 },
+ { X86_FEATURE_CAT_L2, CPUID_EBX, 2, 0x00000010, 0 },
+ { X86_FEATURE_CDP_L3, CPUID_ECX, 2, 0x00000010, 1 },
+ { X86_FEATURE_CDP_L2, CPUID_ECX, 2, 0x00000010, 2 },
+ { X86_FEATURE_MBA, CPUID_EBX, 3, 0x00000010, 0 },
+ { X86_FEATURE_PER_THREAD_MBA, CPUID_ECX, 0, 0x00000010, 3 },
+ { X86_FEATURE_SGX1, CPUID_EAX, 0, 0x00000012, 0 },
+ { X86_FEATURE_SGX2, CPUID_EAX, 1, 0x00000012, 0 },
+ { X86_FEATURE_SGX_EDECCSSA, CPUID_EAX, 11, 0x00000012, 0 },
+ { X86_FEATURE_HW_PSTATE, CPUID_EDX, 7, 0x80000007, 0 },
+ { X86_FEATURE_CPB, CPUID_EDX, 9, 0x80000007, 0 },
+ { X86_FEATURE_PROC_FEEDBACK, CPUID_EDX, 11, 0x80000007, 0 },
+ { X86_FEATURE_AMD_FAST_CPPC, CPUID_EDX, 15, 0x80000007, 0 },
+ { X86_FEATURE_MBA, CPUID_EBX, 6, 0x80000008, 0 },
+ { X86_FEATURE_SMBA, CPUID_EBX, 2, 0x80000020, 0 },
+ { X86_FEATURE_BMEC, CPUID_EBX, 3, 0x80000020, 0 },
+ { X86_FEATURE_AMD_WORKLOAD_CLASS, CPUID_EAX, 22, 0x80000021, 0 },
+ { X86_FEATURE_PERFMON_V2, CPUID_EAX, 0, 0x80000022, 0 },
+ { X86_FEATURE_AMD_LBR_V2, CPUID_EAX, 1, 0x80000022, 0 },
+ { X86_FEATURE_AMD_LBR_PMC_FREEZE, CPUID_EAX, 2, 0x80000022, 0 },
+ { X86_FEATURE_AMD_HTR_CORES, CPUID_EAX, 30, 0x80000026, 0 },
{ 0, 0, 0, 0, 0 }
};
diff --git a/arch/x86/kernel/cpu/sgx/driver.c b/arch/x86/kernel/cpu/sgx/driver.c
index aa9b8b868867..7f8d1e11dbee 100644
--- a/arch/x86/kernel/cpu/sgx/driver.c
+++ b/arch/x86/kernel/cpu/sgx/driver.c
@@ -95,7 +95,7 @@ static int sgx_mmap(struct file *file, struct vm_area_struct *vma)
return ret;
vma->vm_ops = &sgx_vm_ops;
- vma->vm_flags |= VM_PFNMAP | VM_DONTEXPAND | VM_DONTDUMP | VM_IO;
+ vm_flags_set(vma, VM_PFNMAP | VM_DONTEXPAND | VM_DONTDUMP | VM_IO);
vma->vm_private_data = encl;
return 0;
@@ -113,7 +113,7 @@ static unsigned long sgx_get_unmapped_area(struct file *file,
if (flags & MAP_FIXED)
return addr;
- return current->mm->get_unmapped_area(file, addr, len, pgoff, flags);
+ return mm_get_unmapped_area(current->mm, file, addr, len, pgoff, flags);
}
#ifdef CONFIG_COMPAT
@@ -150,13 +150,15 @@ int __init sgx_drv_init(void)
u64 xfrm_mask;
int ret;
- if (!cpu_feature_enabled(X86_FEATURE_SGX_LC))
+ if (!cpu_feature_enabled(X86_FEATURE_SGX_LC)) {
+ pr_info("SGX disabled: SGX launch control CPU feature is not available, /dev/sgx_enclave disabled.\n");
return -ENODEV;
+ }
cpuid_count(SGX_CPUID, 0, &eax, &ebx, &ecx, &edx);
if (!(eax & 1)) {
- pr_err("SGX disabled: SGX1 instruction support not available.\n");
+ pr_info("SGX disabled: SGX1 instruction support not available, /dev/sgx_enclave disabled.\n");
return -ENODEV;
}
@@ -173,8 +175,10 @@ int __init sgx_drv_init(void)
}
ret = misc_register(&sgx_dev_enclave);
- if (ret)
+ if (ret) {
+ pr_info("SGX disabled: Unable to register the /dev/sgx_enclave driver (%d).\n", ret);
return ret;
+ }
return 0;
}
diff --git a/arch/x86/kernel/cpu/sgx/driver.h b/arch/x86/kernel/cpu/sgx/driver.h
index 4eddb4d571ef..30f39f92c98f 100644
--- a/arch/x86/kernel/cpu/sgx/driver.h
+++ b/arch/x86/kernel/cpu/sgx/driver.h
@@ -2,7 +2,6 @@
#ifndef __ARCH_SGX_DRIVER_H__
#define __ARCH_SGX_DRIVER_H__
-#include <crypto/hash.h>
#include <linux/kref.h>
#include <linux/mmu_notifier.h>
#include <linux/radix-tree.h>
diff --git a/arch/x86/kernel/cpu/sgx/encl.c b/arch/x86/kernel/cpu/sgx/encl.c
index 19876ebfb504..279148e72459 100644
--- a/arch/x86/kernel/cpu/sgx/encl.c
+++ b/arch/x86/kernel/cpu/sgx/encl.c
@@ -12,6 +12,9 @@
#include "encls.h"
#include "sgx.h"
+static int sgx_encl_lookup_backing(struct sgx_encl *encl, unsigned long page_index,
+ struct sgx_backing *backing);
+
#define PCMDS_PER_PAGE (PAGE_SIZE / sizeof(struct sgx_pcmd))
/*
* 32 PCMD entries share a PCMD page. PCMD_FIRST_MASK is used to
@@ -157,8 +160,8 @@ static int __sgx_encl_eldu(struct sgx_encl_page *encl_page,
return ret;
pginfo.addr = encl_page->desc & PAGE_MASK;
- pginfo.contents = (unsigned long)kmap_atomic(b.contents);
- pcmd_page = kmap_atomic(b.pcmd);
+ pginfo.contents = (unsigned long)kmap_local_page(b.contents);
+ pcmd_page = kmap_local_page(b.pcmd);
pginfo.metadata = (unsigned long)pcmd_page + b.pcmd_offset;
if (secs_page)
@@ -184,8 +187,8 @@ static int __sgx_encl_eldu(struct sgx_encl_page *encl_page,
*/
pcmd_page_empty = !memchr_inv(pcmd_page, 0, PAGE_SIZE);
- kunmap_atomic(pcmd_page);
- kunmap_atomic((void *)(unsigned long)pginfo.contents);
+ kunmap_local(pcmd_page);
+ kunmap_local((void *)(unsigned long)pginfo.contents);
get_page(b.pcmd);
sgx_encl_put_backing(&b);
@@ -194,10 +197,10 @@ static int __sgx_encl_eldu(struct sgx_encl_page *encl_page,
if (pcmd_page_empty && !reclaimer_writing_to_pcmd(encl, pcmd_first_page)) {
sgx_encl_truncate_backing_page(encl, PFN_DOWN(page_pcmd_off));
- pcmd_page = kmap_atomic(b.pcmd);
+ pcmd_page = kmap_local_page(b.pcmd);
if (memchr_inv(pcmd_page, 0, PAGE_SIZE))
pr_warn("PCMD page not empty after truncate.\n");
- kunmap_atomic(pcmd_page);
+ kunmap_local(pcmd_page);
}
put_page(b.pcmd);
@@ -232,12 +235,53 @@ static struct sgx_epc_page *sgx_encl_eldu(struct sgx_encl_page *encl_page,
return epc_page;
}
-static struct sgx_encl_page *sgx_encl_load_page(struct sgx_encl *encl,
- unsigned long addr,
- unsigned long vm_flags)
+/*
+ * Ensure the SECS page is not swapped out. Must be called with encl->lock
+ * to protect the enclave states including SECS and ensure the SECS page is
+ * not swapped out again while being used.
+ */
+static struct sgx_epc_page *sgx_encl_load_secs(struct sgx_encl *encl)
+{
+ struct sgx_epc_page *epc_page = encl->secs.epc_page;
+
+ if (!epc_page)
+ epc_page = sgx_encl_eldu(&encl->secs, NULL);
+
+ return epc_page;
+}
+
+static struct sgx_encl_page *__sgx_encl_load_page(struct sgx_encl *encl,
+ struct sgx_encl_page *entry)
{
- unsigned long vm_prot_bits = vm_flags & (VM_READ | VM_WRITE | VM_EXEC);
struct sgx_epc_page *epc_page;
+
+ /* Entry successfully located. */
+ if (entry->epc_page) {
+ if (entry->desc & SGX_ENCL_PAGE_BEING_RECLAIMED)
+ return ERR_PTR(-EBUSY);
+
+ return entry;
+ }
+
+ epc_page = sgx_encl_load_secs(encl);
+ if (IS_ERR(epc_page))
+ return ERR_CAST(epc_page);
+
+ epc_page = sgx_encl_eldu(entry, encl->secs.epc_page);
+ if (IS_ERR(epc_page))
+ return ERR_CAST(epc_page);
+
+ encl->secs_child_cnt++;
+ sgx_mark_page_reclaimable(entry->epc_page);
+
+ return entry;
+}
+
+static struct sgx_encl_page *sgx_encl_load_page_in_vma(struct sgx_encl *encl,
+ unsigned long addr,
+ unsigned long vm_flags)
+{
+ unsigned long vm_prot_bits = vm_flags & VM_ACCESS_FLAGS;
struct sgx_encl_page *entry;
entry = xa_load(&encl->page_array, PFN_DOWN(addr));
@@ -245,35 +289,142 @@ static struct sgx_encl_page *sgx_encl_load_page(struct sgx_encl *encl,
return ERR_PTR(-EFAULT);
/*
- * Verify that the faulted page has equal or higher build time
+ * Verify that the page has equal or higher build time
* permissions than the VMA permissions (i.e. the subset of {VM_READ,
* VM_WRITE, VM_EXECUTE} in vma->vm_flags).
*/
if ((entry->vm_max_prot_bits & vm_prot_bits) != vm_prot_bits)
return ERR_PTR(-EFAULT);
- /* Entry successfully located. */
- if (entry->epc_page) {
- if (entry->desc & SGX_ENCL_PAGE_BEING_RECLAIMED)
- return ERR_PTR(-EBUSY);
+ return __sgx_encl_load_page(encl, entry);
+}
- return entry;
+struct sgx_encl_page *sgx_encl_load_page(struct sgx_encl *encl,
+ unsigned long addr)
+{
+ struct sgx_encl_page *entry;
+
+ entry = xa_load(&encl->page_array, PFN_DOWN(addr));
+ if (!entry)
+ return ERR_PTR(-EFAULT);
+
+ return __sgx_encl_load_page(encl, entry);
+}
+
+/**
+ * sgx_encl_eaug_page() - Dynamically add page to initialized enclave
+ * @vma: VMA obtained from fault info from where page is accessed
+ * @encl: enclave accessing the page
+ * @addr: address that triggered the page fault
+ *
+ * When an initialized enclave accesses a page with no backing EPC page
+ * on a SGX2 system then the EPC can be added dynamically via the SGX2
+ * ENCLS[EAUG] instruction.
+ *
+ * Returns: Appropriate vm_fault_t: VM_FAULT_NOPAGE when PTE was installed
+ * successfully, VM_FAULT_SIGBUS or VM_FAULT_OOM as error otherwise.
+ */
+static vm_fault_t sgx_encl_eaug_page(struct vm_area_struct *vma,
+ struct sgx_encl *encl, unsigned long addr)
+{
+ vm_fault_t vmret = VM_FAULT_SIGBUS;
+ struct sgx_pageinfo pginfo = {0};
+ struct sgx_encl_page *encl_page;
+ struct sgx_epc_page *epc_page;
+ struct sgx_va_page *va_page;
+ unsigned long phys_addr;
+ u64 secinfo_flags;
+ int ret;
+
+ if (!test_bit(SGX_ENCL_INITIALIZED, &encl->flags))
+ return VM_FAULT_SIGBUS;
+
+ /*
+ * Ignore internal permission checking for dynamically added pages.
+ * They matter only for data added during the pre-initialization
+ * phase. The enclave decides the permissions by the means of
+ * EACCEPT, EACCEPTCOPY and EMODPE.
+ */
+ secinfo_flags = SGX_SECINFO_R | SGX_SECINFO_W | SGX_SECINFO_X;
+ encl_page = sgx_encl_page_alloc(encl, addr - encl->base, secinfo_flags);
+ if (IS_ERR(encl_page))
+ return VM_FAULT_OOM;
+
+ mutex_lock(&encl->lock);
+
+ epc_page = sgx_encl_load_secs(encl);
+ if (IS_ERR(epc_page)) {
+ if (PTR_ERR(epc_page) == -EBUSY)
+ vmret = VM_FAULT_NOPAGE;
+ goto err_out_unlock;
}
- if (!(encl->secs.epc_page)) {
- epc_page = sgx_encl_eldu(&encl->secs, NULL);
- if (IS_ERR(epc_page))
- return ERR_CAST(epc_page);
+ epc_page = sgx_alloc_epc_page(encl_page, false);
+ if (IS_ERR(epc_page)) {
+ if (PTR_ERR(epc_page) == -EBUSY)
+ vmret = VM_FAULT_NOPAGE;
+ goto err_out_unlock;
}
- epc_page = sgx_encl_eldu(entry, encl->secs.epc_page);
- if (IS_ERR(epc_page))
- return ERR_CAST(epc_page);
+ va_page = sgx_encl_grow(encl, false);
+ if (IS_ERR(va_page)) {
+ if (PTR_ERR(va_page) == -EBUSY)
+ vmret = VM_FAULT_NOPAGE;
+ goto err_out_epc;
+ }
+
+ if (va_page)
+ list_add(&va_page->list, &encl->va_pages);
+
+ ret = xa_insert(&encl->page_array, PFN_DOWN(encl_page->desc),
+ encl_page, GFP_KERNEL);
+ /*
+ * If ret == -EBUSY then page was created in another flow while
+ * running without encl->lock
+ */
+ if (ret)
+ goto err_out_shrink;
+ pginfo.secs = (unsigned long)sgx_get_epc_virt_addr(encl->secs.epc_page);
+ pginfo.addr = encl_page->desc & PAGE_MASK;
+ pginfo.metadata = 0;
+
+ ret = __eaug(&pginfo, sgx_get_epc_virt_addr(epc_page));
+ if (ret)
+ goto err_out;
+
+ encl_page->encl = encl;
+ encl_page->epc_page = epc_page;
+ encl_page->type = SGX_PAGE_TYPE_REG;
encl->secs_child_cnt++;
- sgx_mark_page_reclaimable(entry->epc_page);
- return entry;
+ sgx_mark_page_reclaimable(encl_page->epc_page);
+
+ phys_addr = sgx_get_epc_phys_addr(epc_page);
+ /*
+ * Do not undo everything when creating PTE entry fails - next #PF
+ * would find page ready for a PTE.
+ */
+ vmret = vmf_insert_pfn(vma, addr, PFN_DOWN(phys_addr));
+ if (vmret != VM_FAULT_NOPAGE) {
+ mutex_unlock(&encl->lock);
+ return VM_FAULT_SIGBUS;
+ }
+ mutex_unlock(&encl->lock);
+ return VM_FAULT_NOPAGE;
+
+err_out:
+ xa_erase(&encl->page_array, PFN_DOWN(encl_page->desc));
+
+err_out_shrink:
+ sgx_encl_shrink(encl, va_page);
+err_out_epc:
+ sgx_encl_free_epc_page(epc_page);
+err_out_unlock:
+ mutex_unlock(&encl->lock);
+ kfree(encl_page);
+
+ return vmret;
}
static vm_fault_t sgx_vma_fault(struct vm_fault *vmf)
@@ -295,9 +446,20 @@ static vm_fault_t sgx_vma_fault(struct vm_fault *vmf)
if (unlikely(!encl))
return VM_FAULT_SIGBUS;
+ /*
+ * The page_array keeps track of all enclave pages, whether they
+ * are swapped out or not. If there is no entry for this page and
+ * the system supports SGX2 then it is possible to dynamically add
+ * a new enclave page. This is only possible for an initialized
+ * enclave that will be checked for right away.
+ */
+ if (cpu_feature_enabled(X86_FEATURE_SGX2) &&
+ (!xa_load(&encl->page_array, PFN_DOWN(addr))))
+ return sgx_encl_eaug_page(vma, encl, addr);
+
mutex_lock(&encl->lock);
- entry = sgx_encl_load_page(encl, addr, vma->vm_flags);
+ entry = sgx_encl_load_page_in_vma(encl, addr, vma->vm_flags);
if (IS_ERR(entry)) {
mutex_unlock(&encl->lock);
@@ -360,13 +522,18 @@ static void sgx_vma_open(struct vm_area_struct *vma)
int sgx_encl_may_map(struct sgx_encl *encl, unsigned long start,
unsigned long end, unsigned long vm_flags)
{
- unsigned long vm_prot_bits = vm_flags & (VM_READ | VM_WRITE | VM_EXEC);
+ unsigned long vm_prot_bits = vm_flags & VM_ACCESS_FLAGS;
struct sgx_encl_page *page;
unsigned long count = 0;
int ret = 0;
XA_STATE(xas, &encl->page_array, PFN_DOWN(start));
+ /* Disallow mapping outside enclave's address range. */
+ if (test_bit(SGX_ENCL_INITIALIZED, &encl->flags) &&
+ (start < encl->base || end > encl->base + encl->size))
+ return -EACCES;
+
/*
* Disallow READ_IMPLIES_EXEC tasks as their VMA permissions might
* conflict with the enclave page permissions.
@@ -445,7 +612,7 @@ static struct sgx_encl_page *sgx_encl_reserve_page(struct sgx_encl *encl,
for ( ; ; ) {
mutex_lock(&encl->lock);
- entry = sgx_encl_load_page(encl, addr, vm_flags);
+ entry = sgx_encl_load_page_in_vma(encl, addr, vm_flags);
if (PTR_ERR(entry) != -EBUSY)
break;
@@ -533,11 +700,15 @@ const struct vm_operations_struct sgx_vm_ops = {
void sgx_encl_release(struct kref *ref)
{
struct sgx_encl *encl = container_of(ref, struct sgx_encl, refcount);
+ unsigned long max_page_index = PFN_DOWN(encl->base + encl->size - 1);
struct sgx_va_page *va_page;
struct sgx_encl_page *entry;
- unsigned long index;
+ unsigned long count = 0;
+
+ XA_STATE(xas, &encl->page_array, PFN_DOWN(encl->base));
- xa_for_each(&encl->page_array, index, entry) {
+ xas_lock(&xas);
+ xas_for_each(&xas, entry, max_page_index) {
if (entry->epc_page) {
/*
* The page and its radix tree entry cannot be freed
@@ -552,9 +723,20 @@ void sgx_encl_release(struct kref *ref)
}
kfree(entry);
- /* Invoke scheduler to prevent soft lockups. */
- cond_resched();
+ /*
+ * Invoke scheduler on every XA_CHECK_SCHED iteration
+ * to prevent soft lockups.
+ */
+ if (!(++count % XA_CHECK_SCHED)) {
+ xas_pause(&xas);
+ xas_unlock(&xas);
+
+ cond_resched();
+
+ xas_lock(&xas);
+ }
}
+ xas_unlock(&xas);
xa_destroy(&encl->page_array);
@@ -593,6 +775,7 @@ static void sgx_mmu_notifier_release(struct mmu_notifier *mn,
{
struct sgx_encl_mm *encl_mm = container_of(mn, struct sgx_encl_mm, mmu_notifier);
struct sgx_encl_mm *tmp = NULL;
+ bool found = false;
/*
* The enclave itself can remove encl_mm. Note, objects can't be moved
@@ -602,12 +785,13 @@ static void sgx_mmu_notifier_release(struct mmu_notifier *mn,
list_for_each_entry(tmp, &encl_mm->encl->mm_list, list) {
if (tmp == encl_mm) {
list_del_rcu(&encl_mm->list);
+ found = true;
break;
}
}
spin_unlock(&encl_mm->encl->mm_lock);
- if (tmp == encl_mm) {
+ if (found) {
synchronize_srcu(&encl_mm->encl->srcu);
mmu_notifier_put(mn);
}
@@ -687,7 +871,7 @@ int sgx_encl_mm_add(struct sgx_encl *encl, struct mm_struct *mm)
spin_lock(&encl->mm_lock);
list_add_rcu(&encl_mm->list, &encl->mm_list);
- /* Pairs with smp_rmb() in sgx_reclaimer_block(). */
+ /* Pairs with smp_rmb() in sgx_zap_enclave_ptes(). */
smp_wmb();
encl->mm_list_version++;
spin_unlock(&encl->mm_lock);
@@ -695,18 +879,84 @@ int sgx_encl_mm_add(struct sgx_encl *encl, struct mm_struct *mm)
return 0;
}
+/**
+ * sgx_encl_cpumask() - Query which CPUs might be accessing the enclave
+ * @encl: the enclave
+ *
+ * Some SGX functions require that no cached linear-to-physical address
+ * mappings are present before they can succeed. For example, ENCLS[EWB]
+ * copies a page from the enclave page cache to regular main memory but
+ * it fails if it cannot ensure that there are no cached
+ * linear-to-physical address mappings referring to the page.
+ *
+ * SGX hardware flushes all cached linear-to-physical mappings on a CPU
+ * when an enclave is exited via ENCLU[EEXIT] or an Asynchronous Enclave
+ * Exit (AEX). Exiting an enclave will thus ensure cached linear-to-physical
+ * address mappings are cleared but coordination with the tracking done within
+ * the SGX hardware is needed to support the SGX functions that depend on this
+ * cache clearing.
+ *
+ * When the ENCLS[ETRACK] function is issued on an enclave the hardware
+ * tracks threads operating inside the enclave at that time. The SGX
+ * hardware tracking require that all the identified threads must have
+ * exited the enclave in order to flush the mappings before a function such
+ * as ENCLS[EWB] will be permitted
+ *
+ * The following flow is used to support SGX functions that require that
+ * no cached linear-to-physical address mappings are present:
+ * 1) Execute ENCLS[ETRACK] to initiate hardware tracking.
+ * 2) Use this function (sgx_encl_cpumask()) to query which CPUs might be
+ * accessing the enclave.
+ * 3) Send IPI to identified CPUs, kicking them out of the enclave and
+ * thus flushing all locally cached linear-to-physical address mappings.
+ * 4) Execute SGX function.
+ *
+ * Context: It is required to call this function after ENCLS[ETRACK].
+ * This will ensure that if any new mm appears (racing with
+ * sgx_encl_mm_add()) then the new mm will enter into the
+ * enclave with fresh linear-to-physical address mappings.
+ *
+ * It is required that all IPIs are completed before a new
+ * ENCLS[ETRACK] is issued so be sure to protect steps 1 to 3
+ * of the above flow with the enclave's mutex.
+ *
+ * Return: cpumask of CPUs that might be accessing @encl
+ */
+const cpumask_t *sgx_encl_cpumask(struct sgx_encl *encl)
+{
+ cpumask_t *cpumask = &encl->cpumask;
+ struct sgx_encl_mm *encl_mm;
+ int idx;
+
+ cpumask_clear(cpumask);
+
+ idx = srcu_read_lock(&encl->srcu);
+
+ list_for_each_entry_rcu(encl_mm, &encl->mm_list, list) {
+ if (!mmget_not_zero(encl_mm->mm))
+ continue;
+
+ cpumask_or(cpumask, cpumask, mm_cpumask(encl_mm->mm));
+
+ mmput_async(encl_mm->mm);
+ }
+
+ srcu_read_unlock(&encl->srcu, idx);
+
+ return cpumask;
+}
+
static struct page *sgx_encl_get_backing_page(struct sgx_encl *encl,
pgoff_t index)
{
- struct inode *inode = encl->backing->f_path.dentry->d_inode;
- struct address_space *mapping = inode->i_mapping;
+ struct address_space *mapping = encl->backing->f_mapping;
gfp_t gfpmask = mapping_gfp_mask(mapping);
return shmem_read_mapping_page_gfp(mapping, index, gfpmask);
}
/**
- * sgx_encl_get_backing() - Pin the backing storage
+ * __sgx_encl_get_backing() - Pin the backing storage
* @encl: an enclave pointer
* @page_index: enclave page index
* @backing: data for accessing backing storage for the page
@@ -718,7 +968,7 @@ static struct page *sgx_encl_get_backing_page(struct sgx_encl *encl,
* 0 on success,
* -errno otherwise.
*/
-static int sgx_encl_get_backing(struct sgx_encl *encl, unsigned long page_index,
+static int __sgx_encl_get_backing(struct sgx_encl *encl, unsigned long page_index,
struct sgx_backing *backing)
{
pgoff_t page_pcmd_off = sgx_encl_get_backing_page_pcmd_offset(encl, page_index);
@@ -735,7 +985,6 @@ static int sgx_encl_get_backing(struct sgx_encl *encl, unsigned long page_index,
return PTR_ERR(pcmd);
}
- backing->page_index = page_index;
backing->contents = contents;
backing->pcmd = pcmd;
backing->pcmd_offset = page_pcmd_off & (PAGE_SIZE - 1);
@@ -794,7 +1043,7 @@ static struct mem_cgroup *sgx_encl_get_mem_cgroup(struct sgx_encl *encl)
}
/**
- * sgx_encl_alloc_backing() - allocate a new backing storage page
+ * sgx_encl_alloc_backing() - create a new backing storage page
* @encl: an enclave pointer
* @page_index: enclave page index
* @backing: data for accessing backing storage for the page
@@ -802,7 +1051,9 @@ static struct mem_cgroup *sgx_encl_get_mem_cgroup(struct sgx_encl *encl)
* When called from ksgxd, sets the active memcg from one of the
* mms in the enclave's mm_list prior to any backing page allocation,
* in order to ensure that shmem page allocations are charged to the
- * enclave.
+ * enclave. Create a backing page for loading data back into an EPC page with
+ * ELDU. This function takes a reference on a new backing page which
+ * must be dropped with a corresponding call to sgx_encl_put_backing().
*
* Return:
* 0 on success,
@@ -815,7 +1066,7 @@ int sgx_encl_alloc_backing(struct sgx_encl *encl, unsigned long page_index,
struct mem_cgroup *memcg = set_active_memcg(encl_memcg);
int ret;
- ret = sgx_encl_get_backing(encl, page_index, backing);
+ ret = __sgx_encl_get_backing(encl, page_index, backing);
set_active_memcg(memcg);
mem_cgroup_put(encl_memcg);
@@ -833,15 +1084,17 @@ int sgx_encl_alloc_backing(struct sgx_encl *encl, unsigned long page_index,
* It is the caller's responsibility to ensure that it is appropriate to use
* sgx_encl_lookup_backing() rather than sgx_encl_alloc_backing(). If lookup is
* not used correctly, this will cause an allocation which is not accounted for.
+ * This function takes a reference on an existing backing page which must be
+ * dropped with a corresponding call to sgx_encl_put_backing().
*
* Return:
* 0 on success,
* -errno otherwise.
*/
-int sgx_encl_lookup_backing(struct sgx_encl *encl, unsigned long page_index,
+static int sgx_encl_lookup_backing(struct sgx_encl *encl, unsigned long page_index,
struct sgx_backing *backing)
{
- return sgx_encl_get_backing(encl, page_index, backing);
+ return __sgx_encl_get_backing(encl, page_index, backing);
}
/**
@@ -902,8 +1155,85 @@ int sgx_encl_test_and_clear_young(struct mm_struct *mm,
return ret;
}
+struct sgx_encl_page *sgx_encl_page_alloc(struct sgx_encl *encl,
+ unsigned long offset,
+ u64 secinfo_flags)
+{
+ struct sgx_encl_page *encl_page;
+ unsigned long prot;
+
+ encl_page = kzalloc(sizeof(*encl_page), GFP_KERNEL);
+ if (!encl_page)
+ return ERR_PTR(-ENOMEM);
+
+ encl_page->desc = encl->base + offset;
+ encl_page->encl = encl;
+
+ prot = _calc_vm_trans(secinfo_flags, SGX_SECINFO_R, PROT_READ) |
+ _calc_vm_trans(secinfo_flags, SGX_SECINFO_W, PROT_WRITE) |
+ _calc_vm_trans(secinfo_flags, SGX_SECINFO_X, PROT_EXEC);
+
+ /*
+ * TCS pages must always RW set for CPU access while the SECINFO
+ * permissions are *always* zero - the CPU ignores the user provided
+ * values and silently overwrites them with zero permissions.
+ */
+ if ((secinfo_flags & SGX_SECINFO_PAGE_TYPE_MASK) == SGX_SECINFO_TCS)
+ prot |= PROT_READ | PROT_WRITE;
+
+ /* Calculate maximum of the VM flags for the page. */
+ encl_page->vm_max_prot_bits = calc_vm_prot_bits(prot, 0);
+
+ return encl_page;
+}
+
+/**
+ * sgx_zap_enclave_ptes() - remove PTEs mapping the address from enclave
+ * @encl: the enclave
+ * @addr: page aligned pointer to single page for which PTEs will be removed
+ *
+ * Multiple VMAs may have an enclave page mapped. Remove the PTE mapping
+ * @addr from each VMA. Ensure that page fault handler is ready to handle
+ * new mappings of @addr before calling this function.
+ */
+void sgx_zap_enclave_ptes(struct sgx_encl *encl, unsigned long addr)
+{
+ unsigned long mm_list_version;
+ struct sgx_encl_mm *encl_mm;
+ struct vm_area_struct *vma;
+ int idx, ret;
+
+ do {
+ mm_list_version = encl->mm_list_version;
+
+ /* Pairs with smp_wmb() in sgx_encl_mm_add(). */
+ smp_rmb();
+
+ idx = srcu_read_lock(&encl->srcu);
+
+ list_for_each_entry_rcu(encl_mm, &encl->mm_list, list) {
+ if (!mmget_not_zero(encl_mm->mm))
+ continue;
+
+ mmap_read_lock(encl_mm->mm);
+
+ ret = sgx_encl_find(encl_mm->mm, addr, &vma);
+ if (!ret && encl == vma->vm_private_data)
+ zap_vma_ptes(vma, addr, PAGE_SIZE);
+
+ mmap_read_unlock(encl_mm->mm);
+
+ mmput_async(encl_mm->mm);
+ }
+
+ srcu_read_unlock(&encl->srcu, idx);
+ } while (unlikely(encl->mm_list_version != mm_list_version));
+}
+
/**
* sgx_alloc_va_page() - Allocate a Version Array (VA) page
+ * @reclaim: Reclaim EPC pages directly if none available. Enclave
+ * mutex should not be held if this is set.
*
* Allocate a free EPC page and convert it to a Version Array (VA) page.
*
@@ -911,12 +1241,12 @@ int sgx_encl_test_and_clear_young(struct mm_struct *mm,
* a VA page,
* -errno otherwise
*/
-struct sgx_epc_page *sgx_alloc_va_page(void)
+struct sgx_epc_page *sgx_alloc_va_page(bool reclaim)
{
struct sgx_epc_page *epc_page;
int ret;
- epc_page = sgx_alloc_epc_page(NULL, true);
+ epc_page = sgx_alloc_epc_page(NULL, reclaim);
if (IS_ERR(epc_page))
return ERR_CAST(epc_page);
diff --git a/arch/x86/kernel/cpu/sgx/encl.h b/arch/x86/kernel/cpu/sgx/encl.h
index 332ef3568267..f94ff14c9486 100644
--- a/arch/x86/kernel/cpu/sgx/encl.h
+++ b/arch/x86/kernel/cpu/sgx/encl.h
@@ -27,7 +27,8 @@
struct sgx_encl_page {
unsigned long desc;
- unsigned long vm_max_prot_bits;
+ unsigned long vm_max_prot_bits:8;
+ enum sgx_page_type type:16;
struct sgx_epc_page *epc_page;
struct sgx_encl *encl;
struct sgx_va_page *va_page;
@@ -78,7 +79,6 @@ struct sgx_va_page {
};
struct sgx_backing {
- pgoff_t page_index;
struct page *contents;
struct page *pcmd;
unsigned long pcmd_offset;
@@ -106,18 +106,24 @@ int sgx_encl_may_map(struct sgx_encl *encl, unsigned long start,
bool current_is_ksgxd(void);
void sgx_encl_release(struct kref *ref);
int sgx_encl_mm_add(struct sgx_encl *encl, struct mm_struct *mm);
-int sgx_encl_lookup_backing(struct sgx_encl *encl, unsigned long page_index,
- struct sgx_backing *backing);
+const cpumask_t *sgx_encl_cpumask(struct sgx_encl *encl);
int sgx_encl_alloc_backing(struct sgx_encl *encl, unsigned long page_index,
struct sgx_backing *backing);
void sgx_encl_put_backing(struct sgx_backing *backing);
int sgx_encl_test_and_clear_young(struct mm_struct *mm,
struct sgx_encl_page *page);
-
-struct sgx_epc_page *sgx_alloc_va_page(void);
+struct sgx_encl_page *sgx_encl_page_alloc(struct sgx_encl *encl,
+ unsigned long offset,
+ u64 secinfo_flags);
+void sgx_zap_enclave_ptes(struct sgx_encl *encl, unsigned long addr);
+struct sgx_epc_page *sgx_alloc_va_page(bool reclaim);
unsigned int sgx_alloc_va_slot(struct sgx_va_page *va_page);
void sgx_free_va_slot(struct sgx_va_page *va_page, unsigned int offset);
bool sgx_va_page_full(struct sgx_va_page *va_page);
void sgx_encl_free_epc_page(struct sgx_epc_page *page);
+struct sgx_encl_page *sgx_encl_load_page(struct sgx_encl *encl,
+ unsigned long addr);
+struct sgx_va_page *sgx_encl_grow(struct sgx_encl *encl, bool reclaim);
+void sgx_encl_shrink(struct sgx_encl *encl, struct sgx_va_page *va_page);
#endif /* _X86_ENCL_H */
diff --git a/arch/x86/kernel/cpu/sgx/encls.h b/arch/x86/kernel/cpu/sgx/encls.h
index fa04a73daf9c..99004b02e2ed 100644
--- a/arch/x86/kernel/cpu/sgx/encls.h
+++ b/arch/x86/kernel/cpu/sgx/encls.h
@@ -136,57 +136,71 @@ static inline bool encls_failed(int ret)
ret; \
})
+/* Initialize an EPC page into an SGX Enclave Control Structure (SECS) page. */
static inline int __ecreate(struct sgx_pageinfo *pginfo, void *secs)
{
return __encls_2(ECREATE, pginfo, secs);
}
+/* Hash a 256 byte region of an enclave page to SECS:MRENCLAVE. */
static inline int __eextend(void *secs, void *addr)
{
return __encls_2(EEXTEND, secs, addr);
}
+/*
+ * Associate an EPC page to an enclave either as a REG or TCS page
+ * populated with the provided data.
+ */
static inline int __eadd(struct sgx_pageinfo *pginfo, void *addr)
{
return __encls_2(EADD, pginfo, addr);
}
+/* Finalize enclave build, initialize enclave for user code execution. */
static inline int __einit(void *sigstruct, void *token, void *secs)
{
return __encls_ret_3(EINIT, sigstruct, secs, token);
}
+/* Disassociate EPC page from its enclave and mark it as unused. */
static inline int __eremove(void *addr)
{
return __encls_ret_1(EREMOVE, addr);
}
+/* Copy data to an EPC page belonging to a debug enclave. */
static inline int __edbgwr(void *addr, unsigned long *data)
{
return __encls_2(EDGBWR, *data, addr);
}
+/* Copy data from an EPC page belonging to a debug enclave. */
static inline int __edbgrd(void *addr, unsigned long *data)
{
return __encls_1_1(EDGBRD, *data, addr);
}
+/* Track that software has completed the required TLB address clears. */
static inline int __etrack(void *addr)
{
return __encls_ret_1(ETRACK, addr);
}
+/* Load, verify, and unblock an EPC page. */
static inline int __eldu(struct sgx_pageinfo *pginfo, void *addr,
void *va)
{
return __encls_ret_3(ELDU, pginfo, addr, va);
}
+/* Make EPC page inaccessible to enclave, ready to be written to memory. */
static inline int __eblock(void *addr)
{
return __encls_ret_1(EBLOCK, addr);
}
+/* Initialize an EPC page into a Version Array (VA) page. */
static inline int __epa(void *addr)
{
unsigned long rbx = SGX_PAGE_TYPE_VA;
@@ -194,10 +208,29 @@ static inline int __epa(void *addr)
return __encls_2(EPA, rbx, addr);
}
+/* Invalidate an EPC page and write it out to main memory. */
static inline int __ewb(struct sgx_pageinfo *pginfo, void *addr,
void *va)
{
return __encls_ret_3(EWB, pginfo, addr, va);
}
+/* Restrict the EPCM permissions of an EPC page. */
+static inline int __emodpr(struct sgx_secinfo *secinfo, void *addr)
+{
+ return __encls_ret_2(EMODPR, secinfo, addr);
+}
+
+/* Change the type of an EPC page. */
+static inline int __emodt(struct sgx_secinfo *secinfo, void *addr)
+{
+ return __encls_ret_2(EMODT, secinfo, addr);
+}
+
+/* Zero a page of EPC memory and add it to an initialized enclave. */
+static inline int __eaug(struct sgx_pageinfo *pginfo, void *addr)
+{
+ return __encls_2(EAUG, pginfo, addr);
+}
+
#endif /* _X86_ENCLS_H */
diff --git a/arch/x86/kernel/cpu/sgx/ioctl.c b/arch/x86/kernel/cpu/sgx/ioctl.c
index 83df20e3e633..66f1efa16fbb 100644
--- a/arch/x86/kernel/cpu/sgx/ioctl.c
+++ b/arch/x86/kernel/cpu/sgx/ioctl.c
@@ -3,6 +3,7 @@
#include <asm/mman.h>
#include <asm/sgx.h>
+#include <crypto/sha2.h>
#include <linux/mman.h>
#include <linux/delay.h>
#include <linux/file.h>
@@ -17,7 +18,7 @@
#include "encl.h"
#include "encls.h"
-static struct sgx_va_page *sgx_encl_grow(struct sgx_encl *encl)
+struct sgx_va_page *sgx_encl_grow(struct sgx_encl *encl, bool reclaim)
{
struct sgx_va_page *va_page = NULL;
void *err;
@@ -30,7 +31,7 @@ static struct sgx_va_page *sgx_encl_grow(struct sgx_encl *encl)
if (!va_page)
return ERR_PTR(-ENOMEM);
- va_page->epc_page = sgx_alloc_va_page();
+ va_page->epc_page = sgx_alloc_va_page(reclaim);
if (IS_ERR(va_page->epc_page)) {
err = ERR_CAST(va_page->epc_page);
kfree(va_page);
@@ -43,7 +44,7 @@ static struct sgx_va_page *sgx_encl_grow(struct sgx_encl *encl)
return va_page;
}
-static void sgx_encl_shrink(struct sgx_encl *encl, struct sgx_va_page *va_page)
+void sgx_encl_shrink(struct sgx_encl *encl, struct sgx_va_page *va_page)
{
encl->page_cnt--;
@@ -64,7 +65,14 @@ static int sgx_encl_create(struct sgx_encl *encl, struct sgx_secs *secs)
struct file *backing;
long ret;
- va_page = sgx_encl_grow(encl);
+ /*
+ * ECREATE would detect this too, but checking here also ensures
+ * that the 'encl_size' calculations below can never overflow.
+ */
+ if (!is_power_of_2(secs->size))
+ return -EINVAL;
+
+ va_page = sgx_encl_grow(encl, true);
if (IS_ERR(va_page))
return PTR_ERR(va_page);
else if (va_page)
@@ -107,10 +115,11 @@ static int sgx_encl_create(struct sgx_encl *encl, struct sgx_secs *secs)
set_bit(SGX_ENCL_DEBUG, &encl->flags);
encl->secs.encl = encl;
+ encl->secs.type = SGX_PAGE_TYPE_SECS;
encl->base = secs->base;
encl->size = secs->size;
encl->attributes = secs->attributes;
- encl->attributes_mask = SGX_ATTR_DEBUG | SGX_ATTR_MODE64BIT | SGX_ATTR_KSS;
+ encl->attributes_mask = SGX_ATTR_UNPRIV_MASK;
/* Set only after completion, as encl->lock has not been taken. */
set_bit(SGX_ENCL_CREATED, &encl->flags);
@@ -168,38 +177,6 @@ static long sgx_ioc_enclave_create(struct sgx_encl *encl, void __user *arg)
return ret;
}
-static struct sgx_encl_page *sgx_encl_page_alloc(struct sgx_encl *encl,
- unsigned long offset,
- u64 secinfo_flags)
-{
- struct sgx_encl_page *encl_page;
- unsigned long prot;
-
- encl_page = kzalloc(sizeof(*encl_page), GFP_KERNEL);
- if (!encl_page)
- return ERR_PTR(-ENOMEM);
-
- encl_page->desc = encl->base + offset;
- encl_page->encl = encl;
-
- prot = _calc_vm_trans(secinfo_flags, SGX_SECINFO_R, PROT_READ) |
- _calc_vm_trans(secinfo_flags, SGX_SECINFO_W, PROT_WRITE) |
- _calc_vm_trans(secinfo_flags, SGX_SECINFO_X, PROT_EXEC);
-
- /*
- * TCS pages must always RW set for CPU access while the SECINFO
- * permissions are *always* zero - the CPU ignores the user provided
- * values and silently overwrites them with zero permissions.
- */
- if ((secinfo_flags & SGX_SECINFO_PAGE_TYPE_MASK) == SGX_SECINFO_TCS)
- prot |= PROT_READ | PROT_WRITE;
-
- /* Calculate maximum of the VM flags for the page. */
- encl_page->vm_max_prot_bits = calc_vm_prot_bits(prot, 0);
-
- return encl_page;
-}
-
static int sgx_validate_secinfo(struct sgx_secinfo *secinfo)
{
u64 perm = secinfo->flags & SGX_SECINFO_PERMISSION_MASK;
@@ -245,18 +222,18 @@ static int __sgx_encl_add_page(struct sgx_encl *encl,
if (!(vma->vm_flags & VM_MAYEXEC))
return -EACCES;
- ret = get_user_pages(src, 1, 0, &src_page, NULL);
+ ret = get_user_pages(src, 1, 0, &src_page);
if (ret < 1)
return -EFAULT;
pginfo.secs = (unsigned long)sgx_get_epc_virt_addr(encl->secs.epc_page);
pginfo.addr = encl_page->desc & PAGE_MASK;
pginfo.metadata = (unsigned long)secinfo;
- pginfo.contents = (unsigned long)kmap_atomic(src_page);
+ pginfo.contents = (unsigned long)kmap_local_page(src_page);
ret = __eadd(&pginfo, sgx_get_epc_virt_addr(epc_page));
- kunmap_atomic((void *)pginfo.contents);
+ kunmap_local((void *)pginfo.contents);
put_page(src_page);
return ret ? -EIO : 0;
@@ -306,7 +283,7 @@ static int sgx_encl_add_page(struct sgx_encl *encl, unsigned long src,
return PTR_ERR(epc_page);
}
- va_page = sgx_encl_grow(encl);
+ va_page = sgx_encl_grow(encl, true);
if (IS_ERR(va_page)) {
ret = PTR_ERR(va_page);
goto err_out_free;
@@ -344,6 +321,7 @@ static int sgx_encl_add_page(struct sgx_encl *encl, unsigned long src,
*/
encl_page->encl = encl;
encl_page->epc_page = epc_page;
+ encl_page->type = (secinfo->flags & SGX_SECINFO_PAGE_TYPE_MASK) >> 8;
encl->secs_child_cnt++;
if (flags & SGX_PAGE_MEASURE) {
@@ -372,6 +350,29 @@ err_out_free:
return ret;
}
+/*
+ * Ensure user provided offset and length values are valid for
+ * an enclave.
+ */
+static int sgx_validate_offset_length(struct sgx_encl *encl,
+ unsigned long offset,
+ unsigned long length)
+{
+ if (!IS_ALIGNED(offset, PAGE_SIZE))
+ return -EINVAL;
+
+ if (!length || !IS_ALIGNED(length, PAGE_SIZE))
+ return -EINVAL;
+
+ if (offset + length < offset)
+ return -EINVAL;
+
+ if (offset + length - PAGE_SIZE >= encl->size)
+ return -EINVAL;
+
+ return 0;
+}
+
/**
* sgx_ioc_enclave_add_pages() - The handler for %SGX_IOC_ENCLAVE_ADD_PAGES
* @encl: an enclave pointer
@@ -425,14 +426,10 @@ static long sgx_ioc_enclave_add_pages(struct sgx_encl *encl, void __user *arg)
if (copy_from_user(&add_arg, arg, sizeof(add_arg)))
return -EFAULT;
- if (!IS_ALIGNED(add_arg.offset, PAGE_SIZE) ||
- !IS_ALIGNED(add_arg.src, PAGE_SIZE))
- return -EINVAL;
-
- if (!add_arg.length || add_arg.length & (PAGE_SIZE - 1))
+ if (!IS_ALIGNED(add_arg.src, PAGE_SIZE))
return -EINVAL;
- if (add_arg.offset + add_arg.length - PAGE_SIZE >= encl->size)
+ if (sgx_validate_offset_length(encl, add_arg.offset, add_arg.length))
return -EINVAL;
if (copy_from_user(&secinfo, (void __user *)add_arg.secinfo,
@@ -467,31 +464,6 @@ static long sgx_ioc_enclave_add_pages(struct sgx_encl *encl, void __user *arg)
return ret;
}
-static int __sgx_get_key_hash(struct crypto_shash *tfm, const void *modulus,
- void *hash)
-{
- SHASH_DESC_ON_STACK(shash, tfm);
-
- shash->tfm = tfm;
-
- return crypto_shash_digest(shash, modulus, SGX_MODULUS_SIZE, hash);
-}
-
-static int sgx_get_key_hash(const void *modulus, void *hash)
-{
- struct crypto_shash *tfm;
- int ret;
-
- tfm = crypto_alloc_shash("sha256", 0, CRYPTO_ALG_ASYNC);
- if (IS_ERR(tfm))
- return PTR_ERR(tfm);
-
- ret = __sgx_get_key_hash(tfm, modulus, hash);
-
- crypto_free_shash(tfm);
- return ret;
-}
-
static int sgx_encl_init(struct sgx_encl *encl, struct sgx_sigstruct *sigstruct,
void *token)
{
@@ -527,9 +499,7 @@ static int sgx_encl_init(struct sgx_encl *encl, struct sgx_sigstruct *sigstruct,
sgx_xfrm_reserved_mask)
return -EINVAL;
- ret = sgx_get_key_hash(sigstruct->modulus, mrsigner);
- if (ret)
- return ret;
+ sha256(sigstruct->modulus, SGX_MODULUS_SIZE, (u8 *)mrsigner);
mutex_lock(&encl->lock);
@@ -592,7 +562,7 @@ err_out:
*
* Flush any outstanding enqueued EADD operations and perform EINIT. The
* Launch Enclave Public Key Hash MSRs are rewritten as necessary to match
- * the enclave's MRSIGNER, which is caculated from the provided sigstruct.
+ * the enclave's MRSIGNER, which is calculated from the provided sigstruct.
*
* Return:
* - 0: Success.
@@ -674,6 +644,565 @@ static long sgx_ioc_enclave_provision(struct sgx_encl *encl, void __user *arg)
return sgx_set_attribute(&encl->attributes_mask, params.fd);
}
+/*
+ * Ensure enclave is ready for SGX2 functions. Readiness is checked
+ * by ensuring the hardware supports SGX2 and the enclave is initialized
+ * and thus able to handle requests to modify pages within it.
+ */
+static int sgx_ioc_sgx2_ready(struct sgx_encl *encl)
+{
+ if (!(cpu_feature_enabled(X86_FEATURE_SGX2)))
+ return -ENODEV;
+
+ if (!test_bit(SGX_ENCL_INITIALIZED, &encl->flags))
+ return -EINVAL;
+
+ return 0;
+}
+
+/*
+ * Some SGX functions require that no cached linear-to-physical address
+ * mappings are present before they can succeed. Collaborate with
+ * hardware via ENCLS[ETRACK] to ensure that all cached
+ * linear-to-physical address mappings belonging to all threads of
+ * the enclave are cleared. See sgx_encl_cpumask() for details.
+ *
+ * Must be called with enclave's mutex held from the time the
+ * SGX function requiring that no cached linear-to-physical mappings
+ * are present is executed until this ETRACK flow is complete.
+ */
+static int sgx_enclave_etrack(struct sgx_encl *encl)
+{
+ void *epc_virt;
+ int ret;
+
+ epc_virt = sgx_get_epc_virt_addr(encl->secs.epc_page);
+ ret = __etrack(epc_virt);
+ if (ret) {
+ /*
+ * ETRACK only fails when there is an OS issue. For
+ * example, two consecutive ETRACK was sent without
+ * completed IPI between.
+ */
+ pr_err_once("ETRACK returned %d (0x%x)", ret, ret);
+ /*
+ * Send IPIs to kick CPUs out of the enclave and
+ * try ETRACK again.
+ */
+ on_each_cpu_mask(sgx_encl_cpumask(encl), sgx_ipi_cb, NULL, 1);
+ ret = __etrack(epc_virt);
+ if (ret) {
+ pr_err_once("ETRACK repeat returned %d (0x%x)",
+ ret, ret);
+ return -EFAULT;
+ }
+ }
+ on_each_cpu_mask(sgx_encl_cpumask(encl), sgx_ipi_cb, NULL, 1);
+
+ return 0;
+}
+
+/**
+ * sgx_enclave_restrict_permissions() - Restrict EPCM permissions
+ * @encl: Enclave to which the pages belong.
+ * @modp: Checked parameters from user on which pages need modifying and
+ * their new permissions.
+ *
+ * Return:
+ * - 0: Success.
+ * - -errno: Otherwise.
+ */
+static long
+sgx_enclave_restrict_permissions(struct sgx_encl *encl,
+ struct sgx_enclave_restrict_permissions *modp)
+{
+ struct sgx_encl_page *entry;
+ struct sgx_secinfo secinfo;
+ unsigned long addr;
+ unsigned long c;
+ void *epc_virt;
+ int ret;
+
+ memset(&secinfo, 0, sizeof(secinfo));
+ secinfo.flags = modp->permissions & SGX_SECINFO_PERMISSION_MASK;
+
+ for (c = 0 ; c < modp->length; c += PAGE_SIZE) {
+ addr = encl->base + modp->offset + c;
+
+ sgx_reclaim_direct();
+
+ mutex_lock(&encl->lock);
+
+ entry = sgx_encl_load_page(encl, addr);
+ if (IS_ERR(entry)) {
+ ret = PTR_ERR(entry) == -EBUSY ? -EAGAIN : -EFAULT;
+ goto out_unlock;
+ }
+
+ /*
+ * Changing EPCM permissions is only supported on regular
+ * SGX pages. Attempting this change on other pages will
+ * result in #PF.
+ */
+ if (entry->type != SGX_PAGE_TYPE_REG) {
+ ret = -EINVAL;
+ goto out_unlock;
+ }
+
+ /*
+ * Apart from ensuring that read-access remains, do not verify
+ * the permission bits requested. Kernel has no control over
+ * how EPCM permissions can be relaxed from within the enclave.
+ * ENCLS[EMODPR] can only remove existing EPCM permissions,
+ * attempting to set new permissions will be ignored by the
+ * hardware.
+ */
+
+ /* Change EPCM permissions. */
+ epc_virt = sgx_get_epc_virt_addr(entry->epc_page);
+ ret = __emodpr(&secinfo, epc_virt);
+ if (encls_faulted(ret)) {
+ /*
+ * All possible faults should be avoidable:
+ * parameters have been checked, will only change
+ * permissions of a regular page, and no concurrent
+ * SGX1/SGX2 ENCLS instructions since these
+ * are protected with mutex.
+ */
+ pr_err_once("EMODPR encountered exception %d\n",
+ ENCLS_TRAPNR(ret));
+ ret = -EFAULT;
+ goto out_unlock;
+ }
+ if (encls_failed(ret)) {
+ modp->result = ret;
+ ret = -EFAULT;
+ goto out_unlock;
+ }
+
+ ret = sgx_enclave_etrack(encl);
+ if (ret) {
+ ret = -EFAULT;
+ goto out_unlock;
+ }
+
+ mutex_unlock(&encl->lock);
+ }
+
+ ret = 0;
+ goto out;
+
+out_unlock:
+ mutex_unlock(&encl->lock);
+out:
+ modp->count = c;
+
+ return ret;
+}
+
+/**
+ * sgx_ioc_enclave_restrict_permissions() - handler for
+ * %SGX_IOC_ENCLAVE_RESTRICT_PERMISSIONS
+ * @encl: an enclave pointer
+ * @arg: userspace pointer to a &struct sgx_enclave_restrict_permissions
+ * instance
+ *
+ * SGX2 distinguishes between relaxing and restricting the enclave page
+ * permissions maintained by the hardware (EPCM permissions) of pages
+ * belonging to an initialized enclave (after SGX_IOC_ENCLAVE_INIT).
+ *
+ * EPCM permissions cannot be restricted from within the enclave, the enclave
+ * requires the kernel to run the privileged level 0 instructions ENCLS[EMODPR]
+ * and ENCLS[ETRACK]. An attempt to relax EPCM permissions with this call
+ * will be ignored by the hardware.
+ *
+ * Return:
+ * - 0: Success
+ * - -errno: Otherwise
+ */
+static long sgx_ioc_enclave_restrict_permissions(struct sgx_encl *encl,
+ void __user *arg)
+{
+ struct sgx_enclave_restrict_permissions params;
+ long ret;
+
+ ret = sgx_ioc_sgx2_ready(encl);
+ if (ret)
+ return ret;
+
+ if (copy_from_user(&params, arg, sizeof(params)))
+ return -EFAULT;
+
+ if (sgx_validate_offset_length(encl, params.offset, params.length))
+ return -EINVAL;
+
+ if (params.permissions & ~SGX_SECINFO_PERMISSION_MASK)
+ return -EINVAL;
+
+ /*
+ * Fail early if invalid permissions requested to prevent ENCLS[EMODPR]
+ * from faulting later when the CPU does the same check.
+ */
+ if ((params.permissions & SGX_SECINFO_W) &&
+ !(params.permissions & SGX_SECINFO_R))
+ return -EINVAL;
+
+ if (params.result || params.count)
+ return -EINVAL;
+
+ ret = sgx_enclave_restrict_permissions(encl, &params);
+
+ if (copy_to_user(arg, &params, sizeof(params)))
+ return -EFAULT;
+
+ return ret;
+}
+
+/**
+ * sgx_enclave_modify_types() - Modify type of SGX enclave pages
+ * @encl: Enclave to which the pages belong.
+ * @modt: Checked parameters from user about which pages need modifying
+ * and their new page type.
+ *
+ * Return:
+ * - 0: Success
+ * - -errno: Otherwise
+ */
+static long sgx_enclave_modify_types(struct sgx_encl *encl,
+ struct sgx_enclave_modify_types *modt)
+{
+ unsigned long max_prot_restore;
+ enum sgx_page_type page_type;
+ struct sgx_encl_page *entry;
+ struct sgx_secinfo secinfo;
+ unsigned long prot;
+ unsigned long addr;
+ unsigned long c;
+ void *epc_virt;
+ int ret;
+
+ page_type = modt->page_type & SGX_PAGE_TYPE_MASK;
+
+ /*
+ * The only new page types allowed by hardware are PT_TCS and PT_TRIM.
+ */
+ if (page_type != SGX_PAGE_TYPE_TCS && page_type != SGX_PAGE_TYPE_TRIM)
+ return -EINVAL;
+
+ memset(&secinfo, 0, sizeof(secinfo));
+
+ secinfo.flags = page_type << 8;
+
+ for (c = 0 ; c < modt->length; c += PAGE_SIZE) {
+ addr = encl->base + modt->offset + c;
+
+ sgx_reclaim_direct();
+
+ mutex_lock(&encl->lock);
+
+ entry = sgx_encl_load_page(encl, addr);
+ if (IS_ERR(entry)) {
+ ret = PTR_ERR(entry) == -EBUSY ? -EAGAIN : -EFAULT;
+ goto out_unlock;
+ }
+
+ /*
+ * Borrow the logic from the Intel SDM. Regular pages
+ * (SGX_PAGE_TYPE_REG) can change type to SGX_PAGE_TYPE_TCS
+ * or SGX_PAGE_TYPE_TRIM but TCS pages can only be trimmed.
+ * CET pages not supported yet.
+ */
+ if (!(entry->type == SGX_PAGE_TYPE_REG ||
+ (entry->type == SGX_PAGE_TYPE_TCS &&
+ page_type == SGX_PAGE_TYPE_TRIM))) {
+ ret = -EINVAL;
+ goto out_unlock;
+ }
+
+ max_prot_restore = entry->vm_max_prot_bits;
+
+ /*
+ * Once a regular page becomes a TCS page it cannot be
+ * changed back. So the maximum allowed protection reflects
+ * the TCS page that is always RW from kernel perspective but
+ * will be inaccessible from within enclave. Before doing
+ * so, do make sure that the new page type continues to
+ * respect the originally vetted page permissions.
+ */
+ if (entry->type == SGX_PAGE_TYPE_REG &&
+ page_type == SGX_PAGE_TYPE_TCS) {
+ if (~entry->vm_max_prot_bits & (VM_READ | VM_WRITE)) {
+ ret = -EPERM;
+ goto out_unlock;
+ }
+ prot = PROT_READ | PROT_WRITE;
+ entry->vm_max_prot_bits = calc_vm_prot_bits(prot, 0);
+
+ /*
+ * Prevent page from being reclaimed while mutex
+ * is released.
+ */
+ if (sgx_unmark_page_reclaimable(entry->epc_page)) {
+ ret = -EAGAIN;
+ goto out_entry_changed;
+ }
+
+ /*
+ * Do not keep encl->lock because of dependency on
+ * mmap_lock acquired in sgx_zap_enclave_ptes().
+ */
+ mutex_unlock(&encl->lock);
+
+ sgx_zap_enclave_ptes(encl, addr);
+
+ mutex_lock(&encl->lock);
+
+ sgx_mark_page_reclaimable(entry->epc_page);
+ }
+
+ /* Change EPC type */
+ epc_virt = sgx_get_epc_virt_addr(entry->epc_page);
+ ret = __emodt(&secinfo, epc_virt);
+ if (encls_faulted(ret)) {
+ /*
+ * All possible faults should be avoidable:
+ * parameters have been checked, will only change
+ * valid page types, and no concurrent
+ * SGX1/SGX2 ENCLS instructions since these are
+ * protected with mutex.
+ */
+ pr_err_once("EMODT encountered exception %d\n",
+ ENCLS_TRAPNR(ret));
+ ret = -EFAULT;
+ goto out_entry_changed;
+ }
+ if (encls_failed(ret)) {
+ modt->result = ret;
+ ret = -EFAULT;
+ goto out_entry_changed;
+ }
+
+ ret = sgx_enclave_etrack(encl);
+ if (ret) {
+ ret = -EFAULT;
+ goto out_unlock;
+ }
+
+ entry->type = page_type;
+
+ mutex_unlock(&encl->lock);
+ }
+
+ ret = 0;
+ goto out;
+
+out_entry_changed:
+ entry->vm_max_prot_bits = max_prot_restore;
+out_unlock:
+ mutex_unlock(&encl->lock);
+out:
+ modt->count = c;
+
+ return ret;
+}
+
+/**
+ * sgx_ioc_enclave_modify_types() - handler for %SGX_IOC_ENCLAVE_MODIFY_TYPES
+ * @encl: an enclave pointer
+ * @arg: userspace pointer to a &struct sgx_enclave_modify_types instance
+ *
+ * Ability to change the enclave page type supports the following use cases:
+ *
+ * * It is possible to add TCS pages to an enclave by changing the type of
+ * regular pages (%SGX_PAGE_TYPE_REG) to TCS (%SGX_PAGE_TYPE_TCS) pages.
+ * With this support the number of threads supported by an initialized
+ * enclave can be increased dynamically.
+ *
+ * * Regular or TCS pages can dynamically be removed from an initialized
+ * enclave by changing the page type to %SGX_PAGE_TYPE_TRIM. Changing the
+ * page type to %SGX_PAGE_TYPE_TRIM marks the page for removal with actual
+ * removal done by handler of %SGX_IOC_ENCLAVE_REMOVE_PAGES ioctl() called
+ * after ENCLU[EACCEPT] is run on %SGX_PAGE_TYPE_TRIM page from within the
+ * enclave.
+ *
+ * Return:
+ * - 0: Success
+ * - -errno: Otherwise
+ */
+static long sgx_ioc_enclave_modify_types(struct sgx_encl *encl,
+ void __user *arg)
+{
+ struct sgx_enclave_modify_types params;
+ long ret;
+
+ ret = sgx_ioc_sgx2_ready(encl);
+ if (ret)
+ return ret;
+
+ if (copy_from_user(&params, arg, sizeof(params)))
+ return -EFAULT;
+
+ if (sgx_validate_offset_length(encl, params.offset, params.length))
+ return -EINVAL;
+
+ if (params.page_type & ~SGX_PAGE_TYPE_MASK)
+ return -EINVAL;
+
+ if (params.result || params.count)
+ return -EINVAL;
+
+ ret = sgx_enclave_modify_types(encl, &params);
+
+ if (copy_to_user(arg, &params, sizeof(params)))
+ return -EFAULT;
+
+ return ret;
+}
+
+/**
+ * sgx_encl_remove_pages() - Remove trimmed pages from SGX enclave
+ * @encl: Enclave to which the pages belong
+ * @params: Checked parameters from user on which pages need to be removed
+ *
+ * Return:
+ * - 0: Success.
+ * - -errno: Otherwise.
+ */
+static long sgx_encl_remove_pages(struct sgx_encl *encl,
+ struct sgx_enclave_remove_pages *params)
+{
+ struct sgx_encl_page *entry;
+ struct sgx_secinfo secinfo;
+ unsigned long addr;
+ unsigned long c;
+ void *epc_virt;
+ int ret;
+
+ memset(&secinfo, 0, sizeof(secinfo));
+ secinfo.flags = SGX_SECINFO_R | SGX_SECINFO_W | SGX_SECINFO_X;
+
+ for (c = 0 ; c < params->length; c += PAGE_SIZE) {
+ addr = encl->base + params->offset + c;
+
+ sgx_reclaim_direct();
+
+ mutex_lock(&encl->lock);
+
+ entry = sgx_encl_load_page(encl, addr);
+ if (IS_ERR(entry)) {
+ ret = PTR_ERR(entry) == -EBUSY ? -EAGAIN : -EFAULT;
+ goto out_unlock;
+ }
+
+ if (entry->type != SGX_PAGE_TYPE_TRIM) {
+ ret = -EPERM;
+ goto out_unlock;
+ }
+
+ /*
+ * ENCLS[EMODPR] is a no-op instruction used to inform if
+ * ENCLU[EACCEPT] was run from within the enclave. If
+ * ENCLS[EMODPR] is run with RWX on a trimmed page that is
+ * not yet accepted then it will return
+ * %SGX_PAGE_NOT_MODIFIABLE, after the trimmed page is
+ * accepted the instruction will encounter a page fault.
+ */
+ epc_virt = sgx_get_epc_virt_addr(entry->epc_page);
+ ret = __emodpr(&secinfo, epc_virt);
+ if (!encls_faulted(ret) || ENCLS_TRAPNR(ret) != X86_TRAP_PF) {
+ ret = -EPERM;
+ goto out_unlock;
+ }
+
+ if (sgx_unmark_page_reclaimable(entry->epc_page)) {
+ ret = -EBUSY;
+ goto out_unlock;
+ }
+
+ /*
+ * Do not keep encl->lock because of dependency on
+ * mmap_lock acquired in sgx_zap_enclave_ptes().
+ */
+ mutex_unlock(&encl->lock);
+
+ sgx_zap_enclave_ptes(encl, addr);
+
+ mutex_lock(&encl->lock);
+
+ sgx_encl_free_epc_page(entry->epc_page);
+ encl->secs_child_cnt--;
+ entry->epc_page = NULL;
+ xa_erase(&encl->page_array, PFN_DOWN(entry->desc));
+ sgx_encl_shrink(encl, NULL);
+ kfree(entry);
+
+ mutex_unlock(&encl->lock);
+ }
+
+ ret = 0;
+ goto out;
+
+out_unlock:
+ mutex_unlock(&encl->lock);
+out:
+ params->count = c;
+
+ return ret;
+}
+
+/**
+ * sgx_ioc_enclave_remove_pages() - handler for %SGX_IOC_ENCLAVE_REMOVE_PAGES
+ * @encl: an enclave pointer
+ * @arg: userspace pointer to &struct sgx_enclave_remove_pages instance
+ *
+ * Final step of the flow removing pages from an initialized enclave. The
+ * complete flow is:
+ *
+ * 1) User changes the type of the pages to be removed to %SGX_PAGE_TYPE_TRIM
+ * using the %SGX_IOC_ENCLAVE_MODIFY_TYPES ioctl().
+ * 2) User approves the page removal by running ENCLU[EACCEPT] from within
+ * the enclave.
+ * 3) User initiates actual page removal using the
+ * %SGX_IOC_ENCLAVE_REMOVE_PAGES ioctl() that is handled here.
+ *
+ * First remove any page table entries pointing to the page and then proceed
+ * with the actual removal of the enclave page and data in support of it.
+ *
+ * VA pages are not affected by this removal. It is thus possible that the
+ * enclave may end up with more VA pages than needed to support all its
+ * pages.
+ *
+ * Return:
+ * - 0: Success
+ * - -errno: Otherwise
+ */
+static long sgx_ioc_enclave_remove_pages(struct sgx_encl *encl,
+ void __user *arg)
+{
+ struct sgx_enclave_remove_pages params;
+ long ret;
+
+ ret = sgx_ioc_sgx2_ready(encl);
+ if (ret)
+ return ret;
+
+ if (copy_from_user(&params, arg, sizeof(params)))
+ return -EFAULT;
+
+ if (sgx_validate_offset_length(encl, params.offset, params.length))
+ return -EINVAL;
+
+ if (params.count)
+ return -EINVAL;
+
+ ret = sgx_encl_remove_pages(encl, &params);
+
+ if (copy_to_user(arg, &params, sizeof(params)))
+ return -EFAULT;
+
+ return ret;
+}
+
long sgx_ioctl(struct file *filep, unsigned int cmd, unsigned long arg)
{
struct sgx_encl *encl = filep->private_data;
@@ -695,6 +1224,16 @@ long sgx_ioctl(struct file *filep, unsigned int cmd, unsigned long arg)
case SGX_IOC_ENCLAVE_PROVISION:
ret = sgx_ioc_enclave_provision(encl, (void __user *)arg);
break;
+ case SGX_IOC_ENCLAVE_RESTRICT_PERMISSIONS:
+ ret = sgx_ioc_enclave_restrict_permissions(encl,
+ (void __user *)arg);
+ break;
+ case SGX_IOC_ENCLAVE_MODIFY_TYPES:
+ ret = sgx_ioc_enclave_modify_types(encl, (void __user *)arg);
+ break;
+ case SGX_IOC_ENCLAVE_REMOVE_PAGES:
+ ret = sgx_ioc_enclave_remove_pages(encl, (void __user *)arg);
+ break;
default:
ret = -ENOIOCTLCMD;
break;
diff --git a/arch/x86/kernel/cpu/sgx/main.c b/arch/x86/kernel/cpu/sgx/main.c
index a78652d43e61..2de01b379aa3 100644
--- a/arch/x86/kernel/cpu/sgx/main.c
+++ b/arch/x86/kernel/cpu/sgx/main.c
@@ -13,6 +13,8 @@
#include <linux/sched/signal.h>
#include <linux/slab.h>
#include <linux/sysfs.h>
+#include <linux/vmalloc.h>
+#include <asm/msr.h>
#include <asm/sgx.h>
#include "driver.h"
#include "encl.h"
@@ -49,9 +51,13 @@ static LIST_HEAD(sgx_dirty_page_list);
* Reset post-kexec EPC pages to the uninitialized state. The pages are removed
* from the input list, and made available for the page allocator. SECS pages
* prepending their children in the input list are left intact.
+ *
+ * Return 0 when sanitization was successful or kthread was stopped, and the
+ * number of unsanitized pages otherwise.
*/
-static void __sgx_sanitize_pages(struct list_head *dirty_page_list)
+static unsigned long __sgx_sanitize_pages(struct list_head *dirty_page_list)
{
+ unsigned long left_dirty = 0;
struct sgx_epc_page *page;
LIST_HEAD(dirty);
int ret;
@@ -59,7 +65,7 @@ static void __sgx_sanitize_pages(struct list_head *dirty_page_list)
/* dirty_page_list is thread-local, no need for a lock: */
while (!list_empty(dirty_page_list)) {
if (kthread_should_stop())
- return;
+ return 0;
page = list_first_entry(dirty_page_list, struct sgx_epc_page, list);
@@ -92,12 +98,14 @@ static void __sgx_sanitize_pages(struct list_head *dirty_page_list)
} else {
/* The page is not yet clean - move to the dirty list. */
list_move_tail(&page->list, &dirty);
+ left_dirty++;
}
cond_resched();
}
list_splice(&dirty, dirty_page_list);
+ return left_dirty;
}
static bool sgx_reclaimer_age(struct sgx_epc_page *epc_page)
@@ -137,36 +145,9 @@ static void sgx_reclaimer_block(struct sgx_epc_page *epc_page)
struct sgx_encl_page *page = epc_page->owner;
unsigned long addr = page->desc & PAGE_MASK;
struct sgx_encl *encl = page->encl;
- unsigned long mm_list_version;
- struct sgx_encl_mm *encl_mm;
- struct vm_area_struct *vma;
- int idx, ret;
-
- do {
- mm_list_version = encl->mm_list_version;
-
- /* Pairs with smp_rmb() in sgx_encl_mm_add(). */
- smp_rmb();
-
- idx = srcu_read_lock(&encl->srcu);
-
- list_for_each_entry_rcu(encl_mm, &encl->mm_list, list) {
- if (!mmget_not_zero(encl_mm->mm))
- continue;
-
- mmap_read_lock(encl_mm->mm);
-
- ret = sgx_encl_find(encl_mm->mm, addr, &vma);
- if (!ret && encl == vma->vm_private_data)
- zap_vma_ptes(vma, addr, PAGE_SIZE);
-
- mmap_read_unlock(encl_mm->mm);
-
- mmput_async(encl_mm->mm);
- }
+ int ret;
- srcu_read_unlock(&encl->srcu, idx);
- } while (unlikely(encl->mm_list_version != mm_list_version));
+ sgx_zap_enclave_ptes(encl, addr);
mutex_lock(&encl->lock);
@@ -186,54 +167,25 @@ static int __sgx_encl_ewb(struct sgx_epc_page *epc_page, void *va_slot,
pginfo.addr = 0;
pginfo.secs = 0;
- pginfo.contents = (unsigned long)kmap_atomic(backing->contents);
- pginfo.metadata = (unsigned long)kmap_atomic(backing->pcmd) +
+ pginfo.contents = (unsigned long)kmap_local_page(backing->contents);
+ pginfo.metadata = (unsigned long)kmap_local_page(backing->pcmd) +
backing->pcmd_offset;
ret = __ewb(&pginfo, sgx_get_epc_virt_addr(epc_page), va_slot);
set_page_dirty(backing->pcmd);
set_page_dirty(backing->contents);
- kunmap_atomic((void *)(unsigned long)(pginfo.metadata -
+ kunmap_local((void *)(unsigned long)(pginfo.metadata -
backing->pcmd_offset));
- kunmap_atomic((void *)(unsigned long)pginfo.contents);
+ kunmap_local((void *)(unsigned long)pginfo.contents);
return ret;
}
-static void sgx_ipi_cb(void *info)
+void sgx_ipi_cb(void *info)
{
}
-static const cpumask_t *sgx_encl_ewb_cpumask(struct sgx_encl *encl)
-{
- cpumask_t *cpumask = &encl->cpumask;
- struct sgx_encl_mm *encl_mm;
- int idx;
-
- /*
- * Can race with sgx_encl_mm_add(), but ETRACK has already been
- * executed, which means that the CPUs running in the new mm will enter
- * into the enclave with a fresh epoch.
- */
- cpumask_clear(cpumask);
-
- idx = srcu_read_lock(&encl->srcu);
-
- list_for_each_entry_rcu(encl_mm, &encl->mm_list, list) {
- if (!mmget_not_zero(encl_mm->mm))
- continue;
-
- cpumask_or(cpumask, cpumask, mm_cpumask(encl_mm->mm));
-
- mmput_async(encl_mm->mm);
- }
-
- srcu_read_unlock(&encl->srcu, idx);
-
- return cpumask;
-}
-
/*
* Swap page to the regular memory transformed to the blocked state by using
* EBLOCK, which means that it can no longer be referenced (no new TLB entries).
@@ -280,7 +232,7 @@ static void sgx_encl_ewb(struct sgx_epc_page *epc_page,
* miss cpus that entered the enclave between
* generating the mask and incrementing epoch.
*/
- on_each_cpu_mask(sgx_encl_ewb_cpumask(encl),
+ on_each_cpu_mask(sgx_encl_cpumask(encl),
sgx_ipi_cb, NULL, 1);
ret = __sgx_encl_ewb(epc_page, va_slot, backing);
}
@@ -431,6 +383,17 @@ static bool sgx_should_reclaim(unsigned long watermark)
!list_empty(&sgx_active_page_list);
}
+/*
+ * sgx_reclaim_direct() should be called (without enclave's mutex held)
+ * in locations where SGX memory resources might be low and might be
+ * needed in order to make forward progress.
+ */
+void sgx_reclaim_direct(void)
+{
+ if (sgx_should_reclaim(SGX_NR_LOW_PAGES))
+ sgx_reclaim_pages();
+}
+
static int ksgxd(void *p)
{
set_freezable();
@@ -440,10 +403,7 @@ static int ksgxd(void *p)
* required for SECS pages, whose child pages blocked EREMOVE.
*/
__sgx_sanitize_pages(&sgx_dirty_page_list);
- __sgx_sanitize_pages(&sgx_dirty_page_list);
-
- /* sanity check: */
- WARN_ON(!list_empty(&sgx_dirty_page_list));
+ WARN_ON(__sgx_sanitize_pages(&sgx_dirty_page_list));
while (!kthread_should_stop()) {
if (try_to_freeze())
@@ -516,24 +476,25 @@ struct sgx_epc_page *__sgx_alloc_epc_page(void)
{
struct sgx_epc_page *page;
int nid_of_current = numa_node_id();
- int nid = nid_of_current;
-
- if (node_isset(nid_of_current, sgx_numa_mask)) {
- page = __sgx_alloc_epc_page_from_node(nid_of_current);
- if (page)
- return page;
- }
+ int nid_start, nid;
- /* Fall back to the non-local NUMA nodes: */
- while (true) {
- nid = next_node_in(nid, sgx_numa_mask);
- if (nid == nid_of_current)
- break;
+ /*
+ * Try local node first. If it doesn't have an EPC section,
+ * fall back to the non-local NUMA nodes.
+ */
+ if (node_isset(nid_of_current, sgx_numa_mask))
+ nid_start = nid_of_current;
+ else
+ nid_start = next_node_in(nid_of_current, sgx_numa_mask);
+ nid = nid_start;
+ do {
page = __sgx_alloc_epc_page_from_node(nid);
if (page)
return page;
- }
+
+ nid = next_node_in(nid, sgx_numa_mask);
+ } while (nid != nid_start);
return ERR_PTR(-ENOMEM);
}
@@ -670,7 +631,7 @@ static bool __init sgx_setup_epc_section(u64 phys_addr, u64 size,
if (!section->virt_addr)
return false;
- section->pages = vmalloc(nr_pages * sizeof(struct sgx_epc_page));
+ section->pages = vmalloc_array(nr_pages, sizeof(struct sgx_epc_page));
if (!section->pages) {
memunmap(section->virt_addr);
return false;
@@ -759,6 +720,8 @@ int arch_memory_failure(unsigned long pfn, int flags)
goto out;
}
+ sgx_unmark_page_reclaimable(page);
+
/*
* TBD: Add additional plumbing to enable pre-emptive
* action for asynchronous poison notification. Until
@@ -773,7 +736,7 @@ out:
return 0;
}
-/**
+/*
* A section metric is concatenated in a way that @low bits 12-31 define the
* bits 12-31 of the metric and @high bits 0-19 define the bits 32-51 of the
* metric.
@@ -888,6 +851,13 @@ static bool __init sgx_page_cache_init(void)
return false;
}
+ for_each_online_node(nid) {
+ if (!node_isset(nid, sgx_numa_mask) &&
+ node_state(nid, N_MEMORY) && node_state(nid, N_CPU))
+ pr_info("node%d has both CPUs and memory but doesn't have an EPC section\n",
+ nid);
+ }
+
return true;
}
@@ -904,7 +874,7 @@ void sgx_update_lepubkeyhash(u64 *lepubkeyhash)
WARN_ON_ONCE(preemptible());
for (i = 0; i < 4; i++)
- wrmsrl(MSR_IA32_SGXLEPUBKEYHASH0 + i, lepubkeyhash[i]);
+ wrmsrq(MSR_IA32_SGXLEPUBKEYHASH0 + i, lepubkeyhash[i]);
}
const struct file_operations sgx_provision_fops = {
@@ -934,20 +904,15 @@ static struct miscdevice sgx_dev_provision = {
int sgx_set_attribute(unsigned long *allowed_attributes,
unsigned int attribute_fd)
{
- struct file *file;
+ CLASS(fd, f)(attribute_fd);
- file = fget(attribute_fd);
- if (!file)
+ if (fd_empty(f))
return -EINVAL;
- if (file->f_op != &sgx_provision_fops) {
- fput(file);
+ if (fd_file(f)->f_op != &sgx_provision_fops)
return -EINVAL;
- }
*allowed_attributes |= SGX_ATTR_PROVISIONKEY;
-
- fput(file);
return 0;
}
EXPORT_SYMBOL_GPL(sgx_set_attribute);
diff --git a/arch/x86/kernel/cpu/sgx/sgx.h b/arch/x86/kernel/cpu/sgx/sgx.h
index 0f17def9fe6f..d2dad21259a8 100644
--- a/arch/x86/kernel/cpu/sgx/sgx.h
+++ b/arch/x86/kernel/cpu/sgx/sgx.h
@@ -15,7 +15,7 @@
#define EREMOVE_ERROR_MESSAGE \
"EREMOVE returned %d (0x%x) and an EPC page was leaked. SGX may become unusable. " \
- "Refer to Documentation/x86/sgx.rst for more information."
+ "Refer to Documentation/arch/x86/sgx.rst for more information."
#define SGX_MAX_EPC_SECTIONS 8
#define SGX_EEXTEND_BLOCK_SIZE 256
@@ -86,10 +86,13 @@ static inline void *sgx_get_epc_virt_addr(struct sgx_epc_page *page)
struct sgx_epc_page *__sgx_alloc_epc_page(void);
void sgx_free_epc_page(struct sgx_epc_page *page);
+void sgx_reclaim_direct(void);
void sgx_mark_page_reclaimable(struct sgx_epc_page *page);
int sgx_unmark_page_reclaimable(struct sgx_epc_page *page);
struct sgx_epc_page *sgx_alloc_epc_page(void *owner, bool reclaim);
+void sgx_ipi_cb(void *info);
+
#ifdef CONFIG_X86_SGX_KVM
int __init sgx_vepc_init(void);
#else
diff --git a/arch/x86/kernel/cpu/sgx/virt.c b/arch/x86/kernel/cpu/sgx/virt.c
index 6a77a14eee38..7aaa3652e31d 100644
--- a/arch/x86/kernel/cpu/sgx/virt.c
+++ b/arch/x86/kernel/cpu/sgx/virt.c
@@ -105,7 +105,7 @@ static int sgx_vepc_mmap(struct file *file, struct vm_area_struct *vma)
vma->vm_ops = &sgx_vepc_vm_ops;
/* Don't copy VMA in fork() */
- vma->vm_flags |= VM_PFNMAP | VM_IO | VM_DONTDUMP | VM_DONTCOPY;
+ vm_flags_set(vma, VM_PFNMAP | VM_IO | VM_DONTDUMP | VM_DONTCOPY);
vma->vm_private_data = vepc;
return 0;
@@ -204,6 +204,7 @@ static int sgx_vepc_release(struct inode *inode, struct file *file)
continue;
xa_erase(&vepc->page_array, index);
+ cond_resched();
}
/*
@@ -222,6 +223,7 @@ static int sgx_vepc_release(struct inode *inode, struct file *file)
list_add_tail(&epc_page->list, &secs_pages);
xa_erase(&vepc->page_array, index);
+ cond_resched();
}
/*
@@ -243,6 +245,7 @@ static int sgx_vepc_release(struct inode *inode, struct file *file)
if (sgx_vepc_free_page(epc_page))
list_add_tail(&epc_page->list, &secs_pages);
+ cond_resched();
}
if (!list_empty(&secs_pages))
diff --git a/arch/x86/kernel/cpu/topology.c b/arch/x86/kernel/cpu/topology.c
index 132a2de44d2f..e35ccdc84910 100644
--- a/arch/x86/kernel/cpu/topology.c
+++ b/arch/x86/kernel/cpu/topology.c
@@ -1,161 +1,572 @@
-// SPDX-License-Identifier: GPL-2.0
+// SPDX-License-Identifier: GPL-2.0-only
/*
- * Check for extended topology enumeration cpuid leaf 0xb and if it
- * exists, use it for populating initial_apicid and cpu topology
- * detection.
+ * CPU/APIC topology
+ *
+ * The APIC IDs describe the system topology in multiple domain levels.
+ * The CPUID topology parser provides the information which part of the
+ * APIC ID is associated to the individual levels:
+ *
+ * [PACKAGE][DIEGRP][DIE][TILE][MODULE][CORE][THREAD]
+ *
+ * The root space contains the package (socket) IDs.
+ *
+ * Not enumerated levels consume 0 bits space, but conceptually they are
+ * always represented. If e.g. only CORE and THREAD levels are enumerated
+ * then the DIE, MODULE and TILE have the same physical ID as the PACKAGE.
+ *
+ * If SMT is not supported, then the THREAD domain is still used. It then
+ * has the same physical ID as the CORE domain and is the only child of
+ * the core domain.
+ *
+ * This allows a unified view on the system independent of the enumerated
+ * domain levels without requiring any conditionals in the code.
*/
-
+#define pr_fmt(fmt) "CPU topo: " fmt
#include <linux/cpu.h>
+
+#include <xen/xen.h>
+
#include <asm/apic.h>
-#include <asm/memtype.h>
-#include <asm/processor.h>
+#include <asm/hypervisor.h>
+#include <asm/io_apic.h>
+#include <asm/mpspec.h>
+#include <asm/msr.h>
+#include <asm/smp.h>
#include "cpu.h"
-/* leaf 0xb SMT level */
-#define SMT_LEVEL 0
+/*
+ * Map cpu index to physical APIC ID
+ */
+DEFINE_EARLY_PER_CPU_READ_MOSTLY(u32, x86_cpu_to_apicid, BAD_APICID);
+DEFINE_EARLY_PER_CPU_READ_MOSTLY(u32, x86_cpu_to_acpiid, CPU_ACPIID_INVALID);
+EXPORT_EARLY_PER_CPU_SYMBOL(x86_cpu_to_apicid);
+EXPORT_EARLY_PER_CPU_SYMBOL(x86_cpu_to_acpiid);
-/* extended topology sub-leaf types */
-#define INVALID_TYPE 0
-#define SMT_TYPE 1
-#define CORE_TYPE 2
-#define DIE_TYPE 5
+/* Bitmap of physically present CPUs. */
+DECLARE_BITMAP(phys_cpu_present_map, MAX_LOCAL_APIC) __read_mostly;
-#define LEAFB_SUBTYPE(ecx) (((ecx) >> 8) & 0xff)
-#define BITS_SHIFT_NEXT_LEVEL(eax) ((eax) & 0x1f)
-#define LEVEL_MAX_SIBLINGS(ebx) ((ebx) & 0xffff)
+/* Used for CPU number allocation and parallel CPU bringup */
+u32 cpuid_to_apicid[] __ro_after_init = { [0 ... NR_CPUS - 1] = BAD_APICID, };
-unsigned int __max_die_per_package __read_mostly = 1;
-EXPORT_SYMBOL(__max_die_per_package);
+/* Bitmaps to mark registered APICs at each topology domain */
+static struct { DECLARE_BITMAP(map, MAX_LOCAL_APIC); } apic_maps[TOPO_MAX_DOMAIN] __ro_after_init;
-#ifdef CONFIG_SMP
/*
- * Check if given CPUID extended topology "leaf" is implemented
+ * Keep track of assigned, disabled and rejected CPUs. Present assigned
+ * with 1 as CPU #0 is reserved for the boot CPU.
*/
-static int check_extended_topology_leaf(int leaf)
-{
- unsigned int eax, ebx, ecx, edx;
+static struct {
+ unsigned int nr_assigned_cpus;
+ unsigned int nr_disabled_cpus;
+ unsigned int nr_rejected_cpus;
+ u32 boot_cpu_apic_id;
+ u32 real_bsp_apic_id;
+} topo_info __ro_after_init = {
+ .nr_assigned_cpus = 1,
+ .boot_cpu_apic_id = BAD_APICID,
+ .real_bsp_apic_id = BAD_APICID,
+};
- cpuid_count(leaf, SMT_LEVEL, &eax, &ebx, &ecx, &edx);
+#define domain_weight(_dom) bitmap_weight(apic_maps[_dom].map, MAX_LOCAL_APIC)
- if (ebx == 0 || (LEAFB_SUBTYPE(ecx) != SMT_TYPE))
- return -1;
+bool arch_match_cpu_phys_id(int cpu, u64 phys_id)
+{
+ return phys_id == (u64)cpuid_to_apicid[cpu];
+}
- return 0;
+#ifdef CONFIG_SMP
+static void cpu_mark_primary_thread(unsigned int cpu, unsigned int apicid)
+{
+ if (!(apicid & (__max_threads_per_core - 1)))
+ cpumask_set_cpu(cpu, &__cpu_primary_thread_mask);
}
+#else
+static inline void cpu_mark_primary_thread(unsigned int cpu, unsigned int apicid) { }
+#endif
+
/*
- * Return best CPUID Extended Topology Leaf supported
+ * Convert the APIC ID to a domain level ID by masking out the low bits
+ * below the domain level @dom.
*/
-static int detect_extended_topology_leaf(struct cpuinfo_x86 *c)
+static inline u32 topo_apicid(u32 apicid, enum x86_topology_domains dom)
{
- if (c->cpuid_level >= 0x1f) {
- if (check_extended_topology_leaf(0x1f) == 0)
- return 0x1f;
- }
+ if (dom == TOPO_SMT_DOMAIN)
+ return apicid;
+ return apicid & (UINT_MAX << x86_topo_system.dom_shifts[dom - 1]);
+}
+
+static int topo_lookup_cpuid(u32 apic_id)
+{
+ int i;
- if (c->cpuid_level >= 0xb) {
- if (check_extended_topology_leaf(0xb) == 0)
- return 0xb;
+ /* CPU# to APICID mapping is persistent once it is established */
+ for (i = 0; i < topo_info.nr_assigned_cpus; i++) {
+ if (cpuid_to_apicid[i] == apic_id)
+ return i;
}
+ return -ENODEV;
+}
- return -1;
+static __init int topo_get_cpunr(u32 apic_id)
+{
+ int cpu = topo_lookup_cpuid(apic_id);
+
+ if (cpu >= 0)
+ return cpu;
+
+ return topo_info.nr_assigned_cpus++;
}
+
+static void topo_set_cpuids(unsigned int cpu, u32 apic_id, u32 acpi_id)
+{
+#if defined(CONFIG_SMP) || defined(CONFIG_X86_64)
+ early_per_cpu(x86_cpu_to_apicid, cpu) = apic_id;
+ early_per_cpu(x86_cpu_to_acpiid, cpu) = acpi_id;
#endif
+ set_cpu_present(cpu, true);
+}
-int detect_extended_topology_early(struct cpuinfo_x86 *c)
+static __init bool check_for_real_bsp(u32 apic_id)
{
-#ifdef CONFIG_SMP
- unsigned int eax, ebx, ecx, edx;
- int leaf;
+ bool is_bsp = false, has_apic_base = boot_cpu_data.x86 >= 6;
+ u64 msr;
+
+ /*
+ * There is no real good way to detect whether this a kdump()
+ * kernel, but except on the Voyager SMP monstrosity which is not
+ * longer supported, the real BSP APIC ID is the first one which is
+ * enumerated by firmware. That allows to detect whether the boot
+ * CPU is the real BSP. If it is not, then do not register the APIC
+ * because sending INIT to the real BSP would reset the whole
+ * system.
+ *
+ * The first APIC ID which is enumerated by firmware is detectable
+ * because the boot CPU APIC ID is registered before that without
+ * invoking this code.
+ */
+ if (topo_info.real_bsp_apic_id != BAD_APICID)
+ return false;
+
+ /*
+ * Check whether the enumeration order is broken by evaluating the
+ * BSP bit in the APICBASE MSR. If the CPU does not have the
+ * APICBASE MSR then the BSP detection is not possible and the
+ * kernel must rely on the firmware enumeration order.
+ */
+ if (has_apic_base) {
+ rdmsrq(MSR_IA32_APICBASE, msr);
+ is_bsp = !!(msr & MSR_IA32_APICBASE_BSP);
+ }
+
+ if (apic_id == topo_info.boot_cpu_apic_id) {
+ /*
+ * If the boot CPU has the APIC BSP bit set then the
+ * firmware enumeration is agreeing. If the CPU does not
+ * have the APICBASE MSR then the only choice is to trust
+ * the enumeration order.
+ */
+ if (is_bsp || !has_apic_base) {
+ topo_info.real_bsp_apic_id = apic_id;
+ return false;
+ }
+ /*
+ * If the boot APIC is enumerated first, but the APICBASE
+ * MSR does not have the BSP bit set, then there is no way
+ * to discover the real BSP here. Assume a crash kernel and
+ * limit the number of CPUs to 1 as an INIT to the real BSP
+ * would reset the machine.
+ */
+ pr_warn("Enumerated BSP APIC %x is not marked in APICBASE MSR\n", apic_id);
+ pr_warn("Assuming crash kernel. Limiting to one CPU to prevent machine INIT\n");
+ set_nr_cpu_ids(1);
+ goto fwbug;
+ }
+
+ pr_warn("Boot CPU APIC ID not the first enumerated APIC ID: %x != %x\n",
+ topo_info.boot_cpu_apic_id, apic_id);
+
+ if (is_bsp) {
+ /*
+ * The boot CPU has the APIC BSP bit set. Use it and complain
+ * about the broken firmware enumeration.
+ */
+ topo_info.real_bsp_apic_id = topo_info.boot_cpu_apic_id;
+ goto fwbug;
+ }
- leaf = detect_extended_topology_leaf(c);
- if (leaf < 0)
- return -1;
+ pr_warn("Crash kernel detected. Disabling real BSP to prevent machine INIT\n");
- set_cpu_cap(c, X86_FEATURE_XTOPOLOGY);
+ topo_info.real_bsp_apic_id = apic_id;
+ return true;
+
+fwbug:
+ pr_warn(FW_BUG "APIC enumeration order not specification compliant\n");
+ return false;
+}
+
+static unsigned int topo_unit_count(u32 lvlid, enum x86_topology_domains at_level,
+ unsigned long *map)
+{
+ unsigned int id, end, cnt = 0;
+
+ /* Calculate the exclusive end */
+ end = lvlid + (1U << x86_topo_system.dom_shifts[at_level]);
+
+ /* Unfortunately there is no bitmap_weight_range() */
+ for (id = find_next_bit(map, end, lvlid); id < end; id = find_next_bit(map, end, ++id))
+ cnt++;
+ return cnt;
+}
+
+static __init void topo_register_apic(u32 apic_id, u32 acpi_id, bool present)
+{
+ int cpu, dom;
+
+ if (present) {
+ set_bit(apic_id, phys_cpu_present_map);
+
+ /*
+ * Double registration is valid in case of the boot CPU
+ * APIC because that is registered before the enumeration
+ * of the APICs via firmware parsers or VM guest
+ * mechanisms.
+ */
+ if (apic_id == topo_info.boot_cpu_apic_id)
+ cpu = 0;
+ else
+ cpu = topo_get_cpunr(apic_id);
+
+ cpuid_to_apicid[cpu] = apic_id;
+ topo_set_cpuids(cpu, apic_id, acpi_id);
+ } else {
+ u32 pkgid = topo_apicid(apic_id, TOPO_PKG_DOMAIN);
+
+ /*
+ * Check for present APICs in the same package when running
+ * on bare metal. Allow the bogosity in a guest.
+ */
+ if (hypervisor_is_type(X86_HYPER_NATIVE) &&
+ topo_unit_count(pkgid, TOPO_PKG_DOMAIN, phys_cpu_present_map)) {
+ pr_info_once("Ignoring hot-pluggable APIC ID %x in present package.\n",
+ apic_id);
+ topo_info.nr_rejected_cpus++;
+ return;
+ }
+
+ topo_info.nr_disabled_cpus++;
+ }
- cpuid_count(leaf, SMT_LEVEL, &eax, &ebx, &ecx, &edx);
/*
- * initial apic id, which also represents 32-bit extended x2apic id.
+ * Register present and possible CPUs in the domain
+ * maps. cpu_possible_map will be updated in
+ * topology_init_possible_cpus() after enumeration is done.
*/
- c->initial_apicid = edx;
- smp_num_siblings = LEVEL_MAX_SIBLINGS(ebx);
-#endif
- return 0;
+ for (dom = TOPO_SMT_DOMAIN; dom < TOPO_MAX_DOMAIN; dom++)
+ set_bit(topo_apicid(apic_id, dom), apic_maps[dom].map);
}
-/*
- * Check for extended topology enumeration cpuid leaf, and if it
- * exists, use it for populating initial_apicid and cpu topology
- * detection.
+/**
+ * topology_register_apic - Register an APIC in early topology maps
+ * @apic_id: The APIC ID to set up
+ * @acpi_id: The ACPI ID associated to the APIC
+ * @present: True if the corresponding CPU is present
*/
-int detect_extended_topology(struct cpuinfo_x86 *c)
+void __init topology_register_apic(u32 apic_id, u32 acpi_id, bool present)
{
-#ifdef CONFIG_SMP
- unsigned int eax, ebx, ecx, edx, sub_index;
- unsigned int ht_mask_width, core_plus_mask_width, die_plus_mask_width;
- unsigned int core_select_mask, core_level_siblings;
- unsigned int die_select_mask, die_level_siblings;
- bool die_level_present = false;
- int leaf;
+ if (apic_id >= MAX_LOCAL_APIC) {
+ pr_err_once("APIC ID %x exceeds kernel limit of: %x\n", apic_id, MAX_LOCAL_APIC - 1);
+ topo_info.nr_rejected_cpus++;
+ return;
+ }
+
+ if (check_for_real_bsp(apic_id)) {
+ topo_info.nr_rejected_cpus++;
+ return;
+ }
+
+ /* CPU numbers exhausted? */
+ if (apic_id != topo_info.boot_cpu_apic_id && topo_info.nr_assigned_cpus >= nr_cpu_ids) {
+ pr_warn_once("CPU limit of %d reached. Ignoring further CPUs\n", nr_cpu_ids);
+ topo_info.nr_rejected_cpus++;
+ return;
+ }
- leaf = detect_extended_topology_leaf(c);
- if (leaf < 0)
- return -1;
+ topo_register_apic(apic_id, acpi_id, present);
+}
+
+/**
+ * topology_register_boot_apic - Register the boot CPU APIC
+ * @apic_id: The APIC ID to set up
+ *
+ * Separate so CPU #0 can be assigned
+ */
+void __init topology_register_boot_apic(u32 apic_id)
+{
+ WARN_ON_ONCE(topo_info.boot_cpu_apic_id != BAD_APICID);
+
+ topo_info.boot_cpu_apic_id = apic_id;
+ topo_register_apic(apic_id, CPU_ACPIID_INVALID, true);
+}
+
+/**
+ * topology_get_logical_id - Retrieve the logical ID at a given topology domain level
+ * @apicid: The APIC ID for which to lookup the logical ID
+ * @at_level: The topology domain level to use
+ *
+ * @apicid must be a full APIC ID, not the normalized variant. It's valid to have
+ * all bits below the domain level specified by @at_level to be clear. So both
+ * real APIC IDs and backshifted normalized APIC IDs work correctly.
+ *
+ * Returns:
+ * - >= 0: The requested logical ID
+ * - -ERANGE: @apicid is out of range
+ * - -ENODEV: @apicid is not registered
+ */
+int topology_get_logical_id(u32 apicid, enum x86_topology_domains at_level)
+{
+ /* Remove the bits below @at_level to get the proper level ID of @apicid */
+ unsigned int lvlid = topo_apicid(apicid, at_level);
+
+ if (lvlid >= MAX_LOCAL_APIC)
+ return -ERANGE;
+ if (!test_bit(lvlid, apic_maps[at_level].map))
+ return -ENODEV;
+ /* Get the number of set bits before @lvlid. */
+ return bitmap_weight(apic_maps[at_level].map, lvlid);
+}
+EXPORT_SYMBOL_GPL(topology_get_logical_id);
+
+/**
+ * topology_unit_count - Retrieve the count of specified units at a given topology domain level
+ * @apicid: The APIC ID which specifies the search range
+ * @which_units: The domain level specifying the units to count
+ * @at_level: The domain level at which @which_units have to be counted
+ *
+ * This returns the number of possible units according to the enumerated
+ * information.
+ *
+ * E.g. topology_count_units(apicid, TOPO_CORE_DOMAIN, TOPO_PKG_DOMAIN)
+ * counts the number of possible cores in the package to which @apicid
+ * belongs.
+ *
+ * @at_level must obviously be greater than @which_level to produce useful
+ * results. If @at_level is equal to @which_units the result is
+ * unsurprisingly 1. If @at_level is less than @which_units the results
+ * is by definition undefined and the function returns 0.
+ */
+unsigned int topology_unit_count(u32 apicid, enum x86_topology_domains which_units,
+ enum x86_topology_domains at_level)
+{
+ /* Remove the bits below @at_level to get the proper level ID of @apicid */
+ unsigned int lvlid = topo_apicid(apicid, at_level);
+
+ if (lvlid >= MAX_LOCAL_APIC)
+ return 0;
+ if (!test_bit(lvlid, apic_maps[at_level].map))
+ return 0;
+ if (which_units > at_level)
+ return 0;
+ if (which_units == at_level)
+ return 1;
+ return topo_unit_count(lvlid, at_level, apic_maps[which_units].map);
+}
+
+#ifdef CONFIG_ACPI_HOTPLUG_CPU
+/**
+ * topology_hotplug_apic - Handle a physical hotplugged APIC after boot
+ * @apic_id: The APIC ID to set up
+ * @acpi_id: The ACPI ID associated to the APIC
+ */
+int topology_hotplug_apic(u32 apic_id, u32 acpi_id)
+{
+ int cpu;
+
+ if (apic_id >= MAX_LOCAL_APIC)
+ return -EINVAL;
+
+ /* Reject if the APIC ID was not registered during enumeration. */
+ if (!test_bit(apic_id, apic_maps[TOPO_SMT_DOMAIN].map))
+ return -ENODEV;
+
+ cpu = topo_lookup_cpuid(apic_id);
+ if (cpu < 0)
+ return -ENOSPC;
+
+ set_bit(apic_id, phys_cpu_present_map);
+ topo_set_cpuids(cpu, apic_id, acpi_id);
+ cpu_mark_primary_thread(cpu, apic_id);
+ return cpu;
+}
+
+/**
+ * topology_hotunplug_apic - Remove a physical hotplugged APIC after boot
+ * @cpu: The CPU number for which the APIC ID is removed
+ */
+void topology_hotunplug_apic(unsigned int cpu)
+{
+ u32 apic_id = cpuid_to_apicid[cpu];
+
+ if (apic_id == BAD_APICID)
+ return;
+
+ per_cpu(x86_cpu_to_apicid, cpu) = BAD_APICID;
+ clear_bit(apic_id, phys_cpu_present_map);
+ set_cpu_present(cpu, false);
+}
+#endif
+
+#ifdef CONFIG_X86_LOCAL_APIC
+static unsigned int max_possible_cpus __initdata = NR_CPUS;
+
+/**
+ * topology_apply_cmdline_limits_early - Apply topology command line limits early
+ *
+ * Ensure that command line limits are in effect before firmware parsing
+ * takes place.
+ */
+void __init topology_apply_cmdline_limits_early(void)
+{
+ unsigned int possible = nr_cpu_ids;
+
+ /* 'maxcpus=0' 'nosmp' 'nolapic' */
+ if (!setup_max_cpus || apic_is_disabled)
+ possible = 1;
+
+ /* 'possible_cpus=N' */
+ possible = min_t(unsigned int, max_possible_cpus, possible);
+ if (possible < nr_cpu_ids) {
+ pr_info("Limiting to %u possible CPUs\n", possible);
+ set_nr_cpu_ids(possible);
+ }
+}
+
+static __init bool restrict_to_up(void)
+{
+ if (!smp_found_config)
+ return true;
/*
- * Populate HT related information from sub-leaf level 0.
+ * XEN PV is special as it does not advertise the local APIC
+ * properly, but provides a fake topology for it so that the
+ * infrastructure works. So don't apply the restrictions vs. APIC
+ * here.
*/
- cpuid_count(leaf, SMT_LEVEL, &eax, &ebx, &ecx, &edx);
- c->initial_apicid = edx;
- core_level_siblings = smp_num_siblings = LEVEL_MAX_SIBLINGS(ebx);
- core_plus_mask_width = ht_mask_width = BITS_SHIFT_NEXT_LEVEL(eax);
- die_level_siblings = LEVEL_MAX_SIBLINGS(ebx);
- die_plus_mask_width = BITS_SHIFT_NEXT_LEVEL(eax);
+ if (xen_pv_domain())
+ return false;
+
+ return apic_is_disabled;
+}
- sub_index = 1;
- do {
- cpuid_count(leaf, sub_index, &eax, &ebx, &ecx, &edx);
+void __init topology_init_possible_cpus(void)
+{
+ unsigned int assigned = topo_info.nr_assigned_cpus;
+ unsigned int disabled = topo_info.nr_disabled_cpus;
+ unsigned int cnta, cntb, cpu, allowed = 1;
+ unsigned int total = assigned + disabled;
+ u32 apicid, firstid;
- /*
- * Check for the Core type in the implemented sub leaves.
- */
- if (LEAFB_SUBTYPE(ecx) == CORE_TYPE) {
- core_level_siblings = LEVEL_MAX_SIBLINGS(ebx);
- core_plus_mask_width = BITS_SHIFT_NEXT_LEVEL(eax);
- die_level_siblings = core_level_siblings;
- die_plus_mask_width = BITS_SHIFT_NEXT_LEVEL(eax);
- }
- if (LEAFB_SUBTYPE(ecx) == DIE_TYPE) {
- die_level_present = true;
- die_level_siblings = LEVEL_MAX_SIBLINGS(ebx);
- die_plus_mask_width = BITS_SHIFT_NEXT_LEVEL(eax);
+ /*
+ * If there was no APIC registered, then fake one so that the
+ * topology bitmap is populated. That ensures that the code below
+ * is valid and the various query interfaces can be used
+ * unconditionally. This does not affect the actual APIC code in
+ * any way because either the local APIC address has not been
+ * registered or the local APIC was disabled on the command line.
+ */
+ if (topo_info.boot_cpu_apic_id == BAD_APICID)
+ topology_register_boot_apic(0);
+
+ if (!restrict_to_up()) {
+ if (WARN_ON_ONCE(assigned > nr_cpu_ids)) {
+ disabled += assigned - nr_cpu_ids;
+ assigned = nr_cpu_ids;
}
+ allowed = min_t(unsigned int, total, nr_cpu_ids);
+ }
- sub_index++;
- } while (LEAFB_SUBTYPE(ecx) != INVALID_TYPE);
+ if (total > allowed)
+ pr_warn("%u possible CPUs exceed the limit of %u\n", total, allowed);
- core_select_mask = (~(-1 << core_plus_mask_width)) >> ht_mask_width;
- die_select_mask = (~(-1 << die_plus_mask_width)) >>
- core_plus_mask_width;
+ assigned = min_t(unsigned int, allowed, assigned);
+ disabled = allowed - assigned;
- c->cpu_core_id = apic->phys_pkg_id(c->initial_apicid,
- ht_mask_width) & core_select_mask;
+ topo_info.nr_assigned_cpus = assigned;
+ topo_info.nr_disabled_cpus = disabled;
- if (die_level_present) {
- c->cpu_die_id = apic->phys_pkg_id(c->initial_apicid,
- core_plus_mask_width) & die_select_mask;
- }
+ total_cpus = allowed;
+ set_nr_cpu_ids(allowed);
- c->phys_proc_id = apic->phys_pkg_id(c->initial_apicid,
- die_plus_mask_width);
+ cnta = domain_weight(TOPO_PKG_DOMAIN);
+ cntb = domain_weight(TOPO_DIE_DOMAIN);
+ __max_logical_packages = cnta;
+ __max_dies_per_package = 1U << (get_count_order(cntb) - get_count_order(cnta));
+
+ pr_info("Max. logical packages: %3u\n", cnta);
+ pr_info("Max. logical dies: %3u\n", cntb);
+ pr_info("Max. dies per package: %3u\n", __max_dies_per_package);
+
+ cnta = domain_weight(TOPO_CORE_DOMAIN);
+ cntb = domain_weight(TOPO_SMT_DOMAIN);
/*
- * Reinit the apicid, now that we have extended initial_apicid.
+ * Can't use order delta here as order(cnta) can be equal
+ * order(cntb) even if cnta != cntb.
*/
- c->apicid = apic->phys_pkg_id(c->initial_apicid, 0);
+ __max_threads_per_core = DIV_ROUND_UP(cntb, cnta);
+ pr_info("Max. threads per core: %3u\n", __max_threads_per_core);
- c->x86_max_cores = (core_level_siblings / smp_num_siblings);
- __max_die_per_package = (die_level_siblings / core_level_siblings);
-#endif
+ firstid = find_first_bit(apic_maps[TOPO_SMT_DOMAIN].map, MAX_LOCAL_APIC);
+ __num_cores_per_package = topology_unit_count(firstid, TOPO_CORE_DOMAIN, TOPO_PKG_DOMAIN);
+ pr_info("Num. cores per package: %3u\n", __num_cores_per_package);
+ __num_threads_per_package = topology_unit_count(firstid, TOPO_SMT_DOMAIN, TOPO_PKG_DOMAIN);
+ pr_info("Num. threads per package: %3u\n", __num_threads_per_package);
+
+ pr_info("Allowing %u present CPUs plus %u hotplug CPUs\n", assigned, disabled);
+ if (topo_info.nr_rejected_cpus)
+ pr_info("Rejected CPUs %u\n", topo_info.nr_rejected_cpus);
+
+ init_cpu_present(cpumask_of(0));
+ init_cpu_possible(cpumask_of(0));
+
+ /* Assign CPU numbers to non-present CPUs */
+ for (apicid = 0; disabled; disabled--, apicid++) {
+ apicid = find_next_andnot_bit(apic_maps[TOPO_SMT_DOMAIN].map, phys_cpu_present_map,
+ MAX_LOCAL_APIC, apicid);
+ if (apicid >= MAX_LOCAL_APIC)
+ break;
+ cpuid_to_apicid[topo_info.nr_assigned_cpus++] = apicid;
+ }
+
+ for (cpu = 0; cpu < allowed; cpu++) {
+ apicid = cpuid_to_apicid[cpu];
+
+ set_cpu_possible(cpu, true);
+
+ if (apicid == BAD_APICID)
+ continue;
+
+ cpu_mark_primary_thread(cpu, apicid);
+ set_cpu_present(cpu, test_bit(apicid, phys_cpu_present_map));
+ }
+}
+
+/*
+ * Late SMP disable after sizing CPU masks when APIC/IOAPIC setup failed.
+ */
+void __init topology_reset_possible_cpus_up(void)
+{
+ init_cpu_present(cpumask_of(0));
+ init_cpu_possible(cpumask_of(0));
+
+ bitmap_zero(phys_cpu_present_map, MAX_LOCAL_APIC);
+ if (topo_info.boot_cpu_apic_id != BAD_APICID)
+ set_bit(topo_info.boot_cpu_apic_id, phys_cpu_present_map);
+}
+
+static int __init setup_possible_cpus(char *str)
+{
+ get_option(&str, &max_possible_cpus);
return 0;
}
+early_param("possible_cpus", setup_possible_cpus);
+#endif
diff --git a/arch/x86/kernel/cpu/topology.h b/arch/x86/kernel/cpu/topology.h
new file mode 100644
index 000000000000..37326297f80c
--- /dev/null
+++ b/arch/x86/kernel/cpu/topology.h
@@ -0,0 +1,67 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef ARCH_X86_TOPOLOGY_H
+#define ARCH_X86_TOPOLOGY_H
+
+struct topo_scan {
+ struct cpuinfo_x86 *c;
+ unsigned int dom_shifts[TOPO_MAX_DOMAIN];
+ unsigned int dom_ncpus[TOPO_MAX_DOMAIN];
+
+ /* Legacy CPUID[1]:EBX[23:16] number of logical processors */
+ unsigned int ebx1_nproc_shift;
+
+ /* AMD specific node ID which cannot be mapped into APIC space. */
+ u16 amd_nodes_per_pkg;
+ u16 amd_node_id;
+};
+
+void cpu_init_topology(struct cpuinfo_x86 *c);
+void cpu_parse_topology(struct cpuinfo_x86 *c);
+void topology_set_dom(struct topo_scan *tscan, enum x86_topology_domains dom,
+ unsigned int shift, unsigned int ncpus);
+bool cpu_parse_topology_ext(struct topo_scan *tscan);
+void cpu_parse_topology_amd(struct topo_scan *tscan);
+void cpu_topology_fixup_amd(struct topo_scan *tscan);
+
+static inline u32 topo_shift_apicid(u32 apicid, enum x86_topology_domains dom)
+{
+ if (dom == TOPO_SMT_DOMAIN)
+ return apicid;
+ return apicid >> x86_topo_system.dom_shifts[dom - 1];
+}
+
+static inline u32 topo_relative_domain_id(u32 apicid, enum x86_topology_domains dom)
+{
+ if (dom != TOPO_SMT_DOMAIN)
+ apicid >>= x86_topo_system.dom_shifts[dom - 1];
+ return apicid & (x86_topo_system.dom_size[dom] - 1);
+}
+
+static inline u32 topo_domain_mask(enum x86_topology_domains dom)
+{
+ return (1U << x86_topo_system.dom_shifts[dom]) - 1;
+}
+
+/*
+ * Update a domain level after the fact without propagating. Used to fixup
+ * broken CPUID enumerations.
+ */
+static inline void topology_update_dom(struct topo_scan *tscan, enum x86_topology_domains dom,
+ unsigned int shift, unsigned int ncpus)
+{
+ tscan->dom_shifts[dom] = shift;
+ tscan->dom_ncpus[dom] = ncpus;
+}
+
+#ifdef CONFIG_X86_LOCAL_APIC
+unsigned int topology_unit_count(u32 apicid, enum x86_topology_domains which_units,
+ enum x86_topology_domains at_level);
+#else
+static inline unsigned int topology_unit_count(u32 apicid, enum x86_topology_domains which_units,
+ enum x86_topology_domains at_level)
+{
+ return 1;
+}
+#endif
+
+#endif /* ARCH_X86_TOPOLOGY_H */
diff --git a/arch/x86/kernel/cpu/topology_amd.c b/arch/x86/kernel/cpu/topology_amd.c
new file mode 100644
index 000000000000..843b1655ab45
--- /dev/null
+++ b/arch/x86/kernel/cpu/topology_amd.c
@@ -0,0 +1,221 @@
+// SPDX-License-Identifier: GPL-2.0
+#include <linux/cpu.h>
+
+#include <asm/apic.h>
+#include <asm/memtype.h>
+#include <asm/msr.h>
+#include <asm/processor.h>
+
+#include "cpu.h"
+
+static bool parse_8000_0008(struct topo_scan *tscan)
+{
+ struct {
+ // ecx
+ u32 cpu_nthreads : 8, // Number of physical threads - 1
+ : 4, // Reserved
+ apicid_coreid_len : 4, // Number of thread core ID bits (shift) in APIC ID
+ perf_tsc_len : 2, // Performance time-stamp counter size
+ : 14; // Reserved
+ } ecx;
+ unsigned int sft;
+
+ if (tscan->c->extended_cpuid_level < 0x80000008)
+ return false;
+
+ cpuid_leaf_reg(0x80000008, CPUID_ECX, &ecx);
+
+ /* If the thread bits are 0, then get the shift value from ecx.cpu_nthreads */
+ sft = ecx.apicid_coreid_len;
+ if (!sft)
+ sft = get_count_order(ecx.cpu_nthreads + 1);
+
+ /*
+ * cpu_nthreads describes the number of threads in the package
+ * sft is the number of APIC ID bits per package
+ *
+ * As the number of actual threads per core is not described in
+ * this leaf, just set the CORE domain shift and let the later
+ * parsers set SMT shift. Assume one thread per core by default
+ * which is correct if there are no other CPUID leafs to parse.
+ */
+ topology_update_dom(tscan, TOPO_SMT_DOMAIN, 0, 1);
+ topology_set_dom(tscan, TOPO_CORE_DOMAIN, sft, ecx.cpu_nthreads + 1);
+ return true;
+}
+
+static void store_node(struct topo_scan *tscan, u16 nr_nodes, u16 node_id)
+{
+ /*
+ * Starting with Fam 17h the DIE domain could probably be used to
+ * retrieve the node info on AMD/HYGON. Analysis of CPUID dumps
+ * suggests it's the topmost bit(s) of the CPU cores area, but
+ * that's guess work and neither enumerated nor documented.
+ *
+ * Up to Fam 16h this does not work at all and the legacy node ID
+ * has to be used.
+ */
+ tscan->amd_nodes_per_pkg = nr_nodes;
+ tscan->amd_node_id = node_id;
+}
+
+static bool parse_8000_001e(struct topo_scan *tscan, bool has_topoext)
+{
+ struct {
+ // eax
+ u32 ext_apic_id : 32; // Extended APIC ID
+ // ebx
+ u32 core_id : 8, // Unique per-socket logical core unit ID
+ core_nthreads : 8, // #Threads per core (zero-based)
+ : 16; // Reserved
+ // ecx
+ u32 node_id : 8, // Node (die) ID of invoking logical CPU
+ nnodes_per_socket : 3, // #nodes in invoking logical CPU's package/socket
+ : 21; // Reserved
+ // edx
+ u32 : 32; // Reserved
+ } leaf;
+
+ if (!boot_cpu_has(X86_FEATURE_TOPOEXT))
+ return false;
+
+ cpuid_leaf(0x8000001e, &leaf);
+
+ tscan->c->topo.initial_apicid = leaf.ext_apic_id;
+
+ /*
+ * If leaf 0xb is available, then the domain shifts are set
+ * already and nothing to do here. Only valid for family >= 0x17.
+ */
+ if (!has_topoext && tscan->c->x86 >= 0x17) {
+ /*
+ * Leaf 0x80000008 set the CORE domain shift already.
+ * Update the SMT domain, but do not propagate it.
+ */
+ unsigned int nthreads = leaf.core_nthreads + 1;
+
+ topology_update_dom(tscan, TOPO_SMT_DOMAIN, get_count_order(nthreads), nthreads);
+ }
+
+ store_node(tscan, leaf.nnodes_per_socket + 1, leaf.node_id);
+
+ if (tscan->c->x86_vendor == X86_VENDOR_AMD) {
+ if (tscan->c->x86 == 0x15)
+ tscan->c->topo.cu_id = leaf.core_id;
+
+ cacheinfo_amd_init_llc_id(tscan->c, leaf.node_id);
+ } else {
+ /*
+ * Package ID is ApicId[6..] on certain Hygon CPUs. See
+ * commit e0ceeae708ce for explanation. The topology info
+ * is screwed up: The package shift is always 6 and the
+ * node ID is bit [4:5].
+ */
+ if (!boot_cpu_has(X86_FEATURE_HYPERVISOR) && tscan->c->x86_model <= 0x3) {
+ topology_set_dom(tscan, TOPO_CORE_DOMAIN, 6,
+ tscan->dom_ncpus[TOPO_CORE_DOMAIN]);
+ }
+ cacheinfo_hygon_init_llc_id(tscan->c);
+ }
+ return true;
+}
+
+static void parse_fam10h_node_id(struct topo_scan *tscan)
+{
+ union {
+ struct {
+ u64 node_id : 3,
+ nodes_per_pkg : 3,
+ unused : 58;
+ };
+ u64 msr;
+ } nid;
+
+ if (!boot_cpu_has(X86_FEATURE_NODEID_MSR))
+ return;
+
+ rdmsrq(MSR_FAM10H_NODE_ID, nid.msr);
+ store_node(tscan, nid.nodes_per_pkg + 1, nid.node_id);
+ tscan->c->topo.llc_id = nid.node_id;
+}
+
+static void legacy_set_llc(struct topo_scan *tscan)
+{
+ unsigned int apicid = tscan->c->topo.initial_apicid;
+
+ /* If none of the parsers set LLC ID then use the die ID for it. */
+ if (tscan->c->topo.llc_id == BAD_APICID)
+ tscan->c->topo.llc_id = apicid >> tscan->dom_shifts[TOPO_CORE_DOMAIN];
+}
+
+static void topoext_fixup(struct topo_scan *tscan)
+{
+ struct cpuinfo_x86 *c = tscan->c;
+ u64 msrval;
+
+ /* Try to re-enable TopologyExtensions if switched off by BIOS */
+ if (cpu_has(c, X86_FEATURE_TOPOEXT) || c->x86_vendor != X86_VENDOR_AMD ||
+ c->x86 != 0x15 || c->x86_model < 0x10 || c->x86_model > 0x6f)
+ return;
+
+ if (msr_set_bit(0xc0011005, 54) <= 0)
+ return;
+
+ rdmsrq(0xc0011005, msrval);
+ if (msrval & BIT_64(54)) {
+ set_cpu_cap(c, X86_FEATURE_TOPOEXT);
+ pr_info_once(FW_INFO "CPU: Re-enabling disabled Topology Extensions Support.\n");
+ }
+}
+
+static void parse_topology_amd(struct topo_scan *tscan)
+{
+ bool has_topoext = false;
+
+ /*
+ * If the extended topology leaf 0x8000_001e is available
+ * try to get SMT, CORE, TILE, and DIE shifts from extended
+ * CPUID leaf 0x8000_0026 on supported processors first. If
+ * extended CPUID leaf 0x8000_0026 is not supported, try to
+ * get SMT and CORE shift from leaf 0xb first, then try to
+ * get the CORE shift from leaf 0x8000_0008.
+ */
+ if (cpu_feature_enabled(X86_FEATURE_TOPOEXT))
+ has_topoext = cpu_parse_topology_ext(tscan);
+
+ if (cpu_feature_enabled(X86_FEATURE_AMD_HTR_CORES))
+ tscan->c->topo.cpu_type = cpuid_ebx(0x80000026);
+
+ if (!has_topoext && !parse_8000_0008(tscan))
+ return;
+
+ /* Prefer leaf 0x8000001e if available */
+ if (parse_8000_001e(tscan, has_topoext))
+ return;
+
+ /* Try the NODEID MSR */
+ parse_fam10h_node_id(tscan);
+}
+
+void cpu_parse_topology_amd(struct topo_scan *tscan)
+{
+ tscan->amd_nodes_per_pkg = 1;
+ topoext_fixup(tscan);
+ parse_topology_amd(tscan);
+ legacy_set_llc(tscan);
+
+ if (tscan->amd_nodes_per_pkg > 1)
+ set_cpu_cap(tscan->c, X86_FEATURE_AMD_DCM);
+}
+
+void cpu_topology_fixup_amd(struct topo_scan *tscan)
+{
+ struct cpuinfo_x86 *c = tscan->c;
+
+ /*
+ * Adjust the core_id relative to the node when there is more than
+ * one node.
+ */
+ if (tscan->c->x86 < 0x17 && tscan->amd_nodes_per_pkg > 1)
+ c->topo.core_id %= tscan->dom_ncpus[TOPO_CORE_DOMAIN] / tscan->amd_nodes_per_pkg;
+}
diff --git a/arch/x86/kernel/cpu/topology_common.c b/arch/x86/kernel/cpu/topology_common.c
new file mode 100644
index 000000000000..b5a5e1411469
--- /dev/null
+++ b/arch/x86/kernel/cpu/topology_common.c
@@ -0,0 +1,255 @@
+// SPDX-License-Identifier: GPL-2.0
+#include <linux/cpu.h>
+
+#include <xen/xen.h>
+
+#include <asm/intel-family.h>
+#include <asm/apic.h>
+#include <asm/processor.h>
+#include <asm/smp.h>
+
+#include "cpu.h"
+
+struct x86_topology_system x86_topo_system __ro_after_init;
+EXPORT_SYMBOL_GPL(x86_topo_system);
+
+unsigned int __amd_nodes_per_pkg __ro_after_init;
+EXPORT_SYMBOL_GPL(__amd_nodes_per_pkg);
+
+void topology_set_dom(struct topo_scan *tscan, enum x86_topology_domains dom,
+ unsigned int shift, unsigned int ncpus)
+{
+ topology_update_dom(tscan, dom, shift, ncpus);
+
+ /* Propagate to the upper levels */
+ for (dom++; dom < TOPO_MAX_DOMAIN; dom++) {
+ tscan->dom_shifts[dom] = tscan->dom_shifts[dom - 1];
+ tscan->dom_ncpus[dom] = tscan->dom_ncpus[dom - 1];
+ }
+}
+
+enum x86_topology_cpu_type get_topology_cpu_type(struct cpuinfo_x86 *c)
+{
+ if (c->x86_vendor == X86_VENDOR_INTEL) {
+ switch (c->topo.intel_type) {
+ case INTEL_CPU_TYPE_ATOM: return TOPO_CPU_TYPE_EFFICIENCY;
+ case INTEL_CPU_TYPE_CORE: return TOPO_CPU_TYPE_PERFORMANCE;
+ }
+ }
+ if (c->x86_vendor == X86_VENDOR_AMD) {
+ switch (c->topo.amd_type) {
+ case 0: return TOPO_CPU_TYPE_PERFORMANCE;
+ case 1: return TOPO_CPU_TYPE_EFFICIENCY;
+ }
+ }
+
+ return TOPO_CPU_TYPE_UNKNOWN;
+}
+
+const char *get_topology_cpu_type_name(struct cpuinfo_x86 *c)
+{
+ switch (get_topology_cpu_type(c)) {
+ case TOPO_CPU_TYPE_PERFORMANCE:
+ return "performance";
+ case TOPO_CPU_TYPE_EFFICIENCY:
+ return "efficiency";
+ default:
+ return "unknown";
+ }
+}
+
+static unsigned int __maybe_unused parse_num_cores_legacy(struct cpuinfo_x86 *c)
+{
+ struct {
+ u32 cache_type : 5,
+ unused : 21,
+ ncores : 6;
+ } eax;
+
+ if (c->cpuid_level < 4)
+ return 1;
+
+ cpuid_subleaf_reg(4, 0, CPUID_EAX, &eax);
+ if (!eax.cache_type)
+ return 1;
+
+ return eax.ncores + 1;
+}
+
+static void parse_legacy(struct topo_scan *tscan)
+{
+ unsigned int cores, core_shift, smt_shift = 0;
+ struct cpuinfo_x86 *c = tscan->c;
+
+ cores = parse_num_cores_legacy(c);
+ core_shift = get_count_order(cores);
+
+ if (cpu_has(c, X86_FEATURE_HT)) {
+ if (!WARN_ON_ONCE(tscan->ebx1_nproc_shift < core_shift))
+ smt_shift = tscan->ebx1_nproc_shift - core_shift;
+ /*
+ * The parser expects leaf 0xb/0x1f format, which means
+ * the number of logical processors at core level is
+ * counting threads.
+ */
+ core_shift += smt_shift;
+ cores <<= smt_shift;
+ }
+
+ topology_set_dom(tscan, TOPO_SMT_DOMAIN, smt_shift, 1U << smt_shift);
+ topology_set_dom(tscan, TOPO_CORE_DOMAIN, core_shift, cores);
+}
+
+static bool fake_topology(struct topo_scan *tscan)
+{
+ /*
+ * Preset the CORE level shift for CPUID less systems and XEN_PV,
+ * which has useless CPUID information.
+ */
+ topology_set_dom(tscan, TOPO_SMT_DOMAIN, 0, 1);
+ topology_set_dom(tscan, TOPO_CORE_DOMAIN, 0, 1);
+
+ return tscan->c->cpuid_level < 1;
+}
+
+static void parse_topology(struct topo_scan *tscan, bool early)
+{
+ const struct cpuinfo_topology topo_defaults = {
+ .cu_id = 0xff,
+ .llc_id = BAD_APICID,
+ .l2c_id = BAD_APICID,
+ .cpu_type = TOPO_CPU_TYPE_UNKNOWN,
+ };
+ struct cpuinfo_x86 *c = tscan->c;
+ struct {
+ u32 unused0 : 16,
+ nproc : 8,
+ apicid : 8;
+ } ebx;
+
+ c->topo = topo_defaults;
+
+ if (fake_topology(tscan))
+ return;
+
+ /* Preset Initial APIC ID from CPUID leaf 1 */
+ cpuid_leaf_reg(1, CPUID_EBX, &ebx);
+ c->topo.initial_apicid = ebx.apicid;
+
+ /*
+ * The initial invocation from early_identify_cpu() happens before
+ * the APIC is mapped or X2APIC enabled. For establishing the
+ * topology, that's not required. Use the initial APIC ID.
+ */
+ if (early)
+ c->topo.apicid = c->topo.initial_apicid;
+ else
+ c->topo.apicid = read_apic_id();
+
+ /* The above is sufficient for UP */
+ if (!IS_ENABLED(CONFIG_SMP))
+ return;
+
+ tscan->ebx1_nproc_shift = get_count_order(ebx.nproc);
+
+ switch (c->x86_vendor) {
+ case X86_VENDOR_AMD:
+ if (IS_ENABLED(CONFIG_CPU_SUP_AMD))
+ cpu_parse_topology_amd(tscan);
+ break;
+ case X86_VENDOR_CENTAUR:
+ case X86_VENDOR_ZHAOXIN:
+ parse_legacy(tscan);
+ break;
+ case X86_VENDOR_INTEL:
+ if (!IS_ENABLED(CONFIG_CPU_SUP_INTEL) || !cpu_parse_topology_ext(tscan))
+ parse_legacy(tscan);
+ if (c->cpuid_level >= 0x1a)
+ c->topo.cpu_type = cpuid_eax(0x1a);
+ break;
+ case X86_VENDOR_HYGON:
+ if (IS_ENABLED(CONFIG_CPU_SUP_HYGON))
+ cpu_parse_topology_amd(tscan);
+ break;
+ }
+}
+
+static void topo_set_ids(struct topo_scan *tscan, bool early)
+{
+ struct cpuinfo_x86 *c = tscan->c;
+ u32 apicid = c->topo.apicid;
+
+ c->topo.pkg_id = topo_shift_apicid(apicid, TOPO_PKG_DOMAIN);
+ c->topo.die_id = topo_shift_apicid(apicid, TOPO_DIE_DOMAIN);
+
+ if (!early) {
+ c->topo.logical_pkg_id = topology_get_logical_id(apicid, TOPO_PKG_DOMAIN);
+ c->topo.logical_die_id = topology_get_logical_id(apicid, TOPO_DIE_DOMAIN);
+ c->topo.logical_core_id = topology_get_logical_id(apicid, TOPO_CORE_DOMAIN);
+ }
+
+ /* Package relative core ID */
+ c->topo.core_id = (apicid & topo_domain_mask(TOPO_PKG_DOMAIN)) >>
+ x86_topo_system.dom_shifts[TOPO_SMT_DOMAIN];
+
+ c->topo.amd_node_id = tscan->amd_node_id;
+
+ if (c->x86_vendor == X86_VENDOR_AMD)
+ cpu_topology_fixup_amd(tscan);
+}
+
+void cpu_parse_topology(struct cpuinfo_x86 *c)
+{
+ unsigned int dom, cpu = smp_processor_id();
+ struct topo_scan tscan = { .c = c, };
+
+ parse_topology(&tscan, false);
+
+ if (IS_ENABLED(CONFIG_X86_LOCAL_APIC)) {
+ if (c->topo.initial_apicid != c->topo.apicid) {
+ pr_err(FW_BUG "CPU%4u: APIC ID mismatch. CPUID: 0x%04x APIC: 0x%04x\n",
+ cpu, c->topo.initial_apicid, c->topo.apicid);
+ }
+
+ if (c->topo.apicid != cpuid_to_apicid[cpu]) {
+ pr_err(FW_BUG "CPU%4u: APIC ID mismatch. Firmware: 0x%04x APIC: 0x%04x\n",
+ cpu, cpuid_to_apicid[cpu], c->topo.apicid);
+ }
+ }
+
+ for (dom = TOPO_SMT_DOMAIN; dom < TOPO_MAX_DOMAIN; dom++) {
+ if (tscan.dom_shifts[dom] == x86_topo_system.dom_shifts[dom])
+ continue;
+ pr_err(FW_BUG "CPU%d: Topology domain %u shift %u != %u\n", cpu, dom,
+ tscan.dom_shifts[dom], x86_topo_system.dom_shifts[dom]);
+ }
+
+ topo_set_ids(&tscan, false);
+}
+
+void __init cpu_init_topology(struct cpuinfo_x86 *c)
+{
+ struct topo_scan tscan = { .c = c, };
+ unsigned int dom, sft;
+
+ parse_topology(&tscan, true);
+
+ /* Copy the shift values and calculate the unit sizes. */
+ memcpy(x86_topo_system.dom_shifts, tscan.dom_shifts, sizeof(x86_topo_system.dom_shifts));
+
+ dom = TOPO_SMT_DOMAIN;
+ x86_topo_system.dom_size[dom] = 1U << x86_topo_system.dom_shifts[dom];
+
+ for (dom++; dom < TOPO_MAX_DOMAIN; dom++) {
+ sft = x86_topo_system.dom_shifts[dom] - x86_topo_system.dom_shifts[dom - 1];
+ x86_topo_system.dom_size[dom] = 1U << sft;
+ }
+
+ topo_set_ids(&tscan, true);
+
+ /*
+ * AMD systems have Nodes per package which cannot be mapped to
+ * APIC ID.
+ */
+ __amd_nodes_per_pkg = tscan.amd_nodes_per_pkg;
+}
diff --git a/arch/x86/kernel/cpu/topology_ext.c b/arch/x86/kernel/cpu/topology_ext.c
new file mode 100644
index 000000000000..467b0326bf1a
--- /dev/null
+++ b/arch/x86/kernel/cpu/topology_ext.c
@@ -0,0 +1,145 @@
+// SPDX-License-Identifier: GPL-2.0
+#include <linux/cpu.h>
+
+#include <asm/apic.h>
+#include <asm/memtype.h>
+#include <asm/processor.h>
+
+#include "cpu.h"
+
+enum topo_types {
+ INVALID_TYPE = 0,
+ SMT_TYPE = 1,
+ CORE_TYPE = 2,
+ MAX_TYPE_0B = 3,
+ MODULE_TYPE = 3,
+ AMD_CCD_TYPE = 3,
+ TILE_TYPE = 4,
+ AMD_SOCKET_TYPE = 4,
+ MAX_TYPE_80000026 = 5,
+ DIE_TYPE = 5,
+ DIEGRP_TYPE = 6,
+ MAX_TYPE_1F = 7,
+};
+
+/*
+ * Use a lookup table for the case that there are future types > 6 which
+ * describe an intermediate domain level which does not exist today.
+ */
+static const unsigned int topo_domain_map_0b_1f[MAX_TYPE_1F] = {
+ [SMT_TYPE] = TOPO_SMT_DOMAIN,
+ [CORE_TYPE] = TOPO_CORE_DOMAIN,
+ [MODULE_TYPE] = TOPO_MODULE_DOMAIN,
+ [TILE_TYPE] = TOPO_TILE_DOMAIN,
+ [DIE_TYPE] = TOPO_DIE_DOMAIN,
+ [DIEGRP_TYPE] = TOPO_DIEGRP_DOMAIN,
+};
+
+static const unsigned int topo_domain_map_80000026[MAX_TYPE_80000026] = {
+ [SMT_TYPE] = TOPO_SMT_DOMAIN,
+ [CORE_TYPE] = TOPO_CORE_DOMAIN,
+ [AMD_CCD_TYPE] = TOPO_TILE_DOMAIN,
+ [AMD_SOCKET_TYPE] = TOPO_DIE_DOMAIN,
+};
+
+static inline bool topo_subleaf(struct topo_scan *tscan, u32 leaf, u32 subleaf,
+ unsigned int *last_dom)
+{
+ unsigned int dom, maxtype;
+ const unsigned int *map;
+ struct {
+ // eax
+ u32 x2apic_shift : 5, // Number of bits to shift APIC ID right
+ // for the topology ID at the next level
+ : 27; // Reserved
+ // ebx
+ u32 num_processors : 16, // Number of processors at current level
+ : 16; // Reserved
+ // ecx
+ u32 level : 8, // Current topology level. Same as sub leaf number
+ type : 8, // Level type. If 0, invalid
+ : 16; // Reserved
+ // edx
+ u32 x2apic_id : 32; // X2APIC ID of the current logical processor
+ } sl;
+
+ switch (leaf) {
+ case 0x0b: maxtype = MAX_TYPE_0B; map = topo_domain_map_0b_1f; break;
+ case 0x1f: maxtype = MAX_TYPE_1F; map = topo_domain_map_0b_1f; break;
+ case 0x80000026: maxtype = MAX_TYPE_80000026; map = topo_domain_map_80000026; break;
+ default: return false;
+ }
+
+ cpuid_subleaf(leaf, subleaf, &sl);
+
+ if (!sl.num_processors || sl.type == INVALID_TYPE)
+ return false;
+
+ if (sl.type >= maxtype) {
+ pr_err_once("Topology: leaf 0x%x:%d Unknown domain type %u\n",
+ leaf, subleaf, sl.type);
+ /*
+ * It really would have been too obvious to make the domain
+ * type space sparse and leave a few reserved types between
+ * the points which might change instead of following the
+ * usual "this can be fixed in software" principle.
+ */
+ dom = *last_dom + 1;
+ } else {
+ dom = map[sl.type];
+ *last_dom = dom;
+ }
+
+ if (!dom) {
+ tscan->c->topo.initial_apicid = sl.x2apic_id;
+ } else if (tscan->c->topo.initial_apicid != sl.x2apic_id) {
+ pr_warn_once(FW_BUG "CPUID leaf 0x%x subleaf %d APIC ID mismatch %x != %x\n",
+ leaf, subleaf, tscan->c->topo.initial_apicid, sl.x2apic_id);
+ }
+
+ topology_set_dom(tscan, dom, sl.x2apic_shift, sl.num_processors);
+ return true;
+}
+
+static bool parse_topology_leaf(struct topo_scan *tscan, u32 leaf)
+{
+ unsigned int last_dom;
+ u32 subleaf;
+
+ /* Read all available subleafs and populate the levels */
+ for (subleaf = 0, last_dom = 0; topo_subleaf(tscan, leaf, subleaf, &last_dom); subleaf++);
+
+ /* If subleaf 0 failed to parse, give up */
+ if (!subleaf)
+ return false;
+
+ /*
+ * There are machines in the wild which have shift 0 in the subleaf
+ * 0, but advertise 2 logical processors at that level. They are
+ * truly SMT.
+ */
+ if (!tscan->dom_shifts[TOPO_SMT_DOMAIN] && tscan->dom_ncpus[TOPO_SMT_DOMAIN] > 1) {
+ unsigned int sft = get_count_order(tscan->dom_ncpus[TOPO_SMT_DOMAIN]);
+
+ pr_warn_once(FW_BUG "CPUID leaf 0x%x subleaf 0 has shift level 0 but %u CPUs. Fixing it up.\n",
+ leaf, tscan->dom_ncpus[TOPO_SMT_DOMAIN]);
+ topology_update_dom(tscan, TOPO_SMT_DOMAIN, sft, tscan->dom_ncpus[TOPO_SMT_DOMAIN]);
+ }
+
+ set_cpu_cap(tscan->c, X86_FEATURE_XTOPOLOGY);
+ return true;
+}
+
+bool cpu_parse_topology_ext(struct topo_scan *tscan)
+{
+ /* Intel: Try leaf 0x1F first. */
+ if (tscan->c->cpuid_level >= 0x1f && parse_topology_leaf(tscan, 0x1f))
+ return true;
+
+ /* AMD: Try leaf 0x80000026 first. */
+ if (tscan->c->extended_cpuid_level >= 0x80000026 && parse_topology_leaf(tscan, 0x80000026))
+ return true;
+
+ /* Intel/AMD: Fall back to leaf 0xB if available */
+ return tscan->c->cpuid_level >= 0x0b && parse_topology_leaf(tscan, 0x0b);
+}
diff --git a/arch/x86/kernel/cpu/tsx.c b/arch/x86/kernel/cpu/tsx.c
index ec7bbac3a9f2..49782724a943 100644
--- a/arch/x86/kernel/cpu/tsx.c
+++ b/arch/x86/kernel/cpu/tsx.c
@@ -11,6 +11,8 @@
#include <linux/cpufeature.h>
#include <asm/cmdline.h>
+#include <asm/cpu.h>
+#include <asm/msr.h>
#include "cpu.h"
@@ -23,7 +25,7 @@ static void tsx_disable(void)
{
u64 tsx;
- rdmsrl(MSR_IA32_TSX_CTRL, tsx);
+ rdmsrq(MSR_IA32_TSX_CTRL, tsx);
/* Force all transactions to immediately abort */
tsx |= TSX_CTRL_RTM_DISABLE;
@@ -36,14 +38,14 @@ static void tsx_disable(void)
*/
tsx |= TSX_CTRL_CPUID_CLEAR;
- wrmsrl(MSR_IA32_TSX_CTRL, tsx);
+ wrmsrq(MSR_IA32_TSX_CTRL, tsx);
}
static void tsx_enable(void)
{
u64 tsx;
- rdmsrl(MSR_IA32_TSX_CTRL, tsx);
+ rdmsrq(MSR_IA32_TSX_CTRL, tsx);
/* Enable the RTM feature in the cpu */
tsx &= ~TSX_CTRL_RTM_DISABLE;
@@ -55,25 +57,7 @@ static void tsx_enable(void)
*/
tsx &= ~TSX_CTRL_CPUID_CLEAR;
- wrmsrl(MSR_IA32_TSX_CTRL, tsx);
-}
-
-static bool tsx_ctrl_is_supported(void)
-{
- u64 ia32_cap = x86_read_arch_cap_msr();
-
- /*
- * TSX is controlled via MSR_IA32_TSX_CTRL. However, support for this
- * MSR is enumerated by ARCH_CAP_TSX_MSR bit in MSR_IA32_ARCH_CAPABILITIES.
- *
- * TSX control (aka MSR_IA32_TSX_CTRL) is only available after a
- * microcode update on CPUs that have their MSR_IA32_ARCH_CAPABILITIES
- * bit MDS_NO=1. CPUs with MDS_NO=0 are not planned to get
- * MSR_IA32_TSX_CTRL support even after a microcode update. Thus,
- * tsx= cmdline requests will do nothing on CPUs without
- * MSR_IA32_TSX_CTRL support.
- */
- return !!(ia32_cap & ARCH_CAP_TSX_CTRL_MSR);
+ wrmsrq(MSR_IA32_TSX_CTRL, tsx);
}
static enum tsx_ctrl_states x86_get_tsx_auto_mode(void)
@@ -132,13 +116,13 @@ static void tsx_clear_cpuid(void)
*/
if (boot_cpu_has(X86_FEATURE_RTM_ALWAYS_ABORT) &&
boot_cpu_has(X86_FEATURE_TSX_FORCE_ABORT)) {
- rdmsrl(MSR_TSX_FORCE_ABORT, msr);
+ rdmsrq(MSR_TSX_FORCE_ABORT, msr);
msr |= MSR_TFA_TSX_CPUID_CLEAR;
- wrmsrl(MSR_TSX_FORCE_ABORT, msr);
- } else if (tsx_ctrl_is_supported()) {
- rdmsrl(MSR_IA32_TSX_CTRL, msr);
+ wrmsrq(MSR_TSX_FORCE_ABORT, msr);
+ } else if (cpu_feature_enabled(X86_FEATURE_MSR_TSX_CTRL)) {
+ rdmsrq(MSR_IA32_TSX_CTRL, msr);
msr |= TSX_CTRL_CPUID_CLEAR;
- wrmsrl(MSR_IA32_TSX_CTRL, msr);
+ wrmsrq(MSR_IA32_TSX_CTRL, msr);
}
}
@@ -158,15 +142,16 @@ static void tsx_dev_mode_disable(void)
u64 mcu_opt_ctrl;
/* Check if RTM_ALLOW exists */
- if (!boot_cpu_has_bug(X86_BUG_TAA) || !tsx_ctrl_is_supported() ||
+ if (!boot_cpu_has_bug(X86_BUG_TAA) ||
+ !cpu_feature_enabled(X86_FEATURE_MSR_TSX_CTRL) ||
!cpu_feature_enabled(X86_FEATURE_SRBDS_CTRL))
return;
- rdmsrl(MSR_IA32_MCU_OPT_CTRL, mcu_opt_ctrl);
+ rdmsrq(MSR_IA32_MCU_OPT_CTRL, mcu_opt_ctrl);
if (mcu_opt_ctrl & RTM_ALLOW) {
mcu_opt_ctrl &= ~RTM_ALLOW;
- wrmsrl(MSR_IA32_MCU_OPT_CTRL, mcu_opt_ctrl);
+ wrmsrq(MSR_IA32_MCU_OPT_CTRL, mcu_opt_ctrl);
setup_force_cpu_cap(X86_FEATURE_RTM_ALWAYS_ABORT);
}
}
@@ -191,7 +176,20 @@ void __init tsx_init(void)
return;
}
- if (!tsx_ctrl_is_supported()) {
+ /*
+ * TSX is controlled via MSR_IA32_TSX_CTRL. However, support for this
+ * MSR is enumerated by ARCH_CAP_TSX_MSR bit in MSR_IA32_ARCH_CAPABILITIES.
+ *
+ * TSX control (aka MSR_IA32_TSX_CTRL) is only available after a
+ * microcode update on CPUs that have their MSR_IA32_ARCH_CAPABILITIES
+ * bit MDS_NO=1. CPUs with MDS_NO=0 are not planned to get
+ * MSR_IA32_TSX_CTRL support even after a microcode update. Thus,
+ * tsx= cmdline requests will do nothing on CPUs without
+ * MSR_IA32_TSX_CTRL support.
+ */
+ if (x86_read_arch_cap_msr() & ARCH_CAP_TSX_CTRL_MSR) {
+ setup_force_cpu_cap(X86_FEATURE_MSR_TSX_CTRL);
+ } else {
tsx_ctrl_state = TSX_CTRL_NOT_SUPPORTED;
return;
}
diff --git a/arch/x86/kernel/cpu/umwait.c b/arch/x86/kernel/cpu/umwait.c
index ec8064c0ae03..933fcd7ff250 100644
--- a/arch/x86/kernel/cpu/umwait.c
+++ b/arch/x86/kernel/cpu/umwait.c
@@ -33,7 +33,7 @@ static DEFINE_MUTEX(umwait_lock);
static void umwait_update_control_msr(void * unused)
{
lockdep_assert_irqs_disabled();
- wrmsr(MSR_IA32_UMWAIT_CONTROL, READ_ONCE(umwait_control_cached), 0);
+ wrmsrq(MSR_IA32_UMWAIT_CONTROL, READ_ONCE(umwait_control_cached));
}
/*
@@ -71,7 +71,7 @@ static int umwait_cpu_offline(unsigned int cpu)
* the original control MSR value in umwait_init(). So there
* is no race condition here.
*/
- wrmsr(MSR_IA32_UMWAIT_CONTROL, orig_umwait_control_cached, 0);
+ wrmsrq(MSR_IA32_UMWAIT_CONTROL, orig_umwait_control_cached);
return 0;
}
@@ -214,7 +214,7 @@ static int __init umwait_init(void)
* changed. This is the only place where orig_umwait_control_cached
* is modified.
*/
- rdmsrl(MSR_IA32_UMWAIT_CONTROL, orig_umwait_control_cached);
+ rdmsrq(MSR_IA32_UMWAIT_CONTROL, orig_umwait_control_cached);
ret = cpuhp_setup_state(CPUHP_AP_ONLINE_DYN, "umwait:online",
umwait_cpu_online, umwait_cpu_offline);
@@ -232,7 +232,11 @@ static int __init umwait_init(void)
* Add umwait control interface. Ignore failure, so at least the
* default values are set up in case the machine manages to boot.
*/
- dev = cpu_subsys.dev_root;
- return sysfs_create_group(&dev->kobj, &umwait_attr_group);
+ dev = bus_get_dev_root(&cpu_subsys);
+ if (dev) {
+ ret = sysfs_create_group(&dev->kobj, &umwait_attr_group);
+ put_device(dev);
+ }
+ return ret;
}
device_initcall(umwait_init);
diff --git a/arch/x86/kernel/cpu/vmware.c b/arch/x86/kernel/cpu/vmware.c
index 02039ec3597d..cb3f900c46fc 100644
--- a/arch/x86/kernel/cpu/vmware.c
+++ b/arch/x86/kernel/cpu/vmware.c
@@ -26,6 +26,7 @@
#include <linux/export.h>
#include <linux/clocksource.h>
#include <linux/cpu.h>
+#include <linux/efi.h>
#include <linux/reboot.h>
#include <linux/static_call.h>
#include <asm/div64.h>
@@ -41,80 +42,97 @@
#define CPUID_VMWARE_INFO_LEAF 0x40000000
#define CPUID_VMWARE_FEATURES_LEAF 0x40000010
-#define CPUID_VMWARE_FEATURES_ECX_VMMCALL BIT(0)
-#define CPUID_VMWARE_FEATURES_ECX_VMCALL BIT(1)
-#define VMWARE_HYPERVISOR_MAGIC 0x564D5868
-
-#define VMWARE_CMD_GETVERSION 10
-#define VMWARE_CMD_GETHZ 45
-#define VMWARE_CMD_GETVCPU_INFO 68
-#define VMWARE_CMD_LEGACY_X2APIC 3
-#define VMWARE_CMD_VCPU_RESERVED 31
-#define VMWARE_CMD_STEALCLOCK 91
+#define GETVCPU_INFO_LEGACY_X2APIC BIT(3)
+#define GETVCPU_INFO_VCPU_RESERVED BIT(31)
#define STEALCLOCK_NOT_AVAILABLE (-1)
#define STEALCLOCK_DISABLED 0
#define STEALCLOCK_ENABLED 1
-#define VMWARE_PORT(cmd, eax, ebx, ecx, edx) \
- __asm__("inl (%%dx), %%eax" : \
- "=a"(eax), "=c"(ecx), "=d"(edx), "=b"(ebx) : \
- "a"(VMWARE_HYPERVISOR_MAGIC), \
- "c"(VMWARE_CMD_##cmd), \
- "d"(VMWARE_HYPERVISOR_PORT), "b"(UINT_MAX) : \
- "memory")
-
-#define VMWARE_VMCALL(cmd, eax, ebx, ecx, edx) \
- __asm__("vmcall" : \
- "=a"(eax), "=c"(ecx), "=d"(edx), "=b"(ebx) : \
- "a"(VMWARE_HYPERVISOR_MAGIC), \
- "c"(VMWARE_CMD_##cmd), \
- "d"(0), "b"(UINT_MAX) : \
- "memory")
-
-#define VMWARE_VMMCALL(cmd, eax, ebx, ecx, edx) \
- __asm__("vmmcall" : \
- "=a"(eax), "=c"(ecx), "=d"(edx), "=b"(ebx) : \
- "a"(VMWARE_HYPERVISOR_MAGIC), \
- "c"(VMWARE_CMD_##cmd), \
- "d"(0), "b"(UINT_MAX) : \
- "memory")
-
-#define VMWARE_CMD(cmd, eax, ebx, ecx, edx) do { \
- switch (vmware_hypercall_mode) { \
- case CPUID_VMWARE_FEATURES_ECX_VMCALL: \
- VMWARE_VMCALL(cmd, eax, ebx, ecx, edx); \
- break; \
- case CPUID_VMWARE_FEATURES_ECX_VMMCALL: \
- VMWARE_VMMCALL(cmd, eax, ebx, ecx, edx); \
- break; \
- default: \
- VMWARE_PORT(cmd, eax, ebx, ecx, edx); \
- break; \
- } \
- } while (0)
-
struct vmware_steal_time {
union {
- uint64_t clock; /* stolen time counter in units of vtsc */
+ u64 clock; /* stolen time counter in units of vtsc */
struct {
/* only for little-endian */
- uint32_t clock_low;
- uint32_t clock_high;
+ u32 clock_low;
+ u32 clock_high;
};
};
- uint64_t reserved[7];
+ u64 reserved[7];
};
static unsigned long vmware_tsc_khz __ro_after_init;
static u8 vmware_hypercall_mode __ro_after_init;
+unsigned long vmware_hypercall_slow(unsigned long cmd,
+ unsigned long in1, unsigned long in3,
+ unsigned long in4, unsigned long in5,
+ u32 *out1, u32 *out2, u32 *out3,
+ u32 *out4, u32 *out5)
+{
+ unsigned long out0, rbx, rcx, rdx, rsi, rdi;
+
+ switch (vmware_hypercall_mode) {
+ case CPUID_VMWARE_FEATURES_ECX_VMCALL:
+ asm_inline volatile ("vmcall"
+ : "=a" (out0), "=b" (rbx), "=c" (rcx),
+ "=d" (rdx), "=S" (rsi), "=D" (rdi)
+ : "a" (VMWARE_HYPERVISOR_MAGIC),
+ "b" (in1),
+ "c" (cmd),
+ "d" (in3),
+ "S" (in4),
+ "D" (in5)
+ : "cc", "memory");
+ break;
+ case CPUID_VMWARE_FEATURES_ECX_VMMCALL:
+ asm_inline volatile ("vmmcall"
+ : "=a" (out0), "=b" (rbx), "=c" (rcx),
+ "=d" (rdx), "=S" (rsi), "=D" (rdi)
+ : "a" (VMWARE_HYPERVISOR_MAGIC),
+ "b" (in1),
+ "c" (cmd),
+ "d" (in3),
+ "S" (in4),
+ "D" (in5)
+ : "cc", "memory");
+ break;
+ default:
+ asm_inline volatile ("movw %[port], %%dx; inl (%%dx), %%eax"
+ : "=a" (out0), "=b" (rbx), "=c" (rcx),
+ "=d" (rdx), "=S" (rsi), "=D" (rdi)
+ : [port] "i" (VMWARE_HYPERVISOR_PORT),
+ "a" (VMWARE_HYPERVISOR_MAGIC),
+ "b" (in1),
+ "c" (cmd),
+ "d" (in3),
+ "S" (in4),
+ "D" (in5)
+ : "cc", "memory");
+ break;
+ }
+
+ if (out1)
+ *out1 = rbx;
+ if (out2)
+ *out2 = rcx;
+ if (out3)
+ *out3 = rdx;
+ if (out4)
+ *out4 = rsi;
+ if (out5)
+ *out5 = rdi;
+
+ return out0;
+}
+
static inline int __vmware_platform(void)
{
- uint32_t eax, ebx, ecx, edx;
- VMWARE_CMD(GETVERSION, eax, ebx, ecx, edx);
- return eax != (uint32_t)-1 && ebx == VMWARE_HYPERVISOR_MAGIC;
+ u32 eax, ebx, ecx;
+
+ eax = vmware_hypercall3(VMWARE_CMD_GETVERSION, 0, &ebx, &ecx);
+ return eax != UINT_MAX && ebx == VMWARE_HYPERVISOR_MAGIC;
}
static unsigned long vmware_get_tsc_khz(void)
@@ -143,7 +161,7 @@ static __init int parse_no_stealacc(char *arg)
}
early_param("no-steal-acc", parse_no_stealacc);
-static unsigned long long notrace vmware_sched_clock(void)
+static noinstr u64 vmware_sched_clock(void)
{
unsigned long long ns;
@@ -166,21 +184,12 @@ static void __init vmware_cyc2ns_setup(void)
pr_info("using clock offset of %llu ns\n", d->cyc2ns_offset);
}
-static int vmware_cmd_stealclock(uint32_t arg1, uint32_t arg2)
+static int vmware_cmd_stealclock(u32 addr_hi, u32 addr_lo)
{
- uint32_t result, info;
-
- asm volatile (VMWARE_HYPERCALL :
- "=a"(result),
- "=c"(info) :
- "a"(VMWARE_HYPERVISOR_MAGIC),
- "b"(0),
- "c"(VMWARE_CMD_STEALCLOCK),
- "d"(0),
- "S"(arg1),
- "D"(arg2) :
- "memory");
- return result;
+ u32 info;
+
+ return vmware_hypercall5(VMWARE_CMD_STEALCLOCK, 0, 0, addr_hi, addr_lo,
+ &info);
}
static bool stealclock_enable(phys_addr_t pa)
@@ -215,15 +224,15 @@ static bool vmware_is_stealclock_available(void)
* Return:
* The steal clock reading in ns.
*/
-static uint64_t vmware_steal_clock(int cpu)
+static u64 vmware_steal_clock(int cpu)
{
struct vmware_steal_time *steal = &per_cpu(vmw_steal_time, cpu);
- uint64_t clock;
+ u64 clock;
if (IS_ENABLED(CONFIG_64BIT))
clock = READ_ONCE(steal->clock);
else {
- uint32_t initial_high, low, high;
+ u32 initial_high, low, high;
do {
initial_high = READ_ONCE(steal->clock_high);
@@ -235,7 +244,7 @@ static uint64_t vmware_steal_clock(int cpu)
high = READ_ONCE(steal->clock_high);
} while (initial_high != high);
- clock = ((uint64_t)high << 32) | low;
+ clock = ((u64)high << 32) | low;
}
return mul_u64_u32_shr(clock, vmware_cyc2ns.cyc2ns_mul,
@@ -389,13 +398,13 @@ static void __init vmware_set_capabilities(void)
static void __init vmware_platform_setup(void)
{
- uint32_t eax, ebx, ecx, edx;
- uint64_t lpj, tsc_khz;
+ u32 eax, ebx, ecx;
+ u64 lpj, tsc_khz;
- VMWARE_CMD(GETHZ, eax, ebx, ecx, edx);
+ eax = vmware_hypercall3(VMWARE_CMD_GETHZ, UINT_MAX, &ebx, &ecx);
if (ebx != UINT_MAX) {
- lpj = tsc_khz = eax | (((uint64_t)ebx) << 32);
+ lpj = tsc_khz = eax | (((u64)ebx) << 32);
do_div(tsc_khz, 1000);
WARN_ON(tsc_khz >> 32);
pr_info("TSC freq read from hypervisor : %lu.%03lu MHz\n",
@@ -421,6 +430,9 @@ static void __init vmware_platform_setup(void)
pr_warn("Failed to get TSC freq from the hypervisor\n");
}
+ if (cc_platform_has(CC_ATTR_GUEST_SEV_SNP) && !efi_enabled(EFI_BOOT))
+ x86_init.mpparse.find_mptable = mpparse_find_mptable;
+
vmware_paravirt_ops_setup();
#ifdef CONFIG_X86_IO_APIC
@@ -446,7 +458,7 @@ static u8 __init vmware_select_hypercall(void)
* If !boot_cpu_has(X86_FEATURE_HYPERVISOR), vmware_hypercall_mode
* intentionally defaults to 0.
*/
-static uint32_t __init vmware_platform(void)
+static u32 __init vmware_platform(void)
{
if (boot_cpu_has(X86_FEATURE_HYPERVISOR)) {
unsigned int eax;
@@ -474,11 +486,64 @@ static uint32_t __init vmware_platform(void)
/* Checks if hypervisor supports x2apic without VT-D interrupt remapping. */
static bool __init vmware_legacy_x2apic_available(void)
{
- uint32_t eax, ebx, ecx, edx;
- VMWARE_CMD(GETVCPU_INFO, eax, ebx, ecx, edx);
- return !(eax & BIT(VMWARE_CMD_VCPU_RESERVED)) &&
- (eax & BIT(VMWARE_CMD_LEGACY_X2APIC));
+ u32 eax;
+
+ eax = vmware_hypercall1(VMWARE_CMD_GETVCPU_INFO, 0);
+ return !(eax & GETVCPU_INFO_VCPU_RESERVED) &&
+ (eax & GETVCPU_INFO_LEGACY_X2APIC);
+}
+
+#ifdef CONFIG_INTEL_TDX_GUEST
+/*
+ * TDCALL[TDG.VP.VMCALL] uses %rax (arg0) and %rcx (arg2). Therefore,
+ * we remap those registers to %r12 and %r13, respectively.
+ */
+unsigned long vmware_tdx_hypercall(unsigned long cmd,
+ unsigned long in1, unsigned long in3,
+ unsigned long in4, unsigned long in5,
+ u32 *out1, u32 *out2, u32 *out3,
+ u32 *out4, u32 *out5)
+{
+ struct tdx_module_args args = {};
+
+ if (!hypervisor_is_type(X86_HYPER_VMWARE)) {
+ pr_warn_once("Incorrect usage\n");
+ return ULONG_MAX;
+ }
+
+ if (cmd & ~VMWARE_CMD_MASK) {
+ pr_warn_once("Out of range command %lx\n", cmd);
+ return ULONG_MAX;
+ }
+
+ args.rbx = in1;
+ args.rdx = in3;
+ args.rsi = in4;
+ args.rdi = in5;
+ args.r10 = VMWARE_TDX_VENDOR_LEAF;
+ args.r11 = VMWARE_TDX_HCALL_FUNC;
+ args.r12 = VMWARE_HYPERVISOR_MAGIC;
+ args.r13 = cmd;
+ /* CPL */
+ args.r15 = 0;
+
+ __tdx_hypercall(&args);
+
+ if (out1)
+ *out1 = args.rbx;
+ if (out2)
+ *out2 = args.r13;
+ if (out3)
+ *out3 = args.rdx;
+ if (out4)
+ *out4 = args.rsi;
+ if (out5)
+ *out5 = args.rdi;
+
+ return args.r12;
}
+EXPORT_SYMBOL_GPL(vmware_tdx_hypercall);
+#endif
#ifdef CONFIG_AMD_MEM_ENCRYPT
static void vmware_sev_es_hcall_prepare(struct ghcb *ghcb,
diff --git a/arch/x86/kernel/cpu/zhaoxin.c b/arch/x86/kernel/cpu/zhaoxin.c
index 05fa4ef63490..89b1c8a70fe8 100644
--- a/arch/x86/kernel/cpu/zhaoxin.c
+++ b/arch/x86/kernel/cpu/zhaoxin.c
@@ -4,6 +4,7 @@
#include <asm/cpu.h>
#include <asm/cpufeature.h>
+#include <asm/msr.h>
#include "cpu.h"
@@ -65,30 +66,12 @@ static void early_init_zhaoxin(struct cpuinfo_x86 *c)
set_cpu_cap(c, X86_FEATURE_CONSTANT_TSC);
set_cpu_cap(c, X86_FEATURE_NONSTOP_TSC);
}
-
- if (c->cpuid_level >= 0x00000001) {
- u32 eax, ebx, ecx, edx;
-
- cpuid(0x00000001, &eax, &ebx, &ecx, &edx);
- /*
- * If HTT (EDX[28]) is set EBX[16:23] contain the number of
- * apicids which are reserved per package. Store the resulting
- * shift value for the package management code.
- */
- if (edx & (1U << 28))
- c->x86_coreid_bits = get_count_order((ebx >> 16) & 0xff);
- }
-
}
static void init_zhaoxin(struct cpuinfo_x86 *c)
{
early_init_zhaoxin(c);
init_intel_cacheinfo(c);
- detect_num_cpu_cores(c);
-#ifdef CONFIG_X86_32
- detect_ht(c);
-#endif
if (c->cpuid_level > 9) {
unsigned int eax = cpuid_eax(10);