diff options
Diffstat (limited to '')
140 files changed, 8613 insertions, 4030 deletions
diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile index 2ff3e600f426..f901658d9f7c 100644 --- a/arch/x86/kernel/Makefile +++ b/arch/x86/kernel/Makefile @@ -3,10 +3,6 @@ # Makefile for the linux kernel. # -extra-y := head_$(BITS).o -extra-y += head$(BITS).o -extra-y += ebda.o -extra-y += platform-quirks.o extra-y += vmlinux.lds CPPFLAGS_vmlinux.lds += -U$(UTS_MACHINE) @@ -21,7 +17,6 @@ CFLAGS_REMOVE_ftrace.o = -pg CFLAGS_REMOVE_early_printk.o = -pg CFLAGS_REMOVE_head64.o = -pg CFLAGS_REMOVE_sev.o = -pg -CFLAGS_REMOVE_cc_platform.o = -pg endif KASAN_SANITIZE_head$(BITS).o := n @@ -30,17 +25,12 @@ KASAN_SANITIZE_dumpstack_$(BITS).o := n KASAN_SANITIZE_stacktrace.o := n KASAN_SANITIZE_paravirt.o := n KASAN_SANITIZE_sev.o := n -KASAN_SANITIZE_cc_platform.o := n # With some compiler versions the generated code results in boot hangs, caused # by several compilation units. To be safe, disable all instrumentation. KCSAN_SANITIZE := n - -OBJECT_FILES_NON_STANDARD_test_nx.o := y - -ifdef CONFIG_FRAME_POINTER -OBJECT_FILES_NON_STANDARD_ftrace_$(BITS).o := y -endif +KMSAN_SANITIZE_head$(BITS).o := n +KMSAN_SANITIZE_nmi.o := n # If instrumentation of this dir is enabled, boot hangs during first second. # Probably could be more selective here, but note that files related to irqs, @@ -48,12 +38,13 @@ endif # non-deterministic coverage. KCOV_INSTRUMENT := n -CFLAGS_head$(BITS).o += -fno-stack-protector -CFLAGS_cc_platform.o += -fno-stack-protector - CFLAGS_irq.o := -I $(srctree)/$(src)/../include/asm/trace -obj-y := process_$(BITS).o signal.o +obj-y += head_$(BITS).o +obj-y += head$(BITS).o +obj-y += ebda.o +obj-y += platform-quirks.o +obj-y += process_$(BITS).o signal.o obj-$(CONFIG_COMPAT) += signal_compat.o obj-y += traps.o idt.o irq.o irq_$(BITS).o dumpstack_$(BITS).o obj-y += time.o ioport.o dumpstack.o nmi.o @@ -71,7 +62,6 @@ obj-y += bootflag.o e820.o obj-y += pci-dma.o quirks.o topology.o kdebugfs.o obj-y += alternative.o i8253.o hw_breakpoint.o obj-y += tsc.o tsc_msr.o io_delay.o rtc.o -obj-y += pci-iommu_table.o obj-y += resource.o obj-y += irqflags.o obj-y += static_call.o @@ -84,7 +74,7 @@ obj-$(CONFIG_IA32_EMULATION) += tls.o obj-y += step.o obj-$(CONFIG_INTEL_TXT) += tboot.o obj-$(CONFIG_ISA_DMA_API) += i8237.o -obj-$(CONFIG_STACKTRACE) += stacktrace.o +obj-y += stacktrace.o obj-y += cpu/ obj-y += acpi/ obj-y += reboot.o @@ -106,6 +96,7 @@ obj-$(CONFIG_FUNCTION_GRAPH_TRACER) += ftrace.o obj-$(CONFIG_FTRACE_SYSCALLS) += ftrace.o obj-$(CONFIG_X86_TSC) += trace_clock.o obj-$(CONFIG_TRACING) += trace.o +obj-$(CONFIG_RETHOOK) += rethook.o obj-$(CONFIG_CRASH_CORE) += crash_core_$(BITS).o obj-$(CONFIG_KEXEC_CORE) += machine_kexec_$(BITS).o obj-$(CONFIG_KEXEC_CORE) += relocate_kernel_$(BITS).o crash.o @@ -136,7 +127,6 @@ obj-$(CONFIG_PCSPKR_PLATFORM) += pcspeaker.o obj-$(CONFIG_X86_CHECK_BIOS_CORRUPTION) += check.o -obj-$(CONFIG_SWIOTLB) += pci-swiotlb.o obj-$(CONFIG_OF) += devicetree.o obj-$(CONFIG_UPROBES) += uprobes.o @@ -151,7 +141,7 @@ obj-$(CONFIG_UNWINDER_GUESS) += unwind_guess.o obj-$(CONFIG_AMD_MEM_ENCRYPT) += sev.o -obj-$(CONFIG_ARCH_HAS_CC_PLATFORM) += cc_platform.o +obj-$(CONFIG_CFI_CLANG) += cfi.o ### # 64 bit specific files diff --git a/arch/x86/kernel/acpi/Makefile b/arch/x86/kernel/acpi/Makefile index cf340d85946a..fc17b3f136fe 100644 --- a/arch/x86/kernel/acpi/Makefile +++ b/arch/x86/kernel/acpi/Makefile @@ -3,7 +3,7 @@ obj-$(CONFIG_ACPI) += boot.o obj-$(CONFIG_ACPI_SLEEP) += sleep.o wakeup_$(BITS).o obj-$(CONFIG_ACPI_APEI) += apei.o -obj-$(CONFIG_ACPI_CPPC_LIB) += cppc_msr.o +obj-$(CONFIG_ACPI_CPPC_LIB) += cppc.o ifneq ($(CONFIG_ACPI_PROCESSOR),) obj-y += cstate.o diff --git a/arch/x86/kernel/acpi/boot.c b/arch/x86/kernel/acpi/boot.c index 5b6d1a95776f..907cc98b1938 100644 --- a/arch/x86/kernel/acpi/boot.c +++ b/arch/x86/kernel/acpi/boot.c @@ -65,6 +65,13 @@ static u64 acpi_lapic_addr __initdata = APIC_DEFAULT_PHYS_BASE; static bool acpi_support_online_capable; #endif +#ifdef CONFIG_X86_64 +/* Physical address of the Multiprocessor Wakeup Structure mailbox */ +static u64 acpi_mp_wake_mailbox_paddr; +/* Virtual address of the Multiprocessor Wakeup Structure mailbox */ +static struct acpi_madt_multiproc_wakeup_mailbox *acpi_mp_wake_mailbox; +#endif + #ifdef CONFIG_X86_IO_APIC /* * Locks related to IOAPIC hotplug @@ -336,7 +343,60 @@ acpi_parse_lapic_nmi(union acpi_subtable_headers * header, const unsigned long e return 0; } -#endif /*CONFIG_X86_LOCAL_APIC */ +#ifdef CONFIG_X86_64 +static int acpi_wakeup_cpu(int apicid, unsigned long start_ip) +{ + /* + * Remap mailbox memory only for the first call to acpi_wakeup_cpu(). + * + * Wakeup of secondary CPUs is fully serialized in the core code. + * No need to protect acpi_mp_wake_mailbox from concurrent accesses. + */ + if (!acpi_mp_wake_mailbox) { + acpi_mp_wake_mailbox = memremap(acpi_mp_wake_mailbox_paddr, + sizeof(*acpi_mp_wake_mailbox), + MEMREMAP_WB); + } + + /* + * Mailbox memory is shared between the firmware and OS. Firmware will + * listen on mailbox command address, and once it receives the wakeup + * command, the CPU associated with the given apicid will be booted. + * + * The value of 'apic_id' and 'wakeup_vector' must be visible to the + * firmware before the wakeup command is visible. smp_store_release() + * ensures ordering and visibility. + */ + acpi_mp_wake_mailbox->apic_id = apicid; + acpi_mp_wake_mailbox->wakeup_vector = start_ip; + smp_store_release(&acpi_mp_wake_mailbox->command, + ACPI_MP_WAKE_COMMAND_WAKEUP); + + /* + * Wait for the CPU to wake up. + * + * The CPU being woken up is essentially in a spin loop waiting to be + * woken up. It should not take long for it wake up and acknowledge by + * zeroing out ->command. + * + * ACPI specification doesn't provide any guidance on how long kernel + * has to wait for a wake up acknowledgement. It also doesn't provide + * a way to cancel a wake up request if it takes too long. + * + * In TDX environment, the VMM has control over how long it takes to + * wake up secondary. It can postpone scheduling secondary vCPU + * indefinitely. Giving up on wake up request and reporting error opens + * possible attack vector for VMM: it can wake up a secondary CPU when + * kernel doesn't expect it. Wait until positive result of the wake up + * request. + */ + while (READ_ONCE(acpi_mp_wake_mailbox->command)) + cpu_relax(); + + return 0; +} +#endif /* CONFIG_X86_64 */ +#endif /* CONFIG_X86_LOCAL_APIC */ #ifdef CONFIG_X86_IO_APIC #define MP_ISA_BUS 0 @@ -375,7 +435,7 @@ static void __init mp_override_legacy_irq(u8 bus_irq, u8 polarity, u8 trigger, isa_irq_to_gsi[bus_irq] = gsi; } -static int mp_config_acpi_gsi(struct device *dev, u32 gsi, int trigger, +static void mp_config_acpi_gsi(struct device *dev, u32 gsi, int trigger, int polarity) { #ifdef CONFIG_X86_MPPARSE @@ -387,9 +447,9 @@ static int mp_config_acpi_gsi(struct device *dev, u32 gsi, int trigger, u8 pin; if (!acpi_ioapic) - return 0; + return; if (!dev || !dev_is_pci(dev)) - return 0; + return; pdev = to_pci_dev(dev); number = pdev->bus->number; @@ -408,7 +468,6 @@ static int mp_config_acpi_gsi(struct device *dev, u32 gsi, int trigger, mp_save_irq(&mp_irq); #endif - return 0; } static int __init mp_register_ioapic_irq(u8 bus_irq, u8 polarity, @@ -1083,6 +1142,29 @@ static int __init acpi_parse_madt_lapic_entries(void) } return 0; } + +#ifdef CONFIG_X86_64 +static int __init acpi_parse_mp_wake(union acpi_subtable_headers *header, + const unsigned long end) +{ + struct acpi_madt_multiproc_wakeup *mp_wake; + + if (!IS_ENABLED(CONFIG_SMP)) + return -ENODEV; + + mp_wake = (struct acpi_madt_multiproc_wakeup *)header; + if (BAD_MADT_ENTRY(mp_wake, end)) + return -EINVAL; + + acpi_table_print_madt_entry(&header->common); + + acpi_mp_wake_mailbox_paddr = mp_wake->base_address; + + acpi_wake_cpu_handler_update(acpi_wakeup_cpu); + + return 0; +} +#endif /* CONFIG_X86_64 */ #endif /* CONFIG_X86_LOCAL_APIC */ #ifdef CONFIG_X86_IO_APIC @@ -1278,6 +1360,14 @@ static void __init acpi_process_madt(void) smp_found_config = 1; } + +#ifdef CONFIG_X86_64 + /* + * Parse MADT MP Wake entry. + */ + acpi_table_parse_madt(ACPI_MADT_TYPE_MULTIPROC_WAKEUP, + acpi_parse_mp_wake, 1); +#endif } if (error == -EINVAL) { /* @@ -1328,6 +1418,17 @@ static int __init disable_acpi_pci(const struct dmi_system_id *d) return 0; } +static int __init disable_acpi_xsdt(const struct dmi_system_id *d) +{ + if (!acpi_force) { + pr_notice("%s detected: force use of acpi=rsdt\n", d->ident); + acpi_gbl_do_not_use_xsdt = TRUE; + } else { + pr_notice("Warning: DMI blacklist says broken, but acpi XSDT forced\n"); + } + return 0; +} + static int __init dmi_disable_acpi(const struct dmi_system_id *d) { if (!acpi_force) { @@ -1451,6 +1552,19 @@ static const struct dmi_system_id acpi_dmi_table[] __initconst = { DMI_MATCH(DMI_PRODUCT_NAME, "TravelMate 360"), }, }, + /* + * Boxes that need ACPI XSDT use disabled due to corrupted tables + */ + { + .callback = disable_acpi_xsdt, + .ident = "Advantech DAC-BJ01", + .matches = { + DMI_MATCH(DMI_SYS_VENDOR, "NEC"), + DMI_MATCH(DMI_PRODUCT_NAME, "Bearlake CRB Board"), + DMI_MATCH(DMI_BIOS_VERSION, "V1.12"), + DMI_MATCH(DMI_BIOS_DATE, "02/01/2011"), + }, + }, {} }; @@ -1748,7 +1862,7 @@ int __acpi_release_global_lock(unsigned int *lock) void __init arch_reserve_mem_area(acpi_physical_address addr, size_t size) { - e820__range_add(addr, size, E820_TYPE_ACPI); + e820__range_add(addr, size, E820_TYPE_NVS); e820__update_table_print(); } diff --git a/arch/x86/kernel/acpi/cppc.c b/arch/x86/kernel/acpi/cppc.c new file mode 100644 index 000000000000..8d8752b44f11 --- /dev/null +++ b/arch/x86/kernel/acpi/cppc.c @@ -0,0 +1,118 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * cppc.c: CPPC Interface for x86 + * Copyright (c) 2016, Intel Corporation. + */ + +#include <acpi/cppc_acpi.h> +#include <asm/msr.h> +#include <asm/processor.h> +#include <asm/topology.h> + +/* Refer to drivers/acpi/cppc_acpi.c for the description of functions */ + +bool cpc_supported_by_cpu(void) +{ + switch (boot_cpu_data.x86_vendor) { + case X86_VENDOR_AMD: + case X86_VENDOR_HYGON: + if (boot_cpu_data.x86 == 0x19 && ((boot_cpu_data.x86_model <= 0x0f) || + (boot_cpu_data.x86_model >= 0x20 && boot_cpu_data.x86_model <= 0x2f))) + return true; + else if (boot_cpu_data.x86 == 0x17 && + boot_cpu_data.x86_model >= 0x70 && boot_cpu_data.x86_model <= 0x7f) + return true; + return boot_cpu_has(X86_FEATURE_CPPC); + } + return false; +} + +bool cpc_ffh_supported(void) +{ + return true; +} + +int cpc_read_ffh(int cpunum, struct cpc_reg *reg, u64 *val) +{ + int err; + + err = rdmsrl_safe_on_cpu(cpunum, reg->address, val); + if (!err) { + u64 mask = GENMASK_ULL(reg->bit_offset + reg->bit_width - 1, + reg->bit_offset); + + *val &= mask; + *val >>= reg->bit_offset; + } + return err; +} + +int cpc_write_ffh(int cpunum, struct cpc_reg *reg, u64 val) +{ + u64 rd_val; + int err; + + err = rdmsrl_safe_on_cpu(cpunum, reg->address, &rd_val); + if (!err) { + u64 mask = GENMASK_ULL(reg->bit_offset + reg->bit_width - 1, + reg->bit_offset); + + val <<= reg->bit_offset; + val &= mask; + rd_val &= ~mask; + rd_val |= val; + err = wrmsrl_safe_on_cpu(cpunum, reg->address, rd_val); + } + return err; +} + +static void amd_set_max_freq_ratio(void) +{ + struct cppc_perf_caps perf_caps; + u64 highest_perf, nominal_perf; + u64 perf_ratio; + int rc; + + rc = cppc_get_perf_caps(0, &perf_caps); + if (rc) { + pr_debug("Could not retrieve perf counters (%d)\n", rc); + return; + } + + highest_perf = amd_get_highest_perf(); + nominal_perf = perf_caps.nominal_perf; + + if (!highest_perf || !nominal_perf) { + pr_debug("Could not retrieve highest or nominal performance\n"); + return; + } + + perf_ratio = div_u64(highest_perf * SCHED_CAPACITY_SCALE, nominal_perf); + /* midpoint between max_boost and max_P */ + perf_ratio = (perf_ratio + SCHED_CAPACITY_SCALE) >> 1; + if (!perf_ratio) { + pr_debug("Non-zero highest/nominal perf values led to a 0 ratio\n"); + return; + } + + freq_invariance_set_perf_ratio(perf_ratio, false); +} + +static DEFINE_MUTEX(freq_invariance_lock); + +void init_freq_invariance_cppc(void) +{ + static bool init_done; + + if (!cpu_feature_enabled(X86_FEATURE_APERFMPERF)) + return; + + if (boot_cpu_data.x86_vendor != X86_VENDOR_AMD) + return; + + mutex_lock(&freq_invariance_lock); + if (!init_done) + amd_set_max_freq_ratio(); + init_done = true; + mutex_unlock(&freq_invariance_lock); +} diff --git a/arch/x86/kernel/acpi/cppc_msr.c b/arch/x86/kernel/acpi/cppc_msr.c deleted file mode 100644 index b961de569e7e..000000000000 --- a/arch/x86/kernel/acpi/cppc_msr.c +++ /dev/null @@ -1,49 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0-only -/* - * cppc_msr.c: MSR Interface for CPPC - * Copyright (c) 2016, Intel Corporation. - */ - -#include <acpi/cppc_acpi.h> -#include <asm/msr.h> - -/* Refer to drivers/acpi/cppc_acpi.c for the description of functions */ - -bool cpc_ffh_supported(void) -{ - return true; -} - -int cpc_read_ffh(int cpunum, struct cpc_reg *reg, u64 *val) -{ - int err; - - err = rdmsrl_safe_on_cpu(cpunum, reg->address, val); - if (!err) { - u64 mask = GENMASK_ULL(reg->bit_offset + reg->bit_width - 1, - reg->bit_offset); - - *val &= mask; - *val >>= reg->bit_offset; - } - return err; -} - -int cpc_write_ffh(int cpunum, struct cpc_reg *reg, u64 val) -{ - u64 rd_val; - int err; - - err = rdmsrl_safe_on_cpu(cpunum, reg->address, &rd_val); - if (!err) { - u64 mask = GENMASK_ULL(reg->bit_offset + reg->bit_width - 1, - reg->bit_offset); - - val <<= reg->bit_offset; - val &= mask; - rd_val &= ~mask; - rd_val |= val; - err = wrmsrl_safe_on_cpu(cpunum, reg->address, rd_val); - } - return err; -} diff --git a/arch/x86/kernel/acpi/sleep.c b/arch/x86/kernel/acpi/sleep.c index 3f85fcae450c..3b7f4cdbf2e0 100644 --- a/arch/x86/kernel/acpi/sleep.c +++ b/arch/x86/kernel/acpi/sleep.c @@ -15,6 +15,7 @@ #include <asm/desc.h> #include <asm/cacheflush.h> #include <asm/realmode.h> +#include <asm/hypervisor.h> #include <linux/ftrace.h> #include "../../realmode/rm/wakeup.h" @@ -139,8 +140,10 @@ static int __init acpi_sleep_setup(char *str) if (strncmp(str, "s3_beep", 7) == 0) acpi_realmode_flags |= 4; #ifdef CONFIG_HIBERNATION + if (strncmp(str, "s4_hwsig", 8) == 0) + acpi_check_s4_hw_signature = 1; if (strncmp(str, "s4_nohwsig", 10) == 0) - acpi_no_s4_hw_signature(); + acpi_check_s4_hw_signature = 0; #endif if (strncmp(str, "nonvs", 5) == 0) acpi_nvs_nosave(); @@ -158,3 +161,21 @@ static int __init acpi_sleep_setup(char *str) } __setup("acpi_sleep=", acpi_sleep_setup); + +#if defined(CONFIG_HIBERNATION) && defined(CONFIG_HYPERVISOR_GUEST) +static int __init init_s4_sigcheck(void) +{ + /* + * If running on a hypervisor, honour the ACPI specification + * by default and trigger a clean reboot when the hardware + * signature in FACS is changed after hibernation. + */ + if (acpi_check_s4_hw_signature == -1 && + !hypervisor_is_type(X86_HYPER_NATIVE)) + acpi_check_s4_hw_signature = 1; + + return 0; +} +/* This must happen before acpi_init() which is a subsys initcall */ +arch_initcall(init_s4_sigcheck); +#endif diff --git a/arch/x86/kernel/acpi/wakeup_32.S b/arch/x86/kernel/acpi/wakeup_32.S index daf88f8143c5..cf69081073b5 100644 --- a/arch/x86/kernel/acpi/wakeup_32.S +++ b/arch/x86/kernel/acpi/wakeup_32.S @@ -60,7 +60,7 @@ save_registers: popl saved_context_eflags movl $ret_point, saved_eip - ret + RET restore_registers: @@ -70,7 +70,7 @@ restore_registers: movl saved_context_edi, %edi pushl saved_context_eflags popfl - ret + RET SYM_CODE_START(do_suspend_lowlevel) call save_processor_state @@ -86,7 +86,7 @@ SYM_CODE_START(do_suspend_lowlevel) ret_point: call restore_registers call restore_processor_state - ret + RET SYM_CODE_END(do_suspend_lowlevel) .data diff --git a/arch/x86/kernel/alternative.c b/arch/x86/kernel/alternative.c index 23fb4d51a5da..5cadcea035e0 100644 --- a/arch/x86/kernel/alternative.c +++ b/arch/x86/kernel/alternative.c @@ -115,6 +115,8 @@ static void __init_or_module add_nops(void *insns, unsigned int len) } extern s32 __retpoline_sites[], __retpoline_sites_end[]; +extern s32 __return_sites[], __return_sites_end[]; +extern s32 __ibt_endbr_seal[], __ibt_endbr_seal_end[]; extern struct alt_instr __alt_instructions[], __alt_instructions_end[]; extern s32 __smp_locks[], __smp_locks_end[]; void text_poke_early(void *addr, const void *opcode, size_t len); @@ -337,7 +339,7 @@ next: } } -#if defined(CONFIG_RETPOLINE) && defined(CONFIG_STACK_VALIDATION) +#if defined(CONFIG_RETPOLINE) && defined(CONFIG_OBJTOOL) /* * CALL/JMP *%\reg @@ -389,7 +391,7 @@ static int emit_indirect(int op, int reg, u8 *bytes) * * CALL *%\reg * - * It also tries to inline spectre_v2=retpoline,amd when size permits. + * It also tries to inline spectre_v2=retpoline,lfence when size permits. */ static int patch_retpoline(void *addr, struct insn *insn, u8 *bytes) { @@ -407,7 +409,7 @@ static int patch_retpoline(void *addr, struct insn *insn, u8 *bytes) BUG_ON(reg == 4); if (cpu_feature_enabled(X86_FEATURE_RETPOLINE) && - !cpu_feature_enabled(X86_FEATURE_RETPOLINE_AMD)) + !cpu_feature_enabled(X86_FEATURE_RETPOLINE_LFENCE)) return -1; op = insn->opcode.bytes[0]; @@ -438,9 +440,9 @@ static int patch_retpoline(void *addr, struct insn *insn, u8 *bytes) } /* - * For RETPOLINE_AMD: prepend the indirect CALL/JMP with an LFENCE. + * For RETPOLINE_LFENCE: prepend the indirect CALL/JMP with an LFENCE. */ - if (cpu_feature_enabled(X86_FEATURE_RETPOLINE_AMD)) { + if (cpu_feature_enabled(X86_FEATURE_RETPOLINE_LFENCE)) { bytes[i++] = 0x0f; bytes[i++] = 0xae; bytes[i++] = 0xe8; /* LFENCE */ @@ -451,6 +453,15 @@ static int patch_retpoline(void *addr, struct insn *insn, u8 *bytes) return ret; i += ret; + /* + * The compiler is supposed to EMIT an INT3 after every unconditional + * JMP instruction due to AMD BTC. However, if the compiler is too old + * or SLS isn't enabled, we still need an INT3 after indirect JMPs + * even on Intel. + */ + if (op == JMP32_INSN_OPCODE && i < insn->length) + bytes[i++] = INT3_INSN_OPCODE; + for (; i < insn->length;) bytes[i++] = BYTES_NOP1; @@ -506,11 +517,116 @@ void __init_or_module noinline apply_retpolines(s32 *start, s32 *end) } } -#else /* !RETPOLINES || !CONFIG_STACK_VALIDATION */ +#ifdef CONFIG_RETHUNK +/* + * Rewrite the compiler generated return thunk tail-calls. + * + * For example, convert: + * + * JMP __x86_return_thunk + * + * into: + * + * RET + */ +static int patch_return(void *addr, struct insn *insn, u8 *bytes) +{ + int i = 0; + + if (cpu_feature_enabled(X86_FEATURE_RETHUNK)) + return -1; + + bytes[i++] = RET_INSN_OPCODE; + + for (; i < insn->length;) + bytes[i++] = INT3_INSN_OPCODE; + + return i; +} + +void __init_or_module noinline apply_returns(s32 *start, s32 *end) +{ + s32 *s; + + for (s = start; s < end; s++) { + void *dest = NULL, *addr = (void *)s + *s; + struct insn insn; + int len, ret; + u8 bytes[16]; + u8 op; + + ret = insn_decode_kernel(&insn, addr); + if (WARN_ON_ONCE(ret < 0)) + continue; + + op = insn.opcode.bytes[0]; + if (op == JMP32_INSN_OPCODE) + dest = addr + insn.length + insn.immediate.value; + + if (__static_call_fixup(addr, op, dest) || + WARN_ONCE(dest != &__x86_return_thunk, + "missing return thunk: %pS-%pS: %*ph", + addr, dest, 5, addr)) + continue; + + DPRINTK("return thunk at: %pS (%px) len: %d to: %pS", + addr, addr, insn.length, + addr + insn.length + insn.immediate.value); + + len = patch_return(addr, &insn, bytes); + if (len == insn.length) { + DUMP_BYTES(((u8*)addr), len, "%px: orig: ", addr); + DUMP_BYTES(((u8*)bytes), len, "%px: repl: ", addr); + text_poke_early(addr, bytes, len); + } + } +} +#else +void __init_or_module noinline apply_returns(s32 *start, s32 *end) { } +#endif /* CONFIG_RETHUNK */ + +#else /* !CONFIG_RETPOLINE || !CONFIG_OBJTOOL */ void __init_or_module noinline apply_retpolines(s32 *start, s32 *end) { } +void __init_or_module noinline apply_returns(s32 *start, s32 *end) { } + +#endif /* CONFIG_RETPOLINE && CONFIG_OBJTOOL */ + +#ifdef CONFIG_X86_KERNEL_IBT + +/* + * Generated by: objtool --ibt + */ +void __init_or_module noinline apply_ibt_endbr(s32 *start, s32 *end) +{ + s32 *s; + + for (s = start; s < end; s++) { + u32 endbr, poison = gen_endbr_poison(); + void *addr = (void *)s + *s; + + if (WARN_ON_ONCE(get_kernel_nofault(endbr, addr))) + continue; + + if (WARN_ON_ONCE(!is_endbr(endbr))) + continue; + + DPRINTK("ENDBR at: %pS (%px)", addr, addr); + + /* + * When we have IBT, the lack of ENDBR will trigger #CP + */ + DUMP_BYTES(((u8*)addr), 4, "%px: orig: ", addr); + DUMP_BYTES(((u8*)&poison), 4, "%px: repl: ", addr); + text_poke_early(addr, &poison, 4); + } +} -#endif /* CONFIG_RETPOLINE && CONFIG_STACK_VALIDATION */ +#else + +void __init_or_module noinline apply_ibt_endbr(s32 *start, s32 *end) { } + +#endif /* CONFIG_X86_KERNEL_IBT */ #ifdef CONFIG_SMP static void alternatives_smp_lock(const s32 *start, const s32 *end, @@ -713,34 +829,39 @@ asm ( " .pushsection .init.text, \"ax\", @progbits\n" " .type int3_magic, @function\n" "int3_magic:\n" + ANNOTATE_NOENDBR " movl $1, (%" _ASM_ARG1 ")\n" -" ret\n" + ASM_RET " .size int3_magic, .-int3_magic\n" " .popsection\n" ); -extern __initdata unsigned long int3_selftest_ip; /* defined in asm below */ +extern void int3_selftest_ip(void); /* defined in asm below */ static int __init int3_exception_notify(struct notifier_block *self, unsigned long val, void *data) { + unsigned long selftest = (unsigned long)&int3_selftest_ip; struct die_args *args = data; struct pt_regs *regs = args->regs; + OPTIMIZER_HIDE_VAR(selftest); + if (!regs || user_mode(regs)) return NOTIFY_DONE; if (val != DIE_INT3) return NOTIFY_DONE; - if (regs->ip - INT3_INSN_SIZE != int3_selftest_ip) + if (regs->ip - INT3_INSN_SIZE != selftest) return NOTIFY_DONE; int3_emulate_call(regs, (unsigned long)&int3_magic); return NOTIFY_STOP; } -static void __init int3_selftest(void) +/* Must be noinline to ensure uniqueness of int3_selftest_ip. */ +static noinline void __init int3_selftest(void) { static __initdata struct notifier_block int3_exception_nb = { .notifier_call = int3_exception_notify, @@ -753,18 +874,12 @@ static void __init int3_selftest(void) /* * Basically: int3_magic(&val); but really complicated :-) * - * Stick the address of the INT3 instruction into int3_selftest_ip, - * then trigger the INT3, padded with NOPs to match a CALL instruction - * length. + * INT3 padded with NOP to CALL_INSN_SIZE. The int3_exception_nb + * notifier above will emulate CALL for us. */ - asm volatile ("1: int3; nop; nop; nop; nop\n\t" - ".pushsection .init.data,\"aw\"\n\t" - ".align " __ASM_SEL(4, 8) "\n\t" - ".type int3_selftest_ip, @object\n\t" - ".size int3_selftest_ip, " __ASM_SEL(4, 8) "\n\t" - "int3_selftest_ip:\n\t" - __ASM_SEL(.long, .quad) " 1b\n\t" - ".popsection\n\t" + asm volatile ("int3_selftest_ip:\n\t" + ANNOTATE_NOENDBR + " int3; nop; nop; nop; nop\n\t" : ASM_CALL_CONSTRAINT : __ASM_SEL_RAW(a, D) (&val) : "memory"); @@ -824,6 +939,7 @@ void __init alternative_instructions(void) * those can rewrite the retpoline thunks. */ apply_retpolines(__retpoline_sites, __retpoline_sites_end); + apply_returns(__return_sites, __return_sites_end); /* * Then patch alternatives, such that those paravirt calls that are in @@ -831,6 +947,8 @@ void __init alternative_instructions(void) */ apply_alternatives(__alt_instructions, __alt_instructions_end); + apply_ibt_endbr(__ibt_endbr_seal, __ibt_endbr_seal_end); + #ifdef CONFIG_SMP /* Patch to UP if other cpus not imminent. */ if (!noreplace_smp && (num_present_cpus() == 1 || setup_max_cpus <= 1)) { @@ -956,7 +1074,21 @@ static inline void unuse_temporary_mm(temp_mm_state_t prev_state) __ro_after_init struct mm_struct *poking_mm; __ro_after_init unsigned long poking_addr; -static void *__text_poke(void *addr, const void *opcode, size_t len) +static void text_poke_memcpy(void *dst, const void *src, size_t len) +{ + memcpy(dst, src, len); +} + +static void text_poke_memset(void *dst, const void *src, size_t len) +{ + int c = *(const int *)src; + + memset(dst, c, len); +} + +typedef void text_poke_f(void *dst, const void *src, size_t len); + +static void *__text_poke(text_poke_f func, void *addr, const void *src, size_t len) { bool cross_page_boundary = offset_in_page(addr) + len > PAGE_SIZE; struct page *pages[2] = {NULL}; @@ -1021,7 +1153,7 @@ static void *__text_poke(void *addr, const void *opcode, size_t len) prev = use_temporary_mm(poking_mm); kasan_disable_current(); - memcpy((u8 *)poking_addr + offset_in_page(addr), opcode, len); + func((u8 *)poking_addr + offset_in_page(addr), src, len); kasan_enable_current(); /* @@ -1049,11 +1181,13 @@ static void *__text_poke(void *addr, const void *opcode, size_t len) (cross_page_boundary ? 2 : 1) * PAGE_SIZE, PAGE_SHIFT, false); - /* - * If the text does not match what we just wrote then something is - * fundamentally screwy; there's nothing we can really do about that. - */ - BUG_ON(memcmp(addr, opcode, len)); + if (func == text_poke_memcpy) { + /* + * If the text does not match what we just wrote then something is + * fundamentally screwy; there's nothing we can really do about that. + */ + BUG_ON(memcmp(addr, src, len)); + } local_irq_restore(flags); pte_unmap_unlock(ptep, ptl); @@ -1080,7 +1214,7 @@ void *text_poke(void *addr, const void *opcode, size_t len) { lockdep_assert_held(&text_mutex); - return __text_poke(addr, opcode, len); + return __text_poke(text_poke_memcpy, addr, opcode, len); } /** @@ -1099,7 +1233,72 @@ void *text_poke(void *addr, const void *opcode, size_t len) */ void *text_poke_kgdb(void *addr, const void *opcode, size_t len) { - return __text_poke(addr, opcode, len); + return __text_poke(text_poke_memcpy, addr, opcode, len); +} + +/** + * text_poke_copy - Copy instructions into (an unused part of) RX memory + * @addr: address to modify + * @opcode: source of the copy + * @len: length to copy, could be more than 2x PAGE_SIZE + * + * Not safe against concurrent execution; useful for JITs to dump + * new code blocks into unused regions of RX memory. Can be used in + * conjunction with synchronize_rcu_tasks() to wait for existing + * execution to quiesce after having made sure no existing functions + * pointers are live. + */ +void *text_poke_copy(void *addr, const void *opcode, size_t len) +{ + unsigned long start = (unsigned long)addr; + size_t patched = 0; + + if (WARN_ON_ONCE(core_kernel_text(start))) + return NULL; + + mutex_lock(&text_mutex); + while (patched < len) { + unsigned long ptr = start + patched; + size_t s; + + s = min_t(size_t, PAGE_SIZE * 2 - offset_in_page(ptr), len - patched); + + __text_poke(text_poke_memcpy, (void *)ptr, opcode + patched, s); + patched += s; + } + mutex_unlock(&text_mutex); + return addr; +} + +/** + * text_poke_set - memset into (an unused part of) RX memory + * @addr: address to modify + * @c: the byte to fill the area with + * @len: length to copy, could be more than 2x PAGE_SIZE + * + * This is useful to overwrite unused regions of RX memory with illegal + * instructions. + */ +void *text_poke_set(void *addr, int c, size_t len) +{ + unsigned long start = (unsigned long)addr; + size_t patched = 0; + + if (WARN_ON_ONCE(core_kernel_text(start))) + return NULL; + + mutex_lock(&text_mutex); + while (patched < len) { + unsigned long ptr = start + patched; + size_t s; + + s = min_t(size_t, PAGE_SIZE * 2 - offset_in_page(ptr), len - patched); + + __text_poke(text_poke_memset, (void *)ptr, (void *)&c, s); + patched += s; + } + mutex_unlock(&text_mutex); + return addr; } static void do_sync_core(void *info) @@ -1113,10 +1312,13 @@ void text_poke_sync(void) } struct text_poke_loc { - s32 rel_addr; /* addr := _stext + rel_addr */ - s32 rel32; + /* addr := _stext + rel_addr */ + s32 rel_addr; + s32 disp; + u8 len; u8 opcode; const u8 text[POKE_MAX_OPCODE_SIZE]; + /* see text_poke_bp_batch() */ u8 old; }; @@ -1126,21 +1328,23 @@ struct bp_patching_desc { atomic_t refs; }; -static struct bp_patching_desc *bp_desc; +static struct bp_patching_desc bp_desc; static __always_inline -struct bp_patching_desc *try_get_desc(struct bp_patching_desc **descp) +struct bp_patching_desc *try_get_desc(void) { - struct bp_patching_desc *desc = __READ_ONCE(*descp); /* rcu_dereference */ + struct bp_patching_desc *desc = &bp_desc; - if (!desc || !arch_atomic_inc_not_zero(&desc->refs)) + if (!arch_atomic_inc_not_zero(&desc->refs)) return NULL; return desc; } -static __always_inline void put_desc(struct bp_patching_desc *desc) +static __always_inline void put_desc(void) { + struct bp_patching_desc *desc = &bp_desc; + smp_mb__before_atomic(); arch_atomic_dec(&desc->refs); } @@ -1165,7 +1369,7 @@ noinstr int poke_int3_handler(struct pt_regs *regs) { struct bp_patching_desc *desc; struct text_poke_loc *tp; - int len, ret = 0; + int ret = 0; void *ip; if (user_mode(regs)) @@ -1173,15 +1377,15 @@ noinstr int poke_int3_handler(struct pt_regs *regs) /* * Having observed our INT3 instruction, we now must observe - * bp_desc: + * bp_desc with non-zero refcount: * - * bp_desc = desc INT3 + * bp_desc.refs = 1 INT3 * WMB RMB - * write INT3 if (desc) + * write INT3 if (bp_desc.refs != 0) */ smp_rmb(); - desc = try_get_desc(&bp_desc); + desc = try_get_desc(); if (!desc) return 0; @@ -1205,8 +1409,7 @@ noinstr int poke_int3_handler(struct pt_regs *regs) goto out_put; } - len = text_opcode_size(tp->opcode); - ip += len; + ip += tp->len; switch (tp->opcode) { case INT3_INSN_OPCODE: @@ -1221,12 +1424,12 @@ noinstr int poke_int3_handler(struct pt_regs *regs) break; case CALL_INSN_OPCODE: - int3_emulate_call(regs, (long)ip + tp->rel32); + int3_emulate_call(regs, (long)ip + tp->disp); break; case JMP32_INSN_OPCODE: case JMP8_INSN_OPCODE: - int3_emulate_jmp(regs, (long)ip + tp->rel32); + int3_emulate_jmp(regs, (long)ip + tp->disp); break; default: @@ -1236,7 +1439,7 @@ noinstr int poke_int3_handler(struct pt_regs *regs) ret = 1; out_put: - put_desc(desc); + put_desc(); return ret; } @@ -1267,18 +1470,20 @@ static int tp_vec_nr; */ static void text_poke_bp_batch(struct text_poke_loc *tp, unsigned int nr_entries) { - struct bp_patching_desc desc = { - .vec = tp, - .nr_entries = nr_entries, - .refs = ATOMIC_INIT(1), - }; unsigned char int3 = INT3_INSN_OPCODE; unsigned int i; int do_sync; lockdep_assert_held(&text_mutex); - smp_store_release(&bp_desc, &desc); /* rcu_assign_pointer */ + bp_desc.vec = tp; + bp_desc.nr_entries = nr_entries; + + /* + * Corresponds to the implicit memory barrier in try_get_desc() to + * ensure reading a non-zero refcount provides up to date bp_desc data. + */ + atomic_set_release(&bp_desc.refs, 1); /* * Corresponding read barrier in int3 notifier for making sure the @@ -1301,7 +1506,7 @@ static void text_poke_bp_batch(struct text_poke_loc *tp, unsigned int nr_entries */ for (do_sync = 0, i = 0; i < nr_entries; i++) { u8 old[POKE_MAX_OPCODE_SIZE] = { tp[i].old, }; - int len = text_opcode_size(tp[i].opcode); + int len = tp[i].len; if (len - INT3_INSN_SIZE > 0) { memcpy(old + INT3_INSN_SIZE, @@ -1366,33 +1571,47 @@ static void text_poke_bp_batch(struct text_poke_loc *tp, unsigned int nr_entries text_poke_sync(); /* - * Remove and synchronize_rcu(), except we have a very primitive - * refcount based completion. + * Remove and wait for refs to be zero. */ - WRITE_ONCE(bp_desc, NULL); /* RCU_INIT_POINTER */ - if (!atomic_dec_and_test(&desc.refs)) - atomic_cond_read_acquire(&desc.refs, !VAL); + if (!atomic_dec_and_test(&bp_desc.refs)) + atomic_cond_read_acquire(&bp_desc.refs, !VAL); } static void text_poke_loc_init(struct text_poke_loc *tp, void *addr, const void *opcode, size_t len, const void *emulate) { struct insn insn; - int ret; + int ret, i; memcpy((void *)tp->text, opcode, len); if (!emulate) emulate = opcode; ret = insn_decode_kernel(&insn, emulate); - BUG_ON(ret < 0); - BUG_ON(len != insn.length); tp->rel_addr = addr - (void *)_stext; + tp->len = len; tp->opcode = insn.opcode.bytes[0]; switch (tp->opcode) { + case RET_INSN_OPCODE: + case JMP32_INSN_OPCODE: + case JMP8_INSN_OPCODE: + /* + * Control flow instructions without implied execution of the + * next instruction can be padded with INT3. + */ + for (i = insn.length; i < len; i++) + BUG_ON(tp->text[i] != INT3_INSN_OPCODE); + break; + + default: + BUG_ON(len != insn.length); + }; + + + switch (tp->opcode) { case INT3_INSN_OPCODE: case RET_INSN_OPCODE: break; @@ -1400,7 +1619,7 @@ static void text_poke_loc_init(struct text_poke_loc *tp, void *addr, case CALL_INSN_OPCODE: case JMP32_INSN_OPCODE: case JMP8_INSN_OPCODE: - tp->rel32 = insn.immediate.value; + tp->disp = insn.immediate.value; break; default: /* assume NOP */ @@ -1408,13 +1627,13 @@ static void text_poke_loc_init(struct text_poke_loc *tp, void *addr, case 2: /* NOP2 -- emulate as JMP8+0 */ BUG_ON(memcmp(emulate, x86_nops[len], len)); tp->opcode = JMP8_INSN_OPCODE; - tp->rel32 = 0; + tp->disp = 0; break; case 5: /* NOP5 -- emulate as JMP32+0 */ BUG_ON(memcmp(emulate, x86_nops[len], len)); tp->opcode = JMP32_INSN_OPCODE; - tp->rel32 = 0; + tp->disp = 0; break; default: /* unknown instruction */ diff --git a/arch/x86/kernel/amd_gart_64.c b/arch/x86/kernel/amd_gart_64.c index ed837383de5c..19a0207e529f 100644 --- a/arch/x86/kernel/amd_gart_64.c +++ b/arch/x86/kernel/amd_gart_64.c @@ -38,11 +38,9 @@ #include <asm/iommu.h> #include <asm/gart.h> #include <asm/set_memory.h> -#include <asm/swiotlb.h> #include <asm/dma.h> #include <asm/amd_nb.h> #include <asm/x86_init.h> -#include <asm/iommu_table.h> static unsigned long iommu_bus_base; /* GART remapping area (physical) */ static unsigned long iommu_size; /* size of remapping area bytes */ @@ -55,7 +53,7 @@ static u32 *iommu_gatt_base; /* Remapping table */ * of only flushing when an mapping is reused. With it true the GART is * flushed for every mapping. Problem is that doing the lazy flush seems * to trigger bugs with some popular PCI cards, in particular 3ware (but - * has been also also seen with Qlogic at least). + * has been also seen with Qlogic at least). */ static int iommu_fullflush = 1; @@ -808,7 +806,7 @@ int __init gart_iommu_init(void) flush_gart(); dma_ops = &gart_dma_ops; x86_platform.iommu_shutdown = gart_iommu_shutdown; - swiotlb = 0; + x86_swiotlb_enable = false; return 0; } @@ -842,4 +840,3 @@ void __init gart_parse_options(char *p) } } } -IOMMU_INIT_POST(gart_iommu_hole_init); diff --git a/arch/x86/kernel/amd_nb.c b/arch/x86/kernel/amd_nb.c index c92c9c774c0e..4266b64631a4 100644 --- a/arch/x86/kernel/amd_nb.c +++ b/arch/x86/kernel/amd_nb.c @@ -19,17 +19,25 @@ #define PCI_DEVICE_ID_AMD_17H_M10H_ROOT 0x15d0 #define PCI_DEVICE_ID_AMD_17H_M30H_ROOT 0x1480 #define PCI_DEVICE_ID_AMD_17H_M60H_ROOT 0x1630 +#define PCI_DEVICE_ID_AMD_17H_MA0H_ROOT 0x14b5 +#define PCI_DEVICE_ID_AMD_19H_M10H_ROOT 0x14a4 +#define PCI_DEVICE_ID_AMD_19H_M60H_ROOT 0x14d8 +#define PCI_DEVICE_ID_AMD_19H_M70H_ROOT 0x14e8 #define PCI_DEVICE_ID_AMD_17H_DF_F4 0x1464 #define PCI_DEVICE_ID_AMD_17H_M10H_DF_F4 0x15ec #define PCI_DEVICE_ID_AMD_17H_M30H_DF_F4 0x1494 #define PCI_DEVICE_ID_AMD_17H_M60H_DF_F4 0x144c #define PCI_DEVICE_ID_AMD_17H_M70H_DF_F4 0x1444 +#define PCI_DEVICE_ID_AMD_17H_MA0H_DF_F4 0x1728 #define PCI_DEVICE_ID_AMD_19H_DF_F4 0x1654 +#define PCI_DEVICE_ID_AMD_19H_M10H_DF_F4 0x14b1 #define PCI_DEVICE_ID_AMD_19H_M40H_ROOT 0x14b5 #define PCI_DEVICE_ID_AMD_19H_M40H_DF_F4 0x167d #define PCI_DEVICE_ID_AMD_19H_M50H_DF_F4 0x166e +#define PCI_DEVICE_ID_AMD_19H_M60H_DF_F4 0x14e4 +#define PCI_DEVICE_ID_AMD_19H_M70H_DF_F4 0x14f4 -/* Protect the PCI config register pairs used for SMN and DF indirect access. */ +/* Protect the PCI config register pairs used for SMN. */ static DEFINE_MUTEX(smn_mutex); static u32 *flush_words; @@ -39,7 +47,11 @@ static const struct pci_device_id amd_root_ids[] = { { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_17H_M10H_ROOT) }, { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_17H_M30H_ROOT) }, { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_17H_M60H_ROOT) }, + { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_17H_MA0H_ROOT) }, + { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_19H_M10H_ROOT) }, { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_19H_M40H_ROOT) }, + { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_19H_M60H_ROOT) }, + { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_19H_M70H_ROOT) }, {} }; @@ -58,11 +70,15 @@ static const struct pci_device_id amd_nb_misc_ids[] = { { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_17H_M10H_DF_F3) }, { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_17H_M30H_DF_F3) }, { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_17H_M60H_DF_F3) }, + { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_17H_MA0H_DF_F3) }, { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_CNB17H_F3) }, { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_17H_M70H_DF_F3) }, { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_19H_DF_F3) }, + { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_19H_M10H_DF_F3) }, { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_19H_M40H_DF_F3) }, { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_19H_M50H_DF_F3) }, + { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_19H_M60H_DF_F3) }, + { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_19H_M70H_DF_F3) }, {} }; @@ -77,7 +93,9 @@ static const struct pci_device_id amd_nb_link_ids[] = { { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_17H_M30H_DF_F4) }, { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_17H_M60H_DF_F4) }, { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_17H_M70H_DF_F4) }, + { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_17H_MA0H_DF_F4) }, { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_19H_DF_F4) }, + { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_19H_M10H_DF_F4) }, { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_19H_M40H_DF_F4) }, { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_19H_M50H_DF_F4) }, { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_CNB17H_F4) }, @@ -182,55 +200,8 @@ int amd_smn_write(u16 node, u32 address, u32 value) } EXPORT_SYMBOL_GPL(amd_smn_write); -/* - * Data Fabric Indirect Access uses FICAA/FICAD. - * - * Fabric Indirect Configuration Access Address (FICAA): Constructed based - * on the device's Instance Id and the PCI function and register offset of - * the desired register. - * - * Fabric Indirect Configuration Access Data (FICAD): There are FICAD LO - * and FICAD HI registers but so far we only need the LO register. - */ -int amd_df_indirect_read(u16 node, u8 func, u16 reg, u8 instance_id, u32 *lo) -{ - struct pci_dev *F4; - u32 ficaa; - int err = -ENODEV; - - if (node >= amd_northbridges.num) - goto out; - - F4 = node_to_amd_nb(node)->link; - if (!F4) - goto out; - - ficaa = 1; - ficaa |= reg & 0x3FC; - ficaa |= (func & 0x7) << 11; - ficaa |= instance_id << 16; - - mutex_lock(&smn_mutex); - - err = pci_write_config_dword(F4, 0x5C, ficaa); - if (err) { - pr_warn("Error writing DF Indirect FICAA, FICAA=0x%x\n", ficaa); - goto out_unlock; - } - - err = pci_read_config_dword(F4, 0x98, lo); - if (err) - pr_warn("Error reading DF Indirect FICAD LO, FICAA=0x%x.\n", ficaa); - -out_unlock: - mutex_unlock(&smn_mutex); - -out: - return err; -} -EXPORT_SYMBOL_GPL(amd_df_indirect_read); -int amd_cache_northbridges(void) +static int amd_cache_northbridges(void) { const struct pci_device_id *misc_ids = amd_nb_misc_ids; const struct pci_device_id *link_ids = amd_nb_link_ids; @@ -252,14 +223,14 @@ int amd_cache_northbridges(void) } misc = NULL; - while ((misc = next_northbridge(misc, misc_ids)) != NULL) + while ((misc = next_northbridge(misc, misc_ids))) misc_count++; if (!misc_count) return -ENODEV; root = NULL; - while ((root = next_northbridge(root, root_ids)) != NULL) + while ((root = next_northbridge(root, root_ids))) root_count++; if (root_count) { @@ -332,7 +303,6 @@ int amd_cache_northbridges(void) return 0; } -EXPORT_SYMBOL_GPL(amd_cache_northbridges); /* * Ignores subdevice/subvendor but as far as I can figure out diff --git a/arch/x86/kernel/aperture_64.c b/arch/x86/kernel/aperture_64.c index af3ba08b684b..4feaa670d578 100644 --- a/arch/x86/kernel/aperture_64.c +++ b/arch/x86/kernel/aperture_64.c @@ -36,7 +36,7 @@ /* * Using 512M as goal, in case kexec will load kernel_big * that will do the on-position decompress, and could overlap with - * with the gart aperture that is used. + * the gart aperture that is used. * Sequence: * kernel_small * ==> kexec (with kdump trigger path or gart still enabled) @@ -392,7 +392,7 @@ void __init early_gart_iommu_check(void) static int __initdata printed_gart_size_msg; -int __init gart_iommu_hole_init(void) +void __init gart_iommu_hole_init(void) { u32 agp_aper_base = 0, agp_aper_order = 0; u32 aper_size, aper_alloc = 0, aper_order = 0, last_aper_order = 0; @@ -401,11 +401,11 @@ int __init gart_iommu_hole_init(void) int i, node; if (!amd_gart_present()) - return -ENODEV; + return; if (gart_iommu_aperture_disabled || !fix_aperture || !early_pci_allowed()) - return -ENODEV; + return; pr_info("Checking aperture...\n"); @@ -491,10 +491,8 @@ out: * and fixed up the northbridge */ exclude_from_core(last_aper_base, last_aper_order); - - return 1; } - return 0; + return; } if (!fallback_aper_force) { @@ -527,7 +525,7 @@ out: panic("Not enough memory for aperture"); } } else { - return 0; + return; } /* @@ -561,6 +559,4 @@ out: } set_up_gart_resume(aper_order, aper_alloc); - - return 1; } diff --git a/arch/x86/kernel/apic/apic.c b/arch/x86/kernel/apic/apic.c index b70344bf6600..c6876d3ea4b1 100644 --- a/arch/x86/kernel/apic/apic.c +++ b/arch/x86/kernel/apic/apic.c @@ -61,6 +61,7 @@ #include <asm/cpu_device_id.h> #include <asm/intel-family.h> #include <asm/irq_regs.h> +#include <asm/cpu.h> unsigned int num_processors; @@ -170,7 +171,7 @@ static __init int setup_apicpmtimer(char *s) { apic_calibrate_pmtmr = 1; notsc_setup(NULL); - return 0; + return 1; } __setup("apicpmtimer", setup_apicpmtimer); #endif @@ -275,7 +276,7 @@ void native_apic_icr_write(u32 low, u32 id) unsigned long flags; local_irq_save(flags); - apic_write(APIC_ICR2, SET_APIC_DEST_FIELD(id)); + apic_write(APIC_ICR2, SET_XAPIC_DEST_FIELD(id)); apic_write(APIC_ICR, low); local_irq_restore(flags); } @@ -320,6 +321,9 @@ int lapic_get_maxlvt(void) #define APIC_DIVISOR 16 #define TSC_DIVISOR 8 +/* i82489DX specific */ +#define I82489DX_BASE_DIVIDER (((0x2) << 18)) + /* * This function sets up the local APIC timer, with a timeout of * 'clocks' APIC bus clock. During calibration we actually call @@ -340,8 +344,14 @@ static void __setup_APIC_LVTT(unsigned int clocks, int oneshot, int irqen) else if (boot_cpu_has(X86_FEATURE_TSC_DEADLINE_TIMER)) lvtt_value |= APIC_LVT_TIMER_TSCDEADLINE; + /* + * The i82489DX APIC uses bit 18 and 19 for the base divider. This + * overlaps with bit 18 on integrated APICs, but is not documented + * in the SDM. No problem though. i82489DX equipped systems do not + * have TSC deadline timer. + */ if (!lapic_is_integrated()) - lvtt_value |= SET_APIC_TIMER_BASE(APIC_TIMER_BASE_DIV); + lvtt_value |= I82489DX_BASE_DIVIDER; if (!irqen) lvtt_value |= APIC_LVT_MASKED; @@ -1106,11 +1116,6 @@ DEFINE_IDTENTRY_SYSVEC(sysvec_apic_timer_interrupt) set_irq_regs(old_regs); } -int setup_profiling_timer(unsigned int multiplier) -{ - return -EINVAL; -} - /* * Local APIC start and shutdown */ @@ -1419,22 +1424,21 @@ void __init apic_intr_mode_init(void) return; case APIC_VIRTUAL_WIRE: pr_info("APIC: Switch to virtual wire mode setup\n"); - default_setup_apic_routing(); break; case APIC_VIRTUAL_WIRE_NO_CONFIG: pr_info("APIC: Switch to virtual wire mode setup with no configuration\n"); upmode = true; - default_setup_apic_routing(); break; case APIC_SYMMETRIC_IO: pr_info("APIC: Switch to symmetric I/O mode setup\n"); - default_setup_apic_routing(); break; case APIC_SYMMETRIC_IO_NO_ROUTING: pr_info("APIC: Switch to symmetric I/O mode setup in no SMP routine\n"); break; } + default_setup_apic_routing(); + if (x86_platform.apic_post_init) x86_platform.apic_post_init(); @@ -1748,11 +1752,26 @@ EXPORT_SYMBOL_GPL(x2apic_mode); enum { X2APIC_OFF, - X2APIC_ON, X2APIC_DISABLED, + /* All states below here have X2APIC enabled */ + X2APIC_ON, + X2APIC_ON_LOCKED }; static int x2apic_state; +static bool x2apic_hw_locked(void) +{ + u64 ia32_cap; + u64 msr; + + ia32_cap = x86_read_arch_cap_msr(); + if (ia32_cap & ARCH_CAP_XAPIC_DISABLE) { + rdmsrl(MSR_IA32_XAPIC_DISABLE_STATUS, msr); + return (msr & LEGACY_XAPIC_DISABLED); + } + return false; +} + static void __x2apic_disable(void) { u64 msr; @@ -1790,6 +1809,10 @@ static int __init setup_nox2apic(char *str) apicid); return 0; } + if (x2apic_hw_locked()) { + pr_warn("APIC locked in x2apic mode, can't disable\n"); + return 0; + } pr_warn("x2apic already enabled.\n"); __x2apic_disable(); } @@ -1804,10 +1827,18 @@ early_param("nox2apic", setup_nox2apic); void x2apic_setup(void) { /* - * If x2apic is not in ON state, disable it if already enabled + * Try to make the AP's APIC state match that of the BSP, but if the + * BSP is unlocked and the AP is locked then there is a state mismatch. + * Warn about the mismatch in case a GP fault occurs due to a locked AP + * trying to be turned off. + */ + if (x2apic_state != X2APIC_ON_LOCKED && x2apic_hw_locked()) + pr_warn("x2apic lock mismatch between BSP and AP.\n"); + /* + * If x2apic is not in ON or LOCKED state, disable it if already enabled * from BIOS. */ - if (x2apic_state != X2APIC_ON) { + if (x2apic_state < X2APIC_ON) { __x2apic_disable(); return; } @@ -1828,6 +1859,11 @@ static __init void x2apic_disable(void) if (x2apic_id >= 255) panic("Cannot disable x2apic, id: %08x\n", x2apic_id); + if (x2apic_hw_locked()) { + pr_warn("Cannot disable locked x2apic, id: %08x\n", x2apic_id); + return; + } + __x2apic_disable(); register_lapic_address(mp_lapic_addr); } @@ -1886,7 +1922,10 @@ void __init check_x2apic(void) if (x2apic_enabled()) { pr_info("x2apic: enabled by BIOS, switching to x2apic ops\n"); x2apic_mode = 1; - x2apic_state = X2APIC_ON; + if (x2apic_hw_locked()) + x2apic_state = X2APIC_ON_LOCKED; + else + x2apic_state = X2APIC_ON; } else if (!boot_cpu_has(X86_FEATURE_X2APIC)) { x2apic_state = X2APIC_DISABLED; } @@ -2551,6 +2590,16 @@ u32 x86_msi_msg_get_destid(struct msi_msg *msg, bool extid) } EXPORT_SYMBOL_GPL(x86_msi_msg_get_destid); +#ifdef CONFIG_X86_64 +void __init acpi_wake_cpu_handler_update(wakeup_cpu_handler handler) +{ + struct apic **drv; + + for (drv = __apicdrivers; drv < __apicdrivers_end; drv++) + (*drv)->wakeup_secondary_cpu_64 = handler; +} +#endif + /* * Override the generic EOI implementation with an optimized version. * Only called during early boot when only one CPU is active and with diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c index c1bb384935b0..a868b76cd3d4 100644 --- a/arch/x86/kernel/apic/io_apic.c +++ b/arch/x86/kernel/apic/io_apic.c @@ -65,6 +65,7 @@ #include <asm/irq_remapping.h> #include <asm/hw_irq.h> #include <asm/apic.h> +#include <asm/pgtable.h> #define for_each_ioapic(idx) \ for ((idx) = 0; (idx) < nr_ioapics; (idx)++) @@ -2677,6 +2678,19 @@ static struct resource * __init ioapic_setup_resources(void) return res; } +static void io_apic_set_fixmap(enum fixed_addresses idx, phys_addr_t phys) +{ + pgprot_t flags = FIXMAP_PAGE_NOCACHE; + + /* + * Ensure fixmaps for IOAPIC MMIO respect memory encryption pgprot + * bits, just like normal ioremap(): + */ + flags = pgprot_decrypted(flags); + + __set_fixmap(idx, phys, flags); +} + void __init io_apic_init_mappings(void) { unsigned long ioapic_phys, idx = FIX_IO_APIC_BASE_0; @@ -2709,7 +2723,7 @@ fake_ioapic_page: __func__, PAGE_SIZE, PAGE_SIZE); ioapic_phys = __pa(ioapic_phys); } - set_fixmap_nocache(idx, ioapic_phys); + io_apic_set_fixmap(idx, ioapic_phys); apic_printk(APIC_VERBOSE, "mapped IOAPIC to %08lx (%08lx)\n", __fix_to_virt(idx) + (ioapic_phys & ~PAGE_MASK), ioapic_phys); @@ -2838,7 +2852,7 @@ int mp_register_ioapic(int id, u32 address, u32 gsi_base, ioapics[idx].mp_config.flags = MPC_APIC_USABLE; ioapics[idx].mp_config.apicaddr = address; - set_fixmap_nocache(FIX_IO_APIC_BASE_0 + idx, address); + io_apic_set_fixmap(FIX_IO_APIC_BASE_0 + idx, address); if (bad_ioapic_register(idx)) { clear_fixmap(FIX_IO_APIC_BASE_0 + idx); return -ENODEV; diff --git a/arch/x86/kernel/apic/ipi.c b/arch/x86/kernel/apic/ipi.c index d1fb874fbe64..2a6509e8c840 100644 --- a/arch/x86/kernel/apic/ipi.c +++ b/arch/x86/kernel/apic/ipi.c @@ -99,7 +99,7 @@ sendmask: static inline int __prepare_ICR2(unsigned int mask) { - return SET_APIC_DEST_FIELD(mask); + return SET_XAPIC_DEST_FIELD(mask); } static inline void __xapic_wait_icr_idle(void) diff --git a/arch/x86/kernel/apic/msi.c b/arch/x86/kernel/apic/msi.c index dbacb9ec8843..7517eb05bdc1 100644 --- a/arch/x86/kernel/apic/msi.c +++ b/arch/x86/kernel/apic/msi.c @@ -19,6 +19,7 @@ #include <asm/hw_irq.h> #include <asm/apic.h> #include <asm/irq_remapping.h> +#include <asm/xen/hypervisor.h> struct irq_domain *x86_pci_msi_default_domain __ro_after_init; @@ -159,11 +160,8 @@ static struct irq_chip pci_msi_controller = { int pci_msi_prepare(struct irq_domain *domain, struct device *dev, int nvec, msi_alloc_info_t *arg) { - struct pci_dev *pdev = to_pci_dev(dev); - struct msi_desc *desc = first_pci_msi_entry(pdev); - init_irq_alloc_info(arg, NULL); - if (desc->msi_attrib.is_msix) { + if (to_pci_dev(dev)->msix_enabled) { arg->type = X86_IRQ_ALLOC_TYPE_PCI_MSIX; } else { arg->type = X86_IRQ_ALLOC_TYPE_PCI_MSI; @@ -345,3 +343,8 @@ void dmar_free_hwirq(int irq) irq_domain_free_irqs(irq, 1); } #endif + +bool arch_restore_msi_irqs(struct pci_dev *dev) +{ + return xen_initdom_restore_msi(dev); +} diff --git a/arch/x86/kernel/apic/vector.c b/arch/x86/kernel/apic/vector.c index c132daabe615..3e6f6b448f6a 100644 --- a/arch/x86/kernel/apic/vector.c +++ b/arch/x86/kernel/apic/vector.c @@ -760,9 +760,9 @@ void __init lapic_update_legacy_vectors(void) void __init lapic_assign_system_vectors(void) { - unsigned int i, vector = 0; + unsigned int i, vector; - for_each_set_bit_from(vector, system_vectors, NR_VECTORS) + for_each_set_bit(vector, system_vectors, NR_VECTORS) irq_matrix_assign_system(vector_matrix, vector, false); if (nr_legacy_irqs() > 1) diff --git a/arch/x86/kernel/apic/x2apic_uv_x.c b/arch/x86/kernel/apic/x2apic_uv_x.c index f5a48e66e4f5..482855227964 100644 --- a/arch/x86/kernel/apic/x2apic_uv_x.c +++ b/arch/x86/kernel/apic/x2apic_uv_x.c @@ -199,7 +199,13 @@ static void __init uv_tsc_check_sync(void) int mmr_shift; char *state; - /* Different returns from different UV BIOS versions */ + /* UV5 guarantees synced TSCs; do not zero TSC_ADJUST */ + if (!is_uv(UV2|UV3|UV4)) { + mark_tsc_async_resets("UV5+"); + return; + } + + /* UV2,3,4, UV BIOS TSC sync state available */ mmr = uv_early_read_mmr(UVH_TSC_SYNC_MMR); mmr_shift = is_uv2_hub() ? UVH_TSC_SYNC_SHIFT_UV2K : UVH_TSC_SYNC_SHIFT; @@ -1340,7 +1346,7 @@ static void __init decode_gam_params(unsigned long ptr) static void __init decode_gam_rng_tbl(unsigned long ptr) { struct uv_gam_range_entry *gre = (struct uv_gam_range_entry *)ptr; - unsigned long lgre = 0; + unsigned long lgre = 0, gend = 0; int index = 0; int sock_min = 999999, pnode_min = 99999; int sock_max = -1, pnode_max = -1; @@ -1374,6 +1380,9 @@ static void __init decode_gam_rng_tbl(unsigned long ptr) flag, size, suffix[order], gre->type, gre->nasid, gre->sockid, gre->pnode); + if (gre->type == UV_GAM_RANGE_TYPE_HOLE) + gend = (unsigned long)gre->limit << UV_GAM_RANGE_SHFT; + /* update to next range start */ lgre = gre->limit; if (sock_min > gre->sockid) @@ -1391,7 +1400,8 @@ static void __init decode_gam_rng_tbl(unsigned long ptr) _max_pnode = pnode_max; _gr_table_len = index; - pr_info("UV: GRT: %d entries, sockets(min:%x,max:%x) pnodes(min:%x,max:%x)\n", index, _min_socket, _max_socket, _min_pnode, _max_pnode); + pr_info("UV: GRT: %d entries, sockets(min:%x,max:%x), pnodes(min:%x,max:%x), gap_end(%d)\n", + index, _min_socket, _max_socket, _min_pnode, _max_pnode, fls64(gend)); } /* Walk through UVsystab decoding the fields */ diff --git a/arch/x86/kernel/apm_32.c b/arch/x86/kernel/apm_32.c index 241dda687eb9..60e330cdbd17 100644 --- a/arch/x86/kernel/apm_32.c +++ b/arch/x86/kernel/apm_32.c @@ -232,6 +232,7 @@ #include <asm/paravirt.h> #include <asm/reboot.h> #include <asm/nospec-branch.h> +#include <asm/ibt.h> #if defined(CONFIG_APM_DISPLAY_BLANK) && defined(CONFIG_VT) extern int (*console_blank_hook)(int); @@ -598,6 +599,7 @@ static long __apm_bios_call(void *_call) struct desc_struct save_desc_40; struct desc_struct *gdt; struct apm_bios_call *call = _call; + u64 ibt; cpu = get_cpu(); BUG_ON(cpu != 0); @@ -607,11 +609,13 @@ static long __apm_bios_call(void *_call) apm_irq_save(flags); firmware_restrict_branch_speculation_start(); + ibt = ibt_save(); APM_DO_SAVE_SEGS; apm_bios_call_asm(call->func, call->ebx, call->ecx, &call->eax, &call->ebx, &call->ecx, &call->edx, &call->esi); APM_DO_RESTORE_SEGS; + ibt_restore(ibt); firmware_restrict_branch_speculation_end(); apm_irq_restore(flags); gdt[0x40 / 8] = save_desc_40; @@ -676,6 +680,7 @@ static long __apm_bios_call_simple(void *_call) struct desc_struct save_desc_40; struct desc_struct *gdt; struct apm_bios_call *call = _call; + u64 ibt; cpu = get_cpu(); BUG_ON(cpu != 0); @@ -685,10 +690,12 @@ static long __apm_bios_call_simple(void *_call) apm_irq_save(flags); firmware_restrict_branch_speculation_start(); + ibt = ibt_save(); APM_DO_SAVE_SEGS; error = apm_bios_call_simple_asm(call->func, call->ebx, call->ecx, &call->eax); APM_DO_RESTORE_SEGS; + ibt_restore(ibt); firmware_restrict_branch_speculation_end(); apm_irq_restore(flags); gdt[0x40 / 8] = save_desc_40; diff --git a/arch/x86/kernel/asm-offsets.c b/arch/x86/kernel/asm-offsets.c index ecd3fd6993d1..437308004ef2 100644 --- a/arch/x86/kernel/asm-offsets.c +++ b/arch/x86/kernel/asm-offsets.c @@ -18,6 +18,7 @@ #include <asm/bootparam.h> #include <asm/suspend.h> #include <asm/tlbflush.h> +#include <asm/tdx.h> #ifdef CONFIG_XEN #include <xen/interface/xen.h> @@ -38,9 +39,6 @@ static void __used common(void) #endif BLANK(); - OFFSET(crypto_tfm_ctx_offset, crypto_tfm, __crt_ctx); - - BLANK(); OFFSET(pbe_address, pbe, address); OFFSET(pbe_orig_address, pbe, orig_address); OFFSET(pbe_next, pbe, next); @@ -69,6 +67,22 @@ static void __used common(void) #endif BLANK(); + OFFSET(TDX_MODULE_rcx, tdx_module_output, rcx); + OFFSET(TDX_MODULE_rdx, tdx_module_output, rdx); + OFFSET(TDX_MODULE_r8, tdx_module_output, r8); + OFFSET(TDX_MODULE_r9, tdx_module_output, r9); + OFFSET(TDX_MODULE_r10, tdx_module_output, r10); + OFFSET(TDX_MODULE_r11, tdx_module_output, r11); + + BLANK(); + OFFSET(TDX_HYPERCALL_r10, tdx_hypercall_args, r10); + OFFSET(TDX_HYPERCALL_r11, tdx_hypercall_args, r11); + OFFSET(TDX_HYPERCALL_r12, tdx_hypercall_args, r12); + OFFSET(TDX_HYPERCALL_r13, tdx_hypercall_args, r13); + OFFSET(TDX_HYPERCALL_r14, tdx_hypercall_args, r14); + OFFSET(TDX_HYPERCALL_r15, tdx_hypercall_args, r15); + + BLANK(); OFFSET(BP_scratch, boot_params, scratch); OFFSET(BP_secure_boot, boot_params, secure_boot); OFFSET(BP_loadflags, boot_params, hdr.loadflags); diff --git a/arch/x86/kernel/asm-offsets_64.c b/arch/x86/kernel/asm-offsets_64.c index b14533af7676..9b698215d261 100644 --- a/arch/x86/kernel/asm-offsets_64.c +++ b/arch/x86/kernel/asm-offsets_64.c @@ -5,7 +5,7 @@ #include <asm/ia32.h> -#if defined(CONFIG_KVM_GUEST) && defined(CONFIG_PARAVIRT_SPINLOCKS) +#if defined(CONFIG_KVM_GUEST) #include <asm/kvm_para.h> #endif @@ -20,7 +20,7 @@ int main(void) BLANK(); #endif -#if defined(CONFIG_KVM_GUEST) && defined(CONFIG_PARAVIRT_SPINLOCKS) +#if defined(CONFIG_KVM_GUEST) OFFSET(KVM_STEAL_TIME_preempted, kvm_steal_time, preempted); BLANK(); #endif diff --git a/arch/x86/kernel/cc_platform.c b/arch/x86/kernel/cc_platform.c deleted file mode 100644 index 03bb2f343ddb..000000000000 --- a/arch/x86/kernel/cc_platform.c +++ /dev/null @@ -1,69 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0-only -/* - * Confidential Computing Platform Capability checks - * - * Copyright (C) 2021 Advanced Micro Devices, Inc. - * - * Author: Tom Lendacky <thomas.lendacky@amd.com> - */ - -#include <linux/export.h> -#include <linux/cc_platform.h> -#include <linux/mem_encrypt.h> - -#include <asm/processor.h> - -static bool __maybe_unused intel_cc_platform_has(enum cc_attr attr) -{ -#ifdef CONFIG_INTEL_TDX_GUEST - return false; -#else - return false; -#endif -} - -/* - * SME and SEV are very similar but they are not the same, so there are - * times that the kernel will need to distinguish between SME and SEV. The - * cc_platform_has() function is used for this. When a distinction isn't - * needed, the CC_ATTR_MEM_ENCRYPT attribute can be used. - * - * The trampoline code is a good example for this requirement. Before - * paging is activated, SME will access all memory as decrypted, but SEV - * will access all memory as encrypted. So, when APs are being brought - * up under SME the trampoline area cannot be encrypted, whereas under SEV - * the trampoline area must be encrypted. - */ -static bool amd_cc_platform_has(enum cc_attr attr) -{ -#ifdef CONFIG_AMD_MEM_ENCRYPT - switch (attr) { - case CC_ATTR_MEM_ENCRYPT: - return sme_me_mask; - - case CC_ATTR_HOST_MEM_ENCRYPT: - return sme_me_mask && !(sev_status & MSR_AMD64_SEV_ENABLED); - - case CC_ATTR_GUEST_MEM_ENCRYPT: - return sev_status & MSR_AMD64_SEV_ENABLED; - - case CC_ATTR_GUEST_STATE_ENCRYPT: - return sev_status & MSR_AMD64_SEV_ES_ENABLED; - - default: - return false; - } -#else - return false; -#endif -} - - -bool cc_platform_has(enum cc_attr attr) -{ - if (sme_me_mask) - return amd_cc_platform_has(attr); - - return false; -} -EXPORT_SYMBOL_GPL(cc_platform_has); diff --git a/arch/x86/kernel/cfi.c b/arch/x86/kernel/cfi.c new file mode 100644 index 000000000000..8674a5c0c031 --- /dev/null +++ b/arch/x86/kernel/cfi.c @@ -0,0 +1,86 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Clang Control Flow Integrity (CFI) support. + * + * Copyright (C) 2022 Google LLC + */ +#include <asm/cfi.h> +#include <asm/insn.h> +#include <asm/insn-eval.h> +#include <linux/string.h> + +/* + * Returns the target address and the expected type when regs->ip points + * to a compiler-generated CFI trap. + */ +static bool decode_cfi_insn(struct pt_regs *regs, unsigned long *target, + u32 *type) +{ + char buffer[MAX_INSN_SIZE]; + struct insn insn; + int offset = 0; + + *target = *type = 0; + + /* + * The compiler generates the following instruction sequence + * for indirect call checks: + * + * Â movl -<id>, %r10d ; 6 bytes + * addl -4(%reg), %r10d ; 4 bytes + * je .Ltmp1 ; 2 bytes + * ud2 ; <- regs->ip + * .Ltmp1: + * + * We can decode the expected type and the target address from the + * movl/addl instructions. + */ + if (copy_from_kernel_nofault(buffer, (void *)regs->ip - 12, MAX_INSN_SIZE)) + return false; + if (insn_decode_kernel(&insn, &buffer[offset])) + return false; + if (insn.opcode.value != 0xBA) + return false; + + *type = -(u32)insn.immediate.value; + + if (copy_from_kernel_nofault(buffer, (void *)regs->ip - 6, MAX_INSN_SIZE)) + return false; + if (insn_decode_kernel(&insn, &buffer[offset])) + return false; + if (insn.opcode.value != 0x3) + return false; + + /* Read the target address from the register. */ + offset = insn_get_modrm_rm_off(&insn, regs); + if (offset < 0) + return false; + + *target = *(unsigned long *)((void *)regs + offset); + + return true; +} + +/* + * Checks if a ud2 trap is because of a CFI failure, and handles the trap + * if needed. Returns a bug_trap_type value similarly to report_bug. + */ +enum bug_trap_type handle_cfi_failure(struct pt_regs *regs) +{ + unsigned long target; + u32 type; + + if (!is_cfi_trap(regs->ip)) + return BUG_TRAP_TYPE_NONE; + + if (!decode_cfi_insn(regs, &target, &type)) + return report_cfi_failure_noaddr(regs, regs->ip); + + return report_cfi_failure(regs, regs->ip, &target, type); +} + +/* + * Ensure that __kcfi_typeid_ symbols are emitted for functions that may + * not be indirectly called with all configurations. + */ +__ADDRESSABLE(__memcpy) diff --git a/arch/x86/kernel/cpu/Makefile b/arch/x86/kernel/cpu/Makefile index 9661e3e802be..f10a921ee756 100644 --- a/arch/x86/kernel/cpu/Makefile +++ b/arch/x86/kernel/cpu/Makefile @@ -12,6 +12,7 @@ endif # If these files are instrumented, boot hangs during the first second. KCOV_INSTRUMENT_common.o := n KCOV_INSTRUMENT_perf_event.o := n +KMSAN_SANITIZE_common.o := n # As above, instrumenting secondary CPU boot code causes boot hangs. KCSAN_SANITIZE_common.o := n diff --git a/arch/x86/kernel/cpu/acrn.c b/arch/x86/kernel/cpu/acrn.c index 23f5f27b5a02..485441b7f030 100644 --- a/arch/x86/kernel/cpu/acrn.c +++ b/arch/x86/kernel/cpu/acrn.c @@ -28,6 +28,9 @@ static void __init acrn_init_platform(void) { /* Setup the IDT for ACRN hypervisor callback */ alloc_intr_gate(HYPERVISOR_CALLBACK_VECTOR, asm_sysvec_acrn_hv_callback); + + x86_platform.calibrate_tsc = acrn_get_tsc_khz; + x86_platform.calibrate_cpu = acrn_get_tsc_khz; } static bool acrn_x2apic_available(void) diff --git a/arch/x86/kernel/cpu/amd.c b/arch/x86/kernel/cpu/amd.c index 4edb6f0f628c..860b60273df3 100644 --- a/arch/x86/kernel/cpu/amd.c +++ b/arch/x86/kernel/cpu/amd.c @@ -394,35 +394,6 @@ static void amd_detect_cmp(struct cpuinfo_x86 *c) per_cpu(cpu_llc_id, cpu) = c->cpu_die_id = c->phys_proc_id; } -static void amd_detect_ppin(struct cpuinfo_x86 *c) -{ - unsigned long long val; - - if (!cpu_has(c, X86_FEATURE_AMD_PPIN)) - return; - - /* When PPIN is defined in CPUID, still need to check PPIN_CTL MSR */ - if (rdmsrl_safe(MSR_AMD_PPIN_CTL, &val)) - goto clear_ppin; - - /* PPIN is locked in disabled mode, clear feature bit */ - if ((val & 3UL) == 1UL) - goto clear_ppin; - - /* If PPIN is disabled, try to enable it */ - if (!(val & 2UL)) { - wrmsrl_safe(MSR_AMD_PPIN_CTL, val | 2UL); - rdmsrl_safe(MSR_AMD_PPIN_CTL, &val); - } - - /* If PPIN_EN bit is 1, return from here; otherwise fall through */ - if (val & 2UL) - return; - -clear_ppin: - clear_cpu_cap(c, X86_FEATURE_AMD_PPIN); -} - u32 amd_get_nodes_per_socket(void) { return nodes_per_socket; @@ -532,7 +503,7 @@ static void bsp_init_amd(struct cpuinfo_x86 *c) va_align.flags = ALIGN_VA_32 | ALIGN_VA_64; /* A random value per boot for bit slice [12:upper_bit) */ - va_align.bits = get_random_int() & va_align.mask; + va_align.bits = get_random_u32() & va_align.mask; } if (cpu_has(c, X86_FEATURE_MWAITX)) @@ -585,6 +556,8 @@ static void early_detect_mem_encrypt(struct cpuinfo_x86 *c) * the SME physical address space reduction value. * If BIOS has not enabled SME then don't advertise the * SME feature (set in scattered.c). + * If the kernel has not enabled SME via any means then + * don't advertise the SME feature. * For SEV: If BIOS has not enabled SEV then don't advertise the * SEV and SEV_ES feature (set in scattered.c). * @@ -607,6 +580,9 @@ static void early_detect_mem_encrypt(struct cpuinfo_x86 *c) if (IS_ENABLED(CONFIG_X86_32)) goto clear_all; + if (!sme_me_mask) + setup_clear_cpu_cap(X86_FEATURE_SME); + rdmsrl(MSR_K7_HWCR, msr); if (!(msr & MSR_K7_HWCR_SMMLOCK)) goto clear_sev; @@ -832,7 +808,7 @@ static void clear_rdrand_cpuid_bit(struct cpuinfo_x86 *c) return; /* - * The nordrand option can clear X86_FEATURE_RDRAND, so check for + * The self-test can clear X86_FEATURE_RDRAND, so check for * RDRAND support using the CPUID function directly. */ if (!(cpuid_ecx(1) & BIT(30)) || rdrand_force) @@ -886,6 +862,28 @@ static void init_amd_bd(struct cpuinfo_x86 *c) clear_rdrand_cpuid_bit(c); } +void init_spectral_chicken(struct cpuinfo_x86 *c) +{ +#ifdef CONFIG_CPU_UNRET_ENTRY + u64 value; + + /* + * On Zen2 we offer this chicken (bit) on the altar of Speculation. + * + * This suppresses speculation from the middle of a basic block, i.e. it + * suppresses non-branch predictions. + * + * We use STIBP as a heuristic to filter out Zen2 from the rest of F17H + */ + if (!cpu_has(c, X86_FEATURE_HYPERVISOR) && cpu_has(c, X86_FEATURE_AMD_STIBP)) { + if (!rdmsrl_safe(MSR_ZEN2_SPECTRAL_CHICKEN, &value)) { + value |= MSR_ZEN2_SPECTRAL_CHICKEN_BIT; + wrmsrl_safe(MSR_ZEN2_SPECTRAL_CHICKEN, value); + } + } +#endif +} + static void init_amd_zn(struct cpuinfo_x86 *c) { set_cpu_cap(c, X86_FEATURE_ZEN); @@ -894,12 +892,21 @@ static void init_amd_zn(struct cpuinfo_x86 *c) node_reclaim_distance = 32; #endif - /* - * Fix erratum 1076: CPB feature bit not being set in CPUID. - * Always set it, except when running under a hypervisor. - */ - if (!cpu_has(c, X86_FEATURE_HYPERVISOR) && !cpu_has(c, X86_FEATURE_CPB)) - set_cpu_cap(c, X86_FEATURE_CPB); + /* Fix up CPUID bits, but only if not virtualised. */ + if (!cpu_has(c, X86_FEATURE_HYPERVISOR)) { + + /* Erratum 1076: CPB feature bit not being set in CPUID. */ + if (!cpu_has(c, X86_FEATURE_CPB)) + set_cpu_cap(c, X86_FEATURE_CPB); + + /* + * Zen3 (Fam19 model < 0x10) parts are not susceptible to + * Branch Type Confusion, but predate the allocation of the + * BTC_NO bit. + */ + if (c->x86 == 0x19 && !cpu_has(c, X86_FEATURE_BTC_NO)) + set_cpu_cap(c, X86_FEATURE_BTC_NO); + } } static void init_amd(struct cpuinfo_x86 *c) @@ -931,7 +938,8 @@ static void init_amd(struct cpuinfo_x86 *c) case 0x12: init_amd_ln(c); break; case 0x15: init_amd_bd(c); break; case 0x16: init_amd_jg(c); break; - case 0x17: fallthrough; + case 0x17: init_spectral_chicken(c); + fallthrough; case 0x19: init_amd_zn(c); break; } @@ -947,7 +955,6 @@ static void init_amd(struct cpuinfo_x86 *c) amd_detect_cmp(c); amd_get_topology(c); srat_detect_node(c); - amd_detect_ppin(c); init_amd_cacheinfo(c); diff --git a/arch/x86/kernel/cpu/aperfmperf.c b/arch/x86/kernel/cpu/aperfmperf.c index 22911deacb6e..1f60a2b27936 100644 --- a/arch/x86/kernel/cpu/aperfmperf.c +++ b/arch/x86/kernel/cpu/aperfmperf.c @@ -6,146 +6,446 @@ * Copyright (C) 2017 Intel Corp. * Author: Len Brown <len.brown@intel.com> */ - +#include <linux/cpufreq.h> #include <linux/delay.h> #include <linux/ktime.h> #include <linux/math64.h> #include <linux/percpu.h> -#include <linux/cpufreq.h> -#include <linux/smp.h> -#include <linux/sched/isolation.h> #include <linux/rcupdate.h> +#include <linux/sched/isolation.h> +#include <linux/sched/topology.h> +#include <linux/smp.h> +#include <linux/syscore_ops.h> + +#include <asm/cpu.h> +#include <asm/cpu_device_id.h> +#include <asm/intel-family.h> #include "cpu.h" -struct aperfmperf_sample { - unsigned int khz; - atomic_t scfpending; - ktime_t time; - u64 aperf; - u64 mperf; +struct aperfmperf { + seqcount_t seq; + unsigned long last_update; + u64 acnt; + u64 mcnt; + u64 aperf; + u64 mperf; }; -static DEFINE_PER_CPU(struct aperfmperf_sample, samples); +static DEFINE_PER_CPU_SHARED_ALIGNED(struct aperfmperf, cpu_samples) = { + .seq = SEQCNT_ZERO(cpu_samples.seq) +}; -#define APERFMPERF_CACHE_THRESHOLD_MS 10 -#define APERFMPERF_REFRESH_DELAY_MS 10 -#define APERFMPERF_STALE_THRESHOLD_MS 1000 +static void init_counter_refs(void) +{ + u64 aperf, mperf; + + rdmsrl(MSR_IA32_APERF, aperf); + rdmsrl(MSR_IA32_MPERF, mperf); + this_cpu_write(cpu_samples.aperf, aperf); + this_cpu_write(cpu_samples.mperf, mperf); +} + +#if defined(CONFIG_X86_64) && defined(CONFIG_SMP) /* - * aperfmperf_snapshot_khz() - * On the current CPU, snapshot APERF, MPERF, and jiffies - * unless we already did it within 10ms - * calculate kHz, save snapshot + * APERF/MPERF frequency ratio computation. + * + * The scheduler wants to do frequency invariant accounting and needs a <1 + * ratio to account for the 'current' frequency, corresponding to + * freq_curr / freq_max. + * + * Since the frequency freq_curr on x86 is controlled by micro-controller and + * our P-state setting is little more than a request/hint, we need to observe + * the effective frequency 'BusyMHz', i.e. the average frequency over a time + * interval after discarding idle time. This is given by: + * + * BusyMHz = delta_APERF / delta_MPERF * freq_base + * + * where freq_base is the max non-turbo P-state. + * + * The freq_max term has to be set to a somewhat arbitrary value, because we + * can't know which turbo states will be available at a given point in time: + * it all depends on the thermal headroom of the entire package. We set it to + * the turbo level with 4 cores active. + * + * Benchmarks show that's a good compromise between the 1C turbo ratio + * (freq_curr/freq_max would rarely reach 1) and something close to freq_base, + * which would ignore the entire turbo range (a conspicuous part, making + * freq_curr/freq_max always maxed out). + * + * An exception to the heuristic above is the Atom uarch, where we choose the + * highest turbo level for freq_max since Atom's are generally oriented towards + * power efficiency. + * + * Setting freq_max to anything less than the 1C turbo ratio makes the ratio + * freq_curr / freq_max to eventually grow >1, in which case we clip it to 1. */ -static void aperfmperf_snapshot_khz(void *dummy) + +DEFINE_STATIC_KEY_FALSE(arch_scale_freq_key); + +static u64 arch_turbo_freq_ratio = SCHED_CAPACITY_SCALE; +static u64 arch_max_freq_ratio = SCHED_CAPACITY_SCALE; + +void arch_set_max_freq_ratio(bool turbo_disabled) { - u64 aperf, aperf_delta; - u64 mperf, mperf_delta; - struct aperfmperf_sample *s = this_cpu_ptr(&samples); - unsigned long flags; + arch_max_freq_ratio = turbo_disabled ? SCHED_CAPACITY_SCALE : + arch_turbo_freq_ratio; +} +EXPORT_SYMBOL_GPL(arch_set_max_freq_ratio); - local_irq_save(flags); - rdmsrl(MSR_IA32_APERF, aperf); - rdmsrl(MSR_IA32_MPERF, mperf); - local_irq_restore(flags); +static bool __init turbo_disabled(void) +{ + u64 misc_en; + int err; + + err = rdmsrl_safe(MSR_IA32_MISC_ENABLE, &misc_en); + if (err) + return false; + + return (misc_en & MSR_IA32_MISC_ENABLE_TURBO_DISABLE); +} + +static bool __init slv_set_max_freq_ratio(u64 *base_freq, u64 *turbo_freq) +{ + int err; + + err = rdmsrl_safe(MSR_ATOM_CORE_RATIOS, base_freq); + if (err) + return false; + + err = rdmsrl_safe(MSR_ATOM_CORE_TURBO_RATIOS, turbo_freq); + if (err) + return false; + + *base_freq = (*base_freq >> 16) & 0x3F; /* max P state */ + *turbo_freq = *turbo_freq & 0x3F; /* 1C turbo */ + + return true; +} + +#define X86_MATCH(model) \ + X86_MATCH_VENDOR_FAM_MODEL_FEATURE(INTEL, 6, \ + INTEL_FAM6_##model, X86_FEATURE_APERFMPERF, NULL) + +static const struct x86_cpu_id has_knl_turbo_ratio_limits[] __initconst = { + X86_MATCH(XEON_PHI_KNL), + X86_MATCH(XEON_PHI_KNM), + {} +}; + +static const struct x86_cpu_id has_skx_turbo_ratio_limits[] __initconst = { + X86_MATCH(SKYLAKE_X), + {} +}; + +static const struct x86_cpu_id has_glm_turbo_ratio_limits[] __initconst = { + X86_MATCH(ATOM_GOLDMONT), + X86_MATCH(ATOM_GOLDMONT_D), + X86_MATCH(ATOM_GOLDMONT_PLUS), + {} +}; + +static bool __init knl_set_max_freq_ratio(u64 *base_freq, u64 *turbo_freq, + int num_delta_fratio) +{ + int fratio, delta_fratio, found; + int err, i; + u64 msr; + + err = rdmsrl_safe(MSR_PLATFORM_INFO, base_freq); + if (err) + return false; + + *base_freq = (*base_freq >> 8) & 0xFF; /* max P state */ + + err = rdmsrl_safe(MSR_TURBO_RATIO_LIMIT, &msr); + if (err) + return false; + + fratio = (msr >> 8) & 0xFF; + i = 16; + found = 0; + do { + if (found >= num_delta_fratio) { + *turbo_freq = fratio; + return true; + } + + delta_fratio = (msr >> (i + 5)) & 0x7; + + if (delta_fratio) { + found += 1; + fratio -= delta_fratio; + } + + i += 8; + } while (i < 64); + + return true; +} + +static bool __init skx_set_max_freq_ratio(u64 *base_freq, u64 *turbo_freq, int size) +{ + u64 ratios, counts; + u32 group_size; + int err, i; + + err = rdmsrl_safe(MSR_PLATFORM_INFO, base_freq); + if (err) + return false; + + *base_freq = (*base_freq >> 8) & 0xFF; /* max P state */ + + err = rdmsrl_safe(MSR_TURBO_RATIO_LIMIT, &ratios); + if (err) + return false; + + err = rdmsrl_safe(MSR_TURBO_RATIO_LIMIT1, &counts); + if (err) + return false; + + for (i = 0; i < 64; i += 8) { + group_size = (counts >> i) & 0xFF; + if (group_size >= size) { + *turbo_freq = (ratios >> i) & 0xFF; + return true; + } + } + + return false; +} - aperf_delta = aperf - s->aperf; - mperf_delta = mperf - s->mperf; +static bool __init core_set_max_freq_ratio(u64 *base_freq, u64 *turbo_freq) +{ + u64 msr; + int err; + + err = rdmsrl_safe(MSR_PLATFORM_INFO, base_freq); + if (err) + return false; + + err = rdmsrl_safe(MSR_TURBO_RATIO_LIMIT, &msr); + if (err) + return false; + + *base_freq = (*base_freq >> 8) & 0xFF; /* max P state */ + *turbo_freq = (msr >> 24) & 0xFF; /* 4C turbo */ + + /* The CPU may have less than 4 cores */ + if (!*turbo_freq) + *turbo_freq = msr & 0xFF; /* 1C turbo */ + + return true; +} + +static bool __init intel_set_max_freq_ratio(void) +{ + u64 base_freq, turbo_freq; + u64 turbo_ratio; + if (slv_set_max_freq_ratio(&base_freq, &turbo_freq)) + goto out; + + if (x86_match_cpu(has_glm_turbo_ratio_limits) && + skx_set_max_freq_ratio(&base_freq, &turbo_freq, 1)) + goto out; + + if (x86_match_cpu(has_knl_turbo_ratio_limits) && + knl_set_max_freq_ratio(&base_freq, &turbo_freq, 1)) + goto out; + + if (x86_match_cpu(has_skx_turbo_ratio_limits) && + skx_set_max_freq_ratio(&base_freq, &turbo_freq, 4)) + goto out; + + if (core_set_max_freq_ratio(&base_freq, &turbo_freq)) + goto out; + + return false; + +out: /* - * There is no architectural guarantee that MPERF - * increments faster than we can read it. + * Some hypervisors advertise X86_FEATURE_APERFMPERF + * but then fill all MSR's with zeroes. + * Some CPUs have turbo boost but don't declare any turbo ratio + * in MSR_TURBO_RATIO_LIMIT. */ - if (mperf_delta == 0) - return; + if (!base_freq || !turbo_freq) { + pr_debug("Couldn't determine cpu base or turbo frequency, necessary for scale-invariant accounting.\n"); + return false; + } - s->time = ktime_get(); - s->aperf = aperf; - s->mperf = mperf; - s->khz = div64_u64((cpu_khz * aperf_delta), mperf_delta); - atomic_set_release(&s->scfpending, 0); + turbo_ratio = div_u64(turbo_freq * SCHED_CAPACITY_SCALE, base_freq); + if (!turbo_ratio) { + pr_debug("Non-zero turbo and base frequencies led to a 0 ratio.\n"); + return false; + } + + arch_turbo_freq_ratio = turbo_ratio; + arch_set_max_freq_ratio(turbo_disabled()); + + return true; } -static bool aperfmperf_snapshot_cpu(int cpu, ktime_t now, bool wait) +#ifdef CONFIG_PM_SLEEP +static struct syscore_ops freq_invariance_syscore_ops = { + .resume = init_counter_refs, +}; + +static void register_freq_invariance_syscore_ops(void) { - s64 time_delta = ktime_ms_delta(now, per_cpu(samples.time, cpu)); - struct aperfmperf_sample *s = per_cpu_ptr(&samples, cpu); + register_syscore_ops(&freq_invariance_syscore_ops); +} +#else +static inline void register_freq_invariance_syscore_ops(void) {} +#endif - /* Don't bother re-computing within the cache threshold time. */ - if (time_delta < APERFMPERF_CACHE_THRESHOLD_MS) - return true; +static void freq_invariance_enable(void) +{ + if (static_branch_unlikely(&arch_scale_freq_key)) { + WARN_ON_ONCE(1); + return; + } + static_branch_enable(&arch_scale_freq_key); + register_freq_invariance_syscore_ops(); + pr_info("Estimated ratio of average max frequency by base frequency (times 1024): %llu\n", arch_max_freq_ratio); +} + +void freq_invariance_set_perf_ratio(u64 ratio, bool turbo_disabled) +{ + arch_turbo_freq_ratio = ratio; + arch_set_max_freq_ratio(turbo_disabled); + freq_invariance_enable(); +} + +static void __init bp_init_freq_invariance(void) +{ + if (boot_cpu_data.x86_vendor != X86_VENDOR_INTEL) + return; - if (!atomic_xchg(&s->scfpending, 1) || wait) - smp_call_function_single(cpu, aperfmperf_snapshot_khz, NULL, wait); + if (intel_set_max_freq_ratio()) + freq_invariance_enable(); +} - /* Return false if the previous iteration was too long ago. */ - return time_delta <= APERFMPERF_STALE_THRESHOLD_MS; +static void disable_freq_invariance_workfn(struct work_struct *work) +{ + static_branch_disable(&arch_scale_freq_key); } -unsigned int aperfmperf_get_khz(int cpu) +static DECLARE_WORK(disable_freq_invariance_work, + disable_freq_invariance_workfn); + +DEFINE_PER_CPU(unsigned long, arch_freq_scale) = SCHED_CAPACITY_SCALE; + +static void scale_freq_tick(u64 acnt, u64 mcnt) { - if (!cpu_khz) - return 0; + u64 freq_scale; - if (!boot_cpu_has(X86_FEATURE_APERFMPERF)) - return 0; + if (!arch_scale_freq_invariant()) + return; - if (!housekeeping_cpu(cpu, HK_FLAG_MISC)) - return 0; + if (check_shl_overflow(acnt, 2*SCHED_CAPACITY_SHIFT, &acnt)) + goto error; - if (rcu_is_idle_cpu(cpu)) - return 0; /* Idle CPUs are completely uninteresting. */ + if (check_mul_overflow(mcnt, arch_max_freq_ratio, &mcnt) || !mcnt) + goto error; - aperfmperf_snapshot_cpu(cpu, ktime_get(), true); - return per_cpu(samples.khz, cpu); + freq_scale = div64_u64(acnt, mcnt); + if (!freq_scale) + goto error; + + if (freq_scale > SCHED_CAPACITY_SCALE) + freq_scale = SCHED_CAPACITY_SCALE; + + this_cpu_write(arch_freq_scale, freq_scale); + return; + +error: + pr_warn("Scheduler frequency invariance went wobbly, disabling!\n"); + schedule_work(&disable_freq_invariance_work); } +#else +static inline void bp_init_freq_invariance(void) { } +static inline void scale_freq_tick(u64 acnt, u64 mcnt) { } +#endif /* CONFIG_X86_64 && CONFIG_SMP */ -void arch_freq_prepare_all(void) +void arch_scale_freq_tick(void) { - ktime_t now = ktime_get(); - bool wait = false; - int cpu; + struct aperfmperf *s = this_cpu_ptr(&cpu_samples); + u64 acnt, mcnt, aperf, mperf; - if (!cpu_khz) + if (!cpu_feature_enabled(X86_FEATURE_APERFMPERF)) return; - if (!boot_cpu_has(X86_FEATURE_APERFMPERF)) - return; + rdmsrl(MSR_IA32_APERF, aperf); + rdmsrl(MSR_IA32_MPERF, mperf); + acnt = aperf - s->aperf; + mcnt = mperf - s->mperf; - for_each_online_cpu(cpu) { - if (!housekeeping_cpu(cpu, HK_FLAG_MISC)) - continue; - if (rcu_is_idle_cpu(cpu)) - continue; /* Idle CPUs are completely uninteresting. */ - if (!aperfmperf_snapshot_cpu(cpu, now, false)) - wait = true; - } + s->aperf = aperf; + s->mperf = mperf; + + raw_write_seqcount_begin(&s->seq); + s->last_update = jiffies; + s->acnt = acnt; + s->mcnt = mcnt; + raw_write_seqcount_end(&s->seq); - if (wait) - msleep(APERFMPERF_REFRESH_DELAY_MS); + scale_freq_tick(acnt, mcnt); } +/* + * Discard samples older than the define maximum sample age of 20ms. There + * is no point in sending IPIs in such a case. If the scheduler tick was + * not running then the CPU is either idle or isolated. + */ +#define MAX_SAMPLE_AGE ((unsigned long)HZ / 50) + unsigned int arch_freq_get_on_cpu(int cpu) { - struct aperfmperf_sample *s = per_cpu_ptr(&samples, cpu); + struct aperfmperf *s = per_cpu_ptr(&cpu_samples, cpu); + unsigned int seq, freq; + unsigned long last; + u64 acnt, mcnt; - if (!cpu_khz) - return 0; + if (!cpu_feature_enabled(X86_FEATURE_APERFMPERF)) + goto fallback; - if (!boot_cpu_has(X86_FEATURE_APERFMPERF)) - return 0; + do { + seq = raw_read_seqcount_begin(&s->seq); + last = s->last_update; + acnt = s->acnt; + mcnt = s->mcnt; + } while (read_seqcount_retry(&s->seq, seq)); - if (!housekeeping_cpu(cpu, HK_FLAG_MISC)) - return 0; + /* + * Bail on invalid count and when the last update was too long ago, + * which covers idle and NOHZ full CPUs. + */ + if (!mcnt || (jiffies - last) > MAX_SAMPLE_AGE) + goto fallback; + + return div64_u64((cpu_khz * acnt), mcnt); + +fallback: + freq = cpufreq_quick_get(cpu); + return freq ? freq : cpu_khz; +} - if (aperfmperf_snapshot_cpu(cpu, ktime_get(), true)) - return per_cpu(samples.khz, cpu); +static int __init bp_init_aperfmperf(void) +{ + if (!cpu_feature_enabled(X86_FEATURE_APERFMPERF)) + return 0; - msleep(APERFMPERF_REFRESH_DELAY_MS); - atomic_set(&s->scfpending, 1); - smp_mb(); /* ->scfpending before smp_call_function_single(). */ - smp_call_function_single(cpu, aperfmperf_snapshot_khz, NULL, 1); + init_counter_refs(); + bp_init_freq_invariance(); + return 0; +} +early_initcall(bp_init_aperfmperf); - return per_cpu(samples.khz, cpu); +void ap_init_aperfmperf(void) +{ + if (cpu_feature_enabled(X86_FEATURE_APERFMPERF)) + init_counter_refs(); } diff --git a/arch/x86/kernel/cpu/bugs.c b/arch/x86/kernel/cpu/bugs.c index 1c1f218a701d..3e3230cccaa7 100644 --- a/arch/x86/kernel/cpu/bugs.c +++ b/arch/x86/kernel/cpu/bugs.c @@ -16,6 +16,7 @@ #include <linux/prctl.h> #include <linux/sched/smt.h> #include <linux/pgtable.h> +#include <linux/bpf.h> #include <asm/spec-ctrl.h> #include <asm/cmdline.h> @@ -37,24 +38,52 @@ static void __init spectre_v1_select_mitigation(void); static void __init spectre_v2_select_mitigation(void); +static void __init retbleed_select_mitigation(void); +static void __init spectre_v2_user_select_mitigation(void); static void __init ssb_select_mitigation(void); static void __init l1tf_select_mitigation(void); static void __init mds_select_mitigation(void); -static void __init mds_print_mitigation(void); +static void __init md_clear_update_mitigation(void); +static void __init md_clear_select_mitigation(void); static void __init taa_select_mitigation(void); +static void __init mmio_select_mitigation(void); static void __init srbds_select_mitigation(void); static void __init l1d_flush_select_mitigation(void); -/* The base value of the SPEC_CTRL MSR that always has to be preserved. */ +/* The base value of the SPEC_CTRL MSR without task-specific bits set */ u64 x86_spec_ctrl_base; EXPORT_SYMBOL_GPL(x86_spec_ctrl_base); + +/* The current value of the SPEC_CTRL MSR with task-specific bits set */ +DEFINE_PER_CPU(u64, x86_spec_ctrl_current); +EXPORT_SYMBOL_GPL(x86_spec_ctrl_current); + static DEFINE_MUTEX(spec_ctrl_mutex); /* - * The vendor and possibly platform specific bits which can be modified in - * x86_spec_ctrl_base. + * Keep track of the SPEC_CTRL MSR value for the current task, which may differ + * from x86_spec_ctrl_base due to STIBP/SSB in __speculation_ctrl_update(). */ -static u64 __ro_after_init x86_spec_ctrl_mask = SPEC_CTRL_IBRS; +void write_spec_ctrl_current(u64 val, bool force) +{ + if (this_cpu_read(x86_spec_ctrl_current) == val) + return; + + this_cpu_write(x86_spec_ctrl_current, val); + + /* + * When KERNEL_IBRS this MSR is written on return-to-user, unless + * forced the update can be delayed until that time. + */ + if (force || !cpu_feature_enabled(X86_FEATURE_KERNEL_IBRS)) + wrmsrl(MSR_IA32_SPEC_CTRL, val); +} + +u64 spec_ctrl_current(void) +{ + return this_cpu_read(x86_spec_ctrl_current); +} +EXPORT_SYMBOL_GPL(spec_ctrl_current); /* * AMD specific MSR info for Speculative Store Bypass control. @@ -84,6 +113,10 @@ EXPORT_SYMBOL_GPL(mds_idle_clear); */ DEFINE_STATIC_KEY_FALSE(switch_mm_cond_l1d_flush); +/* Controls CPU Fill buffer clear before KVM guest MMIO accesses */ +DEFINE_STATIC_KEY_FALSE(mmio_stale_data_clear); +EXPORT_SYMBOL_GPL(mmio_stale_data_clear); + void __init check_bugs(void) { identify_boot_cpu(); @@ -107,26 +140,27 @@ void __init check_bugs(void) if (boot_cpu_has(X86_FEATURE_MSR_SPEC_CTRL)) rdmsrl(MSR_IA32_SPEC_CTRL, x86_spec_ctrl_base); - /* Allow STIBP in MSR_SPEC_CTRL if supported */ - if (boot_cpu_has(X86_FEATURE_STIBP)) - x86_spec_ctrl_mask |= SPEC_CTRL_STIBP; - /* Select the proper CPU mitigations before patching alternatives: */ spectre_v1_select_mitigation(); spectre_v2_select_mitigation(); + /* + * retbleed_select_mitigation() relies on the state set by + * spectre_v2_select_mitigation(); specifically it wants to know about + * spectre_v2=ibrs. + */ + retbleed_select_mitigation(); + /* + * spectre_v2_user_select_mitigation() relies on the state set by + * retbleed_select_mitigation(); specifically the STIBP selection is + * forced for UNRET or IBPB. + */ + spectre_v2_user_select_mitigation(); ssb_select_mitigation(); l1tf_select_mitigation(); - mds_select_mitigation(); - taa_select_mitigation(); + md_clear_select_mitigation(); srbds_select_mitigation(); l1d_flush_select_mitigation(); - /* - * As MDS and TAA mitigations are inter-related, print MDS - * mitigation until after TAA mitigation selection is done. - */ - mds_print_mitigation(); - arch_smt_update(); #ifdef CONFIG_X86_32 @@ -161,37 +195,16 @@ void __init check_bugs(void) #endif } +/* + * NOTE: This function is *only* called for SVM, since Intel uses + * MSR_IA32_SPEC_CTRL for SSBD. + */ void -x86_virt_spec_ctrl(u64 guest_spec_ctrl, u64 guest_virt_spec_ctrl, bool setguest) +x86_virt_spec_ctrl(u64 guest_virt_spec_ctrl, bool setguest) { - u64 msrval, guestval, hostval = x86_spec_ctrl_base; + u64 guestval, hostval; struct thread_info *ti = current_thread_info(); - /* Is MSR_SPEC_CTRL implemented ? */ - if (static_cpu_has(X86_FEATURE_MSR_SPEC_CTRL)) { - /* - * Restrict guest_spec_ctrl to supported values. Clear the - * modifiable bits in the host base value and or the - * modifiable bits from the guest value. - */ - guestval = hostval & ~x86_spec_ctrl_mask; - guestval |= guest_spec_ctrl & x86_spec_ctrl_mask; - - /* SSBD controlled in MSR_SPEC_CTRL */ - if (static_cpu_has(X86_FEATURE_SPEC_CTRL_SSBD) || - static_cpu_has(X86_FEATURE_AMD_SSBD)) - hostval |= ssbd_tif_to_spec_ctrl(ti->flags); - - /* Conditional STIBP enabled? */ - if (static_branch_unlikely(&switch_to_cond_stibp)) - hostval |= stibp_tif_to_spec_ctrl(ti->flags); - - if (hostval != guestval) { - msrval = setguest ? guestval : hostval; - wrmsrl(MSR_IA32_SPEC_CTRL, msrval); - } - } - /* * If SSBD is not handled in MSR_SPEC_CTRL on AMD, update * MSR_AMD64_L2_CFG or MSR_VIRT_SPEC_CTRL if supported. @@ -266,14 +279,6 @@ static void __init mds_select_mitigation(void) } } -static void __init mds_print_mitigation(void) -{ - if (!boot_cpu_has_bug(X86_BUG_MDS) || cpu_mitigations_off()) - return; - - pr_info("%s\n", mds_strings[mds_mitigation]); -} - static int __init mds_cmdline(char *str) { if (!boot_cpu_has_bug(X86_BUG_MDS)) @@ -328,7 +333,7 @@ static void __init taa_select_mitigation(void) /* TSX previously disabled by tsx=off */ if (!boot_cpu_has(X86_FEATURE_RTM)) { taa_mitigation = TAA_MITIGATION_TSX_DISABLED; - goto out; + return; } if (cpu_mitigations_off()) { @@ -342,7 +347,7 @@ static void __init taa_select_mitigation(void) */ if (taa_mitigation == TAA_MITIGATION_OFF && mds_mitigation == MDS_MITIGATION_OFF) - goto out; + return; if (boot_cpu_has(X86_FEATURE_MD_CLEAR)) taa_mitigation = TAA_MITIGATION_VERW; @@ -374,18 +379,6 @@ static void __init taa_select_mitigation(void) if (taa_nosmt || cpu_mitigations_auto_nosmt()) cpu_smt_disable(false); - - /* - * Update MDS mitigation, if necessary, as the mds_user_clear is - * now enabled for TAA mitigation. - */ - if (mds_mitigation == MDS_MITIGATION_OFF && - boot_cpu_has_bug(X86_BUG_MDS)) { - mds_mitigation = MDS_MITIGATION_FULL; - mds_select_mitigation(); - } -out: - pr_info("%s\n", taa_strings[taa_mitigation]); } static int __init tsx_async_abort_parse_cmdline(char *str) @@ -410,6 +403,154 @@ static int __init tsx_async_abort_parse_cmdline(char *str) early_param("tsx_async_abort", tsx_async_abort_parse_cmdline); #undef pr_fmt +#define pr_fmt(fmt) "MMIO Stale Data: " fmt + +enum mmio_mitigations { + MMIO_MITIGATION_OFF, + MMIO_MITIGATION_UCODE_NEEDED, + MMIO_MITIGATION_VERW, +}; + +/* Default mitigation for Processor MMIO Stale Data vulnerabilities */ +static enum mmio_mitigations mmio_mitigation __ro_after_init = MMIO_MITIGATION_VERW; +static bool mmio_nosmt __ro_after_init = false; + +static const char * const mmio_strings[] = { + [MMIO_MITIGATION_OFF] = "Vulnerable", + [MMIO_MITIGATION_UCODE_NEEDED] = "Vulnerable: Clear CPU buffers attempted, no microcode", + [MMIO_MITIGATION_VERW] = "Mitigation: Clear CPU buffers", +}; + +static void __init mmio_select_mitigation(void) +{ + u64 ia32_cap; + + if (!boot_cpu_has_bug(X86_BUG_MMIO_STALE_DATA) || + boot_cpu_has_bug(X86_BUG_MMIO_UNKNOWN) || + cpu_mitigations_off()) { + mmio_mitigation = MMIO_MITIGATION_OFF; + return; + } + + if (mmio_mitigation == MMIO_MITIGATION_OFF) + return; + + ia32_cap = x86_read_arch_cap_msr(); + + /* + * Enable CPU buffer clear mitigation for host and VMM, if also affected + * by MDS or TAA. Otherwise, enable mitigation for VMM only. + */ + if (boot_cpu_has_bug(X86_BUG_MDS) || (boot_cpu_has_bug(X86_BUG_TAA) && + boot_cpu_has(X86_FEATURE_RTM))) + static_branch_enable(&mds_user_clear); + else + static_branch_enable(&mmio_stale_data_clear); + + /* + * If Processor-MMIO-Stale-Data bug is present and Fill Buffer data can + * be propagated to uncore buffers, clearing the Fill buffers on idle + * is required irrespective of SMT state. + */ + if (!(ia32_cap & ARCH_CAP_FBSDP_NO)) + static_branch_enable(&mds_idle_clear); + + /* + * Check if the system has the right microcode. + * + * CPU Fill buffer clear mitigation is enumerated by either an explicit + * FB_CLEAR or by the presence of both MD_CLEAR and L1D_FLUSH on MDS + * affected systems. + */ + if ((ia32_cap & ARCH_CAP_FB_CLEAR) || + (boot_cpu_has(X86_FEATURE_MD_CLEAR) && + boot_cpu_has(X86_FEATURE_FLUSH_L1D) && + !(ia32_cap & ARCH_CAP_MDS_NO))) + mmio_mitigation = MMIO_MITIGATION_VERW; + else + mmio_mitigation = MMIO_MITIGATION_UCODE_NEEDED; + + if (mmio_nosmt || cpu_mitigations_auto_nosmt()) + cpu_smt_disable(false); +} + +static int __init mmio_stale_data_parse_cmdline(char *str) +{ + if (!boot_cpu_has_bug(X86_BUG_MMIO_STALE_DATA)) + return 0; + + if (!str) + return -EINVAL; + + if (!strcmp(str, "off")) { + mmio_mitigation = MMIO_MITIGATION_OFF; + } else if (!strcmp(str, "full")) { + mmio_mitigation = MMIO_MITIGATION_VERW; + } else if (!strcmp(str, "full,nosmt")) { + mmio_mitigation = MMIO_MITIGATION_VERW; + mmio_nosmt = true; + } + + return 0; +} +early_param("mmio_stale_data", mmio_stale_data_parse_cmdline); + +#undef pr_fmt +#define pr_fmt(fmt) "" fmt + +static void __init md_clear_update_mitigation(void) +{ + if (cpu_mitigations_off()) + return; + + if (!static_key_enabled(&mds_user_clear)) + goto out; + + /* + * mds_user_clear is now enabled. Update MDS, TAA and MMIO Stale Data + * mitigation, if necessary. + */ + if (mds_mitigation == MDS_MITIGATION_OFF && + boot_cpu_has_bug(X86_BUG_MDS)) { + mds_mitigation = MDS_MITIGATION_FULL; + mds_select_mitigation(); + } + if (taa_mitigation == TAA_MITIGATION_OFF && + boot_cpu_has_bug(X86_BUG_TAA)) { + taa_mitigation = TAA_MITIGATION_VERW; + taa_select_mitigation(); + } + if (mmio_mitigation == MMIO_MITIGATION_OFF && + boot_cpu_has_bug(X86_BUG_MMIO_STALE_DATA)) { + mmio_mitigation = MMIO_MITIGATION_VERW; + mmio_select_mitigation(); + } +out: + if (boot_cpu_has_bug(X86_BUG_MDS)) + pr_info("MDS: %s\n", mds_strings[mds_mitigation]); + if (boot_cpu_has_bug(X86_BUG_TAA)) + pr_info("TAA: %s\n", taa_strings[taa_mitigation]); + if (boot_cpu_has_bug(X86_BUG_MMIO_STALE_DATA)) + pr_info("MMIO Stale Data: %s\n", mmio_strings[mmio_mitigation]); + else if (boot_cpu_has_bug(X86_BUG_MMIO_UNKNOWN)) + pr_info("MMIO Stale Data: Unknown: No mitigations\n"); +} + +static void __init md_clear_select_mitigation(void) +{ + mds_select_mitigation(); + taa_select_mitigation(); + mmio_select_mitigation(); + + /* + * As MDS, TAA and MMIO Stale Data mitigations are inter-related, update + * and print their mitigation after MDS, TAA and MMIO Stale Data + * mitigation selection is done. + */ + md_clear_update_mitigation(); +} + +#undef pr_fmt #define pr_fmt(fmt) "SRBDS: " fmt enum srbds_mitigations { @@ -445,6 +586,13 @@ void update_srbds_msr(void) if (srbds_mitigation == SRBDS_MITIGATION_UCODE_NEEDED) return; + /* + * A MDS_NO CPU for which SRBDS mitigation is not needed due to TSX + * being disabled and it hasn't received the SRBDS MSR microcode. + */ + if (!boot_cpu_has(X86_FEATURE_SRBDS_CTRL)) + return; + rdmsrl(MSR_IA32_MCU_OPT_CTRL, mcu_ctrl); switch (srbds_mitigation) { @@ -470,11 +618,13 @@ static void __init srbds_select_mitigation(void) return; /* - * Check to see if this is one of the MDS_NO systems supporting - * TSX that are only exposed to SRBDS when TSX is enabled. + * Check to see if this is one of the MDS_NO systems supporting TSX that + * are only exposed to SRBDS when TSX is enabled or when CPU is affected + * by Processor MMIO Stale Data vulnerability. */ ia32_cap = x86_read_arch_cap_msr(); - if ((ia32_cap & ARCH_CAP_MDS_NO) && !boot_cpu_has(X86_FEATURE_RTM)) + if ((ia32_cap & ARCH_CAP_MDS_NO) && !boot_cpu_has(X86_FEATURE_RTM) && + !boot_cpu_has_bug(X86_BUG_MMIO_STALE_DATA)) srbds_mitigation = SRBDS_MITIGATION_TSX_OFF; else if (boot_cpu_has(X86_FEATURE_HYPERVISOR)) srbds_mitigation = SRBDS_MITIGATION_HYPERVISOR; @@ -618,12 +768,180 @@ static int __init nospectre_v1_cmdline(char *str) } early_param("nospectre_v1", nospectre_v1_cmdline); -#undef pr_fmt -#define pr_fmt(fmt) "Spectre V2 : " fmt - static enum spectre_v2_mitigation spectre_v2_enabled __ro_after_init = SPECTRE_V2_NONE; +#undef pr_fmt +#define pr_fmt(fmt) "RETBleed: " fmt + +enum retbleed_mitigation { + RETBLEED_MITIGATION_NONE, + RETBLEED_MITIGATION_UNRET, + RETBLEED_MITIGATION_IBPB, + RETBLEED_MITIGATION_IBRS, + RETBLEED_MITIGATION_EIBRS, +}; + +enum retbleed_mitigation_cmd { + RETBLEED_CMD_OFF, + RETBLEED_CMD_AUTO, + RETBLEED_CMD_UNRET, + RETBLEED_CMD_IBPB, +}; + +static const char * const retbleed_strings[] = { + [RETBLEED_MITIGATION_NONE] = "Vulnerable", + [RETBLEED_MITIGATION_UNRET] = "Mitigation: untrained return thunk", + [RETBLEED_MITIGATION_IBPB] = "Mitigation: IBPB", + [RETBLEED_MITIGATION_IBRS] = "Mitigation: IBRS", + [RETBLEED_MITIGATION_EIBRS] = "Mitigation: Enhanced IBRS", +}; + +static enum retbleed_mitigation retbleed_mitigation __ro_after_init = + RETBLEED_MITIGATION_NONE; +static enum retbleed_mitigation_cmd retbleed_cmd __ro_after_init = + RETBLEED_CMD_AUTO; + +static int __ro_after_init retbleed_nosmt = false; + +static int __init retbleed_parse_cmdline(char *str) +{ + if (!str) + return -EINVAL; + + while (str) { + char *next = strchr(str, ','); + if (next) { + *next = 0; + next++; + } + + if (!strcmp(str, "off")) { + retbleed_cmd = RETBLEED_CMD_OFF; + } else if (!strcmp(str, "auto")) { + retbleed_cmd = RETBLEED_CMD_AUTO; + } else if (!strcmp(str, "unret")) { + retbleed_cmd = RETBLEED_CMD_UNRET; + } else if (!strcmp(str, "ibpb")) { + retbleed_cmd = RETBLEED_CMD_IBPB; + } else if (!strcmp(str, "nosmt")) { + retbleed_nosmt = true; + } else { + pr_err("Ignoring unknown retbleed option (%s).", str); + } + + str = next; + } + + return 0; +} +early_param("retbleed", retbleed_parse_cmdline); + +#define RETBLEED_UNTRAIN_MSG "WARNING: BTB untrained return thunk mitigation is only effective on AMD/Hygon!\n" +#define RETBLEED_INTEL_MSG "WARNING: Spectre v2 mitigation leaves CPU vulnerable to RETBleed attacks, data leaks possible!\n" + +static void __init retbleed_select_mitigation(void) +{ + bool mitigate_smt = false; + + if (!boot_cpu_has_bug(X86_BUG_RETBLEED) || cpu_mitigations_off()) + return; + + switch (retbleed_cmd) { + case RETBLEED_CMD_OFF: + return; + + case RETBLEED_CMD_UNRET: + if (IS_ENABLED(CONFIG_CPU_UNRET_ENTRY)) { + retbleed_mitigation = RETBLEED_MITIGATION_UNRET; + } else { + pr_err("WARNING: kernel not compiled with CPU_UNRET_ENTRY.\n"); + goto do_cmd_auto; + } + break; + + case RETBLEED_CMD_IBPB: + if (!boot_cpu_has(X86_FEATURE_IBPB)) { + pr_err("WARNING: CPU does not support IBPB.\n"); + goto do_cmd_auto; + } else if (IS_ENABLED(CONFIG_CPU_IBPB_ENTRY)) { + retbleed_mitigation = RETBLEED_MITIGATION_IBPB; + } else { + pr_err("WARNING: kernel not compiled with CPU_IBPB_ENTRY.\n"); + goto do_cmd_auto; + } + break; + +do_cmd_auto: + case RETBLEED_CMD_AUTO: + default: + if (boot_cpu_data.x86_vendor == X86_VENDOR_AMD || + boot_cpu_data.x86_vendor == X86_VENDOR_HYGON) { + if (IS_ENABLED(CONFIG_CPU_UNRET_ENTRY)) + retbleed_mitigation = RETBLEED_MITIGATION_UNRET; + else if (IS_ENABLED(CONFIG_CPU_IBPB_ENTRY) && boot_cpu_has(X86_FEATURE_IBPB)) + retbleed_mitigation = RETBLEED_MITIGATION_IBPB; + } + + /* + * The Intel mitigation (IBRS or eIBRS) was already selected in + * spectre_v2_select_mitigation(). 'retbleed_mitigation' will + * be set accordingly below. + */ + + break; + } + + switch (retbleed_mitigation) { + case RETBLEED_MITIGATION_UNRET: + setup_force_cpu_cap(X86_FEATURE_RETHUNK); + setup_force_cpu_cap(X86_FEATURE_UNRET); + + if (boot_cpu_data.x86_vendor != X86_VENDOR_AMD && + boot_cpu_data.x86_vendor != X86_VENDOR_HYGON) + pr_err(RETBLEED_UNTRAIN_MSG); + + mitigate_smt = true; + break; + + case RETBLEED_MITIGATION_IBPB: + setup_force_cpu_cap(X86_FEATURE_ENTRY_IBPB); + mitigate_smt = true; + break; + + default: + break; + } + + if (mitigate_smt && !boot_cpu_has(X86_FEATURE_STIBP) && + (retbleed_nosmt || cpu_mitigations_auto_nosmt())) + cpu_smt_disable(false); + + /* + * Let IBRS trump all on Intel without affecting the effects of the + * retbleed= cmdline option. + */ + if (boot_cpu_data.x86_vendor == X86_VENDOR_INTEL) { + switch (spectre_v2_enabled) { + case SPECTRE_V2_IBRS: + retbleed_mitigation = RETBLEED_MITIGATION_IBRS; + break; + case SPECTRE_V2_EIBRS: + case SPECTRE_V2_EIBRS_RETPOLINE: + case SPECTRE_V2_EIBRS_LFENCE: + retbleed_mitigation = RETBLEED_MITIGATION_EIBRS; + break; + default: + pr_err(RETBLEED_INTEL_MSG); + } + } + + pr_info("%s\n", retbleed_strings[retbleed_mitigation]); +} + +#undef pr_fmt +#define pr_fmt(fmt) "Spectre V2 : " fmt + static enum spectre_v2_user_mitigation spectre_v2_user_stibp __ro_after_init = SPECTRE_V2_USER_NONE; static enum spectre_v2_user_mitigation spectre_v2_user_ibpb __ro_after_init = @@ -650,6 +968,33 @@ static inline const char *spectre_v2_module_string(void) static inline const char *spectre_v2_module_string(void) { return ""; } #endif +#define SPECTRE_V2_LFENCE_MSG "WARNING: LFENCE mitigation is not recommended for this CPU, data leaks possible!\n" +#define SPECTRE_V2_EIBRS_EBPF_MSG "WARNING: Unprivileged eBPF is enabled with eIBRS on, data leaks possible via Spectre v2 BHB attacks!\n" +#define SPECTRE_V2_EIBRS_LFENCE_EBPF_SMT_MSG "WARNING: Unprivileged eBPF is enabled with eIBRS+LFENCE mitigation and SMT, data leaks possible via Spectre v2 BHB attacks!\n" +#define SPECTRE_V2_IBRS_PERF_MSG "WARNING: IBRS mitigation selected on Enhanced IBRS CPU, this may cause unnecessary performance loss\n" + +#ifdef CONFIG_BPF_SYSCALL +void unpriv_ebpf_notify(int new_state) +{ + if (new_state) + return; + + /* Unprivileged eBPF is enabled */ + + switch (spectre_v2_enabled) { + case SPECTRE_V2_EIBRS: + pr_err(SPECTRE_V2_EIBRS_EBPF_MSG); + break; + case SPECTRE_V2_EIBRS_LFENCE: + if (sched_smt_active()) + pr_err(SPECTRE_V2_EIBRS_LFENCE_EBPF_SMT_MSG); + break; + default: + break; + } +} +#endif + static inline bool match_option(const char *arg, int arglen, const char *opt) { int len = strlen(opt); @@ -664,7 +1009,11 @@ enum spectre_v2_mitigation_cmd { SPECTRE_V2_CMD_FORCE, SPECTRE_V2_CMD_RETPOLINE, SPECTRE_V2_CMD_RETPOLINE_GENERIC, - SPECTRE_V2_CMD_RETPOLINE_AMD, + SPECTRE_V2_CMD_RETPOLINE_LFENCE, + SPECTRE_V2_CMD_EIBRS, + SPECTRE_V2_CMD_EIBRS_RETPOLINE, + SPECTRE_V2_CMD_EIBRS_LFENCE, + SPECTRE_V2_CMD_IBRS, }; enum spectre_v2_user_cmd { @@ -705,13 +1054,15 @@ static void __init spec_v2_user_print_cond(const char *reason, bool secure) pr_info("spectre_v2_user=%s forced on command line.\n", reason); } +static __ro_after_init enum spectre_v2_mitigation_cmd spectre_v2_cmd; + static enum spectre_v2_user_cmd __init -spectre_v2_parse_user_cmdline(enum spectre_v2_mitigation_cmd v2_cmd) +spectre_v2_parse_user_cmdline(void) { char arg[20]; int ret, i; - switch (v2_cmd) { + switch (spectre_v2_cmd) { case SPECTRE_V2_CMD_NONE: return SPECTRE_V2_USER_CMD_NONE; case SPECTRE_V2_CMD_FORCE: @@ -737,8 +1088,16 @@ spectre_v2_parse_user_cmdline(enum spectre_v2_mitigation_cmd v2_cmd) return SPECTRE_V2_USER_CMD_AUTO; } +static inline bool spectre_v2_in_ibrs_mode(enum spectre_v2_mitigation mode) +{ + return mode == SPECTRE_V2_IBRS || + mode == SPECTRE_V2_EIBRS || + mode == SPECTRE_V2_EIBRS_RETPOLINE || + mode == SPECTRE_V2_EIBRS_LFENCE; +} + static void __init -spectre_v2_user_select_mitigation(enum spectre_v2_mitigation_cmd v2_cmd) +spectre_v2_user_select_mitigation(void) { enum spectre_v2_user_mitigation mode = SPECTRE_V2_USER_NONE; bool smt_possible = IS_ENABLED(CONFIG_SMP); @@ -751,7 +1110,7 @@ spectre_v2_user_select_mitigation(enum spectre_v2_mitigation_cmd v2_cmd) cpu_smt_control == CPU_SMT_NOT_SUPPORTED) smt_possible = false; - cmd = spectre_v2_parse_user_cmdline(v2_cmd); + cmd = spectre_v2_parse_user_cmdline(); switch (cmd) { case SPECTRE_V2_USER_CMD_NONE: goto set_mode; @@ -799,12 +1158,12 @@ spectre_v2_user_select_mitigation(enum spectre_v2_mitigation_cmd v2_cmd) } /* - * If no STIBP, enhanced IBRS is enabled or SMT impossible, STIBP is not - * required. + * If no STIBP, IBRS or enhanced IBRS is enabled, or SMT impossible, + * STIBP is not required. */ if (!boot_cpu_has(X86_FEATURE_STIBP) || !smt_possible || - spectre_v2_enabled == SPECTRE_V2_IBRS_ENHANCED) + spectre_v2_in_ibrs_mode(spectre_v2_enabled)) return; /* @@ -816,6 +1175,14 @@ spectre_v2_user_select_mitigation(enum spectre_v2_mitigation_cmd v2_cmd) boot_cpu_has(X86_FEATURE_AMD_STIBP_ALWAYS_ON)) mode = SPECTRE_V2_USER_STRICT_PREFERRED; + if (retbleed_mitigation == RETBLEED_MITIGATION_UNRET || + retbleed_mitigation == RETBLEED_MITIGATION_IBPB) { + if (mode != SPECTRE_V2_USER_STRICT && + mode != SPECTRE_V2_USER_STRICT_PREFERRED) + pr_info("Selecting STIBP always-on mode to complement retbleed mitigation\n"); + mode = SPECTRE_V2_USER_STRICT_PREFERRED; + } + spectre_v2_user_stibp = mode; set_mode: @@ -824,9 +1191,12 @@ set_mode: static const char * const spectre_v2_strings[] = { [SPECTRE_V2_NONE] = "Vulnerable", - [SPECTRE_V2_RETPOLINE_GENERIC] = "Mitigation: Full generic retpoline", - [SPECTRE_V2_RETPOLINE_AMD] = "Mitigation: Full AMD retpoline", - [SPECTRE_V2_IBRS_ENHANCED] = "Mitigation: Enhanced IBRS", + [SPECTRE_V2_RETPOLINE] = "Mitigation: Retpolines", + [SPECTRE_V2_LFENCE] = "Mitigation: LFENCE", + [SPECTRE_V2_EIBRS] = "Mitigation: Enhanced IBRS", + [SPECTRE_V2_EIBRS_LFENCE] = "Mitigation: Enhanced IBRS + LFENCE", + [SPECTRE_V2_EIBRS_RETPOLINE] = "Mitigation: Enhanced IBRS + Retpolines", + [SPECTRE_V2_IBRS] = "Mitigation: IBRS", }; static const struct { @@ -837,9 +1207,14 @@ static const struct { { "off", SPECTRE_V2_CMD_NONE, false }, { "on", SPECTRE_V2_CMD_FORCE, true }, { "retpoline", SPECTRE_V2_CMD_RETPOLINE, false }, - { "retpoline,amd", SPECTRE_V2_CMD_RETPOLINE_AMD, false }, + { "retpoline,amd", SPECTRE_V2_CMD_RETPOLINE_LFENCE, false }, + { "retpoline,lfence", SPECTRE_V2_CMD_RETPOLINE_LFENCE, false }, { "retpoline,generic", SPECTRE_V2_CMD_RETPOLINE_GENERIC, false }, + { "eibrs", SPECTRE_V2_CMD_EIBRS, false }, + { "eibrs,lfence", SPECTRE_V2_CMD_EIBRS_LFENCE, false }, + { "eibrs,retpoline", SPECTRE_V2_CMD_EIBRS_RETPOLINE, false }, { "auto", SPECTRE_V2_CMD_AUTO, false }, + { "ibrs", SPECTRE_V2_CMD_IBRS, false }, }; static void __init spec_v2_print_cond(const char *reason, bool secure) @@ -875,10 +1250,54 @@ static enum spectre_v2_mitigation_cmd __init spectre_v2_parse_cmdline(void) } if ((cmd == SPECTRE_V2_CMD_RETPOLINE || - cmd == SPECTRE_V2_CMD_RETPOLINE_AMD || - cmd == SPECTRE_V2_CMD_RETPOLINE_GENERIC) && + cmd == SPECTRE_V2_CMD_RETPOLINE_LFENCE || + cmd == SPECTRE_V2_CMD_RETPOLINE_GENERIC || + cmd == SPECTRE_V2_CMD_EIBRS_LFENCE || + cmd == SPECTRE_V2_CMD_EIBRS_RETPOLINE) && !IS_ENABLED(CONFIG_RETPOLINE)) { - pr_err("%s selected but not compiled in. Switching to AUTO select\n", mitigation_options[i].option); + pr_err("%s selected but not compiled in. Switching to AUTO select\n", + mitigation_options[i].option); + return SPECTRE_V2_CMD_AUTO; + } + + if ((cmd == SPECTRE_V2_CMD_EIBRS || + cmd == SPECTRE_V2_CMD_EIBRS_LFENCE || + cmd == SPECTRE_V2_CMD_EIBRS_RETPOLINE) && + !boot_cpu_has(X86_FEATURE_IBRS_ENHANCED)) { + pr_err("%s selected but CPU doesn't have eIBRS. Switching to AUTO select\n", + mitigation_options[i].option); + return SPECTRE_V2_CMD_AUTO; + } + + if ((cmd == SPECTRE_V2_CMD_RETPOLINE_LFENCE || + cmd == SPECTRE_V2_CMD_EIBRS_LFENCE) && + !boot_cpu_has(X86_FEATURE_LFENCE_RDTSC)) { + pr_err("%s selected, but CPU doesn't have a serializing LFENCE. Switching to AUTO select\n", + mitigation_options[i].option); + return SPECTRE_V2_CMD_AUTO; + } + + if (cmd == SPECTRE_V2_CMD_IBRS && !IS_ENABLED(CONFIG_CPU_IBRS_ENTRY)) { + pr_err("%s selected but not compiled in. Switching to AUTO select\n", + mitigation_options[i].option); + return SPECTRE_V2_CMD_AUTO; + } + + if (cmd == SPECTRE_V2_CMD_IBRS && boot_cpu_data.x86_vendor != X86_VENDOR_INTEL) { + pr_err("%s selected but not Intel CPU. Switching to AUTO select\n", + mitigation_options[i].option); + return SPECTRE_V2_CMD_AUTO; + } + + if (cmd == SPECTRE_V2_CMD_IBRS && !boot_cpu_has(X86_FEATURE_IBRS)) { + pr_err("%s selected but CPU doesn't have IBRS. Switching to AUTO select\n", + mitigation_options[i].option); + return SPECTRE_V2_CMD_AUTO; + } + + if (cmd == SPECTRE_V2_CMD_IBRS && boot_cpu_has(X86_FEATURE_XENPV)) { + pr_err("%s selected but running as XenPV guest. Switching to AUTO select\n", + mitigation_options[i].option); return SPECTRE_V2_CMD_AUTO; } @@ -887,6 +1306,79 @@ static enum spectre_v2_mitigation_cmd __init spectre_v2_parse_cmdline(void) return cmd; } +static enum spectre_v2_mitigation __init spectre_v2_select_retpoline(void) +{ + if (!IS_ENABLED(CONFIG_RETPOLINE)) { + pr_err("Kernel not compiled with retpoline; no mitigation available!"); + return SPECTRE_V2_NONE; + } + + return SPECTRE_V2_RETPOLINE; +} + +/* Disable in-kernel use of non-RSB RET predictors */ +static void __init spec_ctrl_disable_kernel_rrsba(void) +{ + u64 ia32_cap; + + if (!boot_cpu_has(X86_FEATURE_RRSBA_CTRL)) + return; + + ia32_cap = x86_read_arch_cap_msr(); + + if (ia32_cap & ARCH_CAP_RRSBA) { + x86_spec_ctrl_base |= SPEC_CTRL_RRSBA_DIS_S; + write_spec_ctrl_current(x86_spec_ctrl_base, true); + } +} + +static void __init spectre_v2_determine_rsb_fill_type_at_vmexit(enum spectre_v2_mitigation mode) +{ + /* + * Similar to context switches, there are two types of RSB attacks + * after VM exit: + * + * 1) RSB underflow + * + * 2) Poisoned RSB entry + * + * When retpoline is enabled, both are mitigated by filling/clearing + * the RSB. + * + * When IBRS is enabled, while #1 would be mitigated by the IBRS branch + * prediction isolation protections, RSB still needs to be cleared + * because of #2. Note that SMEP provides no protection here, unlike + * user-space-poisoned RSB entries. + * + * eIBRS should protect against RSB poisoning, but if the EIBRS_PBRSB + * bug is present then a LITE version of RSB protection is required, + * just a single call needs to retire before a RET is executed. + */ + switch (mode) { + case SPECTRE_V2_NONE: + return; + + case SPECTRE_V2_EIBRS_LFENCE: + case SPECTRE_V2_EIBRS: + if (boot_cpu_has_bug(X86_BUG_EIBRS_PBRSB)) { + setup_force_cpu_cap(X86_FEATURE_RSB_VMEXIT_LITE); + pr_info("Spectre v2 / PBRSB-eIBRS: Retire a single CALL on VMEXIT\n"); + } + return; + + case SPECTRE_V2_EIBRS_RETPOLINE: + case SPECTRE_V2_RETPOLINE: + case SPECTRE_V2_LFENCE: + case SPECTRE_V2_IBRS: + setup_force_cpu_cap(X86_FEATURE_RSB_VMEXIT); + pr_info("Spectre v2 / SpectreRSB : Filling RSB on VMEXIT\n"); + return; + } + + pr_warn_once("Unknown Spectre v2 mode, disabling RSB mitigation at VM exit"); + dump_stack(); +} + static void __init spectre_v2_select_mitigation(void) { enum spectre_v2_mitigation_cmd cmd = spectre_v2_parse_cmdline(); @@ -907,86 +1399,172 @@ static void __init spectre_v2_select_mitigation(void) case SPECTRE_V2_CMD_FORCE: case SPECTRE_V2_CMD_AUTO: if (boot_cpu_has(X86_FEATURE_IBRS_ENHANCED)) { - mode = SPECTRE_V2_IBRS_ENHANCED; - /* Force it so VMEXIT will restore correctly */ - x86_spec_ctrl_base |= SPEC_CTRL_IBRS; - wrmsrl(MSR_IA32_SPEC_CTRL, x86_spec_ctrl_base); - goto specv2_set_mode; + mode = SPECTRE_V2_EIBRS; + break; } - if (IS_ENABLED(CONFIG_RETPOLINE)) - goto retpoline_auto; + + if (IS_ENABLED(CONFIG_CPU_IBRS_ENTRY) && + boot_cpu_has_bug(X86_BUG_RETBLEED) && + retbleed_cmd != RETBLEED_CMD_OFF && + boot_cpu_has(X86_FEATURE_IBRS) && + boot_cpu_data.x86_vendor == X86_VENDOR_INTEL) { + mode = SPECTRE_V2_IBRS; + break; + } + + mode = spectre_v2_select_retpoline(); break; - case SPECTRE_V2_CMD_RETPOLINE_AMD: - if (IS_ENABLED(CONFIG_RETPOLINE)) - goto retpoline_amd; + + case SPECTRE_V2_CMD_RETPOLINE_LFENCE: + pr_err(SPECTRE_V2_LFENCE_MSG); + mode = SPECTRE_V2_LFENCE; break; + case SPECTRE_V2_CMD_RETPOLINE_GENERIC: - if (IS_ENABLED(CONFIG_RETPOLINE)) - goto retpoline_generic; + mode = SPECTRE_V2_RETPOLINE; break; + case SPECTRE_V2_CMD_RETPOLINE: - if (IS_ENABLED(CONFIG_RETPOLINE)) - goto retpoline_auto; + mode = spectre_v2_select_retpoline(); + break; + + case SPECTRE_V2_CMD_IBRS: + mode = SPECTRE_V2_IBRS; + break; + + case SPECTRE_V2_CMD_EIBRS: + mode = SPECTRE_V2_EIBRS; + break; + + case SPECTRE_V2_CMD_EIBRS_LFENCE: + mode = SPECTRE_V2_EIBRS_LFENCE; + break; + + case SPECTRE_V2_CMD_EIBRS_RETPOLINE: + mode = SPECTRE_V2_EIBRS_RETPOLINE; break; } - pr_err("Spectre mitigation: kernel not compiled with retpoline; no mitigation available!"); - return; -retpoline_auto: - if (boot_cpu_data.x86_vendor == X86_VENDOR_AMD || - boot_cpu_data.x86_vendor == X86_VENDOR_HYGON) { - retpoline_amd: - if (!boot_cpu_has(X86_FEATURE_LFENCE_RDTSC)) { - pr_err("Spectre mitigation: LFENCE not serializing, switching to generic retpoline\n"); - goto retpoline_generic; - } - mode = SPECTRE_V2_RETPOLINE_AMD; - setup_force_cpu_cap(X86_FEATURE_RETPOLINE_AMD); - setup_force_cpu_cap(X86_FEATURE_RETPOLINE); - } else { - retpoline_generic: - mode = SPECTRE_V2_RETPOLINE_GENERIC; + if (mode == SPECTRE_V2_EIBRS && unprivileged_ebpf_enabled()) + pr_err(SPECTRE_V2_EIBRS_EBPF_MSG); + + if (spectre_v2_in_ibrs_mode(mode)) { + x86_spec_ctrl_base |= SPEC_CTRL_IBRS; + write_spec_ctrl_current(x86_spec_ctrl_base, true); + } + + switch (mode) { + case SPECTRE_V2_NONE: + case SPECTRE_V2_EIBRS: + break; + + case SPECTRE_V2_IBRS: + setup_force_cpu_cap(X86_FEATURE_KERNEL_IBRS); + if (boot_cpu_has(X86_FEATURE_IBRS_ENHANCED)) + pr_warn(SPECTRE_V2_IBRS_PERF_MSG); + break; + + case SPECTRE_V2_LFENCE: + case SPECTRE_V2_EIBRS_LFENCE: + setup_force_cpu_cap(X86_FEATURE_RETPOLINE_LFENCE); + fallthrough; + + case SPECTRE_V2_RETPOLINE: + case SPECTRE_V2_EIBRS_RETPOLINE: setup_force_cpu_cap(X86_FEATURE_RETPOLINE); + break; } -specv2_set_mode: + /* + * Disable alternate RSB predictions in kernel when indirect CALLs and + * JMPs gets protection against BHI and Intramode-BTI, but RET + * prediction from a non-RSB predictor is still a risk. + */ + if (mode == SPECTRE_V2_EIBRS_LFENCE || + mode == SPECTRE_V2_EIBRS_RETPOLINE || + mode == SPECTRE_V2_RETPOLINE) + spec_ctrl_disable_kernel_rrsba(); + spectre_v2_enabled = mode; pr_info("%s\n", spectre_v2_strings[mode]); /* - * If spectre v2 protection has been enabled, unconditionally fill - * RSB during a context switch; this protects against two independent - * issues: + * If Spectre v2 protection has been enabled, fill the RSB during a + * context switch. In general there are two types of RSB attacks + * across context switches, for which the CALLs/RETs may be unbalanced. + * + * 1) RSB underflow + * + * Some Intel parts have "bottomless RSB". When the RSB is empty, + * speculated return targets may come from the branch predictor, + * which could have a user-poisoned BTB or BHB entry. + * + * AMD has it even worse: *all* returns are speculated from the BTB, + * regardless of the state of the RSB. + * + * When IBRS or eIBRS is enabled, the "user -> kernel" attack + * scenario is mitigated by the IBRS branch prediction isolation + * properties, so the RSB buffer filling wouldn't be necessary to + * protect against this type of attack. + * + * The "user -> user" attack scenario is mitigated by RSB filling. * - * - RSB underflow (and switch to BTB) on Skylake+ - * - SpectreRSB variant of spectre v2 on X86_BUG_SPECTRE_V2 CPUs + * 2) Poisoned RSB entry + * + * If the 'next' in-kernel return stack is shorter than 'prev', + * 'next' could be tricked into speculating with a user-poisoned RSB + * entry. + * + * The "user -> kernel" attack scenario is mitigated by SMEP and + * eIBRS. + * + * The "user -> user" scenario, also known as SpectreBHB, requires + * RSB clearing. + * + * So to mitigate all cases, unconditionally fill RSB on context + * switches. + * + * FIXME: Is this pointless for retbleed-affected AMD? */ setup_force_cpu_cap(X86_FEATURE_RSB_CTXSW); pr_info("Spectre v2 / SpectreRSB mitigation: Filling RSB on context switch\n"); + spectre_v2_determine_rsb_fill_type_at_vmexit(mode); + /* - * Retpoline means the kernel is safe because it has no indirect - * branches. Enhanced IBRS protects firmware too, so, enable restricted - * speculation around firmware calls only when Enhanced IBRS isn't - * supported. + * Retpoline protects the kernel, but doesn't protect firmware. IBRS + * and Enhanced IBRS protect firmware too, so enable IBRS around + * firmware calls only when IBRS / Enhanced IBRS aren't otherwise + * enabled. * * Use "mode" to check Enhanced IBRS instead of boot_cpu_has(), because * the user might select retpoline on the kernel command line and if * the CPU supports Enhanced IBRS, kernel might un-intentionally not * enable IBRS around firmware calls. */ - if (boot_cpu_has(X86_FEATURE_IBRS) && mode != SPECTRE_V2_IBRS_ENHANCED) { + if (boot_cpu_has_bug(X86_BUG_RETBLEED) && + boot_cpu_has(X86_FEATURE_IBPB) && + (boot_cpu_data.x86_vendor == X86_VENDOR_AMD || + boot_cpu_data.x86_vendor == X86_VENDOR_HYGON)) { + + if (retbleed_cmd != RETBLEED_CMD_IBPB) { + setup_force_cpu_cap(X86_FEATURE_USE_IBPB_FW); + pr_info("Enabling Speculation Barrier for firmware calls\n"); + } + + } else if (boot_cpu_has(X86_FEATURE_IBRS) && !spectre_v2_in_ibrs_mode(mode)) { setup_force_cpu_cap(X86_FEATURE_USE_IBRS_FW); pr_info("Enabling Restricted Speculation for firmware calls\n"); } /* Set up IBPB and STIBP depending on the general spectre V2 command */ - spectre_v2_user_select_mitigation(cmd); + spectre_v2_cmd = cmd; } static void update_stibp_msr(void * __unused) { - wrmsrl(MSR_IA32_SPEC_CTRL, x86_spec_ctrl_base); + u64 val = spec_ctrl_current() | (x86_spec_ctrl_base & SPEC_CTRL_STIBP); + write_spec_ctrl_current(val, true); } /* Update x86_spec_ctrl_base in case SMT state changed. */ @@ -1021,6 +1599,8 @@ static void update_indir_branch_cond(void) /* Update the static key controlling the MDS CPU buffer clear in idle */ static void update_mds_branch_idle(void) { + u64 ia32_cap = x86_read_arch_cap_msr(); + /* * Enable the idle clearing if SMT is active on CPUs which are * affected only by MSBDS and not any other MDS variant. @@ -1032,19 +1612,26 @@ static void update_mds_branch_idle(void) if (!boot_cpu_has_bug(X86_BUG_MSBDS_ONLY)) return; - if (sched_smt_active()) + if (sched_smt_active()) { static_branch_enable(&mds_idle_clear); - else + } else if (mmio_mitigation == MMIO_MITIGATION_OFF || + (ia32_cap & ARCH_CAP_FBSDP_NO)) { static_branch_disable(&mds_idle_clear); + } } #define MDS_MSG_SMT "MDS CPU bug present and SMT on, data leak possible. See https://www.kernel.org/doc/html/latest/admin-guide/hw-vuln/mds.html for more details.\n" #define TAA_MSG_SMT "TAA CPU bug present and SMT on, data leak possible. See https://www.kernel.org/doc/html/latest/admin-guide/hw-vuln/tsx_async_abort.html for more details.\n" +#define MMIO_MSG_SMT "MMIO Stale Data CPU bug present and SMT on, data leak possible. See https://www.kernel.org/doc/html/latest/admin-guide/hw-vuln/processor_mmio_stale_data.html for more details.\n" void cpu_bugs_smt_update(void) { mutex_lock(&spec_ctrl_mutex); + if (sched_smt_active() && unprivileged_ebpf_enabled() && + spectre_v2_enabled == SPECTRE_V2_EIBRS_LFENCE) + pr_warn_once(SPECTRE_V2_EIBRS_LFENCE_EBPF_SMT_MSG); + switch (spectre_v2_user_stibp) { case SPECTRE_V2_USER_NONE: break; @@ -1080,6 +1667,16 @@ void cpu_bugs_smt_update(void) break; } + switch (mmio_mitigation) { + case MMIO_MITIGATION_VERW: + case MMIO_MITIGATION_UCODE_NEEDED: + if (sched_smt_active()) + pr_warn_once(MMIO_MSG_SMT); + break; + case MMIO_MITIGATION_OFF: + break; + } + mutex_unlock(&spec_ctrl_mutex); } @@ -1184,16 +1781,6 @@ static enum ssb_mitigation __init __ssb_select_mitigation(void) } /* - * If SSBD is controlled by the SPEC_CTRL MSR, then set the proper - * bit in the mask to allow guests to use the mitigation even in the - * case where the host does not enable it. - */ - if (static_cpu_has(X86_FEATURE_SPEC_CTRL_SSBD) || - static_cpu_has(X86_FEATURE_AMD_SSBD)) { - x86_spec_ctrl_mask |= SPEC_CTRL_SSBD; - } - - /* * We have three CPU feature flags that are in play here: * - X86_BUG_SPEC_STORE_BYPASS - CPU is susceptible. * - X86_FEATURE_SSBD - CPU is able to turn off speculative store bypass @@ -1210,7 +1797,7 @@ static enum ssb_mitigation __init __ssb_select_mitigation(void) x86_amd_ssb_disable(); } else { x86_spec_ctrl_base |= SPEC_CTRL_SSBD; - wrmsrl(MSR_IA32_SPEC_CTRL, x86_spec_ctrl_base); + write_spec_ctrl_current(x86_spec_ctrl_base, true); } } @@ -1461,7 +2048,7 @@ int arch_prctl_spec_ctrl_get(struct task_struct *task, unsigned long which) void x86_spec_ctrl_setup_ap(void) { if (boot_cpu_has(X86_FEATURE_MSR_SPEC_CTRL)) - wrmsrl(MSR_IA32_SPEC_CTRL, x86_spec_ctrl_base); + write_spec_ctrl_current(x86_spec_ctrl_base, true); if (ssb_mode == SPEC_STORE_BYPASS_DISABLE) x86_amd_ssb_disable(); @@ -1682,9 +2269,26 @@ static ssize_t tsx_async_abort_show_state(char *buf) sched_smt_active() ? "vulnerable" : "disabled"); } +static ssize_t mmio_stale_data_show_state(char *buf) +{ + if (boot_cpu_has_bug(X86_BUG_MMIO_UNKNOWN)) + return sysfs_emit(buf, "Unknown: No mitigations\n"); + + if (mmio_mitigation == MMIO_MITIGATION_OFF) + return sysfs_emit(buf, "%s\n", mmio_strings[mmio_mitigation]); + + if (boot_cpu_has(X86_FEATURE_HYPERVISOR)) { + return sysfs_emit(buf, "%s; SMT Host state unknown\n", + mmio_strings[mmio_mitigation]); + } + + return sysfs_emit(buf, "%s; SMT %s\n", mmio_strings[mmio_mitigation], + sched_smt_active() ? "vulnerable" : "disabled"); +} + static char *stibp_state(void) { - if (spectre_v2_enabled == SPECTRE_V2_IBRS_ENHANCED) + if (spectre_v2_in_ibrs_mode(spectre_v2_enabled)) return ""; switch (spectre_v2_user_stibp) { @@ -1714,11 +2318,65 @@ static char *ibpb_state(void) return ""; } +static char *pbrsb_eibrs_state(void) +{ + if (boot_cpu_has_bug(X86_BUG_EIBRS_PBRSB)) { + if (boot_cpu_has(X86_FEATURE_RSB_VMEXIT_LITE) || + boot_cpu_has(X86_FEATURE_RSB_VMEXIT)) + return ", PBRSB-eIBRS: SW sequence"; + else + return ", PBRSB-eIBRS: Vulnerable"; + } else { + return ", PBRSB-eIBRS: Not affected"; + } +} + +static ssize_t spectre_v2_show_state(char *buf) +{ + if (spectre_v2_enabled == SPECTRE_V2_LFENCE) + return sprintf(buf, "Vulnerable: LFENCE\n"); + + if (spectre_v2_enabled == SPECTRE_V2_EIBRS && unprivileged_ebpf_enabled()) + return sprintf(buf, "Vulnerable: eIBRS with unprivileged eBPF\n"); + + if (sched_smt_active() && unprivileged_ebpf_enabled() && + spectre_v2_enabled == SPECTRE_V2_EIBRS_LFENCE) + return sprintf(buf, "Vulnerable: eIBRS+LFENCE with unprivileged eBPF and SMT\n"); + + return sprintf(buf, "%s%s%s%s%s%s%s\n", + spectre_v2_strings[spectre_v2_enabled], + ibpb_state(), + boot_cpu_has(X86_FEATURE_USE_IBRS_FW) ? ", IBRS_FW" : "", + stibp_state(), + boot_cpu_has(X86_FEATURE_RSB_CTXSW) ? ", RSB filling" : "", + pbrsb_eibrs_state(), + spectre_v2_module_string()); +} + static ssize_t srbds_show_state(char *buf) { return sprintf(buf, "%s\n", srbds_strings[srbds_mitigation]); } +static ssize_t retbleed_show_state(char *buf) +{ + if (retbleed_mitigation == RETBLEED_MITIGATION_UNRET || + retbleed_mitigation == RETBLEED_MITIGATION_IBPB) { + if (boot_cpu_data.x86_vendor != X86_VENDOR_AMD && + boot_cpu_data.x86_vendor != X86_VENDOR_HYGON) + return sprintf(buf, "Vulnerable: untrained return thunk / IBPB on non-AMD based uarch\n"); + + return sprintf(buf, "%s; SMT %s\n", + retbleed_strings[retbleed_mitigation], + !sched_smt_active() ? "disabled" : + spectre_v2_user_stibp == SPECTRE_V2_USER_STRICT || + spectre_v2_user_stibp == SPECTRE_V2_USER_STRICT_PREFERRED ? + "enabled with STIBP protection" : "vulnerable"); + } + + return sprintf(buf, "%s\n", retbleed_strings[retbleed_mitigation]); +} + static ssize_t cpu_show_common(struct device *dev, struct device_attribute *attr, char *buf, unsigned int bug) { @@ -1739,12 +2397,7 @@ static ssize_t cpu_show_common(struct device *dev, struct device_attribute *attr return sprintf(buf, "%s\n", spectre_v1_strings[spectre_v1_mitigation]); case X86_BUG_SPECTRE_V2: - return sprintf(buf, "%s%s%s%s%s%s\n", spectre_v2_strings[spectre_v2_enabled], - ibpb_state(), - boot_cpu_has(X86_FEATURE_USE_IBRS_FW) ? ", IBRS_FW" : "", - stibp_state(), - boot_cpu_has(X86_FEATURE_RSB_CTXSW) ? ", RSB filling" : "", - spectre_v2_module_string()); + return spectre_v2_show_state(buf); case X86_BUG_SPEC_STORE_BYPASS: return sprintf(buf, "%s\n", ssb_strings[ssb_mode]); @@ -1766,6 +2419,13 @@ static ssize_t cpu_show_common(struct device *dev, struct device_attribute *attr case X86_BUG_SRBDS: return srbds_show_state(buf); + case X86_BUG_MMIO_STALE_DATA: + case X86_BUG_MMIO_UNKNOWN: + return mmio_stale_data_show_state(buf); + + case X86_BUG_RETBLEED: + return retbleed_show_state(buf); + default: break; } @@ -1817,4 +2477,17 @@ ssize_t cpu_show_srbds(struct device *dev, struct device_attribute *attr, char * { return cpu_show_common(dev, attr, buf, X86_BUG_SRBDS); } + +ssize_t cpu_show_mmio_stale_data(struct device *dev, struct device_attribute *attr, char *buf) +{ + if (boot_cpu_has_bug(X86_BUG_MMIO_UNKNOWN)) + return cpu_show_common(dev, attr, buf, X86_BUG_MMIO_UNKNOWN); + else + return cpu_show_common(dev, attr, buf, X86_BUG_MMIO_STALE_DATA); +} + +ssize_t cpu_show_retbleed(struct device *dev, struct device_attribute *attr, char *buf) +{ + return cpu_show_common(dev, attr, buf, X86_BUG_RETBLEED); +} #endif diff --git a/arch/x86/kernel/cpu/cacheinfo.c b/arch/x86/kernel/cpu/cacheinfo.c index fe98a1465be6..66556833d7af 100644 --- a/arch/x86/kernel/cpu/cacheinfo.c +++ b/arch/x86/kernel/cpu/cacheinfo.c @@ -29,6 +29,12 @@ #define LVL_3 4 #define LVL_TRACE 5 +/* Shared last level cache maps */ +DEFINE_PER_CPU_READ_MOSTLY(cpumask_var_t, cpu_llc_shared_map); + +/* Shared L2 cache maps */ +DEFINE_PER_CPU_READ_MOSTLY(cpumask_var_t, cpu_l2c_shared_map); + struct _cache_table { unsigned char descriptor; char cache_type; diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c index 0083464de5e3..3e508f239098 100644 --- a/arch/x86/kernel/cpu/common.c +++ b/arch/x86/kernel/cpu/common.c @@ -59,6 +59,8 @@ #include <asm/cpu_device_id.h> #include <asm/uv/uv.h> #include <asm/sigframe.h> +#include <asm/traps.h> +#include <asm/sev.h> #include "cpu.h" @@ -88,6 +90,83 @@ EXPORT_SYMBOL_GPL(get_llc_id); /* L2 cache ID of each logical CPU */ DEFINE_PER_CPU_READ_MOSTLY(u16, cpu_l2c_id) = BAD_APICID; +static struct ppin_info { + int feature; + int msr_ppin_ctl; + int msr_ppin; +} ppin_info[] = { + [X86_VENDOR_INTEL] = { + .feature = X86_FEATURE_INTEL_PPIN, + .msr_ppin_ctl = MSR_PPIN_CTL, + .msr_ppin = MSR_PPIN + }, + [X86_VENDOR_AMD] = { + .feature = X86_FEATURE_AMD_PPIN, + .msr_ppin_ctl = MSR_AMD_PPIN_CTL, + .msr_ppin = MSR_AMD_PPIN + }, +}; + +static const struct x86_cpu_id ppin_cpuids[] = { + X86_MATCH_FEATURE(X86_FEATURE_AMD_PPIN, &ppin_info[X86_VENDOR_AMD]), + X86_MATCH_FEATURE(X86_FEATURE_INTEL_PPIN, &ppin_info[X86_VENDOR_INTEL]), + + /* Legacy models without CPUID enumeration */ + X86_MATCH_INTEL_FAM6_MODEL(IVYBRIDGE_X, &ppin_info[X86_VENDOR_INTEL]), + X86_MATCH_INTEL_FAM6_MODEL(HASWELL_X, &ppin_info[X86_VENDOR_INTEL]), + X86_MATCH_INTEL_FAM6_MODEL(BROADWELL_D, &ppin_info[X86_VENDOR_INTEL]), + X86_MATCH_INTEL_FAM6_MODEL(BROADWELL_X, &ppin_info[X86_VENDOR_INTEL]), + X86_MATCH_INTEL_FAM6_MODEL(SKYLAKE_X, &ppin_info[X86_VENDOR_INTEL]), + X86_MATCH_INTEL_FAM6_MODEL(ICELAKE_X, &ppin_info[X86_VENDOR_INTEL]), + X86_MATCH_INTEL_FAM6_MODEL(ICELAKE_D, &ppin_info[X86_VENDOR_INTEL]), + X86_MATCH_INTEL_FAM6_MODEL(SAPPHIRERAPIDS_X, &ppin_info[X86_VENDOR_INTEL]), + X86_MATCH_INTEL_FAM6_MODEL(XEON_PHI_KNL, &ppin_info[X86_VENDOR_INTEL]), + X86_MATCH_INTEL_FAM6_MODEL(XEON_PHI_KNM, &ppin_info[X86_VENDOR_INTEL]), + + {} +}; + +static void ppin_init(struct cpuinfo_x86 *c) +{ + const struct x86_cpu_id *id; + unsigned long long val; + struct ppin_info *info; + + id = x86_match_cpu(ppin_cpuids); + if (!id) + return; + + /* + * Testing the presence of the MSR is not enough. Need to check + * that the PPIN_CTL allows reading of the PPIN. + */ + info = (struct ppin_info *)id->driver_data; + + if (rdmsrl_safe(info->msr_ppin_ctl, &val)) + goto clear_ppin; + + if ((val & 3UL) == 1UL) { + /* PPIN locked in disabled mode */ + goto clear_ppin; + } + + /* If PPIN is disabled, try to enable */ + if (!(val & 2UL)) { + wrmsrl_safe(info->msr_ppin_ctl, val | 2UL); + rdmsrl_safe(info->msr_ppin_ctl, &val); + } + + /* Is the enable bit set? */ + if (val & 2UL) { + c->ppin = __rdmsr(info->msr_ppin); + set_cpu_cap(c, info->feature); + return; + } + +clear_ppin: + clear_cpu_cap(c, info->feature); +} + /* correctly size the local cpu masks */ void __init setup_cpu_local_masks(void) { @@ -220,13 +299,6 @@ static int __init cachesize_setup(char *str) } __setup("cachesize=", cachesize_setup); -static int __init x86_sep_setup(char *s) -{ - setup_clear_cpu_cap(X86_FEATURE_SEP); - return 1; -} -__setup("nosep", x86_sep_setup); - /* Standard macro to see if a specific flag is changeable */ static inline int flag_is_changeable_p(u32 flag) { @@ -298,26 +370,12 @@ static inline void squash_the_stupid_serial_number(struct cpuinfo_x86 *c) } #endif -static __init int setup_disable_smep(char *arg) -{ - setup_clear_cpu_cap(X86_FEATURE_SMEP); - return 1; -} -__setup("nosmep", setup_disable_smep); - static __always_inline void setup_smep(struct cpuinfo_x86 *c) { if (cpu_has(c, X86_FEATURE_SMEP)) cr4_set_bits(X86_CR4_SMEP); } -static __init int setup_disable_smap(char *arg) -{ - setup_clear_cpu_cap(X86_FEATURE_SMAP); - return 1; -} -__setup("nosmap", setup_disable_smap); - static __always_inline void setup_smap(struct cpuinfo_x86 *c) { unsigned long eflags = native_save_fl(); @@ -325,14 +383,8 @@ static __always_inline void setup_smap(struct cpuinfo_x86 *c) /* This should have been cleared long ago */ BUG_ON(eflags & X86_EFLAGS_AC); - if (cpu_has(c, X86_FEATURE_SMAP)) { -#ifdef CONFIG_X86_SMAP + if (cpu_has(c, X86_FEATURE_SMAP)) cr4_set_bits(X86_CR4_SMAP); -#else - clear_cpu_cap(c, X86_FEATURE_SMAP); - cr4_clear_bits(X86_CR4_SMAP); -#endif - } } static __always_inline void setup_umip(struct cpuinfo_x86 *c) @@ -361,7 +413,8 @@ out: /* These bits should not change their value after CPU init is finished. */ static const unsigned long cr4_pinned_mask = - X86_CR4_SMEP | X86_CR4_SMAP | X86_CR4_UMIP | X86_CR4_FSGSBASE; + X86_CR4_SMEP | X86_CR4_SMAP | X86_CR4_UMIP | + X86_CR4_FSGSBASE | X86_CR4_CET; static DEFINE_STATIC_KEY_FALSE_RO(cr_pinning); static unsigned long cr4_pinned_bits __ro_after_init; @@ -384,7 +437,7 @@ set_register: } EXPORT_SYMBOL(native_write_cr0); -void native_write_cr4(unsigned long val) +void __no_profile native_write_cr4(unsigned long val) { unsigned long bits_changed = 0; @@ -515,6 +568,58 @@ static __init int setup_disable_pku(char *arg) __setup("nopku", setup_disable_pku); #endif /* CONFIG_X86_64 */ +#ifdef CONFIG_X86_KERNEL_IBT + +__noendbr u64 ibt_save(void) +{ + u64 msr = 0; + + if (cpu_feature_enabled(X86_FEATURE_IBT)) { + rdmsrl(MSR_IA32_S_CET, msr); + wrmsrl(MSR_IA32_S_CET, msr & ~CET_ENDBR_EN); + } + + return msr; +} + +__noendbr void ibt_restore(u64 save) +{ + u64 msr; + + if (cpu_feature_enabled(X86_FEATURE_IBT)) { + rdmsrl(MSR_IA32_S_CET, msr); + msr &= ~CET_ENDBR_EN; + msr |= (save & CET_ENDBR_EN); + wrmsrl(MSR_IA32_S_CET, msr); + } +} + +#endif + +static __always_inline void setup_cet(struct cpuinfo_x86 *c) +{ + u64 msr = CET_ENDBR_EN; + + if (!HAS_KERNEL_IBT || + !cpu_feature_enabled(X86_FEATURE_IBT)) + return; + + wrmsrl(MSR_IA32_S_CET, msr); + cr4_set_bits(X86_CR4_CET); + + if (!ibt_selftest()) { + pr_err("IBT selftest: Failed!\n"); + setup_clear_cpu_cap(X86_FEATURE_IBT); + return; + } +} + +__noendbr void cet_disable(void) +{ + if (cpu_feature_enabled(X86_FEATURE_IBT)) + wrmsrl(MSR_IA32_S_CET, 0); +} + /* * Some CPU features depend on higher CPUID levels, which may not always * be available due to CPUID level capping or broken virtualization @@ -1030,6 +1135,8 @@ static void identify_cpu_without_cpuid(struct cpuinfo_x86 *c) #define NO_SWAPGS BIT(6) #define NO_ITLB_MULTIHIT BIT(7) #define NO_SPECTRE_V2 BIT(8) +#define NO_MMIO BIT(9) +#define NO_EIBRS_PBRSB BIT(10) #define VULNWL(vendor, family, model, whitelist) \ X86_MATCH_VENDOR_FAM_MODEL(vendor, family, model, whitelist) @@ -1052,6 +1159,11 @@ static const __initconst struct x86_cpu_id cpu_vuln_whitelist[] = { VULNWL(VORTEX, 6, X86_MODEL_ANY, NO_SPECULATION), /* Intel Family 6 */ + VULNWL_INTEL(TIGERLAKE, NO_MMIO), + VULNWL_INTEL(TIGERLAKE_L, NO_MMIO), + VULNWL_INTEL(ALDERLAKE, NO_MMIO), + VULNWL_INTEL(ALDERLAKE_L, NO_MMIO), + VULNWL_INTEL(ATOM_SALTWELL, NO_SPECULATION | NO_ITLB_MULTIHIT), VULNWL_INTEL(ATOM_SALTWELL_TABLET, NO_SPECULATION | NO_ITLB_MULTIHIT), VULNWL_INTEL(ATOM_SALTWELL_MID, NO_SPECULATION | NO_ITLB_MULTIHIT), @@ -1070,9 +1182,9 @@ static const __initconst struct x86_cpu_id cpu_vuln_whitelist[] = { VULNWL_INTEL(ATOM_AIRMONT_MID, NO_L1TF | MSBDS_ONLY | NO_SWAPGS | NO_ITLB_MULTIHIT), VULNWL_INTEL(ATOM_AIRMONT_NP, NO_L1TF | NO_SWAPGS | NO_ITLB_MULTIHIT), - VULNWL_INTEL(ATOM_GOLDMONT, NO_MDS | NO_L1TF | NO_SWAPGS | NO_ITLB_MULTIHIT), - VULNWL_INTEL(ATOM_GOLDMONT_D, NO_MDS | NO_L1TF | NO_SWAPGS | NO_ITLB_MULTIHIT), - VULNWL_INTEL(ATOM_GOLDMONT_PLUS, NO_MDS | NO_L1TF | NO_SWAPGS | NO_ITLB_MULTIHIT), + VULNWL_INTEL(ATOM_GOLDMONT, NO_MDS | NO_L1TF | NO_SWAPGS | NO_ITLB_MULTIHIT | NO_MMIO), + VULNWL_INTEL(ATOM_GOLDMONT_D, NO_MDS | NO_L1TF | NO_SWAPGS | NO_ITLB_MULTIHIT | NO_MMIO), + VULNWL_INTEL(ATOM_GOLDMONT_PLUS, NO_MDS | NO_L1TF | NO_SWAPGS | NO_ITLB_MULTIHIT | NO_MMIO | NO_EIBRS_PBRSB), /* * Technically, swapgs isn't serializing on AMD (despite it previously @@ -1082,42 +1194,80 @@ static const __initconst struct x86_cpu_id cpu_vuln_whitelist[] = { * good enough for our purposes. */ - VULNWL_INTEL(ATOM_TREMONT_D, NO_ITLB_MULTIHIT), + VULNWL_INTEL(ATOM_TREMONT, NO_EIBRS_PBRSB), + VULNWL_INTEL(ATOM_TREMONT_L, NO_EIBRS_PBRSB), + VULNWL_INTEL(ATOM_TREMONT_D, NO_ITLB_MULTIHIT | NO_EIBRS_PBRSB), /* AMD Family 0xf - 0x12 */ - VULNWL_AMD(0x0f, NO_MELTDOWN | NO_SSB | NO_L1TF | NO_MDS | NO_SWAPGS | NO_ITLB_MULTIHIT), - VULNWL_AMD(0x10, NO_MELTDOWN | NO_SSB | NO_L1TF | NO_MDS | NO_SWAPGS | NO_ITLB_MULTIHIT), - VULNWL_AMD(0x11, NO_MELTDOWN | NO_SSB | NO_L1TF | NO_MDS | NO_SWAPGS | NO_ITLB_MULTIHIT), - VULNWL_AMD(0x12, NO_MELTDOWN | NO_SSB | NO_L1TF | NO_MDS | NO_SWAPGS | NO_ITLB_MULTIHIT), + VULNWL_AMD(0x0f, NO_MELTDOWN | NO_SSB | NO_L1TF | NO_MDS | NO_SWAPGS | NO_ITLB_MULTIHIT | NO_MMIO), + VULNWL_AMD(0x10, NO_MELTDOWN | NO_SSB | NO_L1TF | NO_MDS | NO_SWAPGS | NO_ITLB_MULTIHIT | NO_MMIO), + VULNWL_AMD(0x11, NO_MELTDOWN | NO_SSB | NO_L1TF | NO_MDS | NO_SWAPGS | NO_ITLB_MULTIHIT | NO_MMIO), + VULNWL_AMD(0x12, NO_MELTDOWN | NO_SSB | NO_L1TF | NO_MDS | NO_SWAPGS | NO_ITLB_MULTIHIT | NO_MMIO), /* FAMILY_ANY must be last, otherwise 0x0f - 0x12 matches won't work */ - VULNWL_AMD(X86_FAMILY_ANY, NO_MELTDOWN | NO_L1TF | NO_MDS | NO_SWAPGS | NO_ITLB_MULTIHIT), - VULNWL_HYGON(X86_FAMILY_ANY, NO_MELTDOWN | NO_L1TF | NO_MDS | NO_SWAPGS | NO_ITLB_MULTIHIT), + VULNWL_AMD(X86_FAMILY_ANY, NO_MELTDOWN | NO_L1TF | NO_MDS | NO_SWAPGS | NO_ITLB_MULTIHIT | NO_MMIO), + VULNWL_HYGON(X86_FAMILY_ANY, NO_MELTDOWN | NO_L1TF | NO_MDS | NO_SWAPGS | NO_ITLB_MULTIHIT | NO_MMIO), /* Zhaoxin Family 7 */ - VULNWL(CENTAUR, 7, X86_MODEL_ANY, NO_SPECTRE_V2 | NO_SWAPGS), - VULNWL(ZHAOXIN, 7, X86_MODEL_ANY, NO_SPECTRE_V2 | NO_SWAPGS), + VULNWL(CENTAUR, 7, X86_MODEL_ANY, NO_SPECTRE_V2 | NO_SWAPGS | NO_MMIO), + VULNWL(ZHAOXIN, 7, X86_MODEL_ANY, NO_SPECTRE_V2 | NO_SWAPGS | NO_MMIO), {} }; +#define VULNBL(vendor, family, model, blacklist) \ + X86_MATCH_VENDOR_FAM_MODEL(vendor, family, model, blacklist) + #define VULNBL_INTEL_STEPPINGS(model, steppings, issues) \ X86_MATCH_VENDOR_FAM_MODEL_STEPPINGS_FEATURE(INTEL, 6, \ INTEL_FAM6_##model, steppings, \ X86_FEATURE_ANY, issues) +#define VULNBL_AMD(family, blacklist) \ + VULNBL(AMD, family, X86_MODEL_ANY, blacklist) + +#define VULNBL_HYGON(family, blacklist) \ + VULNBL(HYGON, family, X86_MODEL_ANY, blacklist) + #define SRBDS BIT(0) +/* CPU is affected by X86_BUG_MMIO_STALE_DATA */ +#define MMIO BIT(1) +/* CPU is affected by Shared Buffers Data Sampling (SBDS), a variant of X86_BUG_MMIO_STALE_DATA */ +#define MMIO_SBDS BIT(2) +/* CPU is affected by RETbleed, speculating where you would not expect it */ +#define RETBLEED BIT(3) static const struct x86_cpu_id cpu_vuln_blacklist[] __initconst = { VULNBL_INTEL_STEPPINGS(IVYBRIDGE, X86_STEPPING_ANY, SRBDS), VULNBL_INTEL_STEPPINGS(HASWELL, X86_STEPPING_ANY, SRBDS), VULNBL_INTEL_STEPPINGS(HASWELL_L, X86_STEPPING_ANY, SRBDS), VULNBL_INTEL_STEPPINGS(HASWELL_G, X86_STEPPING_ANY, SRBDS), + VULNBL_INTEL_STEPPINGS(HASWELL_X, X86_STEPPING_ANY, MMIO), + VULNBL_INTEL_STEPPINGS(BROADWELL_D, X86_STEPPING_ANY, MMIO), VULNBL_INTEL_STEPPINGS(BROADWELL_G, X86_STEPPING_ANY, SRBDS), + VULNBL_INTEL_STEPPINGS(BROADWELL_X, X86_STEPPING_ANY, MMIO), VULNBL_INTEL_STEPPINGS(BROADWELL, X86_STEPPING_ANY, SRBDS), - VULNBL_INTEL_STEPPINGS(SKYLAKE_L, X86_STEPPING_ANY, SRBDS), - VULNBL_INTEL_STEPPINGS(SKYLAKE, X86_STEPPING_ANY, SRBDS), - VULNBL_INTEL_STEPPINGS(KABYLAKE_L, X86_STEPPINGS(0x0, 0xC), SRBDS), - VULNBL_INTEL_STEPPINGS(KABYLAKE, X86_STEPPINGS(0x0, 0xD), SRBDS), + VULNBL_INTEL_STEPPINGS(SKYLAKE_L, X86_STEPPING_ANY, SRBDS | MMIO | RETBLEED), + VULNBL_INTEL_STEPPINGS(SKYLAKE_X, X86_STEPPING_ANY, MMIO | RETBLEED), + VULNBL_INTEL_STEPPINGS(SKYLAKE, X86_STEPPING_ANY, SRBDS | MMIO | RETBLEED), + VULNBL_INTEL_STEPPINGS(KABYLAKE_L, X86_STEPPING_ANY, SRBDS | MMIO | RETBLEED), + VULNBL_INTEL_STEPPINGS(KABYLAKE, X86_STEPPING_ANY, SRBDS | MMIO | RETBLEED), + VULNBL_INTEL_STEPPINGS(CANNONLAKE_L, X86_STEPPING_ANY, RETBLEED), + VULNBL_INTEL_STEPPINGS(ICELAKE_L, X86_STEPPING_ANY, MMIO | MMIO_SBDS | RETBLEED), + VULNBL_INTEL_STEPPINGS(ICELAKE_D, X86_STEPPING_ANY, MMIO), + VULNBL_INTEL_STEPPINGS(ICELAKE_X, X86_STEPPING_ANY, MMIO), + VULNBL_INTEL_STEPPINGS(COMETLAKE, X86_STEPPING_ANY, MMIO | MMIO_SBDS | RETBLEED), + VULNBL_INTEL_STEPPINGS(COMETLAKE_L, X86_STEPPINGS(0x0, 0x0), MMIO | RETBLEED), + VULNBL_INTEL_STEPPINGS(COMETLAKE_L, X86_STEPPING_ANY, MMIO | MMIO_SBDS | RETBLEED), + VULNBL_INTEL_STEPPINGS(LAKEFIELD, X86_STEPPING_ANY, MMIO | MMIO_SBDS | RETBLEED), + VULNBL_INTEL_STEPPINGS(ROCKETLAKE, X86_STEPPING_ANY, MMIO | RETBLEED), + VULNBL_INTEL_STEPPINGS(ATOM_TREMONT, X86_STEPPING_ANY, MMIO | MMIO_SBDS), + VULNBL_INTEL_STEPPINGS(ATOM_TREMONT_D, X86_STEPPING_ANY, MMIO), + VULNBL_INTEL_STEPPINGS(ATOM_TREMONT_L, X86_STEPPING_ANY, MMIO | MMIO_SBDS), + + VULNBL_AMD(0x15, RETBLEED), + VULNBL_AMD(0x16, RETBLEED), + VULNBL_AMD(0x17, RETBLEED), + VULNBL_HYGON(0x18, RETBLEED), {} }; @@ -1138,6 +1288,13 @@ u64 x86_read_arch_cap_msr(void) return ia32_cap; } +static bool arch_cap_mmio_immune(u64 ia32_cap) +{ + return (ia32_cap & ARCH_CAP_FBSDP_NO && + ia32_cap & ARCH_CAP_PSDP_NO && + ia32_cap & ARCH_CAP_SBDR_SSDP_NO); +} + static void __init cpu_set_bug_bits(struct cpuinfo_x86 *c) { u64 ia32_cap = x86_read_arch_cap_msr(); @@ -1191,12 +1348,43 @@ static void __init cpu_set_bug_bits(struct cpuinfo_x86 *c) /* * SRBDS affects CPUs which support RDRAND or RDSEED and are listed * in the vulnerability blacklist. + * + * Some of the implications and mitigation of Shared Buffers Data + * Sampling (SBDS) are similar to SRBDS. Give SBDS same treatment as + * SRBDS. */ if ((cpu_has(c, X86_FEATURE_RDRAND) || cpu_has(c, X86_FEATURE_RDSEED)) && - cpu_matches(cpu_vuln_blacklist, SRBDS)) + cpu_matches(cpu_vuln_blacklist, SRBDS | MMIO_SBDS)) setup_force_cpu_bug(X86_BUG_SRBDS); + /* + * Processor MMIO Stale Data bug enumeration + * + * Affected CPU list is generally enough to enumerate the vulnerability, + * but for virtualization case check for ARCH_CAP MSR bits also, VMM may + * not want the guest to enumerate the bug. + * + * Set X86_BUG_MMIO_UNKNOWN for CPUs that are neither in the blacklist, + * nor in the whitelist and also don't enumerate MSR ARCH_CAP MMIO bits. + */ + if (!arch_cap_mmio_immune(ia32_cap)) { + if (cpu_matches(cpu_vuln_blacklist, MMIO)) + setup_force_cpu_bug(X86_BUG_MMIO_STALE_DATA); + else if (!cpu_matches(cpu_vuln_whitelist, NO_MMIO)) + setup_force_cpu_bug(X86_BUG_MMIO_UNKNOWN); + } + + if (!cpu_has(c, X86_FEATURE_BTC_NO)) { + if (cpu_matches(cpu_vuln_blacklist, RETBLEED) || (ia32_cap & ARCH_CAP_RSBA)) + setup_force_cpu_bug(X86_BUG_RETBLEED); + } + + if (cpu_has(c, X86_FEATURE_IBRS_ENHANCED) && + !cpu_matches(cpu_vuln_whitelist, NO_EIBRS_PBRSB) && + !(ia32_cap & ARCH_CAP_PBRSB_NO)) + setup_force_cpu_bug(X86_BUG_EIBRS_PBRSB); + if (cpu_matches(cpu_vuln_whitelist, NO_MELTDOWN)) return; @@ -1237,8 +1425,8 @@ static void detect_nopl(void) static void __init cpu_parse_early_param(void) { char arg[128]; - char *argptr = arg; - int arglen, res, bit; + char *argptr = arg, *opt; + int arglen, taint = 0; #ifdef CONFIG_X86_32 if (cmdline_find_option_bool(boot_command_line, "no387")) @@ -1266,21 +1454,61 @@ static void __init cpu_parse_early_param(void) return; pr_info("Clearing CPUID bits:"); - do { - res = get_option(&argptr, &bit); - if (res == 0 || res == 3) - break; - /* If the argument was too long, the last bit may be cut off */ - if (res == 1 && arglen >= sizeof(arg)) - break; + while (argptr) { + bool found __maybe_unused = false; + unsigned int bit; - if (bit >= 0 && bit < NCAPINTS * 32) { - pr_cont(" " X86_CAP_FMT, x86_cap_flag(bit)); + opt = strsep(&argptr, ","); + + /* + * Handle naked numbers first for feature flags which don't + * have names. + */ + if (!kstrtouint(opt, 10, &bit)) { + if (bit < NCAPINTS * 32) { + +#ifdef CONFIG_X86_FEATURE_NAMES + /* empty-string, i.e., ""-defined feature flags */ + if (!x86_cap_flags[bit]) + pr_cont(" " X86_CAP_FMT_NUM, x86_cap_flag_num(bit)); + else +#endif + pr_cont(" " X86_CAP_FMT, x86_cap_flag(bit)); + + setup_clear_cpu_cap(bit); + taint++; + } + /* + * The assumption is that there are no feature names with only + * numbers in the name thus go to the next argument. + */ + continue; + } + +#ifdef CONFIG_X86_FEATURE_NAMES + for (bit = 0; bit < 32 * NCAPINTS; bit++) { + if (!x86_cap_flag(bit)) + continue; + + if (strcmp(x86_cap_flag(bit), opt)) + continue; + + pr_cont(" %s", opt); setup_clear_cpu_cap(bit); + taint++; + found = true; + break; } - } while (res == 2); + + if (!found) + pr_cont(" (unknown: %s)", opt); +#endif + } pr_cont("\n"); + + if (taint) + add_taint(TAINT_CPU_OUT_OF_SPEC, LOCKDEP_STILL_OK); } /* @@ -1632,6 +1860,7 @@ static void identify_cpu(struct cpuinfo_x86 *c) x86_init_rdrand(c); setup_pku(c); + setup_cet(c); /* * Clear/Set all flags overridden by options, need do it @@ -1655,6 +1884,8 @@ static void identify_cpu(struct cpuinfo_x86 *c) c->x86_capability[i] |= boot_cpu_data.x86_capability[i]; } + ppin_init(c); + /* Init Machine Check Exception if available. */ mcheck_cpu_init(c); @@ -1698,6 +1929,8 @@ void enable_sep_cpu(void) void __init identify_boot_cpu(void) { identify_cpu(&boot_cpu_data); + if (HAS_KERNEL_IBT && cpu_feature_enabled(X86_FEATURE_IBT)) + pr_info("CET detected: Indirect Branch Tracking enabled\n"); #ifdef CONFIG_X86_32 sysenter_setup(); enable_sep_cpu(); @@ -1719,15 +1952,9 @@ void identify_secondary_cpu(struct cpuinfo_x86 *c) validate_apic_and_package_id(c); x86_spec_ctrl_setup_ap(); update_srbds_msr(); -} -static __init int setup_noclflush(char *arg) -{ - setup_clear_cpu_cap(X86_FEATURE_CLFLUSH); - setup_clear_cpu_cap(X86_FEATURE_CLFLUSHOPT); - return 1; + tsx_ap_init(); } -__setup("noclflush", setup_noclflush); void print_cpu_info(struct cpuinfo_x86 *c) { @@ -1787,6 +2014,17 @@ EXPORT_PER_CPU_SYMBOL(__preempt_count); DEFINE_PER_CPU(unsigned long, cpu_current_top_of_stack) = TOP_OF_INIT_STACK; +static void wrmsrl_cstar(unsigned long val) +{ + /* + * Intel CPUs do not support 32-bit SYSCALL. Writing to MSR_CSTAR + * is so far ignored by the CPU, but raises a #VE trap in a TDX + * guest. Avoid the pointless write on all Intel CPUs. + */ + if (boot_cpu_data.x86_vendor != X86_VENDOR_INTEL) + wrmsrl(MSR_CSTAR, val); +} + /* May not be marked __init: used by software suspend */ void syscall_init(void) { @@ -1794,7 +2032,7 @@ void syscall_init(void) wrmsrl(MSR_LSTAR, (unsigned long)entry_SYSCALL_64); #ifdef CONFIG_IA32_EMULATION - wrmsrl(MSR_CSTAR, (unsigned long)entry_SYSCALL_compat); + wrmsrl_cstar((unsigned long)entry_SYSCALL_compat); /* * This only works on Intel CPUs. * On AMD CPUs these MSRs are 32-bit, CPU truncates MSR_IA32_SYSENTER_EIP. @@ -1806,7 +2044,7 @@ void syscall_init(void) (unsigned long)(cpu_entry_stack(smp_processor_id()) + 1)); wrmsrl_safe(MSR_IA32_SYSENTER_EIP, (u64)entry_SYSENTER_compat); #else - wrmsrl(MSR_CSTAR, (unsigned long)ignore_sysret); + wrmsrl_cstar((unsigned long)ignore_sysret); wrmsrl_safe(MSR_IA32_SYSENTER_CS, (u64)GDT_ENTRY_INVALID_SEG); wrmsrl_safe(MSR_IA32_SYSENTER_ESP, 0ULL); wrmsrl_safe(MSR_IA32_SYSENTER_EIP, 0ULL); @@ -1977,6 +2215,9 @@ void cpu_init_exception_handling(void) load_TR_desc(); + /* GHCB needs to be setup to handle #VC. */ + setup_ghcb(); + /* Finally load the IDT */ load_current_idt(); } @@ -2064,6 +2305,7 @@ void cpu_init_secondary(void) } #endif +#ifdef CONFIG_MICROCODE_LATE_LOADING /* * The microcode loader calls this upon late microcode load to recheck features, * only when microcode has been updated. Caller holds microcode_mutex and CPU @@ -2093,6 +2335,7 @@ void microcode_check(void) pr_warn("x86/CPU: CPU features have changed after loading microcode, but might not take effect.\n"); pr_warn("x86/CPU: Please consider either early loading through initrd/built-in or a potential BIOS update.\n"); } +#endif /* * Invoked from core CPU hotplug code after hotplug operations diff --git a/arch/x86/kernel/cpu/cpu.h b/arch/x86/kernel/cpu/cpu.h index ee6f23f7587d..7c9b5893c30a 100644 --- a/arch/x86/kernel/cpu/cpu.h +++ b/arch/x86/kernel/cpu/cpu.h @@ -55,13 +55,14 @@ enum tsx_ctrl_states { extern __ro_after_init enum tsx_ctrl_states tsx_ctrl_state; extern void __init tsx_init(void); -extern void tsx_enable(void); -extern void tsx_disable(void); -extern void tsx_clear_cpuid(void); +void tsx_ap_init(void); #else static inline void tsx_init(void) { } +static inline void tsx_ap_init(void) { } #endif /* CONFIG_CPU_SUP_INTEL */ +extern void init_spectral_chicken(struct cpuinfo_x86 *c); + extern void get_cpu_cap(struct cpuinfo_x86 *c); extern void get_cpu_address_sizes(struct cpuinfo_x86 *c); extern void cpu_detect_cache_sizes(struct cpuinfo_x86 *c); diff --git a/arch/x86/kernel/cpu/cyrix.c b/arch/x86/kernel/cpu/cyrix.c index 7227c15299d0..9651275aecd1 100644 --- a/arch/x86/kernel/cpu/cyrix.c +++ b/arch/x86/kernel/cpu/cyrix.c @@ -1,6 +1,7 @@ // SPDX-License-Identifier: GPL-2.0 #include <linux/bitops.h> #include <linux/delay.h> +#include <linux/isa-dma.h> #include <linux/pci.h> #include <asm/dma.h> #include <linux/io.h> diff --git a/arch/x86/kernel/cpu/feat_ctl.c b/arch/x86/kernel/cpu/feat_ctl.c index da696eb4821a..03851240c3e3 100644 --- a/arch/x86/kernel/cpu/feat_ctl.c +++ b/arch/x86/kernel/cpu/feat_ctl.c @@ -1,11 +1,11 @@ // SPDX-License-Identifier: GPL-2.0 #include <linux/tboot.h> +#include <asm/cpu.h> #include <asm/cpufeature.h> #include <asm/msr-index.h> #include <asm/processor.h> #include <asm/vmx.h> -#include "cpu.h" #undef pr_fmt #define pr_fmt(fmt) "x86/cpu: " fmt @@ -15,6 +15,8 @@ enum vmx_feature_leafs { MISC_FEATURES = 0, PRIMARY_CTLS, SECONDARY_CTLS, + TERTIARY_CTLS_LOW, + TERTIARY_CTLS_HIGH, NR_VMX_FEATURE_WORDS, }; @@ -22,7 +24,7 @@ enum vmx_feature_leafs { static void init_vmx_capabilities(struct cpuinfo_x86 *c) { - u32 supported, funcs, ept, vpid, ign; + u32 supported, funcs, ept, vpid, ign, low, high; BUILD_BUG_ON(NVMXINTS != NR_VMX_FEATURE_WORDS); @@ -42,6 +44,11 @@ static void init_vmx_capabilities(struct cpuinfo_x86 *c) rdmsr_safe(MSR_IA32_VMX_PROCBASED_CTLS2, &ign, &supported); c->vmx_capability[SECONDARY_CTLS] = supported; + /* All 64 bits of tertiary controls MSR are allowed-1 settings. */ + rdmsr_safe(MSR_IA32_VMX_PROCBASED_CTLS3, &low, &high); + c->vmx_capability[TERTIARY_CTLS_LOW] = low; + c->vmx_capability[TERTIARY_CTLS_HIGH] = high; + rdmsr(MSR_IA32_VMX_PINBASED_CTLS, ign, supported); rdmsr_safe(MSR_IA32_VMX_VMFUNC, &ign, &funcs); diff --git a/arch/x86/kernel/cpu/hygon.c b/arch/x86/kernel/cpu/hygon.c index 3fcdda4c1e11..21fd425088fe 100644 --- a/arch/x86/kernel/cpu/hygon.c +++ b/arch/x86/kernel/cpu/hygon.c @@ -302,6 +302,12 @@ static void init_hygon(struct cpuinfo_x86 *c) /* get apicid instead of initial apic id from cpuid */ c->apicid = hard_smp_processor_id(); + /* + * XXX someone from Hygon needs to confirm this DTRT + * + init_spectral_chicken(c); + */ + set_cpu_cap(c, X86_FEATURE_ZEN); set_cpu_cap(c, X86_FEATURE_CPB); diff --git a/arch/x86/kernel/cpu/intel.c b/arch/x86/kernel/cpu/intel.c index 8321c43554a1..2d7ea5480ec3 100644 --- a/arch/x86/kernel/cpu/intel.c +++ b/arch/x86/kernel/cpu/intel.c @@ -7,10 +7,13 @@ #include <linux/smp.h> #include <linux/sched.h> #include <linux/sched/clock.h> +#include <linux/semaphore.h> #include <linux/thread_info.h> #include <linux/init.h> #include <linux/uaccess.h> +#include <linux/workqueue.h> #include <linux/delay.h> +#include <linux/cpuhotplug.h> #include <asm/cpufeature.h> #include <asm/msr.h> @@ -91,7 +94,7 @@ static bool ring3mwait_disabled __read_mostly; static int __init ring3mwait_disable(char *__unused) { ring3mwait_disabled = true; - return 0; + return 1; } __setup("ring3mwait=disable", ring3mwait_disable); @@ -181,6 +184,38 @@ static bool bad_spectre_microcode(struct cpuinfo_x86 *c) return false; } +int intel_cpu_collect_info(struct ucode_cpu_info *uci) +{ + unsigned int val[2]; + unsigned int family, model; + struct cpu_signature csig = { 0 }; + unsigned int eax, ebx, ecx, edx; + + memset(uci, 0, sizeof(*uci)); + + eax = 0x00000001; + ecx = 0; + native_cpuid(&eax, &ebx, &ecx, &edx); + csig.sig = eax; + + family = x86_family(eax); + model = x86_model(eax); + + if (model >= 5 || family > 6) { + /* get processor flags from MSR 0x17 */ + native_rdmsr(MSR_IA32_PLATFORM_ID, val[0], val[1]); + csig.pf = 1 << ((val[1] >> 18) & 7); + } + + csig.rev = intel_get_microcode_revision(); + + uci->cpu_sig = csig; + uci->valid = 1; + + return 0; +} +EXPORT_SYMBOL_GPL(intel_cpu_collect_info); + static void early_init_intel(struct cpuinfo_x86 *c) { u64 misc_enable; @@ -647,9 +682,9 @@ static void init_intel(struct cpuinfo_x86 *c) unsigned int l1, l2; rdmsr(MSR_IA32_MISC_ENABLE, l1, l2); - if (!(l1 & (1<<11))) + if (!(l1 & MSR_IA32_MISC_ENABLE_BTS_UNAVAIL)) set_cpu_cap(c, X86_FEATURE_BTS); - if (!(l1 & (1<<12))) + if (!(l1 & MSR_IA32_MISC_ENABLE_PEBS_UNAVAIL)) set_cpu_cap(c, X86_FEATURE_PEBS); } @@ -717,13 +752,6 @@ static void init_intel(struct cpuinfo_x86 *c) init_intel_misc_features(c); - if (tsx_ctrl_state == TSX_CTRL_ENABLE) - tsx_enable(); - else if (tsx_ctrl_state == TSX_CTRL_DISABLE) - tsx_disable(); - else if (tsx_ctrl_state == TSX_CTRL_RTM_ALWAYS_ABORT) - tsx_clear_cpuid(); - split_lock_init(); bus_lock_init(); @@ -1006,6 +1034,8 @@ static const struct { static struct ratelimit_state bld_ratelimit; +static DEFINE_SEMAPHORE(buslock_sem); + static inline bool match_option(const char *arg, int arglen, const char *opt) { int len = strlen(opt), ratelimit; @@ -1116,18 +1146,52 @@ static void split_lock_init(void) split_lock_verify_msr(sld_state != sld_off); } +static void __split_lock_reenable(struct work_struct *work) +{ + sld_update_msr(true); + up(&buslock_sem); +} + +/* + * If a CPU goes offline with pending delayed work to re-enable split lock + * detection then the delayed work will be executed on some other CPU. That + * handles releasing the buslock_sem, but because it executes on a + * different CPU probably won't re-enable split lock detection. This is a + * problem on HT systems since the sibling CPU on the same core may then be + * left running with split lock detection disabled. + * + * Unconditionally re-enable detection here. + */ +static int splitlock_cpu_offline(unsigned int cpu) +{ + sld_update_msr(true); + + return 0; +} + +static DECLARE_DELAYED_WORK(split_lock_reenable, __split_lock_reenable); + static void split_lock_warn(unsigned long ip) { - pr_warn_ratelimited("#AC: %s/%d took a split_lock trap at address: 0x%lx\n", - current->comm, current->pid, ip); + int cpu; - /* - * Disable the split lock detection for this task so it can make - * progress and set TIF_SLD so the detection is re-enabled via - * switch_to_sld() when the task is scheduled out. - */ + if (!current->reported_split_lock) + pr_warn_ratelimited("#AC: %s/%d took a split_lock trap at address: 0x%lx\n", + current->comm, current->pid, ip); + current->reported_split_lock = 1; + + /* misery factor #1, sleep 10ms before trying to execute split lock */ + if (msleep_interruptible(10) > 0) + return; + /* Misery factor #2, only allow one buslocked disabled core at a time */ + if (down_interruptible(&buslock_sem) == -EINTR) + return; + cpu = get_cpu(); + schedule_delayed_work_on(cpu, &split_lock_reenable, 2); + + /* Disable split lock detection on this CPU to make progress */ sld_update_msr(false); - set_tsk_thread_flag(current, TIF_SLD); + put_cpu(); } bool handle_guest_split_lock(unsigned long ip) @@ -1152,22 +1216,23 @@ static void bus_lock_init(void) { u64 val; - /* - * Warn and fatal are handled by #AC for split lock if #AC for - * split lock is supported. - */ - if (!boot_cpu_has(X86_FEATURE_BUS_LOCK_DETECT) || - (boot_cpu_has(X86_FEATURE_SPLIT_LOCK_DETECT) && - (sld_state == sld_warn || sld_state == sld_fatal)) || - sld_state == sld_off) + if (!boot_cpu_has(X86_FEATURE_BUS_LOCK_DETECT)) return; - /* - * Enable #DB for bus lock. All bus locks are handled in #DB except - * split locks are handled in #AC in the fatal case. - */ rdmsrl(MSR_IA32_DEBUGCTLMSR, val); - val |= DEBUGCTLMSR_BUS_LOCK_DETECT; + + if ((boot_cpu_has(X86_FEATURE_SPLIT_LOCK_DETECT) && + (sld_state == sld_warn || sld_state == sld_fatal)) || + sld_state == sld_off) { + /* + * Warn and fatal are handled by #AC for split lock if #AC for + * split lock is supported. + */ + val &= ~DEBUGCTLMSR_BUS_LOCK_DETECT; + } else { + val |= DEBUGCTLMSR_BUS_LOCK_DETECT; + } + wrmsrl(MSR_IA32_DEBUGCTLMSR, val); } @@ -1201,18 +1266,6 @@ void handle_bus_lock(struct pt_regs *regs) } /* - * This function is called only when switching between tasks with - * different split-lock detection modes. It sets the MSR for the - * mode of the new task. This is right most of the time, but since - * the MSR is shared by hyperthreads on a physical core there can - * be glitches when the two threads need different modes. - */ -void switch_to_sld(unsigned long tifn) -{ - sld_update_msr(!(tifn & _TIF_SLD)); -} - -/* * Bits in the IA32_CORE_CAPABILITIES are not architectural, so they should * only be trusted if it is confirmed that a CPU model implements a * specific feature at a particular bit position. @@ -1237,6 +1290,7 @@ static const struct x86_cpu_id split_lock_cpu_ids[] __initconst = { X86_MATCH_INTEL_FAM6_MODEL(SAPPHIRERAPIDS_X, 1), X86_MATCH_INTEL_FAM6_MODEL(ALDERLAKE, 1), X86_MATCH_INTEL_FAM6_MODEL(ALDERLAKE_L, 1), + X86_MATCH_INTEL_FAM6_MODEL(RAPTORLAKE, 1), {} }; @@ -1281,10 +1335,14 @@ static void sld_state_show(void) pr_info("disabled\n"); break; case sld_warn: - if (boot_cpu_has(X86_FEATURE_SPLIT_LOCK_DETECT)) + if (boot_cpu_has(X86_FEATURE_SPLIT_LOCK_DETECT)) { pr_info("#AC: crashing the kernel on kernel split_locks and warning on user-space split_locks\n"); - else if (boot_cpu_has(X86_FEATURE_BUS_LOCK_DETECT)) + if (cpuhp_setup_state(CPUHP_AP_ONLINE_DYN, + "x86/splitlock", NULL, splitlock_cpu_offline) < 0) + pr_warn("No splitlock CPU offline handler\n"); + } else if (boot_cpu_has(X86_FEATURE_BUS_LOCK_DETECT)) { pr_info("#DB: warning on user-space bus_locks\n"); + } break; case sld_fatal: if (boot_cpu_has(X86_FEATURE_SPLIT_LOCK_DETECT)) { diff --git a/arch/x86/kernel/cpu/intel_epb.c b/arch/x86/kernel/cpu/intel_epb.c index f4dd73396f28..fbaf12e43f41 100644 --- a/arch/x86/kernel/cpu/intel_epb.c +++ b/arch/x86/kernel/cpu/intel_epb.c @@ -16,6 +16,7 @@ #include <linux/syscore_ops.h> #include <linux/pm.h> +#include <asm/cpu_device_id.h> #include <asm/cpufeature.h> #include <asm/msr.h> @@ -58,6 +59,22 @@ static DEFINE_PER_CPU(u8, saved_epb); #define EPB_SAVED 0x10ULL #define MAX_EPB EPB_MASK +enum energy_perf_value_index { + EPB_INDEX_PERFORMANCE, + EPB_INDEX_BALANCE_PERFORMANCE, + EPB_INDEX_NORMAL, + EPB_INDEX_BALANCE_POWERSAVE, + EPB_INDEX_POWERSAVE, +}; + +static u8 energ_perf_values[] = { + [EPB_INDEX_PERFORMANCE] = ENERGY_PERF_BIAS_PERFORMANCE, + [EPB_INDEX_BALANCE_PERFORMANCE] = ENERGY_PERF_BIAS_BALANCE_PERFORMANCE, + [EPB_INDEX_NORMAL] = ENERGY_PERF_BIAS_NORMAL, + [EPB_INDEX_BALANCE_POWERSAVE] = ENERGY_PERF_BIAS_BALANCE_POWERSAVE, + [EPB_INDEX_POWERSAVE] = ENERGY_PERF_BIAS_POWERSAVE, +}; + static int intel_epb_save(void) { u64 epb; @@ -90,7 +107,7 @@ static void intel_epb_restore(void) */ val = epb & EPB_MASK; if (val == ENERGY_PERF_BIAS_PERFORMANCE) { - val = ENERGY_PERF_BIAS_NORMAL; + val = energ_perf_values[EPB_INDEX_NORMAL]; pr_warn_once("ENERGY_PERF_BIAS: Set to 'normal', was 'performance'\n"); } } @@ -103,18 +120,11 @@ static struct syscore_ops intel_epb_syscore_ops = { }; static const char * const energy_perf_strings[] = { - "performance", - "balance-performance", - "normal", - "balance-power", - "power" -}; -static const u8 energ_perf_values[] = { - ENERGY_PERF_BIAS_PERFORMANCE, - ENERGY_PERF_BIAS_BALANCE_PERFORMANCE, - ENERGY_PERF_BIAS_NORMAL, - ENERGY_PERF_BIAS_BALANCE_POWERSAVE, - ENERGY_PERF_BIAS_POWERSAVE + [EPB_INDEX_PERFORMANCE] = "performance", + [EPB_INDEX_BALANCE_PERFORMANCE] = "balance-performance", + [EPB_INDEX_NORMAL] = "normal", + [EPB_INDEX_BALANCE_POWERSAVE] = "balance-power", + [EPB_INDEX_POWERSAVE] = "power", }; static ssize_t energy_perf_bias_show(struct device *dev, @@ -193,13 +203,22 @@ static int intel_epb_offline(unsigned int cpu) return 0; } +static const struct x86_cpu_id intel_epb_normal[] = { + X86_MATCH_INTEL_FAM6_MODEL(ALDERLAKE_L, 7), + {} +}; + static __init int intel_epb_init(void) { + const struct x86_cpu_id *id = x86_match_cpu(intel_epb_normal); int ret; if (!boot_cpu_has(X86_FEATURE_EPB)) return -ENODEV; + if (id) + energ_perf_values[EPB_INDEX_NORMAL] = id->driver_data; + ret = cpuhp_setup_state(CPUHP_AP_X86_INTEL_EPB_ONLINE, "x86/intel/epb:online", intel_epb_online, intel_epb_offline); diff --git a/arch/x86/kernel/cpu/mce/amd.c b/arch/x86/kernel/cpu/mce/amd.c index fc85eb17cb6d..1c87501e0fa3 100644 --- a/arch/x86/kernel/cpu/mce/amd.c +++ b/arch/x86/kernel/cpu/mce/amd.c @@ -71,6 +71,22 @@ static const char * const smca_umc_block_names[] = { "misc_umc" }; +#define HWID_MCATYPE(hwid, mcatype) (((hwid) << 16) | (mcatype)) + +struct smca_hwid { + unsigned int bank_type; /* Use with smca_bank_types for easy indexing. */ + u32 hwid_mcatype; /* (hwid,mcatype) tuple */ +}; + +struct smca_bank { + const struct smca_hwid *hwid; + u32 id; /* Value of MCA_IPID[InstanceId]. */ + u8 sysfs_id; /* Value used for sysfs name. */ +}; + +static DEFINE_PER_CPU_READ_MOSTLY(struct smca_bank[MAX_NR_BANKS], smca_banks); +static DEFINE_PER_CPU_READ_MOSTLY(u8[N_SMCA_BANK_TYPES], smca_bank_counts); + struct smca_bank_name { const char *name; /* Short name for sysfs */ const char *long_name; /* Long name for pretty-printing */ @@ -95,11 +111,18 @@ static struct smca_bank_name smca_names[] = { [SMCA_PSP ... SMCA_PSP_V2] = { "psp", "Platform Security Processor" }, [SMCA_SMU ... SMCA_SMU_V2] = { "smu", "System Management Unit" }, [SMCA_MP5] = { "mp5", "Microprocessor 5 Unit" }, + [SMCA_MPDMA] = { "mpdma", "MPDMA Unit" }, [SMCA_NBIO] = { "nbio", "Northbridge IO Unit" }, [SMCA_PCIE ... SMCA_PCIE_V2] = { "pcie", "PCI Express Unit" }, [SMCA_XGMI_PCS] = { "xgmi_pcs", "Ext Global Memory Interconnect PCS Unit" }, + [SMCA_NBIF] = { "nbif", "NBIF Unit" }, + [SMCA_SHUB] = { "shub", "System Hub Unit" }, + [SMCA_SATA] = { "sata", "SATA Unit" }, + [SMCA_USB] = { "usb", "USB Unit" }, + [SMCA_GMI_PCS] = { "gmi_pcs", "Global Memory Interconnect PCS Unit" }, [SMCA_XGMI_PHY] = { "xgmi_phy", "Ext Global Memory Interconnect PHY Unit" }, [SMCA_WAFL_PHY] = { "wafl_phy", "WAFL PHY Unit" }, + [SMCA_GMI_PHY] = { "gmi_phy", "Global Memory Interconnect PHY Unit" }, }; static const char *smca_get_name(enum smca_bank_types t) @@ -119,14 +142,14 @@ const char *smca_get_long_name(enum smca_bank_types t) } EXPORT_SYMBOL_GPL(smca_get_long_name); -enum smca_bank_types smca_get_bank_type(unsigned int bank) +enum smca_bank_types smca_get_bank_type(unsigned int cpu, unsigned int bank) { struct smca_bank *b; if (bank >= MAX_NR_BANKS) return N_SMCA_BANK_TYPES; - b = &smca_banks[bank]; + b = &per_cpu(smca_banks, cpu)[bank]; if (!b->hwid) return N_SMCA_BANK_TYPES; @@ -134,7 +157,7 @@ enum smca_bank_types smca_get_bank_type(unsigned int bank) } EXPORT_SYMBOL_GPL(smca_get_bank_type); -static struct smca_hwid smca_hwid_mcatypes[] = { +static const struct smca_hwid smca_hwid_mcatypes[] = { /* { bank_type, hwid_mcatype } */ /* Reserved type */ @@ -174,6 +197,9 @@ static struct smca_hwid smca_hwid_mcatypes[] = { /* Microprocessor 5 Unit MCA type */ { SMCA_MP5, HWID_MCATYPE(0x01, 0x2) }, + /* MPDMA MCA type */ + { SMCA_MPDMA, HWID_MCATYPE(0x01, 0x3) }, + /* Northbridge IO Unit MCA type */ { SMCA_NBIO, HWID_MCATYPE(0x18, 0x0) }, @@ -181,19 +207,17 @@ static struct smca_hwid smca_hwid_mcatypes[] = { { SMCA_PCIE, HWID_MCATYPE(0x46, 0x0) }, { SMCA_PCIE_V2, HWID_MCATYPE(0x46, 0x1) }, - /* xGMI PCS MCA type */ { SMCA_XGMI_PCS, HWID_MCATYPE(0x50, 0x0) }, - - /* xGMI PHY MCA type */ + { SMCA_NBIF, HWID_MCATYPE(0x6C, 0x0) }, + { SMCA_SHUB, HWID_MCATYPE(0x80, 0x0) }, + { SMCA_SATA, HWID_MCATYPE(0xA8, 0x0) }, + { SMCA_USB, HWID_MCATYPE(0xAA, 0x0) }, + { SMCA_GMI_PCS, HWID_MCATYPE(0x241, 0x0) }, { SMCA_XGMI_PHY, HWID_MCATYPE(0x259, 0x0) }, - - /* WAFL PHY MCA type */ { SMCA_WAFL_PHY, HWID_MCATYPE(0x267, 0x0) }, + { SMCA_GMI_PHY, HWID_MCATYPE(0x269, 0x0) }, }; -struct smca_bank smca_banks[MAX_NR_BANKS]; -EXPORT_SYMBOL_GPL(smca_banks); - /* * In SMCA enabled processors, we can have multiple banks for a given IP type. * So to define a unique name for each bank, we use a temp c-string to append @@ -249,8 +273,9 @@ static void smca_set_misc_banks_map(unsigned int bank, unsigned int cpu) static void smca_configure(unsigned int bank, unsigned int cpu) { + u8 *bank_counts = this_cpu_ptr(smca_bank_counts); + const struct smca_hwid *s_hwid; unsigned int i, hwid_mcatype; - struct smca_hwid *s_hwid; u32 high, low; u32 smca_config = MSR_AMD64_SMCA_MCx_CONFIG(bank); @@ -286,10 +311,6 @@ static void smca_configure(unsigned int bank, unsigned int cpu) smca_set_misc_banks_map(bank, cpu); - /* Return early if this bank was already initialized. */ - if (smca_banks[bank].hwid && smca_banks[bank].hwid->hwid_mcatype != 0) - return; - if (rdmsr_safe(MSR_AMD64_SMCA_MCx_IPID(bank), &low, &high)) { pr_warn("Failed to read MCA_IPID for bank %d\n", bank); return; @@ -300,10 +321,11 @@ static void smca_configure(unsigned int bank, unsigned int cpu) for (i = 0; i < ARRAY_SIZE(smca_hwid_mcatypes); i++) { s_hwid = &smca_hwid_mcatypes[i]; + if (hwid_mcatype == s_hwid->hwid_mcatype) { - smca_banks[bank].hwid = s_hwid; - smca_banks[bank].id = low; - smca_banks[bank].sysfs_id = s_hwid->count++; + this_cpu_ptr(smca_banks)[bank].hwid = s_hwid; + this_cpu_ptr(smca_banks)[bank].id = low; + this_cpu_ptr(smca_banks)[bank].sysfs_id = bank_counts[s_hwid->bank_type]++; break; } } @@ -401,7 +423,7 @@ static void threshold_restart_bank(void *_tr) u32 hi, lo; /* sysfs write might race against an offline operation */ - if (this_cpu_read(threshold_banks)) + if (!this_cpu_read(threshold_banks) && !tr->set_lvt_off) return; rdmsr(tr->b->address, lo, hi); @@ -589,7 +611,7 @@ out: bool amd_filter_mce(struct mce *m) { - enum smca_bank_types bank_type = smca_get_bank_type(m->bank); + enum smca_bank_types bank_type = smca_get_bank_type(m->extcpu, m->bank); struct cpuinfo_x86 *c = &boot_cpu_data; /* See Family 17h Models 10h-2Fh Erratum #1114. */ @@ -627,7 +649,7 @@ static void disable_err_thresholding(struct cpuinfo_x86 *c, unsigned int bank) } else if (c->x86 == 0x17 && (c->x86_model >= 0x10 && c->x86_model <= 0x2F)) { - if (smca_get_bank_type(bank) != SMCA_IF) + if (smca_get_bank_type(smp_processor_id(), bank) != SMCA_IF) return; msrs[0] = MSR_AMD64_SMCA_MCx_MISC(bank); @@ -689,213 +711,13 @@ void mce_amd_feature_init(struct cpuinfo_x86 *c) deferred_error_interrupt_enable(c); } -int umc_normaddr_to_sysaddr(u64 norm_addr, u16 nid, u8 umc, u64 *sys_addr) -{ - u64 dram_base_addr, dram_limit_addr, dram_hole_base; - /* We start from the normalized address */ - u64 ret_addr = norm_addr; - - u32 tmp; - - u8 die_id_shift, die_id_mask, socket_id_shift, socket_id_mask; - u8 intlv_num_dies, intlv_num_chan, intlv_num_sockets; - u8 intlv_addr_sel, intlv_addr_bit; - u8 num_intlv_bits, hashed_bit; - u8 lgcy_mmio_hole_en, base = 0; - u8 cs_mask, cs_id = 0; - bool hash_enabled = false; - - /* Read D18F0x1B4 (DramOffset), check if base 1 is used. */ - if (amd_df_indirect_read(nid, 0, 0x1B4, umc, &tmp)) - goto out_err; - - /* Remove HiAddrOffset from normalized address, if enabled: */ - if (tmp & BIT(0)) { - u64 hi_addr_offset = (tmp & GENMASK_ULL(31, 20)) << 8; - - if (norm_addr >= hi_addr_offset) { - ret_addr -= hi_addr_offset; - base = 1; - } - } - - /* Read D18F0x110 (DramBaseAddress). */ - if (amd_df_indirect_read(nid, 0, 0x110 + (8 * base), umc, &tmp)) - goto out_err; - - /* Check if address range is valid. */ - if (!(tmp & BIT(0))) { - pr_err("%s: Invalid DramBaseAddress range: 0x%x.\n", - __func__, tmp); - goto out_err; - } - - lgcy_mmio_hole_en = tmp & BIT(1); - intlv_num_chan = (tmp >> 4) & 0xF; - intlv_addr_sel = (tmp >> 8) & 0x7; - dram_base_addr = (tmp & GENMASK_ULL(31, 12)) << 16; - - /* {0, 1, 2, 3} map to address bits {8, 9, 10, 11} respectively */ - if (intlv_addr_sel > 3) { - pr_err("%s: Invalid interleave address select %d.\n", - __func__, intlv_addr_sel); - goto out_err; - } - - /* Read D18F0x114 (DramLimitAddress). */ - if (amd_df_indirect_read(nid, 0, 0x114 + (8 * base), umc, &tmp)) - goto out_err; - - intlv_num_sockets = (tmp >> 8) & 0x1; - intlv_num_dies = (tmp >> 10) & 0x3; - dram_limit_addr = ((tmp & GENMASK_ULL(31, 12)) << 16) | GENMASK_ULL(27, 0); - - intlv_addr_bit = intlv_addr_sel + 8; - - /* Re-use intlv_num_chan by setting it equal to log2(#channels) */ - switch (intlv_num_chan) { - case 0: intlv_num_chan = 0; break; - case 1: intlv_num_chan = 1; break; - case 3: intlv_num_chan = 2; break; - case 5: intlv_num_chan = 3; break; - case 7: intlv_num_chan = 4; break; - - case 8: intlv_num_chan = 1; - hash_enabled = true; - break; - default: - pr_err("%s: Invalid number of interleaved channels %d.\n", - __func__, intlv_num_chan); - goto out_err; - } - - num_intlv_bits = intlv_num_chan; - - if (intlv_num_dies > 2) { - pr_err("%s: Invalid number of interleaved nodes/dies %d.\n", - __func__, intlv_num_dies); - goto out_err; - } - - num_intlv_bits += intlv_num_dies; - - /* Add a bit if sockets are interleaved. */ - num_intlv_bits += intlv_num_sockets; - - /* Assert num_intlv_bits <= 4 */ - if (num_intlv_bits > 4) { - pr_err("%s: Invalid interleave bits %d.\n", - __func__, num_intlv_bits); - goto out_err; - } - - if (num_intlv_bits > 0) { - u64 temp_addr_x, temp_addr_i, temp_addr_y; - u8 die_id_bit, sock_id_bit, cs_fabric_id; - - /* - * Read FabricBlockInstanceInformation3_CS[BlockFabricID]. - * This is the fabric id for this coherent slave. Use - * umc/channel# as instance id of the coherent slave - * for FICAA. - */ - if (amd_df_indirect_read(nid, 0, 0x50, umc, &tmp)) - goto out_err; - - cs_fabric_id = (tmp >> 8) & 0xFF; - die_id_bit = 0; - - /* If interleaved over more than 1 channel: */ - if (intlv_num_chan) { - die_id_bit = intlv_num_chan; - cs_mask = (1 << die_id_bit) - 1; - cs_id = cs_fabric_id & cs_mask; - } - - sock_id_bit = die_id_bit; - - /* Read D18F1x208 (SystemFabricIdMask). */ - if (intlv_num_dies || intlv_num_sockets) - if (amd_df_indirect_read(nid, 1, 0x208, umc, &tmp)) - goto out_err; - - /* If interleaved over more than 1 die. */ - if (intlv_num_dies) { - sock_id_bit = die_id_bit + intlv_num_dies; - die_id_shift = (tmp >> 24) & 0xF; - die_id_mask = (tmp >> 8) & 0xFF; - - cs_id |= ((cs_fabric_id & die_id_mask) >> die_id_shift) << die_id_bit; - } - - /* If interleaved over more than 1 socket. */ - if (intlv_num_sockets) { - socket_id_shift = (tmp >> 28) & 0xF; - socket_id_mask = (tmp >> 16) & 0xFF; - - cs_id |= ((cs_fabric_id & socket_id_mask) >> socket_id_shift) << sock_id_bit; - } - - /* - * The pre-interleaved address consists of XXXXXXIIIYYYYY - * where III is the ID for this CS, and XXXXXXYYYYY are the - * address bits from the post-interleaved address. - * "num_intlv_bits" has been calculated to tell us how many "I" - * bits there are. "intlv_addr_bit" tells us how many "Y" bits - * there are (where "I" starts). - */ - temp_addr_y = ret_addr & GENMASK_ULL(intlv_addr_bit-1, 0); - temp_addr_i = (cs_id << intlv_addr_bit); - temp_addr_x = (ret_addr & GENMASK_ULL(63, intlv_addr_bit)) << num_intlv_bits; - ret_addr = temp_addr_x | temp_addr_i | temp_addr_y; - } - - /* Add dram base address */ - ret_addr += dram_base_addr; - - /* If legacy MMIO hole enabled */ - if (lgcy_mmio_hole_en) { - if (amd_df_indirect_read(nid, 0, 0x104, umc, &tmp)) - goto out_err; - - dram_hole_base = tmp & GENMASK(31, 24); - if (ret_addr >= dram_hole_base) - ret_addr += (BIT_ULL(32) - dram_hole_base); - } - - if (hash_enabled) { - /* Save some parentheses and grab ls-bit at the end. */ - hashed_bit = (ret_addr >> 12) ^ - (ret_addr >> 18) ^ - (ret_addr >> 21) ^ - (ret_addr >> 30) ^ - cs_id; - - hashed_bit &= BIT(0); - - if (hashed_bit != ((ret_addr >> intlv_addr_bit) & BIT(0))) - ret_addr ^= BIT(intlv_addr_bit); - } - - /* Is calculated system address is above DRAM limit address? */ - if (ret_addr > dram_limit_addr) - goto out_err; - - *sys_addr = ret_addr; - return 0; - -out_err: - return -EINVAL; -} -EXPORT_SYMBOL_GPL(umc_normaddr_to_sysaddr); - bool amd_mce_is_memory_error(struct mce *m) { /* ErrCodeExt[20:16] */ u8 xec = (m->status >> 16) & 0x1f; if (mce_flags.smca) - return smca_get_bank_type(m->bank) == SMCA_UMC && xec == 0x0; + return smca_get_bank_type(m->extcpu, m->bank) == SMCA_UMC && xec == 0x0; return m->bank == 4 && xec == 0x8; } @@ -1171,6 +993,7 @@ static struct attribute *default_attrs[] = { NULL, /* possibly interrupt_enable if supported, see below */ NULL, }; +ATTRIBUTE_GROUPS(default); #define to_block(k) container_of(k, struct threshold_block, kobj) #define to_attr(a) container_of(a, struct threshold_attr, attr) @@ -1207,11 +1030,11 @@ static void threshold_block_release(struct kobject *kobj); static struct kobj_type threshold_ktype = { .sysfs_ops = &threshold_ops, - .default_attrs = default_attrs, + .default_groups = default_groups, .release = threshold_block_release, }; -static const char *get_name(unsigned int bank, struct threshold_block *b) +static const char *get_name(unsigned int cpu, unsigned int bank, struct threshold_block *b) { enum smca_bank_types bank_type; @@ -1222,7 +1045,7 @@ static const char *get_name(unsigned int bank, struct threshold_block *b) return th_names[bank]; } - bank_type = smca_get_bank_type(bank); + bank_type = smca_get_bank_type(cpu, bank); if (bank_type >= N_SMCA_BANK_TYPES) return NULL; @@ -1232,12 +1055,12 @@ static const char *get_name(unsigned int bank, struct threshold_block *b) return NULL; } - if (smca_banks[bank].hwid->count == 1) + if (per_cpu(smca_bank_counts, cpu)[bank_type] == 1) return smca_get_name(bank_type); snprintf(buf_mcatype, MAX_MCATYPE_NAME_LEN, - "%s_%x", smca_get_name(bank_type), - smca_banks[bank].sysfs_id); + "%s_%u", smca_get_name(bank_type), + per_cpu(smca_banks, cpu)[bank].sysfs_id); return buf_mcatype; } @@ -1279,10 +1102,10 @@ static int allocate_threshold_blocks(unsigned int cpu, struct threshold_bank *tb b->threshold_limit = THRESHOLD_MAX; if (b->interrupt_capable) { - threshold_ktype.default_attrs[2] = &interrupt_enable.attr; + default_attrs[2] = &interrupt_enable.attr; b->interrupt_enable = 1; } else { - threshold_ktype.default_attrs[2] = NULL; + default_attrs[2] = NULL; } INIT_LIST_HEAD(&b->miscj); @@ -1293,7 +1116,7 @@ static int allocate_threshold_blocks(unsigned int cpu, struct threshold_bank *tb else tb->blocks = b; - err = kobject_init_and_add(&b->kobj, &threshold_ktype, tb->kobj, get_name(bank, b)); + err = kobject_init_and_add(&b->kobj, &threshold_ktype, tb->kobj, get_name(cpu, bank, b)); if (err) goto out_free; recurse: @@ -1348,7 +1171,7 @@ static int threshold_create_bank(struct threshold_bank **bp, unsigned int cpu, struct device *dev = this_cpu_read(mce_device); struct amd_northbridge *nb = NULL; struct threshold_bank *b = NULL; - const char *name = get_name(bank, NULL); + const char *name = get_name(cpu, bank, NULL); int err = 0; if (!dev) @@ -1471,10 +1294,23 @@ out_free: kfree(bank); } +static void __threshold_remove_device(struct threshold_bank **bp) +{ + unsigned int bank, numbanks = this_cpu_read(mce_num_banks); + + for (bank = 0; bank < numbanks; bank++) { + if (!bp[bank]) + continue; + + threshold_remove_bank(bp[bank]); + bp[bank] = NULL; + } + kfree(bp); +} + int mce_threshold_remove_device(unsigned int cpu) { struct threshold_bank **bp = this_cpu_read(threshold_banks); - unsigned int bank, numbanks = this_cpu_read(mce_num_banks); if (!bp) return 0; @@ -1485,13 +1321,7 @@ int mce_threshold_remove_device(unsigned int cpu) */ this_cpu_write(threshold_banks, NULL); - for (bank = 0; bank < numbanks; bank++) { - if (bp[bank]) { - threshold_remove_bank(bp[bank]); - bp[bank] = NULL; - } - } - kfree(bp); + __threshold_remove_device(bp); return 0; } @@ -1528,15 +1358,14 @@ int mce_threshold_create_device(unsigned int cpu) if (!(this_cpu_read(bank_map) & (1 << bank))) continue; err = threshold_create_bank(bp, cpu, bank); - if (err) - goto out_err; + if (err) { + __threshold_remove_device(bp); + return err; + } } this_cpu_write(threshold_banks, bp); if (thresholding_irq_en) mce_threshold_vector = amd_threshold_interrupt; return 0; -out_err: - mce_threshold_remove_device(cpu); - return err; } diff --git a/arch/x86/kernel/cpu/mce/apei.c b/arch/x86/kernel/cpu/mce/apei.c index 0e3ae64d3b76..8ed341714686 100644 --- a/arch/x86/kernel/cpu/mce/apei.c +++ b/arch/x86/kernel/cpu/mce/apei.c @@ -29,15 +29,26 @@ void apei_mce_report_mem_error(int severity, struct cper_sec_mem_err *mem_err) { struct mce m; + int lsb; if (!(mem_err->validation_bits & CPER_MEM_VALID_PA)) return; + /* + * Even if the ->validation_bits are set for address mask, + * to be extra safe, check and reject an error radius '0', + * and fall back to the default page size. + */ + if (mem_err->validation_bits & CPER_MEM_VALID_PA_MASK) + lsb = find_first_bit((void *)&mem_err->physical_addr_mask, PAGE_SHIFT); + else + lsb = PAGE_SHIFT; + mce_setup(&m); m.bank = -1; /* Fake a memory read error with unknown channel */ m.status = MCI_STATUS_VAL | MCI_STATUS_EN | MCI_STATUS_ADDRV | MCI_STATUS_MISCV | 0x9f; - m.misc = (MCI_MISC_ADDR_PHYS << 6) | PAGE_SHIFT; + m.misc = (MCI_MISC_ADDR_PHYS << 6) | lsb; if (severity >= GHES_SEV_RECOVERABLE) m.status |= MCI_STATUS_UC; @@ -177,16 +188,14 @@ retry: /* no more record */ if (*record_id == APEI_ERST_INVALID_RECORD_ID) goto out; - rc = erst_read(*record_id, &rcd.hdr, sizeof(rcd)); + rc = erst_read_record(*record_id, &rcd.hdr, sizeof(rcd), sizeof(rcd), + &CPER_CREATOR_MCE); /* someone else has cleared the record, try next one */ if (rc == -ENOENT) goto retry; else if (rc < 0) goto out; - /* try to skip other type records in storage */ - else if (rc != sizeof(rcd) || - !guid_equal(&rcd.hdr.creator_id, &CPER_CREATOR_MCE)) - goto retry; + memcpy(m, &rcd.mce, sizeof(*m)); rc = sizeof(*m); out: diff --git a/arch/x86/kernel/cpu/mce/core.c b/arch/x86/kernel/cpu/mce/core.c index 6ed365337a3b..2c8ec5c71712 100644 --- a/arch/x86/kernel/cpu/mce/core.c +++ b/arch/x86/kernel/cpu/mce/core.c @@ -69,7 +69,9 @@ DEFINE_PER_CPU_READ_MOSTLY(unsigned int, mce_num_banks); struct mce_bank { u64 ctl; /* subevents to enable */ - bool init; /* initialise bank? */ + + __u64 init : 1, /* initialise bank? */ + __reserved_1 : 63; }; static DEFINE_PER_CPU_READ_MOSTLY(struct mce_bank[MAX_NR_BANKS], mce_banks_array); @@ -86,20 +88,11 @@ struct mce_vendor_flags mce_flags __read_mostly; struct mca_config mca_cfg __read_mostly = { .bootlog = -1, - /* - * Tolerant levels: - * 0: always panic on uncorrected errors, log corrected errors - * 1: panic or SIGBUS on uncorrected errors, log corrected errors - * 2: SIGBUS or log uncorrected errors (if possible), log corr. errors - * 3: never panic or SIGBUS, log all errors (for testing only) - */ - .tolerant = 1, .monarch_timeout = -1 }; static DEFINE_PER_CPU(struct mce, mces_seen); static unsigned long mce_need_notify; -static int cpu_missing; /* * MCA banks polled by the period polling timer for corrected events. @@ -128,7 +121,7 @@ static struct irq_work mce_irq_work; BLOCKING_NOTIFIER_HEAD(x86_mce_decoder_chain); /* Do initial initialization of a struct mce */ -noinstr void mce_setup(struct mce *m) +void mce_setup(struct mce *m) { memset(m, 0, sizeof(struct mce)); m->cpu = m->extcpu = smp_processor_id(); @@ -139,12 +132,7 @@ noinstr void mce_setup(struct mce *m) m->socketid = cpu_data(m->extcpu).phys_proc_id; m->apicid = cpu_data(m->extcpu).initial_apicid; m->mcgcap = __rdmsr(MSR_IA32_MCG_CAP); - - if (this_cpu_has(X86_FEATURE_INTEL_PPIN)) - m->ppin = __rdmsr(MSR_PPIN); - else if (this_cpu_has(X86_FEATURE_AMD_PPIN)) - m->ppin = __rdmsr(MSR_AMD_PPIN); - + m->ppin = cpu_data(m->extcpu).ppin; m->microcode = boot_cpu_data.microcode; } @@ -174,27 +162,6 @@ void mce_unregister_decode_chain(struct notifier_block *nb) } EXPORT_SYMBOL_GPL(mce_unregister_decode_chain); -u32 mca_msr_reg(int bank, enum mca_msr reg) -{ - if (mce_flags.smca) { - switch (reg) { - case MCA_CTL: return MSR_AMD64_SMCA_MCx_CTL(bank); - case MCA_ADDR: return MSR_AMD64_SMCA_MCx_ADDR(bank); - case MCA_MISC: return MSR_AMD64_SMCA_MCx_MISC(bank); - case MCA_STATUS: return MSR_AMD64_SMCA_MCx_STATUS(bank); - } - } - - switch (reg) { - case MCA_CTL: return MSR_IA32_MCx_CTL(bank); - case MCA_ADDR: return MSR_IA32_MCx_ADDR(bank); - case MCA_MISC: return MSR_IA32_MCx_MISC(bank); - case MCA_STATUS: return MSR_IA32_MCx_STATUS(bank); - } - - return 0; -} - static void __print_mce(struct mce *m) { pr_emerg(HW_ERR "CPU %d: Machine Check%s: %Lx Bank %d: %016Lx\n", @@ -267,11 +234,17 @@ static void wait_for_panic(void) panic("Panicing machine check CPU died"); } -static void mce_panic(const char *msg, struct mce *final, char *exp) +static noinstr void mce_panic(const char *msg, struct mce *final, char *exp) { - int apei_err = 0; struct llist_node *pending; struct mce_evt_llist *l; + int apei_err = 0; + + /* + * Allow instrumentation around external facilities usage. Not that it + * matters a whole lot since the machine is going to panic anyway. + */ + instrumentation_begin(); if (!fake_panic) { /* @@ -286,7 +259,7 @@ static void mce_panic(const char *msg, struct mce *final, char *exp) } else { /* Don't log too much for fake panic */ if (atomic_inc_return(&mce_fake_panicked) > 1) - return; + goto out; } pending = mce_gen_pool_prepare_records(); /* First print corrected ones that are still unlogged */ @@ -314,8 +287,6 @@ static void mce_panic(const char *msg, struct mce *final, char *exp) if (!apei_err) apei_err = apei_write_mce(final); } - if (cpu_missing) - pr_emerg(HW_ERR "Some CPUs didn't answer in synchronization\n"); if (exp) pr_emerg(HW_ERR "Machine check: %s\n", exp); if (!fake_panic) { @@ -324,6 +295,9 @@ static void mce_panic(const char *msg, struct mce *final, char *exp) panic(msg); } else pr_emerg(HW_ERR "Fake kernel panic: %s\n", msg); + +out: + instrumentation_end(); } /* Support code for software error injection */ @@ -365,7 +339,7 @@ void ex_handler_msr_mce(struct pt_regs *regs, bool wrmsr) } /* MSR access wrappers used for error injection */ -static noinstr u64 mce_rdmsrl(u32 msr) +noinstr u64 mce_rdmsrl(u32 msr) { DECLARE_ARGS(val, low, high); @@ -433,9 +407,15 @@ static noinstr void mce_wrmsrl(u32 msr, u64 v) * check into our "mce" struct so that we can use it later to assess * the severity of the problem as we read per-bank specific details. */ -static inline void mce_gather_info(struct mce *m, struct pt_regs *regs) +static noinstr void mce_gather_info(struct mce *m, struct pt_regs *regs) { + /* + * Enable instrumentation around mce_setup() which calls external + * facilities. + */ + instrumentation_begin(); mce_setup(m); + instrumentation_end(); m->mcgstatus = mce_rdmsrl(MSR_IA32_MCG_STATUS); if (regs) { @@ -601,7 +581,7 @@ static int uc_decode_notifier(struct notifier_block *nb, unsigned long val, pfn = mce->addr >> PAGE_SHIFT; if (!memory_failure(pfn, 0)) { - set_mce_nospec(pfn, whole_page(mce)); + set_mce_nospec(pfn); mce->kflags |= MCE_HANDLED_UC; } @@ -636,7 +616,7 @@ static struct notifier_block mce_default_nb = { /* * Read ADDR and MISC registers. */ -static void mce_read_aux(struct mce *m, int i) +static noinstr void mce_read_aux(struct mce *m, int i) { if (m->status & MCI_STATUS_MISCV) m->misc = mce_rdmsrl(mca_msr_reg(i, MCA_MISC)); @@ -762,7 +742,7 @@ log_it: goto clear_it; mce_read_aux(&m, i); - m.severity = mce_severity(&m, NULL, mca_cfg.tolerant, NULL, false); + m.severity = mce_severity(&m, NULL, NULL, false); /* * Don't get the IP here because it's unlikely to * have anything to do with the actual error location. @@ -802,7 +782,8 @@ EXPORT_SYMBOL_GPL(machine_check_poll); * the severity assessment code. Pretend that EIPV was set, and take the * ip/cs values from the pt_regs that mce_gather_info() ignored earlier. */ -static void quirk_sandybridge_ifu(int bank, struct mce *m, struct pt_regs *regs) +static __always_inline void +quirk_sandybridge_ifu(int bank, struct mce *m, struct pt_regs *regs) { if (bank != 0) return; @@ -823,11 +804,64 @@ static void quirk_sandybridge_ifu(int bank, struct mce *m, struct pt_regs *regs) } /* + * Disable fast string copy and return from the MCE handler upon the first SRAR + * MCE on bank 1 due to a CPU erratum on Intel Skylake/Cascade Lake/Cooper Lake + * CPUs. + * The fast string copy instructions ("REP; MOVS*") could consume an + * uncorrectable memory error in the cache line _right after_ the desired region + * to copy and raise an MCE with RIP pointing to the instruction _after_ the + * "REP; MOVS*". + * This mitigation addresses the issue completely with the caveat of performance + * degradation on the CPU affected. This is still better than the OS crashing on + * MCEs raised on an irrelevant process due to "REP; MOVS*" accesses from a + * kernel context (e.g., copy_page). + * + * Returns true when fast string copy on CPU has been disabled. + */ +static noinstr bool quirk_skylake_repmov(void) +{ + u64 mcgstatus = mce_rdmsrl(MSR_IA32_MCG_STATUS); + u64 misc_enable = mce_rdmsrl(MSR_IA32_MISC_ENABLE); + u64 mc1_status; + + /* + * Apply the quirk only to local machine checks, i.e., no broadcast + * sync is needed. + */ + if (!(mcgstatus & MCG_STATUS_LMCES) || + !(misc_enable & MSR_IA32_MISC_ENABLE_FAST_STRING)) + return false; + + mc1_status = mce_rdmsrl(MSR_IA32_MCx_STATUS(1)); + + /* Check for a software-recoverable data fetch error. */ + if ((mc1_status & + (MCI_STATUS_VAL | MCI_STATUS_OVER | MCI_STATUS_UC | MCI_STATUS_EN | + MCI_STATUS_ADDRV | MCI_STATUS_MISCV | MCI_STATUS_PCC | + MCI_STATUS_AR | MCI_STATUS_S)) == + (MCI_STATUS_VAL | MCI_STATUS_UC | MCI_STATUS_EN | + MCI_STATUS_ADDRV | MCI_STATUS_MISCV | + MCI_STATUS_AR | MCI_STATUS_S)) { + misc_enable &= ~MSR_IA32_MISC_ENABLE_FAST_STRING; + mce_wrmsrl(MSR_IA32_MISC_ENABLE, misc_enable); + mce_wrmsrl(MSR_IA32_MCx_STATUS(1), 0); + + instrumentation_begin(); + pr_err_once("Erratum detected, disable fast string copy instructions.\n"); + instrumentation_end(); + + return true; + } + + return false; +} + +/* * Do a quick check if any of the events requires a panic. * This decides if we keep the events around or clear them. */ -static int mce_no_way_out(struct mce *m, char **msg, unsigned long *validp, - struct pt_regs *regs) +static __always_inline int mce_no_way_out(struct mce *m, char **msg, unsigned long *validp, + struct pt_regs *regs) { char *tmp = *msg; int i; @@ -837,12 +871,12 @@ static int mce_no_way_out(struct mce *m, char **msg, unsigned long *validp, if (!(m->status & MCI_STATUS_VAL)) continue; - __set_bit(i, validp); + arch___set_bit(i, validp); if (mce_flags.snb_ifu_quirk) quirk_sandybridge_ifu(i, m, regs); m->bank = i; - if (mce_severity(m, regs, mca_cfg.tolerant, &tmp, true) >= MCE_PANIC_SEVERITY) { + if (mce_severity(m, regs, &tmp, true) >= MCE_PANIC_SEVERITY) { mce_read_aux(m, i); *msg = tmp; return 1; @@ -871,8 +905,13 @@ static cpumask_t mce_missing_cpus = CPU_MASK_ALL; /* * Check if a timeout waiting for other CPUs happened. */ -static int mce_timed_out(u64 *t, const char *msg) +static noinstr int mce_timed_out(u64 *t, const char *msg) { + int ret = 0; + + /* Enable instrumentation around calls to external facilities */ + instrumentation_begin(); + /* * The others already did panic for some reason. * Bail out like in a timeout. @@ -885,19 +924,22 @@ static int mce_timed_out(u64 *t, const char *msg) if (!mca_cfg.monarch_timeout) goto out; if ((s64)*t < SPINUNIT) { - if (mca_cfg.tolerant <= 1) { - if (cpumask_and(&mce_missing_cpus, cpu_online_mask, &mce_missing_cpus)) - pr_emerg("CPUs not responding to MCE broadcast (may include false positives): %*pbl\n", - cpumask_pr_args(&mce_missing_cpus)); - mce_panic(msg, NULL, NULL); - } - cpu_missing = 1; - return 1; + if (cpumask_and(&mce_missing_cpus, cpu_online_mask, &mce_missing_cpus)) + pr_emerg("CPUs not responding to MCE broadcast (may include false positives): %*pbl\n", + cpumask_pr_args(&mce_missing_cpus)); + mce_panic(msg, NULL, NULL); + + ret = 1; + goto out; } *t -= SPINUNIT; + out: touch_nmi_watchdog(); - return 0; + + instrumentation_end(); + + return ret; } /* @@ -950,9 +992,9 @@ static void mce_reign(void) * This dumps all the mces in the log buffer and stops the * other CPUs. */ - if (m && global_worst >= MCE_PANIC_SEVERITY && mca_cfg.tolerant < 3) { + if (m && global_worst >= MCE_PANIC_SEVERITY) { /* call mce_severity() to get "msg" for panic */ - mce_severity(m, NULL, mca_cfg.tolerant, &msg, true); + mce_severity(m, NULL, &msg, true); mce_panic("Fatal machine check", m, msg); } @@ -966,7 +1008,7 @@ static void mce_reign(void) * No machine check event found. Must be some external * source or one CPU is hung. Panic. */ - if (global_worst <= MCE_KEEP_SEVERITY && mca_cfg.tolerant < 3) + if (global_worst <= MCE_KEEP_SEVERITY) mce_panic("Fatal machine check from unknown source", NULL, NULL); /* @@ -986,31 +1028,33 @@ static atomic_t global_nwo; * in the entry order. * TBD double check parallel CPU hotunplug */ -static int mce_start(int *no_way_out) +static noinstr int mce_start(int *no_way_out) { - int order; - int cpus = num_online_cpus(); u64 timeout = (u64)mca_cfg.monarch_timeout * NSEC_PER_USEC; + int order, ret = -1; if (!timeout) - return -1; + return ret; - atomic_add(*no_way_out, &global_nwo); + arch_atomic_add(*no_way_out, &global_nwo); /* * Rely on the implied barrier below, such that global_nwo * is updated before mce_callin. */ - order = atomic_inc_return(&mce_callin); - cpumask_clear_cpu(smp_processor_id(), &mce_missing_cpus); + order = arch_atomic_inc_return(&mce_callin); + arch_cpumask_clear_cpu(smp_processor_id(), &mce_missing_cpus); + + /* Enable instrumentation around calls to external facilities */ + instrumentation_begin(); /* * Wait for everyone. */ - while (atomic_read(&mce_callin) != cpus) { + while (arch_atomic_read(&mce_callin) != num_online_cpus()) { if (mce_timed_out(&timeout, "Timeout: Not all CPUs entered broadcast exception handler")) { - atomic_set(&global_nwo, 0); - return -1; + arch_atomic_set(&global_nwo, 0); + goto out; } ndelay(SPINUNIT); } @@ -1024,7 +1068,7 @@ static int mce_start(int *no_way_out) /* * Monarch: Starts executing now, the others wait. */ - atomic_set(&mce_executing, 1); + arch_atomic_set(&mce_executing, 1); } else { /* * Subject: Now start the scanning loop one by one in @@ -1032,11 +1076,11 @@ static int mce_start(int *no_way_out) * This way when there are any shared banks it will be * only seen by one CPU before cleared, avoiding duplicates. */ - while (atomic_read(&mce_executing) < order) { + while (arch_atomic_read(&mce_executing) < order) { if (mce_timed_out(&timeout, "Timeout: Subject CPUs unable to finish machine check processing")) { - atomic_set(&global_nwo, 0); - return -1; + arch_atomic_set(&global_nwo, 0); + goto out; } ndelay(SPINUNIT); } @@ -1045,19 +1089,27 @@ static int mce_start(int *no_way_out) /* * Cache the global no_way_out state. */ - *no_way_out = atomic_read(&global_nwo); + *no_way_out = arch_atomic_read(&global_nwo); + + ret = order; - return order; +out: + instrumentation_end(); + + return ret; } /* * Synchronize between CPUs after main scanning loop. * This invokes the bulk of the Monarch processing. */ -static int mce_end(int order) +static noinstr int mce_end(int order) { - int ret = -1; u64 timeout = (u64)mca_cfg.monarch_timeout * NSEC_PER_USEC; + int ret = -1; + + /* Allow instrumentation around external facilities. */ + instrumentation_begin(); if (!timeout) goto reset; @@ -1070,14 +1122,11 @@ static int mce_end(int order) atomic_inc(&mce_executing); if (order == 1) { - /* CHECKME: Can this race with a parallel hotplug? */ - int cpus = num_online_cpus(); - /* * Monarch: Wait for everyone to go through their scanning * loops. */ - while (atomic_read(&mce_executing) <= cpus) { + while (atomic_read(&mce_executing) <= num_online_cpus()) { if (mce_timed_out(&timeout, "Timeout: Monarch CPU unable to finish machine check processing")) goto reset; @@ -1101,7 +1150,8 @@ static int mce_end(int order) /* * Don't reset anything. That's done by the Monarch. */ - return 0; + ret = 0; + goto out; } /* @@ -1117,15 +1167,19 @@ reset: * Let others run again. */ atomic_set(&mce_executing, 0); + +out: + instrumentation_end(); + return ret; } -static void mce_clear_state(unsigned long *toclear) +static __always_inline void mce_clear_state(unsigned long *toclear) { int i; for (i = 0; i < this_cpu_read(mce_num_banks); i++) { - if (test_bit(i, toclear)) + if (arch_test_bit(i, toclear)) mce_wrmsrl(mca_msr_reg(i, MCA_STATUS), 0); } } @@ -1165,17 +1219,18 @@ static noinstr bool mce_check_crashing_cpu(void) return false; } -static void __mc_scan_banks(struct mce *m, struct pt_regs *regs, struct mce *final, - unsigned long *toclear, unsigned long *valid_banks, - int no_way_out, int *worst) +static __always_inline int +__mc_scan_banks(struct mce *m, struct pt_regs *regs, struct mce *final, + unsigned long *toclear, unsigned long *valid_banks, int no_way_out, + int *worst) { struct mce_bank *mce_banks = this_cpu_ptr(mce_banks_array); struct mca_config *cfg = &mca_cfg; - int severity, i; + int severity, i, taint = 0; for (i = 0; i < this_cpu_read(mce_num_banks); i++) { - __clear_bit(i, toclear); - if (!test_bit(i, valid_banks)) + arch___clear_bit(i, toclear); + if (!arch_test_bit(i, valid_banks)) continue; if (!mce_banks[i].ctl) @@ -1198,9 +1253,9 @@ static void __mc_scan_banks(struct mce *m, struct pt_regs *regs, struct mce *fin continue; /* Set taint even when machine check was not enabled. */ - add_taint(TAINT_MACHINE_CHECK, LOCKDEP_NOW_UNRELIABLE); + taint++; - severity = mce_severity(m, regs, cfg->tolerant, NULL, true); + severity = mce_severity(m, regs, NULL, true); /* * When machine check was for corrected/deferred handler don't @@ -1210,7 +1265,7 @@ static void __mc_scan_banks(struct mce *m, struct pt_regs *regs, struct mce *fin severity == MCE_UCNA_SEVERITY) && !no_way_out) continue; - __set_bit(i, toclear); + arch___set_bit(i, toclear); /* Machine check event was not enabled. Clear, but ignore. */ if (severity == MCE_NO_SEVERITY) @@ -1221,7 +1276,13 @@ static void __mc_scan_banks(struct mce *m, struct pt_regs *regs, struct mce *fin /* assuming valid severity level != 0 */ m->severity = severity; + /* + * Enable instrumentation around the mce_log() call which is + * done in #MC context, where instrumentation is disabled. + */ + instrumentation_begin(); mce_log(m); + instrumentation_end(); if (severity > *worst) { *final = *m; @@ -1231,6 +1292,8 @@ static void __mc_scan_banks(struct mce *m, struct pt_regs *regs, struct mce *fin /* mce_clear_state will clear *final, save locally for use later */ *m = *final; + + return taint; } static void kill_me_now(struct callback_head *ch) @@ -1255,17 +1318,19 @@ static void kill_me_maybe(struct callback_head *cb) ret = memory_failure(p->mce_addr >> PAGE_SHIFT, flags); if (!ret) { - set_mce_nospec(p->mce_addr >> PAGE_SHIFT, p->mce_whole_page); + set_mce_nospec(p->mce_addr >> PAGE_SHIFT); sync_core(); return; } /* * -EHWPOISON from memory_failure() means that it already sent SIGBUS - * to the current process with the proper error info, so no need to - * send SIGBUS here again. + * to the current process with the proper error info, + * -EOPNOTSUPP means hwpoison_filter() filtered the error event, + * + * In both cases, no further processing is required. */ - if (ret == -EHWPOISON) + if (ret == -EHWPOISON || ret == -EOPNOTSUPP) return; pr_err("Memory error not recovered"); @@ -1279,7 +1344,7 @@ static void kill_me_never(struct callback_head *cb) p->mce_count = 0; pr_err("Kernel accessed poison in user space at %llx\n", p->mce_addr); if (!memory_failure(p->mce_addr >> PAGE_SHIFT, 0)) - set_mce_nospec(p->mce_addr >> PAGE_SHIFT, p->mce_whole_page); + set_mce_nospec(p->mce_addr >> PAGE_SHIFT); } static void queue_task_work(struct mce *m, char *msg, void (*func)(struct callback_head *)) @@ -1320,11 +1385,11 @@ static noinstr void unexpected_machine_check(struct pt_regs *regs) } /* - * The actual machine check handler. This only handles real - * exceptions when something got corrupted coming in through int 18. + * The actual machine check handler. This only handles real exceptions when + * something got corrupted coming in through int 18. * - * This is executed in NMI context not subject to normal locking rules. This - * implies that most kernel services cannot be safely used. Don't even + * This is executed in #MC context not subject to normal locking rules. + * This implies that most kernel services cannot be safely used. Don't even * think about putting a printk in there! * * On Intel systems this is entered on all CPUs in parallel through @@ -1336,13 +1401,20 @@ static noinstr void unexpected_machine_check(struct pt_regs *regs) * issues: if the machine check was due to a failure of the memory * backing the user stack, tracing that reads the user stack will cause * potentially infinite recursion. + * + * Currently, the #MC handler calls out to a number of external facilities + * and, therefore, allows instrumentation around them. The optimal thing to + * have would be to do the absolutely minimal work required in #MC context + * and have instrumentation disabled only around that. Further processing can + * then happen in process context where instrumentation is allowed. Achieving + * that requires careful auditing and modifications. Until then, the code + * allows instrumentation temporarily, where required. * */ noinstr void do_machine_check(struct pt_regs *regs) { - int worst = 0, order, no_way_out, kill_current_task, lmce; - DECLARE_BITMAP(valid_banks, MAX_NR_BANKS); - DECLARE_BITMAP(toclear, MAX_NR_BANKS); - struct mca_config *cfg = &mca_cfg; + int worst = 0, order, no_way_out, kill_current_task, lmce, taint = 0; + DECLARE_BITMAP(valid_banks, MAX_NR_BANKS) = { 0 }; + DECLARE_BITMAP(toclear, MAX_NR_BANKS) = { 0 }; struct mce m, *final; char *msg = NULL; @@ -1353,6 +1425,9 @@ noinstr void do_machine_check(struct pt_regs *regs) else if (unlikely(!mca_cfg.initialized)) return unexpected_machine_check(regs); + if (mce_flags.skx_repmov_quirk && quirk_skylake_repmov()) + goto clear; + /* * Establish sequential order between the CPUs entering the machine * check handler. @@ -1361,7 +1436,7 @@ noinstr void do_machine_check(struct pt_regs *regs) /* * If no_way_out gets set, there is no safe way to recover from this - * MCE. If mca_cfg.tolerant is cranked up, we'll try anyway. + * MCE. */ no_way_out = 0; @@ -1385,7 +1460,6 @@ noinstr void do_machine_check(struct pt_regs *regs) final = this_cpu_ptr(&mces_seen); *final = m; - memset(valid_banks, 0, sizeof(valid_banks)); no_way_out = mce_no_way_out(&m, &msg, valid_banks, regs); barrier(); @@ -1396,7 +1470,7 @@ noinstr void do_machine_check(struct pt_regs *regs) * severity is MCE_AR_SEVERITY we have other options. */ if (!(m.mcgstatus & MCG_STATUS_RIPV)) - kill_current_task = (cfg->tolerant == 3) ? 0 : 1; + kill_current_task = 1; /* * Check if this MCE is signaled to only this logical processor, * on Intel, Zhaoxin only. @@ -1413,13 +1487,13 @@ noinstr void do_machine_check(struct pt_regs *regs) * to see it will clear it. */ if (lmce) { - if (no_way_out && cfg->tolerant < 3) + if (no_way_out) mce_panic("Fatal local machine check", &m, msg); } else { order = mce_start(&no_way_out); } - __mc_scan_banks(&m, regs, final, toclear, valid_banks, no_way_out, &worst); + taint = __mc_scan_banks(&m, regs, final, toclear, valid_banks, no_way_out, &worst); if (!no_way_out) mce_clear_state(toclear); @@ -1433,7 +1507,7 @@ noinstr void do_machine_check(struct pt_regs *regs) if (!no_way_out) no_way_out = worst >= MCE_PANIC_SEVERITY; - if (no_way_out && cfg->tolerant < 3) + if (no_way_out) mce_panic("Fatal machine check on current CPU", &m, msg); } } else { @@ -1445,12 +1519,22 @@ noinstr void do_machine_check(struct pt_regs *regs) * fatal error. We call "mce_severity()" again to * make sure we have the right "msg". */ - if (worst >= MCE_PANIC_SEVERITY && mca_cfg.tolerant < 3) { - mce_severity(&m, regs, cfg->tolerant, &msg, true); + if (worst >= MCE_PANIC_SEVERITY) { + mce_severity(&m, regs, &msg, true); mce_panic("Local fatal machine check!", &m, msg); } } + /* + * Enable instrumentation around the external facilities like task_work_add() + * (via queue_task_work()), fixup_exception() etc. For now, that is. Fixing this + * properly would need a lot more involved reorganization. + */ + instrumentation_begin(); + + if (taint) + add_taint(TAINT_MACHINE_CHECK, LOCKDEP_NOW_UNRELIABLE); + if (worst != MCE_AR_SEVERITY && !kill_current_task) goto out; @@ -1482,7 +1566,11 @@ noinstr void do_machine_check(struct pt_regs *regs) if (m.kflags & MCE_IN_KERNEL_COPYIN) queue_task_work(&m, msg, kill_me_never); } + out: + instrumentation_end(); + +clear: mce_wrmsrl(MSR_IA32_MCG_STATUS, 0); } EXPORT_SYMBOL_GPL(do_machine_check); @@ -1796,6 +1884,13 @@ static int __mcheck_cpu_apply_quirks(struct cpuinfo_x86 *c) if (c->x86 == 6 && c->x86_model == 45) mce_flags.snb_ifu_quirk = 1; + + /* + * Skylake, Cascacde Lake and Cooper Lake require a quirk on + * rep movs. + */ + if (c->x86 == 6 && c->x86_model == INTEL_FAM6_SKYLAKE_X) + mce_flags.skx_repmov_quirk = 1; } if (c->x86_vendor == X86_VENDOR_ZHAOXIN) { @@ -2161,10 +2256,9 @@ static int __init mcheck_enable(char *str) cfg->bios_cmci_threshold = 1; else if (!strcmp(str, "recovery")) cfg->recovery = 1; - else if (isdigit(str[0])) { - if (get_option(&str, &cfg->tolerant) == 2) - get_option(&str, &(cfg->monarch_timeout)); - } else { + else if (isdigit(str[0])) + get_option(&str, &(cfg->monarch_timeout)); + else { pr_info("mce argument %s ignored. Please use /sys\n", str); return 0; } @@ -2414,7 +2508,6 @@ static ssize_t store_int_with_restart(struct device *s, return ret; } -static DEVICE_INT_ATTR(tolerant, 0644, mca_cfg.tolerant); static DEVICE_INT_ATTR(monarch_timeout, 0644, mca_cfg.monarch_timeout); static DEVICE_BOOL_ATTR(dont_log_ce, 0644, mca_cfg.dont_log_ce); static DEVICE_BOOL_ATTR(print_all, 0644, mca_cfg.print_all); @@ -2435,7 +2528,6 @@ static struct dev_ext_attribute dev_attr_cmci_disabled = { }; static struct device_attribute *mce_device_attrs[] = { - &dev_attr_tolerant.attr, &dev_attr_check_interval.attr, #ifdef CONFIG_X86_MCELOG_LEGACY &dev_attr_trigger, @@ -2702,7 +2794,6 @@ struct dentry *mce_get_debugfs_dir(void) static void mce_reset(void) { - cpu_missing = 0; atomic_set(&mce_fake_panicked, 0); atomic_set(&mce_executing, 0); atomic_set(&mce_callin, 0); diff --git a/arch/x86/kernel/cpu/mce/inject.c b/arch/x86/kernel/cpu/mce/inject.c index 0bfc14041bbb..12cf2e7ca33c 100644 --- a/arch/x86/kernel/cpu/mce/inject.c +++ b/arch/x86/kernel/cpu/mce/inject.c @@ -33,6 +33,8 @@ #include "internal.h" +static bool hw_injection_possible = true; + /* * Collect all the MCi_XXX settings */ @@ -74,7 +76,6 @@ MCE_INJECT_SET(status); MCE_INJECT_SET(misc); MCE_INJECT_SET(addr); MCE_INJECT_SET(synd); -MCE_INJECT_SET(ipid); #define MCE_INJECT_GET(reg) \ static int inj_##reg##_get(void *data, u64 *val) \ @@ -95,6 +96,20 @@ DEFINE_SIMPLE_ATTRIBUTE(status_fops, inj_status_get, inj_status_set, "%llx\n"); DEFINE_SIMPLE_ATTRIBUTE(misc_fops, inj_misc_get, inj_misc_set, "%llx\n"); DEFINE_SIMPLE_ATTRIBUTE(addr_fops, inj_addr_get, inj_addr_set, "%llx\n"); DEFINE_SIMPLE_ATTRIBUTE(synd_fops, inj_synd_get, inj_synd_set, "%llx\n"); + +/* Use the user provided IPID value on a sw injection. */ +static int inj_ipid_set(void *data, u64 val) +{ + struct mce *m = (struct mce *)data; + + if (cpu_feature_enabled(X86_FEATURE_SMCA)) { + if (inj_type == SW_INJ) + m->ipid = val; + } + + return 0; +} + DEFINE_SIMPLE_ATTRIBUTE(ipid_fops, inj_ipid_get, inj_ipid_set, "%llx\n"); static void setup_inj_struct(struct mce *m) @@ -326,6 +341,8 @@ static int __set_inj(const char *buf) for (i = 0; i < N_INJ_TYPES; i++) { if (!strncmp(flags_options[i], buf, strlen(flags_options[i]))) { + if (i > SW_INJ && !hw_injection_possible) + continue; inj_type = i; return 0; } @@ -350,7 +367,7 @@ static ssize_t flags_write(struct file *filp, const char __user *ubuf, char buf[MAX_FLAG_OPT_SIZE], *__buf; int err; - if (cnt > MAX_FLAG_OPT_SIZE) + if (!cnt || cnt > MAX_FLAG_OPT_SIZE) return -EINVAL; if (copy_from_user(&buf, ubuf, cnt)) @@ -490,6 +507,8 @@ static void do_inject(void) i_mce.tsc = rdtsc_ordered(); + i_mce.status |= MCI_STATUS_VAL; + if (i_mce.misc) i_mce.status |= MCI_STATUS_MISCV; @@ -577,6 +596,33 @@ static int inj_bank_set(void *data, u64 val) } m->bank = val; + + /* + * sw-only injection allows to write arbitrary values into the MCA + * registers because it tests only the decoding paths. + */ + if (inj_type == SW_INJ) + goto inject; + + /* + * Read IPID value to determine if a bank is populated on the target + * CPU. + */ + if (cpu_feature_enabled(X86_FEATURE_SMCA)) { + u64 ipid; + + if (rdmsrl_on_cpu(m->extcpu, MSR_AMD64_SMCA_MCx_IPID(val), &ipid)) { + pr_err("Error reading IPID on CPU%d\n", m->extcpu); + return -EINVAL; + } + + if (!ipid) { + pr_err("Cannot inject into unpopulated bank %llu\n", val); + return -ENODEV; + } + } + +inject: do_inject(); /* Reset injection struct */ @@ -675,11 +721,54 @@ static void __init debugfs_init(void) &i_mce, dfs_fls[i].fops); } +static void check_hw_inj_possible(void) +{ + int cpu; + u8 bank; + + /* + * This behavior exists only on SMCA systems though its not directly + * related to SMCA. + */ + if (!cpu_feature_enabled(X86_FEATURE_SMCA)) + return; + + cpu = get_cpu(); + + for (bank = 0; bank < MAX_NR_BANKS; ++bank) { + u64 status = MCI_STATUS_VAL, ipid; + + /* Check whether bank is populated */ + rdmsrl(MSR_AMD64_SMCA_MCx_IPID(bank), ipid); + if (!ipid) + continue; + + toggle_hw_mce_inject(cpu, true); + + wrmsrl_safe(mca_msr_reg(bank, MCA_STATUS), status); + rdmsrl_safe(mca_msr_reg(bank, MCA_STATUS), &status); + + if (!status) { + hw_injection_possible = false; + pr_warn("Platform does not allow *hardware* error injection." + "Try using APEI EINJ instead.\n"); + } + + toggle_hw_mce_inject(cpu, false); + + break; + } + + put_cpu(); +} + static int __init inject_init(void) { if (!alloc_cpumask_var(&mce_inject_cpumask, GFP_KERNEL)) return -ENOMEM; + check_hw_inj_possible(); + debugfs_init(); register_nmi_handler(NMI_LOCAL, mce_raise_notify, 0, "mce_notify"); diff --git a/arch/x86/kernel/cpu/mce/intel.c b/arch/x86/kernel/cpu/mce/intel.c index bb9a46a804bf..95275a5e57e0 100644 --- a/arch/x86/kernel/cpu/mce/intel.c +++ b/arch/x86/kernel/cpu/mce/intel.c @@ -470,46 +470,6 @@ void intel_clear_lmce(void) wrmsrl(MSR_IA32_MCG_EXT_CTL, val); } -static void intel_ppin_init(struct cpuinfo_x86 *c) -{ - unsigned long long val; - - /* - * Even if testing the presence of the MSR would be enough, we don't - * want to risk the situation where other models reuse this MSR for - * other purposes. - */ - switch (c->x86_model) { - case INTEL_FAM6_IVYBRIDGE_X: - case INTEL_FAM6_HASWELL_X: - case INTEL_FAM6_BROADWELL_D: - case INTEL_FAM6_BROADWELL_X: - case INTEL_FAM6_SKYLAKE_X: - case INTEL_FAM6_ICELAKE_X: - case INTEL_FAM6_SAPPHIRERAPIDS_X: - case INTEL_FAM6_XEON_PHI_KNL: - case INTEL_FAM6_XEON_PHI_KNM: - - if (rdmsrl_safe(MSR_PPIN_CTL, &val)) - return; - - if ((val & 3UL) == 1UL) { - /* PPIN locked in disabled mode */ - return; - } - - /* If PPIN is disabled, try to enable */ - if (!(val & 2UL)) { - wrmsrl_safe(MSR_PPIN_CTL, val | 2UL); - rdmsrl_safe(MSR_PPIN_CTL, &val); - } - - /* Is the enable bit set? */ - if (val & 2UL) - set_cpu_cap(c, X86_FEATURE_INTEL_PPIN); - } -} - /* * Enable additional error logs from the integrated * memory controller on processors that support this. @@ -534,7 +494,6 @@ void mce_intel_feature_init(struct cpuinfo_x86 *c) { intel_init_cmci(); intel_init_lmce(); - intel_ppin_init(c); intel_imc_init(c); } diff --git a/arch/x86/kernel/cpu/mce/internal.h b/arch/x86/kernel/cpu/mce/internal.h index acd61c41846c..7e03f5b7f6bd 100644 --- a/arch/x86/kernel/cpu/mce/internal.h +++ b/arch/x86/kernel/cpu/mce/internal.h @@ -35,7 +35,7 @@ int mce_gen_pool_add(struct mce *mce); int mce_gen_pool_init(void); struct llist_node *mce_gen_pool_prepare_records(void); -int mce_severity(struct mce *a, struct pt_regs *regs, int tolerant, char **msg, bool is_excp); +int mce_severity(struct mce *a, struct pt_regs *regs, char **msg, bool is_excp); struct dentry *mce_get_debugfs_dir(void); extern mce_banks_t mce_banks_ce_disabled; @@ -127,7 +127,6 @@ struct mca_config { bool ignore_ce; bool print_all; - int tolerant; int monarch_timeout; int panic_timeout; u32 rip_msr; @@ -170,7 +169,10 @@ struct mce_vendor_flags { /* SandyBridge IFU quirk */ snb_ifu_quirk : 1, - __reserved_0 : 57; + /* Skylake, Cascade Lake, Cooper Lake REP;MOVS* quirk */ + skx_repmov_quirk : 1, + + __reserved_0 : 56; }; extern struct mce_vendor_flags mce_flags; @@ -182,8 +184,6 @@ enum mca_msr { MCA_MISC, }; -u32 mca_msr_reg(int bank, enum mca_msr reg); - /* Decide whether to add MCE record to MCE event pool or filter it out. */ extern bool filter_mce(struct mce *m); @@ -207,4 +207,27 @@ static inline void pentium_machine_check(struct pt_regs *regs) {} static inline void winchip_machine_check(struct pt_regs *regs) {} #endif +noinstr u64 mce_rdmsrl(u32 msr); + +static __always_inline u32 mca_msr_reg(int bank, enum mca_msr reg) +{ + if (cpu_feature_enabled(X86_FEATURE_SMCA)) { + switch (reg) { + case MCA_CTL: return MSR_AMD64_SMCA_MCx_CTL(bank); + case MCA_ADDR: return MSR_AMD64_SMCA_MCx_ADDR(bank); + case MCA_MISC: return MSR_AMD64_SMCA_MCx_MISC(bank); + case MCA_STATUS: return MSR_AMD64_SMCA_MCx_STATUS(bank); + } + } + + switch (reg) { + case MCA_CTL: return MSR_IA32_MCx_CTL(bank); + case MCA_ADDR: return MSR_IA32_MCx_ADDR(bank); + case MCA_MISC: return MSR_IA32_MCx_MISC(bank); + case MCA_STATUS: return MSR_IA32_MCx_STATUS(bank); + } + + return 0; +} + #endif /* __X86_MCE_INTERNAL_H__ */ diff --git a/arch/x86/kernel/cpu/mce/severity.c b/arch/x86/kernel/cpu/mce/severity.c index bb019a594a2c..00483d1c27e4 100644 --- a/arch/x86/kernel/cpu/mce/severity.c +++ b/arch/x86/kernel/cpu/mce/severity.c @@ -222,6 +222,9 @@ static bool is_copy_from_user(struct pt_regs *regs) struct insn insn; int ret; + if (!regs) + return false; + if (copy_from_kernel_nofault(insn_buf, (void *)regs->ip, MAX_INSN_SIZE)) return false; @@ -263,115 +266,103 @@ static bool is_copy_from_user(struct pt_regs *regs) * distinguish an exception taken in user from from one * taken in the kernel. */ -static int error_context(struct mce *m, struct pt_regs *regs) +static noinstr int error_context(struct mce *m, struct pt_regs *regs) { + int fixup_type; + bool copy_user; + if ((m->cs & 3) == 3) return IN_USER; + if (!mc_recoverable(m->mcgstatus)) return IN_KERNEL; - switch (ex_get_fixup_type(m->ip)) { + /* Allow instrumentation around external facilities usage. */ + instrumentation_begin(); + fixup_type = ex_get_fixup_type(m->ip); + copy_user = is_copy_from_user(regs); + instrumentation_end(); + + switch (fixup_type) { case EX_TYPE_UACCESS: case EX_TYPE_COPY: - if (!regs || !is_copy_from_user(regs)) + if (!copy_user) return IN_KERNEL; m->kflags |= MCE_IN_KERNEL_COPYIN; fallthrough; + case EX_TYPE_FAULT_MCE_SAFE: case EX_TYPE_DEFAULT_MCE_SAFE: m->kflags |= MCE_IN_KERNEL_RECOV; return IN_KERNEL_RECOV; + default: return IN_KERNEL; } } -static int mce_severity_amd_smca(struct mce *m, enum context err_ctx) +/* See AMD PPR(s) section Machine Check Error Handling. */ +static noinstr int mce_severity_amd(struct mce *m, struct pt_regs *regs, char **msg, bool is_excp) { - u32 addr = MSR_AMD64_SMCA_MCx_CONFIG(m->bank); - u32 low, high; + char *panic_msg = NULL; + int ret; /* - * We need to look at the following bits: - * - "succor" bit (data poisoning support), and - * - TCC bit (Task Context Corrupt) - * in MCi_STATUS to determine error severity. + * Default return value: Action required, the error must be handled + * immediately. */ - if (!mce_flags.succor) - return MCE_PANIC_SEVERITY; - - if (rdmsr_safe(addr, &low, &high)) - return MCE_PANIC_SEVERITY; - - /* TCC (Task context corrupt). If set and if IN_KERNEL, panic. */ - if ((low & MCI_CONFIG_MCAX) && - (m->status & MCI_STATUS_TCC) && - (err_ctx == IN_KERNEL)) - return MCE_PANIC_SEVERITY; - - /* ...otherwise invoke hwpoison handler. */ - return MCE_AR_SEVERITY; -} - -/* - * See AMD Error Scope Hierarchy table in a newer BKDG. For example - * 49125_15h_Models_30h-3Fh_BKDG.pdf, section "RAS Features" - */ -static int mce_severity_amd(struct mce *m, struct pt_regs *regs, int tolerant, - char **msg, bool is_excp) -{ - enum context ctx = error_context(m, regs); + ret = MCE_AR_SEVERITY; /* Processor Context Corrupt, no need to fumble too much, die! */ - if (m->status & MCI_STATUS_PCC) - return MCE_PANIC_SEVERITY; - - if (m->status & MCI_STATUS_UC) { - - if (ctx == IN_KERNEL) - return MCE_PANIC_SEVERITY; + if (m->status & MCI_STATUS_PCC) { + panic_msg = "Processor Context Corrupt"; + ret = MCE_PANIC_SEVERITY; + goto out; + } - /* - * On older systems where overflow_recov flag is not present, we - * should simply panic if an error overflow occurs. If - * overflow_recov flag is present and set, then software can try - * to at least kill process to prolong system operation. - */ - if (mce_flags.overflow_recov) { - if (mce_flags.smca) - return mce_severity_amd_smca(m, ctx); - - /* kill current process */ - return MCE_AR_SEVERITY; - } else { - /* at least one error was not logged */ - if (m->status & MCI_STATUS_OVER) - return MCE_PANIC_SEVERITY; - } - - /* - * For any other case, return MCE_UC_SEVERITY so that we log the - * error and exit #MC handler. - */ - return MCE_UC_SEVERITY; + if (m->status & MCI_STATUS_DEFERRED) { + ret = MCE_DEFERRED_SEVERITY; + goto out; } /* - * deferred error: poll handler catches these and adds to mce_ring so - * memory-failure can take recovery actions. + * If the UC bit is not set, the system either corrected or deferred + * the error. No action will be required after logging the error. */ - if (m->status & MCI_STATUS_DEFERRED) - return MCE_DEFERRED_SEVERITY; + if (!(m->status & MCI_STATUS_UC)) { + ret = MCE_KEEP_SEVERITY; + goto out; + } /* - * corrected error: poll handler catches these and passes responsibility - * of decoding the error to EDAC + * On MCA overflow, without the MCA overflow recovery feature the + * system will not be able to recover, panic. */ - return MCE_KEEP_SEVERITY; + if ((m->status & MCI_STATUS_OVER) && !mce_flags.overflow_recov) { + panic_msg = "Overflowed uncorrected error without MCA Overflow Recovery"; + ret = MCE_PANIC_SEVERITY; + goto out; + } + + if (!mce_flags.succor) { + panic_msg = "Uncorrected error without MCA Recovery"; + ret = MCE_PANIC_SEVERITY; + goto out; + } + + if (error_context(m, regs) == IN_KERNEL) { + panic_msg = "Uncorrected unrecoverable error in kernel context"; + ret = MCE_PANIC_SEVERITY; + } + +out: + if (msg && panic_msg) + *msg = panic_msg; + + return ret; } -static int mce_severity_intel(struct mce *m, struct pt_regs *regs, - int tolerant, char **msg, bool is_excp) +static noinstr int mce_severity_intel(struct mce *m, struct pt_regs *regs, char **msg, bool is_excp) { enum exception excp = (is_excp ? EXCP_CONTEXT : NO_EXCP); enum context ctx = error_context(m, regs); @@ -399,22 +390,21 @@ static int mce_severity_intel(struct mce *m, struct pt_regs *regs, if (msg) *msg = s->msg; s->covered = 1; - if (s->sev >= MCE_UC_SEVERITY && ctx == IN_KERNEL) { - if (tolerant < 1) - return MCE_PANIC_SEVERITY; - } + + if (s->sev >= MCE_UC_SEVERITY && ctx == IN_KERNEL) + return MCE_PANIC_SEVERITY; + return s->sev; } } -int mce_severity(struct mce *m, struct pt_regs *regs, int tolerant, char **msg, - bool is_excp) +int noinstr mce_severity(struct mce *m, struct pt_regs *regs, char **msg, bool is_excp) { if (boot_cpu_data.x86_vendor == X86_VENDOR_AMD || boot_cpu_data.x86_vendor == X86_VENDOR_HYGON) - return mce_severity_amd(m, regs, tolerant, msg, is_excp); + return mce_severity_amd(m, regs, msg, is_excp); else - return mce_severity_intel(m, regs, tolerant, msg, is_excp); + return mce_severity_intel(m, regs, msg, is_excp); } #ifdef CONFIG_DEBUG_FS diff --git a/arch/x86/kernel/cpu/microcode/amd.c b/arch/x86/kernel/cpu/microcode/amd.c index 8b2fcdfa6d31..3a35dec3ec55 100644 --- a/arch/x86/kernel/cpu/microcode/amd.c +++ b/arch/x86/kernel/cpu/microcode/amd.c @@ -440,7 +440,13 @@ apply_microcode_early_amd(u32 cpuid_1_eax, void *ucode, size_t size, bool save_p return ret; native_rdmsr(MSR_AMD64_PATCH_LEVEL, rev, dummy); - if (rev >= mc->hdr.patch_id) + + /* + * Allow application of the same revision to pick up SMT-specific + * changes even if the revision of the other SMT thread is already + * up-to-date. + */ + if (rev > mc->hdr.patch_id) return ret; if (!__apply_microcode_amd(mc)) { @@ -528,8 +534,12 @@ void load_ucode_amd_ap(unsigned int cpuid_1_eax) native_rdmsr(MSR_AMD64_PATCH_LEVEL, rev, dummy); - /* Check whether we have saved a new patch already: */ - if (*new_rev && rev < mc->hdr.patch_id) { + /* + * Check whether a new patch has been saved already. Also, allow application of + * the same revision in order to pick up SMT-thread-specific configuration even + * if the sibling SMT thread already has an up-to-date revision. + */ + if (*new_rev && rev <= mc->hdr.patch_id) { if (!__apply_microcode_amd(mc)) { *new_rev = mc->hdr.patch_id; return; @@ -788,6 +798,7 @@ static int verify_and_add_patch(u8 family, u8 *fw, unsigned int leftover, kfree(patch); return -EINVAL; } + patch->size = *patch_size; mc_hdr = (struct microcode_header_amd *)(fw + SECTION_HDR_SIZE); proc_id = mc_hdr->processor_rev_id; @@ -869,7 +880,7 @@ load_microcode_amd(bool save, u8 family, const u8 *data, size_t size) return ret; memset(amd_ucode_patch, 0, PATCH_MAX_SIZE); - memcpy(amd_ucode_patch, p->data, min_t(u32, ksize(p->data), PATCH_MAX_SIZE)); + memcpy(amd_ucode_patch, p->data, min_t(u32, p->size, PATCH_MAX_SIZE)); return ret; } @@ -924,12 +935,6 @@ static enum ucode_state request_microcode_amd(int cpu, struct device *device, return ret; } -static enum ucode_state -request_microcode_user(int cpu, const void __user *buf, size_t size) -{ - return UCODE_ERROR; -} - static void microcode_fini_cpu_amd(int cpu) { struct ucode_cpu_info *uci = ucode_cpu_info + cpu; @@ -938,7 +943,6 @@ static void microcode_fini_cpu_amd(int cpu) } static struct microcode_ops microcode_amd_ops = { - .request_microcode_user = request_microcode_user, .request_microcode_fw = request_microcode_amd, .collect_cpu_info = collect_cpu_info_amd, .apply_microcode = apply_microcode_amd, diff --git a/arch/x86/kernel/cpu/microcode/core.c b/arch/x86/kernel/cpu/microcode/core.c index f955d25076ba..6a41cee242f6 100644 --- a/arch/x86/kernel/cpu/microcode/core.c +++ b/arch/x86/kernel/cpu/microcode/core.c @@ -373,101 +373,10 @@ static int apply_microcode_on_target(int cpu) return ret; } -#ifdef CONFIG_MICROCODE_OLD_INTERFACE -static int do_microcode_update(const void __user *buf, size_t size) -{ - int error = 0; - int cpu; - - for_each_online_cpu(cpu) { - struct ucode_cpu_info *uci = ucode_cpu_info + cpu; - enum ucode_state ustate; - - if (!uci->valid) - continue; - - ustate = microcode_ops->request_microcode_user(cpu, buf, size); - if (ustate == UCODE_ERROR) { - error = -1; - break; - } else if (ustate == UCODE_NEW) { - apply_microcode_on_target(cpu); - } - } - - return error; -} - -static int microcode_open(struct inode *inode, struct file *file) -{ - return capable(CAP_SYS_RAWIO) ? stream_open(inode, file) : -EPERM; -} - -static ssize_t microcode_write(struct file *file, const char __user *buf, - size_t len, loff_t *ppos) -{ - ssize_t ret = -EINVAL; - unsigned long nr_pages = totalram_pages(); - - if ((len >> PAGE_SHIFT) > nr_pages) { - pr_err("too much data (max %ld pages)\n", nr_pages); - return ret; - } - - cpus_read_lock(); - mutex_lock(µcode_mutex); - - if (do_microcode_update(buf, len) == 0) - ret = (ssize_t)len; - - if (ret > 0) - perf_check_microcode(); - - mutex_unlock(µcode_mutex); - cpus_read_unlock(); - - return ret; -} - -static const struct file_operations microcode_fops = { - .owner = THIS_MODULE, - .write = microcode_write, - .open = microcode_open, - .llseek = no_llseek, -}; - -static struct miscdevice microcode_dev = { - .minor = MICROCODE_MINOR, - .name = "microcode", - .nodename = "cpu/microcode", - .fops = µcode_fops, -}; - -static int __init microcode_dev_init(void) -{ - int error; - - error = misc_register(µcode_dev); - if (error) { - pr_err("can't misc_register on minor=%d\n", MICROCODE_MINOR); - return error; - } - - return 0; -} - -static void __exit microcode_dev_exit(void) -{ - misc_deregister(µcode_dev); -} -#else -#define microcode_dev_init() 0 -#define microcode_dev_exit() do { } while (0) -#endif - /* fake device for request_firmware */ static struct platform_device *microcode_pdev; +#ifdef CONFIG_MICROCODE_LATE_LOADING /* * Late loading dance. Why the heavy-handed stomp_machine effort? * @@ -582,7 +491,10 @@ wait_for_siblings: */ static int microcode_reload_late(void) { - int ret; + int old = boot_cpu_data.microcode, ret; + + pr_err("Attempting late microcode loading - it is dangerous and taints the kernel.\n"); + pr_err("You should switch to early loading, if possible.\n"); atomic_set(&late_cpus_in, 0); atomic_set(&late_cpus_out, 0); @@ -591,7 +503,8 @@ static int microcode_reload_late(void) if (ret == 0) microcode_check(); - pr_info("Reload completed, microcode revision: 0x%x\n", boot_cpu_data.microcode); + pr_info("Reload completed, microcode revision: 0x%x -> 0x%x\n", + old, boot_cpu_data.microcode); return ret; } @@ -632,9 +545,14 @@ put: if (ret == 0) ret = size; + add_taint(TAINT_CPU_OUT_OF_SPEC, LOCKDEP_STILL_OK); + return ret; } +static DEVICE_ATTR_WO(reload); +#endif + static ssize_t version_show(struct device *dev, struct device_attribute *attr, char *buf) { @@ -651,7 +569,6 @@ static ssize_t pf_show(struct device *dev, return sprintf(buf, "0x%x\n", uci->cpu_sig.pf); } -static DEVICE_ATTR_WO(reload); static DEVICE_ATTR(version, 0444, version_show, NULL); static DEVICE_ATTR(processor_flags, 0444, pf_show, NULL); @@ -758,9 +675,9 @@ static struct subsys_interface mc_cpu_interface = { }; /** - * mc_bp_resume - Update boot CPU microcode during resume. + * microcode_bsp_resume - Update boot CPU microcode during resume. */ -static void mc_bp_resume(void) +void microcode_bsp_resume(void) { int cpu = smp_processor_id(); struct ucode_cpu_info *uci = ucode_cpu_info + cpu; @@ -772,7 +689,7 @@ static void mc_bp_resume(void) } static struct syscore_ops mc_syscore_ops = { - .resume = mc_bp_resume, + .resume = microcode_bsp_resume, }; static int mc_cpu_starting(unsigned int cpu) @@ -804,7 +721,9 @@ static int mc_cpu_down_prep(unsigned int cpu) } static struct attribute *cpu_root_microcode_attrs[] = { +#ifdef CONFIG_MICROCODE_LATE_LOADING &dev_attr_reload.attr, +#endif NULL }; @@ -838,10 +757,7 @@ static int __init microcode_init(void) cpus_read_lock(); mutex_lock(µcode_mutex); - error = subsys_interface_register(&mc_cpu_interface); - if (!error) - perf_check_microcode(); mutex_unlock(µcode_mutex); cpus_read_unlock(); @@ -856,10 +772,6 @@ static int __init microcode_init(void) goto out_driver; } - error = microcode_dev_init(); - if (error) - goto out_ucode_group; - register_syscore_ops(&mc_syscore_ops); cpuhp_setup_state_nocalls(CPUHP_AP_MICROCODE_LOADER, "x86/microcode:starting", mc_cpu_starting, NULL); @@ -870,10 +782,6 @@ static int __init microcode_init(void) return 0; - out_ucode_group: - sysfs_remove_group(&cpu_subsys.dev_root->kobj, - &cpu_root_microcode_group); - out_driver: cpus_read_lock(); mutex_lock(µcode_mutex); diff --git a/arch/x86/kernel/cpu/microcode/intel.c b/arch/x86/kernel/cpu/microcode/intel.c index d28a9f8f3fec..1fcbd671f1df 100644 --- a/arch/x86/kernel/cpu/microcode/intel.c +++ b/arch/x86/kernel/cpu/microcode/intel.c @@ -45,20 +45,6 @@ static struct microcode_intel *intel_ucode_patch; /* last level cache size per core */ static int llc_size_per_core; -static inline bool cpu_signatures_match(unsigned int s1, unsigned int p1, - unsigned int s2, unsigned int p2) -{ - if (s1 != s2) - return false; - - /* Processor flags are either both 0 ... */ - if (!p1 && !p2) - return true; - - /* ... or they intersect. */ - return p1 & p2; -} - /* * Returns 1 if update has been found, 0 otherwise. */ @@ -69,7 +55,7 @@ static int find_matching_signature(void *mc, unsigned int csig, int cpf) struct extended_signature *ext_sig; int i; - if (cpu_signatures_match(csig, cpf, mc_hdr->sig, mc_hdr->pf)) + if (intel_cpu_signatures_match(csig, cpf, mc_hdr->sig, mc_hdr->pf)) return 1; /* Look for ext. headers: */ @@ -80,7 +66,7 @@ static int find_matching_signature(void *mc, unsigned int csig, int cpf) ext_sig = (void *)ext_hdr + EXT_HEADER_SIZE; for (i = 0; i < ext_hdr->count; i++) { - if (cpu_signatures_match(csig, cpf, ext_sig->sig, ext_sig->pf)) + if (intel_cpu_signatures_match(csig, cpf, ext_sig->sig, ext_sig->pf)) return 1; ext_sig++; } @@ -342,37 +328,6 @@ next: return patch; } -static int collect_cpu_info_early(struct ucode_cpu_info *uci) -{ - unsigned int val[2]; - unsigned int family, model; - struct cpu_signature csig = { 0 }; - unsigned int eax, ebx, ecx, edx; - - memset(uci, 0, sizeof(*uci)); - - eax = 0x00000001; - ecx = 0; - native_cpuid(&eax, &ebx, &ecx, &edx); - csig.sig = eax; - - family = x86_family(eax); - model = x86_model(eax); - - if ((model >= 5) || (family > 6)) { - /* get processor flags from MSR 0x17 */ - native_rdmsr(MSR_IA32_PLATFORM_ID, val[0], val[1]); - csig.pf = 1 << ((val[1] >> 18) & 7); - } - - csig.rev = intel_get_microcode_revision(); - - uci->cpu_sig = csig; - uci->valid = 1; - - return 0; -} - static void show_saved_mc(void) { #ifdef DEBUG @@ -386,7 +341,7 @@ static void show_saved_mc(void) return; } - collect_cpu_info_early(&uci); + intel_cpu_collect_info(&uci); sig = uci.cpu_sig.sig; pf = uci.cpu_sig.pf; @@ -502,7 +457,7 @@ void show_ucode_info_early(void) struct ucode_cpu_info uci; if (delay_ucode_info) { - collect_cpu_info_early(&uci); + intel_cpu_collect_info(&uci); print_ucode_info(&uci, current_mc_date); delay_ucode_info = 0; } @@ -604,7 +559,7 @@ int __init save_microcode_in_initrd_intel(void) if (!(cp.data && cp.size)) return 0; - collect_cpu_info_early(&uci); + intel_cpu_collect_info(&uci); scan_microcode(cp.data, cp.size, &uci, true); @@ -637,7 +592,7 @@ static struct microcode_intel *__load_ucode_intel(struct ucode_cpu_info *uci) if (!(cp.data && cp.size)) return NULL; - collect_cpu_info_early(uci); + intel_cpu_collect_info(uci); return scan_microcode(cp.data, cp.size, uci, false); } @@ -712,7 +667,7 @@ void reload_ucode_intel(void) struct microcode_intel *p; struct ucode_cpu_info uci; - collect_cpu_info_early(&uci); + intel_cpu_collect_info(&uci); p = find_patch(&uci); if (!p) @@ -961,24 +916,7 @@ static enum ucode_state request_microcode_fw(int cpu, struct device *device, return ret; } -static enum ucode_state -request_microcode_user(int cpu, const void __user *buf, size_t size) -{ - struct iov_iter iter; - struct iovec iov; - - if (is_blacklisted(cpu)) - return UCODE_NFOUND; - - iov.iov_base = (void __user *)buf; - iov.iov_len = size; - iov_iter_init(&iter, WRITE, &iov, 1, size); - - return generic_load_microcode(cpu, &iter); -} - static struct microcode_ops microcode_intel_ops = { - .request_microcode_user = request_microcode_user, .request_microcode_fw = request_microcode_fw, .collect_cpu_info = collect_cpu_info, .apply_microcode = apply_microcode_intel, diff --git a/arch/x86/kernel/cpu/mshyperv.c b/arch/x86/kernel/cpu/mshyperv.c index ff55df60228f..831613959a92 100644 --- a/arch/x86/kernel/cpu/mshyperv.c +++ b/arch/x86/kernel/cpu/mshyperv.c @@ -18,6 +18,7 @@ #include <linux/kexec.h> #include <linux/i8253.h> #include <linux/random.h> +#include <linux/swiotlb.h> #include <asm/processor.h> #include <asm/hypervisor.h> #include <asm/hyperv-tlfs.h> @@ -32,6 +33,7 @@ #include <asm/nmi.h> #include <clocksource/hyperv_timer.h> #include <asm/numa.h> +#include <asm/coco.h> /* Is Linux running as the root partition? */ bool hv_root_partition; @@ -79,7 +81,7 @@ DEFINE_IDTENTRY_SYSVEC(sysvec_hyperv_stimer0) inc_irq_stat(hyperv_stimer0_count); if (hv_stimer0_handler) hv_stimer0_handler(); - add_interrupt_randomness(HYPERV_STIMER0_VECTOR, 0); + add_interrupt_randomness(HYPERV_STIMER0_VECTOR); ack_APIC_irq(); set_irq_regs(old_regs); @@ -308,10 +310,10 @@ static void __init ms_hyperv_init_platform(void) hv_host_info_ecx = cpuid_ecx(HYPERV_CPUID_VERSION); hv_host_info_edx = cpuid_edx(HYPERV_CPUID_VERSION); - pr_info("Hyper-V Host Build:%d-%d.%d-%d-%d.%d\n", - hv_host_info_eax, hv_host_info_ebx >> 16, - hv_host_info_ebx & 0xFFFF, hv_host_info_ecx, - hv_host_info_edx >> 24, hv_host_info_edx & 0xFFFFFF); + pr_info("Hyper-V: Host Build %d.%d.%d.%d-%d-%d\n", + hv_host_info_ebx >> 16, hv_host_info_ebx & 0xFFFF, + hv_host_info_eax, hv_host_info_edx & 0xFFFFFF, + hv_host_info_ecx, hv_host_info_edx >> 24); } if (ms_hyperv.features & HV_ACCESS_FREQUENCY_MSRS && @@ -329,8 +331,17 @@ static void __init ms_hyperv_init_platform(void) pr_info("Hyper-V: Isolation Config: Group A 0x%x, Group B 0x%x\n", ms_hyperv.isolation_config_a, ms_hyperv.isolation_config_b); - if (hv_get_isolation_type() == HV_ISOLATION_TYPE_SNP) + if (hv_get_isolation_type() == HV_ISOLATION_TYPE_SNP) { static_branch_enable(&isolation_type_snp); +#ifdef CONFIG_SWIOTLB + swiotlb_unencrypted_base = ms_hyperv.shared_gpa_boundary; +#endif + } + /* Isolation VMs are unenlightened SEV-based VMs, thus this check: */ + if (IS_ENABLED(CONFIG_AMD_MEM_ENCRYPT)) { + if (hv_get_isolation_type() != HV_ISOLATION_TYPE_NONE) + cc_set_vendor(CC_VENDOR_HYPERV); + } } if (hv_max_functions_eax >= HYPERV_CPUID_NESTED_FEATURES) { @@ -446,6 +457,8 @@ static void __init ms_hyperv_init_platform(void) */ if (!(ms_hyperv.features & HV_ACCESS_TSC_INVARIANT)) mark_tsc_unstable("running on Hyper-V"); + + hardlockup_detector_disable(); } static bool __init ms_hyperv_x2apic_available(void) diff --git a/arch/x86/kernel/cpu/proc.c b/arch/x86/kernel/cpu/proc.c index 4eec8889b0ff..099b6f0d96bd 100644 --- a/arch/x86/kernel/cpu/proc.c +++ b/arch/x86/kernel/cpu/proc.c @@ -84,14 +84,9 @@ static int show_cpuinfo(struct seq_file *m, void *v) seq_printf(m, "microcode\t: 0x%x\n", c->microcode); if (cpu_has(c, X86_FEATURE_TSC)) { - unsigned int freq = aperfmperf_get_khz(cpu); - - if (!freq) - freq = cpufreq_quick_get(cpu); - if (!freq) - freq = cpu_khz; - seq_printf(m, "cpu MHz\t\t: %u.%03u\n", - freq / 1000, (freq % 1000)); + unsigned int freq = arch_freq_get_on_cpu(cpu); + + seq_printf(m, "cpu MHz\t\t: %u.%03u\n", freq / 1000, (freq % 1000)); } /* Cache size */ diff --git a/arch/x86/kernel/cpu/rdrand.c b/arch/x86/kernel/cpu/rdrand.c index c4be62058dd9..26a427fa84ea 100644 --- a/arch/x86/kernel/cpu/rdrand.c +++ b/arch/x86/kernel/cpu/rdrand.c @@ -11,56 +11,39 @@ #include <asm/archrandom.h> #include <asm/sections.h> -static int __init x86_rdrand_setup(char *s) -{ - setup_clear_cpu_cap(X86_FEATURE_RDRAND); - setup_clear_cpu_cap(X86_FEATURE_RDSEED); - return 1; -} -__setup("nordrand", x86_rdrand_setup); - /* * RDRAND has Built-In-Self-Test (BIST) that runs on every invocation. - * Run the instruction a few times as a sanity check. - * If it fails, it is simple to disable RDRAND here. + * Run the instruction a few times as a sanity check. Also make sure + * it's not outputting the same value over and over, which has happened + * as a result of past CPU bugs. + * + * If it fails, it is simple to disable RDRAND and RDSEED here. */ -#define SANITY_CHECK_LOOPS 8 -#ifdef CONFIG_ARCH_RANDOM void x86_init_rdrand(struct cpuinfo_x86 *c) { - unsigned int changed = 0; - unsigned long tmp, prev; - int i; + enum { SAMPLES = 8, MIN_CHANGE = 5 }; + unsigned long sample, prev; + bool failure = false; + size_t i, changed; if (!cpu_has(c, X86_FEATURE_RDRAND)) return; - for (i = 0; i < SANITY_CHECK_LOOPS; i++) { - if (!rdrand_long(&tmp)) { - clear_cpu_cap(c, X86_FEATURE_RDRAND); - pr_warn_once("rdrand: disabled\n"); - return; + for (changed = 0, i = 0; i < SAMPLES; ++i) { + if (!rdrand_long(&sample)) { + failure = true; + break; } + changed += i && sample != prev; + prev = sample; } + if (changed < MIN_CHANGE) + failure = true; - /* - * Stupid sanity-check whether RDRAND does *actually* generate - * some at least random-looking data. - */ - prev = tmp; - for (i = 0; i < SANITY_CHECK_LOOPS; i++) { - if (rdrand_long(&tmp)) { - if (prev != tmp) - changed++; - - prev = tmp; - } + if (failure) { + clear_cpu_cap(c, X86_FEATURE_RDRAND); + clear_cpu_cap(c, X86_FEATURE_RDSEED); + pr_emerg("RDRAND is not reliable on this platform; disabling.\n"); } - - if (WARN_ON_ONCE(!changed)) - pr_emerg( -"RDRAND gives funky smelling output, might consider not using it by booting with \"nordrand\""); - } -#endif diff --git a/arch/x86/kernel/cpu/resctrl/core.c b/arch/x86/kernel/cpu/resctrl/core.c index bb1c3f5f60c8..3266ea36667c 100644 --- a/arch/x86/kernel/cpu/resctrl/core.c +++ b/arch/x86/kernel/cpu/resctrl/core.c @@ -66,9 +66,6 @@ struct rdt_hw_resource rdt_resources_all[] = { .rid = RDT_RESOURCE_L3, .name = "L3", .cache_level = 3, - .cache = { - .min_cbm_bits = 1, - }, .domains = domain_init(RDT_RESOURCE_L3), .parse_ctrlval = parse_cbm, .format_str = "%d=%0*x", @@ -83,9 +80,6 @@ struct rdt_hw_resource rdt_resources_all[] = { .rid = RDT_RESOURCE_L2, .name = "L2", .cache_level = 2, - .cache = { - .min_cbm_bits = 1, - }, .domains = domain_init(RDT_RESOURCE_L2), .parse_ctrlval = parse_cbm, .format_str = "%d=%0*x", @@ -147,7 +141,6 @@ static inline void cache_alloc_hsw_probe(void) r->cache.shareable_bits = 0xc0000; r->cache.min_cbm_bits = 2; r->alloc_capable = true; - r->alloc_enabled = true; rdt_alloc_capable = true; } @@ -211,7 +204,6 @@ static bool __get_mem_config_intel(struct rdt_resource *r) thread_throttle_mode_init(); r->alloc_capable = true; - r->alloc_enabled = true; return true; } @@ -242,7 +234,6 @@ static bool __rdt_get_mem_config_amd(struct rdt_resource *r) r->data_width = 4; r->alloc_capable = true; - r->alloc_enabled = true; return true; } @@ -261,7 +252,6 @@ static void rdt_get_cache_alloc_cfg(int idx, struct rdt_resource *r) r->cache.shareable_bits = ebx & r->default_ctrl; r->data_width = (r->cache.cbm_len + 3) / 4; r->alloc_capable = true; - r->alloc_enabled = true; } static void rdt_get_cdp_config(int level) @@ -300,7 +290,7 @@ mba_wrmsr_amd(struct rdt_domain *d, struct msr_param *m, struct rdt_resource *r) * that can be written to QOS_MSRs. * There are currently no SKUs which support non linear delay values. */ -u32 delay_bw_map(unsigned long bw, struct rdt_resource *r) +static u32 delay_bw_map(unsigned long bw, struct rdt_resource *r) { if (r->membw.delay_linear) return MAX_MBA_BW - bw; @@ -401,7 +391,7 @@ struct rdt_domain *rdt_find_domain(struct rdt_resource *r, int id, return NULL; } -void setup_default_ctrlval(struct rdt_resource *r, u32 *dc, u32 *dm) +static void setup_default_ctrlval(struct rdt_resource *r, u32 *dc) { struct rdt_hw_resource *hw_res = resctrl_to_arch_res(r); int i; @@ -410,12 +400,17 @@ void setup_default_ctrlval(struct rdt_resource *r, u32 *dc, u32 *dm) * Initialize the Control MSRs to having no control. * For Cache Allocation: Set all bits in cbm * For Memory Allocation: Set b/w requested to 100% - * and the bandwidth in MBps to U32_MAX */ - for (i = 0; i < hw_res->num_closid; i++, dc++, dm++) { + for (i = 0; i < hw_res->num_closid; i++, dc++) *dc = r->default_ctrl; - *dm = MBA_MAX_MBPS; - } +} + +static void domain_free(struct rdt_hw_domain *hw_dom) +{ + kfree(hw_dom->arch_mbm_total); + kfree(hw_dom->arch_mbm_local); + kfree(hw_dom->ctrl_val); + kfree(hw_dom); } static int domain_setup_ctrlval(struct rdt_resource *r, struct rdt_domain *d) @@ -423,23 +418,15 @@ static int domain_setup_ctrlval(struct rdt_resource *r, struct rdt_domain *d) struct rdt_hw_resource *hw_res = resctrl_to_arch_res(r); struct rdt_hw_domain *hw_dom = resctrl_to_arch_dom(d); struct msr_param m; - u32 *dc, *dm; + u32 *dc; dc = kmalloc_array(hw_res->num_closid, sizeof(*hw_dom->ctrl_val), GFP_KERNEL); if (!dc) return -ENOMEM; - dm = kmalloc_array(hw_res->num_closid, sizeof(*hw_dom->mbps_val), - GFP_KERNEL); - if (!dm) { - kfree(dc); - return -ENOMEM; - } - hw_dom->ctrl_val = dc; - hw_dom->mbps_val = dm; - setup_default_ctrlval(r, dc, dm); + setup_default_ctrlval(r, dc); m.low = 0; m.high = hw_res->num_closid; @@ -447,39 +434,31 @@ static int domain_setup_ctrlval(struct rdt_resource *r, struct rdt_domain *d) return 0; } -static int domain_setup_mon_state(struct rdt_resource *r, struct rdt_domain *d) +/** + * arch_domain_mbm_alloc() - Allocate arch private storage for the MBM counters + * @num_rmid: The size of the MBM counter array + * @hw_dom: The domain that owns the allocated arrays + */ +static int arch_domain_mbm_alloc(u32 num_rmid, struct rdt_hw_domain *hw_dom) { size_t tsize; - if (is_llc_occupancy_enabled()) { - d->rmid_busy_llc = bitmap_zalloc(r->num_rmid, GFP_KERNEL); - if (!d->rmid_busy_llc) - return -ENOMEM; - INIT_DELAYED_WORK(&d->cqm_limbo, cqm_handle_limbo); - } if (is_mbm_total_enabled()) { - tsize = sizeof(*d->mbm_total); - d->mbm_total = kcalloc(r->num_rmid, tsize, GFP_KERNEL); - if (!d->mbm_total) { - bitmap_free(d->rmid_busy_llc); + tsize = sizeof(*hw_dom->arch_mbm_total); + hw_dom->arch_mbm_total = kcalloc(num_rmid, tsize, GFP_KERNEL); + if (!hw_dom->arch_mbm_total) return -ENOMEM; - } } if (is_mbm_local_enabled()) { - tsize = sizeof(*d->mbm_local); - d->mbm_local = kcalloc(r->num_rmid, tsize, GFP_KERNEL); - if (!d->mbm_local) { - bitmap_free(d->rmid_busy_llc); - kfree(d->mbm_total); + tsize = sizeof(*hw_dom->arch_mbm_local); + hw_dom->arch_mbm_local = kcalloc(num_rmid, tsize, GFP_KERNEL); + if (!hw_dom->arch_mbm_local) { + kfree(hw_dom->arch_mbm_total); + hw_dom->arch_mbm_total = NULL; return -ENOMEM; } } - if (is_mbm_enabled()) { - INIT_DELAYED_WORK(&d->mbm_over, mbm_handle_overflow); - mbm_setup_overflow_handler(d, MBM_OVERFLOW_INTERVAL); - } - return 0; } @@ -502,6 +481,7 @@ static void domain_add_cpu(int cpu, struct rdt_resource *r) struct list_head *add_pos = NULL; struct rdt_hw_domain *hw_dom; struct rdt_domain *d; + int err; d = rdt_find_domain(r, id, &add_pos); if (IS_ERR(d)) { @@ -527,25 +507,22 @@ static void domain_add_cpu(int cpu, struct rdt_resource *r) rdt_domain_reconfigure_cdp(r); if (r->alloc_capable && domain_setup_ctrlval(r, d)) { - kfree(hw_dom); + domain_free(hw_dom); return; } - if (r->mon_capable && domain_setup_mon_state(r, d)) { - kfree(hw_dom->ctrl_val); - kfree(hw_dom->mbps_val); - kfree(hw_dom); + if (r->mon_capable && arch_domain_mbm_alloc(r->num_rmid, hw_dom)) { + domain_free(hw_dom); return; } list_add_tail(&d->list, add_pos); - /* - * If resctrl is mounted, add - * per domain monitor data directories. - */ - if (static_branch_unlikely(&rdt_mon_enable_key)) - mkdir_mondata_subdir_allrdtgrp(r, d); + err = resctrl_online_domain(r, d); + if (err) { + list_del(&d->list); + domain_free(hw_dom); + } } static void domain_remove_cpu(int cpu, struct rdt_resource *r) @@ -563,27 +540,8 @@ static void domain_remove_cpu(int cpu, struct rdt_resource *r) cpumask_clear_cpu(cpu, &d->cpu_mask); if (cpumask_empty(&d->cpu_mask)) { - /* - * If resctrl is mounted, remove all the - * per domain monitor data directories. - */ - if (static_branch_unlikely(&rdt_mon_enable_key)) - rmdir_mondata_subdir_allrdtgrp(r, d->id); + resctrl_offline_domain(r, d); list_del(&d->list); - if (r->mon_capable && is_mbm_enabled()) - cancel_delayed_work(&d->mbm_over); - if (is_llc_occupancy_enabled() && has_busy_rmid(r, d)) { - /* - * When a package is going down, forcefully - * decrement rmid->ebusy. There is no way to know - * that the L3 was flushed and hence may lead to - * incorrect counts in rare scenarios, but leaving - * the RMID as busy creates RMID leaks if the - * package never comes back. - */ - __check_limbo(d, true); - cancel_delayed_work(&d->cqm_limbo); - } /* * rdt_domain "d" is going to be freed below, so clear @@ -591,13 +549,8 @@ static void domain_remove_cpu(int cpu, struct rdt_resource *r) */ if (d->plr) d->plr->d = NULL; + domain_free(hw_dom); - kfree(hw_dom->ctrl_val); - kfree(hw_dom->mbps_val); - bitmap_free(d->rmid_busy_llc); - kfree(d->mbm_total); - kfree(d->mbm_local); - kfree(hw_dom); return; } @@ -877,6 +830,7 @@ static __init void rdt_init_res_defs_intel(void) r->cache.arch_has_sparse_bitmaps = false; r->cache.arch_has_empty_bitmaps = false; r->cache.arch_has_per_cpu_cfg = false; + r->cache.min_cbm_bits = 1; } else if (r->rid == RDT_RESOURCE_MBA) { hw_res->msr_base = MSR_IA32_MBA_THRTL_BASE; hw_res->msr_update = mba_wrmsr_intel; @@ -897,6 +851,7 @@ static __init void rdt_init_res_defs_amd(void) r->cache.arch_has_sparse_bitmaps = true; r->cache.arch_has_empty_bitmaps = true; r->cache.arch_has_per_cpu_cfg = true; + r->cache.min_cbm_bits = 0; } else if (r->rid == RDT_RESOURCE_MBA) { hw_res->msr_base = MSR_IA32_MBA_BW_BASE; hw_res->msr_update = mba_wrmsr_amd; diff --git a/arch/x86/kernel/cpu/resctrl/ctrlmondata.c b/arch/x86/kernel/cpu/resctrl/ctrlmondata.c index 87666275eed9..1dafbdc5ac31 100644 --- a/arch/x86/kernel/cpu/resctrl/ctrlmondata.c +++ b/arch/x86/kernel/cpu/resctrl/ctrlmondata.c @@ -61,6 +61,7 @@ int parse_bw(struct rdt_parse_data *data, struct resctrl_schema *s, struct rdt_domain *d) { struct resctrl_staged_config *cfg; + u32 closid = data->rdtgrp->closid; struct rdt_resource *r = s->res; unsigned long bw_val; @@ -72,6 +73,12 @@ int parse_bw(struct rdt_parse_data *data, struct resctrl_schema *s, if (!bw_validate(data->buf, &bw_val, r)) return -EINVAL; + + if (is_mba_sc(r)) { + d->mbps_val[closid] = bw_val; + return 0; + } + cfg->new_ctrl = bw_val; cfg->have_new_ctrl = true; @@ -261,14 +268,13 @@ static u32 get_config_index(u32 closid, enum resctrl_conf_type type) static bool apply_config(struct rdt_hw_domain *hw_dom, struct resctrl_staged_config *cfg, u32 idx, - cpumask_var_t cpu_mask, bool mba_sc) + cpumask_var_t cpu_mask) { struct rdt_domain *dom = &hw_dom->d_resctrl; - u32 *dc = !mba_sc ? hw_dom->ctrl_val : hw_dom->mbps_val; - if (cfg->new_ctrl != dc[idx]) { + if (cfg->new_ctrl != hw_dom->ctrl_val[idx]) { cpumask_set_cpu(cpumask_any(&dom->cpu_mask), cpu_mask); - dc[idx] = cfg->new_ctrl; + hw_dom->ctrl_val[idx] = cfg->new_ctrl; return true; } @@ -276,6 +282,27 @@ static bool apply_config(struct rdt_hw_domain *hw_dom, return false; } +int resctrl_arch_update_one(struct rdt_resource *r, struct rdt_domain *d, + u32 closid, enum resctrl_conf_type t, u32 cfg_val) +{ + struct rdt_hw_resource *hw_res = resctrl_to_arch_res(r); + struct rdt_hw_domain *hw_dom = resctrl_to_arch_dom(d); + u32 idx = get_config_index(closid, t); + struct msr_param msr_param; + + if (!cpumask_test_cpu(smp_processor_id(), &d->cpu_mask)) + return -EINVAL; + + hw_dom->ctrl_val[idx] = cfg_val; + + msr_param.res = r; + msr_param.low = idx; + msr_param.high = idx + 1; + hw_res->msr_update(d, &msr_param, r); + + return 0; +} + int resctrl_arch_update_domains(struct rdt_resource *r, u32 closid) { struct resctrl_staged_config *cfg; @@ -284,14 +311,12 @@ int resctrl_arch_update_domains(struct rdt_resource *r, u32 closid) enum resctrl_conf_type t; cpumask_var_t cpu_mask; struct rdt_domain *d; - bool mba_sc; int cpu; u32 idx; if (!zalloc_cpumask_var(&cpu_mask, GFP_KERNEL)) return -ENOMEM; - mba_sc = is_mba_sc(r); msr_param.res = NULL; list_for_each_entry(d, &r->domains, list) { hw_dom = resctrl_to_arch_dom(d); @@ -301,7 +326,7 @@ int resctrl_arch_update_domains(struct rdt_resource *r, u32 closid) continue; idx = get_config_index(closid, t); - if (!apply_config(hw_dom, cfg, idx, cpu_mask, mba_sc)) + if (!apply_config(hw_dom, cfg, idx, cpu_mask)) continue; if (!msr_param.res) { @@ -315,11 +340,7 @@ int resctrl_arch_update_domains(struct rdt_resource *r, u32 closid) } } - /* - * Avoid writing the control msr with control values when - * MBA software controller is enabled - */ - if (cpumask_empty(cpu_mask) || mba_sc) + if (cpumask_empty(cpu_mask)) goto done; cpu = get_cpu(); /* Update resource control msr on this CPU if it's in cpu_mask. */ @@ -406,6 +427,14 @@ ssize_t rdtgroup_schemata_write(struct kernfs_open_file *of, list_for_each_entry(s, &resctrl_schema_all, list) { r = s->res; + + /* + * Writes to mba_sc resources update the software controller, + * not the control MSR. + */ + if (is_mba_sc(r)) + continue; + ret = resctrl_arch_update_domains(r, rdtgrp->closid); if (ret) goto out; @@ -433,9 +462,7 @@ u32 resctrl_arch_get_config(struct rdt_resource *r, struct rdt_domain *d, struct rdt_hw_domain *hw_dom = resctrl_to_arch_dom(d); u32 idx = get_config_index(closid, type); - if (!is_mba_sc(r)) - return hw_dom->ctrl_val[idx]; - return hw_dom->mbps_val[idx]; + return hw_dom->ctrl_val[idx]; } static void show_doms(struct seq_file *s, struct resctrl_schema *schema, int closid) @@ -450,8 +477,12 @@ static void show_doms(struct seq_file *s, struct resctrl_schema *schema, int clo if (sep) seq_puts(s, ";"); - ctrl_val = resctrl_arch_get_config(r, dom, closid, - schema->conf_type); + if (is_mba_sc(r)) + ctrl_val = dom->mbps_val[closid]; + else + ctrl_val = resctrl_arch_get_config(r, dom, closid, + schema->conf_type); + seq_printf(s, r->format_str, dom->id, max_data_width, ctrl_val); sep = true; @@ -518,7 +549,6 @@ void mon_event_read(struct rmid_read *rr, struct rdt_resource *r, int rdtgroup_mondata_show(struct seq_file *m, void *arg) { struct kernfs_open_file *of = m->private; - struct rdt_hw_resource *hw_res; u32 resid, evtid, domid; struct rdtgroup *rdtgrp; struct rdt_resource *r; @@ -538,8 +568,7 @@ int rdtgroup_mondata_show(struct seq_file *m, void *arg) domid = md.u.domid; evtid = md.u.evtid; - hw_res = &rdt_resources_all[resid]; - r = &hw_res->r_resctrl; + r = &rdt_resources_all[resid].r_resctrl; d = rdt_find_domain(r, domid, NULL); if (IS_ERR_OR_NULL(d)) { ret = -ENOENT; @@ -548,12 +577,12 @@ int rdtgroup_mondata_show(struct seq_file *m, void *arg) mon_event_read(&rr, r, d, rdtgrp, evtid, false); - if (rr.val & RMID_VAL_ERROR) + if (rr.err == -EIO) seq_puts(m, "Error\n"); - else if (rr.val & RMID_VAL_UNAVAIL) + else if (rr.err == -EINVAL) seq_puts(m, "Unavailable\n"); else - seq_printf(m, "%llu\n", rr.val * hw_res->mon_scale); + seq_printf(m, "%llu\n", rr.val); out: rdtgroup_kn_unlock(of->kn); diff --git a/arch/x86/kernel/cpu/resctrl/internal.h b/arch/x86/kernel/cpu/resctrl/internal.h index 1d647188a43b..5f7128686cfd 100644 --- a/arch/x86/kernel/cpu/resctrl/internal.h +++ b/arch/x86/kernel/cpu/resctrl/internal.h @@ -22,21 +22,12 @@ #define L2_QOS_CDP_ENABLE 0x01ULL -/* - * Event IDs are used to program IA32_QM_EVTSEL before reading event - * counter from IA32_QM_CTR - */ -#define QOS_L3_OCCUP_EVENT_ID 0x01 -#define QOS_L3_MBM_TOTAL_EVENT_ID 0x02 -#define QOS_L3_MBM_LOCAL_EVENT_ID 0x03 - #define CQM_LIMBOCHECK_INTERVAL 1000 #define MBM_CNTR_WIDTH_BASE 24 #define MBM_OVERFLOW_INTERVAL 1000 #define MAX_MBA_BW 100u #define MBA_IS_LINEAR 0x4 -#define MBA_MAX_MBPS U32_MAX #define MAX_MBA_BW_AMD 0x800 #define MBM_CNTR_WIDTH_OFFSET_AMD 20 @@ -74,7 +65,7 @@ DECLARE_STATIC_KEY_FALSE(rdt_mon_enable_key); * @list: entry in &rdt_resource->evt_list */ struct mon_evt { - u32 evtid; + enum resctrl_event_id evtid; char *name; struct list_head list; }; @@ -91,9 +82,9 @@ struct mon_evt { union mon_data_bits { void *priv; struct { - unsigned int rid : 10; - unsigned int evtid : 8; - unsigned int domid : 14; + unsigned int rid : 10; + enum resctrl_event_id evtid : 8; + unsigned int domid : 14; } u; }; @@ -101,12 +92,12 @@ struct rmid_read { struct rdtgroup *rgrp; struct rdt_resource *r; struct rdt_domain *d; - int evtid; + enum resctrl_event_id evtid; bool first; + int err; u64 val; }; -extern unsigned int resctrl_cqm_threshold; extern bool rdt_alloc_capable; extern bool rdt_mon_capable; extern unsigned int rdt_mon_features; @@ -288,35 +279,45 @@ struct rftype { /** * struct mbm_state - status for each MBM counter in each domain - * @chunks: Total data moved (multiply by rdt_group.mon_scale to get bytes) - * @prev_msr: Value of IA32_QM_CTR for this RMID last time we read it - * @prev_bw_msr:Value of previous IA32_QM_CTR for bandwidth counting + * @prev_bw_bytes: Previous bytes value read for bandwidth calculation * @prev_bw: The most recent bandwidth in MBps * @delta_bw: Difference between the current and previous bandwidth * @delta_comp: Indicates whether to compute the delta_bw */ struct mbm_state { - u64 chunks; - u64 prev_msr; - u64 prev_bw_msr; + u64 prev_bw_bytes; u32 prev_bw; u32 delta_bw; bool delta_comp; }; /** + * struct arch_mbm_state - values used to compute resctrl_arch_rmid_read()s + * return value. + * @chunks: Total data moved (multiply by rdt_group.mon_scale to get bytes) + * @prev_msr: Value of IA32_QM_CTR last time it was read for the RMID used to + * find this struct. + */ +struct arch_mbm_state { + u64 chunks; + u64 prev_msr; +}; + +/** * struct rdt_hw_domain - Arch private attributes of a set of CPUs that share * a resource * @d_resctrl: Properties exposed to the resctrl file system * @ctrl_val: array of cache or mem ctrl values (indexed by CLOSID) - * @mbps_val: When mba_sc is enabled, this holds the bandwidth in MBps + * @arch_mbm_total: arch private state for MBM total bandwidth + * @arch_mbm_local: arch private state for MBM local bandwidth * * Members of this structure are accessed via helpers that provide abstraction. */ struct rdt_hw_domain { struct rdt_domain d_resctrl; u32 *ctrl_val; - u32 *mbps_val; + struct arch_mbm_state *arch_mbm_total; + struct arch_mbm_state *arch_mbm_local; }; static inline struct rdt_hw_domain *resctrl_to_arch_dom(struct rdt_domain *r) @@ -459,14 +460,6 @@ int resctrl_arch_set_cdp_enabled(enum resctrl_res_level l, bool enable); for_each_rdt_resource(r) \ if (r->mon_capable) -#define for_each_alloc_enabled_rdt_resource(r) \ - for_each_rdt_resource(r) \ - if (r->alloc_enabled) - -#define for_each_mon_enabled_rdt_resource(r) \ - for_each_rdt_resource(r) \ - if (r->mon_enabled) - /* CPUID.(EAX=10H, ECX=ResID=1).EAX */ union cpuid_0x10_1_eax { struct { @@ -530,10 +523,6 @@ void free_rmid(u32 rmid); int rdt_get_mon_l3_config(struct rdt_resource *r); void mon_event_count(void *info); int rdtgroup_mondata_show(struct seq_file *m, void *arg); -void rmdir_mondata_subdir_allrdtgrp(struct rdt_resource *r, - unsigned int dom_id); -void mkdir_mondata_subdir_allrdtgrp(struct rdt_resource *r, - struct rdt_domain *d); void mon_event_read(struct rmid_read *rr, struct rdt_resource *r, struct rdt_domain *d, struct rdtgroup *rdtgrp, int evtid, int first); @@ -542,8 +531,6 @@ void mbm_setup_overflow_handler(struct rdt_domain *dom, void mbm_handle_overflow(struct work_struct *work); void __init intel_rdt_mbm_apply_quirk(void); bool is_mba_sc(struct rdt_resource *r); -void setup_default_ctrlval(struct rdt_resource *r, u32 *dc, u32 *dm); -u32 delay_bw_map(unsigned long bw, struct rdt_resource *r); void cqm_setup_limbo_handler(struct rdt_domain *dom, unsigned long delay_ms); void cqm_handle_limbo(struct work_struct *work); bool has_busy_rmid(struct rdt_resource *r, struct rdt_domain *d); diff --git a/arch/x86/kernel/cpu/resctrl/monitor.c b/arch/x86/kernel/cpu/resctrl/monitor.c index c9f0f3d63f75..efe0c30d3a12 100644 --- a/arch/x86/kernel/cpu/resctrl/monitor.c +++ b/arch/x86/kernel/cpu/resctrl/monitor.c @@ -16,8 +16,12 @@ */ #include <linux/module.h> +#include <linux/sizes.h> #include <linux/slab.h> + #include <asm/cpu_device_id.h> +#include <asm/resctrl.h> + #include "internal.h" struct rmid_entry { @@ -37,8 +41,8 @@ static LIST_HEAD(rmid_free_lru); * @rmid_limbo_count count of currently unused but (potentially) * dirty RMIDs. * This counts RMIDs that no one is currently using but that - * may have a occupancy value > intel_cqm_threshold. User can change - * the threshold occupancy value. + * may have a occupancy value > resctrl_rmid_realloc_threshold. User can + * change the threshold occupancy value. */ static unsigned int rmid_limbo_count; @@ -59,10 +63,15 @@ bool rdt_mon_capable; unsigned int rdt_mon_features; /* - * This is the threshold cache occupancy at which we will consider an + * This is the threshold cache occupancy in bytes at which we will consider an * RMID available for re-allocation. */ -unsigned int resctrl_cqm_threshold; +unsigned int resctrl_rmid_realloc_threshold; + +/* + * This is the maximum value for the reallocation threshold, in bytes. + */ +unsigned int resctrl_rmid_realloc_limit; #define CF(cf) ((unsigned long)(1048576 * (cf) + 0.5)) @@ -137,9 +146,54 @@ static inline struct rmid_entry *__rmid_entry(u32 rmid) return entry; } -static u64 __rmid_read(u32 rmid, u32 eventid) +static struct arch_mbm_state *get_arch_mbm_state(struct rdt_hw_domain *hw_dom, + u32 rmid, + enum resctrl_event_id eventid) +{ + switch (eventid) { + case QOS_L3_OCCUP_EVENT_ID: + return NULL; + case QOS_L3_MBM_TOTAL_EVENT_ID: + return &hw_dom->arch_mbm_total[rmid]; + case QOS_L3_MBM_LOCAL_EVENT_ID: + return &hw_dom->arch_mbm_local[rmid]; + } + + /* Never expect to get here */ + WARN_ON_ONCE(1); + + return NULL; +} + +void resctrl_arch_reset_rmid(struct rdt_resource *r, struct rdt_domain *d, + u32 rmid, enum resctrl_event_id eventid) +{ + struct rdt_hw_domain *hw_dom = resctrl_to_arch_dom(d); + struct arch_mbm_state *am; + + am = get_arch_mbm_state(hw_dom, rmid, eventid); + if (am) + memset(am, 0, sizeof(*am)); +} + +static u64 mbm_overflow_count(u64 prev_msr, u64 cur_msr, unsigned int width) +{ + u64 shift = 64 - width, chunks; + + chunks = (cur_msr << shift) - (prev_msr << shift); + return chunks >> shift; +} + +int resctrl_arch_rmid_read(struct rdt_resource *r, struct rdt_domain *d, + u32 rmid, enum resctrl_event_id eventid, u64 *val) { - u64 val; + struct rdt_hw_resource *hw_res = resctrl_to_arch_res(r); + struct rdt_hw_domain *hw_dom = resctrl_to_arch_dom(d); + struct arch_mbm_state *am; + u64 msr_val, chunks; + + if (!cpumask_test_cpu(smp_processor_id(), &d->cpu_mask)) + return -EINVAL; /* * As per the SDM, when IA32_QM_EVTSEL.EvtID (bits 7:0) is configured @@ -150,16 +204,26 @@ static u64 __rmid_read(u32 rmid, u32 eventid) * are error bits. */ wrmsr(MSR_IA32_QM_EVTSEL, eventid, rmid); - rdmsrl(MSR_IA32_QM_CTR, val); - - return val; -} + rdmsrl(MSR_IA32_QM_CTR, msr_val); + + if (msr_val & RMID_VAL_ERROR) + return -EIO; + if (msr_val & RMID_VAL_UNAVAIL) + return -EINVAL; + + am = get_arch_mbm_state(hw_dom, rmid, eventid); + if (am) { + am->chunks += mbm_overflow_count(am->prev_msr, msr_val, + hw_res->mbm_width); + chunks = get_corrected_mbm_count(rmid, am->chunks); + am->prev_msr = msr_val; + } else { + chunks = msr_val; + } -static bool rmid_dirty(struct rmid_entry *entry) -{ - u64 val = __rmid_read(entry->rmid, QOS_L3_OCCUP_EVENT_ID); + *val = chunks * hw_res->mon_scale; - return val >= resctrl_cqm_threshold; + return 0; } /* @@ -170,11 +234,11 @@ static bool rmid_dirty(struct rmid_entry *entry) */ void __check_limbo(struct rdt_domain *d, bool force_free) { + struct rdt_resource *r = &rdt_resources_all[RDT_RESOURCE_L3].r_resctrl; struct rmid_entry *entry; - struct rdt_resource *r; u32 crmid = 1, nrmid; - - r = &rdt_resources_all[RDT_RESOURCE_L3].r_resctrl; + bool rmid_dirty; + u64 val = 0; /* * Skip RMID 0 and start from RMID 1 and check all the RMIDs that @@ -188,7 +252,15 @@ void __check_limbo(struct rdt_domain *d, bool force_free) break; entry = __rmid_entry(nrmid); - if (force_free || !rmid_dirty(entry)) { + + if (resctrl_arch_rmid_read(r, d, entry->rmid, + QOS_L3_OCCUP_EVENT_ID, &val)) { + rmid_dirty = true; + } else { + rmid_dirty = (val >= resctrl_rmid_realloc_threshold); + } + + if (force_free || !rmid_dirty) { clear_bit(entry->rmid, d->rmid_busy_llc); if (!--entry->busy) { rmid_limbo_count--; @@ -227,19 +299,19 @@ int alloc_rmid(void) static void add_rmid_to_limbo(struct rmid_entry *entry) { - struct rdt_resource *r; + struct rdt_resource *r = &rdt_resources_all[RDT_RESOURCE_L3].r_resctrl; struct rdt_domain *d; - int cpu; - u64 val; - - r = &rdt_resources_all[RDT_RESOURCE_L3].r_resctrl; + int cpu, err; + u64 val = 0; entry->busy = 0; cpu = get_cpu(); list_for_each_entry(d, &r->domains, list) { if (cpumask_test_cpu(cpu, &d->cpu_mask)) { - val = __rmid_read(entry->rmid, QOS_L3_OCCUP_EVENT_ID); - if (val <= resctrl_cqm_threshold) + err = resctrl_arch_rmid_read(r, d, entry->rmid, + QOS_L3_OCCUP_EVENT_ID, + &val); + if (err || val <= resctrl_rmid_realloc_threshold) continue; } @@ -277,24 +349,18 @@ void free_rmid(u32 rmid) list_add_tail(&entry->list, &rmid_free_lru); } -static u64 mbm_overflow_count(u64 prev_msr, u64 cur_msr, unsigned int width) +static int __mon_event_count(u32 rmid, struct rmid_read *rr) { - u64 shift = 64 - width, chunks; + struct mbm_state *m; + u64 tval = 0; - chunks = (cur_msr << shift) - (prev_msr << shift); - return chunks >>= shift; -} + if (rr->first) + resctrl_arch_reset_rmid(rr->r, rr->d, rmid, rr->evtid); -static u64 __mon_event_count(u32 rmid, struct rmid_read *rr) -{ - struct rdt_hw_resource *hw_res = resctrl_to_arch_res(rr->r); - struct mbm_state *m; - u64 chunks, tval; + rr->err = resctrl_arch_rmid_read(rr->r, rr->d, rmid, rr->evtid, &tval); + if (rr->err) + return rr->err; - tval = __rmid_read(rmid, rr->evtid); - if (tval & (RMID_VAL_ERROR | RMID_VAL_UNAVAIL)) { - return tval; - } switch (rr->evtid) { case QOS_L3_OCCUP_EVENT_ID: rr->val += tval; @@ -308,48 +374,47 @@ static u64 __mon_event_count(u32 rmid, struct rmid_read *rr) default: /* * Code would never reach here because an invalid - * event id would fail the __rmid_read. + * event id would fail in resctrl_arch_rmid_read(). */ - return RMID_VAL_ERROR; + return -EINVAL; } if (rr->first) { memset(m, 0, sizeof(struct mbm_state)); - m->prev_bw_msr = m->prev_msr = tval; return 0; } - chunks = mbm_overflow_count(m->prev_msr, tval, hw_res->mbm_width); - m->chunks += chunks; - m->prev_msr = tval; - - rr->val += get_corrected_mbm_count(rmid, m->chunks); + rr->val += tval; return 0; } /* + * mbm_bw_count() - Update bw count from values previously read by + * __mon_event_count(). + * @rmid: The rmid used to identify the cached mbm_state. + * @rr: The struct rmid_read populated by __mon_event_count(). + * * Supporting function to calculate the memory bandwidth - * and delta bandwidth in MBps. + * and delta bandwidth in MBps. The chunks value previously read by + * __mon_event_count() is compared with the chunks value from the previous + * invocation. This must be called once per second to maintain values in MBps. */ static void mbm_bw_count(u32 rmid, struct rmid_read *rr) { - struct rdt_hw_resource *hw_res = resctrl_to_arch_res(rr->r); struct mbm_state *m = &rr->d->mbm_local[rmid]; - u64 tval, cur_bw, chunks; + u64 cur_bw, bytes, cur_bytes; - tval = __rmid_read(rmid, rr->evtid); - if (tval & (RMID_VAL_ERROR | RMID_VAL_UNAVAIL)) - return; + cur_bytes = rr->val; + bytes = cur_bytes - m->prev_bw_bytes; + m->prev_bw_bytes = cur_bytes; - chunks = mbm_overflow_count(m->prev_bw_msr, tval, hw_res->mbm_width); - cur_bw = (get_corrected_mbm_count(rmid, chunks) * hw_res->mon_scale) >> 20; + cur_bw = bytes / SZ_1M; if (m->delta_comp) m->delta_bw = abs(cur_bw - m->prev_bw); m->delta_comp = false; m->prev_bw = cur_bw; - m->prev_bw_msr = tval; } /* @@ -361,11 +426,11 @@ void mon_event_count(void *info) struct rdtgroup *rdtgrp, *entry; struct rmid_read *rr = info; struct list_head *head; - u64 ret_val; + int ret; rdtgrp = rr->rgrp; - ret_val = __mon_event_count(rdtgrp->mon.rmid, rr); + ret = __mon_event_count(rdtgrp->mon.rmid, rr); /* * For Ctrl groups read data from child monitor groups and @@ -377,13 +442,17 @@ void mon_event_count(void *info) if (rdtgrp->type == RDTCTRL_GROUP) { list_for_each_entry(entry, head, mon.crdtgrp_list) { if (__mon_event_count(entry->mon.rmid, rr) == 0) - ret_val = 0; + ret = 0; } } - /* Report error if none of rmid_reads are successful */ - if (ret_val) - rr->val = ret_val; + /* + * __mon_event_count() calls for newly created monitor groups may + * report -EINVAL/Unavailable if the monitor hasn't seen any traffic. + * Discard error if any of the monitor event reads succeeded. + */ + if (ret == 0) + rr->err = 0; } /* @@ -420,10 +489,8 @@ void mon_event_count(void *info) */ static void update_mba_bw(struct rdtgroup *rgrp, struct rdt_domain *dom_mbm) { - u32 closid, rmid, cur_msr, cur_msr_val, new_msr_val; + u32 closid, rmid, cur_msr_val, new_msr_val; struct mbm_state *pmbm_data, *cmbm_data; - struct rdt_hw_resource *hw_r_mba; - struct rdt_hw_domain *hw_dom_mba; u32 cur_bw, delta_bw, user_bw; struct rdt_resource *r_mba; struct rdt_domain *dom_mba; @@ -433,8 +500,8 @@ static void update_mba_bw(struct rdtgroup *rgrp, struct rdt_domain *dom_mbm) if (!is_mbm_local_enabled()) return; - hw_r_mba = &rdt_resources_all[RDT_RESOURCE_MBA]; - r_mba = &hw_r_mba->r_resctrl; + r_mba = &rdt_resources_all[RDT_RESOURCE_MBA].r_resctrl; + closid = rgrp->closid; rmid = rgrp->mon.rmid; pmbm_data = &dom_mbm->mbm_local[rmid]; @@ -444,16 +511,13 @@ static void update_mba_bw(struct rdtgroup *rgrp, struct rdt_domain *dom_mbm) pr_warn_once("Failure to get domain for MBA update\n"); return; } - hw_dom_mba = resctrl_to_arch_dom(dom_mba); cur_bw = pmbm_data->prev_bw; - user_bw = resctrl_arch_get_config(r_mba, dom_mba, closid, CDP_NONE); + user_bw = dom_mba->mbps_val[closid]; delta_bw = pmbm_data->delta_bw; - /* - * resctrl_arch_get_config() chooses the mbps/ctrl value to return - * based on is_mba_sc(). For now, reach into the hw_dom. - */ - cur_msr_val = hw_dom_mba->ctrl_val[closid]; + + /* MBA resource doesn't support CDP */ + cur_msr_val = resctrl_arch_get_config(r_mba, dom_mba, closid, CDP_NONE); /* * For Ctrl groups read data from child monitor groups. @@ -488,9 +552,7 @@ static void update_mba_bw(struct rdtgroup *rgrp, struct rdt_domain *dom_mbm) return; } - cur_msr = hw_r_mba->msr_base + closid; - wrmsrl(cur_msr, delay_bw_map(new_msr_val, r_mba)); - hw_dom_mba->ctrl_val[closid] = new_msr_val; + resctrl_arch_update_one(r_mba, dom_mba, closid, CDP_NONE, new_msr_val); /* * Delta values are updated dynamically package wise for each @@ -523,10 +585,12 @@ static void mbm_update(struct rdt_resource *r, struct rdt_domain *d, int rmid) */ if (is_mbm_total_enabled()) { rr.evtid = QOS_L3_MBM_TOTAL_EVENT_ID; + rr.val = 0; __mon_event_count(rmid, &rr); } if (is_mbm_local_enabled()) { rr.evtid = QOS_L3_MBM_LOCAL_EVENT_ID; + rr.val = 0; __mon_event_count(rmid, &rr); /* @@ -686,9 +750,10 @@ int rdt_get_mon_l3_config(struct rdt_resource *r) { unsigned int mbm_offset = boot_cpu_data.x86_cache_mbm_width_offset; struct rdt_hw_resource *hw_res = resctrl_to_arch_res(r); - unsigned int cl_size = boot_cpu_data.x86_cache_size; + unsigned int threshold; int ret; + resctrl_rmid_realloc_limit = boot_cpu_data.x86_cache_size * 1024; hw_res->mon_scale = boot_cpu_data.x86_cache_occ_scale; r->num_rmid = boot_cpu_data.x86_cache_max_rmid + 1; hw_res->mbm_width = MBM_CNTR_WIDTH_BASE; @@ -705,10 +770,14 @@ int rdt_get_mon_l3_config(struct rdt_resource *r) * * For a 35MB LLC and 56 RMIDs, this is ~1.8% of the LLC. */ - resctrl_cqm_threshold = cl_size * 1024 / r->num_rmid; + threshold = resctrl_rmid_realloc_limit / r->num_rmid; - /* h/w works in units of "boot_cpu_data.x86_cache_occ_scale" */ - resctrl_cqm_threshold /= hw_res->mon_scale; + /* + * Because num_rmid may not be a power of two, round the value + * to the nearest multiple of hw_res->mon_scale so it matches a + * value the hardware will measure. mon_scale may not be a power of 2. + */ + resctrl_rmid_realloc_threshold = resctrl_arch_round_mon_val(threshold); ret = dom_data_init(r); if (ret) @@ -717,7 +786,6 @@ int rdt_get_mon_l3_config(struct rdt_resource *r) l3_mon_evt_init(r); r->mon_capable = true; - r->mon_enabled = true; return 0; } diff --git a/arch/x86/kernel/cpu/resctrl/pseudo_lock.c b/arch/x86/kernel/cpu/resctrl/pseudo_lock.c index db813f819ad6..d961ae3ed96e 100644 --- a/arch/x86/kernel/cpu/resctrl/pseudo_lock.c +++ b/arch/x86/kernel/cpu/resctrl/pseudo_lock.c @@ -420,6 +420,7 @@ static int pseudo_lock_fn(void *_rdtgrp) struct pseudo_lock_region *plr = rdtgrp->plr; u32 rmid_p, closid_p; unsigned long i; + u64 saved_msr; #ifdef CONFIG_KASAN /* * The registers used for local register variables are also used @@ -463,6 +464,7 @@ static int pseudo_lock_fn(void *_rdtgrp) * the buffer and evict pseudo-locked memory read earlier from the * cache. */ + saved_msr = __rdmsr(MSR_MISC_FEATURE_CONTROL); __wrmsr(MSR_MISC_FEATURE_CONTROL, prefetch_disable_bits, 0x0); closid_p = this_cpu_read(pqr_state.cur_closid); rmid_p = this_cpu_read(pqr_state.cur_rmid); @@ -514,7 +516,7 @@ static int pseudo_lock_fn(void *_rdtgrp) __wrmsr(IA32_PQR_ASSOC, rmid_p, closid_p); /* Re-enable the hardware prefetcher(s) */ - wrmsr(MSR_MISC_FEATURE_CONTROL, 0x0, 0x0); + wrmsrl(MSR_MISC_FEATURE_CONTROL, saved_msr); local_irq_enable(); plr->thread_done = 1; @@ -835,7 +837,7 @@ bool rdtgroup_pseudo_locked_in_hierarchy(struct rdt_domain *d) * First determine which cpus have pseudo-locked regions * associated with them. */ - for_each_alloc_enabled_rdt_resource(r) { + for_each_alloc_capable_rdt_resource(r) { list_for_each_entry(d_i, &r->domains, list) { if (d_i->plr) cpumask_or(cpu_with_psl, cpu_with_psl, @@ -871,6 +873,7 @@ bool rdtgroup_pseudo_locked_in_hierarchy(struct rdt_domain *d) static int measure_cycles_lat_fn(void *_plr) { struct pseudo_lock_region *plr = _plr; + u32 saved_low, saved_high; unsigned long i; u64 start, end; void *mem_r; @@ -879,6 +882,7 @@ static int measure_cycles_lat_fn(void *_plr) /* * Disable hardware prefetchers. */ + rdmsr(MSR_MISC_FEATURE_CONTROL, saved_low, saved_high); wrmsr(MSR_MISC_FEATURE_CONTROL, prefetch_disable_bits, 0x0); mem_r = READ_ONCE(plr->kmem); /* @@ -895,7 +899,7 @@ static int measure_cycles_lat_fn(void *_plr) end = rdtsc_ordered(); trace_pseudo_lock_mem_latency((u32)(end - start)); } - wrmsr(MSR_MISC_FEATURE_CONTROL, 0x0, 0x0); + wrmsr(MSR_MISC_FEATURE_CONTROL, saved_low, saved_high); local_irq_enable(); plr->thread_done = 1; wake_up_interruptible(&plr->lock_thread_wq); @@ -940,6 +944,7 @@ static int measure_residency_fn(struct perf_event_attr *miss_attr, u64 hits_before = 0, hits_after = 0, miss_before = 0, miss_after = 0; struct perf_event *miss_event, *hit_event; int hit_pmcnum, miss_pmcnum; + u32 saved_low, saved_high; unsigned int line_size; unsigned int size; unsigned long i; @@ -973,6 +978,7 @@ static int measure_residency_fn(struct perf_event_attr *miss_attr, /* * Disable hardware prefetchers. */ + rdmsr(MSR_MISC_FEATURE_CONTROL, saved_low, saved_high); wrmsr(MSR_MISC_FEATURE_CONTROL, prefetch_disable_bits, 0x0); /* Initialize rest of local variables */ @@ -1031,7 +1037,7 @@ static int measure_residency_fn(struct perf_event_attr *miss_attr, */ rmb(); /* Re-enable hardware prefetchers */ - wrmsr(MSR_MISC_FEATURE_CONTROL, 0x0, 0x0); + wrmsr(MSR_MISC_FEATURE_CONTROL, saved_low, saved_high); local_irq_enable(); out_hit: perf_event_release_kernel(hit_event); diff --git a/arch/x86/kernel/cpu/resctrl/rdtgroup.c b/arch/x86/kernel/cpu/resctrl/rdtgroup.c index b57b3db9a6a7..e5a48f05e787 100644 --- a/arch/x86/kernel/cpu/resctrl/rdtgroup.c +++ b/arch/x86/kernel/cpu/resctrl/rdtgroup.c @@ -341,14 +341,14 @@ static int cpus_mon_write(struct rdtgroup *rdtgrp, cpumask_var_t newmask, /* Check whether cpus belong to parent ctrl group */ cpumask_andnot(tmpmask, newmask, &prgrp->cpu_mask); - if (cpumask_weight(tmpmask)) { + if (!cpumask_empty(tmpmask)) { rdt_last_cmd_puts("Can only add CPUs to mongroup that belong to parent\n"); return -EINVAL; } /* Check whether cpus are dropped from this group */ cpumask_andnot(tmpmask, &rdtgrp->cpu_mask, newmask); - if (cpumask_weight(tmpmask)) { + if (!cpumask_empty(tmpmask)) { /* Give any dropped cpus to parent rdtgroup */ cpumask_or(&prgrp->cpu_mask, &prgrp->cpu_mask, tmpmask); update_closid_rmid(tmpmask, prgrp); @@ -359,7 +359,7 @@ static int cpus_mon_write(struct rdtgroup *rdtgrp, cpumask_var_t newmask, * and update per-cpu rmid */ cpumask_andnot(tmpmask, newmask, &rdtgrp->cpu_mask); - if (cpumask_weight(tmpmask)) { + if (!cpumask_empty(tmpmask)) { head = &prgrp->mon.crdtgrp_list; list_for_each_entry(crgrp, head, mon.crdtgrp_list) { if (crgrp == rdtgrp) @@ -394,7 +394,7 @@ static int cpus_ctrl_write(struct rdtgroup *rdtgrp, cpumask_var_t newmask, /* Check whether cpus are dropped from this group */ cpumask_andnot(tmpmask, &rdtgrp->cpu_mask, newmask); - if (cpumask_weight(tmpmask)) { + if (!cpumask_empty(tmpmask)) { /* Can't drop from default group */ if (rdtgrp == &rdtgroup_default) { rdt_last_cmd_puts("Can't drop CPUs from default group\n"); @@ -413,12 +413,12 @@ static int cpus_ctrl_write(struct rdtgroup *rdtgrp, cpumask_var_t newmask, * and update per-cpu closid/rmid. */ cpumask_andnot(tmpmask, newmask, &rdtgrp->cpu_mask); - if (cpumask_weight(tmpmask)) { + if (!cpumask_empty(tmpmask)) { list_for_each_entry(r, &rdt_all_groups, rdtgroup_list) { if (r == rdtgrp) continue; cpumask_and(tmpmask1, &r->cpu_mask, tmpmask); - if (cpumask_weight(tmpmask1)) + if (!cpumask_empty(tmpmask1)) cpumask_rdtgrp_clear(r, tmpmask1); } update_closid_rmid(tmpmask, rdtgrp); @@ -488,7 +488,7 @@ static ssize_t rdtgroup_cpus_write(struct kernfs_open_file *of, /* check that user didn't specify any offline cpus */ cpumask_andnot(tmpmask, newmask, cpu_online_mask); - if (cpumask_weight(tmpmask)) { + if (!cpumask_empty(tmpmask)) { ret = -EINVAL; rdt_last_cmd_puts("Can only assign online CPUs\n"); goto unlock; @@ -1030,10 +1030,7 @@ static int rdt_delay_linear_show(struct kernfs_open_file *of, static int max_threshold_occ_show(struct kernfs_open_file *of, struct seq_file *seq, void *v) { - struct rdt_resource *r = of->kn->parent->priv; - struct rdt_hw_resource *hw_res = resctrl_to_arch_res(r); - - seq_printf(seq, "%u\n", resctrl_cqm_threshold * hw_res->mon_scale); + seq_printf(seq, "%u\n", resctrl_rmid_realloc_threshold); return 0; } @@ -1055,7 +1052,6 @@ static int rdt_thread_throttle_mode_show(struct kernfs_open_file *of, static ssize_t max_threshold_occ_write(struct kernfs_open_file *of, char *buf, size_t nbytes, loff_t off) { - struct rdt_hw_resource *hw_res; unsigned int bytes; int ret; @@ -1063,11 +1059,10 @@ static ssize_t max_threshold_occ_write(struct kernfs_open_file *of, if (ret) return ret; - if (bytes > (boot_cpu_data.x86_cache_size * 1024)) + if (bytes > resctrl_rmid_realloc_limit) return -EINVAL; - hw_res = resctrl_to_arch_res(of->kn->parent->priv); - resctrl_cqm_threshold = bytes / hw_res->mon_scale; + resctrl_rmid_realloc_threshold = resctrl_arch_round_mon_val(bytes); return nbytes; } @@ -1356,11 +1351,13 @@ static int rdtgroup_size_show(struct kernfs_open_file *of, struct seq_file *s, void *v) { struct resctrl_schema *schema; + enum resctrl_conf_type type; struct rdtgroup *rdtgrp; struct rdt_resource *r; struct rdt_domain *d; unsigned int size; int ret = 0; + u32 closid; bool sep; u32 ctrl; @@ -1386,8 +1383,11 @@ static int rdtgroup_size_show(struct kernfs_open_file *of, goto out; } + closid = rdtgrp->closid; + list_for_each_entry(schema, &resctrl_schema_all, list) { r = schema->res; + type = schema->conf_type; sep = false; seq_printf(s, "%*s:", max_name_width, schema->name); list_for_each_entry(d, &r->domains, list) { @@ -1396,9 +1396,12 @@ static int rdtgroup_size_show(struct kernfs_open_file *of, if (rdtgrp->mode == RDT_MODE_PSEUDO_LOCKSETUP) { size = 0; } else { - ctrl = resctrl_arch_get_config(r, d, - rdtgrp->closid, - schema->conf_type); + if (is_mba_sc(r)) + ctrl = d->mbps_val[closid]; + else + ctrl = resctrl_arch_get_config(r, d, + closid, + type); if (r->rid == RDT_RESOURCE_MBA) size = ctrl; else @@ -1756,7 +1759,7 @@ static int rdtgroup_create_info_dir(struct kernfs_node *parent_kn) if (ret) goto out_destroy; - /* loop over enabled controls, these are all alloc_enabled */ + /* loop over enabled controls, these are all alloc_capable */ list_for_each_entry(s, &resctrl_schema_all, list) { r = s->res; fflags = r->fflags | RF_CTRL_INFO; @@ -1765,7 +1768,7 @@ static int rdtgroup_create_info_dir(struct kernfs_node *parent_kn) goto out_destroy; } - for_each_mon_enabled_rdt_resource(r) { + for_each_mon_capable_rdt_resource(r) { fflags = r->fflags | RF_MON_INFO; sprintf(name, "%s_MON", r->name); ret = rdtgroup_mkdir_info_resdir(r, name, fflags); @@ -1889,26 +1892,61 @@ void rdt_domain_reconfigure_cdp(struct rdt_resource *r) l3_qos_cfg_update(&hw_res->cdp_enabled); } +static int mba_sc_domain_allocate(struct rdt_resource *r, struct rdt_domain *d) +{ + u32 num_closid = resctrl_arch_get_num_closid(r); + int cpu = cpumask_any(&d->cpu_mask); + int i; + + d->mbps_val = kcalloc_node(num_closid, sizeof(*d->mbps_val), + GFP_KERNEL, cpu_to_node(cpu)); + if (!d->mbps_val) + return -ENOMEM; + + for (i = 0; i < num_closid; i++) + d->mbps_val[i] = MBA_MAX_MBPS; + + return 0; +} + +static void mba_sc_domain_destroy(struct rdt_resource *r, + struct rdt_domain *d) +{ + kfree(d->mbps_val); + d->mbps_val = NULL; +} + /* - * Enable or disable the MBA software controller - * which helps user specify bandwidth in MBps. * MBA software controller is supported only if * MBM is supported and MBA is in linear scale. */ +static bool supports_mba_mbps(void) +{ + struct rdt_resource *r = &rdt_resources_all[RDT_RESOURCE_MBA].r_resctrl; + + return (is_mbm_local_enabled() && + r->alloc_capable && is_mba_linear()); +} + +/* + * Enable or disable the MBA software controller + * which helps user specify bandwidth in MBps. + */ static int set_mba_sc(bool mba_sc) { struct rdt_resource *r = &rdt_resources_all[RDT_RESOURCE_MBA].r_resctrl; - struct rdt_hw_domain *hw_dom; + u32 num_closid = resctrl_arch_get_num_closid(r); struct rdt_domain *d; + int i; - if (!is_mbm_enabled() || !is_mba_linear() || - mba_sc == is_mba_sc(r)) + if (!supports_mba_mbps() || mba_sc == is_mba_sc(r)) return -EINVAL; r->membw.mba_sc = mba_sc; + list_for_each_entry(d, &r->domains, list) { - hw_dom = resctrl_to_arch_dom(d); - setup_default_ctrlval(r, hw_dom->ctrl_val, hw_dom->mbps_val); + for (i = 0; i < num_closid; i++) + d->mbps_val[i] = MBA_MAX_MBPS; } return 0; @@ -2106,7 +2144,7 @@ static int schemata_list_create(void) struct rdt_resource *r; int ret = 0; - for_each_alloc_enabled_rdt_resource(r) { + for_each_alloc_capable_rdt_resource(r) { if (resctrl_arch_get_cdp_enabled(r->rid)) { ret = schemata_list_add(r, CDP_CODE); if (ret) @@ -2261,7 +2299,7 @@ static int rdt_parse_param(struct fs_context *fc, struct fs_parameter *param) ctx->enable_cdpl2 = true; return 0; case Opt_mba_mbps: - if (boot_cpu_data.x86_vendor != X86_VENDOR_INTEL) + if (!supports_mba_mbps()) return -EINVAL; ctx->enable_mba_mbps = true; return 0; @@ -2452,7 +2490,7 @@ static void rdt_kill_sb(struct super_block *sb) set_mba_sc(false); /*Put everything back to default values. */ - for_each_alloc_enabled_rdt_resource(r) + for_each_alloc_capable_rdt_resource(r) reset_all_ctrls(r); cdp_disable_all(); rmdir_all_sub(); @@ -2499,14 +2537,12 @@ static int mon_addfile(struct kernfs_node *parent_kn, const char *name, * Remove all subdirectories of mon_data of ctrl_mon groups * and monitor groups with given domain id. */ -void rmdir_mondata_subdir_allrdtgrp(struct rdt_resource *r, unsigned int dom_id) +static void rmdir_mondata_subdir_allrdtgrp(struct rdt_resource *r, + unsigned int dom_id) { struct rdtgroup *prgrp, *crgrp; char name[32]; - if (!r->mon_enabled) - return; - list_for_each_entry(prgrp, &rdt_all_groups, rdtgroup_list) { sprintf(name, "mon_%s_%02d", r->name, dom_id); kernfs_remove_by_name(prgrp->mon.mon_data_kn, name); @@ -2565,16 +2601,13 @@ out_destroy: * Add all subdirectories of mon_data for "ctrl_mon" groups * and "monitor" groups with given domain id. */ -void mkdir_mondata_subdir_allrdtgrp(struct rdt_resource *r, - struct rdt_domain *d) +static void mkdir_mondata_subdir_allrdtgrp(struct rdt_resource *r, + struct rdt_domain *d) { struct kernfs_node *parent_kn; struct rdtgroup *prgrp, *crgrp; struct list_head *head; - if (!r->mon_enabled) - return; - list_for_each_entry(prgrp, &rdt_all_groups, rdtgroup_list) { parent_kn = prgrp->mon.mon_data_kn; mkdir_mondata_subdir(parent_kn, d, r, prgrp); @@ -2642,7 +2675,7 @@ static int mkdir_mondata_all(struct kernfs_node *parent_kn, * Create the subdirectories for each domain. Note that all events * in a domain like L3 are grouped into a resource whose domain is L3 */ - for_each_mon_enabled_rdt_resource(r) { + for_each_mon_capable_rdt_resource(r) { ret = mkdir_mondata_subdir_alldom(kn, r, prgrp); if (ret) goto out_destroy; @@ -2786,14 +2819,19 @@ static int rdtgroup_init_cat(struct resctrl_schema *s, u32 closid) } /* Initialize MBA resource with default values. */ -static void rdtgroup_init_mba(struct rdt_resource *r) +static void rdtgroup_init_mba(struct rdt_resource *r, u32 closid) { struct resctrl_staged_config *cfg; struct rdt_domain *d; list_for_each_entry(d, &r->domains, list) { + if (is_mba_sc(r)) { + d->mbps_val[closid] = MBA_MAX_MBPS; + continue; + } + cfg = &d->staged_config[CDP_NONE]; - cfg->new_ctrl = is_mba_sc(r) ? MBA_MAX_MBPS : r->default_ctrl; + cfg->new_ctrl = r->default_ctrl; cfg->have_new_ctrl = true; } } @@ -2808,7 +2846,9 @@ static int rdtgroup_init_alloc(struct rdtgroup *rdtgrp) list_for_each_entry(s, &resctrl_schema_all, list) { r = s->res; if (r->rid == RDT_RESOURCE_MBA) { - rdtgroup_init_mba(r); + rdtgroup_init_mba(r, rdtgrp->closid); + if (is_mba_sc(r)) + continue; } else { ret = rdtgroup_init_cat(s, rdtgrp->closid); if (ret < 0) @@ -3221,13 +3261,13 @@ static int __init rdtgroup_setup_root(void) list_add(&rdtgroup_default.rdtgroup_list, &rdt_all_groups); - ret = rdtgroup_add_files(rdt_root->kn, RF_CTRL_BASE); + ret = rdtgroup_add_files(kernfs_root_to_node(rdt_root), RF_CTRL_BASE); if (ret) { kernfs_destroy_root(rdt_root); goto out; } - rdtgroup_default.kn = rdt_root->kn; + rdtgroup_default.kn = kernfs_root_to_node(rdt_root); kernfs_activate(rdtgroup_default.kn); out: @@ -3236,6 +3276,110 @@ out: return ret; } +static void domain_destroy_mon_state(struct rdt_domain *d) +{ + bitmap_free(d->rmid_busy_llc); + kfree(d->mbm_total); + kfree(d->mbm_local); +} + +void resctrl_offline_domain(struct rdt_resource *r, struct rdt_domain *d) +{ + lockdep_assert_held(&rdtgroup_mutex); + + if (supports_mba_mbps() && r->rid == RDT_RESOURCE_MBA) + mba_sc_domain_destroy(r, d); + + if (!r->mon_capable) + return; + + /* + * If resctrl is mounted, remove all the + * per domain monitor data directories. + */ + if (static_branch_unlikely(&rdt_mon_enable_key)) + rmdir_mondata_subdir_allrdtgrp(r, d->id); + + if (is_mbm_enabled()) + cancel_delayed_work(&d->mbm_over); + if (is_llc_occupancy_enabled() && has_busy_rmid(r, d)) { + /* + * When a package is going down, forcefully + * decrement rmid->ebusy. There is no way to know + * that the L3 was flushed and hence may lead to + * incorrect counts in rare scenarios, but leaving + * the RMID as busy creates RMID leaks if the + * package never comes back. + */ + __check_limbo(d, true); + cancel_delayed_work(&d->cqm_limbo); + } + + domain_destroy_mon_state(d); +} + +static int domain_setup_mon_state(struct rdt_resource *r, struct rdt_domain *d) +{ + size_t tsize; + + if (is_llc_occupancy_enabled()) { + d->rmid_busy_llc = bitmap_zalloc(r->num_rmid, GFP_KERNEL); + if (!d->rmid_busy_llc) + return -ENOMEM; + } + if (is_mbm_total_enabled()) { + tsize = sizeof(*d->mbm_total); + d->mbm_total = kcalloc(r->num_rmid, tsize, GFP_KERNEL); + if (!d->mbm_total) { + bitmap_free(d->rmid_busy_llc); + return -ENOMEM; + } + } + if (is_mbm_local_enabled()) { + tsize = sizeof(*d->mbm_local); + d->mbm_local = kcalloc(r->num_rmid, tsize, GFP_KERNEL); + if (!d->mbm_local) { + bitmap_free(d->rmid_busy_llc); + kfree(d->mbm_total); + return -ENOMEM; + } + } + + return 0; +} + +int resctrl_online_domain(struct rdt_resource *r, struct rdt_domain *d) +{ + int err; + + lockdep_assert_held(&rdtgroup_mutex); + + if (supports_mba_mbps() && r->rid == RDT_RESOURCE_MBA) + /* RDT_RESOURCE_MBA is never mon_capable */ + return mba_sc_domain_allocate(r, d); + + if (!r->mon_capable) + return 0; + + err = domain_setup_mon_state(r, d); + if (err) + return err; + + if (is_mbm_enabled()) { + INIT_DELAYED_WORK(&d->mbm_over, mbm_handle_overflow); + mbm_setup_overflow_handler(d, MBM_OVERFLOW_INTERVAL); + } + + if (is_llc_occupancy_enabled()) + INIT_DELAYED_WORK(&d->cqm_limbo, cqm_handle_limbo); + + /* If resctrl is mounted, add per domain monitor data directories. */ + if (static_branch_unlikely(&rdt_mon_enable_key)) + mkdir_mondata_subdir_allrdtgrp(r, d); + + return 0; +} + /* * rdtgroup_init - rdtgroup initialization * diff --git a/arch/x86/kernel/cpu/scattered.c b/arch/x86/kernel/cpu/scattered.c index 21d1f062895a..fc01f81f6e2a 100644 --- a/arch/x86/kernel/cpu/scattered.c +++ b/arch/x86/kernel/cpu/scattered.c @@ -26,6 +26,8 @@ struct cpuid_bit { static const struct cpuid_bit cpuid_bits[] = { { X86_FEATURE_APERFMPERF, CPUID_ECX, 0, 0x00000006, 0 }, { X86_FEATURE_EPB, CPUID_ECX, 3, 0x00000006, 0 }, + { X86_FEATURE_INTEL_PPIN, CPUID_EBX, 0, 0x00000007, 1 }, + { X86_FEATURE_RRSBA_CTRL, CPUID_EDX, 2, 0x00000007, 2 }, { X86_FEATURE_CQM_LLC, CPUID_EDX, 1, 0x0000000f, 0 }, { X86_FEATURE_CQM_OCCUP_LLC, CPUID_EDX, 0, 0x0000000f, 1 }, { X86_FEATURE_CQM_MBM_TOTAL, CPUID_EDX, 1, 0x0000000f, 1 }, @@ -42,6 +44,8 @@ static const struct cpuid_bit cpuid_bits[] = { { X86_FEATURE_CPB, CPUID_EDX, 9, 0x80000007, 0 }, { X86_FEATURE_PROC_FEEDBACK, CPUID_EDX, 11, 0x80000007, 0 }, { X86_FEATURE_MBA, CPUID_EBX, 6, 0x80000008, 0 }, + { X86_FEATURE_PERFMON_V2, CPUID_EAX, 0, 0x80000022, 0 }, + { X86_FEATURE_AMD_LBR_V2, CPUID_EAX, 1, 0x80000022, 0 }, { 0, 0, 0, 0, 0 } }; diff --git a/arch/x86/kernel/cpu/sgx/encl.c b/arch/x86/kernel/cpu/sgx/encl.c index 001808e3901c..1ec20807de1e 100644 --- a/arch/x86/kernel/cpu/sgx/encl.c +++ b/arch/x86/kernel/cpu/sgx/encl.c @@ -12,6 +12,119 @@ #include "encls.h" #include "sgx.h" +static int sgx_encl_lookup_backing(struct sgx_encl *encl, unsigned long page_index, + struct sgx_backing *backing); + +#define PCMDS_PER_PAGE (PAGE_SIZE / sizeof(struct sgx_pcmd)) +/* + * 32 PCMD entries share a PCMD page. PCMD_FIRST_MASK is used to + * determine the page index associated with the first PCMD entry + * within a PCMD page. + */ +#define PCMD_FIRST_MASK GENMASK(4, 0) + +/** + * reclaimer_writing_to_pcmd() - Query if any enclave page associated with + * a PCMD page is in process of being reclaimed. + * @encl: Enclave to which PCMD page belongs + * @start_addr: Address of enclave page using first entry within the PCMD page + * + * When an enclave page is reclaimed some Paging Crypto MetaData (PCMD) is + * stored. The PCMD data of a reclaimed enclave page contains enough + * information for the processor to verify the page at the time + * it is loaded back into the Enclave Page Cache (EPC). + * + * The backing storage to which enclave pages are reclaimed is laid out as + * follows: + * Encrypted enclave pages:SECS page:PCMD pages + * + * Each PCMD page contains the PCMD metadata of + * PAGE_SIZE/sizeof(struct sgx_pcmd) enclave pages. + * + * A PCMD page can only be truncated if it is (a) empty, and (b) not in the + * process of getting data (and thus soon being non-empty). (b) is tested with + * a check if an enclave page sharing the PCMD page is in the process of being + * reclaimed. + * + * The reclaimer sets the SGX_ENCL_PAGE_BEING_RECLAIMED flag when it + * intends to reclaim that enclave page - it means that the PCMD page + * associated with that enclave page is about to get some data and thus + * even if the PCMD page is empty, it should not be truncated. + * + * Context: Enclave mutex (&sgx_encl->lock) must be held. + * Return: 1 if the reclaimer is about to write to the PCMD page + * 0 if the reclaimer has no intention to write to the PCMD page + */ +static int reclaimer_writing_to_pcmd(struct sgx_encl *encl, + unsigned long start_addr) +{ + int reclaimed = 0; + int i; + + /* + * PCMD_FIRST_MASK is based on number of PCMD entries within + * PCMD page being 32. + */ + BUILD_BUG_ON(PCMDS_PER_PAGE != 32); + + for (i = 0; i < PCMDS_PER_PAGE; i++) { + struct sgx_encl_page *entry; + unsigned long addr; + + addr = start_addr + i * PAGE_SIZE; + + /* + * Stop when reaching the SECS page - it does not + * have a page_array entry and its reclaim is + * started and completed with enclave mutex held so + * it does not use the SGX_ENCL_PAGE_BEING_RECLAIMED + * flag. + */ + if (addr == encl->base + encl->size) + break; + + entry = xa_load(&encl->page_array, PFN_DOWN(addr)); + if (!entry) + continue; + + /* + * VA page slot ID uses same bit as the flag so it is important + * to ensure that the page is not already in backing store. + */ + if (entry->epc_page && + (entry->desc & SGX_ENCL_PAGE_BEING_RECLAIMED)) { + reclaimed = 1; + break; + } + } + + return reclaimed; +} + +/* + * Calculate byte offset of a PCMD struct associated with an enclave page. PCMD's + * follow right after the EPC data in the backing storage. In addition to the + * visible enclave pages, there's one extra page slot for SECS, before PCMD + * structs. + */ +static inline pgoff_t sgx_encl_get_backing_page_pcmd_offset(struct sgx_encl *encl, + unsigned long page_index) +{ + pgoff_t epc_end_off = encl->size + sizeof(struct sgx_secs); + + return epc_end_off + page_index * sizeof(struct sgx_pcmd); +} + +/* + * Free a page from the backing storage in the given page index. + */ +static inline void sgx_encl_truncate_backing_page(struct sgx_encl *encl, unsigned long page_index) +{ + struct inode *inode = file_inode(encl->backing); + + shmem_truncate_range(inode, PFN_PHYS(page_index), PFN_PHYS(page_index) + PAGE_SIZE - 1); +} + /* * ELDU: Load an EPC page as unblocked. For more info, see "OS Management of EPC * Pages" in the SDM. @@ -22,9 +135,12 @@ static int __sgx_encl_eldu(struct sgx_encl_page *encl_page, { unsigned long va_offset = encl_page->desc & SGX_ENCL_PAGE_VA_OFFSET_MASK; struct sgx_encl *encl = encl_page->encl; + pgoff_t page_index, page_pcmd_off; + unsigned long pcmd_first_page; struct sgx_pageinfo pginfo; struct sgx_backing b; - pgoff_t page_index; + bool pcmd_page_empty; + u8 *pcmd_page; int ret; if (secs_page) @@ -32,14 +148,21 @@ static int __sgx_encl_eldu(struct sgx_encl_page *encl_page, else page_index = PFN_DOWN(encl->size); - ret = sgx_encl_get_backing(encl, page_index, &b); + /* + * Address of enclave page using the first entry within the PCMD page. + */ + pcmd_first_page = PFN_PHYS(page_index & ~PCMD_FIRST_MASK) + encl->base; + + page_pcmd_off = sgx_encl_get_backing_page_pcmd_offset(encl, page_index); + + ret = sgx_encl_lookup_backing(encl, page_index, &b); if (ret) return ret; pginfo.addr = encl_page->desc & PAGE_MASK; pginfo.contents = (unsigned long)kmap_atomic(b.contents); - pginfo.metadata = (unsigned long)kmap_atomic(b.pcmd) + - b.pcmd_offset; + pcmd_page = kmap_atomic(b.pcmd); + pginfo.metadata = (unsigned long)pcmd_page + b.pcmd_offset; if (secs_page) pginfo.secs = (u64)sgx_get_epc_virt_addr(secs_page); @@ -55,10 +178,32 @@ static int __sgx_encl_eldu(struct sgx_encl_page *encl_page, ret = -EFAULT; } - kunmap_atomic((void *)(unsigned long)(pginfo.metadata - b.pcmd_offset)); + memset(pcmd_page + b.pcmd_offset, 0, sizeof(struct sgx_pcmd)); + set_page_dirty(b.pcmd); + + /* + * The area for the PCMD in the page was zeroed above. Check if the + * whole page is now empty meaning that all PCMD's have been zeroed: + */ + pcmd_page_empty = !memchr_inv(pcmd_page, 0, PAGE_SIZE); + + kunmap_atomic(pcmd_page); kunmap_atomic((void *)(unsigned long)pginfo.contents); - sgx_encl_put_backing(&b, false); + get_page(b.pcmd); + sgx_encl_put_backing(&b); + + sgx_encl_truncate_backing_page(encl, page_index); + + if (pcmd_page_empty && !reclaimer_writing_to_pcmd(encl, pcmd_first_page)) { + sgx_encl_truncate_backing_page(encl, PFN_DOWN(page_pcmd_off)); + pcmd_page = kmap_atomic(b.pcmd); + if (memchr_inv(pcmd_page, 0, PAGE_SIZE)) + pr_warn("PCMD page not empty after truncate.\n"); + kunmap_atomic(pcmd_page); + } + + put_page(b.pcmd); return ret; } @@ -90,25 +235,10 @@ static struct sgx_epc_page *sgx_encl_eldu(struct sgx_encl_page *encl_page, return epc_page; } -static struct sgx_encl_page *sgx_encl_load_page(struct sgx_encl *encl, - unsigned long addr, - unsigned long vm_flags) +static struct sgx_encl_page *__sgx_encl_load_page(struct sgx_encl *encl, + struct sgx_encl_page *entry) { - unsigned long vm_prot_bits = vm_flags & (VM_READ | VM_WRITE | VM_EXEC); struct sgx_epc_page *epc_page; - struct sgx_encl_page *entry; - - entry = xa_load(&encl->page_array, PFN_DOWN(addr)); - if (!entry) - return ERR_PTR(-EFAULT); - - /* - * Verify that the faulted page has equal or higher build time - * permissions than the VMA permissions (i.e. the subset of {VM_READ, - * VM_WRITE, VM_EXECUTE} in vma->vm_flags). - */ - if ((entry->vm_max_prot_bits & vm_prot_bits) != vm_prot_bits) - return ERR_PTR(-EFAULT); /* Entry successfully located. */ if (entry->epc_page) { @@ -134,6 +264,149 @@ static struct sgx_encl_page *sgx_encl_load_page(struct sgx_encl *encl, return entry; } +static struct sgx_encl_page *sgx_encl_load_page_in_vma(struct sgx_encl *encl, + unsigned long addr, + unsigned long vm_flags) +{ + unsigned long vm_prot_bits = vm_flags & (VM_READ | VM_WRITE | VM_EXEC); + struct sgx_encl_page *entry; + + entry = xa_load(&encl->page_array, PFN_DOWN(addr)); + if (!entry) + return ERR_PTR(-EFAULT); + + /* + * Verify that the page has equal or higher build time + * permissions than the VMA permissions (i.e. the subset of {VM_READ, + * VM_WRITE, VM_EXECUTE} in vma->vm_flags). + */ + if ((entry->vm_max_prot_bits & vm_prot_bits) != vm_prot_bits) + return ERR_PTR(-EFAULT); + + return __sgx_encl_load_page(encl, entry); +} + +struct sgx_encl_page *sgx_encl_load_page(struct sgx_encl *encl, + unsigned long addr) +{ + struct sgx_encl_page *entry; + + entry = xa_load(&encl->page_array, PFN_DOWN(addr)); + if (!entry) + return ERR_PTR(-EFAULT); + + return __sgx_encl_load_page(encl, entry); +} + +/** + * sgx_encl_eaug_page() - Dynamically add page to initialized enclave + * @vma: VMA obtained from fault info from where page is accessed + * @encl: enclave accessing the page + * @addr: address that triggered the page fault + * + * When an initialized enclave accesses a page with no backing EPC page + * on a SGX2 system then the EPC can be added dynamically via the SGX2 + * ENCLS[EAUG] instruction. + * + * Returns: Appropriate vm_fault_t: VM_FAULT_NOPAGE when PTE was installed + * successfully, VM_FAULT_SIGBUS or VM_FAULT_OOM as error otherwise. + */ +static vm_fault_t sgx_encl_eaug_page(struct vm_area_struct *vma, + struct sgx_encl *encl, unsigned long addr) +{ + vm_fault_t vmret = VM_FAULT_SIGBUS; + struct sgx_pageinfo pginfo = {0}; + struct sgx_encl_page *encl_page; + struct sgx_epc_page *epc_page; + struct sgx_va_page *va_page; + unsigned long phys_addr; + u64 secinfo_flags; + int ret; + + if (!test_bit(SGX_ENCL_INITIALIZED, &encl->flags)) + return VM_FAULT_SIGBUS; + + /* + * Ignore internal permission checking for dynamically added pages. + * They matter only for data added during the pre-initialization + * phase. The enclave decides the permissions by the means of + * EACCEPT, EACCEPTCOPY and EMODPE. + */ + secinfo_flags = SGX_SECINFO_R | SGX_SECINFO_W | SGX_SECINFO_X; + encl_page = sgx_encl_page_alloc(encl, addr - encl->base, secinfo_flags); + if (IS_ERR(encl_page)) + return VM_FAULT_OOM; + + mutex_lock(&encl->lock); + + epc_page = sgx_alloc_epc_page(encl_page, false); + if (IS_ERR(epc_page)) { + if (PTR_ERR(epc_page) == -EBUSY) + vmret = VM_FAULT_NOPAGE; + goto err_out_unlock; + } + + va_page = sgx_encl_grow(encl, false); + if (IS_ERR(va_page)) { + if (PTR_ERR(va_page) == -EBUSY) + vmret = VM_FAULT_NOPAGE; + goto err_out_epc; + } + + if (va_page) + list_add(&va_page->list, &encl->va_pages); + + ret = xa_insert(&encl->page_array, PFN_DOWN(encl_page->desc), + encl_page, GFP_KERNEL); + /* + * If ret == -EBUSY then page was created in another flow while + * running without encl->lock + */ + if (ret) + goto err_out_shrink; + + pginfo.secs = (unsigned long)sgx_get_epc_virt_addr(encl->secs.epc_page); + pginfo.addr = encl_page->desc & PAGE_MASK; + pginfo.metadata = 0; + + ret = __eaug(&pginfo, sgx_get_epc_virt_addr(epc_page)); + if (ret) + goto err_out; + + encl_page->encl = encl; + encl_page->epc_page = epc_page; + encl_page->type = SGX_PAGE_TYPE_REG; + encl->secs_child_cnt++; + + sgx_mark_page_reclaimable(encl_page->epc_page); + + phys_addr = sgx_get_epc_phys_addr(epc_page); + /* + * Do not undo everything when creating PTE entry fails - next #PF + * would find page ready for a PTE. + */ + vmret = vmf_insert_pfn(vma, addr, PFN_DOWN(phys_addr)); + if (vmret != VM_FAULT_NOPAGE) { + mutex_unlock(&encl->lock); + return VM_FAULT_SIGBUS; + } + mutex_unlock(&encl->lock); + return VM_FAULT_NOPAGE; + +err_out: + xa_erase(&encl->page_array, PFN_DOWN(encl_page->desc)); + +err_out_shrink: + sgx_encl_shrink(encl, va_page); +err_out_epc: + sgx_encl_free_epc_page(epc_page); +err_out_unlock: + mutex_unlock(&encl->lock); + kfree(encl_page); + + return vmret; +} + static vm_fault_t sgx_vma_fault(struct vm_fault *vmf) { unsigned long addr = (unsigned long)vmf->address; @@ -153,9 +426,20 @@ static vm_fault_t sgx_vma_fault(struct vm_fault *vmf) if (unlikely(!encl)) return VM_FAULT_SIGBUS; + /* + * The page_array keeps track of all enclave pages, whether they + * are swapped out or not. If there is no entry for this page and + * the system supports SGX2 then it is possible to dynamically add + * a new enclave page. This is only possible for an initialized + * enclave that will be checked for right away. + */ + if (cpu_feature_enabled(X86_FEATURE_SGX2) && + (!xa_load(&encl->page_array, PFN_DOWN(addr)))) + return sgx_encl_eaug_page(vma, encl, addr); + mutex_lock(&encl->lock); - entry = sgx_encl_load_page(encl, addr, vma->vm_flags); + entry = sgx_encl_load_page_in_vma(encl, addr, vma->vm_flags); if (IS_ERR(entry)) { mutex_unlock(&encl->lock); @@ -225,6 +509,11 @@ int sgx_encl_may_map(struct sgx_encl *encl, unsigned long start, XA_STATE(xas, &encl->page_array, PFN_DOWN(start)); + /* Disallow mapping outside enclave's address range. */ + if (test_bit(SGX_ENCL_INITIALIZED, &encl->flags) && + (start < encl->base || end > encl->base + encl->size)) + return -EACCES; + /* * Disallow READ_IMPLIES_EXEC tasks as their VMA permissions might * conflict with the enclave page permissions. @@ -303,7 +592,7 @@ static struct sgx_encl_page *sgx_encl_reserve_page(struct sgx_encl *encl, for ( ; ; ) { mutex_lock(&encl->lock); - entry = sgx_encl_load_page(encl, addr, vm_flags); + entry = sgx_encl_load_page_in_vma(encl, addr, vm_flags); if (PTR_ERR(entry) != -EBUSY) break; @@ -410,6 +699,8 @@ void sgx_encl_release(struct kref *ref) } kfree(entry); + /* Invoke scheduler to prevent soft lockups. */ + cond_resched(); } xa_destroy(&encl->page_array); @@ -543,7 +834,7 @@ int sgx_encl_mm_add(struct sgx_encl *encl, struct mm_struct *mm) spin_lock(&encl->mm_lock); list_add_rcu(&encl_mm->list, &encl->mm_list); - /* Pairs with smp_rmb() in sgx_reclaimer_block(). */ + /* Pairs with smp_rmb() in sgx_zap_enclave_ptes(). */ smp_wmb(); encl->mm_list_version++; spin_unlock(&encl->mm_lock); @@ -551,18 +842,84 @@ int sgx_encl_mm_add(struct sgx_encl *encl, struct mm_struct *mm) return 0; } +/** + * sgx_encl_cpumask() - Query which CPUs might be accessing the enclave + * @encl: the enclave + * + * Some SGX functions require that no cached linear-to-physical address + * mappings are present before they can succeed. For example, ENCLS[EWB] + * copies a page from the enclave page cache to regular main memory but + * it fails if it cannot ensure that there are no cached + * linear-to-physical address mappings referring to the page. + * + * SGX hardware flushes all cached linear-to-physical mappings on a CPU + * when an enclave is exited via ENCLU[EEXIT] or an Asynchronous Enclave + * Exit (AEX). Exiting an enclave will thus ensure cached linear-to-physical + * address mappings are cleared but coordination with the tracking done within + * the SGX hardware is needed to support the SGX functions that depend on this + * cache clearing. + * + * When the ENCLS[ETRACK] function is issued on an enclave the hardware + * tracks threads operating inside the enclave at that time. The SGX + * hardware tracking require that all the identified threads must have + * exited the enclave in order to flush the mappings before a function such + * as ENCLS[EWB] will be permitted + * + * The following flow is used to support SGX functions that require that + * no cached linear-to-physical address mappings are present: + * 1) Execute ENCLS[ETRACK] to initiate hardware tracking. + * 2) Use this function (sgx_encl_cpumask()) to query which CPUs might be + * accessing the enclave. + * 3) Send IPI to identified CPUs, kicking them out of the enclave and + * thus flushing all locally cached linear-to-physical address mappings. + * 4) Execute SGX function. + * + * Context: It is required to call this function after ENCLS[ETRACK]. + * This will ensure that if any new mm appears (racing with + * sgx_encl_mm_add()) then the new mm will enter into the + * enclave with fresh linear-to-physical address mappings. + * + * It is required that all IPIs are completed before a new + * ENCLS[ETRACK] is issued so be sure to protect steps 1 to 3 + * of the above flow with the enclave's mutex. + * + * Return: cpumask of CPUs that might be accessing @encl + */ +const cpumask_t *sgx_encl_cpumask(struct sgx_encl *encl) +{ + cpumask_t *cpumask = &encl->cpumask; + struct sgx_encl_mm *encl_mm; + int idx; + + cpumask_clear(cpumask); + + idx = srcu_read_lock(&encl->srcu); + + list_for_each_entry_rcu(encl_mm, &encl->mm_list, list) { + if (!mmget_not_zero(encl_mm->mm)) + continue; + + cpumask_or(cpumask, cpumask, mm_cpumask(encl_mm->mm)); + + mmput_async(encl_mm->mm); + } + + srcu_read_unlock(&encl->srcu, idx); + + return cpumask; +} + static struct page *sgx_encl_get_backing_page(struct sgx_encl *encl, pgoff_t index) { - struct inode *inode = encl->backing->f_path.dentry->d_inode; - struct address_space *mapping = inode->i_mapping; + struct address_space *mapping = encl->backing->f_mapping; gfp_t gfpmask = mapping_gfp_mask(mapping); return shmem_read_mapping_page_gfp(mapping, index, gfpmask); } /** - * sgx_encl_get_backing() - Pin the backing storage + * __sgx_encl_get_backing() - Pin the backing storage * @encl: an enclave pointer * @page_index: enclave page index * @backing: data for accessing backing storage for the page @@ -574,10 +931,10 @@ static struct page *sgx_encl_get_backing_page(struct sgx_encl *encl, * 0 on success, * -errno otherwise. */ -int sgx_encl_get_backing(struct sgx_encl *encl, unsigned long page_index, +static int __sgx_encl_get_backing(struct sgx_encl *encl, unsigned long page_index, struct sgx_backing *backing) { - pgoff_t pcmd_index = PFN_DOWN(encl->size) + 1 + (page_index >> 5); + pgoff_t page_pcmd_off = sgx_encl_get_backing_page_pcmd_offset(encl, page_index); struct page *contents; struct page *pcmd; @@ -585,34 +942,130 @@ int sgx_encl_get_backing(struct sgx_encl *encl, unsigned long page_index, if (IS_ERR(contents)) return PTR_ERR(contents); - pcmd = sgx_encl_get_backing_page(encl, pcmd_index); + pcmd = sgx_encl_get_backing_page(encl, PFN_DOWN(page_pcmd_off)); if (IS_ERR(pcmd)) { put_page(contents); return PTR_ERR(pcmd); } - backing->page_index = page_index; backing->contents = contents; backing->pcmd = pcmd; - backing->pcmd_offset = - (page_index & (PAGE_SIZE / sizeof(struct sgx_pcmd) - 1)) * - sizeof(struct sgx_pcmd); + backing->pcmd_offset = page_pcmd_off & (PAGE_SIZE - 1); return 0; } +/* + * When called from ksgxd, returns the mem_cgroup of a struct mm stored + * in the enclave's mm_list. When not called from ksgxd, just returns + * the mem_cgroup of the current task. + */ +static struct mem_cgroup *sgx_encl_get_mem_cgroup(struct sgx_encl *encl) +{ + struct mem_cgroup *memcg = NULL; + struct sgx_encl_mm *encl_mm; + int idx; + + /* + * If called from normal task context, return the mem_cgroup + * of the current task's mm. The remainder of the handling is for + * ksgxd. + */ + if (!current_is_ksgxd()) + return get_mem_cgroup_from_mm(current->mm); + + /* + * Search the enclave's mm_list to find an mm associated with + * this enclave to charge the allocation to. + */ + idx = srcu_read_lock(&encl->srcu); + + list_for_each_entry_rcu(encl_mm, &encl->mm_list, list) { + if (!mmget_not_zero(encl_mm->mm)) + continue; + + memcg = get_mem_cgroup_from_mm(encl_mm->mm); + + mmput_async(encl_mm->mm); + + break; + } + + srcu_read_unlock(&encl->srcu, idx); + + /* + * In the rare case that there isn't an mm associated with + * the enclave, set memcg to the current active mem_cgroup. + * This will be the root mem_cgroup if there is no active + * mem_cgroup. + */ + if (!memcg) + return get_mem_cgroup_from_mm(NULL); + + return memcg; +} + /** - * sgx_encl_put_backing() - Unpin the backing storage + * sgx_encl_alloc_backing() - create a new backing storage page + * @encl: an enclave pointer + * @page_index: enclave page index * @backing: data for accessing backing storage for the page - * @do_write: mark pages dirty + * + * When called from ksgxd, sets the active memcg from one of the + * mms in the enclave's mm_list prior to any backing page allocation, + * in order to ensure that shmem page allocations are charged to the + * enclave. Create a backing page for loading data back into an EPC page with + * ELDU. This function takes a reference on a new backing page which + * must be dropped with a corresponding call to sgx_encl_put_backing(). + * + * Return: + * 0 on success, + * -errno otherwise. */ -void sgx_encl_put_backing(struct sgx_backing *backing, bool do_write) +int sgx_encl_alloc_backing(struct sgx_encl *encl, unsigned long page_index, + struct sgx_backing *backing) { - if (do_write) { - set_page_dirty(backing->pcmd); - set_page_dirty(backing->contents); - } + struct mem_cgroup *encl_memcg = sgx_encl_get_mem_cgroup(encl); + struct mem_cgroup *memcg = set_active_memcg(encl_memcg); + int ret; + + ret = __sgx_encl_get_backing(encl, page_index, backing); + + set_active_memcg(memcg); + mem_cgroup_put(encl_memcg); + + return ret; +} + +/** + * sgx_encl_lookup_backing() - retrieve an existing backing storage page + * @encl: an enclave pointer + * @page_index: enclave page index + * @backing: data for accessing backing storage for the page + * + * Retrieve a backing page for loading data back into an EPC page with ELDU. + * It is the caller's responsibility to ensure that it is appropriate to use + * sgx_encl_lookup_backing() rather than sgx_encl_alloc_backing(). If lookup is + * not used correctly, this will cause an allocation which is not accounted for. + * This function takes a reference on an existing backing page which must be + * dropped with a corresponding call to sgx_encl_put_backing(). + * + * Return: + * 0 on success, + * -errno otherwise. + */ +static int sgx_encl_lookup_backing(struct sgx_encl *encl, unsigned long page_index, + struct sgx_backing *backing) +{ + return __sgx_encl_get_backing(encl, page_index, backing); +} +/** + * sgx_encl_put_backing() - Unpin the backing storage + * @backing: data for accessing backing storage for the page + */ +void sgx_encl_put_backing(struct sgx_backing *backing) +{ put_page(backing->pcmd); put_page(backing->contents); } @@ -665,8 +1118,85 @@ int sgx_encl_test_and_clear_young(struct mm_struct *mm, return ret; } +struct sgx_encl_page *sgx_encl_page_alloc(struct sgx_encl *encl, + unsigned long offset, + u64 secinfo_flags) +{ + struct sgx_encl_page *encl_page; + unsigned long prot; + + encl_page = kzalloc(sizeof(*encl_page), GFP_KERNEL); + if (!encl_page) + return ERR_PTR(-ENOMEM); + + encl_page->desc = encl->base + offset; + encl_page->encl = encl; + + prot = _calc_vm_trans(secinfo_flags, SGX_SECINFO_R, PROT_READ) | + _calc_vm_trans(secinfo_flags, SGX_SECINFO_W, PROT_WRITE) | + _calc_vm_trans(secinfo_flags, SGX_SECINFO_X, PROT_EXEC); + + /* + * TCS pages must always RW set for CPU access while the SECINFO + * permissions are *always* zero - the CPU ignores the user provided + * values and silently overwrites them with zero permissions. + */ + if ((secinfo_flags & SGX_SECINFO_PAGE_TYPE_MASK) == SGX_SECINFO_TCS) + prot |= PROT_READ | PROT_WRITE; + + /* Calculate maximum of the VM flags for the page. */ + encl_page->vm_max_prot_bits = calc_vm_prot_bits(prot, 0); + + return encl_page; +} + +/** + * sgx_zap_enclave_ptes() - remove PTEs mapping the address from enclave + * @encl: the enclave + * @addr: page aligned pointer to single page for which PTEs will be removed + * + * Multiple VMAs may have an enclave page mapped. Remove the PTE mapping + * @addr from each VMA. Ensure that page fault handler is ready to handle + * new mappings of @addr before calling this function. + */ +void sgx_zap_enclave_ptes(struct sgx_encl *encl, unsigned long addr) +{ + unsigned long mm_list_version; + struct sgx_encl_mm *encl_mm; + struct vm_area_struct *vma; + int idx, ret; + + do { + mm_list_version = encl->mm_list_version; + + /* Pairs with smp_wmb() in sgx_encl_mm_add(). */ + smp_rmb(); + + idx = srcu_read_lock(&encl->srcu); + + list_for_each_entry_rcu(encl_mm, &encl->mm_list, list) { + if (!mmget_not_zero(encl_mm->mm)) + continue; + + mmap_read_lock(encl_mm->mm); + + ret = sgx_encl_find(encl_mm->mm, addr, &vma); + if (!ret && encl == vma->vm_private_data) + zap_vma_ptes(vma, addr, PAGE_SIZE); + + mmap_read_unlock(encl_mm->mm); + + mmput_async(encl_mm->mm); + } + + srcu_read_unlock(&encl->srcu, idx); + } while (unlikely(encl->mm_list_version != mm_list_version)); +} + /** * sgx_alloc_va_page() - Allocate a Version Array (VA) page + * @reclaim: Reclaim EPC pages directly if none available. Enclave + * mutex should not be held if this is set. * * Allocate a free EPC page and convert it to a Version Array (VA) page. * @@ -674,12 +1204,12 @@ int sgx_encl_test_and_clear_young(struct mm_struct *mm, * a VA page, * -errno otherwise */ -struct sgx_epc_page *sgx_alloc_va_page(void) +struct sgx_epc_page *sgx_alloc_va_page(bool reclaim) { struct sgx_epc_page *epc_page; int ret; - epc_page = sgx_alloc_epc_page(NULL, true); + epc_page = sgx_alloc_epc_page(NULL, reclaim); if (IS_ERR(epc_page)) return ERR_CAST(epc_page); diff --git a/arch/x86/kernel/cpu/sgx/encl.h b/arch/x86/kernel/cpu/sgx/encl.h index fec43ca65065..f94ff14c9486 100644 --- a/arch/x86/kernel/cpu/sgx/encl.h +++ b/arch/x86/kernel/cpu/sgx/encl.h @@ -27,7 +27,8 @@ struct sgx_encl_page { unsigned long desc; - unsigned long vm_max_prot_bits; + unsigned long vm_max_prot_bits:8; + enum sgx_page_type type:16; struct sgx_epc_page *epc_page; struct sgx_encl *encl; struct sgx_va_page *va_page; @@ -78,7 +79,6 @@ struct sgx_va_page { }; struct sgx_backing { - pgoff_t page_index; struct page *contents; struct page *pcmd; unsigned long pcmd_offset; @@ -103,18 +103,27 @@ static inline int sgx_encl_find(struct mm_struct *mm, unsigned long addr, int sgx_encl_may_map(struct sgx_encl *encl, unsigned long start, unsigned long end, unsigned long vm_flags); +bool current_is_ksgxd(void); void sgx_encl_release(struct kref *ref); int sgx_encl_mm_add(struct sgx_encl *encl, struct mm_struct *mm); -int sgx_encl_get_backing(struct sgx_encl *encl, unsigned long page_index, - struct sgx_backing *backing); -void sgx_encl_put_backing(struct sgx_backing *backing, bool do_write); +const cpumask_t *sgx_encl_cpumask(struct sgx_encl *encl); +int sgx_encl_alloc_backing(struct sgx_encl *encl, unsigned long page_index, + struct sgx_backing *backing); +void sgx_encl_put_backing(struct sgx_backing *backing); int sgx_encl_test_and_clear_young(struct mm_struct *mm, struct sgx_encl_page *page); - -struct sgx_epc_page *sgx_alloc_va_page(void); +struct sgx_encl_page *sgx_encl_page_alloc(struct sgx_encl *encl, + unsigned long offset, + u64 secinfo_flags); +void sgx_zap_enclave_ptes(struct sgx_encl *encl, unsigned long addr); +struct sgx_epc_page *sgx_alloc_va_page(bool reclaim); unsigned int sgx_alloc_va_slot(struct sgx_va_page *va_page); void sgx_free_va_slot(struct sgx_va_page *va_page, unsigned int offset); bool sgx_va_page_full(struct sgx_va_page *va_page); void sgx_encl_free_epc_page(struct sgx_epc_page *page); +struct sgx_encl_page *sgx_encl_load_page(struct sgx_encl *encl, + unsigned long addr); +struct sgx_va_page *sgx_encl_grow(struct sgx_encl *encl, bool reclaim); +void sgx_encl_shrink(struct sgx_encl *encl, struct sgx_va_page *va_page); #endif /* _X86_ENCL_H */ diff --git a/arch/x86/kernel/cpu/sgx/encls.h b/arch/x86/kernel/cpu/sgx/encls.h index 9b204843b78d..99004b02e2ed 100644 --- a/arch/x86/kernel/cpu/sgx/encls.h +++ b/arch/x86/kernel/cpu/sgx/encls.h @@ -11,26 +11,8 @@ #include <asm/traps.h> #include "sgx.h" -/** - * ENCLS_FAULT_FLAG - flag signifying an ENCLS return code is a trapnr - * - * ENCLS has its own (positive value) error codes and also generates - * ENCLS specific #GP and #PF faults. And the ENCLS values get munged - * with system error codes as everything percolates back up the stack. - * Unfortunately (for us), we need to precisely identify each unique - * error code, e.g. the action taken if EWB fails varies based on the - * type of fault and on the exact SGX error code, i.e. we can't simply - * convert all faults to -EFAULT. - * - * To make all three error types coexist, we set bit 30 to identify an - * ENCLS fault. Bit 31 (technically bits N:31) is used to differentiate - * between positive (faults and SGX error codes) and negative (system - * error codes) values. - */ -#define ENCLS_FAULT_FLAG 0x40000000 - /* Retrieve the encoded trapnr from the specified return code. */ -#define ENCLS_TRAPNR(r) ((r) & ~ENCLS_FAULT_FLAG) +#define ENCLS_TRAPNR(r) ((r) & ~SGX_ENCLS_FAULT_FLAG) /* Issue a WARN() about an ENCLS function. */ #define ENCLS_WARN(r, name) { \ @@ -50,7 +32,7 @@ */ static inline bool encls_faulted(int ret) { - return ret & ENCLS_FAULT_FLAG; + return ret & SGX_ENCLS_FAULT_FLAG; } /** @@ -88,11 +70,7 @@ static inline bool encls_failed(int ret) asm volatile( \ "1: .byte 0x0f, 0x01, 0xcf;\n\t" \ "2:\n" \ - ".section .fixup,\"ax\"\n" \ - "3: orl $"__stringify(ENCLS_FAULT_FLAG)",%%eax\n" \ - " jmp 2b\n" \ - ".previous\n" \ - _ASM_EXTABLE_FAULT(1b, 3b) \ + _ASM_EXTABLE_TYPE(1b, 2b, EX_TYPE_FAULT_SGX) \ : "=a"(ret) \ : "a"(rax), inputs \ : "memory", "cc"); \ @@ -127,7 +105,7 @@ static inline bool encls_failed(int ret) * * Return: * 0 on success, - * trapnr with ENCLS_FAULT_FLAG set on fault + * trapnr with SGX_ENCLS_FAULT_FLAG set on fault */ #define __encls_N(rax, rbx_out, inputs...) \ ({ \ @@ -136,11 +114,7 @@ static inline bool encls_failed(int ret) "1: .byte 0x0f, 0x01, 0xcf;\n\t" \ " xor %%eax,%%eax;\n" \ "2:\n" \ - ".section .fixup,\"ax\"\n" \ - "3: orl $"__stringify(ENCLS_FAULT_FLAG)",%%eax\n" \ - " jmp 2b\n" \ - ".previous\n" \ - _ASM_EXTABLE_FAULT(1b, 3b) \ + _ASM_EXTABLE_TYPE(1b, 2b, EX_TYPE_FAULT_SGX) \ : "=a"(ret), "=b"(rbx_out) \ : "a"(rax), inputs \ : "memory"); \ @@ -162,57 +136,71 @@ static inline bool encls_failed(int ret) ret; \ }) +/* Initialize an EPC page into an SGX Enclave Control Structure (SECS) page. */ static inline int __ecreate(struct sgx_pageinfo *pginfo, void *secs) { return __encls_2(ECREATE, pginfo, secs); } +/* Hash a 256 byte region of an enclave page to SECS:MRENCLAVE. */ static inline int __eextend(void *secs, void *addr) { return __encls_2(EEXTEND, secs, addr); } +/* + * Associate an EPC page to an enclave either as a REG or TCS page + * populated with the provided data. + */ static inline int __eadd(struct sgx_pageinfo *pginfo, void *addr) { return __encls_2(EADD, pginfo, addr); } +/* Finalize enclave build, initialize enclave for user code execution. */ static inline int __einit(void *sigstruct, void *token, void *secs) { return __encls_ret_3(EINIT, sigstruct, secs, token); } +/* Disassociate EPC page from its enclave and mark it as unused. */ static inline int __eremove(void *addr) { return __encls_ret_1(EREMOVE, addr); } +/* Copy data to an EPC page belonging to a debug enclave. */ static inline int __edbgwr(void *addr, unsigned long *data) { return __encls_2(EDGBWR, *data, addr); } +/* Copy data from an EPC page belonging to a debug enclave. */ static inline int __edbgrd(void *addr, unsigned long *data) { return __encls_1_1(EDGBRD, *data, addr); } +/* Track that software has completed the required TLB address clears. */ static inline int __etrack(void *addr) { return __encls_ret_1(ETRACK, addr); } +/* Load, verify, and unblock an EPC page. */ static inline int __eldu(struct sgx_pageinfo *pginfo, void *addr, void *va) { return __encls_ret_3(ELDU, pginfo, addr, va); } +/* Make EPC page inaccessible to enclave, ready to be written to memory. */ static inline int __eblock(void *addr) { return __encls_ret_1(EBLOCK, addr); } +/* Initialize an EPC page into a Version Array (VA) page. */ static inline int __epa(void *addr) { unsigned long rbx = SGX_PAGE_TYPE_VA; @@ -220,10 +208,29 @@ static inline int __epa(void *addr) return __encls_2(EPA, rbx, addr); } +/* Invalidate an EPC page and write it out to main memory. */ static inline int __ewb(struct sgx_pageinfo *pginfo, void *addr, void *va) { return __encls_ret_3(EWB, pginfo, addr, va); } +/* Restrict the EPCM permissions of an EPC page. */ +static inline int __emodpr(struct sgx_secinfo *secinfo, void *addr) +{ + return __encls_ret_2(EMODPR, secinfo, addr); +} + +/* Change the type of an EPC page. */ +static inline int __emodt(struct sgx_secinfo *secinfo, void *addr) +{ + return __encls_ret_2(EMODT, secinfo, addr); +} + +/* Zero a page of EPC memory and add it to an initialized enclave. */ +static inline int __eaug(struct sgx_pageinfo *pginfo, void *addr) +{ + return __encls_2(EAUG, pginfo, addr); +} + #endif /* _X86_ENCLS_H */ diff --git a/arch/x86/kernel/cpu/sgx/ioctl.c b/arch/x86/kernel/cpu/sgx/ioctl.c index 83df20e3e633..ebe79d60619f 100644 --- a/arch/x86/kernel/cpu/sgx/ioctl.c +++ b/arch/x86/kernel/cpu/sgx/ioctl.c @@ -17,7 +17,7 @@ #include "encl.h" #include "encls.h" -static struct sgx_va_page *sgx_encl_grow(struct sgx_encl *encl) +struct sgx_va_page *sgx_encl_grow(struct sgx_encl *encl, bool reclaim) { struct sgx_va_page *va_page = NULL; void *err; @@ -30,7 +30,7 @@ static struct sgx_va_page *sgx_encl_grow(struct sgx_encl *encl) if (!va_page) return ERR_PTR(-ENOMEM); - va_page->epc_page = sgx_alloc_va_page(); + va_page->epc_page = sgx_alloc_va_page(reclaim); if (IS_ERR(va_page->epc_page)) { err = ERR_CAST(va_page->epc_page); kfree(va_page); @@ -43,7 +43,7 @@ static struct sgx_va_page *sgx_encl_grow(struct sgx_encl *encl) return va_page; } -static void sgx_encl_shrink(struct sgx_encl *encl, struct sgx_va_page *va_page) +void sgx_encl_shrink(struct sgx_encl *encl, struct sgx_va_page *va_page) { encl->page_cnt--; @@ -64,7 +64,7 @@ static int sgx_encl_create(struct sgx_encl *encl, struct sgx_secs *secs) struct file *backing; long ret; - va_page = sgx_encl_grow(encl); + va_page = sgx_encl_grow(encl, true); if (IS_ERR(va_page)) return PTR_ERR(va_page); else if (va_page) @@ -107,6 +107,7 @@ static int sgx_encl_create(struct sgx_encl *encl, struct sgx_secs *secs) set_bit(SGX_ENCL_DEBUG, &encl->flags); encl->secs.encl = encl; + encl->secs.type = SGX_PAGE_TYPE_SECS; encl->base = secs->base; encl->size = secs->size; encl->attributes = secs->attributes; @@ -168,38 +169,6 @@ static long sgx_ioc_enclave_create(struct sgx_encl *encl, void __user *arg) return ret; } -static struct sgx_encl_page *sgx_encl_page_alloc(struct sgx_encl *encl, - unsigned long offset, - u64 secinfo_flags) -{ - struct sgx_encl_page *encl_page; - unsigned long prot; - - encl_page = kzalloc(sizeof(*encl_page), GFP_KERNEL); - if (!encl_page) - return ERR_PTR(-ENOMEM); - - encl_page->desc = encl->base + offset; - encl_page->encl = encl; - - prot = _calc_vm_trans(secinfo_flags, SGX_SECINFO_R, PROT_READ) | - _calc_vm_trans(secinfo_flags, SGX_SECINFO_W, PROT_WRITE) | - _calc_vm_trans(secinfo_flags, SGX_SECINFO_X, PROT_EXEC); - - /* - * TCS pages must always RW set for CPU access while the SECINFO - * permissions are *always* zero - the CPU ignores the user provided - * values and silently overwrites them with zero permissions. - */ - if ((secinfo_flags & SGX_SECINFO_PAGE_TYPE_MASK) == SGX_SECINFO_TCS) - prot |= PROT_READ | PROT_WRITE; - - /* Calculate maximum of the VM flags for the page. */ - encl_page->vm_max_prot_bits = calc_vm_prot_bits(prot, 0); - - return encl_page; -} - static int sgx_validate_secinfo(struct sgx_secinfo *secinfo) { u64 perm = secinfo->flags & SGX_SECINFO_PERMISSION_MASK; @@ -306,7 +275,7 @@ static int sgx_encl_add_page(struct sgx_encl *encl, unsigned long src, return PTR_ERR(epc_page); } - va_page = sgx_encl_grow(encl); + va_page = sgx_encl_grow(encl, true); if (IS_ERR(va_page)) { ret = PTR_ERR(va_page); goto err_out_free; @@ -344,6 +313,7 @@ static int sgx_encl_add_page(struct sgx_encl *encl, unsigned long src, */ encl_page->encl = encl; encl_page->epc_page = epc_page; + encl_page->type = (secinfo->flags & SGX_SECINFO_PAGE_TYPE_MASK) >> 8; encl->secs_child_cnt++; if (flags & SGX_PAGE_MEASURE) { @@ -372,6 +342,26 @@ err_out_free: return ret; } +/* + * Ensure user provided offset and length values are valid for + * an enclave. + */ +static int sgx_validate_offset_length(struct sgx_encl *encl, + unsigned long offset, + unsigned long length) +{ + if (!IS_ALIGNED(offset, PAGE_SIZE)) + return -EINVAL; + + if (!length || !IS_ALIGNED(length, PAGE_SIZE)) + return -EINVAL; + + if (offset + length - PAGE_SIZE >= encl->size) + return -EINVAL; + + return 0; +} + /** * sgx_ioc_enclave_add_pages() - The handler for %SGX_IOC_ENCLAVE_ADD_PAGES * @encl: an enclave pointer @@ -425,14 +415,10 @@ static long sgx_ioc_enclave_add_pages(struct sgx_encl *encl, void __user *arg) if (copy_from_user(&add_arg, arg, sizeof(add_arg))) return -EFAULT; - if (!IS_ALIGNED(add_arg.offset, PAGE_SIZE) || - !IS_ALIGNED(add_arg.src, PAGE_SIZE)) - return -EINVAL; - - if (!add_arg.length || add_arg.length & (PAGE_SIZE - 1)) + if (!IS_ALIGNED(add_arg.src, PAGE_SIZE)) return -EINVAL; - if (add_arg.offset + add_arg.length - PAGE_SIZE >= encl->size) + if (sgx_validate_offset_length(encl, add_arg.offset, add_arg.length)) return -EINVAL; if (copy_from_user(&secinfo, (void __user *)add_arg.secinfo, @@ -674,6 +660,565 @@ static long sgx_ioc_enclave_provision(struct sgx_encl *encl, void __user *arg) return sgx_set_attribute(&encl->attributes_mask, params.fd); } +/* + * Ensure enclave is ready for SGX2 functions. Readiness is checked + * by ensuring the hardware supports SGX2 and the enclave is initialized + * and thus able to handle requests to modify pages within it. + */ +static int sgx_ioc_sgx2_ready(struct sgx_encl *encl) +{ + if (!(cpu_feature_enabled(X86_FEATURE_SGX2))) + return -ENODEV; + + if (!test_bit(SGX_ENCL_INITIALIZED, &encl->flags)) + return -EINVAL; + + return 0; +} + +/* + * Some SGX functions require that no cached linear-to-physical address + * mappings are present before they can succeed. Collaborate with + * hardware via ENCLS[ETRACK] to ensure that all cached + * linear-to-physical address mappings belonging to all threads of + * the enclave are cleared. See sgx_encl_cpumask() for details. + * + * Must be called with enclave's mutex held from the time the + * SGX function requiring that no cached linear-to-physical mappings + * are present is executed until this ETRACK flow is complete. + */ +static int sgx_enclave_etrack(struct sgx_encl *encl) +{ + void *epc_virt; + int ret; + + epc_virt = sgx_get_epc_virt_addr(encl->secs.epc_page); + ret = __etrack(epc_virt); + if (ret) { + /* + * ETRACK only fails when there is an OS issue. For + * example, two consecutive ETRACK was sent without + * completed IPI between. + */ + pr_err_once("ETRACK returned %d (0x%x)", ret, ret); + /* + * Send IPIs to kick CPUs out of the enclave and + * try ETRACK again. + */ + on_each_cpu_mask(sgx_encl_cpumask(encl), sgx_ipi_cb, NULL, 1); + ret = __etrack(epc_virt); + if (ret) { + pr_err_once("ETRACK repeat returned %d (0x%x)", + ret, ret); + return -EFAULT; + } + } + on_each_cpu_mask(sgx_encl_cpumask(encl), sgx_ipi_cb, NULL, 1); + + return 0; +} + +/** + * sgx_enclave_restrict_permissions() - Restrict EPCM permissions + * @encl: Enclave to which the pages belong. + * @modp: Checked parameters from user on which pages need modifying and + * their new permissions. + * + * Return: + * - 0: Success. + * - -errno: Otherwise. + */ +static long +sgx_enclave_restrict_permissions(struct sgx_encl *encl, + struct sgx_enclave_restrict_permissions *modp) +{ + struct sgx_encl_page *entry; + struct sgx_secinfo secinfo; + unsigned long addr; + unsigned long c; + void *epc_virt; + int ret; + + memset(&secinfo, 0, sizeof(secinfo)); + secinfo.flags = modp->permissions & SGX_SECINFO_PERMISSION_MASK; + + for (c = 0 ; c < modp->length; c += PAGE_SIZE) { + addr = encl->base + modp->offset + c; + + sgx_reclaim_direct(); + + mutex_lock(&encl->lock); + + entry = sgx_encl_load_page(encl, addr); + if (IS_ERR(entry)) { + ret = PTR_ERR(entry) == -EBUSY ? -EAGAIN : -EFAULT; + goto out_unlock; + } + + /* + * Changing EPCM permissions is only supported on regular + * SGX pages. Attempting this change on other pages will + * result in #PF. + */ + if (entry->type != SGX_PAGE_TYPE_REG) { + ret = -EINVAL; + goto out_unlock; + } + + /* + * Apart from ensuring that read-access remains, do not verify + * the permission bits requested. Kernel has no control over + * how EPCM permissions can be relaxed from within the enclave. + * ENCLS[EMODPR] can only remove existing EPCM permissions, + * attempting to set new permissions will be ignored by the + * hardware. + */ + + /* Change EPCM permissions. */ + epc_virt = sgx_get_epc_virt_addr(entry->epc_page); + ret = __emodpr(&secinfo, epc_virt); + if (encls_faulted(ret)) { + /* + * All possible faults should be avoidable: + * parameters have been checked, will only change + * permissions of a regular page, and no concurrent + * SGX1/SGX2 ENCLS instructions since these + * are protected with mutex. + */ + pr_err_once("EMODPR encountered exception %d\n", + ENCLS_TRAPNR(ret)); + ret = -EFAULT; + goto out_unlock; + } + if (encls_failed(ret)) { + modp->result = ret; + ret = -EFAULT; + goto out_unlock; + } + + ret = sgx_enclave_etrack(encl); + if (ret) { + ret = -EFAULT; + goto out_unlock; + } + + mutex_unlock(&encl->lock); + } + + ret = 0; + goto out; + +out_unlock: + mutex_unlock(&encl->lock); +out: + modp->count = c; + + return ret; +} + +/** + * sgx_ioc_enclave_restrict_permissions() - handler for + * %SGX_IOC_ENCLAVE_RESTRICT_PERMISSIONS + * @encl: an enclave pointer + * @arg: userspace pointer to a &struct sgx_enclave_restrict_permissions + * instance + * + * SGX2 distinguishes between relaxing and restricting the enclave page + * permissions maintained by the hardware (EPCM permissions) of pages + * belonging to an initialized enclave (after SGX_IOC_ENCLAVE_INIT). + * + * EPCM permissions cannot be restricted from within the enclave, the enclave + * requires the kernel to run the privileged level 0 instructions ENCLS[EMODPR] + * and ENCLS[ETRACK]. An attempt to relax EPCM permissions with this call + * will be ignored by the hardware. + * + * Return: + * - 0: Success + * - -errno: Otherwise + */ +static long sgx_ioc_enclave_restrict_permissions(struct sgx_encl *encl, + void __user *arg) +{ + struct sgx_enclave_restrict_permissions params; + long ret; + + ret = sgx_ioc_sgx2_ready(encl); + if (ret) + return ret; + + if (copy_from_user(¶ms, arg, sizeof(params))) + return -EFAULT; + + if (sgx_validate_offset_length(encl, params.offset, params.length)) + return -EINVAL; + + if (params.permissions & ~SGX_SECINFO_PERMISSION_MASK) + return -EINVAL; + + /* + * Fail early if invalid permissions requested to prevent ENCLS[EMODPR] + * from faulting later when the CPU does the same check. + */ + if ((params.permissions & SGX_SECINFO_W) && + !(params.permissions & SGX_SECINFO_R)) + return -EINVAL; + + if (params.result || params.count) + return -EINVAL; + + ret = sgx_enclave_restrict_permissions(encl, ¶ms); + + if (copy_to_user(arg, ¶ms, sizeof(params))) + return -EFAULT; + + return ret; +} + +/** + * sgx_enclave_modify_types() - Modify type of SGX enclave pages + * @encl: Enclave to which the pages belong. + * @modt: Checked parameters from user about which pages need modifying + * and their new page type. + * + * Return: + * - 0: Success + * - -errno: Otherwise + */ +static long sgx_enclave_modify_types(struct sgx_encl *encl, + struct sgx_enclave_modify_types *modt) +{ + unsigned long max_prot_restore; + enum sgx_page_type page_type; + struct sgx_encl_page *entry; + struct sgx_secinfo secinfo; + unsigned long prot; + unsigned long addr; + unsigned long c; + void *epc_virt; + int ret; + + page_type = modt->page_type & SGX_PAGE_TYPE_MASK; + + /* + * The only new page types allowed by hardware are PT_TCS and PT_TRIM. + */ + if (page_type != SGX_PAGE_TYPE_TCS && page_type != SGX_PAGE_TYPE_TRIM) + return -EINVAL; + + memset(&secinfo, 0, sizeof(secinfo)); + + secinfo.flags = page_type << 8; + + for (c = 0 ; c < modt->length; c += PAGE_SIZE) { + addr = encl->base + modt->offset + c; + + sgx_reclaim_direct(); + + mutex_lock(&encl->lock); + + entry = sgx_encl_load_page(encl, addr); + if (IS_ERR(entry)) { + ret = PTR_ERR(entry) == -EBUSY ? -EAGAIN : -EFAULT; + goto out_unlock; + } + + /* + * Borrow the logic from the Intel SDM. Regular pages + * (SGX_PAGE_TYPE_REG) can change type to SGX_PAGE_TYPE_TCS + * or SGX_PAGE_TYPE_TRIM but TCS pages can only be trimmed. + * CET pages not supported yet. + */ + if (!(entry->type == SGX_PAGE_TYPE_REG || + (entry->type == SGX_PAGE_TYPE_TCS && + page_type == SGX_PAGE_TYPE_TRIM))) { + ret = -EINVAL; + goto out_unlock; + } + + max_prot_restore = entry->vm_max_prot_bits; + + /* + * Once a regular page becomes a TCS page it cannot be + * changed back. So the maximum allowed protection reflects + * the TCS page that is always RW from kernel perspective but + * will be inaccessible from within enclave. Before doing + * so, do make sure that the new page type continues to + * respect the originally vetted page permissions. + */ + if (entry->type == SGX_PAGE_TYPE_REG && + page_type == SGX_PAGE_TYPE_TCS) { + if (~entry->vm_max_prot_bits & (VM_READ | VM_WRITE)) { + ret = -EPERM; + goto out_unlock; + } + prot = PROT_READ | PROT_WRITE; + entry->vm_max_prot_bits = calc_vm_prot_bits(prot, 0); + + /* + * Prevent page from being reclaimed while mutex + * is released. + */ + if (sgx_unmark_page_reclaimable(entry->epc_page)) { + ret = -EAGAIN; + goto out_entry_changed; + } + + /* + * Do not keep encl->lock because of dependency on + * mmap_lock acquired in sgx_zap_enclave_ptes(). + */ + mutex_unlock(&encl->lock); + + sgx_zap_enclave_ptes(encl, addr); + + mutex_lock(&encl->lock); + + sgx_mark_page_reclaimable(entry->epc_page); + } + + /* Change EPC type */ + epc_virt = sgx_get_epc_virt_addr(entry->epc_page); + ret = __emodt(&secinfo, epc_virt); + if (encls_faulted(ret)) { + /* + * All possible faults should be avoidable: + * parameters have been checked, will only change + * valid page types, and no concurrent + * SGX1/SGX2 ENCLS instructions since these are + * protected with mutex. + */ + pr_err_once("EMODT encountered exception %d\n", + ENCLS_TRAPNR(ret)); + ret = -EFAULT; + goto out_entry_changed; + } + if (encls_failed(ret)) { + modt->result = ret; + ret = -EFAULT; + goto out_entry_changed; + } + + ret = sgx_enclave_etrack(encl); + if (ret) { + ret = -EFAULT; + goto out_unlock; + } + + entry->type = page_type; + + mutex_unlock(&encl->lock); + } + + ret = 0; + goto out; + +out_entry_changed: + entry->vm_max_prot_bits = max_prot_restore; +out_unlock: + mutex_unlock(&encl->lock); +out: + modt->count = c; + + return ret; +} + +/** + * sgx_ioc_enclave_modify_types() - handler for %SGX_IOC_ENCLAVE_MODIFY_TYPES + * @encl: an enclave pointer + * @arg: userspace pointer to a &struct sgx_enclave_modify_types instance + * + * Ability to change the enclave page type supports the following use cases: + * + * * It is possible to add TCS pages to an enclave by changing the type of + * regular pages (%SGX_PAGE_TYPE_REG) to TCS (%SGX_PAGE_TYPE_TCS) pages. + * With this support the number of threads supported by an initialized + * enclave can be increased dynamically. + * + * * Regular or TCS pages can dynamically be removed from an initialized + * enclave by changing the page type to %SGX_PAGE_TYPE_TRIM. Changing the + * page type to %SGX_PAGE_TYPE_TRIM marks the page for removal with actual + * removal done by handler of %SGX_IOC_ENCLAVE_REMOVE_PAGES ioctl() called + * after ENCLU[EACCEPT] is run on %SGX_PAGE_TYPE_TRIM page from within the + * enclave. + * + * Return: + * - 0: Success + * - -errno: Otherwise + */ +static long sgx_ioc_enclave_modify_types(struct sgx_encl *encl, + void __user *arg) +{ + struct sgx_enclave_modify_types params; + long ret; + + ret = sgx_ioc_sgx2_ready(encl); + if (ret) + return ret; + + if (copy_from_user(¶ms, arg, sizeof(params))) + return -EFAULT; + + if (sgx_validate_offset_length(encl, params.offset, params.length)) + return -EINVAL; + + if (params.page_type & ~SGX_PAGE_TYPE_MASK) + return -EINVAL; + + if (params.result || params.count) + return -EINVAL; + + ret = sgx_enclave_modify_types(encl, ¶ms); + + if (copy_to_user(arg, ¶ms, sizeof(params))) + return -EFAULT; + + return ret; +} + +/** + * sgx_encl_remove_pages() - Remove trimmed pages from SGX enclave + * @encl: Enclave to which the pages belong + * @params: Checked parameters from user on which pages need to be removed + * + * Return: + * - 0: Success. + * - -errno: Otherwise. + */ +static long sgx_encl_remove_pages(struct sgx_encl *encl, + struct sgx_enclave_remove_pages *params) +{ + struct sgx_encl_page *entry; + struct sgx_secinfo secinfo; + unsigned long addr; + unsigned long c; + void *epc_virt; + int ret; + + memset(&secinfo, 0, sizeof(secinfo)); + secinfo.flags = SGX_SECINFO_R | SGX_SECINFO_W | SGX_SECINFO_X; + + for (c = 0 ; c < params->length; c += PAGE_SIZE) { + addr = encl->base + params->offset + c; + + sgx_reclaim_direct(); + + mutex_lock(&encl->lock); + + entry = sgx_encl_load_page(encl, addr); + if (IS_ERR(entry)) { + ret = PTR_ERR(entry) == -EBUSY ? -EAGAIN : -EFAULT; + goto out_unlock; + } + + if (entry->type != SGX_PAGE_TYPE_TRIM) { + ret = -EPERM; + goto out_unlock; + } + + /* + * ENCLS[EMODPR] is a no-op instruction used to inform if + * ENCLU[EACCEPT] was run from within the enclave. If + * ENCLS[EMODPR] is run with RWX on a trimmed page that is + * not yet accepted then it will return + * %SGX_PAGE_NOT_MODIFIABLE, after the trimmed page is + * accepted the instruction will encounter a page fault. + */ + epc_virt = sgx_get_epc_virt_addr(entry->epc_page); + ret = __emodpr(&secinfo, epc_virt); + if (!encls_faulted(ret) || ENCLS_TRAPNR(ret) != X86_TRAP_PF) { + ret = -EPERM; + goto out_unlock; + } + + if (sgx_unmark_page_reclaimable(entry->epc_page)) { + ret = -EBUSY; + goto out_unlock; + } + + /* + * Do not keep encl->lock because of dependency on + * mmap_lock acquired in sgx_zap_enclave_ptes(). + */ + mutex_unlock(&encl->lock); + + sgx_zap_enclave_ptes(encl, addr); + + mutex_lock(&encl->lock); + + sgx_encl_free_epc_page(entry->epc_page); + encl->secs_child_cnt--; + entry->epc_page = NULL; + xa_erase(&encl->page_array, PFN_DOWN(entry->desc)); + sgx_encl_shrink(encl, NULL); + kfree(entry); + + mutex_unlock(&encl->lock); + } + + ret = 0; + goto out; + +out_unlock: + mutex_unlock(&encl->lock); +out: + params->count = c; + + return ret; +} + +/** + * sgx_ioc_enclave_remove_pages() - handler for %SGX_IOC_ENCLAVE_REMOVE_PAGES + * @encl: an enclave pointer + * @arg: userspace pointer to &struct sgx_enclave_remove_pages instance + * + * Final step of the flow removing pages from an initialized enclave. The + * complete flow is: + * + * 1) User changes the type of the pages to be removed to %SGX_PAGE_TYPE_TRIM + * using the %SGX_IOC_ENCLAVE_MODIFY_TYPES ioctl(). + * 2) User approves the page removal by running ENCLU[EACCEPT] from within + * the enclave. + * 3) User initiates actual page removal using the + * %SGX_IOC_ENCLAVE_REMOVE_PAGES ioctl() that is handled here. + * + * First remove any page table entries pointing to the page and then proceed + * with the actual removal of the enclave page and data in support of it. + * + * VA pages are not affected by this removal. It is thus possible that the + * enclave may end up with more VA pages than needed to support all its + * pages. + * + * Return: + * - 0: Success + * - -errno: Otherwise + */ +static long sgx_ioc_enclave_remove_pages(struct sgx_encl *encl, + void __user *arg) +{ + struct sgx_enclave_remove_pages params; + long ret; + + ret = sgx_ioc_sgx2_ready(encl); + if (ret) + return ret; + + if (copy_from_user(¶ms, arg, sizeof(params))) + return -EFAULT; + + if (sgx_validate_offset_length(encl, params.offset, params.length)) + return -EINVAL; + + if (params.count) + return -EINVAL; + + ret = sgx_encl_remove_pages(encl, ¶ms); + + if (copy_to_user(arg, ¶ms, sizeof(params))) + return -EFAULT; + + return ret; +} + long sgx_ioctl(struct file *filep, unsigned int cmd, unsigned long arg) { struct sgx_encl *encl = filep->private_data; @@ -695,6 +1240,16 @@ long sgx_ioctl(struct file *filep, unsigned int cmd, unsigned long arg) case SGX_IOC_ENCLAVE_PROVISION: ret = sgx_ioc_enclave_provision(encl, (void __user *)arg); break; + case SGX_IOC_ENCLAVE_RESTRICT_PERMISSIONS: + ret = sgx_ioc_enclave_restrict_permissions(encl, + (void __user *)arg); + break; + case SGX_IOC_ENCLAVE_MODIFY_TYPES: + ret = sgx_ioc_enclave_modify_types(encl, (void __user *)arg); + break; + case SGX_IOC_ENCLAVE_REMOVE_PAGES: + ret = sgx_ioc_enclave_remove_pages(encl, (void __user *)arg); + break; default: ret = -ENOIOCTLCMD; break; diff --git a/arch/x86/kernel/cpu/sgx/main.c b/arch/x86/kernel/cpu/sgx/main.c index 8471a8b9b48e..0aad028f04d4 100644 --- a/arch/x86/kernel/cpu/sgx/main.c +++ b/arch/x86/kernel/cpu/sgx/main.c @@ -6,11 +6,13 @@ #include <linux/highmem.h> #include <linux/kthread.h> #include <linux/miscdevice.h> +#include <linux/node.h> #include <linux/pagemap.h> #include <linux/ratelimit.h> #include <linux/sched/mm.h> #include <linux/sched/signal.h> #include <linux/slab.h> +#include <linux/sysfs.h> #include <asm/sgx.h> #include "driver.h" #include "encl.h" @@ -20,6 +22,7 @@ struct sgx_epc_section sgx_epc_sections[SGX_MAX_EPC_SECTIONS]; static int sgx_nr_epc_sections; static struct task_struct *ksgxd_tsk; static DECLARE_WAIT_QUEUE_HEAD(ksgxd_waitq); +static DEFINE_XARRAY(sgx_epc_address_space); /* * These variables are part of the state of the reclaimer, and must be accessed @@ -46,9 +49,13 @@ static LIST_HEAD(sgx_dirty_page_list); * Reset post-kexec EPC pages to the uninitialized state. The pages are removed * from the input list, and made available for the page allocator. SECS pages * prepending their children in the input list are left intact. + * + * Return 0 when sanitization was successful or kthread was stopped, and the + * number of unsanitized pages otherwise. */ -static void __sgx_sanitize_pages(struct list_head *dirty_page_list) +static unsigned long __sgx_sanitize_pages(struct list_head *dirty_page_list) { + unsigned long left_dirty = 0; struct sgx_epc_page *page; LIST_HEAD(dirty); int ret; @@ -56,10 +63,28 @@ static void __sgx_sanitize_pages(struct list_head *dirty_page_list) /* dirty_page_list is thread-local, no need for a lock: */ while (!list_empty(dirty_page_list)) { if (kthread_should_stop()) - return; + return 0; page = list_first_entry(dirty_page_list, struct sgx_epc_page, list); + /* + * Checking page->poison without holding the node->lock + * is racy, but losing the race (i.e. poison is set just + * after the check) just means __eremove() will be uselessly + * called for a page that sgx_free_epc_page() will put onto + * the node->sgx_poison_page_list later. + */ + if (page->poison) { + struct sgx_epc_section *section = &sgx_epc_sections[page->section]; + struct sgx_numa_node *node = section->node; + + spin_lock(&node->lock); + list_move(&page->list, &node->sgx_poison_page_list); + spin_unlock(&node->lock); + + continue; + } + ret = __eremove(sgx_get_epc_virt_addr(page)); if (!ret) { /* @@ -71,12 +96,14 @@ static void __sgx_sanitize_pages(struct list_head *dirty_page_list) } else { /* The page is not yet clean - move to the dirty list. */ list_move_tail(&page->list, &dirty); + left_dirty++; } cond_resched(); } list_splice(&dirty, dirty_page_list); + return left_dirty; } static bool sgx_reclaimer_age(struct sgx_epc_page *epc_page) @@ -116,36 +143,9 @@ static void sgx_reclaimer_block(struct sgx_epc_page *epc_page) struct sgx_encl_page *page = epc_page->owner; unsigned long addr = page->desc & PAGE_MASK; struct sgx_encl *encl = page->encl; - unsigned long mm_list_version; - struct sgx_encl_mm *encl_mm; - struct vm_area_struct *vma; - int idx, ret; - - do { - mm_list_version = encl->mm_list_version; - - /* Pairs with smp_rmb() in sgx_encl_mm_add(). */ - smp_rmb(); - - idx = srcu_read_lock(&encl->srcu); - - list_for_each_entry_rcu(encl_mm, &encl->mm_list, list) { - if (!mmget_not_zero(encl_mm->mm)) - continue; - - mmap_read_lock(encl_mm->mm); - - ret = sgx_encl_find(encl_mm->mm, addr, &vma); - if (!ret && encl == vma->vm_private_data) - zap_vma_ptes(vma, addr, PAGE_SIZE); - - mmap_read_unlock(encl_mm->mm); - - mmput_async(encl_mm->mm); - } + int ret; - srcu_read_unlock(&encl->srcu, idx); - } while (unlikely(encl->mm_list_version != mm_list_version)); + sgx_zap_enclave_ptes(encl, addr); mutex_lock(&encl->lock); @@ -170,6 +170,8 @@ static int __sgx_encl_ewb(struct sgx_epc_page *epc_page, void *va_slot, backing->pcmd_offset; ret = __ewb(&pginfo, sgx_get_epc_virt_addr(epc_page), va_slot); + set_page_dirty(backing->pcmd); + set_page_dirty(backing->contents); kunmap_atomic((void *)(unsigned long)(pginfo.metadata - backing->pcmd_offset)); @@ -178,37 +180,8 @@ static int __sgx_encl_ewb(struct sgx_epc_page *epc_page, void *va_slot, return ret; } -static void sgx_ipi_cb(void *info) -{ -} - -static const cpumask_t *sgx_encl_ewb_cpumask(struct sgx_encl *encl) +void sgx_ipi_cb(void *info) { - cpumask_t *cpumask = &encl->cpumask; - struct sgx_encl_mm *encl_mm; - int idx; - - /* - * Can race with sgx_encl_mm_add(), but ETRACK has already been - * executed, which means that the CPUs running in the new mm will enter - * into the enclave with a fresh epoch. - */ - cpumask_clear(cpumask); - - idx = srcu_read_lock(&encl->srcu); - - list_for_each_entry_rcu(encl_mm, &encl->mm_list, list) { - if (!mmget_not_zero(encl_mm->mm)) - continue; - - cpumask_or(cpumask, cpumask, mm_cpumask(encl_mm->mm)); - - mmput_async(encl_mm->mm); - } - - srcu_read_unlock(&encl->srcu, idx); - - return cpumask; } /* @@ -257,7 +230,7 @@ static void sgx_encl_ewb(struct sgx_epc_page *epc_page, * miss cpus that entered the enclave between * generating the mask and incrementing epoch. */ - on_each_cpu_mask(sgx_encl_ewb_cpumask(encl), + on_each_cpu_mask(sgx_encl_cpumask(encl), sgx_ipi_cb, NULL, 1); ret = __sgx_encl_ewb(epc_page, va_slot, backing); } @@ -287,9 +260,10 @@ static void sgx_reclaimer_write(struct sgx_epc_page *epc_page, sgx_encl_ewb(epc_page, backing); encl_page->epc_page = NULL; encl->secs_child_cnt--; + sgx_encl_put_backing(backing); if (!encl->secs_child_cnt && test_bit(SGX_ENCL_INITIALIZED, &encl->flags)) { - ret = sgx_encl_get_backing(encl, PFN_DOWN(encl->size), + ret = sgx_encl_alloc_backing(encl, PFN_DOWN(encl->size), &secs_backing); if (ret) goto out; @@ -299,7 +273,7 @@ static void sgx_reclaimer_write(struct sgx_epc_page *epc_page, sgx_encl_free_epc_page(encl->secs.epc_page); encl->secs.epc_page = NULL; - sgx_encl_put_backing(&secs_backing, true); + sgx_encl_put_backing(&secs_backing); } out: @@ -323,10 +297,8 @@ static void sgx_reclaim_pages(void) { struct sgx_epc_page *chunk[SGX_NR_TO_SCAN]; struct sgx_backing backing[SGX_NR_TO_SCAN]; - struct sgx_epc_section *section; struct sgx_encl_page *encl_page; struct sgx_epc_page *epc_page; - struct sgx_numa_node *node; pgoff_t page_index; int cnt = 0; int ret; @@ -360,11 +332,14 @@ static void sgx_reclaim_pages(void) goto skip; page_index = PFN_DOWN(encl_page->desc - encl_page->encl->base); - ret = sgx_encl_get_backing(encl_page->encl, page_index, &backing[i]); - if (ret) - goto skip; mutex_lock(&encl_page->encl->lock); + ret = sgx_encl_alloc_backing(encl_page->encl, page_index, &backing[i]); + if (ret) { + mutex_unlock(&encl_page->encl->lock); + goto skip; + } + encl_page->desc |= SGX_ENCL_PAGE_BEING_RECLAIMED; mutex_unlock(&encl_page->encl->lock); continue; @@ -392,18 +367,11 @@ skip: encl_page = epc_page->owner; sgx_reclaimer_write(epc_page, &backing[i]); - sgx_encl_put_backing(&backing[i], true); kref_put(&encl_page->encl->refcount, sgx_encl_release); epc_page->flags &= ~SGX_EPC_PAGE_RECLAIMER_TRACKED; - section = &sgx_epc_sections[epc_page->section]; - node = section->node; - - spin_lock(&node->lock); - list_add_tail(&epc_page->list, &node->free_page_list); - spin_unlock(&node->lock); - atomic_long_inc(&sgx_nr_free_pages); + sgx_free_epc_page(epc_page); } } @@ -413,6 +381,17 @@ static bool sgx_should_reclaim(unsigned long watermark) !list_empty(&sgx_active_page_list); } +/* + * sgx_reclaim_direct() should be called (without enclave's mutex held) + * in locations where SGX memory resources might be low and might be + * needed in order to make forward progress. + */ +void sgx_reclaim_direct(void) +{ + if (sgx_should_reclaim(SGX_NR_LOW_PAGES)) + sgx_reclaim_pages(); +} + static int ksgxd(void *p) { set_freezable(); @@ -422,10 +401,7 @@ static int ksgxd(void *p) * required for SECS pages, whose child pages blocked EREMOVE. */ __sgx_sanitize_pages(&sgx_dirty_page_list); - __sgx_sanitize_pages(&sgx_dirty_page_list); - - /* sanity check: */ - WARN_ON(!list_empty(&sgx_dirty_page_list)); + WARN_ON(__sgx_sanitize_pages(&sgx_dirty_page_list)); while (!kthread_should_stop()) { if (try_to_freeze()) @@ -457,6 +433,11 @@ static bool __init sgx_page_reclaimer_init(void) return true; } +bool current_is_ksgxd(void) +{ + return current == ksgxd_tsk; +} + static struct sgx_epc_page *__sgx_alloc_epc_page_from_node(int nid) { struct sgx_numa_node *node = &sgx_numa_nodes[nid]; @@ -471,6 +452,7 @@ static struct sgx_epc_page *__sgx_alloc_epc_page_from_node(int nid) page = list_first_entry(&node->free_page_list, struct sgx_epc_page, list); list_del_init(&page->list); + page->flags = 0; spin_unlock(&node->lock); atomic_long_dec(&sgx_nr_free_pages); @@ -624,7 +606,12 @@ void sgx_free_epc_page(struct sgx_epc_page *page) spin_lock(&node->lock); - list_add_tail(&page->list, &node->free_page_list); + page->owner = NULL; + if (page->poison) + list_add(&page->list, &node->sgx_poison_page_list); + else + list_add_tail(&page->list, &node->free_page_list); + page->flags = SGX_EPC_PAGE_IS_FREE; spin_unlock(&node->lock); atomic_long_inc(&sgx_nr_free_pages); @@ -648,17 +635,102 @@ static bool __init sgx_setup_epc_section(u64 phys_addr, u64 size, } section->phys_addr = phys_addr; + xa_store_range(&sgx_epc_address_space, section->phys_addr, + phys_addr + size - 1, section, GFP_KERNEL); for (i = 0; i < nr_pages; i++) { section->pages[i].section = index; section->pages[i].flags = 0; section->pages[i].owner = NULL; + section->pages[i].poison = 0; list_add_tail(§ion->pages[i].list, &sgx_dirty_page_list); } return true; } +bool arch_is_platform_page(u64 paddr) +{ + return !!xa_load(&sgx_epc_address_space, paddr); +} +EXPORT_SYMBOL_GPL(arch_is_platform_page); + +static struct sgx_epc_page *sgx_paddr_to_page(u64 paddr) +{ + struct sgx_epc_section *section; + + section = xa_load(&sgx_epc_address_space, paddr); + if (!section) + return NULL; + + return §ion->pages[PFN_DOWN(paddr - section->phys_addr)]; +} + +/* + * Called in process context to handle a hardware reported + * error in an SGX EPC page. + * If the MF_ACTION_REQUIRED bit is set in flags, then the + * context is the task that consumed the poison data. Otherwise + * this is called from a kernel thread unrelated to the page. + */ +int arch_memory_failure(unsigned long pfn, int flags) +{ + struct sgx_epc_page *page = sgx_paddr_to_page(pfn << PAGE_SHIFT); + struct sgx_epc_section *section; + struct sgx_numa_node *node; + + /* + * mm/memory-failure.c calls this routine for all errors + * where there isn't a "struct page" for the address. But that + * includes other address ranges besides SGX. + */ + if (!page) + return -ENXIO; + + /* + * If poison was consumed synchronously. Send a SIGBUS to + * the task. Hardware has already exited the SGX enclave and + * will not allow re-entry to an enclave that has a memory + * error. The signal may help the task understand why the + * enclave is broken. + */ + if (flags & MF_ACTION_REQUIRED) + force_sig(SIGBUS); + + section = &sgx_epc_sections[page->section]; + node = section->node; + + spin_lock(&node->lock); + + /* Already poisoned? Nothing more to do */ + if (page->poison) + goto out; + + page->poison = 1; + + /* + * If the page is on a free list, move it to the per-node + * poison page list. + */ + if (page->flags & SGX_EPC_PAGE_IS_FREE) { + list_move(&page->list, &node->sgx_poison_page_list); + goto out; + } + + /* + * TBD: Add additional plumbing to enable pre-emptive + * action for asynchronous poison notification. Until + * then just hope that the poison: + * a) is not accessed - sgx_free_epc_page() will deal with it + * when the user gives it back + * b) results in a recoverable machine check rather than + * a fatal one + */ +out: + spin_unlock(&node->lock); + return 0; +} + /** * A section metric is concatenated in a way that @low bits 12-31 define the * bits 12-31 of the metric and @high bits 0-19 define the bits 32-51 of the @@ -670,6 +742,48 @@ static inline u64 __init sgx_calc_section_metric(u64 low, u64 high) ((high & GENMASK_ULL(19, 0)) << 32); } +#ifdef CONFIG_NUMA +static ssize_t sgx_total_bytes_show(struct device *dev, struct device_attribute *attr, char *buf) +{ + return sysfs_emit(buf, "%lu\n", sgx_numa_nodes[dev->id].size); +} +static DEVICE_ATTR_RO(sgx_total_bytes); + +static umode_t arch_node_attr_is_visible(struct kobject *kobj, + struct attribute *attr, int idx) +{ + /* Make all x86/ attributes invisible when SGX is not initialized: */ + if (nodes_empty(sgx_numa_mask)) + return 0; + + return attr->mode; +} + +static struct attribute *arch_node_dev_attrs[] = { + &dev_attr_sgx_total_bytes.attr, + NULL, +}; + +const struct attribute_group arch_node_dev_group = { + .name = "x86", + .attrs = arch_node_dev_attrs, + .is_visible = arch_node_attr_is_visible, +}; + +static void __init arch_update_sysfs_visibility(int nid) +{ + struct node *node = node_devices[nid]; + int ret; + + ret = sysfs_update_group(&node->dev.kobj, &arch_node_dev_group); + + if (ret) + pr_err("sysfs update failed (%d), files may be invisible", ret); +} +#else /* !CONFIG_NUMA */ +static void __init arch_update_sysfs_visibility(int nid) {} +#endif + static bool __init sgx_page_cache_init(void) { u32 eax, ebx, ecx, edx, type; @@ -713,10 +827,16 @@ static bool __init sgx_page_cache_init(void) if (!node_isset(nid, sgx_numa_mask)) { spin_lock_init(&sgx_numa_nodes[nid].lock); INIT_LIST_HEAD(&sgx_numa_nodes[nid].free_page_list); + INIT_LIST_HEAD(&sgx_numa_nodes[nid].sgx_poison_page_list); node_set(nid, sgx_numa_mask); + sgx_numa_nodes[nid].size = 0; + + /* Make SGX-specific node sysfs files visible: */ + arch_update_sysfs_visibility(nid); } sgx_epc_sections[i].node = &sgx_numa_nodes[nid]; + sgx_numa_nodes[nid].size += size; sgx_nr_epc_sections++; } diff --git a/arch/x86/kernel/cpu/sgx/sgx.h b/arch/x86/kernel/cpu/sgx/sgx.h index 4628acec0009..0f2020653fba 100644 --- a/arch/x86/kernel/cpu/sgx/sgx.h +++ b/arch/x86/kernel/cpu/sgx/sgx.h @@ -26,9 +26,13 @@ /* Pages, which are being tracked by the page reclaimer. */ #define SGX_EPC_PAGE_RECLAIMER_TRACKED BIT(0) +/* Pages on free list */ +#define SGX_EPC_PAGE_IS_FREE BIT(1) + struct sgx_epc_page { unsigned int section; - unsigned int flags; + u16 flags; + u16 poison; struct sgx_encl_page *owner; struct list_head list; }; @@ -39,6 +43,8 @@ struct sgx_epc_page { */ struct sgx_numa_node { struct list_head free_page_list; + struct list_head sgx_poison_page_list; + unsigned long size; spinlock_t lock; }; @@ -80,10 +86,13 @@ static inline void *sgx_get_epc_virt_addr(struct sgx_epc_page *page) struct sgx_epc_page *__sgx_alloc_epc_page(void); void sgx_free_epc_page(struct sgx_epc_page *page); +void sgx_reclaim_direct(void); void sgx_mark_page_reclaimable(struct sgx_epc_page *page); int sgx_unmark_page_reclaimable(struct sgx_epc_page *page); struct sgx_epc_page *sgx_alloc_epc_page(void *owner, bool reclaim); +void sgx_ipi_cb(void *info); + #ifdef CONFIG_X86_SGX_KVM int __init sgx_vepc_init(void); #else diff --git a/arch/x86/kernel/cpu/topology.c b/arch/x86/kernel/cpu/topology.c index 132a2de44d2f..5e868b62a7c4 100644 --- a/arch/x86/kernel/cpu/topology.c +++ b/arch/x86/kernel/cpu/topology.c @@ -96,6 +96,7 @@ int detect_extended_topology(struct cpuinfo_x86 *c) unsigned int ht_mask_width, core_plus_mask_width, die_plus_mask_width; unsigned int core_select_mask, core_level_siblings; unsigned int die_select_mask, die_level_siblings; + unsigned int pkg_mask_width; bool die_level_present = false; int leaf; @@ -111,10 +112,10 @@ int detect_extended_topology(struct cpuinfo_x86 *c) core_level_siblings = smp_num_siblings = LEVEL_MAX_SIBLINGS(ebx); core_plus_mask_width = ht_mask_width = BITS_SHIFT_NEXT_LEVEL(eax); die_level_siblings = LEVEL_MAX_SIBLINGS(ebx); - die_plus_mask_width = BITS_SHIFT_NEXT_LEVEL(eax); + pkg_mask_width = die_plus_mask_width = BITS_SHIFT_NEXT_LEVEL(eax); sub_index = 1; - do { + while (true) { cpuid_count(leaf, sub_index, &eax, &ebx, &ecx, &edx); /* @@ -132,10 +133,15 @@ int detect_extended_topology(struct cpuinfo_x86 *c) die_plus_mask_width = BITS_SHIFT_NEXT_LEVEL(eax); } + if (LEAFB_SUBTYPE(ecx) != INVALID_TYPE) + pkg_mask_width = BITS_SHIFT_NEXT_LEVEL(eax); + else + break; + sub_index++; - } while (LEAFB_SUBTYPE(ecx) != INVALID_TYPE); + } - core_select_mask = (~(-1 << core_plus_mask_width)) >> ht_mask_width; + core_select_mask = (~(-1 << pkg_mask_width)) >> ht_mask_width; die_select_mask = (~(-1 << die_plus_mask_width)) >> core_plus_mask_width; @@ -148,7 +154,7 @@ int detect_extended_topology(struct cpuinfo_x86 *c) } c->phys_proc_id = apic->phys_pkg_id(c->initial_apicid, - die_plus_mask_width); + pkg_mask_width); /* * Reinit the apicid, now that we have extended initial_apicid. */ diff --git a/arch/x86/kernel/cpu/tsx.c b/arch/x86/kernel/cpu/tsx.c index 9c7a5f049292..ec7bbac3a9f2 100644 --- a/arch/x86/kernel/cpu/tsx.c +++ b/arch/x86/kernel/cpu/tsx.c @@ -19,7 +19,7 @@ enum tsx_ctrl_states tsx_ctrl_state __ro_after_init = TSX_CTRL_NOT_SUPPORTED; -void tsx_disable(void) +static void tsx_disable(void) { u64 tsx; @@ -39,7 +39,7 @@ void tsx_disable(void) wrmsrl(MSR_IA32_TSX_CTRL, tsx); } -void tsx_enable(void) +static void tsx_enable(void) { u64 tsx; @@ -58,7 +58,7 @@ void tsx_enable(void) wrmsrl(MSR_IA32_TSX_CTRL, tsx); } -static bool __init tsx_ctrl_is_supported(void) +static bool tsx_ctrl_is_supported(void) { u64 ia32_cap = x86_read_arch_cap_msr(); @@ -84,7 +84,45 @@ static enum tsx_ctrl_states x86_get_tsx_auto_mode(void) return TSX_CTRL_ENABLE; } -void tsx_clear_cpuid(void) +/* + * Disabling TSX is not a trivial business. + * + * First of all, there's a CPUID bit: X86_FEATURE_RTM_ALWAYS_ABORT + * which says that TSX is practically disabled (all transactions are + * aborted by default). When that bit is set, the kernel unconditionally + * disables TSX. + * + * In order to do that, however, it needs to dance a bit: + * + * 1. The first method to disable it is through MSR_TSX_FORCE_ABORT and + * the MSR is present only when *two* CPUID bits are set: + * + * - X86_FEATURE_RTM_ALWAYS_ABORT + * - X86_FEATURE_TSX_FORCE_ABORT + * + * 2. The second method is for CPUs which do not have the above-mentioned + * MSR: those use a different MSR - MSR_IA32_TSX_CTRL and disable TSX + * through that one. Those CPUs can also have the initially mentioned + * CPUID bit X86_FEATURE_RTM_ALWAYS_ABORT set and for those the same strategy + * applies: TSX gets disabled unconditionally. + * + * When either of the two methods are present, the kernel disables TSX and + * clears the respective RTM and HLE feature flags. + * + * An additional twist in the whole thing presents late microcode loading + * which, when done, may cause for the X86_FEATURE_RTM_ALWAYS_ABORT CPUID + * bit to be set after the update. + * + * A subsequent hotplug operation on any logical CPU except the BSP will + * cause for the supported CPUID feature bits to get re-detected and, if + * RTM and HLE get cleared all of a sudden, but, userspace did consult + * them before the update, then funny explosions will happen. Long story + * short: the kernel doesn't modify CPUID feature bits after booting. + * + * That's why, this function's call in init_intel() doesn't clear the + * feature flags. + */ +static void tsx_clear_cpuid(void) { u64 msr; @@ -97,6 +135,39 @@ void tsx_clear_cpuid(void) rdmsrl(MSR_TSX_FORCE_ABORT, msr); msr |= MSR_TFA_TSX_CPUID_CLEAR; wrmsrl(MSR_TSX_FORCE_ABORT, msr); + } else if (tsx_ctrl_is_supported()) { + rdmsrl(MSR_IA32_TSX_CTRL, msr); + msr |= TSX_CTRL_CPUID_CLEAR; + wrmsrl(MSR_IA32_TSX_CTRL, msr); + } +} + +/* + * Disable TSX development mode + * + * When the microcode released in Feb 2022 is applied, TSX will be disabled by + * default on some processors. MSR 0x122 (TSX_CTRL) and MSR 0x123 + * (IA32_MCU_OPT_CTRL) can be used to re-enable TSX for development, doing so is + * not recommended for production deployments. In particular, applying MD_CLEAR + * flows for mitigation of the Intel TSX Asynchronous Abort (TAA) transient + * execution attack may not be effective on these processors when Intel TSX is + * enabled with updated microcode. + */ +static void tsx_dev_mode_disable(void) +{ + u64 mcu_opt_ctrl; + + /* Check if RTM_ALLOW exists */ + if (!boot_cpu_has_bug(X86_BUG_TAA) || !tsx_ctrl_is_supported() || + !cpu_feature_enabled(X86_FEATURE_SRBDS_CTRL)) + return; + + rdmsrl(MSR_IA32_MCU_OPT_CTRL, mcu_opt_ctrl); + + if (mcu_opt_ctrl & RTM_ALLOW) { + mcu_opt_ctrl &= ~RTM_ALLOW; + wrmsrl(MSR_IA32_MCU_OPT_CTRL, mcu_opt_ctrl); + setup_force_cpu_cap(X86_FEATURE_RTM_ALWAYS_ABORT); } } @@ -105,14 +176,14 @@ void __init tsx_init(void) char arg[5] = {}; int ret; + tsx_dev_mode_disable(); + /* - * Hardware will always abort a TSX transaction if both CPUID bits - * RTM_ALWAYS_ABORT and TSX_FORCE_ABORT are set. In this case, it is - * better not to enumerate CPUID.RTM and CPUID.HLE bits. Clear them - * here. + * Hardware will always abort a TSX transaction when the CPUID bit + * RTM_ALWAYS_ABORT is set. In this case, it is better not to enumerate + * CPUID.RTM and CPUID.HLE bits. Clear them here. */ - if (boot_cpu_has(X86_FEATURE_RTM_ALWAYS_ABORT) && - boot_cpu_has(X86_FEATURE_TSX_FORCE_ABORT)) { + if (boot_cpu_has(X86_FEATURE_RTM_ALWAYS_ABORT)) { tsx_ctrl_state = TSX_CTRL_RTM_ALWAYS_ABORT; tsx_clear_cpuid(); setup_clear_cpu_cap(X86_FEATURE_RTM); @@ -175,3 +246,16 @@ void __init tsx_init(void) setup_force_cpu_cap(X86_FEATURE_HLE); } } + +void tsx_ap_init(void) +{ + tsx_dev_mode_disable(); + + if (tsx_ctrl_state == TSX_CTRL_ENABLE) + tsx_enable(); + else if (tsx_ctrl_state == TSX_CTRL_DISABLE) + tsx_disable(); + else if (tsx_ctrl_state == TSX_CTRL_RTM_ALWAYS_ABORT) + /* See comment over that function for more details. */ + tsx_clear_cpuid(); +} diff --git a/arch/x86/kernel/cpu/vmware.c b/arch/x86/kernel/cpu/vmware.c index c04b933f48d3..02039ec3597d 100644 --- a/arch/x86/kernel/cpu/vmware.c +++ b/arch/x86/kernel/cpu/vmware.c @@ -476,8 +476,8 @@ static bool __init vmware_legacy_x2apic_available(void) { uint32_t eax, ebx, ecx, edx; VMWARE_CMD(GETVCPU_INFO, eax, ebx, ecx, edx); - return (eax & (1 << VMWARE_CMD_VCPU_RESERVED)) == 0 && - (eax & (1 << VMWARE_CMD_LEGACY_X2APIC)) != 0; + return !(eax & BIT(VMWARE_CMD_VCPU_RESERVED)) && + (eax & BIT(VMWARE_CMD_LEGACY_X2APIC)); } #ifdef CONFIG_AMD_MEM_ENCRYPT diff --git a/arch/x86/kernel/crash.c b/arch/x86/kernel/crash.c index e8326a8d1c5d..9730c88530fc 100644 --- a/arch/x86/kernel/crash.c +++ b/arch/x86/kernel/crash.c @@ -407,7 +407,7 @@ int crash_load_segments(struct kimage *image) } image->elf_load_addr = kbuf.mem; pr_debug("Loaded ELF headers at 0x%lx bufsz=0x%lx memsz=0x%lx\n", - image->elf_load_addr, kbuf.bufsz, kbuf.bufsz); + image->elf_load_addr, kbuf.bufsz, kbuf.memsz); return ret; } diff --git a/arch/x86/kernel/crash_dump_32.c b/arch/x86/kernel/crash_dump_32.c index 5fcac46aaf6b..5f4ae5476e19 100644 --- a/arch/x86/kernel/crash_dump_32.c +++ b/arch/x86/kernel/crash_dump_32.c @@ -10,8 +10,7 @@ #include <linux/errno.h> #include <linux/highmem.h> #include <linux/crash_dump.h> - -#include <linux/uaccess.h> +#include <linux/uio.h> static inline bool is_crashed_pfn_valid(unsigned long pfn) { @@ -29,21 +28,8 @@ static inline bool is_crashed_pfn_valid(unsigned long pfn) #endif } -/** - * copy_oldmem_page - copy one page from "oldmem" - * @pfn: page frame number to be copied - * @buf: target memory address for the copy; this can be in kernel address - * space or user address space (see @userbuf) - * @csize: number of bytes to copy - * @offset: offset in bytes into the page (based on pfn) to begin the copy - * @userbuf: if set, @buf is in user address space, use copy_to_user(), - * otherwise @buf is in kernel address space, use memcpy(). - * - * Copy a page from "oldmem". For this page, there might be no pte mapped - * in the current kernel. - */ -ssize_t copy_oldmem_page(unsigned long pfn, char *buf, size_t csize, - unsigned long offset, int userbuf) +ssize_t copy_oldmem_page(struct iov_iter *iter, unsigned long pfn, size_t csize, + unsigned long offset) { void *vaddr; @@ -54,14 +40,7 @@ ssize_t copy_oldmem_page(unsigned long pfn, char *buf, size_t csize, return -EFAULT; vaddr = kmap_local_pfn(pfn); - - if (!userbuf) { - memcpy(buf, vaddr + offset, csize); - } else { - if (copy_to_user(buf, vaddr + offset, csize)) - csize = -EFAULT; - } - + csize = copy_to_iter(vaddr + offset, csize, iter); kunmap_local(vaddr); return csize; diff --git a/arch/x86/kernel/crash_dump_64.c b/arch/x86/kernel/crash_dump_64.c index a7f617a3981d..e75bc2f217ff 100644 --- a/arch/x86/kernel/crash_dump_64.c +++ b/arch/x86/kernel/crash_dump_64.c @@ -8,12 +8,12 @@ #include <linux/errno.h> #include <linux/crash_dump.h> -#include <linux/uaccess.h> +#include <linux/uio.h> #include <linux/io.h> #include <linux/cc_platform.h> -static ssize_t __copy_oldmem_page(unsigned long pfn, char *buf, size_t csize, - unsigned long offset, int userbuf, +static ssize_t __copy_oldmem_page(struct iov_iter *iter, unsigned long pfn, + size_t csize, unsigned long offset, bool encrypted) { void *vaddr; @@ -29,51 +29,36 @@ static ssize_t __copy_oldmem_page(unsigned long pfn, char *buf, size_t csize, if (!vaddr) return -ENOMEM; - if (userbuf) { - if (copy_to_user((void __user *)buf, vaddr + offset, csize)) { - iounmap((void __iomem *)vaddr); - return -EFAULT; - } - } else - memcpy(buf, vaddr + offset, csize); + csize = copy_to_iter(vaddr + offset, csize, iter); - set_iounmap_nonlazy(); iounmap((void __iomem *)vaddr); return csize; } -/** - * copy_oldmem_page - copy one page of memory - * @pfn: page frame number to be copied - * @buf: target memory address for the copy; this can be in kernel address - * space or user address space (see @userbuf) - * @csize: number of bytes to copy - * @offset: offset in bytes into the page (based on pfn) to begin the copy - * @userbuf: if set, @buf is in user address space, use copy_to_user(), - * otherwise @buf is in kernel address space, use memcpy(). - * - * Copy a page from the old kernel's memory. For this page, there is no pte - * mapped in the current kernel. We stitch up a pte, similar to kmap_atomic. - */ -ssize_t copy_oldmem_page(unsigned long pfn, char *buf, size_t csize, - unsigned long offset, int userbuf) +ssize_t copy_oldmem_page(struct iov_iter *iter, unsigned long pfn, size_t csize, + unsigned long offset) { - return __copy_oldmem_page(pfn, buf, csize, offset, userbuf, false); + return __copy_oldmem_page(iter, pfn, csize, offset, false); } -/** +/* * copy_oldmem_page_encrypted - same as copy_oldmem_page() above but ioremap the * memory with the encryption mask set to accommodate kdump on SME-enabled * machines. */ -ssize_t copy_oldmem_page_encrypted(unsigned long pfn, char *buf, size_t csize, - unsigned long offset, int userbuf) +ssize_t copy_oldmem_page_encrypted(struct iov_iter *iter, unsigned long pfn, + size_t csize, unsigned long offset) { - return __copy_oldmem_page(pfn, buf, csize, offset, userbuf, true); + return __copy_oldmem_page(iter, pfn, csize, offset, true); } ssize_t elfcorehdr_read(char *buf, size_t count, u64 *ppos) { - return read_from_oldmem(buf, count, ppos, 0, + struct kvec kvec = { .iov_base = buf, .iov_len = count }; + struct iov_iter iter; + + iov_iter_kvec(&iter, READ, &kvec, 1, count); + + return read_from_oldmem(&iter, count, ppos, cc_platform_has(CC_ATTR_GUEST_MEM_ENCRYPT)); } diff --git a/arch/x86/kernel/dumpstack.c b/arch/x86/kernel/dumpstack.c index ea4fe192189d..0bf6779187dd 100644 --- a/arch/x86/kernel/dumpstack.c +++ b/arch/x86/kernel/dumpstack.c @@ -81,12 +81,6 @@ static int copy_code(struct pt_regs *regs, u8 *buf, unsigned long src, /* The user space code from other tasks cannot be accessed. */ if (regs != task_pt_regs(current)) return -EPERM; - /* - * Make sure userspace isn't trying to trick us into dumping kernel - * memory by pointing the userspace instruction pointer at it. - */ - if (__chk_range_not_ok(src, nbytes, TASK_SIZE_MAX)) - return -EINVAL; /* * Even if named copy_from_user_nmi() this can be invoked from @@ -134,7 +128,7 @@ void show_opcodes(struct pt_regs *regs, const char *loglvl) /* No access to the user space stack of other tasks. Ignore. */ break; default: - printk("%sCode: Unable to access opcode bytes at RIP 0x%lx.\n", + printk("%sCode: Unable to access opcode bytes at 0x%lx.\n", loglvl, prologue); break; } @@ -183,6 +177,12 @@ static void show_regs_if_on_stack(struct stack_info *info, struct pt_regs *regs, } } +/* + * This function reads pointers from the stack and dereferences them. The + * pointers may not have their KMSAN shadow set up properly, which may result + * in false positive reports. Disable instrumentation to avoid those. + */ +__no_kmsan_checks static void show_trace_log_lvl(struct task_struct *task, struct pt_regs *regs, unsigned long *stack, const char *log_lvl) { @@ -351,7 +351,7 @@ unsigned long oops_begin(void) } NOKPROBE_SYMBOL(oops_begin); -void __noreturn rewind_stack_do_exit(int signr); +void __noreturn rewind_stack_and_make_dead(int signr); void oops_end(unsigned long flags, struct pt_regs *regs, int signr) { @@ -386,7 +386,7 @@ void oops_end(unsigned long flags, struct pt_regs *regs, int signr) * reuse the task stack and that existing poisons are invalid. */ kasan_unpoison_task_stack(current); - rewind_stack_do_exit(signr); + rewind_stack_and_make_dead(signr); } NOKPROBE_SYMBOL(oops_end); diff --git a/arch/x86/kernel/e820.c b/arch/x86/kernel/e820.c index bc0657f0deed..9dac24680ff8 100644 --- a/arch/x86/kernel/e820.c +++ b/arch/x86/kernel/e820.c @@ -995,8 +995,10 @@ early_param("memmap", parse_memmap_opt); */ void __init e820__reserve_setup_data(void) { + struct setup_indirect *indirect; struct setup_data *data; - u64 pa_data; + u64 pa_data, pa_next; + u32 len; pa_data = boot_params.hdr.setup_data; if (!pa_data) @@ -1004,29 +1006,46 @@ void __init e820__reserve_setup_data(void) while (pa_data) { data = early_memremap(pa_data, sizeof(*data)); + if (!data) { + pr_warn("e820: failed to memremap setup_data entry\n"); + return; + } + + len = sizeof(*data); + pa_next = data->next; + e820__range_update(pa_data, sizeof(*data)+data->len, E820_TYPE_RAM, E820_TYPE_RESERVED_KERN); /* - * SETUP_EFI is supplied by kexec and does not need to be - * reserved. + * SETUP_EFI and SETUP_IMA are supplied by kexec and do not need + * to be reserved. */ - if (data->type != SETUP_EFI) + if (data->type != SETUP_EFI && data->type != SETUP_IMA) e820__range_update_kexec(pa_data, sizeof(*data) + data->len, E820_TYPE_RAM, E820_TYPE_RESERVED_KERN); - if (data->type == SETUP_INDIRECT && - ((struct setup_indirect *)data->data)->type != SETUP_INDIRECT) { - e820__range_update(((struct setup_indirect *)data->data)->addr, - ((struct setup_indirect *)data->data)->len, - E820_TYPE_RAM, E820_TYPE_RESERVED_KERN); - e820__range_update_kexec(((struct setup_indirect *)data->data)->addr, - ((struct setup_indirect *)data->data)->len, - E820_TYPE_RAM, E820_TYPE_RESERVED_KERN); + if (data->type == SETUP_INDIRECT) { + len += data->len; + early_memunmap(data, sizeof(*data)); + data = early_memremap(pa_data, len); + if (!data) { + pr_warn("e820: failed to memremap indirect setup_data\n"); + return; + } + + indirect = (struct setup_indirect *)data->data; + + if (indirect->type != SETUP_INDIRECT) { + e820__range_update(indirect->addr, indirect->len, + E820_TYPE_RAM, E820_TYPE_RESERVED_KERN); + e820__range_update_kexec(indirect->addr, indirect->len, + E820_TYPE_RAM, E820_TYPE_RESERVED_KERN); + } } - pa_data = data->next; - early_memunmap(data, sizeof(*data)); + pa_data = pa_next; + early_memunmap(data, len); } e820__update_table(e820_table); diff --git a/arch/x86/kernel/early-quirks.c b/arch/x86/kernel/early-quirks.c index 391a4e2b8604..a6c1867fc7aa 100644 --- a/arch/x86/kernel/early-quirks.c +++ b/arch/x86/kernel/early-quirks.c @@ -18,6 +18,7 @@ #include <linux/bcma/bcma_regs.h> #include <linux/platform_data/x86/apple.h> #include <drm/i915_drm.h> +#include <drm/i915_pciids.h> #include <asm/pci-direct.h> #include <asm/dma.h> #include <asm/io_apic.h> @@ -515,6 +516,7 @@ static const struct intel_early_ops gen11_early_ops __initconst = { .stolen_size = gen9_stolen_size, }; +/* Intel integrated GPUs for which we need to reserve "stolen memory" */ static const struct pci_device_id intel_early_ids[] __initconst = { INTEL_I830_IDS(&i830_early_ops), INTEL_I845G_IDS(&i845_early_ops), @@ -554,6 +556,9 @@ static const struct pci_device_id intel_early_ids[] __initconst = { INTEL_RKL_IDS(&gen11_early_ops), INTEL_ADLS_IDS(&gen11_early_ops), INTEL_ADLP_IDS(&gen11_early_ops), + INTEL_ADLN_IDS(&gen11_early_ops), + INTEL_RPLS_IDS(&gen11_early_ops), + INTEL_RPLP_IDS(&gen11_early_ops), }; struct resource intel_graphics_stolen_res __ro_after_init = DEFINE_RES_MEM(0, 0); @@ -591,6 +596,13 @@ static void __init intel_graphics_quirks(int num, int slot, int func) u16 device; int i; + /* + * Reserve "stolen memory" for an integrated GPU. If we've already + * found one, there's nothing to do for other (discrete) GPUs. + */ + if (resource_size(&intel_graphics_stolen_res)) + return; + device = read_pci_config_16(num, slot, func, PCI_DEVICE_ID); for (i = 0; i < ARRAY_SIZE(intel_early_ids); i++) { @@ -703,7 +715,7 @@ static struct chipset early_qrk[] __initdata = { { PCI_VENDOR_ID_INTEL, 0x3406, PCI_CLASS_BRIDGE_HOST, PCI_BASE_CLASS_BRIDGE, 0, intel_remapping_check }, { PCI_VENDOR_ID_INTEL, PCI_ANY_ID, PCI_CLASS_DISPLAY_VGA, PCI_ANY_ID, - QFLAG_APPLY_ONCE, intel_graphics_quirks }, + 0, intel_graphics_quirks }, /* * HPET on the current version of the Baytrail platform has accuracy * problems: it will halt in deep idle state - so we disable it. diff --git a/arch/x86/kernel/early_printk.c b/arch/x86/kernel/early_printk.c index d3c531d3b244..44f937015e1e 100644 --- a/arch/x86/kernel/early_printk.c +++ b/arch/x86/kernel/early_printk.c @@ -264,11 +264,11 @@ static __init void early_pci_serial_init(char *s) bar0 = read_pci_config(bus, slot, func, PCI_BASE_ADDRESS_0); /* - * Verify it is a UART type device + * Verify it is a 16550-UART type device */ if (((classcode >> 16 != PCI_CLASS_COMMUNICATION_MODEM) && (classcode >> 16 != PCI_CLASS_COMMUNICATION_SERIAL)) || - (((classcode >> 8) & 0xff) != 0x02)) /* 16550 I/F at BAR0 */ { + (((classcode >> 8) & 0xff) != PCI_SERIAL_16550_COMPATIBLE)) { if (!force) return; } @@ -276,22 +276,22 @@ static __init void early_pci_serial_init(char *s) /* * Determine if it is IO or memory mapped */ - if (bar0 & 0x01) { + if ((bar0 & PCI_BASE_ADDRESS_SPACE) == PCI_BASE_ADDRESS_SPACE_IO) { /* it is IO mapped */ serial_in = io_serial_in; serial_out = io_serial_out; - early_serial_base = bar0&0xfffffffc; + early_serial_base = bar0 & PCI_BASE_ADDRESS_IO_MASK; write_pci_config(bus, slot, func, PCI_COMMAND, - cmdreg|PCI_COMMAND_IO); + cmdreg|PCI_COMMAND_IO); } else { /* It is memory mapped - assume 32-bit alignment */ serial_in = mem32_serial_in; serial_out = mem32_serial_out; /* WARNING! assuming the address is always in the first 4G */ early_serial_base = - (unsigned long)early_ioremap(bar0 & 0xfffffff0, 0x10); + (unsigned long)early_ioremap(bar0 & PCI_BASE_ADDRESS_MEM_MASK, 0x10); write_pci_config(bus, slot, func, PCI_COMMAND, - cmdreg|PCI_COMMAND_MEMORY); + cmdreg|PCI_COMMAND_MEMORY); } /* @@ -387,7 +387,7 @@ static int __init setup_early_printk(char *buf) #endif #ifdef CONFIG_EARLY_PRINTK_USB_XDBC if (!strncmp(buf, "xdbc", 4)) - early_xdbc_parse_parameter(buf + 4); + early_xdbc_parse_parameter(buf + 4, keep); #endif buf++; diff --git a/arch/x86/kernel/espfix_64.c b/arch/x86/kernel/espfix_64.c index 4fe7af58cfe1..9417d5aa7305 100644 --- a/arch/x86/kernel/espfix_64.c +++ b/arch/x86/kernel/espfix_64.c @@ -100,7 +100,7 @@ static void init_espfix_random(void) * This is run before the entropy pools are initialized, * but this is hopefully better than nothing. */ - if (!arch_get_random_long(&rand)) { + if (!arch_get_random_longs(&rand, 1)) { /* The constant is an arbitrary large prime */ rand = rdtsc(); rand *= 0xc345c6b72fd16123UL; diff --git a/arch/x86/kernel/fpu/core.c b/arch/x86/kernel/fpu/core.c index 8ea306b1bf8e..3b28c5b25e12 100644 --- a/arch/x86/kernel/fpu/core.c +++ b/arch/x86/kernel/fpu/core.c @@ -14,6 +14,8 @@ #include <asm/traps.h> #include <asm/irq_regs.h> +#include <uapi/asm/kvm.h> + #include <linux/hardirq.h> #include <linux/pkeys.h> #include <linux/vmalloc.h> @@ -41,17 +43,7 @@ struct fpu_state_config fpu_user_cfg __ro_after_init; */ struct fpstate init_fpstate __ro_after_init; -/* - * Track whether the kernel is using the FPU state - * currently. - * - * This flag is used: - * - * - by IRQ context code to potentially use the FPU - * if it's unused. - * - * - to debug kernel_fpu_begin()/end() correctness - */ +/* Track in-kernel FPU usage */ static DEFINE_PER_CPU(bool, in_kernel_fpu); /* @@ -59,44 +51,52 @@ static DEFINE_PER_CPU(bool, in_kernel_fpu); */ DEFINE_PER_CPU(struct fpu *, fpu_fpregs_owner_ctx); -static bool kernel_fpu_disabled(void) -{ - return this_cpu_read(in_kernel_fpu); -} - -static bool interrupted_kernel_fpu_idle(void) -{ - return !kernel_fpu_disabled(); -} - /* - * Were we in user mode (or vm86 mode) when we were - * interrupted? - * - * Doing kernel_fpu_begin/end() is ok if we are running - * in an interrupt context from user mode - we'll just - * save the FPU state as required. + * Can we use the FPU in kernel mode with the + * whole "kernel_fpu_begin/end()" sequence? */ -static bool interrupted_user_mode(void) +bool irq_fpu_usable(void) { - struct pt_regs *regs = get_irq_regs(); - return regs && user_mode(regs); + if (WARN_ON_ONCE(in_nmi())) + return false; + + /* In kernel FPU usage already active? */ + if (this_cpu_read(in_kernel_fpu)) + return false; + + /* + * When not in NMI or hard interrupt context, FPU can be used in: + * + * - Task context except from within fpregs_lock()'ed critical + * regions. + * + * - Soft interrupt processing context which cannot happen + * while in a fpregs_lock()'ed critical region. + */ + if (!in_hardirq()) + return true; + + /* + * In hard interrupt context it's safe when soft interrupts + * are enabled, which means the interrupt did not hit in + * a fpregs_lock()'ed critical region. + */ + return !softirq_count(); } +EXPORT_SYMBOL(irq_fpu_usable); /* - * Can we use the FPU in kernel mode with the - * whole "kernel_fpu_begin/end()" sequence? - * - * It's always ok in process context (ie "not interrupt") - * but it is sometimes ok even from an irq. + * Track AVX512 state use because it is known to slow the max clock + * speed of the core. */ -bool irq_fpu_usable(void) +static void update_avx_timestamp(struct fpu *fpu) { - return !in_interrupt() || - interrupted_user_mode() || - interrupted_kernel_fpu_idle(); + +#define AVX512_TRACKING_MASK (XFEATURE_MASK_ZMM_Hi256 | XFEATURE_MASK_Hi16_ZMM) + + if (fpu->fpstate->regs.xsave.header.xfeatures & AVX512_TRACKING_MASK) + fpu->avx512_timestamp = jiffies; } -EXPORT_SYMBOL(irq_fpu_usable); /* * Save the FPU register state in fpu->fpstate->regs. The register state is @@ -116,13 +116,7 @@ void save_fpregs_to_fpstate(struct fpu *fpu) { if (likely(use_xsave())) { os_xsave(fpu->fpstate); - - /* - * AVX512 state is tracked here because its use is - * known to slow the max clock speed of the core. - */ - if (fpu->fpstate->regs.xsave.header.xfeatures & XFEATURE_MASK_AVX512) - fpu->avx512_timestamp = jiffies; + update_avx_timestamp(fpu); return; } @@ -199,7 +193,27 @@ void fpu_reset_from_exception_fixup(void) } #if IS_ENABLED(CONFIG_KVM) -static void __fpstate_reset(struct fpstate *fpstate); +static void __fpstate_reset(struct fpstate *fpstate, u64 xfd); + +static void fpu_init_guest_permissions(struct fpu_guest *gfpu) +{ + struct fpu_state_perm *fpuperm; + u64 perm; + + if (!IS_ENABLED(CONFIG_X86_64)) + return; + + spin_lock_irq(¤t->sighand->siglock); + fpuperm = ¤t->group_leader->thread.fpu.guest_perm; + perm = fpuperm->__state_perm; + + /* First fpstate allocation locks down permissions. */ + WRITE_ONCE(fpuperm->__state_perm, perm | FPU_GUEST_PERM_LOCKED); + + spin_unlock_irq(¤t->sighand->siglock); + + gfpu->perm = perm & ~FPU_GUEST_PERM_LOCKED; +} bool fpu_alloc_guest_fpstate(struct fpu_guest *gfpu) { @@ -211,12 +225,31 @@ bool fpu_alloc_guest_fpstate(struct fpu_guest *gfpu) if (!fpstate) return false; - __fpstate_reset(fpstate); + /* Leave xfd to 0 (the reset value defined by spec) */ + __fpstate_reset(fpstate, 0); fpstate_init_user(fpstate); fpstate->is_valloc = true; fpstate->is_guest = true; - gfpu->fpstate = fpstate; + gfpu->fpstate = fpstate; + gfpu->xfeatures = fpu_user_cfg.default_features; + gfpu->perm = fpu_user_cfg.default_features; + + /* + * KVM sets the FP+SSE bits in the XSAVE header when copying FPU state + * to userspace, even when XSAVE is unsupported, so that restoring FPU + * state on a different CPU that does support XSAVE can cleanly load + * the incoming state using its natural XSAVE. In other words, KVM's + * uABI size may be larger than this host's default size. Conversely, + * the default size should never be larger than KVM's base uABI size; + * all features that can expand the uABI size must be opt-in. + */ + gfpu->uabi_size = sizeof(struct kvm_xsave); + if (WARN_ON_ONCE(fpu_user_cfg.default_size > gfpu->uabi_size)) + gfpu->uabi_size = fpu_user_cfg.default_size; + + fpu_init_guest_permissions(gfpu); + return true; } EXPORT_SYMBOL_GPL(fpu_alloc_guest_fpstate); @@ -236,6 +269,64 @@ void fpu_free_guest_fpstate(struct fpu_guest *gfpu) } EXPORT_SYMBOL_GPL(fpu_free_guest_fpstate); +/* + * fpu_enable_guest_xfd_features - Check xfeatures against guest perm and enable + * @guest_fpu: Pointer to the guest FPU container + * @xfeatures: Features requested by guest CPUID + * + * Enable all dynamic xfeatures according to guest perm and requested CPUID. + * + * Return: 0 on success, error code otherwise + */ +int fpu_enable_guest_xfd_features(struct fpu_guest *guest_fpu, u64 xfeatures) +{ + lockdep_assert_preemption_enabled(); + + /* Nothing to do if all requested features are already enabled. */ + xfeatures &= ~guest_fpu->xfeatures; + if (!xfeatures) + return 0; + + return __xfd_enable_feature(xfeatures, guest_fpu); +} +EXPORT_SYMBOL_GPL(fpu_enable_guest_xfd_features); + +#ifdef CONFIG_X86_64 +void fpu_update_guest_xfd(struct fpu_guest *guest_fpu, u64 xfd) +{ + fpregs_lock(); + guest_fpu->fpstate->xfd = xfd; + if (guest_fpu->fpstate->in_use) + xfd_update_state(guest_fpu->fpstate); + fpregs_unlock(); +} +EXPORT_SYMBOL_GPL(fpu_update_guest_xfd); + +/** + * fpu_sync_guest_vmexit_xfd_state - Synchronize XFD MSR and software state + * + * Must be invoked from KVM after a VMEXIT before enabling interrupts when + * XFD write emulation is disabled. This is required because the guest can + * freely modify XFD and the state at VMEXIT is not guaranteed to be the + * same as the state on VMENTER. So software state has to be udpated before + * any operation which depends on it can take place. + * + * Note: It can be invoked unconditionally even when write emulation is + * enabled for the price of a then pointless MSR read. + */ +void fpu_sync_guest_vmexit_xfd_state(void) +{ + struct fpstate *fps = current->thread.fpu.fpstate; + + lockdep_assert_irqs_disabled(); + if (fpu_state_size_dynamic()) { + rdmsrl(MSR_IA32_XFD, fps->xfd); + __this_cpu_write(xfd_state, fps->xfd); + } +} +EXPORT_SYMBOL_GPL(fpu_sync_guest_vmexit_xfd_state); +#endif /* CONFIG_X86_64 */ + int fpu_swap_kvm_fpstate(struct fpu_guest *guest_fpu, bool enter_guest) { struct fpstate *guest_fps = guest_fpu->fpstate; @@ -324,9 +415,6 @@ int fpu_copy_uabi_to_guest_fpstate(struct fpu_guest *gfpu, const void *buf, xpkru = get_xsave_addr(&kstate->regs.xsave, XFEATURE_PKRU); *vpkru = xpkru->pkru; } - - /* Ensure that XCOMP_BV is set up for XSAVES */ - xstate_init_xcomp_bv(&kstate->regs.xsave, kstate->xfeatures); return 0; } EXPORT_SYMBOL_GPL(fpu_copy_uabi_to_guest_fpstate); @@ -430,26 +518,28 @@ void fpstate_init_user(struct fpstate *fpstate) fpstate_init_fstate(fpstate); } -static void __fpstate_reset(struct fpstate *fpstate) +static void __fpstate_reset(struct fpstate *fpstate, u64 xfd) { /* Initialize sizes and feature masks */ fpstate->size = fpu_kernel_cfg.default_size; fpstate->user_size = fpu_user_cfg.default_size; fpstate->xfeatures = fpu_kernel_cfg.default_features; fpstate->user_xfeatures = fpu_user_cfg.default_features; - fpstate->xfd = init_fpstate.xfd; + fpstate->xfd = xfd; } void fpstate_reset(struct fpu *fpu) { /* Set the fpstate pointer to the default fpstate */ fpu->fpstate = &fpu->__fpstate; - __fpstate_reset(fpu->fpstate); + __fpstate_reset(fpu->fpstate, init_fpstate.xfd); /* Initialize the permission related info in fpu */ fpu->perm.__state_perm = fpu_kernel_cfg.default_features; fpu->perm.__state_size = fpu_kernel_cfg.default_size; fpu->perm.__user_state_size = fpu_user_cfg.default_size; + /* Same defaults for guests */ + fpu->guest_perm = fpu->perm; } static inline void fpu_inherit_perms(struct fpu *dst_fpu) @@ -460,12 +550,13 @@ static inline void fpu_inherit_perms(struct fpu *dst_fpu) spin_lock_irq(¤t->sighand->siglock); /* Fork also inherits the permissions of the parent */ dst_fpu->perm = src_fpu->perm; + dst_fpu->guest_perm = src_fpu->guest_perm; spin_unlock_irq(¤t->sighand->siglock); } } /* Clone current's FPU state on fork */ -int fpu_clone(struct task_struct *dst, unsigned long clone_flags) +int fpu_clone(struct task_struct *dst, unsigned long clone_flags, bool minimal) { struct fpu *src_fpu = ¤t->thread.fpu; struct fpu *dst_fpu = &dst->thread.fpu; @@ -488,7 +579,7 @@ int fpu_clone(struct task_struct *dst, unsigned long clone_flags) * No FPU state inheritance for kernel threads and IO * worker threads. */ - if (dst->flags & (PF_KTHREAD | PF_IO_WORKER)) { + if (minimal) { /* Clear out the minimal state */ memcpy(&dst_fpu->fpstate->regs, &init_fpstate.regs, init_fpstate_copy_size()); @@ -518,6 +609,13 @@ int fpu_clone(struct task_struct *dst, unsigned long clone_flags) fpu_inherit_perms(dst_fpu); fpregs_unlock(); + /* + * Children never inherit PASID state. + * Force it to have its init value: + */ + if (use_xsave()) + dst_fpu->fpstate->regs.xsave.header.xfeatures &= ~XFEATURE_MASK_PASID; + trace_x86_fpu_copy_src(src_fpu); trace_x86_fpu_copy_dst(dst_fpu); @@ -753,3 +851,17 @@ int fpu__exception_code(struct fpu *fpu, int trap_nr) */ return 0; } + +/* + * Initialize register state that may prevent from entering low-power idle. + * This function will be invoked from the cpuidle driver only when needed. + */ +void fpu_idle_fpregs(void) +{ + /* Note: AMX_TILE being enabled implies XGETBV1 support */ + if (cpu_feature_enabled(X86_FEATURE_AMX_TILE) && + (xfeatures_in_use() & XFEATURE_MASK_XTILE)) { + tile_release(); + fpregs_deactivate(¤t->thread.fpu); + } +} diff --git a/arch/x86/kernel/fpu/init.c b/arch/x86/kernel/fpu/init.c index 621f4b6cac4a..8946f89761cc 100644 --- a/arch/x86/kernel/fpu/init.c +++ b/arch/x86/kernel/fpu/init.c @@ -210,13 +210,6 @@ static void __init fpu__init_system_xstate_size_legacy(void) fpstate_reset(¤t->thread.fpu); } -static void __init fpu__init_init_fpstate(void) -{ - /* Bring init_fpstate size and features up to date */ - init_fpstate.size = fpu_kernel_cfg.max_size; - init_fpstate.xfeatures = fpu_kernel_cfg.max_features; -} - /* * Called on the boot CPU once per system bootup, to set up the initial * FPU state that is later cloned into all processes: @@ -236,5 +229,4 @@ void __init fpu__init_system(struct cpuinfo_x86 *c) fpu__init_system_xstate_size_legacy(); fpu__init_system_xstate(fpu_kernel_cfg.max_size); fpu__init_task_struct_size(); - fpu__init_init_fpstate(); } diff --git a/arch/x86/kernel/fpu/legacy.h b/arch/x86/kernel/fpu/legacy.h index 17c26b164c63..098f367bb8a7 100644 --- a/arch/x86/kernel/fpu/legacy.h +++ b/arch/x86/kernel/fpu/legacy.h @@ -35,11 +35,7 @@ static inline void ldmxcsr(u32 mxcsr) int err; \ asm volatile("1:" #insn "\n\t" \ "2:\n" \ - ".section .fixup,\"ax\"\n" \ - "3: movl $-1,%[err]\n" \ - " jmp 2b\n" \ - ".previous\n" \ - _ASM_EXTABLE(1b, 3b) \ + _ASM_EXTABLE_TYPE_REG(1b, 2b, EX_TYPE_EFAULT_REG, %[err]) \ : [err] "=r" (err), output \ : "0"(0), input); \ err; \ diff --git a/arch/x86/kernel/fpu/regset.c b/arch/x86/kernel/fpu/regset.c index 437d7c930c0b..75ffaef8c299 100644 --- a/arch/x86/kernel/fpu/regset.c +++ b/arch/x86/kernel/fpu/regset.c @@ -91,11 +91,9 @@ int xfpregs_set(struct task_struct *target, const struct user_regset *regset, const void *kbuf, const void __user *ubuf) { struct fpu *fpu = &target->thread.fpu; - struct user32_fxsr_struct newstate; + struct fxregs_state newstate; int ret; - BUILD_BUG_ON(sizeof(newstate) != sizeof(struct fxregs_state)); - if (!cpu_feature_enabled(X86_FEATURE_FXSR)) return -ENODEV; @@ -116,9 +114,10 @@ int xfpregs_set(struct task_struct *target, const struct user_regset *regset, /* Copy the state */ memcpy(&fpu->fpstate->regs.fxsave, &newstate, sizeof(newstate)); - /* Clear xmm8..15 */ + /* Clear xmm8..15 for 32-bit callers */ BUILD_BUG_ON(sizeof(fpu->__fpstate.regs.fxsave.xmm_space) != 16 * 16); - memset(&fpu->fpstate->regs.fxsave.xmm_space[8], 0, 8 * 16); + if (in_ia32_syscall()) + memset(&fpu->fpstate->regs.fxsave.xmm_space[8*4], 0, 8 * 16); /* Mark FP and SSE as in use when XSAVE is enabled */ if (use_xsave()) diff --git a/arch/x86/kernel/fpu/xstate.c b/arch/x86/kernel/fpu/xstate.c index d28829403ed0..59e543b95a3c 100644 --- a/arch/x86/kernel/fpu/xstate.c +++ b/arch/x86/kernel/fpu/xstate.c @@ -81,10 +81,10 @@ static unsigned int xstate_offsets[XFEATURE_MAX] __ro_after_init = { [ 0 ... XFEATURE_MAX - 1] = -1}; static unsigned int xstate_sizes[XFEATURE_MAX] __ro_after_init = { [ 0 ... XFEATURE_MAX - 1] = -1}; -static unsigned int xstate_comp_offsets[XFEATURE_MAX] __ro_after_init = - { [ 0 ... XFEATURE_MAX - 1] = -1}; -static unsigned int xstate_supervisor_only_offsets[XFEATURE_MAX] __ro_after_init = - { [ 0 ... XFEATURE_MAX - 1] = -1}; +static unsigned int xstate_flags[XFEATURE_MAX] __ro_after_init; + +#define XSTATE_FLAG_SUPERVISOR BIT(0) +#define XSTATE_FLAG_ALIGNED64 BIT(1) /* * Return whether the system supports a given xfeature. @@ -124,17 +124,42 @@ int cpu_has_xfeatures(u64 xfeatures_needed, const char **feature_name) } EXPORT_SYMBOL_GPL(cpu_has_xfeatures); +static bool xfeature_is_aligned64(int xfeature_nr) +{ + return xstate_flags[xfeature_nr] & XSTATE_FLAG_ALIGNED64; +} + static bool xfeature_is_supervisor(int xfeature_nr) { + return xstate_flags[xfeature_nr] & XSTATE_FLAG_SUPERVISOR; +} + +static unsigned int xfeature_get_offset(u64 xcomp_bv, int xfeature) +{ + unsigned int offs, i; + /* - * Extended State Enumeration Sub-leaves (EAX = 0DH, ECX = n, n > 1) - * returns ECX[0] set to (1) for a supervisor state, and cleared (0) - * for a user state. + * Non-compacted format and legacy features use the cached fixed + * offsets. */ - u32 eax, ebx, ecx, edx; + if (!cpu_feature_enabled(X86_FEATURE_XCOMPACTED) || + xfeature <= XFEATURE_SSE) + return xstate_offsets[xfeature]; - cpuid_count(XSTATE_CPUID, xfeature_nr, &eax, &ebx, &ecx, &edx); - return ecx & 1; + /* + * Compacted format offsets depend on the actual content of the + * compacted xsave area which is determined by the xcomp_bv header + * field. + */ + offs = FXSAVE_SIZE + XSAVE_HDR_SIZE; + for_each_extended_xfeature(i, xcomp_bv) { + if (xfeature_is_aligned64(i)) + offs = ALIGN(offs, 64); + if (i == xfeature) + break; + offs += xstate_sizes[i]; + } + return offs; } /* @@ -182,7 +207,7 @@ static bool xfeature_enabled(enum xfeature xfeature) * Record the offsets and sizes of various xstates contained * in the XSAVE state memory layout. */ -static void __init setup_xstate_features(void) +static void __init setup_xstate_cache(void) { u32 eax, ebx, ecx, edx, i; /* start at the beginning of the "extended state" */ @@ -205,6 +230,7 @@ static void __init setup_xstate_features(void) cpuid_count(XSTATE_CPUID, i, &eax, &ebx, &ecx, &edx); xstate_sizes[i] = eax; + xstate_flags[i] = ecx; /* * If an xfeature is supervisor state, the offset in EBX is @@ -264,94 +290,6 @@ static void __init print_xstate_features(void) } while (0) /* - * We could cache this like xstate_size[], but we only use - * it here, so it would be a waste of space. - */ -static int xfeature_is_aligned(int xfeature_nr) -{ - u32 eax, ebx, ecx, edx; - - CHECK_XFEATURE(xfeature_nr); - - if (!xfeature_enabled(xfeature_nr)) { - WARN_ONCE(1, "Checking alignment of disabled xfeature %d\n", - xfeature_nr); - return 0; - } - - cpuid_count(XSTATE_CPUID, xfeature_nr, &eax, &ebx, &ecx, &edx); - /* - * The value returned by ECX[1] indicates the alignment - * of state component 'i' when the compacted format - * of the extended region of an XSAVE area is used: - */ - return !!(ecx & 2); -} - -/* - * This function sets up offsets and sizes of all extended states in - * xsave area. This supports both standard format and compacted format - * of the xsave area. - */ -static void __init setup_xstate_comp_offsets(void) -{ - unsigned int next_offset; - int i; - - /* - * The FP xstates and SSE xstates are legacy states. They are always - * in the fixed offsets in the xsave area in either compacted form - * or standard form. - */ - xstate_comp_offsets[XFEATURE_FP] = 0; - xstate_comp_offsets[XFEATURE_SSE] = offsetof(struct fxregs_state, - xmm_space); - - if (!cpu_feature_enabled(X86_FEATURE_XSAVES)) { - for_each_extended_xfeature(i, fpu_kernel_cfg.max_features) - xstate_comp_offsets[i] = xstate_offsets[i]; - return; - } - - next_offset = FXSAVE_SIZE + XSAVE_HDR_SIZE; - - for_each_extended_xfeature(i, fpu_kernel_cfg.max_features) { - if (xfeature_is_aligned(i)) - next_offset = ALIGN(next_offset, 64); - - xstate_comp_offsets[i] = next_offset; - next_offset += xstate_sizes[i]; - } -} - -/* - * Setup offsets of a supervisor-state-only XSAVES buffer: - * - * The offsets stored in xstate_comp_offsets[] only work for one specific - * value of the Requested Feature BitMap (RFBM). In cases where a different - * RFBM value is used, a different set of offsets is required. This set of - * offsets is for when RFBM=xfeatures_mask_supervisor(). - */ -static void __init setup_supervisor_only_offsets(void) -{ - unsigned int next_offset; - int i; - - next_offset = FXSAVE_SIZE + XSAVE_HDR_SIZE; - - for_each_extended_xfeature(i, fpu_kernel_cfg.max_features) { - if (!xfeature_is_supervisor(i)) - continue; - - if (xfeature_is_aligned(i)) - next_offset = ALIGN(next_offset, 64); - - xstate_supervisor_only_offsets[i] = next_offset; - next_offset += xstate_sizes[i]; - } -} - -/* * Print out xstate component offsets and sizes */ static void __init print_xstate_offset_size(void) @@ -360,7 +298,8 @@ static void __init print_xstate_offset_size(void) for_each_extended_xfeature(i, fpu_kernel_cfg.max_features) { pr_info("x86/fpu: xstate_offset[%d]: %4d, xstate_sizes[%d]: %4d\n", - i, xstate_comp_offsets[i], i, xstate_sizes[i]); + i, xfeature_get_offset(fpu_kernel_cfg.max_features, i), + i, xstate_sizes[i]); } } @@ -419,10 +358,9 @@ static void __init setup_init_fpu_buf(void) if (!boot_cpu_has(X86_FEATURE_XSAVE)) return; - setup_xstate_features(); print_xstate_features(); - xstate_init_xcomp_bv(&init_fpstate.regs.xsave, fpu_kernel_cfg.max_features); + xstate_init_xcomp_bv(&init_fpstate.regs.xsave, init_fpstate.xfeatures); /* * Init all the features state with header.xfeatures being 0x0 @@ -432,12 +370,12 @@ static void __init setup_init_fpu_buf(void) /* * All components are now in init state. Read the state back so * that init_fpstate contains all non-zero init state. This only - * works with XSAVE, but not with XSAVEOPT and XSAVES because + * works with XSAVE, but not with XSAVEOPT and XSAVEC/S because * those use the init optimization which skips writing data for * components in init state. * * XSAVE could be used, but that would require to reshuffle the - * data when XSAVES is available because XSAVES uses xstate + * data when XSAVEC/S is available because XSAVEC/S uses xstate * compaction. But doing so is a pointless exercise because most * components have an all zeros init state except for the legacy * ones (FP and SSE). Those can be saved with FXSAVE into the @@ -448,25 +386,6 @@ static void __init setup_init_fpu_buf(void) fxsave(&init_fpstate.regs.fxsave); } -static int xfeature_uncompacted_offset(int xfeature_nr) -{ - u32 eax, ebx, ecx, edx; - - /* - * Only XSAVES supports supervisor states and it uses compacted - * format. Checking a supervisor state's uncompacted offset is - * an error. - */ - if (XFEATURE_MASK_SUPERVISOR_ALL & BIT_ULL(xfeature_nr)) { - WARN_ONCE(1, "No fixed offset for xstate %d\n", xfeature_nr); - return -1; - } - - CHECK_XFEATURE(xfeature_nr); - cpuid_count(XSTATE_CPUID, xfeature_nr, &eax, &ebx, &ecx, &edx); - return ebx; -} - int xfeature_size(int xfeature_nr) { u32 eax, ebx, ecx, edx; @@ -644,29 +563,15 @@ static bool __init check_xstate_against_struct(int nr) static unsigned int xstate_calculate_size(u64 xfeatures, bool compacted) { - unsigned int size = FXSAVE_SIZE + XSAVE_HDR_SIZE; - int i; + unsigned int topmost = fls64(xfeatures) - 1; + unsigned int offset = xstate_offsets[topmost]; - for_each_extended_xfeature(i, xfeatures) { - /* Align from the end of the previous feature */ - if (xfeature_is_aligned(i)) - size = ALIGN(size, 64); - /* - * In compacted format the enabled features are packed, - * i.e. disabled features do not occupy space. - * - * In non-compacted format the offsets are fixed and - * disabled states still occupy space in the memory buffer. - */ - if (!compacted) - size = xfeature_uncompacted_offset(i); - /* - * Add the feature size even for non-compacted format - * to make the end result correct - */ - size += xfeature_size(i); - } - return size; + if (topmost <= XFEATURE_SSE) + return sizeof(struct xregs_state); + + if (compacted) + offset = xfeature_get_offset(xfeatures, topmost); + return offset + xstate_sizes[topmost]; } /* @@ -680,7 +585,8 @@ static unsigned int xstate_calculate_size(u64 xfeatures, bool compacted) */ static bool __init paranoid_xstate_size_valid(unsigned int kernel_size) { - bool compacted = cpu_feature_enabled(X86_FEATURE_XSAVES); + bool compacted = cpu_feature_enabled(X86_FEATURE_XCOMPACTED); + bool xsaves = cpu_feature_enabled(X86_FEATURE_XSAVES); unsigned int size = FXSAVE_SIZE + XSAVE_HDR_SIZE; int i; @@ -691,7 +597,7 @@ static bool __init paranoid_xstate_size_valid(unsigned int kernel_size) * Supervisor state components can be managed only by * XSAVES. */ - if (!compacted && xfeature_is_supervisor(i)) { + if (!xsaves && xfeature_is_supervisor(i)) { XSTATE_WARN_ON(1); return false; } @@ -708,8 +614,11 @@ static bool __init paranoid_xstate_size_valid(unsigned int kernel_size) * the size of the *user* states. If we use it to size a buffer * that we use 'XSAVES' on, we could potentially overflow the * buffer because 'XSAVES' saves system states too. + * + * This also takes compaction into account. So this works for + * XSAVEC as well. */ -static unsigned int __init get_xsaves_size(void) +static unsigned int __init get_compacted_size(void) { unsigned int eax, ebx, ecx, edx; /* @@ -719,6 +628,10 @@ static unsigned int __init get_xsaves_size(void) * containing all the state components * corresponding to bits currently set in * XCR0 | IA32_XSS. + * + * When XSAVES is not available but XSAVEC is (virt), then there + * are no supervisor states, but XSAVEC still uses compacted + * format. */ cpuid_count(XSTATE_CPUID, 1, &eax, &ebx, &ecx, &edx); return ebx; @@ -728,13 +641,13 @@ static unsigned int __init get_xsaves_size(void) * Get the total size of the enabled xstates without the independent supervisor * features. */ -static unsigned int __init get_xsaves_size_no_independent(void) +static unsigned int __init get_xsave_compacted_size(void) { u64 mask = xfeatures_mask_independent(); unsigned int size; if (!mask) - return get_xsaves_size(); + return get_compacted_size(); /* Disable independent features. */ wrmsrl(MSR_IA32_XSS, xfeatures_mask_supervisor()); @@ -743,7 +656,7 @@ static unsigned int __init get_xsaves_size_no_independent(void) * Ask the hardware what size is required of the buffer. * This is the size required for the task->fpu buffer. */ - size = get_xsaves_size(); + size = get_compacted_size(); /* Re-enable independent features so XSAVES will work on them again. */ wrmsrl(MSR_IA32_XSS, xfeatures_mask_supervisor() | mask); @@ -765,48 +678,31 @@ static unsigned int __init get_xsave_size_user(void) return ebx; } -/* - * Will the runtime-enumerated 'xstate_size' fit in the init - * task's statically-allocated buffer? - */ -static bool __init is_supported_xstate_size(unsigned int test_xstate_size) -{ - if (test_xstate_size <= sizeof(init_fpstate.regs)) - return true; - - pr_warn("x86/fpu: xstate buffer too small (%zu < %d), disabling xsave\n", - sizeof(init_fpstate.regs), test_xstate_size); - return false; -} - static int __init init_xstate_size(void) { /* Recompute the context size for enabled features: */ unsigned int user_size, kernel_size, kernel_default_size; - bool compacted = cpu_feature_enabled(X86_FEATURE_XSAVES); + bool compacted = cpu_feature_enabled(X86_FEATURE_XCOMPACTED); /* Uncompacted user space size */ user_size = get_xsave_size_user(); /* - * XSAVES kernel size includes supervisor states and - * uses compacted format when available. + * XSAVES kernel size includes supervisor states and uses compacted + * format. XSAVEC uses compacted format, but does not save + * supervisor states. * - * XSAVE does not support supervisor states so - * kernel and user size is identical. + * XSAVE[OPT] do not support supervisor states so kernel and user + * size is identical. */ if (compacted) - kernel_size = get_xsaves_size_no_independent(); + kernel_size = get_xsave_compacted_size(); else kernel_size = user_size; kernel_default_size = xstate_calculate_size(fpu_kernel_cfg.default_features, compacted); - /* Ensure we have the space to store all default enabled features. */ - if (!is_supported_xstate_size(kernel_default_size)) - return -EINVAL; - if (!paranoid_xstate_size_valid(kernel_size)) return -EINVAL; @@ -909,8 +805,11 @@ void __init fpu__init_system_xstate(unsigned int legacy_size) if (!cpu_feature_enabled(X86_FEATURE_XFD)) fpu_kernel_cfg.max_features &= ~XFEATURE_MASK_USER_DYNAMIC; - fpu_kernel_cfg.max_features &= XFEATURE_MASK_USER_SUPPORTED | - XFEATURE_MASK_SUPERVISOR_SUPPORTED; + if (!cpu_feature_enabled(X86_FEATURE_XSAVES)) + fpu_kernel_cfg.max_features &= XFEATURE_MASK_USER_SUPPORTED; + else + fpu_kernel_cfg.max_features &= XFEATURE_MASK_USER_SUPPORTED | + XFEATURE_MASK_SUPERVISOR_SUPPORTED; fpu_user_cfg.max_features = fpu_kernel_cfg.max_features; fpu_user_cfg.max_features &= XFEATURE_MASK_USER_SUPPORTED; @@ -933,8 +832,17 @@ void __init fpu__init_system_xstate(unsigned int legacy_size) */ init_fpstate.xfd = fpu_user_cfg.max_features & XFEATURE_MASK_USER_DYNAMIC; + /* Set up compaction feature bit */ + if (cpu_feature_enabled(X86_FEATURE_XSAVEC) || + cpu_feature_enabled(X86_FEATURE_XSAVES)) + setup_force_cpu_cap(X86_FEATURE_XCOMPACTED); + /* Enable xstate instructions to be able to continue with initialization: */ fpu__init_cpu_xstate(); + + /* Cache size, offset and flags for initialization */ + setup_xstate_cache(); + err = init_xstate_size(); if (err) goto out_disable; @@ -949,9 +857,20 @@ void __init fpu__init_system_xstate(unsigned int legacy_size) update_regset_xstate_info(fpu_user_cfg.max_size, fpu_user_cfg.max_features); + /* + * init_fpstate excludes dynamic states as they are large but init + * state is zero. + */ + init_fpstate.size = fpu_kernel_cfg.default_size; + init_fpstate.xfeatures = fpu_kernel_cfg.default_features; + + if (init_fpstate.size > sizeof(init_fpstate.regs)) { + pr_warn("x86/fpu: init_fpstate buffer too small (%zu < %d), disabling XSAVE\n", + sizeof(init_fpstate.regs), init_fpstate.size); + goto out_disable; + } + setup_init_fpu_buf(); - setup_xstate_comp_offsets(); - setup_supervisor_only_offsets(); /* * Paranoia check whether something in the setup modified the @@ -967,7 +886,7 @@ void __init fpu__init_system_xstate(unsigned int legacy_size) pr_info("x86/fpu: Enabled xstate features 0x%llx, context size is %d bytes, using '%s' format.\n", fpu_kernel_cfg.max_features, fpu_kernel_cfg.max_size, - boot_cpu_has(X86_FEATURE_XSAVES) ? "compacted" : "standard"); + boot_cpu_has(X86_FEATURE_XCOMPACTED) ? "compacted" : "standard"); return; out_disable: @@ -1006,13 +925,19 @@ void fpu__resume_cpu(void) */ static void *__raw_xsave_addr(struct xregs_state *xsave, int xfeature_nr) { - if (!xfeature_enabled(xfeature_nr)) { - WARN_ON_FPU(1); + u64 xcomp_bv = xsave->header.xcomp_bv; + + if (WARN_ON_ONCE(!xfeature_enabled(xfeature_nr))) return NULL; + + if (cpu_feature_enabled(X86_FEATURE_XCOMPACTED)) { + if (WARN_ON_ONCE(!(xcomp_bv & BIT_ULL(xfeature_nr)))) + return NULL; } - return (void *)xsave + xstate_comp_offsets[xfeature_nr]; + return (void *)xsave + xfeature_get_offset(xcomp_bv, xfeature_nr); } + /* * Given the xsave area and a state inside, this function returns the * address of the state. @@ -1043,8 +968,9 @@ void *get_xsave_addr(struct xregs_state *xsave, int xfeature_nr) * We should not ever be requesting features that we * have not enabled. */ - WARN_ONCE(!(fpu_kernel_cfg.max_features & BIT_ULL(xfeature_nr)), - "get of unsupported state"); + if (WARN_ON_ONCE(!xfeature_enabled(xfeature_nr))) + return NULL; + /* * This assumes the last 'xsave*' instruction to * have requested that 'xfeature_nr' be saved. @@ -1199,6 +1125,15 @@ void __copy_xstate_to_uabi_buf(struct membuf to, struct fpstate *fpstate, */ mask = fpstate->user_xfeatures; + /* + * Dynamic features are not present in init_fpstate. When they are + * in an all zeros init state, remove those from 'mask' to zero + * those features in the user buffer instead of retrieving them + * from init_fpstate. + */ + if (fpu_state_size_dynamic()) + mask &= (header.xfeatures | xinit->header.xcomp_bv); + for_each_extended_xfeature(i, mask) { /* * If there was a feature or alignment gap, zero the space @@ -1302,7 +1237,7 @@ static int copy_uabi_to_xstate(struct fpstate *fpstate, const void *kbuf, } for (i = 0; i < XFEATURE_MAX; i++) { - u64 mask = ((u64)1 << i); + mask = BIT_ULL(i); if (hdr.xfeatures & mask) { void *dst = __raw_xsave_addr(xsave, i); @@ -1500,35 +1435,13 @@ void fpstate_free(struct fpu *fpu) } /** - * fpu_install_fpstate - Update the active fpstate in the FPU - * - * @fpu: A struct fpu * pointer - * @newfps: A struct fpstate * pointer - * - * Returns: A null pointer if the last active fpstate is the embedded - * one or the new fpstate is already installed; - * otherwise, a pointer to the old fpstate which has to - * be freed by the caller. - */ -static struct fpstate *fpu_install_fpstate(struct fpu *fpu, - struct fpstate *newfps) -{ - struct fpstate *oldfps = fpu->fpstate; - - if (fpu->fpstate == newfps) - return NULL; - - fpu->fpstate = newfps; - return oldfps != &fpu->__fpstate ? oldfps : NULL; -} - -/** * fpstate_realloc - Reallocate struct fpstate for the requested new features * * @xfeatures: A bitmap of xstate features which extend the enabled features * of that task * @ksize: The required size for the kernel buffer * @usize: The required size for user space buffers + * @guest_fpu: Pointer to a guest FPU container. NULL for host allocations * * Note vs. vmalloc(): If the task with a vzalloc()-allocated buffer * terminates quickly, vfree()-induced IPIs may be a concern, but tasks @@ -1537,13 +1450,13 @@ static struct fpstate *fpu_install_fpstate(struct fpu *fpu, * Returns: 0 on success, -ENOMEM on allocation error. */ static int fpstate_realloc(u64 xfeatures, unsigned int ksize, - unsigned int usize) + unsigned int usize, struct fpu_guest *guest_fpu) { struct fpu *fpu = ¤t->thread.fpu; struct fpstate *curfps, *newfps = NULL; unsigned int fpsize; + bool in_use; - curfps = fpu->fpstate; fpsize = ksize + ALIGN(offsetof(struct fpstate, regs), 64); newfps = vzalloc(fpsize); @@ -1553,28 +1466,59 @@ static int fpstate_realloc(u64 xfeatures, unsigned int ksize, newfps->user_size = usize; newfps->is_valloc = true; + /* + * When a guest FPU is supplied, use @guest_fpu->fpstate + * as reference independent whether it is in use or not. + */ + curfps = guest_fpu ? guest_fpu->fpstate : fpu->fpstate; + + /* Determine whether @curfps is the active fpstate */ + in_use = fpu->fpstate == curfps; + + if (guest_fpu) { + newfps->is_guest = true; + newfps->is_confidential = curfps->is_confidential; + newfps->in_use = curfps->in_use; + guest_fpu->xfeatures |= xfeatures; + guest_fpu->uabi_size = usize; + } + fpregs_lock(); /* - * Ensure that the current state is in the registers before - * swapping fpstate as that might invalidate it due to layout - * changes. + * If @curfps is in use, ensure that the current state is in the + * registers before swapping fpstate as that might invalidate it + * due to layout changes. */ - if (test_thread_flag(TIF_NEED_FPU_LOAD)) + if (in_use && test_thread_flag(TIF_NEED_FPU_LOAD)) fpregs_restore_userregs(); newfps->xfeatures = curfps->xfeatures | xfeatures; - newfps->user_xfeatures = curfps->user_xfeatures | xfeatures; - newfps->xfd = curfps->xfd & ~xfeatures; - curfps = fpu_install_fpstate(fpu, newfps); + if (!guest_fpu) + newfps->user_xfeatures = curfps->user_xfeatures | xfeatures; + + newfps->xfd = curfps->xfd & ~xfeatures; /* Do the final updates within the locked region */ xstate_init_xcomp_bv(&newfps->regs.xsave, newfps->xfeatures); - xfd_update_state(newfps); + if (guest_fpu) { + guest_fpu->fpstate = newfps; + /* If curfps is active, update the FPU fpstate pointer */ + if (in_use) + fpu->fpstate = newfps; + } else { + fpu->fpstate = newfps; + } + + if (in_use) + xfd_update_state(fpu->fpstate); fpregs_unlock(); - vfree(curfps); + /* Only free valloc'ed state */ + if (curfps && curfps->is_valloc) + vfree(curfps); + return 0; } @@ -1595,7 +1539,7 @@ static int validate_sigaltstack(unsigned int usize) return 0; } -static int __xstate_request_perm(u64 permitted, u64 requested) +static int __xstate_request_perm(u64 permitted, u64 requested, bool guest) { /* * This deliberately does not exclude !XSAVES as we still might @@ -1603,11 +1547,12 @@ static int __xstate_request_perm(u64 permitted, u64 requested) * vendors into extending XFD for the pre AMX states, especially * AVX512. */ - bool compacted = cpu_feature_enabled(X86_FEATURE_XSAVES); + bool compacted = cpu_feature_enabled(X86_FEATURE_XCOMPACTED); struct fpu *fpu = ¤t->group_leader->thread.fpu; + struct fpu_state_perm *perm; unsigned int ksize, usize; u64 mask; - int ret; + int ret = 0; /* Check whether fully enabled */ if ((permitted & requested) == requested) @@ -1615,21 +1560,27 @@ static int __xstate_request_perm(u64 permitted, u64 requested) /* Calculate the resulting kernel state size */ mask = permitted | requested; + /* Take supervisor states into account on the host */ + if (!guest) + mask |= xfeatures_mask_supervisor(); ksize = xstate_calculate_size(mask, compacted); /* Calculate the resulting user state size */ mask &= XFEATURE_MASK_USER_SUPPORTED; usize = xstate_calculate_size(mask, false); - ret = validate_sigaltstack(usize); - if (ret) - return ret; + if (!guest) { + ret = validate_sigaltstack(usize); + if (ret) + return ret; + } + perm = guest ? &fpu->guest_perm : &fpu->perm; /* Pairs with the READ_ONCE() in xstate_get_group_perm() */ - WRITE_ONCE(fpu->perm.__state_perm, requested); + WRITE_ONCE(perm->__state_perm, mask); /* Protected by sighand lock */ - fpu->perm.__state_size = ksize; - fpu->perm.__user_state_size = usize; + perm->__state_size = ksize; + perm->__user_state_size = usize; return ret; } @@ -1640,7 +1591,7 @@ static const u64 xstate_prctl_req[XFEATURE_MAX] = { [XFEATURE_XTILE_DATA] = XFEATURE_MASK_XTILE_DATA, }; -static int xstate_request_perm(unsigned long idx) +static int xstate_request_perm(unsigned long idx, bool guest) { u64 permitted, requested; int ret; @@ -1661,26 +1612,33 @@ static int xstate_request_perm(unsigned long idx) return -EOPNOTSUPP; /* Lockless quick check */ - permitted = xstate_get_host_group_perm(); + permitted = xstate_get_group_perm(guest); if ((permitted & requested) == requested) return 0; /* Protect against concurrent modifications */ spin_lock_irq(¤t->sighand->siglock); - permitted = xstate_get_host_group_perm(); - ret = __xstate_request_perm(permitted, requested); + permitted = xstate_get_group_perm(guest); + + /* First vCPU allocation locks the permissions. */ + if (guest && (permitted & FPU_GUEST_PERM_LOCKED)) + ret = -EBUSY; + else + ret = __xstate_request_perm(permitted, requested, guest); spin_unlock_irq(¤t->sighand->siglock); return ret; } -int xfd_enable_feature(u64 xfd_err) +int __xfd_enable_feature(u64 xfd_err, struct fpu_guest *guest_fpu) { u64 xfd_event = xfd_err & XFEATURE_MASK_USER_DYNAMIC; + struct fpu_state_perm *perm; unsigned int ksize, usize; struct fpu *fpu; if (!xfd_event) { - pr_err_once("XFD: Invalid xfd error: %016llx\n", xfd_err); + if (!guest_fpu) + pr_err_once("XFD: Invalid xfd error: %016llx\n", xfd_err); return 0; } @@ -1688,14 +1646,16 @@ int xfd_enable_feature(u64 xfd_err) spin_lock_irq(¤t->sighand->siglock); /* If not permitted let it die */ - if ((xstate_get_host_group_perm() & xfd_event) != xfd_event) { + if ((xstate_get_group_perm(!!guest_fpu) & xfd_event) != xfd_event) { spin_unlock_irq(¤t->sighand->siglock); return -EPERM; } fpu = ¤t->group_leader->thread.fpu; - ksize = fpu->perm.__state_size; - usize = fpu->perm.__user_state_size; + perm = guest_fpu ? &fpu->guest_perm : &fpu->perm; + ksize = perm->__state_size; + usize = perm->__user_state_size; + /* * The feature is permitted. State size is sufficient. Dropping * the lock is safe here even if more features are added from @@ -1708,17 +1668,29 @@ int xfd_enable_feature(u64 xfd_err) * Try to allocate a new fpstate. If that fails there is no way * out. */ - if (fpstate_realloc(xfd_event, ksize, usize)) + if (fpstate_realloc(xfd_event, ksize, usize, guest_fpu)) return -EFAULT; return 0; } + +int xfd_enable_feature(u64 xfd_err) +{ + return __xfd_enable_feature(xfd_err, NULL); +} + #else /* CONFIG_X86_64 */ -static inline int xstate_request_perm(unsigned long idx) +static inline int xstate_request_perm(unsigned long idx, bool guest) { return -EPERM; } #endif /* !CONFIG_X86_64 */ +u64 xstate_get_guest_group_perm(void) +{ + return xstate_get_group_perm(true); +} +EXPORT_SYMBOL_GPL(xstate_get_guest_group_perm); + /** * fpu_xstate_prctl - xstate permission operations * @tsk: Redundant pointer to current @@ -1737,14 +1709,12 @@ static inline int xstate_request_perm(unsigned long idx) * e.g. for AMX which requires XFEATURE_XTILE_CFG(17) and * XFEATURE_XTILE_DATA(18) this would be XFEATURE_XTILE_DATA(18). */ -long fpu_xstate_prctl(struct task_struct *tsk, int option, unsigned long arg2) +long fpu_xstate_prctl(int option, unsigned long arg2) { u64 __user *uptr = (u64 __user *)arg2; u64 permitted, supported; unsigned long idx = arg2; - - if (tsk != current) - return -EPERM; + bool guest = false; switch (option) { case ARCH_GET_XCOMP_SUPP: @@ -1760,11 +1730,20 @@ long fpu_xstate_prctl(struct task_struct *tsk, int option, unsigned long arg2) permitted &= XFEATURE_MASK_USER_SUPPORTED; return put_user(permitted, uptr); + case ARCH_GET_XCOMP_GUEST_PERM: + permitted = xstate_get_guest_group_perm(); + permitted &= XFEATURE_MASK_USER_SUPPORTED; + return put_user(permitted, uptr); + + case ARCH_REQ_XCOMP_GUEST_PERM: + guest = true; + fallthrough; + case ARCH_REQ_XCOMP_PERM: if (!IS_ENABLED(CONFIG_X86_64)) return -EOPNOTSUPP; - return xstate_request_perm(idx); + return xstate_request_perm(idx, guest); default: return -EINVAL; diff --git a/arch/x86/kernel/fpu/xstate.h b/arch/x86/kernel/fpu/xstate.h index 86ea7c0fa2f6..5ad47031383b 100644 --- a/arch/x86/kernel/fpu/xstate.h +++ b/arch/x86/kernel/fpu/xstate.h @@ -16,14 +16,23 @@ static inline void xstate_init_xcomp_bv(struct xregs_state *xsave, u64 mask) * XRSTORS requires these bits set in xcomp_bv, or it will * trigger #GP: */ - if (cpu_feature_enabled(X86_FEATURE_XSAVES)) + if (cpu_feature_enabled(X86_FEATURE_XCOMPACTED)) xsave->header.xcomp_bv = mask | XCOMP_BV_COMPACTED_FORMAT; } -static inline u64 xstate_get_host_group_perm(void) +static inline u64 xstate_get_group_perm(bool guest) { + struct fpu *fpu = ¤t->group_leader->thread.fpu; + struct fpu_state_perm *perm; + /* Pairs with WRITE_ONCE() in xstate_request_perm() */ - return READ_ONCE(current->group_leader->thread.fpu.perm.__state_perm); + perm = guest ? &fpu->guest_perm : &fpu->perm; + return READ_ONCE(perm->__state_perm); +} + +static inline u64 xstate_get_host_group_perm(void) +{ + return xstate_get_group_perm(false); } enum xstate_copy_mode { @@ -70,6 +79,7 @@ static inline u64 xfeatures_mask_independent(void) /* These macros all use (%edi)/(%rdi) as the single memory argument. */ #define XSAVE ".byte " REX_PREFIX "0x0f,0xae,0x27" #define XSAVEOPT ".byte " REX_PREFIX "0x0f,0xae,0x37" +#define XSAVEC ".byte " REX_PREFIX "0x0f,0xc7,0x27" #define XSAVES ".byte " REX_PREFIX "0x0f,0xc7,0x2f" #define XRSTOR ".byte " REX_PREFIX "0x0f,0xae,0x2f" #define XRSTORS ".byte " REX_PREFIX "0x0f,0xc7,0x1f" @@ -88,9 +98,11 @@ static inline u64 xfeatures_mask_independent(void) : "memory") /* - * If XSAVES is enabled, it replaces XSAVEOPT because it supports a compact - * format and supervisor states in addition to modified optimization in - * XSAVEOPT. + * If XSAVES is enabled, it replaces XSAVEC because it supports supervisor + * states in addition to XSAVEC. + * + * Otherwise if XSAVEC is enabled, it replaces XSAVEOPT because it supports + * compacted storage format in addition to XSAVEOPT. * * Otherwise, if XSAVEOPT is enabled, XSAVEOPT replaces XSAVE because XSAVEOPT * supports modified optimization which is not supported by XSAVE. @@ -102,17 +114,14 @@ static inline u64 xfeatures_mask_independent(void) * address of the instruction where we might get an exception at. */ #define XSTATE_XSAVE(st, lmask, hmask, err) \ - asm volatile(ALTERNATIVE_2(XSAVE, \ + asm volatile(ALTERNATIVE_3(XSAVE, \ XSAVEOPT, X86_FEATURE_XSAVEOPT, \ + XSAVEC, X86_FEATURE_XSAVEC, \ XSAVES, X86_FEATURE_XSAVES) \ "\n" \ "xor %[err], %[err]\n" \ "3:\n" \ - ".pushsection .fixup,\"ax\"\n" \ - "4: movl $-2, %[err]\n" \ - "jmp 3b\n" \ - ".popsection\n" \ - _ASM_EXTABLE(661b, 4b) \ + _ASM_EXTABLE_TYPE_REG(661b, 3b, EX_TYPE_EFAULT_REG, %[err]) \ : [err] "=r" (err) \ : "D" (st), "m" (*st), "a" (lmask), "d" (hmask) \ : "memory") @@ -149,8 +158,14 @@ static inline void xfd_update_state(struct fpstate *fpstate) } } } + +extern int __xfd_enable_feature(u64 which, struct fpu_guest *guest_fpu); #else static inline void xfd_update_state(struct fpstate *fpstate) { } + +static inline int __xfd_enable_feature(u64 which, struct fpu_guest *guest_fpu) { + return -EPERM; +} #endif /* diff --git a/arch/x86/kernel/ftrace.c b/arch/x86/kernel/ftrace.c index c39f906cdc4e..bd165004776d 100644 --- a/arch/x86/kernel/ftrace.c +++ b/arch/x86/kernel/ftrace.c @@ -37,7 +37,7 @@ static int ftrace_poke_late = 0; -int ftrace_arch_code_modify_prepare(void) +void ftrace_arch_code_modify_prepare(void) __acquires(&text_mutex) { /* @@ -47,10 +47,9 @@ int ftrace_arch_code_modify_prepare(void) */ mutex_lock(&text_mutex); ftrace_poke_late = 1; - return 0; } -int ftrace_arch_code_modify_post_process(void) +void ftrace_arch_code_modify_post_process(void) __releases(&text_mutex) { /* @@ -61,7 +60,6 @@ int ftrace_arch_code_modify_post_process(void) text_poke_finish(); ftrace_poke_late = 0; mutex_unlock(&text_mutex); - return 0; } static const char *ftrace_nop_replace(void) @@ -93,6 +91,7 @@ static int ftrace_verify_code(unsigned long ip, const char *old_code) /* Make sure it is what we expect it to be */ if (memcmp(cur_code, old_code, MCOUNT_INSN_SIZE) != 0) { + ftrace_expected = old_code; WARN_ON(1); return -EINVAL; } @@ -303,7 +302,7 @@ union ftrace_op_code_union { } __attribute__((packed)); }; -#define RET_SIZE 1 +#define RET_SIZE (IS_ENABLED(CONFIG_RETPOLINE) ? 5 : 1 + IS_ENABLED(CONFIG_SLS)) static unsigned long create_trampoline(struct ftrace_ops *ops, unsigned int *tramp_size) @@ -316,12 +315,12 @@ create_trampoline(struct ftrace_ops *ops, unsigned int *tramp_size) unsigned long offset; unsigned long npages; unsigned long size; - unsigned long retq; unsigned long *ptr; void *trampoline; void *ip; /* 48 8b 15 <offset> is movq <offset>(%rip), %rdx */ unsigned const char op_ref[] = { 0x48, 0x8b, 0x15 }; + unsigned const char retq[] = { RET_INSN_OPCODE, INT3_INSN_OPCODE }; union ftrace_op_code_union op_ptr; int ret; @@ -359,12 +358,10 @@ create_trampoline(struct ftrace_ops *ops, unsigned int *tramp_size) goto fail; ip = trampoline + size; - - /* The trampoline ends with ret(q) */ - retq = (unsigned long)ftrace_stub; - ret = copy_from_kernel_nofault(ip, (void *)retq, RET_SIZE); - if (WARN_ON(ret < 0)) - goto fail; + if (cpu_feature_enabled(X86_FEATURE_RETHUNK)) + __text_gen_insn(ip, JMP32_INSN_OPCODE, ip, &__x86_return_thunk, JMP32_INSN_SIZE); + else + memcpy(ip, retq, sizeof(retq)); /* No need to test direct calls on created trampolines */ if (ops->flags & FTRACE_OPS_FL_SAVE_REGS) { @@ -584,9 +581,7 @@ void arch_ftrace_trampoline_free(struct ftrace_ops *ops) #ifdef CONFIG_FUNCTION_GRAPH_TRACER -#ifdef CONFIG_DYNAMIC_FTRACE - -#ifndef CONFIG_HAVE_DYNAMIC_FTRACE_WITH_ARGS +#if defined(CONFIG_DYNAMIC_FTRACE) && !defined(CONFIG_HAVE_DYNAMIC_FTRACE_WITH_ARGS) extern void ftrace_graph_call(void); static const char *ftrace_jmp_replace(unsigned long ip, unsigned long addr) { @@ -615,18 +610,7 @@ int ftrace_disable_ftrace_graph_caller(void) return ftrace_mod_jmp(ip, &ftrace_stub); } -#else /* !CONFIG_HAVE_DYNAMIC_FTRACE_WITH_ARGS */ -int ftrace_enable_ftrace_graph_caller(void) -{ - return 0; -} - -int ftrace_disable_ftrace_graph_caller(void) -{ - return 0; -} -#endif /* CONFIG_HAVE_DYNAMIC_FTRACE_WITH_ARGS */ -#endif /* !CONFIG_DYNAMIC_FTRACE */ +#endif /* CONFIG_DYNAMIC_FTRACE && !CONFIG_HAVE_DYNAMIC_FTRACE_WITH_ARGS */ /* * Hook the return address and push it in the stack of return addrs diff --git a/arch/x86/kernel/ftrace_32.S b/arch/x86/kernel/ftrace_32.S index e405fe1a8bf4..a0ed0e4a2c0c 100644 --- a/arch/x86/kernel/ftrace_32.S +++ b/arch/x86/kernel/ftrace_32.S @@ -19,7 +19,7 @@ #endif SYM_FUNC_START(__fentry__) - ret + RET SYM_FUNC_END(__fentry__) EXPORT_SYMBOL(__fentry__) @@ -84,7 +84,7 @@ ftrace_graph_call: /* This is weak to keep gas from relaxing the jumps */ SYM_INNER_LABEL_ALIGN(ftrace_stub, SYM_L_WEAK) - ret + RET SYM_CODE_END(ftrace_caller) SYM_CODE_START(ftrace_regs_caller) @@ -177,7 +177,7 @@ SYM_CODE_START(ftrace_graph_caller) popl %edx popl %ecx popl %eax - ret + RET SYM_CODE_END(ftrace_graph_caller) .globl return_to_handler diff --git a/arch/x86/kernel/ftrace_64.S b/arch/x86/kernel/ftrace_64.S index 7a879901f103..2a4be92fd144 100644 --- a/arch/x86/kernel/ftrace_64.S +++ b/arch/x86/kernel/ftrace_64.S @@ -4,6 +4,7 @@ */ #include <linux/linkage.h> +#include <linux/cfi_types.h> #include <asm/ptrace.h> #include <asm/ftrace.h> #include <asm/export.h> @@ -129,10 +130,18 @@ .endm +SYM_TYPED_FUNC_START(ftrace_stub) + RET +SYM_FUNC_END(ftrace_stub) + +SYM_TYPED_FUNC_START(ftrace_stub_graph) + RET +SYM_FUNC_END(ftrace_stub_graph) + #ifdef CONFIG_DYNAMIC_FTRACE SYM_FUNC_START(__fentry__) - retq + RET SYM_FUNC_END(__fentry__) EXPORT_SYMBOL(__fentry__) @@ -145,6 +154,7 @@ SYM_FUNC_START(ftrace_caller) movq %rcx, RSP(%rsp) SYM_INNER_LABEL(ftrace_caller_op_ptr, SYM_L_GLOBAL) + ANNOTATE_NOENDBR /* Load the ftrace_ops into the 3rd parameter */ movq function_trace_op(%rip), %rdx @@ -155,6 +165,7 @@ SYM_INNER_LABEL(ftrace_caller_op_ptr, SYM_L_GLOBAL) movq $0, CS(%rsp) SYM_INNER_LABEL(ftrace_call, SYM_L_GLOBAL) + ANNOTATE_NOENDBR call ftrace_stub /* Handlers can change the RIP */ @@ -169,19 +180,10 @@ SYM_INNER_LABEL(ftrace_call, SYM_L_GLOBAL) * layout here. */ SYM_INNER_LABEL(ftrace_caller_end, SYM_L_GLOBAL) - - jmp ftrace_epilogue + ANNOTATE_NOENDBR + RET SYM_FUNC_END(ftrace_caller); - -SYM_FUNC_START(ftrace_epilogue) -/* - * This is weak to keep gas from relaxing the jumps. - * It is also used to copy the retq for trampolines. - */ -SYM_INNER_LABEL_ALIGN(ftrace_stub, SYM_L_WEAK) - UNWIND_HINT_FUNC - retq -SYM_FUNC_END(ftrace_epilogue) +STACK_FRAME_NON_STANDARD_FP(ftrace_caller) SYM_FUNC_START(ftrace_regs_caller) /* Save the current flags before any operations that can change them */ @@ -192,6 +194,7 @@ SYM_FUNC_START(ftrace_regs_caller) /* save_mcount_regs fills in first two parameters */ SYM_INNER_LABEL(ftrace_regs_caller_op_ptr, SYM_L_GLOBAL) + ANNOTATE_NOENDBR /* Load the ftrace_ops into the 3rd parameter */ movq function_trace_op(%rip), %rdx @@ -221,6 +224,7 @@ SYM_INNER_LABEL(ftrace_regs_caller_op_ptr, SYM_L_GLOBAL) leaq (%rsp), %rcx SYM_INNER_LABEL(ftrace_regs_call, SYM_L_GLOBAL) + ANNOTATE_NOENDBR call ftrace_stub /* Copy flags back to SS, to restore them */ @@ -248,6 +252,7 @@ SYM_INNER_LABEL(ftrace_regs_call, SYM_L_GLOBAL) */ testq %rax, %rax SYM_INNER_LABEL(ftrace_regs_caller_jmp, SYM_L_GLOBAL) + ANNOTATE_NOENDBR jnz 1f restore_mcount_regs @@ -255,13 +260,11 @@ SYM_INNER_LABEL(ftrace_regs_caller_jmp, SYM_L_GLOBAL) popfq /* - * As this jmp to ftrace_epilogue can be a short jump - * it must not be copied into the trampoline. - * The trampoline will add the code to jump - * to the return. + * The trampoline will add the return. */ SYM_INNER_LABEL(ftrace_regs_caller_end, SYM_L_GLOBAL) - jmp ftrace_epilogue + ANNOTATE_NOENDBR + RET /* Swap the flags with orig_rax */ 1: movq MCOUNT_REG_SIZE(%rsp), %rdi @@ -272,9 +275,10 @@ SYM_INNER_LABEL(ftrace_regs_caller_end, SYM_L_GLOBAL) /* Restore flags */ popfq UNWIND_HINT_FUNC - jmp ftrace_epilogue + RET SYM_FUNC_END(ftrace_regs_caller) +STACK_FRAME_NON_STANDARD_FP(ftrace_regs_caller) #else /* ! CONFIG_DYNAMIC_FTRACE */ @@ -282,9 +286,7 @@ SYM_FUNC_END(ftrace_regs_caller) SYM_FUNC_START(__fentry__) cmpq $ftrace_stub, ftrace_trace_function jnz trace - -SYM_INNER_LABEL(ftrace_stub, SYM_L_GLOBAL) - retq + RET trace: /* save_mcount_regs fills in first two parameters */ @@ -303,11 +305,15 @@ trace: jmp ftrace_stub SYM_FUNC_END(__fentry__) EXPORT_SYMBOL(__fentry__) +STACK_FRAME_NON_STANDARD_FP(__fentry__) + #endif /* CONFIG_DYNAMIC_FTRACE */ #ifdef CONFIG_FUNCTION_GRAPH_TRACER -SYM_FUNC_START(return_to_handler) - subq $24, %rsp +SYM_CODE_START(return_to_handler) + UNWIND_HINT_EMPTY + ANNOTATE_NOENDBR + subq $16, %rsp /* Save the return values */ movq %rax, (%rsp) @@ -319,7 +325,18 @@ SYM_FUNC_START(return_to_handler) movq %rax, %rdi movq 8(%rsp), %rdx movq (%rsp), %rax - addq $24, %rsp - JMP_NOSPEC rdi -SYM_FUNC_END(return_to_handler) + + addq $16, %rsp + /* + * Jump back to the old return address. This cannot be JMP_NOSPEC rdi + * since IBT would demand that contain ENDBR, which simply isn't so for + * return addresses. Use a retpoline here to keep the RSB balanced. + */ + ANNOTATE_INTRA_FUNCTION_CALL + call .Ldo_rop + int3 +.Ldo_rop: + mov %rdi, (%rsp) + RET +SYM_CODE_END(return_to_handler) #endif diff --git a/arch/x86/kernel/head64.c b/arch/x86/kernel/head64.c index fc5371a7e9d1..6a3cfaf6b72a 100644 --- a/arch/x86/kernel/head64.c +++ b/arch/x86/kernel/head64.c @@ -40,6 +40,7 @@ #include <asm/extable.h> #include <asm/trapnr.h> #include <asm/sev.h> +#include <asm/tdx.h> /* * Manage page tables very early on. @@ -126,6 +127,49 @@ static bool __head check_la57_support(unsigned long physaddr) } #endif +static unsigned long __head sme_postprocess_startup(struct boot_params *bp, pmdval_t *pmd) +{ + unsigned long vaddr, vaddr_end; + int i; + + /* Encrypt the kernel and related (if SME is active) */ + sme_encrypt_kernel(bp); + + /* + * Clear the memory encryption mask from the .bss..decrypted section. + * The bss section will be memset to zero later in the initialization so + * there is no need to zero it after changing the memory encryption + * attribute. + */ + if (sme_get_me_mask()) { + vaddr = (unsigned long)__start_bss_decrypted; + vaddr_end = (unsigned long)__end_bss_decrypted; + + for (; vaddr < vaddr_end; vaddr += PMD_SIZE) { + /* + * On SNP, transition the page to shared in the RMP table so that + * it is consistent with the page table attribute change. + * + * __start_bss_decrypted has a virtual address in the high range + * mapping (kernel .text). PVALIDATE, by way of + * early_snp_set_memory_shared(), requires a valid virtual + * address but the kernel is currently running off of the identity + * mapping so use __pa() to get a *currently* valid virtual address. + */ + early_snp_set_memory_shared(__pa(vaddr), __pa(vaddr), PTRS_PER_PMD); + + i = pmd_index(vaddr); + pmd[i] -= sme_get_me_mask(); + } + } + + /* + * Return the SME encryption mask (if SME is active) to be used as a + * modifier for the initial pgdir entry programmed into CR3. + */ + return sme_get_me_mask(); +} + /* Code in __startup_64() can be relocated during execution, but the compiler * doesn't have to generate PC-relative relocations when accessing globals from * that function. Clang actually does not generate them, which leads to @@ -135,7 +179,6 @@ static bool __head check_la57_support(unsigned long physaddr) unsigned long __head __startup_64(unsigned long physaddr, struct boot_params *bp) { - unsigned long vaddr, vaddr_end; unsigned long load_delta, *p; unsigned long pgtable_flags; pgdval_t *pgd; @@ -163,9 +206,6 @@ unsigned long __head __startup_64(unsigned long physaddr, if (load_delta & ~PMD_PAGE_MASK) for (;;); - /* Activate Secure Memory Encryption (SME) if supported and enabled */ - sme_enable(bp); - /* Include the SME encryption mask in the fixup value */ load_delta += sme_get_me_mask(); @@ -276,43 +316,7 @@ unsigned long __head __startup_64(unsigned long physaddr, */ *fixup_long(&phys_base, physaddr) += load_delta - sme_get_me_mask(); - /* Encrypt the kernel and related (if SME is active) */ - sme_encrypt_kernel(bp); - - /* - * Clear the memory encryption mask from the .bss..decrypted section. - * The bss section will be memset to zero later in the initialization so - * there is no need to zero it after changing the memory encryption - * attribute. - * - * This is early code, use an open coded check for SME instead of - * using cc_platform_has(). This eliminates worries about removing - * instrumentation or checking boot_cpu_data in the cc_platform_has() - * function. - */ - if (sme_get_me_mask()) { - vaddr = (unsigned long)__start_bss_decrypted; - vaddr_end = (unsigned long)__end_bss_decrypted; - for (; vaddr < vaddr_end; vaddr += PMD_SIZE) { - i = pmd_index(vaddr); - pmd[i] -= sme_get_me_mask(); - } - } - - /* - * Return the SME encryption mask (if SME is active) to be used as a - * modifier for the initial pgdir entry programmed into CR3. - */ - return sme_get_me_mask(); -} - -unsigned long __startup_secondary_64(void) -{ - /* - * Return the SME encryption mask (if SME is active) to be used as a - * modifier for the initial pgdir entry programmed into CR3. - */ - return sme_get_me_mask(); + return sme_postprocess_startup(bp, pmd); } /* Wipe all early page tables except for the kernel symbol map */ @@ -414,15 +418,20 @@ void __init do_early_exception(struct pt_regs *regs, int trapnr) trapnr == X86_TRAP_VC && handle_vc_boot_ghcb(regs)) return; + if (trapnr == X86_TRAP_VE && tdx_early_handle_ve(regs)) + return; + early_fixup_exception(regs, trapnr); } /* Don't add a printk in there. printk relies on the PDA which is not initialized yet. */ -static void __init clear_bss(void) +void __init clear_bss(void) { memset(__bss_start, 0, (unsigned long) __bss_stop - (unsigned long) __bss_start); + memset(__brk_base, 0, + (unsigned long) __brk_limit - (unsigned long) __brk_base); } static unsigned long get_cmd_line_ptr(void) @@ -485,6 +494,10 @@ asmlinkage __visible void __init x86_64_start_kernel(char * real_mode_data) clear_bss(); + /* + * This needs to happen *before* kasan_early_init() because latter maps stuff + * into that page. + */ clear_page(init_top_pgt); /* @@ -496,8 +509,21 @@ asmlinkage __visible void __init x86_64_start_kernel(char * real_mode_data) kasan_early_init(); + /* + * Flush global TLB entries which could be left over from the trampoline page + * table. + * + * This needs to happen *after* kasan_early_init() as KASAN-enabled .configs + * instrument native_write_cr4() so KASAN must be initialized for that + * instrumentation to work. + */ + __native_tlb_flush_global(this_cpu_read(cpu_tlbstate.cr4)); + idt_setup_early_handler(); + /* Needed before cc_platform_has() can be used for TDX */ + tdx_early_init(); + copy_bootdata(__va(real_mode_data)); /* @@ -584,8 +610,10 @@ static void startup_64_load_idt(unsigned long physbase) void early_setup_idt(void) { /* VMM Communication Exception */ - if (IS_ENABLED(CONFIG_AMD_MEM_ENCRYPT)) + if (IS_ENABLED(CONFIG_AMD_MEM_ENCRYPT)) { + setup_ghcb(); set_bringup_idt_handler(bringup_idt_table, X86_TRAP_VC, vc_boot_ghcb); + } bringup_idt_descr.address = (unsigned long)bringup_idt_table; native_load_idt(&bringup_idt_descr); diff --git a/arch/x86/kernel/head_32.S b/arch/x86/kernel/head_32.S index d8c64dab0efe..9b7acc9c7874 100644 --- a/arch/x86/kernel/head_32.S +++ b/arch/x86/kernel/head_32.S @@ -23,6 +23,7 @@ #include <asm/cpufeatures.h> #include <asm/percpu.h> #include <asm/nops.h> +#include <asm/nospec-branch.h> #include <asm/bootparam.h> #include <asm/export.h> #include <asm/pgtable_32.h> @@ -340,7 +341,7 @@ SYM_FUNC_END(startup_32_smp) __INIT setup_once: andl $0,setup_once_ref /* Once is enough, thanks */ - ret + RET SYM_FUNC_START(early_idt_handler_array) # 36(%esp) %eflags diff --git a/arch/x86/kernel/head_64.S b/arch/x86/kernel/head_64.S index d8b3ebd2bb85..d860d437631b 100644 --- a/arch/x86/kernel/head_64.S +++ b/arch/x86/kernel/head_64.S @@ -65,10 +65,39 @@ SYM_CODE_START_NOALIGN(startup_64) leaq (__end_init_task - FRAME_SIZE)(%rip), %rsp leaq _text(%rip), %rdi + + /* + * initial_gs points to initial fixed_percpu_data struct with storage for + * the stack protector canary. Global pointer fixups are needed at this + * stage, so apply them as is done in fixup_pointer(), and initialize %gs + * such that the canary can be accessed at %gs:40 for subsequent C calls. + */ + movl $MSR_GS_BASE, %ecx + movq initial_gs(%rip), %rax + movq $_text, %rdx + subq %rdx, %rax + addq %rdi, %rax + movq %rax, %rdx + shrq $32, %rdx + wrmsr + pushq %rsi call startup_64_setup_env popq %rsi +#ifdef CONFIG_AMD_MEM_ENCRYPT + /* + * Activate SEV/SME memory encryption if supported/enabled. This needs to + * be done now, since this also includes setup of the SEV-SNP CPUID table, + * which needs to be done before any CPUID instructions are executed in + * subsequent code. + */ + movq %rsi, %rdi + pushq %rsi + call sme_enable + popq %rsi +#endif + /* Now switch to __KERNEL_CS so IRET works reliably */ pushq $__KERNEL_CS leaq .Lon_kernel_cs(%rip), %rax @@ -99,6 +128,7 @@ SYM_CODE_END(startup_64) SYM_CODE_START(secondary_startup_64) UNWIND_HINT_EMPTY + ANNOTATE_NOENDBR /* * At this point the CPU runs in 64bit mode CS.L = 1 CS.D = 0, * and someone has loaded a mapped page table. @@ -127,21 +157,38 @@ SYM_CODE_START(secondary_startup_64) */ SYM_INNER_LABEL(secondary_startup_64_no_verify, SYM_L_GLOBAL) UNWIND_HINT_EMPTY + ANNOTATE_NOENDBR /* * Retrieve the modifier (SME encryption mask if SME is active) to be * added to the initial pgdir entry that will be programmed into CR3. */ - pushq %rsi - call __startup_secondary_64 - popq %rsi +#ifdef CONFIG_AMD_MEM_ENCRYPT + movq sme_me_mask, %rax +#else + xorq %rax, %rax +#endif /* Form the CR3 value being sure to include the CR3 modifier */ addq $(init_top_pgt - __START_KERNEL_map), %rax 1: +#ifdef CONFIG_X86_MCE + /* + * Preserve CR4.MCE if the kernel will enable #MC support. + * Clearing MCE may fault in some environments (that also force #MC + * support). Any machine check that occurs before #MC support is fully + * configured will crash the system regardless of the CR4.MCE value set + * here. + */ + movq %cr4, %rcx + andl $X86_CR4_MCE, %ecx +#else + movl $0, %ecx +#endif + /* Enable PAE mode, PGE and LA57 */ - movl $(X86_CR4_PAE | X86_CR4_PGE), %ecx + orl $(X86_CR4_PAE | X86_CR4_PGE), %ecx #ifdef CONFIG_X86_5LEVEL testl $1, __pgtable_l5_enabled(%rip) jz 1f @@ -166,15 +213,33 @@ SYM_INNER_LABEL(secondary_startup_64_no_verify, SYM_L_GLOBAL) call sev_verify_cbit popq %rsi - /* Switch to new page-table */ + /* + * Switch to new page-table + * + * For the boot CPU this switches to early_top_pgt which still has the + * indentity mappings present. The secondary CPUs will switch to the + * init_top_pgt here, away from the trampoline_pgd and unmap the + * indentity mapped ranges. + */ movq %rax, %cr3 + /* + * Do a global TLB flush after the CR3 switch to make sure the TLB + * entries from the identity mapping are flushed. + */ + movq %cr4, %rcx + movq %rcx, %rax + xorq $X86_CR4_PGE, %rcx + movq %rcx, %cr4 + movq %rax, %cr4 + /* Ensure I am executing from virtual addresses */ movq $1f, %rax ANNOTATE_RETPOLINE_SAFE jmp *%rax 1: UNWIND_HINT_EMPTY + ANNOTATE_NOENDBR // above /* * We must switch to a new descriptor in kernel space for the GDT @@ -229,13 +294,23 @@ SYM_INNER_LABEL(secondary_startup_64_no_verify, SYM_L_GLOBAL) /* Setup EFER (Extended Feature Enable Register) */ movl $MSR_EFER, %ecx rdmsr + /* + * Preserve current value of EFER for comparison and to skip + * EFER writes if no change was made (for TDX guest) + */ + movl %eax, %edx btsl $_EFER_SCE, %eax /* Enable System Call */ btl $20,%edi /* No Execute supported? */ jnc 1f btsl $_EFER_NX, %eax btsq $_PAGE_BIT_NX,early_pmd_flags(%rip) -1: wrmsr /* Make changes effective */ + /* Avoid writing EFER if no change was made (for TDX guest) */ +1: cmpl %edx, %eax + je 1f + xor %edx, %edx + wrmsr /* Make changes effective */ +1: /* Setup cr0 */ movl $CR0_STATE, %eax /* Make changes effective */ @@ -282,6 +357,7 @@ SYM_INNER_LABEL(secondary_startup_64_no_verify, SYM_L_GLOBAL) pushq %rax # target address in negative space lretq .Lafter_lret: + ANNOTATE_NOENDBR SYM_CODE_END(secondary_startup_64) #include "verify_cpu.S" @@ -311,6 +387,9 @@ SYM_CODE_END(start_cpu0) */ SYM_CODE_START_NOALIGN(vc_boot_ghcb) UNWIND_HINT_IRET_REGS offset=8 + ENDBR + + ANNOTATE_UNRET_END /* Build pt_regs */ PUSH_AND_CLEAR_REGS @@ -328,7 +407,6 @@ SYM_CODE_START_NOALIGN(vc_boot_ghcb) /* Remove Error Code */ addq $8, %rsp - /* Pure iret required here - don't use INTERRUPT_RETURN */ iretq SYM_CODE_END(vc_boot_ghcb) #endif @@ -355,9 +433,11 @@ SYM_CODE_START(early_idt_handler_array) .rept NUM_EXCEPTION_VECTORS .if ((EXCEPTION_ERRCODE_MASK >> i) & 1) == 0 UNWIND_HINT_IRET_REGS + ENDBR pushq $0 # Dummy error code, to make stack frame uniform .else UNWIND_HINT_IRET_REGS offset=8 + ENDBR .endif pushq $i # 72(%rsp) Vector number jmp early_idt_handler_common @@ -365,10 +445,12 @@ SYM_CODE_START(early_idt_handler_array) i = i + 1 .fill early_idt_handler_array + i*EARLY_IDT_HANDLER_SIZE - ., 1, 0xcc .endr - UNWIND_HINT_IRET_REGS offset=16 SYM_CODE_END(early_idt_handler_array) + ANNOTATE_NOENDBR // early_idt_handler_array[NUM_EXCEPTION_VECTORS] SYM_CODE_START_LOCAL(early_idt_handler_common) + UNWIND_HINT_IRET_REGS offset=16 + ANNOTATE_UNRET_END /* * The stack is the hardware frame, an error code or zero, and the * vector number. @@ -409,11 +491,16 @@ SYM_CODE_END(early_idt_handler_common) * early_idt_handler_array can't be used because it returns via the * paravirtualized INTERRUPT_RETURN and pv-ops don't work that early. * + * XXX it does, fix this. + * * This handler will end up in the .init.text section and not be * available to boot secondary CPUs. */ SYM_CODE_START_NOALIGN(vc_no_ghcb) UNWIND_HINT_IRET_REGS offset=8 + ENDBR + + ANNOTATE_UNRET_END /* Build pt_regs */ PUSH_AND_CLEAR_REGS diff --git a/arch/x86/kernel/hpet.c b/arch/x86/kernel/hpet.c index 882213df3713..71f336425e58 100644 --- a/arch/x86/kernel/hpet.c +++ b/arch/x86/kernel/hpet.c @@ -1435,8 +1435,12 @@ irqreturn_t hpet_rtc_interrupt(int irq, void *dev_id) hpet_rtc_timer_reinit(); memset(&curr_time, 0, sizeof(struct rtc_time)); - if (hpet_rtc_flags & (RTC_UIE | RTC_AIE)) - mc146818_get_time(&curr_time); + if (hpet_rtc_flags & (RTC_UIE | RTC_AIE)) { + if (unlikely(mc146818_get_time(&curr_time) < 0)) { + pr_err_ratelimited("unable to read current time from RTC\n"); + return IRQ_HANDLED; + } + } if (hpet_rtc_flags & RTC_UIE && curr_time.tm_sec != hpet_prev_update_sec) { diff --git a/arch/x86/kernel/idt.c b/arch/x86/kernel/idt.c index df0fa695bb09..a58c6bc1cd68 100644 --- a/arch/x86/kernel/idt.c +++ b/arch/x86/kernel/idt.c @@ -10,6 +10,7 @@ #include <asm/proto.h> #include <asm/desc.h> #include <asm/hw_irq.h> +#include <asm/idtentry.h> #define DPL0 0x0 #define DPL3 0x3 @@ -68,6 +69,9 @@ static const __initconst struct idt_data early_idts[] = { */ INTG(X86_TRAP_PF, asm_exc_page_fault), #endif +#ifdef CONFIG_INTEL_TDX_GUEST + INTG(X86_TRAP_VE, asm_exc_virtualization_exception), +#endif }; /* @@ -103,6 +107,10 @@ static const __initconst struct idt_data def_idts[] = { ISTG(X86_TRAP_MC, asm_exc_machine_check, IST_INDEX_MCE), #endif +#ifdef CONFIG_X86_KERNEL_IBT + INTG(X86_TRAP_CP, asm_exc_control_protection), +#endif + #ifdef CONFIG_AMD_MEM_ENCRYPT ISTG(X86_TRAP_VC, asm_exc_vmm_communication, IST_INDEX_VC), #endif @@ -272,7 +280,7 @@ void __init idt_setup_apic_and_irq_gates(void) idt_setup_from_table(idt_table, apic_idts, ARRAY_SIZE(apic_idts), true); for_each_clear_bit_from(i, system_vectors, FIRST_SYSTEM_VECTOR) { - entry = irq_entries_start + 8 * (i - FIRST_EXTERNAL_VECTOR); + entry = irq_entries_start + IDT_ALIGN * (i - FIRST_EXTERNAL_VECTOR); set_intr_gate(i, entry); } @@ -283,7 +291,7 @@ void __init idt_setup_apic_and_irq_gates(void) * system_vectors bitmap. Otherwise they show up in * /proc/interrupts. */ - entry = spurious_entries_start + 8 * (i - FIRST_SYSTEM_VECTOR); + entry = spurious_entries_start + IDT_ALIGN * (i - FIRST_SYSTEM_VECTOR); set_intr_gate(i, entry); } #endif diff --git a/arch/x86/kernel/irq_32.c b/arch/x86/kernel/irq_32.c index e5dd6da78713..01833ebf5e8e 100644 --- a/arch/x86/kernel/irq_32.c +++ b/arch/x86/kernel/irq_32.c @@ -132,7 +132,7 @@ int irq_init_percpu_irqstack(unsigned int cpu) return 0; } -#ifndef CONFIG_PREEMPT_RT +#ifdef CONFIG_SOFTIRQ_ON_OWN_STACK void do_softirq_own_stack(void) { struct irq_stack *irqstk; diff --git a/arch/x86/kernel/irqflags.S b/arch/x86/kernel/irqflags.S index 760e1f293093..aaf9e776f323 100644 --- a/arch/x86/kernel/irqflags.S +++ b/arch/x86/kernel/irqflags.S @@ -11,7 +11,7 @@ SYM_FUNC_START(native_save_fl) pushf pop %_ASM_AX - ret + RET SYM_FUNC_END(native_save_fl) .popsection EXPORT_SYMBOL(native_save_fl) diff --git a/arch/x86/kernel/jump_label.c b/arch/x86/kernel/jump_label.c index 68f091ba8443..f5b8ef02d172 100644 --- a/arch/x86/kernel/jump_label.c +++ b/arch/x86/kernel/jump_label.c @@ -146,16 +146,3 @@ void arch_jump_label_transform_apply(void) text_poke_finish(); mutex_unlock(&text_mutex); } - -static enum { - JL_STATE_START, - JL_STATE_NO_UPDATE, - JL_STATE_UPDATE, -} jlstate __initdata_or_module = JL_STATE_START; - -__init_or_module void arch_jump_label_transform_static(struct jump_entry *entry, - enum jump_label_type type) -{ - if (jlstate == JL_STATE_UPDATE) - jump_label_transform(entry, type, 1); -} diff --git a/arch/x86/kernel/kdebugfs.c b/arch/x86/kernel/kdebugfs.c index 64b6da95af98..e2e89bebcbc3 100644 --- a/arch/x86/kernel/kdebugfs.c +++ b/arch/x86/kernel/kdebugfs.c @@ -88,11 +88,13 @@ create_setup_data_node(struct dentry *parent, int no, static int __init create_setup_data_nodes(struct dentry *parent) { + struct setup_indirect *indirect; struct setup_data_node *node; struct setup_data *data; - int error; + u64 pa_data, pa_next; struct dentry *d; - u64 pa_data; + int error; + u32 len; int no = 0; d = debugfs_create_dir("setup_data", parent); @@ -112,12 +114,29 @@ static int __init create_setup_data_nodes(struct dentry *parent) error = -ENOMEM; goto err_dir; } - - if (data->type == SETUP_INDIRECT && - ((struct setup_indirect *)data->data)->type != SETUP_INDIRECT) { - node->paddr = ((struct setup_indirect *)data->data)->addr; - node->type = ((struct setup_indirect *)data->data)->type; - node->len = ((struct setup_indirect *)data->data)->len; + pa_next = data->next; + + if (data->type == SETUP_INDIRECT) { + len = sizeof(*data) + data->len; + memunmap(data); + data = memremap(pa_data, len, MEMREMAP_WB); + if (!data) { + kfree(node); + error = -ENOMEM; + goto err_dir; + } + + indirect = (struct setup_indirect *)data->data; + + if (indirect->type != SETUP_INDIRECT) { + node->paddr = indirect->addr; + node->type = indirect->type; + node->len = indirect->len; + } else { + node->paddr = pa_data; + node->type = data->type; + node->len = data->len; + } } else { node->paddr = pa_data; node->type = data->type; @@ -125,7 +144,7 @@ static int __init create_setup_data_nodes(struct dentry *parent) } create_setup_data_node(d, no, node); - pa_data = data->next; + pa_data = pa_next; memunmap(data); no++; diff --git a/arch/x86/kernel/kexec-bzimage64.c b/arch/x86/kernel/kexec-bzimage64.c index 170d0fd68b1f..6b58610a1552 100644 --- a/arch/x86/kernel/kexec-bzimage64.c +++ b/arch/x86/kernel/kexec-bzimage64.c @@ -17,7 +17,7 @@ #include <linux/kernel.h> #include <linux/mm.h> #include <linux/efi.h> -#include <linux/verification.h> +#include <linux/random.h> #include <asm/bootparam.h> #include <asm/setup.h> @@ -110,6 +110,26 @@ static int setup_e820_entries(struct boot_params *params) return 0; } +enum { RNG_SEED_LENGTH = 32 }; + +static void +setup_rng_seed(struct boot_params *params, unsigned long params_load_addr, + unsigned int rng_seed_setup_data_offset) +{ + struct setup_data *sd = (void *)params + rng_seed_setup_data_offset; + unsigned long setup_data_phys; + + if (!rng_is_initialized()) + return; + + sd->type = SETUP_RNG_SEED; + sd->len = RNG_SEED_LENGTH; + get_random_bytes(sd->data, RNG_SEED_LENGTH); + setup_data_phys = params_load_addr + rng_seed_setup_data_offset; + sd->next = params->hdr.setup_data; + params->hdr.setup_data = setup_data_phys; +} + #ifdef CONFIG_EFI static int setup_efi_info_memmap(struct boot_params *params, unsigned long params_load_addr, @@ -186,11 +206,38 @@ setup_efi_state(struct boot_params *params, unsigned long params_load_addr, } #endif /* CONFIG_EFI */ +static void +setup_ima_state(const struct kimage *image, struct boot_params *params, + unsigned long params_load_addr, + unsigned int ima_setup_data_offset) +{ +#ifdef CONFIG_IMA_KEXEC + struct setup_data *sd = (void *)params + ima_setup_data_offset; + unsigned long setup_data_phys; + struct ima_setup_data *ima; + + if (!image->ima_buffer_size) + return; + + sd->type = SETUP_IMA; + sd->len = sizeof(*ima); + + ima = (void *)sd + sizeof(struct setup_data); + ima->addr = image->ima_buffer_addr; + ima->size = image->ima_buffer_size; + + /* Add setup data */ + setup_data_phys = params_load_addr + ima_setup_data_offset; + sd->next = params->hdr.setup_data; + params->hdr.setup_data = setup_data_phys; +#endif /* CONFIG_IMA_KEXEC */ +} + static int setup_boot_parameters(struct kimage *image, struct boot_params *params, unsigned long params_load_addr, unsigned int efi_map_offset, unsigned int efi_map_sz, - unsigned int efi_setup_data_offset) + unsigned int setup_data_offset) { unsigned int nr_e820_entries; unsigned long long mem_k, start, end; @@ -245,8 +292,22 @@ setup_boot_parameters(struct kimage *image, struct boot_params *params, #ifdef CONFIG_EFI /* Setup EFI state */ setup_efi_state(params, params_load_addr, efi_map_offset, efi_map_sz, - efi_setup_data_offset); + setup_data_offset); + setup_data_offset += sizeof(struct setup_data) + + sizeof(struct efi_setup_data); #endif + + if (IS_ENABLED(CONFIG_IMA_KEXEC)) { + /* Setup IMA log buffer state */ + setup_ima_state(image, params, params_load_addr, + setup_data_offset); + setup_data_offset += sizeof(struct setup_data) + + sizeof(struct ima_setup_data); + } + + /* Setup RNG seed */ + setup_rng_seed(params, params_load_addr, setup_data_offset); + /* Setup EDD info */ memcpy(params->eddbuf, boot_params.eddbuf, EDDMAXNR * sizeof(struct edd_info)); @@ -401,7 +462,13 @@ static void *bzImage64_load(struct kimage *image, char *kernel, params_cmdline_sz = ALIGN(params_cmdline_sz, 16); kbuf.bufsz = params_cmdline_sz + ALIGN(efi_map_sz, 16) + sizeof(struct setup_data) + - sizeof(struct efi_setup_data); + sizeof(struct efi_setup_data) + + sizeof(struct setup_data) + + RNG_SEED_LENGTH; + + if (IS_ENABLED(CONFIG_IMA_KEXEC)) + kbuf.bufsz += sizeof(struct setup_data) + + sizeof(struct ima_setup_data); params = kzalloc(kbuf.bufsz, GFP_KERNEL); if (!params) @@ -528,28 +595,11 @@ static int bzImage64_cleanup(void *loader_data) return 0; } -#ifdef CONFIG_KEXEC_BZIMAGE_VERIFY_SIG -static int bzImage64_verify_sig(const char *kernel, unsigned long kernel_len) -{ - int ret; - - ret = verify_pefile_signature(kernel, kernel_len, - VERIFY_USE_SECONDARY_KEYRING, - VERIFYING_KEXEC_PE_SIGNATURE); - if (ret == -ENOKEY && IS_ENABLED(CONFIG_INTEGRITY_PLATFORM_KEYRING)) { - ret = verify_pefile_signature(kernel, kernel_len, - VERIFY_USE_PLATFORM_KEYRING, - VERIFYING_KEXEC_PE_SIGNATURE); - } - return ret; -} -#endif - const struct kexec_file_ops kexec_bzImage64_ops = { .probe = bzImage64_probe, .load = bzImage64_load, .cleanup = bzImage64_cleanup, #ifdef CONFIG_KEXEC_BZIMAGE_VERIFY_SIG - .verify_sig = bzImage64_verify_sig, + .verify_sig = kexec_kernel_verify_pe_sig, #endif }; diff --git a/arch/x86/kernel/kprobes/common.h b/arch/x86/kernel/kprobes/common.h index 7d3a2e2daf01..c993521d4933 100644 --- a/arch/x86/kernel/kprobes/common.h +++ b/arch/x86/kernel/kprobes/common.h @@ -6,6 +6,7 @@ #include <asm/asm.h> #include <asm/frame.h> +#include <asm/insn.h> #ifdef CONFIG_X86_64 diff --git a/arch/x86/kernel/kprobes/core.c b/arch/x86/kernel/kprobes/core.c index fce99e249d61..eb8bc82846b9 100644 --- a/arch/x86/kernel/kprobes/core.c +++ b/arch/x86/kernel/kprobes/core.c @@ -52,14 +52,13 @@ #include <asm/insn.h> #include <asm/debugreg.h> #include <asm/set_memory.h> +#include <asm/ibt.h> #include "common.h" DEFINE_PER_CPU(struct kprobe *, current_kprobe) = NULL; DEFINE_PER_CPU(struct kprobe_ctlblk, kprobe_ctlblk); -#define stack_addr(regs) ((unsigned long *)regs->sp) - #define W(row, b0, b1, b2, b3, b4, b5, b6, b7, b8, b9, ba, bb, bc, bd, be, bf)\ (((b0##UL << 0x0)|(b1##UL << 0x1)|(b2##UL << 0x2)|(b3##UL << 0x3) | \ (b4##UL << 0x4)|(b5##UL << 0x5)|(b6##UL << 0x6)|(b7##UL << 0x7) | \ @@ -193,17 +192,10 @@ static unsigned long __recover_probed_insn(kprobe_opcode_t *buf, unsigned long addr) { struct kprobe *kp; - unsigned long faddr; + bool faddr; kp = get_kprobe((void *)addr); - faddr = ftrace_location(addr); - /* - * Addresses inside the ftrace location are refused by - * arch_check_ftrace_location(). Something went terribly wrong - * if such an address is checked here. - */ - if (WARN_ON(faddr && faddr != addr)) - return 0UL; + faddr = ftrace_location(addr) == addr; /* * Use the current code if it is not modified by Kprobe * and it cannot be modified by ftrace. @@ -301,6 +293,22 @@ static int can_probe(unsigned long paddr) return (addr == paddr); } +/* If x86 supports IBT (ENDBR) it must be skipped. */ +kprobe_opcode_t *arch_adjust_kprobe_addr(unsigned long addr, unsigned long offset, + bool *on_func_entry) +{ + if (is_endbr(*(u32 *)addr)) { + *on_func_entry = !offset || offset == 4; + if (*on_func_entry) + offset = 4; + + } else { + *on_func_entry = !offset; + } + + return (kprobe_opcode_t *)(addr + offset); +} + /* * Copy an instruction with recovering modified instruction by kprobes * and adjust the displacement if the instruction uses the %rip-relative @@ -495,7 +503,7 @@ static void kprobe_emulate_jcc(struct kprobe *p, struct pt_regs *regs) match = ((regs->flags & X86_EFLAGS_SF) >> X86_EFLAGS_SF_BIT) ^ ((regs->flags & X86_EFLAGS_OF) >> X86_EFLAGS_OF_BIT); if (p->ainsn.jcc.type >= 0xe) - match = match && (regs->flags & X86_EFLAGS_ZF); + match = match || (regs->flags & X86_EFLAGS_ZF); } __kprobe_emulate_jmp(p, regs, (match && !invert) || (!match && invert)); } @@ -801,31 +809,23 @@ set_current_kprobe(struct kprobe *p, struct pt_regs *regs, = (regs->flags & X86_EFLAGS_IF); } -void arch_prepare_kretprobe(struct kretprobe_instance *ri, struct pt_regs *regs) -{ - unsigned long *sara = stack_addr(regs); - - ri->ret_addr = (kprobe_opcode_t *) *sara; - ri->fp = sara; - - /* Replace the return addr with trampoline addr */ - *sara = (unsigned long) &__kretprobe_trampoline; -} -NOKPROBE_SYMBOL(arch_prepare_kretprobe); - static void kprobe_post_process(struct kprobe *cur, struct pt_regs *regs, struct kprobe_ctlblk *kcb) { - if ((kcb->kprobe_status != KPROBE_REENTER) && cur->post_handler) { - kcb->kprobe_status = KPROBE_HIT_SSDONE; - cur->post_handler(cur, regs, 0); - } - /* Restore back the original saved kprobes variables and continue. */ - if (kcb->kprobe_status == KPROBE_REENTER) + if (kcb->kprobe_status == KPROBE_REENTER) { + /* This will restore both kcb and current_kprobe */ restore_previous_kprobe(kcb); - else + } else { + /* + * Always update the kcb status because + * reset_curent_kprobe() doesn't update kcb. + */ + kcb->kprobe_status = KPROBE_HIT_SSDONE; + if (cur->post_handler) + cur->post_handler(cur, regs, 0); reset_current_kprobe(); + } } NOKPROBE_SYMBOL(kprobe_post_process); @@ -1013,100 +1013,6 @@ int kprobe_int3_handler(struct pt_regs *regs) } NOKPROBE_SYMBOL(kprobe_int3_handler); -/* - * When a retprobed function returns, this code saves registers and - * calls trampoline_handler() runs, which calls the kretprobe's handler. - */ -asm( - ".text\n" - ".global __kretprobe_trampoline\n" - ".type __kretprobe_trampoline, @function\n" - "__kretprobe_trampoline:\n" -#ifdef CONFIG_X86_64 - /* Push a fake return address to tell the unwinder it's a kretprobe. */ - " pushq $__kretprobe_trampoline\n" - UNWIND_HINT_FUNC - /* Save the 'sp - 8', this will be fixed later. */ - " pushq %rsp\n" - " pushfq\n" - SAVE_REGS_STRING - " movq %rsp, %rdi\n" - " call trampoline_handler\n" - RESTORE_REGS_STRING - /* In trampoline_handler(), 'regs->flags' is copied to 'regs->sp'. */ - " addq $8, %rsp\n" - " popfq\n" -#else - /* Push a fake return address to tell the unwinder it's a kretprobe. */ - " pushl $__kretprobe_trampoline\n" - UNWIND_HINT_FUNC - /* Save the 'sp - 4', this will be fixed later. */ - " pushl %esp\n" - " pushfl\n" - SAVE_REGS_STRING - " movl %esp, %eax\n" - " call trampoline_handler\n" - RESTORE_REGS_STRING - /* In trampoline_handler(), 'regs->flags' is copied to 'regs->sp'. */ - " addl $4, %esp\n" - " popfl\n" -#endif - " ret\n" - ".size __kretprobe_trampoline, .-__kretprobe_trampoline\n" -); -NOKPROBE_SYMBOL(__kretprobe_trampoline); -/* - * __kretprobe_trampoline() skips updating frame pointer. The frame pointer - * saved in trampoline_handler() points to the real caller function's - * frame pointer. Thus the __kretprobe_trampoline() doesn't have a - * standard stack frame with CONFIG_FRAME_POINTER=y. - * Let's mark it non-standard function. Anyway, FP unwinder can correctly - * unwind without the hint. - */ -STACK_FRAME_NON_STANDARD_FP(__kretprobe_trampoline); - -/* This is called from kretprobe_trampoline_handler(). */ -void arch_kretprobe_fixup_return(struct pt_regs *regs, - kprobe_opcode_t *correct_ret_addr) -{ - unsigned long *frame_pointer = ®s->sp + 1; - - /* Replace fake return address with real one. */ - *frame_pointer = (unsigned long)correct_ret_addr; -} - -/* - * Called from __kretprobe_trampoline - */ -__used __visible void trampoline_handler(struct pt_regs *regs) -{ - unsigned long *frame_pointer; - - /* fixup registers */ - regs->cs = __KERNEL_CS; -#ifdef CONFIG_X86_32 - regs->gs = 0; -#endif - regs->ip = (unsigned long)&__kretprobe_trampoline; - regs->orig_ax = ~0UL; - regs->sp += sizeof(long); - frame_pointer = ®s->sp + 1; - - /* - * The return address at 'frame_pointer' is recovered by the - * arch_kretprobe_fixup_return() which called from the - * kretprobe_trampoline_handler(). - */ - kretprobe_trampoline_handler(regs, frame_pointer); - - /* - * Copy FLAGS to 'pt_regs::sp' so that __kretprobe_trapmoline() - * can do RET right after POPF. - */ - regs->sp = regs->flags; -} -NOKPROBE_SYMBOL(trampoline_handler); - int kprobe_fault_handler(struct pt_regs *regs, int trapnr) { struct kprobe *cur = kprobe_running(); diff --git a/arch/x86/kernel/kprobes/opt.c b/arch/x86/kernel/kprobes/opt.c index b4a54a52aa59..e6b8c5362b94 100644 --- a/arch/x86/kernel/kprobes/opt.c +++ b/arch/x86/kernel/kprobes/opt.c @@ -106,7 +106,8 @@ asm ( ".global optprobe_template_entry\n" "optprobe_template_entry:\n" #ifdef CONFIG_X86_64 - /* We don't bother saving the ss register */ + " pushq $" __stringify(__KERNEL_DS) "\n" + /* Save the 'sp - 8', this will be fixed later. */ " pushq %rsp\n" " pushfq\n" ".global optprobe_template_clac\n" @@ -121,14 +122,17 @@ asm ( ".global optprobe_template_call\n" "optprobe_template_call:\n" ASM_NOP5 - /* Move flags to rsp */ + /* Copy 'regs->flags' into 'regs->ss'. */ " movq 18*8(%rsp), %rdx\n" - " movq %rdx, 19*8(%rsp)\n" + " movq %rdx, 20*8(%rsp)\n" RESTORE_REGS_STRING - /* Skip flags entry */ - " addq $8, %rsp\n" + /* Skip 'regs->flags' and 'regs->sp'. */ + " addq $16, %rsp\n" + /* And pop flags register from 'regs->ss'. */ " popfq\n" #else /* CONFIG_X86_32 */ + " pushl %ss\n" + /* Save the 'sp - 4', this will be fixed later. */ " pushl %esp\n" " pushfl\n" ".global optprobe_template_clac\n" @@ -142,12 +146,13 @@ asm ( ".global optprobe_template_call\n" "optprobe_template_call:\n" ASM_NOP5 - /* Move flags into esp */ + /* Copy 'regs->flags' into 'regs->ss'. */ " movl 14*4(%esp), %edx\n" - " movl %edx, 15*4(%esp)\n" + " movl %edx, 16*4(%esp)\n" RESTORE_REGS_STRING - /* Skip flags entry */ - " addl $4, %esp\n" + /* Skip 'regs->flags' and 'regs->sp'. */ + " addl $8, %esp\n" + /* And pop flags register from 'regs->ss'. */ " popfl\n" #endif ".global optprobe_template_end\n" @@ -179,6 +184,8 @@ optimized_callback(struct optimized_kprobe *op, struct pt_regs *regs) kprobes_inc_nmissed_count(&op->kp); } else { struct kprobe_ctlblk *kcb = get_kprobe_ctlblk(); + /* Adjust stack pointer */ + regs->sp += sizeof(long); /* Save skipped registers */ regs->cs = __KERNEL_CS; #ifdef CONFIG_X86_32 diff --git a/arch/x86/kernel/ksysfs.c b/arch/x86/kernel/ksysfs.c index d0a19121c6a4..257892fcefa7 100644 --- a/arch/x86/kernel/ksysfs.c +++ b/arch/x86/kernel/ksysfs.c @@ -91,26 +91,41 @@ static int get_setup_data_paddr(int nr, u64 *paddr) static int __init get_setup_data_size(int nr, size_t *size) { - int i = 0; + u64 pa_data = boot_params.hdr.setup_data, pa_next; + struct setup_indirect *indirect; struct setup_data *data; - u64 pa_data = boot_params.hdr.setup_data; + int i = 0; + u32 len; while (pa_data) { data = memremap(pa_data, sizeof(*data), MEMREMAP_WB); if (!data) return -ENOMEM; + pa_next = data->next; + if (nr == i) { - if (data->type == SETUP_INDIRECT && - ((struct setup_indirect *)data->data)->type != SETUP_INDIRECT) - *size = ((struct setup_indirect *)data->data)->len; - else + if (data->type == SETUP_INDIRECT) { + len = sizeof(*data) + data->len; + memunmap(data); + data = memremap(pa_data, len, MEMREMAP_WB); + if (!data) + return -ENOMEM; + + indirect = (struct setup_indirect *)data->data; + + if (indirect->type != SETUP_INDIRECT) + *size = indirect->len; + else + *size = data->len; + } else { *size = data->len; + } memunmap(data); return 0; } - pa_data = data->next; + pa_data = pa_next; memunmap(data); i++; } @@ -120,9 +135,11 @@ static int __init get_setup_data_size(int nr, size_t *size) static ssize_t type_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf) { + struct setup_indirect *indirect; + struct setup_data *data; int nr, ret; u64 paddr; - struct setup_data *data; + u32 len; ret = kobj_to_setup_data_nr(kobj, &nr); if (ret) @@ -135,10 +152,20 @@ static ssize_t type_show(struct kobject *kobj, if (!data) return -ENOMEM; - if (data->type == SETUP_INDIRECT) - ret = sprintf(buf, "0x%x\n", ((struct setup_indirect *)data->data)->type); - else + if (data->type == SETUP_INDIRECT) { + len = sizeof(*data) + data->len; + memunmap(data); + data = memremap(paddr, len, MEMREMAP_WB); + if (!data) + return -ENOMEM; + + indirect = (struct setup_indirect *)data->data; + + ret = sprintf(buf, "0x%x\n", indirect->type); + } else { ret = sprintf(buf, "0x%x\n", data->type); + } + memunmap(data); return ret; } @@ -149,9 +176,10 @@ static ssize_t setup_data_data_read(struct file *fp, char *buf, loff_t off, size_t count) { + struct setup_indirect *indirect; + struct setup_data *data; int nr, ret = 0; u64 paddr, len; - struct setup_data *data; void *p; ret = kobj_to_setup_data_nr(kobj, &nr); @@ -165,10 +193,27 @@ static ssize_t setup_data_data_read(struct file *fp, if (!data) return -ENOMEM; - if (data->type == SETUP_INDIRECT && - ((struct setup_indirect *)data->data)->type != SETUP_INDIRECT) { - paddr = ((struct setup_indirect *)data->data)->addr; - len = ((struct setup_indirect *)data->data)->len; + if (data->type == SETUP_INDIRECT) { + len = sizeof(*data) + data->len; + memunmap(data); + data = memremap(paddr, len, MEMREMAP_WB); + if (!data) + return -ENOMEM; + + indirect = (struct setup_indirect *)data->data; + + if (indirect->type != SETUP_INDIRECT) { + paddr = indirect->addr; + len = indirect->len; + } else { + /* + * Even though this is technically undefined, return + * the data as though it is a normal setup_data struct. + * This will at least allow it to be inspected. + */ + paddr += sizeof(*data); + len = data->len; + } } else { paddr += sizeof(*data); len = data->len; diff --git a/arch/x86/kernel/kvm.c b/arch/x86/kernel/kvm.c index 59abbdad7729..d4e48b4a438b 100644 --- a/arch/x86/kernel/kvm.c +++ b/arch/x86/kernel/kvm.c @@ -69,6 +69,7 @@ static DEFINE_PER_CPU_DECRYPTED(struct kvm_vcpu_pv_apf_data, apf_reason) __align DEFINE_PER_CPU_DECRYPTED(struct kvm_steal_time, steal_time) __aligned(64) __visible; static int has_steal_clock = 0; +static int has_guest_poll = 0; /* * No need for any "IO delay" on KVM */ @@ -190,7 +191,7 @@ void kvm_async_pf_task_wake(u32 token) { u32 key = hash_32(token, KVM_TASK_SLEEP_HASHBITS); struct kvm_task_sleep_head *b = &async_pf_sleepers[key]; - struct kvm_task_sleep_node *n; + struct kvm_task_sleep_node *n, *dummy = NULL; if (token == ~0) { apf_task_wake_all(); @@ -202,28 +203,40 @@ again: n = _find_apf_task(b, token); if (!n) { /* - * async PF was not yet handled. - * Add dummy entry for the token. + * Async #PF not yet handled, add a dummy entry for the token. + * Allocating the token must be down outside of the raw lock + * as the allocator is preemptible on PREEMPT_RT kernels. */ - n = kzalloc(sizeof(*n), GFP_ATOMIC); - if (!n) { + if (!dummy) { + raw_spin_unlock(&b->lock); + dummy = kzalloc(sizeof(*dummy), GFP_ATOMIC); + /* - * Allocation failed! Busy wait while other cpu - * handles async PF. + * Continue looping on allocation failure, eventually + * the async #PF will be handled and allocating a new + * node will be unnecessary. + */ + if (!dummy) + cpu_relax(); + + /* + * Recheck for async #PF completion before enqueueing + * the dummy token to avoid duplicate list entries. */ - raw_spin_unlock(&b->lock); - cpu_relax(); goto again; } - n->token = token; - n->cpu = smp_processor_id(); - init_swait_queue_head(&n->wq); - hlist_add_head(&n->link, &b->list); + dummy->token = token; + dummy->cpu = smp_processor_id(); + init_swait_queue_head(&dummy->wq); + hlist_add_head(&dummy->link, &b->list); + dummy = NULL; } else { apf_task_wake_one(n); } raw_spin_unlock(&b->lock); - return; + + /* A dummy token might be allocated and ultimately not used. */ + kfree(dummy); } EXPORT_SYMBOL_GPL(kvm_async_pf_task_wake); @@ -313,7 +326,7 @@ static void kvm_register_steal_time(void) return; wrmsrl(MSR_KVM_STEAL_TIME, (slow_virt_to_phys(st) | KVM_MSR_ENABLED)); - pr_info("stealtime: cpu %d, msr %llx\n", cpu, + pr_debug("stealtime: cpu %d, msr %llx\n", cpu, (unsigned long long) slow_virt_to_phys(st)); } @@ -350,7 +363,7 @@ static void kvm_guest_cpu_init(void) wrmsrl(MSR_KVM_ASYNC_PF_EN, pa); __this_cpu_write(apf_reason.enabled, 1); - pr_info("setup async PF for cpu %d\n", smp_processor_id()); + pr_debug("setup async PF for cpu %d\n", smp_processor_id()); } if (kvm_para_has_feature(KVM_FEATURE_PV_EOI)) { @@ -376,7 +389,7 @@ static void kvm_pv_disable_apf(void) wrmsrl(MSR_KVM_ASYNC_PF_EN, 0); __this_cpu_write(apf_reason.enabled, 0); - pr_info("disable async PF for cpu %d\n", smp_processor_id()); + pr_debug("disable async PF for cpu %d\n", smp_processor_id()); } static void kvm_disable_steal_time(void) @@ -462,19 +475,24 @@ static bool pv_tlb_flush_supported(void) { return (kvm_para_has_feature(KVM_FEATURE_PV_TLB_FLUSH) && !kvm_para_has_hint(KVM_HINTS_REALTIME) && - kvm_para_has_feature(KVM_FEATURE_STEAL_TIME)); + kvm_para_has_feature(KVM_FEATURE_STEAL_TIME) && + !boot_cpu_has(X86_FEATURE_MWAIT) && + (num_possible_cpus() != 1)); } static bool pv_ipi_supported(void) { - return kvm_para_has_feature(KVM_FEATURE_PV_SEND_IPI); + return (kvm_para_has_feature(KVM_FEATURE_PV_SEND_IPI) && + (num_possible_cpus() != 1)); } static bool pv_sched_yield_supported(void) { return (kvm_para_has_feature(KVM_FEATURE_PV_SCHED_YIELD) && !kvm_para_has_hint(KVM_HINTS_REALTIME) && - kvm_para_has_feature(KVM_FEATURE_STEAL_TIME)); + kvm_para_has_feature(KVM_FEATURE_STEAL_TIME) && + !boot_cpu_has(X86_FEATURE_MWAIT) && + (num_possible_cpus() != 1)); } #define KVM_IPI_CLUSTER_SIZE (2 * BITS_PER_LONG) @@ -512,7 +530,7 @@ static void __send_ipi_mask(const struct cpumask *mask, int vector) } else if (apic_id < min && max - apic_id < KVM_IPI_CLUSTER_SIZE) { ipi_bitmap <<= min - apic_id; min = apic_id; - } else if (apic_id < min + KVM_IPI_CLUSTER_SIZE) { + } else if (apic_id > min && apic_id < min + KVM_IPI_CLUSTER_SIZE) { max = apic_id < max ? max : apic_id; } else { ret = kvm_hypercall4(KVM_HC_SEND_IPI, (unsigned long)ipi_bitmap, @@ -619,7 +637,7 @@ static void kvm_smp_send_call_func_ipi(const struct cpumask *mask) /* Make sure other vCPUs get a chance to run if they need to. */ for_each_cpu(cpu, mask) { - if (vcpu_is_preempted(cpu)) { + if (!idle_cpu(cpu) && vcpu_is_preempted(cpu)) { kvm_hypercall1(KVM_HC_SCHED_YIELD, per_cpu(x86_cpu_to_apicid, cpu)); break; } @@ -701,14 +719,26 @@ static int kvm_cpu_down_prepare(unsigned int cpu) static int kvm_suspend(void) { + u64 val = 0; + kvm_guest_cpu_offline(false); +#ifdef CONFIG_ARCH_CPUIDLE_HALTPOLL + if (kvm_para_has_feature(KVM_FEATURE_POLL_CONTROL)) + rdmsrl(MSR_KVM_POLL_CONTROL, val); + has_guest_poll = !(val & 1); +#endif return 0; } static void kvm_resume(void) { kvm_cpu_online(raw_smp_processor_id()); + +#ifdef CONFIG_ARCH_CPUIDLE_HALTPOLL + if (kvm_para_has_feature(KVM_FEATURE_POLL_CONTROL) && has_guest_poll) + wrmsrl(MSR_KVM_POLL_CONTROL, 0); +#endif } static struct syscore_ops kvm_syscore_ops = { @@ -747,6 +777,42 @@ static void kvm_crash_shutdown(struct pt_regs *regs) } #endif +#if defined(CONFIG_X86_32) || !defined(CONFIG_SMP) +bool __kvm_vcpu_is_preempted(long cpu); + +__visible bool __kvm_vcpu_is_preempted(long cpu) +{ + struct kvm_steal_time *src = &per_cpu(steal_time, cpu); + + return !!(src->preempted & KVM_VCPU_PREEMPTED); +} +PV_CALLEE_SAVE_REGS_THUNK(__kvm_vcpu_is_preempted); + +#else + +#include <asm/asm-offsets.h> + +extern bool __raw_callee_save___kvm_vcpu_is_preempted(long); + +/* + * Hand-optimize version for x86-64 to avoid 8 64-bit register saving and + * restoring to/from the stack. + */ +asm( +".pushsection .text;" +".global __raw_callee_save___kvm_vcpu_is_preempted;" +".type __raw_callee_save___kvm_vcpu_is_preempted, @function;" +"__raw_callee_save___kvm_vcpu_is_preempted:" +ASM_ENDBR +"movq __per_cpu_offset(,%rdi,8), %rax;" +"cmpb $0, " __stringify(KVM_STEAL_TIME_preempted) "+steal_time(%rax);" +"setne %al;" +ASM_RET +".size __raw_callee_save___kvm_vcpu_is_preempted, .-__raw_callee_save___kvm_vcpu_is_preempted;" +".popsection"); + +#endif + static void __init kvm_guest_init(void) { int i; @@ -759,6 +825,9 @@ static void __init kvm_guest_init(void) if (kvm_para_has_feature(KVM_FEATURE_STEAL_TIME)) { has_steal_clock = 1; static_call_update(pv_steal_clock, kvm_steal_clock); + + pv_ops.lock.vcpu_is_preempted = + PV_CALLEE_SAVE(__kvm_vcpu_is_preempted); } if (kvm_para_has_feature(KVM_FEATURE_PV_EOI)) @@ -1000,39 +1069,6 @@ static void kvm_wait(u8 *ptr, u8 val) } } -#ifdef CONFIG_X86_32 -__visible bool __kvm_vcpu_is_preempted(long cpu) -{ - struct kvm_steal_time *src = &per_cpu(steal_time, cpu); - - return !!(src->preempted & KVM_VCPU_PREEMPTED); -} -PV_CALLEE_SAVE_REGS_THUNK(__kvm_vcpu_is_preempted); - -#else - -#include <asm/asm-offsets.h> - -extern bool __raw_callee_save___kvm_vcpu_is_preempted(long); - -/* - * Hand-optimize version for x86-64 to avoid 8 64-bit register saving and - * restoring to/from the stack. - */ -asm( -".pushsection .text;" -".global __raw_callee_save___kvm_vcpu_is_preempted;" -".type __raw_callee_save___kvm_vcpu_is_preempted, @function;" -"__raw_callee_save___kvm_vcpu_is_preempted:" -"movq __per_cpu_offset(,%rdi,8), %rax;" -"cmpb $0, " __stringify(KVM_STEAL_TIME_preempted) "+steal_time(%rax);" -"setne %al;" -"ret;" -".size __raw_callee_save___kvm_vcpu_is_preempted, .-__raw_callee_save___kvm_vcpu_is_preempted;" -".popsection"); - -#endif - /* * Setup pv_lock_ops to exploit KVM_FEATURE_PV_UNHALT if present. */ @@ -1076,10 +1112,6 @@ void __init kvm_spinlock_init(void) pv_ops.lock.wait = kvm_wait; pv_ops.lock.kick = kvm_kick_cpu; - if (kvm_para_has_feature(KVM_FEATURE_STEAL_TIME)) { - pv_ops.lock.vcpu_is_preempted = - PV_CALLEE_SAVE(__kvm_vcpu_is_preempted); - } /* * When PV spinlock is enabled which is preferred over * virt_spin_lock(), virt_spin_lock_key's value is meaningless. diff --git a/arch/x86/kernel/kvmclock.c b/arch/x86/kernel/kvmclock.c index 462dd8e9b03d..16333ba1904b 100644 --- a/arch/x86/kernel/kvmclock.c +++ b/arch/x86/kernel/kvmclock.c @@ -174,7 +174,7 @@ static void kvm_register_clock(char *txt) pa = slow_virt_to_phys(&src->pvti) | 0x01ULL; wrmsrl(msr_kvm_system_time, pa); - pr_info("kvm-clock: cpu %d, msr %llx, %s", smp_processor_id(), pa, txt); + pr_debug("kvm-clock: cpu %d, msr %llx, %s", smp_processor_id(), pa, txt); } static void kvm_save_sched_clock_state(void) @@ -239,6 +239,9 @@ static void __init kvmclock_init_mem(void) static int __init kvm_setup_vsyscall_timeinfo(void) { + if (!kvm_para_available() || !kvmclock || nopv) + return 0; + kvmclock_init_mem(); #ifdef CONFIG_X86_64 diff --git a/arch/x86/kernel/machine_kexec_64.c b/arch/x86/kernel/machine_kexec_64.c index f5da4a18070a..0611fd83858e 100644 --- a/arch/x86/kernel/machine_kexec_64.c +++ b/arch/x86/kernel/machine_kexec_64.c @@ -27,6 +27,7 @@ #include <asm/kexec-bzimage64.h> #include <asm/setup.h> #include <asm/set_memory.h> +#include <asm/cpu.h> #ifdef CONFIG_ACPI /* @@ -310,6 +311,7 @@ void machine_kexec(struct kimage *image) /* Interrupts aren't acceptable while we reboot */ local_irq_disable(); hw_breakpoint_disable(); + cet_disable(); if (image->preserve_context) { #ifdef CONFIG_X86_IO_APIC @@ -325,7 +327,7 @@ void machine_kexec(struct kimage *image) } control_page = page_address(image->control_code_page) + PAGE_SIZE; - memcpy(control_page, relocate_kernel, KEXEC_CONTROL_CODE_MAX_SIZE); + __memcpy(control_page, relocate_kernel, KEXEC_CONTROL_CODE_MAX_SIZE); page_list[PA_CONTROL_PAGE] = virt_to_phys(control_page); page_list[VA_CONTROL_PAGE] = (unsigned long)control_page; @@ -374,9 +376,6 @@ void machine_kexec(struct kimage *image) #ifdef CONFIG_KEXEC_FILE void *arch_kexec_kernel_image_load(struct kimage *image) { - vfree(image->elf_headers); - image->elf_headers = NULL; - if (!image->fops || !image->fops->load) return ERR_PTR(-ENOEXEC); @@ -512,6 +511,15 @@ overflow: (int)ELF64_R_TYPE(rel[i].r_info), value); return -ENOEXEC; } + +int arch_kimage_file_post_load_cleanup(struct kimage *image) +{ + vfree(image->elf_headers); + image->elf_headers = NULL; + image->elf_headers_sz = 0; + + return kexec_image_post_load_cleanup_default(image); +} #endif /* CONFIG_KEXEC_FILE */ static int diff --git a/arch/x86/kernel/module.c b/arch/x86/kernel/module.c index 169fb6f4cd2e..c032edcd3d95 100644 --- a/arch/x86/kernel/module.c +++ b/arch/x86/kernel/module.c @@ -53,7 +53,7 @@ static unsigned long int get_module_load_offset(void) */ if (module_load_offset == 0) module_load_offset = - (get_random_int() % 1024 + 1) * PAGE_SIZE; + (prandom_u32_max(1024) + 1) * PAGE_SIZE; mutex_unlock(&module_kaslr_mutex); } return module_load_offset; @@ -67,6 +67,7 @@ static unsigned long int get_module_load_offset(void) void *module_alloc(unsigned long size) { + gfp_t gfp_mask = GFP_KERNEL; void *p; if (PAGE_ALIGN(size) > MODULES_LEN) @@ -74,10 +75,10 @@ void *module_alloc(unsigned long size) p = __vmalloc_node_range(size, MODULE_ALIGN, MODULES_VADDR + get_module_load_offset(), - MODULES_END, GFP_KERNEL, - PAGE_KERNEL, 0, NUMA_NO_NODE, + MODULES_END, gfp_mask, + PAGE_KERNEL, VM_DEFER_KMEMLEAK, NUMA_NO_NODE, __builtin_return_address(0)); - if (p && (kasan_module_alloc(p, size) < 0)) { + if (p && (kasan_alloc_module_shadow(p, size, gfp_mask) < 0)) { vfree(p); return NULL; } @@ -252,7 +253,7 @@ int module_finalize(const Elf_Ehdr *hdr, { const Elf_Shdr *s, *text = NULL, *alt = NULL, *locks = NULL, *para = NULL, *orc = NULL, *orc_ip = NULL, - *retpolines = NULL; + *retpolines = NULL, *returns = NULL, *ibt_endbr = NULL; char *secstrings = (void *)hdr + sechdrs[hdr->e_shstrndx].sh_offset; for (s = sechdrs; s < sechdrs + hdr->e_shnum; s++) { @@ -270,17 +271,37 @@ int module_finalize(const Elf_Ehdr *hdr, orc_ip = s; if (!strcmp(".retpoline_sites", secstrings + s->sh_name)) retpolines = s; + if (!strcmp(".return_sites", secstrings + s->sh_name)) + returns = s; + if (!strcmp(".ibt_endbr_seal", secstrings + s->sh_name)) + ibt_endbr = s; } + /* + * See alternative_instructions() for the ordering rules between the + * various patching types. + */ + if (para) { + void *pseg = (void *)para->sh_addr; + apply_paravirt(pseg, pseg + para->sh_size); + } if (retpolines) { void *rseg = (void *)retpolines->sh_addr; apply_retpolines(rseg, rseg + retpolines->sh_size); } + if (returns) { + void *rseg = (void *)returns->sh_addr; + apply_returns(rseg, rseg + returns->sh_size); + } if (alt) { /* patch .altinstructions */ void *aseg = (void *)alt->sh_addr; apply_alternatives(aseg, aseg + alt->sh_size); } + if (ibt_endbr) { + void *iseg = (void *)ibt_endbr->sh_addr; + apply_ibt_endbr(iseg, iseg + ibt_endbr->sh_size); + } if (locks && text) { void *lseg = (void *)locks->sh_addr; void *tseg = (void *)text->sh_addr; @@ -289,14 +310,6 @@ int module_finalize(const Elf_Ehdr *hdr, tseg, tseg + text->sh_size); } - if (para) { - void *pseg = (void *)para->sh_addr; - apply_paravirt(pseg, pseg + para->sh_size); - } - - /* make jump label nops */ - jump_label_apply_nops(me); - if (orc && orc_ip) unwind_module_init(me, (void *)orc_ip->sh_addr, orc_ip->sh_size, (void *)orc->sh_addr, orc->sh_size); diff --git a/arch/x86/kernel/nmi.c b/arch/x86/kernel/nmi.c index 4bce802d25fb..cec0bfa3bc04 100644 --- a/arch/x86/kernel/nmi.c +++ b/arch/x86/kernel/nmi.c @@ -157,7 +157,7 @@ int __register_nmi_handler(unsigned int type, struct nmiaction *action) struct nmi_desc *desc = nmi_to_desc(type); unsigned long flags; - if (!action->handler) + if (WARN_ON_ONCE(!action->handler || !list_empty(&action->list))) return -EINVAL; raw_spin_lock_irqsave(&desc->lock, flags); @@ -177,7 +177,7 @@ int __register_nmi_handler(unsigned int type, struct nmiaction *action) list_add_rcu(&action->list, &desc->head); else list_add_tail_rcu(&action->list, &desc->head); - + raw_spin_unlock_irqrestore(&desc->lock, flags); return 0; } @@ -186,7 +186,7 @@ EXPORT_SYMBOL(__register_nmi_handler); void unregister_nmi_handler(unsigned int type, const char *name) { struct nmi_desc *desc = nmi_to_desc(type); - struct nmiaction *n; + struct nmiaction *n, *found = NULL; unsigned long flags; raw_spin_lock_irqsave(&desc->lock, flags); @@ -200,12 +200,16 @@ void unregister_nmi_handler(unsigned int type, const char *name) WARN(in_nmi(), "Trying to free NMI (%s) from NMI context!\n", n->name); list_del_rcu(&n->list); + found = n; break; } } raw_spin_unlock_irqrestore(&desc->lock, flags); - synchronize_rcu(); + if (found) { + synchronize_rcu(); + INIT_LIST_HEAD(&found->list); + } } EXPORT_SYMBOL_GPL(unregister_nmi_handler); @@ -292,7 +296,6 @@ unknown_nmi_error(unsigned char reason, struct pt_regs *regs) pr_emerg("Uhhuh. NMI received for unknown reason %02x on CPU %d.\n", reason, smp_processor_id()); - pr_emerg("Do you have a strange power saving mode enabled?\n"); if (unknown_nmi_panic || panic_on_unrecovered_nmi) nmi_panic(regs, "NMI: Not continuing"); diff --git a/arch/x86/kernel/paravirt.c b/arch/x86/kernel/paravirt.c index 7f7636aac620..7ca2d46c08cc 100644 --- a/arch/x86/kernel/paravirt.c +++ b/arch/x86/kernel/paravirt.c @@ -41,7 +41,8 @@ extern void _paravirt_nop(void); asm (".pushsection .entry.text, \"ax\"\n" ".global _paravirt_nop\n" "_paravirt_nop:\n\t" - "ret\n\t" + ASM_ENDBR + ASM_RET ".size _paravirt_nop, . - _paravirt_nop\n\t" ".type _paravirt_nop, @function\n\t" ".popsection"); @@ -50,8 +51,9 @@ asm (".pushsection .entry.text, \"ax\"\n" asm (".pushsection .entry.text, \"ax\"\n" ".global paravirt_ret0\n" "paravirt_ret0:\n\t" + ASM_ENDBR "xor %" _ASM_AX ", %" _ASM_AX ";\n\t" - "ret\n\t" + ASM_RET ".size paravirt_ret0, . - paravirt_ret0\n\t" ".type paravirt_ret0, @function\n\t" ".popsection"); @@ -69,29 +71,12 @@ noinstr void paravirt_BUG(void) BUG(); } -struct branch { - unsigned char opcode; - u32 delta; -} __attribute__((packed)); - static unsigned paravirt_patch_call(void *insn_buff, const void *target, unsigned long addr, unsigned len) { - const int call_len = 5; - struct branch *b = insn_buff; - unsigned long delta = (unsigned long)target - (addr+call_len); - - if (len < call_len) { - pr_warn("paravirt: Failed to patch indirect CALL at %ps\n", (void *)addr); - /* Kernel might not be viable if patching fails, bail out: */ - BUG_ON(1); - } - - b->opcode = 0xe8; /* call */ - b->delta = delta; - BUILD_BUG_ON(sizeof(*b) != call_len); - - return call_len; + __text_gen_insn(insn_buff, CALL_INSN_OPCODE, + (void *)addr, target, CALL_INSN_SIZE); + return CALL_INSN_SIZE; } #ifdef CONFIG_PARAVIRT_XXL @@ -149,8 +134,6 @@ void paravirt_set_sched_clock(u64 (*func)(void)) } /* These are in entry.S */ -extern void native_iret(void); - static struct resource reserve_ioports = { .start = 0, .end = IO_SPACE_LIMIT, @@ -414,8 +397,6 @@ struct paravirt_patch_template pv_ops = { #ifdef CONFIG_PARAVIRT_XXL NOKPROBE_SYMBOL(native_load_idt); - -void (*paravirt_iret)(void) = native_iret; #endif EXPORT_SYMBOL(pv_ops); diff --git a/arch/x86/kernel/pci-dma.c b/arch/x86/kernel/pci-dma.c index de234e7a8962..30bbe4abb5d6 100644 --- a/arch/x86/kernel/pci-dma.c +++ b/arch/x86/kernel/pci-dma.c @@ -7,13 +7,16 @@ #include <linux/memblock.h> #include <linux/gfp.h> #include <linux/pci.h> +#include <linux/amd-iommu.h> #include <asm/proto.h> #include <asm/dma.h> #include <asm/iommu.h> #include <asm/gart.h> #include <asm/x86_init.h> -#include <asm/iommu_table.h> + +#include <xen/xen.h> +#include <xen/swiotlb-xen.h> static bool disable_dac_quirk __read_mostly; @@ -34,24 +37,90 @@ int no_iommu __read_mostly; /* Set this to 1 if there is a HW IOMMU in the system */ int iommu_detected __read_mostly = 0; -extern struct iommu_table_entry __iommu_table[], __iommu_table_end[]; +#ifdef CONFIG_SWIOTLB +bool x86_swiotlb_enable; +static unsigned int x86_swiotlb_flags; + +static void __init pci_swiotlb_detect(void) +{ + /* don't initialize swiotlb if iommu=off (no_iommu=1) */ + if (!no_iommu && max_possible_pfn > MAX_DMA32_PFN) + x86_swiotlb_enable = true; + + /* + * Set swiotlb to 1 so that bounce buffers are allocated and used for + * devices that can't support DMA to encrypted memory. + */ + if (cc_platform_has(CC_ATTR_HOST_MEM_ENCRYPT)) + x86_swiotlb_enable = true; + + /* + * Guest with guest memory encryption currently perform all DMA through + * bounce buffers as the hypervisor can't access arbitrary VM memory + * that is not explicitly shared with it. + */ + if (cc_platform_has(CC_ATTR_GUEST_MEM_ENCRYPT)) { + x86_swiotlb_enable = true; + x86_swiotlb_flags |= SWIOTLB_FORCE; + } +} +#else +static inline void __init pci_swiotlb_detect(void) +{ +} +#define x86_swiotlb_flags 0 +#endif /* CONFIG_SWIOTLB */ + +#ifdef CONFIG_SWIOTLB_XEN +static void __init pci_xen_swiotlb_init(void) +{ + if (!xen_initial_domain() && !x86_swiotlb_enable) + return; + x86_swiotlb_enable = true; + x86_swiotlb_flags |= SWIOTLB_ANY; + swiotlb_init_remap(true, x86_swiotlb_flags, xen_swiotlb_fixup); + dma_ops = &xen_swiotlb_dma_ops; + if (IS_ENABLED(CONFIG_PCI)) + pci_request_acs(); +} + +int pci_xen_swiotlb_init_late(void) +{ + if (dma_ops == &xen_swiotlb_dma_ops) + return 0; + + /* we can work with the default swiotlb */ + if (!io_tlb_default_mem.nslabs) { + int rc = swiotlb_init_late(swiotlb_size_or_default(), + GFP_KERNEL, xen_swiotlb_fixup); + if (rc < 0) + return rc; + } + + /* XXX: this switches the dma ops under live devices! */ + dma_ops = &xen_swiotlb_dma_ops; + if (IS_ENABLED(CONFIG_PCI)) + pci_request_acs(); + return 0; +} +EXPORT_SYMBOL_GPL(pci_xen_swiotlb_init_late); +#else +static inline void __init pci_xen_swiotlb_init(void) +{ +} +#endif /* CONFIG_SWIOTLB_XEN */ void __init pci_iommu_alloc(void) { - struct iommu_table_entry *p; - - sort_iommu_table(__iommu_table, __iommu_table_end); - check_iommu_entries(__iommu_table, __iommu_table_end); - - for (p = __iommu_table; p < __iommu_table_end; p++) { - if (p && p->detect && p->detect() > 0) { - p->flags |= IOMMU_DETECTED; - if (p->early_init) - p->early_init(); - if (p->flags & IOMMU_FINISH_IF_DETECTED) - break; - } + if (xen_pv_domain()) { + pci_xen_swiotlb_init(); + return; } + pci_swiotlb_detect(); + gart_iommu_hole_init(); + amd_iommu_detect(); + detect_intel_iommu(); + swiotlb_init(x86_swiotlb_enable, x86_swiotlb_flags); } /* @@ -102,7 +171,7 @@ static __init int iommu_setup(char *p) } #ifdef CONFIG_SWIOTLB if (!strncmp(p, "soft", 4)) - swiotlb = 1; + x86_swiotlb_enable = true; #endif if (!strncmp(p, "pt", 2)) iommu_set_default_passthrough(true); @@ -121,14 +190,17 @@ early_param("iommu", iommu_setup); static int __init pci_iommu_init(void) { - struct iommu_table_entry *p; - x86_init.iommu.iommu_init(); - for (p = __iommu_table; p < __iommu_table_end; p++) { - if (p && (p->flags & IOMMU_DETECTED) && p->late_init) - p->late_init(); +#ifdef CONFIG_SWIOTLB + /* An IOMMU turned us off. */ + if (x86_swiotlb_enable) { + pr_info("PCI-DMA: Using software bounce buffering for IO (SWIOTLB)\n"); + swiotlb_print_info(); + } else { + swiotlb_exit(); } +#endif return 0; } diff --git a/arch/x86/kernel/pci-iommu_table.c b/arch/x86/kernel/pci-iommu_table.c deleted file mode 100644 index 42e92ec62973..000000000000 --- a/arch/x86/kernel/pci-iommu_table.c +++ /dev/null @@ -1,77 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -#include <linux/dma-mapping.h> -#include <asm/iommu_table.h> -#include <linux/string.h> -#include <linux/kallsyms.h> - -static struct iommu_table_entry * __init -find_dependents_of(struct iommu_table_entry *start, - struct iommu_table_entry *finish, - struct iommu_table_entry *q) -{ - struct iommu_table_entry *p; - - if (!q) - return NULL; - - for (p = start; p < finish; p++) - if (p->detect == q->depend) - return p; - - return NULL; -} - - -void __init sort_iommu_table(struct iommu_table_entry *start, - struct iommu_table_entry *finish) { - - struct iommu_table_entry *p, *q, tmp; - - for (p = start; p < finish; p++) { -again: - q = find_dependents_of(start, finish, p); - /* We are bit sneaky here. We use the memory address to figure - * out if the node we depend on is past our point, if so, swap. - */ - if (q > p) { - tmp = *p; - memmove(p, q, sizeof(*p)); - *q = tmp; - goto again; - } - } - -} - -#ifdef DEBUG -void __init check_iommu_entries(struct iommu_table_entry *start, - struct iommu_table_entry *finish) -{ - struct iommu_table_entry *p, *q, *x; - - /* Simple cyclic dependency checker. */ - for (p = start; p < finish; p++) { - q = find_dependents_of(start, finish, p); - x = find_dependents_of(start, finish, q); - if (p == x) { - printk(KERN_ERR "CYCLIC DEPENDENCY FOUND! %pS depends on %pS and vice-versa. BREAKING IT.\n", - p->detect, q->detect); - /* Heavy handed way..*/ - x->depend = NULL; - } - } - - for (p = start; p < finish; p++) { - q = find_dependents_of(p, finish, p); - if (q && q > p) { - printk(KERN_ERR "EXECUTION ORDER INVALID! %pS should be called before %pS!\n", - p->detect, q->detect); - } - } -} -#else -void __init check_iommu_entries(struct iommu_table_entry *start, - struct iommu_table_entry *finish) -{ -} -#endif diff --git a/arch/x86/kernel/pci-swiotlb.c b/arch/x86/kernel/pci-swiotlb.c deleted file mode 100644 index 814ab46a0dad..000000000000 --- a/arch/x86/kernel/pci-swiotlb.c +++ /dev/null @@ -1,77 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 - -#include <linux/pci.h> -#include <linux/cache.h> -#include <linux/init.h> -#include <linux/swiotlb.h> -#include <linux/memblock.h> -#include <linux/dma-direct.h> -#include <linux/cc_platform.h> - -#include <asm/iommu.h> -#include <asm/swiotlb.h> -#include <asm/dma.h> -#include <asm/xen/swiotlb-xen.h> -#include <asm/iommu_table.h> - -int swiotlb __read_mostly; - -/* - * pci_swiotlb_detect_override - set swiotlb to 1 if necessary - * - * This returns non-zero if we are forced to use swiotlb (by the boot - * option). - */ -int __init pci_swiotlb_detect_override(void) -{ - if (swiotlb_force == SWIOTLB_FORCE) - swiotlb = 1; - - return swiotlb; -} -IOMMU_INIT_FINISH(pci_swiotlb_detect_override, - pci_xen_swiotlb_detect, - pci_swiotlb_init, - pci_swiotlb_late_init); - -/* - * If 4GB or more detected (and iommu=off not set) or if SME is active - * then set swiotlb to 1 and return 1. - */ -int __init pci_swiotlb_detect_4gb(void) -{ - /* don't initialize swiotlb if iommu=off (no_iommu=1) */ - if (!no_iommu && max_possible_pfn > MAX_DMA32_PFN) - swiotlb = 1; - - /* - * Set swiotlb to 1 so that bounce buffers are allocated and used for - * devices that can't support DMA to encrypted memory. - */ - if (cc_platform_has(CC_ATTR_HOST_MEM_ENCRYPT)) - swiotlb = 1; - - return swiotlb; -} -IOMMU_INIT(pci_swiotlb_detect_4gb, - pci_swiotlb_detect_override, - pci_swiotlb_init, - pci_swiotlb_late_init); - -void __init pci_swiotlb_init(void) -{ - if (swiotlb) - swiotlb_init(0); -} - -void __init pci_swiotlb_late_init(void) -{ - /* An IOMMU turned us off. */ - if (!swiotlb) - swiotlb_exit(); - else { - printk(KERN_INFO "PCI-DMA: " - "Using software bounce buffering for IO (SWIOTLB)\n"); - swiotlb_print_info(); - } -} diff --git a/arch/x86/kernel/pmem.c b/arch/x86/kernel/pmem.c index 6b07faaa1579..23154d24b117 100644 --- a/arch/x86/kernel/pmem.c +++ b/arch/x86/kernel/pmem.c @@ -27,6 +27,11 @@ static __init int register_e820_pmem(void) * simply here to trigger the module to load on demand. */ pdev = platform_device_alloc("e820_pmem", -1); - return platform_device_add(pdev); + + rc = platform_device_add(pdev); + if (rc) + platform_device_put(pdev); + + return rc; } device_initcall(register_e820_pmem); diff --git a/arch/x86/kernel/probe_roms.c b/arch/x86/kernel/probe_roms.c index 36e84d904260..319fef37d9dc 100644 --- a/arch/x86/kernel/probe_roms.c +++ b/arch/x86/kernel/probe_roms.c @@ -21,6 +21,7 @@ #include <asm/sections.h> #include <asm/io.h> #include <asm/setup_arch.h> +#include <asm/sev.h> static struct resource system_rom_resource = { .name = "System ROM", @@ -197,11 +198,21 @@ static int __init romchecksum(const unsigned char *rom, unsigned long length) void __init probe_roms(void) { - const unsigned char *rom; unsigned long start, length, upper; + const unsigned char *rom; unsigned char c; int i; + /* + * The ROM memory range is not part of the e820 table and is therefore not + * pre-validated by BIOS. The kernel page table maps the ROM region as encrypted + * memory, and SNP requires encrypted memory to be validated before access. + * Do that here. + */ + snp_prep_memory(video_rom_resource.start, + ((system_rom_resource.end + 1) - video_rom_resource.start), + SNP_PAGE_STATE_PRIVATE); + /* video rom */ upper = adapter_rom_resources[0].start; for (start = video_rom_resource.start; start < upper; start += 2048) { diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c index 04143a653a8a..c21b7347a26d 100644 --- a/arch/x86/kernel/process.c +++ b/arch/x86/kernel/process.c @@ -46,6 +46,7 @@ #include <asm/proto.h> #include <asm/frame.h> #include <asm/unwind.h> +#include <asm/tdx.h> #include "process.h" @@ -130,9 +131,11 @@ static int set_new_tls(struct task_struct *p, unsigned long tls) return do_set_thread_area_64(p, ARCH_SET_FS, tls); } -int copy_thread(unsigned long clone_flags, unsigned long sp, unsigned long arg, - struct task_struct *p, unsigned long tls) +int copy_thread(struct task_struct *p, const struct kernel_clone_args *args) { + unsigned long clone_flags = args->flags; + unsigned long sp = args->stack; + unsigned long tls = args->tls; struct inactive_task_frame *frame; struct fork_frame *fork_frame; struct pt_regs *childregs; @@ -160,6 +163,7 @@ int copy_thread(unsigned long clone_flags, unsigned long sp, unsigned long arg, savesegment(ds, p->thread.ds); #else p->thread.sp0 = (unsigned long) (childregs + 1); + savesegment(gs, p->thread.gs); /* * Clear all status flags including IF and set fixed bit. 64bit * does not have this initialization as the frame does not contain @@ -169,13 +173,13 @@ int copy_thread(unsigned long clone_flags, unsigned long sp, unsigned long arg, frame->flags = X86_EFLAGS_FIXED; #endif - fpu_clone(p, clone_flags); + fpu_clone(p, clone_flags, args->fn); /* Kernel thread ? */ if (unlikely(p->flags & PF_KTHREAD)) { p->thread.pkru = pkru_get_init_value(); memset(childregs, 0, sizeof(struct pt_regs)); - kthread_frame_init(frame, sp, arg); + kthread_frame_init(frame, args->fn, args->fn_arg); return 0; } @@ -191,14 +195,10 @@ int copy_thread(unsigned long clone_flags, unsigned long sp, unsigned long arg, if (sp) childregs->sp = sp; -#ifdef CONFIG_X86_32 - task_user_gs(p) = get_user_gs(current_pt_regs()); -#endif - - if (unlikely(p->flags & PF_IO_WORKER)) { + if (unlikely(args->fn)) { /* - * An IO thread is a user space thread, but it doesn't - * return to ret_after_fork(). + * A user space thread, but it doesn't return to + * ret_after_fork(). * * In order to indicate that to tools like gdb, * we reset the stack and instruction pointers. @@ -208,7 +208,7 @@ int copy_thread(unsigned long clone_flags, unsigned long sp, unsigned long arg, */ childregs->sp = 0; childregs->ip = 0; - kthread_frame_init(frame, sp, arg); + kthread_frame_init(frame, args->fn, args->fn_arg); return 0; } @@ -334,7 +334,7 @@ static int get_cpuid_mode(void) return !test_thread_flag(TIF_NOCPUID); } -static int set_cpuid_mode(struct task_struct *task, unsigned long cpuid_enabled) +static int set_cpuid_mode(unsigned long cpuid_enabled) { if (!boot_cpu_has(X86_FEATURE_CPUID_FAULT)) return -ENODEV; @@ -365,7 +365,7 @@ void arch_setup_new_exec(void) clear_thread_flag(TIF_SSBD); task_clear_spec_ssb_disable(current); task_clear_spec_ssb_noexec(current); - speculation_ctrl_update(task_thread_info(current)->flags); + speculation_ctrl_update(read_thread_flags()); } } @@ -405,7 +405,7 @@ static void tss_copy_io_bitmap(struct tss_struct *tss, struct io_bitmap *iobm) } /** - * tss_update_io_bitmap - Update I/O bitmap before exiting to usermode + * native_tss_update_io_bitmap - Update I/O bitmap before exiting to user mode */ void native_tss_update_io_bitmap(void) { @@ -600,7 +600,7 @@ static __always_inline void __speculation_ctrl_update(unsigned long tifp, } if (updmsr) - wrmsrl(MSR_IA32_SPEC_CTRL, msr); + write_spec_ctrl_current(msr, false); } static unsigned long speculation_ctrl_update_tif(struct task_struct *tsk) @@ -617,7 +617,7 @@ static unsigned long speculation_ctrl_update_tif(struct task_struct *tsk) clear_tsk_thread_flag(tsk, TIF_SPEC_IB); } /* Return the updated threadinfo flags*/ - return task_thread_info(tsk)->flags; + return read_task_thread_flags(tsk); } void speculation_ctrl_update(unsigned long tif) @@ -653,8 +653,8 @@ void __switch_to_xtra(struct task_struct *prev_p, struct task_struct *next_p) { unsigned long tifp, tifn; - tifn = READ_ONCE(task_thread_info(next_p)->flags); - tifp = READ_ONCE(task_thread_info(prev_p)->flags); + tifn = read_task_thread_flags(next_p); + tifp = read_task_thread_flags(prev_p); switch_to_bitmap(tifp); @@ -686,9 +686,6 @@ void __switch_to_xtra(struct task_struct *prev_p, struct task_struct *next_p) /* Enforce MSR update to ensure consistent state */ __speculation_ctrl_update(~tifn, tifn); } - - if ((tifp ^ tifn) & _TIF_SLD) - switch_to_sld(tifn); } /* @@ -747,7 +744,7 @@ bool xen_set_default_idle(void) } #endif -void stop_this_cpu(void *dummy) +void __noreturn stop_this_cpu(void *dummy) { local_irq_disable(); /* @@ -765,8 +762,11 @@ void stop_this_cpu(void *dummy) * without the encryption bit, they don't race each other when flushed * and potentially end up with the wrong entry being committed to * memory. + * + * Test the CPUID bit directly because the machine might've cleared + * X86_FEATURE_SME due to cmdline options. */ - if (boot_cpu_has(X86_FEATURE_SME)) + if (cpuid_eax(0x8000001f) & BIT(0)) native_wbinvd(); for (;;) { /* @@ -810,24 +810,43 @@ static void amd_e400_idle(void) } /* - * Intel Core2 and older machines prefer MWAIT over HALT for C1. - * We can't rely on cpuidle installing MWAIT, because it will not load - * on systems that support only C1 -- so the boot default must be MWAIT. - * - * Some AMD machines are the opposite, they depend on using HALT. + * Prefer MWAIT over HALT if MWAIT is supported, MWAIT_CPUID leaf + * exists and whenever MONITOR/MWAIT extensions are present there is at + * least one C1 substate. * - * So for default C1, which is used during boot until cpuidle loads, - * use MWAIT-C1 on Intel HW that has it, else use HALT. + * Do not prefer MWAIT if MONITOR instruction has a bug or idle=nomwait + * is passed to kernel commandline parameter. */ static int prefer_mwait_c1_over_halt(const struct cpuinfo_x86 *c) { - if (c->x86_vendor != X86_VENDOR_INTEL) + u32 eax, ebx, ecx, edx; + + /* User has disallowed the use of MWAIT. Fallback to HALT */ + if (boot_option_idle_override == IDLE_NOMWAIT) return 0; - if (!cpu_has(c, X86_FEATURE_MWAIT) || boot_cpu_has_bug(X86_BUG_MONITOR)) + /* MWAIT is not supported on this platform. Fallback to HALT */ + if (!cpu_has(c, X86_FEATURE_MWAIT)) return 0; - return 1; + /* Monitor has a bug. Fallback to HALT */ + if (boot_cpu_has_bug(X86_BUG_MONITOR)) + return 0; + + cpuid(CPUID_MWAIT_LEAF, &eax, &ebx, &ecx, &edx); + + /* + * If MWAIT extensions are not available, it is safe to use MWAIT + * with EAX=0, ECX=0. + */ + if (!(ecx & CPUID5_ECX_EXTENSIONS_SUPPORTED)) + return 1; + + /* + * If MWAIT extensions are available, there should be at least one + * MWAIT C1 substate present. + */ + return (edx & MWAIT_C1_SUBSTATE_MASK); } /* @@ -870,6 +889,9 @@ void select_idle_routine(const struct cpuinfo_x86 *c) } else if (prefer_mwait_c1_over_halt(c)) { pr_info("using mwait in idle threads\n"); x86_idle = mwait_idle; + } else if (cpu_feature_enabled(X86_FEATURE_TDX_GUEST)) { + pr_info("using TDX aware idle routine\n"); + x86_idle = tdx_safe_halt; } else x86_idle = default_idle; } @@ -929,9 +951,8 @@ static int __init idle_setup(char *str) } else if (!strcmp(str, "nomwait")) { /* * If the boot option of "idle=nomwait" is added, - * it means that mwait will be disabled for CPU C2/C3 - * states. In such case it won't touch the variable - * of boot_option_idle_override. + * it means that mwait will be disabled for CPU C1/C2/C3 + * states. */ boot_option_idle_override = IDLE_NOMWAIT; } else @@ -944,7 +965,7 @@ early_param("idle", idle_setup); unsigned long arch_align_stack(unsigned long sp) { if (!(current->personality & ADDR_NO_RANDOMIZE) && randomize_va_space) - sp -= get_random_int() % 8192; + sp -= prandom_u32_max(8192); return sp & ~0xf; } @@ -982,18 +1003,19 @@ unsigned long __get_wchan(struct task_struct *p) return addr; } -long do_arch_prctl_common(struct task_struct *task, int option, - unsigned long arg2) +long do_arch_prctl_common(int option, unsigned long arg2) { switch (option) { case ARCH_GET_CPUID: return get_cpuid_mode(); case ARCH_SET_CPUID: - return set_cpuid_mode(task, arg2); + return set_cpuid_mode(arg2); case ARCH_GET_XCOMP_SUPP: case ARCH_GET_XCOMP_PERM: case ARCH_REQ_XCOMP_PERM: - return fpu_xstate_prctl(task, option, arg2); + case ARCH_GET_XCOMP_GUEST_PERM: + case ARCH_REQ_XCOMP_GUEST_PERM: + return fpu_xstate_prctl(option, arg2); } return -EINVAL; diff --git a/arch/x86/kernel/process.h b/arch/x86/kernel/process.h index 1d0797b2338a..76b547b83232 100644 --- a/arch/x86/kernel/process.h +++ b/arch/x86/kernel/process.h @@ -13,8 +13,8 @@ void __switch_to_xtra(struct task_struct *prev_p, struct task_struct *next_p); static inline void switch_to_extra(struct task_struct *prev, struct task_struct *next) { - unsigned long next_tif = task_thread_info(next)->flags; - unsigned long prev_tif = task_thread_info(prev)->flags; + unsigned long next_tif = read_task_thread_flags(next); + unsigned long prev_tif = read_task_thread_flags(prev); if (IS_ENABLED(CONFIG_SMP)) { /* diff --git a/arch/x86/kernel/process_32.c b/arch/x86/kernel/process_32.c index 26edb1cd07a4..2f314b170c9f 100644 --- a/arch/x86/kernel/process_32.c +++ b/arch/x86/kernel/process_32.c @@ -63,10 +63,7 @@ void __show_regs(struct pt_regs *regs, enum show_regs_mode mode, unsigned long d0, d1, d2, d3, d6, d7; unsigned short gs; - if (user_mode(regs)) - gs = get_user_gs(regs); - else - savesegment(gs, gs); + savesegment(gs, gs); show_ip(regs, log_lvl); @@ -114,7 +111,7 @@ void release_thread(struct task_struct *dead_task) void start_thread(struct pt_regs *regs, unsigned long new_ip, unsigned long new_sp) { - set_user_gs(regs, 0); + loadsegment(gs, 0); regs->fs = 0; regs->ds = __USER_DS; regs->es = __USER_DS; @@ -177,7 +174,7 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p) * used %fs or %gs (it does not today), or if the kernel is * running inside of a hypervisor layer. */ - lazy_save_gs(prev->gs); + savesegment(gs, prev->gs); /* * Load the per-thread Thread-Local Storage descriptor. @@ -208,7 +205,7 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p) * Restore %gs if needed (which is common) */ if (prev->gs | next->gs) - lazy_load_gs(next->gs); + loadsegment(gs, next->gs); this_cpu_write(current_task, next_p); @@ -222,5 +219,5 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p) SYSCALL_DEFINE2(arch_prctl, int, option, unsigned long, arg2) { - return do_arch_prctl_common(current, option, arg2); + return do_arch_prctl_common(option, arg2); } diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c index 3402edec236c..6b3418bff326 100644 --- a/arch/x86/kernel/process_64.c +++ b/arch/x86/kernel/process_64.c @@ -553,6 +553,7 @@ void compat_start_thread(struct pt_regs *regs, u32 new_ip, u32 new_sp, bool x32) * Kprobes not supported here. Set the probe on schedule instead. * Function graph tracer not supported too. */ +__no_kmsan_checks __visible __notrace_funcgraph struct task_struct * __switch_to(struct task_struct *prev_p, struct task_struct *next_p) { @@ -681,7 +682,7 @@ void set_personality_64bit(void) static void __set_personality_x32(void) { -#ifdef CONFIG_X86_X32 +#ifdef CONFIG_X86_X32_ABI if (current->mm) current->mm->context.flags = 0; @@ -844,7 +845,7 @@ SYSCALL_DEFINE2(arch_prctl, int, option, unsigned long, arg2) ret = do_arch_prctl_64(current, option, arg2); if (ret == -EINVAL) - ret = do_arch_prctl_common(current, option, arg2); + ret = do_arch_prctl_common(option, arg2); return ret; } @@ -852,7 +853,7 @@ SYSCALL_DEFINE2(arch_prctl, int, option, unsigned long, arg2) #ifdef CONFIG_IA32_EMULATION COMPAT_SYSCALL_DEFINE2(arch_prctl, int, option, unsigned long, arg2) { - return do_arch_prctl_common(current, option, arg2); + return do_arch_prctl_common(option, arg2); } #endif diff --git a/arch/x86/kernel/ptrace.c b/arch/x86/kernel/ptrace.c index 6d2244c94799..37c12fb92906 100644 --- a/arch/x86/kernel/ptrace.c +++ b/arch/x86/kernel/ptrace.c @@ -13,7 +13,6 @@ #include <linux/errno.h> #include <linux/slab.h> #include <linux/ptrace.h> -#include <linux/tracehook.h> #include <linux/user.h> #include <linux/elf.h> #include <linux/security.h> @@ -171,9 +170,9 @@ static u16 get_segment_reg(struct task_struct *task, unsigned long offset) retval = *pt_regs_access(task_pt_regs(task), offset); else { if (task == current) - retval = get_user_gs(task_pt_regs(task)); + savesegment(gs, retval); else - retval = task_user_gs(task); + retval = task->thread.gs; } return retval; } @@ -211,7 +210,7 @@ static int set_segment_reg(struct task_struct *task, break; case offsetof(struct user_regs_struct, gs): - task_user_gs(task) = value; + task->thread.gs = value; } return 0; @@ -1224,7 +1223,7 @@ static struct user_regset x86_64_regsets[] __ro_after_init = { }, [REGSET_FP] = { .core_note_type = NT_PRFPREG, - .n = sizeof(struct user_i387_struct) / sizeof(long), + .n = sizeof(struct fxregs_state) / sizeof(long), .size = sizeof(long), .align = sizeof(long), .active = regset_xregset_fpregs_active, .regset_get = xfpregs_get, .set = xfpregs_set }, @@ -1271,7 +1270,7 @@ static struct user_regset x86_32_regsets[] __ro_after_init = { }, [REGSET_XFP] = { .core_note_type = NT_PRXFPREG, - .n = sizeof(struct user32_fxsr_struct) / sizeof(u32), + .n = sizeof(struct fxregs_state) / sizeof(u32), .size = sizeof(u32), .align = sizeof(u32), .active = regset_xregset_fpregs_active, .regset_get = xfpregs_get, .set = xfpregs_set }, diff --git a/arch/x86/kernel/reboot.c b/arch/x86/kernel/reboot.c index 0a40df66a40d..c3636ea4aa71 100644 --- a/arch/x86/kernel/reboot.c +++ b/arch/x86/kernel/reboot.c @@ -113,17 +113,9 @@ void __noreturn machine_real_restart(unsigned int type) spin_unlock(&rtc_lock); /* - * Switch back to the initial page table. + * Switch to the trampoline page table. */ -#ifdef CONFIG_X86_32 - load_cr3(initial_page_table); -#else - write_cr3(real_mode_header->trampoline_pgd); - - /* Exiting long mode will fail if CR4.PCIDE is set. */ - if (boot_cpu_has(X86_FEATURE_PCID)) - cr4_clear_bits(X86_CR4_PCIDE); -#endif + load_trampoline_pgtable(); /* Jump to the identity-mapped low memory code */ #ifdef CONFIG_X86_32 @@ -747,10 +739,10 @@ static void native_machine_halt(void) static void native_machine_power_off(void) { - if (pm_power_off) { + if (kernel_can_power_off()) { if (!reboot_force) machine_shutdown(); - pm_power_off(); + do_kernel_power_off(); } /* A fallback in case there is no PM info available */ tboot_shutdown(TB_SHUTDOWN_HALT); diff --git a/arch/x86/kernel/relocate_kernel_32.S b/arch/x86/kernel/relocate_kernel_32.S index f469153eca8a..c7c4b1917336 100644 --- a/arch/x86/kernel/relocate_kernel_32.S +++ b/arch/x86/kernel/relocate_kernel_32.S @@ -7,10 +7,12 @@ #include <linux/linkage.h> #include <asm/page_types.h> #include <asm/kexec.h> +#include <asm/nospec-branch.h> #include <asm/processor-flags.h> /* - * Must be relocatable PIC code callable as a C function + * Must be relocatable PIC code callable as a C function, in particular + * there must be a plain RET and not jump to return thunk. */ #define PTR(x) (x << 2) @@ -91,7 +93,9 @@ SYM_CODE_START_NOALIGN(relocate_kernel) movl %edi, %eax addl $(identity_mapped - relocate_kernel), %eax pushl %eax + ANNOTATE_UNRET_SAFE ret + int3 SYM_CODE_END(relocate_kernel) SYM_CODE_START_LOCAL_NOALIGN(identity_mapped) @@ -159,12 +163,15 @@ SYM_CODE_START_LOCAL_NOALIGN(identity_mapped) xorl %edx, %edx xorl %esi, %esi xorl %ebp, %ebp + ANNOTATE_UNRET_SAFE ret + int3 1: popl %edx movl CP_PA_SWAP_PAGE(%edi), %esp addl $PAGE_SIZE, %esp 2: + ANNOTATE_RETPOLINE_SAFE call *%edx /* get the re-entry point of the peer system */ @@ -190,7 +197,9 @@ SYM_CODE_START_LOCAL_NOALIGN(identity_mapped) movl %edi, %eax addl $(virtual_mapped - relocate_kernel), %eax pushl %eax + ANNOTATE_UNRET_SAFE ret + int3 SYM_CODE_END(identity_mapped) SYM_CODE_START_LOCAL_NOALIGN(virtual_mapped) @@ -208,7 +217,9 @@ SYM_CODE_START_LOCAL_NOALIGN(virtual_mapped) popl %edi popl %esi popl %ebx + ANNOTATE_UNRET_SAFE ret + int3 SYM_CODE_END(virtual_mapped) /* Do the copies */ @@ -271,7 +282,9 @@ SYM_CODE_START_LOCAL_NOALIGN(swap_pages) popl %edi popl %ebx popl %ebp + ANNOTATE_UNRET_SAFE ret + int3 SYM_CODE_END(swap_pages) .globl kexec_control_code_size diff --git a/arch/x86/kernel/relocate_kernel_64.S b/arch/x86/kernel/relocate_kernel_64.S index c8fe74a28143..4809c0dc4eb0 100644 --- a/arch/x86/kernel/relocate_kernel_64.S +++ b/arch/x86/kernel/relocate_kernel_64.S @@ -13,7 +13,8 @@ #include <asm/unwind_hints.h> /* - * Must be relocatable PIC code callable as a C function + * Must be relocatable PIC code callable as a C function, in particular + * there must be a plain RET and not jump to return thunk. */ #define PTR(x) (x << 3) @@ -42,6 +43,7 @@ .code64 SYM_CODE_START_NOALIGN(relocate_kernel) UNWIND_HINT_EMPTY + ANNOTATE_NOENDBR /* * %rdi indirection_page * %rsi page_list @@ -104,7 +106,9 @@ SYM_CODE_START_NOALIGN(relocate_kernel) /* jump to identity mapped page */ addq $(identity_mapped - relocate_kernel), %r8 pushq %r8 + ANNOTATE_UNRET_SAFE ret + int3 SYM_CODE_END(relocate_kernel) SYM_CODE_START_LOCAL_NOALIGN(identity_mapped) @@ -115,6 +119,14 @@ SYM_CODE_START_LOCAL_NOALIGN(identity_mapped) pushq %rdx /* + * Clear X86_CR4_CET (if it was set) such that we can clear CR0_WP + * below. + */ + movq %cr4, %rax + andq $~(X86_CR4_CET), %rax + movq %rax, %cr4 + + /* * Set cr0 to a known state: * - Paging enabled * - Alignment check disabled @@ -191,7 +203,9 @@ SYM_CODE_START_LOCAL_NOALIGN(identity_mapped) xorl %r14d, %r14d xorl %r15d, %r15d + ANNOTATE_UNRET_SAFE ret + int3 1: popq %rdx @@ -210,11 +224,14 @@ SYM_CODE_START_LOCAL_NOALIGN(identity_mapped) call swap_pages movq $virtual_mapped, %rax pushq %rax + ANNOTATE_UNRET_SAFE ret + int3 SYM_CODE_END(identity_mapped) SYM_CODE_START_LOCAL_NOALIGN(virtual_mapped) UNWIND_HINT_EMPTY + ANNOTATE_NOENDBR // RET target, above movq RSP(%r8), %rsp movq CR4(%r8), %rax movq %rax, %cr4 @@ -231,7 +248,9 @@ SYM_CODE_START_LOCAL_NOALIGN(virtual_mapped) popq %r12 popq %rbp popq %rbx + ANNOTATE_UNRET_SAFE ret + int3 SYM_CODE_END(virtual_mapped) /* Do the copies */ @@ -288,7 +307,9 @@ SYM_CODE_START_LOCAL_NOALIGN(swap_pages) lea PAGE_SIZE(%rax), %rsi jmp 0b 3: + ANNOTATE_UNRET_SAFE ret + int3 SYM_CODE_END(swap_pages) .globl kexec_control_code_size diff --git a/arch/x86/kernel/resource.c b/arch/x86/kernel/resource.c index 9b9fb7882c20..bba1abd05bfe 100644 --- a/arch/x86/kernel/resource.c +++ b/arch/x86/kernel/resource.c @@ -1,6 +1,8 @@ // SPDX-License-Identifier: GPL-2.0 #include <linux/ioport.h> +#include <linux/printk.h> #include <asm/e820/api.h> +#include <asm/pci_x86.h> static void resource_clip(struct resource *res, resource_size_t start, resource_size_t end) @@ -27,12 +29,23 @@ static void remove_e820_regions(struct resource *avail) { int i; struct e820_entry *entry; + u64 e820_start, e820_end; + struct resource orig = *avail; + + if (!pci_use_e820) + return; for (i = 0; i < e820_table->nr_entries; i++) { entry = &e820_table->entries[i]; + e820_start = entry->addr; + e820_end = entry->addr + entry->size - 1; - resource_clip(avail, entry->addr, - entry->addr + entry->size - 1); + resource_clip(avail, e820_start, e820_end); + if (orig.start != avail->start || orig.end != avail->end) { + pr_info("clipped %pR to %pR for e820 entry [mem %#010Lx-%#010Lx]\n", + &orig, avail, e820_start, e820_end); + orig = *avail; + } } } diff --git a/arch/x86/kernel/rethook.c b/arch/x86/kernel/rethook.c new file mode 100644 index 000000000000..8a1c0111ae79 --- /dev/null +++ b/arch/x86/kernel/rethook.c @@ -0,0 +1,127 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * x86 implementation of rethook. Mostly copied from arch/x86/kernel/kprobes/core.c. + */ +#include <linux/bug.h> +#include <linux/rethook.h> +#include <linux/kprobes.h> +#include <linux/objtool.h> + +#include "kprobes/common.h" + +__visible void arch_rethook_trampoline_callback(struct pt_regs *regs); + +#ifndef ANNOTATE_NOENDBR +#define ANNOTATE_NOENDBR +#endif + +/* + * When a target function returns, this code saves registers and calls + * arch_rethook_trampoline_callback(), which calls the rethook handler. + */ +asm( + ".text\n" + ".global arch_rethook_trampoline\n" + ".type arch_rethook_trampoline, @function\n" + "arch_rethook_trampoline:\n" +#ifdef CONFIG_X86_64 + ANNOTATE_NOENDBR /* This is only jumped from ret instruction */ + /* Push a fake return address to tell the unwinder it's a rethook. */ + " pushq $arch_rethook_trampoline\n" + UNWIND_HINT_FUNC + " pushq $" __stringify(__KERNEL_DS) "\n" + /* Save the 'sp - 16', this will be fixed later. */ + " pushq %rsp\n" + " pushfq\n" + SAVE_REGS_STRING + " movq %rsp, %rdi\n" + " call arch_rethook_trampoline_callback\n" + RESTORE_REGS_STRING + /* In the callback function, 'regs->flags' is copied to 'regs->ss'. */ + " addq $16, %rsp\n" + " popfq\n" +#else + /* Push a fake return address to tell the unwinder it's a rethook. */ + " pushl $arch_rethook_trampoline\n" + UNWIND_HINT_FUNC + " pushl %ss\n" + /* Save the 'sp - 8', this will be fixed later. */ + " pushl %esp\n" + " pushfl\n" + SAVE_REGS_STRING + " movl %esp, %eax\n" + " call arch_rethook_trampoline_callback\n" + RESTORE_REGS_STRING + /* In the callback function, 'regs->flags' is copied to 'regs->ss'. */ + " addl $8, %esp\n" + " popfl\n" +#endif + ASM_RET + ".size arch_rethook_trampoline, .-arch_rethook_trampoline\n" +); +NOKPROBE_SYMBOL(arch_rethook_trampoline); + +/* + * Called from arch_rethook_trampoline + */ +__used __visible void arch_rethook_trampoline_callback(struct pt_regs *regs) +{ + unsigned long *frame_pointer; + + /* fixup registers */ + regs->cs = __KERNEL_CS; +#ifdef CONFIG_X86_32 + regs->gs = 0; +#endif + regs->ip = (unsigned long)&arch_rethook_trampoline; + regs->orig_ax = ~0UL; + regs->sp += 2*sizeof(long); + frame_pointer = (long *)(regs + 1); + + /* + * The return address at 'frame_pointer' is recovered by the + * arch_rethook_fixup_return() which called from this + * rethook_trampoline_handler(). + */ + rethook_trampoline_handler(regs, (unsigned long)frame_pointer); + + /* + * Copy FLAGS to 'pt_regs::ss' so that arch_rethook_trapmoline() + * can do RET right after POPF. + */ + *(unsigned long *)®s->ss = regs->flags; +} +NOKPROBE_SYMBOL(arch_rethook_trampoline_callback); + +/* + * arch_rethook_trampoline() skips updating frame pointer. The frame pointer + * saved in arch_rethook_trampoline_callback() points to the real caller + * function's frame pointer. Thus the arch_rethook_trampoline() doesn't have + * a standard stack frame with CONFIG_FRAME_POINTER=y. + * Let's mark it non-standard function. Anyway, FP unwinder can correctly + * unwind without the hint. + */ +STACK_FRAME_NON_STANDARD_FP(arch_rethook_trampoline); + +/* This is called from rethook_trampoline_handler(). */ +void arch_rethook_fixup_return(struct pt_regs *regs, + unsigned long correct_ret_addr) +{ + unsigned long *frame_pointer = (void *)(regs + 1); + + /* Replace fake return address with real one. */ + *frame_pointer = correct_ret_addr; +} +NOKPROBE_SYMBOL(arch_rethook_fixup_return); + +void arch_rethook_prepare(struct rethook_node *rh, struct pt_regs *regs, bool mcount) +{ + unsigned long *stack = (unsigned long *)regs->sp; + + rh->ret_addr = stack[0]; + rh->frame = regs->sp; + + /* Replace the return addr with trampoline addr */ + stack[0] = (unsigned long) arch_rethook_trampoline; +} +NOKPROBE_SYMBOL(arch_rethook_prepare); diff --git a/arch/x86/kernel/rtc.c b/arch/x86/kernel/rtc.c index 586f718b8e95..349046434513 100644 --- a/arch/x86/kernel/rtc.c +++ b/arch/x86/kernel/rtc.c @@ -4,11 +4,8 @@ */ #include <linux/platform_device.h> #include <linux/mc146818rtc.h> -#include <linux/acpi.h> -#include <linux/bcd.h> #include <linux/export.h> #include <linux/pnp.h> -#include <linux/of.h> #include <asm/vsyscall.h> #include <asm/x86_init.h> @@ -20,26 +17,23 @@ /* * This is a special lock that is owned by the CPU and holds the index * register we are working with. It is required for NMI access to the - * CMOS/RTC registers. See include/asm-i386/mc146818rtc.h for details. + * CMOS/RTC registers. See arch/x86/include/asm/mc146818rtc.h for details. */ volatile unsigned long cmos_lock; EXPORT_SYMBOL(cmos_lock); #endif /* CONFIG_X86_32 */ -/* For two digit years assume time is always after that */ -#define CMOS_YEARS_OFFS 2000 - DEFINE_SPINLOCK(rtc_lock); EXPORT_SYMBOL(rtc_lock); /* - * In order to set the CMOS clock precisely, set_rtc_mmss has to be + * In order to set the CMOS clock precisely, mach_set_cmos_time has to be * called 500 ms after the second nowtime has started, because when * nowtime is written into the registers of the CMOS clock, it will * jump to the next second precisely 500 ms later. Check the Motorola * MC146818A or Dallas DS12887 data sheet for details. */ -int mach_set_rtc_mmss(const struct timespec64 *now) +int mach_set_cmos_time(const struct timespec64 *now) { unsigned long long nowtime = now->tv_sec; struct rtc_time tm; @@ -62,8 +56,7 @@ int mach_set_rtc_mmss(const struct timespec64 *now) void mach_get_cmos_time(struct timespec64 *now) { - unsigned int status, year, mon, day, hour, min, sec, century = 0; - unsigned long flags; + struct rtc_time tm; /* * If pm_trace abused the RTC as storage, set the timespec to 0, @@ -74,51 +67,13 @@ void mach_get_cmos_time(struct timespec64 *now) return; } - spin_lock_irqsave(&rtc_lock, flags); - - /* - * If UIP is clear, then we have >= 244 microseconds before - * RTC registers will be updated. Spec sheet says that this - * is the reliable way to read RTC - registers. If UIP is set - * then the register access might be invalid. - */ - while ((CMOS_READ(RTC_FREQ_SELECT) & RTC_UIP)) - cpu_relax(); - - sec = CMOS_READ(RTC_SECONDS); - min = CMOS_READ(RTC_MINUTES); - hour = CMOS_READ(RTC_HOURS); - day = CMOS_READ(RTC_DAY_OF_MONTH); - mon = CMOS_READ(RTC_MONTH); - year = CMOS_READ(RTC_YEAR); - -#ifdef CONFIG_ACPI - if (acpi_gbl_FADT.header.revision >= FADT2_REVISION_ID && - acpi_gbl_FADT.century) - century = CMOS_READ(acpi_gbl_FADT.century); -#endif - - status = CMOS_READ(RTC_CONTROL); - WARN_ON_ONCE(RTC_ALWAYS_BCD && (status & RTC_DM_BINARY)); - - spin_unlock_irqrestore(&rtc_lock, flags); - - if (RTC_ALWAYS_BCD || !(status & RTC_DM_BINARY)) { - sec = bcd2bin(sec); - min = bcd2bin(min); - hour = bcd2bin(hour); - day = bcd2bin(day); - mon = bcd2bin(mon); - year = bcd2bin(year); + if (mc146818_get_time(&tm)) { + pr_err("Unable to read current time from RTC\n"); + now->tv_sec = now->tv_nsec = 0; + return; } - if (century) { - century = bcd2bin(century); - year += century * 100; - } else - year += CMOS_YEARS_OFFS; - - now->tv_sec = mktime64(year, mon, day, hour, min, sec); + now->tv_sec = rtc_tm_to_time64(&tm); now->tv_nsec = 0; } diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c index 6a190c7f4d71..216fee7144ee 100644 --- a/arch/x86/kernel/setup.c +++ b/arch/x86/kernel/setup.c @@ -11,6 +11,7 @@ #include <linux/dma-map-ops.h> #include <linux/dmi.h> #include <linux/efi.h> +#include <linux/ima.h> #include <linux/init_ohci1394_dma.h> #include <linux/initrd.h> #include <linux/iscsi_ibft.h> @@ -23,6 +24,7 @@ #include <linux/usb/xhci-dbgp.h> #include <linux/static_call.h> #include <linux/swiotlb.h> +#include <linux/random.h> #include <uapi/linux/mount.h> @@ -40,6 +42,7 @@ #include <asm/kasan.h> #include <asm/kaslr.h> #include <asm/mce.h> +#include <asm/memtype.h> #include <asm/mtrr.h> #include <asm/realmode.h> #include <asm/olpc_ofw.h> @@ -66,11 +69,6 @@ RESERVE_BRK(dmi_alloc, 65536); #endif -/* - * Range of the BSS area. The size of the BSS area is determined - * at link time, with RESERVE_BRK() facility reserving additional - * chunks. - */ unsigned long _brk_start = (unsigned long)__brk_base; unsigned long _brk_end = (unsigned long)__brk_base; @@ -144,6 +142,11 @@ __visible unsigned long mmu_cr4_features __ro_after_init; __visible unsigned long mmu_cr4_features __ro_after_init = X86_CR4_PAE; #endif +#ifdef CONFIG_IMA +static phys_addr_t ima_kexec_buffer_phys; +static size_t ima_kexec_buffer_size; +#endif + /* Boot loader ID and version as integers, for the benefit of proc_dointvec */ int bootloader_type, bootloader_version; @@ -334,6 +337,60 @@ static void __init reserve_initrd(void) } #endif /* CONFIG_BLK_DEV_INITRD */ +static void __init add_early_ima_buffer(u64 phys_addr) +{ +#ifdef CONFIG_IMA + struct ima_setup_data *data; + + data = early_memremap(phys_addr + sizeof(struct setup_data), sizeof(*data)); + if (!data) { + pr_warn("setup: failed to memremap ima_setup_data entry\n"); + return; + } + + if (data->size) { + memblock_reserve(data->addr, data->size); + ima_kexec_buffer_phys = data->addr; + ima_kexec_buffer_size = data->size; + } + + early_memunmap(data, sizeof(*data)); +#else + pr_warn("Passed IMA kexec data, but CONFIG_IMA not set. Ignoring.\n"); +#endif +} + +#if defined(CONFIG_HAVE_IMA_KEXEC) && !defined(CONFIG_OF_FLATTREE) +int __init ima_free_kexec_buffer(void) +{ + int rc; + + if (!ima_kexec_buffer_size) + return -ENOENT; + + rc = memblock_phys_free(ima_kexec_buffer_phys, + ima_kexec_buffer_size); + if (rc) + return rc; + + ima_kexec_buffer_phys = 0; + ima_kexec_buffer_size = 0; + + return 0; +} + +int __init ima_get_kexec_buffer(void **addr, size_t *size) +{ + if (!ima_kexec_buffer_size) + return -ENOENT; + + *addr = __va(ima_kexec_buffer_phys); + *size = ima_kexec_buffer_size; + + return 0; +} +#endif + static void __init parse_setup_data(void) { struct setup_data *data; @@ -359,6 +416,18 @@ static void __init parse_setup_data(void) case SETUP_EFI: parse_efi_setup(pa_data, data_len); break; + case SETUP_IMA: + add_early_ima_buffer(pa_data); + break; + case SETUP_RNG_SEED: + data = early_memremap(pa_data, data_len); + add_bootloader_randomness(data->data, data->len); + /* Zero seed for forward secrecy. */ + memzero_explicit(data->data, data->len); + /* Zero length in case we find ourselves back here by accident. */ + memzero_explicit(&data->len, sizeof(data->len)); + early_memunmap(data, data_len); + break; default: break; } @@ -368,21 +437,41 @@ static void __init parse_setup_data(void) static void __init memblock_x86_reserve_range_setup_data(void) { + struct setup_indirect *indirect; struct setup_data *data; - u64 pa_data; + u64 pa_data, pa_next; + u32 len; pa_data = boot_params.hdr.setup_data; while (pa_data) { data = early_memremap(pa_data, sizeof(*data)); + if (!data) { + pr_warn("setup: failed to memremap setup_data entry\n"); + return; + } + + len = sizeof(*data); + pa_next = data->next; + memblock_reserve(pa_data, sizeof(*data) + data->len); - if (data->type == SETUP_INDIRECT && - ((struct setup_indirect *)data->data)->type != SETUP_INDIRECT) - memblock_reserve(((struct setup_indirect *)data->data)->addr, - ((struct setup_indirect *)data->data)->len); + if (data->type == SETUP_INDIRECT) { + len += data->len; + early_memunmap(data, sizeof(*data)); + data = early_memremap(pa_data, len); + if (!data) { + pr_warn("setup: failed to memremap indirect setup_data\n"); + return; + } - pa_data = data->next; - early_memunmap(data, sizeof(*data)); + indirect = (struct setup_indirect *)data->data; + + if (indirect->type != SETUP_INDIRECT) + memblock_reserve(indirect->addr, indirect->len); + } + + pa_data = pa_next; + early_memunmap(data, len); } } @@ -390,8 +479,6 @@ static void __init memblock_x86_reserve_range_setup_data(void) * --------- Crashkernel reservation ------------------------------ */ -#ifdef CONFIG_KEXEC_CORE - /* 16M alignment for crash kernel regions */ #define CRASH_ALIGN SZ_16M @@ -469,6 +556,9 @@ static void __init reserve_crashkernel(void) bool high = false; int ret; + if (!IS_ENABLED(CONFIG_KEXEC_CORE)) + return; + total_mem = memblock_phys_mem_size(); /* crashkernel=XM */ @@ -534,11 +624,6 @@ static void __init reserve_crashkernel(void) crashk_res.end = crash_base + crash_size - 1; insert_resource(&iomem_resource, &crashk_res); } -#else -static void __init reserve_crashkernel(void) -{ -} -#endif static struct resource standard_io_resources[] = { { .name = "dma1", .start = 0x00, .end = 0x1f, @@ -713,9 +798,6 @@ static void __init early_reserve_memory(void) early_reserve_initrd(); - if (efi_enabled(EFI_BOOT)) - efi_memblock_x86_reserve_range(); - memblock_x86_reserve_range_setup_data(); reserve_ibft_region(); @@ -742,26 +824,28 @@ dump_kernel_offset(struct notifier_block *self, unsigned long v, void *p) return 0; } -static char * __init prepare_command_line(void) +void x86_configure_nx(void) { -#ifdef CONFIG_CMDLINE_BOOL -#ifdef CONFIG_CMDLINE_OVERRIDE - strlcpy(boot_command_line, builtin_cmdline, COMMAND_LINE_SIZE); + if (boot_cpu_has(X86_FEATURE_NX)) + __supported_pte_mask |= _PAGE_NX; + else + __supported_pte_mask &= ~_PAGE_NX; +} + +static void __init x86_report_nx(void) +{ + if (!boot_cpu_has(X86_FEATURE_NX)) { + printk(KERN_NOTICE "Notice: NX (Execute Disable) protection " + "missing in CPU!\n"); + } else { +#if defined(CONFIG_X86_64) || defined(CONFIG_X86_PAE) + printk(KERN_INFO "NX (Execute Disable) protection: active\n"); #else - if (builtin_cmdline[0]) { - /* append boot loader cmdline to builtin */ - strlcat(builtin_cmdline, " ", COMMAND_LINE_SIZE); - strlcat(builtin_cmdline, boot_command_line, COMMAND_LINE_SIZE); - strlcpy(boot_command_line, builtin_cmdline, COMMAND_LINE_SIZE); - } + /* 32bit non-PAE kernel, NX cannot be used */ + printk(KERN_NOTICE "Notice: NX (Execute Disable) protection " + "cannot be enabled: non-PAE kernel!\n"); #endif -#endif - - strlcpy(command_line, boot_command_line, COMMAND_LINE_SIZE); - - parse_early_param(); - - return command_line; + } } /* @@ -853,23 +937,6 @@ void __init setup_arch(char **cmdline_p) x86_init.oem.arch_setup(); /* - * x86_configure_nx() is called before parse_early_param() (called by - * prepare_command_line()) to detect whether hardware doesn't support - * NX (so that the early EHCI debug console setup can safely call - * set_fixmap()). It may then be called again from within noexec_setup() - * during parsing early parameters to honor the respective command line - * option. - */ - x86_configure_nx(); - - /* - * This parses early params and it needs to run before - * early_reserve_memory() because latter relies on such settings - * supplied as early params. - */ - *cmdline_p = prepare_command_line(); - - /* * Do some memory reservations *before* memory is added to memblock, so * memblock allocations won't overwrite it. * @@ -902,6 +969,34 @@ void __init setup_arch(char **cmdline_p) bss_resource.start = __pa_symbol(__bss_start); bss_resource.end = __pa_symbol(__bss_stop)-1; +#ifdef CONFIG_CMDLINE_BOOL +#ifdef CONFIG_CMDLINE_OVERRIDE + strscpy(boot_command_line, builtin_cmdline, COMMAND_LINE_SIZE); +#else + if (builtin_cmdline[0]) { + /* append boot loader cmdline to builtin */ + strlcat(builtin_cmdline, " ", COMMAND_LINE_SIZE); + strlcat(builtin_cmdline, boot_command_line, COMMAND_LINE_SIZE); + strscpy(boot_command_line, builtin_cmdline, COMMAND_LINE_SIZE); + } +#endif +#endif + + strscpy(command_line, boot_command_line, COMMAND_LINE_SIZE); + *cmdline_p = command_line; + + /* + * x86_configure_nx() is called before parse_early_param() to detect + * whether hardware doesn't support NX (so that the early EHCI debug + * console setup can safely call set_fixmap()). + */ + x86_configure_nx(); + + parse_early_param(); + + if (efi_enabled(EFI_BOOT)) + efi_memblock_x86_reserve_range(); + #ifdef CONFIG_MEMORY_HOTPLUG /* * Memory used by the kernel cannot be hot-removed because Linux @@ -979,7 +1074,11 @@ void __init setup_arch(char **cmdline_p) max_pfn = e820__end_of_ram_pfn(); /* update e820 for memory not covered by WB MTRRs */ - mtrr_bp_init(); + if (IS_ENABLED(CONFIG_MTRR)) + mtrr_bp_init(); + else + pat_disable("PAT support disabled because CONFIG_MTRR is disabled in the kernel."); + if (mtrr_trim_uncached_memory(max_pfn)) max_pfn = e820__end_of_ram_pfn(); diff --git a/arch/x86/kernel/setup_percpu.c b/arch/x86/kernel/setup_percpu.c index 7b65275544b2..49325caa7307 100644 --- a/arch/x86/kernel/setup_percpu.c +++ b/arch/x86/kernel/setup_percpu.c @@ -84,60 +84,6 @@ static bool __init pcpu_need_numa(void) } #endif -/** - * pcpu_alloc_bootmem - NUMA friendly alloc_bootmem wrapper for percpu - * @cpu: cpu to allocate for - * @size: size allocation in bytes - * @align: alignment - * - * Allocate @size bytes aligned at @align for cpu @cpu. This wrapper - * does the right thing for NUMA regardless of the current - * configuration. - * - * RETURNS: - * Pointer to the allocated area on success, NULL on failure. - */ -static void * __init pcpu_alloc_bootmem(unsigned int cpu, unsigned long size, - unsigned long align) -{ - const unsigned long goal = __pa(MAX_DMA_ADDRESS); -#ifdef CONFIG_NUMA - int node = early_cpu_to_node(cpu); - void *ptr; - - if (!node_online(node) || !NODE_DATA(node)) { - ptr = memblock_alloc_from(size, align, goal); - pr_info("cpu %d has no node %d or node-local memory\n", - cpu, node); - pr_debug("per cpu data for cpu%d %lu bytes at %016lx\n", - cpu, size, __pa(ptr)); - } else { - ptr = memblock_alloc_try_nid(size, align, goal, - MEMBLOCK_ALLOC_ACCESSIBLE, - node); - - pr_debug("per cpu data for cpu%d %lu bytes on node%d at %016lx\n", - cpu, size, node, __pa(ptr)); - } - return ptr; -#else - return memblock_alloc_from(size, align, goal); -#endif -} - -/* - * Helpers for first chunk memory allocation - */ -static void * __init pcpu_fc_alloc(unsigned int cpu, size_t size, size_t align) -{ - return pcpu_alloc_bootmem(cpu, size, align); -} - -static void __init pcpu_fc_free(void *ptr, size_t size) -{ - memblock_free(ptr, size); -} - static int __init pcpu_cpu_distance(unsigned int from, unsigned int to) { #ifdef CONFIG_NUMA @@ -150,7 +96,12 @@ static int __init pcpu_cpu_distance(unsigned int from, unsigned int to) #endif } -static void __init pcpup_populate_pte(unsigned long addr) +static int __init pcpu_cpu_to_node(int cpu) +{ + return early_cpu_to_node(cpu); +} + +void __init pcpu_populate_pte(unsigned long addr) { populate_extra_pte(addr); } @@ -205,15 +156,14 @@ void __init setup_per_cpu_areas(void) rc = pcpu_embed_first_chunk(PERCPU_FIRST_CHUNK_RESERVE, dyn_size, atom_size, pcpu_cpu_distance, - pcpu_fc_alloc, pcpu_fc_free); + pcpu_cpu_to_node); if (rc < 0) pr_warn("%s allocator failed (%d), falling back to page size\n", pcpu_fc_names[pcpu_chosen_fc], rc); } if (rc < 0) rc = pcpu_page_first_chunk(PERCPU_FIRST_CHUNK_RESERVE, - pcpu_fc_alloc, pcpu_fc_free, - pcpup_populate_pte); + pcpu_cpu_to_node); if (rc < 0) panic("cannot initialize percpu area (err=%d)", rc); diff --git a/arch/x86/kernel/sev-shared.c b/arch/x86/kernel/sev-shared.c index 787dc5f568b5..3a5b0c9c4fcc 100644 --- a/arch/x86/kernel/sev-shared.c +++ b/arch/x86/kernel/sev-shared.c @@ -14,6 +14,68 @@ #define has_cpuflag(f) boot_cpu_has(f) #endif +/* I/O parameters for CPUID-related helpers */ +struct cpuid_leaf { + u32 fn; + u32 subfn; + u32 eax; + u32 ebx; + u32 ecx; + u32 edx; +}; + +/* + * Individual entries of the SNP CPUID table, as defined by the SNP + * Firmware ABI, Revision 0.9, Section 7.1, Table 14. + */ +struct snp_cpuid_fn { + u32 eax_in; + u32 ecx_in; + u64 xcr0_in; + u64 xss_in; + u32 eax; + u32 ebx; + u32 ecx; + u32 edx; + u64 __reserved; +} __packed; + +/* + * SNP CPUID table, as defined by the SNP Firmware ABI, Revision 0.9, + * Section 8.14.2.6. Also noted there is the SNP firmware-enforced limit + * of 64 entries per CPUID table. + */ +#define SNP_CPUID_COUNT_MAX 64 + +struct snp_cpuid_table { + u32 count; + u32 __reserved1; + u64 __reserved2; + struct snp_cpuid_fn fn[SNP_CPUID_COUNT_MAX]; +} __packed; + +/* + * Since feature negotiation related variables are set early in the boot + * process they must reside in the .data section so as not to be zeroed + * out when the .bss section is later cleared. + * + * GHCB protocol version negotiated with the hypervisor. + */ +static u16 ghcb_version __ro_after_init; + +/* Copy of the SNP firmware's CPUID page. */ +static struct snp_cpuid_table cpuid_table_copy __ro_after_init; + +/* + * These will be initialized based on CPUID table so that non-present + * all-zero leaves (for sparse tables) can be differentiated from + * invalid/out-of-range leaves. This is needed since all-zero leaves + * still need to be post-processed. + */ +static u32 cpuid_std_range_max __ro_after_init; +static u32 cpuid_hyp_range_max __ro_after_init; +static u32 cpuid_ext_range_max __ro_after_init; + static bool __init sev_es_check_cpu_features(void) { if (!has_cpuflag(X86_FEATURE_RDRAND)) { @@ -24,15 +86,12 @@ static bool __init sev_es_check_cpu_features(void) return true; } -static void __noreturn sev_es_terminate(unsigned int reason) +static void __noreturn sev_es_terminate(unsigned int set, unsigned int reason) { u64 val = GHCB_MSR_TERM_REQ; - /* - * Tell the hypervisor what went wrong - only reason-set 0 is - * currently supported. - */ - val |= GHCB_SEV_TERM_REASON(0, reason); + /* Tell the hypervisor what went wrong. */ + val |= GHCB_SEV_TERM_REASON(set, reason); /* Request Guest Termination from Hypvervisor */ sev_es_wr_ghcb_msr(val); @@ -42,6 +101,42 @@ static void __noreturn sev_es_terminate(unsigned int reason) asm volatile("hlt\n" : : : "memory"); } +/* + * The hypervisor features are available from GHCB version 2 onward. + */ +static u64 get_hv_features(void) +{ + u64 val; + + if (ghcb_version < 2) + return 0; + + sev_es_wr_ghcb_msr(GHCB_MSR_HV_FT_REQ); + VMGEXIT(); + + val = sev_es_rd_ghcb_msr(); + if (GHCB_RESP_CODE(val) != GHCB_MSR_HV_FT_RESP) + return 0; + + return GHCB_MSR_HV_FT_RESP_VAL(val); +} + +static void snp_register_ghcb_early(unsigned long paddr) +{ + unsigned long pfn = paddr >> PAGE_SHIFT; + u64 val; + + sev_es_wr_ghcb_msr(GHCB_MSR_REG_GPA_REQ_VAL(pfn)); + VMGEXIT(); + + val = sev_es_rd_ghcb_msr(); + + /* If the response GPA is not ours then abort the guest */ + if ((GHCB_RESP_CODE(val) != GHCB_MSR_REG_GPA_RESP) || + (GHCB_MSR_REG_GPA_RESP_VAL(val) != pfn)) + sev_es_terminate(SEV_TERM_SET_LINUX, GHCB_TERM_REGISTER); +} + static bool sev_es_negotiate_protocol(void) { u64 val; @@ -54,10 +149,12 @@ static bool sev_es_negotiate_protocol(void) if (GHCB_MSR_INFO(val) != GHCB_MSR_SEV_INFO_RESP) return false; - if (GHCB_MSR_PROTO_MAX(val) < GHCB_PROTO_OUR || - GHCB_MSR_PROTO_MIN(val) > GHCB_PROTO_OUR) + if (GHCB_MSR_PROTO_MAX(val) < GHCB_PROTOCOL_MIN || + GHCB_MSR_PROTO_MIN(val) > GHCB_PROTOCOL_MAX) return false; + ghcb_version = min_t(size_t, GHCB_MSR_PROTO_MAX(val), GHCB_PROTOCOL_MAX); + return true; } @@ -104,10 +201,7 @@ static enum es_result verify_exception_info(struct ghcb *ghcb, struct es_em_ctxt if (ret == 1) { u64 info = ghcb->save.sw_exit_info_2; - unsigned long v; - - info = ghcb->save.sw_exit_info_2; - v = info & SVM_EVTINJ_VEC_MASK; + unsigned long v = info & SVM_EVTINJ_VEC_MASK; /* Check if exception information from hypervisor is sane. */ if ((info & SVM_EVTINJ_VALID) && @@ -125,29 +219,307 @@ static enum es_result verify_exception_info(struct ghcb *ghcb, struct es_em_ctxt return ES_VMM_ERROR; } -enum es_result sev_es_ghcb_hv_call(struct ghcb *ghcb, bool set_ghcb_msr, - struct es_em_ctxt *ctxt, u64 exit_code, - u64 exit_info_1, u64 exit_info_2) +static enum es_result sev_es_ghcb_hv_call(struct ghcb *ghcb, + struct es_em_ctxt *ctxt, + u64 exit_code, u64 exit_info_1, + u64 exit_info_2) { /* Fill in protocol and format specifiers */ - ghcb->protocol_version = GHCB_PROTOCOL_MAX; + ghcb->protocol_version = ghcb_version; ghcb->ghcb_usage = GHCB_DEFAULT_USAGE; ghcb_set_sw_exit_code(ghcb, exit_code); ghcb_set_sw_exit_info_1(ghcb, exit_info_1); ghcb_set_sw_exit_info_2(ghcb, exit_info_2); + sev_es_wr_ghcb_msr(__pa(ghcb)); + VMGEXIT(); + + return verify_exception_info(ghcb, ctxt); +} + +static int __sev_cpuid_hv(u32 fn, int reg_idx, u32 *reg) +{ + u64 val; + + sev_es_wr_ghcb_msr(GHCB_CPUID_REQ(fn, reg_idx)); + VMGEXIT(); + val = sev_es_rd_ghcb_msr(); + if (GHCB_RESP_CODE(val) != GHCB_MSR_CPUID_RESP) + return -EIO; + + *reg = (val >> 32); + + return 0; +} + +static int sev_cpuid_hv(struct cpuid_leaf *leaf) +{ + int ret; + /* - * Hyper-V unenlightened guests use a paravisor for communicating and - * GHCB pages are being allocated and set up by that paravisor. Linux - * should not change the GHCB page's physical address. + * MSR protocol does not support fetching non-zero subfunctions, but is + * sufficient to handle current early-boot cases. Should that change, + * make sure to report an error rather than ignoring the index and + * grabbing random values. If this issue arises in the future, handling + * can be added here to use GHCB-page protocol for cases that occur late + * enough in boot that GHCB page is available. */ - if (set_ghcb_msr) - sev_es_wr_ghcb_msr(__pa(ghcb)); + if (cpuid_function_is_indexed(leaf->fn) && leaf->subfn) + return -EINVAL; - VMGEXIT(); + ret = __sev_cpuid_hv(leaf->fn, GHCB_CPUID_REQ_EAX, &leaf->eax); + ret = ret ? : __sev_cpuid_hv(leaf->fn, GHCB_CPUID_REQ_EBX, &leaf->ebx); + ret = ret ? : __sev_cpuid_hv(leaf->fn, GHCB_CPUID_REQ_ECX, &leaf->ecx); + ret = ret ? : __sev_cpuid_hv(leaf->fn, GHCB_CPUID_REQ_EDX, &leaf->edx); - return verify_exception_info(ghcb, ctxt); + return ret; +} + +/* + * This may be called early while still running on the initial identity + * mapping. Use RIP-relative addressing to obtain the correct address + * while running with the initial identity mapping as well as the + * switch-over to kernel virtual addresses later. + */ +static const struct snp_cpuid_table *snp_cpuid_get_table(void) +{ + void *ptr; + + asm ("lea cpuid_table_copy(%%rip), %0" + : "=r" (ptr) + : "p" (&cpuid_table_copy)); + + return ptr; +} + +/* + * The SNP Firmware ABI, Revision 0.9, Section 7.1, details the use of + * XCR0_IN and XSS_IN to encode multiple versions of 0xD subfunctions 0 + * and 1 based on the corresponding features enabled by a particular + * combination of XCR0 and XSS registers so that a guest can look up the + * version corresponding to the features currently enabled in its XCR0/XSS + * registers. The only values that differ between these versions/table + * entries is the enabled XSAVE area size advertised via EBX. + * + * While hypervisors may choose to make use of this support, it is more + * robust/secure for a guest to simply find the entry corresponding to the + * base/legacy XSAVE area size (XCR0=1 or XCR0=3), and then calculate the + * XSAVE area size using subfunctions 2 through 64, as documented in APM + * Volume 3, Rev 3.31, Appendix E.3.8, which is what is done here. + * + * Since base/legacy XSAVE area size is documented as 0x240, use that value + * directly rather than relying on the base size in the CPUID table. + * + * Return: XSAVE area size on success, 0 otherwise. + */ +static u32 snp_cpuid_calc_xsave_size(u64 xfeatures_en, bool compacted) +{ + const struct snp_cpuid_table *cpuid_table = snp_cpuid_get_table(); + u64 xfeatures_found = 0; + u32 xsave_size = 0x240; + int i; + + for (i = 0; i < cpuid_table->count; i++) { + const struct snp_cpuid_fn *e = &cpuid_table->fn[i]; + + if (!(e->eax_in == 0xD && e->ecx_in > 1 && e->ecx_in < 64)) + continue; + if (!(xfeatures_en & (BIT_ULL(e->ecx_in)))) + continue; + if (xfeatures_found & (BIT_ULL(e->ecx_in))) + continue; + + xfeatures_found |= (BIT_ULL(e->ecx_in)); + + if (compacted) + xsave_size += e->eax; + else + xsave_size = max(xsave_size, e->eax + e->ebx); + } + + /* + * Either the guest set unsupported XCR0/XSS bits, or the corresponding + * entries in the CPUID table were not present. This is not a valid + * state to be in. + */ + if (xfeatures_found != (xfeatures_en & GENMASK_ULL(63, 2))) + return 0; + + return xsave_size; +} + +static bool +snp_cpuid_get_validated_func(struct cpuid_leaf *leaf) +{ + const struct snp_cpuid_table *cpuid_table = snp_cpuid_get_table(); + int i; + + for (i = 0; i < cpuid_table->count; i++) { + const struct snp_cpuid_fn *e = &cpuid_table->fn[i]; + + if (e->eax_in != leaf->fn) + continue; + + if (cpuid_function_is_indexed(leaf->fn) && e->ecx_in != leaf->subfn) + continue; + + /* + * For 0xD subfunctions 0 and 1, only use the entry corresponding + * to the base/legacy XSAVE area size (XCR0=1 or XCR0=3, XSS=0). + * See the comments above snp_cpuid_calc_xsave_size() for more + * details. + */ + if (e->eax_in == 0xD && (e->ecx_in == 0 || e->ecx_in == 1)) + if (!(e->xcr0_in == 1 || e->xcr0_in == 3) || e->xss_in) + continue; + + leaf->eax = e->eax; + leaf->ebx = e->ebx; + leaf->ecx = e->ecx; + leaf->edx = e->edx; + + return true; + } + + return false; +} + +static void snp_cpuid_hv(struct cpuid_leaf *leaf) +{ + if (sev_cpuid_hv(leaf)) + sev_es_terminate(SEV_TERM_SET_LINUX, GHCB_TERM_CPUID_HV); +} + +static int snp_cpuid_postprocess(struct cpuid_leaf *leaf) +{ + struct cpuid_leaf leaf_hv = *leaf; + + switch (leaf->fn) { + case 0x1: + snp_cpuid_hv(&leaf_hv); + + /* initial APIC ID */ + leaf->ebx = (leaf_hv.ebx & GENMASK(31, 24)) | (leaf->ebx & GENMASK(23, 0)); + /* APIC enabled bit */ + leaf->edx = (leaf_hv.edx & BIT(9)) | (leaf->edx & ~BIT(9)); + + /* OSXSAVE enabled bit */ + if (native_read_cr4() & X86_CR4_OSXSAVE) + leaf->ecx |= BIT(27); + break; + case 0x7: + /* OSPKE enabled bit */ + leaf->ecx &= ~BIT(4); + if (native_read_cr4() & X86_CR4_PKE) + leaf->ecx |= BIT(4); + break; + case 0xB: + leaf_hv.subfn = 0; + snp_cpuid_hv(&leaf_hv); + + /* extended APIC ID */ + leaf->edx = leaf_hv.edx; + break; + case 0xD: { + bool compacted = false; + u64 xcr0 = 1, xss = 0; + u32 xsave_size; + + if (leaf->subfn != 0 && leaf->subfn != 1) + return 0; + + if (native_read_cr4() & X86_CR4_OSXSAVE) + xcr0 = xgetbv(XCR_XFEATURE_ENABLED_MASK); + if (leaf->subfn == 1) { + /* Get XSS value if XSAVES is enabled. */ + if (leaf->eax & BIT(3)) { + unsigned long lo, hi; + + asm volatile("rdmsr" : "=a" (lo), "=d" (hi) + : "c" (MSR_IA32_XSS)); + xss = (hi << 32) | lo; + } + + /* + * The PPR and APM aren't clear on what size should be + * encoded in 0xD:0x1:EBX when compaction is not enabled + * by either XSAVEC (feature bit 1) or XSAVES (feature + * bit 3) since SNP-capable hardware has these feature + * bits fixed as 1. KVM sets it to 0 in this case, but + * to avoid this becoming an issue it's safer to simply + * treat this as unsupported for SNP guests. + */ + if (!(leaf->eax & (BIT(1) | BIT(3)))) + return -EINVAL; + + compacted = true; + } + + xsave_size = snp_cpuid_calc_xsave_size(xcr0 | xss, compacted); + if (!xsave_size) + return -EINVAL; + + leaf->ebx = xsave_size; + } + break; + case 0x8000001E: + snp_cpuid_hv(&leaf_hv); + + /* extended APIC ID */ + leaf->eax = leaf_hv.eax; + /* compute ID */ + leaf->ebx = (leaf->ebx & GENMASK(31, 8)) | (leaf_hv.ebx & GENMASK(7, 0)); + /* node ID */ + leaf->ecx = (leaf->ecx & GENMASK(31, 8)) | (leaf_hv.ecx & GENMASK(7, 0)); + break; + default: + /* No fix-ups needed, use values as-is. */ + break; + } + + return 0; +} + +/* + * Returns -EOPNOTSUPP if feature not enabled. Any other non-zero return value + * should be treated as fatal by caller. + */ +static int snp_cpuid(struct cpuid_leaf *leaf) +{ + const struct snp_cpuid_table *cpuid_table = snp_cpuid_get_table(); + + if (!cpuid_table->count) + return -EOPNOTSUPP; + + if (!snp_cpuid_get_validated_func(leaf)) { + /* + * Some hypervisors will avoid keeping track of CPUID entries + * where all values are zero, since they can be handled the + * same as out-of-range values (all-zero). This is useful here + * as well as it allows virtually all guest configurations to + * work using a single SNP CPUID table. + * + * To allow for this, there is a need to distinguish between + * out-of-range entries and in-range zero entries, since the + * CPUID table entries are only a template that may need to be + * augmented with additional values for things like + * CPU-specific information during post-processing. So if it's + * not in the table, set the values to zero. Then, if they are + * within a valid CPUID range, proceed with post-processing + * using zeros as the initial values. Otherwise, skip + * post-processing and just return zeros immediately. + */ + leaf->eax = leaf->ebx = leaf->ecx = leaf->edx = 0; + + /* Skip post-processing for out-of-range zero leafs. */ + if (!(leaf->fn <= cpuid_std_range_max || + (leaf->fn >= 0x40000000 && leaf->fn <= cpuid_hyp_range_max) || + (leaf->fn >= 0x80000000 && leaf->fn <= cpuid_ext_range_max))) + return 0; + } + + return snp_cpuid_postprocess(leaf); } /* @@ -157,40 +529,33 @@ enum es_result sev_es_ghcb_hv_call(struct ghcb *ghcb, bool set_ghcb_msr, */ void __init do_vc_no_ghcb(struct pt_regs *regs, unsigned long exit_code) { + unsigned int subfn = lower_bits(regs->cx, 32); unsigned int fn = lower_bits(regs->ax, 32); - unsigned long val; + struct cpuid_leaf leaf; + int ret; /* Only CPUID is supported via MSR protocol */ if (exit_code != SVM_EXIT_CPUID) goto fail; - sev_es_wr_ghcb_msr(GHCB_CPUID_REQ(fn, GHCB_CPUID_REQ_EAX)); - VMGEXIT(); - val = sev_es_rd_ghcb_msr(); - if (GHCB_RESP_CODE(val) != GHCB_MSR_CPUID_RESP) - goto fail; - regs->ax = val >> 32; + leaf.fn = fn; + leaf.subfn = subfn; - sev_es_wr_ghcb_msr(GHCB_CPUID_REQ(fn, GHCB_CPUID_REQ_EBX)); - VMGEXIT(); - val = sev_es_rd_ghcb_msr(); - if (GHCB_RESP_CODE(val) != GHCB_MSR_CPUID_RESP) - goto fail; - regs->bx = val >> 32; + ret = snp_cpuid(&leaf); + if (!ret) + goto cpuid_done; - sev_es_wr_ghcb_msr(GHCB_CPUID_REQ(fn, GHCB_CPUID_REQ_ECX)); - VMGEXIT(); - val = sev_es_rd_ghcb_msr(); - if (GHCB_RESP_CODE(val) != GHCB_MSR_CPUID_RESP) + if (ret != -EOPNOTSUPP) goto fail; - regs->cx = val >> 32; - sev_es_wr_ghcb_msr(GHCB_CPUID_REQ(fn, GHCB_CPUID_REQ_EDX)); - VMGEXIT(); - val = sev_es_rd_ghcb_msr(); - if (GHCB_RESP_CODE(val) != GHCB_MSR_CPUID_RESP) + if (sev_cpuid_hv(&leaf)) goto fail; - regs->dx = val >> 32; + +cpuid_done: + regs->ax = leaf.eax; + regs->bx = leaf.ebx; + regs->cx = leaf.ecx; + regs->dx = leaf.edx; /* * This is a VC handler and the #VC is only raised when SEV-ES is @@ -221,7 +586,7 @@ void __init do_vc_no_ghcb(struct pt_regs *regs, unsigned long exit_code) fail: /* Terminate the guest */ - sev_es_terminate(GHCB_SEV_ES_REASON_GENERAL_REQUEST); + sev_es_terminate(SEV_TERM_SET_GEN, GHCB_SEV_ES_GEN_REQ); } static enum es_result vc_insn_string_read(struct es_em_ctxt *ctxt, @@ -424,7 +789,7 @@ static enum es_result vc_handle_ioio(struct ghcb *ghcb, struct es_em_ctxt *ctxt) */ sw_scratch = __pa(ghcb) + offsetof(struct ghcb, shared_buffer); ghcb_set_sw_scratch(ghcb, sw_scratch); - ret = sev_es_ghcb_hv_call(ghcb, true, ctxt, SVM_EXIT_IOIO, + ret = sev_es_ghcb_hv_call(ghcb, ctxt, SVM_EXIT_IOIO, exit_info_1, exit_info_2); if (ret != ES_OK) return ret; @@ -466,8 +831,7 @@ static enum es_result vc_handle_ioio(struct ghcb *ghcb, struct es_em_ctxt *ctxt) ghcb_set_rax(ghcb, rax); - ret = sev_es_ghcb_hv_call(ghcb, true, ctxt, - SVM_EXIT_IOIO, exit_info_1, 0); + ret = sev_es_ghcb_hv_call(ghcb, ctxt, SVM_EXIT_IOIO, exit_info_1, 0); if (ret != ES_OK) return ret; @@ -481,12 +845,37 @@ static enum es_result vc_handle_ioio(struct ghcb *ghcb, struct es_em_ctxt *ctxt) return ret; } +static int vc_handle_cpuid_snp(struct pt_regs *regs) +{ + struct cpuid_leaf leaf; + int ret; + + leaf.fn = regs->ax; + leaf.subfn = regs->cx; + ret = snp_cpuid(&leaf); + if (!ret) { + regs->ax = leaf.eax; + regs->bx = leaf.ebx; + regs->cx = leaf.ecx; + regs->dx = leaf.edx; + } + + return ret; +} + static enum es_result vc_handle_cpuid(struct ghcb *ghcb, struct es_em_ctxt *ctxt) { struct pt_regs *regs = ctxt->regs; u32 cr4 = native_read_cr4(); enum es_result ret; + int snp_cpuid_ret; + + snp_cpuid_ret = vc_handle_cpuid_snp(regs); + if (!snp_cpuid_ret) + return ES_OK; + if (snp_cpuid_ret != -EOPNOTSUPP) + return ES_VMM_ERROR; ghcb_set_rax(ghcb, regs->ax); ghcb_set_rcx(ghcb, regs->cx); @@ -498,7 +887,7 @@ static enum es_result vc_handle_cpuid(struct ghcb *ghcb, /* xgetbv will cause #GP - use reset value for xcr0 */ ghcb_set_xcr0(ghcb, 1); - ret = sev_es_ghcb_hv_call(ghcb, true, ctxt, SVM_EXIT_CPUID, 0, 0); + ret = sev_es_ghcb_hv_call(ghcb, ctxt, SVM_EXIT_CPUID, 0, 0); if (ret != ES_OK) return ret; @@ -523,7 +912,7 @@ static enum es_result vc_handle_rdtsc(struct ghcb *ghcb, bool rdtscp = (exit_code == SVM_EXIT_RDTSCP); enum es_result ret; - ret = sev_es_ghcb_hv_call(ghcb, true, ctxt, exit_code, 0, 0); + ret = sev_es_ghcb_hv_call(ghcb, ctxt, exit_code, 0, 0); if (ret != ES_OK) return ret; @@ -538,3 +927,67 @@ static enum es_result vc_handle_rdtsc(struct ghcb *ghcb, return ES_OK; } + +struct cc_setup_data { + struct setup_data header; + u32 cc_blob_address; +}; + +/* + * Search for a Confidential Computing blob passed in as a setup_data entry + * via the Linux Boot Protocol. + */ +static struct cc_blob_sev_info *find_cc_blob_setup_data(struct boot_params *bp) +{ + struct cc_setup_data *sd = NULL; + struct setup_data *hdr; + + hdr = (struct setup_data *)bp->hdr.setup_data; + + while (hdr) { + if (hdr->type == SETUP_CC_BLOB) { + sd = (struct cc_setup_data *)hdr; + return (struct cc_blob_sev_info *)(unsigned long)sd->cc_blob_address; + } + hdr = (struct setup_data *)hdr->next; + } + + return NULL; +} + +/* + * Initialize the kernel's copy of the SNP CPUID table, and set up the + * pointer that will be used to access it. + * + * Maintaining a direct mapping of the SNP CPUID table used by firmware would + * be possible as an alternative, but the approach is brittle since the + * mapping needs to be updated in sync with all the changes to virtual memory + * layout and related mapping facilities throughout the boot process. + */ +static void __init setup_cpuid_table(const struct cc_blob_sev_info *cc_info) +{ + const struct snp_cpuid_table *cpuid_table_fw, *cpuid_table; + int i; + + if (!cc_info || !cc_info->cpuid_phys || cc_info->cpuid_len < PAGE_SIZE) + sev_es_terminate(SEV_TERM_SET_LINUX, GHCB_TERM_CPUID); + + cpuid_table_fw = (const struct snp_cpuid_table *)cc_info->cpuid_phys; + if (!cpuid_table_fw->count || cpuid_table_fw->count > SNP_CPUID_COUNT_MAX) + sev_es_terminate(SEV_TERM_SET_LINUX, GHCB_TERM_CPUID); + + cpuid_table = snp_cpuid_get_table(); + memcpy((void *)cpuid_table, cpuid_table_fw, sizeof(*cpuid_table)); + + /* Initialize CPUID ranges for range-checking. */ + for (i = 0; i < cpuid_table->count; i++) { + const struct snp_cpuid_fn *fn = &cpuid_table->fn[i]; + + if (fn->eax_in == 0x0) + cpuid_std_range_max = fn->eax; + else if (fn->eax_in == 0x40000000) + cpuid_hyp_range_max = fn->eax; + else if (fn->eax_in == 0x80000000) + cpuid_ext_range_max = fn->eax; + } +} diff --git a/arch/x86/kernel/sev.c b/arch/x86/kernel/sev.c index a9fc2ac7a8bd..a428c62330d3 100644 --- a/arch/x86/kernel/sev.c +++ b/arch/x86/kernel/sev.c @@ -18,6 +18,10 @@ #include <linux/memblock.h> #include <linux/kernel.h> #include <linux/mm.h> +#include <linux/cpumask.h> +#include <linux/efi.h> +#include <linux/platform_device.h> +#include <linux/io.h> #include <asm/cpu_entry_area.h> #include <asm/stacktrace.h> @@ -26,13 +30,33 @@ #include <asm/fpu/xcr.h> #include <asm/processor.h> #include <asm/realmode.h> +#include <asm/setup.h> #include <asm/traps.h> #include <asm/svm.h> #include <asm/smp.h> #include <asm/cpu.h> +#include <asm/apic.h> +#include <asm/cpuid.h> +#include <asm/cmdline.h> #define DR7_RESET_VALUE 0x400 +/* AP INIT values as documented in the APM2 section "Processor Initialization State" */ +#define AP_INIT_CS_LIMIT 0xffff +#define AP_INIT_DS_LIMIT 0xffff +#define AP_INIT_LDTR_LIMIT 0xffff +#define AP_INIT_GDTR_LIMIT 0xffff +#define AP_INIT_IDTR_LIMIT 0xffff +#define AP_INIT_TR_LIMIT 0xffff +#define AP_INIT_RFLAGS_DEFAULT 0x2 +#define AP_INIT_DR6_DEFAULT 0xffff0ff0 +#define AP_INIT_GPAT_DEFAULT 0x0007040600070406ULL +#define AP_INIT_XCR0_DEFAULT 0x1 +#define AP_INIT_X87_FTW_DEFAULT 0x5555 +#define AP_INIT_X87_FCW_DEFAULT 0x0040 +#define AP_INIT_CR0_DEFAULT 0x60000010 +#define AP_INIT_MXCSR_DEFAULT 0x1f80 + /* For early boot hypervisor communication in SEV-ES enabled guests */ static struct ghcb boot_ghcb_page __bss_decrypted __aligned(PAGE_SIZE); @@ -40,7 +64,10 @@ static struct ghcb boot_ghcb_page __bss_decrypted __aligned(PAGE_SIZE); * Needs to be in the .data section because we need it NULL before bss is * cleared */ -static struct ghcb __initdata *boot_ghcb; +static struct ghcb *boot_ghcb __section(".data"); + +/* Bitmap of SEV features supported by the hypervisor */ +static u64 sev_hv_features __ro_after_init; /* #VC handler runtime per-CPU data */ struct sev_es_runtime_data { @@ -86,8 +113,14 @@ struct ghcb_state { static DEFINE_PER_CPU(struct sev_es_runtime_data*, runtime_data); DEFINE_STATIC_KEY_FALSE(sev_es_enable_key); -/* Needed in vc_early_forward_exception */ -void do_early_exception(struct pt_regs *regs, int trapnr); +static DEFINE_PER_CPU(struct sev_es_save_area *, sev_vmsa); + +struct sev_config { + __u64 debug : 1, + __reserved : 63; +}; + +static struct sev_config sev_cfg __read_mostly; static __always_inline bool on_vc_stack(struct pt_regs *regs) { @@ -209,9 +242,6 @@ static noinstr struct ghcb *__sev_get_ghcb(struct ghcb_state *state) return ghcb; } -/* Needed in vc_early_forward_exception */ -void do_early_exception(struct pt_regs *regs, int trapnr); - static inline u64 sev_es_rd_ghcb_msr(void) { return __rdmsr(MSR_AMD64_SEV_ES_GHCB); @@ -528,13 +558,68 @@ void noinstr __sev_es_nmi_complete(void) __sev_put_ghcb(&state); } -static u64 get_jump_table_addr(void) +static u64 __init get_secrets_page(void) +{ + u64 pa_data = boot_params.cc_blob_address; + struct cc_blob_sev_info info; + void *map; + + /* + * The CC blob contains the address of the secrets page, check if the + * blob is present. + */ + if (!pa_data) + return 0; + + map = early_memremap(pa_data, sizeof(info)); + if (!map) { + pr_err("Unable to locate SNP secrets page: failed to map the Confidential Computing blob.\n"); + return 0; + } + memcpy(&info, map, sizeof(info)); + early_memunmap(map, sizeof(info)); + + /* smoke-test the secrets page passed */ + if (!info.secrets_phys || info.secrets_len != PAGE_SIZE) + return 0; + + return info.secrets_phys; +} + +static u64 __init get_snp_jump_table_addr(void) +{ + struct snp_secrets_page_layout *layout; + void __iomem *mem; + u64 pa, addr; + + pa = get_secrets_page(); + if (!pa) + return 0; + + mem = ioremap_encrypted(pa, PAGE_SIZE); + if (!mem) { + pr_err("Unable to locate AP jump table address: failed to map the SNP secrets page.\n"); + return 0; + } + + layout = (__force struct snp_secrets_page_layout *)mem; + + addr = layout->os_area.ap_jump_table_pa; + iounmap(mem); + + return addr; +} + +static u64 __init get_jump_table_addr(void) { struct ghcb_state state; unsigned long flags; struct ghcb *ghcb; u64 ret = 0; + if (cc_platform_has(CC_ATTR_GUEST_SEV_SNP)) + return get_snp_jump_table_addr(); + local_irq_save(flags); ghcb = __sev_get_ghcb(&state); @@ -558,7 +643,508 @@ static u64 get_jump_table_addr(void) return ret; } -int sev_es_setup_ap_jump_table(struct real_mode_header *rmh) +static void pvalidate_pages(unsigned long vaddr, unsigned int npages, bool validate) +{ + unsigned long vaddr_end; + int rc; + + vaddr = vaddr & PAGE_MASK; + vaddr_end = vaddr + (npages << PAGE_SHIFT); + + while (vaddr < vaddr_end) { + rc = pvalidate(vaddr, RMP_PG_SIZE_4K, validate); + if (WARN(rc, "Failed to validate address 0x%lx ret %d", vaddr, rc)) + sev_es_terminate(SEV_TERM_SET_LINUX, GHCB_TERM_PVALIDATE); + + vaddr = vaddr + PAGE_SIZE; + } +} + +static void __init early_set_pages_state(unsigned long paddr, unsigned int npages, enum psc_op op) +{ + unsigned long paddr_end; + u64 val; + + paddr = paddr & PAGE_MASK; + paddr_end = paddr + (npages << PAGE_SHIFT); + + while (paddr < paddr_end) { + /* + * Use the MSR protocol because this function can be called before + * the GHCB is established. + */ + sev_es_wr_ghcb_msr(GHCB_MSR_PSC_REQ_GFN(paddr >> PAGE_SHIFT, op)); + VMGEXIT(); + + val = sev_es_rd_ghcb_msr(); + + if (WARN(GHCB_RESP_CODE(val) != GHCB_MSR_PSC_RESP, + "Wrong PSC response code: 0x%x\n", + (unsigned int)GHCB_RESP_CODE(val))) + goto e_term; + + if (WARN(GHCB_MSR_PSC_RESP_VAL(val), + "Failed to change page state to '%s' paddr 0x%lx error 0x%llx\n", + op == SNP_PAGE_STATE_PRIVATE ? "private" : "shared", + paddr, GHCB_MSR_PSC_RESP_VAL(val))) + goto e_term; + + paddr = paddr + PAGE_SIZE; + } + + return; + +e_term: + sev_es_terminate(SEV_TERM_SET_LINUX, GHCB_TERM_PSC); +} + +void __init early_snp_set_memory_private(unsigned long vaddr, unsigned long paddr, + unsigned int npages) +{ + /* + * This can be invoked in early boot while running identity mapped, so + * use an open coded check for SNP instead of using cc_platform_has(). + * This eliminates worries about jump tables or checking boot_cpu_data + * in the cc_platform_has() function. + */ + if (!(sev_status & MSR_AMD64_SEV_SNP_ENABLED)) + return; + + /* + * Ask the hypervisor to mark the memory pages as private in the RMP + * table. + */ + early_set_pages_state(paddr, npages, SNP_PAGE_STATE_PRIVATE); + + /* Validate the memory pages after they've been added in the RMP table. */ + pvalidate_pages(vaddr, npages, true); +} + +void __init early_snp_set_memory_shared(unsigned long vaddr, unsigned long paddr, + unsigned int npages) +{ + /* + * This can be invoked in early boot while running identity mapped, so + * use an open coded check for SNP instead of using cc_platform_has(). + * This eliminates worries about jump tables or checking boot_cpu_data + * in the cc_platform_has() function. + */ + if (!(sev_status & MSR_AMD64_SEV_SNP_ENABLED)) + return; + + /* Invalidate the memory pages before they are marked shared in the RMP table. */ + pvalidate_pages(vaddr, npages, false); + + /* Ask hypervisor to mark the memory pages shared in the RMP table. */ + early_set_pages_state(paddr, npages, SNP_PAGE_STATE_SHARED); +} + +void __init snp_prep_memory(unsigned long paddr, unsigned int sz, enum psc_op op) +{ + unsigned long vaddr, npages; + + vaddr = (unsigned long)__va(paddr); + npages = PAGE_ALIGN(sz) >> PAGE_SHIFT; + + if (op == SNP_PAGE_STATE_PRIVATE) + early_snp_set_memory_private(vaddr, paddr, npages); + else if (op == SNP_PAGE_STATE_SHARED) + early_snp_set_memory_shared(vaddr, paddr, npages); + else + WARN(1, "invalid memory op %d\n", op); +} + +static int vmgexit_psc(struct snp_psc_desc *desc) +{ + int cur_entry, end_entry, ret = 0; + struct snp_psc_desc *data; + struct ghcb_state state; + struct es_em_ctxt ctxt; + unsigned long flags; + struct ghcb *ghcb; + + /* + * __sev_get_ghcb() needs to run with IRQs disabled because it is using + * a per-CPU GHCB. + */ + local_irq_save(flags); + + ghcb = __sev_get_ghcb(&state); + if (!ghcb) { + ret = 1; + goto out_unlock; + } + + /* Copy the input desc into GHCB shared buffer */ + data = (struct snp_psc_desc *)ghcb->shared_buffer; + memcpy(ghcb->shared_buffer, desc, min_t(int, GHCB_SHARED_BUF_SIZE, sizeof(*desc))); + + /* + * As per the GHCB specification, the hypervisor can resume the guest + * before processing all the entries. Check whether all the entries + * are processed. If not, then keep retrying. Note, the hypervisor + * will update the data memory directly to indicate the status, so + * reference the data->hdr everywhere. + * + * The strategy here is to wait for the hypervisor to change the page + * state in the RMP table before guest accesses the memory pages. If the + * page state change was not successful, then later memory access will + * result in a crash. + */ + cur_entry = data->hdr.cur_entry; + end_entry = data->hdr.end_entry; + + while (data->hdr.cur_entry <= data->hdr.end_entry) { + ghcb_set_sw_scratch(ghcb, (u64)__pa(data)); + + /* This will advance the shared buffer data points to. */ + ret = sev_es_ghcb_hv_call(ghcb, &ctxt, SVM_VMGEXIT_PSC, 0, 0); + + /* + * Page State Change VMGEXIT can pass error code through + * exit_info_2. + */ + if (WARN(ret || ghcb->save.sw_exit_info_2, + "SNP: PSC failed ret=%d exit_info_2=%llx\n", + ret, ghcb->save.sw_exit_info_2)) { + ret = 1; + goto out; + } + + /* Verify that reserved bit is not set */ + if (WARN(data->hdr.reserved, "Reserved bit is set in the PSC header\n")) { + ret = 1; + goto out; + } + + /* + * Sanity check that entry processing is not going backwards. + * This will happen only if hypervisor is tricking us. + */ + if (WARN(data->hdr.end_entry > end_entry || cur_entry > data->hdr.cur_entry, +"SNP: PSC processing going backward, end_entry %d (got %d) cur_entry %d (got %d)\n", + end_entry, data->hdr.end_entry, cur_entry, data->hdr.cur_entry)) { + ret = 1; + goto out; + } + } + +out: + __sev_put_ghcb(&state); + +out_unlock: + local_irq_restore(flags); + + return ret; +} + +static void __set_pages_state(struct snp_psc_desc *data, unsigned long vaddr, + unsigned long vaddr_end, int op) +{ + struct psc_hdr *hdr; + struct psc_entry *e; + unsigned long pfn; + int i; + + hdr = &data->hdr; + e = data->entries; + + memset(data, 0, sizeof(*data)); + i = 0; + + while (vaddr < vaddr_end) { + if (is_vmalloc_addr((void *)vaddr)) + pfn = vmalloc_to_pfn((void *)vaddr); + else + pfn = __pa(vaddr) >> PAGE_SHIFT; + + e->gfn = pfn; + e->operation = op; + hdr->end_entry = i; + + /* + * Current SNP implementation doesn't keep track of the RMP page + * size so use 4K for simplicity. + */ + e->pagesize = RMP_PG_SIZE_4K; + + vaddr = vaddr + PAGE_SIZE; + e++; + i++; + } + + if (vmgexit_psc(data)) + sev_es_terminate(SEV_TERM_SET_LINUX, GHCB_TERM_PSC); +} + +static void set_pages_state(unsigned long vaddr, unsigned int npages, int op) +{ + unsigned long vaddr_end, next_vaddr; + struct snp_psc_desc *desc; + + desc = kmalloc(sizeof(*desc), GFP_KERNEL_ACCOUNT); + if (!desc) + panic("SNP: failed to allocate memory for PSC descriptor\n"); + + vaddr = vaddr & PAGE_MASK; + vaddr_end = vaddr + (npages << PAGE_SHIFT); + + while (vaddr < vaddr_end) { + /* Calculate the last vaddr that fits in one struct snp_psc_desc. */ + next_vaddr = min_t(unsigned long, vaddr_end, + (VMGEXIT_PSC_MAX_ENTRY * PAGE_SIZE) + vaddr); + + __set_pages_state(desc, vaddr, next_vaddr, op); + + vaddr = next_vaddr; + } + + kfree(desc); +} + +void snp_set_memory_shared(unsigned long vaddr, unsigned int npages) +{ + if (!cc_platform_has(CC_ATTR_GUEST_SEV_SNP)) + return; + + pvalidate_pages(vaddr, npages, false); + + set_pages_state(vaddr, npages, SNP_PAGE_STATE_SHARED); +} + +void snp_set_memory_private(unsigned long vaddr, unsigned int npages) +{ + if (!cc_platform_has(CC_ATTR_GUEST_SEV_SNP)) + return; + + set_pages_state(vaddr, npages, SNP_PAGE_STATE_PRIVATE); + + pvalidate_pages(vaddr, npages, true); +} + +static int snp_set_vmsa(void *va, bool vmsa) +{ + u64 attrs; + + /* + * Running at VMPL0 allows the kernel to change the VMSA bit for a page + * using the RMPADJUST instruction. However, for the instruction to + * succeed it must target the permissions of a lesser privileged + * (higher numbered) VMPL level, so use VMPL1 (refer to the RMPADJUST + * instruction in the AMD64 APM Volume 3). + */ + attrs = 1; + if (vmsa) + attrs |= RMPADJUST_VMSA_PAGE_BIT; + + return rmpadjust((unsigned long)va, RMP_PG_SIZE_4K, attrs); +} + +#define __ATTR_BASE (SVM_SELECTOR_P_MASK | SVM_SELECTOR_S_MASK) +#define INIT_CS_ATTRIBS (__ATTR_BASE | SVM_SELECTOR_READ_MASK | SVM_SELECTOR_CODE_MASK) +#define INIT_DS_ATTRIBS (__ATTR_BASE | SVM_SELECTOR_WRITE_MASK) + +#define INIT_LDTR_ATTRIBS (SVM_SELECTOR_P_MASK | 2) +#define INIT_TR_ATTRIBS (SVM_SELECTOR_P_MASK | 3) + +static void *snp_alloc_vmsa_page(void) +{ + struct page *p; + + /* + * Allocate VMSA page to work around the SNP erratum where the CPU will + * incorrectly signal an RMP violation #PF if a large page (2MB or 1GB) + * collides with the RMP entry of VMSA page. The recommended workaround + * is to not use a large page. + * + * Allocate an 8k page which is also 8k-aligned. + */ + p = alloc_pages(GFP_KERNEL_ACCOUNT | __GFP_ZERO, 1); + if (!p) + return NULL; + + split_page(p, 1); + + /* Free the first 4k. This page may be 2M/1G aligned and cannot be used. */ + __free_page(p); + + return page_address(p + 1); +} + +static void snp_cleanup_vmsa(struct sev_es_save_area *vmsa) +{ + int err; + + err = snp_set_vmsa(vmsa, false); + if (err) + pr_err("clear VMSA page failed (%u), leaking page\n", err); + else + free_page((unsigned long)vmsa); +} + +static int wakeup_cpu_via_vmgexit(int apic_id, unsigned long start_ip) +{ + struct sev_es_save_area *cur_vmsa, *vmsa; + struct ghcb_state state; + unsigned long flags; + struct ghcb *ghcb; + u8 sipi_vector; + int cpu, ret; + u64 cr4; + + /* + * The hypervisor SNP feature support check has happened earlier, just check + * the AP_CREATION one here. + */ + if (!(sev_hv_features & GHCB_HV_FT_SNP_AP_CREATION)) + return -EOPNOTSUPP; + + /* + * Verify the desired start IP against the known trampoline start IP + * to catch any future new trampolines that may be introduced that + * would require a new protected guest entry point. + */ + if (WARN_ONCE(start_ip != real_mode_header->trampoline_start, + "Unsupported SNP start_ip: %lx\n", start_ip)) + return -EINVAL; + + /* Override start_ip with known protected guest start IP */ + start_ip = real_mode_header->sev_es_trampoline_start; + + /* Find the logical CPU for the APIC ID */ + for_each_present_cpu(cpu) { + if (arch_match_cpu_phys_id(cpu, apic_id)) + break; + } + if (cpu >= nr_cpu_ids) + return -EINVAL; + + cur_vmsa = per_cpu(sev_vmsa, cpu); + + /* + * A new VMSA is created each time because there is no guarantee that + * the current VMSA is the kernels or that the vCPU is not running. If + * an attempt was done to use the current VMSA with a running vCPU, a + * #VMEXIT of that vCPU would wipe out all of the settings being done + * here. + */ + vmsa = (struct sev_es_save_area *)snp_alloc_vmsa_page(); + if (!vmsa) + return -ENOMEM; + + /* CR4 should maintain the MCE value */ + cr4 = native_read_cr4() & X86_CR4_MCE; + + /* Set the CS value based on the start_ip converted to a SIPI vector */ + sipi_vector = (start_ip >> 12); + vmsa->cs.base = sipi_vector << 12; + vmsa->cs.limit = AP_INIT_CS_LIMIT; + vmsa->cs.attrib = INIT_CS_ATTRIBS; + vmsa->cs.selector = sipi_vector << 8; + + /* Set the RIP value based on start_ip */ + vmsa->rip = start_ip & 0xfff; + + /* Set AP INIT defaults as documented in the APM */ + vmsa->ds.limit = AP_INIT_DS_LIMIT; + vmsa->ds.attrib = INIT_DS_ATTRIBS; + vmsa->es = vmsa->ds; + vmsa->fs = vmsa->ds; + vmsa->gs = vmsa->ds; + vmsa->ss = vmsa->ds; + + vmsa->gdtr.limit = AP_INIT_GDTR_LIMIT; + vmsa->ldtr.limit = AP_INIT_LDTR_LIMIT; + vmsa->ldtr.attrib = INIT_LDTR_ATTRIBS; + vmsa->idtr.limit = AP_INIT_IDTR_LIMIT; + vmsa->tr.limit = AP_INIT_TR_LIMIT; + vmsa->tr.attrib = INIT_TR_ATTRIBS; + + vmsa->cr4 = cr4; + vmsa->cr0 = AP_INIT_CR0_DEFAULT; + vmsa->dr7 = DR7_RESET_VALUE; + vmsa->dr6 = AP_INIT_DR6_DEFAULT; + vmsa->rflags = AP_INIT_RFLAGS_DEFAULT; + vmsa->g_pat = AP_INIT_GPAT_DEFAULT; + vmsa->xcr0 = AP_INIT_XCR0_DEFAULT; + vmsa->mxcsr = AP_INIT_MXCSR_DEFAULT; + vmsa->x87_ftw = AP_INIT_X87_FTW_DEFAULT; + vmsa->x87_fcw = AP_INIT_X87_FCW_DEFAULT; + + /* SVME must be set. */ + vmsa->efer = EFER_SVME; + + /* + * Set the SNP-specific fields for this VMSA: + * VMPL level + * SEV_FEATURES (matches the SEV STATUS MSR right shifted 2 bits) + */ + vmsa->vmpl = 0; + vmsa->sev_features = sev_status >> 2; + + /* Switch the page over to a VMSA page now that it is initialized */ + ret = snp_set_vmsa(vmsa, true); + if (ret) { + pr_err("set VMSA page failed (%u)\n", ret); + free_page((unsigned long)vmsa); + + return -EINVAL; + } + + /* Issue VMGEXIT AP Creation NAE event */ + local_irq_save(flags); + + ghcb = __sev_get_ghcb(&state); + + vc_ghcb_invalidate(ghcb); + ghcb_set_rax(ghcb, vmsa->sev_features); + ghcb_set_sw_exit_code(ghcb, SVM_VMGEXIT_AP_CREATION); + ghcb_set_sw_exit_info_1(ghcb, ((u64)apic_id << 32) | SVM_VMGEXIT_AP_CREATE); + ghcb_set_sw_exit_info_2(ghcb, __pa(vmsa)); + + sev_es_wr_ghcb_msr(__pa(ghcb)); + VMGEXIT(); + + if (!ghcb_sw_exit_info_1_is_valid(ghcb) || + lower_32_bits(ghcb->save.sw_exit_info_1)) { + pr_err("SNP AP Creation error\n"); + ret = -EINVAL; + } + + __sev_put_ghcb(&state); + + local_irq_restore(flags); + + /* Perform cleanup if there was an error */ + if (ret) { + snp_cleanup_vmsa(vmsa); + vmsa = NULL; + } + + /* Free up any previous VMSA page */ + if (cur_vmsa) + snp_cleanup_vmsa(cur_vmsa); + + /* Record the current VMSA page */ + per_cpu(sev_vmsa, cpu) = vmsa; + + return ret; +} + +void snp_set_wakeup_secondary_cpu(void) +{ + if (!cc_platform_has(CC_ATTR_GUEST_SEV_SNP)) + return; + + /* + * Always set this override if SNP is enabled. This makes it the + * required method to start APs under SNP. If the hypervisor does + * not support AP creation, then no APs will be started. + */ + apic->wakeup_secondary_cpu = wakeup_cpu_via_vmgexit; +} + +int __init sev_es_setup_ap_jump_table(struct real_mode_header *rmh) { u16 startup_cs, startup_ip; phys_addr_t jump_table_pa; @@ -638,8 +1224,7 @@ static enum es_result vc_handle_msr(struct ghcb *ghcb, struct es_em_ctxt *ctxt) ghcb_set_rdx(ghcb, regs->dx); } - ret = sev_es_ghcb_hv_call(ghcb, true, ctxt, SVM_EXIT_MSR, - exit_info_1, 0); + ret = sev_es_ghcb_hv_call(ghcb, ctxt, SVM_EXIT_MSR, exit_info_1, 0); if ((ret == ES_OK) && (!exit_info_1)) { regs->ax = ghcb->save.rax; @@ -649,15 +1234,39 @@ static enum es_result vc_handle_msr(struct ghcb *ghcb, struct es_em_ctxt *ctxt) return ret; } -/* - * This function runs on the first #VC exception after the kernel - * switched to virtual addresses. - */ -static bool __init sev_es_setup_ghcb(void) +static void snp_register_per_cpu_ghcb(void) { + struct sev_es_runtime_data *data; + struct ghcb *ghcb; + + data = this_cpu_read(runtime_data); + ghcb = &data->ghcb_page; + + snp_register_ghcb_early(__pa(ghcb)); +} + +void setup_ghcb(void) +{ + if (!cc_platform_has(CC_ATTR_GUEST_STATE_ENCRYPT)) + return; + /* First make sure the hypervisor talks a supported protocol. */ if (!sev_es_negotiate_protocol()) - return false; + sev_es_terminate(SEV_TERM_SET_GEN, GHCB_SEV_ES_GEN_REQ); + + /* + * Check whether the runtime #VC exception handler is active. It uses + * the per-CPU GHCB page which is set up by sev_es_init_vc_handling(). + * + * If SNP is active, register the per-CPU GHCB page so that the runtime + * exception handler can use it. + */ + if (initial_vc_handler == (unsigned long)kernel_exc_vmm_communication) { + if (cc_platform_has(CC_ATTR_GUEST_SEV_SNP)) + snp_register_per_cpu_ghcb(); + + return; + } /* * Clear the boot_ghcb. The first exception comes in before the bss @@ -668,7 +1277,9 @@ static bool __init sev_es_setup_ghcb(void) /* Alright - Make the boot-ghcb public */ boot_ghcb = &boot_ghcb_page; - return true; + /* SNP guest requires that GHCB GPA must be registered. */ + if (cc_platform_has(CC_ATTR_GUEST_SEV_SNP)) + snp_register_ghcb_early(__pa(&boot_ghcb_page)); } #ifdef CONFIG_HOTPLUG_CPU @@ -771,6 +1382,17 @@ void __init sev_es_init_vc_handling(void) if (!sev_es_check_cpu_features()) panic("SEV-ES CPU Features missing"); + /* + * SNP is supported in v2 of the GHCB spec which mandates support for HV + * features. + */ + if (cc_platform_has(CC_ATTR_GUEST_SEV_SNP)) { + sev_hv_features = get_hv_features(); + + if (!(sev_hv_features & GHCB_HV_FT_SNP)) + sev_es_terminate(SEV_TERM_SET_GEN, GHCB_SNP_UNSUPPORTED); + } + /* Enable SEV-ES special handling */ static_branch_enable(&sev_es_enable_key); @@ -797,22 +1419,6 @@ static void __init vc_early_forward_exception(struct es_em_ctxt *ctxt) do_early_exception(ctxt->regs, trapnr); } -static long *vc_insn_get_reg(struct es_em_ctxt *ctxt) -{ - long *reg_array; - int offset; - - reg_array = (long *)ctxt->regs; - offset = insn_get_modrm_reg_off(&ctxt->insn, ctxt->regs); - - if (offset < 0) - return NULL; - - offset /= sizeof(long); - - return reg_array + offset; -} - static long *vc_insn_get_rm(struct es_em_ctxt *ctxt) { long *reg_array; @@ -857,77 +1463,7 @@ static enum es_result vc_do_mmio(struct ghcb *ghcb, struct es_em_ctxt *ctxt, ghcb_set_sw_scratch(ghcb, ghcb_pa + offsetof(struct ghcb, shared_buffer)); - return sev_es_ghcb_hv_call(ghcb, true, ctxt, exit_code, exit_info_1, exit_info_2); -} - -static enum es_result vc_handle_mmio_twobyte_ops(struct ghcb *ghcb, - struct es_em_ctxt *ctxt) -{ - struct insn *insn = &ctxt->insn; - unsigned int bytes = 0; - enum es_result ret; - int sign_byte; - long *reg_data; - - switch (insn->opcode.bytes[1]) { - /* MMIO Read w/ zero-extension */ - case 0xb6: - bytes = 1; - fallthrough; - case 0xb7: - if (!bytes) - bytes = 2; - - ret = vc_do_mmio(ghcb, ctxt, bytes, true); - if (ret) - break; - - /* Zero extend based on operand size */ - reg_data = vc_insn_get_reg(ctxt); - if (!reg_data) - return ES_DECODE_FAILED; - - memset(reg_data, 0, insn->opnd_bytes); - - memcpy(reg_data, ghcb->shared_buffer, bytes); - break; - - /* MMIO Read w/ sign-extension */ - case 0xbe: - bytes = 1; - fallthrough; - case 0xbf: - if (!bytes) - bytes = 2; - - ret = vc_do_mmio(ghcb, ctxt, bytes, true); - if (ret) - break; - - /* Sign extend based on operand size */ - reg_data = vc_insn_get_reg(ctxt); - if (!reg_data) - return ES_DECODE_FAILED; - - if (bytes == 1) { - u8 *val = (u8 *)ghcb->shared_buffer; - - sign_byte = (*val & 0x80) ? 0xff : 0x00; - } else { - u16 *val = (u16 *)ghcb->shared_buffer; - - sign_byte = (*val & 0x8000) ? 0xff : 0x00; - } - memset(reg_data, sign_byte, insn->opnd_bytes); - - memcpy(reg_data, ghcb->shared_buffer, bytes); - break; - - default: - ret = ES_UNSUPPORTED; - } - - return ret; + return sev_es_ghcb_hv_call(ghcb, ctxt, exit_code, exit_info_1, exit_info_2); } /* @@ -997,83 +1533,79 @@ static enum es_result vc_handle_mmio_movs(struct es_em_ctxt *ctxt, return ES_RETRY; } -static enum es_result vc_handle_mmio(struct ghcb *ghcb, - struct es_em_ctxt *ctxt) +static enum es_result vc_handle_mmio(struct ghcb *ghcb, struct es_em_ctxt *ctxt) { struct insn *insn = &ctxt->insn; unsigned int bytes = 0; + enum mmio_type mmio; enum es_result ret; + u8 sign_byte; long *reg_data; - switch (insn->opcode.bytes[0]) { - /* MMIO Write */ - case 0x88: - bytes = 1; - fallthrough; - case 0x89: - if (!bytes) - bytes = insn->opnd_bytes; + mmio = insn_decode_mmio(insn, &bytes); + if (mmio == MMIO_DECODE_FAILED) + return ES_DECODE_FAILED; - reg_data = vc_insn_get_reg(ctxt); + if (mmio != MMIO_WRITE_IMM && mmio != MMIO_MOVS) { + reg_data = insn_get_modrm_reg_ptr(insn, ctxt->regs); if (!reg_data) return ES_DECODE_FAILED; + } + switch (mmio) { + case MMIO_WRITE: memcpy(ghcb->shared_buffer, reg_data, bytes); - ret = vc_do_mmio(ghcb, ctxt, bytes, false); break; - - case 0xc6: - bytes = 1; - fallthrough; - case 0xc7: - if (!bytes) - bytes = insn->opnd_bytes; - + case MMIO_WRITE_IMM: memcpy(ghcb->shared_buffer, insn->immediate1.bytes, bytes); - ret = vc_do_mmio(ghcb, ctxt, bytes, false); break; - - /* MMIO Read */ - case 0x8a: - bytes = 1; - fallthrough; - case 0x8b: - if (!bytes) - bytes = insn->opnd_bytes; - + case MMIO_READ: ret = vc_do_mmio(ghcb, ctxt, bytes, true); if (ret) break; - reg_data = vc_insn_get_reg(ctxt); - if (!reg_data) - return ES_DECODE_FAILED; - /* Zero-extend for 32-bit operation */ if (bytes == 4) *reg_data = 0; memcpy(reg_data, ghcb->shared_buffer, bytes); break; + case MMIO_READ_ZERO_EXTEND: + ret = vc_do_mmio(ghcb, ctxt, bytes, true); + if (ret) + break; - /* MOVS instruction */ - case 0xa4: - bytes = 1; - fallthrough; - case 0xa5: - if (!bytes) - bytes = insn->opnd_bytes; + /* Zero extend based on operand size */ + memset(reg_data, 0, insn->opnd_bytes); + memcpy(reg_data, ghcb->shared_buffer, bytes); + break; + case MMIO_READ_SIGN_EXTEND: + ret = vc_do_mmio(ghcb, ctxt, bytes, true); + if (ret) + break; - ret = vc_handle_mmio_movs(ctxt, bytes); + if (bytes == 1) { + u8 *val = (u8 *)ghcb->shared_buffer; + + sign_byte = (*val & 0x80) ? 0xff : 0x00; + } else { + u16 *val = (u16 *)ghcb->shared_buffer; + + sign_byte = (*val & 0x8000) ? 0xff : 0x00; + } + + /* Sign extend based on operand size */ + memset(reg_data, sign_byte, insn->opnd_bytes); + memcpy(reg_data, ghcb->shared_buffer, bytes); break; - /* Two-Byte Opcodes */ - case 0x0f: - ret = vc_handle_mmio_twobyte_ops(ghcb, ctxt); + case MMIO_MOVS: + ret = vc_handle_mmio_movs(ctxt, bytes); break; default: ret = ES_UNSUPPORTED; + break; } return ret; @@ -1107,7 +1639,7 @@ static enum es_result vc_handle_dr7_write(struct ghcb *ghcb, /* Using a value of 0 for ExitInfo1 means RAX holds the value */ ghcb_set_rax(ghcb, val); - ret = sev_es_ghcb_hv_call(ghcb, true, ctxt, SVM_EXIT_WRITE_DR7, 0, 0); + ret = sev_es_ghcb_hv_call(ghcb, ctxt, SVM_EXIT_WRITE_DR7, 0, 0); if (ret != ES_OK) return ret; @@ -1137,7 +1669,7 @@ static enum es_result vc_handle_dr7_read(struct ghcb *ghcb, static enum es_result vc_handle_wbinvd(struct ghcb *ghcb, struct es_em_ctxt *ctxt) { - return sev_es_ghcb_hv_call(ghcb, true, ctxt, SVM_EXIT_WBINVD, 0, 0); + return sev_es_ghcb_hv_call(ghcb, ctxt, SVM_EXIT_WBINVD, 0, 0); } static enum es_result vc_handle_rdpmc(struct ghcb *ghcb, struct es_em_ctxt *ctxt) @@ -1146,7 +1678,7 @@ static enum es_result vc_handle_rdpmc(struct ghcb *ghcb, struct es_em_ctxt *ctxt ghcb_set_rcx(ghcb, ctxt->regs->cx); - ret = sev_es_ghcb_hv_call(ghcb, true, ctxt, SVM_EXIT_RDPMC, 0, 0); + ret = sev_es_ghcb_hv_call(ghcb, ctxt, SVM_EXIT_RDPMC, 0, 0); if (ret != ES_OK) return ret; @@ -1187,7 +1719,7 @@ static enum es_result vc_handle_vmmcall(struct ghcb *ghcb, if (x86_platform.hyper.sev_es_hcall_prepare) x86_platform.hyper.sev_es_hcall_prepare(ghcb, ctxt->regs); - ret = sev_es_ghcb_hv_call(ghcb, true, ctxt, SVM_EXIT_VMMCALL, 0, 0); + ret = sev_es_ghcb_hv_call(ghcb, ctxt, SVM_EXIT_VMMCALL, 0, 0); if (ret != ES_OK) return ret; @@ -1432,7 +1964,7 @@ DEFINE_IDTENTRY_VC_KERNEL(exc_vmm_communication) show_regs(regs); /* Ask hypervisor to sev_es_terminate */ - sev_es_terminate(GHCB_SEV_ES_REASON_GENERAL_REQUEST); + sev_es_terminate(SEV_TERM_SET_GEN, GHCB_SEV_ES_GEN_REQ); /* If that fails and we get here - just panic */ panic("Returned from Terminate-Request to Hypervisor\n"); @@ -1478,10 +2010,6 @@ bool __init handle_vc_boot_ghcb(struct pt_regs *regs) struct es_em_ctxt ctxt; enum es_result result; - /* Do initial setup or terminate the guest */ - if (unlikely(boot_ghcb == NULL && !sev_es_setup_ghcb())) - sev_es_terminate(GHCB_SEV_ES_REASON_GENERAL_REQUEST); - vc_ghcb_invalidate(boot_ghcb); result = vc_init_em_ctxt(&ctxt, regs, exit_code); @@ -1520,6 +2048,215 @@ bool __init handle_vc_boot_ghcb(struct pt_regs *regs) fail: show_regs(regs); - while (true) - halt(); + sev_es_terminate(SEV_TERM_SET_GEN, GHCB_SEV_ES_GEN_REQ); +} + +/* + * Initial set up of SNP relies on information provided by the + * Confidential Computing blob, which can be passed to the kernel + * in the following ways, depending on how it is booted: + * + * - when booted via the boot/decompress kernel: + * - via boot_params + * + * - when booted directly by firmware/bootloader (e.g. CONFIG_PVH): + * - via a setup_data entry, as defined by the Linux Boot Protocol + * + * Scan for the blob in that order. + */ +static __init struct cc_blob_sev_info *find_cc_blob(struct boot_params *bp) +{ + struct cc_blob_sev_info *cc_info; + + /* Boot kernel would have passed the CC blob via boot_params. */ + if (bp->cc_blob_address) { + cc_info = (struct cc_blob_sev_info *)(unsigned long)bp->cc_blob_address; + goto found_cc_info; + } + + /* + * If kernel was booted directly, without the use of the + * boot/decompression kernel, the CC blob may have been passed via + * setup_data instead. + */ + cc_info = find_cc_blob_setup_data(bp); + if (!cc_info) + return NULL; + +found_cc_info: + if (cc_info->magic != CC_BLOB_SEV_HDR_MAGIC) + snp_abort(); + + return cc_info; +} + +bool __init snp_init(struct boot_params *bp) +{ + struct cc_blob_sev_info *cc_info; + + if (!bp) + return false; + + cc_info = find_cc_blob(bp); + if (!cc_info) + return false; + + setup_cpuid_table(cc_info); + + /* + * The CC blob will be used later to access the secrets page. Cache + * it here like the boot kernel does. + */ + bp->cc_blob_address = (u32)(unsigned long)cc_info; + + return true; +} + +void __init __noreturn snp_abort(void) +{ + sev_es_terminate(SEV_TERM_SET_GEN, GHCB_SNP_UNSUPPORTED); +} + +static void dump_cpuid_table(void) +{ + const struct snp_cpuid_table *cpuid_table = snp_cpuid_get_table(); + int i = 0; + + pr_info("count=%d reserved=0x%x reserved2=0x%llx\n", + cpuid_table->count, cpuid_table->__reserved1, cpuid_table->__reserved2); + + for (i = 0; i < SNP_CPUID_COUNT_MAX; i++) { + const struct snp_cpuid_fn *fn = &cpuid_table->fn[i]; + + pr_info("index=%3d fn=0x%08x subfn=0x%08x: eax=0x%08x ebx=0x%08x ecx=0x%08x edx=0x%08x xcr0_in=0x%016llx xss_in=0x%016llx reserved=0x%016llx\n", + i, fn->eax_in, fn->ecx_in, fn->eax, fn->ebx, fn->ecx, + fn->edx, fn->xcr0_in, fn->xss_in, fn->__reserved); + } +} + +/* + * It is useful from an auditing/testing perspective to provide an easy way + * for the guest owner to know that the CPUID table has been initialized as + * expected, but that initialization happens too early in boot to print any + * sort of indicator, and there's not really any other good place to do it, + * so do it here. + */ +static int __init report_cpuid_table(void) +{ + const struct snp_cpuid_table *cpuid_table = snp_cpuid_get_table(); + + if (!cpuid_table->count) + return 0; + + pr_info("Using SNP CPUID table, %d entries present.\n", + cpuid_table->count); + + if (sev_cfg.debug) + dump_cpuid_table(); + + return 0; +} +arch_initcall(report_cpuid_table); + +static int __init init_sev_config(char *str) +{ + char *s; + + while ((s = strsep(&str, ","))) { + if (!strcmp(s, "debug")) { + sev_cfg.debug = true; + continue; + } + + pr_info("SEV command-line option '%s' was not recognized\n", s); + } + + return 1; +} +__setup("sev=", init_sev_config); + +int snp_issue_guest_request(u64 exit_code, struct snp_req_data *input, unsigned long *fw_err) +{ + struct ghcb_state state; + struct es_em_ctxt ctxt; + unsigned long flags; + struct ghcb *ghcb; + int ret; + + if (!cc_platform_has(CC_ATTR_GUEST_SEV_SNP)) + return -ENODEV; + + if (!fw_err) + return -EINVAL; + + /* + * __sev_get_ghcb() needs to run with IRQs disabled because it is using + * a per-CPU GHCB. + */ + local_irq_save(flags); + + ghcb = __sev_get_ghcb(&state); + if (!ghcb) { + ret = -EIO; + goto e_restore_irq; + } + + vc_ghcb_invalidate(ghcb); + + if (exit_code == SVM_VMGEXIT_EXT_GUEST_REQUEST) { + ghcb_set_rax(ghcb, input->data_gpa); + ghcb_set_rbx(ghcb, input->data_npages); + } + + ret = sev_es_ghcb_hv_call(ghcb, &ctxt, exit_code, input->req_gpa, input->resp_gpa); + if (ret) + goto e_put; + + if (ghcb->save.sw_exit_info_2) { + /* Number of expected pages are returned in RBX */ + if (exit_code == SVM_VMGEXIT_EXT_GUEST_REQUEST && + ghcb->save.sw_exit_info_2 == SNP_GUEST_REQ_INVALID_LEN) + input->data_npages = ghcb_get_rbx(ghcb); + + *fw_err = ghcb->save.sw_exit_info_2; + + ret = -EIO; + } + +e_put: + __sev_put_ghcb(&state); +e_restore_irq: + local_irq_restore(flags); + + return ret; +} +EXPORT_SYMBOL_GPL(snp_issue_guest_request); + +static struct platform_device sev_guest_device = { + .name = "sev-guest", + .id = -1, +}; + +static int __init snp_init_platform_device(void) +{ + struct sev_guest_platform_data data; + u64 gpa; + + if (!cc_platform_has(CC_ATTR_GUEST_SEV_SNP)) + return -ENODEV; + + gpa = get_secrets_page(); + if (!gpa) + return -ENODEV; + + data.secrets_gpa = gpa; + if (platform_device_add_data(&sev_guest_device, &data, sizeof(data))) + return -ENODEV; + + if (platform_device_register(&sev_guest_device)) + return -ENODEV; + + pr_info("SNP guest platform device initialized.\n"); + return 0; } +device_initcall(snp_init_platform_device); diff --git a/arch/x86/kernel/sev_verify_cbit.S b/arch/x86/kernel/sev_verify_cbit.S index ee04941a6546..3355e27c69eb 100644 --- a/arch/x86/kernel/sev_verify_cbit.S +++ b/arch/x86/kernel/sev_verify_cbit.S @@ -85,5 +85,5 @@ SYM_FUNC_START(sev_verify_cbit) #endif /* Return page-table pointer */ movq %rdi, %rax - ret + RET SYM_FUNC_END(sev_verify_cbit) diff --git a/arch/x86/kernel/signal.c b/arch/x86/kernel/signal.c index ec71e06ae364..9c7265b524c7 100644 --- a/arch/x86/kernel/signal.c +++ b/arch/x86/kernel/signal.c @@ -18,7 +18,6 @@ #include <linux/kstrtox.h> #include <linux/errno.h> #include <linux/wait.h> -#include <linux/tracehook.h> #include <linux/unistd.h> #include <linux/stddef.h> #include <linux/personality.h> @@ -94,7 +93,7 @@ static bool restore_sigcontext(struct pt_regs *regs, return false; #ifdef CONFIG_X86_32 - set_user_gs(regs, sc.gs); + loadsegment(gs, sc.gs); regs->fs = sc.fs; regs->es = sc.es; regs->ds = sc.ds; @@ -147,8 +146,10 @@ __unsafe_setup_sigcontext(struct sigcontext __user *sc, void __user *fpstate, struct pt_regs *regs, unsigned long mask) { #ifdef CONFIG_X86_32 - unsafe_put_user(get_user_gs(regs), - (unsigned int __user *)&sc->gs, Efault); + unsigned int gs; + savesegment(gs, gs); + + unsafe_put_user(gs, (unsigned int __user *)&sc->gs, Efault); unsafe_put_user(regs->fs, (unsigned int __user *)&sc->fs, Efault); unsafe_put_user(regs->es, (unsigned int __user *)&sc->es, Efault); unsafe_put_user(regs->ds, (unsigned int __user *)&sc->ds, Efault); @@ -861,11 +862,11 @@ static inline unsigned long get_nr_restart_syscall(const struct pt_regs *regs) * want to handle. Thus you cannot kill init even with a SIGKILL even by * mistake. */ -void arch_do_signal_or_restart(struct pt_regs *regs, bool has_signal) +void arch_do_signal_or_restart(struct pt_regs *regs) { struct ksignal ksig; - if (has_signal && get_signal(&ksig)) { + if (get_signal(&ksig)) { /* Whee! Actually deliver the signal. */ handle_signal(&ksig, regs); return; diff --git a/arch/x86/kernel/signal_compat.c b/arch/x86/kernel/signal_compat.c index b52407c56000..879ef8c72f5c 100644 --- a/arch/x86/kernel/signal_compat.c +++ b/arch/x86/kernel/signal_compat.c @@ -149,8 +149,10 @@ static inline void signal_compat_build_tests(void) BUILD_BUG_ON(offsetof(siginfo_t, si_perf_data) != 0x18); BUILD_BUG_ON(offsetof(siginfo_t, si_perf_type) != 0x20); + BUILD_BUG_ON(offsetof(siginfo_t, si_perf_flags) != 0x24); BUILD_BUG_ON(offsetof(compat_siginfo_t, si_perf_data) != 0x10); BUILD_BUG_ON(offsetof(compat_siginfo_t, si_perf_type) != 0x14); + BUILD_BUG_ON(offsetof(compat_siginfo_t, si_perf_flags) != 0x18); CHECK_CSI_OFFSET(_sigpoll); CHECK_CSI_SIZE (_sigpoll, 2*sizeof(int)); diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c index 617012f4619f..3f3ea0287f69 100644 --- a/arch/x86/kernel/smpboot.c +++ b/arch/x86/kernel/smpboot.c @@ -56,7 +56,6 @@ #include <linux/numa.h> #include <linux/pgtable.h> #include <linux/overflow.h> -#include <linux/syscore_ops.h> #include <asm/acpi.h> #include <asm/desc.h> @@ -82,10 +81,7 @@ #include <asm/spec-ctrl.h> #include <asm/hw_irq.h> #include <asm/stackprotector.h> - -#ifdef CONFIG_ACPI_CPPC_LIB -#include <acpi/cppc_acpi.h> -#endif +#include <asm/sev.h> /* representing HT siblings of each logical CPU */ DEFINE_PER_CPU_READ_MOSTLY(cpumask_var_t, cpu_sibling_map); @@ -99,10 +95,6 @@ EXPORT_PER_CPU_SYMBOL(cpu_core_map); DEFINE_PER_CPU_READ_MOSTLY(cpumask_var_t, cpu_die_map); EXPORT_PER_CPU_SYMBOL(cpu_die_map); -DEFINE_PER_CPU_READ_MOSTLY(cpumask_var_t, cpu_llc_shared_map); - -DEFINE_PER_CPU_READ_MOSTLY(cpumask_var_t, cpu_l2c_shared_map); - /* Per CPU bogomips and other parameters */ DEFINE_PER_CPU_READ_MOSTLY(struct cpuinfo_x86, cpu_info); EXPORT_PER_CPU_SYMBOL(cpu_info); @@ -155,8 +147,6 @@ static inline void smpboot_restore_warm_reset_vector(void) *((volatile u32 *)phys_to_virt(TRAMPOLINE_PHYS_LOW)) = 0; } -static void init_freq_invariance(bool secondary, bool cppc_ready); - /* * Report back to the Boot Processor during boot time or to the caller processor * during CPU online. @@ -193,7 +183,7 @@ static void smp_callin(void) */ set_cpu_sibling_map(raw_smp_processor_id()); - init_freq_invariance(true, false); + ap_init_aperfmperf(); /* * Get our bogomips. @@ -1088,6 +1078,11 @@ static int do_boot_cpu(int apicid, int cpu, struct task_struct *idle, unsigned long boot_error = 0; unsigned long timeout; +#ifdef CONFIG_X86_64 + /* If 64-bit wakeup method exists, use the 64-bit mode trampoline IP */ + if (apic->wakeup_secondary_cpu_64) + start_ip = real_mode_header->trampoline_start64; +#endif idle->thread.sp = (unsigned long)task_pt_regs(idle); early_gdt_descr.address = (unsigned long)get_cpu_gdt_rw(cpu); initial_code = (unsigned long)start_secondary; @@ -1129,11 +1124,14 @@ static int do_boot_cpu(int apicid, int cpu, struct task_struct *idle, /* * Wake up a CPU in difference cases: - * - Use the method in the APIC driver if it's defined + * - Use a method from the APIC driver if one defined, with wakeup + * straight to 64-bit mode preferred over wakeup to RM. * Otherwise, * - Use an INIT boot APIC message for APs or NMI for BSP. */ - if (apic->wakeup_secondary_cpu) + if (apic->wakeup_secondary_cpu_64) + boot_error = apic->wakeup_secondary_cpu_64(apicid, start_ip); + else if (apic->wakeup_secondary_cpu) boot_error = apic->wakeup_secondary_cpu(apicid, start_ip); else boot_error = wakeup_cpu_via_init_nmi(cpu, start_ip, apicid, @@ -1318,7 +1316,7 @@ static void __init smp_sanity_check(void) nr++; } - nr_cpu_ids = 8; + set_nr_cpu_ids(8); } #endif @@ -1403,7 +1401,6 @@ void __init native_smp_prepare_cpus(unsigned int max_cpus) { smp_prepare_cpus_common(); - init_freq_invariance(false, false); smp_sanity_check(); switch (apic_intr_mode) { @@ -1436,6 +1433,8 @@ void __init native_smp_prepare_cpus(unsigned int max_cpus) smp_quirk_init_udelay(); speculative_store_bypass_ht_init(); + + snp_set_wakeup_secondary_cpu(); } void arch_thaw_secondary_cpus_begin(void) @@ -1570,7 +1569,7 @@ __init void prefill_possible_map(void) possible = i; } - nr_cpu_ids = possible; + set_nr_cpu_ids(possible); pr_info("Allowing %d CPUs, %d hotplug CPUs\n", possible, max_t(int, possible - num_processors, 0)); @@ -1853,419 +1852,3 @@ void native_play_dead(void) } #endif - -#ifdef CONFIG_X86_64 -/* - * APERF/MPERF frequency ratio computation. - * - * The scheduler wants to do frequency invariant accounting and needs a <1 - * ratio to account for the 'current' frequency, corresponding to - * freq_curr / freq_max. - * - * Since the frequency freq_curr on x86 is controlled by micro-controller and - * our P-state setting is little more than a request/hint, we need to observe - * the effective frequency 'BusyMHz', i.e. the average frequency over a time - * interval after discarding idle time. This is given by: - * - * BusyMHz = delta_APERF / delta_MPERF * freq_base - * - * where freq_base is the max non-turbo P-state. - * - * The freq_max term has to be set to a somewhat arbitrary value, because we - * can't know which turbo states will be available at a given point in time: - * it all depends on the thermal headroom of the entire package. We set it to - * the turbo level with 4 cores active. - * - * Benchmarks show that's a good compromise between the 1C turbo ratio - * (freq_curr/freq_max would rarely reach 1) and something close to freq_base, - * which would ignore the entire turbo range (a conspicuous part, making - * freq_curr/freq_max always maxed out). - * - * An exception to the heuristic above is the Atom uarch, where we choose the - * highest turbo level for freq_max since Atom's are generally oriented towards - * power efficiency. - * - * Setting freq_max to anything less than the 1C turbo ratio makes the ratio - * freq_curr / freq_max to eventually grow >1, in which case we clip it to 1. - */ - -DEFINE_STATIC_KEY_FALSE(arch_scale_freq_key); - -static DEFINE_PER_CPU(u64, arch_prev_aperf); -static DEFINE_PER_CPU(u64, arch_prev_mperf); -static u64 arch_turbo_freq_ratio = SCHED_CAPACITY_SCALE; -static u64 arch_max_freq_ratio = SCHED_CAPACITY_SCALE; - -void arch_set_max_freq_ratio(bool turbo_disabled) -{ - arch_max_freq_ratio = turbo_disabled ? SCHED_CAPACITY_SCALE : - arch_turbo_freq_ratio; -} -EXPORT_SYMBOL_GPL(arch_set_max_freq_ratio); - -static bool turbo_disabled(void) -{ - u64 misc_en; - int err; - - err = rdmsrl_safe(MSR_IA32_MISC_ENABLE, &misc_en); - if (err) - return false; - - return (misc_en & MSR_IA32_MISC_ENABLE_TURBO_DISABLE); -} - -static bool slv_set_max_freq_ratio(u64 *base_freq, u64 *turbo_freq) -{ - int err; - - err = rdmsrl_safe(MSR_ATOM_CORE_RATIOS, base_freq); - if (err) - return false; - - err = rdmsrl_safe(MSR_ATOM_CORE_TURBO_RATIOS, turbo_freq); - if (err) - return false; - - *base_freq = (*base_freq >> 16) & 0x3F; /* max P state */ - *turbo_freq = *turbo_freq & 0x3F; /* 1C turbo */ - - return true; -} - -#define X86_MATCH(model) \ - X86_MATCH_VENDOR_FAM_MODEL_FEATURE(INTEL, 6, \ - INTEL_FAM6_##model, X86_FEATURE_APERFMPERF, NULL) - -static const struct x86_cpu_id has_knl_turbo_ratio_limits[] = { - X86_MATCH(XEON_PHI_KNL), - X86_MATCH(XEON_PHI_KNM), - {} -}; - -static const struct x86_cpu_id has_skx_turbo_ratio_limits[] = { - X86_MATCH(SKYLAKE_X), - {} -}; - -static const struct x86_cpu_id has_glm_turbo_ratio_limits[] = { - X86_MATCH(ATOM_GOLDMONT), - X86_MATCH(ATOM_GOLDMONT_D), - X86_MATCH(ATOM_GOLDMONT_PLUS), - {} -}; - -static bool knl_set_max_freq_ratio(u64 *base_freq, u64 *turbo_freq, - int num_delta_fratio) -{ - int fratio, delta_fratio, found; - int err, i; - u64 msr; - - err = rdmsrl_safe(MSR_PLATFORM_INFO, base_freq); - if (err) - return false; - - *base_freq = (*base_freq >> 8) & 0xFF; /* max P state */ - - err = rdmsrl_safe(MSR_TURBO_RATIO_LIMIT, &msr); - if (err) - return false; - - fratio = (msr >> 8) & 0xFF; - i = 16; - found = 0; - do { - if (found >= num_delta_fratio) { - *turbo_freq = fratio; - return true; - } - - delta_fratio = (msr >> (i + 5)) & 0x7; - - if (delta_fratio) { - found += 1; - fratio -= delta_fratio; - } - - i += 8; - } while (i < 64); - - return true; -} - -static bool skx_set_max_freq_ratio(u64 *base_freq, u64 *turbo_freq, int size) -{ - u64 ratios, counts; - u32 group_size; - int err, i; - - err = rdmsrl_safe(MSR_PLATFORM_INFO, base_freq); - if (err) - return false; - - *base_freq = (*base_freq >> 8) & 0xFF; /* max P state */ - - err = rdmsrl_safe(MSR_TURBO_RATIO_LIMIT, &ratios); - if (err) - return false; - - err = rdmsrl_safe(MSR_TURBO_RATIO_LIMIT1, &counts); - if (err) - return false; - - for (i = 0; i < 64; i += 8) { - group_size = (counts >> i) & 0xFF; - if (group_size >= size) { - *turbo_freq = (ratios >> i) & 0xFF; - return true; - } - } - - return false; -} - -static bool core_set_max_freq_ratio(u64 *base_freq, u64 *turbo_freq) -{ - u64 msr; - int err; - - err = rdmsrl_safe(MSR_PLATFORM_INFO, base_freq); - if (err) - return false; - - err = rdmsrl_safe(MSR_TURBO_RATIO_LIMIT, &msr); - if (err) - return false; - - *base_freq = (*base_freq >> 8) & 0xFF; /* max P state */ - *turbo_freq = (msr >> 24) & 0xFF; /* 4C turbo */ - - /* The CPU may have less than 4 cores */ - if (!*turbo_freq) - *turbo_freq = msr & 0xFF; /* 1C turbo */ - - return true; -} - -static bool intel_set_max_freq_ratio(void) -{ - u64 base_freq, turbo_freq; - u64 turbo_ratio; - - if (slv_set_max_freq_ratio(&base_freq, &turbo_freq)) - goto out; - - if (x86_match_cpu(has_glm_turbo_ratio_limits) && - skx_set_max_freq_ratio(&base_freq, &turbo_freq, 1)) - goto out; - - if (x86_match_cpu(has_knl_turbo_ratio_limits) && - knl_set_max_freq_ratio(&base_freq, &turbo_freq, 1)) - goto out; - - if (x86_match_cpu(has_skx_turbo_ratio_limits) && - skx_set_max_freq_ratio(&base_freq, &turbo_freq, 4)) - goto out; - - if (core_set_max_freq_ratio(&base_freq, &turbo_freq)) - goto out; - - return false; - -out: - /* - * Some hypervisors advertise X86_FEATURE_APERFMPERF - * but then fill all MSR's with zeroes. - * Some CPUs have turbo boost but don't declare any turbo ratio - * in MSR_TURBO_RATIO_LIMIT. - */ - if (!base_freq || !turbo_freq) { - pr_debug("Couldn't determine cpu base or turbo frequency, necessary for scale-invariant accounting.\n"); - return false; - } - - turbo_ratio = div_u64(turbo_freq * SCHED_CAPACITY_SCALE, base_freq); - if (!turbo_ratio) { - pr_debug("Non-zero turbo and base frequencies led to a 0 ratio.\n"); - return false; - } - - arch_turbo_freq_ratio = turbo_ratio; - arch_set_max_freq_ratio(turbo_disabled()); - - return true; -} - -#ifdef CONFIG_ACPI_CPPC_LIB -static bool amd_set_max_freq_ratio(void) -{ - struct cppc_perf_caps perf_caps; - u64 highest_perf, nominal_perf; - u64 perf_ratio; - int rc; - - rc = cppc_get_perf_caps(0, &perf_caps); - if (rc) { - pr_debug("Could not retrieve perf counters (%d)\n", rc); - return false; - } - - highest_perf = amd_get_highest_perf(); - nominal_perf = perf_caps.nominal_perf; - - if (!highest_perf || !nominal_perf) { - pr_debug("Could not retrieve highest or nominal performance\n"); - return false; - } - - perf_ratio = div_u64(highest_perf * SCHED_CAPACITY_SCALE, nominal_perf); - /* midpoint between max_boost and max_P */ - perf_ratio = (perf_ratio + SCHED_CAPACITY_SCALE) >> 1; - if (!perf_ratio) { - pr_debug("Non-zero highest/nominal perf values led to a 0 ratio\n"); - return false; - } - - arch_turbo_freq_ratio = perf_ratio; - arch_set_max_freq_ratio(false); - - return true; -} -#else -static bool amd_set_max_freq_ratio(void) -{ - return false; -} -#endif - -static void init_counter_refs(void) -{ - u64 aperf, mperf; - - rdmsrl(MSR_IA32_APERF, aperf); - rdmsrl(MSR_IA32_MPERF, mperf); - - this_cpu_write(arch_prev_aperf, aperf); - this_cpu_write(arch_prev_mperf, mperf); -} - -#ifdef CONFIG_PM_SLEEP -static struct syscore_ops freq_invariance_syscore_ops = { - .resume = init_counter_refs, -}; - -static void register_freq_invariance_syscore_ops(void) -{ - /* Bail out if registered already. */ - if (freq_invariance_syscore_ops.node.prev) - return; - - register_syscore_ops(&freq_invariance_syscore_ops); -} -#else -static inline void register_freq_invariance_syscore_ops(void) {} -#endif - -static void init_freq_invariance(bool secondary, bool cppc_ready) -{ - bool ret = false; - - if (!boot_cpu_has(X86_FEATURE_APERFMPERF)) - return; - - if (secondary) { - if (static_branch_likely(&arch_scale_freq_key)) { - init_counter_refs(); - } - return; - } - - if (boot_cpu_data.x86_vendor == X86_VENDOR_INTEL) - ret = intel_set_max_freq_ratio(); - else if (boot_cpu_data.x86_vendor == X86_VENDOR_AMD) { - if (!cppc_ready) { - return; - } - ret = amd_set_max_freq_ratio(); - } - - if (ret) { - init_counter_refs(); - static_branch_enable(&arch_scale_freq_key); - register_freq_invariance_syscore_ops(); - pr_info("Estimated ratio of average max frequency by base frequency (times 1024): %llu\n", arch_max_freq_ratio); - } else { - pr_debug("Couldn't determine max cpu frequency, necessary for scale-invariant accounting.\n"); - } -} - -#ifdef CONFIG_ACPI_CPPC_LIB -static DEFINE_MUTEX(freq_invariance_lock); - -void init_freq_invariance_cppc(void) -{ - static bool secondary; - - mutex_lock(&freq_invariance_lock); - - init_freq_invariance(secondary, true); - secondary = true; - - mutex_unlock(&freq_invariance_lock); -} -#endif - -static void disable_freq_invariance_workfn(struct work_struct *work) -{ - static_branch_disable(&arch_scale_freq_key); -} - -static DECLARE_WORK(disable_freq_invariance_work, - disable_freq_invariance_workfn); - -DEFINE_PER_CPU(unsigned long, arch_freq_scale) = SCHED_CAPACITY_SCALE; - -void arch_scale_freq_tick(void) -{ - u64 freq_scale; - u64 aperf, mperf; - u64 acnt, mcnt; - - if (!arch_scale_freq_invariant()) - return; - - rdmsrl(MSR_IA32_APERF, aperf); - rdmsrl(MSR_IA32_MPERF, mperf); - - acnt = aperf - this_cpu_read(arch_prev_aperf); - mcnt = mperf - this_cpu_read(arch_prev_mperf); - - this_cpu_write(arch_prev_aperf, aperf); - this_cpu_write(arch_prev_mperf, mperf); - - if (check_shl_overflow(acnt, 2*SCHED_CAPACITY_SHIFT, &acnt)) - goto error; - - if (check_mul_overflow(mcnt, arch_max_freq_ratio, &mcnt) || !mcnt) - goto error; - - freq_scale = div64_u64(acnt, mcnt); - if (!freq_scale) - goto error; - - if (freq_scale > SCHED_CAPACITY_SCALE) - freq_scale = SCHED_CAPACITY_SCALE; - - this_cpu_write(arch_freq_scale, freq_scale); - return; - -error: - pr_warn("Scheduler frequency invariance went wobbly, disabling!\n"); - schedule_work(&disable_freq_invariance_work); -} -#else -static inline void init_freq_invariance(bool secondary, bool cppc_ready) -{ -} -#endif /* CONFIG_X86_64 */ diff --git a/arch/x86/kernel/stacktrace.c b/arch/x86/kernel/stacktrace.c index 15b058eefc4e..ee117fcf46ed 100644 --- a/arch/x86/kernel/stacktrace.c +++ b/arch/x86/kernel/stacktrace.c @@ -90,7 +90,7 @@ copy_stack_frame(const struct stack_frame_user __user *fp, { int ret; - if (__range_not_ok(fp, sizeof(*frame), TASK_SIZE)) + if (!__access_ok(fp, sizeof(*frame))) return 0; ret = 1; diff --git a/arch/x86/kernel/static_call.c b/arch/x86/kernel/static_call.c index 9c407a33a774..aaaba85d6d7f 100644 --- a/arch/x86/kernel/static_call.c +++ b/arch/x86/kernel/static_call.c @@ -12,12 +12,21 @@ enum insn_type { }; /* - * data16 data16 xorq %rax, %rax - a single 5 byte instruction that clears %rax - * The REX.W cancels the effect of any data16. + * ud1 %esp, %ecx - a 3 byte #UD that is unique to trampolines, chosen such + * that there is no false-positive trampoline identification while also being a + * speculation stop. */ -static const u8 xor5rax[] = { 0x66, 0x66, 0x48, 0x31, 0xc0 }; +static const u8 tramp_ud[] = { 0x0f, 0xb9, 0xcc }; -static void __ref __static_call_transform(void *insn, enum insn_type type, void *func) +/* + * cs cs cs xorl %eax, %eax - a single 5 byte instruction that clears %[er]ax + */ +static const u8 xor5rax[] = { 0x2e, 0x2e, 0x2e, 0x31, 0xc0 }; + +static const u8 retinsn[] = { RET_INSN_OPCODE, 0xcc, 0xcc, 0xcc, 0xcc }; + +static void __ref __static_call_transform(void *insn, enum insn_type type, + void *func, bool modinit) { const void *emulate = NULL; int size = CALL_INSN_SIZE; @@ -42,15 +51,17 @@ static void __ref __static_call_transform(void *insn, enum insn_type type, void break; case RET: - code = text_gen_insn(RET_INSN_OPCODE, insn, func); - size = RET_INSN_SIZE; + if (cpu_feature_enabled(X86_FEATURE_RETHUNK)) + code = text_gen_insn(JMP32_INSN_OPCODE, insn, &__x86_return_thunk); + else + code = &retinsn; break; } if (memcmp(insn, code, size) == 0) return; - if (unlikely(system_state == SYSTEM_BOOTING)) + if (system_state == SYSTEM_BOOTING || modinit) return text_poke_early(insn, code, size); text_poke_bp(insn, code, size, emulate); @@ -60,7 +71,7 @@ static void __static_call_validate(void *insn, bool tail, bool tramp) { u8 opcode = *(u8 *)insn; - if (tramp && memcmp(insn+5, "SCT", 3)) { + if (tramp && memcmp(insn+5, tramp_ud, 3)) { pr_err("trampoline signature fail"); BUG(); } @@ -104,14 +115,42 @@ void arch_static_call_transform(void *site, void *tramp, void *func, bool tail) if (tramp) { __static_call_validate(tramp, true, true); - __static_call_transform(tramp, __sc_insn(!func, true), func); + __static_call_transform(tramp, __sc_insn(!func, true), func, false); } if (IS_ENABLED(CONFIG_HAVE_STATIC_CALL_INLINE) && site) { __static_call_validate(site, tail, false); - __static_call_transform(site, __sc_insn(!func, tail), func); + __static_call_transform(site, __sc_insn(!func, tail), func, false); } mutex_unlock(&text_mutex); } EXPORT_SYMBOL_GPL(arch_static_call_transform); + +#ifdef CONFIG_RETHUNK +/* + * This is called by apply_returns() to fix up static call trampolines, + * specifically ARCH_DEFINE_STATIC_CALL_NULL_TRAMP which is recorded as + * having a return trampoline. + * + * The problem is that static_call() is available before determining + * X86_FEATURE_RETHUNK and, by implication, running alternatives. + * + * This means that __static_call_transform() above can have overwritten the + * return trampoline and we now need to fix things up to be consistent. + */ +bool __static_call_fixup(void *tramp, u8 op, void *dest) +{ + if (memcmp(tramp+5, tramp_ud, 3)) { + /* Not a trampoline site, not our problem. */ + return false; + } + + mutex_lock(&text_mutex); + if (op == RET_INSN_OPCODE || dest == &__x86_return_thunk) + __static_call_transform(tramp, RET, NULL, true); + mutex_unlock(&text_mutex); + + return true; +} +#endif diff --git a/arch/x86/kernel/step.c b/arch/x86/kernel/step.c index 0f3c307b37b3..8e2b2552b5ee 100644 --- a/arch/x86/kernel/step.c +++ b/arch/x86/kernel/step.c @@ -180,8 +180,7 @@ void set_task_blockstep(struct task_struct *task, bool on) * * NOTE: this means that set/clear TIF_BLOCKSTEP is only safe if * task is current or it can't be running, otherwise we can race - * with __switch_to_xtra(). We rely on ptrace_freeze_traced() but - * PTRACE_KILL is not safe. + * with __switch_to_xtra(). We rely on ptrace_freeze_traced(). */ local_irq_disable(); debugctl = get_debugctlmsr(); diff --git a/arch/x86/kernel/sys_x86_64.c b/arch/x86/kernel/sys_x86_64.c index 660b78827638..8cc653ffdccd 100644 --- a/arch/x86/kernel/sys_x86_64.c +++ b/arch/x86/kernel/sys_x86_64.c @@ -68,9 +68,6 @@ static int __init control_va_addr_alignment(char *str) if (*str == 0) return 1; - if (*str == '=') - str++; - if (!strcmp(str, "32")) va_align.flags = ALIGN_VA_32; else if (!strcmp(str, "64")) @@ -80,11 +77,11 @@ static int __init control_va_addr_alignment(char *str) else if (!strcmp(str, "on")) va_align.flags = ALIGN_VA_32 | ALIGN_VA_64; else - return 0; + pr_warn("invalid option value: 'align_va_addr=%s'\n", str); return 1; } -__setup("align_va_addr", control_va_addr_alignment); +__setup("align_va_addr=", control_va_addr_alignment); SYSCALL_DEFINE6(mmap, unsigned long, addr, unsigned long, len, unsigned long, prot, unsigned long, flags, diff --git a/arch/x86/kernel/tboot.c b/arch/x86/kernel/tboot.c index f9af561c3cd4..4c1bcb6053fc 100644 --- a/arch/x86/kernel/tboot.c +++ b/arch/x86/kernel/tboot.c @@ -6,7 +6,6 @@ * Copyright (c) 2006-2009, Intel Corporation */ -#include <linux/intel-iommu.h> #include <linux/init_task.h> #include <linux/spinlock.h> #include <linux/export.h> @@ -24,7 +23,6 @@ #include <asm/processor.h> #include <asm/bootparam.h> #include <asm/pgalloc.h> -#include <asm/swiotlb.h> #include <asm/fixmap.h> #include <asm/proto.h> #include <asm/setup.h> @@ -97,7 +95,7 @@ void __init tboot_probe(void) static pgd_t *tboot_pg_dir; static struct mm_struct tboot_mm = { - .mm_rb = RB_ROOT, + .mm_mt = MTREE_INIT_EXT(mm_mt, MM_MT_FLAGS, tboot_mm.mmap_lock), .pgd = swapper_pg_dir, .mm_users = ATOMIC_INIT(2), .mm_count = ATOMIC_INIT(1), @@ -517,17 +515,3 @@ struct acpi_table_header *tboot_get_dmar_table(struct acpi_table_header *dmar_tb return dmar_tbl; } - -int tboot_force_iommu(void) -{ - if (!tboot_enabled()) - return 0; - - if (no_iommu || dmar_disabled) - pr_warn("Forcing Intel-IOMMU to enabled\n"); - - dmar_disabled = 0; - no_iommu = 0; - - return 1; -} diff --git a/arch/x86/kernel/topology.c b/arch/x86/kernel/topology.c index bd83748e2bde..8617d1ed9d31 100644 --- a/arch/x86/kernel/topology.c +++ b/arch/x86/kernel/topology.c @@ -154,11 +154,6 @@ static int __init topology_init(void) { int i; -#ifdef CONFIG_NUMA - for_each_online_node(i) - register_one_node(i); -#endif - for_each_present_cpu(i) arch_register_cpu(i); diff --git a/arch/x86/kernel/tracepoint.c b/arch/x86/kernel/tracepoint.c index fcfc077afe2d..03ae1caaa878 100644 --- a/arch/x86/kernel/tracepoint.c +++ b/arch/x86/kernel/tracepoint.c @@ -1,17 +1,11 @@ // SPDX-License-Identifier: GPL-2.0 /* - * Code for supporting irq vector tracepoints. - * * Copyright (C) 2013 Seiji Aguchi <seiji.aguchi@hds.com> - * */ #include <linux/jump_label.h> #include <linux/atomic.h> -#include <asm/hw_irq.h> -#include <asm/desc.h> #include <asm/trace/exceptions.h> -#include <asm/trace/irq_vectors.h> DEFINE_STATIC_KEY_FALSE(trace_pagefault_key); diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c index c9d566dcf89a..d3fdec706f1d 100644 --- a/arch/x86/kernel/traps.c +++ b/arch/x86/kernel/traps.c @@ -15,6 +15,7 @@ #include <linux/context_tracking.h> #include <linux/interrupt.h> #include <linux/kallsyms.h> +#include <linux/kmsan.h> #include <linux/spinlock.h> #include <linux/kprobes.h> #include <linux/uaccess.h> @@ -39,6 +40,7 @@ #include <linux/io.h> #include <linux/hardirq.h> #include <linux/atomic.h> +#include <linux/ioasid.h> #include <asm/stacktrace.h> #include <asm/processor.h> @@ -61,6 +63,8 @@ #include <asm/insn.h> #include <asm/insn-eval.h> #include <asm/vdso.h> +#include <asm/tdx.h> +#include <asm/cfi.h> #ifdef CONFIG_X86_64 #include <asm/x86_init.h> @@ -209,6 +213,81 @@ DEFINE_IDTENTRY(exc_overflow) do_error_trap(regs, 0, "overflow", X86_TRAP_OF, SIGSEGV, 0, NULL); } +#ifdef CONFIG_X86_KERNEL_IBT + +static __ro_after_init bool ibt_fatal = true; + +extern void ibt_selftest_ip(void); /* code label defined in asm below */ + +enum cp_error_code { + CP_EC = (1 << 15) - 1, + + CP_RET = 1, + CP_IRET = 2, + CP_ENDBR = 3, + CP_RSTRORSSP = 4, + CP_SETSSBSY = 5, + + CP_ENCL = 1 << 15, +}; + +DEFINE_IDTENTRY_ERRORCODE(exc_control_protection) +{ + if (!cpu_feature_enabled(X86_FEATURE_IBT)) { + pr_err("Unexpected #CP\n"); + BUG(); + } + + if (WARN_ON_ONCE(user_mode(regs) || (error_code & CP_EC) != CP_ENDBR)) + return; + + if (unlikely(regs->ip == (unsigned long)&ibt_selftest_ip)) { + regs->ax = 0; + return; + } + + pr_err("Missing ENDBR: %pS\n", (void *)instruction_pointer(regs)); + if (!ibt_fatal) { + printk(KERN_DEFAULT CUT_HERE); + __warn(__FILE__, __LINE__, (void *)regs->ip, TAINT_WARN, regs, NULL); + return; + } + BUG(); +} + +/* Must be noinline to ensure uniqueness of ibt_selftest_ip. */ +noinline bool ibt_selftest(void) +{ + unsigned long ret; + + asm (" lea ibt_selftest_ip(%%rip), %%rax\n\t" + ANNOTATE_RETPOLINE_SAFE + " jmp *%%rax\n\t" + "ibt_selftest_ip:\n\t" + UNWIND_HINT_FUNC + ANNOTATE_NOENDBR + " nop\n\t" + + : "=a" (ret) : : "memory"); + + return !ret; +} + +static int __init ibt_setup(char *str) +{ + if (!strcmp(str, "off")) + setup_clear_cpu_cap(X86_FEATURE_IBT); + + if (!strcmp(str, "warn")) + ibt_fatal = false; + + return 1; +} + +__setup("ibt=", ibt_setup); + +#endif /* CONFIG_X86_KERNEL_IBT */ + #ifdef CONFIG_X86_F00F_BUG void handle_invalid_op(struct pt_regs *regs) #else @@ -223,6 +302,12 @@ static noinstr bool handle_bug(struct pt_regs *regs) { bool handled = false; + /* + * Normally @regs are unpoisoned by irqentry_enter(), but handle_bug() + * is a rare case that uses @regs without passing them to + * irqentry_enter(). + */ + kmsan_unpoison_entry_regs(regs); if (!is_valid_bugaddr(regs->ip)) return handled; @@ -236,7 +321,8 @@ static noinstr bool handle_bug(struct pt_regs *regs) */ if (regs->flags & X86_EFLAGS_IF) raw_local_irq_enable(); - if (report_bug(regs->ip, regs) == BUG_TRAP_TYPE_WARN) { + if (report_bug(regs->ip, regs) == BUG_TRAP_TYPE_WARN || + handle_cfi_failure(regs) == BUG_TRAP_TYPE_WARN) { regs->ip += LEN_UD2; handled = true; } @@ -559,13 +645,94 @@ static bool fixup_iopl_exception(struct pt_regs *regs) return true; } +/* + * The unprivileged ENQCMD instruction generates #GPs if the + * IA32_PASID MSR has not been populated. If possible, populate + * the MSR from a PASID previously allocated to the mm. + */ +static bool try_fixup_enqcmd_gp(void) +{ +#ifdef CONFIG_IOMMU_SVA + u32 pasid; + + /* + * MSR_IA32_PASID is managed using XSAVE. Directly + * writing to the MSR is only possible when fpregs + * are valid and the fpstate is not. This is + * guaranteed when handling a userspace exception + * in *before* interrupts are re-enabled. + */ + lockdep_assert_irqs_disabled(); + + /* + * Hardware without ENQCMD will not generate + * #GPs that can be fixed up here. + */ + if (!cpu_feature_enabled(X86_FEATURE_ENQCMD)) + return false; + + pasid = current->mm->pasid; + + /* + * If the mm has not been allocated a + * PASID, the #GP can not be fixed up. + */ + if (!pasid_valid(pasid)) + return false; + + /* + * Did this thread already have its PASID activated? + * If so, the #GP must be from something else. + */ + if (current->pasid_activated) + return false; + + wrmsrl(MSR_IA32_PASID, pasid | MSR_IA32_PASID_VALID); + current->pasid_activated = 1; + + return true; +#else + return false; +#endif +} + +static bool gp_try_fixup_and_notify(struct pt_regs *regs, int trapnr, + unsigned long error_code, const char *str) +{ + if (fixup_exception(regs, trapnr, error_code, 0)) + return true; + + current->thread.error_code = error_code; + current->thread.trap_nr = trapnr; + + /* + * To be potentially processing a kprobe fault and to trust the result + * from kprobe_running(), we have to be non-preemptible. + */ + if (!preemptible() && kprobe_running() && + kprobe_fault_handler(regs, trapnr)) + return true; + + return notify_die(DIE_GPF, str, regs, error_code, trapnr, SIGSEGV) == NOTIFY_STOP; +} + +static void gp_user_force_sig_segv(struct pt_regs *regs, int trapnr, + unsigned long error_code, const char *str) +{ + current->thread.error_code = error_code; + current->thread.trap_nr = trapnr; + show_signal(current, SIGSEGV, "", str, regs, error_code); + force_sig(SIGSEGV); +} + DEFINE_IDTENTRY_ERRORCODE(exc_general_protection) { char desc[sizeof(GPFSTR) + 50 + 2*sizeof(unsigned long) + 1] = GPFSTR; enum kernel_gp_hint hint = GP_NO_HINT; - struct task_struct *tsk; unsigned long gp_addr; - int ret; + + if (user_mode(regs) && try_fixup_enqcmd_gp()) + return; cond_local_irq_enable(regs); @@ -581,40 +748,18 @@ DEFINE_IDTENTRY_ERRORCODE(exc_general_protection) return; } - tsk = current; - if (user_mode(regs)) { if (fixup_iopl_exception(regs)) goto exit; - tsk->thread.error_code = error_code; - tsk->thread.trap_nr = X86_TRAP_GP; - if (fixup_vdso_exception(regs, X86_TRAP_GP, error_code, 0)) goto exit; - show_signal(tsk, SIGSEGV, "", desc, regs, error_code); - force_sig(SIGSEGV); + gp_user_force_sig_segv(regs, X86_TRAP_GP, error_code, desc); goto exit; } - if (fixup_exception(regs, X86_TRAP_GP, error_code, 0)) - goto exit; - - tsk->thread.error_code = error_code; - tsk->thread.trap_nr = X86_TRAP_GP; - - /* - * To be potentially processing a kprobe fault and to trust the result - * from kprobe_running(), we have to be non-preemptible. - */ - if (!preemptible() && - kprobe_running() && - kprobe_fault_handler(regs, X86_TRAP_GP)) - goto exit; - - ret = notify_die(DIE_GPF, desc, regs, error_code, X86_TRAP_GP, SIGSEGV); - if (ret == NOTIFY_STOP) + if (gp_try_fixup_and_notify(regs, X86_TRAP_GP, error_code, desc)) goto exit; if (error_code) @@ -659,6 +804,7 @@ static bool do_int3(struct pt_regs *regs) return res == NOTIFY_STOP; } +NOKPROBE_SYMBOL(do_int3); static void do_int3_user(struct pt_regs *regs) { @@ -761,14 +907,10 @@ sync: } #endif -struct bad_iret_stack { - void *error_entry_ret; - struct pt_regs regs; -}; - -asmlinkage __visible noinstr -struct bad_iret_stack *fixup_bad_iret(struct bad_iret_stack *s) +asmlinkage __visible noinstr struct pt_regs *fixup_bad_iret(struct pt_regs *bad_regs) { + struct pt_regs tmp, *new_stack; + /* * This is called from entry_64.S early in handling a fault * caused by a bad iret to user mode. To handle the fault @@ -777,19 +919,18 @@ struct bad_iret_stack *fixup_bad_iret(struct bad_iret_stack *s) * just below the IRET frame) and we want to pretend that the * exception came from the IRET target. */ - struct bad_iret_stack tmp, *new_stack = - (struct bad_iret_stack *)__this_cpu_read(cpu_tss_rw.x86_tss.sp0) - 1; + new_stack = (struct pt_regs *)__this_cpu_read(cpu_tss_rw.x86_tss.sp0) - 1; /* Copy the IRET target to the temporary storage. */ - __memcpy(&tmp.regs.ip, (void *)s->regs.sp, 5*8); + __memcpy(&tmp.ip, (void *)bad_regs->sp, 5*8); /* Copy the remainder of the stack from the current stack. */ - __memcpy(&tmp, s, offsetof(struct bad_iret_stack, regs.ip)); + __memcpy(&tmp, bad_regs, offsetof(struct pt_regs, ip)); /* Update the entry stack */ __memcpy(new_stack, &tmp, sizeof(tmp)); - BUG_ON(!user_mode(&new_stack->regs)); + BUG_ON(!user_mode(new_stack)); return new_stack; } #endif @@ -1212,6 +1353,91 @@ DEFINE_IDTENTRY(exc_device_not_available) } } +#ifdef CONFIG_INTEL_TDX_GUEST + +#define VE_FAULT_STR "VE fault" + +static void ve_raise_fault(struct pt_regs *regs, long error_code) +{ + if (user_mode(regs)) { + gp_user_force_sig_segv(regs, X86_TRAP_VE, error_code, VE_FAULT_STR); + return; + } + + if (gp_try_fixup_and_notify(regs, X86_TRAP_VE, error_code, VE_FAULT_STR)) + return; + + die_addr(VE_FAULT_STR, regs, error_code, 0); +} + +/* + * Virtualization Exceptions (#VE) are delivered to TDX guests due to + * specific guest actions which may happen in either user space or the + * kernel: + * + * * Specific instructions (WBINVD, for example) + * * Specific MSR accesses + * * Specific CPUID leaf accesses + * * Access to specific guest physical addresses + * + * In the settings that Linux will run in, virtualization exceptions are + * never generated on accesses to normal, TD-private memory that has been + * accepted (by BIOS or with tdx_enc_status_changed()). + * + * Syscall entry code has a critical window where the kernel stack is not + * yet set up. Any exception in this window leads to hard to debug issues + * and can be exploited for privilege escalation. Exceptions in the NMI + * entry code also cause issues. Returning from the exception handler with + * IRET will re-enable NMIs and nested NMI will corrupt the NMI stack. + * + * For these reasons, the kernel avoids #VEs during the syscall gap and + * the NMI entry code. Entry code paths do not access TD-shared memory, + * MMIO regions, use #VE triggering MSRs, instructions, or CPUID leaves + * that might generate #VE. VMM can remove memory from TD at any point, + * but access to unaccepted (or missing) private memory leads to VM + * termination, not to #VE. + * + * Similarly to page faults and breakpoints, #VEs are allowed in NMI + * handlers once the kernel is ready to deal with nested NMIs. + * + * During #VE delivery, all interrupts, including NMIs, are blocked until + * TDGETVEINFO is called. It prevents #VE nesting until the kernel reads + * the VE info. + * + * If a guest kernel action which would normally cause a #VE occurs in + * the interrupt-disabled region before TDGETVEINFO, a #DF (fault + * exception) is delivered to the guest which will result in an oops. + * + * The entry code has been audited carefully for following these expectations. + * Changes in the entry code have to be audited for correctness vs. this + * aspect. Similarly to #PF, #VE in these places will expose kernel to + * privilege escalation or may lead to random crashes. + */ +DEFINE_IDTENTRY(exc_virtualization_exception) +{ + struct ve_info ve; + + /* + * NMIs/Machine-checks/Interrupts will be in a disabled state + * till TDGETVEINFO TDCALL is executed. This ensures that VE + * info cannot be overwritten by a nested #VE. + */ + tdx_get_ve_info(&ve); + + cond_local_irq_enable(regs); + + /* + * If tdx_handle_virt_exception() could not process + * it successfully, treat it as #GP(0) and handle it. + */ + if (!tdx_handle_virt_exception(regs, &ve)) + ve_raise_fault(regs, 0); + + cond_local_irq_disable(regs); +} + +#endif + #ifdef CONFIG_X86_32 DEFINE_IDTENTRY_SW(iret_error) { diff --git a/arch/x86/kernel/tsc.c b/arch/x86/kernel/tsc.c index a698196377be..cafacb2e58cc 100644 --- a/arch/x86/kernel/tsc.c +++ b/arch/x86/kernel/tsc.c @@ -1487,6 +1487,9 @@ static unsigned long __init get_loops_per_jiffy(void) static void __init tsc_enable_sched_clock(void) { + loops_per_jiffy = get_loops_per_jiffy(); + use_tsc_delay(); + /* Sanitize TSC ADJUST before cyc2ns gets initialized */ tsc_store_and_check_tsc_adjust(true); cyc2ns_init_boot_cpu(); @@ -1502,8 +1505,6 @@ void __init tsc_early_init(void) return; if (!determine_cpu_tsc_frequencies(true)) return; - loops_per_jiffy = get_loops_per_jiffy(); - tsc_enable_sched_clock(); } @@ -1537,7 +1538,6 @@ void __init tsc_init(void) enable_sched_clock_irqtime(); lpj_fine = get_loops_per_jiffy(); - use_tsc_delay(); check_system_tsc_reliable(); diff --git a/arch/x86/kernel/unwind_frame.c b/arch/x86/kernel/unwind_frame.c index 8e1c50c86e5d..d8ba93778ae3 100644 --- a/arch/x86/kernel/unwind_frame.c +++ b/arch/x86/kernel/unwind_frame.c @@ -183,6 +183,16 @@ static struct pt_regs *decode_frame_pointer(unsigned long *bp) } #endif +/* + * While walking the stack, KMSAN may stomp on stale locals from other + * functions that were marked as uninitialized upon function exit, and + * now hold the call frame information for the current function (e.g. the frame + * pointer). Because KMSAN does not specifically mark call frames as + * initialized, false positive reports are possible. To prevent such reports, + * we mark the functions scanning the stack (here and below) with + * __no_kmsan_checks. + */ +__no_kmsan_checks static bool update_stack_state(struct unwind_state *state, unsigned long *next_bp) { @@ -250,6 +260,7 @@ static bool update_stack_state(struct unwind_state *state, return true; } +__no_kmsan_checks bool unwind_next_frame(struct unwind_state *state) { struct pt_regs *regs; diff --git a/arch/x86/kernel/unwind_orc.c b/arch/x86/kernel/unwind_orc.c index 2de3c8c5eba9..c059820dfaea 100644 --- a/arch/x86/kernel/unwind_orc.c +++ b/arch/x86/kernel/unwind_orc.c @@ -93,22 +93,27 @@ static struct orc_entry *orc_find(unsigned long ip); static struct orc_entry *orc_ftrace_find(unsigned long ip) { struct ftrace_ops *ops; - unsigned long caller; + unsigned long tramp_addr, offset; ops = ftrace_ops_trampoline(ip); if (!ops) return NULL; + /* Set tramp_addr to the start of the code copied by the trampoline */ if (ops->flags & FTRACE_OPS_FL_SAVE_REGS) - caller = (unsigned long)ftrace_regs_call; + tramp_addr = (unsigned long)ftrace_regs_caller; else - caller = (unsigned long)ftrace_call; + tramp_addr = (unsigned long)ftrace_caller; + + /* Now place tramp_addr to the location within the trampoline ip is at */ + offset = ip - ops->trampoline; + tramp_addr += offset; /* Prevent unlikely recursion */ - if (ip == caller) + if (ip == tramp_addr) return NULL; - return orc_find(caller); + return orc_find(tramp_addr); } #else static struct orc_entry *orc_ftrace_find(unsigned long ip) @@ -339,11 +344,11 @@ static bool stack_access_ok(struct unwind_state *state, unsigned long _addr, struct stack_info *info = &state->stack_info; void *addr = (void *)_addr; - if (!on_stack(info, addr, len) && - (get_stack_info(addr, state->task, info, &state->stack_mask))) - return false; + if (on_stack(info, addr, len)) + return true; - return true; + return !get_stack_info(addr, state->task, info, &state->stack_mask) && + on_stack(info, addr, len); } static bool deref_stack_reg(struct unwind_state *state, unsigned long addr, @@ -550,15 +555,15 @@ bool unwind_next_frame(struct unwind_state *state) } /* * There is a small chance to interrupt at the entry of - * __kretprobe_trampoline() where the ORC info doesn't exist. - * That point is right after the RET to __kretprobe_trampoline() + * arch_rethook_trampoline() where the ORC info doesn't exist. + * That point is right after the RET to arch_rethook_trampoline() * which was modified return address. - * At that point, the @addr_p of the unwind_recover_kretprobe() + * At that point, the @addr_p of the unwind_recover_rethook() * (this has to point the address of the stack entry storing * the modified return address) must be "SP - (a stack entry)" * because SP is incremented by the RET. */ - state->ip = unwind_recover_kretprobe(state, state->ip, + state->ip = unwind_recover_rethook(state, state->ip, (unsigned long *)(state->sp - sizeof(long))); state->regs = (struct pt_regs *)sp; state->prev_regs = NULL; @@ -573,7 +578,7 @@ bool unwind_next_frame(struct unwind_state *state) goto err; } /* See UNWIND_HINT_TYPE_REGS case comment. */ - state->ip = unwind_recover_kretprobe(state, state->ip, + state->ip = unwind_recover_rethook(state, state->ip, (unsigned long *)(state->sp - sizeof(long))); if (state->full_regs) @@ -708,7 +713,7 @@ void __unwind_start(struct unwind_state *state, struct task_struct *task, /* Otherwise, skip ahead to the user-specified starting frame: */ while (!unwind_done(state) && (!on_stack(&state->stack_info, first_frame, sizeof(long)) || - state->sp < (unsigned long)first_frame)) + state->sp <= (unsigned long)first_frame)) unwind_next_frame(state); return; diff --git a/arch/x86/kernel/verify_cpu.S b/arch/x86/kernel/verify_cpu.S index 641f0fe1e5b4..1258a5872d12 100644 --- a/arch/x86/kernel/verify_cpu.S +++ b/arch/x86/kernel/verify_cpu.S @@ -132,9 +132,9 @@ SYM_FUNC_START_LOCAL(verify_cpu) .Lverify_cpu_no_longmode: popf # Restore caller passed flags movl $1,%eax - ret + RET .Lverify_cpu_sse_ok: popf # Restore caller passed flags xorl %eax, %eax - ret + RET SYM_FUNC_END(verify_cpu) diff --git a/arch/x86/kernel/vm86_32.c b/arch/x86/kernel/vm86_32.c index c21bcd668284..e9e803a4d44c 100644 --- a/arch/x86/kernel/vm86_32.c +++ b/arch/x86/kernel/vm86_32.c @@ -151,7 +151,7 @@ exit_vm86: memcpy(®s->pt, &vm86->regs32, sizeof(struct pt_regs)); - lazy_load_gs(vm86->regs32.gs); + loadsegment(gs, vm86->regs32.gs); regs->pt.ax = retval; return; @@ -325,7 +325,7 @@ static long do_sys_vm86(struct vm86plus_struct __user *user_vm86, bool plus) * Save old state */ vm86->saved_sp0 = tsk->thread.sp0; - lazy_save_gs(vm86->regs32.gs); + savesegment(gs, vm86->regs32.gs); /* make room for real-mode segments */ preempt_disable(); diff --git a/arch/x86/kernel/vmlinux.lds.S b/arch/x86/kernel/vmlinux.lds.S index 3d6dc12d198f..15f29053cec4 100644 --- a/arch/x86/kernel/vmlinux.lds.S +++ b/arch/x86/kernel/vmlinux.lds.S @@ -137,12 +137,11 @@ SECTIONS ALIGN_ENTRY_TEXT_END SOFTIRQENTRY_TEXT STATIC_CALL_TEXT - *(.fixup) *(.gnu.warning) #ifdef CONFIG_RETPOLINE __indirect_thunk_start = .; - *(.text.__x86.indirect_thunk) + *(.text.__x86.*) __indirect_thunk_end = .; #endif } :text =0xcccc @@ -284,6 +283,22 @@ SECTIONS *(.retpoline_sites) __retpoline_sites_end = .; } + + . = ALIGN(8); + .return_sites : AT(ADDR(.return_sites) - LOAD_OFFSET) { + __return_sites = .; + *(.return_sites) + __return_sites_end = .; + } +#endif + +#ifdef CONFIG_X86_KERNEL_IBT + . = ALIGN(8); + .ibt_endbr_seal : AT(ADDR(.ibt_endbr_seal) - LOAD_OFFSET) { + __ibt_endbr_seal = .; + *(.ibt_endbr_seal) + __ibt_endbr_seal_end = .; + } #endif /* @@ -307,18 +322,6 @@ SECTIONS *(.altinstr_replacement) } - /* - * struct iommu_table_entry entries are injected in this section. - * It is an array of IOMMUs which during run time gets sorted depending - * on its dependency order. After rootfs_initcall is complete - * this section can be safely removed. - */ - .iommu_table : AT(ADDR(.iommu_table) - LOAD_OFFSET) { - __iommu_table = .; - *(.iommu_table) - __iommu_table_end = .; - } - . = ALIGN(8); .apicdrivers : AT(ADDR(.apicdrivers) - LOAD_OFFSET) { __apicdrivers = .; @@ -392,7 +395,7 @@ SECTIONS .brk : AT(ADDR(.brk) - LOAD_OFFSET) { __brk_base = .; . += 64 * 1024; /* 64k alignment slop space */ - *(.brk_reservation) /* areas brk users have reserved */ + *(.bss..brk) /* areas brk users have reserved */ __brk_limit = .; } diff --git a/arch/x86/kernel/x86_init.c b/arch/x86/kernel/x86_init.c index 8b395821cb8d..57353519bc11 100644 --- a/arch/x86/kernel/x86_init.c +++ b/arch/x86/kernel/x86_init.c @@ -129,33 +129,33 @@ struct x86_cpuinit_ops x86_cpuinit = { static void default_nmi_init(void) { }; +static void enc_status_change_prepare_noop(unsigned long vaddr, int npages, bool enc) { } +static bool enc_status_change_finish_noop(unsigned long vaddr, int npages, bool enc) { return false; } +static bool enc_tlb_flush_required_noop(bool enc) { return false; } +static bool enc_cache_flush_required_noop(void) { return false; } + struct x86_platform_ops x86_platform __ro_after_init = { .calibrate_cpu = native_calibrate_cpu_early, .calibrate_tsc = native_calibrate_tsc, .get_wallclock = mach_get_cmos_time, - .set_wallclock = mach_set_rtc_mmss, + .set_wallclock = mach_set_cmos_time, .iommu_shutdown = iommu_shutdown_noop, .is_untracked_pat_range = is_ISA_range, .nmi_init = default_nmi_init, .get_nmi_reason = default_get_nmi_reason, - .save_sched_clock_state = tsc_save_sched_clock_state, - .restore_sched_clock_state = tsc_restore_sched_clock_state, + .save_sched_clock_state = tsc_save_sched_clock_state, + .restore_sched_clock_state = tsc_restore_sched_clock_state, .hyper.pin_vcpu = x86_op_int_noop, -}; - -EXPORT_SYMBOL_GPL(x86_platform); -#if defined(CONFIG_PCI_MSI) -struct x86_msi_ops x86_msi __ro_after_init = { - .restore_msi_irqs = default_restore_msi_irqs, + .guest = { + .enc_status_change_prepare = enc_status_change_prepare_noop, + .enc_status_change_finish = enc_status_change_finish_noop, + .enc_tlb_flush_required = enc_tlb_flush_required_noop, + .enc_cache_flush_required = enc_cache_flush_required_noop, + }, }; -/* MSI arch specific hooks */ -void arch_restore_msi_irqs(struct pci_dev *dev) -{ - x86_msi.restore_msi_irqs(dev); -} -#endif +EXPORT_SYMBOL_GPL(x86_platform); struct x86_apic_ops x86_apic_ops __ro_after_init = { .io_apic_read = native_io_apic_read, |