diff options
Diffstat (limited to 'arch/x86/kernel')
227 files changed, 26814 insertions, 12476 deletions
diff --git a/arch/x86/kernel/.gitignore b/arch/x86/kernel/.gitignore index 08f4fd731469..ef66569e7e22 100644 --- a/arch/x86/kernel/.gitignore +++ b/arch/x86/kernel/.gitignore @@ -1,3 +1,4 @@ +# SPDX-License-Identifier: GPL-2.0-only vsyscall.lds vsyscall_32.lds vmlinux.lds diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile index 9b294c13809a..f901658d9f7c 100644 --- a/arch/x86/kernel/Makefile +++ b/arch/x86/kernel/Makefile @@ -3,10 +3,6 @@ # Makefile for the linux kernel. # -extra-y := head_$(BITS).o -extra-y += head$(BITS).o -extra-y += ebda.o -extra-y += platform-quirks.o extra-y += vmlinux.lds CPPFLAGS_vmlinux.lds += -U$(UTS_MACHINE) @@ -20,6 +16,7 @@ CFLAGS_REMOVE_kvmclock.o = -pg CFLAGS_REMOVE_ftrace.o = -pg CFLAGS_REMOVE_early_printk.o = -pg CFLAGS_REMOVE_head64.o = -pg +CFLAGS_REMOVE_sev.o = -pg endif KASAN_SANITIZE_head$(BITS).o := n @@ -27,14 +24,13 @@ KASAN_SANITIZE_dumpstack.o := n KASAN_SANITIZE_dumpstack_$(BITS).o := n KASAN_SANITIZE_stacktrace.o := n KASAN_SANITIZE_paravirt.o := n +KASAN_SANITIZE_sev.o := n -OBJECT_FILES_NON_STANDARD_relocate_kernel_$(BITS).o := y -OBJECT_FILES_NON_STANDARD_test_nx.o := y -OBJECT_FILES_NON_STANDARD_paravirt_patch.o := y - -ifdef CONFIG_FRAME_POINTER -OBJECT_FILES_NON_STANDARD_ftrace_$(BITS).o := y -endif +# With some compiler versions the generated code results in boot hangs, caused +# by several compilation units. To be safe, disable all instrumentation. +KCSAN_SANITIZE := n +KMSAN_SANITIZE_head$(BITS).o := n +KMSAN_SANITIZE_nmi.o := n # If instrumentation of this dir is enabled, boot hangs during first second. 
# Probably could be more selective here, but note that files related to irqs, @@ -44,7 +40,11 @@ KCOV_INSTRUMENT := n CFLAGS_irq.o := -I $(srctree)/$(src)/../include/asm/trace -obj-y := process_$(BITS).o signal.o +obj-y += head_$(BITS).o +obj-y += head$(BITS).o +obj-y += ebda.o +obj-y += platform-quirks.o +obj-y += process_$(BITS).o signal.o obj-$(CONFIG_COMPAT) += signal_compat.o obj-y += traps.o idt.o irq.o irq_$(BITS).o dumpstack_$(BITS).o obj-y += time.o ioport.o dumpstack.o nmi.o @@ -53,6 +53,8 @@ obj-y += setup.o x86_init.o i8259.o irqinit.o obj-$(CONFIG_JUMP_LABEL) += jump_label.o obj-$(CONFIG_IRQ_WORK) += irq_work.o obj-y += probe_roms.o +obj-$(CONFIG_X86_32) += sys_ia32.o +obj-$(CONFIG_IA32_EMULATION) += sys_ia32.o obj-$(CONFIG_X86_64) += sys_x86_64.o obj-$(CONFIG_X86_ESPFIX64) += espfix_64.o obj-$(CONFIG_SYSFS) += ksysfs.o @@ -60,9 +62,9 @@ obj-y += bootflag.o e820.o obj-y += pci-dma.o quirks.o topology.o kdebugfs.o obj-y += alternative.o i8253.o hw_breakpoint.o obj-y += tsc.o tsc_msr.o io_delay.o rtc.o -obj-y += pci-iommu_table.o obj-y += resource.o obj-y += irqflags.o +obj-y += static_call.o obj-y += process.o obj-y += fpu/ @@ -72,7 +74,7 @@ obj-$(CONFIG_IA32_EMULATION) += tls.o obj-y += step.o obj-$(CONFIG_INTEL_TXT) += tboot.o obj-$(CONFIG_ISA_DMA_API) += i8237.o -obj-$(CONFIG_STACKTRACE) += stacktrace.o +obj-y += stacktrace.o obj-y += cpu/ obj-y += acpi/ obj-y += reboot.o @@ -89,11 +91,12 @@ obj-$(CONFIG_X86_MPPARSE) += mpparse.o obj-y += apic/ obj-$(CONFIG_X86_REBOOTFIXUPS) += reboot_fixups_32.o obj-$(CONFIG_DYNAMIC_FTRACE) += ftrace.o -obj-$(CONFIG_LIVEPATCH) += livepatch.o obj-$(CONFIG_FUNCTION_TRACER) += ftrace_$(BITS).o obj-$(CONFIG_FUNCTION_GRAPH_TRACER) += ftrace.o obj-$(CONFIG_FTRACE_SYSCALLS) += ftrace.o obj-$(CONFIG_X86_TSC) += trace_clock.o +obj-$(CONFIG_TRACING) += trace.o +obj-$(CONFIG_RETHOOK) += rethook.o obj-$(CONFIG_CRASH_CORE) += crash_core_$(BITS).o obj-$(CONFIG_KEXEC_CORE) += machine_kexec_$(BITS).o obj-$(CONFIG_KEXEC_CORE) += 
relocate_kernel_$(BITS).o crash.o @@ -101,21 +104,18 @@ obj-$(CONFIG_KEXEC_FILE) += kexec-bzimage64.o obj-$(CONFIG_CRASH_DUMP) += crash_dump_$(BITS).o obj-y += kprobes/ obj-$(CONFIG_MODULES) += module.o -ifeq ($(CONFIG_X86_32),y) -obj-$(CONFIG_DOUBLEFAULT) += doublefault_32.o -endif +obj-$(CONFIG_X86_32) += doublefault_32.o obj-$(CONFIG_KGDB) += kgdb.o obj-$(CONFIG_VM86) += vm86_32.o obj-$(CONFIG_EARLY_PRINTK) += early_printk.o obj-$(CONFIG_HPET_TIMER) += hpet.o -obj-$(CONFIG_APB_TIMER) += apb_timer.o obj-$(CONFIG_AMD_NB) += amd_nb.o obj-$(CONFIG_DEBUG_NMI_SELFTEST) += nmi_selftest.o obj-$(CONFIG_KVM_GUEST) += kvm.o kvmclock.o -obj-$(CONFIG_PARAVIRT) += paravirt.o paravirt_patch.o +obj-$(CONFIG_PARAVIRT) += paravirt.o obj-$(CONFIG_PARAVIRT_SPINLOCKS)+= paravirt-spinlocks.o obj-$(CONFIG_PARAVIRT_CLOCK) += pvclock.o obj-$(CONFIG_X86_PMEM_LEGACY_DEVICE) += pmem.o @@ -127,12 +127,8 @@ obj-$(CONFIG_PCSPKR_PLATFORM) += pcspeaker.o obj-$(CONFIG_X86_CHECK_BIOS_CORRUPTION) += check.o -obj-$(CONFIG_SWIOTLB) += pci-swiotlb.o obj-$(CONFIG_OF) += devicetree.o obj-$(CONFIG_UPROBES) += uprobes.o -obj-y += sysfb.o -obj-$(CONFIG_X86_SYSFB) += sysfb_simplefb.o -obj-$(CONFIG_EFI) += sysfb_efi.o obj-$(CONFIG_PERF_EVENTS) += perf_regs.o obj-$(CONFIG_TRACING) += tracepoint.o @@ -143,6 +139,10 @@ obj-$(CONFIG_UNWINDER_ORC) += unwind_orc.o obj-$(CONFIG_UNWINDER_FRAME_POINTER) += unwind_frame.o obj-$(CONFIG_UNWINDER_GUESS) += unwind_guess.o +obj-$(CONFIG_AMD_MEM_ENCRYPT) += sev.o + +obj-$(CONFIG_CFI_CLANG) += cfi.o + ### # 64 bit specific files ifeq ($(CONFIG_X86_64),y) @@ -153,7 +153,3 @@ ifeq ($(CONFIG_X86_64),y) obj-$(CONFIG_MMCONF_FAM10H) += mmconf-fam10h_64.o obj-y += vsmp_64.o endif - -ifdef CONFIG_EFI -obj-$(CONFIG_IMA) += ima_arch.o -endif diff --git a/arch/x86/kernel/acpi/Makefile b/arch/x86/kernel/acpi/Makefile index f1bb57b0e41e..fc17b3f136fe 100644 --- a/arch/x86/kernel/acpi/Makefile +++ b/arch/x86/kernel/acpi/Makefile @@ -1,10 +1,9 @@ # SPDX-License-Identifier: GPL-2.0 
-OBJECT_FILES_NON_STANDARD_wakeup_$(BITS).o := y obj-$(CONFIG_ACPI) += boot.o obj-$(CONFIG_ACPI_SLEEP) += sleep.o wakeup_$(BITS).o obj-$(CONFIG_ACPI_APEI) += apei.o -obj-$(CONFIG_ACPI_CPPC_LIB) += cppc_msr.o +obj-$(CONFIG_ACPI_CPPC_LIB) += cppc.o ifneq ($(CONFIG_ACPI_PROCESSOR),) obj-y += cstate.o diff --git a/arch/x86/kernel/acpi/apei.c b/arch/x86/kernel/acpi/apei.c index c22fb55abcfd..0916f00a992e 100644 --- a/arch/x86/kernel/acpi/apei.c +++ b/arch/x86/kernel/acpi/apei.c @@ -43,3 +43,8 @@ void arch_apei_report_mem_error(int sev, struct cper_sec_mem_err *mem_err) apei_mce_report_mem_error(sev, mem_err); #endif } + +int arch_apei_report_x86_error(struct cper_ia_proc_ctx *ctx_info, u64 lapic_id) +{ + return apei_smca_report_x86_error(ctx_info, lapic_id); +} diff --git a/arch/x86/kernel/acpi/boot.c b/arch/x86/kernel/acpi/boot.c index 04205ce127a1..907cc98b1938 100644 --- a/arch/x86/kernel/acpi/boot.c +++ b/arch/x86/kernel/acpi/boot.c @@ -5,6 +5,7 @@ * Copyright (C) 2001, 2002 Paul Diefenbaugh <paul.s.diefenbaugh@intel.com> * Copyright (C) 2001 Jun Nakajima <jun.nakajima@intel.com> */ +#define pr_fmt(fmt) "ACPI: " fmt #include <linux/init.h> #include <linux/acpi.h> @@ -20,11 +21,11 @@ #include <linux/pci.h> #include <linux/efi-bgrt.h> #include <linux/serial_core.h> +#include <linux/pgtable.h> #include <asm/e820/api.h> #include <asm/irqdomain.h> #include <asm/pci_x86.h> -#include <asm/pgtable.h> #include <asm/io_apic.h> #include <asm/apic.h> #include <asm/io.h> @@ -42,9 +43,8 @@ EXPORT_SYMBOL(acpi_disabled); # include <asm/proto.h> #endif /* X86 */ -#define PREFIX "ACPI: " - int acpi_noirq; /* skip ACPI IRQ initialization */ +static int acpi_nobgrt; /* skip ACPI BGRT */ int acpi_pci_disabled; /* skip ACPI PCI scan and IRQ initialization */ EXPORT_SYMBOL(acpi_pci_disabled); @@ -62,6 +62,14 @@ int acpi_fix_pin2_polarity __initdata; #ifdef CONFIG_X86_LOCAL_APIC static u64 acpi_lapic_addr __initdata = APIC_DEFAULT_PHYS_BASE; +static bool acpi_support_online_capable; 
+#endif + +#ifdef CONFIG_X86_64 +/* Physical address of the Multiprocessor Wakeup Structure mailbox */ +static u64 acpi_mp_wake_mailbox_paddr; +/* Virtual address of the Multiprocessor Wakeup Structure mailbox */ +static struct acpi_madt_multiproc_wakeup_mailbox *acpi_mp_wake_mailbox; #endif #ifdef CONFIG_X86_IO_APIC @@ -129,16 +137,17 @@ static int __init acpi_parse_madt(struct acpi_table_header *table) madt = (struct acpi_table_madt *)table; if (!madt) { - printk(KERN_WARNING PREFIX "Unable to map MADT\n"); + pr_warn("Unable to map MADT\n"); return -ENODEV; } if (madt->address) { acpi_lapic_addr = (u64) madt->address; - printk(KERN_DEBUG PREFIX "Local APIC address 0x%08x\n", - madt->address); + pr_debug("Local APIC address 0x%08x\n", madt->address); } + if (madt->header.revision >= 5) + acpi_support_online_capable = true; default_acpi_madt_oem_check(madt->header.oem_id, madt->header.oem_table_id); @@ -160,7 +169,7 @@ static int acpi_register_lapic(int id, u32 acpiid, u8 enabled) int cpu; if (id >= MAX_LOCAL_APIC) { - printk(KERN_INFO PREFIX "skipped apicid that is too big\n"); + pr_info("skipped apicid that is too big\n"); return -EINVAL; } @@ -212,13 +221,13 @@ acpi_parse_x2apic(union acpi_subtable_headers *header, const unsigned long end) */ if (!apic->apic_id_valid(apic_id)) { if (enabled) - pr_warn(PREFIX "x2apic entry ignored\n"); + pr_warn("x2apic entry ignored\n"); return 0; } acpi_register_lapic(apic_id, processor->uid, enabled); #else - printk(KERN_WARNING PREFIX "x2apic entry ignored\n"); + pr_warn("x2apic entry ignored\n"); #endif return 0; @@ -240,6 +249,12 @@ acpi_parse_lapic(union acpi_subtable_headers * header, const unsigned long end) if (processor->id == 0xff) return 0; + /* don't register processors that can not be onlined */ + if (acpi_support_online_capable && + !(processor->lapic_flags & ACPI_MADT_ENABLED) && + !(processor->lapic_flags & ACPI_MADT_ONLINE_CAPABLE)) + return 0; + /* * We need to register disabled CPU as well to permit * 
counting disabled CPUs. This allows us to size @@ -305,7 +320,7 @@ acpi_parse_x2apic_nmi(union acpi_subtable_headers *header, acpi_table_print_madt_entry(&header->common); if (x2apic_nmi->lint != 1) - printk(KERN_WARNING PREFIX "NMI not connected to LINT 1!\n"); + pr_warn("NMI not connected to LINT 1!\n"); return 0; } @@ -323,12 +338,65 @@ acpi_parse_lapic_nmi(union acpi_subtable_headers * header, const unsigned long e acpi_table_print_madt_entry(&header->common); if (lapic_nmi->lint != 1) - printk(KERN_WARNING PREFIX "NMI not connected to LINT 1!\n"); + pr_warn("NMI not connected to LINT 1!\n"); return 0; } -#endif /*CONFIG_X86_LOCAL_APIC */ +#ifdef CONFIG_X86_64 +static int acpi_wakeup_cpu(int apicid, unsigned long start_ip) +{ + /* + * Remap mailbox memory only for the first call to acpi_wakeup_cpu(). + * + * Wakeup of secondary CPUs is fully serialized in the core code. + * No need to protect acpi_mp_wake_mailbox from concurrent accesses. + */ + if (!acpi_mp_wake_mailbox) { + acpi_mp_wake_mailbox = memremap(acpi_mp_wake_mailbox_paddr, + sizeof(*acpi_mp_wake_mailbox), + MEMREMAP_WB); + } + + /* + * Mailbox memory is shared between the firmware and OS. Firmware will + * listen on mailbox command address, and once it receives the wakeup + * command, the CPU associated with the given apicid will be booted. + * + * The value of 'apic_id' and 'wakeup_vector' must be visible to the + * firmware before the wakeup command is visible. smp_store_release() + * ensures ordering and visibility. + */ + acpi_mp_wake_mailbox->apic_id = apicid; + acpi_mp_wake_mailbox->wakeup_vector = start_ip; + smp_store_release(&acpi_mp_wake_mailbox->command, + ACPI_MP_WAKE_COMMAND_WAKEUP); + + /* + * Wait for the CPU to wake up. + * + * The CPU being woken up is essentially in a spin loop waiting to be + * woken up. It should not take long for it wake up and acknowledge by + * zeroing out ->command. 
+ * + * ACPI specification doesn't provide any guidance on how long kernel + * has to wait for a wake up acknowledgement. It also doesn't provide + * a way to cancel a wake up request if it takes too long. + * + * In TDX environment, the VMM has control over how long it takes to + * wake up secondary. It can postpone scheduling secondary vCPU + * indefinitely. Giving up on wake up request and reporting error opens + * possible attack vector for VMM: it can wake up a secondary CPU when + * kernel doesn't expect it. Wait until positive result of the wake up + * request. + */ + while (READ_ONCE(acpi_mp_wake_mailbox->command)) + cpu_relax(); + + return 0; +} +#endif /* CONFIG_X86_64 */ +#endif /* CONFIG_X86_LOCAL_APIC */ #ifdef CONFIG_X86_IO_APIC #define MP_ISA_BUS 0 @@ -367,7 +435,7 @@ static void __init mp_override_legacy_irq(u8 bus_irq, u8 polarity, u8 trigger, isa_irq_to_gsi[bus_irq] = gsi; } -static int mp_config_acpi_gsi(struct device *dev, u32 gsi, int trigger, +static void mp_config_acpi_gsi(struct device *dev, u32 gsi, int trigger, int polarity) { #ifdef CONFIG_X86_MPPARSE @@ -379,9 +447,9 @@ static int mp_config_acpi_gsi(struct device *dev, u32 gsi, int trigger, u8 pin; if (!acpi_ioapic) - return 0; + return; if (!dev || !dev_is_pci(dev)) - return 0; + return; pdev = to_pci_dev(dev); number = pdev->bus->number; @@ -400,7 +468,6 @@ static int mp_config_acpi_gsi(struct device *dev, u32 gsi, int trigger, mp_save_irq(&mp_irq); #endif - return 0; } static int __init mp_register_ioapic_irq(u8 bus_irq, u8 polarity, @@ -513,14 +580,14 @@ acpi_parse_int_src_ovr(union acpi_subtable_headers * header, if (intsrc->source_irq == 0) { if (acpi_skip_timer_override) { - printk(PREFIX "BIOS IRQ0 override ignored.\n"); + pr_warn("BIOS IRQ0 override ignored.\n"); return 0; } if ((intsrc->global_irq == 2) && acpi_fix_pin2_polarity && (intsrc->inti_flags & ACPI_MADT_POLARITY_MASK)) { intsrc->inti_flags &= ~ACPI_MADT_POLARITY_MASK; - printk(PREFIX "BIOS IRQ0 pin2 override: forcing 
polarity to high active.\n"); + pr_warn("BIOS IRQ0 pin2 override: forcing polarity to high active.\n"); } } @@ -559,10 +626,10 @@ acpi_parse_nmi_src(union acpi_subtable_headers * header, const unsigned long end * If a PIC-mode SCI is not recognized or gives spurious IRQ7's * it may require Edge Trigger -- use "acpi_sci=edge" * - * Port 0x4d0-4d1 are ECLR1 and ECLR2, the Edge/Level Control Registers + * Port 0x4d0-4d1 are ELCR1 and ELCR2, the Edge/Level Control Registers * for the 8259 PIC. bit[n] = 1 means irq[n] is Level, otherwise Edge. - * ECLR1 is IRQs 0-7 (IRQ 0, 1, 2 must be 0) - * ECLR2 is IRQs 8-15 (IRQ 8, 13 must be 0) + * ELCR1 is IRQs 0-7 (IRQ 0, 1, 2 must be 0) + * ELCR2 is IRQs 8-15 (IRQ 8, 13 must be 0) */ void __init acpi_pic_sci_set_trigger(unsigned int irq, u16 trigger) @@ -571,7 +638,7 @@ void __init acpi_pic_sci_set_trigger(unsigned int irq, u16 trigger) unsigned int old, new; /* Real old ELCR mask */ - old = inb(0x4d0) | (inb(0x4d1) << 8); + old = inb(PIC_ELCR1) | (inb(PIC_ELCR2) << 8); /* * If we use ACPI to set PCI IRQs, then we should clear ELCR @@ -596,9 +663,9 @@ void __init acpi_pic_sci_set_trigger(unsigned int irq, u16 trigger) if (old == new) return; - printk(PREFIX "setting ELCR to %04x (from %04x)\n", new, old); - outb(new, 0x4d0); - outb(new >> 8, 0x4d1); + pr_warn("setting ELCR to %04x (from %04x)\n", new, old); + outb(new, PIC_ELCR1); + outb(new >> 8, PIC_ELCR2); } int acpi_gsi_to_irq(u32 gsi, unsigned int *irqp) @@ -753,7 +820,7 @@ int acpi_map_cpu(acpi_handle handle, phys_cpuid_t physid, u32 acpi_id, cpu = acpi_register_lapic(physid, acpi_id, ACPI_MADT_ENABLED); if (cpu < 0) { - pr_info(PREFIX "Unable to map lapic to logical cpu number\n"); + pr_info("Unable to map lapic to logical cpu number\n"); return cpu; } @@ -829,7 +896,7 @@ int acpi_unregister_ioapic(acpi_handle handle, u32 gsi_base) EXPORT_SYMBOL(acpi_unregister_ioapic); /** - * acpi_ioapic_registered - Check whether IOAPIC assoicatied with @gsi_base + * 
acpi_ioapic_registered - Check whether IOAPIC associated with @gsi_base * has been registered * @handle: ACPI handle of the IOAPIC device * @gsi_base: GSI base associated with the IOAPIC @@ -869,8 +936,7 @@ static int __init acpi_parse_hpet(struct acpi_table_header *table) struct acpi_table_hpet *hpet_tbl = (struct acpi_table_hpet *)table; if (hpet_tbl->address.space_id != ACPI_SPACE_MEM) { - printk(KERN_WARNING PREFIX "HPET timers must be located in " - "memory.\n"); + pr_warn("HPET timers must be located in memory.\n"); return -1; } @@ -882,9 +948,7 @@ static int __init acpi_parse_hpet(struct acpi_table_header *table) * want to allocate a resource there. */ if (!hpet_address) { - printk(KERN_WARNING PREFIX - "HPET id: %#x base: %#lx is invalid\n", - hpet_tbl->id, hpet_address); + pr_warn("HPET id: %#x base: %#lx is invalid\n", hpet_tbl->id, hpet_address); return 0; } #ifdef CONFIG_X86_64 @@ -895,21 +959,17 @@ static int __init acpi_parse_hpet(struct acpi_table_header *table) */ if (hpet_address == 0xfed0000000000000UL) { if (!hpet_force_user) { - printk(KERN_WARNING PREFIX "HPET id: %#x " - "base: 0xfed0000000000000 is bogus\n " - "try hpet=force on the kernel command line to " - "fix it up to 0xfed00000.\n", hpet_tbl->id); + pr_warn("HPET id: %#x base: 0xfed0000000000000 is bogus, try hpet=force on the kernel command line to fix it up to 0xfed00000.\n", + hpet_tbl->id); hpet_address = 0; return 0; } - printk(KERN_WARNING PREFIX - "HPET id: %#x base: 0xfed0000000000000 fixed up " - "to 0xfed00000.\n", hpet_tbl->id); + pr_warn("HPET id: %#x base: 0xfed0000000000000 fixed up to 0xfed00000.\n", + hpet_tbl->id); hpet_address >>= 32; } #endif - printk(KERN_INFO PREFIX "HPET id: %#x base: %#lx\n", - hpet_tbl->id, hpet_address); + pr_info("HPET id: %#x base: %#lx\n", hpet_tbl->id, hpet_address); /* * Allocate and initialize the HPET firmware resource for adding into @@ -954,24 +1014,24 @@ late_initcall(hpet_insert_resource); static int __init acpi_parse_fadt(struct 
acpi_table_header *table) { if (!(acpi_gbl_FADT.boot_flags & ACPI_FADT_LEGACY_DEVICES)) { - pr_debug("ACPI: no legacy devices present\n"); + pr_debug("no legacy devices present\n"); x86_platform.legacy.devices.pnpbios = 0; } if (acpi_gbl_FADT.header.revision >= FADT2_REVISION_ID && !(acpi_gbl_FADT.boot_flags & ACPI_FADT_8042) && x86_platform.legacy.i8042 != X86_LEGACY_I8042_PLATFORM_ABSENT) { - pr_debug("ACPI: i8042 controller is absent\n"); + pr_debug("i8042 controller is absent\n"); x86_platform.legacy.i8042 = X86_LEGACY_I8042_FIRMWARE_ABSENT; } if (acpi_gbl_FADT.boot_flags & ACPI_FADT_NO_CMOS_RTC) { - pr_debug("ACPI: not registering RTC platform device\n"); + pr_debug("not registering RTC platform device\n"); x86_platform.legacy.rtc = 0; } if (acpi_gbl_FADT.boot_flags & ACPI_FADT_NO_VGA) { - pr_debug("ACPI: probing for VGA not safe\n"); + pr_debug("probing for VGA not safe\n"); x86_platform.legacy.no_vga = 1; } @@ -996,8 +1056,7 @@ static int __init acpi_parse_fadt(struct acpi_table_header *table) pmtmr_ioport = acpi_gbl_FADT.pm_timer_block; } if (pmtmr_ioport) - printk(KERN_INFO PREFIX "PM-Timer IO Port: %#x\n", - pmtmr_ioport); + pr_info("PM-Timer IO Port: %#x\n", pmtmr_ioport); #endif return 0; } @@ -1023,8 +1082,7 @@ static int __init early_acpi_parse_madt_lapic_addr_ovr(void) count = acpi_table_parse_madt(ACPI_MADT_TYPE_LOCAL_APIC_OVERRIDE, acpi_parse_lapic_addr_ovr, 0); if (count < 0) { - printk(KERN_ERR PREFIX - "Error parsing LAPIC address override entry\n"); + pr_err("Error parsing LAPIC address override entry\n"); return count; } @@ -1056,8 +1114,7 @@ static int __init acpi_parse_madt_lapic_entries(void) sizeof(struct acpi_table_madt), madt_proc, ARRAY_SIZE(madt_proc), MAX_LOCAL_APIC); if (ret < 0) { - printk(KERN_ERR PREFIX - "Error parsing LAPIC/X2APIC entries\n"); + pr_err("Error parsing LAPIC/X2APIC entries\n"); return ret; } @@ -1065,11 +1122,11 @@ static int __init acpi_parse_madt_lapic_entries(void) x2count = madt_proc[1].count; } if (!count && 
!x2count) { - printk(KERN_ERR PREFIX "No LAPIC entries present\n"); + pr_err("No LAPIC entries present\n"); /* TBD: Cleanup to allow fallback to MPS */ return -ENODEV; } else if (count < 0 || x2count < 0) { - printk(KERN_ERR PREFIX "Error parsing LAPIC entry\n"); + pr_err("Error parsing LAPIC entry\n"); /* TBD: Cleanup to allow fallback to MPS */ return count; } @@ -1079,12 +1136,35 @@ static int __init acpi_parse_madt_lapic_entries(void) count = acpi_table_parse_madt(ACPI_MADT_TYPE_LOCAL_APIC_NMI, acpi_parse_lapic_nmi, 0); if (count < 0 || x2count < 0) { - printk(KERN_ERR PREFIX "Error parsing LAPIC NMI entry\n"); + pr_err("Error parsing LAPIC NMI entry\n"); /* TBD: Cleanup to allow fallback to MPS */ return count; } return 0; } + +#ifdef CONFIG_X86_64 +static int __init acpi_parse_mp_wake(union acpi_subtable_headers *header, + const unsigned long end) +{ + struct acpi_madt_multiproc_wakeup *mp_wake; + + if (!IS_ENABLED(CONFIG_SMP)) + return -ENODEV; + + mp_wake = (struct acpi_madt_multiproc_wakeup *)header; + if (BAD_MADT_ENTRY(mp_wake, end)) + return -EINVAL; + + acpi_table_print_madt_entry(&header->common); + + acpi_mp_wake_mailbox_paddr = mp_wake->base_address; + + acpi_wake_cpu_handler_update(acpi_wakeup_cpu); + + return 0; +} +#endif /* CONFIG_X86_64 */ #endif /* CONFIG_X86_LOCAL_APIC */ #ifdef CONFIG_X86_IO_APIC @@ -1138,7 +1218,7 @@ static void __init mp_config_acpi_legacy_irqs(void) } if (idx != mp_irq_entries) { - printk(KERN_DEBUG "ACPI: IRQ%d used by override.\n", i); + pr_debug("ACPI: IRQ%d used by override.\n", i); continue; /* IRQ already used */ } @@ -1178,26 +1258,24 @@ static int __init acpi_parse_madt_ioapic_entries(void) * if "noapic" boot option, don't look for IO-APICs */ if (skip_ioapic_setup) { - printk(KERN_INFO PREFIX "Skipping IOAPIC probe " - "due to 'noapic' option.\n"); + pr_info("Skipping IOAPIC probe due to 'noapic' option.\n"); return -ENODEV; } count = acpi_table_parse_madt(ACPI_MADT_TYPE_IO_APIC, acpi_parse_ioapic, MAX_IO_APICS); 
if (!count) { - printk(KERN_ERR PREFIX "No IOAPIC entries present\n"); + pr_err("No IOAPIC entries present\n"); return -ENODEV; } else if (count < 0) { - printk(KERN_ERR PREFIX "Error parsing IOAPIC entry\n"); + pr_err("Error parsing IOAPIC entry\n"); return count; } count = acpi_table_parse_madt(ACPI_MADT_TYPE_INTERRUPT_OVERRIDE, acpi_parse_int_src_ovr, nr_irqs); if (count < 0) { - printk(KERN_ERR PREFIX - "Error parsing interrupt source overrides entry\n"); + pr_err("Error parsing interrupt source overrides entry\n"); /* TBD: Cleanup to allow fallback to MPS */ return count; } @@ -1217,7 +1295,7 @@ static int __init acpi_parse_madt_ioapic_entries(void) count = acpi_table_parse_madt(ACPI_MADT_TYPE_NMI_SOURCE, acpi_parse_nmi_src, nr_irqs); if (count < 0) { - printk(KERN_ERR PREFIX "Error parsing NMI SRC entry\n"); + pr_err("Error parsing NMI SRC entry\n"); /* TBD: Cleanup to allow fallback to MPS */ return count; } @@ -1250,8 +1328,7 @@ static void __init early_acpi_process_madt(void) /* * Dell Precision Workstation 410, 610 come here. */ - printk(KERN_ERR PREFIX - "Invalid BIOS MADT, disabling ACPI\n"); + pr_err("Invalid BIOS MADT, disabling ACPI\n"); disable_acpi(); } } @@ -1283,13 +1360,20 @@ static void __init acpi_process_madt(void) smp_found_config = 1; } + +#ifdef CONFIG_X86_64 + /* + * Parse MADT MP Wake entry. + */ + acpi_table_parse_madt(ACPI_MADT_TYPE_MULTIPROC_WAKEUP, + acpi_parse_mp_wake, 1); +#endif } if (error == -EINVAL) { /* * Dell Precision Workstation 410, 610 come here. */ - printk(KERN_ERR PREFIX - "Invalid BIOS MADT, disabling ACPI\n"); + pr_err("Invalid BIOS MADT, disabling ACPI\n"); disable_acpi(); } } else { @@ -1299,8 +1383,7 @@ static void __init acpi_process_madt(void) * Boot with "acpi=off" to use MPS on such a system. 
*/ if (smp_found_config) { - printk(KERN_WARNING PREFIX - "No APIC-table, disabling MPS\n"); + pr_warn("No APIC-table, disabling MPS\n"); smp_found_config = 0; } } @@ -1310,11 +1393,9 @@ static void __init acpi_process_madt(void) * processors, where MPS only supports physical. */ if (acpi_lapic && acpi_ioapic) - printk(KERN_INFO "Using ACPI (MADT) for SMP configuration " - "information\n"); + pr_info("Using ACPI (MADT) for SMP configuration information\n"); else if (acpi_lapic) - printk(KERN_INFO "Using ACPI for processor (LAPIC) " - "configuration information\n"); + pr_info("Using ACPI for processor (LAPIC) configuration information\n"); #endif return; } @@ -1322,8 +1403,7 @@ static void __init acpi_process_madt(void) static int __init disable_acpi_irq(const struct dmi_system_id *d) { if (!acpi_force) { - printk(KERN_NOTICE "%s detected: force use of acpi=noirq\n", - d->ident); + pr_notice("%s detected: force use of acpi=noirq\n", d->ident); acpi_noirq_set(); } return 0; @@ -1332,21 +1412,30 @@ static int __init disable_acpi_irq(const struct dmi_system_id *d) static int __init disable_acpi_pci(const struct dmi_system_id *d) { if (!acpi_force) { - printk(KERN_NOTICE "%s detected: force use of pci=noacpi\n", - d->ident); + pr_notice("%s detected: force use of pci=noacpi\n", d->ident); acpi_disable_pci(); } return 0; } +static int __init disable_acpi_xsdt(const struct dmi_system_id *d) +{ + if (!acpi_force) { + pr_notice("%s detected: force use of acpi=rsdt\n", d->ident); + acpi_gbl_do_not_use_xsdt = TRUE; + } else { + pr_notice("Warning: DMI blacklist says broken, but acpi XSDT forced\n"); + } + return 0; +} + static int __init dmi_disable_acpi(const struct dmi_system_id *d) { if (!acpi_force) { - printk(KERN_NOTICE "%s detected: acpi off\n", d->ident); + pr_notice("%s detected: acpi off\n", d->ident); disable_acpi(); } else { - printk(KERN_NOTICE - "Warning: DMI blacklist says broken, but acpi forced\n"); + pr_notice("Warning: DMI blacklist says broken, but acpi 
forced\n"); } return 0; } @@ -1463,6 +1552,19 @@ static const struct dmi_system_id acpi_dmi_table[] __initconst = { DMI_MATCH(DMI_PRODUCT_NAME, "TravelMate 360"), }, }, + /* + * Boxes that need ACPI XSDT use disabled due to corrupted tables + */ + { + .callback = disable_acpi_xsdt, + .ident = "Advantech DAC-BJ01", + .matches = { + DMI_MATCH(DMI_SYS_VENDOR, "NEC"), + DMI_MATCH(DMI_PRODUCT_NAME, "Bearlake CRB Board"), + DMI_MATCH(DMI_BIOS_VERSION, "V1.12"), + DMI_MATCH(DMI_BIOS_DATE, "02/01/2011"), + }, + }, {} }; @@ -1553,10 +1655,18 @@ void __init acpi_boot_table_init(void) /* * Initialize the ACPI boot-time table parser. */ - if (acpi_table_init()) { + if (acpi_locate_initial_tables()) disable_acpi(); - return; - } + else + acpi_reserve_initial_tables(); +} + +int __init early_acpi_boot_init(void) +{ + if (acpi_disabled) + return 1; + + acpi_table_init_complete(); acpi_table_parse(ACPI_SIG_BOOT, acpi_parse_sbf); @@ -1565,22 +1675,13 @@ void __init acpi_boot_table_init(void) */ if (acpi_blacklisted()) { if (acpi_force) { - printk(KERN_WARNING PREFIX "acpi=force override\n"); + pr_warn("acpi=force override\n"); } else { - printk(KERN_WARNING PREFIX "Disabling ACPI support\n"); + pr_warn("Disabling ACPI support\n"); disable_acpi(); - return; + return 1; } } -} - -int __init early_acpi_boot_init(void) -{ - /* - * If acpi_disabled, bail out - */ - if (acpi_disabled) - return 1; /* * Process the Multiple APIC Description Table (MADT), if present @@ -1619,7 +1720,7 @@ int __init acpi_boot_init(void) acpi_process_madt(); acpi_table_parse(ACPI_SIG_HPET, acpi_parse_hpet); - if (IS_ENABLED(CONFIG_ACPI_BGRT)) + if (IS_ENABLED(CONFIG_ACPI_BGRT) && !acpi_nobgrt) acpi_table_parse(ACPI_SIG_BGRT, acpi_parse_bgrt); if (!acpi_noirq) @@ -1656,7 +1757,7 @@ static int __init parse_acpi(char *arg) else if (strcmp(arg, "noirq") == 0) { acpi_noirq_set(); } - /* "acpi=copy_dsdt" copys DSDT */ + /* "acpi=copy_dsdt" copies DSDT */ else if (strcmp(arg, "copy_dsdt") == 0) { 
acpi_gbl_copy_dsdt_locally = 1; } @@ -1671,6 +1772,13 @@ static int __init parse_acpi(char *arg) } early_param("acpi", parse_acpi); +static int __init parse_acpi_bgrt(char *arg) +{ + acpi_nobgrt = true; + return 0; +} +early_param("bgrt_disable", parse_acpi_bgrt); + /* FIXME: Using pci= for an ACPI parameter is a travesty. */ static int __init parse_pci(char *arg) { @@ -1685,9 +1793,7 @@ int __init acpi_mps_check(void) #if defined(CONFIG_X86_LOCAL_APIC) && !defined(CONFIG_X86_MPPARSE) /* mptable code is not built-in*/ if (acpi_disabled || acpi_noirq) { - printk(KERN_WARNING "MPS support code is not built-in.\n" - "Using acpi=off or acpi=noirq or pci=noacpi " - "may have problem\n"); + pr_warn("MPS support code is not built-in, using acpi=off or acpi=noirq or pci=noacpi may have problem\n"); return 1; } #endif @@ -1740,7 +1846,7 @@ int __acpi_acquire_global_lock(unsigned int *lock) new = (((old & ~0x3) + 2) + ((old >> 1) & 0x1)); val = cmpxchg(lock, old, new); } while (unlikely (val != old)); - return (new < 3) ? -1 : 0; + return ((new & 0x3) < 3) ? -1 : 0; } int __acpi_release_global_lock(unsigned int *lock) @@ -1756,7 +1862,7 @@ int __acpi_release_global_lock(unsigned int *lock) void __init arch_reserve_mem_area(acpi_physical_address addr, size_t size) { - e820__range_add(addr, size, E820_TYPE_ACPI); + e820__range_add(addr, size, E820_TYPE_NVS); e820__update_table_print(); } diff --git a/arch/x86/kernel/acpi/cppc.c b/arch/x86/kernel/acpi/cppc.c new file mode 100644 index 000000000000..8d8752b44f11 --- /dev/null +++ b/arch/x86/kernel/acpi/cppc.c @@ -0,0 +1,118 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * cppc.c: CPPC Interface for x86 + * Copyright (c) 2016, Intel Corporation. 
+ */ + +#include <acpi/cppc_acpi.h> +#include <asm/msr.h> +#include <asm/processor.h> +#include <asm/topology.h> + +/* Refer to drivers/acpi/cppc_acpi.c for the description of functions */ + +bool cpc_supported_by_cpu(void) +{ + switch (boot_cpu_data.x86_vendor) { + case X86_VENDOR_AMD: + case X86_VENDOR_HYGON: + if (boot_cpu_data.x86 == 0x19 && ((boot_cpu_data.x86_model <= 0x0f) || + (boot_cpu_data.x86_model >= 0x20 && boot_cpu_data.x86_model <= 0x2f))) + return true; + else if (boot_cpu_data.x86 == 0x17 && + boot_cpu_data.x86_model >= 0x70 && boot_cpu_data.x86_model <= 0x7f) + return true; + return boot_cpu_has(X86_FEATURE_CPPC); + } + return false; +} + +bool cpc_ffh_supported(void) +{ + return true; +} + +int cpc_read_ffh(int cpunum, struct cpc_reg *reg, u64 *val) +{ + int err; + + err = rdmsrl_safe_on_cpu(cpunum, reg->address, val); + if (!err) { + u64 mask = GENMASK_ULL(reg->bit_offset + reg->bit_width - 1, + reg->bit_offset); + + *val &= mask; + *val >>= reg->bit_offset; + } + return err; +} + +int cpc_write_ffh(int cpunum, struct cpc_reg *reg, u64 val) +{ + u64 rd_val; + int err; + + err = rdmsrl_safe_on_cpu(cpunum, reg->address, &rd_val); + if (!err) { + u64 mask = GENMASK_ULL(reg->bit_offset + reg->bit_width - 1, + reg->bit_offset); + + val <<= reg->bit_offset; + val &= mask; + rd_val &= ~mask; + rd_val |= val; + err = wrmsrl_safe_on_cpu(cpunum, reg->address, rd_val); + } + return err; +} + +static void amd_set_max_freq_ratio(void) +{ + struct cppc_perf_caps perf_caps; + u64 highest_perf, nominal_perf; + u64 perf_ratio; + int rc; + + rc = cppc_get_perf_caps(0, &perf_caps); + if (rc) { + pr_debug("Could not retrieve perf counters (%d)\n", rc); + return; + } + + highest_perf = amd_get_highest_perf(); + nominal_perf = perf_caps.nominal_perf; + + if (!highest_perf || !nominal_perf) { + pr_debug("Could not retrieve highest or nominal performance\n"); + return; + } + + perf_ratio = div_u64(highest_perf * SCHED_CAPACITY_SCALE, nominal_perf); + /* midpoint 
between max_boost and max_P */ + perf_ratio = (perf_ratio + SCHED_CAPACITY_SCALE) >> 1; + if (!perf_ratio) { + pr_debug("Non-zero highest/nominal perf values led to a 0 ratio\n"); + return; + } + + freq_invariance_set_perf_ratio(perf_ratio, false); +} + +static DEFINE_MUTEX(freq_invariance_lock); + +void init_freq_invariance_cppc(void) +{ + static bool init_done; + + if (!cpu_feature_enabled(X86_FEATURE_APERFMPERF)) + return; + + if (boot_cpu_data.x86_vendor != X86_VENDOR_AMD) + return; + + mutex_lock(&freq_invariance_lock); + if (!init_done) + amd_set_max_freq_ratio(); + init_done = true; + mutex_unlock(&freq_invariance_lock); +} diff --git a/arch/x86/kernel/acpi/cppc_msr.c b/arch/x86/kernel/acpi/cppc_msr.c deleted file mode 100644 index b961de569e7e..000000000000 --- a/arch/x86/kernel/acpi/cppc_msr.c +++ /dev/null @@ -1,49 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0-only -/* - * cppc_msr.c: MSR Interface for CPPC - * Copyright (c) 2016, Intel Corporation. - */ - -#include <acpi/cppc_acpi.h> -#include <asm/msr.h> - -/* Refer to drivers/acpi/cppc_acpi.c for the description of functions */ - -bool cpc_ffh_supported(void) -{ - return true; -} - -int cpc_read_ffh(int cpunum, struct cpc_reg *reg, u64 *val) -{ - int err; - - err = rdmsrl_safe_on_cpu(cpunum, reg->address, val); - if (!err) { - u64 mask = GENMASK_ULL(reg->bit_offset + reg->bit_width - 1, - reg->bit_offset); - - *val &= mask; - *val >>= reg->bit_offset; - } - return err; -} - -int cpc_write_ffh(int cpunum, struct cpc_reg *reg, u64 val) -{ - u64 rd_val; - int err; - - err = rdmsrl_safe_on_cpu(cpunum, reg->address, &rd_val); - if (!err) { - u64 mask = GENMASK_ULL(reg->bit_offset + reg->bit_width - 1, - reg->bit_offset); - - val <<= reg->bit_offset; - val &= mask; - rd_val &= ~mask; - rd_val |= val; - err = wrmsrl_safe_on_cpu(cpunum, reg->address, rd_val); - } - return err; -} diff --git a/arch/x86/kernel/acpi/cstate.c b/arch/x86/kernel/acpi/cstate.c index caf2edccbad2..7945eae5b315 100644 --- 
a/arch/x86/kernel/acpi/cstate.c +++ b/arch/x86/kernel/acpi/cstate.c @@ -79,6 +79,21 @@ void acpi_processor_power_init_bm_check(struct acpi_processor_flags *flags, */ flags->bm_control = 0; } + if (c->x86_vendor == X86_VENDOR_AMD && c->x86 >= 0x17) { + /* + * For all AMD Zen or newer CPUs that support C3, caches + * should not be flushed by software while entering C3 + * type state. Set bm->check to 1 so that kernel doesn't + * need to execute cache flush operation. + */ + flags->bm_check = 1; + /* + * In current AMD C state implementation ARB_DIS is no longer + * used. So set bm_control to zero to indicate ARB_DIS is not + * required while entering C3 type state. + */ + flags->bm_control = 0; + } } EXPORT_SYMBOL(acpi_processor_power_init_bm_check); @@ -161,7 +176,8 @@ int acpi_processor_ffh_cstate_probe(unsigned int cpu, /* Make sure we are running on right CPU */ - retval = work_on_cpu(cpu, acpi_processor_ffh_cstate_probe_cpu, cx); + retval = call_on_cpu(cpu, acpi_processor_ffh_cstate_probe_cpu, cx, + false); if (retval == 0) { /* Use the hint in CST */ percpu_entry->states[cx->index].eax = cx->address; @@ -196,7 +212,8 @@ static int __init ffh_cstate_init(void) struct cpuinfo_x86 *c = &boot_cpu_data; if (c->x86_vendor != X86_VENDOR_INTEL && - c->x86_vendor != X86_VENDOR_AMD) + c->x86_vendor != X86_VENDOR_AMD && + c->x86_vendor != X86_VENDOR_HYGON) return -1; cpu_cstate_entry = alloc_percpu(struct cstate_entry); diff --git a/arch/x86/kernel/acpi/sleep.c b/arch/x86/kernel/acpi/sleep.c index 26b7256f590f..3b7f4cdbf2e0 100644 --- a/arch/x86/kernel/acpi/sleep.c +++ b/arch/x86/kernel/acpi/sleep.c @@ -10,11 +10,12 @@ #include <linux/memblock.h> #include <linux/dmi.h> #include <linux/cpumask.h> +#include <linux/pgtable.h> #include <asm/segment.h> #include <asm/desc.h> -#include <asm/pgtable.h> #include <asm/cacheflush.h> #include <asm/realmode.h> +#include <asm/hypervisor.h> #include <linux/ftrace.h> #include "../../realmode/rm/wakeup.h" @@ -41,9 +42,9 @@ unsigned long 
acpi_get_wakeup_address(void) * x86_acpi_enter_sleep_state - enter sleep state * @state: Sleep state to enter. * - * Wrapper around acpi_enter_sleep_state() to be called by assmebly. + * Wrapper around acpi_enter_sleep_state() to be called by assembly. */ -acpi_status asmlinkage __visible x86_acpi_enter_sleep_state(u8 state) +asmlinkage acpi_status __visible x86_acpi_enter_sleep_state(u8 state) { return acpi_enter_sleep_state(state); } @@ -139,8 +140,10 @@ static int __init acpi_sleep_setup(char *str) if (strncmp(str, "s3_beep", 7) == 0) acpi_realmode_flags |= 4; #ifdef CONFIG_HIBERNATION + if (strncmp(str, "s4_hwsig", 8) == 0) + acpi_check_s4_hw_signature = 1; if (strncmp(str, "s4_nohwsig", 10) == 0) - acpi_no_s4_hw_signature(); + acpi_check_s4_hw_signature = 0; #endif if (strncmp(str, "nonvs", 5) == 0) acpi_nvs_nosave(); @@ -158,3 +161,21 @@ static int __init acpi_sleep_setup(char *str) } __setup("acpi_sleep=", acpi_sleep_setup); + +#if defined(CONFIG_HIBERNATION) && defined(CONFIG_HYPERVISOR_GUEST) +static int __init init_s4_sigcheck(void) +{ + /* + * If running on a hypervisor, honour the ACPI specification + * by default and trigger a clean reboot when the hardware + * signature in FACS is changed after hibernation. 
+ */ + if (acpi_check_s4_hw_signature == -1 && + !hypervisor_is_type(X86_HYPER_NATIVE)) + acpi_check_s4_hw_signature = 1; + + return 0; +} +/* This must happen before acpi_init() which is a subsys initcall */ +arch_initcall(init_s4_sigcheck); +#endif diff --git a/arch/x86/kernel/acpi/sleep.h b/arch/x86/kernel/acpi/sleep.h index d06c2079b6c1..171a40c74db6 100644 --- a/arch/x86/kernel/acpi/sleep.h +++ b/arch/x86/kernel/acpi/sleep.h @@ -19,4 +19,4 @@ extern void do_suspend_lowlevel(void); extern int x86_acpi_suspend_lowlevel(void); -acpi_status asmlinkage x86_acpi_enter_sleep_state(u8 state); +asmlinkage acpi_status x86_acpi_enter_sleep_state(u8 state); diff --git a/arch/x86/kernel/acpi/wakeup_32.S b/arch/x86/kernel/acpi/wakeup_32.S index daf88f8143c5..cf69081073b5 100644 --- a/arch/x86/kernel/acpi/wakeup_32.S +++ b/arch/x86/kernel/acpi/wakeup_32.S @@ -60,7 +60,7 @@ save_registers: popl saved_context_eflags movl $ret_point, saved_eip - ret + RET restore_registers: @@ -70,7 +70,7 @@ restore_registers: movl saved_context_edi, %edi pushl saved_context_eflags popfl - ret + RET SYM_CODE_START(do_suspend_lowlevel) call save_processor_state @@ -86,7 +86,7 @@ SYM_CODE_START(do_suspend_lowlevel) ret_point: call restore_registers call restore_processor_state - ret + RET SYM_CODE_END(do_suspend_lowlevel) .data diff --git a/arch/x86/kernel/acpi/wakeup_64.S b/arch/x86/kernel/acpi/wakeup_64.S index c8daa92f38dc..d5d8a352eafa 100644 --- a/arch/x86/kernel/acpi/wakeup_64.S +++ b/arch/x86/kernel/acpi/wakeup_64.S @@ -1,12 +1,14 @@ /* SPDX-License-Identifier: GPL-2.0-only */ .text #include <linux/linkage.h> +#include <linux/objtool.h> #include <asm/segment.h> #include <asm/pgtable_types.h> #include <asm/page_types.h> #include <asm/msr.h> #include <asm/asm-offsets.h> #include <asm/frame.h> +#include <asm/nospec-branch.h> # Copyright 2003 Pavel Machek <pavel@suse.cz @@ -39,6 +41,7 @@ SYM_FUNC_START(wakeup_long64) movq saved_rbp, %rbp movq saved_rip, %rax + ANNOTATE_RETPOLINE_SAFE jmp *%rax 
SYM_FUNC_END(wakeup_long64) @@ -112,7 +115,7 @@ SYM_FUNC_START(do_suspend_lowlevel) movq pt_regs_r14(%rax), %r14 movq pt_regs_r15(%rax), %r15 -#ifdef CONFIG_KASAN +#if defined(CONFIG_KASAN) && defined(CONFIG_KASAN_STACK) /* * The suspend path may have poisoned some areas deeper in the stack, * which we now need to unpoison. @@ -126,6 +129,7 @@ SYM_FUNC_START(do_suspend_lowlevel) FRAME_END jmp restore_processor_state SYM_FUNC_END(do_suspend_lowlevel) +STACK_FRAME_NON_STANDARD do_suspend_lowlevel .data saved_rbp: .quad 0 diff --git a/arch/x86/kernel/alternative.c b/arch/x86/kernel/alternative.c index 15ac0d5f4b40..5cadcea035e0 100644 --- a/arch/x86/kernel/alternative.c +++ b/arch/x86/kernel/alternative.c @@ -3,9 +3,11 @@ #include <linux/module.h> #include <linux/sched.h> +#include <linux/perf_event.h> #include <linux/mutex.h> #include <linux/list.h> #include <linux/stringify.h> +#include <linux/highmem.h> #include <linux/mm.h> #include <linux/vmalloc.h> #include <linux/memory.h> @@ -15,10 +17,10 @@ #include <linux/kprobes.h> #include <linux/mmu_context.h> #include <linux/bsearch.h> +#include <linux/sync_core.h> #include <asm/text-patching.h> #include <asm/alternative.h> #include <asm/sections.h> -#include <asm/pgtable.h> #include <asm/mce.h> #include <asm/nmi.h> #include <asm/cacheflush.h> @@ -26,6 +28,8 @@ #include <asm/insn.h> #include <asm/io.h> #include <asm/fixmap.h> +#include <asm/paravirt.h> +#include <asm/asm-prototypes.h> int __read_mostly alternatives_patched; @@ -54,7 +58,7 @@ __setup("noreplace-smp", setup_noreplace_smp); #define DPRINTK(fmt, args...) \ do { \ if (debug_alternative) \ - printk(KERN_DEBUG "%s: " fmt "\n", __func__, ##args); \ + printk(KERN_DEBUG pr_fmt(fmt) "\n", ##args); \ } while (0) #define DUMP_BYTES(buf, len, fmt, args...) 
\ @@ -65,193 +69,37 @@ do { \ if (!(len)) \ break; \ \ - printk(KERN_DEBUG fmt, ##args); \ + printk(KERN_DEBUG pr_fmt(fmt), ##args); \ for (j = 0; j < (len) - 1; j++) \ printk(KERN_CONT "%02hhx ", buf[j]); \ printk(KERN_CONT "%02hhx\n", buf[j]); \ } \ } while (0) -/* - * Each GENERIC_NOPX is of X bytes, and defined as an array of bytes - * that correspond to that nop. Getting from one nop to the next, we - * add to the array the offset that is equal to the sum of all sizes of - * nops preceding the one we are after. - * - * Note: The GENERIC_NOP5_ATOMIC is at the end, as it breaks the - * nice symmetry of sizes of the previous nops. - */ -#if defined(GENERIC_NOP1) && !defined(CONFIG_X86_64) -static const unsigned char intelnops[] = +static const unsigned char x86nops[] = { - GENERIC_NOP1, - GENERIC_NOP2, - GENERIC_NOP3, - GENERIC_NOP4, - GENERIC_NOP5, - GENERIC_NOP6, - GENERIC_NOP7, - GENERIC_NOP8, - GENERIC_NOP5_ATOMIC -}; -static const unsigned char * const intel_nops[ASM_NOP_MAX+2] = -{ - NULL, - intelnops, - intelnops + 1, - intelnops + 1 + 2, - intelnops + 1 + 2 + 3, - intelnops + 1 + 2 + 3 + 4, - intelnops + 1 + 2 + 3 + 4 + 5, - intelnops + 1 + 2 + 3 + 4 + 5 + 6, - intelnops + 1 + 2 + 3 + 4 + 5 + 6 + 7, - intelnops + 1 + 2 + 3 + 4 + 5 + 6 + 7 + 8, + BYTES_NOP1, + BYTES_NOP2, + BYTES_NOP3, + BYTES_NOP4, + BYTES_NOP5, + BYTES_NOP6, + BYTES_NOP7, + BYTES_NOP8, }; -#endif -#ifdef K8_NOP1 -static const unsigned char k8nops[] = -{ - K8_NOP1, - K8_NOP2, - K8_NOP3, - K8_NOP4, - K8_NOP5, - K8_NOP6, - K8_NOP7, - K8_NOP8, - K8_NOP5_ATOMIC -}; -static const unsigned char * const k8_nops[ASM_NOP_MAX+2] = +const unsigned char * const x86_nops[ASM_NOP_MAX+1] = { NULL, - k8nops, - k8nops + 1, - k8nops + 1 + 2, - k8nops + 1 + 2 + 3, - k8nops + 1 + 2 + 3 + 4, - k8nops + 1 + 2 + 3 + 4 + 5, - k8nops + 1 + 2 + 3 + 4 + 5 + 6, - k8nops + 1 + 2 + 3 + 4 + 5 + 6 + 7, - k8nops + 1 + 2 + 3 + 4 + 5 + 6 + 7 + 8, + x86nops, + x86nops + 1, + x86nops + 1 + 2, + x86nops + 1 + 2 + 3, + x86nops 
+ 1 + 2 + 3 + 4, + x86nops + 1 + 2 + 3 + 4 + 5, + x86nops + 1 + 2 + 3 + 4 + 5 + 6, + x86nops + 1 + 2 + 3 + 4 + 5 + 6 + 7, }; -#endif - -#if defined(K7_NOP1) && !defined(CONFIG_X86_64) -static const unsigned char k7nops[] = -{ - K7_NOP1, - K7_NOP2, - K7_NOP3, - K7_NOP4, - K7_NOP5, - K7_NOP6, - K7_NOP7, - K7_NOP8, - K7_NOP5_ATOMIC -}; -static const unsigned char * const k7_nops[ASM_NOP_MAX+2] = -{ - NULL, - k7nops, - k7nops + 1, - k7nops + 1 + 2, - k7nops + 1 + 2 + 3, - k7nops + 1 + 2 + 3 + 4, - k7nops + 1 + 2 + 3 + 4 + 5, - k7nops + 1 + 2 + 3 + 4 + 5 + 6, - k7nops + 1 + 2 + 3 + 4 + 5 + 6 + 7, - k7nops + 1 + 2 + 3 + 4 + 5 + 6 + 7 + 8, -}; -#endif - -#ifdef P6_NOP1 -static const unsigned char p6nops[] = -{ - P6_NOP1, - P6_NOP2, - P6_NOP3, - P6_NOP4, - P6_NOP5, - P6_NOP6, - P6_NOP7, - P6_NOP8, - P6_NOP5_ATOMIC -}; -static const unsigned char * const p6_nops[ASM_NOP_MAX+2] = -{ - NULL, - p6nops, - p6nops + 1, - p6nops + 1 + 2, - p6nops + 1 + 2 + 3, - p6nops + 1 + 2 + 3 + 4, - p6nops + 1 + 2 + 3 + 4 + 5, - p6nops + 1 + 2 + 3 + 4 + 5 + 6, - p6nops + 1 + 2 + 3 + 4 + 5 + 6 + 7, - p6nops + 1 + 2 + 3 + 4 + 5 + 6 + 7 + 8, -}; -#endif - -/* Initialize these to a safe default */ -#ifdef CONFIG_X86_64 -const unsigned char * const *ideal_nops = p6_nops; -#else -const unsigned char * const *ideal_nops = intel_nops; -#endif - -void __init arch_init_ideal_nops(void) -{ - switch (boot_cpu_data.x86_vendor) { - case X86_VENDOR_INTEL: - /* - * Due to a decoder implementation quirk, some - * specific Intel CPUs actually perform better with - * the "k8_nops" than with the SDM-recommended NOPs. 
- */ - if (boot_cpu_data.x86 == 6 && - boot_cpu_data.x86_model >= 0x0f && - boot_cpu_data.x86_model != 0x1c && - boot_cpu_data.x86_model != 0x26 && - boot_cpu_data.x86_model != 0x27 && - boot_cpu_data.x86_model < 0x30) { - ideal_nops = k8_nops; - } else if (boot_cpu_has(X86_FEATURE_NOPL)) { - ideal_nops = p6_nops; - } else { -#ifdef CONFIG_X86_64 - ideal_nops = k8_nops; -#else - ideal_nops = intel_nops; -#endif - } - break; - - case X86_VENDOR_HYGON: - ideal_nops = p6_nops; - return; - - case X86_VENDOR_AMD: - if (boot_cpu_data.x86 > 0xf) { - ideal_nops = p6_nops; - return; - } - - /* fall through */ - - default: -#ifdef CONFIG_X86_64 - ideal_nops = k8_nops; -#else - if (boot_cpu_has(X86_FEATURE_K8)) - ideal_nops = k8_nops; - else if (boot_cpu_has(X86_FEATURE_K7)) - ideal_nops = k7_nops; - else - ideal_nops = intel_nops; -#endif - } -} /* Use this to add nops to a buffer, then text_poke the whole buffer. */ static void __init_or_module add_nops(void *insns, unsigned int len) @@ -260,12 +108,15 @@ static void __init_or_module add_nops(void *insns, unsigned int len) unsigned int noplen = len; if (noplen > ASM_NOP_MAX) noplen = ASM_NOP_MAX; - memcpy(insns, ideal_nops[noplen], noplen); + memcpy(insns, x86_nops[noplen], noplen); insns += noplen; len -= noplen; } } +extern s32 __retpoline_sites[], __retpoline_sites_end[]; +extern s32 __return_sites[], __return_sites_end[]; +extern s32 __ibt_endbr_seal[], __ibt_endbr_seal_end[]; extern struct alt_instr __alt_instructions[], __alt_instructions_end[]; extern s32 __smp_locks[], __smp_locks_end[]; void text_poke_early(void *addr, const void *opcode, size_t len); @@ -336,25 +187,69 @@ done: } /* - * "noinline" to cause control flow change and thus invalidate I$ and - * cause refetch after modification. 
+ * optimize_nops_range() - Optimize a sequence of single byte NOPs (0x90) + * + * @instr: instruction byte stream + * @instrlen: length of the above + * @off: offset within @instr where the first NOP has been detected + * + * Return: number of NOPs found (and replaced). */ -static void __init_or_module noinline optimize_nops(struct alt_instr *a, u8 *instr) +static __always_inline int optimize_nops_range(u8 *instr, u8 instrlen, int off) { unsigned long flags; - int i; + int i = off, nnops; - for (i = 0; i < a->padlen; i++) { + while (i < instrlen) { if (instr[i] != 0x90) - return; + break; + + i++; } + nnops = i - off; + + if (nnops <= 1) + return nnops; + local_irq_save(flags); - add_nops(instr + (a->instrlen - a->padlen), a->padlen); + add_nops(instr + off, nnops); local_irq_restore(flags); - DUMP_BYTES(instr, a->instrlen, "%px: [%d:%d) optimized NOPs: ", - instr, a->instrlen - a->padlen, a->padlen); + DUMP_BYTES(instr, instrlen, "%px: [%d:%d) optimized NOPs: ", instr, off, i); + + return nnops; +} + +/* + * "noinline" to cause control flow change and thus invalidate I$ and + * cause refetch after modification. + */ +static void __init_or_module noinline optimize_nops(u8 *instr, size_t len) +{ + struct insn insn; + int i = 0; + + /* + * Jump over the non-NOP insns and optimize single-byte NOPs into bigger + * ones. + */ + for (;;) { + if (insn_decode_kernel(&insn, &instr[i])) + return; + + /* + * See if this and any potentially following NOPs can be + * optimized. + */ + if (insn.length == 1 && insn.opcode.bytes[0] == 0x90) + i += optimize_nops_range(instr, len, i); + else + i += insn.length; + + if (i >= len) + return; + } } /* @@ -386,26 +281,32 @@ void __init_or_module noinline apply_alternatives(struct alt_instr *start, */ for (a = start; a < end; a++) { int insn_buff_sz = 0; + /* Mask away "NOT" flag bit for feature to test. 
*/ + u16 feature = a->cpuid & ~ALTINSTR_FLAG_INV; instr = (u8 *)&a->instr_offset + a->instr_offset; replacement = (u8 *)&a->repl_offset + a->repl_offset; BUG_ON(a->instrlen > sizeof(insn_buff)); - BUG_ON(a->cpuid >= (NCAPINTS + NBUGINTS) * 32); - if (!boot_cpu_has(a->cpuid)) { - if (a->padlen > 1) - optimize_nops(a, instr); + BUG_ON(feature >= (NCAPINTS + NBUGINTS) * 32); - continue; - } + /* + * Patch if either: + * - feature is present + * - feature not present but ALTINSTR_FLAG_INV is set to mean, + * patch if feature is *NOT* present. + */ + if (!boot_cpu_has(feature) == !(a->cpuid & ALTINSTR_FLAG_INV)) + goto next; - DPRINTK("feat: %d*32+%d, old: (%pS (%px) len: %d), repl: (%px, len: %d), pad: %d", - a->cpuid >> 5, - a->cpuid & 0x1f, + DPRINTK("feat: %s%d*32+%d, old: (%pS (%px) len: %d), repl: (%px, len: %d)", + (a->cpuid & ALTINSTR_FLAG_INV) ? "!" : "", + feature >> 5, + feature & 0x1f, instr, instr, a->instrlen, - replacement, a->replacementlen, a->padlen); + replacement, a->replacementlen); - DUMP_BYTES(instr, a->instrlen, "%px: old_insn: ", instr); - DUMP_BYTES(replacement, a->replacementlen, "%px: rpl_insn: ", replacement); + DUMP_BYTES(instr, a->instrlen, "%px: old_insn: ", instr); + DUMP_BYTES(replacement, a->replacementlen, "%px: rpl_insn: ", replacement); memcpy(insn_buff, replacement, a->replacementlen); insn_buff_sz = a->replacementlen; @@ -426,17 +327,307 @@ void __init_or_module noinline apply_alternatives(struct alt_instr *start, if (a->replacementlen && is_jmp(replacement[0])) recompute_jump(a, instr, replacement, insn_buff); - if (a->instrlen > a->replacementlen) { - add_nops(insn_buff + a->replacementlen, - a->instrlen - a->replacementlen); - insn_buff_sz += a->instrlen - a->replacementlen; - } + for (; insn_buff_sz < a->instrlen; insn_buff_sz++) + insn_buff[insn_buff_sz] = 0x90; + DUMP_BYTES(insn_buff, insn_buff_sz, "%px: final_insn: ", instr); text_poke_early(instr, insn_buff, insn_buff_sz); + +next: + optimize_nops(instr, a->instrlen); } } 
+#if defined(CONFIG_RETPOLINE) && defined(CONFIG_OBJTOOL) + +/* + * CALL/JMP *%\reg + */ +static int emit_indirect(int op, int reg, u8 *bytes) +{ + int i = 0; + u8 modrm; + + switch (op) { + case CALL_INSN_OPCODE: + modrm = 0x10; /* Reg = 2; CALL r/m */ + break; + + case JMP32_INSN_OPCODE: + modrm = 0x20; /* Reg = 4; JMP r/m */ + break; + + default: + WARN_ON_ONCE(1); + return -1; + } + + if (reg >= 8) { + bytes[i++] = 0x41; /* REX.B prefix */ + reg -= 8; + } + + modrm |= 0xc0; /* Mod = 3 */ + modrm += reg; + + bytes[i++] = 0xff; /* opcode */ + bytes[i++] = modrm; + + return i; +} + +/* + * Rewrite the compiler generated retpoline thunk calls. + * + * For spectre_v2=off (!X86_FEATURE_RETPOLINE), rewrite them into immediate + * indirect instructions, avoiding the extra indirection. + * + * For example, convert: + * + * CALL __x86_indirect_thunk_\reg + * + * into: + * + * CALL *%\reg + * + * It also tries to inline spectre_v2=retpoline,lfence when size permits. + */ +static int patch_retpoline(void *addr, struct insn *insn, u8 *bytes) +{ + retpoline_thunk_t *target; + int reg, ret, i = 0; + u8 op, cc; + + target = addr + insn->length + insn->immediate.value; + reg = target - __x86_indirect_thunk_array; + + if (WARN_ON_ONCE(reg & ~0xf)) + return -1; + + /* If anyone ever does: CALL/JMP *%rsp, we're in deep trouble. 
*/ + BUG_ON(reg == 4); + + if (cpu_feature_enabled(X86_FEATURE_RETPOLINE) && + !cpu_feature_enabled(X86_FEATURE_RETPOLINE_LFENCE)) + return -1; + + op = insn->opcode.bytes[0]; + + /* + * Convert: + * + * Jcc.d32 __x86_indirect_thunk_\reg + * + * into: + * + * Jncc.d8 1f + * [ LFENCE ] + * JMP *%\reg + * [ NOP ] + * 1: + */ + /* Jcc.d32 second opcode byte is in the range: 0x80-0x8f */ + if (op == 0x0f && (insn->opcode.bytes[1] & 0xf0) == 0x80) { + cc = insn->opcode.bytes[1] & 0xf; + cc ^= 1; /* invert condition */ + + bytes[i++] = 0x70 + cc; /* Jcc.d8 */ + bytes[i++] = insn->length - 2; /* sizeof(Jcc.d8) == 2 */ + + /* Continue as if: JMP.d32 __x86_indirect_thunk_\reg */ + op = JMP32_INSN_OPCODE; + } + + /* + * For RETPOLINE_LFENCE: prepend the indirect CALL/JMP with an LFENCE. + */ + if (cpu_feature_enabled(X86_FEATURE_RETPOLINE_LFENCE)) { + bytes[i++] = 0x0f; + bytes[i++] = 0xae; + bytes[i++] = 0xe8; /* LFENCE */ + } + + ret = emit_indirect(op, reg, bytes + i); + if (ret < 0) + return ret; + i += ret; + + /* + * The compiler is supposed to EMIT an INT3 after every unconditional + * JMP instruction due to AMD BTC. However, if the compiler is too old + * or SLS isn't enabled, we still need an INT3 after indirect JMPs + * even on Intel. + */ + if (op == JMP32_INSN_OPCODE && i < insn->length) + bytes[i++] = INT3_INSN_OPCODE; + + for (; i < insn->length;) + bytes[i++] = BYTES_NOP1; + + return i; +} + +/* + * Generated by 'objtool --retpoline'. 
+ */ +void __init_or_module noinline apply_retpolines(s32 *start, s32 *end) +{ + s32 *s; + + for (s = start; s < end; s++) { + void *addr = (void *)s + *s; + struct insn insn; + int len, ret; + u8 bytes[16]; + u8 op1, op2; + + ret = insn_decode_kernel(&insn, addr); + if (WARN_ON_ONCE(ret < 0)) + continue; + + op1 = insn.opcode.bytes[0]; + op2 = insn.opcode.bytes[1]; + + switch (op1) { + case CALL_INSN_OPCODE: + case JMP32_INSN_OPCODE: + break; + + case 0x0f: /* escape */ + if (op2 >= 0x80 && op2 <= 0x8f) + break; + fallthrough; + default: + WARN_ON_ONCE(1); + continue; + } + + DPRINTK("retpoline at: %pS (%px) len: %d to: %pS", + addr, addr, insn.length, + addr + insn.length + insn.immediate.value); + + len = patch_retpoline(addr, &insn, bytes); + if (len == insn.length) { + optimize_nops(bytes, len); + DUMP_BYTES(((u8*)addr), len, "%px: orig: ", addr); + DUMP_BYTES(((u8*)bytes), len, "%px: repl: ", addr); + text_poke_early(addr, bytes, len); + } + } +} + +#ifdef CONFIG_RETHUNK +/* + * Rewrite the compiler generated return thunk tail-calls. 
+ * + * For example, convert: + * + * JMP __x86_return_thunk + * + * into: + * + * RET + */ +static int patch_return(void *addr, struct insn *insn, u8 *bytes) +{ + int i = 0; + + if (cpu_feature_enabled(X86_FEATURE_RETHUNK)) + return -1; + + bytes[i++] = RET_INSN_OPCODE; + + for (; i < insn->length;) + bytes[i++] = INT3_INSN_OPCODE; + + return i; +} + +void __init_or_module noinline apply_returns(s32 *start, s32 *end) +{ + s32 *s; + + for (s = start; s < end; s++) { + void *dest = NULL, *addr = (void *)s + *s; + struct insn insn; + int len, ret; + u8 bytes[16]; + u8 op; + + ret = insn_decode_kernel(&insn, addr); + if (WARN_ON_ONCE(ret < 0)) + continue; + + op = insn.opcode.bytes[0]; + if (op == JMP32_INSN_OPCODE) + dest = addr + insn.length + insn.immediate.value; + + if (__static_call_fixup(addr, op, dest) || + WARN_ONCE(dest != &__x86_return_thunk, + "missing return thunk: %pS-%pS: %*ph", + addr, dest, 5, addr)) + continue; + + DPRINTK("return thunk at: %pS (%px) len: %d to: %pS", + addr, addr, insn.length, + addr + insn.length + insn.immediate.value); + + len = patch_return(addr, &insn, bytes); + if (len == insn.length) { + DUMP_BYTES(((u8*)addr), len, "%px: orig: ", addr); + DUMP_BYTES(((u8*)bytes), len, "%px: repl: ", addr); + text_poke_early(addr, bytes, len); + } + } +} +#else +void __init_or_module noinline apply_returns(s32 *start, s32 *end) { } +#endif /* CONFIG_RETHUNK */ + +#else /* !CONFIG_RETPOLINE || !CONFIG_OBJTOOL */ + +void __init_or_module noinline apply_retpolines(s32 *start, s32 *end) { } +void __init_or_module noinline apply_returns(s32 *start, s32 *end) { } + +#endif /* CONFIG_RETPOLINE && CONFIG_OBJTOOL */ + +#ifdef CONFIG_X86_KERNEL_IBT + +/* + * Generated by: objtool --ibt + */ +void __init_or_module noinline apply_ibt_endbr(s32 *start, s32 *end) +{ + s32 *s; + + for (s = start; s < end; s++) { + u32 endbr, poison = gen_endbr_poison(); + void *addr = (void *)s + *s; + + if (WARN_ON_ONCE(get_kernel_nofault(endbr, addr))) + continue; + + if 
(WARN_ON_ONCE(!is_endbr(endbr))) + continue; + + DPRINTK("ENDBR at: %pS (%px)", addr, addr); + + /* + * When we have IBT, the lack of ENDBR will trigger #CP + */ + DUMP_BYTES(((u8*)addr), 4, "%px: orig: ", addr); + DUMP_BYTES(((u8*)&poison), 4, "%px: repl: ", addr); + text_poke_early(addr, &poison, 4); + } +} + +#else + +void __init_or_module noinline apply_ibt_endbr(s32 *start, s32 *end) { } + +#endif /* CONFIG_X86_KERNEL_IBT */ + #ifdef CONFIG_SMP static void alternatives_smp_lock(const s32 *start, const s32 *end, u8 *text, u8 *text_end) @@ -603,7 +794,7 @@ void __init_or_module apply_paravirt(struct paravirt_patch_site *start, BUG_ON(p->len > MAX_PATCH_LEN); /* prep the buffer with the original instructions */ memcpy(insn_buff, p->instr, p->len); - used = pv_ops.init.patch(p->type, insn_buff, (unsigned long)p->instr, p->len); + used = paravirt_patch(p->type, insn_buff, (unsigned long)p->instr, p->len); BUG_ON(used > p->len); @@ -638,34 +829,39 @@ asm ( " .pushsection .init.text, \"ax\", @progbits\n" " .type int3_magic, @function\n" "int3_magic:\n" + ANNOTATE_NOENDBR " movl $1, (%" _ASM_ARG1 ")\n" -" ret\n" + ASM_RET " .size int3_magic, .-int3_magic\n" " .popsection\n" ); -extern __initdata unsigned long int3_selftest_ip; /* defined in asm below */ +extern void int3_selftest_ip(void); /* defined in asm below */ static int __init int3_exception_notify(struct notifier_block *self, unsigned long val, void *data) { + unsigned long selftest = (unsigned long)&int3_selftest_ip; struct die_args *args = data; struct pt_regs *regs = args->regs; + OPTIMIZER_HIDE_VAR(selftest); + if (!regs || user_mode(regs)) return NOTIFY_DONE; if (val != DIE_INT3) return NOTIFY_DONE; - if (regs->ip - INT3_INSN_SIZE != int3_selftest_ip) + if (regs->ip - INT3_INSN_SIZE != selftest) return NOTIFY_DONE; int3_emulate_call(regs, (unsigned long)&int3_magic); return NOTIFY_STOP; } -static void __init int3_selftest(void) +/* Must be noinline to ensure uniqueness of int3_selftest_ip. 
*/ +static noinline void __init int3_selftest(void) { static __initdata struct notifier_block int3_exception_nb = { .notifier_call = int3_exception_notify, @@ -678,18 +874,12 @@ static void __init int3_selftest(void) /* * Basically: int3_magic(&val); but really complicated :-) * - * Stick the address of the INT3 instruction into int3_selftest_ip, - * then trigger the INT3, padded with NOPs to match a CALL instruction - * length. + * INT3 padded with NOP to CALL_INSN_SIZE. The int3_exception_nb + * notifier above will emulate CALL for us. */ - asm volatile ("1: int3; nop; nop; nop; nop\n\t" - ".pushsection .init.data,\"aw\"\n\t" - ".align " __ASM_SEL(4, 8) "\n\t" - ".type int3_selftest_ip, @object\n\t" - ".size int3_selftest_ip, " __ASM_SEL(4, 8) "\n\t" - "int3_selftest_ip:\n\t" - __ASM_SEL(.long, .quad) " 1b\n\t" - ".popsection\n\t" + asm volatile ("int3_selftest_ip:\n\t" + ANNOTATE_NOENDBR + " int3; nop; nop; nop; nop\n\t" : ASM_CALL_CONSTRAINT : __ASM_SEL_RAW(a, D) (&val) : "memory"); @@ -721,8 +911,44 @@ void __init alternative_instructions(void) * patching. */ + /* + * Paravirt patching and alternative patching can be combined to + * replace a function call with a short direct code sequence (e.g. + * by setting a constant return value instead of doing that in an + * external function). + * In order to make this work the following sequence is required: + * 1. set (artificial) features depending on used paravirt + * functions which can later influence alternative patching + * 2. apply paravirt patching (generally replacing an indirect + * function call with a direct one) + * 3. apply alternative patching (e.g. replacing a direct function + * call with a custom code sequence) + * Doing paravirt patching after alternative patching would clobber + * the optimization of the custom code with a function call again. + */ + paravirt_set_cap(); + + /* + * First patch paravirt functions, such that we overwrite the indirect + * call with the direct call. 
+ */ + apply_paravirt(__parainstructions, __parainstructions_end); + + /* + * Rewrite the retpolines, must be done before alternatives since + * those can rewrite the retpoline thunks. + */ + apply_retpolines(__retpoline_sites, __retpoline_sites_end); + apply_returns(__return_sites, __return_sites_end); + + /* + * Then patch alternatives, such that those paravirt calls that are in + * alternatives can be overwritten by their immediate fragments. + */ apply_alternatives(__alt_instructions, __alt_instructions_end); + apply_ibt_endbr(__ibt_endbr_seal, __ibt_endbr_seal_end); + #ifdef CONFIG_SMP /* Patch to UP if other cpus not imminent. */ if (!noreplace_smp && (num_present_cpus() == 1 || setup_max_cpus <= 1)) { @@ -739,8 +965,6 @@ void __init alternative_instructions(void) } #endif - apply_paravirt(__parainstructions, __parainstructions_end); - restart_nmi(); alternatives_patched = 1; } @@ -783,10 +1007,88 @@ void __init_or_module text_poke_early(void *addr, const void *opcode, } } +typedef struct { + struct mm_struct *mm; +} temp_mm_state_t; + +/* + * Using a temporary mm allows to set temporary mappings that are not accessible + * by other CPUs. Such mappings are needed to perform sensitive memory writes + * that override the kernel memory protections (e.g., W^X), without exposing the + * temporary page-table mappings that are required for these write operations to + * other CPUs. Using a temporary mm also allows to avoid TLB shootdowns when the + * mapping is torn down. + * + * Context: The temporary mm needs to be used exclusively by a single core. To + * harden security IRQs must be disabled while the temporary mm is + * loaded, thereby preventing interrupt handler bugs from overriding + * the kernel memory protection. 
+ */ +static inline temp_mm_state_t use_temporary_mm(struct mm_struct *mm) +{ + temp_mm_state_t temp_state; + + lockdep_assert_irqs_disabled(); + + /* + * Make sure not to be in TLB lazy mode, as otherwise we'll end up + * with a stale address space WITHOUT being in lazy mode after + * restoring the previous mm. + */ + if (this_cpu_read(cpu_tlbstate_shared.is_lazy)) + leave_mm(smp_processor_id()); + + temp_state.mm = this_cpu_read(cpu_tlbstate.loaded_mm); + switch_mm_irqs_off(NULL, mm, current); + + /* + * If breakpoints are enabled, disable them while the temporary mm is + * used. Userspace might set up watchpoints on addresses that are used + * in the temporary mm, which would lead to wrong signals being sent or + * crashes. + * + * Note that breakpoints are not disabled selectively, which also causes + * kernel breakpoints (e.g., perf's) to be disabled. This might be + * undesirable, but still seems reasonable as the code that runs in the + * temporary mm should be short. + */ + if (hw_breakpoint_active()) + hw_breakpoint_disable(); + + return temp_state; +} + +static inline void unuse_temporary_mm(temp_mm_state_t prev_state) +{ + lockdep_assert_irqs_disabled(); + switch_mm_irqs_off(NULL, prev_state.mm, current); + + /* + * Restore the breakpoints if they were disabled before the temporary mm + * was loaded. 
+ */ + if (hw_breakpoint_active()) + hw_breakpoint_restore(); +} + __ro_after_init struct mm_struct *poking_mm; __ro_after_init unsigned long poking_addr; -static void *__text_poke(void *addr, const void *opcode, size_t len) +static void text_poke_memcpy(void *dst, const void *src, size_t len) +{ + memcpy(dst, src, len); +} + +static void text_poke_memset(void *dst, const void *src, size_t len) +{ + int c = *(const int *)src; + + memset(dst, c, len); +} + +typedef void text_poke_f(void *dst, const void *src, size_t len); + +static void *__text_poke(text_poke_f func, void *addr, const void *src, size_t len) { bool cross_page_boundary = offset_in_page(addr) + len > PAGE_SIZE; struct page *pages[2] = {NULL}; @@ -818,8 +1120,6 @@ static void *__text_poke(void *addr, const void *opcode, size_t len) */ BUG_ON(!pages[0] || (cross_page_boundary && !pages[1])); - local_irq_save(flags); - /* * Map the page without the global bit, as TLB flushing is done with * flush_tlb_mm_range(), which is intended for non-global PTEs. @@ -836,6 +1136,8 @@ static void *__text_poke(void *addr, const void *opcode, size_t len) */ VM_BUG_ON(!ptep); + local_irq_save(flags); + pte = mk_pte(pages[0], pgprot); set_pte_at(poking_mm, poking_addr, ptep, pte); @@ -851,7 +1153,7 @@ static void *__text_poke(void *addr, const void *opcode, size_t len) prev = use_temporary_mm(poking_mm); kasan_disable_current(); - memcpy((u8 *)poking_addr + offset_in_page(addr), opcode, len); + func((u8 *)poking_addr + offset_in_page(addr), src, len); kasan_enable_current(); /* @@ -879,14 +1181,16 @@ static void *__text_poke(void *addr, const void *opcode, size_t len) (cross_page_boundary ? 2 : 1) * PAGE_SIZE, PAGE_SHIFT, false); - /* - * If the text does not match what we just wrote then something is - * fundamentally screwy; there's nothing we can really do about that. 
- */ - BUG_ON(memcmp(addr, opcode, len)); + if (func == text_poke_memcpy) { + /* + * If the text does not match what we just wrote then something is + * fundamentally screwy; there's nothing we can really do about that. + */ + BUG_ON(memcmp(addr, src, len)); + } - pte_unmap_unlock(ptep, ptl); local_irq_restore(flags); + pte_unmap_unlock(ptep, ptl); return addr; } @@ -910,7 +1214,7 @@ void *text_poke(void *addr, const void *opcode, size_t len) { lockdep_assert_held(&text_mutex); - return __text_poke(addr, opcode, len); + return __text_poke(text_poke_memcpy, addr, opcode, len); } /** @@ -929,7 +1233,72 @@ void *text_poke(void *addr, const void *opcode, size_t len) */ void *text_poke_kgdb(void *addr, const void *opcode, size_t len) { - return __text_poke(addr, opcode, len); + return __text_poke(text_poke_memcpy, addr, opcode, len); +} + +/** + * text_poke_copy - Copy instructions into (an unused part of) RX memory + * @addr: address to modify + * @opcode: source of the copy + * @len: length to copy, could be more than 2x PAGE_SIZE + * + * Not safe against concurrent execution; useful for JITs to dump + * new code blocks into unused regions of RX memory. Can be used in + * conjunction with synchronize_rcu_tasks() to wait for existing + * execution to quiesce after having made sure no existing functions + * pointers are live. 
+ */ +void *text_poke_copy(void *addr, const void *opcode, size_t len) +{ + unsigned long start = (unsigned long)addr; + size_t patched = 0; + + if (WARN_ON_ONCE(core_kernel_text(start))) + return NULL; + + mutex_lock(&text_mutex); + while (patched < len) { + unsigned long ptr = start + patched; + size_t s; + + s = min_t(size_t, PAGE_SIZE * 2 - offset_in_page(ptr), len - patched); + + __text_poke(text_poke_memcpy, (void *)ptr, opcode + patched, s); + patched += s; + } + mutex_unlock(&text_mutex); + return addr; +} + +/** + * text_poke_set - memset into (an unused part of) RX memory + * @addr: address to modify + * @c: the byte to fill the area with + * @len: length to copy, could be more than 2x PAGE_SIZE + * + * This is useful to overwrite unused regions of RX memory with illegal + * instructions. + */ +void *text_poke_set(void *addr, int c, size_t len) +{ + unsigned long start = (unsigned long)addr; + size_t patched = 0; + + if (WARN_ON_ONCE(core_kernel_text(start))) + return NULL; + + mutex_lock(&text_mutex); + while (patched < len) { + unsigned long ptr = start + patched; + size_t s; + + s = min_t(size_t, PAGE_SIZE * 2 - offset_in_page(ptr), len - patched); + + __text_poke(text_poke_memset, (void *)ptr, (void *)&c, s); + patched += s; + } + mutex_unlock(&text_mutex); + return addr; } static void do_sync_core(void *info) @@ -943,10 +1312,14 @@ void text_poke_sync(void) } struct text_poke_loc { - s32 rel_addr; /* addr := _stext + rel_addr */ - s32 rel32; + /* addr := _stext + rel_addr */ + s32 rel_addr; + s32 disp; + u8 len; u8 opcode; const u8 text[POKE_MAX_OPCODE_SIZE]; + /* see text_poke_bp_batch() */ + u8 old; }; struct bp_patching_desc { @@ -955,30 +1328,33 @@ struct bp_patching_desc { atomic_t refs; }; -static struct bp_patching_desc *bp_desc; +static struct bp_patching_desc bp_desc; -static inline struct bp_patching_desc *try_get_desc(struct bp_patching_desc **descp) +static __always_inline +struct bp_patching_desc *try_get_desc(void) { - struct 
bp_patching_desc *desc = READ_ONCE(*descp); /* rcu_dereference */ + struct bp_patching_desc *desc = &bp_desc; - if (!desc || !atomic_inc_not_zero(&desc->refs)) + if (!arch_atomic_inc_not_zero(&desc->refs)) return NULL; return desc; } -static inline void put_desc(struct bp_patching_desc *desc) +static __always_inline void put_desc(void) { + struct bp_patching_desc *desc = &bp_desc; + smp_mb__before_atomic(); - atomic_dec(&desc->refs); + arch_atomic_dec(&desc->refs); } -static inline void *text_poke_addr(struct text_poke_loc *tp) +static __always_inline void *text_poke_addr(struct text_poke_loc *tp) { return _stext + tp->rel_addr; } -static int notrace patch_cmp(const void *key, const void *elt) +static __always_inline int patch_cmp(const void *key, const void *elt) { struct text_poke_loc *tp = (struct text_poke_loc *) elt; @@ -988,13 +1364,12 @@ static int notrace patch_cmp(const void *key, const void *elt) return 1; return 0; } -NOKPROBE_SYMBOL(patch_cmp); -int notrace poke_int3_handler(struct pt_regs *regs) +noinstr int poke_int3_handler(struct pt_regs *regs) { struct bp_patching_desc *desc; struct text_poke_loc *tp; - int len, ret = 0; + int ret = 0; void *ip; if (user_mode(regs)) @@ -1002,15 +1377,15 @@ int notrace poke_int3_handler(struct pt_regs *regs) /* * Having observed our INT3 instruction, we now must observe - * bp_desc: + * bp_desc with non-zero refcount: * - * bp_desc = desc INT3 + * bp_desc.refs = 1 INT3 * WMB RMB - * write INT3 if (desc) + * write INT3 if (bp_desc.refs != 0) */ smp_rmb(); - desc = try_get_desc(&bp_desc); + desc = try_get_desc(); if (!desc) return 0; @@ -1023,9 +1398,9 @@ int notrace poke_int3_handler(struct pt_regs *regs) * Skip the binary search if there is a single member in the vector. 
*/ if (unlikely(desc->nr_entries > 1)) { - tp = bsearch(ip, desc->vec, desc->nr_entries, - sizeof(struct text_poke_loc), - patch_cmp); + tp = __inline_bsearch(ip, desc->vec, desc->nr_entries, + sizeof(struct text_poke_loc), + patch_cmp); if (!tp) goto out_put; } else { @@ -1034,8 +1409,7 @@ int notrace poke_int3_handler(struct pt_regs *regs) goto out_put; } - len = text_opcode_size(tp->opcode); - ip += len; + ip += tp->len; switch (tp->opcode) { case INT3_INSN_OPCODE: @@ -1045,13 +1419,17 @@ int notrace poke_int3_handler(struct pt_regs *regs) */ goto out_put; + case RET_INSN_OPCODE: + int3_emulate_ret(regs); + break; + case CALL_INSN_OPCODE: - int3_emulate_call(regs, (long)ip + tp->rel32); + int3_emulate_call(regs, (long)ip + tp->disp); break; case JMP32_INSN_OPCODE: case JMP8_INSN_OPCODE: - int3_emulate_jmp(regs, (long)ip + tp->rel32); + int3_emulate_jmp(regs, (long)ip + tp->disp); break; default: @@ -1061,10 +1439,9 @@ int notrace poke_int3_handler(struct pt_regs *regs) ret = 1; out_put: - put_desc(desc); + put_desc(); return ret; } -NOKPROBE_SYMBOL(poke_int3_handler); #define TP_VEC_MAX (PAGE_SIZE / sizeof(struct text_poke_loc)) static struct text_poke_loc tp_vec[TP_VEC_MAX]; @@ -1093,18 +1470,20 @@ static int tp_vec_nr; */ static void text_poke_bp_batch(struct text_poke_loc *tp, unsigned int nr_entries) { - struct bp_patching_desc desc = { - .vec = tp, - .nr_entries = nr_entries, - .refs = ATOMIC_INIT(1), - }; unsigned char int3 = INT3_INSN_OPCODE; unsigned int i; int do_sync; lockdep_assert_held(&text_mutex); - smp_store_release(&bp_desc, &desc); /* rcu_assign_pointer */ + bp_desc.vec = tp; + bp_desc.nr_entries = nr_entries; + + /* + * Corresponds to the implicit memory barrier in try_get_desc() to + * ensure reading a non-zero refcount provides up to date bp_desc data. 
+ */ + atomic_set_release(&bp_desc.refs, 1); /* * Corresponding read barrier in int3 notifier for making sure the @@ -1115,8 +1494,10 @@ static void text_poke_bp_batch(struct text_poke_loc *tp, unsigned int nr_entries /* * First step: add a int3 trap to the address that will be patched. */ - for (i = 0; i < nr_entries; i++) + for (i = 0; i < nr_entries; i++) { + tp[i].old = *(u8 *)text_poke_addr(&tp[i]); text_poke(text_poke_addr(&tp[i]), &int3, INT3_INSN_SIZE); + } text_poke_sync(); @@ -1124,14 +1505,45 @@ static void text_poke_bp_batch(struct text_poke_loc *tp, unsigned int nr_entries * Second step: update all but the first byte of the patched range. */ for (do_sync = 0, i = 0; i < nr_entries; i++) { - int len = text_opcode_size(tp[i].opcode); + u8 old[POKE_MAX_OPCODE_SIZE] = { tp[i].old, }; + int len = tp[i].len; if (len - INT3_INSN_SIZE > 0) { + memcpy(old + INT3_INSN_SIZE, + text_poke_addr(&tp[i]) + INT3_INSN_SIZE, + len - INT3_INSN_SIZE); text_poke(text_poke_addr(&tp[i]) + INT3_INSN_SIZE, (const char *)tp[i].text + INT3_INSN_SIZE, len - INT3_INSN_SIZE); do_sync++; } + + /* + * Emit a perf event to record the text poke, primarily to + * support Intel PT decoding which must walk the executable code + * to reconstruct the trace. The flow up to here is: + * - write INT3 byte + * - IPI-SYNC + * - write instruction tail + * At this point the actual control flow will be through the + * INT3 and handler and not hit the old or new instruction. + * Intel PT outputs FUP/TIP packets for the INT3, so the flow + * can still be decoded. Subsequently: + * - emit RECORD_TEXT_POKE with the new instruction + * - IPI-SYNC + * - write first byte + * - IPI-SYNC + * So before the text poke event timestamp, the decoder will see + * either the old instruction flow or FUP/TIP of INT3. After the + * text poke event timestamp, the decoder will see either the + * new instruction flow or FUP/TIP of INT3. 
Thus decoders can + * use the timestamp as the point at which to modify the + * executable code. + * The old instruction is recorded so that the event can be + * processed forwards or backwards. + */ + perf_event_text_poke(text_poke_addr(&tp[i]), old, len, + tp[i].text, len); } if (do_sync) { @@ -1159,54 +1571,69 @@ static void text_poke_bp_batch(struct text_poke_loc *tp, unsigned int nr_entries text_poke_sync(); /* - * Remove and synchronize_rcu(), except we have a very primitive - * refcount based completion. + * Remove and wait for refs to be zero. */ - WRITE_ONCE(bp_desc, NULL); /* RCU_INIT_POINTER */ - if (!atomic_dec_and_test(&desc.refs)) - atomic_cond_read_acquire(&desc.refs, !VAL); + if (!atomic_dec_and_test(&bp_desc.refs)) + atomic_cond_read_acquire(&bp_desc.refs, !VAL); } -void text_poke_loc_init(struct text_poke_loc *tp, void *addr, - const void *opcode, size_t len, const void *emulate) +static void text_poke_loc_init(struct text_poke_loc *tp, void *addr, + const void *opcode, size_t len, const void *emulate) { struct insn insn; + int ret, i; memcpy((void *)tp->text, opcode, len); if (!emulate) emulate = opcode; - kernel_insn_init(&insn, emulate, MAX_INSN_SIZE); - insn_get_length(&insn); - - BUG_ON(!insn_complete(&insn)); - BUG_ON(len != insn.length); + ret = insn_decode_kernel(&insn, emulate); + BUG_ON(ret < 0); tp->rel_addr = addr - (void *)_stext; + tp->len = len; tp->opcode = insn.opcode.bytes[0]; switch (tp->opcode) { + case RET_INSN_OPCODE: + case JMP32_INSN_OPCODE: + case JMP8_INSN_OPCODE: + /* + * Control flow instructions without implied execution of the + * next instruction can be padded with INT3. 
+ */ + for (i = insn.length; i < len; i++) + BUG_ON(tp->text[i] != INT3_INSN_OPCODE); + break; + + default: + BUG_ON(len != insn.length); + }; + + + switch (tp->opcode) { case INT3_INSN_OPCODE: + case RET_INSN_OPCODE: break; case CALL_INSN_OPCODE: case JMP32_INSN_OPCODE: case JMP8_INSN_OPCODE: - tp->rel32 = insn.immediate.value; + tp->disp = insn.immediate.value; break; default: /* assume NOP */ switch (len) { case 2: /* NOP2 -- emulate as JMP8+0 */ - BUG_ON(memcmp(emulate, ideal_nops[len], len)); + BUG_ON(memcmp(emulate, x86_nops[len], len)); tp->opcode = JMP8_INSN_OPCODE; - tp->rel32 = 0; + tp->disp = 0; break; case 5: /* NOP5 -- emulate as JMP32+0 */ - BUG_ON(memcmp(emulate, ideal_nops[NOP_ATOMIC5], len)); + BUG_ON(memcmp(emulate, x86_nops[len], len)); tp->opcode = JMP32_INSN_OPCODE; - tp->rel32 = 0; + tp->disp = 0; break; default: /* unknown instruction */ @@ -1270,7 +1697,7 @@ void __ref text_poke_queue(void *addr, const void *opcode, size_t len, const voi * @addr: address to patch * @opcode: opcode of new instruction * @len: length to copy - * @handler: address to jump to when the temporary breakpoint is hit + * @emulate: instruction to be emulated * * Update a single instruction with the vector in the stack, avoiding * dynamically allocated memory. This function should be used when it is diff --git a/arch/x86/kernel/amd_gart_64.c b/arch/x86/kernel/amd_gart_64.c index 4e5f50236048..19a0207e529f 100644 --- a/arch/x86/kernel/amd_gart_64.c +++ b/arch/x86/kernel/amd_gart_64.c @@ -6,7 +6,7 @@ * This allows to use PCI devices that only support 32bit addresses on systems * with more than 4GB. * - * See Documentation/DMA-API-HOWTO.txt for the interface specification. + * See Documentation/core-api/dma-api-howto.rst for the interface specification. * * Copyright 2002 Andi Kleen, SuSE Labs. 
*/ @@ -32,17 +32,15 @@ #include <linux/gfp.h> #include <linux/atomic.h> #include <linux/dma-direct.h> +#include <linux/dma-map-ops.h> #include <asm/mtrr.h> -#include <asm/pgtable.h> #include <asm/proto.h> #include <asm/iommu.h> #include <asm/gart.h> #include <asm/set_memory.h> -#include <asm/swiotlb.h> #include <asm/dma.h> #include <asm/amd_nb.h> #include <asm/x86_init.h> -#include <asm/iommu_table.h> static unsigned long iommu_bus_base; /* GART remapping area (physical) */ static unsigned long iommu_size; /* size of remapping area bytes */ @@ -55,7 +53,7 @@ static u32 *iommu_gatt_base; /* Remapping table */ * of only flushing when an mapping is reused. With it true the GART is * flushed for every mapping. Problem is that doing the lazy flush seems * to trigger bugs with some popular PCI cards, in particular 3ware (but - * has been also also seen with Qlogic at least). + * has been also seen with Qlogic at least). */ static int iommu_fullflush = 1; @@ -97,8 +95,7 @@ static unsigned long alloc_iommu(struct device *dev, int size, base_index = ALIGN(iommu_bus_base & dma_get_seg_boundary(dev), PAGE_SIZE) >> PAGE_SHIFT; - boundary_size = ALIGN((u64)dma_get_seg_boundary(dev) + 1, - PAGE_SIZE) >> PAGE_SHIFT; + boundary_size = dma_get_seg_boundary_nr_pages(dev, PAGE_SHIFT); spin_lock_irqsave(&iommu_bitmap_lock, flags); offset = iommu_area_alloc(iommu_gart_bitmap, iommu_pages, next_bit, @@ -159,7 +156,7 @@ static void dump_leak(void) return; dump = 1; - show_stack(NULL, NULL); + show_stack(NULL, NULL, KERN_ERR); debug_dma_dump_mappings(NULL); } #endif @@ -332,7 +329,7 @@ static int __dma_map_cont(struct device *dev, struct scatterlist *start, int i; if (iommu_start == -1) - return -1; + return -ENOMEM; for_each_sg(start, s, nelems, i) { unsigned long pages, addr; @@ -381,13 +378,13 @@ static int gart_map_sg(struct device *dev, struct scatterlist *sg, int nents, enum dma_data_direction dir, unsigned long attrs) { struct scatterlist *s, *ps, *start_sg, *sgmap; - int need = 0, 
nextneed, i, out, start; + int need = 0, nextneed, i, out, start, ret; unsigned long pages = 0; unsigned int seg_size; unsigned int max_seg_size; if (nents == 0) - return 0; + return -EINVAL; out = 0; start = 0; @@ -415,8 +412,9 @@ static int gart_map_sg(struct device *dev, struct scatterlist *sg, int nents, if (!iommu_merge || !nextneed || !need || s->offset || (s->length + seg_size > max_seg_size) || (ps->offset + ps->length) % PAGE_SIZE) { - if (dma_map_cont(dev, start_sg, i - start, - sgmap, pages, need) < 0) + ret = dma_map_cont(dev, start_sg, i - start, + sgmap, pages, need); + if (ret < 0) goto error; out++; @@ -433,7 +431,8 @@ static int gart_map_sg(struct device *dev, struct scatterlist *sg, int nents, pages += iommu_num_pages(s->offset, s->length, PAGE_SIZE); ps = s; } - if (dma_map_cont(dev, start_sg, i - start, sgmap, pages, need) < 0) + ret = dma_map_cont(dev, start_sg, i - start, sgmap, pages, need); + if (ret < 0) goto error; out++; flush_gart(); @@ -457,9 +456,7 @@ error: panic("dma_map_sg: overflow on %lu pages\n", pages); iommu_full(dev, pages << PAGE_SHIFT, dir); - for_each_sg(sg, s, nents, i) - s->dma_address = DMA_MAPPING_ERROR; - return 0; + return ret; } /* allocate and map a coherent mapping */ @@ -469,7 +466,7 @@ gart_alloc_coherent(struct device *dev, size_t size, dma_addr_t *dma_addr, { void *vaddr; - vaddr = dma_direct_alloc_pages(dev, size, dma_addr, flag, attrs); + vaddr = dma_direct_alloc(dev, size, dma_addr, flag, attrs); if (!vaddr || !force_iommu || dev->coherent_dma_mask <= DMA_BIT_MASK(24)) return vaddr; @@ -481,7 +478,7 @@ gart_alloc_coherent(struct device *dev, size_t size, dma_addr_t *dma_addr, goto out_free; return vaddr; out_free: - dma_direct_free_pages(dev, size, vaddr, *dma_addr, attrs); + dma_direct_free(dev, size, vaddr, *dma_addr, attrs); return NULL; } @@ -491,7 +488,7 @@ gart_free_coherent(struct device *dev, size_t size, void *vaddr, dma_addr_t dma_addr, unsigned long attrs) { gart_unmap_page(dev, dma_addr, size, 
DMA_BIDIRECTIONAL, 0); - dma_direct_free_pages(dev, size, vaddr, dma_addr, attrs); + dma_direct_free(dev, size, vaddr, dma_addr, attrs); } static int no_agp; @@ -679,6 +676,8 @@ static const struct dma_map_ops gart_dma_ops = { .get_sgtable = dma_common_get_sgtable, .dma_supported = dma_direct_supported, .get_required_mask = dma_direct_get_required_mask, + .alloc_pages = dma_direct_alloc_pages, + .free_pages = dma_direct_free_pages, }; static void gart_iommu_shutdown(void) @@ -744,7 +743,8 @@ int __init gart_iommu_init(void) start_pfn = PFN_DOWN(aper_base); if (!pfn_range_is_mapped(start_pfn, end_pfn)) - init_memory_mapping(start_pfn<<PAGE_SHIFT, end_pfn<<PAGE_SHIFT); + init_memory_mapping(start_pfn<<PAGE_SHIFT, end_pfn<<PAGE_SHIFT, + PAGE_KERNEL); pr_info("PCI-DMA: using GART IOMMU.\n"); iommu_size = check_iommu_size(info.aper_base, aper_size); @@ -806,7 +806,7 @@ int __init gart_iommu_init(void) flush_gart(); dma_ops = &gart_dma_ops; x86_platform.iommu_shutdown = gart_iommu_shutdown; - swiotlb = 0; + x86_swiotlb_enable = false; return 0; } @@ -840,4 +840,3 @@ void __init gart_parse_options(char *p) } } } -IOMMU_INIT_POST(gart_iommu_hole_init); diff --git a/arch/x86/kernel/amd_nb.c b/arch/x86/kernel/amd_nb.c index 69aed0ebbdfc..4266b64631a4 100644 --- a/arch/x86/kernel/amd_nb.c +++ b/arch/x86/kernel/amd_nb.c @@ -1,6 +1,6 @@ // SPDX-License-Identifier: GPL-2.0-only /* - * Shared support code for AMD K8 northbridges and derivates. + * Shared support code for AMD K8 northbridges and derivatives. * Copyright 2006 Andi Kleen, SUSE Labs. 
*/ @@ -18,13 +18,26 @@ #define PCI_DEVICE_ID_AMD_17H_ROOT 0x1450 #define PCI_DEVICE_ID_AMD_17H_M10H_ROOT 0x15d0 #define PCI_DEVICE_ID_AMD_17H_M30H_ROOT 0x1480 +#define PCI_DEVICE_ID_AMD_17H_M60H_ROOT 0x1630 +#define PCI_DEVICE_ID_AMD_17H_MA0H_ROOT 0x14b5 +#define PCI_DEVICE_ID_AMD_19H_M10H_ROOT 0x14a4 +#define PCI_DEVICE_ID_AMD_19H_M60H_ROOT 0x14d8 +#define PCI_DEVICE_ID_AMD_19H_M70H_ROOT 0x14e8 #define PCI_DEVICE_ID_AMD_17H_DF_F4 0x1464 #define PCI_DEVICE_ID_AMD_17H_M10H_DF_F4 0x15ec #define PCI_DEVICE_ID_AMD_17H_M30H_DF_F4 0x1494 +#define PCI_DEVICE_ID_AMD_17H_M60H_DF_F4 0x144c #define PCI_DEVICE_ID_AMD_17H_M70H_DF_F4 0x1444 +#define PCI_DEVICE_ID_AMD_17H_MA0H_DF_F4 0x1728 #define PCI_DEVICE_ID_AMD_19H_DF_F4 0x1654 - -/* Protect the PCI config register pairs used for SMN and DF indirect access. */ +#define PCI_DEVICE_ID_AMD_19H_M10H_DF_F4 0x14b1 +#define PCI_DEVICE_ID_AMD_19H_M40H_ROOT 0x14b5 +#define PCI_DEVICE_ID_AMD_19H_M40H_DF_F4 0x167d +#define PCI_DEVICE_ID_AMD_19H_M50H_DF_F4 0x166e +#define PCI_DEVICE_ID_AMD_19H_M60H_DF_F4 0x14e4 +#define PCI_DEVICE_ID_AMD_19H_M70H_DF_F4 0x14f4 + +/* Protect the PCI config register pairs used for SMN. 
*/ static DEFINE_MUTEX(smn_mutex); static u32 *flush_words; @@ -33,13 +46,18 @@ static const struct pci_device_id amd_root_ids[] = { { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_17H_ROOT) }, { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_17H_M10H_ROOT) }, { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_17H_M30H_ROOT) }, + { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_17H_M60H_ROOT) }, + { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_17H_MA0H_ROOT) }, + { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_19H_M10H_ROOT) }, + { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_19H_M40H_ROOT) }, + { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_19H_M60H_ROOT) }, + { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_19H_M70H_ROOT) }, {} }; - #define PCI_DEVICE_ID_AMD_CNB17H_F4 0x1704 -const struct pci_device_id amd_nb_misc_ids[] = { +static const struct pci_device_id amd_nb_misc_ids[] = { { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_K8_NB_MISC) }, { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_10H_NB_MISC) }, { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_15H_NB_F3) }, @@ -51,12 +69,18 @@ const struct pci_device_id amd_nb_misc_ids[] = { { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_17H_DF_F3) }, { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_17H_M10H_DF_F3) }, { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_17H_M30H_DF_F3) }, + { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_17H_M60H_DF_F3) }, + { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_17H_MA0H_DF_F3) }, { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_CNB17H_F3) }, { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_17H_M70H_DF_F3) }, { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_19H_DF_F3) }, + { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_19H_M10H_DF_F3) }, + { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_19H_M40H_DF_F3) }, + { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_19H_M50H_DF_F3) }, + { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_19H_M60H_DF_F3) }, 
+ { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_19H_M70H_DF_F3) }, {} }; -EXPORT_SYMBOL_GPL(amd_nb_misc_ids); static const struct pci_device_id amd_nb_link_ids[] = { { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_15H_NB_F4) }, @@ -67,8 +91,13 @@ static const struct pci_device_id amd_nb_link_ids[] = { { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_17H_DF_F4) }, { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_17H_M10H_DF_F4) }, { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_17H_M30H_DF_F4) }, + { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_17H_M60H_DF_F4) }, { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_17H_M70H_DF_F4) }, + { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_17H_MA0H_DF_F4) }, { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_19H_DF_F4) }, + { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_19H_M10H_DF_F4) }, + { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_19H_M40H_DF_F4) }, + { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_19H_M50H_DF_F4) }, { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_CNB17H_F4) }, {} }; @@ -171,55 +200,8 @@ int amd_smn_write(u16 node, u32 address, u32 value) } EXPORT_SYMBOL_GPL(amd_smn_write); -/* - * Data Fabric Indirect Access uses FICAA/FICAD. - * - * Fabric Indirect Configuration Access Address (FICAA): Constructed based - * on the device's Instance Id and the PCI function and register offset of - * the desired register. - * - * Fabric Indirect Configuration Access Data (FICAD): There are FICAD LO - * and FICAD HI registers but so far we only need the LO register. 
- */ -int amd_df_indirect_read(u16 node, u8 func, u16 reg, u8 instance_id, u32 *lo) -{ - struct pci_dev *F4; - u32 ficaa; - int err = -ENODEV; - - if (node >= amd_northbridges.num) - goto out; - - F4 = node_to_amd_nb(node)->link; - if (!F4) - goto out; - - ficaa = 1; - ficaa |= reg & 0x3FC; - ficaa |= (func & 0x7) << 11; - ficaa |= instance_id << 16; - - mutex_lock(&smn_mutex); - - err = pci_write_config_dword(F4, 0x5C, ficaa); - if (err) { - pr_warn("Error writing DF Indirect FICAA, FICAA=0x%x\n", ficaa); - goto out_unlock; - } - - err = pci_read_config_dword(F4, 0x98, lo); - if (err) - pr_warn("Error reading DF Indirect FICAD LO, FICAA=0x%x.\n", ficaa); - -out_unlock: - mutex_unlock(&smn_mutex); - -out: - return err; -} -EXPORT_SYMBOL_GPL(amd_df_indirect_read); -int amd_cache_northbridges(void) +static int amd_cache_northbridges(void) { const struct pci_device_id *misc_ids = amd_nb_misc_ids; const struct pci_device_id *link_ids = amd_nb_link_ids; @@ -241,14 +223,14 @@ int amd_cache_northbridges(void) } misc = NULL; - while ((misc = next_northbridge(misc, misc_ids)) != NULL) + while ((misc = next_northbridge(misc, misc_ids))) misc_count++; if (!misc_count) return -ENODEV; root = NULL; - while ((root = next_northbridge(root, root_ids)) != NULL) + while ((root = next_northbridge(root, root_ids))) root_count++; if (root_count) { @@ -321,7 +303,6 @@ int amd_cache_northbridges(void) return 0; } -EXPORT_SYMBOL_GPL(amd_cache_northbridges); /* * Ignores subdevice/subvendor but as far as I can figure out @@ -381,7 +362,7 @@ struct resource *amd_get_mmconfig_range(struct resource *res) int amd_get_subcaches(int cpu) { - struct pci_dev *link = node_to_amd_nb(amd_get_nb_id(cpu))->link; + struct pci_dev *link = node_to_amd_nb(topology_die_id(cpu))->link; unsigned int mask; if (!amd_nb_has_feature(AMD_NB_L3_PARTITIONING)) @@ -395,7 +376,7 @@ int amd_get_subcaches(int cpu) int amd_set_subcaches(int cpu, unsigned long mask) { static unsigned int reset, ban; - struct 
amd_northbridge *nb = node_to_amd_nb(amd_get_nb_id(cpu)); + struct amd_northbridge *nb = node_to_amd_nb(topology_die_id(cpu)); unsigned int reg; int cuid; diff --git a/arch/x86/kernel/apb_timer.c b/arch/x86/kernel/apb_timer.c deleted file mode 100644 index fe698f96617c..000000000000 --- a/arch/x86/kernel/apb_timer.c +++ /dev/null @@ -1,400 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0-only -/* - * apb_timer.c: Driver for Langwell APB timers - * - * (C) Copyright 2009 Intel Corporation - * Author: Jacob Pan (jacob.jun.pan@intel.com) - * - * Note: - * Langwell is the south complex of Intel Moorestown MID platform. There are - * eight external timers in total that can be used by the operating system. - * The timer information, such as frequency and addresses, is provided to the - * OS via SFI tables. - * Timer interrupts are routed via FW/HW emulated IOAPIC independently via - * individual redirection table entries (RTE). - * Unlike HPET, there is no master counter, therefore one of the timers are - * used as clocksource. The overall allocation looks like: - * - timer 0 - NR_CPUs for per cpu timer - * - one timer for clocksource - * - one timer for watchdog driver. - * It is also worth notice that APB timer does not support true one-shot mode, - * free-running mode will be used here to emulate one-shot mode. - * APB timer can also be used as broadcast timer along with per cpu local APIC - * timer, but by default APB timer has higher rating than local APIC timers. 
- */ - -#include <linux/delay.h> -#include <linux/dw_apb_timer.h> -#include <linux/errno.h> -#include <linux/init.h> -#include <linux/slab.h> -#include <linux/pm.h> -#include <linux/sfi.h> -#include <linux/interrupt.h> -#include <linux/cpu.h> -#include <linux/irq.h> - -#include <asm/fixmap.h> -#include <asm/apb_timer.h> -#include <asm/intel-mid.h> -#include <asm/time.h> - -#define APBT_CLOCKEVENT_RATING 110 -#define APBT_CLOCKSOURCE_RATING 250 - -#define APBT_CLOCKEVENT0_NUM (0) -#define APBT_CLOCKSOURCE_NUM (2) - -static phys_addr_t apbt_address; -static int apb_timer_block_enabled; -static void __iomem *apbt_virt_address; - -/* - * Common DW APB timer info - */ -static unsigned long apbt_freq; - -struct apbt_dev { - struct dw_apb_clock_event_device *timer; - unsigned int num; - int cpu; - unsigned int irq; - char name[10]; -}; - -static struct dw_apb_clocksource *clocksource_apbt; - -static inline void __iomem *adev_virt_addr(struct apbt_dev *adev) -{ - return apbt_virt_address + adev->num * APBTMRS_REG_SIZE; -} - -static DEFINE_PER_CPU(struct apbt_dev, cpu_apbt_dev); - -#ifdef CONFIG_SMP -static unsigned int apbt_num_timers_used; -#endif - -static inline void apbt_set_mapping(void) -{ - struct sfi_timer_table_entry *mtmr; - int phy_cs_timer_id = 0; - - if (apbt_virt_address) { - pr_debug("APBT base already mapped\n"); - return; - } - mtmr = sfi_get_mtmr(APBT_CLOCKEVENT0_NUM); - if (mtmr == NULL) { - printk(KERN_ERR "Failed to get MTMR %d from SFI\n", - APBT_CLOCKEVENT0_NUM); - return; - } - apbt_address = (phys_addr_t)mtmr->phys_addr; - if (!apbt_address) { - printk(KERN_WARNING "No timer base from SFI, use default\n"); - apbt_address = APBT_DEFAULT_BASE; - } - apbt_virt_address = ioremap(apbt_address, APBT_MMAP_SIZE); - if (!apbt_virt_address) { - pr_debug("Failed mapping APBT phy address at %lu\n",\ - (unsigned long)apbt_address); - goto panic_noapbt; - } - apbt_freq = mtmr->freq_hz; - sfi_free_mtmr(mtmr); - - /* Now figure out the physical timer id for 
clocksource device */ - mtmr = sfi_get_mtmr(APBT_CLOCKSOURCE_NUM); - if (mtmr == NULL) - goto panic_noapbt; - - /* Now figure out the physical timer id */ - pr_debug("Use timer %d for clocksource\n", - (int)(mtmr->phys_addr & 0xff) / APBTMRS_REG_SIZE); - phy_cs_timer_id = (unsigned int)(mtmr->phys_addr & 0xff) / - APBTMRS_REG_SIZE; - - clocksource_apbt = dw_apb_clocksource_init(APBT_CLOCKSOURCE_RATING, - "apbt0", apbt_virt_address + phy_cs_timer_id * - APBTMRS_REG_SIZE, apbt_freq); - return; - -panic_noapbt: - panic("Failed to setup APB system timer\n"); - -} - -static inline void apbt_clear_mapping(void) -{ - iounmap(apbt_virt_address); - apbt_virt_address = NULL; -} - -static int __init apbt_clockevent_register(void) -{ - struct sfi_timer_table_entry *mtmr; - struct apbt_dev *adev = this_cpu_ptr(&cpu_apbt_dev); - - mtmr = sfi_get_mtmr(APBT_CLOCKEVENT0_NUM); - if (mtmr == NULL) { - printk(KERN_ERR "Failed to get MTMR %d from SFI\n", - APBT_CLOCKEVENT0_NUM); - return -ENODEV; - } - - adev->num = smp_processor_id(); - adev->timer = dw_apb_clockevent_init(smp_processor_id(), "apbt0", - intel_mid_timer_options == INTEL_MID_TIMER_LAPIC_APBT ? - APBT_CLOCKEVENT_RATING - 100 : APBT_CLOCKEVENT_RATING, - adev_virt_addr(adev), 0, apbt_freq); - /* Firmware does EOI handling for us. 
*/ - adev->timer->eoi = NULL; - - if (intel_mid_timer_options == INTEL_MID_TIMER_LAPIC_APBT) { - global_clock_event = &adev->timer->ced; - printk(KERN_DEBUG "%s clockevent registered as global\n", - global_clock_event->name); - } - - dw_apb_clockevent_register(adev->timer); - - sfi_free_mtmr(mtmr); - return 0; -} - -#ifdef CONFIG_SMP - -static void apbt_setup_irq(struct apbt_dev *adev) -{ - irq_modify_status(adev->irq, 0, IRQ_MOVE_PCNTXT); - irq_set_affinity(adev->irq, cpumask_of(adev->cpu)); -} - -/* Should be called with per cpu */ -void apbt_setup_secondary_clock(void) -{ - struct apbt_dev *adev; - int cpu; - - /* Don't register boot CPU clockevent */ - cpu = smp_processor_id(); - if (!cpu) - return; - - adev = this_cpu_ptr(&cpu_apbt_dev); - if (!adev->timer) { - adev->timer = dw_apb_clockevent_init(cpu, adev->name, - APBT_CLOCKEVENT_RATING, adev_virt_addr(adev), - adev->irq, apbt_freq); - adev->timer->eoi = NULL; - } else { - dw_apb_clockevent_resume(adev->timer); - } - - printk(KERN_INFO "Registering CPU %d clockevent device %s, cpu %08x\n", - cpu, adev->name, adev->cpu); - - apbt_setup_irq(adev); - dw_apb_clockevent_register(adev->timer); - - return; -} - -/* - * this notify handler process CPU hotplug events. in case of S0i3, nonboot - * cpus are disabled/enabled frequently, for performance reasons, we keep the - * per cpu timer irq registered so that we do need to do free_irq/request_irq. - * - * TODO: it might be more reliable to directly disable percpu clockevent device - * without the notifier chain. currently, cpu 0 may get interrupts from other - * cpu timers during the offline process due to the ordering of notification. - * the extra interrupt is harmless. 
- */ -static int apbt_cpu_dead(unsigned int cpu) -{ - struct apbt_dev *adev = &per_cpu(cpu_apbt_dev, cpu); - - dw_apb_clockevent_pause(adev->timer); - if (system_state == SYSTEM_RUNNING) { - pr_debug("skipping APBT CPU %u offline\n", cpu); - } else { - pr_debug("APBT clockevent for cpu %u offline\n", cpu); - dw_apb_clockevent_stop(adev->timer); - } - return 0; -} - -static __init int apbt_late_init(void) -{ - if (intel_mid_timer_options == INTEL_MID_TIMER_LAPIC_APBT || - !apb_timer_block_enabled) - return 0; - return cpuhp_setup_state(CPUHP_X86_APB_DEAD, "x86/apb:dead", NULL, - apbt_cpu_dead); -} -fs_initcall(apbt_late_init); -#else - -void apbt_setup_secondary_clock(void) {} - -#endif /* CONFIG_SMP */ - -static int apbt_clocksource_register(void) -{ - u64 start, now; - u64 t1; - - /* Start the counter, use timer 2 as source, timer 0/1 for event */ - dw_apb_clocksource_start(clocksource_apbt); - - /* Verify whether apbt counter works */ - t1 = dw_apb_clocksource_read(clocksource_apbt); - start = rdtsc(); - - /* - * We don't know the TSC frequency yet, but waiting for - * 200000 TSC cycles is safe: - * 4 GHz == 50us - * 1 GHz == 200us - */ - do { - rep_nop(); - now = rdtsc(); - } while ((now - start) < 200000UL); - - /* APBT is the only always on clocksource, it has to work! */ - if (t1 == dw_apb_clocksource_read(clocksource_apbt)) - panic("APBT counter not counting. APBT disabled\n"); - - dw_apb_clocksource_register(clocksource_apbt); - - return 0; -} - -/* - * Early setup the APBT timer, only use timer 0 for booting then switch to - * per CPU timer if possible. - * returns 1 if per cpu apbt is setup - * returns 0 if no per cpu apbt is chosen - * panic if set up failed, this is the only platform timer on Moorestown. 
- */ -void __init apbt_time_init(void) -{ -#ifdef CONFIG_SMP - int i; - struct sfi_timer_table_entry *p_mtmr; - struct apbt_dev *adev; -#endif - - if (apb_timer_block_enabled) - return; - apbt_set_mapping(); - if (!apbt_virt_address) - goto out_noapbt; - /* - * Read the frequency and check for a sane value, for ESL model - * we extend the possible clock range to allow time scaling. - */ - - if (apbt_freq < APBT_MIN_FREQ || apbt_freq > APBT_MAX_FREQ) { - pr_debug("APBT has invalid freq 0x%lx\n", apbt_freq); - goto out_noapbt; - } - if (apbt_clocksource_register()) { - pr_debug("APBT has failed to register clocksource\n"); - goto out_noapbt; - } - if (!apbt_clockevent_register()) - apb_timer_block_enabled = 1; - else { - pr_debug("APBT has failed to register clockevent\n"); - goto out_noapbt; - } -#ifdef CONFIG_SMP - /* kernel cmdline disable apb timer, so we will use lapic timers */ - if (intel_mid_timer_options == INTEL_MID_TIMER_LAPIC_APBT) { - printk(KERN_INFO "apbt: disabled per cpu timer\n"); - return; - } - pr_debug("%s: %d CPUs online\n", __func__, num_online_cpus()); - if (num_possible_cpus() <= sfi_mtimer_num) - apbt_num_timers_used = num_possible_cpus(); - else - apbt_num_timers_used = 1; - pr_debug("%s: %d APB timers used\n", __func__, apbt_num_timers_used); - - /* here we set up per CPU timer data structure */ - for (i = 0; i < apbt_num_timers_used; i++) { - adev = &per_cpu(cpu_apbt_dev, i); - adev->num = i; - adev->cpu = i; - p_mtmr = sfi_get_mtmr(i); - if (p_mtmr) - adev->irq = p_mtmr->irq; - else - printk(KERN_ERR "Failed to get timer for cpu %d\n", i); - snprintf(adev->name, sizeof(adev->name) - 1, "apbt%d", i); - } -#endif - - return; - -out_noapbt: - apbt_clear_mapping(); - apb_timer_block_enabled = 0; - panic("failed to enable APB timer\n"); -} - -/* called before apb_timer_enable, use early map */ -unsigned long apbt_quick_calibrate(void) -{ - int i, scale; - u64 old, new; - u64 t1, t2; - unsigned long khz = 0; - u32 loop, shift; - - 
apbt_set_mapping(); - dw_apb_clocksource_start(clocksource_apbt); - - /* check if the timer can count down, otherwise return */ - old = dw_apb_clocksource_read(clocksource_apbt); - i = 10000; - while (--i) { - if (old != dw_apb_clocksource_read(clocksource_apbt)) - break; - } - if (!i) - goto failed; - - /* count 16 ms */ - loop = (apbt_freq / 1000) << 4; - - /* restart the timer to ensure it won't get to 0 in the calibration */ - dw_apb_clocksource_start(clocksource_apbt); - - old = dw_apb_clocksource_read(clocksource_apbt); - old += loop; - - t1 = rdtsc(); - - do { - new = dw_apb_clocksource_read(clocksource_apbt); - } while (new < old); - - t2 = rdtsc(); - - shift = 5; - if (unlikely(loop >> shift == 0)) { - printk(KERN_INFO - "APBT TSC calibration failed, not enough resolution\n"); - return 0; - } - scale = (int)div_u64((t2 - t1), loop >> shift); - khz = (scale * (apbt_freq / 1000)) >> shift; - printk(KERN_INFO "TSC freq calculated by APB timer is %lu khz\n", khz); - return khz; -failed: - return 0; -} diff --git a/arch/x86/kernel/aperture_64.c b/arch/x86/kernel/aperture_64.c index 294ed4392a0e..4feaa670d578 100644 --- a/arch/x86/kernel/aperture_64.c +++ b/arch/x86/kernel/aperture_64.c @@ -36,7 +36,7 @@ /* * Using 512M as goal, in case kexec will load kernel_big * that will do the on-position decompress, and could overlap with - * with the gart aperture that is used. + * the gart aperture that is used. 
* Sequence: * kernel_small * ==> kexec (with kdump trigger path or gart still enabled) @@ -73,12 +73,23 @@ static int gart_mem_pfn_is_ram(unsigned long pfn) (pfn >= aperture_pfn_start + aperture_page_count)); } +#ifdef CONFIG_PROC_VMCORE +static bool gart_oldmem_pfn_is_ram(struct vmcore_cb *cb, unsigned long pfn) +{ + return !!gart_mem_pfn_is_ram(pfn); +} + +static struct vmcore_cb gart_vmcore_cb = { + .pfn_is_ram = gart_oldmem_pfn_is_ram, +}; +#endif + static void __init exclude_from_core(u64 aper_base, u32 aper_order) { aperture_pfn_start = aper_base >> PAGE_SHIFT; aperture_page_count = (32 * 1024 * 1024) << aper_order >> PAGE_SHIFT; #ifdef CONFIG_PROC_VMCORE - WARN_ON(register_oldmem_pfn_is_ram(&gart_mem_pfn_is_ram)); + register_vmcore_cb(&gart_vmcore_cb); #endif #ifdef CONFIG_PROC_KCORE WARN_ON(register_mem_pfn_is_ram(&gart_mem_pfn_is_ram)); @@ -109,14 +120,13 @@ static u32 __init allocate_aperture(void) * memory. Unfortunately we cannot move it up because that would * make the IOMMU useless. 
*/ - addr = memblock_find_in_range(GART_MIN_ADDR, GART_MAX_ADDR, - aper_size, aper_size); + addr = memblock_phys_alloc_range(aper_size, aper_size, + GART_MIN_ADDR, GART_MAX_ADDR); if (!addr) { pr_err("Cannot allocate aperture memory hole [mem %#010lx-%#010lx] (%uKB)\n", addr, addr + aper_size - 1, aper_size >> 10); return 0; } - memblock_reserve(addr, aper_size); pr_info("Mapping aperture over RAM [mem %#010lx-%#010lx] (%uKB)\n", addr, addr + aper_size - 1, aper_size >> 10); register_nosave_region(addr >> PAGE_SHIFT, @@ -382,7 +392,7 @@ void __init early_gart_iommu_check(void) static int __initdata printed_gart_size_msg; -int __init gart_iommu_hole_init(void) +void __init gart_iommu_hole_init(void) { u32 agp_aper_base = 0, agp_aper_order = 0; u32 aper_size, aper_alloc = 0, aper_order = 0, last_aper_order = 0; @@ -391,11 +401,11 @@ int __init gart_iommu_hole_init(void) int i, node; if (!amd_gart_present()) - return -ENODEV; + return; if (gart_iommu_aperture_disabled || !fix_aperture || !early_pci_allowed()) - return -ENODEV; + return; pr_info("Checking aperture...\n"); @@ -481,10 +491,8 @@ out: * and fixed up the northbridge */ exclude_from_core(last_aper_base, last_aper_order); - - return 1; } - return 0; + return; } if (!fallback_aper_force) { @@ -517,7 +525,7 @@ out: panic("Not enough memory for aperture"); } } else { - return 0; + return; } /* @@ -551,6 +559,4 @@ out: } set_up_gart_resume(aper_order, aper_alloc); - - return 1; } diff --git a/arch/x86/kernel/apic/apic.c b/arch/x86/kernel/apic/apic.c index 5f973fed3c9f..c6876d3ea4b1 100644 --- a/arch/x86/kernel/apic/apic.c +++ b/arch/x86/kernel/apic/apic.c @@ -38,15 +38,17 @@ #include <asm/trace/irq_vectors.h> #include <asm/irq_remapping.h> +#include <asm/pc-conf-reg.h> #include <asm/perf_event.h> #include <asm/x86_init.h> -#include <asm/pgalloc.h> #include <linux/atomic.h> +#include <asm/barrier.h> #include <asm/mpspec.h> #include <asm/i8259.h> #include <asm/proto.h> #include <asm/traps.h> #include <asm/apic.h> 
+#include <asm/acpi.h> #include <asm/io_apic.h> #include <asm/desc.h> #include <asm/hpet.h> @@ -59,6 +61,7 @@ #include <asm/cpu_device_id.h> #include <asm/intel-family.h> #include <asm/irq_regs.h> +#include <asm/cpu.h> unsigned int num_processors; @@ -94,6 +97,11 @@ static unsigned int disabled_cpu_apicid __ro_after_init = BAD_APICID; static int apic_extnmi __ro_after_init = APIC_EXTNMI_BSP; /* + * Hypervisor supports 15 bits of APIC ID in MSI Extended Destination ID + */ +static bool virt_ext_dest_id __ro_after_init; + +/* * Map cpu index to physical APIC ID */ DEFINE_EARLY_PER_CPU_READ_MOSTLY(u16, x86_cpu_to_apicid, BAD_APICID); @@ -126,18 +134,14 @@ static int enabled_via_apicbase __ro_after_init; */ static inline void imcr_pic_to_apic(void) { - /* select IMCR register */ - outb(0x70, 0x22); /* NMI and 8259 INTR go through APIC */ - outb(0x01, 0x23); + pc_conf_set(PC_CONF_MPS_IMCR, 0x01); } static inline void imcr_apic_to_pic(void) { - /* select IMCR register */ - outb(0x70, 0x22); /* NMI and 8259 INTR go directly to BSP */ - outb(0x00, 0x23); + pc_conf_set(PC_CONF_MPS_IMCR, 0x00); } #endif @@ -167,7 +171,7 @@ static __init int setup_apicpmtimer(char *s) { apic_calibrate_pmtmr = 1; notsc_setup(NULL); - return 0; + return 1; } __setup("apicpmtimer", setup_apicpmtimer); #endif @@ -272,7 +276,7 @@ void native_apic_icr_write(u32 low, u32 id) unsigned long flags; local_irq_save(flags); - apic_write(APIC_ICR2, SET_APIC_DEST_FIELD(id)); + apic_write(APIC_ICR2, SET_XAPIC_DEST_FIELD(id)); apic_write(APIC_ICR, low); local_irq_restore(flags); } @@ -317,6 +321,9 @@ int lapic_get_maxlvt(void) #define APIC_DIVISOR 16 #define TSC_DIVISOR 8 +/* i82489DX specific */ +#define I82489DX_BASE_DIVIDER (((0x2) << 18)) + /* * This function sets up the local APIC timer, with a timeout of * 'clocks' APIC bus clock. 
During calibration we actually call @@ -337,8 +344,14 @@ static void __setup_APIC_LVTT(unsigned int clocks, int oneshot, int irqen) else if (boot_cpu_has(X86_FEATURE_TSC_DEADLINE_TIMER)) lvtt_value |= APIC_LVT_TIMER_TSCDEADLINE; + /* + * The i82489DX APIC uses bit 18 and 19 for the base divider. This + * overlaps with bit 18 on integrated APICs, but is not documented + * in the SDM. No problem though. i82489DX equipped systems do not + * have TSC deadline timer. + */ if (!lapic_is_integrated()) - lvtt_value |= SET_APIC_TIMER_BASE(APIC_TIMER_BASE_DIV); + lvtt_value |= I82489DX_BASE_DIVIDER; if (!irqen) lvtt_value |= APIC_LVT_MASKED; @@ -352,8 +365,6 @@ static void __setup_APIC_LVTT(unsigned int clocks, int oneshot, int irqen) * According to Intel, MFENCE can do the serialization here. */ asm volatile("mfence" : : : "memory"); - - printk_once(KERN_DEBUG "TSC deadline timer enabled\n"); return; } @@ -474,6 +485,9 @@ static int lapic_next_deadline(unsigned long delta, { u64 tsc; + /* This MSR is special and need a special fence: */ + weak_wrmsr_fence(); + tsc = rdtsc(); wrmsrl(MSR_IA32_TSC_DEADLINE, tsc + (((u64) delta) * TSC_DIVISOR)); return 0; @@ -546,97 +560,60 @@ static struct clock_event_device lapic_clockevent = { }; static DEFINE_PER_CPU(struct clock_event_device, lapic_events); -#define DEADLINE_MODEL_MATCH_FUNC(model, func) \ - { X86_VENDOR_INTEL, 6, model, X86_FEATURE_ANY, (unsigned long)&func } - -#define DEADLINE_MODEL_MATCH_REV(model, rev) \ - { X86_VENDOR_INTEL, 6, model, X86_FEATURE_ANY, (unsigned long)rev } +static const struct x86_cpu_id deadline_match[] __initconst = { + X86_MATCH_INTEL_FAM6_MODEL_STEPPINGS(HASWELL_X, X86_STEPPINGS(0x2, 0x2), 0x3a), /* EP */ + X86_MATCH_INTEL_FAM6_MODEL_STEPPINGS(HASWELL_X, X86_STEPPINGS(0x4, 0x4), 0x0f), /* EX */ -static u32 hsx_deadline_rev(void) -{ - switch (boot_cpu_data.x86_stepping) { - case 0x02: return 0x3a; /* EP */ - case 0x04: return 0x0f; /* EX */ - } + X86_MATCH_INTEL_FAM6_MODEL( BROADWELL_X, 
0x0b000020), - return ~0U; -} - -static u32 bdx_deadline_rev(void) -{ - switch (boot_cpu_data.x86_stepping) { - case 0x02: return 0x00000011; - case 0x03: return 0x0700000e; - case 0x04: return 0x0f00000c; - case 0x05: return 0x0e000003; - } - - return ~0U; -} - -static u32 skx_deadline_rev(void) -{ - switch (boot_cpu_data.x86_stepping) { - case 0x03: return 0x01000136; - case 0x04: return 0x02000014; - } - - if (boot_cpu_data.x86_stepping > 4) - return 0; - - return ~0U; -} + X86_MATCH_INTEL_FAM6_MODEL_STEPPINGS(BROADWELL_D, X86_STEPPINGS(0x2, 0x2), 0x00000011), + X86_MATCH_INTEL_FAM6_MODEL_STEPPINGS(BROADWELL_D, X86_STEPPINGS(0x3, 0x3), 0x0700000e), + X86_MATCH_INTEL_FAM6_MODEL_STEPPINGS(BROADWELL_D, X86_STEPPINGS(0x4, 0x4), 0x0f00000c), + X86_MATCH_INTEL_FAM6_MODEL_STEPPINGS(BROADWELL_D, X86_STEPPINGS(0x5, 0x5), 0x0e000003), -static const struct x86_cpu_id deadline_match[] = { - DEADLINE_MODEL_MATCH_FUNC( INTEL_FAM6_HASWELL_X, hsx_deadline_rev), - DEADLINE_MODEL_MATCH_REV ( INTEL_FAM6_BROADWELL_X, 0x0b000020), - DEADLINE_MODEL_MATCH_FUNC( INTEL_FAM6_BROADWELL_D, bdx_deadline_rev), - DEADLINE_MODEL_MATCH_FUNC( INTEL_FAM6_SKYLAKE_X, skx_deadline_rev), + X86_MATCH_INTEL_FAM6_MODEL_STEPPINGS(SKYLAKE_X, X86_STEPPINGS(0x3, 0x3), 0x01000136), + X86_MATCH_INTEL_FAM6_MODEL_STEPPINGS(SKYLAKE_X, X86_STEPPINGS(0x4, 0x4), 0x02000014), + X86_MATCH_INTEL_FAM6_MODEL_STEPPINGS(SKYLAKE_X, X86_STEPPINGS(0x5, 0xf), 0), - DEADLINE_MODEL_MATCH_REV ( INTEL_FAM6_HASWELL, 0x22), - DEADLINE_MODEL_MATCH_REV ( INTEL_FAM6_HASWELL_L, 0x20), - DEADLINE_MODEL_MATCH_REV ( INTEL_FAM6_HASWELL_G, 0x17), + X86_MATCH_INTEL_FAM6_MODEL( HASWELL, 0x22), + X86_MATCH_INTEL_FAM6_MODEL( HASWELL_L, 0x20), + X86_MATCH_INTEL_FAM6_MODEL( HASWELL_G, 0x17), - DEADLINE_MODEL_MATCH_REV ( INTEL_FAM6_BROADWELL, 0x25), - DEADLINE_MODEL_MATCH_REV ( INTEL_FAM6_BROADWELL_G, 0x17), + X86_MATCH_INTEL_FAM6_MODEL( BROADWELL, 0x25), + X86_MATCH_INTEL_FAM6_MODEL( BROADWELL_G, 0x17), - DEADLINE_MODEL_MATCH_REV ( 
INTEL_FAM6_SKYLAKE_L, 0xb2), - DEADLINE_MODEL_MATCH_REV ( INTEL_FAM6_SKYLAKE, 0xb2), + X86_MATCH_INTEL_FAM6_MODEL( SKYLAKE_L, 0xb2), + X86_MATCH_INTEL_FAM6_MODEL( SKYLAKE, 0xb2), - DEADLINE_MODEL_MATCH_REV ( INTEL_FAM6_KABYLAKE_L, 0x52), - DEADLINE_MODEL_MATCH_REV ( INTEL_FAM6_KABYLAKE, 0x52), + X86_MATCH_INTEL_FAM6_MODEL( KABYLAKE_L, 0x52), + X86_MATCH_INTEL_FAM6_MODEL( KABYLAKE, 0x52), {}, }; -static void apic_check_deadline_errata(void) +static __init bool apic_validate_deadline_timer(void) { const struct x86_cpu_id *m; u32 rev; - if (!boot_cpu_has(X86_FEATURE_TSC_DEADLINE_TIMER) || - boot_cpu_has(X86_FEATURE_HYPERVISOR)) - return; + if (!boot_cpu_has(X86_FEATURE_TSC_DEADLINE_TIMER)) + return false; + if (boot_cpu_has(X86_FEATURE_HYPERVISOR)) + return true; m = x86_match_cpu(deadline_match); if (!m) - return; + return true; - /* - * Function pointers will have the MSB set due to address layout, - * immediate revisions will not. - */ - if ((long)m->driver_data < 0) - rev = ((u32 (*)(void))(m->driver_data))(); - else - rev = (u32)m->driver_data; + rev = (u32)m->driver_data; if (boot_cpu_data.microcode >= rev) - return; + return true; setup_clear_cpu_cap(X86_FEATURE_TSC_DEADLINE_TIMER); pr_err(FW_BUG "TSC_DEADLINE disabled due to Errata; " "please update microcode to version: 0x%x (or later)\n", rev); + return false; } /* @@ -649,7 +626,7 @@ static void setup_APIC_timer(void) if (this_cpu_has(X86_FEATURE_ARAT)) { lapic_clockevent.features &= ~CLOCK_EVT_FEAT_C3STOP; - /* Make LAPIC timer preferrable over percpu HPET */ + /* Make LAPIC timer preferable over percpu HPET */ lapic_clockevent.rating = 150; } @@ -696,7 +673,7 @@ void lapic_update_tsc_freq(void) * In this functions we calibrate APIC bus clocks to the external timer. * * We want to do the calibration only once since we want to have local timer - * irqs syncron. CPUs connected by the same APIC bus have the very same bus + * irqs synchronous. 
CPUs connected by the same APIC bus have the very same bus * frequency. * * This was previously done by reading the PIT/HPET and waiting for a wrap @@ -1127,32 +1104,18 @@ static void local_apic_timer_interrupt(void) * [ if a single-CPU system runs an SMP kernel then we call the local * interrupt as well. Thus we cannot inline the local irq ... ] */ -__visible void __irq_entry smp_apic_timer_interrupt(struct pt_regs *regs) +DEFINE_IDTENTRY_SYSVEC(sysvec_apic_timer_interrupt) { struct pt_regs *old_regs = set_irq_regs(regs); - /* - * NOTE! We'd better ACK the irq immediately, - * because timer handling can be slow. - * - * update_process_times() expects us to have done irq_enter(). - * Besides, if we don't timer interrupts ignore the global - * interrupt lock, which is the WrongThing (tm) to do. - */ - entering_ack_irq(); + ack_APIC_irq(); trace_local_timer_entry(LOCAL_TIMER_VECTOR); local_apic_timer_interrupt(); trace_local_timer_exit(LOCAL_TIMER_VECTOR); - exiting_irq(); set_irq_regs(old_regs); } -int setup_profiling_timer(unsigned int multiplier) -{ - return -EINVAL; -} - /* * Local APIC start and shutdown */ @@ -1461,22 +1424,24 @@ void __init apic_intr_mode_init(void) return; case APIC_VIRTUAL_WIRE: pr_info("APIC: Switch to virtual wire mode setup\n"); - default_setup_apic_routing(); break; case APIC_VIRTUAL_WIRE_NO_CONFIG: pr_info("APIC: Switch to virtual wire mode setup with no configuration\n"); upmode = true; - default_setup_apic_routing(); break; case APIC_SYMMETRIC_IO: pr_info("APIC: Switch to symmetric I/O mode setup\n"); - default_setup_apic_routing(); break; case APIC_SYMMETRIC_IO_NO_ROUTING: pr_info("APIC: Switch to symmetric I/O mode setup in no SMP routine\n"); break; } + default_setup_apic_routing(); + + if (x86_platform.apic_post_init) + x86_platform.apic_post_init(); + apic_bsp_setup(upmode); } @@ -1568,7 +1533,7 @@ static bool apic_check_and_ack(union apic_ir *irr, union apic_ir *isr) * Most probably by now the CPU has serviced that pending 
interrupt and it * might not have done the ack_APIC_irq() because it thought, interrupt * came from i8259 as ExtInt. LAPIC did not get EOI so it does not clear - * the ISR bit and cpu thinks it has already serivced the interrupt. Hence + * the ISR bit and cpu thinks it has already serviced the interrupt. Hence * a vector might get locked. It was noticed for timer irq (vector * 0x31). Issue an extra EOI to clear ISR. * @@ -1636,7 +1601,7 @@ static void setup_local_APIC(void) apic->init_apic_ldr(); #ifdef CONFIG_X86_32 - if (apic->dest_logical) { + if (apic->dest_mode_logical) { int logical_apicid, ldr_apicid; /* @@ -1693,7 +1658,7 @@ static void setup_local_APIC(void) */ /* * Actually disabling the focus CPU check just makes the hang less - * frequent as it makes the interrupt distributon model be more + * frequent as it makes the interrupt distribution model be more * like LRU than MRU (the short-term load is more even across CPUs). */ @@ -1783,14 +1748,30 @@ void apic_ap_setup(void) #ifdef CONFIG_X86_X2APIC int x2apic_mode; +EXPORT_SYMBOL_GPL(x2apic_mode); enum { X2APIC_OFF, - X2APIC_ON, X2APIC_DISABLED, + /* All states below here have X2APIC enabled */ + X2APIC_ON, + X2APIC_ON_LOCKED }; static int x2apic_state; +static bool x2apic_hw_locked(void) +{ + u64 ia32_cap; + u64 msr; + + ia32_cap = x86_read_arch_cap_msr(); + if (ia32_cap & ARCH_CAP_XAPIC_DISABLE) { + rdmsrl(MSR_IA32_XAPIC_DISABLE_STATUS, msr); + return (msr & LEGACY_XAPIC_DISABLED); + } + return false; +} + static void __x2apic_disable(void) { u64 msr; @@ -1828,6 +1809,10 @@ static int __init setup_nox2apic(char *str) apicid); return 0; } + if (x2apic_hw_locked()) { + pr_warn("APIC locked in x2apic mode, can't disable\n"); + return 0; + } pr_warn("x2apic already enabled.\n"); __x2apic_disable(); } @@ -1842,10 +1827,18 @@ early_param("nox2apic", setup_nox2apic); void x2apic_setup(void) { /* - * If x2apic is not in ON state, disable it if already enabled + * Try to make the AP's APIC state match that of 
the BSP, but if the + * BSP is unlocked and the AP is locked then there is a state mismatch. + * Warn about the mismatch in case a GP fault occurs due to a locked AP + * trying to be turned off. + */ + if (x2apic_state != X2APIC_ON_LOCKED && x2apic_hw_locked()) + pr_warn("x2apic lock mismatch between BSP and AP.\n"); + /* + * If x2apic is not in ON or LOCKED state, disable it if already enabled * from BIOS. */ - if (x2apic_state != X2APIC_ON) { + if (x2apic_state < X2APIC_ON) { __x2apic_disable(); return; } @@ -1866,6 +1859,11 @@ static __init void x2apic_disable(void) if (x2apic_id >= 255) panic("Cannot disable x2apic, id: %08x\n", x2apic_id); + if (x2apic_hw_locked()) { + pr_warn("Cannot disable locked x2apic, id: %08x\n", x2apic_id); + return; + } + __x2apic_disable(); register_lapic_address(mp_lapic_addr); } @@ -1886,20 +1884,34 @@ static __init void try_to_enable_x2apic(int remap_mode) return; if (remap_mode != IRQ_REMAP_X2APIC_MODE) { - /* IR is required if there is APIC ID > 255 even when running - * under KVM + u32 apic_limit = 255; + + /* + * Using X2APIC without IR is not architecturally supported + * on bare metal but may be supported in guests. */ - if (max_physical_apicid > 255 || - !x86_init.hyper.x2apic_available()) { + if (!x86_init.hyper.x2apic_available()) { pr_info("x2apic: IRQ remapping doesn't support X2APIC mode\n"); x2apic_disable(); return; } /* - * without IR all CPUs can be addressed by IOAPIC/MSI - * only in physical mode + * If the hypervisor supports extended destination ID in + * MSI, that increases the maximum APIC ID that can be + * used for non-remapped IRQ domains. */ + if (x86_init.hyper.msi_ext_dest_id()) { + virt_ext_dest_id = 1; + apic_limit = 32767; + } + + /* + * Without IR, all CPUs can be addressed by IOAPIC/MSI only + * in physical mode, and CPUs with an APIC ID that cannot + * be addressed must not be brought online. 
+ */ + x2apic_set_max_apicid(apic_limit); x2apic_phys = 1; } x2apic_enable(); @@ -1910,7 +1922,10 @@ void __init check_x2apic(void) if (x2apic_enabled()) { pr_info("x2apic: enabled by BIOS, switching to x2apic ops\n"); x2apic_mode = 1; - x2apic_state = X2APIC_ON; + if (x2apic_hw_locked()) + x2apic_state = X2APIC_ON_LOCKED; + else + x2apic_state = X2APIC_ON; } else if (!boot_cpu_has(X86_FEATURE_X2APIC)) { x2apic_state = X2APIC_DISABLED; } @@ -2098,7 +2113,8 @@ void __init init_apic_mappings(void) { unsigned int new_apicid; - apic_check_deadline_errata(); + if (apic_validate_deadline_timer()) + pr_info("TSC deadline timer available\n"); if (x2apic_mode) { boot_cpu_physical_apicid = read_apic_id(); @@ -2159,14 +2175,13 @@ void __init register_lapic_address(unsigned long address) */ /* - * This interrupt should _never_ happen with our APIC/SMP architecture + * Common handling code for spurious_interrupt and spurious_vector entry + * points below. No point in allowing the compiler to inline it twice. */ -__visible void __irq_entry smp_spurious_interrupt(struct pt_regs *regs) +static noinline void handle_spurious_interrupt(u8 vector) { - u8 vector = ~regs->orig_ax; u32 v; - entering_irq(); trace_spurious_apic_entry(vector); inc_irq_stat(irq_spurious_count); @@ -2196,13 +2211,31 @@ __visible void __irq_entry smp_spurious_interrupt(struct pt_regs *regs) } out: trace_spurious_apic_exit(vector); - exiting_irq(); +} + +/** + * spurious_interrupt - Catch all for interrupts raised on unused vectors + * @regs: Pointer to pt_regs on stack + * @vector: The vector number + * + * This is invoked from ASM entry code to catch all interrupts which + * trigger on an entry which is routed to the common_spurious idtentry + * point. 
+ */ +DEFINE_IDTENTRY_IRQ(spurious_interrupt) +{ + handle_spurious_interrupt(vector); +} + +DEFINE_IDTENTRY_SYSVEC(sysvec_spurious_apic_interrupt) +{ + handle_spurious_interrupt(SPURIOUS_APIC_VECTOR); } /* * This interrupt should never happen with our APIC/SMP architecture */ -__visible void __irq_entry smp_error_interrupt(struct pt_regs *regs) +DEFINE_IDTENTRY_SYSVEC(sysvec_error_interrupt) { static const char * const error_interrupt_reason[] = { "Send CS error", /* APIC Error Bit 0 */ @@ -2216,7 +2249,6 @@ __visible void __irq_entry smp_error_interrupt(struct pt_regs *regs) }; u32 v, i = 0; - entering_irq(); trace_error_apic_entry(ERROR_APIC_VECTOR); /* First tickle the hardware, only then report what went on. -- REW */ @@ -2240,7 +2272,6 @@ __visible void __irq_entry smp_error_interrupt(struct pt_regs *regs) apic_printk(APIC_DEBUG, KERN_CONT "\n"); trace_error_apic_exit(ERROR_APIC_VECTOR); - exiting_irq(); } /** @@ -2347,6 +2378,11 @@ static int cpuid_to_apicid[] = { [0 ... NR_CPUS - 1] = -1, }; +bool arch_match_cpu_phys_id(int cpu, u64 phys_id) +{ + return phys_id == cpuid_to_apicid[cpu]; +} + #ifdef CONFIG_SMP /** * apic_id_is_primary_thread - Check whether APIC ID belongs to a primary thread @@ -2514,6 +2550,56 @@ int hard_smp_processor_id(void) return read_apic_id(); } +void __irq_msi_compose_msg(struct irq_cfg *cfg, struct msi_msg *msg, + bool dmar) +{ + memset(msg, 0, sizeof(*msg)); + + msg->arch_addr_lo.base_address = X86_MSI_BASE_ADDRESS_LOW; + msg->arch_addr_lo.dest_mode_logical = apic->dest_mode_logical; + msg->arch_addr_lo.destid_0_7 = cfg->dest_apicid & 0xFF; + + msg->arch_data.delivery_mode = APIC_DELIVERY_MODE_FIXED; + msg->arch_data.vector = cfg->vector; + + msg->address_hi = X86_MSI_BASE_ADDRESS_HIGH; + /* + * Only the IOMMU itself can use the trick of putting destination + * APIC ID into the high bits of the address. 
Anything else would + * just be writing to memory if it tried that, and needs IR to + * address APICs which can't be addressed in the normal 32-bit + * address range at 0xFFExxxxx. That is typically just 8 bits, but + * some hypervisors allow the extended destination ID field in bits + * 5-11 to be used, giving support for 15 bits of APIC IDs in total. + */ + if (dmar) + msg->arch_addr_hi.destid_8_31 = cfg->dest_apicid >> 8; + else if (virt_ext_dest_id && cfg->dest_apicid < 0x8000) + msg->arch_addr_lo.virt_destid_8_14 = cfg->dest_apicid >> 8; + else + WARN_ON_ONCE(cfg->dest_apicid > 0xFF); +} + +u32 x86_msi_msg_get_destid(struct msi_msg *msg, bool extid) +{ + u32 dest = msg->arch_addr_lo.destid_0_7; + + if (extid) + dest |= msg->arch_addr_hi.destid_8_31 << 8; + return dest; +} +EXPORT_SYMBOL_GPL(x86_msi_msg_get_destid); + +#ifdef CONFIG_X86_64 +void __init acpi_wake_cpu_handler_update(wakeup_cpu_handler handler) +{ + struct apic **drv; + + for (drv = __apicdrivers; drv < __apicdrivers_end; drv++) + (*drv)->wakeup_secondary_cpu_64 = handler; +} +#endif + /* * Override the generic EOI implementation with an optimized version. 
* Only called during early boot when only one CPU is active and with @@ -2564,6 +2650,7 @@ static void __init apic_bsp_setup(bool upmode) end_local_APIC_setup(); irq_remap_enable_fault_handling(); setup_IO_APIC(); + lapic_update_legacy_vectors(); } #ifdef CONFIG_UP_LATE_INIT diff --git a/arch/x86/kernel/apic/apic_flat_64.c b/arch/x86/kernel/apic/apic_flat_64.c index 7862b152a052..8f72b4351c9f 100644 --- a/arch/x86/kernel/apic/apic_flat_64.c +++ b/arch/x86/kernel/apic/apic_flat_64.c @@ -53,7 +53,7 @@ static void _flat_send_IPI_mask(unsigned long mask, int vector) unsigned long flags; local_irq_save(flags); - __default_send_IPI_dest_field(mask, vector, apic->dest_logical); + __default_send_IPI_dest_field(mask, vector, APIC_DEST_LOGICAL); local_irq_restore(flags); } @@ -113,15 +113,13 @@ static struct apic apic_flat __ro_after_init = { .apic_id_valid = default_apic_id_valid, .apic_id_registered = flat_apic_id_registered, - .irq_delivery_mode = dest_Fixed, - .irq_dest_mode = 1, /* logical */ + .delivery_mode = APIC_DELIVERY_MODE_FIXED, + .dest_mode_logical = true, .disable_esr = 0, - .dest_logical = APIC_DEST_LOGICAL, - .check_apicid_used = NULL, + .check_apicid_used = NULL, .init_apic_ldr = flat_init_apic_ldr, - .ioapic_phys_id_map = NULL, .setup_apic_routing = NULL, .cpu_present_to_apicid = default_cpu_present_to_apicid, @@ -206,15 +204,13 @@ static struct apic apic_physflat __ro_after_init = { .apic_id_valid = default_apic_id_valid, .apic_id_registered = flat_apic_id_registered, - .irq_delivery_mode = dest_Fixed, - .irq_dest_mode = 0, /* physical */ + .delivery_mode = APIC_DELIVERY_MODE_FIXED, + .dest_mode_logical = false, .disable_esr = 0, - .dest_logical = 0, - .check_apicid_used = NULL, + .check_apicid_used = NULL, .init_apic_ldr = physflat_init_apic_ldr, - .ioapic_phys_id_map = NULL, .setup_apic_routing = NULL, .cpu_present_to_apicid = default_cpu_present_to_apicid, diff --git a/arch/x86/kernel/apic/apic_noop.c b/arch/x86/kernel/apic/apic_noop.c index 
98c9bb75d185..fe78319e0f7a 100644 --- a/arch/x86/kernel/apic/apic_noop.c +++ b/arch/x86/kernel/apic/apic_noop.c @@ -10,6 +10,7 @@ * like self-ipi, etc... */ #include <linux/cpumask.h> +#include <linux/thread_info.h> #include <asm/apic.h> @@ -94,19 +95,15 @@ struct apic apic_noop __ro_after_init = { .apic_id_valid = default_apic_id_valid, .apic_id_registered = noop_apic_id_registered, - .irq_delivery_mode = dest_Fixed, - /* logical delivery broadcast to all CPUs: */ - .irq_dest_mode = 1, + .delivery_mode = APIC_DELIVERY_MODE_FIXED, + .dest_mode_logical = true, .disable_esr = 0, - .dest_logical = APIC_DEST_LOGICAL, - .check_apicid_used = default_check_apicid_used, + .check_apicid_used = default_check_apicid_used, .init_apic_ldr = noop_init_apic_ldr, - .ioapic_phys_id_map = default_ioapic_phys_id_map, .setup_apic_routing = NULL, - .cpu_present_to_apicid = default_cpu_present_to_apicid, .apicid_to_cpu_present = physid_set_mask_of_physid, diff --git a/arch/x86/kernel/apic/apic_numachip.c b/arch/x86/kernel/apic/apic_numachip.c index cdf45b4700f2..a54d817eb4b6 100644 --- a/arch/x86/kernel/apic/apic_numachip.c +++ b/arch/x86/kernel/apic/apic_numachip.c @@ -12,11 +12,11 @@ */ #include <linux/types.h> #include <linux/init.h> +#include <linux/pgtable.h> #include <asm/numachip/numachip.h> #include <asm/numachip/numachip_csr.h> -#include <asm/pgtable.h> #include "local.h" @@ -246,15 +246,13 @@ static const struct apic apic_numachip1 __refconst = { .apic_id_valid = numachip_apic_id_valid, .apic_id_registered = numachip_apic_id_registered, - .irq_delivery_mode = dest_Fixed, - .irq_dest_mode = 0, /* physical */ + .delivery_mode = APIC_DELIVERY_MODE_FIXED, + .dest_mode_logical = false, .disable_esr = 0, - .dest_logical = 0, - .check_apicid_used = NULL, + .check_apicid_used = NULL, .init_apic_ldr = flat_init_apic_ldr, - .ioapic_phys_id_map = NULL, .setup_apic_routing = NULL, .cpu_present_to_apicid = default_cpu_present_to_apicid, @@ -295,15 +293,13 @@ static const struct apic 
apic_numachip2 __refconst = { .apic_id_valid = numachip_apic_id_valid, .apic_id_registered = numachip_apic_id_registered, - .irq_delivery_mode = dest_Fixed, - .irq_dest_mode = 0, /* physical */ + .delivery_mode = APIC_DELIVERY_MODE_FIXED, + .dest_mode_logical = false, .disable_esr = 0, - .dest_logical = 0, - .check_apicid_used = NULL, + .check_apicid_used = NULL, .init_apic_ldr = flat_init_apic_ldr, - .ioapic_phys_id_map = NULL, .setup_apic_routing = NULL, .cpu_present_to_apicid = default_cpu_present_to_apicid, diff --git a/arch/x86/kernel/apic/bigsmp_32.c b/arch/x86/kernel/apic/bigsmp_32.c index 38b5b51d42f6..77555f66c14d 100644 --- a/arch/x86/kernel/apic/bigsmp_32.c +++ b/arch/x86/kernel/apic/bigsmp_32.c @@ -9,6 +9,7 @@ #include <linux/smp.h> #include <asm/apic.h> +#include <asm/io_apic.h> #include "local.h" @@ -126,16 +127,13 @@ static struct apic apic_bigsmp __ro_after_init = { .apic_id_valid = default_apic_id_valid, .apic_id_registered = bigsmp_apic_id_registered, - .irq_delivery_mode = dest_Fixed, - /* phys delivery to target CPU: */ - .irq_dest_mode = 0, + .delivery_mode = APIC_DELIVERY_MODE_FIXED, + .dest_mode_logical = false, .disable_esr = 1, - .dest_logical = 0, - .check_apicid_used = bigsmp_check_apicid_used, + .check_apicid_used = bigsmp_check_apicid_used, .init_apic_ldr = bigsmp_init_apic_ldr, - .ioapic_phys_id_map = bigsmp_ioapic_phys_id_map, .setup_apic_routing = bigsmp_setup_apic_routing, .cpu_present_to_apicid = bigsmp_cpu_present_to_apicid, diff --git a/arch/x86/kernel/apic/hw_nmi.c b/arch/x86/kernel/apic/hw_nmi.c index d1fc62a67320..34a992e275ef 100644 --- a/arch/x86/kernel/apic/hw_nmi.c +++ b/arch/x86/kernel/apic/hw_nmi.c @@ -9,6 +9,7 @@ * Bits copied from original nmi.c file * */ +#include <linux/thread_info.h> #include <asm/apic.h> #include <asm/nmi.h> diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c index 913c88617848..a868b76cd3d4 100644 --- a/arch/x86/kernel/apic/io_apic.c +++ b/arch/x86/kernel/apic/io_apic.c @@ 
-48,6 +48,7 @@ #include <linux/jiffies.h> /* time_after() */ #include <linux/slab.h> #include <linux/memblock.h> +#include <linux/msi.h> #include <asm/irqdomain.h> #include <asm/io.h> @@ -63,8 +64,8 @@ #include <asm/setup.h> #include <asm/irq_remapping.h> #include <asm/hw_irq.h> - #include <asm/apic.h> +#include <asm/pgtable.h> #define for_each_ioapic(idx) \ for ((idx) = 0; (idx) < nr_ioapics; (idx)++) @@ -89,12 +90,12 @@ struct irq_pin_list { }; struct mp_chip_data { - struct list_head irq_2_pin; - struct IO_APIC_route_entry entry; - int trigger; - int polarity; + struct list_head irq_2_pin; + struct IO_APIC_route_entry entry; + bool is_level; + bool active_low; + bool isa_irq; u32 count; - bool isa_irq; }; struct mp_ioapic_gsi { @@ -154,19 +155,6 @@ static inline bool mp_is_legacy_irq(int irq) return irq >= 0 && irq < nr_legacy_irqs(); } -/* - * Initialize all legacy IRQs and all pins on the first IOAPIC - * if we have legacy interrupt controller. Kernel boot option "pirq=" - * may rely on non-legacy pins on the first IOAPIC. 
- */ -static inline int mp_init_irq_at_boot(int ioapic, int irq) -{ - if (!nr_legacy_irqs()) - return 0; - - return ioapic == 0 || mp_is_legacy_irq(irq); -} - static inline struct irq_domain *mp_ioapic_irqdomain(int ioapic) { return ioapics[ioapic].irqdomain; @@ -211,7 +199,7 @@ static int __init parse_noapic(char *str) } early_param("noapic", parse_noapic); -/* Will be called in mpparse/acpi/sfi codes for saving IRQ info */ +/* Will be called in mpparse/ACPI codes for saving IRQ info */ void mp_save_irq(struct mpc_intsrc *m) { int i; @@ -299,31 +287,26 @@ static void io_apic_write(unsigned int apic, unsigned int reg, writel(value, &io_apic->data); } -union entry_union { - struct { u32 w1, w2; }; - struct IO_APIC_route_entry entry; -}; - static struct IO_APIC_route_entry __ioapic_read_entry(int apic, int pin) { - union entry_union eu; + struct IO_APIC_route_entry entry; - eu.w1 = io_apic_read(apic, 0x10 + 2 * pin); - eu.w2 = io_apic_read(apic, 0x11 + 2 * pin); + entry.w1 = io_apic_read(apic, 0x10 + 2 * pin); + entry.w2 = io_apic_read(apic, 0x11 + 2 * pin); - return eu.entry; + return entry; } static struct IO_APIC_route_entry ioapic_read_entry(int apic, int pin) { - union entry_union eu; + struct IO_APIC_route_entry entry; unsigned long flags; raw_spin_lock_irqsave(&ioapic_lock, flags); - eu.entry = __ioapic_read_entry(apic, pin); + entry = __ioapic_read_entry(apic, pin); raw_spin_unlock_irqrestore(&ioapic_lock, flags); - return eu.entry; + return entry; } /* @@ -334,11 +317,8 @@ static struct IO_APIC_route_entry ioapic_read_entry(int apic, int pin) */ static void __ioapic_write_entry(int apic, int pin, struct IO_APIC_route_entry e) { - union entry_union eu = {{0, 0}}; - - eu.entry = e; - io_apic_write(apic, 0x11 + 2*pin, eu.w2); - io_apic_write(apic, 0x10 + 2*pin, eu.w1); + io_apic_write(apic, 0x11 + 2*pin, e.w2); + io_apic_write(apic, 0x10 + 2*pin, e.w1); } static void ioapic_write_entry(int apic, int pin, struct IO_APIC_route_entry e) @@ -357,12 +337,12 @@ 
static void ioapic_write_entry(int apic, int pin, struct IO_APIC_route_entry e) */ static void ioapic_mask_entry(int apic, int pin) { + struct IO_APIC_route_entry e = { .masked = true }; unsigned long flags; - union entry_union eu = { .entry.mask = IOAPIC_MASKED }; raw_spin_lock_irqsave(&ioapic_lock, flags); - io_apic_write(apic, 0x10 + 2*pin, eu.w1); - io_apic_write(apic, 0x11 + 2*pin, eu.w2); + io_apic_write(apic, 0x10 + 2*pin, e.w1); + io_apic_write(apic, 0x11 + 2*pin, e.w2); raw_spin_unlock_irqrestore(&ioapic_lock, flags); } @@ -435,20 +415,15 @@ static void __init replace_pin_at_irq_node(struct mp_chip_data *data, int node, add_pin_to_irq_node(data, node, newapic, newpin); } -static void io_apic_modify_irq(struct mp_chip_data *data, - int mask_and, int mask_or, +static void io_apic_modify_irq(struct mp_chip_data *data, bool masked, void (*final)(struct irq_pin_list *entry)) { - union entry_union eu; struct irq_pin_list *entry; - eu.entry = data->entry; - eu.w1 &= mask_and; - eu.w1 |= mask_or; - data->entry = eu.entry; + data->entry.masked = masked; for_each_irq_pin(entry, data->irq_2_pin) { - io_apic_write(entry->apic, 0x10 + 2 * entry->pin, eu.w1); + io_apic_write(entry->apic, 0x10 + 2 * entry->pin, data->entry.w1); if (final) final(entry); } @@ -472,13 +447,13 @@ static void mask_ioapic_irq(struct irq_data *irq_data) unsigned long flags; raw_spin_lock_irqsave(&ioapic_lock, flags); - io_apic_modify_irq(data, ~0, IO_APIC_REDIR_MASKED, &io_apic_sync); + io_apic_modify_irq(data, true, &io_apic_sync); raw_spin_unlock_irqrestore(&ioapic_lock, flags); } static void __unmask_ioapic(struct mp_chip_data *data) { - io_apic_modify_irq(data, ~IO_APIC_REDIR_MASKED, 0, NULL); + io_apic_modify_irq(data, false, NULL); } static void unmask_ioapic_irq(struct irq_data *irq_data) @@ -519,8 +494,8 @@ static void __eoi_ioapic_pin(int apic, int pin, int vector) /* * Mask the entry and change the trigger mode to edge. 
*/ - entry1.mask = IOAPIC_MASKED; - entry1.trigger = IOAPIC_EDGE; + entry1.masked = true; + entry1.is_level = false; __ioapic_write_entry(apic, pin, entry1); @@ -548,15 +523,15 @@ static void clear_IO_APIC_pin(unsigned int apic, unsigned int pin) /* Check delivery_mode to be sure we're not clearing an SMI pin */ entry = ioapic_read_entry(apic, pin); - if (entry.delivery_mode == dest_SMI) + if (entry.delivery_mode == APIC_DELIVERY_MODE_SMI) return; /* * Make sure the entry is masked and re-read the contents to check * if it is a level triggered pin and if the remote-IRR is set. */ - if (entry.mask == IOAPIC_UNMASKED) { - entry.mask = IOAPIC_MASKED; + if (!entry.masked) { + entry.masked = true; ioapic_write_entry(apic, pin, entry); entry = ioapic_read_entry(apic, pin); } @@ -569,8 +544,8 @@ static void clear_IO_APIC_pin(unsigned int apic, unsigned int pin) * doesn't clear the remote-IRR if the trigger mode is not * set to level. */ - if (entry.trigger == IOAPIC_EDGE) { - entry.trigger = IOAPIC_LEVEL; + if (!entry.is_level) { + entry.is_level = true; ioapic_write_entry(apic, pin, entry); } raw_spin_lock_irqsave(&ioapic_lock, flags); @@ -672,8 +647,8 @@ void mask_ioapic_entries(void) struct IO_APIC_route_entry entry; entry = ioapics[apic].saved_registers[pin]; - if (entry.mask == IOAPIC_UNMASKED) { - entry.mask = IOAPIC_MASKED; + if (!entry.masked) { + entry.masked = true; ioapic_write_entry(apic, pin, entry); } } @@ -758,44 +733,7 @@ static int __init find_isa_irq_apic(int irq, int type) return -1; } -#ifdef CONFIG_EISA -/* - * EISA Edge/Level control register, ELCR - */ -static int EISA_ELCR(unsigned int irq) -{ - if (irq < nr_legacy_irqs()) { - unsigned int port = 0x4d0 + (irq >> 3); - return (inb(port) >> (irq & 7)) & 1; - } - apic_printk(APIC_VERBOSE, KERN_INFO - "Broken MPtable reports ISA irq %d\n", irq); - return 0; -} - -#endif - -/* ISA interrupts are always active high edge triggered, - * when listed as conforming in the MP table. 
*/ - -#define default_ISA_trigger(idx) (IOAPIC_EDGE) -#define default_ISA_polarity(idx) (IOAPIC_POL_HIGH) - -/* EISA interrupts are always polarity zero and can be edge or level - * trigger depending on the ELCR value. If an interrupt is listed as - * EISA conforming in the MP table, that means its trigger type must - * be read in from the ELCR */ - -#define default_EISA_trigger(idx) (EISA_ELCR(mp_irqs[idx].srcbusirq)) -#define default_EISA_polarity(idx) default_ISA_polarity(idx) - -/* PCI interrupts are always active low level triggered, - * when listed as conforming in the MP table. */ - -#define default_PCI_trigger(idx) (IOAPIC_LEVEL) -#define default_PCI_polarity(idx) (IOAPIC_POL_LOW) - -static int irq_polarity(int idx) +static bool irq_active_low(int idx) { int bus = mp_irqs[idx].srcbus; @@ -804,127 +742,176 @@ static int irq_polarity(int idx) */ switch (mp_irqs[idx].irqflag & MP_IRQPOL_MASK) { case MP_IRQPOL_DEFAULT: - /* conforms to spec, ie. bus-type dependent polarity */ - if (test_bit(bus, mp_bus_not_pci)) - return default_ISA_polarity(idx); - else - return default_PCI_polarity(idx); + /* + * Conforms to spec, ie. bus-type dependent polarity. PCI + * defaults to low active. [E]ISA defaults to high active. 
+ */ + return !test_bit(bus, mp_bus_not_pci); case MP_IRQPOL_ACTIVE_HIGH: - return IOAPIC_POL_HIGH; + return false; case MP_IRQPOL_RESERVED: pr_warn("IOAPIC: Invalid polarity: 2, defaulting to low\n"); - /* fall through */ + fallthrough; case MP_IRQPOL_ACTIVE_LOW: default: /* Pointless default required due to do gcc stupidity */ - return IOAPIC_POL_LOW; + return true; } } #ifdef CONFIG_EISA -static int eisa_irq_trigger(int idx, int bus, int trigger) +/* + * EISA Edge/Level control register, ELCR + */ +static bool EISA_ELCR(unsigned int irq) +{ + if (irq < nr_legacy_irqs()) { + unsigned int port = PIC_ELCR1 + (irq >> 3); + return (inb(port) >> (irq & 7)) & 1; + } + apic_printk(APIC_VERBOSE, KERN_INFO + "Broken MPtable reports ISA irq %d\n", irq); + return false; +} + +/* + * EISA interrupts are always active high and can be edge or level + * triggered depending on the ELCR value. If an interrupt is listed as + * EISA conforming in the MP table, that means its trigger type must be + * read in from the ELCR. + */ +static bool eisa_irq_is_level(int idx, int bus, bool level) { switch (mp_bus_id_to_type[bus]) { case MP_BUS_PCI: case MP_BUS_ISA: - return trigger; + return level; case MP_BUS_EISA: - return default_EISA_trigger(idx); + return EISA_ELCR(mp_irqs[idx].srcbusirq); } pr_warn("IOAPIC: Invalid srcbus: %d defaulting to level\n", bus); - return IOAPIC_LEVEL; + return true; } #else -static inline int eisa_irq_trigger(int idx, int bus, int trigger) +static inline int eisa_irq_is_level(int idx, int bus, bool level) { - return trigger; + return level; } #endif -static int irq_trigger(int idx) +static bool irq_is_level(int idx) { int bus = mp_irqs[idx].srcbus; - int trigger; + bool level; /* * Determine IRQ trigger mode (edge or level sensitive): */ switch (mp_irqs[idx].irqflag & MP_IRQTRIG_MASK) { case MP_IRQTRIG_DEFAULT: - /* conforms to spec, ie. 
bus-type dependent trigger mode */ - if (test_bit(bus, mp_bus_not_pci)) - trigger = default_ISA_trigger(idx); - else - trigger = default_PCI_trigger(idx); + /* + * Conforms to spec, ie. bus-type dependent trigger + * mode. PCI defaults to level, ISA to edge. + */ + level = !test_bit(bus, mp_bus_not_pci); /* Take EISA into account */ - return eisa_irq_trigger(idx, bus, trigger); + return eisa_irq_is_level(idx, bus, level); case MP_IRQTRIG_EDGE: - return IOAPIC_EDGE; + return false; case MP_IRQTRIG_RESERVED: pr_warn("IOAPIC: Invalid trigger mode 2 defaulting to level\n"); - /* fall through */ + fallthrough; case MP_IRQTRIG_LEVEL: default: /* Pointless default required due to do gcc stupidity */ - return IOAPIC_LEVEL; + return true; } } +static int __acpi_get_override_irq(u32 gsi, bool *trigger, bool *polarity) +{ + int ioapic, pin, idx; + + if (skip_ioapic_setup) + return -1; + + ioapic = mp_find_ioapic(gsi); + if (ioapic < 0) + return -1; + + pin = mp_find_ioapic_pin(ioapic, gsi); + if (pin < 0) + return -1; + + idx = find_irq_entry(ioapic, pin, mp_INT); + if (idx < 0) + return -1; + + *trigger = irq_is_level(idx); + *polarity = irq_active_low(idx); + return 0; +} + +#ifdef CONFIG_ACPI +int acpi_get_override_irq(u32 gsi, int *is_level, int *active_low) +{ + *is_level = *active_low = 0; + return __acpi_get_override_irq(gsi, (bool *)is_level, + (bool *)active_low); +} +#endif + void ioapic_set_alloc_attr(struct irq_alloc_info *info, int node, int trigger, int polarity) { init_irq_alloc_info(info, NULL); info->type = X86_IRQ_ALLOC_TYPE_IOAPIC; - info->ioapic_node = node; - info->ioapic_trigger = trigger; - info->ioapic_polarity = polarity; - info->ioapic_valid = 1; + info->ioapic.node = node; + info->ioapic.is_level = trigger; + info->ioapic.active_low = polarity; + info->ioapic.valid = 1; } -#ifndef CONFIG_ACPI -int acpi_get_override_irq(u32 gsi, int *trigger, int *polarity); -#endif - static void ioapic_copy_alloc_attr(struct irq_alloc_info *dst, struct 
irq_alloc_info *src, u32 gsi, int ioapic_idx, int pin) { - int trigger, polarity; + bool level, pol_low; copy_irq_alloc_info(dst, src); dst->type = X86_IRQ_ALLOC_TYPE_IOAPIC; - dst->ioapic_id = mpc_ioapic_id(ioapic_idx); - dst->ioapic_pin = pin; - dst->ioapic_valid = 1; - if (src && src->ioapic_valid) { - dst->ioapic_node = src->ioapic_node; - dst->ioapic_trigger = src->ioapic_trigger; - dst->ioapic_polarity = src->ioapic_polarity; + dst->devid = mpc_ioapic_id(ioapic_idx); + dst->ioapic.pin = pin; + dst->ioapic.valid = 1; + if (src && src->ioapic.valid) { + dst->ioapic.node = src->ioapic.node; + dst->ioapic.is_level = src->ioapic.is_level; + dst->ioapic.active_low = src->ioapic.active_low; } else { - dst->ioapic_node = NUMA_NO_NODE; - if (acpi_get_override_irq(gsi, &trigger, &polarity) >= 0) { - dst->ioapic_trigger = trigger; - dst->ioapic_polarity = polarity; + dst->ioapic.node = NUMA_NO_NODE; + if (__acpi_get_override_irq(gsi, &level, &pol_low) >= 0) { + dst->ioapic.is_level = level; + dst->ioapic.active_low = pol_low; } else { /* * PCI interrupts are always active low level * triggered. */ - dst->ioapic_trigger = IOAPIC_LEVEL; - dst->ioapic_polarity = IOAPIC_POL_LOW; + dst->ioapic.is_level = true; + dst->ioapic.active_low = true; } } } static int ioapic_alloc_attr_node(struct irq_alloc_info *info) { - return (info && info->ioapic_valid) ? info->ioapic_node : NUMA_NO_NODE; + return (info && info->ioapic.valid) ? info->ioapic.node : NUMA_NO_NODE; } -static void mp_register_handler(unsigned int irq, unsigned long trigger) +static void mp_register_handler(unsigned int irq, bool level) { irq_flow_handler_t hdl; bool fasteoi; - if (trigger) { + if (level) { irq_set_status_flags(irq, IRQ_LEVEL); fasteoi = true; } else { @@ -942,18 +929,18 @@ static bool mp_check_pin_attr(int irq, struct irq_alloc_info *info) /* * setup_IO_APIC_irqs() programs all legacy IRQs with default trigger - * and polarity attirbutes. 
So allow the first user to reprogram the + * and polarity attributes. So allow the first user to reprogram the * pin with real trigger and polarity attributes. */ if (irq < nr_legacy_irqs() && data->count == 1) { - if (info->ioapic_trigger != data->trigger) - mp_register_handler(irq, info->ioapic_trigger); - data->entry.trigger = data->trigger = info->ioapic_trigger; - data->entry.polarity = data->polarity = info->ioapic_polarity; + if (info->ioapic.is_level != data->is_level) + mp_register_handler(irq, info->ioapic.is_level); + data->entry.is_level = data->is_level = info->ioapic.is_level; + data->entry.active_low = data->active_low = info->ioapic.active_low; } - return data->trigger == info->ioapic_trigger && - data->polarity == info->ioapic_polarity; + return data->is_level == info->ioapic.is_level && + data->active_low == info->ioapic.active_low; } static int alloc_irq_from_domain(struct irq_domain *domain, int ioapic, u32 gsi, @@ -1008,14 +995,14 @@ static int alloc_isa_irq_from_domain(struct irq_domain *domain, /* * Legacy ISA IRQ has already been allocated, just add pin to - * the pin list assoicated with this IRQ and program the IOAPIC + * the pin list associated with this IRQ and program the IOAPIC * entry. The IOAPIC entry */ if (irq_data && irq_data->parent_data) { if (!mp_check_pin_attr(irq, info)) return -EBUSY; if (__add_pin_to_irq_node(irq_data->chip_data, node, ioapic, - info->ioapic_pin)) + info->ioapic.pin)) return -ENOMEM; } else { info->flags |= X86_IRQ_ALLOC_LEGACY; @@ -1046,6 +1033,16 @@ static int mp_map_pin_to_irq(u32 gsi, int idx, int ioapic, int pin, if (idx >= 0 && test_bit(mp_irqs[idx].srcbus, mp_bus_not_pci)) { irq = mp_irqs[idx].srcbusirq; legacy = mp_is_legacy_irq(irq); + /* + * IRQ2 is unusable for historical reasons on systems which + * have a legacy PIC. See the comment vs. IRQ2 further down. + * + * If this gets removed at some point then the related code + * in lapic_assign_system_vectors() needs to be adjusted as + * well. 
+ */ + if (legacy && irq == PIC_CASCADE_IR) + return -EINVAL; } mutex_lock(&ioapic_mutex); @@ -1232,10 +1229,9 @@ void ioapic_zap_locks(void) static void io_apic_print_entries(unsigned int apic, unsigned int nr_entries) { - int i; - char buf[256]; struct IO_APIC_route_entry entry; - struct IR_IO_APIC_route_entry *ir_entry = (void *)&entry; + char buf[256]; + int i; printk(KERN_DEBUG "IOAPIC %d:\n", apic); for (i = 0; i <= nr_entries; i++) { @@ -1243,20 +1239,21 @@ static void io_apic_print_entries(unsigned int apic, unsigned int nr_entries) snprintf(buf, sizeof(buf), " pin%02x, %s, %s, %s, V(%02X), IRR(%1d), S(%1d)", i, - entry.mask == IOAPIC_MASKED ? "disabled" : "enabled ", - entry.trigger == IOAPIC_LEVEL ? "level" : "edge ", - entry.polarity == IOAPIC_POL_LOW ? "low " : "high", + entry.masked ? "disabled" : "enabled ", + entry.is_level ? "level" : "edge ", + entry.active_low ? "low " : "high", entry.vector, entry.irr, entry.delivery_status); - if (ir_entry->format) + if (entry.ir_format) { printk(KERN_DEBUG "%s, remapped, I(%04X), Z(%X)\n", - buf, (ir_entry->index2 << 15) | ir_entry->index, - ir_entry->zero); - else - printk(KERN_DEBUG "%s, %s, D(%02X), M(%1d)\n", buf, - entry.dest_mode == IOAPIC_DEST_MODE_LOGICAL ? - "logical " : "physical", - entry.dest, entry.delivery_mode); + (entry.ir_index_15 << 15) | entry.ir_index_0_14, + entry.ir_zero); + } else { + printk(KERN_DEBUG "%s, %s, D(%02X%02X), M(%1d)\n", buf, + entry.dest_mode_logical ? "logical " : "physical", + entry.virt_destid_8_14, entry.destid_0_7, + entry.delivery_mode); + } } } @@ -1381,7 +1378,8 @@ void __init enable_IO_APIC(void) /* If the interrupt line is enabled and in ExtInt mode * I have found the pin where the i8259 is connected. 
*/ - if ((entry.mask == 0) && (entry.delivery_mode == dest_ExtINT)) { + if (!entry.masked && + entry.delivery_mode == APIC_DELIVERY_MODE_EXTINT) { ioapic_i8259.apic = apic; ioapic_i8259.pin = pin; goto found_i8259; @@ -1423,14 +1421,16 @@ void native_restore_boot_irq_mode(void) */ if (ioapic_i8259.pin != -1) { struct IO_APIC_route_entry entry; + u32 apic_id = read_apic_id(); memset(&entry, 0, sizeof(entry)); - entry.mask = IOAPIC_UNMASKED; - entry.trigger = IOAPIC_EDGE; - entry.polarity = IOAPIC_POL_HIGH; - entry.dest_mode = IOAPIC_DEST_MODE_PHYSICAL; - entry.delivery_mode = dest_ExtINT; - entry.dest = read_apic_id(); + entry.masked = false; + entry.is_level = false; + entry.active_low = false; + entry.dest_mode_logical = false; + entry.delivery_mode = APIC_DELIVERY_MODE_EXTINT; + entry.destid_0_7 = apic_id & 0xFF; + entry.virt_destid_8_14 = apic_id >> 8; /* * Add it to the IO-APIC irq-routing table: @@ -1631,21 +1631,16 @@ static void __init delay_without_tsc(void) static int __init timer_irq_works(void) { unsigned long t1 = jiffies; - unsigned long flags; if (no_timer_check) return 1; - local_save_flags(flags); local_irq_enable(); - if (boot_cpu_has(X86_FEATURE_TSC)) delay_with_tsc(); else delay_without_tsc(); - local_irq_restore(flags); - /* * Expect a few ticks at least, to be sure some possible * glue logic does not lock up after one or two first @@ -1654,10 +1649,10 @@ static int __init timer_irq_works(void) * least one tick may be lost due to delays. */ - /* jiffies wrap? */ - if (time_after(jiffies, t1 + 4)) - return 1; - return 0; + local_irq_disable(); + + /* Did jiffies advance? 
*/ + return time_after(jiffies, t1 + 4); } /* @@ -1709,13 +1704,13 @@ static bool io_apic_level_ack_pending(struct mp_chip_data *data) raw_spin_lock_irqsave(&ioapic_lock, flags); for_each_irq_pin(entry, data->irq_2_pin) { - unsigned int reg; + struct IO_APIC_route_entry e; int pin; pin = entry->pin; - reg = io_apic_read(entry->apic, 0x10 + pin*2); + e.w1 = io_apic_read(entry->apic, 0x10 + pin*2); /* Is the remote IRR bit set? */ - if (reg & IO_APIC_REDIR_REMOTE_IRR) { + if (e.irr) { raw_spin_unlock_irqrestore(&ioapic_lock, flags); return true; } @@ -1758,7 +1753,7 @@ static inline void ioapic_finish_move(struct irq_data *data, bool moveit) * with masking the ioapic entry and then polling until * Remote IRR was clear before reprogramming the * ioapic I don't trust the Remote IRR bit to be - * completey accurate. + * completely accurate. * * However there appears to be no other way to plug * this race, so if the Remote IRR bit is not @@ -1836,7 +1831,7 @@ static void ioapic_ack_level(struct irq_data *irq_data) /* * Tail end of clearing remote IRR bit (either by delivering the EOI * message via io-apic EOI register write or simulating it using - * mask+edge followed by unnask+level logic) manually when the + * mask+edge followed by unmask+level logic) manually when the * level triggered interrupt is seen as the edge triggered interrupt * at the cpu. */ @@ -1862,21 +1857,62 @@ static void ioapic_ir_ack_level(struct irq_data *irq_data) eoi_ioapic_pin(data->entry.vector, data); } +/* + * The I/OAPIC is just a device for generating MSI messages from legacy + * interrupt pins. Various fields of the RTE translate into bits of the + * resulting MSI which had a historical meaning. + * + * With interrupt remapping, many of those bits have different meanings + * in the underlying MSI, but the way that the I/OAPIC transforms them + * from its RTE to the MSI message is the same. 
This function allows + * the parent IRQ domain to compose the MSI message, then takes the + * relevant bits to put them in the appropriate places in the RTE in + * order to generate that message when the IRQ happens. + * + * The setup here relies on a preconfigured route entry (is_level, + * active_low, masked) because the parent domain is merely composing the + * generic message routing information which is used for the MSI. + */ +static void ioapic_setup_msg_from_msi(struct irq_data *irq_data, + struct IO_APIC_route_entry *entry) +{ + struct msi_msg msg; + + /* Let the parent domain compose the MSI message */ + irq_chip_compose_msi_msg(irq_data, &msg); + + /* + * - Real vector + * - DMAR/IR: 8bit subhandle (ioapic.pin) + * - AMD/IR: 8bit IRTE index + */ + entry->vector = msg.arch_data.vector; + /* Delivery mode (for DMAR/IR all 0) */ + entry->delivery_mode = msg.arch_data.delivery_mode; + /* Destination mode or DMAR/IR index bit 15 */ + entry->dest_mode_logical = msg.arch_addr_lo.dest_mode_logical; + /* DMAR/IR: 1, 0 for all other modes */ + entry->ir_format = msg.arch_addr_lo.dmar_format; + /* + * - DMAR/IR: index bit 0-14. + * + * - Virt: If the host supports x2apic without a virtualized IR + * unit then bit 0-6 of dmar_index_0_14 are providing bit + * 8-14 of the destination id. + * + * All other modes have bit 0-6 of dmar_index_0_14 cleared and the + * topmost 8 bits are destination id bit 0-7 (entry::destid_0_7). + */ + entry->ir_index_0_14 = msg.arch_addr_lo.dmar_index_0_14; +} + static void ioapic_configure_entry(struct irq_data *irqd) { struct mp_chip_data *mpd = irqd->chip_data; - struct irq_cfg *cfg = irqd_cfg(irqd); struct irq_pin_list *entry; - /* - * Only update when the parent is the vector domain, don't touch it - * if the parent is the remapping domain. Check the installed - * ioapic chip to verify that. 
- */ - if (irqd->chip == &ioapic_chip) { - mpd->entry.dest = cfg->dest_apicid; - mpd->entry.vector = cfg->vector; - } + ioapic_setup_msg_from_msi(irqd, &mpd->entry); + for_each_irq_pin(entry, mpd->irq_2_pin) __ioapic_write_entry(entry->apic, entry->pin, mpd->entry); } @@ -1932,7 +1968,7 @@ static int ioapic_irq_get_chip_state(struct irq_data *irqd, * irrelevant because the IO-APIC treats them as fire and * forget. */ - if (rentry.irr && rentry.trigger) { + if (rentry.irr && rentry.is_level) { *state = true; break; } @@ -1951,7 +1987,8 @@ static struct irq_chip ioapic_chip __read_mostly = { .irq_set_affinity = ioapic_set_affinity, .irq_retrigger = irq_chip_retrigger_hierarchy, .irq_get_irqchip_state = ioapic_irq_get_chip_state, - .flags = IRQCHIP_SKIP_SET_WAKE, + .flags = IRQCHIP_SKIP_SET_WAKE | + IRQCHIP_AFFINITY_PRE_STARTUP, }; static struct irq_chip ioapic_ir_chip __read_mostly = { @@ -1964,7 +2001,8 @@ static struct irq_chip ioapic_ir_chip __read_mostly = { .irq_set_affinity = ioapic_set_affinity, .irq_retrigger = irq_chip_retrigger_hierarchy, .irq_get_irqchip_state = ioapic_irq_get_chip_state, - .flags = IRQCHIP_SKIP_SET_WAKE, + .flags = IRQCHIP_SKIP_SET_WAKE | + IRQCHIP_AFFINITY_PRE_STARTUP, }; static inline void init_IO_APIC_traps(void) @@ -2040,6 +2078,7 @@ static inline void __init unlock_ExtINT_logic(void) int apic, pin, i; struct IO_APIC_route_entry entry0, entry1; unsigned char save_control, save_freq_select; + u32 apic_id; pin = find_isa_irq_pin(8, mp_INT); if (pin == -1) { @@ -2055,14 +2094,16 @@ static inline void __init unlock_ExtINT_logic(void) entry0 = ioapic_read_entry(apic, pin); clear_IO_APIC_pin(apic, pin); + apic_id = hard_smp_processor_id(); memset(&entry1, 0, sizeof(entry1)); - entry1.dest_mode = IOAPIC_DEST_MODE_PHYSICAL; - entry1.mask = IOAPIC_UNMASKED; - entry1.dest = hard_smp_processor_id(); - entry1.delivery_mode = dest_ExtINT; - entry1.polarity = entry0.polarity; - entry1.trigger = IOAPIC_EDGE; + entry1.dest_mode_logical = true; + 
entry1.masked = false; + entry1.destid_0_7 = apic_id & 0xFF; + entry1.virt_destid_8_14 = apic_id >> 8; + entry1.delivery_mode = APIC_DELIVERY_MODE_EXTINT; + entry1.active_low = entry0.active_low; + entry1.is_level = false; entry1.vector = 0; ioapic_write_entry(apic, pin, entry1); @@ -2105,8 +2146,8 @@ static int mp_alloc_timer_irq(int ioapic, int pin) struct irq_alloc_info info; ioapic_set_alloc_attr(&info, NUMA_NO_NODE, 0, 0); - info.ioapic_id = mpc_ioapic_id(ioapic); - info.ioapic_pin = pin; + info.devid = mpc_ioapic_id(ioapic); + info.ioapic.pin = pin; mutex_lock(&ioapic_mutex); irq = alloc_isa_irq_from_domain(domain, 0, ioapic, pin, &info); mutex_unlock(&ioapic_mutex); @@ -2130,13 +2171,12 @@ static inline void __init check_timer(void) struct irq_cfg *cfg = irqd_cfg(irq_data); int node = cpu_to_node(0); int apic1, pin1, apic2, pin2; - unsigned long flags; int no_pin1 = 0; if (!global_clock_event) return; - local_irq_save(flags); + local_irq_disable(); /* * get/set the timer IRQ vector: @@ -2191,9 +2231,9 @@ static inline void __init check_timer(void) * so only need to unmask if it is level-trigger * do we really have level trigger timer? */ - int idx; - idx = find_irq_entry(apic1, pin1, mp_INT); - if (idx != -1 && irq_trigger(idx)) + int idx = find_irq_entry(apic1, pin1, mp_INT); + + if (idx != -1 && irq_is_level(idx)) unmask_ioapic_irq(irq_get_irq_data(0)); } irq_domain_deactivate_irq(irq_data); @@ -2204,7 +2244,6 @@ static inline void __init check_timer(void) goto out; } panic_if_irq_remap("timer doesn't work through Interrupt-remapped IO-APIC"); - local_irq_disable(); clear_IO_APIC_pin(apic1, pin1); if (!no_pin1) apic_printk(APIC_QUIET, KERN_ERR "..MP-BIOS bug: " @@ -2228,7 +2267,6 @@ static inline void __init check_timer(void) /* * Cleanup, just in case ... */ - local_irq_disable(); legacy_pic->mask(0); clear_IO_APIC_pin(apic2, pin2); apic_printk(APIC_QUIET, KERN_INFO "....... 
failed.\n"); @@ -2245,7 +2283,6 @@ static inline void __init check_timer(void) apic_printk(APIC_QUIET, KERN_INFO "..... works.\n"); goto out; } - local_irq_disable(); legacy_pic->mask(0); apic_write(APIC_LVT0, APIC_LVT_MASKED | APIC_DM_FIXED | cfg->vector); apic_printk(APIC_QUIET, KERN_INFO "..... failed.\n"); @@ -2256,6 +2293,7 @@ static inline void __init check_timer(void) legacy_pic->init(0); legacy_pic->make_irq(0); apic_write(APIC_LVT0, APIC_DM_EXTINT); + legacy_pic->unmask(0); unlock_ExtINT_logic(); @@ -2263,7 +2301,6 @@ static inline void __init check_timer(void) apic_printk(APIC_QUIET, KERN_INFO "..... works.\n"); goto out; } - local_irq_disable(); apic_printk(APIC_QUIET, KERN_INFO "..... failed :(.\n"); if (apic_is_x2apic_enabled()) apic_printk(APIC_QUIET, KERN_INFO @@ -2272,7 +2309,7 @@ static inline void __init check_timer(void) panic("IO-APIC + timer doesn't work! Boot with apic=debug and send a " "report. Then try booting with the 'noapic' option.\n"); out: - local_irq_restore(flags); + local_irq_enable(); } /* @@ -2296,45 +2333,46 @@ out: static int mp_irqdomain_create(int ioapic) { - struct irq_alloc_info info; struct irq_domain *parent; int hwirqs = mp_ioapic_pin_count(ioapic); struct ioapic *ip = &ioapics[ioapic]; struct ioapic_domain_cfg *cfg = &ip->irqdomain_cfg; struct mp_ioapic_gsi *gsi_cfg = mp_ioapic_gsi_routing(ioapic); struct fwnode_handle *fn; - char *name = "IO-APIC"; + struct irq_fwspec fwspec; if (cfg->type == IOAPIC_DOMAIN_INVALID) return 0; - init_irq_alloc_info(&info, NULL); - info.type = X86_IRQ_ALLOC_TYPE_IOAPIC; - info.ioapic_id = mpc_ioapic_id(ioapic); - parent = irq_remapping_get_ir_irq_domain(&info); - if (!parent) - parent = x86_vector_domain; - else - name = "IO-APIC-IR"; - /* Handle device tree enumerated APICs proper */ if (cfg->dev) { fn = of_node_to_fwnode(cfg->dev); } else { - fn = irq_domain_alloc_named_id_fwnode(name, ioapic); + fn = irq_domain_alloc_named_id_fwnode("IO-APIC", mpc_ioapic_id(ioapic)); if (!fn) return 
-ENOMEM; } + fwspec.fwnode = fn; + fwspec.param_count = 1; + fwspec.param[0] = mpc_ioapic_id(ioapic); + + parent = irq_find_matching_fwspec(&fwspec, DOMAIN_BUS_ANY); + if (!parent) { + if (!cfg->dev) + irq_domain_free_fwnode(fn); + return -ENODEV; + } + ip->irqdomain = irq_domain_create_linear(fn, hwirqs, cfg->ops, (void *)(long)ioapic); - /* Release fw handle if it was allocated above */ - if (!cfg->dev) - irq_domain_free_fwnode(fn); - - if (!ip->irqdomain) + if (!ip->irqdomain) { + /* Release fw handle if it was allocated above */ + if (!cfg->dev) + irq_domain_free_fwnode(fn); return -ENOMEM; + } ip->irqdomain->parent = parent; @@ -2348,8 +2386,13 @@ static int mp_irqdomain_create(int ioapic) static void ioapic_destroy_irqdomain(int idx) { + struct ioapic_domain_cfg *cfg = &ioapics[idx].irqdomain_cfg; + struct fwnode_handle *fn = ioapics[idx].irqdomain->fwnode; + if (ioapics[idx].irqdomain) { irq_domain_remove(ioapics[idx].irqdomain); + if (!cfg->dev) + irq_domain_free_fwnode(fn); ioapics[idx].irqdomain = NULL; } } @@ -2594,30 +2637,6 @@ static int io_apic_get_version(int ioapic) return reg_01.bits.version; } -int acpi_get_override_irq(u32 gsi, int *trigger, int *polarity) -{ - int ioapic, pin, idx; - - if (skip_ioapic_setup) - return -1; - - ioapic = mp_find_ioapic(gsi); - if (ioapic < 0) - return -1; - - pin = mp_find_ioapic_pin(ioapic, gsi); - if (pin < 0) - return -1; - - idx = find_irq_entry(ioapic, pin, mp_INT); - if (idx < 0) - return -1; - - *trigger = irq_trigger(idx); - *polarity = irq_polarity(idx); - return 0; -} - /* * This function updates target affinity of IOAPIC interrupts to include * the CPUs which came online during SMP bringup. 
@@ -2659,6 +2678,19 @@ static struct resource * __init ioapic_setup_resources(void) return res; } +static void io_apic_set_fixmap(enum fixed_addresses idx, phys_addr_t phys) +{ + pgprot_t flags = FIXMAP_PAGE_NOCACHE; + + /* + * Ensure fixmaps for IOAPIC MMIO respect memory encryption pgprot + * bits, just like normal ioremap(): + */ + flags = pgprot_decrypted(flags); + + __set_fixmap(idx, phys, flags); +} + void __init io_apic_init_mappings(void) { unsigned long ioapic_phys, idx = FIX_IO_APIC_BASE_0; @@ -2691,7 +2723,7 @@ fake_ioapic_page: __func__, PAGE_SIZE, PAGE_SIZE); ioapic_phys = __pa(ioapic_phys); } - set_fixmap_nocache(idx, ioapic_phys); + io_apic_set_fixmap(idx, ioapic_phys); apic_printk(APIC_VERBOSE, "mapped IOAPIC to %08lx (%08lx)\n", __fix_to_virt(idx) + (ioapic_phys & ~PAGE_MASK), ioapic_phys); @@ -2820,7 +2852,7 @@ int mp_register_ioapic(int id, u32 address, u32 gsi_base, ioapics[idx].mp_config.flags = MPC_APIC_USABLE; ioapics[idx].mp_config.apicaddr = address; - set_fixmap_nocache(FIX_IO_APIC_BASE_0 + idx, address); + io_apic_set_fixmap(FIX_IO_APIC_BASE_0 + idx, address); if (bad_ioapic_register(idx)) { clear_fixmap(FIX_IO_APIC_BASE_0 + idx); return -ENODEV; @@ -2857,7 +2889,7 @@ int mp_register_ioapic(int id, u32 address, u32 gsi_base, /* * If mp_register_ioapic() is called during early boot stage when - * walking ACPI/SFI/DT tables, it's too early to create irqdomain, + * walking ACPI/DT tables, it's too early to create irqdomain, * we are still using bootmem allocator. So delay it to setup_IO_APIC(). 
*/ if (hotplug) { @@ -2940,45 +2972,50 @@ int mp_ioapic_registered(u32 gsi_base) static void mp_irqdomain_get_attr(u32 gsi, struct mp_chip_data *data, struct irq_alloc_info *info) { - if (info && info->ioapic_valid) { - data->trigger = info->ioapic_trigger; - data->polarity = info->ioapic_polarity; - } else if (acpi_get_override_irq(gsi, &data->trigger, - &data->polarity) < 0) { + if (info && info->ioapic.valid) { + data->is_level = info->ioapic.is_level; + data->active_low = info->ioapic.active_low; + } else if (__acpi_get_override_irq(gsi, &data->is_level, + &data->active_low) < 0) { /* PCI interrupts are always active low level triggered. */ - data->trigger = IOAPIC_LEVEL; - data->polarity = IOAPIC_POL_LOW; + data->is_level = true; + data->active_low = true; } } -static void mp_setup_entry(struct irq_cfg *cfg, struct mp_chip_data *data, - struct IO_APIC_route_entry *entry) +/* + * Configure the I/O-APIC specific fields in the routing entry. + * + * This is important to setup the I/O-APIC specific bits (is_level, + * active_low, masked) because the underlying parent domain will only + * provide the routing information and is oblivious of the I/O-APIC + * specific bits. + * + * The entry is just preconfigured at this point and not written into the + * RTE. This happens later during activation which will fill in the actual + * routing information. + */ +static void mp_preconfigure_entry(struct mp_chip_data *data) { + struct IO_APIC_route_entry *entry = &data->entry; + memset(entry, 0, sizeof(*entry)); - entry->delivery_mode = apic->irq_delivery_mode; - entry->dest_mode = apic->irq_dest_mode; - entry->dest = cfg->dest_apicid; - entry->vector = cfg->vector; - entry->trigger = data->trigger; - entry->polarity = data->polarity; + entry->is_level = data->is_level; + entry->active_low = data->active_low; /* * Mask level triggered irqs. Edge triggered irqs are masked * by the irq core code in case they fire. 
*/ - if (data->trigger == IOAPIC_LEVEL) - entry->mask = IOAPIC_MASKED; - else - entry->mask = IOAPIC_UNMASKED; + entry->masked = data->is_level; } int mp_irqdomain_alloc(struct irq_domain *domain, unsigned int virq, unsigned int nr_irqs, void *arg) { - int ret, ioapic, pin; - struct irq_cfg *cfg; - struct irq_data *irq_data; - struct mp_chip_data *data; struct irq_alloc_info *info = arg; + struct mp_chip_data *data; + struct irq_data *irq_data; + int ret, ioapic, pin; unsigned long flags; if (!info || nr_irqs > 1) @@ -2988,7 +3025,7 @@ int mp_irqdomain_alloc(struct irq_domain *domain, unsigned int virq, return -EINVAL; ioapic = mp_irqdomain_ioapic_idx(domain); - pin = info->ioapic_pin; + pin = info->ioapic.pin; if (irq_find_mapping(domain, (irq_hw_number_t)pin) > 0) return -EEXIST; @@ -2996,7 +3033,6 @@ int mp_irqdomain_alloc(struct irq_domain *domain, unsigned int virq, if (!data) return -ENOMEM; - info->ioapic_entry = &data->entry; ret = irq_domain_alloc_irqs_parent(domain, virq, nr_irqs, info); if (ret < 0) { kfree(data); @@ -3004,28 +3040,26 @@ int mp_irqdomain_alloc(struct irq_domain *domain, unsigned int virq, } INIT_LIST_HEAD(&data->irq_2_pin); - irq_data->hwirq = info->ioapic_pin; + irq_data->hwirq = info->ioapic.pin; irq_data->chip = (domain->parent == x86_vector_domain) ? 
&ioapic_chip : &ioapic_ir_chip; irq_data->chip_data = data; mp_irqdomain_get_attr(mp_pin_to_gsi(ioapic, pin), data, info); - cfg = irqd_cfg(irq_data); add_pin_to_irq_node(data, ioapic_alloc_attr_node(info), ioapic, pin); + mp_preconfigure_entry(data); + mp_register_handler(virq, data->is_level); + local_irq_save(flags); - if (info->ioapic_entry) - mp_setup_entry(cfg, data, info->ioapic_entry); - mp_register_handler(virq, data->trigger); if (virq < nr_legacy_irqs()) legacy_pic->mask(virq); local_irq_restore(flags); apic_printk(APIC_VERBOSE, KERN_DEBUG - "IOAPIC[%d]: Set routing entry (%d-%d -> 0x%x -> IRQ %d Mode:%i Active:%i Dest:%d)\n", - ioapic, mpc_ioapic_id(ioapic), pin, cfg->vector, - virq, data->trigger, data->polarity, cfg->dest_apicid); - + "IOAPIC[%d]: Preconfigured routing entry (%d-%d -> IRQ %d Level:%i ActiveLow:%i)\n", + ioapic, mpc_ioapic_id(ioapic), pin, virq, + data->is_level, data->active_low); return 0; } diff --git a/arch/x86/kernel/apic/ipi.c b/arch/x86/kernel/apic/ipi.c index 6ca0f91372fd..2a6509e8c840 100644 --- a/arch/x86/kernel/apic/ipi.c +++ b/arch/x86/kernel/apic/ipi.c @@ -2,6 +2,7 @@ #include <linux/cpumask.h> #include <linux/smp.h> +#include <asm/io_apic.h> #include "local.h" @@ -98,7 +99,7 @@ sendmask: static inline int __prepare_ICR2(unsigned int mask) { - return SET_APIC_DEST_FIELD(mask); + return SET_XAPIC_DEST_FIELD(mask); } static inline void __xapic_wait_icr_idle(void) @@ -259,7 +260,7 @@ void default_send_IPI_mask_sequence_logical(const struct cpumask *mask, for_each_cpu(query_cpu, mask) __default_send_IPI_dest_field( early_per_cpu(x86_cpu_to_logical_apicid, query_cpu), - vector, apic->dest_logical); + vector, APIC_DEST_LOGICAL); local_irq_restore(flags); } @@ -278,7 +279,7 @@ void default_send_IPI_mask_allbutself_logical(const struct cpumask *mask, continue; __default_send_IPI_dest_field( early_per_cpu(x86_cpu_to_logical_apicid, query_cpu), - vector, apic->dest_logical); + vector, APIC_DEST_LOGICAL); } local_irq_restore(flags); 
} @@ -296,7 +297,7 @@ void default_send_IPI_mask_logical(const struct cpumask *cpumask, int vector) local_irq_save(flags); WARN_ON(mask & ~cpumask_bits(cpu_online_mask)[0]); - __default_send_IPI_dest_field(mask, vector, apic->dest_logical); + __default_send_IPI_dest_field(mask, vector, APIC_DEST_LOGICAL); local_irq_restore(flags); } diff --git a/arch/x86/kernel/apic/local.h b/arch/x86/kernel/apic/local.h index 04797f05ce94..a997d849509a 100644 --- a/arch/x86/kernel/apic/local.h +++ b/arch/x86/kernel/apic/local.h @@ -10,6 +10,7 @@ #include <linux/jump_label.h> +#include <asm/irq_vectors.h> #include <asm/apic.h> /* APIC flat 64 */ diff --git a/arch/x86/kernel/apic/msi.c b/arch/x86/kernel/apic/msi.c index 159bd0cb8548..7517eb05bdc1 100644 --- a/arch/x86/kernel/apic/msi.c +++ b/arch/x86/kernel/apic/msi.c @@ -15,46 +15,19 @@ #include <linux/hpet.h> #include <linux/msi.h> #include <asm/irqdomain.h> -#include <asm/msidef.h> #include <asm/hpet.h> #include <asm/hw_irq.h> #include <asm/apic.h> #include <asm/irq_remapping.h> +#include <asm/xen/hypervisor.h> -static struct irq_domain *msi_default_domain; - -static void __irq_msi_compose_msg(struct irq_cfg *cfg, struct msi_msg *msg) -{ - msg->address_hi = MSI_ADDR_BASE_HI; - - if (x2apic_enabled()) - msg->address_hi |= MSI_ADDR_EXT_DEST_ID(cfg->dest_apicid); - - msg->address_lo = - MSI_ADDR_BASE_LO | - ((apic->irq_dest_mode == 0) ? 
- MSI_ADDR_DEST_MODE_PHYSICAL : - MSI_ADDR_DEST_MODE_LOGICAL) | - MSI_ADDR_REDIRECTION_CPU | - MSI_ADDR_DEST_ID(cfg->dest_apicid); - - msg->data = - MSI_DATA_TRIGGER_EDGE | - MSI_DATA_LEVEL_ASSERT | - MSI_DATA_DELIVERY_FIXED | - MSI_DATA_VECTOR(cfg->vector); -} - -static void irq_msi_compose_msg(struct irq_data *data, struct msi_msg *msg) -{ - __irq_msi_compose_msg(irqd_cfg(data), msg); -} +struct irq_domain *x86_pci_msi_default_domain __ro_after_init; static void irq_msi_update_msg(struct irq_data *irqd, struct irq_cfg *cfg) { struct msi_msg msg[2] = { [1] = { }, }; - __irq_msi_compose_msg(cfg, msg); + __irq_msi_compose_msg(cfg, msg, false); irq_data_get_irq_chip(irqd)->irq_write_msi_msg(irqd, msg); } @@ -86,11 +59,13 @@ msi_set_affinity(struct irq_data *irqd, const struct cpumask *mask, bool force) * The quirk bit is not set in this case. * - The new vector is the same as the old vector * - The old vector is MANAGED_IRQ_SHUTDOWN_VECTOR (interrupt starts up) + * - The interrupt is not yet started up * - The new destination CPU is the same as the old destination CPU */ if (!irqd_msi_nomask_quirk(irqd) || cfg->vector == old_cfg.vector || old_cfg.vector == MANAGED_IRQ_SHUTDOWN_VECTOR || + !irqd_is_started(irqd) || cfg->dest_apicid == old_cfg.dest_apicid) { irq_msi_update_msg(irqd, cfg); return ret; @@ -115,7 +90,8 @@ msi_set_affinity(struct irq_data *irqd, const struct cpumask *mask, bool force) * denote it as spurious which is no harm as this is a rare event * and interrupt handlers have to cope with spurious interrupts * anyway. If the vector is unused, then it is marked so it won't - * trigger the 'No irq handler for vector' warning in do_IRQ(). + * trigger the 'No irq handler for vector' warning in + * common_interrupt(). * * This requires to hold vector lock to prevent concurrent updates to * the affected vector. 
@@ -176,52 +152,19 @@ static struct irq_chip pci_msi_controller = { .irq_mask = pci_msi_mask_irq, .irq_ack = irq_chip_ack_parent, .irq_retrigger = irq_chip_retrigger_hierarchy, - .irq_compose_msi_msg = irq_msi_compose_msg, .irq_set_affinity = msi_set_affinity, - .flags = IRQCHIP_SKIP_SET_WAKE, + .flags = IRQCHIP_SKIP_SET_WAKE | + IRQCHIP_AFFINITY_PRE_STARTUP, }; -int native_setup_msi_irqs(struct pci_dev *dev, int nvec, int type) -{ - struct irq_domain *domain; - struct irq_alloc_info info; - - init_irq_alloc_info(&info, NULL); - info.type = X86_IRQ_ALLOC_TYPE_MSI; - info.msi_dev = dev; - - domain = irq_remapping_get_irq_domain(&info); - if (domain == NULL) - domain = msi_default_domain; - if (domain == NULL) - return -ENOSYS; - - return msi_domain_alloc_irqs(domain, &dev->dev, nvec); -} - -void native_teardown_msi_irq(unsigned int irq) -{ - irq_domain_free_irqs(irq, 1); -} - -static irq_hw_number_t pci_msi_get_hwirq(struct msi_domain_info *info, - msi_alloc_info_t *arg) -{ - return arg->msi_hwirq; -} - int pci_msi_prepare(struct irq_domain *domain, struct device *dev, int nvec, msi_alloc_info_t *arg) { - struct pci_dev *pdev = to_pci_dev(dev); - struct msi_desc *desc = first_pci_msi_entry(pdev); - init_irq_alloc_info(arg, NULL); - arg->msi_dev = pdev; - if (desc->msi_attrib.is_msix) { - arg->type = X86_IRQ_ALLOC_TYPE_MSIX; + if (to_pci_dev(dev)->msix_enabled) { + arg->type = X86_IRQ_ALLOC_TYPE_PCI_MSIX; } else { - arg->type = X86_IRQ_ALLOC_TYPE_MSI; + arg->type = X86_IRQ_ALLOC_TYPE_PCI_MSI; arg->flags |= X86_IRQ_ALLOC_CONTIGUOUS_VECTORS; } @@ -229,16 +172,8 @@ int pci_msi_prepare(struct irq_domain *domain, struct device *dev, int nvec, } EXPORT_SYMBOL_GPL(pci_msi_prepare); -void pci_msi_set_desc(msi_alloc_info_t *arg, struct msi_desc *desc) -{ - arg->msi_hwirq = pci_msi_domain_calc_hwirq(arg->msi_dev, desc); -} -EXPORT_SYMBOL_GPL(pci_msi_set_desc); - static struct msi_domain_ops pci_msi_domain_ops = { - .get_hwirq = pci_msi_get_hwirq, .msi_prepare = 
pci_msi_prepare, - .set_desc = pci_msi_set_desc, }; static struct msi_domain_info pci_msi_domain_info = { @@ -250,24 +185,32 @@ static struct msi_domain_info pci_msi_domain_info = { .handler_name = "edge", }; -void __init arch_init_msi_domain(struct irq_domain *parent) +struct irq_domain * __init native_create_pci_msi_domain(void) { struct fwnode_handle *fn; + struct irq_domain *d; if (disable_apic) - return; + return NULL; fn = irq_domain_alloc_named_fwnode("PCI-MSI"); - if (fn) { - msi_default_domain = - pci_msi_create_irq_domain(fn, &pci_msi_domain_info, - parent); + if (!fn) + return NULL; + + d = pci_msi_create_irq_domain(fn, &pci_msi_domain_info, + x86_vector_domain); + if (!d) { irq_domain_free_fwnode(fn); + pr_warn("Failed to initialize PCI-MSI irqdomain.\n"); + } else { + d->flags |= IRQ_DOMAIN_MSI_NOMASK_QUIRK; } - if (!msi_default_domain) - pr_warn("failed to initialize irqdomain for MSI/MSI-x.\n"); - else - msi_default_domain->flags |= IRQ_DOMAIN_MSI_NOMASK_QUIRK; + return d; +} + +void __init x86_create_pci_msi_domain(void) +{ + x86_pci_msi_default_domain = x86_init.irqs.create_pci_msi_domain(); } #ifdef CONFIG_IRQ_REMAP @@ -277,8 +220,8 @@ static struct irq_chip pci_msi_ir_controller = { .irq_mask = pci_msi_mask_irq, .irq_ack = irq_chip_ack_parent, .irq_retrigger = irq_chip_retrigger_hierarchy, - .irq_set_vcpu_affinity = irq_chip_set_vcpu_affinity_parent, - .flags = IRQCHIP_SKIP_SET_WAKE, + .flags = IRQCHIP_SKIP_SET_WAKE | + IRQCHIP_AFFINITY_PRE_STARTUP, }; static struct msi_domain_info pci_msi_ir_domain_info = { @@ -300,12 +243,24 @@ struct irq_domain *arch_create_remap_msi_irq_domain(struct irq_domain *parent, if (!fn) return NULL; d = pci_msi_create_irq_domain(fn, &pci_msi_ir_domain_info, parent); - irq_domain_free_fwnode(fn); + if (!d) + irq_domain_free_fwnode(fn); return d; } #endif #ifdef CONFIG_DMAR_TABLE +/* + * The Intel IOMMU (ab)uses the high bits of the MSI address to contain the + * high bits of the destination APIC ID. 
This can't be done in the general + * case for MSIs as it would be targeting real memory above 4GiB not the + * APIC. + */ +static void dmar_msi_compose_msg(struct irq_data *data, struct msi_msg *msg) +{ + __irq_msi_compose_msg(irqd_cfg(data), msg, true); +} + static void dmar_msi_write_msg(struct irq_data *data, struct msi_msg *msg) { dmar_msi_write(data->irq, msg); @@ -318,35 +273,30 @@ static struct irq_chip dmar_msi_controller = { .irq_ack = irq_chip_ack_parent, .irq_set_affinity = msi_domain_set_affinity, .irq_retrigger = irq_chip_retrigger_hierarchy, - .irq_compose_msi_msg = irq_msi_compose_msg, + .irq_compose_msi_msg = dmar_msi_compose_msg, .irq_write_msi_msg = dmar_msi_write_msg, - .flags = IRQCHIP_SKIP_SET_WAKE, + .flags = IRQCHIP_SKIP_SET_WAKE | + IRQCHIP_AFFINITY_PRE_STARTUP, }; -static irq_hw_number_t dmar_msi_get_hwirq(struct msi_domain_info *info, - msi_alloc_info_t *arg) -{ - return arg->dmar_id; -} - static int dmar_msi_init(struct irq_domain *domain, struct msi_domain_info *info, unsigned int virq, irq_hw_number_t hwirq, msi_alloc_info_t *arg) { - irq_domain_set_info(domain, virq, arg->dmar_id, info->chip, NULL, - handle_edge_irq, arg->dmar_data, "edge"); + irq_domain_set_info(domain, virq, arg->devid, info->chip, NULL, + handle_edge_irq, arg->data, "edge"); return 0; } static struct msi_domain_ops dmar_msi_domain_ops = { - .get_hwirq = dmar_msi_get_hwirq, .msi_init = dmar_msi_init, }; static struct msi_domain_info dmar_msi_domain_info = { .ops = &dmar_msi_domain_ops, .chip = &dmar_msi_controller, + .flags = MSI_FLAG_USE_DEF_DOM_OPS, }; static struct irq_domain *dmar_get_irq_domain(void) @@ -363,7 +313,8 @@ static struct irq_domain *dmar_get_irq_domain(void) if (fn) { dmar_domain = msi_create_irq_domain(fn, &dmar_msi_domain_info, x86_vector_domain); - irq_domain_free_fwnode(fn); + if (!dmar_domain) + irq_domain_free_fwnode(fn); } out: mutex_unlock(&dmar_lock); @@ -380,8 +331,9 @@ int dmar_alloc_hwirq(int id, int node, void *arg) 
init_irq_alloc_info(&info, NULL); info.type = X86_IRQ_ALLOC_TYPE_DMAR; - info.dmar_id = id; - info.dmar_data = arg; + info.devid = id; + info.hwirq = id; + info.data = arg; return irq_domain_alloc_irqs(domain, 1, node, &info); } @@ -392,117 +344,7 @@ void dmar_free_hwirq(int irq) } #endif -/* - * MSI message composition - */ -#ifdef CONFIG_HPET_TIMER -static inline int hpet_dev_id(struct irq_domain *domain) +bool arch_restore_msi_irqs(struct pci_dev *dev) { - struct msi_domain_info *info = msi_get_domain_info(domain); - - return (int)(long)info->data; -} - -static void hpet_msi_write_msg(struct irq_data *data, struct msi_msg *msg) -{ - hpet_msi_write(irq_data_get_irq_handler_data(data), msg); -} - -static struct irq_chip hpet_msi_controller __ro_after_init = { - .name = "HPET-MSI", - .irq_unmask = hpet_msi_unmask, - .irq_mask = hpet_msi_mask, - .irq_ack = irq_chip_ack_parent, - .irq_set_affinity = msi_domain_set_affinity, - .irq_retrigger = irq_chip_retrigger_hierarchy, - .irq_compose_msi_msg = irq_msi_compose_msg, - .irq_write_msi_msg = hpet_msi_write_msg, - .flags = IRQCHIP_SKIP_SET_WAKE, -}; - -static irq_hw_number_t hpet_msi_get_hwirq(struct msi_domain_info *info, - msi_alloc_info_t *arg) -{ - return arg->hpet_index; -} - -static int hpet_msi_init(struct irq_domain *domain, - struct msi_domain_info *info, unsigned int virq, - irq_hw_number_t hwirq, msi_alloc_info_t *arg) -{ - irq_set_status_flags(virq, IRQ_MOVE_PCNTXT); - irq_domain_set_info(domain, virq, arg->hpet_index, info->chip, NULL, - handle_edge_irq, arg->hpet_data, "edge"); - - return 0; -} - -static void hpet_msi_free(struct irq_domain *domain, - struct msi_domain_info *info, unsigned int virq) -{ - irq_clear_status_flags(virq, IRQ_MOVE_PCNTXT); + return xen_initdom_restore_msi(dev); } - -static struct msi_domain_ops hpet_msi_domain_ops = { - .get_hwirq = hpet_msi_get_hwirq, - .msi_init = hpet_msi_init, - .msi_free = hpet_msi_free, -}; - -static struct msi_domain_info hpet_msi_domain_info = { - .ops = 
&hpet_msi_domain_ops, - .chip = &hpet_msi_controller, -}; - -struct irq_domain *hpet_create_irq_domain(int hpet_id) -{ - struct msi_domain_info *domain_info; - struct irq_domain *parent, *d; - struct irq_alloc_info info; - struct fwnode_handle *fn; - - if (x86_vector_domain == NULL) - return NULL; - - domain_info = kzalloc(sizeof(*domain_info), GFP_KERNEL); - if (!domain_info) - return NULL; - - *domain_info = hpet_msi_domain_info; - domain_info->data = (void *)(long)hpet_id; - - init_irq_alloc_info(&info, NULL); - info.type = X86_IRQ_ALLOC_TYPE_HPET; - info.hpet_id = hpet_id; - parent = irq_remapping_get_ir_irq_domain(&info); - if (parent == NULL) - parent = x86_vector_domain; - else - hpet_msi_controller.name = "IR-HPET-MSI"; - - fn = irq_domain_alloc_named_id_fwnode(hpet_msi_controller.name, - hpet_id); - if (!fn) { - kfree(domain_info); - return NULL; - } - - d = msi_create_irq_domain(fn, domain_info, parent); - irq_domain_free_fwnode(fn); - return d; -} - -int hpet_assign_irq(struct irq_domain *domain, struct hpet_channel *hc, - int dev_num) -{ - struct irq_alloc_info info; - - init_irq_alloc_info(&info, NULL); - info.type = X86_IRQ_ALLOC_TYPE_HPET; - info.hpet_data = hc; - info.hpet_id = hpet_dev_id(domain); - info.hpet_index = dev_num; - - return irq_domain_alloc_irqs(domain, 1, NUMA_NO_NODE, &info); -} -#endif diff --git a/arch/x86/kernel/apic/probe_32.c b/arch/x86/kernel/apic/probe_32.c index 67b33d67002f..a61f642b1b90 100644 --- a/arch/x86/kernel/apic/probe_32.c +++ b/arch/x86/kernel/apic/probe_32.c @@ -10,6 +10,7 @@ #include <linux/errno.h> #include <linux/smp.h> +#include <asm/io_apic.h> #include <asm/apic.h> #include <asm/acpi.h> @@ -68,16 +69,13 @@ static struct apic apic_default __ro_after_init = { .apic_id_valid = default_apic_id_valid, .apic_id_registered = default_apic_id_registered, - .irq_delivery_mode = dest_Fixed, - /* logical delivery broadcast to all CPUs: */ - .irq_dest_mode = 1, + .delivery_mode = APIC_DELIVERY_MODE_FIXED, + 
.dest_mode_logical = true, .disable_esr = 0, - .dest_logical = APIC_DEST_LOGICAL, - .check_apicid_used = default_check_apicid_used, + .check_apicid_used = default_check_apicid_used, .init_apic_ldr = default_init_apic_ldr, - .ioapic_phys_id_map = default_ioapic_phys_id_map, .setup_apic_routing = setup_apic_flat_routing, .cpu_present_to_apicid = default_cpu_present_to_apicid, @@ -148,7 +146,7 @@ void __init default_setup_apic_routing(void) break; } /* P4 and above */ - /* fall through */ + fallthrough; case X86_VENDOR_HYGON: case X86_VENDOR_AMD: def_to_bigsmp = 1; @@ -169,9 +167,6 @@ void __init default_setup_apic_routing(void) if (apic->setup_apic_routing) apic->setup_apic_routing(); - - if (x86_platform.apic_post_init) - x86_platform.apic_post_init(); } void __init generic_apic_probe(void) diff --git a/arch/x86/kernel/apic/probe_64.c b/arch/x86/kernel/apic/probe_64.c index 29f0e0984557..c46720f185c0 100644 --- a/arch/x86/kernel/apic/probe_64.c +++ b/arch/x86/kernel/apic/probe_64.c @@ -8,6 +8,7 @@ * Martin Bligh, Andi Kleen, James Bottomley, John Stultz, and * James Cleverdon. 
*/ +#include <linux/thread_info.h> #include <asm/apic.h> #include "local.h" @@ -31,9 +32,6 @@ void __init default_setup_apic_routing(void) break; } } - - if (x86_platform.apic_post_init) - x86_platform.apic_post_init(); } int __init default_acpi_madt_oem_check(char *oem_id, char *oem_table_id) diff --git a/arch/x86/kernel/apic/vector.c b/arch/x86/kernel/apic/vector.c index 48293d15f1e1..3e6f6b448f6a 100644 --- a/arch/x86/kernel/apic/vector.c +++ b/arch/x86/kernel/apic/vector.c @@ -161,6 +161,7 @@ static void apic_update_vector(struct irq_data *irqd, unsigned int newvec, apicd->move_in_progress = true; apicd->prev_vector = apicd->vector; apicd->prev_cpu = apicd->cpu; + WARN_ON_ONCE(apicd->cpu == newcpu); } else { irq_matrix_free(vector_matrix, apicd->cpu, apicd->vector, managed); @@ -272,20 +273,24 @@ static int assign_irq_vector_any_locked(struct irq_data *irqd) const struct cpumask *affmsk = irq_data_get_affinity_mask(irqd); int node = irq_data_get_node(irqd); - if (node == NUMA_NO_NODE) - goto all; - /* Try the intersection of @affmsk and node mask */ - cpumask_and(vector_searchmask, cpumask_of_node(node), affmsk); - if (!assign_vector_locked(irqd, vector_searchmask)) - return 0; - /* Try the node mask */ - if (!assign_vector_locked(irqd, cpumask_of_node(node))) - return 0; -all: + if (node != NUMA_NO_NODE) { + /* Try the intersection of @affmsk and node mask */ + cpumask_and(vector_searchmask, cpumask_of_node(node), affmsk); + if (!assign_vector_locked(irqd, vector_searchmask)) + return 0; + } + /* Try the full affinity mask */ cpumask_and(vector_searchmask, affmsk, cpu_online_mask); if (!assign_vector_locked(irqd, vector_searchmask)) return 0; + + if (node != NUMA_NO_NODE) { + /* Try the node mask */ + if (!assign_vector_locked(irqd, cpumask_of_node(node))) + return 0; + } + /* Try the full online mask */ return assign_vector_locked(irqd, cpu_online_mask); } @@ -446,12 +451,10 @@ static int x86_vector_activate(struct irq_domain *dom, struct irq_data *irqd, 
trace_vector_activate(irqd->irq, apicd->is_managed, apicd->can_reserve, reserve); - /* Nothing to do for fixed assigned vectors */ - if (!apicd->can_reserve && !apicd->is_managed) - return 0; - raw_spin_lock_irqsave(&vector_lock, flags); - if (reserve || irqd_is_managed_and_shutdown(irqd)) + if (!apicd->can_reserve && !apicd->is_managed) + assign_irq_vector_any_locked(irqd); + else if (reserve || irqd_is_managed_and_shutdown(irqd)) vector_assign_managed_shutdown(irqd); else if (apicd->is_managed) ret = activate_managed(irqd); @@ -540,6 +543,14 @@ static int x86_vector_alloc_irqs(struct irq_domain *domain, unsigned int virq, if ((info->flags & X86_IRQ_ALLOC_CONTIGUOUS_VECTORS) && nr_irqs > 1) return -ENOSYS; + /* + * Catch any attempt to touch the cascade interrupt on a PIC + * equipped system. + */ + if (WARN_ON_ONCE(info->flags & X86_IRQ_ALLOC_LEGACY && + virq == PIC_CASCADE_IR)) + return -EINVAL; + for (i = 0; i < nr_irqs; i++) { irqd = irq_domain_get_irq_data(domain, virq + i); BUG_ON(!irqd); @@ -557,6 +568,16 @@ static int x86_vector_alloc_irqs(struct irq_domain *domain, unsigned int virq, irqd->hwirq = virq + i; irqd_set_single_target(irqd); /* + * Prevent that any of these interrupts is invoked in + * non interrupt context via e.g. generic_handle_irq() + * as that can corrupt the affinity move state. + */ + irqd_set_handle_enforce_irqctx(irqd); + + /* Don't invoke affinity setter on deactivated interrupts */ + irqd_set_affinity_on_activate(irqd); + + /* * Legacy vectors are already assigned when the IOAPIC * takes them over. They stay on the same vector. 
This is * required for check_timer() to work correctly as it might @@ -627,7 +648,50 @@ static void x86_vector_debug_show(struct seq_file *m, struct irq_domain *d, } #endif +int x86_fwspec_is_ioapic(struct irq_fwspec *fwspec) +{ + if (fwspec->param_count != 1) + return 0; + + if (is_fwnode_irqchip(fwspec->fwnode)) { + const char *fwname = fwnode_get_name(fwspec->fwnode); + return fwname && !strncmp(fwname, "IO-APIC-", 8) && + simple_strtol(fwname+8, NULL, 10) == fwspec->param[0]; + } + return to_of_node(fwspec->fwnode) && + of_device_is_compatible(to_of_node(fwspec->fwnode), + "intel,ce4100-ioapic"); +} + +int x86_fwspec_is_hpet(struct irq_fwspec *fwspec) +{ + if (fwspec->param_count != 1) + return 0; + + if (is_fwnode_irqchip(fwspec->fwnode)) { + const char *fwname = fwnode_get_name(fwspec->fwnode); + return fwname && !strncmp(fwname, "HPET-MSI-", 9) && + simple_strtol(fwname+9, NULL, 10) == fwspec->param[0]; + } + return 0; +} + +static int x86_vector_select(struct irq_domain *d, struct irq_fwspec *fwspec, + enum irq_domain_bus_token bus_token) +{ + /* + * HPET and I/OAPIC cannot be parented in the vector domain + * if IRQ remapping is enabled. APIC IDs above 15 bits are + * only permitted if IRQ remapping is enabled, so check that. 
+ */ + if (apic->apic_id_valid(32768)) + return 0; + + return x86_fwspec_is_ioapic(fwspec) || x86_fwspec_is_hpet(fwspec); +} + static const struct irq_domain_ops x86_vector_domain_ops = { + .select = x86_vector_select, .alloc = x86_vector_alloc_irqs, .free = x86_vector_free_irqs, .activate = x86_vector_activate, @@ -674,11 +738,31 @@ void lapic_assign_legacy_vector(unsigned int irq, bool replace) irq_matrix_assign_system(vector_matrix, ISA_IRQ_VECTOR(irq), replace); } +void __init lapic_update_legacy_vectors(void) +{ + unsigned int i; + + if (IS_ENABLED(CONFIG_X86_IO_APIC) && nr_ioapics > 0) + return; + + /* + * If the IO/APIC is disabled via config, kernel command line or + * lack of enumeration then all legacy interrupts are routed + * through the PIC. Make sure that they are marked as legacy + * vectors. PIC_CASCADE_IRQ has already been marked in + * lapic_assign_system_vectors(). + */ + for (i = 0; i < nr_legacy_irqs(); i++) { + if (i != PIC_CASCADE_IR) + lapic_assign_legacy_vector(i, true); + } +} + void __init lapic_assign_system_vectors(void) { - unsigned int i, vector = 0; + unsigned int i, vector; - for_each_set_bit_from(vector, system_vectors, NR_VECTORS) + for_each_set_bit(vector, system_vectors, NR_VECTORS) irq_matrix_assign_system(vector_matrix, vector, false); if (nr_legacy_irqs() > 1) @@ -689,6 +773,11 @@ void __init lapic_assign_system_vectors(void) /* Mark the preallocated legacy interrupts */ for (i = 0; i < nr_legacy_irqs(); i++) { + /* + * Don't touch the cascade interrupt. It's unusable + * on PIC equipped machines. See the large comment + * in the IO/APIC code. 
+ */ if (i != PIC_CASCADE_IR) irq_matrix_assign(vector_matrix, ISA_IRQ_VECTOR(i)); } @@ -703,11 +792,8 @@ int __init arch_early_irq_init(void) x86_vector_domain = irq_domain_create_tree(fn, &x86_vector_domain_ops, NULL); BUG_ON(x86_vector_domain == NULL); - irq_domain_free_fwnode(fn); irq_set_default_host(x86_vector_domain); - arch_init_msi_domain(x86_vector_domain); - BUG_ON(!alloc_cpumask_var(&vector_searchmask, GFP_KERNEL)); /* @@ -769,20 +855,10 @@ void lapic_offline(void) static int apic_set_affinity(struct irq_data *irqd, const struct cpumask *dest, bool force) { - struct apic_chip_data *apicd = apic_chip_data(irqd); int err; - /* - * Core code can call here for inactive interrupts. For inactive - * interrupts which use managed or reservation mode there is no - * point in going through the vector assignment right now as the - * activation will assign a vector which fits the destination - * cpumask. Let the core code store the destination mask and be - * done with it. - */ - if (!irqd_is_activated(irqd) && - (apicd->is_managed || apicd->can_reserve)) - return IRQ_SET_MASK_OK; + if (WARN_ON_ONCE(!irqd_is_activated(irqd))) + return -EIO; raw_spin_lock(&vector_lock); cpumask_and(vector_searchmask, dest, cpu_online_mask); @@ -822,10 +898,17 @@ void apic_ack_edge(struct irq_data *irqd) apic_ack_irq(irqd); } +static void x86_vector_msi_compose_msg(struct irq_data *data, + struct msi_msg *msg) +{ + __irq_msi_compose_msg(irqd_cfg(data), msg, false); +} + static struct irq_chip lapic_controller = { .name = "APIC", .irq_ack = apic_ack_edge, .irq_set_affinity = apic_set_affinity, + .irq_compose_msi_msg = x86_vector_msi_compose_msg, .irq_retrigger = apic_retrigger_irq, }; @@ -855,13 +938,13 @@ static void free_moved_vector(struct apic_chip_data *apicd) apicd->move_in_progress = 0; } -asmlinkage __visible void __irq_entry smp_irq_move_cleanup_interrupt(void) +DEFINE_IDTENTRY_SYSVEC(sysvec_irq_move_cleanup) { struct hlist_head *clhead = this_cpu_ptr(&cleanup_list); struct 
apic_chip_data *apicd; struct hlist_node *tmp; - entering_ack_irq(); + ack_APIC_irq(); /* Prevent vectors vanishing under us */ raw_spin_lock(&vector_lock); @@ -886,7 +969,6 @@ asmlinkage __visible void __irq_entry smp_irq_move_cleanup_interrupt(void) } raw_spin_unlock(&vector_lock); - exiting_irq(); } static void __send_cleanup_vector(struct apic_chip_data *apicd) @@ -914,7 +996,7 @@ void send_cleanup_vector(struct irq_cfg *cfg) __send_cleanup_vector(apicd); } -static void __irq_complete_move(struct irq_cfg *cfg, unsigned vector) +void irq_complete_move(struct irq_cfg *cfg) { struct apic_chip_data *apicd; @@ -922,15 +1004,16 @@ static void __irq_complete_move(struct irq_cfg *cfg, unsigned vector) if (likely(!apicd->move_in_progress)) return; - if (vector == apicd->vector && apicd->cpu == smp_processor_id()) + /* + * If the interrupt arrived on the new target CPU, cleanup the + * vector on the old target CPU. A vector check is not required + * because an interrupt can never move from one vector to another + * on the same CPU. + */ + if (apicd->cpu == smp_processor_id()) __send_cleanup_vector(apicd); } -void irq_complete_move(struct irq_cfg *cfg) -{ - __irq_complete_move(cfg, ~get_irq_regs()->orig_ax); -} - /* * Called from fixup_irqs() with @desc->lock held and interrupts disabled. */ @@ -995,7 +1078,7 @@ void irq_force_complete_move(struct irq_desc *desc) * * But in case of cpu hotplug this should be a non issue * because if the affinity update happens right before all - * cpus rendevouz in stop machine, there is no way that the + * cpus rendezvous in stop machine, there is no way that the * interrupt can be blocked on the target cpu because all cpus * loops first with interrupts enabled in stop machine, so the * old vector is not yet cleaned up when the interrupt fires. 
@@ -1004,7 +1087,7 @@ void irq_force_complete_move(struct irq_desc *desc) * of the interrupt on the apic/system bus would be delayed * beyond the point where the target cpu disables interrupts * in stop machine. I doubt that it can happen, but at least - * there is a theroretical chance. Virtualization might be + * there is a theoretical chance. Virtualization might be * able to expose this, but AFAICT the IOAPIC emulation is not * as stupid as the real hardware. * @@ -1216,7 +1299,7 @@ static void __init print_PIC(void) pr_debug("... PIC ISR: %04x\n", v); - v = inb(0x4d1) << 8 | inb(0x4d0); + v = inb(PIC_ELCR2) << 8 | inb(PIC_ELCR1); pr_debug("... PIC ELCR: %04x\n", v); } diff --git a/arch/x86/kernel/apic/x2apic_cluster.c b/arch/x86/kernel/apic/x2apic_cluster.c index b0889c48a2ac..e696e22d0531 100644 --- a/arch/x86/kernel/apic/x2apic_cluster.c +++ b/arch/x86/kernel/apic/x2apic_cluster.c @@ -15,9 +15,15 @@ struct cluster_mask { struct cpumask mask; }; -static DEFINE_PER_CPU(u32, x86_cpu_to_logical_apicid); +/* + * __x2apic_send_IPI_mask() possibly needs to read + * x86_cpu_to_logical_apicid for all online cpus in a sequential way. + * Using per cpu variable would cost one cache line per cpu. 
+ */ +static u32 *x86_cpu_to_logical_apicid __read_mostly; + static DEFINE_PER_CPU(cpumask_var_t, ipi_mask); -static DEFINE_PER_CPU(struct cluster_mask *, cluster_masks); +static DEFINE_PER_CPU_READ_MOSTLY(struct cluster_mask *, cluster_masks); static struct cluster_mask *cluster_hotplug_mask; static int x2apic_acpi_madt_oem_check(char *oem_id, char *oem_table_id) @@ -27,9 +33,10 @@ static int x2apic_acpi_madt_oem_check(char *oem_id, char *oem_table_id) static void x2apic_send_IPI(int cpu, int vector) { - u32 dest = per_cpu(x86_cpu_to_logical_apicid, cpu); + u32 dest = x86_cpu_to_logical_apicid[cpu]; - x2apic_wrmsr_fence(); + /* x2apic MSRs are special and need a special fence: */ + weak_wrmsr_fence(); __x2apic_send_IPI_dest(dest, vector, APIC_DEST_LOGICAL); } @@ -41,7 +48,8 @@ __x2apic_send_IPI_mask(const struct cpumask *mask, int vector, int apic_dest) unsigned long flags; u32 dest; - x2apic_wrmsr_fence(); + /* x2apic MSRs are special and need a special fence: */ + weak_wrmsr_fence(); local_irq_save(flags); tmpmsk = this_cpu_cpumask_var_ptr(ipi_mask); @@ -56,12 +64,12 @@ __x2apic_send_IPI_mask(const struct cpumask *mask, int vector, int apic_dest) dest = 0; for_each_cpu_and(clustercpu, tmpmsk, &cmsk->mask) - dest |= per_cpu(x86_cpu_to_logical_apicid, clustercpu); + dest |= x86_cpu_to_logical_apicid[clustercpu]; if (!dest) continue; - __x2apic_send_IPI_dest(dest, vector, apic->dest_logical); + __x2apic_send_IPI_dest(dest, vector, APIC_DEST_LOGICAL); /* Remove cluster CPUs from tmpmask */ cpumask_andnot(tmpmsk, tmpmsk, &cmsk->mask); } @@ -92,7 +100,7 @@ static void x2apic_send_IPI_all(int vector) static u32 x2apic_calc_apicid(unsigned int cpu) { - return per_cpu(x86_cpu_to_logical_apicid, cpu); + return x86_cpu_to_logical_apicid[cpu]; } static void init_x2apic_ldr(void) @@ -101,7 +109,7 @@ static void init_x2apic_ldr(void) u32 cluster, apicid = apic_read(APIC_LDR); unsigned int cpu; - this_cpu_write(x86_cpu_to_logical_apicid, apicid); + 
x86_cpu_to_logical_apicid[smp_processor_id()] = apicid; if (cmsk) goto update; @@ -164,12 +172,21 @@ static int x2apic_dead_cpu(unsigned int dead_cpu) static int x2apic_cluster_probe(void) { + u32 slots; + if (!x2apic_mode) return 0; + slots = max_t(u32, L1_CACHE_BYTES/sizeof(u32), nr_cpu_ids); + x86_cpu_to_logical_apicid = kcalloc(slots, sizeof(u32), GFP_KERNEL); + if (!x86_cpu_to_logical_apicid) + return 0; + if (cpuhp_setup_state(CPUHP_X2APIC_PREPARE, "x86/x2apic:prepare", x2apic_prepare_cpu, x2apic_dead_cpu) < 0) { pr_err("Failed to register X2APIC_PREPARE\n"); + kfree(x86_cpu_to_logical_apicid); + x86_cpu_to_logical_apicid = NULL; return 0; } init_x2apic_ldr(); @@ -184,15 +201,13 @@ static struct apic apic_x2apic_cluster __ro_after_init = { .apic_id_valid = x2apic_apic_id_valid, .apic_id_registered = x2apic_apic_id_registered, - .irq_delivery_mode = dest_Fixed, - .irq_dest_mode = 1, /* logical */ + .delivery_mode = APIC_DELIVERY_MODE_FIXED, + .dest_mode_logical = true, .disable_esr = 0, - .dest_logical = APIC_DEST_LOGICAL, - .check_apicid_used = NULL, + .check_apicid_used = NULL, .init_apic_ldr = init_x2apic_ldr, - .ioapic_phys_id_map = NULL, .setup_apic_routing = NULL, .cpu_present_to_apicid = default_cpu_present_to_apicid, diff --git a/arch/x86/kernel/apic/x2apic_phys.c b/arch/x86/kernel/apic/x2apic_phys.c index bc9693841353..6bde05a86b4e 100644 --- a/arch/x86/kernel/apic/x2apic_phys.c +++ b/arch/x86/kernel/apic/x2apic_phys.c @@ -8,6 +8,12 @@ int x2apic_phys; static struct apic apic_x2apic_phys; +static u32 x2apic_max_apicid __ro_after_init; + +void __init x2apic_set_max_apicid(u32 apicid) +{ + x2apic_max_apicid = apicid; +} static int __init set_x2apic_phys_mode(char *arg) { @@ -37,7 +43,8 @@ static void x2apic_send_IPI(int cpu, int vector) { u32 dest = per_cpu(x86_cpu_to_apicid, cpu); - x2apic_wrmsr_fence(); + /* x2apic MSRs are special and need a special fence: */ + weak_wrmsr_fence(); __x2apic_send_IPI_dest(dest, vector, APIC_DEST_PHYSICAL); } @@ -48,7 
+55,8 @@ __x2apic_send_IPI_mask(const struct cpumask *mask, int vector, int apic_dest) unsigned long this_cpu; unsigned long flags; - x2apic_wrmsr_fence(); + /* x2apic MSRs are special and need a special fence: */ + weak_wrmsr_fence(); local_irq_save(flags); @@ -98,6 +106,9 @@ static int x2apic_phys_probe(void) /* Common x2apic functions, also used by x2apic_cluster */ int x2apic_apic_id_valid(u32 apicid) { + if (x2apic_max_apicid && apicid > x2apic_max_apicid) + return 0; + return 1; } @@ -116,7 +127,8 @@ void __x2apic_send_IPI_shorthand(int vector, u32 which) { unsigned long cfg = __prepare_ICR(which, vector, 0); - x2apic_wrmsr_fence(); + /* x2apic MSRs are special and need a special fence: */ + weak_wrmsr_fence(); native_x2apic_icr_write(cfg, 0); } @@ -148,15 +160,13 @@ static struct apic apic_x2apic_phys __ro_after_init = { .apic_id_valid = x2apic_apic_id_valid, .apic_id_registered = x2apic_apic_id_registered, - .irq_delivery_mode = dest_Fixed, - .irq_dest_mode = 0, /* physical */ + .delivery_mode = APIC_DELIVERY_MODE_FIXED, + .dest_mode_logical = false, .disable_esr = 0, - .dest_logical = 0, - .check_apicid_used = NULL, + .check_apicid_used = NULL, .init_apic_ldr = init_x2apic_ldr, - .ioapic_phys_id_map = NULL, .setup_apic_routing = NULL, .cpu_present_to_apicid = default_cpu_present_to_apicid, diff --git a/arch/x86/kernel/apic/x2apic_uv_x.c b/arch/x86/kernel/apic/x2apic_uv_x.c index ad53b2abc859..482855227964 100644 --- a/arch/x86/kernel/apic/x2apic_uv_x.c +++ b/arch/x86/kernel/apic/x2apic_uv_x.c @@ -5,6 +5,7 @@ * * SGI UV APIC functions (note: not an Intel compatible APIC) * + * (C) Copyright 2020 Hewlett Packard Enterprise Development LP * Copyright (C) 2007-2014 Silicon Graphics, Inc. All rights reserved. 
*/ #include <linux/crash_dump.h> @@ -24,35 +25,32 @@ #include <asm/uv/uv.h> #include <asm/apic.h> -static DEFINE_PER_CPU(int, x2apic_extra_bits); - static enum uv_system_type uv_system_type; static int uv_hubbed_system; static int uv_hubless_system; static u64 gru_start_paddr, gru_end_paddr; -static u64 gru_dist_base, gru_first_node_paddr = -1LL, gru_last_node_paddr; -static u64 gru_dist_lmask, gru_dist_umask; static union uvh_apicid uvh_apicid; +static int uv_node_id; -/* Unpack OEM/TABLE ID's to be NULL terminated strings */ +/* Unpack AT/OEM/TABLE ID's to be NULL terminated strings */ +static u8 uv_archtype[UV_AT_SIZE + 1]; static u8 oem_id[ACPI_OEM_ID_SIZE + 1]; static u8 oem_table_id[ACPI_OEM_TABLE_ID_SIZE + 1]; -/* Information derived from CPUID: */ +/* Information derived from CPUID and some UV MMRs */ static struct { unsigned int apicid_shift; unsigned int apicid_mask; - unsigned int socketid_shift; /* aka pnode_shift for UV1/2/3 */ + unsigned int socketid_shift; /* aka pnode_shift for UV2/3 */ unsigned int pnode_mask; + unsigned int nasid_shift; unsigned int gpa_shift; unsigned int gnode_shift; + unsigned int m_skt; + unsigned int n_skt; } uv_cpuid; -int uv_min_hub_revision_id; -EXPORT_SYMBOL_GPL(uv_min_hub_revision_id); - -unsigned int uv_apicid_hibits; -EXPORT_SYMBOL_GPL(uv_apicid_hibits); +static int uv_min_hub_revision_id; static struct apic apic_x2apic_uv_x; static struct uv_hub_info_s uv_hub_info_node0; @@ -85,20 +83,10 @@ static unsigned long __init uv_early_read_mmr(unsigned long addr) static inline bool is_GRU_range(u64 start, u64 end) { - if (gru_dist_base) { - u64 su = start & gru_dist_umask; /* Upper (incl pnode) bits */ - u64 sl = start & gru_dist_lmask; /* Base offset bits */ - u64 eu = end & gru_dist_umask; - u64 el = end & gru_dist_lmask; - - /* Must reside completely within a single GRU range: */ - return (sl == gru_dist_base && el == gru_dist_base && - su >= gru_first_node_paddr && - su <= gru_last_node_paddr && - eu == su); - } else { - 
return start >= gru_start_paddr && end <= gru_end_paddr; - } + if (!gru_start_paddr) + return false; + + return start >= gru_start_paddr && end <= gru_end_paddr; } static bool uv_is_untracked_pat_range(u64 start, u64 end) @@ -106,43 +94,102 @@ static bool uv_is_untracked_pat_range(u64 start, u64 end) return is_ISA_range(start, end) || is_GRU_range(start, end); } -static int __init early_get_pnodeid(void) +static void __init early_get_pnodeid(void) { - union uvh_node_id_u node_id; - union uvh_rh_gam_config_mmr_u m_n_config; int pnode; - /* Currently, all blades have same revision number */ + uv_cpuid.m_skt = 0; + if (UVH_RH10_GAM_ADDR_MAP_CONFIG) { + union uvh_rh10_gam_addr_map_config_u m_n_config; + + m_n_config.v = uv_early_read_mmr(UVH_RH10_GAM_ADDR_MAP_CONFIG); + uv_cpuid.n_skt = m_n_config.s.n_skt; + uv_cpuid.nasid_shift = 0; + } else if (UVH_RH_GAM_ADDR_MAP_CONFIG) { + union uvh_rh_gam_addr_map_config_u m_n_config; + + m_n_config.v = uv_early_read_mmr(UVH_RH_GAM_ADDR_MAP_CONFIG); + uv_cpuid.n_skt = m_n_config.s.n_skt; + if (is_uv(UV3)) + uv_cpuid.m_skt = m_n_config.s3.m_skt; + if (is_uv(UV2)) + uv_cpuid.m_skt = m_n_config.s2.m_skt; + uv_cpuid.nasid_shift = 1; + } else { + unsigned long GAM_ADDR_MAP_CONFIG = 0; + + WARN(GAM_ADDR_MAP_CONFIG == 0, + "UV: WARN: GAM_ADDR_MAP_CONFIG is not available\n"); + uv_cpuid.n_skt = 0; + uv_cpuid.nasid_shift = 0; + } + + if (is_uv(UV4|UVY)) + uv_cpuid.gnode_shift = 2; /* min partition is 4 sockets */ + + uv_cpuid.pnode_mask = (1 << uv_cpuid.n_skt) - 1; + pnode = (uv_node_id >> uv_cpuid.nasid_shift) & uv_cpuid.pnode_mask; + uv_cpuid.gpa_shift = 46; /* Default unless changed */ + + pr_info("UV: n_skt:%d pnmsk:%x pn:%x\n", + uv_cpuid.n_skt, uv_cpuid.pnode_mask, pnode); +} + +/* Running on a UV Hubbed system, determine which UV Hub Type it is */ +static int __init early_set_hub_type(void) +{ + union uvh_node_id_u node_id; + + /* + * The NODE_ID MMR is always at offset 0. + * Contains the chip part # + revision. 
+ * Node_id field started with 15 bits, + * ... now 7 but upper 8 are masked to 0. + * All blades/nodes have the same part # and hub revision. + */ node_id.v = uv_early_read_mmr(UVH_NODE_ID); - m_n_config.v = uv_early_read_mmr(UVH_RH_GAM_CONFIG_MMR); - uv_min_hub_revision_id = node_id.s.revision; + uv_node_id = node_id.sx.node_id; switch (node_id.s.part_number) { - case UV2_HUB_PART_NUMBER: - case UV2_HUB_PART_NUMBER_X: - uv_min_hub_revision_id += UV2_HUB_REVISION_BASE - 1; + + case UV5_HUB_PART_NUMBER: + uv_min_hub_revision_id = node_id.s.revision + + UV5_HUB_REVISION_BASE; + uv_hub_type_set(UV5); + break; + + /* UV4/4A only have a revision difference */ + case UV4_HUB_PART_NUMBER: + uv_min_hub_revision_id = node_id.s.revision + + UV4_HUB_REVISION_BASE - 1; + uv_hub_type_set(UV4); + if (uv_min_hub_revision_id == UV4A_HUB_REVISION_BASE) + uv_hub_type_set(UV4|UV4A); break; + case UV3_HUB_PART_NUMBER: case UV3_HUB_PART_NUMBER_X: - uv_min_hub_revision_id += UV3_HUB_REVISION_BASE; + uv_min_hub_revision_id = node_id.s.revision + + UV3_HUB_REVISION_BASE; + uv_hub_type_set(UV3); break; - /* Update: UV4A has only a modified revision to indicate HUB fixes */ - case UV4_HUB_PART_NUMBER: - uv_min_hub_revision_id += UV4_HUB_REVISION_BASE - 1; - uv_cpuid.gnode_shift = 2; /* min partition is 4 sockets */ + case UV2_HUB_PART_NUMBER: + case UV2_HUB_PART_NUMBER_X: + uv_min_hub_revision_id = node_id.s.revision + + UV2_HUB_REVISION_BASE - 1; + uv_hub_type_set(UV2); break; + + default: + return 0; } - uv_hub_info->hub_revision = uv_min_hub_revision_id; - uv_cpuid.pnode_mask = (1 << m_n_config.s.n_skt) - 1; - pnode = (node_id.s.node_id >> 1) & uv_cpuid.pnode_mask; - uv_cpuid.gpa_shift = 46; /* Default unless changed */ + pr_info("UV: part#:%x rev:%d rev_id:%d UVtype:0x%x\n", + node_id.s.part_number, node_id.s.revision, + uv_min_hub_revision_id, is_uv(~0)); - pr_info("UV: rev:%d part#:%x nodeid:%04x n_skt:%d pnmsk:%x pn:%x\n", - node_id.s.revision, node_id.s.part_number, 
node_id.s.node_id, - m_n_config.s.n_skt, uv_cpuid.pnode_mask, pnode); - return pnode; + return 1; } static void __init uv_tsc_check_sync(void) @@ -151,42 +198,47 @@ static void __init uv_tsc_check_sync(void) int sync_state; int mmr_shift; char *state; - bool valid; - /* Accommodate different UV arch BIOSes */ + /* UV5 guarantees synced TSCs; do not zero TSC_ADJUST */ + if (!is_uv(UV2|UV3|UV4)) { + mark_tsc_async_resets("UV5+"); + return; + } + + /* UV2,3,4, UV BIOS TSC sync state available */ mmr = uv_early_read_mmr(UVH_TSC_SYNC_MMR); mmr_shift = - is_uv1_hub() ? 0 : is_uv2_hub() ? UVH_TSC_SYNC_SHIFT_UV2K : UVH_TSC_SYNC_SHIFT; - if (mmr_shift) - sync_state = (mmr >> mmr_shift) & UVH_TSC_SYNC_MASK; - else - sync_state = 0; + sync_state = (mmr >> mmr_shift) & UVH_TSC_SYNC_MASK; + /* Check if TSC is valid for all sockets */ switch (sync_state) { case UVH_TSC_SYNC_VALID: state = "in sync"; - valid = true; + mark_tsc_async_resets("UV BIOS"); break; - case UVH_TSC_SYNC_INVALID: - state = "unstable"; - valid = false; + /* If BIOS state unknown, don't do anything */ + case UVH_TSC_SYNC_UNKNOWN: + state = "unknown"; break; + + /* Otherwise, BIOS indicates problem with TSC */ default: - state = "unknown: assuming valid"; - valid = true; + state = "unstable"; + mark_tsc_unstable("UV BIOS"); break; } pr_info("UV: TSC sync state from BIOS:0%d(%s)\n", sync_state, state); - - /* Mark flag that says TSC != 0 is valid for socket 0 */ - if (valid) - mark_tsc_async_resets("UV BIOS"); - else - mark_tsc_unstable("UV BIOS"); } +/* Selector for (4|4A|5) structs */ +#define uvxy_field(sname, field, undef) ( \ + is_uv(UV4A) ? sname.s4a.field : \ + is_uv(UV4) ? sname.s4.field : \ + is_uv(UV3) ? 
sname.s3.field : \ + undef) + /* [Copied from arch/x86/kernel/cpu/topology.c:detect_extended_topology()] */ #define SMT_LEVEL 0 /* Leaf 0xb SMT level */ @@ -240,49 +292,138 @@ static void __init early_get_apic_socketid_shift(void) pr_info("UV: socketid_shift:%d pnode_mask:0x%x\n", uv_cpuid.socketid_shift, uv_cpuid.pnode_mask); } -/* - * Add an extra bit as dictated by bios to the destination apicid of - * interrupts potentially passing through the UV HUB. This prevents - * a deadlock between interrupts and IO port operations. - */ -static void __init uv_set_apicid_hibit(void) +static void __init uv_stringify(int len, char *to, char *from) { - union uv1h_lb_target_physical_apic_id_mask_u apicid_mask; + /* Relies on 'to' being NULL chars so result will be NULL terminated */ + strncpy(to, from, len-1); + + /* Trim trailing spaces */ + (void)strim(to); +} + +/* Find UV arch type entry in UVsystab */ +static unsigned long __init early_find_archtype(struct uv_systab *st) +{ + int i; - if (is_uv1_hub()) { - apicid_mask.v = uv_early_read_mmr(UV1H_LB_TARGET_PHYSICAL_APIC_ID_MASK); - uv_apicid_hibits = apicid_mask.s1.bit_enables & UV_APICID_HIBIT_MASK; + for (i = 0; st->entry[i].type != UV_SYSTAB_TYPE_UNUSED; i++) { + unsigned long ptr = st->entry[i].offset; + + if (!ptr) + continue; + ptr += (unsigned long)st; + if (st->entry[i].type == UV_SYSTAB_TYPE_ARCH_TYPE) + return ptr; } + return 0; } -static void __init uv_stringify(int len, char *to, char *from) +/* Validate UV arch type field in UVsystab */ +static int __init decode_arch_type(unsigned long ptr) { - /* Relies on 'to' being NULL chars so result will be NULL terminated */ - strncpy(to, from, len-1); + struct uv_arch_type_entry *uv_ate = (struct uv_arch_type_entry *)ptr; + int n = strlen(uv_ate->archtype); + + if (n > 0 && n < sizeof(uv_ate->archtype)) { + pr_info("UV: UVarchtype received from BIOS\n"); + uv_stringify(sizeof(uv_archtype), uv_archtype, uv_ate->archtype); + return 1; + } + return 0; } -static int __init 
uv_acpi_madt_oem_check(char *_oem_id, char *_oem_table_id) +/* Determine if UV arch type entry might exist in UVsystab */ +static int __init early_get_arch_type(void) +{ + unsigned long uvst_physaddr, uvst_size, ptr; + struct uv_systab *st; + u32 rev; + int ret; + + uvst_physaddr = get_uv_systab_phys(0); + if (!uvst_physaddr) + return 0; + + st = early_memremap_ro(uvst_physaddr, sizeof(struct uv_systab)); + if (!st) { + pr_err("UV: Cannot access UVsystab, remap failed\n"); + return 0; + } + + rev = st->revision; + if (rev < UV_SYSTAB_VERSION_UV5) { + early_memunmap(st, sizeof(struct uv_systab)); + return 0; + } + + uvst_size = st->size; + early_memunmap(st, sizeof(struct uv_systab)); + st = early_memremap_ro(uvst_physaddr, uvst_size); + if (!st) { + pr_err("UV: Cannot access UVarchtype, remap failed\n"); + return 0; + } + + ptr = early_find_archtype(st); + if (!ptr) { + early_memunmap(st, uvst_size); + return 0; + } + + ret = decode_arch_type(ptr); + early_memunmap(st, uvst_size); + return ret; +} + +/* UV system found, check which APIC MODE BIOS already selected */ +static void __init early_set_apic_mode(void) { - int pnodeid; - int uv_apic; + if (x2apic_enabled()) + uv_system_type = UV_X2APIC; + else + uv_system_type = UV_LEGACY_APIC; +} +static int __init uv_set_system_type(char *_oem_id, char *_oem_table_id) +{ + /* Save OEM_ID passed from ACPI MADT */ uv_stringify(sizeof(oem_id), oem_id, _oem_id); - uv_stringify(sizeof(oem_table_id), oem_table_id, _oem_table_id); - if (strncmp(oem_id, "SGI", 3) != 0) { - if (strncmp(oem_id, "NSGI", 4) != 0) + /* Check if BIOS sent us a UVarchtype */ + if (!early_get_arch_type()) + + /* If not use OEM ID for UVarchtype */ + uv_stringify(sizeof(uv_archtype), uv_archtype, oem_id); + + /* Check if not hubbed */ + if (strncmp(uv_archtype, "SGI", 3) != 0) { + + /* (Not hubbed), check if not hubless */ + if (strncmp(uv_archtype, "NSGI", 4) != 0) + + /* (Not hubless), not a UV */ return 0; - /* UV4 Hubless, CH, (0x11:UV4+Any) */ - if 
(strncmp(oem_id, "NSGI4", 5) == 0) - uv_hubless_system = 0x11; + /* Is UV hubless system */ + uv_hubless_system = 0x01; - /* UV3 Hubless, UV300/MC990X w/o hub (0x9:UV3+Any) */ + /* UV5 Hubless */ + if (strncmp(uv_archtype, "NSGI5", 5) == 0) + uv_hubless_system |= 0x20; + + /* UV4 Hubless: CH */ + else if (strncmp(uv_archtype, "NSGI4", 5) == 0) + uv_hubless_system |= 0x10; + + /* UV3 Hubless: UV300/MC990X w/o hub */ else - uv_hubless_system = 0x9; + uv_hubless_system |= 0x8; + + /* Copy OEM Table ID */ + uv_stringify(sizeof(oem_table_id), oem_table_id, _oem_table_id); - pr_info("UV: OEM IDs %s/%s, HUBLESS(0x%x)\n", - oem_id, oem_table_id, uv_hubless_system); + pr_info("UV: OEM IDs %s/%s, SystemType %d, HUBLESS ID %x\n", + oem_id, oem_table_id, uv_system_type, uv_hubless_system); return 0; } @@ -292,80 +433,70 @@ static int __init uv_acpi_madt_oem_check(char *_oem_id, char *_oem_table_id) return 0; } - /* Set up early hub type field in uv_hub_info for Node 0 */ - uv_cpu_info->p_uv_hub_info = &uv_hub_info_node0; - - /* - * Determine UV arch type. - * SGI: UV100/1000 - * SGI2: UV2000/3000 - * SGI3: UV300 (truncated to 4 chars because of different varieties) - * SGI4: UV400 (truncated to 4 chars because of different varieties) - */ + /* Set hubbed type if true */ uv_hub_info->hub_revision = - !strncmp(oem_id, "SGI4", 4) ? UV4_HUB_REVISION_BASE : - !strncmp(oem_id, "SGI3", 4) ? UV3_HUB_REVISION_BASE : - !strcmp(oem_id, "SGI2") ? UV2_HUB_REVISION_BASE : - !strcmp(oem_id, "SGI") ? UV1_HUB_REVISION_BASE : 0; - - if (uv_hub_info->hub_revision == 0) - goto badbios; + !strncmp(uv_archtype, "SGI5", 4) ? UV5_HUB_REVISION_BASE : + !strncmp(uv_archtype, "SGI4", 4) ? UV4_HUB_REVISION_BASE : + !strncmp(uv_archtype, "SGI3", 4) ? UV3_HUB_REVISION_BASE : + !strcmp(uv_archtype, "SGI2") ? 
UV2_HUB_REVISION_BASE : 0; switch (uv_hub_info->hub_revision) { + case UV5_HUB_REVISION_BASE: + uv_hubbed_system = 0x21; + uv_hub_type_set(UV5); + break; + case UV4_HUB_REVISION_BASE: uv_hubbed_system = 0x11; + uv_hub_type_set(UV4); break; case UV3_HUB_REVISION_BASE: uv_hubbed_system = 0x9; + uv_hub_type_set(UV3); break; case UV2_HUB_REVISION_BASE: uv_hubbed_system = 0x5; + uv_hub_type_set(UV2); break; - case UV1_HUB_REVISION_BASE: - uv_hubbed_system = 0x3; - break; + default: + return 0; } - pnodeid = early_get_pnodeid(); - early_get_apic_socketid_shift(); + /* Get UV hub chip part number & revision */ + early_set_hub_type(); + /* Other UV setup functions */ + early_set_apic_mode(); + early_get_pnodeid(); + early_get_apic_socketid_shift(); x86_platform.is_untracked_pat_range = uv_is_untracked_pat_range; x86_platform.nmi_init = uv_nmi_init; + uv_tsc_check_sync(); - if (!strcmp(oem_table_id, "UVX")) { - /* This is the most common hardware variant: */ - uv_system_type = UV_X2APIC; - uv_apic = 0; - - } else if (!strcmp(oem_table_id, "UVH")) { - /* Only UV1 systems: */ - uv_system_type = UV_NON_UNIQUE_APIC; - x86_platform.legacy.warm_reset = 0; - __this_cpu_write(x2apic_extra_bits, pnodeid << uvh_apicid.s.pnode_shift); - uv_set_apicid_hibit(); - uv_apic = 1; - - } else if (!strcmp(oem_table_id, "UVL")) { - /* Only used for very small systems: */ - uv_system_type = UV_LEGACY_APIC; - uv_apic = 0; + return 1; +} - } else { - goto badbios; - } +/* Called early to probe for the correct APIC driver */ +static int __init uv_acpi_madt_oem_check(char *_oem_id, char *_oem_table_id) +{ + /* Set up early hub info fields for Node 0 */ + uv_cpu_info->p_uv_hub_info = &uv_hub_info_node0; - pr_info("UV: OEM IDs %s/%s, System/HUB Types %d/%d, uv_apic %d\n", oem_id, oem_table_id, uv_system_type, uv_min_hub_revision_id, uv_apic); - uv_tsc_check_sync(); + /* If not UV, return. 
*/ + if (uv_set_system_type(_oem_id, _oem_table_id) == 0) + return 0; + + /* Save for display of the OEM Table ID */ + uv_stringify(sizeof(oem_table_id), oem_table_id, _oem_table_id); - return uv_apic; + pr_info("UV: OEM IDs %s/%s, System/UVType %d/0x%x, HUB RevID %d\n", + oem_id, oem_table_id, uv_system_type, is_uv(UV_ANY), + uv_min_hub_revision_id); -badbios: - pr_err("UV: OEM_ID:%s OEM_TABLE_ID:%s\n", oem_id, oem_table_id); - pr_err("Current BIOS not supported, update kernel and/or BIOS\n"); - BUG(); + return 0; } enum uv_system_type get_uv_system_type(void) @@ -373,6 +504,18 @@ enum uv_system_type get_uv_system_type(void) return uv_system_type; } +int uv_get_hubless_system(void) +{ + return uv_hubless_system; +} +EXPORT_SYMBOL_GPL(uv_get_hubless_system); + +ssize_t uv_get_archtype(char *buf, int len) +{ + return scnprintf(buf, len, "%s/%s", uv_archtype, oem_table_id); +} +EXPORT_SYMBOL_GPL(uv_get_archtype); + int is_uv_system(void) { return uv_system_type != UV_NONE; @@ -385,11 +528,10 @@ int is_uv_hubbed(int uvtype) } EXPORT_SYMBOL_GPL(is_uv_hubbed); -int is_uv_hubless(int uvtype) +static int is_uv_hubless(int uvtype) { return (uv_hubless_system & uvtype); } -EXPORT_SYMBOL_GPL(is_uv_hubless); void **__uv_hub_info_list; EXPORT_SYMBOL_GPL(__uv_hub_info_list); @@ -417,12 +559,6 @@ static __initdata struct uv_gam_range_s *_gr_table; #define SOCK_EMPTY ((unsigned short)~0) -extern int uv_hub_info_version(void) -{ - return UV_HUB_INFO_VERSION; -} -EXPORT_SYMBOL(uv_hub_info_version); - /* Default UV memory block size is 2GB */ static unsigned long mem_block_size __initdata = (2UL << 30); @@ -569,7 +705,6 @@ static int uv_wakeup_secondary(int phys_apicid, unsigned long start_rip) int pnode; pnode = uv_apicid_to_pnode(phys_apicid); - phys_apicid |= uv_apicid_hibits; val = (1UL << UVH_IPI_INT_SEND_SHFT) | (phys_apicid << UVH_IPI_INT_APIC_ID_SHFT) | @@ -590,12 +725,21 @@ static int uv_wakeup_secondary(int phys_apicid, unsigned long start_rip) static void 
uv_send_IPI_one(int cpu, int vector) { - unsigned long apicid; - int pnode; + unsigned long apicid = per_cpu(x86_cpu_to_apicid, cpu); + int pnode = uv_apicid_to_pnode(apicid); + unsigned long dmode, val; + + if (vector == NMI_VECTOR) + dmode = APIC_DELIVERY_MODE_NMI; + else + dmode = APIC_DELIVERY_MODE_FIXED; + + val = (1UL << UVH_IPI_INT_SEND_SHFT) | + (apicid << UVH_IPI_INT_APIC_ID_SHFT) | + (dmode << UVH_IPI_INT_DELIVERY_MODE_SHFT) | + (vector << UVH_IPI_INT_VECTOR_SHFT); - apicid = per_cpu(x86_cpu_to_apicid, cpu); - pnode = uv_apicid_to_pnode(apicid); - uv_hub_send_ipi(pnode, apicid, vector); + uv_write_global_mmr64(pnode, UVH_IPI_INT, val); } static void uv_send_IPI_mask(const struct cpumask *mask, int vector) @@ -649,22 +793,16 @@ static void uv_init_apic_ldr(void) static u32 apic_uv_calc_apicid(unsigned int cpu) { - return apic_default_calc_apicid(cpu) | uv_apicid_hibits; + return apic_default_calc_apicid(cpu); } -static unsigned int x2apic_get_apic_id(unsigned long x) +static unsigned int x2apic_get_apic_id(unsigned long id) { - unsigned int id; - - WARN_ON(preemptible() && num_online_cpus() > 1); - id = x | __this_cpu_read(x2apic_extra_bits); - return id; } static u32 set_apic_id(unsigned int id) { - /* CHECKME: Do we need to mask out the xapic extra bits? 
*/ return id; } @@ -696,15 +834,13 @@ static struct apic apic_x2apic_uv_x __ro_after_init = { .apic_id_valid = uv_apic_id_valid, .apic_id_registered = uv_apic_id_registered, - .irq_delivery_mode = dest_Fixed, - .irq_dest_mode = 0, /* Physical */ + .delivery_mode = APIC_DELIVERY_MODE_FIXED, + .dest_mode_logical = false, .disable_esr = 0, - .dest_logical = APIC_DEST_LOGICAL, - .check_apicid_used = NULL, + .check_apicid_used = NULL, .init_apic_ldr = uv_init_apic_ldr, - .ioapic_phys_id_map = NULL, .setup_apic_routing = NULL, .cpu_present_to_apicid = default_cpu_present_to_apicid, @@ -736,18 +872,13 @@ static struct apic apic_x2apic_uv_x __ro_after_init = { .safe_wait_icr_idle = native_safe_x2apic_wait_icr_idle, }; -static void set_x2apic_extra_bits(int pnode) -{ - __this_cpu_write(x2apic_extra_bits, pnode << uvh_apicid.s.pnode_shift); -} - #define UVH_RH_GAM_ALIAS210_REDIRECT_CONFIG_LENGTH 3 -#define DEST_SHIFT UVH_RH_GAM_ALIAS210_REDIRECT_CONFIG_0_MMR_DEST_BASE_SHFT +#define DEST_SHIFT UVXH_RH_GAM_ALIAS_0_REDIRECT_CONFIG_DEST_BASE_SHFT static __init void get_lowmem_redirect(unsigned long *base, unsigned long *size) { - union uvh_rh_gam_alias210_overlay_config_2_mmr_u alias; - union uvh_rh_gam_alias210_redirect_config_2_mmr_u redirect; + union uvh_rh_gam_alias_2_overlay_config_u alias; + union uvh_rh_gam_alias_2_redirect_config_u redirect; unsigned long m_redirect; unsigned long m_overlay; int i; @@ -755,16 +886,16 @@ static __init void get_lowmem_redirect(unsigned long *base, unsigned long *size) for (i = 0; i < UVH_RH_GAM_ALIAS210_REDIRECT_CONFIG_LENGTH; i++) { switch (i) { case 0: - m_redirect = UVH_RH_GAM_ALIAS210_REDIRECT_CONFIG_0_MMR; - m_overlay = UVH_RH_GAM_ALIAS210_OVERLAY_CONFIG_0_MMR; + m_redirect = UVH_RH_GAM_ALIAS_0_REDIRECT_CONFIG; + m_overlay = UVH_RH_GAM_ALIAS_0_OVERLAY_CONFIG; break; case 1: - m_redirect = UVH_RH_GAM_ALIAS210_REDIRECT_CONFIG_1_MMR; - m_overlay = UVH_RH_GAM_ALIAS210_OVERLAY_CONFIG_1_MMR; + m_redirect = 
UVH_RH_GAM_ALIAS_1_REDIRECT_CONFIG; + m_overlay = UVH_RH_GAM_ALIAS_1_OVERLAY_CONFIG; break; case 2: - m_redirect = UVH_RH_GAM_ALIAS210_REDIRECT_CONFIG_2_MMR; - m_overlay = UVH_RH_GAM_ALIAS210_OVERLAY_CONFIG_2_MMR; + m_redirect = UVH_RH_GAM_ALIAS_2_REDIRECT_CONFIG; + m_overlay = UVH_RH_GAM_ALIAS_2_OVERLAY_CONFIG; break; } alias.v = uv_read_local_mmr(m_overlay); @@ -779,6 +910,7 @@ static __init void get_lowmem_redirect(unsigned long *base, unsigned long *size) } enum map_type {map_wb, map_uc}; +static const char * const mt[] = { "WB", "UC" }; static __init void map_high(char *id, unsigned long base, int pshift, int bshift, int max_pnode, enum map_type map_type) { @@ -790,65 +922,36 @@ static __init void map_high(char *id, unsigned long base, int pshift, int bshift pr_info("UV: Map %s_HI base address NULL\n", id); return; } - pr_debug("UV: Map %s_HI 0x%lx - 0x%lx\n", id, paddr, paddr + bytes); if (map_type == map_uc) init_extra_mapping_uc(paddr, bytes); else init_extra_mapping_wb(paddr, bytes); -} - -static __init void map_gru_distributed(unsigned long c) -{ - union uvh_rh_gam_gru_overlay_config_mmr_u gru; - u64 paddr; - unsigned long bytes; - int nid; - - gru.v = c; - - /* Only base bits 42:28 relevant in dist mode */ - gru_dist_base = gru.v & 0x000007fff0000000UL; - if (!gru_dist_base) { - pr_info("UV: Map GRU_DIST base address NULL\n"); - return; - } - - bytes = 1UL << UVH_RH_GAM_GRU_OVERLAY_CONFIG_MMR_BASE_SHFT; - gru_dist_lmask = ((1UL << uv_hub_info->m_val) - 1) & ~(bytes - 1); - gru_dist_umask = ~((1UL << uv_hub_info->m_val) - 1); - gru_dist_base &= gru_dist_lmask; /* Clear bits above M */ - for_each_online_node(nid) { - paddr = ((u64)uv_node_to_pnode(nid) << uv_hub_info->m_val) | - gru_dist_base; - init_extra_mapping_wb(paddr, bytes); - gru_first_node_paddr = min(paddr, gru_first_node_paddr); - gru_last_node_paddr = max(paddr, gru_last_node_paddr); - } - - /* Save upper (63:M) bits of address only for is_GRU_range */ - gru_first_node_paddr &= gru_dist_umask; 
- gru_last_node_paddr &= gru_dist_umask; - - pr_debug("UV: Map GRU_DIST base 0x%016llx 0x%016llx - 0x%016llx\n", gru_dist_base, gru_first_node_paddr, gru_last_node_paddr); + pr_info("UV: Map %s_HI 0x%lx - 0x%lx %s (%d segments)\n", + id, paddr, paddr + bytes, mt[map_type], max_pnode + 1); } static __init void map_gru_high(int max_pnode) { - union uvh_rh_gam_gru_overlay_config_mmr_u gru; - int shift = UVH_RH_GAM_GRU_OVERLAY_CONFIG_MMR_BASE_SHFT; - unsigned long mask = UVH_RH_GAM_GRU_OVERLAY_CONFIG_MMR_BASE_MASK; - unsigned long base; - - gru.v = uv_read_local_mmr(UVH_RH_GAM_GRU_OVERLAY_CONFIG_MMR); - if (!gru.s.enable) { - pr_info("UV: GRU disabled\n"); + union uvh_rh_gam_gru_overlay_config_u gru; + unsigned long mask, base; + int shift; + + if (UVH_RH_GAM_GRU_OVERLAY_CONFIG) { + gru.v = uv_read_local_mmr(UVH_RH_GAM_GRU_OVERLAY_CONFIG); + shift = UVH_RH_GAM_GRU_OVERLAY_CONFIG_BASE_SHFT; + mask = UVH_RH_GAM_GRU_OVERLAY_CONFIG_BASE_MASK; + } else if (UVH_RH10_GAM_GRU_OVERLAY_CONFIG) { + gru.v = uv_read_local_mmr(UVH_RH10_GAM_GRU_OVERLAY_CONFIG); + shift = UVH_RH10_GAM_GRU_OVERLAY_CONFIG_BASE_SHFT; + mask = UVH_RH10_GAM_GRU_OVERLAY_CONFIG_BASE_MASK; + } else { + pr_err("UV: GRU unavailable (no MMR)\n"); return; } - /* Only UV3 has distributed GRU mode */ - if (is_uv3_hub() && gru.s3.mode) { - map_gru_distributed(gru.v); + if (!gru.s.enable) { + pr_info("UV: GRU disabled (by BIOS)\n"); return; } @@ -860,62 +963,104 @@ static __init void map_gru_high(int max_pnode) static __init void map_mmr_high(int max_pnode) { - union uvh_rh_gam_mmr_overlay_config_mmr_u mmr; - int shift = UVH_RH_GAM_MMR_OVERLAY_CONFIG_MMR_BASE_SHFT; + unsigned long base; + int shift; + bool enable; + + if (UVH_RH10_GAM_MMR_OVERLAY_CONFIG) { + union uvh_rh10_gam_mmr_overlay_config_u mmr; + + mmr.v = uv_read_local_mmr(UVH_RH10_GAM_MMR_OVERLAY_CONFIG); + enable = mmr.s.enable; + base = mmr.s.base; + shift = UVH_RH10_GAM_MMR_OVERLAY_CONFIG_BASE_SHFT; + } else if (UVH_RH_GAM_MMR_OVERLAY_CONFIG) { + union 
uvh_rh_gam_mmr_overlay_config_u mmr; + + mmr.v = uv_read_local_mmr(UVH_RH_GAM_MMR_OVERLAY_CONFIG); + enable = mmr.s.enable; + base = mmr.s.base; + shift = UVH_RH_GAM_MMR_OVERLAY_CONFIG_BASE_SHFT; + } else { + pr_err("UV:%s:RH_GAM_MMR_OVERLAY_CONFIG MMR undefined?\n", + __func__); + return; + } - mmr.v = uv_read_local_mmr(UVH_RH_GAM_MMR_OVERLAY_CONFIG_MMR); - if (mmr.s.enable) - map_high("MMR", mmr.s.base, shift, shift, max_pnode, map_uc); + if (enable) + map_high("MMR", base, shift, shift, max_pnode, map_uc); else pr_info("UV: MMR disabled\n"); } -/* UV3/4 have identical MMIOH overlay configs, UV4A is slightly different */ -static __init void map_mmioh_high_uv34(int index, int min_pnode, int max_pnode) -{ - unsigned long overlay; - unsigned long mmr; - unsigned long base; - unsigned long nasid_mask; - unsigned long m_overlay; - int i, n, shift, m_io, max_io; - int nasid, lnasid, fi, li; - char *id; - - if (index == 0) { - id = "MMIOH0"; - m_overlay = UVH_RH_GAM_MMIOH_OVERLAY_CONFIG0_MMR; - overlay = uv_read_local_mmr(m_overlay); - base = overlay & UVH_RH_GAM_MMIOH_OVERLAY_CONFIG0_MMR_BASE_MASK; - mmr = UVH_RH_GAM_MMIOH_REDIRECT_CONFIG0_MMR; - m_io = (overlay & UVH_RH_GAM_MMIOH_OVERLAY_CONFIG0_MMR_M_IO_MASK) - >> UVH_RH_GAM_MMIOH_OVERLAY_CONFIG0_MMR_M_IO_SHFT; - shift = UVH_RH_GAM_MMIOH_OVERLAY_CONFIG0_MMR_M_IO_SHFT; - n = UVH_RH_GAM_MMIOH_REDIRECT_CONFIG0_MMR_DEPTH; - nasid_mask = UVH_RH_GAM_MMIOH_REDIRECT_CONFIG0_MMR_NASID_MASK; - } else { - id = "MMIOH1"; - m_overlay = UVH_RH_GAM_MMIOH_OVERLAY_CONFIG1_MMR; - overlay = uv_read_local_mmr(m_overlay); - base = overlay & UVH_RH_GAM_MMIOH_OVERLAY_CONFIG1_MMR_BASE_MASK; - mmr = UVH_RH_GAM_MMIOH_REDIRECT_CONFIG1_MMR; - m_io = (overlay & UVH_RH_GAM_MMIOH_OVERLAY_CONFIG1_MMR_M_IO_MASK) - >> UVH_RH_GAM_MMIOH_OVERLAY_CONFIG1_MMR_M_IO_SHFT; - shift = UVH_RH_GAM_MMIOH_OVERLAY_CONFIG1_MMR_M_IO_SHFT; - n = UVH_RH_GAM_MMIOH_REDIRECT_CONFIG1_MMR_DEPTH; - nasid_mask = UVH_RH_GAM_MMIOH_REDIRECT_CONFIG1_MMR_NASID_MASK; +/* Arch 
specific ENUM cases */ +enum mmioh_arch { + UV2_MMIOH = -1, + UVY_MMIOH0, UVY_MMIOH1, + UVX_MMIOH0, UVX_MMIOH1, +}; + +/* Calculate and Map MMIOH Regions */ +static void __init calc_mmioh_map(enum mmioh_arch index, + int min_pnode, int max_pnode, + int shift, unsigned long base, int m_io, int n_io) +{ + unsigned long mmr, nasid_mask; + int nasid, min_nasid, max_nasid, lnasid, mapped; + int i, fi, li, n, max_io; + char id[8]; + + /* One (UV2) mapping */ + if (index == UV2_MMIOH) { + strncpy(id, "MMIOH", sizeof(id)); + max_io = max_pnode; + mapped = 0; + goto map_exit; } - pr_info("UV: %s overlay 0x%lx base:0x%lx m_io:%d\n", id, overlay, base, m_io); - if (!(overlay & UVH_RH_GAM_MMIOH_OVERLAY_CONFIG0_MMR_ENABLE_MASK)) { - pr_info("UV: %s disabled\n", id); + + /* small and large MMIOH mappings */ + switch (index) { + case UVY_MMIOH0: + mmr = UVH_RH10_GAM_MMIOH_REDIRECT_CONFIG0; + nasid_mask = UVH_RH10_GAM_MMIOH_OVERLAY_CONFIG0_BASE_MASK; + n = UVH_RH10_GAM_MMIOH_REDIRECT_CONFIG0_DEPTH; + min_nasid = min_pnode; + max_nasid = max_pnode; + mapped = 1; + break; + case UVY_MMIOH1: + mmr = UVH_RH10_GAM_MMIOH_REDIRECT_CONFIG1; + nasid_mask = UVH_RH10_GAM_MMIOH_OVERLAY_CONFIG1_BASE_MASK; + n = UVH_RH10_GAM_MMIOH_REDIRECT_CONFIG1_DEPTH; + min_nasid = min_pnode; + max_nasid = max_pnode; + mapped = 1; + break; + case UVX_MMIOH0: + mmr = UVH_RH_GAM_MMIOH_REDIRECT_CONFIG0; + nasid_mask = UVH_RH_GAM_MMIOH_OVERLAY_CONFIG0_BASE_MASK; + n = UVH_RH_GAM_MMIOH_REDIRECT_CONFIG0_DEPTH; + min_nasid = min_pnode * 2; + max_nasid = max_pnode * 2; + mapped = 1; + break; + case UVX_MMIOH1: + mmr = UVH_RH_GAM_MMIOH_REDIRECT_CONFIG1; + nasid_mask = UVH_RH_GAM_MMIOH_OVERLAY_CONFIG1_BASE_MASK; + n = UVH_RH_GAM_MMIOH_REDIRECT_CONFIG1_DEPTH; + min_nasid = min_pnode * 2; + max_nasid = max_pnode * 2; + mapped = 1; + break; + default: + pr_err("UV:%s:Invalid mapping type:%d\n", __func__, index); return; } - /* Convert to NASID: */ - min_pnode *= 2; - max_pnode *= 2; - max_io = lnasid = fi = li = -1; + /* 
enum values chosen so (index mod 2) is MMIOH 0/1 (low/high) */ + snprintf(id, sizeof(id), "MMIOH%d", index%2); + max_io = lnasid = fi = li = -1; for (i = 0; i < n; i++) { unsigned long m_redirect = mmr + i * 8; unsigned long redirect = uv_read_local_mmr(m_redirect); @@ -925,9 +1070,12 @@ static __init void map_mmioh_high_uv34(int index, int min_pnode, int max_pnode) pr_info("UV: %s redirect base 0x%lx(@0x%lx) 0x%04x\n", id, redirect, m_redirect, nasid); - /* Invalid NASID: */ - if (nasid < min_pnode || max_pnode < nasid) + /* Invalid NASID check */ + if (nasid < min_nasid || max_nasid < nasid) { + pr_err("UV:%s:Invalid NASID:%x (range:%x..%x)\n", + __func__, index, min_nasid, max_nasid); nasid = -1; + } if (nasid == lnasid) { li = i; @@ -950,7 +1098,8 @@ static __init void map_mmioh_high_uv34(int index, int min_pnode, int max_pnode) } addr1 = (base << shift) + f * (1ULL << m_io); addr2 = (base << shift) + (l + 1) * (1ULL << m_io); - pr_info("UV: %s[%03d..%03d] NASID 0x%04x ADDR 0x%016lx - 0x%016lx\n", id, fi, li, lnasid, addr1, addr2); + pr_info("UV: %s[%03d..%03d] NASID 0x%04x ADDR 0x%016lx - 0x%016lx\n", + id, fi, li, lnasid, addr1, addr2); if (max_io < l) max_io = l; } @@ -958,58 +1107,93 @@ static __init void map_mmioh_high_uv34(int index, int min_pnode, int max_pnode) lnasid = nasid; } - pr_info("UV: %s base:0x%lx shift:%d M_IO:%d MAX_IO:%d\n", id, base, shift, m_io, max_io); +map_exit: + pr_info("UV: %s base:0x%lx shift:%d m_io:%d max_io:%d max_pnode:0x%x\n", + id, base, shift, m_io, max_io, max_pnode); - if (max_io >= 0) + if (max_io >= 0 && !mapped) map_high(id, base, shift, m_io, max_io, map_uc); } static __init void map_mmioh_high(int min_pnode, int max_pnode) { - union uvh_rh_gam_mmioh_overlay_config_mmr_u mmioh; - unsigned long mmr, base; - int shift, enable, m_io, n_io; + /* UVY flavor */ + if (UVH_RH10_GAM_MMIOH_OVERLAY_CONFIG0) { + union uvh_rh10_gam_mmioh_overlay_config0_u mmioh0; + union uvh_rh10_gam_mmioh_overlay_config1_u mmioh1; + + mmioh0.v = 
uv_read_local_mmr(UVH_RH10_GAM_MMIOH_OVERLAY_CONFIG0); + if (unlikely(mmioh0.s.enable == 0)) + pr_info("UV: MMIOH0 disabled\n"); + else + calc_mmioh_map(UVY_MMIOH0, min_pnode, max_pnode, + UVH_RH10_GAM_MMIOH_OVERLAY_CONFIG0_BASE_SHFT, + mmioh0.s.base, mmioh0.s.m_io, mmioh0.s.n_io); - if (is_uv3_hub() || is_uv4_hub()) { - /* Map both MMIOH regions: */ - map_mmioh_high_uv34(0, min_pnode, max_pnode); - map_mmioh_high_uv34(1, min_pnode, max_pnode); + mmioh1.v = uv_read_local_mmr(UVH_RH10_GAM_MMIOH_OVERLAY_CONFIG1); + if (unlikely(mmioh1.s.enable == 0)) + pr_info("UV: MMIOH1 disabled\n"); + else + calc_mmioh_map(UVY_MMIOH1, min_pnode, max_pnode, + UVH_RH10_GAM_MMIOH_OVERLAY_CONFIG1_BASE_SHFT, + mmioh1.s.base, mmioh1.s.m_io, mmioh1.s.n_io); return; } + /* UVX flavor */ + if (UVH_RH_GAM_MMIOH_OVERLAY_CONFIG0) { + union uvh_rh_gam_mmioh_overlay_config0_u mmioh0; + union uvh_rh_gam_mmioh_overlay_config1_u mmioh1; + + mmioh0.v = uv_read_local_mmr(UVH_RH_GAM_MMIOH_OVERLAY_CONFIG0); + if (unlikely(mmioh0.s.enable == 0)) + pr_info("UV: MMIOH0 disabled\n"); + else { + unsigned long base = uvxy_field(mmioh0, base, 0); + int m_io = uvxy_field(mmioh0, m_io, 0); + int n_io = uvxy_field(mmioh0, n_io, 0); + + calc_mmioh_map(UVX_MMIOH0, min_pnode, max_pnode, + UVH_RH_GAM_MMIOH_OVERLAY_CONFIG0_BASE_SHFT, + base, m_io, n_io); + } - if (is_uv1_hub()) { - mmr = UV1H_RH_GAM_MMIOH_OVERLAY_CONFIG_MMR; - shift = UV1H_RH_GAM_MMIOH_OVERLAY_CONFIG_MMR_BASE_SHFT; - mmioh.v = uv_read_local_mmr(mmr); - enable = !!mmioh.s1.enable; - base = mmioh.s1.base; - m_io = mmioh.s1.m_io; - n_io = mmioh.s1.n_io; - } else if (is_uv2_hub()) { - mmr = UV2H_RH_GAM_MMIOH_OVERLAY_CONFIG_MMR; - shift = UV2H_RH_GAM_MMIOH_OVERLAY_CONFIG_MMR_BASE_SHFT; - mmioh.v = uv_read_local_mmr(mmr); - enable = !!mmioh.s2.enable; - base = mmioh.s2.base; - m_io = mmioh.s2.m_io; - n_io = mmioh.s2.n_io; - } else { + mmioh1.v = uv_read_local_mmr(UVH_RH_GAM_MMIOH_OVERLAY_CONFIG1); + if (unlikely(mmioh1.s.enable == 0)) + pr_info("UV: 
MMIOH1 disabled\n"); + else { + unsigned long base = uvxy_field(mmioh1, base, 0); + int m_io = uvxy_field(mmioh1, m_io, 0); + int n_io = uvxy_field(mmioh1, n_io, 0); + + calc_mmioh_map(UVX_MMIOH1, min_pnode, max_pnode, + UVH_RH_GAM_MMIOH_OVERLAY_CONFIG1_BASE_SHFT, + base, m_io, n_io); + } return; } - if (enable) { - max_pnode &= (1 << n_io) - 1; - pr_info("UV: base:0x%lx shift:%d N_IO:%d M_IO:%d max_pnode:0x%x\n", base, shift, m_io, n_io, max_pnode); - map_high("MMIOH", base, shift, m_io, max_pnode, map_uc); - } else { - pr_info("UV: MMIOH disabled\n"); + /* UV2 flavor */ + if (UVH_RH_GAM_MMIOH_OVERLAY_CONFIG) { + union uvh_rh_gam_mmioh_overlay_config_u mmioh; + + mmioh.v = uv_read_local_mmr(UVH_RH_GAM_MMIOH_OVERLAY_CONFIG); + if (unlikely(mmioh.s2.enable == 0)) + pr_info("UV: MMIOH disabled\n"); + else + calc_mmioh_map(UV2_MMIOH, min_pnode, max_pnode, + UV2H_RH_GAM_MMIOH_OVERLAY_CONFIG_BASE_SHFT, + mmioh.s2.base, mmioh.s2.m_io, mmioh.s2.n_io); + return; } } static __init void map_low_mmrs(void) { - init_extra_mapping_uc(UV_GLOBAL_MMR32_BASE, UV_GLOBAL_MMR32_SIZE); - init_extra_mapping_uc(UV_LOCAL_MMR_BASE, UV_LOCAL_MMR_SIZE); + if (UV_GLOBAL_MMR32_BASE) + init_extra_mapping_uc(UV_GLOBAL_MMR32_BASE, UV_GLOBAL_MMR32_SIZE); + + if (UV_LOCAL_MMR_BASE) + init_extra_mapping_uc(UV_LOCAL_MMR_BASE, UV_LOCAL_MMR_SIZE); } static __init void uv_rtc_init(void) @@ -1029,85 +1213,6 @@ static __init void uv_rtc_init(void) } } -/* - * percpu heartbeat timer - */ -static void uv_heartbeat(struct timer_list *timer) -{ - unsigned char bits = uv_scir_info->state; - - /* Flip heartbeat bit: */ - bits ^= SCIR_CPU_HEARTBEAT; - - /* Is this CPU idle? 
*/ - if (idle_cpu(raw_smp_processor_id())) - bits &= ~SCIR_CPU_ACTIVITY; - else - bits |= SCIR_CPU_ACTIVITY; - - /* Update system controller interface reg: */ - uv_set_scir_bits(bits); - - /* Enable next timer period: */ - mod_timer(timer, jiffies + SCIR_CPU_HB_INTERVAL); -} - -static int uv_heartbeat_enable(unsigned int cpu) -{ - while (!uv_cpu_scir_info(cpu)->enabled) { - struct timer_list *timer = &uv_cpu_scir_info(cpu)->timer; - - uv_set_cpu_scir_bits(cpu, SCIR_CPU_HEARTBEAT|SCIR_CPU_ACTIVITY); - timer_setup(timer, uv_heartbeat, TIMER_PINNED); - timer->expires = jiffies + SCIR_CPU_HB_INTERVAL; - add_timer_on(timer, cpu); - uv_cpu_scir_info(cpu)->enabled = 1; - - /* Also ensure that boot CPU is enabled: */ - cpu = 0; - } - return 0; -} - -#ifdef CONFIG_HOTPLUG_CPU -static int uv_heartbeat_disable(unsigned int cpu) -{ - if (uv_cpu_scir_info(cpu)->enabled) { - uv_cpu_scir_info(cpu)->enabled = 0; - del_timer(&uv_cpu_scir_info(cpu)->timer); - } - uv_set_cpu_scir_bits(cpu, 0xff); - return 0; -} - -static __init void uv_scir_register_cpu_notifier(void) -{ - cpuhp_setup_state_nocalls(CPUHP_AP_ONLINE_DYN, "x86/x2apic-uvx:online", - uv_heartbeat_enable, uv_heartbeat_disable); -} - -#else /* !CONFIG_HOTPLUG_CPU */ - -static __init void uv_scir_register_cpu_notifier(void) -{ -} - -static __init int uv_init_heartbeat(void) -{ - int cpu; - - if (is_uv_system()) { - for_each_online_cpu(cpu) - uv_heartbeat_enable(cpu); - } - - return 0; -} - -late_initcall(uv_init_heartbeat); - -#endif /* !CONFIG_HOTPLUG_CPU */ - /* Direct Legacy VGA I/O traffic to designated IOH */ static int uv_set_vga_state(struct pci_dev *pdev, bool decode, unsigned int command_bits, u32 flags) { @@ -1138,9 +1243,6 @@ void uv_cpu_init(void) return; uv_hub_info->nr_online_cpus++; - - if (get_uv_system_type() == UV_NON_UNIQUE_APIC) - set_x2apic_extra_bits(uv_hub_info->pnode); } struct mn { @@ -1150,37 +1252,29 @@ struct mn { unsigned char n_lshift; }; +/* Initialize caller's MN struct and fill in values */ 
static void get_mn(struct mn *mnp) { - union uvh_rh_gam_config_mmr_u m_n_config; - union uv3h_gr0_gam_gr_config_u m_gr_config; - - /* Make sure the whole structure is well initialized: */ memset(mnp, 0, sizeof(*mnp)); - - m_n_config.v = uv_read_local_mmr(UVH_RH_GAM_CONFIG_MMR); - mnp->n_val = m_n_config.s.n_skt; - - if (is_uv4_hub()) { + mnp->n_val = uv_cpuid.n_skt; + if (is_uv(UV4|UVY)) { mnp->m_val = 0; mnp->n_lshift = 0; } else if (is_uv3_hub()) { - mnp->m_val = m_n_config.s3.m_skt; - m_gr_config.v = uv_read_local_mmr(UV3H_GR0_GAM_GR_CONFIG); + union uvyh_gr0_gam_gr_config_u m_gr_config; + + mnp->m_val = uv_cpuid.m_skt; + m_gr_config.v = uv_read_local_mmr(UVH_GR0_GAM_GR_CONFIG); mnp->n_lshift = m_gr_config.s3.m_skt; } else if (is_uv2_hub()) { - mnp->m_val = m_n_config.s2.m_skt; + mnp->m_val = uv_cpuid.m_skt; mnp->n_lshift = mnp->m_val == 40 ? 40 : 39; - } else if (is_uv1_hub()) { - mnp->m_val = m_n_config.s1.m_skt; - mnp->n_lshift = mnp->m_val; } mnp->m_shift = mnp->m_val ? 64 - mnp->m_val : 0; } static void __init uv_init_hub_info(struct uv_hub_info_s *hi) { - union uvh_node_id_u node_id; struct mn mn; get_mn(&mn); @@ -1193,7 +1287,9 @@ static void __init uv_init_hub_info(struct uv_hub_info_s *hi) hi->m_shift = mn.m_shift; hi->n_lshift = mn.n_lshift ? 
mn.n_lshift : 0; hi->hub_revision = uv_hub_info->hub_revision; + hi->hub_type = uv_hub_info->hub_type; hi->pnode_mask = uv_cpuid.pnode_mask; + hi->nasid_shift = uv_cpuid.nasid_shift; hi->min_pnode = _min_pnode; hi->min_socket = _min_socket; hi->pnode_to_socket = _pnode_to_socket; @@ -1202,9 +1298,8 @@ static void __init uv_init_hub_info(struct uv_hub_info_s *hi) hi->gr_table_len = _gr_table_len; hi->gr_table = _gr_table; - node_id.v = uv_read_local_mmr(UVH_NODE_ID); uv_cpuid.gnode_shift = max_t(unsigned int, uv_cpuid.gnode_shift, mn.n_val); - hi->gnode_extra = (node_id.s.node_id & ~((1 << uv_cpuid.gnode_shift) - 1)) >> 1; + hi->gnode_extra = (uv_node_id & ~((1 << uv_cpuid.gnode_shift) - 1)) >> 1; if (mn.m_val) hi->gnode_upper = (u64)hi->gnode_extra << mn.m_val; @@ -1216,7 +1311,9 @@ static void __init uv_init_hub_info(struct uv_hub_info_s *hi) hi->gpa_shift = uv_gp_table->gpa_shift; hi->gpa_mask = (1UL << hi->gpa_shift) - 1; } else { - hi->global_mmr_base = uv_read_local_mmr(UVH_RH_GAM_MMR_OVERLAY_CONFIG_MMR) & ~UV_MMR_ENABLE; + hi->global_mmr_base = + uv_read_local_mmr(UVH_RH_GAM_MMR_OVERLAY_CONFIG) & + ~UV_MMR_ENABLE; hi->global_mmr_shift = _UV_GLOBAL_MMR64_PNODE_SHIFT; } @@ -1227,7 +1324,11 @@ static void __init uv_init_hub_info(struct uv_hub_info_s *hi) /* Show system specific info: */ pr_info("UV: N:%d M:%d m_shift:%d n_lshift:%d\n", hi->n_val, hi->m_val, hi->m_shift, hi->n_lshift); pr_info("UV: gpa_mask/shift:0x%lx/%d pnode_mask:0x%x apic_pns:%d\n", hi->gpa_mask, hi->gpa_shift, hi->pnode_mask, hi->apic_pnode_shift); - pr_info("UV: mmr_base/shift:0x%lx/%ld gru_base/shift:0x%lx/%ld\n", hi->global_mmr_base, hi->global_mmr_shift, hi->global_gru_base, hi->global_gru_shift); + pr_info("UV: mmr_base/shift:0x%lx/%ld\n", hi->global_mmr_base, hi->global_mmr_shift); + if (hi->global_gru_base) + pr_info("UV: gru_base/shift:0x%lx/%ld\n", + hi->global_gru_base, hi->global_gru_shift); + pr_info("UV: gnode_upper:0x%lx gnode_extra:0x%x\n", hi->gnode_upper, hi->gnode_extra); } 
@@ -1245,7 +1346,7 @@ static void __init decode_gam_params(unsigned long ptr) static void __init decode_gam_rng_tbl(unsigned long ptr) { struct uv_gam_range_entry *gre = (struct uv_gam_range_entry *)ptr; - unsigned long lgre = 0; + unsigned long lgre = 0, gend = 0; int index = 0; int sock_min = 999999, pnode_min = 99999; int sock_max = -1, pnode_max = -1; @@ -1279,6 +1380,9 @@ static void __init decode_gam_rng_tbl(unsigned long ptr) flag, size, suffix[order], gre->type, gre->nasid, gre->sockid, gre->pnode); + if (gre->type == UV_GAM_RANGE_TYPE_HOLE) + gend = (unsigned long)gre->limit << UV_GAM_RANGE_SHFT; + /* update to next range start */ lgre = gre->limit; if (sock_min > gre->sockid) @@ -1296,24 +1400,29 @@ static void __init decode_gam_rng_tbl(unsigned long ptr) _max_pnode = pnode_max; _gr_table_len = index; - pr_info("UV: GRT: %d entries, sockets(min:%x,max:%x) pnodes(min:%x,max:%x)\n", index, _min_socket, _max_socket, _min_pnode, _max_pnode); + pr_info("UV: GRT: %d entries, sockets(min:%x,max:%x), pnodes(min:%x,max:%x), gap_end(%d)\n", + index, _min_socket, _max_socket, _min_pnode, _max_pnode, fls64(gend)); } +/* Walk through UVsystab decoding the fields */ static int __init decode_uv_systab(void) { struct uv_systab *st; int i; - /* If system is uv3 or lower, there is no extended UVsystab */ - if (is_uv_hubbed(0xfffffe) < uv(4) && is_uv_hubless(0xfffffe) < uv(4)) - return 0; /* No extended UVsystab required */ - + /* Get mapped UVsystab pointer */ st = uv_systab; + + /* If UVsystab is version 1, there is no extended UVsystab */ + if (st && st->revision == UV_SYSTAB_VERSION_1) + return 0; + if ((!st) || (st->revision < UV_SYSTAB_VERSION_UV4_LATEST)) { int rev = st ? 
st->revision : 0; - pr_err("UV: BIOS UVsystab version(%x) mismatch, expecting(%x)\n", rev, UV_SYSTAB_VERSION_UV4_LATEST); - pr_err("UV: Cannot support UV operations, switching to generic PC\n"); + pr_err("UV: BIOS UVsystab mismatch, (%x < %x)\n", + rev, UV_SYSTAB_VERSION_UV4_LATEST); + pr_err("UV: Does not support UV, switch to non-UV x86_64\n"); uv_system_type = UV_NONE; return -EINVAL; @@ -1325,7 +1434,8 @@ static int __init decode_uv_systab(void) if (!ptr) continue; - ptr = ptr + (unsigned long)st; + /* point to payload */ + ptr += (unsigned long)st; switch (st->entry[i].type) { case UV_SYSTAB_TYPE_GAM_PARAMS: @@ -1335,32 +1445,49 @@ static int __init decode_uv_systab(void) case UV_SYSTAB_TYPE_GAM_RNG_TBL: decode_gam_rng_tbl(ptr); break; + + case UV_SYSTAB_TYPE_ARCH_TYPE: + /* already processed in early startup */ + break; + + default: + pr_err("UV:%s:Unrecognized UV_SYSTAB_TYPE:%d, skipped\n", + __func__, st->entry[i].type); + break; } } return 0; } -/* - * Set up physical blade translations from UVH_NODE_PRESENT_TABLE - * .. NB: UVH_NODE_PRESENT_TABLE is going away, - * .. 
being replaced by GAM Range Table - */ +/* Set up physical blade translations from UVH_NODE_PRESENT_TABLE */ static __init void boot_init_possible_blades(struct uv_hub_info_s *hub_info) { + unsigned long np; int i, uv_pb = 0; - pr_info("UV: NODE_PRESENT_DEPTH = %d\n", UVH_NODE_PRESENT_TABLE_DEPTH); - for (i = 0; i < UVH_NODE_PRESENT_TABLE_DEPTH; i++) { - unsigned long np; - - np = uv_read_local_mmr(UVH_NODE_PRESENT_TABLE + i * 8); - if (np) + if (UVH_NODE_PRESENT_TABLE) { + pr_info("UV: NODE_PRESENT_DEPTH = %d\n", + UVH_NODE_PRESENT_TABLE_DEPTH); + for (i = 0; i < UVH_NODE_PRESENT_TABLE_DEPTH; i++) { + np = uv_read_local_mmr(UVH_NODE_PRESENT_TABLE + i * 8); pr_info("UV: NODE_PRESENT(%d) = 0x%016lx\n", i, np); - + uv_pb += hweight64(np); + } + } + if (UVH_NODE_PRESENT_0) { + np = uv_read_local_mmr(UVH_NODE_PRESENT_0); + pr_info("UV: NODE_PRESENT_0 = 0x%016lx\n", np); + uv_pb += hweight64(np); + } + if (UVH_NODE_PRESENT_1) { + np = uv_read_local_mmr(UVH_NODE_PRESENT_1); + pr_info("UV: NODE_PRESENT_1 = 0x%016lx\n", np); uv_pb += hweight64(np); } if (uv_possible_blades != uv_pb) uv_possible_blades = uv_pb; + + pr_info("UV: number nodes/possible blades %d\n", uv_pb); } static void __init build_socket_tables(void) @@ -1375,11 +1502,11 @@ static void __init build_socket_tables(void) size_t bytes; if (!gre) { - if (is_uv1_hub() || is_uv2_hub() || is_uv3_hub()) { + if (is_uv2_hub() || is_uv3_hub()) { pr_info("UV: No UVsystab socket table, ignoring\n"); return; } - pr_crit("UV: Error: UVsystab address translations not available!\n"); + pr_err("UV: Error: UVsystab address translations not available!\n"); BUG(); } @@ -1492,22 +1619,31 @@ static void check_efi_reboot(void) reboot_type = BOOT_ACPI; } -/* Setup user proc fs files */ +/* + * User proc fs file handling now deprecated. + * Recommend using /sys/firmware/sgi_uv/... instead. 
+ */ static int __maybe_unused proc_hubbed_show(struct seq_file *file, void *data) { + pr_notice_once("%s: using deprecated /proc/sgi_uv/hubbed, use /sys/firmware/sgi_uv/hub_type\n", + current->comm); seq_printf(file, "0x%x\n", uv_hubbed_system); return 0; } static int __maybe_unused proc_hubless_show(struct seq_file *file, void *data) { + pr_notice_once("%s: using deprecated /proc/sgi_uv/hubless, use /sys/firmware/sgi_uv/hubless\n", + current->comm); seq_printf(file, "0x%x\n", uv_hubless_system); return 0; } -static int __maybe_unused proc_oemid_show(struct seq_file *file, void *data) +static int __maybe_unused proc_archtype_show(struct seq_file *file, void *data) { - seq_printf(file, "%s/%s\n", oem_id, oem_table_id); + pr_notice_once("%s: using deprecated /proc/sgi_uv/archtype, use /sys/firmware/sgi_uv/archtype\n", + current->comm); + seq_printf(file, "%s/%s\n", uv_archtype, oem_table_id); return 0; } @@ -1516,7 +1652,7 @@ static __init void uv_setup_proc_files(int hubless) struct proc_dir_entry *pde; pde = proc_mkdir(UV_PROC_NODE, NULL); - proc_create_single("oemid", 0, pde, proc_oemid_show); + proc_create_single("archtype", 0, pde, proc_archtype_show); if (hubless) proc_create_single("hubless", 0, pde, proc_hubless_show); else @@ -1541,6 +1677,9 @@ static __init int uv_system_init_hubless(void) if (rc < 0) return rc; + /* Set section block size for current node memory */ + set_block_size(); + /* Create user access node */ if (rc >= 0) uv_setup_proc_files(1); @@ -1555,10 +1694,10 @@ static void __init uv_system_init_hub(void) struct uv_hub_info_s hub_info = {0}; int bytes, cpu, nodeid; unsigned short min_pnode = 9999, max_pnode = 0; - char *hub = is_uv4_hub() ? "UV400" : + char *hub = is_uv5_hub() ? "UV500" : + is_uv4_hub() ? "UV400" : is_uv3_hub() ? "UV300" : - is_uv2_hub() ? "UV2000/3000" : - is_uv1_hub() ? "UV100/1000" : NULL; + is_uv2_hub() ? 
"UV2000/3000" : NULL; if (!hub) { pr_err("UV: Unknown/unsupported UV hub\n"); @@ -1568,12 +1707,14 @@ static void __init uv_system_init_hub(void) map_low_mmrs(); - /* Get uv_systab for decoding: */ + /* Get uv_systab for decoding, setup UV BIOS calls */ uv_bios_init(); /* If there's an UVsystab problem then abort UV init: */ - if (decode_uv_systab() < 0) + if (decode_uv_systab() < 0) { + pr_err("UV: Mangled UVsystab format\n"); return; + } build_socket_tables(); build_uv_gr_table(); @@ -1644,8 +1785,6 @@ static void __init uv_system_init_hub(void) uv_hub_info_list(numa_node_id)->pnode = pnode; else if (uv_cpu_hub_info(cpu)->pnode == 0xffff) uv_cpu_hub_info(cpu)->pnode = pnode; - - uv_cpu_scir_info(cpu)->offset = uv_scir_offset(apicid); } for_each_node(nodeid) { @@ -1674,7 +1813,6 @@ static void __init uv_system_init_hub(void) uv_nmi_setup(); uv_cpu_init(); - uv_scir_register_cpu_notifier(); uv_setup_proc_files(0); /* Register Legacy VGA I/O redirection handler: */ diff --git a/arch/x86/kernel/apm_32.c b/arch/x86/kernel/apm_32.c index 660270359d39..60e330cdbd17 100644 --- a/arch/x86/kernel/apm_32.c +++ b/arch/x86/kernel/apm_32.c @@ -94,7 +94,7 @@ * Remove APM dependencies in arch/i386/kernel/process.c * Remove APM dependencies in drivers/char/sysrq.c * Reset time across standby. - * Allow more inititialisation on SMP. + * Allow more initialisation on SMP. * Remove CONFIG_APM_POWER_OFF and make it boot time * configurable (default on). * Make debug only a boot time parameter (remove APM_DEBUG). 
@@ -232,6 +232,7 @@ #include <asm/paravirt.h> #include <asm/reboot.h> #include <asm/nospec-branch.h> +#include <asm/ibt.h> #if defined(CONFIG_APM_DISPLAY_BLANK) && defined(CONFIG_VT) extern int (*console_blank_hook)(int); @@ -598,6 +599,7 @@ static long __apm_bios_call(void *_call) struct desc_struct save_desc_40; struct desc_struct *gdt; struct apm_bios_call *call = _call; + u64 ibt; cpu = get_cpu(); BUG_ON(cpu != 0); @@ -607,11 +609,13 @@ static long __apm_bios_call(void *_call) apm_irq_save(flags); firmware_restrict_branch_speculation_start(); + ibt = ibt_save(); APM_DO_SAVE_SEGS; apm_bios_call_asm(call->func, call->ebx, call->ecx, &call->eax, &call->ebx, &call->ecx, &call->edx, &call->esi); APM_DO_RESTORE_SEGS; + ibt_restore(ibt); firmware_restrict_branch_speculation_end(); apm_irq_restore(flags); gdt[0x40 / 8] = save_desc_40; @@ -676,6 +680,7 @@ static long __apm_bios_call_simple(void *_call) struct desc_struct save_desc_40; struct desc_struct *gdt; struct apm_bios_call *call = _call; + u64 ibt; cpu = get_cpu(); BUG_ON(cpu != 0); @@ -685,10 +690,12 @@ static long __apm_bios_call_simple(void *_call) apm_irq_save(flags); firmware_restrict_branch_speculation_start(); + ibt = ibt_save(); APM_DO_SAVE_SEGS; error = apm_bios_call_simple_asm(call->func, call->ebx, call->ecx, &call->eax); APM_DO_RESTORE_SEGS; + ibt_restore(ibt); firmware_restrict_branch_speculation_end(); apm_irq_restore(flags); gdt[0x40 / 8] = save_desc_40; @@ -766,7 +773,7 @@ static int apm_driver_version(u_short *val) * not cleared until it is acknowledged. * * Additional information is returned in the info pointer, providing - * that APM 1.2 is in use. If no messges are pending the value 0x80 + * that APM 1.2 is in use. If no messages are pending the value 0x80 * is returned (No power management events pending). 
*/ static int apm_get_event(apm_event_t *event, apm_eventinfo_t *info) @@ -1025,7 +1032,7 @@ static int apm_enable_power_management(int enable) * status which gives the rough battery status, and current power * source. The bat value returned give an estimate as a percentage * of life and a status value for the battery. The estimated life - * if reported is a lifetime in secodnds/minutes at current powwer + * if reported is a lifetime in seconds/minutes at current power * consumption. */ diff --git a/arch/x86/kernel/asm-offsets.c b/arch/x86/kernel/asm-offsets.c index 5c7ee3df4d0b..437308004ef2 100644 --- a/arch/x86/kernel/asm-offsets.c +++ b/arch/x86/kernel/asm-offsets.c @@ -18,6 +18,7 @@ #include <asm/bootparam.h> #include <asm/suspend.h> #include <asm/tlbflush.h> +#include <asm/tdx.h> #ifdef CONFIG_XEN #include <xen/interface/xen.h> @@ -38,12 +39,6 @@ static void __used common(void) #endif BLANK(); - OFFSET(TASK_addr_limit, task_struct, thread.addr_limit); - - BLANK(); - OFFSET(crypto_tfm_ctx_offset, crypto_tfm, __crt_ctx); - - BLANK(); OFFSET(pbe_address, pbe, address); OFFSET(pbe_orig_address, pbe, orig_address); OFFSET(pbe_next, pbe, next); @@ -64,14 +59,6 @@ static void __used common(void) OFFSET(IA32_RT_SIGFRAME_sigcontext, rt_sigframe_ia32, uc.uc_mcontext); #endif -#ifdef CONFIG_PARAVIRT_XXL - BLANK(); - OFFSET(PV_IRQ_irq_disable, paravirt_patch_template, irq.irq_disable); - OFFSET(PV_IRQ_irq_enable, paravirt_patch_template, irq.irq_enable); - OFFSET(PV_CPU_iret, paravirt_patch_template, cpu.iret); - OFFSET(PV_MMU_read_cr2, paravirt_patch_template, mmu.read_cr2); -#endif - #ifdef CONFIG_XEN BLANK(); OFFSET(XEN_vcpu_info_mask, vcpu_info, evtchn_upcall_mask); @@ -80,6 +67,22 @@ static void __used common(void) #endif BLANK(); + OFFSET(TDX_MODULE_rcx, tdx_module_output, rcx); + OFFSET(TDX_MODULE_rdx, tdx_module_output, rdx); + OFFSET(TDX_MODULE_r8, tdx_module_output, r8); + OFFSET(TDX_MODULE_r9, tdx_module_output, r9); + OFFSET(TDX_MODULE_r10, tdx_module_output, 
r10); + OFFSET(TDX_MODULE_r11, tdx_module_output, r11); + + BLANK(); + OFFSET(TDX_HYPERCALL_r10, tdx_hypercall_args, r10); + OFFSET(TDX_HYPERCALL_r11, tdx_hypercall_args, r11); + OFFSET(TDX_HYPERCALL_r12, tdx_hypercall_args, r12); + OFFSET(TDX_HYPERCALL_r13, tdx_hypercall_args, r13); + OFFSET(TDX_HYPERCALL_r14, tdx_hypercall_args, r14); + OFFSET(TDX_HYPERCALL_r15, tdx_hypercall_args, r15); + + BLANK(); OFFSET(BP_scratch, boot_params, scratch); OFFSET(BP_secure_boot, boot_params, secure_boot); OFFSET(BP_loadflags, boot_params, hdr.loadflags); @@ -88,7 +91,6 @@ static void __used common(void) OFFSET(BP_kernel_alignment, boot_params, hdr.kernel_alignment); OFFSET(BP_init_size, boot_params, hdr.init_size); OFFSET(BP_pref_address, boot_params, hdr.pref_address); - OFFSET(BP_code32_start, boot_params, hdr.code32_start); BLANK(); DEFINE(PTREGS_SIZE, sizeof(struct pt_regs)); diff --git a/arch/x86/kernel/asm-offsets_32.c b/arch/x86/kernel/asm-offsets_32.c index 82826f2275cc..2b411cd00a4e 100644 --- a/arch/x86/kernel/asm-offsets_32.c +++ b/arch/x86/kernel/asm-offsets_32.c @@ -3,12 +3,9 @@ # error "Please do not build this file directly, build asm-offsets.c instead" #endif -#include <asm/ucontext.h> +#include <linux/efi.h> -#define __SYSCALL_I386(nr, sym, qual) [nr] = 1, -static char syscalls[] = { -#include <asm/syscalls_32.h> -}; +#include <asm/ucontext.h> /* workaround for a warning with -Wmissing-prototypes */ void foo(void); @@ -56,12 +53,6 @@ void foo(void) offsetof(struct cpu_entry_area, tss.x86_tss.sp1) - offsetofend(struct cpu_entry_area, entry_stack_page.stack)); -#ifdef CONFIG_STACKPROTECTOR - BLANK(); - OFFSET(stack_canary_offset, stack_canary, canary); -#endif - BLANK(); - DEFINE(__NR_syscall_max, sizeof(syscalls) - 1); - DEFINE(NR_syscalls, sizeof(syscalls)); + DEFINE(EFI_svam, offsetof(efi_runtime_services_t, set_virtual_address_map)); } diff --git a/arch/x86/kernel/asm-offsets_64.c b/arch/x86/kernel/asm-offsets_64.c index 24d2fde30d00..9b698215d261 100644 --- 
a/arch/x86/kernel/asm-offsets_64.c +++ b/arch/x86/kernel/asm-offsets_64.c @@ -5,31 +5,7 @@ #include <asm/ia32.h> -#define __SYSCALL_64(nr, sym, qual) [nr] = 1, -#define __SYSCALL_X32(nr, sym, qual) -static char syscalls_64[] = { -#include <asm/syscalls_64.h> -}; -#undef __SYSCALL_64 -#undef __SYSCALL_X32 - -#ifdef CONFIG_X86_X32_ABI -#define __SYSCALL_64(nr, sym, qual) -#define __SYSCALL_X32(nr, sym, qual) [nr] = 1, -static char syscalls_x32[] = { -#include <asm/syscalls_64.h> -}; -#undef __SYSCALL_64 -#undef __SYSCALL_X32 -#endif - -#define __SYSCALL_I386(nr, sym, qual) [nr] = 1, -static char syscalls_ia32[] = { -#include <asm/syscalls_32.h> -}; -#undef __SYSCALL_I386 - -#if defined(CONFIG_KVM_GUEST) && defined(CONFIG_PARAVIRT_SPINLOCKS) +#if defined(CONFIG_KVM_GUEST) #include <asm/kvm_para.h> #endif @@ -37,9 +13,6 @@ int main(void) { #ifdef CONFIG_PARAVIRT #ifdef CONFIG_PARAVIRT_XXL - OFFSET(PV_CPU_usergs_sysret64, paravirt_patch_template, - cpu.usergs_sysret64); - OFFSET(PV_CPU_swapgs, paravirt_patch_template, cpu.swapgs); #ifdef CONFIG_DEBUG_ENTRY OFFSET(PV_IRQ_save_fl, paravirt_patch_template, irq.save_fl); #endif @@ -47,7 +20,7 @@ int main(void) BLANK(); #endif -#if defined(CONFIG_KVM_GUEST) && defined(CONFIG_PARAVIRT_SPINLOCKS) +#if defined(CONFIG_KVM_GUEST) OFFSET(KVM_STEAL_TIME_preempted, kvm_steal_time, preempted); BLANK(); #endif @@ -81,26 +54,11 @@ int main(void) BLANK(); #undef ENTRY - OFFSET(TSS_ist, tss_struct, x86_tss.ist); - DEFINE(DB_STACK_OFFSET, offsetof(struct cea_exception_stacks, DB_stack) - - offsetof(struct cea_exception_stacks, DB1_stack)); BLANK(); #ifdef CONFIG_STACKPROTECTOR DEFINE(stack_canary_offset, offsetof(struct fixed_percpu_data, stack_canary)); BLANK(); #endif - - DEFINE(__NR_syscall_max, sizeof(syscalls_64) - 1); - DEFINE(NR_syscalls, sizeof(syscalls_64)); - -#ifdef CONFIG_X86_X32_ABI - DEFINE(__NR_syscall_x32_max, sizeof(syscalls_x32) - 1); - DEFINE(X32_NR_syscalls, sizeof(syscalls_x32)); -#endif - - 
DEFINE(__NR_syscall_compat_max, sizeof(syscalls_ia32) - 1); - DEFINE(IA32_NR_syscalls, sizeof(syscalls_ia32)); - return 0; } diff --git a/arch/x86/kernel/audit_64.c b/arch/x86/kernel/audit_64.c index e1efe44ebefc..44c3601cfdc4 100644 --- a/arch/x86/kernel/audit_64.c +++ b/arch/x86/kernel/audit_64.c @@ -3,6 +3,7 @@ #include <linux/types.h> #include <linux/audit.h> #include <asm/unistd.h> +#include <asm/audit.h> static unsigned dir_class[] = { #include <asm-generic/audit_dir_write.h> @@ -41,20 +42,21 @@ int audit_classify_arch(int arch) int audit_classify_syscall(int abi, unsigned syscall) { #ifdef CONFIG_IA32_EMULATION - extern int ia32_classify_syscall(unsigned); if (abi == AUDIT_ARCH_I386) return ia32_classify_syscall(syscall); #endif switch(syscall) { case __NR_open: - return 2; + return AUDITSC_OPEN; case __NR_openat: - return 3; + return AUDITSC_OPENAT; case __NR_execve: case __NR_execveat: - return 5; + return AUDITSC_EXECVE; + case __NR_openat2: + return AUDITSC_OPENAT2; default: - return 0; + return AUDITSC_NATIVE; } } diff --git a/arch/x86/kernel/cfi.c b/arch/x86/kernel/cfi.c new file mode 100644 index 000000000000..8674a5c0c031 --- /dev/null +++ b/arch/x86/kernel/cfi.c @@ -0,0 +1,86 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Clang Control Flow Integrity (CFI) support. + * + * Copyright (C) 2022 Google LLC + */ +#include <asm/cfi.h> +#include <asm/insn.h> +#include <asm/insn-eval.h> +#include <linux/string.h> + +/* + * Returns the target address and the expected type when regs->ip points + * to a compiler-generated CFI trap. 
+ */ +static bool decode_cfi_insn(struct pt_regs *regs, unsigned long *target, + u32 *type) +{ + char buffer[MAX_INSN_SIZE]; + struct insn insn; + int offset = 0; + + *target = *type = 0; + + /* + * The compiler generates the following instruction sequence + * for indirect call checks: + * + * movl -<id>, %r10d ; 6 bytes + * addl -4(%reg), %r10d ; 4 bytes + * je .Ltmp1 ; 2 bytes + * ud2 ; <- regs->ip + * .Ltmp1: + * + * We can decode the expected type and the target address from the + * movl/addl instructions. + */ + if (copy_from_kernel_nofault(buffer, (void *)regs->ip - 12, MAX_INSN_SIZE)) + return false; + if (insn_decode_kernel(&insn, &buffer[offset])) + return false; + if (insn.opcode.value != 0xBA) + return false; + + *type = -(u32)insn.immediate.value; + + if (copy_from_kernel_nofault(buffer, (void *)regs->ip - 6, MAX_INSN_SIZE)) + return false; + if (insn_decode_kernel(&insn, &buffer[offset])) + return false; + if (insn.opcode.value != 0x3) + return false; + + /* Read the target address from the register. */ + offset = insn_get_modrm_rm_off(&insn, regs); + if (offset < 0) + return false; + + *target = *(unsigned long *)((void *)regs + offset); + + return true; +} + +/* + * Checks if a ud2 trap is because of a CFI failure, and handles the trap + * if needed. Returns a bug_trap_type value similarly to report_bug. + */ +enum bug_trap_type handle_cfi_failure(struct pt_regs *regs) +{ + unsigned long target; + u32 type; + + if (!is_cfi_trap(regs->ip)) + return BUG_TRAP_TYPE_NONE; + + if (!decode_cfi_insn(regs, &target, &type)) + return report_cfi_failure_noaddr(regs, regs->ip); + + return report_cfi_failure(regs, regs->ip, &target, type); +} + +/* + * Ensure that __kcfi_typeid_ symbols are emitted for functions that may + * not be indirectly called with all configurations. 
+ */ +__ADDRESSABLE(__memcpy) diff --git a/arch/x86/kernel/cpu/.gitignore b/arch/x86/kernel/cpu/.gitignore index 667df55a4399..0bca7ef7426a 100644 --- a/arch/x86/kernel/cpu/.gitignore +++ b/arch/x86/kernel/cpu/.gitignore @@ -1 +1,2 @@ +# SPDX-License-Identifier: GPL-2.0-only capflags.c diff --git a/arch/x86/kernel/cpu/Makefile b/arch/x86/kernel/cpu/Makefile index 7dc4ad68eb41..f10a921ee756 100644 --- a/arch/x86/kernel/cpu/Makefile +++ b/arch/x86/kernel/cpu/Makefile @@ -12,10 +12,13 @@ endif # If these files are instrumented, boot hangs during the first second. KCOV_INSTRUMENT_common.o := n KCOV_INSTRUMENT_perf_event.o := n +KMSAN_SANITIZE_common.o := n + +# As above, instrumenting secondary CPU boot code causes boot hangs. +KCSAN_SANITIZE_common.o := n # Make sure load_percpu_segment has no stackprotector -nostackp := $(call cc-option, -fno-stack-protector) -CFLAGS_common.o := $(nostackp) +CFLAGS_common.o := -fno-stack-protector obj-y := cacheinfo.o scattered.o topology.o obj-y += common.o @@ -41,11 +44,13 @@ obj-$(CONFIG_CPU_SUP_CENTAUR) += centaur.o obj-$(CONFIG_CPU_SUP_TRANSMETA_32) += transmeta.o obj-$(CONFIG_CPU_SUP_UMC_32) += umc.o obj-$(CONFIG_CPU_SUP_ZHAOXIN) += zhaoxin.o +obj-$(CONFIG_CPU_SUP_VORTEX_32) += vortex.o obj-$(CONFIG_X86_MCE) += mce/ obj-$(CONFIG_MTRR) += mtrr/ obj-$(CONFIG_MICROCODE) += microcode/ obj-$(CONFIG_X86_CPU_RESCTRL) += resctrl/ +obj-$(CONFIG_X86_SGX) += sgx/ obj-$(CONFIG_X86_LOCAL_APIC) += perfctr-watchdog.o diff --git a/arch/x86/kernel/cpu/acrn.c b/arch/x86/kernel/cpu/acrn.c index 676022e71791..485441b7f030 100644 --- a/arch/x86/kernel/cpu/acrn.c +++ b/arch/x86/kernel/cpu/acrn.c @@ -10,36 +10,37 @@ */ #include <linux/interrupt.h> + #include <asm/acrn.h> #include <asm/apic.h> +#include <asm/cpufeatures.h> #include <asm/desc.h> #include <asm/hypervisor.h> +#include <asm/idtentry.h> #include <asm/irq_regs.h> -static uint32_t __init acrn_detect(void) +static u32 __init acrn_detect(void) { - return 
hypervisor_cpuid_base("ACRNACRNACRN\0\0", 0); + return acrn_cpuid_base(); } static void __init acrn_init_platform(void) { /* Setup the IDT for ACRN hypervisor callback */ - alloc_intr_gate(HYPERVISOR_CALLBACK_VECTOR, acrn_hv_callback_vector); + alloc_intr_gate(HYPERVISOR_CALLBACK_VECTOR, asm_sysvec_acrn_hv_callback); + + x86_platform.calibrate_tsc = acrn_get_tsc_khz; + x86_platform.calibrate_cpu = acrn_get_tsc_khz; } static bool acrn_x2apic_available(void) { - /* - * x2apic is not supported for now. Future enablement will have to check - * X86_FEATURE_X2APIC to determine whether x2apic is supported in the - * guest. - */ - return false; + return boot_cpu_has(X86_FEATURE_X2APIC); } static void (*acrn_intr_handler)(void); -__visible void __irq_entry acrn_hv_vector_handler(struct pt_regs *regs) +DEFINE_IDTENTRY_SYSVEC(sysvec_acrn_hv_callback) { struct pt_regs *old_regs = set_irq_regs(regs); @@ -50,16 +51,27 @@ __visible void __irq_entry acrn_hv_vector_handler(struct pt_regs *regs) * will block the interrupt whose vector is lower than * HYPERVISOR_CALLBACK_VECTOR. 
*/ - entering_ack_irq(); + ack_APIC_irq(); inc_irq_stat(irq_hv_callback_count); if (acrn_intr_handler) acrn_intr_handler(); - exiting_irq(); set_irq_regs(old_regs); } +void acrn_setup_intr_handler(void (*handler)(void)) +{ + acrn_intr_handler = handler; +} +EXPORT_SYMBOL_GPL(acrn_setup_intr_handler); + +void acrn_remove_intr_handler(void) +{ + acrn_intr_handler = NULL; +} +EXPORT_SYMBOL_GPL(acrn_remove_intr_handler); + const __initconst struct hypervisor_x86 x86_hyper_acrn = { .name = "ACRN", .detect = acrn_detect, diff --git a/arch/x86/kernel/cpu/amd.c b/arch/x86/kernel/cpu/amd.c index 1f875fbe1384..860b60273df3 100644 --- a/arch/x86/kernel/cpu/amd.c +++ b/arch/x86/kernel/cpu/amd.c @@ -15,13 +15,14 @@ #include <asm/cpu.h> #include <asm/spec-ctrl.h> #include <asm/smp.h> +#include <asm/numa.h> #include <asm/pci-direct.h> #include <asm/delay.h> #include <asm/debugreg.h> +#include <asm/resctrl.h> #ifdef CONFIG_X86_64 # include <asm/mmconfig.h> -# include <asm/set_memory.h> #endif #include "cpu.h" @@ -328,7 +329,6 @@ static void legacy_fixup_core_id(struct cpuinfo_x86 *c) */ static void amd_get_topology(struct cpuinfo_x86 *c) { - u8 node_id; int cpu = smp_processor_id(); /* get information required for multi-node processors */ @@ -338,7 +338,7 @@ static void amd_get_topology(struct cpuinfo_x86 *c) cpuid(0x8000001e, &eax, &ebx, &ecx, &edx); - node_id = ecx & 0xff; + c->cpu_die_id = ecx & 0xff; if (c->x86 == 0x15) c->cu_id = ebx & 0xff; @@ -358,15 +358,15 @@ static void amd_get_topology(struct cpuinfo_x86 *c) if (!err) c->x86_coreid_bits = get_count_order(c->x86_max_cores); - cacheinfo_amd_init_llc_id(c, cpu, node_id); + cacheinfo_amd_init_llc_id(c, cpu); } else if (cpu_has(c, X86_FEATURE_NODEID_MSR)) { u64 value; rdmsrl(MSR_FAM10H_NODE_ID, value); - node_id = value & 7; + c->cpu_die_id = value & 7; - per_cpu(cpu_llc_id, cpu) = node_id; + per_cpu(cpu_llc_id, cpu) = c->cpu_die_id; } else return; @@ -391,15 +391,9 @@ static void amd_detect_cmp(struct cpuinfo_x86 *c) /* 
Convert the initial APIC ID into the socket ID */ c->phys_proc_id = c->initial_apicid >> bits; /* use socket ID also for last level cache */ - per_cpu(cpu_llc_id, cpu) = c->phys_proc_id; + per_cpu(cpu_llc_id, cpu) = c->cpu_die_id = c->phys_proc_id; } -u16 amd_get_nb_id(int cpu) -{ - return per_cpu(cpu_llc_id, cpu); -} -EXPORT_SYMBOL_GPL(amd_get_nb_id); - u32 amd_get_nodes_per_socket(void) { return nodes_per_socket; @@ -415,7 +409,7 @@ static void srat_detect_node(struct cpuinfo_x86 *c) node = numa_cpu_node(cpu); if (node == NUMA_NO_NODE) - node = per_cpu(cpu_llc_id, cpu); + node = get_llc_id(cpu); /* * On multi-fabric platform (e.g. Numascale NumaChip) a @@ -485,26 +479,6 @@ static void early_init_amd_mc(struct cpuinfo_x86 *c) static void bsp_init_amd(struct cpuinfo_x86 *c) { - -#ifdef CONFIG_X86_64 - if (c->x86 >= 0xf) { - unsigned long long tseg; - - /* - * Split up direct mapping around the TSEG SMM area. - * Don't do it for gbpages because there seems very little - * benefit in doing so. 
- */ - if (!rdmsrl_safe(MSR_K8_TSEG_ADDR, &tseg)) { - unsigned long pfn = tseg >> PAGE_SHIFT; - - pr_debug("tseg: %010llx\n", tseg); - if (pfn_range_is_mapped(pfn, pfn + 1)) - set_memory_4k((unsigned long)__va(tseg), 1); - } - } -#endif - if (cpu_has(c, X86_FEATURE_CONSTANT_TSC)) { if (c->x86 > 0x10 || @@ -529,7 +503,7 @@ static void bsp_init_amd(struct cpuinfo_x86 *c) va_align.flags = ALIGN_VA_32 | ALIGN_VA_64; /* A random value per boot for bit slice [12:upper_bit) */ - va_align.bits = get_random_int() & va_align.mask; + va_align.bits = get_random_u32() & va_align.mask; } if (cpu_has(c, X86_FEATURE_MWAITX)) @@ -539,12 +513,12 @@ static void bsp_init_amd(struct cpuinfo_x86 *c) u32 ecx; ecx = cpuid_ecx(0x8000001e); - nodes_per_socket = ((ecx >> 8) & 7) + 1; + __max_die_per_package = nodes_per_socket = ((ecx >> 8) & 7) + 1; } else if (boot_cpu_has(X86_FEATURE_NODEID_MSR)) { u64 value; rdmsrl(MSR_FAM10H_NODE_ID, value); - nodes_per_socket = ((value >> 3) & 7) + 1; + __max_die_per_package = nodes_per_socket = ((value >> 3) & 7) + 1; } if (!boot_cpu_has(X86_FEATURE_AMD_SSBD) && @@ -568,6 +542,8 @@ static void bsp_init_amd(struct cpuinfo_x86 *c) x86_amd_ls_cfg_ssbd_mask = 1ULL << bit; } } + + resctrl_cpu_detect(c); } static void early_detect_mem_encrypt(struct cpuinfo_x86 *c) @@ -580,16 +556,18 @@ static void early_detect_mem_encrypt(struct cpuinfo_x86 *c) * the SME physical address space reduction value. * If BIOS has not enabled SME then don't advertise the * SME feature (set in scattered.c). + * If the kernel has not enabled SME via any means then + * don't advertise the SME feature. * For SEV: If BIOS has not enabled SEV then don't advertise the - * SEV feature (set in scattered.c). + * SEV and SEV_ES feature (set in scattered.c). * * In all cases, since support for SME and SEV requires long mode, * don't advertise the feature under CONFIG_X86_32. 
*/ if (cpu_has(c, X86_FEATURE_SME) || cpu_has(c, X86_FEATURE_SEV)) { /* Check if memory encryption is enabled */ - rdmsrl(MSR_K8_SYSCFG, msr); - if (!(msr & MSR_K8_SYSCFG_MEM_ENCRYPT)) + rdmsrl(MSR_AMD64_SYSCFG, msr); + if (!(msr & MSR_AMD64_SYSCFG_MEM_ENCRYPT)) goto clear_all; /* @@ -602,6 +580,9 @@ static void early_detect_mem_encrypt(struct cpuinfo_x86 *c) if (IS_ENABLED(CONFIG_X86_32)) goto clear_all; + if (!sme_me_mask) + setup_clear_cpu_cap(X86_FEATURE_SME); + rdmsrl(MSR_K7_HWCR, msr); if (!(msr & MSR_K7_HWCR_SMMLOCK)) goto clear_sev; @@ -612,6 +593,7 @@ clear_all: setup_clear_cpu_cap(X86_FEATURE_SME); clear_sev: setup_clear_cpu_cap(X86_FEATURE_SEV); + setup_clear_cpu_cap(X86_FEATURE_SEV_ES); } } @@ -622,11 +604,6 @@ static void early_init_amd(struct cpuinfo_x86 *c) early_init_amd_mc(c); -#ifdef CONFIG_X86_32 - if (c->x86 == 6) - set_cpu_cap(c, X86_FEATURE_K7); -#endif - if (c->x86 >= 0xf) set_cpu_cap(c, X86_FEATURE_K8); @@ -645,6 +622,10 @@ static void early_init_amd(struct cpuinfo_x86 *c) if (c->x86_power & BIT(12)) set_cpu_cap(c, X86_FEATURE_ACC_POWER); + /* Bit 14 indicates the Runtime Average Power Limit interface. */ + if (c->x86_power & BIT(14)) + set_cpu_cap(c, X86_FEATURE_RAPL); + #ifdef CONFIG_X86_64 set_cpu_cap(c, X86_FEATURE_SYSCALL32); #else @@ -827,7 +808,7 @@ static void clear_rdrand_cpuid_bit(struct cpuinfo_x86 *c) return; /* - * The nordrand option can clear X86_FEATURE_RDRAND, so check for + * The self-test can clear X86_FEATURE_RDRAND, so check for * RDRAND support using the CPUID function directly. */ if (!(cpuid_ecx(1) & BIT(30)) || rdrand_force) @@ -881,6 +862,28 @@ static void init_amd_bd(struct cpuinfo_x86 *c) clear_rdrand_cpuid_bit(c); } +void init_spectral_chicken(struct cpuinfo_x86 *c) +{ +#ifdef CONFIG_CPU_UNRET_ENTRY + u64 value; + + /* + * On Zen2 we offer this chicken (bit) on the altar of Speculation. + * + * This suppresses speculation from the middle of a basic block, i.e. it + * suppresses non-branch predictions. 
+ * + * We use STIBP as a heuristic to filter out Zen2 from the rest of F17H + */ + if (!cpu_has(c, X86_FEATURE_HYPERVISOR) && cpu_has(c, X86_FEATURE_AMD_STIBP)) { + if (!rdmsrl_safe(MSR_ZEN2_SPECTRAL_CHICKEN, &value)) { + value |= MSR_ZEN2_SPECTRAL_CHICKEN_BIT; + wrmsrl_safe(MSR_ZEN2_SPECTRAL_CHICKEN, value); + } + } +#endif +} + static void init_amd_zn(struct cpuinfo_x86 *c) { set_cpu_cap(c, X86_FEATURE_ZEN); @@ -889,12 +892,21 @@ static void init_amd_zn(struct cpuinfo_x86 *c) node_reclaim_distance = 32; #endif - /* - * Fix erratum 1076: CPB feature bit not being set in CPUID. - * Always set it, except when running under a hypervisor. - */ - if (!cpu_has(c, X86_FEATURE_HYPERVISOR) && !cpu_has(c, X86_FEATURE_CPB)) - set_cpu_cap(c, X86_FEATURE_CPB); + /* Fix up CPUID bits, but only if not virtualised. */ + if (!cpu_has(c, X86_FEATURE_HYPERVISOR)) { + + /* Erratum 1076: CPB feature bit not being set in CPUID. */ + if (!cpu_has(c, X86_FEATURE_CPB)) + set_cpu_cap(c, X86_FEATURE_CPB); + + /* + * Zen3 (Fam19 model < 0x10) parts are not susceptible to + * Branch Type Confusion, but predate the allocation of the + * BTC_NO bit. 
+ */ + if (c->x86 == 0x19 && !cpu_has(c, X86_FEATURE_BTC_NO)) + set_cpu_cap(c, X86_FEATURE_BTC_NO); + } } static void init_amd(struct cpuinfo_x86 *c) @@ -926,7 +938,9 @@ static void init_amd(struct cpuinfo_x86 *c) case 0x12: init_amd_ln(c); break; case 0x15: init_amd_bd(c); break; case 0x16: init_amd_jg(c); break; - case 0x17: init_amd_zn(c); break; + case 0x17: init_spectral_chicken(c); + fallthrough; + case 0x19: init_amd_zn(c); break; } /* @@ -982,6 +996,8 @@ static void init_amd(struct cpuinfo_x86 *c) if (cpu_has(c, X86_FEATURE_IRPERF) && !cpu_has_amd_erratum(c, amd_erratum_1054)) msr_set_bit(MSR_K7_HWCR, MSR_K7_HWCR_IRPERF_EN_BIT); + + check_null_seg_clears_base(c); } #ifdef CONFIG_X86_32 @@ -1111,8 +1127,7 @@ static const int amd_erratum_383[] = /* #1054: Instructions Retired Performance Counter May Be Inaccurate */ static const int amd_erratum_1054[] = - AMD_OSVW_ERRATUM(0, AMD_MODEL_RANGE(0x17, 0, 0, 0x2f, 0xf)); - + AMD_LEGACY_ERRATUM(AMD_MODEL_RANGE(0x17, 0, 0, 0x2f, 0xf)); static bool cpu_has_amd_erratum(struct cpuinfo_x86 *cpu, const int *erratum) { @@ -1163,3 +1178,19 @@ void set_dr_addr_mask(unsigned long mask, int dr) break; } } + +u32 amd_get_highest_perf(void) +{ + struct cpuinfo_x86 *c = &boot_cpu_data; + + if (c->x86 == 0x17 && ((c->x86_model >= 0x30 && c->x86_model < 0x40) || + (c->x86_model >= 0x70 && c->x86_model < 0x80))) + return 166; + + if (c->x86 == 0x19 && ((c->x86_model >= 0x20 && c->x86_model < 0x30) || + (c->x86_model >= 0x40 && c->x86_model < 0x70))) + return 166; + + return 255; +} +EXPORT_SYMBOL_GPL(amd_get_highest_perf); diff --git a/arch/x86/kernel/cpu/aperfmperf.c b/arch/x86/kernel/cpu/aperfmperf.c index e2f319dc992d..1f60a2b27936 100644 --- a/arch/x86/kernel/cpu/aperfmperf.c +++ b/arch/x86/kernel/cpu/aperfmperf.c @@ -6,132 +6,446 @@ * Copyright (C) 2017 Intel Corp. 
* Author: Len Brown <len.brown@intel.com> */ - +#include <linux/cpufreq.h> #include <linux/delay.h> #include <linux/ktime.h> #include <linux/math64.h> #include <linux/percpu.h> -#include <linux/cpufreq.h> -#include <linux/smp.h> +#include <linux/rcupdate.h> #include <linux/sched/isolation.h> +#include <linux/sched/topology.h> +#include <linux/smp.h> +#include <linux/syscore_ops.h> + +#include <asm/cpu.h> +#include <asm/cpu_device_id.h> +#include <asm/intel-family.h> #include "cpu.h" -struct aperfmperf_sample { - unsigned int khz; - ktime_t time; - u64 aperf; - u64 mperf; +struct aperfmperf { + seqcount_t seq; + unsigned long last_update; + u64 acnt; + u64 mcnt; + u64 aperf; + u64 mperf; }; -static DEFINE_PER_CPU(struct aperfmperf_sample, samples); +static DEFINE_PER_CPU_SHARED_ALIGNED(struct aperfmperf, cpu_samples) = { + .seq = SEQCNT_ZERO(cpu_samples.seq) +}; -#define APERFMPERF_CACHE_THRESHOLD_MS 10 -#define APERFMPERF_REFRESH_DELAY_MS 10 -#define APERFMPERF_STALE_THRESHOLD_MS 1000 +static void init_counter_refs(void) +{ + u64 aperf, mperf; + rdmsrl(MSR_IA32_APERF, aperf); + rdmsrl(MSR_IA32_MPERF, mperf); + + this_cpu_write(cpu_samples.aperf, aperf); + this_cpu_write(cpu_samples.mperf, mperf); +} + +#if defined(CONFIG_X86_64) && defined(CONFIG_SMP) /* - * aperfmperf_snapshot_khz() - * On the current CPU, snapshot APERF, MPERF, and jiffies - * unless we already did it within 10ms - * calculate kHz, save snapshot + * APERF/MPERF frequency ratio computation. + * + * The scheduler wants to do frequency invariant accounting and needs a <1 + * ratio to account for the 'current' frequency, corresponding to + * freq_curr / freq_max. + * + * Since the frequency freq_curr on x86 is controlled by micro-controller and + * our P-state setting is little more than a request/hint, we need to observe + * the effective frequency 'BusyMHz', i.e. the average frequency over a time + * interval after discarding idle time. 
This is given by: + * + * BusyMHz = delta_APERF / delta_MPERF * freq_base + * + * where freq_base is the max non-turbo P-state. + * + * The freq_max term has to be set to a somewhat arbitrary value, because we + * can't know which turbo states will be available at a given point in time: + * it all depends on the thermal headroom of the entire package. We set it to + * the turbo level with 4 cores active. + * + * Benchmarks show that's a good compromise between the 1C turbo ratio + * (freq_curr/freq_max would rarely reach 1) and something close to freq_base, + * which would ignore the entire turbo range (a conspicuous part, making + * freq_curr/freq_max always maxed out). + * + * An exception to the heuristic above is the Atom uarch, where we choose the + * highest turbo level for freq_max since Atom's are generally oriented towards + * power efficiency. + * + * Setting freq_max to anything less than the 1C turbo ratio makes the ratio + * freq_curr / freq_max to eventually grow >1, in which case we clip it to 1. */ -static void aperfmperf_snapshot_khz(void *dummy) + +DEFINE_STATIC_KEY_FALSE(arch_scale_freq_key); + +static u64 arch_turbo_freq_ratio = SCHED_CAPACITY_SCALE; +static u64 arch_max_freq_ratio = SCHED_CAPACITY_SCALE; + +void arch_set_max_freq_ratio(bool turbo_disabled) { - u64 aperf, aperf_delta; - u64 mperf, mperf_delta; - struct aperfmperf_sample *s = this_cpu_ptr(&samples); - unsigned long flags; + arch_max_freq_ratio = turbo_disabled ? 
SCHED_CAPACITY_SCALE : + arch_turbo_freq_ratio; +} +EXPORT_SYMBOL_GPL(arch_set_max_freq_ratio); - local_irq_save(flags); - rdmsrl(MSR_IA32_APERF, aperf); - rdmsrl(MSR_IA32_MPERF, mperf); - local_irq_restore(flags); +static bool __init turbo_disabled(void) +{ + u64 misc_en; + int err; + + err = rdmsrl_safe(MSR_IA32_MISC_ENABLE, &misc_en); + if (err) + return false; + + return (misc_en & MSR_IA32_MISC_ENABLE_TURBO_DISABLE); +} + +static bool __init slv_set_max_freq_ratio(u64 *base_freq, u64 *turbo_freq) +{ + int err; + + err = rdmsrl_safe(MSR_ATOM_CORE_RATIOS, base_freq); + if (err) + return false; + + err = rdmsrl_safe(MSR_ATOM_CORE_TURBO_RATIOS, turbo_freq); + if (err) + return false; + + *base_freq = (*base_freq >> 16) & 0x3F; /* max P state */ + *turbo_freq = *turbo_freq & 0x3F; /* 1C turbo */ + + return true; +} + +#define X86_MATCH(model) \ + X86_MATCH_VENDOR_FAM_MODEL_FEATURE(INTEL, 6, \ + INTEL_FAM6_##model, X86_FEATURE_APERFMPERF, NULL) + +static const struct x86_cpu_id has_knl_turbo_ratio_limits[] __initconst = { + X86_MATCH(XEON_PHI_KNL), + X86_MATCH(XEON_PHI_KNM), + {} +}; + +static const struct x86_cpu_id has_skx_turbo_ratio_limits[] __initconst = { + X86_MATCH(SKYLAKE_X), + {} +}; + +static const struct x86_cpu_id has_glm_turbo_ratio_limits[] __initconst = { + X86_MATCH(ATOM_GOLDMONT), + X86_MATCH(ATOM_GOLDMONT_D), + X86_MATCH(ATOM_GOLDMONT_PLUS), + {} +}; + +static bool __init knl_set_max_freq_ratio(u64 *base_freq, u64 *turbo_freq, + int num_delta_fratio) +{ + int fratio, delta_fratio, found; + int err, i; + u64 msr; + + err = rdmsrl_safe(MSR_PLATFORM_INFO, base_freq); + if (err) + return false; + + *base_freq = (*base_freq >> 8) & 0xFF; /* max P state */ + + err = rdmsrl_safe(MSR_TURBO_RATIO_LIMIT, &msr); + if (err) + return false; + + fratio = (msr >> 8) & 0xFF; + i = 16; + found = 0; + do { + if (found >= num_delta_fratio) { + *turbo_freq = fratio; + return true; + } + + delta_fratio = (msr >> (i + 5)) & 0x7; + + if (delta_fratio) { + found += 1; + 
fratio -= delta_fratio; + } + + i += 8; + } while (i < 64); + + return true; +} + +static bool __init skx_set_max_freq_ratio(u64 *base_freq, u64 *turbo_freq, int size) +{ + u64 ratios, counts; + u32 group_size; + int err, i; + + err = rdmsrl_safe(MSR_PLATFORM_INFO, base_freq); + if (err) + return false; + + *base_freq = (*base_freq >> 8) & 0xFF; /* max P state */ + + err = rdmsrl_safe(MSR_TURBO_RATIO_LIMIT, &ratios); + if (err) + return false; + + err = rdmsrl_safe(MSR_TURBO_RATIO_LIMIT1, &counts); + if (err) + return false; + + for (i = 0; i < 64; i += 8) { + group_size = (counts >> i) & 0xFF; + if (group_size >= size) { + *turbo_freq = (ratios >> i) & 0xFF; + return true; + } + } + + return false; +} + +static bool __init core_set_max_freq_ratio(u64 *base_freq, u64 *turbo_freq) +{ + u64 msr; + int err; - aperf_delta = aperf - s->aperf; - mperf_delta = mperf - s->mperf; + err = rdmsrl_safe(MSR_PLATFORM_INFO, base_freq); + if (err) + return false; + err = rdmsrl_safe(MSR_TURBO_RATIO_LIMIT, &msr); + if (err) + return false; + + *base_freq = (*base_freq >> 8) & 0xFF; /* max P state */ + *turbo_freq = (msr >> 24) & 0xFF; /* 4C turbo */ + + /* The CPU may have less than 4 cores */ + if (!*turbo_freq) + *turbo_freq = msr & 0xFF; /* 1C turbo */ + + return true; +} + +static bool __init intel_set_max_freq_ratio(void) +{ + u64 base_freq, turbo_freq; + u64 turbo_ratio; + + if (slv_set_max_freq_ratio(&base_freq, &turbo_freq)) + goto out; + + if (x86_match_cpu(has_glm_turbo_ratio_limits) && + skx_set_max_freq_ratio(&base_freq, &turbo_freq, 1)) + goto out; + + if (x86_match_cpu(has_knl_turbo_ratio_limits) && + knl_set_max_freq_ratio(&base_freq, &turbo_freq, 1)) + goto out; + + if (x86_match_cpu(has_skx_turbo_ratio_limits) && + skx_set_max_freq_ratio(&base_freq, &turbo_freq, 4)) + goto out; + + if (core_set_max_freq_ratio(&base_freq, &turbo_freq)) + goto out; + + return false; + +out: /* - * There is no architectural guarantee that MPERF - * increments faster than we can read 
it. + * Some hypervisors advertise X86_FEATURE_APERFMPERF + * but then fill all MSR's with zeroes. + * Some CPUs have turbo boost but don't declare any turbo ratio + * in MSR_TURBO_RATIO_LIMIT. */ - if (mperf_delta == 0) - return; + if (!base_freq || !turbo_freq) { + pr_debug("Couldn't determine cpu base or turbo frequency, necessary for scale-invariant accounting.\n"); + return false; + } - s->time = ktime_get(); - s->aperf = aperf; - s->mperf = mperf; - s->khz = div64_u64((cpu_khz * aperf_delta), mperf_delta); + turbo_ratio = div_u64(turbo_freq * SCHED_CAPACITY_SCALE, base_freq); + if (!turbo_ratio) { + pr_debug("Non-zero turbo and base frequencies led to a 0 ratio.\n"); + return false; + } + + arch_turbo_freq_ratio = turbo_ratio; + arch_set_max_freq_ratio(turbo_disabled()); + + return true; } -static bool aperfmperf_snapshot_cpu(int cpu, ktime_t now, bool wait) +#ifdef CONFIG_PM_SLEEP +static struct syscore_ops freq_invariance_syscore_ops = { + .resume = init_counter_refs, +}; + +static void register_freq_invariance_syscore_ops(void) { - s64 time_delta = ktime_ms_delta(now, per_cpu(samples.time, cpu)); + register_syscore_ops(&freq_invariance_syscore_ops); +} +#else +static inline void register_freq_invariance_syscore_ops(void) {} +#endif - /* Don't bother re-computing within the cache threshold time. 
*/ - if (time_delta < APERFMPERF_CACHE_THRESHOLD_MS) - return true; +static void freq_invariance_enable(void) +{ + if (static_branch_unlikely(&arch_scale_freq_key)) { + WARN_ON_ONCE(1); + return; + } + static_branch_enable(&arch_scale_freq_key); + register_freq_invariance_syscore_ops(); + pr_info("Estimated ratio of average max frequency by base frequency (times 1024): %llu\n", arch_max_freq_ratio); +} + +void freq_invariance_set_perf_ratio(u64 ratio, bool turbo_disabled) +{ + arch_turbo_freq_ratio = ratio; + arch_set_max_freq_ratio(turbo_disabled); + freq_invariance_enable(); +} - smp_call_function_single(cpu, aperfmperf_snapshot_khz, NULL, wait); +static void __init bp_init_freq_invariance(void) +{ + if (boot_cpu_data.x86_vendor != X86_VENDOR_INTEL) + return; - /* Return false if the previous iteration was too long ago. */ - return time_delta <= APERFMPERF_STALE_THRESHOLD_MS; + if (intel_set_max_freq_ratio()) + freq_invariance_enable(); } -unsigned int aperfmperf_get_khz(int cpu) +static void disable_freq_invariance_workfn(struct work_struct *work) { - if (!cpu_khz) - return 0; + static_branch_disable(&arch_scale_freq_key); +} - if (!boot_cpu_has(X86_FEATURE_APERFMPERF)) - return 0; +static DECLARE_WORK(disable_freq_invariance_work, + disable_freq_invariance_workfn); - if (!housekeeping_cpu(cpu, HK_FLAG_MISC)) - return 0; +DEFINE_PER_CPU(unsigned long, arch_freq_scale) = SCHED_CAPACITY_SCALE; + +static void scale_freq_tick(u64 acnt, u64 mcnt) +{ + u64 freq_scale; + + if (!arch_scale_freq_invariant()) + return; + + if (check_shl_overflow(acnt, 2*SCHED_CAPACITY_SHIFT, &acnt)) + goto error; + + if (check_mul_overflow(mcnt, arch_max_freq_ratio, &mcnt) || !mcnt) + goto error; + + freq_scale = div64_u64(acnt, mcnt); + if (!freq_scale) + goto error; - aperfmperf_snapshot_cpu(cpu, ktime_get(), true); - return per_cpu(samples.khz, cpu); + if (freq_scale > SCHED_CAPACITY_SCALE) + freq_scale = SCHED_CAPACITY_SCALE; + + this_cpu_write(arch_freq_scale, freq_scale); + return; 
+ +error: + pr_warn("Scheduler frequency invariance went wobbly, disabling!\n"); + schedule_work(&disable_freq_invariance_work); } +#else +static inline void bp_init_freq_invariance(void) { } +static inline void scale_freq_tick(u64 acnt, u64 mcnt) { } +#endif /* CONFIG_X86_64 && CONFIG_SMP */ -void arch_freq_prepare_all(void) +void arch_scale_freq_tick(void) { - ktime_t now = ktime_get(); - bool wait = false; - int cpu; + struct aperfmperf *s = this_cpu_ptr(&cpu_samples); + u64 acnt, mcnt, aperf, mperf; - if (!cpu_khz) + if (!cpu_feature_enabled(X86_FEATURE_APERFMPERF)) return; - if (!boot_cpu_has(X86_FEATURE_APERFMPERF)) - return; + rdmsrl(MSR_IA32_APERF, aperf); + rdmsrl(MSR_IA32_MPERF, mperf); + acnt = aperf - s->aperf; + mcnt = mperf - s->mperf; - for_each_online_cpu(cpu) { - if (!housekeeping_cpu(cpu, HK_FLAG_MISC)) - continue; - if (!aperfmperf_snapshot_cpu(cpu, now, false)) - wait = true; - } + s->aperf = aperf; + s->mperf = mperf; - if (wait) - msleep(APERFMPERF_REFRESH_DELAY_MS); + raw_write_seqcount_begin(&s->seq); + s->last_update = jiffies; + s->acnt = acnt; + s->mcnt = mcnt; + raw_write_seqcount_end(&s->seq); + + scale_freq_tick(acnt, mcnt); } +/* + * Discard samples older than the define maximum sample age of 20ms. There + * is no point in sending IPIs in such a case. If the scheduler tick was + * not running then the CPU is either idle or isolated. 
+ */ +#define MAX_SAMPLE_AGE ((unsigned long)HZ / 50) + unsigned int arch_freq_get_on_cpu(int cpu) { - if (!cpu_khz) - return 0; + struct aperfmperf *s = per_cpu_ptr(&cpu_samples, cpu); + unsigned int seq, freq; + unsigned long last; + u64 acnt, mcnt; - if (!boot_cpu_has(X86_FEATURE_APERFMPERF)) - return 0; + if (!cpu_feature_enabled(X86_FEATURE_APERFMPERF)) + goto fallback; - if (!housekeeping_cpu(cpu, HK_FLAG_MISC)) - return 0; + do { + seq = raw_read_seqcount_begin(&s->seq); + last = s->last_update; + acnt = s->acnt; + mcnt = s->mcnt; + } while (read_seqcount_retry(&s->seq, seq)); - if (aperfmperf_snapshot_cpu(cpu, ktime_get(), true)) - return per_cpu(samples.khz, cpu); + /* + * Bail on invalid count and when the last update was too long ago, + * which covers idle and NOHZ full CPUs. + */ + if (!mcnt || (jiffies - last) > MAX_SAMPLE_AGE) + goto fallback; - msleep(APERFMPERF_REFRESH_DELAY_MS); - smp_call_function_single(cpu, aperfmperf_snapshot_khz, NULL, 1); + return div64_u64((cpu_khz * acnt), mcnt); - return per_cpu(samples.khz, cpu); +fallback: + freq = cpufreq_quick_get(cpu); + return freq ? 
freq : cpu_khz; +} + +static int __init bp_init_aperfmperf(void) +{ + if (!cpu_feature_enabled(X86_FEATURE_APERFMPERF)) + return 0; + + init_counter_refs(); + bp_init_freq_invariance(); + return 0; +} +early_initcall(bp_init_aperfmperf); + +void ap_init_aperfmperf(void) +{ + if (cpu_feature_enabled(X86_FEATURE_APERFMPERF)) + init_counter_refs(); } diff --git a/arch/x86/kernel/cpu/bugs.c b/arch/x86/kernel/cpu/bugs.c index ed54b3b21c39..3e3230cccaa7 100644 --- a/arch/x86/kernel/cpu/bugs.c +++ b/arch/x86/kernel/cpu/bugs.c @@ -15,43 +15,75 @@ #include <linux/nospec.h> #include <linux/prctl.h> #include <linux/sched/smt.h> +#include <linux/pgtable.h> +#include <linux/bpf.h> #include <asm/spec-ctrl.h> #include <asm/cmdline.h> #include <asm/bugs.h> #include <asm/processor.h> #include <asm/processor-flags.h> -#include <asm/fpu/internal.h> +#include <asm/fpu/api.h> #include <asm/msr.h> #include <asm/vmx.h> #include <asm/paravirt.h> #include <asm/alternative.h> -#include <asm/pgtable.h> #include <asm/set_memory.h> #include <asm/intel-family.h> #include <asm/e820/api.h> #include <asm/hypervisor.h> +#include <asm/tlbflush.h> #include "cpu.h" static void __init spectre_v1_select_mitigation(void); static void __init spectre_v2_select_mitigation(void); +static void __init retbleed_select_mitigation(void); +static void __init spectre_v2_user_select_mitigation(void); static void __init ssb_select_mitigation(void); static void __init l1tf_select_mitigation(void); static void __init mds_select_mitigation(void); -static void __init mds_print_mitigation(void); +static void __init md_clear_update_mitigation(void); +static void __init md_clear_select_mitigation(void); static void __init taa_select_mitigation(void); +static void __init mmio_select_mitigation(void); +static void __init srbds_select_mitigation(void); +static void __init l1d_flush_select_mitigation(void); -/* The base value of the SPEC_CTRL MSR that always has to be preserved. 
*/ +/* The base value of the SPEC_CTRL MSR without task-specific bits set */ u64 x86_spec_ctrl_base; EXPORT_SYMBOL_GPL(x86_spec_ctrl_base); + +/* The current value of the SPEC_CTRL MSR with task-specific bits set */ +DEFINE_PER_CPU(u64, x86_spec_ctrl_current); +EXPORT_SYMBOL_GPL(x86_spec_ctrl_current); + static DEFINE_MUTEX(spec_ctrl_mutex); /* - * The vendor and possibly platform specific bits which can be modified in - * x86_spec_ctrl_base. + * Keep track of the SPEC_CTRL MSR value for the current task, which may differ + * from x86_spec_ctrl_base due to STIBP/SSB in __speculation_ctrl_update(). */ -static u64 __ro_after_init x86_spec_ctrl_mask = SPEC_CTRL_IBRS; +void write_spec_ctrl_current(u64 val, bool force) +{ + if (this_cpu_read(x86_spec_ctrl_current) == val) + return; + + this_cpu_write(x86_spec_ctrl_current, val); + + /* + * When KERNEL_IBRS this MSR is written on return-to-user, unless + * forced the update can be delayed until that time. + */ + if (force || !cpu_feature_enabled(X86_FEATURE_KERNEL_IBRS)) + wrmsrl(MSR_IA32_SPEC_CTRL, val); +} + +u64 spec_ctrl_current(void) +{ + return this_cpu_read(x86_spec_ctrl_current); +} +EXPORT_SYMBOL_GPL(spec_ctrl_current); /* * AMD specific MSR info for Speculative Store Bypass control. 
@@ -74,6 +106,17 @@ EXPORT_SYMBOL_GPL(mds_user_clear); DEFINE_STATIC_KEY_FALSE(mds_idle_clear); EXPORT_SYMBOL_GPL(mds_idle_clear); +/* + * Controls whether l1d flush based mitigations are enabled, + * based on hw features and admin setting via boot parameter + * defaults to false + */ +DEFINE_STATIC_KEY_FALSE(switch_mm_cond_l1d_flush); + +/* Controls CPU Fill buffer clear before KVM guest MMIO accesses */ +DEFINE_STATIC_KEY_FALSE(mmio_stale_data_clear); +EXPORT_SYMBOL_GPL(mmio_stale_data_clear); + void __init check_bugs(void) { identify_boot_cpu(); @@ -97,23 +140,26 @@ void __init check_bugs(void) if (boot_cpu_has(X86_FEATURE_MSR_SPEC_CTRL)) rdmsrl(MSR_IA32_SPEC_CTRL, x86_spec_ctrl_base); - /* Allow STIBP in MSR_SPEC_CTRL if supported */ - if (boot_cpu_has(X86_FEATURE_STIBP)) - x86_spec_ctrl_mask |= SPEC_CTRL_STIBP; - /* Select the proper CPU mitigations before patching alternatives: */ spectre_v1_select_mitigation(); spectre_v2_select_mitigation(); - ssb_select_mitigation(); - l1tf_select_mitigation(); - mds_select_mitigation(); - taa_select_mitigation(); - /* - * As MDS and TAA mitigations are inter-related, print MDS - * mitigation until after TAA mitigation selection is done. + * retbleed_select_mitigation() relies on the state set by + * spectre_v2_select_mitigation(); specifically it wants to know about + * spectre_v2=ibrs. + */ + retbleed_select_mitigation(); + /* + * spectre_v2_user_select_mitigation() relies on the state set by + * retbleed_select_mitigation(); specifically the STIBP selection is + * forced for UNRET or IBPB. */ - mds_print_mitigation(); + spectre_v2_user_select_mitigation(); + ssb_select_mitigation(); + l1tf_select_mitigation(); + md_clear_select_mitigation(); + srbds_select_mitigation(); + l1d_flush_select_mitigation(); arch_smt_update(); @@ -149,37 +195,16 @@ void __init check_bugs(void) #endif } +/* + * NOTE: This function is *only* called for SVM, since Intel uses + * MSR_IA32_SPEC_CTRL for SSBD. 
+ */ void -x86_virt_spec_ctrl(u64 guest_spec_ctrl, u64 guest_virt_spec_ctrl, bool setguest) +x86_virt_spec_ctrl(u64 guest_virt_spec_ctrl, bool setguest) { - u64 msrval, guestval, hostval = x86_spec_ctrl_base; + u64 guestval, hostval; struct thread_info *ti = current_thread_info(); - /* Is MSR_SPEC_CTRL implemented ? */ - if (static_cpu_has(X86_FEATURE_MSR_SPEC_CTRL)) { - /* - * Restrict guest_spec_ctrl to supported values. Clear the - * modifiable bits in the host base value and or the - * modifiable bits from the guest value. - */ - guestval = hostval & ~x86_spec_ctrl_mask; - guestval |= guest_spec_ctrl & x86_spec_ctrl_mask; - - /* SSBD controlled in MSR_SPEC_CTRL */ - if (static_cpu_has(X86_FEATURE_SPEC_CTRL_SSBD) || - static_cpu_has(X86_FEATURE_AMD_SSBD)) - hostval |= ssbd_tif_to_spec_ctrl(ti->flags); - - /* Conditional STIBP enabled? */ - if (static_branch_unlikely(&switch_to_cond_stibp)) - hostval |= stibp_tif_to_spec_ctrl(ti->flags); - - if (hostval != guestval) { - msrval = setguest ? guestval : hostval; - wrmsrl(MSR_IA32_SPEC_CTRL, msrval); - } - } - /* * If SSBD is not handled in MSR_SPEC_CTRL on AMD, update * MSR_AMD64_L2_CFG or MSR_VIRT_SPEC_CTRL if supported. 
@@ -254,14 +279,6 @@ static void __init mds_select_mitigation(void) } } -static void __init mds_print_mitigation(void) -{ - if (!boot_cpu_has_bug(X86_BUG_MDS) || cpu_mitigations_off()) - return; - - pr_info("%s\n", mds_strings[mds_mitigation]); -} - static int __init mds_cmdline(char *str) { if (!boot_cpu_has_bug(X86_BUG_MDS)) @@ -316,7 +333,7 @@ static void __init taa_select_mitigation(void) /* TSX previously disabled by tsx=off */ if (!boot_cpu_has(X86_FEATURE_RTM)) { taa_mitigation = TAA_MITIGATION_TSX_DISABLED; - goto out; + return; } if (cpu_mitigations_off()) { @@ -330,7 +347,7 @@ static void __init taa_select_mitigation(void) */ if (taa_mitigation == TAA_MITIGATION_OFF && mds_mitigation == MDS_MITIGATION_OFF) - goto out; + return; if (boot_cpu_has(X86_FEATURE_MD_CLEAR)) taa_mitigation = TAA_MITIGATION_VERW; @@ -362,18 +379,6 @@ static void __init taa_select_mitigation(void) if (taa_nosmt || cpu_mitigations_auto_nosmt()) cpu_smt_disable(false); - - /* - * Update MDS mitigation, if necessary, as the mds_user_clear is - * now enabled for TAA mitigation. 
- */ - if (mds_mitigation == MDS_MITIGATION_OFF && - boot_cpu_has_bug(X86_BUG_MDS)) { - mds_mitigation = MDS_MITIGATION_FULL; - mds_select_mitigation(); - } -out: - pr_info("%s\n", taa_strings[taa_mitigation]); } static int __init tsx_async_abort_parse_cmdline(char *str) @@ -398,6 +403,282 @@ static int __init tsx_async_abort_parse_cmdline(char *str) early_param("tsx_async_abort", tsx_async_abort_parse_cmdline); #undef pr_fmt +#define pr_fmt(fmt) "MMIO Stale Data: " fmt + +enum mmio_mitigations { + MMIO_MITIGATION_OFF, + MMIO_MITIGATION_UCODE_NEEDED, + MMIO_MITIGATION_VERW, +}; + +/* Default mitigation for Processor MMIO Stale Data vulnerabilities */ +static enum mmio_mitigations mmio_mitigation __ro_after_init = MMIO_MITIGATION_VERW; +static bool mmio_nosmt __ro_after_init = false; + +static const char * const mmio_strings[] = { + [MMIO_MITIGATION_OFF] = "Vulnerable", + [MMIO_MITIGATION_UCODE_NEEDED] = "Vulnerable: Clear CPU buffers attempted, no microcode", + [MMIO_MITIGATION_VERW] = "Mitigation: Clear CPU buffers", +}; + +static void __init mmio_select_mitigation(void) +{ + u64 ia32_cap; + + if (!boot_cpu_has_bug(X86_BUG_MMIO_STALE_DATA) || + boot_cpu_has_bug(X86_BUG_MMIO_UNKNOWN) || + cpu_mitigations_off()) { + mmio_mitigation = MMIO_MITIGATION_OFF; + return; + } + + if (mmio_mitigation == MMIO_MITIGATION_OFF) + return; + + ia32_cap = x86_read_arch_cap_msr(); + + /* + * Enable CPU buffer clear mitigation for host and VMM, if also affected + * by MDS or TAA. Otherwise, enable mitigation for VMM only. + */ + if (boot_cpu_has_bug(X86_BUG_MDS) || (boot_cpu_has_bug(X86_BUG_TAA) && + boot_cpu_has(X86_FEATURE_RTM))) + static_branch_enable(&mds_user_clear); + else + static_branch_enable(&mmio_stale_data_clear); + + /* + * If Processor-MMIO-Stale-Data bug is present and Fill Buffer data can + * be propagated to uncore buffers, clearing the Fill buffers on idle + * is required irrespective of SMT state. 
+ */ + if (!(ia32_cap & ARCH_CAP_FBSDP_NO)) + static_branch_enable(&mds_idle_clear); + + /* + * Check if the system has the right microcode. + * + * CPU Fill buffer clear mitigation is enumerated by either an explicit + * FB_CLEAR or by the presence of both MD_CLEAR and L1D_FLUSH on MDS + * affected systems. + */ + if ((ia32_cap & ARCH_CAP_FB_CLEAR) || + (boot_cpu_has(X86_FEATURE_MD_CLEAR) && + boot_cpu_has(X86_FEATURE_FLUSH_L1D) && + !(ia32_cap & ARCH_CAP_MDS_NO))) + mmio_mitigation = MMIO_MITIGATION_VERW; + else + mmio_mitigation = MMIO_MITIGATION_UCODE_NEEDED; + + if (mmio_nosmt || cpu_mitigations_auto_nosmt()) + cpu_smt_disable(false); +} + +static int __init mmio_stale_data_parse_cmdline(char *str) +{ + if (!boot_cpu_has_bug(X86_BUG_MMIO_STALE_DATA)) + return 0; + + if (!str) + return -EINVAL; + + if (!strcmp(str, "off")) { + mmio_mitigation = MMIO_MITIGATION_OFF; + } else if (!strcmp(str, "full")) { + mmio_mitigation = MMIO_MITIGATION_VERW; + } else if (!strcmp(str, "full,nosmt")) { + mmio_mitigation = MMIO_MITIGATION_VERW; + mmio_nosmt = true; + } + + return 0; +} +early_param("mmio_stale_data", mmio_stale_data_parse_cmdline); + +#undef pr_fmt +#define pr_fmt(fmt) "" fmt + +static void __init md_clear_update_mitigation(void) +{ + if (cpu_mitigations_off()) + return; + + if (!static_key_enabled(&mds_user_clear)) + goto out; + + /* + * mds_user_clear is now enabled. Update MDS, TAA and MMIO Stale Data + * mitigation, if necessary. 
+ */ + if (mds_mitigation == MDS_MITIGATION_OFF && + boot_cpu_has_bug(X86_BUG_MDS)) { + mds_mitigation = MDS_MITIGATION_FULL; + mds_select_mitigation(); + } + if (taa_mitigation == TAA_MITIGATION_OFF && + boot_cpu_has_bug(X86_BUG_TAA)) { + taa_mitigation = TAA_MITIGATION_VERW; + taa_select_mitigation(); + } + if (mmio_mitigation == MMIO_MITIGATION_OFF && + boot_cpu_has_bug(X86_BUG_MMIO_STALE_DATA)) { + mmio_mitigation = MMIO_MITIGATION_VERW; + mmio_select_mitigation(); + } +out: + if (boot_cpu_has_bug(X86_BUG_MDS)) + pr_info("MDS: %s\n", mds_strings[mds_mitigation]); + if (boot_cpu_has_bug(X86_BUG_TAA)) + pr_info("TAA: %s\n", taa_strings[taa_mitigation]); + if (boot_cpu_has_bug(X86_BUG_MMIO_STALE_DATA)) + pr_info("MMIO Stale Data: %s\n", mmio_strings[mmio_mitigation]); + else if (boot_cpu_has_bug(X86_BUG_MMIO_UNKNOWN)) + pr_info("MMIO Stale Data: Unknown: No mitigations\n"); +} + +static void __init md_clear_select_mitigation(void) +{ + mds_select_mitigation(); + taa_select_mitigation(); + mmio_select_mitigation(); + + /* + * As MDS, TAA and MMIO Stale Data mitigations are inter-related, update + * and print their mitigation after MDS, TAA and MMIO Stale Data + * mitigation selection is done. 
+ */ + md_clear_update_mitigation(); +} + +#undef pr_fmt +#define pr_fmt(fmt) "SRBDS: " fmt + +enum srbds_mitigations { + SRBDS_MITIGATION_OFF, + SRBDS_MITIGATION_UCODE_NEEDED, + SRBDS_MITIGATION_FULL, + SRBDS_MITIGATION_TSX_OFF, + SRBDS_MITIGATION_HYPERVISOR, +}; + +static enum srbds_mitigations srbds_mitigation __ro_after_init = SRBDS_MITIGATION_FULL; + +static const char * const srbds_strings[] = { + [SRBDS_MITIGATION_OFF] = "Vulnerable", + [SRBDS_MITIGATION_UCODE_NEEDED] = "Vulnerable: No microcode", + [SRBDS_MITIGATION_FULL] = "Mitigation: Microcode", + [SRBDS_MITIGATION_TSX_OFF] = "Mitigation: TSX disabled", + [SRBDS_MITIGATION_HYPERVISOR] = "Unknown: Dependent on hypervisor status", +}; + +static bool srbds_off; + +void update_srbds_msr(void) +{ + u64 mcu_ctrl; + + if (!boot_cpu_has_bug(X86_BUG_SRBDS)) + return; + + if (boot_cpu_has(X86_FEATURE_HYPERVISOR)) + return; + + if (srbds_mitigation == SRBDS_MITIGATION_UCODE_NEEDED) + return; + + /* + * A MDS_NO CPU for which SRBDS mitigation is not needed due to TSX + * being disabled and it hasn't received the SRBDS MSR microcode. + */ + if (!boot_cpu_has(X86_FEATURE_SRBDS_CTRL)) + return; + + rdmsrl(MSR_IA32_MCU_OPT_CTRL, mcu_ctrl); + + switch (srbds_mitigation) { + case SRBDS_MITIGATION_OFF: + case SRBDS_MITIGATION_TSX_OFF: + mcu_ctrl |= RNGDS_MITG_DIS; + break; + case SRBDS_MITIGATION_FULL: + mcu_ctrl &= ~RNGDS_MITG_DIS; + break; + default: + break; + } + + wrmsrl(MSR_IA32_MCU_OPT_CTRL, mcu_ctrl); +} + +static void __init srbds_select_mitigation(void) +{ + u64 ia32_cap; + + if (!boot_cpu_has_bug(X86_BUG_SRBDS)) + return; + + /* + * Check to see if this is one of the MDS_NO systems supporting TSX that + * are only exposed to SRBDS when TSX is enabled or when CPU is affected + * by Processor MMIO Stale Data vulnerability. 
+ */ + ia32_cap = x86_read_arch_cap_msr(); + if ((ia32_cap & ARCH_CAP_MDS_NO) && !boot_cpu_has(X86_FEATURE_RTM) && + !boot_cpu_has_bug(X86_BUG_MMIO_STALE_DATA)) + srbds_mitigation = SRBDS_MITIGATION_TSX_OFF; + else if (boot_cpu_has(X86_FEATURE_HYPERVISOR)) + srbds_mitigation = SRBDS_MITIGATION_HYPERVISOR; + else if (!boot_cpu_has(X86_FEATURE_SRBDS_CTRL)) + srbds_mitigation = SRBDS_MITIGATION_UCODE_NEEDED; + else if (cpu_mitigations_off() || srbds_off) + srbds_mitigation = SRBDS_MITIGATION_OFF; + + update_srbds_msr(); + pr_info("%s\n", srbds_strings[srbds_mitigation]); +} + +static int __init srbds_parse_cmdline(char *str) +{ + if (!str) + return -EINVAL; + + if (!boot_cpu_has_bug(X86_BUG_SRBDS)) + return 0; + + srbds_off = !strcmp(str, "off"); + return 0; +} +early_param("srbds", srbds_parse_cmdline); + +#undef pr_fmt +#define pr_fmt(fmt) "L1D Flush : " fmt + +enum l1d_flush_mitigations { + L1D_FLUSH_OFF = 0, + L1D_FLUSH_ON, +}; + +static enum l1d_flush_mitigations l1d_flush_mitigation __initdata = L1D_FLUSH_OFF; + +static void __init l1d_flush_select_mitigation(void) +{ + if (!l1d_flush_mitigation || !boot_cpu_has(X86_FEATURE_FLUSH_L1D)) + return; + + static_branch_enable(&switch_mm_cond_l1d_flush); + pr_info("Conditional flush on switch_mm() enabled\n"); +} + +static int __init l1d_flush_parse_cmdline(char *str) +{ + if (!strcmp(str, "on")) + l1d_flush_mitigation = L1D_FLUSH_ON; + + return 0; +} +early_param("l1d_flush", l1d_flush_parse_cmdline); + +#undef pr_fmt #define pr_fmt(fmt) "Spectre V1 : " fmt enum spectre_v1_mitigation { @@ -450,14 +731,12 @@ static void __init spectre_v1_select_mitigation(void) * If FSGSBASE is enabled, the user can put a kernel address in * GS, in which case SMAP provides no protection. * - * [ NOTE: Don't check for X86_FEATURE_FSGSBASE until the - * FSGSBASE enablement patches have been merged. ] - * * If FSGSBASE is disabled, the user can only put a user space * address in GS. 
That makes an attack harder, but still * possible if there's no SMAP protection. */ - if (!smap_works_speculatively()) { + if (boot_cpu_has(X86_FEATURE_FSGSBASE) || + !smap_works_speculatively()) { /* * Mitigation can be provided from SWAPGS itself or * PTI as the CR3 write in the Meltdown mitigation @@ -489,13 +768,183 @@ static int __init nospectre_v1_cmdline(char *str) } early_param("nospectre_v1", nospectre_v1_cmdline); -#undef pr_fmt -#define pr_fmt(fmt) "Spectre V2 : " fmt - static enum spectre_v2_mitigation spectre_v2_enabled __ro_after_init = SPECTRE_V2_NONE; -static enum spectre_v2_user_mitigation spectre_v2_user __ro_after_init = +#undef pr_fmt +#define pr_fmt(fmt) "RETBleed: " fmt + +enum retbleed_mitigation { + RETBLEED_MITIGATION_NONE, + RETBLEED_MITIGATION_UNRET, + RETBLEED_MITIGATION_IBPB, + RETBLEED_MITIGATION_IBRS, + RETBLEED_MITIGATION_EIBRS, +}; + +enum retbleed_mitigation_cmd { + RETBLEED_CMD_OFF, + RETBLEED_CMD_AUTO, + RETBLEED_CMD_UNRET, + RETBLEED_CMD_IBPB, +}; + +static const char * const retbleed_strings[] = { + [RETBLEED_MITIGATION_NONE] = "Vulnerable", + [RETBLEED_MITIGATION_UNRET] = "Mitigation: untrained return thunk", + [RETBLEED_MITIGATION_IBPB] = "Mitigation: IBPB", + [RETBLEED_MITIGATION_IBRS] = "Mitigation: IBRS", + [RETBLEED_MITIGATION_EIBRS] = "Mitigation: Enhanced IBRS", +}; + +static enum retbleed_mitigation retbleed_mitigation __ro_after_init = + RETBLEED_MITIGATION_NONE; +static enum retbleed_mitigation_cmd retbleed_cmd __ro_after_init = + RETBLEED_CMD_AUTO; + +static int __ro_after_init retbleed_nosmt = false; + +static int __init retbleed_parse_cmdline(char *str) +{ + if (!str) + return -EINVAL; + + while (str) { + char *next = strchr(str, ','); + if (next) { + *next = 0; + next++; + } + + if (!strcmp(str, "off")) { + retbleed_cmd = RETBLEED_CMD_OFF; + } else if (!strcmp(str, "auto")) { + retbleed_cmd = RETBLEED_CMD_AUTO; + } else if (!strcmp(str, "unret")) { + retbleed_cmd = RETBLEED_CMD_UNRET; + } else if (!strcmp(str, 
"ibpb")) { + retbleed_cmd = RETBLEED_CMD_IBPB; + } else if (!strcmp(str, "nosmt")) { + retbleed_nosmt = true; + } else { + pr_err("Ignoring unknown retbleed option (%s).", str); + } + + str = next; + } + + return 0; +} +early_param("retbleed", retbleed_parse_cmdline); + +#define RETBLEED_UNTRAIN_MSG "WARNING: BTB untrained return thunk mitigation is only effective on AMD/Hygon!\n" +#define RETBLEED_INTEL_MSG "WARNING: Spectre v2 mitigation leaves CPU vulnerable to RETBleed attacks, data leaks possible!\n" + +static void __init retbleed_select_mitigation(void) +{ + bool mitigate_smt = false; + + if (!boot_cpu_has_bug(X86_BUG_RETBLEED) || cpu_mitigations_off()) + return; + + switch (retbleed_cmd) { + case RETBLEED_CMD_OFF: + return; + + case RETBLEED_CMD_UNRET: + if (IS_ENABLED(CONFIG_CPU_UNRET_ENTRY)) { + retbleed_mitigation = RETBLEED_MITIGATION_UNRET; + } else { + pr_err("WARNING: kernel not compiled with CPU_UNRET_ENTRY.\n"); + goto do_cmd_auto; + } + break; + + case RETBLEED_CMD_IBPB: + if (!boot_cpu_has(X86_FEATURE_IBPB)) { + pr_err("WARNING: CPU does not support IBPB.\n"); + goto do_cmd_auto; + } else if (IS_ENABLED(CONFIG_CPU_IBPB_ENTRY)) { + retbleed_mitigation = RETBLEED_MITIGATION_IBPB; + } else { + pr_err("WARNING: kernel not compiled with CPU_IBPB_ENTRY.\n"); + goto do_cmd_auto; + } + break; + +do_cmd_auto: + case RETBLEED_CMD_AUTO: + default: + if (boot_cpu_data.x86_vendor == X86_VENDOR_AMD || + boot_cpu_data.x86_vendor == X86_VENDOR_HYGON) { + if (IS_ENABLED(CONFIG_CPU_UNRET_ENTRY)) + retbleed_mitigation = RETBLEED_MITIGATION_UNRET; + else if (IS_ENABLED(CONFIG_CPU_IBPB_ENTRY) && boot_cpu_has(X86_FEATURE_IBPB)) + retbleed_mitigation = RETBLEED_MITIGATION_IBPB; + } + + /* + * The Intel mitigation (IBRS or eIBRS) was already selected in + * spectre_v2_select_mitigation(). 'retbleed_mitigation' will + * be set accordingly below. 
+ */ + + break; + } + + switch (retbleed_mitigation) { + case RETBLEED_MITIGATION_UNRET: + setup_force_cpu_cap(X86_FEATURE_RETHUNK); + setup_force_cpu_cap(X86_FEATURE_UNRET); + + if (boot_cpu_data.x86_vendor != X86_VENDOR_AMD && + boot_cpu_data.x86_vendor != X86_VENDOR_HYGON) + pr_err(RETBLEED_UNTRAIN_MSG); + + mitigate_smt = true; + break; + + case RETBLEED_MITIGATION_IBPB: + setup_force_cpu_cap(X86_FEATURE_ENTRY_IBPB); + mitigate_smt = true; + break; + + default: + break; + } + + if (mitigate_smt && !boot_cpu_has(X86_FEATURE_STIBP) && + (retbleed_nosmt || cpu_mitigations_auto_nosmt())) + cpu_smt_disable(false); + + /* + * Let IBRS trump all on Intel without affecting the effects of the + * retbleed= cmdline option. + */ + if (boot_cpu_data.x86_vendor == X86_VENDOR_INTEL) { + switch (spectre_v2_enabled) { + case SPECTRE_V2_IBRS: + retbleed_mitigation = RETBLEED_MITIGATION_IBRS; + break; + case SPECTRE_V2_EIBRS: + case SPECTRE_V2_EIBRS_RETPOLINE: + case SPECTRE_V2_EIBRS_LFENCE: + retbleed_mitigation = RETBLEED_MITIGATION_EIBRS; + break; + default: + pr_err(RETBLEED_INTEL_MSG); + } + } + + pr_info("%s\n", retbleed_strings[retbleed_mitigation]); +} + +#undef pr_fmt +#define pr_fmt(fmt) "Spectre V2 : " fmt + +static enum spectre_v2_user_mitigation spectre_v2_user_stibp __ro_after_init = + SPECTRE_V2_USER_NONE; +static enum spectre_v2_user_mitigation spectre_v2_user_ibpb __ro_after_init = SPECTRE_V2_USER_NONE; #ifdef CONFIG_RETPOLINE @@ -519,6 +968,33 @@ static inline const char *spectre_v2_module_string(void) static inline const char *spectre_v2_module_string(void) { return ""; } #endif +#define SPECTRE_V2_LFENCE_MSG "WARNING: LFENCE mitigation is not recommended for this CPU, data leaks possible!\n" +#define SPECTRE_V2_EIBRS_EBPF_MSG "WARNING: Unprivileged eBPF is enabled with eIBRS on, data leaks possible via Spectre v2 BHB attacks!\n" +#define SPECTRE_V2_EIBRS_LFENCE_EBPF_SMT_MSG "WARNING: Unprivileged eBPF is enabled with eIBRS+LFENCE mitigation and SMT, data 
leaks possible via Spectre v2 BHB attacks!\n" +#define SPECTRE_V2_IBRS_PERF_MSG "WARNING: IBRS mitigation selected on Enhanced IBRS CPU, this may cause unnecessary performance loss\n" + +#ifdef CONFIG_BPF_SYSCALL +void unpriv_ebpf_notify(int new_state) +{ + if (new_state) + return; + + /* Unprivileged eBPF is enabled */ + + switch (spectre_v2_enabled) { + case SPECTRE_V2_EIBRS: + pr_err(SPECTRE_V2_EIBRS_EBPF_MSG); + break; + case SPECTRE_V2_EIBRS_LFENCE: + if (sched_smt_active()) + pr_err(SPECTRE_V2_EIBRS_LFENCE_EBPF_SMT_MSG); + break; + default: + break; + } +} +#endif + static inline bool match_option(const char *arg, int arglen, const char *opt) { int len = strlen(opt); @@ -533,7 +1009,11 @@ enum spectre_v2_mitigation_cmd { SPECTRE_V2_CMD_FORCE, SPECTRE_V2_CMD_RETPOLINE, SPECTRE_V2_CMD_RETPOLINE_GENERIC, - SPECTRE_V2_CMD_RETPOLINE_AMD, + SPECTRE_V2_CMD_RETPOLINE_LFENCE, + SPECTRE_V2_CMD_EIBRS, + SPECTRE_V2_CMD_EIBRS_RETPOLINE, + SPECTRE_V2_CMD_EIBRS_LFENCE, + SPECTRE_V2_CMD_IBRS, }; enum spectre_v2_user_cmd { @@ -574,13 +1054,15 @@ static void __init spec_v2_user_print_cond(const char *reason, bool secure) pr_info("spectre_v2_user=%s forced on command line.\n", reason); } +static __ro_after_init enum spectre_v2_mitigation_cmd spectre_v2_cmd; + static enum spectre_v2_user_cmd __init -spectre_v2_parse_user_cmdline(enum spectre_v2_mitigation_cmd v2_cmd) +spectre_v2_parse_user_cmdline(void) { char arg[20]; int ret, i; - switch (v2_cmd) { + switch (spectre_v2_cmd) { case SPECTRE_V2_CMD_NONE: return SPECTRE_V2_USER_CMD_NONE; case SPECTRE_V2_CMD_FORCE: @@ -606,8 +1088,16 @@ spectre_v2_parse_user_cmdline(enum spectre_v2_mitigation_cmd v2_cmd) return SPECTRE_V2_USER_CMD_AUTO; } +static inline bool spectre_v2_in_ibrs_mode(enum spectre_v2_mitigation mode) +{ + return mode == SPECTRE_V2_IBRS || + mode == SPECTRE_V2_EIBRS || + mode == SPECTRE_V2_EIBRS_RETPOLINE || + mode == SPECTRE_V2_EIBRS_LFENCE; +} + static void __init -spectre_v2_user_select_mitigation(enum 
spectre_v2_mitigation_cmd v2_cmd) +spectre_v2_user_select_mitigation(void) { enum spectre_v2_user_mitigation mode = SPECTRE_V2_USER_NONE; bool smt_possible = IS_ENABLED(CONFIG_SMP); @@ -620,18 +1110,18 @@ spectre_v2_user_select_mitigation(enum spectre_v2_mitigation_cmd v2_cmd) cpu_smt_control == CPU_SMT_NOT_SUPPORTED) smt_possible = false; - cmd = spectre_v2_parse_user_cmdline(v2_cmd); + cmd = spectre_v2_parse_user_cmdline(); switch (cmd) { case SPECTRE_V2_USER_CMD_NONE: goto set_mode; case SPECTRE_V2_USER_CMD_FORCE: mode = SPECTRE_V2_USER_STRICT; break; + case SPECTRE_V2_USER_CMD_AUTO: case SPECTRE_V2_USER_CMD_PRCTL: case SPECTRE_V2_USER_CMD_PRCTL_IBPB: mode = SPECTRE_V2_USER_PRCTL; break; - case SPECTRE_V2_USER_CMD_AUTO: case SPECTRE_V2_USER_CMD_SECCOMP: case SPECTRE_V2_USER_CMD_SECCOMP_IBPB: if (IS_ENABLED(CONFIG_SECCOMP)) @@ -641,24 +1131,17 @@ spectre_v2_user_select_mitigation(enum spectre_v2_mitigation_cmd v2_cmd) break; } - /* - * At this point, an STIBP mode other than "off" has been set. - * If STIBP support is not being forced, check if STIBP always-on - * is preferred. 
- */ - if (mode != SPECTRE_V2_USER_STRICT && - boot_cpu_has(X86_FEATURE_AMD_STIBP_ALWAYS_ON)) - mode = SPECTRE_V2_USER_STRICT_PREFERRED; - /* Initialize Indirect Branch Prediction Barrier */ if (boot_cpu_has(X86_FEATURE_IBPB)) { setup_force_cpu_cap(X86_FEATURE_USE_IBPB); + spectre_v2_user_ibpb = mode; switch (cmd) { case SPECTRE_V2_USER_CMD_FORCE: case SPECTRE_V2_USER_CMD_PRCTL_IBPB: case SPECTRE_V2_USER_CMD_SECCOMP_IBPB: static_branch_enable(&switch_mm_always_ibpb); + spectre_v2_user_ibpb = SPECTRE_V2_USER_STRICT; break; case SPECTRE_V2_USER_CMD_PRCTL: case SPECTRE_V2_USER_CMD_AUTO: @@ -674,28 +1157,46 @@ spectre_v2_user_select_mitigation(enum spectre_v2_mitigation_cmd v2_cmd) "always-on" : "conditional"); } - /* If enhanced IBRS is enabled no STIBP required */ - if (spectre_v2_enabled == SPECTRE_V2_IBRS_ENHANCED) + /* + * If no STIBP, IBRS or enhanced IBRS is enabled, or SMT impossible, + * STIBP is not required. + */ + if (!boot_cpu_has(X86_FEATURE_STIBP) || + !smt_possible || + spectre_v2_in_ibrs_mode(spectre_v2_enabled)) return; /* - * If SMT is not possible or STIBP is not available clear the STIBP - * mode. + * At this point, an STIBP mode other than "off" has been set. + * If STIBP support is not being forced, check if STIBP always-on + * is preferred. 
*/ - if (!smt_possible || !boot_cpu_has(X86_FEATURE_STIBP)) - mode = SPECTRE_V2_USER_NONE; + if (mode != SPECTRE_V2_USER_STRICT && + boot_cpu_has(X86_FEATURE_AMD_STIBP_ALWAYS_ON)) + mode = SPECTRE_V2_USER_STRICT_PREFERRED; + + if (retbleed_mitigation == RETBLEED_MITIGATION_UNRET || + retbleed_mitigation == RETBLEED_MITIGATION_IBPB) { + if (mode != SPECTRE_V2_USER_STRICT && + mode != SPECTRE_V2_USER_STRICT_PREFERRED) + pr_info("Selecting STIBP always-on mode to complement retbleed mitigation\n"); + mode = SPECTRE_V2_USER_STRICT_PREFERRED; + } + + spectre_v2_user_stibp = mode; + set_mode: - spectre_v2_user = mode; - /* Only print the STIBP mode when SMT possible */ - if (smt_possible) - pr_info("%s\n", spectre_v2_user_strings[mode]); + pr_info("%s\n", spectre_v2_user_strings[mode]); } static const char * const spectre_v2_strings[] = { [SPECTRE_V2_NONE] = "Vulnerable", - [SPECTRE_V2_RETPOLINE_GENERIC] = "Mitigation: Full generic retpoline", - [SPECTRE_V2_RETPOLINE_AMD] = "Mitigation: Full AMD retpoline", - [SPECTRE_V2_IBRS_ENHANCED] = "Mitigation: Enhanced IBRS", + [SPECTRE_V2_RETPOLINE] = "Mitigation: Retpolines", + [SPECTRE_V2_LFENCE] = "Mitigation: LFENCE", + [SPECTRE_V2_EIBRS] = "Mitigation: Enhanced IBRS", + [SPECTRE_V2_EIBRS_LFENCE] = "Mitigation: Enhanced IBRS + LFENCE", + [SPECTRE_V2_EIBRS_RETPOLINE] = "Mitigation: Enhanced IBRS + Retpolines", + [SPECTRE_V2_IBRS] = "Mitigation: IBRS", }; static const struct { @@ -706,9 +1207,14 @@ static const struct { { "off", SPECTRE_V2_CMD_NONE, false }, { "on", SPECTRE_V2_CMD_FORCE, true }, { "retpoline", SPECTRE_V2_CMD_RETPOLINE, false }, - { "retpoline,amd", SPECTRE_V2_CMD_RETPOLINE_AMD, false }, + { "retpoline,amd", SPECTRE_V2_CMD_RETPOLINE_LFENCE, false }, + { "retpoline,lfence", SPECTRE_V2_CMD_RETPOLINE_LFENCE, false }, { "retpoline,generic", SPECTRE_V2_CMD_RETPOLINE_GENERIC, false }, + { "eibrs", SPECTRE_V2_CMD_EIBRS, false }, + { "eibrs,lfence", SPECTRE_V2_CMD_EIBRS_LFENCE, false }, + { "eibrs,retpoline", 
SPECTRE_V2_CMD_EIBRS_RETPOLINE, false }, { "auto", SPECTRE_V2_CMD_AUTO, false }, + { "ibrs", SPECTRE_V2_CMD_IBRS, false }, }; static void __init spec_v2_print_cond(const char *reason, bool secure) @@ -744,17 +1250,54 @@ static enum spectre_v2_mitigation_cmd __init spectre_v2_parse_cmdline(void) } if ((cmd == SPECTRE_V2_CMD_RETPOLINE || - cmd == SPECTRE_V2_CMD_RETPOLINE_AMD || - cmd == SPECTRE_V2_CMD_RETPOLINE_GENERIC) && + cmd == SPECTRE_V2_CMD_RETPOLINE_LFENCE || + cmd == SPECTRE_V2_CMD_RETPOLINE_GENERIC || + cmd == SPECTRE_V2_CMD_EIBRS_LFENCE || + cmd == SPECTRE_V2_CMD_EIBRS_RETPOLINE) && !IS_ENABLED(CONFIG_RETPOLINE)) { - pr_err("%s selected but not compiled in. Switching to AUTO select\n", mitigation_options[i].option); + pr_err("%s selected but not compiled in. Switching to AUTO select\n", + mitigation_options[i].option); + return SPECTRE_V2_CMD_AUTO; + } + + if ((cmd == SPECTRE_V2_CMD_EIBRS || + cmd == SPECTRE_V2_CMD_EIBRS_LFENCE || + cmd == SPECTRE_V2_CMD_EIBRS_RETPOLINE) && + !boot_cpu_has(X86_FEATURE_IBRS_ENHANCED)) { + pr_err("%s selected but CPU doesn't have eIBRS. Switching to AUTO select\n", + mitigation_options[i].option); + return SPECTRE_V2_CMD_AUTO; + } + + if ((cmd == SPECTRE_V2_CMD_RETPOLINE_LFENCE || + cmd == SPECTRE_V2_CMD_EIBRS_LFENCE) && + !boot_cpu_has(X86_FEATURE_LFENCE_RDTSC)) { + pr_err("%s selected, but CPU doesn't have a serializing LFENCE. Switching to AUTO select\n", + mitigation_options[i].option); + return SPECTRE_V2_CMD_AUTO; + } + + if (cmd == SPECTRE_V2_CMD_IBRS && !IS_ENABLED(CONFIG_CPU_IBRS_ENTRY)) { + pr_err("%s selected but not compiled in. Switching to AUTO select\n", + mitigation_options[i].option); return SPECTRE_V2_CMD_AUTO; } - if (cmd == SPECTRE_V2_CMD_RETPOLINE_AMD && - boot_cpu_data.x86_vendor != X86_VENDOR_HYGON && - boot_cpu_data.x86_vendor != X86_VENDOR_AMD) { - pr_err("retpoline,amd selected but CPU is not AMD. 
Switching to AUTO select\n"); + if (cmd == SPECTRE_V2_CMD_IBRS && boot_cpu_data.x86_vendor != X86_VENDOR_INTEL) { + pr_err("%s selected but not Intel CPU. Switching to AUTO select\n", + mitigation_options[i].option); + return SPECTRE_V2_CMD_AUTO; + } + + if (cmd == SPECTRE_V2_CMD_IBRS && !boot_cpu_has(X86_FEATURE_IBRS)) { + pr_err("%s selected but CPU doesn't have IBRS. Switching to AUTO select\n", + mitigation_options[i].option); + return SPECTRE_V2_CMD_AUTO; + } + + if (cmd == SPECTRE_V2_CMD_IBRS && boot_cpu_has(X86_FEATURE_XENPV)) { + pr_err("%s selected but running as XenPV guest. Switching to AUTO select\n", + mitigation_options[i].option); return SPECTRE_V2_CMD_AUTO; } @@ -763,6 +1306,79 @@ static enum spectre_v2_mitigation_cmd __init spectre_v2_parse_cmdline(void) return cmd; } +static enum spectre_v2_mitigation __init spectre_v2_select_retpoline(void) +{ + if (!IS_ENABLED(CONFIG_RETPOLINE)) { + pr_err("Kernel not compiled with retpoline; no mitigation available!"); + return SPECTRE_V2_NONE; + } + + return SPECTRE_V2_RETPOLINE; +} + +/* Disable in-kernel use of non-RSB RET predictors */ +static void __init spec_ctrl_disable_kernel_rrsba(void) +{ + u64 ia32_cap; + + if (!boot_cpu_has(X86_FEATURE_RRSBA_CTRL)) + return; + + ia32_cap = x86_read_arch_cap_msr(); + + if (ia32_cap & ARCH_CAP_RRSBA) { + x86_spec_ctrl_base |= SPEC_CTRL_RRSBA_DIS_S; + write_spec_ctrl_current(x86_spec_ctrl_base, true); + } +} + +static void __init spectre_v2_determine_rsb_fill_type_at_vmexit(enum spectre_v2_mitigation mode) +{ + /* + * Similar to context switches, there are two types of RSB attacks + * after VM exit: + * + * 1) RSB underflow + * + * 2) Poisoned RSB entry + * + * When retpoline is enabled, both are mitigated by filling/clearing + * the RSB. + * + * When IBRS is enabled, while #1 would be mitigated by the IBRS branch + * prediction isolation protections, RSB still needs to be cleared + * because of #2. 
Note that SMEP provides no protection here, unlike + * user-space-poisoned RSB entries. + * + * eIBRS should protect against RSB poisoning, but if the EIBRS_PBRSB + * bug is present then a LITE version of RSB protection is required, + * just a single call needs to retire before a RET is executed. + */ + switch (mode) { + case SPECTRE_V2_NONE: + return; + + case SPECTRE_V2_EIBRS_LFENCE: + case SPECTRE_V2_EIBRS: + if (boot_cpu_has_bug(X86_BUG_EIBRS_PBRSB)) { + setup_force_cpu_cap(X86_FEATURE_RSB_VMEXIT_LITE); + pr_info("Spectre v2 / PBRSB-eIBRS: Retire a single CALL on VMEXIT\n"); + } + return; + + case SPECTRE_V2_EIBRS_RETPOLINE: + case SPECTRE_V2_RETPOLINE: + case SPECTRE_V2_LFENCE: + case SPECTRE_V2_IBRS: + setup_force_cpu_cap(X86_FEATURE_RSB_VMEXIT); + pr_info("Spectre v2 / SpectreRSB : Filling RSB on VMEXIT\n"); + return; + } + + pr_warn_once("Unknown Spectre v2 mode, disabling RSB mitigation at VM exit"); + dump_stack(); +} + static void __init spectre_v2_select_mitigation(void) { enum spectre_v2_mitigation_cmd cmd = spectre_v2_parse_cmdline(); @@ -783,86 +1399,172 @@ static void __init spectre_v2_select_mitigation(void) case SPECTRE_V2_CMD_FORCE: case SPECTRE_V2_CMD_AUTO: if (boot_cpu_has(X86_FEATURE_IBRS_ENHANCED)) { - mode = SPECTRE_V2_IBRS_ENHANCED; - /* Force it so VMEXIT will restore correctly */ - x86_spec_ctrl_base |= SPEC_CTRL_IBRS; - wrmsrl(MSR_IA32_SPEC_CTRL, x86_spec_ctrl_base); - goto specv2_set_mode; + mode = SPECTRE_V2_EIBRS; + break; + } + + if (IS_ENABLED(CONFIG_CPU_IBRS_ENTRY) && + boot_cpu_has_bug(X86_BUG_RETBLEED) && + retbleed_cmd != RETBLEED_CMD_OFF && + boot_cpu_has(X86_FEATURE_IBRS) && + boot_cpu_data.x86_vendor == X86_VENDOR_INTEL) { + mode = SPECTRE_V2_IBRS; + break; } - if (IS_ENABLED(CONFIG_RETPOLINE)) - goto retpoline_auto; + + mode = spectre_v2_select_retpoline(); break; - case SPECTRE_V2_CMD_RETPOLINE_AMD: - if (IS_ENABLED(CONFIG_RETPOLINE)) - goto retpoline_amd; + + case SPECTRE_V2_CMD_RETPOLINE_LFENCE: + 
pr_err(SPECTRE_V2_LFENCE_MSG); + mode = SPECTRE_V2_LFENCE; break; + case SPECTRE_V2_CMD_RETPOLINE_GENERIC: - if (IS_ENABLED(CONFIG_RETPOLINE)) - goto retpoline_generic; + mode = SPECTRE_V2_RETPOLINE; break; + case SPECTRE_V2_CMD_RETPOLINE: - if (IS_ENABLED(CONFIG_RETPOLINE)) - goto retpoline_auto; + mode = spectre_v2_select_retpoline(); + break; + + case SPECTRE_V2_CMD_IBRS: + mode = SPECTRE_V2_IBRS; + break; + + case SPECTRE_V2_CMD_EIBRS: + mode = SPECTRE_V2_EIBRS; + break; + + case SPECTRE_V2_CMD_EIBRS_LFENCE: + mode = SPECTRE_V2_EIBRS_LFENCE; + break; + + case SPECTRE_V2_CMD_EIBRS_RETPOLINE: + mode = SPECTRE_V2_EIBRS_RETPOLINE; break; } - pr_err("Spectre mitigation: kernel not compiled with retpoline; no mitigation available!"); - return; -retpoline_auto: - if (boot_cpu_data.x86_vendor == X86_VENDOR_AMD || - boot_cpu_data.x86_vendor == X86_VENDOR_HYGON) { - retpoline_amd: - if (!boot_cpu_has(X86_FEATURE_LFENCE_RDTSC)) { - pr_err("Spectre mitigation: LFENCE not serializing, switching to generic retpoline\n"); - goto retpoline_generic; - } - mode = SPECTRE_V2_RETPOLINE_AMD; - setup_force_cpu_cap(X86_FEATURE_RETPOLINE_AMD); - setup_force_cpu_cap(X86_FEATURE_RETPOLINE); - } else { - retpoline_generic: - mode = SPECTRE_V2_RETPOLINE_GENERIC; + if (mode == SPECTRE_V2_EIBRS && unprivileged_ebpf_enabled()) + pr_err(SPECTRE_V2_EIBRS_EBPF_MSG); + + if (spectre_v2_in_ibrs_mode(mode)) { + x86_spec_ctrl_base |= SPEC_CTRL_IBRS; + write_spec_ctrl_current(x86_spec_ctrl_base, true); + } + + switch (mode) { + case SPECTRE_V2_NONE: + case SPECTRE_V2_EIBRS: + break; + + case SPECTRE_V2_IBRS: + setup_force_cpu_cap(X86_FEATURE_KERNEL_IBRS); + if (boot_cpu_has(X86_FEATURE_IBRS_ENHANCED)) + pr_warn(SPECTRE_V2_IBRS_PERF_MSG); + break; + + case SPECTRE_V2_LFENCE: + case SPECTRE_V2_EIBRS_LFENCE: + setup_force_cpu_cap(X86_FEATURE_RETPOLINE_LFENCE); + fallthrough; + + case SPECTRE_V2_RETPOLINE: + case SPECTRE_V2_EIBRS_RETPOLINE: setup_force_cpu_cap(X86_FEATURE_RETPOLINE); + break; } 
-specv2_set_mode: + /* + * Disable alternate RSB predictions in kernel when indirect CALLs and + * JMPs gets protection against BHI and Intramode-BTI, but RET + * prediction from a non-RSB predictor is still a risk. + */ + if (mode == SPECTRE_V2_EIBRS_LFENCE || + mode == SPECTRE_V2_EIBRS_RETPOLINE || + mode == SPECTRE_V2_RETPOLINE) + spec_ctrl_disable_kernel_rrsba(); + spectre_v2_enabled = mode; pr_info("%s\n", spectre_v2_strings[mode]); /* - * If spectre v2 protection has been enabled, unconditionally fill - * RSB during a context switch; this protects against two independent - * issues: + * If Spectre v2 protection has been enabled, fill the RSB during a + * context switch. In general there are two types of RSB attacks + * across context switches, for which the CALLs/RETs may be unbalanced. + * + * 1) RSB underflow + * + * Some Intel parts have "bottomless RSB". When the RSB is empty, + * speculated return targets may come from the branch predictor, + * which could have a user-poisoned BTB or BHB entry. + * + * AMD has it even worse: *all* returns are speculated from the BTB, + * regardless of the state of the RSB. + * + * When IBRS or eIBRS is enabled, the "user -> kernel" attack + * scenario is mitigated by the IBRS branch prediction isolation + * properties, so the RSB buffer filling wouldn't be necessary to + * protect against this type of attack. + * + * The "user -> user" attack scenario is mitigated by RSB filling. + * + * 2) Poisoned RSB entry + * + * If the 'next' in-kernel return stack is shorter than 'prev', + * 'next' could be tricked into speculating with a user-poisoned RSB + * entry. + * + * The "user -> kernel" attack scenario is mitigated by SMEP and + * eIBRS. + * + * The "user -> user" scenario, also known as SpectreBHB, requires + * RSB clearing. + * + * So to mitigate all cases, unconditionally fill RSB on context + * switches. 
* - * - RSB underflow (and switch to BTB) on Skylake+ - * - SpectreRSB variant of spectre v2 on X86_BUG_SPECTRE_V2 CPUs + * FIXME: Is this pointless for retbleed-affected AMD? */ setup_force_cpu_cap(X86_FEATURE_RSB_CTXSW); pr_info("Spectre v2 / SpectreRSB mitigation: Filling RSB on context switch\n"); + spectre_v2_determine_rsb_fill_type_at_vmexit(mode); + /* - * Retpoline means the kernel is safe because it has no indirect - * branches. Enhanced IBRS protects firmware too, so, enable restricted - * speculation around firmware calls only when Enhanced IBRS isn't - * supported. + * Retpoline protects the kernel, but doesn't protect firmware. IBRS + * and Enhanced IBRS protect firmware too, so enable IBRS around + * firmware calls only when IBRS / Enhanced IBRS aren't otherwise + * enabled. * * Use "mode" to check Enhanced IBRS instead of boot_cpu_has(), because * the user might select retpoline on the kernel command line and if * the CPU supports Enhanced IBRS, kernel might un-intentionally not * enable IBRS around firmware calls. 
*/ - if (boot_cpu_has(X86_FEATURE_IBRS) && mode != SPECTRE_V2_IBRS_ENHANCED) { + if (boot_cpu_has_bug(X86_BUG_RETBLEED) && + boot_cpu_has(X86_FEATURE_IBPB) && + (boot_cpu_data.x86_vendor == X86_VENDOR_AMD || + boot_cpu_data.x86_vendor == X86_VENDOR_HYGON)) { + + if (retbleed_cmd != RETBLEED_CMD_IBPB) { + setup_force_cpu_cap(X86_FEATURE_USE_IBPB_FW); + pr_info("Enabling Speculation Barrier for firmware calls\n"); + } + + } else if (boot_cpu_has(X86_FEATURE_IBRS) && !spectre_v2_in_ibrs_mode(mode)) { setup_force_cpu_cap(X86_FEATURE_USE_IBRS_FW); pr_info("Enabling Restricted Speculation for firmware calls\n"); } /* Set up IBPB and STIBP depending on the general spectre V2 command */ - spectre_v2_user_select_mitigation(cmd); + spectre_v2_cmd = cmd; } static void update_stibp_msr(void * __unused) { - wrmsrl(MSR_IA32_SPEC_CTRL, x86_spec_ctrl_base); + u64 val = spec_ctrl_current() | (x86_spec_ctrl_base & SPEC_CTRL_STIBP); + write_spec_ctrl_current(val, true); } /* Update x86_spec_ctrl_base in case SMT state changed. */ @@ -897,6 +1599,8 @@ static void update_indir_branch_cond(void) /* Update the static key controlling the MDS CPU buffer clear in idle */ static void update_mds_branch_idle(void) { + u64 ia32_cap = x86_read_arch_cap_msr(); + /* * Enable the idle clearing if SMT is active on CPUs which are * affected only by MSBDS and not any other MDS variant. @@ -908,20 +1612,27 @@ static void update_mds_branch_idle(void) if (!boot_cpu_has_bug(X86_BUG_MSBDS_ONLY)) return; - if (sched_smt_active()) + if (sched_smt_active()) { static_branch_enable(&mds_idle_clear); - else + } else if (mmio_mitigation == MMIO_MITIGATION_OFF || + (ia32_cap & ARCH_CAP_FBSDP_NO)) { static_branch_disable(&mds_idle_clear); + } } #define MDS_MSG_SMT "MDS CPU bug present and SMT on, data leak possible. See https://www.kernel.org/doc/html/latest/admin-guide/hw-vuln/mds.html for more details.\n" #define TAA_MSG_SMT "TAA CPU bug present and SMT on, data leak possible. 
See https://www.kernel.org/doc/html/latest/admin-guide/hw-vuln/tsx_async_abort.html for more details.\n" +#define MMIO_MSG_SMT "MMIO Stale Data CPU bug present and SMT on, data leak possible. See https://www.kernel.org/doc/html/latest/admin-guide/hw-vuln/processor_mmio_stale_data.html for more details.\n" void cpu_bugs_smt_update(void) { mutex_lock(&spec_ctrl_mutex); - switch (spectre_v2_user) { + if (sched_smt_active() && unprivileged_ebpf_enabled() && + spectre_v2_enabled == SPECTRE_V2_EIBRS_LFENCE) + pr_warn_once(SPECTRE_V2_EIBRS_LFENCE_EBPF_SMT_MSG); + + switch (spectre_v2_user_stibp) { case SPECTRE_V2_USER_NONE: break; case SPECTRE_V2_USER_STRICT: @@ -956,6 +1667,16 @@ void cpu_bugs_smt_update(void) break; } + switch (mmio_mitigation) { + case MMIO_MITIGATION_VERW: + case MMIO_MITIGATION_UCODE_NEEDED: + if (sched_smt_active()) + pr_warn_once(MMIO_MSG_SMT); + break; + case MMIO_MITIGATION_OFF: + break; + } + mutex_unlock(&spec_ctrl_mutex); } @@ -1038,7 +1759,6 @@ static enum ssb_mitigation __init __ssb_select_mitigation(void) return mode; switch (cmd) { - case SPEC_STORE_BYPASS_CMD_AUTO: case SPEC_STORE_BYPASS_CMD_SECCOMP: /* * Choose prctl+seccomp as the default mode if seccomp is @@ -1052,6 +1772,7 @@ static enum ssb_mitigation __init __ssb_select_mitigation(void) case SPEC_STORE_BYPASS_CMD_ON: mode = SPEC_STORE_BYPASS_DISABLE; break; + case SPEC_STORE_BYPASS_CMD_AUTO: case SPEC_STORE_BYPASS_CMD_PRCTL: mode = SPEC_STORE_BYPASS_PRCTL; break; @@ -1060,16 +1781,6 @@ static enum ssb_mitigation __init __ssb_select_mitigation(void) } /* - * If SSBD is controlled by the SPEC_CTRL MSR, then set the proper - * bit in the mask to allow guests to use the mitigation even in the - * case where the host does not enable it. 
- */ - if (static_cpu_has(X86_FEATURE_SPEC_CTRL_SSBD) || - static_cpu_has(X86_FEATURE_AMD_SSBD)) { - x86_spec_ctrl_mask |= SPEC_CTRL_SSBD; - } - - /* * We have three CPU feature flags that are in play here: * - X86_BUG_SPEC_STORE_BYPASS - CPU is susceptible. * - X86_FEATURE_SSBD - CPU is able to turn off speculative store bypass @@ -1086,7 +1797,7 @@ static enum ssb_mitigation __init __ssb_select_mitigation(void) x86_amd_ssb_disable(); } else { x86_spec_ctrl_base |= SPEC_CTRL_SSBD; - wrmsrl(MSR_IA32_SPEC_CTRL, x86_spec_ctrl_base); + write_spec_ctrl_current(x86_spec_ctrl_base, true); } } @@ -1121,6 +1832,24 @@ static void task_update_spec_tif(struct task_struct *tsk) speculation_ctrl_update_current(); } +static int l1d_flush_prctl_set(struct task_struct *task, unsigned long ctrl) +{ + + if (!static_branch_unlikely(&switch_mm_cond_l1d_flush)) + return -EPERM; + + switch (ctrl) { + case PR_SPEC_ENABLE: + set_ti_thread_flag(&task->thread_info, TIF_SPEC_L1D_FLUSH); + return 0; + case PR_SPEC_DISABLE: + clear_ti_thread_flag(&task->thread_info, TIF_SPEC_L1D_FLUSH); + return 0; + default: + return -ERANGE; + } +} + static int ssb_prctl_set(struct task_struct *task, unsigned long ctrl) { if (ssb_mode != SPEC_STORE_BYPASS_PRCTL && @@ -1160,19 +1889,41 @@ static int ssb_prctl_set(struct task_struct *task, unsigned long ctrl) return 0; } +static bool is_spec_ib_user_controlled(void) +{ + return spectre_v2_user_ibpb == SPECTRE_V2_USER_PRCTL || + spectre_v2_user_ibpb == SPECTRE_V2_USER_SECCOMP || + spectre_v2_user_stibp == SPECTRE_V2_USER_PRCTL || + spectre_v2_user_stibp == SPECTRE_V2_USER_SECCOMP; +} + static int ib_prctl_set(struct task_struct *task, unsigned long ctrl) { switch (ctrl) { case PR_SPEC_ENABLE: - if (spectre_v2_user == SPECTRE_V2_USER_NONE) + if (spectre_v2_user_ibpb == SPECTRE_V2_USER_NONE && + spectre_v2_user_stibp == SPECTRE_V2_USER_NONE) return 0; + /* - * Indirect branch speculation is always disabled in strict - * mode. 
+ * With strict mode for both IBPB and STIBP, the instruction + * code paths avoid checking this task flag and instead, + * unconditionally run the instruction. However, STIBP and IBPB + * are independent and either can be set to conditionally + * enabled regardless of the mode of the other. + * + * If either is set to conditional, allow the task flag to be + * updated, unless it was force-disabled by a previous prctl + * call. Currently, this is possible on an AMD CPU which has the + * feature X86_FEATURE_AMD_STIBP_ALWAYS_ON. In this case, if the + * kernel is booted with 'spectre_v2_user=seccomp', then + * spectre_v2_user_ibpb == SPECTRE_V2_USER_SECCOMP and + * spectre_v2_user_stibp == SPECTRE_V2_USER_STRICT_PREFERRED. */ - if (spectre_v2_user == SPECTRE_V2_USER_STRICT || - spectre_v2_user == SPECTRE_V2_USER_STRICT_PREFERRED) + if (!is_spec_ib_user_controlled() || + task_spec_ib_force_disable(task)) return -EPERM; + task_clear_spec_ib_disable(task); task_update_spec_tif(task); break; @@ -1182,11 +1933,13 @@ static int ib_prctl_set(struct task_struct *task, unsigned long ctrl) * Indirect branch speculation is always allowed when * mitigation is force disabled. 
*/ - if (spectre_v2_user == SPECTRE_V2_USER_NONE) + if (spectre_v2_user_ibpb == SPECTRE_V2_USER_NONE && + spectre_v2_user_stibp == SPECTRE_V2_USER_NONE) return -EPERM; - if (spectre_v2_user == SPECTRE_V2_USER_STRICT || - spectre_v2_user == SPECTRE_V2_USER_STRICT_PREFERRED) + + if (!is_spec_ib_user_controlled()) return 0; + task_set_spec_ib_disable(task); if (ctrl == PR_SPEC_FORCE_DISABLE) task_set_spec_ib_force_disable(task); @@ -1206,6 +1959,8 @@ int arch_prctl_spec_ctrl_set(struct task_struct *task, unsigned long which, return ssb_prctl_set(task, ctrl); case PR_SPEC_INDIRECT_BRANCH: return ib_prctl_set(task, ctrl); + case PR_SPEC_L1D_FLUSH: + return l1d_flush_prctl_set(task, ctrl); default: return -ENODEV; } @@ -1216,11 +1971,23 @@ void arch_seccomp_spec_mitigate(struct task_struct *task) { if (ssb_mode == SPEC_STORE_BYPASS_SECCOMP) ssb_prctl_set(task, PR_SPEC_FORCE_DISABLE); - if (spectre_v2_user == SPECTRE_V2_USER_SECCOMP) + if (spectre_v2_user_ibpb == SPECTRE_V2_USER_SECCOMP || + spectre_v2_user_stibp == SPECTRE_V2_USER_SECCOMP) ib_prctl_set(task, PR_SPEC_FORCE_DISABLE); } #endif +static int l1d_flush_prctl_get(struct task_struct *task) +{ + if (!static_branch_unlikely(&switch_mm_cond_l1d_flush)) + return PR_SPEC_FORCE_DISABLE; + + if (test_ti_thread_flag(&task->thread_info, TIF_SPEC_L1D_FLUSH)) + return PR_SPEC_PRCTL | PR_SPEC_ENABLE; + else + return PR_SPEC_PRCTL | PR_SPEC_DISABLE; +} + static int ssb_prctl_get(struct task_struct *task) { switch (ssb_mode) { @@ -1247,22 +2014,21 @@ static int ib_prctl_get(struct task_struct *task) if (!boot_cpu_has_bug(X86_BUG_SPECTRE_V2)) return PR_SPEC_NOT_AFFECTED; - switch (spectre_v2_user) { - case SPECTRE_V2_USER_NONE: + if (spectre_v2_user_ibpb == SPECTRE_V2_USER_NONE && + spectre_v2_user_stibp == SPECTRE_V2_USER_NONE) return PR_SPEC_ENABLE; - case SPECTRE_V2_USER_PRCTL: - case SPECTRE_V2_USER_SECCOMP: + else if (is_spec_ib_user_controlled()) { if (task_spec_ib_force_disable(task)) return PR_SPEC_PRCTL | 
PR_SPEC_FORCE_DISABLE; if (task_spec_ib_disable(task)) return PR_SPEC_PRCTL | PR_SPEC_DISABLE; return PR_SPEC_PRCTL | PR_SPEC_ENABLE; - case SPECTRE_V2_USER_STRICT: - case SPECTRE_V2_USER_STRICT_PREFERRED: + } else if (spectre_v2_user_ibpb == SPECTRE_V2_USER_STRICT || + spectre_v2_user_stibp == SPECTRE_V2_USER_STRICT || + spectre_v2_user_stibp == SPECTRE_V2_USER_STRICT_PREFERRED) return PR_SPEC_DISABLE; - default: + else return PR_SPEC_NOT_AFFECTED; - } } int arch_prctl_spec_ctrl_get(struct task_struct *task, unsigned long which) @@ -1272,6 +2038,8 @@ int arch_prctl_spec_ctrl_get(struct task_struct *task, unsigned long which) return ssb_prctl_get(task); case PR_SPEC_INDIRECT_BRANCH: return ib_prctl_get(task); + case PR_SPEC_L1D_FLUSH: + return l1d_flush_prctl_get(task); default: return -ENODEV; } @@ -1280,7 +2048,7 @@ int arch_prctl_spec_ctrl_get(struct task_struct *task, unsigned long which) void x86_spec_ctrl_setup_ap(void) { if (boot_cpu_has(X86_FEATURE_MSR_SPEC_CTRL)) - wrmsrl(MSR_IA32_SPEC_CTRL, x86_spec_ctrl_base); + write_spec_ctrl_current(x86_spec_ctrl_base, true); if (ssb_mode == SPEC_STORE_BYPASS_DISABLE) x86_amd_ssb_disable(); @@ -1447,7 +2215,12 @@ static ssize_t l1tf_show_state(char *buf) static ssize_t itlb_multihit_show_state(char *buf) { - if (itlb_multihit_kvm_mitigation) + if (!boot_cpu_has(X86_FEATURE_MSR_IA32_FEAT_CTL) || + !boot_cpu_has(X86_FEATURE_VMX)) + return sprintf(buf, "KVM: Mitigation: VMX unsupported\n"); + else if (!(cr4_read_shadow() & X86_CR4_VMXE)) + return sprintf(buf, "KVM: Mitigation: VMX disabled\n"); + else if (itlb_multihit_kvm_mitigation) return sprintf(buf, "KVM: Mitigation: Split huge pages\n"); else return sprintf(buf, "KVM: Vulnerable\n"); @@ -1496,12 +2269,29 @@ static ssize_t tsx_async_abort_show_state(char *buf) sched_smt_active() ? 
"vulnerable" : "disabled"); } +static ssize_t mmio_stale_data_show_state(char *buf) +{ + if (boot_cpu_has_bug(X86_BUG_MMIO_UNKNOWN)) + return sysfs_emit(buf, "Unknown: No mitigations\n"); + + if (mmio_mitigation == MMIO_MITIGATION_OFF) + return sysfs_emit(buf, "%s\n", mmio_strings[mmio_mitigation]); + + if (boot_cpu_has(X86_FEATURE_HYPERVISOR)) { + return sysfs_emit(buf, "%s; SMT Host state unknown\n", + mmio_strings[mmio_mitigation]); + } + + return sysfs_emit(buf, "%s; SMT %s\n", mmio_strings[mmio_mitigation], + sched_smt_active() ? "vulnerable" : "disabled"); +} + static char *stibp_state(void) { - if (spectre_v2_enabled == SPECTRE_V2_IBRS_ENHANCED) + if (spectre_v2_in_ibrs_mode(spectre_v2_enabled)) return ""; - switch (spectre_v2_user) { + switch (spectre_v2_user_stibp) { case SPECTRE_V2_USER_NONE: return ", STIBP: disabled"; case SPECTRE_V2_USER_STRICT: @@ -1528,6 +2318,65 @@ static char *ibpb_state(void) return ""; } +static char *pbrsb_eibrs_state(void) +{ + if (boot_cpu_has_bug(X86_BUG_EIBRS_PBRSB)) { + if (boot_cpu_has(X86_FEATURE_RSB_VMEXIT_LITE) || + boot_cpu_has(X86_FEATURE_RSB_VMEXIT)) + return ", PBRSB-eIBRS: SW sequence"; + else + return ", PBRSB-eIBRS: Vulnerable"; + } else { + return ", PBRSB-eIBRS: Not affected"; + } +} + +static ssize_t spectre_v2_show_state(char *buf) +{ + if (spectre_v2_enabled == SPECTRE_V2_LFENCE) + return sprintf(buf, "Vulnerable: LFENCE\n"); + + if (spectre_v2_enabled == SPECTRE_V2_EIBRS && unprivileged_ebpf_enabled()) + return sprintf(buf, "Vulnerable: eIBRS with unprivileged eBPF\n"); + + if (sched_smt_active() && unprivileged_ebpf_enabled() && + spectre_v2_enabled == SPECTRE_V2_EIBRS_LFENCE) + return sprintf(buf, "Vulnerable: eIBRS+LFENCE with unprivileged eBPF and SMT\n"); + + return sprintf(buf, "%s%s%s%s%s%s%s\n", + spectre_v2_strings[spectre_v2_enabled], + ibpb_state(), + boot_cpu_has(X86_FEATURE_USE_IBRS_FW) ? ", IBRS_FW" : "", + stibp_state(), + boot_cpu_has(X86_FEATURE_RSB_CTXSW) ? 
", RSB filling" : "", + pbrsb_eibrs_state(), + spectre_v2_module_string()); +} + +static ssize_t srbds_show_state(char *buf) +{ + return sprintf(buf, "%s\n", srbds_strings[srbds_mitigation]); +} + +static ssize_t retbleed_show_state(char *buf) +{ + if (retbleed_mitigation == RETBLEED_MITIGATION_UNRET || + retbleed_mitigation == RETBLEED_MITIGATION_IBPB) { + if (boot_cpu_data.x86_vendor != X86_VENDOR_AMD && + boot_cpu_data.x86_vendor != X86_VENDOR_HYGON) + return sprintf(buf, "Vulnerable: untrained return thunk / IBPB on non-AMD based uarch\n"); + + return sprintf(buf, "%s; SMT %s\n", + retbleed_strings[retbleed_mitigation], + !sched_smt_active() ? "disabled" : + spectre_v2_user_stibp == SPECTRE_V2_USER_STRICT || + spectre_v2_user_stibp == SPECTRE_V2_USER_STRICT_PREFERRED ? + "enabled with STIBP protection" : "vulnerable"); + } + + return sprintf(buf, "%s\n", retbleed_strings[retbleed_mitigation]); +} + static ssize_t cpu_show_common(struct device *dev, struct device_attribute *attr, char *buf, unsigned int bug) { @@ -1548,12 +2397,7 @@ static ssize_t cpu_show_common(struct device *dev, struct device_attribute *attr return sprintf(buf, "%s\n", spectre_v1_strings[spectre_v1_mitigation]); case X86_BUG_SPECTRE_V2: - return sprintf(buf, "%s%s%s%s%s%s\n", spectre_v2_strings[spectre_v2_enabled], - ibpb_state(), - boot_cpu_has(X86_FEATURE_USE_IBRS_FW) ? ", IBRS_FW" : "", - stibp_state(), - boot_cpu_has(X86_FEATURE_RSB_CTXSW) ? 
", RSB filling" : "", - spectre_v2_module_string()); + return spectre_v2_show_state(buf); case X86_BUG_SPEC_STORE_BYPASS: return sprintf(buf, "%s\n", ssb_strings[ssb_mode]); @@ -1572,6 +2416,16 @@ static ssize_t cpu_show_common(struct device *dev, struct device_attribute *attr case X86_BUG_ITLB_MULTIHIT: return itlb_multihit_show_state(buf); + case X86_BUG_SRBDS: + return srbds_show_state(buf); + + case X86_BUG_MMIO_STALE_DATA: + case X86_BUG_MMIO_UNKNOWN: + return mmio_stale_data_show_state(buf); + + case X86_BUG_RETBLEED: + return retbleed_show_state(buf); + default: break; } @@ -1618,4 +2472,22 @@ ssize_t cpu_show_itlb_multihit(struct device *dev, struct device_attribute *attr { return cpu_show_common(dev, attr, buf, X86_BUG_ITLB_MULTIHIT); } + +ssize_t cpu_show_srbds(struct device *dev, struct device_attribute *attr, char *buf) +{ + return cpu_show_common(dev, attr, buf, X86_BUG_SRBDS); +} + +ssize_t cpu_show_mmio_stale_data(struct device *dev, struct device_attribute *attr, char *buf) +{ + if (boot_cpu_has_bug(X86_BUG_MMIO_UNKNOWN)) + return cpu_show_common(dev, attr, buf, X86_BUG_MMIO_UNKNOWN); + else + return cpu_show_common(dev, attr, buf, X86_BUG_MMIO_STALE_DATA); +} + +ssize_t cpu_show_retbleed(struct device *dev, struct device_attribute *attr, char *buf) +{ + return cpu_show_common(dev, attr, buf, X86_BUG_RETBLEED); +} #endif diff --git a/arch/x86/kernel/cpu/cacheinfo.c b/arch/x86/kernel/cpu/cacheinfo.c index c7503be92f35..66556833d7af 100644 --- a/arch/x86/kernel/cpu/cacheinfo.c +++ b/arch/x86/kernel/cpu/cacheinfo.c @@ -29,6 +29,12 @@ #define LVL_3 4 #define LVL_TRACE 5 +/* Shared last level cache maps */ +DEFINE_PER_CPU_READ_MOSTLY(cpumask_var_t, cpu_llc_shared_map); + +/* Shared L2 cache maps */ +DEFINE_PER_CPU_READ_MOSTLY(cpumask_var_t, cpu_l2c_shared_map); + struct _cache_table { unsigned char descriptor; char cache_type; @@ -248,7 +254,7 @@ amd_cpuid4(int leaf, union _cpuid4_leaf_eax *eax, switch (leaf) { case 1: l1 = &l1i; - /* fall through */ + 
fallthrough; case 0: if (!l1->val) return; @@ -580,7 +586,7 @@ static void amd_init_l3_cache(struct _cpuid4_info_regs *this_leaf, int index) if (index < 3) return; - node = amd_get_nb_id(smp_processor_id()); + node = topology_die_id(smp_processor_id()); this_leaf->nb = node_to_amd_nb(node); if (this_leaf->nb && !this_leaf->nb->l3_cache.indices) amd_calc_l3_indices(this_leaf->nb); @@ -646,7 +652,7 @@ static int find_num_cache_leaves(struct cpuinfo_x86 *c) return i; } -void cacheinfo_amd_init_llc_id(struct cpuinfo_x86 *c, int cpu, u8 node_id) +void cacheinfo_amd_init_llc_id(struct cpuinfo_x86 *c, int cpu) { /* * We may have multiple LLCs if L3 caches exist, so check if we @@ -657,7 +663,7 @@ void cacheinfo_amd_init_llc_id(struct cpuinfo_x86 *c, int cpu, u8 node_id) if (c->x86 < 0x17) { /* LLC is at the node level. */ - per_cpu(cpu_llc_id, cpu) = node_id; + per_cpu(cpu_llc_id, cpu) = c->cpu_die_id; } else if (c->x86 == 0x17 && c->x86_model <= 0x1F) { /* * LLC is at the core complex level. @@ -684,7 +690,7 @@ void cacheinfo_amd_init_llc_id(struct cpuinfo_x86 *c, int cpu, u8 node_id) } } -void cacheinfo_hygon_init_llc_id(struct cpuinfo_x86 *c, int cpu, u8 node_id) +void cacheinfo_hygon_init_llc_id(struct cpuinfo_x86 *c, int cpu) { /* * We may have multiple LLCs if L3 caches exist, so check if we @@ -846,6 +852,7 @@ void init_intel_cacheinfo(struct cpuinfo_x86 *c) l2 = new_l2; #ifdef CONFIG_SMP per_cpu(cpu_llc_id, cpu) = l2_id; + per_cpu(cpu_l2c_id, cpu) = l2_id; #endif } @@ -877,7 +884,7 @@ void init_intel_cacheinfo(struct cpuinfo_x86 *c) static int __cache_amd_cpumap_setup(unsigned int cpu, int index, struct _cpuid4_info_regs *base) { - struct cpu_cacheinfo *this_cpu_ci = get_cpu_cacheinfo(cpu); + struct cpu_cacheinfo *this_cpu_ci; struct cacheinfo *this_leaf; int i, sibling; @@ -985,7 +992,7 @@ static void ci_leaf_init(struct cacheinfo *this_leaf, this_leaf->priv = base->nb; } -static int __init_cache_level(unsigned int cpu) +int init_cache_level(unsigned int cpu) { 
struct cpu_cacheinfo *this_cpu_ci = get_cpu_cacheinfo(cpu); @@ -1014,7 +1021,7 @@ static void get_cache_id(int cpu, struct _cpuid4_info_regs *id4_regs) id4_regs->id = c->apicid >> index_msb; } -static int __populate_cache_leaves(unsigned int cpu) +int populate_cache_leaves(unsigned int cpu) { unsigned int idx, ret; struct cpu_cacheinfo *this_cpu_ci = get_cpu_cacheinfo(cpu); @@ -1033,6 +1040,3 @@ static int __populate_cache_leaves(unsigned int cpu) return 0; } - -DEFINE_SMP_CALL_CACHE_FUNCTION(init_cache_level) -DEFINE_SMP_CALL_CACHE_FUNCTION(populate_cache_leaves) diff --git a/arch/x86/kernel/cpu/centaur.c b/arch/x86/kernel/cpu/centaur.c index 426792565d86..345f7d905db6 100644 --- a/arch/x86/kernel/cpu/centaur.c +++ b/arch/x86/kernel/cpu/centaur.c @@ -3,6 +3,7 @@ #include <linux/sched.h> #include <linux/sched/clock.h> +#include <asm/cpu.h> #include <asm/cpufeature.h> #include <asm/e820/api.h> #include <asm/mtrr.h> @@ -64,6 +65,9 @@ static void init_c3(struct cpuinfo_x86 *c) c->x86_cache_alignment = c->x86_clflush_size * 2; set_cpu_cap(c, X86_FEATURE_REP_GOOD); } + + if (c->x86 >= 7) + set_cpu_cap(c, X86_FEATURE_REP_GOOD); } enum { @@ -89,18 +93,15 @@ enum { static void early_init_centaur(struct cpuinfo_x86 *c) { - switch (c->x86) { #ifdef CONFIG_X86_32 - case 5: - /* Emulate MTRRs using Centaur's MCR. */ + /* Emulate MTRRs using Centaur's MCR. 
*/ + if (c->x86 == 5) set_cpu_cap(c, X86_FEATURE_CENTAUR_MCR); - break; #endif - case 6: - if (c->x86_model >= 0xf) - set_cpu_cap(c, X86_FEATURE_CONSTANT_TSC); - break; - } + if ((c->x86 == 6 && c->x86_model >= 0xf) || + (c->x86 >= 7)) + set_cpu_cap(c, X86_FEATURE_CONSTANT_TSC); + #ifdef CONFIG_X86_64 set_cpu_cap(c, X86_FEATURE_SYSENTER32); #endif @@ -144,9 +145,8 @@ static void init_centaur(struct cpuinfo_x86 *c) set_cpu_cap(c, X86_FEATURE_ARCH_PERFMON); } - switch (c->x86) { #ifdef CONFIG_X86_32 - case 5: + if (c->x86 == 5) { switch (c->x86_model) { case 4: name = "C6"; @@ -206,12 +206,10 @@ static void init_centaur(struct cpuinfo_x86 *c) c->x86_cache_size = (cc>>24)+(dd>>24); } sprintf(c->x86_model_id, "WinChip %s", name); - break; + } #endif - case 6: + if (c->x86 == 6 || c->x86 >= 7) init_c3(c); - break; - } #ifdef CONFIG_X86_64 set_cpu_cap(c, X86_FEATURE_LFENCE_RDTSC); #endif diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c index 4cdb123ff66a..3e508f239098 100644 --- a/arch/x86/kernel/cpu/common.c +++ b/arch/x86/kernel/cpu/common.c @@ -21,7 +21,9 @@ #include <linux/smp.h> #include <linux/io.h> #include <linux/syscore_ops.h> +#include <linux/pgtable.h> +#include <asm/cmdline.h> #include <asm/stackprotector.h> #include <asm/perf_event.h> #include <asm/mmu_context.h> @@ -35,16 +37,16 @@ #include <asm/vsyscall.h> #include <linux/topology.h> #include <linux/cpumask.h> -#include <asm/pgtable.h> #include <linux/atomic.h> #include <asm/proto.h> #include <asm/setup.h> #include <asm/apic.h> #include <asm/desc.h> -#include <asm/fpu/internal.h> +#include <asm/fpu/api.h> #include <asm/mtrr.h> #include <asm/hwcap2.h> #include <linux/numa.h> +#include <asm/numa.h> #include <asm/asm.h> #include <asm/bugs.h> #include <asm/cpu.h> @@ -56,6 +58,9 @@ #include <asm/intel-family.h> #include <asm/cpu_device_id.h> #include <asm/uv/uv.h> +#include <asm/sigframe.h> +#include <asm/traps.h> +#include <asm/sev.h> #include "cpu.h" @@ -76,6 +81,92 @@ 
EXPORT_SYMBOL(smp_num_siblings); /* Last level cache ID of each logical CPU */ DEFINE_PER_CPU_READ_MOSTLY(u16, cpu_llc_id) = BAD_APICID; +u16 get_llc_id(unsigned int cpu) +{ + return per_cpu(cpu_llc_id, cpu); +} +EXPORT_SYMBOL_GPL(get_llc_id); + +/* L2 cache ID of each logical CPU */ +DEFINE_PER_CPU_READ_MOSTLY(u16, cpu_l2c_id) = BAD_APICID; + +static struct ppin_info { + int feature; + int msr_ppin_ctl; + int msr_ppin; +} ppin_info[] = { + [X86_VENDOR_INTEL] = { + .feature = X86_FEATURE_INTEL_PPIN, + .msr_ppin_ctl = MSR_PPIN_CTL, + .msr_ppin = MSR_PPIN + }, + [X86_VENDOR_AMD] = { + .feature = X86_FEATURE_AMD_PPIN, + .msr_ppin_ctl = MSR_AMD_PPIN_CTL, + .msr_ppin = MSR_AMD_PPIN + }, +}; + +static const struct x86_cpu_id ppin_cpuids[] = { + X86_MATCH_FEATURE(X86_FEATURE_AMD_PPIN, &ppin_info[X86_VENDOR_AMD]), + X86_MATCH_FEATURE(X86_FEATURE_INTEL_PPIN, &ppin_info[X86_VENDOR_INTEL]), + + /* Legacy models without CPUID enumeration */ + X86_MATCH_INTEL_FAM6_MODEL(IVYBRIDGE_X, &ppin_info[X86_VENDOR_INTEL]), + X86_MATCH_INTEL_FAM6_MODEL(HASWELL_X, &ppin_info[X86_VENDOR_INTEL]), + X86_MATCH_INTEL_FAM6_MODEL(BROADWELL_D, &ppin_info[X86_VENDOR_INTEL]), + X86_MATCH_INTEL_FAM6_MODEL(BROADWELL_X, &ppin_info[X86_VENDOR_INTEL]), + X86_MATCH_INTEL_FAM6_MODEL(SKYLAKE_X, &ppin_info[X86_VENDOR_INTEL]), + X86_MATCH_INTEL_FAM6_MODEL(ICELAKE_X, &ppin_info[X86_VENDOR_INTEL]), + X86_MATCH_INTEL_FAM6_MODEL(ICELAKE_D, &ppin_info[X86_VENDOR_INTEL]), + X86_MATCH_INTEL_FAM6_MODEL(SAPPHIRERAPIDS_X, &ppin_info[X86_VENDOR_INTEL]), + X86_MATCH_INTEL_FAM6_MODEL(XEON_PHI_KNL, &ppin_info[X86_VENDOR_INTEL]), + X86_MATCH_INTEL_FAM6_MODEL(XEON_PHI_KNM, &ppin_info[X86_VENDOR_INTEL]), + + {} +}; + +static void ppin_init(struct cpuinfo_x86 *c) +{ + const struct x86_cpu_id *id; + unsigned long long val; + struct ppin_info *info; + + id = x86_match_cpu(ppin_cpuids); + if (!id) + return; + + /* + * Testing the presence of the MSR is not enough. Need to check + * that the PPIN_CTL allows reading of the PPIN. 
+ */ + info = (struct ppin_info *)id->driver_data; + + if (rdmsrl_safe(info->msr_ppin_ctl, &val)) + goto clear_ppin; + + if ((val & 3UL) == 1UL) { + /* PPIN locked in disabled mode */ + goto clear_ppin; + } + + /* If PPIN is disabled, try to enable */ + if (!(val & 2UL)) { + wrmsrl_safe(info->msr_ppin_ctl, val | 2UL); + rdmsrl_safe(info->msr_ppin_ctl, &val); + } + + /* Is the enable bit set? */ + if (val & 2UL) { + c->ppin = __rdmsr(info->msr_ppin); + set_cpu_cap(c, info->feature); + return; + } + +clear_ppin: + clear_cpu_cap(c, info->feature); +} + /* correctly size the local cpu masks */ void __init setup_cpu_local_masks(void) { @@ -159,7 +250,6 @@ DEFINE_PER_CPU_PAGE_ALIGNED(struct gdt_page, gdt_page) = { .gdt = { [GDT_ENTRY_ESPFIX_SS] = GDT_ENTRY_INIT(0xc092, 0, 0xfffff), [GDT_ENTRY_PERCPU] = GDT_ENTRY_INIT(0xc092, 0, 0xfffff), - GDT_STACK_CANARY_INIT #endif } }; EXPORT_PER_CPU_SYMBOL_GPL(gdt_page); @@ -209,13 +299,6 @@ static int __init cachesize_setup(char *str) } __setup("cachesize=", cachesize_setup); -static int __init x86_sep_setup(char *s) -{ - setup_clear_cpu_cap(X86_FEATURE_SEP); - return 1; -} -__setup("nosep", x86_sep_setup); - /* Standard macro to see if a specific flag is changeable */ static inline int flag_is_changeable_p(u32 flag) { @@ -287,26 +370,12 @@ static inline void squash_the_stupid_serial_number(struct cpuinfo_x86 *c) } #endif -static __init int setup_disable_smep(char *arg) -{ - setup_clear_cpu_cap(X86_FEATURE_SMEP); - return 1; -} -__setup("nosmep", setup_disable_smep); - static __always_inline void setup_smep(struct cpuinfo_x86 *c) { if (cpu_has(c, X86_FEATURE_SMEP)) cr4_set_bits(X86_CR4_SMEP); } -static __init int setup_disable_smap(char *arg) -{ - setup_clear_cpu_cap(X86_FEATURE_SMAP); - return 1; -} -__setup("nosmap", setup_disable_smap); - static __always_inline void setup_smap(struct cpuinfo_x86 *c) { unsigned long eflags = native_save_fl(); @@ -314,13 +383,8 @@ static __always_inline void setup_smap(struct cpuinfo_x86 *c) /* 
This should have been cleared long ago */ BUG_ON(eflags & X86_EFLAGS_AC); - if (cpu_has(c, X86_FEATURE_SMAP)) { -#ifdef CONFIG_X86_SMAP + if (cpu_has(c, X86_FEATURE_SMAP)) cr4_set_bits(X86_CR4_SMAP); -#else - cr4_clear_bits(X86_CR4_SMAP); -#endif - } } static __always_inline void setup_umip(struct cpuinfo_x86 *c) @@ -347,6 +411,10 @@ out: cr4_clear_bits(X86_CR4_UMIP); } +/* These bits should not change their value after CPU init is finished. */ +static const unsigned long cr4_pinned_mask = + X86_CR4_SMEP | X86_CR4_SMAP | X86_CR4_UMIP | + X86_CR4_FSGSBASE | X86_CR4_CET; static DEFINE_STATIC_KEY_FALSE_RO(cr_pinning); static unsigned long cr4_pinned_bits __ro_after_init; @@ -355,7 +423,7 @@ void native_write_cr0(unsigned long val) unsigned long bits_missing = 0; set_register: - asm volatile("mov %0,%%cr0": "+r" (val), "+m" (__force_order)); + asm volatile("mov %0,%%cr0": "+r" (val) : : "memory"); if (static_branch_likely(&cr_pinning)) { if (unlikely((val & X86_CR0_WP) != X86_CR0_WP)) { @@ -369,25 +437,48 @@ set_register: } EXPORT_SYMBOL(native_write_cr0); -void native_write_cr4(unsigned long val) +void __no_profile native_write_cr4(unsigned long val) { - unsigned long bits_missing = 0; + unsigned long bits_changed = 0; set_register: - asm volatile("mov %0,%%cr4": "+r" (val), "+m" (cr4_pinned_bits)); + asm volatile("mov %0,%%cr4": "+r" (val) : : "memory"); if (static_branch_likely(&cr_pinning)) { - if (unlikely((val & cr4_pinned_bits) != cr4_pinned_bits)) { - bits_missing = ~val & cr4_pinned_bits; - val |= bits_missing; + if (unlikely((val & cr4_pinned_mask) != cr4_pinned_bits)) { + bits_changed = (val & cr4_pinned_mask) ^ cr4_pinned_bits; + val = (val & ~cr4_pinned_mask) | cr4_pinned_bits; goto set_register; } - /* Warn after we've set the missing bits. */ - WARN_ONCE(bits_missing, "CR4 bits went missing: %lx!?\n", - bits_missing); + /* Warn after we've corrected the changed bits. 
*/ + WARN_ONCE(bits_changed, "pinned CR4 bits changed: 0x%lx!?\n", + bits_changed); + } +} +#if IS_MODULE(CONFIG_LKDTM) +EXPORT_SYMBOL_GPL(native_write_cr4); +#endif + +void cr4_update_irqsoff(unsigned long set, unsigned long clear) +{ + unsigned long newval, cr4 = this_cpu_read(cpu_tlbstate.cr4); + + lockdep_assert_irqs_disabled(); + + newval = (cr4 & ~clear) | set; + if (newval != cr4) { + this_cpu_write(cpu_tlbstate.cr4, newval); + __write_cr4(newval); } } -EXPORT_SYMBOL(native_write_cr4); +EXPORT_SYMBOL(cr4_update_irqsoff); + +/* Read the CR4 shadow. */ +unsigned long cr4_read_shadow(void) +{ + return this_cpu_read(cpu_tlbstate.cr4); +} +EXPORT_SYMBOL_GPL(cr4_read_shadow); void cr4_init(void) { @@ -396,7 +487,7 @@ void cr4_init(void) if (boot_cpu_has(X86_FEATURE_PCID)) cr4 |= X86_CR4_PCIDE; if (static_branch_likely(&cr_pinning)) - cr4 |= cr4_pinned_bits; + cr4 = (cr4 & ~cr4_pinned_mask) | cr4_pinned_bits; __write_cr4(cr4); @@ -411,13 +502,26 @@ void cr4_init(void) */ static void __init setup_cr_pinning(void) { - unsigned long mask; - - mask = (X86_CR4_SMEP | X86_CR4_SMAP | X86_CR4_UMIP); - cr4_pinned_bits = this_cpu_read(cpu_tlbstate.cr4) & mask; + cr4_pinned_bits = this_cpu_read(cpu_tlbstate.cr4) & cr4_pinned_mask; static_key_enable(&cr_pinning.key); } +static __init int x86_nofsgsbase_setup(char *arg) +{ + /* Require an exact match without trailing characters. */ + if (strlen(arg)) + return 0; + + /* Do not emit a message if the feature is not present. */ + if (!boot_cpu_has(X86_FEATURE_FSGSBASE)) + return 1; + + setup_clear_cpu_cap(X86_FEATURE_FSGSBASE); + pr_info("FSGSBASE disabled via kernel command line\n"); + return 1; +} +__setup("nofsgsbase", x86_nofsgsbase_setup); + /* * Protection Keys are not available in 32-bit mode. 
*/ @@ -425,27 +529,22 @@ static bool pku_disabled; static __always_inline void setup_pku(struct cpuinfo_x86 *c) { - struct pkru_state *pk; + if (c == &boot_cpu_data) { + if (pku_disabled || !cpu_feature_enabled(X86_FEATURE_PKU)) + return; + /* + * Setting CR4.PKE will cause the X86_FEATURE_OSPKE cpuid + * bit to be set. Enforce it. + */ + setup_force_cpu_cap(X86_FEATURE_OSPKE); - /* check the boot processor, plus compile options for PKU: */ - if (!cpu_feature_enabled(X86_FEATURE_PKU)) - return; - /* checks the actual processor's cpuid bits: */ - if (!cpu_has(c, X86_FEATURE_PKU)) - return; - if (pku_disabled) + } else if (!cpu_feature_enabled(X86_FEATURE_OSPKE)) { return; + } cr4_set_bits(X86_CR4_PKE); - pk = get_xsave_addr(&init_fpstate.xsave, XFEATURE_PKRU); - if (pk) - pk->pkru = init_pkru_value; - /* - * Seting X86_CR4_PKE will cause the X86_FEATURE_OSPKE - * cpuid bit to be set. We need to ensure that we - * update that bit in this CPU's "cpu_info". - */ - set_cpu_cap(c, X86_FEATURE_OSPKE); + /* Load the default PKRU value */ + pkru_write_default(); } #ifdef CONFIG_X86_INTEL_MEMORY_PROTECTION_KEYS @@ -469,6 +568,58 @@ static __init int setup_disable_pku(char *arg) __setup("nopku", setup_disable_pku); #endif /* CONFIG_X86_64 */ +#ifdef CONFIG_X86_KERNEL_IBT + +__noendbr u64 ibt_save(void) +{ + u64 msr = 0; + + if (cpu_feature_enabled(X86_FEATURE_IBT)) { + rdmsrl(MSR_IA32_S_CET, msr); + wrmsrl(MSR_IA32_S_CET, msr & ~CET_ENDBR_EN); + } + + return msr; +} + +__noendbr void ibt_restore(u64 save) +{ + u64 msr; + + if (cpu_feature_enabled(X86_FEATURE_IBT)) { + rdmsrl(MSR_IA32_S_CET, msr); + msr &= ~CET_ENDBR_EN; + msr |= (save & CET_ENDBR_EN); + wrmsrl(MSR_IA32_S_CET, msr); + } +} + +#endif + +static __always_inline void setup_cet(struct cpuinfo_x86 *c) +{ + u64 msr = CET_ENDBR_EN; + + if (!HAS_KERNEL_IBT || + !cpu_feature_enabled(X86_FEATURE_IBT)) + return; + + wrmsrl(MSR_IA32_S_CET, msr); + cr4_set_bits(X86_CR4_CET); + + if (!ibt_selftest()) { + pr_err("IBT 
selftest: Failed!\n"); + setup_clear_cpu_cap(X86_FEATURE_IBT); + return; + } +} + +__noendbr void cet_disable(void) +{ + if (cpu_feature_enabled(X86_FEATURE_IBT)) + wrmsrl(MSR_IA32_S_CET, 0); +} + /* * Some CPU features depend on higher CPUID levels, which may not always * be available due to CPUID level capping or broken virtualization @@ -558,7 +709,6 @@ void load_percpu_segment(int cpu) __loadsegment_simple(gs, 0); wrmsrl(MSR_GS_BASE, cpu_kernelmode_gs_base(cpu)); #endif - load_stack_canary_segment(); } #ifdef CONFIG_X86_32 @@ -854,30 +1004,6 @@ static void init_speculation_control(struct cpuinfo_x86 *c) } } -static void init_cqm(struct cpuinfo_x86 *c) -{ - if (!cpu_has(c, X86_FEATURE_CQM_LLC)) { - c->x86_cache_max_rmid = -1; - c->x86_cache_occ_scale = -1; - return; - } - - /* will be overridden if occupancy monitoring exists */ - c->x86_cache_max_rmid = cpuid_ebx(0xf); - - if (cpu_has(c, X86_FEATURE_CQM_OCCUP_LLC) || - cpu_has(c, X86_FEATURE_CQM_MBM_TOTAL) || - cpu_has(c, X86_FEATURE_CQM_MBM_LOCAL)) { - u32 eax, ebx, ecx, edx; - - /* QoS sub-leaf, EAX=0Fh, ECX=1 */ - cpuid_count(0xf, 1, &eax, &ebx, &ecx, &edx); - - c->x86_cache_max_rmid = ecx; - c->x86_cache_occ_scale = ebx; - } -} - void get_cpu_cap(struct cpuinfo_x86 *c) { u32 eax, ebx, ecx, edx; @@ -943,9 +1069,11 @@ void get_cpu_cap(struct cpuinfo_x86 *c) if (c->extended_cpuid_level >= 0x8000000a) c->x86_capability[CPUID_8000_000A_EDX] = cpuid_edx(0x8000000a); + if (c->extended_cpuid_level >= 0x8000001f) + c->x86_capability[CPUID_8000_001F_EAX] = cpuid_eax(0x8000001f); + init_scattered_cpuid_features(c); init_speculation_control(c); - init_cqm(c); /* * Clear/Set all flags overridden by options, after probe. 
@@ -1007,9 +1135,11 @@ static void identify_cpu_without_cpuid(struct cpuinfo_x86 *c) #define NO_SWAPGS BIT(6) #define NO_ITLB_MULTIHIT BIT(7) #define NO_SPECTRE_V2 BIT(8) +#define NO_MMIO BIT(9) +#define NO_EIBRS_PBRSB BIT(10) -#define VULNWL(_vendor, _family, _model, _whitelist) \ - { X86_VENDOR_##_vendor, _family, _model, X86_FEATURE_ANY, _whitelist } +#define VULNWL(vendor, family, model, whitelist) \ + X86_MATCH_VENDOR_FAM_MODEL(vendor, family, model, whitelist) #define VULNWL_INTEL(model, whitelist) \ VULNWL(INTEL, 6, INTEL_FAM6_##model, whitelist) @@ -1025,8 +1155,15 @@ static const __initconst struct x86_cpu_id cpu_vuln_whitelist[] = { VULNWL(CENTAUR, 5, X86_MODEL_ANY, NO_SPECULATION), VULNWL(INTEL, 5, X86_MODEL_ANY, NO_SPECULATION), VULNWL(NSC, 5, X86_MODEL_ANY, NO_SPECULATION), + VULNWL(VORTEX, 5, X86_MODEL_ANY, NO_SPECULATION), + VULNWL(VORTEX, 6, X86_MODEL_ANY, NO_SPECULATION), /* Intel Family 6 */ + VULNWL_INTEL(TIGERLAKE, NO_MMIO), + VULNWL_INTEL(TIGERLAKE_L, NO_MMIO), + VULNWL_INTEL(ALDERLAKE, NO_MMIO), + VULNWL_INTEL(ALDERLAKE_L, NO_MMIO), + VULNWL_INTEL(ATOM_SALTWELL, NO_SPECULATION | NO_ITLB_MULTIHIT), VULNWL_INTEL(ATOM_SALTWELL_TABLET, NO_SPECULATION | NO_ITLB_MULTIHIT), VULNWL_INTEL(ATOM_SALTWELL_MID, NO_SPECULATION | NO_ITLB_MULTIHIT), @@ -1045,9 +1182,9 @@ static const __initconst struct x86_cpu_id cpu_vuln_whitelist[] = { VULNWL_INTEL(ATOM_AIRMONT_MID, NO_L1TF | MSBDS_ONLY | NO_SWAPGS | NO_ITLB_MULTIHIT), VULNWL_INTEL(ATOM_AIRMONT_NP, NO_L1TF | NO_SWAPGS | NO_ITLB_MULTIHIT), - VULNWL_INTEL(ATOM_GOLDMONT, NO_MDS | NO_L1TF | NO_SWAPGS | NO_ITLB_MULTIHIT), - VULNWL_INTEL(ATOM_GOLDMONT_D, NO_MDS | NO_L1TF | NO_SWAPGS | NO_ITLB_MULTIHIT), - VULNWL_INTEL(ATOM_GOLDMONT_PLUS, NO_MDS | NO_L1TF | NO_SWAPGS | NO_ITLB_MULTIHIT), + VULNWL_INTEL(ATOM_GOLDMONT, NO_MDS | NO_L1TF | NO_SWAPGS | NO_ITLB_MULTIHIT | NO_MMIO), + VULNWL_INTEL(ATOM_GOLDMONT_D, NO_MDS | NO_L1TF | NO_SWAPGS | NO_ITLB_MULTIHIT | NO_MMIO), + VULNWL_INTEL(ATOM_GOLDMONT_PLUS, NO_MDS | 
NO_L1TF | NO_SWAPGS | NO_ITLB_MULTIHIT | NO_MMIO | NO_EIBRS_PBRSB), /* * Technically, swapgs isn't serializing on AMD (despite it previously @@ -1057,27 +1194,86 @@ static const __initconst struct x86_cpu_id cpu_vuln_whitelist[] = { * good enough for our purposes. */ - VULNWL_INTEL(ATOM_TREMONT_D, NO_ITLB_MULTIHIT), + VULNWL_INTEL(ATOM_TREMONT, NO_EIBRS_PBRSB), + VULNWL_INTEL(ATOM_TREMONT_L, NO_EIBRS_PBRSB), + VULNWL_INTEL(ATOM_TREMONT_D, NO_ITLB_MULTIHIT | NO_EIBRS_PBRSB), /* AMD Family 0xf - 0x12 */ - VULNWL_AMD(0x0f, NO_MELTDOWN | NO_SSB | NO_L1TF | NO_MDS | NO_SWAPGS | NO_ITLB_MULTIHIT), - VULNWL_AMD(0x10, NO_MELTDOWN | NO_SSB | NO_L1TF | NO_MDS | NO_SWAPGS | NO_ITLB_MULTIHIT), - VULNWL_AMD(0x11, NO_MELTDOWN | NO_SSB | NO_L1TF | NO_MDS | NO_SWAPGS | NO_ITLB_MULTIHIT), - VULNWL_AMD(0x12, NO_MELTDOWN | NO_SSB | NO_L1TF | NO_MDS | NO_SWAPGS | NO_ITLB_MULTIHIT), + VULNWL_AMD(0x0f, NO_MELTDOWN | NO_SSB | NO_L1TF | NO_MDS | NO_SWAPGS | NO_ITLB_MULTIHIT | NO_MMIO), + VULNWL_AMD(0x10, NO_MELTDOWN | NO_SSB | NO_L1TF | NO_MDS | NO_SWAPGS | NO_ITLB_MULTIHIT | NO_MMIO), + VULNWL_AMD(0x11, NO_MELTDOWN | NO_SSB | NO_L1TF | NO_MDS | NO_SWAPGS | NO_ITLB_MULTIHIT | NO_MMIO), + VULNWL_AMD(0x12, NO_MELTDOWN | NO_SSB | NO_L1TF | NO_MDS | NO_SWAPGS | NO_ITLB_MULTIHIT | NO_MMIO), /* FAMILY_ANY must be last, otherwise 0x0f - 0x12 matches won't work */ - VULNWL_AMD(X86_FAMILY_ANY, NO_MELTDOWN | NO_L1TF | NO_MDS | NO_SWAPGS | NO_ITLB_MULTIHIT), - VULNWL_HYGON(X86_FAMILY_ANY, NO_MELTDOWN | NO_L1TF | NO_MDS | NO_SWAPGS | NO_ITLB_MULTIHIT), + VULNWL_AMD(X86_FAMILY_ANY, NO_MELTDOWN | NO_L1TF | NO_MDS | NO_SWAPGS | NO_ITLB_MULTIHIT | NO_MMIO), + VULNWL_HYGON(X86_FAMILY_ANY, NO_MELTDOWN | NO_L1TF | NO_MDS | NO_SWAPGS | NO_ITLB_MULTIHIT | NO_MMIO), /* Zhaoxin Family 7 */ - VULNWL(CENTAUR, 7, X86_MODEL_ANY, NO_SPECTRE_V2 | NO_SWAPGS), - VULNWL(ZHAOXIN, 7, X86_MODEL_ANY, NO_SPECTRE_V2 | NO_SWAPGS), + VULNWL(CENTAUR, 7, X86_MODEL_ANY, NO_SPECTRE_V2 | NO_SWAPGS | NO_MMIO), + VULNWL(ZHAOXIN, 7, 
X86_MODEL_ANY, NO_SPECTRE_V2 | NO_SWAPGS | NO_MMIO), + {} +}; + +#define VULNBL(vendor, family, model, blacklist) \ + X86_MATCH_VENDOR_FAM_MODEL(vendor, family, model, blacklist) + +#define VULNBL_INTEL_STEPPINGS(model, steppings, issues) \ + X86_MATCH_VENDOR_FAM_MODEL_STEPPINGS_FEATURE(INTEL, 6, \ + INTEL_FAM6_##model, steppings, \ + X86_FEATURE_ANY, issues) + +#define VULNBL_AMD(family, blacklist) \ + VULNBL(AMD, family, X86_MODEL_ANY, blacklist) + +#define VULNBL_HYGON(family, blacklist) \ + VULNBL(HYGON, family, X86_MODEL_ANY, blacklist) + +#define SRBDS BIT(0) +/* CPU is affected by X86_BUG_MMIO_STALE_DATA */ +#define MMIO BIT(1) +/* CPU is affected by Shared Buffers Data Sampling (SBDS), a variant of X86_BUG_MMIO_STALE_DATA */ +#define MMIO_SBDS BIT(2) +/* CPU is affected by RETbleed, speculating where you would not expect it */ +#define RETBLEED BIT(3) + +static const struct x86_cpu_id cpu_vuln_blacklist[] __initconst = { + VULNBL_INTEL_STEPPINGS(IVYBRIDGE, X86_STEPPING_ANY, SRBDS), + VULNBL_INTEL_STEPPINGS(HASWELL, X86_STEPPING_ANY, SRBDS), + VULNBL_INTEL_STEPPINGS(HASWELL_L, X86_STEPPING_ANY, SRBDS), + VULNBL_INTEL_STEPPINGS(HASWELL_G, X86_STEPPING_ANY, SRBDS), + VULNBL_INTEL_STEPPINGS(HASWELL_X, X86_STEPPING_ANY, MMIO), + VULNBL_INTEL_STEPPINGS(BROADWELL_D, X86_STEPPING_ANY, MMIO), + VULNBL_INTEL_STEPPINGS(BROADWELL_G, X86_STEPPING_ANY, SRBDS), + VULNBL_INTEL_STEPPINGS(BROADWELL_X, X86_STEPPING_ANY, MMIO), + VULNBL_INTEL_STEPPINGS(BROADWELL, X86_STEPPING_ANY, SRBDS), + VULNBL_INTEL_STEPPINGS(SKYLAKE_L, X86_STEPPING_ANY, SRBDS | MMIO | RETBLEED), + VULNBL_INTEL_STEPPINGS(SKYLAKE_X, X86_STEPPING_ANY, MMIO | RETBLEED), + VULNBL_INTEL_STEPPINGS(SKYLAKE, X86_STEPPING_ANY, SRBDS | MMIO | RETBLEED), + VULNBL_INTEL_STEPPINGS(KABYLAKE_L, X86_STEPPING_ANY, SRBDS | MMIO | RETBLEED), + VULNBL_INTEL_STEPPINGS(KABYLAKE, X86_STEPPING_ANY, SRBDS | MMIO | RETBLEED), + VULNBL_INTEL_STEPPINGS(CANNONLAKE_L, X86_STEPPING_ANY, RETBLEED), + VULNBL_INTEL_STEPPINGS(ICELAKE_L, 
X86_STEPPING_ANY, MMIO | MMIO_SBDS | RETBLEED), + VULNBL_INTEL_STEPPINGS(ICELAKE_D, X86_STEPPING_ANY, MMIO), + VULNBL_INTEL_STEPPINGS(ICELAKE_X, X86_STEPPING_ANY, MMIO), + VULNBL_INTEL_STEPPINGS(COMETLAKE, X86_STEPPING_ANY, MMIO | MMIO_SBDS | RETBLEED), + VULNBL_INTEL_STEPPINGS(COMETLAKE_L, X86_STEPPINGS(0x0, 0x0), MMIO | RETBLEED), + VULNBL_INTEL_STEPPINGS(COMETLAKE_L, X86_STEPPING_ANY, MMIO | MMIO_SBDS | RETBLEED), + VULNBL_INTEL_STEPPINGS(LAKEFIELD, X86_STEPPING_ANY, MMIO | MMIO_SBDS | RETBLEED), + VULNBL_INTEL_STEPPINGS(ROCKETLAKE, X86_STEPPING_ANY, MMIO | RETBLEED), + VULNBL_INTEL_STEPPINGS(ATOM_TREMONT, X86_STEPPING_ANY, MMIO | MMIO_SBDS), + VULNBL_INTEL_STEPPINGS(ATOM_TREMONT_D, X86_STEPPING_ANY, MMIO), + VULNBL_INTEL_STEPPINGS(ATOM_TREMONT_L, X86_STEPPING_ANY, MMIO | MMIO_SBDS), + + VULNBL_AMD(0x15, RETBLEED), + VULNBL_AMD(0x16, RETBLEED), + VULNBL_AMD(0x17, RETBLEED), + VULNBL_HYGON(0x18, RETBLEED), {} }; -static bool __init cpu_matches(unsigned long which) +static bool __init cpu_matches(const struct x86_cpu_id *table, unsigned long which) { - const struct x86_cpu_id *m = x86_match_cpu(cpu_vuln_whitelist); + const struct x86_cpu_id *m = x86_match_cpu(table); return m && !!(m->driver_data & which); } @@ -1092,36 +1288,46 @@ u64 x86_read_arch_cap_msr(void) return ia32_cap; } +static bool arch_cap_mmio_immune(u64 ia32_cap) +{ + return (ia32_cap & ARCH_CAP_FBSDP_NO && + ia32_cap & ARCH_CAP_PSDP_NO && + ia32_cap & ARCH_CAP_SBDR_SSDP_NO); +} + static void __init cpu_set_bug_bits(struct cpuinfo_x86 *c) { u64 ia32_cap = x86_read_arch_cap_msr(); /* Set ITLB_MULTIHIT bug if cpu is not in the whitelist and not mitigated */ - if (!cpu_matches(NO_ITLB_MULTIHIT) && !(ia32_cap & ARCH_CAP_PSCHANGE_MC_NO)) + if (!cpu_matches(cpu_vuln_whitelist, NO_ITLB_MULTIHIT) && + !(ia32_cap & ARCH_CAP_PSCHANGE_MC_NO)) setup_force_cpu_bug(X86_BUG_ITLB_MULTIHIT); - if (cpu_matches(NO_SPECULATION)) + if (cpu_matches(cpu_vuln_whitelist, NO_SPECULATION)) return; 
setup_force_cpu_bug(X86_BUG_SPECTRE_V1); - if (!cpu_matches(NO_SPECTRE_V2)) + if (!cpu_matches(cpu_vuln_whitelist, NO_SPECTRE_V2)) setup_force_cpu_bug(X86_BUG_SPECTRE_V2); - if (!cpu_matches(NO_SSB) && !(ia32_cap & ARCH_CAP_SSB_NO) && + if (!cpu_matches(cpu_vuln_whitelist, NO_SSB) && + !(ia32_cap & ARCH_CAP_SSB_NO) && !cpu_has(c, X86_FEATURE_AMD_SSB_NO)) setup_force_cpu_bug(X86_BUG_SPEC_STORE_BYPASS); if (ia32_cap & ARCH_CAP_IBRS_ALL) setup_force_cpu_cap(X86_FEATURE_IBRS_ENHANCED); - if (!cpu_matches(NO_MDS) && !(ia32_cap & ARCH_CAP_MDS_NO)) { + if (!cpu_matches(cpu_vuln_whitelist, NO_MDS) && + !(ia32_cap & ARCH_CAP_MDS_NO)) { setup_force_cpu_bug(X86_BUG_MDS); - if (cpu_matches(MSBDS_ONLY)) + if (cpu_matches(cpu_vuln_whitelist, MSBDS_ONLY)) setup_force_cpu_bug(X86_BUG_MSBDS_ONLY); } - if (!cpu_matches(NO_SWAPGS)) + if (!cpu_matches(cpu_vuln_whitelist, NO_SWAPGS)) setup_force_cpu_bug(X86_BUG_SWAPGS); /* @@ -1139,7 +1345,47 @@ static void __init cpu_set_bug_bits(struct cpuinfo_x86 *c) (ia32_cap & ARCH_CAP_TSX_CTRL_MSR))) setup_force_cpu_bug(X86_BUG_TAA); - if (cpu_matches(NO_MELTDOWN)) + /* + * SRBDS affects CPUs which support RDRAND or RDSEED and are listed + * in the vulnerability blacklist. + * + * Some of the implications and mitigation of Shared Buffers Data + * Sampling (SBDS) are similar to SRBDS. Give SBDS same treatment as + * SRBDS. + */ + if ((cpu_has(c, X86_FEATURE_RDRAND) || + cpu_has(c, X86_FEATURE_RDSEED)) && + cpu_matches(cpu_vuln_blacklist, SRBDS | MMIO_SBDS)) + setup_force_cpu_bug(X86_BUG_SRBDS); + + /* + * Processor MMIO Stale Data bug enumeration + * + * Affected CPU list is generally enough to enumerate the vulnerability, + * but for virtualization case check for ARCH_CAP MSR bits also, VMM may + * not want the guest to enumerate the bug. + * + * Set X86_BUG_MMIO_UNKNOWN for CPUs that are neither in the blacklist, + * nor in the whitelist and also don't enumerate MSR ARCH_CAP MMIO bits. 
+ */ + if (!arch_cap_mmio_immune(ia32_cap)) { + if (cpu_matches(cpu_vuln_blacklist, MMIO)) + setup_force_cpu_bug(X86_BUG_MMIO_STALE_DATA); + else if (!cpu_matches(cpu_vuln_whitelist, NO_MMIO)) + setup_force_cpu_bug(X86_BUG_MMIO_UNKNOWN); + } + + if (!cpu_has(c, X86_FEATURE_BTC_NO)) { + if (cpu_matches(cpu_vuln_blacklist, RETBLEED) || (ia32_cap & ARCH_CAP_RSBA)) + setup_force_cpu_bug(X86_BUG_RETBLEED); + } + + if (cpu_has(c, X86_FEATURE_IBRS_ENHANCED) && + !cpu_matches(cpu_vuln_whitelist, NO_EIBRS_PBRSB) && + !(ia32_cap & ARCH_CAP_PBRSB_NO)) + setup_force_cpu_bug(X86_BUG_EIBRS_PBRSB); + + if (cpu_matches(cpu_vuln_whitelist, NO_MELTDOWN)) return; /* Rogue Data Cache Load? No! */ @@ -1148,7 +1394,7 @@ static void __init cpu_set_bug_bits(struct cpuinfo_x86 *c) setup_force_cpu_bug(X86_BUG_CPU_MELTDOWN); - if (cpu_matches(NO_L1TF)) + if (cpu_matches(cpu_vuln_whitelist, NO_L1TF)) return; setup_force_cpu_bug(X86_BUG_L1TF); @@ -1173,6 +1419,99 @@ static void detect_nopl(void) } /* + * We parse cpu parameters early because fpu__init_system() is executed + * before parse_early_param(). 
+ */ +static void __init cpu_parse_early_param(void) +{ + char arg[128]; + char *argptr = arg, *opt; + int arglen, taint = 0; + +#ifdef CONFIG_X86_32 + if (cmdline_find_option_bool(boot_command_line, "no387")) +#ifdef CONFIG_MATH_EMULATION + setup_clear_cpu_cap(X86_FEATURE_FPU); +#else + pr_err("Option 'no387' required CONFIG_MATH_EMULATION enabled.\n"); +#endif + + if (cmdline_find_option_bool(boot_command_line, "nofxsr")) + setup_clear_cpu_cap(X86_FEATURE_FXSR); +#endif + + if (cmdline_find_option_bool(boot_command_line, "noxsave")) + setup_clear_cpu_cap(X86_FEATURE_XSAVE); + + if (cmdline_find_option_bool(boot_command_line, "noxsaveopt")) + setup_clear_cpu_cap(X86_FEATURE_XSAVEOPT); + + if (cmdline_find_option_bool(boot_command_line, "noxsaves")) + setup_clear_cpu_cap(X86_FEATURE_XSAVES); + + arglen = cmdline_find_option(boot_command_line, "clearcpuid", arg, sizeof(arg)); + if (arglen <= 0) + return; + + pr_info("Clearing CPUID bits:"); + + while (argptr) { + bool found __maybe_unused = false; + unsigned int bit; + + opt = strsep(&argptr, ","); + + /* + * Handle naked numbers first for feature flags which don't + * have names. + */ + if (!kstrtouint(opt, 10, &bit)) { + if (bit < NCAPINTS * 32) { + +#ifdef CONFIG_X86_FEATURE_NAMES + /* empty-string, i.e., ""-defined feature flags */ + if (!x86_cap_flags[bit]) + pr_cont(" " X86_CAP_FMT_NUM, x86_cap_flag_num(bit)); + else +#endif + pr_cont(" " X86_CAP_FMT, x86_cap_flag(bit)); + + setup_clear_cpu_cap(bit); + taint++; + } + /* + * The assumption is that there are no feature names with only + * numbers in the name thus go to the next argument. 
+ */ + continue; + } + +#ifdef CONFIG_X86_FEATURE_NAMES + for (bit = 0; bit < 32 * NCAPINTS; bit++) { + if (!x86_cap_flag(bit)) + continue; + + if (strcmp(x86_cap_flag(bit), opt)) + continue; + + pr_cont(" %s", opt); + setup_clear_cpu_cap(bit); + taint++; + found = true; + break; + } + + if (!found) + pr_cont(" (unknown: %s)", opt); +#endif + } + pr_cont("\n"); + + if (taint) + add_taint(TAINT_CPU_OUT_OF_SPEC, LOCKDEP_STILL_OK); +} + +/* * Do minimum CPU detection early. * Fields really needed: vendor, cpuid_level, family, model, mask, * cache alignment. @@ -1207,6 +1546,7 @@ static void __init early_identify_cpu(struct cpuinfo_x86 *c) get_cpu_cap(c); get_cpu_address_sizes(c); setup_force_cpu_cap(X86_FEATURE_CPUID); + cpu_parse_early_param(); if (this_cpu->c_early_init) this_cpu->c_early_init(c); @@ -1224,8 +1564,12 @@ static void __init early_identify_cpu(struct cpuinfo_x86 *c) cpu_set_bug_bits(c); + sld_setup(c); + fpu__init_system(c); + init_sigframe_size(); + #ifdef CONFIG_X86_32 /* * Regardless of whether PCID is enumerated, the SDM says @@ -1285,9 +1629,8 @@ void __init early_cpu_init(void) early_identify_cpu(&boot_cpu_data); } -static void detect_null_seg_behavior(struct cpuinfo_x86 *c) +static bool detect_null_seg_behavior(void) { -#ifdef CONFIG_X86_64 /* * Empirically, writing zero to a segment selector on AMD does * not clear the base, whereas writing zero to a segment @@ -1296,7 +1639,7 @@ static void detect_null_seg_behavior(struct cpuinfo_x86 *c) * where GS is unused by the prev and next threads. * * Since neither vendor documents this anywhere that I can see, - * detect it directly instead of hardcoding the choice by + * detect it directly instead of hard-coding the choice by * vendor. 
* * I've designated AMD's behavior as the "bug" because it's @@ -1308,10 +1651,43 @@ static void detect_null_seg_behavior(struct cpuinfo_x86 *c) wrmsrl(MSR_FS_BASE, 1); loadsegment(fs, 0); rdmsrl(MSR_FS_BASE, tmp); - if (tmp != 0) - set_cpu_bug(c, X86_BUG_NULL_SEG); wrmsrl(MSR_FS_BASE, old_base); -#endif + return tmp == 0; +} + +void check_null_seg_clears_base(struct cpuinfo_x86 *c) +{ + /* BUG_NULL_SEG is only relevant with 64bit userspace */ + if (!IS_ENABLED(CONFIG_X86_64)) + return; + + /* Zen3 CPUs advertise Null Selector Clears Base in CPUID. */ + if (c->extended_cpuid_level >= 0x80000021 && + cpuid_eax(0x80000021) & BIT(6)) + return; + + /* + * CPUID bit above wasn't set. If this kernel is still running + * as a HV guest, then the HV has decided not to advertize + * that CPUID bit for whatever reason. For example, one + * member of the migration pool might be vulnerable. Which + * means, the bug is present: set the BUG flag and return. + */ + if (cpu_has(c, X86_FEATURE_HYPERVISOR)) { + set_cpu_bug(c, X86_BUG_NULL_SEG); + return; + } + + /* + * Zen2 CPUs also have this behaviour, but no CPUID bit. + * 0x18 is the respective family for Hygon. + */ + if ((c->x86 == 0x17 || c->x86 == 0x18) && + detect_null_seg_behavior()) + return; + + /* All the remaining ones are affected */ + set_cpu_bug(c, X86_BUG_NULL_SEG); } static void generic_identify(struct cpuinfo_x86 *c) @@ -1347,8 +1723,6 @@ static void generic_identify(struct cpuinfo_x86 *c) get_model_name(c); /* Default name */ - detect_null_seg_behavior(c); - /* * ESPFIX is a strange bug. All real CPUs have it. Paravirt * systems that run Linux at CPL > 0 may or may not have the @@ -1363,32 +1737,10 @@ static void generic_identify(struct cpuinfo_x86 *c) * ESPFIX issue, we can change this. 
*/ #ifdef CONFIG_X86_32 -# ifdef CONFIG_PARAVIRT_XXL - do { - extern void native_iret(void); - if (pv_ops.cpu.iret == native_iret) - set_cpu_bug(c, X86_BUG_ESPFIX); - } while (0); -# else set_cpu_bug(c, X86_BUG_ESPFIX); -# endif #endif } -static void x86_init_cache_qos(struct cpuinfo_x86 *c) -{ - /* - * The heavy lifting of max_rmid and cache_occ_scale are handled - * in get_cpu_cap(). Here we just set the max_rmid for the boot_cpu - * in case CQM bits really aren't there in this CPU. - */ - if (c != &boot_cpu_data) { - boot_cpu_data.x86_cache_max_rmid = - min(boot_cpu_data.x86_cache_max_rmid, - c->x86_cache_max_rmid); - } -} - /* * Validate that ACPI/mptables have the same information about the * effective APIC id and update the package map. @@ -1476,6 +1828,12 @@ static void identify_cpu(struct cpuinfo_x86 *c) setup_smap(c); setup_umip(c); + /* Enable FSGSBASE instructions if available. */ + if (cpu_has(c, X86_FEATURE_FSGSBASE)) { + cr4_set_bits(X86_CR4_FSGSBASE); + elf_hwcap2 |= HWCAP2_FSGSBASE; + } + /* * The vendor-specific functions might have changed features. * Now we do "generic changes." @@ -1501,8 +1859,8 @@ static void identify_cpu(struct cpuinfo_x86 *c) #endif x86_init_rdrand(c); - x86_init_cache_qos(c); setup_pku(c); + setup_cet(c); /* * Clear/Set all flags overridden by options, need do it @@ -1526,6 +1884,8 @@ static void identify_cpu(struct cpuinfo_x86 *c) c->x86_capability[i] |= boot_cpu_data.x86_capability[i]; } + ppin_init(c); + /* Init Machine Check Exception if available. 
*/ mcheck_cpu_init(c); @@ -1569,6 +1929,8 @@ void enable_sep_cpu(void) void __init identify_boot_cpu(void) { identify_cpu(&boot_cpu_data); + if (HAS_KERNEL_IBT && cpu_feature_enabled(X86_FEATURE_IBT)) + pr_info("CET detected: Indirect Branch Tracking enabled\n"); #ifdef CONFIG_X86_32 sysenter_setup(); enable_sep_cpu(); @@ -1589,15 +1951,10 @@ void identify_secondary_cpu(struct cpuinfo_x86 *c) mtrr_ap_init(); validate_apic_and_package_id(c); x86_spec_ctrl_setup_ap(); -} + update_srbds_msr(); -static __init int setup_noclflush(char *arg) -{ - setup_clear_cpu_cap(X86_FEATURE_CLFLUSH); - setup_clear_cpu_cap(X86_FEATURE_CLFLUSHOPT); - return 1; + tsx_ap_init(); } -__setup("noclflush", setup_noclflush); void print_cpu_info(struct cpuinfo_x86 *c) { @@ -1627,9 +1984,8 @@ void print_cpu_info(struct cpuinfo_x86 *c) } /* - * clearcpuid= was already parsed in fpu__init_parse_early_param. - * But we need to keep a dummy __setup around otherwise it would - * show up as an environment variable for init. + * clearcpuid= was already parsed in cpu_parse_early_param(). This dummy + * function prevents it from becoming an environment variable for init. */ static __init int setup_clearcpuid(char *arg) { @@ -1650,12 +2006,25 @@ DEFINE_PER_CPU(struct task_struct *, current_task) ____cacheline_aligned = &init_task; EXPORT_PER_CPU_SYMBOL(current_task); -DEFINE_PER_CPU(struct irq_stack *, hardirq_stack_ptr); -DEFINE_PER_CPU(unsigned int, irq_count) __visible = -1; +DEFINE_PER_CPU(void *, hardirq_stack_ptr); +DEFINE_PER_CPU(bool, hardirq_stack_inuse); DEFINE_PER_CPU(int, __preempt_count) = INIT_PREEMPT_COUNT; EXPORT_PER_CPU_SYMBOL(__preempt_count); +DEFINE_PER_CPU(unsigned long, cpu_current_top_of_stack) = TOP_OF_INIT_STACK; + +static void wrmsrl_cstar(unsigned long val) +{ + /* + * Intel CPUs do not support 32-bit SYSCALL. Writing to MSR_CSTAR + * is so far ignored by the CPU, but raises a #VE trap in a TDX + * guest. Avoid the pointless write on all Intel CPUs. 
+ */ + if (boot_cpu_data.x86_vendor != X86_VENDOR_INTEL) + wrmsrl(MSR_CSTAR, val); +} + /* May not be marked __init: used by software suspend */ void syscall_init(void) { @@ -1663,7 +2032,7 @@ void syscall_init(void) wrmsrl(MSR_LSTAR, (unsigned long)entry_SYSCALL_64); #ifdef CONFIG_IA32_EMULATION - wrmsrl(MSR_CSTAR, (unsigned long)entry_SYSCALL_compat); + wrmsrl_cstar((unsigned long)entry_SYSCALL_compat); /* * This only works on Intel CPUs. * On AMD CPUs these MSRs are 32-bit, CPU truncates MSR_IA32_SYSENTER_EIP. @@ -1675,36 +2044,23 @@ void syscall_init(void) (unsigned long)(cpu_entry_stack(smp_processor_id()) + 1)); wrmsrl_safe(MSR_IA32_SYSENTER_EIP, (u64)entry_SYSENTER_compat); #else - wrmsrl(MSR_CSTAR, (unsigned long)ignore_sysret); + wrmsrl_cstar((unsigned long)ignore_sysret); wrmsrl_safe(MSR_IA32_SYSENTER_CS, (u64)GDT_ENTRY_INVALID_SEG); wrmsrl_safe(MSR_IA32_SYSENTER_ESP, 0ULL); wrmsrl_safe(MSR_IA32_SYSENTER_EIP, 0ULL); #endif - /* Flags to clear on syscall */ + /* + * Flags to clear on syscall; clear as much as possible + * to minimize user space-kernel interference. 
+ */ wrmsrl(MSR_SYSCALL_MASK, - X86_EFLAGS_TF|X86_EFLAGS_DF|X86_EFLAGS_IF| - X86_EFLAGS_IOPL|X86_EFLAGS_AC|X86_EFLAGS_NT); -} - -DEFINE_PER_CPU(int, debug_stack_usage); -DEFINE_PER_CPU(u32, debug_idt_ctr); - -void debug_stack_set_zero(void) -{ - this_cpu_inc(debug_idt_ctr); - load_current_idt(); + X86_EFLAGS_CF|X86_EFLAGS_PF|X86_EFLAGS_AF| + X86_EFLAGS_ZF|X86_EFLAGS_SF|X86_EFLAGS_TF| + X86_EFLAGS_IF|X86_EFLAGS_DF|X86_EFLAGS_OF| + X86_EFLAGS_IOPL|X86_EFLAGS_NT|X86_EFLAGS_RF| + X86_EFLAGS_AC|X86_EFLAGS_ID); } -NOKPROBE_SYMBOL(debug_stack_set_zero); - -void debug_stack_reset(void) -{ - if (WARN_ON(!this_cpu_read(debug_idt_ctr))) - return; - if (this_cpu_dec_return(debug_idt_ctr) == 0) - load_current_idt(); -} -NOKPROBE_SYMBOL(debug_stack_reset); #else /* CONFIG_X86_64 */ @@ -1723,7 +2079,8 @@ DEFINE_PER_CPU(unsigned long, cpu_current_top_of_stack) = EXPORT_PER_CPU_SYMBOL(cpu_current_top_of_stack); #ifdef CONFIG_STACKPROTECTOR -DEFINE_PER_CPU_ALIGNED(struct stack_canary, stack_canary); +DEFINE_PER_CPU(unsigned long, __stack_chk_guard); +EXPORT_PER_CPU_SYMBOL(__stack_chk_guard); #endif #endif /* CONFIG_X86_64 */ @@ -1777,8 +2134,8 @@ static inline void setup_getcpu(int cpu) unsigned long cpudata = vdso_encode_cpunode(cpu, early_cpu_to_node(cpu)); struct desc_struct d = { }; - if (boot_cpu_has(X86_FEATURE_RDTSCP)) - write_rdtscp_aux(cpudata); + if (boot_cpu_has(X86_FEATURE_RDTSCP) || boot_cpu_has(X86_FEATURE_RDPID)) + wrmsr(MSR_TSC_AUX, cpudata, 0); /* Store CPU and node number in limit. 
*/ d.limit0 = cpudata; @@ -1806,6 +2163,8 @@ static inline void tss_setup_ist(struct tss_struct *tss) tss->x86_tss.ist[IST_INDEX_NMI] = __this_cpu_ist_top_va(NMI); tss->x86_tss.ist[IST_INDEX_DB] = __this_cpu_ist_top_va(DB); tss->x86_tss.ist[IST_INDEX_MCE] = __this_cpu_ist_top_va(MCE); + /* Only mapped when SEV-ES is active */ + tss->x86_tss.ist[IST_INDEX_VC] = __this_cpu_ist_top_va(VC); } #else /* CONFIG_X86_64 */ @@ -1838,14 +2197,39 @@ static inline void tss_setup_io_bitmap(struct tss_struct *tss) } /* + * Setup everything needed to handle exceptions from the IDT, including the IST + * exceptions which use paranoid_entry(). + */ +void cpu_init_exception_handling(void) +{ + struct tss_struct *tss = this_cpu_ptr(&cpu_tss_rw); + int cpu = raw_smp_processor_id(); + + /* paranoid_entry() gets the CPU number from the GDT */ + setup_getcpu(cpu); + + /* IST vectors need TSS to be set up. */ + tss_setup_ist(tss); + tss_setup_io_bitmap(tss); + set_tss_desc(cpu, &get_cpu_entry_area(cpu)->tss.x86_tss); + + load_TR_desc(); + + /* GHCB needs to be setup to handle #VC. */ + setup_ghcb(); + + /* Finally load the IDT */ + load_current_idt(); +} + +/* * cpu_init() initializes state that is per-CPU. Some data is already - * initialized (naturally) in the bootstrap process, such as the GDT - * and IDT. We reload them nevertheless, this function acts as a - * 'CPU state barrier', nothing should get across. + * initialized (naturally) in the bootstrap process, such as the GDT. We + * reload it nevertheless, this function acts as a 'CPU state barrier', + * nothing should get across. 
*/ void cpu_init(void) { - struct tss_struct *tss = this_cpu_ptr(&cpu_tss_rw); struct task_struct *cur = current; int cpu = raw_smp_processor_id(); @@ -1858,8 +2242,6 @@ void cpu_init(void) early_cpu_to_node(cpu) != NUMA_NO_NODE) set_numa_node(early_cpu_to_node(cpu)); #endif - setup_getcpu(cpu); - pr_debug("Initializing CPU#%d\n", cpu); if (IS_ENABLED(CONFIG_X86_64) || cpu_feature_enabled(X86_FEATURE_VME) || @@ -1871,7 +2253,6 @@ void cpu_init(void) * and set up the GDT descriptor: */ switch_to_new_gdt(cpu); - load_current_idt(); if (IS_ENABLED(CONFIG_X86_64)) { loadsegment(fs, 0); @@ -1891,12 +2272,6 @@ void cpu_init(void) initialize_tlbstate_and_flush(); enter_lazy_tlb(&init_mm, cur); - /* Initialize the TSS. */ - tss_setup_ist(tss); - tss_setup_io_bitmap(tss); - set_tss_desc(cpu, &get_cpu_entry_area(cpu)->tss.x86_tss); - - load_TR_desc(); /* * sp0 points to the entry trampoline stack regardless of what task * is running. @@ -1918,6 +2293,19 @@ void cpu_init(void) load_fixmap_gdt(cpu); } +#ifdef CONFIG_SMP +void cpu_init_secondary(void) +{ + /* + * Relies on the BP having set-up the IDT tables, which are loaded + * on this CPU in cpu_init_exception_handling(). + */ + cpu_init_exception_handling(); + cpu_init(); +} +#endif + +#ifdef CONFIG_MICROCODE_LATE_LOADING /* * The microcode loader calls this upon late microcode load to recheck features, * only when microcode has been updated. 
Caller holds microcode_mutex and CPU @@ -1947,6 +2335,7 @@ void microcode_check(void) pr_warn("x86/CPU: CPU features have changed after loading microcode, but might not take effect.\n"); pr_warn("x86/CPU: Please consider either early loading through initrd/built-in or a potential BIOS update.\n"); } +#endif /* * Invoked from core CPU hotplug code after hotplug operations diff --git a/arch/x86/kernel/cpu/cpu.h b/arch/x86/kernel/cpu/cpu.h index 37fdefd14f28..7c9b5893c30a 100644 --- a/arch/x86/kernel/cpu/cpu.h +++ b/arch/x86/kernel/cpu/cpu.h @@ -38,7 +38,7 @@ struct _tlb_table { #define cpu_dev_register(cpu_devX) \ static const struct cpu_dev *const __cpu_dev_##cpu_devX __used \ - __attribute__((__section__(".x86_cpu_dev.init"))) = \ + __section(".x86_cpu_dev.init") = \ &cpu_devX; extern const struct cpu_dev *const __x86_cpu_dev_start[], @@ -48,18 +48,21 @@ extern const struct cpu_dev *const __x86_cpu_dev_start[], enum tsx_ctrl_states { TSX_CTRL_ENABLE, TSX_CTRL_DISABLE, + TSX_CTRL_RTM_ALWAYS_ABORT, TSX_CTRL_NOT_SUPPORTED, }; extern __ro_after_init enum tsx_ctrl_states tsx_ctrl_state; extern void __init tsx_init(void); -extern void tsx_enable(void); -extern void tsx_disable(void); +void tsx_ap_init(void); #else static inline void tsx_init(void) { } +static inline void tsx_ap_init(void) { } #endif /* CONFIG_CPU_SUP_INTEL */ +extern void init_spectral_chicken(struct cpuinfo_x86 *c); + extern void get_cpu_cap(struct cpuinfo_x86 *c); extern void get_cpu_address_sizes(struct cpuinfo_x86 *c); extern void cpu_detect_cache_sizes(struct cpuinfo_x86 *c); @@ -73,15 +76,13 @@ extern int detect_extended_topology_early(struct cpuinfo_x86 *c); extern int detect_extended_topology(struct cpuinfo_x86 *c); extern int detect_ht_early(struct cpuinfo_x86 *c); extern void detect_ht(struct cpuinfo_x86 *c); +extern void check_null_seg_clears_base(struct cpuinfo_x86 *c); unsigned int aperfmperf_get_khz(int cpu); extern void x86_spec_ctrl_setup_ap(void); +extern void update_srbds_msr(void); 
extern u64 x86_read_arch_cap_msr(void); -#ifdef CONFIG_IA32_FEAT_CTL -void init_ia32_feat_ctl(struct cpuinfo_x86 *c); -#endif - #endif /* ARCH_X86_CPU_H */ diff --git a/arch/x86/kernel/cpu/cpuid-deps.c b/arch/x86/kernel/cpu/cpuid-deps.c index 3cbe24ca80ab..c881bcafba7d 100644 --- a/arch/x86/kernel/cpu/cpuid-deps.c +++ b/arch/x86/kernel/cpu/cpuid-deps.c @@ -69,6 +69,15 @@ static const struct cpuid_dep cpuid_deps[] = { { X86_FEATURE_CQM_MBM_TOTAL, X86_FEATURE_CQM_LLC }, { X86_FEATURE_CQM_MBM_LOCAL, X86_FEATURE_CQM_LLC }, { X86_FEATURE_AVX512_BF16, X86_FEATURE_AVX512VL }, + { X86_FEATURE_AVX512_FP16, X86_FEATURE_AVX512BW }, + { X86_FEATURE_ENQCMD, X86_FEATURE_XSAVES }, + { X86_FEATURE_PER_THREAD_MBA, X86_FEATURE_MBA }, + { X86_FEATURE_SGX_LC, X86_FEATURE_SGX }, + { X86_FEATURE_SGX1, X86_FEATURE_SGX }, + { X86_FEATURE_SGX2, X86_FEATURE_SGX1 }, + { X86_FEATURE_XFD, X86_FEATURE_XSAVES }, + { X86_FEATURE_XFD, X86_FEATURE_XGETBV1 }, + { X86_FEATURE_AMX_TILE, X86_FEATURE_XFD }, {} }; diff --git a/arch/x86/kernel/cpu/cyrix.c b/arch/x86/kernel/cpu/cyrix.c index 1d9b8aaea06c..9651275aecd1 100644 --- a/arch/x86/kernel/cpu/cyrix.c +++ b/arch/x86/kernel/cpu/cyrix.c @@ -1,6 +1,7 @@ // SPDX-License-Identifier: GPL-2.0 #include <linux/bitops.h> #include <linux/delay.h> +#include <linux/isa-dma.h> #include <linux/pci.h> #include <asm/dma.h> #include <linux/io.h> @@ -291,7 +292,7 @@ static void init_cyrix(struct cpuinfo_x86 *c) mark_tsc_unstable("cyrix 5510/5520 detected"); } #endif - c->x86_cache_size = 16; /* Yep 16K integrated cache thats it */ + c->x86_cache_size = 16; /* Yep 16K integrated cache that's it */ /* GXm supports extended cpuid levels 'ala' AMD */ if (c->cpuid_level == 2) { diff --git a/arch/x86/kernel/cpu/feat_ctl.c b/arch/x86/kernel/cpu/feat_ctl.c index 0268185bef94..03851240c3e3 100644 --- a/arch/x86/kernel/cpu/feat_ctl.c +++ b/arch/x86/kernel/cpu/feat_ctl.c @@ -1,6 +1,7 @@ // SPDX-License-Identifier: GPL-2.0 #include <linux/tboot.h> +#include <asm/cpu.h> #include 
<asm/cpufeature.h> #include <asm/msr-index.h> #include <asm/processor.h> @@ -14,6 +15,8 @@ enum vmx_feature_leafs { MISC_FEATURES = 0, PRIMARY_CTLS, SECONDARY_CTLS, + TERTIARY_CTLS_LOW, + TERTIARY_CTLS_HIGH, NR_VMX_FEATURE_WORDS, }; @@ -21,7 +24,7 @@ enum vmx_feature_leafs { static void init_vmx_capabilities(struct cpuinfo_x86 *c) { - u32 supported, funcs, ept, vpid, ign; + u32 supported, funcs, ept, vpid, ign, low, high; BUILD_BUG_ON(NVMXINTS != NR_VMX_FEATURE_WORDS); @@ -41,6 +44,11 @@ static void init_vmx_capabilities(struct cpuinfo_x86 *c) rdmsr_safe(MSR_IA32_VMX_PROCBASED_CTLS2, &ign, &supported); c->vmx_capability[SECONDARY_CTLS] = supported; + /* All 64 bits of tertiary controls MSR are allowed-1 settings. */ + rdmsr_safe(MSR_IA32_VMX_PROCBASED_CTLS3, &low, &high); + c->vmx_capability[TERTIARY_CTLS_LOW] = low; + c->vmx_capability[TERTIARY_CTLS_HIGH] = high; + rdmsr(MSR_IA32_VMX_PINBASED_CTLS, ign, supported); rdmsr_safe(MSR_IA32_VMX_VMFUNC, &ign, &funcs); @@ -92,16 +100,42 @@ static void init_vmx_capabilities(struct cpuinfo_x86 *c) } #endif /* CONFIG_X86_VMX_FEATURE_NAMES */ +static int __init nosgx(char *str) +{ + setup_clear_cpu_cap(X86_FEATURE_SGX); + + return 0; +} + +early_param("nosgx", nosgx); + void init_ia32_feat_ctl(struct cpuinfo_x86 *c) { + bool enable_sgx_kvm = false, enable_sgx_driver = false; bool tboot = tboot_enabled(); + bool enable_vmx; u64 msr; if (rdmsrl_safe(MSR_IA32_FEAT_CTL, &msr)) { clear_cpu_cap(c, X86_FEATURE_VMX); + clear_cpu_cap(c, X86_FEATURE_SGX); return; } + enable_vmx = cpu_has(c, X86_FEATURE_VMX) && + IS_ENABLED(CONFIG_KVM_INTEL); + + if (cpu_has(c, X86_FEATURE_SGX) && IS_ENABLED(CONFIG_X86_SGX)) { + /* + * Separate out SGX driver enabling from KVM. This allows KVM + * guests to use SGX even if the kernel SGX driver refuses to + * use it. This happens if flexible Launch Control is not + * available. 
+ */ + enable_sgx_driver = cpu_has(c, X86_FEATURE_SGX_LC); + enable_sgx_kvm = enable_vmx && IS_ENABLED(CONFIG_X86_SGX_KVM); + } + if (msr & FEAT_CTL_LOCKED) goto update_caps; @@ -116,20 +150,26 @@ void init_ia32_feat_ctl(struct cpuinfo_x86 *c) * i.e. KVM is enabled, to avoid unnecessarily adding an attack vector * for the kernel, e.g. using VMX to hide malicious code. */ - if (cpu_has(c, X86_FEATURE_VMX) && IS_ENABLED(CONFIG_KVM_INTEL)) { + if (enable_vmx) { msr |= FEAT_CTL_VMX_ENABLED_OUTSIDE_SMX; if (tboot) msr |= FEAT_CTL_VMX_ENABLED_INSIDE_SMX; } + if (enable_sgx_kvm || enable_sgx_driver) { + msr |= FEAT_CTL_SGX_ENABLED; + if (enable_sgx_driver) + msr |= FEAT_CTL_SGX_LC_ENABLED; + } + wrmsrl(MSR_IA32_FEAT_CTL, msr); update_caps: set_cpu_cap(c, X86_FEATURE_MSR_IA32_FEAT_CTL); if (!cpu_has(c, X86_FEATURE_VMX)) - return; + goto update_sgx; if ( (tboot && !(msr & FEAT_CTL_VMX_ENABLED_INSIDE_SMX)) || (!tboot && !(msr & FEAT_CTL_VMX_ENABLED_OUTSIDE_SMX))) { @@ -142,4 +182,31 @@ update_caps: init_vmx_capabilities(c); #endif } + +update_sgx: + if (!(msr & FEAT_CTL_SGX_ENABLED)) { + if (enable_sgx_kvm || enable_sgx_driver) + pr_err_once("SGX disabled by BIOS.\n"); + clear_cpu_cap(c, X86_FEATURE_SGX); + return; + } + + /* + * VMX feature bit may be cleared due to being disabled in BIOS, + * in which case SGX virtualization cannot be supported either. + */ + if (!cpu_has(c, X86_FEATURE_VMX) && enable_sgx_kvm) { + pr_err_once("SGX virtualization disabled due to lack of VMX.\n"); + enable_sgx_kvm = 0; + } + + if (!(msr & FEAT_CTL_SGX_LC_ENABLED) && enable_sgx_driver) { + if (!enable_sgx_kvm) { + pr_err_once("SGX Launch Control is locked. Disable SGX.\n"); + clear_cpu_cap(c, X86_FEATURE_SGX); + } else { + pr_err_once("SGX Launch Control is locked. 
Support SGX virtualization only.\n"); + clear_cpu_cap(c, X86_FEATURE_SGX_LC); + } + } } diff --git a/arch/x86/kernel/cpu/hygon.c b/arch/x86/kernel/cpu/hygon.c index 4e28c1fc8749..21fd425088fe 100644 --- a/arch/x86/kernel/cpu/hygon.c +++ b/arch/x86/kernel/cpu/hygon.c @@ -10,12 +10,10 @@ #include <asm/cpu.h> #include <asm/smp.h> +#include <asm/numa.h> #include <asm/cacheinfo.h> #include <asm/spec-ctrl.h> #include <asm/delay.h> -#ifdef CONFIG_X86_64 -# include <asm/set_memory.h> -#endif #include "cpu.h" @@ -64,7 +62,6 @@ static void hygon_get_topology_early(struct cpuinfo_x86 *c) */ static void hygon_get_topology(struct cpuinfo_x86 *c) { - u8 node_id; int cpu = smp_processor_id(); /* get information required for multi-node processors */ @@ -74,7 +71,7 @@ static void hygon_get_topology(struct cpuinfo_x86 *c) cpuid(0x8000001e, &eax, &ebx, &ecx, &edx); - node_id = ecx & 0xff; + c->cpu_die_id = ecx & 0xff; c->cpu_core_id = ebx & 0xff; @@ -92,14 +89,14 @@ static void hygon_get_topology(struct cpuinfo_x86 *c) /* Socket ID is ApicId[6] for these processors. 
*/ c->phys_proc_id = c->apicid >> APICID_SOCKET_ID_BIT; - cacheinfo_hygon_init_llc_id(c, cpu, node_id); + cacheinfo_hygon_init_llc_id(c, cpu); } else if (cpu_has(c, X86_FEATURE_NODEID_MSR)) { u64 value; rdmsrl(MSR_FAM10H_NODE_ID, value); - node_id = value & 7; + c->cpu_die_id = value & 7; - per_cpu(cpu_llc_id, cpu) = node_id; + per_cpu(cpu_llc_id, cpu) = c->cpu_die_id; } else return; @@ -122,7 +119,7 @@ static void hygon_detect_cmp(struct cpuinfo_x86 *c) /* Convert the initial APIC ID into the socket ID */ c->phys_proc_id = c->initial_apicid >> bits; /* use socket ID also for last level cache */ - per_cpu(cpu_llc_id, cpu) = c->phys_proc_id; + per_cpu(cpu_llc_id, cpu) = c->cpu_die_id = c->phys_proc_id; } static void srat_detect_node(struct cpuinfo_x86 *c) @@ -203,23 +200,6 @@ static void early_init_hygon_mc(struct cpuinfo_x86 *c) static void bsp_init_hygon(struct cpuinfo_x86 *c) { -#ifdef CONFIG_X86_64 - unsigned long long tseg; - - /* - * Split up direct mapping around the TSEG SMM area. - * Don't do it for gbpages because there seems very little - * benefit in doing so. 
- */ - if (!rdmsrl_safe(MSR_K8_TSEG_ADDR, &tseg)) { - unsigned long pfn = tseg >> PAGE_SHIFT; - - pr_debug("tseg: %010llx\n", tseg); - if (pfn_range_is_mapped(pfn, pfn + 1)) - set_memory_4k((unsigned long)__va(tseg), 1); - } -#endif - if (cpu_has(c, X86_FEATURE_CONSTANT_TSC)) { u64 val; @@ -235,12 +215,12 @@ static void bsp_init_hygon(struct cpuinfo_x86 *c) u32 ecx; ecx = cpuid_ecx(0x8000001e); - nodes_per_socket = ((ecx >> 8) & 7) + 1; + __max_die_per_package = nodes_per_socket = ((ecx >> 8) & 7) + 1; } else if (boot_cpu_has(X86_FEATURE_NODEID_MSR)) { u64 value; rdmsrl(MSR_FAM10H_NODE_ID, value); - nodes_per_socket = ((value >> 3) & 7) + 1; + __max_die_per_package = nodes_per_socket = ((value >> 3) & 7) + 1; } if (!boot_cpu_has(X86_FEATURE_AMD_SSBD) && @@ -280,6 +260,10 @@ static void early_init_hygon(struct cpuinfo_x86 *c) if (c->x86_power & BIT(12)) set_cpu_cap(c, X86_FEATURE_ACC_POWER); + /* Bit 14 indicates the Runtime Average Power Limit interface. */ + if (c->x86_power & BIT(14)) + set_cpu_cap(c, X86_FEATURE_RAPL); + #ifdef CONFIG_X86_64 set_cpu_cap(c, X86_FEATURE_SYSCALL32); #endif @@ -318,6 +302,12 @@ static void init_hygon(struct cpuinfo_x86 *c) /* get apicid instead of initial apic id from cpuid */ c->apicid = hard_smp_processor_id(); + /* + * XXX someone from Hygon needs to confirm this DTRT + * + init_spectral_chicken(c); + */ + set_cpu_cap(c, X86_FEATURE_ZEN); set_cpu_cap(c, X86_FEATURE_CPB); @@ -351,6 +341,8 @@ static void init_hygon(struct cpuinfo_x86 *c) /* Hygon CPUs don't reset SS attributes on SYSRET, Xen does. 
*/ if (!cpu_has(c, X86_FEATURE_XENPV)) set_cpu_bug(c, X86_BUG_SYSRET_SS_ATTRS); + + check_null_seg_clears_base(c); } static void cpu_detect_tlb_hygon(struct cpuinfo_x86 *c) diff --git a/arch/x86/kernel/cpu/intel.c b/arch/x86/kernel/cpu/intel.c index be82cd5841c3..2d7ea5480ec3 100644 --- a/arch/x86/kernel/cpu/intel.c +++ b/arch/x86/kernel/cpu/intel.c @@ -1,17 +1,21 @@ // SPDX-License-Identifier: GPL-2.0 #include <linux/kernel.h> +#include <linux/pgtable.h> #include <linux/string.h> #include <linux/bitops.h> #include <linux/smp.h> #include <linux/sched.h> #include <linux/sched/clock.h> +#include <linux/semaphore.h> #include <linux/thread_info.h> #include <linux/init.h> #include <linux/uaccess.h> +#include <linux/workqueue.h> +#include <linux/delay.h> +#include <linux/cpuhotplug.h> #include <asm/cpufeature.h> -#include <asm/pgtable.h> #include <asm/msr.h> #include <asm/bugs.h> #include <asm/cpu.h> @@ -19,6 +23,12 @@ #include <asm/microcode_intel.h> #include <asm/hwcap2.h> #include <asm/elf.h> +#include <asm/cpu_device_id.h> +#include <asm/cmdline.h> +#include <asm/traps.h> +#include <asm/resctrl.h> +#include <asm/numa.h> +#include <asm/thermal.h> #ifdef CONFIG_X86_64 #include <linux/topology.h> @@ -31,6 +41,28 @@ #include <asm/apic.h> #endif +enum split_lock_detect_state { + sld_off = 0, + sld_warn, + sld_fatal, + sld_ratelimit, +}; + +/* + * Default to sld_off because most systems do not support split lock detection. + * sld_state_setup() will switch this to sld_warn on systems that support + * split lock/bus lock detect, unless there is a command line override. + */ +static enum split_lock_detect_state sld_state __ro_after_init = sld_off; +static u64 msr_test_ctrl_cache __ro_after_init; + +/* + * With a name like MSR_TEST_CTL it should go without saying, but don't touch + * MSR_TEST_CTL unless the CPU is one of the whitelisted models. Writing it + * on CPUs that do not support SLD can cause fireworks, even when writing '0'. 
+ */ +static bool cpu_model_supports_sld __ro_after_init; + /* * Processors which have self-snooping capability can handle conflicting * memory type across CPUs by snooping its own cache. However, there exists @@ -62,7 +94,7 @@ static bool ring3mwait_disabled __read_mostly; static int __init ring3mwait_disable(char *__unused) { ring3mwait_disabled = true; - return 0; + return 1; } __setup("ring3mwait=disable", ring3mwait_disable); @@ -152,6 +184,38 @@ static bool bad_spectre_microcode(struct cpuinfo_x86 *c) return false; } +int intel_cpu_collect_info(struct ucode_cpu_info *uci) +{ + unsigned int val[2]; + unsigned int family, model; + struct cpu_signature csig = { 0 }; + unsigned int eax, ebx, ecx, edx; + + memset(uci, 0, sizeof(*uci)); + + eax = 0x00000001; + ecx = 0; + native_cpuid(&eax, &ebx, &ecx, &edx); + csig.sig = eax; + + family = x86_family(eax); + model = x86_model(eax); + + if (model >= 5 || family > 6) { + /* get processor flags from MSR 0x17 */ + native_rdmsr(MSR_IA32_PLATFORM_ID, val[0], val[1]); + csig.pf = 1 << ((val[1] >> 18) & 7); + } + + csig.rev = intel_get_microcode_revision(); + + uci->cpu_sig = csig; + uci->valid = 1; + + return 0; +} +EXPORT_SYMBOL_GPL(intel_cpu_collect_info); + static void early_init_intel(struct cpuinfo_x86 *c) { u64 misc_enable; @@ -274,7 +338,7 @@ static void early_init_intel(struct cpuinfo_x86 *c) * The operating system must reload CR3 to cause the TLB to be flushed" * * As a result, boot_cpu_has(X86_FEATURE_PGE) in arch/x86/include/asm/tlbflush.h - * should be false so that __flush_tlb_all() causes CR3 insted of CR4.PGE + * should be false so that __flush_tlb_all() causes CR3 instead of CR4.PGE * to be modified. 
*/ if (c->x86 == 5 && c->x86_model == 9) { @@ -305,6 +369,11 @@ static void early_init_intel(struct cpuinfo_x86 *c) detect_ht_early(c); } +static void bsp_init_intel(struct cpuinfo_x86 *c) +{ + resctrl_cpu_detect(c); +} + #ifdef CONFIG_X86_32 /* * Early probe support logic for ppro memory erratum #50 @@ -570,6 +639,9 @@ static void init_intel_misc_features(struct cpuinfo_x86 *c) wrmsrl(MSR_MISC_FEATURES_ENABLES, msr); } +static void split_lock_init(void); +static void bus_lock_init(void); + static void init_intel(struct cpuinfo_x86 *c) { early_init_intel(c); @@ -610,9 +682,9 @@ static void init_intel(struct cpuinfo_x86 *c) unsigned int l1, l2; rdmsr(MSR_IA32_MISC_ENABLE, l1, l2); - if (!(l1 & (1<<11))) + if (!(l1 & MSR_IA32_MISC_ENABLE_BTS_UNAVAIL)) set_cpu_cap(c, X86_FEATURE_BTS); - if (!(l1 & (1<<12))) + if (!(l1 & MSR_IA32_MISC_ENABLE_PEBS_UNAVAIL)) set_cpu_cap(c, X86_FEATURE_PEBS); } @@ -680,10 +752,10 @@ static void init_intel(struct cpuinfo_x86 *c) init_intel_misc_features(c); - if (tsx_ctrl_state == TSX_CTRL_ENABLE) - tsx_enable(); - if (tsx_ctrl_state == TSX_CTRL_DISABLE) - tsx_disable(); + split_lock_init(); + bus_lock_init(); + + intel_init_thermal(c); } #ifdef CONFIG_X86_32 @@ -940,8 +1012,373 @@ static const struct cpu_dev intel_cpu_dev = { #endif .c_detect_tlb = intel_detect_tlb, .c_early_init = early_init_intel, + .c_bsp_init = bsp_init_intel, .c_init = init_intel, .c_x86_vendor = X86_VENDOR_INTEL, }; cpu_dev_register(intel_cpu_dev); + +#undef pr_fmt +#define pr_fmt(fmt) "x86/split lock detection: " fmt + +static const struct { + const char *option; + enum split_lock_detect_state state; +} sld_options[] __initconst = { + { "off", sld_off }, + { "warn", sld_warn }, + { "fatal", sld_fatal }, + { "ratelimit:", sld_ratelimit }, +}; + +static struct ratelimit_state bld_ratelimit; + +static DEFINE_SEMAPHORE(buslock_sem); + +static inline bool match_option(const char *arg, int arglen, const char *opt) +{ + int len = strlen(opt), ratelimit; + + if 
(strncmp(arg, opt, len)) + return false; + + /* + * Min ratelimit is 1 bus lock/sec. + * Max ratelimit is 1000 bus locks/sec. + */ + if (sscanf(arg, "ratelimit:%d", &ratelimit) == 1 && + ratelimit > 0 && ratelimit <= 1000) { + ratelimit_state_init(&bld_ratelimit, HZ, ratelimit); + ratelimit_set_flags(&bld_ratelimit, RATELIMIT_MSG_ON_RELEASE); + return true; + } + + return len == arglen; +} + +static bool split_lock_verify_msr(bool on) +{ + u64 ctrl, tmp; + + if (rdmsrl_safe(MSR_TEST_CTRL, &ctrl)) + return false; + if (on) + ctrl |= MSR_TEST_CTRL_SPLIT_LOCK_DETECT; + else + ctrl &= ~MSR_TEST_CTRL_SPLIT_LOCK_DETECT; + if (wrmsrl_safe(MSR_TEST_CTRL, ctrl)) + return false; + rdmsrl(MSR_TEST_CTRL, tmp); + return ctrl == tmp; +} + +static void __init sld_state_setup(void) +{ + enum split_lock_detect_state state = sld_warn; + char arg[20]; + int i, ret; + + if (!boot_cpu_has(X86_FEATURE_SPLIT_LOCK_DETECT) && + !boot_cpu_has(X86_FEATURE_BUS_LOCK_DETECT)) + return; + + ret = cmdline_find_option(boot_command_line, "split_lock_detect", + arg, sizeof(arg)); + if (ret >= 0) { + for (i = 0; i < ARRAY_SIZE(sld_options); i++) { + if (match_option(arg, ret, sld_options[i].option)) { + state = sld_options[i].state; + break; + } + } + } + sld_state = state; +} + +static void __init __split_lock_setup(void) +{ + if (!split_lock_verify_msr(false)) { + pr_info("MSR access failed: Disabled\n"); + return; + } + + rdmsrl(MSR_TEST_CTRL, msr_test_ctrl_cache); + + if (!split_lock_verify_msr(true)) { + pr_info("MSR access failed: Disabled\n"); + return; + } + + /* Restore the MSR to its cached value. */ + wrmsrl(MSR_TEST_CTRL, msr_test_ctrl_cache); + + setup_force_cpu_cap(X86_FEATURE_SPLIT_LOCK_DETECT); +} + +/* + * MSR_TEST_CTRL is per core, but we treat it like a per CPU MSR. Locking + * is not implemented as one thread could undo the setting of the other + * thread immediately after dropping the lock anyway. 
+ */ +static void sld_update_msr(bool on) +{ + u64 test_ctrl_val = msr_test_ctrl_cache; + + if (on) + test_ctrl_val |= MSR_TEST_CTRL_SPLIT_LOCK_DETECT; + + wrmsrl(MSR_TEST_CTRL, test_ctrl_val); +} + +static void split_lock_init(void) +{ + /* + * #DB for bus lock handles ratelimit and #AC for split lock is + * disabled. + */ + if (sld_state == sld_ratelimit) { + split_lock_verify_msr(false); + return; + } + + if (cpu_model_supports_sld) + split_lock_verify_msr(sld_state != sld_off); +} + +static void __split_lock_reenable(struct work_struct *work) +{ + sld_update_msr(true); + up(&buslock_sem); +} + +/* + * If a CPU goes offline with pending delayed work to re-enable split lock + * detection then the delayed work will be executed on some other CPU. That + * handles releasing the buslock_sem, but because it executes on a + * different CPU probably won't re-enable split lock detection. This is a + * problem on HT systems since the sibling CPU on the same core may then be + * left running with split lock detection disabled. + * + * Unconditionally re-enable detection here. 
+ */ +static int splitlock_cpu_offline(unsigned int cpu) +{ + sld_update_msr(true); + + return 0; +} + +static DECLARE_DELAYED_WORK(split_lock_reenable, __split_lock_reenable); + +static void split_lock_warn(unsigned long ip) +{ + int cpu; + + if (!current->reported_split_lock) + pr_warn_ratelimited("#AC: %s/%d took a split_lock trap at address: 0x%lx\n", + current->comm, current->pid, ip); + current->reported_split_lock = 1; + + /* misery factor #1, sleep 10ms before trying to execute split lock */ + if (msleep_interruptible(10) > 0) + return; + /* Misery factor #2, only allow one buslocked disabled core at a time */ + if (down_interruptible(&buslock_sem) == -EINTR) + return; + cpu = get_cpu(); + schedule_delayed_work_on(cpu, &split_lock_reenable, 2); + + /* Disable split lock detection on this CPU to make progress */ + sld_update_msr(false); + put_cpu(); +} + +bool handle_guest_split_lock(unsigned long ip) +{ + if (sld_state == sld_warn) { + split_lock_warn(ip); + return true; + } + + pr_warn_once("#AC: %s/%d %s split_lock trap at address: 0x%lx\n", + current->comm, current->pid, + sld_state == sld_fatal ? "fatal" : "bogus", ip); + + current->thread.error_code = 0; + current->thread.trap_nr = X86_TRAP_AC; + force_sig_fault(SIGBUS, BUS_ADRALN, NULL); + return false; +} +EXPORT_SYMBOL_GPL(handle_guest_split_lock); + +static void bus_lock_init(void) +{ + u64 val; + + if (!boot_cpu_has(X86_FEATURE_BUS_LOCK_DETECT)) + return; + + rdmsrl(MSR_IA32_DEBUGCTLMSR, val); + + if ((boot_cpu_has(X86_FEATURE_SPLIT_LOCK_DETECT) && + (sld_state == sld_warn || sld_state == sld_fatal)) || + sld_state == sld_off) { + /* + * Warn and fatal are handled by #AC for split lock if #AC for + * split lock is supported. 
+ */ + val &= ~DEBUGCTLMSR_BUS_LOCK_DETECT; + } else { + val |= DEBUGCTLMSR_BUS_LOCK_DETECT; + } + + wrmsrl(MSR_IA32_DEBUGCTLMSR, val); +} + +bool handle_user_split_lock(struct pt_regs *regs, long error_code) +{ + if ((regs->flags & X86_EFLAGS_AC) || sld_state == sld_fatal) + return false; + split_lock_warn(regs->ip); + return true; +} + +void handle_bus_lock(struct pt_regs *regs) +{ + switch (sld_state) { + case sld_off: + break; + case sld_ratelimit: + /* Enforce no more than bld_ratelimit bus locks/sec. */ + while (!__ratelimit(&bld_ratelimit)) + msleep(20); + /* Warn on the bus lock. */ + fallthrough; + case sld_warn: + pr_warn_ratelimited("#DB: %s/%d took a bus_lock trap at address: 0x%lx\n", + current->comm, current->pid, regs->ip); + break; + case sld_fatal: + force_sig_fault(SIGBUS, BUS_ADRALN, NULL); + break; + } +} + +/* + * Bits in the IA32_CORE_CAPABILITIES are not architectural, so they should + * only be trusted if it is confirmed that a CPU model implements a + * specific feature at a particular bit position. + * + * The possible driver data field values: + * + * - 0: CPU models that are known to have the per-core split-lock detection + * feature even though they do not enumerate IA32_CORE_CAPABILITIES. + * + * - 1: CPU models which may enumerate IA32_CORE_CAPABILITIES and if so use + * bit 5 to enumerate the per-core split-lock detection feature. 
+ */ +static const struct x86_cpu_id split_lock_cpu_ids[] __initconst = { + X86_MATCH_INTEL_FAM6_MODEL(ICELAKE_X, 0), + X86_MATCH_INTEL_FAM6_MODEL(ICELAKE_L, 0), + X86_MATCH_INTEL_FAM6_MODEL(ICELAKE_D, 0), + X86_MATCH_INTEL_FAM6_MODEL(ATOM_TREMONT, 1), + X86_MATCH_INTEL_FAM6_MODEL(ATOM_TREMONT_D, 1), + X86_MATCH_INTEL_FAM6_MODEL(ATOM_TREMONT_L, 1), + X86_MATCH_INTEL_FAM6_MODEL(TIGERLAKE_L, 1), + X86_MATCH_INTEL_FAM6_MODEL(TIGERLAKE, 1), + X86_MATCH_INTEL_FAM6_MODEL(SAPPHIRERAPIDS_X, 1), + X86_MATCH_INTEL_FAM6_MODEL(ALDERLAKE, 1), + X86_MATCH_INTEL_FAM6_MODEL(ALDERLAKE_L, 1), + X86_MATCH_INTEL_FAM6_MODEL(RAPTORLAKE, 1), + {} +}; + +static void __init split_lock_setup(struct cpuinfo_x86 *c) +{ + const struct x86_cpu_id *m; + u64 ia32_core_caps; + + if (boot_cpu_has(X86_FEATURE_HYPERVISOR)) + return; + + m = x86_match_cpu(split_lock_cpu_ids); + if (!m) + return; + + switch (m->driver_data) { + case 0: + break; + case 1: + if (!cpu_has(c, X86_FEATURE_CORE_CAPABILITIES)) + return; + rdmsrl(MSR_IA32_CORE_CAPS, ia32_core_caps); + if (!(ia32_core_caps & MSR_IA32_CORE_CAPS_SPLIT_LOCK_DETECT)) + return; + break; + default: + return; + } + + cpu_model_supports_sld = true; + __split_lock_setup(); +} + +static void sld_state_show(void) +{ + if (!boot_cpu_has(X86_FEATURE_BUS_LOCK_DETECT) && + !boot_cpu_has(X86_FEATURE_SPLIT_LOCK_DETECT)) + return; + + switch (sld_state) { + case sld_off: + pr_info("disabled\n"); + break; + case sld_warn: + if (boot_cpu_has(X86_FEATURE_SPLIT_LOCK_DETECT)) { + pr_info("#AC: crashing the kernel on kernel split_locks and warning on user-space split_locks\n"); + if (cpuhp_setup_state(CPUHP_AP_ONLINE_DYN, + "x86/splitlock", NULL, splitlock_cpu_offline) < 0) + pr_warn("No splitlock CPU offline handler\n"); + } else if (boot_cpu_has(X86_FEATURE_BUS_LOCK_DETECT)) { + pr_info("#DB: warning on user-space bus_locks\n"); + } + break; + case sld_fatal: + if (boot_cpu_has(X86_FEATURE_SPLIT_LOCK_DETECT)) { + pr_info("#AC: crashing the kernel on kernel 
split_locks and sending SIGBUS on user-space split_locks\n"); + } else if (boot_cpu_has(X86_FEATURE_BUS_LOCK_DETECT)) { + pr_info("#DB: sending SIGBUS on user-space bus_locks%s\n", + boot_cpu_has(X86_FEATURE_SPLIT_LOCK_DETECT) ? + " from non-WB" : ""); + } + break; + case sld_ratelimit: + if (boot_cpu_has(X86_FEATURE_BUS_LOCK_DETECT)) + pr_info("#DB: setting system wide bus lock rate limit to %u/sec\n", bld_ratelimit.burst); + break; + } +} + +void __init sld_setup(struct cpuinfo_x86 *c) +{ + split_lock_setup(c); + sld_state_setup(); + sld_state_show(); +} + +#define X86_HYBRID_CPU_TYPE_ID_SHIFT 24 + +/** + * get_this_hybrid_cpu_type() - Get the type of this hybrid CPU + * + * Returns the CPU type [31:24] (i.e., Atom or Core) of a CPU in + * a hybrid processor. If the processor is not hybrid, returns 0. + */ +u8 get_this_hybrid_cpu_type(void) +{ + if (!cpu_feature_enabled(X86_FEATURE_HYBRID_CPU)) + return 0; + + return cpuid_eax(0x0000001a) >> X86_HYBRID_CPU_TYPE_ID_SHIFT; +} diff --git a/arch/x86/kernel/cpu/intel_epb.c b/arch/x86/kernel/cpu/intel_epb.c index f4dd73396f28..fbaf12e43f41 100644 --- a/arch/x86/kernel/cpu/intel_epb.c +++ b/arch/x86/kernel/cpu/intel_epb.c @@ -16,6 +16,7 @@ #include <linux/syscore_ops.h> #include <linux/pm.h> +#include <asm/cpu_device_id.h> #include <asm/cpufeature.h> #include <asm/msr.h> @@ -58,6 +59,22 @@ static DEFINE_PER_CPU(u8, saved_epb); #define EPB_SAVED 0x10ULL #define MAX_EPB EPB_MASK +enum energy_perf_value_index { + EPB_INDEX_PERFORMANCE, + EPB_INDEX_BALANCE_PERFORMANCE, + EPB_INDEX_NORMAL, + EPB_INDEX_BALANCE_POWERSAVE, + EPB_INDEX_POWERSAVE, +}; + +static u8 energ_perf_values[] = { + [EPB_INDEX_PERFORMANCE] = ENERGY_PERF_BIAS_PERFORMANCE, + [EPB_INDEX_BALANCE_PERFORMANCE] = ENERGY_PERF_BIAS_BALANCE_PERFORMANCE, + [EPB_INDEX_NORMAL] = ENERGY_PERF_BIAS_NORMAL, + [EPB_INDEX_BALANCE_POWERSAVE] = ENERGY_PERF_BIAS_BALANCE_POWERSAVE, + [EPB_INDEX_POWERSAVE] = ENERGY_PERF_BIAS_POWERSAVE, +}; + static int intel_epb_save(void) { u64 
epb; @@ -90,7 +107,7 @@ static void intel_epb_restore(void) */ val = epb & EPB_MASK; if (val == ENERGY_PERF_BIAS_PERFORMANCE) { - val = ENERGY_PERF_BIAS_NORMAL; + val = energ_perf_values[EPB_INDEX_NORMAL]; pr_warn_once("ENERGY_PERF_BIAS: Set to 'normal', was 'performance'\n"); } } @@ -103,18 +120,11 @@ static struct syscore_ops intel_epb_syscore_ops = { }; static const char * const energy_perf_strings[] = { - "performance", - "balance-performance", - "normal", - "balance-power", - "power" -}; -static const u8 energ_perf_values[] = { - ENERGY_PERF_BIAS_PERFORMANCE, - ENERGY_PERF_BIAS_BALANCE_PERFORMANCE, - ENERGY_PERF_BIAS_NORMAL, - ENERGY_PERF_BIAS_BALANCE_POWERSAVE, - ENERGY_PERF_BIAS_POWERSAVE + [EPB_INDEX_PERFORMANCE] = "performance", + [EPB_INDEX_BALANCE_PERFORMANCE] = "balance-performance", + [EPB_INDEX_NORMAL] = "normal", + [EPB_INDEX_BALANCE_POWERSAVE] = "balance-power", + [EPB_INDEX_POWERSAVE] = "power", }; static ssize_t energy_perf_bias_show(struct device *dev, @@ -193,13 +203,22 @@ static int intel_epb_offline(unsigned int cpu) return 0; } +static const struct x86_cpu_id intel_epb_normal[] = { + X86_MATCH_INTEL_FAM6_MODEL(ALDERLAKE_L, 7), + {} +}; + static __init int intel_epb_init(void) { + const struct x86_cpu_id *id = x86_match_cpu(intel_epb_normal); int ret; if (!boot_cpu_has(X86_FEATURE_EPB)) return -ENODEV; + if (id) + energ_perf_values[EPB_INDEX_NORMAL] = id->driver_data; + ret = cpuhp_setup_state(CPUHP_AP_X86_INTEL_EPB_ONLINE, "x86/intel/epb:online", intel_epb_online, intel_epb_offline); diff --git a/arch/x86/kernel/cpu/match.c b/arch/x86/kernel/cpu/match.c index 6dd78d8235e4..ad6776081e60 100644 --- a/arch/x86/kernel/cpu/match.c +++ b/arch/x86/kernel/cpu/match.c @@ -16,12 +16,17 @@ * respective wildcard entries. 
* * A typical table entry would be to match a specific CPU - * { X86_VENDOR_INTEL, 6, 0x12 } - * or to match a specific CPU feature - * { X86_FEATURE_MATCH(X86_FEATURE_FOOBAR) } + * + * X86_MATCH_VENDOR_FAM_MODEL_FEATURE(INTEL, 6, INTEL_FAM6_BROADWELL, + * X86_FEATURE_ANY, NULL); * * Fields can be wildcarded with %X86_VENDOR_ANY, %X86_FAMILY_ANY, - * %X86_MODEL_ANY, %X86_FEATURE_ANY or 0 (except for vendor) + * %X86_MODEL_ANY, %X86_FEATURE_ANY (except for vendor) + * + * asm/cpu_device_id.h contains a set of useful macros which are shortcuts + * for various common selections. The above can be shortened to: + * + * X86_MATCH_INTEL_FAM6_MODEL(BROADWELL, NULL); * * Arrays used to match for this should also be declared using * MODULE_DEVICE_TABLE(x86cpu, ...) @@ -34,13 +39,18 @@ const struct x86_cpu_id *x86_match_cpu(const struct x86_cpu_id *match) const struct x86_cpu_id *m; struct cpuinfo_x86 *c = &boot_cpu_data; - for (m = match; m->vendor | m->family | m->model | m->feature; m++) { + for (m = match; + m->vendor | m->family | m->model | m->steppings | m->feature; + m++) { if (m->vendor != X86_VENDOR_ANY && c->x86_vendor != m->vendor) continue; if (m->family != X86_FAMILY_ANY && c->x86 != m->family) continue; if (m->model != X86_MODEL_ANY && c->x86_model != m->model) continue; + if (m->steppings != X86_STEPPING_ANY && + !(BIT(c->x86_stepping) & m->steppings)) + continue; if (m->feature != X86_FEATURE_ANY && !cpu_has(c, m->feature)) continue; return m; diff --git a/arch/x86/kernel/cpu/mce/Makefile b/arch/x86/kernel/cpu/mce/Makefile index 9f020c994154..015856abdbb1 100644 --- a/arch/x86/kernel/cpu/mce/Makefile +++ b/arch/x86/kernel/cpu/mce/Makefile @@ -9,8 +9,6 @@ obj-$(CONFIG_X86_MCE_THRESHOLD) += threshold.o mce-inject-y := inject.o obj-$(CONFIG_X86_MCE_INJECT) += mce-inject.o -obj-$(CONFIG_X86_THERMAL_VECTOR) += therm_throt.o - obj-$(CONFIG_ACPI_APEI) += apei.o obj-$(CONFIG_X86_MCELOG_LEGACY) += dev-mcelog.o diff --git a/arch/x86/kernel/cpu/mce/amd.c 
b/arch/x86/kernel/cpu/mce/amd.c index 52de616a8065..1c87501e0fa3 100644 --- a/arch/x86/kernel/cpu/mce/amd.c +++ b/arch/x86/kernel/cpu/mce/amd.c @@ -71,33 +71,58 @@ static const char * const smca_umc_block_names[] = { "misc_umc" }; +#define HWID_MCATYPE(hwid, mcatype) (((hwid) << 16) | (mcatype)) + +struct smca_hwid { + unsigned int bank_type; /* Use with smca_bank_types for easy indexing. */ + u32 hwid_mcatype; /* (hwid,mcatype) tuple */ +}; + +struct smca_bank { + const struct smca_hwid *hwid; + u32 id; /* Value of MCA_IPID[InstanceId]. */ + u8 sysfs_id; /* Value used for sysfs name. */ +}; + +static DEFINE_PER_CPU_READ_MOSTLY(struct smca_bank[MAX_NR_BANKS], smca_banks); +static DEFINE_PER_CPU_READ_MOSTLY(u8[N_SMCA_BANK_TYPES], smca_bank_counts); + struct smca_bank_name { const char *name; /* Short name for sysfs */ const char *long_name; /* Long name for pretty-printing */ }; static struct smca_bank_name smca_names[] = { - [SMCA_LS] = { "load_store", "Load Store Unit" }, - [SMCA_LS_V2] = { "load_store", "Load Store Unit" }, - [SMCA_IF] = { "insn_fetch", "Instruction Fetch Unit" }, - [SMCA_L2_CACHE] = { "l2_cache", "L2 Cache" }, - [SMCA_DE] = { "decode_unit", "Decode Unit" }, - [SMCA_RESERVED] = { "reserved", "Reserved" }, - [SMCA_EX] = { "execution_unit", "Execution Unit" }, - [SMCA_FP] = { "floating_point", "Floating Point Unit" }, - [SMCA_L3_CACHE] = { "l3_cache", "L3 Cache" }, - [SMCA_CS] = { "coherent_slave", "Coherent Slave" }, - [SMCA_CS_V2] = { "coherent_slave", "Coherent Slave" }, - [SMCA_PIE] = { "pie", "Power, Interrupts, etc." 
}, - [SMCA_UMC] = { "umc", "Unified Memory Controller" }, - [SMCA_PB] = { "param_block", "Parameter Block" }, - [SMCA_PSP] = { "psp", "Platform Security Processor" }, - [SMCA_PSP_V2] = { "psp", "Platform Security Processor" }, - [SMCA_SMU] = { "smu", "System Management Unit" }, - [SMCA_SMU_V2] = { "smu", "System Management Unit" }, - [SMCA_MP5] = { "mp5", "Microprocessor 5 Unit" }, - [SMCA_NBIO] = { "nbio", "Northbridge IO Unit" }, - [SMCA_PCIE] = { "pcie", "PCI Express Unit" }, + [SMCA_LS ... SMCA_LS_V2] = { "load_store", "Load Store Unit" }, + [SMCA_IF] = { "insn_fetch", "Instruction Fetch Unit" }, + [SMCA_L2_CACHE] = { "l2_cache", "L2 Cache" }, + [SMCA_DE] = { "decode_unit", "Decode Unit" }, + [SMCA_RESERVED] = { "reserved", "Reserved" }, + [SMCA_EX] = { "execution_unit", "Execution Unit" }, + [SMCA_FP] = { "floating_point", "Floating Point Unit" }, + [SMCA_L3_CACHE] = { "l3_cache", "L3 Cache" }, + [SMCA_CS ... SMCA_CS_V2] = { "coherent_slave", "Coherent Slave" }, + [SMCA_PIE] = { "pie", "Power, Interrupts, etc." }, + + /* UMC v2 is separate because both of them can exist in a single system. */ + [SMCA_UMC] = { "umc", "Unified Memory Controller" }, + [SMCA_UMC_V2] = { "umc_v2", "Unified Memory Controller v2" }, + [SMCA_PB] = { "param_block", "Parameter Block" }, + [SMCA_PSP ... SMCA_PSP_V2] = { "psp", "Platform Security Processor" }, + [SMCA_SMU ... SMCA_SMU_V2] = { "smu", "System Management Unit" }, + [SMCA_MP5] = { "mp5", "Microprocessor 5 Unit" }, + [SMCA_MPDMA] = { "mpdma", "MPDMA Unit" }, + [SMCA_NBIO] = { "nbio", "Northbridge IO Unit" }, + [SMCA_PCIE ... 
SMCA_PCIE_V2] = { "pcie", "PCI Express Unit" }, + [SMCA_XGMI_PCS] = { "xgmi_pcs", "Ext Global Memory Interconnect PCS Unit" }, + [SMCA_NBIF] = { "nbif", "NBIF Unit" }, + [SMCA_SHUB] = { "shub", "System Hub Unit" }, + [SMCA_SATA] = { "sata", "SATA Unit" }, + [SMCA_USB] = { "usb", "USB Unit" }, + [SMCA_GMI_PCS] = { "gmi_pcs", "Global Memory Interconnect PCS Unit" }, + [SMCA_XGMI_PHY] = { "xgmi_phy", "Ext Global Memory Interconnect PHY Unit" }, + [SMCA_WAFL_PHY] = { "wafl_phy", "WAFL PHY Unit" }, + [SMCA_GMI_PHY] = { "gmi_phy", "Global Memory Interconnect PHY Unit" }, }; static const char *smca_get_name(enum smca_bank_types t) @@ -117,69 +142,82 @@ const char *smca_get_long_name(enum smca_bank_types t) } EXPORT_SYMBOL_GPL(smca_get_long_name); -static enum smca_bank_types smca_get_bank_type(unsigned int bank) +enum smca_bank_types smca_get_bank_type(unsigned int cpu, unsigned int bank) { struct smca_bank *b; if (bank >= MAX_NR_BANKS) return N_SMCA_BANK_TYPES; - b = &smca_banks[bank]; + b = &per_cpu(smca_banks, cpu)[bank]; if (!b->hwid) return N_SMCA_BANK_TYPES; return b->hwid->bank_type; } +EXPORT_SYMBOL_GPL(smca_get_bank_type); -static struct smca_hwid smca_hwid_mcatypes[] = { - /* { bank_type, hwid_mcatype, xec_bitmap } */ +static const struct smca_hwid smca_hwid_mcatypes[] = { + /* { bank_type, hwid_mcatype } */ /* Reserved type */ - { SMCA_RESERVED, HWID_MCATYPE(0x00, 0x0), 0x0 }, + { SMCA_RESERVED, HWID_MCATYPE(0x00, 0x0) }, /* ZN Core (HWID=0xB0) MCA types */ - { SMCA_LS, HWID_MCATYPE(0xB0, 0x0), 0x1FFFFF }, - { SMCA_LS_V2, HWID_MCATYPE(0xB0, 0x10), 0xFFFFFF }, - { SMCA_IF, HWID_MCATYPE(0xB0, 0x1), 0x3FFF }, - { SMCA_L2_CACHE, HWID_MCATYPE(0xB0, 0x2), 0xF }, - { SMCA_DE, HWID_MCATYPE(0xB0, 0x3), 0x1FF }, + { SMCA_LS, HWID_MCATYPE(0xB0, 0x0) }, + { SMCA_LS_V2, HWID_MCATYPE(0xB0, 0x10) }, + { SMCA_IF, HWID_MCATYPE(0xB0, 0x1) }, + { SMCA_L2_CACHE, HWID_MCATYPE(0xB0, 0x2) }, + { SMCA_DE, HWID_MCATYPE(0xB0, 0x3) }, /* HWID 0xB0 MCATYPE 0x4 is Reserved */ - { SMCA_EX, 
HWID_MCATYPE(0xB0, 0x5), 0xFFF }, - { SMCA_FP, HWID_MCATYPE(0xB0, 0x6), 0x7F }, - { SMCA_L3_CACHE, HWID_MCATYPE(0xB0, 0x7), 0xFF }, + { SMCA_EX, HWID_MCATYPE(0xB0, 0x5) }, + { SMCA_FP, HWID_MCATYPE(0xB0, 0x6) }, + { SMCA_L3_CACHE, HWID_MCATYPE(0xB0, 0x7) }, /* Data Fabric MCA types */ - { SMCA_CS, HWID_MCATYPE(0x2E, 0x0), 0x1FF }, - { SMCA_PIE, HWID_MCATYPE(0x2E, 0x1), 0x1F }, - { SMCA_CS_V2, HWID_MCATYPE(0x2E, 0x2), 0x3FFF }, + { SMCA_CS, HWID_MCATYPE(0x2E, 0x0) }, + { SMCA_PIE, HWID_MCATYPE(0x2E, 0x1) }, + { SMCA_CS_V2, HWID_MCATYPE(0x2E, 0x2) }, /* Unified Memory Controller MCA type */ - { SMCA_UMC, HWID_MCATYPE(0x96, 0x0), 0xFF }, + { SMCA_UMC, HWID_MCATYPE(0x96, 0x0) }, + { SMCA_UMC_V2, HWID_MCATYPE(0x96, 0x1) }, /* Parameter Block MCA type */ - { SMCA_PB, HWID_MCATYPE(0x05, 0x0), 0x1 }, + { SMCA_PB, HWID_MCATYPE(0x05, 0x0) }, /* Platform Security Processor MCA type */ - { SMCA_PSP, HWID_MCATYPE(0xFF, 0x0), 0x1 }, - { SMCA_PSP_V2, HWID_MCATYPE(0xFF, 0x1), 0x3FFFF }, + { SMCA_PSP, HWID_MCATYPE(0xFF, 0x0) }, + { SMCA_PSP_V2, HWID_MCATYPE(0xFF, 0x1) }, /* System Management Unit MCA type */ - { SMCA_SMU, HWID_MCATYPE(0x01, 0x0), 0x1 }, - { SMCA_SMU_V2, HWID_MCATYPE(0x01, 0x1), 0x7FF }, + { SMCA_SMU, HWID_MCATYPE(0x01, 0x0) }, + { SMCA_SMU_V2, HWID_MCATYPE(0x01, 0x1) }, /* Microprocessor 5 Unit MCA type */ - { SMCA_MP5, HWID_MCATYPE(0x01, 0x2), 0x3FF }, + { SMCA_MP5, HWID_MCATYPE(0x01, 0x2) }, + + /* MPDMA MCA type */ + { SMCA_MPDMA, HWID_MCATYPE(0x01, 0x3) }, /* Northbridge IO Unit MCA type */ - { SMCA_NBIO, HWID_MCATYPE(0x18, 0x0), 0x1F }, + { SMCA_NBIO, HWID_MCATYPE(0x18, 0x0) }, /* PCI Express Unit MCA type */ - { SMCA_PCIE, HWID_MCATYPE(0x46, 0x0), 0x1F }, + { SMCA_PCIE, HWID_MCATYPE(0x46, 0x0) }, + { SMCA_PCIE_V2, HWID_MCATYPE(0x46, 0x1) }, + + { SMCA_XGMI_PCS, HWID_MCATYPE(0x50, 0x0) }, + { SMCA_NBIF, HWID_MCATYPE(0x6C, 0x0) }, + { SMCA_SHUB, HWID_MCATYPE(0x80, 0x0) }, + { SMCA_SATA, HWID_MCATYPE(0xA8, 0x0) }, + { SMCA_USB, HWID_MCATYPE(0xAA, 0x0) }, + { 
SMCA_GMI_PCS, HWID_MCATYPE(0x241, 0x0) }, + { SMCA_XGMI_PHY, HWID_MCATYPE(0x259, 0x0) }, + { SMCA_WAFL_PHY, HWID_MCATYPE(0x267, 0x0) }, + { SMCA_GMI_PHY, HWID_MCATYPE(0x269, 0x0) }, }; -struct smca_bank smca_banks[MAX_NR_BANKS]; -EXPORT_SYMBOL_GPL(smca_banks); - /* * In SMCA enabled processors, we can have multiple banks for a given IP type. * So to define a unique name for each bank, we use a temp c-string to append @@ -192,7 +230,12 @@ EXPORT_SYMBOL_GPL(smca_banks); static char buf_mcatype[MAX_MCATYPE_NAME_LEN]; static DEFINE_PER_CPU(struct threshold_bank **, threshold_banks); -static DEFINE_PER_CPU(unsigned int, bank_map); /* see which banks are on */ + +/* + * A list of the banks enabled on each logical CPU. Controls which respective + * descriptors to initialize later in mce_threshold_create_device(). + */ +static DEFINE_PER_CPU(unsigned int, bank_map); /* Map of banks that have more than MCA_MISC0 available. */ static DEFINE_PER_CPU(u32, smca_misc_banks_map); @@ -230,8 +273,9 @@ static void smca_set_misc_banks_map(unsigned int bank, unsigned int cpu) static void smca_configure(unsigned int bank, unsigned int cpu) { + u8 *bank_counts = this_cpu_ptr(smca_bank_counts); + const struct smca_hwid *s_hwid; unsigned int i, hwid_mcatype; - struct smca_hwid *s_hwid; u32 high, low; u32 smca_config = MSR_AMD64_SMCA_MCx_CONFIG(bank); @@ -267,10 +311,6 @@ static void smca_configure(unsigned int bank, unsigned int cpu) smca_set_misc_banks_map(bank, cpu); - /* Return early if this bank was already initialized. 
*/ - if (smca_banks[bank].hwid && smca_banks[bank].hwid->hwid_mcatype != 0) - return; - if (rdmsr_safe(MSR_AMD64_SMCA_MCx_IPID(bank), &low, &high)) { pr_warn("Failed to read MCA_IPID for bank %d\n", bank); return; @@ -281,10 +321,11 @@ static void smca_configure(unsigned int bank, unsigned int cpu) for (i = 0; i < ARRAY_SIZE(smca_hwid_mcatypes); i++) { s_hwid = &smca_hwid_mcatypes[i]; + if (hwid_mcatype == s_hwid->hwid_mcatype) { - smca_banks[bank].hwid = s_hwid; - smca_banks[bank].id = low; - smca_banks[bank].sysfs_id = s_hwid->count++; + this_cpu_ptr(smca_banks)[bank].hwid = s_hwid; + this_cpu_ptr(smca_banks)[bank].id = low; + this_cpu_ptr(smca_banks)[bank].sysfs_id = bank_counts[s_hwid->bank_type]++; break; } } @@ -381,6 +422,10 @@ static void threshold_restart_bank(void *_tr) struct thresh_restart *tr = _tr; u32 hi, lo; + /* sysfs write might race against an offline operation */ + if (!this_cpu_read(threshold_banks) && !tr->set_lvt_off) + return; + rdmsr(tr->b->address, lo, hi); if (tr->b->threshold_limit < (hi & THRESHOLD_MAX)) @@ -504,7 +549,7 @@ static u32 get_block_address(u32 current_addr, u32 low, u32 high, /* Fall back to method we used for older processors: */ switch (block) { case 0: - addr = msr_ops.misc(bank); + addr = mca_msr_reg(bank, MCA_MISC); break; case 1: offset = ((low & MASK_BLKPTR_LO) >> 21); @@ -566,16 +611,21 @@ out: bool amd_filter_mce(struct mce *m) { - enum smca_bank_types bank_type = smca_get_bank_type(m->bank); + enum smca_bank_types bank_type = smca_get_bank_type(m->extcpu, m->bank); struct cpuinfo_x86 *c = &boot_cpu_data; - u8 xec = (m->status >> 16) & 0x3F; /* See Family 17h Models 10h-2Fh Erratum #1114. */ if (c->x86 == 0x17 && c->x86_model >= 0x10 && c->x86_model <= 0x2F && - bank_type == SMCA_IF && xec == 10) + bank_type == SMCA_IF && XEC(m->status, 0x3f) == 10) return true; + /* NB GART TLB error reporting is disabled by default. 
*/ + if (c->x86 < 0x17) { + if (m->bank == 4 && XEC(m->status, 0x1f) == 0x5) + return true; + } + return false; } @@ -599,7 +649,7 @@ static void disable_err_thresholding(struct cpuinfo_x86 *c, unsigned int bank) } else if (c->x86 == 0x17 && (c->x86_model >= 0x10 && c->x86_model <= 0x2F)) { - if (smca_get_bank_type(bank) != SMCA_IF) + if (smca_get_bank_type(smp_processor_id(), bank) != SMCA_IF) return; msrs[0] = MSR_AMD64_SMCA_MCx_MISC(bank); @@ -661,213 +711,13 @@ void mce_amd_feature_init(struct cpuinfo_x86 *c) deferred_error_interrupt_enable(c); } -int umc_normaddr_to_sysaddr(u64 norm_addr, u16 nid, u8 umc, u64 *sys_addr) -{ - u64 dram_base_addr, dram_limit_addr, dram_hole_base; - /* We start from the normalized address */ - u64 ret_addr = norm_addr; - - u32 tmp; - - u8 die_id_shift, die_id_mask, socket_id_shift, socket_id_mask; - u8 intlv_num_dies, intlv_num_chan, intlv_num_sockets; - u8 intlv_addr_sel, intlv_addr_bit; - u8 num_intlv_bits, hashed_bit; - u8 lgcy_mmio_hole_en, base = 0; - u8 cs_mask, cs_id = 0; - bool hash_enabled = false; - - /* Read D18F0x1B4 (DramOffset), check if base 1 is used. */ - if (amd_df_indirect_read(nid, 0, 0x1B4, umc, &tmp)) - goto out_err; - - /* Remove HiAddrOffset from normalized address, if enabled: */ - if (tmp & BIT(0)) { - u64 hi_addr_offset = (tmp & GENMASK_ULL(31, 20)) << 8; - - if (norm_addr >= hi_addr_offset) { - ret_addr -= hi_addr_offset; - base = 1; - } - } - - /* Read D18F0x110 (DramBaseAddress). */ - if (amd_df_indirect_read(nid, 0, 0x110 + (8 * base), umc, &tmp)) - goto out_err; - - /* Check if address range is valid. 
*/ - if (!(tmp & BIT(0))) { - pr_err("%s: Invalid DramBaseAddress range: 0x%x.\n", - __func__, tmp); - goto out_err; - } - - lgcy_mmio_hole_en = tmp & BIT(1); - intlv_num_chan = (tmp >> 4) & 0xF; - intlv_addr_sel = (tmp >> 8) & 0x7; - dram_base_addr = (tmp & GENMASK_ULL(31, 12)) << 16; - - /* {0, 1, 2, 3} map to address bits {8, 9, 10, 11} respectively */ - if (intlv_addr_sel > 3) { - pr_err("%s: Invalid interleave address select %d.\n", - __func__, intlv_addr_sel); - goto out_err; - } - - /* Read D18F0x114 (DramLimitAddress). */ - if (amd_df_indirect_read(nid, 0, 0x114 + (8 * base), umc, &tmp)) - goto out_err; - - intlv_num_sockets = (tmp >> 8) & 0x1; - intlv_num_dies = (tmp >> 10) & 0x3; - dram_limit_addr = ((tmp & GENMASK_ULL(31, 12)) << 16) | GENMASK_ULL(27, 0); - - intlv_addr_bit = intlv_addr_sel + 8; - - /* Re-use intlv_num_chan by setting it equal to log2(#channels) */ - switch (intlv_num_chan) { - case 0: intlv_num_chan = 0; break; - case 1: intlv_num_chan = 1; break; - case 3: intlv_num_chan = 2; break; - case 5: intlv_num_chan = 3; break; - case 7: intlv_num_chan = 4; break; - - case 8: intlv_num_chan = 1; - hash_enabled = true; - break; - default: - pr_err("%s: Invalid number of interleaved channels %d.\n", - __func__, intlv_num_chan); - goto out_err; - } - - num_intlv_bits = intlv_num_chan; - - if (intlv_num_dies > 2) { - pr_err("%s: Invalid number of interleaved nodes/dies %d.\n", - __func__, intlv_num_dies); - goto out_err; - } - - num_intlv_bits += intlv_num_dies; - - /* Add a bit if sockets are interleaved. */ - num_intlv_bits += intlv_num_sockets; - - /* Assert num_intlv_bits <= 4 */ - if (num_intlv_bits > 4) { - pr_err("%s: Invalid interleave bits %d.\n", - __func__, num_intlv_bits); - goto out_err; - } - - if (num_intlv_bits > 0) { - u64 temp_addr_x, temp_addr_i, temp_addr_y; - u8 die_id_bit, sock_id_bit, cs_fabric_id; - - /* - * Read FabricBlockInstanceInformation3_CS[BlockFabricID]. - * This is the fabric id for this coherent slave. 
Use - * umc/channel# as instance id of the coherent slave - * for FICAA. - */ - if (amd_df_indirect_read(nid, 0, 0x50, umc, &tmp)) - goto out_err; - - cs_fabric_id = (tmp >> 8) & 0xFF; - die_id_bit = 0; - - /* If interleaved over more than 1 channel: */ - if (intlv_num_chan) { - die_id_bit = intlv_num_chan; - cs_mask = (1 << die_id_bit) - 1; - cs_id = cs_fabric_id & cs_mask; - } - - sock_id_bit = die_id_bit; - - /* Read D18F1x208 (SystemFabricIdMask). */ - if (intlv_num_dies || intlv_num_sockets) - if (amd_df_indirect_read(nid, 1, 0x208, umc, &tmp)) - goto out_err; - - /* If interleaved over more than 1 die. */ - if (intlv_num_dies) { - sock_id_bit = die_id_bit + intlv_num_dies; - die_id_shift = (tmp >> 24) & 0xF; - die_id_mask = (tmp >> 8) & 0xFF; - - cs_id |= ((cs_fabric_id & die_id_mask) >> die_id_shift) << die_id_bit; - } - - /* If interleaved over more than 1 socket. */ - if (intlv_num_sockets) { - socket_id_shift = (tmp >> 28) & 0xF; - socket_id_mask = (tmp >> 16) & 0xFF; - - cs_id |= ((cs_fabric_id & socket_id_mask) >> socket_id_shift) << sock_id_bit; - } - - /* - * The pre-interleaved address consists of XXXXXXIIIYYYYY - * where III is the ID for this CS, and XXXXXXYYYYY are the - * address bits from the post-interleaved address. - * "num_intlv_bits" has been calculated to tell us how many "I" - * bits there are. "intlv_addr_bit" tells us how many "Y" bits - * there are (where "I" starts). 
- */ - temp_addr_y = ret_addr & GENMASK_ULL(intlv_addr_bit-1, 0); - temp_addr_i = (cs_id << intlv_addr_bit); - temp_addr_x = (ret_addr & GENMASK_ULL(63, intlv_addr_bit)) << num_intlv_bits; - ret_addr = temp_addr_x | temp_addr_i | temp_addr_y; - } - - /* Add dram base address */ - ret_addr += dram_base_addr; - - /* If legacy MMIO hole enabled */ - if (lgcy_mmio_hole_en) { - if (amd_df_indirect_read(nid, 0, 0x104, umc, &tmp)) - goto out_err; - - dram_hole_base = tmp & GENMASK(31, 24); - if (ret_addr >= dram_hole_base) - ret_addr += (BIT_ULL(32) - dram_hole_base); - } - - if (hash_enabled) { - /* Save some parentheses and grab ls-bit at the end. */ - hashed_bit = (ret_addr >> 12) ^ - (ret_addr >> 18) ^ - (ret_addr >> 21) ^ - (ret_addr >> 30) ^ - cs_id; - - hashed_bit &= BIT(0); - - if (hashed_bit != ((ret_addr >> intlv_addr_bit) & BIT(0))) - ret_addr ^= BIT(intlv_addr_bit); - } - - /* Is calculated system address is above DRAM limit address? */ - if (ret_addr > dram_limit_addr) - goto out_err; - - *sys_addr = ret_addr; - return 0; - -out_err: - return -EINVAL; -} -EXPORT_SYMBOL_GPL(umc_normaddr_to_sysaddr); - bool amd_mce_is_memory_error(struct mce *m) { /* ErrCodeExt[20:16] */ u8 xec = (m->status >> 16) & 0x1f; if (mce_flags.smca) - return smca_get_bank_type(m->bank) == SMCA_UMC && xec == 0x0; + return smca_get_bank_type(m->extcpu, m->bank) == SMCA_UMC && xec == 0x0; return m->bank == 4 && xec == 0x8; } @@ -907,14 +757,13 @@ static void __log_error(unsigned int bank, u64 status, u64 addr, u64 misc) mce_log(&m); } -asmlinkage __visible void __irq_entry smp_deferred_error_interrupt(struct pt_regs *regs) +DEFINE_IDTENTRY_SYSVEC(sysvec_deferred_error) { - entering_irq(); trace_deferred_error_apic_entry(DEFERRED_ERROR_VECTOR); inc_irq_stat(irq_deferred_error_count); deferred_error_int_vector(); trace_deferred_error_apic_exit(DEFERRED_ERROR_VECTOR); - exiting_ack_irq(); + ack_APIC_irq(); } /* @@ -952,8 +801,8 @@ static void log_error_deferred(unsigned int bank) { bool 
defrd; - defrd = _log_error_bank(bank, msr_ops.status(bank), - msr_ops.addr(bank), 0); + defrd = _log_error_bank(bank, mca_msr_reg(bank, MCA_STATUS), + mca_msr_reg(bank, MCA_ADDR), 0); if (!mce_flags.smca) return; @@ -983,7 +832,7 @@ static void amd_deferred_error_interrupt(void) static void log_error_thresholding(unsigned int bank, u64 misc) { - _log_error_bank(bank, msr_ops.status(bank), msr_ops.addr(bank), misc); + _log_error_bank(bank, mca_msr_reg(bank, MCA_STATUS), mca_msr_reg(bank, MCA_ADDR), misc); } static void log_and_reset_block(struct threshold_block *block) @@ -1016,13 +865,22 @@ static void log_and_reset_block(struct threshold_block *block) static void amd_threshold_interrupt(void) { struct threshold_block *first_block = NULL, *block = NULL, *tmp = NULL; + struct threshold_bank **bp = this_cpu_read(threshold_banks); unsigned int bank, cpu = smp_processor_id(); + /* + * Validate that the threshold bank has been initialized already. The + * handler is installed at boot time, but on a hotplug event the + * interrupt might fire before the data has been initialized. 
+ */ + if (!bp) + return; + for (bank = 0; bank < this_cpu_read(mce_num_banks); ++bank) { if (!(per_cpu(bank_map, cpu) & (1 << bank))) continue; - first_block = per_cpu(threshold_banks, cpu)[bank]->blocks; + first_block = bp[bank]->blocks; if (!first_block) continue; @@ -1071,7 +929,8 @@ store_interrupt_enable(struct threshold_block *b, const char *buf, size_t size) memset(&tr, 0, sizeof(tr)); tr.b = b; - smp_call_function_single(b->cpu, threshold_restart_bank, &tr, 1); + if (smp_call_function_single(b->cpu, threshold_restart_bank, &tr, 1)) + return -ENODEV; return size; } @@ -1095,7 +954,8 @@ store_threshold_limit(struct threshold_block *b, const char *buf, size_t size) b->threshold_limit = new; tr.b = b; - smp_call_function_single(b->cpu, threshold_restart_bank, &tr, 1); + if (smp_call_function_single(b->cpu, threshold_restart_bank, &tr, 1)) + return -ENODEV; return size; } @@ -1104,7 +964,9 @@ static ssize_t show_error_count(struct threshold_block *b, char *buf) { u32 lo, hi; - rdmsr_on_cpu(b->cpu, b->address, &lo, &hi); + /* CPU might be offline by now */ + if (rdmsr_on_cpu(b->cpu, b->address, &lo, &hi)) + return -ENODEV; return sprintf(buf, "%u\n", ((hi & THRESHOLD_MAX) - (THRESHOLD_MAX - b->threshold_limit))); @@ -1131,6 +993,7 @@ static struct attribute *default_attrs[] = { NULL, /* possibly interrupt_enable if supported, see below */ NULL, }; +ATTRIBUTE_GROUPS(default); #define to_block(k) container_of(k, struct threshold_block, kobj) #define to_attr(a) container_of(a, struct threshold_attr, attr) @@ -1167,11 +1030,11 @@ static void threshold_block_release(struct kobject *kobj); static struct kobj_type threshold_ktype = { .sysfs_ops = &threshold_ops, - .default_attrs = default_attrs, + .default_groups = default_groups, .release = threshold_block_release, }; -static const char *get_name(unsigned int bank, struct threshold_block *b) +static const char *get_name(unsigned int cpu, unsigned int bank, struct threshold_block *b) { enum smca_bank_types bank_type; 
@@ -1182,7 +1045,7 @@ static const char *get_name(unsigned int bank, struct threshold_block *b) return th_names[bank]; } - bank_type = smca_get_bank_type(bank); + bank_type = smca_get_bank_type(cpu, bank); if (bank_type >= N_SMCA_BANK_TYPES) return NULL; @@ -1192,12 +1055,12 @@ static const char *get_name(unsigned int bank, struct threshold_block *b) return NULL; } - if (smca_banks[bank].hwid->count == 1) + if (per_cpu(smca_bank_counts, cpu)[bank_type] == 1) return smca_get_name(bank_type); snprintf(buf_mcatype, MAX_MCATYPE_NAME_LEN, - "%s_%x", smca_get_name(bank_type), - smca_banks[bank].sysfs_id); + "%s_%u", smca_get_name(bank_type), + per_cpu(smca_banks, cpu)[bank].sysfs_id); return buf_mcatype; } @@ -1209,10 +1072,10 @@ static int allocate_threshold_blocks(unsigned int cpu, struct threshold_bank *tb u32 low, high; int err; - if ((bank >= per_cpu(mce_num_banks, cpu)) || (block >= NR_BLOCKS)) + if ((bank >= this_cpu_read(mce_num_banks)) || (block >= NR_BLOCKS)) return 0; - if (rdmsr_safe_on_cpu(cpu, address, &low, &high)) + if (rdmsr_safe(address, &low, &high)) return 0; if (!(high & MASK_VALID_HI)) { @@ -1239,20 +1102,21 @@ static int allocate_threshold_blocks(unsigned int cpu, struct threshold_bank *tb b->threshold_limit = THRESHOLD_MAX; if (b->interrupt_capable) { - threshold_ktype.default_attrs[2] = &interrupt_enable.attr; + default_attrs[2] = &interrupt_enable.attr; b->interrupt_enable = 1; } else { - threshold_ktype.default_attrs[2] = NULL; + default_attrs[2] = NULL; } INIT_LIST_HEAD(&b->miscj); + /* This is safe as @tb is not visible yet */ if (tb->blocks) list_add(&b->miscj, &tb->blocks->miscj); else tb->blocks = b; - err = kobject_init_and_add(&b->kobj, &threshold_ktype, tb->kobj, get_name(bank, b)); + err = kobject_init_and_add(&b->kobj, &threshold_ktype, tb->kobj, get_name(cpu, bank, b)); if (err) goto out_free; recurse: @@ -1267,13 +1131,12 @@ recurse: if (b) kobject_uevent(&b->kobj, KOBJ_ADD); - return err; + return 0; out_free: if (b) { - 
kobject_put(&b->kobj); list_del(&b->miscj); - kfree(b); + kobject_put(&b->kobj); } return err; } @@ -1302,19 +1165,20 @@ static int __threshold_add_blocks(struct threshold_bank *b) return err; } -static int threshold_create_bank(unsigned int cpu, unsigned int bank) +static int threshold_create_bank(struct threshold_bank **bp, unsigned int cpu, + unsigned int bank) { - struct device *dev = per_cpu(mce_device, cpu); + struct device *dev = this_cpu_read(mce_device); struct amd_northbridge *nb = NULL; struct threshold_bank *b = NULL; - const char *name = get_name(bank, NULL); + const char *name = get_name(cpu, bank, NULL); int err = 0; if (!dev) return -ENODEV; if (is_shared_bank(bank)) { - nb = node_to_amd_nb(amd_get_nb_id(cpu)); + nb = node_to_amd_nb(topology_die_id(cpu)); /* threshold descriptor already initialized on this node? */ if (nb && nb->bank4) { @@ -1324,7 +1188,7 @@ static int threshold_create_bank(unsigned int cpu, unsigned int bank) if (err) goto out; - per_cpu(threshold_banks, cpu)[bank] = b; + bp[bank] = b; refcount_inc(&b->cpus); err = __threshold_add_blocks(b); @@ -1339,6 +1203,7 @@ static int threshold_create_bank(unsigned int cpu, unsigned int bank) goto out; } + /* Associate the bank with the per-CPU MCE device */ b->kobj = kobject_create_and_add(name, &dev->kobj); if (!b->kobj) { err = -EINVAL; @@ -1346,6 +1211,7 @@ static int threshold_create_bank(unsigned int cpu, unsigned int bank) } if (is_shared_bank(bank)) { + b->shared = 1; refcount_set(&b->cpus, 1); /* nb is already initialized, see above */ @@ -1355,18 +1221,18 @@ static int threshold_create_bank(unsigned int cpu, unsigned int bank) } } - err = allocate_threshold_blocks(cpu, b, bank, 0, msr_ops.misc(bank)); + err = allocate_threshold_blocks(cpu, b, bank, 0, mca_msr_reg(bank, MCA_MISC)); if (err) - goto out_free; - - per_cpu(threshold_banks, cpu)[bank] = b; + goto out_kobj; + bp[bank] = b; return 0; - out_free: +out_kobj: + kobject_put(b->kobj); +out_free: kfree(b); - - out: +out: return 
err; } @@ -1375,21 +1241,16 @@ static void threshold_block_release(struct kobject *kobj) kfree(to_block(kobj)); } -static void deallocate_threshold_block(unsigned int cpu, unsigned int bank) +static void deallocate_threshold_blocks(struct threshold_bank *bank) { - struct threshold_block *pos = NULL; - struct threshold_block *tmp = NULL; - struct threshold_bank *head = per_cpu(threshold_banks, cpu)[bank]; + struct threshold_block *pos, *tmp; - if (!head) - return; - - list_for_each_entry_safe(pos, tmp, &head->blocks->miscj, miscj) { + list_for_each_entry_safe(pos, tmp, &bank->blocks->miscj, miscj) { list_del(&pos->miscj); kobject_put(&pos->kobj); } - kobject_put(&head->blocks->kobj); + kobject_put(&bank->blocks->kobj); } static void __threshold_remove_blocks(struct threshold_bank *b) @@ -1403,122 +1264,108 @@ static void __threshold_remove_blocks(struct threshold_bank *b) kobject_del(&pos->kobj); } -static void threshold_remove_bank(unsigned int cpu, int bank) +static void threshold_remove_bank(struct threshold_bank *bank) { struct amd_northbridge *nb; - struct threshold_bank *b; - b = per_cpu(threshold_banks, cpu)[bank]; - if (!b) - return; + if (!bank->blocks) + goto out_free; - if (!b->blocks) - goto free_out; + if (!bank->shared) + goto out_dealloc; - if (is_shared_bank(bank)) { - if (!refcount_dec_and_test(&b->cpus)) { - __threshold_remove_blocks(b); - per_cpu(threshold_banks, cpu)[bank] = NULL; - return; - } else { - /* - * the last CPU on this node using the shared bank is - * going away, remove that bank now. - */ - nb = node_to_amd_nb(amd_get_nb_id(cpu)); - nb->bank4 = NULL; - } + if (!refcount_dec_and_test(&bank->cpus)) { + __threshold_remove_blocks(bank); + return; + } else { + /* + * The last CPU on this node using the shared bank is going + * away, remove that bank now. 
+ */ + nb = node_to_amd_nb(topology_die_id(smp_processor_id())); + nb->bank4 = NULL; } - deallocate_threshold_block(cpu, bank); +out_dealloc: + deallocate_threshold_blocks(bank); -free_out: - kobject_del(b->kobj); - kobject_put(b->kobj); - kfree(b); - per_cpu(threshold_banks, cpu)[bank] = NULL; +out_free: + kobject_put(bank->kobj); + kfree(bank); } -int mce_threshold_remove_device(unsigned int cpu) +static void __threshold_remove_device(struct threshold_bank **bp) { - unsigned int bank; + unsigned int bank, numbanks = this_cpu_read(mce_num_banks); - for (bank = 0; bank < per_cpu(mce_num_banks, cpu); ++bank) { - if (!(per_cpu(bank_map, cpu) & (1 << bank))) + for (bank = 0; bank < numbanks; bank++) { + if (!bp[bank]) continue; - threshold_remove_bank(cpu, bank); + + threshold_remove_bank(bp[bank]); + bp[bank] = NULL; } - kfree(per_cpu(threshold_banks, cpu)); - per_cpu(threshold_banks, cpu) = NULL; + kfree(bp); +} + +int mce_threshold_remove_device(unsigned int cpu) +{ + struct threshold_bank **bp = this_cpu_read(threshold_banks); + + if (!bp) + return 0; + + /* + * Clear the pointer before cleaning up, so that the interrupt won't + * touch anything of this. + */ + this_cpu_write(threshold_banks, NULL); + + __threshold_remove_device(bp); return 0; } -/* create dir/files for all valid threshold banks */ +/** + * mce_threshold_create_device - Create the per-CPU MCE threshold device + * @cpu: The plugged in CPU + * + * Create directories and files for all valid threshold banks. + * + * This is invoked from the CPU hotplug callback which was installed in + * mcheck_init_device(). The invocation happens in context of the hotplug + * thread running on @cpu. The callback is invoked on all CPUs which are + * online when the callback is installed or during a real hotplug event. 
+ */ int mce_threshold_create_device(unsigned int cpu) { - unsigned int bank; + unsigned int numbanks, bank; struct threshold_bank **bp; - int err = 0; + int err; + + if (!mce_flags.amd_threshold) + return 0; - bp = per_cpu(threshold_banks, cpu); + bp = this_cpu_read(threshold_banks); if (bp) return 0; - bp = kcalloc(per_cpu(mce_num_banks, cpu), sizeof(struct threshold_bank *), - GFP_KERNEL); + numbanks = this_cpu_read(mce_num_banks); + bp = kcalloc(numbanks, sizeof(*bp), GFP_KERNEL); if (!bp) return -ENOMEM; - per_cpu(threshold_banks, cpu) = bp; - - for (bank = 0; bank < per_cpu(mce_num_banks, cpu); ++bank) { - if (!(per_cpu(bank_map, cpu) & (1 << bank))) + for (bank = 0; bank < numbanks; ++bank) { + if (!(this_cpu_read(bank_map) & (1 << bank))) continue; - err = threshold_create_bank(cpu, bank); - if (err) - goto err; - } - return err; -err: - mce_threshold_remove_device(cpu); - return err; -} - -static __init int threshold_init_device(void) -{ - unsigned lcpu = 0; - - /* to hit CPUs online before the notifier is up */ - for_each_online_cpu(lcpu) { - int err = mce_threshold_create_device(lcpu); - - if (err) + err = threshold_create_bank(bp, cpu, bank); + if (err) { + __threshold_remove_device(bp); return err; + } } + this_cpu_write(threshold_banks, bp); if (thresholding_irq_en) mce_threshold_vector = amd_threshold_interrupt; - return 0; } -/* - * there are 3 funcs which need to be _initcalled in a logic sequence: - * 1. xen_late_init_mcelog - * 2. mcheck_init_device - * 3. threshold_init_device - * - * xen_late_init_mcelog must register xen_mce_chrdev_device before - * native mce_chrdev_device registration if running under xen platform; - * - * mcheck_init_device should be inited before threshold_init_device to - * initialize mce_device, otherwise a NULL ptr dereference will cause panic. - * - * so we use following _initcalls - * 1. device_initcall(xen_late_init_mcelog); - * 2. device_initcall_sync(mcheck_init_device); - * 3. 
late_initcall(threshold_init_device); - * - * when running under xen, the initcall order is 1,2,3; - * on baremetal, we skip 1 and we do only 2 and 3. - */ -late_initcall(threshold_init_device); diff --git a/arch/x86/kernel/cpu/mce/apei.c b/arch/x86/kernel/cpu/mce/apei.c index af8d37962586..8ed341714686 100644 --- a/arch/x86/kernel/cpu/mce/apei.c +++ b/arch/x86/kernel/cpu/mce/apei.c @@ -29,14 +29,26 @@ void apei_mce_report_mem_error(int severity, struct cper_sec_mem_err *mem_err) { struct mce m; + int lsb; if (!(mem_err->validation_bits & CPER_MEM_VALID_PA)) return; + /* + * Even if the ->validation_bits are set for address mask, + * to be extra safe, check and reject an error radius '0', + * and fall back to the default page size. + */ + if (mem_err->validation_bits & CPER_MEM_VALID_PA_MASK) + lsb = find_first_bit((void *)&mem_err->physical_addr_mask, PAGE_SHIFT); + else + lsb = PAGE_SHIFT; + mce_setup(&m); m.bank = -1; /* Fake a memory read error with unknown channel */ - m.status = MCI_STATUS_VAL | MCI_STATUS_EN | MCI_STATUS_ADDRV | 0x9f; + m.status = MCI_STATUS_VAL | MCI_STATUS_EN | MCI_STATUS_ADDRV | MCI_STATUS_MISCV | 0x9f; + m.misc = (MCI_MISC_ADDR_PHYS << 6) | lsb; if (severity >= GHES_SEV_RECOVERABLE) m.status |= MCI_STATUS_UC; @@ -51,6 +63,67 @@ void apei_mce_report_mem_error(int severity, struct cper_sec_mem_err *mem_err) } EXPORT_SYMBOL_GPL(apei_mce_report_mem_error); +int apei_smca_report_x86_error(struct cper_ia_proc_ctx *ctx_info, u64 lapic_id) +{ + const u64 *i_mce = ((const u64 *) (ctx_info + 1)); + unsigned int cpu; + struct mce m; + + if (!boot_cpu_has(X86_FEATURE_SMCA)) + return -EINVAL; + + /* + * The starting address of the register array extracted from BERT must + * match with the first expected register in the register layout of + * SMCA address space. This address corresponds to banks's MCA_STATUS + * register. + * + * Match any MCi_STATUS register by turning off bank numbers. 
+ */ + if ((ctx_info->msr_addr & MSR_AMD64_SMCA_MC0_STATUS) != + MSR_AMD64_SMCA_MC0_STATUS) + return -EINVAL; + + /* + * The register array size must be large enough to include all the + * SMCA registers which need to be extracted. + * + * The number of registers in the register array is determined by + * Register Array Size/8 as defined in UEFI spec v2.8, sec N.2.4.2.2. + * The register layout is fixed and currently the raw data in the + * register array includes 6 SMCA registers which the kernel can + * extract. + */ + if (ctx_info->reg_arr_size < 48) + return -EINVAL; + + mce_setup(&m); + + m.extcpu = -1; + m.socketid = -1; + + for_each_possible_cpu(cpu) { + if (cpu_data(cpu).initial_apicid == lapic_id) { + m.extcpu = cpu; + m.socketid = cpu_data(m.extcpu).phys_proc_id; + break; + } + } + + m.apicid = lapic_id; + m.bank = (ctx_info->msr_addr >> 4) & 0xFF; + m.status = *i_mce; + m.addr = *(i_mce + 1); + m.misc = *(i_mce + 2); + /* Skipping MCA_CONFIG */ + m.ipid = *(i_mce + 4); + m.synd = *(i_mce + 5); + + mce_log(&m); + + return 0; +} + #define CPER_CREATOR_MCE \ GUID_INIT(0x75a574e3, 0x5052, 0x4b29, 0x8a, 0x8e, 0xbe, 0x2c, \ 0x64, 0x90, 0xb8, 0x9d) @@ -115,16 +188,14 @@ retry: /* no more record */ if (*record_id == APEI_ERST_INVALID_RECORD_ID) goto out; - rc = erst_read(*record_id, &rcd.hdr, sizeof(rcd)); + rc = erst_read_record(*record_id, &rcd.hdr, sizeof(rcd), sizeof(rcd), + &CPER_CREATOR_MCE); /* someone else has cleared the record, try next one */ if (rc == -ENOENT) goto retry; else if (rc < 0) goto out; - /* try to skip other type records in storage */ - else if (rc != sizeof(rcd) || - !guid_equal(&rcd.hdr.creator_id, &CPER_CREATOR_MCE)) - goto retry; + memcpy(m, &rcd.mce, sizeof(*m)); rc = sizeof(*m); out: diff --git a/arch/x86/kernel/cpu/mce/core.c b/arch/x86/kernel/cpu/mce/core.c index 2c4f949611e4..2c8ec5c71712 100644 --- a/arch/x86/kernel/cpu/mce/core.c +++ b/arch/x86/kernel/cpu/mce/core.c @@ -40,8 +40,10 @@ #include <linux/debugfs.h> #include 
<linux/irq_work.h> #include <linux/export.h> -#include <linux/jump_label.h> #include <linux/set_memory.h> +#include <linux/sync_core.h> +#include <linux/task_work.h> +#include <linux/hardirq.h> #include <asm/intel-family.h> #include <asm/processor.h> @@ -67,7 +69,9 @@ DEFINE_PER_CPU_READ_MOSTLY(unsigned int, mce_num_banks); struct mce_bank { u64 ctl; /* subevents to enable */ - bool init; /* initialise bank? */ + + __u64 init : 1, /* initialise bank? */ + __reserved_1 : 63; }; static DEFINE_PER_CPU_READ_MOSTLY(struct mce_bank[MAX_NR_BANKS], mce_banks_array); @@ -84,20 +88,11 @@ struct mce_vendor_flags mce_flags __read_mostly; struct mca_config mca_cfg __read_mostly = { .bootlog = -1, - /* - * Tolerant levels: - * 0: always panic on uncorrected errors, log corrected errors - * 1: panic or SIGBUS on uncorrected errors, log corrected errors - * 2: SIGBUS or log uncorrected errors (if possible), log corr. errors - * 3: never panic or SIGBUS, log all errors (for testing only) - */ - .tolerant = 1, .monarch_timeout = -1 }; static DEFINE_PER_CPU(struct mce, mces_seen); static unsigned long mce_need_notify; -static int cpu_missing; /* * MCA banks polled by the period polling timer for corrected events. @@ -119,8 +114,6 @@ mce_banks_t mce_banks_ce_disabled; static struct work_struct mce_work; static struct irq_work mce_irq_work; -static void (*quirk_no_way_out)(int bank, struct mce *m, struct pt_regs *regs); - /* * CPU/chipset specific EDAC code can register a notifier call here to print * MCE errors in a human-readable form. 
@@ -138,11 +131,8 @@ void mce_setup(struct mce *m) m->cpuid = cpuid_eax(1); m->socketid = cpu_data(m->extcpu).phys_proc_id; m->apicid = cpu_data(m->extcpu).initial_apicid; - rdmsrl(MSR_IA32_MCG_CAP, m->mcgcap); - - if (this_cpu_has(X86_FEATURE_INTEL_PPIN)) - rdmsrl(MSR_PPIN, m->ppin); - + m->mcgcap = __rdmsr(MSR_IA32_MCG_CAP); + m->ppin = cpu_data(m->extcpu).ppin; m->microcode = boot_cpu_data.microcode; } @@ -156,80 +146,22 @@ void mce_log(struct mce *m) } EXPORT_SYMBOL_GPL(mce_log); -/* - * We run the default notifier if we have only the UC, the first and the - * default notifier registered. I.e., the mandatory NUM_DEFAULT_NOTIFIERS - * notifiers registered on the chain. - */ -#define NUM_DEFAULT_NOTIFIERS 3 -static atomic_t num_notifiers; - void mce_register_decode_chain(struct notifier_block *nb) { - if (WARN_ON(nb->priority > MCE_PRIO_MCELOG && nb->priority < MCE_PRIO_EDAC)) + if (WARN_ON(nb->priority < MCE_PRIO_LOWEST || + nb->priority > MCE_PRIO_HIGHEST)) return; - atomic_inc(&num_notifiers); - blocking_notifier_chain_register(&x86_mce_decoder_chain, nb); } EXPORT_SYMBOL_GPL(mce_register_decode_chain); void mce_unregister_decode_chain(struct notifier_block *nb) { - atomic_dec(&num_notifiers); - blocking_notifier_chain_unregister(&x86_mce_decoder_chain, nb); } EXPORT_SYMBOL_GPL(mce_unregister_decode_chain); -static inline u32 ctl_reg(int bank) -{ - return MSR_IA32_MCx_CTL(bank); -} - -static inline u32 status_reg(int bank) -{ - return MSR_IA32_MCx_STATUS(bank); -} - -static inline u32 addr_reg(int bank) -{ - return MSR_IA32_MCx_ADDR(bank); -} - -static inline u32 misc_reg(int bank) -{ - return MSR_IA32_MCx_MISC(bank); -} - -static inline u32 smca_ctl_reg(int bank) -{ - return MSR_AMD64_SMCA_MCx_CTL(bank); -} - -static inline u32 smca_status_reg(int bank) -{ - return MSR_AMD64_SMCA_MCx_STATUS(bank); -} - -static inline u32 smca_addr_reg(int bank) -{ - return MSR_AMD64_SMCA_MCx_ADDR(bank); -} - -static inline u32 smca_misc_reg(int bank) -{ - return 
MSR_AMD64_SMCA_MCx_MISC(bank); -} - -struct mca_msr_regs msr_ops = { - .ctl = ctl_reg, - .status = status_reg, - .addr = addr_reg, - .misc = misc_reg -}; - static void __print_mce(struct mce *m) { pr_emerg(HW_ERR "CPU %d: Machine Check%s: %Lx Bank %d: %016Lx\n", @@ -252,6 +184,8 @@ static void __print_mce(struct mce *m) pr_cont("ADDR %llx ", m->addr); if (m->misc) pr_cont("MISC %llx ", m->misc); + if (m->ppin) + pr_cont("PPIN %llx ", m->ppin); if (mce_flags.smca) { if (m->synd) @@ -261,6 +195,7 @@ static void __print_mce(struct mce *m) } pr_cont("\n"); + /* * Note this output is parsed by external tools and old fields * should not be changed. @@ -299,11 +234,17 @@ static void wait_for_panic(void) panic("Panicing machine check CPU died"); } -static void mce_panic(const char *msg, struct mce *final, char *exp) +static noinstr void mce_panic(const char *msg, struct mce *final, char *exp) { - int apei_err = 0; struct llist_node *pending; struct mce_evt_llist *l; + int apei_err = 0; + + /* + * Allow instrumentation around external facilities usage. Not that it + * matters a whole lot since the machine is going to panic anyway. 
+ */ + instrumentation_begin(); if (!fake_panic) { /* @@ -318,7 +259,7 @@ static void mce_panic(const char *msg, struct mce *final, char *exp) } else { /* Don't log too much for fake panic */ if (atomic_inc_return(&mce_fake_panicked) > 1) - return; + goto out; } pending = mce_gen_pool_prepare_records(); /* First print corrected ones that are still unlogged */ @@ -346,8 +287,6 @@ static void mce_panic(const char *msg, struct mce *final, char *exp) if (!apei_err) apei_err = apei_write_mce(final); } - if (cpu_missing) - pr_emerg(HW_ERR "Some CPUs didn't answer in synchronization\n"); if (exp) pr_emerg(HW_ERR "Machine check: %s\n", exp); if (!fake_panic) { @@ -356,6 +295,9 @@ static void mce_panic(const char *msg, struct mce *final, char *exp) panic(msg); } else pr_emerg(HW_ERR "Fake kernel panic: %s\n", msg); + +out: + instrumentation_end(); } /* Support code for software error injection */ @@ -366,53 +308,98 @@ static int msr_to_offset(u32 msr) if (msr == mca_cfg.rip_msr) return offsetof(struct mce, ip); - if (msr == msr_ops.status(bank)) + if (msr == mca_msr_reg(bank, MCA_STATUS)) return offsetof(struct mce, status); - if (msr == msr_ops.addr(bank)) + if (msr == mca_msr_reg(bank, MCA_ADDR)) return offsetof(struct mce, addr); - if (msr == msr_ops.misc(bank)) + if (msr == mca_msr_reg(bank, MCA_MISC)) return offsetof(struct mce, misc); if (msr == MSR_IA32_MCG_STATUS) return offsetof(struct mce, mcgstatus); return -1; } +void ex_handler_msr_mce(struct pt_regs *regs, bool wrmsr) +{ + if (wrmsr) { + pr_emerg("MSR access error: WRMSR to 0x%x (tried to write 0x%08x%08x) at rIP: 0x%lx (%pS)\n", + (unsigned int)regs->cx, (unsigned int)regs->dx, (unsigned int)regs->ax, + regs->ip, (void *)regs->ip); + } else { + pr_emerg("MSR access error: RDMSR from 0x%x at rIP: 0x%lx (%pS)\n", + (unsigned int)regs->cx, regs->ip, (void *)regs->ip); + } + + show_stack_regs(regs); + + panic("MCA architectural violation!\n"); + + while (true) + cpu_relax(); +} + /* MSR access wrappers used for 
error injection */ -static u64 mce_rdmsrl(u32 msr) +noinstr u64 mce_rdmsrl(u32 msr) { - u64 v; + DECLARE_ARGS(val, low, high); if (__this_cpu_read(injectm.finished)) { - int offset = msr_to_offset(msr); + int offset; + u64 ret; + + instrumentation_begin(); + offset = msr_to_offset(msr); if (offset < 0) - return 0; - return *(u64 *)((char *)this_cpu_ptr(&injectm) + offset); - } + ret = 0; + else + ret = *(u64 *)((char *)this_cpu_ptr(&injectm) + offset); - if (rdmsrl_safe(msr, &v)) { - WARN_ONCE(1, "mce: Unable to read MSR 0x%x!\n", msr); - /* - * Return zero in case the access faulted. This should - * not happen normally but can happen if the CPU does - * something weird, or if the code is buggy. - */ - v = 0; + instrumentation_end(); + + return ret; } - return v; + /* + * RDMSR on MCA MSRs should not fault. If they do, this is very much an + * architectural violation and needs to be reported to hw vendor. Panic + * the box to not allow any further progress. + */ + asm volatile("1: rdmsr\n" + "2:\n" + _ASM_EXTABLE_TYPE(1b, 2b, EX_TYPE_RDMSR_IN_MCE) + : EAX_EDX_RET(val, low, high) : "c" (msr)); + + + return EAX_EDX_VAL(val, low, high); } -static void mce_wrmsrl(u32 msr, u64 v) +static noinstr void mce_wrmsrl(u32 msr, u64 v) { + u32 low, high; + if (__this_cpu_read(injectm.finished)) { - int offset = msr_to_offset(msr); + int offset; + instrumentation_begin(); + + offset = msr_to_offset(msr); if (offset >= 0) *(u64 *)((char *)this_cpu_ptr(&injectm) + offset) = v; + + instrumentation_end(); + return; } - wrmsrl(msr, v); + + low = (u32)v; + high = (u32)(v >> 32); + + /* See comment in mce_rdmsrl() */ + asm volatile("1: wrmsr\n" + "2:\n" + _ASM_EXTABLE_TYPE(1b, 2b, EX_TYPE_WRMSR_IN_MCE) + : : "c" (msr), "a"(low), "d" (high) : "memory"); } /* @@ -420,9 +407,15 @@ static void mce_wrmsrl(u32 msr, u64 v) * check into our "mce" struct so that we can use it later to assess * the severity of the problem as we read per-bank specific details. 
*/ -static inline void mce_gather_info(struct mce *m, struct pt_regs *regs) +static noinstr void mce_gather_info(struct mce *m, struct pt_regs *regs) { + /* + * Enable instrumentation around mce_setup() which calls external + * facilities. + */ + instrumentation_begin(); mce_setup(m); + instrumentation_end(); m->mcgstatus = mce_rdmsrl(MSR_IA32_MCG_STATUS); if (regs) { @@ -470,7 +463,7 @@ static void mce_irq_work_cb(struct irq_work *entry) * Check if the address reported by the CPU is in a format we can parse. * It would be possible to add code for most other cases, but all would * be somewhat complicated (e.g. segment offset would require an instruction - * parser). So only support physical addresses up to page granuality for now. + * parser). So only support physical addresses up to page granularity for now. */ int mce_usable_address(struct mce *m) { @@ -527,6 +520,14 @@ bool mce_is_memory_error(struct mce *m) } EXPORT_SYMBOL_GPL(mce_is_memory_error); +static bool whole_page(struct mce *m) +{ + if (!mca_cfg.ser || !(m->status & MCI_STATUS_MISCV)) + return true; + + return MCI_MISC_ADDR_LSB(m->misc) >= PAGE_SHIFT; +} + bool mce_is_correctable(struct mce *m) { if (m->cpuvendor == X86_VENDOR_AMD && m->status & MCI_STATUS_DEFERRED) @@ -542,22 +543,7 @@ bool mce_is_correctable(struct mce *m) } EXPORT_SYMBOL_GPL(mce_is_correctable); -static bool cec_add_mce(struct mce *m) -{ - if (!m) - return false; - - /* We eat only correctable DRAM errors with usable addresses. 
*/ - if (mce_is_memory_error(m) && - mce_is_correctable(m) && - mce_usable_address(m)) - if (!cec_add_elem(m->addr >> PAGE_SHIFT)) - return true; - - return false; -} - -static int mce_first_notifier(struct notifier_block *nb, unsigned long val, +static int mce_early_notifier(struct notifier_block *nb, unsigned long val, void *data) { struct mce *m = (struct mce *)data; @@ -565,9 +551,6 @@ static int mce_first_notifier(struct notifier_block *nb, unsigned long val, if (!m) return NOTIFY_DONE; - if (cec_add_mce(m)) - return NOTIFY_STOP; - /* Emit the trace record: */ trace_mce_record(m); @@ -578,9 +561,9 @@ static int mce_first_notifier(struct notifier_block *nb, unsigned long val, return NOTIFY_DONE; } -static struct notifier_block first_nb = { - .notifier_call = mce_first_notifier, - .priority = MCE_PRIO_FIRST, +static struct notifier_block early_nb = { + .notifier_call = mce_early_notifier, + .priority = MCE_PRIO_EARLY, }; static int uc_decode_notifier(struct notifier_block *nb, unsigned long val, @@ -597,8 +580,10 @@ static int uc_decode_notifier(struct notifier_block *nb, unsigned long val, return NOTIFY_DONE; pfn = mce->addr >> PAGE_SHIFT; - if (!memory_failure(pfn, 0)) + if (!memory_failure(pfn, 0)) { set_mce_nospec(pfn); + mce->kflags |= MCE_HANDLED_UC; + } return NOTIFY_OK; } @@ -616,10 +601,8 @@ static int mce_default_notifier(struct notifier_block *nb, unsigned long val, if (!m) return NOTIFY_DONE; - if (atomic_read(&num_notifiers) > NUM_DEFAULT_NOTIFIERS) - return NOTIFY_DONE; - - __print_mce(m); + if (mca_cfg.print_all || !m->kflags) + __print_mce(m); return NOTIFY_DONE; } @@ -633,13 +616,13 @@ static struct notifier_block mce_default_nb = { /* * Read ADDR and MISC registers. 
*/ -static void mce_read_aux(struct mce *m, int i) +static noinstr void mce_read_aux(struct mce *m, int i) { if (m->status & MCI_STATUS_MISCV) - m->misc = mce_rdmsrl(msr_ops.misc(i)); + m->misc = mce_rdmsrl(mca_msr_reg(i, MCA_MISC)); if (m->status & MCI_STATUS_ADDRV) { - m->addr = mce_rdmsrl(msr_ops.addr(i)); + m->addr = mce_rdmsrl(mca_msr_reg(i, MCA_ADDR)); /* * Mask the reported address by the reported granularity. @@ -709,7 +692,7 @@ bool machine_check_poll(enum mcp_flags flags, mce_banks_t *b) m.bank = i; barrier(); - m.status = mce_rdmsrl(msr_ops.status(i)); + m.status = mce_rdmsrl(mca_msr_reg(i, MCA_STATUS)); /* If this entry is not valid, ignore it */ if (!(m.status & MCI_STATUS_VAL)) @@ -759,7 +742,7 @@ log_it: goto clear_it; mce_read_aux(&m, i); - m.severity = mce_severity(&m, mca_cfg.tolerant, NULL, false); + m.severity = mce_severity(&m, NULL, NULL, false); /* * Don't get the IP here because it's unlikely to * have anything to do with the actual error location. @@ -768,13 +751,16 @@ log_it: if (mca_cfg.dont_log_ce && !mce_usable_address(&m)) goto clear_it; - mce_log(&m); + if (flags & MCP_QUEUE_LOG) + mce_gen_pool_add(&m); + else + mce_log(&m); clear_it: /* * Clear state for this bank. */ - mce_wrmsrl(msr_ops.status(i), 0); + mce_wrmsrl(mca_msr_reg(i, MCA_STATUS), 0); } /* @@ -789,26 +775,108 @@ clear_it: EXPORT_SYMBOL_GPL(machine_check_poll); /* + * During IFU recovery Sandy Bridge -EP4S processors set the RIPV and + * EIPV bits in MCG_STATUS to zero on the affected logical processor (SDM + * Vol 3B Table 15-20). But this confuses both the code that determines + * whether the machine check occurred in kernel or user mode, and also + * the severity assessment code. Pretend that EIPV was set, and take the + * ip/cs values from the pt_regs that mce_gather_info() ignored earlier. 
+ */ +static __always_inline void +quirk_sandybridge_ifu(int bank, struct mce *m, struct pt_regs *regs) +{ + if (bank != 0) + return; + if ((m->mcgstatus & (MCG_STATUS_EIPV|MCG_STATUS_RIPV)) != 0) + return; + if ((m->status & (MCI_STATUS_OVER|MCI_STATUS_UC| + MCI_STATUS_EN|MCI_STATUS_MISCV|MCI_STATUS_ADDRV| + MCI_STATUS_PCC|MCI_STATUS_S|MCI_STATUS_AR| + MCACOD)) != + (MCI_STATUS_UC|MCI_STATUS_EN| + MCI_STATUS_MISCV|MCI_STATUS_ADDRV|MCI_STATUS_S| + MCI_STATUS_AR|MCACOD_INSTR)) + return; + + m->mcgstatus |= MCG_STATUS_EIPV; + m->ip = regs->ip; + m->cs = regs->cs; +} + +/* + * Disable fast string copy and return from the MCE handler upon the first SRAR + * MCE on bank 1 due to a CPU erratum on Intel Skylake/Cascade Lake/Cooper Lake + * CPUs. + * The fast string copy instructions ("REP; MOVS*") could consume an + * uncorrectable memory error in the cache line _right after_ the desired region + * to copy and raise an MCE with RIP pointing to the instruction _after_ the + * "REP; MOVS*". + * This mitigation addresses the issue completely with the caveat of performance + * degradation on the CPU affected. This is still better than the OS crashing on + * MCEs raised on an irrelevant process due to "REP; MOVS*" accesses from a + * kernel context (e.g., copy_page). + * + * Returns true when fast string copy on CPU has been disabled. + */ +static noinstr bool quirk_skylake_repmov(void) +{ + u64 mcgstatus = mce_rdmsrl(MSR_IA32_MCG_STATUS); + u64 misc_enable = mce_rdmsrl(MSR_IA32_MISC_ENABLE); + u64 mc1_status; + + /* + * Apply the quirk only to local machine checks, i.e., no broadcast + * sync is needed. + */ + if (!(mcgstatus & MCG_STATUS_LMCES) || + !(misc_enable & MSR_IA32_MISC_ENABLE_FAST_STRING)) + return false; + + mc1_status = mce_rdmsrl(MSR_IA32_MCx_STATUS(1)); + + /* Check for a software-recoverable data fetch error. 
*/ + if ((mc1_status & + (MCI_STATUS_VAL | MCI_STATUS_OVER | MCI_STATUS_UC | MCI_STATUS_EN | + MCI_STATUS_ADDRV | MCI_STATUS_MISCV | MCI_STATUS_PCC | + MCI_STATUS_AR | MCI_STATUS_S)) == + (MCI_STATUS_VAL | MCI_STATUS_UC | MCI_STATUS_EN | + MCI_STATUS_ADDRV | MCI_STATUS_MISCV | + MCI_STATUS_AR | MCI_STATUS_S)) { + misc_enable &= ~MSR_IA32_MISC_ENABLE_FAST_STRING; + mce_wrmsrl(MSR_IA32_MISC_ENABLE, misc_enable); + mce_wrmsrl(MSR_IA32_MCx_STATUS(1), 0); + + instrumentation_begin(); + pr_err_once("Erratum detected, disable fast string copy instructions.\n"); + instrumentation_end(); + + return true; + } + + return false; +} + +/* * Do a quick check if any of the events requires a panic. * This decides if we keep the events around or clear them. */ -static int mce_no_way_out(struct mce *m, char **msg, unsigned long *validp, - struct pt_regs *regs) +static __always_inline int mce_no_way_out(struct mce *m, char **msg, unsigned long *validp, + struct pt_regs *regs) { char *tmp = *msg; int i; for (i = 0; i < this_cpu_read(mce_num_banks); i++) { - m->status = mce_rdmsrl(msr_ops.status(i)); + m->status = mce_rdmsrl(mca_msr_reg(i, MCA_STATUS)); if (!(m->status & MCI_STATUS_VAL)) continue; - __set_bit(i, validp); - if (quirk_no_way_out) - quirk_no_way_out(i, m, regs); + arch___set_bit(i, validp); + if (mce_flags.snb_ifu_quirk) + quirk_sandybridge_ifu(i, m, regs); m->bank = i; - if (mce_severity(m, mca_cfg.tolerant, &tmp, true) >= MCE_PANIC_SEVERITY) { + if (mce_severity(m, regs, &tmp, true) >= MCE_PANIC_SEVERITY) { mce_read_aux(m, i); *msg = tmp; return 1; @@ -829,10 +897,21 @@ static atomic_t mce_executing; static atomic_t mce_callin; /* + * Track which CPUs entered the MCA broadcast synchronization and which not in + * order to print holdouts. + */ +static cpumask_t mce_missing_cpus = CPU_MASK_ALL; + +/* * Check if a timeout waiting for other CPUs happened. 
*/ -static int mce_timed_out(u64 *t, const char *msg) +static noinstr int mce_timed_out(u64 *t, const char *msg) { + int ret = 0; + + /* Enable instrumentation around calls to external facilities */ + instrumentation_begin(); + /* * The others already did panic for some reason. * Bail out like in a timeout. @@ -845,15 +924,22 @@ static int mce_timed_out(u64 *t, const char *msg) if (!mca_cfg.monarch_timeout) goto out; if ((s64)*t < SPINUNIT) { - if (mca_cfg.tolerant <= 1) - mce_panic(msg, NULL, NULL); - cpu_missing = 1; - return 1; + if (cpumask_and(&mce_missing_cpus, cpu_online_mask, &mce_missing_cpus)) + pr_emerg("CPUs not responding to MCE broadcast (may include false positives): %*pbl\n", + cpumask_pr_args(&mce_missing_cpus)); + mce_panic(msg, NULL, NULL); + + ret = 1; + goto out; } *t -= SPINUNIT; + out: touch_nmi_watchdog(); - return 0; + + instrumentation_end(); + + return ret; } /* @@ -886,7 +972,6 @@ static void mce_reign(void) struct mce *m = NULL; int global_worst = 0; char *msg = NULL; - char *nmsg = NULL; /* * This CPU is the Monarch and the other CPUs have run @@ -894,12 +979,10 @@ static void mce_reign(void) * Grade the severity of the errors of all the CPUs. */ for_each_possible_cpu(cpu) { - int severity = mce_severity(&per_cpu(mces_seen, cpu), - mca_cfg.tolerant, - &nmsg, true); - if (severity > global_worst) { - msg = nmsg; - global_worst = severity; + struct mce *mtmp = &per_cpu(mces_seen, cpu); + + if (mtmp->severity > global_worst) { + global_worst = mtmp->severity; m = &per_cpu(mces_seen, cpu); } } @@ -909,8 +992,11 @@ static void mce_reign(void) * This dumps all the mces in the log buffer and stops the * other CPUs. */ - if (m && global_worst >= MCE_PANIC_SEVERITY && mca_cfg.tolerant < 3) + if (m && global_worst >= MCE_PANIC_SEVERITY) { + /* call mce_severity() to get "msg" for panic */ + mce_severity(m, NULL, &msg, true); mce_panic("Fatal machine check", m, msg); + } /* * For UC somewhere we let the CPU who detects it handle it. 
@@ -922,7 +1008,7 @@ static void mce_reign(void) * No machine check event found. Must be some external * source or one CPU is hung. Panic. */ - if (global_worst <= MCE_KEEP_SEVERITY && mca_cfg.tolerant < 3) + if (global_worst <= MCE_KEEP_SEVERITY) mce_panic("Fatal machine check from unknown source", NULL, NULL); /* @@ -942,30 +1028,33 @@ static atomic_t global_nwo; * in the entry order. * TBD double check parallel CPU hotunplug */ -static int mce_start(int *no_way_out) +static noinstr int mce_start(int *no_way_out) { - int order; - int cpus = num_online_cpus(); u64 timeout = (u64)mca_cfg.monarch_timeout * NSEC_PER_USEC; + int order, ret = -1; if (!timeout) - return -1; + return ret; - atomic_add(*no_way_out, &global_nwo); + arch_atomic_add(*no_way_out, &global_nwo); /* * Rely on the implied barrier below, such that global_nwo * is updated before mce_callin. */ - order = atomic_inc_return(&mce_callin); + order = arch_atomic_inc_return(&mce_callin); + arch_cpumask_clear_cpu(smp_processor_id(), &mce_missing_cpus); + + /* Enable instrumentation around calls to external facilities */ + instrumentation_begin(); /* * Wait for everyone. */ - while (atomic_read(&mce_callin) != cpus) { + while (arch_atomic_read(&mce_callin) != num_online_cpus()) { if (mce_timed_out(&timeout, "Timeout: Not all CPUs entered broadcast exception handler")) { - atomic_set(&global_nwo, 0); - return -1; + arch_atomic_set(&global_nwo, 0); + goto out; } ndelay(SPINUNIT); } @@ -979,7 +1068,7 @@ static int mce_start(int *no_way_out) /* * Monarch: Starts executing now, the others wait. */ - atomic_set(&mce_executing, 1); + arch_atomic_set(&mce_executing, 1); } else { /* * Subject: Now start the scanning loop one by one in @@ -987,11 +1076,11 @@ static int mce_start(int *no_way_out) * This way when there are any shared banks it will be * only seen by one CPU before cleared, avoiding duplicates. 
*/ - while (atomic_read(&mce_executing) < order) { + while (arch_atomic_read(&mce_executing) < order) { if (mce_timed_out(&timeout, "Timeout: Subject CPUs unable to finish machine check processing")) { - atomic_set(&global_nwo, 0); - return -1; + arch_atomic_set(&global_nwo, 0); + goto out; } ndelay(SPINUNIT); } @@ -1000,19 +1089,27 @@ static int mce_start(int *no_way_out) /* * Cache the global no_way_out state. */ - *no_way_out = atomic_read(&global_nwo); + *no_way_out = arch_atomic_read(&global_nwo); - return order; + ret = order; + +out: + instrumentation_end(); + + return ret; } /* * Synchronize between CPUs after main scanning loop. * This invokes the bulk of the Monarch processing. */ -static int mce_end(int order) +static noinstr int mce_end(int order) { - int ret = -1; u64 timeout = (u64)mca_cfg.monarch_timeout * NSEC_PER_USEC; + int ret = -1; + + /* Allow instrumentation around external facilities. */ + instrumentation_begin(); if (!timeout) goto reset; @@ -1025,14 +1122,11 @@ static int mce_end(int order) atomic_inc(&mce_executing); if (order == 1) { - /* CHECKME: Can this race with a parallel hotplug? */ - int cpus = num_online_cpus(); - /* * Monarch: Wait for everyone to go through their scanning * loops. */ - while (atomic_read(&mce_executing) <= cpus) { + while (atomic_read(&mce_executing) <= num_online_cpus()) { if (mce_timed_out(&timeout, "Timeout: Monarch CPU unable to finish machine check processing")) goto reset; @@ -1056,7 +1150,8 @@ static int mce_end(int order) /* * Don't reset anything. That's done by the Monarch. */ - return 0; + ret = 0; + goto out; } /* @@ -1065,42 +1160,30 @@ static int mce_end(int order) reset: atomic_set(&global_nwo, 0); atomic_set(&mce_callin, 0); + cpumask_setall(&mce_missing_cpus); barrier(); /* * Let others run again. 
*/ atomic_set(&mce_executing, 0); + +out: + instrumentation_end(); + return ret; } -static void mce_clear_state(unsigned long *toclear) +static __always_inline void mce_clear_state(unsigned long *toclear) { int i; for (i = 0; i < this_cpu_read(mce_num_banks); i++) { - if (test_bit(i, toclear)) - mce_wrmsrl(msr_ops.status(i), 0); + if (arch_test_bit(i, toclear)) + mce_wrmsrl(mca_msr_reg(i, MCA_STATUS), 0); } } -static int do_memory_failure(struct mce *m) -{ - int flags = MF_ACTION_REQUIRED; - int ret; - - pr_err("Uncorrected hardware memory error in user-access at %llx", m->addr); - if (!(m->mcgstatus & MCG_STATUS_RIPV)) - flags |= MF_MUST_KILL; - ret = memory_failure(m->addr >> PAGE_SHIFT, flags); - if (ret) - pr_err("Memory error not recovered"); - else - set_mce_nospec(m->addr >> PAGE_SHIFT); - return ret; -} - - /* * Cases where we avoid rendezvous handler timeout: * 1) If this CPU is offline. @@ -1113,13 +1196,15 @@ static int do_memory_failure(struct mce *m) * kdump kernel establishing a new #MC handler where a broadcasted MCE * might not get handled properly. 
*/ -static bool __mc_check_crashing_cpu(int cpu) +static noinstr bool mce_check_crashing_cpu(void) { - if (cpu_is_offline(cpu) || + unsigned int cpu = smp_processor_id(); + + if (arch_cpu_is_offline(cpu) || (crashing_cpu != -1 && crashing_cpu != cpu)) { u64 mcgstatus; - mcgstatus = mce_rdmsrl(MSR_IA32_MCG_STATUS); + mcgstatus = __rdmsr(MSR_IA32_MCG_STATUS); if (boot_cpu_data.x86_vendor == X86_VENDOR_ZHAOXIN) { if (mcgstatus & MCG_STATUS_LMCES) @@ -1127,24 +1212,25 @@ static bool __mc_check_crashing_cpu(int cpu) } if (mcgstatus & MCG_STATUS_RIPV) { - mce_wrmsrl(MSR_IA32_MCG_STATUS, 0); + __wrmsr(MSR_IA32_MCG_STATUS, 0, 0); return true; } } return false; } -static void __mc_scan_banks(struct mce *m, struct mce *final, - unsigned long *toclear, unsigned long *valid_banks, - int no_way_out, int *worst) +static __always_inline int +__mc_scan_banks(struct mce *m, struct pt_regs *regs, struct mce *final, + unsigned long *toclear, unsigned long *valid_banks, int no_way_out, + int *worst) { struct mce_bank *mce_banks = this_cpu_ptr(mce_banks_array); struct mca_config *cfg = &mca_cfg; - int severity, i; + int severity, i, taint = 0; for (i = 0; i < this_cpu_read(mce_num_banks); i++) { - __clear_bit(i, toclear); - if (!test_bit(i, valid_banks)) + arch___clear_bit(i, toclear); + if (!arch_test_bit(i, valid_banks)) continue; if (!mce_banks[i].ctl) @@ -1154,7 +1240,7 @@ static void __mc_scan_banks(struct mce *m, struct mce *final, m->addr = 0; m->bank = i; - m->status = mce_rdmsrl(msr_ops.status(i)); + m->status = mce_rdmsrl(mca_msr_reg(i, MCA_STATUS)); if (!(m->status & MCI_STATUS_VAL)) continue; @@ -1167,9 +1253,9 @@ static void __mc_scan_banks(struct mce *m, struct mce *final, continue; /* Set taint even when machine check was not enabled. 
*/ - add_taint(TAINT_MACHINE_CHECK, LOCKDEP_NOW_UNRELIABLE); + taint++; - severity = mce_severity(m, cfg->tolerant, NULL, true); + severity = mce_severity(m, regs, NULL, true); /* * When machine check was for corrected/deferred handler don't @@ -1179,7 +1265,7 @@ static void __mc_scan_banks(struct mce *m, struct mce *final, severity == MCE_UCNA_SEVERITY) && !no_way_out) continue; - __set_bit(i, toclear); + arch___set_bit(i, toclear); /* Machine check event was not enabled. Clear, but ignore. */ if (severity == MCE_NO_SEVERITY) @@ -1190,7 +1276,13 @@ static void __mc_scan_banks(struct mce *m, struct mce *final, /* assuming valid severity level != 0 */ m->severity = severity; + /* + * Enable instrumentation around the mce_log() call which is + * done in #MC context, where instrumentation is disabled. + */ + instrumentation_begin(); mce_log(m); + instrumentation_end(); if (severity > *worst) { *final = *m; @@ -1200,58 +1292,165 @@ static void __mc_scan_banks(struct mce *m, struct mce *final, /* mce_clear_state will clear *final, save locally for use later */ *m = *final; + + return taint; +} + +static void kill_me_now(struct callback_head *ch) +{ + struct task_struct *p = container_of(ch, struct task_struct, mce_kill_me); + + p->mce_count = 0; + force_sig(SIGBUS); +} + +static void kill_me_maybe(struct callback_head *cb) +{ + struct task_struct *p = container_of(cb, struct task_struct, mce_kill_me); + int flags = MF_ACTION_REQUIRED; + int ret; + + p->mce_count = 0; + pr_err("Uncorrected hardware memory error in user-access at %llx", p->mce_addr); + + if (!p->mce_ripv) + flags |= MF_MUST_KILL; + + ret = memory_failure(p->mce_addr >> PAGE_SHIFT, flags); + if (!ret) { + set_mce_nospec(p->mce_addr >> PAGE_SHIFT); + sync_core(); + return; + } + + /* + * -EHWPOISON from memory_failure() means that it already sent SIGBUS + * to the current process with the proper error info, + * -EOPNOTSUPP means hwpoison_filter() filtered the error event, + * + * In both cases, no further 
processing is required. + */ + if (ret == -EHWPOISON || ret == -EOPNOTSUPP) + return; + + pr_err("Memory error not recovered"); + kill_me_now(cb); +} + +static void kill_me_never(struct callback_head *cb) +{ + struct task_struct *p = container_of(cb, struct task_struct, mce_kill_me); + + p->mce_count = 0; + pr_err("Kernel accessed poison in user space at %llx\n", p->mce_addr); + if (!memory_failure(p->mce_addr >> PAGE_SHIFT, 0)) + set_mce_nospec(p->mce_addr >> PAGE_SHIFT); +} + +static void queue_task_work(struct mce *m, char *msg, void (*func)(struct callback_head *)) +{ + int count = ++current->mce_count; + + /* First call, save all the details */ + if (count == 1) { + current->mce_addr = m->addr; + current->mce_kflags = m->kflags; + current->mce_ripv = !!(m->mcgstatus & MCG_STATUS_RIPV); + current->mce_whole_page = whole_page(m); + current->mce_kill_me.func = func; + } + + /* Ten is likely overkill. Don't expect more than two faults before task_work() */ + if (count > 10) + mce_panic("Too many consecutive machine checks while accessing user data", m, msg); + + /* Second or later call, make sure page address matches the one from first call */ + if (count > 1 && (current->mce_addr >> PAGE_SHIFT) != (m->addr >> PAGE_SHIFT)) + mce_panic("Consecutive machine checks to different user pages", m, msg); + + /* Do not call task_work_add() more than once */ + if (count > 1) + return; + + task_work_add(current, &current->mce_kill_me, TWA_RESUME); +} + +/* Handle unconfigured int18 (should never happen) */ +static noinstr void unexpected_machine_check(struct pt_regs *regs) +{ + instrumentation_begin(); + pr_err("CPU#%d: Unexpected int18 (Machine Check)\n", + smp_processor_id()); + instrumentation_end(); } /* - * The actual machine check handler. This only handles real - * exceptions when something got corrupted coming in through int 18. + * The actual machine check handler. This only handles real exceptions when + * something got corrupted coming in through int 18.
* - * This is executed in NMI context not subject to normal locking rules. This - * implies that most kernel services cannot be safely used. Don't even + * This is executed in #MC context not subject to normal locking rules. + * This implies that most kernel services cannot be safely used. Don't even * think about putting a printk in there! * * On Intel systems this is entered on all CPUs in parallel through * MCE broadcast. However some CPUs might be broken beyond repair, * so be always careful when synchronizing with others. + * + * Tracing and kprobes are disabled: if we interrupted a kernel context + * with IF=1, we need to minimize stack usage. There are also recursion + * issues: if the machine check was due to a failure of the memory + * backing the user stack, tracing that reads the user stack will cause + * potentially infinite recursion. + * + * Currently, the #MC handler calls out to a number of external facilities + * and, therefore, allows instrumentation around them. The optimal thing to + * have would be to do the absolutely minimal work required in #MC context + * and have instrumentation disabled only around that. Further processing can + * then happen in process context where instrumentation is allowed. Achieving + * that requires careful auditing and modifications. Until then, the code + * allows instrumentation temporarily, where required. 
* */ -void do_machine_check(struct pt_regs *regs, long error_code) +noinstr void do_machine_check(struct pt_regs *regs) { - DECLARE_BITMAP(valid_banks, MAX_NR_BANKS); - DECLARE_BITMAP(toclear, MAX_NR_BANKS); - struct mca_config *cfg = &mca_cfg; - int cpu = smp_processor_id(); + int worst = 0, order, no_way_out, kill_current_task, lmce, taint = 0; + DECLARE_BITMAP(valid_banks, MAX_NR_BANKS) = { 0 }; + DECLARE_BITMAP(toclear, MAX_NR_BANKS) = { 0 }; struct mce m, *final; char *msg = NULL; - int worst = 0; + + if (unlikely(mce_flags.p5)) + return pentium_machine_check(regs); + else if (unlikely(mce_flags.winchip)) + return winchip_machine_check(regs); + else if (unlikely(!mca_cfg.initialized)) + return unexpected_machine_check(regs); + + if (mce_flags.skx_repmov_quirk && quirk_skylake_repmov()) + goto clear; /* * Establish sequential order between the CPUs entering the machine * check handler. */ - int order = -1; + order = -1; /* * If no_way_out gets set, there is no safe way to recover from this - * MCE. If mca_cfg.tolerant is cranked up, we'll try anyway. + * MCE. */ - int no_way_out = 0; + no_way_out = 0; /* - * If kill_it gets set, there might be a way to recover from this + * If kill_current_task is not set, there might be a way to recover from this * error. */ - int kill_it = 0; + kill_current_task = 0; /* * MCEs are always local on AMD. Same is determined by MCG_STATUS_LMCES * on Intel. */ - int lmce = 1; - - if (__mc_check_crashing_cpu(cpu)) - return; - - ist_enter(regs); + lmce = 1; this_cpu_inc(mce_exception_count); @@ -1261,7 +1460,6 @@ void do_machine_check(struct pt_regs *regs, long error_code) final = this_cpu_ptr(&mces_seen); *final = m; - memset(valid_banks, 0, sizeof(valid_banks)); no_way_out = mce_no_way_out(&m, &msg, valid_banks, regs); barrier(); @@ -1272,8 +1470,7 @@ void do_machine_check(struct pt_regs *regs, long error_code) * severity is MCE_AR_SEVERITY we have other options. 
*/ if (!(m.mcgstatus & MCG_STATUS_RIPV)) - kill_it = 1; - + kill_current_task = 1; /* * Check if this MCE is signaled to only this logical processor, * on Intel, Zhaoxin only. @@ -1296,7 +1493,7 @@ void do_machine_check(struct pt_regs *regs, long error_code) order = mce_start(&no_way_out); } - __mc_scan_banks(&m, final, toclear, valid_banks, no_way_out, &worst); + taint = __mc_scan_banks(&m, regs, final, toclear, valid_banks, no_way_out, &worst); if (!no_way_out) mce_clear_state(toclear); @@ -1306,8 +1503,13 @@ void do_machine_check(struct pt_regs *regs, long error_code) * When there's any problem use only local no_way_out state. */ if (!lmce) { - if (mce_end(order) < 0) - no_way_out = worst >= MCE_PANIC_SEVERITY; + if (mce_end(order) < 0) { + if (!no_way_out) + no_way_out = worst >= MCE_PANIC_SEVERITY; + + if (no_way_out) + mce_panic("Fatal machine check on current CPU", &m, msg); + } } else { /* * If there was a fatal machine check we should have @@ -1317,47 +1519,59 @@ void do_machine_check(struct pt_regs *regs, long error_code) * fatal error. We call "mce_severity()" again to * make sure we have the right "msg". */ - if (worst >= MCE_PANIC_SEVERITY && mca_cfg.tolerant < 3) { - mce_severity(&m, cfg->tolerant, &msg, true); + if (worst >= MCE_PANIC_SEVERITY) { + mce_severity(&m, regs, &msg, true); mce_panic("Local fatal machine check!", &m, msg); } } /* - * If tolerant is at an insane level we drop requests to kill - * processes and continue even when there is no way out. + * Enable instrumentation around the external facilities like task_work_add() + * (via queue_task_work()), fixup_exception() etc. For now, that is. Fixing this + * properly would need a lot more involved reorganization. 
*/ - if (cfg->tolerant == 3) - kill_it = 0; - else if (no_way_out) - mce_panic("Fatal machine check on current CPU", &m, msg); - - if (worst > 0) - irq_work_queue(&mce_irq_work); + instrumentation_begin(); - mce_wrmsrl(MSR_IA32_MCG_STATUS, 0); - - sync_core(); + if (taint) + add_taint(TAINT_MACHINE_CHECK, LOCKDEP_NOW_UNRELIABLE); - if (worst != MCE_AR_SEVERITY && !kill_it) - goto out_ist; + if (worst != MCE_AR_SEVERITY && !kill_current_task) + goto out; /* Fault was in user mode and we need to take some action */ if ((m.cs & 3) == 3) { - ist_begin_non_atomic(regs); - local_irq_enable(); + /* If this triggers there is no way to recover. Die hard. */ + BUG_ON(!on_thread_stack() || !user_mode(regs)); + + if (kill_current_task) + queue_task_work(&m, msg, kill_me_now); + else + queue_task_work(&m, msg, kill_me_maybe); - if (kill_it || do_memory_failure(&m)) - force_sig(SIGBUS); - local_irq_disable(); - ist_end_non_atomic(); } else { - if (!fixup_exception(regs, X86_TRAP_MC, error_code, 0)) - mce_panic("Failed kernel mode recovery", &m, msg); + /* + * Handle an MCE which has happened in kernel space but from + * which the kernel can recover: ex_has_fault_handler() has + * already verified that the rIP at which the error happened is + * a rIP from which the kernel can recover (by jumping to + * recovery code specified in _ASM_EXTABLE_FAULT()) and the + * corresponding exception handler which would do that is the + * proper one. + */ + if (m.kflags & MCE_IN_KERNEL_RECOV) { + if (!fixup_exception(regs, X86_TRAP_MC, 0, 0)) + mce_panic("Failed kernel mode recovery", &m, msg); + } + + if (m.kflags & MCE_IN_KERNEL_COPYIN) + queue_task_work(&m, msg, kill_me_never); } -out_ist: - ist_exit(regs); +out: + instrumentation_end(); + +clear: + mce_wrmsrl(MSR_IA32_MCG_STATUS, 0); } EXPORT_SYMBOL_GPL(do_machine_check); @@ -1496,7 +1710,7 @@ static void __mcheck_cpu_mce_banks_init(void) * __mcheck_cpu_init_clear_banks() does the final bank setup. 
*/ b->ctl = -1ULL; - b->init = 1; + b->init = true; } } @@ -1540,10 +1754,12 @@ static void __mcheck_cpu_init_generic(void) m_fl = MCP_DONTLOG; /* - * Log the machine checks left over from the previous reset. + * Log the machine checks left over from the previous reset. Log them + * only, do not start processing them. That will happen in mcheck_late_init() + * when all consumers have been registered on the notifier chain. */ bitmap_fill(all_banks, MAX_NR_BANKS); - machine_check_poll(MCP_UC | m_fl, &all_banks); + machine_check_poll(MCP_UC | MCP_QUEUE_LOG | m_fl, &all_banks); cr4_set_bits(X86_CR4_MCE); @@ -1562,8 +1778,8 @@ static void __mcheck_cpu_init_clear_banks(void) if (!b->init) continue; - wrmsrl(msr_ops.ctl(i), b->ctl); - wrmsrl(msr_ops.status(i), 0); + wrmsrl(mca_msr_reg(i, MCA_CTL), b->ctl); + wrmsrl(mca_msr_reg(i, MCA_STATUS), 0); } } @@ -1589,39 +1805,11 @@ static void __mcheck_cpu_check_banks(void) if (!b->init) continue; - rdmsrl(msr_ops.ctl(i), msrval); + rdmsrl(mca_msr_reg(i, MCA_CTL), msrval); b->init = !!msrval; } } -/* - * During IFU recovery Sandy Bridge -EP4S processors set the RIPV and - * EIPV bits in MCG_STATUS to zero on the affected logical processor (SDM - * Vol 3B Table 15-20). But this confuses both the code that determines - * whether the machine check occurred in kernel or user mode, and also - * the severity assessment code. Pretend that EIPV was set, and take the - * ip/cs values from the pt_regs that mce_gather_info() ignored earlier. 
- */ -static void quirk_sandybridge_ifu(int bank, struct mce *m, struct pt_regs *regs) -{ - if (bank != 0) - return; - if ((m->mcgstatus & (MCG_STATUS_EIPV|MCG_STATUS_RIPV)) != 0) - return; - if ((m->status & (MCI_STATUS_OVER|MCI_STATUS_UC| - MCI_STATUS_EN|MCI_STATUS_MISCV|MCI_STATUS_ADDRV| - MCI_STATUS_PCC|MCI_STATUS_S|MCI_STATUS_AR| - MCACOD)) != - (MCI_STATUS_UC|MCI_STATUS_EN| - MCI_STATUS_MISCV|MCI_STATUS_ADDRV|MCI_STATUS_S| - MCI_STATUS_AR|MCACOD_INSTR)) - return; - - m->mcgstatus |= MCG_STATUS_EIPV; - m->ip = regs->ip; - m->cs = regs->cs; -} - /* Add per CPU specific workarounds here */ static int __mcheck_cpu_apply_quirks(struct cpuinfo_x86 *c) { @@ -1677,7 +1865,7 @@ static int __mcheck_cpu_apply_quirks(struct cpuinfo_x86 *c) */ if (c->x86 == 6 && c->x86_model < 0x1A && this_cpu_read(mce_num_banks) > 0) - mce_banks[0].init = 0; + mce_banks[0].init = false; /* * All newer Intel systems support MCE broadcasting. Enable @@ -1695,7 +1883,14 @@ static int __mcheck_cpu_apply_quirks(struct cpuinfo_x86 *c) cfg->bootlog = 0; if (c->x86 == 6 && c->x86_model == 45) - quirk_no_way_out = quirk_sandybridge_ifu; + mce_flags.snb_ifu_quirk = 1; + + /* + * Skylake, Cascade Lake and Cooper Lake require a quirk on + * rep movs.
+ */ + if (c->x86 == 6 && c->x86_model == INTEL_FAM6_SKYLAKE_X) + mce_flags.skx_repmov_quirk = 1; } if (c->x86_vendor == X86_VENDOR_ZHAOXIN) { @@ -1725,12 +1920,12 @@ static int __mcheck_cpu_ancient_init(struct cpuinfo_x86 *c) switch (c->x86_vendor) { case X86_VENDOR_INTEL: intel_p5_mcheck_init(c); + mce_flags.p5 = 1; return 1; - break; case X86_VENDOR_CENTAUR: winchip_mcheck_init(c); + mce_flags.winchip = 1; return 1; - break; default: return 0; } @@ -1747,13 +1942,7 @@ static void __mcheck_cpu_init_early(struct cpuinfo_x86 *c) mce_flags.overflow_recov = !!cpu_has(c, X86_FEATURE_OVERFLOW_RECOV); mce_flags.succor = !!cpu_has(c, X86_FEATURE_SUCCOR); mce_flags.smca = !!cpu_has(c, X86_FEATURE_SMCA); - - if (mce_flags.smca) { - msr_ops.ctl = smca_ctl_reg; - msr_ops.status = smca_status_reg; - msr_ops.addr = smca_addr_reg; - msr_ops.misc = smca_misc_reg; - } + mce_flags.amd_threshold = 1; } } @@ -1877,25 +2066,75 @@ bool filter_mce(struct mce *m) { if (boot_cpu_data.x86_vendor == X86_VENDOR_AMD) return amd_filter_mce(m); + if (boot_cpu_data.x86_vendor == X86_VENDOR_INTEL) + return intel_filter_mce(m); return false; } -/* Handle unconfigured int18 (should never happen) */ -static void unexpected_machine_check(struct pt_regs *regs, long error_code) +static __always_inline void exc_machine_check_kernel(struct pt_regs *regs) { - pr_err("CPU#%d: Unexpected int18 (Machine Check)\n", - smp_processor_id()); + irqentry_state_t irq_state; + + WARN_ON_ONCE(user_mode(regs)); + + /* + * Only required when from kernel mode. See + * mce_check_crashing_cpu() for details. 
+ */ + if (mca_cfg.initialized && mce_check_crashing_cpu()) + return; + + irq_state = irqentry_nmi_enter(regs); + + do_machine_check(regs); + + irqentry_nmi_exit(regs, irq_state); +} + +static __always_inline void exc_machine_check_user(struct pt_regs *regs) +{ + irqentry_enter_from_user_mode(regs); + + do_machine_check(regs); + + irqentry_exit_to_user_mode(regs); } -/* Call the installed machine check handler for this CPU setup. */ -void (*machine_check_vector)(struct pt_regs *, long error_code) = - unexpected_machine_check; +#ifdef CONFIG_X86_64 +/* MCE hit kernel mode */ +DEFINE_IDTENTRY_MCE(exc_machine_check) +{ + unsigned long dr7; + + dr7 = local_db_save(); + exc_machine_check_kernel(regs); + local_db_restore(dr7); +} -dotraplinkage void do_mce(struct pt_regs *regs, long error_code) +/* The user mode variant. */ +DEFINE_IDTENTRY_MCE_USER(exc_machine_check) { - machine_check_vector(regs, error_code); + unsigned long dr7; + + dr7 = local_db_save(); + exc_machine_check_user(regs); + local_db_restore(dr7); } +#else +/* 32bit unified entry point */ +DEFINE_IDTENTRY_RAW(exc_machine_check) +{ + unsigned long dr7; + + dr7 = local_db_save(); + if (user_mode(regs)) + exc_machine_check_user(regs); + else + exc_machine_check_kernel(regs); + local_db_restore(dr7); +} +#endif /* * Called for each booted CPU to set up machine checks. @@ -1925,7 +2164,7 @@ void mcheck_cpu_init(struct cpuinfo_x86 *c) return; } - machine_check_vector = do_machine_check; + mca_cfg.initialized = 1; __mcheck_cpu_init_early(c); __mcheck_cpu_init_generic(); @@ -1978,6 +2217,7 @@ void mce_disable_bank(int bank) * mce=no_cmci Disables CMCI * mce=no_lmce Disables LMCE * mce=dont_log_ce Clears corrected events silently, no log created for CEs. + * mce=print_all Print all machine check logs to console * mce=ignore_ce Disables polling and CMCI, corrected events are not cleared. 
* mce=TOLERANCELEVEL[,monarchtimeout] (number, see above) * monarchtimeout is how long to wait for other CPUs on machine @@ -1986,7 +2226,7 @@ void mce_disable_bank(int bank) and older. * mce=nobootlog Don't log MCEs from before booting. * mce=bios_cmci_threshold Don't program the CMCI threshold - * mce=recovery force enable memcpy_mcsafe() + * mce=recovery force enable copy_mc_fragile() */ static int __init mcheck_enable(char *str) { @@ -2006,6 +2246,8 @@ static int __init mcheck_enable(char *str) cfg->lmce_disabled = 1; else if (!strcmp(str, "dont_log_ce")) cfg->dont_log_ce = true; + else if (!strcmp(str, "print_all")) + cfg->print_all = true; else if (!strcmp(str, "ignore_ce")) cfg->ignore_ce = true; else if (!strcmp(str, "bootlog") || !strcmp(str, "nobootlog")) @@ -2014,10 +2256,9 @@ static int __init mcheck_enable(char *str) cfg->bios_cmci_threshold = 1; else if (!strcmp(str, "recovery")) cfg->recovery = 1; - else if (isdigit(str[0])) { - if (get_option(&str, &cfg->tolerant) == 2) - get_option(&str, &(cfg->monarch_timeout)); - } else { + else if (isdigit(str[0])) + get_option(&str, &(cfg->monarch_timeout)); + else { pr_info("mce argument %s ignored. 
Please use /sys\n", str); return 0; } @@ -2027,11 +2268,9 @@ __setup("mce", mcheck_enable); int __init mcheck_init(void) { - mcheck_intel_therm_init(); - mce_register_decode_chain(&first_nb); + mce_register_decode_chain(&early_nb); mce_register_decode_chain(&mce_uc_nb); mce_register_decode_chain(&mce_default_nb); - mcheck_vendor_init_severity(); INIT_WORK(&mce_work, mce_gen_pool_process); init_irq_work(&mce_irq_work, mce_irq_work_cb); @@ -2056,7 +2295,7 @@ static void mce_disable_error_reporting(void) struct mce_bank *b = &mce_banks[i]; if (b->init) - wrmsrl(msr_ops.ctl(i), 0); + wrmsrl(mca_msr_reg(i, MCA_CTL), 0); } return; } @@ -2269,9 +2508,9 @@ static ssize_t store_int_with_restart(struct device *s, return ret; } -static DEVICE_INT_ATTR(tolerant, 0644, mca_cfg.tolerant); static DEVICE_INT_ATTR(monarch_timeout, 0644, mca_cfg.monarch_timeout); static DEVICE_BOOL_ATTR(dont_log_ce, 0644, mca_cfg.dont_log_ce); +static DEVICE_BOOL_ATTR(print_all, 0644, mca_cfg.print_all); static struct dev_ext_attribute dev_attr_check_interval = { __ATTR(check_interval, 0644, device_show_int, store_int_with_restart), @@ -2289,13 +2528,13 @@ static struct dev_ext_attribute dev_attr_cmci_disabled = { }; static struct device_attribute *mce_device_attrs[] = { - &dev_attr_tolerant.attr, &dev_attr_check_interval.attr, #ifdef CONFIG_X86_MCELOG_LEGACY &dev_attr_trigger, #endif &dev_attr_monarch_timeout.attr, &dev_attr_dont_log_ce.attr, + &dev_attr_print_all.attr, &dev_attr_ignore_ce.attr, &dev_attr_cmci_disabled.attr, NULL @@ -2406,7 +2645,7 @@ static void mce_reenable_cpu(void) struct mce_bank *b = &mce_banks[i]; if (b->init) - wrmsrl(msr_ops.ctl(i), b->ctl); + wrmsrl(mca_msr_reg(i, MCA_CTL), b->ctl); } } @@ -2468,6 +2707,13 @@ static __init void mce_init_banks(void) } } +/* + * When running on XEN, this initcall is ordered against the XEN mcelog + * initcall: + * + * device_initcall(xen_late_init_mcelog); + * device_initcall_sync(mcheck_init_device); + */ static __init int 
mcheck_init_device(void) { int err; @@ -2499,6 +2745,10 @@ static __init int mcheck_init_device(void) if (err) goto err_out_mem; + /* + * Invokes mce_cpu_online() on all CPUs which are online when + * the state is installed. + */ err = cpuhp_setup_state(CPUHP_AP_ONLINE_DYN, "x86/mce:online", mce_cpu_online, mce_cpu_pre_down); if (err < 0) @@ -2544,11 +2794,11 @@ struct dentry *mce_get_debugfs_dir(void) static void mce_reset(void) { - cpu_missing = 0; atomic_set(&mce_fake_panicked, 0); atomic_set(&mce_executing, 0); atomic_set(&mce_callin, 0); atomic_set(&global_nwo, 0); + cpumask_setall(&mce_missing_cpus); } static int fake_panic_get(void *data, u64 *val) @@ -2579,16 +2829,12 @@ static void __init mcheck_debugfs_init(void) static void __init mcheck_debugfs_init(void) { } #endif -DEFINE_STATIC_KEY_FALSE(mcsafe_key); -EXPORT_SYMBOL_GPL(mcsafe_key); - static int __init mcheck_late_init(void) { if (mca_cfg.recovery) - static_branch_inc(&mcsafe_key); + enable_copy_mc_fragile(); mcheck_debugfs_init(); - cec_init(); /* * Flush out everything that has been logged during early boot, now that diff --git a/arch/x86/kernel/cpu/mce/dev-mcelog.c b/arch/x86/kernel/cpu/mce/dev-mcelog.c index 7c8958dee103..100fbeebdc72 100644 --- a/arch/x86/kernel/cpu/mce/dev-mcelog.c +++ b/arch/x86/kernel/cpu/mce/dev-mcelog.c @@ -29,11 +29,7 @@ static char *mce_helper_argv[2] = { mce_helper, NULL }; * separate MCEs from kernel messages to avoid bogus bug reports. 
*/ -static struct mce_log_buffer mcelog = { - .signature = MCE_LOG_SIGNATURE, - .len = MCE_LOG_LEN, - .recordlen = sizeof(struct mce), -}; +static struct mce_log_buffer *mcelog; static DECLARE_WAIT_QUEUE_HEAD(mce_chrdev_wait); @@ -43,23 +39,27 @@ static int dev_mce_log(struct notifier_block *nb, unsigned long val, struct mce *mce = (struct mce *)data; unsigned int entry; + if (mce->kflags & MCE_HANDLED_CEC) + return NOTIFY_DONE; + mutex_lock(&mce_chrdev_read_mutex); - entry = mcelog.next; + entry = mcelog->next; /* * When the buffer fills up discard new entries. Assume that the * earlier errors are the more interesting ones: */ - if (entry >= MCE_LOG_LEN) { - set_bit(MCE_OVERFLOW, (unsigned long *)&mcelog.flags); + if (entry >= mcelog->len) { + set_bit(MCE_OVERFLOW, (unsigned long *)&mcelog->flags); goto unlock; } - mcelog.next = entry + 1; + mcelog->next = entry + 1; - memcpy(mcelog.entry + entry, mce, sizeof(struct mce)); - mcelog.entry[entry].finished = 1; + memcpy(mcelog->entry + entry, mce, sizeof(struct mce)); + mcelog->entry[entry].finished = 1; + mcelog->entry[entry].kflags = 0; /* wake processes polling /dev/mcelog */ wake_up_interruptible(&mce_chrdev_wait); @@ -67,6 +67,9 @@ static int dev_mce_log(struct notifier_block *nb, unsigned long val, unlock: mutex_unlock(&mce_chrdev_read_mutex); + if (boot_cpu_data.x86_vendor != X86_VENDOR_AMD) + mce->kflags |= MCE_HANDLED_MCELOG; + return NOTIFY_OK; } @@ -214,21 +217,21 @@ static ssize_t mce_chrdev_read(struct file *filp, char __user *ubuf, /* Only supports full reads right now */ err = -EINVAL; - if (*off != 0 || usize < MCE_LOG_LEN*sizeof(struct mce)) + if (*off != 0 || usize < mcelog->len * sizeof(struct mce)) goto out; - next = mcelog.next; + next = mcelog->next; err = 0; for (i = 0; i < next; i++) { - struct mce *m = &mcelog.entry[i]; + struct mce *m = &mcelog->entry[i]; err |= copy_to_user(buf, m, sizeof(*m)); buf += sizeof(*m); } - memset(mcelog.entry, 0, next * sizeof(struct mce)); - mcelog.next = 0; + 
memset(mcelog->entry, 0, next * sizeof(struct mce)); + mcelog->next = 0; if (err) err = -EFAULT; @@ -242,7 +245,7 @@ out: static __poll_t mce_chrdev_poll(struct file *file, poll_table *wait) { poll_wait(file, &mce_chrdev_wait, wait); - if (READ_ONCE(mcelog.next)) + if (READ_ONCE(mcelog->next)) return EPOLLIN | EPOLLRDNORM; if (!mce_apei_read_done && apei_check_mce()) return EPOLLIN | EPOLLRDNORM; @@ -261,13 +264,13 @@ static long mce_chrdev_ioctl(struct file *f, unsigned int cmd, case MCE_GET_RECORD_LEN: return put_user(sizeof(struct mce), p); case MCE_GET_LOG_LEN: - return put_user(MCE_LOG_LEN, p); + return put_user(mcelog->len, p); case MCE_GETCLEAR_FLAGS: { unsigned flags; do { - flags = mcelog.flags; - } while (cmpxchg(&mcelog.flags, flags, 0) != flags); + flags = mcelog->flags; + } while (cmpxchg(&mcelog->flags, flags, 0) != flags); return put_user(flags, p); } @@ -328,6 +331,7 @@ static const struct file_operations mce_chrdev_ops = { .write = mce_chrdev_write, .poll = mce_chrdev_poll, .unlocked_ioctl = mce_chrdev_ioctl, + .compat_ioctl = compat_ptr_ioctl, .llseek = no_llseek, }; @@ -339,8 +343,18 @@ static struct miscdevice mce_chrdev_device = { static __init int dev_mcelog_init_device(void) { + int mce_log_len; int err; + mce_log_len = max(MCE_LOG_MIN_LEN, num_online_cpus()); + mcelog = kzalloc(struct_size(mcelog, entry, mce_log_len), GFP_KERNEL); + if (!mcelog) + return -ENOMEM; + + memcpy(mcelog->signature, MCE_LOG_SIGNATURE, sizeof(mcelog->signature)); + mcelog->len = mce_log_len; + mcelog->recordlen = sizeof(struct mce); + /* register character device /dev/mcelog */ err = misc_register(&mce_chrdev_device); if (err) { @@ -350,6 +364,7 @@ static __init int dev_mcelog_init_device(void) else pr_err("Unable to init device /dev/mcelog (rc: %d)\n", err); + kfree(mcelog); return err; } diff --git a/arch/x86/kernel/cpu/mce/inject.c b/arch/x86/kernel/cpu/mce/inject.c index 3413b41b8d55..12cf2e7ca33c 100644 --- a/arch/x86/kernel/cpu/mce/inject.c +++ 
b/arch/x86/kernel/cpu/mce/inject.c @@ -33,6 +33,8 @@ #include "internal.h" +static bool hw_injection_possible = true; + /* * Collect all the MCi_XXX settings */ @@ -88,12 +90,28 @@ MCE_INJECT_GET(status); MCE_INJECT_GET(misc); MCE_INJECT_GET(addr); MCE_INJECT_GET(synd); +MCE_INJECT_GET(ipid); DEFINE_SIMPLE_ATTRIBUTE(status_fops, inj_status_get, inj_status_set, "%llx\n"); DEFINE_SIMPLE_ATTRIBUTE(misc_fops, inj_misc_get, inj_misc_set, "%llx\n"); DEFINE_SIMPLE_ATTRIBUTE(addr_fops, inj_addr_get, inj_addr_set, "%llx\n"); DEFINE_SIMPLE_ATTRIBUTE(synd_fops, inj_synd_get, inj_synd_set, "%llx\n"); +/* Use the user provided IPID value on a sw injection. */ +static int inj_ipid_set(void *data, u64 val) +{ + struct mce *m = (struct mce *)data; + + if (cpu_feature_enabled(X86_FEATURE_SMCA)) { + if (inj_type == SW_INJ) + m->ipid = val; + } + + return 0; +} + +DEFINE_SIMPLE_ATTRIBUTE(ipid_fops, inj_ipid_get, inj_ipid_set, "%llx\n"); + static void setup_inj_struct(struct mce *m) { memset(m, 0, sizeof(struct mce)); @@ -146,9 +164,9 @@ static void raise_exception(struct mce *m, struct pt_regs *pregs) regs.cs = m->cs; pregs = ®s; } - /* in mcheck exeception handler, irq will be disabled */ + /* do_machine_check() expects interrupts disabled -- at least */ local_irq_save(flags); - do_machine_check(pregs, 0); + do_machine_check(pregs); local_irq_restore(flags); m->finished = 0; } @@ -199,7 +217,7 @@ static int raise_local(void) * calling irq_enter, but the necessary * machinery isn't exported currently. 
*/ - /*FALL THROUGH*/ + fallthrough; case MCJ_CTX_PROCESS: raise_exception(m, NULL); break; @@ -232,7 +250,7 @@ static void __maybe_unused raise_mce(struct mce *m) unsigned long start; int cpu; - get_online_cpus(); + cpus_read_lock(); cpumask_copy(mce_inject_cpumask, cpu_online_mask); cpumask_clear_cpu(get_cpu(), mce_inject_cpumask); for_each_online_cpu(cpu) { @@ -266,7 +284,7 @@ static void __maybe_unused raise_mce(struct mce *m) } raise_local(); put_cpu(); - put_online_cpus(); + cpus_read_unlock(); } else { preempt_disable(); raise_local(); @@ -323,6 +341,8 @@ static int __set_inj(const char *buf) for (i = 0; i < N_INJ_TYPES; i++) { if (!strncmp(flags_options[i], buf, strlen(flags_options[i]))) { + if (i > SW_INJ && !hw_injection_possible) + continue; inj_type = i; return 0; } @@ -347,7 +367,7 @@ static ssize_t flags_write(struct file *filp, const char __user *ubuf, char buf[MAX_FLAG_OPT_SIZE], *__buf; int err; - if (cnt > MAX_FLAG_OPT_SIZE) + if (!cnt || cnt > MAX_FLAG_OPT_SIZE) return -EINVAL; if (copy_from_user(&buf, ubuf, cnt)) @@ -487,6 +507,8 @@ static void do_inject(void) i_mce.tsc = rdtsc_ordered(); + i_mce.status |= MCI_STATUS_VAL; + if (i_mce.misc) i_mce.status |= MCI_STATUS_MISCV; @@ -511,7 +533,7 @@ static void do_inject(void) */ if (inj_type == DFR_INT_INJ) { i_mce.status |= MCI_STATUS_DEFERRED; - i_mce.status |= (i_mce.status & ~MCI_STATUS_UC); + i_mce.status &= ~MCI_STATUS_UC; } /* @@ -522,11 +544,11 @@ static void do_inject(void) if (boot_cpu_has(X86_FEATURE_AMD_DCM) && b == 4 && boot_cpu_data.x86 < 0x17) { - toggle_nb_mca_mst_cpu(amd_get_nb_id(cpu)); - cpu = get_nbc_for_node(amd_get_nb_id(cpu)); + toggle_nb_mca_mst_cpu(topology_die_id(cpu)); + cpu = get_nbc_for_node(topology_die_id(cpu)); } - get_online_cpus(); + cpus_read_lock(); if (!cpu_online(cpu)) goto err; @@ -550,7 +572,7 @@ static void do_inject(void) } err: - put_online_cpus(); + cpus_read_unlock(); } @@ -574,6 +596,33 @@ static int inj_bank_set(void *data, u64 val) } m->bank = val; + + 
/* + * sw-only injection allows to write arbitrary values into the MCA + * registers because it tests only the decoding paths. + */ + if (inj_type == SW_INJ) + goto inject; + + /* + * Read IPID value to determine if a bank is populated on the target + * CPU. + */ + if (cpu_feature_enabled(X86_FEATURE_SMCA)) { + u64 ipid; + + if (rdmsrl_on_cpu(m->extcpu, MSR_AMD64_SMCA_MCx_IPID(val), &ipid)) { + pr_err("Error reading IPID on CPU%d\n", m->extcpu); + return -EINVAL; + } + + if (!ipid) { + pr_err("Cannot inject into unpopulated bank %llu\n", val); + return -ENODEV; + } + } + +inject: do_inject(); /* Reset injection struct */ @@ -629,6 +678,8 @@ static const char readme_msg[] = "\t is present in hardware. \n" "\t - \"th\": Trigger APIC interrupt for Threshold errors. Causes threshold \n" "\t APIC interrupt handler to handle the error. \n" +"\n" +"ipid:\t IPID (AMD-specific)\n" "\n"; static ssize_t @@ -652,6 +703,7 @@ static struct dfs_node { { .name = "misc", .fops = &misc_fops, .perm = S_IRUSR | S_IWUSR }, { .name = "addr", .fops = &addr_fops, .perm = S_IRUSR | S_IWUSR }, { .name = "synd", .fops = &synd_fops, .perm = S_IRUSR | S_IWUSR }, + { .name = "ipid", .fops = &ipid_fops, .perm = S_IRUSR | S_IWUSR }, { .name = "bank", .fops = &bank_fops, .perm = S_IRUSR | S_IWUSR }, { .name = "flags", .fops = &flags_fops, .perm = S_IRUSR | S_IWUSR }, { .name = "cpu", .fops = &extcpu_fops, .perm = S_IRUSR | S_IWUSR }, @@ -669,11 +721,54 @@ static void __init debugfs_init(void) &i_mce, dfs_fls[i].fops); } +static void check_hw_inj_possible(void) +{ + int cpu; + u8 bank; + + /* + * This behavior exists only on SMCA systems though its not directly + * related to SMCA. 
+ */ + if (!cpu_feature_enabled(X86_FEATURE_SMCA)) + return; + + cpu = get_cpu(); + + for (bank = 0; bank < MAX_NR_BANKS; ++bank) { + u64 status = MCI_STATUS_VAL, ipid; + + /* Check whether bank is populated */ + rdmsrl(MSR_AMD64_SMCA_MCx_IPID(bank), ipid); + if (!ipid) + continue; + + toggle_hw_mce_inject(cpu, true); + + wrmsrl_safe(mca_msr_reg(bank, MCA_STATUS), status); + rdmsrl_safe(mca_msr_reg(bank, MCA_STATUS), &status); + + if (!status) { + hw_injection_possible = false; + pr_warn("Platform does not allow *hardware* error injection." + "Try using APEI EINJ instead.\n"); + } + + toggle_hw_mce_inject(cpu, false); + + break; + } + + put_cpu(); +} + static int __init inject_init(void) { if (!alloc_cpumask_var(&mce_inject_cpumask, GFP_KERNEL)) return -ENOMEM; + check_hw_inj_possible(); + debugfs_init(); register_nmi_handler(NMI_LOCAL, mce_raise_notify, 0, "mce_notify"); diff --git a/arch/x86/kernel/cpu/mce/intel.c b/arch/x86/kernel/cpu/mce/intel.c index f996ffb887bc..95275a5e57e0 100644 --- a/arch/x86/kernel/cpu/mce/intel.c +++ b/arch/x86/kernel/cpu/mce/intel.c @@ -193,7 +193,7 @@ unsigned long cmci_intel_adjust_timer(unsigned long interval) if (!atomic_sub_return(1, &cmci_storm_on_cpus)) pr_notice("CMCI storm subsided: switching to interrupt mode\n"); - /* FALLTHROUGH */ + fallthrough; case CMCI_STORM_SUBSIDED: /* @@ -470,54 +470,52 @@ void intel_clear_lmce(void) wrmsrl(MSR_IA32_MCG_EXT_CTL, val); } -static void intel_ppin_init(struct cpuinfo_x86 *c) +/* + * Enable additional error logs from the integrated + * memory controller on processors that support this. + */ +static void intel_imc_init(struct cpuinfo_x86 *c) { - unsigned long long val; + u64 error_control; - /* - * Even if testing the presence of the MSR would be enough, we don't - * want to risk the situation where other models reuse this MSR for - * other purposes. 
- */ switch (c->x86_model) { + case INTEL_FAM6_SANDYBRIDGE_X: case INTEL_FAM6_IVYBRIDGE_X: case INTEL_FAM6_HASWELL_X: - case INTEL_FAM6_BROADWELL_D: - case INTEL_FAM6_BROADWELL_X: - case INTEL_FAM6_SKYLAKE_X: - case INTEL_FAM6_ICELAKE_X: - case INTEL_FAM6_XEON_PHI_KNL: - case INTEL_FAM6_XEON_PHI_KNM: - - if (rdmsrl_safe(MSR_PPIN_CTL, &val)) + if (rdmsrl_safe(MSR_ERROR_CONTROL, &error_control)) return; - - if ((val & 3UL) == 1UL) { - /* PPIN locked in disabled mode */ - return; - } - - /* If PPIN is disabled, try to enable */ - if (!(val & 2UL)) { - wrmsrl_safe(MSR_PPIN_CTL, val | 2UL); - rdmsrl_safe(MSR_PPIN_CTL, &val); - } - - /* Is the enable bit set? */ - if (val & 2UL) - set_cpu_cap(c, X86_FEATURE_INTEL_PPIN); + error_control |= 2; + wrmsrl_safe(MSR_ERROR_CONTROL, error_control); + break; } } void mce_intel_feature_init(struct cpuinfo_x86 *c) { - intel_init_thermal(c); intel_init_cmci(); intel_init_lmce(); - intel_ppin_init(c); + intel_imc_init(c); } void mce_intel_feature_clear(struct cpuinfo_x86 *c) { intel_clear_lmce(); } + +bool intel_filter_mce(struct mce *m) +{ + struct cpuinfo_x86 *c = &boot_cpu_data; + + /* MCE errata HSD131, HSM142, HSW131, BDM48, HSM142 and SKX37 */ + if ((c->x86 == 6) && + ((c->x86_model == INTEL_FAM6_HASWELL) || + (c->x86_model == INTEL_FAM6_HASWELL_L) || + (c->x86_model == INTEL_FAM6_BROADWELL) || + (c->x86_model == INTEL_FAM6_HASWELL_G) || + (c->x86_model == INTEL_FAM6_SKYLAKE_X)) && + (m->bank == 0) && + ((m->status & 0xa0000000ffffffff) == 0x80000000000f0005)) + return true; + + return false; +} diff --git a/arch/x86/kernel/cpu/mce/internal.h b/arch/x86/kernel/cpu/mce/internal.h index b785c0d0b590..7e03f5b7f6bd 100644 --- a/arch/x86/kernel/cpu/mce/internal.h +++ b/arch/x86/kernel/cpu/mce/internal.h @@ -35,7 +35,7 @@ int mce_gen_pool_add(struct mce *mce); int mce_gen_pool_init(void); struct llist_node *mce_gen_pool_prepare_records(void); -extern int (*mce_severity)(struct mce *a, int tolerant, char **msg, bool is_excp); +int 
mce_severity(struct mce *a, struct pt_regs *regs, char **msg, bool is_excp); struct dentry *mce_get_debugfs_dir(void); extern mce_banks_t mce_banks_ce_disabled; @@ -48,6 +48,7 @@ void cmci_disable_bank(int bank); void intel_init_cmci(void); void intel_init_lmce(void); void intel_clear_lmce(void); +bool intel_filter_mce(struct mce *m); #else # define cmci_intel_adjust_timer mce_adjust_timer_default static inline bool mce_intel_cmci_poll(void) { return false; } @@ -56,6 +57,7 @@ static inline void cmci_disable_bank(int bank) { } static inline void intel_init_cmci(void) { } static inline void intel_init_lmce(void) { } static inline void intel_clear_lmce(void) { } +static inline bool intel_filter_mce(struct mce *m) { return false; } #endif void mce_timer_kick(unsigned long interval); @@ -111,22 +113,24 @@ static inline void mce_unregister_injector_chain(struct notifier_block *nb) { } #endif struct mca_config { - bool dont_log_ce; - bool cmci_disabled; - bool ignore_ce; - __u64 lmce_disabled : 1, disabled : 1, ser : 1, recovery : 1, bios_cmci_threshold : 1, - __reserved : 59; + /* Proper #MC exception handler is set */ + initialized : 1, + __reserved : 58; + + bool dont_log_ce; + bool cmci_disabled; + bool ignore_ce; + bool print_all; - s8 bootlog; - int tolerant; int monarch_timeout; int panic_timeout; u32 rip_msr; + s8 bootlog; }; extern struct mca_config mca_cfg; @@ -143,7 +147,7 @@ struct mce_vendor_flags { * Recovery. It indicates support for data poisoning in HW and deferred * error interrupts. */ - succor : 1, + succor : 1, /* * (AMD) SMCA: This bit indicates support for Scalable MCA which expands @@ -151,29 +155,79 @@ struct mce_vendor_flags { * banks. Also, to accommodate the new banks and registers, the MCA * register space is moved to a new MSR range. */ - smca : 1, + smca : 1, + + /* AMD-style error thresholding banks present. 
*/ + amd_threshold : 1, + + /* Pentium, family 5-style MCA */ + p5 : 1, + + /* Centaur Winchip C6-style MCA */ + winchip : 1, - __reserved_0 : 61; + /* SandyBridge IFU quirk */ + snb_ifu_quirk : 1, + + /* Skylake, Cascade Lake, Cooper Lake REP;MOVS* quirk */ + skx_repmov_quirk : 1, + + __reserved_0 : 56; }; extern struct mce_vendor_flags mce_flags; -struct mca_msr_regs { - u32 (*ctl) (int bank); - u32 (*status) (int bank); - u32 (*addr) (int bank); - u32 (*misc) (int bank); +enum mca_msr { + MCA_CTL, + MCA_STATUS, + MCA_ADDR, + MCA_MISC, }; -extern struct mca_msr_regs msr_ops; - /* Decide whether to add MCE record to MCE event pool or filter it out. */ extern bool filter_mce(struct mce *m); #ifdef CONFIG_X86_MCE_AMD extern bool amd_filter_mce(struct mce *m); #else -static inline bool amd_filter_mce(struct mce *m) { return false; }; +static inline bool amd_filter_mce(struct mce *m) { return false; } +#endif + +#ifdef CONFIG_X86_ANCIENT_MCE +void intel_p5_mcheck_init(struct cpuinfo_x86 *c); +void winchip_mcheck_init(struct cpuinfo_x86 *c); +noinstr void pentium_machine_check(struct pt_regs *regs); +noinstr void winchip_machine_check(struct pt_regs *regs); +static inline void enable_p5_mce(void) { mce_p5_enabled = 1; } +#else +static inline void intel_p5_mcheck_init(struct cpuinfo_x86 *c) {} +static inline void winchip_mcheck_init(struct cpuinfo_x86 *c) {} +static inline void enable_p5_mce(void) {} +static inline void pentium_machine_check(struct pt_regs *regs) {} +static inline void winchip_machine_check(struct pt_regs *regs) {} #endif +noinstr u64 mce_rdmsrl(u32 msr); + +static __always_inline u32 mca_msr_reg(int bank, enum mca_msr reg) +{ + if (cpu_feature_enabled(X86_FEATURE_SMCA)) { + switch (reg) { + case MCA_CTL: return MSR_AMD64_SMCA_MCx_CTL(bank); + case MCA_ADDR: return MSR_AMD64_SMCA_MCx_ADDR(bank); + case MCA_MISC: return MSR_AMD64_SMCA_MCx_MISC(bank); + case MCA_STATUS: return MSR_AMD64_SMCA_MCx_STATUS(bank); + } + } + + switch (reg) { + case MCA_CTL: 
return MSR_IA32_MCx_CTL(bank); + case MCA_ADDR: return MSR_IA32_MCx_ADDR(bank); + case MCA_MISC: return MSR_IA32_MCx_MISC(bank); + case MCA_STATUS: return MSR_IA32_MCx_STATUS(bank); + } + + return 0; +} + #endif /* __X86_MCE_INTERNAL_H__ */ diff --git a/arch/x86/kernel/cpu/mce/p5.c b/arch/x86/kernel/cpu/mce/p5.c index 4ae6df556526..2272ad53fc33 100644 --- a/arch/x86/kernel/cpu/mce/p5.c +++ b/arch/x86/kernel/cpu/mce/p5.c @@ -7,6 +7,7 @@ #include <linux/kernel.h> #include <linux/types.h> #include <linux/smp.h> +#include <linux/hardirq.h> #include <asm/processor.h> #include <asm/traps.h> @@ -20,12 +21,11 @@ int mce_p5_enabled __read_mostly; /* Machine check handler for Pentium class Intel CPUs: */ -static void pentium_machine_check(struct pt_regs *regs, long error_code) +noinstr void pentium_machine_check(struct pt_regs *regs) { u32 loaddr, hi, lotype; - ist_enter(regs); - + instrumentation_begin(); rdmsr(MSR_IA32_P5_MC_ADDR, loaddr, hi); rdmsr(MSR_IA32_P5_MC_TYPE, lotype, hi); @@ -38,8 +38,7 @@ static void pentium_machine_check(struct pt_regs *regs, long error_code) } add_taint(TAINT_MACHINE_CHECK, LOCKDEP_NOW_UNRELIABLE); - - ist_exit(regs); + instrumentation_end(); } /* Set up machine check reporting for processors with Intel style MCE: */ @@ -55,10 +54,6 @@ void intel_p5_mcheck_init(struct cpuinfo_x86 *c) if (!cpu_has(c, X86_FEATURE_MCE)) return; - machine_check_vector = pentium_machine_check; - /* Make sure the vector pointer is visible before we enable MCEs: */ - wmb(); - /* Read registers before enabling: */ rdmsr(MSR_IA32_P5_MC_ADDR, l, h); rdmsr(MSR_IA32_P5_MC_TYPE, l, h); diff --git a/arch/x86/kernel/cpu/mce/severity.c b/arch/x86/kernel/cpu/mce/severity.c index 87bcdc6dc2f0..00483d1c27e4 100644 --- a/arch/x86/kernel/cpu/mce/severity.c +++ b/arch/x86/kernel/cpu/mce/severity.c @@ -9,9 +9,14 @@ #include <linux/seq_file.h> #include <linux/init.h> #include <linux/debugfs.h> -#include <asm/mce.h> #include <linux/uaccess.h> +#include <asm/mce.h> +#include 
<asm/intel-family.h> +#include <asm/traps.h> +#include <asm/insn.h> +#include <asm/insn-eval.h> + #include "internal.h" /* @@ -40,9 +45,14 @@ static struct severity { unsigned char context; unsigned char excp; unsigned char covered; + unsigned char cpu_model; + unsigned char cpu_minstepping; + unsigned char bank_lo, bank_hi; char *msg; } severities[] = { #define MCESEV(s, m, c...) { .sev = MCE_ ## s ## _SEVERITY, .msg = m, ## c } +#define BANK_RANGE(l, h) .bank_lo = l, .bank_hi = h +#define MODEL_STEPPING(m, s) .cpu_model = m, .cpu_minstepping = s #define KERNEL .context = IN_KERNEL #define USER .context = IN_USER #define KERNEL_RECOV .context = IN_KERNEL_RECOV @@ -90,14 +100,9 @@ static struct severity { EXCP, KERNEL_RECOV, MCGMASK(MCG_STATUS_RIPV, 0) ), MCESEV( - DEFERRED, "Deferred error", - NOSER, MASK(MCI_STATUS_UC|MCI_STATUS_DEFERRED|MCI_STATUS_POISON, MCI_STATUS_DEFERRED) - ), - MCESEV( KEEP, "Corrected error", NOSER, BITCLR(MCI_STATUS_UC) ), - /* * known AO MCACODs reported via MCE or CMC: * @@ -113,6 +118,18 @@ static struct severity { AO, "Action optional: last level cache writeback error", SER, MASK(MCI_UC_AR|MCACOD, MCI_STATUS_UC|MCACOD_L3WB) ), + /* + * Quirk for Skylake/Cascade Lake. Patrol scrubber may be configured + * to report uncorrected errors using CMCI with a special signature. + * UC=0, MSCOD=0x0010, MCACOD=binary(000X 0000 1100 XXXX) reported + * in one of the memory controller banks. + * Set severity to "AO" for same action as normal patrol scrub error. 
+ */ + MCESEV( + AO, "Uncorrected Patrol Scrub Error", + SER, MASK(MCI_STATUS_UC|MCI_ADDR|0xffffeff0, MCI_ADDR|0x001000c0), + MODEL_STEPPING(INTEL_FAM6_SKYLAKE_X, 4), BANK_RANGE(13, 18) + ), /* ignore OVER for UCNA */ MCESEV( @@ -125,7 +142,7 @@ static struct severity { MASK(MCI_STATUS_OVER|MCI_UC_SAR, MCI_STATUS_UC|MCI_STATUS_AR) ), MCESEV( - KEEP, "Non signalled machine check", + KEEP, "Non signaled machine check", SER, BITCLR(MCI_STATUS_S) ), @@ -198,6 +215,46 @@ static struct severity { #define mc_recoverable(mcg) (((mcg) & (MCG_STATUS_RIPV|MCG_STATUS_EIPV)) == \ (MCG_STATUS_RIPV|MCG_STATUS_EIPV)) +static bool is_copy_from_user(struct pt_regs *regs) +{ + u8 insn_buf[MAX_INSN_SIZE]; + unsigned long addr; + struct insn insn; + int ret; + + if (!regs) + return false; + + if (copy_from_kernel_nofault(insn_buf, (void *)regs->ip, MAX_INSN_SIZE)) + return false; + + ret = insn_decode_kernel(&insn, insn_buf); + if (ret < 0) + return false; + + switch (insn.opcode.value) { + /* MOV mem,reg */ + case 0x8A: case 0x8B: + /* MOVZ mem,reg */ + case 0xB60F: case 0xB70F: + addr = (unsigned long)insn_get_addr_ref(&insn, regs); + break; + /* REP MOVS */ + case 0xA4: case 0xA5: + addr = regs->si; + break; + default: + return false; + } + + if (fault_in_kernel_space(addr)) + return false; + + current->mce_vaddr = (void __user *)addr; + + return true; +} + /* * If mcgstatus indicated that ip/cs on the stack were * no good, then "m->cs" will be zero and we will have @@ -209,102 +266,106 @@ static struct severity { * distinguish an exception taken in user from from one * taken in the kernel. */ -static int error_context(struct mce *m) +static noinstr int error_context(struct mce *m, struct pt_regs *regs) { + int fixup_type; + bool copy_user; + if ((m->cs & 3) == 3) return IN_USER; - if (mc_recoverable(m->mcgstatus) && ex_has_fault_handler(m->ip)) + + if (!mc_recoverable(m->mcgstatus)) + return IN_KERNEL; + + /* Allow instrumentation around external facilities usage. 
*/ + instrumentation_begin(); + fixup_type = ex_get_fixup_type(m->ip); + copy_user = is_copy_from_user(regs); + instrumentation_end(); + + switch (fixup_type) { + case EX_TYPE_UACCESS: + case EX_TYPE_COPY: + if (!copy_user) + return IN_KERNEL; + m->kflags |= MCE_IN_KERNEL_COPYIN; + fallthrough; + + case EX_TYPE_FAULT_MCE_SAFE: + case EX_TYPE_DEFAULT_MCE_SAFE: + m->kflags |= MCE_IN_KERNEL_RECOV; return IN_KERNEL_RECOV; - return IN_KERNEL; + + default: + return IN_KERNEL; + } } -static int mce_severity_amd_smca(struct mce *m, enum context err_ctx) +/* See AMD PPR(s) section Machine Check Error Handling. */ +static noinstr int mce_severity_amd(struct mce *m, struct pt_regs *regs, char **msg, bool is_excp) { - u32 addr = MSR_AMD64_SMCA_MCx_CONFIG(m->bank); - u32 low, high; + char *panic_msg = NULL; + int ret; /* - * We need to look at the following bits: - * - "succor" bit (data poisoning support), and - * - TCC bit (Task Context Corrupt) - * in MCi_STATUS to determine error severity. + * Default return value: Action required, the error must be handled + * immediately. */ - if (!mce_flags.succor) - return MCE_PANIC_SEVERITY; - - if (rdmsr_safe(addr, &low, &high)) - return MCE_PANIC_SEVERITY; - - /* TCC (Task context corrupt). If set and if IN_KERNEL, panic. */ - if ((low & MCI_CONFIG_MCAX) && - (m->status & MCI_STATUS_TCC) && - (err_ctx == IN_KERNEL)) - return MCE_PANIC_SEVERITY; - - /* ...otherwise invoke hwpoison handler. */ - return MCE_AR_SEVERITY; -} - -/* - * See AMD Error Scope Hierarchy table in a newer BKDG. For example - * 49125_15h_Models_30h-3Fh_BKDG.pdf, section "RAS Features" - */ -static int mce_severity_amd(struct mce *m, int tolerant, char **msg, bool is_excp) -{ - enum context ctx = error_context(m); + ret = MCE_AR_SEVERITY; /* Processor Context Corrupt, no need to fumble too much, die! 
*/ - if (m->status & MCI_STATUS_PCC) - return MCE_PANIC_SEVERITY; - - if (m->status & MCI_STATUS_UC) { - - if (ctx == IN_KERNEL) - return MCE_PANIC_SEVERITY; + if (m->status & MCI_STATUS_PCC) { + panic_msg = "Processor Context Corrupt"; + ret = MCE_PANIC_SEVERITY; + goto out; + } - /* - * On older systems where overflow_recov flag is not present, we - * should simply panic if an error overflow occurs. If - * overflow_recov flag is present and set, then software can try - * to at least kill process to prolong system operation. - */ - if (mce_flags.overflow_recov) { - if (mce_flags.smca) - return mce_severity_amd_smca(m, ctx); - - /* kill current process */ - return MCE_AR_SEVERITY; - } else { - /* at least one error was not logged */ - if (m->status & MCI_STATUS_OVER) - return MCE_PANIC_SEVERITY; - } - - /* - * For any other case, return MCE_UC_SEVERITY so that we log the - * error and exit #MC handler. - */ - return MCE_UC_SEVERITY; + if (m->status & MCI_STATUS_DEFERRED) { + ret = MCE_DEFERRED_SEVERITY; + goto out; } /* - * deferred error: poll handler catches these and adds to mce_ring so - * memory-failure can take recovery actions. + * If the UC bit is not set, the system either corrected or deferred + * the error. No action will be required after logging the error. */ - if (m->status & MCI_STATUS_DEFERRED) - return MCE_DEFERRED_SEVERITY; + if (!(m->status & MCI_STATUS_UC)) { + ret = MCE_KEEP_SEVERITY; + goto out; + } /* - * corrected error: poll handler catches these and passes responsibility - * of decoding the error to EDAC + * On MCA overflow, without the MCA overflow recovery feature the + * system will not be able to recover, panic. 
*/ - return MCE_KEEP_SEVERITY; + if ((m->status & MCI_STATUS_OVER) && !mce_flags.overflow_recov) { + panic_msg = "Overflowed uncorrected error without MCA Overflow Recovery"; + ret = MCE_PANIC_SEVERITY; + goto out; + } + + if (!mce_flags.succor) { + panic_msg = "Uncorrected error without MCA Recovery"; + ret = MCE_PANIC_SEVERITY; + goto out; + } + + if (error_context(m, regs) == IN_KERNEL) { + panic_msg = "Uncorrected unrecoverable error in kernel context"; + ret = MCE_PANIC_SEVERITY; + } + +out: + if (msg && panic_msg) + *msg = panic_msg; + + return ret; } -static int mce_severity_intel(struct mce *m, int tolerant, char **msg, bool is_excp) +static noinstr int mce_severity_intel(struct mce *m, struct pt_regs *regs, char **msg, bool is_excp) { enum exception excp = (is_excp ? EXCP_CONTEXT : NO_EXCP); - enum context ctx = error_context(m); + enum context ctx = error_context(m, regs); struct severity *s; for (s = severities;; s++) { @@ -320,26 +381,30 @@ static int mce_severity_intel(struct mce *m, int tolerant, char **msg, bool is_e continue; if (s->excp && excp != s->excp) continue; + if (s->cpu_model && boot_cpu_data.x86_model != s->cpu_model) + continue; + if (s->cpu_minstepping && boot_cpu_data.x86_stepping < s->cpu_minstepping) + continue; + if (s->bank_lo && (m->bank < s->bank_lo || m->bank > s->bank_hi)) + continue; if (msg) *msg = s->msg; s->covered = 1; - if (s->sev >= MCE_UC_SEVERITY && ctx == IN_KERNEL) { - if (tolerant < 1) - return MCE_PANIC_SEVERITY; - } + + if (s->sev >= MCE_UC_SEVERITY && ctx == IN_KERNEL) + return MCE_PANIC_SEVERITY; + return s->sev; } } -/* Default to mce_severity_intel */ -int (*mce_severity)(struct mce *m, int tolerant, char **msg, bool is_excp) = - mce_severity_intel; - -void __init mcheck_vendor_init_severity(void) +int noinstr mce_severity(struct mce *m, struct pt_regs *regs, char **msg, bool is_excp) { if (boot_cpu_data.x86_vendor == X86_VENDOR_AMD || boot_cpu_data.x86_vendor == X86_VENDOR_HYGON) - mce_severity = 
mce_severity_amd; + return mce_severity_amd(m, regs, msg, is_excp); + else + return mce_severity_intel(m, regs, msg, is_excp); } #ifdef CONFIG_DEBUG_FS diff --git a/arch/x86/kernel/cpu/mce/therm_throt.c b/arch/x86/kernel/cpu/mce/therm_throt.c deleted file mode 100644 index f36dc0742085..000000000000 --- a/arch/x86/kernel/cpu/mce/therm_throt.c +++ /dev/null @@ -1,740 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0-only -/* - * Thermal throttle event support code (such as syslog messaging and rate - * limiting) that was factored out from x86_64 (mce_intel.c) and i386 (p4.c). - * - * This allows consistent reporting of CPU thermal throttle events. - * - * Maintains a counter in /sys that keeps track of the number of thermal - * events, such that the user knows how bad the thermal problem might be - * (since the logging to syslog is rate limited). - * - * Author: Dmitriy Zavin (dmitriyz@google.com) - * - * Credits: Adapted from Zwane Mwaikambo's original code in mce_intel.c. - * Inspired by Ross Biro's and Al Borchers' counter code. - */ -#include <linux/interrupt.h> -#include <linux/notifier.h> -#include <linux/jiffies.h> -#include <linux/kernel.h> -#include <linux/percpu.h> -#include <linux/export.h> -#include <linux/types.h> -#include <linux/init.h> -#include <linux/smp.h> -#include <linux/cpu.h> - -#include <asm/processor.h> -#include <asm/traps.h> -#include <asm/apic.h> -#include <asm/mce.h> -#include <asm/msr.h> -#include <asm/trace/irq_vectors.h> - -#include "internal.h" - -/* How long to wait between reporting thermal events */ -#define CHECK_INTERVAL (300 * HZ) - -#define THERMAL_THROTTLING_EVENT 0 -#define POWER_LIMIT_EVENT 1 - -/** - * struct _thermal_state - Represent the current thermal event state - * @next_check: Stores the next timestamp, when it is allowed - * to log the next warning message. - * @last_interrupt_time: Stores the timestamp for the last threshold - * high event. 
- * @therm_work: Delayed workqueue structure - * @count: Stores the current running count for thermal - * or power threshold interrupts. - * @last_count: Stores the previous running count for thermal - * or power threshold interrupts. - * @max_time_ms: This shows the maximum amount of time CPU was - * in throttled state for a single thermal - * threshold high to low state. - * @total_time_ms: This is a cumulative time during which CPU was - * in the throttled state. - * @rate_control_active: Set when a throttling message is logged. - * This is used for the purpose of rate-control. - * @new_event: Stores the last high/low status of the - * THERM_STATUS_PROCHOT or - * THERM_STATUS_POWER_LIMIT. - * @level: Stores whether this _thermal_state instance is - * for a CORE level or for PACKAGE level. - * @sample_index: Index for storing the next sample in the buffer - * temp_samples[]. - * @sample_count: Total number of samples collected in the buffer - * temp_samples[]. - * @average: The last moving average of temperature samples - * @baseline_temp: Temperature at which thermal threshold high - * interrupt was generated. - * @temp_samples: Storage for temperature samples to calculate - * moving average. - * - * This structure is used to represent data related to thermal state for a CPU. - * There is a separate storage for core and package level for each CPU. 
- */ -struct _thermal_state { - u64 next_check; - u64 last_interrupt_time; - struct delayed_work therm_work; - unsigned long count; - unsigned long last_count; - unsigned long max_time_ms; - unsigned long total_time_ms; - bool rate_control_active; - bool new_event; - u8 level; - u8 sample_index; - u8 sample_count; - u8 average; - u8 baseline_temp; - u8 temp_samples[3]; -}; - -struct thermal_state { - struct _thermal_state core_throttle; - struct _thermal_state core_power_limit; - struct _thermal_state package_throttle; - struct _thermal_state package_power_limit; - struct _thermal_state core_thresh0; - struct _thermal_state core_thresh1; - struct _thermal_state pkg_thresh0; - struct _thermal_state pkg_thresh1; -}; - -/* Callback to handle core threshold interrupts */ -int (*platform_thermal_notify)(__u64 msr_val); -EXPORT_SYMBOL(platform_thermal_notify); - -/* Callback to handle core package threshold_interrupts */ -int (*platform_thermal_package_notify)(__u64 msr_val); -EXPORT_SYMBOL_GPL(platform_thermal_package_notify); - -/* Callback support of rate control, return true, if - * callback has rate control */ -bool (*platform_thermal_package_rate_control)(void); -EXPORT_SYMBOL_GPL(platform_thermal_package_rate_control); - - -static DEFINE_PER_CPU(struct thermal_state, thermal_state); - -static atomic_t therm_throt_en = ATOMIC_INIT(0); - -static u32 lvtthmr_init __read_mostly; - -#ifdef CONFIG_SYSFS -#define define_therm_throt_device_one_ro(_name) \ - static DEVICE_ATTR(_name, 0444, \ - therm_throt_device_show_##_name, \ - NULL) \ - -#define define_therm_throt_device_show_func(event, name) \ - \ -static ssize_t therm_throt_device_show_##event##_##name( \ - struct device *dev, \ - struct device_attribute *attr, \ - char *buf) \ -{ \ - unsigned int cpu = dev->id; \ - ssize_t ret; \ - \ - preempt_disable(); /* CPU hotplug */ \ - if (cpu_online(cpu)) { \ - ret = sprintf(buf, "%lu\n", \ - per_cpu(thermal_state, cpu).event.name); \ - } else \ - ret = 0; \ - 
preempt_enable(); \ - \ - return ret; \ -} - -define_therm_throt_device_show_func(core_throttle, count); -define_therm_throt_device_one_ro(core_throttle_count); - -define_therm_throt_device_show_func(core_power_limit, count); -define_therm_throt_device_one_ro(core_power_limit_count); - -define_therm_throt_device_show_func(package_throttle, count); -define_therm_throt_device_one_ro(package_throttle_count); - -define_therm_throt_device_show_func(package_power_limit, count); -define_therm_throt_device_one_ro(package_power_limit_count); - -define_therm_throt_device_show_func(core_throttle, max_time_ms); -define_therm_throt_device_one_ro(core_throttle_max_time_ms); - -define_therm_throt_device_show_func(package_throttle, max_time_ms); -define_therm_throt_device_one_ro(package_throttle_max_time_ms); - -define_therm_throt_device_show_func(core_throttle, total_time_ms); -define_therm_throt_device_one_ro(core_throttle_total_time_ms); - -define_therm_throt_device_show_func(package_throttle, total_time_ms); -define_therm_throt_device_one_ro(package_throttle_total_time_ms); - -static struct attribute *thermal_throttle_attrs[] = { - &dev_attr_core_throttle_count.attr, - &dev_attr_core_throttle_max_time_ms.attr, - &dev_attr_core_throttle_total_time_ms.attr, - NULL -}; - -static const struct attribute_group thermal_attr_group = { - .attrs = thermal_throttle_attrs, - .name = "thermal_throttle" -}; -#endif /* CONFIG_SYSFS */ - -#define CORE_LEVEL 0 -#define PACKAGE_LEVEL 1 - -#define THERM_THROT_POLL_INTERVAL HZ -#define THERM_STATUS_PROCHOT_LOG BIT(1) - -#define THERM_STATUS_CLEAR_CORE_MASK (BIT(1) | BIT(3) | BIT(5) | BIT(7) | BIT(9) | BIT(11) | BIT(13) | BIT(15)) -#define THERM_STATUS_CLEAR_PKG_MASK (BIT(1) | BIT(3) | BIT(5) | BIT(7) | BIT(9) | BIT(11)) - -static void clear_therm_status_log(int level) -{ - int msr; - u64 mask, msr_val; - - if (level == CORE_LEVEL) { - msr = MSR_IA32_THERM_STATUS; - mask = THERM_STATUS_CLEAR_CORE_MASK; - } else { - msr = 
MSR_IA32_PACKAGE_THERM_STATUS; - mask = THERM_STATUS_CLEAR_PKG_MASK; - } - - rdmsrl(msr, msr_val); - msr_val &= mask; - wrmsrl(msr, msr_val & ~THERM_STATUS_PROCHOT_LOG); -} - -static void get_therm_status(int level, bool *proc_hot, u8 *temp) -{ - int msr; - u64 msr_val; - - if (level == CORE_LEVEL) - msr = MSR_IA32_THERM_STATUS; - else - msr = MSR_IA32_PACKAGE_THERM_STATUS; - - rdmsrl(msr, msr_val); - if (msr_val & THERM_STATUS_PROCHOT_LOG) - *proc_hot = true; - else - *proc_hot = false; - - *temp = (msr_val >> 16) & 0x7F; -} - -static void __maybe_unused throttle_active_work(struct work_struct *work) -{ - struct _thermal_state *state = container_of(to_delayed_work(work), - struct _thermal_state, therm_work); - unsigned int i, avg, this_cpu = smp_processor_id(); - u64 now = get_jiffies_64(); - bool hot; - u8 temp; - - get_therm_status(state->level, &hot, &temp); - /* temperature value is offset from the max so lesser means hotter */ - if (!hot && temp > state->baseline_temp) { - if (state->rate_control_active) - pr_info("CPU%d: %s temperature/speed normal (total events = %lu)\n", - this_cpu, - state->level == CORE_LEVEL ? 
"Core" : "Package", - state->count); - - state->rate_control_active = false; - return; - } - - if (time_before64(now, state->next_check) && - state->rate_control_active) - goto re_arm; - - state->next_check = now + CHECK_INTERVAL; - - if (state->count != state->last_count) { - /* There was one new thermal interrupt */ - state->last_count = state->count; - state->average = 0; - state->sample_count = 0; - state->sample_index = 0; - } - - state->temp_samples[state->sample_index] = temp; - state->sample_count++; - state->sample_index = (state->sample_index + 1) % ARRAY_SIZE(state->temp_samples); - if (state->sample_count < ARRAY_SIZE(state->temp_samples)) - goto re_arm; - - avg = 0; - for (i = 0; i < ARRAY_SIZE(state->temp_samples); ++i) - avg += state->temp_samples[i]; - - avg /= ARRAY_SIZE(state->temp_samples); - - if (state->average > avg) { - pr_warn("CPU%d: %s temperature is above threshold, cpu clock is throttled (total events = %lu)\n", - this_cpu, - state->level == CORE_LEVEL ? "Core" : "Package", - state->count); - state->rate_control_active = true; - } - - state->average = avg; - -re_arm: - clear_therm_status_log(state->level); - schedule_delayed_work_on(this_cpu, &state->therm_work, THERM_THROT_POLL_INTERVAL); -} - -/*** - * therm_throt_process - Process thermal throttling event from interrupt - * @curr: Whether the condition is current or not (boolean), since the - * thermal interrupt normally gets called both when the thermal - * event begins and once the event has ended. - * - * This function is called by the thermal interrupt after the - * IRQ has been acknowledged. - * - * It will take care of rate limiting and printing messages to the syslog. 
- */ -static void therm_throt_process(bool new_event, int event, int level) -{ - struct _thermal_state *state; - unsigned int this_cpu = smp_processor_id(); - bool old_event; - u64 now; - struct thermal_state *pstate = &per_cpu(thermal_state, this_cpu); - - now = get_jiffies_64(); - if (level == CORE_LEVEL) { - if (event == THERMAL_THROTTLING_EVENT) - state = &pstate->core_throttle; - else if (event == POWER_LIMIT_EVENT) - state = &pstate->core_power_limit; - else - return; - } else if (level == PACKAGE_LEVEL) { - if (event == THERMAL_THROTTLING_EVENT) - state = &pstate->package_throttle; - else if (event == POWER_LIMIT_EVENT) - state = &pstate->package_power_limit; - else - return; - } else - return; - - old_event = state->new_event; - state->new_event = new_event; - - if (new_event) - state->count++; - - if (event != THERMAL_THROTTLING_EVENT) - return; - - if (new_event && !state->last_interrupt_time) { - bool hot; - u8 temp; - - get_therm_status(state->level, &hot, &temp); - /* - * Ignore short temperature spike as the system is not close - * to PROCHOT. 10C offset is large enough to ignore. It is - * already dropped from the high threshold temperature. - */ - if (temp > 10) - return; - - state->baseline_temp = temp; - state->last_interrupt_time = now; - schedule_delayed_work_on(this_cpu, &state->therm_work, THERM_THROT_POLL_INTERVAL); - } else if (old_event && state->last_interrupt_time) { - unsigned long throttle_time; - - throttle_time = jiffies_delta_to_msecs(now - state->last_interrupt_time); - if (throttle_time > state->max_time_ms) - state->max_time_ms = throttle_time; - state->total_time_ms += throttle_time; - state->last_interrupt_time = 0; - } -} - -static int thresh_event_valid(int level, int event) -{ - struct _thermal_state *state; - unsigned int this_cpu = smp_processor_id(); - struct thermal_state *pstate = &per_cpu(thermal_state, this_cpu); - u64 now = get_jiffies_64(); - - if (level == PACKAGE_LEVEL) - state = (event == 0) ? 
&pstate->pkg_thresh0 : - &pstate->pkg_thresh1; - else - state = (event == 0) ? &pstate->core_thresh0 : - &pstate->core_thresh1; - - if (time_before64(now, state->next_check)) - return 0; - - state->next_check = now + CHECK_INTERVAL; - - return 1; -} - -static bool int_pln_enable; -static int __init int_pln_enable_setup(char *s) -{ - int_pln_enable = true; - - return 1; -} -__setup("int_pln_enable", int_pln_enable_setup); - -#ifdef CONFIG_SYSFS -/* Add/Remove thermal_throttle interface for CPU device: */ -static int thermal_throttle_add_dev(struct device *dev, unsigned int cpu) -{ - int err; - struct cpuinfo_x86 *c = &cpu_data(cpu); - - err = sysfs_create_group(&dev->kobj, &thermal_attr_group); - if (err) - return err; - - if (cpu_has(c, X86_FEATURE_PLN) && int_pln_enable) { - err = sysfs_add_file_to_group(&dev->kobj, - &dev_attr_core_power_limit_count.attr, - thermal_attr_group.name); - if (err) - goto del_group; - } - - if (cpu_has(c, X86_FEATURE_PTS)) { - err = sysfs_add_file_to_group(&dev->kobj, - &dev_attr_package_throttle_count.attr, - thermal_attr_group.name); - if (err) - goto del_group; - - err = sysfs_add_file_to_group(&dev->kobj, - &dev_attr_package_throttle_max_time_ms.attr, - thermal_attr_group.name); - if (err) - goto del_group; - - err = sysfs_add_file_to_group(&dev->kobj, - &dev_attr_package_throttle_total_time_ms.attr, - thermal_attr_group.name); - if (err) - goto del_group; - - if (cpu_has(c, X86_FEATURE_PLN) && int_pln_enable) { - err = sysfs_add_file_to_group(&dev->kobj, - &dev_attr_package_power_limit_count.attr, - thermal_attr_group.name); - if (err) - goto del_group; - } - } - - return 0; - -del_group: - sysfs_remove_group(&dev->kobj, &thermal_attr_group); - - return err; -} - -static void thermal_throttle_remove_dev(struct device *dev) -{ - sysfs_remove_group(&dev->kobj, &thermal_attr_group); -} - -/* Get notified when a cpu comes on/off. Be hotplug friendly. 
*/ -static int thermal_throttle_online(unsigned int cpu) -{ - struct thermal_state *state = &per_cpu(thermal_state, cpu); - struct device *dev = get_cpu_device(cpu); - u32 l; - - state->package_throttle.level = PACKAGE_LEVEL; - state->core_throttle.level = CORE_LEVEL; - - INIT_DELAYED_WORK(&state->package_throttle.therm_work, throttle_active_work); - INIT_DELAYED_WORK(&state->core_throttle.therm_work, throttle_active_work); - - /* Unmask the thermal vector after the above workqueues are initialized. */ - l = apic_read(APIC_LVTTHMR); - apic_write(APIC_LVTTHMR, l & ~APIC_LVT_MASKED); - - return thermal_throttle_add_dev(dev, cpu); -} - -static int thermal_throttle_offline(unsigned int cpu) -{ - struct thermal_state *state = &per_cpu(thermal_state, cpu); - struct device *dev = get_cpu_device(cpu); - u32 l; - - /* Mask the thermal vector before draining evtl. pending work */ - l = apic_read(APIC_LVTTHMR); - apic_write(APIC_LVTTHMR, l | APIC_LVT_MASKED); - - cancel_delayed_work_sync(&state->package_throttle.therm_work); - cancel_delayed_work_sync(&state->core_throttle.therm_work); - - state->package_throttle.rate_control_active = false; - state->core_throttle.rate_control_active = false; - - thermal_throttle_remove_dev(dev); - return 0; -} - -static __init int thermal_throttle_init_device(void) -{ - int ret; - - if (!atomic_read(&therm_throt_en)) - return 0; - - ret = cpuhp_setup_state(CPUHP_AP_ONLINE_DYN, "x86/therm:online", - thermal_throttle_online, - thermal_throttle_offline); - return ret < 0 ? 
ret : 0; -} -device_initcall(thermal_throttle_init_device); - -#endif /* CONFIG_SYSFS */ - -static void notify_package_thresholds(__u64 msr_val) -{ - bool notify_thres_0 = false; - bool notify_thres_1 = false; - - if (!platform_thermal_package_notify) - return; - - /* lower threshold check */ - if (msr_val & THERM_LOG_THRESHOLD0) - notify_thres_0 = true; - /* higher threshold check */ - if (msr_val & THERM_LOG_THRESHOLD1) - notify_thres_1 = true; - - if (!notify_thres_0 && !notify_thres_1) - return; - - if (platform_thermal_package_rate_control && - platform_thermal_package_rate_control()) { - /* Rate control is implemented in callback */ - platform_thermal_package_notify(msr_val); - return; - } - - /* lower threshold reached */ - if (notify_thres_0 && thresh_event_valid(PACKAGE_LEVEL, 0)) - platform_thermal_package_notify(msr_val); - /* higher threshold reached */ - if (notify_thres_1 && thresh_event_valid(PACKAGE_LEVEL, 1)) - platform_thermal_package_notify(msr_val); -} - -static void notify_thresholds(__u64 msr_val) -{ - /* check whether the interrupt handler is defined; - * otherwise simply return - */ - if (!platform_thermal_notify) - return; - - /* lower threshold reached */ - if ((msr_val & THERM_LOG_THRESHOLD0) && - thresh_event_valid(CORE_LEVEL, 0)) - platform_thermal_notify(msr_val); - /* higher threshold reached */ - if ((msr_val & THERM_LOG_THRESHOLD1) && - thresh_event_valid(CORE_LEVEL, 1)) - platform_thermal_notify(msr_val); -} - -/* Thermal transition interrupt handler */ -static void intel_thermal_interrupt(void) -{ - __u64 msr_val; - - if (static_cpu_has(X86_FEATURE_HWP)) - wrmsrl_safe(MSR_HWP_STATUS, 0); - - rdmsrl(MSR_IA32_THERM_STATUS, msr_val); - - /* Check for violation of core thermal thresholds*/ - notify_thresholds(msr_val); - - therm_throt_process(msr_val & THERM_STATUS_PROCHOT, - THERMAL_THROTTLING_EVENT, - CORE_LEVEL); - - if (this_cpu_has(X86_FEATURE_PLN) && int_pln_enable) - therm_throt_process(msr_val & THERM_STATUS_POWER_LIMIT, - 
POWER_LIMIT_EVENT, - CORE_LEVEL); - - if (this_cpu_has(X86_FEATURE_PTS)) { - rdmsrl(MSR_IA32_PACKAGE_THERM_STATUS, msr_val); - /* check violations of package thermal thresholds */ - notify_package_thresholds(msr_val); - therm_throt_process(msr_val & PACKAGE_THERM_STATUS_PROCHOT, - THERMAL_THROTTLING_EVENT, - PACKAGE_LEVEL); - if (this_cpu_has(X86_FEATURE_PLN) && int_pln_enable) - therm_throt_process(msr_val & - PACKAGE_THERM_STATUS_POWER_LIMIT, - POWER_LIMIT_EVENT, - PACKAGE_LEVEL); - } -} - -static void unexpected_thermal_interrupt(void) -{ - pr_err("CPU%d: Unexpected LVT thermal interrupt!\n", - smp_processor_id()); -} - -static void (*smp_thermal_vector)(void) = unexpected_thermal_interrupt; - -asmlinkage __visible void __irq_entry smp_thermal_interrupt(struct pt_regs *regs) -{ - entering_irq(); - trace_thermal_apic_entry(THERMAL_APIC_VECTOR); - inc_irq_stat(irq_thermal_count); - smp_thermal_vector(); - trace_thermal_apic_exit(THERMAL_APIC_VECTOR); - exiting_ack_irq(); -} - -/* Thermal monitoring depends on APIC, ACPI and clock modulation */ -static int intel_thermal_supported(struct cpuinfo_x86 *c) -{ - if (!boot_cpu_has(X86_FEATURE_APIC)) - return 0; - if (!cpu_has(c, X86_FEATURE_ACPI) || !cpu_has(c, X86_FEATURE_ACC)) - return 0; - return 1; -} - -void __init mcheck_intel_therm_init(void) -{ - /* - * This function is only called on boot CPU. 
Save the init thermal - * LVT value on BSP and use that value to restore APs' thermal LVT - * entry BIOS programmed later - */ - if (intel_thermal_supported(&boot_cpu_data)) - lvtthmr_init = apic_read(APIC_LVTTHMR); -} - -void intel_init_thermal(struct cpuinfo_x86 *c) -{ - unsigned int cpu = smp_processor_id(); - int tm2 = 0; - u32 l, h; - - if (!intel_thermal_supported(c)) - return; - - /* - * First check if its enabled already, in which case there might - * be some SMM goo which handles it, so we can't even put a handler - * since it might be delivered via SMI already: - */ - rdmsr(MSR_IA32_MISC_ENABLE, l, h); - - h = lvtthmr_init; - /* - * The initial value of thermal LVT entries on all APs always reads - * 0x10000 because APs are woken up by BSP issuing INIT-SIPI-SIPI - * sequence to them and LVT registers are reset to 0s except for - * the mask bits which are set to 1s when APs receive INIT IPI. - * If BIOS takes over the thermal interrupt and sets its interrupt - * delivery mode to SMI (not fixed), it restores the value that the - * BIOS has programmed on AP based on BSP's info we saved since BIOS - * is always setting the same value for all threads/cores. 
- */ - if ((h & APIC_DM_FIXED_MASK) != APIC_DM_FIXED) - apic_write(APIC_LVTTHMR, lvtthmr_init); - - - if ((l & MSR_IA32_MISC_ENABLE_TM1) && (h & APIC_DM_SMI)) { - if (system_state == SYSTEM_BOOTING) - pr_debug("CPU%d: Thermal monitoring handled by SMI\n", cpu); - return; - } - - /* early Pentium M models use different method for enabling TM2 */ - if (cpu_has(c, X86_FEATURE_TM2)) { - if (c->x86 == 6 && (c->x86_model == 9 || c->x86_model == 13)) { - rdmsr(MSR_THERM2_CTL, l, h); - if (l & MSR_THERM2_CTL_TM_SELECT) - tm2 = 1; - } else if (l & MSR_IA32_MISC_ENABLE_TM2) - tm2 = 1; - } - - /* We'll mask the thermal vector in the lapic till we're ready: */ - h = THERMAL_APIC_VECTOR | APIC_DM_FIXED | APIC_LVT_MASKED; - apic_write(APIC_LVTTHMR, h); - - rdmsr(MSR_IA32_THERM_INTERRUPT, l, h); - if (cpu_has(c, X86_FEATURE_PLN) && !int_pln_enable) - wrmsr(MSR_IA32_THERM_INTERRUPT, - (l | (THERM_INT_LOW_ENABLE - | THERM_INT_HIGH_ENABLE)) & ~THERM_INT_PLN_ENABLE, h); - else if (cpu_has(c, X86_FEATURE_PLN) && int_pln_enable) - wrmsr(MSR_IA32_THERM_INTERRUPT, - l | (THERM_INT_LOW_ENABLE - | THERM_INT_HIGH_ENABLE | THERM_INT_PLN_ENABLE), h); - else - wrmsr(MSR_IA32_THERM_INTERRUPT, - l | (THERM_INT_LOW_ENABLE | THERM_INT_HIGH_ENABLE), h); - - if (cpu_has(c, X86_FEATURE_PTS)) { - rdmsr(MSR_IA32_PACKAGE_THERM_INTERRUPT, l, h); - if (cpu_has(c, X86_FEATURE_PLN) && !int_pln_enable) - wrmsr(MSR_IA32_PACKAGE_THERM_INTERRUPT, - (l | (PACKAGE_THERM_INT_LOW_ENABLE - | PACKAGE_THERM_INT_HIGH_ENABLE)) - & ~PACKAGE_THERM_INT_PLN_ENABLE, h); - else if (cpu_has(c, X86_FEATURE_PLN) && int_pln_enable) - wrmsr(MSR_IA32_PACKAGE_THERM_INTERRUPT, - l | (PACKAGE_THERM_INT_LOW_ENABLE - | PACKAGE_THERM_INT_HIGH_ENABLE - | PACKAGE_THERM_INT_PLN_ENABLE), h); - else - wrmsr(MSR_IA32_PACKAGE_THERM_INTERRUPT, - l | (PACKAGE_THERM_INT_LOW_ENABLE - | PACKAGE_THERM_INT_HIGH_ENABLE), h); - } - - smp_thermal_vector = intel_thermal_interrupt; - - rdmsr(MSR_IA32_MISC_ENABLE, l, h); - wrmsr(MSR_IA32_MISC_ENABLE, l | 
MSR_IA32_MISC_ENABLE_TM1, h); - - pr_info_once("CPU0: Thermal monitoring enabled (%s)\n", - tm2 ? "TM2" : "TM1"); - - /* enable thermal throttle processing */ - atomic_set(&therm_throt_en, 1); -} diff --git a/arch/x86/kernel/cpu/mce/threshold.c b/arch/x86/kernel/cpu/mce/threshold.c index 28812cc15300..6a059a035021 100644 --- a/arch/x86/kernel/cpu/mce/threshold.c +++ b/arch/x86/kernel/cpu/mce/threshold.c @@ -21,12 +21,11 @@ static void default_threshold_interrupt(void) void (*mce_threshold_vector)(void) = default_threshold_interrupt; -asmlinkage __visible void __irq_entry smp_threshold_interrupt(struct pt_regs *regs) +DEFINE_IDTENTRY_SYSVEC(sysvec_threshold) { - entering_irq(); trace_threshold_apic_entry(THRESHOLD_APIC_VECTOR); inc_irq_stat(irq_threshold_count); mce_threshold_vector(); trace_threshold_apic_exit(THRESHOLD_APIC_VECTOR); - exiting_ack_irq(); + ack_APIC_irq(); } diff --git a/arch/x86/kernel/cpu/mce/winchip.c b/arch/x86/kernel/cpu/mce/winchip.c index a30ea13cccc2..6c99f2941909 100644 --- a/arch/x86/kernel/cpu/mce/winchip.c +++ b/arch/x86/kernel/cpu/mce/winchip.c @@ -6,6 +6,7 @@ #include <linux/interrupt.h> #include <linux/kernel.h> #include <linux/types.h> +#include <linux/hardirq.h> #include <asm/processor.h> #include <asm/traps.h> @@ -16,14 +17,12 @@ #include "internal.h" /* Machine check handler for WinChip C6: */ -static void winchip_machine_check(struct pt_regs *regs, long error_code) +noinstr void winchip_machine_check(struct pt_regs *regs) { - ist_enter(regs); - + instrumentation_begin(); pr_emerg("CPU0: Machine Check Exception.\n"); add_taint(TAINT_MACHINE_CHECK, LOCKDEP_NOW_UNRELIABLE); - - ist_exit(regs); + instrumentation_end(); } /* Set up machine check reporting on the Winchip C6 series */ @@ -31,10 +30,6 @@ void winchip_mcheck_init(struct cpuinfo_x86 *c) { u32 lo, hi; - machine_check_vector = winchip_machine_check; - /* Make sure the vector pointer is visible before we enable MCEs: */ - wmb(); - rdmsr(MSR_IDT_FCR1, lo, hi); lo |= (1<<2); /* 
Enable EIERRINT (int 18 MCE) */ lo &= ~(1<<4); /* Enable MCE */ diff --git a/arch/x86/kernel/cpu/microcode/amd.c b/arch/x86/kernel/cpu/microcode/amd.c index 3f6b137ef4e6..3a35dec3ec55 100644 --- a/arch/x86/kernel/cpu/microcode/amd.c +++ b/arch/x86/kernel/cpu/microcode/amd.c @@ -215,7 +215,6 @@ static unsigned int __verify_patch_size(u8 family, u32 sh_psize, size_t buf_size default: WARN(1, "%s: WTF family: 0x%x\n", __func__, family); return 0; - break; } if (sh_psize > min_t(u32, buf_size, max_size)) @@ -441,7 +440,13 @@ apply_microcode_early_amd(u32 cpuid_1_eax, void *ucode, size_t size, bool save_p return ret; native_rdmsr(MSR_AMD64_PATCH_LEVEL, rev, dummy); - if (rev >= mc->hdr.patch_id) + + /* + * Allow application of the same revision to pick up SMT-specific + * changes even if the revision of the other SMT thread is already + * up-to-date. + */ + if (rev > mc->hdr.patch_id) return ret; if (!__apply_microcode_amd(mc)) { @@ -457,17 +462,23 @@ apply_microcode_early_amd(u32 cpuid_1_eax, void *ucode, size_t size, bool save_p static bool get_builtin_microcode(struct cpio_data *cp, unsigned int family) { -#ifdef CONFIG_X86_64 char fw_name[36] = "amd-ucode/microcode_amd.bin"; + struct firmware fw; + + if (IS_ENABLED(CONFIG_X86_32)) + return false; if (family >= 0x15) snprintf(fw_name, sizeof(fw_name), "amd-ucode/microcode_amd_fam%.2xh.bin", family); - return get_builtin_firmware(cp, fw_name); -#else + if (firmware_request_builtin(&fw, fw_name)) { + cp->size = fw.size; + cp->data = (void *)fw.data; + return true; + } + return false; -#endif } static void __load_ucode_amd(unsigned int cpuid_1_eax, struct cpio_data *ret) @@ -523,8 +534,12 @@ void load_ucode_amd_ap(unsigned int cpuid_1_eax) native_rdmsr(MSR_AMD64_PATCH_LEVEL, rev, dummy); - /* Check whether we have saved a new patch already: */ - if (*new_rev && rev < mc->hdr.patch_id) { + /* + * Check whether a new patch has been saved already. 
Also, allow application of + * the same revision in order to pick up SMT-thread-specific configuration even + * if the sibling SMT thread already has an up-to-date revision. + */ + if (*new_rev && rev <= mc->hdr.patch_id) { if (!__apply_microcode_amd(mc)) { *new_rev = mc->hdr.patch_id; return; @@ -783,6 +798,7 @@ static int verify_and_add_patch(u8 family, u8 *fw, unsigned int leftover, kfree(patch); return -EINVAL; } + patch->size = *patch_size; mc_hdr = (struct microcode_header_amd *)(fw + SECTION_HDR_SIZE); proc_id = mc_hdr->processor_rev_id; @@ -864,7 +880,7 @@ load_microcode_amd(bool save, u8 family, const u8 *data, size_t size) return ret; memset(amd_ucode_patch, 0, PATCH_MAX_SIZE); - memcpy(amd_ucode_patch, p->data, min_t(u32, ksize(p->data), PATCH_MAX_SIZE)); + memcpy(amd_ucode_patch, p->data, min_t(u32, p->size, PATCH_MAX_SIZE)); return ret; } @@ -919,12 +935,6 @@ static enum ucode_state request_microcode_amd(int cpu, struct device *device, return ret; } -static enum ucode_state -request_microcode_user(int cpu, const void __user *buf, size_t size) -{ - return UCODE_ERROR; -} - static void microcode_fini_cpu_amd(int cpu) { struct ucode_cpu_info *uci = ucode_cpu_info + cpu; @@ -933,7 +943,6 @@ static void microcode_fini_cpu_amd(int cpu) } static struct microcode_ops microcode_amd_ops = { - .request_microcode_user = request_microcode_user, .request_microcode_fw = request_microcode_amd, .collect_cpu_info = collect_cpu_info_amd, .apply_microcode = apply_microcode_amd, diff --git a/arch/x86/kernel/cpu/microcode/core.c b/arch/x86/kernel/cpu/microcode/core.c index 7019d4b2df0c..6a41cee242f6 100644 --- a/arch/x86/kernel/cpu/microcode/core.c +++ b/arch/x86/kernel/cpu/microcode/core.c @@ -55,7 +55,7 @@ LIST_HEAD(microcode_cache); * All non cpu-hotplug-callback call sites use: * * - microcode_mutex to synchronize with each other; - * - get/put_online_cpus() to synchronize with + * - cpus_read_lock/unlock() to synchronize with * the cpu-hotplug-callback call sites. 
* * We guarantee that only a single cpu is being @@ -140,25 +140,6 @@ static bool __init check_loader_disabled_bsp(void) return *res; } -extern struct builtin_fw __start_builtin_fw[]; -extern struct builtin_fw __end_builtin_fw[]; - -bool get_builtin_firmware(struct cpio_data *cd, const char *name) -{ -#ifdef CONFIG_FW_LOADER - struct builtin_fw *b_fw; - - for (b_fw = __start_builtin_fw; b_fw != __end_builtin_fw; b_fw++) { - if (!strcmp(name, b_fw->name)) { - cd->size = b_fw->size; - cd->data = b_fw->data; - return true; - } - } -#endif - return false; -} - void __init load_ucode_bsp(void) { unsigned int cpuid_1_eax; @@ -392,101 +373,10 @@ static int apply_microcode_on_target(int cpu) return ret; } -#ifdef CONFIG_MICROCODE_OLD_INTERFACE -static int do_microcode_update(const void __user *buf, size_t size) -{ - int error = 0; - int cpu; - - for_each_online_cpu(cpu) { - struct ucode_cpu_info *uci = ucode_cpu_info + cpu; - enum ucode_state ustate; - - if (!uci->valid) - continue; - - ustate = microcode_ops->request_microcode_user(cpu, buf, size); - if (ustate == UCODE_ERROR) { - error = -1; - break; - } else if (ustate == UCODE_NEW) { - apply_microcode_on_target(cpu); - } - } - - return error; -} - -static int microcode_open(struct inode *inode, struct file *file) -{ - return capable(CAP_SYS_RAWIO) ? 
stream_open(inode, file) : -EPERM; -} - -static ssize_t microcode_write(struct file *file, const char __user *buf, - size_t len, loff_t *ppos) -{ - ssize_t ret = -EINVAL; - unsigned long nr_pages = totalram_pages(); - - if ((len >> PAGE_SHIFT) > nr_pages) { - pr_err("too much data (max %ld pages)\n", nr_pages); - return ret; - } - - get_online_cpus(); - mutex_lock(&microcode_mutex); - - if (do_microcode_update(buf, len) == 0) - ret = (ssize_t)len; - - if (ret > 0) - perf_check_microcode(); - - mutex_unlock(&microcode_mutex); - put_online_cpus(); - - return ret; -} - -static const struct file_operations microcode_fops = { - .owner = THIS_MODULE, - .write = microcode_write, - .open = microcode_open, - .llseek = no_llseek, -}; - -static struct miscdevice microcode_dev = { - .minor = MICROCODE_MINOR, - .name = "microcode", - .nodename = "cpu/microcode", - .fops = &microcode_fops, -}; - -static int __init microcode_dev_init(void) -{ - int error; - - error = misc_register(&microcode_dev); - if (error) { - pr_err("can't misc_register on minor=%d\n", MICROCODE_MINOR); - return error; - } - - return 0; -} - -static void __exit microcode_dev_exit(void) -{ - misc_deregister(&microcode_dev); -} -#else -#define microcode_dev_init() 0 -#define microcode_dev_exit() do { } while (0) -#endif - /* fake device for request_firmware */ static struct platform_device *microcode_pdev; +#ifdef CONFIG_MICROCODE_LATE_LOADING /* * Late loading dance. Why the heavy-handed stomp_machine effort? 
* @@ -545,8 +435,7 @@ static int __wait_for_cpus(atomic_t *t, long long timeout) /* * Returns: * < 0 - on error - * 0 - no update done - * 1 - microcode was updated + * 0 - success (no update done or microcode was updated) */ static int __reload_late(void *info) { @@ -573,11 +462,11 @@ static int __reload_late(void *info) else goto wait_for_siblings; - if (err > UCODE_NFOUND) { - pr_warn("Error reloading microcode on CPU %d\n", cpu); + if (err >= UCODE_NFOUND) { + if (err == UCODE_ERROR) + pr_warn("Error reloading microcode on CPU %d\n", cpu); + ret = -1; - } else if (err == UCODE_UPDATED || err == UCODE_OK) { - ret = 1; } wait_for_siblings: @@ -602,16 +491,20 @@ wait_for_siblings: */ static int microcode_reload_late(void) { - int ret; + int old = boot_cpu_data.microcode, ret; + + pr_err("Attempting late microcode loading - it is dangerous and taints the kernel.\n"); + pr_err("You should switch to early loading, if possible.\n"); atomic_set(&late_cpus_in, 0); atomic_set(&late_cpus_out, 0); ret = stop_machine_cpuslocked(__reload_late, NULL, cpu_online_mask); - if (ret > 0) + if (ret == 0) microcode_check(); - pr_info("Reload completed, microcode revision: 0x%x\n", boot_cpu_data.microcode); + pr_info("Reload completed, microcode revision: 0x%x -> 0x%x\n", + old, boot_cpu_data.microcode); return ret; } @@ -632,29 +525,34 @@ static ssize_t reload_store(struct device *dev, if (val != 1) return size; - tmp_ret = microcode_ops->request_microcode_fw(bsp, &microcode_pdev->dev, true); - if (tmp_ret != UCODE_NEW) - return size; - - get_online_cpus(); + cpus_read_lock(); ret = check_online_cpus(); if (ret) goto put; + tmp_ret = microcode_ops->request_microcode_fw(bsp, &microcode_pdev->dev, true); + if (tmp_ret != UCODE_NEW) + goto put; + mutex_lock(&microcode_mutex); ret = microcode_reload_late(); mutex_unlock(&microcode_mutex); put: - put_online_cpus(); + cpus_read_unlock(); - if (ret >= 0) + if (ret == 0) ret = size; + add_taint(TAINT_CPU_OUT_OF_SPEC, LOCKDEP_STILL_OK); + return ret; } +static 
DEVICE_ATTR_WO(reload); +#endif + static ssize_t version_show(struct device *dev, struct device_attribute *attr, char *buf) { @@ -671,7 +569,6 @@ static ssize_t pf_show(struct device *dev, return sprintf(buf, "0x%x\n", uci->cpu_sig.pf); } -static DEVICE_ATTR_WO(reload); static DEVICE_ATTR(version, 0444, version_show, NULL); static DEVICE_ATTR(processor_flags, 0444, pf_show, NULL); @@ -778,9 +675,9 @@ static struct subsys_interface mc_cpu_interface = { }; /** - * mc_bp_resume - Update boot CPU microcode during resume. + * microcode_bsp_resume - Update boot CPU microcode during resume. */ -static void mc_bp_resume(void) +void microcode_bsp_resume(void) { int cpu = smp_processor_id(); struct ucode_cpu_info *uci = ucode_cpu_info + cpu; @@ -792,7 +689,7 @@ static void mc_bp_resume(void) } static struct syscore_ops mc_syscore_ops = { - .resume = mc_bp_resume, + .resume = microcode_bsp_resume, }; static int mc_cpu_starting(unsigned int cpu) @@ -824,7 +721,9 @@ static int mc_cpu_down_prep(unsigned int cpu) } static struct attribute *cpu_root_microcode_attrs[] = { +#ifdef CONFIG_MICROCODE_LATE_LOADING &dev_attr_reload.attr, +#endif NULL }; @@ -833,7 +732,7 @@ static const struct attribute_group cpu_root_microcode_group = { .attrs = cpu_root_microcode_attrs, }; -int __init microcode_init(void) +static int __init microcode_init(void) { struct cpuinfo_x86 *c = &boot_cpu_data; int error; @@ -856,14 +755,11 @@ int __init microcode_init(void) if (IS_ERR(microcode_pdev)) return PTR_ERR(microcode_pdev); - get_online_cpus(); + cpus_read_lock(); mutex_lock(&microcode_mutex); - error = subsys_interface_register(&mc_cpu_interface); - if (!error) - perf_check_microcode(); mutex_unlock(&microcode_mutex); - put_online_cpus(); + cpus_read_unlock(); if (error) goto out_pdev; @@ -876,10 +772,6 @@ int __init microcode_init(void) goto out_driver; } - error = microcode_dev_init(); - if (error) - goto out_ucode_group; - register_syscore_ops(&mc_syscore_ops); 
cpuhp_setup_state_nocalls(CPUHP_AP_MICROCODE_LOADER, "x86/microcode:starting", mc_cpu_starting, NULL); @@ -890,18 +782,14 @@ int __init microcode_init(void) return 0; - out_ucode_group: - sysfs_remove_group(&cpu_subsys.dev_root->kobj, - &cpu_root_microcode_group); - out_driver: - get_online_cpus(); + cpus_read_lock(); mutex_lock(&microcode_mutex); subsys_interface_unregister(&mc_cpu_interface); mutex_unlock(&microcode_mutex); - put_online_cpus(); + cpus_read_unlock(); out_pdev: platform_device_unregister(microcode_pdev); diff --git a/arch/x86/kernel/cpu/microcode/intel.c b/arch/x86/kernel/cpu/microcode/intel.c index 6a99535d7f37..1fcbd671f1df 100644 --- a/arch/x86/kernel/cpu/microcode/intel.c +++ b/arch/x86/kernel/cpu/microcode/intel.c @@ -45,20 +45,6 @@ static struct microcode_intel *intel_ucode_patch; /* last level cache size per core */ static int llc_size_per_core; -static inline bool cpu_signatures_match(unsigned int s1, unsigned int p1, - unsigned int s2, unsigned int p2) -{ - if (s1 != s2) - return false; - - /* Processor flags are either both 0 ... */ - if (!p1 && !p2) - return true; - - /* ... or they intersect. */ - return p1 & p2; -} - /* * Returns 1 if update has been found, 0 otherwise. */ @@ -69,7 +55,7 @@ static int find_matching_signature(void *mc, unsigned int csig, int cpf) struct extended_signature *ext_sig; int i; - if (cpu_signatures_match(csig, cpf, mc_hdr->sig, mc_hdr->pf)) + if (intel_cpu_signatures_match(csig, cpf, mc_hdr->sig, mc_hdr->pf)) return 1; /* Look for ext. 
headers: */ @@ -80,7 +66,7 @@ static int find_matching_signature(void *mc, unsigned int csig, int cpf) ext_sig = (void *)ext_hdr + EXT_HEADER_SIZE; for (i = 0; i < ext_hdr->count; i++) { - if (cpu_signatures_match(csig, cpf, ext_sig->sig, ext_sig->pf)) + if (intel_cpu_signatures_match(csig, cpf, ext_sig->sig, ext_sig->pf)) return 1; ext_sig++; } @@ -100,53 +86,6 @@ static int has_newer_microcode(void *mc, unsigned int csig, int cpf, int new_rev return find_matching_signature(mc, csig, cpf); } -/* - * Given CPU signature and a microcode patch, this function finds if the - * microcode patch has matching family and model with the CPU. - * - * %true - if there's a match - * %false - otherwise - */ -static bool microcode_matches(struct microcode_header_intel *mc_header, - unsigned long sig) -{ - unsigned long total_size = get_totalsize(mc_header); - unsigned long data_size = get_datasize(mc_header); - struct extended_sigtable *ext_header; - unsigned int fam_ucode, model_ucode; - struct extended_signature *ext_sig; - unsigned int fam, model; - int ext_sigcount, i; - - fam = x86_family(sig); - model = x86_model(sig); - - fam_ucode = x86_family(mc_header->sig); - model_ucode = x86_model(mc_header->sig); - - if (fam == fam_ucode && model == model_ucode) - return true; - - /* Look for ext. 
headers: */ - if (total_size <= data_size + MC_HEADER_SIZE) - return false; - - ext_header = (void *) mc_header + data_size + MC_HEADER_SIZE; - ext_sig = (void *)ext_header + EXT_HEADER_SIZE; - ext_sigcount = ext_header->count; - - for (i = 0; i < ext_sigcount; i++) { - fam_ucode = x86_family(ext_sig->sig); - model_ucode = x86_model(ext_sig->sig); - - if (fam == fam_ucode && model == model_ucode) - return true; - - ext_sig++; - } - return false; -} - static struct ucode_patch *memdup_patch(void *data, unsigned int size) { struct ucode_patch *p; @@ -164,7 +103,7 @@ static struct ucode_patch *memdup_patch(void *data, unsigned int size) return p; } -static void save_microcode_patch(void *data, unsigned int size) +static void save_microcode_patch(struct ucode_cpu_info *uci, void *data, unsigned int size) { struct microcode_header_intel *mc_hdr, *mc_saved_hdr; struct ucode_patch *iter, *tmp, *p = NULL; @@ -210,6 +149,9 @@ static void save_microcode_patch(void *data, unsigned int size) if (!p) return; + if (!find_matching_signature(p->data, uci->cpu_sig.sig, uci->cpu_sig.pf)) + return; + /* * Save for early loading. 
On 32-bit, that needs to be a physical * address as the APs are running from physical addresses, before @@ -344,13 +286,14 @@ scan_microcode(void *data, size_t size, struct ucode_cpu_info *uci, bool save) size -= mc_size; - if (!microcode_matches(mc_header, uci->cpu_sig.sig)) { + if (!find_matching_signature(data, uci->cpu_sig.sig, + uci->cpu_sig.pf)) { data += mc_size; continue; } if (save) { - save_microcode_patch(data, mc_size); + save_microcode_patch(uci, data, mc_size); goto next; } @@ -385,37 +328,6 @@ next: return patch; } -static int collect_cpu_info_early(struct ucode_cpu_info *uci) -{ - unsigned int val[2]; - unsigned int family, model; - struct cpu_signature csig = { 0 }; - unsigned int eax, ebx, ecx, edx; - - memset(uci, 0, sizeof(*uci)); - - eax = 0x00000001; - ecx = 0; - native_cpuid(&eax, &ebx, &ecx, &edx); - csig.sig = eax; - - family = x86_family(eax); - model = x86_model(eax); - - if ((model >= 5) || (family > 6)) { - /* get processor flags from MSR 0x17 */ - native_rdmsr(MSR_IA32_PLATFORM_ID, val[0], val[1]); - csig.pf = 1 << ((val[1] >> 18) & 7); - } - - csig.rev = intel_get_microcode_revision(); - - uci->cpu_sig = csig; - uci->valid = 1; - - return 0; -} - static void show_saved_mc(void) { #ifdef DEBUG @@ -429,7 +341,7 @@ static void show_saved_mc(void) return; } - collect_cpu_info_early(&uci); + intel_cpu_collect_info(&uci); sig = uci.cpu_sig.sig; pf = uci.cpu_sig.pf; @@ -483,14 +395,14 @@ static void show_saved_mc(void) * Save this microcode patch. It will be loaded early when a CPU is * hot-added or resumes. */ -static void save_mc_for_early(u8 *mc, unsigned int size) +static void save_mc_for_early(struct ucode_cpu_info *uci, u8 *mc, unsigned int size) { /* Synchronization during CPU hotplug. 
*/ static DEFINE_MUTEX(x86_cpu_microcode_mutex); mutex_lock(&x86_cpu_microcode_mutex); - save_microcode_patch(mc, size); + save_microcode_patch(uci, mc, size); show_saved_mc(); mutex_unlock(&x86_cpu_microcode_mutex); @@ -499,6 +411,7 @@ static void save_mc_for_early(u8 *mc, unsigned int size) static bool load_builtin_intel_microcode(struct cpio_data *cp) { unsigned int eax = 1, ebx, ecx = 0, edx; + struct firmware fw; char name[30]; if (IS_ENABLED(CONFIG_X86_32)) @@ -509,7 +422,13 @@ static bool load_builtin_intel_microcode(struct cpio_data *cp) sprintf(name, "intel-ucode/%02x-%02x-%02x", x86_family(eax), x86_model(eax), x86_stepping(eax)); - return get_builtin_firmware(cp, name); + if (firmware_request_builtin(&fw, name)) { + cp->size = fw.size; + cp->data = (void *)fw.data; + return true; + } + + return false; } /* @@ -538,7 +457,7 @@ void show_ucode_info_early(void) struct ucode_cpu_info uci; if (delay_ucode_info) { - collect_cpu_info_early(&uci); + intel_cpu_collect_info(&uci); print_ucode_info(&uci, current_mc_date); delay_ucode_info = 0; } @@ -640,7 +559,7 @@ int __init save_microcode_in_initrd_intel(void) if (!(cp.data && cp.size)) return 0; - collect_cpu_info_early(&uci); + intel_cpu_collect_info(&uci); scan_microcode(cp.data, cp.size, &uci, true); @@ -673,7 +592,7 @@ static struct microcode_intel *__load_ucode_intel(struct ucode_cpu_info *uci) if (!(cp.data && cp.size)) return NULL; - collect_cpu_info_early(uci); + intel_cpu_collect_info(uci); return scan_microcode(cp.data, cp.size, uci, false); } @@ -748,7 +667,7 @@ void reload_ucode_intel(void) struct microcode_intel *p; struct ucode_cpu_info uci; - collect_cpu_info_early(&uci); + intel_cpu_collect_info(&uci); p = find_patch(&uci); if (!p) @@ -935,7 +854,7 @@ static enum ucode_state generic_load_microcode(int cpu, struct iov_iter *iter) * permanent memory. So it will be loaded early when a CPU is hot added * or resumes. 
*/ - save_mc_for_early(new_mc, new_mc_size); + save_mc_for_early(uci, new_mc, new_mc_size); pr_debug("CPU%d found a matching microcode update with version 0x%x (current=0x%x)\n", cpu, new_rev, uci->cpu_sig.rev); @@ -997,24 +916,7 @@ static enum ucode_state request_microcode_fw(int cpu, struct device *device, return ret; } -static enum ucode_state -request_microcode_user(int cpu, const void __user *buf, size_t size) -{ - struct iov_iter iter; - struct iovec iov; - - if (is_blacklisted(cpu)) - return UCODE_NFOUND; - - iov.iov_base = (void __user *)buf; - iov.iov_len = size; - iov_iter_init(&iter, WRITE, &iov, 1, size); - - return generic_load_microcode(cpu, &iter); -} - static struct microcode_ops microcode_intel_ops = { - .request_microcode_user = request_microcode_user, .request_microcode_fw = request_microcode_fw, .collect_cpu_info = collect_cpu_info, .apply_microcode = apply_microcode_intel, diff --git a/arch/x86/kernel/cpu/mshyperv.c b/arch/x86/kernel/cpu/mshyperv.c index caa032ce3fe3..831613959a92 100644 --- a/arch/x86/kernel/cpu/mshyperv.c +++ b/arch/x86/kernel/cpu/mshyperv.c @@ -18,11 +18,13 @@ #include <linux/kexec.h> #include <linux/i8253.h> #include <linux/random.h> +#include <linux/swiotlb.h> #include <asm/processor.h> #include <asm/hypervisor.h> #include <asm/hyperv-tlfs.h> #include <asm/mshyperv.h> #include <asm/desc.h> +#include <asm/idtentry.h> #include <asm/irq_regs.h> #include <asm/i8259.h> #include <asm/apic.h> @@ -30,9 +32,12 @@ #include <asm/reboot.h> #include <asm/nmi.h> #include <clocksource/hyperv_timer.h> +#include <asm/numa.h> +#include <asm/coco.h> +/* Is Linux running as the root partition? 
*/ +bool hv_root_partition; struct ms_hyperv_info ms_hyperv; -EXPORT_SYMBOL_GPL(ms_hyperv); #if IS_ENABLED(CONFIG_HYPERV) static void (*vmbus_handler)(void); @@ -40,11 +45,10 @@ static void (*hv_stimer0_handler)(void); static void (*hv_kexec_handler)(void); static void (*hv_crash_handler)(struct pt_regs *regs); -__visible void __irq_entry hyperv_vector_handler(struct pt_regs *regs) +DEFINE_IDTENTRY_SYSVEC(sysvec_hyperv_callback) { struct pt_regs *old_regs = set_irq_regs(regs); - entering_irq(); inc_irq_stat(irq_hv_callback_count); if (vmbus_handler) vmbus_handler(); @@ -52,96 +56,100 @@ __visible void __irq_entry hyperv_vector_handler(struct pt_regs *regs) if (ms_hyperv.hints & HV_DEPRECATING_AEOI_RECOMMENDED) ack_APIC_irq(); - exiting_irq(); set_irq_regs(old_regs); } -void hv_setup_vmbus_irq(void (*handler)(void)) +void hv_setup_vmbus_handler(void (*handler)(void)) { vmbus_handler = handler; } -void hv_remove_vmbus_irq(void) +void hv_remove_vmbus_handler(void) { /* We have no way to deallocate the interrupt gate */ vmbus_handler = NULL; } -EXPORT_SYMBOL_GPL(hv_setup_vmbus_irq); -EXPORT_SYMBOL_GPL(hv_remove_vmbus_irq); /* * Routines to do per-architecture handling of stimer0 * interrupts when in Direct Mode */ - -__visible void __irq_entry hv_stimer0_vector_handler(struct pt_regs *regs) +DEFINE_IDTENTRY_SYSVEC(sysvec_hyperv_stimer0) { struct pt_regs *old_regs = set_irq_regs(regs); - entering_irq(); inc_irq_stat(hyperv_stimer0_count); if (hv_stimer0_handler) hv_stimer0_handler(); - add_interrupt_randomness(HYPERV_STIMER0_VECTOR, 0); + add_interrupt_randomness(HYPERV_STIMER0_VECTOR); ack_APIC_irq(); - exiting_irq(); set_irq_regs(old_regs); } -int hv_setup_stimer0_irq(int *irq, int *vector, void (*handler)(void)) +/* For x86/x64, override weak placeholders in hyperv_timer.c */ +void hv_setup_stimer0_handler(void (*handler)(void)) { - *vector = HYPERV_STIMER0_VECTOR; - *irq = -1; /* Unused on x86/x64 */ hv_stimer0_handler = handler; - return 0; } 
-EXPORT_SYMBOL_GPL(hv_setup_stimer0_irq); -void hv_remove_stimer0_irq(int irq) +void hv_remove_stimer0_handler(void) { /* We have no way to deallocate the interrupt gate */ hv_stimer0_handler = NULL; } -EXPORT_SYMBOL_GPL(hv_remove_stimer0_irq); void hv_setup_kexec_handler(void (*handler)(void)) { hv_kexec_handler = handler; } -EXPORT_SYMBOL_GPL(hv_setup_kexec_handler); void hv_remove_kexec_handler(void) { hv_kexec_handler = NULL; } -EXPORT_SYMBOL_GPL(hv_remove_kexec_handler); void hv_setup_crash_handler(void (*handler)(struct pt_regs *regs)) { hv_crash_handler = handler; } -EXPORT_SYMBOL_GPL(hv_setup_crash_handler); void hv_remove_crash_handler(void) { hv_crash_handler = NULL; } -EXPORT_SYMBOL_GPL(hv_remove_crash_handler); #ifdef CONFIG_KEXEC_CORE static void hv_machine_shutdown(void) { if (kexec_in_progress && hv_kexec_handler) hv_kexec_handler(); + + /* + * Call hv_cpu_die() on all the CPUs, otherwise later the hypervisor + * corrupts the old VP Assist Pages and can crash the kexec kernel. + */ + if (kexec_in_progress && hyperv_init_cpuhp > 0) + cpuhp_remove_state(hyperv_init_cpuhp); + + /* The function calls stop_other_cpus(). */ native_machine_shutdown(); + + /* Disable the hypercall page when there is only 1 active CPU. */ + if (kexec_in_progress) + hyperv_cleanup(); } static void hv_machine_crash_shutdown(struct pt_regs *regs) { if (hv_crash_handler) hv_crash_handler(regs); + + /* The function calls crash_smp_send_stop(). */ native_machine_crash_shutdown(regs); + + /* Disable the hypercall page when there is only 1 active CPU. 
*/ + hyperv_cleanup(); } #endif /* CONFIG_KEXEC_CORE */ #endif /* CONFIG_HYPERV */ @@ -157,12 +165,22 @@ static uint32_t __init ms_hyperv_platform(void) cpuid(HYPERV_CPUID_VENDOR_AND_MAX_FUNCTIONS, &eax, &hyp_signature[0], &hyp_signature[1], &hyp_signature[2]); - if (eax >= HYPERV_CPUID_MIN && - eax <= HYPERV_CPUID_MAX && - !memcmp("Microsoft Hv", hyp_signature, 12)) - return HYPERV_CPUID_VENDOR_AND_MAX_FUNCTIONS; + if (eax < HYPERV_CPUID_MIN || eax > HYPERV_CPUID_MAX || + memcmp("Microsoft Hv", hyp_signature, 12)) + return 0; - return 0; + /* HYPERCALL and VP_INDEX MSRs are mandatory for all features. */ + eax = cpuid_eax(HYPERV_CPUID_FEATURES); + if (!(eax & HV_MSR_HYPERCALL_AVAILABLE)) { + pr_warn("x86/hyperv: HYPERCALL MSR not available.\n"); + return 0; + } + if (!(eax & HV_MSR_VP_INDEX_AVAILABLE)) { + pr_warn("x86/hyperv: VP_INDEX MSR not available.\n"); + return 0; + } + + return HYPERV_CPUID_VENDOR_AND_MAX_FUNCTIONS; } static unsigned char hv_get_nmi_reason(void) @@ -173,7 +191,7 @@ static unsigned char hv_get_nmi_reason(void) #ifdef CONFIG_X86_LOCAL_APIC /* * Prior to WS2016 Debug-VM sends NMIs to all CPUs which makes - * it dificult to process CHANNELMSG_UNLOAD in case of crash. Handle + * it difficult to process CHANNELMSG_UNLOAD in case of crash. Handle * unknown NMI on the first CPU which gets it. 
*/ static int hv_nmi_unknown(unsigned int val, struct pt_regs *regs) @@ -207,10 +225,37 @@ static void __init hv_smp_prepare_boot_cpu(void) hv_init_spinlocks(); #endif } + +static void __init hv_smp_prepare_cpus(unsigned int max_cpus) +{ +#ifdef CONFIG_X86_64 + int i; + int ret; +#endif + + native_smp_prepare_cpus(max_cpus); + +#ifdef CONFIG_X86_64 + for_each_present_cpu(i) { + if (i == 0) + continue; + ret = hv_call_add_logical_proc(numa_cpu_node(i), i, cpu_physical_id(i)); + BUG_ON(ret); + } + + for_each_present_cpu(i) { + if (i == 0) + continue; + ret = hv_call_create_vp(numa_cpu_node(i), hv_current_partition_id, i, i); + BUG_ON(ret); + } +#endif +} #endif static void __init ms_hyperv_init_platform(void) { + int hv_max_functions_eax; int hv_host_info_eax; int hv_host_info_ebx; int hv_host_info_ecx; @@ -224,11 +269,15 @@ static void __init ms_hyperv_init_platform(void) * Extract the features and hints */ ms_hyperv.features = cpuid_eax(HYPERV_CPUID_FEATURES); + ms_hyperv.priv_high = cpuid_ebx(HYPERV_CPUID_FEATURES); ms_hyperv.misc_features = cpuid_edx(HYPERV_CPUID_FEATURES); ms_hyperv.hints = cpuid_eax(HYPERV_CPUID_ENLIGHTMENT_INFO); - pr_info("Hyper-V: features 0x%x, hints 0x%x\n", - ms_hyperv.features, ms_hyperv.hints); + hv_max_functions_eax = cpuid_eax(HYPERV_CPUID_VENDOR_AND_MAX_FUNCTIONS); + + pr_info("Hyper-V: privilege flags low 0x%x, high 0x%x, hints 0x%x, misc 0x%x\n", + ms_hyperv.features, ms_hyperv.priv_high, ms_hyperv.hints, + ms_hyperv.misc_features); ms_hyperv.max_vp_index = cpuid_eax(HYPERV_CPUID_IMPLEMENT_LIMITS); ms_hyperv.max_lp_index = cpuid_ebx(HYPERV_CPUID_IMPLEMENT_LIMITS); @@ -237,34 +286,73 @@ static void __init ms_hyperv_init_platform(void) ms_hyperv.max_vp_index, ms_hyperv.max_lp_index); /* + * Check CPU management privilege. + * + * To mirror what Windows does we should extract CPU management + * features and use the ReservedIdentityBit to detect if Linux is the + * root partition. 
But that requires negotiating CPU management + * interface (a process to be finalized). + * + * For now, use the privilege flag as the indicator for running as + * root. + */ + if (cpuid_ebx(HYPERV_CPUID_FEATURES) & HV_CPU_MANAGEMENT) { + hv_root_partition = true; + pr_info("Hyper-V: running as root partition\n"); + } + + /* * Extract host information. */ - if (cpuid_eax(HYPERV_CPUID_VENDOR_AND_MAX_FUNCTIONS) >= - HYPERV_CPUID_VERSION) { + if (hv_max_functions_eax >= HYPERV_CPUID_VERSION) { hv_host_info_eax = cpuid_eax(HYPERV_CPUID_VERSION); hv_host_info_ebx = cpuid_ebx(HYPERV_CPUID_VERSION); hv_host_info_ecx = cpuid_ecx(HYPERV_CPUID_VERSION); hv_host_info_edx = cpuid_edx(HYPERV_CPUID_VERSION); - pr_info("Hyper-V Host Build:%d-%d.%d-%d-%d.%d\n", - hv_host_info_eax, hv_host_info_ebx >> 16, - hv_host_info_ebx & 0xFFFF, hv_host_info_ecx, - hv_host_info_edx >> 24, hv_host_info_edx & 0xFFFFFF); + pr_info("Hyper-V: Host Build %d.%d.%d.%d-%d-%d\n", + hv_host_info_ebx >> 16, hv_host_info_ebx & 0xFFFF, + hv_host_info_eax, hv_host_info_edx & 0xFFFFFF, + hv_host_info_ecx, hv_host_info_edx >> 24); } - if (ms_hyperv.features & HV_X64_ACCESS_FREQUENCY_MSRS && + if (ms_hyperv.features & HV_ACCESS_FREQUENCY_MSRS && ms_hyperv.misc_features & HV_FEATURE_FREQUENCY_MSRS_AVAILABLE) { x86_platform.calibrate_tsc = hv_get_tsc_khz; x86_platform.calibrate_cpu = hv_get_tsc_khz; } - if (ms_hyperv.hints & HV_X64_ENLIGHTENED_VMCS_RECOMMENDED) { + if (ms_hyperv.priv_high & HV_ISOLATION) { + ms_hyperv.isolation_config_a = cpuid_eax(HYPERV_CPUID_ISOLATION_CONFIG); + ms_hyperv.isolation_config_b = cpuid_ebx(HYPERV_CPUID_ISOLATION_CONFIG); + ms_hyperv.shared_gpa_boundary = + BIT_ULL(ms_hyperv.shared_gpa_boundary_bits); + + pr_info("Hyper-V: Isolation Config: Group A 0x%x, Group B 0x%x\n", + ms_hyperv.isolation_config_a, ms_hyperv.isolation_config_b); + + if (hv_get_isolation_type() == HV_ISOLATION_TYPE_SNP) { + static_branch_enable(&isolation_type_snp); +#ifdef CONFIG_SWIOTLB + 
swiotlb_unencrypted_base = ms_hyperv.shared_gpa_boundary; +#endif + } + /* Isolation VMs are unenlightened SEV-based VMs, thus this check: */ + if (IS_ENABLED(CONFIG_AMD_MEM_ENCRYPT)) { + if (hv_get_isolation_type() != HV_ISOLATION_TYPE_NONE) + cc_set_vendor(CC_VENDOR_HYPERV); + } + } + + if (hv_max_functions_eax >= HYPERV_CPUID_NESTED_FEATURES) { ms_hyperv.nested_features = cpuid_eax(HYPERV_CPUID_NESTED_FEATURES); + pr_info("Hyper-V: Nested features: 0x%x\n", + ms_hyperv.nested_features); } #ifdef CONFIG_X86_LOCAL_APIC - if (ms_hyperv.features & HV_X64_ACCESS_FREQUENCY_MSRS && + if (ms_hyperv.features & HV_ACCESS_FREQUENCY_MSRS && ms_hyperv.misc_features & HV_FEATURE_FREQUENCY_MSRS_AVAILABLE) { /* * Get the APIC frequency. @@ -290,11 +378,18 @@ static void __init ms_hyperv_init_platform(void) machine_ops.shutdown = hv_machine_shutdown; machine_ops.crash_shutdown = hv_machine_crash_shutdown; #endif - if (ms_hyperv.features & HV_X64_ACCESS_TSC_INVARIANT) { + if (ms_hyperv.features & HV_ACCESS_TSC_INVARIANT) { + /* + * Writing to synthetic MSR 0x40000118 updates/changes the + * guest visible CPUIDs. Setting bit 0 of this MSR enables + * guests to report invariant TSC feature through CPUID + * instruction, CPUID 0x800000007/EDX, bit 8. See code in + * early_init_intel() where this bit is examined. The + * setting of this MSR bit should happen before init_intel() + * is called. 
+ */ wrmsrl(HV_X64_MSR_TSC_INVARIANT_CONTROL, 0x1); setup_force_cpu_cap(X86_FEATURE_TSC_RELIABLE); - } else { - mark_tsc_unstable("running on Hyper-V"); } /* @@ -321,25 +416,29 @@ static void __init ms_hyperv_init_platform(void) x86_platform.apic_post_init = hyperv_init; hyperv_setup_mmu_ops(); /* Setup the IDT for hypervisor callback */ - alloc_intr_gate(HYPERVISOR_CALLBACK_VECTOR, hyperv_callback_vector); + alloc_intr_gate(HYPERVISOR_CALLBACK_VECTOR, asm_sysvec_hyperv_callback); /* Setup the IDT for reenlightenment notifications */ - if (ms_hyperv.features & HV_X64_ACCESS_REENLIGHTENMENT) + if (ms_hyperv.features & HV_ACCESS_REENLIGHTENMENT) { alloc_intr_gate(HYPERV_REENLIGHTENMENT_VECTOR, - hyperv_reenlightenment_vector); + asm_sysvec_hyperv_reenlightenment); + } /* Setup the IDT for stimer0 */ - if (ms_hyperv.misc_features & HV_STIMER_DIRECT_MODE_AVAILABLE) + if (ms_hyperv.misc_features & HV_STIMER_DIRECT_MODE_AVAILABLE) { alloc_intr_gate(HYPERV_STIMER0_VECTOR, - hv_stimer0_callback_vector); + asm_sysvec_hyperv_stimer0); + } # ifdef CONFIG_SMP smp_ops.smp_prepare_boot_cpu = hv_smp_prepare_boot_cpu; + if (hv_root_partition) + smp_ops.smp_prepare_cpus = hv_smp_prepare_cpus; # endif /* * Hyper-V doesn't provide irq remapping for IO-APIC. To enable x2apic, - * set x2apic destination mode to physcial mode when x2apic is available + * set x2apic destination mode to physical mode when x2apic is available * and Hyper-V IOMMU driver makes sure cpus assigned with IO-APIC irqs * have 8-bit APIC id. */ @@ -351,18 +450,49 @@ static void __init ms_hyperv_init_platform(void) /* Register Hyper-V specific clocksource */ hv_init_clocksource(); #endif + /* + * TSC should be marked as unstable only after Hyper-V + * clocksource has been initialized. This ensures that the + * stability of the sched_clock is not altered. 
+ */ + if (!(ms_hyperv.features & HV_ACCESS_TSC_INVARIANT)) + mark_tsc_unstable("running on Hyper-V"); + + hardlockup_detector_disable(); } -void hv_setup_sched_clock(void *sched_clock) +static bool __init ms_hyperv_x2apic_available(void) { -#ifdef CONFIG_PARAVIRT - pv_ops.time.sched_clock = sched_clock; -#endif + return x2apic_supported(); +} + +/* + * If ms_hyperv_msi_ext_dest_id() returns true, hyperv_prepare_irq_remapping() + * returns -ENODEV and the Hyper-V IOMMU driver is not used; instead, the + * generic support of the 15-bit APIC ID is used: see __irq_msi_compose_msg(). + * + * Note: for a VM on Hyper-V, the I/O-APIC is the only device which + * (logically) generates MSIs directly to the system APIC irq domain. + * There is no HPET, and PCI MSI/MSI-X interrupts are remapped by the + * pci-hyperv host bridge. + */ +static bool __init ms_hyperv_msi_ext_dest_id(void) +{ + u32 eax; + + eax = cpuid_eax(HYPERV_CPUID_VIRT_STACK_INTERFACE); + if (eax != HYPERV_VS_INTERFACE_EAX_SIGNATURE) + return false; + + eax = cpuid_eax(HYPERV_CPUID_VIRT_STACK_PROPERTIES); + return eax & HYPERV_VS_PROPERTIES_EAX_EXTENDED_IOAPIC_RTE; } const __initconst struct hypervisor_x86 x86_hyper_ms_hyperv = { .name = "Microsoft Hyper-V", .detect = ms_hyperv_platform, .type = X86_HYPER_MS_HYPERV, + .init.x2apic_available = ms_hyperv_x2apic_available, + .init.msi_ext_dest_id = ms_hyperv_msi_ext_dest_id, .init.init_platform = ms_hyperv_init_platform, }; diff --git a/arch/x86/kernel/cpu/mtrr/cleanup.c b/arch/x86/kernel/cpu/mtrr/cleanup.c index 5bd011737272..b5f43049fa5f 100644 --- a/arch/x86/kernel/cpu/mtrr/cleanup.c +++ b/arch/x86/kernel/cpu/mtrr/cleanup.c @@ -434,7 +434,7 @@ set_var_mtrr_range(struct var_mtrr_state *state, unsigned long base_pfn, state->range_sizek = sizek - second_sizek; } -/* Mininum size of mtrr block that can take hole: */ +/* Minimum size of mtrr block that can take hole: */ static u64 mtrr_chunk_size __initdata = (256ULL<<20); static int __init 
parse_mtrr_chunk_size_opt(char *p) @@ -537,9 +537,9 @@ static void __init print_out_mtrr_range_state(void) if (!size_base) continue; - size_base = to_size_factor(size_base, &size_factor), + size_base = to_size_factor(size_base, &size_factor); start_base = range_state[i].base_pfn << (PAGE_SHIFT - 10); - start_base = to_size_factor(start_base, &start_factor), + start_base = to_size_factor(start_base, &start_factor); type = range_state[i].type; pr_debug("reg %d, base: %ld%cB, range: %ld%cB, type %s\n", @@ -836,7 +836,7 @@ int __init amd_special_default_mtrr(void) if (boot_cpu_data.x86 < 0xf) return 0; /* In case some hypervisor doesn't pass SYSCFG through: */ - if (rdmsr_safe(MSR_K8_SYSCFG, &l, &h) < 0) + if (rdmsr_safe(MSR_AMD64_SYSCFG, &l, &h) < 0) return 0; /* * Memory between 4GB and top of mem is forced WB by this magic bit. diff --git a/arch/x86/kernel/cpu/mtrr/cyrix.c b/arch/x86/kernel/cpu/mtrr/cyrix.c index 72182809b333..ca670919b561 100644 --- a/arch/x86/kernel/cpu/mtrr/cyrix.c +++ b/arch/x86/kernel/cpu/mtrr/cyrix.c @@ -98,7 +98,7 @@ cyrix_get_free_region(unsigned long base, unsigned long size, int replace_reg) case 7: if (size < 0x40) break; - /* Else, fall through */ + fallthrough; case 6: case 5: case 4: diff --git a/arch/x86/kernel/cpu/mtrr/generic.c b/arch/x86/kernel/cpu/mtrr/generic.c index 51b9190c628b..558108296f3c 100644 --- a/arch/x86/kernel/cpu/mtrr/generic.c +++ b/arch/x86/kernel/cpu/mtrr/generic.c @@ -3,7 +3,6 @@ * This only handles 32bit MTRR on 32bit hosts. 
This is strictly wrong * because MTRRs can span up to 40 bits (36bits on most modern x86) */ -#define DEBUG #include <linux/export.h> #include <linux/init.h> @@ -54,13 +53,13 @@ static inline void k8_check_syscfg_dram_mod_en(void) (boot_cpu_data.x86 >= 0x0f))) return; - rdmsr(MSR_K8_SYSCFG, lo, hi); + rdmsr(MSR_AMD64_SYSCFG, lo, hi); if (lo & K8_MTRRFIXRANGE_DRAM_MODIFY) { pr_err(FW_WARN "MTRR: CPU %u: SYSCFG[MtrrFixDramModEn]" " not cleared by BIOS, clearing this bit\n", smp_processor_id()); lo &= ~K8_MTRRFIXRANGE_DRAM_MODIFY; - mtrr_wrmsr(MSR_K8_SYSCFG, lo, hi); + mtrr_wrmsr(MSR_AMD64_SYSCFG, lo, hi); } } @@ -167,9 +166,6 @@ static u8 mtrr_type_lookup_variable(u64 start, u64 end, u64 *partial_end, *repeat = 0; *uniform = 1; - /* Make end inclusive instead of exclusive */ - end--; - prev_match = MTRR_TYPE_INVALID; for (i = 0; i < num_var_ranges; ++i) { unsigned short start_state, end_state, inclusive; @@ -261,6 +257,9 @@ u8 mtrr_type_lookup(u64 start, u64 end, u8 *uniform) int repeat; u64 partial_end; + /* Make end inclusive instead of exclusive */ + end--; + if (!mtrr_state_set) return MTRR_TYPE_INVALID; @@ -761,7 +760,7 @@ static void prepare_set(void) __acquires(set_atomicity_lock) /* Flush all TLBs via a mov %cr3, %reg; mov %reg, %cr3 */ count_vm_tlb_event(NR_TLB_LOCAL_FLUSH_ALL); - __flush_tlb(); + flush_tlb_local(); /* Save MTRR state */ rdmsr(MSR_MTRRdefType, deftype_lo, deftype_hi); @@ -778,7 +777,7 @@ static void post_set(void) __releases(set_atomicity_lock) { /* Flush TLBs (no need to flush caches - they are disabled) */ count_vm_tlb_event(NR_TLB_LOCAL_FLUSH_ALL); - __flush_tlb(); + flush_tlb_local(); /* Intel (P6) standard MTRRs */ mtrr_wrmsr(MSR_MTRRdefType, deftype_lo, deftype_hi); diff --git a/arch/x86/kernel/cpu/mtrr/mtrr.c b/arch/x86/kernel/cpu/mtrr/mtrr.c index 6a80f36b5d59..2746cac9d8a9 100644 --- a/arch/x86/kernel/cpu/mtrr/mtrr.c +++ b/arch/x86/kernel/cpu/mtrr/mtrr.c @@ -31,8 +31,6 @@ System Programming Guide; Section 9.11. 
(1997 edition - PPro). */ -#define DEBUG - #include <linux/types.h> /* FIXME: kvm_para.h needs this */ #include <linux/stop_machine.h> @@ -338,7 +336,7 @@ int mtrr_add_page(unsigned long base, unsigned long size, replace = -1; /* No CPU hotplug when we change MTRR entries */ - get_online_cpus(); + cpus_read_lock(); /* Search for existing MTRR */ mutex_lock(&mtrr_mutex); @@ -400,7 +398,7 @@ int mtrr_add_page(unsigned long base, unsigned long size, error = i; out: mutex_unlock(&mtrr_mutex); - put_online_cpus(); + cpus_read_unlock(); return error; } @@ -487,7 +485,7 @@ int mtrr_del_page(int reg, unsigned long base, unsigned long size) max = num_var_ranges; /* No CPU hotplug when we change MTRR entries */ - get_online_cpus(); + cpus_read_lock(); mutex_lock(&mtrr_mutex); if (reg < 0) { /* Search for existing MTRR */ @@ -522,7 +520,7 @@ int mtrr_del_page(int reg, unsigned long base, unsigned long size) error = reg; out: mutex_unlock(&mtrr_mutex); - put_online_cpus(); + cpus_read_unlock(); return error; } @@ -794,8 +792,6 @@ void mtrr_ap_init(void) if (!use_intel() || mtrr_aps_delayed_init) return; - rcu_cpu_starting(smp_processor_id()); - /* * Ideally we should hold mtrr_mutex here to avoid mtrr entries * changed, but this routine will be called in cpu boot time, @@ -803,7 +799,7 @@ void mtrr_ap_init(void) * * This routine is called in two cases: * - * 1. very earily time of software resume, when there absolutely + * 1. very early time of software resume, when there absolutely * isn't mtrr entry changes; * * 2. cpu hotadd time. We let mtrr_add/del_page hold cpuhotplug @@ -813,7 +809,8 @@ void mtrr_ap_init(void) } /** - * Save current fixed-range MTRR state of the first cpu in cpu_online_mask. + * mtrr_save_state - Save current fixed-range MTRR state of the first + * cpu in cpu_online_mask. 
*/ void mtrr_save_state(void) { diff --git a/arch/x86/kernel/cpu/perfctr-watchdog.c b/arch/x86/kernel/cpu/perfctr-watchdog.c index 9556930cd8c1..7aecb2fc3186 100644 --- a/arch/x86/kernel/cpu/perfctr-watchdog.c +++ b/arch/x86/kernel/cpu/perfctr-watchdog.c @@ -3,7 +3,7 @@ * local apic based NMI watchdog for various CPUs. * * This file also handles reservation of performance counters for coordination - * with other users (like oprofile). + * with other users. * * Note that these events normally don't tick when the CPU idles. This means * the frequency varies with CPU load. @@ -63,6 +63,10 @@ static inline unsigned int nmi_perfctr_msr_to_bit(unsigned int msr) case 15: return msr - MSR_P4_BPU_PERFCTR0; } + break; + case X86_VENDOR_ZHAOXIN: + case X86_VENDOR_CENTAUR: + return msr - MSR_ARCH_PERFMON_PERFCTR0; } return 0; } @@ -92,20 +96,15 @@ static inline unsigned int nmi_evntsel_msr_to_bit(unsigned int msr) case 15: return msr - MSR_P4_BSU_ESCR0; } + break; + case X86_VENDOR_ZHAOXIN: + case X86_VENDOR_CENTAUR: + return msr - MSR_ARCH_PERFMON_EVENTSEL0; } return 0; } -/* checks for a bit availability (hack for oprofile) */ -int avail_to_resrv_perfctr_nmi_bit(unsigned int counter) -{ - BUG_ON(counter > NMI_MAX_COUNTER_BITS); - - return !test_bit(counter, perfctr_nmi_owner); -} -EXPORT_SYMBOL(avail_to_resrv_perfctr_nmi_bit); - int reserve_perfctr_nmi(unsigned int msr) { unsigned int counter; diff --git a/arch/x86/kernel/cpu/proc.c b/arch/x86/kernel/cpu/proc.c index 4eec8889b0ff..099b6f0d96bd 100644 --- a/arch/x86/kernel/cpu/proc.c +++ b/arch/x86/kernel/cpu/proc.c @@ -84,14 +84,9 @@ static int show_cpuinfo(struct seq_file *m, void *v) seq_printf(m, "microcode\t: 0x%x\n", c->microcode); if (cpu_has(c, X86_FEATURE_TSC)) { - unsigned int freq = aperfmperf_get_khz(cpu); - - if (!freq) - freq = cpufreq_quick_get(cpu); - if (!freq) - freq = cpu_khz; - seq_printf(m, "cpu MHz\t\t: %u.%03u\n", - freq / 1000, (freq % 1000)); + unsigned int freq = arch_freq_get_on_cpu(cpu); + + 
seq_printf(m, "cpu MHz\t\t: %u.%03u\n", freq / 1000, (freq % 1000)); } /* Cache size */ diff --git a/arch/x86/kernel/cpu/rdrand.c b/arch/x86/kernel/cpu/rdrand.c index c4be62058dd9..26a427fa84ea 100644 --- a/arch/x86/kernel/cpu/rdrand.c +++ b/arch/x86/kernel/cpu/rdrand.c @@ -11,56 +11,39 @@ #include <asm/archrandom.h> #include <asm/sections.h> -static int __init x86_rdrand_setup(char *s) -{ - setup_clear_cpu_cap(X86_FEATURE_RDRAND); - setup_clear_cpu_cap(X86_FEATURE_RDSEED); - return 1; -} -__setup("nordrand", x86_rdrand_setup); - /* * RDRAND has Built-In-Self-Test (BIST) that runs on every invocation. - * Run the instruction a few times as a sanity check. - * If it fails, it is simple to disable RDRAND here. + * Run the instruction a few times as a sanity check. Also make sure + * it's not outputting the same value over and over, which has happened + * as a result of past CPU bugs. + * + * If it fails, it is simple to disable RDRAND and RDSEED here. */ -#define SANITY_CHECK_LOOPS 8 -#ifdef CONFIG_ARCH_RANDOM void x86_init_rdrand(struct cpuinfo_x86 *c) { - unsigned int changed = 0; - unsigned long tmp, prev; - int i; + enum { SAMPLES = 8, MIN_CHANGE = 5 }; + unsigned long sample, prev; + bool failure = false; + size_t i, changed; if (!cpu_has(c, X86_FEATURE_RDRAND)) return; - for (i = 0; i < SANITY_CHECK_LOOPS; i++) { - if (!rdrand_long(&tmp)) { - clear_cpu_cap(c, X86_FEATURE_RDRAND); - pr_warn_once("rdrand: disabled\n"); - return; + for (changed = 0, i = 0; i < SAMPLES; ++i) { + if (!rdrand_long(&sample)) { + failure = true; + break; } + changed += i && sample != prev; + prev = sample; } + if (changed < MIN_CHANGE) + failure = true; - /* - * Stupid sanity-check whether RDRAND does *actually* generate - * some at least random-looking data. 
- */ - prev = tmp; - for (i = 0; i < SANITY_CHECK_LOOPS; i++) { - if (rdrand_long(&tmp)) { - if (prev != tmp) - changed++; - - prev = tmp; - } + if (failure) { + clear_cpu_cap(c, X86_FEATURE_RDRAND); + clear_cpu_cap(c, X86_FEATURE_RDSEED); + pr_emerg("RDRAND is not reliable on this platform; disabling.\n"); } - - if (WARN_ON_ONCE(!changed)) - pr_emerg( -"RDRAND gives funky smelling output, might consider not using it by booting with \"nordrand\""); - } -#endif diff --git a/arch/x86/kernel/cpu/resctrl/core.c b/arch/x86/kernel/cpu/resctrl/core.c index 89049b343c7a..3266ea36667c 100644 --- a/arch/x86/kernel/cpu/resctrl/core.c +++ b/arch/x86/kernel/cpu/resctrl/core.c @@ -22,7 +22,7 @@ #include <linux/cpuhotplug.h> #include <asm/intel-family.h> -#include <asm/resctrl_sched.h> +#include <asm/resctrl.h> #include "internal.h" /* Mutex to protect rdtgroup access. */ @@ -57,127 +57,51 @@ static void mba_wrmsr_amd(struct rdt_domain *d, struct msr_param *m, struct rdt_resource *r); -#define domain_init(id) LIST_HEAD_INIT(rdt_resources_all[id].domains) +#define domain_init(id) LIST_HEAD_INIT(rdt_resources_all[id].r_resctrl.domains) -struct rdt_resource rdt_resources_all[] = { +struct rdt_hw_resource rdt_resources_all[] = { [RDT_RESOURCE_L3] = { - .rid = RDT_RESOURCE_L3, - .name = "L3", - .domains = domain_init(RDT_RESOURCE_L3), - .msr_base = MSR_IA32_L3_CBM_BASE, - .msr_update = cat_wrmsr, - .cache_level = 3, - .cache = { - .min_cbm_bits = 1, - .cbm_idx_mult = 1, - .cbm_idx_offset = 0, + .r_resctrl = { + .rid = RDT_RESOURCE_L3, + .name = "L3", + .cache_level = 3, + .domains = domain_init(RDT_RESOURCE_L3), + .parse_ctrlval = parse_cbm, + .format_str = "%d=%0*x", + .fflags = RFTYPE_RES_CACHE, }, - .parse_ctrlval = parse_cbm, - .format_str = "%d=%0*x", - .fflags = RFTYPE_RES_CACHE, - }, - [RDT_RESOURCE_L3DATA] = - { - .rid = RDT_RESOURCE_L3DATA, - .name = "L3DATA", - .domains = domain_init(RDT_RESOURCE_L3DATA), .msr_base = MSR_IA32_L3_CBM_BASE, .msr_update = cat_wrmsr, - 
.cache_level = 3, - .cache = { - .min_cbm_bits = 1, - .cbm_idx_mult = 2, - .cbm_idx_offset = 0, - }, - .parse_ctrlval = parse_cbm, - .format_str = "%d=%0*x", - .fflags = RFTYPE_RES_CACHE, - }, - [RDT_RESOURCE_L3CODE] = - { - .rid = RDT_RESOURCE_L3CODE, - .name = "L3CODE", - .domains = domain_init(RDT_RESOURCE_L3CODE), - .msr_base = MSR_IA32_L3_CBM_BASE, - .msr_update = cat_wrmsr, - .cache_level = 3, - .cache = { - .min_cbm_bits = 1, - .cbm_idx_mult = 2, - .cbm_idx_offset = 1, - }, - .parse_ctrlval = parse_cbm, - .format_str = "%d=%0*x", - .fflags = RFTYPE_RES_CACHE, }, [RDT_RESOURCE_L2] = { - .rid = RDT_RESOURCE_L2, - .name = "L2", - .domains = domain_init(RDT_RESOURCE_L2), - .msr_base = MSR_IA32_L2_CBM_BASE, - .msr_update = cat_wrmsr, - .cache_level = 2, - .cache = { - .min_cbm_bits = 1, - .cbm_idx_mult = 1, - .cbm_idx_offset = 0, - }, - .parse_ctrlval = parse_cbm, - .format_str = "%d=%0*x", - .fflags = RFTYPE_RES_CACHE, - }, - [RDT_RESOURCE_L2DATA] = - { - .rid = RDT_RESOURCE_L2DATA, - .name = "L2DATA", - .domains = domain_init(RDT_RESOURCE_L2DATA), - .msr_base = MSR_IA32_L2_CBM_BASE, - .msr_update = cat_wrmsr, - .cache_level = 2, - .cache = { - .min_cbm_bits = 1, - .cbm_idx_mult = 2, - .cbm_idx_offset = 0, + .r_resctrl = { + .rid = RDT_RESOURCE_L2, + .name = "L2", + .cache_level = 2, + .domains = domain_init(RDT_RESOURCE_L2), + .parse_ctrlval = parse_cbm, + .format_str = "%d=%0*x", + .fflags = RFTYPE_RES_CACHE, }, - .parse_ctrlval = parse_cbm, - .format_str = "%d=%0*x", - .fflags = RFTYPE_RES_CACHE, - }, - [RDT_RESOURCE_L2CODE] = - { - .rid = RDT_RESOURCE_L2CODE, - .name = "L2CODE", - .domains = domain_init(RDT_RESOURCE_L2CODE), .msr_base = MSR_IA32_L2_CBM_BASE, .msr_update = cat_wrmsr, - .cache_level = 2, - .cache = { - .min_cbm_bits = 1, - .cbm_idx_mult = 2, - .cbm_idx_offset = 1, - }, - .parse_ctrlval = parse_cbm, - .format_str = "%d=%0*x", - .fflags = RFTYPE_RES_CACHE, }, [RDT_RESOURCE_MBA] = { - .rid = RDT_RESOURCE_MBA, - .name = "MB", - .domains = 
domain_init(RDT_RESOURCE_MBA), - .cache_level = 3, - .format_str = "%d=%*u", - .fflags = RFTYPE_RES_MB, + .r_resctrl = { + .rid = RDT_RESOURCE_MBA, + .name = "MB", + .cache_level = 3, + .domains = domain_init(RDT_RESOURCE_MBA), + .parse_ctrlval = parse_bw, + .format_str = "%d=%*u", + .fflags = RFTYPE_RES_MB, + }, }, }; -static unsigned int cbm_idx(struct rdt_resource *r, unsigned int closid) -{ - return closid * r->cache.cbm_idx_mult + r->cache.cbm_idx_offset; -} - /* * cache_alloc_hsw_probe() - Have to probe for Intel haswell server CPUs * as they do not have CPUID enumeration support for Cache allocation. @@ -191,14 +115,15 @@ static unsigned int cbm_idx(struct rdt_resource *r, unsigned int closid) * Intel(R) Xeon(R) CPU E5-2608L v3 @ 2.00GHz * Intel(R) Xeon(R) CPU E5-2658A v3 @ 2.20GHz * - * Probe by trying to write the first of the L3 cach mask registers + * Probe by trying to write the first of the L3 cache mask registers * and checking that the bits stick. Max CLOSids is always 4 and max cbm length * is always 20 on hsw server parts. The minimum cache bitmask length * allowed for HSW server is always 2 bits. Hardcode all of them. 
*/ static inline void cache_alloc_hsw_probe(void) { - struct rdt_resource *r = &rdt_resources_all[RDT_RESOURCE_L3]; + struct rdt_hw_resource *hw_res = &rdt_resources_all[RDT_RESOURCE_L3]; + struct rdt_resource *r = &hw_res->r_resctrl; u32 l, h, max_cbm = BIT_MASK(20) - 1; if (wrmsr_safe(MSR_IA32_L3_CBM_BASE, max_cbm, 0)) @@ -210,13 +135,12 @@ static inline void cache_alloc_hsw_probe(void) if (l != max_cbm) return; - r->num_closid = 4; + hw_res->num_closid = 4; r->default_ctrl = max_cbm; r->cache.cbm_len = 20; r->cache.shareable_bits = 0xc0000; r->cache.min_cbm_bits = 2; r->alloc_capable = true; - r->alloc_enabled = true; rdt_alloc_capable = true; } @@ -224,7 +148,7 @@ static inline void cache_alloc_hsw_probe(void) bool is_mba_sc(struct rdt_resource *r) { if (!r) - return rdt_resources_all[RDT_RESOURCE_MBA].membw.mba_sc; + return rdt_resources_all[RDT_RESOURCE_MBA].r_resctrl.membw.mba_sc; return r->membw.mba_sc; } @@ -252,120 +176,113 @@ static inline bool rdt_get_mb_table(struct rdt_resource *r) static bool __get_mem_config_intel(struct rdt_resource *r) { + struct rdt_hw_resource *hw_res = resctrl_to_arch_res(r); union cpuid_0x10_3_eax eax; union cpuid_0x10_x_edx edx; - u32 ebx, ecx; + u32 ebx, ecx, max_delay; cpuid_count(0x00000010, 3, &eax.full, &ebx, &ecx, &edx.full); - r->num_closid = edx.split.cos_max + 1; - r->membw.max_delay = eax.split.max_delay + 1; + hw_res->num_closid = edx.split.cos_max + 1; + max_delay = eax.split.max_delay + 1; r->default_ctrl = MAX_MBA_BW; + r->membw.arch_needs_linear = true; if (ecx & MBA_IS_LINEAR) { r->membw.delay_linear = true; - r->membw.min_bw = MAX_MBA_BW - r->membw.max_delay; - r->membw.bw_gran = MAX_MBA_BW - r->membw.max_delay; + r->membw.min_bw = MAX_MBA_BW - max_delay; + r->membw.bw_gran = MAX_MBA_BW - max_delay; } else { if (!rdt_get_mb_table(r)) return false; + r->membw.arch_needs_linear = false; } r->data_width = 3; + if (boot_cpu_has(X86_FEATURE_PER_THREAD_MBA)) + r->membw.throttle_mode = THREAD_THROTTLE_PER_THREAD; + 
else + r->membw.throttle_mode = THREAD_THROTTLE_MAX; + thread_throttle_mode_init(); + r->alloc_capable = true; - r->alloc_enabled = true; return true; } static bool __rdt_get_mem_config_amd(struct rdt_resource *r) { + struct rdt_hw_resource *hw_res = resctrl_to_arch_res(r); union cpuid_0x10_3_eax eax; union cpuid_0x10_x_edx edx; u32 ebx, ecx; cpuid_count(0x80000020, 1, &eax.full, &ebx, &ecx, &edx.full); - r->num_closid = edx.split.cos_max + 1; + hw_res->num_closid = edx.split.cos_max + 1; r->default_ctrl = MAX_MBA_BW_AMD; /* AMD does not use delay */ r->membw.delay_linear = false; + r->membw.arch_needs_linear = false; + /* + * AMD does not use memory delay throttle model to control + * the allocation like Intel does. + */ + r->membw.throttle_mode = THREAD_THROTTLE_UNDEFINED; r->membw.min_bw = 0; r->membw.bw_gran = 1; /* Max value is 2048, Data width should be 4 in decimal */ r->data_width = 4; r->alloc_capable = true; - r->alloc_enabled = true; return true; } static void rdt_get_cache_alloc_cfg(int idx, struct rdt_resource *r) { + struct rdt_hw_resource *hw_res = resctrl_to_arch_res(r); union cpuid_0x10_1_eax eax; union cpuid_0x10_x_edx edx; u32 ebx, ecx; cpuid_count(0x00000010, idx, &eax.full, &ebx, &ecx, &edx.full); - r->num_closid = edx.split.cos_max + 1; + hw_res->num_closid = edx.split.cos_max + 1; r->cache.cbm_len = eax.split.cbm_len + 1; r->default_ctrl = BIT_MASK(eax.split.cbm_len + 1) - 1; r->cache.shareable_bits = ebx & r->default_ctrl; r->data_width = (r->cache.cbm_len + 3) / 4; r->alloc_capable = true; - r->alloc_enabled = true; } -static void rdt_get_cdp_config(int level, int type) +static void rdt_get_cdp_config(int level) { - struct rdt_resource *r_l = &rdt_resources_all[level]; - struct rdt_resource *r = &rdt_resources_all[type]; - - r->num_closid = r_l->num_closid / 2; - r->cache.cbm_len = r_l->cache.cbm_len; - r->default_ctrl = r_l->default_ctrl; - r->cache.shareable_bits = r_l->cache.shareable_bits; - r->data_width = (r->cache.cbm_len + 3) / 4; - 
r->alloc_capable = true; /* * By default, CDP is disabled. CDP can be enabled by mount parameter * "cdp" during resctrl file system mount time. */ - r->alloc_enabled = false; + rdt_resources_all[level].cdp_enabled = false; + rdt_resources_all[level].r_resctrl.cdp_capable = true; } static void rdt_get_cdp_l3_config(void) { - rdt_get_cdp_config(RDT_RESOURCE_L3, RDT_RESOURCE_L3DATA); - rdt_get_cdp_config(RDT_RESOURCE_L3, RDT_RESOURCE_L3CODE); + rdt_get_cdp_config(RDT_RESOURCE_L3); } static void rdt_get_cdp_l2_config(void) { - rdt_get_cdp_config(RDT_RESOURCE_L2, RDT_RESOURCE_L2DATA); - rdt_get_cdp_config(RDT_RESOURCE_L2, RDT_RESOURCE_L2CODE); -} - -static int get_cache_id(int cpu, int level) -{ - struct cpu_cacheinfo *ci = get_cpu_cacheinfo(cpu); - int i; - - for (i = 0; i < ci->num_leaves; i++) { - if (ci->info_list[i].level == level) - return ci->info_list[i].id; - } - - return -1; + rdt_get_cdp_config(RDT_RESOURCE_L2); } static void mba_wrmsr_amd(struct rdt_domain *d, struct msr_param *m, struct rdt_resource *r) { unsigned int i; + struct rdt_hw_domain *hw_dom = resctrl_to_arch_dom(d); + struct rdt_hw_resource *hw_res = resctrl_to_arch_res(r); for (i = m->low; i < m->high; i++) - wrmsrl(r->msr_base + i, d->ctrl_val[i]); + wrmsrl(hw_res->msr_base + i, hw_dom->ctrl_val[i]); } /* @@ -373,7 +290,7 @@ mba_wrmsr_amd(struct rdt_domain *d, struct msr_param *m, struct rdt_resource *r) * that can be written to QOS_MSRs. * There are currently no SKUs which support non linear delay values. */ -u32 delay_bw_map(unsigned long bw, struct rdt_resource *r) +static u32 delay_bw_map(unsigned long bw, struct rdt_resource *r) { if (r->membw.delay_linear) return MAX_MBA_BW - bw; @@ -387,19 +304,23 @@ mba_wrmsr_intel(struct rdt_domain *d, struct msr_param *m, struct rdt_resource *r) { unsigned int i; + struct rdt_hw_domain *hw_dom = resctrl_to_arch_dom(d); + struct rdt_hw_resource *hw_res = resctrl_to_arch_res(r); /* Write the delay values for mba. 
*/ for (i = m->low; i < m->high; i++) - wrmsrl(r->msr_base + i, delay_bw_map(d->ctrl_val[i], r)); + wrmsrl(hw_res->msr_base + i, delay_bw_map(hw_dom->ctrl_val[i], r)); } static void cat_wrmsr(struct rdt_domain *d, struct msr_param *m, struct rdt_resource *r) { unsigned int i; + struct rdt_hw_domain *hw_dom = resctrl_to_arch_dom(d); + struct rdt_hw_resource *hw_res = resctrl_to_arch_res(r); for (i = m->low; i < m->high; i++) - wrmsrl(r->msr_base + cbm_idx(r, i), d->ctrl_val[i]); + wrmsrl(hw_res->msr_base + i, hw_dom->ctrl_val[i]); } struct rdt_domain *get_domain_from_cpu(int cpu, struct rdt_resource *r) @@ -415,16 +336,22 @@ struct rdt_domain *get_domain_from_cpu(int cpu, struct rdt_resource *r) return NULL; } +u32 resctrl_arch_get_num_closid(struct rdt_resource *r) +{ + return resctrl_to_arch_res(r)->num_closid; +} + void rdt_ctrl_update(void *arg) { struct msr_param *m = arg; + struct rdt_hw_resource *hw_res = resctrl_to_arch_res(m->res); struct rdt_resource *r = m->res; int cpu = smp_processor_id(); struct rdt_domain *d; d = get_domain_from_cpu(cpu, r); if (d) { - r->msr_update(d, m, r); + hw_res->msr_update(d, m, r); return; } pr_warn_once("cpu %d not found in any domain for resource %s\n", @@ -464,80 +391,74 @@ struct rdt_domain *rdt_find_domain(struct rdt_resource *r, int id, return NULL; } -void setup_default_ctrlval(struct rdt_resource *r, u32 *dc, u32 *dm) +static void setup_default_ctrlval(struct rdt_resource *r, u32 *dc) { + struct rdt_hw_resource *hw_res = resctrl_to_arch_res(r); int i; /* * Initialize the Control MSRs to having no control. 
* For Cache Allocation: Set all bits in cbm * For Memory Allocation: Set b/w requested to 100% - * and the bandwidth in MBps to U32_MAX */ - for (i = 0; i < r->num_closid; i++, dc++, dm++) { + for (i = 0; i < hw_res->num_closid; i++, dc++) *dc = r->default_ctrl; - *dm = MBA_MAX_MBPS; - } +} + +static void domain_free(struct rdt_hw_domain *hw_dom) +{ + kfree(hw_dom->arch_mbm_total); + kfree(hw_dom->arch_mbm_local); + kfree(hw_dom->ctrl_val); + kfree(hw_dom); } static int domain_setup_ctrlval(struct rdt_resource *r, struct rdt_domain *d) { + struct rdt_hw_resource *hw_res = resctrl_to_arch_res(r); + struct rdt_hw_domain *hw_dom = resctrl_to_arch_dom(d); struct msr_param m; - u32 *dc, *dm; + u32 *dc; - dc = kmalloc_array(r->num_closid, sizeof(*d->ctrl_val), GFP_KERNEL); + dc = kmalloc_array(hw_res->num_closid, sizeof(*hw_dom->ctrl_val), + GFP_KERNEL); if (!dc) return -ENOMEM; - dm = kmalloc_array(r->num_closid, sizeof(*d->mbps_val), GFP_KERNEL); - if (!dm) { - kfree(dc); - return -ENOMEM; - } - - d->ctrl_val = dc; - d->mbps_val = dm; - setup_default_ctrlval(r, dc, dm); + hw_dom->ctrl_val = dc; + setup_default_ctrlval(r, dc); m.low = 0; - m.high = r->num_closid; - r->msr_update(d, &m, r); + m.high = hw_res->num_closid; + hw_res->msr_update(d, &m, r); return 0; } -static int domain_setup_mon_state(struct rdt_resource *r, struct rdt_domain *d) +/** + * arch_domain_mbm_alloc() - Allocate arch private storage for the MBM counters + * @num_rmid: The size of the MBM counter array + * @hw_dom: The domain that owns the allocated arrays + */ +static int arch_domain_mbm_alloc(u32 num_rmid, struct rdt_hw_domain *hw_dom) { size_t tsize; - if (is_llc_occupancy_enabled()) { - d->rmid_busy_llc = bitmap_zalloc(r->num_rmid, GFP_KERNEL); - if (!d->rmid_busy_llc) - return -ENOMEM; - INIT_DELAYED_WORK(&d->cqm_limbo, cqm_handle_limbo); - } if (is_mbm_total_enabled()) { - tsize = sizeof(*d->mbm_total); - d->mbm_total = kcalloc(r->num_rmid, tsize, GFP_KERNEL); - if (!d->mbm_total) { - 
bitmap_free(d->rmid_busy_llc); + tsize = sizeof(*hw_dom->arch_mbm_total); + hw_dom->arch_mbm_total = kcalloc(num_rmid, tsize, GFP_KERNEL); + if (!hw_dom->arch_mbm_total) return -ENOMEM; - } } if (is_mbm_local_enabled()) { - tsize = sizeof(*d->mbm_local); - d->mbm_local = kcalloc(r->num_rmid, tsize, GFP_KERNEL); - if (!d->mbm_local) { - bitmap_free(d->rmid_busy_llc); - kfree(d->mbm_total); + tsize = sizeof(*hw_dom->arch_mbm_local); + hw_dom->arch_mbm_local = kcalloc(num_rmid, tsize, GFP_KERNEL); + if (!hw_dom->arch_mbm_local) { + kfree(hw_dom->arch_mbm_total); + hw_dom->arch_mbm_total = NULL; return -ENOMEM; } } - if (is_mbm_enabled()) { - INIT_DELAYED_WORK(&d->mbm_over, mbm_handle_overflow); - mbm_setup_overflow_handler(d, MBM_OVERFLOW_INTERVAL); - } - return 0; } @@ -556,82 +477,71 @@ static int domain_setup_mon_state(struct rdt_resource *r, struct rdt_domain *d) */ static void domain_add_cpu(int cpu, struct rdt_resource *r) { - int id = get_cache_id(cpu, r->cache_level); + int id = get_cpu_cacheinfo_id(cpu, r->cache_level); struct list_head *add_pos = NULL; + struct rdt_hw_domain *hw_dom; struct rdt_domain *d; + int err; d = rdt_find_domain(r, id, &add_pos); if (IS_ERR(d)) { - pr_warn("Could't find cache id for cpu %d\n", cpu); + pr_warn("Couldn't find cache id for CPU %d\n", cpu); return; } if (d) { cpumask_set_cpu(cpu, &d->cpu_mask); + if (r->cache.arch_has_per_cpu_cfg) + rdt_domain_reconfigure_cdp(r); return; } - d = kzalloc_node(sizeof(*d), GFP_KERNEL, cpu_to_node(cpu)); - if (!d) + hw_dom = kzalloc_node(sizeof(*hw_dom), GFP_KERNEL, cpu_to_node(cpu)); + if (!hw_dom) return; + d = &hw_dom->d_resctrl; d->id = id; cpumask_set_cpu(cpu, &d->cpu_mask); + rdt_domain_reconfigure_cdp(r); + if (r->alloc_capable && domain_setup_ctrlval(r, d)) { - kfree(d); + domain_free(hw_dom); return; } - if (r->mon_capable && domain_setup_mon_state(r, d)) { - kfree(d); + if (r->mon_capable && arch_domain_mbm_alloc(r->num_rmid, hw_dom)) { + domain_free(hw_dom); return; } 
list_add_tail(&d->list, add_pos); - /* - * If resctrl is mounted, add - * per domain monitor data directories. - */ - if (static_branch_unlikely(&rdt_mon_enable_key)) - mkdir_mondata_subdir_allrdtgrp(r, d); + err = resctrl_online_domain(r, d); + if (err) { + list_del(&d->list); + domain_free(hw_dom); + } } static void domain_remove_cpu(int cpu, struct rdt_resource *r) { - int id = get_cache_id(cpu, r->cache_level); + int id = get_cpu_cacheinfo_id(cpu, r->cache_level); + struct rdt_hw_domain *hw_dom; struct rdt_domain *d; d = rdt_find_domain(r, id, NULL); if (IS_ERR_OR_NULL(d)) { - pr_warn("Could't find cache id for cpu %d\n", cpu); + pr_warn("Couldn't find cache id for CPU %d\n", cpu); return; } + hw_dom = resctrl_to_arch_dom(d); cpumask_clear_cpu(cpu, &d->cpu_mask); if (cpumask_empty(&d->cpu_mask)) { - /* - * If resctrl is mounted, remove all the - * per domain monitor data directories. - */ - if (static_branch_unlikely(&rdt_mon_enable_key)) - rmdir_mondata_subdir_allrdtgrp(r, d->id); + resctrl_offline_domain(r, d); list_del(&d->list); - if (r->mon_capable && is_mbm_enabled()) - cancel_delayed_work(&d->mbm_over); - if (is_llc_occupancy_enabled() && has_busy_rmid(r, d)) { - /* - * When a package is going down, forcefully - * decrement rmid->ebusy. There is no way to know - * that the L3 was flushed and hence may lead to - * incorrect counts in rare scenarios, but leaving - * the RMID as busy creates RMID leaks if the - * package never comes back. 
- */ - __check_limbo(d, true); - cancel_delayed_work(&d->cqm_limbo); - } /* * rdt_domain "d" is going to be freed below, so clear @@ -639,17 +549,12 @@ static void domain_remove_cpu(int cpu, struct rdt_resource *r) */ if (d->plr) d->plr->d = NULL; + domain_free(hw_dom); - kfree(d->ctrl_val); - kfree(d->mbps_val); - bitmap_free(d->rmid_busy_llc); - kfree(d->mbm_total); - kfree(d->mbm_local); - kfree(d); return; } - if (r == &rdt_resources_all[RDT_RESOURCE_L3]) { + if (r == &rdt_resources_all[RDT_RESOURCE_L3].r_resctrl) { if (is_mbm_enabled() && cpu == d->mbm_work_cpu) { cancel_delayed_work(&d->mbm_over); mbm_setup_overflow_handler(d, 0); @@ -726,13 +631,8 @@ static int resctrl_offline_cpu(unsigned int cpu) static __init void rdt_init_padding(void) { struct rdt_resource *r; - int cl; for_each_alloc_capable_rdt_resource(r) { - cl = strlen(r->name); - if (cl > max_name_width) - max_name_width = cl; - if (r->data_width > max_data_width) max_data_width = r->data_width; } @@ -821,19 +721,22 @@ static bool __init rdt_cpu_has(int flag) static __init bool get_mem_config(void) { + struct rdt_hw_resource *hw_res = &rdt_resources_all[RDT_RESOURCE_MBA]; + if (!rdt_cpu_has(X86_FEATURE_MBA)) return false; if (boot_cpu_data.x86_vendor == X86_VENDOR_INTEL) - return __get_mem_config_intel(&rdt_resources_all[RDT_RESOURCE_MBA]); + return __get_mem_config_intel(&hw_res->r_resctrl); else if (boot_cpu_data.x86_vendor == X86_VENDOR_AMD) - return __rdt_get_mem_config_amd(&rdt_resources_all[RDT_RESOURCE_MBA]); + return __rdt_get_mem_config_amd(&hw_res->r_resctrl); return false; } static __init bool get_rdt_alloc_resources(void) { + struct rdt_resource *r; bool ret = false; if (rdt_alloc_capable) @@ -843,14 +746,16 @@ static __init bool get_rdt_alloc_resources(void) return false; if (rdt_cpu_has(X86_FEATURE_CAT_L3)) { - rdt_get_cache_alloc_cfg(1, &rdt_resources_all[RDT_RESOURCE_L3]); + r = &rdt_resources_all[RDT_RESOURCE_L3].r_resctrl; + rdt_get_cache_alloc_cfg(1, r); if 
(rdt_cpu_has(X86_FEATURE_CDP_L3)) rdt_get_cdp_l3_config(); ret = true; } if (rdt_cpu_has(X86_FEATURE_CAT_L2)) { /* CPUID 0x10.2 fields are same format at 0x10.1 */ - rdt_get_cache_alloc_cfg(2, &rdt_resources_all[RDT_RESOURCE_L2]); + r = &rdt_resources_all[RDT_RESOURCE_L2].r_resctrl; + rdt_get_cache_alloc_cfg(2, r); if (rdt_cpu_has(X86_FEATURE_CDP_L2)) rdt_get_cdp_l2_config(); ret = true; @@ -864,6 +769,8 @@ static __init bool get_rdt_alloc_resources(void) static __init bool get_rdt_mon_resources(void) { + struct rdt_resource *r = &rdt_resources_all[RDT_RESOURCE_L3].r_resctrl; + if (rdt_cpu_has(X86_FEATURE_CQM_OCCUP_LLC)) rdt_mon_features |= (1 << QOS_L3_OCCUP_EVENT_ID); if (rdt_cpu_has(X86_FEATURE_CQM_MBM_TOTAL)) @@ -874,7 +781,7 @@ static __init bool get_rdt_mon_resources(void) if (!rdt_mon_features) return false; - return !rdt_get_mon_l3_config(&rdt_resources_all[RDT_RESOURCE_L3]); + return !rdt_get_mon_l3_config(r); } static __init void __check_quirks_intel(void) @@ -889,6 +796,10 @@ static __init void __check_quirks_intel(void) set_rdt_options("!cmt,!mbmtotal,!mbmlocal,!l3cat"); else set_rdt_options("!l3cat"); + fallthrough; + case INTEL_FAM6_BROADWELL_X: + intel_rdt_mbm_apply_quirk(); + break; } } @@ -908,40 +819,42 @@ static __init bool get_rdt_resources(void) static __init void rdt_init_res_defs_intel(void) { + struct rdt_hw_resource *hw_res; struct rdt_resource *r; for_each_rdt_resource(r) { + hw_res = resctrl_to_arch_res(r); + if (r->rid == RDT_RESOURCE_L3 || - r->rid == RDT_RESOURCE_L3DATA || - r->rid == RDT_RESOURCE_L3CODE || - r->rid == RDT_RESOURCE_L2 || - r->rid == RDT_RESOURCE_L2DATA || - r->rid == RDT_RESOURCE_L2CODE) - r->cbm_validate = cbm_validate_intel; - else if (r->rid == RDT_RESOURCE_MBA) { - r->msr_base = MSR_IA32_MBA_THRTL_BASE; - r->msr_update = mba_wrmsr_intel; - r->parse_ctrlval = parse_bw_intel; + r->rid == RDT_RESOURCE_L2) { + r->cache.arch_has_sparse_bitmaps = false; + r->cache.arch_has_empty_bitmaps = false; + 
r->cache.arch_has_per_cpu_cfg = false; + r->cache.min_cbm_bits = 1; + } else if (r->rid == RDT_RESOURCE_MBA) { + hw_res->msr_base = MSR_IA32_MBA_THRTL_BASE; + hw_res->msr_update = mba_wrmsr_intel; } } } static __init void rdt_init_res_defs_amd(void) { + struct rdt_hw_resource *hw_res; struct rdt_resource *r; for_each_rdt_resource(r) { + hw_res = resctrl_to_arch_res(r); + if (r->rid == RDT_RESOURCE_L3 || - r->rid == RDT_RESOURCE_L3DATA || - r->rid == RDT_RESOURCE_L3CODE || - r->rid == RDT_RESOURCE_L2 || - r->rid == RDT_RESOURCE_L2DATA || - r->rid == RDT_RESOURCE_L2CODE) - r->cbm_validate = cbm_validate_amd; - else if (r->rid == RDT_RESOURCE_MBA) { - r->msr_base = MSR_IA32_MBA_BW_BASE; - r->msr_update = mba_wrmsr_amd; - r->parse_ctrlval = parse_bw_amd; + r->rid == RDT_RESOURCE_L2) { + r->cache.arch_has_sparse_bitmaps = true; + r->cache.arch_has_empty_bitmaps = true; + r->cache.arch_has_per_cpu_cfg = true; + r->cache.min_cbm_bits = 0; + } else if (r->rid == RDT_RESOURCE_MBA) { + hw_res->msr_base = MSR_IA32_MBA_BW_BASE; + hw_res->msr_update = mba_wrmsr_amd; } } } @@ -956,6 +869,36 @@ static __init void rdt_init_res_defs(void) static enum cpuhp_state rdt_online; +/* Runs once on the BSP during boot. 
*/ +void resctrl_cpu_detect(struct cpuinfo_x86 *c) +{ + if (!cpu_has(c, X86_FEATURE_CQM_LLC)) { + c->x86_cache_max_rmid = -1; + c->x86_cache_occ_scale = -1; + c->x86_cache_mbm_width_offset = -1; + return; + } + + /* will be overridden if occupancy monitoring exists */ + c->x86_cache_max_rmid = cpuid_ebx(0xf); + + if (cpu_has(c, X86_FEATURE_CQM_OCCUP_LLC) || + cpu_has(c, X86_FEATURE_CQM_MBM_TOTAL) || + cpu_has(c, X86_FEATURE_CQM_MBM_LOCAL)) { + u32 eax, ebx, ecx, edx; + + /* QoS sub-leaf, EAX=0Fh, ECX=1 */ + cpuid_count(0xf, 1, &eax, &ebx, &ecx, &edx); + + c->x86_cache_max_rmid = ecx; + c->x86_cache_occ_scale = ebx; + c->x86_cache_mbm_width_offset = eax & 0xff; + + if (c->x86_vendor == X86_VENDOR_AMD && !c->x86_cache_mbm_width_offset) + c->x86_cache_mbm_width_offset = MBM_CNTR_WIDTH_OFFSET_AMD; + } +} + static int __init resctrl_late_init(void) { struct rdt_resource *r; diff --git a/arch/x86/kernel/cpu/resctrl/ctrlmondata.c b/arch/x86/kernel/cpu/resctrl/ctrlmondata.c index 055c8613b531..1dafbdc5ac31 100644 --- a/arch/x86/kernel/cpu/resctrl/ctrlmondata.c +++ b/arch/x86/kernel/cpu/resctrl/ctrlmondata.c @@ -23,53 +23,6 @@ /* * Check whether MBA bandwidth percentage value is correct. The value is - * checked against the minimum and maximum bandwidth values specified by - * the hardware. The allocated bandwidth percentage is rounded to the next - * control step available on the hardware. 
- */ -static bool bw_validate_amd(char *buf, unsigned long *data, - struct rdt_resource *r) -{ - unsigned long bw; - int ret; - - ret = kstrtoul(buf, 10, &bw); - if (ret) { - rdt_last_cmd_printf("Non-decimal digit in MB value %s\n", buf); - return false; - } - - if (bw < r->membw.min_bw || bw > r->default_ctrl) { - rdt_last_cmd_printf("MB value %ld out of range [%d,%d]\n", bw, - r->membw.min_bw, r->default_ctrl); - return false; - } - - *data = roundup(bw, (unsigned long)r->membw.bw_gran); - return true; -} - -int parse_bw_amd(struct rdt_parse_data *data, struct rdt_resource *r, - struct rdt_domain *d) -{ - unsigned long bw_val; - - if (d->have_new_ctrl) { - rdt_last_cmd_printf("Duplicate domain %d\n", d->id); - return -EINVAL; - } - - if (!bw_validate_amd(data->buf, &bw_val, r)) - return -EINVAL; - - d->new_ctrl = bw_val; - d->have_new_ctrl = true; - - return 0; -} - -/* - * Check whether MBA bandwidth percentage value is correct. The value is * checked against the minimum and max bandwidth values specified by the * hardware. The allocated bandwidth percentage is rounded to the next * control step available on the hardware. @@ -82,7 +35,7 @@ static bool bw_validate(char *buf, unsigned long *data, struct rdt_resource *r) /* * Only linear delay values is supported for current Intel SKUs. 
*/ - if (!r->membw.delay_linear) { + if (!r->membw.delay_linear && r->membw.arch_needs_linear) { rdt_last_cmd_puts("No support for non-linear MB domains\n"); return false; } @@ -104,31 +57,43 @@ static bool bw_validate(char *buf, unsigned long *data, struct rdt_resource *r) return true; } -int parse_bw_intel(struct rdt_parse_data *data, struct rdt_resource *r, - struct rdt_domain *d) +int parse_bw(struct rdt_parse_data *data, struct resctrl_schema *s, + struct rdt_domain *d) { + struct resctrl_staged_config *cfg; + u32 closid = data->rdtgrp->closid; + struct rdt_resource *r = s->res; unsigned long bw_val; - if (d->have_new_ctrl) { + cfg = &d->staged_config[s->conf_type]; + if (cfg->have_new_ctrl) { rdt_last_cmd_printf("Duplicate domain %d\n", d->id); return -EINVAL; } if (!bw_validate(data->buf, &bw_val, r)) return -EINVAL; - d->new_ctrl = bw_val; - d->have_new_ctrl = true; + + if (is_mba_sc(r)) { + d->mbps_val[closid] = bw_val; + return 0; + } + + cfg->new_ctrl = bw_val; + cfg->have_new_ctrl = true; return 0; } /* - * Check whether a cache bit mask is valid. The SDM says: + * Check whether a cache bit mask is valid. + * For Intel the SDM says: * Please note that all (and only) contiguous '1' combinations * are allowed (e.g. FFFFH, 0FF0H, 003CH, etc.). * Additionally Haswell requires at least two bits set. + * AMD allows non-contiguous bitmasks. 
*/ -bool cbm_validate_intel(char *buf, u32 *data, struct rdt_resource *r) +static bool cbm_validate(char *buf, u32 *data, struct rdt_resource *r) { unsigned long first_bit, zero_bit, val; unsigned int cbm_len = r->cache.cbm_len; @@ -140,7 +105,8 @@ bool cbm_validate_intel(char *buf, u32 *data, struct rdt_resource *r) return false; } - if (val == 0 || val > r->default_ctrl) { + if ((!r->cache.arch_has_empty_bitmaps && val == 0) || + val > r->default_ctrl) { rdt_last_cmd_puts("Mask out of range\n"); return false; } @@ -148,7 +114,9 @@ bool cbm_validate_intel(char *buf, u32 *data, struct rdt_resource *r) first_bit = find_first_bit(&val, cbm_len); zero_bit = find_next_zero_bit(&val, cbm_len, first_bit); - if (find_next_bit(&val, cbm_len, zero_bit) < cbm_len) { + /* Are non-contiguous bitmaps allowed? */ + if (!r->cache.arch_has_sparse_bitmaps && + (find_next_bit(&val, cbm_len, zero_bit) < cbm_len)) { rdt_last_cmd_printf("The mask %lx has non-consecutive 1-bits\n", val); return false; } @@ -164,40 +132,19 @@ bool cbm_validate_intel(char *buf, u32 *data, struct rdt_resource *r) } /* - * Check whether a cache bit mask is valid. AMD allows non-contiguous - * bitmasks - */ -bool cbm_validate_amd(char *buf, u32 *data, struct rdt_resource *r) -{ - unsigned long val; - int ret; - - ret = kstrtoul(buf, 16, &val); - if (ret) { - rdt_last_cmd_printf("Non-hex character in the mask %s\n", buf); - return false; - } - - if (val > r->default_ctrl) { - rdt_last_cmd_puts("Mask out of range\n"); - return false; - } - - *data = val; - return true; -} - -/* * Read one cache bit mask (hex). Check that it is valid for the current * resource type. 
*/ -int parse_cbm(struct rdt_parse_data *data, struct rdt_resource *r, +int parse_cbm(struct rdt_parse_data *data, struct resctrl_schema *s, struct rdt_domain *d) { struct rdtgroup *rdtgrp = data->rdtgrp; + struct resctrl_staged_config *cfg; + struct rdt_resource *r = s->res; u32 cbm_val; - if (d->have_new_ctrl) { + cfg = &d->staged_config[s->conf_type]; + if (cfg->have_new_ctrl) { rdt_last_cmd_printf("Duplicate domain %d\n", d->id); return -EINVAL; } @@ -212,7 +159,7 @@ int parse_cbm(struct rdt_parse_data *data, struct rdt_resource *r, return -EINVAL; } - if (!r->cbm_validate(data->buf, &cbm_val, r)) + if (!cbm_validate(data->buf, &cbm_val, r)) return -EINVAL; if ((rdtgrp->mode == RDT_MODE_EXCLUSIVE || @@ -226,12 +173,12 @@ int parse_cbm(struct rdt_parse_data *data, struct rdt_resource *r, * The CBM may not overlap with the CBM of another closid if * either is exclusive. */ - if (rdtgroup_cbm_overlaps(r, d, cbm_val, rdtgrp->closid, true)) { + if (rdtgroup_cbm_overlaps(s, d, cbm_val, rdtgrp->closid, true)) { rdt_last_cmd_puts("Overlaps with exclusive group\n"); return -EINVAL; } - if (rdtgroup_cbm_overlaps(r, d, cbm_val, rdtgrp->closid, false)) { + if (rdtgroup_cbm_overlaps(s, d, cbm_val, rdtgrp->closid, false)) { if (rdtgrp->mode == RDT_MODE_EXCLUSIVE || rdtgrp->mode == RDT_MODE_PSEUDO_LOCKSETUP) { rdt_last_cmd_puts("Overlaps with other group\n"); @@ -239,8 +186,8 @@ int parse_cbm(struct rdt_parse_data *data, struct rdt_resource *r, } } - d->new_ctrl = cbm_val; - d->have_new_ctrl = true; + cfg->new_ctrl = cbm_val; + cfg->have_new_ctrl = true; return 0; } @@ -251,9 +198,12 @@ int parse_cbm(struct rdt_parse_data *data, struct rdt_resource *r, * separated by ";". The "id" is in decimal, and must match one of * the "id"s for this resource. 
*/ -static int parse_line(char *line, struct rdt_resource *r, +static int parse_line(char *line, struct resctrl_schema *s, struct rdtgroup *rdtgrp) { + enum resctrl_conf_type t = s->conf_type; + struct resctrl_staged_config *cfg; + struct rdt_resource *r = s->res; struct rdt_parse_data data; char *dom = NULL, *id; struct rdt_domain *d; @@ -279,9 +229,10 @@ next: if (d->id == dom_id) { data.buf = dom; data.rdtgrp = rdtgrp; - if (r->parse_ctrlval(&data, r, d)) + if (r->parse_ctrlval(&data, s, d)) return -EINVAL; if (rdtgrp->mode == RDT_MODE_PSEUDO_LOCKSETUP) { + cfg = &d->staged_config[t]; /* * In pseudo-locking setup mode and just * parsed a valid CBM that should be @@ -290,9 +241,9 @@ next: * the required initialization for single * region and return. */ - rdtgrp->plr->r = r; + rdtgrp->plr->s = s; rdtgrp->plr->d = d; - rdtgrp->plr->cbm = d->new_ctrl; + rdtgrp->plr->cbm = cfg->new_ctrl; d->plr = rdtgrp->plr; return 0; } @@ -302,36 +253,94 @@ next: return -EINVAL; } -int update_domains(struct rdt_resource *r, int closid) +static u32 get_config_index(u32 closid, enum resctrl_conf_type type) +{ + switch (type) { + default: + case CDP_NONE: + return closid; + case CDP_CODE: + return closid * 2 + 1; + case CDP_DATA: + return closid * 2; + } +} + +static bool apply_config(struct rdt_hw_domain *hw_dom, + struct resctrl_staged_config *cfg, u32 idx, + cpumask_var_t cpu_mask) { + struct rdt_domain *dom = &hw_dom->d_resctrl; + + if (cfg->new_ctrl != hw_dom->ctrl_val[idx]) { + cpumask_set_cpu(cpumask_any(&dom->cpu_mask), cpu_mask); + hw_dom->ctrl_val[idx] = cfg->new_ctrl; + + return true; + } + + return false; +} + +int resctrl_arch_update_one(struct rdt_resource *r, struct rdt_domain *d, + u32 closid, enum resctrl_conf_type t, u32 cfg_val) +{ + struct rdt_hw_resource *hw_res = resctrl_to_arch_res(r); + struct rdt_hw_domain *hw_dom = resctrl_to_arch_dom(d); + u32 idx = get_config_index(closid, t); struct msr_param msr_param; + + if (!cpumask_test_cpu(smp_processor_id(), 
&d->cpu_mask)) + return -EINVAL; + + hw_dom->ctrl_val[idx] = cfg_val; + + msr_param.res = r; + msr_param.low = idx; + msr_param.high = idx + 1; + hw_res->msr_update(d, &msr_param, r); + + return 0; +} + +int resctrl_arch_update_domains(struct rdt_resource *r, u32 closid) +{ + struct resctrl_staged_config *cfg; + struct rdt_hw_domain *hw_dom; + struct msr_param msr_param; + enum resctrl_conf_type t; cpumask_var_t cpu_mask; struct rdt_domain *d; - bool mba_sc; - u32 *dc; int cpu; + u32 idx; if (!zalloc_cpumask_var(&cpu_mask, GFP_KERNEL)) return -ENOMEM; - msr_param.low = closid; - msr_param.high = msr_param.low + 1; - msr_param.res = r; - - mba_sc = is_mba_sc(r); + msr_param.res = NULL; list_for_each_entry(d, &r->domains, list) { - dc = !mba_sc ? d->ctrl_val : d->mbps_val; - if (d->have_new_ctrl && d->new_ctrl != dc[closid]) { - cpumask_set_cpu(cpumask_any(&d->cpu_mask), cpu_mask); - dc[closid] = d->new_ctrl; + hw_dom = resctrl_to_arch_dom(d); + for (t = 0; t < CDP_NUM_TYPES; t++) { + cfg = &hw_dom->d_resctrl.staged_config[t]; + if (!cfg->have_new_ctrl) + continue; + + idx = get_config_index(closid, t); + if (!apply_config(hw_dom, cfg, idx, cpu_mask)) + continue; + + if (!msr_param.res) { + msr_param.low = idx; + msr_param.high = msr_param.low + 1; + msr_param.res = r; + } else { + msr_param.low = min(msr_param.low, idx); + msr_param.high = max(msr_param.high, idx + 1); + } } } - /* - * Avoid writing the control msr with control values when - * MBA software controller is enabled - */ - if (cpumask_empty(cpu_mask) || mba_sc) + if (cpumask_empty(cpu_mask)) goto done; cpu = get_cpu(); /* Update resource control msr on this CPU if it's in cpu_mask. 
*/ @@ -350,11 +359,11 @@ done: static int rdtgroup_parse_resource(char *resname, char *tok, struct rdtgroup *rdtgrp) { - struct rdt_resource *r; + struct resctrl_schema *s; - for_each_alloc_enabled_rdt_resource(r) { - if (!strcmp(resname, r->name) && rdtgrp->closid < r->num_closid) - return parse_line(tok, r, rdtgrp); + list_for_each_entry(s, &resctrl_schema_all, list) { + if (!strcmp(resname, s->name) && rdtgrp->closid < s->num_closid) + return parse_line(tok, s, rdtgrp); } rdt_last_cmd_printf("Unknown or unsupported resource name '%s'\n", resname); return -EINVAL; @@ -363,6 +372,7 @@ static int rdtgroup_parse_resource(char *resname, char *tok, ssize_t rdtgroup_schemata_write(struct kernfs_open_file *of, char *buf, size_t nbytes, loff_t off) { + struct resctrl_schema *s; struct rdtgroup *rdtgrp; struct rdt_domain *dom; struct rdt_resource *r; @@ -393,9 +403,9 @@ ssize_t rdtgroup_schemata_write(struct kernfs_open_file *of, goto out; } - for_each_alloc_enabled_rdt_resource(r) { - list_for_each_entry(dom, &r->domains, list) - dom->have_new_ctrl = false; + list_for_each_entry(s, &resctrl_schema_all, list) { + list_for_each_entry(dom, &s->res->domains, list) + memset(dom->staged_config, 0, sizeof(dom->staged_config)); } while ((tok = strsep(&buf, "\n")) != NULL) { @@ -415,8 +425,17 @@ ssize_t rdtgroup_schemata_write(struct kernfs_open_file *of, goto out; } - for_each_alloc_enabled_rdt_resource(r) { - ret = update_domains(r, rdtgrp->closid); + list_for_each_entry(s, &resctrl_schema_all, list) { + r = s->res; + + /* + * Writes to mba_sc resources update the software controller, + * not the control MSR. 
+ */ + if (is_mba_sc(r)) + continue; + + ret = resctrl_arch_update_domains(r, rdtgrp->closid); if (ret) goto out; } @@ -437,19 +456,33 @@ out: return ret ?: nbytes; } -static void show_doms(struct seq_file *s, struct rdt_resource *r, int closid) +u32 resctrl_arch_get_config(struct rdt_resource *r, struct rdt_domain *d, + u32 closid, enum resctrl_conf_type type) +{ + struct rdt_hw_domain *hw_dom = resctrl_to_arch_dom(d); + u32 idx = get_config_index(closid, type); + + return hw_dom->ctrl_val[idx]; +} + +static void show_doms(struct seq_file *s, struct resctrl_schema *schema, int closid) { + struct rdt_resource *r = schema->res; struct rdt_domain *dom; bool sep = false; u32 ctrl_val; - seq_printf(s, "%*s:", max_name_width, r->name); + seq_printf(s, "%*s:", max_name_width, schema->name); list_for_each_entry(dom, &r->domains, list) { if (sep) seq_puts(s, ";"); - ctrl_val = (!is_mba_sc(r) ? dom->ctrl_val[closid] : - dom->mbps_val[closid]); + if (is_mba_sc(r)) + ctrl_val = dom->mbps_val[closid]; + else + ctrl_val = resctrl_arch_get_config(r, dom, closid, + schema->conf_type); + seq_printf(s, r->format_str, dom->id, max_data_width, ctrl_val); sep = true; @@ -460,16 +493,17 @@ static void show_doms(struct seq_file *s, struct rdt_resource *r, int closid) int rdtgroup_schemata_show(struct kernfs_open_file *of, struct seq_file *s, void *v) { + struct resctrl_schema *schema; struct rdtgroup *rdtgrp; - struct rdt_resource *r; int ret = 0; u32 closid; rdtgrp = rdtgroup_kn_lock_live(of->kn); if (rdtgrp) { if (rdtgrp->mode == RDT_MODE_PSEUDO_LOCKSETUP) { - for_each_alloc_enabled_rdt_resource(r) - seq_printf(s, "%s:uninitialized\n", r->name); + list_for_each_entry(schema, &resctrl_schema_all, list) { + seq_printf(s, "%s:uninitialized\n", schema->name); + } } else if (rdtgrp->mode == RDT_MODE_PSEUDO_LOCKED) { if (!rdtgrp->plr->d) { rdt_last_cmd_clear(); @@ -477,15 +511,15 @@ int rdtgroup_schemata_show(struct kernfs_open_file *of, ret = -ENODEV; } else { seq_printf(s, "%s:%d=%x\n", - 
rdtgrp->plr->r->name, + rdtgrp->plr->s->res->name, rdtgrp->plr->d->id, rdtgrp->plr->cbm); } } else { closid = rdtgrp->closid; - for_each_alloc_enabled_rdt_resource(r) { - if (closid < r->num_closid) - show_doms(s, r, closid); + list_for_each_entry(schema, &resctrl_schema_all, list) { + if (closid < schema->num_closid) + show_doms(s, schema, closid); } } } else { @@ -495,14 +529,16 @@ int rdtgroup_schemata_show(struct kernfs_open_file *of, return ret; } -void mon_event_read(struct rmid_read *rr, struct rdt_domain *d, - struct rdtgroup *rdtgrp, int evtid, int first) +void mon_event_read(struct rmid_read *rr, struct rdt_resource *r, + struct rdt_domain *d, struct rdtgroup *rdtgrp, + int evtid, int first) { /* * setup the parameters to send to the IPI to read the data. */ rr->rgrp = rdtgrp; rr->evtid = evtid; + rr->r = r; rr->d = d; rr->val = 0; rr->first = first; @@ -532,21 +568,21 @@ int rdtgroup_mondata_show(struct seq_file *m, void *arg) domid = md.u.domid; evtid = md.u.evtid; - r = &rdt_resources_all[resid]; + r = &rdt_resources_all[resid].r_resctrl; d = rdt_find_domain(r, domid, NULL); if (IS_ERR_OR_NULL(d)) { ret = -ENOENT; goto out; } - mon_event_read(&rr, d, rdtgrp, evtid, false); + mon_event_read(&rr, r, d, rdtgrp, evtid, false); - if (rr.val & RMID_VAL_ERROR) + if (rr.err == -EIO) seq_puts(m, "Error\n"); - else if (rr.val & RMID_VAL_UNAVAIL) + else if (rr.err == -EINVAL) seq_puts(m, "Unavailable\n"); else - seq_printf(m, "%llu\n", rr.val * r->mon_scale); + seq_printf(m, "%llu\n", rr.val); out: rdtgroup_kn_unlock(of->kn); diff --git a/arch/x86/kernel/cpu/resctrl/internal.h b/arch/x86/kernel/cpu/resctrl/internal.h index 181c992f448c..5f7128686cfd 100644 --- a/arch/x86/kernel/cpu/resctrl/internal.h +++ b/arch/x86/kernel/cpu/resctrl/internal.h @@ -2,6 +2,7 @@ #ifndef _ASM_X86_RESCTRL_INTERNAL_H #define _ASM_X86_RESCTRL_INTERNAL_H +#include <linux/resctrl.h> #include <linux/sched.h> #include <linux/kernfs.h> #include <linux/fs_context.h> @@ -21,25 +22,23 @@ 
#define L2_QOS_CDP_ENABLE 0x01ULL -/* - * Event IDs are used to program IA32_QM_EVTSEL before reading event - * counter from IA32_QM_CTR - */ -#define QOS_L3_OCCUP_EVENT_ID 0x01 -#define QOS_L3_MBM_TOTAL_EVENT_ID 0x02 -#define QOS_L3_MBM_LOCAL_EVENT_ID 0x03 - #define CQM_LIMBOCHECK_INTERVAL 1000 -#define MBM_CNTR_WIDTH 24 +#define MBM_CNTR_WIDTH_BASE 24 #define MBM_OVERFLOW_INTERVAL 1000 #define MAX_MBA_BW 100u #define MBA_IS_LINEAR 0x4 -#define MBA_MAX_MBPS U32_MAX #define MAX_MBA_BW_AMD 0x800 +#define MBM_CNTR_WIDTH_OFFSET_AMD 20 #define RMID_VAL_ERROR BIT_ULL(63) #define RMID_VAL_UNAVAIL BIT_ULL(62) +/* + * With the above fields in use 62 bits remain in MSR_IA32_QM_CTR for + * data to be returned. The counter width is discovered from the hardware + * as an offset from MBM_CNTR_WIDTH_BASE. + */ +#define MBM_CNTR_WIDTH_OFFSET_MAX (62 - MBM_CNTR_WIDTH_BASE) struct rdt_fs_context { @@ -63,40 +62,46 @@ DECLARE_STATIC_KEY_FALSE(rdt_mon_enable_key); * struct mon_evt - Entry in the event list of a resource * @evtid: event id * @name: name of the event + * @list: entry in &rdt_resource->evt_list */ struct mon_evt { - u32 evtid; + enum resctrl_event_id evtid; char *name; struct list_head list; }; /** - * struct mon_data_bits - Monitoring details for each event file - * @rid: Resource id associated with the event file. 
+ * union mon_data_bits - Monitoring details for each event file + * @priv: Used to store monitoring event data in @u + * as kernfs private data + * @rid: Resource id associated with the event file * @evtid: Event id associated with the event file * @domid: The domain to which the event file belongs + * @u: Name of the bit fields struct */ union mon_data_bits { void *priv; struct { - unsigned int rid : 10; - unsigned int evtid : 8; - unsigned int domid : 14; + unsigned int rid : 10; + enum resctrl_event_id evtid : 8; + unsigned int domid : 14; } u; }; struct rmid_read { struct rdtgroup *rgrp; + struct rdt_resource *r; struct rdt_domain *d; - int evtid; + enum resctrl_event_id evtid; bool first; + int err; u64 val; }; -extern unsigned int resctrl_cqm_threshold; extern bool rdt_alloc_capable; extern bool rdt_mon_capable; extern unsigned int rdt_mon_features; +extern struct list_head resctrl_schema_all; enum rdt_group_type { RDTCTRL_GROUP = 0, @@ -111,6 +116,7 @@ enum rdt_group_type { * @RDT_MODE_PSEUDO_LOCKSETUP: Resource group will be used for Pseudo-Locking * @RDT_MODE_PSEUDO_LOCKED: No sharing of this resource group's allocations * allowed AND the allocations are Cache Pseudo-Locked + * @RDT_NUM_MODES: Total number of modes * * The mode of a resource group enables control over the allowed overlap * between allocations associated with different resource groups (classes @@ -134,7 +140,7 @@ enum rdtgrp_mode { /** * struct mongroup - store mon group's data in resctrl fs. 
- * @mon_data_kn kernlfs node for the mon_data directory + * @mon_data_kn: kernfs node for the mon_data directory * @parent: parent rdtgrp * @crdtgrp_list: child rdtgroup node list * @rmid: rmid for this rdtgroup @@ -148,8 +154,8 @@ struct mongroup { /** * struct pseudo_lock_region - pseudo-lock region information - * @r: RDT resource to which this pseudo-locked region - * belongs + * @s: Resctrl schema for the resource to which this + * pseudo-locked region belongs * @d: RDT domain to which this pseudo-locked region * belongs * @cbm: bitmask of the pseudo-locked region @@ -169,7 +175,7 @@ struct mongroup { * @pm_reqs: Power management QoS requests related to this region */ struct pseudo_lock_region { - struct rdt_resource *r; + struct resctrl_schema *s; struct rdt_domain *d; u32 cbm; wait_queue_head_t lock_thread_wq; @@ -256,7 +262,7 @@ void __exit rdtgroup_exit(void); struct rftype { char *name; umode_t mode; - struct kernfs_ops *kf_ops; + const struct kernfs_ops *kf_ops; unsigned long flags; unsigned long fflags; @@ -273,63 +279,52 @@ struct rftype { /** * struct mbm_state - status for each MBM counter in each domain - * @chunks: Total data moved (multiply by rdt_group.mon_scale to get bytes) - * @prev_msr Value of IA32_QM_CTR for this RMID last time we read it - * @chunks_bw Total local data moved. 
Used for bandwidth calculation - * @prev_bw_msr:Value of previous IA32_QM_CTR for bandwidth counting - * @prev_bw The most recent bandwidth in MBps - * @delta_bw Difference between the current and previous bandwidth - * @delta_comp Indicates whether to compute the delta_bw + * @prev_bw_bytes: Previous bytes value read for bandwidth calculation + * @prev_bw: The most recent bandwidth in MBps + * @delta_bw: Difference between the current and previous bandwidth + * @delta_comp: Indicates whether to compute the delta_bw */ struct mbm_state { - u64 chunks; - u64 prev_msr; - u64 chunks_bw; - u64 prev_bw_msr; + u64 prev_bw_bytes; u32 prev_bw; u32 delta_bw; bool delta_comp; }; /** - * struct rdt_domain - group of cpus sharing an RDT resource - * @list: all instances of this resource - * @id: unique id for this instance - * @cpu_mask: which cpus share this resource - * @rmid_busy_llc: - * bitmap of which limbo RMIDs are above threshold - * @mbm_total: saved state for MBM total bandwidth - * @mbm_local: saved state for MBM local bandwidth - * @mbm_over: worker to periodically read MBM h/w counters - * @cqm_limbo: worker to periodically read CQM h/w counters - * @mbm_work_cpu: - * worker cpu for MBM h/w counters - * @cqm_work_cpu: - * worker cpu for CQM h/w counters + * struct arch_mbm_state - values used to compute resctrl_arch_rmid_read()s + * return value. + * @chunks: Total data moved (multiply by rdt_group.mon_scale to get bytes) + * @prev_msr: Value of IA32_QM_CTR last time it was read for the RMID used to + * find this struct. 
+ */ +struct arch_mbm_state { + u64 chunks; + u64 prev_msr; +}; + +/** + * struct rdt_hw_domain - Arch private attributes of a set of CPUs that share + * a resource + * @d_resctrl: Properties exposed to the resctrl file system * @ctrl_val: array of cache or mem ctrl values (indexed by CLOSID) - * @mbps_val: When mba_sc is enabled, this holds the bandwidth in MBps - * @new_ctrl: new ctrl value to be loaded - * @have_new_ctrl: did user provide new_ctrl for this domain - * @plr: pseudo-locked region (if any) associated with domain + * @arch_mbm_total: arch private state for MBM total bandwidth + * @arch_mbm_local: arch private state for MBM local bandwidth + * + * Members of this structure are accessed via helpers that provide abstraction. */ -struct rdt_domain { - struct list_head list; - int id; - struct cpumask cpu_mask; - unsigned long *rmid_busy_llc; - struct mbm_state *mbm_total; - struct mbm_state *mbm_local; - struct delayed_work mbm_over; - struct delayed_work cqm_limbo; - int mbm_work_cpu; - int cqm_work_cpu; +struct rdt_hw_domain { + struct rdt_domain d_resctrl; u32 *ctrl_val; - u32 *mbps_val; - u32 new_ctrl; - bool have_new_ctrl; - struct pseudo_lock_region *plr; + struct arch_mbm_state *arch_mbm_total; + struct arch_mbm_state *arch_mbm_local; }; +static inline struct rdt_hw_domain *resctrl_to_arch_dom(struct rdt_domain *r) +{ + return container_of(r, struct rdt_hw_domain, d_resctrl); +} + /** * struct msr_param - set a range of MSRs from a domain * @res: The resource to use @@ -338,46 +333,8 @@ struct rdt_domain { */ struct msr_param { struct rdt_resource *res; - int low; - int high; -}; - -/** - * struct rdt_cache - Cache allocation related data - * @cbm_len: Length of the cache bit mask - * @min_cbm_bits: Minimum number of consecutive bits to be set - * @cbm_idx_mult: Multiplier of CBM index - * @cbm_idx_offset: Offset of CBM index. 
CBM index is computed by: - * closid * cbm_idx_multi + cbm_idx_offset - * in a cache bit mask - * @shareable_bits: Bitmask of shareable resource with other - * executing entities - */ -struct rdt_cache { - unsigned int cbm_len; - unsigned int min_cbm_bits; - unsigned int cbm_idx_mult; - unsigned int cbm_idx_offset; - unsigned int shareable_bits; -}; - -/** - * struct rdt_membw - Memory bandwidth allocation related data - * @max_delay: Max throttle delay. Delay is the hardware - * representation for memory bandwidth. - * @min_bw: Minimum memory bandwidth percentage user can request - * @bw_gran: Granularity at which the memory bandwidth is allocated - * @delay_linear: True if memory B/W delay is in linear scale - * @mba_sc: True if MBA software controller(mba_sc) is enabled - * @mb_map: Mapping of memory B/W percentage to memory B/W delay - */ -struct rdt_membw { - u32 max_delay; - u32 min_bw; - u32 bw_gran; - u32 delay_linear; - bool mba_sc; - u32 *mb_map; + u32 low; + u32 high; }; static inline bool is_llc_occupancy_enabled(void) @@ -412,114 +369,97 @@ struct rdt_parse_data { }; /** - * struct rdt_resource - attributes of an RDT resource - * @rid: The index of the resource - * @alloc_enabled: Is allocation enabled on this machine - * @mon_enabled: Is monitoring enabled for this feature - * @alloc_capable: Is allocation available on this machine - * @mon_capable: Is monitor feature available on this machine - * @name: Name to use in "schemata" file - * @num_closid: Number of CLOSIDs available - * @cache_level: Which cache level defines scope of this resource - * @default_ctrl: Specifies default cache cbm or memory B/W percent. + * struct rdt_hw_resource - arch private attributes of a resctrl resource + * @r_resctrl: Attributes of the resource used directly by resctrl. + * @num_closid: Maximum number of closid this hardware can support, + * regardless of CDP. 
This is exposed via + * resctrl_arch_get_num_closid() to avoid confusion + * with struct resctrl_schema's property of the same name, + * which has been corrected for features like CDP. * @msr_base: Base MSR address for CBMs * @msr_update: Function pointer to update QOS MSRs - * @data_width: Character width of data when displaying - * @domains: All domains for this resource - * @cache: Cache allocation related data - * @format_str: Per resource format string to show domain value - * @parse_ctrlval: Per resource function pointer to parse control values - * @cbm_validate Cache bitmask validate function - * @evt_list: List of monitoring events - * @num_rmid: Number of RMIDs available * @mon_scale: cqm counter * mon_scale = occupancy in bytes - * @fflags: flags to choose base and info files + * @mbm_width: Monitor width, to detect and correct for overflow. + * @cdp_enabled: CDP state of this resource + * + * Members of this structure are either private to the architecture + * e.g. mbm_width, or accessed via helpers that provide abstraction. e.g. + * msr_update and msr_base. 
*/ -struct rdt_resource { - int rid; - bool alloc_enabled; - bool mon_enabled; - bool alloc_capable; - bool mon_capable; - char *name; - int num_closid; - int cache_level; - u32 default_ctrl; +struct rdt_hw_resource { + struct rdt_resource r_resctrl; + u32 num_closid; unsigned int msr_base; void (*msr_update) (struct rdt_domain *d, struct msr_param *m, struct rdt_resource *r); - int data_width; - struct list_head domains; - struct rdt_cache cache; - struct rdt_membw membw; - const char *format_str; - int (*parse_ctrlval)(struct rdt_parse_data *data, - struct rdt_resource *r, - struct rdt_domain *d); - bool (*cbm_validate)(char *buf, u32 *data, struct rdt_resource *r); - struct list_head evt_list; - int num_rmid; unsigned int mon_scale; - unsigned long fflags; + unsigned int mbm_width; + bool cdp_enabled; }; -int parse_cbm(struct rdt_parse_data *data, struct rdt_resource *r, +static inline struct rdt_hw_resource *resctrl_to_arch_res(struct rdt_resource *r) +{ + return container_of(r, struct rdt_hw_resource, r_resctrl); +} + +int parse_cbm(struct rdt_parse_data *data, struct resctrl_schema *s, struct rdt_domain *d); -int parse_bw_intel(struct rdt_parse_data *data, struct rdt_resource *r, - struct rdt_domain *d); -int parse_bw_amd(struct rdt_parse_data *data, struct rdt_resource *r, - struct rdt_domain *d); +int parse_bw(struct rdt_parse_data *data, struct resctrl_schema *s, + struct rdt_domain *d); extern struct mutex rdtgroup_mutex; -extern struct rdt_resource rdt_resources_all[]; +extern struct rdt_hw_resource rdt_resources_all[]; extern struct rdtgroup rdtgroup_default; DECLARE_STATIC_KEY_FALSE(rdt_alloc_enable_key); extern struct dentry *debugfs_resctrl; -enum { +enum resctrl_res_level { RDT_RESOURCE_L3, - RDT_RESOURCE_L3DATA, - RDT_RESOURCE_L3CODE, RDT_RESOURCE_L2, - RDT_RESOURCE_L2DATA, - RDT_RESOURCE_L2CODE, RDT_RESOURCE_MBA, /* Must be the last */ RDT_NUM_RESOURCES, }; +static inline struct rdt_resource *resctrl_inc(struct rdt_resource *res) +{ + struct 
rdt_hw_resource *hw_res = resctrl_to_arch_res(res); + + hw_res++; + return &hw_res->r_resctrl; +} + +static inline bool resctrl_arch_get_cdp_enabled(enum resctrl_res_level l) +{ + return rdt_resources_all[l].cdp_enabled; +} + +int resctrl_arch_set_cdp_enabled(enum resctrl_res_level l, bool enable); + +/* + * To return the common struct rdt_resource, which is contained in struct + * rdt_hw_resource, walk the resctrl member of struct rdt_hw_resource. + */ #define for_each_rdt_resource(r) \ - for (r = rdt_resources_all; r < rdt_resources_all + RDT_NUM_RESOURCES;\ - r++) + for (r = &rdt_resources_all[0].r_resctrl; \ + r <= &rdt_resources_all[RDT_NUM_RESOURCES - 1].r_resctrl; \ + r = resctrl_inc(r)) #define for_each_capable_rdt_resource(r) \ - for (r = rdt_resources_all; r < rdt_resources_all + RDT_NUM_RESOURCES;\ - r++) \ + for_each_rdt_resource(r) \ if (r->alloc_capable || r->mon_capable) #define for_each_alloc_capable_rdt_resource(r) \ - for (r = rdt_resources_all; r < rdt_resources_all + RDT_NUM_RESOURCES;\ - r++) \ + for_each_rdt_resource(r) \ if (r->alloc_capable) #define for_each_mon_capable_rdt_resource(r) \ - for (r = rdt_resources_all; r < rdt_resources_all + RDT_NUM_RESOURCES;\ - r++) \ + for_each_rdt_resource(r) \ if (r->mon_capable) -#define for_each_alloc_enabled_rdt_resource(r) \ - for (r = rdt_resources_all; r < rdt_resources_all + RDT_NUM_RESOURCES;\ - r++) \ - if (r->alloc_enabled) - -#define for_each_mon_enabled_rdt_resource(r) \ - for (r = rdt_resources_all; r < rdt_resources_all + RDT_NUM_RESOURCES;\ - r++) \ - if (r->mon_enabled) - /* CPUID.(EAX=10H, ECX=ResID=1).EAX */ union cpuid_0x10_1_eax { struct { @@ -546,6 +486,7 @@ union cpuid_0x10_x_edx { void rdt_last_cmd_clear(void); void rdt_last_cmd_puts(const char *s); +__printf(1, 2) void rdt_last_cmd_printf(const char *fmt, ...); void rdt_ctrl_update(void *arg); @@ -560,7 +501,7 @@ ssize_t rdtgroup_schemata_write(struct kernfs_open_file *of, char *buf, size_t nbytes, loff_t off); int 
rdtgroup_schemata_show(struct kernfs_open_file *of, struct seq_file *s, void *v); -bool rdtgroup_cbm_overlaps(struct rdt_resource *r, struct rdt_domain *d, +bool rdtgroup_cbm_overlaps(struct resctrl_schema *s, struct rdt_domain *d, unsigned long cbm, int closid, bool exclusive); unsigned int rdtgroup_cbm_to_size(struct rdt_resource *r, struct rdt_domain *d, unsigned long cbm); @@ -575,7 +516,6 @@ void rdt_pseudo_lock_release(void); int rdtgroup_pseudo_lock_create(struct rdtgroup *rdtgrp); void rdtgroup_pseudo_lock_remove(struct rdtgroup *rdtgrp); struct rdt_domain *get_domain_from_cpu(int cpu, struct rdt_resource *r); -int update_domains(struct rdt_resource *r, int closid); int closids_supported(void); void closid_free(int closid); int alloc_rmid(void); @@ -583,23 +523,19 @@ void free_rmid(u32 rmid); int rdt_get_mon_l3_config(struct rdt_resource *r); void mon_event_count(void *info); int rdtgroup_mondata_show(struct seq_file *m, void *arg); -void rmdir_mondata_subdir_allrdtgrp(struct rdt_resource *r, - unsigned int dom_id); -void mkdir_mondata_subdir_allrdtgrp(struct rdt_resource *r, - struct rdt_domain *d); -void mon_event_read(struct rmid_read *rr, struct rdt_domain *d, - struct rdtgroup *rdtgrp, int evtid, int first); +void mon_event_read(struct rmid_read *rr, struct rdt_resource *r, + struct rdt_domain *d, struct rdtgroup *rdtgrp, + int evtid, int first); void mbm_setup_overflow_handler(struct rdt_domain *dom, unsigned long delay_ms); void mbm_handle_overflow(struct work_struct *work); +void __init intel_rdt_mbm_apply_quirk(void); bool is_mba_sc(struct rdt_resource *r); -void setup_default_ctrlval(struct rdt_resource *r, u32 *dc, u32 *dm); -u32 delay_bw_map(unsigned long bw, struct rdt_resource *r); void cqm_setup_limbo_handler(struct rdt_domain *dom, unsigned long delay_ms); void cqm_handle_limbo(struct work_struct *work); bool has_busy_rmid(struct rdt_resource *r, struct rdt_domain *d); void __check_limbo(struct rdt_domain *d, bool force_free); -bool 
cbm_validate_intel(char *buf, u32 *data, struct rdt_resource *r); -bool cbm_validate_amd(char *buf, u32 *data, struct rdt_resource *r); +void rdt_domain_reconfigure_cdp(struct rdt_resource *r); +void __init thread_throttle_mode_init(void); #endif /* _ASM_X86_RESCTRL_INTERNAL_H */ diff --git a/arch/x86/kernel/cpu/resctrl/monitor.c b/arch/x86/kernel/cpu/resctrl/monitor.c index 773124b0e18a..efe0c30d3a12 100644 --- a/arch/x86/kernel/cpu/resctrl/monitor.c +++ b/arch/x86/kernel/cpu/resctrl/monitor.c @@ -16,8 +16,12 @@ */ #include <linux/module.h> +#include <linux/sizes.h> #include <linux/slab.h> + #include <asm/cpu_device_id.h> +#include <asm/resctrl.h> + #include "internal.h" struct rmid_entry { @@ -37,8 +41,8 @@ static LIST_HEAD(rmid_free_lru); * @rmid_limbo_count count of currently unused but (potentially) * dirty RMIDs. * This counts RMIDs that no one is currently using but that - * may have a occupancy value > intel_cqm_threshold. User can change - * the threshold occupancy value. + * may have a occupancy value > resctrl_rmid_realloc_threshold. User can + * change the threshold occupancy value. */ static unsigned int rmid_limbo_count; @@ -59,10 +63,78 @@ bool rdt_mon_capable; unsigned int rdt_mon_features; /* - * This is the threshold cache occupancy at which we will consider an + * This is the threshold cache occupancy in bytes at which we will consider an * RMID available for re-allocation. */ -unsigned int resctrl_cqm_threshold; +unsigned int resctrl_rmid_realloc_threshold; + +/* + * This is the maximum value for the reallocation threshold, in bytes. + */ +unsigned int resctrl_rmid_realloc_limit; + +#define CF(cf) ((unsigned long)(1048576 * (cf) + 0.5)) + +/* + * The correction factor table is documented in Documentation/x86/resctrl.rst. + * If rmid > rmid threshold, MBM total and local values should be multiplied + * by the correction factor. + * + * The original table is modified for better code: + * + * 1. 
The threshold 0 is changed to rmid count - 1 so don't do correction + * for the case. + * 2. MBM total and local correction table indexed by core counter which is + * equal to (x86_cache_max_rmid + 1) / 8 - 1 and is from 0 up to 27. + * 3. The correction factor is normalized to 2^20 (1048576) so it's faster + * to calculate corrected value by shifting: + * corrected_value = (original_value * correction_factor) >> 20 + */ +static const struct mbm_correction_factor_table { + u32 rmidthreshold; + u64 cf; +} mbm_cf_table[] __initconst = { + {7, CF(1.000000)}, + {15, CF(1.000000)}, + {15, CF(0.969650)}, + {31, CF(1.000000)}, + {31, CF(1.066667)}, + {31, CF(0.969650)}, + {47, CF(1.142857)}, + {63, CF(1.000000)}, + {63, CF(1.185115)}, + {63, CF(1.066553)}, + {79, CF(1.454545)}, + {95, CF(1.000000)}, + {95, CF(1.230769)}, + {95, CF(1.142857)}, + {95, CF(1.066667)}, + {127, CF(1.000000)}, + {127, CF(1.254863)}, + {127, CF(1.185255)}, + {151, CF(1.000000)}, + {127, CF(1.066667)}, + {167, CF(1.000000)}, + {159, CF(1.454334)}, + {183, CF(1.000000)}, + {127, CF(0.969744)}, + {191, CF(1.280246)}, + {191, CF(1.230921)}, + {215, CF(1.000000)}, + {191, CF(1.143118)}, +}; + +static u32 mbm_cf_rmidthreshold __read_mostly = UINT_MAX; +static u64 mbm_cf __read_mostly; + +static inline u64 get_corrected_mbm_count(u32 rmid, unsigned long val) +{ + /* Correct MBM value. 
*/ + if (rmid > mbm_cf_rmidthreshold) + val = (val * mbm_cf) >> 20; + + return val; +} static inline struct rmid_entry *__rmid_entry(u32 rmid) { @@ -74,9 +146,54 @@ static inline struct rmid_entry *__rmid_entry(u32 rmid) return entry; } -static u64 __rmid_read(u32 rmid, u32 eventid) +static struct arch_mbm_state *get_arch_mbm_state(struct rdt_hw_domain *hw_dom, + u32 rmid, + enum resctrl_event_id eventid) +{ + switch (eventid) { + case QOS_L3_OCCUP_EVENT_ID: + return NULL; + case QOS_L3_MBM_TOTAL_EVENT_ID: + return &hw_dom->arch_mbm_total[rmid]; + case QOS_L3_MBM_LOCAL_EVENT_ID: + return &hw_dom->arch_mbm_local[rmid]; + } + + /* Never expect to get here */ + WARN_ON_ONCE(1); + + return NULL; +} + +void resctrl_arch_reset_rmid(struct rdt_resource *r, struct rdt_domain *d, + u32 rmid, enum resctrl_event_id eventid) +{ + struct rdt_hw_domain *hw_dom = resctrl_to_arch_dom(d); + struct arch_mbm_state *am; + + am = get_arch_mbm_state(hw_dom, rmid, eventid); + if (am) + memset(am, 0, sizeof(*am)); +} + +static u64 mbm_overflow_count(u64 prev_msr, u64 cur_msr, unsigned int width) { - u64 val; + u64 shift = 64 - width, chunks; + + chunks = (cur_msr << shift) - (prev_msr << shift); + return chunks >> shift; +} + +int resctrl_arch_rmid_read(struct rdt_resource *r, struct rdt_domain *d, + u32 rmid, enum resctrl_event_id eventid, u64 *val) +{ + struct rdt_hw_resource *hw_res = resctrl_to_arch_res(r); + struct rdt_hw_domain *hw_dom = resctrl_to_arch_dom(d); + struct arch_mbm_state *am; + u64 msr_val, chunks; + + if (!cpumask_test_cpu(smp_processor_id(), &d->cpu_mask)) + return -EINVAL; /* * As per the SDM, when IA32_QM_EVTSEL.EvtID (bits 7:0) is configured @@ -87,16 +204,26 @@ static u64 __rmid_read(u32 rmid, u32 eventid) * are error bits. 
*/ wrmsr(MSR_IA32_QM_EVTSEL, eventid, rmid); - rdmsrl(MSR_IA32_QM_CTR, val); + rdmsrl(MSR_IA32_QM_CTR, msr_val); - return val; -} + if (msr_val & RMID_VAL_ERROR) + return -EIO; + if (msr_val & RMID_VAL_UNAVAIL) + return -EINVAL; -static bool rmid_dirty(struct rmid_entry *entry) -{ - u64 val = __rmid_read(entry->rmid, QOS_L3_OCCUP_EVENT_ID); + am = get_arch_mbm_state(hw_dom, rmid, eventid); + if (am) { + am->chunks += mbm_overflow_count(am->prev_msr, msr_val, + hw_res->mbm_width); + chunks = get_corrected_mbm_count(rmid, am->chunks); + am->prev_msr = msr_val; + } else { + chunks = msr_val; + } - return val >= resctrl_cqm_threshold; + *val = chunks * hw_res->mon_scale; + + return 0; } /* @@ -107,11 +234,11 @@ static bool rmid_dirty(struct rmid_entry *entry) */ void __check_limbo(struct rdt_domain *d, bool force_free) { + struct rdt_resource *r = &rdt_resources_all[RDT_RESOURCE_L3].r_resctrl; struct rmid_entry *entry; - struct rdt_resource *r; u32 crmid = 1, nrmid; - - r = &rdt_resources_all[RDT_RESOURCE_L3]; + bool rmid_dirty; + u64 val = 0; /* * Skip RMID 0 and start from RMID 1 and check all the RMIDs that @@ -125,7 +252,15 @@ void __check_limbo(struct rdt_domain *d, bool force_free) break; entry = __rmid_entry(nrmid); - if (force_free || !rmid_dirty(entry)) { + + if (resctrl_arch_rmid_read(r, d, entry->rmid, + QOS_L3_OCCUP_EVENT_ID, &val)) { + rmid_dirty = true; + } else { + rmid_dirty = (val >= resctrl_rmid_realloc_threshold); + } + + if (force_free || !rmid_dirty) { clear_bit(entry->rmid, d->rmid_busy_llc); if (!--entry->busy) { rmid_limbo_count--; @@ -164,19 +299,19 @@ int alloc_rmid(void) static void add_rmid_to_limbo(struct rmid_entry *entry) { - struct rdt_resource *r; + struct rdt_resource *r = &rdt_resources_all[RDT_RESOURCE_L3].r_resctrl; struct rdt_domain *d; - int cpu; - u64 val; - - r = &rdt_resources_all[RDT_RESOURCE_L3]; + int cpu, err; + u64 val = 0; entry->busy = 0; cpu = get_cpu(); list_for_each_entry(d, &r->domains, list) { if 
(cpumask_test_cpu(cpu, &d->cpu_mask)) { - val = __rmid_read(entry->rmid, QOS_L3_OCCUP_EVENT_ID); - if (val <= resctrl_cqm_threshold) + err = resctrl_arch_rmid_read(r, d, entry->rmid, + QOS_L3_OCCUP_EVENT_ID, + &val); + if (err || val <= resctrl_rmid_realloc_threshold) continue; } @@ -214,24 +349,18 @@ void free_rmid(u32 rmid) list_add_tail(&entry->list, &rmid_free_lru); } -static u64 mbm_overflow_count(u64 prev_msr, u64 cur_msr) -{ - u64 shift = 64 - MBM_CNTR_WIDTH, chunks; - - chunks = (cur_msr << shift) - (prev_msr << shift); - return chunks >>= shift; -} - static int __mon_event_count(u32 rmid, struct rmid_read *rr) { struct mbm_state *m; - u64 chunks, tval; + u64 tval = 0; + + if (rr->first) + resctrl_arch_reset_rmid(rr->r, rr->d, rmid, rr->evtid); + + rr->err = resctrl_arch_rmid_read(rr->r, rr->d, rmid, rr->evtid, &tval); + if (rr->err) + return rr->err; - tval = __rmid_read(rmid, rr->evtid); - if (tval & (RMID_VAL_ERROR | RMID_VAL_UNAVAIL)) { - rr->val = tval; - return -EINVAL; - } switch (rr->evtid) { case QOS_L3_OCCUP_EVENT_ID: rr->val += tval; @@ -244,50 +373,48 @@ static int __mon_event_count(u32 rmid, struct rmid_read *rr) break; default: /* - * Code would never reach here because - * an invalid event id would fail the __rmid_read. + * Code would never reach here because an invalid + * event id would fail in resctrl_arch_rmid_read(). */ return -EINVAL; } if (rr->first) { memset(m, 0, sizeof(struct mbm_state)); - m->prev_bw_msr = m->prev_msr = tval; return 0; } - chunks = mbm_overflow_count(m->prev_msr, tval); - m->chunks += chunks; - m->prev_msr = tval; + rr->val += tval; - rr->val += m->chunks; return 0; } /* + * mbm_bw_count() - Update bw count from values previously read by + * __mon_event_count(). + * @rmid: The rmid used to identify the cached mbm_state. + * @rr: The struct rmid_read populated by __mon_event_count(). + * * Supporting function to calculate the memory bandwidth - * and delta bandwidth in MBps. + * and delta bandwidth in MBps. 
The chunks value previously read by + * __mon_event_count() is compared with the chunks value from the previous + * invocation. This must be called once per second to maintain values in MBps. */ static void mbm_bw_count(u32 rmid, struct rmid_read *rr) { - struct rdt_resource *r = &rdt_resources_all[RDT_RESOURCE_L3]; struct mbm_state *m = &rr->d->mbm_local[rmid]; - u64 tval, cur_bw, chunks; + u64 cur_bw, bytes, cur_bytes; - tval = __rmid_read(rmid, rr->evtid); - if (tval & (RMID_VAL_ERROR | RMID_VAL_UNAVAIL)) - return; + cur_bytes = rr->val; + bytes = cur_bytes - m->prev_bw_bytes; + m->prev_bw_bytes = cur_bytes; - chunks = mbm_overflow_count(m->prev_bw_msr, tval); - m->chunks_bw += chunks; - m->chunks = m->chunks_bw; - cur_bw = (chunks * r->mon_scale) >> 20; + cur_bw = bytes / SZ_1M; if (m->delta_comp) m->delta_bw = abs(cur_bw - m->prev_bw); m->delta_comp = false; m->prev_bw = cur_bw; - m->prev_bw_msr = tval; } /* @@ -299,23 +426,33 @@ void mon_event_count(void *info) struct rdtgroup *rdtgrp, *entry; struct rmid_read *rr = info; struct list_head *head; + int ret; rdtgrp = rr->rgrp; - if (__mon_event_count(rdtgrp->mon.rmid, rr)) - return; + ret = __mon_event_count(rdtgrp->mon.rmid, rr); /* - * For Ctrl groups read data from child monitor groups. + * For Ctrl groups read data from child monitor groups and + * add them together. Count events which are read successfully. + * Discard the rmid_read's reporting errors. */ head = &rdtgrp->mon.crdtgrp_list; if (rdtgrp->type == RDTCTRL_GROUP) { list_for_each_entry(entry, head, mon.crdtgrp_list) { - if (__mon_event_count(entry->mon.rmid, rr)) - return; + if (__mon_event_count(entry->mon.rmid, rr) == 0) + ret = 0; } } + + /* + * __mon_event_count() calls for newly created monitor groups may + * report -EINVAL/Unavailable if the monitor hasn't seen any traffic. + * Discard error if any of the monitor event reads succeeded. 
+ */ + if (ret == 0) + rr->err = 0; } /* @@ -325,7 +462,7 @@ void mon_event_count(void *info) * adjust the bandwidth percentage values via the IA32_MBA_THRTL_MSRs so * that: * - * current bandwdith(cur_bw) < user specified bandwidth(user_bw) + * current bandwidth(cur_bw) < user specified bandwidth(user_bw) * * This uses the MBM counters to measure the bandwidth and MBA throttle * MSRs to control the bandwidth for a particular rdtgrp. It builds on the @@ -335,7 +472,7 @@ void mon_event_count(void *info) * timer. Having 1s interval makes the calculation of bandwidth simpler. * * Although MBA's goal is to restrict the bandwidth to a maximum, there may - * be a need to increase the bandwidth to avoid uncecessarily restricting + * be a need to increase the bandwidth to avoid unnecessarily restricting * the L2 <-> L3 traffic. * * Since MBA controls the L2 external bandwidth where as MBM measures the @@ -352,7 +489,7 @@ void mon_event_count(void *info) */ static void update_mba_bw(struct rdtgroup *rgrp, struct rdt_domain *dom_mbm) { - u32 closid, rmid, cur_msr, cur_msr_val, new_msr_val; + u32 closid, rmid, cur_msr_val, new_msr_val; struct mbm_state *pmbm_data, *cmbm_data; u32 cur_bw, delta_bw, user_bw; struct rdt_resource *r_mba; @@ -363,7 +500,8 @@ static void update_mba_bw(struct rdtgroup *rgrp, struct rdt_domain *dom_mbm) if (!is_mbm_local_enabled()) return; - r_mba = &rdt_resources_all[RDT_RESOURCE_MBA]; + r_mba = &rdt_resources_all[RDT_RESOURCE_MBA].r_resctrl; + closid = rgrp->closid; rmid = rgrp->mon.rmid; pmbm_data = &dom_mbm->mbm_local[rmid]; @@ -377,7 +515,9 @@ static void update_mba_bw(struct rdtgroup *rgrp, struct rdt_domain *dom_mbm) cur_bw = pmbm_data->prev_bw; user_bw = dom_mba->mbps_val[closid]; delta_bw = pmbm_data->delta_bw; - cur_msr_val = dom_mba->ctrl_val[closid]; + + /* MBA resource doesn't support CDP */ + cur_msr_val = resctrl_arch_get_config(r_mba, dom_mba, closid, CDP_NONE); /* * For Ctrl groups read data from child monitor groups. 
@@ -412,13 +552,11 @@ static void update_mba_bw(struct rdtgroup *rgrp, struct rdt_domain *dom_mbm) return; } - cur_msr = r_mba->msr_base + closid; - wrmsrl(cur_msr, delay_bw_map(new_msr_val, r_mba)); - dom_mba->ctrl_val[closid] = new_msr_val; + resctrl_arch_update_one(r_mba, dom_mba, closid, CDP_NONE, new_msr_val); /* * Delta values are updated dynamically package wise for each - * rdtgrp everytime the throttle MSR changes value. + * rdtgrp every time the throttle MSR changes value. * * This is because (1)the increase in bandwidth is not perfectly * linear and only "approximately" linear even when the hardware @@ -433,11 +571,12 @@ static void update_mba_bw(struct rdtgroup *rgrp, struct rdt_domain *dom_mbm) } } -static void mbm_update(struct rdt_domain *d, int rmid) +static void mbm_update(struct rdt_resource *r, struct rdt_domain *d, int rmid) { struct rmid_read rr; rr.first = false; + rr.r = r; rr.d = d; /* @@ -446,19 +585,20 @@ static void mbm_update(struct rdt_domain *d, int rmid) */ if (is_mbm_total_enabled()) { rr.evtid = QOS_L3_MBM_TOTAL_EVENT_ID; + rr.val = 0; __mon_event_count(rmid, &rr); } if (is_mbm_local_enabled()) { rr.evtid = QOS_L3_MBM_LOCAL_EVENT_ID; + rr.val = 0; + __mon_event_count(rmid, &rr); /* * Call the MBA software controller only for the * control groups and when user has enabled * the software controller explicitly. 
*/ - if (!is_mba_sc(NULL)) - __mon_event_count(rmid, &rr); - else + if (is_mba_sc(NULL)) mbm_bw_count(rmid, &rr); } } @@ -476,20 +616,14 @@ void cqm_handle_limbo(struct work_struct *work) mutex_lock(&rdtgroup_mutex); - r = &rdt_resources_all[RDT_RESOURCE_L3]; - d = get_domain_from_cpu(cpu, r); - - if (!d) { - pr_warn_once("Failure to get domain for limbo worker\n"); - goto out_unlock; - } + r = &rdt_resources_all[RDT_RESOURCE_L3].r_resctrl; + d = container_of(work, struct rdt_domain, cqm_limbo.work); __check_limbo(d, false); if (has_busy_rmid(r, d)) schedule_delayed_work_on(cpu, &d->cqm_limbo, delay); -out_unlock: mutex_unlock(&rdtgroup_mutex); } @@ -510,6 +644,7 @@ void mbm_handle_overflow(struct work_struct *work) struct rdtgroup *prgrp, *crgrp; int cpu = smp_processor_id(); struct list_head *head; + struct rdt_resource *r; struct rdt_domain *d; mutex_lock(&rdtgroup_mutex); @@ -517,16 +652,15 @@ void mbm_handle_overflow(struct work_struct *work) if (!static_branch_likely(&rdt_mon_enable_key)) goto out_unlock; - d = get_domain_from_cpu(cpu, &rdt_resources_all[RDT_RESOURCE_L3]); - if (!d) - goto out_unlock; + r = &rdt_resources_all[RDT_RESOURCE_L3].r_resctrl; + d = container_of(work, struct rdt_domain, mbm_over.work); list_for_each_entry(prgrp, &rdt_all_groups, rdtgroup_list) { - mbm_update(d, prgrp->mon.rmid); + mbm_update(r, d, prgrp->mon.rmid); head = &prgrp->mon.crdtgrp_list; list_for_each_entry(crgrp, head, mon.crdtgrp_list) - mbm_update(d, crgrp->mon.rmid); + mbm_update(r, d, crgrp->mon.rmid); if (is_mba_sc(NULL)) update_mba_bw(prgrp, d); @@ -614,11 +748,20 @@ static void l3_mon_evt_init(struct rdt_resource *r) int rdt_get_mon_l3_config(struct rdt_resource *r) { - unsigned int cl_size = boot_cpu_data.x86_cache_size; + unsigned int mbm_offset = boot_cpu_data.x86_cache_mbm_width_offset; + struct rdt_hw_resource *hw_res = resctrl_to_arch_res(r); + unsigned int threshold; int ret; - r->mon_scale = boot_cpu_data.x86_cache_occ_scale; + resctrl_rmid_realloc_limit = 
boot_cpu_data.x86_cache_size * 1024; + hw_res->mon_scale = boot_cpu_data.x86_cache_occ_scale; r->num_rmid = boot_cpu_data.x86_cache_max_rmid + 1; + hw_res->mbm_width = MBM_CNTR_WIDTH_BASE; + + if (mbm_offset > 0 && mbm_offset <= MBM_CNTR_WIDTH_OFFSET_MAX) + hw_res->mbm_width += mbm_offset; + else if (mbm_offset > MBM_CNTR_WIDTH_OFFSET_MAX) + pr_warn("Ignoring impossible MBM counter offset\n"); /* * A reasonable upper limit on the max threshold is the number @@ -627,10 +770,14 @@ int rdt_get_mon_l3_config(struct rdt_resource *r) * * For a 35MB LLC and 56 RMIDs, this is ~1.8% of the LLC. */ - resctrl_cqm_threshold = cl_size * 1024 / r->num_rmid; + threshold = resctrl_rmid_realloc_limit / r->num_rmid; - /* h/w works in units of "boot_cpu_data.x86_cache_occ_scale" */ - resctrl_cqm_threshold /= r->mon_scale; + /* + * Because num_rmid may not be a power of two, round the value + * to the nearest multiple of hw_res->mon_scale so it matches a + * value the hardware will measure. mon_scale may not be a power of 2. 
+ */ + resctrl_rmid_realloc_threshold = resctrl_arch_round_mon_val(threshold); ret = dom_data_init(r); if (ret) @@ -639,7 +786,20 @@ int rdt_get_mon_l3_config(struct rdt_resource *r) l3_mon_evt_init(r); r->mon_capable = true; - r->mon_enabled = true; return 0; } + +void __init intel_rdt_mbm_apply_quirk(void) +{ + int cf_index; + + cf_index = (boot_cpu_data.x86_cache_max_rmid + 1) / 8 - 1; + if (cf_index >= ARRAY_SIZE(mbm_cf_table)) { + pr_info("No MBM correction factor available\n"); + return; + } + + mbm_cf_rmidthreshold = mbm_cf_table[cf_index].rmidthreshold; + mbm_cf = mbm_cf_table[cf_index].cf; +} diff --git a/arch/x86/kernel/cpu/resctrl/pseudo_lock.c b/arch/x86/kernel/cpu/resctrl/pseudo_lock.c index d7623e1b927d..d961ae3ed96e 100644 --- a/arch/x86/kernel/cpu/resctrl/pseudo_lock.c +++ b/arch/x86/kernel/cpu/resctrl/pseudo_lock.c @@ -24,7 +24,7 @@ #include <asm/cacheflush.h> #include <asm/intel-family.h> -#include <asm/resctrl_sched.h> +#include <asm/resctrl.h> #include <asm/perf_event.h> #include "../../events/perf_event.h" /* For X86_CONFIG() */ @@ -49,6 +49,7 @@ static struct class *pseudo_lock_class; /** * get_prefetch_disable_bits - prefetch disable bits of supported platforms + * @void: It takes no parameters. * * Capture the list of platforms that have been validated to support * pseudo-locking. This includes testing to ensure pseudo-locked regions @@ -162,7 +163,7 @@ static struct rdtgroup *region_find_by_minor(unsigned int minor) } /** - * pseudo_lock_pm_req - A power management QoS request list entry + * struct pseudo_lock_pm_req - A power management QoS request list entry * @list: Entry within the @pm_reqs list for a pseudo-locked region * @req: PM QoS request */ @@ -184,6 +185,7 @@ static void pseudo_lock_cstates_relax(struct pseudo_lock_region *plr) /** * pseudo_lock_cstates_constrain - Restrict cores from entering C6 + * @plr: Pseudo-locked region * * To prevent the cache from being affected by power management entering * C6 has to be avoided. 
This is accomplished by requesting a latency @@ -196,6 +198,8 @@ static void pseudo_lock_cstates_relax(struct pseudo_lock_region *plr) * the ACPI latencies need to be considered while keeping in mind that C2 * may be set to map to deeper sleep states. In this case the latency * requirement needs to prevent entering C2 also. + * + * Return: 0 on success, <0 on failure */ static int pseudo_lock_cstates_constrain(struct pseudo_lock_region *plr) { @@ -246,7 +250,7 @@ static void pseudo_lock_region_clear(struct pseudo_lock_region *plr) plr->line_size = 0; kfree(plr->kmem); plr->kmem = NULL; - plr->r = NULL; + plr->s = NULL; if (plr->d) plr->d->plr = NULL; plr->d = NULL; @@ -290,10 +294,10 @@ static int pseudo_lock_region_init(struct pseudo_lock_region *plr) ci = get_cpu_cacheinfo(plr->cpu); - plr->size = rdtgroup_cbm_to_size(plr->r, plr->d, plr->cbm); + plr->size = rdtgroup_cbm_to_size(plr->s->res, plr->d, plr->cbm); for (i = 0; i < ci->num_leaves; i++) { - if (ci->info_list[i].level == plr->r->cache_level) { + if (ci->info_list[i].level == plr->s->res->cache_level) { plr->line_size = ci->info_list[i].coherency_line_size; return 0; } @@ -416,6 +420,7 @@ static int pseudo_lock_fn(void *_rdtgrp) struct pseudo_lock_region *plr = rdtgrp->plr; u32 rmid_p, closid_p; unsigned long i; + u64 saved_msr; #ifdef CONFIG_KASAN /* * The registers used for local register variables are also used @@ -459,6 +464,7 @@ static int pseudo_lock_fn(void *_rdtgrp) * the buffer and evict pseudo-locked memory read earlier from the * cache. 
*/ + saved_msr = __rdmsr(MSR_MISC_FEATURE_CONTROL); __wrmsr(MSR_MISC_FEATURE_CONTROL, prefetch_disable_bits, 0x0); closid_p = this_cpu_read(pqr_state.cur_closid); rmid_p = this_cpu_read(pqr_state.cur_rmid); @@ -510,7 +516,7 @@ static int pseudo_lock_fn(void *_rdtgrp) __wrmsr(IA32_PQR_ASSOC, rmid_p, closid_p); /* Re-enable the hardware prefetcher(s) */ - wrmsr(MSR_MISC_FEATURE_CONTROL, 0x0, 0x0); + wrmsrl(MSR_MISC_FEATURE_CONTROL, saved_msr); local_irq_enable(); plr->thread_done = 1; @@ -520,7 +526,7 @@ static int pseudo_lock_fn(void *_rdtgrp) /** * rdtgroup_monitor_in_progress - Test if monitoring in progress - * @r: resource group being queried + * @rdtgrp: resource group being queried * * Return: 1 if monitor groups have been created for this resource * group, 0 otherwise. @@ -684,8 +690,8 @@ int rdtgroup_locksetup_enter(struct rdtgroup *rdtgrp) * resource, the portion of cache used by it should be made * unavailable to all future allocations from both resources. */ - if (rdt_resources_all[RDT_RESOURCE_L3DATA].alloc_enabled || - rdt_resources_all[RDT_RESOURCE_L2DATA].alloc_enabled) { + if (resctrl_arch_get_cdp_enabled(RDT_RESOURCE_L3) || + resctrl_arch_get_cdp_enabled(RDT_RESOURCE_L2)) { rdt_last_cmd_puts("CDP enabled\n"); return -EINVAL; } @@ -796,7 +802,7 @@ bool rdtgroup_cbm_overlaps_pseudo_locked(struct rdt_domain *d, unsigned long cbm unsigned long cbm_b; if (d->plr) { - cbm_len = d->plr->r->cache.cbm_len; + cbm_len = d->plr->s->res->cache.cbm_len; cbm_b = d->plr->cbm; if (bitmap_intersects(&cbm, &cbm_b, cbm_len)) return true; @@ -831,7 +837,7 @@ bool rdtgroup_pseudo_locked_in_hierarchy(struct rdt_domain *d) * First determine which cpus have pseudo-locked regions * associated with them. 
*/ - for_each_alloc_enabled_rdt_resource(r) { + for_each_alloc_capable_rdt_resource(r) { list_for_each_entry(d_i, &r->domains, list) { if (d_i->plr) cpumask_or(cpu_with_psl, cpu_with_psl, @@ -867,6 +873,7 @@ bool rdtgroup_pseudo_locked_in_hierarchy(struct rdt_domain *d) static int measure_cycles_lat_fn(void *_plr) { struct pseudo_lock_region *plr = _plr; + u32 saved_low, saved_high; unsigned long i; u64 start, end; void *mem_r; @@ -875,6 +882,7 @@ static int measure_cycles_lat_fn(void *_plr) /* * Disable hardware prefetchers. */ + rdmsr(MSR_MISC_FEATURE_CONTROL, saved_low, saved_high); wrmsr(MSR_MISC_FEATURE_CONTROL, prefetch_disable_bits, 0x0); mem_r = READ_ONCE(plr->kmem); /* @@ -891,7 +899,7 @@ static int measure_cycles_lat_fn(void *_plr) end = rdtsc_ordered(); trace_pseudo_lock_mem_latency((u32)(end - start)); } - wrmsr(MSR_MISC_FEATURE_CONTROL, 0x0, 0x0); + wrmsr(MSR_MISC_FEATURE_CONTROL, saved_low, saved_high); local_irq_enable(); plr->thread_done = 1; wake_up_interruptible(&plr->lock_thread_wq); @@ -936,6 +944,7 @@ static int measure_residency_fn(struct perf_event_attr *miss_attr, u64 hits_before = 0, hits_after = 0, miss_before = 0, miss_after = 0; struct perf_event *miss_event, *hit_event; int hit_pmcnum, miss_pmcnum; + u32 saved_low, saved_high; unsigned int line_size; unsigned int size; unsigned long i; @@ -969,6 +978,7 @@ static int measure_residency_fn(struct perf_event_attr *miss_attr, /* * Disable hardware prefetchers. 
*/ + rdmsr(MSR_MISC_FEATURE_CONTROL, saved_low, saved_high); wrmsr(MSR_MISC_FEATURE_CONTROL, prefetch_disable_bits, 0x0); /* Initialize rest of local variables */ @@ -1027,7 +1037,7 @@ static int measure_residency_fn(struct perf_event_attr *miss_attr, */ rmb(); /* Re-enable hardware prefetchers */ - wrmsr(MSR_MISC_FEATURE_CONTROL, 0x0, 0x0); + wrmsr(MSR_MISC_FEATURE_CONTROL, saved_low, saved_high); local_irq_enable(); out_hit: perf_event_release_kernel(hit_event); @@ -1140,6 +1150,8 @@ out: /** * pseudo_lock_measure_cycles - Trigger latency measure to pseudo-locked region + * @rdtgrp: Resource group to which the pseudo-locked region belongs. + * @sel: Selector of which measurement to perform on a pseudo-locked region. * * The measurement of latency to access a pseudo-locked region should be * done from a cpu that is associated with that pseudo-locked region. @@ -1307,7 +1319,7 @@ int rdtgroup_pseudo_lock_create(struct rdtgroup *rdtgrp) * If the thread does not get on the CPU for whatever * reason and the process which sets up the region is * interrupted then this will leave the thread in runnable - * state and once it gets on the CPU it will derefence + * state and once it gets on the CPU it will dereference * the cleared, but not freed, plr struct resulting in an * empty pseudo-locking loop. */ @@ -1326,9 +1338,9 @@ int rdtgroup_pseudo_lock_create(struct rdtgroup *rdtgrp) * pseudo-locked region will still be here on return. * * The mutex has to be released temporarily to avoid a potential - * deadlock with the mm->mmap_sem semaphore which is obtained in - * the device_create() and debugfs_create_dir() callpath below - * as well as before the mmap() callback is called. + * deadlock with the mm->mmap_lock which is obtained in the + * device_create() and debugfs_create_dir() callpath below as well as + * before the mmap() callback is called. 
*/ mutex_unlock(&rdtgroup_mutex); @@ -1391,7 +1403,7 @@ out: * group is removed from user space via a "rmdir" from userspace or the * unmount of the resctrl filesystem. On removal the resource group does * not go back to pseudo-locksetup mode before it is removed, instead it is - * removed directly. There is thus assymmetry with the creation where the + * removed directly. There is thus asymmetry with the creation where the * &struct pseudo_lock_region is removed here while it was not created in * rdtgroup_pseudo_lock_create(). * diff --git a/arch/x86/kernel/cpu/resctrl/rdtgroup.c b/arch/x86/kernel/cpu/resctrl/rdtgroup.c index 064e9ef44cd6..e5a48f05e787 100644 --- a/arch/x86/kernel/cpu/resctrl/rdtgroup.c +++ b/arch/x86/kernel/cpu/resctrl/rdtgroup.c @@ -1,6 +1,6 @@ // SPDX-License-Identifier: GPL-2.0-only /* - * User interface for Resource Alloction in Resource Director Technology(RDT) + * User interface for Resource Allocation in Resource Director Technology(RDT) * * Copyright (C) 2016 Intel Corporation * @@ -29,7 +29,7 @@ #include <uapi/linux/magic.h> -#include <asm/resctrl_sched.h> +#include <asm/resctrl.h> #include "internal.h" DEFINE_STATIC_KEY_FALSE(rdt_enable_key); @@ -39,6 +39,9 @@ static struct kernfs_root *rdt_root; struct rdtgroup rdtgroup_default; LIST_HEAD(rdt_all_groups); +/* list of entries for the schemata file */ +LIST_HEAD(resctrl_schema_all); + /* Kernel fs node for "info" directory under root */ static struct kernfs_node *kn_info; @@ -100,12 +103,12 @@ int closids_supported(void) static void closid_init(void) { - struct rdt_resource *r; - int rdt_min_closid = 32; + struct resctrl_schema *s; + u32 rdt_min_closid = 32; /* Compute rdt_min_closid across all resources */ - for_each_alloc_enabled_rdt_resource(r) - rdt_min_closid = min(rdt_min_closid, r->num_closid); + list_for_each_entry(s, &resctrl_schema_all, list) + rdt_min_closid = min(rdt_min_closid, s->num_closid); closid_free_map = BIT_MASK(rdt_min_closid) - 1; @@ -240,13 +243,13 @@ static 
ssize_t rdtgroup_file_write(struct kernfs_open_file *of, char *buf, return -EINVAL; } -static struct kernfs_ops rdtgroup_kf_single_ops = { +static const struct kernfs_ops rdtgroup_kf_single_ops = { .atomic_write_len = PAGE_SIZE, .write = rdtgroup_file_write, .seq_show = rdtgroup_seqfile_show, }; -static struct kernfs_ops kf_mondata_ops = { +static const struct kernfs_ops kf_mondata_ops = { .atomic_write_len = PAGE_SIZE, .seq_show = rdtgroup_mondata_show, }; @@ -294,7 +297,7 @@ static int rdtgroup_cpus_show(struct kernfs_open_file *of, /* * This is safe against resctrl_sched_in() called from __switch_to() * because __switch_to() is executed with interrupts disabled. A local call - * from update_closid_rmid() is proteced against __switch_to() because + * from update_closid_rmid() is protected against __switch_to() because * preemption is disabled. */ static void update_cpu_closid_rmid(void *info) @@ -338,14 +341,14 @@ static int cpus_mon_write(struct rdtgroup *rdtgrp, cpumask_var_t newmask, /* Check whether cpus belong to parent ctrl group */ cpumask_andnot(tmpmask, newmask, &prgrp->cpu_mask); - if (cpumask_weight(tmpmask)) { + if (!cpumask_empty(tmpmask)) { rdt_last_cmd_puts("Can only add CPUs to mongroup that belong to parent\n"); return -EINVAL; } /* Check whether cpus are dropped from this group */ cpumask_andnot(tmpmask, &rdtgrp->cpu_mask, newmask); - if (cpumask_weight(tmpmask)) { + if (!cpumask_empty(tmpmask)) { /* Give any dropped cpus to parent rdtgroup */ cpumask_or(&prgrp->cpu_mask, &prgrp->cpu_mask, tmpmask); update_closid_rmid(tmpmask, prgrp); @@ -356,7 +359,7 @@ static int cpus_mon_write(struct rdtgroup *rdtgrp, cpumask_var_t newmask, * and update per-cpu rmid */ cpumask_andnot(tmpmask, newmask, &rdtgrp->cpu_mask); - if (cpumask_weight(tmpmask)) { + if (!cpumask_empty(tmpmask)) { head = &prgrp->mon.crdtgrp_list; list_for_each_entry(crgrp, head, mon.crdtgrp_list) { if (crgrp == rdtgrp) @@ -391,7 +394,7 @@ static int cpus_ctrl_write(struct rdtgroup 
*rdtgrp, cpumask_var_t newmask, /* Check whether cpus are dropped from this group */ cpumask_andnot(tmpmask, &rdtgrp->cpu_mask, newmask); - if (cpumask_weight(tmpmask)) { + if (!cpumask_empty(tmpmask)) { /* Can't drop from default group */ if (rdtgrp == &rdtgroup_default) { rdt_last_cmd_puts("Can't drop CPUs from default group\n"); @@ -410,12 +413,12 @@ static int cpus_ctrl_write(struct rdtgroup *rdtgrp, cpumask_var_t newmask, * and update per-cpu closid/rmid. */ cpumask_andnot(tmpmask, newmask, &rdtgrp->cpu_mask); - if (cpumask_weight(tmpmask)) { + if (!cpumask_empty(tmpmask)) { list_for_each_entry(r, &rdt_all_groups, rdtgroup_list) { if (r == rdtgrp) continue; cpumask_and(tmpmask1, &r->cpu_mask, tmpmask); - if (cpumask_weight(tmpmask1)) + if (!cpumask_empty(tmpmask1)) cpumask_rdtgrp_clear(r, tmpmask1); } update_closid_rmid(tmpmask, rdtgrp); @@ -485,7 +488,7 @@ static ssize_t rdtgroup_cpus_write(struct kernfs_open_file *of, /* check that user didn't specify any offline cpus */ cpumask_andnot(tmpmask, newmask, cpu_online_mask); - if (cpumask_weight(tmpmask)) { + if (!cpumask_empty(tmpmask)) { ret = -EINVAL; rdt_last_cmd_puts("Can only assign online CPUs\n"); goto unlock; @@ -507,89 +510,100 @@ unlock: return ret ?: nbytes; } -struct task_move_callback { - struct callback_head work; - struct rdtgroup *rdtgrp; -}; - -static void move_myself(struct callback_head *head) +/** + * rdtgroup_remove - the helper to remove resource group safely + * @rdtgrp: resource group to remove + * + * On resource group creation via a mkdir, an extra kernfs_node reference is + * taken to ensure that the rdtgroup structure remains accessible for the + * rdtgroup_kn_unlock() calls where it is removed. + * + * Drop the extra reference here, then free the rdtgroup structure. 
+ * + * Return: void + */ +static void rdtgroup_remove(struct rdtgroup *rdtgrp) { - struct task_move_callback *callback; - struct rdtgroup *rdtgrp; - - callback = container_of(head, struct task_move_callback, work); - rdtgrp = callback->rdtgrp; + kernfs_put(rdtgrp->kn); + kfree(rdtgrp); +} +static void _update_task_closid_rmid(void *task) +{ /* - * If resource group was deleted before this task work callback - * was invoked, then assign the task to root group and free the - * resource group. + * If the task is still current on this CPU, update PQR_ASSOC MSR. + * Otherwise, the MSR is updated when the task is scheduled in. */ - if (atomic_dec_and_test(&rdtgrp->waitcount) && - (rdtgrp->flags & RDT_DELETED)) { - current->closid = 0; - current->rmid = 0; - kfree(rdtgrp); - } - - if (unlikely(current->flags & PF_EXITING)) - goto out; - - preempt_disable(); - /* update PQR_ASSOC MSR to make resource group go into effect */ - resctrl_sched_in(); - preempt_enable(); + if (task == current) + resctrl_sched_in(); +} -out: - kfree(callback); +static void update_task_closid_rmid(struct task_struct *t) +{ + if (IS_ENABLED(CONFIG_SMP) && task_curr(t)) + smp_call_function_single(task_cpu(t), _update_task_closid_rmid, t, 1); + else + _update_task_closid_rmid(t); } static int __rdtgroup_move_task(struct task_struct *tsk, struct rdtgroup *rdtgrp) { - struct task_move_callback *callback; - int ret; - - callback = kzalloc(sizeof(*callback), GFP_KERNEL); - if (!callback) - return -ENOMEM; - callback->work.func = move_myself; - callback->rdtgrp = rdtgrp; + /* If the task is already in rdtgrp, no need to move the task. */ + if ((rdtgrp->type == RDTCTRL_GROUP && tsk->closid == rdtgrp->closid && + tsk->rmid == rdtgrp->mon.rmid) || + (rdtgrp->type == RDTMON_GROUP && tsk->rmid == rdtgrp->mon.rmid && + tsk->closid == rdtgrp->mon.parent->closid)) + return 0; /* - * Take a refcount, so rdtgrp cannot be freed before the - * callback has been invoked. 
+ * Set the task's closid/rmid before the PQR_ASSOC MSR can be + * updated by them. + * + * For ctrl_mon groups, move both closid and rmid. + * For monitor groups, can move the tasks only from + * their parent CTRL group. */ - atomic_inc(&rdtgrp->waitcount); - ret = task_work_add(tsk, &callback->work, true); - if (ret) { - /* - * Task is exiting. Drop the refcount and free the callback. - * No need to check the refcount as the group cannot be - * deleted before the write function unlocks rdtgroup_mutex. - */ - atomic_dec(&rdtgrp->waitcount); - kfree(callback); - rdt_last_cmd_puts("Task exited\n"); - } else { - /* - * For ctrl_mon groups move both closid and rmid. - * For monitor groups, can move the tasks only from - * their parent CTRL group. - */ - if (rdtgrp->type == RDTCTRL_GROUP) { - tsk->closid = rdtgrp->closid; - tsk->rmid = rdtgrp->mon.rmid; - } else if (rdtgrp->type == RDTMON_GROUP) { - if (rdtgrp->mon.parent->closid == tsk->closid) { - tsk->rmid = rdtgrp->mon.rmid; - } else { - rdt_last_cmd_puts("Can't move task to different control group\n"); - ret = -EINVAL; - } + + if (rdtgrp->type == RDTCTRL_GROUP) { + WRITE_ONCE(tsk->closid, rdtgrp->closid); + WRITE_ONCE(tsk->rmid, rdtgrp->mon.rmid); + } else if (rdtgrp->type == RDTMON_GROUP) { + if (rdtgrp->mon.parent->closid == tsk->closid) { + WRITE_ONCE(tsk->rmid, rdtgrp->mon.rmid); + } else { + rdt_last_cmd_puts("Can't move task to different control group\n"); + return -EINVAL; } } - return ret; + + /* + * Ensure the task's closid and rmid are written before determining if + * the task is current that will decide if it will be interrupted. + */ + barrier(); + + /* + * By now, the task's closid and rmid are set. If the task is current + * on a CPU, the PQR_ASSOC MSR needs to be updated to make the resource + * group go into effect. If the task is not current, the MSR will be + * updated when the task is scheduled in. 
+ */ + update_task_closid_rmid(tsk); + + return 0; +} + +static bool is_closid_match(struct task_struct *t, struct rdtgroup *r) +{ + return (rdt_alloc_capable && + (r->type == RDTCTRL_GROUP) && (t->closid == r->closid)); +} + +static bool is_rmid_match(struct task_struct *t, struct rdtgroup *r) +{ + return (rdt_mon_capable && + (r->type == RDTMON_GROUP) && (t->rmid == r->mon.rmid)); } /** @@ -607,8 +621,7 @@ int rdtgroup_tasks_assigned(struct rdtgroup *r) rcu_read_lock(); for_each_process_thread(p, t) { - if ((r->type == RDTCTRL_GROUP && t->closid == r->closid) || - (r->type == RDTMON_GROUP && t->rmid == r->mon.rmid)) { + if (is_closid_match(t, r) || is_rmid_match(t, r)) { ret = 1; break; } @@ -706,8 +719,7 @@ static void show_rdt_tasks(struct rdtgroup *r, struct seq_file *s) rcu_read_lock(); for_each_process_thread(p, t) { - if ((r->type == RDTCTRL_GROUP && t->closid == r->closid) || - (r->type == RDTMON_GROUP && t->rmid == r->mon.rmid)) + if (is_closid_match(t, r) || is_rmid_match(t, r)) seq_printf(s, "%d\n", t->pid); } rcu_read_unlock(); @@ -833,16 +845,17 @@ static int rdt_last_cmd_status_show(struct kernfs_open_file *of, static int rdt_num_closids_show(struct kernfs_open_file *of, struct seq_file *seq, void *v) { - struct rdt_resource *r = of->kn->parent->priv; + struct resctrl_schema *s = of->kn->parent->priv; - seq_printf(seq, "%d\n", r->num_closid); + seq_printf(seq, "%u\n", s->num_closid); return 0; } static int rdt_default_ctrl_show(struct kernfs_open_file *of, struct seq_file *seq, void *v) { - struct rdt_resource *r = of->kn->parent->priv; + struct resctrl_schema *s = of->kn->parent->priv; + struct rdt_resource *r = s->res; seq_printf(seq, "%x\n", r->default_ctrl); return 0; @@ -851,7 +864,8 @@ static int rdt_default_ctrl_show(struct kernfs_open_file *of, static int rdt_min_cbm_bits_show(struct kernfs_open_file *of, struct seq_file *seq, void *v) { - struct rdt_resource *r = of->kn->parent->priv; + struct resctrl_schema *s = of->kn->parent->priv; + 
struct rdt_resource *r = s->res; seq_printf(seq, "%u\n", r->cache.min_cbm_bits); return 0; @@ -860,7 +874,8 @@ static int rdt_min_cbm_bits_show(struct kernfs_open_file *of, static int rdt_shareable_bits_show(struct kernfs_open_file *of, struct seq_file *seq, void *v) { - struct rdt_resource *r = of->kn->parent->priv; + struct resctrl_schema *s = of->kn->parent->priv; + struct rdt_resource *r = s->res; seq_printf(seq, "%x\n", r->cache.shareable_bits); return 0; @@ -883,38 +898,40 @@ static int rdt_shareable_bits_show(struct kernfs_open_file *of, static int rdt_bit_usage_show(struct kernfs_open_file *of, struct seq_file *seq, void *v) { - struct rdt_resource *r = of->kn->parent->priv; + struct resctrl_schema *s = of->kn->parent->priv; /* * Use unsigned long even though only 32 bits are used to ensure * test_bit() is used safely. */ unsigned long sw_shareable = 0, hw_shareable = 0; unsigned long exclusive = 0, pseudo_locked = 0; + struct rdt_resource *r = s->res; struct rdt_domain *dom; int i, hwb, swb, excl, psl; enum rdtgrp_mode mode; bool sep = false; - u32 *ctrl; + u32 ctrl_val; mutex_lock(&rdtgroup_mutex); hw_shareable = r->cache.shareable_bits; list_for_each_entry(dom, &r->domains, list) { if (sep) seq_putc(seq, ';'); - ctrl = dom->ctrl_val; sw_shareable = 0; exclusive = 0; seq_printf(seq, "%d=", dom->id); - for (i = 0; i < closids_supported(); i++, ctrl++) { + for (i = 0; i < closids_supported(); i++) { if (!closid_allocated(i)) continue; + ctrl_val = resctrl_arch_get_config(r, dom, i, + s->conf_type); mode = rdtgroup_mode_by_closid(i); switch (mode) { case RDT_MODE_SHAREABLE: - sw_shareable |= *ctrl; + sw_shareable |= ctrl_val; break; case RDT_MODE_EXCLUSIVE: - exclusive |= *ctrl; + exclusive |= ctrl_val; break; case RDT_MODE_PSEUDO_LOCKSETUP: /* @@ -961,7 +978,8 @@ static int rdt_bit_usage_show(struct kernfs_open_file *of, static int rdt_min_bw_show(struct kernfs_open_file *of, struct seq_file *seq, void *v) { - struct rdt_resource *r = of->kn->parent->priv; 
+ struct resctrl_schema *s = of->kn->parent->priv; + struct rdt_resource *r = s->res; seq_printf(seq, "%u\n", r->membw.min_bw); return 0; @@ -992,7 +1010,8 @@ static int rdt_mon_features_show(struct kernfs_open_file *of, static int rdt_bw_gran_show(struct kernfs_open_file *of, struct seq_file *seq, void *v) { - struct rdt_resource *r = of->kn->parent->priv; + struct resctrl_schema *s = of->kn->parent->priv; + struct rdt_resource *r = s->res; seq_printf(seq, "%u\n", r->membw.bw_gran); return 0; @@ -1001,7 +1020,8 @@ static int rdt_bw_gran_show(struct kernfs_open_file *of, static int rdt_delay_linear_show(struct kernfs_open_file *of, struct seq_file *seq, void *v) { - struct rdt_resource *r = of->kn->parent->priv; + struct resctrl_schema *s = of->kn->parent->priv; + struct rdt_resource *r = s->res; seq_printf(seq, "%u\n", r->membw.delay_linear); return 0; @@ -1010,9 +1030,21 @@ static int rdt_delay_linear_show(struct kernfs_open_file *of, static int max_threshold_occ_show(struct kernfs_open_file *of, struct seq_file *seq, void *v) { - struct rdt_resource *r = of->kn->parent->priv; + seq_printf(seq, "%u\n", resctrl_rmid_realloc_threshold); - seq_printf(seq, "%u\n", resctrl_cqm_threshold * r->mon_scale); + return 0; +} + +static int rdt_thread_throttle_mode_show(struct kernfs_open_file *of, + struct seq_file *seq, void *v) +{ + struct resctrl_schema *s = of->kn->parent->priv; + struct rdt_resource *r = s->res; + + if (r->membw.throttle_mode == THREAD_THROTTLE_PER_THREAD) + seq_puts(seq, "per-thread\n"); + else + seq_puts(seq, "max\n"); return 0; } @@ -1020,7 +1052,6 @@ static int max_threshold_occ_show(struct kernfs_open_file *of, static ssize_t max_threshold_occ_write(struct kernfs_open_file *of, char *buf, size_t nbytes, loff_t off) { - struct rdt_resource *r = of->kn->parent->priv; unsigned int bytes; int ret; @@ -1028,10 +1059,10 @@ static ssize_t max_threshold_occ_write(struct kernfs_open_file *of, if (ret) return ret; - if (bytes > (boot_cpu_data.x86_cache_size * 
1024)) + if (bytes > resctrl_rmid_realloc_limit) return -EINVAL; - resctrl_cqm_threshold = bytes / r->mon_scale; + resctrl_rmid_realloc_threshold = resctrl_arch_round_mon_val(bytes); return nbytes; } @@ -1056,75 +1087,17 @@ static int rdtgroup_mode_show(struct kernfs_open_file *of, return 0; } -/** - * rdt_cdp_peer_get - Retrieve CDP peer if it exists - * @r: RDT resource to which RDT domain @d belongs - * @d: Cache instance for which a CDP peer is requested - * @r_cdp: RDT resource that shares hardware with @r (RDT resource peer) - * Used to return the result. - * @d_cdp: RDT domain that shares hardware with @d (RDT domain peer) - * Used to return the result. - * - * RDT resources are managed independently and by extension the RDT domains - * (RDT resource instances) are managed independently also. The Code and - * Data Prioritization (CDP) RDT resources, while managed independently, - * could refer to the same underlying hardware. For example, - * RDT_RESOURCE_L2CODE and RDT_RESOURCE_L2DATA both refer to the L2 cache. - * - * When provided with an RDT resource @r and an instance of that RDT - * resource @d rdt_cdp_peer_get() will return if there is a peer RDT - * resource and the exact instance that shares the same hardware. - * - * Return: 0 if a CDP peer was found, <0 on error or if no CDP peer exists. - * If a CDP peer was found, @r_cdp will point to the peer RDT resource - * and @d_cdp will point to the peer RDT domain. 
- */ -static int rdt_cdp_peer_get(struct rdt_resource *r, struct rdt_domain *d, - struct rdt_resource **r_cdp, - struct rdt_domain **d_cdp) +static enum resctrl_conf_type resctrl_peer_type(enum resctrl_conf_type my_type) { - struct rdt_resource *_r_cdp = NULL; - struct rdt_domain *_d_cdp = NULL; - int ret = 0; - - switch (r->rid) { - case RDT_RESOURCE_L3DATA: - _r_cdp = &rdt_resources_all[RDT_RESOURCE_L3CODE]; - break; - case RDT_RESOURCE_L3CODE: - _r_cdp = &rdt_resources_all[RDT_RESOURCE_L3DATA]; - break; - case RDT_RESOURCE_L2DATA: - _r_cdp = &rdt_resources_all[RDT_RESOURCE_L2CODE]; - break; - case RDT_RESOURCE_L2CODE: - _r_cdp = &rdt_resources_all[RDT_RESOURCE_L2DATA]; - break; + switch (my_type) { + case CDP_CODE: + return CDP_DATA; + case CDP_DATA: + return CDP_CODE; default: - ret = -ENOENT; - goto out; - } - - /* - * When a new CPU comes online and CDP is enabled then the new - * RDT domains (if any) associated with both CDP RDT resources - * are added in the same CPU online routine while the - * rdtgroup_mutex is held. It should thus not happen for one - * RDT domain to exist and be associated with its RDT CDP - * resource but there is no RDT domain associated with the - * peer RDT CDP resource. Hence the WARN. - */ - _d_cdp = rdt_find_domain(_r_cdp, d->id, NULL); - if (WARN_ON(IS_ERR_OR_NULL(_d_cdp))) { - _r_cdp = NULL; - ret = -EINVAL; + case CDP_NONE: + return CDP_NONE; } - -out: - *r_cdp = _r_cdp; - *d_cdp = _d_cdp; - - return ret; } /** @@ -1148,11 +1121,11 @@ out: * Return: false if CBM does not overlap, true if it does. 
*/ static bool __rdtgroup_cbm_overlaps(struct rdt_resource *r, struct rdt_domain *d, - unsigned long cbm, int closid, bool exclusive) + unsigned long cbm, int closid, + enum resctrl_conf_type type, bool exclusive) { enum rdtgrp_mode mode; unsigned long ctrl_b; - u32 *ctrl; int i; /* Check for any overlap with regions used by hardware directly */ @@ -1163,9 +1136,8 @@ static bool __rdtgroup_cbm_overlaps(struct rdt_resource *r, struct rdt_domain *d } /* Check for overlap with other resource groups */ - ctrl = d->ctrl_val; - for (i = 0; i < closids_supported(); i++, ctrl++) { - ctrl_b = *ctrl; + for (i = 0; i < closids_supported(); i++) { + ctrl_b = resctrl_arch_get_config(r, d, i, type); mode = rdtgroup_mode_by_closid(i); if (closid_allocated(i) && i != closid && mode != RDT_MODE_PSEUDO_LOCKSETUP) { @@ -1185,7 +1157,7 @@ static bool __rdtgroup_cbm_overlaps(struct rdt_resource *r, struct rdt_domain *d /** * rdtgroup_cbm_overlaps - Does CBM overlap with other use of hardware - * @r: Resource to which domain instance @d belongs. + * @s: Schema for the resource to which domain instance @d belongs. * @d: The domain instance for which @closid is being tested. * @cbm: Capacity bitmask being tested. * @closid: Intended closid for @cbm. 
@@ -1203,19 +1175,19 @@ static bool __rdtgroup_cbm_overlaps(struct rdt_resource *r, struct rdt_domain *d * * Return: true if CBM overlap detected, false if there is no overlap */ -bool rdtgroup_cbm_overlaps(struct rdt_resource *r, struct rdt_domain *d, +bool rdtgroup_cbm_overlaps(struct resctrl_schema *s, struct rdt_domain *d, unsigned long cbm, int closid, bool exclusive) { - struct rdt_resource *r_cdp; - struct rdt_domain *d_cdp; + enum resctrl_conf_type peer_type = resctrl_peer_type(s->conf_type); + struct rdt_resource *r = s->res; - if (__rdtgroup_cbm_overlaps(r, d, cbm, closid, exclusive)) + if (__rdtgroup_cbm_overlaps(r, d, cbm, closid, s->conf_type, + exclusive)) return true; - if (rdt_cdp_peer_get(r, d, &r_cdp, &d_cdp) < 0) + if (!resctrl_arch_get_cdp_enabled(r->rid)) return false; - - return __rdtgroup_cbm_overlaps(r_cdp, d_cdp, cbm, closid, exclusive); + return __rdtgroup_cbm_overlaps(r, d, cbm, closid, peer_type, exclusive); } /** @@ -1233,17 +1205,21 @@ bool rdtgroup_cbm_overlaps(struct rdt_resource *r, struct rdt_domain *d, static bool rdtgroup_mode_test_exclusive(struct rdtgroup *rdtgrp) { int closid = rdtgrp->closid; + struct resctrl_schema *s; struct rdt_resource *r; bool has_cache = false; struct rdt_domain *d; + u32 ctrl; - for_each_alloc_enabled_rdt_resource(r) { + list_for_each_entry(s, &resctrl_schema_all, list) { + r = s->res; if (r->rid == RDT_RESOURCE_MBA) continue; has_cache = true; list_for_each_entry(d, &r->domains, list) { - if (rdtgroup_cbm_overlaps(r, d, d->ctrl_val[closid], - rdtgrp->closid, false)) { + ctrl = resctrl_arch_get_config(r, d, closid, + s->conf_type); + if (rdtgroup_cbm_overlaps(s, d, ctrl, closid, false)) { rdt_last_cmd_puts("Schemata overlaps\n"); return false; } @@ -1374,11 +1350,14 @@ unsigned int rdtgroup_cbm_to_size(struct rdt_resource *r, static int rdtgroup_size_show(struct kernfs_open_file *of, struct seq_file *s, void *v) { + struct resctrl_schema *schema; + enum resctrl_conf_type type; struct rdtgroup *rdtgrp; 
struct rdt_resource *r; struct rdt_domain *d; unsigned int size; int ret = 0; + u32 closid; bool sep; u32 ctrl; @@ -1395,8 +1374,8 @@ static int rdtgroup_size_show(struct kernfs_open_file *of, ret = -ENODEV; } else { seq_printf(s, "%*s:", max_name_width, - rdtgrp->plr->r->name); - size = rdtgroup_cbm_to_size(rdtgrp->plr->r, + rdtgrp->plr->s->name); + size = rdtgroup_cbm_to_size(rdtgrp->plr->s->res, rdtgrp->plr->d, rdtgrp->plr->cbm); seq_printf(s, "%d=%u\n", rdtgrp->plr->d->id, size); @@ -1404,18 +1383,25 @@ static int rdtgroup_size_show(struct kernfs_open_file *of, goto out; } - for_each_alloc_enabled_rdt_resource(r) { + closid = rdtgrp->closid; + + list_for_each_entry(schema, &resctrl_schema_all, list) { + r = schema->res; + type = schema->conf_type; sep = false; - seq_printf(s, "%*s:", max_name_width, r->name); + seq_printf(s, "%*s:", max_name_width, schema->name); list_for_each_entry(d, &r->domains, list) { if (sep) seq_putc(s, ';'); if (rdtgrp->mode == RDT_MODE_PSEUDO_LOCKSETUP) { size = 0; } else { - ctrl = (!is_mba_sc(r) ? - d->ctrl_val[rdtgrp->closid] : - d->mbps_val[rdtgrp->closid]); + if (is_mba_sc(r)) + ctrl = d->mbps_val[closid]; + else + ctrl = resctrl_arch_get_config(r, d, + closid, + type); if (r->rid == RDT_RESOURCE_MBA) size = ctrl; else @@ -1512,6 +1498,17 @@ static struct rftype res_common_files[] = { .seq_show = rdt_delay_linear_show, .fflags = RF_CTRL_INFO | RFTYPE_RES_MB, }, + /* + * Platform specific which (if any) capabilities are provided by + * thread_throttle_mode. Defer "fflags" initialization to platform + * discovery. 
+ */ + { + .name = "thread_throttle_mode", + .mode = 0444, + .kf_ops = &rdtgroup_kf_single_ops, + .seq_show = rdt_thread_throttle_mode_show, + }, { .name = "max_threshold_occupancy", .mode = 0644, @@ -1582,7 +1579,7 @@ static int rdtgroup_add_files(struct kernfs_node *kn, unsigned long fflags) lockdep_assert_held(&rdtgroup_mutex); for (rft = rfts; rft < rfts + len; rft++) { - if ((fflags & rft->fflags) == rft->fflags) { + if (rft->fflags && ((fflags & rft->fflags) == rft->fflags)) { ret = rdtgroup_add_file(kn, rft); if (ret) goto error; @@ -1599,6 +1596,33 @@ error: return ret; } +static struct rftype *rdtgroup_get_rftype_by_name(const char *name) +{ + struct rftype *rfts, *rft; + int len; + + rfts = res_common_files; + len = ARRAY_SIZE(res_common_files); + + for (rft = rfts; rft < rfts + len; rft++) { + if (!strcmp(rft->name, name)) + return rft; + } + + return NULL; +} + +void __init thread_throttle_mode_init(void) +{ + struct rftype *rft; + + rft = rdtgroup_get_rftype_by_name("thread_throttle_mode"); + if (!rft) + return; + + rft->fflags = RF_CTRL_INFO | RFTYPE_RES_MB; +} + /** * rdtgroup_kn_mode_restrict - Restrict user access to named resctrl file * @r: The resource group with which the file is associated. 
@@ -1696,18 +1720,17 @@ int rdtgroup_kn_mode_restore(struct rdtgroup *r, const char *name, return ret; } -static int rdtgroup_mkdir_info_resdir(struct rdt_resource *r, char *name, +static int rdtgroup_mkdir_info_resdir(void *priv, char *name, unsigned long fflags) { struct kernfs_node *kn_subdir; int ret; kn_subdir = kernfs_create_dir(kn_info, name, - kn_info->mode, r); + kn_info->mode, priv); if (IS_ERR(kn_subdir)) return PTR_ERR(kn_subdir); - kernfs_get(kn_subdir); ret = rdtgroup_kn_set_ugid(kn_subdir); if (ret) return ret; @@ -1721,6 +1744,7 @@ static int rdtgroup_mkdir_info_resdir(struct rdt_resource *r, char *name, static int rdtgroup_create_info_dir(struct kernfs_node *parent_kn) { + struct resctrl_schema *s; struct rdt_resource *r; unsigned long fflags; char name[32]; @@ -1730,20 +1754,21 @@ static int rdtgroup_create_info_dir(struct kernfs_node *parent_kn) kn_info = kernfs_create_dir(parent_kn, "info", parent_kn->mode, NULL); if (IS_ERR(kn_info)) return PTR_ERR(kn_info); - kernfs_get(kn_info); ret = rdtgroup_add_files(kn_info, RF_TOP_INFO); if (ret) goto out_destroy; - for_each_alloc_enabled_rdt_resource(r) { + /* loop over enabled controls, these are all alloc_capable */ + list_for_each_entry(s, &resctrl_schema_all, list) { + r = s->res; fflags = r->fflags | RF_CTRL_INFO; - ret = rdtgroup_mkdir_info_resdir(r, r->name, fflags); + ret = rdtgroup_mkdir_info_resdir(s, s->name, fflags); if (ret) goto out_destroy; } - for_each_mon_enabled_rdt_resource(r) { + for_each_mon_capable_rdt_resource(r) { fflags = r->fflags | RF_MON_INFO; sprintf(name, "%s_MON", r->name); ret = rdtgroup_mkdir_info_resdir(r, name, fflags); @@ -1751,12 +1776,6 @@ static int rdtgroup_create_info_dir(struct kernfs_node *parent_kn) goto out_destroy; } - /* - * This extra ref will be put in kernfs_remove() and guarantees - * that @rdtgrp->kn is always accessible. 
- */ - kernfs_get(kn_info); - ret = rdtgroup_kn_set_ugid(kn_info); if (ret) goto out_destroy; @@ -1785,12 +1804,6 @@ mongroup_create_dir(struct kernfs_node *parent_kn, struct rdtgroup *prgrp, if (dest_kn) *dest_kn = kn; - /* - * This extra ref will be put in kernfs_remove() and guarantees - * that @rdtgrp->kn is always accessible. - */ - kernfs_get(kn); - ret = rdtgroup_kn_set_ugid(kn); if (ret) goto out_destroy; @@ -1820,7 +1833,7 @@ static void l2_qos_cfg_update(void *arg) static inline bool is_mba_linear(void) { - return rdt_resources_all[RDT_RESOURCE_MBA].membw.delay_linear; + return rdt_resources_all[RDT_RESOURCE_MBA].r_resctrl.membw.delay_linear; } static int set_cache_qos_cfg(int level, bool enable) @@ -1841,10 +1854,15 @@ static int set_cache_qos_cfg(int level, bool enable) if (!zalloc_cpumask_var(&cpu_mask, GFP_KERNEL)) return -ENOMEM; - r_l = &rdt_resources_all[level]; + r_l = &rdt_resources_all[level].r_resctrl; list_for_each_entry(d, &r_l->domains, list) { - /* Pick one CPU from each domain instance to update MSR */ - cpumask_set_cpu(cpumask_any(&d->cpu_mask), cpu_mask); + if (r_l->cache.arch_has_per_cpu_cfg) + /* Pick all the CPUs in the domain instance */ + for_each_cpu(cpu, &d->cpu_mask) + cpumask_set_cpu(cpu, cpu_mask); + else + /* Pick one CPU from each domain instance to update MSR */ + cpumask_set_cpu(cpumask_any(&d->cpu_mask), cpu_mask); } cpu = get_cpu(); /* Update QOS_CFG MSR on this cpu if it's in cpu_mask. 
*/ @@ -1859,89 +1877,127 @@ static int set_cache_qos_cfg(int level, bool enable) return 0; } +/* Restore the qos cfg state when a domain comes online */ +void rdt_domain_reconfigure_cdp(struct rdt_resource *r) +{ + struct rdt_hw_resource *hw_res = resctrl_to_arch_res(r); + + if (!r->cdp_capable) + return; + + if (r->rid == RDT_RESOURCE_L2) + l2_qos_cfg_update(&hw_res->cdp_enabled); + + if (r->rid == RDT_RESOURCE_L3) + l3_qos_cfg_update(&hw_res->cdp_enabled); +} + +static int mba_sc_domain_allocate(struct rdt_resource *r, struct rdt_domain *d) +{ + u32 num_closid = resctrl_arch_get_num_closid(r); + int cpu = cpumask_any(&d->cpu_mask); + int i; + + d->mbps_val = kcalloc_node(num_closid, sizeof(*d->mbps_val), + GFP_KERNEL, cpu_to_node(cpu)); + if (!d->mbps_val) + return -ENOMEM; + + for (i = 0; i < num_closid; i++) + d->mbps_val[i] = MBA_MAX_MBPS; + + return 0; +} + +static void mba_sc_domain_destroy(struct rdt_resource *r, + struct rdt_domain *d) +{ + kfree(d->mbps_val); + d->mbps_val = NULL; +} + /* - * Enable or disable the MBA software controller - * which helps user specify bandwidth in MBps. * MBA software controller is supported only if * MBM is supported and MBA is in linear scale. */ +static bool supports_mba_mbps(void) +{ + struct rdt_resource *r = &rdt_resources_all[RDT_RESOURCE_MBA].r_resctrl; + + return (is_mbm_local_enabled() && + r->alloc_capable && is_mba_linear()); +} + +/* + * Enable or disable the MBA software controller + * which helps user specify bandwidth in MBps. 
+ */ static int set_mba_sc(bool mba_sc) { - struct rdt_resource *r = &rdt_resources_all[RDT_RESOURCE_MBA]; + struct rdt_resource *r = &rdt_resources_all[RDT_RESOURCE_MBA].r_resctrl; + u32 num_closid = resctrl_arch_get_num_closid(r); struct rdt_domain *d; + int i; - if (!is_mbm_enabled() || !is_mba_linear() || - mba_sc == is_mba_sc(r)) + if (!supports_mba_mbps() || mba_sc == is_mba_sc(r)) return -EINVAL; r->membw.mba_sc = mba_sc; - list_for_each_entry(d, &r->domains, list) - setup_default_ctrlval(r, d->ctrl_val, d->mbps_val); + + list_for_each_entry(d, &r->domains, list) { + for (i = 0; i < num_closid; i++) + d->mbps_val[i] = MBA_MAX_MBPS; + } return 0; } -static int cdp_enable(int level, int data_type, int code_type) +static int cdp_enable(int level) { - struct rdt_resource *r_ldata = &rdt_resources_all[data_type]; - struct rdt_resource *r_lcode = &rdt_resources_all[code_type]; - struct rdt_resource *r_l = &rdt_resources_all[level]; + struct rdt_resource *r_l = &rdt_resources_all[level].r_resctrl; int ret; - if (!r_l->alloc_capable || !r_ldata->alloc_capable || - !r_lcode->alloc_capable) + if (!r_l->alloc_capable) return -EINVAL; ret = set_cache_qos_cfg(level, true); - if (!ret) { - r_l->alloc_enabled = false; - r_ldata->alloc_enabled = true; - r_lcode->alloc_enabled = true; - } + if (!ret) + rdt_resources_all[level].cdp_enabled = true; + return ret; } -static int cdpl3_enable(void) +static void cdp_disable(int level) { - return cdp_enable(RDT_RESOURCE_L3, RDT_RESOURCE_L3DATA, - RDT_RESOURCE_L3CODE); -} + struct rdt_hw_resource *r_hw = &rdt_resources_all[level]; -static int cdpl2_enable(void) -{ - return cdp_enable(RDT_RESOURCE_L2, RDT_RESOURCE_L2DATA, - RDT_RESOURCE_L2CODE); + if (r_hw->cdp_enabled) { + set_cache_qos_cfg(level, false); + r_hw->cdp_enabled = false; + } } -static void cdp_disable(int level, int data_type, int code_type) +int resctrl_arch_set_cdp_enabled(enum resctrl_res_level l, bool enable) { - struct rdt_resource *r = &rdt_resources_all[level]; + 
struct rdt_hw_resource *hw_res = &rdt_resources_all[l]; - r->alloc_enabled = r->alloc_capable; + if (!hw_res->r_resctrl.cdp_capable) + return -EINVAL; - if (rdt_resources_all[data_type].alloc_enabled) { - rdt_resources_all[data_type].alloc_enabled = false; - rdt_resources_all[code_type].alloc_enabled = false; - set_cache_qos_cfg(level, false); - } -} + if (enable) + return cdp_enable(l); -static void cdpl3_disable(void) -{ - cdp_disable(RDT_RESOURCE_L3, RDT_RESOURCE_L3DATA, RDT_RESOURCE_L3CODE); -} + cdp_disable(l); -static void cdpl2_disable(void) -{ - cdp_disable(RDT_RESOURCE_L2, RDT_RESOURCE_L2DATA, RDT_RESOURCE_L2CODE); + return 0; } static void cdp_disable_all(void) { - if (rdt_resources_all[RDT_RESOURCE_L3DATA].alloc_enabled) - cdpl3_disable(); - if (rdt_resources_all[RDT_RESOURCE_L2DATA].alloc_enabled) - cdpl2_disable(); + if (resctrl_arch_get_cdp_enabled(RDT_RESOURCE_L3)) + resctrl_arch_set_cdp_enabled(RDT_RESOURCE_L3, false); + if (resctrl_arch_get_cdp_enabled(RDT_RESOURCE_L2)) + resctrl_arch_set_cdp_enabled(RDT_RESOURCE_L2, false); } /* @@ -2004,8 +2060,7 @@ void rdtgroup_kn_unlock(struct kernfs_node *kn) rdtgrp->mode == RDT_MODE_PSEUDO_LOCKED) rdtgroup_pseudo_lock_remove(rdtgrp); kernfs_unbreak_active_protection(kn); - kernfs_put(rdtgrp->kn); - kfree(rdtgrp); + rdtgroup_remove(rdtgrp); } else { kernfs_unbreak_active_protection(kn); } @@ -2020,10 +2075,10 @@ static int rdt_enable_ctx(struct rdt_fs_context *ctx) int ret = 0; if (ctx->enable_cdpl2) - ret = cdpl2_enable(); + ret = resctrl_arch_set_cdp_enabled(RDT_RESOURCE_L2, true); if (!ret && ctx->enable_cdpl3) - ret = cdpl3_enable(); + ret = resctrl_arch_set_cdp_enabled(RDT_RESOURCE_L3, true); if (!ret && ctx->enable_mba_mbps) ret = set_mba_sc(true); @@ -2031,6 +2086,92 @@ static int rdt_enable_ctx(struct rdt_fs_context *ctx) return ret; } +static int schemata_list_add(struct rdt_resource *r, enum resctrl_conf_type type) +{ + struct resctrl_schema *s; + const char *suffix = ""; + int ret, cl; + + s = 
kzalloc(sizeof(*s), GFP_KERNEL); + if (!s) + return -ENOMEM; + + s->res = r; + s->num_closid = resctrl_arch_get_num_closid(r); + if (resctrl_arch_get_cdp_enabled(r->rid)) + s->num_closid /= 2; + + s->conf_type = type; + switch (type) { + case CDP_CODE: + suffix = "CODE"; + break; + case CDP_DATA: + suffix = "DATA"; + break; + case CDP_NONE: + suffix = ""; + break; + } + + ret = snprintf(s->name, sizeof(s->name), "%s%s", r->name, suffix); + if (ret >= sizeof(s->name)) { + kfree(s); + return -EINVAL; + } + + cl = strlen(s->name); + + /* + * If CDP is supported by this resource, but not enabled, + * include the suffix. This ensures the tabular format of the + * schemata file does not change between mounts of the filesystem. + */ + if (r->cdp_capable && !resctrl_arch_get_cdp_enabled(r->rid)) + cl += 4; + + if (cl > max_name_width) + max_name_width = cl; + + INIT_LIST_HEAD(&s->list); + list_add(&s->list, &resctrl_schema_all); + + return 0; +} + +static int schemata_list_create(void) +{ + struct rdt_resource *r; + int ret = 0; + + for_each_alloc_capable_rdt_resource(r) { + if (resctrl_arch_get_cdp_enabled(r->rid)) { + ret = schemata_list_add(r, CDP_CODE); + if (ret) + break; + + ret = schemata_list_add(r, CDP_DATA); + } else { + ret = schemata_list_add(r, CDP_NONE); + } + + if (ret) + break; + } + + return ret; +} + +static void schemata_list_destroy(void) +{ + struct resctrl_schema *s, *tmp; + + list_for_each_entry_safe(s, tmp, &resctrl_schema_all, list) { + list_del(&s->list); + kfree(s); + } +} + static int rdt_get_tree(struct fs_context *fc) { struct rdt_fs_context *ctx = rdt_fc2context(fc); @@ -2052,11 +2193,17 @@ static int rdt_get_tree(struct fs_context *fc) if (ret < 0) goto out_cdp; + ret = schemata_list_create(); + if (ret) { + schemata_list_destroy(); + goto out_mba; + } + closid_init(); ret = rdtgroup_create_info_dir(rdtgroup_default.kn); if (ret < 0) - goto out_mba; + goto out_schemata_free; if (rdt_mon_capable) { ret = 
mongroup_create_dir(rdtgroup_default.kn, @@ -2064,13 +2211,11 @@ static int rdt_get_tree(struct fs_context *fc) &kn_mongrp); if (ret < 0) goto out_info; - kernfs_get(kn_mongrp); ret = mkdir_mondata_all(rdtgroup_default.kn, &rdtgroup_default, &kn_mondata); if (ret < 0) goto out_mongrp; - kernfs_get(kn_mondata); rdtgroup_default.mon.mon_data_kn = kn_mondata; } @@ -2091,7 +2236,7 @@ static int rdt_get_tree(struct fs_context *fc) static_branch_enable_cpuslocked(&rdt_enable_key); if (is_mbm_enabled()) { - r = &rdt_resources_all[RDT_RESOURCE_L3]; + r = &rdt_resources_all[RDT_RESOURCE_L3].r_resctrl; list_for_each_entry(dom, &r->domains, list) mbm_setup_overflow_handler(dom, MBM_OVERFLOW_INTERVAL); } @@ -2108,6 +2253,8 @@ out_mongrp: kernfs_remove(kn_mongrp); out_info: kernfs_remove(kn_info); +out_schemata_free: + schemata_list_destroy(); out_mba: if (ctx->enable_mba_mbps) set_mba_sc(false); @@ -2152,7 +2299,7 @@ static int rdt_parse_param(struct fs_context *fc, struct fs_parameter *param) ctx->enable_cdpl2 = true; return 0; case Opt_mba_mbps: - if (boot_cpu_data.x86_vendor != X86_VENDOR_INTEL) + if (!supports_mba_mbps()) return -EINVAL; ctx->enable_mba_mbps = true; return 0; @@ -2195,6 +2342,8 @@ static int rdt_init_fs_context(struct fs_context *fc) static int reset_all_ctrls(struct rdt_resource *r) { + struct rdt_hw_resource *hw_res = resctrl_to_arch_res(r); + struct rdt_hw_domain *hw_dom; struct msr_param msr_param; cpumask_var_t cpu_mask; struct rdt_domain *d; @@ -2205,7 +2354,7 @@ static int reset_all_ctrls(struct rdt_resource *r) msr_param.res = r; msr_param.low = 0; - msr_param.high = r->num_closid; + msr_param.high = hw_res->num_closid; /* * Disable resource control for this resource by setting all @@ -2213,10 +2362,11 @@ static int reset_all_ctrls(struct rdt_resource *r) * from each domain to update the MSRs below. 
*/ list_for_each_entry(d, &r->domains, list) { + hw_dom = resctrl_to_arch_dom(d); cpumask_set_cpu(cpumask_any(&d->cpu_mask), cpu_mask); - for (i = 0; i < r->num_closid; i++) - d->ctrl_val[i] = r->default_ctrl; + for (i = 0; i < hw_res->num_closid; i++) + hw_dom->ctrl_val[i] = r->default_ctrl; } cpu = get_cpu(); /* Update CBM on this cpu if it's in cpu_mask. */ @@ -2231,18 +2381,6 @@ static int reset_all_ctrls(struct rdt_resource *r) return 0; } -static bool is_closid_match(struct task_struct *t, struct rdtgroup *r) -{ - return (rdt_alloc_capable && - (r->type == RDTCTRL_GROUP) && (t->closid == r->closid)); -} - -static bool is_rmid_match(struct task_struct *t, struct rdtgroup *r) -{ - return (rdt_mon_capable && - (r->type == RDTMON_GROUP) && (t->rmid == r->mon.rmid)); -} - /* * Move tasks from one to the other group. If @from is NULL, then all tasks * in the systems are moved unconditionally (used for teardown). @@ -2260,22 +2398,18 @@ static void rdt_move_group_tasks(struct rdtgroup *from, struct rdtgroup *to, for_each_process_thread(p, t) { if (!from || is_closid_match(t, from) || is_rmid_match(t, from)) { - t->closid = to->closid; - t->rmid = to->mon.rmid; + WRITE_ONCE(t->closid, to->closid); + WRITE_ONCE(t->rmid, to->mon.rmid); -#ifdef CONFIG_SMP /* - * This is safe on x86 w/o barriers as the ordering - * of writing to task_cpu() and t->on_cpu is - * reverse to the reading here. The detection is - * inaccurate as tasks might move or schedule - * before the smp function call takes place. In - * such a case the function call is pointless, but + * If the task is on a CPU, set the CPU in the mask. + * The detection is inaccurate as tasks might move or + * schedule before the smp function call takes place. + * In such a case the function call is pointless, but * there is no other side effect. 
*/ - if (mask && t->on_cpu) + if (IS_ENABLED(CONFIG_SMP) && mask && task_curr(t)) cpumask_set_cpu(task_cpu(t), mask); -#endif } } read_unlock(&tasklist_lock); @@ -2294,7 +2428,7 @@ static void free_all_child_rdtgrp(struct rdtgroup *rdtgrp) if (atomic_read(&sentry->waitcount) != 0) sentry->flags = RDT_DELETED; else - kfree(sentry); + rdtgroup_remove(sentry); } } @@ -2336,7 +2470,7 @@ static void rmdir_all_sub(void) if (atomic_read(&rdtgrp->waitcount) != 0) rdtgrp->flags = RDT_DELETED; else - kfree(rdtgrp); + rdtgroup_remove(rdtgrp); } /* Notify online CPUs to update per cpu storage and PQR_ASSOC MSR */ update_closid_rmid(cpu_online_mask, &rdtgroup_default); @@ -2356,12 +2490,13 @@ static void rdt_kill_sb(struct super_block *sb) set_mba_sc(false); /*Put everything back to default values. */ - for_each_alloc_enabled_rdt_resource(r) + for_each_alloc_capable_rdt_resource(r) reset_all_ctrls(r); cdp_disable_all(); rmdir_all_sub(); rdt_pseudo_lock_release(); rdtgroup_default.mode = RDT_MODE_SHAREABLE; + schemata_list_destroy(); static_branch_disable_cpuslocked(&rdt_alloc_enable_key); static_branch_disable_cpuslocked(&rdt_mon_enable_key); static_branch_disable_cpuslocked(&rdt_enable_key); @@ -2402,14 +2537,12 @@ static int mon_addfile(struct kernfs_node *parent_kn, const char *name, * Remove all subdirectories of mon_data of ctrl_mon groups * and monitor groups with given domain id. 
*/ -void rmdir_mondata_subdir_allrdtgrp(struct rdt_resource *r, unsigned int dom_id) +static void rmdir_mondata_subdir_allrdtgrp(struct rdt_resource *r, + unsigned int dom_id) { struct rdtgroup *prgrp, *crgrp; char name[32]; - if (!r->mon_enabled) - return; - list_for_each_entry(prgrp, &rdt_all_groups, rdtgroup_list) { sprintf(name, "mon_%s_%02d", r->name, dom_id); kernfs_remove_by_name(prgrp->mon.mon_data_kn, name); @@ -2436,11 +2569,6 @@ static int mkdir_mondata_subdir(struct kernfs_node *parent_kn, if (IS_ERR(kn)) return PTR_ERR(kn); - /* - * This extra ref will be put in kernfs_remove() and guarantees - * that kn is always accessible. - */ - kernfs_get(kn); ret = rdtgroup_kn_set_ugid(kn); if (ret) goto out_destroy; @@ -2459,7 +2587,7 @@ static int mkdir_mondata_subdir(struct kernfs_node *parent_kn, goto out_destroy; if (is_mbm_event(mevt->evtid)) - mon_event_read(&rr, d, prgrp, mevt->evtid, true); + mon_event_read(&rr, r, d, prgrp, mevt->evtid, true); } kernfs_activate(kn); return 0; @@ -2473,16 +2601,13 @@ out_destroy: * Add all subdirectories of mon_data for "ctrl_mon" groups * and "monitor" groups with given domain id. */ -void mkdir_mondata_subdir_allrdtgrp(struct rdt_resource *r, - struct rdt_domain *d) +static void mkdir_mondata_subdir_allrdtgrp(struct rdt_resource *r, + struct rdt_domain *d) { struct kernfs_node *parent_kn; struct rdtgroup *prgrp, *crgrp; struct list_head *head; - if (!r->mon_enabled) - return; - list_for_each_entry(prgrp, &rdt_all_groups, rdtgroup_list) { parent_kn = prgrp->mon.mon_data_kn; mkdir_mondata_subdir(parent_kn, d, r, prgrp); @@ -2514,7 +2639,7 @@ static int mkdir_mondata_subdir_alldom(struct kernfs_node *parent_kn, /* * This creates a directory mon_data which contains the monitored data. * - * mon_data has one directory for each domain whic are named + * mon_data has one directory for each domain which are named * in the format mon_<domain_name>_<domain_id>. 
For ex: A mon_data * with L3 domain looks as below: * ./mon_data: @@ -2550,7 +2675,7 @@ static int mkdir_mondata_all(struct kernfs_node *parent_kn, * Create the subdirectories for each domain. Note that all events * in a domain like L3 are grouped into a resource whose domain is L3 */ - for_each_mon_enabled_rdt_resource(r) { + for_each_mon_capable_rdt_resource(r) { ret = mkdir_mondata_subdir_alldom(kn, r, prgrp); if (ret) goto out_destroy; @@ -2601,23 +2726,24 @@ static u32 cbm_ensure_valid(u32 _val, struct rdt_resource *r) * Set the RDT domain up to start off with all usable allocations. That is, * all shareable and unused bits. All-zero CBM is invalid. */ -static int __init_one_rdt_domain(struct rdt_domain *d, struct rdt_resource *r, +static int __init_one_rdt_domain(struct rdt_domain *d, struct resctrl_schema *s, u32 closid) { - struct rdt_resource *r_cdp = NULL; - struct rdt_domain *d_cdp = NULL; + enum resctrl_conf_type peer_type = resctrl_peer_type(s->conf_type); + enum resctrl_conf_type t = s->conf_type; + struct resctrl_staged_config *cfg; + struct rdt_resource *r = s->res; u32 used_b = 0, unused_b = 0; unsigned long tmp_cbm; enum rdtgrp_mode mode; - u32 peer_ctl, *ctrl; + u32 peer_ctl, ctrl_val; int i; - rdt_cdp_peer_get(r, d, &r_cdp, &d_cdp); - d->have_new_ctrl = false; - d->new_ctrl = r->cache.shareable_bits; + cfg = &d->staged_config[t]; + cfg->have_new_ctrl = false; + cfg->new_ctrl = r->cache.shareable_bits; used_b = r->cache.shareable_bits; - ctrl = d->ctrl_val; - for (i = 0; i < closids_supported(); i++, ctrl++) { + for (i = 0; i < closids_supported(); i++) { if (closid_allocated(i) && i != closid) { mode = rdtgroup_mode_by_closid(i); if (mode == RDT_MODE_PSEUDO_LOCKSETUP) @@ -2632,35 +2758,38 @@ static int __init_one_rdt_domain(struct rdt_domain *d, struct rdt_resource *r, * usage to ensure there is no overlap * with an exclusive group. 
*/ - if (d_cdp) - peer_ctl = d_cdp->ctrl_val[i]; + if (resctrl_arch_get_cdp_enabled(r->rid)) + peer_ctl = resctrl_arch_get_config(r, d, i, + peer_type); else peer_ctl = 0; - used_b |= *ctrl | peer_ctl; + ctrl_val = resctrl_arch_get_config(r, d, i, + s->conf_type); + used_b |= ctrl_val | peer_ctl; if (mode == RDT_MODE_SHAREABLE) - d->new_ctrl |= *ctrl | peer_ctl; + cfg->new_ctrl |= ctrl_val | peer_ctl; } } if (d->plr && d->plr->cbm > 0) used_b |= d->plr->cbm; unused_b = used_b ^ (BIT_MASK(r->cache.cbm_len) - 1); unused_b &= BIT_MASK(r->cache.cbm_len) - 1; - d->new_ctrl |= unused_b; + cfg->new_ctrl |= unused_b; /* * Force the initial CBM to be valid, user can * modify the CBM based on system availability. */ - d->new_ctrl = cbm_ensure_valid(d->new_ctrl, r); + cfg->new_ctrl = cbm_ensure_valid(cfg->new_ctrl, r); /* * Assign the u32 CBM to an unsigned long to ensure that * bitmap_weight() does not access out-of-bound memory. */ - tmp_cbm = d->new_ctrl; + tmp_cbm = cfg->new_ctrl; if (bitmap_weight(&tmp_cbm, r->cache.cbm_len) < r->cache.min_cbm_bits) { - rdt_last_cmd_printf("No space on %s:%d\n", r->name, d->id); + rdt_last_cmd_printf("No space on %s:%d\n", s->name, d->id); return -ENOSPC; } - d->have_new_ctrl = true; + cfg->have_new_ctrl = true; return 0; } @@ -2675,13 +2804,13 @@ static int __init_one_rdt_domain(struct rdt_domain *d, struct rdt_resource *r, * If there are no more shareable bits available on any domain then * the entire allocation will fail. */ -static int rdtgroup_init_cat(struct rdt_resource *r, u32 closid) +static int rdtgroup_init_cat(struct resctrl_schema *s, u32 closid) { struct rdt_domain *d; int ret; - list_for_each_entry(d, &r->domains, list) { - ret = __init_one_rdt_domain(d, r, closid); + list_for_each_entry(d, &s->res->domains, list) { + ret = __init_one_rdt_domain(d, s, closid); if (ret < 0) return ret; } @@ -2690,32 +2819,43 @@ static int rdtgroup_init_cat(struct rdt_resource *r, u32 closid) } /* Initialize MBA resource with default values. 
*/ -static void rdtgroup_init_mba(struct rdt_resource *r) +static void rdtgroup_init_mba(struct rdt_resource *r, u32 closid) { + struct resctrl_staged_config *cfg; struct rdt_domain *d; list_for_each_entry(d, &r->domains, list) { - d->new_ctrl = is_mba_sc(r) ? MBA_MAX_MBPS : r->default_ctrl; - d->have_new_ctrl = true; + if (is_mba_sc(r)) { + d->mbps_val[closid] = MBA_MAX_MBPS; + continue; + } + + cfg = &d->staged_config[CDP_NONE]; + cfg->new_ctrl = r->default_ctrl; + cfg->have_new_ctrl = true; } } /* Initialize the RDT group's allocations. */ static int rdtgroup_init_alloc(struct rdtgroup *rdtgrp) { + struct resctrl_schema *s; struct rdt_resource *r; int ret; - for_each_alloc_enabled_rdt_resource(r) { + list_for_each_entry(s, &resctrl_schema_all, list) { + r = s->res; if (r->rid == RDT_RESOURCE_MBA) { - rdtgroup_init_mba(r); + rdtgroup_init_mba(r, rdtgrp->closid); + if (is_mba_sc(r)) + continue; } else { - ret = rdtgroup_init_cat(r, rdtgrp->closid); + ret = rdtgroup_init_cat(s, rdtgrp->closid); if (ret < 0) return ret; } - ret = update_domains(r, rdtgrp->closid); + ret = resctrl_arch_update_domains(r, rdtgrp->closid); if (ret < 0) { rdt_last_cmd_puts("Failed to initialize allocations\n"); return ret; @@ -2775,8 +2915,8 @@ static int mkdir_rdt_prepare(struct kernfs_node *parent_kn, /* * kernfs_remove() will drop the reference count on "kn" which * will free it. But we still need it to stick around for the - * rdtgroup_kn_unlock(kn} call below. Take one extra reference - * here, which will be dropped inside rdtgroup_kn_unlock(). + * rdtgroup_kn_unlock(kn) call. Take one extra reference here, + * which will be dropped by kernfs_put() in rdtgroup_remove(). 
*/ kernfs_get(kn); @@ -2817,6 +2957,7 @@ static int mkdir_rdt_prepare(struct kernfs_node *parent_kn, out_idfree: free_rmid(rdtgrp->mon.rmid); out_destroy: + kernfs_put(rdtgrp->kn); kernfs_remove(rdtgrp->kn); out_free_rgrp: kfree(rdtgrp); @@ -2829,7 +2970,7 @@ static void mkdir_rdt_prepare_clean(struct rdtgroup *rgrp) { kernfs_remove(rgrp->kn); free_rmid(rgrp->mon.rmid); - kfree(rgrp); + rdtgroup_remove(rgrp); } /* @@ -2958,8 +3099,7 @@ static int rdtgroup_mkdir(struct kernfs_node *parent_kn, const char *name, return -EPERM; } -static int rdtgroup_rmdir_mon(struct kernfs_node *kn, struct rdtgroup *rdtgrp, - cpumask_var_t tmpmask) +static int rdtgroup_rmdir_mon(struct rdtgroup *rdtgrp, cpumask_var_t tmpmask) { struct rdtgroup *prdtgrp = rdtgrp->mon.parent; int cpu; @@ -2986,33 +3126,21 @@ static int rdtgroup_rmdir_mon(struct kernfs_node *kn, struct rdtgroup *rdtgrp, WARN_ON(list_empty(&prdtgrp->mon.crdtgrp_list)); list_del(&rdtgrp->mon.crdtgrp_list); - /* - * one extra hold on this, will drop when we kfree(rdtgrp) - * in rdtgroup_kn_unlock() - */ - kernfs_get(kn); kernfs_remove(rdtgrp->kn); return 0; } -static int rdtgroup_ctrl_remove(struct kernfs_node *kn, - struct rdtgroup *rdtgrp) +static int rdtgroup_ctrl_remove(struct rdtgroup *rdtgrp) { rdtgrp->flags = RDT_DELETED; list_del(&rdtgrp->rdtgroup_list); - /* - * one extra hold on this, will drop when we kfree(rdtgrp) - * in rdtgroup_kn_unlock() - */ - kernfs_get(kn); kernfs_remove(rdtgrp->kn); return 0; } -static int rdtgroup_rmdir_ctrl(struct kernfs_node *kn, struct rdtgroup *rdtgrp, - cpumask_var_t tmpmask) +static int rdtgroup_rmdir_ctrl(struct rdtgroup *rdtgrp, cpumask_var_t tmpmask) { int cpu; @@ -3039,7 +3167,7 @@ static int rdtgroup_rmdir_ctrl(struct kernfs_node *kn, struct rdtgroup *rdtgrp, closid_free(rdtgrp->closid); free_rmid(rdtgrp->mon.rmid); - rdtgroup_ctrl_remove(kn, rdtgrp); + rdtgroup_ctrl_remove(rdtgrp); /* * Free all the child monitor group rmids. 
@@ -3072,16 +3200,17 @@ static int rdtgroup_rmdir(struct kernfs_node *kn) * If the rdtgroup is a mon group and parent directory * is a valid "mon_groups" directory, remove the mon group. */ - if (rdtgrp->type == RDTCTRL_GROUP && parent_kn == rdtgroup_default.kn) { + if (rdtgrp->type == RDTCTRL_GROUP && parent_kn == rdtgroup_default.kn && + rdtgrp != &rdtgroup_default) { if (rdtgrp->mode == RDT_MODE_PSEUDO_LOCKSETUP || rdtgrp->mode == RDT_MODE_PSEUDO_LOCKED) { - ret = rdtgroup_ctrl_remove(kn, rdtgrp); + ret = rdtgroup_ctrl_remove(rdtgrp); } else { - ret = rdtgroup_rmdir_ctrl(kn, rdtgrp, tmpmask); + ret = rdtgroup_rmdir_ctrl(rdtgrp, tmpmask); } } else if (rdtgrp->type == RDTMON_GROUP && is_mon_groups(parent_kn, kn->name)) { - ret = rdtgroup_rmdir_mon(kn, rdtgrp, tmpmask); + ret = rdtgroup_rmdir_mon(rdtgrp, tmpmask); } else { ret = -EPERM; } @@ -3094,13 +3223,13 @@ out: static int rdtgroup_show_options(struct seq_file *seq, struct kernfs_root *kf) { - if (rdt_resources_all[RDT_RESOURCE_L3DATA].alloc_enabled) + if (resctrl_arch_get_cdp_enabled(RDT_RESOURCE_L3)) seq_puts(seq, ",cdp"); - if (rdt_resources_all[RDT_RESOURCE_L2DATA].alloc_enabled) + if (resctrl_arch_get_cdp_enabled(RDT_RESOURCE_L2)) seq_puts(seq, ",cdpl2"); - if (is_mba_sc(&rdt_resources_all[RDT_RESOURCE_MBA])) + if (is_mba_sc(&rdt_resources_all[RDT_RESOURCE_MBA].r_resctrl)) seq_puts(seq, ",mba_MBps"); return 0; @@ -3132,13 +3261,13 @@ static int __init rdtgroup_setup_root(void) list_add(&rdtgroup_default.rdtgroup_list, &rdt_all_groups); - ret = rdtgroup_add_files(rdt_root->kn, RF_CTRL_BASE); + ret = rdtgroup_add_files(kernfs_root_to_node(rdt_root), RF_CTRL_BASE); if (ret) { kernfs_destroy_root(rdt_root); goto out; } - rdtgroup_default.kn = rdt_root->kn; + rdtgroup_default.kn = kernfs_root_to_node(rdt_root); kernfs_activate(rdtgroup_default.kn); out: @@ -3147,6 +3276,110 @@ out: return ret; } +static void domain_destroy_mon_state(struct rdt_domain *d) +{ + bitmap_free(d->rmid_busy_llc); + 
kfree(d->mbm_total); + kfree(d->mbm_local); +} + +void resctrl_offline_domain(struct rdt_resource *r, struct rdt_domain *d) +{ + lockdep_assert_held(&rdtgroup_mutex); + + if (supports_mba_mbps() && r->rid == RDT_RESOURCE_MBA) + mba_sc_domain_destroy(r, d); + + if (!r->mon_capable) + return; + + /* + * If resctrl is mounted, remove all the + * per domain monitor data directories. + */ + if (static_branch_unlikely(&rdt_mon_enable_key)) + rmdir_mondata_subdir_allrdtgrp(r, d->id); + + if (is_mbm_enabled()) + cancel_delayed_work(&d->mbm_over); + if (is_llc_occupancy_enabled() && has_busy_rmid(r, d)) { + /* + * When a package is going down, forcefully + * decrement rmid->ebusy. There is no way to know + * that the L3 was flushed and hence may lead to + * incorrect counts in rare scenarios, but leaving + * the RMID as busy creates RMID leaks if the + * package never comes back. + */ + __check_limbo(d, true); + cancel_delayed_work(&d->cqm_limbo); + } + + domain_destroy_mon_state(d); +} + +static int domain_setup_mon_state(struct rdt_resource *r, struct rdt_domain *d) +{ + size_t tsize; + + if (is_llc_occupancy_enabled()) { + d->rmid_busy_llc = bitmap_zalloc(r->num_rmid, GFP_KERNEL); + if (!d->rmid_busy_llc) + return -ENOMEM; + } + if (is_mbm_total_enabled()) { + tsize = sizeof(*d->mbm_total); + d->mbm_total = kcalloc(r->num_rmid, tsize, GFP_KERNEL); + if (!d->mbm_total) { + bitmap_free(d->rmid_busy_llc); + return -ENOMEM; + } + } + if (is_mbm_local_enabled()) { + tsize = sizeof(*d->mbm_local); + d->mbm_local = kcalloc(r->num_rmid, tsize, GFP_KERNEL); + if (!d->mbm_local) { + bitmap_free(d->rmid_busy_llc); + kfree(d->mbm_total); + return -ENOMEM; + } + } + + return 0; +} + +int resctrl_online_domain(struct rdt_resource *r, struct rdt_domain *d) +{ + int err; + + lockdep_assert_held(&rdtgroup_mutex); + + if (supports_mba_mbps() && r->rid == RDT_RESOURCE_MBA) + /* RDT_RESOURCE_MBA is never mon_capable */ + return mba_sc_domain_allocate(r, d); + + if (!r->mon_capable) + return 
0; + + err = domain_setup_mon_state(r, d); + if (err) + return err; + + if (is_mbm_enabled()) { + INIT_DELAYED_WORK(&d->mbm_over, mbm_handle_overflow); + mbm_setup_overflow_handler(d, MBM_OVERFLOW_INTERVAL); + } + + if (is_llc_occupancy_enabled()) + INIT_DELAYED_WORK(&d->cqm_limbo, cqm_handle_limbo); + + /* If resctrl is mounted, add per domain monitor data directories. */ + if (static_branch_unlikely(&rdt_mon_enable_key)) + mkdir_mondata_subdir_allrdtgrp(r, d); + + return 0; +} + /* * rdtgroup_init - rdtgroup initialization * @@ -3181,14 +3414,14 @@ int __init rdtgroup_init(void) * It may also be ok since that would enable debugging of RDT before * resctrl is mounted. * The reason why the debugfs directory is created here and not in - * rdt_mount() is because rdt_mount() takes rdtgroup_mutex and + * rdt_get_tree() is because rdt_get_tree() takes rdtgroup_mutex and * during the debugfs directory creation also &sb->s_type->i_mutex_key * (the lockdep class of inode->i_rwsem). Other filesystem * interactions (eg. SyS_getdents) have the lock ordering: - * &sb->s_type->i_mutex_key --> &mm->mmap_sem - * During mmap(), called with &mm->mmap_sem, the rdtgroup_mutex + * &sb->s_type->i_mutex_key --> &mm->mmap_lock + * During mmap(), called with &mm->mmap_lock, the rdtgroup_mutex * is taken, thus creating dependency: - * &mm->mmap_sem --> rdtgroup_mutex for the latter that can cause + * &mm->mmap_lock --> rdtgroup_mutex for the latter that can cause * issues considering the other two lock dependencies. 
* By creating the debugfs directory here we avoid a dependency * that may cause deadlock (even though file operations cannot diff --git a/arch/x86/kernel/cpu/scattered.c b/arch/x86/kernel/cpu/scattered.c index 62b137c3c97a..fc01f81f6e2a 100644 --- a/arch/x86/kernel/cpu/scattered.c +++ b/arch/x86/kernel/cpu/scattered.c @@ -26,6 +26,8 @@ struct cpuid_bit { static const struct cpuid_bit cpuid_bits[] = { { X86_FEATURE_APERFMPERF, CPUID_ECX, 0, 0x00000006, 0 }, { X86_FEATURE_EPB, CPUID_ECX, 3, 0x00000006, 0 }, + { X86_FEATURE_INTEL_PPIN, CPUID_EBX, 0, 0x00000007, 1 }, + { X86_FEATURE_RRSBA_CTRL, CPUID_EDX, 2, 0x00000007, 2 }, { X86_FEATURE_CQM_LLC, CPUID_EDX, 1, 0x0000000f, 0 }, { X86_FEATURE_CQM_OCCUP_LLC, CPUID_EDX, 0, 0x0000000f, 1 }, { X86_FEATURE_CQM_MBM_TOTAL, CPUID_EDX, 1, 0x0000000f, 1 }, @@ -35,12 +37,15 @@ static const struct cpuid_bit cpuid_bits[] = { { X86_FEATURE_CDP_L3, CPUID_ECX, 2, 0x00000010, 1 }, { X86_FEATURE_CDP_L2, CPUID_ECX, 2, 0x00000010, 2 }, { X86_FEATURE_MBA, CPUID_EBX, 3, 0x00000010, 0 }, + { X86_FEATURE_PER_THREAD_MBA, CPUID_ECX, 0, 0x00000010, 3 }, + { X86_FEATURE_SGX1, CPUID_EAX, 0, 0x00000012, 0 }, + { X86_FEATURE_SGX2, CPUID_EAX, 1, 0x00000012, 0 }, { X86_FEATURE_HW_PSTATE, CPUID_EDX, 7, 0x80000007, 0 }, { X86_FEATURE_CPB, CPUID_EDX, 9, 0x80000007, 0 }, { X86_FEATURE_PROC_FEEDBACK, CPUID_EDX, 11, 0x80000007, 0 }, { X86_FEATURE_MBA, CPUID_EBX, 6, 0x80000008, 0 }, - { X86_FEATURE_SME, CPUID_EAX, 0, 0x8000001f, 0 }, - { X86_FEATURE_SEV, CPUID_EAX, 1, 0x8000001f, 0 }, + { X86_FEATURE_PERFMON_V2, CPUID_EAX, 0, 0x80000022, 0 }, + { X86_FEATURE_AMD_LBR_V2, CPUID_EAX, 1, 0x80000022, 0 }, { 0, 0, 0, 0, 0 } }; diff --git a/arch/x86/kernel/cpu/sgx/Makefile b/arch/x86/kernel/cpu/sgx/Makefile new file mode 100644 index 000000000000..9c1656779b2a --- /dev/null +++ b/arch/x86/kernel/cpu/sgx/Makefile @@ -0,0 +1,6 @@ +obj-y += \ + driver.o \ + encl.o \ + ioctl.o \ + main.o +obj-$(CONFIG_X86_SGX_KVM) += virt.o diff --git a/arch/x86/kernel/cpu/sgx/driver.c 
b/arch/x86/kernel/cpu/sgx/driver.c new file mode 100644 index 000000000000..aa9b8b868867 --- /dev/null +++ b/arch/x86/kernel/cpu/sgx/driver.c @@ -0,0 +1,180 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Copyright(c) 2016-20 Intel Corporation. */ + +#include <linux/acpi.h> +#include <linux/miscdevice.h> +#include <linux/mman.h> +#include <linux/security.h> +#include <linux/suspend.h> +#include <asm/traps.h> +#include "driver.h" +#include "encl.h" + +u64 sgx_attributes_reserved_mask; +u64 sgx_xfrm_reserved_mask = ~0x3; +u32 sgx_misc_reserved_mask; + +static int sgx_open(struct inode *inode, struct file *file) +{ + struct sgx_encl *encl; + int ret; + + encl = kzalloc(sizeof(*encl), GFP_KERNEL); + if (!encl) + return -ENOMEM; + + kref_init(&encl->refcount); + xa_init(&encl->page_array); + mutex_init(&encl->lock); + INIT_LIST_HEAD(&encl->va_pages); + INIT_LIST_HEAD(&encl->mm_list); + spin_lock_init(&encl->mm_lock); + + ret = init_srcu_struct(&encl->srcu); + if (ret) { + kfree(encl); + return ret; + } + + file->private_data = encl; + + return 0; +} + +static int sgx_release(struct inode *inode, struct file *file) +{ + struct sgx_encl *encl = file->private_data; + struct sgx_encl_mm *encl_mm; + + /* + * Drain the remaining mm_list entries. At this point the list contains + * entries for processes, which have closed the enclave file but have + * not exited yet. The processes, which have exited, are gone from the + * list by sgx_mmu_notifier_release(). + */ + for ( ; ; ) { + spin_lock(&encl->mm_lock); + + if (list_empty(&encl->mm_list)) { + encl_mm = NULL; + } else { + encl_mm = list_first_entry(&encl->mm_list, + struct sgx_encl_mm, list); + list_del_rcu(&encl_mm->list); + } + + spin_unlock(&encl->mm_lock); + + /* The enclave is no longer mapped by any mm. 
*/ + if (!encl_mm) + break; + + synchronize_srcu(&encl->srcu); + mmu_notifier_unregister(&encl_mm->mmu_notifier, encl_mm->mm); + kfree(encl_mm); + + /* 'encl_mm' is gone, put encl_mm->encl reference: */ + kref_put(&encl->refcount, sgx_encl_release); + } + + kref_put(&encl->refcount, sgx_encl_release); + return 0; +} + +static int sgx_mmap(struct file *file, struct vm_area_struct *vma) +{ + struct sgx_encl *encl = file->private_data; + int ret; + + ret = sgx_encl_may_map(encl, vma->vm_start, vma->vm_end, vma->vm_flags); + if (ret) + return ret; + + ret = sgx_encl_mm_add(encl, vma->vm_mm); + if (ret) + return ret; + + vma->vm_ops = &sgx_vm_ops; + vma->vm_flags |= VM_PFNMAP | VM_DONTEXPAND | VM_DONTDUMP | VM_IO; + vma->vm_private_data = encl; + + return 0; +} + +static unsigned long sgx_get_unmapped_area(struct file *file, + unsigned long addr, + unsigned long len, + unsigned long pgoff, + unsigned long flags) +{ + if ((flags & MAP_TYPE) == MAP_PRIVATE) + return -EINVAL; + + if (flags & MAP_FIXED) + return addr; + + return current->mm->get_unmapped_area(file, addr, len, pgoff, flags); +} + +#ifdef CONFIG_COMPAT +static long sgx_compat_ioctl(struct file *filep, unsigned int cmd, + unsigned long arg) +{ + return sgx_ioctl(filep, cmd, arg); +} +#endif + +static const struct file_operations sgx_encl_fops = { + .owner = THIS_MODULE, + .open = sgx_open, + .release = sgx_release, + .unlocked_ioctl = sgx_ioctl, +#ifdef CONFIG_COMPAT + .compat_ioctl = sgx_compat_ioctl, +#endif + .mmap = sgx_mmap, + .get_unmapped_area = sgx_get_unmapped_area, +}; + +static struct miscdevice sgx_dev_enclave = { + .minor = MISC_DYNAMIC_MINOR, + .name = "sgx_enclave", + .nodename = "sgx_enclave", + .fops = &sgx_encl_fops, +}; + +int __init sgx_drv_init(void) +{ + unsigned int eax, ebx, ecx, edx; + u64 attr_mask; + u64 xfrm_mask; + int ret; + + if (!cpu_feature_enabled(X86_FEATURE_SGX_LC)) + return -ENODEV; + + cpuid_count(SGX_CPUID, 0, &eax, &ebx, &ecx, &edx); + + if (!(eax & 1)) { + pr_err("SGX 
disabled: SGX1 instruction support not available.\n"); + return -ENODEV; + } + + sgx_misc_reserved_mask = ~ebx | SGX_MISC_RESERVED_MASK; + + cpuid_count(SGX_CPUID, 1, &eax, &ebx, &ecx, &edx); + + attr_mask = (((u64)ebx) << 32) + (u64)eax; + sgx_attributes_reserved_mask = ~attr_mask | SGX_ATTR_RESERVED_MASK; + + if (cpu_feature_enabled(X86_FEATURE_OSXSAVE)) { + xfrm_mask = (((u64)edx) << 32) + (u64)ecx; + sgx_xfrm_reserved_mask = ~xfrm_mask; + } + + ret = misc_register(&sgx_dev_enclave); + if (ret) + return ret; + + return 0; +} diff --git a/arch/x86/kernel/cpu/sgx/driver.h b/arch/x86/kernel/cpu/sgx/driver.h new file mode 100644 index 000000000000..4eddb4d571ef --- /dev/null +++ b/arch/x86/kernel/cpu/sgx/driver.h @@ -0,0 +1,29 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef __ARCH_SGX_DRIVER_H__ +#define __ARCH_SGX_DRIVER_H__ + +#include <crypto/hash.h> +#include <linux/kref.h> +#include <linux/mmu_notifier.h> +#include <linux/radix-tree.h> +#include <linux/rwsem.h> +#include <linux/sched.h> +#include <linux/workqueue.h> +#include <uapi/asm/sgx.h> +#include "sgx.h" + +#define SGX_EINIT_SPIN_COUNT 20 +#define SGX_EINIT_SLEEP_COUNT 50 +#define SGX_EINIT_SLEEP_TIME 20 + +extern u64 sgx_attributes_reserved_mask; +extern u64 sgx_xfrm_reserved_mask; +extern u32 sgx_misc_reserved_mask; + +extern const struct file_operations sgx_provision_fops; + +long sgx_ioctl(struct file *filep, unsigned int cmd, unsigned long arg); + +int sgx_drv_init(void); + +#endif /* __ARCH_X86_SGX_DRIVER_H__ */ diff --git a/arch/x86/kernel/cpu/sgx/encl.c b/arch/x86/kernel/cpu/sgx/encl.c new file mode 100644 index 000000000000..1ec20807de1e --- /dev/null +++ b/arch/x86/kernel/cpu/sgx/encl.c @@ -0,0 +1,1288 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Copyright(c) 2016-20 Intel Corporation. 
*/ + +#include <linux/lockdep.h> +#include <linux/mm.h> +#include <linux/mman.h> +#include <linux/shmem_fs.h> +#include <linux/suspend.h> +#include <linux/sched/mm.h> +#include <asm/sgx.h> +#include "encl.h" +#include "encls.h" +#include "sgx.h" + +static int sgx_encl_lookup_backing(struct sgx_encl *encl, unsigned long page_index, + struct sgx_backing *backing); + +#define PCMDS_PER_PAGE (PAGE_SIZE / sizeof(struct sgx_pcmd)) +/* + * 32 PCMD entries share a PCMD page. PCMD_FIRST_MASK is used to + * determine the page index associated with the first PCMD entry + * within a PCMD page. + */ +#define PCMD_FIRST_MASK GENMASK(4, 0) + +/** + * reclaimer_writing_to_pcmd() - Query if any enclave page associated with + * a PCMD page is in process of being reclaimed. + * @encl: Enclave to which PCMD page belongs + * @start_addr: Address of enclave page using first entry within the PCMD page + * + * When an enclave page is reclaimed some Paging Crypto MetaData (PCMD) is + * stored. The PCMD data of a reclaimed enclave page contains enough + * information for the processor to verify the page at the time + * it is loaded back into the Enclave Page Cache (EPC). + * + * The backing storage to which enclave pages are reclaimed is laid out as + * follows: + * Encrypted enclave pages:SECS page:PCMD pages + * + * Each PCMD page contains the PCMD metadata of + * PAGE_SIZE/sizeof(struct sgx_pcmd) enclave pages. + * + * A PCMD page can only be truncated if it is (a) empty, and (b) not in the + * process of getting data (and thus soon being non-empty). (b) is tested with + * a check if an enclave page sharing the PCMD page is in the process of being + * reclaimed. + * + * The reclaimer sets the SGX_ENCL_PAGE_BEING_RECLAIMED flag when it + * intends to reclaim that enclave page - it means that the PCMD page + * associated with that enclave page is about to get some data and thus + * even if the PCMD page is empty, it should not be truncated. 
+ * + * Context: Enclave mutex (&sgx_encl->lock) must be held. + * Return: 1 if the reclaimer is about to write to the PCMD page + * 0 if the reclaimer has no intention to write to the PCMD page + */ +static int reclaimer_writing_to_pcmd(struct sgx_encl *encl, + unsigned long start_addr) +{ + int reclaimed = 0; + int i; + + /* + * PCMD_FIRST_MASK is based on number of PCMD entries within + * PCMD page being 32. + */ + BUILD_BUG_ON(PCMDS_PER_PAGE != 32); + + for (i = 0; i < PCMDS_PER_PAGE; i++) { + struct sgx_encl_page *entry; + unsigned long addr; + + addr = start_addr + i * PAGE_SIZE; + + /* + * Stop when reaching the SECS page - it does not + * have a page_array entry and its reclaim is + * started and completed with enclave mutex held so + * it does not use the SGX_ENCL_PAGE_BEING_RECLAIMED + * flag. + */ + if (addr == encl->base + encl->size) + break; + + entry = xa_load(&encl->page_array, PFN_DOWN(addr)); + if (!entry) + continue; + + /* + * VA page slot ID uses same bit as the flag so it is important + * to ensure that the page is not already in backing store. + */ + if (entry->epc_page && + (entry->desc & SGX_ENCL_PAGE_BEING_RECLAIMED)) { + reclaimed = 1; + break; + } + } + + return reclaimed; +} + +/* + * Calculate byte offset of a PCMD struct associated with an enclave page. PCMD's + * follow right after the EPC data in the backing storage. In addition to the + * visible enclave pages, there's one extra page slot for SECS, before PCMD + * structs. + */ +static inline pgoff_t sgx_encl_get_backing_page_pcmd_offset(struct sgx_encl *encl, + unsigned long page_index) +{ + pgoff_t epc_end_off = encl->size + sizeof(struct sgx_secs); + + return epc_end_off + page_index * sizeof(struct sgx_pcmd); +} + +/* + * Free a page from the backing storage in the given page index. 
+ */ +static inline void sgx_encl_truncate_backing_page(struct sgx_encl *encl, unsigned long page_index) +{ + struct inode *inode = file_inode(encl->backing); + + shmem_truncate_range(inode, PFN_PHYS(page_index), PFN_PHYS(page_index) + PAGE_SIZE - 1); +} + +/* + * ELDU: Load an EPC page as unblocked. For more info, see "OS Management of EPC + * Pages" in the SDM. + */ +static int __sgx_encl_eldu(struct sgx_encl_page *encl_page, + struct sgx_epc_page *epc_page, + struct sgx_epc_page *secs_page) +{ + unsigned long va_offset = encl_page->desc & SGX_ENCL_PAGE_VA_OFFSET_MASK; + struct sgx_encl *encl = encl_page->encl; + pgoff_t page_index, page_pcmd_off; + unsigned long pcmd_first_page; + struct sgx_pageinfo pginfo; + struct sgx_backing b; + bool pcmd_page_empty; + u8 *pcmd_page; + int ret; + + if (secs_page) + page_index = PFN_DOWN(encl_page->desc - encl_page->encl->base); + else + page_index = PFN_DOWN(encl->size); + + /* + * Address of enclave page using the first entry within the PCMD page. + */ + pcmd_first_page = PFN_PHYS(page_index & ~PCMD_FIRST_MASK) + encl->base; + + page_pcmd_off = sgx_encl_get_backing_page_pcmd_offset(encl, page_index); + + ret = sgx_encl_lookup_backing(encl, page_index, &b); + if (ret) + return ret; + + pginfo.addr = encl_page->desc & PAGE_MASK; + pginfo.contents = (unsigned long)kmap_atomic(b.contents); + pcmd_page = kmap_atomic(b.pcmd); + pginfo.metadata = (unsigned long)pcmd_page + b.pcmd_offset; + + if (secs_page) + pginfo.secs = (u64)sgx_get_epc_virt_addr(secs_page); + else + pginfo.secs = 0; + + ret = __eldu(&pginfo, sgx_get_epc_virt_addr(epc_page), + sgx_get_epc_virt_addr(encl_page->va_page->epc_page) + va_offset); + if (ret) { + if (encls_failed(ret)) + ENCLS_WARN(ret, "ELDU"); + + ret = -EFAULT; + } + + memset(pcmd_page + b.pcmd_offset, 0, sizeof(struct sgx_pcmd)); + set_page_dirty(b.pcmd); + + /* + * The area for the PCMD in the page was zeroed above. 
Check if the + * whole page is now empty meaning that all PCMD's have been zeroed: + */ + pcmd_page_empty = !memchr_inv(pcmd_page, 0, PAGE_SIZE); + + kunmap_atomic(pcmd_page); + kunmap_atomic((void *)(unsigned long)pginfo.contents); + + get_page(b.pcmd); + sgx_encl_put_backing(&b); + + sgx_encl_truncate_backing_page(encl, page_index); + + if (pcmd_page_empty && !reclaimer_writing_to_pcmd(encl, pcmd_first_page)) { + sgx_encl_truncate_backing_page(encl, PFN_DOWN(page_pcmd_off)); + pcmd_page = kmap_atomic(b.pcmd); + if (memchr_inv(pcmd_page, 0, PAGE_SIZE)) + pr_warn("PCMD page not empty after truncate.\n"); + kunmap_atomic(pcmd_page); + } + + put_page(b.pcmd); + + return ret; +} + +static struct sgx_epc_page *sgx_encl_eldu(struct sgx_encl_page *encl_page, + struct sgx_epc_page *secs_page) +{ + + unsigned long va_offset = encl_page->desc & SGX_ENCL_PAGE_VA_OFFSET_MASK; + struct sgx_encl *encl = encl_page->encl; + struct sgx_epc_page *epc_page; + int ret; + + epc_page = sgx_alloc_epc_page(encl_page, false); + if (IS_ERR(epc_page)) + return epc_page; + + ret = __sgx_encl_eldu(encl_page, epc_page, secs_page); + if (ret) { + sgx_encl_free_epc_page(epc_page); + return ERR_PTR(ret); + } + + sgx_free_va_slot(encl_page->va_page, va_offset); + list_move(&encl_page->va_page->list, &encl->va_pages); + encl_page->desc &= ~SGX_ENCL_PAGE_VA_OFFSET_MASK; + encl_page->epc_page = epc_page; + + return epc_page; +} + +static struct sgx_encl_page *__sgx_encl_load_page(struct sgx_encl *encl, + struct sgx_encl_page *entry) +{ + struct sgx_epc_page *epc_page; + + /* Entry successfully located. 
*/ + if (entry->epc_page) { + if (entry->desc & SGX_ENCL_PAGE_BEING_RECLAIMED) + return ERR_PTR(-EBUSY); + + return entry; + } + + if (!(encl->secs.epc_page)) { + epc_page = sgx_encl_eldu(&encl->secs, NULL); + if (IS_ERR(epc_page)) + return ERR_CAST(epc_page); + } + + epc_page = sgx_encl_eldu(entry, encl->secs.epc_page); + if (IS_ERR(epc_page)) + return ERR_CAST(epc_page); + + encl->secs_child_cnt++; + sgx_mark_page_reclaimable(entry->epc_page); + + return entry; +} + +static struct sgx_encl_page *sgx_encl_load_page_in_vma(struct sgx_encl *encl, + unsigned long addr, + unsigned long vm_flags) +{ + unsigned long vm_prot_bits = vm_flags & (VM_READ | VM_WRITE | VM_EXEC); + struct sgx_encl_page *entry; + + entry = xa_load(&encl->page_array, PFN_DOWN(addr)); + if (!entry) + return ERR_PTR(-EFAULT); + + /* + * Verify that the page has equal or higher build time + * permissions than the VMA permissions (i.e. the subset of {VM_READ, + * VM_WRITE, VM_EXECUTE} in vma->vm_flags). + */ + if ((entry->vm_max_prot_bits & vm_prot_bits) != vm_prot_bits) + return ERR_PTR(-EFAULT); + + return __sgx_encl_load_page(encl, entry); +} + +struct sgx_encl_page *sgx_encl_load_page(struct sgx_encl *encl, + unsigned long addr) +{ + struct sgx_encl_page *entry; + + entry = xa_load(&encl->page_array, PFN_DOWN(addr)); + if (!entry) + return ERR_PTR(-EFAULT); + + return __sgx_encl_load_page(encl, entry); +} + +/** + * sgx_encl_eaug_page() - Dynamically add page to initialized enclave + * @vma: VMA obtained from fault info from where page is accessed + * @encl: enclave accessing the page + * @addr: address that triggered the page fault + * + * When an initialized enclave accesses a page with no backing EPC page + * on a SGX2 system then the EPC can be added dynamically via the SGX2 + * ENCLS[EAUG] instruction. + * + * Returns: Appropriate vm_fault_t: VM_FAULT_NOPAGE when PTE was installed + * successfully, VM_FAULT_SIGBUS or VM_FAULT_OOM as error otherwise. 
+ */ +static vm_fault_t sgx_encl_eaug_page(struct vm_area_struct *vma, + struct sgx_encl *encl, unsigned long addr) +{ + vm_fault_t vmret = VM_FAULT_SIGBUS; + struct sgx_pageinfo pginfo = {0}; + struct sgx_encl_page *encl_page; + struct sgx_epc_page *epc_page; + struct sgx_va_page *va_page; + unsigned long phys_addr; + u64 secinfo_flags; + int ret; + + if (!test_bit(SGX_ENCL_INITIALIZED, &encl->flags)) + return VM_FAULT_SIGBUS; + + /* + * Ignore internal permission checking for dynamically added pages. + * They matter only for data added during the pre-initialization + * phase. The enclave decides the permissions by the means of + * EACCEPT, EACCEPTCOPY and EMODPE. + */ + secinfo_flags = SGX_SECINFO_R | SGX_SECINFO_W | SGX_SECINFO_X; + encl_page = sgx_encl_page_alloc(encl, addr - encl->base, secinfo_flags); + if (IS_ERR(encl_page)) + return VM_FAULT_OOM; + + mutex_lock(&encl->lock); + + epc_page = sgx_alloc_epc_page(encl_page, false); + if (IS_ERR(epc_page)) { + if (PTR_ERR(epc_page) == -EBUSY) + vmret = VM_FAULT_NOPAGE; + goto err_out_unlock; + } + + va_page = sgx_encl_grow(encl, false); + if (IS_ERR(va_page)) { + if (PTR_ERR(va_page) == -EBUSY) + vmret = VM_FAULT_NOPAGE; + goto err_out_epc; + } + + if (va_page) + list_add(&va_page->list, &encl->va_pages); + + ret = xa_insert(&encl->page_array, PFN_DOWN(encl_page->desc), + encl_page, GFP_KERNEL); + /* + * If ret == -EBUSY then page was created in another flow while + * running without encl->lock + */ + if (ret) + goto err_out_shrink; + + pginfo.secs = (unsigned long)sgx_get_epc_virt_addr(encl->secs.epc_page); + pginfo.addr = encl_page->desc & PAGE_MASK; + pginfo.metadata = 0; + + ret = __eaug(&pginfo, sgx_get_epc_virt_addr(epc_page)); + if (ret) + goto err_out; + + encl_page->encl = encl; + encl_page->epc_page = epc_page; + encl_page->type = SGX_PAGE_TYPE_REG; + encl->secs_child_cnt++; + + sgx_mark_page_reclaimable(encl_page->epc_page); + + phys_addr = sgx_get_epc_phys_addr(epc_page); + /* + * Do not undo 
everything when creating PTE entry fails - next #PF + * would find page ready for a PTE. + */ + vmret = vmf_insert_pfn(vma, addr, PFN_DOWN(phys_addr)); + if (vmret != VM_FAULT_NOPAGE) { + mutex_unlock(&encl->lock); + return VM_FAULT_SIGBUS; + } + mutex_unlock(&encl->lock); + return VM_FAULT_NOPAGE; + +err_out: + xa_erase(&encl->page_array, PFN_DOWN(encl_page->desc)); + +err_out_shrink: + sgx_encl_shrink(encl, va_page); +err_out_epc: + sgx_encl_free_epc_page(epc_page); +err_out_unlock: + mutex_unlock(&encl->lock); + kfree(encl_page); + + return vmret; +} + +static vm_fault_t sgx_vma_fault(struct vm_fault *vmf) +{ + unsigned long addr = (unsigned long)vmf->address; + struct vm_area_struct *vma = vmf->vma; + struct sgx_encl_page *entry; + unsigned long phys_addr; + struct sgx_encl *encl; + vm_fault_t ret; + + encl = vma->vm_private_data; + + /* + * It's very unlikely but possible that allocating memory for the + * mm_list entry of a forked process failed in sgx_vma_open(). When + * this happens, vm_private_data is set to NULL. + */ + if (unlikely(!encl)) + return VM_FAULT_SIGBUS; + + /* + * The page_array keeps track of all enclave pages, whether they + * are swapped out or not. If there is no entry for this page and + * the system supports SGX2 then it is possible to dynamically add + * a new enclave page. This is only possible for an initialized + * enclave that will be checked for right away. 
+ */ + if (cpu_feature_enabled(X86_FEATURE_SGX2) && + (!xa_load(&encl->page_array, PFN_DOWN(addr)))) + return sgx_encl_eaug_page(vma, encl, addr); + + mutex_lock(&encl->lock); + + entry = sgx_encl_load_page_in_vma(encl, addr, vma->vm_flags); + if (IS_ERR(entry)) { + mutex_unlock(&encl->lock); + + if (PTR_ERR(entry) == -EBUSY) + return VM_FAULT_NOPAGE; + + return VM_FAULT_SIGBUS; + } + + phys_addr = sgx_get_epc_phys_addr(entry->epc_page); + + ret = vmf_insert_pfn(vma, addr, PFN_DOWN(phys_addr)); + if (ret != VM_FAULT_NOPAGE) { + mutex_unlock(&encl->lock); + + return VM_FAULT_SIGBUS; + } + + sgx_encl_test_and_clear_young(vma->vm_mm, entry); + mutex_unlock(&encl->lock); + + return VM_FAULT_NOPAGE; +} + +static void sgx_vma_open(struct vm_area_struct *vma) +{ + struct sgx_encl *encl = vma->vm_private_data; + + /* + * It's possible but unlikely that vm_private_data is NULL. This can + * happen in a grandchild of a process, when sgx_encl_mm_add() had + * failed to allocate memory in this callback. + */ + if (unlikely(!encl)) + return; + + if (sgx_encl_mm_add(encl, vma->vm_mm)) + vma->vm_private_data = NULL; +} + + +/** + * sgx_encl_may_map() - Check if a requested VMA mapping is allowed + * @encl: an enclave pointer + * @start: lower bound of the address range, inclusive + * @end: upper bound of the address range, exclusive + * @vm_flags: VMA flags + * + * Iterate through the enclave pages contained within [@start, @end) to verify + * that the permissions requested by a subset of {VM_READ, VM_WRITE, VM_EXEC} + * do not contain any permissions that are not contained in the build time + * permissions of any of the enclave pages within the given address range. + * + * An enclave creator must declare the strongest permissions that will be + * needed for each enclave page. This ensures that mappings have the identical + * or weaker permissions than the earlier declared permissions. 
+ * + * Return: 0 on success, -EACCES otherwise + */ +int sgx_encl_may_map(struct sgx_encl *encl, unsigned long start, + unsigned long end, unsigned long vm_flags) +{ + unsigned long vm_prot_bits = vm_flags & (VM_READ | VM_WRITE | VM_EXEC); + struct sgx_encl_page *page; + unsigned long count = 0; + int ret = 0; + + XA_STATE(xas, &encl->page_array, PFN_DOWN(start)); + + /* Disallow mapping outside enclave's address range. */ + if (test_bit(SGX_ENCL_INITIALIZED, &encl->flags) && + (start < encl->base || end > encl->base + encl->size)) + return -EACCES; + + /* + * Disallow READ_IMPLIES_EXEC tasks as their VMA permissions might + * conflict with the enclave page permissions. + */ + if (current->personality & READ_IMPLIES_EXEC) + return -EACCES; + + mutex_lock(&encl->lock); + xas_lock(&xas); + xas_for_each(&xas, page, PFN_DOWN(end - 1)) { + if (~page->vm_max_prot_bits & vm_prot_bits) { + ret = -EACCES; + break; + } + + /* Reschedule on every XA_CHECK_SCHED iteration. */ + if (!(++count % XA_CHECK_SCHED)) { + xas_pause(&xas); + xas_unlock(&xas); + mutex_unlock(&encl->lock); + + cond_resched(); + + mutex_lock(&encl->lock); + xas_lock(&xas); + } + } + xas_unlock(&xas); + mutex_unlock(&encl->lock); + + return ret; +} + +static int sgx_vma_mprotect(struct vm_area_struct *vma, unsigned long start, + unsigned long end, unsigned long newflags) +{ + return sgx_encl_may_map(vma->vm_private_data, start, end, newflags); +} + +static int sgx_encl_debug_read(struct sgx_encl *encl, struct sgx_encl_page *page, + unsigned long addr, void *data) +{ + unsigned long offset = addr & ~PAGE_MASK; + int ret; + + + ret = __edbgrd(sgx_get_epc_virt_addr(page->epc_page) + offset, data); + if (ret) + return -EIO; + + return 0; +} + +static int sgx_encl_debug_write(struct sgx_encl *encl, struct sgx_encl_page *page, + unsigned long addr, void *data) +{ + unsigned long offset = addr & ~PAGE_MASK; + int ret; + + ret = __edbgwr(sgx_get_epc_virt_addr(page->epc_page) + offset, data); + if (ret) + return 
-EIO; + + return 0; +} + +/* + * Load an enclave page to EPC if required, and take encl->lock. + */ +static struct sgx_encl_page *sgx_encl_reserve_page(struct sgx_encl *encl, + unsigned long addr, + unsigned long vm_flags) +{ + struct sgx_encl_page *entry; + + for ( ; ; ) { + mutex_lock(&encl->lock); + + entry = sgx_encl_load_page_in_vma(encl, addr, vm_flags); + if (PTR_ERR(entry) != -EBUSY) + break; + + mutex_unlock(&encl->lock); + } + + if (IS_ERR(entry)) + mutex_unlock(&encl->lock); + + return entry; +} + +static int sgx_vma_access(struct vm_area_struct *vma, unsigned long addr, + void *buf, int len, int write) +{ + struct sgx_encl *encl = vma->vm_private_data; + struct sgx_encl_page *entry = NULL; + char data[sizeof(unsigned long)]; + unsigned long align; + int offset; + int cnt; + int ret = 0; + int i; + + /* + * If process was forked, VMA is still there but vm_private_data is set + * to NULL. + */ + if (!encl) + return -EFAULT; + + if (!test_bit(SGX_ENCL_DEBUG, &encl->flags)) + return -EFAULT; + + for (i = 0; i < len; i += cnt) { + entry = sgx_encl_reserve_page(encl, (addr + i) & PAGE_MASK, + vma->vm_flags); + if (IS_ERR(entry)) { + ret = PTR_ERR(entry); + break; + } + + align = ALIGN_DOWN(addr + i, sizeof(unsigned long)); + offset = (addr + i) & (sizeof(unsigned long) - 1); + cnt = sizeof(unsigned long) - offset; + cnt = min(cnt, len - i); + + ret = sgx_encl_debug_read(encl, entry, align, data); + if (ret) + goto out; + + if (write) { + memcpy(data + offset, buf + i, cnt); + ret = sgx_encl_debug_write(encl, entry, align, data); + if (ret) + goto out; + } else { + memcpy(buf + i, data + offset, cnt); + } + +out: + mutex_unlock(&encl->lock); + + if (ret) + break; + } + + return ret < 0 ? 
ret : i; +} + +const struct vm_operations_struct sgx_vm_ops = { + .fault = sgx_vma_fault, + .mprotect = sgx_vma_mprotect, + .open = sgx_vma_open, + .access = sgx_vma_access, +}; + +/** + * sgx_encl_release - Destroy an enclave instance + * @ref: address of a kref inside &sgx_encl + * + * Used together with kref_put(). Frees all the resources associated with the + * enclave and the instance itself. + */ +void sgx_encl_release(struct kref *ref) +{ + struct sgx_encl *encl = container_of(ref, struct sgx_encl, refcount); + struct sgx_va_page *va_page; + struct sgx_encl_page *entry; + unsigned long index; + + xa_for_each(&encl->page_array, index, entry) { + if (entry->epc_page) { + /* + * The page and its radix tree entry cannot be freed + * if the page is being held by the reclaimer. + */ + if (sgx_unmark_page_reclaimable(entry->epc_page)) + continue; + + sgx_encl_free_epc_page(entry->epc_page); + encl->secs_child_cnt--; + entry->epc_page = NULL; + } + + kfree(entry); + /* Invoke scheduler to prevent soft lockups. */ + cond_resched(); + } + + xa_destroy(&encl->page_array); + + if (!encl->secs_child_cnt && encl->secs.epc_page) { + sgx_encl_free_epc_page(encl->secs.epc_page); + encl->secs.epc_page = NULL; + } + + while (!list_empty(&encl->va_pages)) { + va_page = list_first_entry(&encl->va_pages, struct sgx_va_page, + list); + list_del(&va_page->list); + sgx_encl_free_epc_page(va_page->epc_page); + kfree(va_page); + } + + if (encl->backing) + fput(encl->backing); + + cleanup_srcu_struct(&encl->srcu); + + WARN_ON_ONCE(!list_empty(&encl->mm_list)); + + /* Detect EPC page leak's. */ + WARN_ON_ONCE(encl->secs_child_cnt); + WARN_ON_ONCE(encl->secs.epc_page); + + kfree(encl); +} + +/* + * 'mm' is exiting and no longer needs mmu notifications. 
+ */ +static void sgx_mmu_notifier_release(struct mmu_notifier *mn, + struct mm_struct *mm) +{ + struct sgx_encl_mm *encl_mm = container_of(mn, struct sgx_encl_mm, mmu_notifier); + struct sgx_encl_mm *tmp = NULL; + + /* + * The enclave itself can remove encl_mm. Note, objects can't be moved + * off an RCU protected list, but deletion is ok. + */ + spin_lock(&encl_mm->encl->mm_lock); + list_for_each_entry(tmp, &encl_mm->encl->mm_list, list) { + if (tmp == encl_mm) { + list_del_rcu(&encl_mm->list); + break; + } + } + spin_unlock(&encl_mm->encl->mm_lock); + + if (tmp == encl_mm) { + synchronize_srcu(&encl_mm->encl->srcu); + mmu_notifier_put(mn); + } +} + +static void sgx_mmu_notifier_free(struct mmu_notifier *mn) +{ + struct sgx_encl_mm *encl_mm = container_of(mn, struct sgx_encl_mm, mmu_notifier); + + /* 'encl_mm' is going away, put encl_mm->encl reference: */ + kref_put(&encl_mm->encl->refcount, sgx_encl_release); + + kfree(encl_mm); +} + +static const struct mmu_notifier_ops sgx_mmu_notifier_ops = { + .release = sgx_mmu_notifier_release, + .free_notifier = sgx_mmu_notifier_free, +}; + +static struct sgx_encl_mm *sgx_encl_find_mm(struct sgx_encl *encl, + struct mm_struct *mm) +{ + struct sgx_encl_mm *encl_mm = NULL; + struct sgx_encl_mm *tmp; + int idx; + + idx = srcu_read_lock(&encl->srcu); + + list_for_each_entry_rcu(tmp, &encl->mm_list, list) { + if (tmp->mm == mm) { + encl_mm = tmp; + break; + } + } + + srcu_read_unlock(&encl->srcu, idx); + + return encl_mm; +} + +int sgx_encl_mm_add(struct sgx_encl *encl, struct mm_struct *mm) +{ + struct sgx_encl_mm *encl_mm; + int ret; + + /* + * Even though a single enclave may be mapped into an mm more than once, + * each 'mm' only appears once on encl->mm_list. This is guaranteed by + * holding the mm's mmap lock for write before an mm can be added or + * remove to an encl->mm_list. 
+ */ + mmap_assert_write_locked(mm); + + /* + * It's possible that an entry already exists in the mm_list, because it + * is removed only on VFS release or process exit. + */ + if (sgx_encl_find_mm(encl, mm)) + return 0; + + encl_mm = kzalloc(sizeof(*encl_mm), GFP_KERNEL); + if (!encl_mm) + return -ENOMEM; + + /* Grab a refcount for the encl_mm->encl reference: */ + kref_get(&encl->refcount); + encl_mm->encl = encl; + encl_mm->mm = mm; + encl_mm->mmu_notifier.ops = &sgx_mmu_notifier_ops; + + ret = __mmu_notifier_register(&encl_mm->mmu_notifier, mm); + if (ret) { + kfree(encl_mm); + return ret; + } + + spin_lock(&encl->mm_lock); + list_add_rcu(&encl_mm->list, &encl->mm_list); + /* Pairs with smp_rmb() in sgx_zap_enclave_ptes(). */ + smp_wmb(); + encl->mm_list_version++; + spin_unlock(&encl->mm_lock); + + return 0; +} + +/** + * sgx_encl_cpumask() - Query which CPUs might be accessing the enclave + * @encl: the enclave + * + * Some SGX functions require that no cached linear-to-physical address + * mappings are present before they can succeed. For example, ENCLS[EWB] + * copies a page from the enclave page cache to regular main memory but + * it fails if it cannot ensure that there are no cached + * linear-to-physical address mappings referring to the page. + * + * SGX hardware flushes all cached linear-to-physical mappings on a CPU + * when an enclave is exited via ENCLU[EEXIT] or an Asynchronous Enclave + * Exit (AEX). Exiting an enclave will thus ensure cached linear-to-physical + * address mappings are cleared but coordination with the tracking done within + * the SGX hardware is needed to support the SGX functions that depend on this + * cache clearing. + * + * When the ENCLS[ETRACK] function is issued on an enclave the hardware + * tracks threads operating inside the enclave at that time. 
The SGX + * hardware tracking requires that all the identified threads have + * exited the enclave in order to flush the mappings before a function such + * as ENCLS[EWB] will be permitted. + * + * The following flow is used to support SGX functions that require that + * no cached linear-to-physical address mappings are present: + * 1) Execute ENCLS[ETRACK] to initiate hardware tracking. + * 2) Use this function (sgx_encl_cpumask()) to query which CPUs might be + * accessing the enclave. + * 3) Send IPI to identified CPUs, kicking them out of the enclave and + * thus flushing all locally cached linear-to-physical address mappings. + * 4) Execute SGX function. + * + * Context: It is required to call this function after ENCLS[ETRACK]. + * This will ensure that if any new mm appears (racing with + * sgx_encl_mm_add()) then the new mm will enter into the + * enclave with fresh linear-to-physical address mappings. + * + * It is required that all IPIs are completed before a new + * ENCLS[ETRACK] is issued so be sure to protect steps 1 to 3 + * of the above flow with the enclave's mutex.
+ * + * Return: cpumask of CPUs that might be accessing @encl + */ +const cpumask_t *sgx_encl_cpumask(struct sgx_encl *encl) +{ + cpumask_t *cpumask = &encl->cpumask; + struct sgx_encl_mm *encl_mm; + int idx; + + cpumask_clear(cpumask); + + idx = srcu_read_lock(&encl->srcu); + + list_for_each_entry_rcu(encl_mm, &encl->mm_list, list) { + if (!mmget_not_zero(encl_mm->mm)) + continue; + + cpumask_or(cpumask, cpumask, mm_cpumask(encl_mm->mm)); + + mmput_async(encl_mm->mm); + } + + srcu_read_unlock(&encl->srcu, idx); + + return cpumask; +} + +static struct page *sgx_encl_get_backing_page(struct sgx_encl *encl, + pgoff_t index) +{ + struct address_space *mapping = encl->backing->f_mapping; + gfp_t gfpmask = mapping_gfp_mask(mapping); + + return shmem_read_mapping_page_gfp(mapping, index, gfpmask); +} + +/** + * __sgx_encl_get_backing() - Pin the backing storage + * @encl: an enclave pointer + * @page_index: enclave page index + * @backing: data for accessing backing storage for the page + * + * Pin the backing storage pages for storing the encrypted contents and Paging + * Crypto MetaData (PCMD) of an enclave page. + * + * Return: + * 0 on success, + * -errno otherwise. + */ +static int __sgx_encl_get_backing(struct sgx_encl *encl, unsigned long page_index, + struct sgx_backing *backing) +{ + pgoff_t page_pcmd_off = sgx_encl_get_backing_page_pcmd_offset(encl, page_index); + struct page *contents; + struct page *pcmd; + + contents = sgx_encl_get_backing_page(encl, page_index); + if (IS_ERR(contents)) + return PTR_ERR(contents); + + pcmd = sgx_encl_get_backing_page(encl, PFN_DOWN(page_pcmd_off)); + if (IS_ERR(pcmd)) { + put_page(contents); + return PTR_ERR(pcmd); + } + + backing->contents = contents; + backing->pcmd = pcmd; + backing->pcmd_offset = page_pcmd_off & (PAGE_SIZE - 1); + + return 0; +} + +/* + * When called from ksgxd, returns the mem_cgroup of a struct mm stored + * in the enclave's mm_list. 
When not called from ksgxd, just returns + * the mem_cgroup of the current task. + */ +static struct mem_cgroup *sgx_encl_get_mem_cgroup(struct sgx_encl *encl) +{ + struct mem_cgroup *memcg = NULL; + struct sgx_encl_mm *encl_mm; + int idx; + + /* + * If called from normal task context, return the mem_cgroup + * of the current task's mm. The remainder of the handling is for + * ksgxd. + */ + if (!current_is_ksgxd()) + return get_mem_cgroup_from_mm(current->mm); + + /* + * Search the enclave's mm_list to find an mm associated with + * this enclave to charge the allocation to. + */ + idx = srcu_read_lock(&encl->srcu); + + list_for_each_entry_rcu(encl_mm, &encl->mm_list, list) { + if (!mmget_not_zero(encl_mm->mm)) + continue; + + memcg = get_mem_cgroup_from_mm(encl_mm->mm); + + mmput_async(encl_mm->mm); + + break; + } + + srcu_read_unlock(&encl->srcu, idx); + + /* + * In the rare case that there isn't an mm associated with + * the enclave, set memcg to the current active mem_cgroup. + * This will be the root mem_cgroup if there is no active + * mem_cgroup. + */ + if (!memcg) + return get_mem_cgroup_from_mm(NULL); + + return memcg; +} + +/** + * sgx_encl_alloc_backing() - create a new backing storage page + * @encl: an enclave pointer + * @page_index: enclave page index + * @backing: data for accessing backing storage for the page + * + * When called from ksgxd, sets the active memcg from one of the + * mms in the enclave's mm_list prior to any backing page allocation, + * in order to ensure that shmem page allocations are charged to the + * enclave. Create a backing page for loading data back into an EPC page with + * ELDU. This function takes a reference on a new backing page which + * must be dropped with a corresponding call to sgx_encl_put_backing(). + * + * Return: + * 0 on success, + * -errno otherwise. 
+ */ +int sgx_encl_alloc_backing(struct sgx_encl *encl, unsigned long page_index, + struct sgx_backing *backing) +{ + struct mem_cgroup *encl_memcg = sgx_encl_get_mem_cgroup(encl); + struct mem_cgroup *memcg = set_active_memcg(encl_memcg); + int ret; + + ret = __sgx_encl_get_backing(encl, page_index, backing); + + set_active_memcg(memcg); + mem_cgroup_put(encl_memcg); + + return ret; +} + +/** + * sgx_encl_lookup_backing() - retrieve an existing backing storage page + * @encl: an enclave pointer + * @page_index: enclave page index + * @backing: data for accessing backing storage for the page + * + * Retrieve a backing page for loading data back into an EPC page with ELDU. + * It is the caller's responsibility to ensure that it is appropriate to use + * sgx_encl_lookup_backing() rather than sgx_encl_alloc_backing(). If lookup is + * not used correctly, this will cause an allocation which is not accounted for. + * This function takes a reference on an existing backing page which must be + * dropped with a corresponding call to sgx_encl_put_backing(). + * + * Return: + * 0 on success, + * -errno otherwise. 
+ */ +static int sgx_encl_lookup_backing(struct sgx_encl *encl, unsigned long page_index, + struct sgx_backing *backing) +{ + return __sgx_encl_get_backing(encl, page_index, backing); +} + +/** + * sgx_encl_put_backing() - Unpin the backing storage + * @backing: data for accessing backing storage for the page + */ +void sgx_encl_put_backing(struct sgx_backing *backing) +{ + put_page(backing->pcmd); + put_page(backing->contents); +} + +static int sgx_encl_test_and_clear_young_cb(pte_t *ptep, unsigned long addr, + void *data) +{ + pte_t pte; + int ret; + + ret = pte_young(*ptep); + if (ret) { + pte = pte_mkold(*ptep); + set_pte_at((struct mm_struct *)data, addr, ptep, pte); + } + + return ret; +} + +/** + * sgx_encl_test_and_clear_young() - Test and reset the accessed bit + * @mm: mm_struct that is checked + * @page: enclave page to be tested for recent access + * + * Checks the Access (A) bit from the PTE corresponding to the enclave page and + * clears it. + * + * Return: 1 if the page has been recently accessed and 0 if not. 
+ */ +int sgx_encl_test_and_clear_young(struct mm_struct *mm, + struct sgx_encl_page *page) +{ + unsigned long addr = page->desc & PAGE_MASK; + struct sgx_encl *encl = page->encl; + struct vm_area_struct *vma; + int ret; + + ret = sgx_encl_find(mm, addr, &vma); + if (ret) + return 0; + + if (encl != vma->vm_private_data) + return 0; + + ret = apply_to_page_range(vma->vm_mm, addr, PAGE_SIZE, + sgx_encl_test_and_clear_young_cb, vma->vm_mm); + if (ret < 0) + return 0; + + return ret; +} + +struct sgx_encl_page *sgx_encl_page_alloc(struct sgx_encl *encl, + unsigned long offset, + u64 secinfo_flags) +{ + struct sgx_encl_page *encl_page; + unsigned long prot; + + encl_page = kzalloc(sizeof(*encl_page), GFP_KERNEL); + if (!encl_page) + return ERR_PTR(-ENOMEM); + + encl_page->desc = encl->base + offset; + encl_page->encl = encl; + + prot = _calc_vm_trans(secinfo_flags, SGX_SECINFO_R, PROT_READ) | + _calc_vm_trans(secinfo_flags, SGX_SECINFO_W, PROT_WRITE) | + _calc_vm_trans(secinfo_flags, SGX_SECINFO_X, PROT_EXEC); + + /* + * TCS pages must always RW set for CPU access while the SECINFO + * permissions are *always* zero - the CPU ignores the user provided + * values and silently overwrites them with zero permissions. + */ + if ((secinfo_flags & SGX_SECINFO_PAGE_TYPE_MASK) == SGX_SECINFO_TCS) + prot |= PROT_READ | PROT_WRITE; + + /* Calculate maximum of the VM flags for the page. */ + encl_page->vm_max_prot_bits = calc_vm_prot_bits(prot, 0); + + return encl_page; +} + +/** + * sgx_zap_enclave_ptes() - remove PTEs mapping the address from enclave + * @encl: the enclave + * @addr: page aligned pointer to single page for which PTEs will be removed + * + * Multiple VMAs may have an enclave page mapped. Remove the PTE mapping + * @addr from each VMA. Ensure that page fault handler is ready to handle + * new mappings of @addr before calling this function. 
+ */ +void sgx_zap_enclave_ptes(struct sgx_encl *encl, unsigned long addr) +{ + unsigned long mm_list_version; + struct sgx_encl_mm *encl_mm; + struct vm_area_struct *vma; + int idx, ret; + + do { + mm_list_version = encl->mm_list_version; + + /* Pairs with smp_wmb() in sgx_encl_mm_add(). */ + smp_rmb(); + + idx = srcu_read_lock(&encl->srcu); + + list_for_each_entry_rcu(encl_mm, &encl->mm_list, list) { + if (!mmget_not_zero(encl_mm->mm)) + continue; + + mmap_read_lock(encl_mm->mm); + + ret = sgx_encl_find(encl_mm->mm, addr, &vma); + if (!ret && encl == vma->vm_private_data) + zap_vma_ptes(vma, addr, PAGE_SIZE); + + mmap_read_unlock(encl_mm->mm); + + mmput_async(encl_mm->mm); + } + + srcu_read_unlock(&encl->srcu, idx); + } while (unlikely(encl->mm_list_version != mm_list_version)); +} + +/** + * sgx_alloc_va_page() - Allocate a Version Array (VA) page + * @reclaim: Reclaim EPC pages directly if none available. Enclave + * mutex should not be held if this is set. + * + * Allocate a free EPC page and convert it to a Version Array (VA) page. + * + * Return: + * a VA page, + * -errno otherwise + */ +struct sgx_epc_page *sgx_alloc_va_page(bool reclaim) +{ + struct sgx_epc_page *epc_page; + int ret; + + epc_page = sgx_alloc_epc_page(NULL, reclaim); + if (IS_ERR(epc_page)) + return ERR_CAST(epc_page); + + ret = __epa(sgx_get_epc_virt_addr(epc_page)); + if (ret) { + WARN_ONCE(1, "EPA returned %d (0x%x)", ret, ret); + sgx_encl_free_epc_page(epc_page); + return ERR_PTR(-EFAULT); + } + + return epc_page; +} + +/** + * sgx_alloc_va_slot - allocate a VA slot + * @va_page: a &struct sgx_va_page instance + * + * Allocates a slot from a &struct sgx_va_page instance. 
+ * + * Return: offset of the slot inside the VA page + */ +unsigned int sgx_alloc_va_slot(struct sgx_va_page *va_page) +{ + int slot = find_first_zero_bit(va_page->slots, SGX_VA_SLOT_COUNT); + + if (slot < SGX_VA_SLOT_COUNT) + set_bit(slot, va_page->slots); + + return slot << 3; +} + +/** + * sgx_free_va_slot - free a VA slot + * @va_page: a &struct sgx_va_page instance + * @offset: offset of the slot inside the VA page + * + * Frees a slot from a &struct sgx_va_page instance. + */ +void sgx_free_va_slot(struct sgx_va_page *va_page, unsigned int offset) +{ + clear_bit(offset >> 3, va_page->slots); +} + +/** + * sgx_va_page_full - is the VA page full? + * @va_page: a &struct sgx_va_page instance + * + * Return: true if all slots have been taken + */ +bool sgx_va_page_full(struct sgx_va_page *va_page) +{ + int slot = find_first_zero_bit(va_page->slots, SGX_VA_SLOT_COUNT); + + return slot == SGX_VA_SLOT_COUNT; +} + +/** + * sgx_encl_free_epc_page - free an EPC page assigned to an enclave + * @page: EPC page to be freed + * + * Free an EPC page assigned to an enclave. It does EREMOVE for the page, and + * only upon success, it puts the page back to free page list. Otherwise, it + * gives a WARNING to indicate page is leaked. + */ +void sgx_encl_free_epc_page(struct sgx_epc_page *page) +{ + int ret; + + WARN_ON_ONCE(page->flags & SGX_EPC_PAGE_RECLAIMER_TRACKED); + + ret = __eremove(sgx_get_epc_virt_addr(page)); + if (WARN_ONCE(ret, EREMOVE_ERROR_MESSAGE, ret, ret)) + return; + + sgx_free_epc_page(page); +} diff --git a/arch/x86/kernel/cpu/sgx/encl.h b/arch/x86/kernel/cpu/sgx/encl.h new file mode 100644 index 000000000000..f94ff14c9486 --- /dev/null +++ b/arch/x86/kernel/cpu/sgx/encl.h @@ -0,0 +1,129 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/** + * Copyright(c) 2016-20 Intel Corporation. + * + * Contains the software defined data structures for enclaves. 
+ */ +#ifndef _X86_ENCL_H +#define _X86_ENCL_H + +#include <linux/cpumask.h> +#include <linux/kref.h> +#include <linux/list.h> +#include <linux/mm_types.h> +#include <linux/mmu_notifier.h> +#include <linux/mutex.h> +#include <linux/notifier.h> +#include <linux/srcu.h> +#include <linux/workqueue.h> +#include <linux/xarray.h> +#include "sgx.h" + +/* 'desc' bits holding the offset in the VA (version array) page. */ +#define SGX_ENCL_PAGE_VA_OFFSET_MASK GENMASK_ULL(11, 3) + +/* 'desc' bit marking that the page is being reclaimed. */ +#define SGX_ENCL_PAGE_BEING_RECLAIMED BIT(3) + +struct sgx_encl_page { + unsigned long desc; + unsigned long vm_max_prot_bits:8; + enum sgx_page_type type:16; + struct sgx_epc_page *epc_page; + struct sgx_encl *encl; + struct sgx_va_page *va_page; +}; + +enum sgx_encl_flags { + SGX_ENCL_IOCTL = BIT(0), + SGX_ENCL_DEBUG = BIT(1), + SGX_ENCL_CREATED = BIT(2), + SGX_ENCL_INITIALIZED = BIT(3), +}; + +struct sgx_encl_mm { + struct sgx_encl *encl; + struct mm_struct *mm; + struct list_head list; + struct mmu_notifier mmu_notifier; +}; + +struct sgx_encl { + unsigned long base; + unsigned long size; + unsigned long flags; + unsigned int page_cnt; + unsigned int secs_child_cnt; + struct mutex lock; + struct xarray page_array; + struct sgx_encl_page secs; + unsigned long attributes; + unsigned long attributes_mask; + + cpumask_t cpumask; + struct file *backing; + struct kref refcount; + struct list_head va_pages; + unsigned long mm_list_version; + struct list_head mm_list; + spinlock_t mm_lock; + struct srcu_struct srcu; +}; + +#define SGX_VA_SLOT_COUNT 512 + +struct sgx_va_page { + struct sgx_epc_page *epc_page; + DECLARE_BITMAP(slots, SGX_VA_SLOT_COUNT); + struct list_head list; +}; + +struct sgx_backing { + struct page *contents; + struct page *pcmd; + unsigned long pcmd_offset; +}; + +extern const struct vm_operations_struct sgx_vm_ops; + +static inline int sgx_encl_find(struct mm_struct *mm, unsigned long addr, + struct vm_area_struct **vma) 
+{ + struct vm_area_struct *result; + + result = vma_lookup(mm, addr); + if (!result || result->vm_ops != &sgx_vm_ops) + return -EINVAL; + + *vma = result; + + return 0; +} + +int sgx_encl_may_map(struct sgx_encl *encl, unsigned long start, + unsigned long end, unsigned long vm_flags); + +bool current_is_ksgxd(void); +void sgx_encl_release(struct kref *ref); +int sgx_encl_mm_add(struct sgx_encl *encl, struct mm_struct *mm); +const cpumask_t *sgx_encl_cpumask(struct sgx_encl *encl); +int sgx_encl_alloc_backing(struct sgx_encl *encl, unsigned long page_index, + struct sgx_backing *backing); +void sgx_encl_put_backing(struct sgx_backing *backing); +int sgx_encl_test_and_clear_young(struct mm_struct *mm, + struct sgx_encl_page *page); +struct sgx_encl_page *sgx_encl_page_alloc(struct sgx_encl *encl, + unsigned long offset, + u64 secinfo_flags); +void sgx_zap_enclave_ptes(struct sgx_encl *encl, unsigned long addr); +struct sgx_epc_page *sgx_alloc_va_page(bool reclaim); +unsigned int sgx_alloc_va_slot(struct sgx_va_page *va_page); +void sgx_free_va_slot(struct sgx_va_page *va_page, unsigned int offset); +bool sgx_va_page_full(struct sgx_va_page *va_page); +void sgx_encl_free_epc_page(struct sgx_epc_page *page); +struct sgx_encl_page *sgx_encl_load_page(struct sgx_encl *encl, + unsigned long addr); +struct sgx_va_page *sgx_encl_grow(struct sgx_encl *encl, bool reclaim); +void sgx_encl_shrink(struct sgx_encl *encl, struct sgx_va_page *va_page); + +#endif /* _X86_ENCL_H */ diff --git a/arch/x86/kernel/cpu/sgx/encls.h b/arch/x86/kernel/cpu/sgx/encls.h new file mode 100644 index 000000000000..99004b02e2ed --- /dev/null +++ b/arch/x86/kernel/cpu/sgx/encls.h @@ -0,0 +1,236 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _X86_ENCLS_H +#define _X86_ENCLS_H + +#include <linux/bitops.h> +#include <linux/err.h> +#include <linux/io.h> +#include <linux/rwsem.h> +#include <linux/types.h> +#include <asm/asm.h> +#include <asm/traps.h> +#include "sgx.h" + +/* Retrieve the encoded 
trapnr from the specified return code. */ +#define ENCLS_TRAPNR(r) ((r) & ~SGX_ENCLS_FAULT_FLAG) + +/* Issue a WARN() about an ENCLS function. */ +#define ENCLS_WARN(r, name) { \ + do { \ + int _r = (r); \ + WARN_ONCE(_r, "%s returned %d (0x%x)\n", (name), _r, _r); \ + } while (0); \ +} + +/* + * encls_faulted() - Check if an ENCLS leaf faulted given an error code + * @ret: the return value of an ENCLS leaf function call + * + * Return: + * - true: ENCLS leaf faulted. + * - false: Otherwise. + */ +static inline bool encls_faulted(int ret) +{ + return ret & SGX_ENCLS_FAULT_FLAG; +} + +/** + * encls_failed() - Check if an ENCLS function failed + * @ret: the return value of an ENCLS function call + * + * Check if an ENCLS function failed. This happens when the function causes a + * fault that is not caused by an EPCM conflict or when the function returns a + * non-zero value. + */ +static inline bool encls_failed(int ret) +{ + if (encls_faulted(ret)) + return ENCLS_TRAPNR(ret) != X86_TRAP_PF; + + return !!ret; +} + +/** + * __encls_ret_N - encode an ENCLS function that returns an error code in EAX + * @rax: function number + * @inputs: asm inputs for the function + * + * Emit assembly for an ENCLS function that returns an error code, e.g. EREMOVE. + * And because SGX isn't complex enough as it is, function that return an error + * code also modify flags. + * + * Return: + * 0 on success, + * SGX error code on failure + */ +#define __encls_ret_N(rax, inputs...) 
\ + ({ \ + int ret; \ + asm volatile( \ + "1: .byte 0x0f, 0x01, 0xcf;\n\t" \ + "2:\n" \ + _ASM_EXTABLE_TYPE(1b, 2b, EX_TYPE_FAULT_SGX) \ + : "=a"(ret) \ + : "a"(rax), inputs \ + : "memory", "cc"); \ + ret; \ + }) + +#define __encls_ret_1(rax, rcx) \ + ({ \ + __encls_ret_N(rax, "c"(rcx)); \ + }) + +#define __encls_ret_2(rax, rbx, rcx) \ + ({ \ + __encls_ret_N(rax, "b"(rbx), "c"(rcx)); \ + }) + +#define __encls_ret_3(rax, rbx, rcx, rdx) \ + ({ \ + __encls_ret_N(rax, "b"(rbx), "c"(rcx), "d"(rdx)); \ + }) + +/** + * __encls_N - encode an ENCLS function that doesn't return an error code + * @rax: function number + * @rbx_out: optional output variable + * @inputs: asm inputs for the function + * + * Emit assembly for an ENCLS function that does not return an error code, e.g. + * ECREATE. Leaves without error codes either succeed or fault. @rbx_out is an + * optional parameter for use by EDGBRD, which returns the requested value in + * RBX. + * + * Return: + * 0 on success, + * trapnr with SGX_ENCLS_FAULT_FLAG set on fault + */ +#define __encls_N(rax, rbx_out, inputs...) \ + ({ \ + int ret; \ + asm volatile( \ + "1: .byte 0x0f, 0x01, 0xcf;\n\t" \ + " xor %%eax,%%eax;\n" \ + "2:\n" \ + _ASM_EXTABLE_TYPE(1b, 2b, EX_TYPE_FAULT_SGX) \ + : "=a"(ret), "=b"(rbx_out) \ + : "a"(rax), inputs \ + : "memory"); \ + ret; \ + }) + +#define __encls_2(rax, rbx, rcx) \ + ({ \ + unsigned long ign_rbx_out; \ + __encls_N(rax, ign_rbx_out, "b"(rbx), "c"(rcx)); \ + }) + +#define __encls_1_1(rax, data, rcx) \ + ({ \ + unsigned long rbx_out; \ + int ret = __encls_N(rax, rbx_out, "c"(rcx)); \ + if (!ret) \ + data = rbx_out; \ + ret; \ + }) + +/* Initialize an EPC page into an SGX Enclave Control Structure (SECS) page. */ +static inline int __ecreate(struct sgx_pageinfo *pginfo, void *secs) +{ + return __encls_2(ECREATE, pginfo, secs); +} + +/* Hash a 256 byte region of an enclave page to SECS:MRENCLAVE. 
*/ +static inline int __eextend(void *secs, void *addr) +{ + return __encls_2(EEXTEND, secs, addr); +} + +/* + * Associate an EPC page to an enclave either as a REG or TCS page + * populated with the provided data. + */ +static inline int __eadd(struct sgx_pageinfo *pginfo, void *addr) +{ + return __encls_2(EADD, pginfo, addr); +} + +/* Finalize enclave build, initialize enclave for user code execution. */ +static inline int __einit(void *sigstruct, void *token, void *secs) +{ + return __encls_ret_3(EINIT, sigstruct, secs, token); +} + +/* Disassociate EPC page from its enclave and mark it as unused. */ +static inline int __eremove(void *addr) +{ + return __encls_ret_1(EREMOVE, addr); +} + +/* Copy data to an EPC page belonging to a debug enclave. */ +static inline int __edbgwr(void *addr, unsigned long *data) +{ + return __encls_2(EDGBWR, *data, addr); +} + +/* Copy data from an EPC page belonging to a debug enclave. */ +static inline int __edbgrd(void *addr, unsigned long *data) +{ + return __encls_1_1(EDGBRD, *data, addr); +} + +/* Track that software has completed the required TLB address clears. */ +static inline int __etrack(void *addr) +{ + return __encls_ret_1(ETRACK, addr); +} + +/* Load, verify, and unblock an EPC page. */ +static inline int __eldu(struct sgx_pageinfo *pginfo, void *addr, + void *va) +{ + return __encls_ret_3(ELDU, pginfo, addr, va); +} + +/* Make EPC page inaccessible to enclave, ready to be written to memory. */ +static inline int __eblock(void *addr) +{ + return __encls_ret_1(EBLOCK, addr); +} + +/* Initialize an EPC page into a Version Array (VA) page. */ +static inline int __epa(void *addr) +{ + unsigned long rbx = SGX_PAGE_TYPE_VA; + + return __encls_2(EPA, rbx, addr); +} + +/* Invalidate an EPC page and write it out to main memory. */ +static inline int __ewb(struct sgx_pageinfo *pginfo, void *addr, + void *va) +{ + return __encls_ret_3(EWB, pginfo, addr, va); +} + +/* Restrict the EPCM permissions of an EPC page. 
*/ +static inline int __emodpr(struct sgx_secinfo *secinfo, void *addr) +{ + return __encls_ret_2(EMODPR, secinfo, addr); +} + +/* Change the type of an EPC page. */ +static inline int __emodt(struct sgx_secinfo *secinfo, void *addr) +{ + return __encls_ret_2(EMODT, secinfo, addr); +} + +/* Zero a page of EPC memory and add it to an initialized enclave. */ +static inline int __eaug(struct sgx_pageinfo *pginfo, void *addr) +{ + return __encls_2(EAUG, pginfo, addr); +} + +#endif /* _X86_ENCLS_H */ diff --git a/arch/x86/kernel/cpu/sgx/ioctl.c b/arch/x86/kernel/cpu/sgx/ioctl.c new file mode 100644 index 000000000000..ebe79d60619f --- /dev/null +++ b/arch/x86/kernel/cpu/sgx/ioctl.c @@ -0,0 +1,1260 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Copyright(c) 2016-20 Intel Corporation. */ + +#include <asm/mman.h> +#include <asm/sgx.h> +#include <linux/mman.h> +#include <linux/delay.h> +#include <linux/file.h> +#include <linux/hashtable.h> +#include <linux/highmem.h> +#include <linux/ratelimit.h> +#include <linux/sched/signal.h> +#include <linux/shmem_fs.h> +#include <linux/slab.h> +#include <linux/suspend.h> +#include "driver.h" +#include "encl.h" +#include "encls.h" + +struct sgx_va_page *sgx_encl_grow(struct sgx_encl *encl, bool reclaim) +{ + struct sgx_va_page *va_page = NULL; + void *err; + + BUILD_BUG_ON(SGX_VA_SLOT_COUNT != + (SGX_ENCL_PAGE_VA_OFFSET_MASK >> 3) + 1); + + if (!(encl->page_cnt % SGX_VA_SLOT_COUNT)) { + va_page = kzalloc(sizeof(*va_page), GFP_KERNEL); + if (!va_page) + return ERR_PTR(-ENOMEM); + + va_page->epc_page = sgx_alloc_va_page(reclaim); + if (IS_ERR(va_page->epc_page)) { + err = ERR_CAST(va_page->epc_page); + kfree(va_page); + return err; + } + + WARN_ON_ONCE(encl->page_cnt % SGX_VA_SLOT_COUNT); + } + encl->page_cnt++; + return va_page; +} + +void sgx_encl_shrink(struct sgx_encl *encl, struct sgx_va_page *va_page) +{ + encl->page_cnt--; + + if (va_page) { + sgx_encl_free_epc_page(va_page->epc_page); + list_del(&va_page->list); + kfree(va_page); + } 
+} + +static int sgx_encl_create(struct sgx_encl *encl, struct sgx_secs *secs) +{ + struct sgx_epc_page *secs_epc; + struct sgx_va_page *va_page; + struct sgx_pageinfo pginfo; + struct sgx_secinfo secinfo; + unsigned long encl_size; + struct file *backing; + long ret; + + va_page = sgx_encl_grow(encl, true); + if (IS_ERR(va_page)) + return PTR_ERR(va_page); + else if (va_page) + list_add(&va_page->list, &encl->va_pages); + /* else the tail page of the VA page list had free slots. */ + + /* The extra page goes to SECS. */ + encl_size = secs->size + PAGE_SIZE; + + backing = shmem_file_setup("SGX backing", encl_size + (encl_size >> 5), + VM_NORESERVE); + if (IS_ERR(backing)) { + ret = PTR_ERR(backing); + goto err_out_shrink; + } + + encl->backing = backing; + + secs_epc = sgx_alloc_epc_page(&encl->secs, true); + if (IS_ERR(secs_epc)) { + ret = PTR_ERR(secs_epc); + goto err_out_backing; + } + + encl->secs.epc_page = secs_epc; + + pginfo.addr = 0; + pginfo.contents = (unsigned long)secs; + pginfo.metadata = (unsigned long)&secinfo; + pginfo.secs = 0; + memset(&secinfo, 0, sizeof(secinfo)); + + ret = __ecreate((void *)&pginfo, sgx_get_epc_virt_addr(secs_epc)); + if (ret) { + ret = -EIO; + goto err_out; + } + + if (secs->attributes & SGX_ATTR_DEBUG) + set_bit(SGX_ENCL_DEBUG, &encl->flags); + + encl->secs.encl = encl; + encl->secs.type = SGX_PAGE_TYPE_SECS; + encl->base = secs->base; + encl->size = secs->size; + encl->attributes = secs->attributes; + encl->attributes_mask = SGX_ATTR_DEBUG | SGX_ATTR_MODE64BIT | SGX_ATTR_KSS; + + /* Set only after completion, as encl->lock has not been taken. 
*/ + set_bit(SGX_ENCL_CREATED, &encl->flags); + + return 0; + +err_out: + sgx_encl_free_epc_page(encl->secs.epc_page); + encl->secs.epc_page = NULL; + +err_out_backing: + fput(encl->backing); + encl->backing = NULL; + +err_out_shrink: + sgx_encl_shrink(encl, va_page); + + return ret; +} + +/** + * sgx_ioc_enclave_create() - handler for %SGX_IOC_ENCLAVE_CREATE + * @encl: An enclave pointer. + * @arg: The ioctl argument. + * + * Allocate kernel data structures for the enclave and invoke ECREATE. + * + * Return: + * - 0: Success. + * - -EIO: ECREATE failed. + * - -errno: POSIX error. + */ +static long sgx_ioc_enclave_create(struct sgx_encl *encl, void __user *arg) +{ + struct sgx_enclave_create create_arg; + void *secs; + int ret; + + if (test_bit(SGX_ENCL_CREATED, &encl->flags)) + return -EINVAL; + + if (copy_from_user(&create_arg, arg, sizeof(create_arg))) + return -EFAULT; + + secs = kmalloc(PAGE_SIZE, GFP_KERNEL); + if (!secs) + return -ENOMEM; + + if (copy_from_user(secs, (void __user *)create_arg.src, PAGE_SIZE)) + ret = -EFAULT; + else + ret = sgx_encl_create(encl, secs); + + kfree(secs); + return ret; +} + +static int sgx_validate_secinfo(struct sgx_secinfo *secinfo) +{ + u64 perm = secinfo->flags & SGX_SECINFO_PERMISSION_MASK; + u64 pt = secinfo->flags & SGX_SECINFO_PAGE_TYPE_MASK; + + if (pt != SGX_SECINFO_REG && pt != SGX_SECINFO_TCS) + return -EINVAL; + + if ((perm & SGX_SECINFO_W) && !(perm & SGX_SECINFO_R)) + return -EINVAL; + + /* + * CPU will silently overwrite the permissions as zero, which means + * that we need to validate it ourselves. 
+ */ + if (pt == SGX_SECINFO_TCS && perm) + return -EINVAL; + + if (secinfo->flags & SGX_SECINFO_RESERVED_MASK) + return -EINVAL; + + if (memchr_inv(secinfo->reserved, 0, sizeof(secinfo->reserved))) + return -EINVAL; + + return 0; +} + +static int __sgx_encl_add_page(struct sgx_encl *encl, + struct sgx_encl_page *encl_page, + struct sgx_epc_page *epc_page, + struct sgx_secinfo *secinfo, unsigned long src) +{ + struct sgx_pageinfo pginfo; + struct vm_area_struct *vma; + struct page *src_page; + int ret; + + /* Deny noexec. */ + vma = find_vma(current->mm, src); + if (!vma) + return -EFAULT; + + if (!(vma->vm_flags & VM_MAYEXEC)) + return -EACCES; + + ret = get_user_pages(src, 1, 0, &src_page, NULL); + if (ret < 1) + return -EFAULT; + + pginfo.secs = (unsigned long)sgx_get_epc_virt_addr(encl->secs.epc_page); + pginfo.addr = encl_page->desc & PAGE_MASK; + pginfo.metadata = (unsigned long)secinfo; + pginfo.contents = (unsigned long)kmap_atomic(src_page); + + ret = __eadd(&pginfo, sgx_get_epc_virt_addr(epc_page)); + + kunmap_atomic((void *)pginfo.contents); + put_page(src_page); + + return ret ? -EIO : 0; +} + +/* + * If the caller requires measurement of the page as a proof for the content, + * use EEXTEND to add a measurement for 256 bytes of the page. Repeat this + * operation until the entire page is measured. 
+ */ +static int __sgx_encl_extend(struct sgx_encl *encl, + struct sgx_epc_page *epc_page) +{ + unsigned long offset; + int ret; + + for (offset = 0; offset < PAGE_SIZE; offset += SGX_EEXTEND_BLOCK_SIZE) { + ret = __eextend(sgx_get_epc_virt_addr(encl->secs.epc_page), + sgx_get_epc_virt_addr(epc_page) + offset); + if (ret) { + if (encls_failed(ret)) + ENCLS_WARN(ret, "EEXTEND"); + + return -EIO; + } + } + + return 0; +} + +static int sgx_encl_add_page(struct sgx_encl *encl, unsigned long src, + unsigned long offset, struct sgx_secinfo *secinfo, + unsigned long flags) +{ + struct sgx_encl_page *encl_page; + struct sgx_epc_page *epc_page; + struct sgx_va_page *va_page; + int ret; + + encl_page = sgx_encl_page_alloc(encl, offset, secinfo->flags); + if (IS_ERR(encl_page)) + return PTR_ERR(encl_page); + + epc_page = sgx_alloc_epc_page(encl_page, true); + if (IS_ERR(epc_page)) { + kfree(encl_page); + return PTR_ERR(epc_page); + } + + va_page = sgx_encl_grow(encl, true); + if (IS_ERR(va_page)) { + ret = PTR_ERR(va_page); + goto err_out_free; + } + + mmap_read_lock(current->mm); + mutex_lock(&encl->lock); + + /* + * Adding to encl->va_pages must be done under encl->lock. Ditto for + * deleting (via sgx_encl_shrink()) in the error path. + */ + if (va_page) + list_add(&va_page->list, &encl->va_pages); + + /* + * Insert prior to EADD in case of OOM. EADD modifies MRENCLAVE, i.e. + * can't be gracefully unwound, while failure on EADD/EXTEND is limited + * to userspace errors (or kernel/hardware bugs). + */ + ret = xa_insert(&encl->page_array, PFN_DOWN(encl_page->desc), + encl_page, GFP_KERNEL); + if (ret) + goto err_out_unlock; + + ret = __sgx_encl_add_page(encl, encl_page, epc_page, secinfo, + src); + if (ret) + goto err_out; + + /* + * Complete the "add" before doing the "extend" so that the "add" + * isn't in a half-baked state in the extremely unlikely scenario + * the enclave will be destroyed in response to EEXTEND failure. 
+ */ + encl_page->encl = encl; + encl_page->epc_page = epc_page; + encl_page->type = (secinfo->flags & SGX_SECINFO_PAGE_TYPE_MASK) >> 8; + encl->secs_child_cnt++; + + if (flags & SGX_PAGE_MEASURE) { + ret = __sgx_encl_extend(encl, epc_page); + if (ret) + goto err_out; + } + + sgx_mark_page_reclaimable(encl_page->epc_page); + mutex_unlock(&encl->lock); + mmap_read_unlock(current->mm); + return ret; + +err_out: + xa_erase(&encl->page_array, PFN_DOWN(encl_page->desc)); + +err_out_unlock: + sgx_encl_shrink(encl, va_page); + mutex_unlock(&encl->lock); + mmap_read_unlock(current->mm); + +err_out_free: + sgx_encl_free_epc_page(epc_page); + kfree(encl_page); + + return ret; +} + +/* + * Ensure user provided offset and length values are valid for + * an enclave. + */ +static int sgx_validate_offset_length(struct sgx_encl *encl, + unsigned long offset, + unsigned long length) +{ + if (!IS_ALIGNED(offset, PAGE_SIZE)) + return -EINVAL; + + if (!length || !IS_ALIGNED(length, PAGE_SIZE)) + return -EINVAL; + + if (offset + length - PAGE_SIZE >= encl->size) + return -EINVAL; + + return 0; +} + +/** + * sgx_ioc_enclave_add_pages() - The handler for %SGX_IOC_ENCLAVE_ADD_PAGES + * @encl: an enclave pointer + * @arg: a user pointer to a struct sgx_enclave_add_pages instance + * + * Add one or more pages to an uninitialized enclave, and optionally extend the + * measurement with the contents of the page. The SECINFO and measurement mask + * are applied to all pages. + * + * A SECINFO for a TCS is required to always contain zero permissions because + * CPU silently zeros them. Allowing anything else would cause a mismatch in + * the measurement. + * + * mmap()'s protection bits are capped by the page permissions. For each page + * address, the maximum protection bits are computed with the following + * heuristics: + * + * 1. A regular page: PROT_R, PROT_W and PROT_X match the SECINFO permissions. + * 2. A TCS page: PROT_R | PROT_W. 
+ * + * mmap() is not allowed to surpass the minimum of the maximum protection bits + * within the given address range. + * + * The function deinitializes kernel data structures for enclave and returns + * -EIO in any of the following conditions: + * + * - Enclave Page Cache (EPC), the physical memory holding enclaves, has + * been invalidated. This will cause EADD and EEXTEND to fail. + * - If the source address is corrupted somehow when executing EADD. + * + * Return: + * - 0: Success. + * - -EACCES: The source page is located in a noexec partition. + * - -ENOMEM: Out of EPC pages. + * - -EINTR: The call was interrupted before data was processed. + * - -EIO: Either EADD or EEXTEND failed because invalid source address + * or power cycle. + * - -errno: POSIX error. + */ +static long sgx_ioc_enclave_add_pages(struct sgx_encl *encl, void __user *arg) +{ + struct sgx_enclave_add_pages add_arg; + struct sgx_secinfo secinfo; + unsigned long c; + int ret; + + if (!test_bit(SGX_ENCL_CREATED, &encl->flags) || + test_bit(SGX_ENCL_INITIALIZED, &encl->flags)) + return -EINVAL; + + if (copy_from_user(&add_arg, arg, sizeof(add_arg))) + return -EFAULT; + + if (!IS_ALIGNED(add_arg.src, PAGE_SIZE)) + return -EINVAL; + + if (sgx_validate_offset_length(encl, add_arg.offset, add_arg.length)) + return -EINVAL; + + if (copy_from_user(&secinfo, (void __user *)add_arg.secinfo, + sizeof(secinfo))) + return -EFAULT; + + if (sgx_validate_secinfo(&secinfo)) + return -EINVAL; + + for (c = 0 ; c < add_arg.length; c += PAGE_SIZE) { + if (signal_pending(current)) { + if (!c) + ret = -ERESTARTSYS; + + break; + } + + if (need_resched()) + cond_resched(); + + ret = sgx_encl_add_page(encl, add_arg.src + c, add_arg.offset + c, + &secinfo, add_arg.flags); + if (ret) + break; + } + + add_arg.count = c; + + if (copy_to_user(arg, &add_arg, sizeof(add_arg))) + return -EFAULT; + + return ret; +} + +static int __sgx_get_key_hash(struct crypto_shash *tfm, const void *modulus, + void *hash) +{ + 
SHASH_DESC_ON_STACK(shash, tfm); + + shash->tfm = tfm; + + return crypto_shash_digest(shash, modulus, SGX_MODULUS_SIZE, hash); +} + +static int sgx_get_key_hash(const void *modulus, void *hash) +{ + struct crypto_shash *tfm; + int ret; + + tfm = crypto_alloc_shash("sha256", 0, CRYPTO_ALG_ASYNC); + if (IS_ERR(tfm)) + return PTR_ERR(tfm); + + ret = __sgx_get_key_hash(tfm, modulus, hash); + + crypto_free_shash(tfm); + return ret; +} + +static int sgx_encl_init(struct sgx_encl *encl, struct sgx_sigstruct *sigstruct, + void *token) +{ + u64 mrsigner[4]; + int i, j; + void *addr; + int ret; + + /* + * Deny initializing enclaves with attributes (namely provisioning) + * that have not been explicitly allowed. + */ + if (encl->attributes & ~encl->attributes_mask) + return -EACCES; + + /* + * Attributes should not be enforced *only* against what's available on + * platform (done in sgx_encl_create) but checked and enforced against + * the mask for enforcement in sigstruct. For example an enclave could + * opt to sign with AVX bit in xfrm, but still be loadable on a platform + * without it if the sigstruct->body.attributes_mask does not turn that + * bit on. + */ + if (sigstruct->body.attributes & sigstruct->body.attributes_mask & + sgx_attributes_reserved_mask) + return -EINVAL; + + if (sigstruct->body.miscselect & sigstruct->body.misc_mask & + sgx_misc_reserved_mask) + return -EINVAL; + + if (sigstruct->body.xfrm & sigstruct->body.xfrm_mask & + sgx_xfrm_reserved_mask) + return -EINVAL; + + ret = sgx_get_key_hash(sigstruct->modulus, mrsigner); + if (ret) + return ret; + + mutex_lock(&encl->lock); + + /* + * ENCLS[EINIT] is interruptible because it has such a high latency, + * e.g. 50k+ cycles on success. If an IRQ/NMI/SMI becomes pending, + * EINIT may fail with SGX_UNMASKED_EVENT so that the event can be + * serviced. 
+ */ + for (i = 0; i < SGX_EINIT_SLEEP_COUNT; i++) { + for (j = 0; j < SGX_EINIT_SPIN_COUNT; j++) { + addr = sgx_get_epc_virt_addr(encl->secs.epc_page); + + preempt_disable(); + + sgx_update_lepubkeyhash(mrsigner); + + ret = __einit(sigstruct, token, addr); + + preempt_enable(); + + if (ret == SGX_UNMASKED_EVENT) + continue; + else + break; + } + + if (ret != SGX_UNMASKED_EVENT) + break; + + msleep_interruptible(SGX_EINIT_SLEEP_TIME); + + if (signal_pending(current)) { + ret = -ERESTARTSYS; + goto err_out; + } + } + + if (encls_faulted(ret)) { + if (encls_failed(ret)) + ENCLS_WARN(ret, "EINIT"); + + ret = -EIO; + } else if (ret) { + pr_debug("EINIT returned %d\n", ret); + ret = -EPERM; + } else { + set_bit(SGX_ENCL_INITIALIZED, &encl->flags); + } + +err_out: + mutex_unlock(&encl->lock); + return ret; +} + +/** + * sgx_ioc_enclave_init() - handler for %SGX_IOC_ENCLAVE_INIT + * @encl: an enclave pointer + * @arg: userspace pointer to a struct sgx_enclave_init instance + * + * Flush any outstanding enqueued EADD operations and perform EINIT. The + * Launch Enclave Public Key Hash MSRs are rewritten as necessary to match + * the enclave's MRSIGNER, which is calculated from the provided sigstruct. + * + * Return: + * - 0: Success. + * - -EPERM: Invalid SIGSTRUCT. + * - -EIO: EINIT failed because of a power cycle. + * - -errno: POSIX error. + */ +static long sgx_ioc_enclave_init(struct sgx_encl *encl, void __user *arg) +{ + struct sgx_sigstruct *sigstruct; + struct sgx_enclave_init init_arg; + void *token; + int ret; + + if (!test_bit(SGX_ENCL_CREATED, &encl->flags) || + test_bit(SGX_ENCL_INITIALIZED, &encl->flags)) + return -EINVAL; + + if (copy_from_user(&init_arg, arg, sizeof(init_arg))) + return -EFAULT; + + /* + * 'sigstruct' must be on a page boundary and 'token' on a 512 byte + * boundary. kmalloc() will give this alignment when allocating + * PAGE_SIZE bytes. 
+ */ + sigstruct = kmalloc(PAGE_SIZE, GFP_KERNEL); + if (!sigstruct) + return -ENOMEM; + + token = (void *)((unsigned long)sigstruct + PAGE_SIZE / 2); + memset(token, 0, SGX_LAUNCH_TOKEN_SIZE); + + if (copy_from_user(sigstruct, (void __user *)init_arg.sigstruct, + sizeof(*sigstruct))) { + ret = -EFAULT; + goto out; + } + + /* + * A legacy field used with Intel signed enclaves. These used to mean + * regular and architectural enclaves. The CPU only accepts these values + * but they do not have any other meaning. + * + * Thus, reject any other values. + */ + if (sigstruct->header.vendor != 0x0000 && + sigstruct->header.vendor != 0x8086) { + ret = -EINVAL; + goto out; + } + + ret = sgx_encl_init(encl, sigstruct, token); + +out: + kfree(sigstruct); + return ret; +} + +/** + * sgx_ioc_enclave_provision() - handler for %SGX_IOC_ENCLAVE_PROVISION + * @encl: an enclave pointer + * @arg: userspace pointer to a struct sgx_enclave_provision instance + * + * Allow ATTRIBUTE.PROVISION_KEY for an enclave by providing a file handle to + * /dev/sgx_provision. + * + * Return: + * - 0: Success. + * - -errno: Otherwise. + */ +static long sgx_ioc_enclave_provision(struct sgx_encl *encl, void __user *arg) +{ + struct sgx_enclave_provision params; + + if (copy_from_user(&params, arg, sizeof(params))) + return -EFAULT; + + return sgx_set_attribute(&encl->attributes_mask, params.fd); +} + +/* + * Ensure enclave is ready for SGX2 functions. Readiness is checked + * by ensuring the hardware supports SGX2 and the enclave is initialized + * and thus able to handle requests to modify pages within it. + */ +static int sgx_ioc_sgx2_ready(struct sgx_encl *encl) +{ + if (!(cpu_feature_enabled(X86_FEATURE_SGX2))) + return -ENODEV; + + if (!test_bit(SGX_ENCL_INITIALIZED, &encl->flags)) + return -EINVAL; + + return 0; +} + +/* + * Some SGX functions require that no cached linear-to-physical address + * mappings are present before they can succeed. 
Collaborate with + * hardware via ENCLS[ETRACK] to ensure that all cached + * linear-to-physical address mappings belonging to all threads of + * the enclave are cleared. See sgx_encl_cpumask() for details. + * + * Must be called with enclave's mutex held from the time the + * SGX function requiring that no cached linear-to-physical mappings + * are present is executed until this ETRACK flow is complete. + */ +static int sgx_enclave_etrack(struct sgx_encl *encl) +{ + void *epc_virt; + int ret; + + epc_virt = sgx_get_epc_virt_addr(encl->secs.epc_page); + ret = __etrack(epc_virt); + if (ret) { + /* + * ETRACK only fails when there is an OS issue. For + * example, two consecutive ETRACK was sent without + * completed IPI between. + */ + pr_err_once("ETRACK returned %d (0x%x)", ret, ret); + /* + * Send IPIs to kick CPUs out of the enclave and + * try ETRACK again. + */ + on_each_cpu_mask(sgx_encl_cpumask(encl), sgx_ipi_cb, NULL, 1); + ret = __etrack(epc_virt); + if (ret) { + pr_err_once("ETRACK repeat returned %d (0x%x)", + ret, ret); + return -EFAULT; + } + } + on_each_cpu_mask(sgx_encl_cpumask(encl), sgx_ipi_cb, NULL, 1); + + return 0; +} + +/** + * sgx_enclave_restrict_permissions() - Restrict EPCM permissions + * @encl: Enclave to which the pages belong. + * @modp: Checked parameters from user on which pages need modifying and + * their new permissions. + * + * Return: + * - 0: Success. + * - -errno: Otherwise. 
+ */ +static long +sgx_enclave_restrict_permissions(struct sgx_encl *encl, + struct sgx_enclave_restrict_permissions *modp) +{ + struct sgx_encl_page *entry; + struct sgx_secinfo secinfo; + unsigned long addr; + unsigned long c; + void *epc_virt; + int ret; + + memset(&secinfo, 0, sizeof(secinfo)); + secinfo.flags = modp->permissions & SGX_SECINFO_PERMISSION_MASK; + + for (c = 0 ; c < modp->length; c += PAGE_SIZE) { + addr = encl->base + modp->offset + c; + + sgx_reclaim_direct(); + + mutex_lock(&encl->lock); + + entry = sgx_encl_load_page(encl, addr); + if (IS_ERR(entry)) { + ret = PTR_ERR(entry) == -EBUSY ? -EAGAIN : -EFAULT; + goto out_unlock; + } + + /* + * Changing EPCM permissions is only supported on regular + * SGX pages. Attempting this change on other pages will + * result in #PF. + */ + if (entry->type != SGX_PAGE_TYPE_REG) { + ret = -EINVAL; + goto out_unlock; + } + + /* + * Apart from ensuring that read-access remains, do not verify + * the permission bits requested. Kernel has no control over + * how EPCM permissions can be relaxed from within the enclave. + * ENCLS[EMODPR] can only remove existing EPCM permissions, + * attempting to set new permissions will be ignored by the + * hardware. + */ + + /* Change EPCM permissions. */ + epc_virt = sgx_get_epc_virt_addr(entry->epc_page); + ret = __emodpr(&secinfo, epc_virt); + if (encls_faulted(ret)) { + /* + * All possible faults should be avoidable: + * parameters have been checked, will only change + * permissions of a regular page, and no concurrent + * SGX1/SGX2 ENCLS instructions since these + * are protected with mutex. 
+ */ + pr_err_once("EMODPR encountered exception %d\n", + ENCLS_TRAPNR(ret)); + ret = -EFAULT; + goto out_unlock; + } + if (encls_failed(ret)) { + modp->result = ret; + ret = -EFAULT; + goto out_unlock; + } + + ret = sgx_enclave_etrack(encl); + if (ret) { + ret = -EFAULT; + goto out_unlock; + } + + mutex_unlock(&encl->lock); + } + + ret = 0; + goto out; + +out_unlock: + mutex_unlock(&encl->lock); +out: + modp->count = c; + + return ret; +} + +/** + * sgx_ioc_enclave_restrict_permissions() - handler for + * %SGX_IOC_ENCLAVE_RESTRICT_PERMISSIONS + * @encl: an enclave pointer + * @arg: userspace pointer to a &struct sgx_enclave_restrict_permissions + * instance + * + * SGX2 distinguishes between relaxing and restricting the enclave page + * permissions maintained by the hardware (EPCM permissions) of pages + * belonging to an initialized enclave (after SGX_IOC_ENCLAVE_INIT). + * + * EPCM permissions cannot be restricted from within the enclave, the enclave + * requires the kernel to run the privileged level 0 instructions ENCLS[EMODPR] + * and ENCLS[ETRACK]. An attempt to relax EPCM permissions with this call + * will be ignored by the hardware. + * + * Return: + * - 0: Success + * - -errno: Otherwise + */ +static long sgx_ioc_enclave_restrict_permissions(struct sgx_encl *encl, + void __user *arg) +{ + struct sgx_enclave_restrict_permissions params; + long ret; + + ret = sgx_ioc_sgx2_ready(encl); + if (ret) + return ret; + + if (copy_from_user(&params, arg, sizeof(params))) + return -EFAULT; + + if (sgx_validate_offset_length(encl, params.offset, params.length)) + return -EINVAL; + + if (params.permissions & ~SGX_SECINFO_PERMISSION_MASK) + return -EINVAL; + + /* + * Fail early if invalid permissions requested to prevent ENCLS[EMODPR] + * from faulting later when the CPU does the same check. 
+ */ + if ((params.permissions & SGX_SECINFO_W) && + !(params.permissions & SGX_SECINFO_R)) + return -EINVAL; + + if (params.result || params.count) + return -EINVAL; + + ret = sgx_enclave_restrict_permissions(encl, &params); + + if (copy_to_user(arg, &params, sizeof(params))) + return -EFAULT; + + return ret; +} + +/** + * sgx_enclave_modify_types() - Modify type of SGX enclave pages + * @encl: Enclave to which the pages belong. + * @modt: Checked parameters from user about which pages need modifying + * and their new page type. + * + * Return: + * - 0: Success + * - -errno: Otherwise + */ +static long sgx_enclave_modify_types(struct sgx_encl *encl, + struct sgx_enclave_modify_types *modt) +{ + unsigned long max_prot_restore; + enum sgx_page_type page_type; + struct sgx_encl_page *entry; + struct sgx_secinfo secinfo; + unsigned long prot; + unsigned long addr; + unsigned long c; + void *epc_virt; + int ret; + + page_type = modt->page_type & SGX_PAGE_TYPE_MASK; + + /* + * The only new page types allowed by hardware are PT_TCS and PT_TRIM. + */ + if (page_type != SGX_PAGE_TYPE_TCS && page_type != SGX_PAGE_TYPE_TRIM) + return -EINVAL; + + memset(&secinfo, 0, sizeof(secinfo)); + + secinfo.flags = page_type << 8; + + for (c = 0 ; c < modt->length; c += PAGE_SIZE) { + addr = encl->base + modt->offset + c; + + sgx_reclaim_direct(); + + mutex_lock(&encl->lock); + + entry = sgx_encl_load_page(encl, addr); + if (IS_ERR(entry)) { + ret = PTR_ERR(entry) == -EBUSY ? -EAGAIN : -EFAULT; + goto out_unlock; + } + + /* + * Borrow the logic from the Intel SDM. Regular pages + * (SGX_PAGE_TYPE_REG) can change type to SGX_PAGE_TYPE_TCS + * or SGX_PAGE_TYPE_TRIM but TCS pages can only be trimmed. + * CET pages not supported yet. 
+ */ + if (!(entry->type == SGX_PAGE_TYPE_REG || + (entry->type == SGX_PAGE_TYPE_TCS && + page_type == SGX_PAGE_TYPE_TRIM))) { + ret = -EINVAL; + goto out_unlock; + } + + max_prot_restore = entry->vm_max_prot_bits; + + /* + * Once a regular page becomes a TCS page it cannot be + * changed back. So the maximum allowed protection reflects + * the TCS page that is always RW from kernel perspective but + * will be inaccessible from within enclave. Before doing + * so, do make sure that the new page type continues to + * respect the originally vetted page permissions. + */ + if (entry->type == SGX_PAGE_TYPE_REG && + page_type == SGX_PAGE_TYPE_TCS) { + if (~entry->vm_max_prot_bits & (VM_READ | VM_WRITE)) { + ret = -EPERM; + goto out_unlock; + } + prot = PROT_READ | PROT_WRITE; + entry->vm_max_prot_bits = calc_vm_prot_bits(prot, 0); + + /* + * Prevent page from being reclaimed while mutex + * is released. + */ + if (sgx_unmark_page_reclaimable(entry->epc_page)) { + ret = -EAGAIN; + goto out_entry_changed; + } + + /* + * Do not keep encl->lock because of dependency on + * mmap_lock acquired in sgx_zap_enclave_ptes(). + */ + mutex_unlock(&encl->lock); + + sgx_zap_enclave_ptes(encl, addr); + + mutex_lock(&encl->lock); + + sgx_mark_page_reclaimable(entry->epc_page); + } + + /* Change EPC type */ + epc_virt = sgx_get_epc_virt_addr(entry->epc_page); + ret = __emodt(&secinfo, epc_virt); + if (encls_faulted(ret)) { + /* + * All possible faults should be avoidable: + * parameters have been checked, will only change + * valid page types, and no concurrent + * SGX1/SGX2 ENCLS instructions since these are + * protected with mutex. 
+ */ + pr_err_once("EMODT encountered exception %d\n", + ENCLS_TRAPNR(ret)); + ret = -EFAULT; + goto out_entry_changed; + } + if (encls_failed(ret)) { + modt->result = ret; + ret = -EFAULT; + goto out_entry_changed; + } + + ret = sgx_enclave_etrack(encl); + if (ret) { + ret = -EFAULT; + goto out_unlock; + } + + entry->type = page_type; + + mutex_unlock(&encl->lock); + } + + ret = 0; + goto out; + +out_entry_changed: + entry->vm_max_prot_bits = max_prot_restore; +out_unlock: + mutex_unlock(&encl->lock); +out: + modt->count = c; + + return ret; +} + +/** + * sgx_ioc_enclave_modify_types() - handler for %SGX_IOC_ENCLAVE_MODIFY_TYPES + * @encl: an enclave pointer + * @arg: userspace pointer to a &struct sgx_enclave_modify_types instance + * + * Ability to change the enclave page type supports the following use cases: + * + * * It is possible to add TCS pages to an enclave by changing the type of + * regular pages (%SGX_PAGE_TYPE_REG) to TCS (%SGX_PAGE_TYPE_TCS) pages. + * With this support the number of threads supported by an initialized + * enclave can be increased dynamically. + * + * * Regular or TCS pages can dynamically be removed from an initialized + * enclave by changing the page type to %SGX_PAGE_TYPE_TRIM. Changing the + * page type to %SGX_PAGE_TYPE_TRIM marks the page for removal with actual + * removal done by handler of %SGX_IOC_ENCLAVE_REMOVE_PAGES ioctl() called + * after ENCLU[EACCEPT] is run on %SGX_PAGE_TYPE_TRIM page from within the + * enclave. 
+ * + * Return: + * - 0: Success + * - -errno: Otherwise + */ +static long sgx_ioc_enclave_modify_types(struct sgx_encl *encl, + void __user *arg) +{ + struct sgx_enclave_modify_types params; + long ret; + + ret = sgx_ioc_sgx2_ready(encl); + if (ret) + return ret; + + if (copy_from_user(&params, arg, sizeof(params))) + return -EFAULT; + + if (sgx_validate_offset_length(encl, params.offset, params.length)) + return -EINVAL; + + if (params.page_type & ~SGX_PAGE_TYPE_MASK) + return -EINVAL; + + if (params.result || params.count) + return -EINVAL; + + ret = sgx_enclave_modify_types(encl, &params); + + if (copy_to_user(arg, &params, sizeof(params))) + return -EFAULT; + + return ret; +} + +/** + * sgx_encl_remove_pages() - Remove trimmed pages from SGX enclave + * @encl: Enclave to which the pages belong + * @params: Checked parameters from user on which pages need to be removed + * + * Return: + * - 0: Success. + * - -errno: Otherwise. + */ +static long sgx_encl_remove_pages(struct sgx_encl *encl, + struct sgx_enclave_remove_pages *params) +{ + struct sgx_encl_page *entry; + struct sgx_secinfo secinfo; + unsigned long addr; + unsigned long c; + void *epc_virt; + int ret; + + memset(&secinfo, 0, sizeof(secinfo)); + secinfo.flags = SGX_SECINFO_R | SGX_SECINFO_W | SGX_SECINFO_X; + + for (c = 0 ; c < params->length; c += PAGE_SIZE) { + addr = encl->base + params->offset + c; + + sgx_reclaim_direct(); + + mutex_lock(&encl->lock); + + entry = sgx_encl_load_page(encl, addr); + if (IS_ERR(entry)) { + ret = PTR_ERR(entry) == -EBUSY ? -EAGAIN : -EFAULT; + goto out_unlock; + } + + if (entry->type != SGX_PAGE_TYPE_TRIM) { + ret = -EPERM; + goto out_unlock; + } + + /* + * ENCLS[EMODPR] is a no-op instruction used to inform if + * ENCLU[EACCEPT] was run from within the enclave. If + * ENCLS[EMODPR] is run with RWX on a trimmed page that is + * not yet accepted then it will return + * %SGX_PAGE_NOT_MODIFIABLE, after the trimmed page is + * accepted the instruction will encounter a page fault. 
+ */ + epc_virt = sgx_get_epc_virt_addr(entry->epc_page); + ret = __emodpr(&secinfo, epc_virt); + if (!encls_faulted(ret) || ENCLS_TRAPNR(ret) != X86_TRAP_PF) { + ret = -EPERM; + goto out_unlock; + } + + if (sgx_unmark_page_reclaimable(entry->epc_page)) { + ret = -EBUSY; + goto out_unlock; + } + + /* + * Do not keep encl->lock because of dependency on + * mmap_lock acquired in sgx_zap_enclave_ptes(). + */ + mutex_unlock(&encl->lock); + + sgx_zap_enclave_ptes(encl, addr); + + mutex_lock(&encl->lock); + + sgx_encl_free_epc_page(entry->epc_page); + encl->secs_child_cnt--; + entry->epc_page = NULL; + xa_erase(&encl->page_array, PFN_DOWN(entry->desc)); + sgx_encl_shrink(encl, NULL); + kfree(entry); + + mutex_unlock(&encl->lock); + } + + ret = 0; + goto out; + +out_unlock: + mutex_unlock(&encl->lock); +out: + params->count = c; + + return ret; +} + +/** + * sgx_ioc_enclave_remove_pages() - handler for %SGX_IOC_ENCLAVE_REMOVE_PAGES + * @encl: an enclave pointer + * @arg: userspace pointer to &struct sgx_enclave_remove_pages instance + * + * Final step of the flow removing pages from an initialized enclave. The + * complete flow is: + * + * 1) User changes the type of the pages to be removed to %SGX_PAGE_TYPE_TRIM + * using the %SGX_IOC_ENCLAVE_MODIFY_TYPES ioctl(). + * 2) User approves the page removal by running ENCLU[EACCEPT] from within + * the enclave. + * 3) User initiates actual page removal using the + * %SGX_IOC_ENCLAVE_REMOVE_PAGES ioctl() that is handled here. + * + * First remove any page table entries pointing to the page and then proceed + * with the actual removal of the enclave page and data in support of it. + * + * VA pages are not affected by this removal. It is thus possible that the + * enclave may end up with more VA pages than needed to support all its + * pages. 
+ * + * Return: + * - 0: Success + * - -errno: Otherwise + */ +static long sgx_ioc_enclave_remove_pages(struct sgx_encl *encl, + void __user *arg) +{ + struct sgx_enclave_remove_pages params; + long ret; + + ret = sgx_ioc_sgx2_ready(encl); + if (ret) + return ret; + + if (copy_from_user(&params, arg, sizeof(params))) + return -EFAULT; + + if (sgx_validate_offset_length(encl, params.offset, params.length)) + return -EINVAL; + + if (params.count) + return -EINVAL; + + ret = sgx_encl_remove_pages(encl, &params); + + if (copy_to_user(arg, &params, sizeof(params))) + return -EFAULT; + + return ret; +} + +long sgx_ioctl(struct file *filep, unsigned int cmd, unsigned long arg) +{ + struct sgx_encl *encl = filep->private_data; + int ret; + + if (test_and_set_bit(SGX_ENCL_IOCTL, &encl->flags)) + return -EBUSY; + + switch (cmd) { + case SGX_IOC_ENCLAVE_CREATE: + ret = sgx_ioc_enclave_create(encl, (void __user *)arg); + break; + case SGX_IOC_ENCLAVE_ADD_PAGES: + ret = sgx_ioc_enclave_add_pages(encl, (void __user *)arg); + break; + case SGX_IOC_ENCLAVE_INIT: + ret = sgx_ioc_enclave_init(encl, (void __user *)arg); + break; + case SGX_IOC_ENCLAVE_PROVISION: + ret = sgx_ioc_enclave_provision(encl, (void __user *)arg); + break; + case SGX_IOC_ENCLAVE_RESTRICT_PERMISSIONS: + ret = sgx_ioc_enclave_restrict_permissions(encl, + (void __user *)arg); + break; + case SGX_IOC_ENCLAVE_MODIFY_TYPES: + ret = sgx_ioc_enclave_modify_types(encl, (void __user *)arg); + break; + case SGX_IOC_ENCLAVE_REMOVE_PAGES: + ret = sgx_ioc_enclave_remove_pages(encl, (void __user *)arg); + break; + default: + ret = -ENOIOCTLCMD; + break; + } + + clear_bit(SGX_ENCL_IOCTL, &encl->flags); + return ret; +} diff --git a/arch/x86/kernel/cpu/sgx/main.c b/arch/x86/kernel/cpu/sgx/main.c new file mode 100644 index 000000000000..0aad028f04d4 --- /dev/null +++ b/arch/x86/kernel/cpu/sgx/main.c @@ -0,0 +1,963 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Copyright(c) 2016-20 Intel Corporation. 
*/ + +#include <linux/file.h> +#include <linux/freezer.h> +#include <linux/highmem.h> +#include <linux/kthread.h> +#include <linux/miscdevice.h> +#include <linux/node.h> +#include <linux/pagemap.h> +#include <linux/ratelimit.h> +#include <linux/sched/mm.h> +#include <linux/sched/signal.h> +#include <linux/slab.h> +#include <linux/sysfs.h> +#include <asm/sgx.h> +#include "driver.h" +#include "encl.h" +#include "encls.h" + +struct sgx_epc_section sgx_epc_sections[SGX_MAX_EPC_SECTIONS]; +static int sgx_nr_epc_sections; +static struct task_struct *ksgxd_tsk; +static DECLARE_WAIT_QUEUE_HEAD(ksgxd_waitq); +static DEFINE_XARRAY(sgx_epc_address_space); + +/* + * These variables are part of the state of the reclaimer, and must be accessed + * with sgx_reclaimer_lock acquired. + */ +static LIST_HEAD(sgx_active_page_list); +static DEFINE_SPINLOCK(sgx_reclaimer_lock); + +static atomic_long_t sgx_nr_free_pages = ATOMIC_LONG_INIT(0); + +/* Nodes with one or more EPC sections. */ +static nodemask_t sgx_numa_mask; + +/* + * Array with one list_head for each possible NUMA node. Each + * list contains all the sgx_epc_section's which are on that + * node. + */ +static struct sgx_numa_node *sgx_numa_nodes; + +static LIST_HEAD(sgx_dirty_page_list); + +/* + * Reset post-kexec EPC pages to the uninitialized state. The pages are removed + * from the input list, and made available for the page allocator. SECS pages + * prepending their children in the input list are left intact. + * + * Return 0 when sanitization was successful or kthread was stopped, and the + * number of unsanitized pages otherwise. 
+ */ +static unsigned long __sgx_sanitize_pages(struct list_head *dirty_page_list) +{ + unsigned long left_dirty = 0; + struct sgx_epc_page *page; + LIST_HEAD(dirty); + int ret; + + /* dirty_page_list is thread-local, no need for a lock: */ + while (!list_empty(dirty_page_list)) { + if (kthread_should_stop()) + return 0; + + page = list_first_entry(dirty_page_list, struct sgx_epc_page, list); + + /* + * Checking page->poison without holding the node->lock + * is racy, but losing the race (i.e. poison is set just + * after the check) just means __eremove() will be uselessly + * called for a page that sgx_free_epc_page() will put onto + * the node->sgx_poison_page_list later. + */ + if (page->poison) { + struct sgx_epc_section *section = &sgx_epc_sections[page->section]; + struct sgx_numa_node *node = section->node; + + spin_lock(&node->lock); + list_move(&page->list, &node->sgx_poison_page_list); + spin_unlock(&node->lock); + + continue; + } + + ret = __eremove(sgx_get_epc_virt_addr(page)); + if (!ret) { + /* + * page is now sanitized. Make it available via the SGX + * page allocator: + */ + list_del(&page->list); + sgx_free_epc_page(page); + } else { + /* The page is not yet clean - move to the dirty list. 
*/ + list_move_tail(&page->list, &dirty); + left_dirty++; + } + + cond_resched(); + } + + list_splice(&dirty, dirty_page_list); + return left_dirty; +} + +static bool sgx_reclaimer_age(struct sgx_epc_page *epc_page) +{ + struct sgx_encl_page *page = epc_page->owner; + struct sgx_encl *encl = page->encl; + struct sgx_encl_mm *encl_mm; + bool ret = true; + int idx; + + idx = srcu_read_lock(&encl->srcu); + + list_for_each_entry_rcu(encl_mm, &encl->mm_list, list) { + if (!mmget_not_zero(encl_mm->mm)) + continue; + + mmap_read_lock(encl_mm->mm); + ret = !sgx_encl_test_and_clear_young(encl_mm->mm, page); + mmap_read_unlock(encl_mm->mm); + + mmput_async(encl_mm->mm); + + if (!ret) + break; + } + + srcu_read_unlock(&encl->srcu, idx); + + if (!ret) + return false; + + return true; +} + +static void sgx_reclaimer_block(struct sgx_epc_page *epc_page) +{ + struct sgx_encl_page *page = epc_page->owner; + unsigned long addr = page->desc & PAGE_MASK; + struct sgx_encl *encl = page->encl; + int ret; + + sgx_zap_enclave_ptes(encl, addr); + + mutex_lock(&encl->lock); + + ret = __eblock(sgx_get_epc_virt_addr(epc_page)); + if (encls_failed(ret)) + ENCLS_WARN(ret, "EBLOCK"); + + mutex_unlock(&encl->lock); +} + +static int __sgx_encl_ewb(struct sgx_epc_page *epc_page, void *va_slot, + struct sgx_backing *backing) +{ + struct sgx_pageinfo pginfo; + int ret; + + pginfo.addr = 0; + pginfo.secs = 0; + + pginfo.contents = (unsigned long)kmap_atomic(backing->contents); + pginfo.metadata = (unsigned long)kmap_atomic(backing->pcmd) + + backing->pcmd_offset; + + ret = __ewb(&pginfo, sgx_get_epc_virt_addr(epc_page), va_slot); + set_page_dirty(backing->pcmd); + set_page_dirty(backing->contents); + + kunmap_atomic((void *)(unsigned long)(pginfo.metadata - + backing->pcmd_offset)); + kunmap_atomic((void *)(unsigned long)pginfo.contents); + + return ret; +} + +void sgx_ipi_cb(void *info) +{ +} + +/* + * Swap page to the regular memory transformed to the blocked state by using + * EBLOCK, which means 
that it can no longer be referenced (no new TLB entries). + * + * The first trial just tries to write the page assuming that some other thread + * has reset the count for threads inside the enclave by using ETRACK, and + * previous thread count has been zeroed out. The second trial calls ETRACK + * before EWB. If that fails we kick all the HW threads out, and then do EWB, + * which should be guaranteed the succeed. + */ +static void sgx_encl_ewb(struct sgx_epc_page *epc_page, + struct sgx_backing *backing) +{ + struct sgx_encl_page *encl_page = epc_page->owner; + struct sgx_encl *encl = encl_page->encl; + struct sgx_va_page *va_page; + unsigned int va_offset; + void *va_slot; + int ret; + + encl_page->desc &= ~SGX_ENCL_PAGE_BEING_RECLAIMED; + + va_page = list_first_entry(&encl->va_pages, struct sgx_va_page, + list); + va_offset = sgx_alloc_va_slot(va_page); + va_slot = sgx_get_epc_virt_addr(va_page->epc_page) + va_offset; + if (sgx_va_page_full(va_page)) + list_move_tail(&va_page->list, &encl->va_pages); + + ret = __sgx_encl_ewb(epc_page, va_slot, backing); + if (ret == SGX_NOT_TRACKED) { + ret = __etrack(sgx_get_epc_virt_addr(encl->secs.epc_page)); + if (ret) { + if (encls_failed(ret)) + ENCLS_WARN(ret, "ETRACK"); + } + + ret = __sgx_encl_ewb(epc_page, va_slot, backing); + if (ret == SGX_NOT_TRACKED) { + /* + * Slow path, send IPIs to kick cpus out of the + * enclave. Note, it's imperative that the cpu + * mask is generated *after* ETRACK, else we'll + * miss cpus that entered the enclave between + * generating the mask and incrementing epoch. 
+ */ + on_each_cpu_mask(sgx_encl_cpumask(encl), + sgx_ipi_cb, NULL, 1); + ret = __sgx_encl_ewb(epc_page, va_slot, backing); + } + } + + if (ret) { + if (encls_failed(ret)) + ENCLS_WARN(ret, "EWB"); + + sgx_free_va_slot(va_page, va_offset); + } else { + encl_page->desc |= va_offset; + encl_page->va_page = va_page; + } +} + +static void sgx_reclaimer_write(struct sgx_epc_page *epc_page, + struct sgx_backing *backing) +{ + struct sgx_encl_page *encl_page = epc_page->owner; + struct sgx_encl *encl = encl_page->encl; + struct sgx_backing secs_backing; + int ret; + + mutex_lock(&encl->lock); + + sgx_encl_ewb(epc_page, backing); + encl_page->epc_page = NULL; + encl->secs_child_cnt--; + sgx_encl_put_backing(backing); + + if (!encl->secs_child_cnt && test_bit(SGX_ENCL_INITIALIZED, &encl->flags)) { + ret = sgx_encl_alloc_backing(encl, PFN_DOWN(encl->size), + &secs_backing); + if (ret) + goto out; + + sgx_encl_ewb(encl->secs.epc_page, &secs_backing); + + sgx_encl_free_epc_page(encl->secs.epc_page); + encl->secs.epc_page = NULL; + + sgx_encl_put_backing(&secs_backing); + } + +out: + mutex_unlock(&encl->lock); +} + +/* + * Take a fixed number of pages from the head of the active page pool and + * reclaim them to the enclave's private shmem files. Skip the pages, which have + * been accessed since the last scan. Move those pages to the tail of active + * page pool so that the pages get scanned in LRU like fashion. + * + * Batch process a chunk of pages (at the moment 16) in order to degrade amount + * of IPI's and ETRACK's potentially required. sgx_encl_ewb() does degrade a bit + * among the HW threads with three stage EWB pipeline (EWB, ETRACK + EWB and IPI + * + EWB) but not sufficiently. Reclaiming one page at a time would also be + * problematic as it would increase the lock contention too much, which would + * halt forward progress. 
+ */ +static void sgx_reclaim_pages(void) +{ + struct sgx_epc_page *chunk[SGX_NR_TO_SCAN]; + struct sgx_backing backing[SGX_NR_TO_SCAN]; + struct sgx_encl_page *encl_page; + struct sgx_epc_page *epc_page; + pgoff_t page_index; + int cnt = 0; + int ret; + int i; + + spin_lock(&sgx_reclaimer_lock); + for (i = 0; i < SGX_NR_TO_SCAN; i++) { + if (list_empty(&sgx_active_page_list)) + break; + + epc_page = list_first_entry(&sgx_active_page_list, + struct sgx_epc_page, list); + list_del_init(&epc_page->list); + encl_page = epc_page->owner; + + if (kref_get_unless_zero(&encl_page->encl->refcount) != 0) + chunk[cnt++] = epc_page; + else + /* The owner is freeing the page. No need to add the + * page back to the list of reclaimable pages. + */ + epc_page->flags &= ~SGX_EPC_PAGE_RECLAIMER_TRACKED; + } + spin_unlock(&sgx_reclaimer_lock); + + for (i = 0; i < cnt; i++) { + epc_page = chunk[i]; + encl_page = epc_page->owner; + + if (!sgx_reclaimer_age(epc_page)) + goto skip; + + page_index = PFN_DOWN(encl_page->desc - encl_page->encl->base); + + mutex_lock(&encl_page->encl->lock); + ret = sgx_encl_alloc_backing(encl_page->encl, page_index, &backing[i]); + if (ret) { + mutex_unlock(&encl_page->encl->lock); + goto skip; + } + + encl_page->desc |= SGX_ENCL_PAGE_BEING_RECLAIMED; + mutex_unlock(&encl_page->encl->lock); + continue; + +skip: + spin_lock(&sgx_reclaimer_lock); + list_add_tail(&epc_page->list, &sgx_active_page_list); + spin_unlock(&sgx_reclaimer_lock); + + kref_put(&encl_page->encl->refcount, sgx_encl_release); + + chunk[i] = NULL; + } + + for (i = 0; i < cnt; i++) { + epc_page = chunk[i]; + if (epc_page) + sgx_reclaimer_block(epc_page); + } + + for (i = 0; i < cnt; i++) { + epc_page = chunk[i]; + if (!epc_page) + continue; + + encl_page = epc_page->owner; + sgx_reclaimer_write(epc_page, &backing[i]); + + kref_put(&encl_page->encl->refcount, sgx_encl_release); + epc_page->flags &= ~SGX_EPC_PAGE_RECLAIMER_TRACKED; + + sgx_free_epc_page(epc_page); + } +} + +static bool 
sgx_should_reclaim(unsigned long watermark) +{ + return atomic_long_read(&sgx_nr_free_pages) < watermark && + !list_empty(&sgx_active_page_list); +} + +/* + * sgx_reclaim_direct() should be called (without enclave's mutex held) + * in locations where SGX memory resources might be low and might be + * needed in order to make forward progress. + */ +void sgx_reclaim_direct(void) +{ + if (sgx_should_reclaim(SGX_NR_LOW_PAGES)) + sgx_reclaim_pages(); +} + +static int ksgxd(void *p) +{ + set_freezable(); + + /* + * Sanitize pages in order to recover from kexec(). The 2nd pass is + * required for SECS pages, whose child pages blocked EREMOVE. + */ + __sgx_sanitize_pages(&sgx_dirty_page_list); + WARN_ON(__sgx_sanitize_pages(&sgx_dirty_page_list)); + + while (!kthread_should_stop()) { + if (try_to_freeze()) + continue; + + wait_event_freezable(ksgxd_waitq, + kthread_should_stop() || + sgx_should_reclaim(SGX_NR_HIGH_PAGES)); + + if (sgx_should_reclaim(SGX_NR_HIGH_PAGES)) + sgx_reclaim_pages(); + + cond_resched(); + } + + return 0; +} + +static bool __init sgx_page_reclaimer_init(void) +{ + struct task_struct *tsk; + + tsk = kthread_run(ksgxd, NULL, "ksgxd"); + if (IS_ERR(tsk)) + return false; + + ksgxd_tsk = tsk; + + return true; +} + +bool current_is_ksgxd(void) +{ + return current == ksgxd_tsk; +} + +static struct sgx_epc_page *__sgx_alloc_epc_page_from_node(int nid) +{ + struct sgx_numa_node *node = &sgx_numa_nodes[nid]; + struct sgx_epc_page *page = NULL; + + spin_lock(&node->lock); + + if (list_empty(&node->free_page_list)) { + spin_unlock(&node->lock); + return NULL; + } + + page = list_first_entry(&node->free_page_list, struct sgx_epc_page, list); + list_del_init(&page->list); + page->flags = 0; + + spin_unlock(&node->lock); + atomic_long_dec(&sgx_nr_free_pages); + + return page; +} + +/** + * __sgx_alloc_epc_page() - Allocate an EPC page + * + * Iterate through NUMA nodes and reserve ia free EPC page to the caller. 
Start + * from the NUMA node, where the caller is executing. + * + * Return: + * - an EPC page: A borrowed EPC pages were available. + * - NULL: Out of EPC pages. + */ +struct sgx_epc_page *__sgx_alloc_epc_page(void) +{ + struct sgx_epc_page *page; + int nid_of_current = numa_node_id(); + int nid = nid_of_current; + + if (node_isset(nid_of_current, sgx_numa_mask)) { + page = __sgx_alloc_epc_page_from_node(nid_of_current); + if (page) + return page; + } + + /* Fall back to the non-local NUMA nodes: */ + while (true) { + nid = next_node_in(nid, sgx_numa_mask); + if (nid == nid_of_current) + break; + + page = __sgx_alloc_epc_page_from_node(nid); + if (page) + return page; + } + + return ERR_PTR(-ENOMEM); +} + +/** + * sgx_mark_page_reclaimable() - Mark a page as reclaimable + * @page: EPC page + * + * Mark a page as reclaimable and add it to the active page list. Pages + * are automatically removed from the active list when freed. + */ +void sgx_mark_page_reclaimable(struct sgx_epc_page *page) +{ + spin_lock(&sgx_reclaimer_lock); + page->flags |= SGX_EPC_PAGE_RECLAIMER_TRACKED; + list_add_tail(&page->list, &sgx_active_page_list); + spin_unlock(&sgx_reclaimer_lock); +} + +/** + * sgx_unmark_page_reclaimable() - Remove a page from the reclaim list + * @page: EPC page + * + * Clear the reclaimable flag and remove the page from the active page list. + * + * Return: + * 0 on success, + * -EBUSY if the page is in the process of being reclaimed + */ +int sgx_unmark_page_reclaimable(struct sgx_epc_page *page) +{ + spin_lock(&sgx_reclaimer_lock); + if (page->flags & SGX_EPC_PAGE_RECLAIMER_TRACKED) { + /* The page is being reclaimed. 
*/ + if (list_empty(&page->list)) { + spin_unlock(&sgx_reclaimer_lock); + return -EBUSY; + } + + list_del(&page->list); + page->flags &= ~SGX_EPC_PAGE_RECLAIMER_TRACKED; + } + spin_unlock(&sgx_reclaimer_lock); + + return 0; +} + +/** + * sgx_alloc_epc_page() - Allocate an EPC page + * @owner: the owner of the EPC page + * @reclaim: reclaim pages if necessary + * + * Iterate through EPC sections and borrow a free EPC page to the caller. When a + * page is no longer needed it must be released with sgx_free_epc_page(). If + * @reclaim is set to true, directly reclaim pages when we are out of pages. No + * mm's can be locked when @reclaim is set to true. + * + * Finally, wake up ksgxd when the number of pages goes below the watermark + * before returning back to the caller. + * + * Return: + * an EPC page, + * -errno on error + */ +struct sgx_epc_page *sgx_alloc_epc_page(void *owner, bool reclaim) +{ + struct sgx_epc_page *page; + + for ( ; ; ) { + page = __sgx_alloc_epc_page(); + if (!IS_ERR(page)) { + page->owner = owner; + break; + } + + if (list_empty(&sgx_active_page_list)) + return ERR_PTR(-ENOMEM); + + if (!reclaim) { + page = ERR_PTR(-EBUSY); + break; + } + + if (signal_pending(current)) { + page = ERR_PTR(-ERESTARTSYS); + break; + } + + sgx_reclaim_pages(); + cond_resched(); + } + + if (sgx_should_reclaim(SGX_NR_LOW_PAGES)) + wake_up(&ksgxd_waitq); + + return page; +} + +/** + * sgx_free_epc_page() - Free an EPC page + * @page: an EPC page + * + * Put the EPC page back to the list of free pages. It's the caller's + * responsibility to make sure that the page is in uninitialized state. In other + * words, do EREMOVE, EWB or whatever operation is necessary before calling + * this function. 
+ */ +void sgx_free_epc_page(struct sgx_epc_page *page) +{ + struct sgx_epc_section *section = &sgx_epc_sections[page->section]; + struct sgx_numa_node *node = section->node; + + spin_lock(&node->lock); + + page->owner = NULL; + if (page->poison) + list_add(&page->list, &node->sgx_poison_page_list); + else + list_add_tail(&page->list, &node->free_page_list); + page->flags = SGX_EPC_PAGE_IS_FREE; + + spin_unlock(&node->lock); + atomic_long_inc(&sgx_nr_free_pages); +} + +static bool __init sgx_setup_epc_section(u64 phys_addr, u64 size, + unsigned long index, + struct sgx_epc_section *section) +{ + unsigned long nr_pages = size >> PAGE_SHIFT; + unsigned long i; + + section->virt_addr = memremap(phys_addr, size, MEMREMAP_WB); + if (!section->virt_addr) + return false; + + section->pages = vmalloc(nr_pages * sizeof(struct sgx_epc_page)); + if (!section->pages) { + memunmap(section->virt_addr); + return false; + } + + section->phys_addr = phys_addr; + xa_store_range(&sgx_epc_address_space, section->phys_addr, + phys_addr + size - 1, section, GFP_KERNEL); + + for (i = 0; i < nr_pages; i++) { + section->pages[i].section = index; + section->pages[i].flags = 0; + section->pages[i].owner = NULL; + section->pages[i].poison = 0; + list_add_tail(§ion->pages[i].list, &sgx_dirty_page_list); + } + + return true; +} + +bool arch_is_platform_page(u64 paddr) +{ + return !!xa_load(&sgx_epc_address_space, paddr); +} +EXPORT_SYMBOL_GPL(arch_is_platform_page); + +static struct sgx_epc_page *sgx_paddr_to_page(u64 paddr) +{ + struct sgx_epc_section *section; + + section = xa_load(&sgx_epc_address_space, paddr); + if (!section) + return NULL; + + return §ion->pages[PFN_DOWN(paddr - section->phys_addr)]; +} + +/* + * Called in process context to handle a hardware reported + * error in an SGX EPC page. + * If the MF_ACTION_REQUIRED bit is set in flags, then the + * context is the task that consumed the poison data. Otherwise + * this is called from a kernel thread unrelated to the page. 
+ */ +int arch_memory_failure(unsigned long pfn, int flags) +{ + struct sgx_epc_page *page = sgx_paddr_to_page(pfn << PAGE_SHIFT); + struct sgx_epc_section *section; + struct sgx_numa_node *node; + + /* + * mm/memory-failure.c calls this routine for all errors + * where there isn't a "struct page" for the address. But that + * includes other address ranges besides SGX. + */ + if (!page) + return -ENXIO; + + /* + * If poison was consumed synchronously. Send a SIGBUS to + * the task. Hardware has already exited the SGX enclave and + * will not allow re-entry to an enclave that has a memory + * error. The signal may help the task understand why the + * enclave is broken. + */ + if (flags & MF_ACTION_REQUIRED) + force_sig(SIGBUS); + + section = &sgx_epc_sections[page->section]; + node = section->node; + + spin_lock(&node->lock); + + /* Already poisoned? Nothing more to do */ + if (page->poison) + goto out; + + page->poison = 1; + + /* + * If the page is on a free list, move it to the per-node + * poison page list. + */ + if (page->flags & SGX_EPC_PAGE_IS_FREE) { + list_move(&page->list, &node->sgx_poison_page_list); + goto out; + } + + /* + * TBD: Add additional plumbing to enable pre-emptive + * action for asynchronous poison notification. Until + * then just hope that the poison: + * a) is not accessed - sgx_free_epc_page() will deal with it + * when the user gives it back + * b) results in a recoverable machine check rather than + * a fatal one + */ +out: + spin_unlock(&node->lock); + return 0; +} + +/** + * A section metric is concatenated in a way that @low bits 12-31 define the + * bits 12-31 of the metric and @high bits 0-19 define the bits 32-51 of the + * metric. 
+ */ +static inline u64 __init sgx_calc_section_metric(u64 low, u64 high) +{ + return (low & GENMASK_ULL(31, 12)) + + ((high & GENMASK_ULL(19, 0)) << 32); +} + +#ifdef CONFIG_NUMA +static ssize_t sgx_total_bytes_show(struct device *dev, struct device_attribute *attr, char *buf) +{ + return sysfs_emit(buf, "%lu\n", sgx_numa_nodes[dev->id].size); +} +static DEVICE_ATTR_RO(sgx_total_bytes); + +static umode_t arch_node_attr_is_visible(struct kobject *kobj, + struct attribute *attr, int idx) +{ + /* Make all x86/ attributes invisible when SGX is not initialized: */ + if (nodes_empty(sgx_numa_mask)) + return 0; + + return attr->mode; +} + +static struct attribute *arch_node_dev_attrs[] = { + &dev_attr_sgx_total_bytes.attr, + NULL, +}; + +const struct attribute_group arch_node_dev_group = { + .name = "x86", + .attrs = arch_node_dev_attrs, + .is_visible = arch_node_attr_is_visible, +}; + +static void __init arch_update_sysfs_visibility(int nid) +{ + struct node *node = node_devices[nid]; + int ret; + + ret = sysfs_update_group(&node->dev.kobj, &arch_node_dev_group); + + if (ret) + pr_err("sysfs update failed (%d), files may be invisible", ret); +} +#else /* !CONFIG_NUMA */ +static void __init arch_update_sysfs_visibility(int nid) {} +#endif + +static bool __init sgx_page_cache_init(void) +{ + u32 eax, ebx, ecx, edx, type; + u64 pa, size; + int nid; + int i; + + sgx_numa_nodes = kmalloc_array(num_possible_nodes(), sizeof(*sgx_numa_nodes), GFP_KERNEL); + if (!sgx_numa_nodes) + return false; + + for (i = 0; i < ARRAY_SIZE(sgx_epc_sections); i++) { + cpuid_count(SGX_CPUID, i + SGX_CPUID_EPC, &eax, &ebx, &ecx, &edx); + + type = eax & SGX_CPUID_EPC_MASK; + if (type == SGX_CPUID_EPC_INVALID) + break; + + if (type != SGX_CPUID_EPC_SECTION) { + pr_err_once("Unknown EPC section type: %u\n", type); + break; + } + + pa = sgx_calc_section_metric(eax, ebx); + size = sgx_calc_section_metric(ecx, edx); + + pr_info("EPC section 0x%llx-0x%llx\n", pa, pa + size - 1); + + if 
(!sgx_setup_epc_section(pa, size, i, &sgx_epc_sections[i])) { + pr_err("No free memory for an EPC section\n"); + break; + } + + nid = numa_map_to_online_node(phys_to_target_node(pa)); + if (nid == NUMA_NO_NODE) { + /* The physical address is already printed above. */ + pr_warn(FW_BUG "Unable to map EPC section to online node. Fallback to the NUMA node 0.\n"); + nid = 0; + } + + if (!node_isset(nid, sgx_numa_mask)) { + spin_lock_init(&sgx_numa_nodes[nid].lock); + INIT_LIST_HEAD(&sgx_numa_nodes[nid].free_page_list); + INIT_LIST_HEAD(&sgx_numa_nodes[nid].sgx_poison_page_list); + node_set(nid, sgx_numa_mask); + sgx_numa_nodes[nid].size = 0; + + /* Make SGX-specific node sysfs files visible: */ + arch_update_sysfs_visibility(nid); + } + + sgx_epc_sections[i].node = &sgx_numa_nodes[nid]; + sgx_numa_nodes[nid].size += size; + + sgx_nr_epc_sections++; + } + + if (!sgx_nr_epc_sections) { + pr_err("There are zero EPC sections.\n"); + return false; + } + + return true; +} + +/* + * Update the SGX_LEPUBKEYHASH MSRs to the values specified by caller. + * Bare-metal driver requires to update them to hash of enclave's signer + * before EINIT. KVM needs to update them to guest's virtual MSR values + * before doing EINIT from guest. + */ +void sgx_update_lepubkeyhash(u64 *lepubkeyhash) +{ + int i; + + WARN_ON_ONCE(preemptible()); + + for (i = 0; i < 4; i++) + wrmsrl(MSR_IA32_SGXLEPUBKEYHASH0 + i, lepubkeyhash[i]); +} + +const struct file_operations sgx_provision_fops = { + .owner = THIS_MODULE, +}; + +static struct miscdevice sgx_dev_provision = { + .minor = MISC_DYNAMIC_MINOR, + .name = "sgx_provision", + .nodename = "sgx_provision", + .fops = &sgx_provision_fops, +}; + +/** + * sgx_set_attribute() - Update allowed attributes given file descriptor + * @allowed_attributes: Pointer to allowed enclave attributes + * @attribute_fd: File descriptor for specific attribute + * + * Append enclave attribute indicated by file descriptor to allowed + * attributes. 
Currently only SGX_ATTR_PROVISIONKEY indicated by + * /dev/sgx_provision is supported. + * + * Return: + * -0: SGX_ATTR_PROVISIONKEY is appended to allowed_attributes + * -EINVAL: Invalid, or not supported file descriptor + */ +int sgx_set_attribute(unsigned long *allowed_attributes, + unsigned int attribute_fd) +{ + struct file *file; + + file = fget(attribute_fd); + if (!file) + return -EINVAL; + + if (file->f_op != &sgx_provision_fops) { + fput(file); + return -EINVAL; + } + + *allowed_attributes |= SGX_ATTR_PROVISIONKEY; + + fput(file); + return 0; +} +EXPORT_SYMBOL_GPL(sgx_set_attribute); + +static int __init sgx_init(void) +{ + int ret; + int i; + + if (!cpu_feature_enabled(X86_FEATURE_SGX)) + return -ENODEV; + + if (!sgx_page_cache_init()) + return -ENOMEM; + + if (!sgx_page_reclaimer_init()) { + ret = -ENOMEM; + goto err_page_cache; + } + + ret = misc_register(&sgx_dev_provision); + if (ret) + goto err_kthread; + + /* + * Always try to initialize the native *and* KVM drivers. + * The KVM driver is less picky than the native one and + * can function if the native one is not supported on the + * current system or fails to initialize. + * + * Error out only if both fail to initialize. 
+ */ + ret = sgx_drv_init(); + + if (sgx_vepc_init() && ret) + goto err_provision; + + return 0; + +err_provision: + misc_deregister(&sgx_dev_provision); + +err_kthread: + kthread_stop(ksgxd_tsk); + +err_page_cache: + for (i = 0; i < sgx_nr_epc_sections; i++) { + vfree(sgx_epc_sections[i].pages); + memunmap(sgx_epc_sections[i].virt_addr); + } + + return ret; +} + +device_initcall(sgx_init); diff --git a/arch/x86/kernel/cpu/sgx/sgx.h b/arch/x86/kernel/cpu/sgx/sgx.h new file mode 100644 index 000000000000..0f2020653fba --- /dev/null +++ b/arch/x86/kernel/cpu/sgx/sgx.h @@ -0,0 +1,107 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _X86_SGX_H +#define _X86_SGX_H + +#include <linux/bitops.h> +#include <linux/err.h> +#include <linux/io.h> +#include <linux/rwsem.h> +#include <linux/types.h> +#include <asm/asm.h> +#include <asm/sgx.h> + +#undef pr_fmt +#define pr_fmt(fmt) "sgx: " fmt + +#define EREMOVE_ERROR_MESSAGE \ + "EREMOVE returned %d (0x%x) and an EPC page was leaked. SGX may become unusable. " \ + "Refer to Documentation/x86/sgx.rst for more information." + +#define SGX_MAX_EPC_SECTIONS 8 +#define SGX_EEXTEND_BLOCK_SIZE 256 +#define SGX_NR_TO_SCAN 16 +#define SGX_NR_LOW_PAGES 32 +#define SGX_NR_HIGH_PAGES 64 + +/* Pages, which are being tracked by the page reclaimer. */ +#define SGX_EPC_PAGE_RECLAIMER_TRACKED BIT(0) + +/* Pages on free list */ +#define SGX_EPC_PAGE_IS_FREE BIT(1) + +struct sgx_epc_page { + unsigned int section; + u16 flags; + u16 poison; + struct sgx_encl_page *owner; + struct list_head list; +}; + +/* + * Contains the tracking data for NUMA nodes having EPC pages. Most importantly, + * the free page list local to the node is stored here. + */ +struct sgx_numa_node { + struct list_head free_page_list; + struct list_head sgx_poison_page_list; + unsigned long size; + spinlock_t lock; +}; + +/* + * The firmware can define multiple chunks of EPC to the different areas of the + * physical memory e.g. for memory areas of the each node. 
This structure is + * used to store EPC pages for one EPC section and virtual memory area where + * the pages have been mapped. + */ +struct sgx_epc_section { + unsigned long phys_addr; + void *virt_addr; + struct sgx_epc_page *pages; + struct sgx_numa_node *node; +}; + +extern struct sgx_epc_section sgx_epc_sections[SGX_MAX_EPC_SECTIONS]; + +static inline unsigned long sgx_get_epc_phys_addr(struct sgx_epc_page *page) +{ + struct sgx_epc_section *section = &sgx_epc_sections[page->section]; + unsigned long index; + + index = ((unsigned long)page - (unsigned long)section->pages) / sizeof(*page); + + return section->phys_addr + index * PAGE_SIZE; +} + +static inline void *sgx_get_epc_virt_addr(struct sgx_epc_page *page) +{ + struct sgx_epc_section *section = &sgx_epc_sections[page->section]; + unsigned long index; + + index = ((unsigned long)page - (unsigned long)section->pages) / sizeof(*page); + + return section->virt_addr + index * PAGE_SIZE; +} + +struct sgx_epc_page *__sgx_alloc_epc_page(void); +void sgx_free_epc_page(struct sgx_epc_page *page); + +void sgx_reclaim_direct(void); +void sgx_mark_page_reclaimable(struct sgx_epc_page *page); +int sgx_unmark_page_reclaimable(struct sgx_epc_page *page); +struct sgx_epc_page *sgx_alloc_epc_page(void *owner, bool reclaim); + +void sgx_ipi_cb(void *info); + +#ifdef CONFIG_X86_SGX_KVM +int __init sgx_vepc_init(void); +#else +static inline int __init sgx_vepc_init(void) +{ + return -ENODEV; +} +#endif + +void sgx_update_lepubkeyhash(u64 *lepubkeyhash); + +#endif /* _X86_SGX_H */ diff --git a/arch/x86/kernel/cpu/sgx/virt.c b/arch/x86/kernel/cpu/sgx/virt.c new file mode 100644 index 000000000000..6a77a14eee38 --- /dev/null +++ b/arch/x86/kernel/cpu/sgx/virt.c @@ -0,0 +1,432 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Device driver to expose SGX enclave memory to KVM guests. + * + * Copyright(c) 2021 Intel Corporation. 
+ */ + +#include <linux/miscdevice.h> +#include <linux/mm.h> +#include <linux/mman.h> +#include <linux/sched/mm.h> +#include <linux/sched/signal.h> +#include <linux/slab.h> +#include <linux/xarray.h> +#include <asm/sgx.h> +#include <uapi/asm/sgx.h> + +#include "encls.h" +#include "sgx.h" + +struct sgx_vepc { + struct xarray page_array; + struct mutex lock; +}; + +/* + * Temporary SECS pages that cannot be EREMOVE'd due to having child in other + * virtual EPC instances, and the lock to protect it. + */ +static struct mutex zombie_secs_pages_lock; +static struct list_head zombie_secs_pages; + +static int __sgx_vepc_fault(struct sgx_vepc *vepc, + struct vm_area_struct *vma, unsigned long addr) +{ + struct sgx_epc_page *epc_page; + unsigned long index, pfn; + int ret; + + WARN_ON(!mutex_is_locked(&vepc->lock)); + + /* Calculate index of EPC page in virtual EPC's page_array */ + index = vma->vm_pgoff + PFN_DOWN(addr - vma->vm_start); + + epc_page = xa_load(&vepc->page_array, index); + if (epc_page) + return 0; + + epc_page = sgx_alloc_epc_page(vepc, false); + if (IS_ERR(epc_page)) + return PTR_ERR(epc_page); + + ret = xa_err(xa_store(&vepc->page_array, index, epc_page, GFP_KERNEL)); + if (ret) + goto err_free; + + pfn = PFN_DOWN(sgx_get_epc_phys_addr(epc_page)); + + ret = vmf_insert_pfn(vma, addr, pfn); + if (ret != VM_FAULT_NOPAGE) { + ret = -EFAULT; + goto err_delete; + } + + return 0; + +err_delete: + xa_erase(&vepc->page_array, index); +err_free: + sgx_free_epc_page(epc_page); + return ret; +} + +static vm_fault_t sgx_vepc_fault(struct vm_fault *vmf) +{ + struct vm_area_struct *vma = vmf->vma; + struct sgx_vepc *vepc = vma->vm_private_data; + int ret; + + mutex_lock(&vepc->lock); + ret = __sgx_vepc_fault(vepc, vma, vmf->address); + mutex_unlock(&vepc->lock); + + if (!ret) + return VM_FAULT_NOPAGE; + + if (ret == -EBUSY && (vmf->flags & FAULT_FLAG_ALLOW_RETRY)) { + mmap_read_unlock(vma->vm_mm); + return VM_FAULT_RETRY; + } + + return VM_FAULT_SIGBUS; +} + +static 
const struct vm_operations_struct sgx_vepc_vm_ops = { + .fault = sgx_vepc_fault, +}; + +static int sgx_vepc_mmap(struct file *file, struct vm_area_struct *vma) +{ + struct sgx_vepc *vepc = file->private_data; + + if (!(vma->vm_flags & VM_SHARED)) + return -EINVAL; + + vma->vm_ops = &sgx_vepc_vm_ops; + /* Don't copy VMA in fork() */ + vma->vm_flags |= VM_PFNMAP | VM_IO | VM_DONTDUMP | VM_DONTCOPY; + vma->vm_private_data = vepc; + + return 0; +} + +static int sgx_vepc_remove_page(struct sgx_epc_page *epc_page) +{ + /* + * Take a previously guest-owned EPC page and return it to the + * general EPC page pool. + * + * Guests can not be trusted to have left this page in a good + * state, so run EREMOVE on the page unconditionally. In the + * case that a guest properly EREMOVE'd this page, a superfluous + * EREMOVE is harmless. + */ + return __eremove(sgx_get_epc_virt_addr(epc_page)); +} + +static int sgx_vepc_free_page(struct sgx_epc_page *epc_page) +{ + int ret = sgx_vepc_remove_page(epc_page); + if (ret) { + /* + * Only SGX_CHILD_PRESENT is expected, which is because of + * EREMOVE'ing an SECS still with child, in which case it can + * be handled by EREMOVE'ing the SECS again after all pages in + * virtual EPC have been EREMOVE'd. See comments in below in + * sgx_vepc_release(). + * + * The user of virtual EPC (KVM) needs to guarantee there's no + * logical processor is still running in the enclave in guest, + * otherwise EREMOVE will get SGX_ENCLAVE_ACT which cannot be + * handled here. + */ + WARN_ONCE(ret != SGX_CHILD_PRESENT, EREMOVE_ERROR_MESSAGE, + ret, ret); + return ret; + } + + sgx_free_epc_page(epc_page); + return 0; +} + +static long sgx_vepc_remove_all(struct sgx_vepc *vepc) +{ + struct sgx_epc_page *entry; + unsigned long index; + long failures = 0; + + xa_for_each(&vepc->page_array, index, entry) { + int ret = sgx_vepc_remove_page(entry); + if (ret) { + if (ret == SGX_CHILD_PRESENT) { + /* The page is a SECS, userspace will retry. 
*/ + failures++; + } else { + /* + * Report errors due to #GP or SGX_ENCLAVE_ACT; do not + * WARN, as userspace can induce said failures by + * calling the ioctl concurrently on multiple vEPCs or + * while one or more CPUs is running the enclave. Only + * a #PF on EREMOVE indicates a kernel/hardware issue. + */ + WARN_ON_ONCE(encls_faulted(ret) && + ENCLS_TRAPNR(ret) != X86_TRAP_GP); + return -EBUSY; + } + } + cond_resched(); + } + + /* + * Return the number of SECS pages that failed to be removed, so + * userspace knows that it has to retry. + */ + return failures; +} + +static int sgx_vepc_release(struct inode *inode, struct file *file) +{ + struct sgx_vepc *vepc = file->private_data; + struct sgx_epc_page *epc_page, *tmp, *entry; + unsigned long index; + + LIST_HEAD(secs_pages); + + xa_for_each(&vepc->page_array, index, entry) { + /* + * Remove all normal, child pages. sgx_vepc_free_page() + * will fail if EREMOVE fails, but this is OK and expected on + * SECS pages. Those can only be EREMOVE'd *after* all their + * child pages. Retries below will clean them up. + */ + if (sgx_vepc_free_page(entry)) + continue; + + xa_erase(&vepc->page_array, index); + } + + /* + * Retry EREMOVE'ing pages. This will clean up any SECS pages that + * only had children in this 'epc' area. + */ + xa_for_each(&vepc->page_array, index, entry) { + epc_page = entry; + /* + * An EREMOVE failure here means that the SECS page still + * has children. But, since all children in this 'sgx_vepc' + * have been removed, the SECS page must have a child on + * another instance. + */ + if (sgx_vepc_free_page(epc_page)) + list_add_tail(&epc_page->list, &secs_pages); + + xa_erase(&vepc->page_array, index); + } + + /* + * SECS pages are "pinned" by child pages, and "unpinned" once all + * children have been EREMOVE'd. A child page in this instance + * may have pinned an SECS page encountered in an earlier release(), + * creating a zombie. 
Since some children were EREMOVE'd above, + * try to EREMOVE all zombies in the hopes that one was unpinned. + */ + mutex_lock(&zombie_secs_pages_lock); + list_for_each_entry_safe(epc_page, tmp, &zombie_secs_pages, list) { + /* + * Speculatively remove the page from the list of zombies, + * if the page is successfully EREMOVE'd it will be added to + * the list of free pages. If EREMOVE fails, throw the page + * on the local list, which will be spliced on at the end. + */ + list_del(&epc_page->list); + + if (sgx_vepc_free_page(epc_page)) + list_add_tail(&epc_page->list, &secs_pages); + } + + if (!list_empty(&secs_pages)) + list_splice_tail(&secs_pages, &zombie_secs_pages); + mutex_unlock(&zombie_secs_pages_lock); + + xa_destroy(&vepc->page_array); + kfree(vepc); + + return 0; +} + +static int sgx_vepc_open(struct inode *inode, struct file *file) +{ + struct sgx_vepc *vepc; + + vepc = kzalloc(sizeof(struct sgx_vepc), GFP_KERNEL); + if (!vepc) + return -ENOMEM; + mutex_init(&vepc->lock); + xa_init(&vepc->page_array); + + file->private_data = vepc; + + return 0; +} + +static long sgx_vepc_ioctl(struct file *file, + unsigned int cmd, unsigned long arg) +{ + struct sgx_vepc *vepc = file->private_data; + + switch (cmd) { + case SGX_IOC_VEPC_REMOVE_ALL: + if (arg) + return -EINVAL; + return sgx_vepc_remove_all(vepc); + + default: + return -ENOTTY; + } +} + +static const struct file_operations sgx_vepc_fops = { + .owner = THIS_MODULE, + .open = sgx_vepc_open, + .unlocked_ioctl = sgx_vepc_ioctl, + .compat_ioctl = sgx_vepc_ioctl, + .release = sgx_vepc_release, + .mmap = sgx_vepc_mmap, +}; + +static struct miscdevice sgx_vepc_dev = { + .minor = MISC_DYNAMIC_MINOR, + .name = "sgx_vepc", + .nodename = "sgx_vepc", + .fops = &sgx_vepc_fops, +}; + +int __init sgx_vepc_init(void) +{ + /* SGX virtualization requires KVM to work */ + if (!cpu_feature_enabled(X86_FEATURE_VMX)) + return -ENODEV; + + INIT_LIST_HEAD(&zombie_secs_pages); + mutex_init(&zombie_secs_pages_lock); + + return 
misc_register(&sgx_vepc_dev); +} + +/** + * sgx_virt_ecreate() - Run ECREATE on behalf of guest + * @pageinfo: Pointer to PAGEINFO structure + * @secs: Userspace pointer to SECS page + * @trapnr: trap number injected to guest in case of ECREATE error + * + * Run ECREATE on behalf of guest after KVM traps ECREATE for the purpose + * of enforcing policies of guest's enclaves, and return the trap number + * which should be injected to guest in case of any ECREATE error. + * + * Return: + * - 0: ECREATE was successful. + * - <0: on error. + */ +int sgx_virt_ecreate(struct sgx_pageinfo *pageinfo, void __user *secs, + int *trapnr) +{ + int ret; + + /* + * @secs is an untrusted, userspace-provided address. It comes from + * KVM and is assumed to be a valid pointer which points somewhere in + * userspace. This can fault and call SGX or other fault handlers when + * userspace mapping @secs doesn't exist. + * + * Add a WARN() to make sure @secs is already valid userspace pointer + * from caller (KVM), who should already have handled invalid pointer + * case (for instance, made by malicious guest). All other checks, + * such as alignment of @secs, are deferred to ENCLS itself. + */ + if (WARN_ON_ONCE(!access_ok(secs, PAGE_SIZE))) + return -EINVAL; + + __uaccess_begin(); + ret = __ecreate(pageinfo, (void *)secs); + __uaccess_end(); + + if (encls_faulted(ret)) { + *trapnr = ENCLS_TRAPNR(ret); + return -EFAULT; + } + + /* ECREATE doesn't return an error code, it faults or succeeds. */ + WARN_ON_ONCE(ret); + return 0; +} +EXPORT_SYMBOL_GPL(sgx_virt_ecreate); + +static int __sgx_virt_einit(void __user *sigstruct, void __user *token, + void __user *secs) +{ + int ret; + + /* + * Make sure all userspace pointers from caller (KVM) are valid. + * All other checks deferred to ENCLS itself. Also see comment + * for @secs in sgx_virt_ecreate(). 
+ */ +#define SGX_EINITTOKEN_SIZE 304 + if (WARN_ON_ONCE(!access_ok(sigstruct, sizeof(struct sgx_sigstruct)) || + !access_ok(token, SGX_EINITTOKEN_SIZE) || + !access_ok(secs, PAGE_SIZE))) + return -EINVAL; + + __uaccess_begin(); + ret = __einit((void *)sigstruct, (void *)token, (void *)secs); + __uaccess_end(); + + return ret; +} + +/** + * sgx_virt_einit() - Run EINIT on behalf of guest + * @sigstruct: Userspace pointer to SIGSTRUCT structure + * @token: Userspace pointer to EINITTOKEN structure + * @secs: Userspace pointer to SECS page + * @lepubkeyhash: Pointer to guest's *virtual* SGX_LEPUBKEYHASH MSR values + * @trapnr: trap number injected to guest in case of EINIT error + * + * Run EINIT on behalf of guest after KVM traps EINIT. If SGX_LC is available + * in host, SGX driver may rewrite the hardware values at wish, therefore KVM + * needs to update hardware values to guest's virtual MSR values in order to + * ensure EINIT is executed with expected hardware values. + * + * Return: + * - 0: EINIT was successful. + * - <0: on error. 
+ */ +int sgx_virt_einit(void __user *sigstruct, void __user *token, + void __user *secs, u64 *lepubkeyhash, int *trapnr) +{ + int ret; + + if (!cpu_feature_enabled(X86_FEATURE_SGX_LC)) { + ret = __sgx_virt_einit(sigstruct, token, secs); + } else { + preempt_disable(); + + sgx_update_lepubkeyhash(lepubkeyhash); + + ret = __sgx_virt_einit(sigstruct, token, secs); + preempt_enable(); + } + + /* Propagate up the error from the WARN_ON_ONCE in __sgx_virt_einit() */ + if (ret == -EINVAL) + return ret; + + if (encls_faulted(ret)) { + *trapnr = ENCLS_TRAPNR(ret); + return -EFAULT; + } + + return ret; +} +EXPORT_SYMBOL_GPL(sgx_virt_einit); diff --git a/arch/x86/kernel/cpu/topology.c b/arch/x86/kernel/cpu/topology.c index d3a0791bc052..5e868b62a7c4 100644 --- a/arch/x86/kernel/cpu/topology.c +++ b/arch/x86/kernel/cpu/topology.c @@ -25,12 +25,12 @@ #define BITS_SHIFT_NEXT_LEVEL(eax) ((eax) & 0x1f) #define LEVEL_MAX_SIBLINGS(ebx) ((ebx) & 0xffff) -#ifdef CONFIG_SMP unsigned int __max_die_per_package __read_mostly = 1; EXPORT_SYMBOL(__max_die_per_package); +#ifdef CONFIG_SMP /* - * Check if given CPUID extended toplogy "leaf" is implemented + * Check if given CPUID extended topology "leaf" is implemented */ static int check_extended_topology_leaf(int leaf) { @@ -44,7 +44,7 @@ static int check_extended_topology_leaf(int leaf) return 0; } /* - * Return best CPUID Extended Toplogy Leaf supported + * Return best CPUID Extended Topology Leaf supported */ static int detect_extended_topology_leaf(struct cpuinfo_x86 *c) { @@ -96,6 +96,8 @@ int detect_extended_topology(struct cpuinfo_x86 *c) unsigned int ht_mask_width, core_plus_mask_width, die_plus_mask_width; unsigned int core_select_mask, core_level_siblings; unsigned int die_select_mask, die_level_siblings; + unsigned int pkg_mask_width; + bool die_level_present = false; int leaf; leaf = detect_extended_topology_leaf(c); @@ -110,10 +112,10 @@ int detect_extended_topology(struct cpuinfo_x86 *c) core_level_siblings = smp_num_siblings 
= LEVEL_MAX_SIBLINGS(ebx); core_plus_mask_width = ht_mask_width = BITS_SHIFT_NEXT_LEVEL(eax); die_level_siblings = LEVEL_MAX_SIBLINGS(ebx); - die_plus_mask_width = BITS_SHIFT_NEXT_LEVEL(eax); + pkg_mask_width = die_plus_mask_width = BITS_SHIFT_NEXT_LEVEL(eax); sub_index = 1; - do { + while (true) { cpuid_count(leaf, sub_index, &eax, &ebx, &ecx, &edx); /* @@ -126,23 +128,33 @@ int detect_extended_topology(struct cpuinfo_x86 *c) die_plus_mask_width = BITS_SHIFT_NEXT_LEVEL(eax); } if (LEAFB_SUBTYPE(ecx) == DIE_TYPE) { + die_level_present = true; die_level_siblings = LEVEL_MAX_SIBLINGS(ebx); die_plus_mask_width = BITS_SHIFT_NEXT_LEVEL(eax); } + if (LEAFB_SUBTYPE(ecx) != INVALID_TYPE) + pkg_mask_width = BITS_SHIFT_NEXT_LEVEL(eax); + else + break; + sub_index++; - } while (LEAFB_SUBTYPE(ecx) != INVALID_TYPE); + } - core_select_mask = (~(-1 << core_plus_mask_width)) >> ht_mask_width; + core_select_mask = (~(-1 << pkg_mask_width)) >> ht_mask_width; die_select_mask = (~(-1 << die_plus_mask_width)) >> core_plus_mask_width; c->cpu_core_id = apic->phys_pkg_id(c->initial_apicid, ht_mask_width) & core_select_mask; - c->cpu_die_id = apic->phys_pkg_id(c->initial_apicid, - core_plus_mask_width) & die_select_mask; + + if (die_level_present) { + c->cpu_die_id = apic->phys_pkg_id(c->initial_apicid, + core_plus_mask_width) & die_select_mask; + } + c->phys_proc_id = apic->phys_pkg_id(c->initial_apicid, - die_plus_mask_width); + pkg_mask_width); /* * Reinit the apicid, now that we have extended initial_apicid. */ diff --git a/arch/x86/kernel/cpu/tsx.c b/arch/x86/kernel/cpu/tsx.c index e2ad30e474f8..ec7bbac3a9f2 100644 --- a/arch/x86/kernel/cpu/tsx.c +++ b/arch/x86/kernel/cpu/tsx.c @@ -2,7 +2,7 @@ /* * Intel Transactional Synchronization Extensions (TSX) control. 
* - * Copyright (C) 2019 Intel Corporation + * Copyright (C) 2019-2021 Intel Corporation * * Author: * Pawan Gupta <pawan.kumar.gupta@linux.intel.com> @@ -19,7 +19,7 @@ enum tsx_ctrl_states tsx_ctrl_state __ro_after_init = TSX_CTRL_NOT_SUPPORTED; -void tsx_disable(void) +static void tsx_disable(void) { u64 tsx; @@ -39,7 +39,7 @@ void tsx_disable(void) wrmsrl(MSR_IA32_TSX_CTRL, tsx); } -void tsx_enable(void) +static void tsx_enable(void) { u64 tsx; @@ -58,7 +58,7 @@ void tsx_enable(void) wrmsrl(MSR_IA32_TSX_CTRL, tsx); } -static bool __init tsx_ctrl_is_supported(void) +static bool tsx_ctrl_is_supported(void) { u64 ia32_cap = x86_read_arch_cap_msr(); @@ -84,13 +84,117 @@ static enum tsx_ctrl_states x86_get_tsx_auto_mode(void) return TSX_CTRL_ENABLE; } +/* + * Disabling TSX is not a trivial business. + * + * First of all, there's a CPUID bit: X86_FEATURE_RTM_ALWAYS_ABORT + * which says that TSX is practically disabled (all transactions are + * aborted by default). When that bit is set, the kernel unconditionally + * disables TSX. + * + * In order to do that, however, it needs to dance a bit: + * + * 1. The first method to disable it is through MSR_TSX_FORCE_ABORT and + * the MSR is present only when *two* CPUID bits are set: + * + * - X86_FEATURE_RTM_ALWAYS_ABORT + * - X86_FEATURE_TSX_FORCE_ABORT + * + * 2. The second method is for CPUs which do not have the above-mentioned + * MSR: those use a different MSR - MSR_IA32_TSX_CTRL and disable TSX + * through that one. Those CPUs can also have the initially mentioned + * CPUID bit X86_FEATURE_RTM_ALWAYS_ABORT set and for those the same strategy + * applies: TSX gets disabled unconditionally. + * + * When either of the two methods are present, the kernel disables TSX and + * clears the respective RTM and HLE feature flags. + * + * An additional twist in the whole thing presents late microcode loading + * which, when done, may cause for the X86_FEATURE_RTM_ALWAYS_ABORT CPUID + * bit to be set after the update. 
+ * + * A subsequent hotplug operation on any logical CPU except the BSP will + * cause for the supported CPUID feature bits to get re-detected and, if + * RTM and HLE get cleared all of a sudden, but, userspace did consult + * them before the update, then funny explosions will happen. Long story + * short: the kernel doesn't modify CPUID feature bits after booting. + * + * That's why, this function's call in init_intel() doesn't clear the + * feature flags. + */ +static void tsx_clear_cpuid(void) +{ + u64 msr; + + /* + * MSR_TFA_TSX_CPUID_CLEAR bit is only present when both CPUID + * bits RTM_ALWAYS_ABORT and TSX_FORCE_ABORT are present. + */ + if (boot_cpu_has(X86_FEATURE_RTM_ALWAYS_ABORT) && + boot_cpu_has(X86_FEATURE_TSX_FORCE_ABORT)) { + rdmsrl(MSR_TSX_FORCE_ABORT, msr); + msr |= MSR_TFA_TSX_CPUID_CLEAR; + wrmsrl(MSR_TSX_FORCE_ABORT, msr); + } else if (tsx_ctrl_is_supported()) { + rdmsrl(MSR_IA32_TSX_CTRL, msr); + msr |= TSX_CTRL_CPUID_CLEAR; + wrmsrl(MSR_IA32_TSX_CTRL, msr); + } +} + +/* + * Disable TSX development mode + * + * When the microcode released in Feb 2022 is applied, TSX will be disabled by + * default on some processors. MSR 0x122 (TSX_CTRL) and MSR 0x123 + * (IA32_MCU_OPT_CTRL) can be used to re-enable TSX for development, doing so is + * not recommended for production deployments. In particular, applying MD_CLEAR + * flows for mitigation of the Intel TSX Asynchronous Abort (TAA) transient + * execution attack may not be effective on these processors when Intel TSX is + * enabled with updated microcode. 
+ */ +static void tsx_dev_mode_disable(void) +{ + u64 mcu_opt_ctrl; + + /* Check if RTM_ALLOW exists */ + if (!boot_cpu_has_bug(X86_BUG_TAA) || !tsx_ctrl_is_supported() || + !cpu_feature_enabled(X86_FEATURE_SRBDS_CTRL)) + return; + + rdmsrl(MSR_IA32_MCU_OPT_CTRL, mcu_opt_ctrl); + + if (mcu_opt_ctrl & RTM_ALLOW) { + mcu_opt_ctrl &= ~RTM_ALLOW; + wrmsrl(MSR_IA32_MCU_OPT_CTRL, mcu_opt_ctrl); + setup_force_cpu_cap(X86_FEATURE_RTM_ALWAYS_ABORT); + } +} + void __init tsx_init(void) { char arg[5] = {}; int ret; - if (!tsx_ctrl_is_supported()) + tsx_dev_mode_disable(); + + /* + * Hardware will always abort a TSX transaction when the CPUID bit + * RTM_ALWAYS_ABORT is set. In this case, it is better not to enumerate + * CPUID.RTM and CPUID.HLE bits. Clear them here. + */ + if (boot_cpu_has(X86_FEATURE_RTM_ALWAYS_ABORT)) { + tsx_ctrl_state = TSX_CTRL_RTM_ALWAYS_ABORT; + tsx_clear_cpuid(); + setup_clear_cpu_cap(X86_FEATURE_RTM); + setup_clear_cpu_cap(X86_FEATURE_HLE); + return; + } + + if (!tsx_ctrl_is_supported()) { + tsx_ctrl_state = TSX_CTRL_NOT_SUPPORTED; return; + } ret = cmdline_find_option(boot_command_line, "tsx", arg, sizeof(arg)); if (ret >= 0) { @@ -142,3 +246,16 @@ void __init tsx_init(void) setup_force_cpu_cap(X86_FEATURE_HLE); } } + +void tsx_ap_init(void) +{ + tsx_dev_mode_disable(); + + if (tsx_ctrl_state == TSX_CTRL_ENABLE) + tsx_enable(); + else if (tsx_ctrl_state == TSX_CTRL_DISABLE) + tsx_disable(); + else if (tsx_ctrl_state == TSX_CTRL_RTM_ALWAYS_ABORT) + /* See comment over that function for more details. 
*/ + tsx_clear_cpuid(); +} diff --git a/arch/x86/kernel/cpu/umwait.c b/arch/x86/kernel/cpu/umwait.c index c222f283b456..ec8064c0ae03 100644 --- a/arch/x86/kernel/cpu/umwait.c +++ b/arch/x86/kernel/cpu/umwait.c @@ -4,6 +4,7 @@ #include <linux/cpu.h> #include <asm/msr.h> +#include <asm/mwait.h> #define UMWAIT_C02_ENABLE 0 @@ -17,12 +18,6 @@ */ static u32 umwait_control_cached = UMWAIT_CTRL_VAL(100000, UMWAIT_C02_ENABLE); -u32 get_umwait_control_msr(void) -{ - return umwait_control_cached; -} -EXPORT_SYMBOL_GPL(get_umwait_control_msr); - /* * Cache the original IA32_UMWAIT_CONTROL MSR value which is configured by * hardware or BIOS before kernel boot. diff --git a/arch/x86/kernel/cpu/vmware.c b/arch/x86/kernel/cpu/vmware.c index 46d732696c1c..02039ec3597d 100644 --- a/arch/x86/kernel/cpu/vmware.c +++ b/arch/x86/kernel/cpu/vmware.c @@ -25,12 +25,16 @@ #include <linux/init.h> #include <linux/export.h> #include <linux/clocksource.h> +#include <linux/cpu.h> +#include <linux/reboot.h> +#include <linux/static_call.h> #include <asm/div64.h> #include <asm/x86_init.h> #include <asm/hypervisor.h> #include <asm/timer.h> #include <asm/apic.h> #include <asm/vmware.h> +#include <asm/svm.h> #undef pr_fmt #define pr_fmt(fmt) "vmware: " fmt @@ -47,6 +51,11 @@ #define VMWARE_CMD_GETVCPU_INFO 68 #define VMWARE_CMD_LEGACY_X2APIC 3 #define VMWARE_CMD_VCPU_RESERVED 31 +#define VMWARE_CMD_STEALCLOCK 91 + +#define STEALCLOCK_NOT_AVAILABLE (-1) +#define STEALCLOCK_DISABLED 0 +#define STEALCLOCK_ENABLED 1 #define VMWARE_PORT(cmd, eax, ebx, ecx, edx) \ __asm__("inl (%%dx), %%eax" : \ @@ -86,6 +95,18 @@ } \ } while (0) +struct vmware_steal_time { + union { + uint64_t clock; /* stolen time counter in units of vtsc */ + struct { + /* only for little-endian */ + uint32_t clock_low; + uint32_t clock_high; + }; + }; + uint64_t reserved[7]; +}; + static unsigned long vmware_tsc_khz __ro_after_init; static u8 vmware_hypercall_mode __ro_after_init; @@ -103,15 +124,25 @@ static unsigned long 
vmware_get_tsc_khz(void) #ifdef CONFIG_PARAVIRT static struct cyc2ns_data vmware_cyc2ns __ro_after_init; -static int vmw_sched_clock __initdata = 1; +static bool vmw_sched_clock __initdata = true; +static DEFINE_PER_CPU_DECRYPTED(struct vmware_steal_time, vmw_steal_time) __aligned(64); +static bool has_steal_clock; +static bool steal_acc __initdata = true; /* steal time accounting */ static __init int setup_vmw_sched_clock(char *s) { - vmw_sched_clock = 0; + vmw_sched_clock = false; return 0; } early_param("no-vmw-sched-clock", setup_vmw_sched_clock); +static __init int parse_no_stealacc(char *arg) +{ + steal_acc = false; + return 0; +} +early_param("no-steal-acc", parse_no_stealacc); + static unsigned long long notrace vmware_sched_clock(void) { unsigned long long ns; @@ -122,7 +153,7 @@ static unsigned long long notrace vmware_sched_clock(void) return ns; } -static void __init vmware_sched_clock_setup(void) +static void __init vmware_cyc2ns_setup(void) { struct cyc2ns_data *d = &vmware_cyc2ns; unsigned long long tsc_now = rdtsc(); @@ -132,17 +163,201 @@ static void __init vmware_sched_clock_setup(void) d->cyc2ns_offset = mul_u64_u32_shr(tsc_now, d->cyc2ns_mul, d->cyc2ns_shift); - pv_ops.time.sched_clock = vmware_sched_clock; - pr_info("using sched offset of %llu ns\n", d->cyc2ns_offset); + pr_info("using clock offset of %llu ns\n", d->cyc2ns_offset); +} + +static int vmware_cmd_stealclock(uint32_t arg1, uint32_t arg2) +{ + uint32_t result, info; + + asm volatile (VMWARE_HYPERCALL : + "=a"(result), + "=c"(info) : + "a"(VMWARE_HYPERVISOR_MAGIC), + "b"(0), + "c"(VMWARE_CMD_STEALCLOCK), + "d"(0), + "S"(arg1), + "D"(arg2) : + "memory"); + return result; +} + +static bool stealclock_enable(phys_addr_t pa) +{ + return vmware_cmd_stealclock(upper_32_bits(pa), + lower_32_bits(pa)) == STEALCLOCK_ENABLED; +} + +static int __stealclock_disable(void) +{ + return vmware_cmd_stealclock(0, 1); +} + +static void stealclock_disable(void) +{ + __stealclock_disable(); +} + +static 
bool vmware_is_stealclock_available(void) +{ + return __stealclock_disable() != STEALCLOCK_NOT_AVAILABLE; +} + +/** + * vmware_steal_clock() - read the per-cpu steal clock + * @cpu: the cpu number whose steal clock we want to read + * + * The function reads the steal clock if we are on a 64-bit system, otherwise + * reads it in parts, checking that the high part didn't change in the + * meantime. + * + * Return: + * The steal clock reading in ns. + */ +static uint64_t vmware_steal_clock(int cpu) +{ + struct vmware_steal_time *steal = &per_cpu(vmw_steal_time, cpu); + uint64_t clock; + + if (IS_ENABLED(CONFIG_64BIT)) + clock = READ_ONCE(steal->clock); + else { + uint32_t initial_high, low, high; + + do { + initial_high = READ_ONCE(steal->clock_high); + /* Do not reorder initial_high and high readings */ + virt_rmb(); + low = READ_ONCE(steal->clock_low); + /* Keep low reading in between */ + virt_rmb(); + high = READ_ONCE(steal->clock_high); + } while (initial_high != high); + + clock = ((uint64_t)high << 32) | low; + } + + return mul_u64_u32_shr(clock, vmware_cyc2ns.cyc2ns_mul, + vmware_cyc2ns.cyc2ns_shift); +} + +static void vmware_register_steal_time(void) +{ + int cpu = smp_processor_id(); + struct vmware_steal_time *st = &per_cpu(vmw_steal_time, cpu); + + if (!has_steal_clock) + return; + + if (!stealclock_enable(slow_virt_to_phys(st))) { + has_steal_clock = false; + return; + } + + pr_info("vmware-stealtime: cpu %d, pa %llx\n", + cpu, (unsigned long long) slow_virt_to_phys(st)); } +static void vmware_disable_steal_time(void) +{ + if (!has_steal_clock) + return; + + stealclock_disable(); +} + +static void vmware_guest_cpu_init(void) +{ + if (has_steal_clock) + vmware_register_steal_time(); +} + +static void vmware_pv_guest_cpu_reboot(void *unused) +{ + vmware_disable_steal_time(); +} + +static int vmware_pv_reboot_notify(struct notifier_block *nb, + unsigned long code, void *unused) +{ + if (code == SYS_RESTART) + on_each_cpu(vmware_pv_guest_cpu_reboot, NULL, 1); 
+ return NOTIFY_DONE; +} + +static struct notifier_block vmware_pv_reboot_nb = { + .notifier_call = vmware_pv_reboot_notify, +}; + +#ifdef CONFIG_SMP +static void __init vmware_smp_prepare_boot_cpu(void) +{ + vmware_guest_cpu_init(); + native_smp_prepare_boot_cpu(); +} + +static int vmware_cpu_online(unsigned int cpu) +{ + local_irq_disable(); + vmware_guest_cpu_init(); + local_irq_enable(); + return 0; +} + +static int vmware_cpu_down_prepare(unsigned int cpu) +{ + local_irq_disable(); + vmware_disable_steal_time(); + local_irq_enable(); + return 0; +} +#endif + +static __init int activate_jump_labels(void) +{ + if (has_steal_clock) { + static_key_slow_inc(&paravirt_steal_enabled); + if (steal_acc) + static_key_slow_inc(&paravirt_steal_rq_enabled); + } + + return 0; +} +arch_initcall(activate_jump_labels); + static void __init vmware_paravirt_ops_setup(void) { pv_info.name = "VMware hypervisor"; pv_ops.cpu.io_delay = paravirt_nop; - if (vmware_tsc_khz && vmw_sched_clock) - vmware_sched_clock_setup(); + if (vmware_tsc_khz == 0) + return; + + vmware_cyc2ns_setup(); + + if (vmw_sched_clock) + paravirt_set_sched_clock(vmware_sched_clock); + + if (vmware_is_stealclock_available()) { + has_steal_clock = true; + static_call_update(pv_steal_clock, vmware_steal_clock); + + /* We use reboot notifier only to disable steal clock */ + register_reboot_notifier(&vmware_pv_reboot_nb); + +#ifdef CONFIG_SMP + smp_ops.smp_prepare_boot_cpu = + vmware_smp_prepare_boot_cpu; + if (cpuhp_setup_state_nocalls(CPUHP_AP_ONLINE_DYN, + "x86/vmware:online", + vmware_cpu_online, + vmware_cpu_down_prepare) < 0) + pr_err("vmware_guest: Failed to install cpu hotplug callbacks\n"); +#else + vmware_guest_cpu_init(); +#endif + } } #else #define vmware_paravirt_ops_setup() do {} while (0) @@ -164,6 +379,8 @@ static void __init vmware_set_capabilities(void) { setup_force_cpu_cap(X86_FEATURE_CONSTANT_TSC); setup_force_cpu_cap(X86_FEATURE_TSC_RELIABLE); + if (vmware_tsc_khz) +
setup_force_cpu_cap(X86_FEATURE_TSC_KNOWN_FREQ); if (vmware_hypercall_mode == CPUID_VMWARE_FEATURES_ECX_VMCALL) setup_force_cpu_cap(X86_FEATURE_VMCALL); else if (vmware_hypercall_mode == CPUID_VMWARE_FEATURES_ECX_VMMCALL) @@ -213,7 +430,7 @@ static void __init vmware_platform_setup(void) vmware_set_capabilities(); } -static u8 vmware_select_hypercall(void) +static u8 __init vmware_select_hypercall(void) { int eax, ebx, ecx, edx; @@ -259,14 +476,53 @@ static bool __init vmware_legacy_x2apic_available(void) { uint32_t eax, ebx, ecx, edx; VMWARE_CMD(GETVCPU_INFO, eax, ebx, ecx, edx); - return (eax & (1 << VMWARE_CMD_VCPU_RESERVED)) == 0 && - (eax & (1 << VMWARE_CMD_LEGACY_X2APIC)) != 0; + return !(eax & BIT(VMWARE_CMD_VCPU_RESERVED)) && + (eax & BIT(VMWARE_CMD_LEGACY_X2APIC)); +} + +#ifdef CONFIG_AMD_MEM_ENCRYPT +static void vmware_sev_es_hcall_prepare(struct ghcb *ghcb, + struct pt_regs *regs) +{ + /* Copy VMWARE specific Hypercall parameters to the GHCB */ + ghcb_set_rip(ghcb, regs->ip); + ghcb_set_rbx(ghcb, regs->bx); + ghcb_set_rcx(ghcb, regs->cx); + ghcb_set_rdx(ghcb, regs->dx); + ghcb_set_rsi(ghcb, regs->si); + ghcb_set_rdi(ghcb, regs->di); + ghcb_set_rbp(ghcb, regs->bp); } +static bool vmware_sev_es_hcall_finish(struct ghcb *ghcb, struct pt_regs *regs) +{ + if (!(ghcb_rbx_is_valid(ghcb) && + ghcb_rcx_is_valid(ghcb) && + ghcb_rdx_is_valid(ghcb) && + ghcb_rsi_is_valid(ghcb) && + ghcb_rdi_is_valid(ghcb) && + ghcb_rbp_is_valid(ghcb))) + return false; + + regs->bx = ghcb_get_rbx(ghcb); + regs->cx = ghcb_get_rcx(ghcb); + regs->dx = ghcb_get_rdx(ghcb); + regs->si = ghcb_get_rsi(ghcb); + regs->di = ghcb_get_rdi(ghcb); + regs->bp = ghcb_get_rbp(ghcb); + + return true; +} +#endif + const __initconst struct hypervisor_x86 x86_hyper_vmware = { - .name = "VMware", - .detect = vmware_platform, - .type = X86_HYPER_VMWARE, - .init.init_platform = vmware_platform_setup, - .init.x2apic_available = vmware_legacy_x2apic_available, + .name = "VMware", + .detect = vmware_platform, + 
.type = X86_HYPER_VMWARE, + .init.init_platform = vmware_platform_setup, + .init.x2apic_available = vmware_legacy_x2apic_available, +#ifdef CONFIG_AMD_MEM_ENCRYPT + .runtime.sev_es_hcall_prepare = vmware_sev_es_hcall_prepare, + .runtime.sev_es_hcall_finish = vmware_sev_es_hcall_finish, +#endif }; diff --git a/arch/x86/kernel/cpu/vortex.c b/arch/x86/kernel/cpu/vortex.c new file mode 100644 index 000000000000..e2685470ba94 --- /dev/null +++ b/arch/x86/kernel/cpu/vortex.c @@ -0,0 +1,39 @@ +// SPDX-License-Identifier: GPL-2.0 +#include <linux/kernel.h> +#include <asm/processor.h> +#include "cpu.h" + +/* + * No special init required for Vortex processors. + */ + +static const struct cpu_dev vortex_cpu_dev = { + .c_vendor = "Vortex", + .c_ident = { "Vortex86 SoC" }, + .legacy_models = { + { + .family = 5, + .model_names = { + [2] = "Vortex86DX", + [8] = "Vortex86MX", + }, + }, + { + .family = 6, + .model_names = { + /* + * Both the Vortex86EX and the Vortex86EX2 + * have the same family and model id. + * + * However, the -EX2 supports the product name + * CPUID call, so this name will only be used + * for the -EX, which does not. 
+ */ + [0] = "Vortex86EX", + }, + }, + }, + .c_x86_vendor = X86_VENDOR_VORTEX, +}; + +cpu_dev_register(vortex_cpu_dev); diff --git a/arch/x86/kernel/cpu/zhaoxin.c b/arch/x86/kernel/cpu/zhaoxin.c index df1358ba622b..05fa4ef63490 100644 --- a/arch/x86/kernel/cpu/zhaoxin.c +++ b/arch/x86/kernel/cpu/zhaoxin.c @@ -2,6 +2,7 @@ #include <linux/sched.h> #include <linux/sched/clock.h> +#include <asm/cpu.h> #include <asm/cpufeature.h> #include "cpu.h" diff --git a/arch/x86/kernel/cpuid.c b/arch/x86/kernel/cpuid.c index 3492aa36bf09..6f7b8cc1bc9f 100644 --- a/arch/x86/kernel/cpuid.c +++ b/arch/x86/kernel/cpuid.c @@ -74,10 +74,9 @@ static ssize_t cpuid_read(struct file *file, char __user *buf, init_completion(&cmd.done); for (; count; count -= 16) { - call_single_data_t csd = { - .func = cpuid_smp_cpuid, - .info = &cmd, - }; + call_single_data_t csd; + + INIT_CSD(&csd, cpuid_smp_cpuid, &cmd); cmd.regs.eax = pos; cmd.regs.ecx = pos >> 32; diff --git a/arch/x86/kernel/crash.c b/arch/x86/kernel/crash.c index fd87b59452a3..9730c88530fc 100644 --- a/arch/x86/kernel/crash.c +++ b/arch/x86/kernel/crash.c @@ -70,19 +70,6 @@ static inline void cpu_crash_vmclear_loaded_vmcss(void) rcu_read_unlock(); } -/* - * When the crashkernel option is specified, only use the low - * 1M for the real mode trampoline. 
- */ -void __init crash_reserve_low_1M(void) -{ - if (cmdline_find_option(boot_command_line, "crashkernel", NULL, 0) < 0) - return; - - memblock_reserve(0, 1<<20); - pr_info("Reserving the low 1M of memory for crashkernel\n"); -} - #if defined(CONFIG_SMP) && defined(CONFIG_X86_LOCAL_APIC) static void kdump_nmi_callback(int cpu, struct pt_regs *regs) @@ -230,7 +217,7 @@ static int elf_header_exclude_ranges(struct crash_mem *cmem) int ret = 0; /* Exclude the low 1M because it is always reserved */ - ret = crash_exclude_mem_range(cmem, 0, 1<<20); + ret = crash_exclude_mem_range(cmem, 0, (1<<20)-1); if (ret) return ret; @@ -323,8 +310,8 @@ static int memmap_exclude_ranges(struct kimage *image, struct crash_mem *cmem, cmem->nr_ranges = 1; /* Exclude elf header region */ - start = image->arch.elf_load_addr; - end = start + image->arch.elf_headers_sz - 1; + start = image->elf_load_addr; + end = start + image->elf_headers_sz - 1; return crash_exclude_mem_range(cmem, start, end); } @@ -337,7 +324,7 @@ int crash_setup_memmap_entries(struct kimage *image, struct boot_params *params) struct crash_memmap_data cmd; struct crash_mem *cmem; - cmem = vzalloc(sizeof(struct crash_mem)); + cmem = vzalloc(struct_size(cmem, ranges, 1)); if (!cmem) return -ENOMEM; @@ -407,20 +394,20 @@ int crash_load_segments(struct kimage *image) if (ret) return ret; - image->arch.elf_headers = kbuf.buffer; - image->arch.elf_headers_sz = kbuf.bufsz; + image->elf_headers = kbuf.buffer; + image->elf_headers_sz = kbuf.bufsz; kbuf.memsz = kbuf.bufsz; kbuf.buf_align = ELF_CORE_HEADER_ALIGN; kbuf.mem = KEXEC_BUF_MEM_UNKNOWN; ret = kexec_add_buffer(&kbuf); if (ret) { - vfree((void *)image->arch.elf_headers); + vfree((void *)image->elf_headers); return ret; } - image->arch.elf_load_addr = kbuf.mem; + image->elf_load_addr = kbuf.mem; pr_debug("Loaded ELF headers at 0x%lx bufsz=0x%lx memsz=0x%lx\n", - image->arch.elf_load_addr, kbuf.bufsz, kbuf.bufsz); + image->elf_load_addr, kbuf.bufsz, kbuf.memsz); return ret; 
} diff --git a/arch/x86/kernel/crash_core_32.c b/arch/x86/kernel/crash_core_32.c index c0159a7bca6d..8a89c109e20a 100644 --- a/arch/x86/kernel/crash_core_32.c +++ b/arch/x86/kernel/crash_core_32.c @@ -1,8 +1,8 @@ // SPDX-License-Identifier: GPL-2.0-only #include <linux/crash_core.h> +#include <linux/pgtable.h> -#include <asm/pgtable.h> #include <asm/setup.h> void arch_crash_save_vmcoreinfo(void) diff --git a/arch/x86/kernel/crash_core_64.c b/arch/x86/kernel/crash_core_64.c index 845a57eb4eb7..7d255f882afe 100644 --- a/arch/x86/kernel/crash_core_64.c +++ b/arch/x86/kernel/crash_core_64.c @@ -1,8 +1,8 @@ // SPDX-License-Identifier: GPL-2.0-only #include <linux/crash_core.h> +#include <linux/pgtable.h> -#include <asm/pgtable.h> #include <asm/setup.h> void arch_crash_save_vmcoreinfo(void) diff --git a/arch/x86/kernel/crash_dump_32.c b/arch/x86/kernel/crash_dump_32.c index 33ee47670b99..5f4ae5476e19 100644 --- a/arch/x86/kernel/crash_dump_32.c +++ b/arch/x86/kernel/crash_dump_32.c @@ -10,10 +10,7 @@ #include <linux/errno.h> #include <linux/highmem.h> #include <linux/crash_dump.h> - -#include <linux/uaccess.h> - -static void *kdump_buf_page; +#include <linux/uio.h> static inline bool is_crashed_pfn_valid(unsigned long pfn) { @@ -31,25 +28,8 @@ static inline bool is_crashed_pfn_valid(unsigned long pfn) #endif } -/** - * copy_oldmem_page - copy one page from "oldmem" - * @pfn: page frame number to be copied - * @buf: target memory address for the copy; this can be in kernel address - * space or user address space (see @userbuf) - * @csize: number of bytes to copy - * @offset: offset in bytes into the page (based on pfn) to begin the copy - * @userbuf: if set, @buf is in user address space, use copy_to_user(), - * otherwise @buf is in kernel address space, use memcpy(). - * - * Copy a page from "oldmem". For this page, there is no pte mapped - * in the current kernel. We stitch up a pte, similar to kmap_atomic. 
- * - * Calling copy_to_user() in atomic context is not desirable. Hence first - * copying the data to a pre-allocated kernel page and then copying to user - * space in non-atomic context. - */ -ssize_t copy_oldmem_page(unsigned long pfn, char *buf, - size_t csize, unsigned long offset, int userbuf) +ssize_t copy_oldmem_page(struct iov_iter *iter, unsigned long pfn, size_t csize, + unsigned long offset) { void *vaddr; @@ -59,38 +39,9 @@ ssize_t copy_oldmem_page(unsigned long pfn, char *buf, if (!is_crashed_pfn_valid(pfn)) return -EFAULT; - vaddr = kmap_atomic_pfn(pfn); - - if (!userbuf) { - memcpy(buf, (vaddr + offset), csize); - kunmap_atomic(vaddr); - } else { - if (!kdump_buf_page) { - printk(KERN_WARNING "Kdump: Kdump buffer page not" - " allocated\n"); - kunmap_atomic(vaddr); - return -EFAULT; - } - copy_page(kdump_buf_page, vaddr); - kunmap_atomic(vaddr); - if (copy_to_user(buf, (kdump_buf_page + offset), csize)) - return -EFAULT; - } + vaddr = kmap_local_pfn(pfn); + csize = copy_to_iter(vaddr + offset, csize, iter); + kunmap_local(vaddr); return csize; } - -static int __init kdump_buf_page_init(void) -{ - int ret = 0; - - kdump_buf_page = kmalloc(PAGE_SIZE, GFP_KERNEL); - if (!kdump_buf_page) { - printk(KERN_WARNING "Kdump: Failed to allocate kdump buffer" - " page\n"); - ret = -ENOMEM; - } - - return ret; -} -arch_initcall(kdump_buf_page_init); diff --git a/arch/x86/kernel/crash_dump_64.c b/arch/x86/kernel/crash_dump_64.c index 045e82e8945b..e75bc2f217ff 100644 --- a/arch/x86/kernel/crash_dump_64.c +++ b/arch/x86/kernel/crash_dump_64.c @@ -8,11 +8,12 @@ #include <linux/errno.h> #include <linux/crash_dump.h> -#include <linux/uaccess.h> +#include <linux/uio.h> #include <linux/io.h> +#include <linux/cc_platform.h> -static ssize_t __copy_oldmem_page(unsigned long pfn, char *buf, size_t csize, - unsigned long offset, int userbuf, +static ssize_t __copy_oldmem_page(struct iov_iter *iter, unsigned long pfn, + size_t csize, unsigned long offset, bool encrypted) { 
void *vaddr; @@ -28,50 +29,36 @@ static ssize_t __copy_oldmem_page(unsigned long pfn, char *buf, size_t csize, if (!vaddr) return -ENOMEM; - if (userbuf) { - if (copy_to_user((void __user *)buf, vaddr + offset, csize)) { - iounmap((void __iomem *)vaddr); - return -EFAULT; - } - } else - memcpy(buf, vaddr + offset, csize); + csize = copy_to_iter(vaddr + offset, csize, iter); - set_iounmap_nonlazy(); iounmap((void __iomem *)vaddr); return csize; } -/** - * copy_oldmem_page - copy one page of memory - * @pfn: page frame number to be copied - * @buf: target memory address for the copy; this can be in kernel address - * space or user address space (see @userbuf) - * @csize: number of bytes to copy - * @offset: offset in bytes into the page (based on pfn) to begin the copy - * @userbuf: if set, @buf is in user address space, use copy_to_user(), - * otherwise @buf is in kernel address space, use memcpy(). - * - * Copy a page from the old kernel's memory. For this page, there is no pte - * mapped in the current kernel. We stitch up a pte, similar to kmap_atomic. - */ -ssize_t copy_oldmem_page(unsigned long pfn, char *buf, size_t csize, - unsigned long offset, int userbuf) +ssize_t copy_oldmem_page(struct iov_iter *iter, unsigned long pfn, size_t csize, + unsigned long offset) { - return __copy_oldmem_page(pfn, buf, csize, offset, userbuf, false); + return __copy_oldmem_page(iter, pfn, csize, offset, false); } -/** +/* * copy_oldmem_page_encrypted - same as copy_oldmem_page() above but ioremap the * memory with the encryption mask set to accommodate kdump on SME-enabled * machines. 
*/ -ssize_t copy_oldmem_page_encrypted(unsigned long pfn, char *buf, size_t csize, - unsigned long offset, int userbuf) +ssize_t copy_oldmem_page_encrypted(struct iov_iter *iter, unsigned long pfn, + size_t csize, unsigned long offset) { - return __copy_oldmem_page(pfn, buf, csize, offset, userbuf, true); + return __copy_oldmem_page(iter, pfn, csize, offset, true); } ssize_t elfcorehdr_read(char *buf, size_t count, u64 *ppos) { - return read_from_oldmem(buf, count, ppos, 0, sev_active()); + struct kvec kvec = { .iov_base = buf, .iov_len = count }; + struct iov_iter iter; + + iov_iter_kvec(&iter, READ, &kvec, 1, count); + + return read_from_oldmem(&iter, count, ppos, + cc_platform_has(CC_ATTR_GUEST_MEM_ENCRYPT)); } diff --git a/arch/x86/kernel/devicetree.c b/arch/x86/kernel/devicetree.c index 8d85e00bb40a..5cd51f25f446 100644 --- a/arch/x86/kernel/devicetree.c +++ b/arch/x86/kernel/devicetree.c @@ -20,6 +20,7 @@ #include <asm/irqdomain.h> #include <asm/hpet.h> #include <asm/apic.h> +#include <asm/io_apic.h> #include <asm/pci_x86.h> #include <asm/setup.h> #include <asm/i8259.h> @@ -30,11 +31,6 @@ char __initdata cmd_line[COMMAND_LINE_SIZE]; int __initdata of_ioapic; -void __init early_init_dt_scan_chosen_arch(unsigned long node) -{ - BUG(); -} - void __init early_init_dt_add_memory_arch(u64 base, u64 size) { BUG(); @@ -138,12 +134,11 @@ static void __init dtb_cpu_setup(void) { struct device_node *dn; u32 apic_id, version; - int ret; version = GET_APIC_VERSION(apic_read(APIC_LVR)); for_each_of_cpu_node(dn) { - ret = of_property_read_u32(dn, "reg", &apic_id); - if (ret < 0) { + apic_id = of_get_cpu_hwid(dn, 0); + if (apic_id == ~0U) { pr_warn("%pOF: missing local APIC ID\n", dn); continue; } @@ -183,31 +178,31 @@ static unsigned int ioapic_id; struct of_ioapic_type { u32 out_type; - u32 trigger; - u32 polarity; + u32 is_level; + u32 active_low; }; static struct of_ioapic_type of_ioapic_type[] = { { - .out_type = IRQ_TYPE_EDGE_RISING, - .trigger = IOAPIC_EDGE, - 
.polarity = 1, + .out_type = IRQ_TYPE_EDGE_FALLING, + .is_level = 0, + .active_low = 1, }, { - .out_type = IRQ_TYPE_LEVEL_LOW, - .trigger = IOAPIC_LEVEL, - .polarity = 0, + .out_type = IRQ_TYPE_LEVEL_HIGH, + .is_level = 1, + .active_low = 0, }, { - .out_type = IRQ_TYPE_LEVEL_HIGH, - .trigger = IOAPIC_LEVEL, - .polarity = 1, + .out_type = IRQ_TYPE_LEVEL_LOW, + .is_level = 1, + .active_low = 1, }, { - .out_type = IRQ_TYPE_EDGE_FALLING, - .trigger = IOAPIC_EDGE, - .polarity = 0, + .out_type = IRQ_TYPE_EDGE_RISING, + .is_level = 0, + .active_low = 0, }, }; @@ -227,9 +222,9 @@ static int dt_irqdomain_alloc(struct irq_domain *domain, unsigned int virq, return -EINVAL; it = &of_ioapic_type[type_index]; - ioapic_set_alloc_attr(&tmp, NUMA_NO_NODE, it->trigger, it->polarity); - tmp.ioapic_id = mpc_ioapic_id(mp_irqdomain_ioapic_idx(domain)); - tmp.ioapic_pin = fwspec->param[0]; + ioapic_set_alloc_attr(&tmp, NUMA_NO_NODE, it->is_level, it->active_low); + tmp.devid = mpc_ioapic_id(mp_irqdomain_ioapic_idx(domain)); + tmp.ioapic.pin = fwspec->param[0]; return mp_irqdomain_alloc(domain, virq, nr_irqs, &tmp); } diff --git a/arch/x86/kernel/doublefault_32.c b/arch/x86/kernel/doublefault_32.c index 3793646f0fb5..3b58d8703094 100644 --- a/arch/x86/kernel/doublefault_32.c +++ b/arch/x86/kernel/doublefault_32.c @@ -6,12 +6,10 @@ #include <linux/fs.h> #include <linux/uaccess.h> -#include <asm/pgtable.h> #include <asm/processor.h> #include <asm/desc.h> #include <asm/traps.h> -extern void double_fault(void); #define ptr_ok(x) ((x) > PAGE_OFFSET && (x) < PAGE_OFFSET + MAXMEM) #define TSS(x) this_cpu_read(cpu_tss_rw.x86_tss.x) @@ -22,7 +20,7 @@ static void set_df_gdt_entry(unsigned int cpu); * Called by double_fault with CR0.TS and EFLAGS.NT cleared. The CPU thinks * we're running the doublefault task. Cannot return. 
*/ -asmlinkage notrace void __noreturn doublefault_shim(void) +asmlinkage noinstr void __noreturn doublefault_shim(void) { unsigned long cr2; struct pt_regs regs; @@ -41,7 +39,7 @@ asmlinkage notrace void __noreturn doublefault_shim(void) * Fill in pt_regs. A downside of doing this in C is that the unwinder * won't see it (no ENCODE_FRAME_POINTER), so a nested stack dump * won't successfully unwind to the source of the double fault. - * The main dump from do_double_fault() is fine, though, since it + * The main dump from exc_double_fault() is fine, though, since it * uses these regs directly. * * If anyone ever cares, this could be moved to asm. @@ -71,7 +69,7 @@ asmlinkage notrace void __noreturn doublefault_shim(void) regs.cx = TSS(cx); regs.bx = TSS(bx); - do_double_fault(&regs, 0, cr2); + exc_double_fault(&regs, 0, cr2); /* * x86_32 does not save the original CR3 anywhere on a task switch. @@ -79,13 +77,9 @@ asmlinkage notrace void __noreturn doublefault_shim(void) * some way to reconstruct CR3. We could make a credible guess based * on cpu_tlbstate, but that would be racy and would not account for * PTI. - * - * Instead, don't bother. We can return through - * rewind_stack_do_exit() instead.
*/ panic("cannot return from double fault\n"); } -NOKPROBE_SYMBOL(doublefault_shim); DEFINE_PER_CPU_PAGE_ALIGNED(struct doublefault_stack, doublefault_stack) = { .tss = { @@ -96,16 +90,14 @@ DEFINE_PER_CPU_PAGE_ALIGNED(struct doublefault_stack, doublefault_stack) = { .ldt = 0, .io_bitmap_base = IO_BITMAP_OFFSET_INVALID, - .ip = (unsigned long) double_fault, + .ip = (unsigned long) asm_exc_double_fault, .flags = X86_EFLAGS_FIXED, .es = __USER_DS, .cs = __KERNEL_CS, .ss = __KERNEL_DS, .ds = __USER_DS, .fs = __KERNEL_PERCPU, -#ifndef CONFIG_X86_32_LAZY_GS - .gs = __KERNEL_STACK_CANARY, -#endif + .gs = 0, .__cr3 = __pa_nodebug(swapper_pg_dir), }, diff --git a/arch/x86/kernel/dumpstack.c b/arch/x86/kernel/dumpstack.c index ae64ec7f752f..0bf6779187dd 100644 --- a/arch/x86/kernel/dumpstack.c +++ b/arch/x86/kernel/dumpstack.c @@ -29,8 +29,8 @@ static int die_counter; static struct pt_regs exec_summary_regs; -bool in_task_stack(unsigned long *stack, struct task_struct *task, - struct stack_info *info) +bool noinstr in_task_stack(unsigned long *stack, struct task_struct *task, + struct stack_info *info) { unsigned long *begin = task_stack_page(task); unsigned long *end = task_stack_page(task) + THREAD_SIZE; @@ -46,7 +46,8 @@ bool in_task_stack(unsigned long *stack, struct task_struct *task, return true; } -bool in_entry_stack(unsigned long *stack, struct stack_info *info) +/* Called from get_stack_info_noinstr - so must be noinstr too */ +bool noinstr in_entry_stack(unsigned long *stack, struct stack_info *info) { struct entry_stack *ss = cpu_entry_stack(smp_processor_id()); @@ -65,10 +66,29 @@ bool in_entry_stack(unsigned long *stack, struct stack_info *info) } static void printk_stack_address(unsigned long address, int reliable, - char *log_lvl) + const char *log_lvl) { touch_nmi_watchdog(); - printk("%s %s%pB\n", log_lvl, reliable ? "" : "? ", (void *)address); + printk("%s %s%pBb\n", log_lvl, reliable ? "" : "? 
", (void *)address); +} + +static int copy_code(struct pt_regs *regs, u8 *buf, unsigned long src, + unsigned int nbytes) +{ + if (!user_mode(regs)) + return copy_from_kernel_nofault(buf, (u8 *)src, nbytes); + + /* The user space code from other tasks cannot be accessed. */ + if (regs != task_pt_regs(current)) + return -EPERM; + + /* + * Even if named copy_from_user_nmi() this can be invoked from + * other contexts and will not try to resolve a pagefault, which is + * the correct thing to do here as this code can be called from any + * context. + */ + return copy_from_user_nmi(buf, (void __user *)src, nbytes); } /* @@ -97,22 +117,20 @@ void show_opcodes(struct pt_regs *regs, const char *loglvl) #define OPCODE_BUFSIZE (PROLOGUE_SIZE + 1 + EPILOGUE_SIZE) u8 opcodes[OPCODE_BUFSIZE]; unsigned long prologue = regs->ip - PROLOGUE_SIZE; - bool bad_ip; - - /* - * Make sure userspace isn't trying to trick us into dumping kernel - * memory by pointing the userspace instruction pointer at it. - */ - bad_ip = user_mode(regs) && - __chk_range_not_ok(prologue, OPCODE_BUFSIZE, TASK_SIZE_MAX); - if (bad_ip || probe_kernel_read(opcodes, (u8 *)prologue, - OPCODE_BUFSIZE)) { - printk("%sCode: Bad RIP value.\n", loglvl); - } else { + switch (copy_code(regs, opcodes, prologue, sizeof(opcodes))) { + case 0: printk("%sCode: %" __stringify(PROLOGUE_SIZE) "ph <%02x> %" __stringify(EPILOGUE_SIZE) "ph\n", loglvl, opcodes, opcodes[PROLOGUE_SIZE], opcodes + PROLOGUE_SIZE + 1); + break; + case -EPERM: + /* No access to the user space stack of other tasks. Ignore. 
*/ + break; + default: + printk("%sCode: Unable to access opcode bytes at 0x%lx.\n", + loglvl, prologue); + break; } } @@ -126,15 +144,15 @@ void show_ip(struct pt_regs *regs, const char *loglvl) show_opcodes(regs, loglvl); } -void show_iret_regs(struct pt_regs *regs) +void show_iret_regs(struct pt_regs *regs, const char *log_lvl) { - show_ip(regs, KERN_DEFAULT); - printk(KERN_DEFAULT "RSP: %04x:%016lx EFLAGS: %08lx", (int)regs->ss, + show_ip(regs, log_lvl); + printk("%sRSP: %04x:%016lx EFLAGS: %08lx", log_lvl, (int)regs->ss, regs->sp, regs->flags); } static void show_regs_if_on_stack(struct stack_info *info, struct pt_regs *regs, - bool partial) + bool partial, const char *log_lvl) { /* * These on_stack() checks aren't strictly necessary: the unwind code @@ -146,7 +164,7 @@ static void show_regs_if_on_stack(struct stack_info *info, struct pt_regs *regs, * they can be printed in the right context. */ if (!partial && on_stack(info, regs, sizeof(*regs))) { - __show_regs(regs, SHOW_REGS_SHORT); + __show_regs(regs, SHOW_REGS_SHORT, log_lvl); } else if (partial && on_stack(info, (void *)regs + IRET_FRAME_OFFSET, IRET_FRAME_SIZE)) { @@ -155,12 +173,18 @@ static void show_regs_if_on_stack(struct stack_info *info, struct pt_regs *regs, * full pt_regs might not have been saved yet. In that case * just print the iret frame. */ - show_iret_regs(regs); + show_iret_regs(regs, log_lvl); } } -void show_trace_log_lvl(struct task_struct *task, struct pt_regs *regs, - unsigned long *stack, char *log_lvl) +/* + * This function reads pointers from the stack and dereferences them. The + * pointers may not have their KMSAN shadow set up properly, which may result + * in false positive reports. Disable instrumentation to avoid those. 
+ */ +__no_kmsan_checks +static void show_trace_log_lvl(struct task_struct *task, struct pt_regs *regs, + unsigned long *stack, const char *log_lvl) { struct unwind_state state; struct stack_info stack_info = {0}; @@ -210,7 +234,7 @@ void show_trace_log_lvl(struct task_struct *task, struct pt_regs *regs, printk("%s <%s>\n", log_lvl, stack_name); if (regs) - show_regs_if_on_stack(&stack_info, regs, partial); + show_regs_if_on_stack(&stack_info, regs, partial, log_lvl); /* * Scan the stack, printing any text addresses we find. At the @@ -271,7 +295,7 @@ next: /* if the frame has entry regs, print them */ regs = unwind_get_entry_regs(&state, &partial); if (regs) - show_regs_if_on_stack(&stack_info, regs, partial); + show_regs_if_on_stack(&stack_info, regs, partial, log_lvl); } if (stack_name) @@ -279,7 +303,8 @@ next: } } -void show_stack(struct task_struct *task, unsigned long *sp) +void show_stack(struct task_struct *task, unsigned long *sp, + const char *loglvl) { task = task ? : current; @@ -290,7 +315,7 @@ void show_stack(struct task_struct *task, unsigned long *sp) if (!sp && task == current) sp = get_stack_pointer(current, NULL); - show_trace_log_lvl(task, NULL, sp, KERN_DEFAULT); + show_trace_log_lvl(task, NULL, sp, loglvl); } void show_stack_regs(struct pt_regs *regs) @@ -326,7 +351,7 @@ unsigned long oops_begin(void) } NOKPROBE_SYMBOL(oops_begin); -void __noreturn rewind_stack_do_exit(int signr); +void __noreturn rewind_stack_and_make_dead(int signr); void oops_end(unsigned long flags, struct pt_regs *regs, int signr) { @@ -344,7 +369,7 @@ void oops_end(unsigned long flags, struct pt_regs *regs, int signr) oops_exit(); /* Executive summary in case the oops scrolled away */ - __show_regs(&exec_summary_regs, SHOW_REGS_ALL); + __show_regs(&exec_summary_regs, SHOW_REGS_ALL, KERN_DEFAULT); if (!signr) return; @@ -361,7 +386,7 @@ void oops_end(unsigned long flags, struct pt_regs *regs, int signr) * reuse the task stack and that existing poisons are invalid. 
*/ kasan_unpoison_task_stack(current); - rewind_stack_do_exit(signr); + rewind_stack_and_make_dead(signr); } NOKPROBE_SYMBOL(oops_end); @@ -436,9 +461,12 @@ void die_addr(const char *str, struct pt_regs *regs, long err, long gp_addr) void show_regs(struct pt_regs *regs) { + enum show_regs_mode print_kernel_regs; + show_regs_print_info(KERN_DEFAULT); - __show_regs(regs, user_mode(regs) ? SHOW_REGS_USER : SHOW_REGS_ALL); + print_kernel_regs = user_mode(regs) ? SHOW_REGS_USER : SHOW_REGS_ALL; + __show_regs(regs, print_kernel_regs, KERN_DEFAULT); /* * When in-kernel, we also print out the stack at the time of the fault.. diff --git a/arch/x86/kernel/dumpstack_32.c b/arch/x86/kernel/dumpstack_32.c index 8e3a8fedfa4d..722fd712e1cf 100644 --- a/arch/x86/kernel/dumpstack_32.c +++ b/arch/x86/kernel/dumpstack_32.c @@ -87,7 +87,6 @@ static bool in_softirq_stack(unsigned long *stack, struct stack_info *info) static bool in_doublefault_stack(unsigned long *stack, struct stack_info *info) { -#ifdef CONFIG_DOUBLEFAULT struct cpu_entry_area *cea = get_cpu_entry_area(raw_smp_processor_id()); struct doublefault_stack *ss = &cea->doublefault_stack; @@ -103,9 +102,6 @@ static bool in_doublefault_stack(unsigned long *stack, struct stack_info *info) info->next_sp = (unsigned long *)this_cpu_read(cpu_tss_rw.x86_tss.sp); return true; -#else - return false; -#endif } diff --git a/arch/x86/kernel/dumpstack_64.c b/arch/x86/kernel/dumpstack_64.c index 87b97897a881..6c5defd6569a 100644 --- a/arch/x86/kernel/dumpstack_64.c +++ b/arch/x86/kernel/dumpstack_64.c @@ -22,19 +22,25 @@ static const char * const exception_stack_names[] = { [ ESTACK_DF ] = "#DF", [ ESTACK_NMI ] = "NMI", - [ ESTACK_DB2 ] = "#DB2", - [ ESTACK_DB1 ] = "#DB1", [ ESTACK_DB ] = "#DB", [ ESTACK_MCE ] = "#MC", + [ ESTACK_VC ] = "#VC", + [ ESTACK_VC2 ] = "#VC2", }; const char *stack_type_name(enum stack_type type) { BUILD_BUG_ON(N_EXCEPTION_STACKS != 6); + if (type == STACK_TYPE_TASK) + return "TASK"; + if (type == 
STACK_TYPE_IRQ) return "IRQ"; + if (type == STACK_TYPE_SOFTIRQ) + return "SOFTIRQ"; + if (type == STACK_TYPE_ENTRY) { /* * On 64-bit, we have a generic entry stack that we @@ -79,12 +85,13 @@ static const struct estack_pages estack_pages[CEA_ESTACK_PAGES] ____cacheline_aligned = { EPAGERANGE(DF), EPAGERANGE(NMI), - EPAGERANGE(DB1), EPAGERANGE(DB), EPAGERANGE(MCE), + EPAGERANGE(VC), + EPAGERANGE(VC2), }; -static bool in_exception_stack(unsigned long *stack, struct stack_info *info) +static __always_inline bool in_exception_stack(unsigned long *stack, struct stack_info *info) { unsigned long begin, end, stk = (unsigned long)stack; const struct estack_pages *ep; @@ -125,14 +132,23 @@ static bool in_exception_stack(unsigned long *stack, struct stack_info *info) return true; } -static bool in_irq_stack(unsigned long *stack, struct stack_info *info) +static __always_inline bool in_irq_stack(unsigned long *stack, struct stack_info *info) { - unsigned long *end = (unsigned long *)this_cpu_read(hardirq_stack_ptr); - unsigned long *begin = end - (IRQ_STACK_SIZE / sizeof(long)); + unsigned long *end = (unsigned long *)this_cpu_read(hardirq_stack_ptr); + unsigned long *begin; /* - * This is a software stack, so 'end' can be a valid stack pointer. - * It just means the stack is empty. + * @end points directly to the top most stack entry to avoid a -8 + * adjustment in the stack switch hotpath. Adjust it back before + * calculating @begin. + */ + end++; + begin = end - (IRQ_STACK_SIZE / sizeof(long)); + + /* + * Due to the switching logic RSP can never be == @end because the + * final operation is 'popq %rsp' which means after that RSP points + * to the original stack and not to @end. */ if (stack < begin || stack >= end) return false; @@ -142,40 +158,47 @@ static bool in_irq_stack(unsigned long *stack, struct stack_info *info) info->end = end; /* - * The next stack pointer is the first thing pushed by the entry code - * after switching to the irq stack. 
+ * The next stack pointer is stored at the top of the irq stack + * before switching to the irq stack. Actual stack entries are all + * below that. */ info->next_sp = (unsigned long *)*(end - 1); return true; } -int get_stack_info(unsigned long *stack, struct task_struct *task, - struct stack_info *info, unsigned long *visit_mask) +bool noinstr get_stack_info_noinstr(unsigned long *stack, struct task_struct *task, + struct stack_info *info) { - if (!stack) - goto unknown; - - task = task ? : current; - if (in_task_stack(stack, task, info)) - goto recursion_check; + return true; if (task != current) - goto unknown; + return false; if (in_exception_stack(stack, info)) - goto recursion_check; + return true; if (in_irq_stack(stack, info)) - goto recursion_check; + return true; if (in_entry_stack(stack, info)) - goto recursion_check; + return true; + + return false; +} + +int get_stack_info(unsigned long *stack, struct task_struct *task, + struct stack_info *info, unsigned long *visit_mask) +{ + task = task ? : current; + + if (!stack) + goto unknown; - goto unknown; + if (!get_stack_info_noinstr(stack, task, info)) + goto unknown; -recursion_check: /* * Make sure we don't iterate through any given stack more than once. 
* If it comes up a second time then there's something wrong going on: @@ -183,7 +206,8 @@ recursion_check: */ if (visit_mask) { if (*visit_mask & (1UL << info->type)) { - printk_deferred_once(KERN_WARNING "WARNING: stack recursion on stack type %d\n", info->type); + if (task == current) + printk_deferred_once(KERN_WARNING "WARNING: stack recursion on stack type %d\n", info->type); goto unknown; } *visit_mask |= 1UL << info->type; diff --git a/arch/x86/kernel/e820.c b/arch/x86/kernel/e820.c index c5399e80c59c..9dac24680ff8 100644 --- a/arch/x86/kernel/e820.c +++ b/arch/x86/kernel/e820.c @@ -31,8 +31,8 @@ * - inform the user about the firmware's notion of memory layout * via /sys/firmware/memmap * - * - the hibernation code uses it to generate a kernel-independent MD5 - * fingerprint of the physical memory layout of a system. + * - the hibernation code uses it to generate a kernel-independent CRC32 + * checksum of the physical memory layout of a system. * * - 'e820_table_kexec': a slightly modified (by the kernel) firmware version * passed to us by the bootloader - the major difference between @@ -305,6 +305,20 @@ static int __init cpcompare(const void *a, const void *b) return (ap->addr != ap->entry->addr) - (bp->addr != bp->entry->addr); } +static bool e820_nomerge(enum e820_type type) +{ + /* + * These types may indicate distinct platform ranges aligned to + * numa node, protection domain, performance domain, or other + * boundaries. Do not merge them. 
+ */ + if (type == E820_TYPE_PRAM) + return true; + if (type == E820_TYPE_SOFT_RESERVED) + return true; + return false; +} + int __init e820__update_table(struct e820_table *table) { struct e820_entry *entries = table->entries; @@ -380,7 +394,7 @@ int __init e820__update_table(struct e820_table *table) } /* Continue building up new map based on this information: */ - if (current_type != last_type || current_type == E820_TYPE_PRAM) { + if (current_type != last_type || e820_nomerge(current_type)) { if (last_type != 0) { new_entries[new_nr_entries].size = change_point[chg_idx]->addr - last_addr; /* Move forward only if the new size was non-zero: */ @@ -779,7 +793,7 @@ core_initcall(e820__register_nvs_regions); #endif /* - * Allocate the requested number of bytes with the requsted alignment + * Allocate the requested number of bytes with the requested alignment * and return (the physical address) to the caller. Also register this * range in the 'kexec' E820 table as a reserved range. * @@ -910,14 +924,6 @@ static int __init parse_memmap_one(char *p) return -EINVAL; if (!strncmp(p, "exactmap", 8)) { -#ifdef CONFIG_CRASH_DUMP - /* - * If we are doing a crash dump, we still need to know - * the real memory size before the original memory map is - * reset. 
- */ - saved_max_pfn = e820__end_of_ram_pfn(); -#endif e820_table->nr_entries = 0; userdef = 1; return 0; @@ -989,8 +995,10 @@ early_param("memmap", parse_memmap_opt); */ void __init e820__reserve_setup_data(void) { + struct setup_indirect *indirect; struct setup_data *data; - u64 pa_data; + u64 pa_data, pa_next; + u32 len; pa_data = boot_params.hdr.setup_data; if (!pa_data) @@ -998,21 +1006,46 @@ void __init e820__reserve_setup_data(void) while (pa_data) { data = early_memremap(pa_data, sizeof(*data)); + if (!data) { + pr_warn("e820: failed to memremap setup_data entry\n"); + return; + } + + len = sizeof(*data); + pa_next = data->next; + e820__range_update(pa_data, sizeof(*data)+data->len, E820_TYPE_RAM, E820_TYPE_RESERVED_KERN); - e820__range_update_kexec(pa_data, sizeof(*data)+data->len, E820_TYPE_RAM, E820_TYPE_RESERVED_KERN); - - if (data->type == SETUP_INDIRECT && - ((struct setup_indirect *)data->data)->type != SETUP_INDIRECT) { - e820__range_update(((struct setup_indirect *)data->data)->addr, - ((struct setup_indirect *)data->data)->len, - E820_TYPE_RAM, E820_TYPE_RESERVED_KERN); - e820__range_update_kexec(((struct setup_indirect *)data->data)->addr, - ((struct setup_indirect *)data->data)->len, + + /* + * SETUP_EFI and SETUP_IMA are supplied by kexec and do not need + * to be reserved. 
+ */ + if (data->type != SETUP_EFI && data->type != SETUP_IMA) + e820__range_update_kexec(pa_data, + sizeof(*data) + data->len, E820_TYPE_RAM, E820_TYPE_RESERVED_KERN); + + if (data->type == SETUP_INDIRECT) { + len += data->len; + early_memunmap(data, sizeof(*data)); + data = early_memremap(pa_data, len); + if (!data) { + pr_warn("e820: failed to memremap indirect setup_data\n"); + return; + } + + indirect = (struct setup_indirect *)data->data; + + if (indirect->type != SETUP_INDIRECT) { + e820__range_update(indirect->addr, indirect->len, + E820_TYPE_RAM, E820_TYPE_RESERVED_KERN); + e820__range_update_kexec(indirect->addr, indirect->len, + E820_TYPE_RAM, E820_TYPE_RESERVED_KERN); + } } - pa_data = data->next; - early_memunmap(data, sizeof(*data)); + pa_data = pa_next; + early_memunmap(data, len); } e820__update_table(e820_table); diff --git a/arch/x86/kernel/early-quirks.c b/arch/x86/kernel/early-quirks.c index 2f9ec14be3b1..a6c1867fc7aa 100644 --- a/arch/x86/kernel/early-quirks.c +++ b/arch/x86/kernel/early-quirks.c @@ -18,6 +18,7 @@ #include <linux/bcma/bcma_regs.h> #include <linux/platform_data/x86/apple.h> #include <drm/i915_drm.h> +#include <drm/i915_pciids.h> #include <asm/pci-direct.h> #include <asm/dma.h> #include <asm/io_apic.h> @@ -515,6 +516,7 @@ static const struct intel_early_ops gen11_early_ops __initconst = { .stolen_size = gen9_stolen_size, }; +/* Intel integrated GPUs for which we need to reserve "stolen memory" */ static const struct pci_device_id intel_early_ids[] __initconst = { INTEL_I830_IDS(&i830_early_ops), INTEL_I845G_IDS(&i845_early_ops), @@ -549,7 +551,14 @@ static const struct pci_device_id intel_early_ids[] __initconst = { INTEL_CNL_IDS(&gen9_early_ops), INTEL_ICL_11_IDS(&gen11_early_ops), INTEL_EHL_IDS(&gen11_early_ops), + INTEL_JSL_IDS(&gen11_early_ops), INTEL_TGL_12_IDS(&gen11_early_ops), + INTEL_RKL_IDS(&gen11_early_ops), + INTEL_ADLS_IDS(&gen11_early_ops), + INTEL_ADLP_IDS(&gen11_early_ops), + INTEL_ADLN_IDS(&gen11_early_ops), + 
INTEL_RPLS_IDS(&gen11_early_ops), + INTEL_RPLP_IDS(&gen11_early_ops), }; struct resource intel_graphics_stolen_res __ro_after_init = DEFINE_RES_MEM(0, 0); @@ -587,6 +596,13 @@ static void __init intel_graphics_quirks(int num, int slot, int func) u16 device; int i; + /* + * Reserve "stolen memory" for an integrated GPU. If we've already + * found one, there's nothing to do for other (discrete) GPUs. + */ + if (resource_size(&intel_graphics_stolen_res)) + return; + device = read_pci_config_16(num, slot, func, PCI_DEVICE_ID); for (i = 0; i < ARRAY_SIZE(intel_early_ids); i++) { @@ -699,7 +715,7 @@ static struct chipset early_qrk[] __initdata = { { PCI_VENDOR_ID_INTEL, 0x3406, PCI_CLASS_BRIDGE_HOST, PCI_BASE_CLASS_BRIDGE, 0, intel_remapping_check }, { PCI_VENDOR_ID_INTEL, PCI_ANY_ID, PCI_CLASS_DISPLAY_VGA, PCI_ANY_ID, - QFLAG_APPLY_ONCE, intel_graphics_quirks }, + 0, intel_graphics_quirks }, /* * HPET on the current version of the Baytrail platform has accuracy * problems: it will halt in deep idle state - so we disable it. 
@@ -710,12 +726,6 @@ static struct chipset early_qrk[] __initdata = { */ { PCI_VENDOR_ID_INTEL, 0x0f00, PCI_CLASS_BRIDGE_HOST, PCI_ANY_ID, 0, force_disable_hpet}, - { PCI_VENDOR_ID_INTEL, 0x3e20, - PCI_CLASS_BRIDGE_HOST, PCI_ANY_ID, 0, force_disable_hpet}, - { PCI_VENDOR_ID_INTEL, 0x3ec4, - PCI_CLASS_BRIDGE_HOST, PCI_ANY_ID, 0, force_disable_hpet}, - { PCI_VENDOR_ID_INTEL, 0x8a12, - PCI_CLASS_BRIDGE_HOST, PCI_ANY_ID, 0, force_disable_hpet}, { PCI_VENDOR_ID_BROADCOM, 0x4331, PCI_CLASS_NETWORK_OTHER, PCI_ANY_ID, 0, apple_airport_reset}, {} diff --git a/arch/x86/kernel/early_printk.c b/arch/x86/kernel/early_printk.c index 9b33904251a9..44f937015e1e 100644 --- a/arch/x86/kernel/early_printk.c +++ b/arch/x86/kernel/early_printk.c @@ -8,6 +8,7 @@ #include <linux/pci_regs.h> #include <linux/pci_ids.h> #include <linux/errno.h> +#include <linux/pgtable.h> #include <asm/io.h> #include <asm/processor.h> #include <asm/fcntl.h> @@ -15,12 +16,8 @@ #include <xen/hvc-console.h> #include <asm/pci-direct.h> #include <asm/fixmap.h> -#include <asm/intel-mid.h> -#include <asm/pgtable.h> #include <linux/usb/ehci_def.h> #include <linux/usb/xhci-dbgp.h> -#include <linux/efi.h> -#include <asm/efi.h> #include <asm/pci_x86.h> /* Simple VGA output */ @@ -267,11 +264,11 @@ static __init void early_pci_serial_init(char *s) bar0 = read_pci_config(bus, slot, func, PCI_BASE_ADDRESS_0); /* - * Verify it is a UART type device + * Verify it is a 16550-UART type device */ if (((classcode >> 16 != PCI_CLASS_COMMUNICATION_MODEM) && (classcode >> 16 != PCI_CLASS_COMMUNICATION_SERIAL)) || - (((classcode >> 8) & 0xff) != 0x02)) /* 16550 I/F at BAR0 */ { + (((classcode >> 8) & 0xff) != PCI_SERIAL_16550_COMPATIBLE)) { if (!force) return; } @@ -279,22 +276,22 @@ static __init void early_pci_serial_init(char *s) /* * Determine if it is IO or memory mapped */ - if (bar0 & 0x01) { + if ((bar0 & PCI_BASE_ADDRESS_SPACE) == PCI_BASE_ADDRESS_SPACE_IO) { /* it is IO mapped */ serial_in = io_serial_in; serial_out = 
io_serial_out; - early_serial_base = bar0&0xfffffffc; + early_serial_base = bar0 & PCI_BASE_ADDRESS_IO_MASK; write_pci_config(bus, slot, func, PCI_COMMAND, - cmdreg|PCI_COMMAND_IO); + cmdreg|PCI_COMMAND_IO); } else { /* It is memory mapped - assume 32-bit alignment */ serial_in = mem32_serial_in; serial_out = mem32_serial_out; /* WARNING! assuming the address is always in the first 4G */ early_serial_base = - (unsigned long)early_ioremap(bar0 & 0xfffffff0, 0x10); + (unsigned long)early_ioremap(bar0 & PCI_BASE_ADDRESS_MEM_MASK, 0x10); write_pci_config(bus, slot, func, PCI_COMMAND, - cmdreg|PCI_COMMAND_MEMORY); + cmdreg|PCI_COMMAND_MEMORY); } /* @@ -390,7 +387,7 @@ static int __init setup_early_printk(char *buf) #endif #ifdef CONFIG_EARLY_PRINTK_USB_XDBC if (!strncmp(buf, "xdbc", 4)) - early_xdbc_parse_parameter(buf + 4); + early_xdbc_parse_parameter(buf + 4, keep); #endif buf++; diff --git a/arch/x86/kernel/espfix_64.c b/arch/x86/kernel/espfix_64.c index 12e7d4406c32..9417d5aa7305 100644 --- a/arch/x86/kernel/espfix_64.c +++ b/arch/x86/kernel/espfix_64.c @@ -29,7 +29,7 @@ #include <linux/percpu.h> #include <linux/gfp.h> #include <linux/random.h> -#include <asm/pgtable.h> +#include <linux/pgtable.h> #include <asm/pgalloc.h> #include <asm/setup.h> #include <asm/espfix.h> @@ -100,7 +100,7 @@ static void init_espfix_random(void) * This is run before the entropy pools are initialized, * but this is hopefully better than nothing. 
*/ - if (!arch_get_random_long(&rand)) { + if (!arch_get_random_longs(&rand, 1)) { /* The constant is an arbitrary large prime */ rand = rdtsc(); rand *= 0xc345c6b72fd16123UL; diff --git a/arch/x86/kernel/fpu/bugs.c b/arch/x86/kernel/fpu/bugs.c index 2954fab15e51..794e70151203 100644 --- a/arch/x86/kernel/fpu/bugs.c +++ b/arch/x86/kernel/fpu/bugs.c @@ -2,7 +2,7 @@ /* * x86 FPU bug checks: */ -#include <asm/fpu/internal.h> +#include <asm/fpu/api.h> /* * Boot time CPU/FPU FDIV bug detection code: diff --git a/arch/x86/kernel/fpu/context.h b/arch/x86/kernel/fpu/context.h new file mode 100644 index 000000000000..958accf2ccf0 --- /dev/null +++ b/arch/x86/kernel/fpu/context.h @@ -0,0 +1,83 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef __X86_KERNEL_FPU_CONTEXT_H +#define __X86_KERNEL_FPU_CONTEXT_H + +#include <asm/fpu/xstate.h> +#include <asm/trace/fpu.h> + +/* Functions related to FPU context tracking */ + +/* + * The in-register FPU state for an FPU context on a CPU is assumed to be + * valid if the fpu->last_cpu matches the CPU, and the fpu_fpregs_owner_ctx + * matches the FPU. + * + * If the FPU register state is valid, the kernel can skip restoring the + * FPU state from memory. + * + * Any code that clobbers the FPU registers or updates the in-memory + * FPU state for a task MUST let the rest of the kernel know that the + * FPU registers are no longer valid for this task. + * + * Either one of these invalidation functions is enough. Invalidate + * a resource you control: CPU if using the CPU for something else + * (with preemption disabled), FPU for the current task, or a task that + * is prevented from running by the current task. 
+ */ +static inline void __cpu_invalidate_fpregs_state(void) +{ + __this_cpu_write(fpu_fpregs_owner_ctx, NULL); +} + +static inline void __fpu_invalidate_fpregs_state(struct fpu *fpu) +{ + fpu->last_cpu = -1; +} + +static inline int fpregs_state_valid(struct fpu *fpu, unsigned int cpu) +{ + return fpu == this_cpu_read(fpu_fpregs_owner_ctx) && cpu == fpu->last_cpu; +} + +static inline void fpregs_deactivate(struct fpu *fpu) +{ + __this_cpu_write(fpu_fpregs_owner_ctx, NULL); + trace_x86_fpu_regs_deactivated(fpu); +} + +static inline void fpregs_activate(struct fpu *fpu) +{ + __this_cpu_write(fpu_fpregs_owner_ctx, fpu); + trace_x86_fpu_regs_activated(fpu); +} + +/* Internal helper for switch_fpu_return() and signal frame setup */ +static inline void fpregs_restore_userregs(void) +{ + struct fpu *fpu = &current->thread.fpu; + int cpu = smp_processor_id(); + + if (WARN_ON_ONCE(current->flags & PF_KTHREAD)) + return; + + if (!fpregs_state_valid(fpu, cpu)) { + /* + * This restores _all_ xstate which has not been + * established yet. + * + * If PKRU is enabled, then the PKRU value is already + * correct because it was either set in switch_to() or in + * flush_thread(). So it is excluded because it might be + * not up to date in current->thread.fpu.xsave state. + * + * XFD state is handled in restore_fpregs_from_fpstate().
+ */ + restore_fpregs_from_fpstate(fpu->fpstate, XFEATURE_MASK_FPSTATE); + + fpregs_activate(fpu); + fpu->last_cpu = cpu; + } + clear_thread_flag(TIF_NEED_FPU_LOAD); +} + +#endif diff --git a/arch/x86/kernel/fpu/core.c b/arch/x86/kernel/fpu/core.c index 12c70840980e..3b28c5b25e12 100644 --- a/arch/x86/kernel/fpu/core.c +++ b/arch/x86/kernel/fpu/core.c @@ -6,36 +6,44 @@ * General FPU state handling cleanups * Gareth Hughes <gareth@valinux.com>, May 2000 */ -#include <asm/fpu/internal.h> +#include <asm/fpu/api.h> #include <asm/fpu/regset.h> +#include <asm/fpu/sched.h> #include <asm/fpu/signal.h> #include <asm/fpu/types.h> #include <asm/traps.h> #include <asm/irq_regs.h> +#include <uapi/asm/kvm.h> + #include <linux/hardirq.h> #include <linux/pkeys.h> +#include <linux/vmalloc.h> + +#include "context.h" +#include "internal.h" +#include "legacy.h" +#include "xstate.h" #define CREATE_TRACE_POINTS #include <asm/trace/fpu.h> +#ifdef CONFIG_X86_64 +DEFINE_STATIC_KEY_FALSE(__fpu_state_size_dynamic); +DEFINE_PER_CPU(u64, xfd_state); +#endif + +/* The FPU state configuration data for kernel and user space */ +struct fpu_state_config fpu_kernel_cfg __ro_after_init; +struct fpu_state_config fpu_user_cfg __ro_after_init; + /* * Represents the initial FPU state. It's mostly (but not completely) zeroes, * depending on the FPU hardware format: */ -union fpregs_state init_fpstate __read_mostly; +struct fpstate init_fpstate __ro_after_init; -/* - * Track whether the kernel is using the FPU state - * currently. - * - * This flag is used: - * - * - by IRQ context code to potentially use the FPU - * if it's unused. 
- * - * - to debug kernel_fpu_begin()/end() correctness - */ +/* Track in-kernel FPU usage */ static DEFINE_PER_CPU(bool, in_kernel_fpu); /* @@ -43,46 +51,376 @@ static DEFINE_PER_CPU(bool, in_kernel_fpu); */ DEFINE_PER_CPU(struct fpu *, fpu_fpregs_owner_ctx); -static bool kernel_fpu_disabled(void) +/* + * Can we use the FPU in kernel mode with the + * whole "kernel_fpu_begin/end()" sequence? + */ +bool irq_fpu_usable(void) { - return this_cpu_read(in_kernel_fpu); + if (WARN_ON_ONCE(in_nmi())) + return false; + + /* In kernel FPU usage already active? */ + if (this_cpu_read(in_kernel_fpu)) + return false; + + /* + * When not in NMI or hard interrupt context, FPU can be used in: + * + * - Task context except from within fpregs_lock()'ed critical + * regions. + * + * - Soft interrupt processing context which cannot happen + * while in a fpregs_lock()'ed critical region. + */ + if (!in_hardirq()) + return true; + + /* + * In hard interrupt context it's safe when soft interrupts + * are enabled, which means the interrupt did not hit in + * a fpregs_lock()'ed critical region. + */ + return !softirq_count(); } +EXPORT_SYMBOL(irq_fpu_usable); -static bool interrupted_kernel_fpu_idle(void) +/* + * Track AVX512 state use because it is known to slow the max clock + * speed of the core. + */ +static void update_avx_timestamp(struct fpu *fpu) { - return !kernel_fpu_disabled(); + +#define AVX512_TRACKING_MASK (XFEATURE_MASK_ZMM_Hi256 | XFEATURE_MASK_Hi16_ZMM) + + if (fpu->fpstate->regs.xsave.header.xfeatures & AVX512_TRACKING_MASK) + fpu->avx512_timestamp = jiffies; } /* - * Were we in user mode (or vm86 mode) when we were - * interrupted? + * Save the FPU register state in fpu->fpstate->regs. The register state is + * preserved. + * + * Must be called with fpregs_lock() held. * - * Doing kernel_fpu_begin/end() is ok if we are running - * in an interrupt context from user mode - we'll just - * save the FPU state as required. 
+ * The legacy FNSAVE instruction clears all FPU state unconditionally, so + * register state has to be reloaded. That might be a pointless exercise + * when the FPU is going to be used by another task right after that. But + * this only affects 20+ years old 32bit systems and avoids conditionals all + * over the place. + * + * FXSAVE and all XSAVE variants preserve the FPU register state. */ -static bool interrupted_user_mode(void) +void save_fpregs_to_fpstate(struct fpu *fpu) +{ + if (likely(use_xsave())) { + os_xsave(fpu->fpstate); + update_avx_timestamp(fpu); + return; + } + + if (likely(use_fxsr())) { + fxsave(&fpu->fpstate->regs.fxsave); + return; + } + + /* + * Legacy FPU register saving, FNSAVE always clears FPU registers, + * so we have to reload them from the memory state. + */ + asm volatile("fnsave %[fp]; fwait" : [fp] "=m" (fpu->fpstate->regs.fsave)); + frstor(&fpu->fpstate->regs.fsave); +} + +void restore_fpregs_from_fpstate(struct fpstate *fpstate, u64 mask) +{ + /* + * AMD K7/K8 and later CPUs up to Zen don't save/restore + * FDP/FIP/FOP unless an exception is pending. Clear the x87 state + * here by setting it to fixed values. "m" is a random variable + * that should be in L1. + */ + if (unlikely(static_cpu_has_bug(X86_BUG_FXSAVE_LEAK))) { + asm volatile( + "fnclex\n\t" + "emms\n\t" + "fildl %P[addr]" /* set F?P to defined value */ + : : [addr] "m" (fpstate)); + } + + if (use_xsave()) { + /* + * Dynamically enabled features are enabled in XCR0, but + * usage requires also that the corresponding bits in XFD + * are cleared. If the bits are set then using a related + * instruction will raise #NM. This allows to do the + * allocation of the larger FPU buffer lazy from #NM or if + * the task has no permission to kill it which would happen + * via #UD if the feature is disabled in XCR0. 
+ * + * XFD state is following the same life time rules as + * XSTATE and to restore state correctly XFD has to be + * updated before XRSTORS otherwise the component would + * stay in or go into init state even if the bits are set + * in fpstate::regs::xsave::xfeatures. + */ + xfd_update_state(fpstate); + + /* + * Restoring state always needs to modify all features + * which are in @mask even if the current task cannot use + * extended features. + * + * So fpstate->xfeatures cannot be used here, because then + * a feature for which the task has no permission but was + * used by the previous task would not go into init state. + */ + mask = fpu_kernel_cfg.max_features & mask; + + os_xrstor(fpstate, mask); + } else { + if (use_fxsr()) + fxrstor(&fpstate->regs.fxsave); + else + frstor(&fpstate->regs.fsave); + } +} + +void fpu_reset_from_exception_fixup(void) { - struct pt_regs *regs = get_irq_regs(); - return regs && user_mode(regs); + restore_fpregs_from_fpstate(&init_fpstate, XFEATURE_MASK_FPSTATE); } +#if IS_ENABLED(CONFIG_KVM) +static void __fpstate_reset(struct fpstate *fpstate, u64 xfd); + +static void fpu_init_guest_permissions(struct fpu_guest *gfpu) +{ + struct fpu_state_perm *fpuperm; + u64 perm; + + if (!IS_ENABLED(CONFIG_X86_64)) + return; + + spin_lock_irq(&current->sighand->siglock); + fpuperm = &current->group_leader->thread.fpu.guest_perm; + perm = fpuperm->__state_perm; + + /* First fpstate allocation locks down permissions.
*/ + WRITE_ONCE(fpuperm->__state_perm, perm | FPU_GUEST_PERM_LOCKED); + + spin_unlock_irq(&current->sighand->siglock); + + gfpu->perm = perm & ~FPU_GUEST_PERM_LOCKED; +} + +bool fpu_alloc_guest_fpstate(struct fpu_guest *gfpu) +{ + struct fpstate *fpstate; + unsigned int size; + + size = fpu_user_cfg.default_size + ALIGN(offsetof(struct fpstate, regs), 64); + fpstate = vzalloc(size); + if (!fpstate) + return false; + + /* Leave xfd to 0 (the reset value defined by spec) */ + __fpstate_reset(fpstate, 0); + fpstate_init_user(fpstate); + fpstate->is_valloc = true; + fpstate->is_guest = true; + + gfpu->fpstate = fpstate; + gfpu->xfeatures = fpu_user_cfg.default_features; + gfpu->perm = fpu_user_cfg.default_features; + + /* + * KVM sets the FP+SSE bits in the XSAVE header when copying FPU state + * to userspace, even when XSAVE is unsupported, so that restoring FPU + * state on a different CPU that does support XSAVE can cleanly load + * the incoming state using its natural XSAVE. In other words, KVM's + * uABI size may be larger than this host's default size. Conversely, + * the default size should never be larger than KVM's base uABI size; + * all features that can expand the uABI size must be opt-in. + */ + gfpu->uabi_size = sizeof(struct kvm_xsave); + if (WARN_ON_ONCE(fpu_user_cfg.default_size > gfpu->uabi_size)) + gfpu->uabi_size = fpu_user_cfg.default_size; + + fpu_init_guest_permissions(gfpu); + + return true; +} +EXPORT_SYMBOL_GPL(fpu_alloc_guest_fpstate); + +void fpu_free_guest_fpstate(struct fpu_guest *gfpu) +{ + struct fpstate *fps = gfpu->fpstate; + + if (!fps) + return; + + if (WARN_ON_ONCE(!fps->is_valloc || !fps->is_guest || fps->in_use)) + return; + + gfpu->fpstate = NULL; + vfree(fps); +} +EXPORT_SYMBOL_GPL(fpu_free_guest_fpstate); + /* - * Can we use the FPU in kernel mode with the - * whole "kernel_fpu_begin/end()" sequence?
+ * fpu_enable_guest_xfd_features - Check xfeatures against guest perm and enable + * @guest_fpu: Pointer to the guest FPU container + * @xfeatures: Features requested by guest CPUID + * + * Enable all dynamic xfeatures according to guest perm and requested CPUID. + * + * Return: 0 on success, error code otherwise + */ +int fpu_enable_guest_xfd_features(struct fpu_guest *guest_fpu, u64 xfeatures) +{ + lockdep_assert_preemption_enabled(); + + /* Nothing to do if all requested features are already enabled. */ + xfeatures &= ~guest_fpu->xfeatures; + if (!xfeatures) + return 0; + + return __xfd_enable_feature(xfeatures, guest_fpu); +} +EXPORT_SYMBOL_GPL(fpu_enable_guest_xfd_features); + +#ifdef CONFIG_X86_64 +void fpu_update_guest_xfd(struct fpu_guest *guest_fpu, u64 xfd) +{ + fpregs_lock(); + guest_fpu->fpstate->xfd = xfd; + if (guest_fpu->fpstate->in_use) + xfd_update_state(guest_fpu->fpstate); + fpregs_unlock(); +} +EXPORT_SYMBOL_GPL(fpu_update_guest_xfd); + +/** + * fpu_sync_guest_vmexit_xfd_state - Synchronize XFD MSR and software state + * + * Must be invoked from KVM after a VMEXIT before enabling interrupts when + * XFD write emulation is disabled. This is required because the guest can + * freely modify XFD and the state at VMEXIT is not guaranteed to be the + * same as the state on VMENTER. So software state has to be updated before + * any operation which depends on it can take place. * - * It's always ok in process context (ie "not interrupt") - * but it is sometimes ok even from an irq. + * Note: It can be invoked unconditionally even when write emulation is + * enabled for the price of a then pointless MSR read.
*/ -bool irq_fpu_usable(void) +void fpu_sync_guest_vmexit_xfd_state(void) { - return !in_interrupt() || - interrupted_user_mode() || - interrupted_kernel_fpu_idle(); + struct fpstate *fps = current->thread.fpu.fpstate; + + lockdep_assert_irqs_disabled(); + if (fpu_state_size_dynamic()) { + rdmsrl(MSR_IA32_XFD, fps->xfd); + __this_cpu_write(xfd_state, fps->xfd); + } } -EXPORT_SYMBOL(irq_fpu_usable); +EXPORT_SYMBOL_GPL(fpu_sync_guest_vmexit_xfd_state); +#endif /* CONFIG_X86_64 */ + +int fpu_swap_kvm_fpstate(struct fpu_guest *guest_fpu, bool enter_guest) +{ + struct fpstate *guest_fps = guest_fpu->fpstate; + struct fpu *fpu = &current->thread.fpu; + struct fpstate *cur_fps = fpu->fpstate; + + fpregs_lock(); + if (!cur_fps->is_confidential && !test_thread_flag(TIF_NEED_FPU_LOAD)) + save_fpregs_to_fpstate(fpu); + + /* Swap fpstate */ + if (enter_guest) { + fpu->__task_fpstate = cur_fps; + fpu->fpstate = guest_fps; + guest_fps->in_use = true; + } else { + guest_fps->in_use = false; + fpu->fpstate = fpu->__task_fpstate; + fpu->__task_fpstate = NULL; + } + + cur_fps = fpu->fpstate; + + if (!cur_fps->is_confidential) { + /* Includes XFD update */ + restore_fpregs_from_fpstate(cur_fps, XFEATURE_MASK_FPSTATE); + } else { + /* + * XSTATE is restored by firmware from encrypted + * memory.
Make sure XFD state is correct while + * running with guest fpstate + */ + xfd_update_state(cur_fps); + } + + fpregs_mark_activate(); + fpregs_unlock(); + return 0; +} +EXPORT_SYMBOL_GPL(fpu_swap_kvm_fpstate); + +void fpu_copy_guest_fpstate_to_uabi(struct fpu_guest *gfpu, void *buf, + unsigned int size, u32 pkru) +{ + struct fpstate *kstate = gfpu->fpstate; + union fpregs_state *ustate = buf; + struct membuf mb = { .p = buf, .left = size }; + + if (cpu_feature_enabled(X86_FEATURE_XSAVE)) { + __copy_xstate_to_uabi_buf(mb, kstate, pkru, XSTATE_COPY_XSAVE); + } else { + memcpy(&ustate->fxsave, &kstate->regs.fxsave, + sizeof(ustate->fxsave)); + /* Make it restorable on a XSAVE enabled host */ + ustate->xsave.header.xfeatures = XFEATURE_MASK_FPSSE; + } +} +EXPORT_SYMBOL_GPL(fpu_copy_guest_fpstate_to_uabi); + +int fpu_copy_uabi_to_guest_fpstate(struct fpu_guest *gfpu, const void *buf, + u64 xcr0, u32 *vpkru) +{ + struct fpstate *kstate = gfpu->fpstate; + const union fpregs_state *ustate = buf; + struct pkru_state *xpkru; + int ret; + + if (!cpu_feature_enabled(X86_FEATURE_XSAVE)) { + if (ustate->xsave.header.xfeatures & ~XFEATURE_MASK_FPSSE) + return -EINVAL; + if (ustate->fxsave.mxcsr & ~mxcsr_feature_mask) + return -EINVAL; + memcpy(&kstate->regs.fxsave, &ustate->fxsave, sizeof(ustate->fxsave)); + return 0; + } + + if (ustate->xsave.header.xfeatures & ~xcr0) + return -EINVAL; + + ret = copy_uabi_from_kernel_to_xstate(kstate, ustate); + if (ret) + return ret; -void kernel_fpu_begin(void) + /* Retrieve PKRU if not in init state */ + if (kstate->regs.xsave.header.xfeatures & XFEATURE_MASK_PKRU) { + xpkru = get_xsave_addr(&kstate->regs.xsave, XFEATURE_PKRU); + *vpkru = xpkru->pkru; + } + return 0; +} +EXPORT_SYMBOL_GPL(fpu_copy_uabi_to_guest_fpstate); +#endif /* CONFIG_KVM */ + +void kernel_fpu_begin_mask(unsigned int kfpu_mask) { preempt_disable(); @@ -94,15 +432,18 @@ void kernel_fpu_begin(void) if (!(current->flags & PF_KTHREAD) && !test_thread_flag(TIF_NEED_FPU_LOAD)) 
{ set_thread_flag(TIF_NEED_FPU_LOAD); - /* - * Ignore return value -- we don't care if reg state - * is clobbered. - */ - copy_fpregs_to_fpstate(&current->thread.fpu); + save_fpregs_to_fpstate(&current->thread.fpu); } __cpu_invalidate_fpregs_state(); + + /* Put sane initial values into the control registers. */ + if (likely(kfpu_mask & KFPU_MXCSR) && boot_cpu_has(X86_FEATURE_XMM)) + ldmxcsr(MXCSR_DEFAULT); + + if (unlikely(kfpu_mask & KFPU_387) && boot_cpu_has(X86_FEATURE_FPU)) + asm volatile ("fninit"); } -EXPORT_SYMBOL_GPL(kernel_fpu_begin); +EXPORT_SYMBOL_GPL(kernel_fpu_begin_mask); void kernel_fpu_end(void) { @@ -114,92 +455,166 @@ void kernel_fpu_end(void) EXPORT_SYMBOL_GPL(kernel_fpu_end); /* - * Save the FPU state (mark it for reload if necessary): - * - * This only ever gets called for the current task. + * Sync the FPU register state to current's memory register state when the + * current task owns the FPU. The hardware register state is preserved. */ -void fpu__save(struct fpu *fpu) +void fpu_sync_fpstate(struct fpu *fpu) { WARN_ON_FPU(fpu != &current->thread.fpu); fpregs_lock(); trace_x86_fpu_before_save(fpu); - if (!test_thread_flag(TIF_NEED_FPU_LOAD)) { - if (!copy_fpregs_to_fpstate(fpu)) { - copy_kernel_to_fpregs(&fpu->state); - } - } + if (!test_thread_flag(TIF_NEED_FPU_LOAD)) + save_fpregs_to_fpstate(fpu); trace_x86_fpu_after_save(fpu); fpregs_unlock(); } +static inline unsigned int init_fpstate_copy_size(void) +{ + if (!use_xsave()) + return fpu_kernel_cfg.default_size; + + /* XSAVE(S) just needs the legacy and the xstate header part */ + return sizeof(init_fpstate.regs.xsave); +} + +static inline void fpstate_init_fxstate(struct fpstate *fpstate) +{ + fpstate->regs.fxsave.cwd = 0x37f; + fpstate->regs.fxsave.mxcsr = MXCSR_DEFAULT; +} + /* * Legacy x87 fpstate state init: */ -static inline void fpstate_init_fstate(struct fregs_state *fp) +static inline void fpstate_init_fstate(struct fpstate *fpstate) { - fp->cwd = 0xffff037fu; - fp->swd = 0xffff0000u; - fp->twd =
0xffffffffu; - fp->fos = 0xffff0000u; + fpstate->regs.fsave.cwd = 0xffff037fu; + fpstate->regs.fsave.swd = 0xffff0000u; + fpstate->regs.fsave.twd = 0xffffffffu; + fpstate->regs.fsave.fos = 0xffff0000u; } -void fpstate_init(union fpregs_state *state) +/* + * Used in two places: + * 1) Early boot to setup init_fpstate for non XSAVE systems + * 2) fpu_init_fpstate_user() which is invoked from KVM + */ +void fpstate_init_user(struct fpstate *fpstate) { - if (!static_cpu_has(X86_FEATURE_FPU)) { - fpstate_init_soft(&state->soft); + if (!cpu_feature_enabled(X86_FEATURE_FPU)) { + fpstate_init_soft(&fpstate->regs.soft); return; } - memset(state, 0, fpu_kernel_xstate_size); + xstate_init_xcomp_bv(&fpstate->regs.xsave, fpstate->xfeatures); - if (static_cpu_has(X86_FEATURE_XSAVES)) - fpstate_init_xstate(&state->xsave); - if (static_cpu_has(X86_FEATURE_FXSR)) - fpstate_init_fxstate(&state->fxsave); + if (cpu_feature_enabled(X86_FEATURE_FXSR)) + fpstate_init_fxstate(fpstate); else - fpstate_init_fstate(&state->fsave); + fpstate_init_fstate(fpstate); +} + +static void __fpstate_reset(struct fpstate *fpstate, u64 xfd) +{ + /* Initialize sizes and feature masks */ + fpstate->size = fpu_kernel_cfg.default_size; + fpstate->user_size = fpu_user_cfg.default_size; + fpstate->xfeatures = fpu_kernel_cfg.default_features; + fpstate->user_xfeatures = fpu_user_cfg.default_features; + fpstate->xfd = xfd; +} + +void fpstate_reset(struct fpu *fpu) +{ + /* Set the fpstate pointer to the default fpstate */ + fpu->fpstate = &fpu->__fpstate; + __fpstate_reset(fpu->fpstate, init_fpstate.xfd); + + /* Initialize the permission related info in fpu */ + fpu->perm.__state_perm = fpu_kernel_cfg.default_features; + fpu->perm.__state_size = fpu_kernel_cfg.default_size; + fpu->perm.__user_state_size = fpu_user_cfg.default_size; + /* Same defaults for guests */ + fpu->guest_perm = fpu->perm; +} + +static inline void fpu_inherit_perms(struct fpu *dst_fpu) +{ + if (fpu_state_size_dynamic()) { + struct fpu 
*src_fpu = &current->group_leader->thread.fpu; + + spin_lock_irq(&current->sighand->siglock); + /* Fork also inherits the permissions of the parent */ + dst_fpu->perm = src_fpu->perm; + dst_fpu->guest_perm = src_fpu->guest_perm; + spin_unlock_irq(&current->sighand->siglock); + } } -EXPORT_SYMBOL_GPL(fpstate_init); -int fpu__copy(struct task_struct *dst, struct task_struct *src) +/* Clone current's FPU state on fork */ +int fpu_clone(struct task_struct *dst, unsigned long clone_flags, bool minimal) { + struct fpu *src_fpu = &current->thread.fpu; struct fpu *dst_fpu = &dst->thread.fpu; - struct fpu *src_fpu = &src->thread.fpu; + /* The new task's FPU state cannot be valid in the hardware. */ dst_fpu->last_cpu = -1; - if (!static_cpu_has(X86_FEATURE_FPU)) + fpstate_reset(dst_fpu); + + if (!cpu_feature_enabled(X86_FEATURE_FPU)) return 0; - WARN_ON_FPU(src_fpu != &current->thread.fpu); + /* + * Enforce reload for user space tasks and prevent kernel threads + * from trying to save the FPU registers on context switch. + */ + set_tsk_thread_flag(dst, TIF_NEED_FPU_LOAD); /* - * Don't let 'init optimized' areas of the XSAVE area - * leak into the child task: + * No FPU state inheritance for kernel threads and IO + * worker threads. */ - memset(&dst_fpu->state.xsave, 0, fpu_kernel_xstate_size); + if (minimal) { + /* Clear out the minimal state */ + memcpy(&dst_fpu->fpstate->regs, &init_fpstate.regs, + init_fpstate_copy_size()); + return 0; + } /* - * If the FPU registers are not current just memcpy() the state. - * Otherwise save current FPU registers directly into the child's FPU - * context, without any memory-to-memory copying. + * If a new feature is added, ensure all dynamic features are + * caller-saved from here! + */ + BUILD_BUG_ON(XFEATURE_MASK_USER_DYNAMIC != XFEATURE_MASK_XTILE_DATA); + + /* + * Save the default portion of the current FPU state into the + * clone.
Assume all dynamic features to be defined as caller- + * saved, which enables skipping both the expansion of fpstate + * and the copying of any dynamic state. * - * ( The function 'fails' in the FNSAVE case, which destroys - * register contents so we have to load them back. ) + * Do not use memcpy() when TIF_NEED_FPU_LOAD is set because + * copying is not valid when current uses non-default states. */ fpregs_lock(); if (test_thread_flag(TIF_NEED_FPU_LOAD)) - memcpy(&dst_fpu->state, &src_fpu->state, fpu_kernel_xstate_size); - - else if (!copy_fpregs_to_fpstate(dst_fpu)) - copy_kernel_to_fpregs(&dst_fpu->state); - + fpregs_restore_userregs(); + save_fpregs_to_fpstate(dst_fpu); + if (!(clone_flags & CLONE_THREAD)) + fpu_inherit_perms(dst_fpu); fpregs_unlock(); - set_tsk_thread_flag(dst, TIF_NEED_FPU_LOAD); + /* + * Children never inherit PASID state. + * Force it to have its init value: + */ + if (use_xsave()) + dst_fpu->fpstate->regs.xsave.header.xfeatures &= ~XFEATURE_MASK_PASID; trace_x86_fpu_copy_src(src_fpu); trace_x86_fpu_copy_dst(dst_fpu); @@ -208,60 +623,13 @@ int fpu__copy(struct task_struct *dst, struct task_struct *src) } /* - * Activate the current task's in-memory FPU context, - * if it has not been used before: - */ -static void fpu__initialize(struct fpu *fpu) -{ - WARN_ON_FPU(fpu != &current->thread.fpu); - - set_thread_flag(TIF_NEED_FPU_LOAD); - fpstate_init(&fpu->state); - trace_x86_fpu_init_state(fpu); -} - -/* - * This function must be called before we read a task's fpstate. - * - * There's two cases where this gets called: - * - * - for the current task (when coredumping), in which case we have - * to save the latest FPU registers into the fpstate, - * - * - or it's called for stopped tasks (ptrace), in which case the - * registers were already saved by the context-switch code when - * the task scheduled out. - * - * If the task has used the FPU before then save it.
- */ -void fpu__prepare_read(struct fpu *fpu) -{ - if (fpu == &current->thread.fpu) - fpu__save(fpu); -} - -/* - * This function must be called before we write a task's fpstate. - * - * Invalidate any cached FPU registers. - * - * After this function call, after registers in the fpstate are - * modified and the child task has woken up, the child task will - * restore the modified FPU state from the modified context. If we - * didn't clear its cached status here then the cached in-registers - * state pending on its former CPU could be restored, corrupting - * the modifications. + * Whitelist the FPU register state embedded into task_struct for hardened + * usercopy. */ -void fpu__prepare_write(struct fpu *fpu) +void fpu_thread_struct_whitelist(unsigned long *offset, unsigned long *size) { - /* - * Only stopped child tasks can be used to modify the FPU - * state in the fpstate buffer: - */ - WARN_ON_FPU(fpu == &current->thread.fpu); - - /* Invalidate any cached state: */ - __fpu_invalidate_fpregs_state(fpu); + *offset = offsetof(struct thread_struct, fpu.__fpstate.regs); + *size = fpu_kernel_cfg.default_size; } /* @@ -291,47 +659,91 @@ void fpu__drop(struct fpu *fpu) } /* - * Clear FPU registers by setting them up from - * the init fpstate: + * Clear FPU registers by setting them up from the init fpstate. + * Caller must do fpregs_[un]lock() around it.
*/ -static inline void copy_init_fpstate_to_fpregs(void) +static inline void restore_fpregs_from_init_fpstate(u64 features_mask) { - fpregs_lock(); - if (use_xsave()) - copy_kernel_to_xregs(&init_fpstate.xsave, -1); - else if (static_cpu_has(X86_FEATURE_FXSR)) - copy_kernel_to_fxregs(&init_fpstate.fxsave); + os_xrstor(&init_fpstate, features_mask); + else if (use_fxsr()) + fxrstor(&init_fpstate.regs.fxsave); else - copy_kernel_to_fregs(&init_fpstate.fsave); + frstor(&init_fpstate.regs.fsave); - if (boot_cpu_has(X86_FEATURE_OSPKE)) - copy_init_pkru_to_fpregs(); + pkru_write_default(); +} - fpregs_mark_activate(); +/* + * Reset current->fpu memory state to the init values. + */ +static void fpu_reset_fpregs(void) +{ + struct fpu *fpu = &current->thread.fpu; + + fpregs_lock(); + fpu__drop(fpu); + /* + * This does not change the actual hardware registers. It just + * resets the memory image and sets TIF_NEED_FPU_LOAD so a + * subsequent return to usermode will reload the registers from the + * task's memory image. + * + * Do not use fpstate_init() here. Just copy init_fpstate which has + * the correct content already except for PKRU. + * + * PKRU handling does not rely on the xstate when restoring for + * user space as PKRU is eagerly written in switch_to() and + * flush_thread(). + */ + memcpy(&fpu->fpstate->regs, &init_fpstate.regs, init_fpstate_copy_size()); + set_thread_flag(TIF_NEED_FPU_LOAD); fpregs_unlock(); } /* - * Clear the FPU state back to init state. - * - * Called by sys_execve(), by the signal handler code and by various - * error paths. + * Reset current's user FPU states to the init states. current's + * supervisor states, if any, are not modified by this function. The + * caller guarantees that the XSTATE header in memory is intact.
*/ -void fpu__clear(struct fpu *fpu) +void fpu__clear_user_states(struct fpu *fpu) { - WARN_ON_FPU(fpu != &current->thread.fpu); /* Almost certainly an anomaly */ + WARN_ON_FPU(fpu != &current->thread.fpu); - fpu__drop(fpu); + fpregs_lock(); + if (!cpu_feature_enabled(X86_FEATURE_FPU)) { + fpu_reset_fpregs(); + fpregs_unlock(); + return; + } /* - * Make sure fpstate is cleared and initialized. + * Ensure that current's supervisor states are loaded into their + * corresponding registers. */ - fpu__initialize(fpu); - if (static_cpu_has(X86_FEATURE_FPU)) - copy_init_fpstate_to_fpregs(); + if (xfeatures_mask_supervisor() && + !fpregs_state_valid(fpu, smp_processor_id())) + os_xrstor_supervisor(fpu->fpstate); + + /* Reset user states in registers. */ + restore_fpregs_from_init_fpstate(XFEATURE_MASK_USER_RESTORE); + + /* + * Now all FPU registers have their desired values. Inform the FPU + * state machine that current's FPU registers are in the hardware + * registers. The memory image does not need to be updated because + * any operation relying on it has to save the registers first when + * current's FPU is marked active. + */ + fpregs_mark_activate(); + fpregs_unlock(); } +void fpu_flush_thread(void) +{ + fpstate_reset(&current->thread.fpu); + fpu_reset_fpregs(); +} /* * Load FPU context before returning to userspace. */ @@ -340,7 +752,7 @@ void switch_fpu_return(void) if (!static_cpu_has(X86_FEATURE_FPU)) return; - __fpregs_load_activate(); + fpregs_restore_userregs(); } EXPORT_SYMBOL_GPL(switch_fpu_return); @@ -370,7 +782,6 @@ void fpregs_mark_activate(void) fpu->last_cpu = smp_processor_id(); clear_thread_flag(TIF_NEED_FPU_LOAD); } -EXPORT_SYMBOL_GPL(fpregs_mark_activate); /* * x87 math exception handling: @@ -393,11 +804,11 @@ int fpu__exception_code(struct fpu *fpu, int trap_nr) * fully reproduce the context of the exception.
*/ if (boot_cpu_has(X86_FEATURE_FXSR)) { - cwd = fpu->state.fxsave.cwd; - swd = fpu->state.fxsave.swd; + cwd = fpu->fpstate->regs.fxsave.cwd; + swd = fpu->fpstate->regs.fxsave.swd; } else { - cwd = (unsigned short)fpu->state.fsave.cwd; - swd = (unsigned short)fpu->state.fsave.swd; + cwd = (unsigned short)fpu->fpstate->regs.fsave.cwd; + swd = (unsigned short)fpu->fpstate->regs.fsave.swd; } err = swd & ~cwd; @@ -411,7 +822,7 @@ int fpu__exception_code(struct fpu *fpu, int trap_nr) unsigned short mxcsr = MXCSR_DEFAULT; if (boot_cpu_has(X86_FEATURE_XMM)) - mxcsr = fpu->state.fxsave.mxcsr; + mxcsr = fpu->fpstate->regs.fxsave.mxcsr; err = ~(mxcsr >> 7) & mxcsr; } @@ -440,3 +851,17 @@ int fpu__exception_code(struct fpu *fpu, int trap_nr) */ return 0; } + +/* + * Initialize register state that may prevent from entering low-power idle. + * This function will be invoked from the cpuidle driver only when needed. + */ +void fpu_idle_fpregs(void) +{ + /* Note: AMX_TILE being enabled implies XGETBV1 support */ + if (cpu_feature_enabled(X86_FEATURE_AMX_TILE) && + (xfeatures_in_use() & XFEATURE_MASK_XTILE)) { + tile_release(); + fpregs_deactivate(¤t->thread.fpu); + } +} diff --git a/arch/x86/kernel/fpu/init.c b/arch/x86/kernel/fpu/init.c index 6ce7e0a23268..8946f89761cc 100644 --- a/arch/x86/kernel/fpu/init.c +++ b/arch/x86/kernel/fpu/init.c @@ -2,15 +2,18 @@ /* * x86 FPU boot time init code: */ -#include <asm/fpu/internal.h> +#include <asm/fpu/api.h> #include <asm/tlbflush.h> #include <asm/setup.h> -#include <asm/cmdline.h> #include <linux/sched.h> #include <linux/sched/task.h> #include <linux/init.h> +#include "internal.h" +#include "legacy.h" +#include "xstate.h" + /* * Initialize the registers found in all CPUs, CR0 and CR4: */ @@ -35,7 +38,7 @@ static void fpu__init_cpu_generic(void) /* Flush out any pending x87 state: */ #ifdef CONFIG_MATH_EMULATION if (!boot_cpu_has(X86_FEATURE_FPU)) - fpstate_init_soft(¤t->thread.fpu.state.soft); + 
fpstate_init_soft(&current->thread.fpu.fpstate->regs.soft); else #endif asm volatile ("fninit"); @@ -90,7 +93,7 @@ static void fpu__init_system_early_generic(struct cpuinfo_x86 *c) /* * Boot time FPU feature detection code: */ -unsigned int mxcsr_feature_mask __read_mostly = 0xffffffffu; +unsigned int mxcsr_feature_mask __ro_after_init = 0xffffffffu; EXPORT_SYMBOL_GPL(mxcsr_feature_mask); static void __init fpu__init_system_mxcsr(void) @@ -122,23 +125,14 @@ static void __init fpu__init_system_mxcsr(void) static void __init fpu__init_system_generic(void) { /* - * Set up the legacy init FPU context. (xstate init might overwrite this - * with a more modern format, if the CPU supports it.) + * Set up the legacy init FPU context. Will be updated when the + * CPU supports XSAVE[S]. */ - fpstate_init(&init_fpstate); + fpstate_init_user(&init_fpstate); fpu__init_system_mxcsr(); } -/* - * Size of the FPU context state. All tasks in the system use the - * same context size, regardless of what portion they use. - * This is inherent to the XSAVE architecture which puts all state - * components into a single, continuous memory block: - */ -unsigned int fpu_kernel_xstate_size; -EXPORT_SYMBOL_GPL(fpu_kernel_xstate_size); - /* Get alignment of the TYPE. */ #define TYPE_ALIGN(TYPE) offsetof(struct { char x; TYPE test; }, test) @@ -163,13 +157,13 @@ static void __init fpu__init_task_struct_size(void) * Subtract off the static size of the register state. * It potentially has a bunch of padding. */ - task_size -= sizeof(((struct task_struct *)0)->thread.fpu.state); + task_size -= sizeof(current->thread.fpu.__fpstate.regs); /* * Add back the dynamically-calculated register state * size. 
*/ - task_size += fpu_kernel_xstate_size; + task_size += fpu_kernel_cfg.default_size; /* * We dynamically size 'struct fpu', so we require that @@ -178,7 +172,7 @@ static void __init fpu__init_task_struct_size(void) * you hit a compile error here, check the structure to * see if something got added to the end. */ - CHECK_MEMBER_AT_END_OF(struct fpu, state); + CHECK_MEMBER_AT_END_OF(struct fpu, __fpstate); CHECK_MEMBER_AT_END_OF(struct thread_struct, fpu); CHECK_MEMBER_AT_END_OF(struct task_struct, thread); @@ -193,86 +187,27 @@ static void __init fpu__init_task_struct_size(void) */ static void __init fpu__init_system_xstate_size_legacy(void) { - static int on_boot_cpu __initdata = 1; - - WARN_ON_FPU(!on_boot_cpu); - on_boot_cpu = 0; + unsigned int size; /* - * Note that xstate sizes might be overwritten later during - * fpu__init_system_xstate(). + * Note that the size configuration might be overwritten later + * during fpu__init_system_xstate(). */ - - if (!boot_cpu_has(X86_FEATURE_FPU)) { - fpu_kernel_xstate_size = sizeof(struct swregs_state); + if (!cpu_feature_enabled(X86_FEATURE_FPU)) { + size = sizeof(struct swregs_state); + } else if (cpu_feature_enabled(X86_FEATURE_FXSR)) { + size = sizeof(struct fxregs_state); + fpu_user_cfg.legacy_features = XFEATURE_MASK_FPSSE; } else { - if (boot_cpu_has(X86_FEATURE_FXSR)) - fpu_kernel_xstate_size = - sizeof(struct fxregs_state); - else - fpu_kernel_xstate_size = - sizeof(struct fregs_state); + size = sizeof(struct fregs_state); + fpu_user_cfg.legacy_features = XFEATURE_MASK_FP; } - fpu_user_xstate_size = fpu_kernel_xstate_size; -} - -/* - * Find supported xfeatures based on cpu features and command-line input. - * This must be called after fpu__init_parse_early_param() is called and - * xfeatures_mask is enumerated. - */ -u64 __init fpu__get_supported_xfeatures_mask(void) -{ - return XCNTXT_MASK; -} - -/* Legacy code to initialize eager fpu mode. 
*/ -static void __init fpu__init_system_ctx_switch(void) -{ - static bool on_boot_cpu __initdata = 1; - - WARN_ON_FPU(!on_boot_cpu); - on_boot_cpu = 0; -} - -/* - * We parse fpu parameters early because fpu__init_system() is executed - * before parse_early_param(). - */ -static void __init fpu__init_parse_early_param(void) -{ - char arg[32]; - char *argptr = arg; - int bit; - -#ifdef CONFIG_X86_32 - if (cmdline_find_option_bool(boot_command_line, "no387")) -#ifdef CONFIG_MATH_EMULATION - setup_clear_cpu_cap(X86_FEATURE_FPU); -#else - pr_err("Option 'no387' required CONFIG_MATH_EMULATION enabled.\n"); -#endif - - if (cmdline_find_option_bool(boot_command_line, "nofxsr")) - setup_clear_cpu_cap(X86_FEATURE_FXSR); -#endif - - if (cmdline_find_option_bool(boot_command_line, "noxsave")) - setup_clear_cpu_cap(X86_FEATURE_XSAVE); - - if (cmdline_find_option_bool(boot_command_line, "noxsaveopt")) - setup_clear_cpu_cap(X86_FEATURE_XSAVEOPT); - - if (cmdline_find_option_bool(boot_command_line, "noxsaves")) - setup_clear_cpu_cap(X86_FEATURE_XSAVES); - - if (cmdline_find_option(boot_command_line, "clearcpuid", arg, - sizeof(arg)) && - get_option(&argptr, &bit) && - bit >= 0 && - bit < NCAPINTS * 32) - setup_clear_cpu_cap(bit); + fpu_kernel_cfg.max_size = size; + fpu_kernel_cfg.default_size = size; + fpu_user_cfg.max_size = size; + fpu_user_cfg.default_size = size; + fpstate_reset(¤t->thread.fpu); } /* @@ -281,7 +216,7 @@ static void __init fpu__init_parse_early_param(void) */ void __init fpu__init_system(struct cpuinfo_x86 *c) { - fpu__init_parse_early_param(); + fpstate_reset(¤t->thread.fpu); fpu__init_system_early_generic(c); /* @@ -292,8 +227,6 @@ void __init fpu__init_system(struct cpuinfo_x86 *c) fpu__init_system_generic(); fpu__init_system_xstate_size_legacy(); - fpu__init_system_xstate(); + fpu__init_system_xstate(fpu_kernel_cfg.max_size); fpu__init_task_struct_size(); - - fpu__init_system_ctx_switch(); } diff --git a/arch/x86/kernel/fpu/internal.h 
b/arch/x86/kernel/fpu/internal.h new file mode 100644 index 000000000000..dbdb31f55fc7 --- /dev/null +++ b/arch/x86/kernel/fpu/internal.h @@ -0,0 +1,28 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef __X86_KERNEL_FPU_INTERNAL_H +#define __X86_KERNEL_FPU_INTERNAL_H + +extern struct fpstate init_fpstate; + +/* CPU feature check wrappers */ +static __always_inline __pure bool use_xsave(void) +{ + return cpu_feature_enabled(X86_FEATURE_XSAVE); +} + +static __always_inline __pure bool use_fxsr(void) +{ + return cpu_feature_enabled(X86_FEATURE_FXSR); +} + +#ifdef CONFIG_X86_DEBUG_FPU +# define WARN_ON_FPU(x) WARN_ON_ONCE(x) +#else +# define WARN_ON_FPU(x) ({ (void)(x); 0; }) +#endif + +/* Used in init.c */ +extern void fpstate_init_user(struct fpstate *fpstate); +extern void fpstate_reset(struct fpu *fpu); + +#endif diff --git a/arch/x86/kernel/fpu/legacy.h b/arch/x86/kernel/fpu/legacy.h new file mode 100644 index 000000000000..098f367bb8a7 --- /dev/null +++ b/arch/x86/kernel/fpu/legacy.h @@ -0,0 +1,111 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef __X86_KERNEL_FPU_LEGACY_H +#define __X86_KERNEL_FPU_LEGACY_H + +#include <asm/fpu/types.h> + +extern unsigned int mxcsr_feature_mask; + +static inline void ldmxcsr(u32 mxcsr) +{ + asm volatile("ldmxcsr %0" :: "m" (mxcsr)); +} + +/* + * Returns 0 on success or the trap number when the operation raises an + * exception. + */ +#define user_insn(insn, output, input...) \ +({ \ + int err; \ + \ + might_fault(); \ + \ + asm volatile(ASM_STAC "\n" \ + "1: " #insn "\n" \ + "2: " ASM_CLAC "\n" \ + _ASM_EXTABLE_TYPE(1b, 2b, EX_TYPE_FAULT_MCE_SAFE) \ + : [err] "=a" (err), output \ + : "0"(0), input); \ + err; \ +}) + +#define kernel_insn_err(insn, output, input...) \ +({ \ + int err; \ + asm volatile("1:" #insn "\n\t" \ + "2:\n" \ + _ASM_EXTABLE_TYPE_REG(1b, 2b, EX_TYPE_EFAULT_REG, %[err]) \ + : [err] "=r" (err), output \ + : "0"(0), input); \ + err; \ +}) + +#define kernel_insn(insn, output, input...) 
\ + asm volatile("1:" #insn "\n\t" \ + "2:\n" \ + _ASM_EXTABLE_TYPE(1b, 2b, EX_TYPE_FPU_RESTORE) \ + : output : input) + +static inline int fnsave_to_user_sigframe(struct fregs_state __user *fx) +{ + return user_insn(fnsave %[fx]; fwait, [fx] "=m" (*fx), "m" (*fx)); +} + +static inline int fxsave_to_user_sigframe(struct fxregs_state __user *fx) +{ + if (IS_ENABLED(CONFIG_X86_32)) + return user_insn(fxsave %[fx], [fx] "=m" (*fx), "m" (*fx)); + else + return user_insn(fxsaveq %[fx], [fx] "=m" (*fx), "m" (*fx)); + +} + +static inline void fxrstor(struct fxregs_state *fx) +{ + if (IS_ENABLED(CONFIG_X86_32)) + kernel_insn(fxrstor %[fx], "=m" (*fx), [fx] "m" (*fx)); + else + kernel_insn(fxrstorq %[fx], "=m" (*fx), [fx] "m" (*fx)); +} + +static inline int fxrstor_safe(struct fxregs_state *fx) +{ + if (IS_ENABLED(CONFIG_X86_32)) + return kernel_insn_err(fxrstor %[fx], "=m" (*fx), [fx] "m" (*fx)); + else + return kernel_insn_err(fxrstorq %[fx], "=m" (*fx), [fx] "m" (*fx)); +} + +static inline int fxrstor_from_user_sigframe(struct fxregs_state __user *fx) +{ + if (IS_ENABLED(CONFIG_X86_32)) + return user_insn(fxrstor %[fx], "=m" (*fx), [fx] "m" (*fx)); + else + return user_insn(fxrstorq %[fx], "=m" (*fx), [fx] "m" (*fx)); +} + +static inline void frstor(struct fregs_state *fx) +{ + kernel_insn(frstor %[fx], "=m" (*fx), [fx] "m" (*fx)); +} + +static inline int frstor_safe(struct fregs_state *fx) +{ + return kernel_insn_err(frstor %[fx], "=m" (*fx), [fx] "m" (*fx)); +} + +static inline int frstor_from_user_sigframe(struct fregs_state __user *fx) +{ + return user_insn(frstor %[fx], "=m" (*fx), [fx] "m" (*fx)); +} + +static inline void fxsave(struct fxregs_state *fx) +{ + if (IS_ENABLED(CONFIG_X86_32)) + asm volatile( "fxsave %[fx]" : [fx] "=m" (*fx)); + else + asm volatile("fxsaveq %[fx]" : [fx] "=m" (*fx)); +} + +#endif diff --git a/arch/x86/kernel/fpu/regset.c b/arch/x86/kernel/fpu/regset.c index d652b939ccfb..75ffaef8c299 100644 --- a/arch/x86/kernel/fpu/regset.c +++ 
b/arch/x86/kernel/fpu/regset.c @@ -2,11 +2,17 @@ /* * FPU register's regset abstraction, for ptrace, core dumps, etc. */ -#include <asm/fpu/internal.h> +#include <linux/sched/task_stack.h> +#include <linux/vmalloc.h> + +#include <asm/fpu/api.h> #include <asm/fpu/signal.h> #include <asm/fpu/regset.h> -#include <asm/fpu/xstate.h> -#include <linux/sched/task_stack.h> + +#include "context.h" +#include "internal.h" +#include "legacy.h" +#include "xstate.h" /* * The xstateregs_active() routine is the same as the regset_fpregs_active() routine, @@ -26,20 +32,58 @@ int regset_xregset_fpregs_active(struct task_struct *target, const struct user_r return 0; } +/* + * The regset get() functions are invoked from: + * + * - coredump to dump the current task's fpstate. If the current task + * owns the FPU then the memory state has to be synchronized and the + * FPU register state preserved. Otherwise fpstate is already in sync. + * + * - ptrace to dump fpstate of a stopped task, in which case the registers + * have already been saved to fpstate on context switch. + */ +static void sync_fpstate(struct fpu *fpu) +{ + if (fpu == &current->thread.fpu) + fpu_sync_fpstate(fpu); +} + +/* + * Invalidate cached FPU registers before modifying the stopped target + * task's fpstate. + * + * This forces the target task on resume to restore the FPU registers from + * modified fpstate. Otherwise the task might skip the restore and operate + * with the cached FPU registers which discards the modifications. 
+ */ +static void fpu_force_restore(struct fpu *fpu) +{ + /* + * Only stopped child tasks can be used to modify the FPU + * state in the fpstate buffer: + */ + WARN_ON_FPU(fpu == &current->thread.fpu); + + __fpu_invalidate_fpregs_state(fpu); +} + int xfpregs_get(struct task_struct *target, const struct user_regset *regset, - unsigned int pos, unsigned int count, - void *kbuf, void __user *ubuf) + struct membuf to) { struct fpu *fpu = &target->thread.fpu; - if (!boot_cpu_has(X86_FEATURE_FXSR)) + if (!cpu_feature_enabled(X86_FEATURE_FXSR)) return -ENODEV; - fpu__prepare_read(fpu); - fpstate_sanitize_xstate(fpu); + sync_fpstate(fpu); + + if (!use_xsave()) { + return membuf_write(&to, &fpu->fpstate->regs.fxsave, + sizeof(fpu->fpstate->regs.fxsave)); + } - return user_regset_copyout(&pos, &count, &kbuf, &ubuf, - &fpu->state.fxsave, 0, -1); + copy_xstate_to_uabi_buf(to, target, XSTATE_COPY_FX); + return 0; } int xfpregs_set(struct task_struct *target, const struct user_regset *regset, @@ -47,67 +91,51 @@ int xfpregs_set(struct task_struct *target, const struct user_regset *regset, const void *kbuf, const void __user *ubuf) { struct fpu *fpu = &target->thread.fpu; + struct fxregs_state newstate; int ret; - if (!boot_cpu_has(X86_FEATURE_FXSR)) + if (!cpu_feature_enabled(X86_FEATURE_FXSR)) return -ENODEV; - fpu__prepare_write(fpu); - fpstate_sanitize_xstate(fpu); + /* No funny business with partial or oversized writes is permitted. */ + if (pos != 0 || count != sizeof(newstate)) + return -EINVAL; - ret = user_regset_copyin(&pos, &count, &kbuf, &ubuf, - &fpu->state.fxsave, 0, -1); + ret = user_regset_copyin(&pos, &count, &kbuf, &ubuf, &newstate, 0, -1); + if (ret) + return ret; - /* - * mxcsr reserved bits must be masked to zero for security reasons. - */ - fpu->state.fxsave.mxcsr &= mxcsr_feature_mask; + /* Do not allow an invalid MXCSR value. 
*/ + if (newstate.mxcsr & ~mxcsr_feature_mask) + return -EINVAL; - /* - * update the header bits in the xsave header, indicating the - * presence of FP and SSE state. - */ - if (boot_cpu_has(X86_FEATURE_XSAVE)) - fpu->state.xsave.header.xfeatures |= XFEATURE_MASK_FPSSE; + fpu_force_restore(fpu); - return ret; + /* Copy the state */ + memcpy(&fpu->fpstate->regs.fxsave, &newstate, sizeof(newstate)); + + /* Clear xmm8..15 for 32-bit callers */ + BUILD_BUG_ON(sizeof(fpu->__fpstate.regs.fxsave.xmm_space) != 16 * 16); + if (in_ia32_syscall()) + memset(&fpu->fpstate->regs.fxsave.xmm_space[8*4], 0, 8 * 16); + + /* Mark FP and SSE as in use when XSAVE is enabled */ + if (use_xsave()) + fpu->fpstate->regs.xsave.header.xfeatures |= XFEATURE_MASK_FPSSE; + + return 0; } int xstateregs_get(struct task_struct *target, const struct user_regset *regset, - unsigned int pos, unsigned int count, - void *kbuf, void __user *ubuf) + struct membuf to) { - struct fpu *fpu = &target->thread.fpu; - struct xregs_state *xsave; - int ret; - - if (!boot_cpu_has(X86_FEATURE_XSAVE)) + if (!cpu_feature_enabled(X86_FEATURE_XSAVE)) return -ENODEV; - xsave = &fpu->state.xsave; - - fpu__prepare_read(fpu); + sync_fpstate(&target->thread.fpu); - if (using_compacted_format()) { - if (kbuf) - ret = copy_xstate_to_kernel(kbuf, xsave, pos, count); - else - ret = copy_xstate_to_user(ubuf, xsave, pos, count); - } else { - fpstate_sanitize_xstate(fpu); - /* - * Copy the 48 bytes defined by the software into the xsave - * area in the thread struct, so that we can copy the whole - * area to user using one user_regset_copyout(). - */ - memcpy(&xsave->i387.sw_reserved, xstate_fx_sw_bytes, sizeof(xstate_fx_sw_bytes)); - - /* - * Copy the xstate memory layout. 
- */ - ret = user_regset_copyout(&pos, &count, &kbuf, &ubuf, xsave, 0, -1); - } - return ret; + copy_xstate_to_uabi_buf(to, target, XSTATE_COPY_XSAVE); + return 0; } int xstateregs_set(struct task_struct *target, const struct user_regset *regset, @@ -115,44 +143,34 @@ int xstateregs_set(struct task_struct *target, const struct user_regset *regset, const void *kbuf, const void __user *ubuf) { struct fpu *fpu = &target->thread.fpu; - struct xregs_state *xsave; + struct xregs_state *tmpbuf = NULL; int ret; - if (!boot_cpu_has(X86_FEATURE_XSAVE)) + if (!cpu_feature_enabled(X86_FEATURE_XSAVE)) return -ENODEV; /* * A whole standard-format XSAVE buffer is needed: */ - if ((pos != 0) || (count < fpu_user_xstate_size)) + if (pos != 0 || count != fpu_user_cfg.max_size) return -EFAULT; - xsave = &fpu->state.xsave; + if (!kbuf) { + tmpbuf = vmalloc(count); + if (!tmpbuf) + return -ENOMEM; - fpu__prepare_write(fpu); - - if (using_compacted_format()) { - if (kbuf) - ret = copy_kernel_to_xstate(xsave, kbuf); - else - ret = copy_user_to_xstate(xsave, ubuf); - } else { - ret = user_regset_copyin(&pos, &count, &kbuf, &ubuf, xsave, 0, -1); - if (!ret) - ret = validate_xstate_header(&xsave->header); + if (copy_from_user(tmpbuf, ubuf, count)) { + ret = -EFAULT; + goto out; + } } - /* - * mxcsr reserved bits must be masked to zero for security reasons. - */ - xsave->i387.mxcsr &= mxcsr_feature_mask; - - /* - * In case of failure, mark all states as init: - */ - if (ret) - fpstate_init(&fpu->state); + fpu_force_restore(fpu); + ret = copy_uabi_from_kernel_to_xstate(fpu->fpstate, kbuf ?: tmpbuf); +out: + vfree(tmpbuf); return ret; } @@ -228,10 +246,10 @@ static inline u32 twd_fxsr_to_i387(struct fxregs_state *fxsave) * FXSR floating point environment conversions. 
*/ -void -convert_from_fxsr(struct user_i387_ia32_struct *env, struct task_struct *tsk) +static void __convert_from_fxsr(struct user_i387_ia32_struct *env, + struct task_struct *tsk, + struct fxregs_state *fxsave) { - struct fxregs_state *fxsave = &tsk->thread.fpu.state.fxsave; struct _fpreg *to = (struct _fpreg *) &env->st_space[0]; struct _fpxreg *from = (struct _fpxreg *) &fxsave->st_space[0]; int i; @@ -265,6 +283,12 @@ convert_from_fxsr(struct user_i387_ia32_struct *env, struct task_struct *tsk) memcpy(&to[i], &from[i], sizeof(to[0])); } +void +convert_from_fxsr(struct user_i387_ia32_struct *env, struct task_struct *tsk) +{ + __convert_from_fxsr(env, tsk, &tsk->thread.fpu.fpstate->regs.fxsave); +} + void convert_to_fxsr(struct fxregs_state *fxsave, const struct user_i387_ia32_struct *env) @@ -293,32 +317,34 @@ void convert_to_fxsr(struct fxregs_state *fxsave, } int fpregs_get(struct task_struct *target, const struct user_regset *regset, - unsigned int pos, unsigned int count, - void *kbuf, void __user *ubuf) + struct membuf to) { struct fpu *fpu = &target->thread.fpu; struct user_i387_ia32_struct env; + struct fxregs_state fxsave, *fx; - fpu__prepare_read(fpu); + sync_fpstate(fpu); - if (!boot_cpu_has(X86_FEATURE_FPU)) - return fpregs_soft_get(target, regset, pos, count, kbuf, ubuf); + if (!cpu_feature_enabled(X86_FEATURE_FPU)) + return fpregs_soft_get(target, regset, to); - if (!boot_cpu_has(X86_FEATURE_FXSR)) - return user_regset_copyout(&pos, &count, &kbuf, &ubuf, - &fpu->state.fsave, 0, - -1); + if (!cpu_feature_enabled(X86_FEATURE_FXSR)) { + return membuf_write(&to, &fpu->fpstate->regs.fsave, + sizeof(struct fregs_state)); + } - fpstate_sanitize_xstate(fpu); + if (use_xsave()) { + struct membuf mb = { .p = &fxsave, .left = sizeof(fxsave) }; - if (kbuf && pos == 0 && count == sizeof(env)) { - convert_from_fxsr(kbuf, target); - return 0; + /* Handle init state optimized xstate correctly */ + copy_xstate_to_uabi_buf(mb, target, XSTATE_COPY_FP); + fx = 
&fxsave; + } else { + fx = &fpu->fpstate->regs.fxsave; } - convert_from_fxsr(&env, target); - - return user_regset_copyout(&pos, &count, &kbuf, &ubuf, &env, 0, -1); + __convert_from_fxsr(&env, target, fx); + return membuf_write(&to, &env, sizeof(env)); } int fpregs_set(struct task_struct *target, const struct user_regset *regset, @@ -329,47 +355,32 @@ int fpregs_set(struct task_struct *target, const struct user_regset *regset, struct user_i387_ia32_struct env; int ret; - fpu__prepare_write(fpu); - fpstate_sanitize_xstate(fpu); + /* No funny business with partial or oversized writes is permitted. */ + if (pos != 0 || count != sizeof(struct user_i387_ia32_struct)) + return -EINVAL; - if (!boot_cpu_has(X86_FEATURE_FPU)) + if (!cpu_feature_enabled(X86_FEATURE_FPU)) return fpregs_soft_set(target, regset, pos, count, kbuf, ubuf); - if (!boot_cpu_has(X86_FEATURE_FXSR)) - return user_regset_copyin(&pos, &count, &kbuf, &ubuf, - &fpu->state.fsave, 0, - -1); + ret = user_regset_copyin(&pos, &count, &kbuf, &ubuf, &env, 0, -1); + if (ret) + return ret; - if (pos > 0 || count < sizeof(env)) - convert_from_fxsr(&env, target); + fpu_force_restore(fpu); - ret = user_regset_copyin(&pos, &count, &kbuf, &ubuf, &env, 0, -1); - if (!ret) - convert_to_fxsr(&target->thread.fpu.state.fxsave, &env); + if (cpu_feature_enabled(X86_FEATURE_FXSR)) + convert_to_fxsr(&fpu->fpstate->regs.fxsave, &env); + else + memcpy(&fpu->fpstate->regs.fsave, &env, sizeof(env)); /* - * update the header bit in the xsave header, indicating the + * Update the header bit in the xsave header, indicating the * presence of FP. */ - if (boot_cpu_has(X86_FEATURE_XSAVE)) - fpu->state.xsave.header.xfeatures |= XFEATURE_MASK_FP; - return ret; -} - -/* - * FPU state for core dumps. - * This is only used for a.out dumps now. - * It is declared generically using elf_fpregset_t (which is - * struct user_i387_struct) but is in fact only used for 32-bit - * dumps, so on 64-bit it is really struct user_i387_ia32_struct. 
- */ -int dump_fpu(struct pt_regs *regs, struct user_i387_struct *ufpu) -{ - struct task_struct *tsk = current; + if (cpu_feature_enabled(X86_FEATURE_XSAVE)) + fpu->fpstate->regs.xsave.header.xfeatures |= XFEATURE_MASK_FP; - return !fpregs_get(tsk, NULL, 0, sizeof(struct user_i387_ia32_struct), - ufpu, NULL); + return 0; } -EXPORT_SYMBOL(dump_fpu); #endif /* CONFIG_X86_32 || CONFIG_IA32_EMULATION */ diff --git a/arch/x86/kernel/fpu/signal.c b/arch/x86/kernel/fpu/signal.c index 400a05e1c1c5..91d4b6de58ab 100644 --- a/arch/x86/kernel/fpu/signal.c +++ b/arch/x86/kernel/fpu/signal.c @@ -7,37 +7,40 @@ #include <linux/cpu.h> #include <linux/pagemap.h> -#include <asm/fpu/internal.h> #include <asm/fpu/signal.h> #include <asm/fpu/regset.h> #include <asm/fpu/xstate.h> #include <asm/sigframe.h> +#include <asm/trapnr.h> #include <asm/trace/fpu.h> -static struct _fpx_sw_bytes fx_sw_reserved, fx_sw_reserved_ia32; +#include "context.h" +#include "internal.h" +#include "legacy.h" +#include "xstate.h" /* * Check for the presence of extended state information in the * user fpstate pointer in the sigcontext. */ -static inline int check_for_xstate(struct fxregs_state __user *buf, - void __user *fpstate, - struct _fpx_sw_bytes *fx_sw) +static inline bool check_xstate_in_sigframe(struct fxregs_state __user *fxbuf, + struct _fpx_sw_bytes *fx_sw) { int min_xstate_size = sizeof(struct fxregs_state) + sizeof(struct xstate_header); + void __user *fpstate = fxbuf; unsigned int magic2; - if (__copy_from_user(fx_sw, &buf->sw_reserved[0], sizeof(*fx_sw))) - return -1; + if (__copy_from_user(fx_sw, &fxbuf->sw_reserved[0], sizeof(*fx_sw))) + return false; /* Check for the first magic field and other error scenarios. 
*/ if (fx_sw->magic1 != FP_XSTATE_MAGIC1 || fx_sw->xstate_size < min_xstate_size || - fx_sw->xstate_size > fpu_user_xstate_size || + fx_sw->xstate_size > current->thread.fpu.fpstate->user_size || fx_sw->xstate_size > fx_sw->extended_size) - return -1; + goto setfx; /* * Check for the presence of second magic word at the end of memory @@ -45,26 +48,34 @@ static inline int check_for_xstate(struct fxregs_state __user *buf, * fpstate layout with out copying the extended state information * in the memory layout. */ - if (__get_user(magic2, (__u32 __user *)(fpstate + fx_sw->xstate_size)) - || magic2 != FP_XSTATE_MAGIC2) - return -1; - - return 0; + if (__get_user(magic2, (__u32 __user *)(fpstate + fx_sw->xstate_size))) + return false; + + if (likely(magic2 == FP_XSTATE_MAGIC2)) + return true; +setfx: + trace_x86_fpu_xstate_check_failed(&current->thread.fpu); + + /* Set the parameters for fx only state */ + fx_sw->magic1 = 0; + fx_sw->xstate_size = sizeof(struct fxregs_state); + fx_sw->xfeatures = XFEATURE_MASK_FPSSE; + return true; } /* * Signal frame handlers. 
*/ -static inline int save_fsave_header(struct task_struct *tsk, void __user *buf) +static inline bool save_fsave_header(struct task_struct *tsk, void __user *buf) { if (use_fxsr()) { - struct xregs_state *xsave = &tsk->thread.fpu.state.xsave; + struct xregs_state *xsave = &tsk->thread.fpu.fpstate->regs.xsave; struct user_i387_ia32_struct env; struct _fpstate_32 __user *fp = buf; fpregs_lock(); if (!test_thread_flag(TIF_NEED_FPU_LOAD)) - copy_fxregs_to_kernel(&tsk->thread.fpu); + fxsave(&tsk->thread.fpu.fpstate->regs.fxsave); fpregs_unlock(); convert_from_fxsr(&env, tsk); @@ -72,33 +83,54 @@ static inline int save_fsave_header(struct task_struct *tsk, void __user *buf) if (__copy_to_user(buf, &env, sizeof(env)) || __put_user(xsave->i387.swd, &fp->status) || __put_user(X86_FXSR_MAGIC, &fp->magic)) - return -1; + return false; } else { struct fregs_state __user *fp = buf; u32 swd; + if (__get_user(swd, &fp->swd) || __put_user(swd, &fp->status)) - return -1; + return false; } - return 0; + return true; +} + +/* + * Prepare the SW reserved portion of the fxsave memory layout, indicating + * the presence of the extended state information in the memory layout + * pointed to by the fpstate pointer in the sigcontext. + * This is saved when ever the FP and extended state context is + * saved on the user stack during the signal handler delivery to the user. 
+ */ +static inline void save_sw_bytes(struct _fpx_sw_bytes *sw_bytes, bool ia32_frame, + struct fpstate *fpstate) +{ + sw_bytes->magic1 = FP_XSTATE_MAGIC1; + sw_bytes->extended_size = fpstate->user_size + FP_XSTATE_MAGIC2_SIZE; + sw_bytes->xfeatures = fpstate->user_xfeatures; + sw_bytes->xstate_size = fpstate->user_size; + + if (ia32_frame) + sw_bytes->extended_size += sizeof(struct fregs_state); } -static inline int save_xstate_epilog(void __user *buf, int ia32_frame) +static inline bool save_xstate_epilog(void __user *buf, int ia32_frame, + struct fpstate *fpstate) { struct xregs_state __user *x = buf; - struct _fpx_sw_bytes *sw_bytes; + struct _fpx_sw_bytes sw_bytes = {}; u32 xfeatures; int err; /* Setup the bytes not touched by the [f]xsave and reserved for SW. */ - sw_bytes = ia32_frame ? &fx_sw_reserved_ia32 : &fx_sw_reserved; - err = __copy_to_user(&x->i387.sw_reserved, sw_bytes, sizeof(*sw_bytes)); + save_sw_bytes(&sw_bytes, ia32_frame, fpstate); + err = __copy_to_user(&x->i387.sw_reserved, &sw_bytes, sizeof(sw_bytes)); if (!use_xsave()) - return err; + return !err; err |= __put_user(FP_XSTATE_MAGIC2, - (__u32 __user *)(buf + fpu_user_xstate_size)); + (__u32 __user *)(buf + fpstate->user_size)); /* * Read the xfeatures which we copied (directly from the cpu or @@ -121,23 +153,17 @@ static inline int save_xstate_epilog(void __user *buf, int ia32_frame) err |= __put_user(xfeatures, (__u32 __user *)&x->header.xfeatures); - return err; + return !err; } static inline int copy_fpregs_to_sigframe(struct xregs_state __user *buf) { - int err; - if (use_xsave()) - err = copy_xregs_to_user(buf); - else if (use_fxsr()) - err = copy_fxregs_to_user((struct fxregs_state __user *) buf); + return xsave_to_user_sigframe(buf); + if (use_fxsr()) + return fxsave_to_user_sigframe((struct fxregs_state __user *) buf); else - err = copy_fregs_to_user((struct fregs_state __user *) buf); - - if (unlikely(err) && __clear_user(buf, fpu_user_xstate_size)) - err = -EFAULT; - return err; 
+ return fnsave_to_user_sigframe((struct fregs_state __user *) buf); } /* @@ -150,10 +176,8 @@ static inline int copy_fpregs_to_sigframe(struct xregs_state __user *buf) * buf == buf_fx for 64-bit frames and 32-bit fsave frame. * buf != buf_fx for 32-bit frames with fxstate. * - * Try to save it directly to the user frame with disabled page fault handler. - * If this fails then do the slow path where the FPU state is first saved to - * task's fpu->state and then copy it to the user frame pointed to by the - * aligned pointer 'buf_fx'. + * Save it directly to the user frame with disabled page fault handler. If + * that faults, try to clear the frame which handles the page fault. * * If this is a 32-bit frame with fxstate, put a fsave header before * the aligned state at 'buf_fx'. @@ -161,23 +185,37 @@ static inline int copy_fpregs_to_sigframe(struct xregs_state __user *buf) * For [f]xsave state, update the SW reserved fields in the [f]xsave frame * indicating the absence/presence of the extended state to the user. */ -int copy_fpstate_to_sigframe(void __user *buf, void __user *buf_fx, int size) +bool copy_fpstate_to_sigframe(void __user *buf, void __user *buf_fx, int size) { struct task_struct *tsk = current; - int ia32_fxstate = (buf != buf_fx); + struct fpstate *fpstate = tsk->thread.fpu.fpstate; + bool ia32_fxstate = (buf != buf_fx); int ret; ia32_fxstate &= (IS_ENABLED(CONFIG_X86_32) || IS_ENABLED(CONFIG_IA32_EMULATION)); + if (!static_cpu_has(X86_FEATURE_FPU)) { + struct user_i387_ia32_struct fp; + + fpregs_soft_get(current, NULL, (struct membuf){.p = &fp, + .left = sizeof(fp)}); + return !copy_to_user(buf, &fp, sizeof(fp)); + } + if (!access_ok(buf, size)) - return -EACCES; + return false; - if (!static_cpu_has(X86_FEATURE_FPU)) - return fpregs_soft_get(current, NULL, 0, - sizeof(struct user_i387_ia32_struct), NULL, - (struct _fpstate_32 __user *) buf) ? 
-1 : 1; + if (use_xsave()) { + struct xregs_state __user *xbuf = buf_fx; + /* + * Clear the xsave header first, so that reserved fields are + * initialized to zero. + */ + if (__clear_user(&xbuf->header, sizeof(xbuf->header))) + return false; + } retry: /* * Load the FPU registers if they are not valid for the current task. @@ -187,7 +225,7 @@ retry: */ fpregs_lock(); if (test_thread_flag(TIF_NEED_FPU_LOAD)) - __fpregs_load_activate(); + fpregs_restore_userregs(); pagefault_disable(); ret = copy_fpregs_to_sigframe(buf_fx); @@ -195,252 +233,280 @@ retry: fpregs_unlock(); if (ret) { - if (!fault_in_pages_writeable(buf_fx, fpu_user_xstate_size)) + if (!__clear_user(buf_fx, fpstate->user_size)) goto retry; - return -EFAULT; + return false; } /* Save the fsave header for the 32-bit frames. */ - if ((ia32_fxstate || !use_fxsr()) && save_fsave_header(tsk, buf)) - return -1; + if ((ia32_fxstate || !use_fxsr()) && !save_fsave_header(tsk, buf)) + return false; - if (use_fxsr() && save_xstate_epilog(buf_fx, ia32_fxstate)) - return -1; + if (use_fxsr() && !save_xstate_epilog(buf_fx, ia32_fxstate, fpstate)) + return false; - return 0; + return true; } -static inline void -sanitize_restored_xstate(union fpregs_state *state, - struct user_i387_ia32_struct *ia32_env, - u64 xfeatures, int fx_only) +static int __restore_fpregs_from_user(void __user *buf, u64 ufeatures, + u64 xrestore, bool fx_only) { - struct xregs_state *xsave = &state->xsave; - struct xstate_header *header = &xsave->header; - if (use_xsave()) { - /* - * Note: we don't need to zero the reserved bits in the - * xstate_header here because we either didn't copy them at all, - * or we checked earlier that they aren't set. - */ + u64 init_bv = ufeatures & ~xrestore; + int ret; - /* - * Init the state that is not present in the memory - * layout and not enabled by the OS. 
- */ - if (fx_only) - header->xfeatures = XFEATURE_MASK_FPSSE; + if (likely(!fx_only)) + ret = xrstor_from_user_sigframe(buf, xrestore); else - header->xfeatures &= xfeatures; - } - - if (use_fxsr()) { - /* - * mscsr reserved bits must be masked to zero for security - * reasons. - */ - xsave->i387.mxcsr &= mxcsr_feature_mask; + ret = fxrstor_from_user_sigframe(buf); - if (ia32_env) - convert_to_fxsr(&state->fxsave, ia32_env); + if (!ret && unlikely(init_bv)) + os_xrstor(&init_fpstate, init_bv); + return ret; + } else if (use_fxsr()) { + return fxrstor_from_user_sigframe(buf); + } else { + return frstor_from_user_sigframe(buf); } } /* - * Restore the extended state if present. Otherwise, restore the FP/SSE state. + * Attempt to restore the FPU registers directly from user memory. + * Pagefaults are handled and any errors returned are fatal. */ -static int copy_user_to_fpregs_zeroing(void __user *buf, u64 xbv, int fx_only) +static bool restore_fpregs_from_user(void __user *buf, u64 xrestore, + bool fx_only, unsigned int size) { - if (use_xsave()) { - if (fx_only) { - u64 init_bv = xfeatures_mask & ~XFEATURE_MASK_FPSSE; - copy_kernel_to_xregs(&init_fpstate.xsave, init_bv); - return copy_user_to_fxregs(buf); - } else { - u64 init_bv = xfeatures_mask & ~xbv; - if (unlikely(init_bv)) - copy_kernel_to_xregs(&init_fpstate.xsave, init_bv); - return copy_user_to_xregs(buf, xbv); - } - } else if (use_fxsr()) { - return copy_user_to_fxregs(buf); - } else - return copy_user_to_fregs(buf); + struct fpu *fpu = &current->thread.fpu; + int ret; + +retry: + fpregs_lock(); + /* Ensure that XFD is up to date */ + xfd_update_state(fpu->fpstate); + pagefault_disable(); + ret = __restore_fpregs_from_user(buf, fpu->fpstate->user_xfeatures, + xrestore, fx_only); + pagefault_enable(); + + if (unlikely(ret)) { + /* + * The above did an FPU restore operation, restricted to + * the user portion of the registers, and failed, but the + * microcode might have modified the FPU registers + * 
nevertheless. + * + * If the FPU registers do not belong to current, then + * invalidate the FPU register state otherwise the task + * might preempt current and return to user space with + * corrupted FPU registers. + */ + if (test_thread_flag(TIF_NEED_FPU_LOAD)) + __cpu_invalidate_fpregs_state(); + fpregs_unlock(); + + /* Try to handle #PF, but anything else is fatal. */ + if (ret != X86_TRAP_PF) + return false; + + if (!fault_in_readable(buf, size)) + goto retry; + return false; + } + + /* + * Restore supervisor states: previous context switch etc has done + * XSAVES and saved the supervisor states in the kernel buffer from + * which they can be restored now. + * + * It would be optimal to handle this with a single XRSTORS, but + * this does not work because the rest of the FPU registers have + * been restored from a user buffer directly. + */ + if (test_thread_flag(TIF_NEED_FPU_LOAD) && xfeatures_mask_supervisor()) + os_xrstor_supervisor(fpu->fpstate); + + fpregs_mark_activate(); + fpregs_unlock(); + return true; } -static int __fpu__restore_sig(void __user *buf, void __user *buf_fx, int size) +static bool __fpu_restore_sig(void __user *buf, void __user *buf_fx, + bool ia32_fxstate) { - struct user_i387_ia32_struct *envp = NULL; - int state_size = fpu_kernel_xstate_size; - int ia32_fxstate = (buf != buf_fx); struct task_struct *tsk = current; struct fpu *fpu = &tsk->thread.fpu; struct user_i387_ia32_struct env; - u64 xfeatures = 0; - int fx_only = 0; - int ret = 0; - - ia32_fxstate &= (IS_ENABLED(CONFIG_X86_32) || - IS_ENABLED(CONFIG_IA32_EMULATION)); + bool success, fx_only = false; + union fpregs_state *fpregs; + unsigned int state_size; + u64 user_xfeatures = 0; - if (!buf) { - fpu__clear(fpu); - return 0; - } + if (use_xsave()) { + struct _fpx_sw_bytes fx_sw_user; - if (!access_ok(buf, size)) - return -EACCES; + if (!check_xstate_in_sigframe(buf_fx, &fx_sw_user)) + return false; - if (!static_cpu_has(X86_FEATURE_FPU)) - return fpregs_soft_set(current, NULL, 
- 0, sizeof(struct user_i387_ia32_struct), - NULL, buf) != 0; + fx_only = !fx_sw_user.magic1; + state_size = fx_sw_user.xstate_size; + user_xfeatures = fx_sw_user.xfeatures; + } else { + user_xfeatures = XFEATURE_MASK_FPSSE; + state_size = fpu->fpstate->user_size; + } - if (use_xsave()) { - struct _fpx_sw_bytes fx_sw_user; - if (unlikely(check_for_xstate(buf_fx, buf_fx, &fx_sw_user))) { - /* - * Couldn't find the extended state information in the - * memory layout. Restore just the FP/SSE and init all - * the other extended state. - */ - state_size = sizeof(struct fxregs_state); - fx_only = 1; - trace_x86_fpu_xstate_check_failed(fpu); - } else { - state_size = fx_sw_user.xstate_size; - xfeatures = fx_sw_user.xfeatures; - } + if (likely(!ia32_fxstate)) { + /* Restore the FPU registers directly from user memory. */ + return restore_fpregs_from_user(buf_fx, user_xfeatures, fx_only, + state_size); } /* - * The current state of the FPU registers does not matter. By setting - * TIF_NEED_FPU_LOAD unconditionally it is ensured that the our xstate - * is not modified on context switch and that the xstate is considered - * to be loaded again on return to userland (overriding last_cpu avoids - * the optimisation). + * Copy the legacy state because the FP portion of the FX frame has + * to be ignored for histerical raisins. The legacy state is folded + * in once the larger state has been copied. */ - set_thread_flag(TIF_NEED_FPU_LOAD); - __fpu_invalidate_fpregs_state(fpu); + if (__copy_from_user(&env, buf, sizeof(env))) + return false; - if ((unsigned long)buf_fx % 64) - fx_only = 1; /* - * For 32-bit frames with fxstate, copy the fxstate so it can be - * reconstructed later. + * By setting TIF_NEED_FPU_LOAD it is ensured that our xstate is + * not modified on context switch and that the xstate is considered + * to be loaded again on return to userland (overriding last_cpu avoids + * the optimisation). 
*/ - if (ia32_fxstate) { - ret = __copy_from_user(&env, buf, sizeof(env)); - if (ret) - goto err_out; - envp = &env; - } else { + fpregs_lock(); + if (!test_thread_flag(TIF_NEED_FPU_LOAD)) { /* - * Attempt to restore the FPU registers directly from user - * memory. For that to succeed, the user access cannot cause - * page faults. If it does, fall back to the slow path below, - * going through the kernel buffer with the enabled pagefault - * handler. + * If supervisor states are available then save the + * hardware state in current's fpstate so that the + * supervisor state is preserved. Save the full state for + * simplicity. There is no point in optimizing this by only + * saving the supervisor states and then shuffle them to + * the right place in memory. It's ia32 mode. Shrug. */ - fpregs_lock(); - pagefault_disable(); - ret = copy_user_to_fpregs_zeroing(buf_fx, xfeatures, fx_only); - pagefault_enable(); - if (!ret) { - fpregs_mark_activate(); - fpregs_unlock(); - return 0; - } - fpregs_deactivate(fpu); - fpregs_unlock(); + if (xfeatures_mask_supervisor()) + os_xsave(fpu->fpstate); + set_thread_flag(TIF_NEED_FPU_LOAD); } + __fpu_invalidate_fpregs_state(fpu); + __cpu_invalidate_fpregs_state(); + fpregs_unlock(); - + fpregs = &fpu->fpstate->regs; if (use_xsave() && !fx_only) { - u64 init_bv = xfeatures_mask & ~xfeatures; - - if (using_compacted_format()) { - ret = copy_user_to_xstate(&fpu->state.xsave, buf_fx); + if (copy_sigframe_from_user_to_xstate(fpu->fpstate, buf_fx)) + return false; + } else { + if (__copy_from_user(&fpregs->fxsave, buf_fx, + sizeof(fpregs->fxsave))) + return false; + + if (IS_ENABLED(CONFIG_X86_64)) { + /* Reject invalid MXCSR values. 
*/ + if (fpregs->fxsave.mxcsr & ~mxcsr_feature_mask) + return false; } else { - ret = __copy_from_user(&fpu->state.xsave, buf_fx, state_size); - - if (!ret && state_size > offsetof(struct xregs_state, header)) - ret = validate_xstate_header(&fpu->state.xsave.header); + /* Mask invalid bits out for historical reasons (broken hardware). */ + fpregs->fxsave.mxcsr &= mxcsr_feature_mask; } - if (ret) - goto err_out; - - sanitize_restored_xstate(&fpu->state, envp, xfeatures, fx_only); - - fpregs_lock(); - if (unlikely(init_bv)) - copy_kernel_to_xregs(&init_fpstate.xsave, init_bv); - ret = copy_kernel_to_xregs_err(&fpu->state.xsave, xfeatures); - } else if (use_fxsr()) { - ret = __copy_from_user(&fpu->state.fxsave, buf_fx, state_size); - if (ret) { - ret = -EFAULT; - goto err_out; - } + /* Enforce XFEATURE_MASK_FPSSE when XSAVE is enabled */ + if (use_xsave()) + fpregs->xsave.header.xfeatures |= XFEATURE_MASK_FPSSE; + } - sanitize_restored_xstate(&fpu->state, envp, xfeatures, fx_only); + /* Fold the legacy FP storage */ + convert_to_fxsr(&fpregs->fxsave, &env); - fpregs_lock(); - if (use_xsave()) { - u64 init_bv = xfeatures_mask & ~XFEATURE_MASK_FPSSE; - copy_kernel_to_xregs(&init_fpstate.xsave, init_bv); - } + fpregs_lock(); + if (use_xsave()) { + /* + * Remove all UABI feature bits not set in user_xfeatures + * from the memory xstate header which makes the full + * restore below bring them into init state. This works for + * fx_only mode as well because that has only FP and SSE + * set in user_xfeatures. + * + * Preserve supervisor states! 
+ */ u64 mask = user_xfeatures | xfeatures_mask_supervisor(); - ret = copy_kernel_to_fxregs_err(&fpu->state.fxsave); + fpregs->xsave.header.xfeatures &= mask; + success = !os_xrstor_safe(fpu->fpstate, + fpu_kernel_cfg.max_features); } else { - ret = __copy_from_user(&fpu->state.fsave, buf_fx, state_size); - if (ret) - goto err_out; - - fpregs_lock(); - ret = copy_kernel_to_fregs_err(&fpu->state.fsave); + success = !fxrstor_safe(&fpregs->fxsave); } - if (!ret) + + if (likely(success)) fpregs_mark_activate(); - else - fpregs_deactivate(fpu); - fpregs_unlock(); -err_out: - if (ret) - fpu__clear(fpu); - return ret; + fpregs_unlock(); + return success; } -static inline int xstate_sigframe_size(void) +static inline unsigned int xstate_sigframe_size(struct fpstate *fpstate) { - return use_xsave() ? fpu_user_xstate_size + FP_XSTATE_MAGIC2_SIZE : - fpu_user_xstate_size; + unsigned int size = fpstate->user_size; + + return use_xsave() ? size + FP_XSTATE_MAGIC2_SIZE : size; } /* * Restore FPU state from a sigframe: */ -int fpu__restore_sig(void __user *buf, int ia32_frame) +bool fpu__restore_sig(void __user *buf, int ia32_frame) { + struct fpu *fpu = &current->thread.fpu; void __user *buf_fx = buf; - int size = xstate_sigframe_size(); + bool ia32_fxstate = false; + bool success = false; + unsigned int size; + + if (unlikely(!buf)) { + fpu__clear_user_states(fpu); + return true; + } + size = xstate_sigframe_size(fpu->fpstate); + + ia32_frame &= (IS_ENABLED(CONFIG_X86_32) || + IS_ENABLED(CONFIG_IA32_EMULATION)); + + /* + * Only FXSR enabled systems need the FX state quirk. + * FRSTOR does not need it and can use the fast path. 
+ */ if (ia32_frame && use_fxsr()) { buf_fx = buf + sizeof(struct fregs_state); size += sizeof(struct fregs_state); + ia32_fxstate = true; } - return __fpu__restore_sig(buf, buf_fx, size); + if (!access_ok(buf, size)) + goto out; + + if (!IS_ENABLED(CONFIG_X86_64) && !cpu_feature_enabled(X86_FEATURE_FPU)) { + success = !fpregs_soft_set(current, NULL, 0, + sizeof(struct user_i387_ia32_struct), + NULL, buf); + } else { + success = __fpu_restore_sig(buf, buf_fx, ia32_fxstate); + } + +out: + if (unlikely(!success)) + fpu__clear_user_states(fpu); + return success; } unsigned long fpu__alloc_mathframe(unsigned long sp, int ia32_frame, unsigned long *buf_fx, unsigned long *size) { - unsigned long frame_size = xstate_sigframe_size(); + unsigned long frame_size = xstate_sigframe_size(current->thread.fpu.fpstate); *buf_fx = sp = round_down(sp - frame_size, 64); if (ia32_frame && use_fxsr()) { @@ -452,28 +518,25 @@ fpu__alloc_mathframe(unsigned long sp, int ia32_frame, return sp; } -/* - * Prepare the SW reserved portion of the fxsave memory layout, indicating - * the presence of the extended state information in the memory layout - * pointed by the fpstate pointer in the sigcontext. - * This will be saved when ever the FP and extended state context is - * saved on the user stack during the signal handler delivery to the user. 
- */ -void fpu__init_prepare_fx_sw_frame(void) + +unsigned long __init fpu__get_fpstate_size(void) { - int size = fpu_user_xstate_size + FP_XSTATE_MAGIC2_SIZE; + unsigned long ret = fpu_user_cfg.max_size; - fx_sw_reserved.magic1 = FP_XSTATE_MAGIC1; - fx_sw_reserved.extended_size = size; - fx_sw_reserved.xfeatures = xfeatures_mask; - fx_sw_reserved.xstate_size = fpu_user_xstate_size; + if (use_xsave()) + ret += FP_XSTATE_MAGIC2_SIZE; - if (IS_ENABLED(CONFIG_IA32_EMULATION) || - IS_ENABLED(CONFIG_X86_32)) { - int fsave_header_size = sizeof(struct fregs_state); + /* + * This space is needed on (most) 32-bit kernels, or when a 32-bit + * app is running on a 64-bit kernel. To keep things simple, just + * assume the worst case and always include space for 'freg_state', + * even for 64-bit apps on 64-bit kernels. This wastes a bit of + * space, but keeps the code simple. + */ + if ((IS_ENABLED(CONFIG_IA32_EMULATION) || + IS_ENABLED(CONFIG_X86_32)) && use_fxsr()) + ret += sizeof(struct fregs_state); - fx_sw_reserved_ia32 = fx_sw_reserved; - fx_sw_reserved_ia32.extended_size = size + fsave_header_size; - } + return ret; } diff --git a/arch/x86/kernel/fpu/xstate.c b/arch/x86/kernel/fpu/xstate.c index a1806598aaa4..59e543b95a3c 100644 --- a/arch/x86/kernel/fpu/xstate.c +++ b/arch/x86/kernel/fpu/xstate.c @@ -4,21 +4,33 @@ * * Author: Suresh Siddha <suresh.b.siddha@intel.com> */ +#include <linux/bitops.h> #include <linux/compat.h> #include <linux/cpu.h> #include <linux/mman.h> +#include <linux/nospec.h> #include <linux/pkeys.h> #include <linux/seq_file.h> #include <linux/proc_fs.h> +#include <linux/vmalloc.h> #include <asm/fpu/api.h> -#include <asm/fpu/internal.h> -#include <asm/fpu/signal.h> #include <asm/fpu/regset.h> -#include <asm/fpu/xstate.h> +#include <asm/fpu/signal.h> +#include <asm/fpu/xcr.h> #include <asm/tlbflush.h> -#include <asm/cpufeature.h> +#include <asm/prctl.h> +#include <asm/elf.h> + +#include "context.h" +#include "internal.h" +#include "legacy.h" +#include 
"xstate.h" + +#define for_each_extended_xfeature(bit, mask) \ + (bit) = FIRST_EXTENDED_XFEATURE; \ + for_each_set_bit_from(bit, (unsigned long *)&(mask), 8 * sizeof(mask)) /* * Although we spell it out in here, the Processor Trace @@ -37,37 +49,42 @@ static const char *xfeature_names[] = "AVX-512 ZMM_Hi256" , "Processor Trace (unused)" , "Protection Keys User registers", + "PASID state", + "unknown xstate feature" , + "unknown xstate feature" , + "unknown xstate feature" , + "unknown xstate feature" , + "unknown xstate feature" , + "unknown xstate feature" , + "AMX Tile config" , + "AMX Tile data" , "unknown xstate feature" , }; -static short xsave_cpuid_features[] __initdata = { - X86_FEATURE_FPU, - X86_FEATURE_XMM, - X86_FEATURE_AVX, - X86_FEATURE_MPX, - X86_FEATURE_MPX, - X86_FEATURE_AVX512F, - X86_FEATURE_AVX512F, - X86_FEATURE_AVX512F, - X86_FEATURE_INTEL_PT, - X86_FEATURE_PKU, +static unsigned short xsave_cpuid_features[] __initdata = { + [XFEATURE_FP] = X86_FEATURE_FPU, + [XFEATURE_SSE] = X86_FEATURE_XMM, + [XFEATURE_YMM] = X86_FEATURE_AVX, + [XFEATURE_BNDREGS] = X86_FEATURE_MPX, + [XFEATURE_BNDCSR] = X86_FEATURE_MPX, + [XFEATURE_OPMASK] = X86_FEATURE_AVX512F, + [XFEATURE_ZMM_Hi256] = X86_FEATURE_AVX512F, + [XFEATURE_Hi16_ZMM] = X86_FEATURE_AVX512F, + [XFEATURE_PT_UNIMPLEMENTED_SO_FAR] = X86_FEATURE_INTEL_PT, + [XFEATURE_PKRU] = X86_FEATURE_PKU, + [XFEATURE_PASID] = X86_FEATURE_ENQCMD, + [XFEATURE_XTILE_CFG] = X86_FEATURE_AMX_TILE, + [XFEATURE_XTILE_DATA] = X86_FEATURE_AMX_TILE, }; -/* - * Mask of xstate features supported by the CPU and the kernel: - */ -u64 xfeatures_mask __read_mostly; - -static unsigned int xstate_offsets[XFEATURE_MAX] = { [ 0 ... XFEATURE_MAX - 1] = -1}; -static unsigned int xstate_sizes[XFEATURE_MAX] = { [ 0 ... XFEATURE_MAX - 1] = -1}; -static unsigned int xstate_comp_offsets[XFEATURE_MAX] = { [ 0 ... XFEATURE_MAX - 1] = -1}; +static unsigned int xstate_offsets[XFEATURE_MAX] __ro_after_init = + { [ 0 ... 
XFEATURE_MAX - 1] = -1}; +static unsigned int xstate_sizes[XFEATURE_MAX] __ro_after_init = + { [ 0 ... XFEATURE_MAX - 1] = -1}; +static unsigned int xstate_flags[XFEATURE_MAX] __ro_after_init; -/* - * The XSAVE area of kernel can be in standard or compacted format; - * it is always in standard format for user mode. This is the user - * mode standard format size used for signal and ptrace frames. - */ -unsigned int fpu_user_xstate_size; +#define XSTATE_FLAG_SUPERVISOR BIT(0) +#define XSTATE_FLAG_ALIGNED64 BIT(1) /* * Return whether the system supports a given xfeature. @@ -76,7 +93,7 @@ unsigned int fpu_user_xstate_size; */ int cpu_has_xfeatures(u64 xfeatures_needed, const char **feature_name) { - u64 xfeatures_missing = xfeatures_needed & ~xfeatures_mask; + u64 xfeatures_missing = xfeatures_needed & ~fpu_kernel_cfg.max_features; if (unlikely(feature_name)) { long xfeature_idx, max_idx; @@ -107,101 +124,42 @@ int cpu_has_xfeatures(u64 xfeatures_needed, const char **feature_name) } EXPORT_SYMBOL_GPL(cpu_has_xfeatures); -static bool xfeature_is_supervisor(int xfeature_nr) +static bool xfeature_is_aligned64(int xfeature_nr) { - /* - * Extended State Enumeration Sub-leaves (EAX = 0DH, ECX = n, n > 1) - * returns ECX[0] set to (1) for a supervisor state, and cleared (0) - * for a user state. 
- */ - u32 eax, ebx, ecx, edx; - - cpuid_count(XSTATE_CPUID, xfeature_nr, &eax, &ebx, &ecx, &edx); - return ecx & 1; + return xstate_flags[xfeature_nr] & XSTATE_FLAG_ALIGNED64; } -static bool xfeature_is_user(int xfeature_nr) +static bool xfeature_is_supervisor(int xfeature_nr) { - return !xfeature_is_supervisor(xfeature_nr); + return xstate_flags[xfeature_nr] & XSTATE_FLAG_SUPERVISOR; } -/* - * When executing XSAVEOPT (or other optimized XSAVE instructions), if - * a processor implementation detects that an FPU state component is still - * (or is again) in its initialized state, it may clear the corresponding - * bit in the header.xfeatures field, and can skip the writeout of registers - * to the corresponding memory layout. - * - * This means that when the bit is zero, the state component might still contain - * some previous - non-initialized register state. - * - * Before writing xstate information to user-space we sanitize those components, - * to always ensure that the memory layout of a feature will be in the init state - * if the corresponding header bit is zero. This is to ensure that user-space doesn't - * see some stale state in the memory layout during signal handling, debugging etc. - */ -void fpstate_sanitize_xstate(struct fpu *fpu) +static unsigned int xfeature_get_offset(u64 xcomp_bv, int xfeature) { - struct fxregs_state *fx = &fpu->state.fxsave; - int feature_bit; - u64 xfeatures; - - if (!use_xsaveopt()) - return; - - xfeatures = fpu->state.xsave.header.xfeatures; + unsigned int offs, i; /* - * None of the feature bits are in init state. So nothing else - * to do for us, as the memory layout is up to date. + * Non-compacted format and legacy features use the cached fixed + * offsets. 
*/ - if ((xfeatures & xfeatures_mask) == xfeatures_mask) - return; + if (!cpu_feature_enabled(X86_FEATURE_XCOMPACTED) || + xfeature <= XFEATURE_SSE) + return xstate_offsets[xfeature]; /* - * FP is in init state + * Compacted format offsets depend on the actual content of the + * compacted xsave area which is determined by the xcomp_bv header + * field. */ - if (!(xfeatures & XFEATURE_MASK_FP)) { - fx->cwd = 0x37f; - fx->swd = 0; - fx->twd = 0; - fx->fop = 0; - fx->rip = 0; - fx->rdp = 0; - memset(&fx->st_space[0], 0, 128); - } - - /* - * SSE is in init state - */ - if (!(xfeatures & XFEATURE_MASK_SSE)) - memset(&fx->xmm_space[0], 0, 256); - - /* - * First two features are FPU and SSE, which above we handled - * in a special way already: - */ - feature_bit = 0x2; - xfeatures = (xfeatures_mask & ~xfeatures) >> 2; - - /* - * Update all the remaining memory layouts according to their - * standard xstate layout, if their header bit is in the init - * state: - */ - while (xfeatures) { - if (xfeatures & 0x1) { - int offset = xstate_comp_offsets[feature_bit]; - int size = xstate_sizes[feature_bit]; - - memcpy((void *)fx + offset, - (void *)&init_fpstate.xsave + offset, - size); - } - - xfeatures >>= 1; - feature_bit++; + offs = FXSAVE_SIZE + XSAVE_HDR_SIZE; + for_each_extended_xfeature(i, xcomp_bv) { + if (xfeature_is_aligned64(i)) + offs = ALIGN(offs, 64); + if (i == xfeature) + break; + offs += xstate_sizes[i]; } + return offs; } /* @@ -210,40 +168,49 @@ void fpstate_sanitize_xstate(struct fpu *fpu) */ void fpu__init_cpu_xstate(void) { - if (!boot_cpu_has(X86_FEATURE_XSAVE) || !xfeatures_mask) + if (!boot_cpu_has(X86_FEATURE_XSAVE) || !fpu_kernel_cfg.max_features) return; + + cr4_set_bits(X86_CR4_OSXSAVE); + /* - * Make it clear that XSAVES supervisor states are not yet - * implemented should anyone expect it to work by changing - * bits in XFEATURE_MASK_* macros and XCR0. + * Must happen after CR4 setup and before xsetbv() to allow KVM + * lazy passthrough. 
Write independent of the dynamic state static + * key as that does not work on the boot CPU. This also ensures + * that any stale state is wiped out from XFD. */ - WARN_ONCE((xfeatures_mask & XFEATURE_MASK_SUPERVISOR), - "x86/fpu: XSAVES supervisor states are not yet implemented.\n"); + if (cpu_feature_enabled(X86_FEATURE_XFD)) + wrmsrl(MSR_IA32_XFD, init_fpstate.xfd); - xfeatures_mask &= ~XFEATURE_MASK_SUPERVISOR; + /* + * XCR_XFEATURE_ENABLED_MASK (aka. XCR0) sets user features + * managed by XSAVE{C, OPT, S} and XRSTOR{S}. Only XSAVE user + * states can be set here. + */ + xsetbv(XCR_XFEATURE_ENABLED_MASK, fpu_user_cfg.max_features); - cr4_set_bits(X86_CR4_OSXSAVE); - xsetbv(XCR_XFEATURE_ENABLED_MASK, xfeatures_mask); + /* + * MSR_IA32_XSS sets supervisor states managed by XSAVES. + */ + if (boot_cpu_has(X86_FEATURE_XSAVES)) { + wrmsrl(MSR_IA32_XSS, xfeatures_mask_supervisor() | + xfeatures_mask_independent()); + } } -/* - * Note that in the future we will likely need a pair of - * functions here: one for user xstates and the other for - * system xstates. For now, they are the same. - */ -static int xfeature_enabled(enum xfeature xfeature) +static bool xfeature_enabled(enum xfeature xfeature) { - return !!(xfeatures_mask & (1UL << xfeature)); + return fpu_kernel_cfg.max_features & BIT_ULL(xfeature); } /* * Record the offsets and sizes of various xstates contained * in the XSAVE state memory layout. 
*/ -static void __init setup_xstate_features(void) +static void __init setup_xstate_cache(void) { u32 eax, ebx, ecx, edx, i; - /* start at the beginnning of the "extended state" */ + /* start at the beginning of the "extended state" */ unsigned int last_good_offset = offsetof(struct xregs_state, extended_state_area); /* @@ -259,27 +226,29 @@ static void __init setup_xstate_features(void) xstate_sizes[XFEATURE_SSE] = sizeof_field(struct fxregs_state, xmm_space); - for (i = FIRST_EXTENDED_XFEATURE; i < XFEATURE_MAX; i++) { - if (!xfeature_enabled(i)) - continue; - + for_each_extended_xfeature(i, fpu_kernel_cfg.max_features) { cpuid_count(XSTATE_CPUID, i, &eax, &ebx, &ecx, &edx); + xstate_sizes[i] = eax; + xstate_flags[i] = ecx; + /* - * If an xfeature is supervisor state, the offset - * in EBX is invalid. We leave it to -1. + * If an xfeature is supervisor state, the offset in EBX is + * invalid, leave it to -1. */ - if (xfeature_is_user(i)) - xstate_offsets[i] = ebx; + if (xfeature_is_supervisor(i)) + continue; + + xstate_offsets[i] = ebx; - xstate_sizes[i] = eax; /* - * In our xstate size checks, we assume that the - * highest-numbered xstate feature has the - * highest offset in the buffer. Ensure it does. + * In our xstate size checks, we assume that the highest-numbered + * xstate feature has the highest offset in the buffer. Ensure + * it does. 
*/ WARN_ONCE(last_good_offset > xstate_offsets[i], - "x86/fpu: misordered xstate at %d\n", last_good_offset); + "x86/fpu: misordered xstate at %d\n", last_good_offset); + last_good_offset = xstate_offsets[i]; } } @@ -306,6 +275,9 @@ static void __init print_xstate_features(void) print_xstate_feature(XFEATURE_MASK_ZMM_Hi256); print_xstate_feature(XFEATURE_MASK_Hi16_ZMM); print_xstate_feature(XFEATURE_MASK_PKRU); + print_xstate_feature(XFEATURE_MASK_PASID); + print_xstate_feature(XFEATURE_MASK_XTILE_CFG); + print_xstate_feature(XFEATURE_MASK_XTILE_DATA); } /* @@ -318,139 +290,103 @@ static void __init print_xstate_features(void) } while (0) /* - * We could cache this like xstate_size[], but we only use - * it here, so it would be a waste of space. + * Print out xstate component offsets and sizes */ -static int xfeature_is_aligned(int xfeature_nr) +static void __init print_xstate_offset_size(void) { - u32 eax, ebx, ecx, edx; + int i; - CHECK_XFEATURE(xfeature_nr); - cpuid_count(XSTATE_CPUID, xfeature_nr, &eax, &ebx, &ecx, &edx); - /* - * The value returned by ECX[1] indicates the alignment - * of state component 'i' when the compacted format - * of the extended region of an XSAVE area is used: - */ - return !!(ecx & 2); + for_each_extended_xfeature(i, fpu_kernel_cfg.max_features) { + pr_info("x86/fpu: xstate_offset[%d]: %4d, xstate_sizes[%d]: %4d\n", + i, xfeature_get_offset(fpu_kernel_cfg.max_features, i), + i, xstate_sizes[i]); + } } /* - * This function sets up offsets and sizes of all extended states in - * xsave area. This supports both standard format and compacted format - * of the xsave aread. + * This function is called only during boot time when x86 caps are not set + * up and alternative can not be used yet. 
*/ -static void __init setup_xstate_comp(void) +static __init void os_xrstor_booting(struct xregs_state *xstate) { - unsigned int xstate_comp_sizes[XFEATURE_MAX]; - int i; + u64 mask = fpu_kernel_cfg.max_features & XFEATURE_MASK_FPSTATE; + u32 lmask = mask; + u32 hmask = mask >> 32; + int err; + + if (cpu_feature_enabled(X86_FEATURE_XSAVES)) + XSTATE_OP(XRSTORS, xstate, lmask, hmask, err); + else + XSTATE_OP(XRSTOR, xstate, lmask, hmask, err); /* - * The FP xstates and SSE xstates are legacy states. They are always - * in the fixed offsets in the xsave area in either compacted form - * or standard form. + * We should never fault when copying from a kernel buffer, and the FPU + * state we set at boot time should be valid. */ - xstate_comp_offsets[XFEATURE_FP] = 0; - xstate_comp_offsets[XFEATURE_SSE] = offsetof(struct fxregs_state, - xmm_space); - - if (!boot_cpu_has(X86_FEATURE_XSAVES)) { - for (i = FIRST_EXTENDED_XFEATURE; i < XFEATURE_MAX; i++) { - if (xfeature_enabled(i)) { - xstate_comp_offsets[i] = xstate_offsets[i]; - xstate_comp_sizes[i] = xstate_sizes[i]; - } - } - return; - } - - xstate_comp_offsets[FIRST_EXTENDED_XFEATURE] = - FXSAVE_SIZE + XSAVE_HDR_SIZE; - - for (i = FIRST_EXTENDED_XFEATURE; i < XFEATURE_MAX; i++) { - if (xfeature_enabled(i)) - xstate_comp_sizes[i] = xstate_sizes[i]; - else - xstate_comp_sizes[i] = 0; - - if (i > FIRST_EXTENDED_XFEATURE) { - xstate_comp_offsets[i] = xstate_comp_offsets[i-1] - + xstate_comp_sizes[i-1]; - - if (xfeature_is_aligned(i)) - xstate_comp_offsets[i] = - ALIGN(xstate_comp_offsets[i], 64); - } - } + WARN_ON_FPU(err); } /* - * Print out xstate component offsets and sizes + * All supported features have either init state all zeros or are + * handled in setup_init_fpu() individually. This is an explicit + * feature list and does not use XFEATURE_MASK*SUPPORTED to catch + * newly added supported features at build time and make people + * actually look at the init state for the new feature. 
*/ -static void __init print_xstate_offset_size(void) -{ - int i; - - for (i = FIRST_EXTENDED_XFEATURE; i < XFEATURE_MAX; i++) { - if (!xfeature_enabled(i)) - continue; - pr_info("x86/fpu: xstate_offset[%d]: %4d, xstate_sizes[%d]: %4d\n", - i, xstate_comp_offsets[i], i, xstate_sizes[i]); - } -} +#define XFEATURES_INIT_FPSTATE_HANDLED \ + (XFEATURE_MASK_FP | \ + XFEATURE_MASK_SSE | \ + XFEATURE_MASK_YMM | \ + XFEATURE_MASK_OPMASK | \ + XFEATURE_MASK_ZMM_Hi256 | \ + XFEATURE_MASK_Hi16_ZMM | \ + XFEATURE_MASK_PKRU | \ + XFEATURE_MASK_BNDREGS | \ + XFEATURE_MASK_BNDCSR | \ + XFEATURE_MASK_PASID | \ + XFEATURE_MASK_XTILE) /* * setup the xstate image representing the init state */ static void __init setup_init_fpu_buf(void) { - static int on_boot_cpu __initdata = 1; - - WARN_ON_FPU(!on_boot_cpu); - on_boot_cpu = 0; + BUILD_BUG_ON((XFEATURE_MASK_USER_SUPPORTED | + XFEATURE_MASK_SUPERVISOR_SUPPORTED) != + XFEATURES_INIT_FPSTATE_HANDLED); if (!boot_cpu_has(X86_FEATURE_XSAVE)) return; - setup_xstate_features(); print_xstate_features(); - if (boot_cpu_has(X86_FEATURE_XSAVES)) - init_fpstate.xsave.header.xcomp_bv = XCOMP_BV_COMPACTED_FORMAT | - xfeatures_mask; + xstate_init_xcomp_bv(&init_fpstate.regs.xsave, init_fpstate.xfeatures); /* * Init all the features state with header.xfeatures being 0x0 */ - copy_kernel_to_xregs_booting(&init_fpstate.xsave); - - /* - * Dump the init state again. This is to identify the init state - * of any feature which is not represented by all zero's. - */ - copy_xregs_to_kernel_booting(&init_fpstate.xsave); -} - -static int xfeature_uncompacted_offset(int xfeature_nr) -{ - u32 eax, ebx, ecx, edx; + os_xrstor_booting(&init_fpstate.regs.xsave); /* - * Only XSAVES supports supervisor states and it uses compacted - * format. Checking a supervisor state's uncompacted offset is - * an error. + * All components are now in init state. Read the state back so + * that init_fpstate contains all non-zero init state. 
This only + * works with XSAVE, but not with XSAVEOPT and XSAVEC/S because + * those use the init optimization which skips writing data for + * components in init state. + * + * XSAVE could be used, but that would require to reshuffle the + * data when XSAVEC/S is available because XSAVEC/S uses xstate + * compaction. But doing so is a pointless exercise because most + * components have an all zeros init state except for the legacy + * ones (FP and SSE). Those can be saved with FXSAVE into the + * legacy area. Adding new features requires to ensure that init + * state is all zeroes or if not to add the necessary handling + * here. */ - if (XFEATURE_MASK_SUPERVISOR & BIT_ULL(xfeature_nr)) { - WARN_ONCE(1, "No fixed offset for xstate %d\n", xfeature_nr); - return -1; - } - - CHECK_XFEATURE(xfeature_nr); - cpuid_count(XSTATE_CPUID, xfeature_nr, &eax, &ebx, &ecx, &edx); - return ebx; + fxsave(&init_fpstate.regs.fxsave); } -static int xfeature_size(int xfeature_nr) +int xfeature_size(int xfeature_nr) { u32 eax, ebx, ecx, edx; @@ -459,25 +395,12 @@ static int xfeature_size(int xfeature_nr) return eax; } -/* - * 'XSAVES' implies two different things: - * 1. saving of supervisor/system state - * 2. using the compacted format - * - * Use this function when dealing with the compacted format so - * that it is obvious which aspect of 'XSAVES' is being handled - * by the calling code. 
- */ -int using_compacted_format(void) -{ - return boot_cpu_has(X86_FEATURE_XSAVES); -} - /* Validate an xstate header supplied by userspace (ptrace or sigreturn) */ -int validate_xstate_header(const struct xstate_header *hdr) +static int validate_user_xstate_header(const struct xstate_header *hdr, + struct fpstate *fpstate) { /* No unknown or supervisor features may be set */ - if (hdr->xfeatures & (~xfeatures_mask | XFEATURE_MASK_SUPERVISOR)) + if (hdr->xfeatures & ~fpstate->user_xfeatures) return -EINVAL; /* Userspace must use the uncompacted format */ @@ -497,7 +420,7 @@ int validate_xstate_header(const struct xstate_header *hdr) return 0; } -static void __xstate_dump_leaves(void) +static void __init __xstate_dump_leaves(void) { int i; u32 eax, ebx, ecx, edx; @@ -532,12 +455,73 @@ static void __xstate_dump_leaves(void) } \ } while (0) +/** + * check_xtile_data_against_struct - Check tile data state size. + * + * Calculate the state size by multiplying the single tile size which is + * recorded in a C struct, and the number of tiles that the CPU informs. + * Compare the provided size with the calculation. + * + * @size: The tile data state size + * + * Returns: 0 on success, -EINVAL on mismatch. + */ +static int __init check_xtile_data_against_struct(int size) +{ + u32 max_palid, palid, state_size; + u32 eax, ebx, ecx, edx; + u16 max_tile; + + /* + * Check the maximum palette id: + * eax: the highest numbered palette subleaf. + */ + cpuid_count(TILE_CPUID, 0, &max_palid, &ebx, &ecx, &edx); + + /* + * Cross-check each tile size and find the maximum number of + * supported tiles. 
+ */ + for (palid = 1, max_tile = 0; palid <= max_palid; palid++) { + u16 tile_size, max; + + /* + * Check the tile size info: + * eax[31:16]: bytes per title + * ebx[31:16]: the max names (or max number of tiles) + */ + cpuid_count(TILE_CPUID, palid, &eax, &ebx, &edx, &edx); + tile_size = eax >> 16; + max = ebx >> 16; + + if (tile_size != sizeof(struct xtile_data)) { + pr_err("%s: struct is %zu bytes, cpu xtile %d bytes\n", + __stringify(XFEATURE_XTILE_DATA), + sizeof(struct xtile_data), tile_size); + __xstate_dump_leaves(); + return -EINVAL; + } + + if (max > max_tile) + max_tile = max; + } + + state_size = sizeof(struct xtile_data) * max_tile; + if (size != state_size) { + pr_err("%s: calculated size is %u bytes, cpu state %d bytes\n", + __stringify(XFEATURE_XTILE_DATA), state_size, size); + __xstate_dump_leaves(); + return -EINVAL; + } + return 0; +} + /* * We have a C struct for each 'xstate'. We need to ensure * that our software representation matches what the CPU * tells us about the state's size. */ -static void check_xstate_against_struct(int nr) +static bool __init check_xstate_against_struct(int nr) { /* * Ask the CPU for the size of the state. @@ -554,6 +538,12 @@ static void check_xstate_against_struct(int nr) XCHECK_SZ(sz, nr, XFEATURE_ZMM_Hi256, struct avx_512_zmm_uppers_state); XCHECK_SZ(sz, nr, XFEATURE_Hi16_ZMM, struct avx_512_hi16_state); XCHECK_SZ(sz, nr, XFEATURE_PKRU, struct pkru_state); + XCHECK_SZ(sz, nr, XFEATURE_PASID, struct ia32_pasid_state); + XCHECK_SZ(sz, nr, XFEATURE_XTILE_CFG, struct xtile_cfg); + + /* The tile data size varies between implementations. 
*/ + if (nr == XFEATURE_XTILE_DATA) + check_xtile_data_against_struct(sz); /* * Make *SURE* to add any feature numbers in below if @@ -562,67 +552,73 @@ static void check_xstate_against_struct(int nr) */ if ((nr < XFEATURE_YMM) || (nr >= XFEATURE_MAX) || - (nr == XFEATURE_PT_UNIMPLEMENTED_SO_FAR)) { + (nr == XFEATURE_PT_UNIMPLEMENTED_SO_FAR) || + ((nr >= XFEATURE_RSRVD_COMP_11) && (nr <= XFEATURE_RSRVD_COMP_16))) { WARN_ONCE(1, "no structure for xstate: %d\n", nr); XSTATE_WARN_ON(1); + return false; } + return true; +} + +static unsigned int xstate_calculate_size(u64 xfeatures, bool compacted) +{ + unsigned int topmost = fls64(xfeatures) - 1; + unsigned int offset = xstate_offsets[topmost]; + + if (topmost <= XFEATURE_SSE) + return sizeof(struct xregs_state); + + if (compacted) + offset = xfeature_get_offset(xfeatures, topmost); + return offset + xstate_sizes[topmost]; } /* * This essentially double-checks what the cpu told us about * how large the XSAVE buffer needs to be. We are recalculating * it to be safe. + * + * Independent XSAVE features allocate their own buffers and are not + * covered by these checks. Only the size of the buffer for task->fpu + * is checked here. */ -static void do_extra_xstate_size_checks(void) +static bool __init paranoid_xstate_size_valid(unsigned int kernel_size) { - int paranoid_xstate_size = FXSAVE_SIZE + XSAVE_HDR_SIZE; + bool compacted = cpu_feature_enabled(X86_FEATURE_XCOMPACTED); + bool xsaves = cpu_feature_enabled(X86_FEATURE_XSAVES); + unsigned int size = FXSAVE_SIZE + XSAVE_HDR_SIZE; int i; - for (i = FIRST_EXTENDED_XFEATURE; i < XFEATURE_MAX; i++) { - if (!xfeature_enabled(i)) - continue; - - check_xstate_against_struct(i); + for_each_extended_xfeature(i, fpu_kernel_cfg.max_features) { + if (!check_xstate_against_struct(i)) + return false; /* * Supervisor state components can be managed only by - * XSAVES, which is compacted-format only. 
- */ - if (!using_compacted_format()) - XSTATE_WARN_ON(xfeature_is_supervisor(i)); - - /* Align from the end of the previous feature */ - if (xfeature_is_aligned(i)) - paranoid_xstate_size = ALIGN(paranoid_xstate_size, 64); - /* - * The offset of a given state in the non-compacted - * format is given to us in a CPUID leaf. We check - * them for being ordered (increasing offsets) in - * setup_xstate_features(). - */ - if (!using_compacted_format()) - paranoid_xstate_size = xfeature_uncompacted_offset(i); - /* - * The compacted-format offset always depends on where - * the previous state ended. + * XSAVES. */ - paranoid_xstate_size += xfeature_size(i); + if (!xsaves && xfeature_is_supervisor(i)) { + XSTATE_WARN_ON(1); + return false; + } } - XSTATE_WARN_ON(paranoid_xstate_size != fpu_kernel_xstate_size); + size = xstate_calculate_size(fpu_kernel_cfg.max_features, compacted); + XSTATE_WARN_ON(size != kernel_size); + return size == kernel_size; } - /* - * Get total size of enabled xstates in XCR0/xfeatures_mask. + * Get total size of enabled xstates in XCR0 | IA32_XSS. * * Note the SDM's wording here. "sub-function 0" only enumerates * the size of the *user* states. If we use it to size a buffer * that we use 'XSAVES' on, we could potentially overflow the * buffer because 'XSAVES' saves system states too. * - * Note that we do not currently set any bits on IA32_XSS so - * 'XCR0 | IA32_XSS == XCR0' for now. + * This also takes compaction into account. So this works for + * XSAVEC as well. */ -static unsigned int __init get_xsaves_size(void) +static unsigned int __init get_compacted_size(void) { unsigned int eax, ebx, ecx, edx; /* @@ -632,12 +628,43 @@ static unsigned int __init get_xsaves_size(void) * containing all the state components * corresponding to bits currently set in * XCR0 | IA32_XSS. + * + * When XSAVES is not available but XSAVEC is (virt), then there + * are no supervisor states, but XSAVEC still uses compacted + * format. 
*/ cpuid_count(XSTATE_CPUID, 1, &eax, &ebx, &ecx, &edx); return ebx; } -static unsigned int __init get_xsave_size(void) +/* + * Get the total size of the enabled xstates without the independent supervisor + * features. + */ +static unsigned int __init get_xsave_compacted_size(void) +{ + u64 mask = xfeatures_mask_independent(); + unsigned int size; + + if (!mask) + return get_compacted_size(); + + /* Disable independent features. */ + wrmsrl(MSR_IA32_XSS, xfeatures_mask_supervisor()); + + /* + * Ask the hardware what size is required of the buffer. + * This is the size required for the task->fpu buffer. + */ + size = get_compacted_size(); + + /* Re-enable independent features so XSAVES will work on them again. */ + wrmsrl(MSR_IA32_XSS, xfeatures_mask_supervisor() | mask); + + return size; +} + +static unsigned int __init get_xsave_size_user(void) { unsigned int eax, ebx, ecx, edx; /* @@ -651,48 +678,41 @@ static unsigned int __init get_xsave_size(void) return ebx; } -/* - * Will the runtime-enumerated 'xstate_size' fit in the init - * task's statically-allocated buffer? - */ -static bool is_supported_xstate_size(unsigned int test_xstate_size) -{ - if (test_xstate_size <= sizeof(union fpregs_state)) - return true; - - pr_warn("x86/fpu: xstate buffer too small (%zu < %d), disabling xsave\n", - sizeof(union fpregs_state), test_xstate_size); - return false; -} - static int __init init_xstate_size(void) { /* Recompute the context size for enabled features: */ - unsigned int possible_xstate_size; - unsigned int xsave_size; + unsigned int user_size, kernel_size, kernel_default_size; + bool compacted = cpu_feature_enabled(X86_FEATURE_XCOMPACTED); - xsave_size = get_xsave_size(); + /* Uncompacted user space size */ + user_size = get_xsave_size_user(); - if (boot_cpu_has(X86_FEATURE_XSAVES)) - possible_xstate_size = get_xsaves_size(); + /* + * XSAVES kernel size includes supervisor states and uses compacted + * format. 
XSAVEC uses compacted format, but does not save + * supervisor states. + * + * XSAVE[OPT] do not support supervisor states so kernel and user + * size is identical. + */ + if (compacted) + kernel_size = get_xsave_compacted_size(); else - possible_xstate_size = xsave_size; + kernel_size = user_size; - /* Ensure we have the space to store all enabled: */ - if (!is_supported_xstate_size(possible_xstate_size)) + kernel_default_size = + xstate_calculate_size(fpu_kernel_cfg.default_features, compacted); + + if (!paranoid_xstate_size_valid(kernel_size)) return -EINVAL; - /* - * The size is OK, we are definitely going to use xsave, - * make it known to the world that we need more space. - */ - fpu_kernel_xstate_size = possible_xstate_size; - do_extra_xstate_size_checks(); + fpu_kernel_cfg.max_size = kernel_size; + fpu_user_cfg.max_size = user_size; + + fpu_kernel_cfg.default_size = kernel_default_size; + fpu_user_cfg.default_size = + xstate_calculate_size(fpu_user_cfg.default_features, false); - /* - * User space is always in standard format. - */ - fpu_user_xstate_size = xsave_size; return 0; } @@ -700,27 +720,38 @@ static int __init init_xstate_size(void) * We enabled the XSAVE hardware, but something went wrong and * we can not use it. Disable it. */ -static void fpu__init_disable_system_xstate(void) +static void __init fpu__init_disable_system_xstate(unsigned int legacy_size) { - xfeatures_mask = 0; + fpu_kernel_cfg.max_features = 0; cr4_clear_bits(X86_CR4_OSXSAVE); setup_clear_cpu_cap(X86_FEATURE_XSAVE); + + /* Restore the legacy size.*/ + fpu_kernel_cfg.max_size = legacy_size; + fpu_kernel_cfg.default_size = legacy_size; + fpu_user_cfg.max_size = legacy_size; + fpu_user_cfg.default_size = legacy_size; + + /* + * Prevent enabling the static branch which enables writes to the + * XFD MSR. + */ + init_fpstate.xfd = 0; + + fpstate_reset(&current->thread.fpu); } /* * Enable and initialize the xsave feature. * Called once per system bootup. 
*/ -void __init fpu__init_system_xstate(void) +void __init fpu__init_system_xstate(unsigned int legacy_size) { unsigned int eax, ebx, ecx, edx; - static int on_boot_cpu __initdata = 1; + u64 xfeatures; int err; int i; - WARN_ON_FPU(!on_boot_cpu); - on_boot_cpu = 0; - if (!boot_cpu_has(X86_FEATURE_FPU)) { pr_info("x86/fpu: No FPU detected\n"); return; @@ -737,16 +768,26 @@ void __init fpu__init_system_xstate(void) return; } + /* + * Find user xstates supported by the processor. + */ cpuid_count(XSTATE_CPUID, 0, &eax, &ebx, &ecx, &edx); - xfeatures_mask = eax + ((u64)edx << 32); + fpu_kernel_cfg.max_features = eax + ((u64)edx << 32); + + /* + * Find supervisor xstates supported by the processor. + */ + cpuid_count(XSTATE_CPUID, 1, &eax, &ebx, &ecx, &edx); + fpu_kernel_cfg.max_features |= ecx + ((u64)edx << 32); - if ((xfeatures_mask & XFEATURE_MASK_FPSSE) != XFEATURE_MASK_FPSSE) { + if ((fpu_kernel_cfg.max_features & XFEATURE_MASK_FPSSE) != XFEATURE_MASK_FPSSE) { /* * This indicates that something really unexpected happened * with the enumeration. Disable XSAVE and try to continue * booting without it. This is too early to BUG(). */ - pr_err("x86/fpu: FP/SSE not present amongst the CPU's xstate features: 0x%llx.\n", xfeatures_mask); + pr_err("x86/fpu: FP/SSE not present amongst the CPU's xstate features: 0x%llx.\n", + fpu_kernel_cfg.max_features); goto out_disable; } @@ -754,38 +795,103 @@ void __init fpu__init_system_xstate(void) * Clear XSAVE features that are disabled in the normal CPUID. */ for (i = 0; i < ARRAY_SIZE(xsave_cpuid_features); i++) { - if (!boot_cpu_has(xsave_cpuid_features[i])) - xfeatures_mask &= ~BIT(i); + unsigned short cid = xsave_cpuid_features[i]; + + /* Careful: X86_FEATURE_FPU is 0! 
*/ + if ((i != XFEATURE_FP && !cid) || !boot_cpu_has(cid)) + fpu_kernel_cfg.max_features &= ~BIT_ULL(i); } - xfeatures_mask &= fpu__get_supported_xfeatures_mask(); + if (!cpu_feature_enabled(X86_FEATURE_XFD)) + fpu_kernel_cfg.max_features &= ~XFEATURE_MASK_USER_DYNAMIC; + + if (!cpu_feature_enabled(X86_FEATURE_XSAVES)) + fpu_kernel_cfg.max_features &= XFEATURE_MASK_USER_SUPPORTED; + else + fpu_kernel_cfg.max_features &= XFEATURE_MASK_USER_SUPPORTED | + XFEATURE_MASK_SUPERVISOR_SUPPORTED; + + fpu_user_cfg.max_features = fpu_kernel_cfg.max_features; + fpu_user_cfg.max_features &= XFEATURE_MASK_USER_SUPPORTED; + + /* Clean out dynamic features from default */ + fpu_kernel_cfg.default_features = fpu_kernel_cfg.max_features; + fpu_kernel_cfg.default_features &= ~XFEATURE_MASK_USER_DYNAMIC; + + fpu_user_cfg.default_features = fpu_user_cfg.max_features; + fpu_user_cfg.default_features &= ~XFEATURE_MASK_USER_DYNAMIC; + + /* Store it for paranoia check at the end */ + xfeatures = fpu_kernel_cfg.max_features; + + /* + * Initialize the default XFD state in initfp_state and enable the + * dynamic sizing mechanism if dynamic states are available. The + * static key cannot be enabled here because this runs before + * jump_label_init(). This is delayed to an initcall. 
+ */ + init_fpstate.xfd = fpu_user_cfg.max_features & XFEATURE_MASK_USER_DYNAMIC; + + /* Set up compaction feature bit */ + if (cpu_feature_enabled(X86_FEATURE_XSAVEC) || + cpu_feature_enabled(X86_FEATURE_XSAVES)) + setup_force_cpu_cap(X86_FEATURE_XCOMPACTED); /* Enable xstate instructions to be able to continue with initialization: */ fpu__init_cpu_xstate(); + + /* Cache size, offset and flags for initialization */ + setup_xstate_cache(); + err = init_xstate_size(); if (err) goto out_disable; + /* Reset the state for the current task */ + fpstate_reset(&current->thread.fpu); + /* * Update info used for ptrace frames; use standard-format size and no * supervisor xstates: */ - update_regset_xstate_info(fpu_user_xstate_size, xfeatures_mask & ~XFEATURE_MASK_SUPERVISOR); + update_regset_xstate_info(fpu_user_cfg.max_size, + fpu_user_cfg.max_features); + + /* + * init_fpstate excludes dynamic states as they are large but init + * state is zero. + */ + init_fpstate.size = fpu_kernel_cfg.default_size; + init_fpstate.xfeatures = fpu_kernel_cfg.default_features; + + if (init_fpstate.size > sizeof(init_fpstate.regs)) { + pr_warn("x86/fpu: init_fpstate buffer too small (%zu < %d), disabling XSAVE\n", + sizeof(init_fpstate.regs), init_fpstate.size); + goto out_disable; + } - fpu__init_prepare_fx_sw_frame(); setup_init_fpu_buf(); - setup_xstate_comp(); - print_xstate_offset_size(); + /* + * Paranoia check whether something in the setup modified the + * xfeatures mask. + */ + if (xfeatures != fpu_kernel_cfg.max_features) { + pr_err("x86/fpu: xfeatures modified from 0x%016llx to 0x%016llx during init, disabling XSAVE\n", + xfeatures, fpu_kernel_cfg.max_features); + goto out_disable; + } + + print_xstate_offset_size(); pr_info("x86/fpu: Enabled xstate features 0x%llx, context size is %d bytes, using '%s' format.\n", - xfeatures_mask, - fpu_kernel_xstate_size, - boot_cpu_has(X86_FEATURE_XSAVES) ? 
"compacted" : "standard"); + fpu_kernel_cfg.max_features, + fpu_kernel_cfg.max_size, + boot_cpu_has(X86_FEATURE_XCOMPACTED) ? "compacted" : "standard"); return; out_disable: /* something went wrong, try to boot without any XSAVE support */ - fpu__init_disable_system_xstate(); + fpu__init_disable_system_xstate(legacy_size); } /* @@ -796,8 +902,20 @@ void fpu__resume_cpu(void) /* * Restore XCR0 on xsave capable CPUs: */ - if (boot_cpu_has(X86_FEATURE_XSAVE)) - xsetbv(XCR_XFEATURE_ENABLED_MASK, xfeatures_mask); + if (cpu_feature_enabled(X86_FEATURE_XSAVE)) + xsetbv(XCR_XFEATURE_ENABLED_MASK, fpu_user_cfg.max_features); + + /* + * Restore IA32_XSS. The same CPUID bit enumerates support + * of XSAVES and MSR_IA32_XSS. + */ + if (cpu_feature_enabled(X86_FEATURE_XSAVES)) { + wrmsrl(MSR_IA32_XSS, xfeatures_mask_supervisor() | + xfeatures_mask_independent()); + } + + if (fpu_state_size_dynamic()) + wrmsrl(MSR_IA32_XFD, current->thread.fpu.fpstate->xfd); } /* @@ -807,13 +925,19 @@ void fpu__resume_cpu(void) */ static void *__raw_xsave_addr(struct xregs_state *xsave, int xfeature_nr) { - if (!xfeature_enabled(xfeature_nr)) { - WARN_ON_FPU(1); + u64 xcomp_bv = xsave->header.xcomp_bv; + + if (WARN_ON_ONCE(!xfeature_enabled(xfeature_nr))) return NULL; + + if (cpu_feature_enabled(X86_FEATURE_XCOMPACTED)) { + if (WARN_ON_ONCE(!(xcomp_bv & BIT_ULL(xfeature_nr)))) + return NULL; } - return (void *)xsave + xstate_comp_offsets[xfeature_nr]; + return (void *)xsave + xfeature_get_offset(xcomp_bv, xfeature_nr); } + /* * Given the xsave area and a state inside, this function returns the * address of the state. @@ -842,11 +966,11 @@ void *get_xsave_addr(struct xregs_state *xsave, int xfeature_nr) /* * We should not ever be requesting features that we - * have not enabled. Remember that xfeatures_mask is - * what we write to the XCR0 register. + * have not enabled. 
*/ - WARN_ONCE(!(xfeatures_mask & BIT_ULL(xfeature_nr)), - "get of unsupported state"); + if (WARN_ON_ONCE(!xfeature_enabled(xfeature_nr))) + return NULL; + /* * This assumes the last 'xsave*' instruction to * have requested that 'xfeature_nr' be saved. @@ -863,58 +987,32 @@ void *get_xsave_addr(struct xregs_state *xsave, int xfeature_nr) return __raw_xsave_addr(xsave, xfeature_nr); } -EXPORT_SYMBOL_GPL(get_xsave_addr); - -/* - * This wraps up the common operations that need to occur when retrieving - * data from xsave state. It first ensures that the current task was - * using the FPU and retrieves the data in to a buffer. It then calculates - * the offset of the requested field in the buffer. - * - * This function is safe to call whether the FPU is in use or not. - * - * Note that this only works on the current task. - * - * Inputs: - * @xfeature_nr: state which is defined in xsave.h (e.g. XFEATURE_FP, - * XFEATURE_SSE, etc...) - * Output: - * address of the state in the xsave area or NULL if the state - * is not present or is in its 'init state'. - */ -const void *get_xsave_field_ptr(int xfeature_nr) -{ - struct fpu *fpu = &current->thread.fpu; - - /* - * fpu__save() takes the CPU's xstate registers - * and saves them off to the 'fpu memory buffer. - */ - fpu__save(fpu); - - return get_xsave_addr(&fpu->state.xsave, xfeature_nr); -} #ifdef CONFIG_ARCH_HAS_PKEYS -#define NR_VALID_PKRU_BITS (CONFIG_NR_PROTECTION_KEYS * 2) -#define PKRU_VALID_MASK (NR_VALID_PKRU_BITS - 1) /* * This will go out and modify PKRU register to set the access * rights for @pkey to @init_val. */ int arch_set_user_pkey_access(struct task_struct *tsk, int pkey, - unsigned long init_val) + unsigned long init_val) { - u32 old_pkru; - int pkey_shift = (pkey * PKRU_BITS_PER_PKEY); - u32 new_pkru_bits = 0; + u32 old_pkru, new_pkru_bits = 0; + int pkey_shift; /* * This check implies XSAVE support. OSPKE only gets * set if we enable XSAVE and we enable PKU in XCR0. 
*/ - if (!boot_cpu_has(X86_FEATURE_OSPKE)) + if (!cpu_feature_enabled(X86_FEATURE_OSPKE)) + return -EINVAL; + + /* + * This code should only be called with valid 'pkey' + * values originating from in-kernel users. Complain + * if a bad value is observed. + */ + if (WARN_ON_ONCE(pkey >= arch_max_pkey())) return -EINVAL; /* Set the bits we need in PKRU: */ @@ -924,6 +1022,7 @@ int arch_set_user_pkey_access(struct task_struct *tsk, int pkey, new_pkru_bits |= PKRU_WD_BIT; /* Shift the bits in to the correct place in PKRU for pkey: */ + pkey_shift = pkey * PKRU_BITS_PER_PKEY; new_pkru_bits <<= pkey_shift; /* Get old PKRU and mask off any old bits in place: */ @@ -937,305 +1036,720 @@ int arch_set_user_pkey_access(struct task_struct *tsk, int pkey, } #endif /* ! CONFIG_ARCH_HAS_PKEYS */ -/* - * Weird legacy quirk: SSE and YMM states store information in the - * MXCSR and MXCSR_FLAGS fields of the FP area. That means if the FP - * area is marked as unused in the xfeatures header, we need to copy - * MXCSR and MXCSR_FLAGS if either SSE or YMM are in use. +static void copy_feature(bool from_xstate, struct membuf *to, void *xstate, + void *init_xstate, unsigned int size) +{ + membuf_write(to, from_xstate ? xstate : init_xstate, size); +} + +/** + * __copy_xstate_to_uabi_buf - Copy kernel saved xstate to a UABI buffer + * @to: membuf descriptor + * @fpstate: The fpstate buffer from which to copy + * @pkru_val: The PKRU value to store in the PKRU component + * @copy_mode: The requested copy mode + * + * Converts from kernel XSAVE or XSAVES compacted format to UABI conforming + * format, i.e. from the kernel internal hardware dependent storage format + * to the requested @mode. UABI XSTATE is always uncompacted! + * + * It supports partial copy but @to.pos always starts from zero. 
*/ -static inline bool xfeatures_mxcsr_quirk(u64 xfeatures) +void __copy_xstate_to_uabi_buf(struct membuf to, struct fpstate *fpstate, + u32 pkru_val, enum xstate_copy_mode copy_mode) { - if (!(xfeatures & (XFEATURE_MASK_SSE|XFEATURE_MASK_YMM))) - return false; + const unsigned int off_mxcsr = offsetof(struct fxregs_state, mxcsr); + struct xregs_state *xinit = &init_fpstate.regs.xsave; + struct xregs_state *xsave = &fpstate->regs.xsave; + struct xstate_header header; + unsigned int zerofrom; + u64 mask; + int i; - if (xfeatures & XFEATURE_MASK_FP) - return false; + memset(&header, 0, sizeof(header)); + header.xfeatures = xsave->header.xfeatures; - return true; + /* Mask out the feature bits depending on copy mode */ + switch (copy_mode) { + case XSTATE_COPY_FP: + header.xfeatures &= XFEATURE_MASK_FP; + break; + + case XSTATE_COPY_FX: + header.xfeatures &= XFEATURE_MASK_FP | XFEATURE_MASK_SSE; + break; + + case XSTATE_COPY_XSAVE: + header.xfeatures &= fpstate->user_xfeatures; + break; + } + + /* Copy FP state up to MXCSR */ + copy_feature(header.xfeatures & XFEATURE_MASK_FP, &to, &xsave->i387, + &xinit->i387, off_mxcsr); + + /* Copy MXCSR when SSE or YMM are set in the feature mask */ + copy_feature(header.xfeatures & (XFEATURE_MASK_SSE | XFEATURE_MASK_YMM), + &to, &xsave->i387.mxcsr, &xinit->i387.mxcsr, + MXCSR_AND_FLAGS_SIZE); + + /* Copy the remaining FP state */ + copy_feature(header.xfeatures & XFEATURE_MASK_FP, + &to, &xsave->i387.st_space, &xinit->i387.st_space, + sizeof(xsave->i387.st_space)); + + /* Copy the SSE state - shared with YMM, but independently managed */ + copy_feature(header.xfeatures & XFEATURE_MASK_SSE, + &to, &xsave->i387.xmm_space, &xinit->i387.xmm_space, + sizeof(xsave->i387.xmm_space)); + + if (copy_mode != XSTATE_COPY_XSAVE) + goto out; + + /* Zero the padding area */ + membuf_zero(&to, sizeof(xsave->i387.padding)); + + /* Copy xsave->i387.sw_reserved */ + membuf_write(&to, xstate_fx_sw_bytes, sizeof(xsave->i387.sw_reserved)); + + /* Copy 
the user space relevant state of @xsave->header */ + membuf_write(&to, &header, sizeof(header)); + + zerofrom = offsetof(struct xregs_state, extended_state_area); + + /* + * The ptrace buffer is in non-compacted XSAVE format. In + * non-compacted format disabled features still occupy state space, + * but there is no state to copy from in the compacted + * init_fpstate. The gap tracking will zero these states. + */ + mask = fpstate->user_xfeatures; + + /* + * Dynamic features are not present in init_fpstate. When they are + * in an all zeros init state, remove those from 'mask' to zero + * those features in the user buffer instead of retrieving them + * from init_fpstate. + */ + if (fpu_state_size_dynamic()) + mask &= (header.xfeatures | xinit->header.xcomp_bv); + + for_each_extended_xfeature(i, mask) { + /* + * If there was a feature or alignment gap, zero the space + * in the destination buffer. + */ + if (zerofrom < xstate_offsets[i]) + membuf_zero(&to, xstate_offsets[i] - zerofrom); + + if (i == XFEATURE_PKRU) { + struct pkru_state pkru = {0}; + /* + * PKRU is not necessarily up to date in the + * XSAVE buffer. Use the provided value. + */ + pkru.pkru = pkru_val; + membuf_write(&to, &pkru, sizeof(pkru)); + } else { + copy_feature(header.xfeatures & BIT_ULL(i), &to, + __raw_xsave_addr(xsave, i), + __raw_xsave_addr(xinit, i), + xstate_sizes[i]); + } + /* + * Keep track of the last copied state in the non-compacted + * target buffer for gap zeroing. + */ + zerofrom = xstate_offsets[i] + xstate_sizes[i]; + } + +out: + if (to.left) + membuf_zero(&to, to.left); } -/* - * This is similar to user_regset_copyout(), but will not add offset to - * the source data pointer or increment pos, count, kbuf, and ubuf. 
+/** + * copy_xstate_to_uabi_buf - Copy kernel saved xstate to a UABI buffer + * @to: membuf descriptor + * @tsk: The task from which to copy the saved xstate + * @copy_mode: The requested copy mode + * + * Converts from kernel XSAVE or XSAVES compacted format to UABI conforming + * format, i.e. from the kernel internal hardware dependent storage format + * to the requested @mode. UABI XSTATE is always uncompacted! + * + * It supports partial copy but @to.pos always starts from zero. */ -static inline void -__copy_xstate_to_kernel(void *kbuf, const void *data, - unsigned int offset, unsigned int size, unsigned int size_total) +void copy_xstate_to_uabi_buf(struct membuf to, struct task_struct *tsk, + enum xstate_copy_mode copy_mode) { - if (offset < size_total) { - unsigned int copy = min(size, size_total - offset); + __copy_xstate_to_uabi_buf(to, tsk->thread.fpu.fpstate, + tsk->thread.pkru, copy_mode); +} - memcpy(kbuf + offset, data, copy); +static int copy_from_buffer(void *dst, unsigned int offset, unsigned int size, + const void *kbuf, const void __user *ubuf) +{ + if (kbuf) { + memcpy(dst, kbuf + offset, size); + } else { + if (copy_from_user(dst, ubuf + offset, size)) + return -EFAULT; } + return 0; } -/* - * Convert from kernel XSAVES compacted format to standard format and copy - * to a kernel-space ptrace buffer. - * - * It supports partial copy but pos always starts from zero. This is called - * from xstateregs_get() and there we check the CPU has XSAVES. 
- */ -int copy_xstate_to_kernel(void *kbuf, struct xregs_state *xsave, unsigned int offset_start, unsigned int size_total) + +static int copy_uabi_to_xstate(struct fpstate *fpstate, const void *kbuf, + const void __user *ubuf) { + struct xregs_state *xsave = &fpstate->regs.xsave; unsigned int offset, size; - struct xstate_header header; + struct xstate_header hdr; + u64 mask; int i; - /* - * Currently copy_regset_to_user() starts from pos 0: - */ - if (unlikely(offset_start != 0)) + offset = offsetof(struct xregs_state, header); + if (copy_from_buffer(&hdr, offset, sizeof(hdr), kbuf, ubuf)) return -EFAULT; - /* - * The destination is a ptrace buffer; we put in only user xstates: - */ - memset(&header, 0, sizeof(header)); - header.xfeatures = xsave->header.xfeatures; - header.xfeatures &= ~XFEATURE_MASK_SUPERVISOR; + if (validate_user_xstate_header(&hdr, fpstate)) + return -EINVAL; - /* - * Copy xregs_state->header: - */ - offset = offsetof(struct xregs_state, header); - size = sizeof(header); + /* Validate MXCSR when any of the related features is in use */ + mask = XFEATURE_MASK_FP | XFEATURE_MASK_SSE | XFEATURE_MASK_YMM; + if (hdr.xfeatures & mask) { + u32 mxcsr[2]; - __copy_xstate_to_kernel(kbuf, &header, offset, size, size_total); + offset = offsetof(struct fxregs_state, mxcsr); + if (copy_from_buffer(mxcsr, offset, sizeof(mxcsr), kbuf, ubuf)) + return -EFAULT; + + /* Reserved bits in MXCSR must be zero. */ + if (mxcsr[0] & ~mxcsr_feature_mask) + return -EINVAL; + + /* SSE and YMM require MXCSR even when FP is not in use. 
*/ + if (!(hdr.xfeatures & XFEATURE_MASK_FP)) { + xsave->i387.mxcsr = mxcsr[0]; + xsave->i387.mxcsr_mask = mxcsr[1]; + } + } for (i = 0; i < XFEATURE_MAX; i++) { - /* - * Copy only in-use xstates: - */ - if ((header.xfeatures >> i) & 1) { - void *src = __raw_xsave_addr(xsave, i); + mask = BIT_ULL(i); + + if (hdr.xfeatures & mask) { + void *dst = __raw_xsave_addr(xsave, i); offset = xstate_offsets[i]; size = xstate_sizes[i]; - /* The next component has to fit fully into the output buffer: */ - if (offset + size > size_total) - break; - - __copy_xstate_to_kernel(kbuf, src, offset, size, size_total); + if (copy_from_buffer(dst, offset, size, kbuf, ubuf)) + return -EFAULT; } - - } - - if (xfeatures_mxcsr_quirk(header.xfeatures)) { - offset = offsetof(struct fxregs_state, mxcsr); - size = MXCSR_AND_FLAGS_SIZE; - __copy_xstate_to_kernel(kbuf, &xsave->i387.mxcsr, offset, size, size_total); } /* - * Fill xsave->i387.sw_reserved value for ptrace frame: + * The state that came in from userspace was user-state only. + * Mask all the user states out of 'xfeatures': */ - offset = offsetof(struct fxregs_state, sw_reserved); - size = sizeof(xstate_fx_sw_bytes); + xsave->header.xfeatures &= XFEATURE_MASK_SUPERVISOR_ALL; - __copy_xstate_to_kernel(kbuf, xstate_fx_sw_bytes, offset, size, size_total); + /* + * Add back in the features that came in from userspace: + */ + xsave->header.xfeatures |= hdr.xfeatures; return 0; } -static inline int -__copy_xstate_to_user(void __user *ubuf, const void *data, unsigned int offset, unsigned int size, unsigned int size_total) +/* + * Convert from a ptrace standard-format kernel buffer to kernel XSAVE[S] + * format and copy to the target thread. Used by ptrace and KVM. 
+ */ +int copy_uabi_from_kernel_to_xstate(struct fpstate *fpstate, const void *kbuf) { - if (!size) - return 0; + return copy_uabi_to_xstate(fpstate, kbuf, NULL); +} - if (offset < size_total) { - unsigned int copy = min(size, size_total - offset); +/* + * Convert from a sigreturn standard-format user-space buffer to kernel + * XSAVE[S] format and copy to the target thread. This is called from the + * sigreturn() and rt_sigreturn() system calls. + */ +int copy_sigframe_from_user_to_xstate(struct fpstate *fpstate, + const void __user *ubuf) +{ + return copy_uabi_to_xstate(fpstate, NULL, ubuf); +} - if (__copy_to_user(ubuf + offset, data, copy)) - return -EFAULT; - } - return 0; +static bool validate_independent_components(u64 mask) +{ + u64 xchk; + + if (WARN_ON_FPU(!cpu_feature_enabled(X86_FEATURE_XSAVES))) + return false; + + xchk = ~xfeatures_mask_independent(); + + if (WARN_ON_ONCE(!mask || mask & xchk)) + return false; + + return true; } +/** + * xsaves - Save selected components to a kernel xstate buffer + * @xstate: Pointer to the buffer + * @mask: Feature mask to select the components to save + * + * The @xstate buffer must be 64 byte aligned and correctly initialized as + * XSAVES does not write the full xstate header. Before first use the + * buffer should be zeroed otherwise a consecutive XRSTORS from that buffer + * can #GP. + * + * The feature mask must be a subset of the independent features. + */ +void xsaves(struct xregs_state *xstate, u64 mask) +{ + int err; + + if (!validate_independent_components(mask)) + return; + + XSTATE_OP(XSAVES, xstate, (u32)mask, (u32)(mask >> 32), err); + WARN_ON_ONCE(err); +} + +/** + * xrstors - Restore selected components from a kernel xstate buffer + * @xstate: Pointer to the buffer + * @mask: Feature mask to select the components to restore + * + * The @xstate buffer must be 64 byte aligned and correctly initialized + * otherwise XRSTORS from that buffer can #GP. 
+ * + * Proper usage is to restore the state which was saved with + * xsaves() into @xstate. + * + * The feature mask must be a subset of the independent features. + */ +void xrstors(struct xregs_state *xstate, u64 mask) +{ + int err; + + if (!validate_independent_components(mask)) + return; + + XSTATE_OP(XRSTORS, xstate, (u32)mask, (u32)(mask >> 32), err); + WARN_ON_ONCE(err); +} + +#if IS_ENABLED(CONFIG_KVM) +void fpstate_clear_xstate_component(struct fpstate *fps, unsigned int xfeature) +{ + void *addr = get_xsave_addr(&fps->regs.xsave, xfeature); + + if (addr) + memset(addr, 0, xstate_sizes[xfeature]); +} +EXPORT_SYMBOL_GPL(fpstate_clear_xstate_component); +#endif + +#ifdef CONFIG_X86_64 + +#ifdef CONFIG_X86_DEBUG_FPU /* - * Convert from kernel XSAVES compacted format to standard format and copy - * to a user-space buffer. It supports partial copy but pos always starts from - * zero. This is called from xstateregs_get() and there we check the CPU - * has XSAVES. + * Ensure that a subsequent XSAVE* or XRSTOR* instruction with RFBM=@mask + * can safely operate on the @fpstate buffer. */ -int copy_xstate_to_user(void __user *ubuf, struct xregs_state *xsave, unsigned int offset_start, unsigned int size_total) +static bool xstate_op_valid(struct fpstate *fpstate, u64 mask, bool rstor) { - unsigned int offset, size; - int ret, i; - struct xstate_header header; + u64 xfd = __this_cpu_read(xfd_state); + + if (fpstate->xfd == xfd) + return true; + + /* + * The XFD MSR does not match fpstate->xfd. That's invalid when + * the passed in fpstate is current's fpstate. + */ + if (fpstate->xfd == current->thread.fpu.fpstate->xfd) + return false; /* - * Currently copy_regset_to_user() starts from pos 0: + * XRSTOR(S) from init_fpstate are always correct as it will just + * bring all components into init state and not read from the + * buffer. XSAVE(S) raises #PF after init. 
*/ - if (unlikely(offset_start != 0)) - return -EFAULT; + if (fpstate == &init_fpstate) + return rstor; /* - * The destination is a ptrace buffer; we put in only user xstates: + * XSAVE(S): clone(), fpu_swap_kvm_fpu() + * XRSTORS(S): fpu_swap_kvm_fpu() */ - memset(&header, 0, sizeof(header)); - header.xfeatures = xsave->header.xfeatures; - header.xfeatures &= ~XFEATURE_MASK_SUPERVISOR; /* - * Copy xregs_state->header: + * No XSAVE/XRSTOR instructions (except XSAVE itself) touch + * the buffer area for XFD-disabled state components. */ - offset = offsetof(struct xregs_state, header); - size = sizeof(header); + mask &= ~xfd; - ret = __copy_xstate_to_user(ubuf, &header, offset, size, size_total); - if (ret) - return ret; + /* + * Remove features which are valid in fpstate. They + * have space allocated in fpstate. + */ + mask &= ~fpstate->xfeatures; - for (i = 0; i < XFEATURE_MAX; i++) { - /* - * Copy only in-use xstates: - */ - if ((header.xfeatures >> i) & 1) { - void *src = __raw_xsave_addr(xsave, i); + /* + * Any remaining state components in 'mask' might be written + * by XSAVE/XRSTOR. Fail validation it found. + */ + return !mask; +} - offset = xstate_offsets[i]; - size = xstate_sizes[i]; +void xfd_validate_state(struct fpstate *fpstate, u64 mask, bool rstor) +{ + WARN_ON_ONCE(!xstate_op_valid(fpstate, mask, rstor)); +} +#endif /* CONFIG_X86_DEBUG_FPU */ - /* The next component has to fit fully into the output buffer: */ - if (offset + size > size_total) - break; +static int __init xfd_update_static_branch(void) +{ + /* + * If init_fpstate.xfd has bits set then dynamic features are + * available and the dynamic sizing must be enabled. 
+ */ + if (init_fpstate.xfd) + static_branch_enable(&__fpu_state_size_dynamic); + return 0; +} +arch_initcall(xfd_update_static_branch) - ret = __copy_xstate_to_user(ubuf, src, offset, size, size_total); - if (ret) - return ret; - } +void fpstate_free(struct fpu *fpu) +{ + if (fpu->fpstate && fpu->fpstate != &fpu->__fpstate) + vfree(fpu->fpstate); +} - } +/** + * fpstate_realloc - Reallocate struct fpstate for the requested new features + * + * @xfeatures: A bitmap of xstate features which extend the enabled features + * of that task + * @ksize: The required size for the kernel buffer + * @usize: The required size for user space buffers + * @guest_fpu: Pointer to a guest FPU container. NULL for host allocations + * + * Note vs. vmalloc(): If the task with a vzalloc()-allocated buffer + * terminates quickly, vfree()-induced IPIs may be a concern, but tasks + * with large states are likely to live longer. + * + * Returns: 0 on success, -ENOMEM on allocation error. + */ +static int fpstate_realloc(u64 xfeatures, unsigned int ksize, + unsigned int usize, struct fpu_guest *guest_fpu) +{ + struct fpu *fpu = ¤t->thread.fpu; + struct fpstate *curfps, *newfps = NULL; + unsigned int fpsize; + bool in_use; - if (xfeatures_mxcsr_quirk(header.xfeatures)) { - offset = offsetof(struct fxregs_state, mxcsr); - size = MXCSR_AND_FLAGS_SIZE; - __copy_xstate_to_user(ubuf, &xsave->i387.mxcsr, offset, size, size_total); + fpsize = ksize + ALIGN(offsetof(struct fpstate, regs), 64); + + newfps = vzalloc(fpsize); + if (!newfps) + return -ENOMEM; + newfps->size = ksize; + newfps->user_size = usize; + newfps->is_valloc = true; + + /* + * When a guest FPU is supplied, use @guest_fpu->fpstate + * as reference independent whether it is in use or not. + */ + curfps = guest_fpu ? 
guest_fpu->fpstate : fpu->fpstate; + + /* Determine whether @curfps is the active fpstate */ + in_use = fpu->fpstate == curfps; + + if (guest_fpu) { + newfps->is_guest = true; + newfps->is_confidential = curfps->is_confidential; + newfps->in_use = curfps->in_use; + guest_fpu->xfeatures |= xfeatures; + guest_fpu->uabi_size = usize; } + fpregs_lock(); /* - * Fill xsave->i387.sw_reserved value for ptrace frame: + * If @curfps is in use, ensure that the current state is in the + * registers before swapping fpstate as that might invalidate it + * due to layout changes. */ - offset = offsetof(struct fxregs_state, sw_reserved); - size = sizeof(xstate_fx_sw_bytes); + if (in_use && test_thread_flag(TIF_NEED_FPU_LOAD)) + fpregs_restore_userregs(); - ret = __copy_xstate_to_user(ubuf, xstate_fx_sw_bytes, offset, size, size_total); - if (ret) - return ret; + newfps->xfeatures = curfps->xfeatures | xfeatures; - return 0; -} + if (!guest_fpu) + newfps->user_xfeatures = curfps->user_xfeatures | xfeatures; -/* - * Convert from a ptrace standard-format kernel buffer to kernel XSAVES format - * and copy to the target thread. This is called from xstateregs_set(). 
- */ -int copy_kernel_to_xstate(struct xregs_state *xsave, const void *kbuf) -{ - unsigned int offset, size; - int i; - struct xstate_header hdr; + newfps->xfd = curfps->xfd & ~xfeatures; - offset = offsetof(struct xregs_state, header); - size = sizeof(hdr); + /* Do the final updates within the locked region */ + xstate_init_xcomp_bv(&newfps->regs.xsave, newfps->xfeatures); - memcpy(&hdr, kbuf + offset, size); + if (guest_fpu) { + guest_fpu->fpstate = newfps; + /* If curfps is active, update the FPU fpstate pointer */ + if (in_use) + fpu->fpstate = newfps; + } else { + fpu->fpstate = newfps; + } - if (validate_xstate_header(&hdr)) - return -EINVAL; + if (in_use) + xfd_update_state(fpu->fpstate); + fpregs_unlock(); - for (i = 0; i < XFEATURE_MAX; i++) { - u64 mask = ((u64)1 << i); + /* Only free valloc'ed state */ + if (curfps && curfps->is_valloc) + vfree(curfps); - if (hdr.xfeatures & mask) { - void *dst = __raw_xsave_addr(xsave, i); + return 0; +} - offset = xstate_offsets[i]; - size = xstate_sizes[i]; +static int validate_sigaltstack(unsigned int usize) +{ + struct task_struct *thread, *leader = current->group_leader; + unsigned long framesize = get_sigframe_size(); - memcpy(dst, kbuf + offset, size); - } - } + lockdep_assert_held(¤t->sighand->siglock); - if (xfeatures_mxcsr_quirk(hdr.xfeatures)) { - offset = offsetof(struct fxregs_state, mxcsr); - size = MXCSR_AND_FLAGS_SIZE; - memcpy(&xsave->i387.mxcsr, kbuf + offset, size); + /* get_sigframe_size() is based on fpu_user_cfg.max_size */ + framesize -= fpu_user_cfg.max_size; + framesize += usize; + for_each_thread(leader, thread) { + if (thread->sas_ss_size && thread->sas_ss_size < framesize) + return -ENOSPC; } + return 0; +} +static int __xstate_request_perm(u64 permitted, u64 requested, bool guest) +{ /* - * The state that came in from userspace was user-state only. 
- * Mask all the user states out of 'xfeatures': + * This deliberately does not exclude !XSAVES as we still might + * decide to optionally context switch XCR0 or talk the silicon + * vendors into extending XFD for the pre AMX states, especially + * AVX512. */ - xsave->header.xfeatures &= XFEATURE_MASK_SUPERVISOR; + bool compacted = cpu_feature_enabled(X86_FEATURE_XCOMPACTED); + struct fpu *fpu = ¤t->group_leader->thread.fpu; + struct fpu_state_perm *perm; + unsigned int ksize, usize; + u64 mask; + int ret = 0; + + /* Check whether fully enabled */ + if ((permitted & requested) == requested) + return 0; - /* - * Add back in the features that came in from userspace: - */ - xsave->header.xfeatures |= hdr.xfeatures; + /* Calculate the resulting kernel state size */ + mask = permitted | requested; + /* Take supervisor states into account on the host */ + if (!guest) + mask |= xfeatures_mask_supervisor(); + ksize = xstate_calculate_size(mask, compacted); + + /* Calculate the resulting user state size */ + mask &= XFEATURE_MASK_USER_SUPPORTED; + usize = xstate_calculate_size(mask, false); + + if (!guest) { + ret = validate_sigaltstack(usize); + if (ret) + return ret; + } - return 0; + perm = guest ? &fpu->guest_perm : &fpu->perm; + /* Pairs with the READ_ONCE() in xstate_get_group_perm() */ + WRITE_ONCE(perm->__state_perm, mask); + /* Protected by sighand lock */ + perm->__state_size = ksize; + perm->__user_state_size = usize; + return ret; } /* - * Convert from a ptrace or sigreturn standard-format user-space buffer to - * kernel XSAVES format and copy to the target thread. This is called from - * xstateregs_set(), as well as potentially from the sigreturn() and - * rt_sigreturn() system calls. 
+ * Permissions array to map facilities with more than one component */ -int copy_user_to_xstate(struct xregs_state *xsave, const void __user *ubuf) +static const u64 xstate_prctl_req[XFEATURE_MAX] = { + [XFEATURE_XTILE_DATA] = XFEATURE_MASK_XTILE_DATA, +}; + +static int xstate_request_perm(unsigned long idx, bool guest) { - unsigned int offset, size; - int i; - struct xstate_header hdr; + u64 permitted, requested; + int ret; - offset = offsetof(struct xregs_state, header); - size = sizeof(hdr); + if (idx >= XFEATURE_MAX) + return -EINVAL; - if (__copy_from_user(&hdr, ubuf + offset, size)) - return -EFAULT; + /* + * Look up the facility mask which can require more than + * one xstate component. + */ + idx = array_index_nospec(idx, ARRAY_SIZE(xstate_prctl_req)); + requested = xstate_prctl_req[idx]; + if (!requested) + return -EOPNOTSUPP; - if (validate_xstate_header(&hdr)) - return -EINVAL; + if ((fpu_user_cfg.max_features & requested) != requested) + return -EOPNOTSUPP; - for (i = 0; i < XFEATURE_MAX; i++) { - u64 mask = ((u64)1 << i); + /* Lockless quick check */ + permitted = xstate_get_group_perm(guest); + if ((permitted & requested) == requested) + return 0; - if (hdr.xfeatures & mask) { - void *dst = __raw_xsave_addr(xsave, i); + /* Protect against concurrent modifications */ + spin_lock_irq(¤t->sighand->siglock); + permitted = xstate_get_group_perm(guest); - offset = xstate_offsets[i]; - size = xstate_sizes[i]; + /* First vCPU allocation locks the permissions. 
*/ + if (guest && (permitted & FPU_GUEST_PERM_LOCKED)) + ret = -EBUSY; + else + ret = __xstate_request_perm(permitted, requested, guest); + spin_unlock_irq(¤t->sighand->siglock); + return ret; +} - if (__copy_from_user(dst, ubuf + offset, size)) - return -EFAULT; - } +int __xfd_enable_feature(u64 xfd_err, struct fpu_guest *guest_fpu) +{ + u64 xfd_event = xfd_err & XFEATURE_MASK_USER_DYNAMIC; + struct fpu_state_perm *perm; + unsigned int ksize, usize; + struct fpu *fpu; + + if (!xfd_event) { + if (!guest_fpu) + pr_err_once("XFD: Invalid xfd error: %016llx\n", xfd_err); + return 0; } - if (xfeatures_mxcsr_quirk(hdr.xfeatures)) { - offset = offsetof(struct fxregs_state, mxcsr); - size = MXCSR_AND_FLAGS_SIZE; - if (__copy_from_user(&xsave->i387.mxcsr, ubuf + offset, size)) - return -EFAULT; + /* Protect against concurrent modifications */ + spin_lock_irq(¤t->sighand->siglock); + + /* If not permitted let it die */ + if ((xstate_get_group_perm(!!guest_fpu) & xfd_event) != xfd_event) { + spin_unlock_irq(¤t->sighand->siglock); + return -EPERM; } + fpu = ¤t->group_leader->thread.fpu; + perm = guest_fpu ? &fpu->guest_perm : &fpu->perm; + ksize = perm->__state_size; + usize = perm->__user_state_size; + /* - * The state that came in from userspace was user-state only. - * Mask all the user states out of 'xfeatures': + * The feature is permitted. State size is sufficient. Dropping + * the lock is safe here even if more features are added from + * another task, the retrieved buffer sizes are valid for the + * currently requested feature(s). */ - xsave->header.xfeatures &= XFEATURE_MASK_SUPERVISOR; + spin_unlock_irq(¤t->sighand->siglock); /* - * Add back in the features that came in from userspace: + * Try to allocate a new fpstate. If that fails there is no way + * out. 
*/ - xsave->header.xfeatures |= hdr.xfeatures; - + if (fpstate_realloc(xfd_event, ksize, usize, guest_fpu)) + return -EFAULT; return 0; } +int xfd_enable_feature(u64 xfd_err) +{ + return __xfd_enable_feature(xfd_err, NULL); +} + +#else /* CONFIG_X86_64 */ +static inline int xstate_request_perm(unsigned long idx, bool guest) +{ + return -EPERM; +} +#endif /* !CONFIG_X86_64 */ + +u64 xstate_get_guest_group_perm(void) +{ + return xstate_get_group_perm(true); +} +EXPORT_SYMBOL_GPL(xstate_get_guest_group_perm); + +/** + * fpu_xstate_prctl - xstate permission operations + * @tsk: Redundant pointer to current + * @option: A subfunction of arch_prctl() + * @arg2: option argument + * Return: 0 if successful; otherwise, an error code + * + * Option arguments: + * + * ARCH_GET_XCOMP_SUPP: Pointer to user space u64 to store the info + * ARCH_GET_XCOMP_PERM: Pointer to user space u64 to store the info + * ARCH_REQ_XCOMP_PERM: Facility number requested + * + * For facilities which require more than one XSTATE component, the request + * must be the highest state component number related to that facility, + * e.g. for AMX which requires XFEATURE_XTILE_CFG(17) and + * XFEATURE_XTILE_DATA(18) this would be XFEATURE_XTILE_DATA(18). + */ +long fpu_xstate_prctl(int option, unsigned long arg2) +{ + u64 __user *uptr = (u64 __user *)arg2; + u64 permitted, supported; + unsigned long idx = arg2; + bool guest = false; + + switch (option) { + case ARCH_GET_XCOMP_SUPP: + supported = fpu_user_cfg.max_features | fpu_user_cfg.legacy_features; + return put_user(supported, uptr); + + case ARCH_GET_XCOMP_PERM: + /* + * Lockless snapshot as it can also change right after the + * dropping the lock. 
+ */ + permitted = xstate_get_host_group_perm(); + permitted &= XFEATURE_MASK_USER_SUPPORTED; + return put_user(permitted, uptr); + + case ARCH_GET_XCOMP_GUEST_PERM: + permitted = xstate_get_guest_group_perm(); + permitted &= XFEATURE_MASK_USER_SUPPORTED; + return put_user(permitted, uptr); + + case ARCH_REQ_XCOMP_GUEST_PERM: + guest = true; + fallthrough; + + case ARCH_REQ_XCOMP_PERM: + if (!IS_ENABLED(CONFIG_X86_64)) + return -EOPNOTSUPP; + + return xstate_request_perm(idx, guest); + + default: + return -EINVAL; + } +} + #ifdef CONFIG_PROC_PID_ARCH_STATUS /* * Report the amount of time elapsed in millisecond since last AVX512 diff --git a/arch/x86/kernel/fpu/xstate.h b/arch/x86/kernel/fpu/xstate.h new file mode 100644 index 000000000000..5ad47031383b --- /dev/null +++ b/arch/x86/kernel/fpu/xstate.h @@ -0,0 +1,326 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef __X86_KERNEL_FPU_XSTATE_H +#define __X86_KERNEL_FPU_XSTATE_H + +#include <asm/cpufeature.h> +#include <asm/fpu/xstate.h> +#include <asm/fpu/xcr.h> + +#ifdef CONFIG_X86_64 +DECLARE_PER_CPU(u64, xfd_state); +#endif + +static inline void xstate_init_xcomp_bv(struct xregs_state *xsave, u64 mask) +{ + /* + * XRSTORS requires these bits set in xcomp_bv, or it will + * trigger #GP: + */ + if (cpu_feature_enabled(X86_FEATURE_XCOMPACTED)) + xsave->header.xcomp_bv = mask | XCOMP_BV_COMPACTED_FORMAT; +} + +static inline u64 xstate_get_group_perm(bool guest) +{ + struct fpu *fpu = ¤t->group_leader->thread.fpu; + struct fpu_state_perm *perm; + + /* Pairs with WRITE_ONCE() in xstate_request_perm() */ + perm = guest ? 
&fpu->guest_perm : &fpu->perm; + return READ_ONCE(perm->__state_perm); +} + +static inline u64 xstate_get_host_group_perm(void) +{ + return xstate_get_group_perm(false); +} + +enum xstate_copy_mode { + XSTATE_COPY_FP, + XSTATE_COPY_FX, + XSTATE_COPY_XSAVE, +}; + +struct membuf; +extern void __copy_xstate_to_uabi_buf(struct membuf to, struct fpstate *fpstate, + u32 pkru_val, enum xstate_copy_mode copy_mode); +extern void copy_xstate_to_uabi_buf(struct membuf to, struct task_struct *tsk, + enum xstate_copy_mode mode); +extern int copy_uabi_from_kernel_to_xstate(struct fpstate *fpstate, const void *kbuf); +extern int copy_sigframe_from_user_to_xstate(struct fpstate *fpstate, const void __user *ubuf); + + +extern void fpu__init_cpu_xstate(void); +extern void fpu__init_system_xstate(unsigned int legacy_size); + +extern void *get_xsave_addr(struct xregs_state *xsave, int xfeature_nr); + +static inline u64 xfeatures_mask_supervisor(void) +{ + return fpu_kernel_cfg.max_features & XFEATURE_MASK_SUPERVISOR_SUPPORTED; +} + +static inline u64 xfeatures_mask_independent(void) +{ + if (!cpu_feature_enabled(X86_FEATURE_ARCH_LBR)) + return XFEATURE_MASK_INDEPENDENT & ~XFEATURE_MASK_LBR; + + return XFEATURE_MASK_INDEPENDENT; +} + +/* XSAVE/XRSTOR wrapper functions */ + +#ifdef CONFIG_X86_64 +#define REX_PREFIX "0x48, " +#else +#define REX_PREFIX +#endif + +/* These macros all use (%edi)/(%rdi) as the single memory argument. */ +#define XSAVE ".byte " REX_PREFIX "0x0f,0xae,0x27" +#define XSAVEOPT ".byte " REX_PREFIX "0x0f,0xae,0x37" +#define XSAVEC ".byte " REX_PREFIX "0x0f,0xc7,0x27" +#define XSAVES ".byte " REX_PREFIX "0x0f,0xc7,0x2f" +#define XRSTOR ".byte " REX_PREFIX "0x0f,0xae,0x2f" +#define XRSTORS ".byte " REX_PREFIX "0x0f,0xc7,0x1f" + +/* + * After this @err contains 0 on success or the trap number when the + * operation raises an exception. 
+ */ +#define XSTATE_OP(op, st, lmask, hmask, err) \ + asm volatile("1:" op "\n\t" \ + "xor %[err], %[err]\n" \ + "2:\n\t" \ + _ASM_EXTABLE_TYPE(1b, 2b, EX_TYPE_FAULT_MCE_SAFE) \ + : [err] "=a" (err) \ + : "D" (st), "m" (*st), "a" (lmask), "d" (hmask) \ + : "memory") + +/* + * If XSAVES is enabled, it replaces XSAVEC because it supports supervisor + * states in addition to XSAVEC. + * + * Otherwise if XSAVEC is enabled, it replaces XSAVEOPT because it supports + * compacted storage format in addition to XSAVEOPT. + * + * Otherwise, if XSAVEOPT is enabled, XSAVEOPT replaces XSAVE because XSAVEOPT + * supports modified optimization which is not supported by XSAVE. + * + * We use XSAVE as a fallback. + * + * The 661 label is defined in the ALTERNATIVE* macros as the address of the + * original instruction which gets replaced. We need to use it here as the + * address of the instruction where we might get an exception at. + */ +#define XSTATE_XSAVE(st, lmask, hmask, err) \ + asm volatile(ALTERNATIVE_3(XSAVE, \ + XSAVEOPT, X86_FEATURE_XSAVEOPT, \ + XSAVEC, X86_FEATURE_XSAVEC, \ + XSAVES, X86_FEATURE_XSAVES) \ + "\n" \ + "xor %[err], %[err]\n" \ + "3:\n" \ + _ASM_EXTABLE_TYPE_REG(661b, 3b, EX_TYPE_EFAULT_REG, %[err]) \ + : [err] "=r" (err) \ + : "D" (st), "m" (*st), "a" (lmask), "d" (hmask) \ + : "memory") + +/* + * Use XRSTORS to restore context if it is enabled. XRSTORS supports compact + * XSAVE area format. 
+ */ +#define XSTATE_XRESTORE(st, lmask, hmask) \ + asm volatile(ALTERNATIVE(XRSTOR, \ + XRSTORS, X86_FEATURE_XSAVES) \ + "\n" \ + "3:\n" \ + _ASM_EXTABLE_TYPE(661b, 3b, EX_TYPE_FPU_RESTORE) \ + : \ + : "D" (st), "m" (*st), "a" (lmask), "d" (hmask) \ + : "memory") + +#if defined(CONFIG_X86_64) && defined(CONFIG_X86_DEBUG_FPU) +extern void xfd_validate_state(struct fpstate *fpstate, u64 mask, bool rstor); +#else +static inline void xfd_validate_state(struct fpstate *fpstate, u64 mask, bool rstor) { } +#endif + +#ifdef CONFIG_X86_64 +static inline void xfd_update_state(struct fpstate *fpstate) +{ + if (fpu_state_size_dynamic()) { + u64 xfd = fpstate->xfd; + + if (__this_cpu_read(xfd_state) != xfd) { + wrmsrl(MSR_IA32_XFD, xfd); + __this_cpu_write(xfd_state, xfd); + } + } +} + +extern int __xfd_enable_feature(u64 which, struct fpu_guest *guest_fpu); +#else +static inline void xfd_update_state(struct fpstate *fpstate) { } + +static inline int __xfd_enable_feature(u64 which, struct fpu_guest *guest_fpu) { + return -EPERM; +} +#endif + +/* + * Save processor xstate to xsave area. + * + * Uses either XSAVE or XSAVEOPT or XSAVES depending on the CPU features + * and command line options. The choice is permanent until the next reboot. + */ +static inline void os_xsave(struct fpstate *fpstate) +{ + u64 mask = fpstate->xfeatures; + u32 lmask = mask; + u32 hmask = mask >> 32; + int err; + + WARN_ON_FPU(!alternatives_patched); + xfd_validate_state(fpstate, mask, false); + + XSTATE_XSAVE(&fpstate->regs.xsave, lmask, hmask, err); + + /* We should never fault when copying to a kernel buffer: */ + WARN_ON_FPU(err); +} + +/* + * Restore processor xstate from xsave area. + * + * Uses XRSTORS when XSAVES is used, XRSTOR otherwise. + */ +static inline void os_xrstor(struct fpstate *fpstate, u64 mask) +{ + u32 lmask = mask; + u32 hmask = mask >> 32; + + xfd_validate_state(fpstate, mask, true); + XSTATE_XRESTORE(&fpstate->regs.xsave, lmask, hmask); +} + +/* Restore of supervisor state. 
Does not require XFD */ +static inline void os_xrstor_supervisor(struct fpstate *fpstate) +{ + u64 mask = xfeatures_mask_supervisor(); + u32 lmask = mask; + u32 hmask = mask >> 32; + + XSTATE_XRESTORE(&fpstate->regs.xsave, lmask, hmask); +} + +/* + * XSAVE itself always writes all requested xfeatures. Removing features + * from the request bitmap reduces the features which are written. + * Generate a mask of features which must be written to a sigframe. The + * unset features can be optimized away and not written. + * + * This optimization is user-visible. Only use for states where + * uninitialized sigframe contents are tolerable, like dynamic features. + * + * Users of buffers produced with this optimization must check XSTATE_BV + * to determine which features have been optimized out. + */ +static inline u64 xfeatures_need_sigframe_write(void) +{ + u64 xfeaures_to_write; + + /* In-use features must be written: */ + xfeaures_to_write = xfeatures_in_use(); + + /* Also write all non-optimizable sigframe features: */ + xfeaures_to_write |= XFEATURE_MASK_USER_SUPPORTED & + ~XFEATURE_MASK_SIGFRAME_INITOPT; + + return xfeaures_to_write; +} + +/* + * Save xstate to user space xsave area. + * + * We don't use modified optimization because xrstor/xrstors might track + * a different application. + * + * We don't use compacted format xsave area for backward compatibility for + * old applications which don't understand the compacted format of the + * xsave area. + * + * The caller has to zero buf::header before calling this because XSAVE* + * does not touch the reserved fields in the header. + */ +static inline int xsave_to_user_sigframe(struct xregs_state __user *buf) +{ + /* + * Include the features which are not xsaved/rstored by the kernel + * internally, e.g. PKRU. That's user space ABI and also required + * to allow the signal handler to modify PKRU. 
+ */ + struct fpstate *fpstate = current->thread.fpu.fpstate; + u64 mask = fpstate->user_xfeatures; + u32 lmask; + u32 hmask; + int err; + + /* Optimize away writing unnecessary xfeatures: */ + if (fpu_state_size_dynamic()) + mask &= xfeatures_need_sigframe_write(); + + lmask = mask; + hmask = mask >> 32; + xfd_validate_state(fpstate, mask, false); + + stac(); + XSTATE_OP(XSAVE, buf, lmask, hmask, err); + clac(); + + return err; +} + +/* + * Restore xstate from user space xsave area. + */ +static inline int xrstor_from_user_sigframe(struct xregs_state __user *buf, u64 mask) +{ + struct xregs_state *xstate = ((__force struct xregs_state *)buf); + u32 lmask = mask; + u32 hmask = mask >> 32; + int err; + + xfd_validate_state(current->thread.fpu.fpstate, mask, true); + + stac(); + XSTATE_OP(XRSTOR, xstate, lmask, hmask, err); + clac(); + + return err; +} + +/* + * Restore xstate from kernel space xsave area, return an error code instead of + * an exception. + */ +static inline int os_xrstor_safe(struct fpstate *fpstate, u64 mask) +{ + struct xregs_state *xstate = &fpstate->regs.xsave; + u32 lmask = mask; + u32 hmask = mask >> 32; + int err; + + /* Ensure that XFD is up to date */ + xfd_update_state(fpstate); + + if (cpu_feature_enabled(X86_FEATURE_XSAVES)) + XSTATE_OP(XRSTORS, xstate, lmask, hmask, err); + else + XSTATE_OP(XRSTOR, xstate, lmask, hmask, err); + + return err; +} + + +#endif diff --git a/arch/x86/kernel/ftrace.c b/arch/x86/kernel/ftrace.c index 37a0aeaf89e7..bd165004776d 100644 --- a/arch/x86/kernel/ftrace.c +++ b/arch/x86/kernel/ftrace.c @@ -37,7 +37,7 @@ static int ftrace_poke_late = 0; -int ftrace_arch_code_modify_prepare(void) +void ftrace_arch_code_modify_prepare(void) __acquires(&text_mutex) { /* @@ -47,10 +47,9 @@ int ftrace_arch_code_modify_prepare(void) */ mutex_lock(&text_mutex); ftrace_poke_late = 1; - return 0; } -int ftrace_arch_code_modify_post_process(void) +void ftrace_arch_code_modify_post_process(void) __releases(&text_mutex) { /* @@ 
-61,12 +60,11 @@ int ftrace_arch_code_modify_post_process(void) text_poke_finish(); ftrace_poke_late = 0; mutex_unlock(&text_mutex); - return 0; } static const char *ftrace_nop_replace(void) { - return ideal_nops[NOP_ATOMIC5]; + return x86_nops[5]; } static const char *ftrace_call_replace(unsigned long ip, unsigned long addr) @@ -86,13 +84,14 @@ static int ftrace_verify_code(unsigned long ip, const char *old_code) * sure what we read is what we expected it to be before modifying it. */ /* read the text we want to modify */ - if (probe_kernel_read(cur_code, (void *)ip, MCOUNT_INSN_SIZE)) { + if (copy_from_kernel_nofault(cur_code, (void *)ip, MCOUNT_INSN_SIZE)) { WARN_ON(1); return -EFAULT; } /* Make sure it is what we expect it to be */ if (memcmp(cur_code, old_code, MCOUNT_INSN_SIZE) != 0) { + ftrace_expected = old_code; WARN_ON(1); return -EINVAL; } @@ -252,11 +251,6 @@ void arch_ftrace_update_code(int command) ftrace_modify_all_code(command); } -int __init ftrace_dyn_arch_init(void) -{ - return 0; -} - /* Currently only x86_64 supports dynamic trampolines */ #ifdef CONFIG_X86_64 @@ -282,9 +276,11 @@ static inline void tramp_free(void *tramp) { } /* Defined as markers to the end of the ftrace default trampolines */ extern void ftrace_regs_caller_end(void); -extern void ftrace_epilogue(void); +extern void ftrace_regs_caller_ret(void); +extern void ftrace_caller_end(void); extern void ftrace_caller_op_ptr(void); extern void ftrace_regs_caller_op_ptr(void); +extern void ftrace_regs_caller_jmp(void); /* movq function_trace_op(%rip), %rdx */ /* 0x48 0x8b 0x15 <offset-to-ftrace_trace_op (4 bytes)> */ @@ -306,7 +302,7 @@ union ftrace_op_code_union { } __attribute__((packed)); }; -#define RET_SIZE 1 +#define RET_SIZE (IS_ENABLED(CONFIG_RETPOLINE) ? 
5 : 1 + IS_ENABLED(CONFIG_SLS)) static unsigned long create_trampoline(struct ftrace_ops *ops, unsigned int *tramp_size) @@ -315,15 +311,16 @@ create_trampoline(struct ftrace_ops *ops, unsigned int *tramp_size) unsigned long end_offset; unsigned long op_offset; unsigned long call_offset; + unsigned long jmp_offset; unsigned long offset; unsigned long npages; unsigned long size; - unsigned long retq; unsigned long *ptr; void *trampoline; void *ip; /* 48 8b 15 <offset> is movq <offset>(%rip), %rdx */ unsigned const char op_ref[] = { 0x48, 0x8b, 0x15 }; + unsigned const char retq[] = { RET_INSN_OPCODE, INT3_INSN_OPCODE }; union ftrace_op_code_union op_ptr; int ret; @@ -332,11 +329,13 @@ create_trampoline(struct ftrace_ops *ops, unsigned int *tramp_size) end_offset = (unsigned long)ftrace_regs_caller_end; op_offset = (unsigned long)ftrace_regs_caller_op_ptr; call_offset = (unsigned long)ftrace_regs_call; + jmp_offset = (unsigned long)ftrace_regs_caller_jmp; } else { start_offset = (unsigned long)ftrace_caller; - end_offset = (unsigned long)ftrace_epilogue; + end_offset = (unsigned long)ftrace_caller_end; op_offset = (unsigned long)ftrace_caller_op_ptr; call_offset = (unsigned long)ftrace_call; + jmp_offset = 0; } size = end_offset - start_offset; @@ -354,17 +353,26 @@ create_trampoline(struct ftrace_ops *ops, unsigned int *tramp_size) npages = DIV_ROUND_UP(*tramp_size, PAGE_SIZE); /* Copy ftrace_caller onto the trampoline memory */ - ret = probe_kernel_read(trampoline, (void *)start_offset, size); + ret = copy_from_kernel_nofault(trampoline, (void *)start_offset, size); if (WARN_ON(ret < 0)) goto fail; ip = trampoline + size; + if (cpu_feature_enabled(X86_FEATURE_RETHUNK)) + __text_gen_insn(ip, JMP32_INSN_OPCODE, ip, &__x86_return_thunk, JMP32_INSN_SIZE); + else + memcpy(ip, retq, sizeof(retq)); - /* The trampoline ends with ret(q) */ - retq = (unsigned long)ftrace_stub; - ret = probe_kernel_read(ip, (void *)retq, RET_SIZE); - if (WARN_ON(ret < 0)) - goto fail; + /* No 
need to test direct calls on created trampolines */ + if (ops->flags & FTRACE_OPS_FL_SAVE_REGS) { + /* NOP the jnz 1f; but make sure it's a 2 byte jnz */ + ip = trampoline + (jmp_offset - start_offset); + if (WARN_ON(*(char *)ip != 0x75)) + goto fail; + ret = copy_from_kernel_nofault(ip, x86_nops[2], 2); + if (ret < 0) + goto fail; + } /* * The address of the ftrace_ops that is used for this trampoline @@ -407,7 +415,8 @@ create_trampoline(struct ftrace_ops *ops, unsigned int *tramp_size) set_vm_flush_reset_perms(trampoline); - set_memory_ro((unsigned long)trampoline, npages); + if (likely(system_state != SYSTEM_BOOTING)) + set_memory_ro((unsigned long)trampoline, npages); set_memory_x((unsigned long)trampoline, npages); return (unsigned long)trampoline; fail: @@ -415,6 +424,32 @@ fail: return 0; } +void set_ftrace_ops_ro(void) +{ + struct ftrace_ops *ops; + unsigned long start_offset; + unsigned long end_offset; + unsigned long npages; + unsigned long size; + + do_for_each_ftrace_op(ops, ftrace_ops_list) { + if (!(ops->flags & FTRACE_OPS_FL_ALLOC_TRAMP)) + continue; + + if (ops->flags & FTRACE_OPS_FL_SAVE_REGS) { + start_offset = (unsigned long)ftrace_regs_caller; + end_offset = (unsigned long)ftrace_regs_caller_end; + } else { + start_offset = (unsigned long)ftrace_caller; + end_offset = (unsigned long)ftrace_caller_end; + } + size = end_offset - start_offset; + size = size + RET_SIZE + sizeof(void *); + npages = DIV_ROUND_UP(size, PAGE_SIZE); + set_memory_ro((unsigned long)ops->trampoline, npages); + } while_for_each_ftrace_op(ops); +} + static unsigned long calc_trampoline_call_offset(bool save_regs) { unsigned long start_offset; @@ -471,7 +506,7 @@ static void *addr_from_call(void *ptr) union text_poke_insn call; int ret; - ret = probe_kernel_read(&call, ptr, CALL_INSN_SIZE); + ret = copy_from_kernel_nofault(&call, ptr, CALL_INSN_SIZE); if (WARN_ON_ONCE(ret < 0)) return NULL; @@ -484,7 +519,7 @@ static void *addr_from_call(void *ptr) return ptr + 
CALL_INSN_SIZE + call.disp; } -void prepare_ftrace_return(unsigned long self_addr, unsigned long *parent, +void prepare_ftrace_return(unsigned long ip, unsigned long *parent, unsigned long frame_pointer); /* @@ -498,7 +533,8 @@ static void *static_tramp_func(struct ftrace_ops *ops, struct dyn_ftrace *rec) void *ptr; if (ops && ops->trampoline) { -#ifdef CONFIG_FUNCTION_GRAPH_TRACER +#if !defined(CONFIG_HAVE_DYNAMIC_FTRACE_WITH_ARGS) && \ + defined(CONFIG_FUNCTION_GRAPH_TRACER) /* * We only know about function graph tracer setting as static * trampoline. @@ -545,9 +581,8 @@ void arch_ftrace_trampoline_free(struct ftrace_ops *ops) #ifdef CONFIG_FUNCTION_GRAPH_TRACER -#ifdef CONFIG_DYNAMIC_FTRACE +#if defined(CONFIG_DYNAMIC_FTRACE) && !defined(CONFIG_HAVE_DYNAMIC_FTRACE_WITH_ARGS) extern void ftrace_graph_call(void); - static const char *ftrace_jmp_replace(unsigned long ip, unsigned long addr) { return text_gen_insn(JMP32_INSN_OPCODE, (void *)ip, (void *)addr); @@ -575,19 +610,17 @@ int ftrace_disable_ftrace_graph_caller(void) return ftrace_mod_jmp(ip, &ftrace_stub); } - -#endif /* !CONFIG_DYNAMIC_FTRACE */ +#endif /* CONFIG_DYNAMIC_FTRACE && !CONFIG_HAVE_DYNAMIC_FTRACE_WITH_ARGS */ /* * Hook the return address and push it in the stack of return addrs * in current thread info. */ -void prepare_ftrace_return(unsigned long self_addr, unsigned long *parent, +void prepare_ftrace_return(unsigned long ip, unsigned long *parent, unsigned long frame_pointer) { unsigned long return_hooker = (unsigned long)&return_to_handler; - unsigned long old; - int faulted; + int bit; /* * When resuming from suspend-to-ram, this function can be indirectly @@ -607,37 +640,25 @@ void prepare_ftrace_return(unsigned long self_addr, unsigned long *parent, if (unlikely(atomic_read(¤t->tracing_graph_pause))) return; - /* - * Protect against fault, even if it shouldn't - * happen. This tool is too much intrusive to - * ignore such a protection. 
- */ - asm volatile( - "1: " _ASM_MOV " (%[parent]), %[old]\n" - "2: " _ASM_MOV " %[return_hooker], (%[parent])\n" - " movl $0, %[faulted]\n" - "3:\n" - - ".section .fixup, \"ax\"\n" - "4: movl $1, %[faulted]\n" - " jmp 3b\n" - ".previous\n" - - _ASM_EXTABLE(1b, 4b) - _ASM_EXTABLE(2b, 4b) - - : [old] "=&r" (old), [faulted] "=r" (faulted) - : [parent] "r" (parent), [return_hooker] "r" (return_hooker) - : "memory" - ); - - if (unlikely(faulted)) { - ftrace_graph_stop(); - WARN_ON(1); + bit = ftrace_test_recursion_trylock(ip, *parent); + if (bit < 0) return; - } - if (function_graph_enter(old, self_addr, frame_pointer, parent)) - *parent = old; + if (!function_graph_enter(*parent, ip, frame_pointer, parent)) + *parent = return_hooker; + + ftrace_test_recursion_unlock(bit); } + +#ifdef CONFIG_HAVE_DYNAMIC_FTRACE_WITH_ARGS +void ftrace_graph_func(unsigned long ip, unsigned long parent_ip, + struct ftrace_ops *op, struct ftrace_regs *fregs) +{ + struct pt_regs *regs = &fregs->regs; + unsigned long *stack = (unsigned long *)kernel_stack_pointer(regs); + + prepare_ftrace_return(ip, (unsigned long *)stack, 0); +} +#endif + #endif /* CONFIG_FUNCTION_GRAPH_TRACER */ diff --git a/arch/x86/kernel/ftrace_32.S b/arch/x86/kernel/ftrace_32.S index e8a9f8370112..a0ed0e4a2c0c 100644 --- a/arch/x86/kernel/ftrace_32.S +++ b/arch/x86/kernel/ftrace_32.S @@ -19,7 +19,7 @@ #endif SYM_FUNC_START(__fentry__) - ret + RET SYM_FUNC_END(__fentry__) EXPORT_SYMBOL(__fentry__) @@ -84,7 +84,7 @@ ftrace_graph_call: /* This is weak to keep gas from relaxing the jumps */ SYM_INNER_LABEL_ALIGN(ftrace_stub, SYM_L_WEAK) - ret + RET SYM_CODE_END(ftrace_caller) SYM_CODE_START(ftrace_regs_caller) @@ -177,7 +177,7 @@ SYM_CODE_START(ftrace_graph_caller) popl %edx popl %ecx popl %eax - ret + RET SYM_CODE_END(ftrace_graph_caller) .globl return_to_handler @@ -189,5 +189,5 @@ return_to_handler: movl %eax, %ecx popl %edx popl %eax - JMP_NOSPEC %ecx + JMP_NOSPEC ecx #endif diff --git a/arch/x86/kernel/ftrace_64.S 
b/arch/x86/kernel/ftrace_64.S index 369e61faacfe..2a4be92fd144 100644 --- a/arch/x86/kernel/ftrace_64.S +++ b/arch/x86/kernel/ftrace_64.S @@ -4,6 +4,7 @@ */ #include <linux/linkage.h> +#include <linux/cfi_types.h> #include <asm/ptrace.h> #include <asm/ftrace.h> #include <asm/export.h> @@ -12,7 +13,7 @@ #include <asm/frame.h> .code64 - .section .entry.text, "ax" + .section .text, "ax" #ifdef CONFIG_FRAME_POINTER /* Save parent and function stack frames (rip and rbp) */ @@ -23,7 +24,7 @@ #endif /* CONFIG_FRAME_POINTER */ /* Size of stack used to save mcount regs in save_mcount_regs */ -#define MCOUNT_REG_SIZE (SS+8 + MCOUNT_FRAME_SIZE) +#define MCOUNT_REG_SIZE (FRAME_SIZE + MCOUNT_FRAME_SIZE) /* * gcc -pg option adds a call to 'mcount' in most functions. @@ -77,7 +78,7 @@ /* * We add enough stack to save all regs. */ - subq $(MCOUNT_REG_SIZE - MCOUNT_FRAME_SIZE), %rsp + subq $(FRAME_SIZE), %rsp movq %rax, RAX(%rsp) movq %rcx, RCX(%rsp) movq %rdx, RDX(%rsp) @@ -129,10 +130,18 @@ .endm +SYM_TYPED_FUNC_START(ftrace_stub) + RET +SYM_FUNC_END(ftrace_stub) + +SYM_TYPED_FUNC_START(ftrace_stub_graph) + RET +SYM_FUNC_END(ftrace_stub_graph) + #ifdef CONFIG_DYNAMIC_FTRACE SYM_FUNC_START(__fentry__) - retq + RET SYM_FUNC_END(__fentry__) EXPORT_SYMBOL(__fentry__) @@ -140,16 +149,29 @@ SYM_FUNC_START(ftrace_caller) /* save_mcount_regs fills in first two parameters */ save_mcount_regs + /* Stack - skipping return address of ftrace_caller */ + leaq MCOUNT_REG_SIZE+8(%rsp), %rcx + movq %rcx, RSP(%rsp) + SYM_INNER_LABEL(ftrace_caller_op_ptr, SYM_L_GLOBAL) + ANNOTATE_NOENDBR /* Load the ftrace_ops into the 3rd parameter */ movq function_trace_op(%rip), %rdx - /* regs go into 4th parameter (but make it NULL) */ - movq $0, %rcx + /* regs go into 4th parameter */ + leaq (%rsp), %rcx + + /* Only ops with REGS flag set should have CS register set */ + movq $0, CS(%rsp) SYM_INNER_LABEL(ftrace_call, SYM_L_GLOBAL) + ANNOTATE_NOENDBR call ftrace_stub + /* Handlers can change the RIP */ + movq 
RIP(%rsp), %rax + movq %rax, MCOUNT_REG_SIZE(%rsp) + restore_mcount_regs /* @@ -157,32 +179,22 @@ SYM_INNER_LABEL(ftrace_call, SYM_L_GLOBAL) * think twice before adding any new code or changing the * layout here. */ -SYM_INNER_LABEL(ftrace_epilogue, SYM_L_GLOBAL) - -#ifdef CONFIG_FUNCTION_GRAPH_TRACER -SYM_INNER_LABEL(ftrace_graph_call, SYM_L_GLOBAL) - jmp ftrace_stub -#endif - -/* - * This is weak to keep gas from relaxing the jumps. - * It is also used to copy the retq for trampolines. - */ -SYM_INNER_LABEL_ALIGN(ftrace_stub, SYM_L_WEAK) - retq -SYM_FUNC_END(ftrace_caller) +SYM_INNER_LABEL(ftrace_caller_end, SYM_L_GLOBAL) + ANNOTATE_NOENDBR + RET +SYM_FUNC_END(ftrace_caller); +STACK_FRAME_NON_STANDARD_FP(ftrace_caller) SYM_FUNC_START(ftrace_regs_caller) /* Save the current flags before any operations that can change them */ pushfq - UNWIND_HINT_SAVE - /* added 8 bytes to save flags */ save_mcount_regs 8 /* save_mcount_regs fills in first two parameters */ SYM_INNER_LABEL(ftrace_regs_caller_op_ptr, SYM_L_GLOBAL) + ANNOTATE_NOENDBR /* Load the ftrace_ops into the 3rd parameter */ movq function_trace_op(%rip), %rdx @@ -212,6 +224,7 @@ SYM_INNER_LABEL(ftrace_regs_caller_op_ptr, SYM_L_GLOBAL) leaq (%rsp), %rcx SYM_INNER_LABEL(ftrace_regs_call, SYM_L_GLOBAL) + ANNOTATE_NOENDBR call ftrace_stub /* Copy flags back to SS, to restore them */ @@ -233,45 +246,39 @@ SYM_INNER_LABEL(ftrace_regs_call, SYM_L_GLOBAL) movq ORIG_RAX(%rsp), %rax movq %rax, MCOUNT_REG_SIZE-8(%rsp) - /* If ORIG_RAX is anything but zero, make this a call to that */ - movq ORIG_RAX(%rsp), %rax - cmpq $0, %rax - je 1f - - /* Swap the flags with orig_rax */ - movq MCOUNT_REG_SIZE(%rsp), %rdi - movq %rdi, MCOUNT_REG_SIZE-8(%rsp) - movq %rax, MCOUNT_REG_SIZE(%rsp) - - restore_mcount_regs 8 - - jmp 2f - -1: restore_mcount_regs - - -2: /* - * The stack layout is nondetermistic here, depending on which path was - * taken. This confuses objtool and ORC, rightfully so. 
For now, - * pretend the stack always looks like the non-direct case. + * If ORIG_RAX is anything but zero, make this a call to that. + * See arch_ftrace_set_direct_caller(). */ - UNWIND_HINT_RESTORE + testq %rax, %rax +SYM_INNER_LABEL(ftrace_regs_caller_jmp, SYM_L_GLOBAL) + ANNOTATE_NOENDBR + jnz 1f + restore_mcount_regs /* Restore flags */ popfq /* - * As this jmp to ftrace_epilogue can be a short jump - * it must not be copied into the trampoline. - * The trampoline will add the code to jump - * to the return. + * The trampoline will add the return. */ SYM_INNER_LABEL(ftrace_regs_caller_end, SYM_L_GLOBAL) + ANNOTATE_NOENDBR + RET + + /* Swap the flags with orig_rax */ +1: movq MCOUNT_REG_SIZE(%rsp), %rdi + movq %rdi, MCOUNT_REG_SIZE-8(%rsp) + movq %rax, MCOUNT_REG_SIZE(%rsp) - jmp ftrace_epilogue + restore_mcount_regs 8 + /* Restore flags */ + popfq + UNWIND_HINT_FUNC + RET SYM_FUNC_END(ftrace_regs_caller) +STACK_FRAME_NON_STANDARD_FP(ftrace_regs_caller) #else /* ! CONFIG_DYNAMIC_FTRACE */ @@ -279,18 +286,7 @@ SYM_FUNC_END(ftrace_regs_caller) SYM_FUNC_START(__fentry__) cmpq $ftrace_stub, ftrace_trace_function jnz trace - -fgraph_trace: -#ifdef CONFIG_FUNCTION_GRAPH_TRACER - cmpq $ftrace_stub, ftrace_graph_return - jnz ftrace_graph_caller - - cmpq $ftrace_graph_entry_stub, ftrace_graph_entry - jnz ftrace_graph_caller -#endif - -SYM_INNER_LABEL(ftrace_stub, SYM_L_GLOBAL) - retq + RET trace: /* save_mcount_regs fills in first two parameters */ @@ -303,31 +299,21 @@ trace: * function tracing is enabled. 
*/ movq ftrace_trace_function, %r8 - CALL_NOSPEC %r8 + CALL_NOSPEC r8 restore_mcount_regs - jmp fgraph_trace + jmp ftrace_stub SYM_FUNC_END(__fentry__) EXPORT_SYMBOL(__fentry__) +STACK_FRAME_NON_STANDARD_FP(__fentry__) + #endif /* CONFIG_DYNAMIC_FTRACE */ #ifdef CONFIG_FUNCTION_GRAPH_TRACER -SYM_FUNC_START(ftrace_graph_caller) - /* Saves rbp into %rdx and fills first parameter */ - save_mcount_regs - - leaq MCOUNT_REG_SIZE+8(%rsp), %rsi - movq $0, %rdx /* No framepointers needed */ - call prepare_ftrace_return - - restore_mcount_regs - - retq -SYM_FUNC_END(ftrace_graph_caller) - SYM_CODE_START(return_to_handler) UNWIND_HINT_EMPTY - subq $24, %rsp + ANNOTATE_NOENDBR + subq $16, %rsp /* Save the return values */ movq %rax, (%rsp) @@ -339,7 +325,18 @@ SYM_CODE_START(return_to_handler) movq %rax, %rdi movq 8(%rsp), %rdx movq (%rsp), %rax - addq $24, %rsp - JMP_NOSPEC %rdi + + addq $16, %rsp + /* + * Jump back to the old return address. This cannot be JMP_NOSPEC rdi + * since IBT would demand that contain ENDBR, which simply isn't so for + * return addresses. Use a retpoline here to keep the RSB balanced. 
+ */ + ANNOTATE_INTRA_FUNCTION_CALL + call .Ldo_rop + int3 +.Ldo_rop: + mov %rdi, (%rsp) + RET SYM_CODE_END(return_to_handler) #endif diff --git a/arch/x86/kernel/head64.c b/arch/x86/kernel/head64.c index 206a4b6144c2..6a3cfaf6b72a 100644 --- a/arch/x86/kernel/head64.c +++ b/arch/x86/kernel/head64.c @@ -19,14 +19,14 @@ #include <linux/start_kernel.h> #include <linux/io.h> #include <linux/memblock.h> -#include <linux/mem_encrypt.h> +#include <linux/cc_platform.h> +#include <linux/pgtable.h> #include <asm/processor.h> #include <asm/proto.h> #include <asm/smp.h> #include <asm/setup.h> #include <asm/desc.h> -#include <asm/pgtable.h> #include <asm/tlbflush.h> #include <asm/sections.h> #include <asm/kdebug.h> @@ -36,6 +36,11 @@ #include <asm/microcode.h> #include <asm/kasan.h> #include <asm/fixmap.h> +#include <asm/realmode.h> +#include <asm/extable.h> +#include <asm/trapnr.h> +#include <asm/sev.h> +#include <asm/tdx.h> /* * Manage page tables very early on. @@ -61,7 +66,25 @@ unsigned long vmemmap_base __ro_after_init = __VMEMMAP_BASE_L4; EXPORT_SYMBOL(vmemmap_base); #endif -#define __head __section(.head.text) +/* + * GDT used on the boot CPU before switching to virtual addresses. + */ +static struct desc_struct startup_gdt[GDT_ENTRIES] = { + [GDT_ENTRY_KERNEL32_CS] = GDT_ENTRY_INIT(0xc09b, 0, 0xfffff), + [GDT_ENTRY_KERNEL_CS] = GDT_ENTRY_INIT(0xa09b, 0, 0xfffff), + [GDT_ENTRY_KERNEL_DS] = GDT_ENTRY_INIT(0xc093, 0, 0xfffff), +}; + +/* + * Address needs to be set at runtime because it references the startup_gdt + * while the kernel still uses a direct mapping. 
+ */ +static struct desc_ptr startup_gdt_descr = { + .size = sizeof(startup_gdt), + .address = 0, +}; + +#define __head __section(".head.text") static void __head *fixup_pointer(void *ptr, unsigned long physaddr) { @@ -82,7 +105,7 @@ static unsigned int __head *fixup_int(void *ptr, unsigned long physaddr) static bool __head check_la57_support(unsigned long physaddr) { /* - * 5-level paging is detected and enabled at kernel decomression + * 5-level paging is detected and enabled at kernel decompression * stage. Only check if it has been enabled there. */ if (!(native_read_cr4() & X86_CR4_LA57)) @@ -104,6 +127,49 @@ static bool __head check_la57_support(unsigned long physaddr) } #endif +static unsigned long __head sme_postprocess_startup(struct boot_params *bp, pmdval_t *pmd) +{ + unsigned long vaddr, vaddr_end; + int i; + + /* Encrypt the kernel and related (if SME is active) */ + sme_encrypt_kernel(bp); + + /* + * Clear the memory encryption mask from the .bss..decrypted section. + * The bss section will be memset to zero later in the initialization so + * there is no need to zero it after changing the memory encryption + * attribute. + */ + if (sme_get_me_mask()) { + vaddr = (unsigned long)__start_bss_decrypted; + vaddr_end = (unsigned long)__end_bss_decrypted; + + for (; vaddr < vaddr_end; vaddr += PMD_SIZE) { + /* + * On SNP, transition the page to shared in the RMP table so that + * it is consistent with the page table attribute change. + * + * __start_bss_decrypted has a virtual address in the high range + * mapping (kernel .text). PVALIDATE, by way of + * early_snp_set_memory_shared(), requires a valid virtual + * address but the kernel is currently running off of the identity + * mapping so use __pa() to get a *currently* valid virtual address. 
+ */ + early_snp_set_memory_shared(__pa(vaddr), __pa(vaddr), PTRS_PER_PMD); + + i = pmd_index(vaddr); + pmd[i] -= sme_get_me_mask(); + } + } + + /* + * Return the SME encryption mask (if SME is active) to be used as a + * modifier for the initial pgdir entry programmed into CR3. + */ + return sme_get_me_mask(); +} + /* Code in __startup_64() can be relocated during execution, but the compiler * doesn't have to generate PC-relative relocations when accessing globals from * that function. Clang actually does not generate them, which leads to @@ -113,7 +179,6 @@ static bool __head check_la57_support(unsigned long physaddr) unsigned long __head __startup_64(unsigned long physaddr, struct boot_params *bp) { - unsigned long vaddr, vaddr_end; unsigned long load_delta, *p; unsigned long pgtable_flags; pgdval_t *pgd; @@ -141,9 +206,6 @@ unsigned long __head __startup_64(unsigned long physaddr, if (load_delta & ~PMD_PAGE_MASK) for (;;); - /* Activate Secure Memory Encryption (SME) if supported and enabled */ - sme_enable(bp); - /* Include the SME encryption mask in the fixup value */ load_delta += sme_get_me_mask(); @@ -254,38 +316,7 @@ unsigned long __head __startup_64(unsigned long physaddr, */ *fixup_long(&phys_base, physaddr) += load_delta - sme_get_me_mask(); - /* Encrypt the kernel and related (if SME is active) */ - sme_encrypt_kernel(bp); - - /* - * Clear the memory encryption mask from the .bss..decrypted section. - * The bss section will be memset to zero later in the initialization so - * there is no need to zero it after changing the memory encryption - * attribute. - */ - if (mem_encrypt_active()) { - vaddr = (unsigned long)__start_bss_decrypted; - vaddr_end = (unsigned long)__end_bss_decrypted; - for (; vaddr < vaddr_end; vaddr += PMD_SIZE) { - i = pmd_index(vaddr); - pmd[i] -= sme_get_me_mask(); - } - } - - /* - * Return the SME encryption mask (if SME is active) to be used as a - * modifier for the initial pgdir entry programmed into CR3. 
- */ - return sme_get_me_mask(); -} - -unsigned long __startup_secondary_64(void) -{ - /* - * Return the SME encryption mask (if SME is active) to be used as a - * modifier for the initial pgdir entry programmed into CR3. - */ - return sme_get_me_mask(); + return sme_postprocess_startup(bp, pmd); } /* Wipe all early page tables except for the kernel symbol map */ @@ -297,7 +328,7 @@ static void __init reset_early_page_tables(void) } /* Create a new PMD entry */ -int __init __early_make_pgtable(unsigned long address, pmdval_t pmd) +bool __init __early_make_pgtable(unsigned long address, pmdval_t pmd) { unsigned long physaddr = address - __PAGE_OFFSET; pgdval_t pgd, *pgd_p; @@ -307,7 +338,7 @@ int __init __early_make_pgtable(unsigned long address, pmdval_t pmd) /* Invalid address or early pgt is done ? */ if (physaddr >= MAXMEM || read_cr3_pa() != __pa_nodebug(early_top_pgt)) - return -1; + return false; again: pgd_p = &early_top_pgt[pgd_index(address)].pgd; @@ -364,10 +395,10 @@ again: } pmd_p[pmd_index(address)] = pmd; - return 0; + return true; } -int __init early_make_pgtable(unsigned long address) +static bool __init early_make_pgtable(unsigned long address) { unsigned long physaddr = address - __PAGE_OFFSET; pmdval_t pmd; @@ -377,12 +408,30 @@ int __init early_make_pgtable(unsigned long address) return __early_make_pgtable(address, pmd); } +void __init do_early_exception(struct pt_regs *regs, int trapnr) +{ + if (trapnr == X86_TRAP_PF && + early_make_pgtable(native_read_cr2())) + return; + + if (IS_ENABLED(CONFIG_AMD_MEM_ENCRYPT) && + trapnr == X86_TRAP_VC && handle_vc_boot_ghcb(regs)) + return; + + if (trapnr == X86_TRAP_VE && tdx_early_handle_ve(regs)) + return; + + early_fixup_exception(regs, trapnr); +} + /* Don't add a printk in there. printk relies on the PDA which is not initialized yet. 
*/ -static void __init clear_bss(void) +void __init clear_bss(void) { memset(__bss_start, 0, (unsigned long) __bss_stop - (unsigned long) __bss_start); + memset(__brk_base, 0, + (unsigned long) __brk_limit - (unsigned long) __brk_base); } static unsigned long get_cmd_line_ptr(void) @@ -445,6 +494,10 @@ asmlinkage __visible void __init x86_64_start_kernel(char * real_mode_data) clear_bss(); + /* + * This needs to happen *before* kasan_early_init() because latter maps stuff + * into that page. + */ clear_page(init_top_pgt); /* @@ -456,8 +509,21 @@ asmlinkage __visible void __init x86_64_start_kernel(char * real_mode_data) kasan_early_init(); + /* + * Flush global TLB entries which could be left over from the trampoline page + * table. + * + * This needs to happen *after* kasan_early_init() as KASAN-enabled .configs + * instrument native_write_cr4() so KASAN must be initialized for that + * instrumentation to work. + */ + __native_tlb_flush_global(this_cpu_read(cpu_tlbstate.cr4)); + idt_setup_early_handler(); + /* Needed before cc_platform_has() can be used for TDX */ + tdx_early_init(); + copy_bootdata(__va(real_mode_data)); /* @@ -489,3 +555,83 @@ void __init x86_64_start_reservations(char *real_mode_data) start_kernel(); } + +/* + * Data structures and code used for IDT setup in head_64.S. The bringup-IDT is + * used until the idt_table takes over. On the boot CPU this happens in + * x86_64_start_kernel(), on secondary CPUs in start_secondary(). In both cases + * this happens in the functions called from head_64.S. + * + * The idt_table can't be used that early because all the code modifying it is + * in idt.c and can be instrumented by tracing or KASAN, which both don't work + * during early CPU bringup. Also the idt_table has the runtime vectors + * configured which require certain CPU state to be setup already (like TSS), + * which also hasn't happened yet in early CPU bringup. 
+ */ +static gate_desc bringup_idt_table[NUM_EXCEPTION_VECTORS] __page_aligned_data; + +static struct desc_ptr bringup_idt_descr = { + .size = (NUM_EXCEPTION_VECTORS * sizeof(gate_desc)) - 1, + .address = 0, /* Set at runtime */ +}; + +static void set_bringup_idt_handler(gate_desc *idt, int n, void *handler) +{ +#ifdef CONFIG_AMD_MEM_ENCRYPT + struct idt_data data; + gate_desc desc; + + init_idt_data(&data, n, handler); + idt_init_desc(&desc, &data); + native_write_idt_entry(idt, n, &desc); +#endif +} + +/* This runs while still in the direct mapping */ +static void startup_64_load_idt(unsigned long physbase) +{ + struct desc_ptr *desc = fixup_pointer(&bringup_idt_descr, physbase); + gate_desc *idt = fixup_pointer(bringup_idt_table, physbase); + + + if (IS_ENABLED(CONFIG_AMD_MEM_ENCRYPT)) { + void *handler; + + /* VMM Communication Exception */ + handler = fixup_pointer(vc_no_ghcb, physbase); + set_bringup_idt_handler(idt, X86_TRAP_VC, handler); + } + + desc->address = (unsigned long)idt; + native_load_idt(desc); +} + +/* This is used when running on kernel addresses */ +void early_setup_idt(void) +{ + /* VMM Communication Exception */ + if (IS_ENABLED(CONFIG_AMD_MEM_ENCRYPT)) { + setup_ghcb(); + set_bringup_idt_handler(bringup_idt_table, X86_TRAP_VC, vc_boot_ghcb); + } + + bringup_idt_descr.address = (unsigned long)bringup_idt_table; + native_load_idt(&bringup_idt_descr); +} + +/* + * Setup boot CPU state needed before kernel switches to virtual addresses. 
+ */ +void __head startup_64_setup_env(unsigned long physbase) +{ + /* Load GDT */ + startup_gdt_descr.address = (unsigned long)fixup_pointer(startup_gdt, physbase); + native_load_gdt(&startup_gdt_descr); + + /* New GDT is live - reload data segment registers */ + asm volatile("movl %%eax, %%ds\n" + "movl %%eax, %%ss\n" + "movl %%eax, %%es\n" : : "a"(__KERNEL_DS) : "memory"); + + startup_64_load_idt(physbase); +} diff --git a/arch/x86/kernel/head_32.S b/arch/x86/kernel/head_32.S index 3923ab4630d7..9b7acc9c7874 100644 --- a/arch/x86/kernel/head_32.S +++ b/arch/x86/kernel/head_32.S @@ -23,6 +23,7 @@ #include <asm/cpufeatures.h> #include <asm/percpu.h> #include <asm/nops.h> +#include <asm/nospec-branch.h> #include <asm/bootparam.h> #include <asm/export.h> #include <asm/pgtable_32.h> @@ -67,11 +68,6 @@ __HEAD SYM_CODE_START(startup_32) movl pa(initial_stack),%ecx - /* test KEEP_SEGMENTS flag to see if the bootloader is asking - us to not reload segments */ - testb $KEEP_SEGMENTS, BP_loadflags(%esi) - jnz 2f - /* * Set segments to known values. */ @@ -82,7 +78,6 @@ SYM_CODE_START(startup_32) movl %eax,%fs movl %eax,%gs movl %eax,%ss -2: leal -__PAGE_OFFSET(%ecx),%esp /* @@ -140,38 +135,7 @@ SYM_CODE_START(startup_32) movl %eax,pa(initial_page_table+0xffc) #endif -#ifdef CONFIG_PARAVIRT - /* This is can only trip for a broken bootloader... */ - cmpw $0x207, pa(boot_params + BP_version) - jb .Ldefault_entry - - /* Paravirt-compatible boot parameters. Look to see what architecture - we're booting under. */ - movl pa(boot_params + BP_hardware_subarch), %eax - cmpl $num_subarch_entries, %eax - jae .Lbad_subarch - - movl pa(subarch_entries)(,%eax,4), %eax - subl $__PAGE_OFFSET, %eax - jmp *%eax - -.Lbad_subarch: -SYM_INNER_LABEL_ALIGN(xen_entry, SYM_L_WEAK) - /* Unknown implementation; there's really - nothing we can do at this point. 
*/ - ud2a - - __INITDATA - -subarch_entries: - .long .Ldefault_entry /* normal x86/PC */ - .long xen_entry /* Xen hypervisor */ - .long .Ldefault_entry /* Moorestown MID */ -num_subarch_entries = (. - subarch_entries) / 4 -.previous -#else jmp .Ldefault_entry -#endif /* CONFIG_PARAVIRT */ SYM_CODE_END(startup_32) #ifdef CONFIG_HOTPLUG_CPU @@ -355,8 +319,8 @@ SYM_FUNC_START(startup_32_smp) movl $(__KERNEL_PERCPU), %eax movl %eax,%fs # set this cpu's percpu - movl $(__KERNEL_STACK_CANARY),%eax - movl %eax,%gs + xorl %eax,%eax + movl %eax,%gs # clear possible garbage in %gs xorl %eax,%eax # Clear LDT lldt %ax @@ -376,22 +340,8 @@ SYM_FUNC_END(startup_32_smp) */ __INIT setup_once: -#ifdef CONFIG_STACKPROTECTOR - /* - * Configure the stack canary. The linker can't handle this by - * relocation. Manually set base address in stack canary - * segment descriptor. - */ - movl $gdt_page,%eax - movl $stack_canary,%ecx - movw %cx, 8 * GDT_ENTRY_STACK_CANARY + 2(%eax) - shrl $16, %ecx - movb %cl, 8 * GDT_ENTRY_STACK_CANARY + 4(%eax) - movb %ch, 8 * GDT_ENTRY_STACK_CANARY + 7(%eax) -#endif - andl $0,setup_once_ref /* Once is enough, thanks */ - ret + RET SYM_FUNC_START(early_idt_handler_array) # 36(%esp) %eflags @@ -483,7 +433,7 @@ SYM_FUNC_START(early_ignore_irq) pushl 32(%esp) pushl 40(%esp) pushl $int_msg - call printk + call _printk call dump_stack diff --git a/arch/x86/kernel/head_64.S b/arch/x86/kernel/head_64.S index 4bbc770af632..d860d437631b 100644 --- a/arch/x86/kernel/head_64.S +++ b/arch/x86/kernel/head_64.S @@ -13,8 +13,8 @@ #include <linux/linkage.h> #include <linux/threads.h> #include <linux/init.h> +#include <linux/pgtable.h> #include <asm/segment.h> -#include <asm/pgtable.h> #include <asm/page.h> #include <asm/msr.h> #include <asm/cache.h> @@ -26,18 +26,10 @@ #include <asm/nospec-branch.h> #include <asm/fixmap.h> -#ifdef CONFIG_PARAVIRT_XXL -#include <asm/asm-offsets.h> -#include <asm/paravirt.h> -#else -#define INTERRUPT_RETURN iretq -#endif - -/* we are not 
able to switch in one step to the final KERNEL ADDRESS SPACE +/* + * We are not able to switch in one step to the final KERNEL ADDRESS SPACE * because we need identity-mapped pages. - * */ - #define l4_index(x) (((x) >> 39) & 511) #define pud_index(x) (((x) >> PUD_SHIFT) & (PTRS_PER_PUD-1)) @@ -70,7 +62,50 @@ SYM_CODE_START_NOALIGN(startup_64) */ /* Set up the stack for verify_cpu(), similar to initial_stack below */ - leaq (__end_init_task - SIZEOF_PTREGS)(%rip), %rsp + leaq (__end_init_task - FRAME_SIZE)(%rip), %rsp + + leaq _text(%rip), %rdi + + /* + * initial_gs points to initial fixed_percpu_data struct with storage for + * the stack protector canary. Global pointer fixups are needed at this + * stage, so apply them as is done in fixup_pointer(), and initialize %gs + * such that the canary can be accessed at %gs:40 for subsequent C calls. + */ + movl $MSR_GS_BASE, %ecx + movq initial_gs(%rip), %rax + movq $_text, %rdx + subq %rdx, %rax + addq %rdi, %rax + movq %rax, %rdx + shrq $32, %rdx + wrmsr + + pushq %rsi + call startup_64_setup_env + popq %rsi + +#ifdef CONFIG_AMD_MEM_ENCRYPT + /* + * Activate SEV/SME memory encryption if supported/enabled. This needs to + * be done now, since this also includes setup of the SEV-SNP CPUID table, + * which needs to be done before any CPUID instructions are executed in + * subsequent code. + */ + movq %rsi, %rdi + pushq %rsi + call sme_enable + popq %rsi +#endif + + /* Now switch to __KERNEL_CS so IRET works reliably */ + pushq $__KERNEL_CS + leaq .Lon_kernel_cs(%rip), %rax + pushq %rax + lretq + +.Lon_kernel_cs: + UNWIND_HINT_EMPTY /* Sanitize CPU configuration */ call verify_cpu @@ -93,6 +128,7 @@ SYM_CODE_END(startup_64) SYM_CODE_START(secondary_startup_64) UNWIND_HINT_EMPTY + ANNOTATE_NOENDBR /* * At this point the CPU runs in 64bit mode CS.L = 1 CS.D = 0, * and someone has loaded a mapped page table. 
@@ -111,19 +147,48 @@ SYM_CODE_START(secondary_startup_64) call verify_cpu /* + * The secondary_startup_64_no_verify entry point is only used by + * SEV-ES guests. In those guests the call to verify_cpu() would cause + * #VC exceptions which can not be handled at this stage of secondary + * CPU bringup. + * + * All non SEV-ES systems, especially Intel systems, need to execute + * verify_cpu() above to make sure NX is enabled. + */ +SYM_INNER_LABEL(secondary_startup_64_no_verify, SYM_L_GLOBAL) + UNWIND_HINT_EMPTY + ANNOTATE_NOENDBR + + /* * Retrieve the modifier (SME encryption mask if SME is active) to be * added to the initial pgdir entry that will be programmed into CR3. */ - pushq %rsi - call __startup_secondary_64 - popq %rsi +#ifdef CONFIG_AMD_MEM_ENCRYPT + movq sme_me_mask, %rax +#else + xorq %rax, %rax +#endif /* Form the CR3 value being sure to include the CR3 modifier */ addq $(init_top_pgt - __START_KERNEL_map), %rax 1: +#ifdef CONFIG_X86_MCE + /* + * Preserve CR4.MCE if the kernel will enable #MC support. + * Clearing MCE may fault in some environments (that also force #MC + * support). Any machine check that occurs before #MC support is fully + * configured will crash the system regardless of the CR4.MCE value set + * here. + */ + movq %cr4, %rcx + andl $X86_CR4_MCE, %ecx +#else + movl $0, %ecx +#endif + /* Enable PAE mode, PGE and LA57 */ - movl $(X86_CR4_PAE | X86_CR4_PGE), %ecx + orl $(X86_CR4_PAE | X86_CR4_PGE), %ecx #ifdef CONFIG_X86_5LEVEL testl $1, __pgtable_l5_enabled(%rip) jz 1f @@ -134,41 +199,47 @@ SYM_CODE_START(secondary_startup_64) /* Setup early boot stage 4-/5-level pagetables. */ addq phys_base(%rip), %rax + + /* + * For SEV guests: Verify that the C-bit is correct. A malicious + * hypervisor could lie about the C-bit position to perform a ROP + * attack on the guest by writing to the unencrypted stack and wait for + * the next RET instruction. + * %rsi carries pointer to realmode data and is callee-clobbered. Save + * and restore it. 
+ */ + pushq %rsi + movq %rax, %rdi + call sev_verify_cbit + popq %rsi + + /* + * Switch to new page-table + * + * For the boot CPU this switches to early_top_pgt which still has the + * indentity mappings present. The secondary CPUs will switch to the + * init_top_pgt here, away from the trampoline_pgd and unmap the + * indentity mapped ranges. + */ movq %rax, %cr3 + /* + * Do a global TLB flush after the CR3 switch to make sure the TLB + * entries from the identity mapping are flushed. + */ + movq %cr4, %rcx + movq %rcx, %rax + xorq $X86_CR4_PGE, %rcx + movq %rcx, %cr4 + movq %rax, %cr4 + /* Ensure I am executing from virtual addresses */ movq $1f, %rax ANNOTATE_RETPOLINE_SAFE jmp *%rax 1: UNWIND_HINT_EMPTY - - /* Check if nx is implemented */ - movl $0x80000001, %eax - cpuid - movl %edx,%edi - - /* Setup EFER (Extended Feature Enable Register) */ - movl $MSR_EFER, %ecx - rdmsr - btsl $_EFER_SCE, %eax /* Enable System Call */ - btl $20,%edi /* No Execute supported? */ - jnc 1f - btsl $_EFER_NX, %eax - btsq $_PAGE_BIT_NX,early_pmd_flags(%rip) -1: wrmsr /* Make changes effective */ - - /* Setup cr0 */ - movl $CR0_STATE, %eax - /* Make changes effective */ - movq %rax, %cr0 - - /* Setup a boot time stack */ - movq initial_stack(%rip), %rsp - - /* zero EFLAGS after setting rsp */ - pushq $0 - popfq + ANNOTATE_NOENDBR // above /* * We must switch to a new descriptor in kernel space for the GDT @@ -204,6 +275,51 @@ SYM_CODE_START(secondary_startup_64) movl initial_gs+4(%rip),%edx wrmsr + /* + * Setup a boot time stack - Any secondary CPU will have lost its stack + * by now because the cr3-switch above unmaps the real-mode stack + */ + movq initial_stack(%rip), %rsp + + /* Setup and Load IDT */ + pushq %rsi + call early_setup_idt + popq %rsi + + /* Check if nx is implemented */ + movl $0x80000001, %eax + cpuid + movl %edx,%edi + + /* Setup EFER (Extended Feature Enable Register) */ + movl $MSR_EFER, %ecx + rdmsr + /* + * Preserve current value of EFER for comparison and 
to skip + * EFER writes if no change was made (for TDX guest) + */ + movl %eax, %edx + btsl $_EFER_SCE, %eax /* Enable System Call */ + btl $20,%edi /* No Execute supported? */ + jnc 1f + btsl $_EFER_NX, %eax + btsq $_PAGE_BIT_NX,early_pmd_flags(%rip) + + /* Avoid writing EFER if no change was made (for TDX guest) */ +1: cmpl %edx, %eax + je 1f + xor %edx, %edx + wrmsr /* Make changes effective */ +1: + /* Setup cr0 */ + movl $CR0_STATE, %eax + /* Make changes effective */ + movq %rax, %cr0 + + /* zero EFLAGS after setting rsp */ + pushq $0 + popfq + /* rsi is pointer to real mode structure with interesting info. pass it to C */ movq %rsi, %rdi @@ -241,9 +357,11 @@ SYM_CODE_START(secondary_startup_64) pushq %rax # target address in negative space lretq .Lafter_lret: + ANNOTATE_NOENDBR SYM_CODE_END(secondary_startup_64) #include "verify_cpu.S" +#include "sev_verify_cbit.S" #ifdef CONFIG_HOTPLUG_CPU /* @@ -258,17 +376,55 @@ SYM_CODE_START(start_cpu0) SYM_CODE_END(start_cpu0) #endif +#ifdef CONFIG_AMD_MEM_ENCRYPT +/* + * VC Exception handler used during early boot when running on kernel + * addresses, but before the switch to the idt_table can be made. + * The early_idt_handler_array can't be used here because it calls into a lot + * of __init code and this handler is also used during CPU offlining/onlining. + * Therefore this handler ends up in the .text section so that it stays around + * when .init.text is freed. 
+ */ +SYM_CODE_START_NOALIGN(vc_boot_ghcb) + UNWIND_HINT_IRET_REGS offset=8 + ENDBR + + ANNOTATE_UNRET_END + + /* Build pt_regs */ + PUSH_AND_CLEAR_REGS + + /* Call C handler */ + movq %rsp, %rdi + movq ORIG_RAX(%rsp), %rsi + movq initial_vc_handler(%rip), %rax + ANNOTATE_RETPOLINE_SAFE + call *%rax + + /* Unwind pt_regs */ + POP_REGS + + /* Remove Error Code */ + addq $8, %rsp + + iretq +SYM_CODE_END(vc_boot_ghcb) +#endif + /* Both SMP bootup and ACPI suspend change these variables */ __REFDATA .balign 8 SYM_DATA(initial_code, .quad x86_64_start_kernel) SYM_DATA(initial_gs, .quad INIT_PER_CPU_VAR(fixed_percpu_data)) +#ifdef CONFIG_AMD_MEM_ENCRYPT +SYM_DATA(initial_vc_handler, .quad handle_vc_boot_ghcb) +#endif /* - * The SIZEOF_PTREGS gap is a convention which helps the in-kernel unwinder + * The FRAME_SIZE gap is a convention which helps the in-kernel unwinder * reliably detect the end of the stack. */ -SYM_DATA(initial_stack, .quad init_thread_union + THREAD_SIZE - SIZEOF_PTREGS) +SYM_DATA(initial_stack, .quad init_thread_union + THREAD_SIZE - FRAME_SIZE) __FINITDATA __INIT @@ -277,9 +433,11 @@ SYM_CODE_START(early_idt_handler_array) .rept NUM_EXCEPTION_VECTORS .if ((EXCEPTION_ERRCODE_MASK >> i) & 1) == 0 UNWIND_HINT_IRET_REGS + ENDBR pushq $0 # Dummy error code, to make stack frame uniform .else UNWIND_HINT_IRET_REGS offset=8 + ENDBR .endif pushq $i # 72(%rsp) Vector number jmp early_idt_handler_common @@ -287,10 +445,12 @@ SYM_CODE_START(early_idt_handler_array) i = i + 1 .fill early_idt_handler_array + i*EARLY_IDT_HANDLER_SIZE - ., 1, 0xcc .endr - UNWIND_HINT_IRET_REGS offset=16 SYM_CODE_END(early_idt_handler_array) + ANNOTATE_NOENDBR // early_idt_handler_array[NUM_EXCEPTION_VECTORS] SYM_CODE_START_LOCAL(early_idt_handler_common) + UNWIND_HINT_IRET_REGS offset=16 + ANNOTATE_UNRET_END /* * The stack is the hardware frame, an error code or zero, and the * vector number. 
@@ -318,22 +478,48 @@ SYM_CODE_START_LOCAL(early_idt_handler_common) pushq %r15 /* pt_regs->r15 */ UNWIND_HINT_REGS - cmpq $14,%rsi /* Page fault? */ - jnz 10f - GET_CR2_INTO(%rdi) /* can clobber %rax if pv */ - call early_make_pgtable - andl %eax,%eax - jz 20f /* All good */ - -10: movq %rsp,%rdi /* RDI = pt_regs; RSI is already trapnr */ - call early_fixup_exception + call do_early_exception -20: decl early_recursion_flag(%rip) jmp restore_regs_and_return_to_kernel SYM_CODE_END(early_idt_handler_common) +#ifdef CONFIG_AMD_MEM_ENCRYPT +/* + * VC Exception handler used during very early boot. The + * early_idt_handler_array can't be used because it returns via the + * paravirtualized INTERRUPT_RETURN and pv-ops don't work that early. + * + * XXX it does, fix this. + * + * This handler will end up in the .init.text section and not be + * available to boot secondary CPUs. + */ +SYM_CODE_START_NOALIGN(vc_no_ghcb) + UNWIND_HINT_IRET_REGS offset=8 + ENDBR + + ANNOTATE_UNRET_END + + /* Build pt_regs */ + PUSH_AND_CLEAR_REGS + + /* Call C handler */ + movq %rsp, %rdi + movq ORIG_RAX(%rsp), %rsi + call do_vc_no_ghcb + + /* Unwind pt_regs */ + POP_REGS + + /* Remove Error Code */ + addq $8, %rsp + + /* Pure iret required here - don't use INTERRUPT_RETURN */ + iretq +SYM_CODE_END(vc_no_ghcb) +#endif #define SYM_DATA_START_PAGE_ALIGNED(name) \ SYM_START(name, SYM_L_GLOBAL, .balign PAGE_SIZE) @@ -432,21 +618,19 @@ SYM_DATA_END(level3_kernel_pgt) SYM_DATA_START_PAGE_ALIGNED(level2_kernel_pgt) /* - * 512 MB kernel mapping. We spend a full page on this pagetable - * anyway. + * Kernel high mapping. * - * The kernel code+data+bss must not be bigger than that. + * The kernel code+data+bss must be located below KERNEL_IMAGE_SIZE in + * virtual address space, which is 1 GiB if RANDOMIZE_BASE is enabled, + * 512 MiB otherwise. * - * (NOTE: at +512MB starts the module area, see MODULES_VADDR. - * If you want to increase this then increase MODULES_VADDR - * too.) 
+ * (NOTE: after that starts the module area, see MODULES_VADDR.) * - * This table is eventually used by the kernel during normal - * runtime. Care must be taken to clear out undesired bits - * later, like _PAGE_RW or _PAGE_GLOBAL in some cases. + * This table is eventually used by the kernel during normal runtime. + * Care must be taken to clear out undesired bits later, like _PAGE_RW + * or _PAGE_GLOBAL in some cases. */ - PMDS(0, __PAGE_KERNEL_LARGE_EXEC, - KERNEL_IMAGE_SIZE/PMD_SIZE) + PMDS(0, __PAGE_KERNEL_LARGE_EXEC, KERNEL_IMAGE_SIZE/PMD_SIZE) SYM_DATA_END(level2_kernel_pgt) SYM_DATA_START_PAGE_ALIGNED(level2_fixmap_pgt) diff --git a/arch/x86/kernel/hpet.c b/arch/x86/kernel/hpet.c index 7a50f0b62a70..71f336425e58 100644 --- a/arch/x86/kernel/hpet.c +++ b/arch/x86/kernel/hpet.c @@ -7,8 +7,10 @@ #include <linux/cpu.h> #include <linux/irq.h> +#include <asm/irq_remapping.h> #include <asm/hpet.h> #include <asm/time.h> +#include <asm/mwait.h> #undef pr_fmt #define pr_fmt(fmt) "hpet: " fmt @@ -50,7 +52,7 @@ unsigned long hpet_address; u8 hpet_blockid; /* OS timer block num */ bool hpet_msi_disable; -#ifdef CONFIG_PCI_MSI +#ifdef CONFIG_GENERIC_MSI_IRQ static DEFINE_PER_CPU(struct hpet_channel *, cpu_hpet_channel); static struct irq_domain *hpet_domain; #endif @@ -467,9 +469,8 @@ static void __init hpet_legacy_clockevent_register(struct hpet_channel *hc) /* * HPET MSI Support */ -#ifdef CONFIG_PCI_MSI - -void hpet_msi_unmask(struct irq_data *data) +#ifdef CONFIG_GENERIC_MSI_IRQ +static void hpet_msi_unmask(struct irq_data *data) { struct hpet_channel *hc = irq_data_get_irq_handler_data(data); unsigned int cfg; @@ -479,7 +480,7 @@ void hpet_msi_unmask(struct irq_data *data) hpet_writel(cfg, HPET_Tn_CFG(hc->num)); } -void hpet_msi_mask(struct irq_data *data) +static void hpet_msi_mask(struct irq_data *data) { struct hpet_channel *hc = irq_data_get_irq_handler_data(data); unsigned int cfg; @@ -489,12 +490,122 @@ void hpet_msi_mask(struct irq_data *data) 
hpet_writel(cfg, HPET_Tn_CFG(hc->num)); } -void hpet_msi_write(struct hpet_channel *hc, struct msi_msg *msg) +static void hpet_msi_write(struct hpet_channel *hc, struct msi_msg *msg) { hpet_writel(msg->data, HPET_Tn_ROUTE(hc->num)); hpet_writel(msg->address_lo, HPET_Tn_ROUTE(hc->num) + 4); } +static void hpet_msi_write_msg(struct irq_data *data, struct msi_msg *msg) +{ + hpet_msi_write(irq_data_get_irq_handler_data(data), msg); +} + +static struct irq_chip hpet_msi_controller __ro_after_init = { + .name = "HPET-MSI", + .irq_unmask = hpet_msi_unmask, + .irq_mask = hpet_msi_mask, + .irq_ack = irq_chip_ack_parent, + .irq_set_affinity = msi_domain_set_affinity, + .irq_retrigger = irq_chip_retrigger_hierarchy, + .irq_write_msi_msg = hpet_msi_write_msg, + .flags = IRQCHIP_SKIP_SET_WAKE | IRQCHIP_AFFINITY_PRE_STARTUP, +}; + +static int hpet_msi_init(struct irq_domain *domain, + struct msi_domain_info *info, unsigned int virq, + irq_hw_number_t hwirq, msi_alloc_info_t *arg) +{ + irq_set_status_flags(virq, IRQ_MOVE_PCNTXT); + irq_domain_set_info(domain, virq, arg->hwirq, info->chip, NULL, + handle_edge_irq, arg->data, "edge"); + + return 0; +} + +static void hpet_msi_free(struct irq_domain *domain, + struct msi_domain_info *info, unsigned int virq) +{ + irq_clear_status_flags(virq, IRQ_MOVE_PCNTXT); +} + +static struct msi_domain_ops hpet_msi_domain_ops = { + .msi_init = hpet_msi_init, + .msi_free = hpet_msi_free, +}; + +static struct msi_domain_info hpet_msi_domain_info = { + .ops = &hpet_msi_domain_ops, + .chip = &hpet_msi_controller, + .flags = MSI_FLAG_USE_DEF_DOM_OPS, +}; + +static struct irq_domain *hpet_create_irq_domain(int hpet_id) +{ + struct msi_domain_info *domain_info; + struct irq_domain *parent, *d; + struct fwnode_handle *fn; + struct irq_fwspec fwspec; + + if (x86_vector_domain == NULL) + return NULL; + + domain_info = kzalloc(sizeof(*domain_info), GFP_KERNEL); + if (!domain_info) + return NULL; + + *domain_info = hpet_msi_domain_info; + domain_info->data = 
(void *)(long)hpet_id; + + fn = irq_domain_alloc_named_id_fwnode(hpet_msi_controller.name, + hpet_id); + if (!fn) { + kfree(domain_info); + return NULL; + } + + fwspec.fwnode = fn; + fwspec.param_count = 1; + fwspec.param[0] = hpet_id; + + parent = irq_find_matching_fwspec(&fwspec, DOMAIN_BUS_ANY); + if (!parent) { + irq_domain_free_fwnode(fn); + kfree(domain_info); + return NULL; + } + if (parent != x86_vector_domain) + hpet_msi_controller.name = "IR-HPET-MSI"; + + d = msi_create_irq_domain(fn, domain_info, parent); + if (!d) { + irq_domain_free_fwnode(fn); + kfree(domain_info); + } + return d; +} + +static inline int hpet_dev_id(struct irq_domain *domain) +{ + struct msi_domain_info *info = msi_get_domain_info(domain); + + return (int)(long)info->data; +} + +static int hpet_assign_irq(struct irq_domain *domain, struct hpet_channel *hc, + int dev_num) +{ + struct irq_alloc_info info; + + init_irq_alloc_info(&info, NULL); + info.type = X86_IRQ_ALLOC_TYPE_HPET; + info.data = hc; + info.devid = hpet_dev_id(domain); + info.hwirq = dev_num; + + return irq_domain_alloc_irqs(domain, 1, NUMA_NO_NODE, &info); +} + static int hpet_clkevt_msi_resume(struct clock_event_device *evt) { struct hpet_channel *hc = clockevent_to_channel(evt); @@ -806,6 +917,83 @@ static bool __init hpet_counting(void) return false; } +static bool __init mwait_pc10_supported(void) +{ + unsigned int eax, ebx, ecx, mwait_substates; + + if (boot_cpu_data.x86_vendor != X86_VENDOR_INTEL) + return false; + + if (!cpu_feature_enabled(X86_FEATURE_MWAIT)) + return false; + + if (boot_cpu_data.cpuid_level < CPUID_MWAIT_LEAF) + return false; + + cpuid(CPUID_MWAIT_LEAF, &eax, &ebx, &ecx, &mwait_substates); + + return (ecx & CPUID5_ECX_EXTENSIONS_SUPPORTED) && + (ecx & CPUID5_ECX_INTERRUPT_BREAK) && + (mwait_substates & (0xF << 28)); +} + +/* + * Check whether the system supports PC10. If so force disable HPET as that + * stops counting in PC10. 
This check is overbroad as it does not take any + * of the following into account: + * + * - ACPI tables + * - Enablement of intel_idle + * - Command line arguments which limit intel_idle C-state support + * + * That's perfectly fine. HPET is a piece of hardware designed by committee + * and the only reasons why it is still in use on modern systems is the + * fact that it is impossible to reliably query TSC and CPU frequency via + * CPUID or firmware. + * + * If HPET is functional it is useful for calibrating TSC, but this can be + * done via PMTIMER as well which seems to be the last remaining timer on + * X86/INTEL platforms that has not been completely wreckaged by feature + * creep. + * + * In theory HPET support should be removed altogether, but there are older + * systems out there which depend on it because TSC and APIC timer are + * dysfunctional in deeper C-states. + * + * It's only 20 years now that hardware people have been asked to provide + * reliable and discoverable facilities which can be used for timekeeping + * and per CPU timer interrupts. + * + * The probability that this problem is going to be solved in the + * foreseeable future is close to zero, so the kernel has to be cluttered + * with heuristics to keep up with the ever growing amount of hardware and + * firmware trainwrecks. Hopefully some day hardware people will understand + * that the approach of "This can be fixed in software" is not sustainable. + * Hope dies last... + */ +static bool __init hpet_is_pc10_damaged(void) +{ + unsigned long long pcfg; + + /* Check whether PC10 substates are supported */ + if (!mwait_pc10_supported()) + return false; + + /* Check whether PC10 is enabled in PKG C-state limit */ + rdmsrl(MSR_PKG_CST_CONFIG_CONTROL, pcfg); + if ((pcfg & 0xF) < 8) + return false; + + if (hpet_force_user) { + pr_warn("HPET force enabled via command line, but dysfunctional in PC10.\n"); + return false; + } + + pr_info("HPET dysfunctional in PC10. 
Force disabled.\n"); + boot_hpet_disable = true; + return true; +} + /** * hpet_enable - Try to setup the HPET timer. Returns 1 on success. */ @@ -819,6 +1007,9 @@ int __init hpet_enable(void) if (!is_hpet_capable()) return 0; + if (hpet_is_pc10_damaged()) + return 0; + hpet_set_mapping(); if (!hpet_virt_address) return 0; @@ -1244,8 +1435,12 @@ irqreturn_t hpet_rtc_interrupt(int irq, void *dev_id) hpet_rtc_timer_reinit(); memset(&curr_time, 0, sizeof(struct rtc_time)); - if (hpet_rtc_flags & (RTC_UIE | RTC_AIE)) - mc146818_get_time(&curr_time); + if (hpet_rtc_flags & (RTC_UIE | RTC_AIE)) { + if (unlikely(mc146818_get_time(&curr_time) < 0)) { + pr_err_ratelimited("unable to read current time from RTC\n"); + return IRQ_HANDLED; + } + } if (hpet_rtc_flags & RTC_UIE && curr_time.tm_sec != hpet_prev_update_sec) { diff --git a/arch/x86/kernel/hw_breakpoint.c b/arch/x86/kernel/hw_breakpoint.c index 4d8d53ed02c9..668a4a6533d9 100644 --- a/arch/x86/kernel/hw_breakpoint.c +++ b/arch/x86/kernel/hw_breakpoint.c @@ -32,6 +32,8 @@ #include <asm/processor.h> #include <asm/debugreg.h> #include <asm/user.h> +#include <asm/desc.h> +#include <asm/tlbflush.h> /* Per cpu debug control register value */ DEFINE_PER_CPU(unsigned long, cpu_dr7); @@ -97,6 +99,8 @@ int arch_install_hw_breakpoint(struct perf_event *bp) unsigned long *dr7; int i; + lockdep_assert_irqs_disabled(); + for (i = 0; i < HBP_NUM; i++) { struct perf_event **slot = this_cpu_ptr(&bp_per_reg[i]); @@ -115,6 +119,12 @@ int arch_install_hw_breakpoint(struct perf_event *bp) dr7 = this_cpu_ptr(&cpu_dr7); *dr7 |= encode_dr7(i, info->len, info->type); + /* + * Ensure we first write cpu_dr7 before we set the DR7 register. + * This ensures an NMI never see cpu_dr7 0 when DR7 is not. 
+ */ + barrier(); + set_debugreg(*dr7, 7); if (info->mask) set_dr_addr_mask(info->mask, i); @@ -134,9 +144,11 @@ int arch_install_hw_breakpoint(struct perf_event *bp) void arch_uninstall_hw_breakpoint(struct perf_event *bp) { struct arch_hw_breakpoint *info = counter_arch_bp(bp); - unsigned long *dr7; + unsigned long dr7; int i; + lockdep_assert_irqs_disabled(); + for (i = 0; i < HBP_NUM; i++) { struct perf_event **slot = this_cpu_ptr(&bp_per_reg[i]); @@ -149,12 +161,20 @@ void arch_uninstall_hw_breakpoint(struct perf_event *bp) if (WARN_ONCE(i == HBP_NUM, "Can't find any breakpoint slot")) return; - dr7 = this_cpu_ptr(&cpu_dr7); - *dr7 &= ~__encode_dr7(i, info->len, info->type); + dr7 = this_cpu_read(cpu_dr7); + dr7 &= ~__encode_dr7(i, info->len, info->type); - set_debugreg(*dr7, 7); + set_debugreg(dr7, 7); if (info->mask) set_dr_addr_mask(0, i); + + /* + * Ensure the write to cpu_dr7 is after we've set the DR7 register. + * This ensures an NMI never see cpu_dr7 0 when DR7 is not. + */ + barrier(); + + this_cpu_write(cpu_dr7, dr7); } static int arch_bp_generic_len(int x86_len) @@ -227,10 +247,98 @@ int arch_check_bp_in_kernelspace(struct arch_hw_breakpoint *hw) return (va >= TASK_SIZE_MAX) || ((va + len - 1) >= TASK_SIZE_MAX); } +/* + * Checks whether the range [addr, end], overlaps the area [base, base + size). + */ +static inline bool within_area(unsigned long addr, unsigned long end, + unsigned long base, unsigned long size) +{ + return end >= base && addr < (base + size); +} + +/* + * Checks whether the range from addr to end, inclusive, overlaps the fixed + * mapped CPU entry area range or other ranges used for CPU entry. 
+ */ +static inline bool within_cpu_entry(unsigned long addr, unsigned long end) +{ + int cpu; + + /* CPU entry area is always used for CPU entry */ + if (within_area(addr, end, CPU_ENTRY_AREA_BASE, + CPU_ENTRY_AREA_TOTAL_SIZE)) + return true; + + /* + * When FSGSBASE is enabled, paranoid_entry() fetches the per-CPU + * GSBASE value via __per_cpu_offset or pcpu_unit_offsets. + */ +#ifdef CONFIG_SMP + if (within_area(addr, end, (unsigned long)__per_cpu_offset, + sizeof(unsigned long) * nr_cpu_ids)) + return true; +#else + if (within_area(addr, end, (unsigned long)&pcpu_unit_offsets, + sizeof(pcpu_unit_offsets))) + return true; +#endif + + for_each_possible_cpu(cpu) { + /* The original rw GDT is being used after load_direct_gdt() */ + if (within_area(addr, end, (unsigned long)get_cpu_gdt_rw(cpu), + GDT_SIZE)) + return true; + + /* + * cpu_tss_rw is not directly referenced by hardware, but + * cpu_tss_rw is also used in CPU entry code, + */ + if (within_area(addr, end, + (unsigned long)&per_cpu(cpu_tss_rw, cpu), + sizeof(struct tss_struct))) + return true; + + /* + * cpu_tlbstate.user_pcid_flush_mask is used for CPU entry. + * If a data breakpoint on it, it will cause an unwanted #DB. + * Protect the full cpu_tlbstate structure to be sure. + */ + if (within_area(addr, end, + (unsigned long)&per_cpu(cpu_tlbstate, cpu), + sizeof(struct tlb_state))) + return true; + + /* + * When in guest (X86_FEATURE_HYPERVISOR), local_db_save() + * will read per-cpu cpu_dr7 before clear dr7 register. + */ + if (within_area(addr, end, (unsigned long)&per_cpu(cpu_dr7, cpu), + sizeof(cpu_dr7))) + return true; + } + + return false; +} + static int arch_build_bp_info(struct perf_event *bp, const struct perf_event_attr *attr, struct arch_hw_breakpoint *hw) { + unsigned long bp_end; + + bp_end = attr->bp_addr + attr->bp_len - 1; + if (bp_end < attr->bp_addr) + return -EINVAL; + + /* + * Prevent any breakpoint of any type that overlaps the CPU + * entry area and data. 
This protects the IST stacks and also + * reduces the chance that we ever find out what happens if + * there's a data breakpoint on the GDT, IDT, or TSS. + */ + if (within_cpu_entry(attr->bp_addr, bp_end)) + return -EINVAL; + hw->address = attr->bp_addr; hw->mask = 0; @@ -263,7 +371,7 @@ static int arch_build_bp_info(struct perf_event *bp, hw->len = X86_BREAKPOINT_LEN_X; return 0; } - /* fall through */ + fallthrough; default: return -EINVAL; } @@ -356,42 +464,6 @@ int hw_breakpoint_arch_parse(struct perf_event *bp, } /* - * Dump the debug register contents to the user. - * We can't dump our per cpu values because it - * may contain cpu wide breakpoint, something that - * doesn't belong to the current task. - * - * TODO: include non-ptrace user breakpoints (perf) - */ -void aout_dump_debugregs(struct user *dump) -{ - int i; - int dr7 = 0; - struct perf_event *bp; - struct arch_hw_breakpoint *info; - struct thread_struct *thread = &current->thread; - - for (i = 0; i < HBP_NUM; i++) { - bp = thread->ptrace_bps[i]; - - if (bp && !bp->attr.disabled) { - dump->u_debugreg[i] = bp->attr.bp_addr; - info = counter_arch_bp(bp); - dr7 |= encode_dr7(i, info->len, info->type); - } else { - dump->u_debugreg[i] = 0; - } - } - - dump->u_debugreg[4] = 0; - dump->u_debugreg[5] = 0; - dump->u_debugreg[6] = current->thread.debugreg6; - - dump->u_debugreg[7] = dr7; -} -EXPORT_SYMBOL_GPL(aout_dump_debugregs); - -/* * Release the user breakpoints used by ptrace */ void flush_ptrace_hw_breakpoint(struct task_struct *tsk) @@ -404,7 +476,7 @@ void flush_ptrace_hw_breakpoint(struct task_struct *tsk) t->ptrace_bps[i] = NULL; } - t->debugreg6 = 0; + t->virtual_dr6 = 0; t->ptrace_dr7 = 0; } @@ -414,7 +486,7 @@ void hw_breakpoint_restore(void) set_debugreg(__this_cpu_read(cpu_debugreg[1]), 1); set_debugreg(__this_cpu_read(cpu_debugreg[2]), 2); set_debugreg(__this_cpu_read(cpu_debugreg[3]), 3); - set_debugreg(current->thread.debugreg6, 6); + set_debugreg(DR6_RESERVED, 6); 
set_debugreg(__this_cpu_read(cpu_dr7), 7); } EXPORT_SYMBOL_GPL(hw_breakpoint_restore); @@ -437,61 +509,48 @@ EXPORT_SYMBOL_GPL(hw_breakpoint_restore); */ static int hw_breakpoint_handler(struct die_args *args) { - int i, cpu, rc = NOTIFY_STOP; + int i, rc = NOTIFY_STOP; struct perf_event *bp; - unsigned long dr7, dr6; unsigned long *dr6_p; + unsigned long dr6; + bool bpx; /* The DR6 value is pointed by args->err */ dr6_p = (unsigned long *)ERR_PTR(args->err); dr6 = *dr6_p; - /* If it's a single step, TRAP bits are random */ - if (dr6 & DR_STEP) - return NOTIFY_DONE; - /* Do an early return if no trap bits are set in DR6 */ if ((dr6 & DR_TRAP_BITS) == 0) return NOTIFY_DONE; - get_debugreg(dr7, 7); - /* Disable breakpoints during exception handling */ - set_debugreg(0UL, 7); - /* - * Assert that local interrupts are disabled - * Reset the DRn bits in the virtualized register value. - * The ptrace trigger routine will add in whatever is needed. - */ - current->thread.debugreg6 &= ~DR_TRAP_BITS; - cpu = get_cpu(); - /* Handle all the breakpoints that were triggered */ for (i = 0; i < HBP_NUM; ++i) { if (likely(!(dr6 & (DR_TRAP0 << i)))) continue; + bp = this_cpu_read(bp_per_reg[i]); + if (!bp) + continue; + + bpx = bp->hw.info.type == X86_BREAKPOINT_EXECUTE; + /* - * The counter may be concurrently released but that can only - * occur from a call_rcu() path. We can then safely fetch - * the breakpoint, use its callback, touch its counter - * while we are in an rcu_read_lock() path. + * TF and data breakpoints are traps and can be merged, however + * instruction breakpoints are faults and will be raised + * separately. + * + * However DR6 can indicate both TF and instruction + * breakpoints. In that case take TF as that has precedence and + * delay the instruction breakpoint for the next exception. 
*/ - rcu_read_lock(); + if (bpx && (dr6 & DR_STEP)) + continue; - bp = per_cpu(bp_per_reg[i], cpu); /* * Reset the 'i'th TRAP bit in dr6 to denote completion of * exception handling */ (*dr6_p) &= ~(DR_TRAP0 << i); - /* - * bp can be NULL due to lazy debug register switching - * or due to concurrent perf counter removing. - */ - if (!bp) { - rcu_read_unlock(); - break; - } perf_bp_event(bp, args->regs); @@ -499,23 +558,19 @@ static int hw_breakpoint_handler(struct die_args *args) * Set up resume flag to avoid breakpoint recursion when * returning back to origin. */ - if (bp->hw.info.type == X86_BREAKPOINT_EXECUTE) + if (bpx) args->regs->flags |= X86_EFLAGS_RF; - - rcu_read_unlock(); } + /* * Further processing in do_debug() is needed for a) user-space * breakpoints (to generate signals) and b) when the system has * taken exception due to multiple causes */ - if ((current->thread.debugreg6 & DR_TRAP_BITS) || + if ((current->thread.virtual_dr6 & DR_TRAP_BITS) || (dr6 & (~DR_TRAP_BITS))) rc = NOTIFY_DONE; - set_debugreg(dr7, 7); - put_cpu(); - return rc; } diff --git a/arch/x86/kernel/i8259.c b/arch/x86/kernel/i8259.c index 519649ddf100..15aefa3f3e18 100644 --- a/arch/x86/kernel/i8259.c +++ b/arch/x86/kernel/i8259.c @@ -15,11 +15,11 @@ #include <linux/acpi.h> #include <linux/io.h> #include <linux/delay.h> +#include <linux/pgtable.h> #include <linux/atomic.h> #include <asm/timer.h> #include <asm/hw_irq.h> -#include <asm/pgtable.h> #include <asm/desc.h> #include <asm/apic.h> #include <asm/i8259.h> @@ -207,7 +207,7 @@ spurious_8259A_irq: * lets ACK and report it. 
[once per IRQ] */ if (!(spurious_irq_mask & irqmask)) { - printk(KERN_DEBUG + printk_deferred(KERN_DEBUG "spurious 8259A interrupt: IRQ%d.\n", irq); spurious_irq_mask |= irqmask; } @@ -235,15 +235,15 @@ static char irq_trigger[2]; */ static void restore_ELCR(char *trigger) { - outb(trigger[0], 0x4d0); - outb(trigger[1], 0x4d1); + outb(trigger[0], PIC_ELCR1); + outb(trigger[1], PIC_ELCR2); } static void save_ELCR(char *trigger) { /* IRQ 0,1,2,8,13 are marked as reserved */ - trigger[0] = inb(0x4d0) & 0xF8; - trigger[1] = inb(0x4d1) & 0xDE; + trigger[0] = inb(PIC_ELCR1) & 0xF8; + trigger[1] = inb(PIC_ELCR2) & 0xDE; } static void i8259A_resume(void) diff --git a/arch/x86/kernel/idt.c b/arch/x86/kernel/idt.c index 87ef69a72c52..a58c6bc1cd68 100644 --- a/arch/x86/kernel/idt.c +++ b/arch/x86/kernel/idt.c @@ -4,17 +4,13 @@ */ #include <linux/interrupt.h> +#include <asm/cpu_entry_area.h> +#include <asm/set_memory.h> #include <asm/traps.h> #include <asm/proto.h> #include <asm/desc.h> #include <asm/hw_irq.h> - -struct idt_data { - unsigned int vector; - unsigned int segment; - struct idt_bits bits; - const void *addr; -}; +#include <asm/idtentry.h> #define DPL0 0x0 #define DPL3 0x3 @@ -40,26 +36,41 @@ struct idt_data { #define SYSG(_vector, _addr) \ G(_vector, _addr, DEFAULT_STACK, GATE_INTERRUPT, DPL3, __KERNEL_CS) +#ifdef CONFIG_X86_64 /* * Interrupt gate with interrupt stack. The _ist index is the index in * the tss.ist[] array, but for the descriptor it needs to start at 1. */ #define ISTG(_vector, _addr, _ist) \ G(_vector, _addr, _ist + 1, GATE_INTERRUPT, DPL0, __KERNEL_CS) +#else +#define ISTG(_vector, _addr, _ist) INTG(_vector, _addr) +#endif /* Task gate */ #define TSKG(_vector, _gdt) \ G(_vector, NULL, DEFAULT_STACK, GATE_TASK, DPL0, _gdt << 3) +#define IDT_TABLE_SIZE (IDT_ENTRIES * sizeof(gate_desc)) + +static bool idt_setup_done __initdata; + /* * Early traps running on the DEFAULT_STACK because the other interrupt * stacks work only after cpu_init(). 
*/ static const __initconst struct idt_data early_idts[] = { - INTG(X86_TRAP_DB, debug), - SYSG(X86_TRAP_BP, int3), + INTG(X86_TRAP_DB, asm_exc_debug), + SYSG(X86_TRAP_BP, asm_exc_int3), + #ifdef CONFIG_X86_32 - INTG(X86_TRAP_PF, page_fault), + /* + * Not possible on 64-bit. See idt_setup_early_pf() for details. + */ + INTG(X86_TRAP_PF, asm_exc_page_fault), +#endif +#ifdef CONFIG_INTEL_TDX_GUEST + INTG(X86_TRAP_VE, asm_exc_virtualization_exception), #endif }; @@ -70,33 +81,41 @@ static const __initconst struct idt_data early_idts[] = { * set up TSS. */ static const __initconst struct idt_data def_idts[] = { - INTG(X86_TRAP_DE, divide_error), - INTG(X86_TRAP_NMI, nmi), - INTG(X86_TRAP_BR, bounds), - INTG(X86_TRAP_UD, invalid_op), - INTG(X86_TRAP_NM, device_not_available), - INTG(X86_TRAP_OLD_MF, coprocessor_segment_overrun), - INTG(X86_TRAP_TS, invalid_TSS), - INTG(X86_TRAP_NP, segment_not_present), - INTG(X86_TRAP_SS, stack_segment), - INTG(X86_TRAP_GP, general_protection), - INTG(X86_TRAP_SPURIOUS, spurious_interrupt_bug), - INTG(X86_TRAP_MF, coprocessor_error), - INTG(X86_TRAP_AC, alignment_check), - INTG(X86_TRAP_XF, simd_coprocessor_error), + INTG(X86_TRAP_DE, asm_exc_divide_error), + ISTG(X86_TRAP_NMI, asm_exc_nmi, IST_INDEX_NMI), + INTG(X86_TRAP_BR, asm_exc_bounds), + INTG(X86_TRAP_UD, asm_exc_invalid_op), + INTG(X86_TRAP_NM, asm_exc_device_not_available), + INTG(X86_TRAP_OLD_MF, asm_exc_coproc_segment_overrun), + INTG(X86_TRAP_TS, asm_exc_invalid_tss), + INTG(X86_TRAP_NP, asm_exc_segment_not_present), + INTG(X86_TRAP_SS, asm_exc_stack_segment), + INTG(X86_TRAP_GP, asm_exc_general_protection), + INTG(X86_TRAP_SPURIOUS, asm_exc_spurious_interrupt_bug), + INTG(X86_TRAP_MF, asm_exc_coprocessor_error), + INTG(X86_TRAP_AC, asm_exc_alignment_check), + INTG(X86_TRAP_XF, asm_exc_simd_coprocessor_error), #ifdef CONFIG_X86_32 TSKG(X86_TRAP_DF, GDT_ENTRY_DOUBLEFAULT_TSS), #else - INTG(X86_TRAP_DF, double_fault), + ISTG(X86_TRAP_DF, asm_exc_double_fault, IST_INDEX_DF), 
#endif - INTG(X86_TRAP_DB, debug), + ISTG(X86_TRAP_DB, asm_exc_debug, IST_INDEX_DB), #ifdef CONFIG_X86_MCE - INTG(X86_TRAP_MC, &machine_check), + ISTG(X86_TRAP_MC, asm_exc_machine_check, IST_INDEX_MCE), #endif - SYSG(X86_TRAP_OF, overflow), +#ifdef CONFIG_X86_KERNEL_IBT + INTG(X86_TRAP_CP, asm_exc_control_protection), +#endif + +#ifdef CONFIG_AMD_MEM_ENCRYPT + ISTG(X86_TRAP_VC, asm_exc_vmm_communication, IST_INDEX_VC), +#endif + + SYSG(X86_TRAP_OF, asm_exc_overflow), #if defined(CONFIG_IA32_EMULATION) SYSG(IA32_SYSCALL_VECTOR, entry_INT80_compat), #elif defined(CONFIG_X86_32) @@ -109,112 +128,63 @@ static const __initconst struct idt_data def_idts[] = { */ static const __initconst struct idt_data apic_idts[] = { #ifdef CONFIG_SMP - INTG(RESCHEDULE_VECTOR, reschedule_interrupt), - INTG(CALL_FUNCTION_VECTOR, call_function_interrupt), - INTG(CALL_FUNCTION_SINGLE_VECTOR, call_function_single_interrupt), - INTG(IRQ_MOVE_CLEANUP_VECTOR, irq_move_cleanup_interrupt), - INTG(REBOOT_VECTOR, reboot_interrupt), + INTG(RESCHEDULE_VECTOR, asm_sysvec_reschedule_ipi), + INTG(CALL_FUNCTION_VECTOR, asm_sysvec_call_function), + INTG(CALL_FUNCTION_SINGLE_VECTOR, asm_sysvec_call_function_single), + INTG(IRQ_MOVE_CLEANUP_VECTOR, asm_sysvec_irq_move_cleanup), + INTG(REBOOT_VECTOR, asm_sysvec_reboot), #endif #ifdef CONFIG_X86_THERMAL_VECTOR - INTG(THERMAL_APIC_VECTOR, thermal_interrupt), + INTG(THERMAL_APIC_VECTOR, asm_sysvec_thermal), #endif #ifdef CONFIG_X86_MCE_THRESHOLD - INTG(THRESHOLD_APIC_VECTOR, threshold_interrupt), + INTG(THRESHOLD_APIC_VECTOR, asm_sysvec_threshold), #endif #ifdef CONFIG_X86_MCE_AMD - INTG(DEFERRED_ERROR_VECTOR, deferred_error_interrupt), + INTG(DEFERRED_ERROR_VECTOR, asm_sysvec_deferred_error), #endif #ifdef CONFIG_X86_LOCAL_APIC - INTG(LOCAL_TIMER_VECTOR, apic_timer_interrupt), - INTG(X86_PLATFORM_IPI_VECTOR, x86_platform_ipi), + INTG(LOCAL_TIMER_VECTOR, asm_sysvec_apic_timer_interrupt), + INTG(X86_PLATFORM_IPI_VECTOR, asm_sysvec_x86_platform_ipi), # ifdef 
CONFIG_HAVE_KVM - INTG(POSTED_INTR_VECTOR, kvm_posted_intr_ipi), - INTG(POSTED_INTR_WAKEUP_VECTOR, kvm_posted_intr_wakeup_ipi), - INTG(POSTED_INTR_NESTED_VECTOR, kvm_posted_intr_nested_ipi), + INTG(POSTED_INTR_VECTOR, asm_sysvec_kvm_posted_intr_ipi), + INTG(POSTED_INTR_WAKEUP_VECTOR, asm_sysvec_kvm_posted_intr_wakeup_ipi), + INTG(POSTED_INTR_NESTED_VECTOR, asm_sysvec_kvm_posted_intr_nested_ipi), # endif # ifdef CONFIG_IRQ_WORK - INTG(IRQ_WORK_VECTOR, irq_work_interrupt), + INTG(IRQ_WORK_VECTOR, asm_sysvec_irq_work), # endif -#ifdef CONFIG_X86_UV - INTG(UV_BAU_MESSAGE, uv_bau_message_intr1), -#endif - INTG(SPURIOUS_APIC_VECTOR, spurious_interrupt), - INTG(ERROR_APIC_VECTOR, error_interrupt), + INTG(SPURIOUS_APIC_VECTOR, asm_sysvec_spurious_apic_interrupt), + INTG(ERROR_APIC_VECTOR, asm_sysvec_error_interrupt), #endif }; -#ifdef CONFIG_X86_64 -/* - * Early traps running on the DEFAULT_STACK because the other interrupt - * stacks work only after cpu_init(). - */ -static const __initconst struct idt_data early_pf_idts[] = { - INTG(X86_TRAP_PF, page_fault), -}; - -/* - * Override for the debug_idt. Same as the default, but with interrupt - * stack set to DEFAULT_STACK (0). Required for NMI trap handling. - */ -static const __initconst struct idt_data dbg_idts[] = { - INTG(X86_TRAP_DB, debug), -}; -#endif - -/* Must be page-aligned because the real IDT is used in a fixmap. */ -gate_desc idt_table[IDT_ENTRIES] __page_aligned_bss; +/* Must be page-aligned because the real IDT is used in the cpu entry area */ +static gate_desc idt_table[IDT_ENTRIES] __page_aligned_bss; -struct desc_ptr idt_descr __ro_after_init = { - .size = (IDT_ENTRIES * 2 * sizeof(unsigned long)) - 1, +static struct desc_ptr idt_descr __ro_after_init = { + .size = IDT_TABLE_SIZE - 1, .address = (unsigned long) idt_table, }; -#ifdef CONFIG_X86_64 -/* No need to be aligned, but done to keep all IDTs defined the same way. 
*/ -gate_desc debug_idt_table[IDT_ENTRIES] __page_aligned_bss; - -/* - * The exceptions which use Interrupt stacks. They are setup after - * cpu_init() when the TSS has been initialized. - */ -static const __initconst struct idt_data ist_idts[] = { - ISTG(X86_TRAP_DB, debug, IST_INDEX_DB), - ISTG(X86_TRAP_NMI, nmi, IST_INDEX_NMI), - ISTG(X86_TRAP_DF, double_fault, IST_INDEX_DF), -#ifdef CONFIG_X86_MCE - ISTG(X86_TRAP_MC, &machine_check, IST_INDEX_MCE), -#endif -}; - -/* - * Override for the debug_idt. Same as the default, but with interrupt - * stack set to DEFAULT_STACK (0). Required for NMI trap handling. - */ -const struct desc_ptr debug_idt_descr = { - .size = IDT_ENTRIES * 16 - 1, - .address = (unsigned long) debug_idt_table, -}; -#endif - -static inline void idt_init_desc(gate_desc *gate, const struct idt_data *d) +void load_current_idt(void) { - unsigned long addr = (unsigned long) d->addr; + lockdep_assert_irqs_disabled(); + load_idt(&idt_descr); +} - gate->offset_low = (u16) addr; - gate->segment = (u16) d->segment; - gate->bits = d->bits; - gate->offset_middle = (u16) (addr >> 16); -#ifdef CONFIG_X86_64 - gate->offset_high = (u32) (addr >> 32); - gate->reserved = 0; -#endif +#ifdef CONFIG_X86_F00F_BUG +bool idt_is_f00f_address(unsigned long address) +{ + return ((address - idt_descr.address) >> 3) == 6; } +#endif -static void +static __init void idt_setup_from_table(gate_desc *idt, const struct idt_data *t, int size, bool sys) { gate_desc desc; @@ -227,18 +197,11 @@ idt_setup_from_table(gate_desc *idt, const struct idt_data *t, int size, bool sy } } -static void set_intr_gate(unsigned int n, const void *addr) +static __init void set_intr_gate(unsigned int n, const void *addr) { struct idt_data data; - BUG_ON(n > 0xFF); - - memset(&data, 0, sizeof(data)); - data.vector = n; - data.addr = addr; - data.segment = __KERNEL_CS; - data.bits.type = GATE_INTERRUPT; - data.bits.p = 1; + init_idt_data(&data, n, addr); idt_setup_from_table(idt_table, &data, 1, 
false); } @@ -266,6 +229,14 @@ void __init idt_setup_traps(void) } #ifdef CONFIG_X86_64 +/* + * Early traps running on the DEFAULT_STACK because the other interrupt + * stacks work only after cpu_init(). + */ +static const __initconst struct idt_data early_pf_idts[] = { + INTG(X86_TRAP_PF, asm_exc_page_fault), +}; + /** * idt_setup_early_pf - Initialize the idt table with early pagefault handler * @@ -273,35 +244,32 @@ void __init idt_setup_traps(void) * cpu_init() is invoked and sets up TSS. The IST variant is installed * after that. * - * FIXME: Why is 32bit and 64bit installing the PF handler at different - * places in the early setup code? + * Note, that X86_64 cannot install the real #PF handler in + * idt_setup_early_traps() because the memory initialization needs the #PF + * handler from the early_idt_handler_array to initialize the early page + * tables. */ void __init idt_setup_early_pf(void) { idt_setup_from_table(idt_table, early_pf_idts, ARRAY_SIZE(early_pf_idts), true); } +#endif -/** - * idt_setup_ist_traps - Initialize the idt table with traps using IST - */ -void __init idt_setup_ist_traps(void) +static void __init idt_map_in_cea(void) { - idt_setup_from_table(idt_table, ist_idts, ARRAY_SIZE(ist_idts), true); + /* + * Set the IDT descriptor to a fixed read-only location in the cpu + * entry area, so that the "sidt" instruction will not leak the + * location of the kernel, and to defend the IDT against arbitrary + * memory write vulnerabilities. 
+ */ + cea_set_pte(CPU_ENTRY_AREA_RO_IDT_VADDR, __pa_symbol(idt_table), + PAGE_KERNEL_RO); + idt_descr.address = CPU_ENTRY_AREA_RO_IDT; } /** - * idt_setup_debugidt_traps - Initialize the debug idt table with debug traps - */ -void __init idt_setup_debugidt_traps(void) -{ - memcpy(&debug_idt_table, &idt_table, IDT_ENTRIES * 16); - - idt_setup_from_table(debug_idt_table, dbg_idts, ARRAY_SIZE(dbg_idts), false); -} -#endif - -/** * idt_setup_apic_and_irq_gates - Setup APIC/SMP and normal interrupt gates */ void __init idt_setup_apic_and_irq_gates(void) @@ -312,17 +280,29 @@ void __init idt_setup_apic_and_irq_gates(void) idt_setup_from_table(idt_table, apic_idts, ARRAY_SIZE(apic_idts), true); for_each_clear_bit_from(i, system_vectors, FIRST_SYSTEM_VECTOR) { - entry = irq_entries_start + 8 * (i - FIRST_EXTERNAL_VECTOR); + entry = irq_entries_start + IDT_ALIGN * (i - FIRST_EXTERNAL_VECTOR); set_intr_gate(i, entry); } #ifdef CONFIG_X86_LOCAL_APIC for_each_clear_bit_from(i, system_vectors, NR_VECTORS) { - set_bit(i, system_vectors); - entry = spurious_entries_start + 8 * (i - FIRST_SYSTEM_VECTOR); + /* + * Don't set the non assigned system vectors in the + * system_vectors bitmap. Otherwise they show up in + * /proc/interrupts. + */ + entry = spurious_entries_start + IDT_ALIGN * (i - FIRST_SYSTEM_VECTOR); set_intr_gate(i, entry); } #endif + /* Map IDT into CPU entry area and reload it. 
*/ + idt_map_in_cea(); + load_idt(&idt_descr); + + /* Make the IDT table read only */ + set_memory_ro((unsigned long)&idt_table, 1); + + idt_setup_done = true; } /** @@ -343,25 +323,22 @@ void __init idt_setup_early_handler(void) /** * idt_invalidate - Invalidate interrupt descriptor table - * @addr: The virtual address of the 'invalid' IDT */ -void idt_invalidate(void *addr) +void idt_invalidate(void) { - struct desc_ptr idt = { .address = (unsigned long) addr, .size = 0 }; + static const struct desc_ptr idt = { .address = 0, .size = 0 }; load_idt(&idt); } -void __init update_intr_gate(unsigned int n, const void *addr) +void __init alloc_intr_gate(unsigned int n, const void *addr) { - if (WARN_ON_ONCE(!test_bit(n, system_vectors))) + if (WARN_ON(n < FIRST_SYSTEM_VECTOR)) return; - set_intr_gate(n, addr); -} -void alloc_intr_gate(unsigned int n, const void *addr) -{ - BUG_ON(n < FIRST_SYSTEM_VECTOR); - if (!test_and_set_bit(n, system_vectors)) + if (WARN_ON(idt_setup_done)) + return; + + if (!WARN_ON(test_and_set_bit(n, system_vectors))) set_intr_gate(n, addr); } diff --git a/arch/x86/kernel/ima_arch.c b/arch/x86/kernel/ima_arch.c deleted file mode 100644 index 23054909c8dd..000000000000 --- a/arch/x86/kernel/ima_arch.c +++ /dev/null @@ -1,94 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0+ */ -/* - * Copyright (C) 2018 IBM Corporation - */ -#include <linux/efi.h> -#include <linux/module.h> -#include <linux/ima.h> - -extern struct boot_params boot_params; - -static enum efi_secureboot_mode get_sb_mode(void) -{ - efi_guid_t efi_variable_guid = EFI_GLOBAL_VARIABLE_GUID; - efi_status_t status; - unsigned long size; - u8 secboot, setupmode; - - size = sizeof(secboot); - - if (!efi_enabled(EFI_RUNTIME_SERVICES)) { - pr_info("ima: secureboot mode unknown, no efi\n"); - return efi_secureboot_mode_unknown; - } - - /* Get variable contents into buffer */ - status = efi.get_variable(L"SecureBoot", &efi_variable_guid, - NULL, &size, &secboot); - if (status == EFI_NOT_FOUND) { - 
pr_info("ima: secureboot mode disabled\n"); - return efi_secureboot_mode_disabled; - } - - if (status != EFI_SUCCESS) { - pr_info("ima: secureboot mode unknown\n"); - return efi_secureboot_mode_unknown; - } - - size = sizeof(setupmode); - status = efi.get_variable(L"SetupMode", &efi_variable_guid, - NULL, &size, &setupmode); - - if (status != EFI_SUCCESS) /* ignore unknown SetupMode */ - setupmode = 0; - - if (secboot == 0 || setupmode == 1) { - pr_info("ima: secureboot mode disabled\n"); - return efi_secureboot_mode_disabled; - } - - pr_info("ima: secureboot mode enabled\n"); - return efi_secureboot_mode_enabled; -} - -bool arch_ima_get_secureboot(void) -{ - static enum efi_secureboot_mode sb_mode; - static bool initialized; - - if (!initialized && efi_enabled(EFI_BOOT)) { - sb_mode = boot_params.secure_boot; - - if (sb_mode == efi_secureboot_mode_unset) - sb_mode = get_sb_mode(); - initialized = true; - } - - if (sb_mode == efi_secureboot_mode_enabled) - return true; - else - return false; -} - -/* secureboot arch rules */ -static const char * const sb_arch_rules[] = { -#if !IS_ENABLED(CONFIG_KEXEC_SIG) - "appraise func=KEXEC_KERNEL_CHECK appraise_type=imasig", -#endif /* CONFIG_KEXEC_SIG */ - "measure func=KEXEC_KERNEL_CHECK", -#if !IS_ENABLED(CONFIG_MODULE_SIG) - "appraise func=MODULE_CHECK appraise_type=imasig", -#endif - "measure func=MODULE_CHECK", - NULL -}; - -const char * const *arch_get_ima_policy(void) -{ - if (IS_ENABLED(CONFIG_IMA_ARCH_POLICY) && arch_ima_get_secureboot()) { - if (IS_ENABLED(CONFIG_MODULE_SIG)) - set_module_sig_enforced(); - return sb_arch_rules; - } - return NULL; -} diff --git a/arch/x86/kernel/ioport.c b/arch/x86/kernel/ioport.c index 8abeee0dd7bf..e2fab3ceb09f 100644 --- a/arch/x86/kernel/ioport.c +++ b/arch/x86/kernel/ioport.c @@ -13,6 +13,7 @@ #include <asm/io_bitmap.h> #include <asm/desc.h> +#include <asm/syscalls.h> #ifdef CONFIG_X86_IOPL_IOPERM @@ -32,15 +33,15 @@ void io_bitmap_share(struct task_struct *tsk) 
set_tsk_thread_flag(tsk, TIF_IO_BITMAP); } -static void task_update_io_bitmap(void) +static void task_update_io_bitmap(struct task_struct *tsk) { - struct thread_struct *t = &current->thread; + struct thread_struct *t = &tsk->thread; if (t->iopl_emul == 3 || t->io_bitmap) { /* TSS update is handled on exit to user space */ - set_thread_flag(TIF_IO_BITMAP); + set_tsk_thread_flag(tsk, TIF_IO_BITMAP); } else { - clear_thread_flag(TIF_IO_BITMAP); + clear_tsk_thread_flag(tsk, TIF_IO_BITMAP); /* Invalidate TSS */ preempt_disable(); tss_update_io_bitmap(); @@ -48,12 +49,12 @@ static void task_update_io_bitmap(void) } } -void io_bitmap_exit(void) +void io_bitmap_exit(struct task_struct *tsk) { - struct io_bitmap *iobm = current->thread.io_bitmap; + struct io_bitmap *iobm = tsk->thread.io_bitmap; - current->thread.io_bitmap = NULL; - task_update_io_bitmap(); + tsk->thread.io_bitmap = NULL; + task_update_io_bitmap(tsk); if (iobm && refcount_dec_and_test(&iobm->refcnt)) kfree(iobm); } @@ -101,7 +102,7 @@ long ksys_ioperm(unsigned long from, unsigned long num, int turn_on) if (!iobm) return -ENOMEM; refcount_set(&iobm->refcnt, 1); - io_bitmap_exit(); + io_bitmap_exit(current); } /* @@ -133,7 +134,7 @@ long ksys_ioperm(unsigned long from, unsigned long num, int turn_on) } /* All permissions dropped? 
*/ if (max_long == UINT_MAX) { - io_bitmap_exit(); + io_bitmap_exit(current); return 0; } @@ -191,7 +192,7 @@ SYSCALL_DEFINE1(iopl, unsigned int, level) } t->iopl_emul = level; - task_update_io_bitmap(); + task_update_io_bitmap(current); return 0; } diff --git a/arch/x86/kernel/irq.c b/arch/x86/kernel/irq.c index 21efee32e2b1..766ffe3ba313 100644 --- a/arch/x86/kernel/irq.c +++ b/arch/x86/kernel/irq.c @@ -13,12 +13,15 @@ #include <linux/export.h> #include <linux/irq.h> +#include <asm/irq_stack.h> #include <asm/apic.h> #include <asm/io_apic.h> #include <asm/irq.h> #include <asm/mce.h> #include <asm/hw_irq.h> #include <asm/desc.h> +#include <asm/traps.h> +#include <asm/thermal.h> #define CREATE_TRACE_POINTS #include <asm/trace/irq_vectors.h> @@ -26,9 +29,6 @@ DEFINE_PER_CPU_SHARED_ALIGNED(irq_cpustat_t, irq_stat); EXPORT_PER_CPU_SYMBOL(irq_stat); -DEFINE_PER_CPU(struct pt_regs *, irq_regs); -EXPORT_PER_CPU_SYMBOL(irq_regs); - atomic_t irq_err_count; /* @@ -224,35 +224,35 @@ u64 arch_irq_stat(void) return sum; } +static __always_inline void handle_irq(struct irq_desc *desc, + struct pt_regs *regs) +{ + if (IS_ENABLED(CONFIG_X86_64)) + generic_handle_irq_desc(desc); + else + __handle_irq(desc, regs); +} /* - * do_IRQ handles all normal device IRQ's (the special - * SMP cross-CPU interrupts have their own specific - * handlers). + * common_interrupt() handles all normal device IRQ's (the special SMP + * cross-CPU interrupts have their own entry points). */ -__visible unsigned int __irq_entry do_IRQ(struct pt_regs *regs) +DEFINE_IDTENTRY_IRQ(common_interrupt) { struct pt_regs *old_regs = set_irq_regs(regs); - struct irq_desc * desc; - /* high bit used in ret_from_ code */ - unsigned vector = ~regs->orig_ax; - - entering_irq(); + struct irq_desc *desc; - /* entering_irq() tells RCU that we're not quiescent. Check it. */ + /* entry code tells RCU that we're not quiescent. Check it. 
*/ RCU_LOCKDEP_WARN(!rcu_is_watching(), "IRQ failed to wake up RCU"); desc = __this_cpu_read(vector_irq[vector]); if (likely(!IS_ERR_OR_NULL(desc))) { - if (IS_ENABLED(CONFIG_X86_32)) - handle_irq(desc, regs); - else - generic_handle_irq_desc(desc); + handle_irq(desc, regs); } else { ack_APIC_irq(); if (desc == VECTOR_UNUSED) { - pr_emerg_ratelimited("%s: %d.%d No irq handler for vector\n", + pr_emerg_ratelimited("%s: %d.%u No irq handler for vector\n", __func__, smp_processor_id(), vector); } else { @@ -260,10 +260,7 @@ __visible unsigned int __irq_entry do_IRQ(struct pt_regs *regs) } } - exiting_irq(); - set_irq_regs(old_regs); - return 1; } #ifdef CONFIG_X86_LOCAL_APIC @@ -272,17 +269,16 @@ void (*x86_platform_ipi_callback)(void) = NULL; /* * Handler for X86_PLATFORM_IPI_VECTOR. */ -__visible void __irq_entry smp_x86_platform_ipi(struct pt_regs *regs) +DEFINE_IDTENTRY_SYSVEC(sysvec_x86_platform_ipi) { struct pt_regs *old_regs = set_irq_regs(regs); - entering_ack_irq(); + ack_APIC_irq(); trace_x86_platform_ipi_entry(X86_PLATFORM_IPI_VECTOR); inc_irq_stat(x86_platform_ipis); if (x86_platform_ipi_callback) x86_platform_ipi_callback(); trace_x86_platform_ipi_exit(X86_PLATFORM_IPI_VECTOR); - exiting_irq(); set_irq_regs(old_regs); } #endif @@ -295,49 +291,39 @@ void kvm_set_posted_intr_wakeup_handler(void (*handler)(void)) { if (handler) kvm_posted_intr_wakeup_handler = handler; - else + else { kvm_posted_intr_wakeup_handler = dummy_handler; + synchronize_rcu(); + } } EXPORT_SYMBOL_GPL(kvm_set_posted_intr_wakeup_handler); /* * Handler for POSTED_INTERRUPT_VECTOR. */ -__visible void smp_kvm_posted_intr_ipi(struct pt_regs *regs) +DEFINE_IDTENTRY_SYSVEC_SIMPLE(sysvec_kvm_posted_intr_ipi) { - struct pt_regs *old_regs = set_irq_regs(regs); - - entering_ack_irq(); + ack_APIC_irq(); inc_irq_stat(kvm_posted_intr_ipis); - exiting_irq(); - set_irq_regs(old_regs); } /* * Handler for POSTED_INTERRUPT_WAKEUP_VECTOR. 
*/ -__visible void smp_kvm_posted_intr_wakeup_ipi(struct pt_regs *regs) +DEFINE_IDTENTRY_SYSVEC(sysvec_kvm_posted_intr_wakeup_ipi) { - struct pt_regs *old_regs = set_irq_regs(regs); - - entering_ack_irq(); + ack_APIC_irq(); inc_irq_stat(kvm_posted_intr_wakeup_ipis); kvm_posted_intr_wakeup_handler(); - exiting_irq(); - set_irq_regs(old_regs); } /* * Handler for POSTED_INTERRUPT_NESTED_VECTOR. */ -__visible void smp_kvm_posted_intr_nested_ipi(struct pt_regs *regs) +DEFINE_IDTENTRY_SYSVEC_SIMPLE(sysvec_kvm_posted_intr_nested_ipi) { - struct pt_regs *old_regs = set_irq_regs(regs); - - entering_ack_irq(); + ack_APIC_irq(); inc_irq_stat(kvm_posted_intr_nested_ipis); - exiting_irq(); - set_irq_regs(old_regs); } #endif @@ -354,7 +340,7 @@ void fixup_irqs(void) irq_migrate_all_off_this_cpu(); /* - * We can remove mdelay() and then send spuriuous interrupts to + * We can remove mdelay() and then send spurious interrupts to * new cpu targets for all the irqs that were handled previously by * this cpu. While it works, I have seen spurious interrupt messages * (nothing wrong but still...). 
@@ -391,3 +377,23 @@ void fixup_irqs(void) } } #endif + +#ifdef CONFIG_X86_THERMAL_VECTOR +static void smp_thermal_vector(void) +{ + if (x86_thermal_enabled()) + intel_thermal_interrupt(); + else + pr_err("CPU%d: Unexpected LVT thermal interrupt!\n", + smp_processor_id()); +} + +DEFINE_IDTENTRY_SYSVEC(sysvec_thermal) +{ + trace_thermal_apic_entry(THERMAL_APIC_VECTOR); + inc_irq_stat(irq_thermal_count); + smp_thermal_vector(); + trace_thermal_apic_exit(THERMAL_APIC_VECTOR); + ack_APIC_irq(); +} +#endif diff --git a/arch/x86/kernel/irq_32.c b/arch/x86/kernel/irq_32.c index a759ca97cd01..01833ebf5e8e 100644 --- a/arch/x86/kernel/irq_32.c +++ b/arch/x86/kernel/irq_32.c @@ -22,6 +22,7 @@ #include <asm/apic.h> #include <asm/nospec-branch.h> +#include <asm/softirq_stack.h> #ifdef CONFIG_DEBUG_STACKOVERFLOW @@ -131,6 +132,7 @@ int irq_init_percpu_irqstack(unsigned int cpu) return 0; } +#ifdef CONFIG_SOFTIRQ_ON_OWN_STACK void do_softirq_own_stack(void) { struct irq_stack *irqstk; @@ -147,8 +149,9 @@ void do_softirq_own_stack(void) call_on_stack(__do_softirq, isp); } +#endif -void handle_irq(struct irq_desc *desc, struct pt_regs *regs) +void __handle_irq(struct irq_desc *desc, struct pt_regs *regs) { int overflow = check_stack_overflow(); diff --git a/arch/x86/kernel/irq_64.c b/arch/x86/kernel/irq_64.c index 12df3a4abfdd..1c0fb96b9e39 100644 --- a/arch/x86/kernel/irq_64.c +++ b/arch/x86/kernel/irq_64.c @@ -20,6 +20,8 @@ #include <linux/sched/task_stack.h> #include <asm/cpu_entry_area.h> +#include <asm/softirq_stack.h> +#include <asm/irq_stack.h> #include <asm/io_apic.h> #include <asm/apic.h> @@ -43,11 +45,12 @@ static int map_irq_stack(unsigned int cpu) pages[i] = pfn_to_page(pa >> PAGE_SHIFT); } - va = vmap(pages, IRQ_STACK_SIZE / PAGE_SIZE, GFP_KERNEL, PAGE_KERNEL); + va = vmap(pages, IRQ_STACK_SIZE / PAGE_SIZE, VM_MAP, PAGE_KERNEL); if (!va) return -ENOMEM; - per_cpu(hardirq_stack_ptr, cpu) = va + IRQ_STACK_SIZE; + /* Store actual TOS to avoid adjustment in the hotpath */ 
+ per_cpu(hardirq_stack_ptr, cpu) = va + IRQ_STACK_SIZE - 8; return 0; } #else @@ -59,7 +62,8 @@ static int map_irq_stack(unsigned int cpu) { void *va = per_cpu_ptr(&irq_stack_backing_store, cpu); - per_cpu(hardirq_stack_ptr, cpu) = va + IRQ_STACK_SIZE; + /* Store actual TOS to avoid adjustment in the hotpath */ + per_cpu(hardirq_stack_ptr, cpu) = va + IRQ_STACK_SIZE - 8; return 0; } #endif diff --git a/arch/x86/kernel/irq_work.c b/arch/x86/kernel/irq_work.c index 80bee7695a20..890d4778cd35 100644 --- a/arch/x86/kernel/irq_work.c +++ b/arch/x86/kernel/irq_work.c @@ -9,18 +9,18 @@ #include <linux/irq_work.h> #include <linux/hardirq.h> #include <asm/apic.h> +#include <asm/idtentry.h> #include <asm/trace/irq_vectors.h> #include <linux/interrupt.h> #ifdef CONFIG_X86_LOCAL_APIC -__visible void __irq_entry smp_irq_work_interrupt(struct pt_regs *regs) +DEFINE_IDTENTRY_SYSVEC(sysvec_irq_work) { - ipi_entering_ack_irq(); + ack_APIC_irq(); trace_irq_work_entry(IRQ_WORK_VECTOR); inc_irq_stat(apic_irq_work_irqs); irq_work_run(); trace_irq_work_exit(IRQ_WORK_VECTOR); - exiting_irq(); } void arch_irq_work_raise(void) diff --git a/arch/x86/kernel/irqflags.S b/arch/x86/kernel/irqflags.S index 0db0375235b4..aaf9e776f323 100644 --- a/arch/x86/kernel/irqflags.S +++ b/arch/x86/kernel/irqflags.S @@ -7,20 +7,11 @@ /* * unsigned long native_save_fl(void) */ +.pushsection .noinstr.text, "ax" SYM_FUNC_START(native_save_fl) pushf pop %_ASM_AX - ret + RET SYM_FUNC_END(native_save_fl) +.popsection EXPORT_SYMBOL(native_save_fl) - -/* - * void native_restore_fl(unsigned long flags) - * %eax/%rdi: flags - */ -SYM_FUNC_START(native_restore_fl) - push %_ASM_ARG1 - popf - ret -SYM_FUNC_END(native_restore_fl) -EXPORT_SYMBOL(native_restore_fl) diff --git a/arch/x86/kernel/irqinit.c b/arch/x86/kernel/irqinit.c index 16919a9671fa..beb1bada1b0a 100644 --- a/arch/x86/kernel/irqinit.c +++ b/arch/x86/kernel/irqinit.c @@ -16,12 +16,14 @@ #include <linux/acpi.h> #include <linux/io.h> #include <linux/delay.h> 
+#include <linux/pgtable.h> #include <linux/atomic.h> #include <asm/timer.h> #include <asm/hw_irq.h> -#include <asm/pgtable.h> #include <asm/desc.h> +#include <asm/io_apic.h> +#include <asm/acpi.h> #include <asm/apic.h> #include <asm/setup.h> #include <asm/i8259.h> @@ -44,15 +46,6 @@ * (these are usually mapped into the 0x30-0xff vector range) */ -/* - * IRQ2 is cascade interrupt to second interrupt controller - */ -static struct irqaction irq2 = { - .handler = no_action, - .name = "cascade", - .flags = IRQF_NO_THREAD, -}; - DEFINE_PER_CPU(vector_irq_t, vector_irq) = { [0 ... NR_VECTORS - 1] = VECTOR_UNUSED, }; @@ -84,7 +77,7 @@ void __init init_IRQ(void) * On cpu 0, Assign ISA_IRQ_VECTOR(irq) to IRQ 0..15. * If these IRQ's are handled by legacy interrupt-controllers like PIC, * then this configuration will likely be static after the boot. If - * these IRQ's are handled by more mordern controllers like IO-APIC, + * these IRQs are handled by more modern controllers like IO-APIC, * then this vector space can be freed and re-used dynamically as the * irq's migrate etc. 
*/ @@ -104,6 +97,9 @@ void __init native_init_IRQ(void) idt_setup_apic_and_irq_gates(); lapic_assign_system_vectors(); - if (!acpi_ioapic && !of_ioapic && nr_legacy_irqs()) - setup_irq(2, &irq2); + if (!acpi_ioapic && !of_ioapic && nr_legacy_irqs()) { + /* IRQ2 is cascade interrupt to second interrupt controller */ + if (request_irq(2, no_action, IRQF_NO_THREAD, "cascade", NULL)) + pr_err("%s: request_irq() failed\n", "cascade"); + } } diff --git a/arch/x86/kernel/itmt.c b/arch/x86/kernel/itmt.c index 1cb3ca9bba49..9ff480e94511 100644 --- a/arch/x86/kernel/itmt.c +++ b/arch/x86/kernel/itmt.c @@ -39,8 +39,7 @@ static bool __read_mostly sched_itmt_capable; unsigned int __read_mostly sysctl_sched_itmt_enabled; static int sched_itmt_update_handler(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, - loff_t *ppos) + void *buffer, size_t *lenp, loff_t *ppos) { unsigned int old_sysctl; int ret; @@ -199,7 +198,7 @@ void sched_set_itmt_core_prio(int prio, int core_cpu) * of the priority chain and only used when * all other high priority cpus are out of capacity. 
*/ - smt_prio = prio * smp_num_siblings / i; + smt_prio = prio * smp_num_siblings / (i * i); per_cpu(sched_core_priority, cpu) = smt_prio; i++; } diff --git a/arch/x86/kernel/jailhouse.c b/arch/x86/kernel/jailhouse.c index 6eb8b50ea07e..4eb8f2d19a87 100644 --- a/arch/x86/kernel/jailhouse.c +++ b/arch/x86/kernel/jailhouse.c @@ -13,6 +13,8 @@ #include <linux/reboot.h> #include <linux/serial_8250.h> #include <asm/apic.h> +#include <asm/io_apic.h> +#include <asm/acpi.h> #include <asm/cpu.h> #include <asm/hypervisor.h> #include <asm/i8259.h> diff --git a/arch/x86/kernel/jump_label.c b/arch/x86/kernel/jump_label.c index 9c4498ea0b3c..f5b8ef02d172 100644 --- a/arch/x86/kernel/jump_label.c +++ b/arch/x86/kernel/jump_label.c @@ -15,54 +15,76 @@ #include <asm/kprobes.h> #include <asm/alternative.h> #include <asm/text-patching.h> +#include <asm/insn.h> -static void bug_at(const void *ip, int line) +int arch_jump_entry_size(struct jump_entry *entry) { - /* - * The location is not an op that we were expecting. - * Something went wrong. Crash the box, as something could be - * corrupting the kernel. 
- */ - pr_crit("jump_label: Fatal kernel bug, unexpected op at %pS [%p] (%5ph) %d\n", ip, ip, ip, line); - BUG(); + struct insn insn = {}; + + insn_decode_kernel(&insn, (void *)jump_entry_code(entry)); + BUG_ON(insn.length != 2 && insn.length != 5); + + return insn.length; } -static const void * -__jump_label_set_jump_code(struct jump_entry *entry, enum jump_label_type type, int init) +struct jump_label_patch { + const void *code; + int size; +}; + +static struct jump_label_patch +__jump_label_patch(struct jump_entry *entry, enum jump_label_type type) { - const unsigned char default_nop[] = { STATIC_KEY_INIT_NOP }; - const unsigned char *ideal_nop = ideal_nops[NOP_ATOMIC5]; - const void *expect, *code; + const void *expect, *code, *nop; const void *addr, *dest; - int line; + int size; addr = (void *)jump_entry_code(entry); dest = (void *)jump_entry_target(entry); - code = text_gen_insn(JMP32_INSN_OPCODE, addr, dest); + size = arch_jump_entry_size(entry); + switch (size) { + case JMP8_INSN_SIZE: + code = text_gen_insn(JMP8_INSN_OPCODE, addr, dest); + nop = x86_nops[size]; + break; - if (init) { - expect = default_nop; line = __LINE__; - } else if (type == JUMP_LABEL_JMP) { - expect = ideal_nop; line = __LINE__; - } else { - expect = code; line = __LINE__; + case JMP32_INSN_SIZE: + code = text_gen_insn(JMP32_INSN_OPCODE, addr, dest); + nop = x86_nops[size]; + break; + + default: BUG(); } - if (memcmp(addr, expect, JUMP_LABEL_NOP_SIZE)) - bug_at(addr, line); + if (type == JUMP_LABEL_JMP) + expect = nop; + else + expect = code; + + if (memcmp(addr, expect, size)) { + /* + * The location is not an op that we were expecting. + * Something went wrong. Crash the box, as something could be + * corrupting the kernel. 
+ */ + pr_crit("jump_label: Fatal kernel bug, unexpected op at %pS [%p] (%5ph != %5ph)) size:%d type:%d\n", + addr, addr, addr, expect, size, type); + BUG(); + } if (type == JUMP_LABEL_NOP) - code = ideal_nop; + code = nop; - return code; + return (struct jump_label_patch){.code = code, .size = size}; } -static void inline __jump_label_transform(struct jump_entry *entry, - enum jump_label_type type, - int init) +static __always_inline void +__jump_label_transform(struct jump_entry *entry, + enum jump_label_type type, + int init) { - const void *opcode = __jump_label_set_jump_code(entry, type, init); + const struct jump_label_patch jlp = __jump_label_patch(entry, type); /* * As long as only a single processor is running and the code is still @@ -76,12 +98,11 @@ static void inline __jump_label_transform(struct jump_entry *entry, * always nop being the 'currently valid' instruction */ if (init || system_state == SYSTEM_BOOTING) { - text_poke_early((void *)jump_entry_code(entry), opcode, - JUMP_LABEL_NOP_SIZE); + text_poke_early((void *)jump_entry_code(entry), jlp.code, jlp.size); return; } - text_poke_bp((void *)jump_entry_code(entry), opcode, JUMP_LABEL_NOP_SIZE, NULL); + text_poke_bp((void *)jump_entry_code(entry), jlp.code, jlp.size, NULL); } static void __ref jump_label_transform(struct jump_entry *entry, @@ -102,7 +123,7 @@ void arch_jump_label_transform(struct jump_entry *entry, bool arch_jump_label_transform_queue(struct jump_entry *entry, enum jump_label_type type) { - const void *opcode; + struct jump_label_patch jlp; if (system_state == SYSTEM_BOOTING) { /* @@ -113,9 +134,8 @@ bool arch_jump_label_transform_queue(struct jump_entry *entry, } mutex_lock(&text_mutex); - opcode = __jump_label_set_jump_code(entry, type, 0); - text_poke_queue((void *)jump_entry_code(entry), - opcode, JUMP_LABEL_NOP_SIZE, NULL); + jlp = __jump_label_patch(entry, type); + text_poke_queue((void *)jump_entry_code(entry), jlp.code, jlp.size, NULL); mutex_unlock(&text_mutex); return 
true; } @@ -126,32 +146,3 @@ void arch_jump_label_transform_apply(void) text_poke_finish(); mutex_unlock(&text_mutex); } - -static enum { - JL_STATE_START, - JL_STATE_NO_UPDATE, - JL_STATE_UPDATE, -} jlstate __initdata_or_module = JL_STATE_START; - -__init_or_module void arch_jump_label_transform_static(struct jump_entry *entry, - enum jump_label_type type) -{ - /* - * This function is called at boot up and when modules are - * first loaded. Check if the default nop, the one that is - * inserted at compile time, is the ideal nop. If it is, then - * we do not need to update the nop, and we can leave it as is. - * If it is not, then we need to update the nop to the ideal nop. - */ - if (jlstate == JL_STATE_START) { - const unsigned char default_nop[] = { STATIC_KEY_INIT_NOP }; - const unsigned char *ideal_nop = ideal_nops[NOP_ATOMIC5]; - - if (memcmp(ideal_nop, default_nop, 5) != 0) - jlstate = JL_STATE_UPDATE; - else - jlstate = JL_STATE_NO_UPDATE; - } - if (jlstate == JL_STATE_UPDATE) - jump_label_transform(entry, type, 1); -} diff --git a/arch/x86/kernel/kdebugfs.c b/arch/x86/kernel/kdebugfs.c index 64b6da95af98..e2e89bebcbc3 100644 --- a/arch/x86/kernel/kdebugfs.c +++ b/arch/x86/kernel/kdebugfs.c @@ -88,11 +88,13 @@ create_setup_data_node(struct dentry *parent, int no, static int __init create_setup_data_nodes(struct dentry *parent) { + struct setup_indirect *indirect; struct setup_data_node *node; struct setup_data *data; - int error; + u64 pa_data, pa_next; struct dentry *d; - u64 pa_data; + int error; + u32 len; int no = 0; d = debugfs_create_dir("setup_data", parent); @@ -112,12 +114,29 @@ static int __init create_setup_data_nodes(struct dentry *parent) error = -ENOMEM; goto err_dir; } - - if (data->type == SETUP_INDIRECT && - ((struct setup_indirect *)data->data)->type != SETUP_INDIRECT) { - node->paddr = ((struct setup_indirect *)data->data)->addr; - node->type = ((struct setup_indirect *)data->data)->type; - node->len = ((struct setup_indirect 
*)data->data)->len; + pa_next = data->next; + + if (data->type == SETUP_INDIRECT) { + len = sizeof(*data) + data->len; + memunmap(data); + data = memremap(pa_data, len, MEMREMAP_WB); + if (!data) { + kfree(node); + error = -ENOMEM; + goto err_dir; + } + + indirect = (struct setup_indirect *)data->data; + + if (indirect->type != SETUP_INDIRECT) { + node->paddr = indirect->addr; + node->type = indirect->type; + node->len = indirect->len; + } else { + node->paddr = pa_data; + node->type = data->type; + node->len = data->len; + } } else { node->paddr = pa_data; node->type = data->type; @@ -125,7 +144,7 @@ static int __init create_setup_data_nodes(struct dentry *parent) } create_setup_data_node(d, no, node); - pa_data = data->next; + pa_data = pa_next; memunmap(data); no++; diff --git a/arch/x86/kernel/kexec-bzimage64.c b/arch/x86/kernel/kexec-bzimage64.c index f293d872602a..6b58610a1552 100644 --- a/arch/x86/kernel/kexec-bzimage64.c +++ b/arch/x86/kernel/kexec-bzimage64.c @@ -17,7 +17,7 @@ #include <linux/kernel.h> #include <linux/mm.h> #include <linux/efi.h> -#include <linux/verification.h> +#include <linux/random.h> #include <asm/bootparam.h> #include <asm/setup.h> @@ -75,7 +75,7 @@ static int setup_cmdline(struct kimage *image, struct boot_params *params, if (image->type == KEXEC_TYPE_CRASH) { len = sprintf(cmdline_ptr, - "elfcorehdr=0x%lx ", image->arch.elf_load_addr); + "elfcorehdr=0x%lx ", image->elf_load_addr); } memcpy(cmdline_ptr + len, cmdline, cmdline_len); cmdline_len += len; @@ -110,6 +110,26 @@ static int setup_e820_entries(struct boot_params *params) return 0; } +enum { RNG_SEED_LENGTH = 32 }; + +static void +setup_rng_seed(struct boot_params *params, unsigned long params_load_addr, + unsigned int rng_seed_setup_data_offset) +{ + struct setup_data *sd = (void *)params + rng_seed_setup_data_offset; + unsigned long setup_data_phys; + + if (!rng_is_initialized()) + return; + + sd->type = SETUP_RNG_SEED; + sd->len = RNG_SEED_LENGTH; + 
get_random_bytes(sd->data, RNG_SEED_LENGTH); + setup_data_phys = params_load_addr + rng_seed_setup_data_offset; + sd->next = params->hdr.setup_data; + params->hdr.setup_data = setup_data_phys; +} + #ifdef CONFIG_EFI static int setup_efi_info_memmap(struct boot_params *params, unsigned long params_load_addr, @@ -141,9 +161,8 @@ prepare_add_efi_setup_data(struct boot_params *params, struct setup_data *sd = (void *)params + efi_setup_data_offset; struct efi_setup_data *esd = (void *)sd + sizeof(struct setup_data); - esd->fw_vendor = efi.fw_vendor; - esd->runtime = efi.runtime; - esd->tables = efi.config_table; + esd->fw_vendor = efi_fw_vendor; + esd->tables = efi_config_table; esd->smbios = efi.smbios; sd->type = SETUP_EFI; @@ -171,15 +190,6 @@ setup_efi_state(struct boot_params *params, unsigned long params_load_addr, if (!current_ei->efi_memmap_size) return 0; - /* - * If 1:1 mapping is not enabled, second kernel can not setup EFI - * and use EFI run time services. User space will have to pass - * acpi_rsdp=<addr> on kernel command line to make second kernel boot - * without efi. 
- */ - if (efi_have_uv1_memmap()) - return 0; - params->secure_boot = boot_params.secure_boot; ei->efi_loader_signature = current_ei->efi_loader_signature; ei->efi_systab = current_ei->efi_systab; @@ -196,11 +206,38 @@ setup_efi_state(struct boot_params *params, unsigned long params_load_addr, } #endif /* CONFIG_EFI */ +static void +setup_ima_state(const struct kimage *image, struct boot_params *params, + unsigned long params_load_addr, + unsigned int ima_setup_data_offset) +{ +#ifdef CONFIG_IMA_KEXEC + struct setup_data *sd = (void *)params + ima_setup_data_offset; + unsigned long setup_data_phys; + struct ima_setup_data *ima; + + if (!image->ima_buffer_size) + return; + + sd->type = SETUP_IMA; + sd->len = sizeof(*ima); + + ima = (void *)sd + sizeof(struct setup_data); + ima->addr = image->ima_buffer_addr; + ima->size = image->ima_buffer_size; + + /* Add setup data */ + setup_data_phys = params_load_addr + ima_setup_data_offset; + sd->next = params->hdr.setup_data; + params->hdr.setup_data = setup_data_phys; +#endif /* CONFIG_IMA_KEXEC */ +} + static int setup_boot_parameters(struct kimage *image, struct boot_params *params, unsigned long params_load_addr, unsigned int efi_map_offset, unsigned int efi_map_sz, - unsigned int efi_setup_data_offset) + unsigned int setup_data_offset) { unsigned int nr_e820_entries; unsigned long long mem_k, start, end; @@ -210,8 +247,7 @@ setup_boot_parameters(struct kimage *image, struct boot_params *params, params->hdr.hardware_subarch = boot_params.hdr.hardware_subarch; /* Copying screen_info will do? 
*/ - memcpy(&params->screen_info, &boot_params.screen_info, - sizeof(struct screen_info)); + memcpy(&params->screen_info, &screen_info, sizeof(struct screen_info)); /* Fill in memsize later */ params->screen_info.ext_mem_k = 0; @@ -256,8 +292,22 @@ setup_boot_parameters(struct kimage *image, struct boot_params *params, #ifdef CONFIG_EFI /* Setup EFI state */ setup_efi_state(params, params_load_addr, efi_map_offset, efi_map_sz, - efi_setup_data_offset); + setup_data_offset); + setup_data_offset += sizeof(struct setup_data) + + sizeof(struct efi_setup_data); #endif + + if (IS_ENABLED(CONFIG_IMA_KEXEC)) { + /* Setup IMA log buffer state */ + setup_ima_state(image, params, params_load_addr, + setup_data_offset); + setup_data_offset += sizeof(struct setup_data) + + sizeof(struct ima_setup_data); + } + + /* Setup RNG seed */ + setup_rng_seed(params, params_load_addr, setup_data_offset); + /* Setup EDD info */ memcpy(params->eddbuf, boot_params.eddbuf, EDDMAXNR * sizeof(struct edd_info)); @@ -412,7 +462,13 @@ static void *bzImage64_load(struct kimage *image, char *kernel, params_cmdline_sz = ALIGN(params_cmdline_sz, 16); kbuf.bufsz = params_cmdline_sz + ALIGN(efi_map_sz, 16) + sizeof(struct setup_data) + - sizeof(struct efi_setup_data); + sizeof(struct efi_setup_data) + + sizeof(struct setup_data) + + RNG_SEED_LENGTH; + + if (IS_ENABLED(CONFIG_IMA_KEXEC)) + kbuf.bufsz += sizeof(struct setup_data) + + sizeof(struct ima_setup_data); params = kzalloc(kbuf.bufsz, GFP_KERNEL); if (!params) @@ -539,28 +595,11 @@ static int bzImage64_cleanup(void *loader_data) return 0; } -#ifdef CONFIG_KEXEC_BZIMAGE_VERIFY_SIG -static int bzImage64_verify_sig(const char *kernel, unsigned long kernel_len) -{ - int ret; - - ret = verify_pefile_signature(kernel, kernel_len, - VERIFY_USE_SECONDARY_KEYRING, - VERIFYING_KEXEC_PE_SIGNATURE); - if (ret == -ENOKEY && IS_ENABLED(CONFIG_INTEGRITY_PLATFORM_KEYRING)) { - ret = verify_pefile_signature(kernel, kernel_len, - VERIFY_USE_PLATFORM_KEYRING, - 
VERIFYING_KEXEC_PE_SIGNATURE); - } - return ret; -} -#endif - const struct kexec_file_ops kexec_bzImage64_ops = { .probe = bzImage64_probe, .load = bzImage64_load, .cleanup = bzImage64_cleanup, #ifdef CONFIG_KEXEC_BZIMAGE_VERIFY_SIG - .verify_sig = bzImage64_verify_sig, + .verify_sig = kexec_kernel_verify_pe_sig, #endif }; diff --git a/arch/x86/kernel/kgdb.c b/arch/x86/kernel/kgdb.c index c44fe7d8d9a4..3a43a2dee658 100644 --- a/arch/x86/kernel/kgdb.c +++ b/arch/x86/kernel/kgdb.c @@ -17,7 +17,7 @@ * Updated by: Tom Rini <trini@kernel.crashing.org> * Updated by: Jason Wessel <jason.wessel@windriver.com> * Modified for 386 by Jim Kingdon, Cygnus Support. - * Origianl kgdb, compatibility with 2.1.xx kernel by + * Original kgdb, compatibility with 2.1.xx kernel by * David Grothe <dave@gcom.com> * Integrated into 2.2.5 kernel by Tigran Aivazian <tigran@sco.com> * X86_64 changes from Andi Kleen's patch merged by Jim Houston @@ -450,7 +450,7 @@ int kgdb_arch_handle_exception(int e_vector, int signo, int err_code, ptr = &remcomInBuffer[1]; if (kgdb_hex2long(&ptr, &addr)) linux_regs->ip = addr; - /* fall through */ + fallthrough; case 'D': case 'k': /* clear the trace bit */ @@ -539,7 +539,7 @@ static int __kgdb_notify(struct die_args *args, unsigned long cmd) * a system call which should be ignored */ return NOTIFY_DONE; - /* fall through */ + fallthrough; default: if (user_mode(regs)) return NOTIFY_DONE; @@ -629,9 +629,10 @@ static void kgdb_hw_overflow_handler(struct perf_event *event, struct task_struct *tsk = current; int i; - for (i = 0; i < 4; i++) + for (i = 0; i < 4; i++) { if (breakinfo[i].enabled) - tsk->thread.debugreg6 |= (DR_TRAP0 << i); + tsk->thread.virtual_dr6 |= (DR_TRAP0 << i); + } } void kgdb_arch_late(void) @@ -641,7 +642,7 @@ void kgdb_arch_late(void) struct perf_event **pevent; /* - * Pre-allocate the hw breakpoint structions in the non-atomic + * Pre-allocate the hw breakpoint instructions in the non-atomic * portion of kgdb because this operation 
requires mutexs to * complete. */ @@ -732,11 +733,11 @@ int kgdb_arch_set_breakpoint(struct kgdb_bkpt *bpt) int err; bpt->type = BP_BREAKPOINT; - err = probe_kernel_read(bpt->saved_instr, (char *)bpt->bpt_addr, + err = copy_from_kernel_nofault(bpt->saved_instr, (char *)bpt->bpt_addr, BREAK_INSTR_SIZE); if (err) return err; - err = probe_kernel_write((char *)bpt->bpt_addr, + err = copy_to_kernel_nofault((char *)bpt->bpt_addr, arch_kgdb_ops.gdb_bpt_instr, BREAK_INSTR_SIZE); if (!err) return err; @@ -768,7 +769,7 @@ int kgdb_arch_remove_breakpoint(struct kgdb_bkpt *bpt) return 0; knl_write: - return probe_kernel_write((char *)bpt->bpt_addr, + return copy_to_kernel_nofault((char *)bpt->bpt_addr, (char *)bpt->saved_instr, BREAK_INSTR_SIZE); } diff --git a/arch/x86/kernel/kprobes/common.h b/arch/x86/kernel/kprobes/common.h index 7d3a2e2daf01..c993521d4933 100644 --- a/arch/x86/kernel/kprobes/common.h +++ b/arch/x86/kernel/kprobes/common.h @@ -6,6 +6,7 @@ #include <asm/asm.h> #include <asm/frame.h> +#include <asm/insn.h> #ifdef CONFIG_X86_64 diff --git a/arch/x86/kernel/kprobes/core.c b/arch/x86/kernel/kprobes/core.c index 4d7022a740ab..eb8bc82846b9 100644 --- a/arch/x86/kernel/kprobes/core.c +++ b/arch/x86/kernel/kprobes/core.c @@ -33,32 +33,32 @@ #include <linux/hardirq.h> #include <linux/preempt.h> #include <linux/sched/debug.h> +#include <linux/perf_event.h> #include <linux/extable.h> #include <linux/kdebug.h> #include <linux/kallsyms.h> #include <linux/ftrace.h> -#include <linux/frame.h> #include <linux/kasan.h> #include <linux/moduleloader.h> +#include <linux/objtool.h> #include <linux/vmalloc.h> +#include <linux/pgtable.h> #include <asm/text-patching.h> #include <asm/cacheflush.h> #include <asm/desc.h> -#include <asm/pgtable.h> #include <linux/uaccess.h> #include <asm/alternative.h> #include <asm/insn.h> #include <asm/debugreg.h> #include <asm/set_memory.h> +#include <asm/ibt.h> #include "common.h" DEFINE_PER_CPU(struct kprobe *, current_kprobe) = NULL; 
DEFINE_PER_CPU(struct kprobe_ctlblk, kprobe_ctlblk); -#define stack_addr(regs) ((unsigned long *)regs->sp) - #define W(row, b0, b1, b2, b3, b4, b5, b6, b7, b8, b9, ba, bb, bc, bd, be, bf)\ (((b0##UL << 0x0)|(b1##UL << 0x1)|(b2##UL << 0x2)|(b3##UL << 0x3) | \ (b4##UL << 0x4)|(b5##UL << 0x5)|(b6##UL << 0x6)|(b7##UL << 0x7) | \ @@ -132,32 +132,14 @@ void synthesize_relcall(void *dest, void *from, void *to) NOKPROBE_SYMBOL(synthesize_relcall); /* - * Skip the prefixes of the instruction. - */ -static kprobe_opcode_t *skip_prefixes(kprobe_opcode_t *insn) -{ - insn_attr_t attr; - - attr = inat_get_opcode_attribute((insn_byte_t)*insn); - while (inat_is_legacy_prefix(attr)) { - insn++; - attr = inat_get_opcode_attribute((insn_byte_t)*insn); - } -#ifdef CONFIG_X86_64 - if (inat_is_rex_prefix(attr)) - insn++; -#endif - return insn; -} -NOKPROBE_SYMBOL(skip_prefixes); - -/* * Returns non-zero if INSN is boostable. * RIP relative instructions are adjusted at copying time in 64 bits mode */ int can_boost(struct insn *insn, void *addr) { kprobe_opcode_t opcode; + insn_byte_t prefix; + int i; if (search_exception_tables((unsigned long)addr)) return 0; /* Page fault may occur on this address. 
*/ @@ -170,35 +152,39 @@ int can_boost(struct insn *insn, void *addr) if (insn->opcode.nbytes != 1) return 0; - /* Can't boost Address-size override prefix */ - if (unlikely(inat_is_address_size_prefix(insn->attr))) - return 0; + for_each_insn_prefix(insn, i, prefix) { + insn_attr_t attr; + + attr = inat_get_opcode_attribute(prefix); + /* Can't boost Address-size override prefix and CS override prefix */ + if (prefix == 0x2e || inat_is_address_size_prefix(attr)) + return 0; + } opcode = insn->opcode.bytes[0]; - switch (opcode & 0xf0) { - case 0x60: - /* can't boost "bound" */ - return (opcode != 0x62); - case 0x70: - return 0; /* can't boost conditional jump */ - case 0x90: - return opcode != 0x9a; /* can't boost call far */ - case 0xc0: - /* can't boost software-interruptions */ - return (0xc1 < opcode && opcode < 0xcc) || opcode == 0xcf; - case 0xd0: - /* can boost AA* and XLAT */ - return (opcode == 0xd4 || opcode == 0xd5 || opcode == 0xd7); - case 0xe0: - /* can boost in/out and absolute jmps */ - return ((opcode & 0x04) || opcode == 0xea); - case 0xf0: - /* clear and set flags are boostable */ - return (opcode == 0xf5 || (0xf7 < opcode && opcode < 0xfe)); + switch (opcode) { + case 0x62: /* bound */ + case 0x70 ... 0x7f: /* Conditional jumps */ + case 0x9a: /* Call far */ + case 0xc0 ... 0xc1: /* Grp2 */ + case 0xcc ... 0xce: /* software exceptions */ + case 0xd0 ... 0xd3: /* Grp2 */ + case 0xd6: /* (UD) */ + case 0xd8 ... 0xdf: /* ESC */ + case 0xe0 ... 0xe3: /* LOOP*, JCXZ */ + case 0xe8 ... 0xe9: /* near Call, JMP */ + case 0xeb: /* Short JMP */ + case 0xf0 ... 0xf4: /* LOCK/REP, HLT */ + case 0xf6 ... 0xf7: /* Grp3 */ + case 0xfe: /* Grp4 */ + /* ... 
are not boostable */ + return 0; + case 0xff: /* Grp5 */ + /* Only indirect jmp is boostable */ + return X86_MODRM_REG(insn->modrm.bytes[0]) == 4; default: - /* CS override prefix and call are not boostable */ - return (opcode != 0x2e && opcode != 0x9a); + return 1; } } @@ -206,17 +192,10 @@ static unsigned long __recover_probed_insn(kprobe_opcode_t *buf, unsigned long addr) { struct kprobe *kp; - unsigned long faddr; + bool faddr; kp = get_kprobe((void *)addr); - faddr = ftrace_location(addr); - /* - * Addresses inside the ftrace location are refused by - * arch_check_ftrace_location(). Something went terribly wrong - * if such an address is checked here. - */ - if (WARN_ON(faddr && faddr != addr)) - return 0UL; + faddr = ftrace_location(addr) == addr; /* * Use the current code if it is not modified by Kprobe * and it cannot be modified by ftrace. @@ -243,12 +222,12 @@ __recover_probed_insn(kprobe_opcode_t *buf, unsigned long addr) * Fortunately, we know that the original code is the ideal 5-byte * long NOP. */ - if (probe_kernel_read(buf, (void *)addr, + if (copy_from_kernel_nofault(buf, (void *)addr, MAX_INSN_SIZE * sizeof(kprobe_opcode_t))) return 0UL; if (faddr) - memcpy(buf, ideal_nops[NOP_ATOMIC5], 5); + memcpy(buf, x86_nops[5], 5); else buf[0] = kp->opcode; return (unsigned long)buf; @@ -284,6 +263,8 @@ static int can_probe(unsigned long paddr) /* Decode instructions */ addr = paddr - offset; while (addr < paddr) { + int ret; + /* * Check if the instruction has been modified by another * kprobe, in which case we replace the breakpoint by the @@ -295,8 +276,10 @@ static int can_probe(unsigned long paddr) __addr = recover_probed_instruction(buf, addr); if (!__addr) return 0; - kernel_insn_init(&insn, (void *)__addr, MAX_INSN_SIZE); - insn_get_length(&insn); + + ret = insn_decode_kernel(&insn, (void *)__addr); + if (ret < 0) + return 0; /* * Another debugging subsystem might insert this breakpoint. 
@@ -310,23 +293,20 @@ static int can_probe(unsigned long paddr) return (addr == paddr); } -/* - * Returns non-zero if opcode modifies the interrupt flag. - */ -static int is_IF_modifier(kprobe_opcode_t *insn) +/* If x86 supports IBT (ENDBR) it must be skipped. */ +kprobe_opcode_t *arch_adjust_kprobe_addr(unsigned long addr, unsigned long offset, + bool *on_func_entry) { - /* Skip prefixes */ - insn = skip_prefixes(insn); + if (is_endbr(*(u32 *)addr)) { + *on_func_entry = !offset || offset == 4; + if (*on_func_entry) + offset = 4; - switch (*insn) { - case 0xfa: /* cli */ - case 0xfb: /* sti */ - case 0xcf: /* iret/iretd */ - case 0x9d: /* popf/popfd */ - return 1; + } else { + *on_func_entry = !offset; } - return 0; + return (kprobe_opcode_t *)(addr + offset); } /* @@ -339,18 +319,20 @@ static int is_IF_modifier(kprobe_opcode_t *insn) int __copy_instruction(u8 *dest, u8 *src, u8 *real, struct insn *insn) { kprobe_opcode_t buf[MAX_INSN_SIZE]; - unsigned long recovered_insn = - recover_probed_instruction(buf, (unsigned long)src); + unsigned long recovered_insn = recover_probed_instruction(buf, (unsigned long)src); + int ret; if (!recovered_insn || !insn) return 0; /* This can access kernel text if given address is not recovered */ - if (probe_kernel_read(dest, (void *)recovered_insn, MAX_INSN_SIZE)) + if (copy_from_kernel_nofault(dest, (void *)recovered_insn, + MAX_INSN_SIZE)) return 0; - kernel_insn_init(insn, dest, MAX_INSN_SIZE); - insn_get_length(insn); + ret = insn_decode_kernel(insn, dest); + if (ret < 0) + return 0; /* We can not probe force emulate prefixed instruction */ if (insn_has_emulate_prefix(insn)) @@ -394,13 +376,14 @@ int __copy_instruction(u8 *dest, u8 *src, u8 *real, struct insn *insn) return insn->length; } -/* Prepare reljump right after instruction to boost */ -static int prepare_boost(kprobe_opcode_t *buf, struct kprobe *p, - struct insn *insn) +/* Prepare reljump or int3 right after instruction */ +static int 
prepare_singlestep(kprobe_opcode_t *buf, struct kprobe *p, + struct insn *insn) { int len = insn->length; - if (can_boost(insn, p->addr) && + if (!IS_ENABLED(CONFIG_PREEMPTION) && + !p->post_handler && can_boost(insn, p->addr) && MAX_INSN_SIZE - len >= JMP32_INSN_SIZE) { /* * These instructions can be executed directly if it @@ -409,9 +392,14 @@ static int prepare_boost(kprobe_opcode_t *buf, struct kprobe *p, synthesize_reljump(buf + len, p->ainsn.insn + len, p->addr + insn->length); len += JMP32_INSN_SIZE; - p->ainsn.boostable = true; + p->ainsn.boostable = 1; } else { - p->ainsn.boostable = false; + /* Otherwise, put an int3 for trapping singlestep */ + if (MAX_INSN_SIZE - len < INT3_INSN_SIZE) + return -ENOSPC; + + buf[len] = INT3_INSN_OPCODE; + len += INT3_INSN_SIZE; } return len; @@ -442,35 +430,297 @@ void *alloc_insn_page(void) return page; } -/* Recover page to RW mode before releasing it */ -void free_insn_page(void *page) +/* Kprobe x86 instruction emulation - only regs->ip or IF flag modifiers */ + +static void kprobe_emulate_ifmodifiers(struct kprobe *p, struct pt_regs *regs) { - module_memfree(page); + switch (p->ainsn.opcode) { + case 0xfa: /* cli */ + regs->flags &= ~(X86_EFLAGS_IF); + break; + case 0xfb: /* sti */ + regs->flags |= X86_EFLAGS_IF; + break; + case 0x9c: /* pushf */ + int3_emulate_push(regs, regs->flags); + break; + case 0x9d: /* popf */ + regs->flags = int3_emulate_pop(regs); + break; + } + regs->ip = regs->ip - INT3_INSN_SIZE + p->ainsn.size; +} +NOKPROBE_SYMBOL(kprobe_emulate_ifmodifiers); + +static void kprobe_emulate_ret(struct kprobe *p, struct pt_regs *regs) +{ + int3_emulate_ret(regs); +} +NOKPROBE_SYMBOL(kprobe_emulate_ret); + +static void kprobe_emulate_call(struct kprobe *p, struct pt_regs *regs) +{ + unsigned long func = regs->ip - INT3_INSN_SIZE + p->ainsn.size; + + func += p->ainsn.rel32; + int3_emulate_call(regs, func); +} +NOKPROBE_SYMBOL(kprobe_emulate_call); + +static nokprobe_inline +void __kprobe_emulate_jmp(struct 
kprobe *p, struct pt_regs *regs, bool cond) +{ + unsigned long ip = regs->ip - INT3_INSN_SIZE + p->ainsn.size; + + if (cond) + ip += p->ainsn.rel32; + int3_emulate_jmp(regs, ip); +} + +static void kprobe_emulate_jmp(struct kprobe *p, struct pt_regs *regs) +{ + __kprobe_emulate_jmp(p, regs, true); +} +NOKPROBE_SYMBOL(kprobe_emulate_jmp); + +static const unsigned long jcc_mask[6] = { + [0] = X86_EFLAGS_OF, + [1] = X86_EFLAGS_CF, + [2] = X86_EFLAGS_ZF, + [3] = X86_EFLAGS_CF | X86_EFLAGS_ZF, + [4] = X86_EFLAGS_SF, + [5] = X86_EFLAGS_PF, +}; + +static void kprobe_emulate_jcc(struct kprobe *p, struct pt_regs *regs) +{ + bool invert = p->ainsn.jcc.type & 1; + bool match; + + if (p->ainsn.jcc.type < 0xc) { + match = regs->flags & jcc_mask[p->ainsn.jcc.type >> 1]; + } else { + match = ((regs->flags & X86_EFLAGS_SF) >> X86_EFLAGS_SF_BIT) ^ + ((regs->flags & X86_EFLAGS_OF) >> X86_EFLAGS_OF_BIT); + if (p->ainsn.jcc.type >= 0xe) + match = match || (regs->flags & X86_EFLAGS_ZF); + } + __kprobe_emulate_jmp(p, regs, (match && !invert) || (!match && invert)); +} +NOKPROBE_SYMBOL(kprobe_emulate_jcc); + +static void kprobe_emulate_loop(struct kprobe *p, struct pt_regs *regs) +{ + bool match; + + if (p->ainsn.loop.type != 3) { /* LOOP* */ + if (p->ainsn.loop.asize == 32) + match = ((*(u32 *)&regs->cx)--) != 0; +#ifdef CONFIG_X86_64 + else if (p->ainsn.loop.asize == 64) + match = ((*(u64 *)&regs->cx)--) != 0; +#endif + else + match = ((*(u16 *)&regs->cx)--) != 0; + } else { /* JCXZ */ + if (p->ainsn.loop.asize == 32) + match = *(u32 *)(&regs->cx) == 0; +#ifdef CONFIG_X86_64 + else if (p->ainsn.loop.asize == 64) + match = *(u64 *)(&regs->cx) == 0; +#endif + else + match = *(u16 *)(&regs->cx) == 0; + } + + if (p->ainsn.loop.type == 0) /* LOOPNE */ + match = match && !(regs->flags & X86_EFLAGS_ZF); + else if (p->ainsn.loop.type == 1) /* LOOPE */ + match = match && (regs->flags & X86_EFLAGS_ZF); + + __kprobe_emulate_jmp(p, regs, match); +} +NOKPROBE_SYMBOL(kprobe_emulate_loop); + +static const int
addrmode_regoffs[] = { + offsetof(struct pt_regs, ax), + offsetof(struct pt_regs, cx), + offsetof(struct pt_regs, dx), + offsetof(struct pt_regs, bx), + offsetof(struct pt_regs, sp), + offsetof(struct pt_regs, bp), + offsetof(struct pt_regs, si), + offsetof(struct pt_regs, di), +#ifdef CONFIG_X86_64 + offsetof(struct pt_regs, r8), + offsetof(struct pt_regs, r9), + offsetof(struct pt_regs, r10), + offsetof(struct pt_regs, r11), + offsetof(struct pt_regs, r12), + offsetof(struct pt_regs, r13), + offsetof(struct pt_regs, r14), + offsetof(struct pt_regs, r15), +#endif +}; + +static void kprobe_emulate_call_indirect(struct kprobe *p, struct pt_regs *regs) +{ + unsigned long offs = addrmode_regoffs[p->ainsn.indirect.reg]; + + int3_emulate_call(regs, regs_get_register(regs, offs)); +} +NOKPROBE_SYMBOL(kprobe_emulate_call_indirect); + +static void kprobe_emulate_jmp_indirect(struct kprobe *p, struct pt_regs *regs) +{ + unsigned long offs = addrmode_regoffs[p->ainsn.indirect.reg]; + + int3_emulate_jmp(regs, regs_get_register(regs, offs)); +} +NOKPROBE_SYMBOL(kprobe_emulate_jmp_indirect); + +static int prepare_emulation(struct kprobe *p, struct insn *insn) +{ + insn_byte_t opcode = insn->opcode.bytes[0]; + + switch (opcode) { + case 0xfa: /* cli */ + case 0xfb: /* sti */ + case 0x9c: /* pushfl */ + case 0x9d: /* popf/popfd */ + /* + * IF modifiers must be emulated since it will enable interrupt while + * int3 single stepping. 
+ */ + p->ainsn.emulate_op = kprobe_emulate_ifmodifiers; + p->ainsn.opcode = opcode; + break; + case 0xc2: /* ret/lret */ + case 0xc3: + case 0xca: + case 0xcb: + p->ainsn.emulate_op = kprobe_emulate_ret; + break; + case 0x9a: /* far call absolute -- segment is not supported */ + case 0xea: /* far jmp absolute -- segment is not supported */ + case 0xcc: /* int3 */ + case 0xcf: /* iret -- in-kernel IRET is not supported */ + return -EOPNOTSUPP; + break; + case 0xe8: /* near call relative */ + p->ainsn.emulate_op = kprobe_emulate_call; + if (insn->immediate.nbytes == 2) + p->ainsn.rel32 = *(s16 *)&insn->immediate.value; + else + p->ainsn.rel32 = *(s32 *)&insn->immediate.value; + break; + case 0xeb: /* short jump relative */ + case 0xe9: /* near jump relative */ + p->ainsn.emulate_op = kprobe_emulate_jmp; + if (insn->immediate.nbytes == 1) + p->ainsn.rel32 = *(s8 *)&insn->immediate.value; + else if (insn->immediate.nbytes == 2) + p->ainsn.rel32 = *(s16 *)&insn->immediate.value; + else + p->ainsn.rel32 = *(s32 *)&insn->immediate.value; + break; + case 0x70 ... 
0x7f: + /* 1 byte conditional jump */ + p->ainsn.emulate_op = kprobe_emulate_jcc; + p->ainsn.jcc.type = opcode & 0xf; + p->ainsn.rel32 = *(char *)insn->immediate.bytes; + break; + case 0x0f: + opcode = insn->opcode.bytes[1]; + if ((opcode & 0xf0) == 0x80) { + /* 2 bytes Conditional Jump */ + p->ainsn.emulate_op = kprobe_emulate_jcc; + p->ainsn.jcc.type = opcode & 0xf; + if (insn->immediate.nbytes == 2) + p->ainsn.rel32 = *(s16 *)&insn->immediate.value; + else + p->ainsn.rel32 = *(s32 *)&insn->immediate.value; + } else if (opcode == 0x01 && + X86_MODRM_REG(insn->modrm.bytes[0]) == 0 && + X86_MODRM_MOD(insn->modrm.bytes[0]) == 3) { + /* VM extensions - not supported */ + return -EOPNOTSUPP; + } + break; + case 0xe0: /* Loop NZ */ + case 0xe1: /* Loop */ + case 0xe2: /* Loop */ + case 0xe3: /* J*CXZ */ + p->ainsn.emulate_op = kprobe_emulate_loop; + p->ainsn.loop.type = opcode & 0x3; + p->ainsn.loop.asize = insn->addr_bytes * 8; + p->ainsn.rel32 = *(s8 *)&insn->immediate.value; + break; + case 0xff: + /* + * Since the 0xff is an extended group opcode, the instruction + * is determined by the MOD/RM byte. 
+ */ + opcode = insn->modrm.bytes[0]; + if ((opcode & 0x30) == 0x10) { + if ((opcode & 0x8) == 0x8) + return -EOPNOTSUPP; /* far call */ + /* call absolute, indirect */ + p->ainsn.emulate_op = kprobe_emulate_call_indirect; + } else if ((opcode & 0x30) == 0x20) { + if ((opcode & 0x8) == 0x8) + return -EOPNOTSUPP; /* far jmp */ + /* jmp near absolute indirect */ + p->ainsn.emulate_op = kprobe_emulate_jmp_indirect; + } else + break; + + if (insn->addr_bytes != sizeof(unsigned long)) + return -EOPNOTSUPP; /* Don't support different size */ + if (X86_MODRM_MOD(opcode) != 3) + return -EOPNOTSUPP; /* TODO: support memory addressing */ + + p->ainsn.indirect.reg = X86_MODRM_RM(opcode); +#ifdef CONFIG_X86_64 + if (X86_REX_B(insn->rex_prefix.value)) + p->ainsn.indirect.reg += 8; +#endif + break; + default: + break; + } + p->ainsn.size = insn->length; + + return 0; } static int arch_copy_kprobe(struct kprobe *p) { struct insn insn; kprobe_opcode_t buf[MAX_INSN_SIZE]; - int len; + int ret, len; /* Copy an instruction with recovering if other optprobe modifies it.*/ len = __copy_instruction(buf, p->addr, p->ainsn.insn, &insn); if (!len) return -EINVAL; - /* - * __copy_instruction can modify the displacement of the instruction, - * but it doesn't affect boostable check. 
- */ - len = prepare_boost(buf, p, &insn); + /* Analyze the opcode and setup emulate functions */ + ret = prepare_emulation(p, &insn); + if (ret < 0) + return ret; - /* Check whether the instruction modifies Interrupt Flag or not */ - p->ainsn.if_modifier = is_IF_modifier(buf); + /* Add int3 for single-step or booster jmp */ + len = prepare_singlestep(buf, p, &insn); + if (len < 0) + return len; /* Also, displacement change doesn't affect the first byte */ p->opcode = buf[0]; + p->ainsn.tp_len = len; + perf_event_text_poke(p->ainsn.insn, NULL, 0, buf, len); + /* OK, write back the instruction(s) into ROX insn buffer */ text_poke(p->ainsn.insn, buf, len); @@ -486,6 +736,9 @@ int arch_prepare_kprobe(struct kprobe *p) if (!can_probe((unsigned long)p->addr)) return -EILSEQ; + + memset(&p->ainsn, 0, sizeof(p->ainsn)); + /* insn: must be on special executable page on x86. */ p->ainsn.insn = get_insn_slot(); if (!p->ainsn.insn) @@ -502,12 +755,18 @@ int arch_prepare_kprobe(struct kprobe *p) void arch_arm_kprobe(struct kprobe *p) { - text_poke(p->addr, ((unsigned char []){INT3_INSN_OPCODE}), 1); + u8 int3 = INT3_INSN_OPCODE; + + text_poke(p->addr, &int3, 1); text_poke_sync(); + perf_event_text_poke(p->addr, &p->opcode, 1, &int3, 1); } void arch_disarm_kprobe(struct kprobe *p) { + u8 int3 = INT3_INSN_OPCODE; + + perf_event_text_poke(p->addr, &int3, 1, &p->opcode, 1); text_poke(p->addr, &p->opcode, 1); text_poke_sync(); } @@ -515,6 +774,9 @@ void arch_disarm_kprobe(struct kprobe *p) void arch_remove_kprobe(struct kprobe *p) { if (p->ainsn.insn) { + /* Record the perf event before freeing the slot */ + perf_event_text_poke(p->ainsn.insn, p->ainsn.insn, + p->ainsn.tp_len, NULL, 0); free_insn_slot(p->ainsn.insn, p->ainsn.boostable); p->ainsn.insn = NULL; } @@ -544,42 +806,28 @@ set_current_kprobe(struct kprobe *p, struct pt_regs *regs, { __this_cpu_write(current_kprobe, p); kcb->kprobe_saved_flags = kcb->kprobe_old_flags - = (regs->flags & (X86_EFLAGS_TF | X86_EFLAGS_IF)); - if 
(p->ainsn.if_modifier) - kcb->kprobe_saved_flags &= ~X86_EFLAGS_IF; + = (regs->flags & X86_EFLAGS_IF); } -static nokprobe_inline void clear_btf(void) +static void kprobe_post_process(struct kprobe *cur, struct pt_regs *regs, + struct kprobe_ctlblk *kcb) { - if (test_thread_flag(TIF_BLOCKSTEP)) { - unsigned long debugctl = get_debugctlmsr(); - - debugctl &= ~DEBUGCTLMSR_BTF; - update_debugctlmsr(debugctl); - } -} - -static nokprobe_inline void restore_btf(void) -{ - if (test_thread_flag(TIF_BLOCKSTEP)) { - unsigned long debugctl = get_debugctlmsr(); - - debugctl |= DEBUGCTLMSR_BTF; - update_debugctlmsr(debugctl); + /* Restore back the original saved kprobes variables and continue. */ + if (kcb->kprobe_status == KPROBE_REENTER) { + /* This will restore both kcb and current_kprobe */ + restore_previous_kprobe(kcb); + } else { + /* + * Always update the kcb status because + * reset_curent_kprobe() doesn't update kcb. + */ + kcb->kprobe_status = KPROBE_HIT_SSDONE; + if (cur->post_handler) + cur->post_handler(cur, regs, 0); + reset_current_kprobe(); } } - -void arch_prepare_kretprobe(struct kretprobe_instance *ri, struct pt_regs *regs) -{ - unsigned long *sara = stack_addr(regs); - - ri->ret_addr = (kprobe_opcode_t *) *sara; - ri->fp = sara; - - /* Replace the return addr with trampoline addr */ - *sara = (unsigned long) &kretprobe_trampoline; -} -NOKPROBE_SYMBOL(arch_prepare_kretprobe); +NOKPROBE_SYMBOL(kprobe_post_process); static void setup_singlestep(struct kprobe *p, struct pt_regs *regs, struct kprobe_ctlblk *kcb, int reenter) @@ -588,7 +836,7 @@ static void setup_singlestep(struct kprobe *p, struct pt_regs *regs, return; #if !defined(CONFIG_PREEMPTION) - if (p->ainsn.boostable && !p->post_handler) { + if (p->ainsn.boostable) { /* Boost up -- we can execute copied instructions directly */ if (!reenter) reset_current_kprobe(); @@ -607,19 +855,51 @@ static void setup_singlestep(struct kprobe *p, struct pt_regs *regs, kcb->kprobe_status = KPROBE_REENTER; } else 
kcb->kprobe_status = KPROBE_HIT_SS; - /* Prepare real single stepping */ - clear_btf(); - regs->flags |= X86_EFLAGS_TF; + + if (p->ainsn.emulate_op) { + p->ainsn.emulate_op(p, regs); + kprobe_post_process(p, regs, kcb); + return; + } + + /* Disable interrupt, and set ip register on trampoline */ regs->flags &= ~X86_EFLAGS_IF; - /* single step inline if the instruction is an int3 */ - if (p->opcode == INT3_INSN_OPCODE) - regs->ip = (unsigned long)p->addr; - else - regs->ip = (unsigned long)p->ainsn.insn; + regs->ip = (unsigned long)p->ainsn.insn; } NOKPROBE_SYMBOL(setup_singlestep); /* + * Called after single-stepping. p->addr is the address of the + * instruction whose first byte has been replaced by the "int3" + * instruction. To avoid the SMP problems that can occur when we + * temporarily put back the original opcode to single-step, we + * single-stepped a copy of the instruction. The address of this + * copy is p->ainsn.insn. We also doesn't use trap, but "int3" again + * right after the copied instruction. + * Different from the trap single-step, "int3" single-step can not + * handle the instruction which changes the ip register, e.g. jmp, + * call, conditional jmp, and the instructions which changes the IF + * flags because interrupt must be disabled around the single-stepping. + * Such instructions are software emulated, but others are single-stepped + * using "int3". + * + * When the 2nd "int3" handled, the regs->ip and regs->flags needs to + * be adjusted, so that we can resume execution on correct code. 
+ */ +static void resume_singlestep(struct kprobe *p, struct pt_regs *regs, + struct kprobe_ctlblk *kcb) +{ + unsigned long copy_ip = (unsigned long)p->ainsn.insn; + unsigned long orig_ip = (unsigned long)p->addr; + + /* Restore saved interrupt flag and ip register */ + regs->flags |= kcb->kprobe_saved_flags; + /* Note that regs->ip is executed int3 so must be a step back */ + regs->ip += (orig_ip - copy_ip) - INT3_INSN_SIZE; +} +NOKPROBE_SYMBOL(resume_singlestep); + +/* * We have reentered the kprobe_handler(), since another probe was hit while * within the handler. We save the original kprobes variables and just single * step on the instruction of the new probe without calling any user handlers. @@ -654,6 +934,12 @@ static int reenter_kprobe(struct kprobe *p, struct pt_regs *regs, } NOKPROBE_SYMBOL(reenter_kprobe); +static nokprobe_inline int kprobe_is_ss(struct kprobe_ctlblk *kcb) +{ + return (kcb->kprobe_status == KPROBE_HIT_SS || + kcb->kprobe_status == KPROBE_REENTER); +} + /* * Interrupts are disabled on entry as trap3 is an interrupt gate and they * remain disabled throughout this function. @@ -698,7 +984,18 @@ int kprobe_int3_handler(struct pt_regs *regs) reset_current_kprobe(); return 1; } - } else if (*addr != INT3_INSN_OPCODE) { + } else if (kprobe_is_ss(kcb)) { + p = kprobe_running(); + if ((unsigned long)p->ainsn.insn < regs->ip && + (unsigned long)p->ainsn.insn + MAX_INSN_SIZE > regs->ip) { + /* Most provably this is the second int3 for singlestep */ + resume_singlestep(p, regs, kcb); + kprobe_post_process(p, regs, kcb); + return 1; + } + } + + if (*addr != INT3_INSN_OPCODE) { /* * The breakpoint instruction was removed right * after we hit it. Another cpu has removed @@ -716,303 +1013,6 @@ int kprobe_int3_handler(struct pt_regs *regs) } NOKPROBE_SYMBOL(kprobe_int3_handler); -/* - * When a retprobed function returns, this code saves registers and - * calls trampoline_handler() runs, which calls the kretprobe's handler. 
- */ -asm( - ".text\n" - ".global kretprobe_trampoline\n" - ".type kretprobe_trampoline, @function\n" - "kretprobe_trampoline:\n" - /* We don't bother saving the ss register */ -#ifdef CONFIG_X86_64 - " pushq %rsp\n" - " pushfq\n" - SAVE_REGS_STRING - " movq %rsp, %rdi\n" - " call trampoline_handler\n" - /* Replace saved sp with true return address. */ - " movq %rax, 19*8(%rsp)\n" - RESTORE_REGS_STRING - " popfq\n" -#else - " pushl %esp\n" - " pushfl\n" - SAVE_REGS_STRING - " movl %esp, %eax\n" - " call trampoline_handler\n" - /* Replace saved sp with true return address. */ - " movl %eax, 15*4(%esp)\n" - RESTORE_REGS_STRING - " popfl\n" -#endif - " ret\n" - ".size kretprobe_trampoline, .-kretprobe_trampoline\n" -); -NOKPROBE_SYMBOL(kretprobe_trampoline); -STACK_FRAME_NON_STANDARD(kretprobe_trampoline); - -static struct kprobe kretprobe_kprobe = { - .addr = (void *)kretprobe_trampoline, -}; - -/* - * Called from kretprobe_trampoline - */ -__used __visible void *trampoline_handler(struct pt_regs *regs) -{ - struct kprobe_ctlblk *kcb; - struct kretprobe_instance *ri = NULL; - struct hlist_head *head, empty_rp; - struct hlist_node *tmp; - unsigned long flags, orig_ret_address = 0; - unsigned long trampoline_address = (unsigned long)&kretprobe_trampoline; - kprobe_opcode_t *correct_ret_addr = NULL; - void *frame_pointer; - bool skipped = false; - - preempt_disable(); - - /* - * Set a dummy kprobe for avoiding kretprobe recursion. - * Since kretprobe never run in kprobe handler, kprobe must not - * be running at this point. - */ - kcb = get_kprobe_ctlblk(); - __this_cpu_write(current_kprobe, &kretprobe_kprobe); - kcb->kprobe_status = KPROBE_HIT_ACTIVE; - - INIT_HLIST_HEAD(&empty_rp); - kretprobe_hash_lock(current, &head, &flags); - /* fixup registers */ - regs->cs = __KERNEL_CS; -#ifdef CONFIG_X86_32 - regs->cs |= get_kernel_rpl(); - regs->gs = 0; -#endif - /* We use pt_regs->sp for return address holder. 
*/ - frame_pointer = &regs->sp; - regs->ip = trampoline_address; - regs->orig_ax = ~0UL; - - /* - * It is possible to have multiple instances associated with a given - * task either because multiple functions in the call path have - * return probes installed on them, and/or more than one - * return probe was registered for a target function. - * - * We can handle this because: - * - instances are always pushed into the head of the list - * - when multiple return probes are registered for the same - * function, the (chronologically) first instance's ret_addr - * will be the real return address, and all the rest will - * point to kretprobe_trampoline. - */ - hlist_for_each_entry(ri, head, hlist) { - if (ri->task != current) - /* another task is sharing our hash bucket */ - continue; - /* - * Return probes must be pushed on this hash list correct - * order (same as return order) so that it can be popped - * correctly. However, if we find it is pushed it incorrect - * order, this means we find a function which should not be - * probed, because the wrong order entry is pushed on the - * path of processing other kretprobe itself. - */ - if (ri->fp != frame_pointer) { - if (!skipped) - pr_warn("kretprobe is stacked incorrectly. Trying to fixup.\n"); - skipped = true; - continue; - } - - orig_ret_address = (unsigned long)ri->ret_addr; - if (skipped) - pr_warn("%ps must be blacklisted because of incorrect kretprobe order\n", - ri->rp->kp.addr); - - if (orig_ret_address != trampoline_address) - /* - * This is the real return address.
Any other - * instances associated with this task are for - * other calls deeper on the call stack - */ - break; - } - - kretprobe_assert(ri, orig_ret_address, trampoline_address); - - correct_ret_addr = ri->ret_addr; - hlist_for_each_entry_safe(ri, tmp, head, hlist) { - if (ri->task != current) - /* another task is sharing our hash bucket */ - continue; - if (ri->fp != frame_pointer) - continue; - - orig_ret_address = (unsigned long)ri->ret_addr; - if (ri->rp && ri->rp->handler) { - __this_cpu_write(current_kprobe, &ri->rp->kp); - ri->ret_addr = correct_ret_addr; - ri->rp->handler(ri, regs); - __this_cpu_write(current_kprobe, &kretprobe_kprobe); - } - - recycle_rp_inst(ri, &empty_rp); - - if (orig_ret_address != trampoline_address) - /* - * This is the real return address. Any other - * instances associated with this task are for - * other calls deeper on the call stack - */ - break; - } - - kretprobe_hash_unlock(current, &flags); - - __this_cpu_write(current_kprobe, NULL); - preempt_enable(); - - hlist_for_each_entry_safe(ri, tmp, &empty_rp, hlist) { - hlist_del(&ri->hlist); - kfree(ri); - } - return (void *)orig_ret_address; -} -NOKPROBE_SYMBOL(trampoline_handler); - -/* - * Called after single-stepping. p->addr is the address of the - * instruction whose first byte has been replaced by the "int 3" - * instruction. To avoid the SMP problems that can occur when we - * temporarily put back the original opcode to single-step, we - * single-stepped a copy of the instruction. The address of this - * copy is p->ainsn.insn. - * - * This function prepares to return from the post-single-step - * interrupt. We have to fix up the stack as follows: - * - * 0) Except in the case of absolute or indirect jump or call instructions, - * the new ip is relative to the copied instruction. We need to make - * it relative to the original instruction. 
- * - * 1) If the single-stepped instruction was pushfl, then the TF and IF - * flags are set in the just-pushed flags, and may need to be cleared. - * - * 2) If the single-stepped instruction was a call, the return address - * that is atop the stack is the address following the copied instruction. - * We need to make it the address following the original instruction. - * - * If this is the first time we've single-stepped the instruction at - * this probepoint, and the instruction is boostable, boost it: add a - * jump instruction after the copied instruction, that jumps to the next - * instruction after the probepoint. - */ -static void resume_execution(struct kprobe *p, struct pt_regs *regs, - struct kprobe_ctlblk *kcb) -{ - unsigned long *tos = stack_addr(regs); - unsigned long copy_ip = (unsigned long)p->ainsn.insn; - unsigned long orig_ip = (unsigned long)p->addr; - kprobe_opcode_t *insn = p->ainsn.insn; - - /* Skip prefixes */ - insn = skip_prefixes(insn); - - regs->flags &= ~X86_EFLAGS_TF; - switch (*insn) { - case 0x9c: /* pushfl */ - *tos &= ~(X86_EFLAGS_TF | X86_EFLAGS_IF); - *tos |= kcb->kprobe_old_flags; - break; - case 0xc2: /* iret/ret/lret */ - case 0xc3: - case 0xca: - case 0xcb: - case 0xcf: - case 0xea: /* jmp absolute -- ip is correct */ - /* ip is already adjusted, no more changes required */ - p->ainsn.boostable = true; - goto no_change; - case 0xe8: /* call relative - Fix return addr */ - *tos = orig_ip + (*tos - copy_ip); - break; -#ifdef CONFIG_X86_32 - case 0x9a: /* call absolute -- same as call absolute, indirect */ - *tos = orig_ip + (*tos - copy_ip); - goto no_change; -#endif - case 0xff: - if ((insn[1] & 0x30) == 0x10) { - /* - * call absolute, indirect - * Fix return addr; ip is correct. - * But this is not boostable - */ - *tos = orig_ip + (*tos - copy_ip); - goto no_change; - } else if (((insn[1] & 0x31) == 0x20) || - ((insn[1] & 0x31) == 0x21)) { - /* - * jmp near and far, absolute indirect - * ip is correct. 
And this is boostable - */ - p->ainsn.boostable = true; - goto no_change; - } - default: - break; - } - - regs->ip += orig_ip - copy_ip; - -no_change: - restore_btf(); -} -NOKPROBE_SYMBOL(resume_execution); - -/* - * Interrupts are disabled on entry as trap1 is an interrupt gate and they - * remain disabled throughout this function. - */ -int kprobe_debug_handler(struct pt_regs *regs) -{ - struct kprobe *cur = kprobe_running(); - struct kprobe_ctlblk *kcb = get_kprobe_ctlblk(); - - if (!cur) - return 0; - - resume_execution(cur, regs, kcb); - regs->flags |= kcb->kprobe_saved_flags; - - if ((kcb->kprobe_status != KPROBE_REENTER) && cur->post_handler) { - kcb->kprobe_status = KPROBE_HIT_SSDONE; - cur->post_handler(cur, regs, 0); - } - - /* Restore back the original saved kprobes variables and continue. */ - if (kcb->kprobe_status == KPROBE_REENTER) { - restore_previous_kprobe(kcb); - goto out; - } - reset_current_kprobe(); -out: - /* - * if somebody else is singlestepping across a probe point, flags - * will have TF set, in which case, continue the remaining processing - * of do_debug, as if this is not a probe hit. - */ - if (regs->flags & X86_EFLAGS_TF) - return 0; - - return 1; -} -NOKPROBE_SYMBOL(kprobe_debug_handler); - int kprobe_fault_handler(struct pt_regs *regs, int trapnr) { struct kprobe *cur = kprobe_running(); @@ -1030,15 +1030,9 @@ int kprobe_fault_handler(struct pt_regs *regs, int trapnr) * normal page fault. */ regs->ip = (unsigned long)cur->addr; - /* - * Trap flag (TF) has been set here because this fault - * happened where the single stepping will be done. 
- * So clear it by resetting the current kprobe: - */ - regs->flags &= ~X86_EFLAGS_TF; /* - * If the TF flag was set before the kprobe hit, + * If the IF flag was set before the kprobe hit, * don't touch it: */ regs->flags |= kcb->kprobe_old_flags; @@ -1047,24 +1041,6 @@ int kprobe_fault_handler(struct pt_regs *regs, int trapnr) restore_previous_kprobe(kcb); else reset_current_kprobe(); - } else if (kcb->kprobe_status == KPROBE_HIT_ACTIVE || - kcb->kprobe_status == KPROBE_HIT_SSDONE) { - /* - * We increment the nmissed count for accounting, - * we can also use npre/npostfault count for accounting - * these specific fault cases. - */ - kprobes_inc_nmissed_count(cur); - - /* - * We come here because instructions in the pre/post - * handler caused the page_fault, this could happen - * if handler tries to access user space by - * copy_from_user(), get_user() etc. Let the - * user-specified handler try to fix it first. - */ - if (cur->fault_handler && cur->fault_handler(cur, regs, trapnr)) - return 1; } return 0; @@ -1073,13 +1049,6 @@ NOKPROBE_SYMBOL(kprobe_fault_handler); int __init arch_populate_kprobe_blacklist(void) { - int ret; - - ret = kprobe_add_area_blacklist((unsigned long)__irqentry_text_start, - (unsigned long)__irqentry_text_end); - if (ret) - return ret; - return kprobe_add_area_blacklist((unsigned long)__entry_text_start, (unsigned long)__entry_text_end); } diff --git a/arch/x86/kernel/kprobes/ftrace.c b/arch/x86/kernel/kprobes/ftrace.c index 681a4b36e9bb..dd2ec14adb77 100644 --- a/arch/x86/kernel/kprobes/ftrace.c +++ b/arch/x86/kernel/kprobes/ftrace.c @@ -12,17 +12,22 @@ #include "common.h" -/* Ftrace callback handler for kprobes -- called under preepmt disabed */ +/* Ftrace callback handler for kprobes -- called under preempt disabled */ void kprobe_ftrace_handler(unsigned long ip, unsigned long parent_ip, - struct ftrace_ops *ops, struct pt_regs *regs) + struct ftrace_ops *ops, struct ftrace_regs *fregs) { + struct pt_regs *regs = 
ftrace_get_regs(fregs); struct kprobe *p; struct kprobe_ctlblk *kcb; + int bit; + + bit = ftrace_test_recursion_trylock(ip, parent_ip); + if (bit < 0) + return; - /* Preempt is disabled by ftrace */ p = get_kprobe((kprobe_opcode_t *)ip); if (unlikely(!p) || kprobe_disabled(p)) - return; + goto out; kcb = get_kprobe_ctlblk(); if (kprobe_running()) { @@ -52,6 +57,8 @@ void kprobe_ftrace_handler(unsigned long ip, unsigned long parent_ip, */ __this_cpu_write(current_kprobe, NULL); } +out: + ftrace_test_recursion_unlock(bit); } NOKPROBE_SYMBOL(kprobe_ftrace_handler); diff --git a/arch/x86/kernel/kprobes/opt.c b/arch/x86/kernel/kprobes/opt.c index 3f45b5c43a71..e6b8c5362b94 100644 --- a/arch/x86/kernel/kprobes/opt.c +++ b/arch/x86/kernel/kprobes/opt.c @@ -6,6 +6,7 @@ * Copyright (C) Hitachi Ltd., 2012 */ #include <linux/kprobes.h> +#include <linux/perf_event.h> #include <linux/ptrace.h> #include <linux/string.h> #include <linux/slab.h> @@ -15,12 +16,13 @@ #include <linux/kdebug.h> #include <linux/kallsyms.h> #include <linux/ftrace.h> -#include <linux/frame.h> +#include <linux/objtool.h> +#include <linux/pgtable.h> +#include <linux/static_call.h> #include <asm/text-patching.h> #include <asm/cacheflush.h> #include <asm/desc.h> -#include <asm/pgtable.h> #include <linux/uaccess.h> #include <asm/alternative.h> #include <asm/insn.h> @@ -56,7 +58,7 @@ found: * overwritten by jump destination address. In this case, original * bytes must be recovered from op->optinsn.copied_insn buffer. */ - if (probe_kernel_read(buf, (void *)addr, + if (copy_from_kernel_nofault(buf, (void *)addr, MAX_INSN_SIZE * sizeof(kprobe_opcode_t))) return 0UL; @@ -71,6 +73,21 @@ found: return (unsigned long)buf; } +static void synthesize_clac(kprobe_opcode_t *addr) +{ + /* + * Can't be static_cpu_has() due to how objtool treats this feature bit. + * This isn't a fast path anyway. 
+ */ + if (!boot_cpu_has(X86_FEATURE_SMAP)) + return; + + /* Replace the NOP3 with CLAC */ + addr[0] = 0x0f; + addr[1] = 0x01; + addr[2] = 0xca; +} + /* Insert a move instruction which sets a pointer to eax/rdi (1st arg). */ static void synthesize_set_arg1(kprobe_opcode_t *addr, unsigned long val) { @@ -89,9 +106,13 @@ asm ( ".global optprobe_template_entry\n" "optprobe_template_entry:\n" #ifdef CONFIG_X86_64 - /* We don't bother saving the ss register */ + " pushq $" __stringify(__KERNEL_DS) "\n" + /* Save the 'sp - 8', this will be fixed later. */ " pushq %rsp\n" " pushfq\n" + ".global optprobe_template_clac\n" + "optprobe_template_clac:\n" + ASM_NOP3 SAVE_REGS_STRING " movq %rsp, %rsi\n" ".global optprobe_template_val\n" @@ -101,16 +122,22 @@ asm ( ".global optprobe_template_call\n" "optprobe_template_call:\n" ASM_NOP5 - /* Move flags to rsp */ + /* Copy 'regs->flags' into 'regs->ss'. */ " movq 18*8(%rsp), %rdx\n" - " movq %rdx, 19*8(%rsp)\n" + " movq %rdx, 20*8(%rsp)\n" RESTORE_REGS_STRING - /* Skip flags entry */ - " addq $8, %rsp\n" + /* Skip 'regs->flags' and 'regs->sp'. */ + " addq $16, %rsp\n" + /* And pop flags register from 'regs->ss'. */ " popfq\n" #else /* CONFIG_X86_32 */ + " pushl %ss\n" + /* Save the 'sp - 4', this will be fixed later. */ " pushl %esp\n" " pushfl\n" + ".global optprobe_template_clac\n" + "optprobe_template_clac:\n" + ASM_NOP3 SAVE_REGS_STRING " movl %esp, %edx\n" ".global optprobe_template_val\n" @@ -119,12 +146,13 @@ asm ( ".global optprobe_template_call\n" "optprobe_template_call:\n" ASM_NOP5 - /* Move flags into esp */ + /* Copy 'regs->flags' into 'regs->ss'. */ " movl 14*4(%esp), %edx\n" - " movl %edx, 15*4(%esp)\n" + " movl %edx, 16*4(%esp)\n" RESTORE_REGS_STRING - /* Skip flags entry */ - " addl $4, %esp\n" + /* Skip 'regs->flags' and 'regs->sp'. */ + " addl $8, %esp\n" + /* And pop flags register from 'regs->ss'. 
*/ " popfl\n" #endif ".global optprobe_template_end\n" @@ -134,6 +162,8 @@ asm ( void optprobe_template_func(void); STACK_FRAME_NON_STANDARD(optprobe_template_func); +#define TMPL_CLAC_IDX \ + ((long)optprobe_template_clac - (long)optprobe_template_entry) #define TMPL_MOVE_IDX \ ((long)optprobe_template_val - (long)optprobe_template_entry) #define TMPL_CALL_IDX \ @@ -154,10 +184,11 @@ optimized_callback(struct optimized_kprobe *op, struct pt_regs *regs) kprobes_inc_nmissed_count(&op->kp); } else { struct kprobe_ctlblk *kcb = get_kprobe_ctlblk(); + /* Adjust stack pointer */ + regs->sp += sizeof(long); /* Save skipped registers */ regs->cs = __KERNEL_CS; #ifdef CONFIG_X86_32 - regs->cs |= get_kernel_rpl(); regs->gs = 0; #endif regs->ip = (unsigned long)op->kp.addr + INT3_INSN_SIZE; @@ -186,7 +217,8 @@ static int copy_optimized_instructions(u8 *dest, u8 *src, u8 *real) /* Check whether the address range is reserved */ if (ftrace_text_reserved(src, src + len - 1) || alternatives_text_reserved(src, src + len - 1) || - jump_label_text_reserved(src, src + len - 1)) + jump_label_text_reserved(src, src + len - 1) || + static_call_text_reserved(src, src + len - 1)) return -EBUSY; return len; @@ -247,6 +279,19 @@ static int insn_is_indirect_jump(struct insn *insn) return ret; } +static bool is_padding_int3(unsigned long addr, unsigned long eaddr) +{ + unsigned char ops; + + for (; addr < eaddr; addr++) { + if (get_kernel_nofault(ops, (void *)addr) < 0 || + ops != INT3_INSN_OPCODE) + return false; + } + + return true; +} + /* Decode whole function to ensure any instructions don't jump into target */ static int can_optimize(unsigned long paddr) { @@ -263,9 +308,7 @@ static int can_optimize(unsigned long paddr) * stack handling and registers setup. 
*/ if (((paddr >= (unsigned long)__entry_text_start) && - (paddr < (unsigned long)__entry_text_end)) || - ((paddr >= (unsigned long)__irqentry_text_start) && - (paddr < (unsigned long)__irqentry_text_end))) + (paddr < (unsigned long)__entry_text_end))) return 0; /* Check there is enough space for a relative jump. */ @@ -276,6 +319,8 @@ static int can_optimize(unsigned long paddr) addr = paddr - offset; while (addr < paddr - offset + size) { /* Decode until function end */ unsigned long recovered_insn; + int ret; + if (search_exception_tables(addr)) /* * Since some fixup code will jumps into this function, @@ -285,11 +330,19 @@ static int can_optimize(unsigned long paddr) recovered_insn = recover_probed_instruction(buf, addr); if (!recovered_insn) return 0; - kernel_insn_init(&insn, (void *)recovered_insn, MAX_INSN_SIZE); - insn_get_length(&insn); - /* Another subsystem puts a breakpoint */ - if (insn.opcode.bytes[0] == INT3_INSN_OPCODE) + + ret = insn_decode_kernel(&insn, (void *)recovered_insn); + if (ret < 0) return 0; + + /* + * In the case of detecting unknown breakpoint, this could be + * a padding INT3 between functions. Let's check that all the + * rest of the bytes are also INT3. + */ + if (insn.opcode.bytes[0] == INT3_INSN_OPCODE) + return is_padding_int3(addr, paddr - offset + size) ? 1 : 0; + /* Recover address */ insn.kaddr = (void *)addr; insn.next_byte = (void *)(addr + insn.length); @@ -321,18 +374,25 @@ int arch_check_optimized_kprobe(struct optimized_kprobe *op) /* Check the addr is within the optimized instructions. 
*/ int arch_within_optimized_kprobe(struct optimized_kprobe *op, - unsigned long addr) + kprobe_opcode_t *addr) { - return ((unsigned long)op->kp.addr <= addr && - (unsigned long)op->kp.addr + op->optinsn.size > addr); + return (op->kp.addr <= addr && + op->kp.addr + op->optinsn.size > addr); } /* Free optimized instruction slot */ static void __arch_remove_optimized_kprobe(struct optimized_kprobe *op, int dirty) { - if (op->optinsn.insn) { - free_optinsn_slot(op->optinsn.insn, dirty); + u8 *slot = op->optinsn.insn; + if (slot) { + int len = TMPL_END_IDX + op->optinsn.size + JMP32_INSN_SIZE; + + /* Record the perf event before freeing the slot */ + if (dirty) + perf_event_text_poke(slot, slot, len, NULL, 0); + + free_optinsn_slot(slot, dirty); op->optinsn.insn = NULL; op->optinsn.size = 0; } @@ -389,6 +449,8 @@ int arch_prepare_optimized_kprobe(struct optimized_kprobe *op, op->optinsn.size = ret; len = TMPL_END_IDX + op->optinsn.size; + synthesize_clac(buf + TMPL_CLAC_IDX); + /* Set probe information */ synthesize_set_arg1(buf + TMPL_MOVE_IDX, (unsigned long)op); @@ -401,8 +463,15 @@ int arch_prepare_optimized_kprobe(struct optimized_kprobe *op, (u8 *)op->kp.addr + op->optinsn.size); len += JMP32_INSN_SIZE; + /* + * Note len = TMPL_END_IDX + op->optinsn.size + JMP32_INSN_SIZE is also + * used in __arch_remove_optimized_kprobe(). 
+ */ + /* We have to use text_poke() for instruction buffer because it is RO */ + perf_event_text_poke(slot, NULL, 0, buf, len); text_poke(slot, buf, len); + ret = 0; out: kfree(buf); @@ -454,10 +523,23 @@ void arch_optimize_kprobes(struct list_head *oplist) */ void arch_unoptimize_kprobe(struct optimized_kprobe *op) { - arch_arm_kprobe(&op->kp); - text_poke(op->kp.addr + INT3_INSN_SIZE, - op->optinsn.copied_insn, DISP32_SIZE); + u8 new[JMP32_INSN_SIZE] = { INT3_INSN_OPCODE, }; + u8 old[JMP32_INSN_SIZE]; + u8 *addr = op->kp.addr; + + memcpy(old, op->kp.addr, JMP32_INSN_SIZE); + memcpy(new + INT3_INSN_SIZE, + op->optinsn.copied_insn, + JMP32_INSN_SIZE - INT3_INSN_SIZE); + + text_poke(addr, new, INT3_INSN_SIZE); + text_poke_sync(); + text_poke(addr + INT3_INSN_SIZE, + new + INT3_INSN_SIZE, + JMP32_INSN_SIZE - INT3_INSN_SIZE); text_poke_sync(); + + perf_event_text_poke(op->kp.addr, old, JMP32_INSN_SIZE, new, JMP32_INSN_SIZE); } /* diff --git a/arch/x86/kernel/ksysfs.c b/arch/x86/kernel/ksysfs.c index d0a19121c6a4..257892fcefa7 100644 --- a/arch/x86/kernel/ksysfs.c +++ b/arch/x86/kernel/ksysfs.c @@ -91,26 +91,41 @@ static int get_setup_data_paddr(int nr, u64 *paddr) static int __init get_setup_data_size(int nr, size_t *size) { - int i = 0; + u64 pa_data = boot_params.hdr.setup_data, pa_next; + struct setup_indirect *indirect; struct setup_data *data; - u64 pa_data = boot_params.hdr.setup_data; + int i = 0; + u32 len; while (pa_data) { data = memremap(pa_data, sizeof(*data), MEMREMAP_WB); if (!data) return -ENOMEM; + pa_next = data->next; + if (nr == i) { - if (data->type == SETUP_INDIRECT && - ((struct setup_indirect *)data->data)->type != SETUP_INDIRECT) - *size = ((struct setup_indirect *)data->data)->len; - else + if (data->type == SETUP_INDIRECT) { + len = sizeof(*data) + data->len; + memunmap(data); + data = memremap(pa_data, len, MEMREMAP_WB); + if (!data) + return -ENOMEM; + + indirect = (struct setup_indirect *)data->data; + + if (indirect->type != 
SETUP_INDIRECT) + *size = indirect->len; + else + *size = data->len; + } else { *size = data->len; + } memunmap(data); return 0; } - pa_data = data->next; + pa_data = pa_next; memunmap(data); i++; } @@ -120,9 +135,11 @@ static int __init get_setup_data_size(int nr, size_t *size) static ssize_t type_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf) { + struct setup_indirect *indirect; + struct setup_data *data; int nr, ret; u64 paddr; - struct setup_data *data; + u32 len; ret = kobj_to_setup_data_nr(kobj, &nr); if (ret) @@ -135,10 +152,20 @@ static ssize_t type_show(struct kobject *kobj, if (!data) return -ENOMEM; - if (data->type == SETUP_INDIRECT) - ret = sprintf(buf, "0x%x\n", ((struct setup_indirect *)data->data)->type); - else + if (data->type == SETUP_INDIRECT) { + len = sizeof(*data) + data->len; + memunmap(data); + data = memremap(paddr, len, MEMREMAP_WB); + if (!data) + return -ENOMEM; + + indirect = (struct setup_indirect *)data->data; + + ret = sprintf(buf, "0x%x\n", indirect->type); + } else { ret = sprintf(buf, "0x%x\n", data->type); + } + memunmap(data); return ret; } @@ -149,9 +176,10 @@ static ssize_t setup_data_data_read(struct file *fp, char *buf, loff_t off, size_t count) { + struct setup_indirect *indirect; + struct setup_data *data; int nr, ret = 0; u64 paddr, len; - struct setup_data *data; void *p; ret = kobj_to_setup_data_nr(kobj, &nr); @@ -165,10 +193,27 @@ static ssize_t setup_data_data_read(struct file *fp, if (!data) return -ENOMEM; - if (data->type == SETUP_INDIRECT && - ((struct setup_indirect *)data->data)->type != SETUP_INDIRECT) { - paddr = ((struct setup_indirect *)data->data)->addr; - len = ((struct setup_indirect *)data->data)->len; + if (data->type == SETUP_INDIRECT) { + len = sizeof(*data) + data->len; + memunmap(data); + data = memremap(paddr, len, MEMREMAP_WB); + if (!data) + return -ENOMEM; + + indirect = (struct setup_indirect *)data->data; + + if (indirect->type != SETUP_INDIRECT) { + paddr = 
indirect->addr; + len = indirect->len; + } else { + /* + * Even though this is technically undefined, return + * the data as though it is a normal setup_data struct. + * This will at least allow it to be inspected. + */ + paddr += sizeof(*data); + len = data->len; + } } else { paddr += sizeof(*data); len = data->len; diff --git a/arch/x86/kernel/kvm.c b/arch/x86/kernel/kvm.c index 6efe0410fb72..d4e48b4a438b 100644 --- a/arch/x86/kernel/kvm.c +++ b/arch/x86/kernel/kvm.c @@ -7,8 +7,11 @@ * Authors: Anthony Liguori <aliguori@us.ibm.com> */ +#define pr_fmt(fmt) "kvm-guest: " fmt + #include <linux/context_tracking.h> #include <linux/init.h> +#include <linux/irq.h> #include <linux/kernel.h> #include <linux/kvm_para.h> #include <linux/cpu.h> @@ -21,9 +24,11 @@ #include <linux/sched.h> #include <linux/slab.h> #include <linux/kprobes.h> -#include <linux/debugfs.h> #include <linux/nmi.h> #include <linux/swait.h> +#include <linux/syscore_ops.h> +#include <linux/cc_platform.h> +#include <linux/efi.h> #include <asm/timer.h> #include <asm/cpu.h> #include <asm/traps.h> @@ -34,6 +39,12 @@ #include <asm/hypervisor.h> #include <asm/tlb.h> #include <asm/cpuidle_haltpoll.h> +#include <asm/ptrace.h> +#include <asm/reboot.h> +#include <asm/svm.h> +#include <asm/e820/api.h> + +DEFINE_STATIC_KEY_FALSE(kvm_async_pf_enabled); static int kvmapf = 1; @@ -58,6 +69,7 @@ static DEFINE_PER_CPU_DECRYPTED(struct kvm_vcpu_pv_apf_data, apf_reason) __align DEFINE_PER_CPU_DECRYPTED(struct kvm_steal_time, steal_time) __aligned(64) __visible; static int has_steal_clock = 0; +static int has_guest_poll = 0; /* * No need for any "IO delay" on KVM */ @@ -73,7 +85,6 @@ struct kvm_task_sleep_node { struct swait_queue_head wq; u32 token; int cpu; - bool halted; }; static struct kvm_task_sleep_head { @@ -96,77 +107,64 @@ static struct kvm_task_sleep_node *_find_apf_task(struct kvm_task_sleep_head *b, return NULL; } -/* - * @interrupt_kernel: Is this called from a routine which interrupts the kernel - * (other 
than user space)? - */ -void kvm_async_pf_task_wait(u32 token, int interrupt_kernel) +static bool kvm_async_pf_queue_task(u32 token, struct kvm_task_sleep_node *n) { u32 key = hash_32(token, KVM_TASK_SLEEP_HASHBITS); struct kvm_task_sleep_head *b = &async_pf_sleepers[key]; - struct kvm_task_sleep_node n, *e; - DECLARE_SWAITQUEUE(wait); - - rcu_irq_enter(); + struct kvm_task_sleep_node *e; raw_spin_lock(&b->lock); e = _find_apf_task(b, token); if (e) { /* dummy entry exist -> wake up was delivered ahead of PF */ hlist_del(&e->link); - kfree(e); raw_spin_unlock(&b->lock); - - rcu_irq_exit(); - return; + kfree(e); + return false; } - n.token = token; - n.cpu = smp_processor_id(); - n.halted = is_idle_task(current) || - (IS_ENABLED(CONFIG_PREEMPT_COUNT) - ? preempt_count() > 1 || rcu_preempt_depth() - : interrupt_kernel); - init_swait_queue_head(&n.wq); - hlist_add_head(&n.link, &b->list); + n->token = token; + n->cpu = smp_processor_id(); + init_swait_queue_head(&n->wq); + hlist_add_head(&n->link, &b->list); raw_spin_unlock(&b->lock); + return true; +} + +/* + * kvm_async_pf_task_wait_schedule - Wait for pagefault to be handled + * @token: Token to identify the sleep node entry + * + * Invoked from the async pagefault handling code or from the VM exit page + * fault handler. In both cases RCU is watching. + */ +void kvm_async_pf_task_wait_schedule(u32 token) +{ + struct kvm_task_sleep_node n; + DECLARE_SWAITQUEUE(wait); + + lockdep_assert_irqs_disabled(); + + if (!kvm_async_pf_queue_task(token, &n)) + return; for (;;) { - if (!n.halted) - prepare_to_swait_exclusive(&n.wq, &wait, TASK_UNINTERRUPTIBLE); + prepare_to_swait_exclusive(&n.wq, &wait, TASK_UNINTERRUPTIBLE); if (hlist_unhashed(&n.link)) break; - rcu_irq_exit(); - - if (!n.halted) { - local_irq_enable(); - schedule(); - local_irq_disable(); - } else { - /* - * We cannot reschedule. So halt. 
- */ - native_safe_halt(); - local_irq_disable(); - } - - rcu_irq_enter(); + local_irq_enable(); + schedule(); + local_irq_disable(); } - if (!n.halted) - finish_swait(&n.wq, &wait); - - rcu_irq_exit(); - return; + finish_swait(&n.wq, &wait); } -EXPORT_SYMBOL_GPL(kvm_async_pf_task_wait); +EXPORT_SYMBOL_GPL(kvm_async_pf_task_wait_schedule); static void apf_task_wake_one(struct kvm_task_sleep_node *n) { hlist_del_init(&n->link); - if (n->halted) - smp_send_reschedule(n->cpu); - else if (swq_has_sleeper(&n->wq)) + if (swq_has_sleeper(&n->wq)) swake_up_one(&n->wq); } @@ -175,12 +173,13 @@ static void apf_task_wake_all(void) int i; for (i = 0; i < KVM_TASK_SLEEP_HASHSIZE; i++) { - struct hlist_node *p, *next; struct kvm_task_sleep_head *b = &async_pf_sleepers[i]; + struct kvm_task_sleep_node *n; + struct hlist_node *p, *next; + raw_spin_lock(&b->lock); hlist_for_each_safe(p, next, &b->list) { - struct kvm_task_sleep_node *n = - hlist_entry(p, typeof(*n), link); + n = hlist_entry(p, typeof(*n), link); if (n->cpu == smp_processor_id()) apf_task_wake_one(n); } @@ -192,7 +191,7 @@ void kvm_async_pf_task_wake(u32 token) { u32 key = hash_32(token, KVM_TASK_SLEEP_HASHBITS); struct kvm_task_sleep_head *b = &async_pf_sleepers[key]; - struct kvm_task_sleep_node *n; + struct kvm_task_sleep_node *n, *dummy = NULL; if (token == ~0) { apf_task_wake_all(); @@ -204,63 +203,107 @@ again: n = _find_apf_task(b, token); if (!n) { /* - * async PF was not yet handled. - * Add dummy entry for the token. + * Async #PF not yet handled, add a dummy entry for the token. + * Allocating the token must be down outside of the raw lock + * as the allocator is preemptible on PREEMPT_RT kernels. */ - n = kzalloc(sizeof(*n), GFP_ATOMIC); - if (!n) { + if (!dummy) { + raw_spin_unlock(&b->lock); + dummy = kzalloc(sizeof(*dummy), GFP_ATOMIC); + /* - * Allocation failed! Busy wait while other cpu - * handles async PF. 
+ * Continue looping on allocation failure, eventually + * the async #PF will be handled and allocating a new + * node will be unnecessary. + */ + if (!dummy) + cpu_relax(); + + /* + * Recheck for async #PF completion before enqueueing + * the dummy token to avoid duplicate list entries. */ - raw_spin_unlock(&b->lock); - cpu_relax(); goto again; } - n->token = token; - n->cpu = smp_processor_id(); - init_swait_queue_head(&n->wq); - hlist_add_head(&n->link, &b->list); - } else + dummy->token = token; + dummy->cpu = smp_processor_id(); + init_swait_queue_head(&dummy->wq); + hlist_add_head(&dummy->link, &b->list); + dummy = NULL; + } else { apf_task_wake_one(n); + } raw_spin_unlock(&b->lock); - return; + + /* A dummy token might be allocated and ultimately not used. */ + kfree(dummy); } EXPORT_SYMBOL_GPL(kvm_async_pf_task_wake); -u32 kvm_read_and_reset_pf_reason(void) +noinstr u32 kvm_read_and_reset_apf_flags(void) { - u32 reason = 0; + u32 flags = 0; if (__this_cpu_read(apf_reason.enabled)) { - reason = __this_cpu_read(apf_reason.reason); - __this_cpu_write(apf_reason.reason, 0); + flags = __this_cpu_read(apf_reason.flags); + __this_cpu_write(apf_reason.flags, 0); } - return reason; + return flags; } -EXPORT_SYMBOL_GPL(kvm_read_and_reset_pf_reason); -NOKPROBE_SYMBOL(kvm_read_and_reset_pf_reason); +EXPORT_SYMBOL_GPL(kvm_read_and_reset_apf_flags); -dotraplinkage void -do_async_page_fault(struct pt_regs *regs, unsigned long error_code, unsigned long address) +noinstr bool __kvm_handle_async_pf(struct pt_regs *regs, u32 token) { - switch (kvm_read_and_reset_pf_reason()) { - default: - do_page_fault(regs, error_code, address); - break; - case KVM_PV_REASON_PAGE_NOT_PRESENT: - /* page is swapped out by the host. 
*/ - kvm_async_pf_task_wait((u32)address, !user_mode(regs)); - break; - case KVM_PV_REASON_PAGE_READY: - rcu_irq_enter(); - kvm_async_pf_task_wake((u32)address); - rcu_irq_exit(); - break; + u32 flags = kvm_read_and_reset_apf_flags(); + irqentry_state_t state; + + if (!flags) + return false; + + state = irqentry_enter(regs); + instrumentation_begin(); + + /* + * If the host managed to inject an async #PF into an interrupt + * disabled region, then die hard as this is not going to end well + * and the host side is seriously broken. + */ + if (unlikely(!(regs->flags & X86_EFLAGS_IF))) + panic("Host injected async #PF in interrupt disabled region\n"); + + if (flags & KVM_PV_REASON_PAGE_NOT_PRESENT) { + if (unlikely(!(user_mode(regs)))) + panic("Host injected async #PF in kernel mode\n"); + /* Page is swapped out by the host. */ + kvm_async_pf_task_wait_schedule(token); + } else { + WARN_ONCE(1, "Unexpected async PF flags: %x\n", flags); } + + instrumentation_end(); + irqentry_exit(regs, state); + return true; +} + +DEFINE_IDTENTRY_SYSVEC(sysvec_kvm_asyncpf_interrupt) +{ + struct pt_regs *old_regs = set_irq_regs(regs); + u32 token; + + ack_APIC_irq(); + + inc_irq_stat(irq_hv_callback_count); + + if (__this_cpu_read(apf_reason.enabled)) { + token = __this_cpu_read(apf_reason.token); + kvm_async_pf_task_wake(token); + __this_cpu_write(apf_reason.token, 0); + wrmsrl(MSR_KVM_ASYNC_PF_ACK, 1); + } + + set_irq_regs(old_regs); } -NOKPROBE_SYMBOL(do_async_page_fault); static void __init paravirt_ops_setup(void) { @@ -283,8 +326,8 @@ static void kvm_register_steal_time(void) return; wrmsrl(MSR_KVM_STEAL_TIME, (slow_virt_to_phys(st) | KVM_MSR_ENABLED)); - pr_info("kvm-stealtime: cpu %d, msr %llx\n", - cpu, (unsigned long long) slow_virt_to_phys(st)); + pr_debug("stealtime: cpu %d, msr %llx\n", cpu, + (unsigned long long) slow_virt_to_phys(st)); } static DEFINE_PER_CPU_DECRYPTED(unsigned long, kvm_apic_eoi) = KVM_PV_EOI_DISABLED; @@ -305,25 +348,27 @@ static notrace void 
kvm_guest_apic_eoi_write(u32 reg, u32 val) static void kvm_guest_cpu_init(void) { - if (kvm_para_has_feature(KVM_FEATURE_ASYNC_PF) && kvmapf) { + if (kvm_para_has_feature(KVM_FEATURE_ASYNC_PF_INT) && kvmapf) { u64 pa = slow_virt_to_phys(this_cpu_ptr(&apf_reason)); -#ifdef CONFIG_PREEMPTION - pa |= KVM_ASYNC_PF_SEND_ALWAYS; -#endif - pa |= KVM_ASYNC_PF_ENABLED; + WARN_ON_ONCE(!static_branch_likely(&kvm_async_pf_enabled)); + + pa = slow_virt_to_phys(this_cpu_ptr(&apf_reason)); + pa |= KVM_ASYNC_PF_ENABLED | KVM_ASYNC_PF_DELIVERY_AS_INT; if (kvm_para_has_feature(KVM_FEATURE_ASYNC_PF_VMEXIT)) pa |= KVM_ASYNC_PF_DELIVERY_AS_PF_VMEXIT; + wrmsrl(MSR_KVM_ASYNC_PF_INT, HYPERVISOR_CALLBACK_VECTOR); + wrmsrl(MSR_KVM_ASYNC_PF_EN, pa); __this_cpu_write(apf_reason.enabled, 1); - printk(KERN_INFO"KVM setup async PF for cpu %d\n", - smp_processor_id()); + pr_debug("setup async PF for cpu %d\n", smp_processor_id()); } if (kvm_para_has_feature(KVM_FEATURE_PV_EOI)) { unsigned long pa; + /* Size alignment is implied but just to make it explicit. */ BUILD_BUG_ON(__alignof__(kvm_apic_eoi) < 4); __this_cpu_write(kvm_apic_eoi, 0); @@ -344,35 +389,17 @@ static void kvm_pv_disable_apf(void) wrmsrl(MSR_KVM_ASYNC_PF_EN, 0); __this_cpu_write(apf_reason.enabled, 0); - printk(KERN_INFO"Unregister pv shared memory for cpu %d\n", - smp_processor_id()); + pr_debug("disable async PF for cpu %d\n", smp_processor_id()); } -static void kvm_pv_guest_cpu_reboot(void *unused) +static void kvm_disable_steal_time(void) { - /* - * We disable PV EOI before we load a new kernel by kexec, - * since MSR_KVM_PV_EOI_EN stores a pointer into old kernel's memory. - * New kernel can re-enable when it boots. 
- */ - if (kvm_para_has_feature(KVM_FEATURE_PV_EOI)) - wrmsrl(MSR_KVM_PV_EOI_EN, 0); - kvm_pv_disable_apf(); - kvm_disable_steal_time(); -} + if (!has_steal_clock) + return; -static int kvm_pv_reboot_notify(struct notifier_block *nb, - unsigned long code, void *unused) -{ - if (code == SYS_RESTART) - on_each_cpu(kvm_pv_guest_cpu_reboot, NULL, 1); - return NOTIFY_DONE; + wrmsr(MSR_KVM_STEAL_TIME, 0, 0); } -static struct notifier_block kvm_pv_reboot_nb = { - .notifier_call = kvm_pv_reboot_notify, -}; - static u64 kvm_steal_clock(int cpu) { u64 steal; @@ -390,14 +417,6 @@ static u64 kvm_steal_clock(int cpu) return steal; } -void kvm_disable_steal_time(void) -{ - if (!has_steal_clock) - return; - - wrmsr(MSR_KVM_STEAL_TIME, 0, 0); -} - static inline void __set_percpu_decrypted(void *ptr, unsigned long size) { early_set_memory_decrypted((unsigned long) ptr, size); @@ -415,7 +434,7 @@ static void __init sev_map_percpu_data(void) { int cpu; - if (!sev_active()) + if (!cc_platform_has(CC_ATTR_GUEST_MEM_ENCRYPT)) return; for_each_possible_cpu(cpu) { @@ -425,27 +444,55 @@ static void __init sev_map_percpu_data(void) } } -static bool pv_tlb_flush_supported(void) +static void kvm_guest_cpu_offline(bool shutdown) { - return (kvm_para_has_feature(KVM_FEATURE_PV_TLB_FLUSH) && - !kvm_para_has_hint(KVM_HINTS_REALTIME) && - kvm_para_has_feature(KVM_FEATURE_STEAL_TIME)); + kvm_disable_steal_time(); + if (kvm_para_has_feature(KVM_FEATURE_PV_EOI)) + wrmsrl(MSR_KVM_PV_EOI_EN, 0); + if (kvm_para_has_feature(KVM_FEATURE_MIGRATION_CONTROL)) + wrmsrl(MSR_KVM_MIGRATION_CONTROL, 0); + kvm_pv_disable_apf(); + if (!shutdown) + apf_task_wake_all(); + kvmclock_disable(); } -static DEFINE_PER_CPU(cpumask_var_t, __pv_cpu_mask); +static int kvm_cpu_online(unsigned int cpu) +{ + unsigned long flags; + + local_irq_save(flags); + kvm_guest_cpu_init(); + local_irq_restore(flags); + return 0; +} #ifdef CONFIG_SMP +static DEFINE_PER_CPU(cpumask_var_t, __pv_cpu_mask); + +static bool 
pv_tlb_flush_supported(void) +{ + return (kvm_para_has_feature(KVM_FEATURE_PV_TLB_FLUSH) && + !kvm_para_has_hint(KVM_HINTS_REALTIME) && + kvm_para_has_feature(KVM_FEATURE_STEAL_TIME) && + !boot_cpu_has(X86_FEATURE_MWAIT) && + (num_possible_cpus() != 1)); +} + static bool pv_ipi_supported(void) { - return kvm_para_has_feature(KVM_FEATURE_PV_SEND_IPI); + return (kvm_para_has_feature(KVM_FEATURE_PV_SEND_IPI) && + (num_possible_cpus() != 1)); } static bool pv_sched_yield_supported(void) { return (kvm_para_has_feature(KVM_FEATURE_PV_SCHED_YIELD) && !kvm_para_has_hint(KVM_HINTS_REALTIME) && - kvm_para_has_feature(KVM_FEATURE_STEAL_TIME)); + kvm_para_has_feature(KVM_FEATURE_STEAL_TIME) && + !boot_cpu_has(X86_FEATURE_MWAIT) && + (num_possible_cpus() != 1)); } #define KVM_IPI_CLUSTER_SIZE (2 * BITS_PER_LONG) @@ -483,12 +530,13 @@ static void __send_ipi_mask(const struct cpumask *mask, int vector) } else if (apic_id < min && max - apic_id < KVM_IPI_CLUSTER_SIZE) { ipi_bitmap <<= min - apic_id; min = apic_id; - } else if (apic_id < min + KVM_IPI_CLUSTER_SIZE) { + } else if (apic_id > min && apic_id < min + KVM_IPI_CLUSTER_SIZE) { max = apic_id < max ? 
max : apic_id; } else { ret = kvm_hypercall4(KVM_HC_SEND_IPI, (unsigned long)ipi_bitmap, (unsigned long)(ipi_bitmap >> BITS_PER_LONG), min, icr); - WARN_ONCE(ret < 0, "KVM: failed to send PV IPI: %ld", ret); + WARN_ONCE(ret < 0, "kvm-guest: failed to send PV IPI: %ld", + ret); min = max = apic_id; ipi_bitmap = 0; } @@ -498,7 +546,8 @@ static void __send_ipi_mask(const struct cpumask *mask, int vector) if (ipi_bitmap) { ret = kvm_hypercall4(KVM_HC_SEND_IPI, (unsigned long)ipi_bitmap, (unsigned long)(ipi_bitmap >> BITS_PER_LONG), min, icr); - WARN_ONCE(ret < 0, "KVM: failed to send PV IPI: %ld", ret); + WARN_ONCE(ret < 0, "kvm-guest: failed to send PV IPI: %ld", + ret); } local_irq_restore(flags); @@ -521,6 +570,55 @@ static void kvm_send_ipi_mask_allbutself(const struct cpumask *mask, int vector) __send_ipi_mask(local_mask, vector); } +static int __init setup_efi_kvm_sev_migration(void) +{ + efi_char16_t efi_sev_live_migration_enabled[] = L"SevLiveMigrationEnabled"; + efi_guid_t efi_variable_guid = AMD_SEV_MEM_ENCRYPT_GUID; + efi_status_t status; + unsigned long size; + bool enabled; + + if (!cc_platform_has(CC_ATTR_GUEST_MEM_ENCRYPT) || + !kvm_para_has_feature(KVM_FEATURE_MIGRATION_CONTROL)) + return 0; + + if (!efi_enabled(EFI_BOOT)) + return 0; + + if (!efi_enabled(EFI_RUNTIME_SERVICES)) { + pr_info("%s : EFI runtime services are not enabled\n", __func__); + return 0; + } + + size = sizeof(enabled); + + /* Get variable contents into buffer */ + status = efi.get_variable(efi_sev_live_migration_enabled, + &efi_variable_guid, NULL, &size, &enabled); + + if (status == EFI_NOT_FOUND) { + pr_info("%s : EFI live migration variable not found\n", __func__); + return 0; + } + + if (status != EFI_SUCCESS) { + pr_info("%s : EFI variable retrieval failed\n", __func__); + return 0; + } + + if (enabled == 0) { + pr_info("%s: live migration disabled in EFI\n", __func__); + return 0; + } + + pr_info("%s : live migration enabled in EFI\n", __func__); + 
wrmsrl(MSR_KVM_MIGRATION_CONTROL, KVM_MIGRATION_READY); + + return 1; +} + +late_initcall(setup_efi_kvm_sev_migration); + /* * Set the IPI entry points */ @@ -528,7 +626,7 @@ static void kvm_setup_pv_ipi(void) { apic->send_IPI_mask = kvm_send_ipi_mask; apic->send_IPI_mask_allbutself = kvm_send_ipi_mask_allbutself; - pr_info("KVM setup pv IPIs\n"); + pr_info("setup PV IPIs\n"); } static void kvm_smp_send_call_func_ipi(const struct cpumask *mask) @@ -539,19 +637,60 @@ static void kvm_smp_send_call_func_ipi(const struct cpumask *mask) /* Make sure other vCPUs get a chance to run if they need to. */ for_each_cpu(cpu, mask) { - if (vcpu_is_preempted(cpu)) { + if (!idle_cpu(cpu) && vcpu_is_preempted(cpu)) { kvm_hypercall1(KVM_HC_SCHED_YIELD, per_cpu(x86_cpu_to_apicid, cpu)); break; } } } -static void __init kvm_smp_prepare_cpus(unsigned int max_cpus) +static void kvm_flush_tlb_multi(const struct cpumask *cpumask, + const struct flush_tlb_info *info) +{ + u8 state; + int cpu; + struct kvm_steal_time *src; + struct cpumask *flushmask = this_cpu_cpumask_var_ptr(__pv_cpu_mask); + + cpumask_copy(flushmask, cpumask); + /* + * We have to call flush only on online vCPUs. And + * queue flush_on_enter for pre-empted vCPUs + */ + for_each_cpu(cpu, flushmask) { + /* + * The local vCPU is never preempted, so we do not explicitly + * skip check for local vCPU - it will never be cleared from + * flushmask. 
+ */ + src = &per_cpu(steal_time, cpu); + state = READ_ONCE(src->preempted); + if ((state & KVM_VCPU_PREEMPTED)) { + if (try_cmpxchg(&src->preempted, &state, + state | KVM_VCPU_FLUSH_TLB)) + __cpumask_clear_cpu(cpu, flushmask); + } + } + + native_flush_tlb_multi(flushmask, info); +} + +static __init int kvm_alloc_cpumask(void) { - native_smp_prepare_cpus(max_cpus); - if (kvm_para_has_hint(KVM_HINTS_REALTIME)) - static_branch_disable(&virt_spin_lock_key); + int cpu; + + if (!kvm_para_available() || nopv) + return 0; + + if (pv_tlb_flush_supported() || pv_ipi_supported()) + for_each_possible_cpu(cpu) { + zalloc_cpumask_var_node(per_cpu_ptr(&__pv_cpu_mask, cpu), + GFP_KERNEL, cpu_to_node(cpu)); + } + + return 0; } +arch_initcall(kvm_alloc_cpumask); static void __init kvm_smp_prepare_boot_cpu(void) { @@ -566,63 +705,113 @@ static void __init kvm_smp_prepare_boot_cpu(void) kvm_spinlock_init(); } -static void kvm_guest_cpu_offline(void) +static int kvm_cpu_down_prepare(unsigned int cpu) { - kvm_disable_steal_time(); - if (kvm_para_has_feature(KVM_FEATURE_PV_EOI)) - wrmsrl(MSR_KVM_PV_EOI_EN, 0); - kvm_pv_disable_apf(); - apf_task_wake_all(); -} + unsigned long flags; -static int kvm_cpu_online(unsigned int cpu) -{ - local_irq_disable(); - kvm_guest_cpu_init(); - local_irq_enable(); + local_irq_save(flags); + kvm_guest_cpu_offline(false); + local_irq_restore(flags); return 0; } -static int kvm_cpu_down_prepare(unsigned int cpu) +#endif + +static int kvm_suspend(void) { - local_irq_disable(); - kvm_guest_cpu_offline(); - local_irq_enable(); + u64 val = 0; + + kvm_guest_cpu_offline(false); + +#ifdef CONFIG_ARCH_CPUIDLE_HALTPOLL + if (kvm_para_has_feature(KVM_FEATURE_POLL_CONTROL)) + rdmsrl(MSR_KVM_POLL_CONTROL, val); + has_guest_poll = !(val & 1); +#endif return 0; } + +static void kvm_resume(void) +{ + kvm_cpu_online(raw_smp_processor_id()); + +#ifdef CONFIG_ARCH_CPUIDLE_HALTPOLL + if (kvm_para_has_feature(KVM_FEATURE_POLL_CONTROL) && has_guest_poll) + 
wrmsrl(MSR_KVM_POLL_CONTROL, 0); #endif +} + +static struct syscore_ops kvm_syscore_ops = { + .suspend = kvm_suspend, + .resume = kvm_resume, +}; -static void __init kvm_apf_trap_init(void) +static void kvm_pv_guest_cpu_reboot(void *unused) { - update_intr_gate(X86_TRAP_PF, async_page_fault); + kvm_guest_cpu_offline(true); } +static int kvm_pv_reboot_notify(struct notifier_block *nb, + unsigned long code, void *unused) +{ + if (code == SYS_RESTART) + on_each_cpu(kvm_pv_guest_cpu_reboot, NULL, 1); + return NOTIFY_DONE; +} -static void kvm_flush_tlb_others(const struct cpumask *cpumask, - const struct flush_tlb_info *info) +static struct notifier_block kvm_pv_reboot_nb = { + .notifier_call = kvm_pv_reboot_notify, +}; + +/* + * After a PV feature is registered, the host will keep writing to the + * registered memory location. If the guest happens to shutdown, this memory + * won't be valid. In cases like kexec, in which you install a new kernel, this + * means a random memory location will be kept being written. + */ +#ifdef CONFIG_KEXEC_CORE +static void kvm_crash_shutdown(struct pt_regs *regs) { - u8 state; - int cpu; - struct kvm_steal_time *src; - struct cpumask *flushmask = this_cpu_cpumask_var_ptr(__pv_cpu_mask); + kvm_guest_cpu_offline(true); + native_machine_crash_shutdown(regs); +} +#endif - cpumask_copy(flushmask, cpumask); - /* - * We have to call flush only on online vCPUs. 
And - * queue flush_on_enter for pre-empted vCPUs - */ - for_each_cpu(cpu, flushmask) { - src = &per_cpu(steal_time, cpu); - state = READ_ONCE(src->preempted); - if ((state & KVM_VCPU_PREEMPTED)) { - if (try_cmpxchg(&src->preempted, &state, - state | KVM_VCPU_FLUSH_TLB)) - __cpumask_clear_cpu(cpu, flushmask); - } - } +#if defined(CONFIG_X86_32) || !defined(CONFIG_SMP) +bool __kvm_vcpu_is_preempted(long cpu); + +__visible bool __kvm_vcpu_is_preempted(long cpu) +{ + struct kvm_steal_time *src = &per_cpu(steal_time, cpu); - native_flush_tlb_others(flushmask, info); + return !!(src->preempted & KVM_VCPU_PREEMPTED); } +PV_CALLEE_SAVE_REGS_THUNK(__kvm_vcpu_is_preempted); + +#else + +#include <asm/asm-offsets.h> + +extern bool __raw_callee_save___kvm_vcpu_is_preempted(long); + +/* + * Hand-optimize version for x86-64 to avoid 8 64-bit register saving and + * restoring to/from the stack. + */ +asm( +".pushsection .text;" +".global __raw_callee_save___kvm_vcpu_is_preempted;" +".type __raw_callee_save___kvm_vcpu_is_preempted, @function;" +"__raw_callee_save___kvm_vcpu_is_preempted:" +ASM_ENDBR +"movq __per_cpu_offset(,%rdi,8), %rax;" +"cmpb $0, " __stringify(KVM_STEAL_TIME_preempted) "+steal_time(%rax);" +"setne %al;" +ASM_RET +".size __raw_callee_save___kvm_vcpu_is_preempted, .-__raw_callee_save___kvm_vcpu_is_preempted;" +".popsection"); + +#endif static void __init kvm_guest_init(void) { @@ -632,38 +821,49 @@ static void __init kvm_guest_init(void) register_reboot_notifier(&kvm_pv_reboot_nb); for (i = 0; i < KVM_TASK_SLEEP_HASHSIZE; i++) raw_spin_lock_init(&async_pf_sleepers[i].lock); - if (kvm_para_has_feature(KVM_FEATURE_ASYNC_PF)) - x86_init.irqs.trap_init = kvm_apf_trap_init; if (kvm_para_has_feature(KVM_FEATURE_STEAL_TIME)) { has_steal_clock = 1; - pv_ops.time.steal_clock = kvm_steal_clock; - } + static_call_update(pv_steal_clock, kvm_steal_clock); - if (pv_tlb_flush_supported()) { - pv_ops.mmu.flush_tlb_others = kvm_flush_tlb_others; - pv_ops.mmu.tlb_remove_table = 
tlb_remove_table; - pr_info("KVM setup pv remote TLB flush\n"); + pv_ops.lock.vcpu_is_preempted = + PV_CALLEE_SAVE(__kvm_vcpu_is_preempted); } if (kvm_para_has_feature(KVM_FEATURE_PV_EOI)) apic_set_eoi_write(kvm_guest_apic_eoi_write); + if (kvm_para_has_feature(KVM_FEATURE_ASYNC_PF_INT) && kvmapf) { + static_branch_enable(&kvm_async_pf_enabled); + alloc_intr_gate(HYPERVISOR_CALLBACK_VECTOR, asm_sysvec_kvm_asyncpf_interrupt); + } + #ifdef CONFIG_SMP - smp_ops.smp_prepare_cpus = kvm_smp_prepare_cpus; + if (pv_tlb_flush_supported()) { + pv_ops.mmu.flush_tlb_multi = kvm_flush_tlb_multi; + pv_ops.mmu.tlb_remove_table = tlb_remove_table; + pr_info("KVM setup pv remote TLB flush\n"); + } + smp_ops.smp_prepare_boot_cpu = kvm_smp_prepare_boot_cpu; if (pv_sched_yield_supported()) { smp_ops.send_call_func_ipi = kvm_smp_send_call_func_ipi; - pr_info("KVM setup pv sched yield\n"); + pr_info("setup PV sched yield\n"); } if (cpuhp_setup_state_nocalls(CPUHP_AP_ONLINE_DYN, "x86/kvm:online", kvm_cpu_online, kvm_cpu_down_prepare) < 0) - pr_err("kvm_guest: Failed to install cpu hotplug callbacks\n"); + pr_err("failed to install cpu hotplug callbacks\n"); #else sev_map_percpu_data(); kvm_guest_cpu_init(); #endif +#ifdef CONFIG_KEXEC_CORE + machine_ops.crash_shutdown = kvm_crash_shutdown; +#endif + + register_syscore_ops(&kvm_syscore_ops); + /* * Hard lockup detection is enabled by default. 
Disable it, as guests * can get false positives too easily, for example if the host is @@ -678,7 +878,7 @@ static noinline uint32_t __kvm_cpuid_base(void) return 0; /* So we don't blow up on old processors */ if (boot_cpu_has(X86_FEATURE_HYPERVISOR)) - return hypervisor_cpuid_base("KVMKVMKVM\0\0\0", 0); + return hypervisor_cpuid_base(KVM_SIGNATURE, 0); return 0; } @@ -717,25 +917,106 @@ static uint32_t __init kvm_detect(void) static void __init kvm_apic_init(void) { -#if defined(CONFIG_SMP) +#ifdef CONFIG_SMP if (pv_ipi_supported()) kvm_setup_pv_ipi(); #endif } +static bool __init kvm_msi_ext_dest_id(void) +{ + return kvm_para_has_feature(KVM_FEATURE_MSI_EXT_DEST_ID); +} + +static void kvm_sev_hc_page_enc_status(unsigned long pfn, int npages, bool enc) +{ + kvm_sev_hypercall3(KVM_HC_MAP_GPA_RANGE, pfn << PAGE_SHIFT, npages, + KVM_MAP_GPA_RANGE_ENC_STAT(enc) | KVM_MAP_GPA_RANGE_PAGE_SZ_4K); +} + static void __init kvm_init_platform(void) { + if (cc_platform_has(CC_ATTR_GUEST_MEM_ENCRYPT) && + kvm_para_has_feature(KVM_FEATURE_MIGRATION_CONTROL)) { + unsigned long nr_pages; + int i; + + pv_ops.mmu.notify_page_enc_status_changed = + kvm_sev_hc_page_enc_status; + + /* + * Reset the host's shared pages list related to kernel + * specific page encryption status settings before we load a + * new kernel by kexec. Reset the page encryption status + * during early boot intead of just before kexec to avoid SMP + * races during kvm_pv_guest_cpu_reboot(). + * NOTE: We cannot reset the complete shared pages list + * here as we need to retain the UEFI/OVMF firmware + * specific settings. 
+ */ + + for (i = 0; i < e820_table->nr_entries; i++) { + struct e820_entry *entry = &e820_table->entries[i]; + + if (entry->type != E820_TYPE_RAM) + continue; + + nr_pages = DIV_ROUND_UP(entry->size, PAGE_SIZE); + + kvm_sev_hypercall3(KVM_HC_MAP_GPA_RANGE, entry->addr, + nr_pages, + KVM_MAP_GPA_RANGE_ENCRYPTED | KVM_MAP_GPA_RANGE_PAGE_SZ_4K); + } + + /* + * Ensure that _bss_decrypted section is marked as decrypted in the + * shared pages list. + */ + nr_pages = DIV_ROUND_UP(__end_bss_decrypted - __start_bss_decrypted, + PAGE_SIZE); + early_set_mem_enc_dec_hypercall((unsigned long)__start_bss_decrypted, + nr_pages, 0); + + /* + * If not booted using EFI, enable Live migration support. + */ + if (!efi_enabled(EFI_BOOT)) + wrmsrl(MSR_KVM_MIGRATION_CONTROL, + KVM_MIGRATION_READY); + } kvmclock_init(); x86_platform.apic_post_init = kvm_apic_init; } +#if defined(CONFIG_AMD_MEM_ENCRYPT) +static void kvm_sev_es_hcall_prepare(struct ghcb *ghcb, struct pt_regs *regs) +{ + /* RAX and CPL are already in the GHCB */ + ghcb_set_rbx(ghcb, regs->bx); + ghcb_set_rcx(ghcb, regs->cx); + ghcb_set_rdx(ghcb, regs->dx); + ghcb_set_rsi(ghcb, regs->si); +} + +static bool kvm_sev_es_hcall_finish(struct ghcb *ghcb, struct pt_regs *regs) +{ + /* No checking of the return state needed */ + return true; +} +#endif + const __initconst struct hypervisor_x86 x86_hyper_kvm = { - .name = "KVM", - .detect = kvm_detect, - .type = X86_HYPER_KVM, - .init.guest_late_init = kvm_guest_init, - .init.x2apic_available = kvm_para_available, - .init.init_platform = kvm_init_platform, + .name = "KVM", + .detect = kvm_detect, + .type = X86_HYPER_KVM, + .init.guest_late_init = kvm_guest_init, + .init.x2apic_available = kvm_para_available, + .init.msi_ext_dest_id = kvm_msi_ext_dest_id, + .init.init_platform = kvm_init_platform, +#if defined(CONFIG_AMD_MEM_ENCRYPT) + .runtime.sev_es_hcall_prepare = kvm_sev_es_hcall_prepare, + .runtime.sev_es_hcall_finish = kvm_sev_es_hcall_finish, +#endif }; static __init int 
activate_jump_labels(void) @@ -750,32 +1031,6 @@ static __init int activate_jump_labels(void) } arch_initcall(activate_jump_labels); -static __init int kvm_alloc_cpumask(void) -{ - int cpu; - bool alloc = false; - - if (!kvm_para_available() || nopv) - return 0; - - if (pv_tlb_flush_supported()) - alloc = true; - -#if defined(CONFIG_SMP) - if (pv_ipi_supported()) - alloc = true; -#endif - - if (alloc) - for_each_possible_cpu(cpu) { - zalloc_cpumask_var_node(per_cpu_ptr(&__pv_cpu_mask, cpu), - GFP_KERNEL, cpu_to_node(cpu)); - } - - return 0; -} -arch_initcall(kvm_alloc_cpumask); - #ifdef CONFIG_PARAVIRT_SPINLOCKS /* Kick a cpu by its apicid. Used to wake up a halted vcpu */ @@ -792,78 +1047,63 @@ static void kvm_kick_cpu(int cpu) static void kvm_wait(u8 *ptr, u8 val) { - unsigned long flags; - if (in_nmi()) return; - local_irq_save(flags); - - if (READ_ONCE(*ptr) != val) - goto out; - /* * halt until it's our turn and kicked. Note that we do safe halt * for irq enabled case to avoid hang when lock info is overwritten * in irq spinlock slowpath and no spurious interrupt occur to save us. */ - if (arch_irqs_disabled_flags(flags)) - halt(); - else - safe_halt(); - -out: - local_irq_restore(flags); -} - -#ifdef CONFIG_X86_32 -__visible bool __kvm_vcpu_is_preempted(long cpu) -{ - struct kvm_steal_time *src = &per_cpu(steal_time, cpu); - - return !!(src->preempted & KVM_VCPU_PREEMPTED); + if (irqs_disabled()) { + if (READ_ONCE(*ptr) == val) + halt(); + } else { + local_irq_disable(); + + /* safe_halt() will enable IRQ */ + if (READ_ONCE(*ptr) == val) + safe_halt(); + else + local_irq_enable(); + } } -PV_CALLEE_SAVE_REGS_THUNK(__kvm_vcpu_is_preempted); - -#else - -#include <asm/asm-offsets.h> - -extern bool __raw_callee_save___kvm_vcpu_is_preempted(long); - -/* - * Hand-optimize version for x86-64 to avoid 8 64-bit register saving and - * restoring to/from the stack. 
- */ -asm( -".pushsection .text;" -".global __raw_callee_save___kvm_vcpu_is_preempted;" -".type __raw_callee_save___kvm_vcpu_is_preempted, @function;" -"__raw_callee_save___kvm_vcpu_is_preempted:" -"movq __per_cpu_offset(,%rdi,8), %rax;" -"cmpb $0, " __stringify(KVM_STEAL_TIME_preempted) "+steal_time(%rax);" -"setne %al;" -"ret;" -".size __raw_callee_save___kvm_vcpu_is_preempted, .-__raw_callee_save___kvm_vcpu_is_preempted;" -".popsection"); - -#endif /* * Setup pv_lock_ops to exploit KVM_FEATURE_PV_UNHALT if present. */ void __init kvm_spinlock_init(void) { - /* Does host kernel support KVM_FEATURE_PV_UNHALT? */ - if (!kvm_para_has_feature(KVM_FEATURE_PV_UNHALT)) + /* + * In case host doesn't support KVM_FEATURE_PV_UNHALT there is still an + * advantage of keeping virt_spin_lock_key enabled: virt_spin_lock() is + * preferred over native qspinlock when vCPU is preempted. + */ + if (!kvm_para_has_feature(KVM_FEATURE_PV_UNHALT)) { + pr_info("PV spinlocks disabled, no host support\n"); return; + } - if (kvm_para_has_hint(KVM_HINTS_REALTIME)) - return; + /* + * Disable PV spinlocks and use native qspinlock when dedicated pCPUs + * are available. + */ + if (kvm_para_has_hint(KVM_HINTS_REALTIME)) { + pr_info("PV spinlocks disabled with KVM_HINTS_REALTIME hints\n"); + goto out; + } - /* Don't use the pvqspinlock code if there is only 1 vCPU. 
*/ - if (num_possible_cpus() == 1) - return; + if (num_possible_cpus() == 1) { + pr_info("PV spinlocks disabled, single CPU\n"); + goto out; + } + + if (nopvspin) { + pr_info("PV spinlocks disabled, forced by \"nopvspin\" parameter\n"); + goto out; + } + + pr_info("PV spinlocks enabled\n"); __pv_init_lock_hash(); pv_ops.lock.queued_spin_lock_slowpath = __pv_queued_spin_lock_slowpath; @@ -872,10 +1112,13 @@ void __init kvm_spinlock_init(void) pv_ops.lock.wait = kvm_wait; pv_ops.lock.kick = kvm_kick_cpu; - if (kvm_para_has_feature(KVM_FEATURE_STEAL_TIME)) { - pv_ops.lock.vcpu_is_preempted = - PV_CALLEE_SAVE(__kvm_vcpu_is_preempted); - } + /* + * When PV spinlock is enabled which is preferred over + * virt_spin_lock(), virt_spin_lock_key's value is meaningless. + * Just disable it anyway. + */ +out: + static_branch_disable(&virt_spin_lock_key); } #endif /* CONFIG_PARAVIRT_SPINLOCKS */ @@ -895,8 +1138,8 @@ static void kvm_enable_host_haltpoll(void *i) void arch_haltpoll_enable(unsigned int cpu) { if (!kvm_para_has_feature(KVM_FEATURE_POLL_CONTROL)) { - pr_err_once("kvm: host does not support poll control\n"); - pr_err_once("kvm: host upgrade recommended\n"); + pr_err_once("host does not support poll control\n"); + pr_err_once("host upgrade recommended\n"); return; } @@ -910,7 +1153,7 @@ void arch_haltpoll_disable(unsigned int cpu) if (!kvm_para_has_feature(KVM_FEATURE_POLL_CONTROL)) return; - /* Enable guest halt poll disables host halt poll */ + /* Disable guest halt poll enables host halt poll */ smp_call_function_single(cpu, kvm_enable_host_haltpoll, NULL, 1); } EXPORT_SYMBOL_GPL(arch_haltpoll_disable); diff --git a/arch/x86/kernel/kvmclock.c b/arch/x86/kernel/kvmclock.c index 904494b924c1..16333ba1904b 100644 --- a/arch/x86/kernel/kvmclock.c +++ b/arch/x86/kernel/kvmclock.c @@ -16,11 +16,10 @@ #include <linux/mm.h> #include <linux/slab.h> #include <linux/set_memory.h> +#include <linux/cc_platform.h> #include <asm/hypervisor.h> -#include <asm/mem_encrypt.h> #include 
<asm/x86_init.h> -#include <asm/reboot.h> #include <asm/kvmclock.h> static int kvmclock __initdata = 1; @@ -44,25 +43,15 @@ static int __init parse_no_kvmclock_vsyscall(char *arg) early_param("no-kvmclock-vsyscall", parse_no_kvmclock_vsyscall); /* Aligned to page sizes to match whats mapped via vsyscalls to userspace */ -#define HV_CLOCK_SIZE (sizeof(struct pvclock_vsyscall_time_info) * NR_CPUS) #define HVC_BOOT_ARRAY_SIZE \ (PAGE_SIZE / sizeof(struct pvclock_vsyscall_time_info)) static struct pvclock_vsyscall_time_info hv_clock_boot[HVC_BOOT_ARRAY_SIZE] __bss_decrypted __aligned(PAGE_SIZE); static struct pvclock_wall_clock wall_clock __bss_decrypted; -static DEFINE_PER_CPU(struct pvclock_vsyscall_time_info *, hv_clock_per_cpu); static struct pvclock_vsyscall_time_info *hvclock_mem; - -static inline struct pvclock_vcpu_time_info *this_cpu_pvti(void) -{ - return &this_cpu_read(hv_clock_per_cpu)->pvti; -} - -static inline struct pvclock_vsyscall_time_info *this_cpu_hvclock(void) -{ - return this_cpu_read(hv_clock_per_cpu); -} +DEFINE_PER_CPU(struct pvclock_vsyscall_time_info *, hv_clock_per_cpu); +EXPORT_PER_CPU_SYMBOL_GPL(hv_clock_per_cpu); /* * The wallclock is the time of day when we booted. 
Since then, some time may @@ -107,7 +96,7 @@ static inline void kvm_sched_clock_init(bool stable) if (!stable) clear_sched_clock_stable(); kvm_sched_clock_offset = kvm_clock_read(); - pv_ops.time.sched_clock = kvm_sched_clock_read; + paravirt_set_sched_clock(kvm_sched_clock_read); pr_info("kvm-clock: using sched offset of %llu cycles", kvm_sched_clock_offset); @@ -159,12 +148,19 @@ bool kvm_check_and_clear_guest_paused(void) return ret; } +static int kvm_cs_enable(struct clocksource *cs) +{ + vclocks_set_used(VDSO_CLOCKMODE_PVCLOCK); + return 0; +} + struct clocksource kvm_clock = { .name = "kvm-clock", .read = kvm_clock_get_cycles, .rating = 400, .mask = CLOCKSOURCE_MASK(64), .flags = CLOCK_SOURCE_IS_CONTINUOUS, + .enable = kvm_cs_enable, }; EXPORT_SYMBOL_GPL(kvm_clock); @@ -178,7 +174,7 @@ static void kvm_register_clock(char *txt) pa = slow_virt_to_phys(&src->pvti) | 0x01ULL; wrmsrl(msr_kvm_system_time, pa); - pr_info("kvm-clock: cpu %d, msr %llx, %s", smp_processor_id(), pa, txt); + pr_debug("kvm-clock: cpu %d, msr %llx, %s", smp_processor_id(), pa, txt); } static void kvm_save_sched_clock_state(void) @@ -197,28 +193,9 @@ static void kvm_setup_secondary_clock(void) } #endif -/* - * After the clock is registered, the host will keep writing to the - * registered memory location. If the guest happens to shutdown, this memory - * won't be valid. In cases like kexec, in which you install a new kernel, this - * means a random memory location will be kept being written. 
So before any - * kind of shutdown from our side, we unregister the clock by writing anything - * that does not have the 'enable' bit set in the msr - */ -#ifdef CONFIG_KEXEC_CORE -static void kvm_crash_shutdown(struct pt_regs *regs) -{ - native_write_msr(msr_kvm_system_time, 0, 0); - kvm_disable_steal_time(); - native_machine_crash_shutdown(regs); -} -#endif - -static void kvm_shutdown(void) +void kvmclock_disable(void) { native_write_msr(msr_kvm_system_time, 0, 0); - kvm_disable_steal_time(); - native_machine_shutdown(); } static void __init kvmclock_init_mem(void) @@ -246,7 +223,7 @@ static void __init kvmclock_init_mem(void) * hvclock is shared between the guest and the hypervisor, must * be mapped decrypted. */ - if (sev_active()) { + if (cc_platform_has(CC_ATTR_GUEST_MEM_ENCRYPT)) { r = set_memory_decrypted((unsigned long) hvclock_mem, 1UL << order); if (r) { @@ -262,20 +239,22 @@ static void __init kvmclock_init_mem(void) static int __init kvm_setup_vsyscall_timeinfo(void) { -#ifdef CONFIG_X86_64 - u8 flags; - - if (!per_cpu(hv_clock_per_cpu, 0) || !kvmclock_vsyscall) + if (!kvm_para_available() || !kvmclock || nopv) return 0; - flags = pvclock_read_flags(&hv_clock_boot[0].pvti); - if (!(flags & PVCLOCK_TSC_STABLE_BIT)) - return 0; + kvmclock_init_mem(); - kvm_clock.archdata.vclock_mode = VCLOCK_PVCLOCK; -#endif +#ifdef CONFIG_X86_64 + if (per_cpu(hv_clock_per_cpu, 0) && kvmclock_vsyscall) { + u8 flags; - kvmclock_init_mem(); + flags = pvclock_read_flags(&hv_clock_boot[0].pvti); + if (!(flags & PVCLOCK_TSC_STABLE_BIT)) + return 0; + + kvm_clock.vdso_clock_mode = VDSO_CLOCKMODE_PVCLOCK; + } +#endif return 0; } @@ -346,10 +325,6 @@ void __init kvmclock_init(void) #endif x86_platform.save_sched_clock_state = kvm_save_sched_clock_state; x86_platform.restore_sched_clock_state = kvm_restore_sched_clock_state; - machine_ops.shutdown = kvm_shutdown; -#ifdef CONFIG_KEXEC_CORE - machine_ops.crash_shutdown = kvm_crash_shutdown; -#endif kvm_get_preset_lpj(); /* diff 
--git a/arch/x86/kernel/ldt.c b/arch/x86/kernel/ldt.c index c57e1ca70fd1..525876e7b9f4 100644 --- a/arch/x86/kernel/ldt.c +++ b/arch/x86/kernel/ldt.c @@ -8,7 +8,7 @@ * * Lock order: * contex.ldt_usr_sem - * mmap_sem + * mmap_lock * context.lock */ @@ -27,9 +27,10 @@ #include <asm/tlb.h> #include <asm/desc.h> #include <asm/mmu_context.h> -#include <asm/syscalls.h> #include <asm/pgtable_areas.h> +#include <xen/xen.h> + /* This is a multiple of PAGE_SIZE. */ #define LDT_SLOT_STRIDE (LDT_ENTRIES * LDT_ENTRY_SIZE) @@ -153,7 +154,7 @@ static struct ldt_struct *alloc_ldt_struct(unsigned int num_entries) if (num_entries > LDT_ENTRIES) return NULL; - new_ldt = kmalloc(sizeof(struct ldt_struct), GFP_KERNEL); + new_ldt = kmalloc(sizeof(struct ldt_struct), GFP_KERNEL_ACCOUNT); if (!new_ldt) return NULL; @@ -167,9 +168,9 @@ static struct ldt_struct *alloc_ldt_struct(unsigned int num_entries) * than PAGE_SIZE. */ if (alloc_size > PAGE_SIZE) - new_ldt->entries = vzalloc(alloc_size); + new_ldt->entries = __vmalloc(alloc_size, GFP_KERNEL_ACCOUNT | __GFP_ZERO); else - new_ldt->entries = (void *)get_zeroed_page(GFP_KERNEL); + new_ldt->entries = (void *)get_zeroed_page(GFP_KERNEL_ACCOUNT); if (!new_ldt->entries) { kfree(new_ldt); @@ -397,9 +398,15 @@ static void free_ldt_pgtables(struct mm_struct *mm) if (!boot_cpu_has(X86_FEATURE_PTI)) return; - tlb_gather_mmu(&tlb, mm, start, end); + /* + * Although free_pgd_range() is intended for freeing user + * page-tables, it also works out for kernel mappings on x86. + * We use tlb_gather_mmu_fullmm() to avoid confusing the + * range-tracking logic in __tlb_adjust_range(). 
+ */ + tlb_gather_mmu_fullmm(&tlb, mm); free_pgd_range(&tlb, start, end, start, end); - tlb_finish_mmu(&tlb, start, end); + tlb_finish_mmu(&tlb); #endif } @@ -544,6 +551,28 @@ static int read_default_ldt(void __user *ptr, unsigned long bytecount) return bytecount; } +static bool allow_16bit_segments(void) +{ + if (!IS_ENABLED(CONFIG_X86_16BIT)) + return false; + +#ifdef CONFIG_XEN_PV + /* + * Xen PV does not implement ESPFIX64, which means that 16-bit + * segments will not work correctly. Until either Xen PV implements + * ESPFIX64 and can signal this fact to the guest or unless someone + * provides compelling evidence that allowing broken 16-bit segments + * is worthwhile, disallow 16-bit segments under Xen PV. + */ + if (xen_pv_domain()) { + pr_info_once("Warning: 16-bit segments do not work correctly in a Xen PV guest\n"); + return false; + } +#endif + + return true; +} + static int write_ldt(void __user *ptr, unsigned long bytecount, int oldmode) { struct mm_struct *mm = current->mm; @@ -575,7 +604,7 @@ static int write_ldt(void __user *ptr, unsigned long bytecount, int oldmode) /* The user wants to clear the entry. */ memset(&ldt, 0, sizeof(ldt)); } else { - if (!IS_ENABLED(CONFIG_X86_16BIT) && !ldt_info.seg_32bit) { + if (!ldt_info.seg_32bit && !allow_16bit_segments()) { error = -EINVAL; goto out; } diff --git a/arch/x86/kernel/livepatch.c b/arch/x86/kernel/livepatch.c deleted file mode 100644 index 6a68e41206e7..000000000000 --- a/arch/x86/kernel/livepatch.c +++ /dev/null @@ -1,53 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0-or-later -/* - * livepatch.c - x86-specific Kernel Live Patching Core - */ - -#include <linux/module.h> -#include <linux/kallsyms.h> -#include <linux/livepatch.h> -#include <asm/text-patching.h> - -/* Apply per-object alternatives. 
Based on x86 module_finalize() */ -void arch_klp_init_object_loaded(struct klp_patch *patch, - struct klp_object *obj) -{ - int cnt; - struct klp_modinfo *info; - Elf_Shdr *s, *alt = NULL, *para = NULL; - void *aseg, *pseg; - const char *objname; - char sec_objname[MODULE_NAME_LEN]; - char secname[KSYM_NAME_LEN]; - - info = patch->mod->klp_info; - objname = obj->name ? obj->name : "vmlinux"; - - /* See livepatch core code for BUILD_BUG_ON() explanation */ - BUILD_BUG_ON(MODULE_NAME_LEN < 56 || KSYM_NAME_LEN != 128); - - for (s = info->sechdrs; s < info->sechdrs + info->hdr.e_shnum; s++) { - /* Apply per-object .klp.arch sections */ - cnt = sscanf(info->secstrings + s->sh_name, - ".klp.arch.%55[^.].%127s", - sec_objname, secname); - if (cnt != 2) - continue; - if (strcmp(sec_objname, objname)) - continue; - if (!strcmp(".altinstructions", secname)) - alt = s; - if (!strcmp(".parainstructions", secname)) - para = s; - } - - if (alt) { - aseg = (void *) alt->sh_addr; - apply_alternatives(aseg, aseg + alt->sh_size); - } - - if (para) { - pseg = (void *) para->sh_addr; - apply_paravirt(pseg, pseg + para->sh_size); - } -} diff --git a/arch/x86/kernel/machine_kexec_32.c b/arch/x86/kernel/machine_kexec_32.c index 02bddfc122a4..1b373d79cedc 100644 --- a/arch/x86/kernel/machine_kexec_32.c +++ b/arch/x86/kernel/machine_kexec_32.c @@ -13,7 +13,6 @@ #include <linux/gfp.h> #include <linux/io.h> -#include <asm/pgtable.h> #include <asm/pgalloc.h> #include <asm/tlbflush.h> #include <asm/mmu_context.h> @@ -24,17 +23,6 @@ #include <asm/set_memory.h> #include <asm/debugreg.h> -static void set_gdt(void *newgdt, __u16 limit) -{ - struct desc_ptr curgdt; - - /* ia32 supports unaligned loads & stores */ - curgdt.size = limit; - curgdt.address = (unsigned long)newgdt; - - load_gdt(&curgdt); -} - static void load_segments(void) { #define __STR(X) #X @@ -233,8 +221,8 @@ void machine_kexec(struct kimage *image) * The gdt & idt are now invalid. 
* If you want to load them you must set up your own idt & gdt. */ - idt_invalidate(phys_to_virt(0)); - set_gdt(phys_to_virt(0), 0); + native_idt_invalidate(); + native_gdt_invalidate(); /* now call it */ image->start = relocate_kernel_ptr((unsigned long)image->head, diff --git a/arch/x86/kernel/machine_kexec_64.c b/arch/x86/kernel/machine_kexec_64.c index ad5cdd6a5f23..0611fd83858e 100644 --- a/arch/x86/kernel/machine_kexec_64.c +++ b/arch/x86/kernel/machine_kexec_64.c @@ -17,9 +17,9 @@ #include <linux/suspend.h> #include <linux/vmalloc.h> #include <linux/efi.h> +#include <linux/cc_platform.h> #include <asm/init.h> -#include <asm/pgtable.h> #include <asm/tlbflush.h> #include <asm/mmu_context.h> #include <asm/io_apic.h> @@ -27,6 +27,7 @@ #include <asm/kexec-bzimage64.h> #include <asm/setup.h> #include <asm/set_memory.h> +#include <asm/cpu.h> #ifdef CONFIG_ACPI /* @@ -167,7 +168,7 @@ static int init_transition_pgtable(struct kimage *image, pgd_t *pgd) } pte = pte_offset_kernel(pmd, vaddr); - if (sev_active()) + if (cc_platform_has(CC_ATTR_GUEST_MEM_ENCRYPT)) prot = PAGE_KERNEL_EXEC; set_pte(pte, pfn_pte(paddr >> PAGE_SHIFT, prot)); @@ -207,7 +208,7 @@ static int init_pgtable(struct kimage *image, unsigned long start_pgtable) level4p = (pgd_t *)__va(start_pgtable); clear_page(level4p); - if (sev_active()) { + if (cc_platform_has(CC_ATTR_GUEST_MEM_ENCRYPT)) { info.page_flag |= _PAGE_ENC; info.kernpg_flag |= _PAGE_ENC; } @@ -257,35 +258,6 @@ static int init_pgtable(struct kimage *image, unsigned long start_pgtable) return init_transition_pgtable(image, level4p); } -static void set_idt(void *newidt, u16 limit) -{ - struct desc_ptr curidt; - - /* x86-64 supports unaliged loads & stores */ - curidt.size = limit; - curidt.address = (unsigned long)newidt; - - __asm__ __volatile__ ( - "lidtq %0\n" - : : "m" (curidt) - ); -}; - - -static void set_gdt(void *newgdt, u16 limit) -{ - struct desc_ptr curgdt; - - /* x86-64 supports unaligned loads & stores */ - curgdt.size = limit; 
- curgdt.address = (unsigned long)newgdt; - - __asm__ __volatile__ ( - "lgdtq %0\n" - : : "m" (curgdt) - ); -}; - static void load_segments(void) { __asm__ __volatile__ ( @@ -339,6 +311,7 @@ void machine_kexec(struct kimage *image) /* Interrupts aren't acceptable while we reboot */ local_irq_disable(); hw_breakpoint_disable(); + cet_disable(); if (image->preserve_context) { #ifdef CONFIG_X86_IO_APIC @@ -354,7 +327,7 @@ void machine_kexec(struct kimage *image) } control_page = page_address(image->control_code_page) + PAGE_SIZE; - memcpy(control_page, relocate_kernel, KEXEC_CONTROL_CODE_MAX_SIZE); + __memcpy(control_page, relocate_kernel, KEXEC_CONTROL_CODE_MAX_SIZE); page_list[PA_CONTROL_PAGE] = virt_to_phys(control_page); page_list[VA_CONTROL_PAGE] = (unsigned long)control_page; @@ -380,15 +353,15 @@ void machine_kexec(struct kimage *image) * The gdt & idt are now invalid. * If you want to load them you must set up your own idt & gdt. */ - set_gdt(phys_to_virt(0), 0); - set_idt(phys_to_virt(0), 0); + native_idt_invalidate(); + native_gdt_invalidate(); /* now call it */ image->start = relocate_kernel((unsigned long)image->head, (unsigned long)page_list, image->start, image->preserve_context, - sme_active()); + cc_platform_has(CC_ATTR_HOST_MEM_ENCRYPT)); #ifdef CONFIG_KEXEC_JUMP if (image->preserve_context) @@ -403,9 +376,6 @@ void machine_kexec(struct kimage *image) #ifdef CONFIG_KEXEC_FILE void *arch_kexec_kernel_image_load(struct kimage *image) { - vfree(image->arch.elf_headers); - image->arch.elf_headers = NULL; - if (!image->fops || !image->fops->load) return ERR_PTR(-ENOEXEC); @@ -541,6 +511,15 @@ overflow: (int)ELF64_R_TYPE(rel[i].r_info), value); return -ENOEXEC; } + +int arch_kimage_file_post_load_cleanup(struct kimage *image) +{ + vfree(image->elf_headers); + image->elf_headers = NULL; + image->elf_headers_sz = 0; + + return kexec_image_post_load_cleanup_default(image); +} #endif /* CONFIG_KEXEC_FILE */ static int @@ -599,12 +578,12 @@ void 
arch_kexec_unprotect_crashkres(void) */ int arch_kexec_post_alloc_pages(void *vaddr, unsigned int pages, gfp_t gfp) { - if (sev_active()) + if (!cc_platform_has(CC_ATTR_HOST_MEM_ENCRYPT)) return 0; /* - * If SME is active we need to be sure that kexec pages are - * not encrypted because when we boot to the new kernel the + * If host memory encryption is active we need to be sure that kexec + * pages are not encrypted because when we boot to the new kernel the * pages won't be accessed encrypted (initially). */ return set_memory_decrypted((unsigned long)vaddr, pages); @@ -612,12 +591,12 @@ int arch_kexec_post_alloc_pages(void *vaddr, unsigned int pages, gfp_t gfp) void arch_kexec_pre_free_pages(void *vaddr, unsigned int pages) { - if (sev_active()) + if (!cc_platform_has(CC_ATTR_HOST_MEM_ENCRYPT)) return; /* - * If SME is active we need to reset the pages back to being - * an encrypted mapping before freeing them. + * If host memory encryption is active we need to reset the pages back + * to being an encrypted mapping before freeing them. */ set_memory_encrypted((unsigned long)vaddr, pages); } diff --git a/arch/x86/kernel/mmconf-fam10h_64.c b/arch/x86/kernel/mmconf-fam10h_64.c index b5cb49e57df8..c94dec6a1834 100644 --- a/arch/x86/kernel/mmconf-fam10h_64.c +++ b/arch/x86/kernel/mmconf-fam10h_64.c @@ -95,7 +95,7 @@ static void get_fam10h_pci_mmconf_base(void) return; /* SYS_CFG */ - address = MSR_K8_SYSCFG; + address = MSR_AMD64_SYSCFG; rdmsrl(address, val); /* TOP_MEM2 is not enabled? 
*/ diff --git a/arch/x86/kernel/module.c b/arch/x86/kernel/module.c index d5c72cb877b3..c032edcd3d95 100644 --- a/arch/x86/kernel/module.c +++ b/arch/x86/kernel/module.c @@ -18,10 +18,10 @@ #include <linux/gfp.h> #include <linux/jump_label.h> #include <linux/random.h> +#include <linux/memory.h> #include <asm/text-patching.h> #include <asm/page.h> -#include <asm/pgtable.h> #include <asm/setup.h> #include <asm/unwind.h> @@ -53,7 +53,7 @@ static unsigned long int get_module_load_offset(void) */ if (module_load_offset == 0) module_load_offset = - (get_random_int() % 1024 + 1) * PAGE_SIZE; + (prandom_u32_max(1024) + 1) * PAGE_SIZE; mutex_unlock(&module_kaslr_mutex); } return module_load_offset; @@ -67,6 +67,7 @@ static unsigned long int get_module_load_offset(void) void *module_alloc(unsigned long size) { + gfp_t gfp_mask = GFP_KERNEL; void *p; if (PAGE_ALIGN(size) > MODULES_LEN) @@ -74,10 +75,10 @@ void *module_alloc(unsigned long size) p = __vmalloc_node_range(size, MODULE_ALIGN, MODULES_VADDR + get_module_load_offset(), - MODULES_END, GFP_KERNEL, - PAGE_KERNEL, 0, NUMA_NO_NODE, + MODULES_END, gfp_mask, + PAGE_KERNEL, VM_DEFER_KMEMLEAK, NUMA_NO_NODE, __builtin_return_address(0)); - if (p && (kasan_module_alloc(p, size) < 0)) { + if (p && (kasan_alloc_module_shadow(p, size, gfp_mask) < 0)) { vfree(p); return NULL; } @@ -114,6 +115,7 @@ int apply_relocate(Elf32_Shdr *sechdrs, *location += sym->st_value; break; case R_386_PC32: + case R_386_PLT32: /* Add the value, subtract its position */ *location += sym->st_value - (uint32_t)location; break; @@ -126,11 +128,12 @@ int apply_relocate(Elf32_Shdr *sechdrs, return 0; } #else /*X86_64*/ -int apply_relocate_add(Elf64_Shdr *sechdrs, +static int __apply_relocate_add(Elf64_Shdr *sechdrs, const char *strtab, unsigned int symindex, unsigned int relsec, - struct module *me) + struct module *me, + void *(*write)(void *dest, const void *src, size_t len)) { unsigned int i; Elf64_Rela *rel = (void *)sechdrs[relsec].sh_addr; @@ -162,19 
+165,19 @@ int apply_relocate_add(Elf64_Shdr *sechdrs, case R_X86_64_64: if (*(u64 *)loc != 0) goto invalid_relocation; - *(u64 *)loc = val; + write(loc, &val, 8); break; case R_X86_64_32: if (*(u32 *)loc != 0) goto invalid_relocation; - *(u32 *)loc = val; + write(loc, &val, 4); if (val != *(u32 *)loc) goto overflow; break; case R_X86_64_32S: if (*(s32 *)loc != 0) goto invalid_relocation; - *(s32 *)loc = val; + write(loc, &val, 4); if ((s64)val != *(s32 *)loc) goto overflow; break; @@ -183,7 +186,7 @@ int apply_relocate_add(Elf64_Shdr *sechdrs, if (*(u32 *)loc != 0) goto invalid_relocation; val -= (u64)loc; - *(u32 *)loc = val; + write(loc, &val, 4); #if 0 if ((s64)val != *(s32 *)loc) goto overflow; @@ -193,7 +196,7 @@ int apply_relocate_add(Elf64_Shdr *sechdrs, if (*(u64 *)loc != 0) goto invalid_relocation; val -= (u64)loc; - *(u64 *)loc = val; + write(loc, &val, 8); break; default: pr_err("%s: Unknown rela relocation: %llu\n", @@ -215,6 +218,33 @@ overflow: me->name); return -ENOEXEC; } + +int apply_relocate_add(Elf64_Shdr *sechdrs, + const char *strtab, + unsigned int symindex, + unsigned int relsec, + struct module *me) +{ + int ret; + bool early = me->state == MODULE_STATE_UNFORMED; + void *(*write)(void *, const void *, size_t) = memcpy; + + if (!early) { + write = text_poke; + mutex_lock(&text_mutex); + } + + ret = __apply_relocate_add(sechdrs, strtab, symindex, relsec, me, + write); + + if (!early) { + text_poke_sync(); + mutex_unlock(&text_mutex); + } + + return ret; +} + #endif int module_finalize(const Elf_Ehdr *hdr, @@ -222,7 +252,8 @@ int module_finalize(const Elf_Ehdr *hdr, struct module *me) { const Elf_Shdr *s, *text = NULL, *alt = NULL, *locks = NULL, - *para = NULL, *orc = NULL, *orc_ip = NULL; + *para = NULL, *orc = NULL, *orc_ip = NULL, + *retpolines = NULL, *returns = NULL, *ibt_endbr = NULL; char *secstrings = (void *)hdr + sechdrs[hdr->e_shstrndx].sh_offset; for (s = sechdrs; s < sechdrs + hdr->e_shnum; s++) { @@ -238,13 +269,39 @@ int 
module_finalize(const Elf_Ehdr *hdr, orc = s; if (!strcmp(".orc_unwind_ip", secstrings + s->sh_name)) orc_ip = s; + if (!strcmp(".retpoline_sites", secstrings + s->sh_name)) + retpolines = s; + if (!strcmp(".return_sites", secstrings + s->sh_name)) + returns = s; + if (!strcmp(".ibt_endbr_seal", secstrings + s->sh_name)) + ibt_endbr = s; } + /* + * See alternative_instructions() for the ordering rules between the + * various patching types. + */ + if (para) { + void *pseg = (void *)para->sh_addr; + apply_paravirt(pseg, pseg + para->sh_size); + } + if (retpolines) { + void *rseg = (void *)retpolines->sh_addr; + apply_retpolines(rseg, rseg + retpolines->sh_size); + } + if (returns) { + void *rseg = (void *)returns->sh_addr; + apply_returns(rseg, rseg + returns->sh_size); + } if (alt) { /* patch .altinstructions */ void *aseg = (void *)alt->sh_addr; apply_alternatives(aseg, aseg + alt->sh_size); } + if (ibt_endbr) { + void *iseg = (void *)ibt_endbr->sh_addr; + apply_ibt_endbr(iseg, iseg + ibt_endbr->sh_size); + } if (locks && text) { void *lseg = (void *)locks->sh_addr; void *tseg = (void *)text->sh_addr; @@ -253,14 +310,6 @@ int module_finalize(const Elf_Ehdr *hdr, tseg, tseg + text->sh_size); } - if (para) { - void *pseg = (void *)para->sh_addr; - apply_paravirt(pseg, pseg + para->sh_size); - } - - /* make jump label nops */ - jump_label_apply_nops(me); - if (orc && orc_ip) unwind_module_init(me, (void *)orc_ip->sh_addr, orc_ip->sh_size, (void *)orc->sh_addr, orc->sh_size); diff --git a/arch/x86/kernel/mpparse.c b/arch/x86/kernel/mpparse.c index afac7ccce72f..fed721f90116 100644 --- a/arch/x86/kernel/mpparse.c +++ b/arch/x86/kernel/mpparse.c @@ -19,11 +19,12 @@ #include <linux/smp.h> #include <linux/pci.h> +#include <asm/i8259.h> +#include <asm/io_apic.h> +#include <asm/acpi.h> #include <asm/irqdomain.h> #include <asm/mtrr.h> #include <asm/mpspec.h> -#include <asm/pgalloc.h> -#include <asm/io_apic.h> #include <asm/proto.h> #include <asm/bios_ebda.h> #include 
<asm/e820/api.h> @@ -45,11 +46,6 @@ static int __init mpf_checksum(unsigned char *mp, int len) return sum & 0xFF; } -int __init default_mpc_apic_id(struct mpc_cpu *m) -{ - return m->apicid; -} - static void __init MP_processor_info(struct mpc_cpu *m) { int apicid; @@ -60,7 +56,7 @@ static void __init MP_processor_info(struct mpc_cpu *m) return; } - apicid = x86_init.mpparse.mpc_apic_id(m); + apicid = m->apicid; if (m->cpuflag & CPU_BOOTPROCESSOR) { bootup_cpu = " (Bootup-CPU)"; @@ -72,7 +68,7 @@ static void __init MP_processor_info(struct mpc_cpu *m) } #ifdef CONFIG_X86_IO_APIC -void __init default_mpc_oem_bus_info(struct mpc_bus *m, char *str) +static void __init mpc_oem_bus_info(struct mpc_bus *m, char *str) { memcpy(str, m->bustype, 6); str[6] = 0; @@ -83,7 +79,7 @@ static void __init MP_bus_info(struct mpc_bus *m) { char str[7]; - x86_init.mpparse.mpc_oem_bus_info(m, str); + mpc_oem_bus_info(m, str); #if MAX_MP_BUSSES < 256 if (m->busid >= MAX_MP_BUSSES) { @@ -99,9 +95,6 @@ static void __init MP_bus_info(struct mpc_bus *m) mp_bus_id_to_type[m->busid] = MP_BUS_ISA; #endif } else if (strncmp(str, BUSTYPE_PCI, sizeof(BUSTYPE_PCI) - 1) == 0) { - if (x86_init.mpparse.mpc_oem_pci_bus) - x86_init.mpparse.mpc_oem_pci_bus(m); - clear_bit(m->busid, mp_bus_not_pci); #ifdef CONFIG_EISA mp_bus_id_to_type[m->busid] = MP_BUS_PCI; @@ -197,8 +190,6 @@ static void __init smp_dump_mptable(struct mpc_table *mpc, unsigned char *mpt) 1, mpc, mpc->length, 1); } -void __init default_smp_read_mpc_oem(struct mpc_table *mpc) { } - static int __init smp_read_mpc(struct mpc_table *mpc, unsigned early) { char str[16]; @@ -217,14 +208,7 @@ static int __init smp_read_mpc(struct mpc_table *mpc, unsigned early) if (early) return 1; - if (mpc->oemptr) - x86_init.mpparse.smp_read_mpc_oem(mpc); - - /* - * Now process the configuration blocks. - */ - x86_init.mpparse.mpc_record(0); - + /* Now process the configuration blocks. 
*/ while (count < mpc->length) { switch (*mpt) { case MP_PROCESSOR: @@ -255,7 +239,6 @@ static int __init smp_read_mpc(struct mpc_table *mpc, unsigned early) count = mpc->length; break; } - x86_init.mpparse.mpc_record(1); } if (!num_processors) @@ -269,7 +252,7 @@ static int __init ELCR_trigger(unsigned int irq) { unsigned int port; - port = 0x4d0 + (irq >> 3); + port = PIC_ELCR1 + (irq >> 3); return (inb(port) >> (irq & 7)) & 1; } @@ -311,7 +294,7 @@ static void __init construct_default_ioirq_mptable(int mpc_default_type) case 2: if (i == 0 || i == 13) continue; /* IRQ0 & IRQ13 not connected */ - /* fall through */ + fallthrough; default: if (i == 2) continue; /* IRQ2 is never connected */ @@ -355,7 +338,7 @@ static void __init construct_ioapic_table(int mpc_default_type) default: pr_err("???\nUnknown standard configuration %d\n", mpc_default_type); - /* fall through */ + fallthrough; case 1: case 5: memcpy(bus.bustype, "ISA ", 6); diff --git a/arch/x86/kernel/msr.c b/arch/x86/kernel/msr.c index 1547be359d7f..ed8ac6bcbafb 100644 --- a/arch/x86/kernel/msr.c +++ b/arch/x86/kernel/msr.c @@ -42,6 +42,14 @@ static struct class *msr_class; static enum cpuhp_state cpuhp_msr_state; +enum allow_write_msrs { + MSR_WRITES_ON, + MSR_WRITES_OFF, + MSR_WRITES_DEFAULT, +}; + +static enum allow_write_msrs allow_writes = MSR_WRITES_DEFAULT; + static ssize_t msr_read(struct file *file, char __user *buf, size_t count, loff_t *ppos) { @@ -70,6 +78,34 @@ static ssize_t msr_read(struct file *file, char __user *buf, return bytes ? bytes : err; } +static int filter_write(u32 reg) +{ + /* + * MSRs writes usually happen all at once, and can easily saturate kmsg. + * Only allow one message every 30 seconds. + * + * It's possible to be smarter here and do it (for example) per-MSR, but + * it would certainly be more complex, and this is enough at least to + * avoid saturating the ring buffer. 
+ */ + static DEFINE_RATELIMIT_STATE(fw_rs, 30 * HZ, 1); + + switch (allow_writes) { + case MSR_WRITES_ON: return 0; + case MSR_WRITES_OFF: return -EPERM; + default: break; + } + + if (!__ratelimit(&fw_rs)) + return 0; + + pr_warn("Write to unrecognized MSR 0x%x by %s (pid: %d).\n", + reg, current->comm, current->pid); + pr_warn("See https://git.kernel.org/pub/scm/linux/kernel/git/tip/tip.git/about for details.\n"); + + return 0; +} + static ssize_t msr_write(struct file *file, const char __user *buf, size_t count, loff_t *ppos) { @@ -84,6 +120,10 @@ static ssize_t msr_write(struct file *file, const char __user *buf, if (err) return err; + err = filter_write(reg); + if (err) + return err; + if (count % 8) return -EINVAL; /* Invalid chunk size */ @@ -92,9 +132,13 @@ static ssize_t msr_write(struct file *file, const char __user *buf, err = -EFAULT; break; } + + add_taint(TAINT_CPU_OUT_OF_SPEC, LOCKDEP_STILL_OK); + err = wrmsr_safe_on_cpu(cpu, reg, data[0], data[1]); if (err) break; + tmp += 2; bytes += 8; } @@ -138,6 +182,13 @@ static long msr_ioctl(struct file *file, unsigned int ioc, unsigned long arg) err = security_locked_down(LOCKDOWN_MSR); if (err) break; + + err = filter_write(regs[1]); + if (err) + return err; + + add_taint(TAINT_CPU_OUT_OF_SPEC, LOCKDEP_STILL_OK); + err = wrmsr_safe_regs_on_cpu(cpu, regs); if (err) break; @@ -242,6 +293,41 @@ static void __exit msr_exit(void) } module_exit(msr_exit) +static int set_allow_writes(const char *val, const struct kernel_param *cp) +{ + /* val is NUL-terminated, see kernfs_fop_write() */ + char *s = strstrip((char *)val); + + if (!strcmp(s, "on")) + allow_writes = MSR_WRITES_ON; + else if (!strcmp(s, "off")) + allow_writes = MSR_WRITES_OFF; + else + allow_writes = MSR_WRITES_DEFAULT; + + return 0; +} + +static int get_allow_writes(char *buf, const struct kernel_param *kp) +{ + const char *res; + + switch (allow_writes) { + case MSR_WRITES_ON: res = "on"; break; + case MSR_WRITES_OFF: res = "off"; break; + default: 
res = "default"; break; + } + + return sprintf(buf, "%s\n", res); +} + +static const struct kernel_param_ops allow_writes_ops = { + .set = set_allow_writes, + .get = get_allow_writes +}; + +module_param_cb(allow_writes, &allow_writes_ops, NULL, 0600); + MODULE_AUTHOR("H. Peter Anvin <hpa@zytor.com>"); MODULE_DESCRIPTION("x86 generic MSR driver"); MODULE_LICENSE("GPL"); diff --git a/arch/x86/kernel/nmi.c b/arch/x86/kernel/nmi.c index 54c21d6abd5a..cec0bfa3bc04 100644 --- a/arch/x86/kernel/nmi.c +++ b/arch/x86/kernel/nmi.c @@ -25,10 +25,6 @@ #include <linux/atomic.h> #include <linux/sched/clock.h> -#if defined(CONFIG_EDAC) -#include <linux/edac.h> -#endif - #include <asm/cpu_entry_area.h> #include <asm/traps.h> #include <asm/mach_traps.h> @@ -37,6 +33,7 @@ #include <asm/reboot.h> #include <asm/cache.h> #include <asm/nospec-branch.h> +#include <asm/sev.h> #define CREATE_TRACE_POINTS #include <trace/events/nmi.h> @@ -106,7 +103,6 @@ fs_initcall(nmi_warning_debugfs); static void nmi_check_duration(struct nmiaction *action, u64 duration) { - u64 whole_msecs = READ_ONCE(action->max_duration); int remainder_ns, decimal_msecs; if (duration < nmi_longest_ns || duration < action->max_duration) @@ -114,12 +110,12 @@ static void nmi_check_duration(struct nmiaction *action, u64 duration) action->max_duration = duration; - remainder_ns = do_div(whole_msecs, (1000 * 1000)); + remainder_ns = do_div(duration, (1000 * 1000)); decimal_msecs = remainder_ns / 1000; printk_ratelimited(KERN_INFO "INFO: NMI handler (%ps) took too long to run: %lld.%03d msecs\n", - action->handler, whole_msecs, decimal_msecs); + action->handler, duration, decimal_msecs); } static int nmi_handle(unsigned int type, struct pt_regs *regs) @@ -161,7 +157,7 @@ int __register_nmi_handler(unsigned int type, struct nmiaction *action) struct nmi_desc *desc = nmi_to_desc(type); unsigned long flags; - if (!action->handler) + if (WARN_ON_ONCE(!action->handler || !list_empty(&action->list))) return -EINVAL; 
raw_spin_lock_irqsave(&desc->lock, flags); @@ -181,7 +177,7 @@ int __register_nmi_handler(unsigned int type, struct nmiaction *action) list_add_rcu(&action->list, &desc->head); else list_add_tail_rcu(&action->list, &desc->head); - + raw_spin_unlock_irqrestore(&desc->lock, flags); return 0; } @@ -190,7 +186,7 @@ EXPORT_SYMBOL(__register_nmi_handler); void unregister_nmi_handler(unsigned int type, const char *name) { struct nmi_desc *desc = nmi_to_desc(type); - struct nmiaction *n; + struct nmiaction *n, *found = NULL; unsigned long flags; raw_spin_lock_irqsave(&desc->lock, flags); @@ -204,12 +200,16 @@ void unregister_nmi_handler(unsigned int type, const char *name) WARN(in_nmi(), "Trying to free NMI (%s) from NMI context!\n", n->name); list_del_rcu(&n->list); + found = n; break; } } raw_spin_unlock_irqrestore(&desc->lock, flags); - synchronize_rcu(); + if (found) { + synchronize_rcu(); + INIT_LIST_HEAD(&found->list); + } } EXPORT_SYMBOL_GPL(unregister_nmi_handler); @@ -296,7 +296,6 @@ unknown_nmi_error(unsigned char reason, struct pt_regs *regs) pr_emerg("Uhhuh. 
NMI received for unknown reason %02x on CPU %d.\n", reason, smp_processor_id()); - pr_emerg("Do you have a strange power saving mode enabled?\n"); if (unknown_nmi_panic || panic_on_unrecovered_nmi) nmi_panic(regs, "NMI: Not continuing"); @@ -307,7 +306,7 @@ NOKPROBE_SYMBOL(unknown_nmi_error); static DEFINE_PER_CPU(bool, swallow_nmi); static DEFINE_PER_CPU(unsigned long, last_nmi_rip); -static void default_do_nmi(struct pt_regs *regs) +static noinstr void default_do_nmi(struct pt_regs *regs) { unsigned char reason = 0; int handled; @@ -333,6 +332,8 @@ static void default_do_nmi(struct pt_regs *regs) __this_cpu_write(last_nmi_rip, regs->ip); + instrumentation_begin(); + handled = nmi_handle(NMI_LOCAL, regs); __this_cpu_add(nmi_stats.normal, handled); if (handled) { @@ -346,7 +347,7 @@ static void default_do_nmi(struct pt_regs *regs) */ if (handled > 1) __this_cpu_write(swallow_nmi, true); - return; + goto out; } /* @@ -378,7 +379,7 @@ static void default_do_nmi(struct pt_regs *regs) #endif __this_cpu_add(nmi_stats.external, 1); raw_spin_unlock(&nmi_reason_lock); - return; + goto out; } raw_spin_unlock(&nmi_reason_lock); @@ -403,9 +404,9 @@ static void default_do_nmi(struct pt_regs *regs) * a 'real' unknown NMI. For example, while processing * a perf NMI another perf NMI comes in along with a * 'real' unknown NMI. These two NMIs get combined into - * one (as descibed above). When the next NMI gets + * one (as described above). When the next NMI gets * processed, it will be flagged by perf as handled, but - * noone will know that there was a 'real' unknown NMI sent + * no one will know that there was a 'real' unknown NMI sent * also. As a result it gets swallowed. 
Or if the first * perf NMI returns two events handled then the second * NMI will get eaten by the logic below, again losing a @@ -416,8 +417,10 @@ static void default_do_nmi(struct pt_regs *regs) __this_cpu_add(nmi_stats.swallow, 1); else unknown_nmi_error(reason, regs); + +out: + instrumentation_end(); } -NOKPROBE_SYMBOL(default_do_nmi); /* * NMIs can page fault or hit breakpoints which will cause it to lose @@ -471,46 +474,19 @@ enum nmi_states { }; static DEFINE_PER_CPU(enum nmi_states, nmi_state); static DEFINE_PER_CPU(unsigned long, nmi_cr2); +static DEFINE_PER_CPU(unsigned long, nmi_dr7); -#ifdef CONFIG_X86_64 -/* - * In x86_64, we need to handle breakpoint -> NMI -> breakpoint. Without - * some care, the inner breakpoint will clobber the outer breakpoint's - * stack. - * - * If a breakpoint is being processed, and the debug stack is being - * used, if an NMI comes in and also hits a breakpoint, the stack - * pointer will be set to the same fixed address as the breakpoint that - * was interrupted, causing that stack to be corrupted. To handle this - * case, check if the stack that was interrupted is the debug stack, and - * if so, change the IDT so that new breakpoints will use the current - * stack and not switch to the fixed address. On return of the NMI, - * switch back to the original IDT. - */ -static DEFINE_PER_CPU(int, update_debug_stack); - -static bool notrace is_debug_stack(unsigned long addr) +DEFINE_IDTENTRY_RAW(exc_nmi) { - struct cea_exception_stacks *cs = __this_cpu_read(cea_exception_stacks); - unsigned long top = CEA_ESTACK_TOP(cs, DB); - unsigned long bot = CEA_ESTACK_BOT(cs, DB1); + irqentry_state_t irq_state; - if (__this_cpu_read(debug_stack_usage)) - return true; /* - * Note, this covers the guard page between DB and DB1 as well to - * avoid two checks. But by all means @addr can never point into - * the guard page. + * Re-enable NMIs right here when running as an SEV-ES guest. 
This might + * cause nested NMIs, but those can be handled safely. */ - return addr >= bot && addr < top; -} -NOKPROBE_SYMBOL(is_debug_stack); -#endif + sev_es_nmi_complete(); -dotraplinkage notrace void -do_nmi(struct pt_regs *regs, long error_code) -{ - if (IS_ENABLED(CONFIG_SMP) && cpu_is_offline(smp_processor_id())) + if (IS_ENABLED(CONFIG_SMP) && arch_cpu_is_offline(smp_processor_id())) return; if (this_cpu_read(nmi_state) != NMI_NOT_RUNNING) { @@ -521,34 +497,26 @@ do_nmi(struct pt_regs *regs, long error_code) this_cpu_write(nmi_cr2, read_cr2()); nmi_restart: -#ifdef CONFIG_X86_64 /* - * If we interrupted a breakpoint, it is possible that - * the nmi handler will have breakpoints too. We need to - * change the IDT such that breakpoints that happen here - * continue to use the NMI stack. + * Needs to happen before DR7 is accessed, because the hypervisor can + * intercept DR7 reads/writes, turning those into #VC exceptions. */ - if (unlikely(is_debug_stack(regs->sp))) { - debug_stack_set_zero(); - this_cpu_write(update_debug_stack, 1); - } -#endif + sev_es_ist_enter(regs); - nmi_enter(); + this_cpu_write(nmi_dr7, local_db_save()); + + irq_state = irqentry_nmi_enter(regs); inc_irq_stat(__nmi_count); if (!ignore_nmis) default_do_nmi(regs); - nmi_exit(); + irqentry_nmi_exit(regs, irq_state); -#ifdef CONFIG_X86_64 - if (unlikely(this_cpu_read(update_debug_stack))) { - debug_stack_reset(); - this_cpu_write(update_debug_stack, 0); - } -#endif + local_db_restore(this_cpu_read(nmi_dr7)); + + sev_es_ist_exit(); if (unlikely(this_cpu_read(nmi_cr2) != read_cr2())) write_cr2(this_cpu_read(nmi_cr2)); @@ -558,7 +526,16 @@ nmi_restart: if (user_mode(regs)) mds_user_clear_cpu_buffers(); } -NOKPROBE_SYMBOL(do_nmi); + +#if defined(CONFIG_X86_64) && IS_ENABLED(CONFIG_KVM_INTEL) +DEFINE_IDTENTRY_RAW(exc_nmi_noist) +{ + exc_nmi(regs); +} +#endif +#if IS_MODULE(CONFIG_KVM_INTEL) +EXPORT_SYMBOL_GPL(asm_exc_nmi_noist); +#endif void stop_nmi(void) { diff --git 
a/arch/x86/kernel/paravirt-spinlocks.c b/arch/x86/kernel/paravirt-spinlocks.c index 4f75d0cf6305..9e1ea99ad9df 100644 --- a/arch/x86/kernel/paravirt-spinlocks.c +++ b/arch/x86/kernel/paravirt-spinlocks.c @@ -32,3 +32,12 @@ bool pv_is_native_vcpu_is_preempted(void) return pv_ops.lock.vcpu_is_preempted.func == __raw_callee_save___native_vcpu_is_preempted; } + +void __init paravirt_set_cap(void) +{ + if (!pv_is_native_spin_unlock()) + setup_force_cpu_cap(X86_FEATURE_PVUNLOCK); + + if (!pv_is_native_vcpu_is_preempted()) + setup_force_cpu_cap(X86_FEATURE_VCPUPREEMPT); +} diff --git a/arch/x86/kernel/paravirt.c b/arch/x86/kernel/paravirt.c index c131ba4e70ef..7ca2d46c08cc 100644 --- a/arch/x86/kernel/paravirt.c +++ b/arch/x86/kernel/paravirt.c @@ -13,13 +13,14 @@ #include <linux/bcd.h> #include <linux/highmem.h> #include <linux/kprobes.h> +#include <linux/pgtable.h> +#include <linux/static_call.h> #include <asm/bug.h> #include <asm/paravirt.h> #include <asm/debugreg.h> #include <asm/desc.h> #include <asm/setup.h> -#include <asm/pgtable.h> #include <asm/time.h> #include <asm/pgalloc.h> #include <asm/irq.h> @@ -40,11 +41,24 @@ extern void _paravirt_nop(void); asm (".pushsection .entry.text, \"ax\"\n" ".global _paravirt_nop\n" "_paravirt_nop:\n\t" - "ret\n\t" + ASM_ENDBR + ASM_RET ".size _paravirt_nop, . - _paravirt_nop\n\t" ".type _paravirt_nop, @function\n\t" ".popsection"); +/* stub always returning 0. */ +asm (".pushsection .entry.text, \"ax\"\n" + ".global paravirt_ret0\n" + "paravirt_ret0:\n\t" + ASM_ENDBR + "xor %" _ASM_AX ", %" _ASM_AX ";\n\t" + ASM_RET + ".size paravirt_ret0, . - paravirt_ret0\n\t" + ".type paravirt_ret0, @function\n\t" + ".popsection"); + + void __init default_banner(void) { printk(KERN_INFO "Booting paravirtualized kernel on %s\n", @@ -52,31 +66,17 @@ void __init default_banner(void) } /* Undefined instruction for dealing with missing ops pointers. 
*/ -static const unsigned char ud2a[] = { 0x0f, 0x0b }; - -struct branch { - unsigned char opcode; - u32 delta; -} __attribute__((packed)); +noinstr void paravirt_BUG(void) +{ + BUG(); +} static unsigned paravirt_patch_call(void *insn_buff, const void *target, unsigned long addr, unsigned len) { - const int call_len = 5; - struct branch *b = insn_buff; - unsigned long delta = (unsigned long)target - (addr+call_len); - - if (len < call_len) { - pr_warn("paravirt: Failed to patch indirect CALL at %ps\n", (void *)addr); - /* Kernel might not be viable if patching fails, bail out: */ - BUG_ON(1); - } - - b->opcode = 0xe8; /* call */ - b->delta = delta; - BUILD_BUG_ON(sizeof(*b) != call_len); - - return call_len; + __text_gen_insn(insn_buff, CALL_INSN_OPCODE, + (void *)addr, target, CALL_INSN_SIZE); + return CALL_INSN_SIZE; } #ifdef CONFIG_PARAVIRT_XXL @@ -85,25 +85,6 @@ u64 notrace _paravirt_ident_64(u64 x) { return x; } - -static unsigned paravirt_patch_jmp(void *insn_buff, const void *target, - unsigned long addr, unsigned len) -{ - struct branch *b = insn_buff; - unsigned long delta = (unsigned long)target - (addr+5); - - if (len < 5) { -#ifdef CONFIG_RETPOLINE - WARN_ONCE(1, "Failing to patch indirect JMP in %ps\n", (void *)addr); -#endif - return len; /* call too long for patch site */ - } - - b->opcode = 0xe9; /* jmp */ - b->delta = delta; - - return 5; -} #endif DEFINE_STATIC_KEY_TRUE(virt_spin_lock_key); @@ -114,8 +95,8 @@ void __init native_pv_lock_init(void) static_branch_disable(&virt_spin_lock_key); } -unsigned paravirt_patch_default(u8 type, void *insn_buff, - unsigned long addr, unsigned len) +unsigned int paravirt_patch(u8 type, void *insn_buff, unsigned long addr, + unsigned int len) { /* * Neat trick to map patch type back to the call within the @@ -125,21 +106,10 @@ unsigned paravirt_patch_default(u8 type, void *insn_buff, unsigned ret; if (opfunc == NULL) - /* If there's no function, patch it with a ud2a (BUG) */ - ret = 
paravirt_patch_insns(insn_buff, len, ud2a, ud2a+sizeof(ud2a)); + /* If there's no function, patch it with paravirt_BUG() */ + ret = paravirt_patch_call(insn_buff, paravirt_BUG, addr, len); else if (opfunc == _paravirt_nop) ret = 0; - -#ifdef CONFIG_PARAVIRT_XXL - /* identity functions just return their single argument */ - else if (opfunc == _paravirt_ident_64) - ret = paravirt_patch_ident_64(insn_buff, len); - - else if (type == PARAVIRT_PATCH(cpu.iret) || - type == PARAVIRT_PATCH(cpu.usergs_sysret64)) - /* If operation requires a jmp, then jmp */ - ret = paravirt_patch_jmp(insn_buff, opfunc, addr, len); -#endif else /* Otherwise call the function. */ ret = paravirt_patch_call(insn_buff, opfunc, addr, len); @@ -147,38 +117,6 @@ unsigned paravirt_patch_default(u8 type, void *insn_buff, return ret; } -unsigned paravirt_patch_insns(void *insn_buff, unsigned len, - const char *start, const char *end) -{ - unsigned insn_len = end - start; - - /* Alternative instruction is too large for the patch site and we cannot continue: */ - BUG_ON(insn_len > len || start == NULL); - - memcpy(insn_buff, start, insn_len); - - return insn_len; -} - -static void native_flush_tlb(void) -{ - __native_flush_tlb(); -} - -/* - * Global pages have to be flushed a bit differently. Not a real - * performance problem because this does not happen often. 
- */ -static void native_flush_tlb_global(void) -{ - __native_flush_tlb_global(); -} - -static void native_flush_tlb_one_user(unsigned long addr) -{ - __native_flush_tlb_one_user(addr); -} - struct static_key paravirt_steal_enabled; struct static_key paravirt_steal_rq_enabled; @@ -187,10 +125,15 @@ static u64 native_steal_clock(int cpu) return 0; } -/* These are in entry.S */ -extern void native_iret(void); -extern void native_usergs_sysret64(void); +DEFINE_STATIC_CALL(pv_steal_clock, native_steal_clock); +DEFINE_STATIC_CALL(pv_sched_clock, native_sched_clock); +void paravirt_set_sched_clock(u64 (*func)(void)) +{ + static_call_update(pv_sched_clock, func); +} + +/* These are in entry.S */ static struct resource reserve_ioports = { .start = 0, .end = IO_SPACE_LIMIT, @@ -269,6 +212,36 @@ void paravirt_end_context_switch(struct task_struct *next) if (test_and_clear_ti_thread_flag(task_thread_info(next), TIF_LAZY_MMU_UPDATES)) arch_enter_lazy_mmu_mode(); } + +static noinstr unsigned long pv_native_read_cr2(void) +{ + return native_read_cr2(); +} + +static noinstr void pv_native_write_cr2(unsigned long val) +{ + native_write_cr2(val); +} + +static noinstr unsigned long pv_native_get_debugreg(int regno) +{ + return native_get_debugreg(regno); +} + +static noinstr void pv_native_set_debugreg(int regno, unsigned long val) +{ + native_set_debugreg(regno, val); +} + +static noinstr void pv_native_irq_enable(void) +{ + native_irq_enable(); +} + +static noinstr void pv_native_irq_disable(void) +{ + native_irq_disable(); +} #endif enum paravirt_lazy_mode paravirt_get_lazy_mode(void) @@ -282,33 +255,21 @@ enum paravirt_lazy_mode paravirt_get_lazy_mode(void) struct pv_info pv_info = { .name = "bare hardware", #ifdef CONFIG_PARAVIRT_XXL - .kernel_rpl = 0, - .shared_kernel_pmd = 1, /* Only used when CONFIG_X86_PAE is set */ - -#ifdef CONFIG_X86_64 .extra_user_64bit_cs = __USER_CS, #endif -#endif }; /* 64-bit pagetable entries */ #define PTE_IDENT 
__PV_IS_CALLEE_SAVE(_paravirt_ident_64) struct paravirt_patch_template pv_ops = { - /* Init ops. */ - .init.patch = native_patch, - - /* Time ops. */ - .time.sched_clock = native_sched_clock, - .time.steal_clock = native_steal_clock, - /* Cpu ops. */ .cpu.io_delay = native_io_delay, #ifdef CONFIG_PARAVIRT_XXL .cpu.cpuid = native_cpuid, - .cpu.get_debugreg = native_get_debugreg, - .cpu.set_debugreg = native_set_debugreg, + .cpu.get_debugreg = pv_native_get_debugreg, + .cpu.set_debugreg = pv_native_set_debugreg, .cpu.read_cr0 = native_read_cr0, .cpu.write_cr0 = native_write_cr0, .cpu.write_cr4 = native_write_cr4, @@ -324,9 +285,7 @@ struct paravirt_patch_template pv_ops = { .cpu.load_idt = native_load_idt, .cpu.store_tr = native_store_tr, .cpu.load_tls = native_load_tls, -#ifdef CONFIG_X86_64 .cpu.load_gs_index = native_load_gs_index, -#endif .cpu.write_ldt_entry = native_write_ldt_entry, .cpu.write_gdt_entry = native_write_gdt_entry, .cpu.write_idt_entry = native_write_idt_entry, @@ -336,14 +295,9 @@ struct paravirt_patch_template pv_ops = { .cpu.load_sp0 = native_load_sp0, -#ifdef CONFIG_X86_64 - .cpu.usergs_sysret64 = native_usergs_sysret64, -#endif - .cpu.iret = native_iret, - .cpu.swapgs = native_swapgs, - #ifdef CONFIG_X86_IOPL_IOPERM - .cpu.update_io_bitmap = native_tss_update_io_bitmap, + .cpu.invalidate_io_bitmap = native_tss_invalidate_io_bitmap, + .cpu.update_io_bitmap = native_tss_update_io_bitmap, #endif .cpu.start_context_switch = paravirt_nop, @@ -351,26 +305,26 @@ struct paravirt_patch_template pv_ops = { /* Irq ops. 
*/ .irq.save_fl = __PV_IS_CALLEE_SAVE(native_save_fl), - .irq.restore_fl = __PV_IS_CALLEE_SAVE(native_restore_fl), - .irq.irq_disable = __PV_IS_CALLEE_SAVE(native_irq_disable), - .irq.irq_enable = __PV_IS_CALLEE_SAVE(native_irq_enable), + .irq.irq_disable = __PV_IS_CALLEE_SAVE(pv_native_irq_disable), + .irq.irq_enable = __PV_IS_CALLEE_SAVE(pv_native_irq_enable), .irq.safe_halt = native_safe_halt, .irq.halt = native_halt, #endif /* CONFIG_PARAVIRT_XXL */ /* Mmu ops. */ - .mmu.flush_tlb_user = native_flush_tlb, + .mmu.flush_tlb_user = native_flush_tlb_local, .mmu.flush_tlb_kernel = native_flush_tlb_global, .mmu.flush_tlb_one_user = native_flush_tlb_one_user, - .mmu.flush_tlb_others = native_flush_tlb_others, + .mmu.flush_tlb_multi = native_flush_tlb_multi, .mmu.tlb_remove_table = (void (*)(struct mmu_gather *, void *))tlb_remove_page, .mmu.exit_mmap = paravirt_nop, + .mmu.notify_page_enc_status_changed = paravirt_nop, #ifdef CONFIG_PARAVIRT_XXL - .mmu.read_cr2 = __PV_IS_CALLEE_SAVE(native_read_cr2), - .mmu.write_cr2 = native_write_cr2, + .mmu.read_cr2 = __PV_IS_CALLEE_SAVE(pv_native_read_cr2), + .mmu.write_cr2 = pv_native_write_cr2, .mmu.read_cr3 = __native_read_cr3, .mmu.write_cr3 = native_write_cr3, @@ -387,24 +341,16 @@ struct paravirt_patch_template pv_ops = { .mmu.release_p4d = paravirt_nop, .mmu.set_pte = native_set_pte, - .mmu.set_pte_at = native_set_pte_at, .mmu.set_pmd = native_set_pmd, .mmu.ptep_modify_prot_start = __ptep_modify_prot_start, .mmu.ptep_modify_prot_commit = __ptep_modify_prot_commit, -#if CONFIG_PGTABLE_LEVELS >= 3 -#ifdef CONFIG_X86_PAE - .mmu.set_pte_atomic = native_set_pte_atomic, - .mmu.pte_clear = native_pte_clear, - .mmu.pmd_clear = native_pmd_clear, -#endif .mmu.set_pud = native_set_pud, .mmu.pmd_val = PTE_IDENT, .mmu.make_pmd = PTE_IDENT, -#if CONFIG_PGTABLE_LEVELS >= 4 .mmu.pud_val = PTE_IDENT, .mmu.make_pud = PTE_IDENT, @@ -416,8 +362,6 @@ struct paravirt_patch_template pv_ops = { .mmu.set_pgd = native_set_pgd, #endif /* 
CONFIG_PGTABLE_LEVELS >= 5 */ -#endif /* CONFIG_PGTABLE_LEVELS >= 4 */ -#endif /* CONFIG_PGTABLE_LEVELS >= 3 */ .mmu.pte_val = PTE_IDENT, .mmu.pgd_val = PTE_IDENT, @@ -452,9 +396,6 @@ struct paravirt_patch_template pv_ops = { }; #ifdef CONFIG_PARAVIRT_XXL -/* At this point, native_get/set_debugreg has real function entries */ -NOKPROBE_SYMBOL(native_get_debugreg); -NOKPROBE_SYMBOL(native_set_debugreg); NOKPROBE_SYMBOL(native_load_idt); #endif diff --git a/arch/x86/kernel/paravirt_patch.c b/arch/x86/kernel/paravirt_patch.c deleted file mode 100644 index 3eff63c090d2..000000000000 --- a/arch/x86/kernel/paravirt_patch.c +++ /dev/null @@ -1,126 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -#include <linux/stringify.h> - -#include <asm/paravirt.h> -#include <asm/asm-offsets.h> - -#define PSTART(d, m) \ - patch_data_##d.m - -#define PEND(d, m) \ - (PSTART(d, m) + sizeof(patch_data_##d.m)) - -#define PATCH(d, m, insn_buff, len) \ - paravirt_patch_insns(insn_buff, len, PSTART(d, m), PEND(d, m)) - -#define PATCH_CASE(ops, m, data, insn_buff, len) \ - case PARAVIRT_PATCH(ops.m): \ - return PATCH(data, ops##_##m, insn_buff, len) - -#ifdef CONFIG_PARAVIRT_XXL -struct patch_xxl { - const unsigned char irq_irq_disable[1]; - const unsigned char irq_irq_enable[1]; - const unsigned char irq_save_fl[2]; - const unsigned char mmu_read_cr2[3]; - const unsigned char mmu_read_cr3[3]; - const unsigned char mmu_write_cr3[3]; - const unsigned char irq_restore_fl[2]; -# ifdef CONFIG_X86_64 - const unsigned char cpu_wbinvd[2]; - const unsigned char cpu_usergs_sysret64[6]; - const unsigned char cpu_swapgs[3]; - const unsigned char mov64[3]; -# else - const unsigned char cpu_iret[1]; -# endif -}; - -static const struct patch_xxl patch_data_xxl = { - .irq_irq_disable = { 0xfa }, // cli - .irq_irq_enable = { 0xfb }, // sti - .irq_save_fl = { 0x9c, 0x58 }, // pushf; pop %[re]ax - .mmu_read_cr2 = { 0x0f, 0x20, 0xd0 }, // mov %cr2, %[re]ax - .mmu_read_cr3 = { 0x0f, 0x20, 0xd8 }, // mov %cr3, 
%[re]ax -# ifdef CONFIG_X86_64 - .mmu_write_cr3 = { 0x0f, 0x22, 0xdf }, // mov %rdi, %cr3 - .irq_restore_fl = { 0x57, 0x9d }, // push %rdi; popfq - .cpu_wbinvd = { 0x0f, 0x09 }, // wbinvd - .cpu_usergs_sysret64 = { 0x0f, 0x01, 0xf8, - 0x48, 0x0f, 0x07 }, // swapgs; sysretq - .cpu_swapgs = { 0x0f, 0x01, 0xf8 }, // swapgs - .mov64 = { 0x48, 0x89, 0xf8 }, // mov %rdi, %rax -# else - .mmu_write_cr3 = { 0x0f, 0x22, 0xd8 }, // mov %eax, %cr3 - .irq_restore_fl = { 0x50, 0x9d }, // push %eax; popf - .cpu_iret = { 0xcf }, // iret -# endif -}; - -unsigned int paravirt_patch_ident_64(void *insn_buff, unsigned int len) -{ -#ifdef CONFIG_X86_64 - return PATCH(xxl, mov64, insn_buff, len); -#endif - return 0; -} -# endif /* CONFIG_PARAVIRT_XXL */ - -#ifdef CONFIG_PARAVIRT_SPINLOCKS -struct patch_lock { - unsigned char queued_spin_unlock[3]; - unsigned char vcpu_is_preempted[2]; -}; - -static const struct patch_lock patch_data_lock = { - .vcpu_is_preempted = { 0x31, 0xc0 }, // xor %eax, %eax - -# ifdef CONFIG_X86_64 - .queued_spin_unlock = { 0xc6, 0x07, 0x00 }, // movb $0, (%rdi) -# else - .queued_spin_unlock = { 0xc6, 0x00, 0x00 }, // movb $0, (%eax) -# endif -}; -#endif /* CONFIG_PARAVIRT_SPINLOCKS */ - -unsigned int native_patch(u8 type, void *insn_buff, unsigned long addr, - unsigned int len) -{ - switch (type) { - -#ifdef CONFIG_PARAVIRT_XXL - PATCH_CASE(irq, restore_fl, xxl, insn_buff, len); - PATCH_CASE(irq, save_fl, xxl, insn_buff, len); - PATCH_CASE(irq, irq_enable, xxl, insn_buff, len); - PATCH_CASE(irq, irq_disable, xxl, insn_buff, len); - - PATCH_CASE(mmu, read_cr2, xxl, insn_buff, len); - PATCH_CASE(mmu, read_cr3, xxl, insn_buff, len); - PATCH_CASE(mmu, write_cr3, xxl, insn_buff, len); - -# ifdef CONFIG_X86_64 - PATCH_CASE(cpu, usergs_sysret64, xxl, insn_buff, len); - PATCH_CASE(cpu, swapgs, xxl, insn_buff, len); - PATCH_CASE(cpu, wbinvd, xxl, insn_buff, len); -# else - PATCH_CASE(cpu, iret, xxl, insn_buff, len); -# endif -#endif - -#ifdef CONFIG_PARAVIRT_SPINLOCKS - 
case PARAVIRT_PATCH(lock.queued_spin_unlock): - if (pv_is_native_spin_unlock()) - return PATCH(lock, queued_spin_unlock, insn_buff, len); - break; - - case PARAVIRT_PATCH(lock.vcpu_is_preempted): - if (pv_is_native_vcpu_is_preempted()) - return PATCH(lock, vcpu_is_preempted, insn_buff, len); - break; -#endif - default: - break; - } - - return paravirt_patch_default(type, insn_buff, addr, len); -} diff --git a/arch/x86/kernel/pci-dma.c b/arch/x86/kernel/pci-dma.c index 5dcedad21dff..30bbe4abb5d6 100644 --- a/arch/x86/kernel/pci-dma.c +++ b/arch/x86/kernel/pci-dma.c @@ -1,19 +1,22 @@ // SPDX-License-Identifier: GPL-2.0 +#include <linux/dma-map-ops.h> #include <linux/dma-direct.h> -#include <linux/dma-debug.h> #include <linux/iommu.h> #include <linux/dmar.h> #include <linux/export.h> #include <linux/memblock.h> #include <linux/gfp.h> #include <linux/pci.h> +#include <linux/amd-iommu.h> #include <asm/proto.h> #include <asm/dma.h> #include <asm/iommu.h> #include <asm/gart.h> #include <asm/x86_init.h> -#include <asm/iommu_table.h> + +#include <xen/xen.h> +#include <xen/swiotlb-xen.h> static bool disable_dac_quirk __read_mostly; @@ -34,24 +37,90 @@ int no_iommu __read_mostly; /* Set this to 1 if there is a HW IOMMU in the system */ int iommu_detected __read_mostly = 0; -extern struct iommu_table_entry __iommu_table[], __iommu_table_end[]; +#ifdef CONFIG_SWIOTLB +bool x86_swiotlb_enable; +static unsigned int x86_swiotlb_flags; + +static void __init pci_swiotlb_detect(void) +{ + /* don't initialize swiotlb if iommu=off (no_iommu=1) */ + if (!no_iommu && max_possible_pfn > MAX_DMA32_PFN) + x86_swiotlb_enable = true; + + /* + * Set swiotlb to 1 so that bounce buffers are allocated and used for + * devices that can't support DMA to encrypted memory. 
+ */ + if (cc_platform_has(CC_ATTR_HOST_MEM_ENCRYPT)) + x86_swiotlb_enable = true; + + /* + * Guest with guest memory encryption currently perform all DMA through + * bounce buffers as the hypervisor can't access arbitrary VM memory + * that is not explicitly shared with it. + */ + if (cc_platform_has(CC_ATTR_GUEST_MEM_ENCRYPT)) { + x86_swiotlb_enable = true; + x86_swiotlb_flags |= SWIOTLB_FORCE; + } +} +#else +static inline void __init pci_swiotlb_detect(void) +{ +} +#define x86_swiotlb_flags 0 +#endif /* CONFIG_SWIOTLB */ + +#ifdef CONFIG_SWIOTLB_XEN +static void __init pci_xen_swiotlb_init(void) +{ + if (!xen_initial_domain() && !x86_swiotlb_enable) + return; + x86_swiotlb_enable = true; + x86_swiotlb_flags |= SWIOTLB_ANY; + swiotlb_init_remap(true, x86_swiotlb_flags, xen_swiotlb_fixup); + dma_ops = &xen_swiotlb_dma_ops; + if (IS_ENABLED(CONFIG_PCI)) + pci_request_acs(); +} + +int pci_xen_swiotlb_init_late(void) +{ + if (dma_ops == &xen_swiotlb_dma_ops) + return 0; + + /* we can work with the default swiotlb */ + if (!io_tlb_default_mem.nslabs) { + int rc = swiotlb_init_late(swiotlb_size_or_default(), + GFP_KERNEL, xen_swiotlb_fixup); + if (rc < 0) + return rc; + } + + /* XXX: this switches the dma ops under live devices! 
*/ + dma_ops = &xen_swiotlb_dma_ops; + if (IS_ENABLED(CONFIG_PCI)) + pci_request_acs(); + return 0; +} +EXPORT_SYMBOL_GPL(pci_xen_swiotlb_init_late); +#else +static inline void __init pci_xen_swiotlb_init(void) +{ +} +#endif /* CONFIG_SWIOTLB_XEN */ void __init pci_iommu_alloc(void) { - struct iommu_table_entry *p; - - sort_iommu_table(__iommu_table, __iommu_table_end); - check_iommu_entries(__iommu_table, __iommu_table_end); - - for (p = __iommu_table; p < __iommu_table_end; p++) { - if (p && p->detect && p->detect() > 0) { - p->flags |= IOMMU_DETECTED; - if (p->early_init) - p->early_init(); - if (p->flags & IOMMU_FINISH_IF_DETECTED) - break; - } + if (xen_pv_domain()) { + pci_xen_swiotlb_init(); + return; } + pci_swiotlb_detect(); + gart_iommu_hole_init(); + amd_iommu_detect(); + detect_intel_iommu(); + swiotlb_init(x86_swiotlb_enable, x86_swiotlb_flags); } /* @@ -102,7 +171,7 @@ static __init int iommu_setup(char *p) } #ifdef CONFIG_SWIOTLB if (!strncmp(p, "soft", 4)) - swiotlb = 1; + x86_swiotlb_enable = true; #endif if (!strncmp(p, "pt", 2)) iommu_set_default_passthrough(true); @@ -121,14 +190,17 @@ early_param("iommu", iommu_setup); static int __init pci_iommu_init(void) { - struct iommu_table_entry *p; - x86_init.iommu.iommu_init(); - for (p = __iommu_table; p < __iommu_table_end; p++) { - if (p && (p->flags & IOMMU_DETECTED) && p->late_init) - p->late_init(); +#ifdef CONFIG_SWIOTLB + /* An IOMMU turned us off. 
*/ + if (x86_swiotlb_enable) { + pr_info("PCI-DMA: Using software bounce buffering for IO (SWIOTLB)\n"); + swiotlb_print_info(); + } else { + swiotlb_exit(); } +#endif return 0; } diff --git a/arch/x86/kernel/pci-iommu_table.c b/arch/x86/kernel/pci-iommu_table.c deleted file mode 100644 index 2e9006c1e240..000000000000 --- a/arch/x86/kernel/pci-iommu_table.c +++ /dev/null @@ -1,80 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -#include <linux/dma-mapping.h> -#include <asm/iommu_table.h> -#include <linux/string.h> -#include <linux/kallsyms.h> - - -#define DEBUG 1 - -static struct iommu_table_entry * __init -find_dependents_of(struct iommu_table_entry *start, - struct iommu_table_entry *finish, - struct iommu_table_entry *q) -{ - struct iommu_table_entry *p; - - if (!q) - return NULL; - - for (p = start; p < finish; p++) - if (p->detect == q->depend) - return p; - - return NULL; -} - - -void __init sort_iommu_table(struct iommu_table_entry *start, - struct iommu_table_entry *finish) { - - struct iommu_table_entry *p, *q, tmp; - - for (p = start; p < finish; p++) { -again: - q = find_dependents_of(start, finish, p); - /* We are bit sneaky here. We use the memory address to figure - * out if the node we depend on is past our point, if so, swap. - */ - if (q > p) { - tmp = *p; - memmove(p, q, sizeof(*p)); - *q = tmp; - goto again; - } - } - -} - -#ifdef DEBUG -void __init check_iommu_entries(struct iommu_table_entry *start, - struct iommu_table_entry *finish) -{ - struct iommu_table_entry *p, *q, *x; - - /* Simple cyclic dependency checker. */ - for (p = start; p < finish; p++) { - q = find_dependents_of(start, finish, p); - x = find_dependents_of(start, finish, q); - if (p == x) { - printk(KERN_ERR "CYCLIC DEPENDENCY FOUND! %pS depends on %pS and vice-versa. 
BREAKING IT.\n", - p->detect, q->detect); - /* Heavy handed way..*/ - x->depend = NULL; - } - } - - for (p = start; p < finish; p++) { - q = find_dependents_of(p, finish, p); - if (q && q > p) { - printk(KERN_ERR "EXECUTION ORDER INVALID! %pS should be called before %pS!\n", - p->detect, q->detect); - } - } -} -#else -void __init check_iommu_entries(struct iommu_table_entry *start, - struct iommu_table_entry *finish) -{ -} -#endif diff --git a/arch/x86/kernel/pci-swiotlb.c b/arch/x86/kernel/pci-swiotlb.c deleted file mode 100644 index c2cfa5e7c152..000000000000 --- a/arch/x86/kernel/pci-swiotlb.c +++ /dev/null @@ -1,78 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 - -#include <linux/pci.h> -#include <linux/cache.h> -#include <linux/init.h> -#include <linux/swiotlb.h> -#include <linux/memblock.h> -#include <linux/dma-direct.h> -#include <linux/mem_encrypt.h> - -#include <asm/iommu.h> -#include <asm/swiotlb.h> -#include <asm/dma.h> -#include <asm/xen/swiotlb-xen.h> -#include <asm/iommu_table.h> - -int swiotlb __read_mostly; - -/* - * pci_swiotlb_detect_override - set swiotlb to 1 if necessary - * - * This returns non-zero if we are forced to use swiotlb (by the boot - * option). - */ -int __init pci_swiotlb_detect_override(void) -{ - if (swiotlb_force == SWIOTLB_FORCE) - swiotlb = 1; - - return swiotlb; -} -IOMMU_INIT_FINISH(pci_swiotlb_detect_override, - pci_xen_swiotlb_detect, - pci_swiotlb_init, - pci_swiotlb_late_init); - -/* - * If 4GB or more detected (and iommu=off not set) or if SME is active - * then set swiotlb to 1 and return 1. - */ -int __init pci_swiotlb_detect_4gb(void) -{ - /* don't initialize swiotlb if iommu=off (no_iommu=1) */ - if (!no_iommu && max_possible_pfn > MAX_DMA32_PFN) - swiotlb = 1; - - /* - * If SME is active then swiotlb will be set to 1 so that bounce - * buffers are allocated and used for devices that do not support - * the addressing range required for the encryption mask. 
- */ - if (sme_active()) - swiotlb = 1; - - return swiotlb; -} -IOMMU_INIT(pci_swiotlb_detect_4gb, - pci_swiotlb_detect_override, - pci_swiotlb_init, - pci_swiotlb_late_init); - -void __init pci_swiotlb_init(void) -{ - if (swiotlb) - swiotlb_init(0); -} - -void __init pci_swiotlb_late_init(void) -{ - /* An IOMMU turned us off. */ - if (!swiotlb) - swiotlb_exit(); - else { - printk(KERN_INFO "PCI-DMA: " - "Using software bounce buffering for IO (SWIOTLB)\n"); - swiotlb_print_info(); - } -} diff --git a/arch/x86/kernel/perf_regs.c b/arch/x86/kernel/perf_regs.c index bb7e1132290b..624703af80a1 100644 --- a/arch/x86/kernel/perf_regs.c +++ b/arch/x86/kernel/perf_regs.c @@ -101,8 +101,7 @@ u64 perf_reg_abi(struct task_struct *task) } void perf_get_regs_user(struct perf_regs *regs_user, - struct pt_regs *regs, - struct pt_regs *regs_user_copy) + struct pt_regs *regs) { regs_user->regs = task_pt_regs(current); regs_user->abi = perf_reg_abi(current); @@ -123,18 +122,26 @@ int perf_reg_validate(u64 mask) u64 perf_reg_abi(struct task_struct *task) { - if (test_tsk_thread_flag(task, TIF_IA32)) + if (!user_64bit_mode(task_pt_regs(task))) return PERF_SAMPLE_REGS_ABI_32; else return PERF_SAMPLE_REGS_ABI_64; } +static DEFINE_PER_CPU(struct pt_regs, nmi_user_regs); + void perf_get_regs_user(struct perf_regs *regs_user, - struct pt_regs *regs, - struct pt_regs *regs_user_copy) + struct pt_regs *regs) { + struct pt_regs *regs_user_copy = this_cpu_ptr(&nmi_user_regs); struct pt_regs *user_regs = task_pt_regs(current); + if (!in_nmi()) { + regs_user->regs = user_regs; + regs_user->abi = perf_reg_abi(current); + return; + } + /* * If we're in an NMI that interrupted task_pt_regs setup, then * we can't sample user regs at all. 
This check isn't really diff --git a/arch/x86/kernel/pmem.c b/arch/x86/kernel/pmem.c index 6b07faaa1579..23154d24b117 100644 --- a/arch/x86/kernel/pmem.c +++ b/arch/x86/kernel/pmem.c @@ -27,6 +27,11 @@ static __init int register_e820_pmem(void) * simply here to trigger the module to load on demand. */ pdev = platform_device_alloc("e820_pmem", -1); - return platform_device_add(pdev); + + rc = platform_device_add(pdev); + if (rc) + platform_device_put(pdev); + + return rc; } device_initcall(register_e820_pmem); diff --git a/arch/x86/kernel/probe_roms.c b/arch/x86/kernel/probe_roms.c index ee0286390a4c..319fef37d9dc 100644 --- a/arch/x86/kernel/probe_roms.c +++ b/arch/x86/kernel/probe_roms.c @@ -21,6 +21,7 @@ #include <asm/sections.h> #include <asm/io.h> #include <asm/setup_arch.h> +#include <asm/sev.h> static struct resource system_rom_resource = { .name = "System ROM", @@ -80,7 +81,7 @@ static struct resource video_rom_resource = { */ static bool match_id(struct pci_dev *pdev, unsigned short vendor, unsigned short device) { - struct pci_driver *drv = pdev->driver; + struct pci_driver *drv = to_pci_driver(pdev->dev.driver); const struct pci_device_id *id; if (pdev->vendor == vendor && pdev->device == device) @@ -94,12 +95,12 @@ static bool match_id(struct pci_dev *pdev, unsigned short vendor, unsigned short } static bool probe_list(struct pci_dev *pdev, unsigned short vendor, - const unsigned char *rom_list) + const void *rom_list) { unsigned short device; do { - if (probe_kernel_address(rom_list, device) != 0) + if (get_kernel_nofault(device, rom_list) != 0) device = 0; if (device && match_id(pdev, vendor, device)) @@ -119,19 +120,19 @@ static struct resource *find_oprom(struct pci_dev *pdev) for (i = 0; i < ARRAY_SIZE(adapter_rom_resources); i++) { struct resource *res = &adapter_rom_resources[i]; unsigned short offset, vendor, device, list, rev; - const unsigned char *rom; + const void *rom; if (res->end == 0) break; rom = isa_bus_to_virt(res->start); - if 
(probe_kernel_address(rom + 0x18, offset) != 0) + if (get_kernel_nofault(offset, rom + 0x18) != 0) continue; - if (probe_kernel_address(rom + offset + 0x4, vendor) != 0) + if (get_kernel_nofault(vendor, rom + offset + 0x4) != 0) continue; - if (probe_kernel_address(rom + offset + 0x6, device) != 0) + if (get_kernel_nofault(device, rom + offset + 0x6) != 0) continue; if (match_id(pdev, vendor, device)) { @@ -139,8 +140,8 @@ static struct resource *find_oprom(struct pci_dev *pdev) break; } - if (probe_kernel_address(rom + offset + 0x8, list) == 0 && - probe_kernel_address(rom + offset + 0xc, rev) == 0 && + if (get_kernel_nofault(list, rom + offset + 0x8) == 0 && + get_kernel_nofault(rev, rom + offset + 0xc) == 0 && rev >= 3 && list && probe_list(pdev, vendor, rom + offset + list)) { oprom = res; @@ -183,25 +184,35 @@ static int __init romsignature(const unsigned char *rom) const unsigned short * const ptr = (const unsigned short *)rom; unsigned short sig; - return probe_kernel_address(ptr, sig) == 0 && sig == ROMSIGNATURE; + return get_kernel_nofault(sig, ptr) == 0 && sig == ROMSIGNATURE; } static int __init romchecksum(const unsigned char *rom, unsigned long length) { unsigned char sum, c; - for (sum = 0; length && probe_kernel_address(rom++, c) == 0; length--) + for (sum = 0; length && get_kernel_nofault(c, rom++) == 0; length--) sum += c; return !length && !sum; } void __init probe_roms(void) { - const unsigned char *rom; unsigned long start, length, upper; + const unsigned char *rom; unsigned char c; int i; + /* + * The ROM memory range is not part of the e820 table and is therefore not + * pre-validated by BIOS. The kernel page table maps the ROM region as encrypted + * memory, and SNP requires encrypted memory to be validated before access. + * Do that here. 
+ */ + snp_prep_memory(video_rom_resource.start, + ((system_rom_resource.end + 1) - video_rom_resource.start), + SNP_PAGE_STATE_PRIVATE); + /* video rom */ upper = adapter_rom_resources[0].start; for (start = video_rom_resource.start; start < upper; start += 2048) { @@ -211,7 +222,7 @@ void __init probe_roms(void) video_rom_resource.start = start; - if (probe_kernel_address(rom + 2, c) != 0) + if (get_kernel_nofault(c, rom + 2) != 0) continue; /* 0 < length <= 0x7f * 512, historically */ @@ -249,7 +260,7 @@ void __init probe_roms(void) if (!romsignature(rom)) continue; - if (probe_kernel_address(rom + 2, c) != 0) + if (get_kernel_nofault(c, rom + 2) != 0) continue; /* 0 < length <= 0x7f * 512, historically */ diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c index 3053c85e0e42..c21b7347a26d 100644 --- a/arch/x86/kernel/process.c +++ b/arch/x86/kernel/process.c @@ -28,10 +28,11 @@ #include <linux/hw_breakpoint.h> #include <asm/cpu.h> #include <asm/apic.h> -#include <asm/syscalls.h> #include <linux/uaccess.h> #include <asm/mwait.h> -#include <asm/fpu/internal.h> +#include <asm/fpu/api.h> +#include <asm/fpu/sched.h> +#include <asm/fpu/xstate.h> #include <asm/debugreg.h> #include <asm/nmi.h> #include <asm/tlbflush.h> @@ -43,6 +44,9 @@ #include <asm/spec-ctrl.h> #include <asm/io_bitmap.h> #include <asm/proto.h> +#include <asm/frame.h> +#include <asm/unwind.h> +#include <asm/tdx.h> #include "process.h" @@ -63,14 +67,9 @@ __visible DEFINE_PER_CPU_PAGE_ALIGNED(struct tss_struct, cpu_tss_rw) = { */ .sp0 = (1UL << (BITS_PER_LONG-1)) + 1, - /* - * .sp1 is cpu_current_top_of_stack. The init task never - * runs user code, but cpu_current_top_of_stack should still - * be well defined before the first context switch. 
- */ +#ifdef CONFIG_X86_32 .sp1 = TOP_OF_INIT_STACK, -#ifdef CONFIG_X86_32 .ss0 = __KERNEL_DS, .ss1 = __KERNEL_CS, #endif @@ -92,12 +91,22 @@ int arch_dup_task_struct(struct task_struct *dst, struct task_struct *src) #ifdef CONFIG_VM86 dst->thread.vm86 = NULL; #endif + /* Drop the copied pointer to current's fpstate */ + dst->thread.fpu.fpstate = NULL; - return fpu__copy(dst, src); + return 0; +} + +#ifdef CONFIG_X86_64 +void arch_release_task_struct(struct task_struct *tsk) +{ + if (fpu_state_size_dynamic()) + fpstate_free(&tsk->thread.fpu); } +#endif /* - * Free current thread data structures etc.. + * Free thread data structures etc.. */ void exit_thread(struct task_struct *tsk) { @@ -105,7 +114,7 @@ void exit_thread(struct task_struct *tsk) struct fpu *fpu = &t->fpu; if (test_thread_flag(TIF_IO_BITMAP)) - io_bitmap_exit(); + io_bitmap_exit(tsk); free_vm86(t); @@ -122,9 +131,11 @@ static int set_new_tls(struct task_struct *p, unsigned long tls) return do_set_thread_area_64(p, ARCH_SET_FS, tls); } -int copy_thread_tls(unsigned long clone_flags, unsigned long sp, - unsigned long arg, struct task_struct *p, unsigned long tls) +int copy_thread(struct task_struct *p, const struct kernel_clone_args *args) { + unsigned long clone_flags = args->flags; + unsigned long sp = args->stack; + unsigned long tls = args->tls; struct inactive_task_frame *frame; struct fork_frame *fork_frame; struct pt_regs *childregs; @@ -134,21 +145,25 @@ int copy_thread_tls(unsigned long clone_flags, unsigned long sp, fork_frame = container_of(childregs, struct fork_frame, regs); frame = &fork_frame->frame; - frame->bp = 0; + frame->bp = encode_frame_pointer(childregs); frame->ret_addr = (unsigned long) ret_from_fork; p->thread.sp = (unsigned long) fork_frame; p->thread.io_bitmap = NULL; + p->thread.iopl_warn = 0; memset(p->thread.ptrace_bps, 0, sizeof(p->thread.ptrace_bps)); #ifdef CONFIG_X86_64 - savesegment(gs, p->thread.gsindex); - p->thread.gsbase = p->thread.gsindex ? 
0 : current->thread.gsbase; - savesegment(fs, p->thread.fsindex); - p->thread.fsbase = p->thread.fsindex ? 0 : current->thread.fsbase; + current_save_fsgs(); + p->thread.fsindex = current->thread.fsindex; + p->thread.fsbase = current->thread.fsbase; + p->thread.gsindex = current->thread.gsindex; + p->thread.gsbase = current->thread.gsbase; + savesegment(es, p->thread.es); savesegment(ds, p->thread.ds); #else p->thread.sp0 = (unsigned long) (childregs + 1); + savesegment(gs, p->thread.gs); /* * Clear all status flags including IF and set fixed bit. 64bit * does not have this initialization as the frame does not contain @@ -158,22 +173,44 @@ int copy_thread_tls(unsigned long clone_flags, unsigned long sp, frame->flags = X86_EFLAGS_FIXED; #endif + fpu_clone(p, clone_flags, args->fn); + /* Kernel thread ? */ if (unlikely(p->flags & PF_KTHREAD)) { + p->thread.pkru = pkru_get_init_value(); memset(childregs, 0, sizeof(struct pt_regs)); - kthread_frame_init(frame, sp, arg); + kthread_frame_init(frame, args->fn, args->fn_arg); return 0; } + /* + * Clone current's PKRU value from hardware. tsk->thread.pkru + * is only valid when scheduled out. + */ + p->thread.pkru = read_pkru(); + frame->bx = 0; *childregs = *current_pt_regs(); childregs->ax = 0; if (sp) childregs->sp = sp; -#ifdef CONFIG_X86_32 - task_user_gs(p) = get_user_gs(current_pt_regs()); -#endif + if (unlikely(args->fn)) { + /* + * A user space thread, but it doesn't return to + * ret_after_fork(). + * + * In order to indicate that to tools like gdb, + * we reset the stack and instruction pointers. + * + * It does the same kernel frame setup to return to a kernel + * function that a kernel thread does. + */ + childregs->sp = 0; + childregs->ip = 0; + kthread_frame_init(frame, args->fn, args->fn_arg); + return 0; + } /* Set a new TLS for the child thread? 
*/ if (clone_flags & CLONE_SETTLS) @@ -185,6 +222,15 @@ int copy_thread_tls(unsigned long clone_flags, unsigned long sp, return ret; } +static void pkru_flush_thread(void) +{ + /* + * If PKRU is enabled the default PKRU value has to be loaded into + * the hardware right here (similar to context switch). + */ + pkru_write_default(); +} + void flush_thread(void) { struct task_struct *tsk = current; @@ -192,7 +238,8 @@ void flush_thread(void) flush_ptrace_hw_breakpoint(tsk); memset(tsk->thread.tls_array, 0, sizeof(tsk->thread.tls_array)); - fpu__clear(&tsk->thread.fpu); + fpu_flush_thread(); + pkru_flush_thread(); } void disable_TSC(void) @@ -287,7 +334,7 @@ static int get_cpuid_mode(void) return !test_thread_flag(TIF_NOCPUID); } -static int set_cpuid_mode(struct task_struct *task, unsigned long cpuid_enabled) +static int set_cpuid_mode(unsigned long cpuid_enabled) { if (!boot_cpu_has(X86_FEATURE_CPUID_FAULT)) return -ENODEV; @@ -318,25 +365,11 @@ void arch_setup_new_exec(void) clear_thread_flag(TIF_SSBD); task_clear_spec_ssb_disable(current); task_clear_spec_ssb_noexec(current); - speculation_ctrl_update(task_thread_info(current)->flags); + speculation_ctrl_update(read_thread_flags()); } } #ifdef CONFIG_X86_IOPL_IOPERM -static inline void tss_invalidate_io_bitmap(struct tss_struct *tss) -{ - /* - * Invalidate the I/O bitmap by moving io_bitmap_base outside the - * TSS limit so any subsequent I/O access from user space will - * trigger a #GP. - * - * This is correct even when VMEXIT rewrites the TSS limit - * to 0x67 as the only requirement is that the base points - * outside the limit. - */ - tss->x86_tss.io_bitmap_base = IO_BITMAP_OFFSET_INVALID; -} - static inline void switch_to_bitmap(unsigned long tifp) { /* @@ -347,7 +380,7 @@ static inline void switch_to_bitmap(unsigned long tifp) * user mode. 
*/ if (tifp & _TIF_IO_BITMAP) - tss_invalidate_io_bitmap(this_cpu_ptr(&cpu_tss_rw)); + tss_invalidate_io_bitmap(); } static void tss_copy_io_bitmap(struct tss_struct *tss, struct io_bitmap *iobm) @@ -372,7 +405,7 @@ static void tss_copy_io_bitmap(struct tss_struct *tss, struct io_bitmap *iobm) } /** - * tss_update_io_bitmap - Update I/O bitmap before exiting to usermode + * native_tss_update_io_bitmap - Update I/O bitmap before exiting to user mode */ void native_tss_update_io_bitmap(void) { @@ -381,7 +414,7 @@ void native_tss_update_io_bitmap(void) u16 *base = &tss->x86_tss.io_bitmap_base; if (!test_thread_flag(TIF_IO_BITMAP)) { - tss_invalidate_io_bitmap(tss); + native_tss_invalidate_io_bitmap(); return; } @@ -463,7 +496,7 @@ void speculative_store_bypass_ht_init(void) * First HT sibling to come up on the core. Link shared state of * the first HT sibling to itself. The siblings on the same core * which come up later will see the shared state pointer and link - * themself to the state of this CPU. + * themselves to the state of this CPU. */ st->shared_state = st; } @@ -546,28 +579,20 @@ static __always_inline void __speculation_ctrl_update(unsigned long tifp, lockdep_assert_irqs_disabled(); - /* - * If TIF_SSBD is different, select the proper mitigation - * method. Note that if SSBD mitigation is disabled or permanentely - * enabled this branch can't be taken because nothing can set - * TIF_SSBD. - */ - if (tif_diff & _TIF_SSBD) { - if (static_cpu_has(X86_FEATURE_VIRT_SSBD)) { + /* Handle change of TIF_SSBD depending on the mitigation method. 
*/ + if (static_cpu_has(X86_FEATURE_VIRT_SSBD)) { + if (tif_diff & _TIF_SSBD) amd_set_ssb_virt_state(tifn); - } else if (static_cpu_has(X86_FEATURE_LS_CFG_SSBD)) { + } else if (static_cpu_has(X86_FEATURE_LS_CFG_SSBD)) { + if (tif_diff & _TIF_SSBD) amd_set_core_ssb_state(tifn); - } else if (static_cpu_has(X86_FEATURE_SPEC_CTRL_SSBD) || - static_cpu_has(X86_FEATURE_AMD_SSBD)) { - msr |= ssbd_tif_to_spec_ctrl(tifn); - updmsr = true; - } + } else if (static_cpu_has(X86_FEATURE_SPEC_CTRL_SSBD) || + static_cpu_has(X86_FEATURE_AMD_SSBD)) { + updmsr |= !!(tif_diff & _TIF_SSBD); + msr |= ssbd_tif_to_spec_ctrl(tifn); } - /* - * Only evaluate TIF_SPEC_IB if conditional STIBP is enabled, - * otherwise avoid the MSR write. - */ + /* Only evaluate TIF_SPEC_IB if conditional STIBP is enabled. */ if (IS_ENABLED(CONFIG_SMP) && static_branch_unlikely(&switch_to_cond_stibp)) { updmsr |= !!(tif_diff & _TIF_SPEC_IB); @@ -575,7 +600,7 @@ static __always_inline void __speculation_ctrl_update(unsigned long tifp, } if (updmsr) - wrmsrl(MSR_IA32_SPEC_CTRL, msr); + write_spec_ctrl_current(msr, false); } static unsigned long speculation_ctrl_update_tif(struct task_struct *tsk) @@ -592,7 +617,7 @@ static unsigned long speculation_ctrl_update_tif(struct task_struct *tsk) clear_tsk_thread_flag(tsk, TIF_SPEC_IB); } /* Return the updated threadinfo flags*/ - return task_thread_info(tsk)->flags; + return read_task_thread_flags(tsk); } void speculation_ctrl_update(unsigned long tif) @@ -613,12 +638,23 @@ void speculation_ctrl_update_current(void) preempt_enable(); } +static inline void cr4_toggle_bits_irqsoff(unsigned long mask) +{ + unsigned long newval, cr4 = this_cpu_read(cpu_tlbstate.cr4); + + newval = cr4 ^ mask; + if (newval != cr4) { + this_cpu_write(cpu_tlbstate.cr4, newval); + __write_cr4(newval); + } +} + void __switch_to_xtra(struct task_struct *prev_p, struct task_struct *next_p) { unsigned long tifp, tifn; - tifn = READ_ONCE(task_thread_info(next_p)->flags); - tifp = 
READ_ONCE(task_thread_info(prev_p)->flags); + tifn = read_task_thread_flags(next_p); + tifp = read_task_thread_flags(prev_p); switch_to_bitmap(tifp); @@ -691,9 +727,7 @@ void arch_cpu_idle(void) */ void __cpuidle default_idle(void) { - trace_cpu_idle_rcuidle(1, smp_processor_id()); - safe_halt(); - trace_cpu_idle_rcuidle(PWR_EVENT_EXIT, smp_processor_id()); + raw_safe_halt(); } #if defined(CONFIG_APM_MODULE) || defined(CONFIG_HALTPOLL_CPUIDLE_MODULE) EXPORT_SYMBOL(default_idle); @@ -710,7 +744,7 @@ bool xen_set_default_idle(void) } #endif -void stop_this_cpu(void *dummy) +void __noreturn stop_this_cpu(void *dummy) { local_irq_disable(); /* @@ -728,8 +762,11 @@ void stop_this_cpu(void *dummy) * without the encryption bit, they don't race each other when flushed * and potentially end up with the wrong entry being committed to * memory. + * + * Test the CPUID bit directly because the machine might've cleared + * X86_FEATURE_SME due to cmdline options. */ - if (boot_cpu_has(X86_FEATURE_SME)) + if (cpuid_eax(0x8000001f) & BIT(0)) native_wbinvd(); for (;;) { /* @@ -744,6 +781,8 @@ void stop_this_cpu(void *dummy) /* * AMD Erratum 400 aware idle routine. We handle it the same way as C3 power * states (local apic timer and TSC stop). + * + * XXX this function is completely buggered vs RCU and tracing. */ static void amd_e400_idle(void) { @@ -765,30 +804,49 @@ static void amd_e400_idle(void) * The switch back from broadcast mode needs to be called with * interrupts disabled. */ - local_irq_disable(); + raw_local_irq_disable(); tick_broadcast_exit(); - local_irq_enable(); + raw_local_irq_enable(); } /* - * Intel Core2 and older machines prefer MWAIT over HALT for C1. - * We can't rely on cpuidle installing MWAIT, because it will not load - * on systems that support only C1 -- so the boot default must be MWAIT. - * - * Some AMD machines are the opposite, they depend on using HALT. 
+ * Prefer MWAIT over HALT if MWAIT is supported, MWAIT_CPUID leaf + * exists and whenever MONITOR/MWAIT extensions are present there is at + * least one C1 substate. * - * So for default C1, which is used during boot until cpuidle loads, - * use MWAIT-C1 on Intel HW that has it, else use HALT. + * Do not prefer MWAIT if MONITOR instruction has a bug or idle=nomwait + * is passed to kernel commandline parameter. */ static int prefer_mwait_c1_over_halt(const struct cpuinfo_x86 *c) { - if (c->x86_vendor != X86_VENDOR_INTEL) + u32 eax, ebx, ecx, edx; + + /* User has disallowed the use of MWAIT. Fallback to HALT */ + if (boot_option_idle_override == IDLE_NOMWAIT) + return 0; + + /* MWAIT is not supported on this platform. Fallback to HALT */ + if (!cpu_has(c, X86_FEATURE_MWAIT)) return 0; - if (!cpu_has(c, X86_FEATURE_MWAIT) || boot_cpu_has_bug(X86_BUG_MONITOR)) + /* Monitor has a bug. Fallback to HALT */ + if (boot_cpu_has_bug(X86_BUG_MONITOR)) return 0; - return 1; + cpuid(CPUID_MWAIT_LEAF, &eax, &ebx, &ecx, &edx); + + /* + * If MWAIT extensions are not available, it is safe to use MWAIT + * with EAX=0, ECX=0. + */ + if (!(ecx & CPUID5_ECX_EXTENSIONS_SUPPORTED)) + return 1; + + /* + * If MWAIT extensions are available, there should be at least one + * MWAIT C1 substate present. 
+ */ + return (edx & MWAIT_C1_SUBSTATE_MASK); } /* @@ -799,7 +857,6 @@ static int prefer_mwait_c1_over_halt(const struct cpuinfo_x86 *c) static __cpuidle void mwait_idle(void) { if (!current_set_polling_and_test()) { - trace_cpu_idle_rcuidle(1, smp_processor_id()); if (this_cpu_has(X86_BUG_CLFLUSH_MONITOR)) { mb(); /* quirk */ clflush((void *)&current_thread_info()->flags); @@ -810,10 +867,9 @@ static __cpuidle void mwait_idle(void) if (!need_resched()) __sti_mwait(0, 0); else - local_irq_enable(); - trace_cpu_idle_rcuidle(PWR_EVENT_EXIT, smp_processor_id()); + raw_local_irq_enable(); } else { - local_irq_enable(); + raw_local_irq_enable(); } __current_clr_polling(); } @@ -833,6 +889,9 @@ void select_idle_routine(const struct cpuinfo_x86 *c) } else if (prefer_mwait_c1_over_halt(c)) { pr_info("using mwait in idle threads\n"); x86_idle = mwait_idle; + } else if (cpu_feature_enabled(X86_FEATURE_TDX_GUEST)) { + pr_info("using TDX aware idle routine\n"); + x86_idle = tdx_safe_halt; } else x86_idle = default_idle; } @@ -892,9 +951,8 @@ static int __init idle_setup(char *str) } else if (!strcmp(str, "nomwait")) { /* * If the boot option of "idle=nomwait" is added, - * it means that mwait will be disabled for CPU C2/C3 - * states. In such case it won't touch the variable - * of boot_option_idle_override. + * it means that mwait will be disabled for CPU C1/C2/C3 + * states. */ boot_option_idle_override = IDLE_NOMWAIT; } else @@ -907,7 +965,7 @@ early_param("idle", idle_setup); unsigned long arch_align_stack(unsigned long sp) { if (!(current->personality & ADDR_NO_RANDOMIZE) && randomize_va_space) - sp -= get_random_int() % 8192; + sp -= prandom_u32_max(8192); return sp & ~0xf; } @@ -922,70 +980,42 @@ unsigned long arch_randomize_brk(struct mm_struct *mm) * because the task might wake up and we might look at a stack * changing under us. 
*/ -unsigned long get_wchan(struct task_struct *p) +unsigned long __get_wchan(struct task_struct *p) { - unsigned long start, bottom, top, sp, fp, ip, ret = 0; - int count = 0; - - if (p == current || p->state == TASK_RUNNING) - return 0; + struct unwind_state state; + unsigned long addr = 0; if (!try_get_task_stack(p)) return 0; - start = (unsigned long)task_stack_page(p); - if (!start) - goto out; - - /* - * Layout of the stack page: - * - * ----------- topmax = start + THREAD_SIZE - sizeof(unsigned long) - * PADDING - * ----------- top = topmax - TOP_OF_KERNEL_STACK_PADDING - * stack - * ----------- bottom = start - * - * The tasks stack pointer points at the location where the - * framepointer is stored. The data on the stack is: - * ... IP FP ... IP FP - * - * We need to read FP and IP, so we need to adjust the upper - * bound by another unsigned long. - */ - top = start + THREAD_SIZE - TOP_OF_KERNEL_STACK_PADDING; - top -= 2 * sizeof(unsigned long); - bottom = start; - - sp = READ_ONCE(p->thread.sp); - if (sp < bottom || sp > top) - goto out; - - fp = READ_ONCE_NOCHECK(((struct inactive_task_frame *)sp)->bp); - do { - if (fp < bottom || fp > top) - goto out; - ip = READ_ONCE_NOCHECK(*(unsigned long *)(fp + sizeof(unsigned long))); - if (!in_sched_functions(ip)) { - ret = ip; - goto out; - } - fp = READ_ONCE_NOCHECK(*(unsigned long *)fp); - } while (count++ < 16 && p->state != TASK_RUNNING); + for (unwind_start(&state, p, NULL, NULL); !unwind_done(&state); + unwind_next_frame(&state)) { + addr = unwind_get_return_address(&state); + if (!addr) + break; + if (in_sched_functions(addr)) + continue; + break; + } -out: put_task_stack(p); - return ret; + + return addr; } -long do_arch_prctl_common(struct task_struct *task, int option, - unsigned long cpuid_enabled) +long do_arch_prctl_common(int option, unsigned long arg2) { switch (option) { case ARCH_GET_CPUID: return get_cpuid_mode(); case ARCH_SET_CPUID: - return set_cpuid_mode(task, cpuid_enabled); + return 
set_cpuid_mode(arg2); + case ARCH_GET_XCOMP_SUPP: + case ARCH_GET_XCOMP_PERM: + case ARCH_REQ_XCOMP_PERM: + case ARCH_GET_XCOMP_GUEST_PERM: + case ARCH_REQ_XCOMP_GUEST_PERM: + return fpu_xstate_prctl(option, arg2); } return -EINVAL; diff --git a/arch/x86/kernel/process.h b/arch/x86/kernel/process.h index 1d0797b2338a..76b547b83232 100644 --- a/arch/x86/kernel/process.h +++ b/arch/x86/kernel/process.h @@ -13,8 +13,8 @@ void __switch_to_xtra(struct task_struct *prev_p, struct task_struct *next_p); static inline void switch_to_extra(struct task_struct *prev, struct task_struct *next) { - unsigned long next_tif = task_thread_info(next)->flags; - unsigned long prev_tif = task_thread_info(prev)->flags; + unsigned long next_tif = read_task_thread_flags(next); + unsigned long prev_tif = read_task_thread_flags(prev); if (IS_ENABLED(CONFIG_SMP)) { /* diff --git a/arch/x86/kernel/process_32.c b/arch/x86/kernel/process_32.c index 5052ced43373..2f314b170c9f 100644 --- a/arch/x86/kernel/process_32.c +++ b/arch/x86/kernel/process_32.c @@ -39,44 +39,40 @@ #include <linux/kdebug.h> #include <linux/syscalls.h> -#include <asm/pgtable.h> #include <asm/ldt.h> #include <asm/processor.h> -#include <asm/fpu/internal.h> +#include <asm/fpu/sched.h> #include <asm/desc.h> #include <linux/err.h> #include <asm/tlbflush.h> #include <asm/cpu.h> -#include <asm/syscalls.h> #include <asm/debugreg.h> #include <asm/switch_to.h> #include <asm/vm86.h> -#include <asm/resctrl_sched.h> +#include <asm/resctrl.h> #include <asm/proto.h> #include "process.h" -void __show_regs(struct pt_regs *regs, enum show_regs_mode mode) +void __show_regs(struct pt_regs *regs, enum show_regs_mode mode, + const char *log_lvl) { unsigned long cr0 = 0L, cr2 = 0L, cr3 = 0L, cr4 = 0L; unsigned long d0, d1, d2, d3, d6, d7; unsigned short gs; - if (user_mode(regs)) - gs = get_user_gs(regs); - else - savesegment(gs, gs); + savesegment(gs, gs); - show_ip(regs, KERN_DEFAULT); + show_ip(regs, log_lvl); - printk(KERN_DEFAULT "EAX: %08lx 
EBX: %08lx ECX: %08lx EDX: %08lx\n", - regs->ax, regs->bx, regs->cx, regs->dx); - printk(KERN_DEFAULT "ESI: %08lx EDI: %08lx EBP: %08lx ESP: %08lx\n", - regs->si, regs->di, regs->bp, regs->sp); - printk(KERN_DEFAULT "DS: %04x ES: %04x FS: %04x GS: %04x SS: %04x EFLAGS: %08lx\n", - (u16)regs->ds, (u16)regs->es, (u16)regs->fs, gs, regs->ss, regs->flags); + printk("%sEAX: %08lx EBX: %08lx ECX: %08lx EDX: %08lx\n", + log_lvl, regs->ax, regs->bx, regs->cx, regs->dx); + printk("%sESI: %08lx EDI: %08lx EBP: %08lx ESP: %08lx\n", + log_lvl, regs->si, regs->di, regs->bp, regs->sp); + printk("%sDS: %04x ES: %04x FS: %04x GS: %04x SS: %04x EFLAGS: %08lx\n", + log_lvl, (u16)regs->ds, (u16)regs->es, (u16)regs->fs, gs, regs->ss, regs->flags); if (mode != SHOW_REGS_ALL) return; @@ -85,8 +81,8 @@ void __show_regs(struct pt_regs *regs, enum show_regs_mode mode) cr2 = read_cr2(); cr3 = __read_cr3(); cr4 = __read_cr4(); - printk(KERN_DEFAULT "CR0: %08lx CR2: %08lx CR3: %08lx CR4: %08lx\n", - cr0, cr2, cr3, cr4); + printk("%sCR0: %08lx CR2: %08lx CR3: %08lx CR4: %08lx\n", + log_lvl, cr0, cr2, cr3, cr4); get_debugreg(d0, 0); get_debugreg(d1, 1); @@ -100,10 +96,10 @@ void __show_regs(struct pt_regs *regs, enum show_regs_mode mode) (d6 == DR6_RESERVED) && (d7 == 0x400)) return; - printk(KERN_DEFAULT "DR0: %08lx DR1: %08lx DR2: %08lx DR3: %08lx\n", - d0, d1, d2, d3); - printk(KERN_DEFAULT "DR6: %08lx DR7: %08lx\n", - d6, d7); + printk("%sDR0: %08lx DR1: %08lx DR2: %08lx DR3: %08lx\n", + log_lvl, d0, d1, d2, d3); + printk("%sDR6: %08lx DR7: %08lx\n", + log_lvl, d6, d7); } void release_thread(struct task_struct *dead_task) @@ -115,7 +111,7 @@ void release_thread(struct task_struct *dead_task) void start_thread(struct pt_regs *regs, unsigned long new_ip, unsigned long new_sp) { - set_user_gs(regs, 0); + loadsegment(gs, 0); regs->fs = 0; regs->ds = __USER_DS; regs->es = __USER_DS; @@ -161,7 +157,6 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p) struct thread_struct *prev 
= &prev_p->thread, *next = &next_p->thread; struct fpu *prev_fpu = &prev->fpu; - struct fpu *next_fpu = &next->fpu; int cpu = smp_processor_id(); /* never put a printk in __switch_to... printk() calls wake_up*() indirectly */ @@ -179,7 +174,7 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p) * used %fs or %gs (it does not today), or if the kernel is * running inside of a hypervisor layer. */ - lazy_save_gs(prev->gs); + savesegment(gs, prev->gs); /* * Load the per-thread Thread-Local Storage descriptor. @@ -210,11 +205,11 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p) * Restore %gs if needed (which is common) */ if (prev->gs | next->gs) - lazy_load_gs(next->gs); + loadsegment(gs, next->gs); this_cpu_write(current_task, next_p); - switch_fpu_finish(next_fpu); + switch_fpu_finish(); /* Load the Intel cache allocation PQR MSR. */ resctrl_sched_in(); @@ -224,5 +219,5 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p) SYSCALL_DEFINE2(arch_prctl, int, option, unsigned long, arg2) { - return do_arch_prctl_common(current, option, arg2); + return do_arch_prctl_common(option, arg2); } diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c index ffd497804dbc..6b3418bff326 100644 --- a/arch/x86/kernel/process_64.c +++ b/arch/x86/kernel/process_64.c @@ -40,20 +40,19 @@ #include <linux/ftrace.h> #include <linux/syscalls.h> -#include <asm/pgtable.h> #include <asm/processor.h> -#include <asm/fpu/internal.h> +#include <asm/pkru.h> +#include <asm/fpu/sched.h> #include <asm/mmu_context.h> #include <asm/prctl.h> #include <asm/desc.h> #include <asm/proto.h> #include <asm/ia32.h> -#include <asm/syscalls.h> #include <asm/debugreg.h> #include <asm/switch_to.h> #include <asm/xen/hypervisor.h> #include <asm/vdso.h> -#include <asm/resctrl_sched.h> +#include <asm/resctrl.h> #include <asm/unistd.h> #include <asm/fsgsbase.h> #ifdef CONFIG_IA32_EMULATION @@ -64,30 +63,31 @@ #include "process.h" /* Prints also some 
state that isn't saved in the pt_regs */ -void __show_regs(struct pt_regs *regs, enum show_regs_mode mode) +void __show_regs(struct pt_regs *regs, enum show_regs_mode mode, + const char *log_lvl) { unsigned long cr0 = 0L, cr2 = 0L, cr3 = 0L, cr4 = 0L, fs, gs, shadowgs; unsigned long d0, d1, d2, d3, d6, d7; unsigned int fsindex, gsindex; unsigned int ds, es; - show_iret_regs(regs); + show_iret_regs(regs, log_lvl); if (regs->orig_ax != -1) pr_cont(" ORIG_RAX: %016lx\n", regs->orig_ax); else pr_cont("\n"); - printk(KERN_DEFAULT "RAX: %016lx RBX: %016lx RCX: %016lx\n", - regs->ax, regs->bx, regs->cx); - printk(KERN_DEFAULT "RDX: %016lx RSI: %016lx RDI: %016lx\n", - regs->dx, regs->si, regs->di); - printk(KERN_DEFAULT "RBP: %016lx R08: %016lx R09: %016lx\n", - regs->bp, regs->r8, regs->r9); - printk(KERN_DEFAULT "R10: %016lx R11: %016lx R12: %016lx\n", - regs->r10, regs->r11, regs->r12); - printk(KERN_DEFAULT "R13: %016lx R14: %016lx R15: %016lx\n", - regs->r13, regs->r14, regs->r15); + printk("%sRAX: %016lx RBX: %016lx RCX: %016lx\n", + log_lvl, regs->ax, regs->bx, regs->cx); + printk("%sRDX: %016lx RSI: %016lx RDI: %016lx\n", + log_lvl, regs->dx, regs->si, regs->di); + printk("%sRBP: %016lx R08: %016lx R09: %016lx\n", + log_lvl, regs->bp, regs->r8, regs->r9); + printk("%sR10: %016lx R11: %016lx R12: %016lx\n", + log_lvl, regs->r10, regs->r11, regs->r12); + printk("%sR13: %016lx R14: %016lx R15: %016lx\n", + log_lvl, regs->r13, regs->r14, regs->r15); if (mode == SHOW_REGS_SHORT) return; @@ -95,8 +95,8 @@ void __show_regs(struct pt_regs *regs, enum show_regs_mode mode) if (mode == SHOW_REGS_USER) { rdmsrl(MSR_FS_BASE, fs); rdmsrl(MSR_KERNEL_GS_BASE, shadowgs); - printk(KERN_DEFAULT "FS: %016lx GS: %016lx\n", - fs, shadowgs); + printk("%sFS: %016lx GS: %016lx\n", + log_lvl, fs, shadowgs); return; } @@ -114,12 +114,12 @@ void __show_regs(struct pt_regs *regs, enum show_regs_mode mode) cr3 = __read_cr3(); cr4 = __read_cr4(); - printk(KERN_DEFAULT "FS: %016lx(%04x) 
GS:%016lx(%04x) knlGS:%016lx\n", - fs, fsindex, gs, gsindex, shadowgs); - printk(KERN_DEFAULT "CS: %04lx DS: %04x ES: %04x CR0: %016lx\n", regs->cs, ds, - es, cr0); - printk(KERN_DEFAULT "CR2: %016lx CR3: %016lx CR4: %016lx\n", cr2, cr3, - cr4); + printk("%sFS: %016lx(%04x) GS:%016lx(%04x) knlGS:%016lx\n", + log_lvl, fs, fsindex, gs, gsindex, shadowgs); + printk("%sCS: %04lx DS: %04x ES: %04x CR0: %016lx\n", + log_lvl, regs->cs, ds, es, cr0); + printk("%sCR2: %016lx CR3: %016lx CR4: %016lx\n", + log_lvl, cr2, cr3, cr4); get_debugreg(d0, 0); get_debugreg(d1, 1); @@ -131,14 +131,14 @@ void __show_regs(struct pt_regs *regs, enum show_regs_mode mode) /* Only print out debug registers if they are in their non-default state. */ if (!((d0 == 0) && (d1 == 0) && (d2 == 0) && (d3 == 0) && (d6 == DR6_RESERVED) && (d7 == 0x400))) { - printk(KERN_DEFAULT "DR0: %016lx DR1: %016lx DR2: %016lx\n", - d0, d1, d2); - printk(KERN_DEFAULT "DR3: %016lx DR6: %016lx DR7: %016lx\n", - d3, d6, d7); + printk("%sDR0: %016lx DR1: %016lx DR2: %016lx\n", + log_lvl, d0, d1, d2); + printk("%sDR3: %016lx DR6: %016lx DR7: %016lx\n", + log_lvl, d3, d6, d7); } - if (boot_cpu_has(X86_FEATURE_OSPKE)) - printk(KERN_DEFAULT "PKRU: %08x\n", read_pkru()); + if (cpu_feature_enabled(X86_FEATURE_OSPKE)) + printk("%sPKRU: %08x\n", log_lvl, read_pkru()); } void release_thread(struct task_struct *dead_task) @@ -152,6 +152,56 @@ enum which_selector { }; /* + * Out of line to be protected from kprobes and tracing. If this would be + * traced or probed than any access to a per CPU variable happens with + * the wrong GS. + * + * It is not used on Xen paravirt. When paravirt support is needed, it + * needs to be renamed with native_ prefix. 
+ */ +static noinstr unsigned long __rdgsbase_inactive(void) +{ + unsigned long gsbase; + + lockdep_assert_irqs_disabled(); + + if (!static_cpu_has(X86_FEATURE_XENPV)) { + native_swapgs(); + gsbase = rdgsbase(); + native_swapgs(); + } else { + instrumentation_begin(); + rdmsrl(MSR_KERNEL_GS_BASE, gsbase); + instrumentation_end(); + } + + return gsbase; +} + +/* + * Out of line to be protected from kprobes and tracing. If this would be + * traced or probed than any access to a per CPU variable happens with + * the wrong GS. + * + * It is not used on Xen paravirt. When paravirt support is needed, it + * needs to be renamed with native_ prefix. + */ +static noinstr void __wrgsbase_inactive(unsigned long gsbase) +{ + lockdep_assert_irqs_disabled(); + + if (!static_cpu_has(X86_FEATURE_XENPV)) { + native_swapgs(); + wrgsbase(gsbase); + native_swapgs(); + } else { + instrumentation_begin(); + wrmsrl(MSR_KERNEL_GS_BASE, gsbase); + instrumentation_end(); + } +} + +/* * Saves the FS or GS base for an outgoing thread if FSGSBASE extensions are * not available. The goal is to be reasonably fast on non-FSGSBASE systems. * It's forcibly inlined because it'll generate better code and this function @@ -200,22 +250,35 @@ static __always_inline void save_fsgs(struct task_struct *task) { savesegment(fs, task->thread.fsindex); savesegment(gs, task->thread.gsindex); - save_base_legacy(task, task->thread.fsindex, FS); - save_base_legacy(task, task->thread.gsindex, GS); + if (static_cpu_has(X86_FEATURE_FSGSBASE)) { + /* + * If FSGSBASE is enabled, we can't make any useful guesses + * about the base, and user code expects us to save the current + * value. Fortunately, reading the base directly is efficient. 
+ */ + task->thread.fsbase = rdfsbase(); + task->thread.gsbase = __rdgsbase_inactive(); + } else { + save_base_legacy(task, task->thread.fsindex, FS); + save_base_legacy(task, task->thread.gsindex, GS); + } } -#if IS_ENABLED(CONFIG_KVM) /* * While a process is running,current->thread.fsbase and current->thread.gsbase - * may not match the corresponding CPU registers (see save_base_legacy()). KVM - * wants an efficient way to save and restore FSBASE and GSBASE. - * When FSGSBASE extensions are enabled, this will have to use RD{FS,GS}BASE. + * may not match the corresponding CPU registers (see save_base_legacy()). */ -void save_fsgs_for_kvm(void) +void current_save_fsgs(void) { + unsigned long flags; + + /* Interrupts need to be off for FSGSBASE */ + local_irq_save(flags); save_fsgs(current); + local_irq_restore(flags); } -EXPORT_SYMBOL_GPL(save_fsgs_for_kvm); +#if IS_ENABLED(CONFIG_KVM) +EXPORT_SYMBOL_GPL(current_save_fsgs); #endif static __always_inline void loadseg(enum which_selector which, @@ -277,17 +340,52 @@ static __always_inline void load_seg_legacy(unsigned short prev_index, } } +/* + * Store prev's PKRU value and load next's PKRU value if they differ. PKRU + * is not XSTATE managed on context switch because that would require a + * lookup in the task's FPU xsave buffer and require to keep that updated + * in various places. + */ +static __always_inline void x86_pkru_load(struct thread_struct *prev, + struct thread_struct *next) +{ + if (!cpu_feature_enabled(X86_FEATURE_OSPKE)) + return; + + /* Stash the prev task's value: */ + prev->pkru = rdpkru(); + + /* + * PKRU writes are slightly expensive. 
Avoid them when not + * strictly necessary: + */ + if (prev->pkru != next->pkru) + wrpkru(next->pkru); +} + static __always_inline void x86_fsgsbase_load(struct thread_struct *prev, struct thread_struct *next) { - load_seg_legacy(prev->fsindex, prev->fsbase, - next->fsindex, next->fsbase, FS); - load_seg_legacy(prev->gsindex, prev->gsbase, - next->gsindex, next->gsbase, GS); + if (static_cpu_has(X86_FEATURE_FSGSBASE)) { + /* Update the FS and GS selectors if they could have changed. */ + if (unlikely(prev->fsindex || next->fsindex)) + loadseg(FS, next->fsindex); + if (unlikely(prev->gsindex || next->gsindex)) + loadseg(GS, next->gsindex); + + /* Update the bases. */ + wrfsbase(next->fsbase); + __wrgsbase_inactive(next->gsbase); + } else { + load_seg_legacy(prev->fsindex, prev->fsbase, + next->fsindex, next->fsbase, FS); + load_seg_legacy(prev->gsindex, prev->gsbase, + next->gsindex, next->gsbase, GS); + } } -static unsigned long x86_fsgsbase_read_task(struct task_struct *task, - unsigned short selector) +unsigned long x86_fsgsbase_read_task(struct task_struct *task, + unsigned short selector) { unsigned short idx = selector >> 3; unsigned long base; @@ -316,7 +414,7 @@ static unsigned long x86_fsgsbase_read_task(struct task_struct *task, */ mutex_lock(&task->mm->context.lock); ldt = task->mm->context.ldt; - if (unlikely(idx >= ldt->nr_entries)) + if (unlikely(!ldt || idx >= ldt->nr_entries)) base = 0; else base = get_desc_base(ldt->entries + idx); @@ -329,13 +427,44 @@ static unsigned long x86_fsgsbase_read_task(struct task_struct *task, return base; } +unsigned long x86_gsbase_read_cpu_inactive(void) +{ + unsigned long gsbase; + + if (boot_cpu_has(X86_FEATURE_FSGSBASE)) { + unsigned long flags; + + local_irq_save(flags); + gsbase = __rdgsbase_inactive(); + local_irq_restore(flags); + } else { + rdmsrl(MSR_KERNEL_GS_BASE, gsbase); + } + + return gsbase; +} + +void x86_gsbase_write_cpu_inactive(unsigned long gsbase) +{ + if (boot_cpu_has(X86_FEATURE_FSGSBASE)) { + 
unsigned long flags; + + local_irq_save(flags); + __wrgsbase_inactive(gsbase); + local_irq_restore(flags); + } else { + wrmsrl(MSR_KERNEL_GS_BASE, gsbase); + } +} + unsigned long x86_fsbase_read_task(struct task_struct *task) { unsigned long fsbase; if (task == current) fsbase = x86_fsbase_read_cpu(); - else if (task->thread.fsindex == 0) + else if (boot_cpu_has(X86_FEATURE_FSGSBASE) || + (task->thread.fsindex == 0)) fsbase = task->thread.fsbase; else fsbase = x86_fsgsbase_read_task(task, task->thread.fsindex); @@ -349,7 +478,8 @@ unsigned long x86_gsbase_read_task(struct task_struct *task) if (task == current) gsbase = x86_gsbase_read_cpu_inactive(); - else if (task->thread.gsindex == 0) + else if (boot_cpu_has(X86_FEATURE_FSGSBASE) || + (task->thread.gsindex == 0)) gsbase = task->thread.gsbase; else gsbase = x86_fsgsbase_read_task(task, task->thread.gsindex); @@ -405,11 +535,10 @@ start_thread(struct pt_regs *regs, unsigned long new_ip, unsigned long new_sp) EXPORT_SYMBOL_GPL(start_thread); #ifdef CONFIG_COMPAT -void compat_start_thread(struct pt_regs *regs, u32 new_ip, u32 new_sp) +void compat_start_thread(struct pt_regs *regs, u32 new_ip, u32 new_sp, bool x32) { start_thread_common(regs, new_ip, new_sp, - test_thread_flag(TIF_X32) - ? __USER_CS : __USER32_CS, + x32 ? __USER_CS : __USER32_CS, __USER_DS, __USER_DS); } #endif @@ -424,17 +553,17 @@ void compat_start_thread(struct pt_regs *regs, u32 new_ip, u32 new_sp) * Kprobes not supported here. Set the probe on schedule instead. * Function graph tracer not supported too. 
*/ +__no_kmsan_checks __visible __notrace_funcgraph struct task_struct * __switch_to(struct task_struct *prev_p, struct task_struct *next_p) { struct thread_struct *prev = &prev_p->thread; struct thread_struct *next = &next_p->thread; struct fpu *prev_fpu = &prev->fpu; - struct fpu *next_fpu = &next->fpu; int cpu = smp_processor_id(); WARN_ON_ONCE(IS_ENABLED(CONFIG_DEBUG_ENTRY) && - this_cpu_read(irq_count) != -1); + this_cpu_read(hardirq_stack_inuse)); if (!test_thread_flag(TIF_NEED_FPU_LOAD)) switch_fpu_prepare(prev_fpu, cpu); @@ -483,13 +612,15 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p) x86_fsgsbase_load(prev, next); + x86_pkru_load(prev, next); + /* * Switch the PDA and FPU contexts. */ this_cpu_write(current_task, next_p); this_cpu_write(cpu_current_top_of_stack, task_top_of_stack(next_p)); - switch_fpu_finish(next_fpu); + switch_fpu_finish(); /* Reload sp0. */ update_task_stack(next_p); @@ -535,16 +666,12 @@ void set_personality_64bit(void) /* inherit personality from parent */ /* Make sure to be in 64bit mode */ - clear_thread_flag(TIF_IA32); clear_thread_flag(TIF_ADDR32); - clear_thread_flag(TIF_X32); /* Pretend that this comes from a 64bit execve */ task_pt_regs(current)->orig_ax = __NR_execve; current_thread_info()->status &= ~TS_COMPAT; - - /* Ensure the corresponding mm is not marked. */ if (current->mm) - current->mm->context.ia32_compat = 0; + current->mm->context.flags = MM_CONTEXT_HAS_VSYSCALL; /* TBD: overwrites user setup. Should have two bits. 
But 64bit processes have always behaved this way, @@ -555,11 +682,10 @@ void set_personality_64bit(void) static void __set_personality_x32(void) { -#ifdef CONFIG_X86_X32 - clear_thread_flag(TIF_IA32); - set_thread_flag(TIF_X32); +#ifdef CONFIG_X86_X32_ABI if (current->mm) - current->mm->context.ia32_compat = TIF_X32; + current->mm->context.flags = 0; + current->personality &= ~READ_IMPLIES_EXEC; /* * in_32bit_syscall() uses the presence of the x32 syscall bit @@ -577,10 +703,14 @@ static void __set_personality_x32(void) static void __set_personality_ia32(void) { #ifdef CONFIG_IA32_EMULATION - set_thread_flag(TIF_IA32); - clear_thread_flag(TIF_X32); - if (current->mm) - current->mm->context.ia32_compat = TIF_IA32; + if (current->mm) { + /* + * uprobes applied to this MM need to know this and + * cannot use user_64bit_mode() at that time. + */ + current->mm->context.flags = MM_CONTEXT_UPROBE_IA32; + } + current->personality |= force_personality32; /* Prepare the first "return" to user space */ task_pt_regs(current)->orig_ax = __NR_ia32_execve; @@ -715,7 +845,7 @@ SYSCALL_DEFINE2(arch_prctl, int, option, unsigned long, arg2) ret = do_arch_prctl_64(current, option, arg2); if (ret == -EINVAL) - ret = do_arch_prctl_common(current, option, arg2); + ret = do_arch_prctl_common(option, arg2); return ret; } @@ -723,7 +853,7 @@ SYSCALL_DEFINE2(arch_prctl, int, option, unsigned long, arg2) #ifdef CONFIG_IA32_EMULATION COMPAT_SYSCALL_DEFINE2(arch_prctl, int, option, unsigned long, arg2) { - return do_arch_prctl_common(current, option, arg2); + return do_arch_prctl_common(option, arg2); } #endif diff --git a/arch/x86/kernel/ptrace.c b/arch/x86/kernel/ptrace.c index f0e1ddbc2fd7..37c12fb92906 100644 --- a/arch/x86/kernel/ptrace.c +++ b/arch/x86/kernel/ptrace.c @@ -13,7 +13,6 @@ #include <linux/errno.h> #include <linux/slab.h> #include <linux/ptrace.h> -#include <linux/tracehook.h> #include <linux/user.h> #include <linux/elf.h> #include <linux/security.h> @@ -28,11 +27,10 @@ 
#include <linux/nospec.h> #include <linux/uaccess.h> -#include <asm/pgtable.h> #include <asm/processor.h> -#include <asm/fpu/internal.h> #include <asm/fpu/signal.h> #include <asm/fpu/regset.h> +#include <asm/fpu/xstate.h> #include <asm/debugreg.h> #include <asm/ldt.h> #include <asm/desc.h> @@ -172,9 +170,9 @@ static u16 get_segment_reg(struct task_struct *task, unsigned long offset) retval = *pt_regs_access(task_pt_regs(task), offset); else { if (task == current) - retval = get_user_gs(task_pt_regs(task)); + savesegment(gs, retval); else - retval = task_user_gs(task); + retval = task->thread.gs; } return retval; } @@ -205,14 +203,14 @@ static int set_segment_reg(struct task_struct *task, case offsetof(struct user_regs_struct, ss): if (unlikely(value == 0)) return -EIO; - /* Else, fall through */ + fallthrough; default: *pt_regs_access(task_pt_regs(task), offset) = value; break; case offsetof(struct user_regs_struct, gs): - task_user_gs(task) = value; + task->thread.gs = value; } return 0; @@ -282,17 +280,9 @@ static int set_segment_reg(struct task_struct *task, return -EIO; /* - * This function has some ABI oddities. - * - * A 32-bit ptracer probably expects that writing FS or GS will change - * FSBASE or GSBASE respectively. In the absence of FSGSBASE support, - * this code indeed has that effect. When FSGSBASE is added, this - * will require a special case. - * - * For existing 64-bit ptracers, writing FS or GS *also* currently - * changes the base if the selector is nonzero the next time the task - * is run. This behavior may not be needed, and trying to preserve it - * when FSGSBASE is added would be complicated at best. + * Writes to FS and GS will change the stored selector. Whether + * this changes the segment base as well depends on whether + * FSGSBASE is enabled. 
*/ switch (offset) { @@ -380,25 +370,12 @@ static int putreg(struct task_struct *child, case offsetof(struct user_regs_struct,fs_base): if (value >= TASK_SIZE_MAX) return -EIO; - /* - * When changing the FS base, use do_arch_prctl_64() - * to set the index to zero and to set the base - * as requested. - * - * NB: This behavior is nonsensical and likely needs to - * change when FSGSBASE support is added. - */ - if (child->thread.fsbase != value) - return do_arch_prctl_64(child, ARCH_SET_FS, value); + x86_fsbase_write_task(child, value); return 0; case offsetof(struct user_regs_struct,gs_base): - /* - * Exactly the same here as the %fs handling above. - */ if (value >= TASK_SIZE_MAX) return -EIO; - if (child->thread.gsbase != value) - return do_arch_prctl_64(child, ARCH_SET_GS, value); + x86_gsbase_write_task(child, value); return 0; #endif } @@ -434,26 +411,12 @@ static unsigned long getreg(struct task_struct *task, unsigned long offset) static int genregs_get(struct task_struct *target, const struct user_regset *regset, - unsigned int pos, unsigned int count, - void *kbuf, void __user *ubuf) + struct membuf to) { - if (kbuf) { - unsigned long *k = kbuf; - while (count >= sizeof(*k)) { - *k++ = getreg(target, pos); - count -= sizeof(*k); - pos += sizeof(*k); - } - } else { - unsigned long __user *u = ubuf; - while (count >= sizeof(*u)) { - if (__put_user(getreg(target, pos), u++)) - return -EFAULT; - count -= sizeof(*u); - pos += sizeof(*u); - } - } + int reg; + for (reg = 0; to.left; reg++) + membuf_store(&to, getreg(target, reg * sizeof(unsigned long))); return 0; } @@ -501,7 +464,7 @@ static void ptrace_triggered(struct perf_event *bp, break; } - thread->debugreg6 |= (DR_TRAP0 << i); + thread->virtual_dr6 |= (DR_TRAP0 << i); } /* @@ -637,7 +600,7 @@ static unsigned long ptrace_get_debugreg(struct task_struct *tsk, int n) if (bp) val = bp->hw.info.address; } else if (n == 6) { - val = thread->debugreg6; + val = thread->virtual_dr6 ^ DR6_RESERVED; /* Flip back to 
arch polarity */ } else if (n == 7) { val = thread->ptrace_dr7; } @@ -693,7 +656,7 @@ static int ptrace_set_debugreg(struct task_struct *tsk, int n, if (n < HBP_NUM) { rc = ptrace_set_breakpoint_addr(tsk, n, val); } else if (n == 6) { - thread->debugreg6 = val; + thread->virtual_dr6 = val ^ DR6_RESERVED; /* Flip to positive polarity */ rc = 0; } else if (n == 7) { rc = ptrace_write_dr7(tsk, val); @@ -717,16 +680,14 @@ static int ioperm_active(struct task_struct *target, static int ioperm_get(struct task_struct *target, const struct user_regset *regset, - unsigned int pos, unsigned int count, - void *kbuf, void __user *ubuf) + struct membuf to) { struct io_bitmap *iobm = target->thread.io_bitmap; if (!iobm) return -ENXIO; - return user_regset_copyout(&pos, &count, &kbuf, &ubuf, - iobm->bitmap, 0, IO_BITMAP_BYTES); + return membuf_write(&to, iobm->bitmap, IO_BITMAP_BYTES); } /* @@ -742,6 +703,9 @@ void ptrace_disable(struct task_struct *child) #if defined CONFIG_X86_32 || defined CONFIG_IA32_EMULATION static const struct user_regset_view user_x86_32_view; /* Initialized below. */ #endif +#ifdef CONFIG_X86_64 +static const struct user_regset_view user_x86_64_view; /* Initialized below. */ +#endif long arch_ptrace(struct task_struct *child, long request, unsigned long addr, unsigned long data) @@ -749,6 +713,14 @@ long arch_ptrace(struct task_struct *child, long request, int ret; unsigned long __user *datap = (unsigned long __user *)data; +#ifdef CONFIG_X86_64 + /* This is native 64-bit ptrace() */ + const struct user_regset_view *regset_view = &user_x86_64_view; +#else + /* This is native 32-bit ptrace() */ + const struct user_regset_view *regset_view = &user_x86_32_view; +#endif + switch (request) { /* read the word at location addr in the USER area. */ case PTRACE_PEEKUSR: { @@ -787,28 +759,28 @@ long arch_ptrace(struct task_struct *child, long request, case PTRACE_GETREGS: /* Get all gp regs from the child. 
*/ return copy_regset_to_user(child, - task_user_regset_view(current), + regset_view, REGSET_GENERAL, 0, sizeof(struct user_regs_struct), datap); case PTRACE_SETREGS: /* Set all gp regs in the child. */ return copy_regset_from_user(child, - task_user_regset_view(current), + regset_view, REGSET_GENERAL, 0, sizeof(struct user_regs_struct), datap); case PTRACE_GETFPREGS: /* Get the child FPU state. */ return copy_regset_to_user(child, - task_user_regset_view(current), + regset_view, REGSET_FP, 0, sizeof(struct user_i387_struct), datap); case PTRACE_SETFPREGS: /* Set the child FPU state. */ return copy_regset_from_user(child, - task_user_regset_view(current), + regset_view, REGSET_FP, 0, sizeof(struct user_i387_struct), datap); @@ -881,14 +853,39 @@ long arch_ptrace(struct task_struct *child, long request, static int putreg32(struct task_struct *child, unsigned regno, u32 value) { struct pt_regs *regs = task_pt_regs(child); + int ret; switch (regno) { SEG32(cs); SEG32(ds); SEG32(es); - SEG32(fs); - SEG32(gs); + + /* + * A 32-bit ptracer on a 64-bit kernel expects that writing + * FS or GS will also update the base. This is needed for + * operations like PTRACE_SETREGS to fully restore a saved + * CPU state. + */ + + case offsetof(struct user32, regs.fs): + ret = set_segment_reg(child, + offsetof(struct user_regs_struct, fs), + value); + if (ret == 0) + child->thread.fsbase = + x86_fsgsbase_read_task(child, value); + return ret; + + case offsetof(struct user32, regs.gs): + ret = set_segment_reg(child, + offsetof(struct user_regs_struct, gs), + value); + if (ret == 0) + child->thread.gsbase = + x86_fsgsbase_read_task(child, value); + return ret; + SEG32(ss); R32(ebx, bx); @@ -913,7 +910,7 @@ static int putreg32(struct task_struct *child, unsigned regno, u32 value) * syscall with TS_COMPAT still set. 
*/ regs->orig_ax = value; - if (syscall_get_nr(child, regs) >= 0) + if (syscall_get_nr(child, regs) != -1) child->thread_info.status |= TS_I386_REGS_POKED; break; @@ -1004,28 +1001,15 @@ static int getreg32(struct task_struct *child, unsigned regno, u32 *val) static int genregs32_get(struct task_struct *target, const struct user_regset *regset, - unsigned int pos, unsigned int count, - void *kbuf, void __user *ubuf) + struct membuf to) { - if (kbuf) { - compat_ulong_t *k = kbuf; - while (count >= sizeof(*k)) { - getreg32(target, pos, k++); - count -= sizeof(*k); - pos += sizeof(*k); - } - } else { - compat_ulong_t __user *u = ubuf; - while (count >= sizeof(*u)) { - compat_ulong_t word; - getreg32(target, pos, &word); - if (__put_user(word, u++)) - return -EFAULT; - count -= sizeof(*u); - pos += sizeof(*u); - } - } + int reg; + for (reg = 0; to.left; reg++) { + u32 val; + getreg32(target, reg * 4, &val); + membuf_store(&to, val); + } return 0; } @@ -1178,28 +1162,28 @@ static long x32_arch_ptrace(struct task_struct *child, case PTRACE_GETREGS: /* Get all gp regs from the child. */ return copy_regset_to_user(child, - task_user_regset_view(current), + &user_x86_64_view, REGSET_GENERAL, 0, sizeof(struct user_regs_struct), datap); case PTRACE_SETREGS: /* Set all gp regs in the child. */ return copy_regset_from_user(child, - task_user_regset_view(current), + &user_x86_64_view, REGSET_GENERAL, 0, sizeof(struct user_regs_struct), datap); case PTRACE_GETFPREGS: /* Get the child FPU state. */ return copy_regset_to_user(child, - task_user_regset_view(current), + &user_x86_64_view, REGSET_FP, 0, sizeof(struct user_i387_struct), datap); case PTRACE_SETFPREGS: /* Set the child FPU state. 
*/ return copy_regset_from_user(child, - task_user_regset_view(current), + &user_x86_64_view, REGSET_FP, 0, sizeof(struct user_i387_struct), datap); @@ -1235,25 +1219,25 @@ static struct user_regset x86_64_regsets[] __ro_after_init = { .core_note_type = NT_PRSTATUS, .n = sizeof(struct user_regs_struct) / sizeof(long), .size = sizeof(long), .align = sizeof(long), - .get = genregs_get, .set = genregs_set + .regset_get = genregs_get, .set = genregs_set }, [REGSET_FP] = { .core_note_type = NT_PRFPREG, - .n = sizeof(struct user_i387_struct) / sizeof(long), + .n = sizeof(struct fxregs_state) / sizeof(long), .size = sizeof(long), .align = sizeof(long), - .active = regset_xregset_fpregs_active, .get = xfpregs_get, .set = xfpregs_set + .active = regset_xregset_fpregs_active, .regset_get = xfpregs_get, .set = xfpregs_set }, [REGSET_XSTATE] = { .core_note_type = NT_X86_XSTATE, .size = sizeof(u64), .align = sizeof(u64), - .active = xstateregs_active, .get = xstateregs_get, + .active = xstateregs_active, .regset_get = xstateregs_get, .set = xstateregs_set }, [REGSET_IOPERM64] = { .core_note_type = NT_386_IOPERM, .n = IO_BITMAP_LONGS, .size = sizeof(long), .align = sizeof(long), - .active = ioperm_active, .get = ioperm_get + .active = ioperm_active, .regset_get = ioperm_get }, }; @@ -1276,24 +1260,24 @@ static struct user_regset x86_32_regsets[] __ro_after_init = { .core_note_type = NT_PRSTATUS, .n = sizeof(struct user_regs_struct32) / sizeof(u32), .size = sizeof(u32), .align = sizeof(u32), - .get = genregs32_get, .set = genregs32_set + .regset_get = genregs32_get, .set = genregs32_set }, [REGSET_FP] = { .core_note_type = NT_PRFPREG, .n = sizeof(struct user_i387_ia32_struct) / sizeof(u32), .size = sizeof(u32), .align = sizeof(u32), - .active = regset_fpregs_active, .get = fpregs_get, .set = fpregs_set + .active = regset_fpregs_active, .regset_get = fpregs_get, .set = fpregs_set }, [REGSET_XFP] = { .core_note_type = NT_PRXFPREG, - .n = sizeof(struct user32_fxsr_struct) / 
sizeof(u32), + .n = sizeof(struct fxregs_state) / sizeof(u32), .size = sizeof(u32), .align = sizeof(u32), - .active = regset_xregset_fpregs_active, .get = xfpregs_get, .set = xfpregs_set + .active = regset_xregset_fpregs_active, .regset_get = xfpregs_get, .set = xfpregs_set }, [REGSET_XSTATE] = { .core_note_type = NT_X86_XSTATE, .size = sizeof(u64), .align = sizeof(u64), - .active = xstateregs_active, .get = xstateregs_get, + .active = xstateregs_active, .regset_get = xstateregs_get, .set = xstateregs_set }, [REGSET_TLS] = { @@ -1302,13 +1286,13 @@ static struct user_regset x86_32_regsets[] __ro_after_init = { .size = sizeof(struct user_desc), .align = sizeof(struct user_desc), .active = regset_tls_active, - .get = regset_tls_get, .set = regset_tls_set + .regset_get = regset_tls_get, .set = regset_tls_set }, [REGSET_IOPERM32] = { .core_note_type = NT_386_IOPERM, .n = IO_BITMAP_BYTES / sizeof(u32), .size = sizeof(u32), .align = sizeof(u32), - .active = ioperm_active, .get = ioperm_get + .active = ioperm_active, .regset_get = ioperm_get }, }; @@ -1335,6 +1319,25 @@ void __init update_regset_xstate_info(unsigned int size, u64 xstate_mask) xstate_fx_sw_bytes[USER_XSTATE_XCR0_WORD] = xstate_mask; } +/* + * This is used by the core dump code to decide which regset to dump. The + * core dump code writes out the resulting .e_machine and the corresponding + * regsets. This is suboptimal if the task is messing around with its CS.L + * field, but at worst the core dump will end up missing some information. + * + * Unfortunately, it is also used by the broken PTRACE_GETREGSET and + * PTRACE_SETREGSET APIs. These APIs look at the .regsets field but have + * no way to make sure that the e_machine they use matches the caller's + * expectations. The result is that the data format returned by + * PTRACE_GETREGSET depends on the returned CS field (and even the offset + * of the returned CS field depends on its value!) 
and the data format + * accepted by PTRACE_SETREGSET is determined by the old CS value. The + * upshot is that it is basically impossible to use these APIs correctly. + * + * The best way to fix it in the long run would probably be to add new + * improved ptrace() APIs to read and write registers reliably, possibly by + * allowing userspace to select the ELF e_machine variant that they expect. + */ const struct user_regset_view *task_user_regset_view(struct task_struct *task) { #ifdef CONFIG_IA32_EMULATION diff --git a/arch/x86/kernel/pvclock.c b/arch/x86/kernel/pvclock.c index 10125358b9c4..eda37df016f0 100644 --- a/arch/x86/kernel/pvclock.c +++ b/arch/x86/kernel/pvclock.c @@ -89,7 +89,7 @@ u64 pvclock_clocksource_read(struct pvclock_vcpu_time_info *src) /* * Assumption here is that last_value, a global accumulator, always goes * forward. If we are less than that, we should not be much smaller. - * We assume there is an error marging we're inside, and then the correction + * We assume there is an error margin we're inside, and then the correction * does not sacrifice accuracy. 
* * For reads: global may have changed between test and return, @@ -145,7 +145,7 @@ void pvclock_read_wallclock(struct pvclock_wall_clock *wall_clock, void pvclock_set_pvti_cpu0_va(struct pvclock_vsyscall_time_info *pvti) { - WARN_ON(vclock_was_used(VCLOCK_PVCLOCK)); + WARN_ON(vclock_was_used(VDSO_CLOCKMODE_PVCLOCK)); pvti_cpu0_va = pvti; } diff --git a/arch/x86/kernel/quirks.c b/arch/x86/kernel/quirks.c index 896d74cb5081..6d0df6a58873 100644 --- a/arch/x86/kernel/quirks.c +++ b/arch/x86/kernel/quirks.c @@ -8,6 +8,7 @@ #include <asm/hpet.h> #include <asm/setup.h> +#include <asm/mce.h> #if defined(CONFIG_X86_IO_APIC) && defined(CONFIG_SMP) && defined(CONFIG_PCI) @@ -95,7 +96,7 @@ static void ich_force_hpet_resume(void) static void ich_force_enable_hpet(struct pci_dev *dev) { u32 val; - u32 uninitialized_var(rcba); + u32 rcba; int err = 0; if (hpet_address || force_hpet_address) @@ -185,7 +186,7 @@ static void hpet_print_force_info(void) static void old_ich_force_hpet_resume(void) { u32 val; - u32 uninitialized_var(gen_cntl); + u32 gen_cntl; if (!force_hpet_address || !cached_dev) return; @@ -207,7 +208,7 @@ static void old_ich_force_hpet_resume(void) static void old_ich_force_enable_hpet(struct pci_dev *dev) { u32 val; - u32 uninitialized_var(gen_cntl); + u32 gen_cntl; if (hpet_address || force_hpet_address) return; @@ -298,7 +299,7 @@ static void vt8237_force_hpet_resume(void) static void vt8237_force_enable_hpet(struct pci_dev *dev) { - u32 uninitialized_var(val); + u32 val; if (hpet_address || force_hpet_address) return; @@ -429,7 +430,7 @@ static void nvidia_force_hpet_resume(void) static void nvidia_force_enable_hpet(struct pci_dev *dev) { - u32 uninitialized_var(val); + u32 val; if (hpet_address || force_hpet_address) return; @@ -624,10 +625,6 @@ static void amd_disable_seq_and_redirect_scrub(struct pci_dev *dev) DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_16H_NB_F3, amd_disable_seq_and_redirect_scrub); -#if defined(CONFIG_X86_64) && 
defined(CONFIG_X86_MCE) -#include <linux/jump_label.h> -#include <asm/string_64.h> - /* Ivy Bridge, Haswell, Broadwell */ static void quirk_intel_brickland_xeon_ras_cap(struct pci_dev *pdev) { @@ -636,7 +633,7 @@ static void quirk_intel_brickland_xeon_ras_cap(struct pci_dev *pdev) pci_read_config_dword(pdev, 0x84, &capid0); if (capid0 & 0x10) - static_branch_inc(&mcsafe_key); + enable_copy_mc_fragile(); } /* Skylake */ @@ -653,7 +650,7 @@ static void quirk_intel_purley_xeon_ras_cap(struct pci_dev *pdev) * enabled, so memory machine check recovery is also enabled. */ if ((capid0 & 0xc0) == 0xc0 || (capid5 & 0x1e0)) - static_branch_inc(&mcsafe_key); + enable_copy_mc_fragile(); } DECLARE_PCI_FIXUP_EARLY(PCI_VENDOR_ID_INTEL, 0x0ec3, quirk_intel_brickland_xeon_ras_cap); @@ -661,7 +658,6 @@ DECLARE_PCI_FIXUP_EARLY(PCI_VENDOR_ID_INTEL, 0x2fc0, quirk_intel_brickland_xeon_ DECLARE_PCI_FIXUP_EARLY(PCI_VENDOR_ID_INTEL, 0x6fc0, quirk_intel_brickland_xeon_ras_cap); DECLARE_PCI_FIXUP_EARLY(PCI_VENDOR_ID_INTEL, 0x2083, quirk_intel_purley_xeon_ras_cap); #endif -#endif bool x86_apple_machine; EXPORT_SYMBOL(x86_apple_machine); diff --git a/arch/x86/kernel/reboot.c b/arch/x86/kernel/reboot.c index 0cc7c0b106bb..c3636ea4aa71 100644 --- a/arch/x86/kernel/reboot.c +++ b/arch/x86/kernel/reboot.c @@ -10,14 +10,14 @@ #include <linux/sched.h> #include <linux/tboot.h> #include <linux/delay.h> -#include <linux/frame.h> +#include <linux/objtool.h> +#include <linux/pgtable.h> #include <acpi/reboot.h> #include <asm/io.h> #include <asm/apic.h> #include <asm/io_apic.h> #include <asm/desc.h> #include <asm/hpet.h> -#include <asm/pgtable.h> #include <asm/proto.h> #include <asm/reboot_fixups.h> #include <asm/reboot.h> @@ -113,17 +113,9 @@ void __noreturn machine_real_restart(unsigned int type) spin_unlock(&rtc_lock); /* - * Switch back to the initial page table. + * Switch to the trampoline page table. 
*/ -#ifdef CONFIG_X86_32 - load_cr3(initial_page_table); -#else - write_cr3(real_mode_header->trampoline_pgd); - - /* Exiting long mode will fail if CR4.PCIDE is set. */ - if (boot_cpu_has(X86_FEATURE_PCID)) - cr4_clear_bits(X86_CR4_PCIDE); -#endif + load_trampoline_pgtable(); /* Jump to the identity-mapped low memory code */ #ifdef CONFIG_X86_32 @@ -197,6 +189,14 @@ static const struct dmi_system_id reboot_dmi_table[] __initconst = { DMI_MATCH(DMI_PRODUCT_NAME, "MacBook5"), }, }, + { /* Handle problems with rebooting on Apple MacBook6,1 */ + .callback = set_pci_reboot, + .ident = "Apple MacBook6,1", + .matches = { + DMI_MATCH(DMI_SYS_VENDOR, "Apple Inc."), + DMI_MATCH(DMI_PRODUCT_NAME, "MacBook6,1"), + }, + }, { /* Handle problems with rebooting on Apple MacBookPro5 */ .callback = set_pci_reboot, .ident = "Apple MacBookPro5", @@ -380,10 +380,11 @@ static const struct dmi_system_id reboot_dmi_table[] __initconst = { }, { /* Handle problems with rebooting on the OptiPlex 990. */ .callback = set_pci_reboot, - .ident = "Dell OptiPlex 990", + .ident = "Dell OptiPlex 990 BIOS A0x", .matches = { DMI_MATCH(DMI_SYS_VENDOR, "Dell Inc."), DMI_MATCH(DMI_PRODUCT_NAME, "OptiPlex 990"), + DMI_MATCH(DMI_BIOS_VERSION, "A0"), }, }, { /* Handle problems with rebooting on Dell 300's */ @@ -469,6 +470,15 @@ static const struct dmi_system_id reboot_dmi_table[] __initconst = { }, }, + { /* PCIe Wifi card isn't detected after reboot otherwise */ + .callback = set_pci_reboot, + .ident = "Zotac ZBOX CI327 nano", + .matches = { + DMI_MATCH(DMI_SYS_VENDOR, "NA"), + DMI_MATCH(DMI_PRODUCT_NAME, "ZBOX-CI327NANO-GS-01"), + }, + }, + /* Sony */ { /* Handle problems with rebooting on Sony VGN-Z540N */ .callback = set_bios_reboot, @@ -530,31 +540,21 @@ static void emergency_vmx_disable_all(void) local_irq_disable(); /* - * We need to disable VMX on all CPUs before rebooting, otherwise - * we risk hanging up the machine, because the CPU ignore INIT - * signals when VMX is enabled. 
+ * Disable VMX on all CPUs before rebooting, otherwise we risk hanging + * the machine, because the CPU blocks INIT when it's in VMX root. * - * We can't take any locks and we may be on an inconsistent - * state, so we use NMIs as IPIs to tell the other CPUs to disable - * VMX and halt. + * We can't take any locks and we may be on an inconsistent state, so + * use NMIs as IPIs to tell the other CPUs to exit VMX root and halt. * - * For safety, we will avoid running the nmi_shootdown_cpus() - * stuff unnecessarily, but we don't have a way to check - * if other CPUs have VMX enabled. So we will call it only if the - * CPU we are running on has VMX enabled. - * - * We will miss cases where VMX is not enabled on all CPUs. This - * shouldn't do much harm because KVM always enable VMX on all - * CPUs anyway. But we can miss it on the small window where KVM - * is still enabling VMX. + * Do the NMI shootdown even if VMX is off on _this_ CPU, as that + * doesn't prevent a different CPU from being in VMX root operation. */ - if (cpu_has_vmx() && cpu_vmx_enabled()) { - /* Disable VMX on this CPU. */ - cpu_vmxoff(); + if (cpu_has_vmx()) { + /* Safely force _this_ CPU out of VMX root operation. */ + __cpu_emergency_vmxoff(); - /* Halt and disable VMX on the other CPUs */ + /* Halt and exit VMX root operation on the other CPUs. */ nmi_shootdown_cpus(vmxoff_nmi); - } } @@ -646,7 +646,7 @@ static void native_machine_emergency_restart(void) case BOOT_CF9_FORCE: port_cf9_safe = true; - /* Fall through */ + fallthrough; case BOOT_CF9_SAFE: if (port_cf9_safe) { @@ -662,7 +662,7 @@ static void native_machine_emergency_restart(void) break; case BOOT_TRIPLE: - idt_invalidate(NULL); + idt_invalidate(); __asm__ __volatile__("int3"); /* We're probably dead after this, but... 
*/ @@ -739,10 +739,10 @@ static void native_machine_halt(void) static void native_machine_power_off(void) { - if (pm_power_off) { + if (kernel_can_power_off()) { if (!reboot_force) machine_shutdown(); - pm_power_off(); + do_kernel_power_off(); } /* A fallback in case there is no PM info available */ tboot_shutdown(TB_SHUTDOWN_HALT); diff --git a/arch/x86/kernel/relocate_kernel_32.S b/arch/x86/kernel/relocate_kernel_32.S index 94b33885f8d2..c7c4b1917336 100644 --- a/arch/x86/kernel/relocate_kernel_32.S +++ b/arch/x86/kernel/relocate_kernel_32.S @@ -7,10 +7,12 @@ #include <linux/linkage.h> #include <asm/page_types.h> #include <asm/kexec.h> +#include <asm/nospec-branch.h> #include <asm/processor-flags.h> /* - * Must be relocatable PIC code callable as a C function + * Must be relocatable PIC code callable as a C function, in particular + * there must be a plain RET and not jump to return thunk. */ #define PTR(x) (x << 2) @@ -91,7 +93,9 @@ SYM_CODE_START_NOALIGN(relocate_kernel) movl %edi, %eax addl $(identity_mapped - relocate_kernel), %eax pushl %eax + ANNOTATE_UNRET_SAFE ret + int3 SYM_CODE_END(relocate_kernel) SYM_CODE_START_LOCAL_NOALIGN(identity_mapped) @@ -107,7 +111,7 @@ SYM_CODE_START_LOCAL_NOALIGN(identity_mapped) * - Write protect disabled * - No task switch * - Don't do FP software emulation. 
- * - Proctected mode enabled + * - Protected mode enabled */ movl %cr0, %eax andl $~(X86_CR0_PG | X86_CR0_AM | X86_CR0_WP | X86_CR0_TS | X86_CR0_EM), %eax @@ -159,12 +163,15 @@ SYM_CODE_START_LOCAL_NOALIGN(identity_mapped) xorl %edx, %edx xorl %esi, %esi xorl %ebp, %ebp + ANNOTATE_UNRET_SAFE ret + int3 1: popl %edx movl CP_PA_SWAP_PAGE(%edi), %esp addl $PAGE_SIZE, %esp 2: + ANNOTATE_RETPOLINE_SAFE call *%edx /* get the re-entry point of the peer system */ @@ -190,7 +197,9 @@ SYM_CODE_START_LOCAL_NOALIGN(identity_mapped) movl %edi, %eax addl $(virtual_mapped - relocate_kernel), %eax pushl %eax + ANNOTATE_UNRET_SAFE ret + int3 SYM_CODE_END(identity_mapped) SYM_CODE_START_LOCAL_NOALIGN(virtual_mapped) @@ -208,7 +217,9 @@ SYM_CODE_START_LOCAL_NOALIGN(virtual_mapped) popl %edi popl %esi popl %ebx + ANNOTATE_UNRET_SAFE ret + int3 SYM_CODE_END(virtual_mapped) /* Do the copies */ @@ -271,7 +282,9 @@ SYM_CODE_START_LOCAL_NOALIGN(swap_pages) popl %edi popl %ebx popl %ebp + ANNOTATE_UNRET_SAFE ret + int3 SYM_CODE_END(swap_pages) .globl kexec_control_code_size diff --git a/arch/x86/kernel/relocate_kernel_64.S b/arch/x86/kernel/relocate_kernel_64.S index ef3ba99068d3..4809c0dc4eb0 100644 --- a/arch/x86/kernel/relocate_kernel_64.S +++ b/arch/x86/kernel/relocate_kernel_64.S @@ -9,9 +9,12 @@ #include <asm/kexec.h> #include <asm/processor-flags.h> #include <asm/pgtable_types.h> +#include <asm/nospec-branch.h> +#include <asm/unwind_hints.h> /* - * Must be relocatable PIC code callable as a C function + * Must be relocatable PIC code callable as a C function, in particular + * there must be a plain RET and not jump to return thunk. 
*/ #define PTR(x) (x << 3) @@ -39,12 +42,14 @@ .align PAGE_SIZE .code64 SYM_CODE_START_NOALIGN(relocate_kernel) + UNWIND_HINT_EMPTY + ANNOTATE_NOENDBR /* * %rdi indirection_page * %rsi page_list * %rdx start address * %rcx preserve_context - * %r8 sme_active + * %r8 host_mem_enc_active */ /* Save the CPU context, used for jumping back */ @@ -101,23 +106,34 @@ SYM_CODE_START_NOALIGN(relocate_kernel) /* jump to identity mapped page */ addq $(identity_mapped - relocate_kernel), %r8 pushq %r8 + ANNOTATE_UNRET_SAFE ret + int3 SYM_CODE_END(relocate_kernel) SYM_CODE_START_LOCAL_NOALIGN(identity_mapped) + UNWIND_HINT_EMPTY /* set return address to 0 if not preserving context */ pushq $0 /* store the start address on the stack */ pushq %rdx /* + * Clear X86_CR4_CET (if it was set) such that we can clear CR0_WP + * below. + */ + movq %cr4, %rax + andq $~(X86_CR4_CET), %rax + movq %rax, %cr4 + + /* * Set cr0 to a known state: * - Paging enabled * - Alignment check disabled * - Write protect disabled * - No task switch * - Don't do FP software emulation. 
- * - Proctected mode enabled + * - Protected mode enabled */ movq %cr0, %rax andq $~(X86_CR0_AM | X86_CR0_WP | X86_CR0_TS | X86_CR0_EM), %rax @@ -187,19 +203,19 @@ SYM_CODE_START_LOCAL_NOALIGN(identity_mapped) xorl %r14d, %r14d xorl %r15d, %r15d + ANNOTATE_UNRET_SAFE ret + int3 1: popq %rdx leaq PAGE_SIZE(%r10), %rsp + ANNOTATE_RETPOLINE_SAFE call *%rdx /* get the re-entry point of the peer system */ movq 0(%rsp), %rbp - call 1f -1: - popq %r8 - subq $(1b - relocate_kernel), %r8 + leaq relocate_kernel(%rip), %r8 movq CP_PA_SWAP_PAGE(%r8), %r10 movq CP_PA_BACKUP_PAGES_MAP(%r8), %rdi movq CP_PA_TABLE_PAGE(%r8), %rax @@ -208,10 +224,14 @@ SYM_CODE_START_LOCAL_NOALIGN(identity_mapped) call swap_pages movq $virtual_mapped, %rax pushq %rax + ANNOTATE_UNRET_SAFE ret + int3 SYM_CODE_END(identity_mapped) SYM_CODE_START_LOCAL_NOALIGN(virtual_mapped) + UNWIND_HINT_EMPTY + ANNOTATE_NOENDBR // RET target, above movq RSP(%r8), %rsp movq CR4(%r8), %rax movq %rax, %cr4 @@ -228,11 +248,14 @@ SYM_CODE_START_LOCAL_NOALIGN(virtual_mapped) popq %r12 popq %rbp popq %rbx + ANNOTATE_UNRET_SAFE ret + int3 SYM_CODE_END(virtual_mapped) /* Do the copies */ SYM_CODE_START_LOCAL_NOALIGN(swap_pages) + UNWIND_HINT_EMPTY movq %rdi, %rcx /* Put the page_list in %rcx */ xorl %edi, %edi xorl %esi, %esi @@ -284,7 +307,9 @@ SYM_CODE_START_LOCAL_NOALIGN(swap_pages) lea PAGE_SIZE(%rax), %rsi jmp 0b 3: + ANNOTATE_UNRET_SAFE ret + int3 SYM_CODE_END(swap_pages) .globl kexec_control_code_size diff --git a/arch/x86/kernel/resource.c b/arch/x86/kernel/resource.c index 9b9fb7882c20..bba1abd05bfe 100644 --- a/arch/x86/kernel/resource.c +++ b/arch/x86/kernel/resource.c @@ -1,6 +1,8 @@ // SPDX-License-Identifier: GPL-2.0 #include <linux/ioport.h> +#include <linux/printk.h> #include <asm/e820/api.h> +#include <asm/pci_x86.h> static void resource_clip(struct resource *res, resource_size_t start, resource_size_t end) @@ -27,12 +29,23 @@ static void remove_e820_regions(struct resource *avail) { int i; struct 
e820_entry *entry; + u64 e820_start, e820_end; + struct resource orig = *avail; + + if (!pci_use_e820) + return; for (i = 0; i < e820_table->nr_entries; i++) { entry = &e820_table->entries[i]; + e820_start = entry->addr; + e820_end = entry->addr + entry->size - 1; - resource_clip(avail, entry->addr, - entry->addr + entry->size - 1); + resource_clip(avail, e820_start, e820_end); + if (orig.start != avail->start || orig.end != avail->end) { + pr_info("clipped %pR to %pR for e820 entry [mem %#010Lx-%#010Lx]\n", + &orig, avail, e820_start, e820_end); + orig = *avail; + } } } diff --git a/arch/x86/kernel/rethook.c b/arch/x86/kernel/rethook.c new file mode 100644 index 000000000000..8a1c0111ae79 --- /dev/null +++ b/arch/x86/kernel/rethook.c @@ -0,0 +1,127 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * x86 implementation of rethook. Mostly copied from arch/x86/kernel/kprobes/core.c. + */ +#include <linux/bug.h> +#include <linux/rethook.h> +#include <linux/kprobes.h> +#include <linux/objtool.h> + +#include "kprobes/common.h" + +__visible void arch_rethook_trampoline_callback(struct pt_regs *regs); + +#ifndef ANNOTATE_NOENDBR +#define ANNOTATE_NOENDBR +#endif + +/* + * When a target function returns, this code saves registers and calls + * arch_rethook_trampoline_callback(), which calls the rethook handler. + */ +asm( + ".text\n" + ".global arch_rethook_trampoline\n" + ".type arch_rethook_trampoline, @function\n" + "arch_rethook_trampoline:\n" +#ifdef CONFIG_X86_64 + ANNOTATE_NOENDBR /* This is only jumped from ret instruction */ + /* Push a fake return address to tell the unwinder it's a rethook. */ + " pushq $arch_rethook_trampoline\n" + UNWIND_HINT_FUNC + " pushq $" __stringify(__KERNEL_DS) "\n" + /* Save the 'sp - 16', this will be fixed later. */ + " pushq %rsp\n" + " pushfq\n" + SAVE_REGS_STRING + " movq %rsp, %rdi\n" + " call arch_rethook_trampoline_callback\n" + RESTORE_REGS_STRING + /* In the callback function, 'regs->flags' is copied to 'regs->ss'. 
*/ + " addq $16, %rsp\n" + " popfq\n" +#else + /* Push a fake return address to tell the unwinder it's a rethook. */ + " pushl $arch_rethook_trampoline\n" + UNWIND_HINT_FUNC + " pushl %ss\n" + /* Save the 'sp - 8', this will be fixed later. */ + " pushl %esp\n" + " pushfl\n" + SAVE_REGS_STRING + " movl %esp, %eax\n" + " call arch_rethook_trampoline_callback\n" + RESTORE_REGS_STRING + /* In the callback function, 'regs->flags' is copied to 'regs->ss'. */ + " addl $8, %esp\n" + " popfl\n" +#endif + ASM_RET + ".size arch_rethook_trampoline, .-arch_rethook_trampoline\n" +); +NOKPROBE_SYMBOL(arch_rethook_trampoline); + +/* + * Called from arch_rethook_trampoline + */ +__used __visible void arch_rethook_trampoline_callback(struct pt_regs *regs) +{ + unsigned long *frame_pointer; + + /* fixup registers */ + regs->cs = __KERNEL_CS; +#ifdef CONFIG_X86_32 + regs->gs = 0; +#endif + regs->ip = (unsigned long)&arch_rethook_trampoline; + regs->orig_ax = ~0UL; + regs->sp += 2*sizeof(long); + frame_pointer = (long *)(regs + 1); + + /* + * The return address at 'frame_pointer' is recovered by the + * arch_rethook_fixup_return() which called from this + * rethook_trampoline_handler(). + */ + rethook_trampoline_handler(regs, (unsigned long)frame_pointer); + + /* + * Copy FLAGS to 'pt_regs::ss' so that arch_rethook_trampoline() + * can do RET right after POPF. + */ + *(unsigned long *)&regs->ss = regs->flags; +} +NOKPROBE_SYMBOL(arch_rethook_trampoline_callback); + +/* + * arch_rethook_trampoline() skips updating frame pointer. The frame pointer + * saved in arch_rethook_trampoline_callback() points to the real caller + * function's frame pointer. Thus the arch_rethook_trampoline() doesn't have + * a standard stack frame with CONFIG_FRAME_POINTER=y. + * Let's mark it non-standard function. Anyway, FP unwinder can correctly + * unwind without the hint. + */ +STACK_FRAME_NON_STANDARD_FP(arch_rethook_trampoline); + +/* This is called from rethook_trampoline_handler(). 
*/ +void arch_rethook_fixup_return(struct pt_regs *regs, + unsigned long correct_ret_addr) +{ + unsigned long *frame_pointer = (void *)(regs + 1); + + /* Replace fake return address with real one. */ + *frame_pointer = correct_ret_addr; +} +NOKPROBE_SYMBOL(arch_rethook_fixup_return); + +void arch_rethook_prepare(struct rethook_node *rh, struct pt_regs *regs, bool mcount) +{ + unsigned long *stack = (unsigned long *)regs->sp; + + rh->ret_addr = stack[0]; + rh->frame = regs->sp; + + /* Replace the return addr with trampoline addr */ + stack[0] = (unsigned long) arch_rethook_trampoline; +} +NOKPROBE_SYMBOL(arch_rethook_prepare); diff --git a/arch/x86/kernel/rtc.c b/arch/x86/kernel/rtc.c index 586f718b8e95..349046434513 100644 --- a/arch/x86/kernel/rtc.c +++ b/arch/x86/kernel/rtc.c @@ -4,11 +4,8 @@ */ #include <linux/platform_device.h> #include <linux/mc146818rtc.h> -#include <linux/acpi.h> -#include <linux/bcd.h> #include <linux/export.h> #include <linux/pnp.h> -#include <linux/of.h> #include <asm/vsyscall.h> #include <asm/x86_init.h> @@ -20,26 +17,23 @@ /* * This is a special lock that is owned by the CPU and holds the index * register we are working with. It is required for NMI access to the - * CMOS/RTC registers. See include/asm-i386/mc146818rtc.h for details. + * CMOS/RTC registers. See arch/x86/include/asm/mc146818rtc.h for details. */ volatile unsigned long cmos_lock; EXPORT_SYMBOL(cmos_lock); #endif /* CONFIG_X86_32 */ -/* For two digit years assume time is always after that */ -#define CMOS_YEARS_OFFS 2000 - DEFINE_SPINLOCK(rtc_lock); EXPORT_SYMBOL(rtc_lock); /* - * In order to set the CMOS clock precisely, set_rtc_mmss has to be + * In order to set the CMOS clock precisely, mach_set_cmos_time has to be * called 500 ms after the second nowtime has started, because when * nowtime is written into the registers of the CMOS clock, it will * jump to the next second precisely 500 ms later. Check the Motorola * MC146818A or Dallas DS12887 data sheet for details. 
*/ -int mach_set_rtc_mmss(const struct timespec64 *now) +int mach_set_cmos_time(const struct timespec64 *now) { unsigned long long nowtime = now->tv_sec; struct rtc_time tm; @@ -62,8 +56,7 @@ int mach_set_rtc_mmss(const struct timespec64 *now) void mach_get_cmos_time(struct timespec64 *now) { - unsigned int status, year, mon, day, hour, min, sec, century = 0; - unsigned long flags; + struct rtc_time tm; /* * If pm_trace abused the RTC as storage, set the timespec to 0, @@ -74,51 +67,13 @@ void mach_get_cmos_time(struct timespec64 *now) return; } - spin_lock_irqsave(&rtc_lock, flags); - - /* - * If UIP is clear, then we have >= 244 microseconds before - * RTC registers will be updated. Spec sheet says that this - * is the reliable way to read RTC - registers. If UIP is set - * then the register access might be invalid. - */ - while ((CMOS_READ(RTC_FREQ_SELECT) & RTC_UIP)) - cpu_relax(); - - sec = CMOS_READ(RTC_SECONDS); - min = CMOS_READ(RTC_MINUTES); - hour = CMOS_READ(RTC_HOURS); - day = CMOS_READ(RTC_DAY_OF_MONTH); - mon = CMOS_READ(RTC_MONTH); - year = CMOS_READ(RTC_YEAR); - -#ifdef CONFIG_ACPI - if (acpi_gbl_FADT.header.revision >= FADT2_REVISION_ID && - acpi_gbl_FADT.century) - century = CMOS_READ(acpi_gbl_FADT.century); -#endif - - status = CMOS_READ(RTC_CONTROL); - WARN_ON_ONCE(RTC_ALWAYS_BCD && (status & RTC_DM_BINARY)); - - spin_unlock_irqrestore(&rtc_lock, flags); - - if (RTC_ALWAYS_BCD || !(status & RTC_DM_BINARY)) { - sec = bcd2bin(sec); - min = bcd2bin(min); - hour = bcd2bin(hour); - day = bcd2bin(day); - mon = bcd2bin(mon); - year = bcd2bin(year); + if (mc146818_get_time(&tm)) { + pr_err("Unable to read current time from RTC\n"); + now->tv_sec = now->tv_nsec = 0; + return; } - if (century) { - century = bcd2bin(century); - year += century * 100; - } else - year += CMOS_YEARS_OFFS; - - now->tv_sec = mktime64(year, mon, day, hour, min, sec); + now->tv_sec = rtc_tm_to_time64(&tm); now->tv_nsec = 0; } diff --git a/arch/x86/kernel/setup.c 
b/arch/x86/kernel/setup.c index a74262c71484..216fee7144ee 100644 --- a/arch/x86/kernel/setup.c +++ b/arch/x86/kernel/setup.c @@ -5,25 +5,33 @@ * This file contains the setup_arch() code, which handles the architecture-dependent * parts of early kernel initialization. */ +#include <linux/acpi.h> #include <linux/console.h> #include <linux/crash_dump.h> +#include <linux/dma-map-ops.h> #include <linux/dmi.h> #include <linux/efi.h> +#include <linux/ima.h> #include <linux/init_ohci1394_dma.h> #include <linux/initrd.h> #include <linux/iscsi_ibft.h> #include <linux/memblock.h> +#include <linux/panic_notifier.h> #include <linux/pci.h> #include <linux/root_dev.h> -#include <linux/sfi.h> +#include <linux/hugetlb.h> #include <linux/tboot.h> #include <linux/usb/xhci-dbgp.h> +#include <linux/static_call.h> +#include <linux/swiotlb.h> +#include <linux/random.h> #include <uapi/linux/mount.h> #include <xen/xen.h> #include <asm/apic.h> +#include <asm/numa.h> #include <asm/bios_ebda.h> #include <asm/bugs.h> #include <asm/cpu.h> @@ -34,12 +42,14 @@ #include <asm/kasan.h> #include <asm/kaslr.h> #include <asm/mce.h> +#include <asm/memtype.h> #include <asm/mtrr.h> #include <asm/realmode.h> #include <asm/olpc_ofw.h> #include <asm/pci-direct.h> #include <asm/prom.h> #include <asm/proto.h> +#include <asm/thermal.h> #include <asm/unwind.h> #include <asm/vsyscall.h> #include <linux/vmalloc.h> @@ -59,12 +69,6 @@ RESERVE_BRK(dmi_alloc, 65536); #endif -/* - * Range of the BSS area. The size of the BSS area is determined - * at link time, with RESERVE_BRK*() facility reserving additional - * chunks. 
- */ -static __initdata unsigned long _brk_start = (unsigned long)__brk_base; unsigned long _brk_end = (unsigned long)__brk_base; @@ -115,11 +119,6 @@ EXPORT_SYMBOL(boot_cpu_data); unsigned int def_to_bigsmp; -/* For MCA, but anyone else can use it if they want */ -unsigned int machine_id; -unsigned int machine_submodel_id; -unsigned int BIOS_revision; - struct apm_info apm_info; EXPORT_SYMBOL(apm_info); @@ -143,6 +142,11 @@ __visible unsigned long mmu_cr4_features __ro_after_init; __visible unsigned long mmu_cr4_features __ro_after_init = X86_CR4_PAE; #endif +#ifdef CONFIG_IMA +static phys_addr_t ima_kexec_buffer_phys; +static size_t ima_kexec_buffer_size; +#endif + /* Boot loader ID and version as integers, for the benefit of proc_dointvec */ int bootloader_type, bootloader_version; @@ -237,6 +241,9 @@ static u64 __init get_ramdisk_image(void) ramdisk_image |= (u64)boot_params.ext_ramdisk_image << 32; + if (ramdisk_image == 0) + ramdisk_image = phys_initrd_start; + return ramdisk_image; } static u64 __init get_ramdisk_size(void) @@ -245,6 +252,9 @@ static u64 __init get_ramdisk_size(void) ramdisk_size |= (u64)boot_params.ext_ramdisk_size << 32; + if (ramdisk_size == 0) + ramdisk_size = phys_initrd_size; + return ramdisk_size; } @@ -256,16 +266,12 @@ static void __init relocate_initrd(void) u64 area_size = PAGE_ALIGN(ramdisk_size); /* We need to move the initrd down into directly mapped mem */ - relocated_ramdisk = memblock_find_in_range(0, PFN_PHYS(max_pfn_mapped), - area_size, PAGE_SIZE); - + relocated_ramdisk = memblock_phys_alloc_range(area_size, PAGE_SIZE, 0, + PFN_PHYS(max_pfn_mapped)); if (!relocated_ramdisk) panic("Cannot find place for new RAMDISK of size %lld\n", ramdisk_size); - /* Note: this includes all the mem currently occupied by - the initrd, we rely on that fact to keep the data intact. 
*/ - memblock_reserve(relocated_ramdisk, area_size); initrd_start = relocated_ramdisk + PAGE_OFFSET; initrd_end = initrd_start + ramdisk_size; printk(KERN_INFO "Allocated new RAMDISK: [mem %#010llx-%#010llx]\n", @@ -292,13 +298,13 @@ static void __init early_reserve_initrd(void) memblock_reserve(ramdisk_image, ramdisk_end - ramdisk_image); } + static void __init reserve_initrd(void) { /* Assume only end is not page aligned */ u64 ramdisk_image = get_ramdisk_image(); u64 ramdisk_size = get_ramdisk_size(); u64 ramdisk_end = PAGE_ALIGN(ramdisk_image + ramdisk_size); - u64 mapped_size; if (!boot_params.hdr.type_of_loader || !ramdisk_image || !ramdisk_size) @@ -306,12 +312,6 @@ static void __init reserve_initrd(void) initrd_start = 0; - mapped_size = memblock_mem_size(max_pfn_mapped); - if (ramdisk_size >= (mapped_size>>1)) - panic("initrd too large to handle, " - "disabling initrd (%lld needed, %lld available)\n", - ramdisk_size, mapped_size>>1); - printk(KERN_INFO "RAMDISK: [mem %#010llx-%#010llx]\n", ramdisk_image, ramdisk_end - 1); @@ -325,7 +325,7 @@ static void __init reserve_initrd(void) relocate_initrd(); - memblock_free(ramdisk_image, ramdisk_end - ramdisk_image); + memblock_phys_free(ramdisk_image, ramdisk_end - ramdisk_image); } #else @@ -337,6 +337,60 @@ static void __init reserve_initrd(void) } #endif /* CONFIG_BLK_DEV_INITRD */ +static void __init add_early_ima_buffer(u64 phys_addr) +{ +#ifdef CONFIG_IMA + struct ima_setup_data *data; + + data = early_memremap(phys_addr + sizeof(struct setup_data), sizeof(*data)); + if (!data) { + pr_warn("setup: failed to memremap ima_setup_data entry\n"); + return; + } + + if (data->size) { + memblock_reserve(data->addr, data->size); + ima_kexec_buffer_phys = data->addr; + ima_kexec_buffer_size = data->size; + } + + early_memunmap(data, sizeof(*data)); +#else + pr_warn("Passed IMA kexec data, but CONFIG_IMA not set. 
Ignoring.\n"); +#endif +} + +#if defined(CONFIG_HAVE_IMA_KEXEC) && !defined(CONFIG_OF_FLATTREE) +int __init ima_free_kexec_buffer(void) +{ + int rc; + + if (!ima_kexec_buffer_size) + return -ENOENT; + + rc = memblock_phys_free(ima_kexec_buffer_phys, + ima_kexec_buffer_size); + if (rc) + return rc; + + ima_kexec_buffer_phys = 0; + ima_kexec_buffer_size = 0; + + return 0; +} + +int __init ima_get_kexec_buffer(void **addr, size_t *size) +{ + if (!ima_kexec_buffer_size) + return -ENOENT; + + *addr = __va(ima_kexec_buffer_phys); + *size = ima_kexec_buffer_size; + + return 0; +} +#endif + static void __init parse_setup_data(void) { struct setup_data *data; @@ -362,6 +416,18 @@ static void __init parse_setup_data(void) case SETUP_EFI: parse_efi_setup(pa_data, data_len); break; + case SETUP_IMA: + add_early_ima_buffer(pa_data); + break; + case SETUP_RNG_SEED: + data = early_memremap(pa_data, data_len); + add_bootloader_randomness(data->data, data->len); + /* Zero seed for forward secrecy. */ + memzero_explicit(data->data, data->len); + /* Zero length in case we find ourselves back here by accident. 
*/ + memzero_explicit(&data->len, sizeof(data->len)); + early_memunmap(data, data_len); + break; default: break; } @@ -371,21 +437,41 @@ static void __init parse_setup_data(void) static void __init memblock_x86_reserve_range_setup_data(void) { + struct setup_indirect *indirect; struct setup_data *data; - u64 pa_data; + u64 pa_data, pa_next; + u32 len; pa_data = boot_params.hdr.setup_data; while (pa_data) { data = early_memremap(pa_data, sizeof(*data)); + if (!data) { + pr_warn("setup: failed to memremap setup_data entry\n"); + return; + } + + len = sizeof(*data); + pa_next = data->next; + memblock_reserve(pa_data, sizeof(*data) + data->len); - if (data->type == SETUP_INDIRECT && - ((struct setup_indirect *)data->data)->type != SETUP_INDIRECT) - memblock_reserve(((struct setup_indirect *)data->data)->addr, - ((struct setup_indirect *)data->data)->len); + if (data->type == SETUP_INDIRECT) { + len += data->len; + early_memunmap(data, sizeof(*data)); + data = early_memremap(pa_data, len); + if (!data) { + pr_warn("setup: failed to memremap indirect setup_data\n"); + return; + } - pa_data = data->next; - early_memunmap(data, sizeof(*data)); + indirect = (struct setup_indirect *)data->data; + + if (indirect->type != SETUP_INDIRECT) + memblock_reserve(indirect->addr, indirect->len); + } + + pa_data = pa_next; + early_memunmap(data, len); } } @@ -393,8 +479,6 @@ static void __init memblock_x86_reserve_range_setup_data(void) * --------- Crashkernel reservation ------------------------------ */ -#ifdef CONFIG_KEXEC_CORE - /* 16M alignment for crash kernel regions */ #define CRASH_ALIGN SZ_16M @@ -423,13 +507,13 @@ static int __init reserve_crashkernel_low(void) { #ifdef CONFIG_X86_64 unsigned long long base, low_base = 0, low_size = 0; - unsigned long total_low_mem; + unsigned long low_mem_limit; int ret; - total_low_mem = memblock_mem_size(1UL << (32 - PAGE_SHIFT)); + low_mem_limit = min(memblock_phys_mem_size(), CRASH_ADDR_LOW_MAX); /* crashkernel=Y,low */ - ret = 
parse_crashkernel_low(boot_command_line, total_low_mem, &low_size, &base); + ret = parse_crashkernel_low(boot_command_line, low_mem_limit, &low_size, &base); if (ret) { /* * two parts from kernel/dma/swiotlb.c: @@ -447,23 +531,17 @@ static int __init reserve_crashkernel_low(void) return 0; } - low_base = memblock_find_in_range(0, 1ULL << 32, low_size, CRASH_ALIGN); + low_base = memblock_phys_alloc_range(low_size, CRASH_ALIGN, 0, CRASH_ADDR_LOW_MAX); if (!low_base) { pr_err("Cannot reserve %ldMB crashkernel low memory, please try smaller size.\n", (unsigned long)(low_size >> 20)); return -ENOMEM; } - ret = memblock_reserve(low_base, low_size); - if (ret) { - pr_err("%s: Error reserving crashkernel low memblock.\n", __func__); - return ret; - } - - pr_info("Reserving %ldMB of low memory at %ldMB for crashkernel (System low RAM: %ldMB)\n", + pr_info("Reserving %ldMB of low memory at %ldMB for crashkernel (low RAM limit: %ldMB)\n", (unsigned long)(low_size >> 20), (unsigned long)(low_base >> 20), - (unsigned long)(total_low_mem >> 20)); + (unsigned long)(low_mem_limit >> 20)); crashk_low_res.start = low_base; crashk_low_res.end = low_base + low_size - 1; @@ -478,6 +556,9 @@ static void __init reserve_crashkernel(void) bool high = false; int ret; + if (!IS_ENABLED(CONFIG_KEXEC_CORE)) + return; + total_mem = memblock_phys_mem_size(); /* crashkernel=XM */ @@ -507,13 +588,13 @@ static void __init reserve_crashkernel(void) * unless "crashkernel=size[KMG],high" is specified. 
*/ if (!high) - crash_base = memblock_find_in_range(CRASH_ALIGN, - CRASH_ADDR_LOW_MAX, - crash_size, CRASH_ALIGN); + crash_base = memblock_phys_alloc_range(crash_size, + CRASH_ALIGN, CRASH_ALIGN, + CRASH_ADDR_LOW_MAX); if (!crash_base) - crash_base = memblock_find_in_range(CRASH_ALIGN, - CRASH_ADDR_HIGH_MAX, - crash_size, CRASH_ALIGN); + crash_base = memblock_phys_alloc_range(crash_size, + CRASH_ALIGN, CRASH_ALIGN, + CRASH_ADDR_HIGH_MAX); if (!crash_base) { pr_info("crashkernel reservation failed - No suitable area found.\n"); return; @@ -521,22 +602,16 @@ static void __init reserve_crashkernel(void) } else { unsigned long long start; - start = memblock_find_in_range(crash_base, - crash_base + crash_size, - crash_size, 1 << 20); + start = memblock_phys_alloc_range(crash_size, SZ_1M, crash_base, + crash_base + crash_size); if (start != crash_base) { pr_info("crashkernel reservation failed - memory is in use.\n"); return; } } - ret = memblock_reserve(crash_base, crash_size); - if (ret) { - pr_err("%s: Error reserving crashkernel memblock.\n", __func__); - return; - } if (crash_base >= (1ULL << 32) && reserve_crashkernel_low()) { - memblock_free(crash_base, crash_size); + memblock_phys_free(crash_base, crash_size); return; } @@ -549,11 +624,6 @@ static void __init reserve_crashkernel(void) crashk_res.end = crash_base + crash_size - 1; insert_resource(&iomem_resource, &crashk_res); } -#else -static void __init reserve_crashkernel(void) -{ -} -#endif static struct resource standard_io_resources[] = { { .name = "dma1", .start = 0x00, .end = 0x1f, @@ -588,16 +658,6 @@ void __init reserve_standard_io_resources(void) } -static __init void reserve_ibft_region(void) -{ - unsigned long addr, size = 0; - - addr = find_ibft_region(&size); - - if (size) - memblock_reserve(addr, size); -} - static bool __init snb_gfx_workaround_needed(void) { #ifdef CONFIG_PCI @@ -651,11 +711,16 @@ static void __init trim_snb_memory(void) printk(KERN_DEBUG "reserving inaccessible SNB gfx 
pages\n"); /* - * Reserve all memory below the 1 MB mark that has not - * already been reserved. + * SandyBridge integrated graphics devices have a bug that prevents + * them from accessing certain memory ranges, namely anything below + * 1M and in the pages listed in bad_pages[] above. + * + * To avoid these pages being ever accessed by SNB gfx devices reserve + * bad_pages that have not already been reserved at boot time. + * All memory below the 1 MB mark is anyway reserved later during + * setup_arch(), so there is no need to reserve it here. */ - memblock_reserve(0, 1<<20); - + for (i = 0; i < ARRAY_SIZE(bad_pages); i++) { if (memblock_reserve(bad_pages[i], PAGE_SIZE)) printk(KERN_WARNING "failed to reserve 0x%08lx\n", @@ -663,18 +728,6 @@ static void __init trim_snb_memory(void) } } -/* - * Here we put platform-specific memory range workarounds, i.e. - * memory known to be corrupt or otherwise in need to be reserved on - * specific platforms. - * - * If this gets used more widely it could use a real dispatch mechanism. - */ -static void __init trim_platform_memory_ranges(void) -{ - trim_snb_memory(); -} - static void __init trim_bios_range(void) { /* @@ -719,35 +772,39 @@ static void __init e820_add_kernel_range(void) e820__range_add(start, size, E820_TYPE_RAM); } -static unsigned reserve_low = CONFIG_X86_RESERVE_LOW << 10; - -static int __init parse_reservelow(char *p) +static void __init early_reserve_memory(void) { - unsigned long long size; - - if (!p) - return -EINVAL; - - size = memparse(p, &p); + /* + * Reserve the memory occupied by the kernel between _text and + * __end_of_kernel_reserve symbols. Any kernel sections after the + * __end_of_kernel_reserve symbol must be explicitly reserved with a + * separate memblock_reserve() or they will be discarded. 
+ */ + memblock_reserve(__pa_symbol(_text), + (unsigned long)__end_of_kernel_reserve - (unsigned long)_text); - if (size < 4096) - size = 4096; + /* + * The first 4Kb of memory is a BIOS owned area, but generally it is + * not listed as such in the E820 table. + * + * Reserve the first 64K of memory since some BIOSes are known to + * corrupt low memory. After the real mode trampoline is allocated the + * rest of the memory below 640k is reserved. + * + * In addition, make sure page 0 is always reserved because on + * systems with L1TF its contents can be leaked to user processes. + */ + memblock_reserve(0, SZ_64K); - if (size > 640*1024) - size = 640*1024; + early_reserve_initrd(); - reserve_low = size; + memblock_x86_reserve_range_setup_data(); - return 0; + reserve_ibft_region(); + reserve_bios_regions(); + trim_snb_memory(); } -early_param("reservelow", parse_reservelow); - -static void __init trim_low_memory_range(void) -{ - memblock_reserve(0, ALIGN(reserve_low, PAGE_SIZE)); -} - /* * Dump out kernel offset information on panic. */ @@ -767,6 +824,30 @@ dump_kernel_offset(struct notifier_block *self, unsigned long v, void *p) return 0; } +void x86_configure_nx(void) +{ + if (boot_cpu_has(X86_FEATURE_NX)) + __supported_pte_mask |= _PAGE_NX; + else + __supported_pte_mask &= ~_PAGE_NX; +} + +static void __init x86_report_nx(void) +{ + if (!boot_cpu_has(X86_FEATURE_NX)) { + printk(KERN_NOTICE "Notice: NX (Execute Disable) protection " + "missing in CPU!\n"); + } else { +#if defined(CONFIG_X86_64) || defined(CONFIG_X86_PAE) + printk(KERN_INFO "NX (Execute Disable) protection: active\n"); +#else + /* 32bit non-PAE kernel, NX cannot be used */ + printk(KERN_NOTICE "Notice: NX (Execute Disable) protection " + "cannot be enabled: non-PAE kernel!\n"); +#endif + } +} + /* * Determine if we were loaded by an EFI loader. 
If so, then we have also been * passed the efi memmap, systab, etc., so we should use these data structures @@ -782,29 +863,6 @@ dump_kernel_offset(struct notifier_block *self, unsigned long v, void *p) void __init setup_arch(char **cmdline_p) { - /* - * Reserve the memory occupied by the kernel between _text and - * __end_of_kernel_reserve symbols. Any kernel sections after the - * __end_of_kernel_reserve symbol must be explicitly reserved with a - * separate memblock_reserve() or they will be discarded. - */ - memblock_reserve(__pa_symbol(_text), - (unsigned long)__end_of_kernel_reserve - (unsigned long)_text); - - /* - * Make sure page 0 is always reserved because on systems with - * L1TF its contents can be leaked to user processes. - */ - memblock_reserve(0, PAGE_SIZE); - - early_reserve_initrd(); - - /* - * At this point everything still needed from the boot loader - * or BIOS or kernel text should be early reserved or marked not - * RAM in e820. All other memory is free game. - */ - #ifdef CONFIG_X86_32 memcpy(&boot_cpu_data, &new_cpu_data, sizeof(new_cpu_data)); @@ -840,8 +898,8 @@ void __init setup_arch(char **cmdline_p) idt_setup_early_traps(); early_cpu_init(); - arch_init_ideal_nops(); jump_label_init(); + static_call_init(); early_ioremap_init(); setup_olpc_ofw_pgd(); @@ -864,8 +922,6 @@ void __init setup_arch(char **cmdline_p) #ifdef CONFIG_BLK_DEV_RAM rd_image_start = boot_params.hdr.ram_size & RAMDISK_IMAGE_START_MASK; - rd_prompt = ((boot_params.hdr.ram_size & RAMDISK_PROMPT_FLAG) != 0); - rd_doload = ((boot_params.hdr.ram_size & RAMDISK_LOAD_FLAG) != 0); #endif #ifdef CONFIG_EFI if (!strncmp((char *)&boot_params.efi_info.efi_loader_signature, @@ -880,6 +936,20 @@ void __init setup_arch(char **cmdline_p) x86_init.oem.arch_setup(); + /* + * Do some memory reservations *before* memory is added to memblock, so + * memblock allocations won't overwrite it. 
+ * + * After this point, everything still needed from the boot loader or + * firmware or kernel text should be early reserved or marked not RAM in + * e820. All other memory is free game. + * + * This call needs to happen before e820__memory_setup() which calls the + * xen_memory_setup() on Xen dom0 which relies on the fact that those + * early reservations have happened already. + */ + early_reserve_memory(); + iomem_resource.end = (1ULL << boot_cpu_data.x86_phys_bits) - 1; e820__memory_setup(); parse_setup_data(); @@ -888,10 +958,7 @@ void __init setup_arch(char **cmdline_p) if (!boot_params.hdr.root_flags) root_mountflags &= ~MS_RDONLY; - init_mm.start_code = (unsigned long) _text; - init_mm.end_code = (unsigned long) _etext; - init_mm.end_data = (unsigned long) _edata; - init_mm.brk = _brk_end; + setup_initial_init_mm(_text, _etext, _edata, (void *)_brk_end); code_resource.start = __pa_symbol(_text); code_resource.end = __pa_symbol(_etext)-1; @@ -904,26 +971,24 @@ void __init setup_arch(char **cmdline_p) #ifdef CONFIG_CMDLINE_BOOL #ifdef CONFIG_CMDLINE_OVERRIDE - strlcpy(boot_command_line, builtin_cmdline, COMMAND_LINE_SIZE); + strscpy(boot_command_line, builtin_cmdline, COMMAND_LINE_SIZE); #else if (builtin_cmdline[0]) { /* append boot loader cmdline to builtin */ strlcat(builtin_cmdline, " ", COMMAND_LINE_SIZE); strlcat(builtin_cmdline, boot_command_line, COMMAND_LINE_SIZE); - strlcpy(boot_command_line, builtin_cmdline, COMMAND_LINE_SIZE); + strscpy(boot_command_line, builtin_cmdline, COMMAND_LINE_SIZE); } #endif #endif - strlcpy(command_line, boot_command_line, COMMAND_LINE_SIZE); + strscpy(command_line, boot_command_line, COMMAND_LINE_SIZE); *cmdline_p = command_line; /* * x86_configure_nx() is called before parse_early_param() to detect * whether hardware doesn't support NX (so that the early EHCI debug - * console setup can safely call set_fixmap()). 
It may then be called - * again from within noexec_setup() during parsing early parameters - * to honor the respective command line option. + * console setup can safely call set_fixmap()). */ x86_configure_nx(); @@ -931,6 +996,7 @@ void __init setup_arch(char **cmdline_p) if (efi_enabled(EFI_BOOT)) efi_memblock_x86_reserve_range(); + #ifdef CONFIG_MEMORY_HOTPLUG /* * Memory used by the kernel cannot be hot-removed because Linux @@ -957,9 +1023,6 @@ void __init setup_arch(char **cmdline_p) x86_report_nx(); - /* after early param, so could get panic from serial */ - memblock_x86_reserve_range_setup_data(); - if (acpi_mps_check()) { #ifdef CONFIG_X86_LOCAL_APIC disable_apic = 1; @@ -1011,7 +1074,11 @@ void __init setup_arch(char **cmdline_p) max_pfn = e820__end_of_ram_pfn(); /* update e820 for memory not covered by WB MTRRs */ - mtrr_bp_init(); + if (IS_ENABLED(CONFIG_MTRR)) + mtrr_bp_init(); + else + pat_disable("PAT support disabled because CONFIG_MTRR is disabled in the kernel."); + if (mtrr_trim_uncached_memory(max_pfn)) max_pfn = e820__end_of_ram_pfn(); @@ -1051,14 +1118,12 @@ void __init setup_arch(char **cmdline_p) */ find_smp_config(); - reserve_ibft_region(); - early_alloc_pgt_buf(); /* * Need to conclude brk, before e820__memblock_setup() - * it could use memblock_find_in_range, could overlap with - * brk area. + * it could use memblock_find_in_range, could overlap with + * brk area. */ reserve_brk(); @@ -1067,11 +1132,16 @@ void __init setup_arch(char **cmdline_p) memblock_set_current_limit(ISA_END_ADDRESS); e820__memblock_setup(); - reserve_bios_regions(); + /* + * Needs to run after memblock setup because it needs the physical + * memory size. 
+ */ + sev_setup_arch(); efi_fake_memmap(); efi_find_mirror(); efi_esrt_init(); + efi_mokvar_table_init(); /* * The EFI specification says that boot service code won't be @@ -1091,11 +1161,22 @@ void __init setup_arch(char **cmdline_p) (max_pfn_mapped<<PAGE_SHIFT) - 1); #endif + /* + * Find free memory for the real mode trampoline and place it there. If + * there is not enough free memory under 1M, on EFI-enabled systems + * there will be additional attempt to reclaim the memory for the real + * mode trampoline at efi_free_boot_services(). + * + * Unconditionally reserve the entire first 1M of RAM because BIOSes + * are known to corrupt low memory and several hundred kilobytes are not + * worth complex detection what memory gets clobbered. Windows does the + * same thing for very similar reasons. + * + * Moreover, on machines with SandyBridge graphics or in setups that use + * crashkernel the entire 1M is reserved anyway. + */ reserve_real_mode(); - trim_platform_memory_ranges(); - trim_low_memory_range(); - init_mem_mapping(); idt_setup_early_pf(); @@ -1141,6 +1222,8 @@ void __init setup_arch(char **cmdline_p) reserve_initrd(); acpi_table_upgrade(); + /* Look for ACPI tables and reserve memory occupied by them. */ + acpi_boot_table_init(); vsmp_init(); @@ -1148,16 +1231,14 @@ void __init setup_arch(char **cmdline_p) early_platform_quirks(); - /* - * Parse the ACPI tables for possible boot-time SMP configuration. - */ - acpi_boot_table_init(); - early_acpi_boot_init(); initmem_init(); dma_contiguous_reserve(max_pfn_mapped << PAGE_SHIFT); + if (boot_cpu_has(X86_FEATURE_GBPAGES)) + hugetlb_cma_reserve(PUD_SHIFT - PAGE_SHIFT); + /* * Reserve memory for crash kernel after SRAT is parsed so that it * won't consume hotpluggable memory. @@ -1193,7 +1274,6 @@ void __init setup_arch(char **cmdline_p) * Read APIC and some other early information from ACPI tables. 
*/ acpi_boot_init(); - sfi_init(); x86_dtb_init(); /* @@ -1210,6 +1290,7 @@ void __init setup_arch(char **cmdline_p) prefill_possible_map(); init_cpu_to_node(); + init_gi_nodes(); io_apic_init_mappings(); @@ -1232,6 +1313,14 @@ void __init setup_arch(char **cmdline_p) x86_init.timers.wallclock_init(); + /* + * This needs to run before setup_local_APIC() which soft-disables the + * local APIC temporarily and that masks the thermal LVT interrupt, + * leading to softlockups on machines which have configured SMI + * interrupt delivery. + */ + therm_lvt_init(); + mcheck_init(); register_refined_jiffies(CLOCK_TICK_RATE); diff --git a/arch/x86/kernel/setup_percpu.c b/arch/x86/kernel/setup_percpu.c index e6d7894ad127..49325caa7307 100644 --- a/arch/x86/kernel/setup_percpu.c +++ b/arch/x86/kernel/setup_percpu.c @@ -66,7 +66,7 @@ EXPORT_SYMBOL(__per_cpu_offset); */ static bool __init pcpu_need_numa(void) { -#ifdef CONFIG_NEED_MULTIPLE_NODES +#ifdef CONFIG_NUMA pg_data_t *last = NULL; unsigned int cpu; @@ -84,63 +84,9 @@ static bool __init pcpu_need_numa(void) } #endif -/** - * pcpu_alloc_bootmem - NUMA friendly alloc_bootmem wrapper for percpu - * @cpu: cpu to allocate for - * @size: size allocation in bytes - * @align: alignment - * - * Allocate @size bytes aligned at @align for cpu @cpu. This wrapper - * does the right thing for NUMA regardless of the current - * configuration. - * - * RETURNS: - * Pointer to the allocated area on success, NULL on failure. 
- */ -static void * __init pcpu_alloc_bootmem(unsigned int cpu, unsigned long size, - unsigned long align) -{ - const unsigned long goal = __pa(MAX_DMA_ADDRESS); -#ifdef CONFIG_NEED_MULTIPLE_NODES - int node = early_cpu_to_node(cpu); - void *ptr; - - if (!node_online(node) || !NODE_DATA(node)) { - ptr = memblock_alloc_from(size, align, goal); - pr_info("cpu %d has no node %d or node-local memory\n", - cpu, node); - pr_debug("per cpu data for cpu%d %lu bytes at %016lx\n", - cpu, size, __pa(ptr)); - } else { - ptr = memblock_alloc_try_nid(size, align, goal, - MEMBLOCK_ALLOC_ACCESSIBLE, - node); - - pr_debug("per cpu data for cpu%d %lu bytes on node%d at %016lx\n", - cpu, size, node, __pa(ptr)); - } - return ptr; -#else - return memblock_alloc_from(size, align, goal); -#endif -} - -/* - * Helpers for first chunk memory allocation - */ -static void * __init pcpu_fc_alloc(unsigned int cpu, size_t size, size_t align) -{ - return pcpu_alloc_bootmem(cpu, size, align); -} - -static void __init pcpu_fc_free(void *ptr, size_t size) -{ - memblock_free(__pa(ptr), size); -} - static int __init pcpu_cpu_distance(unsigned int from, unsigned int to) { -#ifdef CONFIG_NEED_MULTIPLE_NODES +#ifdef CONFIG_NUMA if (early_cpu_to_node(from) == early_cpu_to_node(to)) return LOCAL_DISTANCE; else @@ -150,7 +96,12 @@ static int __init pcpu_cpu_distance(unsigned int from, unsigned int to) #endif } -static void __init pcpup_populate_pte(unsigned long addr) +static int __init pcpu_cpu_to_node(int cpu) +{ + return early_cpu_to_node(cpu); +} + +void __init pcpu_populate_pte(unsigned long addr) { populate_extra_pte(addr); } @@ -205,15 +156,14 @@ void __init setup_per_cpu_areas(void) rc = pcpu_embed_first_chunk(PERCPU_FIRST_CHUNK_RESERVE, dyn_size, atom_size, pcpu_cpu_distance, - pcpu_fc_alloc, pcpu_fc_free); + pcpu_cpu_to_node); if (rc < 0) pr_warn("%s allocator failed (%d), falling back to page size\n", pcpu_fc_names[pcpu_chosen_fc], rc); } if (rc < 0) rc = 
pcpu_page_first_chunk(PERCPU_FIRST_CHUNK_RESERVE, - pcpu_fc_alloc, pcpu_fc_free, - pcpup_populate_pte); + pcpu_cpu_to_node); if (rc < 0) panic("cannot initialize percpu area (err=%d)", rc); @@ -224,7 +174,6 @@ void __init setup_per_cpu_areas(void) per_cpu(this_cpu_off, cpu) = per_cpu_offset(cpu); per_cpu(cpu_number, cpu) = cpu; setup_percpu_segment(cpu); - setup_stack_canary_segment(cpu); /* * Copy data used in early init routines from the * initial arrays to the per cpu data areas. These @@ -287,9 +236,9 @@ void __init setup_per_cpu_areas(void) /* * Sync back kernel address range again. We already did this in * setup_arch(), but percpu data also needs to be available in - * the smpboot asm. We can't reliably pick up percpu mappings - * using vmalloc_fault(), because exception dispatch needs - * percpu data. + * the smpboot asm and arch_sync_kernel_mappings() doesn't sync to + * swapper_pg_dir on 32-bit. The per-cpu mappings need to be available + * there too. * * FIXME: Can the later sync in setup_cpu_entry_areas() replace * this call? diff --git a/arch/x86/kernel/sev-shared.c b/arch/x86/kernel/sev-shared.c new file mode 100644 index 000000000000..3a5b0c9c4fcc --- /dev/null +++ b/arch/x86/kernel/sev-shared.c @@ -0,0 +1,993 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * AMD Encrypted Register State Support + * + * Author: Joerg Roedel <jroedel@suse.de> + * + * This file is not compiled stand-alone. It contains code shared + * between the pre-decompression boot code and the running Linux kernel + * and is included directly into both code-bases. + */ + +#ifndef __BOOT_COMPRESSED +#define error(v) pr_err(v) +#define has_cpuflag(f) boot_cpu_has(f) +#endif + +/* I/O parameters for CPUID-related helpers */ +struct cpuid_leaf { + u32 fn; + u32 subfn; + u32 eax; + u32 ebx; + u32 ecx; + u32 edx; +}; + +/* + * Individual entries of the SNP CPUID table, as defined by the SNP + * Firmware ABI, Revision 0.9, Section 7.1, Table 14. 
+ */ +struct snp_cpuid_fn { + u32 eax_in; + u32 ecx_in; + u64 xcr0_in; + u64 xss_in; + u32 eax; + u32 ebx; + u32 ecx; + u32 edx; + u64 __reserved; +} __packed; + +/* + * SNP CPUID table, as defined by the SNP Firmware ABI, Revision 0.9, + * Section 8.14.2.6. Also noted there is the SNP firmware-enforced limit + * of 64 entries per CPUID table. + */ +#define SNP_CPUID_COUNT_MAX 64 + +struct snp_cpuid_table { + u32 count; + u32 __reserved1; + u64 __reserved2; + struct snp_cpuid_fn fn[SNP_CPUID_COUNT_MAX]; +} __packed; + +/* + * Since feature negotiation related variables are set early in the boot + * process they must reside in the .data section so as not to be zeroed + * out when the .bss section is later cleared. + * + * GHCB protocol version negotiated with the hypervisor. + */ +static u16 ghcb_version __ro_after_init; + +/* Copy of the SNP firmware's CPUID page. */ +static struct snp_cpuid_table cpuid_table_copy __ro_after_init; + +/* + * These will be initialized based on CPUID table so that non-present + * all-zero leaves (for sparse tables) can be differentiated from + * invalid/out-of-range leaves. This is needed since all-zero leaves + * still need to be post-processed. + */ +static u32 cpuid_std_range_max __ro_after_init; +static u32 cpuid_hyp_range_max __ro_after_init; +static u32 cpuid_ext_range_max __ro_after_init; + +static bool __init sev_es_check_cpu_features(void) +{ + if (!has_cpuflag(X86_FEATURE_RDRAND)) { + error("RDRAND instruction not supported - no trusted source of randomness available\n"); + return false; + } + + return true; +} + +static void __noreturn sev_es_terminate(unsigned int set, unsigned int reason) +{ + u64 val = GHCB_MSR_TERM_REQ; + + /* Tell the hypervisor what went wrong. 
*/ + val |= GHCB_SEV_TERM_REASON(set, reason); + + /* Request Guest Termination from Hypvervisor */ + sev_es_wr_ghcb_msr(val); + VMGEXIT(); + + while (true) + asm volatile("hlt\n" : : : "memory"); +} + +/* + * The hypervisor features are available from GHCB version 2 onward. + */ +static u64 get_hv_features(void) +{ + u64 val; + + if (ghcb_version < 2) + return 0; + + sev_es_wr_ghcb_msr(GHCB_MSR_HV_FT_REQ); + VMGEXIT(); + + val = sev_es_rd_ghcb_msr(); + if (GHCB_RESP_CODE(val) != GHCB_MSR_HV_FT_RESP) + return 0; + + return GHCB_MSR_HV_FT_RESP_VAL(val); +} + +static void snp_register_ghcb_early(unsigned long paddr) +{ + unsigned long pfn = paddr >> PAGE_SHIFT; + u64 val; + + sev_es_wr_ghcb_msr(GHCB_MSR_REG_GPA_REQ_VAL(pfn)); + VMGEXIT(); + + val = sev_es_rd_ghcb_msr(); + + /* If the response GPA is not ours then abort the guest */ + if ((GHCB_RESP_CODE(val) != GHCB_MSR_REG_GPA_RESP) || + (GHCB_MSR_REG_GPA_RESP_VAL(val) != pfn)) + sev_es_terminate(SEV_TERM_SET_LINUX, GHCB_TERM_REGISTER); +} + +static bool sev_es_negotiate_protocol(void) +{ + u64 val; + + /* Do the GHCB protocol version negotiation */ + sev_es_wr_ghcb_msr(GHCB_MSR_SEV_INFO_REQ); + VMGEXIT(); + val = sev_es_rd_ghcb_msr(); + + if (GHCB_MSR_INFO(val) != GHCB_MSR_SEV_INFO_RESP) + return false; + + if (GHCB_MSR_PROTO_MAX(val) < GHCB_PROTOCOL_MIN || + GHCB_MSR_PROTO_MIN(val) > GHCB_PROTOCOL_MAX) + return false; + + ghcb_version = min_t(size_t, GHCB_MSR_PROTO_MAX(val), GHCB_PROTOCOL_MAX); + + return true; +} + +static __always_inline void vc_ghcb_invalidate(struct ghcb *ghcb) +{ + ghcb->save.sw_exit_code = 0; + __builtin_memset(ghcb->save.valid_bitmap, 0, sizeof(ghcb->save.valid_bitmap)); +} + +static bool vc_decoding_needed(unsigned long exit_code) +{ + /* Exceptions don't require to decode the instruction */ + return !(exit_code >= SVM_EXIT_EXCP_BASE && + exit_code <= SVM_EXIT_LAST_EXCP); +} + +static enum es_result vc_init_em_ctxt(struct es_em_ctxt *ctxt, + struct pt_regs *regs, + unsigned long exit_code) 
+{ + enum es_result ret = ES_OK; + + memset(ctxt, 0, sizeof(*ctxt)); + ctxt->regs = regs; + + if (vc_decoding_needed(exit_code)) + ret = vc_decode_insn(ctxt); + + return ret; +} + +static void vc_finish_insn(struct es_em_ctxt *ctxt) +{ + ctxt->regs->ip += ctxt->insn.length; +} + +static enum es_result verify_exception_info(struct ghcb *ghcb, struct es_em_ctxt *ctxt) +{ + u32 ret; + + ret = ghcb->save.sw_exit_info_1 & GENMASK_ULL(31, 0); + if (!ret) + return ES_OK; + + if (ret == 1) { + u64 info = ghcb->save.sw_exit_info_2; + unsigned long v = info & SVM_EVTINJ_VEC_MASK; + + /* Check if exception information from hypervisor is sane. */ + if ((info & SVM_EVTINJ_VALID) && + ((v == X86_TRAP_GP) || (v == X86_TRAP_UD)) && + ((info & SVM_EVTINJ_TYPE_MASK) == SVM_EVTINJ_TYPE_EXEPT)) { + ctxt->fi.vector = v; + + if (info & SVM_EVTINJ_VALID_ERR) + ctxt->fi.error_code = info >> 32; + + return ES_EXCEPTION; + } + } + + return ES_VMM_ERROR; +} + +static enum es_result sev_es_ghcb_hv_call(struct ghcb *ghcb, + struct es_em_ctxt *ctxt, + u64 exit_code, u64 exit_info_1, + u64 exit_info_2) +{ + /* Fill in protocol and format specifiers */ + ghcb->protocol_version = ghcb_version; + ghcb->ghcb_usage = GHCB_DEFAULT_USAGE; + + ghcb_set_sw_exit_code(ghcb, exit_code); + ghcb_set_sw_exit_info_1(ghcb, exit_info_1); + ghcb_set_sw_exit_info_2(ghcb, exit_info_2); + + sev_es_wr_ghcb_msr(__pa(ghcb)); + VMGEXIT(); + + return verify_exception_info(ghcb, ctxt); +} + +static int __sev_cpuid_hv(u32 fn, int reg_idx, u32 *reg) +{ + u64 val; + + sev_es_wr_ghcb_msr(GHCB_CPUID_REQ(fn, reg_idx)); + VMGEXIT(); + val = sev_es_rd_ghcb_msr(); + if (GHCB_RESP_CODE(val) != GHCB_MSR_CPUID_RESP) + return -EIO; + + *reg = (val >> 32); + + return 0; +} + +static int sev_cpuid_hv(struct cpuid_leaf *leaf) +{ + int ret; + + /* + * MSR protocol does not support fetching non-zero subfunctions, but is + * sufficient to handle current early-boot cases. 
Should that change, + * make sure to report an error rather than ignoring the index and + * grabbing random values. If this issue arises in the future, handling + * can be added here to use GHCB-page protocol for cases that occur late + * enough in boot that GHCB page is available. + */ + if (cpuid_function_is_indexed(leaf->fn) && leaf->subfn) + return -EINVAL; + + ret = __sev_cpuid_hv(leaf->fn, GHCB_CPUID_REQ_EAX, &leaf->eax); + ret = ret ? : __sev_cpuid_hv(leaf->fn, GHCB_CPUID_REQ_EBX, &leaf->ebx); + ret = ret ? : __sev_cpuid_hv(leaf->fn, GHCB_CPUID_REQ_ECX, &leaf->ecx); + ret = ret ? : __sev_cpuid_hv(leaf->fn, GHCB_CPUID_REQ_EDX, &leaf->edx); + + return ret; +} + +/* + * This may be called early while still running on the initial identity + * mapping. Use RIP-relative addressing to obtain the correct address + * while running with the initial identity mapping as well as the + * switch-over to kernel virtual addresses later. + */ +static const struct snp_cpuid_table *snp_cpuid_get_table(void) +{ + void *ptr; + + asm ("lea cpuid_table_copy(%%rip), %0" + : "=r" (ptr) + : "p" (&cpuid_table_copy)); + + return ptr; +} + +/* + * The SNP Firmware ABI, Revision 0.9, Section 7.1, details the use of + * XCR0_IN and XSS_IN to encode multiple versions of 0xD subfunctions 0 + * and 1 based on the corresponding features enabled by a particular + * combination of XCR0 and XSS registers so that a guest can look up the + * version corresponding to the features currently enabled in its XCR0/XSS + * registers. The only values that differ between these versions/table + * entries is the enabled XSAVE area size advertised via EBX. 
+ * + * While hypervisors may choose to make use of this support, it is more + * robust/secure for a guest to simply find the entry corresponding to the + * base/legacy XSAVE area size (XCR0=1 or XCR0=3), and then calculate the + * XSAVE area size using subfunctions 2 through 64, as documented in APM + * Volume 3, Rev 3.31, Appendix E.3.8, which is what is done here. + * + * Since base/legacy XSAVE area size is documented as 0x240, use that value + * directly rather than relying on the base size in the CPUID table. + * + * Return: XSAVE area size on success, 0 otherwise. + */ +static u32 snp_cpuid_calc_xsave_size(u64 xfeatures_en, bool compacted) +{ + const struct snp_cpuid_table *cpuid_table = snp_cpuid_get_table(); + u64 xfeatures_found = 0; + u32 xsave_size = 0x240; + int i; + + for (i = 0; i < cpuid_table->count; i++) { + const struct snp_cpuid_fn *e = &cpuid_table->fn[i]; + + if (!(e->eax_in == 0xD && e->ecx_in > 1 && e->ecx_in < 64)) + continue; + if (!(xfeatures_en & (BIT_ULL(e->ecx_in)))) + continue; + if (xfeatures_found & (BIT_ULL(e->ecx_in))) + continue; + + xfeatures_found |= (BIT_ULL(e->ecx_in)); + + if (compacted) + xsave_size += e->eax; + else + xsave_size = max(xsave_size, e->eax + e->ebx); + } + + /* + * Either the guest set unsupported XCR0/XSS bits, or the corresponding + * entries in the CPUID table were not present. This is not a valid + * state to be in. 
+ */ + if (xfeatures_found != (xfeatures_en & GENMASK_ULL(63, 2))) + return 0; + + return xsave_size; +} + +static bool +snp_cpuid_get_validated_func(struct cpuid_leaf *leaf) +{ + const struct snp_cpuid_table *cpuid_table = snp_cpuid_get_table(); + int i; + + for (i = 0; i < cpuid_table->count; i++) { + const struct snp_cpuid_fn *e = &cpuid_table->fn[i]; + + if (e->eax_in != leaf->fn) + continue; + + if (cpuid_function_is_indexed(leaf->fn) && e->ecx_in != leaf->subfn) + continue; + + /* + * For 0xD subfunctions 0 and 1, only use the entry corresponding + * to the base/legacy XSAVE area size (XCR0=1 or XCR0=3, XSS=0). + * See the comments above snp_cpuid_calc_xsave_size() for more + * details. + */ + if (e->eax_in == 0xD && (e->ecx_in == 0 || e->ecx_in == 1)) + if (!(e->xcr0_in == 1 || e->xcr0_in == 3) || e->xss_in) + continue; + + leaf->eax = e->eax; + leaf->ebx = e->ebx; + leaf->ecx = e->ecx; + leaf->edx = e->edx; + + return true; + } + + return false; +} + +static void snp_cpuid_hv(struct cpuid_leaf *leaf) +{ + if (sev_cpuid_hv(leaf)) + sev_es_terminate(SEV_TERM_SET_LINUX, GHCB_TERM_CPUID_HV); +} + +static int snp_cpuid_postprocess(struct cpuid_leaf *leaf) +{ + struct cpuid_leaf leaf_hv = *leaf; + + switch (leaf->fn) { + case 0x1: + snp_cpuid_hv(&leaf_hv); + + /* initial APIC ID */ + leaf->ebx = (leaf_hv.ebx & GENMASK(31, 24)) | (leaf->ebx & GENMASK(23, 0)); + /* APIC enabled bit */ + leaf->edx = (leaf_hv.edx & BIT(9)) | (leaf->edx & ~BIT(9)); + + /* OSXSAVE enabled bit */ + if (native_read_cr4() & X86_CR4_OSXSAVE) + leaf->ecx |= BIT(27); + break; + case 0x7: + /* OSPKE enabled bit */ + leaf->ecx &= ~BIT(4); + if (native_read_cr4() & X86_CR4_PKE) + leaf->ecx |= BIT(4); + break; + case 0xB: + leaf_hv.subfn = 0; + snp_cpuid_hv(&leaf_hv); + + /* extended APIC ID */ + leaf->edx = leaf_hv.edx; + break; + case 0xD: { + bool compacted = false; + u64 xcr0 = 1, xss = 0; + u32 xsave_size; + + if (leaf->subfn != 0 && leaf->subfn != 1) + return 0; + + if (native_read_cr4() & 
X86_CR4_OSXSAVE) + xcr0 = xgetbv(XCR_XFEATURE_ENABLED_MASK); + if (leaf->subfn == 1) { + /* Get XSS value if XSAVES is enabled. */ + if (leaf->eax & BIT(3)) { + unsigned long lo, hi; + + asm volatile("rdmsr" : "=a" (lo), "=d" (hi) + : "c" (MSR_IA32_XSS)); + xss = (hi << 32) | lo; + } + + /* + * The PPR and APM aren't clear on what size should be + * encoded in 0xD:0x1:EBX when compaction is not enabled + * by either XSAVEC (feature bit 1) or XSAVES (feature + * bit 3) since SNP-capable hardware has these feature + * bits fixed as 1. KVM sets it to 0 in this case, but + * to avoid this becoming an issue it's safer to simply + * treat this as unsupported for SNP guests. + */ + if (!(leaf->eax & (BIT(1) | BIT(3)))) + return -EINVAL; + + compacted = true; + } + + xsave_size = snp_cpuid_calc_xsave_size(xcr0 | xss, compacted); + if (!xsave_size) + return -EINVAL; + + leaf->ebx = xsave_size; + } + break; + case 0x8000001E: + snp_cpuid_hv(&leaf_hv); + + /* extended APIC ID */ + leaf->eax = leaf_hv.eax; + /* compute ID */ + leaf->ebx = (leaf->ebx & GENMASK(31, 8)) | (leaf_hv.ebx & GENMASK(7, 0)); + /* node ID */ + leaf->ecx = (leaf->ecx & GENMASK(31, 8)) | (leaf_hv.ecx & GENMASK(7, 0)); + break; + default: + /* No fix-ups needed, use values as-is. */ + break; + } + + return 0; +} + +/* + * Returns -EOPNOTSUPP if feature not enabled. Any other non-zero return value + * should be treated as fatal by caller. + */ +static int snp_cpuid(struct cpuid_leaf *leaf) +{ + const struct snp_cpuid_table *cpuid_table = snp_cpuid_get_table(); + + if (!cpuid_table->count) + return -EOPNOTSUPP; + + if (!snp_cpuid_get_validated_func(leaf)) { + /* + * Some hypervisors will avoid keeping track of CPUID entries + * where all values are zero, since they can be handled the + * same as out-of-range values (all-zero). This is useful here + * as well as it allows virtually all guest configurations to + * work using a single SNP CPUID table. 
+ * + * To allow for this, there is a need to distinguish between + * out-of-range entries and in-range zero entries, since the + * CPUID table entries are only a template that may need to be + * augmented with additional values for things like + * CPU-specific information during post-processing. So if it's + * not in the table, set the values to zero. Then, if they are + * within a valid CPUID range, proceed with post-processing + * using zeros as the initial values. Otherwise, skip + * post-processing and just return zeros immediately. + */ + leaf->eax = leaf->ebx = leaf->ecx = leaf->edx = 0; + + /* Skip post-processing for out-of-range zero leafs. */ + if (!(leaf->fn <= cpuid_std_range_max || + (leaf->fn >= 0x40000000 && leaf->fn <= cpuid_hyp_range_max) || + (leaf->fn >= 0x80000000 && leaf->fn <= cpuid_ext_range_max))) + return 0; + } + + return snp_cpuid_postprocess(leaf); +} + +/* + * Boot VC Handler - This is the first VC handler during boot, there is no GHCB + * page yet, so it only supports the MSR based communication with the + * hypervisor and only the CPUID exit-code. + */ +void __init do_vc_no_ghcb(struct pt_regs *regs, unsigned long exit_code) +{ + unsigned int subfn = lower_bits(regs->cx, 32); + unsigned int fn = lower_bits(regs->ax, 32); + struct cpuid_leaf leaf; + int ret; + + /* Only CPUID is supported via MSR protocol */ + if (exit_code != SVM_EXIT_CPUID) + goto fail; + + leaf.fn = fn; + leaf.subfn = subfn; + + ret = snp_cpuid(&leaf); + if (!ret) + goto cpuid_done; + + if (ret != -EOPNOTSUPP) + goto fail; + + if (sev_cpuid_hv(&leaf)) + goto fail; + +cpuid_done: + regs->ax = leaf.eax; + regs->bx = leaf.ebx; + regs->cx = leaf.ecx; + regs->dx = leaf.edx; + + /* + * This is a VC handler and the #VC is only raised when SEV-ES is + * active, which means SEV must be active too. Do sanity checks on the + * CPUID results to make sure the hypervisor does not trick the kernel + * into the no-sev path. 
This could map sensitive data unencrypted and + * make it accessible to the hypervisor. + * + * In particular, check for: + * - Availability of CPUID leaf 0x8000001f + * - SEV CPUID bit. + * + * The hypervisor might still report the wrong C-bit position, but this + * can't be checked here. + */ + + if (fn == 0x80000000 && (regs->ax < 0x8000001f)) + /* SEV leaf check */ + goto fail; + else if ((fn == 0x8000001f && !(regs->ax & BIT(1)))) + /* SEV bit */ + goto fail; + + /* Skip over the CPUID two-byte opcode */ + regs->ip += 2; + + return; + +fail: + /* Terminate the guest */ + sev_es_terminate(SEV_TERM_SET_GEN, GHCB_SEV_ES_GEN_REQ); +} + +static enum es_result vc_insn_string_read(struct es_em_ctxt *ctxt, + void *src, char *buf, + unsigned int data_size, + unsigned int count, + bool backwards) +{ + int i, b = backwards ? -1 : 1; + enum es_result ret = ES_OK; + + for (i = 0; i < count; i++) { + void *s = src + (i * data_size * b); + char *d = buf + (i * data_size); + + ret = vc_read_mem(ctxt, s, d, data_size); + if (ret != ES_OK) + break; + } + + return ret; +} + +static enum es_result vc_insn_string_write(struct es_em_ctxt *ctxt, + void *dst, char *buf, + unsigned int data_size, + unsigned int count, + bool backwards) +{ + int i, s = backwards ? 
-1 : 1; + enum es_result ret = ES_OK; + + for (i = 0; i < count; i++) { + void *d = dst + (i * data_size * s); + char *b = buf + (i * data_size); + + ret = vc_write_mem(ctxt, d, b, data_size); + if (ret != ES_OK) + break; + } + + return ret; +} + +#define IOIO_TYPE_STR BIT(2) +#define IOIO_TYPE_IN 1 +#define IOIO_TYPE_INS (IOIO_TYPE_IN | IOIO_TYPE_STR) +#define IOIO_TYPE_OUT 0 +#define IOIO_TYPE_OUTS (IOIO_TYPE_OUT | IOIO_TYPE_STR) + +#define IOIO_REP BIT(3) + +#define IOIO_ADDR_64 BIT(9) +#define IOIO_ADDR_32 BIT(8) +#define IOIO_ADDR_16 BIT(7) + +#define IOIO_DATA_32 BIT(6) +#define IOIO_DATA_16 BIT(5) +#define IOIO_DATA_8 BIT(4) + +#define IOIO_SEG_ES (0 << 10) +#define IOIO_SEG_DS (3 << 10) + +static enum es_result vc_ioio_exitinfo(struct es_em_ctxt *ctxt, u64 *exitinfo) +{ + struct insn *insn = &ctxt->insn; + *exitinfo = 0; + + switch (insn->opcode.bytes[0]) { + /* INS opcodes */ + case 0x6c: + case 0x6d: + *exitinfo |= IOIO_TYPE_INS; + *exitinfo |= IOIO_SEG_ES; + *exitinfo |= (ctxt->regs->dx & 0xffff) << 16; + break; + + /* OUTS opcodes */ + case 0x6e: + case 0x6f: + *exitinfo |= IOIO_TYPE_OUTS; + *exitinfo |= IOIO_SEG_DS; + *exitinfo |= (ctxt->regs->dx & 0xffff) << 16; + break; + + /* IN immediate opcodes */ + case 0xe4: + case 0xe5: + *exitinfo |= IOIO_TYPE_IN; + *exitinfo |= (u8)insn->immediate.value << 16; + break; + + /* OUT immediate opcodes */ + case 0xe6: + case 0xe7: + *exitinfo |= IOIO_TYPE_OUT; + *exitinfo |= (u8)insn->immediate.value << 16; + break; + + /* IN register opcodes */ + case 0xec: + case 0xed: + *exitinfo |= IOIO_TYPE_IN; + *exitinfo |= (ctxt->regs->dx & 0xffff) << 16; + break; + + /* OUT register opcodes */ + case 0xee: + case 0xef: + *exitinfo |= IOIO_TYPE_OUT; + *exitinfo |= (ctxt->regs->dx & 0xffff) << 16; + break; + + default: + return ES_DECODE_FAILED; + } + + switch (insn->opcode.bytes[0]) { + case 0x6c: + case 0x6e: + case 0xe4: + case 0xe6: + case 0xec: + case 0xee: + /* Single byte opcodes */ + *exitinfo |= IOIO_DATA_8; + 
break; + default: + /* Length determined by instruction parsing */ + *exitinfo |= (insn->opnd_bytes == 2) ? IOIO_DATA_16 + : IOIO_DATA_32; + } + switch (insn->addr_bytes) { + case 2: + *exitinfo |= IOIO_ADDR_16; + break; + case 4: + *exitinfo |= IOIO_ADDR_32; + break; + case 8: + *exitinfo |= IOIO_ADDR_64; + break; + } + + if (insn_has_rep_prefix(insn)) + *exitinfo |= IOIO_REP; + + return ES_OK; +} + +static enum es_result vc_handle_ioio(struct ghcb *ghcb, struct es_em_ctxt *ctxt) +{ + struct pt_regs *regs = ctxt->regs; + u64 exit_info_1, exit_info_2; + enum es_result ret; + + ret = vc_ioio_exitinfo(ctxt, &exit_info_1); + if (ret != ES_OK) + return ret; + + if (exit_info_1 & IOIO_TYPE_STR) { + + /* (REP) INS/OUTS */ + + bool df = ((regs->flags & X86_EFLAGS_DF) == X86_EFLAGS_DF); + unsigned int io_bytes, exit_bytes; + unsigned int ghcb_count, op_count; + unsigned long es_base; + u64 sw_scratch; + + /* + * For the string variants with rep prefix the amount of in/out + * operations per #VC exception is limited so that the kernel + * has a chance to take interrupts and re-schedule while the + * instruction is emulated. + */ + io_bytes = (exit_info_1 >> 4) & 0x7; + ghcb_count = sizeof(ghcb->shared_buffer) / io_bytes; + + op_count = (exit_info_1 & IOIO_REP) ? regs->cx : 1; + exit_info_2 = min(op_count, ghcb_count); + exit_bytes = exit_info_2 * io_bytes; + + es_base = insn_get_seg_base(ctxt->regs, INAT_SEG_REG_ES); + + /* Read bytes of OUTS into the shared buffer */ + if (!(exit_info_1 & IOIO_TYPE_IN)) { + ret = vc_insn_string_read(ctxt, + (void *)(es_base + regs->si), + ghcb->shared_buffer, io_bytes, + exit_info_2, df); + if (ret) + return ret; + } + + /* + * Issue an VMGEXIT to the HV to consume the bytes from the + * shared buffer or to have it write them into the shared buffer + * depending on the instruction: OUTS or INS. 
+ */ + sw_scratch = __pa(ghcb) + offsetof(struct ghcb, shared_buffer); + ghcb_set_sw_scratch(ghcb, sw_scratch); + ret = sev_es_ghcb_hv_call(ghcb, ctxt, SVM_EXIT_IOIO, + exit_info_1, exit_info_2); + if (ret != ES_OK) + return ret; + + /* Read bytes from shared buffer into the guest's destination. */ + if (exit_info_1 & IOIO_TYPE_IN) { + ret = vc_insn_string_write(ctxt, + (void *)(es_base + regs->di), + ghcb->shared_buffer, io_bytes, + exit_info_2, df); + if (ret) + return ret; + + if (df) + regs->di -= exit_bytes; + else + regs->di += exit_bytes; + } else { + if (df) + regs->si -= exit_bytes; + else + regs->si += exit_bytes; + } + + if (exit_info_1 & IOIO_REP) + regs->cx -= exit_info_2; + + ret = regs->cx ? ES_RETRY : ES_OK; + + } else { + + /* IN/OUT into/from rAX */ + + int bits = (exit_info_1 & 0x70) >> 1; + u64 rax = 0; + + if (!(exit_info_1 & IOIO_TYPE_IN)) + rax = lower_bits(regs->ax, bits); + + ghcb_set_rax(ghcb, rax); + + ret = sev_es_ghcb_hv_call(ghcb, ctxt, SVM_EXIT_IOIO, exit_info_1, 0); + if (ret != ES_OK) + return ret; + + if (exit_info_1 & IOIO_TYPE_IN) { + if (!ghcb_rax_is_valid(ghcb)) + return ES_VMM_ERROR; + regs->ax = lower_bits(ghcb->save.rax, bits); + } + } + + return ret; +} + +static int vc_handle_cpuid_snp(struct pt_regs *regs) +{ + struct cpuid_leaf leaf; + int ret; + + leaf.fn = regs->ax; + leaf.subfn = regs->cx; + ret = snp_cpuid(&leaf); + if (!ret) { + regs->ax = leaf.eax; + regs->bx = leaf.ebx; + regs->cx = leaf.ecx; + regs->dx = leaf.edx; + } + + return ret; +} + +static enum es_result vc_handle_cpuid(struct ghcb *ghcb, + struct es_em_ctxt *ctxt) +{ + struct pt_regs *regs = ctxt->regs; + u32 cr4 = native_read_cr4(); + enum es_result ret; + int snp_cpuid_ret; + + snp_cpuid_ret = vc_handle_cpuid_snp(regs); + if (!snp_cpuid_ret) + return ES_OK; + if (snp_cpuid_ret != -EOPNOTSUPP) + return ES_VMM_ERROR; + + ghcb_set_rax(ghcb, regs->ax); + ghcb_set_rcx(ghcb, regs->cx); + + if (cr4 & X86_CR4_OSXSAVE) + /* Safe to read xcr0 */ + 
ghcb_set_xcr0(ghcb, xgetbv(XCR_XFEATURE_ENABLED_MASK)); + else + /* xgetbv will cause #GP - use reset value for xcr0 */ + ghcb_set_xcr0(ghcb, 1); + + ret = sev_es_ghcb_hv_call(ghcb, ctxt, SVM_EXIT_CPUID, 0, 0); + if (ret != ES_OK) + return ret; + + if (!(ghcb_rax_is_valid(ghcb) && + ghcb_rbx_is_valid(ghcb) && + ghcb_rcx_is_valid(ghcb) && + ghcb_rdx_is_valid(ghcb))) + return ES_VMM_ERROR; + + regs->ax = ghcb->save.rax; + regs->bx = ghcb->save.rbx; + regs->cx = ghcb->save.rcx; + regs->dx = ghcb->save.rdx; + + return ES_OK; +} + +static enum es_result vc_handle_rdtsc(struct ghcb *ghcb, + struct es_em_ctxt *ctxt, + unsigned long exit_code) +{ + bool rdtscp = (exit_code == SVM_EXIT_RDTSCP); + enum es_result ret; + + ret = sev_es_ghcb_hv_call(ghcb, ctxt, exit_code, 0, 0); + if (ret != ES_OK) + return ret; + + if (!(ghcb_rax_is_valid(ghcb) && ghcb_rdx_is_valid(ghcb) && + (!rdtscp || ghcb_rcx_is_valid(ghcb)))) + return ES_VMM_ERROR; + + ctxt->regs->ax = ghcb->save.rax; + ctxt->regs->dx = ghcb->save.rdx; + if (rdtscp) + ctxt->regs->cx = ghcb->save.rcx; + + return ES_OK; +} + +struct cc_setup_data { + struct setup_data header; + u32 cc_blob_address; +}; + +/* + * Search for a Confidential Computing blob passed in as a setup_data entry + * via the Linux Boot Protocol. + */ +static struct cc_blob_sev_info *find_cc_blob_setup_data(struct boot_params *bp) +{ + struct cc_setup_data *sd = NULL; + struct setup_data *hdr; + + hdr = (struct setup_data *)bp->hdr.setup_data; + + while (hdr) { + if (hdr->type == SETUP_CC_BLOB) { + sd = (struct cc_setup_data *)hdr; + return (struct cc_blob_sev_info *)(unsigned long)sd->cc_blob_address; + } + hdr = (struct setup_data *)hdr->next; + } + + return NULL; +} + +/* + * Initialize the kernel's copy of the SNP CPUID table, and set up the + * pointer that will be used to access it. 
+ * + * Maintaining a direct mapping of the SNP CPUID table used by firmware would + * be possible as an alternative, but the approach is brittle since the + * mapping needs to be updated in sync with all the changes to virtual memory + * layout and related mapping facilities throughout the boot process. + */ +static void __init setup_cpuid_table(const struct cc_blob_sev_info *cc_info) +{ + const struct snp_cpuid_table *cpuid_table_fw, *cpuid_table; + int i; + + if (!cc_info || !cc_info->cpuid_phys || cc_info->cpuid_len < PAGE_SIZE) + sev_es_terminate(SEV_TERM_SET_LINUX, GHCB_TERM_CPUID); + + cpuid_table_fw = (const struct snp_cpuid_table *)cc_info->cpuid_phys; + if (!cpuid_table_fw->count || cpuid_table_fw->count > SNP_CPUID_COUNT_MAX) + sev_es_terminate(SEV_TERM_SET_LINUX, GHCB_TERM_CPUID); + + cpuid_table = snp_cpuid_get_table(); + memcpy((void *)cpuid_table, cpuid_table_fw, sizeof(*cpuid_table)); + + /* Initialize CPUID ranges for range-checking. */ + for (i = 0; i < cpuid_table->count; i++) { + const struct snp_cpuid_fn *fn = &cpuid_table->fn[i]; + + if (fn->eax_in == 0x0) + cpuid_std_range_max = fn->eax; + else if (fn->eax_in == 0x40000000) + cpuid_hyp_range_max = fn->eax; + else if (fn->eax_in == 0x80000000) + cpuid_ext_range_max = fn->eax; + } +} diff --git a/arch/x86/kernel/sev.c b/arch/x86/kernel/sev.c new file mode 100644 index 000000000000..a428c62330d3 --- /dev/null +++ b/arch/x86/kernel/sev.c @@ -0,0 +1,2262 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * AMD Memory Encryption Support + * + * Copyright (C) 2019 SUSE + * + * Author: Joerg Roedel <jroedel@suse.de> + */ + +#define pr_fmt(fmt) "SEV: " fmt + +#include <linux/sched/debug.h> /* For show_regs() */ +#include <linux/percpu-defs.h> +#include <linux/cc_platform.h> +#include <linux/printk.h> +#include <linux/mm_types.h> +#include <linux/set_memory.h> +#include <linux/memblock.h> +#include <linux/kernel.h> +#include <linux/mm.h> +#include <linux/cpumask.h> +#include <linux/efi.h> +#include 
<linux/platform_device.h> +#include <linux/io.h> + +#include <asm/cpu_entry_area.h> +#include <asm/stacktrace.h> +#include <asm/sev.h> +#include <asm/insn-eval.h> +#include <asm/fpu/xcr.h> +#include <asm/processor.h> +#include <asm/realmode.h> +#include <asm/setup.h> +#include <asm/traps.h> +#include <asm/svm.h> +#include <asm/smp.h> +#include <asm/cpu.h> +#include <asm/apic.h> +#include <asm/cpuid.h> +#include <asm/cmdline.h> + +#define DR7_RESET_VALUE 0x400 + +/* AP INIT values as documented in the APM2 section "Processor Initialization State" */ +#define AP_INIT_CS_LIMIT 0xffff +#define AP_INIT_DS_LIMIT 0xffff +#define AP_INIT_LDTR_LIMIT 0xffff +#define AP_INIT_GDTR_LIMIT 0xffff +#define AP_INIT_IDTR_LIMIT 0xffff +#define AP_INIT_TR_LIMIT 0xffff +#define AP_INIT_RFLAGS_DEFAULT 0x2 +#define AP_INIT_DR6_DEFAULT 0xffff0ff0 +#define AP_INIT_GPAT_DEFAULT 0x0007040600070406ULL +#define AP_INIT_XCR0_DEFAULT 0x1 +#define AP_INIT_X87_FTW_DEFAULT 0x5555 +#define AP_INIT_X87_FCW_DEFAULT 0x0040 +#define AP_INIT_CR0_DEFAULT 0x60000010 +#define AP_INIT_MXCSR_DEFAULT 0x1f80 + +/* For early boot hypervisor communication in SEV-ES enabled guests */ +static struct ghcb boot_ghcb_page __bss_decrypted __aligned(PAGE_SIZE); + +/* + * Needs to be in the .data section because we need it NULL before bss is + * cleared + */ +static struct ghcb *boot_ghcb __section(".data"); + +/* Bitmap of SEV features supported by the hypervisor */ +static u64 sev_hv_features __ro_after_init; + +/* #VC handler runtime per-CPU data */ +struct sev_es_runtime_data { + struct ghcb ghcb_page; + + /* + * Reserve one page per CPU as backup storage for the unencrypted GHCB. + * It is needed when an NMI happens while the #VC handler uses the real + * GHCB, and the NMI handler itself is causing another #VC exception. In + * that case the GHCB content of the first handler needs to be backed up + * and restored. 
+ */ + struct ghcb backup_ghcb; + + /* + * Mark the per-cpu GHCBs as in-use to detect nested #VC exceptions. + * There is no need for it to be atomic, because nothing is written to + * the GHCB between the read and the write of ghcb_active. So it is safe + * to use it when a nested #VC exception happens before the write. + * + * This is necessary for example in the #VC->NMI->#VC case when the NMI + * happens while the first #VC handler uses the GHCB. When the NMI code + * raises a second #VC handler it might overwrite the contents of the + * GHCB written by the first handler. To avoid this the content of the + * GHCB is saved and restored when the GHCB is detected to be in use + * already. + */ + bool ghcb_active; + bool backup_ghcb_active; + + /* + * Cached DR7 value - write it on DR7 writes and return it on reads. + * That value will never make it to the real hardware DR7 as debugging + * is currently unsupported in SEV-ES guests. + */ + unsigned long dr7; +}; + +struct ghcb_state { + struct ghcb *ghcb; +}; + +static DEFINE_PER_CPU(struct sev_es_runtime_data*, runtime_data); +DEFINE_STATIC_KEY_FALSE(sev_es_enable_key); + +static DEFINE_PER_CPU(struct sev_es_save_area *, sev_vmsa); + +struct sev_config { + __u64 debug : 1, + __reserved : 63; +}; + +static struct sev_config sev_cfg __read_mostly; + +static __always_inline bool on_vc_stack(struct pt_regs *regs) +{ + unsigned long sp = regs->sp; + + /* User-mode RSP is not trusted */ + if (user_mode(regs)) + return false; + + /* SYSCALL gap still has user-mode RSP */ + if (ip_within_syscall_gap(regs)) + return false; + + return ((sp >= __this_cpu_ist_bottom_va(VC)) && (sp < __this_cpu_ist_top_va(VC))); +} + +/* + * This function handles the case when an NMI is raised in the #VC + * exception handler entry code, before the #VC handler has switched off + * its IST stack. 
In this case, the IST entry for #VC must be adjusted, + * so that any nested #VC exception will not overwrite the stack + * contents of the interrupted #VC handler. + * + * The IST entry is adjusted unconditionally so that it can also be + * unconditionally adjusted back in __sev_es_ist_exit(). Otherwise a + * nested sev_es_ist_exit() call may adjust back the IST entry too + * early. + * + * The __sev_es_ist_enter() and __sev_es_ist_exit() functions always run + * on the NMI IST stack, as they are only called from NMI handling code + * right now. + */ +void noinstr __sev_es_ist_enter(struct pt_regs *regs) +{ + unsigned long old_ist, new_ist; + + /* Read old IST entry */ + new_ist = old_ist = __this_cpu_read(cpu_tss_rw.x86_tss.ist[IST_INDEX_VC]); + + /* + * If NMI happened while on the #VC IST stack, set the new IST + * value below regs->sp, so that the interrupted stack frame is + * not overwritten by subsequent #VC exceptions. + */ + if (on_vc_stack(regs)) + new_ist = regs->sp; + + /* + * Reserve additional 8 bytes and store old IST value so this + * adjustment can be unrolled in __sev_es_ist_exit(). + */ + new_ist -= sizeof(old_ist); + *(unsigned long *)new_ist = old_ist; + + /* Set new IST entry */ + this_cpu_write(cpu_tss_rw.x86_tss.ist[IST_INDEX_VC], new_ist); +} + +void noinstr __sev_es_ist_exit(void) +{ + unsigned long ist; + + /* Read IST entry */ + ist = __this_cpu_read(cpu_tss_rw.x86_tss.ist[IST_INDEX_VC]); + + if (WARN_ON(ist == __this_cpu_ist_top_va(VC))) + return; + + /* Read back old IST entry and write it to the TSS */ + this_cpu_write(cpu_tss_rw.x86_tss.ist[IST_INDEX_VC], *(unsigned long *)ist); +} + +/* + * Nothing shall interrupt this code path while holding the per-CPU + * GHCB. The backup GHCB is only for NMIs interrupting this path. + * + * Callers must disable local interrupts around it. 
+ */ +static noinstr struct ghcb *__sev_get_ghcb(struct ghcb_state *state) +{ + struct sev_es_runtime_data *data; + struct ghcb *ghcb; + + WARN_ON(!irqs_disabled()); + + data = this_cpu_read(runtime_data); + ghcb = &data->ghcb_page; + + if (unlikely(data->ghcb_active)) { + /* GHCB is already in use - save its contents */ + + if (unlikely(data->backup_ghcb_active)) { + /* + * Backup-GHCB is also already in use. There is no way + * to continue here so just kill the machine. To make + * panic() work, mark GHCBs inactive so that messages + * can be printed out. + */ + data->ghcb_active = false; + data->backup_ghcb_active = false; + + instrumentation_begin(); + panic("Unable to handle #VC exception! GHCB and Backup GHCB are already in use"); + instrumentation_end(); + } + + /* Mark backup_ghcb active before writing to it */ + data->backup_ghcb_active = true; + + state->ghcb = &data->backup_ghcb; + + /* Backup GHCB content */ + *state->ghcb = *ghcb; + } else { + state->ghcb = NULL; + data->ghcb_active = true; + } + + return ghcb; +} + +static inline u64 sev_es_rd_ghcb_msr(void) +{ + return __rdmsr(MSR_AMD64_SEV_ES_GHCB); +} + +static __always_inline void sev_es_wr_ghcb_msr(u64 val) +{ + u32 low, high; + + low = (u32)(val); + high = (u32)(val >> 32); + + native_wrmsr(MSR_AMD64_SEV_ES_GHCB, low, high); +} + +static int vc_fetch_insn_kernel(struct es_em_ctxt *ctxt, + unsigned char *buffer) +{ + return copy_from_kernel_nofault(buffer, (unsigned char *)ctxt->regs->ip, MAX_INSN_SIZE); +} + +static enum es_result __vc_decode_user_insn(struct es_em_ctxt *ctxt) +{ + char buffer[MAX_INSN_SIZE]; + int insn_bytes; + + insn_bytes = insn_fetch_from_user_inatomic(ctxt->regs, buffer); + if (insn_bytes == 0) { + /* Nothing could be copied */ + ctxt->fi.vector = X86_TRAP_PF; + ctxt->fi.error_code = X86_PF_INSTR | X86_PF_USER; + ctxt->fi.cr2 = ctxt->regs->ip; + return ES_EXCEPTION; + } else if (insn_bytes == -EINVAL) { + /* Effective RIP could not be calculated */ + ctxt->fi.vector = 
X86_TRAP_GP; + ctxt->fi.error_code = 0; + ctxt->fi.cr2 = 0; + return ES_EXCEPTION; + } + + if (!insn_decode_from_regs(&ctxt->insn, ctxt->regs, buffer, insn_bytes)) + return ES_DECODE_FAILED; + + if (ctxt->insn.immediate.got) + return ES_OK; + else + return ES_DECODE_FAILED; +} + +static enum es_result __vc_decode_kern_insn(struct es_em_ctxt *ctxt) +{ + char buffer[MAX_INSN_SIZE]; + int res, ret; + + res = vc_fetch_insn_kernel(ctxt, buffer); + if (res) { + ctxt->fi.vector = X86_TRAP_PF; + ctxt->fi.error_code = X86_PF_INSTR; + ctxt->fi.cr2 = ctxt->regs->ip; + return ES_EXCEPTION; + } + + ret = insn_decode(&ctxt->insn, buffer, MAX_INSN_SIZE, INSN_MODE_64); + if (ret < 0) + return ES_DECODE_FAILED; + else + return ES_OK; +} + +static enum es_result vc_decode_insn(struct es_em_ctxt *ctxt) +{ + if (user_mode(ctxt->regs)) + return __vc_decode_user_insn(ctxt); + else + return __vc_decode_kern_insn(ctxt); +} + +static enum es_result vc_write_mem(struct es_em_ctxt *ctxt, + char *dst, char *buf, size_t size) +{ + unsigned long error_code = X86_PF_PROT | X86_PF_WRITE; + + /* + * This function uses __put_user() independent of whether kernel or user + * memory is accessed. This works fine because __put_user() does no + * sanity checks of the pointer being accessed. All that it does is + * to report when the access failed. + * + * Also, this function runs in atomic context, so __put_user() is not + * allowed to sleep. The page-fault handler detects that it is running + * in atomic context and will not try to take mmap_sem and handle the + * fault, so additional pagefault_enable()/disable() calls are not + * needed. + * + * The access can't be done via copy_to_user() here because + * vc_write_mem() must not use string instructions to access unsafe + * memory. The reason is that MOVS is emulated by the #VC handler by + * splitting the move up into a read and a write and taking a nested #VC + * exception on whatever of them is the MMIO access. 
Using string + * instructions here would cause infinite nesting. + */ + switch (size) { + case 1: { + u8 d1; + u8 __user *target = (u8 __user *)dst; + + memcpy(&d1, buf, 1); + if (__put_user(d1, target)) + goto fault; + break; + } + case 2: { + u16 d2; + u16 __user *target = (u16 __user *)dst; + + memcpy(&d2, buf, 2); + if (__put_user(d2, target)) + goto fault; + break; + } + case 4: { + u32 d4; + u32 __user *target = (u32 __user *)dst; + + memcpy(&d4, buf, 4); + if (__put_user(d4, target)) + goto fault; + break; + } + case 8: { + u64 d8; + u64 __user *target = (u64 __user *)dst; + + memcpy(&d8, buf, 8); + if (__put_user(d8, target)) + goto fault; + break; + } + default: + WARN_ONCE(1, "%s: Invalid size: %zu\n", __func__, size); + return ES_UNSUPPORTED; + } + + return ES_OK; + +fault: + if (user_mode(ctxt->regs)) + error_code |= X86_PF_USER; + + ctxt->fi.vector = X86_TRAP_PF; + ctxt->fi.error_code = error_code; + ctxt->fi.cr2 = (unsigned long)dst; + + return ES_EXCEPTION; +} + +static enum es_result vc_read_mem(struct es_em_ctxt *ctxt, + char *src, char *buf, size_t size) +{ + unsigned long error_code = X86_PF_PROT; + + /* + * This function uses __get_user() independent of whether kernel or user + * memory is accessed. This works fine because __get_user() does no + * sanity checks of the pointer being accessed. All that it does is + * to report when the access failed. + * + * Also, this function runs in atomic context, so __get_user() is not + * allowed to sleep. The page-fault handler detects that it is running + * in atomic context and will not try to take mmap_sem and handle the + * fault, so additional pagefault_enable()/disable() calls are not + * needed. + * + * The access can't be done via copy_from_user() here because + * vc_read_mem() must not use string instructions to access unsafe + * memory. 
The reason is that MOVS is emulated by the #VC handler by + * splitting the move up into a read and a write and taking a nested #VC + * exception on whatever of them is the MMIO access. Using string + * instructions here would cause infinite nesting. + */ + switch (size) { + case 1: { + u8 d1; + u8 __user *s = (u8 __user *)src; + + if (__get_user(d1, s)) + goto fault; + memcpy(buf, &d1, 1); + break; + } + case 2: { + u16 d2; + u16 __user *s = (u16 __user *)src; + + if (__get_user(d2, s)) + goto fault; + memcpy(buf, &d2, 2); + break; + } + case 4: { + u32 d4; + u32 __user *s = (u32 __user *)src; + + if (__get_user(d4, s)) + goto fault; + memcpy(buf, &d4, 4); + break; + } + case 8: { + u64 d8; + u64 __user *s = (u64 __user *)src; + if (__get_user(d8, s)) + goto fault; + memcpy(buf, &d8, 8); + break; + } + default: + WARN_ONCE(1, "%s: Invalid size: %zu\n", __func__, size); + return ES_UNSUPPORTED; + } + + return ES_OK; + +fault: + if (user_mode(ctxt->regs)) + error_code |= X86_PF_USER; + + ctxt->fi.vector = X86_TRAP_PF; + ctxt->fi.error_code = error_code; + ctxt->fi.cr2 = (unsigned long)src; + + return ES_EXCEPTION; +} + +static enum es_result vc_slow_virt_to_phys(struct ghcb *ghcb, struct es_em_ctxt *ctxt, + unsigned long vaddr, phys_addr_t *paddr) +{ + unsigned long va = (unsigned long)vaddr; + unsigned int level; + phys_addr_t pa; + pgd_t *pgd; + pte_t *pte; + + pgd = __va(read_cr3_pa()); + pgd = &pgd[pgd_index(va)]; + pte = lookup_address_in_pgd(pgd, va, &level); + if (!pte) { + ctxt->fi.vector = X86_TRAP_PF; + ctxt->fi.cr2 = vaddr; + ctxt->fi.error_code = 0; + + if (user_mode(ctxt->regs)) + ctxt->fi.error_code |= X86_PF_USER; + + return ES_EXCEPTION; + } + + if (WARN_ON_ONCE(pte_val(*pte) & _PAGE_ENC)) + /* Emulated MMIO to/from encrypted memory not supported */ + return ES_UNSUPPORTED; + + pa = (phys_addr_t)pte_pfn(*pte) << PAGE_SHIFT; + pa |= va & ~page_level_mask(level); + + *paddr = pa; + + return ES_OK; +} + +/* Include code shared with pre-decompression 
boot stage */ +#include "sev-shared.c" + +static noinstr void __sev_put_ghcb(struct ghcb_state *state) +{ + struct sev_es_runtime_data *data; + struct ghcb *ghcb; + + WARN_ON(!irqs_disabled()); + + data = this_cpu_read(runtime_data); + ghcb = &data->ghcb_page; + + if (state->ghcb) { + /* Restore GHCB from Backup */ + *ghcb = *state->ghcb; + data->backup_ghcb_active = false; + state->ghcb = NULL; + } else { + /* + * Invalidate the GHCB so a VMGEXIT instruction issued + * from userspace won't appear to be valid. + */ + vc_ghcb_invalidate(ghcb); + data->ghcb_active = false; + } +} + +void noinstr __sev_es_nmi_complete(void) +{ + struct ghcb_state state; + struct ghcb *ghcb; + + ghcb = __sev_get_ghcb(&state); + + vc_ghcb_invalidate(ghcb); + ghcb_set_sw_exit_code(ghcb, SVM_VMGEXIT_NMI_COMPLETE); + ghcb_set_sw_exit_info_1(ghcb, 0); + ghcb_set_sw_exit_info_2(ghcb, 0); + + sev_es_wr_ghcb_msr(__pa_nodebug(ghcb)); + VMGEXIT(); + + __sev_put_ghcb(&state); +} + +static u64 __init get_secrets_page(void) +{ + u64 pa_data = boot_params.cc_blob_address; + struct cc_blob_sev_info info; + void *map; + + /* + * The CC blob contains the address of the secrets page, check if the + * blob is present. 
+ */ + if (!pa_data) + return 0; + + map = early_memremap(pa_data, sizeof(info)); + if (!map) { + pr_err("Unable to locate SNP secrets page: failed to map the Confidential Computing blob.\n"); + return 0; + } + memcpy(&info, map, sizeof(info)); + early_memunmap(map, sizeof(info)); + + /* smoke-test the secrets page passed */ + if (!info.secrets_phys || info.secrets_len != PAGE_SIZE) + return 0; + + return info.secrets_phys; +} + +static u64 __init get_snp_jump_table_addr(void) +{ + struct snp_secrets_page_layout *layout; + void __iomem *mem; + u64 pa, addr; + + pa = get_secrets_page(); + if (!pa) + return 0; + + mem = ioremap_encrypted(pa, PAGE_SIZE); + if (!mem) { + pr_err("Unable to locate AP jump table address: failed to map the SNP secrets page.\n"); + return 0; + } + + layout = (__force struct snp_secrets_page_layout *)mem; + + addr = layout->os_area.ap_jump_table_pa; + iounmap(mem); + + return addr; +} + +static u64 __init get_jump_table_addr(void) +{ + struct ghcb_state state; + unsigned long flags; + struct ghcb *ghcb; + u64 ret = 0; + + if (cc_platform_has(CC_ATTR_GUEST_SEV_SNP)) + return get_snp_jump_table_addr(); + + local_irq_save(flags); + + ghcb = __sev_get_ghcb(&state); + + vc_ghcb_invalidate(ghcb); + ghcb_set_sw_exit_code(ghcb, SVM_VMGEXIT_AP_JUMP_TABLE); + ghcb_set_sw_exit_info_1(ghcb, SVM_VMGEXIT_GET_AP_JUMP_TABLE); + ghcb_set_sw_exit_info_2(ghcb, 0); + + sev_es_wr_ghcb_msr(__pa(ghcb)); + VMGEXIT(); + + if (ghcb_sw_exit_info_1_is_valid(ghcb) && + ghcb_sw_exit_info_2_is_valid(ghcb)) + ret = ghcb->save.sw_exit_info_2; + + __sev_put_ghcb(&state); + + local_irq_restore(flags); + + return ret; +} + +static void pvalidate_pages(unsigned long vaddr, unsigned int npages, bool validate) +{ + unsigned long vaddr_end; + int rc; + + vaddr = vaddr & PAGE_MASK; + vaddr_end = vaddr + (npages << PAGE_SHIFT); + + while (vaddr < vaddr_end) { + rc = pvalidate(vaddr, RMP_PG_SIZE_4K, validate); + if (WARN(rc, "Failed to validate address 0x%lx ret %d", vaddr, rc)) + 
sev_es_terminate(SEV_TERM_SET_LINUX, GHCB_TERM_PVALIDATE); + + vaddr = vaddr + PAGE_SIZE; + } +} + +static void __init early_set_pages_state(unsigned long paddr, unsigned int npages, enum psc_op op) +{ + unsigned long paddr_end; + u64 val; + + paddr = paddr & PAGE_MASK; + paddr_end = paddr + (npages << PAGE_SHIFT); + + while (paddr < paddr_end) { + /* + * Use the MSR protocol because this function can be called before + * the GHCB is established. + */ + sev_es_wr_ghcb_msr(GHCB_MSR_PSC_REQ_GFN(paddr >> PAGE_SHIFT, op)); + VMGEXIT(); + + val = sev_es_rd_ghcb_msr(); + + if (WARN(GHCB_RESP_CODE(val) != GHCB_MSR_PSC_RESP, + "Wrong PSC response code: 0x%x\n", + (unsigned int)GHCB_RESP_CODE(val))) + goto e_term; + + if (WARN(GHCB_MSR_PSC_RESP_VAL(val), + "Failed to change page state to '%s' paddr 0x%lx error 0x%llx\n", + op == SNP_PAGE_STATE_PRIVATE ? "private" : "shared", + paddr, GHCB_MSR_PSC_RESP_VAL(val))) + goto e_term; + + paddr = paddr + PAGE_SIZE; + } + + return; + +e_term: + sev_es_terminate(SEV_TERM_SET_LINUX, GHCB_TERM_PSC); +} + +void __init early_snp_set_memory_private(unsigned long vaddr, unsigned long paddr, + unsigned int npages) +{ + /* + * This can be invoked in early boot while running identity mapped, so + * use an open coded check for SNP instead of using cc_platform_has(). + * This eliminates worries about jump tables or checking boot_cpu_data + * in the cc_platform_has() function. + */ + if (!(sev_status & MSR_AMD64_SEV_SNP_ENABLED)) + return; + + /* + * Ask the hypervisor to mark the memory pages as private in the RMP + * table. + */ + early_set_pages_state(paddr, npages, SNP_PAGE_STATE_PRIVATE); + + /* Validate the memory pages after they've been added in the RMP table. 
*/ + pvalidate_pages(vaddr, npages, true); +} + +void __init early_snp_set_memory_shared(unsigned long vaddr, unsigned long paddr, + unsigned int npages) +{ + /* + * This can be invoked in early boot while running identity mapped, so + * use an open coded check for SNP instead of using cc_platform_has(). + * This eliminates worries about jump tables or checking boot_cpu_data + * in the cc_platform_has() function. + */ + if (!(sev_status & MSR_AMD64_SEV_SNP_ENABLED)) + return; + + /* Invalidate the memory pages before they are marked shared in the RMP table. */ + pvalidate_pages(vaddr, npages, false); + + /* Ask hypervisor to mark the memory pages shared in the RMP table. */ + early_set_pages_state(paddr, npages, SNP_PAGE_STATE_SHARED); +} + +void __init snp_prep_memory(unsigned long paddr, unsigned int sz, enum psc_op op) +{ + unsigned long vaddr, npages; + + vaddr = (unsigned long)__va(paddr); + npages = PAGE_ALIGN(sz) >> PAGE_SHIFT; + + if (op == SNP_PAGE_STATE_PRIVATE) + early_snp_set_memory_private(vaddr, paddr, npages); + else if (op == SNP_PAGE_STATE_SHARED) + early_snp_set_memory_shared(vaddr, paddr, npages); + else + WARN(1, "invalid memory op %d\n", op); +} + +static int vmgexit_psc(struct snp_psc_desc *desc) +{ + int cur_entry, end_entry, ret = 0; + struct snp_psc_desc *data; + struct ghcb_state state; + struct es_em_ctxt ctxt; + unsigned long flags; + struct ghcb *ghcb; + + /* + * __sev_get_ghcb() needs to run with IRQs disabled because it is using + * a per-CPU GHCB. + */ + local_irq_save(flags); + + ghcb = __sev_get_ghcb(&state); + if (!ghcb) { + ret = 1; + goto out_unlock; + } + + /* Copy the input desc into GHCB shared buffer */ + data = (struct snp_psc_desc *)ghcb->shared_buffer; + memcpy(ghcb->shared_buffer, desc, min_t(int, GHCB_SHARED_BUF_SIZE, sizeof(*desc))); + + /* + * As per the GHCB specification, the hypervisor can resume the guest + * before processing all the entries. Check whether all the entries + * are processed. 
If not, then keep retrying. Note, the hypervisor + * will update the data memory directly to indicate the status, so + * reference the data->hdr everywhere. + * + * The strategy here is to wait for the hypervisor to change the page + * state in the RMP table before guest accesses the memory pages. If the + * page state change was not successful, then later memory access will + * result in a crash. + */ + cur_entry = data->hdr.cur_entry; + end_entry = data->hdr.end_entry; + + while (data->hdr.cur_entry <= data->hdr.end_entry) { + ghcb_set_sw_scratch(ghcb, (u64)__pa(data)); + + /* This will advance the shared buffer data points to. */ + ret = sev_es_ghcb_hv_call(ghcb, &ctxt, SVM_VMGEXIT_PSC, 0, 0); + + /* + * Page State Change VMGEXIT can pass error code through + * exit_info_2. + */ + if (WARN(ret || ghcb->save.sw_exit_info_2, + "SNP: PSC failed ret=%d exit_info_2=%llx\n", + ret, ghcb->save.sw_exit_info_2)) { + ret = 1; + goto out; + } + + /* Verify that reserved bit is not set */ + if (WARN(data->hdr.reserved, "Reserved bit is set in the PSC header\n")) { + ret = 1; + goto out; + } + + /* + * Sanity check that entry processing is not going backwards. + * This will happen only if hypervisor is tricking us. 
+ */ + if (WARN(data->hdr.end_entry > end_entry || cur_entry > data->hdr.cur_entry, +"SNP: PSC processing going backward, end_entry %d (got %d) cur_entry %d (got %d)\n", + end_entry, data->hdr.end_entry, cur_entry, data->hdr.cur_entry)) { + ret = 1; + goto out; + } + } + +out: + __sev_put_ghcb(&state); + +out_unlock: + local_irq_restore(flags); + + return ret; +} + +static void __set_pages_state(struct snp_psc_desc *data, unsigned long vaddr, + unsigned long vaddr_end, int op) +{ + struct psc_hdr *hdr; + struct psc_entry *e; + unsigned long pfn; + int i; + + hdr = &data->hdr; + e = data->entries; + + memset(data, 0, sizeof(*data)); + i = 0; + + while (vaddr < vaddr_end) { + if (is_vmalloc_addr((void *)vaddr)) + pfn = vmalloc_to_pfn((void *)vaddr); + else + pfn = __pa(vaddr) >> PAGE_SHIFT; + + e->gfn = pfn; + e->operation = op; + hdr->end_entry = i; + + /* + * Current SNP implementation doesn't keep track of the RMP page + * size so use 4K for simplicity. + */ + e->pagesize = RMP_PG_SIZE_4K; + + vaddr = vaddr + PAGE_SIZE; + e++; + i++; + } + + if (vmgexit_psc(data)) + sev_es_terminate(SEV_TERM_SET_LINUX, GHCB_TERM_PSC); +} + +static void set_pages_state(unsigned long vaddr, unsigned int npages, int op) +{ + unsigned long vaddr_end, next_vaddr; + struct snp_psc_desc *desc; + + desc = kmalloc(sizeof(*desc), GFP_KERNEL_ACCOUNT); + if (!desc) + panic("SNP: failed to allocate memory for PSC descriptor\n"); + + vaddr = vaddr & PAGE_MASK; + vaddr_end = vaddr + (npages << PAGE_SHIFT); + + while (vaddr < vaddr_end) { + /* Calculate the last vaddr that fits in one struct snp_psc_desc. 
*/ + next_vaddr = min_t(unsigned long, vaddr_end, + (VMGEXIT_PSC_MAX_ENTRY * PAGE_SIZE) + vaddr); + + __set_pages_state(desc, vaddr, next_vaddr, op); + + vaddr = next_vaddr; + } + + kfree(desc); +} + +void snp_set_memory_shared(unsigned long vaddr, unsigned int npages) +{ + if (!cc_platform_has(CC_ATTR_GUEST_SEV_SNP)) + return; + + pvalidate_pages(vaddr, npages, false); + + set_pages_state(vaddr, npages, SNP_PAGE_STATE_SHARED); +} + +void snp_set_memory_private(unsigned long vaddr, unsigned int npages) +{ + if (!cc_platform_has(CC_ATTR_GUEST_SEV_SNP)) + return; + + set_pages_state(vaddr, npages, SNP_PAGE_STATE_PRIVATE); + + pvalidate_pages(vaddr, npages, true); +} + +static int snp_set_vmsa(void *va, bool vmsa) +{ + u64 attrs; + + /* + * Running at VMPL0 allows the kernel to change the VMSA bit for a page + * using the RMPADJUST instruction. However, for the instruction to + * succeed it must target the permissions of a lesser privileged + * (higher numbered) VMPL level, so use VMPL1 (refer to the RMPADJUST + * instruction in the AMD64 APM Volume 3). + */ + attrs = 1; + if (vmsa) + attrs |= RMPADJUST_VMSA_PAGE_BIT; + + return rmpadjust((unsigned long)va, RMP_PG_SIZE_4K, attrs); +} + +#define __ATTR_BASE (SVM_SELECTOR_P_MASK | SVM_SELECTOR_S_MASK) +#define INIT_CS_ATTRIBS (__ATTR_BASE | SVM_SELECTOR_READ_MASK | SVM_SELECTOR_CODE_MASK) +#define INIT_DS_ATTRIBS (__ATTR_BASE | SVM_SELECTOR_WRITE_MASK) + +#define INIT_LDTR_ATTRIBS (SVM_SELECTOR_P_MASK | 2) +#define INIT_TR_ATTRIBS (SVM_SELECTOR_P_MASK | 3) + +static void *snp_alloc_vmsa_page(void) +{ + struct page *p; + + /* + * Allocate VMSA page to work around the SNP erratum where the CPU will + * incorrectly signal an RMP violation #PF if a large page (2MB or 1GB) + * collides with the RMP entry of VMSA page. The recommended workaround + * is to not use a large page. + * + * Allocate an 8k page which is also 8k-aligned. 
+ */ + p = alloc_pages(GFP_KERNEL_ACCOUNT | __GFP_ZERO, 1); + if (!p) + return NULL; + + split_page(p, 1); + + /* Free the first 4k. This page may be 2M/1G aligned and cannot be used. */ + __free_page(p); + + return page_address(p + 1); +} + +static void snp_cleanup_vmsa(struct sev_es_save_area *vmsa) +{ + int err; + + err = snp_set_vmsa(vmsa, false); + if (err) + pr_err("clear VMSA page failed (%u), leaking page\n", err); + else + free_page((unsigned long)vmsa); +} + +static int wakeup_cpu_via_vmgexit(int apic_id, unsigned long start_ip) +{ + struct sev_es_save_area *cur_vmsa, *vmsa; + struct ghcb_state state; + unsigned long flags; + struct ghcb *ghcb; + u8 sipi_vector; + int cpu, ret; + u64 cr4; + + /* + * The hypervisor SNP feature support check has happened earlier, just check + * the AP_CREATION one here. + */ + if (!(sev_hv_features & GHCB_HV_FT_SNP_AP_CREATION)) + return -EOPNOTSUPP; + + /* + * Verify the desired start IP against the known trampoline start IP + * to catch any future new trampolines that may be introduced that + * would require a new protected guest entry point. + */ + if (WARN_ONCE(start_ip != real_mode_header->trampoline_start, + "Unsupported SNP start_ip: %lx\n", start_ip)) + return -EINVAL; + + /* Override start_ip with known protected guest start IP */ + start_ip = real_mode_header->sev_es_trampoline_start; + + /* Find the logical CPU for the APIC ID */ + for_each_present_cpu(cpu) { + if (arch_match_cpu_phys_id(cpu, apic_id)) + break; + } + if (cpu >= nr_cpu_ids) + return -EINVAL; + + cur_vmsa = per_cpu(sev_vmsa, cpu); + + /* + * A new VMSA is created each time because there is no guarantee that + * the current VMSA is the kernels or that the vCPU is not running. If + * an attempt was done to use the current VMSA with a running vCPU, a + * #VMEXIT of that vCPU would wipe out all of the settings being done + * here. 
+ */ + vmsa = (struct sev_es_save_area *)snp_alloc_vmsa_page(); + if (!vmsa) + return -ENOMEM; + + /* CR4 should maintain the MCE value */ + cr4 = native_read_cr4() & X86_CR4_MCE; + + /* Set the CS value based on the start_ip converted to a SIPI vector */ + sipi_vector = (start_ip >> 12); + vmsa->cs.base = sipi_vector << 12; + vmsa->cs.limit = AP_INIT_CS_LIMIT; + vmsa->cs.attrib = INIT_CS_ATTRIBS; + vmsa->cs.selector = sipi_vector << 8; + + /* Set the RIP value based on start_ip */ + vmsa->rip = start_ip & 0xfff; + + /* Set AP INIT defaults as documented in the APM */ + vmsa->ds.limit = AP_INIT_DS_LIMIT; + vmsa->ds.attrib = INIT_DS_ATTRIBS; + vmsa->es = vmsa->ds; + vmsa->fs = vmsa->ds; + vmsa->gs = vmsa->ds; + vmsa->ss = vmsa->ds; + + vmsa->gdtr.limit = AP_INIT_GDTR_LIMIT; + vmsa->ldtr.limit = AP_INIT_LDTR_LIMIT; + vmsa->ldtr.attrib = INIT_LDTR_ATTRIBS; + vmsa->idtr.limit = AP_INIT_IDTR_LIMIT; + vmsa->tr.limit = AP_INIT_TR_LIMIT; + vmsa->tr.attrib = INIT_TR_ATTRIBS; + + vmsa->cr4 = cr4; + vmsa->cr0 = AP_INIT_CR0_DEFAULT; + vmsa->dr7 = DR7_RESET_VALUE; + vmsa->dr6 = AP_INIT_DR6_DEFAULT; + vmsa->rflags = AP_INIT_RFLAGS_DEFAULT; + vmsa->g_pat = AP_INIT_GPAT_DEFAULT; + vmsa->xcr0 = AP_INIT_XCR0_DEFAULT; + vmsa->mxcsr = AP_INIT_MXCSR_DEFAULT; + vmsa->x87_ftw = AP_INIT_X87_FTW_DEFAULT; + vmsa->x87_fcw = AP_INIT_X87_FCW_DEFAULT; + + /* SVME must be set. 
*/ + vmsa->efer = EFER_SVME; + + /* + * Set the SNP-specific fields for this VMSA: + * VMPL level + * SEV_FEATURES (matches the SEV STATUS MSR right shifted 2 bits) + */ + vmsa->vmpl = 0; + vmsa->sev_features = sev_status >> 2; + + /* Switch the page over to a VMSA page now that it is initialized */ + ret = snp_set_vmsa(vmsa, true); + if (ret) { + pr_err("set VMSA page failed (%u)\n", ret); + free_page((unsigned long)vmsa); + + return -EINVAL; + } + + /* Issue VMGEXIT AP Creation NAE event */ + local_irq_save(flags); + + ghcb = __sev_get_ghcb(&state); + + vc_ghcb_invalidate(ghcb); + ghcb_set_rax(ghcb, vmsa->sev_features); + ghcb_set_sw_exit_code(ghcb, SVM_VMGEXIT_AP_CREATION); + ghcb_set_sw_exit_info_1(ghcb, ((u64)apic_id << 32) | SVM_VMGEXIT_AP_CREATE); + ghcb_set_sw_exit_info_2(ghcb, __pa(vmsa)); + + sev_es_wr_ghcb_msr(__pa(ghcb)); + VMGEXIT(); + + if (!ghcb_sw_exit_info_1_is_valid(ghcb) || + lower_32_bits(ghcb->save.sw_exit_info_1)) { + pr_err("SNP AP Creation error\n"); + ret = -EINVAL; + } + + __sev_put_ghcb(&state); + + local_irq_restore(flags); + + /* Perform cleanup if there was an error */ + if (ret) { + snp_cleanup_vmsa(vmsa); + vmsa = NULL; + } + + /* Free up any previous VMSA page */ + if (cur_vmsa) + snp_cleanup_vmsa(cur_vmsa); + + /* Record the current VMSA page */ + per_cpu(sev_vmsa, cpu) = vmsa; + + return ret; +} + +void snp_set_wakeup_secondary_cpu(void) +{ + if (!cc_platform_has(CC_ATTR_GUEST_SEV_SNP)) + return; + + /* + * Always set this override if SNP is enabled. This makes it the + * required method to start APs under SNP. If the hypervisor does + * not support AP creation, then no APs will be started. 
+ */ + apic->wakeup_secondary_cpu = wakeup_cpu_via_vmgexit; +} + +int __init sev_es_setup_ap_jump_table(struct real_mode_header *rmh) +{ + u16 startup_cs, startup_ip; + phys_addr_t jump_table_pa; + u64 jump_table_addr; + u16 __iomem *jump_table; + + jump_table_addr = get_jump_table_addr(); + + /* On UP guests there is no jump table so this is not a failure */ + if (!jump_table_addr) + return 0; + + /* Check if AP Jump Table is page-aligned */ + if (jump_table_addr & ~PAGE_MASK) + return -EINVAL; + + jump_table_pa = jump_table_addr & PAGE_MASK; + + startup_cs = (u16)(rmh->trampoline_start >> 4); + startup_ip = (u16)(rmh->sev_es_trampoline_start - + rmh->trampoline_start); + + jump_table = ioremap_encrypted(jump_table_pa, PAGE_SIZE); + if (!jump_table) + return -EIO; + + writew(startup_ip, &jump_table[0]); + writew(startup_cs, &jump_table[1]); + + iounmap(jump_table); + + return 0; +} + +/* + * This is needed by the OVMF UEFI firmware which will use whatever it finds in + * the GHCB MSR as its GHCB to talk to the hypervisor. So make sure the per-cpu + * runtime GHCBs used by the kernel are also mapped in the EFI page-table. + */ +int __init sev_es_efi_map_ghcbs(pgd_t *pgd) +{ + struct sev_es_runtime_data *data; + unsigned long address, pflags; + int cpu; + u64 pfn; + + if (!cc_platform_has(CC_ATTR_GUEST_STATE_ENCRYPT)) + return 0; + + pflags = _PAGE_NX | _PAGE_RW; + + for_each_possible_cpu(cpu) { + data = per_cpu(runtime_data, cpu); + + address = __pa(&data->ghcb_page); + pfn = address >> PAGE_SHIFT; + + if (kernel_map_pages_in_pgd(pgd, pfn, address, 1, pflags)) + return 1; + } + + return 0; +} + +static enum es_result vc_handle_msr(struct ghcb *ghcb, struct es_em_ctxt *ctxt) +{ + struct pt_regs *regs = ctxt->regs; + enum es_result ret; + u64 exit_info_1; + + /* Is it a WRMSR? */ + exit_info_1 = (ctxt->insn.opcode.bytes[1] == 0x30) ? 
1 : 0; + + ghcb_set_rcx(ghcb, regs->cx); + if (exit_info_1) { + ghcb_set_rax(ghcb, regs->ax); + ghcb_set_rdx(ghcb, regs->dx); + } + + ret = sev_es_ghcb_hv_call(ghcb, ctxt, SVM_EXIT_MSR, exit_info_1, 0); + + if ((ret == ES_OK) && (!exit_info_1)) { + regs->ax = ghcb->save.rax; + regs->dx = ghcb->save.rdx; + } + + return ret; +} + +static void snp_register_per_cpu_ghcb(void) +{ + struct sev_es_runtime_data *data; + struct ghcb *ghcb; + + data = this_cpu_read(runtime_data); + ghcb = &data->ghcb_page; + + snp_register_ghcb_early(__pa(ghcb)); +} + +void setup_ghcb(void) +{ + if (!cc_platform_has(CC_ATTR_GUEST_STATE_ENCRYPT)) + return; + + /* First make sure the hypervisor talks a supported protocol. */ + if (!sev_es_negotiate_protocol()) + sev_es_terminate(SEV_TERM_SET_GEN, GHCB_SEV_ES_GEN_REQ); + + /* + * Check whether the runtime #VC exception handler is active. It uses + * the per-CPU GHCB page which is set up by sev_es_init_vc_handling(). + * + * If SNP is active, register the per-CPU GHCB page so that the runtime + * exception handler can use it. + */ + if (initial_vc_handler == (unsigned long)kernel_exc_vmm_communication) { + if (cc_platform_has(CC_ATTR_GUEST_SEV_SNP)) + snp_register_per_cpu_ghcb(); + + return; + } + + /* + * Clear the boot_ghcb. The first exception comes in before the bss + * section is cleared. + */ + memset(&boot_ghcb_page, 0, PAGE_SIZE); + + /* Alright - Make the boot-ghcb public */ + boot_ghcb = &boot_ghcb_page; + + /* SNP guest requires that GHCB GPA must be registered. 
*/ + if (cc_platform_has(CC_ATTR_GUEST_SEV_SNP)) + snp_register_ghcb_early(__pa(&boot_ghcb_page)); +} + +#ifdef CONFIG_HOTPLUG_CPU +static void sev_es_ap_hlt_loop(void) +{ + struct ghcb_state state; + struct ghcb *ghcb; + + ghcb = __sev_get_ghcb(&state); + + while (true) { + vc_ghcb_invalidate(ghcb); + ghcb_set_sw_exit_code(ghcb, SVM_VMGEXIT_AP_HLT_LOOP); + ghcb_set_sw_exit_info_1(ghcb, 0); + ghcb_set_sw_exit_info_2(ghcb, 0); + + sev_es_wr_ghcb_msr(__pa(ghcb)); + VMGEXIT(); + + /* Wakeup signal? */ + if (ghcb_sw_exit_info_2_is_valid(ghcb) && + ghcb->save.sw_exit_info_2) + break; + } + + __sev_put_ghcb(&state); +} + +/* + * Play_dead handler when running under SEV-ES. This is needed because + * the hypervisor can't deliver an SIPI request to restart the AP. + * Instead the kernel has to issue a VMGEXIT to halt the VCPU until the + * hypervisor wakes it up again. + */ +static void sev_es_play_dead(void) +{ + play_dead_common(); + + /* IRQs now disabled */ + + sev_es_ap_hlt_loop(); + + /* + * If we get here, the VCPU was woken up again. Jump to CPU + * startup code to get it back online. 
+ */ + start_cpu0(); +} +#else /* CONFIG_HOTPLUG_CPU */ +#define sev_es_play_dead native_play_dead +#endif /* CONFIG_HOTPLUG_CPU */ + +#ifdef CONFIG_SMP +static void __init sev_es_setup_play_dead(void) +{ + smp_ops.play_dead = sev_es_play_dead; +} +#else +static inline void sev_es_setup_play_dead(void) { } +#endif + +static void __init alloc_runtime_data(int cpu) +{ + struct sev_es_runtime_data *data; + + data = memblock_alloc(sizeof(*data), PAGE_SIZE); + if (!data) + panic("Can't allocate SEV-ES runtime data"); + + per_cpu(runtime_data, cpu) = data; +} + +static void __init init_ghcb(int cpu) +{ + struct sev_es_runtime_data *data; + int err; + + data = per_cpu(runtime_data, cpu); + + err = early_set_memory_decrypted((unsigned long)&data->ghcb_page, + sizeof(data->ghcb_page)); + if (err) + panic("Can't map GHCBs unencrypted"); + + memset(&data->ghcb_page, 0, sizeof(data->ghcb_page)); + + data->ghcb_active = false; + data->backup_ghcb_active = false; +} + +void __init sev_es_init_vc_handling(void) +{ + int cpu; + + BUILD_BUG_ON(offsetof(struct sev_es_runtime_data, ghcb_page) % PAGE_SIZE); + + if (!cc_platform_has(CC_ATTR_GUEST_STATE_ENCRYPT)) + return; + + if (!sev_es_check_cpu_features()) + panic("SEV-ES CPU Features missing"); + + /* + * SNP is supported in v2 of the GHCB spec which mandates support for HV + * features. 
+ */ + if (cc_platform_has(CC_ATTR_GUEST_SEV_SNP)) { + sev_hv_features = get_hv_features(); + + if (!(sev_hv_features & GHCB_HV_FT_SNP)) + sev_es_terminate(SEV_TERM_SET_GEN, GHCB_SNP_UNSUPPORTED); + } + + /* Enable SEV-ES special handling */ + static_branch_enable(&sev_es_enable_key); + + /* Initialize per-cpu GHCB pages */ + for_each_possible_cpu(cpu) { + alloc_runtime_data(cpu); + init_ghcb(cpu); + } + + sev_es_setup_play_dead(); + + /* Secondary CPUs use the runtime #VC handler */ + initial_vc_handler = (unsigned long)kernel_exc_vmm_communication; +} + +static void __init vc_early_forward_exception(struct es_em_ctxt *ctxt) +{ + int trapnr = ctxt->fi.vector; + + if (trapnr == X86_TRAP_PF) + native_write_cr2(ctxt->fi.cr2); + + ctxt->regs->orig_ax = ctxt->fi.error_code; + do_early_exception(ctxt->regs, trapnr); +} + +static long *vc_insn_get_rm(struct es_em_ctxt *ctxt) +{ + long *reg_array; + int offset; + + reg_array = (long *)ctxt->regs; + offset = insn_get_modrm_rm_off(&ctxt->insn, ctxt->regs); + + if (offset < 0) + return NULL; + + offset /= sizeof(long); + + return reg_array + offset; +} +static enum es_result vc_do_mmio(struct ghcb *ghcb, struct es_em_ctxt *ctxt, + unsigned int bytes, bool read) +{ + u64 exit_code, exit_info_1, exit_info_2; + unsigned long ghcb_pa = __pa(ghcb); + enum es_result res; + phys_addr_t paddr; + void __user *ref; + + ref = insn_get_addr_ref(&ctxt->insn, ctxt->regs); + if (ref == (void __user *)-1L) + return ES_UNSUPPORTED; + + exit_code = read ? 
SVM_VMGEXIT_MMIO_READ : SVM_VMGEXIT_MMIO_WRITE; + + res = vc_slow_virt_to_phys(ghcb, ctxt, (unsigned long)ref, &paddr); + if (res != ES_OK) { + if (res == ES_EXCEPTION && !read) + ctxt->fi.error_code |= X86_PF_WRITE; + + return res; + } + + exit_info_1 = paddr; + /* Can never be greater than 8 */ + exit_info_2 = bytes; + + ghcb_set_sw_scratch(ghcb, ghcb_pa + offsetof(struct ghcb, shared_buffer)); + + return sev_es_ghcb_hv_call(ghcb, ctxt, exit_code, exit_info_1, exit_info_2); +} + +/* + * The MOVS instruction has two memory operands, which raises the + * problem that it is not known whether the access to the source or the + * destination caused the #VC exception (and hence whether an MMIO read + * or write operation needs to be emulated). + * + * Instead of playing games with walking page-tables and trying to guess + * whether the source or destination is an MMIO range, split the move + * into two operations, a read and a write with only one memory operand. + * This will cause a nested #VC exception on the MMIO address which can + * then be handled. + * + * This implementation has the benefit that it also supports MOVS where + * source _and_ destination are MMIO regions. + * + * It will slow MOVS on MMIO down a lot, but in SEV-ES guests it is a + * rare operation. If it turns out to be a performance problem the split + * operations can be moved to memcpy_fromio() and memcpy_toio(). 
+ */ +static enum es_result vc_handle_mmio_movs(struct es_em_ctxt *ctxt, + unsigned int bytes) +{ + unsigned long ds_base, es_base; + unsigned char *src, *dst; + unsigned char buffer[8]; + enum es_result ret; + bool rep; + int off; + + ds_base = insn_get_seg_base(ctxt->regs, INAT_SEG_REG_DS); + es_base = insn_get_seg_base(ctxt->regs, INAT_SEG_REG_ES); + + if (ds_base == -1L || es_base == -1L) { + ctxt->fi.vector = X86_TRAP_GP; + ctxt->fi.error_code = 0; + return ES_EXCEPTION; + } + + src = ds_base + (unsigned char *)ctxt->regs->si; + dst = es_base + (unsigned char *)ctxt->regs->di; + + ret = vc_read_mem(ctxt, src, buffer, bytes); + if (ret != ES_OK) + return ret; + + ret = vc_write_mem(ctxt, dst, buffer, bytes); + if (ret != ES_OK) + return ret; + + if (ctxt->regs->flags & X86_EFLAGS_DF) + off = -bytes; + else + off = bytes; + + ctxt->regs->si += off; + ctxt->regs->di += off; + + rep = insn_has_rep_prefix(&ctxt->insn); + if (rep) + ctxt->regs->cx -= 1; + + if (!rep || ctxt->regs->cx == 0) + return ES_OK; + else + return ES_RETRY; +} + +static enum es_result vc_handle_mmio(struct ghcb *ghcb, struct es_em_ctxt *ctxt) +{ + struct insn *insn = &ctxt->insn; + unsigned int bytes = 0; + enum mmio_type mmio; + enum es_result ret; + u8 sign_byte; + long *reg_data; + + mmio = insn_decode_mmio(insn, &bytes); + if (mmio == MMIO_DECODE_FAILED) + return ES_DECODE_FAILED; + + if (mmio != MMIO_WRITE_IMM && mmio != MMIO_MOVS) { + reg_data = insn_get_modrm_reg_ptr(insn, ctxt->regs); + if (!reg_data) + return ES_DECODE_FAILED; + } + + switch (mmio) { + case MMIO_WRITE: + memcpy(ghcb->shared_buffer, reg_data, bytes); + ret = vc_do_mmio(ghcb, ctxt, bytes, false); + break; + case MMIO_WRITE_IMM: + memcpy(ghcb->shared_buffer, insn->immediate1.bytes, bytes); + ret = vc_do_mmio(ghcb, ctxt, bytes, false); + break; + case MMIO_READ: + ret = vc_do_mmio(ghcb, ctxt, bytes, true); + if (ret) + break; + + /* Zero-extend for 32-bit operation */ + if (bytes == 4) + *reg_data = 0; + + 
memcpy(reg_data, ghcb->shared_buffer, bytes); + break; + case MMIO_READ_ZERO_EXTEND: + ret = vc_do_mmio(ghcb, ctxt, bytes, true); + if (ret) + break; + + /* Zero extend based on operand size */ + memset(reg_data, 0, insn->opnd_bytes); + memcpy(reg_data, ghcb->shared_buffer, bytes); + break; + case MMIO_READ_SIGN_EXTEND: + ret = vc_do_mmio(ghcb, ctxt, bytes, true); + if (ret) + break; + + if (bytes == 1) { + u8 *val = (u8 *)ghcb->shared_buffer; + + sign_byte = (*val & 0x80) ? 0xff : 0x00; + } else { + u16 *val = (u16 *)ghcb->shared_buffer; + + sign_byte = (*val & 0x8000) ? 0xff : 0x00; + } + + /* Sign extend based on operand size */ + memset(reg_data, sign_byte, insn->opnd_bytes); + memcpy(reg_data, ghcb->shared_buffer, bytes); + break; + case MMIO_MOVS: + ret = vc_handle_mmio_movs(ctxt, bytes); + break; + default: + ret = ES_UNSUPPORTED; + break; + } + + return ret; +} + +static enum es_result vc_handle_dr7_write(struct ghcb *ghcb, + struct es_em_ctxt *ctxt) +{ + struct sev_es_runtime_data *data = this_cpu_read(runtime_data); + long val, *reg = vc_insn_get_rm(ctxt); + enum es_result ret; + + if (!reg) + return ES_DECODE_FAILED; + + val = *reg; + + /* Upper 32 bits must be written as zeroes */ + if (val >> 32) { + ctxt->fi.vector = X86_TRAP_GP; + ctxt->fi.error_code = 0; + return ES_EXCEPTION; + } + + /* Clear out other reserved bits and set bit 10 */ + val = (val & 0xffff23ffL) | BIT(10); + + /* Early non-zero writes to DR7 are not supported */ + if (!data && (val & ~DR7_RESET_VALUE)) + return ES_UNSUPPORTED; + + /* Using a value of 0 for ExitInfo1 means RAX holds the value */ + ghcb_set_rax(ghcb, val); + ret = sev_es_ghcb_hv_call(ghcb, ctxt, SVM_EXIT_WRITE_DR7, 0, 0); + if (ret != ES_OK) + return ret; + + if (data) + data->dr7 = val; + + return ES_OK; +} + +static enum es_result vc_handle_dr7_read(struct ghcb *ghcb, + struct es_em_ctxt *ctxt) +{ + struct sev_es_runtime_data *data = this_cpu_read(runtime_data); + long *reg = vc_insn_get_rm(ctxt); + + if (!reg) + 
return ES_DECODE_FAILED; + + if (data) + *reg = data->dr7; + else + *reg = DR7_RESET_VALUE; + + return ES_OK; +} + +static enum es_result vc_handle_wbinvd(struct ghcb *ghcb, + struct es_em_ctxt *ctxt) +{ + return sev_es_ghcb_hv_call(ghcb, ctxt, SVM_EXIT_WBINVD, 0, 0); +} + +static enum es_result vc_handle_rdpmc(struct ghcb *ghcb, struct es_em_ctxt *ctxt) +{ + enum es_result ret; + + ghcb_set_rcx(ghcb, ctxt->regs->cx); + + ret = sev_es_ghcb_hv_call(ghcb, ctxt, SVM_EXIT_RDPMC, 0, 0); + if (ret != ES_OK) + return ret; + + if (!(ghcb_rax_is_valid(ghcb) && ghcb_rdx_is_valid(ghcb))) + return ES_VMM_ERROR; + + ctxt->regs->ax = ghcb->save.rax; + ctxt->regs->dx = ghcb->save.rdx; + + return ES_OK; +} + +static enum es_result vc_handle_monitor(struct ghcb *ghcb, + struct es_em_ctxt *ctxt) +{ + /* + * Treat it as a NOP and do not leak a physical address to the + * hypervisor. + */ + return ES_OK; +} + +static enum es_result vc_handle_mwait(struct ghcb *ghcb, + struct es_em_ctxt *ctxt) +{ + /* Treat the same as MONITOR/MONITORX */ + return ES_OK; +} + +static enum es_result vc_handle_vmmcall(struct ghcb *ghcb, + struct es_em_ctxt *ctxt) +{ + enum es_result ret; + + ghcb_set_rax(ghcb, ctxt->regs->ax); + ghcb_set_cpl(ghcb, user_mode(ctxt->regs) ? 3 : 0); + + if (x86_platform.hyper.sev_es_hcall_prepare) + x86_platform.hyper.sev_es_hcall_prepare(ghcb, ctxt->regs); + + ret = sev_es_ghcb_hv_call(ghcb, ctxt, SVM_EXIT_VMMCALL, 0, 0); + if (ret != ES_OK) + return ret; + + if (!ghcb_rax_is_valid(ghcb)) + return ES_VMM_ERROR; + + ctxt->regs->ax = ghcb->save.rax; + + /* + * Call sev_es_hcall_finish() after regs->ax is already set. + * This allows the hypervisor handler to overwrite it again if + * necessary. 
+ */ + if (x86_platform.hyper.sev_es_hcall_finish && + !x86_platform.hyper.sev_es_hcall_finish(ghcb, ctxt->regs)) + return ES_VMM_ERROR; + + return ES_OK; +} + +static enum es_result vc_handle_trap_ac(struct ghcb *ghcb, + struct es_em_ctxt *ctxt) +{ + /* + * Calling ecx_alignment_check() directly does not work, because it + * enables IRQs and the GHCB is active. Forward the exception and call + * it later from vc_forward_exception(). + */ + ctxt->fi.vector = X86_TRAP_AC; + ctxt->fi.error_code = 0; + return ES_EXCEPTION; +} + +static enum es_result vc_handle_exitcode(struct es_em_ctxt *ctxt, + struct ghcb *ghcb, + unsigned long exit_code) +{ + enum es_result result; + + switch (exit_code) { + case SVM_EXIT_READ_DR7: + result = vc_handle_dr7_read(ghcb, ctxt); + break; + case SVM_EXIT_WRITE_DR7: + result = vc_handle_dr7_write(ghcb, ctxt); + break; + case SVM_EXIT_EXCP_BASE + X86_TRAP_AC: + result = vc_handle_trap_ac(ghcb, ctxt); + break; + case SVM_EXIT_RDTSC: + case SVM_EXIT_RDTSCP: + result = vc_handle_rdtsc(ghcb, ctxt, exit_code); + break; + case SVM_EXIT_RDPMC: + result = vc_handle_rdpmc(ghcb, ctxt); + break; + case SVM_EXIT_INVD: + pr_err_ratelimited("#VC exception for INVD??? 
Seriously???\n"); + result = ES_UNSUPPORTED; + break; + case SVM_EXIT_CPUID: + result = vc_handle_cpuid(ghcb, ctxt); + break; + case SVM_EXIT_IOIO: + result = vc_handle_ioio(ghcb, ctxt); + break; + case SVM_EXIT_MSR: + result = vc_handle_msr(ghcb, ctxt); + break; + case SVM_EXIT_VMMCALL: + result = vc_handle_vmmcall(ghcb, ctxt); + break; + case SVM_EXIT_WBINVD: + result = vc_handle_wbinvd(ghcb, ctxt); + break; + case SVM_EXIT_MONITOR: + result = vc_handle_monitor(ghcb, ctxt); + break; + case SVM_EXIT_MWAIT: + result = vc_handle_mwait(ghcb, ctxt); + break; + case SVM_EXIT_NPF: + result = vc_handle_mmio(ghcb, ctxt); + break; + default: + /* + * Unexpected #VC exception + */ + result = ES_UNSUPPORTED; + } + + return result; +} + +static __always_inline void vc_forward_exception(struct es_em_ctxt *ctxt) +{ + long error_code = ctxt->fi.error_code; + int trapnr = ctxt->fi.vector; + + ctxt->regs->orig_ax = ctxt->fi.error_code; + + switch (trapnr) { + case X86_TRAP_GP: + exc_general_protection(ctxt->regs, error_code); + break; + case X86_TRAP_UD: + exc_invalid_op(ctxt->regs); + break; + case X86_TRAP_PF: + write_cr2(ctxt->fi.cr2); + exc_page_fault(ctxt->regs, error_code); + break; + case X86_TRAP_AC: + exc_alignment_check(ctxt->regs, error_code); + break; + default: + pr_emerg("Unsupported exception in #VC instruction emulation - can't continue\n"); + BUG(); + } +} + +static __always_inline bool is_vc2_stack(unsigned long sp) +{ + return (sp >= __this_cpu_ist_bottom_va(VC2) && sp < __this_cpu_ist_top_va(VC2)); +} + +static __always_inline bool vc_from_invalid_context(struct pt_regs *regs) +{ + unsigned long sp, prev_sp; + + sp = (unsigned long)regs; + prev_sp = regs->sp; + + /* + * If the code was already executing on the VC2 stack when the #VC + * happened, let it proceed to the normal handling routine. This way the + * code executing on the VC2 stack can cause #VC exceptions to get handled. 
+ */ + return is_vc2_stack(sp) && !is_vc2_stack(prev_sp); +} + +static bool vc_raw_handle_exception(struct pt_regs *regs, unsigned long error_code) +{ + struct ghcb_state state; + struct es_em_ctxt ctxt; + enum es_result result; + struct ghcb *ghcb; + bool ret = true; + + ghcb = __sev_get_ghcb(&state); + + vc_ghcb_invalidate(ghcb); + result = vc_init_em_ctxt(&ctxt, regs, error_code); + + if (result == ES_OK) + result = vc_handle_exitcode(&ctxt, ghcb, error_code); + + __sev_put_ghcb(&state); + + /* Done - now check the result */ + switch (result) { + case ES_OK: + vc_finish_insn(&ctxt); + break; + case ES_UNSUPPORTED: + pr_err_ratelimited("Unsupported exit-code 0x%02lx in #VC exception (IP: 0x%lx)\n", + error_code, regs->ip); + ret = false; + break; + case ES_VMM_ERROR: + pr_err_ratelimited("Failure in communication with VMM (exit-code 0x%02lx IP: 0x%lx)\n", + error_code, regs->ip); + ret = false; + break; + case ES_DECODE_FAILED: + pr_err_ratelimited("Failed to decode instruction (exit-code 0x%02lx IP: 0x%lx)\n", + error_code, regs->ip); + ret = false; + break; + case ES_EXCEPTION: + vc_forward_exception(&ctxt); + break; + case ES_RETRY: + /* Nothing to do */ + break; + default: + pr_emerg("Unknown result in %s():%d\n", __func__, result); + /* + * Emulating the instruction which caused the #VC exception + * failed - can't continue so print debug information + */ + BUG(); + } + + return ret; +} + +static __always_inline bool vc_is_db(unsigned long error_code) +{ + return error_code == SVM_EXIT_EXCP_BASE + X86_TRAP_DB; +} + +/* + * Runtime #VC exception handler when raised from kernel mode. Runs in NMI mode + * and will panic when an error happens. + */ +DEFINE_IDTENTRY_VC_KERNEL(exc_vmm_communication) +{ + irqentry_state_t irq_state; + + /* + * With the current implementation it is always possible to switch to a + * safe stack because #VC exceptions only happen at known places, like + * intercepted instructions or accesses to MMIO areas/IO ports. 
They can + * also happen with code instrumentation when the hypervisor intercepts + * #DB, but the critical paths are forbidden to be instrumented, so #DB + * exceptions currently also only happen in safe places. + * + * But keep this here in case the noinstr annotations are violated due + * to bug elsewhere. + */ + if (unlikely(vc_from_invalid_context(regs))) { + instrumentation_begin(); + panic("Can't handle #VC exception from unsupported context\n"); + instrumentation_end(); + } + + /* + * Handle #DB before calling into !noinstr code to avoid recursive #DB. + */ + if (vc_is_db(error_code)) { + exc_debug(regs); + return; + } + + irq_state = irqentry_nmi_enter(regs); + + instrumentation_begin(); + + if (!vc_raw_handle_exception(regs, error_code)) { + /* Show some debug info */ + show_regs(regs); + + /* Ask hypervisor to sev_es_terminate */ + sev_es_terminate(SEV_TERM_SET_GEN, GHCB_SEV_ES_GEN_REQ); + + /* If that fails and we get here - just panic */ + panic("Returned from Terminate-Request to Hypervisor\n"); + } + + instrumentation_end(); + irqentry_nmi_exit(regs, irq_state); +} + +/* + * Runtime #VC exception handler when raised from user mode. Runs in IRQ mode + * and will kill the current task with SIGBUS when an error happens. + */ +DEFINE_IDTENTRY_VC_USER(exc_vmm_communication) +{ + /* + * Handle #DB before calling into !noinstr code to avoid recursive #DB. + */ + if (vc_is_db(error_code)) { + noist_exc_debug(regs); + return; + } + + irqentry_enter_from_user_mode(regs); + instrumentation_begin(); + + if (!vc_raw_handle_exception(regs, error_code)) { + /* + * Do not kill the machine if user-space triggered the + * exception. Send SIGBUS instead and let user-space deal with + * it. 
+ */ + force_sig_fault(SIGBUS, BUS_OBJERR, (void __user *)0); + } + + instrumentation_end(); + irqentry_exit_to_user_mode(regs); +} + +bool __init handle_vc_boot_ghcb(struct pt_regs *regs) +{ + unsigned long exit_code = regs->orig_ax; + struct es_em_ctxt ctxt; + enum es_result result; + + vc_ghcb_invalidate(boot_ghcb); + + result = vc_init_em_ctxt(&ctxt, regs, exit_code); + if (result == ES_OK) + result = vc_handle_exitcode(&ctxt, boot_ghcb, exit_code); + + /* Done - now check the result */ + switch (result) { + case ES_OK: + vc_finish_insn(&ctxt); + break; + case ES_UNSUPPORTED: + early_printk("PANIC: Unsupported exit-code 0x%02lx in early #VC exception (IP: 0x%lx)\n", + exit_code, regs->ip); + goto fail; + case ES_VMM_ERROR: + early_printk("PANIC: Failure in communication with VMM (exit-code 0x%02lx IP: 0x%lx)\n", + exit_code, regs->ip); + goto fail; + case ES_DECODE_FAILED: + early_printk("PANIC: Failed to decode instruction (exit-code 0x%02lx IP: 0x%lx)\n", + exit_code, regs->ip); + goto fail; + case ES_EXCEPTION: + vc_early_forward_exception(&ctxt); + break; + case ES_RETRY: + /* Nothing to do */ + break; + default: + BUG(); + } + + return true; + +fail: + show_regs(regs); + + sev_es_terminate(SEV_TERM_SET_GEN, GHCB_SEV_ES_GEN_REQ); +} + +/* + * Initial set up of SNP relies on information provided by the + * Confidential Computing blob, which can be passed to the kernel + * in the following ways, depending on how it is booted: + * + * - when booted via the boot/decompress kernel: + * - via boot_params + * + * - when booted directly by firmware/bootloader (e.g. CONFIG_PVH): + * - via a setup_data entry, as defined by the Linux Boot Protocol + * + * Scan for the blob in that order. + */ +static __init struct cc_blob_sev_info *find_cc_blob(struct boot_params *bp) +{ + struct cc_blob_sev_info *cc_info; + + /* Boot kernel would have passed the CC blob via boot_params. 
*/ + if (bp->cc_blob_address) { + cc_info = (struct cc_blob_sev_info *)(unsigned long)bp->cc_blob_address; + goto found_cc_info; + } + + /* + * If kernel was booted directly, without the use of the + * boot/decompression kernel, the CC blob may have been passed via + * setup_data instead. + */ + cc_info = find_cc_blob_setup_data(bp); + if (!cc_info) + return NULL; + +found_cc_info: + if (cc_info->magic != CC_BLOB_SEV_HDR_MAGIC) + snp_abort(); + + return cc_info; +} + +bool __init snp_init(struct boot_params *bp) +{ + struct cc_blob_sev_info *cc_info; + + if (!bp) + return false; + + cc_info = find_cc_blob(bp); + if (!cc_info) + return false; + + setup_cpuid_table(cc_info); + + /* + * The CC blob will be used later to access the secrets page. Cache + * it here like the boot kernel does. + */ + bp->cc_blob_address = (u32)(unsigned long)cc_info; + + return true; +} + +void __init __noreturn snp_abort(void) +{ + sev_es_terminate(SEV_TERM_SET_GEN, GHCB_SNP_UNSUPPORTED); +} + +static void dump_cpuid_table(void) +{ + const struct snp_cpuid_table *cpuid_table = snp_cpuid_get_table(); + int i = 0; + + pr_info("count=%d reserved=0x%x reserved2=0x%llx\n", + cpuid_table->count, cpuid_table->__reserved1, cpuid_table->__reserved2); + + for (i = 0; i < SNP_CPUID_COUNT_MAX; i++) { + const struct snp_cpuid_fn *fn = &cpuid_table->fn[i]; + + pr_info("index=%3d fn=0x%08x subfn=0x%08x: eax=0x%08x ebx=0x%08x ecx=0x%08x edx=0x%08x xcr0_in=0x%016llx xss_in=0x%016llx reserved=0x%016llx\n", + i, fn->eax_in, fn->ecx_in, fn->eax, fn->ebx, fn->ecx, + fn->edx, fn->xcr0_in, fn->xss_in, fn->__reserved); + } +} + +/* + * It is useful from an auditing/testing perspective to provide an easy way + * for the guest owner to know that the CPUID table has been initialized as + * expected, but that initialization happens too early in boot to print any + * sort of indicator, and there's not really any other good place to do it, + * so do it here. 
+ */ +static int __init report_cpuid_table(void) +{ + const struct snp_cpuid_table *cpuid_table = snp_cpuid_get_table(); + + if (!cpuid_table->count) + return 0; + + pr_info("Using SNP CPUID table, %d entries present.\n", + cpuid_table->count); + + if (sev_cfg.debug) + dump_cpuid_table(); + + return 0; +} +arch_initcall(report_cpuid_table); + +static int __init init_sev_config(char *str) +{ + char *s; + + while ((s = strsep(&str, ","))) { + if (!strcmp(s, "debug")) { + sev_cfg.debug = true; + continue; + } + + pr_info("SEV command-line option '%s' was not recognized\n", s); + } + + return 1; +} +__setup("sev=", init_sev_config); + +int snp_issue_guest_request(u64 exit_code, struct snp_req_data *input, unsigned long *fw_err) +{ + struct ghcb_state state; + struct es_em_ctxt ctxt; + unsigned long flags; + struct ghcb *ghcb; + int ret; + + if (!cc_platform_has(CC_ATTR_GUEST_SEV_SNP)) + return -ENODEV; + + if (!fw_err) + return -EINVAL; + + /* + * __sev_get_ghcb() needs to run with IRQs disabled because it is using + * a per-CPU GHCB. 
+ */ + local_irq_save(flags); + + ghcb = __sev_get_ghcb(&state); + if (!ghcb) { + ret = -EIO; + goto e_restore_irq; + } + + vc_ghcb_invalidate(ghcb); + + if (exit_code == SVM_VMGEXIT_EXT_GUEST_REQUEST) { + ghcb_set_rax(ghcb, input->data_gpa); + ghcb_set_rbx(ghcb, input->data_npages); + } + + ret = sev_es_ghcb_hv_call(ghcb, &ctxt, exit_code, input->req_gpa, input->resp_gpa); + if (ret) + goto e_put; + + if (ghcb->save.sw_exit_info_2) { + /* Number of expected pages are returned in RBX */ + if (exit_code == SVM_VMGEXIT_EXT_GUEST_REQUEST && + ghcb->save.sw_exit_info_2 == SNP_GUEST_REQ_INVALID_LEN) + input->data_npages = ghcb_get_rbx(ghcb); + + *fw_err = ghcb->save.sw_exit_info_2; + + ret = -EIO; + } + +e_put: + __sev_put_ghcb(&state); +e_restore_irq: + local_irq_restore(flags); + + return ret; +} +EXPORT_SYMBOL_GPL(snp_issue_guest_request); + +static struct platform_device sev_guest_device = { + .name = "sev-guest", + .id = -1, +}; + +static int __init snp_init_platform_device(void) +{ + struct sev_guest_platform_data data; + u64 gpa; + + if (!cc_platform_has(CC_ATTR_GUEST_SEV_SNP)) + return -ENODEV; + + gpa = get_secrets_page(); + if (!gpa) + return -ENODEV; + + data.secrets_gpa = gpa; + if (platform_device_add_data(&sev_guest_device, &data, sizeof(data))) + return -ENODEV; + + if (platform_device_register(&sev_guest_device)) + return -ENODEV; + + pr_info("SNP guest platform device initialized.\n"); + return 0; +} +device_initcall(snp_init_platform_device); diff --git a/arch/x86/kernel/sev_verify_cbit.S b/arch/x86/kernel/sev_verify_cbit.S new file mode 100644 index 000000000000..3355e27c69eb --- /dev/null +++ b/arch/x86/kernel/sev_verify_cbit.S @@ -0,0 +1,89 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +/* + * sev_verify_cbit.S - Code for verification of the C-bit position reported + * by the Hypervisor when running with SEV enabled. 
+ * + * Copyright (c) 2020 Joerg Roedel (jroedel@suse.de) + * + * sev_verify_cbit() is called before switching to a new long-mode page-table + * at boot. + * + * Verify that the C-bit position is correct by writing a random value to + * an encrypted memory location while on the current page-table. Then it + * switches to the new page-table to verify the memory content is still the + * same. After that it switches back to the current page-table and when the + * check succeeded it returns. If the check failed the code invalidates the + * stack pointer and goes into a hlt loop. The stack-pointer is invalidated to + * make sure no interrupt or exception can get the CPU out of the hlt loop. + * + * New page-table pointer is expected in %rdi (first parameter) + * + */ +SYM_FUNC_START(sev_verify_cbit) +#ifdef CONFIG_AMD_MEM_ENCRYPT + /* First check if a C-bit was detected */ + movq sme_me_mask(%rip), %rsi + testq %rsi, %rsi + jz 3f + + /* sme_me_mask != 0 could mean SME or SEV - Check also for SEV */ + movq sev_status(%rip), %rsi + testq %rsi, %rsi + jz 3f + + /* Save CR4 in %rsi */ + movq %cr4, %rsi + + /* Disable Global Pages */ + movq %rsi, %rdx + andq $(~X86_CR4_PGE), %rdx + movq %rdx, %cr4 + + /* + * Verified that running under SEV - now get a random value using + * RDRAND. This instruction is mandatory when running as an SEV guest. + * + * Don't bail out of the loop if RDRAND returns errors. It is better to + * prevent forward progress than to work with a non-random value here. + */ +1: rdrand %rdx + jnc 1b + + /* Store value to memory and keep it in %rdx */ + movq %rdx, sev_check_data(%rip) + + /* Backup current %cr3 value to restore it later */ + movq %cr3, %rcx + + /* Switch to new %cr3 - This might unmap the stack */ + movq %rdi, %cr3 + + /* + * Compare value in %rdx with memory location. If C-bit is incorrect + * this would read the encrypted data and make the check fail. 
+ */ + cmpq %rdx, sev_check_data(%rip) + + /* Restore old %cr3 */ + movq %rcx, %cr3 + + /* Restore previous CR4 */ + movq %rsi, %cr4 + + /* Check CMPQ result */ + je 3f + + /* + * The check failed, prevent any forward progress to prevent ROP + * attacks, invalidate the stack and go into a hlt loop. + */ + xorq %rsp, %rsp + subq $0x1000, %rsp +2: hlt + jmp 2b +3: +#endif + /* Return page-table pointer */ + movq %rdi, %rax + RET +SYM_FUNC_END(sev_verify_cbit) diff --git a/arch/x86/kernel/signal.c b/arch/x86/kernel/signal.c index 8a29573851a3..9c7265b524c7 100644 --- a/arch/x86/kernel/signal.c +++ b/arch/x86/kernel/signal.c @@ -15,9 +15,9 @@ #include <linux/mm.h> #include <linux/smp.h> #include <linux/kernel.h> +#include <linux/kstrtox.h> #include <linux/errno.h> #include <linux/wait.h> -#include <linux/tracehook.h> #include <linux/unistd.h> #include <linux/stddef.h> #include <linux/personality.h> @@ -25,46 +25,29 @@ #include <linux/user-return-notifier.h> #include <linux/uprobes.h> #include <linux/context_tracking.h> +#include <linux/entry-common.h> #include <linux/syscalls.h> #include <asm/processor.h> #include <asm/ucontext.h> -#include <asm/fpu/internal.h> #include <asm/fpu/signal.h> +#include <asm/fpu/xstate.h> #include <asm/vdso.h> #include <asm/mce.h> #include <asm/sighandling.h> #include <asm/vm86.h> #ifdef CONFIG_X86_64 +#include <linux/compat.h> #include <asm/proto.h> #include <asm/ia32_unistd.h> +#include <asm/fpu/xstate.h> #endif /* CONFIG_X86_64 */ #include <asm/syscall.h> -#include <asm/syscalls.h> - #include <asm/sigframe.h> #include <asm/signal.h> -#define COPY(x) do { \ - get_user_ex(regs->x, &sc->x); \ -} while (0) - -#define GET_SEG(seg) ({ \ - unsigned short tmp; \ - get_user_ex(tmp, &sc->seg); \ - tmp; \ -}) - -#define COPY_SEG(seg) do { \ - regs->seg = GET_SEG(seg); \ -} while (0) - -#define COPY_SEG_CPL3(seg) do { \ - regs->seg = GET_SEG(seg) | 3; \ -} while (0) - #ifdef CONFIG_X86_64 /* * If regs->ss will cause an IRET fault, change it. 
Otherwise leave it @@ -92,53 +75,58 @@ static void force_valid_ss(struct pt_regs *regs) ar != (AR_DPL3 | AR_S | AR_P | AR_TYPE_RWDATA_EXPDOWN)) regs->ss = __USER_DS; } +# define CONTEXT_COPY_SIZE offsetof(struct sigcontext, reserved1) +#else +# define CONTEXT_COPY_SIZE sizeof(struct sigcontext) #endif -static int restore_sigcontext(struct pt_regs *regs, - struct sigcontext __user *sc, - unsigned long uc_flags) +static bool restore_sigcontext(struct pt_regs *regs, + struct sigcontext __user *usc, + unsigned long uc_flags) { - unsigned long buf_val; - void __user *buf; - unsigned int tmpflags; - unsigned int err = 0; + struct sigcontext sc; /* Always make any pending restarted system calls return -EINTR */ current->restart_block.fn = do_no_restart_syscall; - get_user_try { + if (copy_from_user(&sc, usc, CONTEXT_COPY_SIZE)) + return false; #ifdef CONFIG_X86_32 - set_user_gs(regs, GET_SEG(gs)); - COPY_SEG(fs); - COPY_SEG(es); - COPY_SEG(ds); + loadsegment(gs, sc.gs); + regs->fs = sc.fs; + regs->es = sc.es; + regs->ds = sc.ds; #endif /* CONFIG_X86_32 */ - COPY(di); COPY(si); COPY(bp); COPY(sp); COPY(bx); - COPY(dx); COPY(cx); COPY(ip); COPY(ax); + regs->bx = sc.bx; + regs->cx = sc.cx; + regs->dx = sc.dx; + regs->si = sc.si; + regs->di = sc.di; + regs->bp = sc.bp; + regs->ax = sc.ax; + regs->sp = sc.sp; + regs->ip = sc.ip; #ifdef CONFIG_X86_64 - COPY(r8); - COPY(r9); - COPY(r10); - COPY(r11); - COPY(r12); - COPY(r13); - COPY(r14); - COPY(r15); + regs->r8 = sc.r8; + regs->r9 = sc.r9; + regs->r10 = sc.r10; + regs->r11 = sc.r11; + regs->r12 = sc.r12; + regs->r13 = sc.r13; + regs->r14 = sc.r14; + regs->r15 = sc.r15; #endif /* CONFIG_X86_64 */ - COPY_SEG_CPL3(cs); - COPY_SEG_CPL3(ss); - - get_user_ex(tmpflags, &sc->flags); - regs->flags = (regs->flags & ~FIX_EFLAGS) | (tmpflags & FIX_EFLAGS); - regs->orig_ax = -1; /* disable syscall checks */ + /* Get CS/SS and force CPL3 */ + regs->cs = sc.cs | 0x03; + regs->ss = sc.ss | 0x03; - get_user_ex(buf_val, &sc->fpstate); - buf = 
(void __user *)buf_val; - } get_user_catch(err); + regs->flags = (regs->flags & ~FIX_EFLAGS) | (sc.flags & FIX_EFLAGS); + /* disable syscall checks */ + regs->orig_ax = -1; #ifdef CONFIG_X86_64 /* @@ -149,74 +137,89 @@ static int restore_sigcontext(struct pt_regs *regs, force_valid_ss(regs); #endif - err |= fpu__restore_sig(buf, IS_ENABLED(CONFIG_X86_32)); - - return err; + return fpu__restore_sig((void __user *)sc.fpstate, + IS_ENABLED(CONFIG_X86_32)); } -int setup_sigcontext(struct sigcontext __user *sc, void __user *fpstate, +static __always_inline int +__unsafe_setup_sigcontext(struct sigcontext __user *sc, void __user *fpstate, struct pt_regs *regs, unsigned long mask) { - int err = 0; - - put_user_try { - #ifdef CONFIG_X86_32 - put_user_ex(get_user_gs(regs), (unsigned int __user *)&sc->gs); - put_user_ex(regs->fs, (unsigned int __user *)&sc->fs); - put_user_ex(regs->es, (unsigned int __user *)&sc->es); - put_user_ex(regs->ds, (unsigned int __user *)&sc->ds); + unsigned int gs; + savesegment(gs, gs); + + unsafe_put_user(gs, (unsigned int __user *)&sc->gs, Efault); + unsafe_put_user(regs->fs, (unsigned int __user *)&sc->fs, Efault); + unsafe_put_user(regs->es, (unsigned int __user *)&sc->es, Efault); + unsafe_put_user(regs->ds, (unsigned int __user *)&sc->ds, Efault); #endif /* CONFIG_X86_32 */ - put_user_ex(regs->di, &sc->di); - put_user_ex(regs->si, &sc->si); - put_user_ex(regs->bp, &sc->bp); - put_user_ex(regs->sp, &sc->sp); - put_user_ex(regs->bx, &sc->bx); - put_user_ex(regs->dx, &sc->dx); - put_user_ex(regs->cx, &sc->cx); - put_user_ex(regs->ax, &sc->ax); + unsafe_put_user(regs->di, &sc->di, Efault); + unsafe_put_user(regs->si, &sc->si, Efault); + unsafe_put_user(regs->bp, &sc->bp, Efault); + unsafe_put_user(regs->sp, &sc->sp, Efault); + unsafe_put_user(regs->bx, &sc->bx, Efault); + unsafe_put_user(regs->dx, &sc->dx, Efault); + unsafe_put_user(regs->cx, &sc->cx, Efault); + unsafe_put_user(regs->ax, &sc->ax, Efault); #ifdef CONFIG_X86_64 - 
put_user_ex(regs->r8, &sc->r8); - put_user_ex(regs->r9, &sc->r9); - put_user_ex(regs->r10, &sc->r10); - put_user_ex(regs->r11, &sc->r11); - put_user_ex(regs->r12, &sc->r12); - put_user_ex(regs->r13, &sc->r13); - put_user_ex(regs->r14, &sc->r14); - put_user_ex(regs->r15, &sc->r15); + unsafe_put_user(regs->r8, &sc->r8, Efault); + unsafe_put_user(regs->r9, &sc->r9, Efault); + unsafe_put_user(regs->r10, &sc->r10, Efault); + unsafe_put_user(regs->r11, &sc->r11, Efault); + unsafe_put_user(regs->r12, &sc->r12, Efault); + unsafe_put_user(regs->r13, &sc->r13, Efault); + unsafe_put_user(regs->r14, &sc->r14, Efault); + unsafe_put_user(regs->r15, &sc->r15, Efault); #endif /* CONFIG_X86_64 */ - put_user_ex(current->thread.trap_nr, &sc->trapno); - put_user_ex(current->thread.error_code, &sc->err); - put_user_ex(regs->ip, &sc->ip); + unsafe_put_user(current->thread.trap_nr, &sc->trapno, Efault); + unsafe_put_user(current->thread.error_code, &sc->err, Efault); + unsafe_put_user(regs->ip, &sc->ip, Efault); #ifdef CONFIG_X86_32 - put_user_ex(regs->cs, (unsigned int __user *)&sc->cs); - put_user_ex(regs->flags, &sc->flags); - put_user_ex(regs->sp, &sc->sp_at_signal); - put_user_ex(regs->ss, (unsigned int __user *)&sc->ss); + unsafe_put_user(regs->cs, (unsigned int __user *)&sc->cs, Efault); + unsafe_put_user(regs->flags, &sc->flags, Efault); + unsafe_put_user(regs->sp, &sc->sp_at_signal, Efault); + unsafe_put_user(regs->ss, (unsigned int __user *)&sc->ss, Efault); #else /* !CONFIG_X86_32 */ - put_user_ex(regs->flags, &sc->flags); - put_user_ex(regs->cs, &sc->cs); - put_user_ex(0, &sc->gs); - put_user_ex(0, &sc->fs); - put_user_ex(regs->ss, &sc->ss); + unsafe_put_user(regs->flags, &sc->flags, Efault); + unsafe_put_user(regs->cs, &sc->cs, Efault); + unsafe_put_user(0, &sc->gs, Efault); + unsafe_put_user(0, &sc->fs, Efault); + unsafe_put_user(regs->ss, &sc->ss, Efault); #endif /* CONFIG_X86_32 */ - put_user_ex(fpstate, (unsigned long __user *)&sc->fpstate); - - /* non-iBCS2 extensions.. 
*/ - put_user_ex(mask, &sc->oldmask); - put_user_ex(current->thread.cr2, &sc->cr2); - } put_user_catch(err); + unsafe_put_user(fpstate, (unsigned long __user *)&sc->fpstate, Efault); - return err; + /* non-iBCS2 extensions.. */ + unsafe_put_user(mask, &sc->oldmask, Efault); + unsafe_put_user(current->thread.cr2, &sc->cr2, Efault); + return 0; +Efault: + return -EFAULT; } +#define unsafe_put_sigcontext(sc, fp, regs, set, label) \ +do { \ + if (__unsafe_setup_sigcontext(sc, fp, regs, set->sig[0])) \ + goto label; \ +} while(0); + +#define unsafe_put_sigmask(set, frame, label) \ + unsafe_put_user(*(__u64 *)(set), \ + (__u64 __user *)&(frame)->uc.uc_sigmask, \ + label) + /* * Set up a signal frame. */ +/* x86 ABI requires 16-byte alignment */ +#define FRAME_ALIGNMENT 16UL + +#define MAX_FRAME_PADDING (FRAME_ALIGNMENT - 1) + /* * Determine which stack to use.. */ @@ -227,9 +230,9 @@ static unsigned long align_sigframe(unsigned long sp) * Align the stack pointer according to the i386 ABI, * i.e. so that on function entry ((sp + 4) & 15) == 0. */ - sp = ((sp + 4) & -16ul) - 4; + sp = ((sp + 4) & -FRAME_ALIGNMENT) - 4; #else /* !CONFIG_X86_32 */ - sp = round_down(sp, 16) - 8; + sp = round_down(sp, FRAME_ALIGNMENT) - 8; #endif return sp; } @@ -239,11 +242,11 @@ get_sigframe(struct k_sigaction *ka, struct pt_regs *regs, size_t frame_size, void __user **fpstate) { /* Default to using normal stack */ + bool nested_altstack = on_sig_stack(regs->sp); + bool entering_altstack = false; unsigned long math_size = 0; unsigned long sp = regs->sp; unsigned long buf_fx = 0; - int onsigstack = on_sig_stack(sp); - int ret; /* redzone */ if (IS_ENABLED(CONFIG_X86_64)) @@ -251,15 +254,23 @@ get_sigframe(struct k_sigaction *ka, struct pt_regs *regs, size_t frame_size, /* This is the X/Open sanctioned signal stack switching. */ if (ka->sa.sa_flags & SA_ONSTACK) { - if (sas_ss_flags(sp) == 0) + /* + * This checks nested_altstack via sas_ss_flags(). 
Sensible + * programs use SS_AUTODISARM, which disables that check, and + * programs that don't use SS_AUTODISARM get compatible. + */ + if (sas_ss_flags(sp) == 0) { sp = current->sas_ss_sp + current->sas_ss_size; + entering_altstack = true; + } } else if (IS_ENABLED(CONFIG_X86_32) && - !onsigstack && + !nested_altstack && regs->ss != __USER_DS && !(ka->sa.sa_flags & SA_RESTORER) && ka->sa.sa_restorer) { /* This is the legacy signal stack switching. */ sp = (unsigned long) ka->sa.sa_restorer; + entering_altstack = true; } sp = fpu__alloc_mathframe(sp, IS_ENABLED(CONFIG_X86_32), @@ -272,12 +283,18 @@ get_sigframe(struct k_sigaction *ka, struct pt_regs *regs, size_t frame_size, * If we are on the alternate signal stack and would overflow it, don't. * Return an always-bogus address instead so we will die with SIGSEGV. */ - if (onsigstack && !likely(on_sig_stack(sp))) + if (unlikely((nested_altstack || entering_altstack) && + !__on_sig_stack(sp))) { + + if (show_unhandled_signals && printk_ratelimit()) + pr_info("%s[%d] overflowed sigaltstack\n", + current->comm, task_pid_nr(current)); + return (void __user *)-1L; + } /* save i387 and extended state */ - ret = copy_fpstate_to_sigframe(*fpstate, (void __user *)buf_fx, math_size); - if (ret < 0) + if (!copy_fpstate_to_sigframe(*fpstate, (void __user *)buf_fx, math_size)) return (void __user *)-1L; return (void __user *)sp; @@ -312,26 +329,16 @@ __setup_frame(int sig, struct ksignal *ksig, sigset_t *set, { struct sigframe __user *frame; void __user *restorer; - int err = 0; - void __user *fpstate = NULL; - - frame = get_sigframe(&ksig->ka, regs, sizeof(*frame), &fpstate); - - if (!access_ok(frame, sizeof(*frame))) - return -EFAULT; + void __user *fp = NULL; - if (__put_user(sig, &frame->sig)) - return -EFAULT; + frame = get_sigframe(&ksig->ka, regs, sizeof(*frame), &fp); - if (setup_sigcontext(&frame->sc, fpstate, regs, set->sig[0])) + if (!user_access_begin(frame, sizeof(*frame))) return -EFAULT; - if (_NSIG_WORDS > 1) { 
- if (__copy_to_user(&frame->extramask, &set->sig[1], - sizeof(frame->extramask))) - return -EFAULT; - } - + unsafe_put_user(sig, &frame->sig, Efault); + unsafe_put_sigcontext(&frame->sc, fp, regs, set, Efault); + unsafe_put_user(set->sig[1], &frame->extramask[0], Efault); if (current->mm->context.vdso) restorer = current->mm->context.vdso + vdso_image_32.sym___kernel_sigreturn; @@ -341,7 +348,7 @@ __setup_frame(int sig, struct ksignal *ksig, sigset_t *set, restorer = ksig->ka.sa.sa_restorer; /* Set up to return from userspace. */ - err |= __put_user(restorer, &frame->pretcode); + unsafe_put_user(restorer, &frame->pretcode, Efault); /* * This is popl %eax ; movl $__NR_sigreturn, %eax ; int $0x80 @@ -350,10 +357,8 @@ __setup_frame(int sig, struct ksignal *ksig, sigset_t *set, * reasons and because gdb uses it as a signature to notice * signal handler stack frames. */ - err |= __put_user(*((u64 *)&retcode), (u64 *)frame->retcode); - - if (err) - return -EFAULT; + unsafe_put_user(*((u64 *)&retcode), (u64 *)frame->retcode, Efault); + user_access_end(); /* Set up registers for signal handler */ regs->sp = (unsigned long)frame; @@ -368,6 +373,10 @@ __setup_frame(int sig, struct ksignal *ksig, sigset_t *set, regs->cs = __USER_CS; return 0; + +Efault: + user_access_end(); + return -EFAULT; } static int __setup_rt_frame(int sig, struct ksignal *ksig, @@ -375,50 +384,45 @@ static int __setup_rt_frame(int sig, struct ksignal *ksig, { struct rt_sigframe __user *frame; void __user *restorer; - int err = 0; - void __user *fpstate = NULL; + void __user *fp = NULL; - frame = get_sigframe(&ksig->ka, regs, sizeof(*frame), &fpstate); + frame = get_sigframe(&ksig->ka, regs, sizeof(*frame), &fp); - if (!access_ok(frame, sizeof(*frame))) + if (!user_access_begin(frame, sizeof(*frame))) return -EFAULT; - put_user_try { - put_user_ex(sig, &frame->sig); - put_user_ex(&frame->info, &frame->pinfo); - put_user_ex(&frame->uc, &frame->puc); + unsafe_put_user(sig, &frame->sig, Efault); + 
unsafe_put_user(&frame->info, &frame->pinfo, Efault); + unsafe_put_user(&frame->uc, &frame->puc, Efault); - /* Create the ucontext. */ - if (static_cpu_has(X86_FEATURE_XSAVE)) - put_user_ex(UC_FP_XSTATE, &frame->uc.uc_flags); - else - put_user_ex(0, &frame->uc.uc_flags); - put_user_ex(0, &frame->uc.uc_link); - save_altstack_ex(&frame->uc.uc_stack, regs->sp); + /* Create the ucontext. */ + if (static_cpu_has(X86_FEATURE_XSAVE)) + unsafe_put_user(UC_FP_XSTATE, &frame->uc.uc_flags, Efault); + else + unsafe_put_user(0, &frame->uc.uc_flags, Efault); + unsafe_put_user(0, &frame->uc.uc_link, Efault); + unsafe_save_altstack(&frame->uc.uc_stack, regs->sp, Efault); - /* Set up to return from userspace. */ - restorer = current->mm->context.vdso + - vdso_image_32.sym___kernel_rt_sigreturn; - if (ksig->ka.sa.sa_flags & SA_RESTORER) - restorer = ksig->ka.sa.sa_restorer; - put_user_ex(restorer, &frame->pretcode); + /* Set up to return from userspace. */ + restorer = current->mm->context.vdso + + vdso_image_32.sym___kernel_rt_sigreturn; + if (ksig->ka.sa.sa_flags & SA_RESTORER) + restorer = ksig->ka.sa.sa_restorer; + unsafe_put_user(restorer, &frame->pretcode, Efault); - /* - * This is movl $__NR_rt_sigreturn, %ax ; int $0x80 - * - * WE DO NOT USE IT ANY MORE! It's only left here for historical - * reasons and because gdb uses it as a signature to notice - * signal handler stack frames. - */ - put_user_ex(*((u64 *)&rt_retcode), (u64 *)frame->retcode); - } put_user_catch(err); + /* + * This is movl $__NR_rt_sigreturn, %ax ; int $0x80 + * + * WE DO NOT USE IT ANY MORE! It's only left here for historical + * reasons and because gdb uses it as a signature to notice + * signal handler stack frames. 
+ */ + unsafe_put_user(*((u64 *)&rt_retcode), (u64 *)frame->retcode, Efault); + unsafe_put_sigcontext(&frame->uc.uc_mcontext, fp, regs, set, Efault); + unsafe_put_sigmask(set, frame, Efault); + user_access_end(); - err |= copy_siginfo_to_user(&frame->info, &ksig->info); - err |= setup_sigcontext(&frame->uc.uc_mcontext, fpstate, - regs, set->sig[0]); - err |= __copy_to_user(&frame->uc.uc_sigmask, set, sizeof(*set)); - - if (err) + if (copy_siginfo_to_user(&frame->info, &ksig->info)) return -EFAULT; /* Set up registers for signal handler */ @@ -434,6 +438,9 @@ static int __setup_rt_frame(int sig, struct ksignal *ksig, regs->cs = __USER_CS; return 0; +Efault: + user_access_end(); + return -EFAULT; } #else /* !CONFIG_X86_32 */ static unsigned long frame_uc_flags(struct pt_regs *regs) @@ -457,43 +464,34 @@ static int __setup_rt_frame(int sig, struct ksignal *ksig, struct rt_sigframe __user *frame; void __user *fp = NULL; unsigned long uc_flags; - int err = 0; + + /* x86-64 should always use SA_RESTORER. */ + if (!(ksig->ka.sa.sa_flags & SA_RESTORER)) + return -EFAULT; frame = get_sigframe(&ksig->ka, regs, sizeof(struct rt_sigframe), &fp); + uc_flags = frame_uc_flags(regs); - if (!access_ok(frame, sizeof(*frame))) + if (!user_access_begin(frame, sizeof(*frame))) return -EFAULT; + /* Create the ucontext. */ + unsafe_put_user(uc_flags, &frame->uc.uc_flags, Efault); + unsafe_put_user(0, &frame->uc.uc_link, Efault); + unsafe_save_altstack(&frame->uc.uc_stack, regs->sp, Efault); + + /* Set up to return from userspace. If provided, use a stub + already in userspace. */ + unsafe_put_user(ksig->ka.sa.sa_restorer, &frame->pretcode, Efault); + unsafe_put_sigcontext(&frame->uc.uc_mcontext, fp, regs, set, Efault); + unsafe_put_sigmask(set, frame, Efault); + user_access_end(); + if (ksig->ka.sa.sa_flags & SA_SIGINFO) { if (copy_siginfo_to_user(&frame->info, &ksig->info)) return -EFAULT; } - uc_flags = frame_uc_flags(regs); - - put_user_try { - /* Create the ucontext. 
*/ - put_user_ex(uc_flags, &frame->uc.uc_flags); - put_user_ex(0, &frame->uc.uc_link); - save_altstack_ex(&frame->uc.uc_stack, regs->sp); - - /* Set up to return from userspace. If provided, use a stub - already in userspace. */ - /* x86-64 should always use SA_RESTORER. */ - if (ksig->ka.sa.sa_flags & SA_RESTORER) { - put_user_ex(ksig->ka.sa.sa_restorer, &frame->pretcode); - } else { - /* could use a vstub here */ - err |= -EFAULT; - } - } put_user_catch(err); - - err |= setup_sigcontext(&frame->uc.uc_mcontext, fp, regs, set->sig[0]); - err |= __copy_to_user(&frame->uc.uc_sigmask, set, sizeof(*set)); - - if (err) - return -EFAULT; - /* Set up registers for signal handler */ regs->di = sig; /* In case the signal handler was declared without prototypes */ @@ -516,7 +514,7 @@ static int __setup_rt_frame(int sig, struct ksignal *ksig, * SS descriptor, but we do need SS to be valid. It's possible * that the old SS is entirely bogus -- this can happen if the * signal we're trying to deliver is #GP or #SS caused by a bad - * SS value. We also have a compatbility issue here: DOSEMU + * SS value. We also have a compatibility issue here: DOSEMU * relies on the contents of the SS register indicating the * SS value at the time of the signal, even though that code in * DOSEMU predates sigreturn's ability to restore SS. 
(DOSEMU @@ -530,9 +528,38 @@ static int __setup_rt_frame(int sig, struct ksignal *ksig, force_valid_ss(regs); return 0; + +Efault: + user_access_end(); + return -EFAULT; } #endif /* CONFIG_X86_32 */ +#ifdef CONFIG_X86_X32_ABI +static int x32_copy_siginfo_to_user(struct compat_siginfo __user *to, + const struct kernel_siginfo *from) +{ + struct compat_siginfo new; + + copy_siginfo_to_external32(&new, from); + if (from->si_signo == SIGCHLD) { + new._sifields._sigchld_x32._utime = from->si_utime; + new._sifields._sigchld_x32._stime = from->si_stime; + } + if (copy_to_user(to, &new, sizeof(struct compat_siginfo))) + return -EFAULT; + return 0; +} + +int copy_siginfo_to_user32(struct compat_siginfo __user *to, + const struct kernel_siginfo *from) +{ + if (in_x32_syscall()) + return x32_copy_siginfo_to_user(to, from); + return __copy_siginfo_to_user32(to, from); +} +#endif /* CONFIG_X86_X32_ABI */ + static int x32_setup_rt_frame(struct ksignal *ksig, compat_sigset_t *set, struct pt_regs *regs) @@ -541,44 +568,33 @@ static int x32_setup_rt_frame(struct ksignal *ksig, struct rt_sigframe_x32 __user *frame; unsigned long uc_flags; void __user *restorer; - int err = 0; - void __user *fpstate = NULL; - - frame = get_sigframe(&ksig->ka, regs, sizeof(*frame), &fpstate); + void __user *fp = NULL; - if (!access_ok(frame, sizeof(*frame))) + if (!(ksig->ka.sa.sa_flags & SA_RESTORER)) return -EFAULT; - if (ksig->ka.sa.sa_flags & SA_SIGINFO) { - if (__copy_siginfo_to_user32(&frame->info, &ksig->info, true)) - return -EFAULT; - } + frame = get_sigframe(&ksig->ka, regs, sizeof(*frame), &fp); uc_flags = frame_uc_flags(regs); - put_user_try { - /* Create the ucontext. 
*/ - put_user_ex(uc_flags, &frame->uc.uc_flags); - put_user_ex(0, &frame->uc.uc_link); - compat_save_altstack_ex(&frame->uc.uc_stack, regs->sp); - put_user_ex(0, &frame->uc.uc__pad0); - - if (ksig->ka.sa.sa_flags & SA_RESTORER) { - restorer = ksig->ka.sa.sa_restorer; - } else { - /* could use a vstub here */ - restorer = NULL; - err |= -EFAULT; - } - put_user_ex(restorer, (unsigned long __user *)&frame->pretcode); - } put_user_catch(err); + if (!user_access_begin(frame, sizeof(*frame))) + return -EFAULT; - err |= setup_sigcontext(&frame->uc.uc_mcontext, fpstate, - regs, set->sig[0]); - err |= __copy_to_user(&frame->uc.uc_sigmask, set, sizeof(*set)); + /* Create the ucontext. */ + unsafe_put_user(uc_flags, &frame->uc.uc_flags, Efault); + unsafe_put_user(0, &frame->uc.uc_link, Efault); + unsafe_compat_save_altstack(&frame->uc.uc_stack, regs->sp, Efault); + unsafe_put_user(0, &frame->uc.uc__pad0, Efault); + restorer = ksig->ka.sa.sa_restorer; + unsafe_put_user(restorer, (unsigned long __user *)&frame->pretcode, Efault); + unsafe_put_sigcontext(&frame->uc.uc_mcontext, fp, regs, set, Efault); + unsafe_put_sigmask(set, frame, Efault); + user_access_end(); - if (err) - return -EFAULT; + if (ksig->ka.sa.sa_flags & SA_SIGINFO) { + if (x32_copy_siginfo_to_user(&frame->info, &ksig->info)) + return -EFAULT; + } /* Set up registers for signal handler */ regs->sp = (unsigned long) frame; @@ -597,6 +613,11 @@ static int x32_setup_rt_frame(struct ksignal *ksig, #endif /* CONFIG_X86_X32_ABI */ return 0; +#ifdef CONFIG_X86_X32_ABI +Efault: + user_access_end(); + return -EFAULT; +#endif } /* @@ -613,9 +634,8 @@ SYSCALL_DEFINE0(sigreturn) if (!access_ok(frame, sizeof(*frame))) goto badframe; - if (__get_user(set.sig[0], &frame->sc.oldmask) || (_NSIG_WORDS > 1 - && __copy_from_user(&set.sig[1], &frame->extramask, - sizeof(frame->extramask)))) + if (__get_user(set.sig[0], &frame->sc.oldmask) || + __get_user(set.sig[1], &frame->extramask[0])) goto badframe; set_current_blocked(&set); @@ 
-624,7 +644,7 @@ SYSCALL_DEFINE0(sigreturn) * x86_32 has no uc_flags bits relevant to restore_sigcontext. * Save a few cycles by skipping the __get_user. */ - if (restore_sigcontext(regs, &frame->sc, 0)) + if (!restore_sigcontext(regs, &frame->sc, 0)) goto badframe; return regs->ax; @@ -645,14 +665,14 @@ SYSCALL_DEFINE0(rt_sigreturn) frame = (struct rt_sigframe __user *)(regs->sp - sizeof(long)); if (!access_ok(frame, sizeof(*frame))) goto badframe; - if (__copy_from_user(&set, &frame->uc.uc_sigmask, sizeof(set))) + if (__get_user(*(__u64 *)&set, (__u64 __user *)&frame->uc.uc_sigmask)) goto badframe; if (__get_user(uc_flags, &frame->uc.uc_flags)) goto badframe; set_current_blocked(&set); - if (restore_sigcontext(regs, &frame->uc.uc_mcontext, uc_flags)) + if (!restore_sigcontext(regs, &frame->uc.uc_mcontext, uc_flags)) goto badframe; if (restore_altstack(&frame->uc.uc_stack)) @@ -665,6 +685,64 @@ badframe: return 0; } +/* + * There are four different struct types for signal frame: sigframe_ia32, + * rt_sigframe_ia32, rt_sigframe_x32, and rt_sigframe. Use the worst case + * -- the largest size. It means the size for 64-bit apps is a bit more + * than needed, but this keeps the code simple. + */ +#if defined(CONFIG_X86_32) || defined(CONFIG_IA32_EMULATION) +# define MAX_FRAME_SIGINFO_UCTXT_SIZE sizeof(struct sigframe_ia32) +#else +# define MAX_FRAME_SIGINFO_UCTXT_SIZE sizeof(struct rt_sigframe) +#endif + +/* + * The FP state frame contains an XSAVE buffer which must be 64-byte aligned. + * If a signal frame starts at an unaligned address, extra space is required. + * This is the max alignment padding, conservatively. 
+ */ +#define MAX_XSAVE_PADDING 63UL + +/* + * The frame data is composed of the following areas and laid out as: + * + * ------------------------- + * | alignment padding | + * ------------------------- + * | (f)xsave frame | + * ------------------------- + * | fsave header | + * ------------------------- + * | alignment padding | + * ------------------------- + * | siginfo + ucontext | + * ------------------------- + */ + +/* max_frame_size tells userspace the worst case signal stack size. */ +static unsigned long __ro_after_init max_frame_size; +static unsigned int __ro_after_init fpu_default_state_size; + +void __init init_sigframe_size(void) +{ + fpu_default_state_size = fpu__get_fpstate_size(); + + max_frame_size = MAX_FRAME_SIGINFO_UCTXT_SIZE + MAX_FRAME_PADDING; + + max_frame_size += fpu_default_state_size + MAX_XSAVE_PADDING; + + /* Userspace expects an aligned size. */ + max_frame_size = round_up(max_frame_size, FRAME_ALIGNMENT); + + pr_info("max sigframe size: %lu\n", max_frame_size); +} + +unsigned long get_sigframe_size(void) +{ + return max_frame_size; +} + static inline int is_ia32_compat_frame(struct ksignal *ksig) { return IS_ENABLED(CONFIG_IA32_EMULATION) && @@ -715,7 +793,7 @@ handle_signal(struct ksignal *ksig, struct pt_regs *regs) save_v86_state((struct kernel_vm86_regs *) regs, VM86_SIGNAL); /* Are we from a system call? */ - if (syscall_get_nr(current, regs) >= 0) { + if (syscall_get_nr(current, regs) != -1) { /* If so, check system call restarting.. */ switch (syscall_get_error(current, regs)) { case -ERESTART_RESTARTBLOCK: @@ -728,7 +806,7 @@ handle_signal(struct ksignal *ksig, struct pt_regs *regs) regs->ax = -EINTR; break; } - /* fallthrough */ + fallthrough; case -ERESTARTNOINTR: regs->ax = regs->orig_ax; regs->ip -= 2; @@ -761,37 +839,15 @@ handle_signal(struct ksignal *ksig, struct pt_regs *regs) /* * Ensure the signal handler starts with the new fpu state. 
*/ - fpu__clear(fpu); + fpu__clear_user_states(fpu); } signal_setup_done(failed, ksig, stepping); } static inline unsigned long get_nr_restart_syscall(const struct pt_regs *regs) { - /* - * This function is fundamentally broken as currently - * implemented. - * - * The idea is that we want to trigger a call to the - * restart_block() syscall and that we want in_ia32_syscall(), - * in_x32_syscall(), etc. to match whatever they were in the - * syscall being restarted. We assume that the syscall - * instruction at (regs->ip - 2) matches whatever syscall - * instruction we used to enter in the first place. - * - * The problem is that we can get here when ptrace pokes - * syscall-like values into regs even if we're not in a syscall - * at all. - * - * For now, we maintain historical behavior and guess based on - * stored state. We could do better by saving the actual - * syscall arch in restart_block or (with caveats on x32) by - * checking if regs->ip points to 'int $0x80'. The current - * behavior is incorrect if a tracer has a different bitness - * than the tracee. - */ #ifdef CONFIG_IA32_EMULATION - if (current_thread_info()->status & (TS_COMPAT|TS_I386_REGS_POKED)) + if (current->restart_block.arch_data & TS_COMPAT) return __NR_ia32_restart_syscall; #endif #ifdef CONFIG_X86_X32_ABI @@ -806,7 +862,7 @@ static inline unsigned long get_nr_restart_syscall(const struct pt_regs *regs) * want to handle. Thus you cannot kill init even with a SIGKILL even by * mistake. */ -void do_signal(struct pt_regs *regs) +void arch_do_signal_or_restart(struct pt_regs *regs) { struct ksignal ksig; @@ -817,7 +873,7 @@ void do_signal(struct pt_regs *regs) } /* Did we come from a system call? 
*/ - if (syscall_get_nr(current, regs) >= 0) { + if (syscall_get_nr(current, regs) != -1) { /* Restart the system call - no handlers present */ switch (syscall_get_error(current, regs)) { case -ERESTARTNOHAND: @@ -858,8 +914,64 @@ void signal_fault(struct pt_regs *regs, void __user *frame, char *where) force_sig(SIGSEGV); } +#ifdef CONFIG_DYNAMIC_SIGFRAME +#ifdef CONFIG_STRICT_SIGALTSTACK_SIZE +static bool strict_sigaltstack_size __ro_after_init = true; +#else +static bool strict_sigaltstack_size __ro_after_init = false; +#endif + +static int __init strict_sas_size(char *arg) +{ + return kstrtobool(arg, &strict_sigaltstack_size); +} +__setup("strict_sas_size", strict_sas_size); + +/* + * MINSIGSTKSZ is 2048 and can't be changed despite the fact that AVX512 + * exceeds that size already. As such programs might never use the + * sigaltstack they just continued to work. While always checking against + * the real size would be correct, this might be considered a regression. + * + * Therefore avoid the sanity check, unless enforced by kernel + * configuration or command line option. + * + * When dynamic FPU features are supported, the check is also enforced when + * the task has permissions to use dynamic features. Tasks which have no + * permission are checked against the size of the non-dynamic feature set + * if strict checking is enabled. This avoids forcing all tasks on the + * system to allocate large sigaltstacks even if they are never going + * to use a dynamic feature. As this is serialized via sighand::siglock + * any permission request for a dynamic feature either happened already + * or will see the newly install sigaltstack size in the permission checks. 
+ */ +bool sigaltstack_size_valid(size_t ss_size) +{ + unsigned long fsize = max_frame_size - fpu_default_state_size; + u64 mask; + + lockdep_assert_held(&current->sighand->siglock); + + if (!fpu_state_size_dynamic() && !strict_sigaltstack_size) + return true; + + fsize += current->group_leader->thread.fpu.perm.__user_state_size; + if (likely(ss_size > fsize)) + return true; + + if (strict_sigaltstack_size) + return ss_size > fsize; + + mask = current->group_leader->thread.fpu.perm.__state_perm; + if (mask & XFEATURE_MASK_USER_DYNAMIC) + return ss_size > fsize; + + return true; +} +#endif /* CONFIG_DYNAMIC_SIGFRAME */ + #ifdef CONFIG_X86_X32_ABI -asmlinkage long sys32_x32_rt_sigreturn(void) +COMPAT_SYSCALL_DEFINE0(x32_rt_sigreturn) { struct pt_regs *regs = current_pt_regs(); struct rt_sigframe_x32 __user *frame; @@ -870,14 +982,14 @@ asmlinkage long sys32_x32_rt_sigreturn(void) if (!access_ok(frame, sizeof(*frame))) goto badframe; - if (__copy_from_user(&set, &frame->uc.uc_sigmask, sizeof(set))) + if (__get_user(set.sig[0], (__u64 __user *)&frame->uc.uc_sigmask)) goto badframe; if (__get_user(uc_flags, &frame->uc.uc_flags)) goto badframe; set_current_blocked(&set); - if (restore_sigcontext(regs, &frame->uc.uc_mcontext, uc_flags)) + if (!restore_sigcontext(regs, &frame->uc.uc_mcontext, uc_flags)) goto badframe; if (compat_restore_altstack(&frame->uc.uc_stack)) diff --git a/arch/x86/kernel/signal_compat.c b/arch/x86/kernel/signal_compat.c index 9ccbf0576cd0..879ef8c72f5c 100644 --- a/arch/x86/kernel/signal_compat.c +++ b/arch/x86/kernel/signal_compat.c @@ -27,14 +27,20 @@ static inline void signal_compat_build_tests(void) */ BUILD_BUG_ON(NSIGILL != 11); BUILD_BUG_ON(NSIGFPE != 15); - BUILD_BUG_ON(NSIGSEGV != 7); + BUILD_BUG_ON(NSIGSEGV != 9); BUILD_BUG_ON(NSIGBUS != 5); - BUILD_BUG_ON(NSIGTRAP != 5); + BUILD_BUG_ON(NSIGTRAP != 6); BUILD_BUG_ON(NSIGCHLD != 6); - BUILD_BUG_ON(NSIGSYS != 1); + BUILD_BUG_ON(NSIGSYS != 2); /* This is part of the ABI and can never change in 
size: */ + BUILD_BUG_ON(sizeof(siginfo_t) != 128); BUILD_BUG_ON(sizeof(compat_siginfo_t) != 128); + + /* This is a part of the ABI and can never change in alignment */ + BUILD_BUG_ON(__alignof__(siginfo_t) != 8); + BUILD_BUG_ON(__alignof__(compat_siginfo_t) != 4); + /* * The offsets of all the (unioned) si_fields are fixed * in the ABI, of course. Make sure none of them ever @@ -127,6 +133,9 @@ static inline void signal_compat_build_tests(void) BUILD_BUG_ON(offsetof(siginfo_t, si_addr) != 0x10); BUILD_BUG_ON(offsetof(compat_siginfo_t, si_addr) != 0x0C); + BUILD_BUG_ON(offsetof(siginfo_t, si_trapno) != 0x18); + BUILD_BUG_ON(offsetof(compat_siginfo_t, si_trapno) != 0x10); + BUILD_BUG_ON(offsetof(siginfo_t, si_addr_lsb) != 0x18); BUILD_BUG_ON(offsetof(compat_siginfo_t, si_addr_lsb) != 0x10); @@ -138,6 +147,13 @@ static inline void signal_compat_build_tests(void) BUILD_BUG_ON(offsetof(siginfo_t, si_pkey) != 0x20); BUILD_BUG_ON(offsetof(compat_siginfo_t, si_pkey) != 0x14); + BUILD_BUG_ON(offsetof(siginfo_t, si_perf_data) != 0x18); + BUILD_BUG_ON(offsetof(siginfo_t, si_perf_type) != 0x20); + BUILD_BUG_ON(offsetof(siginfo_t, si_perf_flags) != 0x24); + BUILD_BUG_ON(offsetof(compat_siginfo_t, si_perf_data) != 0x10); + BUILD_BUG_ON(offsetof(compat_siginfo_t, si_perf_type) != 0x14); + BUILD_BUG_ON(offsetof(compat_siginfo_t, si_perf_flags) != 0x18); + CHECK_CSI_OFFSET(_sigpoll); CHECK_CSI_SIZE (_sigpoll, 2*sizeof(int)); CHECK_SI_SIZE (_sigpoll, 4*sizeof(int)); @@ -165,16 +181,9 @@ void sigaction_compat_abi(struct k_sigaction *act, struct k_sigaction *oact) { signal_compat_build_tests(); - /* Don't leak in-kernel non-uapi flags to user-space */ - if (oact) - oact->sa.sa_flags &= ~(SA_IA32_ABI | SA_X32_ABI); - if (!act) return; - /* Don't let flags to be set from userspace */ - act->sa.sa_flags &= ~(SA_IA32_ABI | SA_X32_ABI); - if (in_ia32_syscall()) act->sa.sa_flags |= SA_IA32_ABI; if (in_x32_syscall()) diff --git a/arch/x86/kernel/smp.c b/arch/x86/kernel/smp.c index 
b8d4e9c3c070..06db901fabe8 100644 --- a/arch/x86/kernel/smp.c +++ b/arch/x86/kernel/smp.c @@ -27,6 +27,7 @@ #include <asm/mmu_context.h> #include <asm/proto.h> #include <asm/apic.h> +#include <asm/idtentry.h> #include <asm/nmi.h> #include <asm/mce.h> #include <asm/trace/irq_vectors.h> @@ -66,7 +67,7 @@ * 5AP. symmetric IO mode (normal Linux operation) not affected. * 'noapic' mode has vector 0xf filled out properly. * 6AP. 'noapic' mode might be affected - fixed in later steppings - * 7AP. We do not assume writes to the LVT deassering IRQs + * 7AP. We do not assume writes to the LVT deasserting IRQs * 8AP. We do not enable low power mode (deep sleep) during MP bootup * 9AP. We do not use mixed mode * @@ -130,13 +131,11 @@ static int smp_stop_nmi_callback(unsigned int val, struct pt_regs *regs) /* * this function calls the 'stop' function on all other CPUs in the system. */ - -asmlinkage __visible void smp_reboot_interrupt(void) +DEFINE_IDTENTRY_SYSVEC(sysvec_reboot) { - ipi_entering_ack_irq(); + ack_APIC_irq(); cpu_emergency_vmxoff(); stop_this_cpu(NULL); - irq_exit(); } static int register_stop_handler(void) @@ -205,7 +204,7 @@ static void native_stop_other_cpus(int wait) } /* * Don't wait longer than 10 ms if the caller didn't - * reqeust it. If wait is true, the machine hangs here if + * request it. If wait is true, the machine hangs here if * one or more CPUs do not reach shutdown state. */ timeout = USEC_PER_MSEC * 10; @@ -221,47 +220,33 @@ static void native_stop_other_cpus(int wait) /* * Reschedule call back. KVM uses this interrupt to force a cpu out of - * guest mode + * guest mode. 
*/ -__visible void __irq_entry smp_reschedule_interrupt(struct pt_regs *regs) +DEFINE_IDTENTRY_SYSVEC_SIMPLE(sysvec_reschedule_ipi) { ack_APIC_irq(); + trace_reschedule_entry(RESCHEDULE_VECTOR); inc_irq_stat(irq_resched_count); - kvm_set_cpu_l1tf_flush_l1d(); - - if (trace_resched_ipi_enabled()) { - /* - * scheduler_ipi() might call irq_enter() as well, but - * nested calls are fine. - */ - irq_enter(); - trace_reschedule_entry(RESCHEDULE_VECTOR); - scheduler_ipi(); - trace_reschedule_exit(RESCHEDULE_VECTOR); - irq_exit(); - return; - } scheduler_ipi(); + trace_reschedule_exit(RESCHEDULE_VECTOR); } -__visible void __irq_entry smp_call_function_interrupt(struct pt_regs *regs) +DEFINE_IDTENTRY_SYSVEC(sysvec_call_function) { - ipi_entering_ack_irq(); + ack_APIC_irq(); trace_call_function_entry(CALL_FUNCTION_VECTOR); inc_irq_stat(irq_call_count); generic_smp_call_function_interrupt(); trace_call_function_exit(CALL_FUNCTION_VECTOR); - exiting_irq(); } -__visible void __irq_entry smp_call_function_single_interrupt(struct pt_regs *r) +DEFINE_IDTENTRY_SYSVEC(sysvec_call_function_single) { - ipi_entering_ack_irq(); + ack_APIC_irq(); trace_call_function_single_entry(CALL_FUNCTION_SINGLE_VECTOR); inc_irq_stat(irq_call_count); generic_smp_call_function_single_interrupt(); trace_call_function_single_exit(CALL_FUNCTION_SINGLE_VECTOR); - exiting_irq(); } static int __init nonmi_ipi_setup(char *str) diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c index 69881b2d446c..3f3ea0287f69 100644 --- a/arch/x86/kernel/smpboot.c +++ b/arch/x86/kernel/smpboot.c @@ -51,10 +51,11 @@ #include <linux/err.h> #include <linux/nmi.h> #include <linux/tboot.h> -#include <linux/stackprotector.h> #include <linux/gfp.h> #include <linux/cpuidle.h> #include <linux/numa.h> +#include <linux/pgtable.h> +#include <linux/overflow.h> #include <asm/acpi.h> #include <asm/desc.h> @@ -63,13 +64,12 @@ #include <asm/realmode.h> #include <asm/cpu.h> #include <asm/numa.h> -#include <asm/pgtable.h> 
#include <asm/tlbflush.h> #include <asm/mtrr.h> #include <asm/mwait.h> #include <asm/apic.h> #include <asm/io_apic.h> -#include <asm/fpu/internal.h> +#include <asm/fpu/api.h> #include <asm/setup.h> #include <asm/uv/uv.h> #include <linux/mc146818rtc.h> @@ -80,6 +80,8 @@ #include <asm/cpu_device_id.h> #include <asm/spec-ctrl.h> #include <asm/hw_irq.h> +#include <asm/stackprotector.h> +#include <asm/sev.h> /* representing HT siblings of each logical CPU */ DEFINE_PER_CPU_READ_MOSTLY(cpumask_var_t, cpu_sibling_map); @@ -93,8 +95,6 @@ EXPORT_PER_CPU_SYMBOL(cpu_core_map); DEFINE_PER_CPU_READ_MOSTLY(cpumask_var_t, cpu_die_map); EXPORT_PER_CPU_SYMBOL(cpu_die_map); -DEFINE_PER_CPU_READ_MOSTLY(cpumask_var_t, cpu_llc_shared_map); - /* Per CPU bogomips and other parameters */ DEFINE_PER_CPU_READ_MOSTLY(struct cpuinfo_x86, cpu_info); EXPORT_PER_CPU_SYMBOL(cpu_info); @@ -183,6 +183,8 @@ static void smp_callin(void) */ set_cpu_sibling_map(raw_smp_processor_id()); + ap_init_aperfmperf(); + /* * Get our bogomips. * Update loops_per_jiffy in cpu_data. 
Previous call to @@ -222,10 +224,9 @@ static void notrace start_secondary(void *unused) load_cr3(swapper_pg_dir); __flush_tlb_all(); #endif - load_current_idt(); - cpu_init(); + cpu_init_secondary(); + rcu_cpu_starting(raw_smp_processor_id()); x86_cpuinit.early_percpu_clock_init(); - preempt_disable(); smp_callin(); enable_start_cpu0 = 0; @@ -255,9 +256,6 @@ static void notrace start_secondary(void *unused) /* enable local interrupts */ local_irq_enable(); - /* to prevent fake stack check failure in clock setup */ - boot_init_stack_canary(); - x86_cpuinit.setup_percpu_clockev(); wmb(); @@ -450,29 +448,67 @@ static bool match_smt(struct cpuinfo_x86 *c, struct cpuinfo_x86 *o) return false; } +static bool match_die(struct cpuinfo_x86 *c, struct cpuinfo_x86 *o) +{ + if (c->phys_proc_id == o->phys_proc_id && + c->cpu_die_id == o->cpu_die_id) + return true; + return false; +} + +static bool match_l2c(struct cpuinfo_x86 *c, struct cpuinfo_x86 *o) +{ + int cpu1 = c->cpu_index, cpu2 = o->cpu_index; + + /* If the arch didn't set up l2c_id, fall back to SMT */ + if (per_cpu(cpu_l2c_id, cpu1) == BAD_APICID) + return match_smt(c, o); + + /* Do not match if L2 cache id does not match: */ + if (per_cpu(cpu_l2c_id, cpu1) != per_cpu(cpu_l2c_id, cpu2)) + return false; + + return topology_sane(c, o, "l2c"); +} + /* - * Define snc_cpu[] for SNC (Sub-NUMA Cluster) CPUs. + * Unlike the other levels, we do not enforce keeping a + * multicore group inside a NUMA node. If this happens, we will + * discard the MC level of the topology later. + */ +static bool match_pkg(struct cpuinfo_x86 *c, struct cpuinfo_x86 *o) +{ + if (c->phys_proc_id == o->phys_proc_id) + return true; + return false; +} + +/* + * Define intel_cod_cpu[] for Intel COD (Cluster-on-Die) CPUs. * - * These are Intel CPUs that enumerate an LLC that is shared by - * multiple NUMA nodes. 
The LLC on these systems is shared for - * off-package data access but private to the NUMA node (half - * of the package) for on-package access. + * Any Intel CPU that has multiple nodes per package and does not + * match intel_cod_cpu[] has the SNC (Sub-NUMA Cluster) topology. * - * CPUID (the source of the information about the LLC) can only - * enumerate the cache as being shared *or* unshared, but not - * this particular configuration. The CPU in this case enumerates - * the cache to be shared across the entire package (spanning both - * NUMA nodes). + * When in SNC mode, these CPUs enumerate an LLC that is shared + * by multiple NUMA nodes. The LLC is shared for off-package data + * access but private to the NUMA node (half of the package) for + * on-package access. CPUID (the source of the information about + * the LLC) can only enumerate the cache as shared or unshared, + * but not this particular configuration. */ -static const struct x86_cpu_id snc_cpu[] = { - { X86_VENDOR_INTEL, 6, INTEL_FAM6_SKYLAKE_X }, +static const struct x86_cpu_id intel_cod_cpu[] = { + X86_MATCH_INTEL_FAM6_MODEL(HASWELL_X, 0), /* COD */ + X86_MATCH_INTEL_FAM6_MODEL(BROADWELL_X, 0), /* COD */ + X86_MATCH_INTEL_FAM6_MODEL(ANY, 1), /* SNC */ {} }; static bool match_llc(struct cpuinfo_x86 *c, struct cpuinfo_x86 *o) { + const struct x86_cpu_id *id = x86_match_cpu(intel_cod_cpu); int cpu1 = c->cpu_index, cpu2 = o->cpu_index; + bool intel_snc = id && id->driver_data; /* Do not match if we do not have a valid APICID for cpu: */ if (per_cpu(cpu_llc_id, cpu1) == BAD_APICID) @@ -487,34 +523,14 @@ static bool match_llc(struct cpuinfo_x86 *c, struct cpuinfo_x86 *o) * means 'c' does not share the LLC of 'o'. This will be * reflected to userspace. 
*/ - if (!topology_same_node(c, o) && x86_match_cpu(snc_cpu)) + if (match_pkg(c, o) && !topology_same_node(c, o) && intel_snc) return false; return topology_sane(c, o, "llc"); } -/* - * Unlike the other levels, we do not enforce keeping a - * multicore group inside a NUMA node. If this happens, we will - * discard the MC level of the topology later. - */ -static bool match_pkg(struct cpuinfo_x86 *c, struct cpuinfo_x86 *o) -{ - if (c->phys_proc_id == o->phys_proc_id) - return true; - return false; -} - -static bool match_die(struct cpuinfo_x86 *c, struct cpuinfo_x86 *o) -{ - if ((c->phys_proc_id == o->phys_proc_id) && - (c->cpu_die_id == o->cpu_die_id)) - return true; - return false; -} - -#if defined(CONFIG_SCHED_SMT) || defined(CONFIG_SCHED_MC) +#if defined(CONFIG_SCHED_SMT) || defined(CONFIG_SCHED_CLUSTER) || defined(CONFIG_SCHED_MC) static inline int x86_sched_itmt_flags(void) { return sysctl_sched_itmt_enabled ? SD_ASYM_PACKING : 0; @@ -532,22 +548,45 @@ static int x86_smt_flags(void) return cpu_smt_flags() | x86_sched_itmt_flags(); } #endif +#ifdef CONFIG_SCHED_CLUSTER +static int x86_cluster_flags(void) +{ + return cpu_cluster_flags() | x86_sched_itmt_flags(); +} +#endif #endif static struct sched_domain_topology_level x86_numa_in_package_topology[] = { #ifdef CONFIG_SCHED_SMT { cpu_smt_mask, x86_smt_flags, SD_INIT_NAME(SMT) }, #endif +#ifdef CONFIG_SCHED_CLUSTER + { cpu_clustergroup_mask, x86_cluster_flags, SD_INIT_NAME(CLS) }, +#endif #ifdef CONFIG_SCHED_MC { cpu_coregroup_mask, x86_core_flags, SD_INIT_NAME(MC) }, #endif { NULL, }, }; +static struct sched_domain_topology_level x86_hybrid_topology[] = { +#ifdef CONFIG_SCHED_SMT + { cpu_smt_mask, x86_smt_flags, SD_INIT_NAME(SMT) }, +#endif +#ifdef CONFIG_SCHED_MC + { cpu_coregroup_mask, x86_core_flags, SD_INIT_NAME(MC) }, +#endif + { cpu_cpu_mask, SD_INIT_NAME(DIE) }, + { NULL, }, +}; + static struct sched_domain_topology_level x86_topology[] = { #ifdef CONFIG_SCHED_SMT { cpu_smt_mask, x86_smt_flags, 
SD_INIT_NAME(SMT) }, #endif +#ifdef CONFIG_SCHED_CLUSTER + { cpu_clustergroup_mask, x86_cluster_flags, SD_INIT_NAME(CLS) }, +#endif #ifdef CONFIG_SCHED_MC { cpu_coregroup_mask, x86_core_flags, SD_INIT_NAME(MC) }, #endif @@ -575,6 +614,7 @@ void set_cpu_sibling_map(int cpu) if (!has_mp) { cpumask_set_cpu(cpu, topology_sibling_cpumask(cpu)); cpumask_set_cpu(cpu, cpu_llc_shared_mask(cpu)); + cpumask_set_cpu(cpu, cpu_l2c_shared_mask(cpu)); cpumask_set_cpu(cpu, topology_core_cpumask(cpu)); cpumask_set_cpu(cpu, topology_die_cpumask(cpu)); c->booted_cores = 1; @@ -584,14 +624,29 @@ void set_cpu_sibling_map(int cpu) for_each_cpu(i, cpu_sibling_setup_mask) { o = &cpu_data(i); + if (match_pkg(c, o) && !topology_same_node(c, o)) + x86_has_numa_in_package = true; + if ((i == cpu) || (has_smt && match_smt(c, o))) link_mask(topology_sibling_cpumask, cpu, i); if ((i == cpu) || (has_mp && match_llc(c, o))) link_mask(cpu_llc_shared_mask, cpu, i); + if ((i == cpu) || (has_mp && match_l2c(c, o))) + link_mask(cpu_l2c_shared_mask, cpu, i); + + if ((i == cpu) || (has_mp && match_die(c, o))) + link_mask(topology_die_cpumask, cpu, i); } + threads = cpumask_weight(topology_sibling_cpumask(cpu)); + if (threads > __max_smt_threads) + __max_smt_threads = threads; + + for_each_cpu(i, topology_sibling_cpumask(cpu)) + cpu_data(i).smt_active = threads > 1; + /* * This needs a separate iteration over the cpus because we rely on all * topology_sibling_cpumask links to be set-up. @@ -605,8 +660,7 @@ void set_cpu_sibling_map(int cpu) /* * Does this new cpu bringup a new core? 
*/ - if (cpumask_weight( - topology_sibling_cpumask(cpu)) == 1) { + if (threads == 1) { /* * for each core in package, increment * the booted_cores for this new cpu @@ -623,16 +677,7 @@ void set_cpu_sibling_map(int cpu) } else if (i != cpu && !c->booted_cores) c->booted_cores = cpu_data(i).booted_cores; } - if (match_pkg(c, o) && !topology_same_node(c, o)) - x86_has_numa_in_package = true; - - if ((i == cpu) || (has_mp && match_die(c, o))) - link_mask(topology_die_cpumask, cpu, i); } - - threads = cpumask_weight(topology_sibling_cpumask(cpu)); - if (threads > __max_smt_threads) - __max_smt_threads = threads; } /* maps the cpu to the sched domain representing multi-core */ @@ -641,6 +686,11 @@ const struct cpumask *cpu_coregroup_mask(int cpu) return cpu_llc_shared_mask(cpu); } +const struct cpumask *cpu_clustergroup_mask(int cpu) +{ + return cpu_l2c_shared_mask(cpu); +} + static void impress_friends(void) { int cpu; @@ -745,13 +795,14 @@ static void __init smp_quirk_init_udelay(void) int wakeup_secondary_cpu_via_nmi(int apicid, unsigned long start_eip) { + u32 dm = apic->dest_mode_logical ? APIC_DEST_LOGICAL : APIC_DEST_PHYSICAL; unsigned long send_status, accept_status = 0; int maxlvt; /* Target chip */ /* Boot on the stack */ /* Kick the second */ - apic_icr_write(APIC_DM_NMI | apic->dest_logical, apicid); + apic_icr_write(APIC_DM_NMI | dm, apicid); pr_debug("Waiting for send to finish...\n"); send_status = safe_apic_wait_icr_idle(); @@ -978,10 +1029,7 @@ wakeup_cpu_via_init_nmi(int cpu, unsigned long start_ip, int apicid, if (!boot_error) { enable_start_cpu0 = 1; *cpu0_nmi_registered = 1; - if (apic->dest_logical == APIC_DEST_LOGICAL) - id = cpu0_logical_apicid; - else - id = apicid; + id = apic->dest_mode_logical ? 
cpu0_logical_apicid : apicid; boot_error = wakeup_secondary_cpu_via_nmi(id, start_ip); } @@ -999,6 +1047,7 @@ int common_cpu_up(unsigned int cpu, struct task_struct *idle) alternatives_enable_smp(); per_cpu(current_task, cpu) = idle; + cpu_init_stack_canary(cpu, idle); /* Initialize the interrupt stack(s) */ ret = irq_init_percpu_irqstack(cpu); @@ -1029,6 +1078,11 @@ static int do_boot_cpu(int apicid, int cpu, struct task_struct *idle, unsigned long boot_error = 0; unsigned long timeout; +#ifdef CONFIG_X86_64 + /* If 64-bit wakeup method exists, use the 64-bit mode trampoline IP */ + if (apic->wakeup_secondary_cpu_64) + start_ip = real_mode_header->trampoline_start64; +#endif idle->thread.sp = (unsigned long)task_pt_regs(idle); early_gdt_descr.address = (unsigned long)get_cpu_gdt_rw(cpu); initial_code = (unsigned long)start_secondary; @@ -1070,11 +1124,14 @@ static int do_boot_cpu(int apicid, int cpu, struct task_struct *idle, /* * Wake up a CPU in difference cases: - * - Use the method in the APIC driver if it's defined + * - Use a method from the APIC driver if one defined, with wakeup + * straight to 64-bit mode preferred over wakeup to RM. * Otherwise, * - Use an INIT boot APIC message for APs or NMI for BSP. */ - if (apic->wakeup_secondary_cpu) + if (apic->wakeup_secondary_cpu_64) + boot_error = apic->wakeup_secondary_cpu_64(apicid, start_ip); + else if (apic->wakeup_secondary_cpu) boot_error = apic->wakeup_secondary_cpu(apicid, start_ip); else boot_error = wakeup_cpu_via_init_nmi(cpu, start_ip, apicid, @@ -1259,7 +1316,7 @@ static void __init smp_sanity_check(void) nr++; } - nr_cpu_ids = 8; + set_nr_cpu_ids(8); } #endif @@ -1302,12 +1359,7 @@ static void __init smp_get_logical_apicid(void) cpu0_logical_apicid = GET_APIC_LOGICAL_ID(apic_read(APIC_LDR)); } -/* - * Prepare for SMP bootup. - * @max_cpus: configured maximum number of CPUs, It is a legacy parameter - * for common interface support. 
- */ -void __init native_smp_prepare_cpus(unsigned int max_cpus) +void __init smp_prepare_cpus_common(void) { unsigned int i; @@ -1325,6 +1377,7 @@ void __init native_smp_prepare_cpus(unsigned int max_cpus) zalloc_cpumask_var(&per_cpu(cpu_core_map, i), GFP_KERNEL); zalloc_cpumask_var(&per_cpu(cpu_die_map, i), GFP_KERNEL); zalloc_cpumask_var(&per_cpu(cpu_llc_shared_map, i), GFP_KERNEL); + zalloc_cpumask_var(&per_cpu(cpu_l2c_shared_map, i), GFP_KERNEL); } /* @@ -1337,6 +1390,16 @@ void __init native_smp_prepare_cpus(unsigned int max_cpus) set_sched_topology(x86_topology); set_cpu_sibling_map(0); +} + +/* + * Prepare for SMP bootup. + * @max_cpus: configured maximum number of CPUs, It is a legacy parameter + * for common interface support. + */ +void __init native_smp_prepare_cpus(unsigned int max_cpus) +{ + smp_prepare_cpus_common(); smp_sanity_check(); @@ -1370,14 +1433,16 @@ void __init native_smp_prepare_cpus(unsigned int max_cpus) smp_quirk_init_udelay(); speculative_store_bypass_ht_init(); + + snp_set_wakeup_secondary_cpu(); } -void arch_enable_nonboot_cpus_begin(void) +void arch_thaw_secondary_cpus_begin(void) { set_mtrr_aps_delayed_init(); } -void arch_enable_nonboot_cpus_end(void) +void arch_thaw_secondary_cpus_end(void) { mtrr_aps_init(); } @@ -1400,7 +1465,7 @@ void __init calculate_max_logical_packages(void) int ncpus; /* - * Today neither Intel nor AMD support heterogenous systems so + * Today neither Intel nor AMD support heterogeneous systems so * extrapolate the boot cpu's data to all packages. 
*/ ncpus = cpu_data(0).booted_cores * topology_max_smt_threads(); @@ -1414,8 +1479,11 @@ void __init native_smp_cpus_done(unsigned int max_cpus) calculate_max_logical_packages(); + /* XXX for now assume numa-in-package and hybrid don't overlap */ if (x86_has_numa_in_package) set_sched_topology(x86_numa_in_package_topology); + if (cpu_feature_enabled(X86_FEATURE_HYBRID_CPU)) + set_sched_topology(x86_hybrid_topology); nmi_selftest(); impress_friends(); @@ -1434,7 +1502,7 @@ early_param("possible_cpus", _setup_possible_cpus); /* * cpu_possible_mask should be static, it cannot change as cpu's * are onlined, or offlined. The reason is per-cpu data-structures - * are allocated by some modules at init time, and dont expect to + * are allocated by some modules at init time, and don't expect to * do this dynamically on cpu arrival/departure. * cpu_present_mask on the other hand can change dynamically. * In case when cpu_hotplug is not compiled, then we resort to current @@ -1501,7 +1569,7 @@ __init void prefill_possible_map(void) possible = i; } - nr_cpu_ids = possible; + set_nr_cpu_ids(possible); pr_info("Allowing %d CPUs, %d hotplug CPUs\n", possible, max_t(int, possible - num_processors, 0)); @@ -1545,11 +1613,19 @@ static void remove_siblinginfo(int cpu) for_each_cpu(sibling, topology_die_cpumask(cpu)) cpumask_clear_cpu(cpu, topology_die_cpumask(sibling)); - for_each_cpu(sibling, topology_sibling_cpumask(cpu)) + + for_each_cpu(sibling, topology_sibling_cpumask(cpu)) { cpumask_clear_cpu(cpu, topology_sibling_cpumask(sibling)); + if (cpumask_weight(topology_sibling_cpumask(sibling)) == 1) + cpu_data(sibling).smt_active = false; + } + for_each_cpu(sibling, cpu_llc_shared_mask(cpu)) cpumask_clear_cpu(cpu, cpu_llc_shared_mask(sibling)); + for_each_cpu(sibling, cpu_l2c_shared_mask(cpu)) + cpumask_clear_cpu(cpu, cpu_l2c_shared_mask(sibling)); cpumask_clear(cpu_llc_shared_mask(cpu)); + cpumask_clear(cpu_l2c_shared_mask(cpu)); cpumask_clear(topology_sibling_cpumask(cpu)); 
cpumask_clear(topology_core_cpumask(cpu)); cpumask_clear(topology_die_cpumask(cpu)); @@ -1591,14 +1667,28 @@ int native_cpu_disable(void) if (ret) return ret; - /* - * Disable the local APIC. Otherwise IPI broadcasts will reach - * it. It still responds normally to INIT, NMI, SMI, and SIPI - * messages. - */ - apic_soft_disable(); cpu_disable_common(); + /* + * Disable the local APIC. Otherwise IPI broadcasts will reach + * it. It still responds normally to INIT, NMI, SMI, and SIPI + * messages. + * + * Disabling the APIC must happen after cpu_disable_common() + * which invokes fixup_irqs(). + * + * Disabling the APIC preserves already set bits in IRR, but + * an interrupt arriving after disabling the local APIC does not + * set the corresponding IRR bit. + * + * fixup_irqs() scans IRR for set bits so it can raise a not + * yet handled interrupt on the new destination CPU via an IPI + * but obviously it can't do so for IRR bits which are not set. + * IOW, interrupts arriving after disabling the local APIC will + * be lost. + */ + apic_soft_disable(); + return 0; } @@ -1638,13 +1728,17 @@ void play_dead_common(void) local_irq_disable(); } -static bool wakeup_cpu0(void) +/** + * cond_wakeup_cpu0 - Wake up CPU0 if needed. + * + * If NMI wants to wake up CPU0, start CPU0. + */ +void cond_wakeup_cpu0(void) { if (smp_processor_id() == 0 && enable_start_cpu0) - return true; - - return false; + start_cpu0(); } +EXPORT_SYMBOL_GPL(cond_wakeup_cpu0); /* * We need to flush the caches before going to sleep, lest we have @@ -1713,11 +1807,8 @@ static inline void mwait_play_dead(void) __monitor(mwait_ptr, 0, 0); mb(); __mwait(eax, 0); - /* - * If NMI wants to wake up CPU0, start CPU0. - */ - if (wakeup_cpu0()) - start_cpu0(); + + cond_wakeup_cpu0(); } } @@ -1728,11 +1819,8 @@ void hlt_play_dead(void) while (1) { native_halt(); - /* - * If NMI wants to wake up CPU0, start CPU0. 
- */ - if (wakeup_cpu0()) - start_cpu0(); + + cond_wakeup_cpu0(); } } diff --git a/arch/x86/kernel/stacktrace.c b/arch/x86/kernel/stacktrace.c index 2d6898c2cb64..ee117fcf46ed 100644 --- a/arch/x86/kernel/stacktrace.c +++ b/arch/x86/kernel/stacktrace.c @@ -18,23 +18,17 @@ void arch_stack_walk(stack_trace_consume_fn consume_entry, void *cookie, struct unwind_state state; unsigned long addr; - if (regs && !consume_entry(cookie, regs->ip, false)) + if (regs && !consume_entry(cookie, regs->ip)) return; for (unwind_start(&state, task, regs, NULL); !unwind_done(&state); unwind_next_frame(&state)) { addr = unwind_get_return_address(&state); - if (!addr || !consume_entry(cookie, addr, false)) + if (!addr || !consume_entry(cookie, addr)) break; } } -/* - * This function returns an error if it detects any unreliable features of the - * stack. Otherwise it guarantees that the stack trace is reliable. - * - * If the task is not 'current', the caller *must* ensure the task is inactive. - */ int arch_stack_walk_reliable(stack_trace_consume_fn consume_entry, void *cookie, struct task_struct *task) { @@ -58,7 +52,6 @@ int arch_stack_walk_reliable(stack_trace_consume_fn consume_entry, * or a page fault), which can make frame pointers * unreliable. */ - if (IS_ENABLED(CONFIG_FRAME_POINTER)) return -EINVAL; } @@ -73,7 +66,7 @@ int arch_stack_walk_reliable(stack_trace_consume_fn consume_entry, if (!addr) return -EINVAL; - if (!consume_entry(cookie, addr, false)) + if (!consume_entry(cookie, addr)) return -EINVAL; } @@ -81,10 +74,6 @@ int arch_stack_walk_reliable(stack_trace_consume_fn consume_entry, if (unwind_error(&state)) return -EINVAL; - /* Success path for non-user tasks, i.e. 
kthreads and idle tasks */ - if (!(task->flags & (PF_KTHREAD | PF_IDLE))) - return -EINVAL; - return 0; } @@ -96,16 +85,18 @@ struct stack_frame_user { }; static int -copy_stack_frame(const void __user *fp, struct stack_frame_user *frame) +copy_stack_frame(const struct stack_frame_user __user *fp, + struct stack_frame_user *frame) { int ret; - if (__range_not_ok(fp, sizeof(*frame), TASK_SIZE)) + if (!__access_ok(fp, sizeof(*frame))) return 0; ret = 1; pagefault_disable(); - if (__copy_from_user_inatomic(frame, fp, sizeof(*frame))) + if (__get_user(frame->next_fp, &fp->next_fp) || + __get_user(frame->ret_addr, &fp->ret_addr)) ret = 0; pagefault_enable(); @@ -117,7 +108,7 @@ void arch_stack_walk_user(stack_trace_consume_fn consume_entry, void *cookie, { const void __user *fp = (const void __user *)regs->bp; - if (!consume_entry(cookie, regs->ip, false)) + if (!consume_entry(cookie, regs->ip)) return; while (1) { @@ -131,7 +122,7 @@ void arch_stack_walk_user(stack_trace_consume_fn consume_entry, void *cookie, break; if (!frame.ret_addr) break; - if (!consume_entry(cookie, frame.ret_addr, false)) + if (!consume_entry(cookie, frame.ret_addr)) break; fp = frame.next_fp; } diff --git a/arch/x86/kernel/static_call.c b/arch/x86/kernel/static_call.c new file mode 100644 index 000000000000..aaaba85d6d7f --- /dev/null +++ b/arch/x86/kernel/static_call.c @@ -0,0 +1,156 @@ +// SPDX-License-Identifier: GPL-2.0 +#include <linux/static_call.h> +#include <linux/memory.h> +#include <linux/bug.h> +#include <asm/text-patching.h> + +enum insn_type { + CALL = 0, /* site call */ + NOP = 1, /* site cond-call */ + JMP = 2, /* tramp / site tail-call */ + RET = 3, /* tramp / site cond-tail-call */ +}; + +/* + * ud1 %esp, %ecx - a 3 byte #UD that is unique to trampolines, chosen such + * that there is no false-positive trampoline identification while also being a + * speculation stop. 
+ */ +static const u8 tramp_ud[] = { 0x0f, 0xb9, 0xcc }; + +/* + * cs cs cs xorl %eax, %eax - a single 5 byte instruction that clears %[er]ax + */ +static const u8 xor5rax[] = { 0x2e, 0x2e, 0x2e, 0x31, 0xc0 }; + +static const u8 retinsn[] = { RET_INSN_OPCODE, 0xcc, 0xcc, 0xcc, 0xcc }; + +static void __ref __static_call_transform(void *insn, enum insn_type type, + void *func, bool modinit) +{ + const void *emulate = NULL; + int size = CALL_INSN_SIZE; + const void *code; + + switch (type) { + case CALL: + code = text_gen_insn(CALL_INSN_OPCODE, insn, func); + if (func == &__static_call_return0) { + emulate = code; + code = &xor5rax; + } + + break; + + case NOP: + code = x86_nops[5]; + break; + + case JMP: + code = text_gen_insn(JMP32_INSN_OPCODE, insn, func); + break; + + case RET: + if (cpu_feature_enabled(X86_FEATURE_RETHUNK)) + code = text_gen_insn(JMP32_INSN_OPCODE, insn, &__x86_return_thunk); + else + code = &retinsn; + break; + } + + if (memcmp(insn, code, size) == 0) + return; + + if (system_state == SYSTEM_BOOTING || modinit) + return text_poke_early(insn, code, size); + + text_poke_bp(insn, code, size, emulate); +} + +static void __static_call_validate(void *insn, bool tail, bool tramp) +{ + u8 opcode = *(u8 *)insn; + + if (tramp && memcmp(insn+5, tramp_ud, 3)) { + pr_err("trampoline signature fail"); + BUG(); + } + + if (tail) { + if (opcode == JMP32_INSN_OPCODE || + opcode == RET_INSN_OPCODE) + return; + } else { + if (opcode == CALL_INSN_OPCODE || + !memcmp(insn, x86_nops[5], 5) || + !memcmp(insn, xor5rax, 5)) + return; + } + + /* + * If we ever trigger this, our text is corrupt, we'll probably not live long. 
+ */ + pr_err("unexpected static_call insn opcode 0x%x at %pS\n", opcode, insn); + BUG(); +} + +static inline enum insn_type __sc_insn(bool null, bool tail) +{ + /* + * Encode the following table without branches: + * + * tail null insn + * -----+-------+------ + * 0 | 0 | CALL + * 0 | 1 | NOP + * 1 | 0 | JMP + * 1 | 1 | RET + */ + return 2*tail + null; +} + +void arch_static_call_transform(void *site, void *tramp, void *func, bool tail) +{ + mutex_lock(&text_mutex); + + if (tramp) { + __static_call_validate(tramp, true, true); + __static_call_transform(tramp, __sc_insn(!func, true), func, false); + } + + if (IS_ENABLED(CONFIG_HAVE_STATIC_CALL_INLINE) && site) { + __static_call_validate(site, tail, false); + __static_call_transform(site, __sc_insn(!func, tail), func, false); + } + + mutex_unlock(&text_mutex); +} +EXPORT_SYMBOL_GPL(arch_static_call_transform); + +#ifdef CONFIG_RETHUNK +/* + * This is called by apply_returns() to fix up static call trampolines, + * specifically ARCH_DEFINE_STATIC_CALL_NULL_TRAMP which is recorded as + * having a return trampoline. + * + * The problem is that static_call() is available before determining + * X86_FEATURE_RETHUNK and, by implication, running alternatives. + * + * This means that __static_call_transform() above can have overwritten the + * return trampoline and we now need to fix things up to be consistent. + */ +bool __static_call_fixup(void *tramp, u8 op, void *dest) +{ + if (memcmp(tramp+5, tramp_ud, 3)) { + /* Not a trampoline site, not our problem. 
*/ + return false; + } + + mutex_lock(&text_mutex); + if (op == RET_INSN_OPCODE || dest == &__x86_return_thunk) + __static_call_transform(tramp, RET, NULL, true); + mutex_unlock(&text_mutex); + + return true; +} +#endif diff --git a/arch/x86/kernel/step.c b/arch/x86/kernel/step.c index 60d2c3798ba2..8e2b2552b5ee 100644 --- a/arch/x86/kernel/step.c +++ b/arch/x86/kernel/step.c @@ -127,12 +127,17 @@ static int enable_single_step(struct task_struct *child) regs->flags |= X86_EFLAGS_TF; /* - * Always set TIF_SINGLESTEP - this guarantees that - * we single-step system calls etc.. This will also + * Always set TIF_SINGLESTEP. This will also * cause us to set TF when returning to user mode. */ set_tsk_thread_flag(child, TIF_SINGLESTEP); + /* + * Ensure that a trap is triggered once stepping out of a system + * call prior to executing any user instruction. + */ + set_task_syscall_work(child, SYSCALL_EXIT_TRAP); + oflags = regs->flags; /* Set TF on the kernel stack.. */ @@ -175,8 +180,7 @@ void set_task_blockstep(struct task_struct *task, bool on) * * NOTE: this means that set/clear TIF_BLOCKSTEP is only safe if * task is current or it can't be running, otherwise we can race - * with __switch_to_xtra(). We rely on ptrace_freeze_traced() but - * PTRACE_KILL is not safe. + * with __switch_to_xtra(). We rely on ptrace_freeze_traced(). */ local_irq_disable(); debugctl = get_debugctlmsr(); @@ -230,6 +234,7 @@ void user_disable_single_step(struct task_struct *child) /* Always clear TIF_SINGLESTEP... */ clear_tsk_thread_flag(child, TIF_SINGLESTEP); + clear_task_syscall_work(child, SYSCALL_EXIT_TRAP); /* But touch TF only if it was set by us.. 
*/ if (test_and_clear_tsk_thread_flag(child, TIF_FORCED_TF)) diff --git a/arch/x86/kernel/sys_ia32.c b/arch/x86/kernel/sys_ia32.c new file mode 100644 index 000000000000..6cf65397d225 --- /dev/null +++ b/arch/x86/kernel/sys_ia32.c @@ -0,0 +1,256 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * sys_ia32.c: Conversion between 32bit and 64bit native syscalls. Based on + * sys_sparc32 + * + * Copyright (C) 2000 VA Linux Co + * Copyright (C) 2000 Don Dugger <n0ano@valinux.com> + * Copyright (C) 1999 Arun Sharma <arun.sharma@intel.com> + * Copyright (C) 1997,1998 Jakub Jelinek (jj@sunsite.mff.cuni.cz) + * Copyright (C) 1997 David S. Miller (davem@caip.rutgers.edu) + * Copyright (C) 2000 Hewlett-Packard Co. + * Copyright (C) 2000 David Mosberger-Tang <davidm@hpl.hp.com> + * Copyright (C) 2000,2001,2002 Andi Kleen, SuSE Labs (x86-64 port) + * + * These routines maintain argument size conversion between 32bit and 64bit + * environment. In 2.5 most of this should be moved to a generic directory. + * + * This file assumes that there is a hole at the end of user address space. + * + * Some of the functions are LE specific currently. These are + * hopefully all marked. This should be fixed. 
+ */ + +#include <linux/kernel.h> +#include <linux/sched.h> +#include <linux/fs.h> +#include <linux/file.h> +#include <linux/signal.h> +#include <linux/syscalls.h> +#include <linux/times.h> +#include <linux/utsname.h> +#include <linux/mm.h> +#include <linux/uio.h> +#include <linux/poll.h> +#include <linux/personality.h> +#include <linux/stat.h> +#include <linux/rwsem.h> +#include <linux/compat.h> +#include <linux/vfs.h> +#include <linux/ptrace.h> +#include <linux/highuid.h> +#include <linux/sysctl.h> +#include <linux/slab.h> +#include <linux/sched/task.h> +#include <asm/mman.h> +#include <asm/types.h> +#include <linux/uaccess.h> +#include <linux/atomic.h> +#include <asm/vgtod.h> +#include <asm/ia32.h> + +#define AA(__x) ((unsigned long)(__x)) + +SYSCALL_DEFINE3(ia32_truncate64, const char __user *, filename, + unsigned long, offset_low, unsigned long, offset_high) +{ + return ksys_truncate(filename, + ((loff_t) offset_high << 32) | offset_low); +} + +SYSCALL_DEFINE3(ia32_ftruncate64, unsigned int, fd, + unsigned long, offset_low, unsigned long, offset_high) +{ + return ksys_ftruncate(fd, ((loff_t) offset_high << 32) | offset_low); +} + +/* warning: next two assume little endian */ +SYSCALL_DEFINE5(ia32_pread64, unsigned int, fd, char __user *, ubuf, + u32, count, u32, poslo, u32, poshi) +{ + return ksys_pread64(fd, ubuf, count, + ((loff_t)AA(poshi) << 32) | AA(poslo)); +} + +SYSCALL_DEFINE5(ia32_pwrite64, unsigned int, fd, const char __user *, ubuf, + u32, count, u32, poslo, u32, poshi) +{ + return ksys_pwrite64(fd, ubuf, count, + ((loff_t)AA(poshi) << 32) | AA(poslo)); +} + + +/* + * Some system calls that need sign extended arguments. This could be + * done by a generic wrapper. 
+ */ +SYSCALL_DEFINE6(ia32_fadvise64_64, int, fd, __u32, offset_low, + __u32, offset_high, __u32, len_low, __u32, len_high, + int, advice) +{ + return ksys_fadvise64_64(fd, + (((u64)offset_high)<<32) | offset_low, + (((u64)len_high)<<32) | len_low, + advice); +} + +SYSCALL_DEFINE4(ia32_readahead, int, fd, unsigned int, off_lo, + unsigned int, off_hi, size_t, count) +{ + return ksys_readahead(fd, ((u64)off_hi << 32) | off_lo, count); +} + +SYSCALL_DEFINE6(ia32_sync_file_range, int, fd, unsigned int, off_low, + unsigned int, off_hi, unsigned int, n_low, + unsigned int, n_hi, int, flags) +{ + return ksys_sync_file_range(fd, + ((u64)off_hi << 32) | off_low, + ((u64)n_hi << 32) | n_low, flags); +} + +SYSCALL_DEFINE5(ia32_fadvise64, int, fd, unsigned int, offset_lo, + unsigned int, offset_hi, size_t, len, int, advice) +{ + return ksys_fadvise64_64(fd, ((u64)offset_hi << 32) | offset_lo, + len, advice); +} + +SYSCALL_DEFINE6(ia32_fallocate, int, fd, int, mode, + unsigned int, offset_lo, unsigned int, offset_hi, + unsigned int, len_lo, unsigned int, len_hi) +{ + return ksys_fallocate(fd, mode, ((u64)offset_hi << 32) | offset_lo, + ((u64)len_hi << 32) | len_lo); +} + +#ifdef CONFIG_IA32_EMULATION +/* + * Another set for IA32/LFS -- x86_64 struct stat is different due to + * support for 64bit inode numbers. 
+ */ +static int cp_stat64(struct stat64 __user *ubuf, struct kstat *stat) +{ + typeof(ubuf->st_uid) uid = 0; + typeof(ubuf->st_gid) gid = 0; + SET_UID(uid, from_kuid_munged(current_user_ns(), stat->uid)); + SET_GID(gid, from_kgid_munged(current_user_ns(), stat->gid)); + if (!user_write_access_begin(ubuf, sizeof(struct stat64))) + return -EFAULT; + unsafe_put_user(huge_encode_dev(stat->dev), &ubuf->st_dev, Efault); + unsafe_put_user(stat->ino, &ubuf->__st_ino, Efault); + unsafe_put_user(stat->ino, &ubuf->st_ino, Efault); + unsafe_put_user(stat->mode, &ubuf->st_mode, Efault); + unsafe_put_user(stat->nlink, &ubuf->st_nlink, Efault); + unsafe_put_user(uid, &ubuf->st_uid, Efault); + unsafe_put_user(gid, &ubuf->st_gid, Efault); + unsafe_put_user(huge_encode_dev(stat->rdev), &ubuf->st_rdev, Efault); + unsafe_put_user(stat->size, &ubuf->st_size, Efault); + unsafe_put_user(stat->atime.tv_sec, &ubuf->st_atime, Efault); + unsafe_put_user(stat->atime.tv_nsec, &ubuf->st_atime_nsec, Efault); + unsafe_put_user(stat->mtime.tv_sec, &ubuf->st_mtime, Efault); + unsafe_put_user(stat->mtime.tv_nsec, &ubuf->st_mtime_nsec, Efault); + unsafe_put_user(stat->ctime.tv_sec, &ubuf->st_ctime, Efault); + unsafe_put_user(stat->ctime.tv_nsec, &ubuf->st_ctime_nsec, Efault); + unsafe_put_user(stat->blksize, &ubuf->st_blksize, Efault); + unsafe_put_user(stat->blocks, &ubuf->st_blocks, Efault); + user_access_end(); + return 0; +Efault: + user_write_access_end(); + return -EFAULT; +} + +COMPAT_SYSCALL_DEFINE2(ia32_stat64, const char __user *, filename, + struct stat64 __user *, statbuf) +{ + struct kstat stat; + int ret = vfs_stat(filename, &stat); + + if (!ret) + ret = cp_stat64(statbuf, &stat); + return ret; +} + +COMPAT_SYSCALL_DEFINE2(ia32_lstat64, const char __user *, filename, + struct stat64 __user *, statbuf) +{ + struct kstat stat; + int ret = vfs_lstat(filename, &stat); + if (!ret) + ret = cp_stat64(statbuf, &stat); + return ret; +} + +COMPAT_SYSCALL_DEFINE2(ia32_fstat64, unsigned int, fd, + 
struct stat64 __user *, statbuf) +{ + struct kstat stat; + int ret = vfs_fstat(fd, &stat); + if (!ret) + ret = cp_stat64(statbuf, &stat); + return ret; +} + +COMPAT_SYSCALL_DEFINE4(ia32_fstatat64, unsigned int, dfd, + const char __user *, filename, + struct stat64 __user *, statbuf, int, flag) +{ + struct kstat stat; + int error; + + error = vfs_fstatat(dfd, filename, &stat, flag); + if (error) + return error; + return cp_stat64(statbuf, &stat); +} + +/* + * Linux/i386 didn't use to be able to handle more than + * 4 system call parameters, so these system calls used a memory + * block for parameter passing.. + */ + +struct mmap_arg_struct32 { + unsigned int addr; + unsigned int len; + unsigned int prot; + unsigned int flags; + unsigned int fd; + unsigned int offset; +}; + +COMPAT_SYSCALL_DEFINE1(ia32_mmap, struct mmap_arg_struct32 __user *, arg) +{ + struct mmap_arg_struct32 a; + + if (copy_from_user(&a, arg, sizeof(a))) + return -EFAULT; + + if (a.offset & ~PAGE_MASK) + return -EINVAL; + + return ksys_mmap_pgoff(a.addr, a.len, a.prot, a.flags, a.fd, + a.offset>>PAGE_SHIFT); +} + +/* + * The 32-bit clone ABI is CONFIG_CLONE_BACKWARDS + */ +COMPAT_SYSCALL_DEFINE5(ia32_clone, unsigned long, clone_flags, + unsigned long, newsp, int __user *, parent_tidptr, + unsigned long, tls_val, int __user *, child_tidptr) +{ + struct kernel_clone_args args = { + .flags = (clone_flags & ~CSIGNAL), + .pidfd = parent_tidptr, + .child_tid = child_tidptr, + .parent_tid = parent_tidptr, + .exit_signal = (clone_flags & CSIGNAL), + .stack = newsp, + .tls = tls_val, + }; + + return kernel_clone(&args); +} +#endif /* CONFIG_IA32_EMULATION */ diff --git a/arch/x86/kernel/sys_x86_64.c b/arch/x86/kernel/sys_x86_64.c index ca3c11a17b5a..8cc653ffdccd 100644 --- a/arch/x86/kernel/sys_x86_64.c +++ b/arch/x86/kernel/sys_x86_64.c @@ -21,7 +21,6 @@ #include <asm/elf.h> #include <asm/ia32.h> -#include <asm/syscalls.h> /* * Align a virtual address to avoid aliasing in the I$ on AMD F15h. 
@@ -69,9 +68,6 @@ static int __init control_va_addr_alignment(char *str) if (*str == 0) return 1; - if (*str == '=') - str++; - if (!strcmp(str, "32")) va_align.flags = ALIGN_VA_32; else if (!strcmp(str, "64")) @@ -81,24 +77,20 @@ static int __init control_va_addr_alignment(char *str) else if (!strcmp(str, "on")) va_align.flags = ALIGN_VA_32 | ALIGN_VA_64; else - return 0; + pr_warn("invalid option value: 'align_va_addr=%s'\n", str); return 1; } -__setup("align_va_addr", control_va_addr_alignment); +__setup("align_va_addr=", control_va_addr_alignment); SYSCALL_DEFINE6(mmap, unsigned long, addr, unsigned long, len, unsigned long, prot, unsigned long, flags, unsigned long, fd, unsigned long, off) { - long error; - error = -EINVAL; if (off & ~PAGE_MASK) - goto out; + return -EINVAL; - error = ksys_mmap_pgoff(addr, len, prot, flags, fd, off >> PAGE_SHIFT); -out: - return error; + return ksys_mmap_pgoff(addr, len, prot, flags, fd, off >> PAGE_SHIFT); } static void find_start_end(unsigned long addr, unsigned long flags, diff --git a/arch/x86/kernel/sysfb.c b/arch/x86/kernel/sysfb.c deleted file mode 100644 index 014ebd8ca869..000000000000 --- a/arch/x86/kernel/sysfb.c +++ /dev/null @@ -1,70 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0-or-later -/* - * Generic System Framebuffers on x86 - * Copyright (c) 2012-2013 David Herrmann <dh.herrmann@gmail.com> - */ - -/* - * Simple-Framebuffer support for x86 systems - * Create a platform-device for any available boot framebuffer. The - * simple-framebuffer platform device is already available on DT systems, so - * this module parses the global "screen_info" object and creates a suitable - * platform device compatible with the "simple-framebuffer" DT object. If - * the framebuffer is incompatible, we instead create a legacy - * "vesa-framebuffer", "efi-framebuffer" or "platform-framebuffer" device and - * pass the screen_info as platform_data. 
This allows legacy drivers - * to pick these devices up without messing with simple-framebuffer drivers. - * The global "screen_info" is still valid at all times. - * - * If CONFIG_X86_SYSFB is not selected, we never register "simple-framebuffer" - * platform devices, but only use legacy framebuffer devices for - * backwards compatibility. - * - * TODO: We set the dev_id field of all platform-devices to 0. This allows - * other x86 OF/DT parsers to create such devices, too. However, they must - * start at offset 1 for this to work. - */ - -#include <linux/err.h> -#include <linux/init.h> -#include <linux/kernel.h> -#include <linux/mm.h> -#include <linux/platform_data/simplefb.h> -#include <linux/platform_device.h> -#include <linux/screen_info.h> -#include <asm/sysfb.h> - -static __init int sysfb_init(void) -{ - struct screen_info *si = &screen_info; - struct simplefb_platform_data mode; - struct platform_device *pd; - const char *name; - bool compatible; - int ret; - - sysfb_apply_efi_quirks(); - - /* try to create a simple-framebuffer device */ - compatible = parse_mode(si, &mode); - if (compatible) { - ret = create_simplefb(si, &mode); - if (!ret) - return 0; - } - - /* if the FB is incompatible, create a legacy framebuffer device */ - if (si->orig_video_isVGA == VIDEO_TYPE_EFI) - name = "efi-framebuffer"; - else if (si->orig_video_isVGA == VIDEO_TYPE_VLFB) - name = "vesa-framebuffer"; - else - name = "platform-framebuffer"; - - pd = platform_device_register_resndata(NULL, name, 0, - NULL, 0, si, sizeof(*si)); - return PTR_ERR_OR_ZERO(pd); -} - -/* must execute after PCI subsystem for EFI quirks */ -device_initcall(sysfb_init); diff --git a/arch/x86/kernel/sysfb_efi.c b/arch/x86/kernel/sysfb_efi.c deleted file mode 100644 index 653b7f617b61..000000000000 --- a/arch/x86/kernel/sysfb_efi.c +++ /dev/null @@ -1,284 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0-or-later -/* - * Generic System Framebuffers on x86 - * Copyright (c) 2012-2013 David Herrmann 
<dh.herrmann@gmail.com> - * - * EFI Quirks Copyright (c) 2006 Edgar Hucek <gimli@dark-green.com> - */ - -/* - * EFI Quirks - * Several EFI systems do not correctly advertise their boot framebuffers. - * Hence, we use this static table of known broken machines and fix up the - * information so framebuffer drivers can load corectly. - */ - -#include <linux/dmi.h> -#include <linux/err.h> -#include <linux/efi.h> -#include <linux/init.h> -#include <linux/kernel.h> -#include <linux/mm.h> -#include <linux/pci.h> -#include <linux/screen_info.h> -#include <video/vga.h> - -#include <asm/efi.h> -#include <asm/sysfb.h> - -enum { - OVERRIDE_NONE = 0x0, - OVERRIDE_BASE = 0x1, - OVERRIDE_STRIDE = 0x2, - OVERRIDE_HEIGHT = 0x4, - OVERRIDE_WIDTH = 0x8, -}; - -struct efifb_dmi_info efifb_dmi_list[] = { - [M_I17] = { "i17", 0x80010000, 1472 * 4, 1440, 900, OVERRIDE_NONE }, - [M_I20] = { "i20", 0x80010000, 1728 * 4, 1680, 1050, OVERRIDE_NONE }, /* guess */ - [M_I20_SR] = { "imac7", 0x40010000, 1728 * 4, 1680, 1050, OVERRIDE_NONE }, - [M_I24] = { "i24", 0x80010000, 2048 * 4, 1920, 1200, OVERRIDE_NONE }, /* guess */ - [M_I24_8_1] = { "imac8", 0xc0060000, 2048 * 4, 1920, 1200, OVERRIDE_NONE }, - [M_I24_10_1] = { "imac10", 0xc0010000, 2048 * 4, 1920, 1080, OVERRIDE_NONE }, - [M_I27_11_1] = { "imac11", 0xc0010000, 2560 * 4, 2560, 1440, OVERRIDE_NONE }, - [M_MINI]= { "mini", 0x80000000, 2048 * 4, 1024, 768, OVERRIDE_NONE }, - [M_MINI_3_1] = { "mini31", 0x40010000, 1024 * 4, 1024, 768, OVERRIDE_NONE }, - [M_MINI_4_1] = { "mini41", 0xc0010000, 2048 * 4, 1920, 1200, OVERRIDE_NONE }, - [M_MB] = { "macbook", 0x80000000, 2048 * 4, 1280, 800, OVERRIDE_NONE }, - [M_MB_5_1] = { "macbook51", 0x80010000, 2048 * 4, 1280, 800, OVERRIDE_NONE }, - [M_MB_6_1] = { "macbook61", 0x80010000, 2048 * 4, 1280, 800, OVERRIDE_NONE }, - [M_MB_7_1] = { "macbook71", 0x80010000, 2048 * 4, 1280, 800, OVERRIDE_NONE }, - [M_MBA] = { "mba", 0x80000000, 2048 * 4, 1280, 800, OVERRIDE_NONE }, - /* 11" Macbook Air 3,1 passes 
the wrong stride */ - [M_MBA_3] = { "mba3", 0, 2048 * 4, 0, 0, OVERRIDE_STRIDE }, - [M_MBP] = { "mbp", 0x80010000, 1472 * 4, 1440, 900, OVERRIDE_NONE }, - [M_MBP_2] = { "mbp2", 0, 0, 0, 0, OVERRIDE_NONE }, /* placeholder */ - [M_MBP_2_2] = { "mbp22", 0x80010000, 1472 * 4, 1440, 900, OVERRIDE_NONE }, - [M_MBP_SR] = { "mbp3", 0x80030000, 2048 * 4, 1440, 900, OVERRIDE_NONE }, - [M_MBP_4] = { "mbp4", 0xc0060000, 2048 * 4, 1920, 1200, OVERRIDE_NONE }, - [M_MBP_5_1] = { "mbp51", 0xc0010000, 2048 * 4, 1440, 900, OVERRIDE_NONE }, - [M_MBP_5_2] = { "mbp52", 0xc0010000, 2048 * 4, 1920, 1200, OVERRIDE_NONE }, - [M_MBP_5_3] = { "mbp53", 0xd0010000, 2048 * 4, 1440, 900, OVERRIDE_NONE }, - [M_MBP_6_1] = { "mbp61", 0x90030000, 2048 * 4, 1920, 1200, OVERRIDE_NONE }, - [M_MBP_6_2] = { "mbp62", 0x90030000, 2048 * 4, 1680, 1050, OVERRIDE_NONE }, - [M_MBP_7_1] = { "mbp71", 0xc0010000, 2048 * 4, 1280, 800, OVERRIDE_NONE }, - [M_MBP_8_2] = { "mbp82", 0x90010000, 1472 * 4, 1440, 900, OVERRIDE_NONE }, - [M_UNKNOWN] = { NULL, 0, 0, 0, 0, OVERRIDE_NONE } -}; - -void efifb_setup_from_dmi(struct screen_info *si, const char *opt) -{ - int i; - - for (i = 0; i < M_UNKNOWN; i++) { - if (efifb_dmi_list[i].base != 0 && - !strcmp(opt, efifb_dmi_list[i].optname)) { - si->lfb_base = efifb_dmi_list[i].base; - si->lfb_linelength = efifb_dmi_list[i].stride; - si->lfb_width = efifb_dmi_list[i].width; - si->lfb_height = efifb_dmi_list[i].height; - } - } -} - -#define choose_value(dmivalue, fwvalue, field, flags) ({ \ - typeof(fwvalue) _ret_ = fwvalue; \ - if ((flags) & (field)) \ - _ret_ = dmivalue; \ - else if ((fwvalue) == 0) \ - _ret_ = dmivalue; \ - _ret_; \ - }) - -static int __init efifb_set_system(const struct dmi_system_id *id) -{ - struct efifb_dmi_info *info = id->driver_data; - - if (info->base == 0 && info->height == 0 && info->width == 0 && - info->stride == 0) - return 0; - - /* Trust the bootloader over the DMI tables */ - if (screen_info.lfb_base == 0) { -#if defined(CONFIG_PCI) - struct 
pci_dev *dev = NULL; - int found_bar = 0; -#endif - if (info->base) { - screen_info.lfb_base = choose_value(info->base, - screen_info.lfb_base, OVERRIDE_BASE, - info->flags); - -#if defined(CONFIG_PCI) - /* make sure that the address in the table is actually - * on a VGA device's PCI BAR */ - - for_each_pci_dev(dev) { - int i; - if ((dev->class >> 8) != PCI_CLASS_DISPLAY_VGA) - continue; - for (i = 0; i < DEVICE_COUNT_RESOURCE; i++) { - resource_size_t start, end; - unsigned long flags; - - flags = pci_resource_flags(dev, i); - if (!(flags & IORESOURCE_MEM)) - continue; - - if (flags & IORESOURCE_UNSET) - continue; - - if (pci_resource_len(dev, i) == 0) - continue; - - start = pci_resource_start(dev, i); - end = pci_resource_end(dev, i); - if (screen_info.lfb_base >= start && - screen_info.lfb_base < end) { - found_bar = 1; - break; - } - } - } - if (!found_bar) - screen_info.lfb_base = 0; -#endif - } - } - if (screen_info.lfb_base) { - screen_info.lfb_linelength = choose_value(info->stride, - screen_info.lfb_linelength, OVERRIDE_STRIDE, - info->flags); - screen_info.lfb_width = choose_value(info->width, - screen_info.lfb_width, OVERRIDE_WIDTH, - info->flags); - screen_info.lfb_height = choose_value(info->height, - screen_info.lfb_height, OVERRIDE_HEIGHT, - info->flags); - if (screen_info.orig_video_isVGA == 0) - screen_info.orig_video_isVGA = VIDEO_TYPE_EFI; - } else { - screen_info.lfb_linelength = 0; - screen_info.lfb_width = 0; - screen_info.lfb_height = 0; - screen_info.orig_video_isVGA = 0; - return 0; - } - - printk(KERN_INFO "efifb: dmi detected %s - framebuffer at 0x%08x " - "(%dx%d, stride %d)\n", id->ident, - screen_info.lfb_base, screen_info.lfb_width, - screen_info.lfb_height, screen_info.lfb_linelength); - - return 1; -} - -#define EFIFB_DMI_SYSTEM_ID(vendor, name, enumid) \ - { \ - efifb_set_system, \ - name, \ - { \ - DMI_MATCH(DMI_BIOS_VENDOR, vendor), \ - DMI_MATCH(DMI_PRODUCT_NAME, name) \ - }, \ - &efifb_dmi_list[enumid] \ - } - -static const 
struct dmi_system_id efifb_dmi_system_table[] __initconst = { - EFIFB_DMI_SYSTEM_ID("Apple Computer, Inc.", "iMac4,1", M_I17), - /* At least one of these two will be right; maybe both? */ - EFIFB_DMI_SYSTEM_ID("Apple Computer, Inc.", "iMac5,1", M_I20), - EFIFB_DMI_SYSTEM_ID("Apple Inc.", "iMac5,1", M_I20), - /* At least one of these two will be right; maybe both? */ - EFIFB_DMI_SYSTEM_ID("Apple Computer, Inc.", "iMac6,1", M_I24), - EFIFB_DMI_SYSTEM_ID("Apple Inc.", "iMac6,1", M_I24), - EFIFB_DMI_SYSTEM_ID("Apple Inc.", "iMac7,1", M_I20_SR), - EFIFB_DMI_SYSTEM_ID("Apple Inc.", "iMac8,1", M_I24_8_1), - EFIFB_DMI_SYSTEM_ID("Apple Inc.", "iMac10,1", M_I24_10_1), - EFIFB_DMI_SYSTEM_ID("Apple Inc.", "iMac11,1", M_I27_11_1), - EFIFB_DMI_SYSTEM_ID("Apple Computer, Inc.", "Macmini1,1", M_MINI), - EFIFB_DMI_SYSTEM_ID("Apple Inc.", "Macmini3,1", M_MINI_3_1), - EFIFB_DMI_SYSTEM_ID("Apple Inc.", "Macmini4,1", M_MINI_4_1), - EFIFB_DMI_SYSTEM_ID("Apple Computer, Inc.", "MacBook1,1", M_MB), - /* At least one of these two will be right; maybe both? */ - EFIFB_DMI_SYSTEM_ID("Apple Computer, Inc.", "MacBook2,1", M_MB), - EFIFB_DMI_SYSTEM_ID("Apple Inc.", "MacBook2,1", M_MB), - /* At least one of these two will be right; maybe both? 
*/ - EFIFB_DMI_SYSTEM_ID("Apple Computer, Inc.", "MacBook3,1", M_MB), - EFIFB_DMI_SYSTEM_ID("Apple Inc.", "MacBook3,1", M_MB), - EFIFB_DMI_SYSTEM_ID("Apple Inc.", "MacBook4,1", M_MB), - EFIFB_DMI_SYSTEM_ID("Apple Inc.", "MacBook5,1", M_MB_5_1), - EFIFB_DMI_SYSTEM_ID("Apple Inc.", "MacBook6,1", M_MB_6_1), - EFIFB_DMI_SYSTEM_ID("Apple Inc.", "MacBook7,1", M_MB_7_1), - EFIFB_DMI_SYSTEM_ID("Apple Inc.", "MacBookAir1,1", M_MBA), - EFIFB_DMI_SYSTEM_ID("Apple Inc.", "MacBookAir3,1", M_MBA_3), - EFIFB_DMI_SYSTEM_ID("Apple Computer, Inc.", "MacBookPro1,1", M_MBP), - EFIFB_DMI_SYSTEM_ID("Apple Computer, Inc.", "MacBookPro2,1", M_MBP_2), - EFIFB_DMI_SYSTEM_ID("Apple Computer, Inc.", "MacBookPro2,2", M_MBP_2_2), - EFIFB_DMI_SYSTEM_ID("Apple Inc.", "MacBookPro2,1", M_MBP_2), - EFIFB_DMI_SYSTEM_ID("Apple Computer, Inc.", "MacBookPro3,1", M_MBP_SR), - EFIFB_DMI_SYSTEM_ID("Apple Inc.", "MacBookPro3,1", M_MBP_SR), - EFIFB_DMI_SYSTEM_ID("Apple Inc.", "MacBookPro4,1", M_MBP_4), - EFIFB_DMI_SYSTEM_ID("Apple Inc.", "MacBookPro5,1", M_MBP_5_1), - EFIFB_DMI_SYSTEM_ID("Apple Inc.", "MacBookPro5,2", M_MBP_5_2), - EFIFB_DMI_SYSTEM_ID("Apple Inc.", "MacBookPro5,3", M_MBP_5_3), - EFIFB_DMI_SYSTEM_ID("Apple Inc.", "MacBookPro6,1", M_MBP_6_1), - EFIFB_DMI_SYSTEM_ID("Apple Inc.", "MacBookPro6,2", M_MBP_6_2), - EFIFB_DMI_SYSTEM_ID("Apple Inc.", "MacBookPro7,1", M_MBP_7_1), - EFIFB_DMI_SYSTEM_ID("Apple Inc.", "MacBookPro8,2", M_MBP_8_2), - {}, -}; - -/* - * Some devices have a portrait LCD but advertise a landscape resolution (and - * pitch). We simply swap width and height for these devices so that we can - * correctly deal with some of them coming with multiple resolutions. - */ -static const struct dmi_system_id efifb_dmi_swap_width_height[] __initconst = { - { - /* - * Lenovo MIIX310-10ICR, only some batches have the troublesome - * 800x1280 portrait screen. Luckily the portrait version has - * its own BIOS version, so we match on that. 
- */ - .matches = { - DMI_EXACT_MATCH(DMI_SYS_VENDOR, "LENOVO"), - DMI_EXACT_MATCH(DMI_PRODUCT_VERSION, "MIIX 310-10ICR"), - DMI_EXACT_MATCH(DMI_BIOS_VERSION, "1HCN44WW"), - }, - }, - { - /* Lenovo MIIX 320-10ICR with 800x1280 portrait screen */ - .matches = { - DMI_EXACT_MATCH(DMI_SYS_VENDOR, "LENOVO"), - DMI_EXACT_MATCH(DMI_PRODUCT_VERSION, - "Lenovo MIIX 320-10ICR"), - }, - }, - { - /* Lenovo D330 with 800x1280 or 1200x1920 portrait screen */ - .matches = { - DMI_EXACT_MATCH(DMI_SYS_VENDOR, "LENOVO"), - DMI_EXACT_MATCH(DMI_PRODUCT_VERSION, - "Lenovo ideapad D330-10IGM"), - }, - }, - {}, -}; - -__init void sysfb_apply_efi_quirks(void) -{ - if (screen_info.orig_video_isVGA != VIDEO_TYPE_EFI || - !(screen_info.capabilities & VIDEO_CAPABILITY_SKIP_QUIRKS)) - dmi_check_system(efifb_dmi_system_table); - - if (screen_info.orig_video_isVGA == VIDEO_TYPE_EFI && - dmi_check_system(efifb_dmi_swap_width_height)) { - u16 temp = screen_info.lfb_width; - - screen_info.lfb_width = screen_info.lfb_height; - screen_info.lfb_height = temp; - screen_info.lfb_linelength = 4 * screen_info.lfb_width; - } -} diff --git a/arch/x86/kernel/sysfb_simplefb.c b/arch/x86/kernel/sysfb_simplefb.c deleted file mode 100644 index 298fc1edd9c9..000000000000 --- a/arch/x86/kernel/sysfb_simplefb.c +++ /dev/null @@ -1,111 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0-or-later -/* - * Generic System Framebuffers on x86 - * Copyright (c) 2012-2013 David Herrmann <dh.herrmann@gmail.com> - */ - -/* - * simple-framebuffer probing - * Try to convert "screen_info" into a "simple-framebuffer" compatible mode. - * If the mode is incompatible, we return "false" and let the caller create - * legacy nodes instead. 
- */ - -#include <linux/err.h> -#include <linux/init.h> -#include <linux/kernel.h> -#include <linux/mm.h> -#include <linux/platform_data/simplefb.h> -#include <linux/platform_device.h> -#include <linux/screen_info.h> -#include <asm/sysfb.h> - -static const char simplefb_resname[] = "BOOTFB"; -static const struct simplefb_format formats[] = SIMPLEFB_FORMATS; - -/* try parsing x86 screen_info into a simple-framebuffer mode struct */ -__init bool parse_mode(const struct screen_info *si, - struct simplefb_platform_data *mode) -{ - const struct simplefb_format *f; - __u8 type; - unsigned int i; - - type = si->orig_video_isVGA; - if (type != VIDEO_TYPE_VLFB && type != VIDEO_TYPE_EFI) - return false; - - for (i = 0; i < ARRAY_SIZE(formats); ++i) { - f = &formats[i]; - if (si->lfb_depth == f->bits_per_pixel && - si->red_size == f->red.length && - si->red_pos == f->red.offset && - si->green_size == f->green.length && - si->green_pos == f->green.offset && - si->blue_size == f->blue.length && - si->blue_pos == f->blue.offset && - si->rsvd_size == f->transp.length && - si->rsvd_pos == f->transp.offset) { - mode->format = f->name; - mode->width = si->lfb_width; - mode->height = si->lfb_height; - mode->stride = si->lfb_linelength; - return true; - } - } - - return false; -} - -__init int create_simplefb(const struct screen_info *si, - const struct simplefb_platform_data *mode) -{ - struct platform_device *pd; - struct resource res; - u64 base, size; - u32 length; - - /* - * If the 64BIT_BASE capability is set, ext_lfb_base will contain the - * upper half of the base address. Assemble the address, then make sure - * it is valid and we can actually access it. 
- */ - base = si->lfb_base; - if (si->capabilities & VIDEO_CAPABILITY_64BIT_BASE) - base |= (u64)si->ext_lfb_base << 32; - if (!base || (u64)(resource_size_t)base != base) { - printk(KERN_DEBUG "sysfb: inaccessible VRAM base\n"); - return -EINVAL; - } - - /* - * Don't use lfb_size as IORESOURCE size, since it may contain the - * entire VMEM, and thus require huge mappings. Use just the part we - * need, that is, the part where the framebuffer is located. But verify - * that it does not exceed the advertised VMEM. - * Note that in case of VBE, the lfb_size is shifted by 16 bits for - * historical reasons. - */ - size = si->lfb_size; - if (si->orig_video_isVGA == VIDEO_TYPE_VLFB) - size <<= 16; - length = mode->height * mode->stride; - if (length > size) { - printk(KERN_WARNING "sysfb: VRAM smaller than advertised\n"); - return -EINVAL; - } - length = PAGE_ALIGN(length); - - /* setup IORESOURCE_MEM as framebuffer memory */ - memset(&res, 0, sizeof(res)); - res.flags = IORESOURCE_MEM | IORESOURCE_BUSY; - res.name = simplefb_resname; - res.start = base; - res.end = res.start + length - 1; - if (res.end <= res.start) - return -EINVAL; - - pd = platform_device_register_resndata(NULL, "simple-framebuffer", 0, - &res, 1, mode, sizeof(*mode)); - return PTR_ERR_OR_ZERO(pd); -} diff --git a/arch/x86/kernel/tboot.c b/arch/x86/kernel/tboot.c index b89f6ac6a0c0..4c1bcb6053fc 100644 --- a/arch/x86/kernel/tboot.c +++ b/arch/x86/kernel/tboot.c @@ -6,7 +6,6 @@ * Copyright (c) 2006-2009, Intel Corporation */ -#include <linux/intel-iommu.h> #include <linux/init_task.h> #include <linux/spinlock.h> #include <linux/export.h> @@ -23,9 +22,7 @@ #include <asm/realmode.h> #include <asm/processor.h> #include <asm/bootparam.h> -#include <asm/pgtable.h> #include <asm/pgalloc.h> -#include <asm/swiotlb.h> #include <asm/fixmap.h> #include <asm/proto.h> #include <asm/setup.h> @@ -35,8 +32,7 @@ #include "../realmode/rm/wakeup.h" /* Global pointer to shared data; NULL means no measured launch. 
*/ -struct tboot *tboot __read_mostly; -EXPORT_SYMBOL(tboot); +static struct tboot *tboot __read_mostly; /* timeout for APs (in secs) to enter wait-for-SIPI state during shutdown */ #define AP_WAIT_TIMEOUT 1 @@ -46,6 +42,35 @@ EXPORT_SYMBOL(tboot); static u8 tboot_uuid[16] __initdata = TBOOT_UUID; +bool tboot_enabled(void) +{ + return tboot != NULL; +} + +/* noinline to prevent gcc from warning about dereferencing constant fixaddr */ +static noinline __init bool check_tboot_version(void) +{ + if (memcmp(&tboot_uuid, &tboot->uuid, sizeof(tboot->uuid))) { + pr_warn("tboot at 0x%llx is invalid\n", boot_params.tboot_addr); + return false; + } + + if (tboot->version < 5) { + pr_warn("tboot version is invalid: %u\n", tboot->version); + return false; + } + + pr_info("found shared page at phys addr 0x%llx:\n", + boot_params.tboot_addr); + pr_debug("version: %d\n", tboot->version); + pr_debug("log_addr: 0x%08x\n", tboot->log_addr); + pr_debug("shutdown_entry: 0x%x\n", tboot->shutdown_entry); + pr_debug("tboot_base: 0x%08x\n", tboot->tboot_base); + pr_debug("tboot_size: 0x%x\n", tboot->tboot_size); + + return true; +} + void __init tboot_probe(void) { /* Look for valid page-aligned address for shared page. */ @@ -63,34 +88,19 @@ void __init tboot_probe(void) /* Map and check for tboot UUID. 
*/ set_fixmap(FIX_TBOOT_BASE, boot_params.tboot_addr); - tboot = (struct tboot *)fix_to_virt(FIX_TBOOT_BASE); - if (memcmp(&tboot_uuid, &tboot->uuid, sizeof(tboot->uuid))) { - pr_warn("tboot at 0x%llx is invalid\n", boot_params.tboot_addr); + tboot = (void *)fix_to_virt(FIX_TBOOT_BASE); + if (!check_tboot_version()) tboot = NULL; - return; - } - if (tboot->version < 5) { - pr_warn("tboot version is invalid: %u\n", tboot->version); - tboot = NULL; - return; - } - - pr_info("found shared page at phys addr 0x%llx:\n", - boot_params.tboot_addr); - pr_debug("version: %d\n", tboot->version); - pr_debug("log_addr: 0x%08x\n", tboot->log_addr); - pr_debug("shutdown_entry: 0x%x\n", tboot->shutdown_entry); - pr_debug("tboot_base: 0x%08x\n", tboot->tboot_base); - pr_debug("tboot_size: 0x%x\n", tboot->tboot_size); } static pgd_t *tboot_pg_dir; static struct mm_struct tboot_mm = { - .mm_rb = RB_ROOT, + .mm_mt = MTREE_INIT_EXT(mm_mt, MM_MT_FLAGS, tboot_mm.mmap_lock), .pgd = swapper_pg_dir, .mm_users = ATOMIC_INIT(2), .mm_count = ATOMIC_INIT(1), - .mmap_sem = __RWSEM_INITIALIZER(init_mm.mmap_sem), + .write_protect_seq = SEQCNT_ZERO(tboot_mm.write_protect_seq), + MMAP_LOCK_INITIALIZER(init_mm) .page_table_lock = __SPIN_LOCK_UNLOCKED(init_mm.page_table_lock), .mmlist = LIST_HEAD_INIT(init_mm.mmlist), }; @@ -505,23 +515,3 @@ struct acpi_table_header *tboot_get_dmar_table(struct acpi_table_header *dmar_tb return dmar_tbl; } - -int tboot_force_iommu(void) -{ - if (!tboot_enabled()) - return 0; - - if (intel_iommu_tboot_noforce) - return 1; - - if (no_iommu || swiotlb || dmar_disabled) - pr_warn("Forcing Intel-IOMMU to enabled\n"); - - dmar_disabled = 0; -#ifdef CONFIG_SWIOTLB - swiotlb = 0; -#endif - no_iommu = 0; - - return 1; -} diff --git a/arch/x86/kernel/time.c b/arch/x86/kernel/time.c index d8673d8a779b..e42faa792c07 100644 --- a/arch/x86/kernel/time.c +++ b/arch/x86/kernel/time.c @@ -25,10 +25,6 @@ #include <asm/hpet.h> #include <asm/time.h> -#ifdef CONFIG_X86_64 -__visible 
volatile unsigned long jiffies __cacheline_aligned_in_smp = INITIAL_JIFFIES; -#endif - unsigned long profile_pc(struct pt_regs *regs) { unsigned long pc = instruction_pointer(regs); @@ -62,19 +58,16 @@ static irqreturn_t timer_interrupt(int irq, void *dev_id) return IRQ_HANDLED; } -static struct irqaction irq0 = { - .handler = timer_interrupt, - .flags = IRQF_NOBALANCING | IRQF_IRQPOLL | IRQF_TIMER, - .name = "timer" -}; - static void __init setup_default_timer_irq(void) { + unsigned long flags = IRQF_NOBALANCING | IRQF_IRQPOLL | IRQF_TIMER; + /* - * Unconditionally register the legacy timer; even without legacy - * PIC/PIT we need this for the HPET0 in legacy replacement mode. + * Unconditionally register the legacy timer interrupt; even + * without legacy PIC/PIT we need this for the HPET0 in legacy + * replacement mode. */ - if (setup_irq(0, &irq0)) + if (request_irq(0, timer_interrupt, flags, "timer", NULL)) pr_info("Failed to register legacy timer interrupt\n"); } @@ -106,6 +99,9 @@ static __init void x86_late_time_init(void) */ x86_init.irqs.intr_mode_init(); tsc_init(); + + if (static_cpu_has(X86_FEATURE_WAITPKG)) + use_tpause_delay(); } /* @@ -122,18 +118,12 @@ void __init time_init(void) */ void clocksource_arch_init(struct clocksource *cs) { - if (cs->archdata.vclock_mode == VCLOCK_NONE) + if (cs->vdso_clock_mode == VDSO_CLOCKMODE_NONE) return; - if (cs->archdata.vclock_mode > VCLOCK_MAX) { - pr_warn("clocksource %s registered with invalid vclock_mode %d. Disabling vclock.\n", - cs->name, cs->archdata.vclock_mode); - cs->archdata.vclock_mode = VCLOCK_NONE; - } - if (cs->mask != CLOCKSOURCE_MASK(64)) { - pr_warn("clocksource %s registered with invalid mask %016llx. Disabling vclock.\n", + pr_warn("clocksource %s registered with invalid mask %016llx for VDSO. 
Disabling VDSO support.\n", cs->name, cs->mask); - cs->archdata.vclock_mode = VCLOCK_NONE; + cs->vdso_clock_mode = VDSO_CLOCKMODE_NONE; } } diff --git a/arch/x86/kernel/tls.c b/arch/x86/kernel/tls.c index 71d3fef1edc9..3c883e064242 100644 --- a/arch/x86/kernel/tls.c +++ b/arch/x86/kernel/tls.c @@ -164,17 +164,11 @@ int do_set_thread_area(struct task_struct *p, int idx, savesegment(fs, sel); if (sel == modified_sel) loadsegment(fs, sel); - - savesegment(gs, sel); - if (sel == modified_sel) - load_gs_index(sel); #endif -#ifdef CONFIG_X86_32_LAZY_GS savesegment(gs, sel); if (sel == modified_sel) - loadsegment(gs, sel); -#endif + load_gs_index(sel); } else { #ifdef CONFIG_X86_64 if (p->thread.fsindex == modified_sel) @@ -256,36 +250,16 @@ int regset_tls_active(struct task_struct *target, } int regset_tls_get(struct task_struct *target, const struct user_regset *regset, - unsigned int pos, unsigned int count, - void *kbuf, void __user *ubuf) + struct membuf to) { const struct desc_struct *tls; + struct user_desc v; + int pos; - if (pos >= GDT_ENTRY_TLS_ENTRIES * sizeof(struct user_desc) || - (pos % sizeof(struct user_desc)) != 0 || - (count % sizeof(struct user_desc)) != 0) - return -EINVAL; - - pos /= sizeof(struct user_desc); - count /= sizeof(struct user_desc); - - tls = &target->thread.tls_array[pos]; - - if (kbuf) { - struct user_desc *info = kbuf; - while (count-- > 0) - fill_user_desc(info++, GDT_ENTRY_TLS_MIN + pos++, - tls++); - } else { - struct user_desc __user *u_info = ubuf; - while (count-- > 0) { - struct user_desc info; - fill_user_desc(&info, GDT_ENTRY_TLS_MIN + pos++, tls++); - if (__copy_to_user(u_info++, &info, sizeof(info))) - return -EFAULT; - } + for (pos = 0, tls = target->thread.tls_array; to.left; pos++, tls++) { + fill_user_desc(&v, GDT_ENTRY_TLS_MIN + pos, tls); + membuf_write(&to, &v, sizeof(v)); } - return 0; } diff --git a/arch/x86/kernel/tls.h b/arch/x86/kernel/tls.h index 3a76e1d3535e..fc39447a0c1a 100644 --- a/arch/x86/kernel/tls.h +++ 
b/arch/x86/kernel/tls.h @@ -12,7 +12,7 @@ #include <linux/regset.h> extern user_regset_active_fn regset_tls_active; -extern user_regset_get_fn regset_tls_get; +extern user_regset_get2_fn regset_tls_get; extern user_regset_set_fn regset_tls_set; #endif /* _ARCH_X86_KERNEL_TLS_H */ diff --git a/arch/x86/kernel/topology.c b/arch/x86/kernel/topology.c index be5bc2e47c71..8617d1ed9d31 100644 --- a/arch/x86/kernel/topology.c +++ b/arch/x86/kernel/topology.c @@ -25,12 +25,14 @@ * * Send feedback to <colpatch@us.ibm.com> */ +#include <linux/interrupt.h> #include <linux/nodemask.h> #include <linux/export.h> #include <linux/mmzone.h> #include <linux/init.h> #include <linux/smp.h> #include <linux/irq.h> +#include <asm/io_apic.h> #include <asm/cpu.h> static DEFINE_PER_CPU(struct x86_cpu, cpu_devices); @@ -59,39 +61,29 @@ __setup("cpu0_hotplug", enable_cpu0_hotplug); */ int _debug_hotplug_cpu(int cpu, int action) { - struct device *dev = get_cpu_device(cpu); int ret; if (!cpu_is_hotpluggable(cpu)) return -EINVAL; - lock_device_hotplug(); - switch (action) { case 0: - ret = cpu_down(cpu); - if (!ret) { + ret = remove_cpu(cpu); + if (!ret) pr_info("DEBUG_HOTPLUG_CPU0: CPU %u is now offline\n", cpu); - dev->offline = true; - kobject_uevent(&dev->kobj, KOBJ_OFFLINE); - } else + else pr_debug("Can't offline CPU%d.\n", cpu); break; case 1: - ret = cpu_up(cpu); - if (!ret) { - dev->offline = false; - kobject_uevent(&dev->kobj, KOBJ_ONLINE); - } else { + ret = add_cpu(cpu); + if (ret) pr_debug("Can't online CPU%d.\n", cpu); - } + break; default: ret = -EINVAL; } - unlock_device_hotplug(); - return ret; } @@ -121,7 +113,7 @@ int arch_register_cpu(int num) * Two known BSP/CPU0 dependencies: Resume from suspend/hibernate * depends on BSP. PIC interrupts depend on BSP. * - * If the BSP depencies are under control, one can tell kernel to + * If the BSP dependencies are under control, one can tell kernel to * enable BSP hotplug. 
This basically adds a control file and * one can attempt to offline BSP. */ @@ -162,11 +154,6 @@ static int __init topology_init(void) { int i; -#ifdef CONFIG_NUMA - for_each_online_node(i) - register_one_node(i); -#endif - for_each_present_cpu(i) arch_register_cpu(i); diff --git a/arch/x86/kernel/trace.c b/arch/x86/kernel/trace.c new file mode 100644 index 000000000000..8322e8352777 --- /dev/null +++ b/arch/x86/kernel/trace.c @@ -0,0 +1,234 @@ +#include <asm/trace/irq_vectors.h> +#include <linux/trace.h> + +#if defined(CONFIG_OSNOISE_TRACER) && defined(CONFIG_X86_LOCAL_APIC) +/* + * trace_intel_irq_entry - record intel specific IRQ entry + */ +static void trace_intel_irq_entry(void *data, int vector) +{ + osnoise_trace_irq_entry(vector); +} + +/* + * trace_intel_irq_exit - record intel specific IRQ exit + */ +static void trace_intel_irq_exit(void *data, int vector) +{ + char *vector_desc = (char *) data; + + osnoise_trace_irq_exit(vector, vector_desc); +} + +/* + * register_intel_irq_tp - Register intel specific IRQ entry tracepoints + */ +int osnoise_arch_register(void) +{ + int ret; + + ret = register_trace_local_timer_entry(trace_intel_irq_entry, NULL); + if (ret) + goto out_err; + + ret = register_trace_local_timer_exit(trace_intel_irq_exit, "local_timer"); + if (ret) + goto out_timer_entry; + +#ifdef CONFIG_X86_THERMAL_VECTOR + ret = register_trace_thermal_apic_entry(trace_intel_irq_entry, NULL); + if (ret) + goto out_timer_exit; + + ret = register_trace_thermal_apic_exit(trace_intel_irq_exit, "thermal_apic"); + if (ret) + goto out_thermal_entry; +#endif /* CONFIG_X86_THERMAL_VECTOR */ + +#ifdef CONFIG_X86_MCE_AMD + ret = register_trace_deferred_error_apic_entry(trace_intel_irq_entry, NULL); + if (ret) + goto out_thermal_exit; + + ret = register_trace_deferred_error_apic_exit(trace_intel_irq_exit, "deferred_error"); + if (ret) + goto out_deferred_entry; +#endif + +#ifdef CONFIG_X86_MCE_THRESHOLD + ret = 
register_trace_threshold_apic_entry(trace_intel_irq_entry, NULL); + if (ret) + goto out_deferred_exit; + + ret = register_trace_threshold_apic_exit(trace_intel_irq_exit, "threshold_apic"); + if (ret) + goto out_threshold_entry; +#endif /* CONFIG_X86_MCE_THRESHOLD */ + +#ifdef CONFIG_SMP + ret = register_trace_call_function_single_entry(trace_intel_irq_entry, NULL); + if (ret) + goto out_threshold_exit; + + ret = register_trace_call_function_single_exit(trace_intel_irq_exit, + "call_function_single"); + if (ret) + goto out_call_function_single_entry; + + ret = register_trace_call_function_entry(trace_intel_irq_entry, NULL); + if (ret) + goto out_call_function_single_exit; + + ret = register_trace_call_function_exit(trace_intel_irq_exit, "call_function"); + if (ret) + goto out_call_function_entry; + + ret = register_trace_reschedule_entry(trace_intel_irq_entry, NULL); + if (ret) + goto out_call_function_exit; + + ret = register_trace_reschedule_exit(trace_intel_irq_exit, "reschedule"); + if (ret) + goto out_reschedule_entry; +#endif /* CONFIG_SMP */ + +#ifdef CONFIG_IRQ_WORK + ret = register_trace_irq_work_entry(trace_intel_irq_entry, NULL); + if (ret) + goto out_reschedule_exit; + + ret = register_trace_irq_work_exit(trace_intel_irq_exit, "irq_work"); + if (ret) + goto out_irq_work_entry; +#endif + + ret = register_trace_x86_platform_ipi_entry(trace_intel_irq_entry, NULL); + if (ret) + goto out_irq_work_exit; + + ret = register_trace_x86_platform_ipi_exit(trace_intel_irq_exit, "x86_platform_ipi"); + if (ret) + goto out_x86_ipi_entry; + + ret = register_trace_error_apic_entry(trace_intel_irq_entry, NULL); + if (ret) + goto out_x86_ipi_exit; + + ret = register_trace_error_apic_exit(trace_intel_irq_exit, "error_apic"); + if (ret) + goto out_error_apic_entry; + + ret = register_trace_spurious_apic_entry(trace_intel_irq_entry, NULL); + if (ret) + goto out_error_apic_exit; + + ret = register_trace_spurious_apic_exit(trace_intel_irq_exit, "spurious_apic"); + if (ret) + 
goto out_spurious_apic_entry; + + return 0; + +out_spurious_apic_entry: + unregister_trace_spurious_apic_entry(trace_intel_irq_entry, NULL); +out_error_apic_exit: + unregister_trace_error_apic_exit(trace_intel_irq_exit, "error_apic"); +out_error_apic_entry: + unregister_trace_error_apic_entry(trace_intel_irq_entry, NULL); +out_x86_ipi_exit: + unregister_trace_x86_platform_ipi_exit(trace_intel_irq_exit, "x86_platform_ipi"); +out_x86_ipi_entry: + unregister_trace_x86_platform_ipi_entry(trace_intel_irq_entry, NULL); +out_irq_work_exit: + +#ifdef CONFIG_IRQ_WORK + unregister_trace_irq_work_exit(trace_intel_irq_exit, "irq_work"); +out_irq_work_entry: + unregister_trace_irq_work_entry(trace_intel_irq_entry, NULL); +out_reschedule_exit: +#endif + +#ifdef CONFIG_SMP + unregister_trace_reschedule_exit(trace_intel_irq_exit, "reschedule"); +out_reschedule_entry: + unregister_trace_reschedule_entry(trace_intel_irq_entry, NULL); +out_call_function_exit: + unregister_trace_call_function_exit(trace_intel_irq_exit, "call_function"); +out_call_function_entry: + unregister_trace_call_function_entry(trace_intel_irq_entry, NULL); +out_call_function_single_exit: + unregister_trace_call_function_single_exit(trace_intel_irq_exit, "call_function_single"); +out_call_function_single_entry: + unregister_trace_call_function_single_entry(trace_intel_irq_entry, NULL); +out_threshold_exit: +#endif + +#ifdef CONFIG_X86_MCE_THRESHOLD + unregister_trace_threshold_apic_exit(trace_intel_irq_exit, "threshold_apic"); +out_threshold_entry: + unregister_trace_threshold_apic_entry(trace_intel_irq_entry, NULL); +out_deferred_exit: +#endif + +#ifdef CONFIG_X86_MCE_AMD + unregister_trace_deferred_error_apic_exit(trace_intel_irq_exit, "deferred_error"); +out_deferred_entry: + unregister_trace_deferred_error_apic_entry(trace_intel_irq_entry, NULL); +out_thermal_exit: +#endif /* CONFIG_X86_MCE_AMD */ + +#ifdef CONFIG_X86_THERMAL_VECTOR + unregister_trace_thermal_apic_exit(trace_intel_irq_exit, "thermal_apic"); 
+out_thermal_entry: + unregister_trace_thermal_apic_entry(trace_intel_irq_entry, NULL); +out_timer_exit: +#endif /* CONFIG_X86_THERMAL_VECTOR */ + + unregister_trace_local_timer_exit(trace_intel_irq_exit, "local_timer"); +out_timer_entry: + unregister_trace_local_timer_entry(trace_intel_irq_entry, NULL); +out_err: + return -EINVAL; +} + +void osnoise_arch_unregister(void) +{ + unregister_trace_spurious_apic_exit(trace_intel_irq_exit, "spurious_apic"); + unregister_trace_spurious_apic_entry(trace_intel_irq_entry, NULL); + unregister_trace_error_apic_exit(trace_intel_irq_exit, "error_apic"); + unregister_trace_error_apic_entry(trace_intel_irq_entry, NULL); + unregister_trace_x86_platform_ipi_exit(trace_intel_irq_exit, "x86_platform_ipi"); + unregister_trace_x86_platform_ipi_entry(trace_intel_irq_entry, NULL); + +#ifdef CONFIG_IRQ_WORK + unregister_trace_irq_work_exit(trace_intel_irq_exit, "irq_work"); + unregister_trace_irq_work_entry(trace_intel_irq_entry, NULL); +#endif + +#ifdef CONFIG_SMP + unregister_trace_reschedule_exit(trace_intel_irq_exit, "reschedule"); + unregister_trace_reschedule_entry(trace_intel_irq_entry, NULL); + unregister_trace_call_function_exit(trace_intel_irq_exit, "call_function"); + unregister_trace_call_function_entry(trace_intel_irq_entry, NULL); + unregister_trace_call_function_single_exit(trace_intel_irq_exit, "call_function_single"); + unregister_trace_call_function_single_entry(trace_intel_irq_entry, NULL); +#endif + +#ifdef CONFIG_X86_MCE_THRESHOLD + unregister_trace_threshold_apic_exit(trace_intel_irq_exit, "threshold_apic"); + unregister_trace_threshold_apic_entry(trace_intel_irq_entry, NULL); +#endif + +#ifdef CONFIG_X86_MCE_AMD + unregister_trace_deferred_error_apic_exit(trace_intel_irq_exit, "deferred_error"); + unregister_trace_deferred_error_apic_entry(trace_intel_irq_entry, NULL); +#endif + +#ifdef CONFIG_X86_THERMAL_VECTOR + unregister_trace_thermal_apic_exit(trace_intel_irq_exit, "thermal_apic"); + 
unregister_trace_thermal_apic_entry(trace_intel_irq_entry, NULL); +#endif /* CONFIG_X86_THERMAL_VECTOR */ + + unregister_trace_local_timer_exit(trace_intel_irq_exit, "local_timer"); + unregister_trace_local_timer_entry(trace_intel_irq_entry, NULL); +} +#endif /* CONFIG_OSNOISE_TRACER && CONFIG_X86_LOCAL_APIC */ diff --git a/arch/x86/kernel/tracepoint.c b/arch/x86/kernel/tracepoint.c index 496748ed266a..03ae1caaa878 100644 --- a/arch/x86/kernel/tracepoint.c +++ b/arch/x86/kernel/tracepoint.c @@ -1,17 +1,11 @@ // SPDX-License-Identifier: GPL-2.0 /* - * Code for supporting irq vector tracepoints. - * * Copyright (C) 2013 Seiji Aguchi <seiji.aguchi@hds.com> - * */ #include <linux/jump_label.h> #include <linux/atomic.h> -#include <asm/hw_irq.h> -#include <asm/desc.h> #include <asm/trace/exceptions.h> -#include <asm/trace/irq_vectors.h> DEFINE_STATIC_KEY_FALSE(trace_pagefault_key); @@ -25,20 +19,3 @@ void trace_pagefault_unreg(void) { static_branch_dec(&trace_pagefault_key); } - -#ifdef CONFIG_SMP - -DEFINE_STATIC_KEY_FALSE(trace_resched_ipi_key); - -int trace_resched_ipi_reg(void) -{ - static_branch_inc(&trace_resched_ipi_key); - return 0; -} - -void trace_resched_ipi_unreg(void) -{ - static_branch_dec(&trace_resched_ipi_key); -} - -#endif diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c index 6ef00eb6fbb9..d3fdec706f1d 100644 --- a/arch/x86/kernel/traps.c +++ b/arch/x86/kernel/traps.c @@ -15,6 +15,7 @@ #include <linux/context_tracking.h> #include <linux/interrupt.h> #include <linux/kallsyms.h> +#include <linux/kmsan.h> #include <linux/spinlock.h> #include <linux/kprobes.h> #include <linux/uaccess.h> @@ -37,15 +38,20 @@ #include <linux/mm.h> #include <linux/smp.h> #include <linux/io.h> +#include <linux/hardirq.h> +#include <linux/atomic.h> +#include <linux/ioasid.h> + #include <asm/stacktrace.h> #include <asm/processor.h> #include <asm/debugreg.h> -#include <linux/atomic.h> +#include <asm/realmode.h> #include <asm/text-patching.h> #include <asm/ftrace.h> 
#include <asm/traps.h> #include <asm/desc.h> -#include <asm/fpu/internal.h> +#include <asm/fpu/api.h> +#include <asm/cpu.h> #include <asm/cpu_entry_area.h> #include <asm/mce.h> #include <asm/fixmap.h> @@ -56,10 +62,12 @@ #include <asm/umip.h> #include <asm/insn.h> #include <asm/insn-eval.h> +#include <asm/vdso.h> +#include <asm/tdx.h> +#include <asm/cfi.h> #ifdef CONFIG_X86_64 #include <asm/x86_init.h> -#include <asm/pgalloc.h> #include <asm/proto.h> #else #include <asm/processor-flags.h> @@ -81,107 +89,16 @@ static inline void cond_local_irq_disable(struct pt_regs *regs) local_irq_disable(); } -/* - * In IST context, we explicitly disable preemption. This serves two - * purposes: it makes it much less likely that we would accidentally - * schedule in IST context and it will force a warning if we somehow - * manage to schedule by accident. - */ -void ist_enter(struct pt_regs *regs) -{ - if (user_mode(regs)) { - RCU_LOCKDEP_WARN(!rcu_is_watching(), "entry code didn't wake RCU"); - } else { - /* - * We might have interrupted pretty much anything. In - * fact, if we're a machine check, we can even interrupt - * NMI processing. We don't want in_nmi() to return true, - * but we need to notify RCU. - */ - rcu_nmi_enter(); - } - - preempt_disable(); - - /* This code is a bit fragile. Test it. */ - RCU_LOCKDEP_WARN(!rcu_is_watching(), "ist_enter didn't work"); -} -NOKPROBE_SYMBOL(ist_enter); - -void ist_exit(struct pt_regs *regs) -{ - preempt_enable_no_resched(); - - if (!user_mode(regs)) - rcu_nmi_exit(); -} - -/** - * ist_begin_non_atomic() - begin a non-atomic section in an IST exception - * @regs: regs passed to the IST exception handler - * - * IST exception handlers normally cannot schedule. As a special - * exception, if the exception interrupted userspace code (i.e. - * user_mode(regs) would return true) and the exception was not - * a double fault, it can be safe to schedule. 
ist_begin_non_atomic() - * begins a non-atomic section within an ist_enter()/ist_exit() region. - * Callers are responsible for enabling interrupts themselves inside - * the non-atomic section, and callers must call ist_end_non_atomic() - * before ist_exit(). - */ -void ist_begin_non_atomic(struct pt_regs *regs) -{ - BUG_ON(!user_mode(regs)); - - /* - * Sanity check: we need to be on the normal thread stack. This - * will catch asm bugs and any attempt to use ist_preempt_enable - * from double_fault. - */ - BUG_ON(!on_thread_stack()); - - preempt_enable_no_resched(); -} - -/** - * ist_end_non_atomic() - begin a non-atomic section in an IST exception - * - * Ends a non-atomic section started with ist_begin_non_atomic(). - */ -void ist_end_non_atomic(void) +__always_inline int is_valid_bugaddr(unsigned long addr) { - preempt_disable(); -} - -int is_valid_bugaddr(unsigned long addr) -{ - unsigned short ud; - if (addr < TASK_SIZE_MAX) return 0; - if (probe_kernel_address((unsigned short *)addr, ud)) - return 0; - - return ud == INSN_UD0 || ud == INSN_UD2; -} - -int fixup_bug(struct pt_regs *regs, int trapnr) -{ - if (trapnr != X86_TRAP_UD) - return 0; - - switch (report_bug(regs->ip, regs)) { - case BUG_TRAP_TYPE_NONE: - case BUG_TRAP_TYPE_BUG: - break; - - case BUG_TRAP_TYPE_WARN: - regs->ip += LEN_UD2; - return 1; - } - - return 0; + /* + * We got #UD, if the text isn't readable we'd have gotten + * a different exception. 
+ */ + return *(unsigned short *)addr == INSN_UD2; } static nokprobe_inline int @@ -205,6 +122,9 @@ do_trap_no_signal(struct task_struct *tsk, int trapnr, const char *str, tsk->thread.error_code = error_code; tsk->thread.trap_nr = trapnr; die(str, regs, error_code); + } else { + if (fixup_vdso_exception(regs, trapnr, error_code, 0)) + return 0; } /* @@ -214,7 +134,7 @@ do_trap_no_signal(struct task_struct *tsk, int trapnr, const char *str, * process no chance to handle the signal and notice the * kernel fault information, so that won't result in polluting * the information about previously queued, but not yet - * delivered, faults. See also do_general_protection below. + * delivered, faults. See also exc_general_protection below. */ tsk->thread.error_code = error_code; tsk->thread.trap_nr = trapnr; @@ -242,7 +162,6 @@ do_trap(int trapnr, int signr, char *str, struct pt_regs *regs, { struct task_struct *tsk = current; - if (!do_trap_no_signal(tsk, trapnr, str, regs, error_code)) return; @@ -260,53 +179,242 @@ static void do_error_trap(struct pt_regs *regs, long error_code, char *str, { RCU_LOCKDEP_WARN(!rcu_is_watching(), "entry code didn't wake RCU"); - /* - * WARN*()s end up here; fix them up before we call the - * notifier chain. - */ - if (!user_mode(regs) && fixup_bug(regs, trapnr)) - return; - if (notify_die(DIE_TRAP, str, regs, error_code, trapnr, signr) != NOTIFY_STOP) { cond_local_irq_enable(regs); do_trap(trapnr, signr, str, regs, error_code, sicode, addr); + cond_local_irq_disable(regs); } } -#define IP ((void __user *)uprobe_get_trap_addr(regs)) -#define DO_ERROR(trapnr, signr, sicode, addr, str, name) \ -dotraplinkage void do_##name(struct pt_regs *regs, long error_code) \ -{ \ - do_error_trap(regs, error_code, str, trapnr, signr, sicode, addr); \ +/* + * Posix requires to provide the address of the faulting instruction for + * SIGILL (#UD) and SIGFPE (#DE) in the si_addr member of siginfo_t. 
+ * + * This address is usually regs->ip, but when an uprobe moved the code out + * of line then regs->ip points to the XOL code which would confuse + * anything which analyzes the fault address vs. the unmodified binary. If + * a trap happened in XOL code then uprobe maps regs->ip back to the + * original instruction address. + */ +static __always_inline void __user *error_get_trap_addr(struct pt_regs *regs) +{ + return (void __user *)uprobe_get_trap_addr(regs); +} + +DEFINE_IDTENTRY(exc_divide_error) +{ + do_error_trap(regs, 0, "divide error", X86_TRAP_DE, SIGFPE, + FPE_INTDIV, error_get_trap_addr(regs)); +} + +DEFINE_IDTENTRY(exc_overflow) +{ + do_error_trap(regs, 0, "overflow", X86_TRAP_OF, SIGSEGV, 0, NULL); +} + +#ifdef CONFIG_X86_KERNEL_IBT + +static __ro_after_init bool ibt_fatal = true; + +extern void ibt_selftest_ip(void); /* code label defined in asm below */ + +enum cp_error_code { + CP_EC = (1 << 15) - 1, + + CP_RET = 1, + CP_IRET = 2, + CP_ENDBR = 3, + CP_RSTRORSSP = 4, + CP_SETSSBSY = 5, + + CP_ENCL = 1 << 15, +}; + +DEFINE_IDTENTRY_ERRORCODE(exc_control_protection) +{ + if (!cpu_feature_enabled(X86_FEATURE_IBT)) { + pr_err("Unexpected #CP\n"); + BUG(); + } + + if (WARN_ON_ONCE(user_mode(regs) || (error_code & CP_EC) != CP_ENDBR)) + return; + + if (unlikely(regs->ip == (unsigned long)&ibt_selftest_ip)) { + regs->ax = 0; + return; + } + + pr_err("Missing ENDBR: %pS\n", (void *)instruction_pointer(regs)); + if (!ibt_fatal) { + printk(KERN_DEFAULT CUT_HERE); + __warn(__FILE__, __LINE__, (void *)regs->ip, TAINT_WARN, regs, NULL); + return; + } + BUG(); +} + +/* Must be noinline to ensure uniqueness of ibt_selftest_ip. 
*/ +noinline bool ibt_selftest(void) +{ + unsigned long ret; + + asm (" lea ibt_selftest_ip(%%rip), %%rax\n\t" + ANNOTATE_RETPOLINE_SAFE + " jmp *%%rax\n\t" + "ibt_selftest_ip:\n\t" + UNWIND_HINT_FUNC + ANNOTATE_NOENDBR + " nop\n\t" + + : "=a" (ret) : : "memory"); + + return !ret; +} + +static int __init ibt_setup(char *str) +{ + if (!strcmp(str, "off")) + setup_clear_cpu_cap(X86_FEATURE_IBT); + + if (!strcmp(str, "warn")) + ibt_fatal = false; + + return 1; +} + +__setup("ibt=", ibt_setup); + +#endif /* CONFIG_X86_KERNEL_IBT */ + +#ifdef CONFIG_X86_F00F_BUG +void handle_invalid_op(struct pt_regs *regs) +#else +static inline void handle_invalid_op(struct pt_regs *regs) +#endif +{ + do_error_trap(regs, 0, "invalid opcode", X86_TRAP_UD, SIGILL, + ILL_ILLOPN, error_get_trap_addr(regs)); +} + +static noinstr bool handle_bug(struct pt_regs *regs) +{ + bool handled = false; + + /* + * Normally @regs are unpoisoned by irqentry_enter(), but handle_bug() + * is a rare case that uses @regs without passing them to + * irqentry_enter(). + */ + kmsan_unpoison_entry_regs(regs); + if (!is_valid_bugaddr(regs->ip)) + return handled; + + /* + * All lies, just get the WARN/BUG out. + */ + instrumentation_begin(); + /* + * Since we're emulating a CALL with exceptions, restore the interrupt + * state to what it was at the exception site. + */ + if (regs->flags & X86_EFLAGS_IF) + raw_local_irq_enable(); + if (report_bug(regs->ip, regs) == BUG_TRAP_TYPE_WARN || + handle_cfi_failure(regs) == BUG_TRAP_TYPE_WARN) { + regs->ip += LEN_UD2; + handled = true; + } + if (regs->flags & X86_EFLAGS_IF) + raw_local_irq_disable(); + instrumentation_end(); + + return handled; +} + +DEFINE_IDTENTRY_RAW(exc_invalid_op) +{ + irqentry_state_t state; + + /* + * We use UD2 as a short encoding for 'CALL __WARN', as such + * handle it before exception entry to avoid recursive WARN + * in case exception entry is the one triggering WARNs. 
+ */ + if (!user_mode(regs) && handle_bug(regs)) + return; + + state = irqentry_enter(regs); + instrumentation_begin(); + handle_invalid_op(regs); + instrumentation_end(); + irqentry_exit(regs, state); +} + +DEFINE_IDTENTRY(exc_coproc_segment_overrun) +{ + do_error_trap(regs, 0, "coprocessor segment overrun", + X86_TRAP_OLD_MF, SIGFPE, 0, NULL); +} + +DEFINE_IDTENTRY_ERRORCODE(exc_invalid_tss) +{ + do_error_trap(regs, error_code, "invalid TSS", X86_TRAP_TS, SIGSEGV, + 0, NULL); } -DO_ERROR(X86_TRAP_DE, SIGFPE, FPE_INTDIV, IP, "divide error", divide_error) -DO_ERROR(X86_TRAP_OF, SIGSEGV, 0, NULL, "overflow", overflow) -DO_ERROR(X86_TRAP_UD, SIGILL, ILL_ILLOPN, IP, "invalid opcode", invalid_op) -DO_ERROR(X86_TRAP_OLD_MF, SIGFPE, 0, NULL, "coprocessor segment overrun", coprocessor_segment_overrun) -DO_ERROR(X86_TRAP_TS, SIGSEGV, 0, NULL, "invalid TSS", invalid_TSS) -DO_ERROR(X86_TRAP_NP, SIGBUS, 0, NULL, "segment not present", segment_not_present) -DO_ERROR(X86_TRAP_SS, SIGBUS, 0, NULL, "stack segment", stack_segment) -DO_ERROR(X86_TRAP_AC, SIGBUS, BUS_ADRALN, NULL, "alignment check", alignment_check) -#undef IP +DEFINE_IDTENTRY_ERRORCODE(exc_segment_not_present) +{ + do_error_trap(regs, error_code, "segment not present", X86_TRAP_NP, + SIGBUS, 0, NULL); +} + +DEFINE_IDTENTRY_ERRORCODE(exc_stack_segment) +{ + do_error_trap(regs, error_code, "stack segment", X86_TRAP_SS, SIGBUS, + 0, NULL); +} + +DEFINE_IDTENTRY_ERRORCODE(exc_alignment_check) +{ + char *str = "alignment check"; + + if (notify_die(DIE_TRAP, str, regs, error_code, X86_TRAP_AC, SIGBUS) == NOTIFY_STOP) + return; + + if (!user_mode(regs)) + die("Split lock detected\n", regs, error_code); + + local_irq_enable(); + + if (handle_user_split_lock(regs, error_code)) + goto out; + + do_trap(X86_TRAP_AC, SIGBUS, "alignment check", regs, + error_code, BUS_ADRALN, NULL); + +out: + local_irq_disable(); +} #ifdef CONFIG_VMAP_STACK -__visible void __noreturn handle_stack_overflow(const char *message, - struct pt_regs 
*regs, - unsigned long fault_address) +__visible void __noreturn handle_stack_overflow(struct pt_regs *regs, + unsigned long fault_address, + struct stack_info *info) { - printk(KERN_EMERG "BUG: stack guard page was hit at %p (stack is %p..%p)\n", - (void *)fault_address, current->stack, - (char *)current->stack + THREAD_SIZE - 1); - die(message, regs, 0); + const char *name = stack_type_name(info->type); + + printk(KERN_EMERG "BUG: %s stack guard page was hit at %p (stack is %p..%p)\n", + name, (void *)fault_address, info->begin, info->end); + + die("stack guard page", regs, 0); /* Be absolutely certain we don't return. */ - panic("%s", message); + panic("%s stack guard hit", name); } #endif -#if defined(CONFIG_X86_64) || defined(CONFIG_DOUBLEFAULT) /* * Runs on an IST stack for x86_64 and on a special task stack for x86_32. * @@ -322,12 +430,20 @@ __visible void __noreturn handle_stack_overflow(const char *message, * from the TSS. Returning is, in principle, okay, but changes to regs will * be lost. If, for some reason, we need to return to a context with modified * regs, the shim code could be adjusted to synchronize the registers. + * + * The 32bit #DF shim provides CR2 already as an argument. On 64bit it needs + * to be read before doing anything else. */ -dotraplinkage void do_double_fault(struct pt_regs *regs, long error_code, unsigned long cr2) +DEFINE_IDTENTRY_DF(exc_double_fault) { static const char str[] = "double fault"; struct task_struct *tsk = current; +#ifdef CONFIG_VMAP_STACK + unsigned long address = read_cr2(); + struct stack_info info; +#endif + #ifdef CONFIG_X86_ESPFIX64 extern unsigned char native_irq_return_iret[]; @@ -343,13 +459,14 @@ dotraplinkage void do_double_fault(struct pt_regs *regs, long error_code, unsign * The net result is that our #GP handler will think that we * entered from usermode with the bad user context. * - * No need for ist_enter here because we don't use RCU. + * No need for nmi_enter() here because we don't use RCU. 
*/ if (((long)regs->sp >> P4D_SHIFT) == ESPFIX_PGD_ENTRY && regs->cs == __KERNEL_CS && regs->ip == (unsigned long)native_irq_return_iret) { struct pt_regs *gpregs = (struct pt_regs *)this_cpu_read(cpu_tss_rw.x86_tss.sp0) - 1; + unsigned long *p = (unsigned long *)regs->sp; /* * regs->sp points to the failing IRET frame on the @@ -357,13 +474,17 @@ dotraplinkage void do_double_fault(struct pt_regs *regs, long error_code, unsign * in gpregs->ss through gpregs->ip. * */ - memmove(&gpregs->ip, (void *)regs->sp, 5*8); + gpregs->ip = p[0]; + gpregs->cs = p[1]; + gpregs->flags = p[2]; + gpregs->sp = p[3]; + gpregs->ss = p[4]; gpregs->orig_ax = 0; /* Missing (lost) #GP error code */ /* * Adjust our frame so that we return straight to the #GP * vector with the expected RSP value. This is safe because - * we won't enable interupts or schedule before we invoke + * we won't enable interrupts or schedule before we invoke * general_protection, so nothing will clobber the stack * frame we just set up. * @@ -371,14 +492,15 @@ dotraplinkage void do_double_fault(struct pt_regs *regs, long error_code, unsign * which is what the stub expects, given that the faulting * RIP will be the IRET instruction. */ - regs->ip = (unsigned long)general_protection; + regs->ip = (unsigned long)asm_exc_general_protection; regs->sp = (unsigned long)&gpregs->orig_ax; return; } #endif - ist_enter(regs); + irqentry_nmi_enter(regs); + instrumentation_begin(); notify_die(DIE_TRAP, str, regs, error_code, X86_TRAP_DF, SIGSEGV); tsk->thread.error_code = error_code; @@ -422,28 +544,29 @@ dotraplinkage void do_double_fault(struct pt_regs *regs, long error_code, unsign * stack even if the actual trigger for the double fault was * something else. 
*/ - if ((unsigned long)task_stack_page(tsk) - 1 - cr2 < PAGE_SIZE) - handle_stack_overflow("kernel stack overflow (double-fault)", regs, cr2); + if (get_stack_guard_info((void *)address, &info)) + handle_stack_overflow(regs, address, &info); #endif pr_emerg("PANIC: double fault, error_code: 0x%lx\n", error_code); die("double fault", regs, error_code); panic("Machine halted."); + instrumentation_end(); } -#endif -dotraplinkage void do_bounds(struct pt_regs *regs, long error_code) +DEFINE_IDTENTRY(exc_bounds) { - RCU_LOCKDEP_WARN(!rcu_is_watching(), "entry code didn't wake RCU"); - if (notify_die(DIE_TRAP, "bounds", regs, error_code, + if (notify_die(DIE_TRAP, "bounds", regs, 0, X86_TRAP_BR, SIGSEGV) == NOTIFY_STOP) return; cond_local_irq_enable(regs); if (!user_mode(regs)) - die("bounds", regs, error_code); + die("bounds", regs, 0); + + do_trap(X86_TRAP_BR, SIGSEGV, "bounds", regs, 0, 0, NULL); - do_trap(X86_TRAP_BR, SIGSEGV, "bounds", regs, error_code, 0, NULL); + cond_local_irq_disable(regs); } enum kernel_gp_hint { @@ -462,13 +585,15 @@ static enum kernel_gp_hint get_kernel_gp_address(struct pt_regs *regs, { u8 insn_buf[MAX_INSN_SIZE]; struct insn insn; + int ret; - if (probe_kernel_read(insn_buf, (void *)regs->ip, MAX_INSN_SIZE)) + if (copy_from_kernel_nofault(insn_buf, (void *)regs->ip, + MAX_INSN_SIZE)) return GP_NO_HINT; - kernel_insn_init(&insn, insn_buf, MAX_INSN_SIZE); - insn_get_modrm(&insn); - insn_get_sib(&insn); + ret = insn_decode_kernel(&insn, insn_buf); + if (ret < 0) + return GP_NO_HINT; *addr = (unsigned long)insn_get_addr_ref(&insn, regs); if (*addr == -1UL) @@ -490,58 +615,152 @@ static enum kernel_gp_hint get_kernel_gp_address(struct pt_regs *regs, #define GPFSTR "general protection fault" -dotraplinkage void do_general_protection(struct pt_regs *regs, long error_code) +static bool fixup_iopl_exception(struct pt_regs *regs) +{ + struct thread_struct *t = ¤t->thread; + unsigned char byte; + unsigned long ip; + + if 
(!IS_ENABLED(CONFIG_X86_IOPL_IOPERM) || t->iopl_emul != 3) + return false; + + if (insn_get_effective_ip(regs, &ip)) + return false; + + if (get_user(byte, (const char __user *)ip)) + return false; + + if (byte != 0xfa && byte != 0xfb) + return false; + + if (!t->iopl_warn && printk_ratelimit()) { + pr_err("%s[%d] attempts to use CLI/STI, pretending it's a NOP, ip:%lx", + current->comm, task_pid_nr(current), ip); + print_vma_addr(KERN_CONT " in ", ip); + pr_cont("\n"); + t->iopl_warn = 1; + } + + regs->ip += 1; + return true; +} + +/* + * The unprivileged ENQCMD instruction generates #GPs if the + * IA32_PASID MSR has not been populated. If possible, populate + * the MSR from a PASID previously allocated to the mm. + */ +static bool try_fixup_enqcmd_gp(void) +{ +#ifdef CONFIG_IOMMU_SVA + u32 pasid; + + /* + * MSR_IA32_PASID is managed using XSAVE. Directly + * writing to the MSR is only possible when fpregs + * are valid and the fpstate is not. This is + * guaranteed when handling a userspace exception + * in *before* interrupts are re-enabled. + */ + lockdep_assert_irqs_disabled(); + + /* + * Hardware without ENQCMD will not generate + * #GPs that can be fixed up here. + */ + if (!cpu_feature_enabled(X86_FEATURE_ENQCMD)) + return false; + + pasid = current->mm->pasid; + + /* + * If the mm has not been allocated a + * PASID, the #GP can not be fixed up. + */ + if (!pasid_valid(pasid)) + return false; + + /* + * Did this thread already have its PASID activated? + * If so, the #GP must be from something else. 
+ */ + if (current->pasid_activated) + return false; + + wrmsrl(MSR_IA32_PASID, pasid | MSR_IA32_PASID_VALID); + current->pasid_activated = 1; + + return true; +#else + return false; +#endif +} + +static bool gp_try_fixup_and_notify(struct pt_regs *regs, int trapnr, + unsigned long error_code, const char *str) +{ + if (fixup_exception(regs, trapnr, error_code, 0)) + return true; + + current->thread.error_code = error_code; + current->thread.trap_nr = trapnr; + + /* + * To be potentially processing a kprobe fault and to trust the result + * from kprobe_running(), we have to be non-preemptible. + */ + if (!preemptible() && kprobe_running() && + kprobe_fault_handler(regs, trapnr)) + return true; + + return notify_die(DIE_GPF, str, regs, error_code, trapnr, SIGSEGV) == NOTIFY_STOP; +} + +static void gp_user_force_sig_segv(struct pt_regs *regs, int trapnr, + unsigned long error_code, const char *str) +{ + current->thread.error_code = error_code; + current->thread.trap_nr = trapnr; + show_signal(current, SIGSEGV, "", str, regs, error_code); + force_sig(SIGSEGV); +} + +DEFINE_IDTENTRY_ERRORCODE(exc_general_protection) { char desc[sizeof(GPFSTR) + 50 + 2*sizeof(unsigned long) + 1] = GPFSTR; enum kernel_gp_hint hint = GP_NO_HINT; - struct task_struct *tsk; unsigned long gp_addr; - int ret; - RCU_LOCKDEP_WARN(!rcu_is_watching(), "entry code didn't wake RCU"); + if (user_mode(regs) && try_fixup_enqcmd_gp()) + return; + cond_local_irq_enable(regs); if (static_cpu_has(X86_FEATURE_UMIP)) { if (user_mode(regs) && fixup_umip_exception(regs)) - return; + goto exit; } if (v8086_mode(regs)) { local_irq_enable(); handle_vm86_fault((struct kernel_vm86_regs *) regs, error_code); + local_irq_disable(); return; } - tsk = current; - if (user_mode(regs)) { - tsk->thread.error_code = error_code; - tsk->thread.trap_nr = X86_TRAP_GP; + if (fixup_iopl_exception(regs)) + goto exit; - show_signal(tsk, SIGSEGV, "", desc, regs, error_code); - force_sig(SIGSEGV); + if (fixup_vdso_exception(regs, 
X86_TRAP_GP, error_code, 0)) + goto exit; - return; + gp_user_force_sig_segv(regs, X86_TRAP_GP, error_code, desc); + goto exit; } - if (fixup_exception(regs, X86_TRAP_GP, error_code, 0)) - return; - - tsk->thread.error_code = error_code; - tsk->thread.trap_nr = X86_TRAP_GP; - - /* - * To be potentially processing a kprobe fault and to trust the result - * from kprobe_running(), we have to be non-preemptible. - */ - if (!preemptible() && - kprobe_running() && - kprobe_fault_handler(regs, X86_TRAP_GP)) - return; - - ret = notify_die(DIE_GPF, desc, regs, error_code, X86_TRAP_GP, SIGSEGV); - if (ret == NOTIFY_STOP) - return; + if (gp_try_fixup_and_notify(regs, X86_TRAP_GP, error_code, desc)) + goto exit; if (error_code) snprintf(desc, sizeof(desc), "segment-related " GPFSTR); @@ -563,46 +782,73 @@ dotraplinkage void do_general_protection(struct pt_regs *regs, long error_code) die_addr(desc, regs, error_code, gp_addr); +exit: + cond_local_irq_disable(regs); } -NOKPROBE_SYMBOL(do_general_protection); -dotraplinkage void notrace do_int3(struct pt_regs *regs, long error_code) +static bool do_int3(struct pt_regs *regs) { - if (poke_int3_handler(regs)) - return; + int res; - /* - * Use ist_enter despite the fact that we don't use an IST stack. - * We can be called from a kprobe in non-CONTEXT_KERNEL kernel - * mode or even during context tracking state changes. - * - * This means that we can't schedule. That's okay. 
- */ - ist_enter(regs); - RCU_LOCKDEP_WARN(!rcu_is_watching(), "entry code didn't wake RCU"); #ifdef CONFIG_KGDB_LOW_LEVEL_TRAP - if (kgdb_ll_trap(DIE_INT3, "int3", regs, error_code, X86_TRAP_BP, - SIGTRAP) == NOTIFY_STOP) - goto exit; + if (kgdb_ll_trap(DIE_INT3, "int3", regs, 0, X86_TRAP_BP, + SIGTRAP) == NOTIFY_STOP) + return true; #endif /* CONFIG_KGDB_LOW_LEVEL_TRAP */ #ifdef CONFIG_KPROBES if (kprobe_int3_handler(regs)) - goto exit; + return true; #endif + res = notify_die(DIE_INT3, "int3", regs, 0, X86_TRAP_BP, SIGTRAP); - if (notify_die(DIE_INT3, "int3", regs, error_code, X86_TRAP_BP, - SIGTRAP) == NOTIFY_STOP) - goto exit; + return res == NOTIFY_STOP; +} +NOKPROBE_SYMBOL(do_int3); + +static void do_int3_user(struct pt_regs *regs) +{ + if (do_int3(regs)) + return; cond_local_irq_enable(regs); - do_trap(X86_TRAP_BP, SIGTRAP, "int3", regs, error_code, 0, NULL); + do_trap(X86_TRAP_BP, SIGTRAP, "int3", regs, 0, 0, NULL); cond_local_irq_disable(regs); +} -exit: - ist_exit(regs); +DEFINE_IDTENTRY_RAW(exc_int3) +{ + /* + * poke_int3_handler() is completely self contained code; it does (and + * must) *NOT* call out to anything, lest it hits upon yet another + * INT3. + */ + if (poke_int3_handler(regs)) + return; + + /* + * irqentry_enter_from_user_mode() uses static_branch_{,un}likely() + * and therefore can trigger INT3, hence poke_int3_handler() must + * be done before. If the entry came from kernel mode, then use + * nmi_enter() because the INT3 could have been hit in any context + * including NMI. 
+ */ + if (user_mode(regs)) { + irqentry_enter_from_user_mode(regs); + instrumentation_begin(); + do_int3_user(regs); + instrumentation_end(); + irqentry_exit_to_user_mode(regs); + } else { + irqentry_state_t irq_state = irqentry_nmi_enter(regs); + + instrumentation_begin(); + if (!do_int3(regs)) + die("int3", regs, 0); + instrumentation_end(); + irqentry_nmi_exit(regs, irq_state); + } } -NOKPROBE_SYMBOL(do_int3); #ifdef CONFIG_X86_64 /* @@ -610,23 +856,61 @@ NOKPROBE_SYMBOL(do_int3); * to switch to the normal thread stack if the interrupted code was in * user mode. The actual stack switch is done in entry_64.S */ -asmlinkage __visible notrace struct pt_regs *sync_regs(struct pt_regs *eregs) +asmlinkage __visible noinstr struct pt_regs *sync_regs(struct pt_regs *eregs) { struct pt_regs *regs = (struct pt_regs *)this_cpu_read(cpu_current_top_of_stack) - 1; if (regs != eregs) *regs = *eregs; return regs; } -NOKPROBE_SYMBOL(sync_regs); -struct bad_iret_stack { - void *error_entry_ret; - struct pt_regs regs; -}; +#ifdef CONFIG_AMD_MEM_ENCRYPT +asmlinkage __visible noinstr struct pt_regs *vc_switch_off_ist(struct pt_regs *regs) +{ + unsigned long sp, *stack; + struct stack_info info; + struct pt_regs *regs_ret; + + /* + * In the SYSCALL entry path the RSP value comes from user-space - don't + * trust it and switch to the current kernel stack + */ + if (ip_within_syscall_gap(regs)) { + sp = this_cpu_read(cpu_current_top_of_stack); + goto sync; + } -asmlinkage __visible notrace -struct bad_iret_stack *fixup_bad_iret(struct bad_iret_stack *s) + /* + * From here on the RSP value is trusted. Now check whether entry + * happened from a safe stack. Not safe are the entry or unknown stacks, + * use the fall-back stack instead in this case. 
+ */ + sp = regs->sp; + stack = (unsigned long *)sp; + + if (!get_stack_info_noinstr(stack, current, &info) || info.type == STACK_TYPE_ENTRY || + info.type > STACK_TYPE_EXCEPTION_LAST) + sp = __this_cpu_ist_top_va(VC2); + +sync: + /* + * Found a safe stack - switch to it as if the entry didn't happen via + * IST stack. The code below only copies pt_regs, the real switch happens + * in assembly code. + */ + sp = ALIGN_DOWN(sp, 8) - sizeof(*regs_ret); + + regs_ret = (struct pt_regs *)sp; + *regs_ret = *regs; + + return regs_ret; +} +#endif + +asmlinkage __visible noinstr struct pt_regs *fixup_bad_iret(struct pt_regs *bad_regs) { + struct pt_regs tmp, *new_stack; + /* * This is called from entry_64.S early in handling a fault * caused by a bad iret to user mode. To handle the fault @@ -635,19 +919,20 @@ struct bad_iret_stack *fixup_bad_iret(struct bad_iret_stack *s) * just below the IRET frame) and we want to pretend that the * exception came from the IRET target. */ - struct bad_iret_stack *new_stack = - (struct bad_iret_stack *)this_cpu_read(cpu_tss_rw.x86_tss.sp0) - 1; + new_stack = (struct pt_regs *)__this_cpu_read(cpu_tss_rw.x86_tss.sp0) - 1; - /* Copy the IRET target to the new stack. */ - memmove(&new_stack->regs.ip, (void *)s->regs.sp, 5*8); + /* Copy the IRET target to the temporary storage. */ + __memcpy(&tmp.ip, (void *)bad_regs->sp, 5*8); /* Copy the remainder of the stack from the current stack. 
*/ - memmove(new_stack, s, offsetof(struct bad_iret_stack, regs.ip)); + __memcpy(&tmp, bad_regs, offsetof(struct pt_regs, ip)); - BUG_ON(!user_mode(&new_stack->regs)); + /* Update the entry stack */ + __memcpy(new_stack, &tmp, sizeof(tmp)); + + BUG_ON(!user_mode(new_stack)); return new_stack; } -NOKPROBE_SYMBOL(fixup_bad_iret); #endif static bool is_sysenter_singlestep(struct pt_regs *regs) @@ -673,6 +958,28 @@ static bool is_sysenter_singlestep(struct pt_regs *regs) #endif } +static __always_inline unsigned long debug_read_clear_dr6(void) +{ + unsigned long dr6; + + /* + * The Intel SDM says: + * + * Certain debug exceptions may clear bits 0-3. The remaining + * contents of the DR6 register are never cleared by the + * processor. To avoid confusion in identifying debug + * exceptions, debug handlers should clear the register before + * returning to the interrupted task. + * + * Keep it simple: clear DR6 immediately. + */ + get_debugreg(dr6, 6); + set_debugreg(DR6_RESERVED, 6); + dr6 ^= DR6_RESERVED; /* Flip to positive polarity */ + + return dr6; +} + /* * Our handling of the processor debug registers is non-trivial. * We do not clear them on entry and exit from the kernel. Therefore @@ -697,116 +1004,200 @@ static bool is_sysenter_singlestep(struct pt_regs *regs) * * May run on IST stack. */ -dotraplinkage void do_debug(struct pt_regs *regs, long error_code) + +static bool notify_debug(struct pt_regs *regs, unsigned long *dr6) { - struct task_struct *tsk = current; - int user_icebp = 0; - unsigned long dr6; - int si_code; + /* + * Notifiers will clear bits in @dr6 to indicate the event has been + * consumed - hw_breakpoint_handler(), single_stop_cont(). + * + * Notifiers will set bits in @virtual_dr6 to indicate the desire + * for signals - ptrace_triggered(), kgdb_hw_overflow_handler(). 
+ */ + if (notify_die(DIE_DEBUG, "debug", regs, (long)dr6, 0, SIGTRAP) == NOTIFY_STOP) + return true; - ist_enter(regs); + return false; +} - get_debugreg(dr6, 6); +static __always_inline void exc_debug_kernel(struct pt_regs *regs, + unsigned long dr6) +{ /* - * The Intel SDM says: + * Disable breakpoints during exception handling; recursive exceptions + * are exceedingly 'fun'. * - * Certain debug exceptions may clear bits 0-3. The remaining - * contents of the DR6 register are never cleared by the - * processor. To avoid confusion in identifying debug - * exceptions, debug handlers should clear the register before - * returning to the interrupted task. + * Since this function is NOKPROBE, and that also applies to + * HW_BREAKPOINT_X, we can't hit a breakpoint before this (XXX except a + * HW_BREAKPOINT_W on our stack) * - * Keep it simple: clear DR6 immediately. + * Entry text is excluded for HW_BP_X and cpu_entry_area, which + * includes the entry stack is excluded for everything. */ - set_debugreg(0, 6); - - /* Filter out all the reserved bits which are preset to 1 */ - dr6 &= ~DR6_RESERVED; + unsigned long dr7 = local_db_save(); + irqentry_state_t irq_state = irqentry_nmi_enter(regs); + instrumentation_begin(); /* - * The SDM says "The processor clears the BTF flag when it - * generates a debug exception." Clear TIF_BLOCKSTEP to keep - * TIF_BLOCKSTEP in sync with the hardware BTF flag. + * If something gets miswired and we end up here for a user mode + * #DB, we will malfunction. */ - clear_tsk_thread_flag(tsk, TIF_BLOCKSTEP); + WARN_ON_ONCE(user_mode(regs)); - if (unlikely(!user_mode(regs) && (dr6 & DR_STEP) && - is_sysenter_singlestep(regs))) { - dr6 &= ~DR_STEP; - if (!dr6) - goto exit; + if (test_thread_flag(TIF_BLOCKSTEP)) { /* - * else we might have gotten a single-step trap and hit a - * watchpoint at the same time, in which case we should fall - * through and handle the watchpoint. 
+ * The SDM says "The processor clears the BTF flag when it + * generates a debug exception." but PTRACE_BLOCKSTEP requested + * it for userspace, but we just took a kernel #DB, so re-set + * BTF. */ + unsigned long debugctl; + + rdmsrl(MSR_IA32_DEBUGCTLMSR, debugctl); + debugctl |= DEBUGCTLMSR_BTF; + wrmsrl(MSR_IA32_DEBUGCTLMSR, debugctl); } /* - * If dr6 has no reason to give us about the origin of this trap, - * then it's very likely the result of an icebp/int01 trap. - * User wants a sigtrap for that. + * Catch SYSENTER with TF set and clear DR_STEP. If this hit a + * watchpoint at the same time then that will still be handled. + */ + if ((dr6 & DR_STEP) && is_sysenter_singlestep(regs)) + dr6 &= ~DR_STEP; + + /* + * The kernel doesn't use INT1 */ - if (!dr6 && user_mode(regs)) - user_icebp = 1; + if (!dr6) + goto out; - /* Store the virtualized DR6 value */ - tsk->thread.debugreg6 = dr6; + if (notify_debug(regs, &dr6)) + goto out; -#ifdef CONFIG_KPROBES - if (kprobe_debug_handler(regs)) - goto exit; -#endif + /* + * The kernel doesn't use TF single-step outside of: + * + * - Kprobes, consumed through kprobe_debug_handler() + * - KGDB, consumed through notify_debug() + * + * So if we get here with DR_STEP set, something is wonky. + * + * A known way to trigger this is through QEMU's GDB stub, + * which leaks #DB into the guest and causes IST recursion. + */ + if (WARN_ON_ONCE(dr6 & DR_STEP)) + regs->flags &= ~X86_EFLAGS_TF; +out: + instrumentation_end(); + irqentry_nmi_exit(regs, irq_state); - if (notify_die(DIE_DEBUG, "debug", regs, (long)&dr6, error_code, - SIGTRAP) == NOTIFY_STOP) - goto exit; + local_db_restore(dr7); +} + +static __always_inline void exc_debug_user(struct pt_regs *regs, + unsigned long dr6) +{ + bool icebp; + + /* + * If something gets miswired and we end up here for a kernel mode + * #DB, we will malfunction. 
+ */ + WARN_ON_ONCE(!user_mode(regs)); + + /* + * NB: We can't easily clear DR7 here because + * irqentry_exit_to_usermode() can invoke ptrace, schedule, access + * user memory, etc. This means that a recursive #DB is possible. If + * this happens, that #DB will hit exc_debug_kernel() and clear DR7. + * Since we're not on the IST stack right now, everything will be + * fine. + */ + + irqentry_enter_from_user_mode(regs); + instrumentation_begin(); /* - * Let others (NMI) know that the debug stack is in use - * as we may switch to the interrupt stack. + * Start the virtual/ptrace DR6 value with just the DR_STEP mask + * of the real DR6. ptrace_triggered() will set the DR_TRAPn bits. + * + * Userspace expects DR_STEP to be visible in ptrace_get_debugreg(6) + * even if it is not the result of PTRACE_SINGLESTEP. + */ + current->thread.virtual_dr6 = (dr6 & DR_STEP); + + /* + * The SDM says "The processor clears the BTF flag when it + * generates a debug exception." Clear TIF_BLOCKSTEP to keep + * TIF_BLOCKSTEP in sync with the hardware BTF flag. */ - debug_stack_usage_inc(); + clear_thread_flag(TIF_BLOCKSTEP); + + /* + * If dr6 has no reason to give us about the origin of this trap, + * then it's very likely the result of an icebp/int01 trap. + * User wants a sigtrap for that. + */ + icebp = !dr6; + + if (notify_debug(regs, &dr6)) + goto out; /* It's safe to allow irq's after DR6 has been saved */ - cond_local_irq_enable(regs); + local_irq_enable(); if (v8086_mode(regs)) { - handle_vm86_trap((struct kernel_vm86_regs *) regs, error_code, - X86_TRAP_DB); - cond_local_irq_disable(regs); - debug_stack_usage_dec(); - goto exit; + handle_vm86_trap((struct kernel_vm86_regs *)regs, 0, X86_TRAP_DB); + goto out_irq; } - if (WARN_ON_ONCE((dr6 & DR_STEP) && !user_mode(regs))) { - /* - * Historical junk that used to handle SYSENTER single-stepping. - * This should be unreachable now. If we survive for a while - * without anyone hitting this warning, we'll turn this into - * an oops. 
- */ - tsk->thread.debugreg6 &= ~DR_STEP; - set_tsk_thread_flag(tsk, TIF_SINGLESTEP); - regs->flags &= ~X86_EFLAGS_TF; - } - si_code = get_si_code(tsk->thread.debugreg6); - if (tsk->thread.debugreg6 & (DR_STEP | DR_TRAP_BITS) || user_icebp) - send_sigtrap(regs, error_code, si_code); - cond_local_irq_disable(regs); - debug_stack_usage_dec(); + /* #DB for bus lock can only be triggered from userspace. */ + if (dr6 & DR_BUS_LOCK) + handle_bus_lock(regs); -exit: - ist_exit(regs); + /* Add the virtual_dr6 bits for signals. */ + dr6 |= current->thread.virtual_dr6; + if (dr6 & (DR_STEP | DR_TRAP_BITS) || icebp) + send_sigtrap(regs, 0, get_si_code(dr6)); + +out_irq: + local_irq_disable(); +out: + instrumentation_end(); + irqentry_exit_to_user_mode(regs); +} + +#ifdef CONFIG_X86_64 +/* IST stack entry */ +DEFINE_IDTENTRY_DEBUG(exc_debug) +{ + exc_debug_kernel(regs, debug_read_clear_dr6()); +} + +/* User entry, runs on regular task stack */ +DEFINE_IDTENTRY_DEBUG_USER(exc_debug) +{ + exc_debug_user(regs, debug_read_clear_dr6()); +} +#else +/* 32 bit does not have separate entry points. 
*/ +DEFINE_IDTENTRY_RAW(exc_debug) +{ + unsigned long dr6 = debug_read_clear_dr6(); + + if (user_mode(regs)) + exc_debug_user(regs, dr6); + else + exc_debug_kernel(regs, dr6); } -NOKPROBE_SYMBOL(do_debug); +#endif /* * Note that we play around with the 'TS' bit in an attempt to get * the correct behaviour even in the presence of the asynchronous * IRQ13 behaviour */ -static void math_error(struct pt_regs *regs, int error_code, int trapnr) +static void math_error(struct pt_regs *regs, int trapnr) { struct task_struct *task = current; struct fpu *fpu = &task->thread.fpu; @@ -817,60 +1208,122 @@ static void math_error(struct pt_regs *regs, int error_code, int trapnr) cond_local_irq_enable(regs); if (!user_mode(regs)) { - if (fixup_exception(regs, trapnr, error_code, 0)) - return; + if (fixup_exception(regs, trapnr, 0, 0)) + goto exit; - task->thread.error_code = error_code; + task->thread.error_code = 0; task->thread.trap_nr = trapnr; - if (notify_die(DIE_TRAP, str, regs, error_code, - trapnr, SIGFPE) != NOTIFY_STOP) - die(str, regs, error_code); - return; + if (notify_die(DIE_TRAP, str, regs, 0, trapnr, + SIGFPE) != NOTIFY_STOP) + die(str, regs, 0); + goto exit; } /* - * Save the info for the exception handler and clear the error. + * Synchronize the FPU register state to the memory register state + * if necessary. This allows the exception handler to inspect it. 
*/ - fpu__save(fpu); + fpu_sync_fpstate(fpu); task->thread.trap_nr = trapnr; - task->thread.error_code = error_code; + task->thread.error_code = 0; si_code = fpu__exception_code(fpu, trapnr); /* Retry when we get spurious exceptions: */ if (!si_code) - return; + goto exit; + + if (fixup_vdso_exception(regs, trapnr, 0, 0)) + goto exit; force_sig_fault(SIGFPE, si_code, (void __user *)uprobe_get_trap_addr(regs)); +exit: + cond_local_irq_disable(regs); } -dotraplinkage void do_coprocessor_error(struct pt_regs *regs, long error_code) +DEFINE_IDTENTRY(exc_coprocessor_error) { - RCU_LOCKDEP_WARN(!rcu_is_watching(), "entry code didn't wake RCU"); - math_error(regs, error_code, X86_TRAP_MF); + math_error(regs, X86_TRAP_MF); } -dotraplinkage void -do_simd_coprocessor_error(struct pt_regs *regs, long error_code) +DEFINE_IDTENTRY(exc_simd_coprocessor_error) { - RCU_LOCKDEP_WARN(!rcu_is_watching(), "entry code didn't wake RCU"); - math_error(regs, error_code, X86_TRAP_XF); + if (IS_ENABLED(CONFIG_X86_INVD_BUG)) { + /* AMD 486 bug: INVD in CPL 0 raises #XF instead of #GP */ + if (!static_cpu_has(X86_FEATURE_XMM)) { + __exc_general_protection(regs, 0); + return; + } + } + math_error(regs, X86_TRAP_XF); } -dotraplinkage void -do_spurious_interrupt_bug(struct pt_regs *regs, long error_code) +DEFINE_IDTENTRY(exc_spurious_interrupt_bug) { - cond_local_irq_enable(regs); + /* + * This addresses a Pentium Pro Erratum: + * + * PROBLEM: If the APIC subsystem is configured in mixed mode with + * Virtual Wire mode implemented through the local APIC, an + * interrupt vector of 0Fh (Intel reserved encoding) may be + * generated by the local APIC (Int 15). This vector may be + * generated upon receipt of a spurious interrupt (an interrupt + * which is removed before the system receives the INTA sequence) + * instead of the programmed 8259 spurious interrupt vector. 
+ * + * IMPLICATION: The spurious interrupt vector programmed in the + * 8259 is normally handled by an operating system's spurious + * interrupt handler. However, a vector of 0Fh is unknown to some + * operating systems, which would crash if this erratum occurred. + * + * In theory this could be limited to 32bit, but the handler is not + * hurting and who knows which other CPUs suffer from this. + */ +} + +static bool handle_xfd_event(struct pt_regs *regs) +{ + u64 xfd_err; + int err; + + if (!IS_ENABLED(CONFIG_X86_64) || !cpu_feature_enabled(X86_FEATURE_XFD)) + return false; + + rdmsrl(MSR_IA32_XFD_ERR, xfd_err); + if (!xfd_err) + return false; + + wrmsrl(MSR_IA32_XFD_ERR, 0); + + /* Die if that happens in kernel space */ + if (WARN_ON(!user_mode(regs))) + return false; + + local_irq_enable(); + + err = xfd_enable_feature(xfd_err); + + switch (err) { + case -EPERM: + force_sig_fault(SIGILL, ILL_ILLOPC, error_get_trap_addr(regs)); + break; + case -EFAULT: + force_sig(SIGSEGV); + break; + } + + local_irq_disable(); + return true; } -dotraplinkage void -do_device_not_available(struct pt_regs *regs, long error_code) +DEFINE_IDTENTRY(exc_device_not_available) { unsigned long cr0 = read_cr0(); - RCU_LOCKDEP_WARN(!rcu_is_watching(), "entry code didn't wake RCU"); + if (handle_xfd_event(regs)) + return; #ifdef CONFIG_MATH_EMULATION if (!boot_cpu_has(X86_FEATURE_FPU) && (cr0 & X86_CR0_EM)) { @@ -880,6 +1333,8 @@ do_device_not_available(struct pt_regs *regs, long error_code) info.regs = regs; math_emulate(&info); + + cond_local_irq_disable(regs); return; } #endif @@ -894,22 +1349,105 @@ do_device_not_available(struct pt_regs *regs, long error_code) * to kill the task than getting stuck in a never-ending * loop of #NM faults. 
*/ - die("unexpected #NM exception", regs, error_code); + die("unexpected #NM exception", regs, 0); + } +} + +#ifdef CONFIG_INTEL_TDX_GUEST + +#define VE_FAULT_STR "VE fault" + +static void ve_raise_fault(struct pt_regs *regs, long error_code) +{ + if (user_mode(regs)) { + gp_user_force_sig_segv(regs, X86_TRAP_VE, error_code, VE_FAULT_STR); + return; } + + if (gp_try_fixup_and_notify(regs, X86_TRAP_VE, error_code, VE_FAULT_STR)) + return; + + die_addr(VE_FAULT_STR, regs, error_code, 0); +} + +/* + * Virtualization Exceptions (#VE) are delivered to TDX guests due to + * specific guest actions which may happen in either user space or the + * kernel: + * + * * Specific instructions (WBINVD, for example) + * * Specific MSR accesses + * * Specific CPUID leaf accesses + * * Access to specific guest physical addresses + * + * In the settings that Linux will run in, virtualization exceptions are + * never generated on accesses to normal, TD-private memory that has been + * accepted (by BIOS or with tdx_enc_status_changed()). + * + * Syscall entry code has a critical window where the kernel stack is not + * yet set up. Any exception in this window leads to hard to debug issues + * and can be exploited for privilege escalation. Exceptions in the NMI + * entry code also cause issues. Returning from the exception handler with + * IRET will re-enable NMIs and nested NMI will corrupt the NMI stack. + * + * For these reasons, the kernel avoids #VEs during the syscall gap and + * the NMI entry code. Entry code paths do not access TD-shared memory, + * MMIO regions, use #VE triggering MSRs, instructions, or CPUID leaves + * that might generate #VE. VMM can remove memory from TD at any point, + * but access to unaccepted (or missing) private memory leads to VM + * termination, not to #VE. + * + * Similarly to page faults and breakpoints, #VEs are allowed in NMI + * handlers once the kernel is ready to deal with nested NMIs. 
+ * + * During #VE delivery, all interrupts, including NMIs, are blocked until + * TDGETVEINFO is called. It prevents #VE nesting until the kernel reads + * the VE info. + * + * If a guest kernel action which would normally cause a #VE occurs in + * the interrupt-disabled region before TDGETVEINFO, a #DF (fault + * exception) is delivered to the guest which will result in an oops. + * + * The entry code has been audited carefully for following these expectations. + * Changes in the entry code have to be audited for correctness vs. this + * aspect. Similarly to #PF, #VE in these places will expose kernel to + * privilege escalation or may lead to random crashes. + */ +DEFINE_IDTENTRY(exc_virtualization_exception) +{ + struct ve_info ve; + + /* + * NMIs/Machine-checks/Interrupts will be in a disabled state + * till TDGETVEINFO TDCALL is executed. This ensures that VE + * info cannot be overwritten by a nested #VE. + */ + tdx_get_ve_info(&ve); + + cond_local_irq_enable(regs); + + /* + * If tdx_handle_virt_exception() could not process + * it successfully, treat it as #GP(0) and handle it. 
+ */ + if (!tdx_handle_virt_exception(regs, &ve)) + ve_raise_fault(regs, 0); + + cond_local_irq_disable(regs); } -NOKPROBE_SYMBOL(do_device_not_available); + +#endif #ifdef CONFIG_X86_32 -dotraplinkage void do_iret_error(struct pt_regs *regs, long error_code) +DEFINE_IDTENTRY_SW(iret_error) { - RCU_LOCKDEP_WARN(!rcu_is_watching(), "entry code didn't wake RCU"); local_irq_enable(); - - if (notify_die(DIE_TRAP, "iret exception", regs, error_code, + if (notify_die(DIE_TRAP, "iret exception", regs, 0, X86_TRAP_IRET, SIGILL) != NOTIFY_STOP) { - do_trap(X86_TRAP_IRET, SIGILL, "iret exception", regs, error_code, + do_trap(X86_TRAP_IRET, SIGILL, "iret exception", regs, 0, ILL_BADSTK, (void __user *)NULL); } + local_irq_disable(); } #endif @@ -918,25 +1456,12 @@ void __init trap_init(void) /* Init cpu_entry_area before IST entries are set up */ setup_cpu_entry_areas(); - idt_setup_traps(); + /* Init GHCB memory pages when running as an SEV-ES guest */ + sev_es_init_vc_handling(); - /* - * Set the IDT descriptor to a fixed read-only location, so that the - * "sidt" instruction will not leak the location of the kernel, and - * to defend the IDT against arbitrary memory write vulnerabilities. 
- * It will be reloaded in cpu_init() */ - cea_set_pte(CPU_ENTRY_AREA_RO_IDT_VADDR, __pa_symbol(idt_table), - PAGE_KERNEL_RO); - idt_descr.address = CPU_ENTRY_AREA_RO_IDT; - - /* - * Should be a barrier for any external CPU state: - */ + /* Initialize TSS before setting up traps so ISTs work */ + cpu_init_exception_handling(); + /* Setup traps as cpu_init() might #GP */ + idt_setup_traps(); cpu_init(); - - idt_setup_ist_traps(); - - x86_init.irqs.trap_init(); - - idt_setup_debugidt_traps(); } diff --git a/arch/x86/kernel/tsc.c b/arch/x86/kernel/tsc.c index 7e322e2daaf5..cafacb2e58cc 100644 --- a/arch/x86/kernel/tsc.c +++ b/arch/x86/kernel/tsc.c @@ -14,6 +14,7 @@ #include <linux/percpu.h> #include <linux/timex.h> #include <linux/static_key.h> +#include <linux/static_call.h> #include <asm/hpet.h> #include <asm/timer.h> @@ -41,6 +42,7 @@ EXPORT_SYMBOL(tsc_khz); * TSC can be unstable due to cpufreq or due to unsynced TSCs */ static int __read_mostly tsc_unstable; +static unsigned int __initdata tsc_early_khz; static DEFINE_STATIC_KEY_FALSE(__use_tsc); @@ -53,12 +55,18 @@ struct clocksource *art_related_clocksource; struct cyc2ns { struct cyc2ns_data data[2]; /* 0 + 2*16 = 32 */ - seqcount_t seq; /* 32 + 4 = 36 */ + seqcount_latch_t seq; /* 32 + 4 = 36 */ }; /* fits one cacheline */ static DEFINE_PER_CPU_ALIGNED(struct cyc2ns, cyc2ns); +static int __init tsc_early_khz_setup(char *buf) +{ + return kstrtouint(buf, 0, &tsc_early_khz); +} +early_param("tsc_early_khz", tsc_early_khz_setup); + __always_inline void cyc2ns_read_begin(struct cyc2ns_data *data) { int seq, idx; @@ -66,14 +74,14 @@ __always_inline void cyc2ns_read_begin(struct cyc2ns_data *data) preempt_disable_notrace(); do { - seq = this_cpu_read(cyc2ns.seq.sequence); + seq = this_cpu_read(cyc2ns.seq.seqcount.sequence); idx = seq & 1; data->cyc2ns_offset = this_cpu_read(cyc2ns.data[idx].cyc2ns_offset); data->cyc2ns_mul = this_cpu_read(cyc2ns.data[idx].cyc2ns_mul); data->cyc2ns_shift = 
this_cpu_read(cyc2ns.data[idx].cyc2ns_shift); - } while (unlikely(seq != this_cpu_read(cyc2ns.seq.sequence))); + } while (unlikely(seq != this_cpu_read(cyc2ns.seq.seqcount.sequence))); } __always_inline void cyc2ns_read_end(void) @@ -179,7 +187,7 @@ static void __init cyc2ns_init_boot_cpu(void) { struct cyc2ns *c2n = this_cpu_ptr(&cyc2ns); - seqcount_init(&c2n->seq); + seqcount_latch_init(&c2n->seq); __set_cyc2ns_scale(tsc_khz, smp_processor_id(), rdtsc()); } @@ -196,7 +204,7 @@ static void __init cyc2ns_init_secondary_cpus(void) for_each_possible_cpu(cpu) { if (cpu != this_cpu) { - seqcount_init(&c2n->seq); + seqcount_latch_init(&c2n->seq); c2n = per_cpu_ptr(&cyc2ns, cpu); c2n->data[0] = data[0]; c2n->data[1] = data[1]; @@ -247,7 +255,7 @@ unsigned long long sched_clock(void) bool using_native_sched_clock(void) { - return pv_ops.time.sched_clock == native_sched_clock; + return static_call_query(pv_sched_clock) == native_sched_clock; } #else unsigned long long @@ -477,7 +485,7 @@ static unsigned long pit_calibrate_tsc(u32 latch, unsigned long ms, int loopmin) * transition from one expected value to another with a fairly * high accuracy, and we didn't miss any events. We can thus * use the TSC value at the transitions to calculate a pretty - * good value for the TSC frequencty. + * good value for the TSC frequency. */ static inline int pit_verify_msb(unsigned char val) { @@ -732,7 +740,7 @@ static unsigned long pit_hpet_ptimer_calibrate_cpu(void) * 2) Reference counter. If available we use the HPET or the * PMTIMER as a reference to check the sanity of that value. * We use separate TSC readouts and check inside of the - * reference read for any possible disturbance. We dicard + * reference read for any possible disturbance. We discard * disturbed values here as well. We do that around the PIT * calibration delay loop as we have to wait for a certain * amount of time anyway. 
@@ -1072,7 +1080,7 @@ static void tsc_resume(struct clocksource *cs) * very small window right after one CPU updated cycle_last under * xtime/vsyscall_gtod lock and the other CPU reads a TSC value which * is smaller than the cycle_last reference value due to a TSC which - * is slighty behind. This delta is nowhere else observable, but in + * is slightly behind. This delta is nowhere else observable, but in * that case it results in a forward time jump in the range of hours * due to the unsigned delta calculation of the time keeping core * code, which is necessary to support wrapping clocksources like pm @@ -1108,17 +1116,25 @@ static void tsc_cs_tick_stable(struct clocksource *cs) sched_clock_tick_stable(); } +static int tsc_cs_enable(struct clocksource *cs) +{ + vclocks_set_used(VDSO_CLOCKMODE_TSC); + return 0; +} + /* * .mask MUST be CLOCKSOURCE_MASK(64). See comment above read_tsc() */ static struct clocksource clocksource_tsc_early = { - .name = "tsc-early", - .rating = 299, - .read = read_tsc, - .mask = CLOCKSOURCE_MASK(64), - .flags = CLOCK_SOURCE_IS_CONTINUOUS | + .name = "tsc-early", + .rating = 299, + .uncertainty_margin = 32 * NSEC_PER_MSEC, + .read = read_tsc, + .mask = CLOCKSOURCE_MASK(64), + .flags = CLOCK_SOURCE_IS_CONTINUOUS | CLOCK_SOURCE_MUST_VERIFY, - .archdata = { .vclock_mode = VCLOCK_TSC }, + .vdso_clock_mode = VDSO_CLOCKMODE_TSC, + .enable = tsc_cs_enable, .resume = tsc_resume, .mark_unstable = tsc_cs_mark_unstable, .tick_stable = tsc_cs_tick_stable, @@ -1131,14 +1147,16 @@ static struct clocksource clocksource_tsc_early = { * been found good. 
*/ static struct clocksource clocksource_tsc = { - .name = "tsc", - .rating = 300, - .read = read_tsc, - .mask = CLOCKSOURCE_MASK(64), - .flags = CLOCK_SOURCE_IS_CONTINUOUS | + .name = "tsc", + .rating = 300, + .read = read_tsc, + .mask = CLOCKSOURCE_MASK(64), + .flags = CLOCK_SOURCE_IS_CONTINUOUS | CLOCK_SOURCE_VALID_FOR_HRES | - CLOCK_SOURCE_MUST_VERIFY, - .archdata = { .vclock_mode = VCLOCK_TSC }, + CLOCK_SOURCE_MUST_VERIFY | + CLOCK_SOURCE_VERIFY_PERCPU, + .vdso_clock_mode = VDSO_CLOCKMODE_TSC, + .enable = tsc_cs_enable, .resume = tsc_resume, .mark_unstable = tsc_cs_mark_unstable, .tick_stable = tsc_cs_tick_stable, @@ -1162,6 +1180,12 @@ void mark_tsc_unstable(char *reason) EXPORT_SYMBOL_GPL(mark_tsc_unstable); +static void __init tsc_disable_clocksource_watchdog(void) +{ + clocksource_tsc_early.flags &= ~CLOCK_SOURCE_MUST_VERIFY; + clocksource_tsc.flags &= ~CLOCK_SOURCE_MUST_VERIFY; +} + static void __init check_system_tsc_reliable(void) { #if defined(CONFIG_MGEODEGX1) || defined(CONFIG_MGEODE_LX) || defined(CONFIG_X86_GENERIC) @@ -1178,6 +1202,23 @@ static void __init check_system_tsc_reliable(void) #endif if (boot_cpu_has(X86_FEATURE_TSC_RELIABLE)) tsc_clocksource_reliable = 1; + + /* + * Disable the clocksource watchdog when the system has: + * - TSC running at constant frequency + * - TSC which does not stop in C-States + * - the TSC_ADJUST register which allows to detect even minimal + * modifications + * - not more than two sockets. As the number of sockets cannot be + * evaluated at the early boot stage where this has to be + * invoked, check the number of online memory nodes as a + * fallback solution which is an reasonable estimate. 
+ */ + if (boot_cpu_has(X86_FEATURE_CONSTANT_TSC) && + boot_cpu_has(X86_FEATURE_NONSTOP_TSC) && + boot_cpu_has(X86_FEATURE_TSC_ADJUST) && + nr_online_nodes <= 2) + tsc_disable_clocksource_watchdog(); } /* @@ -1249,7 +1290,7 @@ EXPORT_SYMBOL(convert_art_to_tsc); * corresponding clocksource * @cycles: System counter value * @cs: Clocksource corresponding to system counter value. Used - * by timekeeping code to verify comparibility of two cycle + * by timekeeping code to verify comparability of two cycle * values. */ @@ -1369,9 +1410,6 @@ static int __init init_tsc_clocksource(void) if (tsc_unstable) goto unreg; - if (tsc_clocksource_reliable || no_tsc_watchdog) - clocksource_tsc.flags &= ~CLOCK_SOURCE_MUST_VERIFY; - if (boot_cpu_has(X86_FEATURE_NONSTOP_TSC_S3)) clocksource_tsc.flags |= CLOCK_SOURCE_SUSPEND_NONSTOP; @@ -1404,7 +1442,10 @@ static bool __init determine_cpu_tsc_frequencies(bool early) if (early) { cpu_khz = x86_platform.calibrate_cpu(); - tsc_khz = x86_platform.calibrate_tsc(); + if (tsc_early_khz) + tsc_khz = tsc_early_khz; + else + tsc_khz = x86_platform.calibrate_tsc(); } else { /* We should not be here with non-native cpu calibration */ WARN_ON(x86_platform.calibrate_cpu != native_calibrate_cpu); @@ -1446,6 +1487,9 @@ static unsigned long __init get_loops_per_jiffy(void) static void __init tsc_enable_sched_clock(void) { + loops_per_jiffy = get_loops_per_jiffy(); + use_tsc_delay(); + /* Sanitize TSC ADJUST before cyc2ns gets initialized */ tsc_store_and_check_tsc_adjust(true); cyc2ns_init_boot_cpu(); @@ -1461,8 +1505,6 @@ void __init tsc_early_init(void) return; if (!determine_cpu_tsc_frequencies(true)) return; - loops_per_jiffy = get_loops_per_jiffy(); - tsc_enable_sched_clock(); } @@ -1496,7 +1538,6 @@ void __init tsc_init(void) enable_sched_clock_irqtime(); lpj_fine = get_loops_per_jiffy(); - use_tsc_delay(); check_system_tsc_reliable(); @@ -1506,7 +1547,7 @@ void __init tsc_init(void) } if (tsc_clocksource_reliable || no_tsc_watchdog) - 
clocksource_tsc_early.flags &= ~CLOCK_SOURCE_MUST_VERIFY; + tsc_disable_clocksource_watchdog(); clocksource_register_khz(&clocksource_tsc_early, tsc_khz); detect_art(); diff --git a/arch/x86/kernel/tsc_msr.c b/arch/x86/kernel/tsc_msr.c index e0cbe4f2af49..6555a857a1e6 100644 --- a/arch/x86/kernel/tsc_msr.c +++ b/arch/x86/kernel/tsc_msr.c @@ -7,6 +7,7 @@ */ #include <linux/kernel.h> +#include <linux/thread_info.h> #include <asm/apic.h> #include <asm/cpu_device_id.h> @@ -15,18 +16,46 @@ #include <asm/param.h> #include <asm/tsc.h> -#define MAX_NUM_FREQS 9 +#define MAX_NUM_FREQS 16 /* 4 bits to select the frequency */ + +/* + * The frequency numbers in the SDM are e.g. 83.3 MHz, which does not contain a + * lot of accuracy which leads to clock drift. As far as we know Bay Trail SoCs + * use a 25 MHz crystal and Cherry Trail uses a 19.2 MHz crystal, the crystal + * is the source clk for a root PLL which outputs 1600 and 100 MHz. It is + * unclear if the root PLL outputs are used directly by the CPU clock PLL or + * if there is another PLL in between. + * This does not matter though, we can model the chain of PLLs as a single PLL + * with a quotient equal to the quotients of all PLLs in the chain multiplied. + * So we can create a simplified model of the CPU clock setup using a reference + * clock of 100 MHz plus a quotient which gets us as close to the frequency + * from the SDM as possible. + * For the 83.3 MHz example from above this would give us 100 MHz * 5 / 6 = + * 83 and 1/3 MHz, which matches exactly what has been measured on actual hw. + */ +#define TSC_REFERENCE_KHZ 100000 + +struct muldiv { + u32 multiplier; + u32 divider; +}; /* * If MSR_PERF_STAT[31] is set, the maximum resolved bus ratio can be * read in MSR_PLATFORM_ID[12:8], otherwise in MSR_PERF_STAT[44:40]. * Unfortunately some Intel Atom SoCs aren't quite compliant to this, * so we need manually differentiate SoC families. This is what the - * field msr_plat does. + * field use_msr_plat does. 
*/ struct freq_desc { - u8 msr_plat; /* 1: use MSR_PLATFORM_INFO, 0: MSR_IA32_PERF_STATUS */ + bool use_msr_plat; + struct muldiv muldiv[MAX_NUM_FREQS]; + /* + * Some CPU frequencies in the SDM do not map to known PLL freqs, in + * that case the muldiv array is empty and the freqs array is used. + */ u32 freqs[MAX_NUM_FREQS]; + u32 mask; }; /* @@ -35,41 +64,96 @@ struct freq_desc { * by MSR based on SDM. */ static const struct freq_desc freq_desc_pnw = { - 0, { 0, 0, 0, 0, 0, 99840, 0, 83200 } + .use_msr_plat = false, + .freqs = { 0, 0, 0, 0, 0, 99840, 0, 83200 }, + .mask = 0x07, }; static const struct freq_desc freq_desc_clv = { - 0, { 0, 133200, 0, 0, 0, 99840, 0, 83200 } + .use_msr_plat = false, + .freqs = { 0, 133200, 0, 0, 0, 99840, 0, 83200 }, + .mask = 0x07, }; +/* + * Bay Trail SDM MSR_FSB_FREQ frequencies simplified PLL model: + * 000: 100 * 5 / 6 = 83.3333 MHz + * 001: 100 * 1 / 1 = 100.0000 MHz + * 010: 100 * 4 / 3 = 133.3333 MHz + * 011: 100 * 7 / 6 = 116.6667 MHz + * 100: 100 * 4 / 5 = 80.0000 MHz + */ static const struct freq_desc freq_desc_byt = { - 1, { 83300, 100000, 133300, 116700, 80000, 0, 0, 0 } + .use_msr_plat = true, + .muldiv = { { 5, 6 }, { 1, 1 }, { 4, 3 }, { 7, 6 }, + { 4, 5 } }, + .mask = 0x07, }; +/* + * Cherry Trail SDM MSR_FSB_FREQ frequencies simplified PLL model: + * 0000: 100 * 5 / 6 = 83.3333 MHz + * 0001: 100 * 1 / 1 = 100.0000 MHz + * 0010: 100 * 4 / 3 = 133.3333 MHz + * 0011: 100 * 7 / 6 = 116.6667 MHz + * 0100: 100 * 4 / 5 = 80.0000 MHz + * 0101: 100 * 14 / 15 = 93.3333 MHz + * 0110: 100 * 9 / 10 = 90.0000 MHz + * 0111: 100 * 8 / 9 = 88.8889 MHz + * 1000: 100 * 7 / 8 = 87.5000 MHz + */ static const struct freq_desc freq_desc_cht = { - 1, { 83300, 100000, 133300, 116700, 80000, 93300, 90000, 88900, 87500 } + .use_msr_plat = true, + .muldiv = { { 5, 6 }, { 1, 1 }, { 4, 3 }, { 7, 6 }, + { 4, 5 }, { 14, 15 }, { 9, 10 }, { 8, 9 }, + { 7, 8 } }, + .mask = 0x0f, }; +/* + * Merriefield SDM MSR_FSB_FREQ frequencies simplified PLL 
model: + * 0001: 100 * 1 / 1 = 100.0000 MHz + * 0010: 100 * 4 / 3 = 133.3333 MHz + */ static const struct freq_desc freq_desc_tng = { - 1, { 0, 100000, 133300, 0, 0, 0, 0, 0 } + .use_msr_plat = true, + .muldiv = { { 0, 0 }, { 1, 1 }, { 4, 3 } }, + .mask = 0x07, }; +/* + * Moorefield SDM MSR_FSB_FREQ frequencies simplified PLL model: + * 0000: 100 * 5 / 6 = 83.3333 MHz + * 0001: 100 * 1 / 1 = 100.0000 MHz + * 0010: 100 * 4 / 3 = 133.3333 MHz + * 0011: 100 * 1 / 1 = 100.0000 MHz + */ static const struct freq_desc freq_desc_ann = { - 1, { 83300, 100000, 133300, 100000, 0, 0, 0, 0 } + .use_msr_plat = true, + .muldiv = { { 5, 6 }, { 1, 1 }, { 4, 3 }, { 1, 1 } }, + .mask = 0x0f, }; +/* + * 24 MHz crystal? : 24 * 13 / 4 = 78 MHz + * Frequency step for Lightning Mountain SoC is fixed to 78 MHz, + * so all the frequency entries are 78000. + */ static const struct freq_desc freq_desc_lgm = { - 1, { 78000, 78000, 78000, 78000, 78000, 78000, 78000, 78000 } + .use_msr_plat = true, + .freqs = { 78000, 78000, 78000, 78000, 78000, 78000, 78000, 78000, + 78000, 78000, 78000, 78000, 78000, 78000, 78000, 78000 }, + .mask = 0x0f, }; static const struct x86_cpu_id tsc_msr_cpu_ids[] = { - INTEL_CPU_FAM6(ATOM_SALTWELL_MID, freq_desc_pnw), - INTEL_CPU_FAM6(ATOM_SALTWELL_TABLET, freq_desc_clv), - INTEL_CPU_FAM6(ATOM_SILVERMONT, freq_desc_byt), - INTEL_CPU_FAM6(ATOM_SILVERMONT_MID, freq_desc_tng), - INTEL_CPU_FAM6(ATOM_AIRMONT, freq_desc_cht), - INTEL_CPU_FAM6(ATOM_AIRMONT_MID, freq_desc_ann), - INTEL_CPU_FAM6(ATOM_AIRMONT_NP, freq_desc_lgm), + X86_MATCH_INTEL_FAM6_MODEL(ATOM_SALTWELL_MID, &freq_desc_pnw), + X86_MATCH_INTEL_FAM6_MODEL(ATOM_SALTWELL_TABLET,&freq_desc_clv), + X86_MATCH_INTEL_FAM6_MODEL(ATOM_SILVERMONT, &freq_desc_byt), + X86_MATCH_INTEL_FAM6_MODEL(ATOM_SILVERMONT_MID, &freq_desc_tng), + X86_MATCH_INTEL_FAM6_MODEL(ATOM_AIRMONT, &freq_desc_cht), + X86_MATCH_INTEL_FAM6_MODEL(ATOM_AIRMONT_MID, &freq_desc_ann), + X86_MATCH_INTEL_FAM6_MODEL(ATOM_AIRMONT_NP, &freq_desc_lgm), {} }; 
@@ -81,17 +165,19 @@ static const struct x86_cpu_id tsc_msr_cpu_ids[] = { */ unsigned long cpu_khz_from_msr(void) { - u32 lo, hi, ratio, freq; + u32 lo, hi, ratio, freq, tscref; const struct freq_desc *freq_desc; const struct x86_cpu_id *id; + const struct muldiv *md; unsigned long res; + int index; id = x86_match_cpu(tsc_msr_cpu_ids); if (!id) return 0; freq_desc = (struct freq_desc *)id->driver_data; - if (freq_desc->msr_plat) { + if (freq_desc->use_msr_plat) { rdmsr(MSR_PLATFORM_INFO, lo, hi); ratio = (lo >> 8) & 0xff; } else { @@ -101,12 +187,28 @@ unsigned long cpu_khz_from_msr(void) /* Get FSB FREQ ID */ rdmsr(MSR_FSB_FREQ, lo, hi); + index = lo & freq_desc->mask; + md = &freq_desc->muldiv[index]; - /* Map CPU reference clock freq ID(0-7) to CPU reference clock freq(KHz) */ - freq = freq_desc->freqs[lo & 0x7]; + /* + * Note this also catches cases where the index points to an unpopulated + * part of muldiv, in that case the else will set freq and res to 0. + */ + if (md->divider) { + tscref = TSC_REFERENCE_KHZ * md->multiplier; + freq = DIV_ROUND_CLOSEST(tscref, md->divider); + /* + * Multiplying by ratio before the division has better + * accuracy than just calculating freq * ratio. + */ + res = DIV_ROUND_CLOSEST(tscref * ratio, md->divider); + } else { + freq = freq_desc->freqs[index]; + res = freq * ratio; + } - /* TSC frequency = maximum resolved freq * maximum resolved bus ratio */ - res = freq * ratio; + if (freq == 0) + pr_err("Error MSR_FSB_FREQ index %d is unknown\n", index); #ifdef CONFIG_X86_LOCAL_APIC lapic_timer_period = (freq * 1000) / HZ; diff --git a/arch/x86/kernel/tsc_sync.c b/arch/x86/kernel/tsc_sync.c index 32a818764e03..9452dc9664b5 100644 --- a/arch/x86/kernel/tsc_sync.c +++ b/arch/x86/kernel/tsc_sync.c @@ -30,6 +30,7 @@ struct tsc_adjust { }; static DEFINE_PER_CPU(struct tsc_adjust, tsc_adjust); +static struct timer_list tsc_sync_check_timer; /* * TSC's on different sockets may be reset asynchronously. 
@@ -77,6 +78,46 @@ void tsc_verify_tsc_adjust(bool resume) } } +/* + * Normally the tsc_sync will be checked every time system enters idle + * state, but there is still caveat that a system won't enter idle, + * either because it's too busy or configured purposely to not enter + * idle. + * + * So setup a periodic timer (every 10 minutes) to make sure the check + * is always on. + */ + +#define SYNC_CHECK_INTERVAL (HZ * 600) + +static void tsc_sync_check_timer_fn(struct timer_list *unused) +{ + int next_cpu; + + tsc_verify_tsc_adjust(false); + + /* Run the check for all onlined CPUs in turn */ + next_cpu = cpumask_next(raw_smp_processor_id(), cpu_online_mask); + if (next_cpu >= nr_cpu_ids) + next_cpu = cpumask_first(cpu_online_mask); + + tsc_sync_check_timer.expires += SYNC_CHECK_INTERVAL; + add_timer_on(&tsc_sync_check_timer, next_cpu); +} + +static int __init start_sync_check_timer(void) +{ + if (!cpu_feature_enabled(X86_FEATURE_TSC_ADJUST) || tsc_clocksource_reliable) + return 0; + + timer_setup(&tsc_sync_check_timer, tsc_sync_check_timer_fn, 0); + tsc_sync_check_timer.expires = jiffies + SYNC_CHECK_INTERVAL; + add_timer(&tsc_sync_check_timer); + + return 0; +} +late_initcall(start_sync_check_timer); + static void tsc_sanitize_first_cpu(struct tsc_adjust *cur, s64 bootval, unsigned int cpu, bool bootcpu) { @@ -295,7 +336,7 @@ static cycles_t check_tsc_warp(unsigned int timeout) * But as the TSC is per-logical CPU and can potentially be modified wrongly * by the bios, TSC sync test for smaller duration should be able * to catch such errors. Also this will catch the condition where all the - * cores in the socket doesn't get reset at the same time. + * cores in the socket don't get reset at the same time. */ static inline unsigned int loop_timeout(int cpu) { @@ -472,7 +513,7 @@ retry: /* * Add the result to the previous adjustment value. 
* - * The adjustement value is slightly off by the overhead of the + * The adjustment value is slightly off by the overhead of the * sync mechanism (observed values are ~200 TSC cycles), but this * really depends on CPU, node distance and frequency. So * compensating for this is hard to get right. Experiments show diff --git a/arch/x86/kernel/umip.c b/arch/x86/kernel/umip.c index 4d732a444711..5a4b21389b1d 100644 --- a/arch/x86/kernel/umip.c +++ b/arch/x86/kernel/umip.c @@ -45,11 +45,12 @@ * value that, lies close to the top of the kernel memory. The limit for the GDT * and the IDT are set to zero. * - * Given that SLDT and STR are not commonly used in programs that run on WineHQ - * or DOSEMU2, they are not emulated. - * - * The instruction smsw is emulated to return the value that the register CR0 + * The instruction SMSW is emulated to return the value that the register CR0 * has at boot time as set in the head_32. + * SLDT and STR are emulated to return the values that the kernel programmatically + * assigns: + * - SLDT returns (GDT_ENTRY_LDT * 8) if an LDT has been set, 0 if not. + * - STR returns (GDT_ENTRY_TSS * 8). * * Emulation is provided for both 32-bit and 64-bit processes. * @@ -81,7 +82,7 @@ #define UMIP_INST_SLDT 3 /* 0F 00 /0 */ #define UMIP_INST_STR 4 /* 0F 00 /1 */ -const char * const umip_insns[5] = { +static const char * const umip_insns[5] = { [UMIP_INST_SGDT] = "SGDT", [UMIP_INST_SIDT] = "SIDT", [UMIP_INST_SMSW] = "SMSW", @@ -91,8 +92,8 @@ const char * const umip_insns[5] = { #define umip_pr_err(regs, fmt, ...) \ umip_printk(regs, KERN_ERR, fmt, ##__VA_ARGS__) -#define umip_pr_warn(regs, fmt, ...) \ - umip_printk(regs, KERN_WARNING, fmt, ##__VA_ARGS__) +#define umip_pr_debug(regs, fmt, ...) 
\ + umip_printk(regs, KERN_DEBUG, fmt, ##__VA_ARGS__) /** * umip_printk() - Print a rate-limited message @@ -244,16 +245,34 @@ static int emulate_umip_insn(struct insn *insn, int umip_inst, *data_size += UMIP_GDT_IDT_LIMIT_SIZE; memcpy(data, &dummy_limit, UMIP_GDT_IDT_LIMIT_SIZE); - } else if (umip_inst == UMIP_INST_SMSW) { - unsigned long dummy_value = CR0_STATE; + } else if (umip_inst == UMIP_INST_SMSW || umip_inst == UMIP_INST_SLDT || + umip_inst == UMIP_INST_STR) { + unsigned long dummy_value; + + if (umip_inst == UMIP_INST_SMSW) { + dummy_value = CR0_STATE; + } else if (umip_inst == UMIP_INST_STR) { + dummy_value = GDT_ENTRY_TSS * 8; + } else if (umip_inst == UMIP_INST_SLDT) { +#ifdef CONFIG_MODIFY_LDT_SYSCALL + down_read(&current->mm->context.ldt_usr_sem); + if (current->mm->context.ldt) + dummy_value = GDT_ENTRY_LDT * 8; + else + dummy_value = 0; + up_read(&current->mm->context.ldt_usr_sem); +#else + dummy_value = 0; +#endif + } /* - * Even though the CR0 register has 4 bytes, the number + * For these 3 instructions, the number * of bytes to be copied in the result buffer is determined * by whether the operand is a register or a memory location. * If operand is a register, return as many bytes as the operand * size. If operand is memory, return only the two least - * siginificant bytes of CR0. + * significant bytes. 
*/ if (X86_MODRM_MOD(insn->modrm.value) == 3) *data_size = insn->opnd_bytes; @@ -261,7 +280,6 @@ static int emulate_umip_insn(struct insn *insn, int umip_inst, *data_size = 2; memcpy(data, &dummy_value, *data_size); - /* STR and SLDT are not emulated */ } else { return -EINVAL; } @@ -317,77 +335,36 @@ static void force_sig_info_umip_fault(void __user *addr, struct pt_regs *regs) */ bool fixup_umip_exception(struct pt_regs *regs) { - int not_copied, nr_copied, reg_offset, dummy_data_size, umip_inst; - unsigned long seg_base = 0, *reg_addr; + int nr_copied, reg_offset, dummy_data_size, umip_inst; /* 10 bytes is the maximum size of the result of UMIP instructions */ unsigned char dummy_data[10] = { 0 }; unsigned char buf[MAX_INSN_SIZE]; + unsigned long *reg_addr; void __user *uaddr; struct insn insn; - int seg_defs; if (!regs) return false; /* - * If not in user-space long mode, a custom code segment could be in - * use. This is true in protected mode (if the process defined a local - * descriptor table), or virtual-8086 mode. In most of the cases - * seg_base will be zero as in USER_CS. - */ - if (!user_64bit_mode(regs)) - seg_base = insn_get_seg_base(regs, INAT_SEG_REG_CS); - - if (seg_base == -1L) - return false; - - not_copied = copy_from_user(buf, (void __user *)(seg_base + regs->ip), - sizeof(buf)); - nr_copied = sizeof(buf) - not_copied; - - /* - * The copy_from_user above could have failed if user code is protected - * by a memory protection key. Give up on emulation in such a case. - * Should we issue a page fault? - */ - if (!nr_copied) - return false; - - insn_init(&insn, buf, nr_copied, user_64bit_mode(regs)); - - /* - * Override the default operand and address sizes with what is specified - * in the code segment descriptor. The instruction decoder only sets - * the address size it to either 4 or 8 address bytes and does nothing - * for the operand bytes. 
This OK for most of the cases, but we could - * have special cases where, for instance, a 16-bit code segment - * descriptor is used. - * If there is an address override prefix, the instruction decoder - * correctly updates these values, even for 16-bit defaults. + * Give up on emulation if fetching the instruction failed. Should a + * page fault or a #GP be issued? */ - seg_defs = insn_get_code_seg_params(regs); - if (seg_defs == -EINVAL) + nr_copied = insn_fetch_from_user(regs, buf); + if (nr_copied <= 0) return false; - insn.addr_bytes = INSN_CODE_SEG_ADDR_SZ(seg_defs); - insn.opnd_bytes = INSN_CODE_SEG_OPND_SZ(seg_defs); - - insn_get_length(&insn); - if (nr_copied < insn.length) + if (!insn_decode_from_regs(&insn, regs, buf, nr_copied)) return false; umip_inst = identify_insn(&insn); if (umip_inst < 0) return false; - umip_pr_warn(regs, "%s instruction cannot be used by applications.\n", + umip_pr_debug(regs, "%s instruction cannot be used by applications.\n", umip_insns[umip_inst]); - /* Do not emulate (spoof) SLDT or STR. 
*/ - if (umip_inst == UMIP_INST_STR || umip_inst == UMIP_INST_SLDT) - return false; - - umip_pr_warn(regs, "For now, expensive software emulation returns the result.\n"); + umip_pr_debug(regs, "For now, expensive software emulation returns the result.\n"); if (emulate_umip_insn(&insn, umip_inst, dummy_data, &dummy_data_size, user_64bit_mode(regs))) diff --git a/arch/x86/kernel/unwind_frame.c b/arch/x86/kernel/unwind_frame.c index a224b5ab103f..d8ba93778ae3 100644 --- a/arch/x86/kernel/unwind_frame.c +++ b/arch/x86/kernel/unwind_frame.c @@ -74,13 +74,7 @@ static bool in_entry_code(unsigned long ip) { char *addr = (char *)ip; - if (addr >= __entry_text_start && addr < __entry_text_end) - return true; - - if (addr >= __irqentry_text_start && addr < __irqentry_text_end) - return true; - - return false; + return addr >= __entry_text_start && addr < __entry_text_end; } static inline unsigned long *last_frame(struct unwind_state *state) @@ -189,6 +183,16 @@ static struct pt_regs *decode_frame_pointer(unsigned long *bp) } #endif +/* + * While walking the stack, KMSAN may stomp on stale locals from other + * functions that were marked as uninitialized upon function exit, and + * now hold the call frame information for the current function (e.g. the frame + * pointer). Because KMSAN does not specifically mark call frames as + * initialized, false positive reports are possible. To prevent such reports, + * we mark the functions scanning the stack (here and below) with + * __no_kmsan_checks. 
+ */ +__no_kmsan_checks static bool update_stack_state(struct unwind_state *state, unsigned long *next_bp) { @@ -246,8 +250,7 @@ static bool update_stack_state(struct unwind_state *state, else { addr_p = unwind_get_return_address_ptr(state); addr = READ_ONCE_TASK_STACK(state->task, *addr_p); - state->ip = ftrace_graph_ret_addr(state->task, &state->graph_idx, - addr, addr_p); + state->ip = unwind_recover_ret_addr(state, addr, addr_p); } /* Save the original stack pointer for unwind_dump(): */ @@ -257,6 +260,7 @@ static bool update_stack_state(struct unwind_state *state, return true; } +__no_kmsan_checks bool unwind_next_frame(struct unwind_state *state) { struct pt_regs *regs; @@ -275,13 +279,13 @@ bool unwind_next_frame(struct unwind_state *state) /* * kthreads (other than the boot CPU's idle thread) have some * partial regs at the end of their stack which were placed - * there by copy_thread_tls(). But the regs don't have any + * there by copy_thread(). But the regs don't have any * useful information, so we can skip them. * * This user_mode() check is slightly broader than a PF_KTHREAD * check because it also catches the awkward situation where a * newly forked kthread transitions into a user task by calling - * do_execve(), which eventually clears PF_KTHREAD. + * kernel_execve(), which eventually clears PF_KTHREAD. 
*/ if (!user_mode(regs)) goto the_end; @@ -344,6 +348,9 @@ bad_address: if (IS_ENABLED(CONFIG_X86_32)) goto the_end; + if (state->task != current) + goto the_end; + if (state->regs) { printk_deferred_once(KERN_WARNING "WARNING: kernel stack regs at %p in %s:%d has bad 'bp' value %p\n", diff --git a/arch/x86/kernel/unwind_guess.c b/arch/x86/kernel/unwind_guess.c index c49f10ffd8cd..884d68a6e714 100644 --- a/arch/x86/kernel/unwind_guess.c +++ b/arch/x86/kernel/unwind_guess.c @@ -15,8 +15,7 @@ unsigned long unwind_get_return_address(struct unwind_state *state) addr = READ_ONCE_NOCHECK(*state->sp); - return ftrace_graph_ret_addr(state->task, &state->graph_idx, - addr, state->sp); + return unwind_recover_ret_addr(state, addr, state->sp); } EXPORT_SYMBOL_GPL(unwind_get_return_address); diff --git a/arch/x86/kernel/unwind_orc.c b/arch/x86/kernel/unwind_orc.c index e9cc182aa97e..c059820dfaea 100644 --- a/arch/x86/kernel/unwind_orc.c +++ b/arch/x86/kernel/unwind_orc.c @@ -1,4 +1,5 @@ // SPDX-License-Identifier: GPL-2.0-only +#include <linux/objtool.h> #include <linux/module.h> #include <linux/sort.h> #include <asm/ptrace.h> @@ -8,19 +9,21 @@ #include <asm/orc_lookup.h> #define orc_warn(fmt, ...) \ - printk_deferred_once(KERN_WARNING pr_fmt("WARNING: " fmt), ##__VA_ARGS__) + printk_deferred_once(KERN_WARNING "WARNING: " fmt, ##__VA_ARGS__) + +#define orc_warn_current(args...) 
\ +({ \ + if (state->task == current && !state->error) \ + orc_warn(args); \ +}) extern int __start_orc_unwind_ip[]; extern int __stop_orc_unwind_ip[]; extern struct orc_entry __start_orc_unwind[]; extern struct orc_entry __stop_orc_unwind[]; -static DEFINE_MUTEX(sort_mutex); -int *cur_orc_ip_table = __start_orc_unwind_ip; -struct orc_entry *cur_orc_table = __start_orc_unwind; - -unsigned int lookup_num_blocks; -bool orc_init; +static bool orc_init __ro_after_init; +static unsigned int lookup_num_blocks __ro_after_init; static inline unsigned long orc_ip(const int *ip) { @@ -90,22 +93,27 @@ static struct orc_entry *orc_find(unsigned long ip); static struct orc_entry *orc_ftrace_find(unsigned long ip) { struct ftrace_ops *ops; - unsigned long caller; + unsigned long tramp_addr, offset; ops = ftrace_ops_trampoline(ip); if (!ops) return NULL; + /* Set tramp_addr to the start of the code copied by the trampoline */ if (ops->flags & FTRACE_OPS_FL_SAVE_REGS) - caller = (unsigned long)ftrace_regs_call; + tramp_addr = (unsigned long)ftrace_regs_caller; else - caller = (unsigned long)ftrace_call; + tramp_addr = (unsigned long)ftrace_caller; + + /* Now place tramp_addr to the location within the trampoline ip is at */ + offset = ip - ops->trampoline; + tramp_addr += offset; /* Prevent unlikely recursion */ - if (ip == caller) + if (ip == tramp_addr) return NULL; - return orc_find(caller); + return orc_find(tramp_addr); } #else static struct orc_entry *orc_ftrace_find(unsigned long ip) @@ -125,12 +133,12 @@ static struct orc_entry null_orc_entry = { .sp_offset = sizeof(long), .sp_reg = ORC_REG_SP, .bp_reg = ORC_REG_UNDEFINED, - .type = ORC_TYPE_CALL + .type = UNWIND_HINT_TYPE_CALL }; /* Fake frame pointer entry -- used as a fallback for generated code */ static struct orc_entry orc_fp_entry = { - .type = ORC_TYPE_CALL, + .type = UNWIND_HINT_TYPE_CALL, .sp_reg = ORC_REG_BP, .sp_offset = 16, .bp_reg = ORC_REG_PREV_SP, @@ -142,9 +150,6 @@ static struct orc_entry 
*orc_find(unsigned long ip) { static struct orc_entry *orc; - if (!orc_init) - return NULL; - if (ip == 0) return &null_orc_entry; @@ -175,7 +180,7 @@ static struct orc_entry *orc_find(unsigned long ip) } /* vmlinux .init slow lookup: */ - if (init_kernel_text(ip)) + if (is_kernel_inittext(ip)) return __orc_find(__start_orc_unwind_ip, __start_orc_unwind, __stop_orc_unwind_ip - __start_orc_unwind_ip, ip); @@ -189,6 +194,10 @@ static struct orc_entry *orc_find(unsigned long ip) #ifdef CONFIG_MODULES +static DEFINE_MUTEX(sort_mutex); +static int *cur_orc_ip_table = __start_orc_unwind_ip; +static struct orc_entry *cur_orc_table = __start_orc_unwind; + static void orc_sort_swap(void *_a, void *_b, int size) { struct orc_entry *orc_a, *orc_b; @@ -335,11 +344,11 @@ static bool stack_access_ok(struct unwind_state *state, unsigned long _addr, struct stack_info *info = &state->stack_info; void *addr = (void *)_addr; - if (!on_stack(info, addr, len) && - (get_stack_info(addr, state->task, info, &state->stack_mask))) - return false; + if (on_stack(info, addr, len)) + return true; - return true; + return !get_stack_info(addr, state->task, info, &state->stack_mask) && + on_stack(info, addr, len); } static bool deref_stack_reg(struct unwind_state *state, unsigned long addr, @@ -363,8 +372,8 @@ static bool deref_stack_regs(struct unwind_state *state, unsigned long addr, if (!stack_access_ok(state, addr, sizeof(struct pt_regs))) return false; - *ip = regs->ip; - *sp = regs->sp; + *ip = READ_ONCE_NOCHECK(regs->ip); + *sp = READ_ONCE_NOCHECK(regs->sp); return true; } @@ -376,14 +385,43 @@ static bool deref_stack_iret_regs(struct unwind_state *state, unsigned long addr if (!stack_access_ok(state, addr, IRET_FRAME_SIZE)) return false; - *ip = regs->ip; - *sp = regs->sp; + *ip = READ_ONCE_NOCHECK(regs->ip); + *sp = READ_ONCE_NOCHECK(regs->sp); return true; } +/* + * If state->regs is non-NULL, and points to a full pt_regs, just get the reg + * value from state->regs. 
+ * + * Otherwise, if state->regs just points to IRET regs, and the previous frame + * had full regs, it's safe to get the value from the previous regs. This can + * happen when early/late IRQ entry code gets interrupted by an NMI. + */ +static bool get_reg(struct unwind_state *state, unsigned int reg_off, + unsigned long *val) +{ + unsigned int reg = reg_off/8; + + if (!state->regs) + return false; + + if (state->full_regs) { + *val = READ_ONCE_NOCHECK(((unsigned long *)state->regs)[reg]); + return true; + } + + if (state->prev_regs) { + *val = READ_ONCE_NOCHECK(((unsigned long *)state->prev_regs)[reg]); + return true; + } + + return false; +} + bool unwind_next_frame(struct unwind_state *state) { - unsigned long ip_p, sp, orig_ip = state->ip, prev_sp = state->sp; + unsigned long ip_p, sp, tmp, orig_ip = state->ip, prev_sp = state->sp; enum stack_type prev_type = state->stack_info.type; struct orc_entry *orc; bool indirect = false; @@ -401,8 +439,11 @@ bool unwind_next_frame(struct unwind_state *state) /* * Find the orc_entry associated with the text address. * - * Decrement call return addresses by one so they work for sibling - * calls and calls to noreturn functions. + * For a call frame (as opposed to a signal frame), state->ip points to + * the instruction after the call. That instruction's stack layout + * could be different from the call instruction's layout, for example + * if the call was to a noreturn function. So get the ORC data for the + * call instruction itself. */ orc = orc_find(state->signal ? 
state->ip : state->ip - 1); if (!orc) { @@ -435,7 +476,7 @@ bool unwind_next_frame(struct unwind_state *state) break; case ORC_REG_SP_INDIRECT: - sp = state->sp + orc->sp_offset; + sp = state->sp; indirect = true; break; @@ -445,43 +486,39 @@ bool unwind_next_frame(struct unwind_state *state) break; case ORC_REG_R10: - if (!state->regs || !state->full_regs) { - orc_warn("missing regs for base reg R10 at ip %pB\n", - (void *)state->ip); + if (!get_reg(state, offsetof(struct pt_regs, r10), &sp)) { + orc_warn_current("missing R10 value at %pB\n", + (void *)state->ip); goto err; } - sp = state->regs->r10; break; case ORC_REG_R13: - if (!state->regs || !state->full_regs) { - orc_warn("missing regs for base reg R13 at ip %pB\n", - (void *)state->ip); + if (!get_reg(state, offsetof(struct pt_regs, r13), &sp)) { + orc_warn_current("missing R13 value at %pB\n", + (void *)state->ip); goto err; } - sp = state->regs->r13; break; case ORC_REG_DI: - if (!state->regs || !state->full_regs) { - orc_warn("missing regs for base reg DI at ip %pB\n", - (void *)state->ip); + if (!get_reg(state, offsetof(struct pt_regs, di), &sp)) { + orc_warn_current("missing RDI value at %pB\n", + (void *)state->ip); goto err; } - sp = state->regs->di; break; case ORC_REG_DX: - if (!state->regs || !state->full_regs) { - orc_warn("missing regs for base reg DX at ip %pB\n", - (void *)state->ip); + if (!get_reg(state, offsetof(struct pt_regs, dx), &sp)) { + orc_warn_current("missing DX value at %pB\n", + (void *)state->ip); goto err; } - sp = state->regs->dx; break; default: - orc_warn("unknown SP base reg %d for ip %pB\n", + orc_warn("unknown SP base reg %d at %pB\n", orc->sp_reg, (void *)state->ip); goto err; } @@ -489,59 +526,79 @@ bool unwind_next_frame(struct unwind_state *state) if (indirect) { if (!deref_stack_reg(state, sp, &sp)) goto err; + + if (orc->sp_reg == ORC_REG_SP_INDIRECT) + sp += orc->sp_offset; } /* Find IP, SP and possibly regs: */ switch (orc->type) { - case ORC_TYPE_CALL: + case 
UNWIND_HINT_TYPE_CALL: ip_p = sp - sizeof(long); if (!deref_stack_reg(state, ip_p, &state->ip)) goto err; - state->ip = ftrace_graph_ret_addr(state->task, &state->graph_idx, - state->ip, (void *)ip_p); - + state->ip = unwind_recover_ret_addr(state, state->ip, + (unsigned long *)ip_p); state->sp = sp; state->regs = NULL; + state->prev_regs = NULL; state->signal = false; break; - case ORC_TYPE_REGS: + case UNWIND_HINT_TYPE_REGS: if (!deref_stack_regs(state, sp, &state->ip, &state->sp)) { - orc_warn("can't dereference registers at %p for ip %pB\n", - (void *)sp, (void *)orig_ip); + orc_warn_current("can't access registers at %pB\n", + (void *)orig_ip); goto err; } - + /* + * There is a small chance to interrupt at the entry of + * arch_rethook_trampoline() where the ORC info doesn't exist. + * That point is right after the RET to arch_rethook_trampoline() + * which was modified return address. + * At that point, the @addr_p of the unwind_recover_rethook() + * (this has to point the address of the stack entry storing + * the modified return address) must be "SP - (a stack entry)" + * because SP is incremented by the RET. + */ + state->ip = unwind_recover_rethook(state, state->ip, + (unsigned long *)(state->sp - sizeof(long))); state->regs = (struct pt_regs *)sp; + state->prev_regs = NULL; state->full_regs = true; state->signal = true; break; - case ORC_TYPE_REGS_IRET: + case UNWIND_HINT_TYPE_REGS_PARTIAL: if (!deref_stack_iret_regs(state, sp, &state->ip, &state->sp)) { - orc_warn("can't dereference iret registers at %p for ip %pB\n", - (void *)sp, (void *)orig_ip); + orc_warn_current("can't access iret registers at %pB\n", + (void *)orig_ip); goto err; } + /* See UNWIND_HINT_TYPE_REGS case comment. 
*/ + state->ip = unwind_recover_rethook(state, state->ip, + (unsigned long *)(state->sp - sizeof(long))); + if (state->full_regs) + state->prev_regs = state->regs; state->regs = (void *)sp - IRET_FRAME_OFFSET; state->full_regs = false; state->signal = true; break; default: - orc_warn("unknown .orc_unwind entry type %d for ip %pB\n", + orc_warn("unknown .orc_unwind entry type %d at %pB\n", orc->type, (void *)orig_ip); - break; + goto err; } /* Find BP: */ switch (orc->bp_reg) { case ORC_REG_UNDEFINED: - if (state->regs && state->full_regs) - state->bp = state->regs->bp; + if (get_reg(state, offsetof(struct pt_regs, bp), &tmp)) + state->bp = tmp; break; case ORC_REG_PREV_SP: @@ -564,8 +621,8 @@ bool unwind_next_frame(struct unwind_state *state) if (state->stack_info.type == prev_type && on_stack(&state->stack_info, (void *)state->sp, sizeof(long)) && state->sp <= prev_sp) { - orc_warn("stack going in the wrong direction? ip=%pB\n", - (void *)orig_ip); + orc_warn_current("stack going in the wrong direction? at %pB\n", + (void *)orig_ip); goto err; } @@ -588,17 +645,20 @@ void __unwind_start(struct unwind_state *state, struct task_struct *task, memset(state, 0, sizeof(*state)); state->task = task; + if (!orc_init) + goto err; + /* * Refuse to unwind the stack of a task while it's executing on another * CPU. This check is racy, but that's ok: the unwinder has other * checks to prevent it from going off the rails. 
*/ if (task_on_another_cpu(task)) - goto done; + goto err; if (regs) { if (user_mode(regs)) - goto done; + goto the_end; state->ip = regs->ip; state->sp = regs->sp; @@ -617,9 +677,10 @@ void __unwind_start(struct unwind_state *state, struct task_struct *task, } else { struct inactive_task_frame *frame = (void *)task->thread.sp; - state->sp = task->thread.sp; + state->sp = task->thread.sp + sizeof(*frame); state->bp = READ_ONCE_NOCHECK(frame->bp); state->ip = READ_ONCE_NOCHECK(frame->ret_addr); + state->signal = (void *)state->ip == ret_from_fork; } if (get_stack_info((unsigned long *)state->sp, state->task, @@ -631,6 +692,7 @@ void __unwind_start(struct unwind_state *state, struct task_struct *task, * generate some kind of backtrace if this happens. */ void *next_page = (void *)PAGE_ALIGN((unsigned long)state->sp); + state->error = true; if (get_stack_info(next_page, state->task, &state->stack_info, &state->stack_mask)) return; @@ -656,8 +718,9 @@ void __unwind_start(struct unwind_state *state, struct task_struct *task, return; -done: +err: + state->error = true; +the_end: state->stack_info.type = STACK_TYPE_UNKNOWN; - return; } EXPORT_SYMBOL_GPL(__unwind_start); diff --git a/arch/x86/kernel/uprobes.c b/arch/x86/kernel/uprobes.c index 15e5aad8ac2c..b63cf8f7745e 100644 --- a/arch/x86/kernel/uprobes.c +++ b/arch/x86/kernel/uprobes.c @@ -255,12 +255,13 @@ static volatile u32 good_2byte_insns[256 / 32] = { static bool is_prefix_bad(struct insn *insn) { + insn_byte_t p; int i; - for (i = 0; i < insn->prefixes.nbytes; i++) { + for_each_insn_prefix(insn, i, p) { insn_attr_t attr; - attr = inat_get_opcode_attribute(insn->prefixes.bytes[i]); + attr = inat_get_opcode_attribute(p); switch (attr) { case INAT_MAKE_PREFIX(INAT_PFX_ES): case INAT_MAKE_PREFIX(INAT_PFX_CS): @@ -275,12 +276,12 @@ static bool is_prefix_bad(struct insn *insn) static int uprobe_init_insn(struct arch_uprobe *auprobe, struct insn *insn, bool x86_64) { + enum insn_mode m = x86_64 ? 
INSN_MODE_64 : INSN_MODE_32; u32 volatile *good_insns; + int ret; - insn_init(insn, auprobe->insn, sizeof(auprobe->insn), x86_64); - /* has the side-effect of processing the entire instruction */ - insn_get_length(insn); - if (!insn_complete(insn)) + ret = insn_decode(insn, auprobe->insn, sizeof(auprobe->insn), m); + if (ret < 0) return -ENOEXEC; if (is_prefix_bad(insn)) @@ -715,6 +716,7 @@ static const struct uprobe_xol_ops push_xol_ops = { static int branch_setup_xol_ops(struct arch_uprobe *auprobe, struct insn *insn) { u8 opc1 = OPCODE1(insn); + insn_byte_t p; int i; switch (opc1) { @@ -735,7 +737,7 @@ static int branch_setup_xol_ops(struct arch_uprobe *auprobe, struct insn *insn) * OPCODE1() of the "short" jmp which checks the same condition. */ opc1 = OPCODE2(insn) - 0x10; - /* fall through */ + fallthrough; default: if (!is_cond_jmp_opcode(opc1)) return -ENOSYS; @@ -746,8 +748,8 @@ static int branch_setup_xol_ops(struct arch_uprobe *auprobe, struct insn *insn) * Intel and AMD behavior differ in 64-bit mode: Intel ignores 66 prefix. * No one uses these insns, reject any branch insns with such prefix. 
*/ - for (i = 0; i < insn->prefixes.nbytes; i++) { - if (insn->prefixes.bytes[i] == 0x66) + for_each_insn_prefix(insn, i, p) { + if (p == 0x66) return -ENOTSUPP; } @@ -892,7 +894,7 @@ int arch_uprobe_analyze_insn(struct arch_uprobe *auprobe, struct mm_struct *mm, fix_ip_or_call = 0; break; } - /* fall through */ + fallthrough; default: riprel_analyze(auprobe, &insn); } @@ -1015,6 +1017,8 @@ int arch_uprobe_exception_notify(struct notifier_block *self, unsigned long val, if (uprobe_post_sstep_notifier(regs)) ret = NOTIFY_STOP; + break; + default: break; } diff --git a/arch/x86/kernel/verify_cpu.S b/arch/x86/kernel/verify_cpu.S index 641f0fe1e5b4..1258a5872d12 100644 --- a/arch/x86/kernel/verify_cpu.S +++ b/arch/x86/kernel/verify_cpu.S @@ -132,9 +132,9 @@ SYM_FUNC_START_LOCAL(verify_cpu) .Lverify_cpu_no_longmode: popf # Restore caller passed flags movl $1,%eax - ret + RET .Lverify_cpu_sse_ok: popf # Restore caller passed flags xorl %eax, %eax - ret + RET SYM_FUNC_END(verify_cpu) diff --git a/arch/x86/kernel/vm86_32.c b/arch/x86/kernel/vm86_32.c index 91d55454e702..e9e803a4d44c 100644 --- a/arch/x86/kernel/vm86_32.c +++ b/arch/x86/kernel/vm86_32.c @@ -98,7 +98,6 @@ void save_v86_state(struct kernel_vm86_regs *regs, int retval) struct task_struct *tsk = current; struct vm86plus_struct __user *user; struct vm86 *vm86 = current->thread.vm86; - long err = 0; /* * This gets called from entry.S with interrupts disabled, but @@ -107,45 +106,41 @@ void save_v86_state(struct kernel_vm86_regs *regs, int retval) */ local_irq_enable(); - if (!vm86 || !vm86->user_vm86) { - pr_alert("no user_vm86: BAD\n"); - do_exit(SIGSEGV); - } + BUG_ON(!vm86); + set_flags(regs->pt.flags, VEFLAGS, X86_EFLAGS_VIF | vm86->veflags_mask); user = vm86->user_vm86; - if (!access_ok(user, vm86->vm86plus.is_vm86pus ? + if (!user_access_begin(user, vm86->vm86plus.is_vm86pus ? 
sizeof(struct vm86plus_struct) : - sizeof(struct vm86_struct))) { - pr_alert("could not access userspace vm86 info\n"); - do_exit(SIGSEGV); - } + sizeof(struct vm86_struct))) + goto Efault; + + unsafe_put_user(regs->pt.bx, &user->regs.ebx, Efault_end); + unsafe_put_user(regs->pt.cx, &user->regs.ecx, Efault_end); + unsafe_put_user(regs->pt.dx, &user->regs.edx, Efault_end); + unsafe_put_user(regs->pt.si, &user->regs.esi, Efault_end); + unsafe_put_user(regs->pt.di, &user->regs.edi, Efault_end); + unsafe_put_user(regs->pt.bp, &user->regs.ebp, Efault_end); + unsafe_put_user(regs->pt.ax, &user->regs.eax, Efault_end); + unsafe_put_user(regs->pt.ip, &user->regs.eip, Efault_end); + unsafe_put_user(regs->pt.cs, &user->regs.cs, Efault_end); + unsafe_put_user(regs->pt.flags, &user->regs.eflags, Efault_end); + unsafe_put_user(regs->pt.sp, &user->regs.esp, Efault_end); + unsafe_put_user(regs->pt.ss, &user->regs.ss, Efault_end); + unsafe_put_user(regs->es, &user->regs.es, Efault_end); + unsafe_put_user(regs->ds, &user->regs.ds, Efault_end); + unsafe_put_user(regs->fs, &user->regs.fs, Efault_end); + unsafe_put_user(regs->gs, &user->regs.gs, Efault_end); - put_user_try { - put_user_ex(regs->pt.bx, &user->regs.ebx); - put_user_ex(regs->pt.cx, &user->regs.ecx); - put_user_ex(regs->pt.dx, &user->regs.edx); - put_user_ex(regs->pt.si, &user->regs.esi); - put_user_ex(regs->pt.di, &user->regs.edi); - put_user_ex(regs->pt.bp, &user->regs.ebp); - put_user_ex(regs->pt.ax, &user->regs.eax); - put_user_ex(regs->pt.ip, &user->regs.eip); - put_user_ex(regs->pt.cs, &user->regs.cs); - put_user_ex(regs->pt.flags, &user->regs.eflags); - put_user_ex(regs->pt.sp, &user->regs.esp); - put_user_ex(regs->pt.ss, &user->regs.ss); - put_user_ex(regs->es, &user->regs.es); - put_user_ex(regs->ds, &user->regs.ds); - put_user_ex(regs->fs, &user->regs.fs); - put_user_ex(regs->gs, &user->regs.gs); - - put_user_ex(vm86->screen_bitmap, &user->screen_bitmap); - } put_user_catch(err); - if (err) { - pr_alert("could 
not access userspace vm86 info\n"); - do_exit(SIGSEGV); - } + /* + * Don't write screen_bitmap in case some user had a value there + * and expected it to remain unchanged. + */ + + user_access_end(); +exit_vm86: preempt_disable(); tsk->thread.sp0 = vm86->saved_sp0; tsk->thread.sysenter_cs = __KERNEL_CS; @@ -156,54 +151,19 @@ void save_v86_state(struct kernel_vm86_regs *regs, int retval) memcpy(&regs->pt, &vm86->regs32, sizeof(struct pt_regs)); - lazy_load_gs(vm86->regs32.gs); + loadsegment(gs, vm86->regs32.gs); regs->pt.ax = retval; -} - -static void mark_screen_rdonly(struct mm_struct *mm) -{ - struct vm_area_struct *vma; - spinlock_t *ptl; - pgd_t *pgd; - p4d_t *p4d; - pud_t *pud; - pmd_t *pmd; - pte_t *pte; - int i; - - down_write(&mm->mmap_sem); - pgd = pgd_offset(mm, 0xA0000); - if (pgd_none_or_clear_bad(pgd)) - goto out; - p4d = p4d_offset(pgd, 0xA0000); - if (p4d_none_or_clear_bad(p4d)) - goto out; - pud = pud_offset(p4d, 0xA0000); - if (pud_none_or_clear_bad(pud)) - goto out; - pmd = pmd_offset(pud, 0xA0000); + return; - if (pmd_trans_huge(*pmd)) { - vma = find_vma(mm, 0xA0000); - split_huge_pmd(vma, pmd, 0xA0000); - } - if (pmd_none_or_clear_bad(pmd)) - goto out; - pte = pte_offset_map_lock(mm, pmd, 0xA0000, &ptl); - for (i = 0; i < 32; i++) { - if (pte_present(*pte)) - set_pte(pte, pte_wrprotect(*pte)); - pte++; - } - pte_unmap_unlock(pte, ptl); -out: - up_write(&mm->mmap_sem); - flush_tlb_mm_range(mm, 0xA0000, 0xA0000 + 32*PAGE_SIZE, PAGE_SHIFT, false); +Efault_end: + user_access_end(); +Efault: + pr_alert("could not access userspace vm86 info\n"); + force_exit_sig(SIGSEGV); + goto exit_vm86; } - - static int do_vm86_irq_handling(int subfunction, int irqnumber); static long do_sys_vm86(struct vm86plus_struct __user *user_vm86, bool plus); @@ -243,6 +203,7 @@ static long do_sys_vm86(struct vm86plus_struct __user *user_vm86, bool plus) struct kernel_vm86_regs vm86regs; struct pt_regs *regs = current_pt_regs(); unsigned long err = 0; + struct vm86_struct v; 
err = security_mmap_addr(0); if (err) { @@ -278,39 +239,40 @@ static long do_sys_vm86(struct vm86plus_struct __user *user_vm86, bool plus) if (vm86->saved_sp0) return -EPERM; - if (!access_ok(user_vm86, plus ? - sizeof(struct vm86_struct) : - sizeof(struct vm86plus_struct))) + if (copy_from_user(&v, user_vm86, + offsetof(struct vm86_struct, int_revectored))) return -EFAULT; + + /* VM86_SCREEN_BITMAP had numerous bugs and appears to have no users. */ + if (v.flags & VM86_SCREEN_BITMAP) { + char comm[TASK_COMM_LEN]; + + pr_info_once("vm86: '%s' uses VM86_SCREEN_BITMAP, which is no longer supported\n", get_task_comm(comm, current)); + return -EINVAL; + } + memset(&vm86regs, 0, sizeof(vm86regs)); - get_user_try { - unsigned short seg; - get_user_ex(vm86regs.pt.bx, &user_vm86->regs.ebx); - get_user_ex(vm86regs.pt.cx, &user_vm86->regs.ecx); - get_user_ex(vm86regs.pt.dx, &user_vm86->regs.edx); - get_user_ex(vm86regs.pt.si, &user_vm86->regs.esi); - get_user_ex(vm86regs.pt.di, &user_vm86->regs.edi); - get_user_ex(vm86regs.pt.bp, &user_vm86->regs.ebp); - get_user_ex(vm86regs.pt.ax, &user_vm86->regs.eax); - get_user_ex(vm86regs.pt.ip, &user_vm86->regs.eip); - get_user_ex(seg, &user_vm86->regs.cs); - vm86regs.pt.cs = seg; - get_user_ex(vm86regs.pt.flags, &user_vm86->regs.eflags); - get_user_ex(vm86regs.pt.sp, &user_vm86->regs.esp); - get_user_ex(seg, &user_vm86->regs.ss); - vm86regs.pt.ss = seg; - get_user_ex(vm86regs.es, &user_vm86->regs.es); - get_user_ex(vm86regs.ds, &user_vm86->regs.ds); - get_user_ex(vm86regs.fs, &user_vm86->regs.fs); - get_user_ex(vm86regs.gs, &user_vm86->regs.gs); - - get_user_ex(vm86->flags, &user_vm86->flags); - get_user_ex(vm86->screen_bitmap, &user_vm86->screen_bitmap); - get_user_ex(vm86->cpu_type, &user_vm86->cpu_type); - } get_user_catch(err); - if (err) - return err; + + vm86regs.pt.bx = v.regs.ebx; + vm86regs.pt.cx = v.regs.ecx; + vm86regs.pt.dx = v.regs.edx; + vm86regs.pt.si = v.regs.esi; + vm86regs.pt.di = v.regs.edi; + vm86regs.pt.bp = 
v.regs.ebp; + vm86regs.pt.ax = v.regs.eax; + vm86regs.pt.ip = v.regs.eip; + vm86regs.pt.cs = v.regs.cs; + vm86regs.pt.flags = v.regs.eflags; + vm86regs.pt.sp = v.regs.esp; + vm86regs.pt.ss = v.regs.ss; + vm86regs.es = v.regs.es; + vm86regs.ds = v.regs.ds; + vm86regs.fs = v.regs.fs; + vm86regs.gs = v.regs.gs; + + vm86->flags = v.flags; + vm86->cpu_type = v.cpu_type; if (copy_from_user(&vm86->int_revectored, &user_vm86->int_revectored, @@ -363,7 +325,7 @@ static long do_sys_vm86(struct vm86plus_struct __user *user_vm86, bool plus) * Save old state */ vm86->saved_sp0 = tsk->thread.sp0; - lazy_save_gs(vm86->regs32.gs); + savesegment(gs, vm86->regs32.gs); /* make room for real-mode segments */ preempt_disable(); @@ -377,9 +339,6 @@ static long do_sys_vm86(struct vm86plus_struct __user *user_vm86, bool plus) update_task_stack(tsk); preempt_enable(); - if (vm86->flags & VM86_SCREEN_BITMAP) - mark_screen_rdonly(tsk->mm); - memcpy((struct kernel_vm86_regs *)regs, &vm86regs, sizeof(vm86regs)); return regs->ax; } diff --git a/arch/x86/kernel/vmlinux.lds.S b/arch/x86/kernel/vmlinux.lds.S index e3296aa028fe..15f29053cec4 100644 --- a/arch/x86/kernel/vmlinux.lds.S +++ b/arch/x86/kernel/vmlinux.lds.S @@ -21,6 +21,7 @@ #define LOAD_OFFSET __START_KERNEL_map #endif +#define RUNTIME_DISCARD_EXIT #define EMITS_PT_NOTE #define RO_EXCEPTION_TABLE_ALIGN 16 @@ -39,13 +40,13 @@ OUTPUT_FORMAT(CONFIG_OUTPUT_FORMAT) #ifdef CONFIG_X86_32 OUTPUT_ARCH(i386) ENTRY(phys_startup_32) -jiffies = jiffies_64; #else OUTPUT_ARCH(i386:x86-64) ENTRY(phys_startup_64) -jiffies_64 = jiffies; #endif +jiffies = jiffies_64; + #if defined(CONFIG_X86_64) /* * On 64-bit, align RODATA to 2MB so we retain large page mappings for @@ -133,15 +134,14 @@ SECTIONS KPROBES_TEXT ALIGN_ENTRY_TEXT_BEGIN ENTRY_TEXT - IRQENTRY_TEXT ALIGN_ENTRY_TEXT_END SOFTIRQENTRY_TEXT - *(.fixup) + STATIC_CALL_TEXT *(.gnu.warning) #ifdef CONFIG_RETPOLINE __indirect_thunk_start = .; - *(.text.__x86.indirect_thunk) + *(.text.__x86.*) 
__indirect_thunk_end = .; #endif } :text =0xcccc @@ -271,6 +271,36 @@ SECTIONS __parainstructions_end = .; } +#ifdef CONFIG_RETPOLINE + /* + * List of instructions that call/jmp/jcc to retpoline thunks + * __x86_indirect_thunk_*(). These instructions can be patched along + * with alternatives, after which the section can be freed. + */ + . = ALIGN(8); + .retpoline_sites : AT(ADDR(.retpoline_sites) - LOAD_OFFSET) { + __retpoline_sites = .; + *(.retpoline_sites) + __retpoline_sites_end = .; + } + + . = ALIGN(8); + .return_sites : AT(ADDR(.return_sites) - LOAD_OFFSET) { + __return_sites = .; + *(.return_sites) + __return_sites_end = .; + } +#endif + +#ifdef CONFIG_X86_KERNEL_IBT + . = ALIGN(8); + .ibt_endbr_seal : AT(ADDR(.ibt_endbr_seal) - LOAD_OFFSET) { + __ibt_endbr_seal = .; + *(.ibt_endbr_seal) + __ibt_endbr_seal_end = .; + } +#endif + /* * struct alt_inst entries. From the header (alternative.h): * "Alternative instructions for different CPU types or capabilities" @@ -292,18 +322,6 @@ SECTIONS *(.altinstr_replacement) } - /* - * struct iommu_table_entry entries are injected in this section. - * It is an array of IOMMUs which during run time gets sorted depending - * on its dependency order. After rootfs_initcall is complete - * this section can be safely removed. - */ - .iommu_table : AT(ADDR(.iommu_table) - LOAD_OFFSET) { - __iommu_table = .; - *(.iommu_table) - __iommu_table_end = .; - } - . = ALIGN(8); .apicdrivers : AT(ADDR(.apicdrivers) - LOAD_OFFSET) { __apicdrivers = .; @@ -313,8 +331,8 @@ SECTIONS . = ALIGN(8); /* - * .exit.text is discard at runtime, not link time, to deal with - * references from .altinstructions and .eh_frame + * .exit.text is discarded at runtime, not link time, to deal with + * references from .altinstructions */ .exit.text : AT(ADDR(.exit.text) - LOAD_OFFSET) { EXIT_TEXT @@ -358,6 +376,7 @@ SECTIONS .bss : AT(ADDR(.bss) - LOAD_OFFSET) { __bss_start = .; *(.bss..page_aligned) + . = ALIGN(PAGE_SIZE); *(BSS_MAIN) BSS_DECRYPTED . 
= ALIGN(PAGE_SIZE); @@ -376,7 +395,7 @@ SECTIONS .brk : AT(ADDR(.brk) - LOAD_OFFSET) { __brk_base = .; . += 64 * 1024; /* 64k alignment slop space */ - *(.brk_reservation) /* areas brk users have reserved */ + *(.bss..brk) /* areas brk users have reserved */ __brk_limit = .; } @@ -410,21 +429,55 @@ SECTIONS STABS_DEBUG DWARF_DEBUG + ELF_DETAILS DISCARDS - /DISCARD/ : { - *(.eh_frame) + + /* + * Make sure that the .got.plt is either completely empty or it + * contains only the lazy dispatch entries. + */ + .got.plt (INFO) : { *(.got.plt) } + ASSERT(SIZEOF(.got.plt) == 0 || +#ifdef CONFIG_X86_64 + SIZEOF(.got.plt) == 0x18, +#else + SIZEOF(.got.plt) == 0xc, +#endif + "Unexpected GOT/PLT entries detected!") + + /* + * Sections that should stay zero sized, which is safer to + * explicitly check instead of blindly discarding. + */ + .got : { + *(.got) *(.igot.*) } -} + ASSERT(SIZEOF(.got) == 0, "Unexpected GOT entries detected!") + .plt : { + *(.plt) *(.plt.*) *(.iplt) + } + ASSERT(SIZEOF(.plt) == 0, "Unexpected run-time procedure linkages detected!") + + .rel.dyn : { + *(.rel.*) *(.rel_*) + } + ASSERT(SIZEOF(.rel.dyn) == 0, "Unexpected run-time relocations (.rel) detected!") + + .rela.dyn : { + *(.rela.*) *(.rela_*) + } + ASSERT(SIZEOF(.rela.dyn) == 0, "Unexpected run-time relocations (.rela) detected!") +} -#ifdef CONFIG_X86_32 /* * The ASSERT() sink to . is intentional, for binutils 2.14 compatibility: */ . = ASSERT((_end - LOAD_OFFSET <= KERNEL_IMAGE_SIZE), "kernel image bigger than KERNEL_IMAGE_SIZE"); -#else + +#ifdef CONFIG_X86_64 /* * Per-cpu symbols which need to be offset from __per_cpu_load * for the boot processor. @@ -434,18 +487,12 @@ INIT_PER_CPU(gdt_page); INIT_PER_CPU(fixed_percpu_data); INIT_PER_CPU(irq_stack_backing_store); -/* - * Build-time check on the image size: - */ -. = ASSERT((_end - _text <= KERNEL_IMAGE_SIZE), - "kernel image bigger than KERNEL_IMAGE_SIZE"); - #ifdef CONFIG_SMP . 
= ASSERT((fixed_percpu_data == 0), "fixed_percpu_data is not at start of per-cpu area"); #endif -#endif /* CONFIG_X86_32 */ +#endif /* CONFIG_X86_64 */ #ifdef CONFIG_KEXEC_CORE #include <asm/kexec.h> diff --git a/arch/x86/kernel/x86_init.c b/arch/x86/kernel/x86_init.c index 85f1a90c55cd..57353519bc11 100644 --- a/arch/x86/kernel/x86_init.c +++ b/arch/x86/kernel/x86_init.c @@ -24,6 +24,7 @@ #include <asm/tsc.h> #include <asm/iommu.h> #include <asm/mach_traps.h> +#include <asm/irqdomain.h> void x86_init_noop(void) { } void __init x86_init_uint_noop(unsigned int unused) { } @@ -67,11 +68,7 @@ struct x86_init_ops x86_init __initdata = { }, .mpparse = { - .mpc_record = x86_init_uint_noop, .setup_ioapic_ids = x86_init_noop, - .mpc_apic_id = default_mpc_apic_id, - .smp_read_mpc_oem = default_smp_read_mpc_oem, - .mpc_oem_bus_info = default_mpc_oem_bus_info, .find_smp_config = default_find_smp_config, .get_smp_config = default_get_smp_config, }, @@ -79,9 +76,9 @@ struct x86_init_ops x86_init __initdata = { .irqs = { .pre_vector_init = init_ISA_irqs, .intr_init = native_init_IRQ, - .trap_init = x86_init_noop, .intr_mode_select = apic_intr_mode_select, - .intr_mode_init = apic_intr_mode_init + .intr_mode_init = apic_intr_mode_init, + .create_pci_msi_domain = native_create_pci_msi_domain, }, .oem = { @@ -113,6 +110,7 @@ struct x86_init_ops x86_init __initdata = { .init_platform = x86_init_noop, .guest_late_init = x86_init_noop, .x2apic_available = bool_x86_init_noop, + .msi_ext_dest_id = bool_x86_init_noop, .init_mem_mapping = x86_init_noop, .init_after_bootmem = x86_init_noop, }, @@ -131,51 +129,33 @@ struct x86_cpuinit_ops x86_cpuinit = { static void default_nmi_init(void) { }; +static void enc_status_change_prepare_noop(unsigned long vaddr, int npages, bool enc) { } +static bool enc_status_change_finish_noop(unsigned long vaddr, int npages, bool enc) { return false; } +static bool enc_tlb_flush_required_noop(bool enc) { return false; } +static bool 
enc_cache_flush_required_noop(void) { return false; } + struct x86_platform_ops x86_platform __ro_after_init = { .calibrate_cpu = native_calibrate_cpu_early, .calibrate_tsc = native_calibrate_tsc, .get_wallclock = mach_get_cmos_time, - .set_wallclock = mach_set_rtc_mmss, + .set_wallclock = mach_set_cmos_time, .iommu_shutdown = iommu_shutdown_noop, .is_untracked_pat_range = is_ISA_range, .nmi_init = default_nmi_init, .get_nmi_reason = default_get_nmi_reason, - .save_sched_clock_state = tsc_save_sched_clock_state, - .restore_sched_clock_state = tsc_restore_sched_clock_state, + .save_sched_clock_state = tsc_save_sched_clock_state, + .restore_sched_clock_state = tsc_restore_sched_clock_state, .hyper.pin_vcpu = x86_op_int_noop, -}; -EXPORT_SYMBOL_GPL(x86_platform); - -#if defined(CONFIG_PCI_MSI) -struct x86_msi_ops x86_msi __ro_after_init = { - .setup_msi_irqs = native_setup_msi_irqs, - .teardown_msi_irq = native_teardown_msi_irq, - .teardown_msi_irqs = default_teardown_msi_irqs, - .restore_msi_irqs = default_restore_msi_irqs, + .guest = { + .enc_status_change_prepare = enc_status_change_prepare_noop, + .enc_status_change_finish = enc_status_change_finish_noop, + .enc_tlb_flush_required = enc_tlb_flush_required_noop, + .enc_cache_flush_required = enc_cache_flush_required_noop, + }, }; -/* MSI arch specific hooks */ -int arch_setup_msi_irqs(struct pci_dev *dev, int nvec, int type) -{ - return x86_msi.setup_msi_irqs(dev, nvec, type); -} - -void arch_teardown_msi_irqs(struct pci_dev *dev) -{ - x86_msi.teardown_msi_irqs(dev); -} - -void arch_teardown_msi_irq(unsigned int irq) -{ - x86_msi.teardown_msi_irq(irq); -} - -void arch_restore_msi_irqs(struct pci_dev *dev) -{ - x86_msi.restore_msi_irqs(dev); -} -#endif +EXPORT_SYMBOL_GPL(x86_platform); struct x86_apic_ops x86_apic_ops __ro_after_init = { .io_apic_read = native_io_apic_read, |