| author | Dmitry Torokhov <dmitry.torokhov@gmail.com> | 2018-04-04 16:11:49 -0700 |
|---|---|---|
| committer | Dmitry Torokhov <dmitry.torokhov@gmail.com> | 2018-04-04 16:11:49 -0700 |
| commit | 664b0bae0b87f69bc9deb098f5e0158b9cf18e04 (patch) | |
| tree | d5841492b396ff483723b9339c7c11dc33b67688 /arch/x86/kernel | |
| parent | Input: ALPS - fix TrackStick detection on Thinkpad L570 and Latitude 7370 (diff) | |
| parent | Input: i8042 - enable MUX on Sony VAIO VGN-CS series to fix touchpad (diff) | |
| download | linux-dev-664b0bae0b87f69bc9deb098f5e0158b9cf18e04.tar.xz, linux-dev-664b0bae0b87f69bc9deb098f5e0158b9cf18e04.zip | |
Merge branch 'next' into for-linus
Prepare input updates for 4.17 merge window.
Diffstat (limited to 'arch/x86/kernel')
129 files changed, 4931 insertions, 2622 deletions
diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile index 5f70044340ff..29786c87e864 100644 --- a/arch/x86/kernel/Makefile +++ b/arch/x86/kernel/Makefile @@ -25,14 +25,17 @@ endif KASAN_SANITIZE_head$(BITS).o := n KASAN_SANITIZE_dumpstack.o := n KASAN_SANITIZE_dumpstack_$(BITS).o := n -KASAN_SANITIZE_stacktrace.o := n +KASAN_SANITIZE_stacktrace.o := n +KASAN_SANITIZE_paravirt.o := n -OBJECT_FILES_NON_STANDARD_head_$(BITS).o := y OBJECT_FILES_NON_STANDARD_relocate_kernel_$(BITS).o := y -OBJECT_FILES_NON_STANDARD_ftrace_$(BITS).o := y OBJECT_FILES_NON_STANDARD_test_nx.o := y OBJECT_FILES_NON_STANDARD_paravirt_patch_$(BITS).o := y +ifdef CONFIG_FRAME_POINTER +OBJECT_FILES_NON_STANDARD_ftrace_$(BITS).o := y +endif + # If instrumentation of this dir is enabled, boot hangs during first second. # Probably could be more selective here, but note that files related to irqs, # boot, dumpstack/stacktrace, etc are either non-interesting or can lead to @@ -112,6 +115,8 @@ obj-$(CONFIG_PARAVIRT_SPINLOCKS)+= paravirt-spinlocks.o obj-$(CONFIG_PARAVIRT_CLOCK) += pvclock.o obj-$(CONFIG_X86_PMEM_LEGACY_DEVICE) += pmem.o +obj-$(CONFIG_JAILHOUSE_GUEST) += jailhouse.o + obj-$(CONFIG_EISA) += eisa.o obj-$(CONFIG_PCSPKR_PLATFORM) += pcspeaker.o @@ -127,10 +132,11 @@ obj-$(CONFIG_EFI) += sysfb_efi.o obj-$(CONFIG_PERF_EVENTS) += perf_regs.o obj-$(CONFIG_TRACING) += tracepoint.o obj-$(CONFIG_SCHED_MC_PRIO) += itmt.o +obj-$(CONFIG_X86_INTEL_UMIP) += umip.o -obj-$(CONFIG_ORC_UNWINDER) += unwind_orc.o -obj-$(CONFIG_FRAME_POINTER_UNWINDER) += unwind_frame.o -obj-$(CONFIG_GUESS_UNWINDER) += unwind_guess.o +obj-$(CONFIG_UNWINDER_ORC) += unwind_orc.o +obj-$(CONFIG_UNWINDER_FRAME_POINTER) += unwind_frame.o +obj-$(CONFIG_UNWINDER_GUESS) += unwind_guess.o ### # 64 bit specific files diff --git a/arch/x86/kernel/acpi/apei.c b/arch/x86/kernel/acpi/apei.c index ea3046e0b0cf..bb8d300fecbd 100644 --- a/arch/x86/kernel/acpi/apei.c +++ b/arch/x86/kernel/acpi/apei.c @@ -52,8 +52,3 @@ void arch_apei_report_mem_error(int sev, struct cper_sec_mem_err *mem_err) apei_mce_report_mem_error(sev, mem_err); #endif } - -void arch_apei_flush_tlb_one(unsigned long addr) -{ - __flush_tlb_one(addr); -} diff --git a/arch/x86/kernel/acpi/boot.c b/arch/x86/kernel/acpi/boot.c index 079535e53e2a..2aa92094b59d 100644 --- a/arch/x86/kernel/acpi/boot.c +++ b/arch/x86/kernel/acpi/boot.c @@ -36,6 +36,7 @@ #include <linux/ioport.h> #include <linux/pci.h> #include <linux/efi-bgrt.h> +#include <linux/serial_core.h> #include <asm/e820/api.h> #include <asm/irqdomain.h> @@ -68,8 +69,9 @@ int acpi_ioapic; int acpi_strict; int acpi_disable_cmcff; +/* ACPI SCI override configuration */ u8 acpi_sci_flags __initdata; -int acpi_sci_override_gsi __initdata; +u32 acpi_sci_override_gsi __initdata = INVALID_ACPI_IRQ; int acpi_skip_timer_override __initdata; int acpi_use_timer_override __initdata; int acpi_fix_pin2_polarity __initdata; @@ -112,8 +114,6 @@ static u32 isa_irq_to_gsi[NR_IRQS_LEGACY] __read_mostly = { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 }; -#define ACPI_INVALID_GSI INT_MIN - /* * This is just a simple wrapper around early_memremap(), * with sanity checks for phys == 0 and size == 0. 
@@ -342,13 +342,12 @@ acpi_parse_lapic_nmi(struct acpi_subtable_header * header, const unsigned long e #ifdef CONFIG_X86_IO_APIC #define MP_ISA_BUS 0 +static int __init mp_register_ioapic_irq(u8 bus_irq, u8 polarity, + u8 trigger, u32 gsi); + static void __init mp_override_legacy_irq(u8 bus_irq, u8 polarity, u8 trigger, u32 gsi) { - int ioapic; - int pin; - struct mpc_intsrc mp_irq; - /* * Check bus_irq boundary. */ @@ -358,14 +357,6 @@ static void __init mp_override_legacy_irq(u8 bus_irq, u8 polarity, u8 trigger, } /* - * Convert 'gsi' to 'ioapic.pin'. - */ - ioapic = mp_find_ioapic(gsi); - if (ioapic < 0) - return; - pin = mp_find_ioapic_pin(ioapic, gsi); - - /* * TBD: This check is for faulty timer entries, where the override * erroneously sets the trigger to level, resulting in a HUGE * increase of timer interrupts! @@ -373,23 +364,15 @@ static void __init mp_override_legacy_irq(u8 bus_irq, u8 polarity, u8 trigger, if ((bus_irq == 0) && (trigger == 3)) trigger = 1; - mp_irq.type = MP_INTSRC; - mp_irq.irqtype = mp_INT; - mp_irq.irqflag = (trigger << 2) | polarity; - mp_irq.srcbus = MP_ISA_BUS; - mp_irq.srcbusirq = bus_irq; /* IRQ */ - mp_irq.dstapic = mpc_ioapic_id(ioapic); /* APIC ID */ - mp_irq.dstirq = pin; /* INTIN# */ - - mp_save_irq(&mp_irq); - + if (mp_register_ioapic_irq(bus_irq, polarity, trigger, gsi) < 0) + return; /* * Reset default identity mapping if gsi is also an legacy IRQ, * otherwise there will be more than one entry with the same GSI * and acpi_isa_irq_to_gsi() may give wrong result. */ if (gsi < nr_legacy_irqs() && isa_irq_to_gsi[gsi] == gsi) - isa_irq_to_gsi[gsi] = ACPI_INVALID_GSI; + isa_irq_to_gsi[gsi] = INVALID_ACPI_IRQ; isa_irq_to_gsi[bus_irq] = gsi; } @@ -429,6 +412,34 @@ static int mp_config_acpi_gsi(struct device *dev, u32 gsi, int trigger, return 0; } +static int __init mp_register_ioapic_irq(u8 bus_irq, u8 polarity, + u8 trigger, u32 gsi) +{ + struct mpc_intsrc mp_irq; + int ioapic, pin; + + /* Convert 'gsi' to 'ioapic.pin'(INTIN#) */ + ioapic = mp_find_ioapic(gsi); + if (ioapic < 0) { + pr_warn("Failed to find ioapic for gsi : %u\n", gsi); + return ioapic; + } + + pin = mp_find_ioapic_pin(ioapic, gsi); + + mp_irq.type = MP_INTSRC; + mp_irq.irqtype = mp_INT; + mp_irq.irqflag = (trigger << 2) | polarity; + mp_irq.srcbus = MP_ISA_BUS; + mp_irq.srcbusirq = bus_irq; + mp_irq.dstapic = mpc_ioapic_id(ioapic); + mp_irq.dstirq = pin; + + mp_save_irq(&mp_irq); + + return 0; +} + static int __init acpi_parse_ioapic(struct acpi_subtable_header * header, const unsigned long end) { @@ -473,7 +484,11 @@ static void __init acpi_sci_ioapic_setup(u8 bus_irq, u16 polarity, u16 trigger, if (acpi_sci_flags & ACPI_MADT_POLARITY_MASK) polarity = acpi_sci_flags & ACPI_MADT_POLARITY_MASK; - mp_override_legacy_irq(bus_irq, polarity, trigger, gsi); + if (bus_irq < NR_IRQS_LEGACY) + mp_override_legacy_irq(bus_irq, polarity, trigger, gsi); + else + mp_register_ioapic_irq(bus_irq, polarity, trigger, gsi); + acpi_penalize_sci_irq(bus_irq, trigger, polarity); /* @@ -605,24 +620,24 @@ int acpi_gsi_to_irq(u32 gsi, unsigned int *irqp) } rc = acpi_get_override_irq(gsi, &trigger, &polarity); - if (rc == 0) { - trigger = trigger ? ACPI_LEVEL_SENSITIVE : ACPI_EDGE_SENSITIVE; - polarity = polarity ? ACPI_ACTIVE_LOW : ACPI_ACTIVE_HIGH; - irq = acpi_register_gsi(NULL, gsi, trigger, polarity); - if (irq >= 0) { - *irqp = irq; - return 0; - } - } + if (rc) + return rc; - return -1; + trigger = trigger ? ACPI_LEVEL_SENSITIVE : ACPI_EDGE_SENSITIVE; + polarity = polarity ? 
ACPI_ACTIVE_LOW : ACPI_ACTIVE_HIGH; + irq = acpi_register_gsi(NULL, gsi, trigger, polarity); + if (irq < 0) + return irq; + + *irqp = irq; + return 0; } EXPORT_SYMBOL_GPL(acpi_gsi_to_irq); int acpi_isa_irq_to_gsi(unsigned isa_irq, u32 *gsi) { if (isa_irq < nr_legacy_irqs() && - isa_irq_to_gsi[isa_irq] != ACPI_INVALID_GSI) { + isa_irq_to_gsi[isa_irq] != INVALID_ACPI_IRQ) { *gsi = isa_irq_to_gsi[isa_irq]; return 0; } @@ -661,8 +676,7 @@ static int acpi_register_gsi_ioapic(struct device *dev, u32 gsi, mutex_lock(&acpi_ioapic_lock); irq = mp_map_gsi_to_irq(gsi, IOAPIC_MAP_ALLOC, &info); /* Don't set up the ACPI SCI because it's already set up */ - if (irq >= 0 && enable_update_mptable && - acpi_gbl_FADT.sci_interrupt != gsi) + if (irq >= 0 && enable_update_mptable && gsi != acpi_gbl_FADT.sci_interrupt) mp_config_acpi_gsi(dev, gsi, trigger, polarity); mutex_unlock(&acpi_ioapic_lock); #endif @@ -961,6 +975,11 @@ static int __init acpi_parse_fadt(struct acpi_table_header *table) x86_platform.legacy.rtc = 0; } + if (acpi_gbl_FADT.boot_flags & ACPI_FADT_NO_VGA) { + pr_debug("ACPI: probing for VGA not safe\n"); + x86_platform.legacy.no_vga = 1; + } + #ifdef CONFIG_X86_PM_TIMER /* detect the location of the ACPI PM Timer */ if (acpi_gbl_FADT.header.revision >= FADT2_REVISION_ID) { @@ -1191,8 +1210,9 @@ static int __init acpi_parse_madt_ioapic_entries(void) /* * If BIOS did not supply an INT_SRC_OVR for the SCI * pretend we got one so we can set the SCI flags. + * But ignore setting up SCI on hardware reduced platforms. */ - if (!acpi_sci_override_gsi) + if (acpi_sci_override_gsi == INVALID_ACPI_IRQ && !acpi_gbl_reduced_hardware) acpi_sci_ioapic_setup(acpi_gbl_FADT.sci_interrupt, 0, 0, acpi_gbl_FADT.sci_interrupt); @@ -1606,6 +1626,8 @@ int __init acpi_boot_init(void) if (!acpi_noirq) x86_init.pci.init = pci_acpi_init; + /* Do not enable ACPI SPCR console by default */ + acpi_parse_spcr(earlycon_acpi_spcr_enable, false); return 0; } diff --git a/arch/x86/kernel/acpi/sleep.c b/arch/x86/kernel/acpi/sleep.c index 7188aea91549..f1915b744052 100644 --- a/arch/x86/kernel/acpi/sleep.c +++ b/arch/x86/kernel/acpi/sleep.c @@ -138,6 +138,8 @@ static int __init acpi_sleep_setup(char *str) acpi_nvs_nosave_s3(); if (strncmp(str, "old_ordering", 12) == 0) acpi_old_suspend_ordering(); + if (strncmp(str, "nobl", 4) == 0) + acpi_sleep_no_blacklist(); str = strchr(str, ','); if (str != NULL) str += strspn(str, ", \t"); diff --git a/arch/x86/kernel/alternative.c b/arch/x86/kernel/alternative.c index 3344d3382e91..a481763a3776 100644 --- a/arch/x86/kernel/alternative.c +++ b/arch/x86/kernel/alternative.c @@ -46,17 +46,6 @@ static int __init setup_noreplace_smp(char *str) } __setup("noreplace-smp", setup_noreplace_smp); -#ifdef CONFIG_PARAVIRT -static int __initdata_or_module noreplace_paravirt = 0; - -static int __init setup_noreplace_paravirt(char *str) -{ - noreplace_paravirt = 1; - return 1; -} -__setup("noreplace-paravirt", setup_noreplace_paravirt); -#endif - #define DPRINTK(fmt, args...) 
\ do { \ if (debug_alternative) \ @@ -298,7 +287,7 @@ recompute_jump(struct alt_instr *a, u8 *orig_insn, u8 *repl_insn, u8 *insnbuf) tgt_rip = next_rip + o_dspl; n_dspl = tgt_rip - orig_insn; - DPRINTK("target RIP: %p, new_displ: 0x%x", tgt_rip, n_dspl); + DPRINTK("target RIP: %px, new_displ: 0x%x", tgt_rip, n_dspl); if (tgt_rip - orig_insn >= 0) { if (n_dspl - 2 <= 127) @@ -344,15 +333,18 @@ done: static void __init_or_module noinline optimize_nops(struct alt_instr *a, u8 *instr) { unsigned long flags; + int i; - if (instr[0] != 0x90) - return; + for (i = 0; i < a->padlen; i++) { + if (instr[i] != 0x90) + return; + } local_irq_save(flags); add_nops(instr + (a->instrlen - a->padlen), a->padlen); local_irq_restore(flags); - DUMP_BYTES(instr, a->instrlen, "%p: [%d:%d) optimized NOPs: ", + DUMP_BYTES(instr, a->instrlen, "%px: [%d:%d) optimized NOPs: ", instr, a->instrlen - a->padlen, a->padlen); } @@ -373,7 +365,7 @@ void __init_or_module noinline apply_alternatives(struct alt_instr *start, u8 *instr, *replacement; u8 insnbuf[MAX_PATCH_LEN]; - DPRINTK("alt table %p -> %p", start, end); + DPRINTK("alt table %px, -> %px", start, end); /* * The scan order should be from start to end. A later scanned * alternative code can overwrite previously scanned alternative code. @@ -397,14 +389,14 @@ void __init_or_module noinline apply_alternatives(struct alt_instr *start, continue; } - DPRINTK("feat: %d*32+%d, old: (%p, len: %d), repl: (%p, len: %d), pad: %d", + DPRINTK("feat: %d*32+%d, old: (%px len: %d), repl: (%px, len: %d), pad: %d", a->cpuid >> 5, a->cpuid & 0x1f, instr, a->instrlen, replacement, a->replacementlen, a->padlen); - DUMP_BYTES(instr, a->instrlen, "%p: old_insn: ", instr); - DUMP_BYTES(replacement, a->replacementlen, "%p: rpl_insn: ", replacement); + DUMP_BYTES(instr, a->instrlen, "%px: old_insn: ", instr); + DUMP_BYTES(replacement, a->replacementlen, "%px: rpl_insn: ", replacement); memcpy(insnbuf, replacement, a->replacementlen); insnbuf_sz = a->replacementlen; @@ -430,7 +422,7 @@ void __init_or_module noinline apply_alternatives(struct alt_instr *start, a->instrlen - a->replacementlen); insnbuf_sz += a->instrlen - a->replacementlen; } - DUMP_BYTES(insnbuf, insnbuf_sz, "%p: final_insn: ", instr); + DUMP_BYTES(insnbuf, insnbuf_sz, "%px: final_insn: ", instr); text_poke_early(instr, insnbuf, insnbuf_sz); } @@ -442,7 +434,6 @@ static void alternatives_smp_lock(const s32 *start, const s32 *end, { const s32 *poff; - mutex_lock(&text_mutex); for (poff = start; poff < end; poff++) { u8 *ptr = (u8 *)poff + *poff; @@ -452,7 +443,6 @@ static void alternatives_smp_lock(const s32 *start, const s32 *end, if (*ptr == 0x3e) text_poke(ptr, ((unsigned char []){0xf0}), 1); } - mutex_unlock(&text_mutex); } static void alternatives_smp_unlock(const s32 *start, const s32 *end, @@ -460,7 +450,6 @@ static void alternatives_smp_unlock(const s32 *start, const s32 *end, { const s32 *poff; - mutex_lock(&text_mutex); for (poff = start; poff < end; poff++) { u8 *ptr = (u8 *)poff + *poff; @@ -470,7 +459,6 @@ static void alternatives_smp_unlock(const s32 *start, const s32 *end, if (*ptr == 0xf0) text_poke(ptr, ((unsigned char []){0x3E}), 1); } - mutex_unlock(&text_mutex); } struct smp_alt_module { @@ -489,8 +477,7 @@ struct smp_alt_module { struct list_head next; }; static LIST_HEAD(smp_alt_modules); -static DEFINE_MUTEX(smp_alt); -static bool uniproc_patched = false; /* protected by smp_alt */ +static bool uniproc_patched = false; /* protected by text_mutex */ void __init_or_module 
alternatives_smp_module_add(struct module *mod, char *name, @@ -499,7 +486,7 @@ void __init_or_module alternatives_smp_module_add(struct module *mod, { struct smp_alt_module *smp; - mutex_lock(&smp_alt); + mutex_lock(&text_mutex); if (!uniproc_patched) goto unlock; @@ -526,14 +513,14 @@ void __init_or_module alternatives_smp_module_add(struct module *mod, smp_unlock: alternatives_smp_unlock(locks, locks_end, text, text_end); unlock: - mutex_unlock(&smp_alt); + mutex_unlock(&text_mutex); } void __init_or_module alternatives_smp_module_del(struct module *mod) { struct smp_alt_module *item; - mutex_lock(&smp_alt); + mutex_lock(&text_mutex); list_for_each_entry(item, &smp_alt_modules, next) { if (mod != item->mod) continue; @@ -541,7 +528,7 @@ void __init_or_module alternatives_smp_module_del(struct module *mod) kfree(item); break; } - mutex_unlock(&smp_alt); + mutex_unlock(&text_mutex); } void alternatives_enable_smp(void) @@ -551,7 +538,7 @@ void alternatives_enable_smp(void) /* Why bother if there are no other CPUs? */ BUG_ON(num_possible_cpus() == 1); - mutex_lock(&smp_alt); + mutex_lock(&text_mutex); if (uniproc_patched) { pr_info("switching to SMP code\n"); @@ -563,10 +550,13 @@ void alternatives_enable_smp(void) mod->text, mod->text_end); uniproc_patched = false; } - mutex_unlock(&smp_alt); + mutex_unlock(&text_mutex); } -/* Return 1 if the address range is reserved for smp-alternatives */ +/* + * Return 1 if the address range is reserved for SMP-alternatives. + * Must hold text_mutex. + */ int alternatives_text_reserved(void *start, void *end) { struct smp_alt_module *mod; @@ -574,6 +564,8 @@ int alternatives_text_reserved(void *start, void *end) u8 *text_start = start; u8 *text_end = end; + lockdep_assert_held(&text_mutex); + list_for_each_entry(mod, &smp_alt_modules, next) { if (mod->text > text_end || mod->text_end < text_start) continue; @@ -596,9 +588,6 @@ void __init_or_module apply_paravirt(struct paravirt_patch_site *start, struct paravirt_patch_site *p; char insnbuf[MAX_PATCH_LEN]; - if (noreplace_paravirt) - return; - for (p = start; p < end; p++) { unsigned int used; diff --git a/arch/x86/kernel/amd_gart_64.c b/arch/x86/kernel/amd_gart_64.c index cc0e8bc0ea3f..ecd486cb06ab 100644 --- a/arch/x86/kernel/amd_gart_64.c +++ b/arch/x86/kernel/amd_gart_64.c @@ -31,6 +31,7 @@ #include <linux/io.h> #include <linux/gfp.h> #include <linux/atomic.h> +#include <linux/dma-direct.h> #include <asm/mtrr.h> #include <asm/pgtable.h> #include <asm/proto.h> diff --git a/arch/x86/kernel/amd_nb.c b/arch/x86/kernel/amd_nb.c index 6db28f17ff28..c88e0b127810 100644 --- a/arch/x86/kernel/amd_nb.c +++ b/arch/x86/kernel/amd_nb.c @@ -235,7 +235,7 @@ int amd_cache_northbridges(void) if (boot_cpu_data.x86 == 0x10 && boot_cpu_data.x86_model >= 0x8 && (boot_cpu_data.x86_model > 0x9 || - boot_cpu_data.x86_mask >= 0x1)) + boot_cpu_data.x86_stepping >= 0x1)) amd_northbridges.flags |= AMD_NB_L3_INDEX_DISABLE; if (boot_cpu_data.x86 == 0x15) diff --git a/arch/x86/kernel/aperture_64.c b/arch/x86/kernel/aperture_64.c index f5d92bc3b884..2c4d5ece7456 100644 --- a/arch/x86/kernel/aperture_64.c +++ b/arch/x86/kernel/aperture_64.c @@ -30,6 +30,7 @@ #include <asm/dma.h> #include <asm/amd_nb.h> #include <asm/x86_init.h> +#include <linux/crash_dump.h> /* * Using 512M as goal, in case kexec will load kernel_big @@ -56,6 +57,33 @@ int fallback_aper_force __initdata; int fix_aperture __initdata = 1; +#ifdef CONFIG_PROC_VMCORE +/* + * If the first kernel maps the aperture over e820 RAM, the kdump kernel will + * use the same 
range because it will remain configured in the northbridge. + * Trying to dump this area via /proc/vmcore may crash the machine, so exclude + * it from vmcore. + */ +static unsigned long aperture_pfn_start, aperture_page_count; + +static int gart_oldmem_pfn_is_ram(unsigned long pfn) +{ + return likely((pfn < aperture_pfn_start) || + (pfn >= aperture_pfn_start + aperture_page_count)); +} + +static void exclude_from_vmcore(u64 aper_base, u32 aper_order) +{ + aperture_pfn_start = aper_base >> PAGE_SHIFT; + aperture_page_count = (32 * 1024 * 1024) << aper_order >> PAGE_SHIFT; + WARN_ON(register_oldmem_pfn_is_ram(&gart_oldmem_pfn_is_ram)); +} +#else +static void exclude_from_vmcore(u64 aper_base, u32 aper_order) +{ +} +#endif + /* This code runs before the PCI subsystem is initialized, so just access the northbridge directly. */ @@ -435,8 +463,16 @@ int __init gart_iommu_hole_init(void) out: if (!fix && !fallback_aper_force) { - if (last_aper_base) + if (last_aper_base) { + /* + * If this is the kdump kernel, the first kernel + * may have allocated the range over its e820 RAM + * and fixed up the northbridge + */ + exclude_from_vmcore(last_aper_base, last_aper_order); + return 1; + } return 0; } @@ -473,6 +509,14 @@ out: return 0; } + /* + * If this is the kdump kernel _and_ the first kernel did not + * configure the aperture in the northbridge, this range may + * overlap with the first kernel's memory. We can't access the + * range through vmcore even though it should be part of the dump. + */ + exclude_from_vmcore(aper_alloc, aper_order); + /* Fix up the north bridges */ for (i = 0; i < amd_nb_bus_dev_ranges[i].dev_limit; i++) { int bus, dev_base, dev_limit; diff --git a/arch/x86/kernel/apic/Makefile b/arch/x86/kernel/apic/Makefile index 2fb7309c6900..a6fcaf16cdbf 100644 --- a/arch/x86/kernel/apic/Makefile +++ b/arch/x86/kernel/apic/Makefile @@ -7,12 +7,11 @@ # In particualr, smp_apic_timer_interrupt() is called in random places. KCOV_INSTRUMENT := n -obj-$(CONFIG_X86_LOCAL_APIC) += apic.o apic_noop.o ipi.o vector.o +obj-$(CONFIG_X86_LOCAL_APIC) += apic.o apic_common.o apic_noop.o ipi.o vector.o obj-y += hw_nmi.o obj-$(CONFIG_X86_IO_APIC) += io_apic.o obj-$(CONFIG_PCI_MSI) += msi.o -obj-$(CONFIG_HT_IRQ) += htirq.o obj-$(CONFIG_SMP) += ipi.o ifeq ($(CONFIG_X86_64),y) diff --git a/arch/x86/kernel/apic/apic.c b/arch/x86/kernel/apic/apic.c index ff891772c9f8..b203af0855b5 100644 --- a/arch/x86/kernel/apic/apic.c +++ b/arch/x86/kernel/apic/apic.c @@ -211,11 +211,7 @@ static inline int lapic_get_version(void) */ static inline int lapic_is_integrated(void) { -#ifdef CONFIG_X86_64 - return 1; -#else return APIC_INTEGRATED(lapic_get_version()); -#endif } /* @@ -298,14 +294,11 @@ int get_physical_broadcast(void) */ int lapic_get_maxlvt(void) { - unsigned int v; - - v = apic_read(APIC_LVR); /* * - we always have APIC integrated on 64bit mode * - 82489DXs do not report # of LVT entries */ - return APIC_INTEGRATED(GET_APIC_VERSION(v)) ? GET_APIC_MAXLVT(v) : 2; + return lapic_is_integrated() ? 
GET_APIC_MAXLVT(apic_read(APIC_LVR)) : 2; } /* @@ -553,7 +546,7 @@ static DEFINE_PER_CPU(struct clock_event_device, lapic_events); static u32 hsx_deadline_rev(void) { - switch (boot_cpu_data.x86_mask) { + switch (boot_cpu_data.x86_stepping) { case 0x02: return 0x3a; /* EP */ case 0x04: return 0x0f; /* EX */ } @@ -563,7 +556,7 @@ static u32 hsx_deadline_rev(void) static u32 bdx_deadline_rev(void) { - switch (boot_cpu_data.x86_mask) { + switch (boot_cpu_data.x86_stepping) { case 0x02: return 0x00000011; case 0x03: return 0x0700000e; case 0x04: return 0x0f00000c; @@ -575,7 +568,7 @@ static u32 bdx_deadline_rev(void) static u32 skx_deadline_rev(void) { - switch (boot_cpu_data.x86_mask) { + switch (boot_cpu_data.x86_stepping) { case 0x03: return 0x01000136; case 0x04: return 0x02000014; } @@ -1229,6 +1222,70 @@ void __init sync_Arb_IDs(void) APIC_INT_LEVELTRIG | APIC_DM_INIT); } +enum apic_intr_mode_id apic_intr_mode; + +static int __init apic_intr_mode_select(void) +{ + /* Check kernel option */ + if (disable_apic) { + pr_info("APIC disabled via kernel command line\n"); + return APIC_PIC; + } + + /* Check BIOS */ +#ifdef CONFIG_X86_64 + /* On 64-bit, the APIC must be integrated, Check local APIC only */ + if (!boot_cpu_has(X86_FEATURE_APIC)) { + disable_apic = 1; + pr_info("APIC disabled by BIOS\n"); + return APIC_PIC; + } +#else + /* On 32-bit, the APIC may be integrated APIC or 82489DX */ + + /* Neither 82489DX nor integrated APIC ? */ + if (!boot_cpu_has(X86_FEATURE_APIC) && !smp_found_config) { + disable_apic = 1; + return APIC_PIC; + } + + /* If the BIOS pretends there is an integrated APIC ? */ + if (!boot_cpu_has(X86_FEATURE_APIC) && + APIC_INTEGRATED(boot_cpu_apic_version)) { + disable_apic = 1; + pr_err(FW_BUG "Local APIC %d not detected, force emulation\n", + boot_cpu_physical_apicid); + return APIC_PIC; + } +#endif + + /* Check MP table or ACPI MADT configuration */ + if (!smp_found_config) { + disable_ioapic_support(); + if (!acpi_lapic) { + pr_info("APIC: ACPI MADT or MP tables are not detected\n"); + return APIC_VIRTUAL_WIRE_NO_CONFIG; + } + return APIC_VIRTUAL_WIRE; + } + +#ifdef CONFIG_SMP + /* If SMP should be disabled, then really disable it! */ + if (!setup_max_cpus) { + pr_info("APIC: SMP mode deactivated\n"); + return APIC_SYMMETRIC_IO_NO_ROUTING; + } + + if (read_apic_id() != boot_cpu_physical_apicid) { + panic("Boot APIC ID in local APIC unexpected (%d vs %d)", + read_apic_id(), boot_cpu_physical_apicid); + /* Or can we switch back to PIC here? */ + } +#endif + + return APIC_SYMMETRIC_IO; +} + /* * An initial setup of the virtual wire mode. 
*/ @@ -1278,6 +1335,38 @@ void __init init_bsp_APIC(void) apic_write(APIC_LVT1, value); } +/* Init the interrupt delivery mode for the BSP */ +void __init apic_intr_mode_init(void) +{ + bool upmode = IS_ENABLED(CONFIG_UP_LATE_INIT); + + apic_intr_mode = apic_intr_mode_select(); + + switch (apic_intr_mode) { + case APIC_PIC: + pr_info("APIC: Keep in PIC mode(8259)\n"); + return; + case APIC_VIRTUAL_WIRE: + pr_info("APIC: Switch to virtual wire mode setup\n"); + default_setup_apic_routing(); + break; + case APIC_VIRTUAL_WIRE_NO_CONFIG: + pr_info("APIC: Switch to virtual wire mode setup with no configuration\n"); + upmode = true; + default_setup_apic_routing(); + break; + case APIC_SYMMETRIC_IO: + pr_info("APIC: Switch to symmetric I/O mode setup\n"); + default_setup_apic_routing(); + break; + case APIC_SYMMETRIC_IO_NO_ROUTING: + pr_info("APIC: Switch to symmetric I/O mode setup in no SMP routine\n"); + break; + } + + apic_bsp_setup(upmode); +} + static void lapic_setup_esr(void) { unsigned int oldvalue, value, maxlvt; @@ -1473,7 +1562,7 @@ void setup_local_APIC(void) /* * Set up LVT0, LVT1: * - * set up through-local-APIC on the BP's LINT0. This is not + * set up through-local-APIC on the boot CPU's LINT0. This is not * strictly necessary in pure symmetric-IO mode, but sometimes * we delegate interrupts to the 8259A. */ @@ -1499,7 +1588,9 @@ void setup_local_APIC(void) value = APIC_DM_NMI; else value = APIC_DM_NMI | APIC_LVT_MASKED; - if (!lapic_is_integrated()) /* 82489DX */ + + /* Is 82489DX ? */ + if (!lapic_is_integrated()) value |= APIC_LVT_LEVEL_TRIGGER; apic_write(APIC_LVT1, value); @@ -1645,7 +1736,7 @@ static __init void try_to_enable_x2apic(int remap_mode) * under KVM */ if (max_physical_apicid > 255 || - !hypervisor_x2apic_available()) { + !x86_init.hyper.x2apic_available()) { pr_info("x2apic: IRQ remapping doesn't support X2APIC mode\n"); x2apic_disable(); return; @@ -1885,8 +1976,8 @@ void __init init_apic_mappings(void) * yeah -- we lie about apic_version * in case if apic was disabled via boot option * but it's not a problem for SMP compiled kernel - * since smp_sanity_check is prepared for such a case - * and disable smp mode + * since apic_intr_mode_select is prepared for such + * a case and disable smp mode */ boot_cpu_apic_version = GET_APIC_VERSION(apic_read(APIC_LVR)); } @@ -2242,44 +2333,6 @@ int hard_smp_processor_id(void) return read_apic_id(); } -void default_init_apic_ldr(void) -{ - unsigned long val; - - apic_write(APIC_DFR, APIC_DFR_VALUE); - val = apic_read(APIC_LDR) & ~APIC_LDR_MASK; - val |= SET_APIC_LOGICAL_ID(1UL << smp_processor_id()); - apic_write(APIC_LDR, val); -} - -int default_cpu_mask_to_apicid(const struct cpumask *mask, - struct irq_data *irqdata, - unsigned int *apicid) -{ - unsigned int cpu = cpumask_first(mask); - - if (cpu >= nr_cpu_ids) - return -EINVAL; - *apicid = per_cpu(x86_cpu_to_apicid, cpu); - irq_data_update_effective_affinity(irqdata, cpumask_of(cpu)); - return 0; -} - -int flat_cpu_mask_to_apicid(const struct cpumask *mask, - struct irq_data *irqdata, - unsigned int *apicid) - -{ - struct cpumask *effmsk = irq_data_get_effective_affinity_mask(irqdata); - unsigned long cpu_mask = cpumask_bits(mask)[0] & APIC_ALL_CPUS; - - if (!cpu_mask) - return -EINVAL; - *apicid = (unsigned int)cpu_mask; - cpumask_bits(effmsk)[0] = cpu_mask; - return 0; -} - /* * Override the generic EOI implementation with an optimized version. 
* Only called during early boot when only one CPU is active and with @@ -2322,72 +2375,27 @@ static void __init apic_bsp_up_setup(void) * Returns: * apic_id of BSP APIC */ -int __init apic_bsp_setup(bool upmode) +void __init apic_bsp_setup(bool upmode) { - int id; - connect_bsp_APIC(); if (upmode) apic_bsp_up_setup(); setup_local_APIC(); - if (x2apic_mode) - id = apic_read(APIC_LDR); - else - id = GET_APIC_LOGICAL_ID(apic_read(APIC_LDR)); - enable_IO_APIC(); end_local_APIC_setup(); irq_remap_enable_fault_handling(); setup_IO_APIC(); - /* Setup local timer */ - x86_init.timers.setup_percpu_clockev(); - return id; -} - -/* - * This initializes the IO-APIC and APIC hardware if this is - * a UP kernel. - */ -int __init APIC_init_uniprocessor(void) -{ - if (disable_apic) { - pr_info("Apic disabled\n"); - return -1; - } -#ifdef CONFIG_X86_64 - if (!boot_cpu_has(X86_FEATURE_APIC)) { - disable_apic = 1; - pr_info("Apic disabled by BIOS\n"); - return -1; - } -#else - if (!smp_found_config && !boot_cpu_has(X86_FEATURE_APIC)) - return -1; - - /* - * Complain if the BIOS pretends there is one. - */ - if (!boot_cpu_has(X86_FEATURE_APIC) && - APIC_INTEGRATED(boot_cpu_apic_version)) { - pr_err("BIOS bug, local APIC 0x%x not detected!...\n", - boot_cpu_physical_apicid); - return -1; - } -#endif - - if (!smp_found_config) - disable_ioapic_support(); - - default_setup_apic_routing(); - apic_bsp_setup(true); - return 0; } #ifdef CONFIG_UP_LATE_INIT void __init up_late_init(void) { - APIC_init_uniprocessor(); + if (apic_intr_mode == APIC_PIC) + return; + + /* Setup local timer */ + x86_init.timers.setup_percpu_clockev(); } #endif @@ -2667,11 +2675,13 @@ static int __init apic_set_verbosity(char *arg) apic_verbosity = APIC_DEBUG; else if (strcmp("verbose", arg) == 0) apic_verbosity = APIC_VERBOSE; +#ifdef CONFIG_X86_64 else { pr_warning("APIC Verbosity level %s not recognised" " use apic=verbose or apic=debug\n", arg); return -EINVAL; } +#endif return 0; } diff --git a/arch/x86/kernel/apic/apic_common.c b/arch/x86/kernel/apic/apic_common.c new file mode 100644 index 000000000000..a360801779ae --- /dev/null +++ b/arch/x86/kernel/apic/apic_common.c @@ -0,0 +1,46 @@ +/* + * Common functions shared between the various APIC flavours + * + * SPDX-License-Identifier: GPL-2.0 + */ +#include <linux/irq.h> +#include <asm/apic.h> + +u32 apic_default_calc_apicid(unsigned int cpu) +{ + return per_cpu(x86_cpu_to_apicid, cpu); +} + +u32 apic_flat_calc_apicid(unsigned int cpu) +{ + return 1U << cpu; +} + +bool default_check_apicid_used(physid_mask_t *map, int apicid) +{ + return physid_isset(apicid, *map); +} + +void default_ioapic_phys_id_map(physid_mask_t *phys_map, physid_mask_t *retmap) +{ + *retmap = *phys_map; +} + +int default_cpu_present_to_apicid(int mps_cpu) +{ + if (mps_cpu < nr_cpu_ids && cpu_present(mps_cpu)) + return (int)per_cpu(x86_bios_cpu_apicid, mps_cpu); + else + return BAD_APICID; +} +EXPORT_SYMBOL_GPL(default_cpu_present_to_apicid); + +int default_check_phys_apicid_present(int phys_apicid) +{ + return physid_isset(phys_apicid, phys_cpu_present_map); +} + +int default_apic_id_valid(int apicid) +{ + return (apicid < 255); +} diff --git a/arch/x86/kernel/apic/apic_flat_64.c b/arch/x86/kernel/apic/apic_flat_64.c index dedd5a41ba48..e84c9eb4e5b4 100644 --- a/arch/x86/kernel/apic/apic_flat_64.c +++ b/arch/x86/kernel/apic/apic_flat_64.c @@ -19,6 +19,7 @@ #include <asm/smp.h> #include <asm/apic.h> #include <asm/ipi.h> +#include <asm/jailhouse_para.h> #include <linux/acpi.h> @@ -84,12 +85,8 @@ 
flat_send_IPI_mask_allbutself(const struct cpumask *cpumask, int vector) static void flat_send_IPI_allbutself(int vector) { int cpu = smp_processor_id(); -#ifdef CONFIG_HOTPLUG_CPU - int hotplug = 1; -#else - int hotplug = 0; -#endif - if (hotplug || vector == NMI_VECTOR) { + + if (IS_ENABLED(CONFIG_HOTPLUG_CPU) || vector == NMI_VECTOR) { if (!cpumask_equal(cpu_online_mask, cpumask_of(cpu))) { unsigned long mask = cpumask_bits(cpu_online_mask)[0]; @@ -119,7 +116,7 @@ static unsigned int flat_get_apic_id(unsigned long x) return (x >> 24) & 0xFF; } -static unsigned long set_apic_id(unsigned int id) +static u32 set_apic_id(unsigned int id) { return (id & 0xFF) << 24; } @@ -151,15 +148,13 @@ static struct apic apic_flat __ro_after_init = { .apic_id_valid = default_apic_id_valid, .apic_id_registered = flat_apic_id_registered, - .irq_delivery_mode = dest_LowestPrio, + .irq_delivery_mode = dest_Fixed, .irq_dest_mode = 1, /* logical */ - .target_cpus = online_target_cpus, .disable_esr = 0, .dest_logical = APIC_DEST_LOGICAL, .check_apicid_used = NULL, - .vector_allocation_domain = flat_vector_allocation_domain, .init_apic_ldr = flat_init_apic_ldr, .ioapic_phys_id_map = NULL, @@ -172,7 +167,7 @@ static struct apic apic_flat __ro_after_init = { .get_apic_id = flat_get_apic_id, .set_apic_id = set_apic_id, - .cpu_mask_to_apicid = flat_cpu_mask_to_apicid, + .calc_dest_apicid = apic_flat_calc_apicid, .send_IPI = default_send_IPI_single, .send_IPI_mask = flat_send_IPI_mask, @@ -220,6 +215,15 @@ static int physflat_acpi_madt_oem_check(char *oem_id, char *oem_table_id) return 0; } +static void physflat_init_apic_ldr(void) +{ + /* + * LDR and DFR are not involved in physflat mode, rather: + * "In physical destination mode, the destination processor is + * specified by its local APIC ID [...]." 
(Intel SDM, 10.6.2.1) + */ +} + static void physflat_send_IPI_allbutself(int vector) { default_send_IPI_mask_allbutself_phys(cpu_online_mask, vector); @@ -232,7 +236,8 @@ static void physflat_send_IPI_all(int vector) static int physflat_probe(void) { - if (apic == &apic_physflat || num_possible_cpus() > 8) + if (apic == &apic_physflat || num_possible_cpus() > 8 || + jailhouse_paravirt()) return 1; return 0; @@ -249,14 +254,11 @@ static struct apic apic_physflat __ro_after_init = { .irq_delivery_mode = dest_Fixed, .irq_dest_mode = 0, /* physical */ - .target_cpus = online_target_cpus, .disable_esr = 0, .dest_logical = 0, .check_apicid_used = NULL, - .vector_allocation_domain = default_vector_allocation_domain, - /* not needed, but shouldn't hurt: */ - .init_apic_ldr = flat_init_apic_ldr, + .init_apic_ldr = physflat_init_apic_ldr, .ioapic_phys_id_map = NULL, .setup_apic_routing = NULL, @@ -268,7 +270,7 @@ static struct apic apic_physflat __ro_after_init = { .get_apic_id = flat_get_apic_id, .set_apic_id = set_apic_id, - .cpu_mask_to_apicid = default_cpu_mask_to_apicid, + .calc_dest_apicid = apic_default_calc_apicid, .send_IPI = default_send_IPI_single_phys, .send_IPI_mask = default_send_IPI_mask_sequence_phys, diff --git a/arch/x86/kernel/apic/apic_noop.c b/arch/x86/kernel/apic/apic_noop.c index c8d211277315..5078b5ce63a7 100644 --- a/arch/x86/kernel/apic/apic_noop.c +++ b/arch/x86/kernel/apic/apic_noop.c @@ -84,20 +84,6 @@ static int noop_apic_id_registered(void) return physid_isset(0, phys_cpu_present_map); } -static const struct cpumask *noop_target_cpus(void) -{ - /* only BSP here */ - return cpumask_of(0); -} - -static void noop_vector_allocation_domain(int cpu, struct cpumask *retmask, - const struct cpumask *mask) -{ - if (cpu != 0) - pr_warning("APIC: Vector allocated for non-BSP cpu\n"); - cpumask_copy(retmask, cpumask_of(cpu)); -} - static u32 noop_apic_read(u32 reg) { WARN_ON_ONCE(boot_cpu_has(X86_FEATURE_APIC) && !disable_apic); @@ -109,6 +95,13 @@ static void noop_apic_write(u32 reg, u32 v) WARN_ON_ONCE(boot_cpu_has(X86_FEATURE_APIC) && !disable_apic); } +#ifdef CONFIG_X86_32 +static int noop_x86_32_early_logical_apicid(int cpu) +{ + return BAD_APICID; +} +#endif + struct apic apic_noop __ro_after_init = { .name = "noop", .probe = noop_probe, @@ -117,16 +110,14 @@ struct apic apic_noop __ro_after_init = { .apic_id_valid = default_apic_id_valid, .apic_id_registered = noop_apic_id_registered, - .irq_delivery_mode = dest_LowestPrio, + .irq_delivery_mode = dest_Fixed, /* logical delivery broadcast to all CPUs: */ .irq_dest_mode = 1, - .target_cpus = noop_target_cpus, .disable_esr = 0, .dest_logical = APIC_DEST_LOGICAL, .check_apicid_used = default_check_apicid_used, - .vector_allocation_domain = noop_vector_allocation_domain, .init_apic_ldr = noop_init_apic_ldr, .ioapic_phys_id_map = default_ioapic_phys_id_map, @@ -142,7 +133,7 @@ struct apic apic_noop __ro_after_init = { .get_apic_id = noop_get_apic_id, .set_apic_id = NULL, - .cpu_mask_to_apicid = flat_cpu_mask_to_apicid, + .calc_dest_apicid = apic_flat_calc_apicid, .send_IPI = noop_send_IPI, .send_IPI_mask = noop_send_IPI_mask, diff --git a/arch/x86/kernel/apic/apic_numachip.c b/arch/x86/kernel/apic/apic_numachip.c index 2fda912219a6..134e04506ab4 100644 --- a/arch/x86/kernel/apic/apic_numachip.c +++ b/arch/x86/kernel/apic/apic_numachip.c @@ -38,7 +38,7 @@ static unsigned int numachip1_get_apic_id(unsigned long x) return id; } -static unsigned long numachip1_set_apic_id(unsigned int id) +static u32 numachip1_set_apic_id(unsigned 
int id) { return (id & 0xff) << 24; } @@ -51,7 +51,7 @@ static unsigned int numachip2_get_apic_id(unsigned long x) return ((mcfg >> (28 - 8)) & 0xfff00) | (x >> 24); } -static unsigned long numachip2_set_apic_id(unsigned int id) +static u32 numachip2_set_apic_id(unsigned int id) { return id << 24; } @@ -249,12 +249,10 @@ static const struct apic apic_numachip1 __refconst = { .irq_delivery_mode = dest_Fixed, .irq_dest_mode = 0, /* physical */ - .target_cpus = online_target_cpus, .disable_esr = 0, .dest_logical = 0, .check_apicid_used = NULL, - .vector_allocation_domain = default_vector_allocation_domain, .init_apic_ldr = flat_init_apic_ldr, .ioapic_phys_id_map = NULL, @@ -267,7 +265,7 @@ static const struct apic apic_numachip1 __refconst = { .get_apic_id = numachip1_get_apic_id, .set_apic_id = numachip1_set_apic_id, - .cpu_mask_to_apicid = default_cpu_mask_to_apicid, + .calc_dest_apicid = apic_default_calc_apicid, .send_IPI = numachip_send_IPI_one, .send_IPI_mask = numachip_send_IPI_mask, @@ -300,12 +298,10 @@ static const struct apic apic_numachip2 __refconst = { .irq_delivery_mode = dest_Fixed, .irq_dest_mode = 0, /* physical */ - .target_cpus = online_target_cpus, .disable_esr = 0, .dest_logical = 0, .check_apicid_used = NULL, - .vector_allocation_domain = default_vector_allocation_domain, .init_apic_ldr = flat_init_apic_ldr, .ioapic_phys_id_map = NULL, @@ -318,7 +314,7 @@ static const struct apic apic_numachip2 __refconst = { .get_apic_id = numachip2_get_apic_id, .set_apic_id = numachip2_set_apic_id, - .cpu_mask_to_apicid = default_cpu_mask_to_apicid, + .calc_dest_apicid = apic_default_calc_apicid, .send_IPI = numachip_send_IPI_one, .send_IPI_mask = numachip_send_IPI_mask, diff --git a/arch/x86/kernel/apic/bigsmp_32.c b/arch/x86/kernel/apic/bigsmp_32.c index e12fbcfc9571..afee386ff711 100644 --- a/arch/x86/kernel/apic/bigsmp_32.c +++ b/arch/x86/kernel/apic/bigsmp_32.c @@ -27,9 +27,9 @@ static int bigsmp_apic_id_registered(void) return 1; } -static unsigned long bigsmp_check_apicid_used(physid_mask_t *map, int apicid) +static bool bigsmp_check_apicid_used(physid_mask_t *map, int apicid) { - return 0; + return false; } static int bigsmp_early_logical_apicid(int cpu) @@ -155,12 +155,10 @@ static struct apic apic_bigsmp __ro_after_init = { /* phys delivery to target CPU: */ .irq_dest_mode = 0, - .target_cpus = default_target_cpus, .disable_esr = 1, .dest_logical = 0, .check_apicid_used = bigsmp_check_apicid_used, - .vector_allocation_domain = default_vector_allocation_domain, .init_apic_ldr = bigsmp_init_apic_ldr, .ioapic_phys_id_map = bigsmp_ioapic_phys_id_map, @@ -173,7 +171,7 @@ static struct apic apic_bigsmp __ro_after_init = { .get_apic_id = bigsmp_get_apic_id, .set_apic_id = NULL, - .cpu_mask_to_apicid = default_cpu_mask_to_apicid, + .calc_dest_apicid = apic_default_calc_apicid, .send_IPI = default_send_IPI_single_phys, .send_IPI_mask = default_send_IPI_mask_sequence_phys, diff --git a/arch/x86/kernel/apic/htirq.c b/arch/x86/kernel/apic/htirq.c deleted file mode 100644 index 56ccf9346b08..000000000000 --- a/arch/x86/kernel/apic/htirq.c +++ /dev/null @@ -1,197 +0,0 @@ -/* - * Support Hypertransport IRQ - * - * Copyright (C) 1997, 1998, 1999, 2000, 2009 Ingo Molnar, Hajnalka Szabo - * Moved from arch/x86/kernel/apic/io_apic.c. 
- * Jiang Liu <jiang.liu@linux.intel.com> - * Add support of hierarchical irqdomain - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 as - * published by the Free Software Foundation. - */ -#include <linux/mm.h> -#include <linux/interrupt.h> -#include <linux/init.h> -#include <linux/device.h> -#include <linux/pci.h> -#include <linux/htirq.h> -#include <asm/irqdomain.h> -#include <asm/hw_irq.h> -#include <asm/apic.h> -#include <asm/hypertransport.h> - -static struct irq_domain *htirq_domain; - -/* - * Hypertransport interrupt support - */ -static int -ht_set_affinity(struct irq_data *data, const struct cpumask *mask, bool force) -{ - struct irq_data *parent = data->parent_data; - int ret; - - ret = parent->chip->irq_set_affinity(parent, mask, force); - if (ret >= 0) { - struct ht_irq_msg msg; - struct irq_cfg *cfg = irqd_cfg(data); - - fetch_ht_irq_msg(data->irq, &msg); - msg.address_lo &= ~(HT_IRQ_LOW_VECTOR_MASK | - HT_IRQ_LOW_DEST_ID_MASK); - msg.address_lo |= HT_IRQ_LOW_VECTOR(cfg->vector) | - HT_IRQ_LOW_DEST_ID(cfg->dest_apicid); - msg.address_hi &= ~(HT_IRQ_HIGH_DEST_ID_MASK); - msg.address_hi |= HT_IRQ_HIGH_DEST_ID(cfg->dest_apicid); - write_ht_irq_msg(data->irq, &msg); - } - - return ret; -} - -static struct irq_chip ht_irq_chip = { - .name = "PCI-HT", - .irq_mask = mask_ht_irq, - .irq_unmask = unmask_ht_irq, - .irq_ack = irq_chip_ack_parent, - .irq_set_affinity = ht_set_affinity, - .irq_retrigger = irq_chip_retrigger_hierarchy, - .flags = IRQCHIP_SKIP_SET_WAKE, -}; - -static int htirq_domain_alloc(struct irq_domain *domain, unsigned int virq, - unsigned int nr_irqs, void *arg) -{ - struct ht_irq_cfg *ht_cfg; - struct irq_alloc_info *info = arg; - struct pci_dev *dev; - irq_hw_number_t hwirq; - int ret; - - if (nr_irqs > 1 || !info) - return -EINVAL; - - dev = info->ht_dev; - hwirq = (info->ht_idx & 0xFF) | - PCI_DEVID(dev->bus->number, dev->devfn) << 8 | - (pci_domain_nr(dev->bus) & 0xFFFFFFFF) << 24; - if (irq_find_mapping(domain, hwirq) > 0) - return -EEXIST; - - ht_cfg = kmalloc(sizeof(*ht_cfg), GFP_KERNEL); - if (!ht_cfg) - return -ENOMEM; - - ret = irq_domain_alloc_irqs_parent(domain, virq, nr_irqs, info); - if (ret < 0) { - kfree(ht_cfg); - return ret; - } - - /* Initialize msg to a value that will never match the first write. */ - ht_cfg->msg.address_lo = 0xffffffff; - ht_cfg->msg.address_hi = 0xffffffff; - ht_cfg->dev = info->ht_dev; - ht_cfg->update = info->ht_update; - ht_cfg->pos = info->ht_pos; - ht_cfg->idx = 0x10 + (info->ht_idx * 2); - irq_domain_set_info(domain, virq, hwirq, &ht_irq_chip, ht_cfg, - handle_edge_irq, ht_cfg, "edge"); - - return 0; -} - -static void htirq_domain_free(struct irq_domain *domain, unsigned int virq, - unsigned int nr_irqs) -{ - struct irq_data *irq_data = irq_domain_get_irq_data(domain, virq); - - BUG_ON(nr_irqs != 1); - kfree(irq_data->chip_data); - irq_domain_free_irqs_top(domain, virq, nr_irqs); -} - -static void htirq_domain_activate(struct irq_domain *domain, - struct irq_data *irq_data) -{ - struct ht_irq_msg msg; - struct irq_cfg *cfg = irqd_cfg(irq_data); - - msg.address_hi = HT_IRQ_HIGH_DEST_ID(cfg->dest_apicid); - msg.address_lo = - HT_IRQ_LOW_BASE | - HT_IRQ_LOW_DEST_ID(cfg->dest_apicid) | - HT_IRQ_LOW_VECTOR(cfg->vector) | - ((apic->irq_dest_mode == 0) ? - HT_IRQ_LOW_DM_PHYSICAL : - HT_IRQ_LOW_DM_LOGICAL) | - HT_IRQ_LOW_RQEOI_EDGE | - ((apic->irq_delivery_mode != dest_LowestPrio) ? 
- HT_IRQ_LOW_MT_FIXED : - HT_IRQ_LOW_MT_ARBITRATED) | - HT_IRQ_LOW_IRQ_MASKED; - write_ht_irq_msg(irq_data->irq, &msg); -} - -static void htirq_domain_deactivate(struct irq_domain *domain, - struct irq_data *irq_data) -{ - struct ht_irq_msg msg; - - memset(&msg, 0, sizeof(msg)); - write_ht_irq_msg(irq_data->irq, &msg); -} - -static const struct irq_domain_ops htirq_domain_ops = { - .alloc = htirq_domain_alloc, - .free = htirq_domain_free, - .activate = htirq_domain_activate, - .deactivate = htirq_domain_deactivate, -}; - -void __init arch_init_htirq_domain(struct irq_domain *parent) -{ - struct fwnode_handle *fn; - - if (disable_apic) - return; - - fn = irq_domain_alloc_named_fwnode("PCI-HT"); - if (!fn) - goto warn; - - htirq_domain = irq_domain_create_tree(fn, &htirq_domain_ops, NULL); - irq_domain_free_fwnode(fn); - if (!htirq_domain) - goto warn; - - htirq_domain->parent = parent; - return; - -warn: - pr_warn("Failed to initialize irqdomain for HTIRQ.\n"); -} - -int arch_setup_ht_irq(int idx, int pos, struct pci_dev *dev, - ht_irq_update_t *update) -{ - struct irq_alloc_info info; - - if (!htirq_domain) - return -ENOSYS; - - init_irq_alloc_info(&info, NULL); - info.ht_idx = idx; - info.ht_pos = pos; - info.ht_dev = dev; - info.ht_update = update; - - return irq_domain_alloc_irqs(htirq_domain, 1, dev_to_node(&dev->dev), - &info); -} - -void arch_teardown_ht_irq(unsigned int irq) -{ - irq_domain_free_irqs(irq, 1); -} diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c index 3b89b27945ff..7c5538769f7e 100644 --- a/arch/x86/kernel/apic/io_apic.c +++ b/arch/x86/kernel/apic/io_apic.c @@ -800,18 +800,18 @@ static int irq_polarity(int idx) /* * Determine IRQ line polarity (high active or low active): */ - switch (mp_irqs[idx].irqflag & 0x03) { - case 0: + switch (mp_irqs[idx].irqflag & MP_IRQPOL_MASK) { + case MP_IRQPOL_DEFAULT: /* conforms to spec, ie. bus-type dependent polarity */ if (test_bit(bus, mp_bus_not_pci)) return default_ISA_polarity(idx); else return default_PCI_polarity(idx); - case 1: + case MP_IRQPOL_ACTIVE_HIGH: return IOAPIC_POL_HIGH; - case 2: + case MP_IRQPOL_RESERVED: pr_warn("IOAPIC: Invalid polarity: 2, defaulting to low\n"); - case 3: + case MP_IRQPOL_ACTIVE_LOW: default: /* Pointless default required due to do gcc stupidity */ return IOAPIC_POL_LOW; } @@ -845,8 +845,8 @@ static int irq_trigger(int idx) /* * Determine IRQ trigger mode (edge or level sensitive): */ - switch ((mp_irqs[idx].irqflag >> 2) & 0x03) { - case 0: + switch (mp_irqs[idx].irqflag & MP_IRQTRIG_MASK) { + case MP_IRQTRIG_DEFAULT: /* conforms to spec, ie. 
bus-type dependent trigger mode */ if (test_bit(bus, mp_bus_not_pci)) trigger = default_ISA_trigger(idx); @@ -854,11 +854,11 @@ static int irq_trigger(int idx) trigger = default_PCI_trigger(idx); /* Take EISA into account */ return eisa_irq_trigger(idx, bus, trigger); - case 1: + case MP_IRQTRIG_EDGE: return IOAPIC_EDGE; - case 2: + case MP_IRQTRIG_RESERVED: pr_warn("IOAPIC: Invalid trigger mode 2 defaulting to level\n"); - case 3: + case MP_IRQTRIG_LEVEL: default: /* Pointless default required due to do gcc stupidity */ return IOAPIC_LEVEL; } @@ -1014,6 +1014,7 @@ static int alloc_isa_irq_from_domain(struct irq_domain *domain, info->ioapic_pin)) return -ENOMEM; } else { + info->flags |= X86_IRQ_ALLOC_LEGACY; irq = __irq_domain_alloc_irqs(domain, irq, 1, node, info, true, NULL); if (irq >= 0) { @@ -1586,6 +1587,43 @@ static int __init notimercheck(char *s) } __setup("no_timer_check", notimercheck); +static void __init delay_with_tsc(void) +{ + unsigned long long start, now; + unsigned long end = jiffies + 4; + + start = rdtsc(); + + /* + * We don't know the TSC frequency yet, but waiting for + * 40000000000/HZ TSC cycles is safe: + * 4 GHz == 10 jiffies + * 1 GHz == 40 jiffies + */ + do { + rep_nop(); + now = rdtsc(); + } while ((now - start) < 40000000000ULL / HZ && + time_before_eq(jiffies, end)); +} + +static void __init delay_without_tsc(void) +{ + unsigned long end = jiffies + 4; + int band = 1; + + /* + * We don't know any frequency yet, but waiting for + * 40940000000/HZ cycles is safe: + * 4 GHz == 10 jiffies + * 1 GHz == 40 jiffies + * 1 << 1 + 1 << 2 +...+ 1 << 11 = 4094 + */ + do { + __delay(((1U << band++) * 10000000UL) / HZ); + } while (band < 12 && time_before_eq(jiffies, end)); +} + /* * There is a nasty bug in some older SMP boards, their mptable lies * about the timer IRQ. We do the following to work around the situation: @@ -1604,8 +1642,12 @@ static int __init timer_irq_works(void) local_save_flags(flags); local_irq_enable(); - /* Let ten ticks pass... */ - mdelay((10 * 1000) / HZ); + + if (boot_cpu_has(X86_FEATURE_TSC)) + delay_with_tsc(); + else + delay_without_tsc(); + local_irq_restore(flags); /* @@ -1821,26 +1863,36 @@ static void ioapic_ir_ack_level(struct irq_data *irq_data) eoi_ioapic_pin(data->entry.vector, data); } +static void ioapic_configure_entry(struct irq_data *irqd) +{ + struct mp_chip_data *mpd = irqd->chip_data; + struct irq_cfg *cfg = irqd_cfg(irqd); + struct irq_pin_list *entry; + + /* + * Only update when the parent is the vector domain, don't touch it + * if the parent is the remapping domain. Check the installed + * ioapic chip to verify that. 
+ */ + if (irqd->chip == &ioapic_chip) { + mpd->entry.dest = cfg->dest_apicid; + mpd->entry.vector = cfg->vector; + } + for_each_irq_pin(entry, mpd->irq_2_pin) + __ioapic_write_entry(entry->apic, entry->pin, mpd->entry); +} + static int ioapic_set_affinity(struct irq_data *irq_data, const struct cpumask *mask, bool force) { struct irq_data *parent = irq_data->parent_data; - struct mp_chip_data *data = irq_data->chip_data; - struct irq_pin_list *entry; - struct irq_cfg *cfg; unsigned long flags; int ret; ret = parent->chip->irq_set_affinity(parent, mask, force); raw_spin_lock_irqsave(&ioapic_lock, flags); - if (ret >= 0 && ret != IRQ_SET_MASK_OK_DONE) { - cfg = irqd_cfg(irq_data); - data->entry.dest = cfg->dest_apicid; - data->entry.vector = cfg->vector; - for_each_irq_pin(entry, data->irq_2_pin) - __ioapic_write_entry(entry->apic, entry->pin, - data->entry); - } + if (ret >= 0 && ret != IRQ_SET_MASK_OK_DONE) + ioapic_configure_entry(irq_data); raw_spin_unlock_irqrestore(&ioapic_lock, flags); return ret; @@ -2097,7 +2149,7 @@ static inline void __init check_timer(void) unmask_ioapic_irq(irq_get_irq_data(0)); } irq_domain_deactivate_irq(irq_data); - irq_domain_activate_irq(irq_data); + irq_domain_activate_irq(irq_data, false); if (timer_irq_works()) { if (disable_timer_pin_1 > 0) clear_IO_APIC_pin(0, pin1); @@ -2119,7 +2171,7 @@ static inline void __init check_timer(void) */ replace_pin_at_irq_node(data, node, apic1, pin1, apic2, pin2); irq_domain_deactivate_irq(irq_data); - irq_domain_activate_irq(irq_data); + irq_domain_activate_irq(irq_data, false); legacy_pic->unmask(0); if (timer_irq_works()) { apic_printk(APIC_QUIET, KERN_INFO "....... works.\n"); @@ -2513,52 +2565,9 @@ int acpi_get_override_irq(u32 gsi, int *trigger, int *polarity) } /* - * This function currently is only a helper for the i386 smp boot process where - * we need to reprogram the ioredtbls to cater for the cpus which have come online - * so mask in all cases should simply be apic->target_cpus() + * This function updates target affinity of IOAPIC interrupts to include + * the CPUs which came online during SMP bringup. 
*/ -#ifdef CONFIG_SMP -void __init setup_ioapic_dest(void) -{ - int pin, ioapic, irq, irq_entry; - const struct cpumask *mask; - struct irq_desc *desc; - struct irq_data *idata; - struct irq_chip *chip; - - if (skip_ioapic_setup == 1) - return; - - for_each_ioapic_pin(ioapic, pin) { - irq_entry = find_irq_entry(ioapic, pin, mp_INT); - if (irq_entry == -1) - continue; - - irq = pin_2_irq(irq_entry, ioapic, pin, 0); - if (irq < 0 || !mp_init_irq_at_boot(ioapic, irq)) - continue; - - desc = irq_to_desc(irq); - raw_spin_lock_irq(&desc->lock); - idata = irq_desc_get_irq_data(desc); - - /* - * Honour affinities which have been set in early boot - */ - if (!irqd_can_balance(idata) || irqd_affinity_was_set(idata)) - mask = irq_data_get_affinity_mask(idata); - else - mask = apic->target_cpus(); - - chip = irq_data_get_irq_chip(idata); - /* Might be lapic_chip for irq 0 */ - if (chip->irq_set_affinity) - chip->irq_set_affinity(idata, mask, false); - raw_spin_unlock_irq(&desc->lock); - } -} -#endif - #define IOAPIC_RESOURCE_NAME_SIZE 11 static struct resource *ioapic_resources; @@ -2978,17 +2987,15 @@ void mp_irqdomain_free(struct irq_domain *domain, unsigned int virq, irq_domain_free_irqs_top(domain, virq, nr_irqs); } -void mp_irqdomain_activate(struct irq_domain *domain, - struct irq_data *irq_data) +int mp_irqdomain_activate(struct irq_domain *domain, + struct irq_data *irq_data, bool reserve) { unsigned long flags; - struct irq_pin_list *entry; - struct mp_chip_data *data = irq_data->chip_data; raw_spin_lock_irqsave(&ioapic_lock, flags); - for_each_irq_pin(entry, data->irq_2_pin) - __ioapic_write_entry(entry->apic, entry->pin, data->entry); + ioapic_configure_entry(irq_data); raw_spin_unlock_irqrestore(&ioapic_lock, flags); + return 0; } void mp_irqdomain_deactivate(struct irq_domain *domain, diff --git a/arch/x86/kernel/apic/msi.c b/arch/x86/kernel/apic/msi.c index 9b18be764422..ce503c99f5c4 100644 --- a/arch/x86/kernel/apic/msi.c +++ b/arch/x86/kernel/apic/msi.c @@ -39,17 +39,13 @@ static void irq_msi_compose_msg(struct irq_data *data, struct msi_msg *msg) ((apic->irq_dest_mode == 0) ? MSI_ADDR_DEST_MODE_PHYSICAL : MSI_ADDR_DEST_MODE_LOGICAL) | - ((apic->irq_delivery_mode != dest_LowestPrio) ? - MSI_ADDR_REDIRECTION_CPU : - MSI_ADDR_REDIRECTION_LOWPRI) | + MSI_ADDR_REDIRECTION_CPU | MSI_ADDR_DEST_ID(cfg->dest_apicid); msg->data = MSI_DATA_TRIGGER_EDGE | MSI_DATA_LEVEL_ASSERT | - ((apic->irq_delivery_mode != dest_LowestPrio) ? - MSI_DATA_DELIVERY_FIXED : - MSI_DATA_DELIVERY_LOWPRI) | + MSI_DATA_DELIVERY_FIXED | MSI_DATA_VECTOR(cfg->vector); } diff --git a/arch/x86/kernel/apic/probe_32.c b/arch/x86/kernel/apic/probe_32.c index 63287659adb6..02e8acb134f8 100644 --- a/arch/x86/kernel/apic/probe_32.c +++ b/arch/x86/kernel/apic/probe_32.c @@ -66,6 +66,31 @@ static void setup_apic_flat_routing(void) #endif } +static int default_apic_id_registered(void) +{ + return physid_isset(read_apic_id(), phys_cpu_present_map); +} + +/* + * Set up the logical destination ID. Intel recommends to set DFR, LDR and + * TPR before enabling an APIC. See e.g. "AP-388 82489DX User's Manual" + * (Intel document number 292116). + */ +static void default_init_apic_ldr(void) +{ + unsigned long val; + + apic_write(APIC_DFR, APIC_DFR_VALUE); + val = apic_read(APIC_LDR) & ~APIC_LDR_MASK; + val |= SET_APIC_LOGICAL_ID(1UL << smp_processor_id()); + apic_write(APIC_LDR, val); +} + +static int default_phys_pkg_id(int cpuid_apic, int index_msb) +{ + return cpuid_apic >> index_msb; +} + /* should be called last. 
*/ static int probe_default(void) { @@ -80,16 +105,14 @@ static struct apic apic_default __ro_after_init = { .apic_id_valid = default_apic_id_valid, .apic_id_registered = default_apic_id_registered, - .irq_delivery_mode = dest_LowestPrio, + .irq_delivery_mode = dest_Fixed, /* logical delivery broadcast to all CPUs: */ .irq_dest_mode = 1, - .target_cpus = default_target_cpus, .disable_esr = 0, .dest_logical = APIC_DEST_LOGICAL, .check_apicid_used = default_check_apicid_used, - .vector_allocation_domain = flat_vector_allocation_domain, .init_apic_ldr = default_init_apic_ldr, .ioapic_phys_id_map = default_ioapic_phys_id_map, @@ -102,7 +125,7 @@ static struct apic apic_default __ro_after_init = { .get_apic_id = default_get_apic_id, .set_apic_id = NULL, - .cpu_mask_to_apicid = flat_cpu_mask_to_apicid, + .calc_dest_apicid = apic_flat_calc_apicid, .send_IPI = default_send_IPI_single, .send_IPI_mask = default_send_IPI_mask_logical, diff --git a/arch/x86/kernel/apic/vector.c b/arch/x86/kernel/apic/vector.c index 88c214e75a6b..bb6f7a2148d7 100644 --- a/arch/x86/kernel/apic/vector.c +++ b/arch/x86/kernel/apic/vector.c @@ -1,5 +1,5 @@ /* - * Local APIC related interfaces to support IOAPIC, MSI, HT_IRQ etc. + * Local APIC related interfaces to support IOAPIC, MSI, etc. * * Copyright (C) 1997, 1998, 1999, 2000, 2009 Ingo Molnar, Hajnalka Szabo * Moved from arch/x86/kernel/apic/io_apic.c. @@ -11,6 +11,7 @@ * published by the Free Software Foundation. */ #include <linux/interrupt.h> +#include <linux/seq_file.h> #include <linux/init.h> #include <linux/compiler.h> #include <linux/slab.h> @@ -21,20 +22,30 @@ #include <asm/desc.h> #include <asm/irq_remapping.h> +#include <asm/trace/irq_vectors.h> + struct apic_chip_data { - struct irq_cfg cfg; - cpumask_var_t domain; - cpumask_var_t old_domain; - u8 move_in_progress : 1; + struct irq_cfg hw_irq_cfg; + unsigned int vector; + unsigned int prev_vector; + unsigned int cpu; + unsigned int prev_cpu; + unsigned int irq; + struct hlist_node clist; + unsigned int move_in_progress : 1, + is_managed : 1, + can_reserve : 1, + has_reserved : 1; }; struct irq_domain *x86_vector_domain; EXPORT_SYMBOL_GPL(x86_vector_domain); static DEFINE_RAW_SPINLOCK(vector_lock); -static cpumask_var_t vector_cpumask, vector_searchmask, searched_cpumask; +static cpumask_var_t vector_searchmask; static struct irq_chip lapic_controller; -#ifdef CONFIG_X86_IO_APIC -static struct apic_chip_data *legacy_irq_data[NR_IRQS_LEGACY]; +static struct irq_matrix *vector_matrix; +#ifdef CONFIG_SMP +static DEFINE_PER_CPU(struct hlist_head, cleanup_list); #endif void lock_vector_lock(void) @@ -50,22 +61,37 @@ void unlock_vector_lock(void) raw_spin_unlock(&vector_lock); } -static struct apic_chip_data *apic_chip_data(struct irq_data *irq_data) +void init_irq_alloc_info(struct irq_alloc_info *info, + const struct cpumask *mask) +{ + memset(info, 0, sizeof(*info)); + info->mask = mask; +} + +void copy_irq_alloc_info(struct irq_alloc_info *dst, struct irq_alloc_info *src) { - if (!irq_data) + if (src) + *dst = *src; + else + memset(dst, 0, sizeof(*dst)); +} + +static struct apic_chip_data *apic_chip_data(struct irq_data *irqd) +{ + if (!irqd) return NULL; - while (irq_data->parent_data) - irq_data = irq_data->parent_data; + while (irqd->parent_data) + irqd = irqd->parent_data; - return irq_data->chip_data; + return irqd->chip_data; } -struct irq_cfg *irqd_cfg(struct irq_data *irq_data) +struct irq_cfg *irqd_cfg(struct irq_data *irqd) { - struct apic_chip_data *data = apic_chip_data(irq_data); + struct 
apic_chip_data *apicd = apic_chip_data(irqd); - return data ? &data->cfg : NULL; + return apicd ? &apicd->hw_irq_cfg : NULL; } EXPORT_SYMBOL_GPL(irqd_cfg); @@ -76,270 +102,426 @@ struct irq_cfg *irq_cfg(unsigned int irq) static struct apic_chip_data *alloc_apic_chip_data(int node) { - struct apic_chip_data *data; + struct apic_chip_data *apicd; - data = kzalloc_node(sizeof(*data), GFP_KERNEL, node); - if (!data) - return NULL; - if (!zalloc_cpumask_var_node(&data->domain, GFP_KERNEL, node)) - goto out_data; - if (!zalloc_cpumask_var_node(&data->old_domain, GFP_KERNEL, node)) - goto out_domain; - return data; -out_domain: - free_cpumask_var(data->domain); -out_data: - kfree(data); - return NULL; -} - -static void free_apic_chip_data(struct apic_chip_data *data) -{ - if (data) { - free_cpumask_var(data->domain); - free_cpumask_var(data->old_domain); - kfree(data); - } + apicd = kzalloc_node(sizeof(*apicd), GFP_KERNEL, node); + if (apicd) + INIT_HLIST_NODE(&apicd->clist); + return apicd; } -static int __assign_irq_vector(int irq, struct apic_chip_data *d, - const struct cpumask *mask, - struct irq_data *irqdata) +static void free_apic_chip_data(struct apic_chip_data *apicd) { + kfree(apicd); +} + +static void apic_update_irq_cfg(struct irq_data *irqd, unsigned int vector, + unsigned int cpu) +{ + struct apic_chip_data *apicd = apic_chip_data(irqd); + + lockdep_assert_held(&vector_lock); + + apicd->hw_irq_cfg.vector = vector; + apicd->hw_irq_cfg.dest_apicid = apic->calc_dest_apicid(cpu); + irq_data_update_effective_affinity(irqd, cpumask_of(cpu)); + trace_vector_config(irqd->irq, vector, cpu, + apicd->hw_irq_cfg.dest_apicid); +} + +static void apic_update_vector(struct irq_data *irqd, unsigned int newvec, + unsigned int newcpu) +{ + struct apic_chip_data *apicd = apic_chip_data(irqd); + struct irq_desc *desc = irq_data_to_desc(irqd); + bool managed = irqd_affinity_is_managed(irqd); + + lockdep_assert_held(&vector_lock); + + trace_vector_update(irqd->irq, newvec, newcpu, apicd->vector, + apicd->cpu); + /* - * NOTE! The local APIC isn't very good at handling - * multiple interrupts at the same interrupt level. - * As the interrupt level is determined by taking the - * vector number and shifting that right by 4, we - * want to spread these out a bit so that they don't - * all fall in the same interrupt level. - * - * Also, we've got to be careful not to trash gate - * 0x80, because int 0x80 is hm, kind of importantish. ;) + * If there is no vector associated or if the associated vector is + * the shutdown vector, which is associated to make PCI/MSI + * shutdown mode work, then there is nothing to release. Clear out + * prev_vector for this and the offlined target case. */ - static int current_vector = FIRST_EXTERNAL_VECTOR + VECTOR_OFFSET_START; - static int current_offset = VECTOR_OFFSET_START % 16; - int cpu, vector; - + apicd->prev_vector = 0; + if (!apicd->vector || apicd->vector == MANAGED_IRQ_SHUTDOWN_VECTOR) + goto setnew; /* - * If there is still a move in progress or the previous move has not - * been cleaned up completely, tell the caller to come back later. + * If the target CPU of the previous vector is online, then mark + * the vector as move in progress and store it for cleanup when the + * first interrupt on the new vector arrives. If the target CPU is + * offline then the regular release mechanism via the cleanup + * vector is not possible and the vector can be immediately freed + * in the underlying matrix allocator. 
*/ - if (d->move_in_progress || - cpumask_intersects(d->old_domain, cpu_online_mask)) - return -EBUSY; + if (cpu_online(apicd->cpu)) { + apicd->move_in_progress = true; + apicd->prev_vector = apicd->vector; + apicd->prev_cpu = apicd->cpu; + } else { + irq_matrix_free(vector_matrix, apicd->cpu, apicd->vector, + managed); + } + +setnew: + apicd->vector = newvec; + apicd->cpu = newcpu; + BUG_ON(!IS_ERR_OR_NULL(per_cpu(vector_irq, newcpu)[newvec])); + per_cpu(vector_irq, newcpu)[newvec] = desc; +} - /* Only try and allocate irqs on cpus that are present */ - cpumask_clear(d->old_domain); - cpumask_clear(searched_cpumask); - cpu = cpumask_first_and(mask, cpu_online_mask); - while (cpu < nr_cpu_ids) { - int new_cpu, offset; +static void vector_assign_managed_shutdown(struct irq_data *irqd) +{ + unsigned int cpu = cpumask_first(cpu_online_mask); - /* Get the possible target cpus for @mask/@cpu from the apic */ - apic->vector_allocation_domain(cpu, vector_cpumask, mask); + apic_update_irq_cfg(irqd, MANAGED_IRQ_SHUTDOWN_VECTOR, cpu); +} - /* - * Clear the offline cpus from @vector_cpumask for searching - * and verify whether the result overlaps with @mask. If true, - * then the call to apic->cpu_mask_to_apicid() will - * succeed as well. If not, no point in trying to find a - * vector in this mask. - */ - cpumask_and(vector_searchmask, vector_cpumask, cpu_online_mask); - if (!cpumask_intersects(vector_searchmask, mask)) - goto next_cpu; - - if (cpumask_subset(vector_cpumask, d->domain)) { - if (cpumask_equal(vector_cpumask, d->domain)) - goto success; - /* - * Mark the cpus which are not longer in the mask for - * cleanup. - */ - cpumask_andnot(d->old_domain, d->domain, vector_cpumask); - vector = d->cfg.vector; - goto update; - } +static int reserve_managed_vector(struct irq_data *irqd) +{ + const struct cpumask *affmsk = irq_data_get_affinity_mask(irqd); + struct apic_chip_data *apicd = apic_chip_data(irqd); + unsigned long flags; + int ret; - vector = current_vector; - offset = current_offset; -next: - vector += 16; - if (vector >= FIRST_SYSTEM_VECTOR) { - offset = (offset + 1) % 16; - vector = FIRST_EXTERNAL_VECTOR + offset; - } + raw_spin_lock_irqsave(&vector_lock, flags); + apicd->is_managed = true; + ret = irq_matrix_reserve_managed(vector_matrix, affmsk); + raw_spin_unlock_irqrestore(&vector_lock, flags); + trace_vector_reserve_managed(irqd->irq, ret); + return ret; +} - /* If the search wrapped around, try the next cpu */ - if (unlikely(current_vector == vector)) - goto next_cpu; +static void reserve_irq_vector_locked(struct irq_data *irqd) +{ + struct apic_chip_data *apicd = apic_chip_data(irqd); + + irq_matrix_reserve(vector_matrix); + apicd->can_reserve = true; + apicd->has_reserved = true; + irqd_set_can_reserve(irqd); + trace_vector_reserve(irqd->irq, 0); + vector_assign_managed_shutdown(irqd); +} - if (test_bit(vector, used_vectors)) - goto next; +static int reserve_irq_vector(struct irq_data *irqd) +{ + unsigned long flags; - for_each_cpu(new_cpu, vector_searchmask) { - if (!IS_ERR_OR_NULL(per_cpu(vector_irq, new_cpu)[vector])) - goto next; - } - /* Found one! 
*/ - current_vector = vector; - current_offset = offset; - /* Schedule the old vector for cleanup on all cpus */ - if (d->cfg.vector) - cpumask_copy(d->old_domain, d->domain); - for_each_cpu(new_cpu, vector_searchmask) - per_cpu(vector_irq, new_cpu)[vector] = irq_to_desc(irq); - goto update; - -next_cpu: - /* - * We exclude the current @vector_cpumask from the requested - * @mask and try again with the next online cpu in the - * result. We cannot modify @mask, so we use @vector_cpumask - * as a temporary buffer here as it will be reassigned when - * calling apic->vector_allocation_domain() above. - */ - cpumask_or(searched_cpumask, searched_cpumask, vector_cpumask); - cpumask_andnot(vector_cpumask, mask, searched_cpumask); - cpu = cpumask_first_and(vector_cpumask, cpu_online_mask); - continue; - } - return -ENOSPC; + raw_spin_lock_irqsave(&vector_lock, flags); + reserve_irq_vector_locked(irqd); + raw_spin_unlock_irqrestore(&vector_lock, flags); + return 0; +} + +static int allocate_vector(struct irq_data *irqd, const struct cpumask *dest) +{ + struct apic_chip_data *apicd = apic_chip_data(irqd); + bool resvd = apicd->has_reserved; + unsigned int cpu = apicd->cpu; + int vector = apicd->vector; + + lockdep_assert_held(&vector_lock); -update: /* - * Exclude offline cpus from the cleanup mask and set the - * move_in_progress flag when the result is not empty. + * If the current target CPU is online and in the new requested + * affinity mask, there is no point in moving the interrupt from + * one CPU to another. */ - cpumask_and(d->old_domain, d->old_domain, cpu_online_mask); - d->move_in_progress = !cpumask_empty(d->old_domain); - d->cfg.old_vector = d->move_in_progress ? d->cfg.vector : 0; - d->cfg.vector = vector; - cpumask_copy(d->domain, vector_cpumask); -success: - /* - * Cache destination APIC IDs into cfg->dest_apicid. This cannot fail - * as we already established, that mask & d->domain & cpu_online_mask - * is not empty. - * - * vector_searchmask is a subset of d->domain and has the offline - * cpus masked out. 
- */ - cpumask_and(vector_searchmask, vector_searchmask, mask); - BUG_ON(apic->cpu_mask_to_apicid(vector_searchmask, irqdata, - &d->cfg.dest_apicid)); + if (vector && cpu_online(cpu) && cpumask_test_cpu(cpu, dest)) + return 0; + + vector = irq_matrix_alloc(vector_matrix, dest, resvd, &cpu); + if (vector > 0) + apic_update_vector(irqd, vector, cpu); + trace_vector_alloc(irqd->irq, vector, resvd, vector); + return vector; +} + +static int assign_vector_locked(struct irq_data *irqd, + const struct cpumask *dest) +{ + struct apic_chip_data *apicd = apic_chip_data(irqd); + int vector = allocate_vector(irqd, dest); + + if (vector < 0) + return vector; + + apic_update_irq_cfg(irqd, apicd->vector, apicd->cpu); return 0; } -static int assign_irq_vector(int irq, struct apic_chip_data *data, - const struct cpumask *mask, - struct irq_data *irqdata) +static int assign_irq_vector(struct irq_data *irqd, const struct cpumask *dest) { - int err; unsigned long flags; + int ret; raw_spin_lock_irqsave(&vector_lock, flags); - err = __assign_irq_vector(irq, data, mask, irqdata); + cpumask_and(vector_searchmask, dest, cpu_online_mask); + ret = assign_vector_locked(irqd, vector_searchmask); raw_spin_unlock_irqrestore(&vector_lock, flags); - return err; + return ret; } -static int assign_irq_vector_policy(int irq, int node, - struct apic_chip_data *data, - struct irq_alloc_info *info, - struct irq_data *irqdata) +static int assign_irq_vector_any_locked(struct irq_data *irqd) { - if (info && info->mask) - return assign_irq_vector(irq, data, info->mask, irqdata); - if (node != NUMA_NO_NODE && - assign_irq_vector(irq, data, cpumask_of_node(node), irqdata) == 0) + /* Get the affinity mask - either irq_default_affinity or (user) set */ + const struct cpumask *affmsk = irq_data_get_affinity_mask(irqd); + int node = irq_data_get_node(irqd); + + if (node == NUMA_NO_NODE) + goto all; + /* Try the intersection of @affmsk and node mask */ + cpumask_and(vector_searchmask, cpumask_of_node(node), affmsk); + if (!assign_vector_locked(irqd, vector_searchmask)) + return 0; + /* Try the node mask */ + if (!assign_vector_locked(irqd, cpumask_of_node(node))) + return 0; +all: + /* Try the full affinity mask */ + cpumask_and(vector_searchmask, affmsk, cpu_online_mask); + if (!assign_vector_locked(irqd, vector_searchmask)) return 0; - return assign_irq_vector(irq, data, apic->target_cpus(), irqdata); + /* Try the full online mask */ + return assign_vector_locked(irqd, cpu_online_mask); +} + +static int +assign_irq_vector_policy(struct irq_data *irqd, struct irq_alloc_info *info) +{ + if (irqd_affinity_is_managed(irqd)) + return reserve_managed_vector(irqd); + if (info->mask) + return assign_irq_vector(irqd, info->mask); + /* + * Make only a global reservation with no guarantee. A real vector + * is associated at activation time. 
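For intuition, the new matrix allocator replaces the old per-IRQ cpumask/domain search with what is conceptually one bitmap of vectors per CPU: an allocation scans the requested CPUs for a free slot in the external-vector range and claims it. A minimal standalone sketch of that idea follows; it is a simplified model with illustrative constants, not the kernel's irq_matrix API.

#include <stdbool.h>
#include <stdio.h>

#define NR_CPUS          4
#define NR_VECTORS       256
#define FIRST_EXT_VECTOR 0x20	/* illustrative: below this live exception vectors */
#define FIRST_SYS_VECTOR 0xec	/* illustrative: system vectors occupy the top     */

static bool vector_used[NR_CPUS][NR_VECTORS];

/* Find a free (cpu, vector) pair among the requested CPUs and claim it. */
static int alloc_vector(const bool *cpus, unsigned int *out_cpu)
{
	for (unsigned int cpu = 0; cpu < NR_CPUS; cpu++) {
		if (!cpus[cpu])
			continue;
		for (int vec = FIRST_EXT_VECTOR; vec < FIRST_SYS_VECTOR; vec++) {
			if (!vector_used[cpu][vec]) {
				vector_used[cpu][vec] = true;
				*out_cpu = cpu;
				return vec;
			}
		}
	}
	return -1;	/* no space: roughly the -ENOSPC case */
}

int main(void)
{
	bool dest[NR_CPUS] = { false, true, true, false };
	unsigned int cpu;
	int vec = alloc_vector(dest, &cpu);

	if (vec >= 0)
		printf("allocated vector 0x%x on cpu %u\n", (unsigned int)vec, cpu);
	return 0;
}

The real allocator additionally tracks reserved and managed vectors and per-CPU availability, which is the accounting later queried by lapic_can_unplug_cpu().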
+ */ + return reserve_irq_vector(irqd); } -static void clear_irq_vector(int irq, struct apic_chip_data *data) +static int +assign_managed_vector(struct irq_data *irqd, const struct cpumask *dest) { - struct irq_desc *desc; - int cpu, vector; + const struct cpumask *affmsk = irq_data_get_affinity_mask(irqd); + struct apic_chip_data *apicd = apic_chip_data(irqd); + int vector, cpu; - if (!data->cfg.vector) + cpumask_and(vector_searchmask, vector_searchmask, affmsk); + cpu = cpumask_first(vector_searchmask); + if (cpu >= nr_cpu_ids) + return -EINVAL; + /* set_affinity might call here for nothing */ + if (apicd->vector && cpumask_test_cpu(apicd->cpu, vector_searchmask)) + return 0; + vector = irq_matrix_alloc_managed(vector_matrix, cpu); + trace_vector_alloc_managed(irqd->irq, vector, vector); + if (vector < 0) + return vector; + apic_update_vector(irqd, vector, cpu); + apic_update_irq_cfg(irqd, vector, cpu); + return 0; +} + +static void clear_irq_vector(struct irq_data *irqd) +{ + struct apic_chip_data *apicd = apic_chip_data(irqd); + bool managed = irqd_affinity_is_managed(irqd); + unsigned int vector = apicd->vector; + + lockdep_assert_held(&vector_lock); + + if (!vector) return; - vector = data->cfg.vector; - for_each_cpu_and(cpu, data->domain, cpu_online_mask) - per_cpu(vector_irq, cpu)[vector] = VECTOR_UNUSED; + trace_vector_clear(irqd->irq, vector, apicd->cpu, apicd->prev_vector, + apicd->prev_cpu); - data->cfg.vector = 0; - cpumask_clear(data->domain); + per_cpu(vector_irq, apicd->cpu)[vector] = VECTOR_UNUSED; + irq_matrix_free(vector_matrix, apicd->cpu, vector, managed); + apicd->vector = 0; - /* - * If move is in progress or the old_domain mask is not empty, - * i.e. the cleanup IPI has not been processed yet, we need to remove - * the old references to desc from all cpus vector tables. - */ - if (!data->move_in_progress && cpumask_empty(data->old_domain)) + /* Clean up move in progress */ + vector = apicd->prev_vector; + if (!vector) return; - desc = irq_to_desc(irq); - for_each_cpu_and(cpu, data->old_domain, cpu_online_mask) { - for (vector = FIRST_EXTERNAL_VECTOR; vector < NR_VECTORS; - vector++) { - if (per_cpu(vector_irq, cpu)[vector] != desc) - continue; - per_cpu(vector_irq, cpu)[vector] = VECTOR_UNUSED; - break; - } + per_cpu(vector_irq, apicd->prev_cpu)[vector] = VECTOR_UNUSED; + irq_matrix_free(vector_matrix, apicd->prev_cpu, vector, managed); + apicd->prev_vector = 0; + apicd->move_in_progress = 0; + hlist_del_init(&apicd->clist); +} + +static void x86_vector_deactivate(struct irq_domain *dom, struct irq_data *irqd) +{ + struct apic_chip_data *apicd = apic_chip_data(irqd); + unsigned long flags; + + trace_vector_deactivate(irqd->irq, apicd->is_managed, + apicd->can_reserve, false); + + /* Regular fixed assigned interrupt */ + if (!apicd->is_managed && !apicd->can_reserve) + return; + /* If the interrupt has a global reservation, nothing to do */ + if (apicd->has_reserved) + return; + + raw_spin_lock_irqsave(&vector_lock, flags); + clear_irq_vector(irqd); + if (apicd->can_reserve) + reserve_irq_vector_locked(irqd); + else + vector_assign_managed_shutdown(irqd); + raw_spin_unlock_irqrestore(&vector_lock, flags); +} + +static int activate_reserved(struct irq_data *irqd) +{ + struct apic_chip_data *apicd = apic_chip_data(irqd); + int ret; + + ret = assign_irq_vector_any_locked(irqd); + if (!ret) { + apicd->has_reserved = false; + /* + * Core might have disabled reservation mode after + * allocating the irq descriptor. 
Ideally this should + * happen before allocation time, but that would require + * completely convoluted ways of transporting that + * information. + */ + if (!irqd_can_reserve(irqd)) + apicd->can_reserve = false; } - data->move_in_progress = 0; + return ret; } -void init_irq_alloc_info(struct irq_alloc_info *info, - const struct cpumask *mask) +static int activate_managed(struct irq_data *irqd) { - memset(info, 0, sizeof(*info)); - info->mask = mask; + const struct cpumask *dest = irq_data_get_affinity_mask(irqd); + int ret; + + cpumask_and(vector_searchmask, dest, cpu_online_mask); + if (WARN_ON_ONCE(cpumask_empty(vector_searchmask))) { + /* Something in the core code broke! Survive gracefully */ + pr_err("Managed startup for irq %u, but no CPU\n", irqd->irq); + return EINVAL; + } + + ret = assign_managed_vector(irqd, vector_searchmask); + /* + * This should not happen. The vector reservation got buggered. Handle + * it gracefully. + */ + if (WARN_ON_ONCE(ret < 0)) { + pr_err("Managed startup irq %u, no vector available\n", + irqd->irq); + } + return ret; } -void copy_irq_alloc_info(struct irq_alloc_info *dst, struct irq_alloc_info *src) +static int x86_vector_activate(struct irq_domain *dom, struct irq_data *irqd, + bool reserve) { - if (src) - *dst = *src; - else - memset(dst, 0, sizeof(*dst)); + struct apic_chip_data *apicd = apic_chip_data(irqd); + unsigned long flags; + int ret = 0; + + trace_vector_activate(irqd->irq, apicd->is_managed, + apicd->can_reserve, reserve); + + /* Nothing to do for fixed assigned vectors */ + if (!apicd->can_reserve && !apicd->is_managed) + return 0; + + raw_spin_lock_irqsave(&vector_lock, flags); + if (reserve || irqd_is_managed_and_shutdown(irqd)) + vector_assign_managed_shutdown(irqd); + else if (apicd->is_managed) + ret = activate_managed(irqd); + else if (apicd->has_reserved) + ret = activate_reserved(irqd); + raw_spin_unlock_irqrestore(&vector_lock, flags); + return ret; +} + +static void vector_free_reserved_and_managed(struct irq_data *irqd) +{ + const struct cpumask *dest = irq_data_get_affinity_mask(irqd); + struct apic_chip_data *apicd = apic_chip_data(irqd); + + trace_vector_teardown(irqd->irq, apicd->is_managed, + apicd->has_reserved); + + if (apicd->has_reserved) + irq_matrix_remove_reserved(vector_matrix); + if (apicd->is_managed) + irq_matrix_remove_managed(vector_matrix, dest); } static void x86_vector_free_irqs(struct irq_domain *domain, unsigned int virq, unsigned int nr_irqs) { - struct apic_chip_data *apic_data; - struct irq_data *irq_data; + struct apic_chip_data *apicd; + struct irq_data *irqd; unsigned long flags; int i; for (i = 0; i < nr_irqs; i++) { - irq_data = irq_domain_get_irq_data(x86_vector_domain, virq + i); - if (irq_data && irq_data->chip_data) { + irqd = irq_domain_get_irq_data(x86_vector_domain, virq + i); + if (irqd && irqd->chip_data) { raw_spin_lock_irqsave(&vector_lock, flags); - clear_irq_vector(virq + i, irq_data->chip_data); - apic_data = irq_data->chip_data; - irq_domain_reset_irq_data(irq_data); + clear_irq_vector(irqd); + vector_free_reserved_and_managed(irqd); + apicd = irqd->chip_data; + irq_domain_reset_irq_data(irqd); raw_spin_unlock_irqrestore(&vector_lock, flags); - free_apic_chip_data(apic_data); -#ifdef CONFIG_X86_IO_APIC - if (virq + i < nr_legacy_irqs()) - legacy_irq_data[virq + i] = NULL; -#endif + free_apic_chip_data(apicd); } } } +static bool vector_configure_legacy(unsigned int virq, struct irq_data *irqd, + struct apic_chip_data *apicd) +{ + unsigned long flags; + bool realloc = false; + + 
apicd->vector = ISA_IRQ_VECTOR(virq); + apicd->cpu = 0; + + raw_spin_lock_irqsave(&vector_lock, flags); + /* + * If the interrupt is activated, then it must stay at this vector + * position. That's usually the timer interrupt (0). + */ + if (irqd_is_activated(irqd)) { + trace_vector_setup(virq, true, 0); + apic_update_irq_cfg(irqd, apicd->vector, apicd->cpu); + } else { + /* Release the vector */ + apicd->can_reserve = true; + irqd_set_can_reserve(irqd); + clear_irq_vector(irqd); + realloc = true; + } + raw_spin_unlock_irqrestore(&vector_lock, flags); + return realloc; +} + static int x86_vector_alloc_irqs(struct irq_domain *domain, unsigned int virq, unsigned int nr_irqs, void *arg) { struct irq_alloc_info *info = arg; - struct apic_chip_data *data; - struct irq_data *irq_data; + struct apic_chip_data *apicd; + struct irq_data *irqd; int i, err, node; if (disable_apic) @@ -350,46 +532,99 @@ static int x86_vector_alloc_irqs(struct irq_domain *domain, unsigned int virq, return -ENOSYS; for (i = 0; i < nr_irqs; i++) { - irq_data = irq_domain_get_irq_data(domain, virq + i); - BUG_ON(!irq_data); - node = irq_data_get_node(irq_data); -#ifdef CONFIG_X86_IO_APIC - if (virq + i < nr_legacy_irqs() && legacy_irq_data[virq + i]) - data = legacy_irq_data[virq + i]; - else -#endif - data = alloc_apic_chip_data(node); - if (!data) { + irqd = irq_domain_get_irq_data(domain, virq + i); + BUG_ON(!irqd); + node = irq_data_get_node(irqd); + WARN_ON_ONCE(irqd->chip_data); + apicd = alloc_apic_chip_data(node); + if (!apicd) { err = -ENOMEM; goto error; } - irq_data->chip = &lapic_controller; - irq_data->chip_data = data; - irq_data->hwirq = virq + i; - err = assign_irq_vector_policy(virq + i, node, data, info, - irq_data); - if (err) - goto error; + apicd->irq = virq + i; + irqd->chip = &lapic_controller; + irqd->chip_data = apicd; + irqd->hwirq = virq + i; + irqd_set_single_target(irqd); /* - * If the apic destination mode is physical, then the - * effective affinity is restricted to a single target - * CPU. Mark the interrupt accordingly. + * Legacy vectors are already assigned when the IOAPIC + * takes them over. They stay on the same vector. This is + * required for check_timer() to work correctly as it might + * switch back to legacy mode. Only update the hardware + * config. 
*/ - if (!apic->irq_dest_mode) - irqd_set_single_target(irq_data); + if (info->flags & X86_IRQ_ALLOC_LEGACY) { + if (!vector_configure_legacy(virq + i, irqd, apicd)) + continue; + } + + err = assign_irq_vector_policy(irqd, info); + trace_vector_setup(virq + i, false, err); + if (err) { + irqd->chip_data = NULL; + free_apic_chip_data(apicd); + goto error; + } } return 0; error: - x86_vector_free_irqs(domain, virq, i + 1); + x86_vector_free_irqs(domain, virq, i); return err; } +#ifdef CONFIG_GENERIC_IRQ_DEBUGFS +static void x86_vector_debug_show(struct seq_file *m, struct irq_domain *d, + struct irq_data *irqd, int ind) +{ + unsigned int cpu, vector, prev_cpu, prev_vector; + struct apic_chip_data *apicd; + unsigned long flags; + int irq; + + if (!irqd) { + irq_matrix_debug_show(m, vector_matrix, ind); + return; + } + + irq = irqd->irq; + if (irq < nr_legacy_irqs() && !test_bit(irq, &io_apic_irqs)) { + seq_printf(m, "%*sVector: %5d\n", ind, "", ISA_IRQ_VECTOR(irq)); + seq_printf(m, "%*sTarget: Legacy PIC all CPUs\n", ind, ""); + return; + } + + apicd = irqd->chip_data; + if (!apicd) { + seq_printf(m, "%*sVector: Not assigned\n", ind, ""); + return; + } + + raw_spin_lock_irqsave(&vector_lock, flags); + cpu = apicd->cpu; + vector = apicd->vector; + prev_cpu = apicd->prev_cpu; + prev_vector = apicd->prev_vector; + raw_spin_unlock_irqrestore(&vector_lock, flags); + seq_printf(m, "%*sVector: %5u\n", ind, "", vector); + seq_printf(m, "%*sTarget: %5u\n", ind, "", cpu); + if (prev_vector) { + seq_printf(m, "%*sPrevious vector: %5u\n", ind, "", prev_vector); + seq_printf(m, "%*sPrevious target: %5u\n", ind, "", prev_cpu); + } +} +#endif + static const struct irq_domain_ops x86_vector_domain_ops = { - .alloc = x86_vector_alloc_irqs, - .free = x86_vector_free_irqs, + .alloc = x86_vector_alloc_irqs, + .free = x86_vector_free_irqs, + .activate = x86_vector_activate, + .deactivate = x86_vector_deactivate, +#ifdef CONFIG_GENERIC_IRQ_DEBUGFS + .debug_show = x86_vector_debug_show, +#endif }; int __init arch_probe_nr_irqs(void) @@ -400,7 +635,7 @@ int __init arch_probe_nr_irqs(void) nr_irqs = NR_VECTORS * nr_cpu_ids; nr = (gsi_top + nr_legacy_irqs()) + 8 * nr_cpu_ids; -#if defined(CONFIG_PCI_MSI) || defined(CONFIG_HT_IRQ) +#if defined(CONFIG_PCI_MSI) /* * for MSI and HT dyn irq */ @@ -419,35 +654,40 @@ int __init arch_probe_nr_irqs(void) return legacy_pic->probe(); } -#ifdef CONFIG_X86_IO_APIC -static void __init init_legacy_irqs(void) +void lapic_assign_legacy_vector(unsigned int irq, bool replace) { - int i, node = cpu_to_node(0); - struct apic_chip_data *data; - /* - * For legacy IRQ's, start with assigning irq0 to irq15 to - * ISA_IRQ_VECTOR(i) for all cpu's. + * Use assign system here so it wont get accounted as allocated + * and moveable in the cpu hotplug check and it prevents managed + * irq reservation from touching it. 
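The legacy handling above keeps the 16 ISA interrupts on their fixed vectors; the per-CPU setup further down recovers the ISA IRQ number from the vector by subtracting the legacy base. A standalone sketch of that reverse mapping, assuming the conventional layout in which the 16 legacy vectors start at 0x30 (an assumption here, not taken from this patch):

#include <stdio.h>

#define FIRST_LEGACY_VECTOR 0x30	/* assumed base of the 16 ISA vectors */
#define NR_LEGACY_IRQS      16

/* Map a vector back to its ISA IRQ, or -1 if it is not a legacy vector. */
static int vector_to_isa_irq(unsigned int vector)
{
	int isairq = (int)vector - FIRST_LEGACY_VECTOR;

	return (isairq >= 0 && isairq < NR_LEGACY_IRQS) ? isairq : -1;
}

int main(void)
{
	printf("vector 0x31 -> ISA irq %d\n", vector_to_isa_irq(0x31));	/* 1  */
	printf("vector 0x61 -> ISA irq %d\n", vector_to_isa_irq(0x61));	/* -1 */
	return 0;
}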
*/ - for (i = 0; i < nr_legacy_irqs(); i++) { - data = legacy_irq_data[i] = alloc_apic_chip_data(node); - BUG_ON(!data); + irq_matrix_assign_system(vector_matrix, ISA_IRQ_VECTOR(irq), replace); +} + +void __init lapic_assign_system_vectors(void) +{ + unsigned int i, vector = 0; + + for_each_set_bit_from(vector, system_vectors, NR_VECTORS) + irq_matrix_assign_system(vector_matrix, vector, false); + + if (nr_legacy_irqs() > 1) + lapic_assign_legacy_vector(PIC_CASCADE_IR, false); + + /* System vectors are reserved, online it */ + irq_matrix_online(vector_matrix); - data->cfg.vector = ISA_IRQ_VECTOR(i); - cpumask_setall(data->domain); - irq_set_chip_data(i, data); + /* Mark the preallocated legacy interrupts */ + for (i = 0; i < nr_legacy_irqs(); i++) { + if (i != PIC_CASCADE_IR) + irq_matrix_assign(vector_matrix, ISA_IRQ_VECTOR(i)); } } -#else -static inline void init_legacy_irqs(void) { } -#endif int __init arch_early_irq_init(void) { struct fwnode_handle *fn; - init_legacy_irqs(); - fn = irq_domain_alloc_named_fwnode("VECTOR"); BUG_ON(!fn); x86_vector_domain = irq_domain_create_tree(fn, &x86_vector_domain_ops, @@ -457,102 +697,116 @@ int __init arch_early_irq_init(void) irq_set_default_host(x86_vector_domain); arch_init_msi_domain(x86_vector_domain); - arch_init_htirq_domain(x86_vector_domain); - BUG_ON(!alloc_cpumask_var(&vector_cpumask, GFP_KERNEL)); BUG_ON(!alloc_cpumask_var(&vector_searchmask, GFP_KERNEL)); - BUG_ON(!alloc_cpumask_var(&searched_cpumask, GFP_KERNEL)); + + /* + * Allocate the vector matrix allocator data structure and limit the + * search area. + */ + vector_matrix = irq_alloc_matrix(NR_VECTORS, FIRST_EXTERNAL_VECTOR, + FIRST_SYSTEM_VECTOR); + BUG_ON(!vector_matrix); return arch_early_ioapic_init(); } -/* Initialize vector_irq on a new cpu */ -static void __setup_vector_irq(int cpu) +#ifdef CONFIG_SMP + +static struct irq_desc *__setup_vector_irq(int vector) { - struct apic_chip_data *data; - struct irq_desc *desc; - int irq, vector; + int isairq = vector - ISA_IRQ_VECTOR(0); + + /* Check whether the irq is in the legacy space */ + if (isairq < 0 || isairq >= nr_legacy_irqs()) + return VECTOR_UNUSED; + /* Check whether the irq is handled by the IOAPIC */ + if (test_bit(isairq, &io_apic_irqs)) + return VECTOR_UNUSED; + return irq_to_desc(isairq); +} - /* Mark the inuse vectors */ - for_each_irq_desc(irq, desc) { - struct irq_data *idata = irq_desc_get_irq_data(desc); +/* Online the local APIC infrastructure and initialize the vectors */ +void lapic_online(void) +{ + unsigned int vector; - data = apic_chip_data(idata); - if (!data || !cpumask_test_cpu(cpu, data->domain)) - continue; - vector = data->cfg.vector; - per_cpu(vector_irq, cpu)[vector] = desc; - } - /* Mark the free vectors */ - for (vector = 0; vector < NR_VECTORS; ++vector) { - desc = per_cpu(vector_irq, cpu)[vector]; - if (IS_ERR_OR_NULL(desc)) - continue; + lockdep_assert_held(&vector_lock); - data = apic_chip_data(irq_desc_get_irq_data(desc)); - if (!cpumask_test_cpu(cpu, data->domain)) - per_cpu(vector_irq, cpu)[vector] = VECTOR_UNUSED; - } + /* Online the vector matrix array for this CPU */ + irq_matrix_online(vector_matrix); + + /* + * The interrupt affinity logic never targets interrupts to offline + * CPUs. The exception are the legacy PIC interrupts. In general + * they are only targeted to CPU0, but depending on the platform + * they can be distributed to any online CPU in hardware. The + * kernel has no influence on that. So all active legacy vectors + * must be installed on all CPUs. 
All non legacy interrupts can be + * cleared. + */ + for (vector = 0; vector < NR_VECTORS; vector++) + this_cpu_write(vector_irq[vector], __setup_vector_irq(vector)); } -/* - * Setup the vector to irq mappings. Must be called with vector_lock held. - */ -void setup_vector_irq(int cpu) +void lapic_offline(void) { - int irq; + lock_vector_lock(); + irq_matrix_offline(vector_matrix); + unlock_vector_lock(); +} + +static int apic_set_affinity(struct irq_data *irqd, + const struct cpumask *dest, bool force) +{ + struct apic_chip_data *apicd = apic_chip_data(irqd); + int err; - lockdep_assert_held(&vector_lock); /* - * On most of the platforms, legacy PIC delivers the interrupts on the - * boot cpu. But there are certain platforms where PIC interrupts are - * delivered to multiple cpu's. If the legacy IRQ is handled by the - * legacy PIC, for the new cpu that is coming online, setup the static - * legacy vector to irq mapping: + * Core code can call here for inactive interrupts. For inactive + * interrupts which use managed or reservation mode there is no + * point in going through the vector assignment right now as the + * activation will assign a vector which fits the destination + * cpumask. Let the core code store the destination mask and be + * done with it. */ - for (irq = 0; irq < nr_legacy_irqs(); irq++) - per_cpu(vector_irq, cpu)[ISA_IRQ_VECTOR(irq)] = irq_to_desc(irq); + if (!irqd_is_activated(irqd) && + (apicd->is_managed || apicd->can_reserve)) + return IRQ_SET_MASK_OK; - __setup_vector_irq(cpu); + raw_spin_lock(&vector_lock); + cpumask_and(vector_searchmask, dest, cpu_online_mask); + if (irqd_affinity_is_managed(irqd)) + err = assign_managed_vector(irqd, vector_searchmask); + else + err = assign_vector_locked(irqd, vector_searchmask); + raw_spin_unlock(&vector_lock); + return err ? err : IRQ_SET_MASK_OK; } -static int apic_retrigger_irq(struct irq_data *irq_data) +#else +# define apic_set_affinity NULL +#endif + +static int apic_retrigger_irq(struct irq_data *irqd) { - struct apic_chip_data *data = apic_chip_data(irq_data); + struct apic_chip_data *apicd = apic_chip_data(irqd); unsigned long flags; - int cpu; raw_spin_lock_irqsave(&vector_lock, flags); - cpu = cpumask_first_and(data->domain, cpu_online_mask); - apic->send_IPI_mask(cpumask_of(cpu), data->cfg.vector); + apic->send_IPI(apicd->cpu, apicd->vector); raw_spin_unlock_irqrestore(&vector_lock, flags); return 1; } -void apic_ack_edge(struct irq_data *data) +void apic_ack_edge(struct irq_data *irqd) { - irq_complete_move(irqd_cfg(data)); - irq_move_irq(data); + irq_complete_move(irqd_cfg(irqd)); + irq_move_irq(irqd); ack_APIC_irq(); } -static int apic_set_affinity(struct irq_data *irq_data, - const struct cpumask *dest, bool force) -{ - struct apic_chip_data *data = irq_data->chip_data; - int err, irq = irq_data->irq; - - if (!IS_ENABLED(CONFIG_SMP)) - return -EPERM; - - if (!cpumask_intersects(dest, cpu_online_mask)) - return -EINVAL; - - err = assign_irq_vector(irq, data, dest, irq_data); - return err ? 
err : IRQ_SET_MASK_OK; -} - static struct irq_chip lapic_controller = { .name = "APIC", .irq_ack = apic_ack_edge, @@ -561,115 +815,98 @@ static struct irq_chip lapic_controller = { }; #ifdef CONFIG_SMP -static void __send_cleanup_vector(struct apic_chip_data *data) -{ - raw_spin_lock(&vector_lock); - cpumask_and(data->old_domain, data->old_domain, cpu_online_mask); - data->move_in_progress = 0; - if (!cpumask_empty(data->old_domain)) - apic->send_IPI_mask(data->old_domain, IRQ_MOVE_CLEANUP_VECTOR); - raw_spin_unlock(&vector_lock); -} -void send_cleanup_vector(struct irq_cfg *cfg) +static void free_moved_vector(struct apic_chip_data *apicd) { - struct apic_chip_data *data; + unsigned int vector = apicd->prev_vector; + unsigned int cpu = apicd->prev_cpu; + bool managed = apicd->is_managed; - data = container_of(cfg, struct apic_chip_data, cfg); - if (data->move_in_progress) - __send_cleanup_vector(data); + /* + * This should never happen. Managed interrupts are not + * migrated except on CPU down, which does not involve the + * cleanup vector. But try to keep the accounting correct + * nevertheless. + */ + WARN_ON_ONCE(managed); + + trace_vector_free_moved(apicd->irq, cpu, vector, managed); + irq_matrix_free(vector_matrix, cpu, vector, managed); + per_cpu(vector_irq, cpu)[vector] = VECTOR_UNUSED; + hlist_del_init(&apicd->clist); + apicd->prev_vector = 0; + apicd->move_in_progress = 0; } asmlinkage __visible void __irq_entry smp_irq_move_cleanup_interrupt(void) { - unsigned vector, me; + struct hlist_head *clhead = this_cpu_ptr(&cleanup_list); + struct apic_chip_data *apicd; + struct hlist_node *tmp; entering_ack_irq(); - /* Prevent vectors vanishing under us */ raw_spin_lock(&vector_lock); - me = smp_processor_id(); - for (vector = FIRST_EXTERNAL_VECTOR; vector < NR_VECTORS; vector++) { - struct apic_chip_data *data; - struct irq_desc *desc; - unsigned int irr; - - retry: - desc = __this_cpu_read(vector_irq[vector]); - if (IS_ERR_OR_NULL(desc)) - continue; - - if (!raw_spin_trylock(&desc->lock)) { - raw_spin_unlock(&vector_lock); - cpu_relax(); - raw_spin_lock(&vector_lock); - goto retry; - } - - data = apic_chip_data(irq_desc_get_irq_data(desc)); - if (!data) - goto unlock; - - /* - * Nothing to cleanup if irq migration is in progress - * or this cpu is not set in the cleanup mask. - */ - if (data->move_in_progress || - !cpumask_test_cpu(me, data->old_domain)) - goto unlock; - - /* - * We have two cases to handle here: - * 1) vector is unchanged but the target mask got reduced - * 2) vector and the target mask has changed - * - * #1 is obvious, but in #2 we have two vectors with the same - * irq descriptor: the old and the new vector. So we need to - * make sure that we only cleanup the old vector. The new - * vector has the current @vector number in the config and - * this cpu is part of the target mask. We better leave that - * one alone. - */ - if (vector == data->cfg.vector && - cpumask_test_cpu(me, data->domain)) - goto unlock; + hlist_for_each_entry_safe(apicd, tmp, clhead, clist) { + unsigned int irr, vector = apicd->prev_vector; - irr = apic_read(APIC_IRR + (vector / 32 * 0x10)); /* - * Check if the vector that needs to be cleanedup is - * registered at the cpu's IRR. If so, then this is not - * the best time to clean it up. Lets clean it up in the + * Paranoia: Check if the vector that needs to be cleaned + * up is registered at the APICs IRR. If so, then this is + * not the best time to clean it up. 
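The IRR probe used by this cleanup handler relies on the local APIC register layout: the 256-bit IRR is exposed as eight 32-bit registers spaced 0x10 bytes apart, so vector v lives at offset APIC_IRR + (v / 32) * 0x10, bit v % 32. A small standalone check of that arithmetic (0x200 is the architectural offset of the first IRR word):

#include <assert.h>
#include <stdio.h>

#define APIC_IRR 0x200	/* architectural offset of IRR bits 31:0 */

int main(void)
{
	unsigned int vector = 0x61;	/* example external vector */
	unsigned int reg = APIC_IRR + (vector / 32) * 0x10;
	unsigned int bit = vector % 32;

	/* 0x61 = 97 -> fourth IRR word (offset 0x230), bit 1 */
	assert(reg == 0x230 && bit == 1);
	printf("vector 0x%x -> IRR register 0x%x, bit %u\n", vector, reg, bit);
	return 0;
}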
Clean it up in the * next attempt by sending another IRQ_MOVE_CLEANUP_VECTOR - * to myself. + * to this CPU. IRQ_MOVE_CLEANUP_VECTOR is the lowest + * priority external vector, so on return from this + * interrupt the device interrupt will happen first. */ - if (irr & (1 << (vector % 32))) { + irr = apic_read(APIC_IRR + (vector / 32 * 0x10)); + if (irr & (1U << (vector % 32))) { apic->send_IPI_self(IRQ_MOVE_CLEANUP_VECTOR); - goto unlock; + continue; } - __this_cpu_write(vector_irq[vector], VECTOR_UNUSED); - cpumask_clear_cpu(me, data->old_domain); -unlock: - raw_spin_unlock(&desc->lock); + free_moved_vector(apicd); } raw_spin_unlock(&vector_lock); - exiting_irq(); } +static void __send_cleanup_vector(struct apic_chip_data *apicd) +{ + unsigned int cpu; + + raw_spin_lock(&vector_lock); + apicd->move_in_progress = 0; + cpu = apicd->prev_cpu; + if (cpu_online(cpu)) { + hlist_add_head(&apicd->clist, per_cpu_ptr(&cleanup_list, cpu)); + apic->send_IPI(cpu, IRQ_MOVE_CLEANUP_VECTOR); + } else { + apicd->prev_vector = 0; + } + raw_spin_unlock(&vector_lock); +} + +void send_cleanup_vector(struct irq_cfg *cfg) +{ + struct apic_chip_data *apicd; + + apicd = container_of(cfg, struct apic_chip_data, hw_irq_cfg); + if (apicd->move_in_progress) + __send_cleanup_vector(apicd); +} + static void __irq_complete_move(struct irq_cfg *cfg, unsigned vector) { - unsigned me; - struct apic_chip_data *data; + struct apic_chip_data *apicd; - data = container_of(cfg, struct apic_chip_data, cfg); - if (likely(!data->move_in_progress)) + apicd = container_of(cfg, struct apic_chip_data, hw_irq_cfg); + if (likely(!apicd->move_in_progress)) return; - me = smp_processor_id(); - if (vector == data->cfg.vector && cpumask_test_cpu(me, data->domain)) - __send_cleanup_vector(data); + if (vector == apicd->vector && apicd->cpu == smp_processor_id()) + __send_cleanup_vector(apicd); } void irq_complete_move(struct irq_cfg *cfg) @@ -682,10 +919,9 @@ void irq_complete_move(struct irq_cfg *cfg) */ void irq_force_complete_move(struct irq_desc *desc) { - struct irq_data *irqdata; - struct apic_chip_data *data; - struct irq_cfg *cfg; - unsigned int cpu; + struct apic_chip_data *apicd; + struct irq_data *irqd; + unsigned int vector; /* * The function is called for all descriptors regardless of which @@ -696,43 +932,31 @@ void irq_force_complete_move(struct irq_desc *desc) * Check first that the chip_data is what we expect * (apic_chip_data) before touching it any further. */ - irqdata = irq_domain_get_irq_data(x86_vector_domain, - irq_desc_get_irq(desc)); - if (!irqdata) + irqd = irq_domain_get_irq_data(x86_vector_domain, + irq_desc_get_irq(desc)); + if (!irqd) return; - data = apic_chip_data(irqdata); - cfg = data ? &data->cfg : NULL; + raw_spin_lock(&vector_lock); + apicd = apic_chip_data(irqd); + if (!apicd) + goto unlock; - if (!cfg) - return; + /* + * If prev_vector is empty, no action required. + */ + vector = apicd->prev_vector; + if (!vector) + goto unlock; /* - * This is tricky. If the cleanup of @data->old_domain has not been + * This is tricky. If the cleanup of the old vector has not been * done yet, then the following setaffinity call will fail with * -EBUSY. This can leave the interrupt in a stale state. * * All CPUs are stuck in stop machine with interrupts disabled so * calling __irq_complete_move() would be completely pointless. - */ - raw_spin_lock(&vector_lock); - /* - * Clean out all offline cpus (including the outgoing one) from the - * old_domain mask. 
- */ - cpumask_and(data->old_domain, data->old_domain, cpu_online_mask); - - /* - * If move_in_progress is cleared and the old_domain mask is empty, - * then there is nothing to cleanup. fixup_irqs() will take care of - * the stale vectors on the outgoing cpu. - */ - if (!data->move_in_progress && cpumask_empty(data->old_domain)) { - raw_spin_unlock(&vector_lock); - return; - } - - /* + * * 1) The interrupt is in move_in_progress state. That means that we * have not seen an interrupt since the io_apic was reprogrammed to * the new vector. @@ -740,7 +964,7 @@ void irq_force_complete_move(struct irq_desc *desc) * 2) The interrupt has fired on the new vector, but the cleanup IPIs * have not been processed yet. */ - if (data->move_in_progress) { + if (apicd->move_in_progress) { /* * In theory there is a race: * @@ -774,21 +998,43 @@ void irq_force_complete_move(struct irq_desc *desc) * area arises. */ pr_warn("IRQ fixup: irq %d move in progress, old vector %d\n", - irqdata->irq, cfg->old_vector); + irqd->irq, vector); } - /* - * If old_domain is not empty, then other cpus still have the irq - * descriptor set in their vector array. Clean it up. - */ - for_each_cpu(cpu, data->old_domain) - per_cpu(vector_irq, cpu)[cfg->old_vector] = VECTOR_UNUSED; + free_moved_vector(apicd); +unlock: + raw_spin_unlock(&vector_lock); +} + +#ifdef CONFIG_HOTPLUG_CPU +/* + * Note, this is not accurate accounting, but at least good enough to + * prevent that the actual interrupt move will run out of vectors. + */ +int lapic_can_unplug_cpu(void) +{ + unsigned int rsvd, avl, tomove, cpu = smp_processor_id(); + int ret = 0; - /* Cleanup the left overs of the (half finished) move */ - cpumask_clear(data->old_domain); - data->move_in_progress = 0; + raw_spin_lock(&vector_lock); + tomove = irq_matrix_allocated(vector_matrix); + avl = irq_matrix_available(vector_matrix, true); + if (avl < tomove) { + pr_warn("CPU %u has %u vectors, %u available. Cannot disable CPU\n", + cpu, tomove, avl); + ret = -ENOSPC; + goto out; + } + rsvd = irq_matrix_reserved(vector_matrix); + if (avl < rsvd) { + pr_warn("Reserved vectors %u > available %u. IRQ request may fail\n", + rsvd, avl); + } +out: raw_spin_unlock(&vector_lock); + return ret; } -#endif +#endif /* HOTPLUG_CPU */ +#endif /* SMP */ static void __init print_APIC_field(int base) { diff --git a/arch/x86/kernel/apic/x2apic.h b/arch/x86/kernel/apic/x2apic.h new file mode 100644 index 000000000000..b107de381cb5 --- /dev/null +++ b/arch/x86/kernel/apic/x2apic.h @@ -0,0 +1,9 @@ +/* Common bits for X2APIC cluster/physical modes. 
*/ + +int x2apic_apic_id_valid(int apicid); +int x2apic_apic_id_registered(void); +void __x2apic_send_IPI_dest(unsigned int apicid, int vector, unsigned int dest); +unsigned int x2apic_get_apic_id(unsigned long id); +u32 x2apic_set_apic_id(unsigned int id); +int x2apic_phys_pkg_id(int initial_apicid, int index_msb); +void x2apic_send_IPI_self(int vector); diff --git a/arch/x86/kernel/apic/x2apic_cluster.c b/arch/x86/kernel/apic/x2apic_cluster.c index e216cf3d64d2..8b04234e010b 100644 --- a/arch/x86/kernel/apic/x2apic_cluster.c +++ b/arch/x86/kernel/apic/x2apic_cluster.c @@ -9,22 +9,24 @@ #include <linux/cpu.h> #include <asm/smp.h> -#include <asm/x2apic.h> +#include "x2apic.h" + +struct cluster_mask { + unsigned int clusterid; + int node; + struct cpumask mask; +}; static DEFINE_PER_CPU(u32, x86_cpu_to_logical_apicid); -static DEFINE_PER_CPU(cpumask_var_t, cpus_in_cluster); static DEFINE_PER_CPU(cpumask_var_t, ipi_mask); +static DEFINE_PER_CPU(struct cluster_mask *, cluster_masks); +static struct cluster_mask *cluster_hotplug_mask; static int x2apic_acpi_madt_oem_check(char *oem_id, char *oem_table_id) { return x2apic_enabled(); } -static inline u32 x2apic_cluster(int cpu) -{ - return per_cpu(x86_cpu_to_logical_apicid, cpu) >> 16; -} - static void x2apic_send_IPI(int cpu, int vector) { u32 dest = per_cpu(x86_cpu_to_logical_apicid, cpu); @@ -36,49 +38,34 @@ static void x2apic_send_IPI(int cpu, int vector) static void __x2apic_send_IPI_mask(const struct cpumask *mask, int vector, int apic_dest) { - struct cpumask *cpus_in_cluster_ptr; - struct cpumask *ipi_mask_ptr; - unsigned int cpu, this_cpu; + unsigned int cpu, clustercpu; + struct cpumask *tmpmsk; unsigned long flags; u32 dest; x2apic_wrmsr_fence(); - local_irq_save(flags); - this_cpu = smp_processor_id(); + tmpmsk = this_cpu_cpumask_var_ptr(ipi_mask); + cpumask_copy(tmpmsk, mask); + /* If IPI should not be sent to self, clear current CPU */ + if (apic_dest != APIC_DEST_ALLINC) + cpumask_clear_cpu(smp_processor_id(), tmpmsk); - /* - * We are to modify mask, so we need an own copy - * and be sure it's manipulated with irq off. - */ - ipi_mask_ptr = this_cpu_cpumask_var_ptr(ipi_mask); - cpumask_copy(ipi_mask_ptr, mask); - - /* - * The idea is to send one IPI per cluster. - */ - for_each_cpu(cpu, ipi_mask_ptr) { - unsigned long i; + /* Collapse cpus in a cluster so a single IPI per cluster is sent */ + for_each_cpu(cpu, tmpmsk) { + struct cluster_mask *cmsk = per_cpu(cluster_masks, cpu); - cpus_in_cluster_ptr = per_cpu(cpus_in_cluster, cpu); dest = 0; - - /* Collect cpus in cluster. */ - for_each_cpu_and(i, ipi_mask_ptr, cpus_in_cluster_ptr) { - if (apic_dest == APIC_DEST_ALLINC || i != this_cpu) - dest |= per_cpu(x86_cpu_to_logical_apicid, i); - } + for_each_cpu_and(clustercpu, tmpmsk, &cmsk->mask) + dest |= per_cpu(x86_cpu_to_logical_apicid, clustercpu); if (!dest) continue; __x2apic_send_IPI_dest(dest, vector, apic->dest_logical); - /* - * Cluster sibling cpus should be discared now so - * we would not send IPI them second time. 
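In x2APIC cluster mode the logical ID read from APIC_LDR encodes the cluster number in bits 31:16 and a one-hot member position in bits 15:0 (which is why the old helper derived the cluster with a right shift by 16). That layout is what lets a single IPI reach several CPUs of one cluster by OR-ing their member bits into one destination. A standalone sketch of that composition, using made-up logical IDs:

#include <stdio.h>

/* x2APIC logical ID: cluster in bits 31:16, one-hot member in bits 15:0 */
static unsigned int make_ldr(unsigned int cluster, unsigned int member)
{
	return (cluster << 16) | (1u << member);
}

int main(void)
{
	/* Three CPUs: two share cluster 1, one sits in cluster 2 */
	unsigned int ldr[] = {
		make_ldr(1, 0), make_ldr(1, 3), make_ldr(2, 0),
	};
	unsigned int dest = 0;

	/* Collapse the CPUs of cluster 1 into one logical destination */
	for (unsigned int i = 0; i < 3; i++)
		if ((ldr[i] >> 16) == 1)
			dest |= ldr[i];

	printf("cluster 1 destination: 0x%08x\n", dest);	/* 0x00010009 */
	return 0;
}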
- */ - cpumask_andnot(ipi_mask_ptr, ipi_mask_ptr, cpus_in_cluster_ptr); + /* Remove cluster CPUs from tmpmask */ + cpumask_andnot(tmpmsk, tmpmsk, &cmsk->mask); } local_irq_restore(flags); @@ -105,125 +92,90 @@ static void x2apic_send_IPI_all(int vector) __x2apic_send_IPI_mask(cpu_online_mask, vector, APIC_DEST_ALLINC); } -static int -x2apic_cpu_mask_to_apicid(const struct cpumask *mask, struct irq_data *irqdata, - unsigned int *apicid) +static u32 x2apic_calc_apicid(unsigned int cpu) { - struct cpumask *effmsk = irq_data_get_effective_affinity_mask(irqdata); - unsigned int cpu; - u32 dest = 0; - u16 cluster; - - cpu = cpumask_first(mask); - if (cpu >= nr_cpu_ids) - return -EINVAL; - - dest = per_cpu(x86_cpu_to_logical_apicid, cpu); - cluster = x2apic_cluster(cpu); - - cpumask_clear(effmsk); - for_each_cpu(cpu, mask) { - if (cluster != x2apic_cluster(cpu)) - continue; - dest |= per_cpu(x86_cpu_to_logical_apicid, cpu); - cpumask_set_cpu(cpu, effmsk); - } - - *apicid = dest; - return 0; + return per_cpu(x86_cpu_to_logical_apicid, cpu); } static void init_x2apic_ldr(void) { - unsigned int this_cpu = smp_processor_id(); + struct cluster_mask *cmsk = this_cpu_read(cluster_masks); + u32 cluster, apicid = apic_read(APIC_LDR); unsigned int cpu; - per_cpu(x86_cpu_to_logical_apicid, this_cpu) = apic_read(APIC_LDR); + this_cpu_write(x86_cpu_to_logical_apicid, apicid); + + if (cmsk) + goto update; - cpumask_set_cpu(this_cpu, per_cpu(cpus_in_cluster, this_cpu)); + cluster = apicid >> 16; for_each_online_cpu(cpu) { - if (x2apic_cluster(this_cpu) != x2apic_cluster(cpu)) - continue; - cpumask_set_cpu(this_cpu, per_cpu(cpus_in_cluster, cpu)); - cpumask_set_cpu(cpu, per_cpu(cpus_in_cluster, this_cpu)); + cmsk = per_cpu(cluster_masks, cpu); + /* Matching cluster found. Link and update it. */ + if (cmsk && cmsk->clusterid == cluster) + goto update; } + cmsk = cluster_hotplug_mask; + cluster_hotplug_mask = NULL; +update: + this_cpu_write(cluster_masks, cmsk); + cpumask_set_cpu(smp_processor_id(), &cmsk->mask); } -/* - * At CPU state changes, update the x2apic cluster sibling info. - */ -static int x2apic_prepare_cpu(unsigned int cpu) +static int alloc_clustermask(unsigned int cpu, int node) { - if (!zalloc_cpumask_var(&per_cpu(cpus_in_cluster, cpu), GFP_KERNEL)) - return -ENOMEM; + if (per_cpu(cluster_masks, cpu)) + return 0; + /* + * If a hotplug spare mask exists, check whether it's on the right + * node. If not, free it and allocate a new one. 
+ */ + if (cluster_hotplug_mask) { + if (cluster_hotplug_mask->node == node) + return 0; + kfree(cluster_hotplug_mask); + } - if (!zalloc_cpumask_var(&per_cpu(ipi_mask, cpu), GFP_KERNEL)) { - free_cpumask_var(per_cpu(cpus_in_cluster, cpu)); + cluster_hotplug_mask = kzalloc_node(sizeof(*cluster_hotplug_mask), + GFP_KERNEL, node); + if (!cluster_hotplug_mask) return -ENOMEM; - } + cluster_hotplug_mask->node = node; + return 0; +} +static int x2apic_prepare_cpu(unsigned int cpu) +{ + if (alloc_clustermask(cpu, cpu_to_node(cpu)) < 0) + return -ENOMEM; + if (!zalloc_cpumask_var(&per_cpu(ipi_mask, cpu), GFP_KERNEL)) + return -ENOMEM; return 0; } -static int x2apic_dead_cpu(unsigned int this_cpu) +static int x2apic_dead_cpu(unsigned int dead_cpu) { - int cpu; + struct cluster_mask *cmsk = per_cpu(cluster_masks, dead_cpu); - for_each_online_cpu(cpu) { - if (x2apic_cluster(this_cpu) != x2apic_cluster(cpu)) - continue; - cpumask_clear_cpu(this_cpu, per_cpu(cpus_in_cluster, cpu)); - cpumask_clear_cpu(cpu, per_cpu(cpus_in_cluster, this_cpu)); - } - free_cpumask_var(per_cpu(cpus_in_cluster, this_cpu)); - free_cpumask_var(per_cpu(ipi_mask, this_cpu)); + cpumask_clear_cpu(dead_cpu, &cmsk->mask); + free_cpumask_var(per_cpu(ipi_mask, dead_cpu)); return 0; } static int x2apic_cluster_probe(void) { - int cpu = smp_processor_id(); - int ret; - if (!x2apic_mode) return 0; - ret = cpuhp_setup_state(CPUHP_X2APIC_PREPARE, "x86/x2apic:prepare", - x2apic_prepare_cpu, x2apic_dead_cpu); - if (ret < 0) { + if (cpuhp_setup_state(CPUHP_X2APIC_PREPARE, "x86/x2apic:prepare", + x2apic_prepare_cpu, x2apic_dead_cpu) < 0) { pr_err("Failed to register X2APIC_PREPARE\n"); return 0; } - cpumask_set_cpu(cpu, per_cpu(cpus_in_cluster, cpu)); + init_x2apic_ldr(); return 1; } -static const struct cpumask *x2apic_cluster_target_cpus(void) -{ - return cpu_all_mask; -} - -/* - * Each x2apic cluster is an allocation domain. - */ -static void cluster_vector_allocation_domain(int cpu, struct cpumask *retmask, - const struct cpumask *mask) -{ - /* - * To minimize vector pressure, default case of boot, device bringup - * etc will use a single cpu for the interrupt destination. - * - * On explicit migration requests coming from irqbalance etc, - * interrupts will be routed to the x2apic cluster (cluster-id - * derived from the first cpu in the mask) members specified - * in the mask. 
- */ - if (mask == x2apic_cluster_target_cpus()) - cpumask_copy(retmask, cpumask_of(cpu)); - else - cpumask_and(retmask, mask, per_cpu(cpus_in_cluster, cpu)); -} - static struct apic apic_x2apic_cluster __ro_after_init = { .name = "cluster x2apic", @@ -232,15 +184,13 @@ static struct apic apic_x2apic_cluster __ro_after_init = { .apic_id_valid = x2apic_apic_id_valid, .apic_id_registered = x2apic_apic_id_registered, - .irq_delivery_mode = dest_LowestPrio, + .irq_delivery_mode = dest_Fixed, .irq_dest_mode = 1, /* logical */ - .target_cpus = x2apic_cluster_target_cpus, .disable_esr = 0, .dest_logical = APIC_DEST_LOGICAL, .check_apicid_used = NULL, - .vector_allocation_domain = cluster_vector_allocation_domain, .init_apic_ldr = init_x2apic_ldr, .ioapic_phys_id_map = NULL, @@ -253,7 +203,7 @@ static struct apic apic_x2apic_cluster __ro_after_init = { .get_apic_id = x2apic_get_apic_id, .set_apic_id = x2apic_set_apic_id, - .cpu_mask_to_apicid = x2apic_cpu_mask_to_apicid, + .calc_dest_apicid = x2apic_calc_apicid, .send_IPI = x2apic_send_IPI, .send_IPI_mask = x2apic_send_IPI_mask, diff --git a/arch/x86/kernel/apic/x2apic_phys.c b/arch/x86/kernel/apic/x2apic_phys.c index b94d35320f85..f8d9d69994e6 100644 --- a/arch/x86/kernel/apic/x2apic_phys.c +++ b/arch/x86/kernel/apic/x2apic_phys.c @@ -7,7 +7,8 @@ #include <linux/dmar.h> #include <asm/smp.h> -#include <asm/x2apic.h> +#include <asm/ipi.h> +#include "x2apic.h" int x2apic_phys; @@ -99,6 +100,43 @@ static int x2apic_phys_probe(void) return apic == &apic_x2apic_phys; } +/* Common x2apic functions, also used by x2apic_cluster */ +int x2apic_apic_id_valid(int apicid) +{ + return 1; +} + +int x2apic_apic_id_registered(void) +{ + return 1; +} + +void __x2apic_send_IPI_dest(unsigned int apicid, int vector, unsigned int dest) +{ + unsigned long cfg = __prepare_ICR(0, vector, dest); + native_x2apic_icr_write(cfg, apicid); +} + +unsigned int x2apic_get_apic_id(unsigned long id) +{ + return id; +} + +u32 x2apic_set_apic_id(unsigned int id) +{ + return id; +} + +int x2apic_phys_pkg_id(int initial_apicid, int index_msb) +{ + return initial_apicid >> index_msb; +} + +void x2apic_send_IPI_self(int vector) +{ + apic_write(APIC_SELF_IPI, vector); +} + static struct apic apic_x2apic_phys __ro_after_init = { .name = "physical x2apic", @@ -110,12 +148,10 @@ static struct apic apic_x2apic_phys __ro_after_init = { .irq_delivery_mode = dest_Fixed, .irq_dest_mode = 0, /* physical */ - .target_cpus = online_target_cpus, .disable_esr = 0, .dest_logical = 0, .check_apicid_used = NULL, - .vector_allocation_domain = default_vector_allocation_domain, .init_apic_ldr = init_x2apic_ldr, .ioapic_phys_id_map = NULL, @@ -128,7 +164,7 @@ static struct apic apic_x2apic_phys __ro_after_init = { .get_apic_id = x2apic_get_apic_id, .set_apic_id = x2apic_set_apic_id, - .cpu_mask_to_apicid = default_cpu_mask_to_apicid, + .calc_dest_apicid = apic_default_calc_apicid, .send_IPI = x2apic_send_IPI, .send_IPI_mask = x2apic_send_IPI_mask, diff --git a/arch/x86/kernel/apic/x2apic_uv_x.c b/arch/x86/kernel/apic/x2apic_uv_x.c index 0d57bb9079c9..f11910b44638 100644 --- a/arch/x86/kernel/apic/x2apic_uv_x.c +++ b/arch/x86/kernel/apic/x2apic_uv_x.c @@ -137,6 +137,8 @@ static int __init early_get_pnodeid(void) case UV3_HUB_PART_NUMBER_X: uv_min_hub_revision_id += UV3_HUB_REVISION_BASE; break; + + /* Update: UV4A has only a modified revision to indicate HUB fixes */ case UV4_HUB_PART_NUMBER: uv_min_hub_revision_id += UV4_HUB_REVISION_BASE - 1; uv_cpuid.gnode_shift = 2; /* min partition is 4 sockets */ @@ 
-154,6 +156,48 @@ static int __init early_get_pnodeid(void) return pnode; } +static void __init uv_tsc_check_sync(void) +{ + u64 mmr; + int sync_state; + int mmr_shift; + char *state; + bool valid; + + /* Accommodate different UV arch BIOSes */ + mmr = uv_early_read_mmr(UVH_TSC_SYNC_MMR); + mmr_shift = + is_uv1_hub() ? 0 : + is_uv2_hub() ? UVH_TSC_SYNC_SHIFT_UV2K : UVH_TSC_SYNC_SHIFT; + if (mmr_shift) + sync_state = (mmr >> mmr_shift) & UVH_TSC_SYNC_MASK; + else + sync_state = 0; + + switch (sync_state) { + case UVH_TSC_SYNC_VALID: + state = "in sync"; + valid = true; + break; + + case UVH_TSC_SYNC_INVALID: + state = "unstable"; + valid = false; + break; + default: + state = "unknown: assuming valid"; + valid = true; + break; + } + pr_info("UV: TSC sync state from BIOS:0%d(%s)\n", sync_state, state); + + /* Mark flag that says TSC != 0 is valid for socket 0 */ + if (valid) + mark_tsc_async_resets("UV BIOS"); + else + mark_tsc_unstable("UV BIOS"); +} + /* [Copied from arch/x86/kernel/cpu/topology.c:detect_extended_topology()] */ #define SMT_LEVEL 0 /* Leaf 0xb SMT level */ @@ -274,6 +318,7 @@ static int __init uv_acpi_madt_oem_check(char *oem_id, char *oem_table_id) } else if (!strcmp(oem_table_id, "UVH")) { /* Only UV1 systems: */ uv_system_type = UV_NON_UNIQUE_APIC; + x86_platform.legacy.warm_reset = 0; __this_cpu_write(x2apic_extra_bits, pnodeid << uvh_apicid.s.pnode_shift); uv_set_apicid_hibit(); uv_apic = 1; @@ -288,6 +333,7 @@ static int __init uv_acpi_madt_oem_check(char *oem_id, char *oem_table_id) } pr_info("UV: OEM IDs %s/%s, System/HUB Types %d/%d, uv_apic %d\n", oem_id, oem_table_id, uv_system_type, uv_min_hub_revision_id, uv_apic); + uv_tsc_check_sync(); return uv_apic; @@ -525,16 +571,9 @@ static void uv_init_apic_ldr(void) { } -static int -uv_cpu_mask_to_apicid(const struct cpumask *mask, struct irq_data *irqdata, - unsigned int *apicid) +static u32 apic_uv_calc_apicid(unsigned int cpu) { - int ret = default_cpu_mask_to_apicid(mask, irqdata, apicid); - - if (!ret) - *apicid |= uv_apicid_hibits; - - return ret; + return apic_default_calc_apicid(cpu) | uv_apicid_hibits; } static unsigned int x2apic_get_apic_id(unsigned long x) @@ -547,7 +586,7 @@ static unsigned int x2apic_get_apic_id(unsigned long x) return id; } -static unsigned long set_apic_id(unsigned int id) +static u32 set_apic_id(unsigned int id) { /* CHECKME: Do we need to mask out the xapic extra bits? */ return id; @@ -584,12 +623,10 @@ static struct apic apic_x2apic_uv_x __ro_after_init = { .irq_delivery_mode = dest_Fixed, .irq_dest_mode = 0, /* Physical */ - .target_cpus = online_target_cpus, .disable_esr = 0, .dest_logical = APIC_DEST_LOGICAL, .check_apicid_used = NULL, - .vector_allocation_domain = default_vector_allocation_domain, .init_apic_ldr = uv_init_apic_ldr, .ioapic_phys_id_map = NULL, @@ -602,7 +639,7 @@ static struct apic apic_x2apic_uv_x __ro_after_init = { .get_apic_id = x2apic_get_apic_id, .set_apic_id = set_apic_id, - .cpu_mask_to_apicid = uv_cpu_mask_to_apicid, + .calc_dest_apicid = apic_uv_calc_apicid, .send_IPI = uv_send_IPI_one, .send_IPI_mask = uv_send_IPI_mask, @@ -733,6 +770,7 @@ static __init void map_gru_high(int max_pnode) return; } + /* Only UV3 has distributed GRU mode */ if (is_uv3_hub() && gru.s3.mode) { map_gru_distributed(gru.v); return; @@ -756,63 +794,61 @@ static __init void map_mmr_high(int max_pnode) pr_info("UV: MMR disabled\n"); } -/* - * This commonality works because both 0 & 1 versions of the MMIOH OVERLAY - * and REDIRECT MMR regs are exactly the same on UV3. 
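The MMIOH rework below drops the per-revision bitfield unions in favour of explicit mask-and-shift extraction from the raw overlay MMR value. A minimal standalone sketch of that style, with entirely hypothetical mask and shift values standing in for the UVH_* constants:

#include <stdio.h>

/* Hypothetical field layout of a 64-bit overlay register */
#define OVERLAY_ENABLE_MASK (1UL << 63)
#define OVERLAY_BASE_MASK   0x00003ffffc000000UL
#define OVERLAY_M_IO_SHFT   52
#define OVERLAY_M_IO_MASK   (0x3fUL << OVERLAY_M_IO_SHFT)

int main(void)
{
	/* Sample raw value: enabled, m_io = 26, base = 1 << 26 */
	unsigned long overlay = OVERLAY_ENABLE_MASK |
				(26UL << OVERLAY_M_IO_SHFT) |
				(1UL << 26);
	unsigned long base = overlay & OVERLAY_BASE_MASK;
	unsigned int m_io = (overlay & OVERLAY_M_IO_MASK) >> OVERLAY_M_IO_SHFT;

	if (!(overlay & OVERLAY_ENABLE_MASK)) {
		printf("overlay disabled\n");
		return 0;
	}
	printf("base 0x%lx, m_io %u\n", base, m_io);
	return 0;
}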
- */ -struct mmioh_config { - unsigned long overlay; - unsigned long redirect; - char *id; -}; - -static __initdata struct mmioh_config mmiohs[] = { - { - UV3H_RH_GAM_MMIOH_OVERLAY_CONFIG0_MMR, - UV3H_RH_GAM_MMIOH_REDIRECT_CONFIG0_MMR, - "MMIOH0" - }, - { - UV3H_RH_GAM_MMIOH_OVERLAY_CONFIG1_MMR, - UV3H_RH_GAM_MMIOH_REDIRECT_CONFIG1_MMR, - "MMIOH1" - }, -}; - -/* UV3 & UV4 have identical MMIOH overlay configs */ -static __init void map_mmioh_high_uv3(int index, int min_pnode, int max_pnode) +/* UV3/4 have identical MMIOH overlay configs, UV4A is slightly different */ +static __init void map_mmioh_high_uv34(int index, int min_pnode, int max_pnode) { - union uv3h_rh_gam_mmioh_overlay_config0_mmr_u overlay; + unsigned long overlay; unsigned long mmr; unsigned long base; + unsigned long nasid_mask; + unsigned long m_overlay; int i, n, shift, m_io, max_io; int nasid, lnasid, fi, li; char *id; - id = mmiohs[index].id; - overlay.v = uv_read_local_mmr(mmiohs[index].overlay); - - pr_info("UV: %s overlay 0x%lx base:0x%x m_io:%d\n", id, overlay.v, overlay.s3.base, overlay.s3.m_io); - if (!overlay.s3.enable) { + if (index == 0) { + id = "MMIOH0"; + m_overlay = UVH_RH_GAM_MMIOH_OVERLAY_CONFIG0_MMR; + overlay = uv_read_local_mmr(m_overlay); + base = overlay & UVH_RH_GAM_MMIOH_OVERLAY_CONFIG0_MMR_BASE_MASK; + mmr = UVH_RH_GAM_MMIOH_REDIRECT_CONFIG0_MMR; + m_io = (overlay & UVH_RH_GAM_MMIOH_OVERLAY_CONFIG0_MMR_M_IO_MASK) + >> UVH_RH_GAM_MMIOH_OVERLAY_CONFIG0_MMR_M_IO_SHFT; + shift = UVH_RH_GAM_MMIOH_OVERLAY_CONFIG0_MMR_M_IO_SHFT; + n = UVH_RH_GAM_MMIOH_REDIRECT_CONFIG0_MMR_DEPTH; + nasid_mask = UVH_RH_GAM_MMIOH_REDIRECT_CONFIG0_MMR_NASID_MASK; + } else { + id = "MMIOH1"; + m_overlay = UVH_RH_GAM_MMIOH_OVERLAY_CONFIG1_MMR; + overlay = uv_read_local_mmr(m_overlay); + base = overlay & UVH_RH_GAM_MMIOH_OVERLAY_CONFIG1_MMR_BASE_MASK; + mmr = UVH_RH_GAM_MMIOH_REDIRECT_CONFIG1_MMR; + m_io = (overlay & UVH_RH_GAM_MMIOH_OVERLAY_CONFIG1_MMR_M_IO_MASK) + >> UVH_RH_GAM_MMIOH_OVERLAY_CONFIG1_MMR_M_IO_SHFT; + shift = UVH_RH_GAM_MMIOH_OVERLAY_CONFIG1_MMR_M_IO_SHFT; + n = UVH_RH_GAM_MMIOH_REDIRECT_CONFIG1_MMR_DEPTH; + nasid_mask = UVH_RH_GAM_MMIOH_REDIRECT_CONFIG1_MMR_NASID_MASK; + } + pr_info("UV: %s overlay 0x%lx base:0x%lx m_io:%d\n", id, overlay, base, m_io); + if (!(overlay & UVH_RH_GAM_MMIOH_OVERLAY_CONFIG0_MMR_ENABLE_MASK)) { pr_info("UV: %s disabled\n", id); return; } - shift = UV3H_RH_GAM_MMIOH_OVERLAY_CONFIG0_MMR_BASE_SHFT; - base = (unsigned long)overlay.s3.base; - m_io = overlay.s3.m_io; - mmr = mmiohs[index].redirect; - n = UV3H_RH_GAM_MMIOH_REDIRECT_CONFIG0_MMR_DEPTH; /* Convert to NASID: */ min_pnode *= 2; max_pnode *= 2; max_io = lnasid = fi = li = -1; for (i = 0; i < n; i++) { - union uv3h_rh_gam_mmioh_redirect_config0_mmr_u redirect; + unsigned long m_redirect = mmr + i * 8; + unsigned long redirect = uv_read_local_mmr(m_redirect); + + nasid = redirect & nasid_mask; + if (i == 0) + pr_info("UV: %s redirect base 0x%lx(@0x%lx) 0x%04x\n", + id, redirect, m_redirect, nasid); - redirect.v = uv_read_local_mmr(mmr + i * 8); - nasid = redirect.s3.nasid; /* Invalid NASID: */ if (nasid < min_pnode || max_pnode < nasid) nasid = -1; @@ -860,8 +896,8 @@ static __init void map_mmioh_high(int min_pnode, int max_pnode) if (is_uv3_hub() || is_uv4_hub()) { /* Map both MMIOH regions: */ - map_mmioh_high_uv3(0, min_pnode, max_pnode); - map_mmioh_high_uv3(1, min_pnode, max_pnode); + map_mmioh_high_uv34(0, min_pnode, max_pnode); + map_mmioh_high_uv34(1, min_pnode, max_pnode); return; } @@ -920,9 +956,8 @@ static __init void 
uv_rtc_init(void) /* * percpu heartbeat timer */ -static void uv_heartbeat(unsigned long ignored) +static void uv_heartbeat(struct timer_list *timer) { - struct timer_list *timer = &uv_scir_info->timer; unsigned char bits = uv_scir_info->state; /* Flip heartbeat bit: */ @@ -947,7 +982,7 @@ static int uv_heartbeat_enable(unsigned int cpu) struct timer_list *timer = &uv_cpu_scir_info(cpu)->timer; uv_set_cpu_scir_bits(cpu, SCIR_CPU_HEARTBEAT|SCIR_CPU_ACTIVITY); - setup_pinned_timer(timer, uv_heartbeat, cpu); + timer_setup(timer, uv_heartbeat, TIMER_PINNED); timer->expires = jiffies + SCIR_CPU_HB_INTERVAL; add_timer_on(timer, cpu); uv_cpu_scir_info(cpu)->enabled = 1; @@ -1141,16 +1176,25 @@ static void __init decode_gam_rng_tbl(unsigned long ptr) uv_gre_table = gre; for (; gre->type != UV_GAM_RANGE_TYPE_UNUSED; gre++) { + unsigned long size = ((unsigned long)(gre->limit - lgre) + << UV_GAM_RANGE_SHFT); + int order = 0; + char suffix[] = " KMGTPE"; + + while (size > 9999 && order < sizeof(suffix)) { + size /= 1024; + order++; + } + if (!index) { pr_info("UV: GAM Range Table...\n"); pr_info("UV: # %20s %14s %5s %4s %5s %3s %2s\n", "Range", "", "Size", "Type", "NASID", "SID", "PN"); } - pr_info("UV: %2d: 0x%014lx-0x%014lx %5luG %3d %04x %02x %02x\n", + pr_info("UV: %2d: 0x%014lx-0x%014lx %5lu%c %3d %04x %02x %02x\n", index++, (unsigned long)lgre << UV_GAM_RANGE_SHFT, (unsigned long)gre->limit << UV_GAM_RANGE_SHFT, - ((unsigned long)(gre->limit - lgre)) >> - (30 - UV_GAM_RANGE_SHFT), /* 64M -> 1G */ + size, suffix[order], gre->type, gre->nasid, gre->sockid, gre->pnode); lgre = gre->limit; diff --git a/arch/x86/kernel/apm_32.c b/arch/x86/kernel/apm_32.c index e4b0d92b3ae0..dfcbe6924eaf 100644 --- a/arch/x86/kernel/apm_32.c +++ b/arch/x86/kernel/apm_32.c @@ -1506,7 +1506,7 @@ static ssize_t do_read(struct file *fp, char __user *buf, size_t count, loff_t * return 0; } -static unsigned int do_poll(struct file *fp, poll_table *wait) +static __poll_t do_poll(struct file *fp, poll_table *wait) { struct apm_user *as; @@ -1515,7 +1515,7 @@ static unsigned int do_poll(struct file *fp, poll_table *wait) return 0; poll_wait(fp, &apm_waitqueue, wait); if (!queue_empty(as)) - return POLLIN | POLLRDNORM; + return EPOLLIN | EPOLLRDNORM; return 0; } @@ -2389,6 +2389,7 @@ static int __init apm_init(void) if (HZ != 100) idle_period = (idle_period * HZ) / 100; if (idle_threshold < 100) { + cpuidle_poll_state_init(&apm_idle_driver); if (!cpuidle_register_driver(&apm_idle_driver)) if (cpuidle_register_device(&apm_cpuidle_device)) cpuidle_unregister_driver(&apm_idle_driver); diff --git a/arch/x86/kernel/asm-offsets.c b/arch/x86/kernel/asm-offsets.c index 8ea78275480d..76417a9aab73 100644 --- a/arch/x86/kernel/asm-offsets.c +++ b/arch/x86/kernel/asm-offsets.c @@ -17,6 +17,7 @@ #include <asm/sigframe.h> #include <asm/bootparam.h> #include <asm/suspend.h> +#include <asm/tlbflush.h> #ifdef CONFIG_XEN #include <xen/interface/xen.h> @@ -93,4 +94,13 @@ void common(void) { BLANK(); DEFINE(PTREGS_SIZE, sizeof(struct pt_regs)); + + /* TLB state for the entry code */ + OFFSET(TLB_STATE_user_pcid_flush_mask, tlb_state, user_pcid_flush_mask); + + /* Layout info for cpu_entry_area */ + OFFSET(CPU_ENTRY_AREA_tss, cpu_entry_area, tss); + OFFSET(CPU_ENTRY_AREA_entry_trampoline, cpu_entry_area, entry_trampoline); + OFFSET(CPU_ENTRY_AREA_entry_stack, cpu_entry_area, entry_stack_page); + DEFINE(SIZEOF_entry_stack, sizeof(struct entry_stack)); } diff --git a/arch/x86/kernel/asm-offsets_32.c b/arch/x86/kernel/asm-offsets_32.c index 
dedf428b20b6..f91ba53e06c8 100644 --- a/arch/x86/kernel/asm-offsets_32.c +++ b/arch/x86/kernel/asm-offsets_32.c @@ -18,7 +18,7 @@ void foo(void) OFFSET(CPUINFO_x86, cpuinfo_x86, x86); OFFSET(CPUINFO_x86_vendor, cpuinfo_x86, x86_vendor); OFFSET(CPUINFO_x86_model, cpuinfo_x86, x86_model); - OFFSET(CPUINFO_x86_mask, cpuinfo_x86, x86_mask); + OFFSET(CPUINFO_x86_stepping, cpuinfo_x86, x86_stepping); OFFSET(CPUINFO_cpuid_level, cpuinfo_x86, cpuid_level); OFFSET(CPUINFO_x86_capability, cpuinfo_x86, x86_capability); OFFSET(CPUINFO_x86_vendor_id, cpuinfo_x86, x86_vendor_id); @@ -47,13 +47,8 @@ void foo(void) BLANK(); /* Offset from the sysenter stack to tss.sp0 */ - DEFINE(TSS_sysenter_sp0, offsetof(struct tss_struct, x86_tss.sp0) - - offsetofend(struct tss_struct, SYSENTER_stack)); - - /* Offset from cpu_tss to SYSENTER_stack */ - OFFSET(CPU_TSS_SYSENTER_stack, tss_struct, SYSENTER_stack); - /* Size of SYSENTER_stack */ - DEFINE(SIZEOF_SYSENTER_stack, sizeof(((struct tss_struct *)0)->SYSENTER_stack)); + DEFINE(TSS_sysenter_sp0, offsetof(struct cpu_entry_area, tss.x86_tss.sp0) - + offsetofend(struct cpu_entry_area, entry_stack_page.stack)); #ifdef CONFIG_CC_STACKPROTECTOR BLANK(); diff --git a/arch/x86/kernel/asm-offsets_64.c b/arch/x86/kernel/asm-offsets_64.c index 630212fa9b9d..bf51e51d808d 100644 --- a/arch/x86/kernel/asm-offsets_64.c +++ b/arch/x86/kernel/asm-offsets_64.c @@ -23,6 +23,9 @@ int main(void) #ifdef CONFIG_PARAVIRT OFFSET(PV_CPU_usergs_sysret64, pv_cpu_ops, usergs_sysret64); OFFSET(PV_CPU_swapgs, pv_cpu_ops, swapgs); +#ifdef CONFIG_DEBUG_ENTRY + OFFSET(PV_IRQ_save_fl, pv_irq_ops, save_fl); +#endif BLANK(); #endif @@ -63,6 +66,7 @@ int main(void) OFFSET(TSS_ist, tss_struct, x86_tss.ist); OFFSET(TSS_sp0, tss_struct, x86_tss.sp0); + OFFSET(TSS_sp1, tss_struct, x86_tss.sp1); BLANK(); #ifdef CONFIG_CC_STACKPROTECTOR diff --git a/arch/x86/kernel/cpu/Makefile b/arch/x86/kernel/cpu/Makefile index 236999c54edc..570e8bb1f386 100644 --- a/arch/x86/kernel/cpu/Makefile +++ b/arch/x86/kernel/cpu/Makefile @@ -23,6 +23,7 @@ obj-y += rdrand.o obj-y += match.o obj-y += bugs.o obj-y += aperfmperf.o +obj-y += cpuid-deps.o obj-$(CONFIG_PROC_FS) += proc.o obj-$(CONFIG_X86_FEATURE_NAMES) += capflags.o powerflags.o diff --git a/arch/x86/kernel/cpu/amd.c b/arch/x86/kernel/cpu/amd.c index d58184b7cd44..f0e6456ca7d3 100644 --- a/arch/x86/kernel/cpu/amd.c +++ b/arch/x86/kernel/cpu/amd.c @@ -119,7 +119,7 @@ static void init_amd_k6(struct cpuinfo_x86 *c) return; } - if (c->x86_model == 6 && c->x86_mask == 1) { + if (c->x86_model == 6 && c->x86_stepping == 1) { const int K6_BUG_LOOP = 1000000; int n; void (*f_vide)(void); @@ -149,7 +149,7 @@ static void init_amd_k6(struct cpuinfo_x86 *c) /* K6 with old style WHCR */ if (c->x86_model < 8 || - (c->x86_model == 8 && c->x86_mask < 8)) { + (c->x86_model == 8 && c->x86_stepping < 8)) { /* We can only write allocate on the low 508Mb */ if (mbytes > 508) mbytes = 508; @@ -168,7 +168,7 @@ static void init_amd_k6(struct cpuinfo_x86 *c) return; } - if ((c->x86_model == 8 && c->x86_mask > 7) || + if ((c->x86_model == 8 && c->x86_stepping > 7) || c->x86_model == 9 || c->x86_model == 13) { /* The more serious chips .. 
*/ @@ -221,7 +221,7 @@ static void init_amd_k7(struct cpuinfo_x86 *c) * are more robust with CLK_CTL set to 200xxxxx instead of 600xxxxx * As per AMD technical note 27212 0.2 */ - if ((c->x86_model == 8 && c->x86_mask >= 1) || (c->x86_model > 8)) { + if ((c->x86_model == 8 && c->x86_stepping >= 1) || (c->x86_model > 8)) { rdmsr(MSR_K7_CLK_CTL, l, h); if ((l & 0xfff00000) != 0x20000000) { pr_info("CPU: CLK_CTL MSR was %x. Reprogramming to %x\n", @@ -241,12 +241,12 @@ static void init_amd_k7(struct cpuinfo_x86 *c) * but they are not certified as MP capable. */ /* Athlon 660/661 is valid. */ - if ((c->x86_model == 6) && ((c->x86_mask == 0) || - (c->x86_mask == 1))) + if ((c->x86_model == 6) && ((c->x86_stepping == 0) || + (c->x86_stepping == 1))) return; /* Duron 670 is valid */ - if ((c->x86_model == 7) && (c->x86_mask == 0)) + if ((c->x86_model == 7) && (c->x86_stepping == 0)) return; /* @@ -256,8 +256,8 @@ static void init_amd_k7(struct cpuinfo_x86 *c) * See http://www.heise.de/newsticker/data/jow-18.10.01-000 for * more. */ - if (((c->x86_model == 6) && (c->x86_mask >= 2)) || - ((c->x86_model == 7) && (c->x86_mask >= 1)) || + if (((c->x86_model == 6) && (c->x86_stepping >= 2)) || + ((c->x86_model == 7) && (c->x86_stepping >= 1)) || (c->x86_model > 7)) if (cpu_has(c, X86_FEATURE_MP)) return; @@ -556,6 +556,51 @@ static void bsp_init_amd(struct cpuinfo_x86 *c) } } +static void early_detect_mem_encrypt(struct cpuinfo_x86 *c) +{ + u64 msr; + + /* + * BIOS support is required for SME and SEV. + * For SME: If BIOS has enabled SME then adjust x86_phys_bits by + * the SME physical address space reduction value. + * If BIOS has not enabled SME then don't advertise the + * SME feature (set in scattered.c). + * For SEV: If BIOS has not enabled SEV then don't advertise the + * SEV feature (set in scattered.c). + * + * In all cases, since support for SME and SEV requires long mode, + * don't advertise the feature under CONFIG_X86_32. + */ + if (cpu_has(c, X86_FEATURE_SME) || cpu_has(c, X86_FEATURE_SEV)) { + /* Check if memory encryption is enabled */ + rdmsrl(MSR_K8_SYSCFG, msr); + if (!(msr & MSR_K8_SYSCFG_MEM_ENCRYPT)) + goto clear_all; + + /* + * Always adjust physical address bits. Even though this + * will be a value above 32-bits this is still done for + * CONFIG_X86_32 so that accurate values are reported. + */ + c->x86_phys_bits -= (cpuid_ebx(0x8000001f) >> 6) & 0x3f; + + if (IS_ENABLED(CONFIG_X86_32)) + goto clear_all; + + rdmsrl(MSR_K7_HWCR, msr); + if (!(msr & MSR_K7_HWCR_SMMLOCK)) + goto clear_sev; + + return; + +clear_all: + clear_cpu_cap(c, X86_FEATURE_SME); +clear_sev: + clear_cpu_cap(c, X86_FEATURE_SEV); + } +} + static void early_init_amd(struct cpuinfo_x86 *c) { u32 dummy; @@ -583,7 +628,7 @@ static void early_init_amd(struct cpuinfo_x86 *c) /* Set MTRR capability flag if appropriate */ if (c->x86 == 5) if (c->x86_model == 13 || c->x86_model == 9 || - (c->x86_model == 8 && c->x86_mask >= 8)) + (c->x86_model == 8 && c->x86_stepping >= 8)) set_cpu_cap(c, X86_FEATURE_K6_MTRR); #endif #if defined(CONFIG_X86_LOCAL_APIC) && defined(CONFIG_PCI) @@ -627,26 +672,7 @@ static void early_init_amd(struct cpuinfo_x86 *c) if (cpu_has_amd_erratum(c, amd_erratum_400)) set_cpu_bug(c, X86_BUG_AMD_E400); - /* - * BIOS support is required for SME. If BIOS has enabled SME then - * adjust x86_phys_bits by the SME physical address space reduction - * value. If BIOS has not enabled SME then don't advertise the - * feature (set in scattered.c). 
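The new early_detect_mem_encrypt() consults CPUID leaf 0x8000001f for both the SME/SEV capability bits and the physical-address-bit reduction. As an illustration only (not part of the patch), the same leaf can be decoded from user space with the compiler's cpuid.h helpers; the field positions below are the ones the patch relies on (EAX bit 0 SME, bit 1 SEV, EBX[5:0] C-bit position, EBX[11:6] reduction):

/* sme_cpuid.c - decode AMD memory-encryption leaf 0x8000001f (x86, GCC/Clang). */
#include <cpuid.h>
#include <stdio.h>

int main(void)
{
        unsigned int eax, ebx, ecx, edx;

        if (!__get_cpuid(0x8000001f, &eax, &ebx, &ecx, &edx)) {
                puts("CPUID leaf 0x8000001f not available");
                return 1;
        }
        printf("SME supported: %s, SEV supported: %s\n",
               (eax & 1) ? "yes" : "no", (eax & 2) ? "yes" : "no");
        /* EBX[5:0] is the C-bit position, EBX[11:6] the phys address bit reduction. */
        printf("C-bit position: %u, x86_phys_bits reduction: %u\n",
               ebx & 0x3f, (ebx >> 6) & 0x3f);
        return 0;
}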
Also, since the SME support requires - * long mode, don't advertise the feature under CONFIG_X86_32. - */ - if (cpu_has(c, X86_FEATURE_SME)) { - u64 msr; - - /* Check if SME is enabled */ - rdmsrl(MSR_K8_SYSCFG, msr); - if (msr & MSR_K8_SYSCFG_MEM_ENCRYPT) { - c->x86_phys_bits -= (cpuid_ebx(0x8000001f) >> 6) & 0x3f; - if (IS_ENABLED(CONFIG_X86_32)) - clear_cpu_cap(c, X86_FEATURE_SME); - } else { - clear_cpu_cap(c, X86_FEATURE_SME); - } - } + early_detect_mem_encrypt(c); } static void init_amd_k8(struct cpuinfo_x86 *c) @@ -769,7 +795,7 @@ static void init_amd_zn(struct cpuinfo_x86 *c) * Fix erratum 1076: CPB feature bit not being set in CPUID. It affects * all up to and including B1. */ - if (c->x86_model <= 1 && c->x86_mask <= 1) + if (c->x86_model <= 1 && c->x86_stepping <= 1) set_cpu_cap(c, X86_FEATURE_CPB); } @@ -804,8 +830,11 @@ static void init_amd(struct cpuinfo_x86 *c) case 0x17: init_amd_zn(c); break; } - /* Enable workaround for FXSAVE leak */ - if (c->x86 >= 6) + /* + * Enable workaround for FXSAVE leak on CPUs + * without a XSaveErPtr feature + */ + if ((c->x86 >= 6) && (!cpu_has(c, X86_FEATURE_XSAVEERPTR))) set_cpu_bug(c, X86_BUG_FXSAVE_LEAK); cpu_detect_cache_sizes(c); @@ -826,8 +855,32 @@ static void init_amd(struct cpuinfo_x86 *c) set_cpu_cap(c, X86_FEATURE_K8); if (cpu_has(c, X86_FEATURE_XMM2)) { - /* MFENCE stops RDTSC speculation */ - set_cpu_cap(c, X86_FEATURE_MFENCE_RDTSC); + unsigned long long val; + int ret; + + /* + * A serializing LFENCE has less overhead than MFENCE, so + * use it for execution serialization. On families which + * don't have that MSR, LFENCE is already serializing. + * msr_set_bit() uses the safe accessors, too, even if the MSR + * is not present. + */ + msr_set_bit(MSR_F10H_DECFG, + MSR_F10H_DECFG_LFENCE_SERIALIZE_BIT); + + /* + * Verify that the MSR write was successful (could be running + * under a hypervisor) and only then assume that LFENCE is + * serializing. 
+ */ + ret = rdmsrl_safe(MSR_F10H_DECFG, &val); + if (!ret && (val & MSR_F10H_DECFG_LFENCE_SERIALIZE)) { + /* A serializing LFENCE stops RDTSC speculation */ + set_cpu_cap(c, X86_FEATURE_LFENCE_RDTSC); + } else { + /* MFENCE stops RDTSC speculation */ + set_cpu_cap(c, X86_FEATURE_MFENCE_RDTSC); + } } /* @@ -853,11 +906,11 @@ static unsigned int amd_size_cache(struct cpuinfo_x86 *c, unsigned int size) /* AMD errata T13 (order #21922) */ if ((c->x86 == 6)) { /* Duron Rev A0 */ - if (c->x86_model == 3 && c->x86_mask == 0) + if (c->x86_model == 3 && c->x86_stepping == 0) size = 64; /* Tbird rev A1/A2 */ if (c->x86_model == 4 && - (c->x86_mask == 0 || c->x86_mask == 1)) + (c->x86_stepping == 0 || c->x86_stepping == 1)) size = 256; } return size; @@ -994,7 +1047,7 @@ static bool cpu_has_amd_erratum(struct cpuinfo_x86 *cpu, const int *erratum) } /* OSVW unavailable or ID unknown, match family-model-stepping range */ - ms = (cpu->x86_model << 4) | cpu->x86_mask; + ms = (cpu->x86_model << 4) | cpu->x86_stepping; while ((range = *erratum++)) if ((cpu->x86 == AMD_MODEL_RANGE_FAMILY(range)) && (ms >= AMD_MODEL_RANGE_START(range)) && diff --git a/arch/x86/kernel/cpu/aperfmperf.c b/arch/x86/kernel/cpu/aperfmperf.c index 957813e0180d..7eba34df54c3 100644 --- a/arch/x86/kernel/cpu/aperfmperf.c +++ b/arch/x86/kernel/cpu/aperfmperf.c @@ -14,6 +14,8 @@ #include <linux/percpu.h> #include <linux/smp.h> +#include "cpu.h" + struct aperfmperf_sample { unsigned int khz; ktime_t time; @@ -24,7 +26,7 @@ struct aperfmperf_sample { static DEFINE_PER_CPU(struct aperfmperf_sample, samples); #define APERFMPERF_CACHE_THRESHOLD_MS 10 -#define APERFMPERF_REFRESH_DELAY_MS 20 +#define APERFMPERF_REFRESH_DELAY_MS 10 #define APERFMPERF_STALE_THRESHOLD_MS 1000 /* @@ -38,8 +40,6 @@ static void aperfmperf_snapshot_khz(void *dummy) u64 aperf, aperf_delta; u64 mperf, mperf_delta; struct aperfmperf_sample *s = this_cpu_ptr(&samples); - ktime_t now = ktime_get(); - s64 time_delta = ktime_ms_delta(now, s->time); unsigned long flags; local_irq_save(flags); @@ -57,38 +57,68 @@ static void aperfmperf_snapshot_khz(void *dummy) if (mperf_delta == 0) return; - s->time = now; + s->time = ktime_get(); s->aperf = aperf; s->mperf = mperf; - - /* If the previous iteration was too long ago, discard it. */ - if (time_delta > APERFMPERF_STALE_THRESHOLD_MS) - s->khz = 0; - else - s->khz = div64_u64((cpu_khz * aperf_delta), mperf_delta); + s->khz = div64_u64((cpu_khz * aperf_delta), mperf_delta); } -unsigned int arch_freq_get_on_cpu(int cpu) +static bool aperfmperf_snapshot_cpu(int cpu, ktime_t now, bool wait) { - s64 time_delta; - unsigned int khz; + s64 time_delta = ktime_ms_delta(now, per_cpu(samples.time, cpu)); + + /* Don't bother re-computing within the cache threshold time. */ + if (time_delta < APERFMPERF_CACHE_THRESHOLD_MS) + return true; + + smp_call_function_single(cpu, aperfmperf_snapshot_khz, NULL, wait); + + /* Return false if the previous iteration was too long ago. */ + return time_delta <= APERFMPERF_STALE_THRESHOLD_MS; +} +unsigned int aperfmperf_get_khz(int cpu) +{ if (!cpu_khz) return 0; if (!static_cpu_has(X86_FEATURE_APERFMPERF)) return 0; - /* Don't bother re-computing within the cache threshold time. 
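The snapshot above boils down to khz = cpu_khz * aperf_delta / mperf_delta. A stand-alone sketch of just that arithmetic, with invented counter deltas instead of real MSR reads, shows how a turbo-boosted CPU reports more than its nominal frequency:

/* aperfmperf_ratio.c - illustrate khz = cpu_khz * d_aperf / d_mperf.
 * The sample counter values below are invented for demonstration only.
 */
#include <stdint.h>
#include <stdio.h>

static uint64_t scaled_khz(uint64_t cpu_khz, uint64_t aperf_delta,
                           uint64_t mperf_delta)
{
        if (!mperf_delta)       /* mirror the kernel's early return */
                return 0;
        return cpu_khz * aperf_delta / mperf_delta;
}

int main(void)
{
        uint64_t cpu_khz = 2400000;             /* nominal 2.4 GHz */
        uint64_t aperf_delta = 3600000000ULL;   /* actual cycles in the window */
        uint64_t mperf_delta = 2400000000ULL;   /* reference cycles in the window */

        printf("effective frequency: %llu kHz\n",
               (unsigned long long)scaled_khz(cpu_khz, aperf_delta, mperf_delta));
        return 0;
}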
*/ - time_delta = ktime_ms_delta(ktime_get(), per_cpu(samples.time, cpu)); - khz = per_cpu(samples.khz, cpu); - if (khz && time_delta < APERFMPERF_CACHE_THRESHOLD_MS) - return khz; + aperfmperf_snapshot_cpu(cpu, ktime_get(), true); + return per_cpu(samples.khz, cpu); +} - smp_call_function_single(cpu, aperfmperf_snapshot_khz, NULL, 1); - khz = per_cpu(samples.khz, cpu); - if (khz) - return khz; +void arch_freq_prepare_all(void) +{ + ktime_t now = ktime_get(); + bool wait = false; + int cpu; + + if (!cpu_khz) + return; + + if (!static_cpu_has(X86_FEATURE_APERFMPERF)) + return; + + for_each_online_cpu(cpu) + if (!aperfmperf_snapshot_cpu(cpu, now, false)) + wait = true; + + if (wait) + msleep(APERFMPERF_REFRESH_DELAY_MS); +} + +unsigned int arch_freq_get_on_cpu(int cpu) +{ + if (!cpu_khz) + return 0; + + if (!static_cpu_has(X86_FEATURE_APERFMPERF)) + return 0; + + if (aperfmperf_snapshot_cpu(cpu, ktime_get(), true)) + return per_cpu(samples.khz, cpu); msleep(APERFMPERF_REFRESH_DELAY_MS); smp_call_function_single(cpu, aperfmperf_snapshot_khz, NULL, 1); diff --git a/arch/x86/kernel/cpu/bugs.c b/arch/x86/kernel/cpu/bugs.c index ba0b2424c9b0..bfca937bdcc3 100644 --- a/arch/x86/kernel/cpu/bugs.c +++ b/arch/x86/kernel/cpu/bugs.c @@ -10,6 +10,11 @@ */ #include <linux/init.h> #include <linux/utsname.h> +#include <linux/cpu.h> +#include <linux/module.h> + +#include <asm/nospec-branch.h> +#include <asm/cmdline.h> #include <asm/bugs.h> #include <asm/processor.h> #include <asm/processor-flags.h> @@ -19,6 +24,9 @@ #include <asm/alternative.h> #include <asm/pgtable.h> #include <asm/set_memory.h> +#include <asm/intel-family.h> + +static void __init spectre_v2_select_mitigation(void); void __init check_bugs(void) { @@ -29,6 +37,9 @@ void __init check_bugs(void) print_cpu_info(&boot_cpu_data); } + /* Select the proper spectre mitigation before patching alternatives */ + spectre_v2_select_mitigation(); + #ifdef CONFIG_X86_32 /* * Check whether we are able to run this kernel safely on SMP. @@ -60,3 +71,273 @@ void __init check_bugs(void) set_memory_4k((unsigned long)__va(0), 1); #endif } + +/* The kernel command line selection */ +enum spectre_v2_mitigation_cmd { + SPECTRE_V2_CMD_NONE, + SPECTRE_V2_CMD_AUTO, + SPECTRE_V2_CMD_FORCE, + SPECTRE_V2_CMD_RETPOLINE, + SPECTRE_V2_CMD_RETPOLINE_GENERIC, + SPECTRE_V2_CMD_RETPOLINE_AMD, +}; + +static const char *spectre_v2_strings[] = { + [SPECTRE_V2_NONE] = "Vulnerable", + [SPECTRE_V2_RETPOLINE_MINIMAL] = "Vulnerable: Minimal generic ASM retpoline", + [SPECTRE_V2_RETPOLINE_MINIMAL_AMD] = "Vulnerable: Minimal AMD ASM retpoline", + [SPECTRE_V2_RETPOLINE_GENERIC] = "Mitigation: Full generic retpoline", + [SPECTRE_V2_RETPOLINE_AMD] = "Mitigation: Full AMD retpoline", +}; + +#undef pr_fmt +#define pr_fmt(fmt) "Spectre V2 : " fmt + +static enum spectre_v2_mitigation spectre_v2_enabled = SPECTRE_V2_NONE; + +#ifdef RETPOLINE +static bool spectre_v2_bad_module; + +bool retpoline_module_ok(bool has_retpoline) +{ + if (spectre_v2_enabled == SPECTRE_V2_NONE || has_retpoline) + return true; + + pr_err("System may be vulnerable to spectre v2\n"); + spectre_v2_bad_module = true; + return false; +} + +static inline const char *spectre_v2_module_string(void) +{ + return spectre_v2_bad_module ? 
" - vulnerable module loaded" : ""; +} +#else +static inline const char *spectre_v2_module_string(void) { return ""; } +#endif + +static void __init spec2_print_if_insecure(const char *reason) +{ + if (boot_cpu_has_bug(X86_BUG_SPECTRE_V2)) + pr_info("%s selected on command line.\n", reason); +} + +static void __init spec2_print_if_secure(const char *reason) +{ + if (!boot_cpu_has_bug(X86_BUG_SPECTRE_V2)) + pr_info("%s selected on command line.\n", reason); +} + +static inline bool retp_compiler(void) +{ + return __is_defined(RETPOLINE); +} + +static inline bool match_option(const char *arg, int arglen, const char *opt) +{ + int len = strlen(opt); + + return len == arglen && !strncmp(arg, opt, len); +} + +static const struct { + const char *option; + enum spectre_v2_mitigation_cmd cmd; + bool secure; +} mitigation_options[] = { + { "off", SPECTRE_V2_CMD_NONE, false }, + { "on", SPECTRE_V2_CMD_FORCE, true }, + { "retpoline", SPECTRE_V2_CMD_RETPOLINE, false }, + { "retpoline,amd", SPECTRE_V2_CMD_RETPOLINE_AMD, false }, + { "retpoline,generic", SPECTRE_V2_CMD_RETPOLINE_GENERIC, false }, + { "auto", SPECTRE_V2_CMD_AUTO, false }, +}; + +static enum spectre_v2_mitigation_cmd __init spectre_v2_parse_cmdline(void) +{ + char arg[20]; + int ret, i; + enum spectre_v2_mitigation_cmd cmd = SPECTRE_V2_CMD_AUTO; + + if (cmdline_find_option_bool(boot_command_line, "nospectre_v2")) + return SPECTRE_V2_CMD_NONE; + else { + ret = cmdline_find_option(boot_command_line, "spectre_v2", arg, sizeof(arg)); + if (ret < 0) + return SPECTRE_V2_CMD_AUTO; + + for (i = 0; i < ARRAY_SIZE(mitigation_options); i++) { + if (!match_option(arg, ret, mitigation_options[i].option)) + continue; + cmd = mitigation_options[i].cmd; + break; + } + + if (i >= ARRAY_SIZE(mitigation_options)) { + pr_err("unknown option (%s). Switching to AUTO select\n", arg); + return SPECTRE_V2_CMD_AUTO; + } + } + + if ((cmd == SPECTRE_V2_CMD_RETPOLINE || + cmd == SPECTRE_V2_CMD_RETPOLINE_AMD || + cmd == SPECTRE_V2_CMD_RETPOLINE_GENERIC) && + !IS_ENABLED(CONFIG_RETPOLINE)) { + pr_err("%s selected but not compiled in. Switching to AUTO select\n", mitigation_options[i].option); + return SPECTRE_V2_CMD_AUTO; + } + + if (cmd == SPECTRE_V2_CMD_RETPOLINE_AMD && + boot_cpu_data.x86_vendor != X86_VENDOR_AMD) { + pr_err("retpoline,amd selected but CPU is not AMD. Switching to AUTO select\n"); + return SPECTRE_V2_CMD_AUTO; + } + + if (mitigation_options[i].secure) + spec2_print_if_secure(mitigation_options[i].option); + else + spec2_print_if_insecure(mitigation_options[i].option); + + return cmd; +} + +/* Check for Skylake-like CPUs (for RSB handling) */ +static bool __init is_skylake_era(void) +{ + if (boot_cpu_data.x86_vendor == X86_VENDOR_INTEL && + boot_cpu_data.x86 == 6) { + switch (boot_cpu_data.x86_model) { + case INTEL_FAM6_SKYLAKE_MOBILE: + case INTEL_FAM6_SKYLAKE_DESKTOP: + case INTEL_FAM6_SKYLAKE_X: + case INTEL_FAM6_KABYLAKE_MOBILE: + case INTEL_FAM6_KABYLAKE_DESKTOP: + return true; + } + } + return false; +} + +static void __init spectre_v2_select_mitigation(void) +{ + enum spectre_v2_mitigation_cmd cmd = spectre_v2_parse_cmdline(); + enum spectre_v2_mitigation mode = SPECTRE_V2_NONE; + + /* + * If the CPU is not affected and the command line mode is NONE or AUTO + * then nothing to do. 
+ */ + if (!boot_cpu_has_bug(X86_BUG_SPECTRE_V2) && + (cmd == SPECTRE_V2_CMD_NONE || cmd == SPECTRE_V2_CMD_AUTO)) + return; + + switch (cmd) { + case SPECTRE_V2_CMD_NONE: + return; + + case SPECTRE_V2_CMD_FORCE: + case SPECTRE_V2_CMD_AUTO: + if (IS_ENABLED(CONFIG_RETPOLINE)) + goto retpoline_auto; + break; + case SPECTRE_V2_CMD_RETPOLINE_AMD: + if (IS_ENABLED(CONFIG_RETPOLINE)) + goto retpoline_amd; + break; + case SPECTRE_V2_CMD_RETPOLINE_GENERIC: + if (IS_ENABLED(CONFIG_RETPOLINE)) + goto retpoline_generic; + break; + case SPECTRE_V2_CMD_RETPOLINE: + if (IS_ENABLED(CONFIG_RETPOLINE)) + goto retpoline_auto; + break; + } + pr_err("Spectre mitigation: kernel not compiled with retpoline; no mitigation available!"); + return; + +retpoline_auto: + if (boot_cpu_data.x86_vendor == X86_VENDOR_AMD) { + retpoline_amd: + if (!boot_cpu_has(X86_FEATURE_LFENCE_RDTSC)) { + pr_err("Spectre mitigation: LFENCE not serializing, switching to generic retpoline\n"); + goto retpoline_generic; + } + mode = retp_compiler() ? SPECTRE_V2_RETPOLINE_AMD : + SPECTRE_V2_RETPOLINE_MINIMAL_AMD; + setup_force_cpu_cap(X86_FEATURE_RETPOLINE_AMD); + setup_force_cpu_cap(X86_FEATURE_RETPOLINE); + } else { + retpoline_generic: + mode = retp_compiler() ? SPECTRE_V2_RETPOLINE_GENERIC : + SPECTRE_V2_RETPOLINE_MINIMAL; + setup_force_cpu_cap(X86_FEATURE_RETPOLINE); + } + + spectre_v2_enabled = mode; + pr_info("%s\n", spectre_v2_strings[mode]); + + /* + * If neither SMEP nor PTI are available, there is a risk of + * hitting userspace addresses in the RSB after a context switch + * from a shallow call stack to a deeper one. To prevent this fill + * the entire RSB, even when using IBRS. + * + * Skylake era CPUs have a separate issue with *underflow* of the + * RSB, when they will predict 'ret' targets from the generic BTB. + * The proper mitigation for this is IBRS. If IBRS is not supported + * or deactivated in favour of retpolines the RSB fill on context + * switch is required. + */ + if ((!boot_cpu_has(X86_FEATURE_PTI) && + !boot_cpu_has(X86_FEATURE_SMEP)) || is_skylake_era()) { + setup_force_cpu_cap(X86_FEATURE_RSB_CTXSW); + pr_info("Spectre v2 mitigation: Filling RSB on context switch\n"); + } + + /* Initialize Indirect Branch Prediction Barrier if supported */ + if (boot_cpu_has(X86_FEATURE_IBPB)) { + setup_force_cpu_cap(X86_FEATURE_USE_IBPB); + pr_info("Spectre v2 mitigation: Enabling Indirect Branch Prediction Barrier\n"); + } + + /* + * Retpoline means the kernel is safe because it has no indirect + * branches. But firmware isn't, so use IBRS to protect that. 
+ */ + if (boot_cpu_has(X86_FEATURE_IBRS)) { + setup_force_cpu_cap(X86_FEATURE_USE_IBRS_FW); + pr_info("Enabling Restricted Speculation for firmware calls\n"); + } +} + +#undef pr_fmt + +#ifdef CONFIG_SYSFS +ssize_t cpu_show_meltdown(struct device *dev, struct device_attribute *attr, char *buf) +{ + if (!boot_cpu_has_bug(X86_BUG_CPU_MELTDOWN)) + return sprintf(buf, "Not affected\n"); + if (boot_cpu_has(X86_FEATURE_PTI)) + return sprintf(buf, "Mitigation: PTI\n"); + return sprintf(buf, "Vulnerable\n"); +} + +ssize_t cpu_show_spectre_v1(struct device *dev, struct device_attribute *attr, char *buf) +{ + if (!boot_cpu_has_bug(X86_BUG_SPECTRE_V1)) + return sprintf(buf, "Not affected\n"); + return sprintf(buf, "Mitigation: __user pointer sanitization\n"); +} + +ssize_t cpu_show_spectre_v2(struct device *dev, struct device_attribute *attr, char *buf) +{ + if (!boot_cpu_has_bug(X86_BUG_SPECTRE_V2)) + return sprintf(buf, "Not affected\n"); + + return sprintf(buf, "%s%s%s%s\n", spectre_v2_strings[spectre_v2_enabled], + boot_cpu_has(X86_FEATURE_USE_IBPB) ? ", IBPB" : "", + boot_cpu_has(X86_FEATURE_USE_IBRS_FW) ? ", IBRS_FW" : "", + spectre_v2_module_string()); +} +#endif diff --git a/arch/x86/kernel/cpu/centaur.c b/arch/x86/kernel/cpu/centaur.c index 68bc6d9b3132..e5ec0f11c0de 100644 --- a/arch/x86/kernel/cpu/centaur.c +++ b/arch/x86/kernel/cpu/centaur.c @@ -106,6 +106,10 @@ static void early_init_centaur(struct cpuinfo_x86 *c) #ifdef CONFIG_X86_64 set_cpu_cap(c, X86_FEATURE_SYSENTER32); #endif + if (c->x86_power & (1 << 8)) { + set_cpu_cap(c, X86_FEATURE_CONSTANT_TSC); + set_cpu_cap(c, X86_FEATURE_NONSTOP_TSC); + } } static void init_centaur(struct cpuinfo_x86 *c) @@ -136,7 +140,7 @@ static void init_centaur(struct cpuinfo_x86 *c) clear_cpu_cap(c, X86_FEATURE_TSC); break; case 8: - switch (c->x86_mask) { + switch (c->x86_stepping) { default: name = "2"; break; @@ -211,7 +215,7 @@ centaur_size_cache(struct cpuinfo_x86 *c, unsigned int size) * - Note, it seems this may only be in engineering samples. */ if ((c->x86 == 6) && (c->x86_model == 9) && - (c->x86_mask == 1) && (size == 65)) + (c->x86_stepping == 1) && (size == 65)) size -= 1; return size; } diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c index c9176bae7fd8..348cf4821240 100644 --- a/arch/x86/kernel/cpu/common.c +++ b/arch/x86/kernel/cpu/common.c @@ -47,6 +47,8 @@ #include <asm/pat.h> #include <asm/microcode.h> #include <asm/microcode_intel.h> +#include <asm/intel-family.h> +#include <asm/cpu_device_id.h> #ifdef CONFIG_X86_LOCAL_APIC #include <asm/uv/uv.h> @@ -329,6 +331,30 @@ static __always_inline void setup_smap(struct cpuinfo_x86 *c) } } +static __always_inline void setup_umip(struct cpuinfo_x86 *c) +{ + /* Check the boot processor, plus build option for UMIP. */ + if (!cpu_feature_enabled(X86_FEATURE_UMIP)) + goto out; + + /* Check the current processor's cpuid bits. */ + if (!cpu_has(c, X86_FEATURE_UMIP)) + goto out; + + cr4_set_bits(X86_CR4_UMIP); + + pr_info("x86/cpu: Activated the Intel User Mode Instruction Prevention (UMIP) CPU feature\n"); + + return; + +out: + /* + * Make sure UMIP is disabled in case it was enabled in a + * previous boot (e.g., via kexec). + */ + cr4_clear_bits(X86_CR4_UMIP); +} + /* * Protection Keys are not available in 32-bit mode. 
*/ @@ -452,8 +478,8 @@ static const char *table_lookup_model(struct cpuinfo_x86 *c) return NULL; /* Not found */ } -__u32 cpu_caps_cleared[NCAPINTS]; -__u32 cpu_caps_set[NCAPINTS]; +__u32 cpu_caps_cleared[NCAPINTS + NBUGINTS]; +__u32 cpu_caps_set[NCAPINTS + NBUGINTS]; void load_percpu_segment(int cpu) { @@ -466,28 +492,23 @@ void load_percpu_segment(int cpu) load_stack_canary_segment(); } -/* Setup the fixmap mapping only once per-processor */ -static inline void setup_fixmap_gdt(int cpu) -{ -#ifdef CONFIG_X86_64 - /* On 64-bit systems, we use a read-only fixmap GDT. */ - pgprot_t prot = PAGE_KERNEL_RO; -#else - /* - * On native 32-bit systems, the GDT cannot be read-only because - * our double fault handler uses a task gate, and entering through - * a task gate needs to change an available TSS to busy. If the GDT - * is read-only, that will triple fault. - * - * On Xen PV, the GDT must be read-only because the hypervisor requires - * it. - */ - pgprot_t prot = boot_cpu_has(X86_FEATURE_XENPV) ? - PAGE_KERNEL_RO : PAGE_KERNEL; +#ifdef CONFIG_X86_32 +/* The 32-bit entry code needs to find cpu_entry_area. */ +DEFINE_PER_CPU(struct cpu_entry_area *, cpu_entry_area); #endif - __set_fixmap(get_cpu_gdt_ro_index(cpu), get_cpu_gdt_paddr(cpu), prot); -} +#ifdef CONFIG_X86_64 +/* + * Special IST stacks which the CPU switches to when it calls + * an IST-marked descriptor entry. Up to 7 stacks (hardware + * limit), all of them are 4K, except the debug stack which + * is 8K. + */ +static const unsigned int exception_stack_sizes[N_EXCEPTION_STACKS] = { + [0 ... N_EXCEPTION_STACKS - 1] = EXCEPTION_STKSZ, + [DEBUG_STACK - 1] = DEBUG_STKSZ +}; +#endif /* Load the original GDT from the per-cpu structure */ void load_direct_gdt(int cpu) @@ -710,7 +731,7 @@ void cpu_detect(struct cpuinfo_x86 *c) cpuid(0x00000001, &tfms, &misc, &junk, &cap0); c->x86 = x86_family(tfms); c->x86_model = x86_model(tfms); - c->x86_mask = x86_stepping(tfms); + c->x86_stepping = x86_stepping(tfms); if (cap0 & (1<<19)) { c->x86_clflush_size = ((misc >> 8) & 0xff) * 8; @@ -723,12 +744,32 @@ static void apply_forced_caps(struct cpuinfo_x86 *c) { int i; - for (i = 0; i < NCAPINTS; i++) { + for (i = 0; i < NCAPINTS + NBUGINTS; i++) { c->x86_capability[i] &= ~cpu_caps_cleared[i]; c->x86_capability[i] |= cpu_caps_set[i]; } } +static void init_speculation_control(struct cpuinfo_x86 *c) +{ + /* + * The Intel SPEC_CTRL CPUID bit implies IBRS and IBPB support, + * and they also have a different bit for STIBP support. Also, + * a hypervisor might have set the individual AMD bits even on + * Intel CPUs, for finer-grained selection of what's available. + * + * We use the AMD bits in 0x8000_0008 EBX as the generic hardware + * features, which are visible in /proc/cpuinfo and used by the + * kernel. So set those accordingly from the Intel bits. 
+ */ + if (cpu_has(c, X86_FEATURE_SPEC_CTRL)) { + set_cpu_cap(c, X86_FEATURE_IBRS); + set_cpu_cap(c, X86_FEATURE_IBPB); + } + if (cpu_has(c, X86_FEATURE_INTEL_STIBP)) + set_cpu_cap(c, X86_FEATURE_STIBP); +} + void get_cpu_cap(struct cpuinfo_x86 *c) { u32 eax, ebx, ecx, edx; @@ -750,6 +791,7 @@ void get_cpu_cap(struct cpuinfo_x86 *c) cpuid_count(0x00000007, 0, &eax, &ebx, &ecx, &edx); c->x86_capability[CPUID_7_0_EBX] = ebx; c->x86_capability[CPUID_7_ECX] = ecx; + c->x86_capability[CPUID_7_EDX] = edx; } /* Extended state features: level 0x0000000d */ @@ -822,6 +864,7 @@ void get_cpu_cap(struct cpuinfo_x86 *c) c->x86_capability[CPUID_8000_000A_EDX] = cpuid_edx(0x8000000a); init_scattered_cpuid_features(c); + init_speculation_control(c); /* * Clear/Set all flags overridden by options, after probe. @@ -857,14 +900,49 @@ static void identify_cpu_without_cpuid(struct cpuinfo_x86 *c) #endif } +static const __initconst struct x86_cpu_id cpu_no_speculation[] = { + { X86_VENDOR_INTEL, 6, INTEL_FAM6_ATOM_CEDARVIEW, X86_FEATURE_ANY }, + { X86_VENDOR_INTEL, 6, INTEL_FAM6_ATOM_CLOVERVIEW, X86_FEATURE_ANY }, + { X86_VENDOR_INTEL, 6, INTEL_FAM6_ATOM_LINCROFT, X86_FEATURE_ANY }, + { X86_VENDOR_INTEL, 6, INTEL_FAM6_ATOM_PENWELL, X86_FEATURE_ANY }, + { X86_VENDOR_INTEL, 6, INTEL_FAM6_ATOM_PINEVIEW, X86_FEATURE_ANY }, + { X86_VENDOR_CENTAUR, 5 }, + { X86_VENDOR_INTEL, 5 }, + { X86_VENDOR_NSC, 5 }, + { X86_VENDOR_ANY, 4 }, + {} +}; + +static const __initconst struct x86_cpu_id cpu_no_meltdown[] = { + { X86_VENDOR_AMD }, + {} +}; + +static bool __init cpu_vulnerable_to_meltdown(struct cpuinfo_x86 *c) +{ + u64 ia32_cap = 0; + + if (x86_match_cpu(cpu_no_meltdown)) + return false; + + if (cpu_has(c, X86_FEATURE_ARCH_CAPABILITIES)) + rdmsrl(MSR_IA32_ARCH_CAPABILITIES, ia32_cap); + + /* Rogue Data Cache Load? No! */ + if (ia32_cap & ARCH_CAP_RDCL_NO) + return false; + + return true; +} + /* * Do minimum CPU detection early. * Fields really needed: vendor, cpuid_level, family, model, mask, * cache alignment. * The others are not touched to avoid unwanted side effects. * - * WARNING: this function is only called on the BP. Don't add code here - * that is supposed to run on all CPUs. + * WARNING: this function is only called on the boot CPU. Don't add code + * here that is supposed to run on all CPUs. */ static void __init early_identify_cpu(struct cpuinfo_x86 *c) { @@ -903,6 +981,14 @@ static void __init early_identify_cpu(struct cpuinfo_x86 *c) } setup_force_cpu_cap(X86_FEATURE_ALWAYS); + + if (!x86_match_cpu(cpu_no_speculation)) { + if (cpu_vulnerable_to_meltdown(c)) + setup_force_cpu_bug(X86_BUG_CPU_MELTDOWN); + setup_force_cpu_bug(X86_BUG_SPECTRE_V1); + setup_force_cpu_bug(X86_BUG_SPECTRE_V2); + } + fpu__init_system(c); #ifdef CONFIG_X86_32 @@ -1098,9 +1184,9 @@ static void identify_cpu(struct cpuinfo_x86 *c) int i; c->loops_per_jiffy = loops_per_jiffy; - c->x86_cache_size = -1; + c->x86_cache_size = 0; c->x86_vendor = X86_VENDOR_UNKNOWN; - c->x86_model = c->x86_mask = 0; /* So far unknown... */ + c->x86_model = c->x86_stepping = 0; /* So far unknown... */ c->x86_vendor_id[0] = '\0'; /* Unset */ c->x86_model_id[0] = '\0'; /* Unset */ c->x86_max_cores = 1; @@ -1147,9 +1233,10 @@ static void identify_cpu(struct cpuinfo_x86 *c) /* Disable the PN if appropriate */ squash_the_stupid_serial_number(c); - /* Set up SMEP/SMAP */ + /* Set up SMEP/SMAP/UMIP */ setup_smep(c); setup_smap(c); + setup_umip(c); /* * The vendor-specific functions might have changed features. 
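Both of the new checks in this hunk rely on fresh enumeration: CPUID.(EAX=7,ECX=0):EDX advertises SPEC_CTRL, STIBP and the IA32_ARCH_CAPABILITIES MSR, and RDCL_NO itself is bit 0 of that MSR (0x10a). A small user-space probe of the CPUID side, for illustration only (reading the MSR additionally needs root and the msr driver):

/* spec_enum.c - probe speculation-control enumeration bits (x86, GCC/Clang). */
#include <cpuid.h>
#include <stdio.h>

int main(void)
{
        unsigned int eax, ebx, ecx, edx;

        if (!__get_cpuid_count(7, 0, &eax, &ebx, &ecx, &edx)) {
                puts("CPUID leaf 7 not available");
                return 1;
        }
        printf("SPEC_CTRL (IBRS/IBPB)  : %s\n", (edx & (1u << 26)) ? "yes" : "no");
        printf("STIBP                  : %s\n", (edx & (1u << 27)) ? "yes" : "no");
        printf("IA32_ARCH_CAPABILITIES : %s\n", (edx & (1u << 29)) ? "yes" : "no");
        /* If the last line says yes, RDCL_NO is bit 0 of MSR 0x10a. */
        return 0;
}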
@@ -1225,7 +1312,7 @@ void enable_sep_cpu(void) return; cpu = get_cpu(); - tss = &per_cpu(cpu_tss, cpu); + tss = &per_cpu(cpu_tss_rw, cpu); /* * We cache MSR_IA32_SYSENTER_CS's value in the TSS's ss1 field -- @@ -1234,11 +1321,7 @@ void enable_sep_cpu(void) tss->x86_tss.ss1 = __KERNEL_CS; wrmsr(MSR_IA32_SYSENTER_CS, tss->x86_tss.ss1, 0); - - wrmsr(MSR_IA32_SYSENTER_ESP, - (unsigned long)tss + offsetofend(struct tss_struct, SYSENTER_stack), - 0); - + wrmsr(MSR_IA32_SYSENTER_ESP, (unsigned long)(cpu_entry_stack(cpu) + 1), 0); wrmsr(MSR_IA32_SYSENTER_EIP, (unsigned long)entry_SYSENTER_32, 0); put_cpu(); @@ -1295,24 +1378,22 @@ void print_cpu_info(struct cpuinfo_x86 *c) pr_cont(" (family: 0x%x, model: 0x%x", c->x86, c->x86_model); - if (c->x86_mask || c->cpuid_level >= 0) - pr_cont(", stepping: 0x%x)\n", c->x86_mask); + if (c->x86_stepping || c->cpuid_level >= 0) + pr_cont(", stepping: 0x%x)\n", c->x86_stepping); else pr_cont(")\n"); } -static __init int setup_disablecpuid(char *arg) +/* + * clearcpuid= was already parsed in fpu__init_parse_early_param. + * But we need to keep a dummy __setup around otherwise it would + * show up as an environment variable for init. + */ +static __init int setup_clearcpuid(char *arg) { - int bit; - - if (get_option(&arg, &bit) && bit >= 0 && bit < NCAPINTS * 32) - setup_clear_cpu_cap(bit); - else - return 0; - return 1; } -__setup("clearcpuid=", setup_disablecpuid); +__setup("clearcpuid=", setup_clearcpuid); #ifdef CONFIG_X86_64 DEFINE_PER_CPU_FIRST(union irq_stack_union, @@ -1334,25 +1415,22 @@ DEFINE_PER_CPU(unsigned int, irq_count) __visible = -1; DEFINE_PER_CPU(int, __preempt_count) = INIT_PREEMPT_COUNT; EXPORT_PER_CPU_SYMBOL(__preempt_count); -/* - * Special IST stacks which the CPU switches to when it calls - * an IST-marked descriptor entry. Up to 7 stacks (hardware - * limit), all of them are 4K, except the debug stack which - * is 8K. - */ -static const unsigned int exception_stack_sizes[N_EXCEPTION_STACKS] = { - [0 ... N_EXCEPTION_STACKS - 1] = EXCEPTION_STKSZ, - [DEBUG_STACK - 1] = DEBUG_STKSZ -}; - -static DEFINE_PER_CPU_PAGE_ALIGNED(char, exception_stacks - [(N_EXCEPTION_STACKS - 1) * EXCEPTION_STKSZ + DEBUG_STKSZ]); - /* May not be marked __init: used by software suspend */ void syscall_init(void) { + extern char _entry_trampoline[]; + extern char entry_SYSCALL_64_trampoline[]; + + int cpu = smp_processor_id(); + unsigned long SYSCALL64_entry_trampoline = + (unsigned long)get_cpu_entry_area(cpu)->entry_trampoline + + (entry_SYSCALL_64_trampoline - _entry_trampoline); + wrmsr(MSR_STAR, 0, (__USER32_CS << 16) | __KERNEL_CS); - wrmsrl(MSR_LSTAR, (unsigned long)entry_SYSCALL_64); + if (static_cpu_has(X86_FEATURE_PTI)) + wrmsrl(MSR_LSTAR, SYSCALL64_entry_trampoline); + else + wrmsrl(MSR_LSTAR, (unsigned long)entry_SYSCALL_64); #ifdef CONFIG_IA32_EMULATION wrmsrl(MSR_CSTAR, (unsigned long)entry_SYSCALL_compat); @@ -1363,7 +1441,7 @@ void syscall_init(void) * AMD doesn't allow SYSENTER in long mode (either 32- or 64-bit). 
*/ wrmsrl_safe(MSR_IA32_SYSENTER_CS, (u64)__KERNEL_CS); - wrmsrl_safe(MSR_IA32_SYSENTER_ESP, 0ULL); + wrmsrl_safe(MSR_IA32_SYSENTER_ESP, (unsigned long)(cpu_entry_stack(cpu) + 1)); wrmsrl_safe(MSR_IA32_SYSENTER_EIP, (u64)entry_SYSENTER_compat); #else wrmsrl(MSR_CSTAR, (unsigned long)ignore_sysret); @@ -1507,7 +1585,7 @@ void cpu_init(void) if (cpu) load_ucode_ap(); - t = &per_cpu(cpu_tss, cpu); + t = &per_cpu(cpu_tss_rw, cpu); oist = &per_cpu(orig_ist, cpu); #ifdef CONFIG_NUMA @@ -1546,7 +1624,7 @@ void cpu_init(void) * set up and load the per-CPU TSS */ if (!oist->ist[0]) { - char *estacks = per_cpu(exception_stacks, cpu); + char *estacks = get_cpu_entry_area(cpu)->exception_stacks; for (v = 0; v < N_EXCEPTION_STACKS; v++) { estacks += exception_stack_sizes[v]; @@ -1557,7 +1635,7 @@ void cpu_init(void) } } - t->x86_tss.io_bitmap_base = offsetof(struct tss_struct, io_bitmap); + t->x86_tss.io_bitmap_base = IO_BITMAP_OFFSET; /* * <= is required because the CPU will access up to @@ -1572,9 +1650,14 @@ void cpu_init(void) initialize_tlbstate_and_flush(); enter_lazy_tlb(&init_mm, me); - load_sp0(t, ¤t->thread); - set_tss_desc(cpu, t); + /* + * Initialize the TSS. sp0 points to the entry trampoline stack + * regardless of what task is running. + */ + set_tss_desc(cpu, &get_cpu_entry_area(cpu)->tss.x86_tss); load_TR_desc(); + load_sp0((unsigned long)(cpu_entry_stack(cpu) + 1)); + load_mm_ldt(&init_mm); clear_all_debug_regs(); @@ -1585,7 +1668,6 @@ void cpu_init(void) if (is_uv_system()) uv_cpu_init(); - setup_fixmap_gdt(cpu); load_fixmap_gdt(cpu); } @@ -1595,8 +1677,7 @@ void cpu_init(void) { int cpu = smp_processor_id(); struct task_struct *curr = current; - struct tss_struct *t = &per_cpu(cpu_tss, cpu); - struct thread_struct *thread = &curr->thread; + struct tss_struct *t = &per_cpu(cpu_tss_rw, cpu); wait_for_master_cpu(cpu); @@ -1627,12 +1708,16 @@ void cpu_init(void) initialize_tlbstate_and_flush(); enter_lazy_tlb(&init_mm, curr); - load_sp0(t, thread); - set_tss_desc(cpu, t); + /* + * Initialize the TSS. Don't bother initializing sp0, as the initial + * task never enters user mode. + */ + set_tss_desc(cpu, &get_cpu_entry_area(cpu)->tss.x86_tss); load_TR_desc(); + load_mm_ldt(&init_mm); - t->x86_tss.io_bitmap_base = offsetof(struct tss_struct, io_bitmap); + t->x86_tss.io_bitmap_base = IO_BITMAP_OFFSET; #ifdef CONFIG_DOUBLEFAULT /* Set up doublefault TSS pointer in the GDT */ @@ -1644,7 +1729,6 @@ void cpu_init(void) fpu__init_cpu(); - setup_fixmap_gdt(cpu); load_fixmap_gdt(cpu); } #endif @@ -1665,3 +1749,33 @@ static int __init init_cpu_syscore(void) return 0; } core_initcall(init_cpu_syscore); + +/* + * The microcode loader calls this upon late microcode load to recheck features, + * only when microcode has been updated. Caller holds microcode_mutex and CPU + * hotplug lock. + */ +void microcode_check(void) +{ + struct cpuinfo_x86 info; + + perf_check_microcode(); + + /* Reload CPUID max function as it might've changed. */ + info.cpuid_level = cpuid_eax(0); + + /* + * Copy all capability leafs to pick up the synthetic ones so that + * memcmp() below doesn't fail on that. The ones coming from CPUID will + * get overwritten in get_cpu_cap(). 
+ */ + memcpy(&info.x86_capability, &boot_cpu_data.x86_capability, sizeof(info.x86_capability)); + + get_cpu_cap(&info); + + if (!memcmp(&info.x86_capability, &boot_cpu_data.x86_capability, sizeof(info.x86_capability))) + return; + + pr_warn("x86/CPU: CPU features have changed after loading microcode, but might not take effect.\n"); + pr_warn("x86/CPU: Please consider either early loading through initrd/built-in or a potential BIOS update.\n"); +} diff --git a/arch/x86/kernel/cpu/cpu.h b/arch/x86/kernel/cpu/cpu.h index f52a370b6c00..e806b11a99af 100644 --- a/arch/x86/kernel/cpu/cpu.h +++ b/arch/x86/kernel/cpu/cpu.h @@ -47,4 +47,7 @@ extern const struct cpu_dev *const __x86_cpu_dev_start[], extern void get_cpu_cap(struct cpuinfo_x86 *c); extern void cpu_detect_cache_sizes(struct cpuinfo_x86 *c); + +unsigned int aperfmperf_get_khz(int cpu); + #endif /* ARCH_X86_CPU_H */ diff --git a/arch/x86/kernel/cpu/cpuid-deps.c b/arch/x86/kernel/cpu/cpuid-deps.c new file mode 100644 index 000000000000..904b0a3c4e53 --- /dev/null +++ b/arch/x86/kernel/cpu/cpuid-deps.c @@ -0,0 +1,121 @@ +/* Declare dependencies between CPUIDs */ +#include <linux/kernel.h> +#include <linux/init.h> +#include <linux/module.h> +#include <asm/cpufeature.h> + +struct cpuid_dep { + unsigned int feature; + unsigned int depends; +}; + +/* + * Table of CPUID features that depend on others. + * + * This only includes dependencies that can be usefully disabled, not + * features part of the base set (like FPU). + * + * Note this all is not __init / __initdata because it can be + * called from cpu hotplug. It shouldn't do anything in this case, + * but it's difficult to tell that to the init reference checker. + */ +const static struct cpuid_dep cpuid_deps[] = { + { X86_FEATURE_XSAVEOPT, X86_FEATURE_XSAVE }, + { X86_FEATURE_XSAVEC, X86_FEATURE_XSAVE }, + { X86_FEATURE_XSAVES, X86_FEATURE_XSAVE }, + { X86_FEATURE_AVX, X86_FEATURE_XSAVE }, + { X86_FEATURE_PKU, X86_FEATURE_XSAVE }, + { X86_FEATURE_MPX, X86_FEATURE_XSAVE }, + { X86_FEATURE_XGETBV1, X86_FEATURE_XSAVE }, + { X86_FEATURE_FXSR_OPT, X86_FEATURE_FXSR }, + { X86_FEATURE_XMM, X86_FEATURE_FXSR }, + { X86_FEATURE_XMM2, X86_FEATURE_XMM }, + { X86_FEATURE_XMM3, X86_FEATURE_XMM2 }, + { X86_FEATURE_XMM4_1, X86_FEATURE_XMM2 }, + { X86_FEATURE_XMM4_2, X86_FEATURE_XMM2 }, + { X86_FEATURE_XMM3, X86_FEATURE_XMM2 }, + { X86_FEATURE_PCLMULQDQ, X86_FEATURE_XMM2 }, + { X86_FEATURE_SSSE3, X86_FEATURE_XMM2, }, + { X86_FEATURE_F16C, X86_FEATURE_XMM2, }, + { X86_FEATURE_AES, X86_FEATURE_XMM2 }, + { X86_FEATURE_SHA_NI, X86_FEATURE_XMM2 }, + { X86_FEATURE_FMA, X86_FEATURE_AVX }, + { X86_FEATURE_AVX2, X86_FEATURE_AVX, }, + { X86_FEATURE_AVX512F, X86_FEATURE_AVX, }, + { X86_FEATURE_AVX512IFMA, X86_FEATURE_AVX512F }, + { X86_FEATURE_AVX512PF, X86_FEATURE_AVX512F }, + { X86_FEATURE_AVX512ER, X86_FEATURE_AVX512F }, + { X86_FEATURE_AVX512CD, X86_FEATURE_AVX512F }, + { X86_FEATURE_AVX512DQ, X86_FEATURE_AVX512F }, + { X86_FEATURE_AVX512BW, X86_FEATURE_AVX512F }, + { X86_FEATURE_AVX512VL, X86_FEATURE_AVX512F }, + { X86_FEATURE_AVX512VBMI, X86_FEATURE_AVX512F }, + { X86_FEATURE_AVX512_VBMI2, X86_FEATURE_AVX512VL }, + { X86_FEATURE_GFNI, X86_FEATURE_AVX512VL }, + { X86_FEATURE_VAES, X86_FEATURE_AVX512VL }, + { X86_FEATURE_VPCLMULQDQ, X86_FEATURE_AVX512VL }, + { X86_FEATURE_AVX512_VNNI, X86_FEATURE_AVX512VL }, + { X86_FEATURE_AVX512_BITALG, X86_FEATURE_AVX512VL }, + { X86_FEATURE_AVX512_4VNNIW, X86_FEATURE_AVX512F }, + { X86_FEATURE_AVX512_4FMAPS, X86_FEATURE_AVX512F }, + { X86_FEATURE_AVX512_VPOPCNTDQ, 
X86_FEATURE_AVX512F }, + {} +}; + +static inline void clear_feature(struct cpuinfo_x86 *c, unsigned int feature) +{ + /* + * Note: This could use the non atomic __*_bit() variants, but the + * rest of the cpufeature code uses atomics as well, so keep it for + * consistency. Cleanup all of it separately. + */ + if (!c) { + clear_cpu_cap(&boot_cpu_data, feature); + set_bit(feature, (unsigned long *)cpu_caps_cleared); + } else { + clear_bit(feature, (unsigned long *)c->x86_capability); + } +} + +/* Take the capabilities and the BUG bits into account */ +#define MAX_FEATURE_BITS ((NCAPINTS + NBUGINTS) * sizeof(u32) * 8) + +static void do_clear_cpu_cap(struct cpuinfo_x86 *c, unsigned int feature) +{ + DECLARE_BITMAP(disable, MAX_FEATURE_BITS); + const struct cpuid_dep *d; + bool changed; + + if (WARN_ON(feature >= MAX_FEATURE_BITS)) + return; + + clear_feature(c, feature); + + /* Collect all features to disable, handling dependencies */ + memset(disable, 0, sizeof(disable)); + __set_bit(feature, disable); + + /* Loop until we get a stable state. */ + do { + changed = false; + for (d = cpuid_deps; d->feature; d++) { + if (!test_bit(d->depends, disable)) + continue; + if (__test_and_set_bit(d->feature, disable)) + continue; + + changed = true; + clear_feature(c, d->feature); + } + } while (changed); +} + +void clear_cpu_cap(struct cpuinfo_x86 *c, unsigned int feature) +{ + do_clear_cpu_cap(c, feature); +} + +void setup_clear_cpu_cap(unsigned int feature) +{ + do_clear_cpu_cap(NULL, feature); +} diff --git a/arch/x86/kernel/cpu/cyrix.c b/arch/x86/kernel/cpu/cyrix.c index 6b4bb335641f..8949b7ae6d92 100644 --- a/arch/x86/kernel/cpu/cyrix.c +++ b/arch/x86/kernel/cpu/cyrix.c @@ -215,7 +215,7 @@ static void init_cyrix(struct cpuinfo_x86 *c) /* common case step number/rev -- exceptions handled below */ c->x86_model = (dir1 >> 4) + 1; - c->x86_mask = dir1 & 0xf; + c->x86_stepping = dir1 & 0xf; /* Now cook; the original recipe is by Channing Corn, from Cyrix. 
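The do_clear_cpu_cap() loop above is a fixed-point (transitive-closure) walk over cpuid_deps[]: disabling one feature keeps dragging in everything that depends on it until nothing changes. A stand-alone sketch of the same iteration with a tiny made-up feature set:

/* feature_deps.c - fixed-point propagation of "feature depends on feature". */
#include <stdbool.h>
#include <stdio.h>

enum { FEAT_XSAVE, FEAT_AVX, FEAT_AVX2, FEAT_FMA, NFEAT };

static const char *names[NFEAT] = { "XSAVE", "AVX", "AVX2", "FMA" };

static const struct { int feature, depends; } deps[] = {
        { FEAT_AVX,  FEAT_XSAVE },
        { FEAT_AVX2, FEAT_AVX },
        { FEAT_FMA,  FEAT_AVX },
};

static void clear_with_deps(bool enabled[NFEAT], int feature)
{
        bool disable[NFEAT] = { false };
        bool changed;
        unsigned int i;

        disable[feature] = true;
        enabled[feature] = false;

        /* Loop until no new dependent feature gets dragged in. */
        do {
                changed = false;
                for (i = 0; i < sizeof(deps) / sizeof(deps[0]); i++) {
                        if (!disable[deps[i].depends] || disable[deps[i].feature])
                                continue;
                        disable[deps[i].feature] = true;
                        enabled[deps[i].feature] = false;
                        changed = true;
                }
        } while (changed);
}

int main(void)
{
        bool enabled[NFEAT] = { true, true, true, true };
        int i;

        clear_with_deps(enabled, FEAT_XSAVE);   /* also drops AVX, AVX2, FMA */
        for (i = 0; i < NFEAT; i++)
                printf("%-5s %s\n", names[i], enabled[i] ? "on" : "off");
        return 0;
}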
* We do the same thing for each generation: we work out diff --git a/arch/x86/kernel/cpu/hypervisor.c b/arch/x86/kernel/cpu/hypervisor.c index 4fa90006ac68..479ca4728de0 100644 --- a/arch/x86/kernel/cpu/hypervisor.c +++ b/arch/x86/kernel/cpu/hypervisor.c @@ -26,6 +26,13 @@ #include <asm/processor.h> #include <asm/hypervisor.h> +extern const struct hypervisor_x86 x86_hyper_vmware; +extern const struct hypervisor_x86 x86_hyper_ms_hyperv; +extern const struct hypervisor_x86 x86_hyper_xen_pv; +extern const struct hypervisor_x86 x86_hyper_xen_hvm; +extern const struct hypervisor_x86 x86_hyper_kvm; +extern const struct hypervisor_x86 x86_hyper_jailhouse; + static const __initconst struct hypervisor_x86 * const hypervisors[] = { #ifdef CONFIG_XEN_PV @@ -39,56 +46,57 @@ static const __initconst struct hypervisor_x86 * const hypervisors[] = #ifdef CONFIG_KVM_GUEST &x86_hyper_kvm, #endif +#ifdef CONFIG_JAILHOUSE_GUEST + &x86_hyper_jailhouse, +#endif }; -const struct hypervisor_x86 *x86_hyper; -EXPORT_SYMBOL(x86_hyper); +enum x86_hypervisor_type x86_hyper_type; +EXPORT_SYMBOL(x86_hyper_type); -static inline void __init +static inline const struct hypervisor_x86 * __init detect_hypervisor_vendor(void) { - const struct hypervisor_x86 *h, * const *p; + const struct hypervisor_x86 *h = NULL, * const *p; uint32_t pri, max_pri = 0; for (p = hypervisors; p < hypervisors + ARRAY_SIZE(hypervisors); p++) { - h = *p; - pri = h->detect(); - if (pri != 0 && pri > max_pri) { + pri = (*p)->detect(); + if (pri > max_pri) { max_pri = pri; - x86_hyper = h; + h = *p; } } - if (max_pri) - pr_info("Hypervisor detected: %s\n", x86_hyper->name); + if (h) + pr_info("Hypervisor detected: %s\n", h->name); + + return h; } -void __init init_hypervisor_platform(void) +static void __init copy_array(const void *src, void *target, unsigned int size) { + unsigned int i, n = size / sizeof(void *); + const void * const *from = (const void * const *)src; + const void **to = (const void **)target; - detect_hypervisor_vendor(); - - if (!x86_hyper) - return; - - if (x86_hyper->init_platform) - x86_hyper->init_platform(); + for (i = 0; i < n; i++) + if (from[i]) + to[i] = from[i]; } -bool __init hypervisor_x2apic_available(void) +void __init init_hypervisor_platform(void) { - return x86_hyper && - x86_hyper->x2apic_available && - x86_hyper->x2apic_available(); -} + const struct hypervisor_x86 *h; -void hypervisor_pin_vcpu(int cpu) -{ - if (!x86_hyper) + h = detect_hypervisor_vendor(); + + if (!h) return; - if (x86_hyper->pin_vcpu) - x86_hyper->pin_vcpu(cpu); - else - WARN_ONCE(1, "vcpu pinning requested but not supported!\n"); + copy_array(&h->init, &x86_init.hyper, sizeof(h->init)); + copy_array(&h->runtime, &x86_platform.hyper, sizeof(h->runtime)); + + x86_hyper_type = h->type; + x86_init.hyper.init_platform(); } diff --git a/arch/x86/kernel/cpu/intel.c b/arch/x86/kernel/cpu/intel.c index b720dacac051..d19e903214b4 100644 --- a/arch/x86/kernel/cpu/intel.c +++ b/arch/x86/kernel/cpu/intel.c @@ -102,6 +102,56 @@ static void probe_xeon_phi_r3mwait(struct cpuinfo_x86 *c) ELF_HWCAP2 |= HWCAP2_RING3MWAIT; } +/* + * Early microcode releases for the Spectre v2 mitigation were broken. 
+ * Information taken from; + * - https://newsroom.intel.com/wp-content/uploads/sites/11/2018/01/microcode-update-guidance.pdf + * - https://kb.vmware.com/s/article/52345 + * - Microcode revisions observed in the wild + * - Release note from 20180108 microcode release + */ +struct sku_microcode { + u8 model; + u8 stepping; + u32 microcode; +}; +static const struct sku_microcode spectre_bad_microcodes[] = { + { INTEL_FAM6_KABYLAKE_DESKTOP, 0x0B, 0x80 }, + { INTEL_FAM6_KABYLAKE_DESKTOP, 0x0A, 0x80 }, + { INTEL_FAM6_KABYLAKE_DESKTOP, 0x09, 0x80 }, + { INTEL_FAM6_KABYLAKE_MOBILE, 0x0A, 0x80 }, + { INTEL_FAM6_KABYLAKE_MOBILE, 0x09, 0x80 }, + { INTEL_FAM6_SKYLAKE_X, 0x03, 0x0100013e }, + { INTEL_FAM6_SKYLAKE_X, 0x04, 0x0200003c }, + { INTEL_FAM6_SKYLAKE_DESKTOP, 0x03, 0xc2 }, + { INTEL_FAM6_BROADWELL_CORE, 0x04, 0x28 }, + { INTEL_FAM6_BROADWELL_GT3E, 0x01, 0x1b }, + { INTEL_FAM6_BROADWELL_XEON_D, 0x02, 0x14 }, + { INTEL_FAM6_BROADWELL_XEON_D, 0x03, 0x07000011 }, + { INTEL_FAM6_BROADWELL_X, 0x01, 0x0b000025 }, + { INTEL_FAM6_HASWELL_ULT, 0x01, 0x21 }, + { INTEL_FAM6_HASWELL_GT3E, 0x01, 0x18 }, + { INTEL_FAM6_HASWELL_CORE, 0x03, 0x23 }, + { INTEL_FAM6_HASWELL_X, 0x02, 0x3b }, + { INTEL_FAM6_HASWELL_X, 0x04, 0x10 }, + { INTEL_FAM6_IVYBRIDGE_X, 0x04, 0x42a }, + /* Observed in the wild */ + { INTEL_FAM6_SANDYBRIDGE_X, 0x06, 0x61b }, + { INTEL_FAM6_SANDYBRIDGE_X, 0x07, 0x712 }, +}; + +static bool bad_spectre_microcode(struct cpuinfo_x86 *c) +{ + int i; + + for (i = 0; i < ARRAY_SIZE(spectre_bad_microcodes); i++) { + if (c->x86_model == spectre_bad_microcodes[i].model && + c->x86_stepping == spectre_bad_microcodes[i].stepping) + return (c->microcode <= spectre_bad_microcodes[i].microcode); + } + return false; +} + static void early_init_intel(struct cpuinfo_x86 *c) { u64 misc_enable; @@ -122,6 +172,19 @@ static void early_init_intel(struct cpuinfo_x86 *c) if (c->x86 >= 6 && !cpu_has(c, X86_FEATURE_IA64)) c->microcode = intel_get_microcode_revision(); + /* Now if any of them are set, check the blacklist and clear the lot */ + if ((cpu_has(c, X86_FEATURE_SPEC_CTRL) || + cpu_has(c, X86_FEATURE_INTEL_STIBP) || + cpu_has(c, X86_FEATURE_IBRS) || cpu_has(c, X86_FEATURE_IBPB) || + cpu_has(c, X86_FEATURE_STIBP)) && bad_spectre_microcode(c)) { + pr_warn("Intel Spectre v2 broken microcode detected; disabling Speculation Control\n"); + setup_clear_cpu_cap(X86_FEATURE_IBRS); + setup_clear_cpu_cap(X86_FEATURE_IBPB); + setup_clear_cpu_cap(X86_FEATURE_STIBP); + setup_clear_cpu_cap(X86_FEATURE_SPEC_CTRL); + setup_clear_cpu_cap(X86_FEATURE_INTEL_STIBP); + } + /* * Atom erratum AAE44/AAF40/AAG38/AAH41: * @@ -130,7 +193,7 @@ static void early_init_intel(struct cpuinfo_x86 *c) * need the microcode to have already been loaded... so if it is * not, recommend a BIOS update and disable large pages. 
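bad_spectre_microcode() above matches on model and stepping and then treats any revision at or below the listed one as affected. A reduced stand-alone version showing that boundary behaviour (the table entries here are placeholders, not the real blacklist):

/* bad_ucode.c - model/stepping/revision blacklist lookup, reduced example. */
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

struct sku_microcode {
        uint8_t  model;
        uint8_t  stepping;
        uint32_t microcode;     /* highest known-bad revision */
};

static const struct sku_microcode bad[] = {
        { 0x9e, 0x09, 0x80 },           /* placeholder entry */
        { 0x4f, 0x01, 0x0b000025 },     /* placeholder entry */
};

static bool bad_spectre_microcode(uint8_t model, uint8_t stepping, uint32_t rev)
{
        unsigned int i;

        for (i = 0; i < sizeof(bad) / sizeof(bad[0]); i++)
                if (model == bad[i].model && stepping == bad[i].stepping)
                        return rev <= bad[i].microcode;
        return false;
}

int main(void)
{
        printf("model 0x9e stepping 9 rev 0x80: %s\n",
               bad_spectre_microcode(0x9e, 0x09, 0x80) ? "blacklisted" : "ok");
        printf("model 0x9e stepping 9 rev 0x84: %s\n",
               bad_spectre_microcode(0x9e, 0x09, 0x84) ? "blacklisted" : "ok");
        return 0;
}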
*/ - if (c->x86 == 6 && c->x86_model == 0x1c && c->x86_mask <= 2 && + if (c->x86 == 6 && c->x86_model == 0x1c && c->x86_stepping <= 2 && c->microcode < 0x20e) { pr_warn("Atom PSE erratum detected, BIOS microcode update recommended\n"); clear_cpu_cap(c, X86_FEATURE_PSE); @@ -146,7 +209,7 @@ static void early_init_intel(struct cpuinfo_x86 *c) /* CPUID workaround for 0F33/0F34 CPU */ if (c->x86 == 0xF && c->x86_model == 0x3 - && (c->x86_mask == 0x3 || c->x86_mask == 0x4)) + && (c->x86_stepping == 0x3 || c->x86_stepping == 0x4)) c->x86_phys_bits = 36; /* @@ -187,21 +250,6 @@ static void early_init_intel(struct cpuinfo_x86 *c) if (c->x86 == 6 && c->x86_model < 15) clear_cpu_cap(c, X86_FEATURE_PAT); -#ifdef CONFIG_KMEMCHECK - /* - * P4s have a "fast strings" feature which causes single- - * stepping REP instructions to only generate a #DB on - * cache-line boundaries. - * - * Ingo Molnar reported a Pentium D (model 6) and a Xeon - * (model 2) with the same problem. - */ - if (c->x86 == 15) - if (msr_clear_bit(MSR_IA32_MISC_ENABLE, - MSR_IA32_MISC_ENABLE_FAST_STRING_BIT) > 0) - pr_info("kmemcheck: Disabling fast string operations\n"); -#endif - /* * If fast string is not enabled in IA32_MISC_ENABLE for any reason, * clear the fast string and enhanced fast string CPU capabilities. @@ -259,7 +307,7 @@ int ppro_with_ram_bug(void) if (boot_cpu_data.x86_vendor == X86_VENDOR_INTEL && boot_cpu_data.x86 == 6 && boot_cpu_data.x86_model == 1 && - boot_cpu_data.x86_mask < 8) { + boot_cpu_data.x86_stepping < 8) { pr_info("Pentium Pro with Errata#50 detected. Taking evasive action.\n"); return 1; } @@ -276,7 +324,7 @@ static void intel_smp_check(struct cpuinfo_x86 *c) * Mask B, Pentium, but not Pentium MMX */ if (c->x86 == 5 && - c->x86_mask >= 1 && c->x86_mask <= 4 && + c->x86_stepping >= 1 && c->x86_stepping <= 4 && c->x86_model <= 3) { /* * Remember we have B step Pentia with bugs @@ -319,7 +367,7 @@ static void intel_workarounds(struct cpuinfo_x86 *c) * SEP CPUID bug: Pentium Pro reports SEP but doesn't have it until * model 3 mask 3 */ - if ((c->x86<<8 | c->x86_model<<4 | c->x86_mask) < 0x633) + if ((c->x86<<8 | c->x86_model<<4 | c->x86_stepping) < 0x633) clear_cpu_cap(c, X86_FEATURE_SEP); /* @@ -337,7 +385,7 @@ static void intel_workarounds(struct cpuinfo_x86 *c) * P4 Xeon erratum 037 workaround. * Hardware prefetcher may cause stale data to be loaded into the cache. */ - if ((c->x86 == 15) && (c->x86_model == 1) && (c->x86_mask == 1)) { + if ((c->x86 == 15) && (c->x86_model == 1) && (c->x86_stepping == 1)) { if (msr_set_bit(MSR_IA32_MISC_ENABLE, MSR_IA32_MISC_ENABLE_PREFETCH_DISABLE_BIT) > 0) { pr_info("CPU: C0 stepping P4 Xeon detected.\n"); @@ -352,7 +400,7 @@ static void intel_workarounds(struct cpuinfo_x86 *c) * Specification Update"). 
*/ if (boot_cpu_has(X86_FEATURE_APIC) && (c->x86<<8 | c->x86_model<<4) == 0x520 && - (c->x86_mask < 0x6 || c->x86_mask == 0xb)) + (c->x86_stepping < 0x6 || c->x86_stepping == 0xb)) set_cpu_bug(c, X86_BUG_11AP); @@ -599,7 +647,7 @@ static void init_intel(struct cpuinfo_x86 *c) case 6: if (l2 == 128) p = "Celeron (Mendocino)"; - else if (c->x86_mask == 0 || c->x86_mask == 5) + else if (c->x86_stepping == 0 || c->x86_stepping == 5) p = "Celeron-A"; break; diff --git a/arch/x86/kernel/cpu/intel_rdt.c b/arch/x86/kernel/cpu/intel_rdt.c index cd5fc61ba450..589b948e6e01 100644 --- a/arch/x86/kernel/cpu/intel_rdt.c +++ b/arch/x86/kernel/cpu/intel_rdt.c @@ -135,6 +135,40 @@ struct rdt_resource rdt_resources_all[] = { .format_str = "%d=%0*x", .fflags = RFTYPE_RES_CACHE, }, + [RDT_RESOURCE_L2DATA] = + { + .rid = RDT_RESOURCE_L2DATA, + .name = "L2DATA", + .domains = domain_init(RDT_RESOURCE_L2DATA), + .msr_base = IA32_L2_CBM_BASE, + .msr_update = cat_wrmsr, + .cache_level = 2, + .cache = { + .min_cbm_bits = 1, + .cbm_idx_mult = 2, + .cbm_idx_offset = 0, + }, + .parse_ctrlval = parse_cbm, + .format_str = "%d=%0*x", + .fflags = RFTYPE_RES_CACHE, + }, + [RDT_RESOURCE_L2CODE] = + { + .rid = RDT_RESOURCE_L2CODE, + .name = "L2CODE", + .domains = domain_init(RDT_RESOURCE_L2CODE), + .msr_base = IA32_L2_CBM_BASE, + .msr_update = cat_wrmsr, + .cache_level = 2, + .cache = { + .min_cbm_bits = 1, + .cbm_idx_mult = 2, + .cbm_idx_offset = 1, + }, + .parse_ctrlval = parse_cbm, + .format_str = "%d=%0*x", + .fflags = RFTYPE_RES_CACHE, + }, [RDT_RESOURCE_MBA] = { .rid = RDT_RESOURCE_MBA, @@ -259,14 +293,15 @@ static void rdt_get_cache_alloc_cfg(int idx, struct rdt_resource *r) r->alloc_enabled = true; } -static void rdt_get_cdp_l3_config(int type) +static void rdt_get_cdp_config(int level, int type) { - struct rdt_resource *r_l3 = &rdt_resources_all[RDT_RESOURCE_L3]; + struct rdt_resource *r_l = &rdt_resources_all[level]; struct rdt_resource *r = &rdt_resources_all[type]; - r->num_closid = r_l3->num_closid / 2; - r->cache.cbm_len = r_l3->cache.cbm_len; - r->default_ctrl = r_l3->default_ctrl; + r->num_closid = r_l->num_closid / 2; + r->cache.cbm_len = r_l->cache.cbm_len; + r->default_ctrl = r_l->default_ctrl; + r->cache.shareable_bits = r_l->cache.shareable_bits; r->data_width = (r->cache.cbm_len + 3) / 4; r->alloc_capable = true; /* @@ -276,6 +311,18 @@ static void rdt_get_cdp_l3_config(int type) r->alloc_enabled = false; } +static void rdt_get_cdp_l3_config(void) +{ + rdt_get_cdp_config(RDT_RESOURCE_L3, RDT_RESOURCE_L3DATA); + rdt_get_cdp_config(RDT_RESOURCE_L3, RDT_RESOURCE_L3CODE); +} + +static void rdt_get_cdp_l2_config(void) +{ + rdt_get_cdp_config(RDT_RESOURCE_L2, RDT_RESOURCE_L2DATA); + rdt_get_cdp_config(RDT_RESOURCE_L2, RDT_RESOURCE_L2CODE); +} + static int get_cache_id(int cpu, int level) { struct cpu_cacheinfo *ci = get_cpu_cacheinfo(cpu); @@ -524,10 +571,6 @@ static void domain_remove_cpu(int cpu, struct rdt_resource *r) */ if (static_branch_unlikely(&rdt_mon_enable_key)) rmdir_mondata_subdir_allrdtgrp(r, d->id); - kfree(d->ctrl_val); - kfree(d->rmid_busy_llc); - kfree(d->mbm_total); - kfree(d->mbm_local); list_del(&d->list); if (is_mbm_enabled()) cancel_delayed_work(&d->mbm_over); @@ -544,6 +587,10 @@ static void domain_remove_cpu(int cpu, struct rdt_resource *r) cancel_delayed_work(&d->cqm_limbo); } + kfree(d->ctrl_val); + kfree(d->rmid_busy_llc); + kfree(d->mbm_total); + kfree(d->mbm_local); kfree(d); return; } @@ -644,6 +691,7 @@ enum { RDT_FLAG_L3_CAT, RDT_FLAG_L3_CDP, RDT_FLAG_L2_CAT, + RDT_FLAG_L2_CDP, 
RDT_FLAG_MBA, }; @@ -666,6 +714,7 @@ static struct rdt_options rdt_options[] __initdata = { RDT_OPT(RDT_FLAG_L3_CAT, "l3cat", X86_FEATURE_CAT_L3), RDT_OPT(RDT_FLAG_L3_CDP, "l3cdp", X86_FEATURE_CDP_L3), RDT_OPT(RDT_FLAG_L2_CAT, "l2cat", X86_FEATURE_CAT_L2), + RDT_OPT(RDT_FLAG_L2_CDP, "l2cdp", X86_FEATURE_CDP_L2), RDT_OPT(RDT_FLAG_MBA, "mba", X86_FEATURE_MBA), }; #define NUM_RDT_OPTIONS ARRAY_SIZE(rdt_options) @@ -728,15 +777,15 @@ static __init bool get_rdt_alloc_resources(void) if (rdt_cpu_has(X86_FEATURE_CAT_L3)) { rdt_get_cache_alloc_cfg(1, &rdt_resources_all[RDT_RESOURCE_L3]); - if (rdt_cpu_has(X86_FEATURE_CDP_L3)) { - rdt_get_cdp_l3_config(RDT_RESOURCE_L3DATA); - rdt_get_cdp_l3_config(RDT_RESOURCE_L3CODE); - } + if (rdt_cpu_has(X86_FEATURE_CDP_L3)) + rdt_get_cdp_l3_config(); ret = true; } if (rdt_cpu_has(X86_FEATURE_CAT_L2)) { /* CPUID 0x10.2 fields are same format at 0x10.1 */ rdt_get_cache_alloc_cfg(2, &rdt_resources_all[RDT_RESOURCE_L2]); + if (rdt_cpu_has(X86_FEATURE_CDP_L2)) + rdt_get_cdp_l2_config(); ret = true; } @@ -770,7 +819,7 @@ static __init void rdt_quirks(void) cache_alloc_hsw_probe(); break; case INTEL_FAM6_SKYLAKE_X: - if (boot_cpu_data.x86_mask <= 4) + if (boot_cpu_data.x86_stepping <= 4) set_rdt_options("!cmt,!mbmtotal,!mbmlocal,!l3cat"); } } diff --git a/arch/x86/kernel/cpu/intel_rdt.h b/arch/x86/kernel/cpu/intel_rdt.h index a43a72d8e88e..3fd7a70ee04a 100644 --- a/arch/x86/kernel/cpu/intel_rdt.h +++ b/arch/x86/kernel/cpu/intel_rdt.h @@ -7,12 +7,15 @@ #include <linux/jump_label.h> #define IA32_L3_QOS_CFG 0xc81 +#define IA32_L2_QOS_CFG 0xc82 #define IA32_L3_CBM_BASE 0xc90 #define IA32_L2_CBM_BASE 0xd10 #define IA32_MBA_THRTL_BASE 0xd50 #define L3_QOS_CDP_ENABLE 0x01ULL +#define L2_QOS_CDP_ENABLE 0x01ULL + /* * Event IDs are used to program IA32_QM_EVTSEL before reading event * counter from IA32_QM_CTR @@ -127,12 +130,15 @@ struct rdtgroup { #define RFTYPE_BASE BIT(1) #define RF_CTRLSHIFT 4 #define RF_MONSHIFT 5 +#define RF_TOPSHIFT 6 #define RFTYPE_CTRL BIT(RF_CTRLSHIFT) #define RFTYPE_MON BIT(RF_MONSHIFT) +#define RFTYPE_TOP BIT(RF_TOPSHIFT) #define RFTYPE_RES_CACHE BIT(8) #define RFTYPE_RES_MB BIT(9) #define RF_CTRL_INFO (RFTYPE_INFO | RFTYPE_CTRL) #define RF_MON_INFO (RFTYPE_INFO | RFTYPE_MON) +#define RF_TOP_INFO (RFTYPE_INFO | RFTYPE_TOP) #define RF_CTRL_BASE (RFTYPE_BASE | RFTYPE_CTRL) /* List of all resource groups */ @@ -354,6 +360,8 @@ enum { RDT_RESOURCE_L3DATA, RDT_RESOURCE_L3CODE, RDT_RESOURCE_L2, + RDT_RESOURCE_L2DATA, + RDT_RESOURCE_L2CODE, RDT_RESOURCE_MBA, /* Must be the last */ @@ -409,6 +417,10 @@ union cpuid_0x10_x_edx { unsigned int full; }; +void rdt_last_cmd_clear(void); +void rdt_last_cmd_puts(const char *s); +void rdt_last_cmd_printf(const char *fmt, ...); + void rdt_ctrl_update(void *arg); struct rdtgroup *rdtgroup_kn_lock_live(struct kernfs_node *kn); void rdtgroup_kn_unlock(struct kernfs_node *kn); diff --git a/arch/x86/kernel/cpu/intel_rdt_ctrlmondata.c b/arch/x86/kernel/cpu/intel_rdt_ctrlmondata.c index f6ea94f8954a..23e1d5c249c6 100644 --- a/arch/x86/kernel/cpu/intel_rdt_ctrlmondata.c +++ b/arch/x86/kernel/cpu/intel_rdt_ctrlmondata.c @@ -42,15 +42,22 @@ static bool bw_validate(char *buf, unsigned long *data, struct rdt_resource *r) /* * Only linear delay values is supported for current Intel SKUs. 
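With CDP enabled the effective CLOSID count is halved and the data/code masks use alternating MSRs, which is what the cbm_idx_mult = 2 and cbm_idx_offset = 0 or 1 fields above encode. A sketch of the implied index arithmetic (assuming the fields combine as base + closid * mult + offset, which is how the resource table is meant to be consumed):

/* cdp_index.c - map (closid, data/code) to a CBM MSR index under CDP. */
#include <stdio.h>

#define IA32_L2_CBM_BASE 0xd10

/* Assumed combination of the fields shown in the rdt_resources_all[] table. */
static unsigned int cbm_msr(unsigned int base, unsigned int closid,
                            unsigned int mult, unsigned int offset)
{
        return base + closid * mult + offset;
}

int main(void)
{
        unsigned int closid;

        for (closid = 0; closid < 4; closid++)
                printf("closid %u: L2DATA msr 0x%x, L2CODE msr 0x%x\n",
                       closid,
                       cbm_msr(IA32_L2_CBM_BASE, closid, 2, 0),
                       cbm_msr(IA32_L2_CBM_BASE, closid, 2, 1));
        return 0;
}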
*/ - if (!r->membw.delay_linear) + if (!r->membw.delay_linear) { + rdt_last_cmd_puts("No support for non-linear MB domains\n"); return false; + } ret = kstrtoul(buf, 10, &bw); - if (ret) + if (ret) { + rdt_last_cmd_printf("Non-decimal digit in MB value %s\n", buf); return false; + } - if (bw < r->membw.min_bw || bw > r->default_ctrl) + if (bw < r->membw.min_bw || bw > r->default_ctrl) { + rdt_last_cmd_printf("MB value %ld out of range [%d,%d]\n", bw, + r->membw.min_bw, r->default_ctrl); return false; + } *data = roundup(bw, (unsigned long)r->membw.bw_gran); return true; @@ -60,8 +67,10 @@ int parse_bw(char *buf, struct rdt_resource *r, struct rdt_domain *d) { unsigned long data; - if (d->have_new_ctrl) + if (d->have_new_ctrl) { + rdt_last_cmd_printf("duplicate domain %d\n", d->id); return -EINVAL; + } if (!bw_validate(buf, &data, r)) return -EINVAL; @@ -84,20 +93,29 @@ static bool cbm_validate(char *buf, unsigned long *data, struct rdt_resource *r) int ret; ret = kstrtoul(buf, 16, &val); - if (ret) + if (ret) { + rdt_last_cmd_printf("non-hex character in mask %s\n", buf); return false; + } - if (val == 0 || val > r->default_ctrl) + if (val == 0 || val > r->default_ctrl) { + rdt_last_cmd_puts("mask out of range\n"); return false; + } first_bit = find_first_bit(&val, cbm_len); zero_bit = find_next_zero_bit(&val, cbm_len, first_bit); - if (find_next_bit(&val, cbm_len, zero_bit) < cbm_len) + if (find_next_bit(&val, cbm_len, zero_bit) < cbm_len) { + rdt_last_cmd_printf("mask %lx has non-consecutive 1-bits\n", val); return false; + } - if ((zero_bit - first_bit) < r->cache.min_cbm_bits) + if ((zero_bit - first_bit) < r->cache.min_cbm_bits) { + rdt_last_cmd_printf("Need at least %d bits in mask\n", + r->cache.min_cbm_bits); return false; + } *data = val; return true; @@ -111,8 +129,10 @@ int parse_cbm(char *buf, struct rdt_resource *r, struct rdt_domain *d) { unsigned long data; - if (d->have_new_ctrl) + if (d->have_new_ctrl) { + rdt_last_cmd_printf("duplicate domain %d\n", d->id); return -EINVAL; + } if(!cbm_validate(buf, &data, r)) return -EINVAL; @@ -139,8 +159,10 @@ next: return 0; dom = strsep(&line, ";"); id = strsep(&dom, "="); - if (!dom || kstrtoul(id, 10, &dom_id)) + if (!dom || kstrtoul(id, 10, &dom_id)) { + rdt_last_cmd_puts("Missing '=' or non-numeric domain\n"); return -EINVAL; + } dom = strim(dom); list_for_each_entry(d, &r->domains, list) { if (d->id == dom_id) { @@ -196,6 +218,7 @@ static int rdtgroup_parse_resource(char *resname, char *tok, int closid) if (!strcmp(resname, r->name) && closid < r->num_closid) return parse_line(tok, r); } + rdt_last_cmd_printf("unknown/unsupported resource name '%s'\n", resname); return -EINVAL; } @@ -218,6 +241,7 @@ ssize_t rdtgroup_schemata_write(struct kernfs_open_file *of, rdtgroup_kn_unlock(of->kn); return -ENOENT; } + rdt_last_cmd_clear(); closid = rdtgrp->closid; @@ -229,6 +253,12 @@ ssize_t rdtgroup_schemata_write(struct kernfs_open_file *of, while ((tok = strsep(&buf, "\n")) != NULL) { resname = strim(strsep(&tok, ":")); if (!tok) { + rdt_last_cmd_puts("Missing ':'\n"); + ret = -EINVAL; + goto out; + } + if (tok[0] == '\0') { + rdt_last_cmd_printf("Missing '%s' value\n", resname); ret = -EINVAL; goto out; } diff --git a/arch/x86/kernel/cpu/intel_rdt_monitor.c b/arch/x86/kernel/cpu/intel_rdt_monitor.c index 30827510094b..681450eee428 100644 --- a/arch/x86/kernel/cpu/intel_rdt_monitor.c +++ b/arch/x86/kernel/cpu/intel_rdt_monitor.c @@ -51,7 +51,7 @@ static LIST_HEAD(rmid_free_lru); * may have a occupancy value > intel_cqm_threshold. 
User can change * the threshold occupancy value. */ -unsigned int rmid_limbo_count; +static unsigned int rmid_limbo_count; /** * @rmid_entry - The entry in the limbo and free lists. diff --git a/arch/x86/kernel/cpu/intel_rdt_rdtgroup.c b/arch/x86/kernel/cpu/intel_rdt_rdtgroup.c index a869d4a073c5..fca759d272a1 100644 --- a/arch/x86/kernel/cpu/intel_rdt_rdtgroup.c +++ b/arch/x86/kernel/cpu/intel_rdt_rdtgroup.c @@ -24,6 +24,7 @@ #include <linux/fs.h> #include <linux/sysfs.h> #include <linux/kernfs.h> +#include <linux/seq_buf.h> #include <linux/seq_file.h> #include <linux/sched/signal.h> #include <linux/sched/task.h> @@ -51,6 +52,31 @@ static struct kernfs_node *kn_mongrp; /* Kernel fs node for "mon_data" directory under root */ static struct kernfs_node *kn_mondata; +static struct seq_buf last_cmd_status; +static char last_cmd_status_buf[512]; + +void rdt_last_cmd_clear(void) +{ + lockdep_assert_held(&rdtgroup_mutex); + seq_buf_clear(&last_cmd_status); +} + +void rdt_last_cmd_puts(const char *s) +{ + lockdep_assert_held(&rdtgroup_mutex); + seq_buf_puts(&last_cmd_status, s); +} + +void rdt_last_cmd_printf(const char *fmt, ...) +{ + va_list ap; + + va_start(ap, fmt); + lockdep_assert_held(&rdtgroup_mutex); + seq_buf_vprintf(&last_cmd_status, fmt, ap); + va_end(ap); +} + /* * Trivial allocator for CLOSIDs. Since h/w only supports a small number, * we can keep a bitmap of free CLOSIDs in a single integer. @@ -238,8 +264,10 @@ static int cpus_mon_write(struct rdtgroup *rdtgrp, cpumask_var_t newmask, /* Check whether cpus belong to parent ctrl group */ cpumask_andnot(tmpmask, newmask, &prgrp->cpu_mask); - if (cpumask_weight(tmpmask)) + if (cpumask_weight(tmpmask)) { + rdt_last_cmd_puts("can only add CPUs to mongroup that belong to parent\n"); return -EINVAL; + } /* Check whether cpus are dropped from this group */ cpumask_andnot(tmpmask, &rdtgrp->cpu_mask, newmask); @@ -291,8 +319,10 @@ static int cpus_ctrl_write(struct rdtgroup *rdtgrp, cpumask_var_t newmask, cpumask_andnot(tmpmask, &rdtgrp->cpu_mask, newmask); if (cpumask_weight(tmpmask)) { /* Can't drop from default group */ - if (rdtgrp == &rdtgroup_default) + if (rdtgrp == &rdtgroup_default) { + rdt_last_cmd_puts("Can't drop CPUs from default group\n"); return -EINVAL; + } /* Give any dropped cpus to rdtgroup_default */ cpumask_or(&rdtgroup_default.cpu_mask, @@ -357,8 +387,10 @@ static ssize_t rdtgroup_cpus_write(struct kernfs_open_file *of, } rdtgrp = rdtgroup_kn_lock_live(of->kn); + rdt_last_cmd_clear(); if (!rdtgrp) { ret = -ENOENT; + rdt_last_cmd_puts("directory was removed\n"); goto unlock; } @@ -367,13 +399,16 @@ static ssize_t rdtgroup_cpus_write(struct kernfs_open_file *of, else ret = cpumask_parse(buf, newmask); - if (ret) + if (ret) { + rdt_last_cmd_puts("bad cpu list/mask\n"); goto unlock; + } /* check that user didn't specify any offline cpus */ cpumask_andnot(tmpmask, newmask, cpu_online_mask); if (cpumask_weight(tmpmask)) { ret = -EINVAL; + rdt_last_cmd_puts("can only assign online cpus\n"); goto unlock; } @@ -452,6 +487,7 @@ static int __rdtgroup_move_task(struct task_struct *tsk, */ atomic_dec(&rdtgrp->waitcount); kfree(callback); + rdt_last_cmd_puts("task exited\n"); } else { /* * For ctrl_mon groups move both closid and rmid. 
@@ -462,10 +498,12 @@ static int __rdtgroup_move_task(struct task_struct *tsk, tsk->closid = rdtgrp->closid; tsk->rmid = rdtgrp->mon.rmid; } else if (rdtgrp->type == RDTMON_GROUP) { - if (rdtgrp->mon.parent->closid == tsk->closid) + if (rdtgrp->mon.parent->closid == tsk->closid) { tsk->rmid = rdtgrp->mon.rmid; - else + } else { + rdt_last_cmd_puts("Can't move task to different control group\n"); ret = -EINVAL; + } } } return ret; @@ -484,8 +522,10 @@ static int rdtgroup_task_write_permission(struct task_struct *task, */ if (!uid_eq(cred->euid, GLOBAL_ROOT_UID) && !uid_eq(cred->euid, tcred->uid) && - !uid_eq(cred->euid, tcred->suid)) + !uid_eq(cred->euid, tcred->suid)) { + rdt_last_cmd_printf("No permission to move task %d\n", task->pid); ret = -EPERM; + } put_cred(tcred); return ret; @@ -502,6 +542,7 @@ static int rdtgroup_move_task(pid_t pid, struct rdtgroup *rdtgrp, tsk = find_task_by_vpid(pid); if (!tsk) { rcu_read_unlock(); + rdt_last_cmd_printf("No task %d\n", pid); return -ESRCH; } } else { @@ -529,6 +570,7 @@ static ssize_t rdtgroup_tasks_write(struct kernfs_open_file *of, if (kstrtoint(strstrip(buf), 0, &pid) || pid < 0) return -EINVAL; rdtgrp = rdtgroup_kn_lock_live(of->kn); + rdt_last_cmd_clear(); if (rdtgrp) ret = rdtgroup_move_task(pid, rdtgrp, of); @@ -569,6 +611,21 @@ static int rdtgroup_tasks_show(struct kernfs_open_file *of, return ret; } +static int rdt_last_cmd_status_show(struct kernfs_open_file *of, + struct seq_file *seq, void *v) +{ + int len; + + mutex_lock(&rdtgroup_mutex); + len = seq_buf_used(&last_cmd_status); + if (len) + seq_printf(seq, "%.*s", len, last_cmd_status_buf); + else + seq_puts(seq, "ok\n"); + mutex_unlock(&rdtgroup_mutex); + return 0; +} + static int rdt_num_closids_show(struct kernfs_open_file *of, struct seq_file *seq, void *v) { @@ -686,6 +743,13 @@ static ssize_t max_threshold_occ_write(struct kernfs_open_file *of, /* rdtgroup information files for one cache resource. */ static struct rftype res_common_files[] = { { + .name = "last_cmd_status", + .mode = 0444, + .kf_ops = &rdtgroup_kf_single_ops, + .seq_show = rdt_last_cmd_status_show, + .fflags = RF_TOP_INFO, + }, + { .name = "num_closids", .mode = 0444, .kf_ops = &rdtgroup_kf_single_ops, @@ -855,6 +919,10 @@ static int rdtgroup_create_info_dir(struct kernfs_node *parent_kn) return PTR_ERR(kn_info); kernfs_get(kn_info); + ret = rdtgroup_add_files(kn_info, RF_TOP_INFO); + if (ret) + goto out_destroy; + for_each_alloc_enabled_rdt_resource(r) { fflags = r->fflags | RF_CTRL_INFO; ret = rdtgroup_mkdir_info_resdir(r, r->name, fflags); @@ -922,6 +990,7 @@ out_destroy: kernfs_remove(kn); return ret; } + static void l3_qos_cfg_update(void *arg) { bool *enable = arg; @@ -929,8 +998,17 @@ static void l3_qos_cfg_update(void *arg) wrmsrl(IA32_L3_QOS_CFG, *enable ? L3_QOS_CDP_ENABLE : 0ULL); } -static int set_l3_qos_cfg(struct rdt_resource *r, bool enable) +static void l2_qos_cfg_update(void *arg) { + bool *enable = arg; + + wrmsrl(IA32_L2_QOS_CFG, *enable ? 
L2_QOS_CDP_ENABLE : 0ULL); +} + +static int set_cache_qos_cfg(int level, bool enable) +{ + void (*update)(void *arg); + struct rdt_resource *r_l; cpumask_var_t cpu_mask; struct rdt_domain *d; int cpu; @@ -938,16 +1016,24 @@ static int set_l3_qos_cfg(struct rdt_resource *r, bool enable) if (!zalloc_cpumask_var(&cpu_mask, GFP_KERNEL)) return -ENOMEM; - list_for_each_entry(d, &r->domains, list) { + if (level == RDT_RESOURCE_L3) + update = l3_qos_cfg_update; + else if (level == RDT_RESOURCE_L2) + update = l2_qos_cfg_update; + else + return -EINVAL; + + r_l = &rdt_resources_all[level]; + list_for_each_entry(d, &r_l->domains, list) { /* Pick one CPU from each domain instance to update MSR */ cpumask_set_cpu(cpumask_any(&d->cpu_mask), cpu_mask); } cpu = get_cpu(); /* Update QOS_CFG MSR on this cpu if it's in cpu_mask. */ if (cpumask_test_cpu(cpu, cpu_mask)) - l3_qos_cfg_update(&enable); + update(&enable); /* Update QOS_CFG MSR on all other cpus in cpu_mask. */ - smp_call_function_many(cpu_mask, l3_qos_cfg_update, &enable, 1); + smp_call_function_many(cpu_mask, update, &enable, 1); put_cpu(); free_cpumask_var(cpu_mask); @@ -955,52 +1041,99 @@ static int set_l3_qos_cfg(struct rdt_resource *r, bool enable) return 0; } -static int cdp_enable(void) +static int cdp_enable(int level, int data_type, int code_type) { - struct rdt_resource *r_l3data = &rdt_resources_all[RDT_RESOURCE_L3DATA]; - struct rdt_resource *r_l3code = &rdt_resources_all[RDT_RESOURCE_L3CODE]; - struct rdt_resource *r_l3 = &rdt_resources_all[RDT_RESOURCE_L3]; + struct rdt_resource *r_ldata = &rdt_resources_all[data_type]; + struct rdt_resource *r_lcode = &rdt_resources_all[code_type]; + struct rdt_resource *r_l = &rdt_resources_all[level]; int ret; - if (!r_l3->alloc_capable || !r_l3data->alloc_capable || - !r_l3code->alloc_capable) + if (!r_l->alloc_capable || !r_ldata->alloc_capable || + !r_lcode->alloc_capable) return -EINVAL; - ret = set_l3_qos_cfg(r_l3, true); + ret = set_cache_qos_cfg(level, true); if (!ret) { - r_l3->alloc_enabled = false; - r_l3data->alloc_enabled = true; - r_l3code->alloc_enabled = true; + r_l->alloc_enabled = false; + r_ldata->alloc_enabled = true; + r_lcode->alloc_enabled = true; } return ret; } -static void cdp_disable(void) +static int cdpl3_enable(void) { - struct rdt_resource *r = &rdt_resources_all[RDT_RESOURCE_L3]; + return cdp_enable(RDT_RESOURCE_L3, RDT_RESOURCE_L3DATA, + RDT_RESOURCE_L3CODE); +} + +static int cdpl2_enable(void) +{ + return cdp_enable(RDT_RESOURCE_L2, RDT_RESOURCE_L2DATA, + RDT_RESOURCE_L2CODE); +} + +static void cdp_disable(int level, int data_type, int code_type) +{ + struct rdt_resource *r = &rdt_resources_all[level]; r->alloc_enabled = r->alloc_capable; - if (rdt_resources_all[RDT_RESOURCE_L3DATA].alloc_enabled) { - rdt_resources_all[RDT_RESOURCE_L3DATA].alloc_enabled = false; - rdt_resources_all[RDT_RESOURCE_L3CODE].alloc_enabled = false; - set_l3_qos_cfg(r, false); + if (rdt_resources_all[data_type].alloc_enabled) { + rdt_resources_all[data_type].alloc_enabled = false; + rdt_resources_all[code_type].alloc_enabled = false; + set_cache_qos_cfg(level, false); } } +static void cdpl3_disable(void) +{ + cdp_disable(RDT_RESOURCE_L3, RDT_RESOURCE_L3DATA, RDT_RESOURCE_L3CODE); +} + +static void cdpl2_disable(void) +{ + cdp_disable(RDT_RESOURCE_L2, RDT_RESOURCE_L2DATA, RDT_RESOURCE_L2CODE); +} + +static void cdp_disable_all(void) +{ + if (rdt_resources_all[RDT_RESOURCE_L3DATA].alloc_enabled) + cdpl3_disable(); + if (rdt_resources_all[RDT_RESOURCE_L2DATA].alloc_enabled) + 
cdpl2_disable(); +} + static int parse_rdtgroupfs_options(char *data) { char *token, *o = data; int ret = 0; while ((token = strsep(&o, ",")) != NULL) { - if (!*token) - return -EINVAL; + if (!*token) { + ret = -EINVAL; + goto out; + } - if (!strcmp(token, "cdp")) - ret = cdp_enable(); + if (!strcmp(token, "cdp")) { + ret = cdpl3_enable(); + if (ret) + goto out; + } else if (!strcmp(token, "cdpl2")) { + ret = cdpl2_enable(); + if (ret) + goto out; + } else { + ret = -EINVAL; + goto out; + } } + return 0; + +out: + pr_err("Invalid mount option \"%s\"\n", token); + return ret; } @@ -1081,6 +1214,7 @@ static struct dentry *rdt_mount(struct file_system_type *fs_type, struct dentry *dentry; int ret; + cpus_read_lock(); mutex_lock(&rdtgroup_mutex); /* * resctrl file system can only be mounted once. @@ -1130,12 +1264,12 @@ static struct dentry *rdt_mount(struct file_system_type *fs_type, goto out_mondata; if (rdt_alloc_capable) - static_branch_enable(&rdt_alloc_enable_key); + static_branch_enable_cpuslocked(&rdt_alloc_enable_key); if (rdt_mon_capable) - static_branch_enable(&rdt_mon_enable_key); + static_branch_enable_cpuslocked(&rdt_mon_enable_key); if (rdt_alloc_capable || rdt_mon_capable) - static_branch_enable(&rdt_enable_key); + static_branch_enable_cpuslocked(&rdt_enable_key); if (is_mbm_enabled()) { r = &rdt_resources_all[RDT_RESOURCE_L3]; @@ -1154,9 +1288,11 @@ out_mongrp: out_info: kernfs_remove(kn_info); out_cdp: - cdp_disable(); + cdp_disable_all(); out: + rdt_last_cmd_clear(); mutex_unlock(&rdtgroup_mutex); + cpus_read_unlock(); return dentry; } @@ -1295,9 +1431,7 @@ static void rmdir_all_sub(void) kfree(rdtgrp); } /* Notify online CPUs to update per cpu storage and PQR_ASSOC MSR */ - get_online_cpus(); update_closid_rmid(cpu_online_mask, &rdtgroup_default); - put_online_cpus(); kernfs_remove(kn_info); kernfs_remove(kn_mongrp); @@ -1308,18 +1442,20 @@ static void rdt_kill_sb(struct super_block *sb) { struct rdt_resource *r; + cpus_read_lock(); mutex_lock(&rdtgroup_mutex); /*Put everything back to default values. 
*/ for_each_alloc_enabled_rdt_resource(r) reset_all_ctrls(r); - cdp_disable(); + cdp_disable_all(); rmdir_all_sub(); - static_branch_disable(&rdt_alloc_enable_key); - static_branch_disable(&rdt_mon_enable_key); - static_branch_disable(&rdt_enable_key); + static_branch_disable_cpuslocked(&rdt_alloc_enable_key); + static_branch_disable_cpuslocked(&rdt_mon_enable_key); + static_branch_disable_cpuslocked(&rdt_enable_key); kernfs_kill_sb(sb); mutex_unlock(&rdtgroup_mutex); + cpus_read_unlock(); } static struct file_system_type rdt_fs_type = { @@ -1524,8 +1660,10 @@ static int mkdir_rdt_prepare(struct kernfs_node *parent_kn, int ret; prdtgrp = rdtgroup_kn_lock_live(prgrp_kn); + rdt_last_cmd_clear(); if (!prdtgrp) { ret = -ENODEV; + rdt_last_cmd_puts("directory was removed\n"); goto out_unlock; } @@ -1533,6 +1671,7 @@ static int mkdir_rdt_prepare(struct kernfs_node *parent_kn, rdtgrp = kzalloc(sizeof(*rdtgrp), GFP_KERNEL); if (!rdtgrp) { ret = -ENOSPC; + rdt_last_cmd_puts("kernel out of memory\n"); goto out_unlock; } *r = rdtgrp; @@ -1544,6 +1683,7 @@ static int mkdir_rdt_prepare(struct kernfs_node *parent_kn, kn = kernfs_create_dir(parent_kn, name, mode, rdtgrp); if (IS_ERR(kn)) { ret = PTR_ERR(kn); + rdt_last_cmd_puts("kernfs create error\n"); goto out_free_rgrp; } rdtgrp->kn = kn; @@ -1557,24 +1697,31 @@ static int mkdir_rdt_prepare(struct kernfs_node *parent_kn, kernfs_get(kn); ret = rdtgroup_kn_set_ugid(kn); - if (ret) + if (ret) { + rdt_last_cmd_puts("kernfs perm error\n"); goto out_destroy; + } - files = RFTYPE_BASE | RFTYPE_CTRL; files = RFTYPE_BASE | BIT(RF_CTRLSHIFT + rtype); ret = rdtgroup_add_files(kn, files); - if (ret) + if (ret) { + rdt_last_cmd_puts("kernfs fill error\n"); goto out_destroy; + } if (rdt_mon_capable) { ret = alloc_rmid(); - if (ret < 0) + if (ret < 0) { + rdt_last_cmd_puts("out of RMIDs\n"); goto out_destroy; + } rdtgrp->mon.rmid = ret; ret = mkdir_mondata_all(kn, rdtgrp, &rdtgrp->mon.mon_data_kn); - if (ret) + if (ret) { + rdt_last_cmd_puts("kernfs subdir error\n"); goto out_idfree; + } } kernfs_activate(kn); @@ -1652,9 +1799,12 @@ static int rdtgroup_mkdir_ctrl_mon(struct kernfs_node *parent_kn, kn = rdtgrp->kn; ret = closid_alloc(); - if (ret < 0) + if (ret < 0) { + rdt_last_cmd_puts("out of CLOSIDs\n"); goto out_common_fail; + } closid = ret; + ret = 0; rdtgrp->closid = closid; list_add(&rdtgrp->rdtgroup_list, &rdt_all_groups); @@ -1665,8 +1815,10 @@ static int rdtgroup_mkdir_ctrl_mon(struct kernfs_node *parent_kn, * of tasks and cpus to monitor. */ ret = mongroup_create_dir(kn, NULL, "mon_groups", NULL); - if (ret) + if (ret) { + rdt_last_cmd_puts("kernfs subdir error\n"); goto out_id_free; + } } goto out_unlock; @@ -1902,6 +2054,9 @@ int __init rdtgroup_init(void) { int ret = 0; + seq_buf_init(&last_cmd_status, last_cmd_status_buf, + sizeof(last_cmd_status_buf)); + ret = rdtgroup_setup_root(); if (ret) return ret; diff --git a/arch/x86/kernel/cpu/mcheck/dev-mcelog.c b/arch/x86/kernel/cpu/mcheck/dev-mcelog.c index 7f85b76f43bc..97685a0c3175 100644 --- a/arch/x86/kernel/cpu/mcheck/dev-mcelog.c +++ b/arch/x86/kernel/cpu/mcheck/dev-mcelog.c @@ -243,13 +243,13 @@ out: return err ? 
err : buf - ubuf; } -static unsigned int mce_chrdev_poll(struct file *file, poll_table *wait) +static __poll_t mce_chrdev_poll(struct file *file, poll_table *wait) { poll_wait(file, &mce_chrdev_wait, wait); if (READ_ONCE(mcelog.next)) - return POLLIN | POLLRDNORM; + return EPOLLIN | EPOLLRDNORM; if (!mce_apei_read_done && apei_check_mce()) - return POLLIN | POLLRDNORM; + return EPOLLIN | EPOLLRDNORM; return 0; } diff --git a/arch/x86/kernel/cpu/mcheck/mce-internal.h b/arch/x86/kernel/cpu/mcheck/mce-internal.h index aa0d5df9dc60..e956eb267061 100644 --- a/arch/x86/kernel/cpu/mcheck/mce-internal.h +++ b/arch/x86/kernel/cpu/mcheck/mce-internal.h @@ -115,4 +115,19 @@ static inline void mce_unregister_injector_chain(struct notifier_block *nb) { } extern struct mca_config mca_cfg; +#ifndef CONFIG_X86_64 +/* + * On 32-bit systems it would be difficult to safely unmap a poison page + * from the kernel 1:1 map because there are no non-canonical addresses that + * we can use to refer to the address without risking a speculative access. + * However, this isn't much of an issue because: + * 1) Few unmappable pages are in the 1:1 map. Most are in HIGHMEM which + * are only mapped into the kernel as needed + * 2) Few people would run a 32-bit kernel on a machine that supports + * recoverable errors because they have too much memory to boot 32-bit. + */ +static inline void mce_unmap_kpfn(unsigned long pfn) {} +#define mce_unmap_kpfn mce_unmap_kpfn +#endif + #endif /* __X86_MCE_INTERNAL_H__ */ diff --git a/arch/x86/kernel/cpu/mcheck/mce-severity.c b/arch/x86/kernel/cpu/mcheck/mce-severity.c index 87cc9ab7a13c..5bbd06f38ff6 100644 --- a/arch/x86/kernel/cpu/mcheck/mce-severity.c +++ b/arch/x86/kernel/cpu/mcheck/mce-severity.c @@ -59,6 +59,7 @@ static struct severity { #define MCGMASK(x, y) .mcgmask = x, .mcgres = y #define MASK(x, y) .mask = x, .result = y #define MCI_UC_S (MCI_STATUS_UC|MCI_STATUS_S) +#define MCI_UC_AR (MCI_STATUS_UC|MCI_STATUS_AR) #define MCI_UC_SAR (MCI_STATUS_UC|MCI_STATUS_S|MCI_STATUS_AR) #define MCI_ADDR (MCI_STATUS_ADDRV|MCI_STATUS_MISCV) @@ -101,6 +102,22 @@ static struct severity { NOSER, BITCLR(MCI_STATUS_UC) ), + /* + * known AO MCACODs reported via MCE or CMC: + * + * SRAO could be signaled either via a machine check exception or + * CMCI with the corresponding bit S 1 or 0. So we don't need to + * check bit S for SRAO. 
+ */ + MCESEV( + AO, "Action optional: memory scrubbing error", + SER, MASK(MCI_STATUS_OVER|MCI_UC_AR|MCACOD_SCRUBMSK, MCI_STATUS_UC|MCACOD_SCRUB) + ), + MCESEV( + AO, "Action optional: last level cache writeback error", + SER, MASK(MCI_STATUS_OVER|MCI_UC_AR|MCACOD, MCI_STATUS_UC|MCACOD_L3WB) + ), + /* ignore OVER for UCNA */ MCESEV( UCNA, "Uncorrected no action required", @@ -149,15 +166,6 @@ static struct severity { SER, MASK(MCI_STATUS_OVER|MCI_UC_SAR, MCI_UC_SAR) ), - /* known AO MCACODs: */ - MCESEV( - AO, "Action optional: memory scrubbing error", - SER, MASK(MCI_STATUS_OVER|MCI_UC_SAR|MCACOD_SCRUBMSK, MCI_UC_S|MCACOD_SCRUB) - ), - MCESEV( - AO, "Action optional: last level cache writeback error", - SER, MASK(MCI_STATUS_OVER|MCI_UC_SAR|MCACOD, MCI_UC_S|MCACOD_L3WB) - ), MCESEV( SOME, "Action optional: unknown MCACOD", SER, MASK(MCI_STATUS_OVER|MCI_UC_SAR, MCI_UC_S) @@ -204,7 +212,7 @@ static int error_context(struct mce *m) return IN_KERNEL; } -static int mce_severity_amd_smca(struct mce *m, int err_ctx) +static int mce_severity_amd_smca(struct mce *m, enum context err_ctx) { u32 addr = MSR_AMD64_SMCA_MCx_CONFIG(m->bank); u32 low, high; @@ -245,6 +253,9 @@ static int mce_severity_amd(struct mce *m, int tolerant, char **msg, bool is_exc if (m->status & MCI_STATUS_UC) { + if (ctx == IN_KERNEL) + return MCE_PANIC_SEVERITY; + /* * On older systems where overflow_recov flag is not present, we * should simply panic if an error overflow occurs. If @@ -255,10 +266,6 @@ static int mce_severity_amd(struct mce *m, int tolerant, char **msg, bool is_exc if (mce_flags.smca) return mce_severity_amd_smca(m, ctx); - /* software can try to contain */ - if (!(m->mcgstatus & MCG_STATUS_RIPV) && (ctx == IN_KERNEL)) - return MCE_PANIC_SEVERITY; - /* kill current process */ return MCE_AR_SEVERITY; } else { diff --git a/arch/x86/kernel/cpu/mcheck/mce.c b/arch/x86/kernel/cpu/mcheck/mce.c index 3b413065c613..8ff94d1e2dce 100644 --- a/arch/x86/kernel/cpu/mcheck/mce.c +++ b/arch/x86/kernel/cpu/mcheck/mce.c @@ -14,7 +14,6 @@ #include <linux/capability.h> #include <linux/miscdevice.h> #include <linux/ratelimit.h> -#include <linux/kallsyms.h> #include <linux/rcupdate.h> #include <linux/kobject.h> #include <linux/uaccess.h> @@ -106,6 +105,10 @@ static struct irq_work mce_irq_work; static void (*quirk_no_way_out)(int bank, struct mce *m, struct pt_regs *regs); +#ifndef mce_unmap_kpfn +static void mce_unmap_kpfn(unsigned long pfn); +#endif + /* * CPU/chipset specific EDAC code can register a notifier call here to print * MCE errors in a human-readable form. 
@@ -235,7 +238,7 @@ static void __print_mce(struct mce *m) m->cs, m->ip); if (m->cs == __KERNEL_CS) - print_symbol("{%s}", m->ip); + pr_cont("{%pS}", (void *)(unsigned long)m->ip); pr_cont("\n"); } @@ -503,10 +506,8 @@ static int mce_usable_address(struct mce *m) bool mce_is_memory_error(struct mce *m) { if (m->cpuvendor == X86_VENDOR_AMD) { - /* ErrCodeExt[20:16] */ - u8 xec = (m->status >> 16) & 0x1f; + return amd_mce_is_memory_error(m); - return (xec == 0x0 || xec == 0x8); } else if (m->cpuvendor == X86_VENDOR_INTEL) { /* * Intel SDM Volume 3B - 15.9.2 Compound Error Codes @@ -530,6 +531,17 @@ bool mce_is_memory_error(struct mce *m) } EXPORT_SYMBOL_GPL(mce_is_memory_error); +static bool mce_is_correctable(struct mce *m) +{ + if (m->cpuvendor == X86_VENDOR_AMD && m->status & MCI_STATUS_DEFERRED) + return false; + + if (m->status & MCI_STATUS_UC) + return false; + + return true; +} + static bool cec_add_mce(struct mce *m) { if (!m) @@ -537,7 +549,7 @@ static bool cec_add_mce(struct mce *m) /* We eat only correctable DRAM errors with usable addresses. */ if (mce_is_memory_error(m) && - !(m->status & MCI_STATUS_UC) && + mce_is_correctable(m) && mce_usable_address(m)) if (!cec_add_elem(m->addr >> PAGE_SHIFT)) return true; @@ -582,7 +594,8 @@ static int srao_decode_notifier(struct notifier_block *nb, unsigned long val, if (mce_usable_address(mce) && (mce->severity == MCE_AO_SEVERITY)) { pfn = mce->addr >> PAGE_SHIFT; - memory_failure(pfn, MCE_VECTOR, 0); + if (!memory_failure(pfn, 0)) + mce_unmap_kpfn(pfn); } return NOTIFY_OK; @@ -1046,15 +1059,16 @@ static int do_memory_failure(struct mce *m) pr_err("Uncorrected hardware memory error in user-access at %llx", m->addr); if (!(m->mcgstatus & MCG_STATUS_RIPV)) flags |= MF_MUST_KILL; - ret = memory_failure(m->addr >> PAGE_SHIFT, MCE_VECTOR, flags); + ret = memory_failure(m->addr >> PAGE_SHIFT, flags); if (ret) pr_err("Memory error not recovered"); + else + mce_unmap_kpfn(m->addr >> PAGE_SHIFT); return ret; } -#if defined(arch_unmap_kpfn) && defined(CONFIG_MEMORY_FAILURE) - -void arch_unmap_kpfn(unsigned long pfn) +#ifndef mce_unmap_kpfn +static void mce_unmap_kpfn(unsigned long pfn) { unsigned long decoy_addr; @@ -1065,7 +1079,7 @@ void arch_unmap_kpfn(unsigned long pfn) * We would like to just call: * set_memory_np((unsigned long)pfn_to_kaddr(pfn), 1); * but doing that would radically increase the odds of a - * speculative access to the posion page because we'd have + * speculative access to the poison page because we'd have * the virtual address of the kernel 1:1 mapping sitting * around in registers. * Instead we get tricky. 
We create a non-canonical address @@ -1090,7 +1104,6 @@ void arch_unmap_kpfn(unsigned long pfn) if (set_memory_np(decoy_addr, 1)) pr_warn("Could not invalidate pfn=0x%lx from 1:1 map\n", pfn); - } #endif @@ -1325,7 +1338,7 @@ out_ist: EXPORT_SYMBOL_GPL(do_machine_check); #ifndef CONFIG_MEMORY_FAILURE -int memory_failure(unsigned long pfn, int vector, int flags) +int memory_failure(unsigned long pfn, int flags) { /* mce_severity() should not hand us an ACTION_REQUIRED error */ BUG_ON(flags & MF_ACTION_REQUIRED); @@ -1367,13 +1380,12 @@ static void __start_timer(struct timer_list *t, unsigned long interval) local_irq_restore(flags); } -static void mce_timer_fn(unsigned long data) +static void mce_timer_fn(struct timer_list *t) { - struct timer_list *t = this_cpu_ptr(&mce_timer); - int cpu = smp_processor_id(); + struct timer_list *cpu_t = this_cpu_ptr(&mce_timer); unsigned long iv; - WARN_ON(cpu != data); + WARN_ON(cpu_t != t); iv = __this_cpu_read(mce_next_interval); @@ -1763,17 +1775,15 @@ static void mce_start_timer(struct timer_list *t) static void __mcheck_cpu_setup_timer(void) { struct timer_list *t = this_cpu_ptr(&mce_timer); - unsigned int cpu = smp_processor_id(); - setup_pinned_timer(t, mce_timer_fn, cpu); + timer_setup(t, mce_timer_fn, TIMER_PINNED); } static void __mcheck_cpu_init_timer(void) { struct timer_list *t = this_cpu_ptr(&mce_timer); - unsigned int cpu = smp_processor_id(); - setup_pinned_timer(t, mce_timer_fn, cpu); + timer_setup(t, mce_timer_fn, TIMER_PINNED); mce_start_timer(t); } @@ -1788,6 +1798,11 @@ static void unexpected_machine_check(struct pt_regs *regs, long error_code) void (*machine_check_vector)(struct pt_regs *, long error_code) = unexpected_machine_check; +dotraplinkage void do_mce(struct pt_regs *regs, long error_code) +{ + machine_check_vector(regs, error_code); +} + /* * Called for each booted CPU to set up machine checks. 
* Must be called with preempt off: diff --git a/arch/x86/kernel/cpu/mcheck/mce_amd.c b/arch/x86/kernel/cpu/mcheck/mce_amd.c index 486f640b02ef..0f32ad242324 100644 --- a/arch/x86/kernel/cpu/mcheck/mce_amd.c +++ b/arch/x86/kernel/cpu/mcheck/mce_amd.c @@ -110,6 +110,20 @@ const char *smca_get_long_name(enum smca_bank_types t) } EXPORT_SYMBOL_GPL(smca_get_long_name); +static enum smca_bank_types smca_get_bank_type(struct mce *m) +{ + struct smca_bank *b; + + if (m->bank >= N_SMCA_BANK_TYPES) + return N_SMCA_BANK_TYPES; + + b = &smca_banks[m->bank]; + if (!b->hwid) + return N_SMCA_BANK_TYPES; + + return b->hwid->bank_type; +} + static struct smca_hwid smca_hwid_mcatypes[] = { /* { bank_type, hwid_mcatype, xec_bitmap } */ @@ -407,7 +421,9 @@ static void deferred_error_interrupt_enable(struct cpuinfo_x86 *c) (deferred_error_int_vector != amd_deferred_error_interrupt)) deferred_error_int_vector = amd_deferred_error_interrupt; - low = (low & ~MASK_DEF_INT_TYPE) | DEF_INT_TYPE_APIC; + if (!mce_flags.smca) + low = (low & ~MASK_DEF_INT_TYPE) | DEF_INT_TYPE_APIC; + wrmsr(MSR_CU_DEF_ERR, low, high); } @@ -738,6 +754,17 @@ out_err: } EXPORT_SYMBOL_GPL(umc_normaddr_to_sysaddr); +bool amd_mce_is_memory_error(struct mce *m) +{ + /* ErrCodeExt[20:16] */ + u8 xec = (m->status >> 16) & 0x1f; + + if (mce_flags.smca) + return smca_get_bank_type(m) == SMCA_UMC && xec == 0x0; + + return m->bank == 4 && xec == 0x8; +} + static void __log_error(unsigned int bank, u64 status, u64 addr, u64 misc) { struct mce m; diff --git a/arch/x86/kernel/cpu/microcode/amd.c b/arch/x86/kernel/cpu/microcode/amd.c index c6daec4bdba5..a998e1a7d46f 100644 --- a/arch/x86/kernel/cpu/microcode/amd.c +++ b/arch/x86/kernel/cpu/microcode/amd.c @@ -470,6 +470,7 @@ static unsigned int verify_patch_size(u8 family, u32 patch_size, #define F14H_MPB_MAX_SIZE 1824 #define F15H_MPB_MAX_SIZE 4096 #define F16H_MPB_MAX_SIZE 3458 +#define F17H_MPB_MAX_SIZE 3200 switch (family) { case 0x14: @@ -481,6 +482,9 @@ static unsigned int verify_patch_size(u8 family, u32 patch_size, case 0x16: max_size = F16H_MPB_MAX_SIZE; break; + case 0x17: + max_size = F17H_MPB_MAX_SIZE; + break; default: max_size = F1XH_MPB_MAX_SIZE; break; @@ -494,7 +498,7 @@ static unsigned int verify_patch_size(u8 family, u32 patch_size, return patch_size; } -static int apply_microcode_amd(int cpu) +static enum ucode_state apply_microcode_amd(int cpu) { struct cpuinfo_x86 *c = &cpu_data(cpu); struct microcode_amd *mc_amd; @@ -508,7 +512,7 @@ static int apply_microcode_amd(int cpu) p = find_patch(cpu); if (!p) - return 0; + return UCODE_NFOUND; mc_amd = p->data; uci->mc = p->data; @@ -519,13 +523,13 @@ static int apply_microcode_amd(int cpu) if (rev >= mc_amd->hdr.patch_id) { c->microcode = rev; uci->cpu_sig.rev = rev; - return 0; + return UCODE_OK; } if (__apply_microcode_amd(mc_amd)) { pr_err("CPU%d: update failed for patch_level=0x%08x\n", cpu, mc_amd->hdr.patch_id); - return -1; + return UCODE_ERROR; } pr_info("CPU%d: new patch_level=0x%08x\n", cpu, mc_amd->hdr.patch_id); @@ -533,7 +537,7 @@ static int apply_microcode_amd(int cpu) uci->cpu_sig.rev = mc_amd->hdr.patch_id; c->microcode = mc_amd->hdr.patch_id; - return 0; + return UCODE_UPDATED; } static int install_equiv_cpu_table(const u8 *buf) diff --git a/arch/x86/kernel/cpu/microcode/core.c b/arch/x86/kernel/cpu/microcode/core.c index c4fa4a85d4cb..aa1b9a422f2b 100644 --- a/arch/x86/kernel/cpu/microcode/core.c +++ b/arch/x86/kernel/cpu/microcode/core.c @@ -239,7 +239,7 @@ static int __init save_microcode_in_initrd(void) break; case 
X86_VENDOR_AMD: if (c->x86 >= 0x10) - return save_microcode_in_initrd_amd(cpuid_eax(1)); + ret = save_microcode_in_initrd_amd(cpuid_eax(1)); break; default: break; @@ -374,7 +374,7 @@ static int collect_cpu_info(int cpu) } struct apply_microcode_ctx { - int err; + enum ucode_state err; }; static void apply_microcode_local(void *arg) @@ -489,31 +489,30 @@ static void __exit microcode_dev_exit(void) /* fake device for request_firmware */ static struct platform_device *microcode_pdev; -static int reload_for_cpu(int cpu) +static enum ucode_state reload_for_cpu(int cpu) { struct ucode_cpu_info *uci = ucode_cpu_info + cpu; enum ucode_state ustate; - int err = 0; if (!uci->valid) - return err; + return UCODE_OK; ustate = microcode_ops->request_microcode_fw(cpu, &microcode_pdev->dev, true); - if (ustate == UCODE_OK) - apply_microcode_on_target(cpu); - else - if (ustate == UCODE_ERROR) - err = -EINVAL; - return err; + if (ustate != UCODE_OK) + return ustate; + + return apply_microcode_on_target(cpu); } static ssize_t reload_store(struct device *dev, struct device_attribute *attr, const char *buf, size_t size) { + enum ucode_state tmp_ret = UCODE_OK; + bool do_callback = false; unsigned long val; + ssize_t ret = 0; int cpu; - ssize_t ret = 0, tmp_ret; ret = kstrtoul(buf, 0, &val); if (ret) @@ -526,15 +525,21 @@ static ssize_t reload_store(struct device *dev, mutex_lock(&microcode_mutex); for_each_online_cpu(cpu) { tmp_ret = reload_for_cpu(cpu); - if (tmp_ret != 0) + if (tmp_ret > UCODE_NFOUND) { pr_warn("Error reloading microcode on CPU %d\n", cpu); - /* save retval of the first encountered reload error */ - if (!ret) - ret = tmp_ret; + /* set retval for the first encountered reload error */ + if (!ret) + ret = -EINVAL; + } + + if (tmp_ret == UCODE_UPDATED) + do_callback = true; } - if (!ret) - perf_check_microcode(); + + if (!ret && do_callback) + microcode_check(); + mutex_unlock(&microcode_mutex); put_online_cpus(); @@ -560,7 +565,7 @@ static ssize_t pf_show(struct device *dev, return sprintf(buf, "0x%x\n", uci->cpu_sig.pf); } -static DEVICE_ATTR(reload, 0200, NULL, reload_store); +static DEVICE_ATTR_WO(reload); static DEVICE_ATTR(version, 0400, version_show, NULL); static DEVICE_ATTR(processor_flags, 0400, pf_show, NULL); diff --git a/arch/x86/kernel/cpu/microcode/intel.c b/arch/x86/kernel/cpu/microcode/intel.c index 7dbcb7adf797..923054a6b760 100644 --- a/arch/x86/kernel/cpu/microcode/intel.c +++ b/arch/x86/kernel/cpu/microcode/intel.c @@ -45,6 +45,9 @@ static const char ucode_path[] = "kernel/x86/microcode/GenuineIntel.bin"; /* Current microcode patch used in early patching on the APs. */ static struct microcode_intel *intel_ucode_patch; +/* last level cache size per core */ +static int llc_size_per_core; + static inline bool cpu_signatures_match(unsigned int s1, unsigned int p1, unsigned int s2, unsigned int p2) { @@ -565,15 +568,6 @@ static void print_ucode(struct ucode_cpu_info *uci) } #else -/* - * Flush global tlb. We only do this in x86_64 where paging has been enabled - * already and PGE should be enabled as well. - */ -static inline void flush_tlb_early(void) -{ - __native_flush_tlb_global_irq_disabled(); -} - static inline void print_ucode(struct ucode_cpu_info *uci) { struct microcode_intel *mc; @@ -602,10 +596,6 @@ static int apply_microcode_early(struct ucode_cpu_info *uci, bool early) if (rev != mc->hdr.rev) return -1; -#ifdef CONFIG_X86_64 - /* Flush global tlb. This is precaution. 
*/ - flush_tlb_early(); -#endif uci->cpu_sig.rev = rev; if (early) @@ -782,7 +772,7 @@ static int collect_cpu_info(int cpu_num, struct cpu_signature *csig) return 0; } -static int apply_microcode_intel(int cpu) +static enum ucode_state apply_microcode_intel(int cpu) { struct microcode_intel *mc; struct ucode_cpu_info *uci; @@ -792,7 +782,7 @@ static int apply_microcode_intel(int cpu) /* We should bind the task to the CPU */ if (WARN_ON(raw_smp_processor_id() != cpu)) - return -1; + return UCODE_ERROR; uci = ucode_cpu_info + cpu; mc = uci->mc; @@ -800,7 +790,7 @@ static int apply_microcode_intel(int cpu) /* Look for a newer patch in our cache: */ mc = find_patch(uci); if (!mc) - return 0; + return UCODE_NFOUND; } /* write microcode via MSR 0x79 */ @@ -811,7 +801,7 @@ static int apply_microcode_intel(int cpu) if (rev != mc->hdr.rev) { pr_err("CPU%d update to revision 0x%x failed\n", cpu, mc->hdr.rev); - return -1; + return UCODE_ERROR; } if (rev != prev_rev) { @@ -828,7 +818,7 @@ static int apply_microcode_intel(int cpu) uci->cpu_sig.rev = rev; c->microcode = rev; - return 0; + return UCODE_UPDATED; } static enum ucode_state generic_load_microcode(int cpu, void *data, size_t size, @@ -923,8 +913,19 @@ static bool is_blacklisted(unsigned int cpu) { struct cpuinfo_x86 *c = &cpu_data(cpu); - if (c->x86 == 6 && c->x86_model == INTEL_FAM6_BROADWELL_X) { - pr_err_once("late loading on model 79 is disabled.\n"); + /* + * Late loading on model 79 with microcode revision less than 0x0b000021 + * and LLC size per core bigger than 2.5MB may result in a system hang. + * This behavior is documented in item BDF90, #334165 (Intel Xeon + * Processor E7-8800/4800 v4 Product Family). + */ + if (c->x86 == 6 && + c->x86_model == INTEL_FAM6_BROADWELL_X && + c->x86_stepping == 0x01 && + llc_size_per_core > 2621440 && + c->microcode < 0x0b000021) { + pr_err_once("Erratum BDF90: late loading with revision < 0x0b000021 (0x%x) disabled.\n", c->microcode); + pr_err_once("Please consider either early loading through initrd/built-in or a potential BIOS update.\n"); return true; } @@ -943,7 +944,7 @@ static enum ucode_state request_microcode_fw(int cpu, struct device *device, return UCODE_NFOUND; sprintf(name, "intel-ucode/%02x-%02x-%02x", - c->x86, c->x86_model, c->x86_mask); + c->x86, c->x86_model, c->x86_stepping); if (request_firmware_direct(&firmware, name, device)) { pr_debug("data file %s load failed\n", name); @@ -979,6 +980,15 @@ static struct microcode_ops microcode_intel_ops = { .apply_microcode = apply_microcode_intel, }; +static int __init calc_llc_size_per_core(struct cpuinfo_x86 *c) +{ + u64 llc_size = c->x86_cache_size * 1024ULL; + + do_div(llc_size, c->x86_max_cores); + + return (int)llc_size; +} + struct microcode_ops * __init init_intel_microcode(void) { struct cpuinfo_x86 *c = &boot_cpu_data; @@ -989,5 +999,7 @@ struct microcode_ops * __init init_intel_microcode(void) return NULL; } + llc_size_per_core = calc_llc_size_per_core(c); + return &microcode_intel_ops; } diff --git a/arch/x86/kernel/cpu/mshyperv.c b/arch/x86/kernel/cpu/mshyperv.c index 236324e83a3a..9340f41ce8d3 100644 --- a/arch/x86/kernel/cpu/mshyperv.c +++ b/arch/x86/kernel/cpu/mshyperv.c @@ -251,12 +251,18 @@ static void __init ms_hyperv_init_platform(void) hyperv_setup_mmu_ops(); /* Setup the IDT for hypervisor callback */ alloc_intr_gate(HYPERVISOR_CALLBACK_VECTOR, hyperv_callback_vector); + + /* Setup the IDT for reenlightenment notifications */ + if (ms_hyperv.features & HV_X64_ACCESS_REENLIGHTENMENT) + 
alloc_intr_gate(HYPERV_REENLIGHTENMENT_VECTOR, + hyperv_reenlightenment_vector); + #endif } -const __refconst struct hypervisor_x86 x86_hyper_ms_hyperv = { +const __initconst struct hypervisor_x86 x86_hyper_ms_hyperv = { .name = "Microsoft Hyper-V", .detect = ms_hyperv_platform, - .init_platform = ms_hyperv_init_platform, + .type = X86_HYPER_MS_HYPERV, + .init.init_platform = ms_hyperv_init_platform, }; -EXPORT_SYMBOL(x86_hyper_ms_hyperv); diff --git a/arch/x86/kernel/cpu/mtrr/generic.c b/arch/x86/kernel/cpu/mtrr/generic.c index fdc55215d44d..e12ee86906c6 100644 --- a/arch/x86/kernel/cpu/mtrr/generic.c +++ b/arch/x86/kernel/cpu/mtrr/generic.c @@ -859,7 +859,7 @@ int generic_validate_add_page(unsigned long base, unsigned long size, */ if (is_cpu(INTEL) && boot_cpu_data.x86 == 6 && boot_cpu_data.x86_model == 1 && - boot_cpu_data.x86_mask <= 7) { + boot_cpu_data.x86_stepping <= 7) { if (base & ((1 << (22 - PAGE_SHIFT)) - 1)) { pr_warn("mtrr: base(0x%lx000) is not 4 MiB aligned\n", base); return -EINVAL; diff --git a/arch/x86/kernel/cpu/mtrr/main.c b/arch/x86/kernel/cpu/mtrr/main.c index 40d5a8a75212..7468de429087 100644 --- a/arch/x86/kernel/cpu/mtrr/main.c +++ b/arch/x86/kernel/cpu/mtrr/main.c @@ -711,8 +711,8 @@ void __init mtrr_bp_init(void) if (boot_cpu_data.x86_vendor == X86_VENDOR_INTEL && boot_cpu_data.x86 == 0xF && boot_cpu_data.x86_model == 0x3 && - (boot_cpu_data.x86_mask == 0x3 || - boot_cpu_data.x86_mask == 0x4)) + (boot_cpu_data.x86_stepping == 0x3 || + boot_cpu_data.x86_stepping == 0x4)) phys_addr = 36; size_or_mask = SIZE_OR_MASK_BITS(phys_addr); diff --git a/arch/x86/kernel/cpu/proc.c b/arch/x86/kernel/cpu/proc.c index 4378a729b933..2c8522a39ed5 100644 --- a/arch/x86/kernel/cpu/proc.c +++ b/arch/x86/kernel/cpu/proc.c @@ -5,6 +5,8 @@ #include <linux/seq_file.h> #include <linux/cpufreq.h> +#include "cpu.h" + /* * Get CPU information for use by the procfs. */ @@ -70,15 +72,15 @@ static int show_cpuinfo(struct seq_file *m, void *v) c->x86_model, c->x86_model_id[0] ? 
c->x86_model_id : "unknown"); - if (c->x86_mask || c->cpuid_level >= 0) - seq_printf(m, "stepping\t: %d\n", c->x86_mask); + if (c->x86_stepping || c->cpuid_level >= 0) + seq_printf(m, "stepping\t: %d\n", c->x86_stepping); else seq_puts(m, "stepping\t: unknown\n"); if (c->microcode) seq_printf(m, "microcode\t: 0x%x\n", c->microcode); if (cpu_has(c, X86_FEATURE_TSC)) { - unsigned int freq = arch_freq_get_on_cpu(cpu); + unsigned int freq = aperfmperf_get_khz(cpu); if (!freq) freq = cpufreq_quick_get(cpu); @@ -89,8 +91,8 @@ static int show_cpuinfo(struct seq_file *m, void *v) } /* Cache size */ - if (c->x86_cache_size >= 0) - seq_printf(m, "cache size\t: %d KB\n", c->x86_cache_size); + if (c->x86_cache_size) + seq_printf(m, "cache size\t: %u KB\n", c->x86_cache_size); show_cpuinfo_core(m, c, cpu); show_cpuinfo_misc(m, c); diff --git a/arch/x86/kernel/cpu/scattered.c b/arch/x86/kernel/cpu/scattered.c index 05459ad3db46..772c219b6889 100644 --- a/arch/x86/kernel/cpu/scattered.c +++ b/arch/x86/kernel/cpu/scattered.c @@ -21,17 +21,16 @@ struct cpuid_bit { static const struct cpuid_bit cpuid_bits[] = { { X86_FEATURE_APERFMPERF, CPUID_ECX, 0, 0x00000006, 0 }, { X86_FEATURE_EPB, CPUID_ECX, 3, 0x00000006, 0 }, - { X86_FEATURE_INTEL_PT, CPUID_EBX, 25, 0x00000007, 0 }, - { X86_FEATURE_AVX512_4VNNIW, CPUID_EDX, 2, 0x00000007, 0 }, - { X86_FEATURE_AVX512_4FMAPS, CPUID_EDX, 3, 0x00000007, 0 }, { X86_FEATURE_CAT_L3, CPUID_EBX, 1, 0x00000010, 0 }, { X86_FEATURE_CAT_L2, CPUID_EBX, 2, 0x00000010, 0 }, { X86_FEATURE_CDP_L3, CPUID_ECX, 2, 0x00000010, 1 }, + { X86_FEATURE_CDP_L2, CPUID_ECX, 2, 0x00000010, 2 }, { X86_FEATURE_MBA, CPUID_EBX, 3, 0x00000010, 0 }, { X86_FEATURE_HW_PSTATE, CPUID_EDX, 7, 0x80000007, 0 }, { X86_FEATURE_CPB, CPUID_EDX, 9, 0x80000007, 0 }, { X86_FEATURE_PROC_FEEDBACK, CPUID_EDX, 11, 0x80000007, 0 }, { X86_FEATURE_SME, CPUID_EAX, 0, 0x8000001f, 0 }, + { X86_FEATURE_SEV, CPUID_EAX, 1, 0x8000001f, 0 }, { 0, 0, 0, 0, 0 } }; diff --git a/arch/x86/kernel/cpu/vmware.c b/arch/x86/kernel/cpu/vmware.c index 40ed26852ebd..8e005329648b 100644 --- a/arch/x86/kernel/cpu/vmware.c +++ b/arch/x86/kernel/cpu/vmware.c @@ -205,10 +205,10 @@ static bool __init vmware_legacy_x2apic_available(void) (eax & (1 << VMWARE_PORT_CMD_LEGACY_X2APIC)) != 0; } -const __refconst struct hypervisor_x86 x86_hyper_vmware = { +const __initconst struct hypervisor_x86 x86_hyper_vmware = { .name = "VMware", .detect = vmware_platform, - .init_platform = vmware_platform_setup, - .x2apic_available = vmware_legacy_x2apic_available, + .type = X86_HYPER_VMWARE, + .init.init_platform = vmware_platform_setup, + .init.x2apic_available = vmware_legacy_x2apic_available, }; -EXPORT_SYMBOL(x86_hyper_vmware); diff --git a/arch/x86/kernel/crash.c b/arch/x86/kernel/crash.c index 44404e2307bb..10e74d4778a1 100644 --- a/arch/x86/kernel/crash.c +++ b/arch/x86/kernel/crash.c @@ -209,7 +209,7 @@ void native_machine_crash_shutdown(struct pt_regs *regs) } #ifdef CONFIG_KEXEC_FILE -static int get_nr_ram_ranges_callback(u64 start, u64 end, void *arg) +static int get_nr_ram_ranges_callback(struct resource *res, void *arg) { unsigned int *nr_ranges = arg; @@ -342,7 +342,7 @@ static int elf_header_exclude_ranges(struct crash_elf_data *ced, return ret; } -static int prepare_elf64_ram_headers_callback(u64 start, u64 end, void *arg) +static int prepare_elf64_ram_headers_callback(struct resource *res, void *arg) { struct crash_elf_data *ced = arg; Elf64_Ehdr *ehdr; @@ -355,7 +355,7 @@ static int prepare_elf64_ram_headers_callback(u64 start, u64 end, void *arg) 
ehdr = ced->ehdr; /* Exclude unwanted mem ranges */ - ret = elf_header_exclude_ranges(ced, start, end); + ret = elf_header_exclude_ranges(ced, res->start, res->end); if (ret) return ret; @@ -518,14 +518,14 @@ static int add_e820_entry(struct boot_params *params, struct e820_entry *entry) return 0; } -static int memmap_entry_callback(u64 start, u64 end, void *arg) +static int memmap_entry_callback(struct resource *res, void *arg) { struct crash_memmap_data *cmd = arg; struct boot_params *params = cmd->params; struct e820_entry ei; - ei.addr = start; - ei.size = end - start + 1; + ei.addr = res->start; + ei.size = resource_size(res); ei.type = cmd->type; add_e820_entry(params, &ei); @@ -619,12 +619,12 @@ out: return ret; } -static int determine_backup_region(u64 start, u64 end, void *arg) +static int determine_backup_region(struct resource *res, void *arg) { struct kimage *image = arg; - image->arch.backup_src_start = start; - image->arch.backup_src_sz = end - start + 1; + image->arch.backup_src_start = res->start; + image->arch.backup_src_sz = resource_size(res); /* Expecting only one range for backup region */ return 1; diff --git a/arch/x86/kernel/devicetree.c b/arch/x86/kernel/devicetree.c index 76e07698e6d1..25de5f6ca997 100644 --- a/arch/x86/kernel/devicetree.c +++ b/arch/x86/kernel/devicetree.c @@ -2,7 +2,6 @@ /* * Architecture specific OF callbacks. */ -#include <linux/bootmem.h> #include <linux/export.h> #include <linux/io.h> #include <linux/interrupt.h> @@ -39,11 +38,6 @@ void __init early_init_dt_add_memory_arch(u64 base, u64 size) BUG(); } -void * __init early_init_dt_alloc_memory_arch(u64 size, u64 align) -{ - return __alloc_bootmem(size, align, __pa(MAX_DMA_ADDRESS)); -} - void __init add_dtb(u64 data) { initial_dtb = data + offsetof(struct setup_data, data); diff --git a/arch/x86/kernel/doublefault.c b/arch/x86/kernel/doublefault.c index 0e662c55ae90..0b8cedb20d6d 100644 --- a/arch/x86/kernel/doublefault.c +++ b/arch/x86/kernel/doublefault.c @@ -50,25 +50,23 @@ static void doublefault_fn(void) cpu_relax(); } -struct tss_struct doublefault_tss __cacheline_aligned = { - .x86_tss = { - .sp0 = STACK_START, - .ss0 = __KERNEL_DS, - .ldt = 0, - .io_bitmap_base = INVALID_IO_BITMAP_OFFSET, - - .ip = (unsigned long) doublefault_fn, - /* 0x2 bit is always set */ - .flags = X86_EFLAGS_SF | 0x2, - .sp = STACK_START, - .es = __USER_DS, - .cs = __KERNEL_CS, - .ss = __KERNEL_DS, - .ds = __USER_DS, - .fs = __KERNEL_PERCPU, - - .__cr3 = __pa_nodebug(swapper_pg_dir), - } +struct x86_hw_tss doublefault_tss __cacheline_aligned = { + .sp0 = STACK_START, + .ss0 = __KERNEL_DS, + .ldt = 0, + .io_bitmap_base = INVALID_IO_BITMAP_OFFSET, + + .ip = (unsigned long) doublefault_fn, + /* 0x2 bit is always set */ + .flags = X86_EFLAGS_SF | 0x2, + .sp = STACK_START, + .es = __USER_DS, + .cs = __KERNEL_CS, + .ss = __KERNEL_DS, + .ds = __USER_DS, + .fs = __KERNEL_PERCPU, + + .__cr3 = __pa_nodebug(swapper_pg_dir), }; /* dummy for do_double_fault() call */ diff --git a/arch/x86/kernel/dumpstack.c b/arch/x86/kernel/dumpstack.c index f13b4c00a5de..a2d8a3908670 100644 --- a/arch/x86/kernel/dumpstack.c +++ b/arch/x86/kernel/dumpstack.c @@ -18,6 +18,7 @@ #include <linux/nmi.h> #include <linux/sysfs.h> +#include <asm/cpu_entry_area.h> #include <asm/stacktrace.h> #include <asm/unwind.h> @@ -43,6 +44,24 @@ bool in_task_stack(unsigned long *stack, struct task_struct *task, return true; } +bool in_entry_stack(unsigned long *stack, struct stack_info *info) +{ + struct entry_stack *ss = cpu_entry_stack(smp_processor_id()); + 
+ void *begin = ss; + void *end = ss + 1; + + if ((void *)stack < begin || (void *)stack >= end) + return false; + + info->type = STACK_TYPE_ENTRY; + info->begin = begin; + info->end = end; + info->next_sp = NULL; + + return true; +} + static void printk_stack_address(unsigned long address, int reliable, char *log_lvl) { @@ -50,6 +69,39 @@ static void printk_stack_address(unsigned long address, int reliable, printk("%s %s%pB\n", log_lvl, reliable ? "" : "? ", (void *)address); } +void show_iret_regs(struct pt_regs *regs) +{ + printk(KERN_DEFAULT "RIP: %04x:%pS\n", (int)regs->cs, (void *)regs->ip); + printk(KERN_DEFAULT "RSP: %04x:%016lx EFLAGS: %08lx", (int)regs->ss, + regs->sp, regs->flags); +} + +static void show_regs_if_on_stack(struct stack_info *info, struct pt_regs *regs, + bool partial) +{ + /* + * These on_stack() checks aren't strictly necessary: the unwind code + * has already validated the 'regs' pointer. The checks are done for + * ordering reasons: if the registers are on the next stack, we don't + * want to print them out yet. Otherwise they'll be shown as part of + * the wrong stack. Later, when show_trace_log_lvl() switches to the + * next stack, this function will be called again with the same regs so + * they can be printed in the right context. + */ + if (!partial && on_stack(info, regs, sizeof(*regs))) { + __show_regs(regs, 0); + + } else if (partial && on_stack(info, (void *)regs + IRET_FRAME_OFFSET, + IRET_FRAME_SIZE)) { + /* + * When an interrupt or exception occurs in entry code, the + * full pt_regs might not have been saved yet. In that case + * just print the iret frame. + */ + show_iret_regs(regs); + } +} + void show_trace_log_lvl(struct task_struct *task, struct pt_regs *regs, unsigned long *stack, char *log_lvl) { @@ -57,11 +109,13 @@ void show_trace_log_lvl(struct task_struct *task, struct pt_regs *regs, struct stack_info stack_info = {0}; unsigned long visit_mask = 0; int graph_idx = 0; + bool partial = false; printk("%sCall Trace:\n", log_lvl); unwind_start(&state, task, regs, stack); stack = stack ? : get_stack_pointer(task, regs); + regs = unwind_get_entry_regs(&state, &partial); /* * Iterate through the stacks, starting with the current stack pointer. @@ -71,31 +125,35 @@ void show_trace_log_lvl(struct task_struct *task, struct pt_regs *regs, * - task stack * - interrupt stack * - HW exception stacks (double fault, nmi, debug, mce) + * - entry stack * - * x86-32 can have up to three stacks: + * x86-32 can have up to four stacks: * - task stack * - softirq stack * - hardirq stack + * - entry stack */ - for (regs = NULL; stack; stack = PTR_ALIGN(stack_info.next_sp, sizeof(long))) { + for ( ; stack; stack = PTR_ALIGN(stack_info.next_sp, sizeof(long))) { const char *stack_name; - /* - * If we overflowed the task stack into a guard page, jump back - * to the bottom of the usable stack. - */ - if (task_stack_page(task) - (void *)stack < PAGE_SIZE) - stack = task_stack_page(task); - - if (get_stack_info(stack, task, &stack_info, &visit_mask)) - break; + if (get_stack_info(stack, task, &stack_info, &visit_mask)) { + /* + * We weren't on a valid stack. It's possible that + * we overflowed a valid stack into a guard page. + * See if the next page up is valid so that we can + * generate some kind of backtrace if this happens. 
+ */ + stack = (unsigned long *)PAGE_ALIGN((unsigned long)stack); + if (get_stack_info(stack, task, &stack_info, &visit_mask)) + break; + } stack_name = stack_type_name(stack_info.type); if (stack_name) printk("%s <%s>\n", log_lvl, stack_name); - if (regs && on_stack(&stack_info, regs, sizeof(*regs))) - __show_regs(regs, 0); + if (regs) + show_regs_if_on_stack(&stack_info, regs, partial); /* * Scan the stack, printing any text addresses we find. At the @@ -119,7 +177,7 @@ void show_trace_log_lvl(struct task_struct *task, struct pt_regs *regs, /* * Don't print regs->ip again if it was already printed - * by __show_regs() below. + * by show_regs_if_on_stack(). */ if (regs && stack == &regs->ip) goto next; @@ -154,9 +212,9 @@ next: unwind_next_frame(&state); /* if the frame has entry regs, print them */ - regs = unwind_get_entry_regs(&state); - if (regs && on_stack(&stack_info, regs, sizeof(*regs))) - __show_regs(regs, 0); + regs = unwind_get_entry_regs(&state, &partial); + if (regs) + show_regs_if_on_stack(&stack_info, regs, partial); } if (stack_name) @@ -252,11 +310,13 @@ int __die(const char *str, struct pt_regs *regs, long err) unsigned long sp; #endif printk(KERN_DEFAULT - "%s: %04lx [#%d]%s%s%s%s\n", str, err & 0xffff, ++die_counter, + "%s: %04lx [#%d]%s%s%s%s%s\n", str, err & 0xffff, ++die_counter, IS_ENABLED(CONFIG_PREEMPT) ? " PREEMPT" : "", IS_ENABLED(CONFIG_SMP) ? " SMP" : "", debug_pagealloc_enabled() ? " DEBUG_PAGEALLOC" : "", - IS_ENABLED(CONFIG_KASAN) ? " KASAN" : ""); + IS_ENABLED(CONFIG_KASAN) ? " KASAN" : "", + IS_ENABLED(CONFIG_PAGE_TABLE_ISOLATION) ? + (boot_cpu_has(X86_FEATURE_PTI) ? " PTI" : " NOPTI") : ""); if (notify_die(DIE_OOPS, str, regs, err, current->thread.trap_nr, SIGSEGV) == NOTIFY_STOP) diff --git a/arch/x86/kernel/dumpstack_32.c b/arch/x86/kernel/dumpstack_32.c index daefae83a3aa..04170f63e3a1 100644 --- a/arch/x86/kernel/dumpstack_32.c +++ b/arch/x86/kernel/dumpstack_32.c @@ -26,6 +26,9 @@ const char *stack_type_name(enum stack_type type) if (type == STACK_TYPE_SOFTIRQ) return "SOFTIRQ"; + if (type == STACK_TYPE_ENTRY) + return "ENTRY_TRAMPOLINE"; + return NULL; } @@ -93,6 +96,9 @@ int get_stack_info(unsigned long *stack, struct task_struct *task, if (task != current) goto unknown; + if (in_entry_stack(stack, info)) + goto recursion_check; + if (in_hardirq_stack(stack, info)) goto recursion_check; diff --git a/arch/x86/kernel/dumpstack_64.c b/arch/x86/kernel/dumpstack_64.c index 88ce2ffdb110..563e28d14f2c 100644 --- a/arch/x86/kernel/dumpstack_64.c +++ b/arch/x86/kernel/dumpstack_64.c @@ -37,6 +37,15 @@ const char *stack_type_name(enum stack_type type) if (type == STACK_TYPE_IRQ) return "IRQ"; + if (type == STACK_TYPE_ENTRY) { + /* + * On 64-bit, we have a generic entry stack that we + * use for all the kernel entry points, including + * SYSENTER. 
+ */ + return "ENTRY_TRAMPOLINE"; + } + if (type >= STACK_TYPE_EXCEPTION && type <= STACK_TYPE_EXCEPTION_LAST) return exception_stack_names[type - STACK_TYPE_EXCEPTION]; @@ -115,6 +124,9 @@ int get_stack_info(unsigned long *stack, struct task_struct *task, if (in_irq_stack(stack, info)) goto recursion_check; + if (in_entry_stack(stack, info)) + goto recursion_check; + goto unknown; recursion_check: diff --git a/arch/x86/kernel/early-quirks.c b/arch/x86/kernel/early-quirks.c index 1e82f787c160..bae0d32e327b 100644 --- a/arch/x86/kernel/early-quirks.c +++ b/arch/x86/kernel/early-quirks.c @@ -243,7 +243,7 @@ static void __init intel_remapping_check(int num, int slot, int func) #define KB(x) ((x) * 1024UL) #define MB(x) (KB (KB (x))) -static size_t __init i830_tseg_size(void) +static resource_size_t __init i830_tseg_size(void) { u8 esmramc = read_pci_config_byte(0, 0, 0, I830_ESMRAMC); @@ -256,7 +256,7 @@ static size_t __init i830_tseg_size(void) return KB(512); } -static size_t __init i845_tseg_size(void) +static resource_size_t __init i845_tseg_size(void) { u8 esmramc = read_pci_config_byte(0, 0, 0, I845_ESMRAMC); u8 tseg_size = esmramc & I845_TSEG_SIZE_MASK; @@ -273,7 +273,7 @@ static size_t __init i845_tseg_size(void) return 0; } -static size_t __init i85x_tseg_size(void) +static resource_size_t __init i85x_tseg_size(void) { u8 esmramc = read_pci_config_byte(0, 0, 0, I85X_ESMRAMC); @@ -283,12 +283,12 @@ static size_t __init i85x_tseg_size(void) return MB(1); } -static size_t __init i830_mem_size(void) +static resource_size_t __init i830_mem_size(void) { return read_pci_config_byte(0, 0, 0, I830_DRB3) * MB(32); } -static size_t __init i85x_mem_size(void) +static resource_size_t __init i85x_mem_size(void) { return read_pci_config_byte(0, 0, 1, I85X_DRB3) * MB(32); } @@ -297,36 +297,36 @@ static size_t __init i85x_mem_size(void) * On 830/845/85x the stolen memory base isn't available in any * register. We need to calculate it as TOM-TSEG_SIZE-stolen_size. 
*/ -static phys_addr_t __init i830_stolen_base(int num, int slot, int func, - size_t stolen_size) +static resource_size_t __init i830_stolen_base(int num, int slot, int func, + resource_size_t stolen_size) { - return (phys_addr_t)i830_mem_size() - i830_tseg_size() - stolen_size; + return i830_mem_size() - i830_tseg_size() - stolen_size; } -static phys_addr_t __init i845_stolen_base(int num, int slot, int func, - size_t stolen_size) +static resource_size_t __init i845_stolen_base(int num, int slot, int func, + resource_size_t stolen_size) { - return (phys_addr_t)i830_mem_size() - i845_tseg_size() - stolen_size; + return i830_mem_size() - i845_tseg_size() - stolen_size; } -static phys_addr_t __init i85x_stolen_base(int num, int slot, int func, - size_t stolen_size) +static resource_size_t __init i85x_stolen_base(int num, int slot, int func, + resource_size_t stolen_size) { - return (phys_addr_t)i85x_mem_size() - i85x_tseg_size() - stolen_size; + return i85x_mem_size() - i85x_tseg_size() - stolen_size; } -static phys_addr_t __init i865_stolen_base(int num, int slot, int func, - size_t stolen_size) +static resource_size_t __init i865_stolen_base(int num, int slot, int func, + resource_size_t stolen_size) { u16 toud = 0; toud = read_pci_config_16(0, 0, 0, I865_TOUD); - return (phys_addr_t)(toud << 16) + i845_tseg_size(); + return toud * KB(64) + i845_tseg_size(); } -static phys_addr_t __init gen3_stolen_base(int num, int slot, int func, - size_t stolen_size) +static resource_size_t __init gen3_stolen_base(int num, int slot, int func, + resource_size_t stolen_size) { u32 bsm; @@ -337,10 +337,10 @@ static phys_addr_t __init gen3_stolen_base(int num, int slot, int func, */ bsm = read_pci_config(num, slot, func, INTEL_BSM); - return (phys_addr_t)bsm & INTEL_BSM_MASK; + return bsm & INTEL_BSM_MASK; } -static size_t __init i830_stolen_size(int num, int slot, int func) +static resource_size_t __init i830_stolen_size(int num, int slot, int func) { u16 gmch_ctrl; u16 gms; @@ -361,7 +361,7 @@ static size_t __init i830_stolen_size(int num, int slot, int func) return 0; } -static size_t __init gen3_stolen_size(int num, int slot, int func) +static resource_size_t __init gen3_stolen_size(int num, int slot, int func) { u16 gmch_ctrl; u16 gms; @@ -390,7 +390,7 @@ static size_t __init gen3_stolen_size(int num, int slot, int func) return 0; } -static size_t __init gen6_stolen_size(int num, int slot, int func) +static resource_size_t __init gen6_stolen_size(int num, int slot, int func) { u16 gmch_ctrl; u16 gms; @@ -398,10 +398,10 @@ static size_t __init gen6_stolen_size(int num, int slot, int func) gmch_ctrl = read_pci_config_16(num, slot, func, SNB_GMCH_CTRL); gms = (gmch_ctrl >> SNB_GMCH_GMS_SHIFT) & SNB_GMCH_GMS_MASK; - return (size_t)gms * MB(32); + return gms * MB(32); } -static size_t __init gen8_stolen_size(int num, int slot, int func) +static resource_size_t __init gen8_stolen_size(int num, int slot, int func) { u16 gmch_ctrl; u16 gms; @@ -409,10 +409,10 @@ static size_t __init gen8_stolen_size(int num, int slot, int func) gmch_ctrl = read_pci_config_16(num, slot, func, SNB_GMCH_CTRL); gms = (gmch_ctrl >> BDW_GMCH_GMS_SHIFT) & BDW_GMCH_GMS_MASK; - return (size_t)gms * MB(32); + return gms * MB(32); } -static size_t __init chv_stolen_size(int num, int slot, int func) +static resource_size_t __init chv_stolen_size(int num, int slot, int func) { u16 gmch_ctrl; u16 gms; @@ -426,14 +426,14 @@ static size_t __init chv_stolen_size(int num, int slot, int func) * 0x17 to 0x1d: 4MB increments start at 36MB */ if 
(gms < 0x11) - return (size_t)gms * MB(32); + return gms * MB(32); else if (gms < 0x17) - return (size_t)(gms - 0x11 + 2) * MB(4); + return (gms - 0x11) * MB(4) + MB(8); else - return (size_t)(gms - 0x17 + 9) * MB(4); + return (gms - 0x17) * MB(4) + MB(36); } -static size_t __init gen9_stolen_size(int num, int slot, int func) +static resource_size_t __init gen9_stolen_size(int num, int slot, int func) { u16 gmch_ctrl; u16 gms; @@ -444,14 +444,15 @@ static size_t __init gen9_stolen_size(int num, int slot, int func) /* 0x0 to 0xef: 32MB increments starting at 0MB */ /* 0xf0 to 0xfe: 4MB increments starting at 4MB */ if (gms < 0xf0) - return (size_t)gms * MB(32); + return gms * MB(32); else - return (size_t)(gms - 0xf0 + 1) * MB(4); + return (gms - 0xf0) * MB(4) + MB(4); } struct intel_early_ops { - size_t (*stolen_size)(int num, int slot, int func); - phys_addr_t (*stolen_base)(int num, int slot, int func, size_t size); + resource_size_t (*stolen_size)(int num, int slot, int func); + resource_size_t (*stolen_base)(int num, int slot, int func, + resource_size_t size); }; static const struct intel_early_ops i830_early_ops __initconst = { @@ -527,16 +528,20 @@ static const struct pci_device_id intel_early_ids[] __initconst = { INTEL_SKL_IDS(&gen9_early_ops), INTEL_BXT_IDS(&gen9_early_ops), INTEL_KBL_IDS(&gen9_early_ops), + INTEL_CFL_IDS(&gen9_early_ops), INTEL_GLK_IDS(&gen9_early_ops), INTEL_CNL_IDS(&gen9_early_ops), }; +struct resource intel_graphics_stolen_res __ro_after_init = DEFINE_RES_MEM(0, 0); +EXPORT_SYMBOL(intel_graphics_stolen_res); + static void __init intel_graphics_stolen(int num, int slot, int func, const struct intel_early_ops *early_ops) { - phys_addr_t base, end; - size_t size; + resource_size_t base, size; + resource_size_t end; size = early_ops->stolen_size(num, slot, func); base = early_ops->stolen_base(num, slot, func, size); @@ -545,8 +550,12 @@ intel_graphics_stolen(int num, int slot, int func, return; end = base + size - 1; - printk(KERN_INFO "Reserving Intel graphics memory at %pa-%pa\n", - &base, &end); + + intel_graphics_stolen_res.start = base; + intel_graphics_stolen_res.end = end; + + printk(KERN_INFO "Reserving Intel graphics memory at %pR\n", + &intel_graphics_stolen_res); /* Mark this space as reserved */ e820__range_add(base, size, E820_TYPE_RESERVED); diff --git a/arch/x86/kernel/espfix_64.c b/arch/x86/kernel/espfix_64.c index 9c4e7ba6870c..e5ec3cafa72e 100644 --- a/arch/x86/kernel/espfix_64.c +++ b/arch/x86/kernel/espfix_64.c @@ -57,7 +57,7 @@ # error "Need more virtual address space for the ESPFIX hack" #endif -#define PGALLOC_GFP (GFP_KERNEL | __GFP_NOTRACK | __GFP_ZERO) +#define PGALLOC_GFP (GFP_KERNEL | __GFP_ZERO) /* This contains the *bottom* address of the espfix stack */ DEFINE_PER_CPU_READ_MOSTLY(unsigned long, espfix_stack); @@ -155,14 +155,14 @@ void init_espfix_ap(int cpu) page = cpu/ESPFIX_STACKS_PER_PAGE; /* Did another CPU already set this up? */ - stack_page = ACCESS_ONCE(espfix_pages[page]); + stack_page = READ_ONCE(espfix_pages[page]); if (likely(stack_page)) goto done; mutex_lock(&espfix_init_mutex); /* Did we race on the lock? 
*/ - stack_page = ACCESS_ONCE(espfix_pages[page]); + stack_page = READ_ONCE(espfix_pages[page]); if (stack_page) goto unlock_done; @@ -200,7 +200,7 @@ void init_espfix_ap(int cpu) set_pte(&pte_p[n*PTE_STRIDE], pte); /* Job is done for this CPU and any CPU which shares this page */ - ACCESS_ONCE(espfix_pages[page]) = stack_page; + WRITE_ONCE(espfix_pages[page], stack_page); unlock_done: mutex_unlock(&espfix_init_mutex); diff --git a/arch/x86/kernel/fpu/init.c b/arch/x86/kernel/fpu/init.c index 7affb7e3d9a5..6abd83572b01 100644 --- a/arch/x86/kernel/fpu/init.c +++ b/arch/x86/kernel/fpu/init.c @@ -249,6 +249,10 @@ static void __init fpu__init_system_ctx_switch(void) */ static void __init fpu__init_parse_early_param(void) { + char arg[32]; + char *argptr = arg; + int bit; + if (cmdline_find_option_bool(boot_command_line, "no387")) setup_clear_cpu_cap(X86_FEATURE_FPU); @@ -266,6 +270,13 @@ static void __init fpu__init_parse_early_param(void) if (cmdline_find_option_bool(boot_command_line, "noxsaves")) setup_clear_cpu_cap(X86_FEATURE_XSAVES); + + if (cmdline_find_option(boot_command_line, "clearcpuid", arg, + sizeof(arg)) && + get_option(&argptr, &bit) && + bit >= 0 && + bit < NCAPINTS * 32) + setup_clear_cpu_cap(bit); } /* diff --git a/arch/x86/kernel/fpu/xstate.c b/arch/x86/kernel/fpu/xstate.c index f1d5476c9022..87a57b7642d3 100644 --- a/arch/x86/kernel/fpu/xstate.c +++ b/arch/x86/kernel/fpu/xstate.c @@ -15,6 +15,7 @@ #include <asm/fpu/xstate.h> #include <asm/tlbflush.h> +#include <asm/cpufeature.h> /* * Although we spell it out in here, the Processor Trace @@ -36,6 +37,19 @@ static const char *xfeature_names[] = "unknown xstate feature" , }; +static short xsave_cpuid_features[] __initdata = { + X86_FEATURE_FPU, + X86_FEATURE_XMM, + X86_FEATURE_AVX, + X86_FEATURE_MPX, + X86_FEATURE_MPX, + X86_FEATURE_AVX512F, + X86_FEATURE_AVX512F, + X86_FEATURE_AVX512F, + X86_FEATURE_INTEL_PT, + X86_FEATURE_PKU, +}; + /* * Mask of xstate features supported by the CPU and the kernel: */ @@ -59,26 +73,6 @@ unsigned int fpu_user_xstate_size; void fpu__xstate_clear_all_cpu_caps(void) { setup_clear_cpu_cap(X86_FEATURE_XSAVE); - setup_clear_cpu_cap(X86_FEATURE_XSAVEOPT); - setup_clear_cpu_cap(X86_FEATURE_XSAVEC); - setup_clear_cpu_cap(X86_FEATURE_XSAVES); - setup_clear_cpu_cap(X86_FEATURE_AVX); - setup_clear_cpu_cap(X86_FEATURE_AVX2); - setup_clear_cpu_cap(X86_FEATURE_AVX512F); - setup_clear_cpu_cap(X86_FEATURE_AVX512IFMA); - setup_clear_cpu_cap(X86_FEATURE_AVX512PF); - setup_clear_cpu_cap(X86_FEATURE_AVX512ER); - setup_clear_cpu_cap(X86_FEATURE_AVX512CD); - setup_clear_cpu_cap(X86_FEATURE_AVX512DQ); - setup_clear_cpu_cap(X86_FEATURE_AVX512BW); - setup_clear_cpu_cap(X86_FEATURE_AVX512VL); - setup_clear_cpu_cap(X86_FEATURE_MPX); - setup_clear_cpu_cap(X86_FEATURE_XGETBV1); - setup_clear_cpu_cap(X86_FEATURE_AVX512VBMI); - setup_clear_cpu_cap(X86_FEATURE_PKU); - setup_clear_cpu_cap(X86_FEATURE_AVX512_4VNNIW); - setup_clear_cpu_cap(X86_FEATURE_AVX512_4FMAPS); - setup_clear_cpu_cap(X86_FEATURE_AVX512_VPOPCNTDQ); } /* @@ -726,6 +720,7 @@ void __init fpu__init_system_xstate(void) unsigned int eax, ebx, ecx, edx; static int on_boot_cpu __initdata = 1; int err; + int i; WARN_ON_FPU(!on_boot_cpu); on_boot_cpu = 0; @@ -759,6 +754,14 @@ void __init fpu__init_system_xstate(void) goto out_disable; } + /* + * Clear XSAVE features that are disabled in the normal CPUID. 
+ */ + for (i = 0; i < ARRAY_SIZE(xsave_cpuid_features); i++) { + if (!boot_cpu_has(xsave_cpuid_features[i])) + xfeatures_mask &= ~BIT(i); + } + xfeatures_mask &= fpu__get_supported_xfeatures_mask(); /* Enable xstate instructions to be able to continue with initialization: */ diff --git a/arch/x86/kernel/ftrace_32.S b/arch/x86/kernel/ftrace_32.S index b6c6468e10bc..4c8440de3355 100644 --- a/arch/x86/kernel/ftrace_32.S +++ b/arch/x86/kernel/ftrace_32.S @@ -8,6 +8,7 @@ #include <asm/segment.h> #include <asm/export.h> #include <asm/ftrace.h> +#include <asm/nospec-branch.h> #ifdef CC_USING_FENTRY # define function_hook __fentry__ @@ -197,7 +198,8 @@ ftrace_stub: movl 0x4(%ebp), %edx subl $MCOUNT_INSN_SIZE, %eax - call *ftrace_trace_function + movl ftrace_trace_function, %ecx + CALL_NOSPEC %ecx popl %edx popl %ecx @@ -241,5 +243,5 @@ return_to_handler: movl %eax, %ecx popl %edx popl %eax - jmp *%ecx + JMP_NOSPEC %ecx #endif diff --git a/arch/x86/kernel/ftrace_64.S b/arch/x86/kernel/ftrace_64.S index c832291d948a..91b2cff4b79a 100644 --- a/arch/x86/kernel/ftrace_64.S +++ b/arch/x86/kernel/ftrace_64.S @@ -7,7 +7,8 @@ #include <asm/ptrace.h> #include <asm/ftrace.h> #include <asm/export.h> - +#include <asm/nospec-branch.h> +#include <asm/unwind_hints.h> .code64 .section .entry.text, "ax" @@ -20,7 +21,6 @@ EXPORT_SYMBOL(__fentry__) EXPORT_SYMBOL(mcount) #endif -/* All cases save the original rbp (8 bytes) */ #ifdef CONFIG_FRAME_POINTER # ifdef CC_USING_FENTRY /* Save parent and function stack frames (rip and rbp) */ @@ -31,7 +31,7 @@ EXPORT_SYMBOL(mcount) # endif #else /* No need to save a stack frame */ -# define MCOUNT_FRAME_SIZE 8 +# define MCOUNT_FRAME_SIZE 0 #endif /* CONFIG_FRAME_POINTER */ /* Size of stack used to save mcount regs in save_mcount_regs */ @@ -64,10 +64,10 @@ EXPORT_SYMBOL(mcount) */ .macro save_mcount_regs added=0 - /* Always save the original rbp */ +#ifdef CONFIG_FRAME_POINTER + /* Save the original rbp */ pushq %rbp -#ifdef CONFIG_FRAME_POINTER /* * Stack traces will stop at the ftrace trampoline if the frame pointer * is not set up properly. If fentry is used, we need to save a frame @@ -105,7 +105,11 @@ EXPORT_SYMBOL(mcount) * Save the original RBP. Even though the mcount ABI does not * require this, it helps out callers. */ +#ifdef CONFIG_FRAME_POINTER movq MCOUNT_REG_SIZE-8(%rsp), %rdx +#else + movq %rbp, %rdx +#endif movq %rdx, RBP(%rsp) /* Copy the parent address into %rsi (second parameter) */ @@ -148,7 +152,7 @@ EXPORT_SYMBOL(mcount) ENTRY(function_hook) retq -END(function_hook) +ENDPROC(function_hook) ENTRY(ftrace_caller) /* save_mcount_regs fills in first two parameters */ @@ -184,7 +188,7 @@ GLOBAL(ftrace_graph_call) /* This is weak to keep gas from relaxing the jumps */ WEAK(ftrace_stub) retq -END(ftrace_caller) +ENDPROC(ftrace_caller) ENTRY(ftrace_regs_caller) /* Save the current flags before any operations that can change them */ @@ -255,7 +259,7 @@ GLOBAL(ftrace_regs_caller_end) jmp ftrace_epilogue -END(ftrace_regs_caller) +ENDPROC(ftrace_regs_caller) #else /* ! CONFIG_DYNAMIC_FTRACE */ @@ -286,12 +290,12 @@ trace: * ip and parent ip are used and the list function is called when * function tracing is enabled. 
*/ - call *ftrace_trace_function - + movq ftrace_trace_function, %r8 + CALL_NOSPEC %r8 restore_mcount_regs jmp fgraph_trace -END(function_hook) +ENDPROC(function_hook) #endif /* CONFIG_DYNAMIC_FTRACE */ #ifdef CONFIG_FUNCTION_GRAPH_TRACER @@ -313,9 +317,10 @@ ENTRY(ftrace_graph_caller) restore_mcount_regs retq -END(ftrace_graph_caller) +ENDPROC(ftrace_graph_caller) -GLOBAL(return_to_handler) +ENTRY(return_to_handler) + UNWIND_HINT_EMPTY subq $24, %rsp /* Save the return values */ @@ -329,5 +334,6 @@ GLOBAL(return_to_handler) movq 8(%rsp), %rdx movq (%rsp), %rax addq $24, %rsp - jmp *%rdi + JMP_NOSPEC %rdi +END(return_to_handler) #endif diff --git a/arch/x86/kernel/head64.c b/arch/x86/kernel/head64.c index 6a5d757b9cfd..7ba5d819ebe3 100644 --- a/arch/x86/kernel/head64.c +++ b/arch/x86/kernel/head64.c @@ -157,8 +157,8 @@ unsigned long __head __startup_64(unsigned long physaddr, p = fixup_pointer(&phys_base, physaddr); *p += load_delta - sme_get_me_mask(); - /* Encrypt the kernel (if SME is active) */ - sme_encrypt_kernel(); + /* Encrypt the kernel and related (if SME is active) */ + sme_encrypt_kernel(bp); /* * Return the SME encryption mask (if SME is active) to be used as a diff --git a/arch/x86/kernel/head_32.S b/arch/x86/kernel/head_32.S index f1d528bb66a6..b59e4fb40fd9 100644 --- a/arch/x86/kernel/head_32.S +++ b/arch/x86/kernel/head_32.S @@ -37,7 +37,7 @@ #define X86 new_cpu_data+CPUINFO_x86 #define X86_VENDOR new_cpu_data+CPUINFO_x86_vendor #define X86_MODEL new_cpu_data+CPUINFO_x86_model -#define X86_MASK new_cpu_data+CPUINFO_x86_mask +#define X86_STEPPING new_cpu_data+CPUINFO_x86_stepping #define X86_HARD_MATH new_cpu_data+CPUINFO_hard_math #define X86_CPUID new_cpu_data+CPUINFO_cpuid_level #define X86_CAPABILITY new_cpu_data+CPUINFO_x86_capability @@ -212,9 +212,6 @@ ENTRY(startup_32_smp) #endif .Ldefault_entry: -#define CR0_STATE (X86_CR0_PE | X86_CR0_MP | X86_CR0_ET | \ - X86_CR0_NE | X86_CR0_WP | X86_CR0_AM | \ - X86_CR0_PG) movl $(CR0_STATE & ~X86_CR0_PG),%eax movl %eax,%cr0 @@ -335,7 +332,7 @@ ENTRY(startup_32_smp) shrb $4,%al movb %al,X86_MODEL andb $0x0f,%cl # mask mask revision - movb %cl,X86_MASK + movb %cl,X86_STEPPING movl %edx,X86_CAPABILITY .Lis486: @@ -402,7 +399,7 @@ ENTRY(early_idt_handler_array) # 24(%rsp) error code i = 0 .rept NUM_EXCEPTION_VECTORS - .ifeq (EXCEPTION_ERRCODE_MASK >> i) & 1 + .if ((EXCEPTION_ERRCODE_MASK >> i) & 1) == 0 pushl $0 # Dummy error code, to make stack frame uniform .endif pushl $i # 20(%esp) Vector number diff --git a/arch/x86/kernel/head_64.S b/arch/x86/kernel/head_64.S index 6dde3f3fc1f8..0f545b3cf926 100644 --- a/arch/x86/kernel/head_64.S +++ b/arch/x86/kernel/head_64.S @@ -23,6 +23,7 @@ #include <asm/nops.h> #include "../entry/calling.h" #include <asm/export.h> +#include <asm/nospec-branch.h> #ifdef CONFIG_PARAVIRT #include <asm/asm-offsets.h> @@ -38,11 +39,12 @@ * */ -#define p4d_index(x) (((x) >> P4D_SHIFT) & (PTRS_PER_P4D-1)) #define pud_index(x) (((x) >> PUD_SHIFT) & (PTRS_PER_PUD-1)) +#if defined(CONFIG_XEN_PV) || defined(CONFIG_XEN_PVH) PGD_PAGE_OFFSET = pgd_index(__PAGE_OFFSET_BASE) PGD_START_KERNEL = pgd_index(__START_KERNEL_map) +#endif L3_START_KERNEL = pud_index(__START_KERNEL_map) .text @@ -50,6 +52,7 @@ L3_START_KERNEL = pud_index(__START_KERNEL_map) .code64 .globl startup_64 startup_64: + UNWIND_HINT_EMPTY /* * At this point the CPU runs in 64bit mode CS.L = 1 CS.D = 0, * and someone has loaded an identity mapped page table @@ -89,6 +92,7 @@ startup_64: addq $(early_top_pgt - __START_KERNEL_map), %rax jmp 1f 
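[Editor's note] The early_idt_handler_array hunks above (head_32.S) and below (head_64.S) replace ".ifeq (EXCEPTION_ERRCODE_MASK >> i) & 1" with the equivalent but more explicit ".if ((EXCEPTION_ERRCODE_MASK >> i) & 1) == 0": a dummy error code is pushed only for vectors whose mask bit is clear, so every stub leaves a uniform stack frame. The short C sketch below just illustrates that bit test; it is not part of the patch, and the EXCEPTION_ERRCODE_MASK value shown is an assumption used for illustration, not taken from this diff.

#include <stdio.h>

#define NUM_EXCEPTION_VECTORS 32

/* Hypothetical value: one bit per exception vector that supplies a
 * hardware error code (e.g. #DF, #TS, #NP, #SS, #GP, #PF, #AC). */
#define EXCEPTION_ERRCODE_MASK 0x00027d00u

int main(void)
{
	int i;

	for (i = 0; i < NUM_EXCEPTION_VECTORS; i++) {
		/* Same condition as the rewritten .if directive. */
		if (((EXCEPTION_ERRCODE_MASK >> i) & 1) == 0)
			printf("vector %2d: stub pushes a dummy 0 error code\n", i);
		else
			printf("vector %2d: CPU pushes a real error code\n", i);
	}
	return 0;
}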
ENTRY(secondary_startup_64) + UNWIND_HINT_EMPTY /* * At this point the CPU runs in 64bit mode CS.L = 1 CS.D = 0, * and someone has loaded a mapped page table. @@ -131,8 +135,10 @@ ENTRY(secondary_startup_64) /* Ensure I am executing from virtual addresses */ movq $1f, %rax + ANNOTATE_RETPOLINE_SAFE jmp *%rax 1: + UNWIND_HINT_EMPTY /* Check if nx is implemented */ movl $0x80000001, %eax @@ -150,9 +156,6 @@ ENTRY(secondary_startup_64) 1: wrmsr /* Make changes effective */ /* Setup cr0 */ -#define CR0_STATE (X86_CR0_PE | X86_CR0_MP | X86_CR0_ET | \ - X86_CR0_NE | X86_CR0_WP | X86_CR0_AM | \ - X86_CR0_PG) movl $CR0_STATE, %eax /* Make changes effective */ movq %rax, %cr0 @@ -235,7 +238,7 @@ ENTRY(secondary_startup_64) pushq %rax # target address in negative space lretq .Lafter_lret: -ENDPROC(secondary_startup_64) +END(secondary_startup_64) #include "verify_cpu.S" @@ -247,6 +250,7 @@ ENDPROC(secondary_startup_64) */ ENTRY(start_cpu0) movq initial_stack(%rip), %rsp + UNWIND_HINT_EMPTY jmp .Ljump_to_C_code ENDPROC(start_cpu0) #endif @@ -266,26 +270,24 @@ ENDPROC(start_cpu0) .quad init_thread_union + THREAD_SIZE - SIZEOF_PTREGS __FINITDATA -bad_address: - jmp bad_address - __INIT ENTRY(early_idt_handler_array) - # 104(%rsp) %rflags - # 96(%rsp) %cs - # 88(%rsp) %rip - # 80(%rsp) error code i = 0 .rept NUM_EXCEPTION_VECTORS - .ifeq (EXCEPTION_ERRCODE_MASK >> i) & 1 - pushq $0 # Dummy error code, to make stack frame uniform + .if ((EXCEPTION_ERRCODE_MASK >> i) & 1) == 0 + UNWIND_HINT_IRET_REGS + pushq $0 # Dummy error code, to make stack frame uniform + .else + UNWIND_HINT_IRET_REGS offset=8 .endif pushq $i # 72(%rsp) Vector number jmp early_idt_handler_common + UNWIND_HINT_IRET_REGS i = i + 1 .fill early_idt_handler_array + i*EARLY_IDT_HANDLER_SIZE - ., 1, 0xcc .endr -ENDPROC(early_idt_handler_array) + UNWIND_HINT_IRET_REGS offset=16 +END(early_idt_handler_array) early_idt_handler_common: /* @@ -313,6 +315,7 @@ early_idt_handler_common: pushq %r13 /* pt_regs->r13 */ pushq %r14 /* pt_regs->r14 */ pushq %r15 /* pt_regs->r15 */ + UNWIND_HINT_REGS cmpq $14,%rsi /* Page fault? */ jnz 10f @@ -327,8 +330,8 @@ early_idt_handler_common: 20: decl early_recursion_flag(%rip) - jmp restore_regs_and_iret -ENDPROC(early_idt_handler_common) + jmp restore_regs_and_return_to_kernel +END(early_idt_handler_common) __INITDATA @@ -340,6 +343,27 @@ GLOBAL(early_recursion_flag) .balign PAGE_SIZE; \ GLOBAL(name) +#ifdef CONFIG_PAGE_TABLE_ISOLATION +/* + * Each PGD needs to be 8k long and 8k aligned. We do not + * ever go out to userspace with these, so we do not + * strictly *need* the second page, but this allows us to + * have a single set_pgd() implementation that does not + * need to worry about whether it has 4k or 8k to work + * with. 
+ * + * This ensures PGDs are 8k long: + */ +#define PTI_USER_PGD_FILL 512 +/* This ensures they are 8k-aligned: */ +#define NEXT_PGD_PAGE(name) \ + .balign 2 * PAGE_SIZE; \ +GLOBAL(name) +#else +#define NEXT_PGD_PAGE(name) NEXT_PAGE(name) +#define PTI_USER_PGD_FILL 0 +#endif + /* Automate the creation of 1 to 1 mapping pmd entries */ #define PMDS(START, PERM, COUNT) \ i = 0 ; \ @@ -349,30 +373,29 @@ GLOBAL(name) .endr __INITDATA -NEXT_PAGE(early_top_pgt) +NEXT_PGD_PAGE(early_top_pgt) .fill 511,8,0 #ifdef CONFIG_X86_5LEVEL .quad level4_kernel_pgt - __START_KERNEL_map + _PAGE_TABLE_NOENC #else .quad level3_kernel_pgt - __START_KERNEL_map + _PAGE_TABLE_NOENC #endif + .fill PTI_USER_PGD_FILL,8,0 NEXT_PAGE(early_dynamic_pgts) .fill 512*EARLY_DYNAMIC_PAGE_TABLES,8,0 .data -#ifndef CONFIG_XEN -NEXT_PAGE(init_top_pgt) - .fill 512,8,0 -#else -NEXT_PAGE(init_top_pgt) +#if defined(CONFIG_XEN_PV) || defined(CONFIG_XEN_PVH) +NEXT_PGD_PAGE(init_top_pgt) .quad level3_ident_pgt - __START_KERNEL_map + _KERNPG_TABLE_NOENC .org init_top_pgt + PGD_PAGE_OFFSET*8, 0 .quad level3_ident_pgt - __START_KERNEL_map + _KERNPG_TABLE_NOENC .org init_top_pgt + PGD_START_KERNEL*8, 0 /* (2^48-(2*1024*1024*1024))/(2^39) = 511 */ .quad level3_kernel_pgt - __START_KERNEL_map + _PAGE_TABLE_NOENC + .fill PTI_USER_PGD_FILL,8,0 NEXT_PAGE(level3_ident_pgt) .quad level2_ident_pgt - __START_KERNEL_map + _KERNPG_TABLE_NOENC @@ -382,6 +405,10 @@ NEXT_PAGE(level2_ident_pgt) * Don't set NX because code runs from these pages. */ PMDS(0, __PAGE_KERNEL_IDENT_LARGE_EXEC, PTRS_PER_PMD) +#else +NEXT_PGD_PAGE(init_top_pgt) + .fill 512,8,0 + .fill PTI_USER_PGD_FILL,8,0 #endif #ifdef CONFIG_X86_5LEVEL @@ -435,7 +462,7 @@ ENTRY(phys_base) EXPORT_SYMBOL(phys_base) #include "../../x86/xen/xen-head.S" - + __PAGE_ALIGNED_BSS NEXT_PAGE(empty_zero_page) .skip PAGE_SIZE diff --git a/arch/x86/kernel/i8259.c b/arch/x86/kernel/i8259.c index 8f5cb2c7060c..86c4439f9d74 100644 --- a/arch/x86/kernel/i8259.c +++ b/arch/x86/kernel/i8259.c @@ -114,6 +114,7 @@ static void make_8259A_irq(unsigned int irq) io_apic_irqs &= ~(1<<irq); irq_set_chip_and_handler(irq, &i8259A_chip, handle_level_irq); enable_irq(irq); + lapic_assign_legacy_vector(irq, true); } /* diff --git a/arch/x86/kernel/idt.c b/arch/x86/kernel/idt.c index 6107ee1cb8d5..56d99be3706a 100644 --- a/arch/x86/kernel/idt.c +++ b/arch/x86/kernel/idt.c @@ -56,7 +56,7 @@ struct idt_data { * Early traps running on the DEFAULT_STACK because the other interrupt * stacks work only after cpu_init(). */ -static const __initdata struct idt_data early_idts[] = { +static const __initconst struct idt_data early_idts[] = { INTG(X86_TRAP_DB, debug), SYSG(X86_TRAP_BP, int3), #ifdef CONFIG_X86_32 @@ -70,7 +70,7 @@ static const __initdata struct idt_data early_idts[] = { * the traps which use them are reinitialized with IST after cpu_init() has * set up TSS. 
*/ -static const __initdata struct idt_data def_idts[] = { +static const __initconst struct idt_data def_idts[] = { INTG(X86_TRAP_DE, divide_error), INTG(X86_TRAP_NMI, nmi), INTG(X86_TRAP_BR, bounds), @@ -92,8 +92,6 @@ static const __initdata struct idt_data def_idts[] = { INTG(X86_TRAP_DF, double_fault), #endif INTG(X86_TRAP_DB, debug), - INTG(X86_TRAP_NMI, nmi), - INTG(X86_TRAP_BP, int3), #ifdef CONFIG_X86_MCE INTG(X86_TRAP_MC, &machine_check), @@ -110,7 +108,7 @@ static const __initdata struct idt_data def_idts[] = { /* * The APIC and SMP idt entries */ -static const __initdata struct idt_data apic_idts[] = { +static const __initconst struct idt_data apic_idts[] = { #ifdef CONFIG_SMP INTG(RESCHEDULE_VECTOR, reschedule_interrupt), INTG(CALL_FUNCTION_VECTOR, call_function_interrupt), @@ -152,7 +150,7 @@ static const __initdata struct idt_data apic_idts[] = { * Early traps running on the DEFAULT_STACK because the other interrupt * stacks work only after cpu_init(). */ -static const __initdata struct idt_data early_pf_idts[] = { +static const __initconst struct idt_data early_pf_idts[] = { INTG(X86_TRAP_PF, page_fault), }; @@ -160,7 +158,7 @@ static const __initdata struct idt_data early_pf_idts[] = { * Override for the debug_idt. Same as the default, but with interrupt * stack set to DEFAULT_STACK (0). Required for NMI trap handling. */ -static const __initdata struct idt_data dbg_idts[] = { +static const __initconst struct idt_data dbg_idts[] = { INTG(X86_TRAP_DB, debug), INTG(X86_TRAP_BP, int3), }; @@ -182,7 +180,7 @@ gate_desc debug_idt_table[IDT_ENTRIES] __page_aligned_bss; * The exceptions which use Interrupt stacks. They are setup after * cpu_init() when the TSS has been initialized. */ -static const __initdata struct idt_data ist_idts[] = { +static const __initconst struct idt_data ist_idts[] = { ISTG(X86_TRAP_DB, debug, DEBUG_STACK), ISTG(X86_TRAP_NMI, nmi, NMI_STACK), SISTG(X86_TRAP_BP, int3, DEBUG_STACK), @@ -225,7 +223,7 @@ idt_setup_from_table(gate_desc *idt, const struct idt_data *t, int size, bool sy idt_init_desc(&desc, t); write_idt_entry(idt, t->vector, &desc); if (sys) - set_bit(t->vector, used_vectors); + set_bit(t->vector, system_vectors); } } @@ -313,14 +311,14 @@ void __init idt_setup_apic_and_irq_gates(void) idt_setup_from_table(idt_table, apic_idts, ARRAY_SIZE(apic_idts), true); - for_each_clear_bit_from(i, used_vectors, FIRST_SYSTEM_VECTOR) { + for_each_clear_bit_from(i, system_vectors, FIRST_SYSTEM_VECTOR) { entry = irq_entries_start + 8 * (i - FIRST_EXTERNAL_VECTOR); set_intr_gate(i, entry); } - for_each_clear_bit_from(i, used_vectors, NR_VECTORS) { + for_each_clear_bit_from(i, system_vectors, NR_VECTORS) { #ifdef CONFIG_X86_LOCAL_APIC - set_bit(i, used_vectors); + set_bit(i, system_vectors); set_intr_gate(i, spurious_interrupt); #else entry = irq_entries_start + 8 * (i - FIRST_EXTERNAL_VECTOR); @@ -358,7 +356,7 @@ void idt_invalidate(void *addr) void __init update_intr_gate(unsigned int n, const void *addr) { - if (WARN_ON_ONCE(!test_bit(n, used_vectors))) + if (WARN_ON_ONCE(!test_bit(n, system_vectors))) return; set_intr_gate(n, addr); } @@ -366,6 +364,6 @@ void __init update_intr_gate(unsigned int n, const void *addr) void alloc_intr_gate(unsigned int n, const void *addr) { BUG_ON(n < FIRST_SYSTEM_VECTOR); - if (!test_and_set_bit(n, used_vectors)) + if (!test_and_set_bit(n, system_vectors)) set_intr_gate(n, addr); } diff --git a/arch/x86/kernel/ioport.c b/arch/x86/kernel/ioport.c index 3feb648781c4..2f723301eb58 100644 --- a/arch/x86/kernel/ioport.c +++ 
b/arch/x86/kernel/ioport.c @@ -67,7 +67,7 @@ asmlinkage long sys_ioperm(unsigned long from, unsigned long num, int turn_on) * because the ->io_bitmap_max value must match the bitmap * contents: */ - tss = &per_cpu(cpu_tss, get_cpu()); + tss = &per_cpu(cpu_tss_rw, get_cpu()); if (turn_on) bitmap_clear(t->io_bitmap_ptr, from, num); diff --git a/arch/x86/kernel/irq.c b/arch/x86/kernel/irq.c index 52089c043160..45fb4d2565f8 100644 --- a/arch/x86/kernel/irq.c +++ b/arch/x86/kernel/irq.c @@ -134,7 +134,7 @@ int arch_show_interrupts(struct seq_file *p, int prec) seq_puts(p, " Machine check polls\n"); #endif #if IS_ENABLED(CONFIG_HYPERV) || defined(CONFIG_XEN) - if (test_bit(HYPERVISOR_CALLBACK_VECTOR, used_vectors)) { + if (test_bit(HYPERVISOR_CALLBACK_VECTOR, system_vectors)) { seq_printf(p, "%*s: ", prec, "HYP"); for_each_online_cpu(j) seq_printf(p, "%10u ", @@ -142,6 +142,15 @@ int arch_show_interrupts(struct seq_file *p, int prec) seq_puts(p, " Hypervisor callback interrupts\n"); } #endif +#if IS_ENABLED(CONFIG_HYPERV) + if (test_bit(HYPERV_REENLIGHTENMENT_VECTOR, system_vectors)) { + seq_printf(p, "%*s: ", prec, "HRE"); + for_each_online_cpu(j) + seq_printf(p, "%10u ", + irq_stats(j)->irq_hv_reenlightenment_count); + seq_puts(p, " Hyper-V reenlightenment interrupts\n"); + } +#endif seq_printf(p, "%*s: %10u\n", prec, "ERR", atomic_read(&irq_err_count)); #if defined(CONFIG_X86_IO_APIC) seq_printf(p, "%*s: %10u\n", prec, "MIS", atomic_read(&irq_mis_count)); @@ -219,18 +228,6 @@ __visible unsigned int __irq_entry do_IRQ(struct pt_regs *regs) /* high bit used in ret_from_ code */ unsigned vector = ~regs->orig_ax; - /* - * NB: Unlike exception entries, IRQ entries do not reliably - * handle context tracking in the low-level entry code. This is - * because syscall entries execute briefly with IRQs on before - * updating context tracking state, so we can take an IRQ from - * kernel mode with CONTEXT_USER. The low-level entry code only - * updates the context if we came from user mode, so we won't - * switch to CONTEXT_KERNEL. We'll fix that once the syscall - * code is cleaned up enough that we can cleanly defer enabling - * IRQs. - */ - entering_irq(); /* entering_irq() tells RCU that we're not quiescent. Check it. */ @@ -333,105 +330,6 @@ __visible void smp_kvm_posted_intr_nested_ipi(struct pt_regs *regs) #ifdef CONFIG_HOTPLUG_CPU - -/* These two declarations are only used in check_irq_vectors_for_cpu_disable() - * below, which is protected by stop_machine(). Putting them on the stack - * results in a stack frame overflow. Dynamically allocating could result in a - * failure so declare these two cpumasks as global. - */ -static struct cpumask affinity_new, online_new; - -/* - * This cpu is going to be removed and its vectors migrated to the remaining - * online cpus. Check to see if there are enough vectors in the remaining cpus. - * This function is protected by stop_machine(). - */ -int check_irq_vectors_for_cpu_disable(void) -{ - unsigned int this_cpu, vector, this_count, count; - struct irq_desc *desc; - struct irq_data *data; - int cpu; - - this_cpu = smp_processor_id(); - cpumask_copy(&online_new, cpu_online_mask); - cpumask_clear_cpu(this_cpu, &online_new); - - this_count = 0; - for (vector = FIRST_EXTERNAL_VECTOR; vector < NR_VECTORS; vector++) { - desc = __this_cpu_read(vector_irq[vector]); - if (IS_ERR_OR_NULL(desc)) - continue; - /* - * Protect against concurrent action removal, affinity - * changes etc. 
- */ - raw_spin_lock(&desc->lock); - data = irq_desc_get_irq_data(desc); - cpumask_copy(&affinity_new, - irq_data_get_affinity_mask(data)); - cpumask_clear_cpu(this_cpu, &affinity_new); - - /* Do not count inactive or per-cpu irqs. */ - if (!irq_desc_has_action(desc) || irqd_is_per_cpu(data)) { - raw_spin_unlock(&desc->lock); - continue; - } - - raw_spin_unlock(&desc->lock); - /* - * A single irq may be mapped to multiple cpu's - * vector_irq[] (for example IOAPIC cluster mode). In - * this case we have two possibilities: - * - * 1) the resulting affinity mask is empty; that is - * this the down'd cpu is the last cpu in the irq's - * affinity mask, or - * - * 2) the resulting affinity mask is no longer a - * subset of the online cpus but the affinity mask is - * not zero; that is the down'd cpu is the last online - * cpu in a user set affinity mask. - */ - if (cpumask_empty(&affinity_new) || - !cpumask_subset(&affinity_new, &online_new)) - this_count++; - } - /* No need to check any further. */ - if (!this_count) - return 0; - - count = 0; - for_each_online_cpu(cpu) { - if (cpu == this_cpu) - continue; - /* - * We scan from FIRST_EXTERNAL_VECTOR to first system - * vector. If the vector is marked in the used vectors - * bitmap or an irq is assigned to it, we don't count - * it as available. - * - * As this is an inaccurate snapshot anyway, we can do - * this w/o holding vector_lock. - */ - for (vector = FIRST_EXTERNAL_VECTOR; - vector < FIRST_SYSTEM_VECTOR; vector++) { - if (!test_bit(vector, used_vectors) && - IS_ERR_OR_NULL(per_cpu(vector_irq, cpu)[vector])) { - if (++count == this_count) - return 0; - } - } - } - - if (count < this_count) { - pr_warn("CPU %d disable failed: CPU has %u vectors assigned and there are only %u available.\n", - this_cpu, this_count, count); - return -ERANGE; - } - return 0; -} - /* A cpu has been removed from cpu_online_mask. Reset irq affinities. 
*/ void fixup_irqs(void) { diff --git a/arch/x86/kernel/irq_32.c b/arch/x86/kernel/irq_32.c index a83b3346a0e1..c1bdbd3d3232 100644 --- a/arch/x86/kernel/irq_32.c +++ b/arch/x86/kernel/irq_32.c @@ -20,6 +20,7 @@ #include <linux/mm.h> #include <asm/apic.h> +#include <asm/nospec-branch.h> #ifdef CONFIG_DEBUG_STACKOVERFLOW @@ -55,11 +56,11 @@ DEFINE_PER_CPU(struct irq_stack *, softirq_stack); static void call_on_stack(void *func, void *stack) { asm volatile("xchgl %%ebx,%%esp \n" - "call *%%edi \n" + CALL_NOSPEC "movl %%ebx,%%esp \n" : "=b" (stack) : "0" (stack), - "D"(func) + [thunk_target] "D"(func) : "memory", "cc", "edx", "ecx", "eax"); } @@ -95,11 +96,11 @@ static inline int execute_on_irq_stack(int overflow, struct irq_desc *desc) call_on_stack(print_stack_overflow, isp); asm volatile("xchgl %%ebx,%%esp \n" - "call *%%edi \n" + CALL_NOSPEC "movl %%ebx,%%esp \n" : "=a" (arg1), "=b" (isp) : "0" (desc), "1" (isp), - "D" (desc->handle_irq) + [thunk_target] "D" (desc->handle_irq) : "memory", "cc", "ecx"); return 1; } diff --git a/arch/x86/kernel/irq_64.c b/arch/x86/kernel/irq_64.c index 020efbf5786b..d86e344f5b3d 100644 --- a/arch/x86/kernel/irq_64.c +++ b/arch/x86/kernel/irq_64.c @@ -57,10 +57,10 @@ static inline void stack_overflow_check(struct pt_regs *regs) if (regs->sp >= estack_top && regs->sp <= estack_bottom) return; - WARN_ONCE(1, "do_IRQ(): %s has overflown the kernel stack (cur:%Lx,sp:%lx,irq stk top-bottom:%Lx-%Lx,exception stk top-bottom:%Lx-%Lx)\n", + WARN_ONCE(1, "do_IRQ(): %s has overflown the kernel stack (cur:%Lx,sp:%lx,irq stk top-bottom:%Lx-%Lx,exception stk top-bottom:%Lx-%Lx,ip:%pF)\n", current->comm, curbase, regs->sp, irq_stack_top, irq_stack_bottom, - estack_top, estack_bottom); + estack_top, estack_bottom, (void *)regs->ip); if (sysctl_panic_on_stackoverflow) panic("low stack detected by irq handler - check messages\n"); diff --git a/arch/x86/kernel/irqinit.c b/arch/x86/kernel/irqinit.c index 1e4094eba15e..a539410c4ea9 100644 --- a/arch/x86/kernel/irqinit.c +++ b/arch/x86/kernel/irqinit.c @@ -94,6 +94,7 @@ void __init native_init_IRQ(void) x86_init.irqs.pre_vector_init(); idt_setup_apic_and_irq_gates(); + lapic_assign_system_vectors(); if (!acpi_ioapic && !of_ioapic && nr_legacy_irqs()) setup_irq(2, &irq2); diff --git a/arch/x86/kernel/itmt.c b/arch/x86/kernel/itmt.c index f73f475d0573..d177940aa090 100644 --- a/arch/x86/kernel/itmt.c +++ b/arch/x86/kernel/itmt.c @@ -24,7 +24,6 @@ #include <linux/cpumask.h> #include <linux/cpuset.h> #include <linux/mutex.h> -#include <linux/sched.h> #include <linux/sysctl.h> #include <linux/nodemask.h> diff --git a/arch/x86/kernel/jailhouse.c b/arch/x86/kernel/jailhouse.c new file mode 100644 index 000000000000..b68fd895235a --- /dev/null +++ b/arch/x86/kernel/jailhouse.c @@ -0,0 +1,211 @@ +// SPDX-License-Identifier: GPL2.0 +/* + * Jailhouse paravirt_ops implementation + * + * Copyright (c) Siemens AG, 2015-2017 + * + * Authors: + * Jan Kiszka <jan.kiszka@siemens.com> + */ + +#include <linux/acpi_pmtmr.h> +#include <linux/kernel.h> +#include <linux/reboot.h> +#include <asm/apic.h> +#include <asm/cpu.h> +#include <asm/hypervisor.h> +#include <asm/i8259.h> +#include <asm/irqdomain.h> +#include <asm/pci_x86.h> +#include <asm/reboot.h> +#include <asm/setup.h> + +static __initdata struct jailhouse_setup_data setup_data; +static unsigned int precalibrated_tsc_khz; + +static uint32_t jailhouse_cpuid_base(void) +{ + if (boot_cpu_data.cpuid_level < 0 || + !boot_cpu_has(X86_FEATURE_HYPERVISOR)) + return 0; + + return 
hypervisor_cpuid_base("Jailhouse\0\0\0", 0); +} + +static uint32_t __init jailhouse_detect(void) +{ + return jailhouse_cpuid_base(); +} + +static void jailhouse_get_wallclock(struct timespec *now) +{ + memset(now, 0, sizeof(*now)); +} + +static void __init jailhouse_timer_init(void) +{ + lapic_timer_frequency = setup_data.apic_khz * (1000 / HZ); +} + +static unsigned long jailhouse_get_tsc(void) +{ + return precalibrated_tsc_khz; +} + +static void __init jailhouse_x2apic_init(void) +{ +#ifdef CONFIG_X86_X2APIC + if (!x2apic_enabled()) + return; + /* + * We do not have access to IR inside Jailhouse non-root cells. So + * we have to run in physical mode. + */ + x2apic_phys = 1; + /* + * This will trigger the switch to apic_x2apic_phys. Empty OEM IDs + * ensure that only this APIC driver picks up the call. + */ + default_acpi_madt_oem_check("", ""); +#endif +} + +static void __init jailhouse_get_smp_config(unsigned int early) +{ + struct ioapic_domain_cfg ioapic_cfg = { + .type = IOAPIC_DOMAIN_STRICT, + .ops = &mp_ioapic_irqdomain_ops, + }; + struct mpc_intsrc mp_irq = { + .type = MP_INTSRC, + .irqtype = mp_INT, + .irqflag = MP_IRQPOL_ACTIVE_HIGH | MP_IRQTRIG_EDGE, + }; + unsigned int cpu; + + jailhouse_x2apic_init(); + + register_lapic_address(0xfee00000); + + for (cpu = 0; cpu < setup_data.num_cpus; cpu++) { + generic_processor_info(setup_data.cpu_ids[cpu], + boot_cpu_apic_version); + } + + smp_found_config = 1; + + if (setup_data.standard_ioapic) { + mp_register_ioapic(0, 0xfec00000, gsi_top, &ioapic_cfg); + + /* Register 1:1 mapping for legacy UART IRQs 3 and 4 */ + mp_irq.srcbusirq = mp_irq.dstirq = 3; + mp_save_irq(&mp_irq); + + mp_irq.srcbusirq = mp_irq.dstirq = 4; + mp_save_irq(&mp_irq); + } +} + +static void jailhouse_no_restart(void) +{ + pr_notice("Jailhouse: Restart not supported, halting\n"); + machine_halt(); +} + +static int __init jailhouse_pci_arch_init(void) +{ + pci_direct_init(1); + + /* + * There are no bridges on the virtual PCI root bus under Jailhouse, + * thus no other way to discover all devices than a full scan. + * Respect any overrides via the command line, though. 
+ */ + if (pcibios_last_bus < 0) + pcibios_last_bus = 0xff; + + return 0; +} + +static void __init jailhouse_init_platform(void) +{ + u64 pa_data = boot_params.hdr.setup_data; + struct setup_data header; + void *mapping; + + x86_init.irqs.pre_vector_init = x86_init_noop; + x86_init.timers.timer_init = jailhouse_timer_init; + x86_init.mpparse.get_smp_config = jailhouse_get_smp_config; + x86_init.pci.arch_init = jailhouse_pci_arch_init; + + x86_platform.calibrate_cpu = jailhouse_get_tsc; + x86_platform.calibrate_tsc = jailhouse_get_tsc; + x86_platform.get_wallclock = jailhouse_get_wallclock; + x86_platform.legacy.rtc = 0; + x86_platform.legacy.warm_reset = 0; + x86_platform.legacy.i8042 = X86_LEGACY_I8042_PLATFORM_ABSENT; + + legacy_pic = &null_legacy_pic; + + machine_ops.emergency_restart = jailhouse_no_restart; + + while (pa_data) { + mapping = early_memremap(pa_data, sizeof(header)); + memcpy(&header, mapping, sizeof(header)); + early_memunmap(mapping, sizeof(header)); + + if (header.type == SETUP_JAILHOUSE && + header.len >= sizeof(setup_data)) { + pa_data += offsetof(struct setup_data, data); + + mapping = early_memremap(pa_data, sizeof(setup_data)); + memcpy(&setup_data, mapping, sizeof(setup_data)); + early_memunmap(mapping, sizeof(setup_data)); + + break; + } + + pa_data = header.next; + } + + if (!pa_data) + panic("Jailhouse: No valid setup data found"); + + if (setup_data.compatible_version > JAILHOUSE_SETUP_REQUIRED_VERSION) + panic("Jailhouse: Unsupported setup data structure"); + + pmtmr_ioport = setup_data.pm_timer_address; + pr_debug("Jailhouse: PM-Timer IO Port: %#x\n", pmtmr_ioport); + + precalibrated_tsc_khz = setup_data.tsc_khz; + setup_force_cpu_cap(X86_FEATURE_TSC_KNOWN_FREQ); + + pci_probe = 0; + + /* + * Avoid that the kernel complains about missing ACPI tables - there + * are none in a non-root cell. + */ + disable_acpi(); +} + +bool jailhouse_paravirt(void) +{ + return jailhouse_cpuid_base() != 0; +} + +static bool jailhouse_x2apic_available(void) +{ + /* + * The x2APIC is only available if the root cell enabled it. Jailhouse + * does not support switching between xAPIC and x2APIC. + */ + return x2apic_enabled(); +} + +const struct hypervisor_x86 x86_hyper_jailhouse __refconst = { + .name = "Jailhouse", + .detect = jailhouse_detect, + .init.init_platform = jailhouse_init_platform, + .init.x2apic_available = jailhouse_x2apic_available, +}; diff --git a/arch/x86/kernel/kprobes/common.h b/arch/x86/kernel/kprobes/common.h index 615105cf7d58..ae38dccf0c8f 100644 --- a/arch/x86/kernel/kprobes/common.h +++ b/arch/x86/kernel/kprobes/common.h @@ -85,11 +85,11 @@ extern unsigned long recover_probed_instruction(kprobe_opcode_t *buf, * Copy an instruction and adjust the displacement if the instruction * uses the %rip-relative addressing mode. 
*/ -extern int __copy_instruction(u8 *dest, u8 *src, struct insn *insn); +extern int __copy_instruction(u8 *dest, u8 *src, u8 *real, struct insn *insn); /* Generate a relative-jump/call instruction */ -extern void synthesize_reljump(void *from, void *to); -extern void synthesize_relcall(void *from, void *to); +extern void synthesize_reljump(void *dest, void *from, void *to); +extern void synthesize_relcall(void *dest, void *from, void *to); #ifdef CONFIG_OPTPROBES extern int setup_detour_execution(struct kprobe *p, struct pt_regs *regs, int reenter); diff --git a/arch/x86/kernel/kprobes/core.c b/arch/x86/kernel/kprobes/core.c index 0742491cbb73..bd36f3c33cd0 100644 --- a/arch/x86/kernel/kprobes/core.c +++ b/arch/x86/kernel/kprobes/core.c @@ -119,29 +119,29 @@ struct kretprobe_blackpoint kretprobe_blacklist[] = { const int kretprobe_blacklist_size = ARRAY_SIZE(kretprobe_blacklist); static nokprobe_inline void -__synthesize_relative_insn(void *from, void *to, u8 op) +__synthesize_relative_insn(void *dest, void *from, void *to, u8 op) { struct __arch_relative_insn { u8 op; s32 raddr; } __packed *insn; - insn = (struct __arch_relative_insn *)from; + insn = (struct __arch_relative_insn *)dest; insn->raddr = (s32)((long)(to) - ((long)(from) + 5)); insn->op = op; } /* Insert a jump instruction at address 'from', which jumps to address 'to'.*/ -void synthesize_reljump(void *from, void *to) +void synthesize_reljump(void *dest, void *from, void *to) { - __synthesize_relative_insn(from, to, RELATIVEJUMP_OPCODE); + __synthesize_relative_insn(dest, from, to, RELATIVEJUMP_OPCODE); } NOKPROBE_SYMBOL(synthesize_reljump); /* Insert a call instruction at address 'from', which calls address 'to'.*/ -void synthesize_relcall(void *from, void *to) +void synthesize_relcall(void *dest, void *from, void *to) { - __synthesize_relative_insn(from, to, RELATIVECALL_OPCODE); + __synthesize_relative_insn(dest, from, to, RELATIVECALL_OPCODE); } NOKPROBE_SYMBOL(synthesize_relcall); @@ -346,10 +346,11 @@ static int is_IF_modifier(kprobe_opcode_t *insn) /* * Copy an instruction with recovering modified instruction by kprobes * and adjust the displacement if the instruction uses the %rip-relative - * addressing mode. + * addressing mode. Note that since @real will be the final place of copied + * instruction, displacement must be adjust by @real, not @dest. * This returns the length of copied instruction, or 0 if it has an error. */ -int __copy_instruction(u8 *dest, u8 *src, struct insn *insn) +int __copy_instruction(u8 *dest, u8 *src, u8 *real, struct insn *insn) { kprobe_opcode_t buf[MAX_INSN_SIZE]; unsigned long recovered_insn = @@ -387,11 +388,11 @@ int __copy_instruction(u8 *dest, u8 *src, struct insn *insn) * have given. 
*/ newdisp = (u8 *) src + (s64) insn->displacement.value - - (u8 *) dest; + - (u8 *) real; if ((s64) (s32) newdisp != newdisp) { pr_err("Kprobes error: new displacement does not fit into s32 (%llx)\n", newdisp); pr_err("\tSrc: %p, Dest: %p, old disp: %x\n", - src, dest, insn->displacement.value); + src, real, insn->displacement.value); return 0; } disp = (u8 *) dest + insn_offset_displacement(insn); @@ -402,20 +403,38 @@ int __copy_instruction(u8 *dest, u8 *src, struct insn *insn) } /* Prepare reljump right after instruction to boost */ -static void prepare_boost(struct kprobe *p, struct insn *insn) +static int prepare_boost(kprobe_opcode_t *buf, struct kprobe *p, + struct insn *insn) { + int len = insn->length; + if (can_boost(insn, p->addr) && - MAX_INSN_SIZE - insn->length >= RELATIVEJUMP_SIZE) { + MAX_INSN_SIZE - len >= RELATIVEJUMP_SIZE) { /* * These instructions can be executed directly if it * jumps back to correct address. */ - synthesize_reljump(p->ainsn.insn + insn->length, + synthesize_reljump(buf + len, p->ainsn.insn + len, p->addr + insn->length); + len += RELATIVEJUMP_SIZE; p->ainsn.boostable = true; } else { p->ainsn.boostable = false; } + + return len; +} + +/* Make page to RO mode when allocate it */ +void *alloc_insn_page(void) +{ + void *page; + + page = module_alloc(PAGE_SIZE); + if (page) + set_memory_ro((unsigned long)page & PAGE_MASK, 1); + + return page; } /* Recover page to RW mode before releasing it */ @@ -429,12 +448,11 @@ void free_insn_page(void *page) static int arch_copy_kprobe(struct kprobe *p) { struct insn insn; + kprobe_opcode_t buf[MAX_INSN_SIZE]; int len; - set_memory_rw((unsigned long)p->ainsn.insn & PAGE_MASK, 1); - /* Copy an instruction with recovering if other optprobe modifies it.*/ - len = __copy_instruction(p->ainsn.insn, p->addr, &insn); + len = __copy_instruction(buf, p->addr, p->ainsn.insn, &insn); if (!len) return -EINVAL; @@ -442,15 +460,16 @@ static int arch_copy_kprobe(struct kprobe *p) * __copy_instruction can modify the displacement of the instruction, * but it doesn't affect boostable check. 
*/ - prepare_boost(p, &insn); - - set_memory_ro((unsigned long)p->ainsn.insn & PAGE_MASK, 1); + len = prepare_boost(buf, p, &insn); /* Check whether the instruction modifies Interrupt Flag or not */ - p->ainsn.if_modifier = is_IF_modifier(p->ainsn.insn); + p->ainsn.if_modifier = is_IF_modifier(buf); /* Also, displacement change doesn't affect the first byte */ - p->opcode = p->ainsn.insn[0]; + p->opcode = buf[0]; + + /* OK, write back the instruction(s) into ROX insn buffer */ + text_poke(p->ainsn.insn, buf, len); return 0; } diff --git a/arch/x86/kernel/kprobes/ftrace.c b/arch/x86/kernel/kprobes/ftrace.c index 041f7b6dfa0f..8dc0161cec8f 100644 --- a/arch/x86/kernel/kprobes/ftrace.c +++ b/arch/x86/kernel/kprobes/ftrace.c @@ -26,7 +26,7 @@ #include "common.h" static nokprobe_inline -int __skip_singlestep(struct kprobe *p, struct pt_regs *regs, +void __skip_singlestep(struct kprobe *p, struct pt_regs *regs, struct kprobe_ctlblk *kcb, unsigned long orig_ip) { /* @@ -41,33 +41,31 @@ int __skip_singlestep(struct kprobe *p, struct pt_regs *regs, __this_cpu_write(current_kprobe, NULL); if (orig_ip) regs->ip = orig_ip; - return 1; } int skip_singlestep(struct kprobe *p, struct pt_regs *regs, struct kprobe_ctlblk *kcb) { - if (kprobe_ftrace(p)) - return __skip_singlestep(p, regs, kcb, 0); - else - return 0; + if (kprobe_ftrace(p)) { + __skip_singlestep(p, regs, kcb, 0); + preempt_enable_no_resched(); + return 1; + } + return 0; } NOKPROBE_SYMBOL(skip_singlestep); -/* Ftrace callback handler for kprobes */ +/* Ftrace callback handler for kprobes -- called under preepmt disabed */ void kprobe_ftrace_handler(unsigned long ip, unsigned long parent_ip, struct ftrace_ops *ops, struct pt_regs *regs) { struct kprobe *p; struct kprobe_ctlblk *kcb; - unsigned long flags; - - /* Disable irq for emulating a breakpoint and avoiding preempt */ - local_irq_save(flags); + /* Preempt is disabled by ftrace */ p = get_kprobe((kprobe_opcode_t *)ip); if (unlikely(!p) || kprobe_disabled(p)) - goto end; + return; kcb = get_kprobe_ctlblk(); if (kprobe_running()) { @@ -77,17 +75,19 @@ void kprobe_ftrace_handler(unsigned long ip, unsigned long parent_ip, /* Kprobe handler expects regs->ip = ip + 1 as breakpoint hit */ regs->ip = ip + sizeof(kprobe_opcode_t); + /* To emulate trap based kprobes, preempt_disable here */ + preempt_disable(); __this_cpu_write(current_kprobe, p); kcb->kprobe_status = KPROBE_HIT_ACTIVE; - if (!p->pre_handler || !p->pre_handler(p, regs)) + if (!p->pre_handler || !p->pre_handler(p, regs)) { __skip_singlestep(p, regs, kcb, orig_ip); + preempt_enable_no_resched(); + } /* * If pre_handler returns !0, it sets regs->ip and - * resets current kprobe. + * resets current kprobe, and keep preempt count +1. 
*/ } -end: - local_irq_restore(flags); } NOKPROBE_SYMBOL(kprobe_ftrace_handler); diff --git a/arch/x86/kernel/kprobes/opt.c b/arch/x86/kernel/kprobes/opt.c index 4f98aad38237..203d398802a3 100644 --- a/arch/x86/kernel/kprobes/opt.c +++ b/arch/x86/kernel/kprobes/opt.c @@ -40,6 +40,7 @@ #include <asm/debugreg.h> #include <asm/set_memory.h> #include <asm/sections.h> +#include <asm/nospec-branch.h> #include "common.h" @@ -142,11 +143,11 @@ void optprobe_template_func(void); STACK_FRAME_NON_STANDARD(optprobe_template_func); #define TMPL_MOVE_IDX \ - ((long)&optprobe_template_val - (long)&optprobe_template_entry) + ((long)optprobe_template_val - (long)optprobe_template_entry) #define TMPL_CALL_IDX \ - ((long)&optprobe_template_call - (long)&optprobe_template_entry) + ((long)optprobe_template_call - (long)optprobe_template_entry) #define TMPL_END_IDX \ - ((long)&optprobe_template_end - (long)&optprobe_template_entry) + ((long)optprobe_template_end - (long)optprobe_template_entry) #define INT3_SIZE sizeof(kprobe_opcode_t) @@ -154,17 +155,15 @@ STACK_FRAME_NON_STANDARD(optprobe_template_func); static void optimized_callback(struct optimized_kprobe *op, struct pt_regs *regs) { - struct kprobe_ctlblk *kcb = get_kprobe_ctlblk(); - unsigned long flags; - /* This is possible if op is under delayed unoptimizing */ if (kprobe_disabled(&op->kp)) return; - local_irq_save(flags); + preempt_disable(); if (kprobe_running()) { kprobes_inc_nmissed_count(&op->kp); } else { + struct kprobe_ctlblk *kcb = get_kprobe_ctlblk(); /* Save skipped registers */ #ifdef CONFIG_X86_64 regs->cs = __KERNEL_CS; @@ -180,17 +179,17 @@ optimized_callback(struct optimized_kprobe *op, struct pt_regs *regs) opt_pre_handler(&op->kp, regs); __this_cpu_write(current_kprobe, NULL); } - local_irq_restore(flags); + preempt_enable_no_resched(); } NOKPROBE_SYMBOL(optimized_callback); -static int copy_optimized_instructions(u8 *dest, u8 *src) +static int copy_optimized_instructions(u8 *dest, u8 *src, u8 *real) { struct insn insn; int len = 0, ret; while (len < RELATIVEJUMP_SIZE) { - ret = __copy_instruction(dest + len, src + len, &insn); + ret = __copy_instruction(dest + len, src + len, real, &insn); if (!ret || !can_boost(&insn, src + len)) return -EINVAL; len += ret; @@ -205,7 +204,7 @@ static int copy_optimized_instructions(u8 *dest, u8 *src) } /* Check whether insn is indirect jump */ -static int insn_is_indirect_jump(struct insn *insn) +static int __insn_is_indirect_jump(struct insn *insn) { return ((insn->opcode.bytes[0] == 0xff && (X86_MODRM_REG(insn->modrm.value) & 6) == 4) || /* Jump */ @@ -239,6 +238,26 @@ static int insn_jump_into_range(struct insn *insn, unsigned long start, int len) return (start <= target && target <= start + len); } +static int insn_is_indirect_jump(struct insn *insn) +{ + int ret = __insn_is_indirect_jump(insn); + +#ifdef CONFIG_RETPOLINE + /* + * Jump to x86_indirect_thunk_* is treated as an indirect jump. + * Note that even with CONFIG_RETPOLINE=y, the kernel compiled with + * older gcc may use indirect jump. So we add this check instead of + * replace indirect-jump check. 
+ */ + if (!ret) + ret = insn_jump_into_range(insn, + (unsigned long)__indirect_thunk_start, + (unsigned long)__indirect_thunk_end - + (unsigned long)__indirect_thunk_start); +#endif + return ret; +} + /* Decode whole function to ensure any instructions don't jump into target */ static int can_optimize(unsigned long paddr) { @@ -343,57 +362,66 @@ void arch_remove_optimized_kprobe(struct optimized_kprobe *op) int arch_prepare_optimized_kprobe(struct optimized_kprobe *op, struct kprobe *__unused) { - u8 *buf; - int ret; + u8 *buf = NULL, *slot; + int ret, len; long rel; if (!can_optimize((unsigned long)op->kp.addr)) return -EILSEQ; - op->optinsn.insn = get_optinsn_slot(); - if (!op->optinsn.insn) + buf = kzalloc(MAX_OPTINSN_SIZE, GFP_KERNEL); + if (!buf) return -ENOMEM; + op->optinsn.insn = slot = get_optinsn_slot(); + if (!slot) { + ret = -ENOMEM; + goto out; + } + /* * Verify if the address gap is in 2GB range, because this uses * a relative jump. */ - rel = (long)op->optinsn.insn - (long)op->kp.addr + RELATIVEJUMP_SIZE; + rel = (long)slot - (long)op->kp.addr + RELATIVEJUMP_SIZE; if (abs(rel) > 0x7fffffff) { - __arch_remove_optimized_kprobe(op, 0); - return -ERANGE; + ret = -ERANGE; + goto err; } - buf = (u8 *)op->optinsn.insn; - set_memory_rw((unsigned long)buf & PAGE_MASK, 1); + /* Copy arch-dep-instance from template */ + memcpy(buf, optprobe_template_entry, TMPL_END_IDX); /* Copy instructions into the out-of-line buffer */ - ret = copy_optimized_instructions(buf + TMPL_END_IDX, op->kp.addr); - if (ret < 0) { - __arch_remove_optimized_kprobe(op, 0); - return ret; - } + ret = copy_optimized_instructions(buf + TMPL_END_IDX, op->kp.addr, + slot + TMPL_END_IDX); + if (ret < 0) + goto err; op->optinsn.size = ret; - - /* Copy arch-dep-instance from template */ - memcpy(buf, &optprobe_template_entry, TMPL_END_IDX); + len = TMPL_END_IDX + op->optinsn.size; /* Set probe information */ synthesize_set_arg1(buf + TMPL_MOVE_IDX, (unsigned long)op); /* Set probe function call */ - synthesize_relcall(buf + TMPL_CALL_IDX, optimized_callback); + synthesize_relcall(buf + TMPL_CALL_IDX, + slot + TMPL_CALL_IDX, optimized_callback); /* Set returning jmp instruction at the tail of out-of-line buffer */ - synthesize_reljump(buf + TMPL_END_IDX + op->optinsn.size, + synthesize_reljump(buf + len, slot + len, (u8 *)op->kp.addr + op->optinsn.size); - - set_memory_ro((unsigned long)buf & PAGE_MASK, 1); - - flush_icache_range((unsigned long) buf, - (unsigned long) buf + TMPL_END_IDX + - op->optinsn.size + RELATIVEJUMP_SIZE); - return 0; + len += RELATIVEJUMP_SIZE; + + /* We have to use text_poke for instuction buffer because it is RO */ + text_poke(slot, buf, len); + ret = 0; +out: + kfree(buf); + return ret; + +err: + __arch_remove_optimized_kprobe(op, 0); + goto out; } /* diff --git a/arch/x86/kernel/kvm.c b/arch/x86/kernel/kvm.c index 8bb9594d0761..bc1a27280c4b 100644 --- a/arch/x86/kernel/kvm.c +++ b/arch/x86/kernel/kvm.c @@ -49,7 +49,7 @@ static int kvmapf = 1; -static int parse_no_kvmapf(char *arg) +static int __init parse_no_kvmapf(char *arg) { kvmapf = 0; return 0; @@ -58,7 +58,7 @@ static int parse_no_kvmapf(char *arg) early_param("no-kvmapf", parse_no_kvmapf); static int steal_acc = 1; -static int parse_no_stealacc(char *arg) +static int __init parse_no_stealacc(char *arg) { steal_acc = 0; return 0; @@ -67,7 +67,7 @@ static int parse_no_stealacc(char *arg) early_param("no-steal-acc", parse_no_stealacc); static int kvmclock_vsyscall = 1; -static int parse_no_kvmclock_vsyscall(char *arg) +static int __init 
parse_no_kvmclock_vsyscall(char *arg) { kvmclock_vsyscall = 0; return 0; @@ -75,8 +75,8 @@ static int parse_no_kvmclock_vsyscall(char *arg) early_param("no-kvmclock-vsyscall", parse_no_kvmclock_vsyscall); -static DEFINE_PER_CPU(struct kvm_vcpu_pv_apf_data, apf_reason) __aligned(64); -static DEFINE_PER_CPU(struct kvm_steal_time, steal_time) __aligned(64); +static DEFINE_PER_CPU_DECRYPTED(struct kvm_vcpu_pv_apf_data, apf_reason) __aligned(64); +static DEFINE_PER_CPU_DECRYPTED(struct kvm_steal_time, steal_time) __aligned(64); static int has_steal_clock = 0; /* @@ -312,7 +312,7 @@ static void kvm_register_steal_time(void) cpu, (unsigned long long) slow_virt_to_phys(st)); } -static DEFINE_PER_CPU(unsigned long, kvm_apic_eoi) = KVM_PV_EOI_DISABLED; +static DEFINE_PER_CPU_DECRYPTED(unsigned long, kvm_apic_eoi) = KVM_PV_EOI_DISABLED; static notrace void kvm_guest_apic_eoi_write(u32 reg, u32 val) { @@ -341,10 +341,10 @@ static void kvm_guest_cpu_init(void) #endif pa |= KVM_ASYNC_PF_ENABLED; - /* Async page fault support for L1 hypervisor is optional */ - if (wrmsr_safe(MSR_KVM_ASYNC_PF_EN, - (pa | KVM_ASYNC_PF_DELIVERY_AS_PF_VMEXIT) & 0xffffffff, pa >> 32) < 0) - wrmsrl(MSR_KVM_ASYNC_PF_EN, pa); + if (kvm_para_has_feature(KVM_FEATURE_ASYNC_PF_VMEXIT)) + pa |= KVM_ASYNC_PF_DELIVERY_AS_PF_VMEXIT; + + wrmsrl(MSR_KVM_ASYNC_PF_EN, pa); __this_cpu_write(apf_reason.enabled, 1); printk(KERN_INFO"KVM setup async PF for cpu %d\n", smp_processor_id()); @@ -426,9 +426,42 @@ void kvm_disable_steal_time(void) wrmsr(MSR_KVM_STEAL_TIME, 0, 0); } +static inline void __set_percpu_decrypted(void *ptr, unsigned long size) +{ + early_set_memory_decrypted((unsigned long) ptr, size); +} + +/* + * Iterate through all possible CPUs and map the memory region pointed + * by apf_reason, steal_time and kvm_apic_eoi as decrypted at once. + * + * Note: we iterate through all possible CPUs to ensure that CPUs + * hotplugged will have their per-cpu variable already mapped as + * decrypted. + */ +static void __init sev_map_percpu_data(void) +{ + int cpu; + + if (!sev_active()) + return; + + for_each_possible_cpu(cpu) { + __set_percpu_decrypted(&per_cpu(apf_reason, cpu), sizeof(apf_reason)); + __set_percpu_decrypted(&per_cpu(steal_time, cpu), sizeof(steal_time)); + __set_percpu_decrypted(&per_cpu(kvm_apic_eoi, cpu), sizeof(kvm_apic_eoi)); + } +} + #ifdef CONFIG_SMP static void __init kvm_smp_prepare_boot_cpu(void) { + /* + * Map the per-cpu variables as decrypted before kvm_guest_cpu_init() + * shares the guest physical address with the hypervisor. + */ + sev_map_percpu_data(); + kvm_guest_cpu_init(); native_smp_prepare_boot_cpu(); kvm_spinlock_init(); @@ -465,7 +498,35 @@ static void __init kvm_apf_trap_init(void) update_intr_gate(X86_TRAP_PF, async_page_fault); } -void __init kvm_guest_init(void) +static DEFINE_PER_CPU(cpumask_var_t, __pv_tlb_mask); + +static void kvm_flush_tlb_others(const struct cpumask *cpumask, + const struct flush_tlb_info *info) +{ + u8 state; + int cpu; + struct kvm_steal_time *src; + struct cpumask *flushmask = this_cpu_cpumask_var_ptr(__pv_tlb_mask); + + cpumask_copy(flushmask, cpumask); + /* + * We have to call flush only on online vCPUs. 
And + * queue flush_on_enter for pre-empted vCPUs + */ + for_each_cpu(cpu, flushmask) { + src = &per_cpu(steal_time, cpu); + state = READ_ONCE(src->preempted); + if ((state & KVM_VCPU_PREEMPTED)) { + if (try_cmpxchg(&src->preempted, &state, + state | KVM_VCPU_FLUSH_TLB)) + __cpumask_clear_cpu(cpu, flushmask); + } + } + + native_flush_tlb_others(flushmask, info); +} + +static void __init kvm_guest_init(void) { int i; @@ -484,6 +545,10 @@ void __init kvm_guest_init(void) pv_time_ops.steal_clock = kvm_steal_clock; } + if (kvm_para_has_feature(KVM_FEATURE_PV_TLB_FLUSH) && + !kvm_para_has_feature(KVM_FEATURE_STEAL_TIME)) + pv_mmu_ops.flush_tlb_others = kvm_flush_tlb_others; + if (kvm_para_has_feature(KVM_FEATURE_PV_EOI)) apic_set_eoi_write(kvm_guest_apic_eoi_write); @@ -496,6 +561,7 @@ void __init kvm_guest_init(void) kvm_cpu_online, kvm_cpu_down_prepare) < 0) pr_err("kvm_guest: Failed to install cpu hotplug callbacks\n"); #else + sev_map_percpu_data(); kvm_guest_cpu_init(); #endif @@ -544,12 +610,13 @@ static uint32_t __init kvm_detect(void) return kvm_cpuid_base(); } -const struct hypervisor_x86 x86_hyper_kvm __refconst = { +const __initconst struct hypervisor_x86 x86_hyper_kvm = { .name = "KVM", .detect = kvm_detect, - .x2apic_available = kvm_para_available, + .type = X86_HYPER_KVM, + .init.guest_late_init = kvm_guest_init, + .init.x2apic_available = kvm_para_available, }; -EXPORT_SYMBOL_GPL(x86_hyper_kvm); static __init int activate_jump_labels(void) { @@ -563,6 +630,23 @@ static __init int activate_jump_labels(void) } arch_initcall(activate_jump_labels); +static __init int kvm_setup_pv_tlb_flush(void) +{ + int cpu; + + if (kvm_para_has_feature(KVM_FEATURE_PV_TLB_FLUSH) && + !kvm_para_has_feature(KVM_FEATURE_STEAL_TIME)) { + for_each_possible_cpu(cpu) { + zalloc_cpumask_var_node(per_cpu_ptr(&__pv_tlb_mask, cpu), + GFP_KERNEL, cpu_to_node(cpu)); + } + pr_info("KVM setup pv remote TLB flush\n"); + } + + return 0; +} +arch_initcall(kvm_setup_pv_tlb_flush); + #ifdef CONFIG_PARAVIRT_SPINLOCKS /* Kick a cpu by its apicid. Used to wake up a halted vcpu */ @@ -608,7 +692,7 @@ __visible bool __kvm_vcpu_is_preempted(long cpu) { struct kvm_steal_time *src = &per_cpu(steal_time, cpu); - return !!src->preempted; + return !!(src->preempted & KVM_VCPU_PREEMPTED); } PV_CALLEE_SAVE_REGS_THUNK(__kvm_vcpu_is_preempted); diff --git a/arch/x86/kernel/kvmclock.c b/arch/x86/kernel/kvmclock.c index 5b609e28ce3f..8b26c9e01cc4 100644 --- a/arch/x86/kernel/kvmclock.c +++ b/arch/x86/kernel/kvmclock.c @@ -27,6 +27,7 @@ #include <linux/sched.h> #include <linux/sched/clock.h> +#include <asm/mem_encrypt.h> #include <asm/x86_init.h> #include <asm/reboot.h> #include <asm/kvmclock.h> @@ -45,13 +46,7 @@ early_param("no-kvmclock", parse_no_kvmclock); /* The hypervisor will put information about time periodically here */ static struct pvclock_vsyscall_time_info *hv_clock; -static struct pvclock_wall_clock wall_clock; - -struct pvclock_vsyscall_time_info *pvclock_pvti_cpu0_va(void) -{ - return hv_clock; -} -EXPORT_SYMBOL_GPL(pvclock_pvti_cpu0_va); +static struct pvclock_wall_clock *wall_clock; /* * The wallclock is the time of day when we booted. 
Since then, some time may @@ -64,15 +59,15 @@ static void kvm_get_wallclock(struct timespec *now) int low, high; int cpu; - low = (int)__pa_symbol(&wall_clock); - high = ((u64)__pa_symbol(&wall_clock) >> 32); + low = (int)slow_virt_to_phys(wall_clock); + high = ((u64)slow_virt_to_phys(wall_clock) >> 32); native_write_msr(msr_kvm_wall_clock, low, high); cpu = get_cpu(); vcpu_time = &hv_clock[cpu].pvti; - pvclock_read_wallclock(&wall_clock, vcpu_time, now); + pvclock_read_wallclock(wall_clock, vcpu_time, now); put_cpu(); } @@ -249,11 +244,39 @@ static void kvm_shutdown(void) native_machine_shutdown(); } +static phys_addr_t __init kvm_memblock_alloc(phys_addr_t size, + phys_addr_t align) +{ + phys_addr_t mem; + + mem = memblock_alloc(size, align); + if (!mem) + return 0; + + if (sev_active()) { + if (early_set_memory_decrypted((unsigned long)__va(mem), size)) + goto e_free; + } + + return mem; +e_free: + memblock_free(mem, size); + return 0; +} + +static void __init kvm_memblock_free(phys_addr_t addr, phys_addr_t size) +{ + if (sev_active()) + early_set_memory_encrypted((unsigned long)__va(addr), size); + + memblock_free(addr, size); +} + void __init kvmclock_init(void) { struct pvclock_vcpu_time_info *vcpu_time; - unsigned long mem; - int size, cpu; + unsigned long mem, mem_wall_clock; + int size, cpu, wall_clock_size; u8 flags; size = PAGE_ALIGN(sizeof(struct pvclock_vsyscall_time_info)*NR_CPUS); @@ -267,21 +290,35 @@ void __init kvmclock_init(void) } else if (!(kvmclock && kvm_para_has_feature(KVM_FEATURE_CLOCKSOURCE))) return; - printk(KERN_INFO "kvm-clock: Using msrs %x and %x", - msr_kvm_system_time, msr_kvm_wall_clock); + wall_clock_size = PAGE_ALIGN(sizeof(struct pvclock_wall_clock)); + mem_wall_clock = kvm_memblock_alloc(wall_clock_size, PAGE_SIZE); + if (!mem_wall_clock) + return; - mem = memblock_alloc(size, PAGE_SIZE); - if (!mem) + wall_clock = __va(mem_wall_clock); + memset(wall_clock, 0, wall_clock_size); + + mem = kvm_memblock_alloc(size, PAGE_SIZE); + if (!mem) { + kvm_memblock_free(mem_wall_clock, wall_clock_size); + wall_clock = NULL; return; + } + hv_clock = __va(mem); memset(hv_clock, 0, size); if (kvm_register_clock("primary cpu clock")) { hv_clock = NULL; - memblock_free(mem, size); + kvm_memblock_free(mem, size); + kvm_memblock_free(mem_wall_clock, wall_clock_size); + wall_clock = NULL; return; } + printk(KERN_INFO "kvm-clock: Using msrs %x and %x", + msr_kvm_system_time, msr_kvm_wall_clock); + if (kvm_para_has_feature(KVM_FEATURE_CLOCKSOURCE_STABLE_BIT)) pvclock_set_flags(PVCLOCK_TSC_STABLE_BIT); @@ -334,6 +371,7 @@ int __init kvm_setup_vsyscall_timeinfo(void) return 1; } + pvclock_set_pvti_cpu0_va(hv_clock); put_cpu(); kvm_clock.archdata.vclock_mode = VCLOCK_PVCLOCK; diff --git a/arch/x86/kernel/ldt.c b/arch/x86/kernel/ldt.c index 4d17bacf4030..26d713ecad34 100644 --- a/arch/x86/kernel/ldt.c +++ b/arch/x86/kernel/ldt.c @@ -5,6 +5,11 @@ * Copyright (C) 2002 Andi Kleen * * This handles calls from both 32bit and 64bit mode. + * + * Lock order: + * contex.ldt_usr_sem + * mmap_sem + * context.lock */ #include <linux/errno.h> @@ -13,11 +18,13 @@ #include <linux/string.h> #include <linux/mm.h> #include <linux/smp.h> +#include <linux/syscalls.h> #include <linux/slab.h> #include <linux/vmalloc.h> #include <linux/uaccess.h> #include <asm/ldt.h> +#include <asm/tlb.h> #include <asm/desc.h> #include <asm/mmu_context.h> #include <asm/syscalls.h> @@ -41,17 +48,15 @@ static void refresh_ldt_segments(void) #endif } -/* context.lock is held for us, so we don't need any locking. 
*/ +/* context.lock is held by the task which issued the smp function call */ static void flush_ldt(void *__mm) { struct mm_struct *mm = __mm; - mm_context_t *pc; if (this_cpu_read(cpu_tlbstate.loaded_mm) != mm) return; - pc = &mm->context; - set_ldt(pc->ldt->entries, pc->ldt->nr_entries); + load_mm_ldt(mm); refresh_ldt_segments(); } @@ -88,25 +93,143 @@ static struct ldt_struct *alloc_ldt_struct(unsigned int num_entries) return NULL; } + /* The new LDT isn't aliased for PTI yet. */ + new_ldt->slot = -1; + new_ldt->nr_entries = num_entries; return new_ldt; } +/* + * If PTI is enabled, this maps the LDT into the kernelmode and + * usermode tables for the given mm. + * + * There is no corresponding unmap function. Even if the LDT is freed, we + * leave the PTEs around until the slot is reused or the mm is destroyed. + * This is harmless: the LDT is always in ordinary memory, and no one will + * access the freed slot. + * + * If we wanted to unmap freed LDTs, we'd also need to do a flush to make + * it useful, and the flush would slow down modify_ldt(). + */ +static int +map_ldt_struct(struct mm_struct *mm, struct ldt_struct *ldt, int slot) +{ +#ifdef CONFIG_PAGE_TABLE_ISOLATION + bool is_vmalloc, had_top_level_entry; + unsigned long va; + spinlock_t *ptl; + pgd_t *pgd; + int i; + + if (!static_cpu_has(X86_FEATURE_PTI)) + return 0; + + /* + * Any given ldt_struct should have map_ldt_struct() called at most + * once. + */ + WARN_ON(ldt->slot != -1); + + /* + * Did we already have the top level entry allocated? We can't + * use pgd_none() for this because it doens't do anything on + * 4-level page table kernels. + */ + pgd = pgd_offset(mm, LDT_BASE_ADDR); + had_top_level_entry = (pgd->pgd != 0); + + is_vmalloc = is_vmalloc_addr(ldt->entries); + + for (i = 0; i * PAGE_SIZE < ldt->nr_entries * LDT_ENTRY_SIZE; i++) { + unsigned long offset = i << PAGE_SHIFT; + const void *src = (char *)ldt->entries + offset; + unsigned long pfn; + pte_t pte, *ptep; + + va = (unsigned long)ldt_slot_va(slot) + offset; + pfn = is_vmalloc ? vmalloc_to_pfn(src) : + page_to_pfn(virt_to_page(src)); + /* + * Treat the PTI LDT range as a *userspace* range. + * get_locked_pte() will allocate all needed pagetables + * and account for them in this mm. + */ + ptep = get_locked_pte(mm, va, &ptl); + if (!ptep) + return -ENOMEM; + /* + * Map it RO so the easy to find address is not a primary + * target via some kernel interface which misses a + * permission check. + */ + pte = pfn_pte(pfn, __pgprot(__PAGE_KERNEL_RO & ~_PAGE_GLOBAL)); + set_pte_at(mm, va, ptep, pte); + pte_unmap_unlock(ptep, ptl); + } + + if (mm->context.ldt) { + /* + * We already had an LDT. The top-level entry should already + * have been allocated and synchronized with the usermode + * tables. + */ + WARN_ON(!had_top_level_entry); + if (static_cpu_has(X86_FEATURE_PTI)) + WARN_ON(!kernel_to_user_pgdp(pgd)->pgd); + } else { + /* + * This is the first time we're mapping an LDT for this process. + * Sync the pgd to the usermode tables. 
+ */ + WARN_ON(had_top_level_entry); + if (static_cpu_has(X86_FEATURE_PTI)) { + WARN_ON(kernel_to_user_pgdp(pgd)->pgd); + set_pgd(kernel_to_user_pgdp(pgd), *pgd); + } + } + + va = (unsigned long)ldt_slot_va(slot); + flush_tlb_mm_range(mm, va, va + LDT_SLOT_STRIDE, 0); + + ldt->slot = slot; +#endif + return 0; +} + +static void free_ldt_pgtables(struct mm_struct *mm) +{ +#ifdef CONFIG_PAGE_TABLE_ISOLATION + struct mmu_gather tlb; + unsigned long start = LDT_BASE_ADDR; + unsigned long end = start + (1UL << PGDIR_SHIFT); + + if (!static_cpu_has(X86_FEATURE_PTI)) + return; + + tlb_gather_mmu(&tlb, mm, start, end); + free_pgd_range(&tlb, start, end, start, end); + tlb_finish_mmu(&tlb, start, end); +#endif +} + /* After calling this, the LDT is immutable. */ static void finalize_ldt_struct(struct ldt_struct *ldt) { paravirt_alloc_ldt(ldt->entries, ldt->nr_entries); } -/* context.lock is held */ -static void install_ldt(struct mm_struct *current_mm, - struct ldt_struct *ldt) +static void install_ldt(struct mm_struct *mm, struct ldt_struct *ldt) { - /* Synchronizes with lockless_dereference in load_mm_ldt. */ - smp_store_release(¤t_mm->context.ldt, ldt); + mutex_lock(&mm->context.lock); + + /* Synchronizes with READ_ONCE in load_mm_ldt. */ + smp_store_release(&mm->context.ldt, ldt); + + /* Activate the LDT for all CPUs using currents mm. */ + on_each_cpu_mask(mm_cpumask(mm), flush_ldt, mm, true); - /* Activate the LDT for all CPUs using current_mm. */ - on_each_cpu_mask(mm_cpumask(current_mm), flush_ldt, current_mm, true); + mutex_unlock(&mm->context.lock); } static void free_ldt_struct(struct ldt_struct *ldt) @@ -123,27 +246,20 @@ static void free_ldt_struct(struct ldt_struct *ldt) } /* - * we do not have to muck with descriptors here, that is - * done in switch_mm() as needed. + * Called on fork from arch_dup_mmap(). Just copy the current LDT state, + * the new task is not running, so nothing can be installed. 
*/ -int init_new_context_ldt(struct task_struct *tsk, struct mm_struct *mm) +int ldt_dup_context(struct mm_struct *old_mm, struct mm_struct *mm) { struct ldt_struct *new_ldt; - struct mm_struct *old_mm; int retval = 0; - mutex_init(&mm->context.lock); - old_mm = current->mm; - if (!old_mm) { - mm->context.ldt = NULL; + if (!old_mm) return 0; - } mutex_lock(&old_mm->context.lock); - if (!old_mm->context.ldt) { - mm->context.ldt = NULL; + if (!old_mm->context.ldt) goto out_unlock; - } new_ldt = alloc_ldt_struct(old_mm->context.ldt->nr_entries); if (!new_ldt) { @@ -155,6 +271,12 @@ int init_new_context_ldt(struct task_struct *tsk, struct mm_struct *mm) new_ldt->nr_entries * LDT_ENTRY_SIZE); finalize_ldt_struct(new_ldt); + retval = map_ldt_struct(mm, new_ldt, 0); + if (retval) { + free_ldt_pgtables(mm); + free_ldt_struct(new_ldt); + goto out_unlock; + } mm->context.ldt = new_ldt; out_unlock: @@ -173,13 +295,18 @@ void destroy_context_ldt(struct mm_struct *mm) mm->context.ldt = NULL; } +void ldt_arch_exit_mmap(struct mm_struct *mm) +{ + free_ldt_pgtables(mm); +} + static int read_ldt(void __user *ptr, unsigned long bytecount) { struct mm_struct *mm = current->mm; unsigned long entries_size; int retval; - mutex_lock(&mm->context.lock); + down_read(&mm->context.ldt_usr_sem); if (!mm->context.ldt) { retval = 0; @@ -208,7 +335,7 @@ static int read_ldt(void __user *ptr, unsigned long bytecount) retval = bytecount; out_unlock: - mutex_unlock(&mm->context.lock); + up_read(&mm->context.ldt_usr_sem); return retval; } @@ -268,7 +395,8 @@ static int write_ldt(void __user *ptr, unsigned long bytecount, int oldmode) ldt.avl = 0; } - mutex_lock(&mm->context.lock); + if (down_write_killable(&mm->context.ldt_usr_sem)) + return -EINTR; old_ldt = mm->context.ldt; old_nr_entries = old_ldt ? old_ldt->nr_entries : 0; @@ -285,18 +413,37 @@ static int write_ldt(void __user *ptr, unsigned long bytecount, int oldmode) new_ldt->entries[ldt_info.entry_number] = ldt; finalize_ldt_struct(new_ldt); + /* + * If we are using PTI, map the new LDT into the userspace pagetables. + * If there is already an LDT, use the other slot so that other CPUs + * will continue to use the old LDT until install_ldt() switches + * them over to the new LDT. + */ + error = map_ldt_struct(mm, new_ldt, old_ldt ? !old_ldt->slot : 0); + if (error) { + /* + * This only can fail for the first LDT setup. If an LDT is + * already installed then the PTE page is already + * populated. Mop up a half populated page table. + */ + if (!WARN_ON_ONCE(old_ldt)) + free_ldt_pgtables(mm); + free_ldt_struct(new_ldt); + goto out_unlock; + } + install_ldt(mm, new_ldt); free_ldt_struct(old_ldt); error = 0; out_unlock: - mutex_unlock(&mm->context.lock); + up_write(&mm->context.ldt_usr_sem); out: return error; } -asmlinkage int sys_modify_ldt(int func, void __user *ptr, - unsigned long bytecount) +SYSCALL_DEFINE3(modify_ldt, int , func , void __user * , ptr , + unsigned long , bytecount) { int ret = -ENOSYS; @@ -314,5 +461,14 @@ asmlinkage int sys_modify_ldt(int func, void __user *ptr, ret = write_ldt(ptr, bytecount, 0); break; } - return ret; + /* + * The SYSCALL_DEFINE() macros give us an 'unsigned long' + * return type, but tht ABI for sys_modify_ldt() expects + * 'int'. This cast gives us an int-sized value in %rax + * for the return code. The 'unsigned' is necessary so + * the compiler does not try to sign-extend the negative + * return codes into the high half of the register when + * taking the value from int->long. 
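/*
 * For context, a minimal userspace caller of modify_ldt() (illustrative only,
 * not part of this patch; assumes glibc's syscall(2) wrapper and the uapi
 * <asm/ldt.h> header). There is no libc wrapper for this syscall, so callers
 * reach it through syscall(2) and treat the result as an int, which is the
 * motivation for the (unsigned int) cast on the return value.
 */
#include <asm/ldt.h>
#include <string.h>
#include <sys/syscall.h>
#include <unistd.h>

static int install_flat_ldt_entry(void)
{
	struct user_desc desc;

	memset(&desc, 0, sizeof(desc));
	desc.entry_number   = 0;	/* first LDT slot */
	desc.base_addr      = 0;
	desc.limit          = 0xfffff;
	desc.seg_32bit      = 1;
	desc.limit_in_pages = 1;

	/* func 1 == oldmode write_ldt; 0 on success, -1/errno on failure */
	return (int)syscall(SYS_modify_ldt, 1, &desc, sizeof(desc));
}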
+ */ + return (unsigned int)ret; } diff --git a/arch/x86/kernel/machine_kexec_32.c b/arch/x86/kernel/machine_kexec_32.c index 00bc751c861c..edfede768688 100644 --- a/arch/x86/kernel/machine_kexec_32.c +++ b/arch/x86/kernel/machine_kexec_32.c @@ -48,8 +48,6 @@ static void load_segments(void) "\tmovl $"STR(__KERNEL_DS)",%%eax\n" "\tmovl %%eax,%%ds\n" "\tmovl %%eax,%%es\n" - "\tmovl %%eax,%%fs\n" - "\tmovl %%eax,%%gs\n" "\tmovl %%eax,%%ss\n" : : : "eax", "memory"); #undef STR @@ -232,8 +230,8 @@ void machine_kexec(struct kimage *image) * The gdt & idt are now invalid. * If you want to load them you must set up your own idt & gdt. */ - set_gdt(phys_to_virt(0), 0); idt_invalidate(phys_to_virt(0)); + set_gdt(phys_to_virt(0), 0); /* now call it */ image->start = relocate_kernel_ptr((unsigned long)image->head, diff --git a/arch/x86/kernel/machine_kexec_64.c b/arch/x86/kernel/machine_kexec_64.c index 1f790cf9d38f..3b7427aa7d85 100644 --- a/arch/x86/kernel/machine_kexec_64.c +++ b/arch/x86/kernel/machine_kexec_64.c @@ -542,6 +542,7 @@ int arch_kexec_apply_relocations_add(const Elf64_Ehdr *ehdr, goto overflow; break; case R_X86_64_PC32: + case R_X86_64_PLT32: value -= (u64)address; *(u32 *)location = value; break; diff --git a/arch/x86/kernel/module.c b/arch/x86/kernel/module.c index da0c160e5589..f58336af095c 100644 --- a/arch/x86/kernel/module.c +++ b/arch/x86/kernel/module.c @@ -191,6 +191,7 @@ int apply_relocate_add(Elf64_Shdr *sechdrs, goto overflow; break; case R_X86_64_PC32: + case R_X86_64_PLT32: if (*(u32 *)loc != 0) goto invalid_relocation; val -= (u64)loc; diff --git a/arch/x86/kernel/mpparse.c b/arch/x86/kernel/mpparse.c index 410c5dadcee3..f1c5eb99d445 100644 --- a/arch/x86/kernel/mpparse.c +++ b/arch/x86/kernel/mpparse.c @@ -281,7 +281,7 @@ static void __init construct_default_ioirq_mptable(int mpc_default_type) int ELCR_fallback = 0; intsrc.type = MP_INTSRC; - intsrc.irqflag = 0; /* conforming */ + intsrc.irqflag = MP_IRQTRIG_DEFAULT | MP_IRQPOL_DEFAULT; intsrc.srcbus = 0; intsrc.dstapic = mpc_ioapic_id(0); @@ -324,10 +324,13 @@ static void __init construct_default_ioirq_mptable(int mpc_default_type) * copy that information over to the MP table in the * irqflag field (level sensitive, active high polarity). */ - if (ELCR_trigger(i)) - intsrc.irqflag = 13; - else - intsrc.irqflag = 0; + if (ELCR_trigger(i)) { + intsrc.irqflag = MP_IRQTRIG_LEVEL | + MP_IRQPOL_ACTIVE_HIGH; + } else { + intsrc.irqflag = MP_IRQTRIG_DEFAULT | + MP_IRQPOL_DEFAULT; + } } intsrc.srcbusirq = i; @@ -407,7 +410,7 @@ static inline void __init construct_default_ISA_mptable(int mpc_default_type) processor.apicver = mpc_default_type > 4 ? 
0x10 : 0x01; processor.cpuflag = CPU_ENABLED; processor.cpufeature = (boot_cpu_data.x86 << 8) | - (boot_cpu_data.x86_model << 4) | boot_cpu_data.x86_mask; + (boot_cpu_data.x86_model << 4) | boot_cpu_data.x86_stepping; processor.featureflag = boot_cpu_data.x86_capability[CPUID_1_EDX]; processor.reserved[0] = 0; processor.reserved[1] = 0; @@ -419,7 +422,7 @@ static inline void __init construct_default_ISA_mptable(int mpc_default_type) construct_ioapic_table(mpc_default_type); lintsrc.type = MP_LINTSRC; - lintsrc.irqflag = 0; /* conforming */ + lintsrc.irqflag = MP_IRQTRIG_DEFAULT | MP_IRQPOL_DEFAULT; lintsrc.srcbusid = 0; lintsrc.srcbusirq = 0; lintsrc.destapic = MP_APIC_ALL; @@ -431,6 +434,7 @@ static inline void __init construct_default_ISA_mptable(int mpc_default_type) } static unsigned long mpf_base; +static bool mpf_found; static unsigned long __init get_mpc_size(unsigned long physptr) { @@ -504,7 +508,7 @@ void __init default_get_smp_config(unsigned int early) if (!smp_found_config) return; - if (!mpf_base) + if (!mpf_found) return; if (acpi_lapic && early) @@ -593,6 +597,7 @@ static int __init smp_scan_config(unsigned long base, unsigned long length) smp_found_config = 1; #endif mpf_base = base; + mpf_found = true; pr_info("found SMP MP-table at [mem %#010lx-%#010lx] mapped at [%p]\n", base, base + sizeof(*mpf) - 1, mpf); @@ -662,7 +667,7 @@ static int __init get_MP_intsrc_index(struct mpc_intsrc *m) if (m->irqtype != mp_INT) return 0; - if (m->irqflag != 0x0f) + if (m->irqflag != (MP_IRQTRIG_LEVEL | MP_IRQPOL_ACTIVE_LOW)) return 0; /* not legacy */ @@ -671,7 +676,8 @@ static int __init get_MP_intsrc_index(struct mpc_intsrc *m) if (mp_irqs[i].irqtype != mp_INT) continue; - if (mp_irqs[i].irqflag != 0x0f) + if (mp_irqs[i].irqflag != (MP_IRQTRIG_LEVEL | + MP_IRQPOL_ACTIVE_LOW)) continue; if (mp_irqs[i].srcbus != m->srcbus) @@ -782,7 +788,8 @@ static int __init replace_intsrc_all(struct mpc_table *mpc, if (mp_irqs[i].irqtype != mp_INT) continue; - if (mp_irqs[i].irqflag != 0x0f) + if (mp_irqs[i].irqflag != (MP_IRQTRIG_LEVEL | + MP_IRQPOL_ACTIVE_LOW)) continue; if (nr_m_spare > 0) { @@ -858,7 +865,7 @@ static int __init update_mp_table(void) if (!enable_update_mptable) return 0; - if (!mpf_base) + if (!mpf_found) return 0; mpf = early_memremap(mpf_base, sizeof(*mpf)); diff --git a/arch/x86/kernel/nmi.c b/arch/x86/kernel/nmi.c index 35aafc95e4b8..18bc9b51ac9b 100644 --- a/arch/x86/kernel/nmi.c +++ b/arch/x86/kernel/nmi.c @@ -105,7 +105,7 @@ static void nmi_max_handler(struct irq_work *w) { struct nmiaction *a = container_of(w, struct nmiaction, irq_work); int remainder_ns, decimal_msecs; - u64 whole_msecs = ACCESS_ONCE(a->max_duration); + u64 whole_msecs = READ_ONCE(a->max_duration); remainder_ns = do_div(whole_msecs, (1000 * 1000)); decimal_msecs = remainder_ns / 1000; diff --git a/arch/x86/kernel/paravirt.c b/arch/x86/kernel/paravirt.c index 19a3e8f961c7..99dc79e76bdc 100644 --- a/arch/x86/kernel/paravirt.c +++ b/arch/x86/kernel/paravirt.c @@ -115,8 +115,18 @@ unsigned paravirt_patch_jmp(void *insnbuf, const void *target, return 5; } -/* Neat trick to map patch type back to the call within the - * corresponding structure. */ +DEFINE_STATIC_KEY_TRUE(virt_spin_lock_key); + +void __init native_pv_lock_init(void) +{ + if (!static_cpu_has(X86_FEATURE_HYPERVISOR)) + static_branch_disable(&virt_spin_lock_key); +} + +/* + * Neat trick to map patch type back to the call within the + * corresponding structure. 
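/*
 * A minimal sketch of the consumer side of virt_spin_lock_key (modelled on
 * virt_spin_lock() in <asm/qspinlock.h>; simplified, the exact fallback body
 * is assumed). native_pv_lock_init() disables the key on bare metal, so the
 * hypervisor-friendly test-and-set path below is patched out entirely there.
 */
static inline bool virt_spin_lock_sketch(struct qspinlock *lock)
{
	if (!static_branch_likely(&virt_spin_lock_key))
		return false;	/* native: take the queued slowpath instead */

	/*
	 * Under a hypervisor, spin on a plain test-and-set lock to avoid
	 * lock-waiter preemption problems in the queued slowpath.
	 */
	do {
		while (atomic_read(&lock->val) != 0)
			cpu_relax();
	} while (atomic_cmpxchg(&lock->val, 0, _Q_LOCKED_VAL) != 0);

	return true;
}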
+ */ static void *get_call_destination(u8 type) { struct paravirt_patch_template tmpl = { @@ -190,9 +200,9 @@ static void native_flush_tlb_global(void) __native_flush_tlb_global(); } -static void native_flush_tlb_single(unsigned long addr) +static void native_flush_tlb_one_user(unsigned long addr) { - __native_flush_tlb_single(addr); + __native_flush_tlb_one_user(addr); } struct static_key paravirt_steal_enabled; @@ -391,7 +401,7 @@ struct pv_mmu_ops pv_mmu_ops __ro_after_init = { .flush_tlb_user = native_flush_tlb, .flush_tlb_kernel = native_flush_tlb_global, - .flush_tlb_single = native_flush_tlb_single, + .flush_tlb_one_user = native_flush_tlb_one_user, .flush_tlb_others = native_flush_tlb_others, .pgd_alloc = __paravirt_pgd_alloc, diff --git a/arch/x86/kernel/paravirt_patch_64.c b/arch/x86/kernel/paravirt_patch_64.c index ac0be8283325..9edadabf04f6 100644 --- a/arch/x86/kernel/paravirt_patch_64.c +++ b/arch/x86/kernel/paravirt_patch_64.c @@ -10,7 +10,6 @@ DEF_NATIVE(pv_irq_ops, save_fl, "pushfq; popq %rax"); DEF_NATIVE(pv_mmu_ops, read_cr2, "movq %cr2, %rax"); DEF_NATIVE(pv_mmu_ops, read_cr3, "movq %cr3, %rax"); DEF_NATIVE(pv_mmu_ops, write_cr3, "movq %rdi, %cr3"); -DEF_NATIVE(pv_mmu_ops, flush_tlb_single, "invlpg (%rdi)"); DEF_NATIVE(pv_cpu_ops, wbinvd, "wbinvd"); DEF_NATIVE(pv_cpu_ops, usergs_sysret64, "swapgs; sysretq"); @@ -60,7 +59,6 @@ unsigned native_patch(u8 type, u16 clobbers, void *ibuf, PATCH_SITE(pv_mmu_ops, read_cr2); PATCH_SITE(pv_mmu_ops, read_cr3); PATCH_SITE(pv_mmu_ops, write_cr3); - PATCH_SITE(pv_mmu_ops, flush_tlb_single); PATCH_SITE(pv_cpu_ops, wbinvd); #if defined(CONFIG_PARAVIRT_SPINLOCKS) case PARAVIRT_PATCH(pv_lock_ops.queued_spin_unlock): diff --git a/arch/x86/kernel/pci-calgary_64.c b/arch/x86/kernel/pci-calgary_64.c index 5286a4a92cf7..35c461f21815 100644 --- a/arch/x86/kernel/pci-calgary_64.c +++ b/arch/x86/kernel/pci-calgary_64.c @@ -898,10 +898,9 @@ static void calioc2_dump_error_regs(struct iommu_table *tbl) PHB_ROOT_COMPLEX_STATUS); } -static void calgary_watchdog(unsigned long data) +static void calgary_watchdog(struct timer_list *t) { - struct pci_dev *dev = (struct pci_dev *)data; - struct iommu_table *tbl = pci_iommu(dev->bus); + struct iommu_table *tbl = from_timer(tbl, t, watchdog_timer); void __iomem *bbar = tbl->bbar; u32 val32; void __iomem *target; @@ -1016,8 +1015,7 @@ static void __init calgary_enable_translation(struct pci_dev *dev) writel(cpu_to_be32(val32), target); readl(target); /* flush */ - setup_timer(&tbl->watchdog_timer, &calgary_watchdog, - (unsigned long)dev); + timer_setup(&tbl->watchdog_timer, calgary_watchdog, 0); mod_timer(&tbl->watchdog_timer, jiffies); } diff --git a/arch/x86/kernel/pci-dma.c b/arch/x86/kernel/pci-dma.c index 599d7462eccc..df7ab02f959f 100644 --- a/arch/x86/kernel/pci-dma.c +++ b/arch/x86/kernel/pci-dma.c @@ -1,5 +1,5 @@ // SPDX-License-Identifier: GPL-2.0 -#include <linux/dma-mapping.h> +#include <linux/dma-direct.h> #include <linux/dma-debug.h> #include <linux/dmar.h> #include <linux/export.h> @@ -87,7 +87,6 @@ void *dma_generic_alloc_coherent(struct device *dev, size_t size, dma_mask = dma_alloc_coherent_mask(dev, flag); - flag &= ~__GFP_ZERO; again: page = NULL; /* CMA can be used only in the context which permits sleeping */ @@ -139,7 +138,6 @@ bool arch_dma_alloc_attrs(struct device **dev, gfp_t *gfp) if (!*dev) *dev = &x86_dma_fallback_dev; - *gfp &= ~(__GFP_DMA | __GFP_HIGHMEM | __GFP_DMA32); *gfp = dma_alloc_coherent_gfp_flags(*dev, *gfp); if (!is_device_dma_capable(*dev)) @@ -217,7 +215,7 @@ static 
__init int iommu_setup(char *p) } early_param("iommu", iommu_setup); -int x86_dma_supported(struct device *dev, u64 mask) +int arch_dma_supported(struct device *dev, u64 mask) { #ifdef CONFIG_PCI if (mask > 0xffffffff && forbid_dac > 0) { @@ -226,12 +224,6 @@ int x86_dma_supported(struct device *dev, u64 mask) } #endif - /* Copied from i386. Doesn't make much sense, because it will - only work for pci_alloc_coherent. - The caller just has to use GFP_DMA in this case. */ - if (mask < DMA_BIT_MASK(24)) - return 0; - /* Tell the device to use SAC when IOMMU force is on. This allows the driver to use cheaper accesses in some cases. @@ -251,6 +243,17 @@ int x86_dma_supported(struct device *dev, u64 mask) return 1; } +EXPORT_SYMBOL(arch_dma_supported); + +int x86_dma_supported(struct device *dev, u64 mask) +{ + /* Copied from i386. Doesn't make much sense, because it will + only work for pci_alloc_coherent. + The caller just has to use GFP_DMA in this case. */ + if (mask < DMA_BIT_MASK(24)) + return 0; + return 1; +} static int __init pci_iommu_init(void) { diff --git a/arch/x86/kernel/pci-nommu.c b/arch/x86/kernel/pci-nommu.c index b0caae27e1b7..618285e475c6 100644 --- a/arch/x86/kernel/pci-nommu.c +++ b/arch/x86/kernel/pci-nommu.c @@ -1,7 +1,7 @@ // SPDX-License-Identifier: GPL-2.0 /* Fallback functions when the main IOMMU code is not compiled in. This code is roughly equivalent to i386. */ -#include <linux/dma-mapping.h> +#include <linux/dma-direct.h> #include <linux/scatterlist.h> #include <linux/string.h> #include <linux/gfp.h> diff --git a/arch/x86/kernel/pci-swiotlb.c b/arch/x86/kernel/pci-swiotlb.c index 53bd05ea90d8..0ee0f8f34251 100644 --- a/arch/x86/kernel/pci-swiotlb.c +++ b/arch/x86/kernel/pci-swiotlb.c @@ -6,7 +6,7 @@ #include <linux/init.h> #include <linux/swiotlb.h> #include <linux/bootmem.h> -#include <linux/dma-mapping.h> +#include <linux/dma-direct.h> #include <linux/mem_encrypt.h> #include <asm/iommu.h> @@ -48,7 +48,7 @@ void x86_swiotlb_free_coherent(struct device *dev, size_t size, dma_generic_free_coherent(dev, size, vaddr, dma_addr, attrs); } -static const struct dma_map_ops swiotlb_dma_ops = { +static const struct dma_map_ops x86_swiotlb_dma_ops = { .mapping_error = swiotlb_dma_mapping_error, .alloc = x86_swiotlb_alloc_coherent, .free = x86_swiotlb_free_coherent, @@ -112,7 +112,7 @@ void __init pci_swiotlb_init(void) { if (swiotlb) { swiotlb_init(0); - dma_ops = &swiotlb_dma_ops; + dma_ops = &x86_swiotlb_dma_ops; } } @@ -120,7 +120,7 @@ void __init pci_swiotlb_late_init(void) { /* An IOMMU turned us off. 
*/ if (!swiotlb) - swiotlb_free(); + swiotlb_exit(); else { printk(KERN_INFO "PCI-DMA: " "Using software bounce buffering for IO (SWIOTLB)\n"); diff --git a/arch/x86/kernel/platform-quirks.c b/arch/x86/kernel/platform-quirks.c index 39a59299bfa0..235fe6008ac8 100644 --- a/arch/x86/kernel/platform-quirks.c +++ b/arch/x86/kernel/platform-quirks.c @@ -9,6 +9,7 @@ void __init x86_early_init_platform_quirks(void) { x86_platform.legacy.i8042 = X86_LEGACY_I8042_EXPECTED_PRESENT; x86_platform.legacy.rtc = 1; + x86_platform.legacy.warm_reset = 1; x86_platform.legacy.reserve_bios_regions = 0; x86_platform.legacy.devices.pnpbios = 1; diff --git a/arch/x86/kernel/pmem.c b/arch/x86/kernel/pmem.c index 3fe690067802..6b07faaa1579 100644 --- a/arch/x86/kernel/pmem.c +++ b/arch/x86/kernel/pmem.c @@ -7,7 +7,7 @@ #include <linux/init.h> #include <linux/ioport.h> -static int found(u64 start, u64 end, void *data) +static int found(struct resource *res, void *data) { return 1; } diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c index c67685337c5a..03408b942adb 100644 --- a/arch/x86/kernel/process.c +++ b/arch/x86/kernel/process.c @@ -21,7 +21,6 @@ #include <linux/dmi.h> #include <linux/utsname.h> #include <linux/stackprotector.h> -#include <linux/tick.h> #include <linux/cpuidle.h> #include <trace/events/power.h> #include <linux/hw_breakpoint.h> @@ -47,9 +46,25 @@ * section. Since TSS's are completely CPU-local, we want them * on exact cacheline boundaries, to eliminate cacheline ping-pong. */ -__visible DEFINE_PER_CPU_SHARED_ALIGNED(struct tss_struct, cpu_tss) = { +__visible DEFINE_PER_CPU_PAGE_ALIGNED(struct tss_struct, cpu_tss_rw) = { .x86_tss = { - .sp0 = TOP_OF_INIT_STACK, + /* + * .sp0 is only used when entering ring 0 from a lower + * privilege level. Since the init task never runs anything + * but ring 0 code, there is no need for a valid value here. + * Poison it. + */ + .sp0 = (1UL << (BITS_PER_LONG-1)) + 1, + +#ifdef CONFIG_X86_64 + /* + * .sp1 is cpu_current_top_of_stack. The init task never + * runs user code, but cpu_current_top_of_stack should still + * be well defined before the first context switch. + */ + .sp1 = TOP_OF_INIT_STACK, +#endif + #ifdef CONFIG_X86_32 .ss0 = __KERNEL_DS, .ss1 = __KERNEL_CS, @@ -65,11 +80,8 @@ __visible DEFINE_PER_CPU_SHARED_ALIGNED(struct tss_struct, cpu_tss) = { */ .io_bitmap = { [0 ... IO_BITMAP_LONGS] = ~0 }, #endif -#ifdef CONFIG_X86_32 - .SYSENTER_stack_canary = STACK_END_MAGIC, -#endif }; -EXPORT_PER_CPU_SYMBOL(cpu_tss); +EXPORT_PER_CPU_SYMBOL(cpu_tss_rw); DEFINE_PER_CPU(bool, __tss_limit_invalid); EXPORT_PER_CPU_SYMBOL_GPL(__tss_limit_invalid); @@ -98,7 +110,7 @@ void exit_thread(struct task_struct *tsk) struct fpu *fpu = &t->fpu; if (bp) { - struct tss_struct *tss = &per_cpu(cpu_tss, get_cpu()); + struct tss_struct *tss = &per_cpu(cpu_tss_rw, get_cpu()); t->io_bitmap_ptr = NULL; clear_thread_flag(TIF_IO_BITMAP); @@ -293,7 +305,7 @@ void __switch_to_xtra(struct task_struct *prev_p, struct task_struct *next_p, } if ((tifp ^ tifn) & _TIF_NOTSC) - cr4_toggle_bits(X86_CR4_TSD); + cr4_toggle_bits_irqsoff(X86_CR4_TSD); if ((tifp ^ tifn) & _TIF_NOCPUID) set_cpuid_faulting(!!(tifn & _TIF_NOCPUID)); @@ -367,19 +379,24 @@ void stop_this_cpu(void *dummy) disable_local_APIC(); mcheck_cpu_clear(this_cpu_ptr(&cpu_info)); + /* + * Use wbinvd on processors that support SME. This provides support + * for performing a successful kexec when going from SME inactive + * to SME active (or vice-versa). 
The cache must be cleared so that + * if there are entries with the same physical address, both with and + * without the encryption bit, they don't race each other when flushed + * and potentially end up with the wrong entry being committed to + * memory. + */ + if (boot_cpu_has(X86_FEATURE_SME)) + native_wbinvd(); for (;;) { /* - * Use wbinvd followed by hlt to stop the processor. This - * provides support for kexec on a processor that supports - * SME. With kexec, going from SME inactive to SME active - * requires clearing cache entries so that addresses without - * the encryption bit set don't corrupt the same physical - * address that has the encryption bit set when caches are - * flushed. To achieve this a wbinvd is performed followed by - * a hlt. Even if the processor is not in the kexec/SME - * scenario this only adds a wbinvd to a halting processor. + * Use native_halt() so that memory contents don't change + * (stack usage and variables) after possibly issuing the + * native_wbinvd() above. */ - asm volatile("wbinvd; hlt" : : : "memory"); + native_halt(); } } diff --git a/arch/x86/kernel/process_32.c b/arch/x86/kernel/process_32.c index 11966251cd42..5224c6099184 100644 --- a/arch/x86/kernel/process_32.c +++ b/arch/x86/kernel/process_32.c @@ -234,7 +234,7 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p) struct fpu *prev_fpu = &prev->fpu; struct fpu *next_fpu = &next->fpu; int cpu = smp_processor_id(); - struct tss_struct *tss = &per_cpu(cpu_tss, cpu); + struct tss_struct *tss = &per_cpu(cpu_tss_rw, cpu); /* never put a printk in __switch_to... printk() calls wake_up*() indirectly */ @@ -284,9 +284,11 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p) /* * Reload esp0 and cpu_current_top_of_stack. This changes - * current_thread_info(). + * current_thread_info(). Refresh the SYSENTER configuration in + * case prev or next is vm86. 
*/ - load_sp0(tss, next); + update_sp0(next_p); + refresh_sysenter_cs(next); this_cpu_write(cpu_current_top_of_stack, (unsigned long)task_stack_page(next_p) + THREAD_SIZE); diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c index 302e7b2572d1..9eb448c7859d 100644 --- a/arch/x86/kernel/process_64.c +++ b/arch/x86/kernel/process_64.c @@ -69,9 +69,8 @@ void __show_regs(struct pt_regs *regs, int all) unsigned int fsindex, gsindex; unsigned int ds, cs, es; - printk(KERN_DEFAULT "RIP: %04lx:%pS\n", regs->cs, (void *)regs->ip); - printk(KERN_DEFAULT "RSP: %04lx:%016lx EFLAGS: %08lx", regs->ss, - regs->sp, regs->flags); + show_iret_regs(regs); + if (regs->orig_ax != -1) pr_cont(" ORIG_RAX: %016lx\n", regs->orig_ax); else @@ -88,6 +87,9 @@ void __show_regs(struct pt_regs *regs, int all) printk(KERN_DEFAULT "R13: %016lx R14: %016lx R15: %016lx\n", regs->r13, regs->r14, regs->r15); + if (!all) + return; + asm("movl %%ds,%0" : "=r" (ds)); asm("movl %%cs,%0" : "=r" (cs)); asm("movl %%es,%0" : "=r" (es)); @@ -98,9 +100,6 @@ void __show_regs(struct pt_regs *regs, int all) rdmsrl(MSR_GS_BASE, gs); rdmsrl(MSR_KERNEL_GS_BASE, shadowgs); - if (!all) - return; - cr0 = read_cr0(); cr2 = read_cr2(); cr3 = __read_cr3(); @@ -274,7 +273,6 @@ int copy_thread_tls(unsigned long clone_flags, unsigned long sp, struct inactive_task_frame *frame; struct task_struct *me = current; - p->thread.sp0 = (unsigned long)task_stack_page(p) + THREAD_SIZE; childregs = task_pt_regs(p); fork_frame = container_of(childregs, struct fork_frame, regs); frame = &fork_frame->frame; @@ -401,7 +399,7 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p) struct fpu *prev_fpu = &prev->fpu; struct fpu *next_fpu = &next->fpu; int cpu = smp_processor_id(); - struct tss_struct *tss = &per_cpu(cpu_tss, cpu); + struct tss_struct *tss = &per_cpu(cpu_tss_rw, cpu); WARN_ON_ONCE(IS_ENABLED(CONFIG_DEBUG_ENTRY) && this_cpu_read(irq_count) != -1); @@ -463,9 +461,10 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p) * Switch the PDA and FPU contexts. */ this_cpu_write(current_task, next_p); + this_cpu_write(cpu_current_top_of_stack, task_top_of_stack(next_p)); - /* Reload esp0 and ss1. This changes current_thread_info(). */ - load_sp0(tss, next); + /* Reload sp0. */ + update_sp0(next_p); /* * Now maybe reload the debug registers and handle I/O bitmaps @@ -558,7 +557,7 @@ static void __set_personality_x32(void) * Pretend to come from a x32 execve. 
*/ task_pt_regs(current)->orig_ax = __NR_x32_execve | __X32_SYSCALL_BIT; - current->thread.status &= ~TS_COMPAT; + current_thread_info()->status &= ~TS_COMPAT; #endif } @@ -572,7 +571,7 @@ static void __set_personality_ia32(void) current->personality |= force_personality32; /* Prepare the first "return" to user space */ task_pt_regs(current)->orig_ax = __NR_ia32_execve; - current->thread.status |= TS_COMPAT; + current_thread_info()->status |= TS_COMPAT; #endif } diff --git a/arch/x86/kernel/ptrace.c b/arch/x86/kernel/ptrace.c index f37d18124648..ed5c4cdf0a34 100644 --- a/arch/x86/kernel/ptrace.c +++ b/arch/x86/kernel/ptrace.c @@ -935,7 +935,7 @@ static int putreg32(struct task_struct *child, unsigned regno, u32 value) */ regs->orig_ax = value; if (syscall_get_nr(child, regs) >= 0) - child->thread.status |= TS_I386_REGS_POKED; + child->thread_info.status |= TS_I386_REGS_POKED; break; case offsetof(struct user32, regs.eflags): diff --git a/arch/x86/kernel/pvclock.c b/arch/x86/kernel/pvclock.c index 5c3f6d6a5078..761f6af6efa5 100644 --- a/arch/x86/kernel/pvclock.c +++ b/arch/x86/kernel/pvclock.c @@ -25,8 +25,10 @@ #include <asm/fixmap.h> #include <asm/pvclock.h> +#include <asm/vgtod.h> static u8 valid_flags __read_mostly = 0; +static struct pvclock_vsyscall_time_info *pvti_cpu0_va __read_mostly; void pvclock_set_flags(u8 flags) { @@ -144,3 +146,15 @@ void pvclock_read_wallclock(struct pvclock_wall_clock *wall_clock, set_normalized_timespec(ts, now.tv_sec, now.tv_nsec); } + +void pvclock_set_pvti_cpu0_va(struct pvclock_vsyscall_time_info *pvti) +{ + WARN_ON(vclock_was_used(VCLOCK_PVCLOCK)); + pvti_cpu0_va = pvti; +} + +struct pvclock_vsyscall_time_info *pvclock_get_pvti_cpu0_va(void) +{ + return pvti_cpu0_va; +} +EXPORT_SYMBOL_GPL(pvclock_get_pvti_cpu0_va); diff --git a/arch/x86/kernel/relocate_kernel_64.S b/arch/x86/kernel/relocate_kernel_64.S index 307d3bac5f04..11eda21eb697 100644 --- a/arch/x86/kernel/relocate_kernel_64.S +++ b/arch/x86/kernel/relocate_kernel_64.S @@ -68,6 +68,9 @@ relocate_kernel: movq %cr4, %rax movq %rax, CR4(%r11) + /* Save CR4. Required to enable the right paging mode later. 
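/*
 * Sketch of a consumer of the new pvclock accessor (modelled on the vDSO and
 * ptp_kvm pvclock users; the exact call site is assumed): instead of kvmclock
 * exporting its private hv_clock array, interested code now asks pvclock for
 * the boot CPU's time info page.
 */
static struct pvclock_vcpu_time_info *get_cpu0_pvti_sketch(void)
{
	struct pvclock_vsyscall_time_info *pvti = pvclock_get_pvti_cpu0_va();

	return pvti ? &pvti->pvti : NULL;
}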
*/ + movq %rax, %r13 + /* zero out flags, and disable interrupts */ pushq $0 popfq @@ -126,8 +129,13 @@ identity_mapped: /* * Set cr4 to a known state: * - physical address extension enabled + * - 5-level paging, if it was enabled before */ movl $X86_CR4_PAE, %eax + testq $X86_CR4_LA57, %r13 + jz 1f + orl $X86_CR4_LA57, %eax +1: movq %rax, %cr4 jmp 1f diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c index 0957dd73d127..4c616be28506 100644 --- a/arch/x86/kernel/setup.c +++ b/arch/x86/kernel/setup.c @@ -114,7 +114,6 @@ #include <asm/alternative.h> #include <asm/prom.h> #include <asm/microcode.h> -#include <asm/mmu_context.h> #include <asm/kaslr.h> #include <asm/unwind.h> @@ -136,18 +135,6 @@ RESERVE_BRK(dmi_alloc, 65536); static __initdata unsigned long _brk_start = (unsigned long)__brk_base; unsigned long _brk_end = (unsigned long)__brk_base; -#ifdef CONFIG_X86_64 -int default_cpu_present_to_apicid(int mps_cpu) -{ - return __default_cpu_present_to_apicid(mps_cpu); -} - -int default_check_phys_apicid_present(int phys_apicid) -{ - return __default_check_phys_apicid_present(phys_apicid); -} -#endif - struct boot_params boot_params; /* @@ -376,14 +363,6 @@ static void __init reserve_initrd(void) !ramdisk_image || !ramdisk_size) return; /* No initrd provided by bootloader */ - /* - * If SME is active, this memory will be marked encrypted by the - * kernel when it is accessed (including relocation). However, the - * ramdisk image was loaded decrypted by the bootloader, so make - * sure that it is encrypted before accessing it. - */ - sme_early_encrypt(ramdisk_image, ramdisk_end - ramdisk_image); - initrd_start = 0; mapped_size = memblock_mem_size(max_pfn_mapped); @@ -822,26 +801,6 @@ dump_kernel_offset(struct notifier_block *self, unsigned long v, void *p) return 0; } -static void __init simple_udelay_calibration(void) -{ - unsigned int tsc_khz, cpu_khz; - unsigned long lpj; - - if (!boot_cpu_has(X86_FEATURE_TSC)) - return; - - cpu_khz = x86_platform.calibrate_cpu(); - tsc_khz = x86_platform.calibrate_tsc(); - - tsc_khz = tsc_khz ? : cpu_khz; - if (!tsc_khz) - return; - - lpj = tsc_khz * 1000; - do_div(lpj, HZ); - loops_per_jiffy = lpj; -} - /* * Determine if we were loaded by an EFI loader. If so, then we have also been * passed the efi memmap, systab, etc., so we should use these data structures @@ -936,9 +895,6 @@ void __init setup_arch(char **cmdline_p) set_bit(EFI_BOOT, &efi.flags); set_bit(EFI_64BIT, &efi.flags); } - - if (efi_enabled(EFI_BOOT)) - efi_memblock_x86_reserve_range(); #endif x86_init.oem.arch_setup(); @@ -992,6 +948,8 @@ void __init setup_arch(char **cmdline_p) parse_early_param(); + if (efi_enabled(EFI_BOOT)) + efi_memblock_x86_reserve_range(); #ifdef CONFIG_MEMORY_HOTPLUG /* * Memory used by the kernel cannot be hot-removed because Linux @@ -1045,12 +1003,10 @@ void __init setup_arch(char **cmdline_p) /* * VMware detection requires dmi to be available, so this - * needs to be done after dmi_scan_machine, for the BP. + * needs to be done after dmi_scan_machine(), for the boot CPU. 
*/ init_hypervisor_platform(); - simple_udelay_calibration(); - x86_init.resources.probe_roms(); /* after parse_early_param, so could debug it */ @@ -1135,9 +1091,6 @@ void __init setup_arch(char **cmdline_p) memblock_set_current_limit(ISA_END_ADDRESS); e820__memblock_setup(); - if (!early_xdbc_setup_hardware()) - early_xdbc_register_console(); - reserve_bios_regions(); if (efi_enabled(EFI_MEMMAP)) { @@ -1243,24 +1196,21 @@ void __init setup_arch(char **cmdline_p) kvmclock_init(); #endif + tsc_early_delay_calibrate(); + if (!early_xdbc_setup_hardware()) + early_xdbc_register_console(); + x86_init.paging.pagetable_init(); kasan_init(); -#ifdef CONFIG_X86_32 - /* sync back kernel address range */ - clone_pgd_range(initial_page_table + KERNEL_PGD_BOUNDARY, - swapper_pg_dir + KERNEL_PGD_BOUNDARY, - KERNEL_PGD_PTRS); - /* - * sync back low identity map too. It is used for example - * in the 32-bit EFI stub. + * Sync back kernel address range. + * + * FIXME: Can the later sync in setup_cpu_entry_areas() replace + * this call? */ - clone_pgd_range(initial_page_table, - swapper_pg_dir + KERNEL_PGD_BOUNDARY, - min(KERNEL_PGD_PTRS, KERNEL_PGD_BOUNDARY)); -#endif + sync_initial_page_table(); tboot_probe(); @@ -1294,7 +1244,7 @@ void __init setup_arch(char **cmdline_p) io_apic_init_mappings(); - kvm_guest_init(); + x86_init.hyper.guest_late_init(); e820__reserve_resources(); e820__register_nosave_regions(max_low_pfn); diff --git a/arch/x86/kernel/setup_percpu.c b/arch/x86/kernel/setup_percpu.c index 497aa766fab3..ea554f812ee1 100644 --- a/arch/x86/kernel/setup_percpu.c +++ b/arch/x86/kernel/setup_percpu.c @@ -287,24 +287,15 @@ void __init setup_per_cpu_areas(void) /* Setup cpu initialized, callin, callout masks */ setup_cpu_local_masks(); -#ifdef CONFIG_X86_32 /* * Sync back kernel address range again. We already did this in * setup_arch(), but percpu data also needs to be available in * the smpboot asm. We can't reliably pick up percpu mappings * using vmalloc_fault(), because exception dispatch needs * percpu data. + * + * FIXME: Can the later sync in setup_cpu_entry_areas() replace + * this call? */ - clone_pgd_range(initial_page_table + KERNEL_PGD_BOUNDARY, - swapper_pg_dir + KERNEL_PGD_BOUNDARY, - KERNEL_PGD_PTRS); - - /* - * sync back low identity map too. It is used for example - * in the 32-bit EFI stub. - */ - clone_pgd_range(initial_page_table, - swapper_pg_dir + KERNEL_PGD_BOUNDARY, - min(KERNEL_PGD_PTRS, KERNEL_PGD_BOUNDARY)); -#endif + sync_initial_page_table(); } diff --git a/arch/x86/kernel/signal.c b/arch/x86/kernel/signal.c index b9e00e8f1c9b..4cdc0b27ec82 100644 --- a/arch/x86/kernel/signal.c +++ b/arch/x86/kernel/signal.c @@ -787,7 +787,7 @@ static inline unsigned long get_nr_restart_syscall(const struct pt_regs *regs) * than the tracee. */ #ifdef CONFIG_IA32_EMULATION - if (current->thread.status & (TS_COMPAT|TS_I386_REGS_POKED)) + if (current_thread_info()->status & (TS_COMPAT|TS_I386_REGS_POKED)) return __NR_ia32_restart_syscall; #endif #ifdef CONFIG_X86_X32_ABI diff --git a/arch/x86/kernel/signal_compat.c b/arch/x86/kernel/signal_compat.c index 8c6da1a643da..ac057f9b0763 100644 --- a/arch/x86/kernel/signal_compat.c +++ b/arch/x86/kernel/signal_compat.c @@ -25,8 +25,8 @@ static inline void signal_compat_build_tests(void) * limits also have to look at this code. Make sure any * new fields are handled in copy_siginfo_to_user32()! 
*/ - BUILD_BUG_ON(NSIGILL != 8); - BUILD_BUG_ON(NSIGFPE != 8); + BUILD_BUG_ON(NSIGILL != 11); + BUILD_BUG_ON(NSIGFPE != 13); BUILD_BUG_ON(NSIGSEGV != 4); BUILD_BUG_ON(NSIGBUS != 5); BUILD_BUG_ON(NSIGTRAP != 4); @@ -64,7 +64,7 @@ static inline void signal_compat_build_tests(void) CHECK_SI_SIZE (_kill, 2*sizeof(int)); CHECK_CSI_OFFSET(_timer); - CHECK_CSI_SIZE (_timer, 5*sizeof(int)); + CHECK_CSI_SIZE (_timer, 3*sizeof(int)); CHECK_SI_SIZE (_timer, 6*sizeof(int)); CHECK_CSI_OFFSET(_rt); @@ -75,9 +75,11 @@ static inline void signal_compat_build_tests(void) CHECK_CSI_SIZE (_sigchld, 5*sizeof(int)); CHECK_SI_SIZE (_sigchld, 8*sizeof(int)); +#ifdef CONFIG_X86_X32_ABI CHECK_CSI_OFFSET(_sigchld_x32); CHECK_CSI_SIZE (_sigchld_x32, 7*sizeof(int)); /* no _sigchld_x32 in the generic siginfo_t */ +#endif CHECK_CSI_OFFSET(_sigfault); CHECK_CSI_SIZE (_sigfault, 4*sizeof(int)); @@ -96,6 +98,8 @@ static inline void signal_compat_build_tests(void) void sigaction_compat_abi(struct k_sigaction *act, struct k_sigaction *oact) { + signal_compat_build_tests(); + /* Don't leak in-kernel non-uapi flags to user-space */ if (oact) oact->sa.sa_flags &= ~(SA_IA32_ABI | SA_X32_ABI); @@ -111,116 +115,3 @@ void sigaction_compat_abi(struct k_sigaction *act, struct k_sigaction *oact) if (in_x32_syscall()) act->sa.sa_flags |= SA_X32_ABI; } - -int __copy_siginfo_to_user32(compat_siginfo_t __user *to, const siginfo_t *from, - bool x32_ABI) -{ - int err = 0; - - signal_compat_build_tests(); - - if (!access_ok(VERIFY_WRITE, to, sizeof(compat_siginfo_t))) - return -EFAULT; - - put_user_try { - /* If you change siginfo_t structure, please make sure that - this code is fixed accordingly. - It should never copy any pad contained in the structure - to avoid security leaks, but must copy the generic - 3 ints plus the relevant union member. 
*/ - put_user_ex(from->si_signo, &to->si_signo); - put_user_ex(from->si_errno, &to->si_errno); - put_user_ex(from->si_code, &to->si_code); - - if (from->si_code < 0) { - put_user_ex(from->si_pid, &to->si_pid); - put_user_ex(from->si_uid, &to->si_uid); - put_user_ex(ptr_to_compat(from->si_ptr), &to->si_ptr); - } else { - /* - * First 32bits of unions are always present: - * si_pid === si_band === si_tid === si_addr(LS half) - */ - put_user_ex(from->_sifields._pad[0], - &to->_sifields._pad[0]); - switch (siginfo_layout(from->si_signo, from->si_code)) { - case SIL_FAULT: - if (from->si_signo == SIGBUS && - (from->si_code == BUS_MCEERR_AR || - from->si_code == BUS_MCEERR_AO)) - put_user_ex(from->si_addr_lsb, &to->si_addr_lsb); - - if (from->si_signo == SIGSEGV) { - if (from->si_code == SEGV_BNDERR) { - compat_uptr_t lower = (unsigned long)from->si_lower; - compat_uptr_t upper = (unsigned long)from->si_upper; - put_user_ex(lower, &to->si_lower); - put_user_ex(upper, &to->si_upper); - } - if (from->si_code == SEGV_PKUERR) - put_user_ex(from->si_pkey, &to->si_pkey); - } - break; - case SIL_SYS: - put_user_ex(from->si_syscall, &to->si_syscall); - put_user_ex(from->si_arch, &to->si_arch); - break; - case SIL_CHLD: - if (!x32_ABI) { - put_user_ex(from->si_utime, &to->si_utime); - put_user_ex(from->si_stime, &to->si_stime); - } else { - put_user_ex(from->si_utime, &to->_sifields._sigchld_x32._utime); - put_user_ex(from->si_stime, &to->_sifields._sigchld_x32._stime); - } - put_user_ex(from->si_status, &to->si_status); - /* FALL THROUGH */ - case SIL_KILL: - put_user_ex(from->si_uid, &to->si_uid); - break; - case SIL_POLL: - put_user_ex(from->si_fd, &to->si_fd); - break; - case SIL_TIMER: - put_user_ex(from->si_overrun, &to->si_overrun); - put_user_ex(ptr_to_compat(from->si_ptr), - &to->si_ptr); - break; - case SIL_RT: - put_user_ex(from->si_uid, &to->si_uid); - put_user_ex(from->si_int, &to->si_int); - break; - } - } - } put_user_catch(err); - - return err; -} - -/* from syscall's path, where we know the ABI */ -int copy_siginfo_to_user32(compat_siginfo_t __user *to, const siginfo_t *from) -{ - return __copy_siginfo_to_user32(to, from, in_x32_syscall()); -} - -int copy_siginfo_from_user32(siginfo_t *to, compat_siginfo_t __user *from) -{ - int err = 0; - u32 ptr32; - - if (!access_ok(VERIFY_READ, from, sizeof(compat_siginfo_t))) - return -EFAULT; - - get_user_try { - get_user_ex(to->si_signo, &from->si_signo); - get_user_ex(to->si_errno, &from->si_errno); - get_user_ex(to->si_code, &from->si_code); - - get_user_ex(to->si_pid, &from->si_pid); - get_user_ex(to->si_uid, &from->si_uid); - get_user_ex(ptr32, &from->si_ptr); - to->si_ptr = compat_ptr(ptr32); - } get_user_catch(err); - - return err; -} diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c index ad59edd84de7..ff99e2b6fc54 100644 --- a/arch/x86/kernel/smpboot.c +++ b/arch/x86/kernel/smpboot.c @@ -75,8 +75,8 @@ #include <asm/uv/uv.h> #include <linux/mc146818rtc.h> #include <asm/i8259.h> -#include <asm/realmode.h> #include <asm/misc.h> +#include <asm/qspinlock.h> /* Number of siblings per CPU package */ int smp_num_siblings = 1; @@ -100,15 +100,12 @@ DEFINE_PER_CPU_READ_MOSTLY(struct cpuinfo_x86, cpu_info); EXPORT_PER_CPU_SYMBOL(cpu_info); /* Logical package management. 
We might want to allocate that dynamically */ -static int *physical_to_logical_pkg __read_mostly; -static unsigned long *physical_package_map __read_mostly;; -static unsigned int max_physical_pkg_id __read_mostly; unsigned int __max_logical_packages __read_mostly; EXPORT_SYMBOL(__max_logical_packages); static unsigned int logical_packages __read_mostly; /* Maximum number of SMT threads on any online core */ -int __max_smt_threads __read_mostly; +int __read_mostly __max_smt_threads = 1; /* Flag to indicate if a complete sched domain rebuild is required */ bool x86_topology_update; @@ -128,14 +125,10 @@ static inline void smpboot_setup_warm_reset_vector(unsigned long start_eip) spin_lock_irqsave(&rtc_lock, flags); CMOS_WRITE(0xa, 0xf); spin_unlock_irqrestore(&rtc_lock, flags); - local_flush_tlb(); - pr_debug("1.\n"); *((volatile unsigned short *)phys_to_virt(TRAMPOLINE_PHYS_HIGH)) = start_eip >> 4; - pr_debug("2.\n"); *((volatile unsigned short *)phys_to_virt(TRAMPOLINE_PHYS_LOW)) = start_eip & 0xf; - pr_debug("3.\n"); } static inline void smpboot_restore_warm_reset_vector(void) @@ -143,11 +136,6 @@ static inline void smpboot_restore_warm_reset_vector(void) unsigned long flags; /* - * Install writable page 0 entry to set BIOS data area. - */ - local_flush_tlb(); - - /* * Paranoid: Set warm reset code and vector here back * to default values. */ @@ -194,6 +182,12 @@ static void smp_callin(void) smp_store_cpu_info(cpuid); /* + * The topology information must be up to date before + * calibrate_delay() and notify_cpu_starting(). + */ + set_cpu_sibling_map(raw_smp_processor_id()); + + /* * Get our bogomips. * Update loops_per_jiffy in cpu_data. Previous call to * smp_store_cpu_info() stored a value that is close but not as @@ -203,11 +197,6 @@ static void smp_callin(void) cpu_data(cpuid).loops_per_jiffy = loops_per_jiffy; pr_debug("Stack at about %p\n", &cpuid); - /* - * This must be done before setting cpu_online_mask - * or calling notify_cpu_starting. - */ - set_cpu_sibling_map(raw_smp_processor_id()); wmb(); notify_cpu_starting(cpuid); @@ -238,7 +227,7 @@ static void notrace start_secondary(void *unused) load_cr3(swapper_pg_dir); __flush_tlb_all(); #endif - + load_current_idt(); cpu_init(); x86_cpuinit.early_percpu_clock_init(); preempt_disable(); @@ -249,19 +238,19 @@ static void notrace start_secondary(void *unused) /* otherwise gcc will move up smp_processor_id before the cpu_init */ barrier(); /* - * Check TSC synchronization with the BP: + * Check TSC synchronization with the boot CPU: */ check_tsc_sync_target(); /* - * Lock vector_lock and initialize the vectors on this cpu - * before setting the cpu online. We must set it online with - * vector_lock held to prevent a concurrent setup/teardown - * from seeing a half valid vector space. + * Lock vector_lock, set CPU online and bring the vector + * allocator online. Online must be set with vector_lock held + * to prevent a concurrent irq setup/teardown from seeing a + * half valid vector space. 
*/ lock_vector_lock(); - setup_vector_irq(smp_processor_id()); set_cpu_online(smp_processor_id(), true); + lapic_online(); unlock_vector_lock(); cpu_set_state_online(smp_processor_id()); x86_platform.nmi_init(); @@ -279,108 +268,48 @@ static void notrace start_secondary(void *unused) } /** + * topology_phys_to_logical_pkg - Map a physical package id to a logical + * + * Returns logical package id or -1 if not found + */ +int topology_phys_to_logical_pkg(unsigned int phys_pkg) +{ + int cpu; + + for_each_possible_cpu(cpu) { + struct cpuinfo_x86 *c = &cpu_data(cpu); + + if (c->initialized && c->phys_proc_id == phys_pkg) + return c->logical_proc_id; + } + return -1; +} +EXPORT_SYMBOL(topology_phys_to_logical_pkg); + +/** * topology_update_package_map - Update the physical to logical package map * @pkg: The physical package id as retrieved via CPUID * @cpu: The cpu for which this is updated */ int topology_update_package_map(unsigned int pkg, unsigned int cpu) { - unsigned int new; - - /* Called from early boot ? */ - if (!physical_package_map) - return 0; + int new; - if (pkg >= max_physical_pkg_id) - return -EINVAL; - - /* Set the logical package id */ - if (test_and_set_bit(pkg, physical_package_map)) + /* Already available somewhere? */ + new = topology_phys_to_logical_pkg(pkg); + if (new >= 0) goto found; - if (logical_packages >= __max_logical_packages) { - pr_warn("Package %u of CPU %u exceeds BIOS package data %u.\n", - logical_packages, cpu, __max_logical_packages); - return -ENOSPC; - } - new = logical_packages++; if (new != pkg) { pr_info("CPU %u Converting physical %u to logical package %u\n", cpu, pkg, new); } - physical_to_logical_pkg[pkg] = new; - found: - cpu_data(cpu).logical_proc_id = physical_to_logical_pkg[pkg]; + cpu_data(cpu).logical_proc_id = new; return 0; } -/** - * topology_phys_to_logical_pkg - Map a physical package id to a logical - * - * Returns logical package id or -1 if not found - */ -int topology_phys_to_logical_pkg(unsigned int phys_pkg) -{ - if (phys_pkg >= max_physical_pkg_id) - return -1; - return physical_to_logical_pkg[phys_pkg]; -} -EXPORT_SYMBOL(topology_phys_to_logical_pkg); - -static void __init smp_init_package_map(struct cpuinfo_x86 *c, unsigned int cpu) -{ - unsigned int ncpus; - size_t size; - - /* - * Today neither Intel nor AMD support heterogenous systems. That - * might change in the future.... - * - * While ideally we'd want '* smp_num_siblings' in the below @ncpus - * computation, this won't actually work since some Intel BIOSes - * report inconsistent HT data when they disable HT. - * - * In particular, they reduce the APIC-IDs to only include the cores, - * but leave the CPUID topology to say there are (2) siblings. - * This means we don't know how many threads there will be until - * after the APIC enumeration. - * - * By not including this we'll sometimes over-estimate the number of - * logical packages by the amount of !present siblings, but this is - * still better than MAX_LOCAL_APIC. - * - * We use total_cpus not nr_cpu_ids because nr_cpu_ids can be limited - * on the command line leading to a similar issue as the HT disable - * problem because the hyperthreads are usually enumerated after the - * primary cores. 
- */ - ncpus = boot_cpu_data.x86_max_cores; - if (!ncpus) { - pr_warn("x86_max_cores == zero !?!?"); - ncpus = 1; - } - - __max_logical_packages = DIV_ROUND_UP(total_cpus, ncpus); - logical_packages = 0; - - /* - * Possibly larger than what we need as the number of apic ids per - * package can be smaller than the actual used apic ids. - */ - max_physical_pkg_id = DIV_ROUND_UP(MAX_LOCAL_APIC, ncpus); - size = max_physical_pkg_id * sizeof(unsigned int); - physical_to_logical_pkg = kmalloc(size, GFP_KERNEL); - memset(physical_to_logical_pkg, 0xff, size); - size = BITS_TO_LONGS(max_physical_pkg_id) * sizeof(unsigned long); - physical_package_map = kzalloc(size, GFP_KERNEL); - - pr_info("Max logical packages: %u\n", __max_logical_packages); - - topology_update_package_map(c->phys_proc_id, cpu); -} - void __init smp_store_boot_cpu_info(void) { int id = 0; /* CPU 0 */ @@ -388,7 +317,8 @@ void __init smp_store_boot_cpu_info(void) *c = boot_cpu_data; c->cpu_index = id; - smp_init_package_map(c, id); + topology_update_package_map(c->phys_proc_id, id); + c->initialized = true; } /* @@ -399,13 +329,16 @@ void smp_store_cpu_info(int id) { struct cpuinfo_x86 *c = &cpu_data(id); - *c = boot_cpu_data; + /* Copy boot_cpu_data only on the first bringup */ + if (!c->initialized) + *c = boot_cpu_data; c->cpu_index = id; /* * During boot time, CPU0 has this setup already. Save the info when * bringing up AP or offlined CPU0. */ identify_secondary_cpu(c); + c->initialized = true; } static bool @@ -961,8 +894,7 @@ void common_cpu_up(unsigned int cpu, struct task_struct *idle) #ifdef CONFIG_X86_32 /* Stack for startup_32 can be just as for start_secondary onwards */ irq_ctx_init(cpu); - per_cpu(cpu_current_top_of_stack, cpu) = - (unsigned long)task_stack_page(idle) + THREAD_SIZE; + per_cpu(cpu_current_top_of_stack, cpu) = task_top_of_stack(idle); #else initial_gs = per_cpu_offset(cpu); #endif @@ -990,12 +922,8 @@ static int do_boot_cpu(int apicid, int cpu, struct task_struct *idle, initial_code = (unsigned long)start_secondary; initial_stack = idle->thread.sp; - /* - * Enable the espfix hack for this CPU - */ -#ifdef CONFIG_X86_ESPFIX64 + /* Enable the espfix hack for this CPU */ init_espfix_ap(cpu); -#endif /* So we see what's up */ announce_cpu(cpu, apicid); @@ -1005,7 +933,7 @@ static int do_boot_cpu(int apicid, int cpu, struct task_struct *idle, * the targeted processor. */ - if (get_uv_system_type() != UV_NON_UNIQUE_APIC) { + if (x86_platform.legacy.warm_reset) { pr_debug("Setting warm reset code and vector.\n"); @@ -1077,7 +1005,7 @@ static int do_boot_cpu(int apicid, int cpu, struct task_struct *idle, /* mark "stuck" area as not stuck */ *trampoline_status = 0; - if (get_uv_system_type() != UV_NON_UNIQUE_APIC) { + if (x86_platform.legacy.warm_reset) { /* * Cleanup possible dangling ends... */ @@ -1094,7 +1022,7 @@ int native_cpu_up(unsigned int cpu, struct task_struct *tidle) unsigned long flags; int err, ret = 0; - WARN_ON(irqs_disabled()); + lockdep_assert_irqs_enabled(); pr_debug("++++++++++++++++++++=_---CPU UP %u\n", cpu); @@ -1190,17 +1118,10 @@ static __init void disable_smp(void) cpumask_set_cpu(0, topology_core_cpumask(0)); } -enum { - SMP_OK, - SMP_NO_CONFIG, - SMP_NO_APIC, - SMP_FORCE_UP, -}; - /* * Various sanity checks. 
*/ -static int __init smp_sanity_check(unsigned max_cpus) +static void __init smp_sanity_check(void) { preempt_disable(); @@ -1238,16 +1159,6 @@ static int __init smp_sanity_check(unsigned max_cpus) } /* - * If we couldn't find an SMP configuration at boot time, - * get out of here now! - */ - if (!smp_found_config && !acpi_lapic) { - preempt_enable(); - pr_notice("SMP motherboard not detected\n"); - return SMP_NO_CONFIG; - } - - /* * Should not be necessary because the MP table should list the boot * CPU too, but we do it for the sake of robustness anyway. */ @@ -1257,29 +1168,6 @@ static int __init smp_sanity_check(unsigned max_cpus) physid_set(hard_smp_processor_id(), phys_cpu_present_map); } preempt_enable(); - - /* - * If we couldn't find a local APIC, then get out of here now! - */ - if (APIC_INTEGRATED(boot_cpu_apic_version) && - !boot_cpu_has(X86_FEATURE_APIC)) { - if (!disable_apic) { - pr_err("BIOS bug, local APIC #%d not detected!...\n", - boot_cpu_physical_apicid); - pr_err("... forcing use of dummy APIC emulation (tell your hw vendor)\n"); - } - return SMP_NO_APIC; - } - - /* - * If SMP should be disabled, then really disable it! - */ - if (!max_cpus) { - pr_info("SMP mode deactivated\n"); - return SMP_FORCE_UP; - } - - return SMP_OK; } static void __init smp_cpu_index_default(void) @@ -1294,9 +1182,18 @@ static void __init smp_cpu_index_default(void) } } +static void __init smp_get_logical_apicid(void) +{ + if (x2apic_mode) + cpu0_logical_apicid = apic_read(APIC_LDR); + else + cpu0_logical_apicid = GET_APIC_LOGICAL_ID(apic_read(APIC_LDR)); +} + /* - * Prepare for SMP bootup. The MP table or ACPI has been read - * earlier. Just do some sanity checking here and enable APIC mode. + * Prepare for SMP bootup. + * @max_cpus: configured maximum number of CPUs, It is a legacy parameter + * for common interface support. */ void __init native_smp_prepare_cpus(unsigned int max_cpus) { @@ -1328,35 +1225,33 @@ void __init native_smp_prepare_cpus(unsigned int max_cpus) set_cpu_sibling_map(0); - switch (smp_sanity_check(max_cpus)) { - case SMP_NO_CONFIG: - disable_smp(); - if (APIC_init_uniprocessor()) - pr_notice("Local APIC not detected. Using dummy APIC emulation.\n"); - return; - case SMP_NO_APIC: + smp_sanity_check(); + + switch (apic_intr_mode) { + case APIC_PIC: + case APIC_VIRTUAL_WIRE_NO_CONFIG: disable_smp(); return; - case SMP_FORCE_UP: + case APIC_SYMMETRIC_IO_NO_ROUTING: disable_smp(); - apic_bsp_setup(false); + /* Setup local timer */ + x86_init.timers.setup_percpu_clockev(); return; - case SMP_OK: + case APIC_VIRTUAL_WIRE: + case APIC_SYMMETRIC_IO: break; } - if (read_apic_id() != boot_cpu_physical_apicid) { - panic("Boot APIC ID in local APIC unexpected (%d vs %d)", - read_apic_id(), boot_cpu_physical_apicid); - /* Or can we switch back to PIC here? */ - } + /* Setup local timer */ + x86_init.timers.setup_percpu_clockev(); - default_setup_apic_routing(); - cpu0_logical_apicid = apic_bsp_setup(false); + smp_get_logical_apicid(); pr_info("CPU0: "); print_cpu_info(&cpu_data(0)); + native_pv_lock_init(); + uv_system_init(); set_mtrr_aps_delayed_init(); @@ -1386,16 +1281,30 @@ void __init native_smp_prepare_boot_cpu(void) cpu_set_state_online(me); } +void __init calculate_max_logical_packages(void) +{ + int ncpus; + + /* + * Today neither Intel nor AMD support heterogenous systems so + * extrapolate the boot cpu's data to all packages. 
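/*
 * Worked example (hypothetical system): two sockets, 8 cores per socket, HT
 * enabled -> cpu_data(0).booted_cores = 8, topology_max_smt_threads() = 2,
 * so ncpus = 16; with nr_cpu_ids = 32 this gives
 * DIV_ROUND_UP(32, 16) = 2 logical packages.
 */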
+ */ + ncpus = cpu_data(0).booted_cores * topology_max_smt_threads(); + __max_logical_packages = DIV_ROUND_UP(nr_cpu_ids, ncpus); + pr_info("Max logical packages: %u\n", __max_logical_packages); +} + void __init native_smp_cpus_done(unsigned int max_cpus) { pr_debug("Boot done\n"); + calculate_max_logical_packages(); + if (x86_has_numa_in_package) set_sched_topology(x86_numa_in_package_topology); nmi_selftest(); impress_friends(); - setup_ioapic_dest(); mtrr_aps_init(); } @@ -1527,8 +1436,8 @@ static void remove_siblinginfo(int cpu) cpumask_clear(cpu_llc_shared_mask(cpu)); cpumask_clear(topology_sibling_cpumask(cpu)); cpumask_clear(topology_core_cpumask(cpu)); - c->phys_proc_id = 0; c->cpu_core_id = 0; + c->booted_cores = 0; cpumask_clear_cpu(cpu, cpu_sibling_setup_mask); recompute_smt_state(); } @@ -1554,13 +1463,14 @@ void cpu_disable_common(void) remove_cpu_from_maps(cpu); unlock_vector_lock(); fixup_irqs(); + lapic_offline(); } int native_cpu_disable(void) { int ret; - ret = check_irq_vectors_for_cpu_disable(); + ret = lapic_can_unplug_cpu(); if (ret) return ret; diff --git a/arch/x86/kernel/stacktrace.c b/arch/x86/kernel/stacktrace.c index 8dabd7bf1673..093f2ea5dd56 100644 --- a/arch/x86/kernel/stacktrace.c +++ b/arch/x86/kernel/stacktrace.c @@ -30,7 +30,7 @@ static int save_stack_address(struct stack_trace *trace, unsigned long addr, return 0; } -static void __save_stack_trace(struct stack_trace *trace, +static void noinline __save_stack_trace(struct stack_trace *trace, struct task_struct *task, struct pt_regs *regs, bool nosched) { @@ -56,6 +56,7 @@ static void __save_stack_trace(struct stack_trace *trace, */ void save_stack_trace(struct stack_trace *trace) { + trace->skip++; __save_stack_trace(trace, current, NULL, false); } EXPORT_SYMBOL_GPL(save_stack_trace); @@ -70,6 +71,8 @@ void save_stack_trace_tsk(struct task_struct *tsk, struct stack_trace *trace) if (!try_get_task_stack(tsk)) return; + if (tsk == current) + trace->skip++; __save_stack_trace(trace, tsk, NULL, true); put_task_stack(tsk); @@ -88,8 +91,9 @@ EXPORT_SYMBOL_GPL(save_stack_trace_tsk); } \ }) -static int __save_stack_trace_reliable(struct stack_trace *trace, - struct task_struct *task) +static int __always_inline +__save_stack_trace_reliable(struct stack_trace *trace, + struct task_struct *task) { struct unwind_state state; struct pt_regs *regs; @@ -98,7 +102,7 @@ static int __save_stack_trace_reliable(struct stack_trace *trace, for (unwind_start(&state, task, NULL, NULL); !unwind_done(&state); unwind_next_frame(&state)) { - regs = unwind_get_entry_regs(&state); + regs = unwind_get_entry_regs(&state, NULL); if (regs) { /* * Kernel mode registers on the stack indicate an @@ -160,8 +164,12 @@ int save_stack_trace_tsk_reliable(struct task_struct *tsk, { int ret; + /* + * If the task doesn't have a stack (e.g., a zombie), the stack is + * "reliably" empty. + */ if (!try_get_task_stack(tsk)) - return -EINVAL; + return 0; ret = __save_stack_trace_reliable(trace, tsk); diff --git a/arch/x86/kernel/sys_x86_64.c b/arch/x86/kernel/sys_x86_64.c index a63fe77b3217..676774b9bb8d 100644 --- a/arch/x86/kernel/sys_x86_64.c +++ b/arch/x86/kernel/sys_x86_64.c @@ -188,6 +188,7 @@ arch_get_unmapped_area_topdown(struct file *filp, const unsigned long addr0, if (len > TASK_SIZE) return -ENOMEM; + /* No address checking. 
See comment at mmap_address_hint_valid() */ if (flags & MAP_FIXED) return addr; @@ -197,12 +198,15 @@ arch_get_unmapped_area_topdown(struct file *filp, const unsigned long addr0, /* requesting a specific address */ if (addr) { - addr = PAGE_ALIGN(addr); + addr &= PAGE_MASK; + if (!mmap_address_hint_valid(addr, len)) + goto get_unmapped_area; + vma = find_vma(mm, addr); - if (TASK_SIZE - len >= addr && - (!vma || addr + len <= vm_start_gap(vma))) + if (!vma || addr + len <= vm_start_gap(vma)) return addr; } +get_unmapped_area: info.flags = VM_UNMAPPED_AREA_TOPDOWN; info.length = len; diff --git a/arch/x86/kernel/tboot.c b/arch/x86/kernel/tboot.c index a4eb27918ceb..a2486f444073 100644 --- a/arch/x86/kernel/tboot.c +++ b/arch/x86/kernel/tboot.c @@ -138,6 +138,17 @@ static int map_tboot_page(unsigned long vaddr, unsigned long pfn, return -1; set_pte_at(&tboot_mm, vaddr, pte, pfn_pte(pfn, prot)); pte_unmap(pte); + + /* + * PTI poisons low addresses in the kernel page tables in the + * name of making them unusable for userspace. To execute + * code at such a low address, the poison must be cleared. + * + * Note: 'pgd' actually gets set in p4d_alloc() _or_ + * pud_alloc() depending on 4/5-level paging. + */ + pgd->pgd &= ~_PAGE_NX; + return 0; } diff --git a/arch/x86/kernel/time.c b/arch/x86/kernel/time.c index 879af864d99a..774ebafa97c4 100644 --- a/arch/x86/kernel/time.c +++ b/arch/x86/kernel/time.c @@ -69,9 +69,12 @@ static struct irqaction irq0 = { static void __init setup_default_timer_irq(void) { - if (!nr_legacy_irqs()) - return; - setup_irq(0, &irq0); + /* + * Unconditionally register the legacy timer; even without legacy + * PIC/PIT we need this for the HPET0 in legacy replacement mode. + */ + if (setup_irq(0, &irq0)) + pr_info("Failed to register legacy timer interrupt\n"); } /* Default timer init function */ @@ -85,6 +88,11 @@ void __init hpet_time_init(void) static __init void x86_late_time_init(void) { x86_init.timers.timer_init(); + /* + * After PIT/HPET timers init, select and setup + * the final interrupt mode for delivering IRQs. + */ + x86_init.irqs.intr_mode_init(); tsc_init(); } diff --git a/arch/x86/kernel/tls.c b/arch/x86/kernel/tls.c index 9a9c9b076955..a5b802a12212 100644 --- a/arch/x86/kernel/tls.c +++ b/arch/x86/kernel/tls.c @@ -93,17 +93,10 @@ static void set_tls_desc(struct task_struct *p, int idx, cpu = get_cpu(); while (n-- > 0) { - if (LDT_empty(info) || LDT_zero(info)) { + if (LDT_empty(info) || LDT_zero(info)) memset(desc, 0, sizeof(*desc)); - } else { + else fill_ldt(desc, info); - - /* - * Always set the accessed bit so that the CPU - * doesn't try to write to the (read-only) GDT. 
- */ - desc->type |= 1; - } ++info; ++desc; } diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c index 67db4f43309e..3d9b2308e7fa 100644 --- a/arch/x86/kernel/traps.c +++ b/arch/x86/kernel/traps.c @@ -42,7 +42,6 @@ #include <linux/edac.h> #endif -#include <asm/kmemcheck.h> #include <asm/stacktrace.h> #include <asm/processor.h> #include <asm/debugreg.h> @@ -52,6 +51,7 @@ #include <asm/traps.h> #include <asm/desc.h> #include <asm/fpu/internal.h> +#include <asm/cpu_entry_area.h> #include <asm/mce.h> #include <asm/fixmap.h> #include <asm/mach_traps.h> @@ -60,6 +60,7 @@ #include <asm/trace/mpx.h> #include <asm/mpx.h> #include <asm/vm86.h> +#include <asm/umip.h> #ifdef CONFIG_X86_64 #include <asm/x86_init.h> @@ -71,7 +72,7 @@ #include <asm/proto.h> #endif -DECLARE_BITMAP(used_vectors, NR_VECTORS); +DECLARE_BITMAP(system_vectors, NR_VECTORS); static inline void cond_local_irq_enable(struct pt_regs *regs) { @@ -141,8 +142,7 @@ void ist_begin_non_atomic(struct pt_regs *regs) * will catch asm bugs and any attempt to use ist_preempt_enable * from double_fault. */ - BUG_ON((unsigned long)(current_top_of_stack() - - current_stack_pointer) >= THREAD_SIZE); + BUG_ON(!on_thread_stack()); preempt_enable_no_resched(); } @@ -181,7 +181,7 @@ int fixup_bug(struct pt_regs *regs, int trapnr) break; case BUG_TRAP_TYPE_WARN: - regs->ip += LEN_UD0; + regs->ip += LEN_UD2; return 1; } @@ -209,9 +209,6 @@ do_trap_no_signal(struct task_struct *tsk, int trapnr, char *str, if (fixup_exception(regs, trapnr)) return 0; - if (fixup_bug(regs, trapnr)) - return 0; - tsk->thread.error_code = error_code; tsk->thread.trap_nr = trapnr; die(str, regs, error_code); @@ -292,6 +289,13 @@ static void do_error_trap(struct pt_regs *regs, long error_code, char *str, RCU_LOCKDEP_WARN(!rcu_is_watching(), "entry code didn't wake RCU"); + /* + * WARN*()s end up here; fix them up before we call the + * notifier chain. + */ + if (!user_mode(regs) && fixup_bug(regs, trapnr)) + return; + if (notify_die(DIE_TRAP, str, regs, error_code, trapnr, signr) != NOTIFY_STOP) { cond_local_irq_enable(regs); @@ -345,23 +349,42 @@ dotraplinkage void do_double_fault(struct pt_regs *regs, long error_code) /* * If IRET takes a non-IST fault on the espfix64 stack, then we - * end up promoting it to a doublefault. In that case, modify - * the stack to make it look like we just entered the #GP - * handler from user space, similar to bad_iret. + * end up promoting it to a doublefault. In that case, take + * advantage of the fact that we're not using the normal (TSS.sp0) + * stack right now. We can write a fake #GP(0) frame at TSS.sp0 + * and then modify our own IRET frame so that, when we return, + * we land directly at the #GP(0) vector with the stack already + * set up according to its expectations. + * + * The net result is that our #GP handler will think that we + * entered from usermode with the bad user context. * * No need for ist_enter here because we don't use RCU. */ - if (((long)regs->sp >> PGDIR_SHIFT) == ESPFIX_PGD_ENTRY && + if (((long)regs->sp >> P4D_SHIFT) == ESPFIX_PGD_ENTRY && regs->cs == __KERNEL_CS && regs->ip == (unsigned long)native_irq_return_iret) { - struct pt_regs *normal_regs = task_pt_regs(current); + struct pt_regs *gpregs = (struct pt_regs *)this_cpu_read(cpu_tss_rw.x86_tss.sp0) - 1; - /* Fake a #GP(0) from userspace. */ - memmove(&normal_regs->ip, (void *)regs->sp, 5*8); - normal_regs->orig_ax = 0; /* Missing (lost) #GP error code */ + /* + * regs->sp points to the failing IRET frame on the + * ESPFIX64 stack. 
Copy it to the entry stack. This fills + * in gpregs->ss through gpregs->ip. + * + */ + memmove(&gpregs->ip, (void *)regs->sp, 5*8); + gpregs->orig_ax = 0; /* Missing (lost) #GP error code */ + + /* + * Adjust our frame so that we return straight to the #GP + * vector with the expected RSP value. This is safe because + * we won't enable interrupts or schedule before we invoke + * general_protection, so nothing will clobber the stack + * frame we just set up. + */ regs->ip = (unsigned long)general_protection; - regs->sp = (unsigned long)&normal_regs->orig_ax; + regs->sp = (unsigned long)&gpregs->orig_ax; return; } @@ -386,7 +409,7 @@ dotraplinkage void do_double_fault(struct pt_regs *regs, long error_code) * * Processors update CR2 whenever a page fault is detected. If a * second page fault occurs while an earlier page fault is being - * deliv- ered, the faulting linear address of the second fault will + * delivered, the faulting linear address of the second fault will * overwrite the contents of CR2 (replacing the previous * address). These updates to CR2 occur even if the page fault * results in a double fault or occurs during the delivery of a @@ -514,6 +537,11 @@ do_general_protection(struct pt_regs *regs, long error_code) RCU_LOCKDEP_WARN(!rcu_is_watching(), "entry code didn't wake RCU"); cond_local_irq_enable(regs); + if (static_cpu_has(X86_FEATURE_UMIP)) { + if (user_mode(regs) && fixup_umip_exception(regs)) + return; + } + if (v8086_mode(regs)) { local_irq_enable(); handle_vm86_fault((struct kernel_vm86_regs *) regs, error_code); @@ -597,14 +625,15 @@ NOKPROBE_SYMBOL(do_int3); #ifdef CONFIG_X86_64 /* - * Help handler running on IST stack to switch off the IST stack if the - * interrupted code was in user mode. The actual stack switch is done in - * entry_64.S + * Help handler running on a per-cpu (IST or entry trampoline) stack + * to switch to the normal thread stack if the interrupted code was in + * user mode. The actual stack switch is done in entry_64.S */ asmlinkage __visible notrace struct pt_regs *sync_regs(struct pt_regs *eregs) { - struct pt_regs *regs = task_pt_regs(current); - *regs = *eregs; + struct pt_regs *regs = (struct pt_regs *)this_cpu_read(cpu_current_top_of_stack) - 1; + if (regs != eregs) + *regs = *eregs; return regs; } NOKPROBE_SYMBOL(sync_regs); @@ -620,13 +649,13 @@ struct bad_iret_stack *fixup_bad_iret(struct bad_iret_stack *s) /* * This is called from entry_64.S early in handling a fault * caused by a bad iret to user mode. To handle the fault - * correctly, we want move our stack frame to task_pt_regs - * and we want to pretend that the exception came from the - * iret target. + * correctly, we want to move our stack frame to where it would + * be had we entered directly on the entry stack (rather than + * just below the IRET frame) and we want to pretend that the + * exception came from the IRET target. */ struct bad_iret_stack *new_stack = - container_of(task_pt_regs(current), - struct bad_iret_stack, regs); + (struct bad_iret_stack *)this_cpu_read(cpu_tss_rw.x86_tss.sp0) - 1; /* Copy the IRET target to the new stack. */ memmove(&new_stack->regs.ip, (void *)s->regs.sp, 5*8); @@ -740,10 +769,6 @@ dotraplinkage void do_debug(struct pt_regs *regs, long error_code) if (!dr6 && user_mode(regs)) user_icebp = 1; - /* Catch kmemcheck conditions!
*/ - if ((dr6 & DR_STEP) && kmemcheck_trap(regs)) - goto exit; - /* Store the virtualized DR6 value */ tsk->thread.debugreg6 = dr6; @@ -791,14 +816,6 @@ dotraplinkage void do_debug(struct pt_regs *regs, long error_code) debug_stack_usage_dec(); exit: -#if defined(CONFIG_X86_32) - /* - * This is the most likely code path that involves non-trivial use - * of the SYSENTER stack. Check that we haven't overrun it. - */ - WARN(this_cpu_read(cpu_tss.SYSENTER_stack_canary) != STACK_END_MAGIC, - "Overran or corrupted SYSENTER stack\n"); -#endif ist_exit(regs); } NOKPROBE_SYMBOL(do_debug); @@ -925,6 +942,9 @@ dotraplinkage void do_iret_error(struct pt_regs *regs, long error_code) void __init trap_init(void) { + /* Init cpu_entry_area before IST entries are set up */ + setup_cpu_entry_areas(); + idt_setup_traps(); /* @@ -932,8 +952,9 @@ void __init trap_init(void) * "sidt" instruction will not leak the location of the kernel, and * to defend the IDT against arbitrary memory write vulnerabilities. * It will be reloaded in cpu_init() */ - __set_fixmap(FIX_RO_IDT, __pa_symbol(idt_table), PAGE_KERNEL_RO); - idt_descr.address = fix_to_virt(FIX_RO_IDT); + cea_set_pte(CPU_ENTRY_AREA_RO_IDT_VADDR, __pa_symbol(idt_table), + PAGE_KERNEL_RO); + idt_descr.address = CPU_ENTRY_AREA_RO_IDT; /* * Should be a barrier for any external CPU state: diff --git a/arch/x86/kernel/tsc.c b/arch/x86/kernel/tsc.c index 796d96bb0821..fb4302738410 100644 --- a/arch/x86/kernel/tsc.c +++ b/arch/x86/kernel/tsc.c @@ -25,6 +25,7 @@ #include <asm/geode.h> #include <asm/apic.h> #include <asm/intel-family.h> +#include <asm/i8259.h> unsigned int __read_mostly cpu_khz; /* TSC clocks / usec, not used here */ EXPORT_SYMBOL(cpu_khz); @@ -112,7 +113,7 @@ static void cyc2ns_data_init(struct cyc2ns_data *data) data->cyc2ns_offset = 0; } -static void cyc2ns_init(int cpu) +static void __init cyc2ns_init(int cpu) { struct cyc2ns *c2n = &per_cpu(cyc2ns, cpu); @@ -363,6 +364,20 @@ static unsigned long pit_calibrate_tsc(u32 latch, unsigned long ms, int loopmin) unsigned long tscmin, tscmax; int pitcnt; + if (!has_legacy_pic()) { + /* + * Relies on tsc_early_delay_calibrate() to have given us semi + * usable udelay(), wait for the same 50ms we would have with + * the PIT loop below. + */ + udelay(10 * USEC_PER_MSEC); + udelay(10 * USEC_PER_MSEC); + udelay(10 * USEC_PER_MSEC); + udelay(10 * USEC_PER_MSEC); + udelay(10 * USEC_PER_MSEC); + return ULONG_MAX; + } + /* Set the Gate high, disable speaker */ outb((inb(0x61) & ~0x02) | 0x01, 0x61); @@ -487,6 +502,9 @@ static unsigned long quick_pit_calibrate(void) u64 tsc, delta; unsigned long d1, d2; + if (!has_legacy_pic()) + return 0; + /* Set the Gate high, disable speaker */ outb((inb(0x61) & ~0x02) | 0x01, 0x61); @@ -602,7 +620,6 @@ unsigned long native_calibrate_tsc(void) case INTEL_FAM6_KABYLAKE_DESKTOP: crystal_khz = 24000; /* 24.0 MHz */ break; - case INTEL_FAM6_SKYLAKE_X: case INTEL_FAM6_ATOM_DENVERTON: crystal_khz = 25000; /* 25.0 MHz */ break; @@ -612,6 +629,8 @@ unsigned long native_calibrate_tsc(void) } } + if (crystal_khz == 0) + return 0; /* * TSC frequency determined by CPUID is a "hardware reported" * frequency and is the most accurate one so far we have. 
This @@ -812,13 +831,13 @@ unsigned long native_calibrate_cpu(void) return tsc_pit_min; } -int recalibrate_cpu_khz(void) +void recalibrate_cpu_khz(void) { #ifndef CONFIG_SMP unsigned long cpu_khz_old = cpu_khz; if (!boot_cpu_has(X86_FEATURE_TSC)) - return -ENODEV; + return; cpu_khz = x86_platform.calibrate_cpu(); tsc_khz = x86_platform.calibrate_tsc(); @@ -828,10 +847,6 @@ int recalibrate_cpu_khz(void) cpu_khz = tsc_khz; cpu_data(0).loops_per_jiffy = cpufreq_scale(cpu_data(0).loops_per_jiffy, cpu_khz_old, cpu_khz); - - return 0; -#else - return -ENODEV; #endif } @@ -959,17 +974,21 @@ core_initcall(cpufreq_register_tsc_scaling); /* * If ART is present detect the numerator:denominator to convert to TSC */ -static void detect_art(void) +static void __init detect_art(void) { unsigned int unused[2]; if (boot_cpu_data.cpuid_level < ART_CPUID_LEAF) return; - /* Don't enable ART in a VM, non-stop TSC and TSC_ADJUST required */ + /* + * Don't enable ART in a VM, non-stop TSC and TSC_ADJUST required, + * and the TSC counter resets must not occur asynchronously. + */ if (boot_cpu_has(X86_FEATURE_HYPERVISOR) || !boot_cpu_has(X86_FEATURE_NONSTOP_TSC) || - !boot_cpu_has(X86_FEATURE_TSC_ADJUST)) + !boot_cpu_has(X86_FEATURE_TSC_ADJUST) || + tsc_async_resets) return; cpuid(ART_CPUID_LEAF, &art_to_tsc_denominator, @@ -987,8 +1006,6 @@ static void detect_art(void) /* clocksource code */ -static struct clocksource clocksource_tsc; - static void tsc_resume(struct clocksource *cs) { tsc_verify_tsc_adjust(true); @@ -1039,12 +1056,31 @@ static void tsc_cs_tick_stable(struct clocksource *cs) /* * .mask MUST be CLOCKSOURCE_MASK(64). See comment above read_tsc() */ +static struct clocksource clocksource_tsc_early = { + .name = "tsc-early", + .rating = 299, + .read = read_tsc, + .mask = CLOCKSOURCE_MASK(64), + .flags = CLOCK_SOURCE_IS_CONTINUOUS | + CLOCK_SOURCE_MUST_VERIFY, + .archdata = { .vclock_mode = VCLOCK_TSC }, + .resume = tsc_resume, + .mark_unstable = tsc_cs_mark_unstable, + .tick_stable = tsc_cs_tick_stable, +}; + +/* + * Must mark VALID_FOR_HRES early such that when we unregister tsc_early + * this one will immediately take over. We will only register if TSC has + * been found good. 
+ */ static struct clocksource clocksource_tsc = { .name = "tsc", .rating = 300, .read = read_tsc, .mask = CLOCKSOURCE_MASK(64), .flags = CLOCK_SOURCE_IS_CONTINUOUS | + CLOCK_SOURCE_VALID_FOR_HRES | CLOCK_SOURCE_MUST_VERIFY, .archdata = { .vclock_mode = VCLOCK_TSC }, .resume = tsc_resume, @@ -1168,8 +1204,8 @@ static void tsc_refine_calibration_work(struct work_struct *work) int cpu; /* Don't bother refining TSC on unstable systems */ - if (check_tsc_unstable()) - goto out; + if (tsc_unstable) + return; /* * Since the work is started early in boot, we may be @@ -1221,9 +1257,13 @@ static void tsc_refine_calibration_work(struct work_struct *work) set_cyc2ns_scale(tsc_khz, cpu, tsc_stop); out: + if (tsc_unstable) + return; + if (boot_cpu_has(X86_FEATURE_ART)) art_related_clocksource = &clocksource_tsc; clocksource_register_khz(&clocksource_tsc, tsc_khz); + clocksource_unregister(&clocksource_tsc_early); } @@ -1232,13 +1272,11 @@ static int __init init_tsc_clocksource(void) if (!boot_cpu_has(X86_FEATURE_TSC) || tsc_disabled > 0 || !tsc_khz) return 0; + if (check_tsc_unstable()) + return 0; + if (tsc_clocksource_reliable) clocksource_tsc.flags &= ~CLOCK_SOURCE_MUST_VERIFY; - /* lower the rating if we already know its unstable: */ - if (check_tsc_unstable()) { - clocksource_tsc.rating = 0; - clocksource_tsc.flags &= ~CLOCK_SOURCE_IS_CONTINUOUS; - } if (boot_cpu_has(X86_FEATURE_NONSTOP_TSC_S3)) clocksource_tsc.flags |= CLOCK_SOURCE_SUSPEND_NONSTOP; @@ -1251,6 +1289,7 @@ static int __init init_tsc_clocksource(void) if (boot_cpu_has(X86_FEATURE_ART)) art_related_clocksource = &clocksource_tsc; clocksource_register_khz(&clocksource_tsc, tsc_khz); + clocksource_unregister(&clocksource_tsc_early); return 0; } @@ -1263,6 +1302,25 @@ static int __init init_tsc_clocksource(void) */ device_initcall(init_tsc_clocksource); +void __init tsc_early_delay_calibrate(void) +{ + unsigned long lpj; + + if (!boot_cpu_has(X86_FEATURE_TSC)) + return; + + cpu_khz = x86_platform.calibrate_cpu(); + tsc_khz = x86_platform.calibrate_tsc(); + + tsc_khz = tsc_khz ? 
: cpu_khz; + if (!tsc_khz) + return; + + lpj = tsc_khz * 1000; + do_div(lpj, HZ); + loops_per_jiffy = lpj; +} + void __init tsc_init(void) { u64 lpj, cyc; @@ -1296,6 +1354,12 @@ void __init tsc_init(void) (unsigned long)cpu_khz / 1000, (unsigned long)cpu_khz % 1000); + if (cpu_khz != tsc_khz) { + pr_info("Detected %lu.%03lu MHz TSC", + (unsigned long)tsc_khz / 1000, + (unsigned long)tsc_khz % 1000); + } + /* Sanitize TSC ADJUST before cyc2ns gets initialized */ tsc_store_and_check_tsc_adjust(true); @@ -1330,9 +1394,12 @@ void __init tsc_init(void) check_system_tsc_reliable(); - if (unsynchronized_tsc()) + if (unsynchronized_tsc()) { mark_tsc_unstable("TSCs unsynchronized"); + return; + } + clocksource_register_khz(&clocksource_tsc_early, tsc_khz); detect_art(); } @@ -1346,12 +1413,10 @@ void __init tsc_init(void) unsigned long calibrate_delay_is_known(void) { int sibling, cpu = smp_processor_id(); - struct cpumask *mask = topology_core_cpumask(cpu); - - if (!tsc_disabled && !cpu_has(&cpu_data(cpu), X86_FEATURE_CONSTANT_TSC)) - return 0; + int constant_tsc = cpu_has(&cpu_data(cpu), X86_FEATURE_CONSTANT_TSC); + const struct cpumask *mask = topology_core_cpumask(cpu); - if (!mask) + if (tsc_disabled || !constant_tsc || !mask) return 0; sibling = cpumask_any_but(mask, cpu); diff --git a/arch/x86/kernel/tsc_sync.c b/arch/x86/kernel/tsc_sync.c index e76a9881306b..ec534f978867 100644 --- a/arch/x86/kernel/tsc_sync.c +++ b/arch/x86/kernel/tsc_sync.c @@ -31,6 +31,20 @@ struct tsc_adjust { static DEFINE_PER_CPU(struct tsc_adjust, tsc_adjust); +/* + * TSC's on different sockets may be reset asynchronously. + * This may cause the TSC ADJUST value on socket 0 to be NOT 0. + */ +bool __read_mostly tsc_async_resets; + +void mark_tsc_async_resets(char *reason) +{ + if (tsc_async_resets) + return; + tsc_async_resets = true; + pr_info("tsc: Marking TSC async resets true due to %s\n", reason); +} + void tsc_verify_tsc_adjust(bool resume) { struct tsc_adjust *adj = this_cpu_ptr(&tsc_adjust); @@ -39,6 +53,10 @@ void tsc_verify_tsc_adjust(bool resume) if (!boot_cpu_has(X86_FEATURE_TSC_ADJUST)) return; + /* Skip unnecessary error messages if TSC already unstable */ + if (check_tsc_unstable()) + return; + /* Rate limit the MSR check */ if (!resume && time_before(jiffies, adj->nextcheck)) return; @@ -72,12 +90,22 @@ static void tsc_sanitize_first_cpu(struct tsc_adjust *cur, s64 bootval, * non zero. We don't do that on non boot cpus because physical * hotplug should have set the ADJUST register to a value > 0 so * the TSC is in sync with the already running cpus. + * + * Also don't force the ADJUST value to zero if that is a valid value + * for socket 0 as determined by the system arch. This is required + * when multiple sockets are reset asynchronously with each other + * and socket 0 may not have an TSC ADJUST value of 0. 
*/ if (bootcpu && bootval != 0) { - pr_warn(FW_BUG "TSC ADJUST: CPU%u: %lld force to 0\n", cpu, - bootval); - wrmsrl(MSR_IA32_TSC_ADJUST, 0); - bootval = 0; + if (likely(!tsc_async_resets)) { + pr_warn(FW_BUG "TSC ADJUST: CPU%u: %lld force to 0\n", + cpu, bootval); + wrmsrl(MSR_IA32_TSC_ADJUST, 0); + bootval = 0; + } else { + pr_info("TSC ADJUST: CPU%u: %lld NOT forced to 0\n", + cpu, bootval); + } } cur->adjusted = bootval; } @@ -91,6 +119,10 @@ bool __init tsc_store_and_check_tsc_adjust(bool bootcpu) if (!boot_cpu_has(X86_FEATURE_TSC_ADJUST)) return false; + /* Skip unnecessary error messages if TSC already unstable */ + if (check_tsc_unstable()) + return false; + rdmsrl(MSR_IA32_TSC_ADJUST, bootval); cur->bootval = bootval; cur->nextcheck = jiffies + HZ; @@ -119,6 +151,13 @@ bool tsc_store_and_check_tsc_adjust(bool bootcpu) cur->warned = false; /* + * If a non-zero TSC value for socket 0 may be valid then the default + * adjusted value cannot assumed to be zero either. + */ + if (tsc_async_resets) + cur->adjusted = bootval; + + /* * Check whether this CPU is the first in a package to come up. In * this case do not check the boot value against another package * because the new package might have been physically hotplugged, @@ -139,10 +178,9 @@ bool tsc_store_and_check_tsc_adjust(bool bootcpu) * Compare the boot value and complain if it differs in the * package. */ - if (bootval != ref->bootval) { - pr_warn(FW_BUG "TSC ADJUST differs: Reference CPU%u: %lld CPU%u: %lld\n", - refcpu, ref->bootval, cpu, bootval); - } + if (bootval != ref->bootval) + printk_once(FW_BUG "TSC ADJUST differs within socket(s), fixing all errors\n"); + /* * The TSC_ADJUST values in a package must be the same. If the boot * value on this newly upcoming CPU differs from the adjustment @@ -150,8 +188,6 @@ bool tsc_store_and_check_tsc_adjust(bool bootcpu) * adjusted value. */ if (bootval != ref->adjusted) { - pr_warn("TSC ADJUST synchronize: Reference CPU%u: %lld CPU%u: %lld\n", - refcpu, ref->adjusted, cpu, bootval); cur->adjusted = ref->adjusted; wrmsrl(MSR_IA32_TSC_ADJUST, ref->adjusted); } diff --git a/arch/x86/kernel/umip.c b/arch/x86/kernel/umip.c new file mode 100644 index 000000000000..f44ce0fb3583 --- /dev/null +++ b/arch/x86/kernel/umip.c @@ -0,0 +1,428 @@ +/* + * umip.c Emulation for instruction protected by the Intel User-Mode + * Instruction Prevention feature + * + * Copyright (c) 2017, Intel Corporation. + * Ricardo Neri <ricardo.neri-calderon@linux.intel.com> + */ + +#include <linux/uaccess.h> +#include <asm/umip.h> +#include <asm/traps.h> +#include <asm/insn.h> +#include <asm/insn-eval.h> +#include <linux/ratelimit.h> + +#undef pr_fmt +#define pr_fmt(fmt) "umip: " fmt + +/** DOC: Emulation for User-Mode Instruction Prevention (UMIP) + * + * The feature User-Mode Instruction Prevention present in recent Intel + * processor prevents a group of instructions (sgdt, sidt, sldt, smsw, and str) + * from being executed with CPL > 0. Otherwise, a general protection fault is + * issued. + * + * Rather than relaying to the user space the general protection fault caused by + * the UMIP-protected instructions (in the form of a SIGSEGV signal), it can be + * trapped and emulate the result of such instructions to provide dummy values. 
+ * This allows both conserving the current kernel behavior and not revealing the + * system resources that UMIP intends to protect (i.e., the locations of the + * global descriptor and interrupt descriptor tables, the segment selectors of + * the local descriptor table, the value of the task state register and the + * contents of the CR0 register). + * + * This emulation is needed because certain applications (e.g., WineHQ and + * DOSEMU2) rely on this subset of instructions to function. + * + * The instructions protected by UMIP can be split in two groups: those which + * return a kernel memory address (sgdt and sidt) and those which return a + * value (sldt, str and smsw). + * + * For the instructions that return a kernel memory address, applications + * such as WineHQ rely on the result being located in the kernel memory space, + * not the actual location of the table. The result is emulated as a hard-coded + * value that lies close to the top of the kernel memory. The limits for the GDT + * and the IDT are set to zero. + * + * Given that sldt and str are not commonly used in programs that run on WineHQ + * or DOSEMU2, they are not emulated. + * + * The instruction smsw is emulated to return the value that the register CR0 + * has at boot time as set in the head_32. + * + * Also, emulation is provided only for 32-bit processes; 64-bit processes + * that attempt to use the instructions that UMIP protects will receive the + * SIGSEGV signal issued as a consequence of the general protection fault. + * + * Care is taken to appropriately emulate the results when segmentation is + * used. That is, rather than relying on USER_DS and USER_CS, the function + * insn_get_addr_ref() inspects the segment descriptor pointed by the + * registers in pt_regs. This ensures that we correctly obtain the segment + * base address and the address and operand sizes even if the user space + * application uses a local descriptor table. + */ + +#define UMIP_DUMMY_GDT_BASE 0xfffe0000 +#define UMIP_DUMMY_IDT_BASE 0xffff0000 + +/* + * The SGDT and SIDT instructions store the contents of the global descriptor + * table and interrupt table registers, respectively. The destination is a + * memory operand of X+2 bytes. X bytes are used to store the base address of + * the table and 2 bytes are used to store the limit. In 32-bit processes, the + * only processes for which emulation is provided, X has a value of 4. + */ +#define UMIP_GDT_IDT_BASE_SIZE 4 +#define UMIP_GDT_IDT_LIMIT_SIZE 2 + +#define UMIP_INST_SGDT 0 /* 0F 01 /0 */ +#define UMIP_INST_SIDT 1 /* 0F 01 /1 */ +#define UMIP_INST_SMSW 2 /* 0F 01 /4 */ +#define UMIP_INST_SLDT 3 /* 0F 00 /0 */ +#define UMIP_INST_STR 4 /* 0F 00 /1 */ + +const char * const umip_insns[5] = { + [UMIP_INST_SGDT] = "SGDT", + [UMIP_INST_SIDT] = "SIDT", + [UMIP_INST_SMSW] = "SMSW", + [UMIP_INST_SLDT] = "SLDT", + [UMIP_INST_STR] = "STR", +}; + +#define umip_pr_err(regs, fmt, ...) \ + umip_printk(regs, KERN_ERR, fmt, ##__VA_ARGS__) +#define umip_pr_warning(regs, fmt, ...) \ + umip_printk(regs, KERN_WARNING, fmt, ##__VA_ARGS__) + +/** + * umip_printk() - Print a rate-limited message + * @regs: Register set with the context in which the warning is printed + * @log_level: Kernel log level to print the message + * @fmt: The text string to print + * + * Print the text contained in @fmt. The print rate is limited to bursts of 5 + * messages every two minutes.
The purpose of this customized version of + * printk() is to print messages when user space processes use any of the + * UMIP-protected instructions. Thus, the printed text is prepended with the + * task name and process ID number of the current task as well as the + * instruction and stack pointers in @regs as seen when entering kernel mode. + * + * Returns: + * + * None. + */ +static __printf(3, 4) +void umip_printk(const struct pt_regs *regs, const char *log_level, + const char *fmt, ...) +{ + /* Bursts of 5 messages every two minutes */ + static DEFINE_RATELIMIT_STATE(ratelimit, 2 * 60 * HZ, 5); + struct task_struct *tsk = current; + struct va_format vaf; + va_list args; + + if (!__ratelimit(&ratelimit)) + return; + + va_start(args, fmt); + vaf.fmt = fmt; + vaf.va = &args; + printk("%s" pr_fmt("%s[%d] ip:%lx sp:%lx: %pV"), log_level, tsk->comm, + task_pid_nr(tsk), regs->ip, regs->sp, &vaf); + va_end(args); +} + +/** + * identify_insn() - Identify a UMIP-protected instruction + * @insn: Instruction structure with opcode and ModRM byte. + * + * From the opcode and ModRM.reg in @insn identify, if any, a UMIP-protected + * instruction that can be emulated. + * + * Returns: + * + * On success, a constant identifying a specific UMIP-protected instruction that + * can be emulated. + * + * -EINVAL on error or when not an UMIP-protected instruction that can be + * emulated. + */ +static int identify_insn(struct insn *insn) +{ + /* By getting modrm we also get the opcode. */ + insn_get_modrm(insn); + + if (!insn->modrm.nbytes) + return -EINVAL; + + /* All the instructions of interest start with 0x0f. */ + if (insn->opcode.bytes[0] != 0xf) + return -EINVAL; + + if (insn->opcode.bytes[1] == 0x1) { + switch (X86_MODRM_REG(insn->modrm.value)) { + case 0: + return UMIP_INST_SGDT; + case 1: + return UMIP_INST_SIDT; + case 4: + return UMIP_INST_SMSW; + default: + return -EINVAL; + } + } else if (insn->opcode.bytes[1] == 0x0) { + if (X86_MODRM_REG(insn->modrm.value) == 0) + return UMIP_INST_SLDT; + else if (X86_MODRM_REG(insn->modrm.value) == 1) + return UMIP_INST_STR; + else + return -EINVAL; + } else { + return -EINVAL; + } +} + +/** + * emulate_umip_insn() - Emulate UMIP instructions and return dummy values + * @insn: Instruction structure with operands + * @umip_inst: A constant indicating the instruction to emulate + * @data: Buffer into which the dummy result is stored + * @data_size: Size of the emulated result + * + * Emulate an instruction protected by UMIP and provide a dummy result. The + * result of the emulation is saved in @data. The size of the results depends + * on both the instruction and type of operand (register vs memory address). + * The size of the result is updated in @data_size. Caller is responsible + * of providing a @data buffer of at least UMIP_GDT_IDT_BASE_SIZE + + * UMIP_GDT_IDT_LIMIT_SIZE bytes. + * + * Returns: + * + * 0 on success, -EINVAL on error while emulating. + */ +static int emulate_umip_insn(struct insn *insn, int umip_inst, + unsigned char *data, int *data_size) +{ + unsigned long dummy_base_addr, dummy_value; + unsigned short dummy_limit = 0; + + if (!data || !data_size || !insn) + return -EINVAL; + /* + * These two instructions return the base address and limit of the + * global and interrupt descriptor table, respectively. According to the + * Intel Software Development manual, the base address can be 24-bit, + * 32-bit or 64-bit. Limit is always 16-bit. 
If the operand size is + * 16-bit, the returned value of the base address is supposed to be a + * zero-extended 24-bit number. However, it seems that a 32-bit number + * is always returned irrespective of the operand size. + */ + if (umip_inst == UMIP_INST_SGDT || umip_inst == UMIP_INST_SIDT) { + /* SGDT and SIDT do not use register operands. */ + if (X86_MODRM_MOD(insn->modrm.value) == 3) + return -EINVAL; + + if (umip_inst == UMIP_INST_SGDT) + dummy_base_addr = UMIP_DUMMY_GDT_BASE; + else + dummy_base_addr = UMIP_DUMMY_IDT_BASE; + + *data_size = UMIP_GDT_IDT_LIMIT_SIZE + UMIP_GDT_IDT_BASE_SIZE; + + memcpy(data + 2, &dummy_base_addr, UMIP_GDT_IDT_BASE_SIZE); + memcpy(data, &dummy_limit, UMIP_GDT_IDT_LIMIT_SIZE); + + } else if (umip_inst == UMIP_INST_SMSW) { + dummy_value = CR0_STATE; + + /* + * Even though the CR0 register has 4 bytes, the number + * of bytes to be copied in the result buffer is determined + * by whether the operand is a register or a memory location. + * If operand is a register, return as many bytes as the operand + * size. If operand is memory, return only the two least + * significant bytes of CR0. + */ + if (X86_MODRM_MOD(insn->modrm.value) == 3) + *data_size = insn->opnd_bytes; + else + *data_size = 2; + + memcpy(data, &dummy_value, *data_size); + /* STR and SLDT are not emulated */ + } else { + return -EINVAL; + } + + return 0; +} + +/** + * force_sig_info_umip_fault() - Force a SIGSEGV with SEGV_MAPERR + * @addr: Address that caused the signal + * @regs: Register set containing the instruction pointer + * + * Force a SIGSEGV signal with SEGV_MAPERR as the error code. This function is + * intended to be used to provide a segmentation fault when the result of the + * UMIP emulation could not be copied to the user space memory. + * + * Returns: none + */ +static void force_sig_info_umip_fault(void __user *addr, struct pt_regs *regs) +{ + siginfo_t info; + struct task_struct *tsk = current; + + tsk->thread.cr2 = (unsigned long)addr; + tsk->thread.error_code = X86_PF_USER | X86_PF_WRITE; + tsk->thread.trap_nr = X86_TRAP_PF; + + info.si_signo = SIGSEGV; + info.si_errno = 0; + info.si_code = SEGV_MAPERR; + info.si_addr = addr; + force_sig_info(SIGSEGV, &info, tsk); + + if (!(show_unhandled_signals && unhandled_signal(tsk, SIGSEGV))) + return; + + umip_pr_err(regs, "segfault in emulation. error%x\n", + X86_PF_USER | X86_PF_WRITE); +} + +/** + * fixup_umip_exception() - Fixup a general protection fault caused by UMIP + * @regs: Registers as saved when entering the #GP handler + * + * The instructions sgdt, sidt, str, smsw, sldt cause a general protection + * fault if executed with CPL > 0 (i.e., from user space). If the offending + * user-space process is not in long mode, this function fixes the exception + * up and provides dummy results for sgdt, sidt and smsw; str and sldt are not + * fixed up. Also long mode user-space processes are not fixed up. + * + * If operands are memory addresses, results are copied to user-space memory as + * indicated by the instruction pointed by eIP using the registers indicated in + * the instruction operands. If operands are registers, results are copied into + * the context that was saved when entering kernel mode. + * + * Returns: + * + * True if emulation was successful; false if not.
+ */ +bool fixup_umip_exception(struct pt_regs *regs) +{ + int not_copied, nr_copied, reg_offset, dummy_data_size, umip_inst; + unsigned long seg_base = 0, *reg_addr; + /* 10 bytes is the maximum size of the result of UMIP instructions */ + unsigned char dummy_data[10] = { 0 }; + unsigned char buf[MAX_INSN_SIZE]; + void __user *uaddr; + struct insn insn; + int seg_defs; + + if (!regs) + return false; + + /* + * If not in user-space long mode, a custom code segment could be in + * use. This is true in protected mode (if the process defined a local + * descriptor table), or virtual-8086 mode. In most of the cases + * seg_base will be zero as in USER_CS. + */ + if (!user_64bit_mode(regs)) + seg_base = insn_get_seg_base(regs, INAT_SEG_REG_CS); + + if (seg_base == -1L) + return false; + + not_copied = copy_from_user(buf, (void __user *)(seg_base + regs->ip), + sizeof(buf)); + nr_copied = sizeof(buf) - not_copied; + + /* + * The copy_from_user above could have failed if user code is protected + * by a memory protection key. Give up on emulation in such a case. + * Should we issue a page fault? + */ + if (!nr_copied) + return false; + + insn_init(&insn, buf, nr_copied, user_64bit_mode(regs)); + + /* + * Override the default operand and address sizes with what is specified + * in the code segment descriptor. The instruction decoder only sets + * the address size it to either 4 or 8 address bytes and does nothing + * for the operand bytes. This OK for most of the cases, but we could + * have special cases where, for instance, a 16-bit code segment + * descriptor is used. + * If there is an address override prefix, the instruction decoder + * correctly updates these values, even for 16-bit defaults. + */ + seg_defs = insn_get_code_seg_params(regs); + if (seg_defs == -EINVAL) + return false; + + insn.addr_bytes = INSN_CODE_SEG_ADDR_SZ(seg_defs); + insn.opnd_bytes = INSN_CODE_SEG_OPND_SZ(seg_defs); + + insn_get_length(&insn); + if (nr_copied < insn.length) + return false; + + umip_inst = identify_insn(&insn); + if (umip_inst < 0) + return false; + + umip_pr_warning(regs, "%s instruction cannot be used by applications.\n", + umip_insns[umip_inst]); + + /* Do not emulate SLDT, STR or user long mode processes. */ + if (umip_inst == UMIP_INST_STR || umip_inst == UMIP_INST_SLDT || user_64bit_mode(regs)) + return false; + + umip_pr_warning(regs, "For now, expensive software emulation returns the result.\n"); + + if (emulate_umip_insn(&insn, umip_inst, dummy_data, &dummy_data_size)) + return false; + + /* + * If operand is a register, write result to the copy of the register + * value that was pushed to the stack when entering into kernel mode. + * Upon exit, the value we write will be restored to the actual hardware + * register. + */ + if (X86_MODRM_MOD(insn.modrm.value) == 3) { + reg_offset = insn_get_modrm_rm_off(&insn, regs); + + /* + * Negative values are usually errors. In memory addressing, + * the exception is -EDOM. Since we expect a register operand, + * all negative values are errors. + */ + if (reg_offset < 0) + return false; + + reg_addr = (unsigned long *)((unsigned long)regs + reg_offset); + memcpy(reg_addr, dummy_data, dummy_data_size); + } else { + uaddr = insn_get_addr_ref(&insn, regs); + if ((unsigned long)uaddr == -1L) + return false; + + nr_copied = copy_to_user(uaddr, dummy_data, dummy_data_size); + if (nr_copied > 0) { + /* + * If copy fails, send a signal and tell caller that + * fault was fixed up. 
+ */ + force_sig_info_umip_fault(uaddr, regs); + return true; + } + } + + /* increase IP to let the program keep going */ + regs->ip += insn.length; + return true; +} diff --git a/arch/x86/kernel/unwind_orc.c b/arch/x86/kernel/unwind_orc.c index b95007e7c1b3..feb28fee6cea 100644 --- a/arch/x86/kernel/unwind_orc.c +++ b/arch/x86/kernel/unwind_orc.c @@ -5,7 +5,6 @@ #include <asm/unwind.h> #include <asm/orc_types.h> #include <asm/orc_lookup.h> -#include <asm/sections.h> #define orc_warn(fmt, ...) \ printk_deferred_once(KERN_WARNING pr_fmt("WARNING: " fmt), ##__VA_ARGS__) @@ -74,8 +73,50 @@ static struct orc_entry *orc_module_find(unsigned long ip) } #endif +#ifdef CONFIG_DYNAMIC_FTRACE +static struct orc_entry *orc_find(unsigned long ip); + +/* + * Ftrace dynamic trampolines do not have orc entries of their own. + * But they are copies of the ftrace entries that are static and + * defined in ftrace_*.S, which do have orc entries. + * + * If the unwinder comes across an ftrace trampoline, then find the + * ftrace function that was used to create it, and use that ftrace + * function's orc entry, as the placement of the return code in + * the stack will be identical. + */ +static struct orc_entry *orc_ftrace_find(unsigned long ip) +{ + struct ftrace_ops *ops; + unsigned long caller; + + ops = ftrace_ops_trampoline(ip); + if (!ops) + return NULL; + + if (ops->flags & FTRACE_OPS_FL_SAVE_REGS) + caller = (unsigned long)ftrace_regs_call; + else + caller = (unsigned long)ftrace_call; + + /* Prevent unlikely recursion */ + if (ip == caller) + return NULL; + + return orc_find(caller); +} +#else +static struct orc_entry *orc_ftrace_find(unsigned long ip) +{ + return NULL; +} +#endif + static struct orc_entry *orc_find(unsigned long ip) { + static struct orc_entry *orc; + if (!orc_init) return NULL; @@ -106,12 +147,16 @@ static struct orc_entry *orc_find(unsigned long ip) } /* vmlinux .init slow lookup: */ - if (ip >= (unsigned long)_sinittext && ip < (unsigned long)_einittext) + if (init_kernel_text(ip)) return __orc_find(__start_orc_unwind_ip, __start_orc_unwind, __stop_orc_unwind_ip - __start_orc_unwind_ip, ip); /* Module lookup: */ - return orc_module_find(ip); + orc = orc_module_find(ip); + if (orc) + return orc; + + return orc_ftrace_find(ip); } static void orc_sort_swap(void *_a, void *_b, int size) @@ -253,22 +298,15 @@ unsigned long *unwind_get_return_address_ptr(struct unwind_state *state) return NULL; } -static bool stack_access_ok(struct unwind_state *state, unsigned long addr, +static bool stack_access_ok(struct unwind_state *state, unsigned long _addr, size_t len) { struct stack_info *info = &state->stack_info; + void *addr = (void *)_addr; - /* - * If the address isn't on the current stack, switch to the next one. - * - * We may have to traverse multiple stacks to deal with the possibility - * that info->next_sp could point to an empty stack and the address - * could be on a subsequent stack.
- */ - while (!on_stack(info, (void *)addr, len)) - if (get_stack_info(info->next_sp, state->task, info, - &state->stack_mask)) - return false; + if (!on_stack(info, addr, len) && + (get_stack_info(addr, state->task, info, &state->stack_mask))) + return false; return true; } @@ -279,46 +317,36 @@ static bool deref_stack_reg(struct unwind_state *state, unsigned long addr, if (!stack_access_ok(state, addr, sizeof(long))) return false; - *val = READ_ONCE_TASK_STACK(state->task, *(unsigned long *)addr); + *val = READ_ONCE_NOCHECK(*(unsigned long *)addr); return true; } -#define REGS_SIZE (sizeof(struct pt_regs)) -#define SP_OFFSET (offsetof(struct pt_regs, sp)) -#define IRET_REGS_SIZE (REGS_SIZE - offsetof(struct pt_regs, ip)) -#define IRET_SP_OFFSET (SP_OFFSET - offsetof(struct pt_regs, ip)) - static bool deref_stack_regs(struct unwind_state *state, unsigned long addr, - unsigned long *ip, unsigned long *sp, bool full) + unsigned long *ip, unsigned long *sp) { - size_t regs_size = full ? REGS_SIZE : IRET_REGS_SIZE; - size_t sp_offset = full ? SP_OFFSET : IRET_SP_OFFSET; - struct pt_regs *regs = (struct pt_regs *)(addr + regs_size - REGS_SIZE); - - if (IS_ENABLED(CONFIG_X86_64)) { - if (!stack_access_ok(state, addr, regs_size)) - return false; + struct pt_regs *regs = (struct pt_regs *)addr; - *ip = regs->ip; - *sp = regs->sp; + /* x86-32 support will be more complicated due to the ®s->sp hack */ + BUILD_BUG_ON(IS_ENABLED(CONFIG_X86_32)); - return true; - } - - if (!stack_access_ok(state, addr, sp_offset)) + if (!stack_access_ok(state, addr, sizeof(struct pt_regs))) return false; *ip = regs->ip; + *sp = regs->sp; + return true; +} - if (user_mode(regs)) { - if (!stack_access_ok(state, addr + sp_offset, - REGS_SIZE - SP_OFFSET)) - return false; +static bool deref_stack_iret_regs(struct unwind_state *state, unsigned long addr, + unsigned long *ip, unsigned long *sp) +{ + struct pt_regs *regs = (void *)addr - IRET_FRAME_OFFSET; - *sp = regs->sp; - } else - *sp = (unsigned long)®s->sp; + if (!stack_access_ok(state, addr, IRET_FRAME_SIZE)) + return false; + *ip = regs->ip; + *sp = regs->sp; return true; } @@ -327,7 +355,6 @@ bool unwind_next_frame(struct unwind_state *state) unsigned long ip_p, sp, orig_ip, prev_sp = state->sp; enum stack_type prev_type = state->stack_info.type; struct orc_entry *orc; - struct pt_regs *ptregs; bool indirect = false; if (unwind_done(state)) @@ -435,7 +462,7 @@ bool unwind_next_frame(struct unwind_state *state) break; case ORC_TYPE_REGS: - if (!deref_stack_regs(state, sp, &state->ip, &state->sp, true)) { + if (!deref_stack_regs(state, sp, &state->ip, &state->sp)) { orc_warn("can't dereference registers at %p for ip %pB\n", (void *)sp, (void *)orig_ip); goto done; @@ -447,20 +474,14 @@ bool unwind_next_frame(struct unwind_state *state) break; case ORC_TYPE_REGS_IRET: - if (!deref_stack_regs(state, sp, &state->ip, &state->sp, false)) { + if (!deref_stack_iret_regs(state, sp, &state->ip, &state->sp)) { orc_warn("can't dereference iret registers at %p for ip %pB\n", (void *)sp, (void *)orig_ip); goto done; } - ptregs = container_of((void *)sp, struct pt_regs, ip); - if ((unsigned long)ptregs >= prev_sp && - on_stack(&state->stack_info, ptregs, REGS_SIZE)) { - state->regs = ptregs; - state->full_regs = false; - } else - state->regs = NULL; - + state->regs = (void *)sp - IRET_FRAME_OFFSET; + state->full_regs = false; state->signal = true; break; @@ -553,8 +574,18 @@ void __unwind_start(struct unwind_state *state, struct task_struct *task, } if (get_stack_info((unsigned 
long *)state->sp, state->task, - &state->stack_info, &state->stack_mask)) - return; + &state->stack_info, &state->stack_mask)) { + /* + * We weren't on a valid stack. It's possible that + * we overflowed a valid stack into a guard page. + * See if the next page up is valid so that we can + * generate some kind of backtrace if this happens. + */ + void *next_page = (void *)PAGE_ALIGN((unsigned long)state->sp); + if (get_stack_info(next_page, state->task, &state->stack_info, + &state->stack_mask)) + return; + } /* * The caller can provide the address of the first frame directly diff --git a/arch/x86/kernel/uprobes.c b/arch/x86/kernel/uprobes.c index 495c776de4b4..85c7ef23d99f 100644 --- a/arch/x86/kernel/uprobes.c +++ b/arch/x86/kernel/uprobes.c @@ -271,12 +271,15 @@ static bool is_prefix_bad(struct insn *insn) int i; for (i = 0; i < insn->prefixes.nbytes; i++) { - switch (insn->prefixes.bytes[i]) { - case 0x26: /* INAT_PFX_ES */ - case 0x2E: /* INAT_PFX_CS */ - case 0x36: /* INAT_PFX_DS */ - case 0x3E: /* INAT_PFX_SS */ - case 0xF0: /* INAT_PFX_LOCK */ + insn_attr_t attr; + + attr = inat_get_opcode_attribute(insn->prefixes.bytes[i]); + switch (attr) { + case INAT_MAKE_PREFIX(INAT_PFX_ES): + case INAT_MAKE_PREFIX(INAT_PFX_CS): + case INAT_MAKE_PREFIX(INAT_PFX_DS): + case INAT_MAKE_PREFIX(INAT_PFX_SS): + case INAT_MAKE_PREFIX(INAT_PFX_LOCK): return true; } } @@ -525,11 +528,11 @@ static int default_pre_xol_op(struct arch_uprobe *auprobe, struct pt_regs *regs) return 0; } -static int push_ret_address(struct pt_regs *regs, unsigned long ip) +static int emulate_push_stack(struct pt_regs *regs, unsigned long val) { unsigned long new_sp = regs->sp - sizeof_long(); - if (copy_to_user((void __user *)new_sp, &ip, sizeof_long())) + if (copy_to_user((void __user *)new_sp, &val, sizeof_long())) return -EFAULT; regs->sp = new_sp; @@ -563,7 +566,7 @@ static int default_post_xol_op(struct arch_uprobe *auprobe, struct pt_regs *regs regs->ip += correction; } else if (auprobe->defparam.fixups & UPROBE_FIX_CALL) { regs->sp += sizeof_long(); /* Pop incorrect return address */ - if (push_ret_address(regs, utask->vaddr + auprobe->defparam.ilen)) + if (emulate_push_stack(regs, utask->vaddr + auprobe->defparam.ilen)) return -ERESTART; } /* popf; tell the caller to not touch TF */ @@ -652,7 +655,7 @@ static bool branch_emulate_op(struct arch_uprobe *auprobe, struct pt_regs *regs) * * But there is corner case, see the comment in ->post_xol(). 
*/ - if (push_ret_address(regs, new_ip)) + if (emulate_push_stack(regs, new_ip)) return false; } else if (!check_jmp_cond(auprobe, regs)) { offs = 0; @@ -662,6 +665,16 @@ static bool branch_emulate_op(struct arch_uprobe *auprobe, struct pt_regs *regs) return true; } +static bool push_emulate_op(struct arch_uprobe *auprobe, struct pt_regs *regs) +{ + unsigned long *src_ptr = (void *)regs + auprobe->push.reg_offset; + + if (emulate_push_stack(regs, *src_ptr)) + return false; + regs->ip += auprobe->push.ilen; + return true; +} + static int branch_post_xol_op(struct arch_uprobe *auprobe, struct pt_regs *regs) { BUG_ON(!branch_is_call(auprobe)); @@ -700,6 +713,10 @@ static const struct uprobe_xol_ops branch_xol_ops = { .post_xol = branch_post_xol_op, }; +static const struct uprobe_xol_ops push_xol_ops = { + .emulate = push_emulate_op, +}; + /* Returns -ENOSYS if branch_xol_ops doesn't handle this insn */ static int branch_setup_xol_ops(struct arch_uprobe *auprobe, struct insn *insn) { @@ -747,6 +764,87 @@ static int branch_setup_xol_ops(struct arch_uprobe *auprobe, struct insn *insn) return 0; } +/* Returns -ENOSYS if push_xol_ops doesn't handle this insn */ +static int push_setup_xol_ops(struct arch_uprobe *auprobe, struct insn *insn) +{ + u8 opc1 = OPCODE1(insn), reg_offset = 0; + + if (opc1 < 0x50 || opc1 > 0x57) + return -ENOSYS; + + if (insn->length > 2) + return -ENOSYS; + if (insn->length == 2) { + /* only support rex_prefix 0x41 (x64 only) */ +#ifdef CONFIG_X86_64 + if (insn->rex_prefix.nbytes != 1 || + insn->rex_prefix.bytes[0] != 0x41) + return -ENOSYS; + + switch (opc1) { + case 0x50: + reg_offset = offsetof(struct pt_regs, r8); + break; + case 0x51: + reg_offset = offsetof(struct pt_regs, r9); + break; + case 0x52: + reg_offset = offsetof(struct pt_regs, r10); + break; + case 0x53: + reg_offset = offsetof(struct pt_regs, r11); + break; + case 0x54: + reg_offset = offsetof(struct pt_regs, r12); + break; + case 0x55: + reg_offset = offsetof(struct pt_regs, r13); + break; + case 0x56: + reg_offset = offsetof(struct pt_regs, r14); + break; + case 0x57: + reg_offset = offsetof(struct pt_regs, r15); + break; + } +#else + return -ENOSYS; +#endif + } else { + switch (opc1) { + case 0x50: + reg_offset = offsetof(struct pt_regs, ax); + break; + case 0x51: + reg_offset = offsetof(struct pt_regs, cx); + break; + case 0x52: + reg_offset = offsetof(struct pt_regs, dx); + break; + case 0x53: + reg_offset = offsetof(struct pt_regs, bx); + break; + case 0x54: + reg_offset = offsetof(struct pt_regs, sp); + break; + case 0x55: + reg_offset = offsetof(struct pt_regs, bp); + break; + case 0x56: + reg_offset = offsetof(struct pt_regs, si); + break; + case 0x57: + reg_offset = offsetof(struct pt_regs, di); + break; + } + } + + auprobe->push.reg_offset = reg_offset; + auprobe->push.ilen = insn->length; + auprobe->ops = &push_xol_ops; + return 0; +} + /** * arch_uprobe_analyze_insn - instruction analysis including validity and fixups. * @mm: the probed address space. @@ -768,6 +866,10 @@ int arch_uprobe_analyze_insn(struct arch_uprobe *auprobe, struct mm_struct *mm, if (ret != -ENOSYS) return ret; + ret = push_setup_xol_ops(auprobe, &insn); + if (ret != -ENOSYS) + return ret; + /* * Figure out which fixups default_post_xol_op() will need to perform, * and annotate defparam->fixups accordingly. 
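[Editor's note] The uprobes hunk above (push_setup_xol_ops and push_emulate_op) decodes the single-byte PUSH opcodes 0x50-0x57, with an optional REX.B prefix (0x41) selecting r8-r15 on x86-64, and emulates the push without single-stepping: it copies the register value to the new stack slot and advances the instruction pointer. The stand-alone user-space sketch below illustrates that decoding and emulation logic under simplified assumptions; struct fake_regs, stack_mem and emulate_push are invented for illustration and are not kernel interfaces.

#include <stdint.h>
#include <stdio.h>
#include <string.h>

/* Simplified register file standing in for the kernel's struct pt_regs. */
struct fake_regs {
	uint64_t gpr[16];	/* gpr[0]=rax ... gpr[8..15]=r8..r15 */
	uint64_t sp;
	uint64_t ip;
};

/* Emulated user stack used only by this sketch. */
static uint8_t stack_mem[256];

/*
 * Decode "PUSH reg" (0x50..0x57, optionally preceded by REX.B = 0x41) and
 * emulate it: copy the register value to the new stack slot and advance
 * the instruction pointer, mirroring the roles of push_setup_xol_ops(),
 * emulate_push_stack() and push_emulate_op() in the patch above.
 */
static int emulate_push(struct fake_regs *regs, const uint8_t *insn, size_t len)
{
	size_t i = 0;
	int reg;

	if (len >= 2 && insn[0] == 0x41)	/* REX.B selects r8..r15 */
		i = 1;
	if (i >= len || insn[i] < 0x50 || insn[i] > 0x57)
		return -1;			/* not a PUSH we handle */

	reg = (insn[i] - 0x50) + (i ? 8 : 0);

	regs->sp -= sizeof(uint64_t);		/* push decrements the stack */
	memcpy(&stack_mem[regs->sp], &regs->gpr[reg], sizeof(uint64_t));
	regs->ip += i + 1;			/* skip the emulated instruction */
	return 0;
}

int main(void)
{
	struct fake_regs regs = { .sp = sizeof(stack_mem), .ip = 0x1000 };
	const uint8_t push_r12[2] = { 0x41, 0x54 };	/* push %r12 */
	uint64_t top;

	regs.gpr[12] = 0xdeadbeef;
	if (emulate_push(&regs, push_r12, sizeof(push_r12)) == 0) {
		memcpy(&top, &stack_mem[regs.sp], sizeof(top));
		printf("sp=%zu ip=%#llx pushed=%#llx\n", (size_t)regs.sp,
		       (unsigned long long)regs.ip, (unsigned long long)top);
	}
	return 0;
}

The design point the patch makes is the same as in this sketch: because the effect of a register PUSH is fully described by the opcode, it can be emulated in place, which is cheaper than executing the instruction out of line.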
diff --git a/arch/x86/kernel/verify_cpu.S b/arch/x86/kernel/verify_cpu.S index 014ea59aa153..3d3c2f71f617 100644 --- a/arch/x86/kernel/verify_cpu.S +++ b/arch/x86/kernel/verify_cpu.S @@ -33,7 +33,7 @@ #include <asm/cpufeatures.h> #include <asm/msr-index.h> -verify_cpu: +ENTRY(verify_cpu) pushf # Save caller passed flags push $0 # Kill any dangerous flags popf @@ -139,3 +139,4 @@ verify_cpu: popf # Restore caller passed flags xorl %eax, %eax ret +ENDPROC(verify_cpu) diff --git a/arch/x86/kernel/vm86_32.c b/arch/x86/kernel/vm86_32.c index 68244742ecb0..5edb27f1a2c4 100644 --- a/arch/x86/kernel/vm86_32.c +++ b/arch/x86/kernel/vm86_32.c @@ -55,6 +55,7 @@ #include <asm/irq.h> #include <asm/traps.h> #include <asm/vm86.h> +#include <asm/switch_to.h> /* * Known problems: @@ -94,7 +95,6 @@ void save_v86_state(struct kernel_vm86_regs *regs, int retval) { - struct tss_struct *tss; struct task_struct *tsk = current; struct vm86plus_struct __user *user; struct vm86 *vm86 = current->thread.vm86; @@ -146,12 +146,13 @@ void save_v86_state(struct kernel_vm86_regs *regs, int retval) do_exit(SIGSEGV); } - tss = &per_cpu(cpu_tss, get_cpu()); + preempt_disable(); tsk->thread.sp0 = vm86->saved_sp0; tsk->thread.sysenter_cs = __KERNEL_CS; - load_sp0(tss, &tsk->thread); + update_sp0(tsk); + refresh_sysenter_cs(&tsk->thread); vm86->saved_sp0 = 0; - put_cpu(); + preempt_enable(); memcpy(®s->pt, &vm86->regs32, sizeof(struct pt_regs)); @@ -237,7 +238,6 @@ SYSCALL_DEFINE2(vm86, unsigned long, cmd, unsigned long, arg) static long do_sys_vm86(struct vm86plus_struct __user *user_vm86, bool plus) { - struct tss_struct *tss; struct task_struct *tsk = current; struct vm86 *vm86 = tsk->thread.vm86; struct kernel_vm86_regs vm86regs; @@ -365,15 +365,17 @@ static long do_sys_vm86(struct vm86plus_struct __user *user_vm86, bool plus) vm86->saved_sp0 = tsk->thread.sp0; lazy_save_gs(vm86->regs32.gs); - tss = &per_cpu(cpu_tss, get_cpu()); /* make room for real-mode segments */ + preempt_disable(); tsk->thread.sp0 += 16; - if (static_cpu_has(X86_FEATURE_SEP)) + if (static_cpu_has(X86_FEATURE_SEP)) { tsk->thread.sysenter_cs = 0; + refresh_sysenter_cs(&tsk->thread); + } - load_sp0(tss, &tsk->thread); - put_cpu(); + update_sp0(tsk); + preempt_enable(); if (vm86->flags & VM86_SCREEN_BITMAP) mark_screen_rdonly(tsk->mm); diff --git a/arch/x86/kernel/vmlinux.lds.S b/arch/x86/kernel/vmlinux.lds.S index a4009fb9be87..9b138a06c1a4 100644 --- a/arch/x86/kernel/vmlinux.lds.S +++ b/arch/x86/kernel/vmlinux.lds.S @@ -61,11 +61,17 @@ jiffies_64 = jiffies; . = ALIGN(HPAGE_SIZE); \ __end_rodata_hpage_align = .; +#define ALIGN_ENTRY_TEXT_BEGIN . = ALIGN(PMD_SIZE); +#define ALIGN_ENTRY_TEXT_END . = ALIGN(PMD_SIZE); + #else #define X64_ALIGN_RODATA_BEGIN #define X64_ALIGN_RODATA_END +#define ALIGN_ENTRY_TEXT_BEGIN +#define ALIGN_ENTRY_TEXT_END + #endif PHDRS { @@ -102,11 +108,28 @@ SECTIONS CPUIDLE_TEXT LOCK_TEXT KPROBES_TEXT + ALIGN_ENTRY_TEXT_BEGIN ENTRY_TEXT IRQENTRY_TEXT + ALIGN_ENTRY_TEXT_END SOFTIRQENTRY_TEXT *(.fixup) *(.gnu.warning) + +#ifdef CONFIG_X86_64 + . = ALIGN(PAGE_SIZE); + _entry_trampoline = .; + *(.entry_trampoline) + . = ALIGN(PAGE_SIZE); + ASSERT(. 
- _entry_trampoline == PAGE_SIZE, "entry trampoline is too big"); +#endif + +#ifdef CONFIG_RETPOLINE + __indirect_thunk_start = .; + *(.text.__x86.indirect_thunk) + __indirect_thunk_end = .; +#endif + /* End of text section */ _etext = .; } :text = 0x9090 diff --git a/arch/x86/kernel/vsmp_64.c b/arch/x86/kernel/vsmp_64.c index b034b1b14b9c..44685fb2a192 100644 --- a/arch/x86/kernel/vsmp_64.c +++ b/arch/x86/kernel/vsmp_64.c @@ -26,9 +26,6 @@ #define TOPOLOGY_REGISTER_OFFSET 0x10 -/* Flag below is initialized once during vSMP PCI initialization. */ -static int irq_routing_comply = 1; - #if defined CONFIG_PCI && defined CONFIG_PARAVIRT /* * Interrupt control on vSMPowered systems: @@ -105,9 +102,6 @@ static void __init set_vsmp_pv_ops(void) if (cap & ctl & BIT(8)) { ctl &= ~BIT(8); - /* Interrupt routing set to ignore */ - irq_routing_comply = 0; - #ifdef CONFIG_PROC_FS /* Don't let users change irq affinity via procfs */ no_irq_affinity = 1; @@ -211,23 +205,10 @@ static int apicid_phys_pkg_id(int initial_apic_id, int index_msb) return hard_smp_processor_id() >> index_msb; } -/* - * In vSMP, all cpus should be capable of handling interrupts, regardless of - * the APIC used. - */ -static void fill_vector_allocation_domain(int cpu, struct cpumask *retmask, - const struct cpumask *mask) -{ - cpumask_setall(retmask); -} - static void vsmp_apic_post_init(void) { /* need to update phys_pkg_id */ apic->phys_pkg_id = apicid_phys_pkg_id; - - if (!irq_routing_comply) - apic->vector_allocation_domain = fill_vector_allocation_domain; } void __init vsmp_init(void) diff --git a/arch/x86/kernel/x86_init.c b/arch/x86/kernel/x86_init.c index a088b2c47f73..1151ccd72ce9 100644 --- a/arch/x86/kernel/x86_init.c +++ b/arch/x86/kernel/x86_init.c @@ -28,6 +28,8 @@ void x86_init_noop(void) { } void __init x86_init_uint_noop(unsigned int unused) { } int __init iommu_init_noop(void) { return 0; } void iommu_shutdown_noop(void) { } +bool __init bool_x86_init_noop(void) { return false; } +void x86_op_int_noop(int cpu) { } /* * The platform setup functions are preset with the default functions @@ -55,6 +57,7 @@ struct x86_init_ops x86_init __initdata = { .pre_vector_init = init_ISA_irqs, .intr_init = native_init_IRQ, .trap_init = x86_init_noop, + .intr_mode_init = apic_intr_mode_init }, .oem = { @@ -81,6 +84,13 @@ struct x86_init_ops x86_init __initdata = { .init_irq = x86_default_pci_init_irq, .fixup_irqs = x86_default_pci_fixup_irqs, }, + + .hyper = { + .init_platform = x86_init_noop, + .guest_late_init = x86_init_noop, + .x2apic_available = bool_x86_init_noop, + .init_mem_mapping = x86_init_noop, + }, }; struct x86_cpuinit_ops x86_cpuinit = { @@ -101,6 +111,7 @@ struct x86_platform_ops x86_platform __ro_after_init = { .get_nmi_reason = default_get_nmi_reason, .save_sched_clock_state = tsc_save_sched_clock_state, .restore_sched_clock_state = tsc_restore_sched_clock_state, + .hyper.pin_vcpu = x86_op_int_noop, }; EXPORT_SYMBOL_GPL(x86_platform); |
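[Editor's note] The x86_init.c hunk above extends an existing pattern: every hook in the ops tables is pre-populated with a no-op default (x86_init_noop, bool_x86_init_noop, x86_op_int_noop), so callers never need NULL checks and only platforms that care, such as the new .hyper guest hooks, override anything. The following stand-alone C sketch shows that default-noop ops pattern under simplified assumptions; struct hyper_ops, the hook names and the example override are invented for illustration and are not the kernel's actual definitions.

#include <stdbool.h>
#include <stdio.h>

/* Illustrative hypervisor hooks, modeled loosely on the .hyper sub-struct. */
struct hyper_ops {
	void (*init_platform)(void);
	void (*guest_late_init)(void);
	bool (*x2apic_available)(void);
};

/* Default no-op implementations: always safe to call. */
static void init_noop(void) { }
static bool bool_noop(void) { return false; }

/* Every hook starts out pointing at a no-op, never at NULL. */
static struct hyper_ops hyper = {
	.init_platform    = init_noop,
	.guest_late_init  = init_noop,
	.x2apic_available = bool_noop,
};

/* A hypothetical guest platform overrides only the hooks it needs. */
static bool my_guest_x2apic_available(void) { return true; }

static void my_guest_detect(void)
{
	hyper.x2apic_available = my_guest_x2apic_available;
}

int main(void)
{
	my_guest_detect();

	/* Callers invoke hooks unconditionally; the defaults make NULL checks unnecessary. */
	hyper.init_platform();
	hyper.guest_late_init();
	printf("x2apic available: %d\n", hyper.x2apic_available());
	return 0;
}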