diff options
Diffstat (limited to 'arch/x86/include')
308 files changed, 12579 insertions, 7983 deletions
diff --git a/arch/x86/include/asm/Kbuild b/arch/x86/include/asm/Kbuild index 1e51650b79d7..4566000e15c4 100644 --- a/arch/x86/include/asm/Kbuild +++ b/arch/x86/include/asm/Kbuild @@ -1,13 +1,16 @@ # SPDX-License-Identifier: GPL-2.0 +generated-y += orc_hash.h generated-y += syscalls_32.h generated-y += syscalls_64.h generated-y += syscalls_x32.h generated-y += unistd_32_ia32.h generated-y += unistd_64_x32.h generated-y += xen-hypercalls.h +generated-y += cpufeaturemasks.h generic-y += early_ioremap.h -generic-y += export.h +generic-y += fprobe.h generic-y += mcs_spinlock.h +generic-y += mmzone.h diff --git a/arch/x86/include/asm/acpi.h b/arch/x86/include/asm/acpi.h index 65064d9f7fa6..5ab1a4598d00 100644 --- a/arch/x86/include/asm/acpi.h +++ b/arch/x86/include/asm/acpi.h @@ -6,7 +6,7 @@ * Copyright (C) 2001 Paul Diefenbaugh <paul.s.diefenbaugh@intel.com> * Copyright (C) 2001 Patrick Mochel <mochel@osdl.org> */ -#include <acpi/pdc_intel.h> +#include <acpi/proc_cap_intel.h> #include <asm/numa.h> #include <asm/fixmap.h> @@ -14,6 +14,11 @@ #include <asm/mmu.h> #include <asm/mpspec.h> #include <asm/x86_init.h> +#include <asm/cpufeature.h> +#include <asm/irq_vectors.h> +#include <asm/xen/hypervisor.h> + +#include <xen/xen.h> #ifdef CONFIG_ACPI_APEI # include <asm/pgtable_types.h> @@ -30,6 +35,7 @@ extern int acpi_skip_timer_override; extern int acpi_use_timer_override; extern int acpi_fix_pin2_polarity; extern int acpi_disable_cmcff; +extern bool acpi_int_src_ovr[NR_IRQS_LEGACY]; extern u8 acpi_sci_flags; extern u32 acpi_sci_override_gsi; @@ -50,6 +56,8 @@ static inline void disable_acpi(void) extern int acpi_gsi_to_irq(u32 gsi, unsigned int *irq); +extern int acpi_blacklisted(void); + static inline void acpi_noirq_set(void) { acpi_noirq = 1; } static inline void acpi_disable_pci(void) { @@ -63,6 +71,20 @@ extern int (*acpi_suspend_lowlevel)(void); /* Physical address to resume after wakeup */ unsigned long acpi_get_wakeup_address(void); +static inline bool 
acpi_skip_set_wakeup_address(void) +{ + return cpu_feature_enabled(X86_FEATURE_XENPV); +} + +#define acpi_skip_set_wakeup_address acpi_skip_set_wakeup_address + +union acpi_subtable_headers; + +int __init acpi_parse_mp_wake(union acpi_subtable_headers *header, + const unsigned long end); + +void asm_acpi_mp_play_dead(u64 reset_vector, u64 pgd_pa); + /* * Check if the CPU can handle C2 and deeper */ @@ -92,23 +114,42 @@ static inline bool arch_has_acpi_pdc(void) c->x86_vendor == X86_VENDOR_CENTAUR); } -static inline void arch_acpi_set_pdc_bits(u32 *buf) +static inline void arch_acpi_set_proc_cap_bits(u32 *cap) { struct cpuinfo_x86 *c = &cpu_data(0); - buf[2] |= ACPI_PDC_C_CAPABILITY_SMP; + *cap |= ACPI_PROC_CAP_C_CAPABILITY_SMP; + + /* Enable coordination with firmware's _TSD info */ + *cap |= ACPI_PROC_CAP_SMP_T_SWCOORD; if (cpu_has(c, X86_FEATURE_EST)) - buf[2] |= ACPI_PDC_EST_CAPABILITY_SWSMP; + *cap |= ACPI_PROC_CAP_EST_CAPABILITY_SWSMP; if (cpu_has(c, X86_FEATURE_ACPI)) - buf[2] |= ACPI_PDC_T_FFH; + *cap |= ACPI_PROC_CAP_T_FFH; + + if (cpu_has(c, X86_FEATURE_HWP)) + *cap |= ACPI_PROC_CAP_COLLAB_PROC_PERF; /* - * If mwait/monitor is unsupported, C2/C3_FFH will be disabled + * If mwait/monitor is unsupported, C_C1_FFH and + * C2/C3_FFH will be disabled. */ - if (!cpu_has(c, X86_FEATURE_MWAIT)) - buf[2] &= ~(ACPI_PDC_C_C2C3_FFH); + if (!cpu_has(c, X86_FEATURE_MWAIT) || + boot_option_idle_override == IDLE_NOMWAIT) + *cap &= ~(ACPI_PROC_CAP_C_C1_FFH | ACPI_PROC_CAP_C_C2C3_FFH); + + if (xen_initial_domain()) { + /* + * When Linux is running as Xen dom0, the hypervisor is the + * entity in charge of the processor power management, and so + * Xen needs to check the OS capabilities reported in the + * processor capabilities buffer matches what the hypervisor + * driver supports. 
+ */ + xen_sanitize_proc_cap_bits(cap); + } } static inline bool acpi_has_cpu_in_madt(void) @@ -133,6 +174,14 @@ void acpi_generic_reduced_hw_init(void); void x86_default_set_root_pointer(u64 addr); u64 x86_default_get_root_pointer(void); +#ifdef CONFIG_XEN_PV +/* A Xen PV domain needs a special acpi_os_ioremap() handling. */ +extern void __iomem * (*acpi_os_ioremap)(acpi_physical_address phys, + acpi_size size); +void __iomem *x86_acpi_os_ioremap(acpi_physical_address phys, acpi_size size); +#define acpi_os_ioremap acpi_os_ioremap +#endif + #else /* !CONFIG_ACPI */ #define acpi_lapic 0 diff --git a/arch/x86/include/asm/acrn.h b/arch/x86/include/asm/acrn.h index e003a01b7c67..fab11192c60a 100644 --- a/arch/x86/include/asm/acrn.h +++ b/arch/x86/include/asm/acrn.h @@ -10,17 +10,31 @@ /* Bit 0 indicates whether guest VM is privileged */ #define ACRN_FEATURE_PRIVILEGED_VM BIT(0) +/* + * Timing Information. + * This leaf returns the current TSC frequency in kHz. + * + * EAX: (Virtual) TSC frequency in kHz. + * EBX, ECX, EDX: RESERVED (reserved fields are set to zero). + */ +#define ACRN_CPUID_TIMING_INFO 0x40000010 + void acrn_setup_intr_handler(void (*handler)(void)); void acrn_remove_intr_handler(void); static inline u32 acrn_cpuid_base(void) { if (boot_cpu_has(X86_FEATURE_HYPERVISOR)) - return hypervisor_cpuid_base("ACRNACRNACRN", 0); + return cpuid_base_hypervisor("ACRNACRNACRN", 0); return 0; } +static inline unsigned long acrn_get_tsc_khz(void) +{ + return cpuid_eax(ACRN_CPUID_TIMING_INFO); +} + /* * Hypercalls for ACRN * diff --git a/arch/x86/include/asm/agp.h b/arch/x86/include/asm/agp.h index cd7b14322035..c8c111d8fbd7 100644 --- a/arch/x86/include/asm/agp.h +++ b/arch/x86/include/asm/agp.h @@ -23,10 +23,4 @@ */ #define flush_agp_cache() wbinvd() -/* GATT allocation. Returns/accepts GATT kernel virtual address. 
*/ -#define alloc_gatt_pages(order) \ - ((char *)__get_free_pages(GFP_KERNEL, (order))) -#define free_gatt_pages(table, order) \ - free_pages((unsigned long)(table), (order)) - #endif /* _ASM_X86_AGP_H */ diff --git a/arch/x86/include/asm/alternative.h b/arch/x86/include/asm/alternative.h index 9542c582d546..15bc07a5ebb3 100644 --- a/arch/x86/include/asm/alternative.h +++ b/arch/x86/include/asm/alternative.h @@ -4,12 +4,19 @@ #include <linux/types.h> #include <linux/stringify.h> +#include <linux/objtool.h> #include <asm/asm.h> +#include <asm/bug.h> -#define ALTINSTR_FLAG_INV (1 << 15) -#define ALT_NOT(feat) ((feat) | ALTINSTR_FLAG_INV) +#define ALT_FLAGS_SHIFT 16 -#ifndef __ASSEMBLY__ +#define ALT_FLAG_NOT (1 << 0) +#define ALT_NOT(feature) ((ALT_FLAG_NOT << ALT_FLAGS_SHIFT) | (feature)) +#define ALT_FLAG_DIRECT_CALL (1 << 1) +#define ALT_DIRECT_CALL(feature) ((ALT_FLAG_DIRECT_CALL << ALT_FLAGS_SHIFT) | (feature)) +#define ALT_CALL_ALWAYS ALT_DIRECT_CALL(X86_FEATURE_ALWAYS) + +#ifndef __ASSEMBLER__ #include <linux/stddef.h> @@ -42,7 +49,7 @@ ".popsection\n" \ "671:" -#define LOCK_PREFIX LOCK_PREFIX_HERE "\n\tlock; " +#define LOCK_PREFIX LOCK_PREFIX_HERE "\n\tlock " #else /* ! CONFIG_SMP */ #define LOCK_PREFIX_HERE "" @@ -50,23 +57,38 @@ #endif /* - * objtool annotation to ignore the alternatives and only consider the original - * instruction(s). + * The patching flags are part of the upper bits of the @ft_flags parameter when + * specifying them. The split is currently like this: + * + * [31... flags ...16][15... CPUID feature bit ...0] + * + * but since this is all hidden in the macros argument being split, those fields can be + * extended in the future to fit in a u64 or however the need arises. 
*/ -#define ANNOTATE_IGNORE_ALTERNATIVE \ - "999:\n\t" \ - ".pushsection .discard.ignore_alts\n\t" \ - ".long 999b - .\n\t" \ - ".popsection\n\t" - struct alt_instr { s32 instr_offset; /* original instruction */ s32 repl_offset; /* offset to replacement instruction */ - u16 cpuid; /* cpuid bit set for replacement */ + + union { + struct { + u32 cpuid: 16; /* CPUID bit set for replacement */ + u32 flags: 16; /* patching control flags */ + }; + u32 ft_flags; + }; + u8 instrlen; /* length of original instruction */ u8 replacementlen; /* length of new instruction */ } __packed; +extern struct alt_instr __alt_instructions[], __alt_instructions_end[]; + +extern s32 __retpoline_sites[], __retpoline_sites_end[]; +extern s32 __return_sites[], __return_sites_end[]; +extern s32 __cfi_sites[], __cfi_sites_end[]; +extern s32 __ibt_endbr_seal[], __ibt_endbr_seal_end[]; +extern s32 __smp_locks[], __smp_locks_end[]; + /* * Debug flag that can be tested to see whether alternative * instructions were patched in already: @@ -77,10 +99,69 @@ extern void alternative_instructions(void); extern void apply_alternatives(struct alt_instr *start, struct alt_instr *end); extern void apply_retpolines(s32 *start, s32 *end); extern void apply_returns(s32 *start, s32 *end); -extern void apply_ibt_endbr(s32 *start, s32 *end); +extern void apply_seal_endbr(s32 *start, s32 *end); +extern void apply_fineibt(s32 *start_retpoline, s32 *end_retpoine, + s32 *start_cfi, s32 *end_cfi); struct module; +struct callthunk_sites { + s32 *call_start, *call_end; +}; + +#ifdef CONFIG_CALL_THUNKS +extern void callthunks_patch_builtin_calls(void); +extern void callthunks_patch_module_calls(struct callthunk_sites *sites, + struct module *mod); +extern void *callthunks_translate_call_dest(void *dest); +extern int x86_call_depth_emit_accounting(u8 **pprog, void *func, void *ip); +#else +static __always_inline void callthunks_patch_builtin_calls(void) {} +static __always_inline void +callthunks_patch_module_calls(struct 
callthunk_sites *sites, + struct module *mod) {} +static __always_inline void *callthunks_translate_call_dest(void *dest) +{ + return dest; +} +static __always_inline int x86_call_depth_emit_accounting(u8 **pprog, + void *func, void *ip) +{ + return 0; +} +#endif + +#ifdef CONFIG_MITIGATION_ITS +extern void its_init_mod(struct module *mod); +extern void its_fini_mod(struct module *mod); +extern void its_free_mod(struct module *mod); +extern u8 *its_static_thunk(int reg); +#else /* CONFIG_MITIGATION_ITS */ +static inline void its_init_mod(struct module *mod) { } +static inline void its_fini_mod(struct module *mod) { } +static inline void its_free_mod(struct module *mod) { } +static inline u8 *its_static_thunk(int reg) +{ + WARN_ONCE(1, "ITS not compiled in"); + + return NULL; +} +#endif + +#if defined(CONFIG_MITIGATION_RETHUNK) && defined(CONFIG_OBJTOOL) +extern bool cpu_wants_rethunk(void); +extern bool cpu_wants_rethunk_at(void *addr); +#else +static __always_inline bool cpu_wants_rethunk(void) +{ + return false; +} +static __always_inline bool cpu_wants_rethunk_at(void *addr) +{ + return false; +} +#endif + #ifdef CONFIG_SMP extern void alternatives_smp_module_add(struct module *mod, char *name, void *locks, void *locks_end, @@ -101,101 +182,52 @@ static inline int alternatives_text_reserved(void *start, void *end) } #endif /* CONFIG_SMP */ -#define b_replacement(num) "664"#num -#define e_replacement(num) "665"#num +#define ALT_CALL_INSTR "call BUG_func" -#define alt_end_marker "663" -#define alt_slen "662b-661b" -#define alt_total_slen alt_end_marker"b-661b" -#define alt_rlen(num) e_replacement(num)"f-"b_replacement(num)"f" +#define alt_slen "772b-771b" +#define alt_total_slen "773b-771b" +#define alt_rlen "775f-774f" -#define OLDINSTR(oldinstr, num) \ - "# ALT: oldnstr\n" \ - "661:\n\t" oldinstr "\n662:\n" \ +#define OLDINSTR(oldinstr) \ + "# ALT: oldinstr\n" \ + "771:\n\t" oldinstr "\n772:\n" \ "# ALT: padding\n" \ - ".skip -(((" alt_rlen(num) ")-(" alt_slen 
")) > 0) * " \ - "((" alt_rlen(num) ")-(" alt_slen ")),0x90\n" \ - alt_end_marker ":\n" - -/* - * gas compatible max based on the idea from: - * http://graphics.stanford.edu/~seander/bithacks.html#IntegerMinOrMax - * - * The additional "-" is needed because gas uses a "true" value of -1. - */ -#define alt_max_short(a, b) "((" a ") ^ (((" a ") ^ (" b ")) & -(-((" a ") < (" b ")))))" - -/* - * Pad the second replacement alternative with additional NOPs if it is - * additionally longer than the first replacement alternative. - */ -#define OLDINSTR_2(oldinstr, num1, num2) \ - "# ALT: oldinstr2\n" \ - "661:\n\t" oldinstr "\n662:\n" \ - "# ALT: padding2\n" \ - ".skip -((" alt_max_short(alt_rlen(num1), alt_rlen(num2)) " - (" alt_slen ")) > 0) * " \ - "(" alt_max_short(alt_rlen(num1), alt_rlen(num2)) " - (" alt_slen ")), 0x90\n" \ - alt_end_marker ":\n" - -#define OLDINSTR_3(oldinsn, n1, n2, n3) \ - "# ALT: oldinstr3\n" \ - "661:\n\t" oldinsn "\n662:\n" \ - "# ALT: padding3\n" \ - ".skip -((" alt_max_short(alt_max_short(alt_rlen(n1), alt_rlen(n2)), alt_rlen(n3)) \ - " - (" alt_slen ")) > 0) * " \ - "(" alt_max_short(alt_max_short(alt_rlen(n1), alt_rlen(n2)), alt_rlen(n3)) \ - " - (" alt_slen ")), 0x90\n" \ - alt_end_marker ":\n" - -#define ALTINSTR_ENTRY(feature, num) \ - " .long 661b - .\n" /* label */ \ - " .long " b_replacement(num)"f - .\n" /* new instruction */ \ - " .word " __stringify(feature) "\n" /* feature bit */ \ + ".skip -(((" alt_rlen ")-(" alt_slen ")) > 0) * " \ + "((" alt_rlen ")-(" alt_slen ")),0x90\n" \ + "773:\n" + +#define ALTINSTR_ENTRY(ft_flags) \ + ".pushsection .altinstructions,\"a\"\n" \ + " .long 771b - .\n" /* label */ \ + " .long 774f - .\n" /* new instruction */ \ + " .4byte " __stringify(ft_flags) "\n" /* feature + flags */ \ " .byte " alt_total_slen "\n" /* source len */ \ - " .byte " alt_rlen(num) "\n" /* replacement len */ - -#define ALTINSTR_REPLACEMENT(newinstr, num) /* replacement */ \ - "# ALT: replacement " #num "\n" \ - 
b_replacement(num)":\n\t" newinstr "\n" e_replacement(num) ":\n" - -/* alternative assembly primitive: */ -#define ALTERNATIVE(oldinstr, newinstr, feature) \ - OLDINSTR(oldinstr, 1) \ - ".pushsection .altinstructions,\"a\"\n" \ - ALTINSTR_ENTRY(feature, 1) \ - ".popsection\n" \ - ".pushsection .altinstr_replacement, \"ax\"\n" \ - ALTINSTR_REPLACEMENT(newinstr, 1) \ + " .byte " alt_rlen "\n" /* replacement len */ \ ".popsection\n" -#define ALTERNATIVE_2(oldinstr, newinstr1, feature1, newinstr2, feature2)\ - OLDINSTR_2(oldinstr, 1, 2) \ - ".pushsection .altinstructions,\"a\"\n" \ - ALTINSTR_ENTRY(feature1, 1) \ - ALTINSTR_ENTRY(feature2, 2) \ - ".popsection\n" \ +#define ALTINSTR_REPLACEMENT(newinstr) /* replacement */ \ ".pushsection .altinstr_replacement, \"ax\"\n" \ - ALTINSTR_REPLACEMENT(newinstr1, 1) \ - ALTINSTR_REPLACEMENT(newinstr2, 2) \ + "# ALT: replacement\n" \ + "774:\n\t" newinstr "\n775:\n" \ ".popsection\n" +/* alternative assembly primitive: */ +#define ALTERNATIVE(oldinstr, newinstr, ft_flags) \ + OLDINSTR(oldinstr) \ + ALTINSTR_ENTRY(ft_flags) \ + ALTINSTR_REPLACEMENT(newinstr) + +#define ALTERNATIVE_2(oldinstr, newinstr1, ft_flags1, newinstr2, ft_flags2) \ + ALTERNATIVE(ALTERNATIVE(oldinstr, newinstr1, ft_flags1), newinstr2, ft_flags2) + /* If @feature is set, patch in @newinstr_yes, otherwise @newinstr_no. 
*/ -#define ALTERNATIVE_TERNARY(oldinstr, feature, newinstr_yes, newinstr_no) \ - ALTERNATIVE_2(oldinstr, newinstr_no, X86_FEATURE_ALWAYS, \ - newinstr_yes, feature) - -#define ALTERNATIVE_3(oldinsn, newinsn1, feat1, newinsn2, feat2, newinsn3, feat3) \ - OLDINSTR_3(oldinsn, 1, 2, 3) \ - ".pushsection .altinstructions,\"a\"\n" \ - ALTINSTR_ENTRY(feat1, 1) \ - ALTINSTR_ENTRY(feat2, 2) \ - ALTINSTR_ENTRY(feat3, 3) \ - ".popsection\n" \ - ".pushsection .altinstr_replacement, \"ax\"\n" \ - ALTINSTR_REPLACEMENT(newinsn1, 1) \ - ALTINSTR_REPLACEMENT(newinsn2, 2) \ - ALTINSTR_REPLACEMENT(newinsn3, 3) \ - ".popsection\n" +#define ALTERNATIVE_TERNARY(oldinstr, ft_flags, newinstr_yes, newinstr_no) \ + ALTERNATIVE_2(oldinstr, newinstr_no, X86_FEATURE_ALWAYS, newinstr_yes, ft_flags) + +#define ALTERNATIVE_3(oldinstr, newinstr1, ft_flags1, newinstr2, ft_flags2, \ + newinstr3, ft_flags3) \ + ALTERNATIVE(ALTERNATIVE_2(oldinstr, newinstr1, ft_flags1, newinstr2, ft_flags2), \ + newinstr3, ft_flags3) /* * Alternative instructions for different CPU types or capabilities. @@ -209,14 +241,11 @@ static inline int alternatives_text_reserved(void *start, void *end) * For non barrier like inlines please define new variants * without volatile and memory clobber. 
*/ -#define alternative(oldinstr, newinstr, feature) \ - asm_inline volatile (ALTERNATIVE(oldinstr, newinstr, feature) : : : "memory") +#define alternative(oldinstr, newinstr, ft_flags) \ + asm_inline volatile(ALTERNATIVE(oldinstr, newinstr, ft_flags) : : : "memory") -#define alternative_2(oldinstr, newinstr1, feature1, newinstr2, feature2) \ - asm_inline volatile(ALTERNATIVE_2(oldinstr, newinstr1, feature1, newinstr2, feature2) ::: "memory") - -#define alternative_ternary(oldinstr, feature, newinstr_yes, newinstr_no) \ - asm_inline volatile(ALTERNATIVE_TERNARY(oldinstr, feature, newinstr_yes, newinstr_no) ::: "memory") +#define alternative_2(oldinstr, newinstr1, ft_flags1, newinstr2, ft_flags2) \ + asm_inline volatile(ALTERNATIVE_2(oldinstr, newinstr1, ft_flags1, newinstr2, ft_flags2) ::: "memory") /* * Alternative inline assembly with input. @@ -226,33 +255,31 @@ static inline int alternatives_text_reserved(void *start, void *end) * Argument numbers start with 1. * Leaving an unused argument 0 to keep API compatibility. */ -#define alternative_input(oldinstr, newinstr, feature, input...) \ - asm_inline volatile (ALTERNATIVE(oldinstr, newinstr, feature) \ - : : "i" (0), ## input) - -/* - * This is similar to alternative_input. But it has two features and - * respective instructions. - * - * If CPU has feature2, newinstr2 is used. - * Otherwise, if CPU has feature1, newinstr1 is used. - * Otherwise, oldinstr is used. - */ -#define alternative_input_2(oldinstr, newinstr1, feature1, newinstr2, \ - feature2, input...) \ - asm_inline volatile(ALTERNATIVE_2(oldinstr, newinstr1, feature1, \ - newinstr2, feature2) \ +#define alternative_input(oldinstr, newinstr, ft_flags, input...) \ + asm_inline volatile(ALTERNATIVE(oldinstr, newinstr, ft_flags) \ : : "i" (0), ## input) /* Like alternative_input, but with a single output argument */ -#define alternative_io(oldinstr, newinstr, feature, output, input...) 
\ - asm_inline volatile (ALTERNATIVE(oldinstr, newinstr, feature) \ +#define alternative_io(oldinstr, newinstr, ft_flags, output, input...) \ + asm_inline volatile(ALTERNATIVE(oldinstr, newinstr, ft_flags) \ : output : "i" (0), ## input) -/* Like alternative_io, but for replacing a direct call with another one. */ -#define alternative_call(oldfunc, newfunc, feature, output, input...) \ - asm_inline volatile (ALTERNATIVE("call %P[old]", "call %P[new]", feature) \ - : output : [old] "i" (oldfunc), [new] "i" (newfunc), ## input) +/* + * Like alternative_io, but for replacing a direct call with another one. + * + * Use the %c operand modifier which is the generic way to print a bare + * constant expression with all syntax-specific punctuation omitted. %P + * is the x86-specific variant which can handle constants too, for + * historical reasons, but it should be used primarily for PIC + * references: i.e., if used for a function, it would add the PLT + * suffix. + */ +#define alternative_call(oldfunc, newfunc, ft_flags, output, input, clobbers...) \ + asm_inline volatile(ALTERNATIVE("call %c[old]", "call %c[new]", ft_flags) \ + : ALT_OUTPUT_SP(output) \ + : [old] "i" (oldfunc), [new] "i" (newfunc) \ + COMMA(input) \ + : clobbers) /* * Like alternative_call, but there are two features and respective functions. @@ -260,27 +287,35 @@ static inline int alternatives_text_reserved(void *start, void *end) * Otherwise, if CPU has feature1, function1 is used. * Otherwise, old function is used. */ -#define alternative_call_2(oldfunc, newfunc1, feature1, newfunc2, feature2, \ - output, input...) \ - asm_inline volatile (ALTERNATIVE_2("call %P[old]", "call %P[new1]", feature1,\ - "call %P[new2]", feature2) \ - : output, ASM_CALL_CONSTRAINT \ - : [old] "i" (oldfunc), [new1] "i" (newfunc1), \ - [new2] "i" (newfunc2), ## input) - -/* - * use this macro(s) if you need more than one output parameter - * in alternative_io - */ -#define ASM_OUTPUT2(a...) 
a - -/* - * use this macro if you need clobbers but no inputs in - * alternative_{input,io,call}() - */ -#define ASM_NO_INPUT_CLOBBER(clbr...) "i" (0) : clbr - -#else /* __ASSEMBLY__ */ +#define alternative_call_2(oldfunc, newfunc1, ft_flags1, newfunc2, ft_flags2, \ + output, input, clobbers...) \ + asm_inline volatile(ALTERNATIVE_2("call %c[old]", "call %c[new1]", ft_flags1, \ + "call %c[new2]", ft_flags2) \ + : ALT_OUTPUT_SP(output) \ + : [old] "i" (oldfunc), [new1] "i" (newfunc1), \ + [new2] "i" (newfunc2) \ + COMMA(input) \ + : clobbers) + +#define ALT_OUTPUT_SP(...) ASM_CALL_CONSTRAINT, ## __VA_ARGS__ + +/* Macro for creating assembler functions avoiding any C magic. */ +#define DEFINE_ASM_FUNC(func, instr, sec) \ + asm (".pushsection " #sec ", \"ax\"\n" \ + ".global " #func "\n\t" \ + ".type " #func ", @function\n\t" \ + ASM_FUNC_ALIGN "\n" \ + #func ":\n\t" \ + ASM_ENDBR \ + instr "\n\t" \ + ASM_RET \ + ".size " #func ", . - " #func "\n\t" \ + ".popsection") + +void BUG_func(void); +void nop_func(void); + +#else /* __ASSEMBLER__ */ #ifdef CONFIG_SMP .macro LOCK_PREFIX @@ -296,99 +331,68 @@ static inline int alternatives_text_reserved(void *start, void *end) #endif /* - * objtool annotation to ignore the alternatives and only consider the original - * instruction(s). - */ -.macro ANNOTATE_IGNORE_ALTERNATIVE - .Lannotate_\@: - .pushsection .discard.ignore_alts - .long .Lannotate_\@ - . - .popsection -.endm - -/* * Issue one struct alt_instr descriptor entry (need to put it into * the section .altinstructions, see below). This entry contains * enough information for the alternatives patching code to patch an * instruction. See apply_alternatives(). */ -.macro altinstruction_entry orig alt feature orig_len alt_len +.macro altinstr_entry orig alt ft_flags orig_len alt_len .long \orig - . .long \alt - . 
- .word \feature + .4byte \ft_flags .byte \orig_len .byte \alt_len .endm +.macro ALT_CALL_INSTR + call BUG_func +.endm + /* * Define an alternative between two instructions. If @feature is * present, early code in apply_alternatives() replaces @oldinstr with * @newinstr. ".skip" directive takes care of proper instruction padding * in case @newinstr is longer than @oldinstr. */ -.macro ALTERNATIVE oldinstr, newinstr, feature -140: - \oldinstr -141: - .skip -(((144f-143f)-(141b-140b)) > 0) * ((144f-143f)-(141b-140b)),0x90 -142: - - .pushsection .altinstructions,"a" - altinstruction_entry 140b,143f,\feature,142b-140b,144f-143f - .popsection - - .pushsection .altinstr_replacement,"ax" -143: - \newinstr -144: - .popsection +#define __ALTERNATIVE(oldinst, newinst, flag) \ +740: \ + oldinst ; \ +741: \ + .skip -(((744f-743f)-(741b-740b)) > 0) * ((744f-743f)-(741b-740b)),0x90 ;\ +742: \ + .pushsection .altinstructions,"a" ; \ + altinstr_entry 740b,743f,flag,742b-740b,744f-743f ; \ + .popsection ; \ + .pushsection .altinstr_replacement,"ax" ; \ +743: \ + newinst ; \ +744: \ + .popsection ; + +.macro ALTERNATIVE oldinstr, newinstr, ft_flags + __ALTERNATIVE(\oldinstr, \newinstr, \ft_flags) .endm -#define old_len 141b-140b -#define new_len1 144f-143f -#define new_len2 145f-144f - -/* - * gas compatible max based on the idea from: - * http://graphics.stanford.edu/~seander/bithacks.html#IntegerMinOrMax - * - * The additional "-" is needed because gas uses a "true" value of -1. - */ -#define alt_max_short(a, b) ((a) ^ (((a) ^ (b)) & -(-((a) < (b))))) - - /* * Same as ALTERNATIVE macro above but for two alternatives. If CPU * has @feature1, it replaces @oldinstr with @newinstr1. If CPU has * @feature2, it replaces @oldinstr with @feature2. 
*/ -.macro ALTERNATIVE_2 oldinstr, newinstr1, feature1, newinstr2, feature2 -140: - \oldinstr -141: - .skip -((alt_max_short(new_len1, new_len2) - (old_len)) > 0) * \ - (alt_max_short(new_len1, new_len2) - (old_len)),0x90 -142: - - .pushsection .altinstructions,"a" - altinstruction_entry 140b,143f,\feature1,142b-140b,144f-143f - altinstruction_entry 140b,144f,\feature2,142b-140b,145f-144f - .popsection +.macro ALTERNATIVE_2 oldinstr, newinstr1, ft_flags1, newinstr2, ft_flags2 + __ALTERNATIVE(__ALTERNATIVE(\oldinstr, \newinstr1, \ft_flags1), + \newinstr2, \ft_flags2) +.endm - .pushsection .altinstr_replacement,"ax" -143: - \newinstr1 -144: - \newinstr2 -145: - .popsection +.macro ALTERNATIVE_3 oldinstr, newinstr1, ft_flags1, newinstr2, ft_flags2, newinstr3, ft_flags3 + __ALTERNATIVE(ALTERNATIVE_2(\oldinstr, \newinstr1, \ft_flags1, \newinstr2, \ft_flags2), + \newinstr3, \ft_flags3) .endm /* If @feature is set, patch in @newinstr_yes, otherwise @newinstr_no. */ -#define ALTERNATIVE_TERNARY(oldinstr, feature, newinstr_yes, newinstr_no) \ +#define ALTERNATIVE_TERNARY(oldinstr, ft_flags, newinstr_yes, newinstr_no) \ ALTERNATIVE_2 oldinstr, newinstr_no, X86_FEATURE_ALWAYS, \ - newinstr_yes, feature + newinstr_yes, ft_flags -#endif /* __ASSEMBLY__ */ +#endif /* __ASSEMBLER__ */ #endif /* _ASM_X86_ALTERNATIVE_H */ diff --git a/arch/x86/include/asm/amd/fch.h b/arch/x86/include/asm/amd/fch.h new file mode 100644 index 000000000000..2cf5153edbc2 --- /dev/null +++ b/arch/x86/include/asm/amd/fch.h @@ -0,0 +1,13 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _ASM_X86_AMD_FCH_H_ +#define _ASM_X86_AMD_FCH_H_ + +#define FCH_PM_BASE 0xFED80300 + +/* Register offsets from PM base: */ +#define FCH_PM_DECODEEN 0x00 +#define FCH_PM_DECODEEN_SMBUS0SEL GENMASK(20, 19) +#define FCH_PM_SCRATCH 0x80 +#define FCH_PM_S5_RESET_STATUS 0xC0 + +#endif /* _ASM_X86_AMD_FCH_H_ */ diff --git a/arch/x86/include/asm/amd_hsmp.h b/arch/x86/include/asm/amd/hsmp.h index 03c2ce3edaf5..2137f62853ed 100644 
--- a/arch/x86/include/asm/amd_hsmp.h +++ b/arch/x86/include/asm/amd/hsmp.h @@ -1,5 +1,4 @@ /* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */ - #ifndef _ASM_X86_AMD_HSMP_H_ #define _ASM_X86_AMD_HSMP_H_ @@ -13,4 +12,5 @@ static inline int hsmp_send_message(struct hsmp_message *msg) return -ENODEV; } #endif + #endif /*_ASM_X86_AMD_HSMP_H_*/ diff --git a/arch/x86/include/asm/amd-ibs.h b/arch/x86/include/asm/amd/ibs.h index f3eb098d63d4..3ee5903982c2 100644 --- a/arch/x86/include/asm/amd-ibs.h +++ b/arch/x86/include/asm/amd/ibs.h @@ -1,4 +1,7 @@ /* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _ASM_X86_AMD_IBS_H +#define _ASM_X86_AMD_IBS_H + /* * From PPR Vol 1 for AMD Family 19h Model 01h B1 * 55898 Rev 0.35 - Feb 5, 2021 @@ -6,6 +9,22 @@ #include <asm/msr-index.h> +/* IBS_OP_DATA2 DataSrc */ +#define IBS_DATA_SRC_LOC_CACHE 2 +#define IBS_DATA_SRC_DRAM 3 +#define IBS_DATA_SRC_REM_CACHE 4 +#define IBS_DATA_SRC_IO 7 + +/* IBS_OP_DATA2 DataSrc Extension */ +#define IBS_DATA_SRC_EXT_LOC_CACHE 1 +#define IBS_DATA_SRC_EXT_NEAR_CCX_CACHE 2 +#define IBS_DATA_SRC_EXT_DRAM 3 +#define IBS_DATA_SRC_EXT_FAR_CCX_CACHE 5 +#define IBS_DATA_SRC_EXT_PMEM 6 +#define IBS_DATA_SRC_EXT_IO 7 +#define IBS_DATA_SRC_EXT_EXT_MEM 8 +#define IBS_DATA_SRC_EXT_PEER_AGENT_MEM 12 + /* * IBS Hardware MSRs */ @@ -48,7 +67,8 @@ union ibs_op_ctl { opmaxcnt_ext:7, /* 20-26: upper 7 bits of periodic op maximum count */ reserved0:5, /* 27-31: reserved */ opcurcnt:27, /* 32-58: periodic op counter current count */ - reserved1:5; /* 59-63: reserved */ + ldlat_thrsh:4, /* 59-62: Load Latency threshold */ + ldlat_en:1; /* 63: Load Latency enabled */ }; }; @@ -134,3 +154,5 @@ struct perf_ibs_data { }; u64 regs[MSR_AMD64_IBS_REG_COUNT_MAX]; }; + +#endif /* _ASM_X86_AMD_IBS_H */ diff --git a/arch/x86/include/asm/amd/nb.h b/arch/x86/include/asm/amd/nb.h new file mode 100644 index 000000000000..ddb5108cf46c --- /dev/null +++ b/arch/x86/include/asm/amd/nb.h @@ -0,0 +1,77 @@ +/* SPDX-License-Identifier: 
GPL-2.0 */ +#ifndef _ASM_X86_AMD_NB_H +#define _ASM_X86_AMD_NB_H + +#include <linux/ioport.h> +#include <linux/pci.h> +#include <asm/amd/node.h> + +struct amd_nb_bus_dev_range { + u8 bus; + u8 dev_base; + u8 dev_limit; +}; + +extern const struct amd_nb_bus_dev_range amd_nb_bus_dev_ranges[]; + +extern bool early_is_amd_nb(u32 value); +extern struct resource *amd_get_mmconfig_range(struct resource *res); +extern void amd_flush_garts(void); +extern int amd_numa_init(void); +extern int amd_get_subcaches(int); +extern int amd_set_subcaches(int, unsigned long); + +struct amd_l3_cache { + unsigned indices; + u8 subcaches[4]; +}; + +struct amd_northbridge { + struct pci_dev *misc; + struct pci_dev *link; + struct amd_l3_cache l3_cache; +}; + +struct amd_northbridge_info { + u16 num; + u64 flags; + struct amd_northbridge *nb; +}; + +#define AMD_NB_GART BIT(0) +#define AMD_NB_L3_INDEX_DISABLE BIT(1) +#define AMD_NB_L3_PARTITIONING BIT(2) + +#ifdef CONFIG_AMD_NB + +u16 amd_nb_num(void); +bool amd_nb_has_feature(unsigned int feature); +struct amd_northbridge *node_to_amd_nb(int node); + +static inline bool amd_gart_present(void) +{ + if (boot_cpu_data.x86_vendor != X86_VENDOR_AMD) + return false; + + /* GART present only on Fam15h, up to model 0fh */ + if (boot_cpu_data.x86 == 0xf || boot_cpu_data.x86 == 0x10 || + (boot_cpu_data.x86 == 0x15 && boot_cpu_data.x86_model < 0x10)) + return true; + + return false; +} + +#else + +#define amd_nb_num(x) 0 +#define amd_nb_has_feature(x) false +static inline struct amd_northbridge *node_to_amd_nb(int node) +{ + return NULL; +} +#define amd_gart_present(x) false + +#endif + + +#endif /* _ASM_X86_AMD_NB_H */ diff --git a/arch/x86/include/asm/amd/node.h b/arch/x86/include/asm/amd/node.h new file mode 100644 index 000000000000..23fe617898a8 --- /dev/null +++ b/arch/x86/include/asm/amd/node.h @@ -0,0 +1,60 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * AMD Node helper functions and common defines + * + * Copyright (c) 2024, Advanced Micro 
Devices, Inc. + * All Rights Reserved. + * + * Author: Yazen Ghannam <Yazen.Ghannam@amd.com> + * + * Note: + * Items in this file may only be used in a single place. + * However, it's prudent to keep all AMD Node functionality + * in a unified place rather than spreading throughout the + * kernel. + */ + +#ifndef _ASM_X86_AMD_NODE_H_ +#define _ASM_X86_AMD_NODE_H_ + +#include <linux/pci.h> + +#define MAX_AMD_NUM_NODES 8 +#define AMD_NODE0_PCI_SLOT 0x18 + +struct pci_dev *amd_node_get_func(u16 node, u8 func); +struct pci_dev *amd_node_get_root(u16 node); + +static inline u16 amd_num_nodes(void) +{ + return topology_amd_nodes_per_pkg() * topology_max_packages(); +} + +#ifdef CONFIG_AMD_NODE +int __must_check amd_smn_read(u16 node, u32 address, u32 *value); +int __must_check amd_smn_write(u16 node, u32 address, u32 value); + +/* Should only be used by the HSMP driver. */ +int __must_check amd_smn_hsmp_rdwr(u16 node, u32 address, u32 *value, bool write); +#else +static inline int __must_check amd_smn_read(u16 node, u32 address, u32 *value) { return -ENODEV; } +static inline int __must_check amd_smn_write(u16 node, u32 address, u32 value) { return -ENODEV; } + +static inline int __must_check amd_smn_hsmp_rdwr(u16 node, u32 address, u32 *value, bool write) +{ + return -ENODEV; +} +#endif /* CONFIG_AMD_NODE */ + +/* helper for use with read_poll_timeout */ +static inline int smn_read_register(u32 reg) +{ + int data, rc; + + rc = amd_smn_read(0, reg, &data); + if (rc) + return rc; + + return data; +} +#endif /*_ASM_X86_AMD_NODE_H_*/ diff --git a/arch/x86/include/asm/amd_nb.h b/arch/x86/include/asm/amd_nb.h deleted file mode 100644 index ed0eaf65c437..000000000000 --- a/arch/x86/include/asm/amd_nb.h +++ /dev/null @@ -1,125 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -#ifndef _ASM_X86_AMD_NB_H -#define _ASM_X86_AMD_NB_H - -#include <linux/ioport.h> -#include <linux/pci.h> -#include <linux/refcount.h> - -struct amd_nb_bus_dev_range { - u8 bus; - u8 dev_base; - u8 
dev_limit; -}; - -extern const struct amd_nb_bus_dev_range amd_nb_bus_dev_ranges[]; - -extern bool early_is_amd_nb(u32 value); -extern struct resource *amd_get_mmconfig_range(struct resource *res); -extern void amd_flush_garts(void); -extern int amd_numa_init(void); -extern int amd_get_subcaches(int); -extern int amd_set_subcaches(int, unsigned long); - -extern int amd_smn_read(u16 node, u32 address, u32 *value); -extern int amd_smn_write(u16 node, u32 address, u32 value); - -struct amd_l3_cache { - unsigned indices; - u8 subcaches[4]; -}; - -struct threshold_block { - unsigned int block; /* Number within bank */ - unsigned int bank; /* MCA bank the block belongs to */ - unsigned int cpu; /* CPU which controls MCA bank */ - u32 address; /* MSR address for the block */ - u16 interrupt_enable; /* Enable/Disable APIC interrupt */ - bool interrupt_capable; /* Bank can generate an interrupt. */ - - u16 threshold_limit; /* - * Value upon which threshold - * interrupt is generated. - */ - - struct kobject kobj; /* sysfs object */ - struct list_head miscj; /* - * List of threshold blocks - * within a bank. 
- */ -}; - -struct threshold_bank { - struct kobject *kobj; - struct threshold_block *blocks; - - /* initialized to the number of CPUs on the node sharing this bank */ - refcount_t cpus; - unsigned int shared; -}; - -struct amd_northbridge { - struct pci_dev *root; - struct pci_dev *misc; - struct pci_dev *link; - struct amd_l3_cache l3_cache; - struct threshold_bank *bank4; -}; - -struct amd_northbridge_info { - u16 num; - u64 flags; - struct amd_northbridge *nb; -}; - -#define AMD_NB_GART BIT(0) -#define AMD_NB_L3_INDEX_DISABLE BIT(1) -#define AMD_NB_L3_PARTITIONING BIT(2) - -#ifdef CONFIG_AMD_NB - -u16 amd_nb_num(void); -bool amd_nb_has_feature(unsigned int feature); -struct amd_northbridge *node_to_amd_nb(int node); - -static inline u16 amd_pci_dev_to_node_id(struct pci_dev *pdev) -{ - struct pci_dev *misc; - int i; - - for (i = 0; i != amd_nb_num(); i++) { - misc = node_to_amd_nb(i)->misc; - - if (pci_domain_nr(misc->bus) == pci_domain_nr(pdev->bus) && - PCI_SLOT(misc->devfn) == PCI_SLOT(pdev->devfn)) - return i; - } - - WARN(1, "Unable to find AMD Northbridge id for %s\n", pci_name(pdev)); - return 0; -} - -static inline bool amd_gart_present(void) -{ - if (boot_cpu_data.x86_vendor != X86_VENDOR_AMD) - return false; - - /* GART present only on Fam15h, upto model 0fh */ - if (boot_cpu_data.x86 == 0xf || boot_cpu_data.x86 == 0x10 || - (boot_cpu_data.x86 == 0x15 && boot_cpu_data.x86_model < 0x10)) - return true; - - return false; -} - -#else - -#define amd_nb_num(x) 0 -#define amd_nb_has_feature(x) false -#define node_to_amd_nb(x) NULL -#define amd_gart_present(x) false - -#endif - - -#endif /* _ASM_X86_AMD_NB_H */ diff --git a/arch/x86/include/asm/apic.h b/arch/x86/include/asm/apic.h index bd8ae0a7010a..23d86c9750b9 100644 --- a/arch/x86/include/asm/apic.h +++ b/arch/x86/include/asm/apic.h @@ -3,6 +3,7 @@ #define _ASM_X86_APIC_H #include <linux/cpumask.h> +#include <linux/static_call.h> #include <asm/alternative.h> #include <asm/cpufeature.h> @@ -12,9 +13,16 @@ 
#include <asm/mpspec.h> #include <asm/msr.h> #include <asm/hardirq.h> +#include <asm/io.h> +#include <asm/posted_intr.h> #define ARCH_APICTIMER_STOPS_ON_C3 1 +/* Macros for apic_extnmi which controls external NMI masking */ +#define APIC_EXTNMI_BSP 0 /* Default */ +#define APIC_EXTNMI_ALL 1 +#define APIC_EXTNMI_NONE 2 + /* * Debugging macros */ @@ -22,37 +30,39 @@ #define APIC_VERBOSE 1 #define APIC_DEBUG 2 -/* Macros for apic_extnmi which controls external NMI masking */ -#define APIC_EXTNMI_BSP 0 /* Default */ -#define APIC_EXTNMI_ALL 1 -#define APIC_EXTNMI_NONE 2 - /* - * Define the default level of output to be very little - * This can be turned up by using apic=verbose for more - * information and apic=debug for _lots_ of information. - * apic_verbosity is defined in apic.c + * Define the default level of output to be very little This can be turned + * up by using apic=verbose for more information and apic=debug for _lots_ + * of information. apic_verbosity is defined in apic.c */ -#define apic_printk(v, s, a...) do { \ - if ((v) <= apic_verbosity) \ - printk(s, ##a); \ - } while (0) - +#define apic_printk(v, s, a...) \ +do { \ + if ((v) <= apic_verbosity) \ + printk(s, ##a); \ +} while (0) + +#define apic_pr_verbose(s, a...) apic_printk(APIC_VERBOSE, KERN_INFO s, ##a) +#define apic_pr_debug(s, a...) apic_printk(APIC_DEBUG, KERN_DEBUG s, ##a) +#define apic_pr_debug_cont(s, a...) apic_printk(APIC_DEBUG, KERN_CONT s, ##a) +/* Unconditional debug prints for code which is guarded by apic_verbosity already */ +#define apic_dbg(s, a...) 
printk(KERN_DEBUG s, ##a) #if defined(CONFIG_X86_LOCAL_APIC) && defined(CONFIG_X86_32) -extern void generic_apic_probe(void); +extern void x86_32_probe_apic(void); #else -static inline void generic_apic_probe(void) -{ -} +static inline void x86_32_probe_apic(void) { } #endif +extern u32 cpuid_to_apicid[]; + +#define CPU_ACPIID_INVALID U32_MAX + #ifdef CONFIG_X86_LOCAL_APIC extern int apic_verbosity; extern int local_apic_timer_c2_ok; -extern int disable_apic; +extern bool apic_is_disabled; extern unsigned int lapic_timer_period; extern enum apic_intr_mode_id apic_intr_mode; @@ -64,20 +74,6 @@ enum apic_intr_mode_id { APIC_SYMMETRIC_IO_NO_ROUTING }; -#ifdef CONFIG_SMP -extern void __inquire_remote_apic(int apicid); -#else /* CONFIG_SMP */ -static inline void __inquire_remote_apic(int apicid) -{ -} -#endif /* CONFIG_SMP */ - -static inline void default_inquire_remote_apic(int apicid) -{ - if (apic_verbosity >= APIC_DEBUG) - __inquire_remote_apic(apicid); -} - /* * With 82489DX we can't rely on apic feature bit * retrieved via cpuid but still have to deal with @@ -88,7 +84,7 @@ static inline void default_inquire_remote_apic(int apicid) */ static inline bool apic_from_smp_config(void) { - return smp_found_config && !disable_apic; + return smp_found_config && !apic_is_disabled; } /* @@ -98,24 +94,25 @@ static inline bool apic_from_smp_config(void) #include <asm/paravirt.h> #endif -extern int setup_profiling_timer(unsigned int); - static inline void native_apic_mem_write(u32 reg, u32 v) { volatile u32 *addr = (volatile u32 *)(APIC_BASE + reg); - alternative_io("movl %0, %P1", "xchgl %0, %P1", X86_BUG_11AP, - ASM_OUTPUT2("=r" (v), "=m" (*addr)), - ASM_OUTPUT2("0" (v), "m" (*addr))); + alternative_io("movl %0, %1", "xchgl %0, %1", X86_BUG_11AP, + ASM_OUTPUT("=r" (v), "=m" (*addr)), + ASM_INPUT("0" (v), "m" (*addr))); } static inline u32 native_apic_mem_read(u32 reg) { - return *((volatile u32 *)(APIC_BASE + reg)); + return readl((void __iomem *)(APIC_BASE + reg)); +} + 
+static inline void native_apic_mem_eoi(void) +{ + native_apic_mem_write(APIC_EOI, APIC_EOI_ACK); } -extern void native_apic_wait_icr_idle(void); -extern u32 native_safe_apic_wait_icr_idle(void); extern void native_apic_icr_write(u32 low, u32 id); extern u64 native_apic_icr_read(void); @@ -123,15 +120,13 @@ static inline bool apic_is_x2apic_enabled(void) { u64 msr; - if (rdmsrl_safe(MSR_IA32_APICBASE, &msr)) + if (rdmsrq_safe(MSR_IA32_APICBASE, &msr)) return false; return msr & X2APIC_ENABLE; } extern void enable_IR_x2apic(void); -extern int get_physical_broadcast(void); - extern int lapic_get_maxlvt(void); extern void clear_local_APIC(void); extern void disconnect_bsp_APIC(int virt_wire_setup); @@ -149,12 +144,12 @@ extern void setup_secondary_APIC_clock(void); extern void lapic_update_tsc_freq(void); #ifdef CONFIG_X86_64 -static inline int apic_force_enable(unsigned long addr) +static inline bool apic_force_enable(unsigned long addr) { - return -1; + return false; } #else -extern int apic_force_enable(unsigned long addr); +extern bool apic_force_enable(unsigned long addr); #endif extern void apic_ap_setup(void); @@ -181,6 +176,14 @@ extern bool apic_needs_pit(void); extern void apic_send_IPI_allbutself(unsigned int vector); +extern void topology_register_apic(u32 apic_id, u32 acpi_id, bool present); +extern void topology_register_boot_apic(u32 apic_id); +extern int topology_hotplug_apic(u32 apic_id, u32 acpi_id); +extern void topology_hotunplug_apic(unsigned int cpu); +extern void topology_apply_cmdline_limits_early(void); +extern void topology_init_possible_cpus(void); +extern void topology_reset_possible_cpus_up(void); + #else /* !CONFIG_X86_LOCAL_APIC */ static inline void lapic_shutdown(void) { } #define local_apic_timer_c2_ok 1 @@ -195,6 +198,8 @@ static inline void apic_intr_mode_init(void) { } static inline void lapic_assign_system_vectors(void) { } static inline void lapic_assign_legacy_vector(unsigned int i, bool r) { } static inline bool 
apic_needs_pit(void) { return true; } +static inline void topology_apply_cmdline_limits_early(void) { } +static inline void topology_init_possible_cpus(void) { } #endif /* !CONFIG_X86_LOCAL_APIC */ #ifdef CONFIG_X86_X2APIC @@ -204,12 +209,12 @@ static inline void native_apic_msr_write(u32 reg, u32 v) reg == APIC_LVR) return; - wrmsr(APIC_BASE_MSR + (reg >> 4), v, 0); + wrmsrq(APIC_BASE_MSR + (reg >> 4), v); } -static inline void native_apic_msr_eoi_write(u32 reg, u32 v) +static inline void native_apic_msr_eoi(void) { - __wrmsr(APIC_BASE_MSR + (APIC_EOI >> 4), APIC_EOI_ACK, 0); + native_wrmsrq(APIC_BASE_MSR + (APIC_EOI >> 4), APIC_EOI_ACK); } static inline u32 native_apic_msr_read(u32 reg) @@ -219,39 +224,26 @@ static inline u32 native_apic_msr_read(u32 reg) if (reg == APIC_DFR) return -1; - rdmsrl(APIC_BASE_MSR + (reg >> 4), msr); + rdmsrq(APIC_BASE_MSR + (reg >> 4), msr); return (u32)msr; } -static inline void native_x2apic_wait_icr_idle(void) -{ - /* no need to wait for icr idle in x2apic */ - return; -} - -static inline u32 native_safe_x2apic_wait_icr_idle(void) -{ - /* no need to wait for icr idle in x2apic */ - return 0; -} - static inline void native_x2apic_icr_write(u32 low, u32 id) { - wrmsrl(APIC_BASE_MSR + (APIC_ICR >> 4), ((__u64) id) << 32 | low); + wrmsrq(APIC_BASE_MSR + (APIC_ICR >> 4), ((__u64) id) << 32 | low); } static inline u64 native_x2apic_icr_read(void) { unsigned long val; - rdmsrl(APIC_BASE_MSR + (APIC_ICR >> 4), val); + rdmsrq(APIC_BASE_MSR + (APIC_ICR >> 4), val); return val; } extern int x2apic_mode; extern int x2apic_phys; extern void __init x2apic_set_max_apicid(u32 apicid); -extern void __init check_x2apic(void); extern void x2apic_setup(void); static inline int x2apic_enabled(void) { @@ -260,13 +252,13 @@ static inline int x2apic_enabled(void) #define x2apic_supported() (boot_cpu_has(X86_FEATURE_X2APIC)) #else /* !CONFIG_X86_X2APIC */ -static inline void check_x2apic(void) { } static inline void x2apic_setup(void) { } static inline 
int x2apic_enabled(void) { return 0; } - +static inline u32 native_apic_msr_read(u32 reg) { BUG(); } #define x2apic_mode (0) #define x2apic_supported() (0) #endif /* !CONFIG_X86_X2APIC */ +extern void __init check_x2apic(void); struct irq_data; @@ -281,8 +273,8 @@ struct irq_data; */ struct apic { /* Hotpath functions first */ - void (*eoi_write)(u32 reg, u32 v); - void (*native_eoi_write)(u32 reg, u32 v); + void (*eoi)(void); + void (*native_eoi)(void); void (*write)(u32 reg, u32 v); u32 (*read)(u32 reg); @@ -297,10 +289,10 @@ struct apic { void (*send_IPI_all)(int vector); void (*send_IPI_self)(int vector); - u32 disable_esr; - - enum apic_delivery_modes delivery_mode; - bool dest_mode_logical; + u32 disable_esr : 1, + dest_mode_logical : 1, + x2apic_set_max_apicid : 1, + nmi_to_offline_cpu : 1; u32 (*calc_dest_apicid)(unsigned int cpu); @@ -308,47 +300,43 @@ struct apic { u64 (*icr_read)(void); void (*icr_write)(u32 low, u32 high); + /* The limit of the APIC ID space. */ + u32 max_apic_id; + /* Probe, setup and smpboot functions */ int (*probe)(void); int (*acpi_madt_oem_check)(char *oem_id, char *oem_table_id); - int (*apic_id_valid)(u32 apicid); - int (*apic_id_registered)(void); - bool (*check_apicid_used)(physid_mask_t *map, int apicid); void (*init_apic_ldr)(void); - void (*ioapic_phys_id_map)(physid_mask_t *phys_map, physid_mask_t *retmap); - void (*setup_apic_routing)(void); - int (*cpu_present_to_apicid)(int mps_cpu); - void (*apicid_to_cpu_present)(int phys_apicid, physid_mask_t *retmap); - int (*check_phys_apicid_present)(int phys_apicid); - int (*phys_pkg_id)(int cpuid_apic, int index_msb); + u32 (*cpu_present_to_apicid)(int mps_cpu); - u32 (*get_apic_id)(unsigned long x); - u32 (*set_apic_id)(unsigned int id); + u32 (*get_apic_id)(u32 id); /* wakeup_secondary_cpu */ - int (*wakeup_secondary_cpu)(int apicid, unsigned long start_eip); + int (*wakeup_secondary_cpu)(u32 apicid, unsigned long start_eip, unsigned int cpu); /* wakeup secondary CPU using 
64-bit wakeup point */ - int (*wakeup_secondary_cpu_64)(int apicid, unsigned long start_eip); - - void (*inquire_remote_apic)(int apicid); - -#ifdef CONFIG_X86_32 - /* - * Called very early during boot from get_smp_config(). It should - * return the logical apicid. x86_[bios]_cpu_to_apicid is - * initialized before this function is called. - * - * If logical apicid can't be determined that early, the function - * may return BAD_APICID. Logical apicid will be configured after - * init_apic_ldr() while bringing up CPUs. Note that NUMA affinity - * won't be applied properly during early boot in this case. - */ - int (*x86_32_early_logical_apicid)(int cpu); -#endif + int (*wakeup_secondary_cpu_64)(u32 apicid, unsigned long start_eip, unsigned int cpu); + char *name; }; +struct apic_override { + void (*eoi)(void); + void (*native_eoi)(void); + void (*write)(u32 reg, u32 v); + u32 (*read)(u32 reg); + void (*send_IPI)(int cpu, int vector); + void (*send_IPI_mask)(const struct cpumask *mask, int vector); + void (*send_IPI_mask_allbutself)(const struct cpumask *msk, int vec); + void (*send_IPI_allbutself)(int vector); + void (*send_IPI_all)(int vector); + void (*send_IPI_self)(int vector); + u64 (*icr_read)(void); + void (*icr_write)(u32 low, u32 high); + int (*wakeup_secondary_cpu)(u32 apicid, unsigned long start_eip, unsigned int cpu); + int (*wakeup_secondary_cpu_64)(u32 apicid, unsigned long start_eip, unsigned int cpu); +}; + /* * Pointer to the local APIC driver in use on this system (there's * always just one such driver in use - the kernel decides via an @@ -360,68 +348,127 @@ extern struct apic *apic; * APIC drivers are probed based on how they are listed in the .apicdrivers * section. So the order is important and enforced by the ordering * of different apic driver files in the Makefile. - * - * For the files having two apic drivers, we use apic_drivers() - * to enforce the order with in them. 
*/ #define apic_driver(sym) \ static const struct apic *__apicdrivers_##sym __used \ __aligned(sizeof(struct apic *)) \ __section(".apicdrivers") = { &sym } -#define apic_drivers(sym1, sym2) \ - static struct apic *__apicdrivers_##sym1##sym2[2] __used \ - __aligned(sizeof(struct apic *)) \ - __section(".apicdrivers") = { &sym1, &sym2 } - extern struct apic *__apicdrivers[], *__apicdrivers_end[]; /* * APIC functionality to boot other CPUs - only used on SMP: */ #ifdef CONFIG_SMP -extern int wakeup_secondary_cpu_via_nmi(int apicid, unsigned long start_eip); extern int lapic_can_unplug_cpu(void); #endif #ifdef CONFIG_X86_LOCAL_APIC +extern struct apic_override __x86_apic_override; + +void __init apic_setup_apic_calls(void); +void __init apic_install_driver(struct apic *driver); + +#define apic_update_callback(_callback, _fn) { \ + __x86_apic_override._callback = _fn; \ + apic->_callback = _fn; \ + static_call_update(apic_call_##_callback, _fn); \ + pr_info("APIC: %s() replaced with %ps()\n", #_callback, _fn); \ +} + +#define DECLARE_APIC_CALL(__cb) \ + DECLARE_STATIC_CALL(apic_call_##__cb, *apic->__cb) + +DECLARE_APIC_CALL(eoi); +DECLARE_APIC_CALL(native_eoi); +DECLARE_APIC_CALL(icr_read); +DECLARE_APIC_CALL(icr_write); +DECLARE_APIC_CALL(read); +DECLARE_APIC_CALL(send_IPI); +DECLARE_APIC_CALL(send_IPI_mask); +DECLARE_APIC_CALL(send_IPI_mask_allbutself); +DECLARE_APIC_CALL(send_IPI_allbutself); +DECLARE_APIC_CALL(send_IPI_all); +DECLARE_APIC_CALL(send_IPI_self); +DECLARE_APIC_CALL(wait_icr_idle); +DECLARE_APIC_CALL(wakeup_secondary_cpu); +DECLARE_APIC_CALL(wakeup_secondary_cpu_64); +DECLARE_APIC_CALL(write); + +static __always_inline u32 apic_read(u32 reg) +{ + return static_call(apic_call_read)(reg); +} + +static __always_inline void apic_write(u32 reg, u32 val) +{ + static_call(apic_call_write)(reg, val); +} + +static __always_inline void apic_eoi(void) +{ + static_call(apic_call_eoi)(); +} + +static __always_inline void apic_native_eoi(void) +{ + 
static_call(apic_call_native_eoi)(); +} + +static __always_inline u64 apic_icr_read(void) +{ + return static_call(apic_call_icr_read)(); +} + +static __always_inline void apic_icr_write(u32 low, u32 high) +{ + static_call(apic_call_icr_write)(low, high); +} -static inline u32 apic_read(u32 reg) +static __always_inline void __apic_send_IPI(int cpu, int vector) { - return apic->read(reg); + static_call(apic_call_send_IPI)(cpu, vector); } -static inline void apic_write(u32 reg, u32 val) +static __always_inline void __apic_send_IPI_mask(const struct cpumask *mask, int vector) { - apic->write(reg, val); + static_call_mod(apic_call_send_IPI_mask)(mask, vector); } -static inline void apic_eoi(void) +static __always_inline void __apic_send_IPI_mask_allbutself(const struct cpumask *mask, int vector) { - apic->eoi_write(APIC_EOI, APIC_EOI_ACK); + static_call(apic_call_send_IPI_mask_allbutself)(mask, vector); } -static inline u64 apic_icr_read(void) +static __always_inline void __apic_send_IPI_allbutself(int vector) { - return apic->icr_read(); + static_call(apic_call_send_IPI_allbutself)(vector); } -static inline void apic_icr_write(u32 low, u32 high) +static __always_inline void __apic_send_IPI_all(int vector) { - apic->icr_write(low, high); + static_call(apic_call_send_IPI_all)(vector); } -static inline void apic_wait_icr_idle(void) +static __always_inline void __apic_send_IPI_self(int vector) { - apic->wait_icr_idle(); + static_call_mod(apic_call_send_IPI_self)(vector); } -static inline u32 safe_apic_wait_icr_idle(void) +static __always_inline void apic_wait_icr_idle(void) { - return apic->safe_wait_icr_idle(); + static_call_cond(apic_call_wait_icr_idle)(); } -extern void __init apic_set_eoi_write(void (*eoi_write)(u32 reg, u32 v)); +static __always_inline u32 safe_apic_wait_icr_idle(void) +{ + return apic->safe_wait_icr_idle ? 
apic->safe_wait_icr_idle() : 0; +} + +static __always_inline bool apic_id_valid(u32 apic_id) +{ + return apic_id <= apic->max_apic_id; +} #else /* CONFIG_X86_LOCAL_APIC */ @@ -432,22 +479,15 @@ static inline u64 apic_icr_read(void) { return 0; } static inline void apic_icr_write(u32 low, u32 high) { } static inline void apic_wait_icr_idle(void) { } static inline u32 safe_apic_wait_icr_idle(void) { return 0; } -static inline void apic_set_eoi_write(void (*eoi_write)(u32 reg, u32 v)) {} +static inline void apic_native_eoi(void) { WARN_ON_ONCE(1); } +static inline void apic_setup_apic_calls(void) { } + +#define apic_update_callback(_callback, _fn) do { } while (0) #endif /* CONFIG_X86_LOCAL_APIC */ extern void apic_ack_irq(struct irq_data *data); -static inline void ack_APIC_irq(void) -{ - /* - * ack_APIC_irq() actually gets compiled as a single instruction - * ... yummie. - */ - apic_eoi(); -} - - static inline bool lapic_vector_set_in_irr(unsigned int vector) { u32 irr = apic_read(APIC_IRR + (vector / 32 * 0x10)); @@ -455,14 +495,9 @@ static inline bool lapic_vector_set_in_irr(unsigned int vector) return !!(irr & (1U << (vector % 32))); } -static inline unsigned default_get_apic_id(unsigned long x) +static inline bool is_vector_pending(unsigned int vector) { - unsigned int ver = GET_APIC_VERSION(apic_read(APIC_LVR)); - - if (APIC_XAPIC(ver) || boot_cpu_has(X86_FEATURE_EXTD_APICID)) - return (x >> 24) & 0xFF; - else - return (x >> 24) & 0x0F; + return lapic_vector_set_in_irr(vector) || pi_pending_this_cpu(vector); } /* @@ -471,49 +506,44 @@ static inline unsigned default_get_apic_id(unsigned long x) #define TRAMPOLINE_PHYS_LOW 0x467 #define TRAMPOLINE_PHYS_HIGH 0x469 -extern void generic_bigsmp_probe(void); - #ifdef CONFIG_X86_LOCAL_APIC #include <asm/smp.h> -#define APIC_DFR_VALUE (APIC_DFR_FLAT) - -DECLARE_EARLY_PER_CPU_READ_MOSTLY(u16, x86_bios_cpu_apicid); - extern struct apic apic_noop; -static inline unsigned int read_apic_id(void) +static inline u32 
read_apic_id(void) { - unsigned int reg = apic_read(APIC_ID); + u32 reg = apic_read(APIC_ID); return apic->get_apic_id(reg); } #ifdef CONFIG_X86_64 typedef int (*wakeup_cpu_handler)(int apicid, unsigned long start_eip); -extern void acpi_wake_cpu_handler_update(wakeup_cpu_handler handler); -#endif - -extern int default_apic_id_valid(u32 apicid); extern int default_acpi_madt_oem_check(char *, char *); -extern void default_setup_apic_routing(void); +extern void x86_64_probe_apic(void); +#else +static inline int default_acpi_madt_oem_check(char *a, char *b) { return 0; } +static inline void x86_64_probe_apic(void) { } +#endif extern u32 apic_default_calc_apicid(unsigned int cpu); extern u32 apic_flat_calc_apicid(unsigned int cpu); -extern bool default_check_apicid_used(physid_mask_t *map, int apicid); -extern void default_ioapic_phys_id_map(physid_mask_t *phys_map, physid_mask_t *retmap); -extern int default_cpu_present_to_apicid(int mps_cpu); -extern int default_check_phys_apicid_present(int phys_apicid); +extern u32 default_cpu_present_to_apicid(int mps_cpu); -#endif /* CONFIG_X86_LOCAL_APIC */ +void apic_send_nmi_to_offline_cpu(unsigned int cpu); + +#else /* CONFIG_X86_LOCAL_APIC */ + +static inline u32 read_apic_id(void) { return 0; } + +#endif /* !CONFIG_X86_LOCAL_APIC */ #ifdef CONFIG_SMP -bool apic_id_is_primary_thread(unsigned int id); void apic_smt_update(void); #else -static inline bool apic_id_is_primary_thread(unsigned int id) { return false; } static inline void apic_smt_update(void) { } #endif diff --git a/arch/x86/include/asm/apicdef.h b/arch/x86/include/asm/apicdef.h index 92035eb3afee..094106b6a538 100644 --- a/arch/x86/include/asm/apicdef.h +++ b/arch/x86/include/asm/apicdef.h @@ -2,6 +2,8 @@ #ifndef _ASM_X86_APICDEF_H #define _ASM_X86_APICDEF_H +#include <linux/bits.h> + /* * Constants for various Intel APICs. (local APIC, IOAPIC, etc.) 
* @@ -18,6 +20,13 @@ */ #define IO_APIC_SLOT_SIZE 1024 +#define APIC_DELIVERY_MODE_FIXED 0 +#define APIC_DELIVERY_MODE_LOWESTPRIO 1 +#define APIC_DELIVERY_MODE_SMI 2 +#define APIC_DELIVERY_MODE_NMI 4 +#define APIC_DELIVERY_MODE_INIT 5 +#define APIC_DELIVERY_MODE_EXTINT 7 + #define APIC_ID 0x20 #define APIC_LVR 0x30 @@ -89,8 +98,8 @@ #define APIC_DM_EXTINT 0x00700 #define APIC_VECTOR_MASK 0x000FF #define APIC_ICR2 0x310 -#define GET_APIC_DEST_FIELD(x) (((x) >> 24) & 0xFF) -#define SET_APIC_DEST_FIELD(x) ((x) << 24) +#define GET_XAPIC_DEST_FIELD(x) (((x) >> 24) & 0xFF) +#define SET_XAPIC_DEST_FIELD(x) ((x) << 24) #define APIC_LVTT 0x320 #define APIC_LVTTHMR 0x330 #define APIC_LVTPC 0x340 @@ -138,9 +147,10 @@ #define APIC_EILVT_MASKED (1 << 16) #define APIC_BASE (fix_to_virt(FIX_APIC_BASE)) -#define APIC_BASE_MSR 0x800 -#define XAPIC_ENABLE (1UL << 11) -#define X2APIC_ENABLE (1UL << 10) +#define APIC_BASE_MSR 0x800 +#define APIC_X2APIC_ID_MSR 0x802 +#define XAPIC_ENABLE BIT(11) +#define X2APIC_ENABLE BIT(10) #ifdef CONFIG_X86_32 # define MAX_IO_APICS 64 @@ -162,277 +172,10 @@ #define APIC_CPUID(apicid) ((apicid) & XAPIC_DEST_CPUS_MASK) #define NUM_APIC_CLUSTERS ((BAD_APICID + 1) >> XAPIC_DEST_CPUS_SHIFT) -/* - * the local APIC register structure, memory mapped. Not terribly well - * tested, but we might eventually use this one in the future - the - * problem why we cannot use it right now is the P5 APIC, it has an - * errata which cannot take 8-bit reads and writes, only 32-bit ones ... 
- */ -#define u32 unsigned int - -struct local_apic { - -/*000*/ struct { u32 __reserved[4]; } __reserved_01; - -/*010*/ struct { u32 __reserved[4]; } __reserved_02; - -/*020*/ struct { /* APIC ID Register */ - u32 __reserved_1 : 24, - phys_apic_id : 4, - __reserved_2 : 4; - u32 __reserved[3]; - } id; - -/*030*/ const - struct { /* APIC Version Register */ - u32 version : 8, - __reserved_1 : 8, - max_lvt : 8, - __reserved_2 : 8; - u32 __reserved[3]; - } version; - -/*040*/ struct { u32 __reserved[4]; } __reserved_03; - -/*050*/ struct { u32 __reserved[4]; } __reserved_04; - -/*060*/ struct { u32 __reserved[4]; } __reserved_05; - -/*070*/ struct { u32 __reserved[4]; } __reserved_06; - -/*080*/ struct { /* Task Priority Register */ - u32 priority : 8, - __reserved_1 : 24; - u32 __reserved_2[3]; - } tpr; - -/*090*/ const - struct { /* Arbitration Priority Register */ - u32 priority : 8, - __reserved_1 : 24; - u32 __reserved_2[3]; - } apr; - -/*0A0*/ const - struct { /* Processor Priority Register */ - u32 priority : 8, - __reserved_1 : 24; - u32 __reserved_2[3]; - } ppr; - -/*0B0*/ struct { /* End Of Interrupt Register */ - u32 eoi; - u32 __reserved[3]; - } eoi; - -/*0C0*/ struct { u32 __reserved[4]; } __reserved_07; - -/*0D0*/ struct { /* Logical Destination Register */ - u32 __reserved_1 : 24, - logical_dest : 8; - u32 __reserved_2[3]; - } ldr; - -/*0E0*/ struct { /* Destination Format Register */ - u32 __reserved_1 : 28, - model : 4; - u32 __reserved_2[3]; - } dfr; - -/*0F0*/ struct { /* Spurious Interrupt Vector Register */ - u32 spurious_vector : 8, - apic_enabled : 1, - focus_cpu : 1, - __reserved_2 : 22; - u32 __reserved_3[3]; - } svr; - -/*100*/ struct { /* In Service Register */ -/*170*/ u32 bitfield; - u32 __reserved[3]; - } isr [8]; - -/*180*/ struct { /* Trigger Mode Register */ -/*1F0*/ u32 bitfield; - u32 __reserved[3]; - } tmr [8]; - -/*200*/ struct { /* Interrupt Request Register */ -/*270*/ u32 bitfield; - u32 __reserved[3]; - } irr [8]; - -/*280*/ 
union { /* Error Status Register */ - struct { - u32 send_cs_error : 1, - receive_cs_error : 1, - send_accept_error : 1, - receive_accept_error : 1, - __reserved_1 : 1, - send_illegal_vector : 1, - receive_illegal_vector : 1, - illegal_register_address : 1, - __reserved_2 : 24; - u32 __reserved_3[3]; - } error_bits; - struct { - u32 errors; - u32 __reserved_3[3]; - } all_errors; - } esr; - -/*290*/ struct { u32 __reserved[4]; } __reserved_08; - -/*2A0*/ struct { u32 __reserved[4]; } __reserved_09; - -/*2B0*/ struct { u32 __reserved[4]; } __reserved_10; - -/*2C0*/ struct { u32 __reserved[4]; } __reserved_11; - -/*2D0*/ struct { u32 __reserved[4]; } __reserved_12; - -/*2E0*/ struct { u32 __reserved[4]; } __reserved_13; - -/*2F0*/ struct { u32 __reserved[4]; } __reserved_14; - -/*300*/ struct { /* Interrupt Command Register 1 */ - u32 vector : 8, - delivery_mode : 3, - destination_mode : 1, - delivery_status : 1, - __reserved_1 : 1, - level : 1, - trigger : 1, - __reserved_2 : 2, - shorthand : 2, - __reserved_3 : 12; - u32 __reserved_4[3]; - } icr1; - -/*310*/ struct { /* Interrupt Command Register 2 */ - union { - u32 __reserved_1 : 24, - phys_dest : 4, - __reserved_2 : 4; - u32 __reserved_3 : 24, - logical_dest : 8; - } dest; - u32 __reserved_4[3]; - } icr2; - -/*320*/ struct { /* LVT - Timer */ - u32 vector : 8, - __reserved_1 : 4, - delivery_status : 1, - __reserved_2 : 3, - mask : 1, - timer_mode : 1, - __reserved_3 : 14; - u32 __reserved_4[3]; - } lvt_timer; - -/*330*/ struct { /* LVT - Thermal Sensor */ - u32 vector : 8, - delivery_mode : 3, - __reserved_1 : 1, - delivery_status : 1, - __reserved_2 : 3, - mask : 1, - __reserved_3 : 15; - u32 __reserved_4[3]; - } lvt_thermal; - -/*340*/ struct { /* LVT - Performance Counter */ - u32 vector : 8, - delivery_mode : 3, - __reserved_1 : 1, - delivery_status : 1, - __reserved_2 : 3, - mask : 1, - __reserved_3 : 15; - u32 __reserved_4[3]; - } lvt_pc; - -/*350*/ struct { /* LVT - LINT0 */ - u32 vector : 8, - 
delivery_mode : 3, - __reserved_1 : 1, - delivery_status : 1, - polarity : 1, - remote_irr : 1, - trigger : 1, - mask : 1, - __reserved_2 : 15; - u32 __reserved_3[3]; - } lvt_lint0; - -/*360*/ struct { /* LVT - LINT1 */ - u32 vector : 8, - delivery_mode : 3, - __reserved_1 : 1, - delivery_status : 1, - polarity : 1, - remote_irr : 1, - trigger : 1, - mask : 1, - __reserved_2 : 15; - u32 __reserved_3[3]; - } lvt_lint1; - -/*370*/ struct { /* LVT - Error */ - u32 vector : 8, - __reserved_1 : 4, - delivery_status : 1, - __reserved_2 : 3, - mask : 1, - __reserved_3 : 15; - u32 __reserved_4[3]; - } lvt_error; - -/*380*/ struct { /* Timer Initial Count Register */ - u32 initial_count; - u32 __reserved_2[3]; - } timer_icr; - -/*390*/ const - struct { /* Timer Current Count Register */ - u32 curr_count; - u32 __reserved_2[3]; - } timer_ccr; - -/*3A0*/ struct { u32 __reserved[4]; } __reserved_16; - -/*3B0*/ struct { u32 __reserved[4]; } __reserved_17; - -/*3C0*/ struct { u32 __reserved[4]; } __reserved_18; - -/*3D0*/ struct { u32 __reserved[4]; } __reserved_19; - -/*3E0*/ struct { /* Timer Divide Configuration Register */ - u32 divisor : 4, - __reserved_1 : 28; - u32 __reserved_2[3]; - } timer_dcr; - -/*3F0*/ struct { u32 __reserved[4]; } __reserved_20; - -} __attribute__ ((packed)); - -#undef u32 - #ifdef CONFIG_X86_32 #define BAD_APICID 0xFFu #else #define BAD_APICID 0xFFFFu #endif -enum apic_delivery_modes { - APIC_DELIVERY_MODE_FIXED = 0, - APIC_DELIVERY_MODE_LOWESTPRIO = 1, - APIC_DELIVERY_MODE_SMI = 2, - APIC_DELIVERY_MODE_NMI = 4, - APIC_DELIVERY_MODE_INIT = 5, - APIC_DELIVERY_MODE_EXTINT = 7, -}; - #endif /* _ASM_X86_APICDEF_H */ diff --git a/arch/x86/include/asm/arch_hweight.h b/arch/x86/include/asm/arch_hweight.h index ba88edd0d58b..b5982b94bdba 100644 --- a/arch/x86/include/asm/arch_hweight.h +++ b/arch/x86/include/asm/arch_hweight.h @@ -16,9 +16,10 @@ static __always_inline unsigned int __arch_hweight32(unsigned int w) { unsigned int res; - asm 
(ALTERNATIVE("call __sw_hweight32", "popcntl %1, %0", X86_FEATURE_POPCNT) - : "="REG_OUT (res) - : REG_IN (w)); + asm_inline (ALTERNATIVE("call __sw_hweight32", + "popcntl %[val], %[cnt]", X86_FEATURE_POPCNT) + : [cnt] "=" REG_OUT (res), ASM_CALL_CONSTRAINT + : [val] REG_IN (w)); return res; } @@ -44,9 +45,10 @@ static __always_inline unsigned long __arch_hweight64(__u64 w) { unsigned long res; - asm (ALTERNATIVE("call __sw_hweight64", "popcntq %1, %0", X86_FEATURE_POPCNT) - : "="REG_OUT (res) - : REG_IN (w)); + asm_inline (ALTERNATIVE("call __sw_hweight64", + "popcntq %[val], %[cnt]", X86_FEATURE_POPCNT) + : [cnt] "=" REG_OUT (res), ASM_CALL_CONSTRAINT + : [val] REG_IN (w)); return res; } diff --git a/arch/x86/include/asm/asm-prototypes.h b/arch/x86/include/asm/asm-prototypes.h index 8f80de627c60..11c6fecc3ad7 100644 --- a/arch/x86/include/asm/asm-prototypes.h +++ b/arch/x86/include/asm/asm-prototypes.h @@ -12,8 +12,14 @@ #include <asm/special_insns.h> #include <asm/preempt.h> #include <asm/asm.h> +#include <asm/fred.h> +#include <asm/gsseg.h> +#include <asm/nospec-branch.h> -#ifndef CONFIG_X86_CMPXCHG64 +#ifndef CONFIG_X86_CX8 extern void cmpxchg8b_emu(void); #endif +#ifdef CONFIG_STACKPROTECTOR +extern unsigned long __ref_stack_chk_guard; +#endif diff --git a/arch/x86/include/asm/asm.h b/arch/x86/include/asm/asm.h index fbcfec4dc4cc..f963848024a5 100644 --- a/arch/x86/include/asm/asm.h +++ b/arch/x86/include/asm/asm.h @@ -2,7 +2,7 @@ #ifndef _ASM_X86_ASM_H #define _ASM_X86_ASM_H -#ifdef __ASSEMBLY__ +#ifdef __ASSEMBLER__ # define __ASM_FORM(x, ...) x,## __VA_ARGS__ # define __ASM_FORM_RAW(x, ...) x,## __VA_ARGS__ # define __ASM_FORM_COMMA(x, ...) x,## __VA_ARGS__, @@ -113,6 +113,15 @@ #endif +#ifndef __ASSEMBLER__ +static __always_inline __pure void *rip_rel_ptr(void *p) +{ + asm("leaq %c1(%%rip), %0" : "=r"(p) : "i"(p)); + + return p; +} +#endif + /* * Macros to generate condition code outputs from inline assembly, * The output operand must be type "bool". 
@@ -130,7 +139,7 @@ # include <asm/extable_fixup_types.h> /* Exception table entry */ -#ifdef __ASSEMBLY__ +#ifdef __ASSEMBLER__ # define _ASM_EXTABLE_TYPE(from, to, type) \ .pushsection "__ex_table","a" ; \ @@ -150,7 +159,7 @@ # define _ASM_NOKPROBE(entry) # endif -#else /* ! __ASSEMBLY__ */ +#else /* ! __ASSEMBLER__ */ # define DEFINE_EXTABLE_TYPE_REG \ ".macro extable_type_reg type:req reg:req\n" \ @@ -199,6 +208,17 @@ /* For C file, we already have NOKPROBE_SYMBOL macro */ +/* Insert a comma if args are non-empty */ +#define COMMA(x...) __COMMA(x) +#define __COMMA(...) , ##__VA_ARGS__ + +/* + * Combine multiple asm inline constraint args into a single arg for passing to + * another macro. + */ +#define ASM_OUTPUT(x...) x +#define ASM_INPUT(x...) x + /* * This output constraint should be used for any inline asm which has a "call" * instruction. Otherwise the asm may be inserted before the frame pointer @@ -207,7 +227,7 @@ */ register unsigned long current_stack_pointer asm(_ASM_SP); #define ASM_CALL_CONSTRAINT "+r" (current_stack_pointer) -#endif /* __ASSEMBLY__ */ +#endif /* __ASSEMBLER__ */ #define _ASM_EXTABLE(from, to) \ _ASM_EXTABLE_TYPE(from, to, EX_TYPE_DEFAULT) @@ -215,11 +235,27 @@ register unsigned long current_stack_pointer asm(_ASM_SP); #define _ASM_EXTABLE_UA(from, to) \ _ASM_EXTABLE_TYPE(from, to, EX_TYPE_UACCESS) -#define _ASM_EXTABLE_CPY(from, to) \ - _ASM_EXTABLE_TYPE(from, to, EX_TYPE_COPY) - #define _ASM_EXTABLE_FAULT(from, to) \ _ASM_EXTABLE_TYPE(from, to, EX_TYPE_FAULT) +/* + * Both i386 and x86_64 returns 64-bit values in edx:eax for certain + * instructions, but GCC's "A" constraint has different meanings. + * For i386, "A" means exactly edx:eax, while for x86_64 it + * means rax *or* rdx. 
+ * + * These helpers wrapping these semantic differences save one instruction + * clearing the high half of 'low': + */ +#ifdef CONFIG_X86_64 +# define EAX_EDX_DECLARE_ARGS(val, low, high) unsigned long low, high +# define EAX_EDX_VAL(val, low, high) ((low) | (high) << 32) +# define EAX_EDX_RET(val, low, high) "=a" (low), "=d" (high) +#else +# define EAX_EDX_DECLARE_ARGS(val, low, high) u64 val +# define EAX_EDX_VAL(val, low, high) (val) +# define EAX_EDX_RET(val, low, high) "=A" (val) +#endif + #endif /* __KERNEL__ */ #endif /* _ASM_X86_ASM_H */ diff --git a/arch/x86/include/asm/atomic.h b/arch/x86/include/asm/atomic.h index 5e754e895767..75743f1dfd4e 100644 --- a/arch/x86/include/asm/atomic.h +++ b/arch/x86/include/asm/atomic.h @@ -14,12 +14,6 @@ * resource counting etc.. */ -/** - * arch_atomic_read - read atomic variable - * @v: pointer of type atomic_t - * - * Atomically reads the value of @v. - */ static __always_inline int arch_atomic_read(const atomic_t *v) { /* @@ -29,155 +23,70 @@ static __always_inline int arch_atomic_read(const atomic_t *v) return __READ_ONCE((v)->counter); } -/** - * arch_atomic_set - set atomic variable - * @v: pointer of type atomic_t - * @i: required value - * - * Atomically sets the value of @v to @i. - */ static __always_inline void arch_atomic_set(atomic_t *v, int i) { __WRITE_ONCE(v->counter, i); } -/** - * arch_atomic_add - add integer to atomic variable - * @i: integer value to add - * @v: pointer of type atomic_t - * - * Atomically adds @i to @v. - */ static __always_inline void arch_atomic_add(int i, atomic_t *v) { - asm volatile(LOCK_PREFIX "addl %1,%0" + asm_inline volatile(LOCK_PREFIX "addl %1, %0" : "+m" (v->counter) : "ir" (i) : "memory"); } -/** - * arch_atomic_sub - subtract integer from atomic variable - * @i: integer value to subtract - * @v: pointer of type atomic_t - * - * Atomically subtracts @i from @v. 
- */ static __always_inline void arch_atomic_sub(int i, atomic_t *v) { - asm volatile(LOCK_PREFIX "subl %1,%0" + asm_inline volatile(LOCK_PREFIX "subl %1, %0" : "+m" (v->counter) : "ir" (i) : "memory"); } -/** - * arch_atomic_sub_and_test - subtract value from variable and test result - * @i: integer value to subtract - * @v: pointer of type atomic_t - * - * Atomically subtracts @i from @v and returns - * true if the result is zero, or false for all - * other cases. - */ static __always_inline bool arch_atomic_sub_and_test(int i, atomic_t *v) { return GEN_BINARY_RMWcc(LOCK_PREFIX "subl", v->counter, e, "er", i); } #define arch_atomic_sub_and_test arch_atomic_sub_and_test -/** - * arch_atomic_inc - increment atomic variable - * @v: pointer of type atomic_t - * - * Atomically increments @v by 1. - */ static __always_inline void arch_atomic_inc(atomic_t *v) { - asm volatile(LOCK_PREFIX "incl %0" + asm_inline volatile(LOCK_PREFIX "incl %0" : "+m" (v->counter) :: "memory"); } #define arch_atomic_inc arch_atomic_inc -/** - * arch_atomic_dec - decrement atomic variable - * @v: pointer of type atomic_t - * - * Atomically decrements @v by 1. - */ static __always_inline void arch_atomic_dec(atomic_t *v) { - asm volatile(LOCK_PREFIX "decl %0" + asm_inline volatile(LOCK_PREFIX "decl %0" : "+m" (v->counter) :: "memory"); } #define arch_atomic_dec arch_atomic_dec -/** - * arch_atomic_dec_and_test - decrement and test - * @v: pointer of type atomic_t - * - * Atomically decrements @v by 1 and - * returns true if the result is 0, or false for all other - * cases. - */ static __always_inline bool arch_atomic_dec_and_test(atomic_t *v) { return GEN_UNARY_RMWcc(LOCK_PREFIX "decl", v->counter, e); } #define arch_atomic_dec_and_test arch_atomic_dec_and_test -/** - * arch_atomic_inc_and_test - increment and test - * @v: pointer of type atomic_t - * - * Atomically increments @v by 1 - * and returns true if the result is zero, or false for all - * other cases. 
- */ static __always_inline bool arch_atomic_inc_and_test(atomic_t *v) { return GEN_UNARY_RMWcc(LOCK_PREFIX "incl", v->counter, e); } #define arch_atomic_inc_and_test arch_atomic_inc_and_test -/** - * arch_atomic_add_negative - add and test if negative - * @i: integer value to add - * @v: pointer of type atomic_t - * - * Atomically adds @i to @v and returns true - * if the result is negative, or false when - * result is greater than or equal to zero. - */ static __always_inline bool arch_atomic_add_negative(int i, atomic_t *v) { return GEN_BINARY_RMWcc(LOCK_PREFIX "addl", v->counter, s, "er", i); } #define arch_atomic_add_negative arch_atomic_add_negative -/** - * arch_atomic_add_return - add integer and return - * @i: integer value to add - * @v: pointer of type atomic_t - * - * Atomically adds @i to @v and returns @i + @v - */ static __always_inline int arch_atomic_add_return(int i, atomic_t *v) { return i + xadd(&v->counter, i); } #define arch_atomic_add_return arch_atomic_add_return -/** - * arch_atomic_sub_return - subtract integer and return - * @v: pointer of type atomic_t - * @i: integer value to subtract - * - * Atomically subtracts @i from @v and returns @v - @i - */ -static __always_inline int arch_atomic_sub_return(int i, atomic_t *v) -{ - return arch_atomic_add_return(-i, v); -} -#define arch_atomic_sub_return arch_atomic_sub_return +#define arch_atomic_sub_return(i, v) arch_atomic_add_return(-(i), v) static __always_inline int arch_atomic_fetch_add(int i, atomic_t *v) { @@ -185,11 +94,7 @@ static __always_inline int arch_atomic_fetch_add(int i, atomic_t *v) } #define arch_atomic_fetch_add arch_atomic_fetch_add -static __always_inline int arch_atomic_fetch_sub(int i, atomic_t *v) -{ - return xadd(&v->counter, -i); -} -#define arch_atomic_fetch_sub arch_atomic_fetch_sub +#define arch_atomic_fetch_sub(i, v) arch_atomic_fetch_add(-(i), v) static __always_inline int arch_atomic_cmpxchg(atomic_t *v, int old, int new) { @@ -211,7 +116,7 @@ static 
__always_inline int arch_atomic_xchg(atomic_t *v, int new) static __always_inline void arch_atomic_and(int i, atomic_t *v) { - asm volatile(LOCK_PREFIX "andl %1,%0" + asm_inline volatile(LOCK_PREFIX "andl %1, %0" : "+m" (v->counter) : "ir" (i) : "memory"); @@ -229,7 +134,7 @@ static __always_inline int arch_atomic_fetch_and(int i, atomic_t *v) static __always_inline void arch_atomic_or(int i, atomic_t *v) { - asm volatile(LOCK_PREFIX "orl %1,%0" + asm_inline volatile(LOCK_PREFIX "orl %1, %0" : "+m" (v->counter) : "ir" (i) : "memory"); @@ -247,7 +152,7 @@ static __always_inline int arch_atomic_fetch_or(int i, atomic_t *v) static __always_inline void arch_atomic_xor(int i, atomic_t *v) { - asm volatile(LOCK_PREFIX "xorl %1,%0" + asm_inline volatile(LOCK_PREFIX "xorl %1, %0" : "+m" (v->counter) : "ir" (i) : "memory"); diff --git a/arch/x86/include/asm/atomic64_32.h b/arch/x86/include/asm/atomic64_32.h index 5efd01b548d1..ab838205c1c6 100644 --- a/arch/x86/include/asm/atomic64_32.h +++ b/arch/x86/include/asm/atomic64_32.h @@ -14,6 +14,32 @@ typedef struct { #define ATOMIC64_INIT(val) { (val) } +/* + * Read an atomic64_t non-atomically. + * + * This is intended to be used in cases where a subsequent atomic operation + * will handle the torn value, and can be used to prime the first iteration + * of unconditional try_cmpxchg() loops, e.g.: + * + * s64 val = arch_atomic64_read_nonatomic(v); + * do { } while (!arch_atomic64_try_cmpxchg(v, &val, val OP i); + * + * This is NOT safe to use where the value is not always checked by a + * subsequent atomic operation, such as in conditional try_cmpxchg() loops + * that can break before the atomic operation, e.g.: + * + * s64 val = arch_atomic64_read_nonatomic(v); + * do { + * if (condition(val)) + * break; + * } while (!arch_atomic64_try_cmpxchg(v, &val, val OP i); + */ +static __always_inline s64 arch_atomic64_read_nonatomic(const atomic64_t *v) +{ + /* See comment in arch_atomic_read(). 
*/ + return __READ_ONCE(v->counter); +} + #define __ATOMIC64_DECL(sym) void atomic64_##sym(atomic64_t *, ...) #ifndef ATOMIC64_EXPORT #define ATOMIC64_DECL_ONE __ATOMIC64_DECL @@ -22,16 +48,20 @@ typedef struct { ATOMIC64_EXPORT(atomic64_##sym) #endif -#ifdef CONFIG_X86_CMPXCHG64 -#define __alternative_atomic64(f, g, out, in...) \ - asm volatile("call %P[func]" \ - : out : [func] "i" (atomic64_##g##_cx8), ## in) +#ifdef CONFIG_X86_CX8 +#define __alternative_atomic64(f, g, out, in, clobbers...) \ + asm volatile("call %c[func]" \ + : ALT_OUTPUT_SP(out) \ + : [func] "i" (atomic64_##g##_cx8) \ + COMMA(in) \ + : clobbers) #define ATOMIC64_DECL(sym) ATOMIC64_DECL_ONE(sym##_cx8) #else -#define __alternative_atomic64(f, g, out, in...) \ - alternative_call(atomic64_##f##_386, atomic64_##g##_cx8, \ - X86_FEATURE_CX8, ASM_OUTPUT2(out), ## in) +#define __alternative_atomic64(f, g, out, in, clobbers...) \ + alternative_call(atomic64_##f##_386, atomic64_##g##_cx8, \ + X86_FEATURE_CX8, ASM_OUTPUT(out), \ + ASM_INPUT(in), clobbers) #define ATOMIC64_DECL(sym) ATOMIC64_DECL_ONE(sym##_cx8); \ ATOMIC64_DECL_ONE(sym##_386) @@ -42,8 +72,8 @@ ATOMIC64_DECL_ONE(inc_386); ATOMIC64_DECL_ONE(dec_386); #endif -#define alternative_atomic64(f, out, in...) \ - __alternative_atomic64(f, f, ASM_OUTPUT2(out), ## in) +#define alternative_atomic64(f, out, in, clobbers...) \ + __alternative_atomic64(f, f, ASM_OUTPUT(out), ASM_INPUT(in), clobbers) ATOMIC64_DECL(read); ATOMIC64_DECL(set); @@ -61,207 +91,154 @@ ATOMIC64_DECL(add_unless); #undef __ATOMIC64_DECL #undef ATOMIC64_EXPORT -/** - * arch_atomic64_cmpxchg - cmpxchg atomic64 variable - * @v: pointer to type atomic64_t - * @o: expected value - * @n: new value - * - * Atomically sets @v to @n if it was equal to @o and returns - * the old value. 
- */ - -static inline s64 arch_atomic64_cmpxchg(atomic64_t *v, s64 o, s64 n) +static __always_inline s64 arch_atomic64_cmpxchg(atomic64_t *v, s64 old, s64 new) { - return arch_cmpxchg64(&v->counter, o, n); + return arch_cmpxchg64(&v->counter, old, new); } #define arch_atomic64_cmpxchg arch_atomic64_cmpxchg -/** - * arch_atomic64_xchg - xchg atomic64 variable - * @v: pointer to type atomic64_t - * @n: value to assign - * - * Atomically xchgs the value of @v to @n and returns - * the old value. - */ -static inline s64 arch_atomic64_xchg(atomic64_t *v, s64 n) +static __always_inline bool arch_atomic64_try_cmpxchg(atomic64_t *v, s64 *old, s64 new) +{ + return arch_try_cmpxchg64(&v->counter, old, new); +} +#define arch_atomic64_try_cmpxchg arch_atomic64_try_cmpxchg + +static __always_inline s64 arch_atomic64_xchg(atomic64_t *v, s64 n) { s64 o; unsigned high = (unsigned)(n >> 32); unsigned low = (unsigned)n; - alternative_atomic64(xchg, "=&A" (o), - "S" (v), "b" (low), "c" (high) - : "memory"); + alternative_atomic64(xchg, + "=&A" (o), + ASM_INPUT("S" (v), "b" (low), "c" (high)), + "memory"); return o; } #define arch_atomic64_xchg arch_atomic64_xchg -/** - * arch_atomic64_set - set atomic64 variable - * @v: pointer to type atomic64_t - * @i: value to assign - * - * Atomically sets the value of @v to @n. - */ -static inline void arch_atomic64_set(atomic64_t *v, s64 i) +static __always_inline void arch_atomic64_set(atomic64_t *v, s64 i) { unsigned high = (unsigned)(i >> 32); unsigned low = (unsigned)i; - alternative_atomic64(set, /* no output */, - "S" (v), "b" (low), "c" (high) - : "eax", "edx", "memory"); + alternative_atomic64(set, + /* no output */, + ASM_INPUT("S" (v), "b" (low), "c" (high)), + "eax", "edx", "memory"); } -/** - * arch_atomic64_read - read atomic64 variable - * @v: pointer to type atomic64_t - * - * Atomically reads the value of @v and returns it. 
- */ -static inline s64 arch_atomic64_read(const atomic64_t *v) +static __always_inline s64 arch_atomic64_read(const atomic64_t *v) { s64 r; - alternative_atomic64(read, "=&A" (r), "c" (v) : "memory"); + alternative_atomic64(read, "=&A" (r), "c" (v), "memory"); return r; } -/** - * arch_atomic64_add_return - add and return - * @i: integer value to add - * @v: pointer to type atomic64_t - * - * Atomically adds @i to @v and returns @i + *@v - */ -static inline s64 arch_atomic64_add_return(s64 i, atomic64_t *v) +static __always_inline s64 arch_atomic64_add_return(s64 i, atomic64_t *v) { alternative_atomic64(add_return, - ASM_OUTPUT2("+A" (i), "+c" (v)), - ASM_NO_INPUT_CLOBBER("memory")); + ASM_OUTPUT("+A" (i), "+c" (v)), + /* no input */, + "memory"); return i; } #define arch_atomic64_add_return arch_atomic64_add_return -/* - * Other variants with different arithmetic operators: - */ -static inline s64 arch_atomic64_sub_return(s64 i, atomic64_t *v) +static __always_inline s64 arch_atomic64_sub_return(s64 i, atomic64_t *v) { alternative_atomic64(sub_return, - ASM_OUTPUT2("+A" (i), "+c" (v)), - ASM_NO_INPUT_CLOBBER("memory")); + ASM_OUTPUT("+A" (i), "+c" (v)), + /* no input */, + "memory"); return i; } #define arch_atomic64_sub_return arch_atomic64_sub_return -static inline s64 arch_atomic64_inc_return(atomic64_t *v) +static __always_inline s64 arch_atomic64_inc_return(atomic64_t *v) { s64 a; - alternative_atomic64(inc_return, "=&A" (a), - "S" (v) : "memory", "ecx"); + alternative_atomic64(inc_return, + "=&A" (a), + "S" (v), + "memory", "ecx"); return a; } #define arch_atomic64_inc_return arch_atomic64_inc_return -static inline s64 arch_atomic64_dec_return(atomic64_t *v) +static __always_inline s64 arch_atomic64_dec_return(atomic64_t *v) { s64 a; - alternative_atomic64(dec_return, "=&A" (a), - "S" (v) : "memory", "ecx"); + alternative_atomic64(dec_return, + "=&A" (a), + "S" (v), + "memory", "ecx"); return a; } #define arch_atomic64_dec_return arch_atomic64_dec_return 
-/** - * arch_atomic64_add - add integer to atomic64 variable - * @i: integer value to add - * @v: pointer to type atomic64_t - * - * Atomically adds @i to @v. - */ -static inline s64 arch_atomic64_add(s64 i, atomic64_t *v) +static __always_inline void arch_atomic64_add(s64 i, atomic64_t *v) { __alternative_atomic64(add, add_return, - ASM_OUTPUT2("+A" (i), "+c" (v)), - ASM_NO_INPUT_CLOBBER("memory")); - return i; + ASM_OUTPUT("+A" (i), "+c" (v)), + /* no input */, + "memory"); } -/** - * arch_atomic64_sub - subtract the atomic64 variable - * @i: integer value to subtract - * @v: pointer to type atomic64_t - * - * Atomically subtracts @i from @v. - */ -static inline s64 arch_atomic64_sub(s64 i, atomic64_t *v) +static __always_inline void arch_atomic64_sub(s64 i, atomic64_t *v) { __alternative_atomic64(sub, sub_return, - ASM_OUTPUT2("+A" (i), "+c" (v)), - ASM_NO_INPUT_CLOBBER("memory")); - return i; + ASM_OUTPUT("+A" (i), "+c" (v)), + /* no input */, + "memory"); } -/** - * arch_atomic64_inc - increment atomic64 variable - * @v: pointer to type atomic64_t - * - * Atomically increments @v by 1. - */ -static inline void arch_atomic64_inc(atomic64_t *v) +static __always_inline void arch_atomic64_inc(atomic64_t *v) { - __alternative_atomic64(inc, inc_return, /* no output */, - "S" (v) : "memory", "eax", "ecx", "edx"); + __alternative_atomic64(inc, inc_return, + /* no output */, + "S" (v), + "memory", "eax", "ecx", "edx"); } #define arch_atomic64_inc arch_atomic64_inc -/** - * arch_atomic64_dec - decrement atomic64 variable - * @v: pointer to type atomic64_t - * - * Atomically decrements @v by 1. 
- */ -static inline void arch_atomic64_dec(atomic64_t *v) +static __always_inline void arch_atomic64_dec(atomic64_t *v) { - __alternative_atomic64(dec, dec_return, /* no output */, - "S" (v) : "memory", "eax", "ecx", "edx"); + __alternative_atomic64(dec, dec_return, + /* no output */, + "S" (v), + "memory", "eax", "ecx", "edx"); } #define arch_atomic64_dec arch_atomic64_dec -/** - * arch_atomic64_add_unless - add unless the number is a given value - * @v: pointer of type atomic64_t - * @a: the amount to add to v... - * @u: ...unless v is equal to u. - * - * Atomically adds @a to @v, so long as it was not @u. - * Returns non-zero if the add was done, zero otherwise. - */ -static inline int arch_atomic64_add_unless(atomic64_t *v, s64 a, s64 u) +static __always_inline int arch_atomic64_add_unless(atomic64_t *v, s64 a, s64 u) { unsigned low = (unsigned)u; unsigned high = (unsigned)(u >> 32); alternative_atomic64(add_unless, - ASM_OUTPUT2("+A" (a), "+c" (low), "+D" (high)), - "S" (v) : "memory"); + ASM_OUTPUT("+A" (a), "+c" (low), "+D" (high)), + "S" (v), + "memory"); return (int)a; } #define arch_atomic64_add_unless arch_atomic64_add_unless -static inline int arch_atomic64_inc_not_zero(atomic64_t *v) +static __always_inline int arch_atomic64_inc_not_zero(atomic64_t *v) { int r; - alternative_atomic64(inc_not_zero, "=&a" (r), - "S" (v) : "ecx", "edx", "memory"); + alternative_atomic64(inc_not_zero, + "=&a" (r), + "S" (v), + "ecx", "edx", "memory"); return r; } #define arch_atomic64_inc_not_zero arch_atomic64_inc_not_zero -static inline s64 arch_atomic64_dec_if_positive(atomic64_t *v) +static __always_inline s64 arch_atomic64_dec_if_positive(atomic64_t *v) { s64 r; - alternative_atomic64(dec_if_positive, "=&A" (r), - "S" (v) : "ecx", "memory"); + alternative_atomic64(dec_if_positive, + "=&A" (r), + "S" (v), + "ecx", "memory"); return r; } #define arch_atomic64_dec_if_positive arch_atomic64_dec_if_positive @@ -269,71 +246,64 @@ static inline s64 
arch_atomic64_dec_if_positive(atomic64_t *v) #undef alternative_atomic64 #undef __alternative_atomic64 -static inline void arch_atomic64_and(s64 i, atomic64_t *v) +static __always_inline void arch_atomic64_and(s64 i, atomic64_t *v) { - s64 old, c = 0; + s64 val = arch_atomic64_read_nonatomic(v); - while ((old = arch_atomic64_cmpxchg(v, c, c & i)) != c) - c = old; + do { } while (!arch_atomic64_try_cmpxchg(v, &val, val & i)); } -static inline s64 arch_atomic64_fetch_and(s64 i, atomic64_t *v) +static __always_inline s64 arch_atomic64_fetch_and(s64 i, atomic64_t *v) { - s64 old, c = 0; + s64 val = arch_atomic64_read_nonatomic(v); - while ((old = arch_atomic64_cmpxchg(v, c, c & i)) != c) - c = old; + do { } while (!arch_atomic64_try_cmpxchg(v, &val, val & i)); - return old; + return val; } #define arch_atomic64_fetch_and arch_atomic64_fetch_and -static inline void arch_atomic64_or(s64 i, atomic64_t *v) +static __always_inline void arch_atomic64_or(s64 i, atomic64_t *v) { - s64 old, c = 0; + s64 val = arch_atomic64_read_nonatomic(v); - while ((old = arch_atomic64_cmpxchg(v, c, c | i)) != c) - c = old; + do { } while (!arch_atomic64_try_cmpxchg(v, &val, val | i)); } -static inline s64 arch_atomic64_fetch_or(s64 i, atomic64_t *v) +static __always_inline s64 arch_atomic64_fetch_or(s64 i, atomic64_t *v) { - s64 old, c = 0; + s64 val = arch_atomic64_read_nonatomic(v); - while ((old = arch_atomic64_cmpxchg(v, c, c | i)) != c) - c = old; + do { } while (!arch_atomic64_try_cmpxchg(v, &val, val | i)); - return old; + return val; } #define arch_atomic64_fetch_or arch_atomic64_fetch_or -static inline void arch_atomic64_xor(s64 i, atomic64_t *v) +static __always_inline void arch_atomic64_xor(s64 i, atomic64_t *v) { - s64 old, c = 0; + s64 val = arch_atomic64_read_nonatomic(v); - while ((old = arch_atomic64_cmpxchg(v, c, c ^ i)) != c) - c = old; + do { } while (!arch_atomic64_try_cmpxchg(v, &val, val ^ i)); } -static inline s64 arch_atomic64_fetch_xor(s64 i, atomic64_t *v) +static 
__always_inline s64 arch_atomic64_fetch_xor(s64 i, atomic64_t *v) { - s64 old, c = 0; + s64 val = arch_atomic64_read_nonatomic(v); - while ((old = arch_atomic64_cmpxchg(v, c, c ^ i)) != c) - c = old; + do { } while (!arch_atomic64_try_cmpxchg(v, &val, val ^ i)); - return old; + return val; } #define arch_atomic64_fetch_xor arch_atomic64_fetch_xor -static inline s64 arch_atomic64_fetch_add(s64 i, atomic64_t *v) +static __always_inline s64 arch_atomic64_fetch_add(s64 i, atomic64_t *v) { - s64 old, c = 0; + s64 val = arch_atomic64_read_nonatomic(v); - while ((old = arch_atomic64_cmpxchg(v, c, c + i)) != c) - c = old; + do { } while (!arch_atomic64_try_cmpxchg(v, &val, val + i)); - return old; + return val; } #define arch_atomic64_fetch_add arch_atomic64_fetch_add diff --git a/arch/x86/include/asm/atomic64_64.h b/arch/x86/include/asm/atomic64_64.h index 7886d0578fc9..87b496325b5b 100644 --- a/arch/x86/include/asm/atomic64_64.h +++ b/arch/x86/include/asm/atomic64_64.h @@ -10,176 +10,87 @@ #define ATOMIC64_INIT(i) { (i) } -/** - * arch_atomic64_read - read atomic64 variable - * @v: pointer of type atomic64_t - * - * Atomically reads the value of @v. - * Doesn't imply a read memory barrier. - */ -static inline s64 arch_atomic64_read(const atomic64_t *v) +static __always_inline s64 arch_atomic64_read(const atomic64_t *v) { return __READ_ONCE((v)->counter); } -/** - * arch_atomic64_set - set atomic64 variable - * @v: pointer to type atomic64_t - * @i: required value - * - * Atomically sets the value of @v to @i. - */ -static inline void arch_atomic64_set(atomic64_t *v, s64 i) +static __always_inline void arch_atomic64_set(atomic64_t *v, s64 i) { __WRITE_ONCE(v->counter, i); } -/** - * arch_atomic64_add - add integer to atomic64 variable - * @i: integer value to add - * @v: pointer to type atomic64_t - * - * Atomically adds @i to @v. 
- */ static __always_inline void arch_atomic64_add(s64 i, atomic64_t *v) { - asm volatile(LOCK_PREFIX "addq %1,%0" + asm_inline volatile(LOCK_PREFIX "addq %1, %0" : "=m" (v->counter) : "er" (i), "m" (v->counter) : "memory"); } -/** - * arch_atomic64_sub - subtract the atomic64 variable - * @i: integer value to subtract - * @v: pointer to type atomic64_t - * - * Atomically subtracts @i from @v. - */ -static inline void arch_atomic64_sub(s64 i, atomic64_t *v) +static __always_inline void arch_atomic64_sub(s64 i, atomic64_t *v) { - asm volatile(LOCK_PREFIX "subq %1,%0" + asm_inline volatile(LOCK_PREFIX "subq %1, %0" : "=m" (v->counter) : "er" (i), "m" (v->counter) : "memory"); } -/** - * arch_atomic64_sub_and_test - subtract value from variable and test result - * @i: integer value to subtract - * @v: pointer to type atomic64_t - * - * Atomically subtracts @i from @v and returns - * true if the result is zero, or false for all - * other cases. - */ -static inline bool arch_atomic64_sub_and_test(s64 i, atomic64_t *v) +static __always_inline bool arch_atomic64_sub_and_test(s64 i, atomic64_t *v) { return GEN_BINARY_RMWcc(LOCK_PREFIX "subq", v->counter, e, "er", i); } #define arch_atomic64_sub_and_test arch_atomic64_sub_and_test -/** - * arch_atomic64_inc - increment atomic64 variable - * @v: pointer to type atomic64_t - * - * Atomically increments @v by 1. - */ static __always_inline void arch_atomic64_inc(atomic64_t *v) { - asm volatile(LOCK_PREFIX "incq %0" + asm_inline volatile(LOCK_PREFIX "incq %0" : "=m" (v->counter) : "m" (v->counter) : "memory"); } #define arch_atomic64_inc arch_atomic64_inc -/** - * arch_atomic64_dec - decrement atomic64 variable - * @v: pointer to type atomic64_t - * - * Atomically decrements @v by 1. 
- */ static __always_inline void arch_atomic64_dec(atomic64_t *v) { - asm volatile(LOCK_PREFIX "decq %0" + asm_inline volatile(LOCK_PREFIX "decq %0" : "=m" (v->counter) : "m" (v->counter) : "memory"); } #define arch_atomic64_dec arch_atomic64_dec -/** - * arch_atomic64_dec_and_test - decrement and test - * @v: pointer to type atomic64_t - * - * Atomically decrements @v by 1 and - * returns true if the result is 0, or false for all other - * cases. - */ -static inline bool arch_atomic64_dec_and_test(atomic64_t *v) +static __always_inline bool arch_atomic64_dec_and_test(atomic64_t *v) { return GEN_UNARY_RMWcc(LOCK_PREFIX "decq", v->counter, e); } #define arch_atomic64_dec_and_test arch_atomic64_dec_and_test -/** - * arch_atomic64_inc_and_test - increment and test - * @v: pointer to type atomic64_t - * - * Atomically increments @v by 1 - * and returns true if the result is zero, or false for all - * other cases. - */ -static inline bool arch_atomic64_inc_and_test(atomic64_t *v) +static __always_inline bool arch_atomic64_inc_and_test(atomic64_t *v) { return GEN_UNARY_RMWcc(LOCK_PREFIX "incq", v->counter, e); } #define arch_atomic64_inc_and_test arch_atomic64_inc_and_test -/** - * arch_atomic64_add_negative - add and test if negative - * @i: integer value to add - * @v: pointer to type atomic64_t - * - * Atomically adds @i to @v and returns true - * if the result is negative, or false when - * result is greater than or equal to zero. 
- */ -static inline bool arch_atomic64_add_negative(s64 i, atomic64_t *v) +static __always_inline bool arch_atomic64_add_negative(s64 i, atomic64_t *v) { return GEN_BINARY_RMWcc(LOCK_PREFIX "addq", v->counter, s, "er", i); } #define arch_atomic64_add_negative arch_atomic64_add_negative -/** - * arch_atomic64_add_return - add and return - * @i: integer value to add - * @v: pointer to type atomic64_t - * - * Atomically adds @i to @v and returns @i + @v - */ static __always_inline s64 arch_atomic64_add_return(s64 i, atomic64_t *v) { return i + xadd(&v->counter, i); } #define arch_atomic64_add_return arch_atomic64_add_return -static inline s64 arch_atomic64_sub_return(s64 i, atomic64_t *v) -{ - return arch_atomic64_add_return(-i, v); -} -#define arch_atomic64_sub_return arch_atomic64_sub_return +#define arch_atomic64_sub_return(i, v) arch_atomic64_add_return(-(i), v) -static inline s64 arch_atomic64_fetch_add(s64 i, atomic64_t *v) +static __always_inline s64 arch_atomic64_fetch_add(s64 i, atomic64_t *v) { return xadd(&v->counter, i); } #define arch_atomic64_fetch_add arch_atomic64_fetch_add -static inline s64 arch_atomic64_fetch_sub(s64 i, atomic64_t *v) -{ - return xadd(&v->counter, -i); -} -#define arch_atomic64_fetch_sub arch_atomic64_fetch_sub +#define arch_atomic64_fetch_sub(i, v) arch_atomic64_fetch_add(-(i), v) -static inline s64 arch_atomic64_cmpxchg(atomic64_t *v, s64 old, s64 new) +static __always_inline s64 arch_atomic64_cmpxchg(atomic64_t *v, s64 old, s64 new) { return arch_cmpxchg(&v->counter, old, new); } @@ -191,21 +102,21 @@ static __always_inline bool arch_atomic64_try_cmpxchg(atomic64_t *v, s64 *old, s } #define arch_atomic64_try_cmpxchg arch_atomic64_try_cmpxchg -static inline s64 arch_atomic64_xchg(atomic64_t *v, s64 new) +static __always_inline s64 arch_atomic64_xchg(atomic64_t *v, s64 new) { return arch_xchg(&v->counter, new); } #define arch_atomic64_xchg arch_atomic64_xchg -static inline void arch_atomic64_and(s64 i, atomic64_t *v) +static 
__always_inline void arch_atomic64_and(s64 i, atomic64_t *v) { - asm volatile(LOCK_PREFIX "andq %1,%0" + asm_inline volatile(LOCK_PREFIX "andq %1, %0" : "+m" (v->counter) : "er" (i) : "memory"); } -static inline s64 arch_atomic64_fetch_and(s64 i, atomic64_t *v) +static __always_inline s64 arch_atomic64_fetch_and(s64 i, atomic64_t *v) { s64 val = arch_atomic64_read(v); @@ -215,15 +126,15 @@ static inline s64 arch_atomic64_fetch_and(s64 i, atomic64_t *v) } #define arch_atomic64_fetch_and arch_atomic64_fetch_and -static inline void arch_atomic64_or(s64 i, atomic64_t *v) +static __always_inline void arch_atomic64_or(s64 i, atomic64_t *v) { - asm volatile(LOCK_PREFIX "orq %1,%0" + asm_inline volatile(LOCK_PREFIX "orq %1, %0" : "+m" (v->counter) : "er" (i) : "memory"); } -static inline s64 arch_atomic64_fetch_or(s64 i, atomic64_t *v) +static __always_inline s64 arch_atomic64_fetch_or(s64 i, atomic64_t *v) { s64 val = arch_atomic64_read(v); @@ -233,15 +144,15 @@ static inline s64 arch_atomic64_fetch_or(s64 i, atomic64_t *v) } #define arch_atomic64_fetch_or arch_atomic64_fetch_or -static inline void arch_atomic64_xor(s64 i, atomic64_t *v) +static __always_inline void arch_atomic64_xor(s64 i, atomic64_t *v) { - asm volatile(LOCK_PREFIX "xorq %1,%0" + asm_inline volatile(LOCK_PREFIX "xorq %1, %0" : "+m" (v->counter) : "er" (i) : "memory"); } -static inline s64 arch_atomic64_fetch_xor(s64 i, atomic64_t *v) +static __always_inline s64 arch_atomic64_fetch_xor(s64 i, atomic64_t *v) { s64 val = arch_atomic64_read(v); diff --git a/arch/x86/include/asm/audit.h b/arch/x86/include/asm/audit.h index 36aec57ea7a3..fa918f01333e 100644 --- a/arch/x86/include/asm/audit.h +++ b/arch/x86/include/asm/audit.h @@ -4,4 +4,11 @@ int ia32_classify_syscall(unsigned int syscall); +extern unsigned ia32_dir_class[]; +extern unsigned ia32_write_class[]; +extern unsigned ia32_read_class[]; +extern unsigned ia32_chattr_class[]; +extern unsigned ia32_signal_class[]; + + #endif /* _ASM_X86_AUDIT_H */ diff 
--git a/arch/x86/include/asm/barrier.h b/arch/x86/include/asm/barrier.h index 35389b2af88e..db70832232d4 100644 --- a/arch/x86/include/asm/barrier.h +++ b/arch/x86/include/asm/barrier.h @@ -12,11 +12,11 @@ */ #ifdef CONFIG_X86_32 -#define mb() asm volatile(ALTERNATIVE("lock; addl $0,-4(%%esp)", "mfence", \ +#define mb() asm volatile(ALTERNATIVE("lock addl $0,-4(%%esp)", "mfence", \ X86_FEATURE_XMM2) ::: "memory", "cc") -#define rmb() asm volatile(ALTERNATIVE("lock; addl $0,-4(%%esp)", "lfence", \ +#define rmb() asm volatile(ALTERNATIVE("lock addl $0,-4(%%esp)", "lfence", \ X86_FEATURE_XMM2) ::: "memory", "cc") -#define wmb() asm volatile(ALTERNATIVE("lock; addl $0,-4(%%esp)", "sfence", \ +#define wmb() asm volatile(ALTERNATIVE("lock addl $0,-4(%%esp)", "sfence", \ X86_FEATURE_XMM2) ::: "memory", "cc") #else #define __mb() asm volatile("mfence":::"memory") @@ -33,20 +33,16 @@ * Returns: * 0 - (index < size) */ -static inline unsigned long array_index_mask_nospec(unsigned long index, - unsigned long size) -{ - unsigned long mask; - - asm volatile ("cmp %1,%2; sbb %0,%0;" - :"=r" (mask) - :"g"(size),"r" (index) - :"cc"); - return mask; -} - -/* Override the default implementation from linux/nospec.h. */ -#define array_index_mask_nospec array_index_mask_nospec +#define array_index_mask_nospec(idx,sz) ({ \ + typeof((idx)+(sz)) __idx = (idx); \ + typeof(__idx) __sz = (sz); \ + unsigned long __mask; \ + asm volatile ("cmp %1,%2; sbb %0,%0" \ + :"=r" (__mask) \ + :ASM_INPUT_G (__sz), \ + "r" (__idx) \ + :"cc"); \ + __mask; }) /* Prevent speculative execution past this barrier. 
*/ #define barrier_nospec() alternative("", "lfence", X86_FEATURE_LFENCE_RDTSC) @@ -54,7 +50,7 @@ static inline unsigned long array_index_mask_nospec(unsigned long index, #define __dma_rmb() barrier() #define __dma_wmb() barrier() -#define __smp_mb() asm volatile("lock; addl $0,-4(%%" _ASM_SP ")" ::: "memory", "cc") +#define __smp_mb() asm volatile("lock addl $0,-4(%%" _ASM_SP ")" ::: "memory", "cc") #define __smp_rmb() dma_rmb() #define __smp_wmb() barrier() @@ -79,24 +75,9 @@ do { \ #define __smp_mb__before_atomic() do { } while (0) #define __smp_mb__after_atomic() do { } while (0) -#include <asm-generic/barrier.h> +/* Writing to CR3 provides a full memory barrier in switch_mm(). */ +#define smp_mb__after_switch_mm() do { } while (0) -/* - * Make previous memory operations globally visible before - * a WRMSR. - * - * MFENCE makes writes visible, but only affects load/store - * instructions. WRMSR is unfortunately not a load/store - * instruction and is unaffected by MFENCE. The LFENCE ensures - * that the WRMSR is not reordered. - * - * Most WRMSRs are full serializing instructions themselves and - * do not require this barrier. This is only required for the - * IA32_TSC_DEADLINE and X2APIC MSRs. 
- */ -static inline void weak_wrmsr_fence(void) -{ - asm volatile("mfence; lfence" : : : "memory"); -} +#include <asm-generic/barrier.h> #endif /* _ASM_X86_BARRIER_H */ diff --git a/arch/x86/include/asm/bitops.h b/arch/x86/include/asm/bitops.h index a288ecd230ab..eebbc8889e70 100644 --- a/arch/x86/include/asm/bitops.h +++ b/arch/x86/include/asm/bitops.h @@ -52,18 +52,18 @@ static __always_inline void arch_set_bit(long nr, volatile unsigned long *addr) { if (__builtin_constant_p(nr)) { - asm volatile(LOCK_PREFIX "orb %b1,%0" + asm_inline volatile(LOCK_PREFIX "orb %b1,%0" : CONST_MASK_ADDR(nr, addr) : "iq" (CONST_MASK(nr)) : "memory"); } else { - asm volatile(LOCK_PREFIX __ASM_SIZE(bts) " %1,%0" + asm_inline volatile(LOCK_PREFIX __ASM_SIZE(bts) " %1,%0" : : RLONG_ADDR(addr), "Ir" (nr) : "memory"); } } static __always_inline void -arch___set_bit(long nr, volatile unsigned long *addr) +arch___set_bit(unsigned long nr, volatile unsigned long *addr) { asm volatile(__ASM_SIZE(bts) " %1,%0" : : ADDR, "Ir" (nr) : "memory"); } @@ -72,11 +72,11 @@ static __always_inline void arch_clear_bit(long nr, volatile unsigned long *addr) { if (__builtin_constant_p(nr)) { - asm volatile(LOCK_PREFIX "andb %b1,%0" + asm_inline volatile(LOCK_PREFIX "andb %b1,%0" : CONST_MASK_ADDR(nr, addr) : "iq" (~CONST_MASK(nr))); } else { - asm volatile(LOCK_PREFIX __ASM_SIZE(btr) " %1,%0" + asm_inline volatile(LOCK_PREFIX __ASM_SIZE(btr) " %1,%0" : : RLONG_ADDR(addr), "Ir" (nr) : "memory"); } } @@ -89,23 +89,22 @@ arch_clear_bit_unlock(long nr, volatile unsigned long *addr) } static __always_inline void -arch___clear_bit(long nr, volatile unsigned long *addr) +arch___clear_bit(unsigned long nr, volatile unsigned long *addr) { asm volatile(__ASM_SIZE(btr) " %1,%0" : : ADDR, "Ir" (nr) : "memory"); } -static __always_inline bool -arch_clear_bit_unlock_is_negative_byte(long nr, volatile unsigned long *addr) +static __always_inline bool arch_xor_unlock_is_negative_byte(unsigned long mask, + volatile 
unsigned long *addr) { bool negative; - asm volatile(LOCK_PREFIX "andb %2,%1" + asm_inline volatile(LOCK_PREFIX "xorb %2,%1" CC_SET(s) : CC_OUT(s) (negative), WBYTE_ADDR(addr) - : "ir" ((char) ~(1 << nr)) : "memory"); + : "iq" ((char)mask) : "memory"); return negative; } -#define arch_clear_bit_unlock_is_negative_byte \ - arch_clear_bit_unlock_is_negative_byte +#define arch_xor_unlock_is_negative_byte arch_xor_unlock_is_negative_byte static __always_inline void arch___clear_bit_unlock(long nr, volatile unsigned long *addr) @@ -114,7 +113,7 @@ arch___clear_bit_unlock(long nr, volatile unsigned long *addr) } static __always_inline void -arch___change_bit(long nr, volatile unsigned long *addr) +arch___change_bit(unsigned long nr, volatile unsigned long *addr) { asm volatile(__ASM_SIZE(btc) " %1,%0" : : ADDR, "Ir" (nr) : "memory"); } @@ -123,11 +122,11 @@ static __always_inline void arch_change_bit(long nr, volatile unsigned long *addr) { if (__builtin_constant_p(nr)) { - asm volatile(LOCK_PREFIX "xorb %b1,%0" + asm_inline volatile(LOCK_PREFIX "xorb %b1,%0" : CONST_MASK_ADDR(nr, addr) : "iq" (CONST_MASK(nr))); } else { - asm volatile(LOCK_PREFIX __ASM_SIZE(btc) " %1,%0" + asm_inline volatile(LOCK_PREFIX __ASM_SIZE(btc) " %1,%0" : : RLONG_ADDR(addr), "Ir" (nr) : "memory"); } } @@ -145,7 +144,7 @@ arch_test_and_set_bit_lock(long nr, volatile unsigned long *addr) } static __always_inline bool -arch___test_and_set_bit(long nr, volatile unsigned long *addr) +arch___test_and_set_bit(unsigned long nr, volatile unsigned long *addr) { bool oldbit; @@ -171,7 +170,7 @@ arch_test_and_clear_bit(long nr, volatile unsigned long *addr) * this without also updating arch/x86/kernel/kvm.c */ static __always_inline bool -arch___test_and_clear_bit(long nr, volatile unsigned long *addr) +arch___test_and_clear_bit(unsigned long nr, volatile unsigned long *addr) { bool oldbit; @@ -183,7 +182,7 @@ arch___test_and_clear_bit(long nr, volatile unsigned long *addr) } static __always_inline bool 
-arch___test_and_change_bit(long nr, volatile unsigned long *addr) +arch___test_and_change_bit(unsigned long nr, volatile unsigned long *addr) { bool oldbit; @@ -207,6 +206,20 @@ static __always_inline bool constant_test_bit(long nr, const volatile unsigned l (addr[nr >> _BITOPS_LONG_SHIFT])) != 0; } +static __always_inline bool constant_test_bit_acquire(long nr, const volatile unsigned long *addr) +{ + bool oldbit; + + asm volatile("testb %2,%1" + CC_SET(nz) + : CC_OUT(nz) (oldbit) + : "m" (((unsigned char *)addr)[nr >> 3]), + "i" (1 << (nr & 7)) + :"memory"); + + return oldbit; +} + static __always_inline bool variable_test_bit(long nr, volatile const unsigned long *addr) { bool oldbit; @@ -219,10 +232,27 @@ static __always_inline bool variable_test_bit(long nr, volatile const unsigned l return oldbit; } -#define arch_test_bit(nr, addr) \ - (__builtin_constant_p((nr)) \ - ? constant_test_bit((nr), (addr)) \ - : variable_test_bit((nr), (addr))) +static __always_inline bool +arch_test_bit(unsigned long nr, const volatile unsigned long *addr) +{ + return __builtin_constant_p(nr) ? constant_test_bit(nr, addr) : + variable_test_bit(nr, addr); +} + +static __always_inline bool +arch_test_bit_acquire(unsigned long nr, const volatile unsigned long *addr) +{ + return __builtin_constant_p(nr) ? constant_test_bit_acquire(nr, addr) : + variable_test_bit(nr, addr); +} + +static __always_inline unsigned long variable__ffs(unsigned long word) +{ + asm("tzcnt %1,%0" + : "=r" (word) + : ASM_INPUT_RM (word)); + return word; +} /** * __ffs - find first set bit in word @@ -230,12 +260,14 @@ static __always_inline bool variable_test_bit(long nr, volatile const unsigned l * * Undefined if no bit exists, so code should check against 0 first. */ -static __always_inline unsigned long __ffs(unsigned long word) +#define __ffs(word) \ + (__builtin_constant_p(word) ? 
\ + (unsigned long)__builtin_ctzl(word) : \ + variable__ffs(word)) + +static __always_inline unsigned long variable_ffz(unsigned long word) { - asm("rep; bsf %1,%0" - : "=r" (word) - : "rm" (word)); - return word; + return variable__ffs(~word); } /** @@ -244,13 +276,10 @@ static __always_inline unsigned long __ffs(unsigned long word) * * Undefined if no zero exists, so code should check against ~0UL first. */ -static __always_inline unsigned long ffz(unsigned long word) -{ - asm("rep; bsf %1,%0" - : "=r" (word) - : "r" (~word)); - return word; -} +#define ffz(word) \ + (__builtin_constant_p(word) ? \ + (unsigned long)__builtin_ctzl(~word) : \ + variable_ffz(word)) /* * __fls: find last set bit in word @@ -260,27 +289,19 @@ static __always_inline unsigned long ffz(unsigned long word) */ static __always_inline unsigned long __fls(unsigned long word) { + if (__builtin_constant_p(word)) + return BITS_PER_LONG - 1 - __builtin_clzl(word); + asm("bsr %1,%0" : "=r" (word) - : "rm" (word)); + : ASM_INPUT_RM (word)); return word; } #undef ADDR #ifdef __KERNEL__ -/** - * ffs - find first set bit in word - * @x: the word to search - * - * This is defined the same way as the libc and compiler builtin ffs - * routines, therefore differs in spirit from the other bitops. - * - * ffs(value) returns 0 if value is 0 or the position of the first - * set bit if value is nonzero. The first (least significant) bit - * is at position 1. 
- */ -static __always_inline int ffs(int x) +static __always_inline int variable_ffs(int x) { int r; @@ -296,7 +317,7 @@ static __always_inline int ffs(int x) */ asm("bsfl %1,%0" : "=r" (r) - : "rm" (x), "0" (-1)); + : ASM_INPUT_RM (x), "0" (-1)); #elif defined(CONFIG_X86_CMOV) asm("bsfl %1,%0\n\t" "cmovzl %2,%0" @@ -311,6 +332,19 @@ static __always_inline int ffs(int x) } /** + * ffs - find first set bit in word + * @x: the word to search + * + * This is defined the same way as the libc and compiler builtin ffs + * routines, therefore differs in spirit from the other bitops. + * + * ffs(value) returns 0 if value is 0 or the position of the first + * set bit if value is nonzero. The first (least significant) bit + * is at position 1. + */ +#define ffs(x) (__builtin_constant_p(x) ? __builtin_ffs(x) : variable_ffs(x)) + +/** * fls - find last set bit in word * @x: the word to search * @@ -325,6 +359,9 @@ static __always_inline int fls(unsigned int x) { int r; + if (__builtin_constant_p(x)) + return x ? 32 - __builtin_clz(x) : 0; + #ifdef CONFIG_X86_64 /* * AMD64 says BSRL won't clobber the dest reg if x==0; Intel64 says the @@ -337,7 +374,7 @@ static __always_inline int fls(unsigned int x) */ asm("bsrl %1,%0" : "=r" (r) - : "rm" (x), "0" (-1)); + : ASM_INPUT_RM (x), "0" (-1)); #elif defined(CONFIG_X86_CMOV) asm("bsrl %1,%0\n\t" "cmovzl %2,%0" @@ -366,6 +403,9 @@ static __always_inline int fls(unsigned int x) static __always_inline int fls64(__u64 x) { int bitpos = -1; + + if (__builtin_constant_p(x)) + return x ? 
64 - __builtin_clzll(x) : 0; /* * AMD64 says BSRQ won't clobber the dest reg if x==0; Intel64 says the * dest reg is undefined if x==0, but their CPU architect says its @@ -373,7 +413,7 @@ static __always_inline int fls64(__u64 x) */ asm("bsrq %1,%q0" : "+r" (bitpos) - : "rm" (x)); + : ASM_INPUT_RM (x)); return bitpos + 1; } #else diff --git a/arch/x86/include/asm/boot.h b/arch/x86/include/asm/boot.h index 9191280d9ea3..02b23aa78955 100644 --- a/arch/x86/include/asm/boot.h +++ b/arch/x86/include/asm/boot.h @@ -6,11 +6,6 @@ #include <asm/pgtable_types.h> #include <uapi/asm/boot.h> -/* Physical address where kernel should be loaded. */ -#define LOAD_PHYSICAL_ADDR ((CONFIG_PHYSICAL_START \ - + (CONFIG_PHYSICAL_ALIGN - 1)) \ - & ~(CONFIG_PHYSICAL_ALIGN - 1)) - /* Minimum kernel alignment, as a power of two */ #ifdef CONFIG_X86_64 # define MIN_KERNEL_ALIGN_LG2 PMD_SHIFT @@ -40,26 +35,64 @@ #ifdef CONFIG_X86_64 # define BOOT_STACK_SIZE 0x4000 +/* + * Used by decompressor's startup_32() to allocate page tables for identity + * mapping of the 4G of RAM in 4-level paging mode: + * - 1 level4 table; + * - 1 level3 table; + * - 4 level2 table that maps everything with 2M pages; + * + * The additional level5 table needed for 5-level paging is allocated from + * trampoline_32bit memory. + */ # define BOOT_INIT_PGT_SIZE (6*4096) -# ifdef CONFIG_RANDOMIZE_BASE + /* - * Assuming all cross the 512GB boundary: - * 1 page for level4 - * (2+2)*4 pages for kernel, param, cmd_line, and randomized kernel - * 2 pages for first 2M (video RAM: CONFIG_X86_VERBOSE_BOOTUP). - * Total is 19 pages. + * Total number of page tables kernel_add_identity_map() can allocate, + * including page tables consumed by startup_32(). 
+ * + * Worst-case scenario: + * - 5-level paging needs 1 level5 table; + * - KASLR needs to map kernel, boot_params, cmdline and randomized kernel, + * assuming all of them cross 256T boundary: + * + 4*2 level4 table; + * + 4*2 level3 table; + * + 4*2 level2 table; + * - X86_VERBOSE_BOOTUP needs to map the first 2M (video RAM): + * + 1 level4 table; + * + 1 level3 table; + * + 1 level2 table; + * Total: 28 tables + * + * Add 4 spare table in case decompressor touches anything beyond what is + * accounted above. Warn if it happens. */ -# ifdef CONFIG_X86_VERBOSE_BOOTUP -# define BOOT_PGT_SIZE (19*4096) -# else /* !CONFIG_X86_VERBOSE_BOOTUP */ -# define BOOT_PGT_SIZE (17*4096) -# endif -# else /* !CONFIG_RANDOMIZE_BASE */ -# define BOOT_PGT_SIZE BOOT_INIT_PGT_SIZE -# endif +# define BOOT_PGT_SIZE_WARN (28*4096) +# define BOOT_PGT_SIZE (32*4096) #else /* !CONFIG_X86_64 */ # define BOOT_STACK_SIZE 0x1000 #endif +#define TRAMPOLINE_32BIT_SIZE (2 * PAGE_SIZE) + +#define TRAMPOLINE_32BIT_CODE_OFFSET PAGE_SIZE +#define TRAMPOLINE_32BIT_CODE_SIZE 0xA0 + +#ifndef __ASSEMBLER__ +extern unsigned int output_len; +extern const unsigned long kernel_text_size; +extern const unsigned long kernel_total_size; + +unsigned long decompress_kernel(unsigned char *outbuf, unsigned long virt_addr, + void (*error)(char *x)); + +extern struct boot_params *boot_params_ptr; +extern unsigned long *trampoline_32bit; +extern const u16 trampoline_ljmp_imm_offset; + +void trampoline_32bit_src(void *trampoline, bool enable_5lvl); + +#endif + #endif /* _ASM_X86_BOOT_H */ diff --git a/arch/x86/include/asm/bootparam_utils.h b/arch/x86/include/asm/bootparam_utils.h index 53e9b0620d96..d90ae472fb76 100644 --- a/arch/x86/include/asm/bootparam_utils.h +++ b/arch/x86/include/asm/bootparam_utils.h @@ -38,7 +38,7 @@ static void sanitize_boot_params(struct boot_params *boot_params) * IMPORTANT NOTE TO BOOTLOADER AUTHORS: do not simply clear * this field. 
The purpose of this field is to guarantee * compliance with the x86 boot spec located in - * Documentation/x86/boot.rst . That spec says that the + * Documentation/arch/x86/boot.rst . That spec says that the * *whole* structure should be cleared, after which only the * portion defined by struct setup_header (boot_params->hdr) * should be copied in. diff --git a/arch/x86/include/asm/bug.h b/arch/x86/include/asm/bug.h index a3ec87d198ac..f0e9acf72547 100644 --- a/arch/x86/include/asm/bug.h +++ b/arch/x86/include/asm/bug.h @@ -13,6 +13,22 @@ #define INSN_UD2 0x0b0f #define LEN_UD2 2 +/* + * In clang we have UD1s reporting UBSAN failures on X86, 64 and 32bit. + */ +#define INSN_ASOP 0x67 +#define INSN_LOCK 0xf0 +#define OPCODE_ESCAPE 0x0f +#define SECOND_BYTE_OPCODE_UD1 0xb9 +#define SECOND_BYTE_OPCODE_UD2 0x0b + +#define BUG_NONE 0xffff +#define BUG_UD2 0xfffe +#define BUG_UD1 0xfffd +#define BUG_UD1_UBSAN 0xfffc +#define BUG_EA 0xffea +#define BUG_LOCK 0xfff0 + #ifdef CONFIG_GENERIC_BUG #ifdef CONFIG_X86_32 @@ -80,7 +96,7 @@ do { \ do { \ __auto_type __flags = BUGFLAG_WARNING|(flags); \ instrumentation_begin(); \ - _BUG_FLAGS(ASM_UD2, __flags, ASM_REACHABLE); \ + _BUG_FLAGS(ASM_UD2, __flags, ANNOTATE_REACHABLE(1b)); \ instrumentation_end(); \ } while (0) diff --git a/arch/x86/include/asm/bugs.h b/arch/x86/include/asm/bugs.h index 92ae28389940..f25ca2d709d4 100644 --- a/arch/x86/include/asm/bugs.h +++ b/arch/x86/include/asm/bugs.h @@ -4,8 +4,6 @@ #include <asm/processor.h> -extern void check_bugs(void); - #if defined(CONFIG_CPU_SUP_INTEL) && defined(CONFIG_X86_32) int ppro_with_ram_bug(void); #else diff --git a/arch/x86/include/asm/cacheinfo.h b/arch/x86/include/asm/cacheinfo.h index 86b2e0dcc4bf..5aa061199866 100644 --- a/arch/x86/include/asm/cacheinfo.h +++ b/arch/x86/include/asm/cacheinfo.h @@ -2,7 +2,17 @@ #ifndef _ASM_X86_CACHEINFO_H #define _ASM_X86_CACHEINFO_H -void cacheinfo_amd_init_llc_id(struct cpuinfo_x86 *c, int cpu); -void 
cacheinfo_hygon_init_llc_id(struct cpuinfo_x86 *c, int cpu); +/* Kernel controls MTRR and/or PAT MSRs. */ +extern unsigned int memory_caching_control; +#define CACHE_MTRR 0x01 +#define CACHE_PAT 0x02 + +void cache_disable(void); +void cache_enable(void); +void set_cache_aps_delayed_init(bool val); +bool get_cache_aps_delayed_init(void); +void cache_bp_init(void); +void cache_bp_restore(void); +void cache_aps_init(void); #endif /* _ASM_X86_CACHEINFO_H */ diff --git a/arch/x86/include/asm/cfi.h b/arch/x86/include/asm/cfi.h new file mode 100644 index 000000000000..3e51ba459154 --- /dev/null +++ b/arch/x86/include/asm/cfi.h @@ -0,0 +1,172 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _ASM_X86_CFI_H +#define _ASM_X86_CFI_H + +/* + * Clang Control Flow Integrity (CFI) support. + * + * Copyright (C) 2022 Google LLC + */ +#include <linux/bug.h> +#include <asm/ibt.h> + +/* + * An overview of the various calling conventions... + * + * Traditional: + * + * foo: + * ... code here ... + * ret + * + * direct caller: + * call foo + * + * indirect caller: + * lea foo(%rip), %r11 + * ... + * call *%r11 + * + * + * IBT: + * + * foo: + * endbr64 + * ... code here ... + * ret + * + * direct caller: + * call foo / call foo+4 + * + * indirect caller: + * lea foo(%rip), %r11 + * ... + * call *%r11 + * + * + * kCFI: + * + * __cfi_foo: + * movl $0x12345678, %eax + * # 11 nops when CONFIG_CALL_PADDING + * foo: + * endbr64 # when IBT + * ... code here ... + * ret + * + * direct call: + * call foo # / call foo+4 when IBT + * + * indirect call: + * lea foo(%rip), %r11 + * ... + * movl $(-0x12345678), %r10d + * addl -4(%r11), %r10d # -15 when CONFIG_CALL_PADDING + * jz 1f + * ud2 + * 1:call *%r11 + * + * + * FineIBT (builds as kCFI + CALL_PADDING + IBT + RETPOLINE and runtime patches into): + * + * __cfi_foo: + * endbr64 + * subl 0x12345678, %r10d + * jz foo + * ud2 + * nop + * foo: + * osp nop3 # was endbr64 + * ... code here ... 
+ * ret + * + * direct caller: + * call foo / call foo+4 + * + * indirect caller: + * lea foo(%rip), %r11 + * ... + * movl $0x12345678, %r10d + * subl $16, %r11 + * nop4 + * call *%r11 + * + */ +enum cfi_mode { + CFI_AUTO, /* FineIBT if hardware has IBT, otherwise kCFI */ + CFI_OFF, /* Traditional / IBT depending on .config */ + CFI_KCFI, /* Optionally CALL_PADDING, IBT, RETPOLINE */ + CFI_FINEIBT, /* see arch/x86/kernel/alternative.c */ +}; + +extern enum cfi_mode cfi_mode; + +#ifdef CONFIG_FINEIBT_BHI +extern bool cfi_bhi; +#else +#define cfi_bhi (0) +#endif + +typedef u8 bhi_thunk[32]; +extern bhi_thunk __bhi_args[]; +extern bhi_thunk __bhi_args_end[]; + +struct pt_regs; + +#ifdef CONFIG_CFI_CLANG +enum bug_trap_type handle_cfi_failure(struct pt_regs *regs); +#define __bpfcall +extern u32 cfi_bpf_hash; +extern u32 cfi_bpf_subprog_hash; + +static inline int cfi_get_offset(void) +{ + switch (cfi_mode) { + case CFI_FINEIBT: + return 16; + case CFI_KCFI: + if (IS_ENABLED(CONFIG_CALL_PADDING)) + return 16; + return 5; + default: + return 0; + } +} +#define cfi_get_offset cfi_get_offset + +extern u32 cfi_get_func_hash(void *func); +extern int cfi_get_func_arity(void *func); + +#ifdef CONFIG_FINEIBT +extern bool decode_fineibt_insn(struct pt_regs *regs, unsigned long *target, u32 *type); +#else +static inline bool +decode_fineibt_insn(struct pt_regs *regs, unsigned long *target, u32 *type) +{ + return false; +} + +#endif + +#else +static inline enum bug_trap_type handle_cfi_failure(struct pt_regs *regs) +{ + return BUG_TRAP_TYPE_NONE; +} +#define cfi_bpf_hash 0U +#define cfi_bpf_subprog_hash 0U +static inline u32 cfi_get_func_hash(void *func) +{ + return 0; +} +static inline int cfi_get_func_arity(void *func) +{ + return 0; +} +#endif /* CONFIG_CFI_CLANG */ + +#if HAS_KERNEL_IBT == 1 +#define CFI_NOSEAL(x) asm(IBT_NOSEAL(__stringify(x))) +#endif + +#endif /* _ASM_X86_CFI_H */ diff --git a/arch/x86/include/asm/checksum.h b/arch/x86/include/asm/checksum.h index 
bca625a60186..6df6ece8a28e 100644 --- a/arch/x86/include/asm/checksum.h +++ b/arch/x86/include/asm/checksum.h @@ -1,9 +1,13 @@ /* SPDX-License-Identifier: GPL-2.0 */ -#define _HAVE_ARCH_COPY_AND_CSUM_FROM_USER 1 -#define HAVE_CSUM_COPY_USER -#define _HAVE_ARCH_CSUM_AND_COPY -#ifdef CONFIG_X86_32 -# include <asm/checksum_32.h> +#ifdef CONFIG_GENERIC_CSUM +# include <asm-generic/checksum.h> #else -# include <asm/checksum_64.h> +# define _HAVE_ARCH_COPY_AND_CSUM_FROM_USER 1 +# define HAVE_CSUM_COPY_USER +# define _HAVE_ARCH_CSUM_AND_COPY +# ifdef CONFIG_X86_32 +# include <asm/checksum_32.h> +# else +# include <asm/checksum_64.h> +# endif #endif diff --git a/arch/x86/include/asm/checksum_64.h b/arch/x86/include/asm/checksum_64.h index 407beebadaf4..4d4a47a3a8ab 100644 --- a/arch/x86/include/asm/checksum_64.h +++ b/arch/x86/include/asm/checksum_64.h @@ -9,7 +9,6 @@ */ #include <linux/compiler.h> -#include <linux/uaccess.h> #include <asm/byteorder.h> /** diff --git a/arch/x86/include/asm/cmdline.h b/arch/x86/include/asm/cmdline.h index 6faaf27e8899..6cbd9ae58b21 100644 --- a/arch/x86/include/asm/cmdline.h +++ b/arch/x86/include/asm/cmdline.h @@ -2,6 +2,10 @@ #ifndef _ASM_X86_CMDLINE_H #define _ASM_X86_CMDLINE_H +#include <asm/setup.h> + +extern char builtin_cmdline[COMMAND_LINE_SIZE]; + int cmdline_find_option_bool(const char *cmdline_ptr, const char *option); int cmdline_find_option(const char *cmdline_ptr, const char *option, char *buffer, int bufsize); diff --git a/arch/x86/include/asm/cmpxchg.h b/arch/x86/include/asm/cmpxchg.h index 94fbe6ae7431..b61f32c3459f 100644 --- a/arch/x86/include/asm/cmpxchg.h +++ b/arch/x86/include/asm/cmpxchg.h @@ -44,22 +44,22 @@ extern void __add_wrong_size(void) __typeof__ (*(ptr)) __ret = (arg); \ switch (sizeof(*(ptr))) { \ case __X86_CASE_B: \ - asm volatile (lock #op "b %b0, %1\n" \ + asm_inline volatile (lock #op "b %b0, %1" \ : "+q" (__ret), "+m" (*(ptr)) \ : : "memory", "cc"); \ break; \ case __X86_CASE_W: \ - asm volatile (lock 
#op "w %w0, %1\n" \ + asm_inline volatile (lock #op "w %w0, %1" \ : "+r" (__ret), "+m" (*(ptr)) \ : : "memory", "cc"); \ break; \ case __X86_CASE_L: \ - asm volatile (lock #op "l %0, %1\n" \ + asm_inline volatile (lock #op "l %0, %1" \ : "+r" (__ret), "+m" (*(ptr)) \ : : "memory", "cc"); \ break; \ case __X86_CASE_Q: \ - asm volatile (lock #op "q %q0, %1\n" \ + asm_inline volatile (lock #op "q %q0, %1" \ : "+r" (__ret), "+m" (*(ptr)) \ : : "memory", "cc"); \ break; \ @@ -91,7 +91,7 @@ extern void __add_wrong_size(void) case __X86_CASE_B: \ { \ volatile u8 *__ptr = (volatile u8 *)(ptr); \ - asm volatile(lock "cmpxchgb %2,%1" \ + asm_inline volatile(lock "cmpxchgb %2, %1" \ : "=a" (__ret), "+m" (*__ptr) \ : "q" (__new), "0" (__old) \ : "memory"); \ @@ -100,7 +100,7 @@ extern void __add_wrong_size(void) case __X86_CASE_W: \ { \ volatile u16 *__ptr = (volatile u16 *)(ptr); \ - asm volatile(lock "cmpxchgw %2,%1" \ + asm_inline volatile(lock "cmpxchgw %2, %1" \ : "=a" (__ret), "+m" (*__ptr) \ : "r" (__new), "0" (__old) \ : "memory"); \ @@ -109,7 +109,7 @@ extern void __add_wrong_size(void) case __X86_CASE_L: \ { \ volatile u32 *__ptr = (volatile u32 *)(ptr); \ - asm volatile(lock "cmpxchgl %2,%1" \ + asm_inline volatile(lock "cmpxchgl %2, %1" \ : "=a" (__ret), "+m" (*__ptr) \ : "r" (__new), "0" (__old) \ : "memory"); \ @@ -118,7 +118,7 @@ extern void __add_wrong_size(void) case __X86_CASE_Q: \ { \ volatile u64 *__ptr = (volatile u64 *)(ptr); \ - asm volatile(lock "cmpxchgq %2,%1" \ + asm_inline volatile(lock "cmpxchgq %2, %1" \ : "=a" (__ret), "+m" (*__ptr) \ : "r" (__new), "0" (__old) \ : "memory"); \ @@ -134,7 +134,7 @@ extern void __add_wrong_size(void) __raw_cmpxchg((ptr), (old), (new), (size), LOCK_PREFIX) #define __sync_cmpxchg(ptr, old, new, size) \ - __raw_cmpxchg((ptr), (old), (new), (size), "lock; ") + __raw_cmpxchg((ptr), (old), (new), (size), "lock ") #define __cmpxchg_local(ptr, old, new, size) \ __raw_cmpxchg((ptr), (old), (new), (size), "") @@ -165,7 
+165,7 @@ extern void __add_wrong_size(void) case __X86_CASE_B: \ { \ volatile u8 *__ptr = (volatile u8 *)(_ptr); \ - asm volatile(lock "cmpxchgb %[new], %[ptr]" \ + asm_inline volatile(lock "cmpxchgb %[new], %[ptr]" \ CC_SET(z) \ : CC_OUT(z) (success), \ [ptr] "+m" (*__ptr), \ @@ -177,7 +177,7 @@ extern void __add_wrong_size(void) case __X86_CASE_W: \ { \ volatile u16 *__ptr = (volatile u16 *)(_ptr); \ - asm volatile(lock "cmpxchgw %[new], %[ptr]" \ + asm_inline volatile(lock "cmpxchgw %[new], %[ptr]" \ CC_SET(z) \ : CC_OUT(z) (success), \ [ptr] "+m" (*__ptr), \ @@ -189,7 +189,7 @@ extern void __add_wrong_size(void) case __X86_CASE_L: \ { \ volatile u32 *__ptr = (volatile u32 *)(_ptr); \ - asm volatile(lock "cmpxchgl %[new], %[ptr]" \ + asm_inline volatile(lock "cmpxchgl %[new], %[ptr]" \ CC_SET(z) \ : CC_OUT(z) (success), \ [ptr] "+m" (*__ptr), \ @@ -201,7 +201,7 @@ extern void __add_wrong_size(void) case __X86_CASE_Q: \ { \ volatile u64 *__ptr = (volatile u64 *)(_ptr); \ - asm volatile(lock "cmpxchgq %[new], %[ptr]" \ + asm_inline volatile(lock "cmpxchgq %[new], %[ptr]" \ CC_SET(z) \ : CC_OUT(z) (success), \ [ptr] "+m" (*__ptr), \ @@ -221,9 +221,21 @@ extern void __add_wrong_size(void) #define __try_cmpxchg(ptr, pold, new, size) \ __raw_try_cmpxchg((ptr), (pold), (new), (size), LOCK_PREFIX) +#define __sync_try_cmpxchg(ptr, pold, new, size) \ + __raw_try_cmpxchg((ptr), (pold), (new), (size), "lock ") + +#define __try_cmpxchg_local(ptr, pold, new, size) \ + __raw_try_cmpxchg((ptr), (pold), (new), (size), "") + #define arch_try_cmpxchg(ptr, pold, new) \ __try_cmpxchg((ptr), (pold), (new), sizeof(*(ptr))) +#define arch_sync_try_cmpxchg(ptr, pold, new) \ + __sync_try_cmpxchg((ptr), (pold), (new), sizeof(*(ptr))) + +#define arch_try_cmpxchg_local(ptr, pold, new) \ + __try_cmpxchg_local((ptr), (pold), (new), sizeof(*(ptr))) + /* * xadd() adds "inc" to "*ptr" and atomically returns the previous * value of "*ptr". 
@@ -233,29 +245,4 @@ extern void __add_wrong_size(void) #define __xadd(ptr, inc, lock) __xchg_op((ptr), (inc), xadd, lock) #define xadd(ptr, inc) __xadd((ptr), (inc), LOCK_PREFIX) -#define __cmpxchg_double(pfx, p1, p2, o1, o2, n1, n2) \ -({ \ - bool __ret; \ - __typeof__(*(p1)) __old1 = (o1), __new1 = (n1); \ - __typeof__(*(p2)) __old2 = (o2), __new2 = (n2); \ - BUILD_BUG_ON(sizeof(*(p1)) != sizeof(long)); \ - BUILD_BUG_ON(sizeof(*(p2)) != sizeof(long)); \ - VM_BUG_ON((unsigned long)(p1) % (2 * sizeof(long))); \ - VM_BUG_ON((unsigned long)((p1) + 1) != (unsigned long)(p2)); \ - asm volatile(pfx "cmpxchg%c5b %1" \ - CC_SET(e) \ - : CC_OUT(e) (__ret), \ - "+m" (*(p1)), "+m" (*(p2)), \ - "+a" (__old1), "+d" (__old2) \ - : "i" (2 * sizeof(long)), \ - "b" (__new1), "c" (__new2)); \ - __ret; \ -}) - -#define arch_cmpxchg_double(p1, p2, o1, o2, n1, n2) \ - __cmpxchg_double(LOCK_PREFIX, p1, p2, o1, o2, n1, n2) - -#define arch_cmpxchg_double_local(p1, p2, o1, o2, n1, n2) \ - __cmpxchg_double(, p1, p2, o1, o2, n1, n2) - #endif /* ASM_X86_CMPXCHG_H */ diff --git a/arch/x86/include/asm/cmpxchg_32.h b/arch/x86/include/asm/cmpxchg_32.h index 215f5a65790f..371f7906019e 100644 --- a/arch/x86/include/asm/cmpxchg_32.h +++ b/arch/x86/include/asm/cmpxchg_32.h @@ -3,134 +3,155 @@ #define _ASM_X86_CMPXCHG_32_H /* - * Note: if you use set64_bit(), __cmpxchg64(), or their variants, + * Note: if you use __cmpxchg64(), or their variants, * you need to test for the feature in boot_cpu_data. */ -/* - * CMPXCHG8B only writes to the target if we had the previous - * value in registers, otherwise it acts as a read and gives us the - * "new previous" value. That is why there is a loop. Preloading - * EDX:EAX is a performance optimization: in the common case it means - * we need only one locked operation. - * - * A SIMD/3DNOW!/MMX/FPU 64-bit store here would require at the very - * least an FPU save and/or %cr0.ts manipulation. 
- * - * cmpxchg8b must be used with the lock prefix here to allow the - * instruction to be executed atomically. We need to have the reader - * side to see the coherent 64bit value. - */ -static inline void set_64bit(volatile u64 *ptr, u64 value) +union __u64_halves { + u64 full; + struct { + u32 low, high; + }; +}; + +#define __arch_cmpxchg64(_ptr, _old, _new, _lock) \ +({ \ + union __u64_halves o = { .full = (_old), }, \ + n = { .full = (_new), }; \ + \ + asm_inline volatile(_lock "cmpxchg8b %[ptr]" \ + : [ptr] "+m" (*(_ptr)), \ + "+a" (o.low), "+d" (o.high) \ + : "b" (n.low), "c" (n.high) \ + : "memory"); \ + \ + o.full; \ +}) + + +static __always_inline u64 __cmpxchg64(volatile u64 *ptr, u64 old, u64 new) { - u32 low = value; - u32 high = value >> 32; - u64 prev = *ptr; - - asm volatile("\n1:\t" - LOCK_PREFIX "cmpxchg8b %0\n\t" - "jnz 1b" - : "=m" (*ptr), "+A" (prev) - : "b" (low), "c" (high) - : "memory"); + return __arch_cmpxchg64(ptr, old, new, LOCK_PREFIX); } -#ifdef CONFIG_X86_CMPXCHG64 -#define arch_cmpxchg64(ptr, o, n) \ - ((__typeof__(*(ptr)))__cmpxchg64((ptr), (unsigned long long)(o), \ - (unsigned long long)(n))) -#define arch_cmpxchg64_local(ptr, o, n) \ - ((__typeof__(*(ptr)))__cmpxchg64_local((ptr), (unsigned long long)(o), \ - (unsigned long long)(n))) -#define arch_try_cmpxchg64(ptr, po, n) \ - __try_cmpxchg64((ptr), (unsigned long long *)(po), \ - (unsigned long long)(n)) -#endif - -static inline u64 __cmpxchg64(volatile u64 *ptr, u64 old, u64 new) +static __always_inline u64 __cmpxchg64_local(volatile u64 *ptr, u64 old, u64 new) { - u64 prev; - asm volatile(LOCK_PREFIX "cmpxchg8b %1" - : "=A" (prev), - "+m" (*ptr) - : "b" ((u32)new), - "c" ((u32)(new >> 32)), - "0" (old) - : "memory"); - return prev; + return __arch_cmpxchg64(ptr, old, new,); } -static inline u64 __cmpxchg64_local(volatile u64 *ptr, u64 old, u64 new) +#define __arch_try_cmpxchg64(_ptr, _oldp, _new, _lock) \ +({ \ + union __u64_halves o = { .full = *(_oldp), }, \ + n = { .full = 
(_new), }; \ + bool ret; \ + \ + asm_inline volatile(_lock "cmpxchg8b %[ptr]" \ + CC_SET(e) \ + : CC_OUT(e) (ret), \ + [ptr] "+m" (*(_ptr)), \ + "+a" (o.low), "+d" (o.high) \ + : "b" (n.low), "c" (n.high) \ + : "memory"); \ + \ + if (unlikely(!ret)) \ + *(_oldp) = o.full; \ + \ + likely(ret); \ +}) + +static __always_inline bool __try_cmpxchg64(volatile u64 *ptr, u64 *oldp, u64 new) { - u64 prev; - asm volatile("cmpxchg8b %1" - : "=A" (prev), - "+m" (*ptr) - : "b" ((u32)new), - "c" ((u32)(new >> 32)), - "0" (old) - : "memory"); - return prev; + return __arch_try_cmpxchg64(ptr, oldp, new, LOCK_PREFIX); } -static inline bool __try_cmpxchg64(volatile u64 *ptr, u64 *pold, u64 new) +static __always_inline bool __try_cmpxchg64_local(volatile u64 *ptr, u64 *oldp, u64 new) { - bool success; - u64 old = *pold; - asm volatile(LOCK_PREFIX "cmpxchg8b %[ptr]" - CC_SET(z) - : CC_OUT(z) (success), - [ptr] "+m" (*ptr), - "+A" (old) - : "b" ((u32)new), - "c" ((u32)(new >> 32)) - : "memory"); - - if (unlikely(!success)) - *pold = old; - return success; + return __arch_try_cmpxchg64(ptr, oldp, new,); } -#ifndef CONFIG_X86_CMPXCHG64 +#ifdef CONFIG_X86_CX8 + +#define arch_cmpxchg64 __cmpxchg64 + +#define arch_cmpxchg64_local __cmpxchg64_local + +#define arch_try_cmpxchg64 __try_cmpxchg64 + +#define arch_try_cmpxchg64_local __try_cmpxchg64_local + +#else + /* * Building a kernel capable running on 80386 and 80486. It may be necessary * to simulate the cmpxchg8b on the 80386 and 80486 CPU. 
*/ -#define arch_cmpxchg64(ptr, o, n) \ -({ \ - __typeof__(*(ptr)) __ret; \ - __typeof__(*(ptr)) __old = (o); \ - __typeof__(*(ptr)) __new = (n); \ - alternative_io(LOCK_PREFIX_HERE \ - "call cmpxchg8b_emu", \ - "lock; cmpxchg8b (%%esi)" , \ - X86_FEATURE_CX8, \ - "=A" (__ret), \ - "S" ((ptr)), "0" (__old), \ - "b" ((unsigned int)__new), \ - "c" ((unsigned int)(__new>>32)) \ - : "memory"); \ - __ret; }) - - -#define arch_cmpxchg64_local(ptr, o, n) \ -({ \ - __typeof__(*(ptr)) __ret; \ - __typeof__(*(ptr)) __old = (o); \ - __typeof__(*(ptr)) __new = (n); \ - alternative_io("call cmpxchg8b_emu", \ - "cmpxchg8b (%%esi)" , \ - X86_FEATURE_CX8, \ - "=A" (__ret), \ - "S" ((ptr)), "0" (__old), \ - "b" ((unsigned int)__new), \ - "c" ((unsigned int)(__new>>32)) \ - : "memory"); \ - __ret; }) +#define __arch_cmpxchg64_emu(_ptr, _old, _new, _lock_loc, _lock) \ +({ \ + union __u64_halves o = { .full = (_old), }, \ + n = { .full = (_new), }; \ + \ + asm_inline volatile( \ + ALTERNATIVE(_lock_loc \ + "call cmpxchg8b_emu", \ + _lock "cmpxchg8b %a[ptr]", X86_FEATURE_CX8) \ + : ALT_OUTPUT_SP("+a" (o.low), "+d" (o.high)) \ + : "b" (n.low), "c" (n.high), \ + [ptr] "S" (_ptr) \ + : "memory"); \ + \ + o.full; \ +}) + +static __always_inline u64 arch_cmpxchg64(volatile u64 *ptr, u64 old, u64 new) +{ + return __arch_cmpxchg64_emu(ptr, old, new, LOCK_PREFIX_HERE, "lock "); +} +#define arch_cmpxchg64 arch_cmpxchg64 + +static __always_inline u64 arch_cmpxchg64_local(volatile u64 *ptr, u64 old, u64 new) +{ + return __arch_cmpxchg64_emu(ptr, old, new, ,); +} +#define arch_cmpxchg64_local arch_cmpxchg64_local + +#define __arch_try_cmpxchg64_emu(_ptr, _oldp, _new, _lock_loc, _lock) \ +({ \ + union __u64_halves o = { .full = *(_oldp), }, \ + n = { .full = (_new), }; \ + bool ret; \ + \ + asm_inline volatile( \ + ALTERNATIVE(_lock_loc \ + "call cmpxchg8b_emu", \ + _lock "cmpxchg8b %a[ptr]", X86_FEATURE_CX8) \ + CC_SET(e) \ + : ALT_OUTPUT_SP(CC_OUT(e) (ret), \ + "+a" (o.low), "+d" (o.high)) \ + : 
"b" (n.low), "c" (n.high), \ + [ptr] "S" (_ptr) \ + : "memory"); \ + \ + if (unlikely(!ret)) \ + *(_oldp) = o.full; \ + \ + likely(ret); \ +}) + +static __always_inline bool arch_try_cmpxchg64(volatile u64 *ptr, u64 *oldp, u64 new) +{ + return __arch_try_cmpxchg64_emu(ptr, oldp, new, LOCK_PREFIX_HERE, "lock "); +} +#define arch_try_cmpxchg64 arch_try_cmpxchg64 + +static __always_inline bool arch_try_cmpxchg64_local(volatile u64 *ptr, u64 *oldp, u64 new) +{ + return __arch_try_cmpxchg64_emu(ptr, oldp, new, ,); +} +#define arch_try_cmpxchg64_local arch_try_cmpxchg64_local #endif -#define system_has_cmpxchg_double() boot_cpu_has(X86_FEATURE_CX8) +#define system_has_cmpxchg64() boot_cpu_has(X86_FEATURE_CX8) #endif /* _ASM_X86_CMPXCHG_32_H */ diff --git a/arch/x86/include/asm/cmpxchg_64.h b/arch/x86/include/asm/cmpxchg_64.h index 250187ac8248..71d1e72ed879 100644 --- a/arch/x86/include/asm/cmpxchg_64.h +++ b/arch/x86/include/asm/cmpxchg_64.h @@ -2,11 +2,6 @@ #ifndef _ASM_X86_CMPXCHG_64_H #define _ASM_X86_CMPXCHG_64_H -static inline void set_64bit(volatile u64 *ptr, u64 val) -{ - *ptr = val; -} - #define arch_cmpxchg64(ptr, o, n) \ ({ \ BUILD_BUG_ON(sizeof(*(ptr)) != 8); \ @@ -25,6 +20,77 @@ static inline void set_64bit(volatile u64 *ptr, u64 val) arch_try_cmpxchg((ptr), (po), (n)); \ }) -#define system_has_cmpxchg_double() boot_cpu_has(X86_FEATURE_CX16) +#define arch_try_cmpxchg64_local(ptr, po, n) \ +({ \ + BUILD_BUG_ON(sizeof(*(ptr)) != 8); \ + arch_try_cmpxchg_local((ptr), (po), (n)); \ +}) + +union __u128_halves { + u128 full; + struct { + u64 low, high; + }; +}; + +#define __arch_cmpxchg128(_ptr, _old, _new, _lock) \ +({ \ + union __u128_halves o = { .full = (_old), }, \ + n = { .full = (_new), }; \ + \ + asm_inline volatile(_lock "cmpxchg16b %[ptr]" \ + : [ptr] "+m" (*(_ptr)), \ + "+a" (o.low), "+d" (o.high) \ + : "b" (n.low), "c" (n.high) \ + : "memory"); \ + \ + o.full; \ +}) + +static __always_inline u128 arch_cmpxchg128(volatile u128 *ptr, u128 old, u128 new) 
+{ + return __arch_cmpxchg128(ptr, old, new, LOCK_PREFIX); +} +#define arch_cmpxchg128 arch_cmpxchg128 + +static __always_inline u128 arch_cmpxchg128_local(volatile u128 *ptr, u128 old, u128 new) +{ + return __arch_cmpxchg128(ptr, old, new,); +} +#define arch_cmpxchg128_local arch_cmpxchg128_local + +#define __arch_try_cmpxchg128(_ptr, _oldp, _new, _lock) \ +({ \ + union __u128_halves o = { .full = *(_oldp), }, \ + n = { .full = (_new), }; \ + bool ret; \ + \ + asm_inline volatile(_lock "cmpxchg16b %[ptr]" \ + CC_SET(e) \ + : CC_OUT(e) (ret), \ + [ptr] "+m" (*(_ptr)), \ + "+a" (o.low), "+d" (o.high) \ + : "b" (n.low), "c" (n.high) \ + : "memory"); \ + \ + if (unlikely(!ret)) \ + *(_oldp) = o.full; \ + \ + likely(ret); \ +}) + +static __always_inline bool arch_try_cmpxchg128(volatile u128 *ptr, u128 *oldp, u128 new) +{ + return __arch_try_cmpxchg128(ptr, oldp, new, LOCK_PREFIX); +} +#define arch_try_cmpxchg128 arch_try_cmpxchg128 + +static __always_inline bool arch_try_cmpxchg128_local(volatile u128 *ptr, u128 *oldp, u128 new) +{ + return __arch_try_cmpxchg128(ptr, oldp, new,); +} +#define arch_try_cmpxchg128_local arch_try_cmpxchg128_local + +#define system_has_cmpxchg128() boot_cpu_has(X86_FEATURE_CX16) #endif /* _ASM_X86_CMPXCHG_64_H */ diff --git a/arch/x86/include/asm/coco.h b/arch/x86/include/asm/coco.h index 3d98c3a60d34..e1dbf8df1b69 100644 --- a/arch/x86/include/asm/coco.h +++ b/arch/x86/include/asm/coco.h @@ -2,22 +2,39 @@ #ifndef _ASM_X86_COCO_H #define _ASM_X86_COCO_H +#include <asm/asm.h> #include <asm/types.h> enum cc_vendor { CC_VENDOR_NONE, CC_VENDOR_AMD, - CC_VENDOR_HYPERV, CC_VENDOR_INTEL, }; -void cc_set_vendor(enum cc_vendor v); -void cc_set_mask(u64 mask); - #ifdef CONFIG_ARCH_HAS_CC_PLATFORM +extern enum cc_vendor cc_vendor; +extern u64 cc_mask; + +static inline u64 cc_get_mask(void) +{ + return cc_mask; +} + +static inline void cc_set_mask(u64 mask) +{ + cc_mask = mask; +} + u64 cc_mkenc(u64 val); u64 cc_mkdec(u64 val); +void 
cc_random_init(void); #else +#define cc_vendor (CC_VENDOR_NONE) +static inline u64 cc_get_mask(void) +{ + return 0; +} + static inline u64 cc_mkenc(u64 val) { return val; @@ -27,6 +44,7 @@ static inline u64 cc_mkdec(u64 val) { return val; } +static inline void cc_random_init(void) { } #endif #endif /* _ASM_X86_COCO_H */ diff --git a/arch/x86/include/asm/cpu.h b/arch/x86/include/asm/cpu.h index 8cbf623f0ecf..ad235dda1ded 100644 --- a/arch/x86/include/asm/cpu.h +++ b/arch/x86/include/asm/cpu.h @@ -9,31 +9,13 @@ #include <linux/percpu.h> #include <asm/ibt.h> -#ifdef CONFIG_SMP - -extern void prefill_possible_map(void); - -#else /* CONFIG_SMP */ - -static inline void prefill_possible_map(void) {} - +#ifndef CONFIG_SMP #define cpu_physical_id(cpu) boot_cpu_physical_apicid #define cpu_acpi_id(cpu) 0 -#define safe_smp_processor_id() 0 - #endif /* CONFIG_SMP */ -struct x86_cpu { - struct cpu cpu; -}; - #ifdef CONFIG_HOTPLUG_CPU -extern int arch_register_cpu(int num); -extern void arch_unregister_cpu(int); -extern void start_cpu0(void); -#ifdef CONFIG_DEBUG_HOTPLUG_CPU0 -extern int _debug_hotplug_cpu(int cpu, int action); -#endif +extern void soft_restart_cpu(void); #endif extern void ap_init_aperfmperf(void); @@ -43,12 +25,13 @@ int mwait_usable(const struct cpuinfo_x86 *); unsigned int x86_family(unsigned int sig); unsigned int x86_model(unsigned int sig); unsigned int x86_stepping(unsigned int sig); -#ifdef CONFIG_CPU_SUP_INTEL +#ifdef CONFIG_X86_BUS_LOCK_DETECT extern void __init sld_setup(struct cpuinfo_x86 *c); extern bool handle_user_split_lock(struct pt_regs *regs, long error_code); extern bool handle_guest_split_lock(unsigned long ip); extern void handle_bus_lock(struct pt_regs *regs); -u8 get_this_hybrid_cpu_type(void); +void split_lock_init(void); +void bus_lock_init(void); #else static inline void __init sld_setup(struct cpuinfo_x86 *c) {} static inline bool handle_user_split_lock(struct pt_regs *regs, long error_code) @@ -62,12 +45,10 @@ static inline bool 
handle_guest_split_lock(unsigned long ip) } static inline void handle_bus_lock(struct pt_regs *regs) {} - -static inline u8 get_this_hybrid_cpu_type(void) -{ - return 0; -} +static inline void split_lock_init(void) {} +static inline void bus_lock_init(void) {} #endif + #ifdef CONFIG_IA32_FEAT_CTL void init_ia32_feat_ctl(struct cpuinfo_x86 *c); #else @@ -76,22 +57,14 @@ static inline void init_ia32_feat_ctl(struct cpuinfo_x86 *c) {} extern __noendbr void cet_disable(void); -struct ucode_cpu_info; +struct cpu_signature; -int intel_cpu_collect_info(struct ucode_cpu_info *uci); +void intel_collect_cpu_info(struct cpu_signature *sig); -static inline bool intel_cpu_signatures_match(unsigned int s1, unsigned int p1, - unsigned int s2, unsigned int p2) -{ - if (s1 != s2) - return false; - - /* Processor flags are either both 0 ... */ - if (!p1 && !p2) - return true; +extern u64 x86_read_arch_cap_msr(void); +bool intel_find_matching_signature(void *mc, struct cpu_signature *sig); +int intel_microcode_sanity_check(void *mc, bool print_err, int hdr_type); - /* ... or they intersect. */ - return p1 & p2; -} +extern struct cpumask cpus_stop_mask; #endif /* _ASM_X86_CPU_H */ diff --git a/arch/x86/include/asm/cpu_device_id.h b/arch/x86/include/asm/cpu_device_id.h index eb8fcede9e3b..6be777a06944 100644 --- a/arch/x86/include/asm/cpu_device_id.h +++ b/arch/x86/include/asm/cpu_device_id.h @@ -3,6 +3,39 @@ #define _ASM_X86_CPU_DEVICE_ID /* + * Can't use <linux/bitfield.h> because it generates expressions that + * cannot be used in structure initializers. 
Bitfield construction + * here must match the union in struct cpuinfo_86: + * union { + * struct { + * __u8 x86_model; + * __u8 x86; + * __u8 x86_vendor; + * __u8 x86_reserved; + * }; + * __u32 x86_vfm; + * }; + */ +#define VFM_MODEL_BIT 0 +#define VFM_FAMILY_BIT 8 +#define VFM_VENDOR_BIT 16 +#define VFM_RSVD_BIT 24 + +#define VFM_MODEL_MASK GENMASK(VFM_FAMILY_BIT - 1, VFM_MODEL_BIT) +#define VFM_FAMILY_MASK GENMASK(VFM_VENDOR_BIT - 1, VFM_FAMILY_BIT) +#define VFM_VENDOR_MASK GENMASK(VFM_RSVD_BIT - 1, VFM_VENDOR_BIT) + +#define VFM_MODEL(vfm) (((vfm) & VFM_MODEL_MASK) >> VFM_MODEL_BIT) +#define VFM_FAMILY(vfm) (((vfm) & VFM_FAMILY_MASK) >> VFM_FAMILY_BIT) +#define VFM_VENDOR(vfm) (((vfm) & VFM_VENDOR_MASK) >> VFM_VENDOR_BIT) + +#define VFM_MAKE(_vendor, _family, _model) ( \ + ((_model) << VFM_MODEL_BIT) | \ + ((_family) << VFM_FAMILY_BIT) | \ + ((_vendor) << VFM_VENDOR_BIT) \ +) + +/* * Declare drivers belonging to specific x86 CPUs * Similar in spirit to pci_device_id and related PCI functions * @@ -20,9 +53,11 @@ #define X86_CENTAUR_FAM6_C7_D 0xd #define X86_CENTAUR_FAM6_NANO 0xf -#define X86_STEPPINGS(mins, maxs) GENMASK(maxs, mins) +/* x86_cpu_id::flags */ +#define X86_CPU_ID_FLAG_ENTRY_VALID BIT(0) + /** - * X86_MATCH_VENDOR_FAM_MODEL_STEPPINGS_FEATURE - Base macro for CPU matching + * X86_MATCH_CPU - Base macro for CPU matching * @_vendor: The vendor name, e.g. INTEL, AMD, HYGON, ..., ANY * The name is expanded to X86_VENDOR_@_vendor * @_family: The family number or X86_FAMILY_ANY @@ -39,35 +74,18 @@ * into another macro at the usage site for good reasons, then please * start this local macro with X86_MATCH to allow easy grepping. 
*/ -#define X86_MATCH_VENDOR_FAM_MODEL_STEPPINGS_FEATURE(_vendor, _family, _model, \ - _steppings, _feature, _data) { \ - .vendor = X86_VENDOR_##_vendor, \ +#define X86_MATCH_CPU(_vendor, _family, _model, _steppings, _feature, _type, _data) { \ + .vendor = _vendor, \ .family = _family, \ .model = _model, \ .steppings = _steppings, \ .feature = _feature, \ + .flags = X86_CPU_ID_FLAG_ENTRY_VALID, \ + .type = _type, \ .driver_data = (unsigned long) _data \ } /** - * X86_MATCH_VENDOR_FAM_MODEL_FEATURE - Macro for CPU matching - * @_vendor: The vendor name, e.g. INTEL, AMD, HYGON, ..., ANY - * The name is expanded to X86_VENDOR_@_vendor - * @_family: The family number or X86_FAMILY_ANY - * @_model: The model number, model constant or X86_MODEL_ANY - * @_feature: A X86_FEATURE bit or X86_FEATURE_ANY - * @_data: Driver specific data or NULL. The internal storage - * format is unsigned long. The supplied value, pointer - * etc. is casted to unsigned long internally. - * - * The steppings arguments of X86_MATCH_VENDOR_FAM_MODEL_STEPPINGS_FEATURE() is - * set to wildcards. - */ -#define X86_MATCH_VENDOR_FAM_MODEL_FEATURE(vendor, family, model, feature, data) \ - X86_MATCH_VENDOR_FAM_MODEL_STEPPINGS_FEATURE(vendor, family, model, \ - X86_STEPPING_ANY, feature, data) - -/** * X86_MATCH_VENDOR_FAM_FEATURE - Macro for matching vendor, family and CPU feature * @vendor: The vendor name, e.g. INTEL, AMD, HYGON, ..., ANY * The name is expanded to X86_VENDOR_@vendor @@ -76,13 +94,10 @@ * @data: Driver specific data or NULL. The internal storage * format is unsigned long. The supplied value, pointer * etc. is casted to unsigned long internally. - * - * All other missing arguments of X86_MATCH_VENDOR_FAM_MODEL_FEATURE() are - * set to wildcards. 
*/ -#define X86_MATCH_VENDOR_FAM_FEATURE(vendor, family, feature, data) \ - X86_MATCH_VENDOR_FAM_MODEL_FEATURE(vendor, family, \ - X86_MODEL_ANY, feature, data) +#define X86_MATCH_VENDOR_FAM_FEATURE(vendor, family, feature, data) \ + X86_MATCH_CPU(X86_VENDOR_##vendor, family, X86_MODEL_ANY, \ + X86_STEPPING_ANY, feature, X86_CPU_TYPE_ANY, data) /** * X86_MATCH_VENDOR_FEATURE - Macro for matching vendor and CPU feature @@ -92,12 +107,10 @@ * @data: Driver specific data or NULL. The internal storage * format is unsigned long. The supplied value, pointer * etc. is casted to unsigned long internally. - * - * All other missing arguments of X86_MATCH_VENDOR_FAM_MODEL_FEATURE() are - * set to wildcards. */ -#define X86_MATCH_VENDOR_FEATURE(vendor, feature, data) \ - X86_MATCH_VENDOR_FAM_FEATURE(vendor, X86_FAMILY_ANY, feature, data) +#define X86_MATCH_VENDOR_FEATURE(vendor, feature, data) \ + X86_MATCH_CPU(X86_VENDOR_##vendor, X86_FAMILY_ANY, X86_MODEL_ANY, \ + X86_STEPPING_ANY, feature, X86_CPU_TYPE_ANY, data) /** * X86_MATCH_FEATURE - Macro for matching a CPU feature @@ -105,12 +118,10 @@ * @data: Driver specific data or NULL. The internal storage * format is unsigned long. The supplied value, pointer * etc. is casted to unsigned long internally. - * - * All other missing arguments of X86_MATCH_VENDOR_FAM_MODEL_FEATURE() are - * set to wildcards. */ -#define X86_MATCH_FEATURE(feature, data) \ - X86_MATCH_VENDOR_FEATURE(ANY, feature, data) +#define X86_MATCH_FEATURE(feature, data) \ + X86_MATCH_CPU(X86_VENDOR_ANY, X86_FAMILY_ANY, X86_MODEL_ANY, \ + X86_STEPPING_ANY, feature, X86_CPU_TYPE_ANY, data) /** * X86_MATCH_VENDOR_FAM_MODEL - Match vendor, family and model @@ -121,13 +132,10 @@ * @data: Driver specific data or NULL. The internal storage * format is unsigned long. The supplied value, pointer * etc. is casted to unsigned long internally. - * - * All other missing arguments of X86_MATCH_VENDOR_FAM_MODEL_FEATURE() are - * set to wildcards. 
*/ -#define X86_MATCH_VENDOR_FAM_MODEL(vendor, family, model, data) \ - X86_MATCH_VENDOR_FAM_MODEL_FEATURE(vendor, family, model, \ - X86_FEATURE_ANY, data) +#define X86_MATCH_VENDOR_FAM_MODEL(vendor, family, model, data) \ + X86_MATCH_CPU(X86_VENDOR_##vendor, family, model, X86_STEPPING_ANY, \ + X86_FEATURE_ANY, X86_CPU_TYPE_ANY, data) /** * X86_MATCH_VENDOR_FAM - Match vendor and family @@ -137,60 +145,63 @@ * @data: Driver specific data or NULL. The internal storage * format is unsigned long. The supplied value, pointer * etc. is casted to unsigned long internally. - * - * All other missing arguments to X86_MATCH_VENDOR_FAM_MODEL_FEATURE() are - * set of wildcards. */ -#define X86_MATCH_VENDOR_FAM(vendor, family, data) \ - X86_MATCH_VENDOR_FAM_MODEL(vendor, family, X86_MODEL_ANY, data) +#define X86_MATCH_VENDOR_FAM(vendor, family, data) \ + X86_MATCH_CPU(X86_VENDOR_##vendor, family, X86_MODEL_ANY, \ + X86_STEPPING_ANY, X86_FEATURE_ANY, X86_CPU_TYPE_ANY, data) /** - * X86_MATCH_INTEL_FAM6_MODEL - Match vendor INTEL, family 6 and model - * @model: The model name without the INTEL_FAM6_ prefix or ANY - * The model name is expanded to INTEL_FAM6_@model internally + * X86_MATCH_VFM - Match encoded vendor/family/model + * @vfm: Encoded 8-bits each for vendor, family, model * @data: Driver specific data or NULL. The internal storage * format is unsigned long. The supplied value, pointer - * etc. is casted to unsigned long internally. - * - * The vendor is set to INTEL, the family to 6 and all other missing - * arguments of X86_MATCH_VENDOR_FAM_MODEL_FEATURE() are set to wildcards. - * - * See X86_MATCH_VENDOR_FAM_MODEL_FEATURE() for further information. + * etc. is cast to unsigned long internally. 
*/ -#define X86_MATCH_INTEL_FAM6_MODEL(model, data) \ - X86_MATCH_VENDOR_FAM_MODEL(INTEL, 6, INTEL_FAM6_##model, data) +#define X86_MATCH_VFM(vfm, data) \ + X86_MATCH_CPU(VFM_VENDOR(vfm), VFM_FAMILY(vfm), VFM_MODEL(vfm), \ + X86_STEPPING_ANY, X86_FEATURE_ANY, X86_CPU_TYPE_ANY, data) -#define X86_MATCH_INTEL_FAM6_MODEL_STEPPINGS(model, steppings, data) \ - X86_MATCH_VENDOR_FAM_MODEL_STEPPINGS_FEATURE(INTEL, 6, INTEL_FAM6_##model, \ - steppings, X86_FEATURE_ANY, data) +#define __X86_STEPPINGS(mins, maxs) GENMASK(maxs, mins) +/** + * X86_MATCH_VFM_STEPS - Match encoded vendor/family/model and steppings + * range. + * @vfm: Encoded 8-bits each for vendor, family, model + * @min_step: Lowest stepping number to match + * @max_step: Highest stepping number to match + * @data: Driver specific data or NULL. The internal storage + * format is unsigned long. The supplied value, pointer + * etc. is cast to unsigned long internally. + */ +#define X86_MATCH_VFM_STEPS(vfm, min_step, max_step, data) \ + X86_MATCH_CPU(VFM_VENDOR(vfm), VFM_FAMILY(vfm), VFM_MODEL(vfm), \ + __X86_STEPPINGS(min_step, max_step), X86_FEATURE_ANY, \ + X86_CPU_TYPE_ANY, data) -/* - * Match specific microcode revisions. - * - * vendor/family/model/stepping must be all set. - * - * Only checks against the boot CPU. When mixed-stepping configs are - * valid for a CPU model, add a quirk for every valid stepping and - * do the fine-tuning in the quirk handler. +/** + * X86_MATCH_VFM_FEATURE - Match encoded vendor/family/model/feature + * @vfm: Encoded 8-bits each for vendor, family, model + * @feature: A X86_FEATURE bit + * @data: Driver specific data or NULL. The internal storage + * format is unsigned long. The supplied value, pointer + * etc. is cast to unsigned long internally. 
*/ +#define X86_MATCH_VFM_FEATURE(vfm, feature, data) \ + X86_MATCH_CPU(VFM_VENDOR(vfm), VFM_FAMILY(vfm), VFM_MODEL(vfm), \ + X86_STEPPING_ANY, feature, X86_CPU_TYPE_ANY, data) -struct x86_cpu_desc { - u8 x86_family; - u8 x86_vendor; - u8 x86_model; - u8 x86_stepping; - u32 x86_microcode_rev; -}; - -#define INTEL_CPU_DESC(model, stepping, revision) { \ - .x86_family = 6, \ - .x86_vendor = X86_VENDOR_INTEL, \ - .x86_model = (model), \ - .x86_stepping = (stepping), \ - .x86_microcode_rev = (revision), \ -} +/** + * X86_MATCH_VFM_CPU_TYPE - Match encoded vendor/family/model/type + * @vfm: Encoded 8-bits each for vendor, family, model + * @type: CPU type e.g. P-core, E-core + * @data: Driver specific data or NULL. The internal storage + * format is unsigned long. The supplied value, pointer + * etc. is cast to unsigned long internally. + */ +#define X86_MATCH_VFM_CPU_TYPE(vfm, type, data) \ + X86_MATCH_CPU(VFM_VENDOR(vfm), VFM_FAMILY(vfm), VFM_MODEL(vfm), \ + X86_STEPPING_ANY, X86_FEATURE_ANY, type, data) extern const struct x86_cpu_id *x86_match_cpu(const struct x86_cpu_id *match); -extern bool x86_cpu_has_min_microcode_rev(const struct x86_cpu_desc *table); +extern bool x86_match_min_microcode_rev(const struct x86_cpu_id *table); #endif /* _ASM_X86_CPU_DEVICE_ID */ diff --git a/arch/x86/include/asm/cpu_entry_area.h b/arch/x86/include/asm/cpu_entry_area.h index 75efc4c6f076..462fc34f1317 100644 --- a/arch/x86/include/asm/cpu_entry_area.h +++ b/arch/x86/include/asm/cpu_entry_area.h @@ -130,10 +130,6 @@ struct cpu_entry_area { }; #define CPU_ENTRY_AREA_SIZE (sizeof(struct cpu_entry_area)) -#define CPU_ENTRY_AREA_ARRAY_SIZE (CPU_ENTRY_AREA_SIZE * NR_CPUS) - -/* Total size includes the readonly IDT mapping page as well: */ -#define CPU_ENTRY_AREA_TOTAL_SIZE (CPU_ENTRY_AREA_ARRAY_SIZE + PAGE_SIZE) DECLARE_PER_CPU(struct cpu_entry_area *, cpu_entry_area); DECLARE_PER_CPU(struct cea_exception_stacks *, cea_exception_stacks); diff --git a/arch/x86/include/asm/cpufeature.h 
b/arch/x86/include/asm/cpufeature.h index ea34cc31b047..893cbca37fe9 100644 --- a/arch/x86/include/asm/cpufeature.h +++ b/arch/x86/include/asm/cpufeature.h @@ -4,11 +4,12 @@ #include <asm/processor.h> -#if defined(__KERNEL__) && !defined(__ASSEMBLY__) +#if defined(__KERNEL__) && !defined(__ASSEMBLER__) #include <asm/asm.h> #include <linux/bitops.h> #include <asm/alternative.h> +#include <asm/cpufeaturemasks.h> enum cpuid_leafs { @@ -32,110 +33,39 @@ enum cpuid_leafs CPUID_8000_0007_EBX, CPUID_7_EDX, CPUID_8000_001F_EAX, + CPUID_8000_0021_EAX, + CPUID_LNX_5, + NR_CPUID_WORDS, }; -#define X86_CAP_FMT_NUM "%d:%d" -#define x86_cap_flag_num(flag) ((flag) >> 5), ((flag) & 31) - -#ifdef CONFIG_X86_FEATURE_NAMES extern const char * const x86_cap_flags[NCAPINTS*32]; extern const char * const x86_power_flags[32]; -#define X86_CAP_FMT "%s" -#define x86_cap_flag(flag) x86_cap_flags[flag] -#else -#define X86_CAP_FMT X86_CAP_FMT_NUM -#define x86_cap_flag x86_cap_flag_num -#endif /* * In order to save room, we index into this array by doing * X86_BUG_<name> - NCAPINTS*32. */ extern const char * const x86_bug_flags[NBUGINTS*32]; +#define x86_bug_flag(flag) x86_bug_flags[flag] #define test_cpu_cap(c, bit) \ arch_test_bit(bit, (unsigned long *)((c)->x86_capability)) -/* - * There are 32 bits/features in each mask word. The high bits - * (selected with (bit>>5) give us the word number and the low 5 - * bits give us the bit/feature number inside the word. - * (1UL<<((bit)&31) gives us a mask for the feature_bit so we can - * see if it is set in the mask word. - */ -#define CHECK_BIT_IN_MASK_WORD(maskname, word, bit) \ - (((bit)>>5)==(word) && (1UL<<((bit)&31) & maskname##word )) - -/* - * {REQUIRED,DISABLED}_MASK_CHECK below may seem duplicated with the - * following BUILD_BUG_ON_ZERO() check but when NCAPINTS gets changed, all - * header macros which use NCAPINTS need to be changed. 
The duplicated macro - * use causes the compiler to issue errors for all headers so that all usage - * sites can be corrected. - */ -#define REQUIRED_MASK_BIT_SET(feature_bit) \ - ( CHECK_BIT_IN_MASK_WORD(REQUIRED_MASK, 0, feature_bit) || \ - CHECK_BIT_IN_MASK_WORD(REQUIRED_MASK, 1, feature_bit) || \ - CHECK_BIT_IN_MASK_WORD(REQUIRED_MASK, 2, feature_bit) || \ - CHECK_BIT_IN_MASK_WORD(REQUIRED_MASK, 3, feature_bit) || \ - CHECK_BIT_IN_MASK_WORD(REQUIRED_MASK, 4, feature_bit) || \ - CHECK_BIT_IN_MASK_WORD(REQUIRED_MASK, 5, feature_bit) || \ - CHECK_BIT_IN_MASK_WORD(REQUIRED_MASK, 6, feature_bit) || \ - CHECK_BIT_IN_MASK_WORD(REQUIRED_MASK, 7, feature_bit) || \ - CHECK_BIT_IN_MASK_WORD(REQUIRED_MASK, 8, feature_bit) || \ - CHECK_BIT_IN_MASK_WORD(REQUIRED_MASK, 9, feature_bit) || \ - CHECK_BIT_IN_MASK_WORD(REQUIRED_MASK, 10, feature_bit) || \ - CHECK_BIT_IN_MASK_WORD(REQUIRED_MASK, 11, feature_bit) || \ - CHECK_BIT_IN_MASK_WORD(REQUIRED_MASK, 12, feature_bit) || \ - CHECK_BIT_IN_MASK_WORD(REQUIRED_MASK, 13, feature_bit) || \ - CHECK_BIT_IN_MASK_WORD(REQUIRED_MASK, 14, feature_bit) || \ - CHECK_BIT_IN_MASK_WORD(REQUIRED_MASK, 15, feature_bit) || \ - CHECK_BIT_IN_MASK_WORD(REQUIRED_MASK, 16, feature_bit) || \ - CHECK_BIT_IN_MASK_WORD(REQUIRED_MASK, 17, feature_bit) || \ - CHECK_BIT_IN_MASK_WORD(REQUIRED_MASK, 18, feature_bit) || \ - CHECK_BIT_IN_MASK_WORD(REQUIRED_MASK, 19, feature_bit) || \ - REQUIRED_MASK_CHECK || \ - BUILD_BUG_ON_ZERO(NCAPINTS != 20)) - -#define DISABLED_MASK_BIT_SET(feature_bit) \ - ( CHECK_BIT_IN_MASK_WORD(DISABLED_MASK, 0, feature_bit) || \ - CHECK_BIT_IN_MASK_WORD(DISABLED_MASK, 1, feature_bit) || \ - CHECK_BIT_IN_MASK_WORD(DISABLED_MASK, 2, feature_bit) || \ - CHECK_BIT_IN_MASK_WORD(DISABLED_MASK, 3, feature_bit) || \ - CHECK_BIT_IN_MASK_WORD(DISABLED_MASK, 4, feature_bit) || \ - CHECK_BIT_IN_MASK_WORD(DISABLED_MASK, 5, feature_bit) || \ - CHECK_BIT_IN_MASK_WORD(DISABLED_MASK, 6, feature_bit) || \ - CHECK_BIT_IN_MASK_WORD(DISABLED_MASK, 7, 
feature_bit) || \ - CHECK_BIT_IN_MASK_WORD(DISABLED_MASK, 8, feature_bit) || \ - CHECK_BIT_IN_MASK_WORD(DISABLED_MASK, 9, feature_bit) || \ - CHECK_BIT_IN_MASK_WORD(DISABLED_MASK, 10, feature_bit) || \ - CHECK_BIT_IN_MASK_WORD(DISABLED_MASK, 11, feature_bit) || \ - CHECK_BIT_IN_MASK_WORD(DISABLED_MASK, 12, feature_bit) || \ - CHECK_BIT_IN_MASK_WORD(DISABLED_MASK, 13, feature_bit) || \ - CHECK_BIT_IN_MASK_WORD(DISABLED_MASK, 14, feature_bit) || \ - CHECK_BIT_IN_MASK_WORD(DISABLED_MASK, 15, feature_bit) || \ - CHECK_BIT_IN_MASK_WORD(DISABLED_MASK, 16, feature_bit) || \ - CHECK_BIT_IN_MASK_WORD(DISABLED_MASK, 17, feature_bit) || \ - CHECK_BIT_IN_MASK_WORD(DISABLED_MASK, 18, feature_bit) || \ - CHECK_BIT_IN_MASK_WORD(DISABLED_MASK, 19, feature_bit) || \ - DISABLED_MASK_CHECK || \ - BUILD_BUG_ON_ZERO(NCAPINTS != 20)) - #define cpu_has(c, bit) \ (__builtin_constant_p(bit) && REQUIRED_MASK_BIT_SET(bit) ? 1 : \ test_cpu_cap(c, bit)) #define this_cpu_has(bit) \ (__builtin_constant_p(bit) && REQUIRED_MASK_BIT_SET(bit) ? 1 : \ - x86_this_cpu_test_bit(bit, \ - (unsigned long __percpu *)&cpu_info.x86_capability)) + x86_this_cpu_test_bit(bit, cpu_info.x86_capability)) /* - * This macro is for detection of features which need kernel - * infrastructure to be used. It may *not* directly test the CPU - * itself. Use the cpu_has() family if you want true runtime - * testing of CPU features, like in hypervisor code where you are - * supporting a possible guest feature where host support for it + * This is the default CPU features testing macro to use in code. + * + * It is for detection of features which need kernel infrastructure to be + * used. It may *not* directly test the CPU itself. Use the cpu_has() family + * if you want true runtime testing of CPU features, like in hypervisor code + * where you are supporting a possible guest feature where host support for it * is not relevant. 
*/ #define cpu_feature_enabled(bit) \ @@ -147,36 +77,20 @@ extern const char * const x86_bug_flags[NBUGINTS*32]; extern void setup_clear_cpu_cap(unsigned int bit); extern void clear_cpu_cap(struct cpuinfo_x86 *c, unsigned int bit); - -#define setup_force_cpu_cap(bit) do { \ - set_cpu_cap(&boot_cpu_data, bit); \ +void check_cpufeature_deps(struct cpuinfo_x86 *c); + +#define setup_force_cpu_cap(bit) do { \ + \ + if (!boot_cpu_has(bit)) \ + WARN_ON(alternatives_patched); \ + \ + set_cpu_cap(&boot_cpu_data, bit); \ set_bit(bit, (unsigned long *)cpu_caps_set); \ } while (0) #define setup_force_cpu_bug(bit) setup_force_cpu_cap(bit) -#if defined(__clang__) && !defined(CONFIG_CC_HAS_ASM_GOTO) - -/* - * Workaround for the sake of BPF compilation which utilizes kernel - * headers, but clang does not support ASM GOTO and fails the build. - */ -#ifndef __BPF_TRACING__ -#warning "Compiler lacks ASM_GOTO support. Add -D __BPF_TRACING__ to your compiler arguments" -#endif - -#define static_cpu_has(bit) boot_cpu_has(bit) - -#else - /* - * Static testing of CPU features. Used the same as boot_cpu_has(). It - * statically patches the target code for additional performance. Use - * static_cpu_has() only in fast paths, where every cycle counts. Which - * means that the boot_cpu_has() variant is already fast enough for the - * majority of cases and you should stick to using it as it is generally - * only two instructions: a RIP-relative MOV and a TEST. 
- * * Do not use an "m" constraint for [cap_byte] here: gcc doesn't know * that this is only used on a fallback path and will sometimes cause * it to manifest the address of boot_cpu_data in a register, fouling @@ -184,11 +98,10 @@ extern void clear_cpu_cap(struct cpuinfo_x86 *c, unsigned int bit); */ static __always_inline bool _static_cpu_has(u16 bit) { - asm_volatile_goto( - ALTERNATIVE_TERNARY("jmp 6f", %P[feature], "", "jmp %l[t_no]") + asm goto(ALTERNATIVE_TERNARY("jmp 6f", %c[feature], "", "jmp %l[t_no]") ".pushsection .altinstr_aux,\"ax\"\n" "6:\n" - " testb %[bitnum]," _ASM_RIP(%P[cap_byte]) "\n" + " testb %[bitnum], %a[cap_byte]\n" " jnz %l[t_yes]\n" " jmp %l[t_no]\n" ".popsection\n" @@ -208,7 +121,6 @@ t_no: boot_cpu_has(bit) : \ _static_cpu_has(bit) \ ) -#endif #define cpu_has_bug(c, bit) cpu_has(c, (bit)) #define set_cpu_bug(c, bit) set_cpu_cap(c, (bit)) @@ -225,5 +137,5 @@ t_no: #define CPU_FEATURE_TYPEVAL boot_cpu_data.x86_vendor, boot_cpu_data.x86, \ boot_cpu_data.x86_model -#endif /* defined(__KERNEL__) && !defined(__ASSEMBLY__) */ +#endif /* defined(__KERNEL__) && !defined(__ASSEMBLER__) */ #endif /* _ASM_X86_CPUFEATURE_H */ diff --git a/arch/x86/include/asm/cpufeatures.h b/arch/x86/include/asm/cpufeatures.h index 5fe7f6c8a7a4..ee176236c2be 100644 --- a/arch/x86/include/asm/cpufeatures.h +++ b/arch/x86/include/asm/cpufeatures.h @@ -2,188 +2,178 @@ #ifndef _ASM_X86_CPUFEATURES_H #define _ASM_X86_CPUFEATURES_H -#ifndef _ASM_X86_REQUIRED_FEATURES_H -#include <asm/required-features.h> -#endif - -#ifndef _ASM_X86_DISABLED_FEATURES_H -#include <asm/disabled-features.h> -#endif - /* * Defines x86 CPU feature bits */ -#define NCAPINTS 20 /* N 32-bit words worth of info */ -#define NBUGINTS 1 /* N 32-bit bug flags */ +#define NCAPINTS 22 /* N 32-bit words worth of info */ +#define NBUGINTS 2 /* N 32-bit bug flags */ /* * Note: If the comment begins with a quoted string, that string is used - * in /proc/cpuinfo instead of the macro name. 
If the string is "", - * this feature bit is not displayed in /proc/cpuinfo at all. + * in /proc/cpuinfo instead of the macro name. Otherwise, this feature + * bit is not displayed in /proc/cpuinfo at all. * * When adding new features here that depend on other features, * please update the table in kernel/cpu/cpuid-deps.c as well. */ /* Intel-defined CPU features, CPUID level 0x00000001 (EDX), word 0 */ -#define X86_FEATURE_FPU ( 0*32+ 0) /* Onboard FPU */ -#define X86_FEATURE_VME ( 0*32+ 1) /* Virtual Mode Extensions */ -#define X86_FEATURE_DE ( 0*32+ 2) /* Debugging Extensions */ -#define X86_FEATURE_PSE ( 0*32+ 3) /* Page Size Extensions */ -#define X86_FEATURE_TSC ( 0*32+ 4) /* Time Stamp Counter */ -#define X86_FEATURE_MSR ( 0*32+ 5) /* Model-Specific Registers */ -#define X86_FEATURE_PAE ( 0*32+ 6) /* Physical Address Extensions */ -#define X86_FEATURE_MCE ( 0*32+ 7) /* Machine Check Exception */ -#define X86_FEATURE_CX8 ( 0*32+ 8) /* CMPXCHG8 instruction */ -#define X86_FEATURE_APIC ( 0*32+ 9) /* Onboard APIC */ -#define X86_FEATURE_SEP ( 0*32+11) /* SYSENTER/SYSEXIT */ -#define X86_FEATURE_MTRR ( 0*32+12) /* Memory Type Range Registers */ -#define X86_FEATURE_PGE ( 0*32+13) /* Page Global Enable */ -#define X86_FEATURE_MCA ( 0*32+14) /* Machine Check Architecture */ -#define X86_FEATURE_CMOV ( 0*32+15) /* CMOV instructions (plus FCMOVcc, FCOMI with FPU) */ -#define X86_FEATURE_PAT ( 0*32+16) /* Page Attribute Table */ -#define X86_FEATURE_PSE36 ( 0*32+17) /* 36-bit PSEs */ -#define X86_FEATURE_PN ( 0*32+18) /* Processor serial number */ -#define X86_FEATURE_CLFLUSH ( 0*32+19) /* CLFLUSH instruction */ +#define X86_FEATURE_FPU ( 0*32+ 0) /* "fpu" Onboard FPU */ +#define X86_FEATURE_VME ( 0*32+ 1) /* "vme" Virtual Mode Extensions */ +#define X86_FEATURE_DE ( 0*32+ 2) /* "de" Debugging Extensions */ +#define X86_FEATURE_PSE ( 0*32+ 3) /* "pse" Page Size Extensions */ +#define X86_FEATURE_TSC ( 0*32+ 4) /* "tsc" Time Stamp Counter */ +#define X86_FEATURE_MSR ( 
0*32+ 5) /* "msr" Model-Specific Registers */ +#define X86_FEATURE_PAE ( 0*32+ 6) /* "pae" Physical Address Extensions */ +#define X86_FEATURE_MCE ( 0*32+ 7) /* "mce" Machine Check Exception */ +#define X86_FEATURE_CX8 ( 0*32+ 8) /* "cx8" CMPXCHG8 instruction */ +#define X86_FEATURE_APIC ( 0*32+ 9) /* "apic" Onboard APIC */ +#define X86_FEATURE_SEP ( 0*32+11) /* "sep" SYSENTER/SYSEXIT */ +#define X86_FEATURE_MTRR ( 0*32+12) /* "mtrr" Memory Type Range Registers */ +#define X86_FEATURE_PGE ( 0*32+13) /* "pge" Page Global Enable */ +#define X86_FEATURE_MCA ( 0*32+14) /* "mca" Machine Check Architecture */ +#define X86_FEATURE_CMOV ( 0*32+15) /* "cmov" CMOV instructions (plus FCMOVcc, FCOMI with FPU) */ +#define X86_FEATURE_PAT ( 0*32+16) /* "pat" Page Attribute Table */ +#define X86_FEATURE_PSE36 ( 0*32+17) /* "pse36" 36-bit PSEs */ +#define X86_FEATURE_PN ( 0*32+18) /* "pn" Processor serial number */ +#define X86_FEATURE_CLFLUSH ( 0*32+19) /* "clflush" CLFLUSH instruction */ #define X86_FEATURE_DS ( 0*32+21) /* "dts" Debug Store */ -#define X86_FEATURE_ACPI ( 0*32+22) /* ACPI via MSR */ -#define X86_FEATURE_MMX ( 0*32+23) /* Multimedia Extensions */ -#define X86_FEATURE_FXSR ( 0*32+24) /* FXSAVE/FXRSTOR, CR4.OSFXSR */ +#define X86_FEATURE_ACPI ( 0*32+22) /* "acpi" ACPI via MSR */ +#define X86_FEATURE_MMX ( 0*32+23) /* "mmx" Multimedia Extensions */ +#define X86_FEATURE_FXSR ( 0*32+24) /* "fxsr" FXSAVE/FXRSTOR, CR4.OSFXSR */ #define X86_FEATURE_XMM ( 0*32+25) /* "sse" */ #define X86_FEATURE_XMM2 ( 0*32+26) /* "sse2" */ #define X86_FEATURE_SELFSNOOP ( 0*32+27) /* "ss" CPU self snoop */ -#define X86_FEATURE_HT ( 0*32+28) /* Hyper-Threading */ +#define X86_FEATURE_HT ( 0*32+28) /* "ht" Hyper-Threading */ #define X86_FEATURE_ACC ( 0*32+29) /* "tm" Automatic clock control */ -#define X86_FEATURE_IA64 ( 0*32+30) /* IA-64 processor */ -#define X86_FEATURE_PBE ( 0*32+31) /* Pending Break Enable */ +#define X86_FEATURE_IA64 ( 0*32+30) /* "ia64" IA-64 processor */ +#define 
X86_FEATURE_PBE ( 0*32+31) /* "pbe" Pending Break Enable */ /* AMD-defined CPU features, CPUID level 0x80000001, word 1 */ /* Don't duplicate feature flags which are redundant with Intel! */ -#define X86_FEATURE_SYSCALL ( 1*32+11) /* SYSCALL/SYSRET */ -#define X86_FEATURE_MP ( 1*32+19) /* MP Capable */ -#define X86_FEATURE_NX ( 1*32+20) /* Execute Disable */ -#define X86_FEATURE_MMXEXT ( 1*32+22) /* AMD MMX extensions */ -#define X86_FEATURE_FXSR_OPT ( 1*32+25) /* FXSAVE/FXRSTOR optimizations */ +#define X86_FEATURE_SYSCALL ( 1*32+11) /* "syscall" SYSCALL/SYSRET */ +#define X86_FEATURE_MP ( 1*32+19) /* "mp" MP Capable */ +#define X86_FEATURE_NX ( 1*32+20) /* "nx" Execute Disable */ +#define X86_FEATURE_MMXEXT ( 1*32+22) /* "mmxext" AMD MMX extensions */ +#define X86_FEATURE_FXSR_OPT ( 1*32+25) /* "fxsr_opt" FXSAVE/FXRSTOR optimizations */ #define X86_FEATURE_GBPAGES ( 1*32+26) /* "pdpe1gb" GB pages */ -#define X86_FEATURE_RDTSCP ( 1*32+27) /* RDTSCP */ -#define X86_FEATURE_LM ( 1*32+29) /* Long Mode (x86-64, 64-bit support) */ -#define X86_FEATURE_3DNOWEXT ( 1*32+30) /* AMD 3DNow extensions */ -#define X86_FEATURE_3DNOW ( 1*32+31) /* 3DNow */ +#define X86_FEATURE_RDTSCP ( 1*32+27) /* "rdtscp" RDTSCP */ +#define X86_FEATURE_LM ( 1*32+29) /* "lm" Long Mode (x86-64, 64-bit support) */ +#define X86_FEATURE_3DNOWEXT ( 1*32+30) /* "3dnowext" AMD 3DNow extensions */ +#define X86_FEATURE_3DNOW ( 1*32+31) /* "3dnow" 3DNow */ /* Transmeta-defined CPU features, CPUID level 0x80860001, word 2 */ -#define X86_FEATURE_RECOVERY ( 2*32+ 0) /* CPU in recovery mode */ -#define X86_FEATURE_LONGRUN ( 2*32+ 1) /* Longrun power control */ -#define X86_FEATURE_LRTI ( 2*32+ 3) /* LongRun table interface */ +#define X86_FEATURE_RECOVERY ( 2*32+ 0) /* "recovery" CPU in recovery mode */ +#define X86_FEATURE_LONGRUN ( 2*32+ 1) /* "longrun" Longrun power control */ +#define X86_FEATURE_LRTI ( 2*32+ 3) /* "lrti" LongRun table interface */ /* Other features, Linux-defined mapping, word 3 */ /* 
This range is used for feature bits which conflict or are synthesized */ -#define X86_FEATURE_CXMMX ( 3*32+ 0) /* Cyrix MMX extensions */ -#define X86_FEATURE_K6_MTRR ( 3*32+ 1) /* AMD K6 nonstandard MTRRs */ -#define X86_FEATURE_CYRIX_ARR ( 3*32+ 2) /* Cyrix ARRs (= MTRRs) */ -#define X86_FEATURE_CENTAUR_MCR ( 3*32+ 3) /* Centaur MCRs (= MTRRs) */ - -/* CPU types for specific tunings: */ -#define X86_FEATURE_K8 ( 3*32+ 4) /* "" Opteron, Athlon64 */ -/* FREE, was #define X86_FEATURE_K7 ( 3*32+ 5) "" Athlon */ -#define X86_FEATURE_P3 ( 3*32+ 6) /* "" P3 */ -#define X86_FEATURE_P4 ( 3*32+ 7) /* "" P4 */ -#define X86_FEATURE_CONSTANT_TSC ( 3*32+ 8) /* TSC ticks at a constant rate */ -#define X86_FEATURE_UP ( 3*32+ 9) /* SMP kernel running on UP */ -#define X86_FEATURE_ART ( 3*32+10) /* Always running timer (ART) */ -#define X86_FEATURE_ARCH_PERFMON ( 3*32+11) /* Intel Architectural PerfMon */ -#define X86_FEATURE_PEBS ( 3*32+12) /* Precise-Event Based Sampling */ -#define X86_FEATURE_BTS ( 3*32+13) /* Branch Trace Store */ -#define X86_FEATURE_SYSCALL32 ( 3*32+14) /* "" syscall in IA32 userspace */ -#define X86_FEATURE_SYSENTER32 ( 3*32+15) /* "" sysenter in IA32 userspace */ -#define X86_FEATURE_REP_GOOD ( 3*32+16) /* REP microcode works well */ -/* FREE! 
( 3*32+17) */ -#define X86_FEATURE_LFENCE_RDTSC ( 3*32+18) /* "" LFENCE synchronizes RDTSC */ -#define X86_FEATURE_ACC_POWER ( 3*32+19) /* AMD Accumulated Power Mechanism */ -#define X86_FEATURE_NOPL ( 3*32+20) /* The NOPL (0F 1F) instructions */ -#define X86_FEATURE_ALWAYS ( 3*32+21) /* "" Always-present feature */ -#define X86_FEATURE_XTOPOLOGY ( 3*32+22) /* CPU topology enum extensions */ -#define X86_FEATURE_TSC_RELIABLE ( 3*32+23) /* TSC is known to be reliable */ -#define X86_FEATURE_NONSTOP_TSC ( 3*32+24) /* TSC does not stop in C states */ -#define X86_FEATURE_CPUID ( 3*32+25) /* CPU has CPUID instruction itself */ -#define X86_FEATURE_EXTD_APICID ( 3*32+26) /* Extended APICID (8 bits) */ -#define X86_FEATURE_AMD_DCM ( 3*32+27) /* AMD multi-node processor */ -#define X86_FEATURE_APERFMPERF ( 3*32+28) /* P-State hardware coordination feedback capability (APERF/MPERF MSRs) */ -#define X86_FEATURE_RAPL ( 3*32+29) /* AMD/Hygon RAPL interface */ -#define X86_FEATURE_NONSTOP_TSC_S3 ( 3*32+30) /* TSC doesn't stop in S3 state */ -#define X86_FEATURE_TSC_KNOWN_FREQ ( 3*32+31) /* TSC has known frequency */ +#define X86_FEATURE_CXMMX ( 3*32+ 0) /* "cxmmx" Cyrix MMX extensions */ +#define X86_FEATURE_K6_MTRR ( 3*32+ 1) /* "k6_mtrr" AMD K6 nonstandard MTRRs */ +#define X86_FEATURE_CYRIX_ARR ( 3*32+ 2) /* "cyrix_arr" Cyrix ARRs (= MTRRs) */ +#define X86_FEATURE_CENTAUR_MCR ( 3*32+ 3) /* "centaur_mcr" Centaur MCRs (= MTRRs) */ +#define X86_FEATURE_K8 ( 3*32+ 4) /* Opteron, Athlon64 */ +#define X86_FEATURE_ZEN5 ( 3*32+ 5) /* CPU based on Zen5 microarchitecture */ +#define X86_FEATURE_ZEN6 ( 3*32+ 6) /* CPU based on Zen6 microarchitecture */ +/* Free ( 3*32+ 7) */ +#define X86_FEATURE_CONSTANT_TSC ( 3*32+ 8) /* "constant_tsc" TSC ticks at a constant rate */ +#define X86_FEATURE_UP ( 3*32+ 9) /* "up" SMP kernel running on UP */ +#define X86_FEATURE_ART ( 3*32+10) /* "art" Always running timer (ART) */ +#define X86_FEATURE_ARCH_PERFMON ( 3*32+11) /* "arch_perfmon" Intel 
Architectural PerfMon */ +#define X86_FEATURE_PEBS ( 3*32+12) /* "pebs" Precise-Event Based Sampling */ +#define X86_FEATURE_BTS ( 3*32+13) /* "bts" Branch Trace Store */ +#define X86_FEATURE_SYSCALL32 ( 3*32+14) /* syscall in IA32 userspace */ +#define X86_FEATURE_SYSENTER32 ( 3*32+15) /* sysenter in IA32 userspace */ +#define X86_FEATURE_REP_GOOD ( 3*32+16) /* "rep_good" REP microcode works well */ +#define X86_FEATURE_AMD_LBR_V2 ( 3*32+17) /* "amd_lbr_v2" AMD Last Branch Record Extension Version 2 */ +#define X86_FEATURE_CLEAR_CPU_BUF ( 3*32+18) /* Clear CPU buffers using VERW */ +#define X86_FEATURE_ACC_POWER ( 3*32+19) /* "acc_power" AMD Accumulated Power Mechanism */ +#define X86_FEATURE_NOPL ( 3*32+20) /* "nopl" The NOPL (0F 1F) instructions */ +#define X86_FEATURE_ALWAYS ( 3*32+21) /* Always-present feature */ +#define X86_FEATURE_XTOPOLOGY ( 3*32+22) /* "xtopology" CPU topology enum extensions */ +#define X86_FEATURE_TSC_RELIABLE ( 3*32+23) /* "tsc_reliable" TSC is known to be reliable */ +#define X86_FEATURE_NONSTOP_TSC ( 3*32+24) /* "nonstop_tsc" TSC does not stop in C states */ +#define X86_FEATURE_CPUID ( 3*32+25) /* "cpuid" CPU has CPUID instruction itself */ +#define X86_FEATURE_EXTD_APICID ( 3*32+26) /* "extd_apicid" Extended APICID (8 bits) */ +#define X86_FEATURE_AMD_DCM ( 3*32+27) /* "amd_dcm" AMD multi-node processor */ +#define X86_FEATURE_APERFMPERF ( 3*32+28) /* "aperfmperf" P-State hardware coordination feedback capability (APERF/MPERF MSRs) */ +#define X86_FEATURE_RAPL ( 3*32+29) /* "rapl" AMD/Hygon RAPL interface */ +#define X86_FEATURE_NONSTOP_TSC_S3 ( 3*32+30) /* "nonstop_tsc_s3" TSC doesn't stop in S3 state */ +#define X86_FEATURE_TSC_KNOWN_FREQ ( 3*32+31) /* "tsc_known_freq" TSC has known frequency */ /* Intel-defined CPU features, CPUID level 0x00000001 (ECX), word 4 */ #define X86_FEATURE_XMM3 ( 4*32+ 0) /* "pni" SSE-3 */ -#define X86_FEATURE_PCLMULQDQ ( 4*32+ 1) /* PCLMULQDQ instruction */ -#define X86_FEATURE_DTES64 ( 4*32+ 2) /* 
64-bit Debug Store */ +#define X86_FEATURE_PCLMULQDQ ( 4*32+ 1) /* "pclmulqdq" PCLMULQDQ instruction */ +#define X86_FEATURE_DTES64 ( 4*32+ 2) /* "dtes64" 64-bit Debug Store */ #define X86_FEATURE_MWAIT ( 4*32+ 3) /* "monitor" MONITOR/MWAIT support */ #define X86_FEATURE_DSCPL ( 4*32+ 4) /* "ds_cpl" CPL-qualified (filtered) Debug Store */ -#define X86_FEATURE_VMX ( 4*32+ 5) /* Hardware virtualization */ -#define X86_FEATURE_SMX ( 4*32+ 6) /* Safer Mode eXtensions */ -#define X86_FEATURE_EST ( 4*32+ 7) /* Enhanced SpeedStep */ -#define X86_FEATURE_TM2 ( 4*32+ 8) /* Thermal Monitor 2 */ -#define X86_FEATURE_SSSE3 ( 4*32+ 9) /* Supplemental SSE-3 */ -#define X86_FEATURE_CID ( 4*32+10) /* Context ID */ -#define X86_FEATURE_SDBG ( 4*32+11) /* Silicon Debug */ -#define X86_FEATURE_FMA ( 4*32+12) /* Fused multiply-add */ -#define X86_FEATURE_CX16 ( 4*32+13) /* CMPXCHG16B instruction */ -#define X86_FEATURE_XTPR ( 4*32+14) /* Send Task Priority Messages */ -#define X86_FEATURE_PDCM ( 4*32+15) /* Perf/Debug Capabilities MSR */ -#define X86_FEATURE_PCID ( 4*32+17) /* Process Context Identifiers */ -#define X86_FEATURE_DCA ( 4*32+18) /* Direct Cache Access */ +#define X86_FEATURE_VMX ( 4*32+ 5) /* "vmx" Hardware virtualization */ +#define X86_FEATURE_SMX ( 4*32+ 6) /* "smx" Safer Mode eXtensions */ +#define X86_FEATURE_EST ( 4*32+ 7) /* "est" Enhanced SpeedStep */ +#define X86_FEATURE_TM2 ( 4*32+ 8) /* "tm2" Thermal Monitor 2 */ +#define X86_FEATURE_SSSE3 ( 4*32+ 9) /* "ssse3" Supplemental SSE-3 */ +#define X86_FEATURE_CID ( 4*32+10) /* "cid" Context ID */ +#define X86_FEATURE_SDBG ( 4*32+11) /* "sdbg" Silicon Debug */ +#define X86_FEATURE_FMA ( 4*32+12) /* "fma" Fused multiply-add */ +#define X86_FEATURE_CX16 ( 4*32+13) /* "cx16" CMPXCHG16B instruction */ +#define X86_FEATURE_XTPR ( 4*32+14) /* "xtpr" Send Task Priority Messages */ +#define X86_FEATURE_PDCM ( 4*32+15) /* "pdcm" Perf/Debug Capabilities MSR */ +#define X86_FEATURE_PCID ( 4*32+17) /* "pcid" Process Context 
Identifiers */ +#define X86_FEATURE_DCA ( 4*32+18) /* "dca" Direct Cache Access */ #define X86_FEATURE_XMM4_1 ( 4*32+19) /* "sse4_1" SSE-4.1 */ #define X86_FEATURE_XMM4_2 ( 4*32+20) /* "sse4_2" SSE-4.2 */ -#define X86_FEATURE_X2APIC ( 4*32+21) /* X2APIC */ -#define X86_FEATURE_MOVBE ( 4*32+22) /* MOVBE instruction */ -#define X86_FEATURE_POPCNT ( 4*32+23) /* POPCNT instruction */ -#define X86_FEATURE_TSC_DEADLINE_TIMER ( 4*32+24) /* TSC deadline timer */ -#define X86_FEATURE_AES ( 4*32+25) /* AES instructions */ -#define X86_FEATURE_XSAVE ( 4*32+26) /* XSAVE/XRSTOR/XSETBV/XGETBV instructions */ -#define X86_FEATURE_OSXSAVE ( 4*32+27) /* "" XSAVE instruction enabled in the OS */ -#define X86_FEATURE_AVX ( 4*32+28) /* Advanced Vector Extensions */ -#define X86_FEATURE_F16C ( 4*32+29) /* 16-bit FP conversions */ -#define X86_FEATURE_RDRAND ( 4*32+30) /* RDRAND instruction */ -#define X86_FEATURE_HYPERVISOR ( 4*32+31) /* Running on a hypervisor */ +#define X86_FEATURE_X2APIC ( 4*32+21) /* "x2apic" X2APIC */ +#define X86_FEATURE_MOVBE ( 4*32+22) /* "movbe" MOVBE instruction */ +#define X86_FEATURE_POPCNT ( 4*32+23) /* "popcnt" POPCNT instruction */ +#define X86_FEATURE_TSC_DEADLINE_TIMER ( 4*32+24) /* "tsc_deadline_timer" TSC deadline timer */ +#define X86_FEATURE_AES ( 4*32+25) /* "aes" AES instructions */ +#define X86_FEATURE_XSAVE ( 4*32+26) /* "xsave" XSAVE/XRSTOR/XSETBV/XGETBV instructions */ +#define X86_FEATURE_OSXSAVE ( 4*32+27) /* XSAVE instruction enabled in the OS */ +#define X86_FEATURE_AVX ( 4*32+28) /* "avx" Advanced Vector Extensions */ +#define X86_FEATURE_F16C ( 4*32+29) /* "f16c" 16-bit FP conversions */ +#define X86_FEATURE_RDRAND ( 4*32+30) /* "rdrand" RDRAND instruction */ +#define X86_FEATURE_HYPERVISOR ( 4*32+31) /* "hypervisor" Running on a hypervisor */ /* VIA/Cyrix/Centaur-defined CPU features, CPUID level 0xC0000001, word 5 */ #define X86_FEATURE_XSTORE ( 5*32+ 2) /* "rng" RNG present (xstore) */ #define X86_FEATURE_XSTORE_EN ( 5*32+ 3) /* 
"rng_en" RNG enabled */ #define X86_FEATURE_XCRYPT ( 5*32+ 6) /* "ace" on-CPU crypto (xcrypt) */ #define X86_FEATURE_XCRYPT_EN ( 5*32+ 7) /* "ace_en" on-CPU crypto enabled */ -#define X86_FEATURE_ACE2 ( 5*32+ 8) /* Advanced Cryptography Engine v2 */ -#define X86_FEATURE_ACE2_EN ( 5*32+ 9) /* ACE v2 enabled */ -#define X86_FEATURE_PHE ( 5*32+10) /* PadLock Hash Engine */ -#define X86_FEATURE_PHE_EN ( 5*32+11) /* PHE enabled */ -#define X86_FEATURE_PMM ( 5*32+12) /* PadLock Montgomery Multiplier */ -#define X86_FEATURE_PMM_EN ( 5*32+13) /* PMM enabled */ +#define X86_FEATURE_ACE2 ( 5*32+ 8) /* "ace2" Advanced Cryptography Engine v2 */ +#define X86_FEATURE_ACE2_EN ( 5*32+ 9) /* "ace2_en" ACE v2 enabled */ +#define X86_FEATURE_PHE ( 5*32+10) /* "phe" PadLock Hash Engine */ +#define X86_FEATURE_PHE_EN ( 5*32+11) /* "phe_en" PHE enabled */ +#define X86_FEATURE_PMM ( 5*32+12) /* "pmm" PadLock Montgomery Multiplier */ +#define X86_FEATURE_PMM_EN ( 5*32+13) /* "pmm_en" PMM enabled */ /* More extended AMD flags: CPUID level 0x80000001, ECX, word 6 */ -#define X86_FEATURE_LAHF_LM ( 6*32+ 0) /* LAHF/SAHF in long mode */ -#define X86_FEATURE_CMP_LEGACY ( 6*32+ 1) /* If yes HyperThreading not valid */ -#define X86_FEATURE_SVM ( 6*32+ 2) /* Secure Virtual Machine */ -#define X86_FEATURE_EXTAPIC ( 6*32+ 3) /* Extended APIC space */ -#define X86_FEATURE_CR8_LEGACY ( 6*32+ 4) /* CR8 in 32-bit mode */ -#define X86_FEATURE_ABM ( 6*32+ 5) /* Advanced bit manipulation */ -#define X86_FEATURE_SSE4A ( 6*32+ 6) /* SSE-4A */ -#define X86_FEATURE_MISALIGNSSE ( 6*32+ 7) /* Misaligned SSE mode */ -#define X86_FEATURE_3DNOWPREFETCH ( 6*32+ 8) /* 3DNow prefetch instructions */ -#define X86_FEATURE_OSVW ( 6*32+ 9) /* OS Visible Workaround */ -#define X86_FEATURE_IBS ( 6*32+10) /* Instruction Based Sampling */ -#define X86_FEATURE_XOP ( 6*32+11) /* extended AVX instructions */ -#define X86_FEATURE_SKINIT ( 6*32+12) /* SKINIT/STGI instructions */ -#define X86_FEATURE_WDT ( 6*32+13) /* Watchdog 
timer */ -#define X86_FEATURE_LWP ( 6*32+15) /* Light Weight Profiling */ -#define X86_FEATURE_FMA4 ( 6*32+16) /* 4 operands MAC instructions */ -#define X86_FEATURE_TCE ( 6*32+17) /* Translation Cache Extension */ -#define X86_FEATURE_NODEID_MSR ( 6*32+19) /* NodeId MSR */ -#define X86_FEATURE_TBM ( 6*32+21) /* Trailing Bit Manipulations */ -#define X86_FEATURE_TOPOEXT ( 6*32+22) /* Topology extensions CPUID leafs */ -#define X86_FEATURE_PERFCTR_CORE ( 6*32+23) /* Core performance counter extensions */ -#define X86_FEATURE_PERFCTR_NB ( 6*32+24) /* NB performance counter extensions */ -#define X86_FEATURE_BPEXT ( 6*32+26) /* Data breakpoint extension */ -#define X86_FEATURE_PTSC ( 6*32+27) /* Performance time-stamp counter */ -#define X86_FEATURE_PERFCTR_LLC ( 6*32+28) /* Last Level Cache performance counter extensions */ -#define X86_FEATURE_MWAITX ( 6*32+29) /* MWAIT extension (MONITORX/MWAITX instructions) */ +#define X86_FEATURE_LAHF_LM ( 6*32+ 0) /* "lahf_lm" LAHF/SAHF in long mode */ +#define X86_FEATURE_CMP_LEGACY ( 6*32+ 1) /* "cmp_legacy" If yes HyperThreading not valid */ +#define X86_FEATURE_SVM ( 6*32+ 2) /* "svm" Secure Virtual Machine */ +#define X86_FEATURE_EXTAPIC ( 6*32+ 3) /* "extapic" Extended APIC space */ +#define X86_FEATURE_CR8_LEGACY ( 6*32+ 4) /* "cr8_legacy" CR8 in 32-bit mode */ +#define X86_FEATURE_ABM ( 6*32+ 5) /* "abm" Advanced bit manipulation */ +#define X86_FEATURE_SSE4A ( 6*32+ 6) /* "sse4a" SSE-4A */ +#define X86_FEATURE_MISALIGNSSE ( 6*32+ 7) /* "misalignsse" Misaligned SSE mode */ +#define X86_FEATURE_3DNOWPREFETCH ( 6*32+ 8) /* "3dnowprefetch" 3DNow prefetch instructions */ +#define X86_FEATURE_OSVW ( 6*32+ 9) /* "osvw" OS Visible Workaround */ +#define X86_FEATURE_IBS ( 6*32+10) /* "ibs" Instruction Based Sampling */ +#define X86_FEATURE_XOP ( 6*32+11) /* "xop" Extended AVX instructions */ +#define X86_FEATURE_SKINIT ( 6*32+12) /* "skinit" SKINIT/STGI instructions */ +#define X86_FEATURE_WDT ( 6*32+13) /* "wdt" Watchdog timer 
*/ +#define X86_FEATURE_LWP ( 6*32+15) /* "lwp" Light Weight Profiling */ +#define X86_FEATURE_FMA4 ( 6*32+16) /* "fma4" 4 operands MAC instructions */ +#define X86_FEATURE_TCE ( 6*32+17) /* "tce" Translation Cache Extension */ +#define X86_FEATURE_NODEID_MSR ( 6*32+19) /* "nodeid_msr" NodeId MSR */ +#define X86_FEATURE_TBM ( 6*32+21) /* "tbm" Trailing Bit Manipulations */ +#define X86_FEATURE_TOPOEXT ( 6*32+22) /* "topoext" Topology extensions CPUID leafs */ +#define X86_FEATURE_PERFCTR_CORE ( 6*32+23) /* "perfctr_core" Core performance counter extensions */ +#define X86_FEATURE_PERFCTR_NB ( 6*32+24) /* "perfctr_nb" NB performance counter extensions */ +#define X86_FEATURE_BPEXT ( 6*32+26) /* "bpext" Data breakpoint extension */ +#define X86_FEATURE_PTSC ( 6*32+27) /* "ptsc" Performance time-stamp counter */ +#define X86_FEATURE_PERFCTR_LLC ( 6*32+28) /* "perfctr_llc" Last Level Cache performance counter extensions */ +#define X86_FEATURE_MWAITX ( 6*32+29) /* "mwaitx" MWAIT extension (MONITORX/MWAITX instructions) */ /* * Auxiliary flags: Linux defined - For features scattered in various @@ -191,94 +181,92 @@ * * Reuse free bits when adding new feature flags! 
*/ -#define X86_FEATURE_RING3MWAIT ( 7*32+ 0) /* Ring 3 MONITOR/MWAIT instructions */ -#define X86_FEATURE_CPUID_FAULT ( 7*32+ 1) /* Intel CPUID faulting */ -#define X86_FEATURE_CPB ( 7*32+ 2) /* AMD Core Performance Boost */ -#define X86_FEATURE_EPB ( 7*32+ 3) /* IA32_ENERGY_PERF_BIAS support */ -#define X86_FEATURE_CAT_L3 ( 7*32+ 4) /* Cache Allocation Technology L3 */ -#define X86_FEATURE_CAT_L2 ( 7*32+ 5) /* Cache Allocation Technology L2 */ -#define X86_FEATURE_CDP_L3 ( 7*32+ 6) /* Code and Data Prioritization L3 */ -#define X86_FEATURE_INVPCID_SINGLE ( 7*32+ 7) /* Effectively INVPCID && CR4.PCIDE=1 */ -#define X86_FEATURE_HW_PSTATE ( 7*32+ 8) /* AMD HW-PState */ -#define X86_FEATURE_PROC_FEEDBACK ( 7*32+ 9) /* AMD ProcFeedbackInterface */ -#define X86_FEATURE_XCOMPACTED ( 7*32+10) /* "" Use compacted XSTATE (XSAVES or XSAVEC) */ -#define X86_FEATURE_PTI ( 7*32+11) /* Kernel Page Table Isolation enabled */ -#define X86_FEATURE_KERNEL_IBRS ( 7*32+12) /* "" Set/clear IBRS on kernel entry/exit */ -#define X86_FEATURE_RSB_VMEXIT ( 7*32+13) /* "" Fill RSB on VM-Exit */ -#define X86_FEATURE_INTEL_PPIN ( 7*32+14) /* Intel Processor Inventory Number */ -#define X86_FEATURE_CDP_L2 ( 7*32+15) /* Code and Data Prioritization L2 */ -#define X86_FEATURE_MSR_SPEC_CTRL ( 7*32+16) /* "" MSR SPEC_CTRL is implemented */ -#define X86_FEATURE_SSBD ( 7*32+17) /* Speculative Store Bypass Disable */ -#define X86_FEATURE_MBA ( 7*32+18) /* Memory Bandwidth Allocation */ -#define X86_FEATURE_RSB_CTXSW ( 7*32+19) /* "" Fill RSB on context switches */ -#define X86_FEATURE_PERFMON_V2 ( 7*32+20) /* AMD Performance Monitoring Version 2 */ -#define X86_FEATURE_USE_IBPB ( 7*32+21) /* "" Indirect Branch Prediction Barrier enabled */ -#define X86_FEATURE_USE_IBRS_FW ( 7*32+22) /* "" Use IBRS during runtime firmware calls */ -#define X86_FEATURE_SPEC_STORE_BYPASS_DISABLE ( 7*32+23) /* "" Disable Speculative Store Bypass. 
*/ -#define X86_FEATURE_LS_CFG_SSBD ( 7*32+24) /* "" AMD SSBD implementation via LS_CFG MSR */ -#define X86_FEATURE_IBRS ( 7*32+25) /* Indirect Branch Restricted Speculation */ -#define X86_FEATURE_IBPB ( 7*32+26) /* Indirect Branch Prediction Barrier */ -#define X86_FEATURE_STIBP ( 7*32+27) /* Single Thread Indirect Branch Predictors */ -#define X86_FEATURE_ZEN (7*32+28) /* "" CPU based on Zen microarchitecture */ -#define X86_FEATURE_L1TF_PTEINV ( 7*32+29) /* "" L1TF workaround PTE inversion */ -#define X86_FEATURE_IBRS_ENHANCED ( 7*32+30) /* Enhanced IBRS */ -#define X86_FEATURE_MSR_IA32_FEAT_CTL ( 7*32+31) /* "" MSR IA32_FEAT_CTL configured */ +#define X86_FEATURE_RING3MWAIT ( 7*32+ 0) /* "ring3mwait" Ring 3 MONITOR/MWAIT instructions */ +#define X86_FEATURE_CPUID_FAULT ( 7*32+ 1) /* "cpuid_fault" Intel CPUID faulting */ +#define X86_FEATURE_CPB ( 7*32+ 2) /* "cpb" AMD Core Performance Boost */ +#define X86_FEATURE_EPB ( 7*32+ 3) /* "epb" IA32_ENERGY_PERF_BIAS support */ +#define X86_FEATURE_CAT_L3 ( 7*32+ 4) /* "cat_l3" Cache Allocation Technology L3 */ +#define X86_FEATURE_CAT_L2 ( 7*32+ 5) /* "cat_l2" Cache Allocation Technology L2 */ +#define X86_FEATURE_CDP_L3 ( 7*32+ 6) /* "cdp_l3" Code and Data Prioritization L3 */ +#define X86_FEATURE_TDX_HOST_PLATFORM ( 7*32+ 7) /* "tdx_host_platform" Platform supports being a TDX host */ +#define X86_FEATURE_HW_PSTATE ( 7*32+ 8) /* "hw_pstate" AMD HW-PState */ +#define X86_FEATURE_PROC_FEEDBACK ( 7*32+ 9) /* "proc_feedback" AMD ProcFeedbackInterface */ +#define X86_FEATURE_XCOMPACTED ( 7*32+10) /* Use compacted XSTATE (XSAVES or XSAVEC) */ +#define X86_FEATURE_PTI ( 7*32+11) /* "pti" Kernel Page Table Isolation enabled */ +#define X86_FEATURE_KERNEL_IBRS ( 7*32+12) /* Set/clear IBRS on kernel entry/exit */ +#define X86_FEATURE_RSB_VMEXIT ( 7*32+13) /* Fill RSB on VM-Exit */ +#define X86_FEATURE_INTEL_PPIN ( 7*32+14) /* "intel_ppin" Intel Processor Inventory Number */ +#define X86_FEATURE_CDP_L2 ( 7*32+15) /* "cdp_l2" 
Code and Data Prioritization L2 */ +#define X86_FEATURE_MSR_SPEC_CTRL ( 7*32+16) /* MSR SPEC_CTRL is implemented */ +#define X86_FEATURE_SSBD ( 7*32+17) /* "ssbd" Speculative Store Bypass Disable */ +#define X86_FEATURE_MBA ( 7*32+18) /* "mba" Memory Bandwidth Allocation */ +#define X86_FEATURE_RSB_CTXSW ( 7*32+19) /* Fill RSB on context switches */ +#define X86_FEATURE_PERFMON_V2 ( 7*32+20) /* "perfmon_v2" AMD Performance Monitoring Version 2 */ +#define X86_FEATURE_USE_IBRS_FW ( 7*32+22) /* Use IBRS during runtime firmware calls */ +#define X86_FEATURE_SPEC_STORE_BYPASS_DISABLE ( 7*32+23) /* Disable Speculative Store Bypass. */ +#define X86_FEATURE_LS_CFG_SSBD ( 7*32+24) /* AMD SSBD implementation via LS_CFG MSR */ +#define X86_FEATURE_IBRS ( 7*32+25) /* "ibrs" Indirect Branch Restricted Speculation */ +#define X86_FEATURE_IBPB ( 7*32+26) /* "ibpb" Indirect Branch Prediction Barrier without a guaranteed RSB flush */ +#define X86_FEATURE_STIBP ( 7*32+27) /* "stibp" Single Thread Indirect Branch Predictors */ +#define X86_FEATURE_ZEN ( 7*32+28) /* Generic flag for all Zen and newer */ +#define X86_FEATURE_L1TF_PTEINV ( 7*32+29) /* L1TF workaround PTE inversion */ +#define X86_FEATURE_IBRS_ENHANCED ( 7*32+30) /* "ibrs_enhanced" Enhanced IBRS */ +#define X86_FEATURE_MSR_IA32_FEAT_CTL ( 7*32+31) /* MSR IA32_FEAT_CTL configured */ /* Virtualization flags: Linux defined, word 8 */ -#define X86_FEATURE_TPR_SHADOW ( 8*32+ 0) /* Intel TPR Shadow */ -#define X86_FEATURE_VNMI ( 8*32+ 1) /* Intel Virtual NMI */ -#define X86_FEATURE_FLEXPRIORITY ( 8*32+ 2) /* Intel FlexPriority */ -#define X86_FEATURE_EPT ( 8*32+ 3) /* Intel Extended Page Table */ -#define X86_FEATURE_VPID ( 8*32+ 4) /* Intel Virtual Processor ID */ +#define X86_FEATURE_TPR_SHADOW ( 8*32+ 0) /* "tpr_shadow" Intel TPR Shadow */ +#define X86_FEATURE_FLEXPRIORITY ( 8*32+ 1) /* "flexpriority" Intel FlexPriority */ +#define X86_FEATURE_EPT ( 8*32+ 2) /* "ept" Intel Extended Page Table */ +#define X86_FEATURE_VPID ( 
8*32+ 3) /* "vpid" Intel Virtual Processor ID */ -#define X86_FEATURE_VMMCALL ( 8*32+15) /* Prefer VMMCALL to VMCALL */ -#define X86_FEATURE_XENPV ( 8*32+16) /* "" Xen paravirtual guest */ -#define X86_FEATURE_EPT_AD ( 8*32+17) /* Intel Extended Page Table access-dirty bit */ -#define X86_FEATURE_VMCALL ( 8*32+18) /* "" Hypervisor supports the VMCALL instruction */ -#define X86_FEATURE_VMW_VMMCALL ( 8*32+19) /* "" VMware prefers VMMCALL hypercall instruction */ -#define X86_FEATURE_PVUNLOCK ( 8*32+20) /* "" PV unlock function */ -#define X86_FEATURE_VCPUPREEMPT ( 8*32+21) /* "" PV vcpu_is_preempted function */ -#define X86_FEATURE_TDX_GUEST ( 8*32+22) /* Intel Trust Domain Extensions Guest */ +#define X86_FEATURE_VMMCALL ( 8*32+15) /* "vmmcall" Prefer VMMCALL to VMCALL */ +#define X86_FEATURE_XENPV ( 8*32+16) /* Xen paravirtual guest */ +#define X86_FEATURE_EPT_AD ( 8*32+17) /* "ept_ad" Intel Extended Page Table access-dirty bit */ +#define X86_FEATURE_VMCALL ( 8*32+18) /* Hypervisor supports the VMCALL instruction */ +#define X86_FEATURE_VMW_VMMCALL ( 8*32+19) /* VMware prefers VMMCALL hypercall instruction */ +#define X86_FEATURE_PVUNLOCK ( 8*32+20) /* PV unlock function */ +#define X86_FEATURE_VCPUPREEMPT ( 8*32+21) /* PV vcpu_is_preempted function */ +#define X86_FEATURE_TDX_GUEST ( 8*32+22) /* "tdx_guest" Intel Trust Domain Extensions Guest */ /* Intel-defined CPU features, CPUID level 0x00000007:0 (EBX), word 9 */ -#define X86_FEATURE_FSGSBASE ( 9*32+ 0) /* RDFSBASE, WRFSBASE, RDGSBASE, WRGSBASE instructions*/ -#define X86_FEATURE_TSC_ADJUST ( 9*32+ 1) /* TSC adjustment MSR 0x3B */ -#define X86_FEATURE_SGX ( 9*32+ 2) /* Software Guard Extensions */ -#define X86_FEATURE_BMI1 ( 9*32+ 3) /* 1st group bit manipulation extensions */ -#define X86_FEATURE_HLE ( 9*32+ 4) /* Hardware Lock Elision */ -#define X86_FEATURE_AVX2 ( 9*32+ 5) /* AVX2 instructions */ -#define X86_FEATURE_FDP_EXCPTN_ONLY ( 9*32+ 6) /* "" FPU data pointer updated only on x87 exceptions */ 
-#define X86_FEATURE_SMEP ( 9*32+ 7) /* Supervisor Mode Execution Protection */ -#define X86_FEATURE_BMI2 ( 9*32+ 8) /* 2nd group bit manipulation extensions */ -#define X86_FEATURE_ERMS ( 9*32+ 9) /* Enhanced REP MOVSB/STOSB instructions */ -#define X86_FEATURE_INVPCID ( 9*32+10) /* Invalidate Processor Context ID */ -#define X86_FEATURE_RTM ( 9*32+11) /* Restricted Transactional Memory */ -#define X86_FEATURE_CQM ( 9*32+12) /* Cache QoS Monitoring */ -#define X86_FEATURE_ZERO_FCS_FDS ( 9*32+13) /* "" Zero out FPU CS and FPU DS */ -#define X86_FEATURE_MPX ( 9*32+14) /* Memory Protection Extension */ -#define X86_FEATURE_RDT_A ( 9*32+15) /* Resource Director Technology Allocation */ -#define X86_FEATURE_AVX512F ( 9*32+16) /* AVX-512 Foundation */ -#define X86_FEATURE_AVX512DQ ( 9*32+17) /* AVX-512 DQ (Double/Quad granular) Instructions */ -#define X86_FEATURE_RDSEED ( 9*32+18) /* RDSEED instruction */ -#define X86_FEATURE_ADX ( 9*32+19) /* ADCX and ADOX instructions */ -#define X86_FEATURE_SMAP ( 9*32+20) /* Supervisor Mode Access Prevention */ -#define X86_FEATURE_AVX512IFMA ( 9*32+21) /* AVX-512 Integer Fused Multiply-Add instructions */ -#define X86_FEATURE_CLFLUSHOPT ( 9*32+23) /* CLFLUSHOPT instruction */ -#define X86_FEATURE_CLWB ( 9*32+24) /* CLWB instruction */ -#define X86_FEATURE_INTEL_PT ( 9*32+25) /* Intel Processor Trace */ -#define X86_FEATURE_AVX512PF ( 9*32+26) /* AVX-512 Prefetch */ -#define X86_FEATURE_AVX512ER ( 9*32+27) /* AVX-512 Exponential and Reciprocal */ -#define X86_FEATURE_AVX512CD ( 9*32+28) /* AVX-512 Conflict Detection */ -#define X86_FEATURE_SHA_NI ( 9*32+29) /* SHA1/SHA256 Instruction Extensions */ -#define X86_FEATURE_AVX512BW ( 9*32+30) /* AVX-512 BW (Byte/Word granular) Instructions */ -#define X86_FEATURE_AVX512VL ( 9*32+31) /* AVX-512 VL (128/256 Vector Length) Extensions */ +#define X86_FEATURE_FSGSBASE ( 9*32+ 0) /* "fsgsbase" RDFSBASE, WRFSBASE, RDGSBASE, WRGSBASE instructions*/ +#define X86_FEATURE_TSC_ADJUST ( 9*32+ 1) /* 
"tsc_adjust" TSC adjustment MSR 0x3B */ +#define X86_FEATURE_SGX ( 9*32+ 2) /* "sgx" Software Guard Extensions */ +#define X86_FEATURE_BMI1 ( 9*32+ 3) /* "bmi1" 1st group bit manipulation extensions */ +#define X86_FEATURE_HLE ( 9*32+ 4) /* "hle" Hardware Lock Elision */ +#define X86_FEATURE_AVX2 ( 9*32+ 5) /* "avx2" AVX2 instructions */ +#define X86_FEATURE_FDP_EXCPTN_ONLY ( 9*32+ 6) /* FPU data pointer updated only on x87 exceptions */ +#define X86_FEATURE_SMEP ( 9*32+ 7) /* "smep" Supervisor Mode Execution Protection */ +#define X86_FEATURE_BMI2 ( 9*32+ 8) /* "bmi2" 2nd group bit manipulation extensions */ +#define X86_FEATURE_ERMS ( 9*32+ 9) /* "erms" Enhanced REP MOVSB/STOSB instructions */ +#define X86_FEATURE_INVPCID ( 9*32+10) /* "invpcid" Invalidate Processor Context ID */ +#define X86_FEATURE_RTM ( 9*32+11) /* "rtm" Restricted Transactional Memory */ +#define X86_FEATURE_CQM ( 9*32+12) /* "cqm" Cache QoS Monitoring */ +#define X86_FEATURE_ZERO_FCS_FDS ( 9*32+13) /* Zero out FPU CS and FPU DS */ +#define X86_FEATURE_MPX ( 9*32+14) /* "mpx" Memory Protection Extension */ +#define X86_FEATURE_RDT_A ( 9*32+15) /* "rdt_a" Resource Director Technology Allocation */ +#define X86_FEATURE_AVX512F ( 9*32+16) /* "avx512f" AVX-512 Foundation */ +#define X86_FEATURE_AVX512DQ ( 9*32+17) /* "avx512dq" AVX-512 DQ (Double/Quad granular) Instructions */ +#define X86_FEATURE_RDSEED ( 9*32+18) /* "rdseed" RDSEED instruction */ +#define X86_FEATURE_ADX ( 9*32+19) /* "adx" ADCX and ADOX instructions */ +#define X86_FEATURE_SMAP ( 9*32+20) /* "smap" Supervisor Mode Access Prevention */ +#define X86_FEATURE_AVX512IFMA ( 9*32+21) /* "avx512ifma" AVX-512 Integer Fused Multiply-Add instructions */ +#define X86_FEATURE_CLFLUSHOPT ( 9*32+23) /* "clflushopt" CLFLUSHOPT instruction */ +#define X86_FEATURE_CLWB ( 9*32+24) /* "clwb" CLWB instruction */ +#define X86_FEATURE_INTEL_PT ( 9*32+25) /* "intel_pt" Intel Processor Trace */ +#define X86_FEATURE_AVX512PF ( 9*32+26) /* "avx512pf" 
AVX-512 Prefetch */ +#define X86_FEATURE_AVX512ER ( 9*32+27) /* "avx512er" AVX-512 Exponential and Reciprocal */ +#define X86_FEATURE_AVX512CD ( 9*32+28) /* "avx512cd" AVX-512 Conflict Detection */ +#define X86_FEATURE_SHA_NI ( 9*32+29) /* "sha_ni" SHA1/SHA256 Instruction Extensions */ +#define X86_FEATURE_AVX512BW ( 9*32+30) /* "avx512bw" AVX-512 BW (Byte/Word granular) Instructions */ +#define X86_FEATURE_AVX512VL ( 9*32+31) /* "avx512vl" AVX-512 VL (128/256 Vector Length) Extensions */ /* Extended state features, CPUID level 0x0000000d:1 (EAX), word 10 */ -#define X86_FEATURE_XSAVEOPT (10*32+ 0) /* XSAVEOPT instruction */ -#define X86_FEATURE_XSAVEC (10*32+ 1) /* XSAVEC instruction */ -#define X86_FEATURE_XGETBV1 (10*32+ 2) /* XGETBV with ECX = 1 instruction */ -#define X86_FEATURE_XSAVES (10*32+ 3) /* XSAVES/XRSTORS instructions */ -#define X86_FEATURE_XFD (10*32+ 4) /* "" eXtended Feature Disabling */ +#define X86_FEATURE_XSAVEOPT (10*32+ 0) /* "xsaveopt" XSAVEOPT instruction */ +#define X86_FEATURE_XSAVEC (10*32+ 1) /* "xsavec" XSAVEC instruction */ +#define X86_FEATURE_XGETBV1 (10*32+ 2) /* "xgetbv1" XGETBV with ECX = 1 instruction */ +#define X86_FEATURE_XSAVES (10*32+ 3) /* "xsaves" XSAVES/XRSTORS instructions */ +#define X86_FEATURE_XFD (10*32+ 4) /* eXtended Feature Disabling */ /* * Extended auxiliary flags: Linux defined - for features scattered in various @@ -286,175 +274,273 @@ * * Reuse free bits when adding new feature flags! 
*/ -#define X86_FEATURE_CQM_LLC (11*32+ 0) /* LLC QoS if 1 */ -#define X86_FEATURE_CQM_OCCUP_LLC (11*32+ 1) /* LLC occupancy monitoring */ -#define X86_FEATURE_CQM_MBM_TOTAL (11*32+ 2) /* LLC Total MBM monitoring */ -#define X86_FEATURE_CQM_MBM_LOCAL (11*32+ 3) /* LLC Local MBM monitoring */ -#define X86_FEATURE_FENCE_SWAPGS_USER (11*32+ 4) /* "" LFENCE in user entry SWAPGS path */ -#define X86_FEATURE_FENCE_SWAPGS_KERNEL (11*32+ 5) /* "" LFENCE in kernel entry SWAPGS path */ -#define X86_FEATURE_SPLIT_LOCK_DETECT (11*32+ 6) /* #AC for split lock */ -#define X86_FEATURE_PER_THREAD_MBA (11*32+ 7) /* "" Per-thread Memory Bandwidth Allocation */ -#define X86_FEATURE_SGX1 (11*32+ 8) /* "" Basic SGX */ -#define X86_FEATURE_SGX2 (11*32+ 9) /* "" SGX Enclave Dynamic Memory Management (EDMM) */ -#define X86_FEATURE_ENTRY_IBPB (11*32+10) /* "" Issue an IBPB on kernel entry */ -#define X86_FEATURE_RRSBA_CTRL (11*32+11) /* "" RET prediction control */ -#define X86_FEATURE_RETPOLINE (11*32+12) /* "" Generic Retpoline mitigation for Spectre variant 2 */ -#define X86_FEATURE_RETPOLINE_LFENCE (11*32+13) /* "" Use LFENCE for Spectre variant 2 */ -#define X86_FEATURE_RETHUNK (11*32+14) /* "" Use REturn THUNK */ -#define X86_FEATURE_UNRET (11*32+15) /* "" AMD BTB untrain return */ -#define X86_FEATURE_USE_IBPB_FW (11*32+16) /* "" Use IBPB during runtime firmware calls */ +#define X86_FEATURE_CQM_LLC (11*32+ 0) /* "cqm_llc" LLC QoS if 1 */ +#define X86_FEATURE_CQM_OCCUP_LLC (11*32+ 1) /* "cqm_occup_llc" LLC occupancy monitoring */ +#define X86_FEATURE_CQM_MBM_TOTAL (11*32+ 2) /* "cqm_mbm_total" LLC Total MBM monitoring */ +#define X86_FEATURE_CQM_MBM_LOCAL (11*32+ 3) /* "cqm_mbm_local" LLC Local MBM monitoring */ +#define X86_FEATURE_FENCE_SWAPGS_USER (11*32+ 4) /* LFENCE in user entry SWAPGS path */ +#define X86_FEATURE_FENCE_SWAPGS_KERNEL (11*32+ 5) /* LFENCE in kernel entry SWAPGS path */ +#define X86_FEATURE_SPLIT_LOCK_DETECT (11*32+ 6) /* "split_lock_detect" #AC for split lock 
*/ +#define X86_FEATURE_PER_THREAD_MBA (11*32+ 7) /* Per-thread Memory Bandwidth Allocation */ +#define X86_FEATURE_SGX1 (11*32+ 8) /* Basic SGX */ +#define X86_FEATURE_SGX2 (11*32+ 9) /* SGX Enclave Dynamic Memory Management (EDMM) */ +#define X86_FEATURE_ENTRY_IBPB (11*32+10) /* Issue an IBPB on kernel entry */ +#define X86_FEATURE_RRSBA_CTRL (11*32+11) /* RET prediction control */ +#define X86_FEATURE_RETPOLINE (11*32+12) /* Generic Retpoline mitigation for Spectre variant 2 */ +#define X86_FEATURE_RETPOLINE_LFENCE (11*32+13) /* Use LFENCE for Spectre variant 2 */ +#define X86_FEATURE_RETHUNK (11*32+14) /* Use REturn THUNK */ +#define X86_FEATURE_UNRET (11*32+15) /* AMD BTB untrain return */ +#define X86_FEATURE_USE_IBPB_FW (11*32+16) /* Use IBPB during runtime firmware calls */ +#define X86_FEATURE_RSB_VMEXIT_LITE (11*32+17) /* Fill RSB on VM exit when EIBRS is enabled */ +#define X86_FEATURE_SGX_EDECCSSA (11*32+18) /* SGX EDECCSSA user leaf function */ +#define X86_FEATURE_CALL_DEPTH (11*32+19) /* Call depth tracking for RSB stuffing */ +#define X86_FEATURE_MSR_TSX_CTRL (11*32+20) /* MSR IA32_TSX_CTRL (Intel) implemented */ +#define X86_FEATURE_SMBA (11*32+21) /* Slow Memory Bandwidth Allocation */ +#define X86_FEATURE_BMEC (11*32+22) /* Bandwidth Monitoring Event Configuration */ +#define X86_FEATURE_USER_SHSTK (11*32+23) /* "user_shstk" Shadow stack support for user mode applications */ +#define X86_FEATURE_SRSO (11*32+24) /* AMD BTB untrain RETs */ +#define X86_FEATURE_SRSO_ALIAS (11*32+25) /* AMD BTB untrain RETs through aliasing */ +#define X86_FEATURE_IBPB_ON_VMEXIT (11*32+26) /* Issue an IBPB only on VMEXIT */ +#define X86_FEATURE_APIC_MSRS_FENCE (11*32+27) /* IA32_TSC_DEADLINE and X2APIC MSRs need fencing */ +#define X86_FEATURE_ZEN2 (11*32+28) /* CPU based on Zen2 microarchitecture */ +#define X86_FEATURE_ZEN3 (11*32+29) /* CPU based on Zen3 microarchitecture */ +#define X86_FEATURE_ZEN4 (11*32+30) /* CPU based on Zen4 microarchitecture */ +#define 
X86_FEATURE_ZEN1 (11*32+31) /* CPU based on Zen1 microarchitecture */ /* Intel-defined CPU features, CPUID level 0x00000007:1 (EAX), word 12 */ -#define X86_FEATURE_AVX_VNNI (12*32+ 4) /* AVX VNNI instructions */ -#define X86_FEATURE_AVX512_BF16 (12*32+ 5) /* AVX512 BFLOAT16 instructions */ +#define X86_FEATURE_SHA512 (12*32+ 0) /* SHA512 instructions */ +#define X86_FEATURE_SM3 (12*32+ 1) /* SM3 instructions */ +#define X86_FEATURE_SM4 (12*32+ 2) /* SM4 instructions */ +#define X86_FEATURE_AVX_VNNI (12*32+ 4) /* "avx_vnni" AVX VNNI instructions */ +#define X86_FEATURE_AVX512_BF16 (12*32+ 5) /* "avx512_bf16" AVX512 BFLOAT16 instructions */ +#define X86_FEATURE_CMPCCXADD (12*32+ 7) /* CMPccXADD instructions */ +#define X86_FEATURE_ARCH_PERFMON_EXT (12*32+ 8) /* Intel Architectural PerfMon Extension */ +#define X86_FEATURE_FZRM (12*32+10) /* Fast zero-length REP MOVSB */ +#define X86_FEATURE_FSRS (12*32+11) /* Fast short REP STOSB */ +#define X86_FEATURE_FSRC (12*32+12) /* Fast short REP {CMPSB,SCASB} */ +#define X86_FEATURE_FRED (12*32+17) /* "fred" Flexible Return and Event Delivery */ +#define X86_FEATURE_LKGS (12*32+18) /* Load "kernel" (userspace) GS */ +#define X86_FEATURE_WRMSRNS (12*32+19) /* Non-serializing WRMSR */ +#define X86_FEATURE_AMX_FP16 (12*32+21) /* AMX fp16 Support */ +#define X86_FEATURE_AVX_IFMA (12*32+23) /* Support for VPMADD52[H,L]UQ */ +#define X86_FEATURE_LAM (12*32+26) /* "lam" Linear Address Masking */ /* AMD-defined CPU features, CPUID level 0x80000008 (EBX), word 13 */ -#define X86_FEATURE_CLZERO (13*32+ 0) /* CLZERO instruction */ -#define X86_FEATURE_IRPERF (13*32+ 1) /* Instructions Retired Count */ -#define X86_FEATURE_XSAVEERPTR (13*32+ 2) /* Always save/restore FP error pointers */ -#define X86_FEATURE_RDPRU (13*32+ 4) /* Read processor register at user level */ -#define X86_FEATURE_WBNOINVD (13*32+ 9) /* WBNOINVD instruction */ -#define X86_FEATURE_AMD_IBPB (13*32+12) /* "" Indirect Branch Prediction Barrier */ -#define 
X86_FEATURE_AMD_IBRS (13*32+14) /* "" Indirect Branch Restricted Speculation */ -#define X86_FEATURE_AMD_STIBP (13*32+15) /* "" Single Thread Indirect Branch Predictors */ -#define X86_FEATURE_AMD_STIBP_ALWAYS_ON (13*32+17) /* "" Single Thread Indirect Branch Predictors always-on preferred */ -#define X86_FEATURE_AMD_PPIN (13*32+23) /* Protected Processor Inventory Number */ -#define X86_FEATURE_AMD_SSBD (13*32+24) /* "" Speculative Store Bypass Disable */ -#define X86_FEATURE_VIRT_SSBD (13*32+25) /* Virtualized Speculative Store Bypass Disable */ -#define X86_FEATURE_AMD_SSB_NO (13*32+26) /* "" Speculative Store Bypass is fixed in hardware. */ -#define X86_FEATURE_CPPC (13*32+27) /* Collaborative Processor Performance Control */ -#define X86_FEATURE_BTC_NO (13*32+29) /* "" Not vulnerable to Branch Type Confusion */ -#define X86_FEATURE_BRS (13*32+31) /* Branch Sampling available */ +#define X86_FEATURE_CLZERO (13*32+ 0) /* "clzero" CLZERO instruction */ +#define X86_FEATURE_IRPERF (13*32+ 1) /* "irperf" Instructions Retired Count */ +#define X86_FEATURE_XSAVEERPTR (13*32+ 2) /* "xsaveerptr" Always save/restore FP error pointers */ +#define X86_FEATURE_INVLPGB (13*32+ 3) /* INVLPGB and TLBSYNC instructions supported */ +#define X86_FEATURE_RDPRU (13*32+ 4) /* "rdpru" Read processor register at user level */ +#define X86_FEATURE_WBNOINVD (13*32+ 9) /* "wbnoinvd" WBNOINVD instruction */ +#define X86_FEATURE_AMD_IBPB (13*32+12) /* Indirect Branch Prediction Barrier */ +#define X86_FEATURE_AMD_IBRS (13*32+14) /* Indirect Branch Restricted Speculation */ +#define X86_FEATURE_AMD_STIBP (13*32+15) /* Single Thread Indirect Branch Predictors */ +#define X86_FEATURE_AMD_STIBP_ALWAYS_ON (13*32+17) /* Single Thread Indirect Branch Predictors always-on preferred */ +#define X86_FEATURE_AMD_IBRS_SAME_MODE (13*32+19) /* Indirect Branch Restricted Speculation same mode protection*/ +#define X86_FEATURE_AMD_PPIN (13*32+23) /* "amd_ppin" Protected Processor Inventory Number */ 
+#define X86_FEATURE_AMD_SSBD (13*32+24) /* Speculative Store Bypass Disable */ +#define X86_FEATURE_VIRT_SSBD (13*32+25) /* "virt_ssbd" Virtualized Speculative Store Bypass Disable */ +#define X86_FEATURE_AMD_SSB_NO (13*32+26) /* Speculative Store Bypass is fixed in hardware. */ +#define X86_FEATURE_CPPC (13*32+27) /* "cppc" Collaborative Processor Performance Control */ +#define X86_FEATURE_AMD_PSFD (13*32+28) /* Predictive Store Forwarding Disable */ +#define X86_FEATURE_BTC_NO (13*32+29) /* Not vulnerable to Branch Type Confusion */ +#define X86_FEATURE_AMD_IBPB_RET (13*32+30) /* IBPB clears return address predictor */ +#define X86_FEATURE_BRS (13*32+31) /* "brs" Branch Sampling available */ /* Thermal and Power Management Leaf, CPUID level 0x00000006 (EAX), word 14 */ -#define X86_FEATURE_DTHERM (14*32+ 0) /* Digital Thermal Sensor */ -#define X86_FEATURE_IDA (14*32+ 1) /* Intel Dynamic Acceleration */ -#define X86_FEATURE_ARAT (14*32+ 2) /* Always Running APIC Timer */ -#define X86_FEATURE_PLN (14*32+ 4) /* Intel Power Limit Notification */ -#define X86_FEATURE_PTS (14*32+ 6) /* Intel Package Thermal Status */ -#define X86_FEATURE_HWP (14*32+ 7) /* Intel Hardware P-states */ -#define X86_FEATURE_HWP_NOTIFY (14*32+ 8) /* HWP Notification */ -#define X86_FEATURE_HWP_ACT_WINDOW (14*32+ 9) /* HWP Activity Window */ -#define X86_FEATURE_HWP_EPP (14*32+10) /* HWP Energy Perf. 
Preference */ -#define X86_FEATURE_HWP_PKG_REQ (14*32+11) /* HWP Package Level Request */ -#define X86_FEATURE_HFI (14*32+19) /* Hardware Feedback Interface */ +#define X86_FEATURE_DTHERM (14*32+ 0) /* "dtherm" Digital Thermal Sensor */ +#define X86_FEATURE_IDA (14*32+ 1) /* "ida" Intel Dynamic Acceleration */ +#define X86_FEATURE_ARAT (14*32+ 2) /* "arat" Always Running APIC Timer */ +#define X86_FEATURE_PLN (14*32+ 4) /* "pln" Intel Power Limit Notification */ +#define X86_FEATURE_PTS (14*32+ 6) /* "pts" Intel Package Thermal Status */ +#define X86_FEATURE_HWP (14*32+ 7) /* "hwp" Intel Hardware P-states */ +#define X86_FEATURE_HWP_NOTIFY (14*32+ 8) /* "hwp_notify" HWP Notification */ +#define X86_FEATURE_HWP_ACT_WINDOW (14*32+ 9) /* "hwp_act_window" HWP Activity Window */ +#define X86_FEATURE_HWP_EPP (14*32+10) /* "hwp_epp" HWP Energy Perf. Preference */ +#define X86_FEATURE_HWP_PKG_REQ (14*32+11) /* "hwp_pkg_req" HWP Package Level Request */ +#define X86_FEATURE_HWP_HIGHEST_PERF_CHANGE (14*32+15) /* HWP Highest perf change */ +#define X86_FEATURE_HFI (14*32+19) /* "hfi" Hardware Feedback Interface */ /* AMD SVM Feature Identification, CPUID level 0x8000000a (EDX), word 15 */ -#define X86_FEATURE_NPT (15*32+ 0) /* Nested Page Table support */ -#define X86_FEATURE_LBRV (15*32+ 1) /* LBR Virtualization support */ +#define X86_FEATURE_NPT (15*32+ 0) /* "npt" Nested Page Table support */ +#define X86_FEATURE_LBRV (15*32+ 1) /* "lbrv" LBR Virtualization support */ #define X86_FEATURE_SVML (15*32+ 2) /* "svm_lock" SVM locking MSR */ #define X86_FEATURE_NRIPS (15*32+ 3) /* "nrip_save" SVM next_rip save */ #define X86_FEATURE_TSCRATEMSR (15*32+ 4) /* "tsc_scale" TSC scaling support */ #define X86_FEATURE_VMCBCLEAN (15*32+ 5) /* "vmcb_clean" VMCB clean bits support */ -#define X86_FEATURE_FLUSHBYASID (15*32+ 6) /* flush-by-ASID support */ -#define X86_FEATURE_DECODEASSISTS (15*32+ 7) /* Decode Assists support */ -#define X86_FEATURE_PAUSEFILTER (15*32+10) /* filtered 
pause intercept */ -#define X86_FEATURE_PFTHRESHOLD (15*32+12) /* pause filter threshold */ -#define X86_FEATURE_AVIC (15*32+13) /* Virtual Interrupt Controller */ -#define X86_FEATURE_V_VMSAVE_VMLOAD (15*32+15) /* Virtual VMSAVE VMLOAD */ -#define X86_FEATURE_VGIF (15*32+16) /* Virtual GIF */ -#define X86_FEATURE_V_SPEC_CTRL (15*32+20) /* Virtual SPEC_CTRL */ -#define X86_FEATURE_SVME_ADDR_CHK (15*32+28) /* "" SVME addr check */ +#define X86_FEATURE_FLUSHBYASID (15*32+ 6) /* "flushbyasid" Flush-by-ASID support */ +#define X86_FEATURE_DECODEASSISTS (15*32+ 7) /* "decodeassists" Decode Assists support */ +#define X86_FEATURE_PAUSEFILTER (15*32+10) /* "pausefilter" Filtered pause intercept */ +#define X86_FEATURE_PFTHRESHOLD (15*32+12) /* "pfthreshold" Pause filter threshold */ +#define X86_FEATURE_AVIC (15*32+13) /* "avic" Virtual Interrupt Controller */ +#define X86_FEATURE_V_VMSAVE_VMLOAD (15*32+15) /* "v_vmsave_vmload" Virtual VMSAVE VMLOAD */ +#define X86_FEATURE_VGIF (15*32+16) /* "vgif" Virtual GIF */ +#define X86_FEATURE_X2AVIC (15*32+18) /* "x2avic" Virtual x2apic */ +#define X86_FEATURE_V_SPEC_CTRL (15*32+20) /* "v_spec_ctrl" Virtual SPEC_CTRL */ +#define X86_FEATURE_VNMI (15*32+25) /* "vnmi" Virtual NMI */ +#define X86_FEATURE_SVME_ADDR_CHK (15*32+28) /* SVME addr check */ +#define X86_FEATURE_BUS_LOCK_THRESHOLD (15*32+29) /* Bus lock threshold */ +#define X86_FEATURE_IDLE_HLT (15*32+30) /* IDLE HLT intercept */ /* Intel-defined CPU features, CPUID level 0x00000007:0 (ECX), word 16 */ -#define X86_FEATURE_AVX512VBMI (16*32+ 1) /* AVX512 Vector Bit Manipulation instructions*/ -#define X86_FEATURE_UMIP (16*32+ 2) /* User Mode Instruction Protection */ -#define X86_FEATURE_PKU (16*32+ 3) /* Protection Keys for Userspace */ -#define X86_FEATURE_OSPKE (16*32+ 4) /* OS Protection Keys Enable */ -#define X86_FEATURE_WAITPKG (16*32+ 5) /* UMONITOR/UMWAIT/TPAUSE Instructions */ -#define X86_FEATURE_AVX512_VBMI2 (16*32+ 6) /* Additional AVX512 Vector Bit 
Manipulation Instructions */ -#define X86_FEATURE_GFNI (16*32+ 8) /* Galois Field New Instructions */ -#define X86_FEATURE_VAES (16*32+ 9) /* Vector AES */ -#define X86_FEATURE_VPCLMULQDQ (16*32+10) /* Carry-Less Multiplication Double Quadword */ -#define X86_FEATURE_AVX512_VNNI (16*32+11) /* Vector Neural Network Instructions */ -#define X86_FEATURE_AVX512_BITALG (16*32+12) /* Support for VPOPCNT[B,W] and VPSHUF-BITQMB instructions */ -#define X86_FEATURE_TME (16*32+13) /* Intel Total Memory Encryption */ -#define X86_FEATURE_AVX512_VPOPCNTDQ (16*32+14) /* POPCNT for vectors of DW/QW */ -#define X86_FEATURE_LA57 (16*32+16) /* 5-level page tables */ -#define X86_FEATURE_RDPID (16*32+22) /* RDPID instruction */ -#define X86_FEATURE_BUS_LOCK_DETECT (16*32+24) /* Bus Lock detect */ -#define X86_FEATURE_CLDEMOTE (16*32+25) /* CLDEMOTE instruction */ -#define X86_FEATURE_MOVDIRI (16*32+27) /* MOVDIRI instruction */ -#define X86_FEATURE_MOVDIR64B (16*32+28) /* MOVDIR64B instruction */ -#define X86_FEATURE_ENQCMD (16*32+29) /* ENQCMD and ENQCMDS instructions */ -#define X86_FEATURE_SGX_LC (16*32+30) /* Software Guard Extensions Launch Control */ +#define X86_FEATURE_AVX512VBMI (16*32+ 1) /* "avx512vbmi" AVX512 Vector Bit Manipulation instructions*/ +#define X86_FEATURE_UMIP (16*32+ 2) /* "umip" User Mode Instruction Protection */ +#define X86_FEATURE_PKU (16*32+ 3) /* "pku" Protection Keys for Userspace */ +#define X86_FEATURE_OSPKE (16*32+ 4) /* "ospke" OS Protection Keys Enable */ +#define X86_FEATURE_WAITPKG (16*32+ 5) /* "waitpkg" UMONITOR/UMWAIT/TPAUSE Instructions */ +#define X86_FEATURE_AVX512_VBMI2 (16*32+ 6) /* "avx512_vbmi2" Additional AVX512 Vector Bit Manipulation Instructions */ +#define X86_FEATURE_SHSTK (16*32+ 7) /* Shadow stack */ +#define X86_FEATURE_GFNI (16*32+ 8) /* "gfni" Galois Field New Instructions */ +#define X86_FEATURE_VAES (16*32+ 9) /* "vaes" Vector AES */ +#define X86_FEATURE_VPCLMULQDQ (16*32+10) /* "vpclmulqdq" Carry-Less Multiplication 
Double Quadword */ +#define X86_FEATURE_AVX512_VNNI (16*32+11) /* "avx512_vnni" Vector Neural Network Instructions */ +#define X86_FEATURE_AVX512_BITALG (16*32+12) /* "avx512_bitalg" Support for VPOPCNT[B,W] and VPSHUF-BITQMB instructions */ +#define X86_FEATURE_TME (16*32+13) /* "tme" Intel Total Memory Encryption */ +#define X86_FEATURE_AVX512_VPOPCNTDQ (16*32+14) /* "avx512_vpopcntdq" POPCNT for vectors of DW/QW */ +#define X86_FEATURE_LA57 (16*32+16) /* "la57" 5-level page tables */ +#define X86_FEATURE_RDPID (16*32+22) /* "rdpid" RDPID instruction */ +#define X86_FEATURE_BUS_LOCK_DETECT (16*32+24) /* "bus_lock_detect" Bus Lock detect */ +#define X86_FEATURE_CLDEMOTE (16*32+25) /* "cldemote" CLDEMOTE instruction */ +#define X86_FEATURE_MOVDIRI (16*32+27) /* "movdiri" MOVDIRI instruction */ +#define X86_FEATURE_MOVDIR64B (16*32+28) /* "movdir64b" MOVDIR64B instruction */ +#define X86_FEATURE_ENQCMD (16*32+29) /* "enqcmd" ENQCMD and ENQCMDS instructions */ +#define X86_FEATURE_SGX_LC (16*32+30) /* "sgx_lc" Software Guard Extensions Launch Control */ /* AMD-defined CPU features, CPUID level 0x80000007 (EBX), word 17 */ -#define X86_FEATURE_OVERFLOW_RECOV (17*32+ 0) /* MCA overflow recovery support */ -#define X86_FEATURE_SUCCOR (17*32+ 1) /* Uncorrectable error containment and recovery */ -#define X86_FEATURE_SMCA (17*32+ 3) /* Scalable MCA */ +#define X86_FEATURE_OVERFLOW_RECOV (17*32+ 0) /* "overflow_recov" MCA overflow recovery support */ +#define X86_FEATURE_SUCCOR (17*32+ 1) /* "succor" Uncorrectable error containment and recovery */ +#define X86_FEATURE_SMCA (17*32+ 3) /* "smca" Scalable MCA */ /* Intel-defined CPU features, CPUID level 0x00000007:0 (EDX), word 18 */ -#define X86_FEATURE_AVX512_4VNNIW (18*32+ 2) /* AVX-512 Neural Network Instructions */ -#define X86_FEATURE_AVX512_4FMAPS (18*32+ 3) /* AVX-512 Multiply Accumulation Single precision */ -#define X86_FEATURE_FSRM (18*32+ 4) /* Fast Short Rep Mov */ -#define X86_FEATURE_AVX512_VP2INTERSECT 
(18*32+ 8) /* AVX-512 Intersect for D/Q */ -#define X86_FEATURE_SRBDS_CTRL (18*32+ 9) /* "" SRBDS mitigation MSR available */ -#define X86_FEATURE_MD_CLEAR (18*32+10) /* VERW clears CPU buffers */ -#define X86_FEATURE_RTM_ALWAYS_ABORT (18*32+11) /* "" RTM transaction always aborts */ -#define X86_FEATURE_TSX_FORCE_ABORT (18*32+13) /* "" TSX_FORCE_ABORT */ -#define X86_FEATURE_SERIALIZE (18*32+14) /* SERIALIZE instruction */ -#define X86_FEATURE_HYBRID_CPU (18*32+15) /* "" This part has CPUs of more than one type */ -#define X86_FEATURE_TSXLDTRK (18*32+16) /* TSX Suspend Load Address Tracking */ -#define X86_FEATURE_PCONFIG (18*32+18) /* Intel PCONFIG */ -#define X86_FEATURE_ARCH_LBR (18*32+19) /* Intel ARCH LBR */ -#define X86_FEATURE_IBT (18*32+20) /* Indirect Branch Tracking */ -#define X86_FEATURE_AMX_BF16 (18*32+22) /* AMX bf16 Support */ -#define X86_FEATURE_AVX512_FP16 (18*32+23) /* AVX512 FP16 */ -#define X86_FEATURE_AMX_TILE (18*32+24) /* AMX tile Support */ -#define X86_FEATURE_AMX_INT8 (18*32+25) /* AMX int8 Support */ -#define X86_FEATURE_SPEC_CTRL (18*32+26) /* "" Speculation Control (IBRS + IBPB) */ -#define X86_FEATURE_INTEL_STIBP (18*32+27) /* "" Single Thread Indirect Branch Predictors */ -#define X86_FEATURE_FLUSH_L1D (18*32+28) /* Flush L1D cache */ -#define X86_FEATURE_ARCH_CAPABILITIES (18*32+29) /* IA32_ARCH_CAPABILITIES MSR (Intel) */ -#define X86_FEATURE_CORE_CAPABILITIES (18*32+30) /* "" IA32_CORE_CAPABILITIES MSR */ -#define X86_FEATURE_SPEC_CTRL_SSBD (18*32+31) /* "" Speculative Store Bypass Disable */ +#define X86_FEATURE_AVX512_4VNNIW (18*32+ 2) /* "avx512_4vnniw" AVX-512 Neural Network Instructions */ +#define X86_FEATURE_AVX512_4FMAPS (18*32+ 3) /* "avx512_4fmaps" AVX-512 Multiply Accumulation Single precision */ +#define X86_FEATURE_FSRM (18*32+ 4) /* "fsrm" Fast Short Rep Mov */ +#define X86_FEATURE_AVX512_VP2INTERSECT (18*32+ 8) /* "avx512_vp2intersect" AVX-512 Intersect for D/Q */ +#define X86_FEATURE_SRBDS_CTRL (18*32+ 9) /* SRBDS 
mitigation MSR available */ +#define X86_FEATURE_MD_CLEAR (18*32+10) /* "md_clear" VERW clears CPU buffers */ +#define X86_FEATURE_RTM_ALWAYS_ABORT (18*32+11) /* RTM transaction always aborts */ +#define X86_FEATURE_TSX_FORCE_ABORT (18*32+13) /* TSX_FORCE_ABORT */ +#define X86_FEATURE_SERIALIZE (18*32+14) /* "serialize" SERIALIZE instruction */ +#define X86_FEATURE_HYBRID_CPU (18*32+15) /* This part has CPUs of more than one type */ +#define X86_FEATURE_TSXLDTRK (18*32+16) /* "tsxldtrk" TSX Suspend Load Address Tracking */ +#define X86_FEATURE_PCONFIG (18*32+18) /* "pconfig" Intel PCONFIG */ +#define X86_FEATURE_ARCH_LBR (18*32+19) /* "arch_lbr" Intel ARCH LBR */ +#define X86_FEATURE_IBT (18*32+20) /* "ibt" Indirect Branch Tracking */ +#define X86_FEATURE_AMX_BF16 (18*32+22) /* "amx_bf16" AMX bf16 Support */ +#define X86_FEATURE_AVX512_FP16 (18*32+23) /* "avx512_fp16" AVX512 FP16 */ +#define X86_FEATURE_AMX_TILE (18*32+24) /* "amx_tile" AMX tile Support */ +#define X86_FEATURE_AMX_INT8 (18*32+25) /* "amx_int8" AMX int8 Support */ +#define X86_FEATURE_SPEC_CTRL (18*32+26) /* Speculation Control (IBRS + IBPB) */ +#define X86_FEATURE_INTEL_STIBP (18*32+27) /* Single Thread Indirect Branch Predictors */ +#define X86_FEATURE_FLUSH_L1D (18*32+28) /* "flush_l1d" Flush L1D cache */ +#define X86_FEATURE_ARCH_CAPABILITIES (18*32+29) /* "arch_capabilities" IA32_ARCH_CAPABILITIES MSR (Intel) */ +#define X86_FEATURE_CORE_CAPABILITIES (18*32+30) /* IA32_CORE_CAPABILITIES MSR */ +#define X86_FEATURE_SPEC_CTRL_SSBD (18*32+31) /* Speculative Store Bypass Disable */ /* AMD-defined memory encryption features, CPUID level 0x8000001f (EAX), word 19 */ -#define X86_FEATURE_SME (19*32+ 0) /* AMD Secure Memory Encryption */ -#define X86_FEATURE_SEV (19*32+ 1) /* AMD Secure Encrypted Virtualization */ -#define X86_FEATURE_VM_PAGE_FLUSH (19*32+ 2) /* "" VM Page Flush MSR is supported */ -#define X86_FEATURE_SEV_ES (19*32+ 3) /* AMD Secure Encrypted Virtualization - Encrypted State */ 
-#define X86_FEATURE_V_TSC_AUX (19*32+ 9) /* "" Virtual TSC_AUX */ -#define X86_FEATURE_SME_COHERENT (19*32+10) /* "" AMD hardware-enforced cache coherency */ +#define X86_FEATURE_SME (19*32+ 0) /* "sme" Secure Memory Encryption */ +#define X86_FEATURE_SEV (19*32+ 1) /* "sev" Secure Encrypted Virtualization */ +#define X86_FEATURE_VM_PAGE_FLUSH (19*32+ 2) /* VM Page Flush MSR is supported */ +#define X86_FEATURE_SEV_ES (19*32+ 3) /* "sev_es" Secure Encrypted Virtualization - Encrypted State */ +#define X86_FEATURE_SEV_SNP (19*32+ 4) /* "sev_snp" Secure Encrypted Virtualization - Secure Nested Paging */ +#define X86_FEATURE_V_TSC_AUX (19*32+ 9) /* Virtual TSC_AUX */ +#define X86_FEATURE_SME_COHERENT (19*32+10) /* hardware-enforced cache coherency */ +#define X86_FEATURE_DEBUG_SWAP (19*32+14) /* "debug_swap" SEV-ES full debug state swap support */ +#define X86_FEATURE_RMPREAD (19*32+21) /* RMPREAD instruction */ +#define X86_FEATURE_SEGMENTED_RMP (19*32+23) /* Segmented RMP support */ +#define X86_FEATURE_ALLOWED_SEV_FEATURES (19*32+27) /* Allowed SEV Features */ +#define X86_FEATURE_SVSM (19*32+28) /* "svsm" SVSM present */ +#define X86_FEATURE_HV_INUSE_WR_ALLOWED (19*32+30) /* Allow Write to in-use hypervisor-owned pages */ + +/* AMD-defined Extended Feature 2 EAX, CPUID level 0x80000021 (EAX), word 20 */ +#define X86_FEATURE_NO_NESTED_DATA_BP (20*32+ 0) /* No Nested Data Breakpoints */ +#define X86_FEATURE_WRMSR_XX_BASE_NS (20*32+ 1) /* WRMSR to {FS,GS,KERNEL_GS}_BASE is non-serializing */ +#define X86_FEATURE_LFENCE_RDTSC (20*32+ 2) /* LFENCE always serializing / synchronizes RDTSC */ +#define X86_FEATURE_NULL_SEL_CLR_BASE (20*32+ 6) /* Null Selector Clears Base */ +#define X86_FEATURE_AUTOIBRS (20*32+ 8) /* Automatic IBRS */ +#define X86_FEATURE_NO_SMM_CTL_MSR (20*32+ 9) /* SMM_CTL MSR is not present */ + +#define X86_FEATURE_PREFETCHI (20*32+20) /* Prefetch Data/Instruction to Cache Level */ +#define X86_FEATURE_SBPB (20*32+27) /* Selective Branch Prediction 
Barrier */ +#define X86_FEATURE_IBPB_BRTYPE (20*32+28) /* MSR_PRED_CMD[IBPB] flushes all branch type predictions */ +#define X86_FEATURE_SRSO_NO (20*32+29) /* CPU is not affected by SRSO */ +#define X86_FEATURE_SRSO_USER_KERNEL_NO (20*32+30) /* CPU is not affected by SRSO across user/kernel boundaries */ +#define X86_FEATURE_SRSO_BP_SPEC_REDUCE (20*32+31) /* + * BP_CFG[BpSpecReduce] can be used to mitigate SRSO for VMs. + * (SRSO_MSR_FIX in the official doc). + */ + +/* + * Extended auxiliary flags: Linux defined - for features scattered in various + * CPUID levels like 0x80000022, etc and Linux defined features. + * + * Reuse free bits when adding new feature flags! + */ +#define X86_FEATURE_AMD_LBR_PMC_FREEZE (21*32+ 0) /* "amd_lbr_pmc_freeze" AMD LBR and PMC Freeze */ +#define X86_FEATURE_CLEAR_BHB_LOOP (21*32+ 1) /* Clear branch history at syscall entry using SW loop */ +#define X86_FEATURE_BHI_CTRL (21*32+ 2) /* BHI_DIS_S HW control available */ +#define X86_FEATURE_CLEAR_BHB_HW (21*32+ 3) /* BHI_DIS_S HW control enabled */ +#define X86_FEATURE_CLEAR_BHB_VMEXIT (21*32+ 4) /* Clear branch history at vmexit using SW loop */ +#define X86_FEATURE_AMD_FAST_CPPC (21*32+ 5) /* Fast CPPC */ +#define X86_FEATURE_AMD_HTR_CORES (21*32+ 6) /* Heterogeneous Core Topology */ +#define X86_FEATURE_AMD_WORKLOAD_CLASS (21*32+ 7) /* Workload Classification */ +#define X86_FEATURE_PREFER_YMM (21*32+ 8) /* Avoid ZMM registers due to downclocking */ +#define X86_FEATURE_APX (21*32+ 9) /* Advanced Performance Extensions */ +#define X86_FEATURE_INDIRECT_THUNK_ITS (21*32+10) /* Use thunk for indirect branches in lower half of cacheline */ /* * BUG word(s) */ #define X86_BUG(x) (NCAPINTS*32 + (x)) -#define X86_BUG_F00F X86_BUG(0) /* Intel F00F */ -#define X86_BUG_FDIV X86_BUG(1) /* FPU FDIV */ -#define X86_BUG_COMA X86_BUG(2) /* Cyrix 6x86 coma */ +#define X86_BUG_F00F X86_BUG(0) /* "f00f" Intel F00F */ +#define X86_BUG_FDIV X86_BUG(1) /* "fdiv" FPU FDIV */ +#define X86_BUG_COMA 
X86_BUG(2) /* "coma" Cyrix 6x86 coma */ #define X86_BUG_AMD_TLB_MMATCH X86_BUG(3) /* "tlb_mmatch" AMD Erratum 383 */ #define X86_BUG_AMD_APIC_C1E X86_BUG(4) /* "apic_c1e" AMD Erratum 400 */ -#define X86_BUG_11AP X86_BUG(5) /* Bad local APIC aka 11AP */ -#define X86_BUG_FXSAVE_LEAK X86_BUG(6) /* FXSAVE leaks FOP/FIP/FOP */ -#define X86_BUG_CLFLUSH_MONITOR X86_BUG(7) /* AAI65, CLFLUSH required before MONITOR */ -#define X86_BUG_SYSRET_SS_ATTRS X86_BUG(8) /* SYSRET doesn't fix up SS attrs */ +#define X86_BUG_11AP X86_BUG(5) /* "11ap" Bad local APIC aka 11AP */ +#define X86_BUG_FXSAVE_LEAK X86_BUG(6) /* "fxsave_leak" FXSAVE leaks FOP/FIP/FOP */ +#define X86_BUG_CLFLUSH_MONITOR X86_BUG(7) /* "clflush_monitor" AAI65, CLFLUSH required before MONITOR */ +#define X86_BUG_SYSRET_SS_ATTRS X86_BUG(8) /* "sysret_ss_attrs" SYSRET doesn't fix up SS attrs */ #ifdef CONFIG_X86_32 /* * 64-bit kernels don't use X86_BUG_ESPFIX. Make the define conditional * to avoid confusion. */ -#define X86_BUG_ESPFIX X86_BUG(9) /* "" IRET to 16-bit SS corrupts ESP/RSP high bits */ +#define X86_BUG_ESPFIX X86_BUG(9) /* IRET to 16-bit SS corrupts ESP/RSP high bits */ #endif -#define X86_BUG_NULL_SEG X86_BUG(10) /* Nulling a selector preserves the base */ -#define X86_BUG_SWAPGS_FENCE X86_BUG(11) /* SWAPGS without input dep on GS */ -#define X86_BUG_MONITOR X86_BUG(12) /* IPI required to wake up remote CPU */ -#define X86_BUG_AMD_E400 X86_BUG(13) /* CPU is among the affected by Erratum 400 */ -#define X86_BUG_CPU_MELTDOWN X86_BUG(14) /* CPU is affected by meltdown attack and needs kernel page table isolation */ -#define X86_BUG_SPECTRE_V1 X86_BUG(15) /* CPU is affected by Spectre variant 1 attack with conditional branches */ -#define X86_BUG_SPECTRE_V2 X86_BUG(16) /* CPU is affected by Spectre variant 2 attack with indirect branches */ -#define X86_BUG_SPEC_STORE_BYPASS X86_BUG(17) /* CPU is affected by speculative store bypass attack */ -#define X86_BUG_L1TF X86_BUG(18) /* CPU is affected by L1 
Terminal Fault */ -#define X86_BUG_MDS X86_BUG(19) /* CPU is affected by Microarchitectural data sampling */ -#define X86_BUG_MSBDS_ONLY X86_BUG(20) /* CPU is only affected by the MSDBS variant of BUG_MDS */ -#define X86_BUG_SWAPGS X86_BUG(21) /* CPU is affected by speculation through SWAPGS */ -#define X86_BUG_TAA X86_BUG(22) /* CPU is affected by TSX Async Abort(TAA) */ -#define X86_BUG_ITLB_MULTIHIT X86_BUG(23) /* CPU may incur MCE during certain page attribute changes */ -#define X86_BUG_SRBDS X86_BUG(24) /* CPU may leak RNG bits if not mitigated */ -#define X86_BUG_MMIO_STALE_DATA X86_BUG(25) /* CPU is affected by Processor MMIO Stale Data vulnerabilities */ -#define X86_BUG_RETBLEED X86_BUG(26) /* CPU is affected by RETBleed */ +#define X86_BUG_NULL_SEG X86_BUG(10) /* "null_seg" Nulling a selector preserves the base */ +#define X86_BUG_SWAPGS_FENCE X86_BUG(11) /* "swapgs_fence" SWAPGS without input dep on GS */ +#define X86_BUG_MONITOR X86_BUG(12) /* "monitor" IPI required to wake up remote CPU */ +#define X86_BUG_AMD_E400 X86_BUG(13) /* "amd_e400" CPU is among the affected by Erratum 400 */ +#define X86_BUG_CPU_MELTDOWN X86_BUG(14) /* "cpu_meltdown" CPU is affected by meltdown attack and needs kernel page table isolation */ +#define X86_BUG_SPECTRE_V1 X86_BUG(15) /* "spectre_v1" CPU is affected by Spectre variant 1 attack with conditional branches */ +#define X86_BUG_SPECTRE_V2 X86_BUG(16) /* "spectre_v2" CPU is affected by Spectre variant 2 attack with indirect branches */ +#define X86_BUG_SPEC_STORE_BYPASS X86_BUG(17) /* "spec_store_bypass" CPU is affected by speculative store bypass attack */ +#define X86_BUG_L1TF X86_BUG(18) /* "l1tf" CPU is affected by L1 Terminal Fault */ +#define X86_BUG_MDS X86_BUG(19) /* "mds" CPU is affected by Microarchitectural data sampling */ +#define X86_BUG_MSBDS_ONLY X86_BUG(20) /* "msbds_only" CPU is only affected by the MSDBS variant of BUG_MDS */ +#define X86_BUG_SWAPGS X86_BUG(21) /* "swapgs" CPU is affected by 
speculation through SWAPGS */ +#define X86_BUG_TAA X86_BUG(22) /* "taa" CPU is affected by TSX Async Abort(TAA) */ +#define X86_BUG_ITLB_MULTIHIT X86_BUG(23) /* "itlb_multihit" CPU may incur MCE during certain page attribute changes */ +#define X86_BUG_SRBDS X86_BUG(24) /* "srbds" CPU may leak RNG bits if not mitigated */ +#define X86_BUG_MMIO_STALE_DATA X86_BUG(25) /* "mmio_stale_data" CPU is affected by Processor MMIO Stale Data vulnerabilities */ +/* unused, was #define X86_BUG_MMIO_UNKNOWN X86_BUG(26) "mmio_unknown" CPU is too old and its MMIO Stale Data status is unknown */ +#define X86_BUG_RETBLEED X86_BUG(27) /* "retbleed" CPU is affected by RETBleed */ +#define X86_BUG_EIBRS_PBRSB X86_BUG(28) /* "eibrs_pbrsb" EIBRS is vulnerable to Post Barrier RSB Predictions */ +#define X86_BUG_SMT_RSB X86_BUG(29) /* "smt_rsb" CPU is vulnerable to Cross-Thread Return Address Predictions */ +#define X86_BUG_GDS X86_BUG(30) /* "gds" CPU is affected by Gather Data Sampling */ +#define X86_BUG_TDX_PW_MCE X86_BUG(31) /* "tdx_pw_mce" CPU may incur #MC if non-TD software does partial write to TDX private memory */ + +/* BUG word 2 */ +#define X86_BUG_SRSO X86_BUG( 1*32+ 0) /* "srso" AMD SRSO bug */ +#define X86_BUG_DIV0 X86_BUG( 1*32+ 1) /* "div0" AMD DIV0 speculation bug */ +#define X86_BUG_RFDS X86_BUG( 1*32+ 2) /* "rfds" CPU is vulnerable to Register File Data Sampling */ +#define X86_BUG_BHI X86_BUG( 1*32+ 3) /* "bhi" CPU is affected by Branch History Injection */ +#define X86_BUG_IBPB_NO_RET X86_BUG( 1*32+ 4) /* "ibpb_no_ret" IBPB omits return target predictions */ +#define X86_BUG_SPECTRE_V2_USER X86_BUG( 1*32+ 5) /* "spectre_v2_user" CPU is affected by Spectre variant 2 attack between user processes */ +#define X86_BUG_OLD_MICROCODE X86_BUG( 1*32+ 6) /* "old_microcode" CPU has old microcode, it is surely vulnerable to something */ +#define X86_BUG_ITS X86_BUG( 1*32+ 7) /* "its" CPU is affected by Indirect Target Selection */ +#define X86_BUG_ITS_NATIVE_ONLY X86_BUG( 1*32+ 
8) /* "its_native_only" CPU is affected by ITS, VMX is not affected */ #endif /* _ASM_X86_CPUFEATURES_H */ diff --git a/arch/x86/include/asm/cpuid.h b/arch/x86/include/asm/cpuid.h index 70b2db18165e..d5749b25fa10 100644 --- a/arch/x86/include/asm/cpuid.h +++ b/arch/x86/include/asm/cpuid.h @@ -1,34 +1,8 @@ /* SPDX-License-Identifier: GPL-2.0 */ -/* - * CPUID-related helpers/definitions - * - * Derived from arch/x86/kvm/cpuid.c - */ #ifndef _ASM_X86_CPUID_H #define _ASM_X86_CPUID_H -static __always_inline bool cpuid_function_is_indexed(u32 function) -{ - switch (function) { - case 4: - case 7: - case 0xb: - case 0xd: - case 0xf: - case 0x10: - case 0x12: - case 0x14: - case 0x17: - case 0x18: - case 0x1d: - case 0x1e: - case 0x1f: - case 0x8000001d: - return true; - } - - return false; -} +#include <asm/cpuid/api.h> #endif /* _ASM_X86_CPUID_H */ diff --git a/arch/x86/include/asm/cpuid/api.h b/arch/x86/include/asm/cpuid/api.h new file mode 100644 index 000000000000..44fa82e1267c --- /dev/null +++ b/arch/x86/include/asm/cpuid/api.h @@ -0,0 +1,292 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _ASM_X86_CPUID_API_H +#define _ASM_X86_CPUID_API_H + +#include <asm/cpuid/types.h> + +#include <linux/build_bug.h> +#include <linux/types.h> + +#include <asm/string.h> + +/* + * Raw CPUID accessors: + */ + +#ifdef CONFIG_X86_32 +bool cpuid_feature(void); +#else +static inline bool cpuid_feature(void) +{ + return true; +} +#endif + +static inline void native_cpuid(u32 *eax, u32 *ebx, + u32 *ecx, u32 *edx) +{ + /* ecx is often an input as well as an output. 
*/ + asm volatile("cpuid" + : "=a" (*eax), + "=b" (*ebx), + "=c" (*ecx), + "=d" (*edx) + : "0" (*eax), "2" (*ecx) + : "memory"); +} + +#define NATIVE_CPUID_REG(reg) \ +static inline u32 native_cpuid_##reg(u32 op) \ +{ \ + u32 eax = op, ebx, ecx = 0, edx; \ + \ + native_cpuid(&eax, &ebx, &ecx, &edx); \ + \ + return reg; \ +} + +/* + * Native CPUID functions returning a single datum: + */ +NATIVE_CPUID_REG(eax) +NATIVE_CPUID_REG(ebx) +NATIVE_CPUID_REG(ecx) +NATIVE_CPUID_REG(edx) + +#ifdef CONFIG_PARAVIRT_XXL +# include <asm/paravirt.h> +#else +# define __cpuid native_cpuid +#endif + +/* + * Generic CPUID function + * + * Clear ECX since some CPUs (Cyrix MII) do not set or clear ECX + * resulting in stale register contents being returned. + */ +static inline void cpuid(u32 op, + u32 *eax, u32 *ebx, + u32 *ecx, u32 *edx) +{ + *eax = op; + *ecx = 0; + __cpuid(eax, ebx, ecx, edx); +} + +/* Some CPUID calls want 'count' to be placed in ECX */ +static inline void cpuid_count(u32 op, int count, + u32 *eax, u32 *ebx, + u32 *ecx, u32 *edx) +{ + *eax = op; + *ecx = count; + __cpuid(eax, ebx, ecx, edx); +} + +/* + * CPUID functions returning a single datum: + */ + +static inline u32 cpuid_eax(u32 op) +{ + u32 eax, ebx, ecx, edx; + + cpuid(op, &eax, &ebx, &ecx, &edx); + + return eax; +} + +static inline u32 cpuid_ebx(u32 op) +{ + u32 eax, ebx, ecx, edx; + + cpuid(op, &eax, &ebx, &ecx, &edx); + + return ebx; +} + +static inline u32 cpuid_ecx(u32 op) +{ + u32 eax, ebx, ecx, edx; + + cpuid(op, &eax, &ebx, &ecx, &edx); + + return ecx; +} + +static inline u32 cpuid_edx(u32 op) +{ + u32 eax, ebx, ecx, edx; + + cpuid(op, &eax, &ebx, &ecx, &edx); + + return edx; +} + +static inline void __cpuid_read(u32 leaf, u32 subleaf, u32 *regs) +{ + regs[CPUID_EAX] = leaf; + regs[CPUID_ECX] = subleaf; + __cpuid(regs + CPUID_EAX, regs + CPUID_EBX, regs + CPUID_ECX, regs + CPUID_EDX); +} + +#define cpuid_subleaf(leaf, subleaf, regs) { \ + static_assert(sizeof(*(regs)) == 16); \ + __cpuid_read(leaf, 
subleaf, (u32 *)(regs)); \ +} + +#define cpuid_leaf(leaf, regs) { \ + static_assert(sizeof(*(regs)) == 16); \ + __cpuid_read(leaf, 0, (u32 *)(regs)); \ +} + +static inline void __cpuid_read_reg(u32 leaf, u32 subleaf, + enum cpuid_regs_idx regidx, u32 *reg) +{ + u32 regs[4]; + + __cpuid_read(leaf, subleaf, regs); + *reg = regs[regidx]; +} + +#define cpuid_subleaf_reg(leaf, subleaf, regidx, reg) { \ + static_assert(sizeof(*(reg)) == 4); \ + __cpuid_read_reg(leaf, subleaf, regidx, (u32 *)(reg)); \ +} + +#define cpuid_leaf_reg(leaf, regidx, reg) { \ + static_assert(sizeof(*(reg)) == 4); \ + __cpuid_read_reg(leaf, 0, regidx, (u32 *)(reg)); \ +} + +/* + * Hypervisor-related APIs: + */ + +static __always_inline bool cpuid_function_is_indexed(u32 function) +{ + switch (function) { + case 4: + case 7: + case 0xb: + case 0xd: + case 0xf: + case 0x10: + case 0x12: + case 0x14: + case 0x17: + case 0x18: + case 0x1d: + case 0x1e: + case 0x1f: + case 0x24: + case 0x8000001d: + return true; + } + + return false; +} + +#define for_each_possible_cpuid_base_hypervisor(function) \ + for (function = 0x40000000; function < 0x40010000; function += 0x100) + +static inline u32 cpuid_base_hypervisor(const char *sig, u32 leaves) +{ + u32 base, eax, signature[3]; + + for_each_possible_cpuid_base_hypervisor(base) { + cpuid(base, &eax, &signature[0], &signature[1], &signature[2]); + + /* + * This must not compile to "call memcmp" because it's called + * from PVH early boot code before instrumentation is set up + * and memcmp() itself may be instrumented. + */ + if (!__builtin_memcmp(sig, signature, 12) && + (leaves == 0 || ((eax - base) >= leaves))) + return base; + } + + return 0; +} + +/* + * CPUID(0x2) parsing: + */ + +/** + * cpuid_leaf_0x2() - Return sanitized CPUID(0x2) register output + * @regs: Output parameter + * + * Query CPUID(0x2) and store its output in @regs. 
Force set any + * invalid 1-byte descriptor returned by the hardware to zero (the NULL + * cache/TLB descriptor) before returning it to the caller. + * + * Use for_each_cpuid_0x2_desc() to iterate over the register output in + * parsed form. + */ +static inline void cpuid_leaf_0x2(union leaf_0x2_regs *regs) +{ + cpuid_leaf(0x2, regs); + + /* + * All Intel CPUs must report an iteration count of 1. In case + * of bogus hardware, treat all returned descriptors as NULL. + */ + if (regs->desc[0] != 0x01) { + for (int i = 0; i < 4; i++) + regs->regv[i] = 0; + return; + } + + /* + * The most significant bit (MSB) of each register must be clear. + * If a register is invalid, replace its descriptors with NULL. + */ + for (int i = 0; i < 4; i++) { + if (regs->reg[i].invalid) + regs->regv[i] = 0; + } +} + +/** + * for_each_cpuid_0x2_desc() - Iterator for parsed CPUID(0x2) descriptors + * @_regs: CPUID(0x2) register output, as returned by cpuid_leaf_0x2() + * @_ptr: u8 pointer, for macro internal use only + * @_desc: Pointer to the parsed CPUID(0x2) descriptor at each iteration + * + * Loop over the 1-byte descriptors in the passed CPUID(0x2) output registers + * @_regs. Provide the parsed information for each descriptor through @_desc. + * + * To handle cache-specific descriptors, switch on @_desc->c_type. For TLB + * descriptors, switch on @_desc->t_type. + * + * Example usage for cache descriptors:: + * + * const struct leaf_0x2_table *desc; + * union leaf_0x2_regs regs; + * u8 *ptr; + * + * cpuid_leaf_0x2(&regs); + * for_each_cpuid_0x2_desc(regs, ptr, desc) { + * switch (desc->c_type) { + * ... 
+ * } + * } + */ +#define for_each_cpuid_0x2_desc(_regs, _ptr, _desc) \ + for (_ptr = &(_regs).desc[1]; \ + _ptr < &(_regs).desc[16] && (_desc = &cpuid_0x2_table[*_ptr]); \ + _ptr++) + +/* + * CPUID(0x80000006) parsing: + */ + +static inline bool cpuid_amd_hygon_has_l3_cache(void) +{ + return cpuid_edx(0x80000006); +} + +#endif /* _ASM_X86_CPUID_API_H */ diff --git a/arch/x86/include/asm/cpuid/types.h b/arch/x86/include/asm/cpuid/types.h new file mode 100644 index 000000000000..8a00364b79de --- /dev/null +++ b/arch/x86/include/asm/cpuid/types.h @@ -0,0 +1,127 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _ASM_X86_CPUID_TYPES_H +#define _ASM_X86_CPUID_TYPES_H + +#include <linux/build_bug.h> +#include <linux/types.h> + +/* + * Types for raw CPUID access: + */ + +struct cpuid_regs { + u32 eax; + u32 ebx; + u32 ecx; + u32 edx; +}; + +enum cpuid_regs_idx { + CPUID_EAX = 0, + CPUID_EBX, + CPUID_ECX, + CPUID_EDX, +}; + +#define CPUID_LEAF_MWAIT 0x05 +#define CPUID_LEAF_DCA 0x09 +#define CPUID_LEAF_XSTATE 0x0d +#define CPUID_LEAF_TSC 0x15 +#define CPUID_LEAF_FREQ 0x16 +#define CPUID_LEAF_TILE 0x1d + +/* + * Types for CPUID(0x2) parsing: + */ + +struct leaf_0x2_reg { + u32 : 31, + invalid : 1; +}; + +union leaf_0x2_regs { + struct leaf_0x2_reg reg[4]; + u32 regv[4]; + u8 desc[16]; +}; + +/* + * Leaf 0x2 1-byte descriptors' cache types + * To be used for their mappings at cpuid_0x2_table[] + * + * Start at 1 since type 0 is reserved for HW byte descriptors which are + * not recognized by the kernel; i.e., those without an explicit mapping. + */ +enum _cache_table_type { + CACHE_L1_INST = 1, + CACHE_L1_DATA, + CACHE_L2, + CACHE_L3 + /* Adjust __TLB_TABLE_TYPE_BEGIN before adding more types */ +} __packed; +#ifndef __CHECKER__ +static_assert(sizeof(enum _cache_table_type) == 1); +#endif + +/* + * Ensure that leaf 0x2 cache and TLB type values do not intersect, + * since they share the same type field at struct cpuid_0x2_table. 
+ */ +#define __TLB_TABLE_TYPE_BEGIN (CACHE_L3 + 1) + +/* + * Leaf 0x2 1-byte descriptors' TLB types + * To be used for their mappings at cpuid_0x2_table[] + */ +enum _tlb_table_type { + TLB_INST_4K = __TLB_TABLE_TYPE_BEGIN, + TLB_INST_4M, + TLB_INST_2M_4M, + TLB_INST_ALL, + + TLB_DATA_4K, + TLB_DATA_4M, + TLB_DATA_2M_4M, + TLB_DATA_4K_4M, + TLB_DATA_1G, + TLB_DATA_1G_2M_4M, + + TLB_DATA0_4K, + TLB_DATA0_4M, + TLB_DATA0_2M_4M, + + STLB_4K, + STLB_4K_2M, +} __packed; +#ifndef __CHECKER__ +static_assert(sizeof(enum _tlb_table_type) == 1); +#endif + +/* + * Combined parsing table for leaf 0x2 cache and TLB descriptors. + */ + +struct leaf_0x2_table { + union { + enum _cache_table_type c_type; + enum _tlb_table_type t_type; + }; + union { + short c_size; + short entries; + }; +}; + +extern const struct leaf_0x2_table cpuid_0x2_table[256]; + +/* + * All of leaf 0x2's one-byte TLB descriptors imply the same number of entries + * for their respective TLB types. TLB descriptor 0x63 is an exception: it + * implies 4 dTLB entries for 1GB pages and 32 dTLB entries for 2MB or 4MB pages. + * + * Encode that descriptor's dTLB entry count for 2MB/4MB pages here, as the entry + * count for dTLB 1GB pages is already encoded at the cpuid_0x2_table[]'s mapping. 
+ */ +#define TLB_0x63_2M_4M_ENTRIES 32 + +#endif /* _ASM_X86_CPUID_TYPES_H */ diff --git a/arch/x86/include/asm/cpumask.h b/arch/x86/include/asm/cpumask.h index c5aed9e9226c..70f6b60ad67b 100644 --- a/arch/x86/include/asm/cpumask.h +++ b/arch/x86/include/asm/cpumask.h @@ -1,14 +1,9 @@ /* SPDX-License-Identifier: GPL-2.0 */ #ifndef _ASM_X86_CPUMASK_H #define _ASM_X86_CPUMASK_H -#ifndef __ASSEMBLY__ +#ifndef __ASSEMBLER__ #include <linux/cpumask.h> -extern cpumask_var_t cpu_callin_mask; -extern cpumask_var_t cpu_callout_mask; -extern cpumask_var_t cpu_initialized_mask; -extern cpumask_var_t cpu_sibling_setup_mask; - extern void setup_cpu_local_masks(void); /* @@ -39,5 +34,5 @@ static __always_inline void arch_cpumask_clear_cpu(int cpu, struct cpumask *dstp #define arch_cpu_is_offline(cpu) unlikely(!arch_cpu_online(cpu)) -#endif /* __ASSEMBLY__ */ +#endif /* __ASSEMBLER__ */ #endif /* _ASM_X86_CPUMASK_H */ diff --git a/arch/x86/include/asm/crash_reserve.h b/arch/x86/include/asm/crash_reserve.h new file mode 100644 index 000000000000..7835b2cdff04 --- /dev/null +++ b/arch/x86/include/asm/crash_reserve.h @@ -0,0 +1,44 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _X86_CRASH_RESERVE_H +#define _X86_CRASH_RESERVE_H + +/* 16M alignment for crash kernel regions */ +#define CRASH_ALIGN SZ_16M + +/* + * Keep the crash kernel below this limit. + * + * Earlier 32-bits kernels would limit the kernel to the low 512 MB range + * due to mapping restrictions. + * + * 64-bit kdump kernels need to be restricted to be under 64 TB, which is + * the upper limit of system RAM in 4-level paging mode. Since the kdump + * jump could be from 5-level paging to 4-level paging, the jump will fail if + * the kernel is put above 64 TB, and during the 1st kernel bootup there's + * no good way to detect the paging mode of the target kernel which will be + * loaded for dumping. 
+ */ +extern unsigned long swiotlb_size_or_default(void); + +#ifdef CONFIG_X86_32 +# define CRASH_ADDR_LOW_MAX SZ_512M +# define CRASH_ADDR_HIGH_MAX SZ_512M +#else +# define CRASH_ADDR_LOW_MAX SZ_4G +# define CRASH_ADDR_HIGH_MAX SZ_64T +#endif + +# define DEFAULT_CRASH_KERNEL_LOW_SIZE crash_low_size_default() + +static inline unsigned long crash_low_size_default(void) +{ +#ifdef CONFIG_X86_64 + return max(swiotlb_size_or_default() + (8UL << 20), 256UL << 20); +#else + return 0; +#endif +} + +#define HAVE_ARCH_ADD_CRASH_RES_TO_IOMEM_EARLY + +#endif /* _X86_CRASH_RESERVE_H */ diff --git a/arch/x86/include/asm/current.h b/arch/x86/include/asm/current.h index 3e204e6140b5..cc4a3f725b37 100644 --- a/arch/x86/include/asm/current.h +++ b/arch/x86/include/asm/current.h @@ -2,21 +2,31 @@ #ifndef _ASM_X86_CURRENT_H #define _ASM_X86_CURRENT_H +#include <linux/build_bug.h> #include <linux/compiler.h> + +#ifndef __ASSEMBLER__ + +#include <linux/cache.h> #include <asm/percpu.h> -#ifndef __ASSEMBLY__ struct task_struct; -DECLARE_PER_CPU(struct task_struct *, current_task); +DECLARE_PER_CPU_CACHE_HOT(struct task_struct *, current_task); +/* const-qualified alias provided by the linker. 
*/ +DECLARE_PER_CPU_CACHE_HOT(struct task_struct * const __percpu_seg_override, + const_current_task); static __always_inline struct task_struct *get_current(void) { + if (IS_ENABLED(CONFIG_USE_X86_SEG_SUPPORT)) + return this_cpu_read_const(const_current_task); + return this_cpu_read_stable(current_task); } #define current get_current() -#endif /* __ASSEMBLY__ */ +#endif /* __ASSEMBLER__ */ #endif /* _ASM_X86_CURRENT_H */ diff --git a/arch/x86/include/asm/debugreg.h b/arch/x86/include/asm/debugreg.h index cfdf307ddc01..363110e6b2e3 100644 --- a/arch/x86/include/asm/debugreg.h +++ b/arch/x86/include/asm/debugreg.h @@ -2,10 +2,13 @@ #ifndef _ASM_X86_DEBUGREG_H #define _ASM_X86_DEBUGREG_H - #include <linux/bug.h> +#include <linux/percpu.h> #include <uapi/asm/debugreg.h> +#include <asm/cpufeature.h> +#include <asm/msr.h> + DECLARE_PER_CPU(unsigned long, cpu_dr7); #ifndef CONFIG_PARAVIRT_XXL @@ -20,7 +23,7 @@ DECLARE_PER_CPU(unsigned long, cpu_dr7); static __always_inline unsigned long native_get_debugreg(int regno) { - unsigned long val = 0; /* Damn you, gcc! */ + unsigned long val; switch (regno) { case 0: @@ -39,7 +42,20 @@ static __always_inline unsigned long native_get_debugreg(int regno) asm("mov %%db6, %0" :"=r" (val)); break; case 7: - asm("mov %%db7, %0" :"=r" (val)); + /* + * Use "asm volatile" for DR7 reads to forbid re-ordering them + * with other code. + * + * This is needed because a DR7 access can cause a #VC exception + * when running under SEV-ES. Taking a #VC exception is not a + * safe thing to do just anywhere in the entry code and + * re-ordering might place the access into an unsafe location. + * + * This happened in the NMI handler, where the DR7 read was + * re-ordered to happen before the call to sev_es_ist_enter(), + * causing stack recursion. 
+ */ + asm volatile("mov %%db7, %0" : "=r" (val)); break; default: BUG(); @@ -66,7 +82,16 @@ static __always_inline void native_set_debugreg(int regno, unsigned long value) asm("mov %0, %%db6" ::"r" (value)); break; case 7: - asm("mov %0, %%db7" ::"r" (value)); + /* + * Use "asm volatile" for DR7 writes to forbid re-ordering them + * with other code. + * + * While it didn't happen with a DR7 write (see the DR7 read + * comment above which explains where it happened), add the + * "asm volatile" here too to avoid similar problems in the + * future. + */ + asm volatile("mov %0, %%db7" ::"r" (value)); break; default: BUG(); @@ -126,9 +151,36 @@ static __always_inline void local_db_restore(unsigned long dr7) } #ifdef CONFIG_CPU_SUP_AMD -extern void set_dr_addr_mask(unsigned long mask, int dr); +extern void amd_set_dr_addr_mask(unsigned long mask, unsigned int dr); +extern unsigned long amd_get_dr_addr_mask(unsigned int dr); #else -static inline void set_dr_addr_mask(unsigned long mask, int dr) { } +static inline void amd_set_dr_addr_mask(unsigned long mask, unsigned int dr) { } +static inline unsigned long amd_get_dr_addr_mask(unsigned int dr) +{ + return 0; +} #endif +static inline unsigned long get_debugctlmsr(void) +{ + unsigned long debugctlmsr = 0; + +#ifndef CONFIG_X86_DEBUGCTLMSR + if (boot_cpu_data.x86 < 6) + return 0; +#endif + rdmsrq(MSR_IA32_DEBUGCTLMSR, debugctlmsr); + + return debugctlmsr; +} + +static inline void update_debugctlmsr(unsigned long debugctlmsr) +{ +#ifndef CONFIG_X86_DEBUGCTLMSR + if (boot_cpu_data.x86 < 6) + return; +#endif + wrmsrq(MSR_IA32_DEBUGCTLMSR, debugctlmsr); +} + #endif /* _ASM_X86_DEBUGREG_H */ diff --git a/arch/x86/include/asm/desc.h b/arch/x86/include/asm/desc.h index ab97b22ac04a..ec95fe44fa3a 100644 --- a/arch/x86/include/asm/desc.h +++ b/arch/x86/include/asm/desc.h @@ -402,8 +402,6 @@ static inline void set_desc_limit(struct desc_struct *desc, unsigned long limit) desc->limit1 = (limit >> 16) & 0xf; } -void 
alloc_intr_gate(unsigned int n, const void *addr); - static inline void init_idt_data(struct idt_data *data, unsigned int n, const void *addr) { diff --git a/arch/x86/include/asm/desc_defs.h b/arch/x86/include/asm/desc_defs.h index f7e7099af595..7e6b9314758a 100644 --- a/arch/x86/include/asm/desc_defs.h +++ b/arch/x86/include/asm/desc_defs.h @@ -8,7 +8,57 @@ * archs. */ -#ifndef __ASSEMBLY__ +/* + * Low-level interface mapping flags/field names to bits + */ + +/* Flags for _DESC_S (non-system) descriptors */ +#define _DESC_ACCESSED 0x0001 +#define _DESC_DATA_WRITABLE 0x0002 +#define _DESC_CODE_READABLE 0x0002 +#define _DESC_DATA_EXPAND_DOWN 0x0004 +#define _DESC_CODE_CONFORMING 0x0004 +#define _DESC_CODE_EXECUTABLE 0x0008 + +/* Common flags */ +#define _DESC_S 0x0010 +#define _DESC_DPL(dpl) ((dpl) << 5) +#define _DESC_PRESENT 0x0080 + +#define _DESC_LONG_CODE 0x2000 +#define _DESC_DB 0x4000 +#define _DESC_GRANULARITY_4K 0x8000 + +/* System descriptors have a numeric "type" field instead of flags */ +#define _DESC_SYSTEM(code) (code) + +/* + * High-level interface mapping intended usage to low-level combinations + * of flags + */ + +#define _DESC_DATA (_DESC_S | _DESC_PRESENT | _DESC_ACCESSED | \ + _DESC_DATA_WRITABLE) +#define _DESC_CODE (_DESC_S | _DESC_PRESENT | _DESC_ACCESSED | \ + _DESC_CODE_READABLE | _DESC_CODE_EXECUTABLE) + +#define DESC_DATA16 (_DESC_DATA) +#define DESC_CODE16 (_DESC_CODE) + +#define DESC_DATA32 (_DESC_DATA | _DESC_GRANULARITY_4K | _DESC_DB) +#define DESC_DATA32_BIOS (_DESC_DATA | _DESC_DB) + +#define DESC_CODE32 (_DESC_CODE | _DESC_GRANULARITY_4K | _DESC_DB) +#define DESC_CODE32_BIOS (_DESC_CODE | _DESC_DB) + +#define DESC_TSS32 (_DESC_SYSTEM(9) | _DESC_PRESENT) + +#define DESC_DATA64 (_DESC_DATA | _DESC_GRANULARITY_4K | _DESC_DB) +#define DESC_CODE64 (_DESC_CODE | _DESC_GRANULARITY_4K | _DESC_LONG_CODE) + +#define DESC_USER (_DESC_DPL(3)) + +#ifndef __ASSEMBLER__ #include <linux/types.h> @@ -22,19 +72,19 @@ struct desc_struct { #define 
GDT_ENTRY_INIT(flags, base, limit) \ { \ - .limit0 = (u16) (limit), \ - .limit1 = ((limit) >> 16) & 0x0F, \ - .base0 = (u16) (base), \ - .base1 = ((base) >> 16) & 0xFF, \ - .base2 = ((base) >> 24) & 0xFF, \ - .type = (flags & 0x0f), \ - .s = (flags >> 4) & 0x01, \ - .dpl = (flags >> 5) & 0x03, \ - .p = (flags >> 7) & 0x01, \ - .avl = (flags >> 12) & 0x01, \ - .l = (flags >> 13) & 0x01, \ - .d = (flags >> 14) & 0x01, \ - .g = (flags >> 15) & 0x01, \ + .limit0 = ((limit) >> 0) & 0xFFFF, \ + .limit1 = ((limit) >> 16) & 0x000F, \ + .base0 = ((base) >> 0) & 0xFFFF, \ + .base1 = ((base) >> 16) & 0x00FF, \ + .base2 = ((base) >> 24) & 0x00FF, \ + .type = ((flags) >> 0) & 0x000F, \ + .s = ((flags) >> 4) & 0x0001, \ + .dpl = ((flags) >> 5) & 0x0003, \ + .p = ((flags) >> 7) & 0x0001, \ + .avl = ((flags) >> 12) & 0x0001, \ + .l = ((flags) >> 13) & 0x0001, \ + .d = ((flags) >> 14) & 0x0001, \ + .g = ((flags) >> 15) & 0x0001, \ } enum { @@ -94,6 +144,7 @@ struct gate_struct { typedef struct gate_struct gate_desc; +#ifndef _SETUP static inline unsigned long gate_offset(const gate_desc *g) { #ifdef CONFIG_X86_64 @@ -108,13 +159,14 @@ static inline unsigned long gate_segment(const gate_desc *g) { return g->segment; } +#endif struct desc_ptr { unsigned short size; unsigned long address; } __attribute__((packed)) ; -#endif /* !__ASSEMBLY__ */ +#endif /* !__ASSEMBLER__ */ /* Boot IDT definitions */ #define BOOT_IDT_ENTRIES 32 diff --git a/arch/x86/include/asm/disabled-features.h b/arch/x86/include/asm/disabled-features.h deleted file mode 100644 index 33d2cd04d254..000000000000 --- a/arch/x86/include/asm/disabled-features.h +++ /dev/null @@ -1,116 +0,0 @@ -#ifndef _ASM_X86_DISABLED_FEATURES_H -#define _ASM_X86_DISABLED_FEATURES_H - -/* These features, although they might be available in a CPU - * will not be used because the compile options to support - * them are not present. - * - * This code allows them to be checked and disabled at - * compile time without an explicit #ifdef. 
Use - * cpu_feature_enabled(). - */ - -#ifdef CONFIG_X86_UMIP -# define DISABLE_UMIP 0 -#else -# define DISABLE_UMIP (1<<(X86_FEATURE_UMIP & 31)) -#endif - -#ifdef CONFIG_X86_64 -# define DISABLE_VME (1<<(X86_FEATURE_VME & 31)) -# define DISABLE_K6_MTRR (1<<(X86_FEATURE_K6_MTRR & 31)) -# define DISABLE_CYRIX_ARR (1<<(X86_FEATURE_CYRIX_ARR & 31)) -# define DISABLE_CENTAUR_MCR (1<<(X86_FEATURE_CENTAUR_MCR & 31)) -# define DISABLE_PCID 0 -#else -# define DISABLE_VME 0 -# define DISABLE_K6_MTRR 0 -# define DISABLE_CYRIX_ARR 0 -# define DISABLE_CENTAUR_MCR 0 -# define DISABLE_PCID (1<<(X86_FEATURE_PCID & 31)) -#endif /* CONFIG_X86_64 */ - -#ifdef CONFIG_X86_INTEL_MEMORY_PROTECTION_KEYS -# define DISABLE_PKU 0 -# define DISABLE_OSPKE 0 -#else -# define DISABLE_PKU (1<<(X86_FEATURE_PKU & 31)) -# define DISABLE_OSPKE (1<<(X86_FEATURE_OSPKE & 31)) -#endif /* CONFIG_X86_INTEL_MEMORY_PROTECTION_KEYS */ - -#ifdef CONFIG_X86_5LEVEL -# define DISABLE_LA57 0 -#else -# define DISABLE_LA57 (1<<(X86_FEATURE_LA57 & 31)) -#endif - -#ifdef CONFIG_PAGE_TABLE_ISOLATION -# define DISABLE_PTI 0 -#else -# define DISABLE_PTI (1 << (X86_FEATURE_PTI & 31)) -#endif - -#ifdef CONFIG_RETPOLINE -# define DISABLE_RETPOLINE 0 -#else -# define DISABLE_RETPOLINE ((1 << (X86_FEATURE_RETPOLINE & 31)) | \ - (1 << (X86_FEATURE_RETPOLINE_LFENCE & 31))) -#endif - -#ifdef CONFIG_RETHUNK -# define DISABLE_RETHUNK 0 -#else -# define DISABLE_RETHUNK (1 << (X86_FEATURE_RETHUNK & 31)) -#endif - -#ifdef CONFIG_CPU_UNRET_ENTRY -# define DISABLE_UNRET 0 -#else -# define DISABLE_UNRET (1 << (X86_FEATURE_UNRET & 31)) -#endif - -#ifdef CONFIG_INTEL_IOMMU_SVM -# define DISABLE_ENQCMD 0 -#else -# define DISABLE_ENQCMD (1 << (X86_FEATURE_ENQCMD & 31)) -#endif - -#ifdef CONFIG_X86_SGX -# define DISABLE_SGX 0 -#else -# define DISABLE_SGX (1 << (X86_FEATURE_SGX & 31)) -#endif - -#ifdef CONFIG_INTEL_TDX_GUEST -# define DISABLE_TDX_GUEST 0 -#else -# define DISABLE_TDX_GUEST (1 << (X86_FEATURE_TDX_GUEST & 31)) -#endif - -/* - * 
Make sure to add features to the correct mask - */ -#define DISABLED_MASK0 (DISABLE_VME) -#define DISABLED_MASK1 0 -#define DISABLED_MASK2 0 -#define DISABLED_MASK3 (DISABLE_CYRIX_ARR|DISABLE_CENTAUR_MCR|DISABLE_K6_MTRR) -#define DISABLED_MASK4 (DISABLE_PCID) -#define DISABLED_MASK5 0 -#define DISABLED_MASK6 0 -#define DISABLED_MASK7 (DISABLE_PTI) -#define DISABLED_MASK8 (DISABLE_TDX_GUEST) -#define DISABLED_MASK9 (DISABLE_SGX) -#define DISABLED_MASK10 0 -#define DISABLED_MASK11 (DISABLE_RETPOLINE|DISABLE_RETHUNK|DISABLE_UNRET) -#define DISABLED_MASK12 0 -#define DISABLED_MASK13 0 -#define DISABLED_MASK14 0 -#define DISABLED_MASK15 0 -#define DISABLED_MASK16 (DISABLE_PKU|DISABLE_OSPKE|DISABLE_LA57|DISABLE_UMIP| \ - DISABLE_ENQCMD) -#define DISABLED_MASK17 0 -#define DISABLED_MASK18 0 -#define DISABLED_MASK19 0 -#define DISABLED_MASK_CHECK BUILD_BUG_ON_ZERO(NCAPINTS != 20) - -#endif /* _ASM_X86_DISABLED_FEATURES_H */ diff --git a/arch/x86/include/asm/div64.h b/arch/x86/include/asm/div64.h index b8f1dc0761e4..9931e4c7d73f 100644 --- a/arch/x86/include/asm/div64.h +++ b/arch/x86/include/asm/div64.h @@ -71,6 +71,12 @@ static inline u64 mul_u32_u32(u32 a, u32 b) } #define mul_u32_u32 mul_u32_u32 +/* + * __div64_32() is never called on x86, so prevent the + * generic definition from getting built. 
+ */ +#define __div64_32 + #else # include <asm-generic/div64.h> diff --git a/arch/x86/include/asm/dma-mapping.h b/arch/x86/include/asm/dma-mapping.h index 1c66708e3062..d1dac96ee30b 100644 --- a/arch/x86/include/asm/dma-mapping.h +++ b/arch/x86/include/asm/dma-mapping.h @@ -4,7 +4,7 @@ extern const struct dma_map_ops *dma_ops; -static inline const struct dma_map_ops *get_arch_dma_ops(struct bus_type *bus) +static inline const struct dma_map_ops *get_arch_dma_ops(void) { return dma_ops; } diff --git a/arch/x86/include/asm/dma.h b/arch/x86/include/asm/dma.h index 8e95aa4b0d17..8ae6e0e11b8b 100644 --- a/arch/x86/include/asm/dma.h +++ b/arch/x86/include/asm/dma.h @@ -307,12 +307,4 @@ extern int request_dma(unsigned int dmanr, const char *device_id); extern void free_dma(unsigned int dmanr); #endif -/* From PCI */ - -#ifdef CONFIG_PCI -extern int isa_dma_bridge_buggy; -#else -#define isa_dma_bridge_buggy (0) -#endif - #endif /* _ASM_X86_DMA_H */ diff --git a/arch/x86/include/asm/doublefault.h b/arch/x86/include/asm/doublefault.h index 54a6e4a2e132..de0e88b32207 100644 --- a/arch/x86/include/asm/doublefault.h +++ b/arch/x86/include/asm/doublefault.h @@ -2,6 +2,8 @@ #ifndef _ASM_X86_DOUBLEFAULT_H #define _ASM_X86_DOUBLEFAULT_H +#include <linux/linkage.h> + #ifdef CONFIG_X86_32 extern void doublefault_init_cpu_tss(void); #else @@ -10,4 +12,6 @@ static inline void doublefault_init_cpu_tss(void) } #endif +asmlinkage void __noreturn doublefault_shim(void); + #endif /* _ASM_X86_DOUBLEFAULT_H */ diff --git a/arch/x86/include/asm/dwarf2.h b/arch/x86/include/asm/dwarf2.h index 430fca13bb56..302e11b15da8 100644 --- a/arch/x86/include/asm/dwarf2.h +++ b/arch/x86/include/asm/dwarf2.h @@ -2,7 +2,7 @@ #ifndef _ASM_X86_DWARF2_H #define _ASM_X86_DWARF2_H -#ifndef __ASSEMBLY__ +#ifndef __ASSEMBLER__ #warning "asm/dwarf2.h should be only included in pure assembly files" #endif diff --git a/arch/x86/include/asm/e820/api.h b/arch/x86/include/asm/e820/api.h index e8f58ddd06d9..c83645d5b2a8 
100644 --- a/arch/x86/include/asm/e820/api.h +++ b/arch/x86/include/asm/e820/api.h @@ -17,6 +17,7 @@ extern bool e820__mapped_all(u64 start, u64 end, enum e820_type type); extern void e820__range_add (u64 start, u64 size, enum e820_type type); extern u64 e820__range_update(u64 start, u64 size, enum e820_type old_type, enum e820_type new_type); extern u64 e820__range_remove(u64 start, u64 size, enum e820_type old_type, bool check_type); +extern u64 e820__range_update_table(struct e820_table *t, u64 start, u64 size, enum e820_type old_type, enum e820_type new_type); extern void e820__print_table(char *who); extern int e820__update_table(struct e820_table *table); @@ -28,7 +29,6 @@ extern unsigned long e820__end_of_low_ram_pfn(void); extern u64 e820__memblock_alloc_reserved(u64 size, u64 align); extern void e820__memblock_setup(void); -extern void e820__reserve_setup_data(void); extern void e820__finish_early_params(void); extern void e820__reserve_resources(void); extern void e820__reserve_resources_late(void); diff --git a/arch/x86/include/asm/e820/types.h b/arch/x86/include/asm/e820/types.h index 314f75d886d0..80c4a7266629 100644 --- a/arch/x86/include/asm/e820/types.h +++ b/arch/x86/include/asm/e820/types.h @@ -35,15 +35,6 @@ enum e820_type { * marking it with the IORES_DESC_SOFT_RESERVED designation. */ E820_TYPE_SOFT_RESERVED = 0xefffffff, - - /* - * Reserved RAM used by the kernel itself if - * CONFIG_INTEL_TXT=y is enabled, memory of this type - * will be included in the S3 integrity calculation - * and so should not include any memory that the BIOS - * might alter over the S3 transition: - */ - E820_TYPE_RESERVED_KERN = 128, }; /* diff --git a/arch/x86/include/asm/edac.h b/arch/x86/include/asm/edac.h index 426fc53ff803..dfbd1ebb9f10 100644 --- a/arch/x86/include/asm/edac.h +++ b/arch/x86/include/asm/edac.h @@ -13,7 +13,7 @@ static inline void edac_atomic_scrub(void *va, u32 size) * are interrupt, DMA and SMP safe. 
*/ for (i = 0; i < size / 4; i++, virt_addr++) - asm volatile("lock; addl $0, %0"::"m" (*virt_addr)); + asm volatile("lock addl $0, %0"::"m" (*virt_addr)); } #endif /* _ASM_X86_EDAC_H */ diff --git a/arch/x86/include/asm/efi.h b/arch/x86/include/asm/efi.h index 233ae6986d6f..f227a70ac91f 100644 --- a/arch/x86/include/asm/efi.h +++ b/arch/x86/include/asm/efi.h @@ -31,6 +31,8 @@ extern unsigned long efi_mixed_mode_stack_pa; #define ARCH_EFI_IRQ_FLAGS_MASK X86_EFLAGS_IF +#define EFI_UNACCEPTED_UNIT_SIZE PMD_SIZE + /* * The EFI services are called through variadic functions in many cases. These * functions are implemented in assembler and support only a fixed number of @@ -88,52 +90,27 @@ static inline void efi_fpu_end(void) } #ifdef CONFIG_X86_32 -#define arch_efi_call_virt_setup() \ -({ \ - efi_fpu_begin(); \ - firmware_restrict_branch_speculation_start(); \ -}) - -#define arch_efi_call_virt_teardown() \ -({ \ - firmware_restrict_branch_speculation_end(); \ - efi_fpu_end(); \ -}) - +#define EFI_X86_KERNEL_ALLOC_LIMIT (SZ_512M - 1) #else /* !CONFIG_X86_32 */ - -#define EFI_LOADER_SIGNATURE "EL64" +#define EFI_X86_KERNEL_ALLOC_LIMIT EFI_ALLOC_LIMIT extern asmlinkage u64 __efi_call(void *fp, ...); +extern bool efi_disable_ibt_for_runtime; + #define efi_call(...) ({ \ __efi_nargs_check(efi_call, 7, __VA_ARGS__); \ __efi_call(__VA_ARGS__); \ }) -#define arch_efi_call_virt_setup() \ -({ \ - efi_sync_low_kernel_mappings(); \ - efi_fpu_begin(); \ - firmware_restrict_branch_speculation_start(); \ - efi_enter_mm(); \ -}) - #undef arch_efi_call_virt #define arch_efi_call_virt(p, f, args...) ({ \ - u64 ret, ibt = ibt_save(); \ + u64 ret, ibt = ibt_save(efi_disable_ibt_for_runtime); \ ret = efi_call((void *)p->f, args); \ ibt_restore(ibt); \ ret; \ }) -#define arch_efi_call_virt_teardown() \ -({ \ - efi_leave_mm(); \ - firmware_restrict_branch_speculation_end(); \ - efi_fpu_end(); \ -}) - #ifdef CONFIG_KASAN /* * CONFIG_KASAN may redefine memset to __memset. 
__memset function is present @@ -163,22 +140,13 @@ extern void efi_delete_dummy_variable(void); extern void efi_crash_gracefully_on_page_fault(unsigned long phys_addr); extern void efi_free_boot_services(void); -void efi_enter_mm(void); -void efi_leave_mm(void); - -/* kexec external ABI */ -struct efi_setup_data { - u64 fw_vendor; - u64 __unused; - u64 tables; - u64 smbios; - u64 reserved[8]; -}; +void arch_efi_call_virt_setup(void); +void arch_efi_call_virt_teardown(void); extern u64 efi_setup; #ifdef CONFIG_EFI -extern efi_status_t __efi64_thunk(u32, ...); +extern u64 __efi64_thunk(u32, ...); #define efi64_thunk(...) ({ \ u64 __pad[3]; /* must have space for 3 args on the stack */ \ @@ -214,6 +182,8 @@ efi_status_t efi_set_virtual_address_map(unsigned long memory_map_size, #ifdef CONFIG_EFI_MIXED +#define EFI_ALLOC_LIMIT (efi_is_64bit() ? ULONG_MAX : U32_MAX) + #define ARCH_HAS_EFISTUB_WRAPPERS static inline bool efi_is_64bit(void) @@ -228,16 +198,15 @@ static inline bool efi_is_native(void) return efi_is_64bit(); } -#define efi_mixed_mode_cast(attr) \ - __builtin_choose_expr( \ - __builtin_types_compatible_p(u32, __typeof__(attr)), \ - (unsigned long)(attr), (attr)) - #define efi_table_attr(inst, attr) \ - (efi_is_native() \ - ? inst->attr \ - : (__typeof__(inst->attr)) \ - efi_mixed_mode_cast(inst->mixed_mode.attr)) + (efi_is_native() ? 
(inst)->attr \ + : efi_mixed_table_attr((inst), attr)) + +#define efi_mixed_table_attr(inst, attr) \ + (__typeof__(inst->attr)) \ + _Generic(inst->mixed_mode.attr, \ + u32: (unsigned long)(inst->mixed_mode.attr), \ + default: (inst->mixed_mode.attr)) /* * The following macros allow translating arguments if necessary from native to @@ -260,7 +229,8 @@ static inline bool efi_is_native(void) static inline void *efi64_zero_upper(void *p) { - ((u32 *)p)[1] = 0; + if (p) + ((u32 *)p)[1] = 0; return p; } @@ -280,6 +250,9 @@ static inline u32 efi64_convert_status(efi_status_t status) #define __efi64_argmap_allocate_pool(type, size, buffer) \ ((type), (size), efi64_zero_upper(buffer)) +#define __efi64_argmap_locate_handle_buffer(type, proto, key, num, buf) \ + ((type), (proto), (key), efi64_zero_upper(num), efi64_zero_upper(buf)) + #define __efi64_argmap_create_event(type, tpl, f, c, event) \ ((type), (tpl), (f), (c), efi64_zero_upper(event)) @@ -325,6 +298,31 @@ static inline u32 efi64_convert_status(efi_status_t status) #define __efi64_argmap_set_memory_space_attributes(phys, size, flags) \ (__efi64_split(phys), __efi64_split(size), __efi64_split(flags)) +/* file protocol */ +#define __efi64_argmap_open(prot, newh, fname, mode, attr) \ + ((prot), efi64_zero_upper(newh), (fname), __efi64_split(mode), \ + __efi64_split(attr)) + +#define __efi64_argmap_set_position(pos) (__efi64_split(pos)) + +/* file system protocol */ +#define __efi64_argmap_open_volume(prot, file) \ + ((prot), efi64_zero_upper(file)) + +/* Memory Attribute Protocol */ +#define __efi64_argmap_get_memory_attributes(protocol, phys, size, flags) \ + ((protocol), __efi64_split(phys), __efi64_split(size), (flags)) + +#define __efi64_argmap_set_memory_attributes(protocol, phys, size, flags) \ + ((protocol), __efi64_split(phys), __efi64_split(size), __efi64_split(flags)) + +#define __efi64_argmap_clear_memory_attributes(protocol, phys, size, flags) \ + ((protocol), __efi64_split(phys), __efi64_split(size), 
__efi64_split(flags)) + +/* EFI SMBIOS protocol */ +#define __efi64_argmap_get_next(protocol, smbioshandle, type, record, phandle) \ + ((protocol), (smbioshandle), (type), efi64_zero_upper(record), \ + efi64_zero_upper(phandle)) /* * The macros below handle the plumbing for the argument mapping. To add a * mapping for a specific EFI method, simply define a macro @@ -344,31 +342,27 @@ static inline u32 efi64_convert_status(efi_status_t status) #define __efi_eat(...) #define __efi_eval(...) __VA_ARGS__ -/* The three macros below handle dispatching via the thunk if needed */ - -#define efi_call_proto(inst, func, ...) \ - (efi_is_native() \ - ? inst->func(inst, ##__VA_ARGS__) \ - : __efi64_thunk_map(inst, func, inst, ##__VA_ARGS__)) +static inline efi_status_t __efi64_widen_efi_status(u64 status) +{ + /* use rotate to move the value of bit #31 into position #63 */ + return ror64(rol32(status, 1), 1); +} -#define efi_bs_call(func, ...) \ - (efi_is_native() \ - ? efi_system_table->boottime->func(__VA_ARGS__) \ - : __efi64_thunk_map(efi_table_attr(efi_system_table, \ - boottime), \ - func, __VA_ARGS__)) +/* The macro below handles dispatching via the thunk if needed */ -#define efi_rt_call(func, ...) \ - (efi_is_native() \ - ? efi_system_table->runtime->func(__VA_ARGS__) \ - : __efi64_thunk_map(efi_table_attr(efi_system_table, \ - runtime), \ - func, __VA_ARGS__)) +#define efi_fn_call(inst, func, ...) \ + (efi_is_native() ? (inst)->func(__VA_ARGS__) \ + : efi_mixed_call((inst), func, ##__VA_ARGS__)) -#define efi_dxe_call(func, ...) \ - (efi_is_native() \ - ? efi_dxe_table->func(__VA_ARGS__) \ - : __efi64_thunk_map(efi_dxe_table, func, __VA_ARGS__)) +#define efi_mixed_call(inst, func, ...) 
\ + _Generic(inst->func(__VA_ARGS__), \ + efi_status_t: \ + __efi64_widen_efi_status( \ + __efi64_thunk_map(inst, func, ##__VA_ARGS__)), \ + u64: ({ BUILD_BUG(); ULONG_MAX; }), \ + default: \ + (__typeof__(inst->func(__VA_ARGS__))) \ + __efi64_thunk_map(inst, func, ##__VA_ARGS__)) #else /* CONFIG_EFI_MIXED */ @@ -398,15 +392,39 @@ static inline void efi_reserve_boot_services(void) } #endif /* CONFIG_EFI */ -#ifdef CONFIG_EFI_FAKE_MEMMAP -extern void __init efi_fake_memmap_early(void); +extern int __init efi_memmap_alloc(unsigned int num_entries, + struct efi_memory_map_data *data); + +extern int __init efi_memmap_install(struct efi_memory_map_data *data); +extern int __init efi_memmap_split_count(efi_memory_desc_t *md, + struct range *range); +extern void __init efi_memmap_insert(struct efi_memory_map *old_memmap, + void *buf, struct efi_mem_range *mem); + +extern enum efi_secureboot_mode __x86_ima_efi_boot_mode(void); + +#define arch_ima_efi_boot_mode __x86_ima_efi_boot_mode() + +#ifdef CONFIG_EFI_RUNTIME_MAP +int efi_get_runtime_map_size(void); +int efi_get_runtime_map_desc_size(void); +int efi_runtime_map_copy(void *buf, size_t bufsz); #else -static inline void efi_fake_memmap_early(void) +static inline int efi_get_runtime_map_size(void) { + return 0; +} + +static inline int efi_get_runtime_map_desc_size(void) +{ + return 0; } -#endif -#define arch_ima_efi_boot_mode \ - ({ extern struct boot_params boot_params; boot_params.secure_boot; }) +static inline int efi_runtime_map_copy(void *buf, size_t bufsz) +{ + return 0; +} + +#endif #endif /* _ASM_X86_EFI_H */ diff --git a/arch/x86/include/asm/elf.h b/arch/x86/include/asm/elf.h index cb0ff1055ab1..6c8fdc96be7e 100644 --- a/arch/x86/include/asm/elf.h +++ b/arch/x86/include/asm/elf.h @@ -7,6 +7,7 @@ */ #include <linux/thread_info.h> +#include <asm/ia32.h> #include <asm/ptrace.h> #include <asm/user.h> #include <asm/auxvec.h> @@ -53,8 +54,9 @@ typedef struct user_i387_struct elf_fpregset_t; #define R_X86_64_GLOB_DAT 6 
/* Create GOT entry */ #define R_X86_64_JUMP_SLOT 7 /* Create PLT entry */ #define R_X86_64_RELATIVE 8 /* Adjust by program base */ -#define R_X86_64_GOTPCREL 9 /* 32 bit signed pc relative - offset to GOT */ +#define R_X86_64_GOTPCREL 9 /* 32 bit signed pc relative offset to GOT */ +#define R_X86_64_GOTPCRELX 41 +#define R_X86_64_REX_GOTPCRELX 42 #define R_X86_64_32 10 /* Direct 32 bit zero extended */ #define R_X86_64_32S 11 /* Direct 32 bit sign extended */ #define R_X86_64_16 12 /* Direct 16 bit zero extended */ @@ -74,12 +76,8 @@ typedef struct user_i387_struct elf_fpregset_t; #include <asm/vdso.h> -#ifdef CONFIG_X86_64 extern unsigned int vdso64_enabled; -#endif -#if defined(CONFIG_X86_32) || defined(CONFIG_IA32_EMULATION) extern unsigned int vdso32_enabled; -#endif /* * This is used to ensure we don't load something for the wrong architecture. @@ -149,13 +147,9 @@ do { \ ((x)->e_machine == EM_X86_64) #define compat_elf_check_arch(x) \ - (elf_check_arch_ia32(x) || \ + ((elf_check_arch_ia32(x) && ia32_enabled_verbose()) || \ (IS_ENABLED(CONFIG_X86_X32_ABI) && (x)->e_machine == EM_X86_64)) -#if __USER32_DS != __USER_DS -# error "The following code assumes __USER32_DS == __USER_DS" -#endif - static inline void elf_common_init(struct thread_struct *t, struct pt_regs *regs, const u16 ds) { @@ -226,7 +220,6 @@ do { \ /* I'm not sure if we can use '-' here */ #define ELF_PLATFORM ("x86_64") extern void set_personality_64bit(void); -extern unsigned int sysctl_vsyscall32; extern int force_personality32; #endif /* !CONFIG_X86_32 */ @@ -396,5 +389,4 @@ struct va_alignment { } ____cacheline_aligned; extern struct va_alignment va_align; -extern unsigned long align_vdso_addr(unsigned long); #endif /* _ASM_X86_ELF_H */ diff --git a/arch/x86/include/asm/entry-common.h b/arch/x86/include/asm/entry-common.h index 674ed46d3ced..d535a97c7284 100644 --- a/arch/x86/include/asm/entry-common.h +++ b/arch/x86/include/asm/entry-common.h @@ -8,6 +8,7 @@ #include <asm/nospec-branch.h> 
#include <asm/io_bitmap.h> #include <asm/fpu/api.h> +#include <asm/fred.h> /* Check that the stack and regs on entry from user mode are sane. */ static __always_inline void arch_enter_from_user_mode(struct pt_regs *regs) @@ -24,8 +25,8 @@ static __always_inline void arch_enter_from_user_mode(struct pt_regs *regs) /* * For !SMAP hardware we patch out CLAC on entry. */ - if (boot_cpu_has(X86_FEATURE_SMAP) || - (IS_ENABLED(CONFIG_64BIT) && boot_cpu_has(X86_FEATURE_XENPV))) + if (cpu_feature_enabled(X86_FEATURE_SMAP) || + cpu_feature_enabled(X86_FEATURE_XENPV)) mask |= X86_EFLAGS_AC; WARN_ON_ONCE(flags & mask); @@ -44,8 +45,7 @@ static __always_inline void arch_enter_from_user_mode(struct pt_regs *regs) } #define arch_enter_from_user_mode arch_enter_from_user_mode -static inline void arch_exit_to_user_mode_prepare(struct pt_regs *regs, - unsigned long ti_work) +static inline void arch_exit_work(unsigned long ti_work) { if (ti_work & _TIF_USER_RETURN_NOTIFY) fire_user_return_notifiers(); @@ -53,9 +53,19 @@ static inline void arch_exit_to_user_mode_prepare(struct pt_regs *regs, if (unlikely(ti_work & _TIF_IO_BITMAP)) tss_update_io_bitmap(); - fpregs_assert_state_consistent(); if (unlikely(ti_work & _TIF_NEED_FPU_LOAD)) switch_fpu_return(); +} + +static inline void arch_exit_to_user_mode_prepare(struct pt_regs *regs, + unsigned long ti_work) +{ + fpregs_assert_state_consistent(); + + if (unlikely(ti_work)) + arch_exit_work(ti_work); + + fred_update_rsp0(); #ifdef CONFIG_COMPAT /* @@ -73,25 +83,22 @@ static inline void arch_exit_to_user_mode_prepare(struct pt_regs *regs, #endif /* - * Ultimately, this value will get limited by KSTACK_OFFSET_MAX(), - * but not enough for x86 stack utilization comfort. To keep - * reasonable stack head room, reduce the maximum offset to 8 bits. 
- * - * The actual entropy will be further reduced by the compiler when - * applying stack alignment constraints (see cc_stack_align4/8 in + * This value will get limited by KSTACK_OFFSET_MAX(), which is 10 + * bits. The actual entropy will be further reduced by the compiler + * when applying stack alignment constraints (see cc_stack_align4/8 in * arch/x86/Makefile), which will remove the 3 (x86_64) or 2 (ia32) * low bits from any entropy chosen here. * - * Therefore, final stack offset entropy will be 5 (x86_64) or - * 6 (ia32) bits. + * Therefore, final stack offset entropy will be 7 (x86_64) or + * 8 (ia32) bits. */ - choose_random_kstack_offset(rdtsc() & 0xFF); + choose_random_kstack_offset(rdtsc()); } #define arch_exit_to_user_mode_prepare arch_exit_to_user_mode_prepare static __always_inline void arch_exit_to_user_mode(void) { - mds_user_clear_cpu_buffers(); + amd_clear_divider(); } #define arch_exit_to_user_mode arch_exit_to_user_mode diff --git a/arch/x86/include/asm/extable.h b/arch/x86/include/asm/extable.h index eeed395c3177..a0e0c6b50155 100644 --- a/arch/x86/include/asm/extable.h +++ b/arch/x86/include/asm/extable.h @@ -37,7 +37,6 @@ struct pt_regs; extern int fixup_exception(struct pt_regs *regs, int trapnr, unsigned long error_code, unsigned long fault_addr); -extern int fixup_bug(struct pt_regs *regs, int trapnr); extern int ex_get_fixup_type(unsigned long ip); extern void early_fixup_exception(struct pt_regs *regs, int trapnr); diff --git a/arch/x86/include/asm/extable_fixup_types.h b/arch/x86/include/asm/extable_fixup_types.h index 503622627400..906b0d5541e8 100644 --- a/arch/x86/include/asm/extable_fixup_types.h +++ b/arch/x86/include/asm/extable_fixup_types.h @@ -4,7 +4,7 @@ /* * Our IMM is signed, as such it must live at the top end of the word. 
Also, - * since C99 hex constants are of ambigious type, force cast the mask to 'int' + * since C99 hex constants are of ambiguous type, force cast the mask to 'int' * so that FIELD_GET() will DTRT and sign extend the value when it extracts it. */ #define EX_DATA_TYPE_MASK ((int)0x000000FF) @@ -36,7 +36,7 @@ #define EX_TYPE_DEFAULT 1 #define EX_TYPE_FAULT 2 #define EX_TYPE_UACCESS 3 -#define EX_TYPE_COPY 4 +/* unused, was: #define EX_TYPE_COPY 4 */ #define EX_TYPE_CLEAR_FS 5 #define EX_TYPE_FPU_RESTORE 6 #define EX_TYPE_BPF 7 @@ -64,4 +64,8 @@ #define EX_TYPE_UCOPY_LEN4 (EX_TYPE_UCOPY_LEN | EX_DATA_IMM(4)) #define EX_TYPE_UCOPY_LEN8 (EX_TYPE_UCOPY_LEN | EX_DATA_IMM(8)) +#define EX_TYPE_ZEROPAD 20 /* longword load with zeropad on fault */ + +#define EX_TYPE_ERETU 21 + #endif diff --git a/arch/x86/include/asm/fb.h b/arch/x86/include/asm/fb.h deleted file mode 100644 index ab4c960146e3..000000000000 --- a/arch/x86/include/asm/fb.h +++ /dev/null @@ -1,22 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -#ifndef _ASM_X86_FB_H -#define _ASM_X86_FB_H - -#include <linux/fb.h> -#include <linux/fs.h> -#include <asm/page.h> - -static inline void fb_pgprotect(struct file *file, struct vm_area_struct *vma, - unsigned long off) -{ - unsigned long prot; - - prot = pgprot_val(vma->vm_page_prot) & ~_PAGE_CACHE_MASK; - if (boot_cpu_data.x86 > 3) - pgprot_val(vma->vm_page_prot) = - prot | cachemode2protval(_PAGE_CACHE_MODE_UC_MINUS); -} - -extern int fb_is_primary_device(struct fb_info *info); - -#endif /* _ASM_X86_FB_H */ diff --git a/arch/x86/include/asm/fixmap.h b/arch/x86/include/asm/fixmap.h index d0dcefb5cc59..4519c9f35ba0 100644 --- a/arch/x86/include/asm/fixmap.h +++ b/arch/x86/include/asm/fixmap.h @@ -31,7 +31,7 @@ /* fixmap starts downwards from the 507th entry in level2_fixmap_pgt */ #define FIXMAP_PMD_TOP 507 -#ifndef __ASSEMBLY__ +#ifndef __ASSEMBLER__ #include <linux/kernel.h> #include <asm/apicdef.h> #include <asm/page.h> @@ -196,5 +196,5 @@ void __init 
*early_memremap_decrypted_wp(resource_size_t phys_addr, void __early_set_fixmap(enum fixed_addresses idx, phys_addr_t phys, pgprot_t flags); -#endif /* !__ASSEMBLY__ */ +#endif /* !__ASSEMBLER__ */ #endif /* _ASM_X86_FIXMAP_H */ diff --git a/arch/x86/include/asm/fpu.h b/arch/x86/include/asm/fpu.h new file mode 100644 index 000000000000..b2743fe19339 --- /dev/null +++ b/arch/x86/include/asm/fpu.h @@ -0,0 +1,13 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +/* + * Copyright (C) 2023 SiFive + */ + +#ifndef _ASM_X86_FPU_H +#define _ASM_X86_FPU_H + +#include <asm/fpu/api.h> + +#define kernel_fpu_available() true + +#endif /* ! _ASM_X86_FPU_H */ diff --git a/arch/x86/include/asm/fpu/api.h b/arch/x86/include/asm/fpu/api.h index 503a577814b2..cd6f194a912b 100644 --- a/arch/x86/include/asm/fpu/api.h +++ b/arch/x86/include/asm/fpu/api.h @@ -16,10 +16,9 @@ /* * Use kernel_fpu_begin/end() if you intend to use FPU in kernel context. It - * disables preemption so be careful if you intend to use it for long periods - * of time. - * If you intend to use the FPU in irq/softirq you need to check first with - * irq_fpu_usable() if it is possible. + * disables preemption and softirq processing, so be careful if you intend to + * use it for long periods of time. Kernel-mode FPU cannot be used in all + * contexts -- see irq_fpu_usable() for details. */ /* Kernel FPU states to initialize in kernel_fpu_begin_mask() */ @@ -50,10 +49,10 @@ static inline void kernel_fpu_begin(void) } /* - * Use fpregs_lock() while editing CPU's FPU registers or fpu->fpstate. - * A context switch will (and softirq might) save CPU's FPU registers to - * fpu->fpstate.regs and set TIF_NEED_FPU_LOAD leaving CPU's FPU registers in - * a random state. + * Use fpregs_lock() while editing CPU's FPU registers or fpu->fpstate, or while + * using the FPU in kernel mode. 
A context switch will (and softirq might) save + * CPU's FPU registers to fpu->fpstate.regs and set TIF_NEED_FPU_LOAD leaving + * CPU's FPU registers in a random state. * * local_bh_disable() protects against both preemption and soft interrupts * on !RT kernels. @@ -63,8 +62,6 @@ static inline void kernel_fpu_begin(void) * preemptible. Disabling preemption is the right choice here as bottom * half processing is always in thread context on RT kernels so it * implicitly prevents bottom half processing as well. - * - * Disabling preemption also serializes against kernel_fpu_begin(). */ static inline void fpregs_lock(void) { @@ -82,6 +79,15 @@ static inline void fpregs_unlock(void) preempt_enable(); } +/* + * FPU state gets lazily restored before returning to userspace. So when in the + * kernel, the valid FPU state may be kept in the buffer. This function will force + * restore all the fpu state to the registers early if needed, and lock them from + * being automatically saved/restored. Then FPU state can be modified safely in the + * registers, before unlocking with fpregs_unlock(). 
+ */ +void fpregs_lock_and_load(void); + #ifdef CONFIG_X86_DEBUG_FPU extern void fpregs_assert_state_consistent(void); #else @@ -109,7 +115,7 @@ extern void fpu_reset_from_exception_fixup(void); /* Boot, hotplug and resume */ extern void fpu__init_cpu(void); -extern void fpu__init_system(struct cpuinfo_x86 *c); +extern void fpu__init_system(void); extern void fpu__init_check_bugs(void); extern void fpu__resume_cpu(void); @@ -120,6 +126,7 @@ static inline void fpstate_init_soft(struct swregs_state *soft) {} #endif /* State tracking */ +DECLARE_PER_CPU(bool, kernel_fpu_allowed); DECLARE_PER_CPU(struct fpu *, fpu_fpregs_owner_ctx); /* Process cleanup */ @@ -130,10 +137,13 @@ static inline void fpstate_free(struct fpu *fpu) { } #endif /* fpstate-related functions which are exported to KVM */ -extern void fpstate_clear_xstate_component(struct fpstate *fps, unsigned int xfeature); +extern void fpstate_clear_xstate_component(struct fpstate *fpstate, unsigned int xfeature); extern u64 xstate_get_guest_group_perm(void); +extern void *get_xsave_addr(struct xregs_state *xsave, int xfeature_nr); + + /* KVM specific functions */ extern bool fpu_alloc_guest_fpstate(struct fpu_guest *gfpu); extern void fpu_free_guest_fpstate(struct fpu_guest *gfpu); @@ -148,7 +158,8 @@ static inline void fpu_update_guest_xfd(struct fpu_guest *guest_fpu, u64 xfd) { static inline void fpu_sync_guest_vmexit_xfd_state(void) { } #endif -extern void fpu_copy_guest_fpstate_to_uabi(struct fpu_guest *gfpu, void *buf, unsigned int size, u32 pkru); +extern void fpu_copy_guest_fpstate_to_uabi(struct fpu_guest *gfpu, void *buf, + unsigned int size, u64 xfeatures, u32 pkru); extern int fpu_copy_uabi_to_guest_fpstate(struct fpu_guest *gfpu, const void *buf, u64 xcr0, u32 *vpkru); static inline void fpstate_set_confidential(struct fpu_guest *gfpu) diff --git a/arch/x86/include/asm/fpu/regset.h b/arch/x86/include/asm/fpu/regset.h index 4f928d6a367b..697b77e96025 100644 --- a/arch/x86/include/asm/fpu/regset.h +++ 
b/arch/x86/include/asm/fpu/regset.h @@ -7,11 +7,12 @@ #include <linux/regset.h> -extern user_regset_active_fn regset_fpregs_active, regset_xregset_fpregs_active; +extern user_regset_active_fn regset_fpregs_active, regset_xregset_fpregs_active, + ssp_active; extern user_regset_get2_fn fpregs_get, xfpregs_get, fpregs_soft_get, - xstateregs_get; + xstateregs_get, ssp_get; extern user_regset_set_fn fpregs_set, xfpregs_set, fpregs_soft_set, - xstateregs_set; + xstateregs_set, ssp_set; /* * xstateregs_active == regset_fpregs_active. Please refer to the comment diff --git a/arch/x86/include/asm/fpu/sched.h b/arch/x86/include/asm/fpu/sched.h index b2486b2cbc6e..c060549c6c94 100644 --- a/arch/x86/include/asm/fpu/sched.h +++ b/arch/x86/include/asm/fpu/sched.h @@ -10,36 +10,33 @@ #include <asm/trace/fpu.h> extern void save_fpregs_to_fpstate(struct fpu *fpu); -extern void fpu__drop(struct fpu *fpu); -extern int fpu_clone(struct task_struct *dst, unsigned long clone_flags, bool minimal); +extern void fpu__drop(struct task_struct *tsk); +extern int fpu_clone(struct task_struct *dst, unsigned long clone_flags, bool minimal, + unsigned long shstk_addr); extern void fpu_flush_thread(void); /* * FPU state switching for scheduling. * - * This is a two-stage process: + * switch_fpu() saves the old state and sets TIF_NEED_FPU_LOAD if + * TIF_NEED_FPU_LOAD is not set. This is done within the context + * of the old process. * - * - switch_fpu_prepare() saves the old state. - * This is done within the context of the old process. - * - * - switch_fpu_finish() sets TIF_NEED_FPU_LOAD; the floating point state - * will get loaded on return to userspace, or when the kernel needs it. - * - * If TIF_NEED_FPU_LOAD is cleared then the CPU's FPU registers - * are saved in the current thread's FPU register state. - * - * If TIF_NEED_FPU_LOAD is set then CPU's FPU registers may not - * hold current()'s FPU registers. 
It is required to load the + * Once TIF_NEED_FPU_LOAD is set, it is required to load the * registers before returning to userland or using the content * otherwise. * * The FPU context is only stored/restored for a user task and * PF_KTHREAD is used to distinguish between kernel and user threads. */ -static inline void switch_fpu_prepare(struct fpu *old_fpu, int cpu) +static inline void switch_fpu(struct task_struct *old, int cpu) { - if (cpu_feature_enabled(X86_FEATURE_FPU) && - !(current->flags & PF_KTHREAD)) { + if (!test_tsk_thread_flag(old, TIF_NEED_FPU_LOAD) && + cpu_feature_enabled(X86_FEATURE_FPU) && + !(old->flags & (PF_KTHREAD | PF_USER_WORKER))) { + struct fpu *old_fpu = x86_task_fpu(old); + + set_tsk_thread_flag(old, TIF_NEED_FPU_LOAD); save_fpregs_to_fpstate(old_fpu); /* * The save operation preserved register state, so the @@ -47,7 +44,7 @@ static inline void switch_fpu_prepare(struct fpu *old_fpu, int cpu) * current CPU number in @old_fpu, so the next return * to user space can avoid the FPU register restore * when is returns on the same CPU and still owns the - * context. + * context. See fpregs_restore_userregs(). */ old_fpu->last_cpu = cpu; @@ -55,14 +52,4 @@ static inline void switch_fpu_prepare(struct fpu *old_fpu, int cpu) } } -/* - * Delay loading of the complete FPU state until the return to userland. - * PKRU is handled separately. 
- */ -static inline void switch_fpu_finish(void) -{ - if (cpu_feature_enabled(X86_FEATURE_FPU)) - set_thread_flag(TIF_NEED_FPU_LOAD); -} - #endif /* _ASM_X86_FPU_SCHED_H */ diff --git a/arch/x86/include/asm/fpu/signal.h b/arch/x86/include/asm/fpu/signal.h index e1c9df9102a5..eccc75bc9c4f 100644 --- a/arch/x86/include/asm/fpu/signal.h +++ b/arch/x86/include/asm/fpu/signal.h @@ -13,16 +13,9 @@ #ifdef CONFIG_X86_64 # include <uapi/asm/sigcontext.h> # include <asm/user32.h> -struct ksignal; -int ia32_setup_rt_frame(int sig, struct ksignal *ksig, - compat_sigset_t *set, struct pt_regs *regs); -int ia32_setup_frame(int sig, struct ksignal *ksig, - compat_sigset_t *set, struct pt_regs *regs); #else # define user_i387_ia32_struct user_i387_struct # define user32_fxsr_struct user_fxsr_struct -# define ia32_setup_frame __setup_frame -# define ia32_setup_rt_frame __setup_rt_frame #endif extern void convert_from_fxsr(struct user_i387_ia32_struct *env, @@ -36,7 +29,7 @@ fpu__alloc_mathframe(unsigned long sp, int ia32_frame, unsigned long fpu__get_fpstate_size(void); -extern bool copy_fpstate_to_sigframe(void __user *buf, void __user *fp, int size); +extern bool copy_fpstate_to_sigframe(void __user *buf, void __user *fp, int size, u32 pkru); extern void fpu__clear_user_states(struct fpu *fpu); extern bool fpu__restore_sig(void __user *buf, int ia32_frame); diff --git a/arch/x86/include/asm/fpu/types.h b/arch/x86/include/asm/fpu/types.h index eb7cd1139d97..1c94121acd3d 100644 --- a/arch/x86/include/asm/fpu/types.h +++ b/arch/x86/include/asm/fpu/types.h @@ -2,8 +2,10 @@ /* * FPU data structures: */ -#ifndef _ASM_X86_FPU_H -#define _ASM_X86_FPU_H +#ifndef _ASM_X86_FPU_TYPES_H +#define _ASM_X86_FPU_TYPES_H + +#include <asm/page_types.h> /* * The legacy x87 FPU state format, as saved by FSAVE and @@ -115,14 +117,15 @@ enum xfeature { XFEATURE_PT_UNIMPLEMENTED_SO_FAR, XFEATURE_PKRU, XFEATURE_PASID, - XFEATURE_RSRVD_COMP_11, - XFEATURE_RSRVD_COMP_12, + XFEATURE_CET_USER, + 
XFEATURE_CET_KERNEL_UNUSED, XFEATURE_RSRVD_COMP_13, XFEATURE_RSRVD_COMP_14, XFEATURE_LBR, XFEATURE_RSRVD_COMP_16, XFEATURE_XTILE_CFG, XFEATURE_XTILE_DATA, + XFEATURE_APX, XFEATURE_MAX, }; @@ -138,9 +141,12 @@ enum xfeature { #define XFEATURE_MASK_PT (1 << XFEATURE_PT_UNIMPLEMENTED_SO_FAR) #define XFEATURE_MASK_PKRU (1 << XFEATURE_PKRU) #define XFEATURE_MASK_PASID (1 << XFEATURE_PASID) +#define XFEATURE_MASK_CET_USER (1 << XFEATURE_CET_USER) +#define XFEATURE_MASK_CET_KERNEL (1 << XFEATURE_CET_KERNEL_UNUSED) #define XFEATURE_MASK_LBR (1 << XFEATURE_LBR) #define XFEATURE_MASK_XTILE_CFG (1 << XFEATURE_XTILE_CFG) #define XFEATURE_MASK_XTILE_DATA (1 << XFEATURE_XTILE_DATA) +#define XFEATURE_MASK_APX (1 << XFEATURE_APX) #define XFEATURE_MASK_FPSSE (XFEATURE_MASK_FP | XFEATURE_MASK_SSE) #define XFEATURE_MASK_AVX512 (XFEATURE_MASK_OPMASK \ @@ -253,6 +259,16 @@ struct pkru_state { } __packed; /* + * State component 11 is Control-flow Enforcement user states + */ +struct cet_user_state { + /* user control-flow settings */ + u64 user_cet; + /* user shadow stack pointer */ + u64 user_ssp; +}; + +/* * State component 15: Architectural LBR configuration state. * The size of Arch LBR state depends on the number of LBRs (lbr_depth). */ @@ -290,6 +306,13 @@ struct xtile_data { } __packed; /* + * State component 19: 8B extended general purpose register. + */ +struct apx_state { + u64 egpr[16]; +} __packed; + +/* * State component 10 is supervisor state used for context-switching the * PASID state. */ @@ -321,7 +344,7 @@ struct xstate_header { struct xregs_state { struct fxregs_state i387; struct xstate_header header; - u8 extended_state_area[0]; + u8 extended_state_area[]; } __attribute__ ((packed, aligned (64))); /* @@ -393,9 +416,11 @@ struct fpu_state_perm { /* * @__state_perm: * - * This bitmap indicates the permission for state components, which - * are available to a thread group. The permission prctl() sets the - * enabled state bits in thread_group_leader()->thread.fpu. 
+ * This bitmap indicates the permission for state components + * available to a thread group, including both user and supervisor + * components and software-defined bits like FPU_GUEST_PERM_LOCKED. + * The permission prctl() sets the enabled state bits in + * thread_group_leader()->thread.fpu. * * All run time operations use the per thread information in the * currently active fpu.fpstate which contains the xfeature masks @@ -403,7 +428,7 @@ struct fpu_state_perm { * * This master permission field is only to be used when * task.fpu.fpstate based checks fail to validate whether the task - * is allowed to expand it's xfeatures set which requires to + * is allowed to expand its xfeatures set which requires to * allocate a larger sized fpstate buffer. * * Do not access this field directly. Use the provided helper @@ -511,13 +536,6 @@ struct fpu_guest { u64 xfeatures; /* - * @perm: xfeature bitmap of features which are - * permitted to be enabled for the guest - * vCPU. - */ - u64 perm; - - /* * @xfd_err: Save the guest value. */ u64 xfd_err; @@ -577,9 +595,16 @@ struct fpu_state_config { * even without XSAVE support, i.e. 
legacy features FP + SSE */ u64 legacy_features; + /* + * @independent_features: + * + * Features that are supported by XSAVES, but not managed as part of + * the FPU core, such as LBR + */ + u64 independent_features; }; /* FPU state configuration information */ extern struct fpu_state_config fpu_kernel_cfg, fpu_user_cfg; -#endif /* _ASM_X86_FPU_H */ +#endif /* _ASM_X86_FPU_TYPES_H */ diff --git a/arch/x86/include/asm/fpu/xcr.h b/arch/x86/include/asm/fpu/xcr.h index 9656a5bc6fea..9a710c060445 100644 --- a/arch/x86/include/asm/fpu/xcr.h +++ b/arch/x86/include/asm/fpu/xcr.h @@ -5,7 +5,7 @@ #define XCR_XFEATURE_ENABLED_MASK 0x00000000 #define XCR_XFEATURE_IN_USE_MASK 0x00000001 -static inline u64 xgetbv(u32 index) +static __always_inline u64 xgetbv(u32 index) { u32 eax, edx; @@ -27,7 +27,7 @@ static inline void xsetbv(u32 index, u64 value) * * Callers should check X86_FEATURE_XGETBV1. */ -static inline u64 xfeatures_in_use(void) +static __always_inline u64 xfeatures_in_use(void) { return xgetbv(XCR_XFEATURE_IN_USE_MASK); } diff --git a/arch/x86/include/asm/fpu/xstate.h b/arch/x86/include/asm/fpu/xstate.h index cd3dd170e23a..b308a76afbb7 100644 --- a/arch/x86/include/asm/fpu/xstate.h +++ b/arch/x86/include/asm/fpu/xstate.h @@ -12,10 +12,6 @@ /* Bit 63 of XCR0 is reserved for future expansion */ #define XFEATURE_MASK_EXTEND (~(XFEATURE_MASK_FPSSE | (1ULL << 63))) -#define XSTATE_CPUID 0x0000000d - -#define TILE_CPUID 0x0000001d - #define FXSAVE_SIZE 512 #define XSAVE_HDR_SIZE 64 @@ -36,7 +32,8 @@ XFEATURE_MASK_PKRU | \ XFEATURE_MASK_BNDREGS | \ XFEATURE_MASK_BNDCSR | \ - XFEATURE_MASK_XTILE) + XFEATURE_MASK_XTILE | \ + XFEATURE_MASK_APX) /* * Features which are restored when returning to user space. 
@@ -50,7 +47,8 @@ #define XFEATURE_MASK_USER_DYNAMIC XFEATURE_MASK_XTILE_DATA /* All currently supported supervisor features */ -#define XFEATURE_MASK_SUPERVISOR_SUPPORTED (XFEATURE_MASK_PASID) +#define XFEATURE_MASK_SUPERVISOR_SUPPORTED (XFEATURE_MASK_PASID | \ + XFEATURE_MASK_CET_USER) /* * A supervisor state component may not always contain valuable information, @@ -77,7 +75,8 @@ * Unsupported supervisor features. When a supervisor feature in this mask is * supported in the future, move it to the supported supervisor feature mask. */ -#define XFEATURE_MASK_SUPERVISOR_UNSUPPORTED (XFEATURE_MASK_PT) +#define XFEATURE_MASK_SUPERVISOR_UNSUPPORTED (XFEATURE_MASK_PT | \ + XFEATURE_MASK_CET_KERNEL) /* All supervisor states including supported and unsupported states. */ #define XFEATURE_MASK_SUPERVISOR_ALL (XFEATURE_MASK_SUPERVISOR_SUPPORTED | \ diff --git a/arch/x86/include/asm/frame.h b/arch/x86/include/asm/frame.h index fb42659f6e98..0ab65073c1cc 100644 --- a/arch/x86/include/asm/frame.h +++ b/arch/x86/include/asm/frame.h @@ -11,7 +11,7 @@ #ifdef CONFIG_FRAME_POINTER -#ifdef __ASSEMBLY__ +#ifdef __ASSEMBLER__ .macro FRAME_BEGIN push %_ASM_BP @@ -51,7 +51,7 @@ .endm #endif /* CONFIG_X86_64 */ -#else /* !__ASSEMBLY__ */ +#else /* !__ASSEMBLER__ */ #define FRAME_BEGIN \ "push %" _ASM_BP "\n" \ @@ -82,18 +82,18 @@ static inline unsigned long encode_frame_pointer(struct pt_regs *regs) #endif /* CONFIG_X86_64 */ -#endif /* __ASSEMBLY__ */ +#endif /* __ASSEMBLER__ */ #define FRAME_OFFSET __ASM_SEL(4, 8) #else /* !CONFIG_FRAME_POINTER */ -#ifdef __ASSEMBLY__ +#ifdef __ASSEMBLER__ .macro ENCODE_FRAME_POINTER ptregs_offset=0 .endm -#else /* !__ASSEMBLY */ +#else /* !__ASSEMBLER__ */ #define ENCODE_FRAME_POINTER diff --git a/arch/x86/include/asm/fred.h b/arch/x86/include/asm/fred.h new file mode 100644 index 000000000000..12b34d5b2953 --- /dev/null +++ b/arch/x86/include/asm/fred.h @@ -0,0 +1,119 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Macros for Flexible Return and 
Event Delivery (FRED) + */ + +#ifndef ASM_X86_FRED_H +#define ASM_X86_FRED_H + +#include <linux/const.h> + +#include <asm/asm.h> +#include <asm/msr.h> +#include <asm/trapnr.h> + +/* + * FRED event return instruction opcodes for ERET{S,U}; supported in + * binutils >= 2.41. + */ +#define ERETS _ASM_BYTES(0xf2,0x0f,0x01,0xca) +#define ERETU _ASM_BYTES(0xf3,0x0f,0x01,0xca) + +/* + * RSP is aligned to a 64-byte boundary before used to push a new stack frame + */ +#define FRED_STACK_FRAME_RSP_MASK _AT(unsigned long, (~0x3f)) + +/* + * Used for the return address for call emulation during code patching, + * and measured in 64-byte cache lines. + */ +#define FRED_CONFIG_REDZONE_AMOUNT 1 +#define FRED_CONFIG_REDZONE (_AT(unsigned long, FRED_CONFIG_REDZONE_AMOUNT) << 6) +#define FRED_CONFIG_INT_STKLVL(l) (_AT(unsigned long, l) << 9) +#define FRED_CONFIG_ENTRYPOINT(p) _AT(unsigned long, (p)) + +#ifndef __ASSEMBLER__ + +#ifdef CONFIG_X86_FRED +#include <linux/kernel.h> +#include <linux/sched/task_stack.h> + +#include <asm/ptrace.h> + +struct fred_info { + /* Event data: CR2, DR6, ... 
*/ + unsigned long edata; + unsigned long resv; +}; + +/* Full format of the FRED stack frame */ +struct fred_frame { + struct pt_regs regs; + struct fred_info info; +}; + +static __always_inline struct fred_info *fred_info(struct pt_regs *regs) +{ + return &container_of(regs, struct fred_frame, regs)->info; +} + +static __always_inline unsigned long fred_event_data(struct pt_regs *regs) +{ + return fred_info(regs)->edata; +} + +void asm_fred_entrypoint_user(void); +void asm_fred_entrypoint_kernel(void); +void asm_fred_entry_from_kvm(struct fred_ss); + +__visible void fred_entry_from_user(struct pt_regs *regs); +__visible void fred_entry_from_kernel(struct pt_regs *regs); +__visible void __fred_entry_from_kvm(struct pt_regs *regs); + +/* Can be called from noinstr code, thus __always_inline */ +static __always_inline void fred_entry_from_kvm(unsigned int type, unsigned int vector) +{ + struct fred_ss ss = { + .ss =__KERNEL_DS, + .type = type, + .vector = vector, + .nmi = type == EVENT_TYPE_NMI, + .lm = 1, + }; + + asm_fred_entry_from_kvm(ss); +} + +void cpu_init_fred_exceptions(void); +void cpu_init_fred_rsps(void); +void fred_complete_exception_setup(void); + +DECLARE_PER_CPU(unsigned long, fred_rsp0); + +static __always_inline void fred_sync_rsp0(unsigned long rsp0) +{ + __this_cpu_write(fred_rsp0, rsp0); +} + +static __always_inline void fred_update_rsp0(void) +{ + unsigned long rsp0 = (unsigned long) task_stack_page(current) + THREAD_SIZE; + + if (cpu_feature_enabled(X86_FEATURE_FRED) && (__this_cpu_read(fred_rsp0) != rsp0)) { + wrmsrns(MSR_IA32_FRED_RSP0, rsp0); + __this_cpu_write(fred_rsp0, rsp0); + } +} +#else /* CONFIG_X86_FRED */ +static __always_inline unsigned long fred_event_data(struct pt_regs *regs) { return 0; } +static inline void cpu_init_fred_exceptions(void) { } +static inline void cpu_init_fred_rsps(void) { } +static inline void fred_complete_exception_setup(void) { } +static inline void fred_entry_from_kvm(unsigned int type, unsigned int 
vector) { } +static inline void fred_sync_rsp0(unsigned long rsp0) { } +static inline void fred_update_rsp0(void) { } +#endif /* CONFIG_X86_FRED */ +#endif /* !__ASSEMBLER__ */ + +#endif /* ASM_X86_FRED_H */ diff --git a/arch/x86/include/asm/fsgsbase.h b/arch/x86/include/asm/fsgsbase.h index 35cff5f2becf..ab2547f97c2c 100644 --- a/arch/x86/include/asm/fsgsbase.h +++ b/arch/x86/include/asm/fsgsbase.h @@ -2,11 +2,11 @@ #ifndef _ASM_FSGSBASE_H #define _ASM_FSGSBASE_H -#ifndef __ASSEMBLY__ +#ifndef __ASSEMBLER__ #ifdef CONFIG_X86_64 -#include <asm/msr-index.h> +#include <asm/msr.h> /* * Read/write a task's FSBASE or GSBASE. This returns the value that @@ -60,7 +60,7 @@ static inline unsigned long x86_fsbase_read_cpu(void) if (boot_cpu_has(X86_FEATURE_FSGSBASE)) fsbase = rdfsbase(); else - rdmsrl(MSR_FS_BASE, fsbase); + rdmsrq(MSR_FS_BASE, fsbase); return fsbase; } @@ -70,7 +70,7 @@ static inline void x86_fsbase_write_cpu(unsigned long fsbase) if (boot_cpu_has(X86_FEATURE_FSGSBASE)) wrfsbase(fsbase); else - wrmsrl(MSR_FS_BASE, fsbase); + wrmsrq(MSR_FS_BASE, fsbase); } extern unsigned long x86_gsbase_read_cpu_inactive(void); @@ -80,6 +80,6 @@ extern unsigned long x86_fsgsbase_read_task(struct task_struct *task, #endif /* CONFIG_X86_64 */ -#endif /* __ASSEMBLY__ */ +#endif /* __ASSEMBLER__ */ #endif /* _ASM_FSGSBASE_H */ diff --git a/arch/x86/include/asm/ftrace.h b/arch/x86/include/asm/ftrace.h index b5ef474be858..93156ac4ffe0 100644 --- a/arch/x86/include/asm/ftrace.h +++ b/arch/x86/include/asm/ftrace.h @@ -2,6 +2,8 @@ #ifndef _ASM_X86_FTRACE_H #define _ASM_X86_FTRACE_H +#include <asm/ptrace.h> + #ifdef CONFIG_FUNCTION_TRACER #ifndef CC_USING_FENTRY # error Compiler does not support fentry? 
@@ -20,10 +22,7 @@ #define ARCH_SUPPORTS_FTRACE_OPS 1 #endif -#define HAVE_FUNCTION_GRAPH_RET_ADDR_PTR - -#ifndef __ASSEMBLY__ -extern atomic_t modifying_ftrace_code; +#ifndef __ASSEMBLER__ extern void __fentry__(void); static inline unsigned long ftrace_call_adjust(unsigned long addr) @@ -35,35 +34,44 @@ static inline unsigned long ftrace_call_adjust(unsigned long addr) return addr; } -/* - * When a ftrace registered caller is tracing a function that is - * also set by a register_ftrace_direct() call, it needs to be - * differentiated in the ftrace_caller trampoline. To do this, we - * place the direct caller in the ORIG_AX part of pt_regs. This - * tells the ftrace_caller that there's a direct caller. - */ -static inline void arch_ftrace_set_direct_caller(struct pt_regs *regs, unsigned long addr) +static inline unsigned long arch_ftrace_get_symaddr(unsigned long fentry_ip) { - /* Emulate a call */ - regs->orig_ax = addr; + if (is_endbr((void*)(fentry_ip - ENDBR_INSN_SIZE))) + fentry_ip -= ENDBR_INSN_SIZE; + + return fentry_ip; } +#define ftrace_get_symaddr(fentry_ip) arch_ftrace_get_symaddr(fentry_ip) #ifdef CONFIG_HAVE_DYNAMIC_FTRACE_WITH_ARGS -struct ftrace_regs { - struct pt_regs regs; -}; + +#include <linux/ftrace_regs.h> static __always_inline struct pt_regs * arch_ftrace_get_regs(struct ftrace_regs *fregs) { /* Only when FL_SAVE_REGS is set, cs will be non zero */ - if (!fregs->regs.cs) + if (!arch_ftrace_regs(fregs)->regs.cs) return NULL; - return &fregs->regs; + return &arch_ftrace_regs(fregs)->regs; } -#define ftrace_instruction_pointer_set(fregs, _ip) \ - do { (fregs)->regs.ip = (_ip); } while (0) +#define arch_ftrace_fill_perf_regs(fregs, _regs) do { \ + (_regs)->ip = arch_ftrace_regs(fregs)->regs.ip; \ + (_regs)->sp = arch_ftrace_regs(fregs)->regs.sp; \ + (_regs)->cs = __KERNEL_CS; \ + (_regs)->flags = 0; \ + } while (0) + +#define ftrace_regs_set_instruction_pointer(fregs, _ip) \ + do { arch_ftrace_regs(fregs)->regs.ip = (_ip); } while (0) + + 
+static __always_inline unsigned long +ftrace_regs_get_return_address(struct ftrace_regs *fregs) +{ + return *(unsigned long *)ftrace_regs_get_stack_pointer(fregs); +} struct ftrace_ops; #define ftrace_graph_func ftrace_graph_func @@ -73,6 +81,24 @@ void ftrace_graph_func(unsigned long ip, unsigned long parent_ip, #define FTRACE_GRAPH_TRAMP_ADDR FTRACE_GRAPH_ADDR #endif +#ifdef CONFIG_DYNAMIC_FTRACE_WITH_DIRECT_CALLS +/* + * When a ftrace registered caller is tracing a function that is + * also set by a register_ftrace_direct() call, it needs to be + * differentiated in the ftrace_caller trampoline. To do this, we + * place the direct caller in the ORIG_AX part of pt_regs. This + * tells the ftrace_caller that there's a direct caller. + */ +static inline void +__arch_ftrace_set_direct_caller(struct pt_regs *regs, unsigned long addr) +{ + /* Emulate a call */ + regs->orig_ax = addr; +} +#define arch_ftrace_set_direct_caller(fregs, addr) \ + __arch_ftrace_set_direct_caller(&arch_ftrace_regs(fregs)->regs, addr) +#endif /* CONFIG_DYNAMIC_FTRACE_WITH_DIRECT_CALLS */ + #ifdef CONFIG_DYNAMIC_FTRACE struct dyn_arch_ftrace { @@ -80,11 +106,14 @@ struct dyn_arch_ftrace { }; #endif /* CONFIG_DYNAMIC_FTRACE */ -#endif /* __ASSEMBLY__ */ +#endif /* __ASSEMBLER__ */ #endif /* CONFIG_FUNCTION_TRACER */ -#ifndef __ASSEMBLY__ +#ifndef __ASSEMBLER__ + +void prepare_ftrace_return(unsigned long ip, unsigned long *parent, + unsigned long frame_pointer); #if defined(CONFIG_FUNCTION_TRACER) && defined(CONFIG_DYNAMIC_FTRACE) extern void set_ftrace_ops_ro(void); @@ -125,6 +154,6 @@ static inline bool arch_trace_is_compat_syscall(struct pt_regs *regs) } #endif /* CONFIG_FTRACE_SYSCALLS && CONFIG_IA32_EMULATION */ #endif /* !COMPILE_OFFSETS */ -#endif /* !__ASSEMBLY__ */ +#endif /* !__ASSEMBLER__ */ #endif /* _ASM_X86_FTRACE_H */ diff --git a/arch/x86/include/asm/futex.h b/arch/x86/include/asm/futex.h index 99d345b686fa..6e2458088800 100644 --- a/arch/x86/include/asm/futex.h +++ 
b/arch/x86/include/asm/futex.h @@ -48,7 +48,9 @@ do { \ static __always_inline int arch_futex_atomic_op_inuser(int op, int oparg, int *oval, u32 __user *uaddr) { - if (!user_access_begin(uaddr, sizeof(u32))) + if (can_do_masked_user_access()) + uaddr = masked_user_access_begin(uaddr); + else if (!user_access_begin(uaddr, sizeof(u32))) return -EFAULT; switch (op) { @@ -84,7 +86,9 @@ static inline int futex_atomic_cmpxchg_inatomic(u32 *uval, u32 __user *uaddr, { int ret = 0; - if (!user_access_begin(uaddr, sizeof(u32))) + if (can_do_masked_user_access()) + uaddr = masked_user_access_begin(uaddr); + else if (!user_access_begin(uaddr, sizeof(u32))) return -EFAULT; asm volatile("\n" "1:\t" LOCK_PREFIX "cmpxchgl %3, %2\n" diff --git a/arch/x86/include/asm/gsseg.h b/arch/x86/include/asm/gsseg.h new file mode 100644 index 000000000000..ab6a595cea70 --- /dev/null +++ b/arch/x86/include/asm/gsseg.h @@ -0,0 +1,66 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +#ifndef _ASM_X86_GSSEG_H +#define _ASM_X86_GSSEG_H + +#include <linux/types.h> + +#include <asm/asm.h> +#include <asm/cpufeature.h> +#include <asm/alternative.h> +#include <asm/processor.h> +#include <asm/nops.h> + +#ifdef CONFIG_X86_64 + +extern asmlinkage void asm_load_gs_index(u16 selector); + +/* Replace with "lkgs %di" once binutils support LKGS instruction */ +#define LKGS_DI _ASM_BYTES(0xf2,0x0f,0x00,0xf7) + +static inline void native_lkgs(unsigned int selector) +{ + u16 sel = selector; + asm_inline volatile("1: " LKGS_DI + _ASM_EXTABLE_TYPE_REG(1b, 1b, EX_TYPE_ZERO_REG, %k[sel]) + : [sel] "+D" (sel)); +} + +static inline void native_load_gs_index(unsigned int selector) +{ + if (cpu_feature_enabled(X86_FEATURE_LKGS)) { + native_lkgs(selector); + } else { + unsigned long flags; + + local_irq_save(flags); + asm_load_gs_index(selector); + local_irq_restore(flags); + } +} + +#endif /* CONFIG_X86_64 */ + +static inline void __init lkgs_init(void) +{ +#ifdef CONFIG_PARAVIRT_XXL +#ifdef CONFIG_X86_64 + if 
(cpu_feature_enabled(X86_FEATURE_LKGS)) + pv_ops.cpu.load_gs_index = native_lkgs; +#endif +#endif +} + +#ifndef CONFIG_PARAVIRT_XXL + +static inline void load_gs_index(unsigned int selector) +{ +#ifdef CONFIG_X86_64 + native_load_gs_index(selector); +#else + loadsegment(gs, selector); +#endif +} + +#endif /* CONFIG_PARAVIRT_XXL */ + +#endif /* _ASM_X86_GSSEG_H */ diff --git a/arch/x86/include/asm/hardirq.h b/arch/x86/include/asm/hardirq.h index 275e7fd20310..f00c09ffe6a9 100644 --- a/arch/x86/include/asm/hardirq.h +++ b/arch/x86/include/asm/hardirq.h @@ -5,7 +5,6 @@ #include <linux/threads.h> typedef struct { - u16 __softirq_pending; #if IS_ENABLED(CONFIG_KVM_INTEL) u8 kvm_cpu_l1tf_flush_l1d; #endif @@ -15,7 +14,7 @@ typedef struct { unsigned int irq_spurious_count; unsigned int icr_read_retry_count; #endif -#ifdef CONFIG_HAVE_KVM +#if IS_ENABLED(CONFIG_KVM) unsigned int kvm_posted_intr_ipis; unsigned int kvm_posted_intr_wakeup_ipis; unsigned int kvm_posted_intr_nested_ipis; @@ -44,10 +43,16 @@ typedef struct { unsigned int irq_hv_reenlightenment_count; unsigned int hyperv_stimer0_count; #endif +#ifdef CONFIG_X86_POSTED_MSI + unsigned int posted_msi_notification_count; +#endif } ____cacheline_aligned irq_cpustat_t; DECLARE_PER_CPU_SHARED_ALIGNED(irq_cpustat_t, irq_stat); +#ifdef CONFIG_X86_POSTED_MSI +DECLARE_PER_CPU_ALIGNED(struct pi_desc, posted_msi_pi_desc); +#endif #define __ARCH_IRQ_STAT #define inc_irq_stat(member) this_cpu_inc(irq_stat.member) @@ -60,9 +65,15 @@ extern u64 arch_irq_stat_cpu(unsigned int cpu); extern u64 arch_irq_stat(void); #define arch_irq_stat arch_irq_stat +DECLARE_PER_CPU_CACHE_HOT(u16, __softirq_pending); +#define local_softirq_pending_ref __softirq_pending #if IS_ENABLED(CONFIG_KVM_INTEL) -static inline void kvm_set_cpu_l1tf_flush_l1d(void) +/* + * This function is called from noinstr interrupt contexts + * and must be inlined to not get instrumentation. 
+ */ +static __always_inline void kvm_set_cpu_l1tf_flush_l1d(void) { __this_cpu_write(irq_stat.kvm_cpu_l1tf_flush_l1d, 1); } @@ -77,7 +88,7 @@ static __always_inline bool kvm_get_cpu_l1tf_flush_l1d(void) return __this_cpu_read(irq_stat.kvm_cpu_l1tf_flush_l1d); } #else /* !IS_ENABLED(CONFIG_KVM_INTEL) */ -static inline void kvm_set_cpu_l1tf_flush_l1d(void) { } +static __always_inline void kvm_set_cpu_l1tf_flush_l1d(void) { } #endif /* IS_ENABLED(CONFIG_KVM_INTEL) */ #endif /* _ASM_X86_HARDIRQ_H */ diff --git a/arch/x86/include/asm/highmem.h b/arch/x86/include/asm/highmem.h index 731ee7cc40a5..585bdadba47d 100644 --- a/arch/x86/include/asm/highmem.h +++ b/arch/x86/include/asm/highmem.h @@ -69,9 +69,6 @@ extern unsigned long highstart_pfn, highend_pfn; arch_flush_lazy_mmu_mode(); \ } while (0) -extern void add_highpages_with_active_regions(int nid, unsigned long start_pfn, - unsigned long end_pfn); - #endif /* __KERNEL__ */ #endif /* _ASM_X86_HIGHMEM_H */ diff --git a/arch/x86/include/asm/hpet.h b/arch/x86/include/asm/hpet.h index ab9f3dd87c80..ab0c78855ecb 100644 --- a/arch/x86/include/asm/hpet.h +++ b/arch/x86/include/asm/hpet.h @@ -84,7 +84,6 @@ extern int hpet_set_rtc_irq_bit(unsigned long bit_mask); extern int hpet_set_alarm_time(unsigned char hrs, unsigned char min, unsigned char sec); extern int hpet_set_periodic_freq(unsigned long freq); -extern int hpet_rtc_dropped_irq(void); extern int hpet_rtc_timer_init(void); extern irqreturn_t hpet_rtc_interrupt(int irq, void *dev_id); extern int hpet_register_irq_handler(rtc_irq_handler handler); diff --git a/arch/x86/include/asm/hw_breakpoint.h b/arch/x86/include/asm/hw_breakpoint.h index a1f0e90d0818..0bc931cd0698 100644 --- a/arch/x86/include/asm/hw_breakpoint.h +++ b/arch/x86/include/asm/hw_breakpoint.h @@ -44,10 +44,7 @@ struct arch_hw_breakpoint { /* Total number of available HW breakpoint registers */ #define HBP_NUM 4 -static inline int hw_breakpoint_slots(int type) -{ - return HBP_NUM; -} +#define 
hw_breakpoint_slots(type) (HBP_NUM) struct perf_event_attr; struct perf_event; diff --git a/arch/x86/include/asm/hw_irq.h b/arch/x86/include/asm/hw_irq.h index d465ece58151..162ebd73a698 100644 --- a/arch/x86/include/asm/hw_irq.h +++ b/arch/x86/include/asm/hw_irq.h @@ -16,9 +16,7 @@ #include <asm/irq_vectors.h> -#define IRQ_MATRIX_BITS NR_VECTORS - -#ifndef __ASSEMBLY__ +#ifndef __ASSEMBLER__ #include <linux/percpu.h> #include <linux/profile.h> @@ -28,7 +26,7 @@ #include <asm/irq.h> #include <asm/sections.h> -#ifdef CONFIG_X86_LOCAL_APIC +#ifdef CONFIG_IRQ_DOMAIN_HIERARCHY struct irq_data; struct pci_dev; struct msi_desc; @@ -97,18 +95,18 @@ extern struct irq_cfg *irqd_cfg(struct irq_data *irq_data); extern void lock_vector_lock(void); extern void unlock_vector_lock(void); #ifdef CONFIG_SMP -extern void send_cleanup_vector(struct irq_cfg *); +extern void vector_schedule_cleanup(struct irq_cfg *); extern void irq_complete_move(struct irq_cfg *cfg); #else -static inline void send_cleanup_vector(struct irq_cfg *c) { } +static inline void vector_schedule_cleanup(struct irq_cfg *c) { } static inline void irq_complete_move(struct irq_cfg *c) { } #endif extern void apic_ack_edge(struct irq_data *data); -#else /* CONFIG_X86_LOCAL_APIC */ +#else /* CONFIG_IRQ_DOMAIN_HIERARCHY */ static inline void lock_vector_lock(void) {} static inline void unlock_vector_lock(void) {} -#endif /* CONFIG_X86_LOCAL_APIC */ +#endif /* CONFIG_IRQ_DOMAIN_HIERARCHY */ /* Statistics */ extern atomic_t irq_err_count; @@ -130,6 +128,6 @@ extern char spurious_entries_start[]; typedef struct irq_desc* vector_irq_t[NR_VECTORS]; DECLARE_PER_CPU(vector_irq_t, vector_irq); -#endif /* !ASSEMBLY_ */ +#endif /* !__ASSEMBLER__ */ #endif /* _ASM_X86_HW_IRQ_H */ diff --git a/arch/x86/include/asm/hyperv-tlfs.h b/arch/x86/include/asm/hyperv-tlfs.h deleted file mode 100644 index 0a9407dc0859..000000000000 --- a/arch/x86/include/asm/hyperv-tlfs.h +++ /dev/null @@ -1,640 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 
*/ - -/* - * This file contains definitions from Hyper-V Hypervisor Top-Level Functional - * Specification (TLFS): - * https://docs.microsoft.com/en-us/virtualization/hyper-v-on-windows/reference/tlfs - */ - -#ifndef _ASM_X86_HYPERV_TLFS_H -#define _ASM_X86_HYPERV_TLFS_H - -#include <linux/types.h> -#include <asm/page.h> -/* - * The below CPUID leaves are present if VersionAndFeatures.HypervisorPresent - * is set by CPUID(HvCpuIdFunctionVersionAndFeatures). - */ -#define HYPERV_CPUID_VENDOR_AND_MAX_FUNCTIONS 0x40000000 -#define HYPERV_CPUID_INTERFACE 0x40000001 -#define HYPERV_CPUID_VERSION 0x40000002 -#define HYPERV_CPUID_FEATURES 0x40000003 -#define HYPERV_CPUID_ENLIGHTMENT_INFO 0x40000004 -#define HYPERV_CPUID_IMPLEMENT_LIMITS 0x40000005 -#define HYPERV_CPUID_CPU_MANAGEMENT_FEATURES 0x40000007 -#define HYPERV_CPUID_NESTED_FEATURES 0x4000000A -#define HYPERV_CPUID_ISOLATION_CONFIG 0x4000000C - -#define HYPERV_CPUID_VIRT_STACK_INTERFACE 0x40000081 -#define HYPERV_VS_INTERFACE_EAX_SIGNATURE 0x31235356 /* "VS#1" */ - -#define HYPERV_CPUID_VIRT_STACK_PROPERTIES 0x40000082 -/* Support for the extended IOAPIC RTE format */ -#define HYPERV_VS_PROPERTIES_EAX_EXTENDED_IOAPIC_RTE BIT(2) - -#define HYPERV_HYPERVISOR_PRESENT_BIT 0x80000000 -#define HYPERV_CPUID_MIN 0x40000005 -#define HYPERV_CPUID_MAX 0x4000ffff - -/* - * Group D Features. The bit assignments are custom to each architecture. - * On x86/x64 these are HYPERV_CPUID_FEATURES.EDX bits. 
- */ -/* The MWAIT instruction is available (per section MONITOR / MWAIT) */ -#define HV_X64_MWAIT_AVAILABLE BIT(0) -/* Guest debugging support is available */ -#define HV_X64_GUEST_DEBUGGING_AVAILABLE BIT(1) -/* Performance Monitor support is available*/ -#define HV_X64_PERF_MONITOR_AVAILABLE BIT(2) -/* Support for physical CPU dynamic partitioning events is available*/ -#define HV_X64_CPU_DYNAMIC_PARTITIONING_AVAILABLE BIT(3) -/* - * Support for passing hypercall input parameter block via XMM - * registers is available - */ -#define HV_X64_HYPERCALL_XMM_INPUT_AVAILABLE BIT(4) -/* Support for a virtual guest idle state is available */ -#define HV_X64_GUEST_IDLE_STATE_AVAILABLE BIT(5) -/* Frequency MSRs available */ -#define HV_FEATURE_FREQUENCY_MSRS_AVAILABLE BIT(8) -/* Crash MSR available */ -#define HV_FEATURE_GUEST_CRASH_MSR_AVAILABLE BIT(10) -/* Support for debug MSRs available */ -#define HV_FEATURE_DEBUG_MSRS_AVAILABLE BIT(11) -/* - * Support for returning hypercall output block via XMM - * registers is available - */ -#define HV_X64_HYPERCALL_XMM_OUTPUT_AVAILABLE BIT(15) -/* stimer Direct Mode is available */ -#define HV_STIMER_DIRECT_MODE_AVAILABLE BIT(19) - -/* - * Implementation recommendations. Indicates which behaviors the hypervisor - * recommends the OS implement for optimal performance. - * These are HYPERV_CPUID_ENLIGHTMENT_INFO.EAX bits. 
- */ -/* - * Recommend using hypercall for address space switches rather - * than MOV to CR3 instruction - */ -#define HV_X64_AS_SWITCH_RECOMMENDED BIT(0) -/* Recommend using hypercall for local TLB flushes rather - * than INVLPG or MOV to CR3 instructions */ -#define HV_X64_LOCAL_TLB_FLUSH_RECOMMENDED BIT(1) -/* - * Recommend using hypercall for remote TLB flushes rather - * than inter-processor interrupts - */ -#define HV_X64_REMOTE_TLB_FLUSH_RECOMMENDED BIT(2) -/* - * Recommend using MSRs for accessing APIC registers - * EOI, ICR and TPR rather than their memory-mapped counterparts - */ -#define HV_X64_APIC_ACCESS_RECOMMENDED BIT(3) -/* Recommend using the hypervisor-provided MSR to initiate a system RESET */ -#define HV_X64_SYSTEM_RESET_RECOMMENDED BIT(4) -/* - * Recommend using relaxed timing for this partition. If used, - * the VM should disable any watchdog timeouts that rely on the - * timely delivery of external interrupts - */ -#define HV_X64_RELAXED_TIMING_RECOMMENDED BIT(5) - -/* - * Recommend not using Auto End-Of-Interrupt feature - */ -#define HV_DEPRECATING_AEOI_RECOMMENDED BIT(9) - -/* - * Recommend using cluster IPI hypercalls. - */ -#define HV_X64_CLUSTER_IPI_RECOMMENDED BIT(10) - -/* Recommend using the newer ExProcessorMasks interface */ -#define HV_X64_EX_PROCESSOR_MASKS_RECOMMENDED BIT(11) - -/* Recommend using enlightened VMCS */ -#define HV_X64_ENLIGHTENED_VMCS_RECOMMENDED BIT(14) - -/* - * CPU management features identification. - * These are HYPERV_CPUID_CPU_MANAGEMENT_FEATURES.EAX bits. - */ -#define HV_X64_START_LOGICAL_PROCESSOR BIT(0) -#define HV_X64_CREATE_ROOT_VIRTUAL_PROCESSOR BIT(1) -#define HV_X64_PERFORMANCE_COUNTER_SYNC BIT(2) -#define HV_X64_RESERVED_IDENTITY_BIT BIT(31) - -/* - * Virtual processor will never share a physical core with another virtual - * processor, except for virtual processors that are reported as sibling SMT - * threads. - */ -#define HV_X64_NO_NONARCH_CORESHARING BIT(18) - -/* Nested features. 
These are HYPERV_CPUID_NESTED_FEATURES.EAX bits. */ -#define HV_X64_NESTED_DIRECT_FLUSH BIT(17) -#define HV_X64_NESTED_GUEST_MAPPING_FLUSH BIT(18) -#define HV_X64_NESTED_MSR_BITMAP BIT(19) - -/* - * This is specific to AMD and specifies that enlightened TLB flush is - * supported. If guest opts in to this feature, ASID invalidations only - * flushes gva -> hpa mapping entries. To flush the TLB entries derived - * from NPT, hypercalls should be used (HvFlushGuestPhysicalAddressSpace - * or HvFlushGuestPhysicalAddressList). - */ -#define HV_X64_NESTED_ENLIGHTENED_TLB BIT(22) - -/* HYPERV_CPUID_ISOLATION_CONFIG.EAX bits. */ -#define HV_PARAVISOR_PRESENT BIT(0) - -/* HYPERV_CPUID_ISOLATION_CONFIG.EBX bits. */ -#define HV_ISOLATION_TYPE GENMASK(3, 0) -#define HV_SHARED_GPA_BOUNDARY_ACTIVE BIT(5) -#define HV_SHARED_GPA_BOUNDARY_BITS GENMASK(11, 6) - -enum hv_isolation_type { - HV_ISOLATION_TYPE_NONE = 0, - HV_ISOLATION_TYPE_VBS = 1, - HV_ISOLATION_TYPE_SNP = 2 -}; - -/* Hyper-V specific model specific registers (MSRs) */ - -/* MSR used to identify the guest OS. */ -#define HV_X64_MSR_GUEST_OS_ID 0x40000000 - -/* MSR used to setup pages used to communicate with the hypervisor. */ -#define HV_X64_MSR_HYPERCALL 0x40000001 - -/* MSR used to provide vcpu index */ -#define HV_REGISTER_VP_INDEX 0x40000002 - -/* MSR used to reset the guest OS. 
*/ -#define HV_X64_MSR_RESET 0x40000003 - -/* MSR used to provide vcpu runtime in 100ns units */ -#define HV_X64_MSR_VP_RUNTIME 0x40000010 - -/* MSR used to read the per-partition time reference counter */ -#define HV_REGISTER_TIME_REF_COUNT 0x40000020 - -/* A partition's reference time stamp counter (TSC) page */ -#define HV_REGISTER_REFERENCE_TSC 0x40000021 - -/* MSR used to retrieve the TSC frequency */ -#define HV_X64_MSR_TSC_FREQUENCY 0x40000022 - -/* MSR used to retrieve the local APIC timer frequency */ -#define HV_X64_MSR_APIC_FREQUENCY 0x40000023 - -/* Define the virtual APIC registers */ -#define HV_X64_MSR_EOI 0x40000070 -#define HV_X64_MSR_ICR 0x40000071 -#define HV_X64_MSR_TPR 0x40000072 -#define HV_X64_MSR_VP_ASSIST_PAGE 0x40000073 - -/* Define synthetic interrupt controller model specific registers. */ -#define HV_REGISTER_SCONTROL 0x40000080 -#define HV_REGISTER_SVERSION 0x40000081 -#define HV_REGISTER_SIEFP 0x40000082 -#define HV_REGISTER_SIMP 0x40000083 -#define HV_REGISTER_EOM 0x40000084 -#define HV_REGISTER_SINT0 0x40000090 -#define HV_REGISTER_SINT1 0x40000091 -#define HV_REGISTER_SINT2 0x40000092 -#define HV_REGISTER_SINT3 0x40000093 -#define HV_REGISTER_SINT4 0x40000094 -#define HV_REGISTER_SINT5 0x40000095 -#define HV_REGISTER_SINT6 0x40000096 -#define HV_REGISTER_SINT7 0x40000097 -#define HV_REGISTER_SINT8 0x40000098 -#define HV_REGISTER_SINT9 0x40000099 -#define HV_REGISTER_SINT10 0x4000009A -#define HV_REGISTER_SINT11 0x4000009B -#define HV_REGISTER_SINT12 0x4000009C -#define HV_REGISTER_SINT13 0x4000009D -#define HV_REGISTER_SINT14 0x4000009E -#define HV_REGISTER_SINT15 0x4000009F - -/* - * Synthetic Timer MSRs. Four timers per vcpu. 
- */ -#define HV_REGISTER_STIMER0_CONFIG 0x400000B0 -#define HV_REGISTER_STIMER0_COUNT 0x400000B1 -#define HV_REGISTER_STIMER1_CONFIG 0x400000B2 -#define HV_REGISTER_STIMER1_COUNT 0x400000B3 -#define HV_REGISTER_STIMER2_CONFIG 0x400000B4 -#define HV_REGISTER_STIMER2_COUNT 0x400000B5 -#define HV_REGISTER_STIMER3_CONFIG 0x400000B6 -#define HV_REGISTER_STIMER3_COUNT 0x400000B7 - -/* Hyper-V guest idle MSR */ -#define HV_X64_MSR_GUEST_IDLE 0x400000F0 - -/* Hyper-V guest crash notification MSR's */ -#define HV_REGISTER_CRASH_P0 0x40000100 -#define HV_REGISTER_CRASH_P1 0x40000101 -#define HV_REGISTER_CRASH_P2 0x40000102 -#define HV_REGISTER_CRASH_P3 0x40000103 -#define HV_REGISTER_CRASH_P4 0x40000104 -#define HV_REGISTER_CRASH_CTL 0x40000105 - -/* TSC emulation after migration */ -#define HV_X64_MSR_REENLIGHTENMENT_CONTROL 0x40000106 -#define HV_X64_MSR_TSC_EMULATION_CONTROL 0x40000107 -#define HV_X64_MSR_TSC_EMULATION_STATUS 0x40000108 - -/* TSC invariant control */ -#define HV_X64_MSR_TSC_INVARIANT_CONTROL 0x40000118 - -/* Register name aliases for temporary compatibility */ -#define HV_X64_MSR_STIMER0_COUNT HV_REGISTER_STIMER0_COUNT -#define HV_X64_MSR_STIMER0_CONFIG HV_REGISTER_STIMER0_CONFIG -#define HV_X64_MSR_STIMER1_COUNT HV_REGISTER_STIMER1_COUNT -#define HV_X64_MSR_STIMER1_CONFIG HV_REGISTER_STIMER1_CONFIG -#define HV_X64_MSR_STIMER2_COUNT HV_REGISTER_STIMER2_COUNT -#define HV_X64_MSR_STIMER2_CONFIG HV_REGISTER_STIMER2_CONFIG -#define HV_X64_MSR_STIMER3_COUNT HV_REGISTER_STIMER3_COUNT -#define HV_X64_MSR_STIMER3_CONFIG HV_REGISTER_STIMER3_CONFIG -#define HV_X64_MSR_SCONTROL HV_REGISTER_SCONTROL -#define HV_X64_MSR_SVERSION HV_REGISTER_SVERSION -#define HV_X64_MSR_SIMP HV_REGISTER_SIMP -#define HV_X64_MSR_SIEFP HV_REGISTER_SIEFP -#define HV_X64_MSR_VP_INDEX HV_REGISTER_VP_INDEX -#define HV_X64_MSR_EOM HV_REGISTER_EOM -#define HV_X64_MSR_SINT0 HV_REGISTER_SINT0 -#define HV_X64_MSR_SINT15 HV_REGISTER_SINT15 -#define HV_X64_MSR_CRASH_P0 HV_REGISTER_CRASH_P0 
-#define HV_X64_MSR_CRASH_P1 HV_REGISTER_CRASH_P1 -#define HV_X64_MSR_CRASH_P2 HV_REGISTER_CRASH_P2 -#define HV_X64_MSR_CRASH_P3 HV_REGISTER_CRASH_P3 -#define HV_X64_MSR_CRASH_P4 HV_REGISTER_CRASH_P4 -#define HV_X64_MSR_CRASH_CTL HV_REGISTER_CRASH_CTL -#define HV_X64_MSR_TIME_REF_COUNT HV_REGISTER_TIME_REF_COUNT -#define HV_X64_MSR_REFERENCE_TSC HV_REGISTER_REFERENCE_TSC - -/* Hyper-V memory host visibility */ -enum hv_mem_host_visibility { - VMBUS_PAGE_NOT_VISIBLE = 0, - VMBUS_PAGE_VISIBLE_READ_ONLY = 1, - VMBUS_PAGE_VISIBLE_READ_WRITE = 3 -}; - -/* HvCallModifySparseGpaPageHostVisibility hypercall */ -#define HV_MAX_MODIFY_GPA_REP_COUNT ((PAGE_SIZE / sizeof(u64)) - 2) -struct hv_gpa_range_for_visibility { - u64 partition_id; - u32 host_visibility:2; - u32 reserved0:30; - u32 reserved1; - u64 gpa_page_list[HV_MAX_MODIFY_GPA_REP_COUNT]; -} __packed; - -/* - * Declare the MSR used to setup pages used to communicate with the hypervisor. - */ -union hv_x64_msr_hypercall_contents { - u64 as_uint64; - struct { - u64 enable:1; - u64 reserved:11; - u64 guest_physical_address:52; - } __packed; -}; - -union hv_vp_assist_msr_contents { - u64 as_uint64; - struct { - u64 enable:1; - u64 reserved:11; - u64 pfn:52; - } __packed; -}; - -struct hv_reenlightenment_control { - __u64 vector:8; - __u64 reserved1:8; - __u64 enabled:1; - __u64 reserved2:15; - __u64 target_vp:32; -} __packed; - -struct hv_tsc_emulation_control { - __u64 enabled:1; - __u64 reserved:63; -} __packed; - -struct hv_tsc_emulation_status { - __u64 inprogress:1; - __u64 reserved:63; -} __packed; - -#define HV_X64_MSR_HYPERCALL_ENABLE 0x00000001 -#define HV_X64_MSR_HYPERCALL_PAGE_ADDRESS_SHIFT 12 -#define HV_X64_MSR_HYPERCALL_PAGE_ADDRESS_MASK \ - (~((1ull << HV_X64_MSR_HYPERCALL_PAGE_ADDRESS_SHIFT) - 1)) - -#define HV_X64_MSR_CRASH_PARAMS \ - (1 + (HV_X64_MSR_CRASH_P4 - HV_X64_MSR_CRASH_P0)) - -#define HV_IPI_LOW_VECTOR 0x10 -#define HV_IPI_HIGH_VECTOR 0xff - -#define HV_X64_MSR_VP_ASSIST_PAGE_ENABLE 0x00000001 
-#define HV_X64_MSR_VP_ASSIST_PAGE_ADDRESS_SHIFT 12 -#define HV_X64_MSR_VP_ASSIST_PAGE_ADDRESS_MASK \ - (~((1ull << HV_X64_MSR_VP_ASSIST_PAGE_ADDRESS_SHIFT) - 1)) - -/* Hyper-V Enlightened VMCS version mask in nested features CPUID */ -#define HV_X64_ENLIGHTENED_VMCS_VERSION 0xff - -#define HV_X64_MSR_TSC_REFERENCE_ENABLE 0x00000001 -#define HV_X64_MSR_TSC_REFERENCE_ADDRESS_SHIFT 12 - -/* Number of XMM registers used in hypercall input/output */ -#define HV_HYPERCALL_MAX_XMM_REGISTERS 6 - -struct hv_nested_enlightenments_control { - struct { - __u32 directhypercall:1; - __u32 reserved:31; - } features; - struct { - __u32 reserved; - } hypercallControls; -} __packed; - -/* Define virtual processor assist page structure. */ -struct hv_vp_assist_page { - __u32 apic_assist; - __u32 reserved1; - __u64 vtl_control[3]; - struct hv_nested_enlightenments_control nested_control; - __u8 enlighten_vmentry; - __u8 reserved2[7]; - __u64 current_nested_vmcs; -} __packed; - -struct hv_enlightened_vmcs { - u32 revision_id; - u32 abort; - - u16 host_es_selector; - u16 host_cs_selector; - u16 host_ss_selector; - u16 host_ds_selector; - u16 host_fs_selector; - u16 host_gs_selector; - u16 host_tr_selector; - - u16 padding16_1; - - u64 host_ia32_pat; - u64 host_ia32_efer; - - u64 host_cr0; - u64 host_cr3; - u64 host_cr4; - - u64 host_ia32_sysenter_esp; - u64 host_ia32_sysenter_eip; - u64 host_rip; - u32 host_ia32_sysenter_cs; - - u32 pin_based_vm_exec_control; - u32 vm_exit_controls; - u32 secondary_vm_exec_control; - - u64 io_bitmap_a; - u64 io_bitmap_b; - u64 msr_bitmap; - - u16 guest_es_selector; - u16 guest_cs_selector; - u16 guest_ss_selector; - u16 guest_ds_selector; - u16 guest_fs_selector; - u16 guest_gs_selector; - u16 guest_ldtr_selector; - u16 guest_tr_selector; - - u32 guest_es_limit; - u32 guest_cs_limit; - u32 guest_ss_limit; - u32 guest_ds_limit; - u32 guest_fs_limit; - u32 guest_gs_limit; - u32 guest_ldtr_limit; - u32 guest_tr_limit; - u32 guest_gdtr_limit; - u32 
guest_idtr_limit; - - u32 guest_es_ar_bytes; - u32 guest_cs_ar_bytes; - u32 guest_ss_ar_bytes; - u32 guest_ds_ar_bytes; - u32 guest_fs_ar_bytes; - u32 guest_gs_ar_bytes; - u32 guest_ldtr_ar_bytes; - u32 guest_tr_ar_bytes; - - u64 guest_es_base; - u64 guest_cs_base; - u64 guest_ss_base; - u64 guest_ds_base; - u64 guest_fs_base; - u64 guest_gs_base; - u64 guest_ldtr_base; - u64 guest_tr_base; - u64 guest_gdtr_base; - u64 guest_idtr_base; - - u64 padding64_1[3]; - - u64 vm_exit_msr_store_addr; - u64 vm_exit_msr_load_addr; - u64 vm_entry_msr_load_addr; - - u64 cr3_target_value0; - u64 cr3_target_value1; - u64 cr3_target_value2; - u64 cr3_target_value3; - - u32 page_fault_error_code_mask; - u32 page_fault_error_code_match; - - u32 cr3_target_count; - u32 vm_exit_msr_store_count; - u32 vm_exit_msr_load_count; - u32 vm_entry_msr_load_count; - - u64 tsc_offset; - u64 virtual_apic_page_addr; - u64 vmcs_link_pointer; - - u64 guest_ia32_debugctl; - u64 guest_ia32_pat; - u64 guest_ia32_efer; - - u64 guest_pdptr0; - u64 guest_pdptr1; - u64 guest_pdptr2; - u64 guest_pdptr3; - - u64 guest_pending_dbg_exceptions; - u64 guest_sysenter_esp; - u64 guest_sysenter_eip; - - u32 guest_activity_state; - u32 guest_sysenter_cs; - - u64 cr0_guest_host_mask; - u64 cr4_guest_host_mask; - u64 cr0_read_shadow; - u64 cr4_read_shadow; - u64 guest_cr0; - u64 guest_cr3; - u64 guest_cr4; - u64 guest_dr7; - - u64 host_fs_base; - u64 host_gs_base; - u64 host_tr_base; - u64 host_gdtr_base; - u64 host_idtr_base; - u64 host_rsp; - - u64 ept_pointer; - - u16 virtual_processor_id; - u16 padding16_2[3]; - - u64 padding64_2[5]; - u64 guest_physical_address; - - u32 vm_instruction_error; - u32 vm_exit_reason; - u32 vm_exit_intr_info; - u32 vm_exit_intr_error_code; - u32 idt_vectoring_info_field; - u32 idt_vectoring_error_code; - u32 vm_exit_instruction_len; - u32 vmx_instruction_info; - - u64 exit_qualification; - u64 exit_io_instruction_ecx; - u64 exit_io_instruction_esi; - u64 exit_io_instruction_edi; - u64 
exit_io_instruction_eip; - - u64 guest_linear_address; - u64 guest_rsp; - u64 guest_rflags; - - u32 guest_interruptibility_info; - u32 cpu_based_vm_exec_control; - u32 exception_bitmap; - u32 vm_entry_controls; - u32 vm_entry_intr_info_field; - u32 vm_entry_exception_error_code; - u32 vm_entry_instruction_len; - u32 tpr_threshold; - - u64 guest_rip; - - u32 hv_clean_fields; - u32 hv_padding_32; - u32 hv_synthetic_controls; - struct { - u32 nested_flush_hypercall:1; - u32 msr_bitmap:1; - u32 reserved:30; - } __packed hv_enlightenments_control; - u32 hv_vp_id; - - u64 hv_vm_id; - u64 partition_assist_page; - u64 padding64_4[4]; - u64 guest_bndcfgs; - u64 padding64_5[7]; - u64 xss_exit_bitmap; - u64 padding64_6[7]; -} __packed; - -#define HV_VMX_ENLIGHTENED_CLEAN_FIELD_NONE 0 -#define HV_VMX_ENLIGHTENED_CLEAN_FIELD_IO_BITMAP BIT(0) -#define HV_VMX_ENLIGHTENED_CLEAN_FIELD_MSR_BITMAP BIT(1) -#define HV_VMX_ENLIGHTENED_CLEAN_FIELD_CONTROL_GRP2 BIT(2) -#define HV_VMX_ENLIGHTENED_CLEAN_FIELD_CONTROL_GRP1 BIT(3) -#define HV_VMX_ENLIGHTENED_CLEAN_FIELD_CONTROL_PROC BIT(4) -#define HV_VMX_ENLIGHTENED_CLEAN_FIELD_CONTROL_EVENT BIT(5) -#define HV_VMX_ENLIGHTENED_CLEAN_FIELD_CONTROL_ENTRY BIT(6) -#define HV_VMX_ENLIGHTENED_CLEAN_FIELD_CONTROL_EXCPN BIT(7) -#define HV_VMX_ENLIGHTENED_CLEAN_FIELD_CRDR BIT(8) -#define HV_VMX_ENLIGHTENED_CLEAN_FIELD_CONTROL_XLAT BIT(9) -#define HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_BASIC BIT(10) -#define HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP1 BIT(11) -#define HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2 BIT(12) -#define HV_VMX_ENLIGHTENED_CLEAN_FIELD_HOST_POINTER BIT(13) -#define HV_VMX_ENLIGHTENED_CLEAN_FIELD_HOST_GRP1 BIT(14) -#define HV_VMX_ENLIGHTENED_CLEAN_FIELD_ENLIGHTENMENTSCONTROL BIT(15) - -#define HV_VMX_ENLIGHTENED_CLEAN_FIELD_ALL 0xFFFF - -struct hv_partition_assist_pg { - u32 tlb_lock_count; -}; - -enum hv_interrupt_type { - HV_X64_INTERRUPT_TYPE_FIXED = 0x0000, - HV_X64_INTERRUPT_TYPE_LOWESTPRIORITY = 0x0001, - HV_X64_INTERRUPT_TYPE_SMI 
= 0x0002, - HV_X64_INTERRUPT_TYPE_REMOTEREAD = 0x0003, - HV_X64_INTERRUPT_TYPE_NMI = 0x0004, - HV_X64_INTERRUPT_TYPE_INIT = 0x0005, - HV_X64_INTERRUPT_TYPE_SIPI = 0x0006, - HV_X64_INTERRUPT_TYPE_EXTINT = 0x0007, - HV_X64_INTERRUPT_TYPE_LOCALINT0 = 0x0008, - HV_X64_INTERRUPT_TYPE_LOCALINT1 = 0x0009, - HV_X64_INTERRUPT_TYPE_MAXIMUM = 0x000A, -}; - -union hv_msi_address_register { - u32 as_uint32; - struct { - u32 reserved1:2; - u32 destination_mode:1; - u32 redirection_hint:1; - u32 reserved2:8; - u32 destination_id:8; - u32 msi_base:12; - }; -} __packed; - -union hv_msi_data_register { - u32 as_uint32; - struct { - u32 vector:8; - u32 delivery_mode:3; - u32 reserved1:3; - u32 level_assert:1; - u32 trigger_mode:1; - u32 reserved2:16; - }; -} __packed; - -/* HvRetargetDeviceInterrupt hypercall */ -union hv_msi_entry { - u64 as_uint64; - struct { - union hv_msi_address_register address; - union hv_msi_data_register data; - } __packed; -}; - -#include <asm-generic/hyperv-tlfs.h> - -#endif diff --git a/arch/x86/include/asm/hyperv_timer.h b/arch/x86/include/asm/hyperv_timer.h new file mode 100644 index 000000000000..388fa81b8f38 --- /dev/null +++ b/arch/x86/include/asm/hyperv_timer.h @@ -0,0 +1,9 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _ASM_X86_HYPERV_TIMER_H +#define _ASM_X86_HYPERV_TIMER_H + +#include <asm/msr.h> + +#define hv_get_raw_timer() rdtsc_ordered() + +#endif diff --git a/arch/x86/include/asm/i8259.h b/arch/x86/include/asm/i8259.h index 637fa1df3512..c715097e92fd 100644 --- a/arch/x86/include/asm/i8259.h +++ b/arch/x86/include/asm/i8259.h @@ -69,6 +69,8 @@ struct legacy_pic { void (*make_irq)(unsigned int irq); }; +void legacy_pic_pcat_compat(void); + extern struct legacy_pic *legacy_pic; extern struct legacy_pic null_legacy_pic; diff --git a/arch/x86/include/asm/ia32.h b/arch/x86/include/asm/ia32.h index fada857f0a1e..9d69f3f8dbab 100644 --- a/arch/x86/include/asm/ia32.h +++ b/arch/x86/include/asm/ia32.h @@ -2,7 +2,6 @@ #ifndef _ASM_X86_IA32_H 
#define _ASM_X86_IA32_H - #ifdef CONFIG_IA32_EMULATION #include <linux/compat.h> @@ -57,17 +56,37 @@ struct stat64 { unsigned long long st_ino; } __attribute__((packed)); -#define IA32_STACK_TOP IA32_PAGE_OFFSET +extern bool __ia32_enabled; + +static __always_inline bool ia32_enabled(void) +{ + return __ia32_enabled; +} + +static inline void ia32_disable(void) +{ + __ia32_enabled = false; +} + +#else /* !CONFIG_IA32_EMULATION */ -#ifdef __KERNEL__ -struct linux_binprm; -extern int ia32_setup_arg_pages(struct linux_binprm *bprm, - unsigned long stack_top, int exec_stack); -struct mm_struct; -extern void ia32_pick_mmap_layout(struct mm_struct *mm); +static __always_inline bool ia32_enabled(void) +{ + return IS_ENABLED(CONFIG_X86_32); +} + +static inline void ia32_disable(void) {} #endif -#endif /* CONFIG_IA32_EMULATION */ +static inline bool ia32_enabled_verbose(void) +{ + bool enabled = ia32_enabled(); + + if (IS_ENABLED(CONFIG_IA32_EMULATION) && !enabled) + pr_notice_once("32-bit emulation disabled. You can reenable with ia32_emulation=on\n"); + + return enabled; +} #endif /* _ASM_X86_IA32_H */ diff --git a/arch/x86/include/asm/ia32_unistd.h b/arch/x86/include/asm/ia32_unistd.h deleted file mode 100644 index aa065c94ccf5..000000000000 --- a/arch/x86/include/asm/ia32_unistd.h +++ /dev/null @@ -1,12 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -#ifndef _ASM_X86_IA32_UNISTD_H -#define _ASM_X86_IA32_UNISTD_H - -/* - * This file contains the system call numbers of the ia32 compat ABI, - * this is for the kernel only. 
- */ -#define __SYSCALL_ia32_NR(x) (x) -#include <asm/unistd_32_ia32.h> - -#endif /* _ASM_X86_IA32_UNISTD_H */ diff --git a/arch/x86/include/asm/ibt.h b/arch/x86/include/asm/ibt.h index 689880eca9ba..28d845257303 100644 --- a/arch/x86/include/asm/ibt.h +++ b/arch/x86/include/asm/ibt.h @@ -21,7 +21,7 @@ #define HAS_KERNEL_IBT 1 -#ifndef __ASSEMBLY__ +#ifndef __ASSEMBLER__ #ifdef CONFIG_X86_64 #define ASM_ENDBR "endbr64\n\t" @@ -31,7 +31,17 @@ #define __noendbr __attribute__((nocf_check)) -static inline __attribute_const__ u32 gen_endbr(void) +/* + * Create a dummy function pointer reference to prevent objtool from marking + * the function as needing to be "sealed" (i.e. ENDBR converted to NOP by + * apply_seal_endbr()). + */ +#define IBT_NOSEAL(fname) \ + ".pushsection .discard.ibt_endbr_noseal\n\t" \ + _ASM_PTR fname "\n\t" \ + ".popsection\n\t" + +static __always_inline __attribute_const__ u32 gen_endbr(void) { u32 endbr; @@ -46,7 +56,7 @@ static inline __attribute_const__ u32 gen_endbr(void) return endbr; } -static inline __attribute_const__ u32 gen_endbr_poison(void) +static __always_inline __attribute_const__ u32 gen_endbr_poison(void) { /* * 4 byte NOP that isn't NOP4 (in fact it is OSP NOP3), such that it @@ -55,19 +65,24 @@ static inline __attribute_const__ u32 gen_endbr_poison(void) return 0x001f0f66; /* osp nopl (%rax) */ } -static inline bool is_endbr(u32 val) +static inline bool __is_endbr(u32 val) { if (val == gen_endbr_poison()) return true; + /* See cfi_fineibt_bhi_preamble() */ + if (IS_ENABLED(CONFIG_FINEIBT_BHI) && val == 0x001f0ff5) + return true; + val &= ~0x01000000U; /* ENDBR32 -> ENDBR64 */ return val == gen_endbr(); } -extern __noendbr u64 ibt_save(void); +extern __noendbr bool is_endbr(u32 *val); +extern __noendbr u64 ibt_save(bool disable); extern __noendbr void ibt_restore(u64 save); -#else /* __ASSEMBLY__ */ +#else /* __ASSEMBLER__ */ #ifdef CONFIG_X86_64 #define ENDBR endbr64 @@ -75,28 +90,29 @@ extern __noendbr void ibt_restore(u64 
save); #define ENDBR endbr32 #endif -#endif /* __ASSEMBLY__ */ +#endif /* __ASSEMBLER__ */ #else /* !IBT */ #define HAS_KERNEL_IBT 0 -#ifndef __ASSEMBLY__ +#ifndef __ASSEMBLER__ #define ASM_ENDBR +#define IBT_NOSEAL(name) #define __noendbr -static inline bool is_endbr(u32 val) { return false; } +static inline bool is_endbr(u32 *val) { return false; } -static inline u64 ibt_save(void) { return 0; } +static inline u64 ibt_save(bool disable) { return 0; } static inline void ibt_restore(u64 save) { } -#else /* __ASSEMBLY__ */ +#else /* __ASSEMBLER__ */ #define ENDBR -#endif /* __ASSEMBLY__ */ +#endif /* __ASSEMBLER__ */ #endif /* CONFIG_X86_KERNEL_IBT */ diff --git a/arch/x86/include/asm/idtentry.h b/arch/x86/include/asm/idtentry.h index 72184b0b2219..a4ec27c67988 100644 --- a/arch/x86/include/asm/idtentry.h +++ b/arch/x86/include/asm/idtentry.h @@ -7,21 +7,24 @@ #define IDT_ALIGN (8 * (1 + HAS_KERNEL_IBT)) -#ifndef __ASSEMBLY__ +#ifndef __ASSEMBLER__ #include <linux/entry-common.h> #include <linux/hardirq.h> #include <asm/irq_stack.h> +typedef void (*idtentry_t)(struct pt_regs *regs); + /** * DECLARE_IDTENTRY - Declare functions for simple IDT entry points * No error code pushed by hardware * @vector: Vector number (ignored for C) * @func: Function name of the entry point * - * Declares three functions: + * Declares four functions: * - The ASM entry point: asm_##func * - The XEN PV trap entry point: xen_##func (maybe unused) + * - The C handler called from the FRED event dispatcher (maybe unused) * - The C handler called from the ASM entry point * * Note: This is the C variant of DECLARE_IDTENTRY(). 
As the name says it @@ -31,6 +34,7 @@ #define DECLARE_IDTENTRY(vector, func) \ asmlinkage void asm_##func(void); \ asmlinkage void xen_asm_##func(void); \ + void fred_##func(struct pt_regs *regs); \ __visible void func(struct pt_regs *regs) /** @@ -138,6 +142,17 @@ static __always_inline void __##func(struct pt_regs *regs, \ __visible noinstr void func(struct pt_regs *regs) /** + * DEFINE_FREDENTRY_RAW - Emit code for raw FRED entry points + * @func: Function name of the entry point + * + * @func is called from the FRED event dispatcher with interrupts disabled. + * + * See @DEFINE_IDTENTRY_RAW for further details. + */ +#define DEFINE_FREDENTRY_RAW(func) \ +noinstr void fred_##func(struct pt_regs *regs) + +/** * DECLARE_IDTENTRY_RAW_ERRORCODE - Declare functions for raw IDT entry points * Error code pushed by hardware * @vector: Vector number (ignored for C) @@ -197,8 +212,8 @@ __visible noinstr void func(struct pt_regs *regs, \ irqentry_state_t state = irqentry_enter(regs); \ u32 vector = (u32)(u8)error_code; \ \ + kvm_set_cpu_l1tf_flush_l1d(); \ instrumentation_begin(); \ - kvm_set_cpu_l1tf_flush_l1d(); \ run_irq_on_irqstack_cond(__##func, regs, vector); \ instrumentation_end(); \ irqentry_exit(regs, state); \ @@ -233,17 +248,27 @@ static noinline void __##func(struct pt_regs *regs, u32 vector) #define DEFINE_IDTENTRY_SYSVEC(func) \ static void __##func(struct pt_regs *regs); \ \ +static __always_inline void instr_##func(struct pt_regs *regs) \ +{ \ + run_sysvec_on_irqstack_cond(__##func, regs); \ +} \ + \ __visible noinstr void func(struct pt_regs *regs) \ { \ irqentry_state_t state = irqentry_enter(regs); \ \ + kvm_set_cpu_l1tf_flush_l1d(); \ instrumentation_begin(); \ - kvm_set_cpu_l1tf_flush_l1d(); \ - run_sysvec_on_irqstack_cond(__##func, regs); \ + instr_##func (regs); \ instrumentation_end(); \ irqentry_exit(regs, state); \ } \ \ +void fred_##func(struct pt_regs *regs) \ +{ \ + instr_##func (regs); \ +} \ + \ static noinline void __##func(struct pt_regs 
*regs) /** @@ -260,19 +285,29 @@ static noinline void __##func(struct pt_regs *regs) #define DEFINE_IDTENTRY_SYSVEC_SIMPLE(func) \ static __always_inline void __##func(struct pt_regs *regs); \ \ +static __always_inline void instr_##func(struct pt_regs *regs) \ +{ \ + __irq_enter_raw(); \ + __##func (regs); \ + __irq_exit_raw(); \ +} \ + \ __visible noinstr void func(struct pt_regs *regs) \ { \ irqentry_state_t state = irqentry_enter(regs); \ \ + kvm_set_cpu_l1tf_flush_l1d(); \ instrumentation_begin(); \ - __irq_enter_raw(); \ - kvm_set_cpu_l1tf_flush_l1d(); \ - __##func (regs); \ - __irq_exit_raw(); \ + instr_##func (regs); \ instrumentation_end(); \ irqentry_exit(regs, state); \ } \ \ +void fred_##func(struct pt_regs *regs) \ +{ \ + instr_##func (regs); \ +} \ + \ static __always_inline void __##func(struct pt_regs *regs) /** @@ -410,18 +445,36 @@ __visible noinstr void func(struct pt_regs *regs, \ /* C-Code mapping */ #define DECLARE_IDTENTRY_NMI DECLARE_IDTENTRY_RAW #define DEFINE_IDTENTRY_NMI DEFINE_IDTENTRY_RAW +#define DEFINE_FREDENTRY_NMI DEFINE_FREDENTRY_RAW #ifdef CONFIG_X86_64 #define DECLARE_IDTENTRY_MCE DECLARE_IDTENTRY_IST #define DEFINE_IDTENTRY_MCE DEFINE_IDTENTRY_IST #define DEFINE_IDTENTRY_MCE_USER DEFINE_IDTENTRY_NOIST +#define DEFINE_FREDENTRY_MCE DEFINE_FREDENTRY_RAW #define DECLARE_IDTENTRY_DEBUG DECLARE_IDTENTRY_IST #define DEFINE_IDTENTRY_DEBUG DEFINE_IDTENTRY_IST #define DEFINE_IDTENTRY_DEBUG_USER DEFINE_IDTENTRY_NOIST +#define DEFINE_FREDENTRY_DEBUG DEFINE_FREDENTRY_RAW #endif -#else /* !__ASSEMBLY__ */ +void idt_install_sysvec(unsigned int n, const void *function); + +#ifdef CONFIG_X86_FRED +void fred_install_sysvec(unsigned int vector, const idtentry_t function); +#else +static inline void fred_install_sysvec(unsigned int vector, const idtentry_t function) { } +#endif + +#define sysvec_install(vector, function) { \ + if (cpu_feature_enabled(X86_FEATURE_FRED)) \ + fred_install_sysvec(vector, function); \ + else \ + 
idt_install_sysvec(vector, asm_##function); \ +} + +#else /* !__ASSEMBLER__ */ /* * The ASM variants for DECLARE_IDTENTRY*() which emit the ASM entry stubs. @@ -447,7 +500,7 @@ __visible noinstr void func(struct pt_regs *regs, \ /* System vector entries */ #define DECLARE_IDTENTRY_SYSVEC(vector, func) \ - idtentry_sysvec vector func + DECLARE_IDTENTRY(vector, func) #ifdef CONFIG_X86_64 # define DECLARE_IDTENTRY_MCE(vector, func) \ @@ -526,7 +579,7 @@ SYM_CODE_START(spurious_entries_start) SYM_CODE_END(spurious_entries_start) #endif -#endif /* __ASSEMBLY__ */ +#endif /* __ASSEMBLER__ */ /* * The actual entry points. Note that DECLARE_IDTENTRY*() serves two @@ -569,6 +622,10 @@ DECLARE_IDTENTRY_RAW(X86_TRAP_UD, exc_invalid_op); DECLARE_IDTENTRY_RAW(X86_TRAP_BP, exc_int3); DECLARE_IDTENTRY_RAW_ERRORCODE(X86_TRAP_PF, exc_page_fault); +#if defined(CONFIG_IA32_EMULATION) +DECLARE_IDTENTRY_RAW(IA32_SYSCALL_VECTOR, int80_emulation); +#endif + #ifdef CONFIG_X86_MCE #ifdef CONFIG_X86_64 DECLARE_IDTENTRY_MCE(X86_TRAP_MC, exc_machine_check); @@ -582,18 +639,14 @@ DECLARE_IDTENTRY_RAW(X86_TRAP_MC, xenpv_exc_machine_check); /* NMI */ -#if defined(CONFIG_X86_64) && IS_ENABLED(CONFIG_KVM_INTEL) +#if IS_ENABLED(CONFIG_KVM_INTEL) /* - * Special NOIST entry point for VMX which invokes this on the kernel - * stack. asm_exc_nmi() requires an IST to work correctly vs. the NMI - * 'executing' marker. - * - * On 32bit this just uses the regular NMI entry point because 32-bit does - * not have ISTs. + * Special entry point for VMX which invokes this on the kernel stack, even for + * 64-bit, i.e. without using an IST. asm_exc_nmi() requires an IST to work + * correctly vs. the NMI 'executing' marker. Used for 32-bit kernels as well + * to avoid more ifdeffery. 
*/ -DECLARE_IDTENTRY(X86_TRAP_NMI, exc_nmi_noist); -#else -#define asm_exc_nmi_noist asm_exc_nmi +DECLARE_IDTENTRY(X86_TRAP_NMI, exc_nmi_kvm_vmx); #endif DECLARE_IDTENTRY_NMI(X86_TRAP_NMI, exc_nmi); @@ -618,7 +671,7 @@ DECLARE_IDTENTRY_RAW_ERRORCODE(X86_TRAP_DF, xenpv_exc_double_fault); #endif /* #CP */ -#ifdef CONFIG_X86_KERNEL_IBT +#ifdef CONFIG_X86_CET DECLARE_IDTENTRY_ERRORCODE(X86_TRAP_CP, exc_control_protection); #endif @@ -652,40 +705,62 @@ DECLARE_IDTENTRY_SYSVEC(X86_PLATFORM_IPI_VECTOR, sysvec_x86_platform_ipi); #ifdef CONFIG_SMP DECLARE_IDTENTRY(RESCHEDULE_VECTOR, sysvec_reschedule_ipi); -DECLARE_IDTENTRY_SYSVEC(IRQ_MOVE_CLEANUP_VECTOR, sysvec_irq_move_cleanup); DECLARE_IDTENTRY_SYSVEC(REBOOT_VECTOR, sysvec_reboot); DECLARE_IDTENTRY_SYSVEC(CALL_FUNCTION_SINGLE_VECTOR, sysvec_call_function_single); DECLARE_IDTENTRY_SYSVEC(CALL_FUNCTION_VECTOR, sysvec_call_function); +#else +# define fred_sysvec_reschedule_ipi NULL +# define fred_sysvec_reboot NULL +# define fred_sysvec_call_function_single NULL +# define fred_sysvec_call_function NULL #endif #ifdef CONFIG_X86_LOCAL_APIC # ifdef CONFIG_X86_MCE_THRESHOLD DECLARE_IDTENTRY_SYSVEC(THRESHOLD_APIC_VECTOR, sysvec_threshold); +# else +# define fred_sysvec_threshold NULL # endif # ifdef CONFIG_X86_MCE_AMD DECLARE_IDTENTRY_SYSVEC(DEFERRED_ERROR_VECTOR, sysvec_deferred_error); +# else +# define fred_sysvec_deferred_error NULL # endif # ifdef CONFIG_X86_THERMAL_VECTOR DECLARE_IDTENTRY_SYSVEC(THERMAL_APIC_VECTOR, sysvec_thermal); +# else +# define fred_sysvec_thermal NULL # endif # ifdef CONFIG_IRQ_WORK DECLARE_IDTENTRY_SYSVEC(IRQ_WORK_VECTOR, sysvec_irq_work); +# else +# define fred_sysvec_irq_work NULL # endif #endif -#ifdef CONFIG_HAVE_KVM +#if IS_ENABLED(CONFIG_KVM) DECLARE_IDTENTRY_SYSVEC(POSTED_INTR_VECTOR, sysvec_kvm_posted_intr_ipi); DECLARE_IDTENTRY_SYSVEC(POSTED_INTR_WAKEUP_VECTOR, sysvec_kvm_posted_intr_wakeup_ipi); DECLARE_IDTENTRY_SYSVEC(POSTED_INTR_NESTED_VECTOR, sysvec_kvm_posted_intr_nested_ipi); +#else 
+# define fred_sysvec_kvm_posted_intr_ipi NULL +# define fred_sysvec_kvm_posted_intr_wakeup_ipi NULL +# define fred_sysvec_kvm_posted_intr_nested_ipi NULL #endif +# ifdef CONFIG_X86_POSTED_MSI +DECLARE_IDTENTRY_SYSVEC(POSTED_MSI_NOTIFICATION_VECTOR, sysvec_posted_msi_notification); +#else +# define fred_sysvec_posted_msi_notification NULL +# endif + #if IS_ENABLED(CONFIG_HYPERV) DECLARE_IDTENTRY_SYSVEC(HYPERVISOR_CALLBACK_VECTOR, sysvec_hyperv_callback); DECLARE_IDTENTRY_SYSVEC(HYPERV_REENLIGHTENMENT_VECTOR, sysvec_hyperv_reenlightenment); -DECLARE_IDTENTRY_SYSVEC(HYPERV_STIMER0_VECTOR, sysvec_hyperv_stimer0); +DECLARE_IDTENTRY_SYSVEC(HYPERV_STIMER0_VECTOR, sysvec_hyperv_stimer0); #endif #if IS_ENABLED(CONFIG_ACRN_GUEST) diff --git a/arch/x86/include/asm/inat.h b/arch/x86/include/asm/inat.h index b56c5741581a..97f341777db5 100644 --- a/arch/x86/include/asm/inat.h +++ b/arch/x86/include/asm/inat.h @@ -35,6 +35,8 @@ #define INAT_PFX_VEX2 13 /* 2-bytes VEX prefix */ #define INAT_PFX_VEX3 14 /* 3-bytes VEX prefix */ #define INAT_PFX_EVEX 15 /* EVEX prefix */ +/* x86-64 REX2 prefix */ +#define INAT_PFX_REX2 16 /* 0xD5 */ #define INAT_LSTPFX_MAX 3 #define INAT_LGCPFX_MAX 11 @@ -50,7 +52,7 @@ /* Legacy prefix */ #define INAT_PFX_OFFS 0 -#define INAT_PFX_BITS 4 +#define INAT_PFX_BITS 5 #define INAT_PFX_MAX ((1 << INAT_PFX_BITS) - 1) #define INAT_PFX_MASK (INAT_PFX_MAX << INAT_PFX_OFFS) /* Escape opcodes */ @@ -77,6 +79,10 @@ #define INAT_VEXOK (1 << (INAT_FLAG_OFFS + 5)) #define INAT_VEXONLY (1 << (INAT_FLAG_OFFS + 6)) #define INAT_EVEXONLY (1 << (INAT_FLAG_OFFS + 7)) +#define INAT_NO_REX2 (1 << (INAT_FLAG_OFFS + 8)) +#define INAT_REX2_VARIANT (1 << (INAT_FLAG_OFFS + 9)) +#define INAT_EVEX_SCALABLE (1 << (INAT_FLAG_OFFS + 10)) +#define INAT_INV64 (1 << (INAT_FLAG_OFFS + 11)) /* Attribute making macros for attribute tables */ #define INAT_MAKE_PREFIX(pfx) (pfx << INAT_PFX_OFFS) #define INAT_MAKE_ESCAPE(esc) (esc << INAT_ESC_OFFS) @@ -128,6 +134,11 @@ static inline int 
inat_is_rex_prefix(insn_attr_t attr) return (attr & INAT_PFX_MASK) == INAT_PFX_REX; } +static inline int inat_is_rex2_prefix(insn_attr_t attr) +{ + return (attr & INAT_PFX_MASK) == INAT_PFX_REX2; +} + static inline int inat_last_prefix_id(insn_attr_t attr) { if ((attr & INAT_PFX_MASK) > INAT_LSTPFX_MAX) @@ -227,4 +238,14 @@ static inline int inat_must_evex(insn_attr_t attr) { return attr & INAT_EVEXONLY; } + +static inline int inat_evex_scalable(insn_attr_t attr) +{ + return attr & INAT_EVEX_SCALABLE; +} + +static inline int inat_is_invalid64(insn_attr_t attr) +{ + return attr & INAT_INV64; +} #endif diff --git a/arch/x86/include/asm/init.h b/arch/x86/include/asm/init.h index 5f1d3c421f68..8b1b1abcef15 100644 --- a/arch/x86/include/asm/init.h +++ b/arch/x86/include/asm/init.h @@ -2,8 +2,15 @@ #ifndef _ASM_X86_INIT_H #define _ASM_X86_INIT_H +#if defined(CONFIG_CC_IS_CLANG) && CONFIG_CLANG_VERSION < 170000 +#define __head __section(".head.text") __no_sanitize_undefined __no_stack_protector +#else +#define __head __section(".head.text") __no_sanitize_undefined +#endif + struct x86_mapping_info { void *(*alloc_pgt_page)(void *); /* allocate buf for page table */ + void (*free_pgt_page)(void *, void *); /* free buf for page table */ void *context; /* context for alloc_pgt_page */ unsigned long page_flag; /* page flag for PMD or PUD entry */ unsigned long offset; /* ident mapping offset */ @@ -14,4 +21,6 @@ struct x86_mapping_info { int kernel_ident_mapping_init(struct x86_mapping_info *info, pgd_t *pgd_page, unsigned long pstart, unsigned long pend); +void kernel_ident_mapping_free(struct x86_mapping_info *info, pgd_t *pgd); + #endif /* _ASM_X86_INIT_H */ diff --git a/arch/x86/include/asm/insn-eval.h b/arch/x86/include/asm/insn-eval.h index f07faa61c7f3..54368a43abf6 100644 --- a/arch/x86/include/asm/insn-eval.h +++ b/arch/x86/include/asm/insn-eval.h @@ -32,16 +32,16 @@ int insn_fetch_from_user_inatomic(struct pt_regs *regs, bool insn_decode_from_regs(struct insn *insn, 
struct pt_regs *regs, unsigned char buf[MAX_INSN_SIZE], int buf_size); -enum mmio_type { - MMIO_DECODE_FAILED, - MMIO_WRITE, - MMIO_WRITE_IMM, - MMIO_READ, - MMIO_READ_ZERO_EXTEND, - MMIO_READ_SIGN_EXTEND, - MMIO_MOVS, +enum insn_mmio_type { + INSN_MMIO_DECODE_FAILED, + INSN_MMIO_WRITE, + INSN_MMIO_WRITE_IMM, + INSN_MMIO_READ, + INSN_MMIO_READ_ZERO_EXTEND, + INSN_MMIO_READ_SIGN_EXTEND, + INSN_MMIO_MOVS, }; -enum mmio_type insn_decode_mmio(struct insn *insn, int *bytes); +enum insn_mmio_type insn_decode_mmio(struct insn *insn, int *bytes); #endif /* _ASM_X86_INSN_EVAL_H */ diff --git a/arch/x86/include/asm/insn.h b/arch/x86/include/asm/insn.h index 1b29f58f730f..7152ea809e6a 100644 --- a/arch/x86/include/asm/insn.h +++ b/arch/x86/include/asm/insn.h @@ -112,10 +112,15 @@ struct insn { #define X86_SIB_INDEX(sib) (((sib) & 0x38) >> 3) #define X86_SIB_BASE(sib) ((sib) & 0x07) -#define X86_REX_W(rex) ((rex) & 8) -#define X86_REX_R(rex) ((rex) & 4) -#define X86_REX_X(rex) ((rex) & 2) -#define X86_REX_B(rex) ((rex) & 1) +#define X86_REX2_M(rex) ((rex) & 0x80) /* REX2 M0 */ +#define X86_REX2_R(rex) ((rex) & 0x40) /* REX2 R4 */ +#define X86_REX2_X(rex) ((rex) & 0x20) /* REX2 X4 */ +#define X86_REX2_B(rex) ((rex) & 0x10) /* REX2 B4 */ + +#define X86_REX_W(rex) ((rex) & 8) /* REX or REX2 W */ +#define X86_REX_R(rex) ((rex) & 4) /* REX or REX2 R3 */ +#define X86_REX_X(rex) ((rex) & 2) /* REX or REX2 X3 */ +#define X86_REX_B(rex) ((rex) & 1) /* REX or REX2 B3 */ /* VEX bit flags */ #define X86_VEX_W(vex) ((vex) & 0x80) /* VEX3 Byte2 */ @@ -161,6 +166,18 @@ static inline void insn_get_attribute(struct insn *insn) /* Instruction uses RIP-relative addressing */ extern int insn_rip_relative(struct insn *insn); +static inline int insn_is_rex2(struct insn *insn) +{ + if (!insn->prefixes.got) + insn_get_prefixes(insn); + return insn->rex_prefix.nbytes == 2; +} + +static inline insn_byte_t insn_rex2_m_bit(struct insn *insn) +{ + return X86_REX2_M(insn->rex_prefix.bytes[1]); +} + static 
inline int insn_is_avx(struct insn *insn) { if (!insn->prefixes.got) @@ -198,6 +215,13 @@ static inline insn_byte_t insn_vex_p_bits(struct insn *insn) return X86_VEX_P(insn->vex_prefix.bytes[2]); } +static inline insn_byte_t insn_vex_w_bit(struct insn *insn) +{ + if (insn->vex_prefix.nbytes < 3) + return 0; + return X86_VEX_W(insn->vex_prefix.bytes[2]); +} + /* Get the last prefix id from last prefix or VEX prefix */ static inline int insn_last_prefix_id(struct insn *insn) { diff --git a/arch/x86/include/asm/inst.h b/arch/x86/include/asm/inst.h index 438ccd4f3cc4..e48a00b3311d 100644 --- a/arch/x86/include/asm/inst.h +++ b/arch/x86/include/asm/inst.h @@ -6,7 +6,7 @@ #ifndef X86_ASM_INST_H #define X86_ASM_INST_H -#ifdef __ASSEMBLY__ +#ifdef __ASSEMBLER__ #define REG_NUM_INVALID 100 diff --git a/arch/x86/include/asm/intel-family.h b/arch/x86/include/asm/intel-family.h index def6ca121111..be10c188614f 100644 --- a/arch/x86/include/asm/intel-family.h +++ b/arch/x86/include/asm/intel-family.h @@ -10,7 +10,7 @@ * that group keep the CPUID for the variants sorted by model number. * * The defined symbol names have the following form: - * INTEL_FAM6{OPTFAMILY}_{MICROARCH}{OPTDIFF} + * INTEL_{OPTFAMILY}_{MICROARCH}{OPTDIFF} * where: * OPTFAMILY Describes the family of CPUs that this belongs to. Default * is assumed to be "_CORE" (and should be omitted). 
Other values @@ -27,6 +27,8 @@ * _X - regular server parts * _D - micro server parts * _N,_P - other mobile parts + * _H - premium mobile parts + * _S - other client parts * * Historical OPTDIFFs: * @@ -38,114 +40,184 @@ * their own names :-( */ -/* Wildcard match for FAM6 so X86_MATCH_INTEL_FAM6_MODEL(ANY) works */ -#define INTEL_FAM6_ANY X86_MODEL_ANY +#define IFM(_fam, _model) VFM_MAKE(X86_VENDOR_INTEL, _fam, _model) -#define INTEL_FAM6_CORE_YONAH 0x0E +/* Wildcard match so X86_MATCH_VFM(ANY) works */ +#define INTEL_ANY IFM(X86_FAMILY_ANY, X86_MODEL_ANY) -#define INTEL_FAM6_CORE2_MEROM 0x0F -#define INTEL_FAM6_CORE2_MEROM_L 0x16 -#define INTEL_FAM6_CORE2_PENRYN 0x17 -#define INTEL_FAM6_CORE2_DUNNINGTON 0x1D - -#define INTEL_FAM6_NEHALEM 0x1E -#define INTEL_FAM6_NEHALEM_G 0x1F /* Auburndale / Havendale */ -#define INTEL_FAM6_NEHALEM_EP 0x1A -#define INTEL_FAM6_NEHALEM_EX 0x2E - -#define INTEL_FAM6_WESTMERE 0x25 -#define INTEL_FAM6_WESTMERE_EP 0x2C -#define INTEL_FAM6_WESTMERE_EX 0x2F - -#define INTEL_FAM6_SANDYBRIDGE 0x2A -#define INTEL_FAM6_SANDYBRIDGE_X 0x2D -#define INTEL_FAM6_IVYBRIDGE 0x3A -#define INTEL_FAM6_IVYBRIDGE_X 0x3E - -#define INTEL_FAM6_HASWELL 0x3C -#define INTEL_FAM6_HASWELL_X 0x3F -#define INTEL_FAM6_HASWELL_L 0x45 -#define INTEL_FAM6_HASWELL_G 0x46 - -#define INTEL_FAM6_BROADWELL 0x3D -#define INTEL_FAM6_BROADWELL_G 0x47 -#define INTEL_FAM6_BROADWELL_X 0x4F -#define INTEL_FAM6_BROADWELL_D 0x56 - -#define INTEL_FAM6_SKYLAKE_L 0x4E /* Sky Lake */ -#define INTEL_FAM6_SKYLAKE 0x5E /* Sky Lake */ -#define INTEL_FAM6_SKYLAKE_X 0x55 /* Sky Lake */ +/* Family 5 */ +#define INTEL_FAM5_START IFM(5, 0x00) /* Notational marker, also P5 A-step */ +#define INTEL_PENTIUM_75 IFM(5, 0x02) /* P54C */ +#define INTEL_PENTIUM_MMX IFM(5, 0x04) /* P55C */ +#define INTEL_QUARK_X1000 IFM(5, 0x09) /* Quark X1000 SoC */ + +/* Family 6 */ +#define INTEL_PENTIUM_PRO IFM(6, 0x01) +#define INTEL_PENTIUM_II_KLAMATH IFM(6, 0x03) +#define INTEL_PENTIUM_III_DESCHUTES IFM(6, 
0x05) +#define INTEL_PENTIUM_III_TUALATIN IFM(6, 0x0B) +#define INTEL_PENTIUM_M_DOTHAN IFM(6, 0x0D) + +#define INTEL_CORE_YONAH IFM(6, 0x0E) + +#define INTEL_CORE2_MEROM IFM(6, 0x0F) +#define INTEL_CORE2_MEROM_L IFM(6, 0x16) +#define INTEL_CORE2_PENRYN IFM(6, 0x17) +#define INTEL_CORE2_DUNNINGTON IFM(6, 0x1D) + +#define INTEL_NEHALEM IFM(6, 0x1E) +#define INTEL_NEHALEM_G IFM(6, 0x1F) /* Auburndale / Havendale */ +#define INTEL_NEHALEM_EP IFM(6, 0x1A) +#define INTEL_NEHALEM_EX IFM(6, 0x2E) + +#define INTEL_WESTMERE IFM(6, 0x25) +#define INTEL_WESTMERE_EP IFM(6, 0x2C) +#define INTEL_WESTMERE_EX IFM(6, 0x2F) + +#define INTEL_SANDYBRIDGE IFM(6, 0x2A) +#define INTEL_SANDYBRIDGE_X IFM(6, 0x2D) +#define INTEL_IVYBRIDGE IFM(6, 0x3A) +#define INTEL_IVYBRIDGE_X IFM(6, 0x3E) + +#define INTEL_HASWELL IFM(6, 0x3C) +#define INTEL_HASWELL_X IFM(6, 0x3F) +#define INTEL_HASWELL_L IFM(6, 0x45) +#define INTEL_HASWELL_G IFM(6, 0x46) + +#define INTEL_BROADWELL IFM(6, 0x3D) +#define INTEL_BROADWELL_G IFM(6, 0x47) +#define INTEL_BROADWELL_X IFM(6, 0x4F) +#define INTEL_BROADWELL_D IFM(6, 0x56) + +#define INTEL_SKYLAKE_L IFM(6, 0x4E) /* Sky Lake */ +#define INTEL_SKYLAKE IFM(6, 0x5E) /* Sky Lake */ +#define INTEL_SKYLAKE_X IFM(6, 0x55) /* Sky Lake */ /* CASCADELAKE_X 0x55 Sky Lake -- s: 7 */ /* COOPERLAKE_X 0x55 Sky Lake -- s: 11 */ -#define INTEL_FAM6_KABYLAKE_L 0x8E /* Sky Lake */ +#define INTEL_KABYLAKE_L IFM(6, 0x8E) /* Sky Lake */ /* AMBERLAKE_L 0x8E Sky Lake -- s: 9 */ /* COFFEELAKE_L 0x8E Sky Lake -- s: 10 */ /* WHISKEYLAKE_L 0x8E Sky Lake -- s: 11,12 */ -#define INTEL_FAM6_KABYLAKE 0x9E /* Sky Lake */ +#define INTEL_KABYLAKE IFM(6, 0x9E) /* Sky Lake */ /* COFFEELAKE 0x9E Sky Lake -- s: 10-13 */ -#define INTEL_FAM6_COMETLAKE 0xA5 /* Sky Lake */ -#define INTEL_FAM6_COMETLAKE_L 0xA6 /* Sky Lake */ +#define INTEL_COMETLAKE IFM(6, 0xA5) /* Sky Lake */ +#define INTEL_COMETLAKE_L IFM(6, 0xA6) /* Sky Lake */ + +#define INTEL_CANNONLAKE_L IFM(6, 0x66) /* Palm Cove */ + +#define 
INTEL_ICELAKE_X IFM(6, 0x6A) /* Sunny Cove */ +#define INTEL_ICELAKE_D IFM(6, 0x6C) /* Sunny Cove */ +#define INTEL_ICELAKE IFM(6, 0x7D) /* Sunny Cove */ +#define INTEL_ICELAKE_L IFM(6, 0x7E) /* Sunny Cove */ +#define INTEL_ICELAKE_NNPI IFM(6, 0x9D) /* Sunny Cove */ + +#define INTEL_ROCKETLAKE IFM(6, 0xA7) /* Cypress Cove */ + +#define INTEL_TIGERLAKE_L IFM(6, 0x8C) /* Willow Cove */ +#define INTEL_TIGERLAKE IFM(6, 0x8D) /* Willow Cove */ + +#define INTEL_SAPPHIRERAPIDS_X IFM(6, 0x8F) /* Golden Cove */ + +#define INTEL_EMERALDRAPIDS_X IFM(6, 0xCF) /* Raptor Cove */ -#define INTEL_FAM6_CANNONLAKE_L 0x66 /* Palm Cove */ +#define INTEL_GRANITERAPIDS_X IFM(6, 0xAD) /* Redwood Cove */ +#define INTEL_GRANITERAPIDS_D IFM(6, 0xAE) -#define INTEL_FAM6_ICELAKE_X 0x6A /* Sunny Cove */ -#define INTEL_FAM6_ICELAKE_D 0x6C /* Sunny Cove */ -#define INTEL_FAM6_ICELAKE 0x7D /* Sunny Cove */ -#define INTEL_FAM6_ICELAKE_L 0x7E /* Sunny Cove */ -#define INTEL_FAM6_ICELAKE_NNPI 0x9D /* Sunny Cove */ +#define INTEL_BARTLETTLAKE IFM(6, 0xD7) /* Raptor Cove */ -#define INTEL_FAM6_LAKEFIELD 0x8A /* Sunny Cove / Tremont */ +/* "Hybrid" Processors (P-Core/E-Core) */ -#define INTEL_FAM6_ROCKETLAKE 0xA7 /* Cypress Cove */ +#define INTEL_LAKEFIELD IFM(6, 0x8A) /* Sunny Cove / Tremont */ -#define INTEL_FAM6_TIGERLAKE_L 0x8C /* Willow Cove */ -#define INTEL_FAM6_TIGERLAKE 0x8D /* Willow Cove */ +#define INTEL_ALDERLAKE IFM(6, 0x97) /* Golden Cove / Gracemont */ +#define INTEL_ALDERLAKE_L IFM(6, 0x9A) /* Golden Cove / Gracemont */ -#define INTEL_FAM6_SAPPHIRERAPIDS_X 0x8F /* Golden Cove */ +#define INTEL_RAPTORLAKE IFM(6, 0xB7) /* Raptor Cove / Enhanced Gracemont */ +#define INTEL_RAPTORLAKE_P IFM(6, 0xBA) +#define INTEL_RAPTORLAKE_S IFM(6, 0xBF) -#define INTEL_FAM6_ALDERLAKE 0x97 /* Golden Cove / Gracemont */ -#define INTEL_FAM6_ALDERLAKE_L 0x9A /* Golden Cove / Gracemont */ -#define INTEL_FAM6_ALDERLAKE_N 0xBE +#define INTEL_METEORLAKE IFM(6, 0xAC) /* Redwood Cove / Crestmont */ +#define 
INTEL_METEORLAKE_L IFM(6, 0xAA) -#define INTEL_FAM6_RAPTORLAKE 0xB7 -#define INTEL_FAM6_RAPTORLAKE_P 0xBA +#define INTEL_ARROWLAKE_H IFM(6, 0xC5) /* Lion Cove / Skymont */ +#define INTEL_ARROWLAKE IFM(6, 0xC6) +#define INTEL_ARROWLAKE_U IFM(6, 0xB5) -/* "Small Core" Processors (Atom) */ +#define INTEL_LUNARLAKE_M IFM(6, 0xBD) /* Lion Cove / Skymont */ -#define INTEL_FAM6_ATOM_BONNELL 0x1C /* Diamondville, Pineview */ -#define INTEL_FAM6_ATOM_BONNELL_MID 0x26 /* Silverthorne, Lincroft */ +#define INTEL_PANTHERLAKE_L IFM(6, 0xCC) /* Cougar Cove / Crestmont */ -#define INTEL_FAM6_ATOM_SALTWELL 0x36 /* Cedarview */ -#define INTEL_FAM6_ATOM_SALTWELL_MID 0x27 /* Penwell */ -#define INTEL_FAM6_ATOM_SALTWELL_TABLET 0x35 /* Cloverview */ +/* "Small Core" Processors (Atom/E-Core) */ -#define INTEL_FAM6_ATOM_SILVERMONT 0x37 /* Bay Trail, Valleyview */ -#define INTEL_FAM6_ATOM_SILVERMONT_D 0x4D /* Avaton, Rangely */ -#define INTEL_FAM6_ATOM_SILVERMONT_MID 0x4A /* Merriefield */ +#define INTEL_ATOM_BONNELL IFM(6, 0x1C) /* Diamondville, Pineview */ +#define INTEL_ATOM_BONNELL_MID IFM(6, 0x26) /* Silverthorne, Lincroft */ -#define INTEL_FAM6_ATOM_AIRMONT 0x4C /* Cherry Trail, Braswell */ -#define INTEL_FAM6_ATOM_AIRMONT_MID 0x5A /* Moorefield */ -#define INTEL_FAM6_ATOM_AIRMONT_NP 0x75 /* Lightning Mountain */ +#define INTEL_ATOM_SALTWELL IFM(6, 0x36) /* Cedarview */ +#define INTEL_ATOM_SALTWELL_MID IFM(6, 0x27) /* Penwell */ +#define INTEL_ATOM_SALTWELL_TABLET IFM(6, 0x35) /* Cloverview */ -#define INTEL_FAM6_ATOM_GOLDMONT 0x5C /* Apollo Lake */ -#define INTEL_FAM6_ATOM_GOLDMONT_D 0x5F /* Denverton */ +#define INTEL_ATOM_SILVERMONT IFM(6, 0x37) /* Bay Trail, Valleyview */ +#define INTEL_ATOM_SILVERMONT_D IFM(6, 0x4D) /* Avaton, Rangely */ +#define INTEL_ATOM_SILVERMONT_MID IFM(6, 0x4A) /* Merriefield */ +#define INTEL_ATOM_SILVERMONT_MID2 IFM(6, 0x5A) /* Anniedale */ + +#define INTEL_ATOM_AIRMONT IFM(6, 0x4C) /* Cherry Trail, Braswell */ +#define INTEL_ATOM_AIRMONT_NP IFM(6, 
0x75) /* Lightning Mountain */ + +#define INTEL_ATOM_GOLDMONT IFM(6, 0x5C) /* Apollo Lake */ +#define INTEL_ATOM_GOLDMONT_D IFM(6, 0x5F) /* Denverton */ /* Note: the micro-architecture is "Goldmont Plus" */ -#define INTEL_FAM6_ATOM_GOLDMONT_PLUS 0x7A /* Gemini Lake */ +#define INTEL_ATOM_GOLDMONT_PLUS IFM(6, 0x7A) /* Gemini Lake */ + +#define INTEL_ATOM_TREMONT_D IFM(6, 0x86) /* Jacobsville */ +#define INTEL_ATOM_TREMONT IFM(6, 0x96) /* Elkhart Lake */ +#define INTEL_ATOM_TREMONT_L IFM(6, 0x9C) /* Jasper Lake */ -#define INTEL_FAM6_ATOM_TREMONT_D 0x86 /* Jacobsville */ -#define INTEL_FAM6_ATOM_TREMONT 0x96 /* Elkhart Lake */ -#define INTEL_FAM6_ATOM_TREMONT_L 0x9C /* Jasper Lake */ +#define INTEL_ATOM_GRACEMONT IFM(6, 0xBE) /* Alderlake N */ + +#define INTEL_ATOM_CRESTMONT_X IFM(6, 0xAF) /* Sierra Forest */ +#define INTEL_ATOM_CRESTMONT IFM(6, 0xB6) /* Grand Ridge */ + +#define INTEL_ATOM_DARKMONT_X IFM(6, 0xDD) /* Clearwater Forest */ /* Xeon Phi */ -#define INTEL_FAM6_XEON_PHI_KNL 0x57 /* Knights Landing */ -#define INTEL_FAM6_XEON_PHI_KNM 0x85 /* Knights Mill */ +#define INTEL_XEON_PHI_KNL IFM(6, 0x57) /* Knights Landing */ +#define INTEL_XEON_PHI_KNM IFM(6, 0x85) /* Knights Mill */ -/* Family 5 */ -#define INTEL_FAM5_QUARK_X1000 0x09 /* Quark X1000 SoC */ +/* Notational marker denoting the last Family 6 model */ +#define INTEL_FAM6_LAST IFM(6, 0xFF) + +/* Family 15 - NetBurst */ +#define INTEL_P4_WILLAMETTE IFM(15, 0x01) /* Also Xeon Foster */ +#define INTEL_P4_PRESCOTT IFM(15, 0x03) +#define INTEL_P4_PRESCOTT_2M IFM(15, 0x04) +#define INTEL_P4_CEDARMILL IFM(15, 0x06) /* Also Xeon Dempsey */ + +/* Family 19 */ +#define INTEL_PANTHERCOVE_X IFM(19, 0x01) /* Diamond Rapids */ + +/* + * Intel CPU core types + * + * CPUID.1AH.EAX[31:0] uniquely identifies the microarchitecture + * of the core. Bits 31-24 indicates its core type (Core or Atom) + * and Bits [23:0] indicates the native model ID of the core. 
+ * Core type and native model ID are defined in below enumerations. + */ +enum intel_cpu_type { + INTEL_CPU_TYPE_UNKNOWN, + INTEL_CPU_TYPE_ATOM = 0x20, + INTEL_CPU_TYPE_CORE = 0x40, +}; + +enum intel_native_id { + INTEL_ATOM_CMT_NATIVE_ID = 0x2, /* Crestmont */ + INTEL_ATOM_SKT_NATIVE_ID = 0x3, /* Skymont */ +}; #endif /* _ASM_X86_INTEL_FAMILY_H */ diff --git a/arch/x86/include/asm/intel-mid.h b/arch/x86/include/asm/intel-mid.h index c201083b34f6..a3abdcd89a32 100644 --- a/arch/x86/include/asm/intel-mid.h +++ b/arch/x86/include/asm/intel-mid.h @@ -20,25 +20,4 @@ extern void intel_mid_pwr_power_off(void); extern int intel_mid_pwr_get_lss_id(struct pci_dev *pdev); -#ifdef CONFIG_X86_INTEL_MID - -extern void intel_scu_devices_create(void); -extern void intel_scu_devices_destroy(void); - -#else /* !CONFIG_X86_INTEL_MID */ - -static inline void intel_scu_devices_create(void) { } -static inline void intel_scu_devices_destroy(void) { } - -#endif /* !CONFIG_X86_INTEL_MID */ - -/* Bus Select SoC Fuse value */ -#define BSEL_SOC_FUSE_MASK 0x7 -/* FSB 133MHz */ -#define BSEL_SOC_FUSE_001 0x1 -/* FSB 100MHz */ -#define BSEL_SOC_FUSE_101 0x5 -/* FSB 83MHz */ -#define BSEL_SOC_FUSE_111 0x7 - #endif /* _ASM_X86_INTEL_MID_H */ diff --git a/arch/x86/include/asm/intel_ds.h b/arch/x86/include/asm/intel_ds.h index 2f9eeb5c3069..5dbeac48a5b9 100644 --- a/arch/x86/include/asm/intel_ds.h +++ b/arch/x86/include/asm/intel_ds.h @@ -9,6 +9,7 @@ /* The maximal number of PEBS events: */ #define MAX_PEBS_EVENTS_FMT4 8 #define MAX_PEBS_EVENTS 32 +#define MAX_PEBS_EVENTS_MASK GENMASK_ULL(MAX_PEBS_EVENTS - 1, 0) #define MAX_FIXED_PEBS_EVENTS 16 /* diff --git a/arch/x86/include/asm/intel_pconfig.h b/arch/x86/include/asm/intel_pconfig.h deleted file mode 100644 index 994638ef171b..000000000000 --- a/arch/x86/include/asm/intel_pconfig.h +++ /dev/null @@ -1,65 +0,0 @@ -#ifndef _ASM_X86_INTEL_PCONFIG_H -#define _ASM_X86_INTEL_PCONFIG_H - -#include <asm/asm.h> -#include <asm/processor.h> - -enum 
pconfig_target { - INVALID_TARGET = 0, - MKTME_TARGET = 1, - PCONFIG_TARGET_NR -}; - -int pconfig_target_supported(enum pconfig_target target); - -enum pconfig_leaf { - MKTME_KEY_PROGRAM = 0, - PCONFIG_LEAF_INVALID, -}; - -#define PCONFIG ".byte 0x0f, 0x01, 0xc5" - -/* Defines and structure for MKTME_KEY_PROGRAM of PCONFIG instruction */ - -/* mktme_key_program::keyid_ctrl COMMAND, bits [7:0] */ -#define MKTME_KEYID_SET_KEY_DIRECT 0 -#define MKTME_KEYID_SET_KEY_RANDOM 1 -#define MKTME_KEYID_CLEAR_KEY 2 -#define MKTME_KEYID_NO_ENCRYPT 3 - -/* mktme_key_program::keyid_ctrl ENC_ALG, bits [23:8] */ -#define MKTME_AES_XTS_128 (1 << 8) - -/* Return codes from the PCONFIG MKTME_KEY_PROGRAM */ -#define MKTME_PROG_SUCCESS 0 -#define MKTME_INVALID_PROG_CMD 1 -#define MKTME_ENTROPY_ERROR 2 -#define MKTME_INVALID_KEYID 3 -#define MKTME_INVALID_ENC_ALG 4 -#define MKTME_DEVICE_BUSY 5 - -/* Hardware requires the structure to be 256 byte aligned. Otherwise #GP(0). */ -struct mktme_key_program { - u16 keyid; - u32 keyid_ctrl; - u8 __rsvd[58]; - u8 key_field_1[64]; - u8 key_field_2[64]; -} __packed __aligned(256); - -static inline int mktme_key_program(struct mktme_key_program *key_program) -{ - unsigned long rax = MKTME_KEY_PROGRAM; - - if (!pconfig_target_supported(MKTME_TARGET)) - return -ENXIO; - - asm volatile(PCONFIG - : "=a" (rax), "=b" (key_program) - : "0" (rax), "1" (key_program) - : "memory", "cc"); - - return rax; -} - -#endif /* _ASM_X86_INTEL_PCONFIG_H */ diff --git a/arch/x86/include/asm/intel_punit_ipc.h b/arch/x86/include/asm/intel_punit_ipc.h index ce16da719596..1f9b5d225912 100644 --- a/arch/x86/include/asm/intel_punit_ipc.h +++ b/arch/x86/include/asm/intel_punit_ipc.h @@ -80,17 +80,10 @@ typedef enum { #if IS_ENABLED(CONFIG_INTEL_PUNIT_IPC) -int intel_punit_ipc_simple_command(int cmd, int para1, int para2); int intel_punit_ipc_command(u32 cmd, u32 para1, u32 para2, u32 *in, u32 *out); #else -static inline int intel_punit_ipc_simple_command(int cmd, - int para1, 
int para2) -{ - return -ENODEV; -} - static inline int intel_punit_ipc_command(u32 cmd, u32 para1, u32 para2, u32 *in, u32 *out) { diff --git a/arch/x86/include/asm/intel_scu_ipc.h b/arch/x86/include/asm/intel_scu_ipc.h deleted file mode 100644 index 8537f597d20a..000000000000 --- a/arch/x86/include/asm/intel_scu_ipc.h +++ /dev/null @@ -1,68 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -#ifndef _ASM_X86_INTEL_SCU_IPC_H_ -#define _ASM_X86_INTEL_SCU_IPC_H_ - -#include <linux/ioport.h> - -struct device; -struct intel_scu_ipc_dev; - -/** - * struct intel_scu_ipc_data - Data used to configure SCU IPC - * @mem: Base address of SCU IPC MMIO registers - * @irq: The IRQ number used for SCU (optional) - */ -struct intel_scu_ipc_data { - struct resource mem; - int irq; -}; - -struct intel_scu_ipc_dev * -__intel_scu_ipc_register(struct device *parent, - const struct intel_scu_ipc_data *scu_data, - struct module *owner); - -#define intel_scu_ipc_register(parent, scu_data) \ - __intel_scu_ipc_register(parent, scu_data, THIS_MODULE) - -void intel_scu_ipc_unregister(struct intel_scu_ipc_dev *scu); - -struct intel_scu_ipc_dev * -__devm_intel_scu_ipc_register(struct device *parent, - const struct intel_scu_ipc_data *scu_data, - struct module *owner); - -#define devm_intel_scu_ipc_register(parent, scu_data) \ - __devm_intel_scu_ipc_register(parent, scu_data, THIS_MODULE) - -struct intel_scu_ipc_dev *intel_scu_ipc_dev_get(void); -void intel_scu_ipc_dev_put(struct intel_scu_ipc_dev *scu); -struct intel_scu_ipc_dev *devm_intel_scu_ipc_dev_get(struct device *dev); - -int intel_scu_ipc_dev_ioread8(struct intel_scu_ipc_dev *scu, u16 addr, - u8 *data); -int intel_scu_ipc_dev_iowrite8(struct intel_scu_ipc_dev *scu, u16 addr, - u8 data); -int intel_scu_ipc_dev_readv(struct intel_scu_ipc_dev *scu, u16 *addr, - u8 *data, size_t len); -int intel_scu_ipc_dev_writev(struct intel_scu_ipc_dev *scu, u16 *addr, - u8 *data, size_t len); - -int intel_scu_ipc_dev_update(struct intel_scu_ipc_dev *scu, 
u16 addr, - u8 data, u8 mask); - -int intel_scu_ipc_dev_simple_command(struct intel_scu_ipc_dev *scu, int cmd, - int sub); -int intel_scu_ipc_dev_command_with_size(struct intel_scu_ipc_dev *scu, int cmd, - int sub, const void *in, size_t inlen, - size_t size, void *out, size_t outlen); - -static inline int intel_scu_ipc_dev_command(struct intel_scu_ipc_dev *scu, int cmd, - int sub, const void *in, size_t inlen, - void *out, size_t outlen) -{ - return intel_scu_ipc_dev_command_with_size(scu, cmd, sub, in, inlen, - inlen, out, outlen); -} - -#endif diff --git a/arch/x86/include/asm/intel_telemetry.h b/arch/x86/include/asm/intel_telemetry.h index 8046e70dfd7c..43b7657febca 100644 --- a/arch/x86/include/asm/intel_telemetry.h +++ b/arch/x86/include/asm/intel_telemetry.h @@ -10,7 +10,7 @@ #define TELEM_MAX_EVENTS_SRAM 28 #define TELEM_MAX_OS_ALLOCATED_EVENTS 20 -#include <asm/intel_scu_ipc.h> +#include <linux/platform_data/x86/intel_scu_ipc.h> enum telemetry_unit { TELEM_PSS = 0, diff --git a/arch/x86/include/asm/io.h b/arch/x86/include/asm/io.h index 1870b99c3356..ca309a3227c7 100644 --- a/arch/x86/include/asm/io.h +++ b/arch/x86/include/asm/io.h @@ -35,9 +35,6 @@ * - Arnaldo Carvalho de Melo <acme@conectiva.com.br> */ -#define ARCH_HAS_IOREMAP_WC -#define ARCH_HAS_IOREMAP_WT - #include <linux/string.h> #include <linux/compiler.h> #include <linux/cc_platform.h> @@ -45,6 +42,7 @@ #include <asm/early_ioremap.h> #include <asm/pgtable_types.h> #include <asm/shared/io.h> +#include <asm/special_insns.h> #define build_mmio_read(name, size, type, reg, barrier) \ static inline type name(const volatile void __iomem *addr) \ @@ -154,11 +152,6 @@ static inline void *phys_to_virt(phys_addr_t address) #define phys_to_virt phys_to_virt /* - * Change "struct page" to physical address. - */ -#define page_to_phys(page) ((dma_addr_t)page_to_pfn(page) << PAGE_SHIFT) - -/* * ISA I/O bus memory addresses are 1:1 with the physical address. 
* However, we truncate the address to unsigned int to avoid undesirable * promotions in legacy drivers. @@ -170,15 +163,6 @@ static inline unsigned int isa_virt_to_bus(volatile void *address) #define isa_bus_to_virt phys_to_virt /* - * However PCI ones are not necessarily 1:1 and therefore these interfaces - * are forbidden in portable PCI drivers. - * - * Allow them on x86 for legacy drivers, though. - */ -#define virt_to_bus virt_to_phys -#define bus_to_virt phys_to_virt - -/* * The default ioremap() behavior is non-cached; if you need something * else, you probably want one of the following. */ @@ -186,11 +170,14 @@ extern void __iomem *ioremap_uc(resource_size_t offset, unsigned long size); #define ioremap_uc ioremap_uc extern void __iomem *ioremap_cache(resource_size_t offset, unsigned long size); #define ioremap_cache ioremap_cache -extern void __iomem *ioremap_prot(resource_size_t offset, unsigned long size, unsigned long prot_val); +extern void __iomem *ioremap_prot(resource_size_t offset, unsigned long size, pgprot_t prot); #define ioremap_prot ioremap_prot extern void __iomem *ioremap_encrypted(resource_size_t phys_addr, unsigned long size); #define ioremap_encrypted ioremap_encrypted +void *arch_memremap_wb(phys_addr_t phys_addr, size_t size, unsigned long flags); +#define arch_memremap_wb arch_memremap_wb + /** * ioremap - map bus memory into CPU space * @offset: bus address of the memory @@ -221,7 +208,22 @@ void memset_io(volatile void __iomem *, int, size_t); #define memcpy_toio memcpy_toio #define memset_io memset_io -#include <asm-generic/iomap.h> +#ifdef CONFIG_X86_64 +/* + * Commit 0f07496144c2 ("[PATCH] Add faster __iowrite32_copy routine for + * x86_64") says that circa 2006 rep movsl is noticeably faster than a copy + * loop. 
+ */ +static inline void __iowrite32_copy(void __iomem *to, const void *from, + size_t count) +{ + asm volatile("rep movsl" + : "=&c"(count), "=&D"(to), "=&S"(from) + : "0"(count), "1"(to), "2"(from) + : "memory"); +} +#define __iowrite32_copy __iowrite32_copy +#endif /* * ISA space is 'always mapped' on a typical x86 system, no need to @@ -256,7 +258,7 @@ static inline void slow_down_io(void) #endif -#define BUILDIO(bwl, bw, type) \ +#define BUILDIO(bwl, type) \ static inline void out##bwl##_p(type value, u16 port) \ { \ out##bwl(value, port); \ @@ -280,7 +282,7 @@ static inline void outs##bwl(u16 port, const void *addr, unsigned long count) \ count--; \ } \ } else { \ - asm volatile("rep; outs" #bwl \ + asm volatile("rep outs" #bwl \ : "+S"(addr), "+c"(count) \ : "d"(port) : "memory"); \ } \ @@ -296,15 +298,15 @@ static inline void ins##bwl(u16 port, void *addr, unsigned long count) \ count--; \ } \ } else { \ - asm volatile("rep; ins" #bwl \ + asm volatile("rep ins" #bwl \ : "+D"(addr), "+c"(count) \ : "d"(port) : "memory"); \ } \ } -BUILDIO(b, b, u8) -BUILDIO(w, w, u16) -BUILDIO(l, , u32) +BUILDIO(b, u8) +BUILDIO(w, u16) +BUILDIO(l, u32) #undef BUILDIO #define inb_p inb_p @@ -393,7 +395,7 @@ static inline void iosubmit_cmds512(void __iomem *dst, const void *src, const u8 *end = from + count * 64; while (from < end) { - movdir64b(dst, from); + movdir64b_io(dst, from); from += 64; } } diff --git a/arch/x86/include/asm/io_apic.h b/arch/x86/include/asm/io_apic.h index 437aa8d00e53..0d806513c4b3 100644 --- a/arch/x86/include/asm/io_apic.h +++ b/arch/x86/include/asm/io_apic.h @@ -109,8 +109,8 @@ extern int mp_irq_entries; /* MP IRQ source entries */ extern struct mpc_intsrc mp_irqs[MAX_IRQ_SOURCES]; -/* 1 if "noapic" boot option passed */ -extern int skip_ioapic_setup; +/* True if "noapic" boot option passed */ +extern bool ioapic_is_disabled; /* 1 if "noapic" boot option passed */ extern int noioapicquirk; @@ -129,7 +129,7 @@ extern unsigned long io_apic_irqs; * 
assignment of PCI IRQ's. */ #define io_apic_assign_pci_irqs \ - (mp_irq_entries && !skip_ioapic_setup && io_apic_irqs) + (mp_irq_entries && !ioapic_is_disabled && io_apic_irqs) struct irq_cfg; extern void ioapic_insert_resources(void); @@ -140,7 +140,6 @@ extern void mask_ioapic_entries(void); extern int restore_ioapic_entries(void); extern void setup_ioapic_ids_from_mpc(void); -extern void setup_ioapic_ids_from_mpc_nocheck(void); extern int mp_find_ioapic(u32 gsi); extern int mp_find_ioapic_pin(int ioapic, u32 gsi); @@ -179,6 +178,7 @@ extern void print_IO_APICs(void); #define IO_APIC_IRQ(x) 0 #define io_apic_assign_pci_irqs 0 #define setup_ioapic_ids_from_mpc x86_init_noop +#define nr_ioapics (0) static inline void ioapic_insert_resources(void) { } static inline int arch_early_ioapic_init(void) { return 0; } static inline void print_IO_APICs(void) {} diff --git a/arch/x86/include/asm/iommu.h b/arch/x86/include/asm/iommu.h index 0bef44d30a27..3be2451e7bc8 100644 --- a/arch/x86/include/asm/iommu.h +++ b/arch/x86/include/asm/iommu.h @@ -10,6 +10,7 @@ extern int force_iommu, no_iommu; extern int iommu_detected; extern int iommu_merge; extern int panic_on_overflow; +extern bool amd_iommu_snp_en; #ifdef CONFIG_SWIOTLB extern bool x86_swiotlb_enable; @@ -25,8 +26,10 @@ arch_rmrr_sanity_check(struct acpi_dmar_reserved_memory *rmrr) { u64 start = rmrr->base_address; u64 end = rmrr->end_address + 1; + int entry_type; - if (e820__mapped_all(start, end, E820_TYPE_RESERVED)) + entry_type = e820__get_entry_type(start, end); + if (entry_type == E820_TYPE_RESERVED || entry_type == E820_TYPE_NVS) return 0; pr_err(FW_BUG "No firmware reserved region can cover this RMRR [%#018Lx-%#018Lx], contact BIOS vendor for fixes\n", diff --git a/arch/x86/include/asm/iosf_mbi.h b/arch/x86/include/asm/iosf_mbi.h index a1911fea8739..8ace6559d399 100644 --- a/arch/x86/include/asm/iosf_mbi.h +++ b/arch/x86/include/asm/iosf_mbi.h @@ -111,7 +111,7 @@ int iosf_mbi_modify(u8 port, u8 opcode, u32 
offset, u32 mdr, u32 mask); * This function will block all kernel access to the PMIC I2C bus, so that the * P-Unit can safely access the PMIC over the shared I2C bus. * - * Note on these systems the i2c-bus driver will request a sempahore from the + * Note on these systems the i2c-bus driver will request a semaphore from the * P-Unit for exclusive access to the PMIC bus when i2c drivers are accessing * it, but this does not appear to be sufficient, we still need to avoid making * certain P-Unit requests during the access window to avoid problems. @@ -168,13 +168,6 @@ void iosf_mbi_unblock_punit_i2c_access(void); int iosf_mbi_register_pmic_bus_access_notifier(struct notifier_block *nb); /** - * iosf_mbi_register_pmic_bus_access_notifier - Unregister PMIC bus notifier - * - * @nb: notifier_block to unregister - */ -int iosf_mbi_unregister_pmic_bus_access_notifier(struct notifier_block *nb); - -/** * iosf_mbi_unregister_pmic_bus_access_notifier_unlocked - Unregister PMIC bus * notifier, unlocked * diff --git a/arch/x86/include/asm/irq.h b/arch/x86/include/asm/irq.h index 768aa234cbb4..194dfff84cb1 100644 --- a/arch/x86/include/asm/irq.h +++ b/arch/x86/include/asm/irq.h @@ -29,7 +29,7 @@ struct irq_desc; extern void fixup_irqs(void); -#ifdef CONFIG_HAVE_KVM +#if IS_ENABLED(CONFIG_KVM) extern void kvm_set_posted_intr_wakeup_handler(void (*handler)(void)); #endif @@ -40,11 +40,9 @@ extern void __handle_irq(struct irq_desc *desc, struct pt_regs *regs); extern void init_ISA_irqs(void); -extern void __init init_IRQ(void); - #ifdef CONFIG_X86_LOCAL_APIC void arch_trigger_cpumask_backtrace(const struct cpumask *mask, - bool exclude_self); + int exclude_cpu); #define arch_trigger_cpumask_backtrace arch_trigger_cpumask_backtrace #endif diff --git a/arch/x86/include/asm/irq_remapping.h b/arch/x86/include/asm/irq_remapping.h index 7cc49432187f..5036f13ab69f 100644 --- a/arch/x86/include/asm/irq_remapping.h +++ b/arch/x86/include/asm/irq_remapping.h @@ -44,16 +44,19 @@ extern int 
irq_remapping_reenable(int); extern int irq_remap_enable_fault_handling(void); extern void panic_if_irq_remap(const char *msg); -/* Create PCI MSI/MSIx irqdomain, use @parent as the parent irqdomain. */ -extern struct irq_domain * -arch_create_remap_msi_irq_domain(struct irq_domain *par, const char *n, int id); - /* Get parent irqdomain for interrupt remapping irqdomain */ static inline struct irq_domain *arch_get_ir_parent_domain(void) { return x86_vector_domain; } +extern bool enable_posted_msi; + +static inline bool posted_msi_supported(void) +{ + return enable_posted_msi && irq_remapping_cap(IRQ_POSTING_CAP); +} + #else /* CONFIG_IRQ_REMAP */ static inline bool irq_remapping_cap(enum irq_remap_cap cap) { return 0; } diff --git a/arch/x86/include/asm/irq_stack.h b/arch/x86/include/asm/irq_stack.h index 63f818aedf77..735c3a491f60 100644 --- a/arch/x86/include/asm/irq_stack.h +++ b/arch/x86/include/asm/irq_stack.h @@ -100,8 +100,8 @@ } #define ASM_CALL_ARG0 \ - "call %P[__func] \n" \ - ASM_REACHABLE + "1: call %c[__func] \n" \ + ANNOTATE_REACHABLE(1b) #define ASM_CALL_ARG1 \ "movq %[arg1], %%rdi \n" \ @@ -203,7 +203,7 @@ IRQ_CONSTRAINTS, regs, vector); \ } -#ifndef CONFIG_PREEMPT_RT +#ifdef CONFIG_SOFTIRQ_ON_OWN_STACK /* * Macro to invoke __do_softirq on the irq stack. This is only called from * task context when bottom halves are about to be reenabled and soft diff --git a/arch/x86/include/asm/irq_vectors.h b/arch/x86/include/asm/irq_vectors.h index 43dcb9284208..47051871b436 100644 --- a/arch/x86/include/asm/irq_vectors.h +++ b/arch/x86/include/asm/irq_vectors.h @@ -18,8 +18,8 @@ * Vectors 0 ... 31 : system traps and exceptions - hardcoded events * Vectors 32 ... 127 : device interrupts * Vector 128 : legacy int80 syscall interface - * Vectors 129 ... LOCAL_TIMER_VECTOR-1 - * Vectors LOCAL_TIMER_VECTOR ... 255 : special interrupts + * Vectors 129 ... FIRST_SYSTEM_VECTOR-1 : device interrupts + * Vectors FIRST_SYSTEM_VECTOR ... 
255 : special interrupts * * 64-bit x86 has per CPU IDT tables, 32-bit has one shared IDT table. * @@ -35,13 +35,6 @@ */ #define FIRST_EXTERNAL_VECTOR 0x20 -/* - * Reserve the lowest usable vector (and hence lowest priority) 0x20 for - * triggering cleanup after irq migration. 0x21-0x2f will still be used - * for device interrupts. - */ -#define IRQ_MOVE_CLEANUP_VECTOR FIRST_EXTERNAL_VECTOR - #define IA32_SYSCALL_VECTOR 0x80 /* @@ -91,11 +84,9 @@ #define HYPERVISOR_CALLBACK_VECTOR 0xf3 /* Vector for KVM to deliver posted interrupt IPI */ -#ifdef CONFIG_HAVE_KVM #define POSTED_INTR_VECTOR 0xf2 #define POSTED_INTR_WAKEUP_VECTOR 0xf1 #define POSTED_INTR_NESTED_VECTOR 0xf0 -#endif #define MANAGED_IRQ_SHUTDOWN_VECTOR 0xef @@ -106,10 +97,16 @@ #define LOCAL_TIMER_VECTOR 0xec +/* + * Posted interrupt notification vector for all device MSIs delivered to + * the host kernel. + */ +#define POSTED_MSI_NOTIFICATION_VECTOR 0xeb + #define NR_VECTORS 256 #ifdef CONFIG_X86_LOCAL_APIC -#define FIRST_SYSTEM_VECTOR LOCAL_TIMER_VECTOR +#define FIRST_SYSTEM_VECTOR POSTED_MSI_NOTIFICATION_VECTOR #else #define FIRST_SYSTEM_VECTOR NR_VECTORS #endif diff --git a/arch/x86/include/asm/irq_work.h b/arch/x86/include/asm/irq_work.h index 800ffce0db29..6b4d36c95165 100644 --- a/arch/x86/include/asm/irq_work.h +++ b/arch/x86/include/asm/irq_work.h @@ -9,7 +9,6 @@ static inline bool arch_irq_work_has_interrupt(void) { return boot_cpu_has(X86_FEATURE_APIC); } -extern void arch_irq_work_raise(void); #else static inline bool arch_irq_work_has_interrupt(void) { diff --git a/arch/x86/include/asm/irqdomain.h b/arch/x86/include/asm/irqdomain.h index 125c23b7bad3..30c325c235c0 100644 --- a/arch/x86/include/asm/irqdomain.h +++ b/arch/x86/include/asm/irqdomain.h @@ -7,9 +7,7 @@ #ifdef CONFIG_X86_LOCAL_APIC enum { - /* Allocate contiguous CPU vectors */ - X86_IRQ_ALLOC_CONTIGUOUS_VECTORS = 0x1, - X86_IRQ_ALLOC_LEGACY = 0x2, + X86_IRQ_ALLOC_LEGACY = 0x1, }; extern int x86_fwspec_is_ioapic(struct irq_fwspec 
*fwspec); diff --git a/arch/x86/include/asm/irqflags.h b/arch/x86/include/asm/irqflags.h index 7793e52d6237..9a9b21b78905 100644 --- a/arch/x86/include/asm/irqflags.h +++ b/arch/x86/include/asm/irqflags.h @@ -4,13 +4,10 @@ #include <asm/processor-flags.h> -#ifndef __ASSEMBLY__ +#ifndef __ASSEMBLER__ #include <asm/nospec-branch.h> -/* Provide __cpuidle; we can't safely include <linux/cpu.h> */ -#define __cpuidle __section(".cpuidle.text") - /* * Interrupt control: */ @@ -45,46 +42,47 @@ static __always_inline void native_irq_enable(void) asm volatile("sti": : :"memory"); } -static inline __cpuidle void native_safe_halt(void) +static __always_inline void native_safe_halt(void) { mds_idle_clear_cpu_buffers(); asm volatile("sti; hlt": : :"memory"); } -static inline __cpuidle void native_halt(void) +static __always_inline void native_halt(void) { mds_idle_clear_cpu_buffers(); asm volatile("hlt": : :"memory"); } -#endif - -#ifdef CONFIG_PARAVIRT_XXL -#include <asm/paravirt.h> -#else -#ifndef __ASSEMBLY__ -#include <linux/types.h> - -static __always_inline unsigned long arch_local_save_flags(void) +static __always_inline int native_irqs_disabled_flags(unsigned long flags) { - return native_save_fl(); + return !(flags & X86_EFLAGS_IF); } -static __always_inline void arch_local_irq_disable(void) +static __always_inline unsigned long native_local_irq_save(void) { + unsigned long flags = native_save_fl(); + native_irq_disable(); + + return flags; } -static __always_inline void arch_local_irq_enable(void) +static __always_inline void native_local_irq_restore(unsigned long flags) { - native_irq_enable(); + if (!native_irqs_disabled_flags(flags)) + native_irq_enable(); } +#endif + +#ifndef CONFIG_PARAVIRT +#ifndef __ASSEMBLY__ /* * Used in the idle loop; sti takes one instruction cycle * to complete: */ -static inline __cpuidle void arch_safe_halt(void) +static __always_inline void arch_safe_halt(void) { native_safe_halt(); } @@ -93,10 +91,33 @@ static inline __cpuidle void 
arch_safe_halt(void) * Used when interrupts are already enabled or to * shutdown the processor: */ -static inline __cpuidle void halt(void) +static __always_inline void halt(void) { native_halt(); } +#endif /* __ASSEMBLY__ */ +#endif /* CONFIG_PARAVIRT */ + +#ifdef CONFIG_PARAVIRT_XXL +#include <asm/paravirt.h> +#else +#ifndef __ASSEMBLER__ +#include <linux/types.h> + +static __always_inline unsigned long arch_local_save_flags(void) +{ + return native_save_fl(); +} + +static __always_inline void arch_local_irq_disable(void) +{ + native_irq_disable(); +} + +static __always_inline void arch_local_irq_enable(void) +{ + native_irq_enable(); +} /* * For spinlocks, etc: @@ -116,10 +137,10 @@ static __always_inline unsigned long arch_local_irq_save(void) #endif -#endif /* __ASSEMBLY__ */ +#endif /* __ASSEMBLER__ */ #endif /* CONFIG_PARAVIRT_XXL */ -#ifndef __ASSEMBLY__ +#ifndef __ASSEMBLER__ static __always_inline int arch_irqs_disabled_flags(unsigned long flags) { return !(flags & X86_EFLAGS_IF); @@ -137,6 +158,6 @@ static __always_inline void arch_local_irq_restore(unsigned long flags) if (!arch_irqs_disabled_flags(flags)) arch_local_irq_enable(); } -#endif /* !__ASSEMBLY__ */ +#endif /* !__ASSEMBLER__ */ #endif diff --git a/arch/x86/include/asm/jump_label.h b/arch/x86/include/asm/jump_label.h index 071572e23d3a..61dd1dee7812 100644 --- a/arch/x86/include/asm/jump_label.h +++ b/arch/x86/include/asm/jump_label.h @@ -7,40 +7,33 @@ #include <asm/asm.h> #include <asm/nops.h> -#ifndef __ASSEMBLY__ +#ifndef __ASSEMBLER__ #include <linux/stringify.h> #include <linux/types.h> -#define JUMP_TABLE_ENTRY \ +#define JUMP_TABLE_ENTRY(key, label) \ ".pushsection __jump_table, \"aw\" \n\t" \ _ASM_ALIGN "\n\t" \ ".long 1b - . \n\t" \ - ".long %l[l_yes] - . \n\t" \ - _ASM_PTR "%c0 + %c1 - .\n\t" \ + ".long " label " - . \n\t" \ + _ASM_PTR " " key " - . \n\t" \ ".popsection \n\t" +/* This macro is also expanded on the Rust side. 
*/ #ifdef CONFIG_HAVE_JUMP_LABEL_HACK - -static __always_inline bool arch_static_branch(struct static_key *key, bool branch) -{ - asm_volatile_goto("1:" - "jmp %l[l_yes] # objtool NOPs this \n\t" - JUMP_TABLE_ENTRY - : : "i" (key), "i" (2 | branch) : : l_yes); - - return false; -l_yes: - return true; -} - +#define ARCH_STATIC_BRANCH_ASM(key, label) \ + "1: jmp " label " # objtool NOPs this \n\t" \ + JUMP_TABLE_ENTRY(key " + 2", label) #else /* !CONFIG_HAVE_JUMP_LABEL_HACK */ +#define ARCH_STATIC_BRANCH_ASM(key, label) \ + "1: .byte " __stringify(BYTES_NOP5) "\n\t" \ + JUMP_TABLE_ENTRY(key, label) +#endif /* CONFIG_HAVE_JUMP_LABEL_HACK */ static __always_inline bool arch_static_branch(struct static_key * const key, const bool branch) { - asm_volatile_goto("1:" - ".byte " __stringify(BYTES_NOP5) "\n\t" - JUMP_TABLE_ENTRY + asm goto(ARCH_STATIC_BRANCH_ASM("%c0 + %c1", "%l[l_yes]") : : "i" (key), "i" (branch) : : l_yes); return false; @@ -48,13 +41,11 @@ l_yes: return true; } -#endif /* CONFIG_HAVE_JUMP_LABEL_HACK */ - static __always_inline bool arch_static_branch_jump(struct static_key * const key, const bool branch) { - asm_volatile_goto("1:" + asm goto("1:" "jmp %l[l_yes]\n\t" - JUMP_TABLE_ENTRY + JUMP_TABLE_ENTRY("%c0 + %c1", "%l[l_yes]") : : "i" (key), "i" (branch) : : l_yes); return false; @@ -64,6 +55,6 @@ l_yes: extern int arch_jump_entry_size(struct jump_entry *entry); -#endif /* __ASSEMBLY__ */ +#endif /* __ASSEMBLER__ */ #endif diff --git a/arch/x86/include/asm/kasan.h b/arch/x86/include/asm/kasan.h index 13e70da38bed..d7e33c7f096b 100644 --- a/arch/x86/include/asm/kasan.h +++ b/arch/x86/include/asm/kasan.h @@ -23,14 +23,17 @@ (1ULL << (__VIRTUAL_MASK_SHIFT - \ KASAN_SHADOW_SCALE_SHIFT))) -#ifndef __ASSEMBLY__ +#ifndef __ASSEMBLER__ #ifdef CONFIG_KASAN void __init kasan_early_init(void); void __init kasan_init(void); +void __init kasan_populate_shadow_for_vaddr(void *va, size_t size, int nid); #else static inline void kasan_early_init(void) { } static 
inline void kasan_init(void) { } +static inline void kasan_populate_shadow_for_vaddr(void *va, size_t size, + int nid) { } #endif #endif diff --git a/arch/x86/include/asm/kexec.h b/arch/x86/include/asm/kexec.h index a3760ca796aa..f2ad77929d6e 100644 --- a/arch/x86/include/asm/kexec.h +++ b/arch/x86/include/asm/kexec.h @@ -9,23 +9,21 @@ # define PA_SWAP_PAGE 3 # define PAGES_NR 4 #else -# define PA_CONTROL_PAGE 0 -# define VA_CONTROL_PAGE 1 -# define PA_TABLE_PAGE 2 -# define PA_SWAP_PAGE 3 -# define PAGES_NR 4 +/* Size of each exception handler referenced by the IDT */ +# define KEXEC_DEBUG_EXC_HANDLER_SIZE 6 /* PUSHI, PUSHI, 2-byte JMP */ #endif +# define KEXEC_CONTROL_PAGE_SIZE 4096 # define KEXEC_CONTROL_CODE_MAX_SIZE 2048 -#ifndef __ASSEMBLY__ +#ifndef __ASSEMBLER__ #include <linux/string.h> #include <linux/kernel.h> +#include <asm/asm.h> #include <asm/page.h> #include <asm/ptrace.h> -#include <asm/bootparam.h> struct kimage; @@ -44,7 +42,6 @@ struct kimage; /* Maximum address we can use for the control code buffer */ # define KEXEC_CONTROL_MEMORY_LIMIT TASK_SIZE -# define KEXEC_CONTROL_PAGE_SIZE 4096 /* The native architecture */ # define KEXEC_ARCH KEXEC_ARCH_386 @@ -59,11 +56,16 @@ struct kimage; /* Maximum address we can use for the control pages */ # define KEXEC_CONTROL_MEMORY_LIMIT (MAXMEM-1) -/* Allocate one page for the pdp and the second for the code */ -# define KEXEC_CONTROL_PAGE_SIZE (4096UL + 4096UL) - /* The native architecture */ # define KEXEC_ARCH KEXEC_ARCH_X86_64 + +extern unsigned long kexec_va_control_page; +extern unsigned long kexec_pa_table_page; +extern unsigned long kexec_pa_swap_page; +extern gate_desc kexec_debug_idt[]; +extern unsigned char kexec_debug_exc_vectors[]; +extern uint16_t kexec_debug_8250_port; +extern unsigned long kexec_debug_8250_mmio32; #endif /* @@ -77,61 +79,52 @@ static inline void crash_setup_regs(struct pt_regs *newregs, if (oldregs) { memcpy(newregs, oldregs, sizeof(*newregs)); } else { + asm volatile("mov %%" 
_ASM_BX ",%0" : "=m"(newregs->bx)); + asm volatile("mov %%" _ASM_CX ",%0" : "=m"(newregs->cx)); + asm volatile("mov %%" _ASM_DX ",%0" : "=m"(newregs->dx)); + asm volatile("mov %%" _ASM_SI ",%0" : "=m"(newregs->si)); + asm volatile("mov %%" _ASM_DI ",%0" : "=m"(newregs->di)); + asm volatile("mov %%" _ASM_BP ",%0" : "=m"(newregs->bp)); + asm volatile("mov %%" _ASM_AX ",%0" : "=m"(newregs->ax)); + asm volatile("mov %%" _ASM_SP ",%0" : "=m"(newregs->sp)); +#ifdef CONFIG_X86_64 + asm volatile("mov %%r8,%0" : "=m"(newregs->r8)); + asm volatile("mov %%r9,%0" : "=m"(newregs->r9)); + asm volatile("mov %%r10,%0" : "=m"(newregs->r10)); + asm volatile("mov %%r11,%0" : "=m"(newregs->r11)); + asm volatile("mov %%r12,%0" : "=m"(newregs->r12)); + asm volatile("mov %%r13,%0" : "=m"(newregs->r13)); + asm volatile("mov %%r14,%0" : "=m"(newregs->r14)); + asm volatile("mov %%r15,%0" : "=m"(newregs->r15)); +#endif + asm volatile("mov %%ss,%k0" : "=a"(newregs->ss)); + asm volatile("mov %%cs,%k0" : "=a"(newregs->cs)); #ifdef CONFIG_X86_32 - asm volatile("movl %%ebx,%0" : "=m"(newregs->bx)); - asm volatile("movl %%ecx,%0" : "=m"(newregs->cx)); - asm volatile("movl %%edx,%0" : "=m"(newregs->dx)); - asm volatile("movl %%esi,%0" : "=m"(newregs->si)); - asm volatile("movl %%edi,%0" : "=m"(newregs->di)); - asm volatile("movl %%ebp,%0" : "=m"(newregs->bp)); - asm volatile("movl %%eax,%0" : "=m"(newregs->ax)); - asm volatile("movl %%esp,%0" : "=m"(newregs->sp)); - asm volatile("movl %%ss, %%eax;" :"=a"(newregs->ss)); - asm volatile("movl %%cs, %%eax;" :"=a"(newregs->cs)); - asm volatile("movl %%ds, %%eax;" :"=a"(newregs->ds)); - asm volatile("movl %%es, %%eax;" :"=a"(newregs->es)); - asm volatile("pushfl; popl %0" :"=m"(newregs->flags)); -#else - asm volatile("movq %%rbx,%0" : "=m"(newregs->bx)); - asm volatile("movq %%rcx,%0" : "=m"(newregs->cx)); - asm volatile("movq %%rdx,%0" : "=m"(newregs->dx)); - asm volatile("movq %%rsi,%0" : "=m"(newregs->si)); - asm volatile("movq %%rdi,%0" : 
"=m"(newregs->di)); - asm volatile("movq %%rbp,%0" : "=m"(newregs->bp)); - asm volatile("movq %%rax,%0" : "=m"(newregs->ax)); - asm volatile("movq %%rsp,%0" : "=m"(newregs->sp)); - asm volatile("movq %%r8,%0" : "=m"(newregs->r8)); - asm volatile("movq %%r9,%0" : "=m"(newregs->r9)); - asm volatile("movq %%r10,%0" : "=m"(newregs->r10)); - asm volatile("movq %%r11,%0" : "=m"(newregs->r11)); - asm volatile("movq %%r12,%0" : "=m"(newregs->r12)); - asm volatile("movq %%r13,%0" : "=m"(newregs->r13)); - asm volatile("movq %%r14,%0" : "=m"(newregs->r14)); - asm volatile("movq %%r15,%0" : "=m"(newregs->r15)); - asm volatile("movl %%ss, %%eax;" :"=a"(newregs->ss)); - asm volatile("movl %%cs, %%eax;" :"=a"(newregs->cs)); - asm volatile("pushfq; popq %0" :"=m"(newregs->flags)); + asm volatile("mov %%ds,%k0" : "=a"(newregs->ds)); + asm volatile("mov %%es,%k0" : "=a"(newregs->es)); #endif + asm volatile("pushf\n\t" + "pop %0" : "=m"(newregs->flags)); newregs->ip = _THIS_IP_; } } #ifdef CONFIG_X86_32 -asmlinkage unsigned long -relocate_kernel(unsigned long indirection_page, - unsigned long control_page, - unsigned long start_address, - unsigned int has_pae, - unsigned int preserve_context); +typedef asmlinkage unsigned long +relocate_kernel_fn(unsigned long indirection_page, + unsigned long control_page, + unsigned long start_address, + unsigned int has_pae, + unsigned int preserve_context); #else -unsigned long -relocate_kernel(unsigned long indirection_page, - unsigned long page_list, - unsigned long start_address, - unsigned int preserve_context, - unsigned int host_mem_enc_active); +typedef unsigned long +relocate_kernel_fn(unsigned long indirection_page, + unsigned long pa_control_page, + unsigned long start_address, + unsigned int preserve_context, + unsigned int host_mem_enc_active); #endif - +extern relocate_kernel_fn relocate_kernel; #define ARCH_HAS_KIMAGE_ARCH #ifdef CONFIG_X86_32 @@ -146,6 +139,19 @@ struct kimage_arch { }; #else struct kimage_arch { + /* + * This is a 
kimage control page, as it must not overlap with either + * source or destination address ranges. + */ + pgd_t *pgd; + /* + * The virtual mapping of the control code page itself is used only + * during the transition, while the current kernel's pages are all + * in place. Thus the intermediate page table pages used to map it + * are not control pages, but instead just normal pages obtained + * with get_zeroed_page(). And have to be tracked (below) so that + * they can be freed. + */ p4d_t *p4d; pud_t *pud; pmd_t *pmd; @@ -200,18 +206,24 @@ int arch_kexec_apply_relocations_add(struct purgatory_info *pi, const Elf_Shdr *symtab); #define arch_kexec_apply_relocations_add arch_kexec_apply_relocations_add -void *arch_kexec_kernel_image_load(struct kimage *image); -#define arch_kexec_kernel_image_load arch_kexec_kernel_image_load - int arch_kimage_file_post_load_cleanup(struct kimage *image); #define arch_kimage_file_post_load_cleanup arch_kimage_file_post_load_cleanup #endif #endif -typedef void crash_vmclear_fn(void); -extern crash_vmclear_fn __rcu *crash_vmclear_loaded_vmcss; extern void kdump_nmi_shootdown_cpus(void); -#endif /* __ASSEMBLY__ */ +#ifdef CONFIG_CRASH_HOTPLUG +void arch_crash_handle_hotplug_event(struct kimage *image, void *arg); +#define arch_crash_handle_hotplug_event arch_crash_handle_hotplug_event + +int arch_crash_hotplug_support(struct kimage *image, unsigned long kexec_flags); +#define arch_crash_hotplug_support arch_crash_hotplug_support + +unsigned int arch_crash_get_elfcorehdr_size(void); +#define crash_get_elfcorehdr_size arch_crash_get_elfcorehdr_size +#endif + +#endif /* __ASSEMBLER__ */ #endif /* _ASM_X86_KEXEC_H */ diff --git a/arch/x86/include/asm/kmsan.h b/arch/x86/include/asm/kmsan.h new file mode 100644 index 000000000000..d91b37f5b4bb --- /dev/null +++ b/arch/x86/include/asm/kmsan.h @@ -0,0 +1,102 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * x86 KMSAN support. 
+ * + * Copyright (C) 2022, Google LLC + * Author: Alexander Potapenko <glider@google.com> + */ + +#ifndef _ASM_X86_KMSAN_H +#define _ASM_X86_KMSAN_H + +#ifndef MODULE + +#include <asm/cpu_entry_area.h> +#include <asm/processor.h> +#include <linux/mmzone.h> + +DECLARE_PER_CPU(char[CPU_ENTRY_AREA_SIZE], cpu_entry_area_shadow); +DECLARE_PER_CPU(char[CPU_ENTRY_AREA_SIZE], cpu_entry_area_origin); + +/* + * Functions below are declared in the header to make sure they are inlined. + * They all are called from kmsan_get_metadata() for every memory access in + * the kernel, so speed is important here. + */ + +/* + * Compute metadata addresses for the CPU entry area on x86. + */ +static inline void *arch_kmsan_get_meta_or_null(void *addr, bool is_origin) +{ + unsigned long addr64 = (unsigned long)addr; + char *metadata_array; + unsigned long off; + int cpu; + + if ((addr64 < CPU_ENTRY_AREA_BASE) || + (addr64 >= (CPU_ENTRY_AREA_BASE + CPU_ENTRY_AREA_MAP_SIZE))) + return NULL; + cpu = (addr64 - CPU_ENTRY_AREA_BASE) / CPU_ENTRY_AREA_SIZE; + off = addr64 - (unsigned long)get_cpu_entry_area(cpu); + if ((off < 0) || (off >= CPU_ENTRY_AREA_SIZE)) + return NULL; + metadata_array = is_origin ? cpu_entry_area_origin : + cpu_entry_area_shadow; + return &per_cpu(metadata_array[off], cpu); +} + +/* + * Taken from arch/x86/mm/physaddr.h to avoid using an instrumented version. + */ +static inline bool kmsan_phys_addr_valid(unsigned long addr) +{ + if (IS_ENABLED(CONFIG_PHYS_ADDR_T_64BIT)) + return !(addr >> boot_cpu_data.x86_phys_bits); + else + return true; +} + +/* + * Taken from arch/x86/mm/physaddr.c to avoid using an instrumented version. 
+ */ +static inline bool kmsan_virt_addr_valid(void *addr) +{ + unsigned long x = (unsigned long)addr; + unsigned long y = x - __START_KERNEL_map; + bool ret; + + /* use the carry flag to determine if x was < __START_KERNEL_map */ + if (unlikely(x > y)) { + x = y + phys_base; + + if (y >= KERNEL_IMAGE_SIZE) + return false; + } else { + x = y + (__START_KERNEL_map - PAGE_OFFSET); + + /* carry flag will be set if starting x was >= PAGE_OFFSET */ + if ((x > y) || !kmsan_phys_addr_valid(x)) + return false; + } + + /* + * pfn_valid() relies on RCU, and may call into the scheduler on exiting + * the critical section. However, this would result in recursion with + * KMSAN. Therefore, disable preemption here, and re-enable preemption + * below while suppressing reschedules to avoid recursion. + * + * Note, this sacrifices occasionally breaking scheduling guarantees. + * Although, a kernel compiled with KMSAN has already given up on any + * performance guarantees due to being heavily instrumented. 
+ */ + preempt_disable(); + ret = pfn_valid(x >> PAGE_SHIFT); + preempt_enable_no_resched(); + + return ret; +} + +#endif /* !MODULE */ + +#endif /* _ASM_X86_KMSAN_H */ diff --git a/arch/x86/include/asm/kprobes.h b/arch/x86/include/asm/kprobes.h index 71ea2eab43d5..5939694dfb28 100644 --- a/arch/x86/include/asm/kprobes.h +++ b/arch/x86/include/asm/kprobes.h @@ -50,8 +50,6 @@ extern const int kretprobe_blacklist_size; void arch_remove_kprobe(struct kprobe *p); -extern void arch_kprobe_override_function(struct pt_regs *regs); - /* Architecture specific copy of original instruction*/ struct arch_specific_insn { /* copy of the original instruction */ @@ -115,8 +113,6 @@ struct kprobe_ctlblk { }; extern int kprobe_fault_handler(struct pt_regs *regs, int trapnr); -extern int kprobe_exceptions_notify(struct notifier_block *self, - unsigned long val, void *data); extern int kprobe_int3_handler(struct pt_regs *regs); #else diff --git a/arch/x86/include/asm/kvm-x86-ops.h b/arch/x86/include/asm/kvm-x86-ops.h index da47f60a4650..8d50e3e0a19b 100644 --- a/arch/x86/include/asm/kvm-x86-ops.h +++ b/arch/x86/include/asm/kvm-x86-ops.h @@ -9,18 +9,20 @@ BUILD_BUG_ON(1) * "static_call_update()" calls. * * KVM_X86_OP_OPTIONAL() can be used for those functions that can have - * a NULL definition, for example if "static_call_cond()" will be used - * at the call sites. KVM_X86_OP_OPTIONAL_RET0() can be used likewise + * a NULL definition. KVM_X86_OP_OPTIONAL_RET0() can be used likewise * to make a definition optional, but in this case the default will * be __static_call_return0. 
*/ -KVM_X86_OP(hardware_enable) -KVM_X86_OP(hardware_disable) +KVM_X86_OP(check_processor_compatibility) +KVM_X86_OP(enable_virtualization_cpu) +KVM_X86_OP(disable_virtualization_cpu) KVM_X86_OP(hardware_unsetup) KVM_X86_OP(has_emulated_msr) KVM_X86_OP(vcpu_after_set_cpuid) KVM_X86_OP(vm_init) KVM_X86_OP_OPTIONAL(vm_destroy) +KVM_X86_OP_OPTIONAL(vm_pre_destroy) +KVM_X86_OP_OPTIONAL_RET0(vcpu_precreate) KVM_X86_OP(vcpu_create) KVM_X86_OP(vcpu_free) KVM_X86_OP(vcpu_reset) @@ -33,8 +35,10 @@ KVM_X86_OP(set_msr) KVM_X86_OP(get_segment_base) KVM_X86_OP(get_segment) KVM_X86_OP(get_cpl) +KVM_X86_OP(get_cpl_no_cache) KVM_X86_OP(set_segment) KVM_X86_OP(get_cs_db_l_bits) +KVM_X86_OP(is_valid_cr0) KVM_X86_OP(set_cr0) KVM_X86_OP_OPTIONAL(post_set_cr3) KVM_X86_OP(is_valid_cr4) @@ -45,6 +49,7 @@ KVM_X86_OP(set_idt) KVM_X86_OP(get_gdt) KVM_X86_OP(set_gdt) KVM_X86_OP(sync_dirty_debug_regs) +KVM_X86_OP(set_dr6) KVM_X86_OP(set_dr7) KVM_X86_OP(cache_reg) KVM_X86_OP(get_rflags) @@ -52,8 +57,10 @@ KVM_X86_OP(set_rflags) KVM_X86_OP(get_if_flag) KVM_X86_OP(flush_tlb_all) KVM_X86_OP(flush_tlb_current) -KVM_X86_OP_OPTIONAL(tlb_remote_flush) -KVM_X86_OP_OPTIONAL(tlb_remote_flush_with_range) +#if IS_ENABLED(CONFIG_HYPERV) +KVM_X86_OP_OPTIONAL(flush_remote_tlbs) +KVM_X86_OP_OPTIONAL(flush_remote_tlbs_range) +#endif KVM_X86_OP(flush_tlb_gva) KVM_X86_OP(flush_tlb_guest) KVM_X86_OP(vcpu_pre_run) @@ -66,7 +73,9 @@ KVM_X86_OP(get_interrupt_shadow) KVM_X86_OP(patch_hypercall) KVM_X86_OP(inject_irq) KVM_X86_OP(inject_nmi) -KVM_X86_OP(queue_exception) +KVM_X86_OP_OPTIONAL_RET0(is_vnmi_pending) +KVM_X86_OP_OPTIONAL_RET0(set_vnmi_pending) +KVM_X86_OP(inject_exception) KVM_X86_OP(cancel_injection) KVM_X86_OP(interrupt_allowed) KVM_X86_OP(nmi_allowed) @@ -75,11 +84,8 @@ KVM_X86_OP(set_nmi_mask) KVM_X86_OP(enable_nmi_window) KVM_X86_OP(enable_irq_window) KVM_X86_OP_OPTIONAL(update_cr8_intercept) -KVM_X86_OP(check_apicv_inhibit_reasons) KVM_X86_OP(refresh_apicv_exec_ctrl) 
-KVM_X86_OP_OPTIONAL(hwapic_irr_update) KVM_X86_OP_OPTIONAL(hwapic_isr_update) -KVM_X86_OP_OPTIONAL_RET0(guest_apic_has_interrupt) KVM_X86_OP_OPTIONAL(load_eoi_exitmap) KVM_X86_OP_OPTIONAL(set_virtual_apic_mode) KVM_X86_OP_OPTIONAL(set_apic_access_page_addr) @@ -87,47 +93,61 @@ KVM_X86_OP(deliver_interrupt) KVM_X86_OP_OPTIONAL(sync_pir_to_irr) KVM_X86_OP_OPTIONAL_RET0(set_tss_addr) KVM_X86_OP_OPTIONAL_RET0(set_identity_map_addr) -KVM_X86_OP(get_mt_mask) +KVM_X86_OP_OPTIONAL_RET0(get_mt_mask) KVM_X86_OP(load_mmu_pgd) +KVM_X86_OP_OPTIONAL(link_external_spt) +KVM_X86_OP_OPTIONAL(set_external_spte) +KVM_X86_OP_OPTIONAL(free_external_spt) +KVM_X86_OP_OPTIONAL(remove_external_spte) KVM_X86_OP(has_wbinvd_exit) KVM_X86_OP(get_l2_tsc_offset) KVM_X86_OP(get_l2_tsc_multiplier) KVM_X86_OP(write_tsc_offset) KVM_X86_OP(write_tsc_multiplier) KVM_X86_OP(get_exit_info) +KVM_X86_OP(get_entry_info) KVM_X86_OP(check_intercept) KVM_X86_OP(handle_exit_irqoff) -KVM_X86_OP(request_immediate_exit) -KVM_X86_OP(sched_in) KVM_X86_OP_OPTIONAL(update_cpu_dirty_logging) KVM_X86_OP_OPTIONAL(vcpu_blocking) KVM_X86_OP_OPTIONAL(vcpu_unblocking) KVM_X86_OP_OPTIONAL(pi_update_irte) KVM_X86_OP_OPTIONAL(pi_start_assignment) +KVM_X86_OP_OPTIONAL(apicv_pre_state_restore) KVM_X86_OP_OPTIONAL(apicv_post_state_restore) KVM_X86_OP_OPTIONAL_RET0(dy_apicv_has_pending_interrupt) +KVM_X86_OP_OPTIONAL(protected_apic_has_interrupt) KVM_X86_OP_OPTIONAL(set_hv_timer) KVM_X86_OP_OPTIONAL(cancel_hv_timer) KVM_X86_OP(setup_mce) +#ifdef CONFIG_KVM_SMM KVM_X86_OP(smi_allowed) KVM_X86_OP(enter_smm) KVM_X86_OP(leave_smm) KVM_X86_OP(enable_smi_window) +#endif +KVM_X86_OP_OPTIONAL(dev_get_attr) KVM_X86_OP_OPTIONAL(mem_enc_ioctl) +KVM_X86_OP_OPTIONAL(vcpu_mem_enc_ioctl) KVM_X86_OP_OPTIONAL(mem_enc_register_region) KVM_X86_OP_OPTIONAL(mem_enc_unregister_region) KVM_X86_OP_OPTIONAL(vm_copy_enc_context_from) KVM_X86_OP_OPTIONAL(vm_move_enc_context_from) KVM_X86_OP_OPTIONAL(guest_memory_reclaimed) -KVM_X86_OP(get_msr_feature) 
-KVM_X86_OP(can_emulate_instruction) +KVM_X86_OP(get_feature_msr) +KVM_X86_OP(check_emulate_instruction) KVM_X86_OP(apic_init_signal_blocked) -KVM_X86_OP_OPTIONAL(enable_direct_tlbflush) +KVM_X86_OP_OPTIONAL(enable_l2_tlb_flush) KVM_X86_OP_OPTIONAL(migrate_timers) KVM_X86_OP(msr_filter_changed) KVM_X86_OP(complete_emulated_msr) KVM_X86_OP(vcpu_deliver_sipi_vector) KVM_X86_OP_OPTIONAL_RET0(vcpu_get_apicv_inhibit_reasons); +KVM_X86_OP_OPTIONAL(get_untagged_addr) +KVM_X86_OP_OPTIONAL(alloc_apic_backing_page) +KVM_X86_OP_OPTIONAL_RET0(gmem_prepare) +KVM_X86_OP_OPTIONAL_RET0(private_max_mapping_level) +KVM_X86_OP_OPTIONAL(gmem_invalidate) #undef KVM_X86_OP #undef KVM_X86_OP_OPTIONAL diff --git a/arch/x86/include/asm/kvm-x86-pmu-ops.h b/arch/x86/include/asm/kvm-x86-pmu-ops.h index fdfd8e06fee6..9159bf1a4730 100644 --- a/arch/x86/include/asm/kvm-x86-pmu-ops.h +++ b/arch/x86/include/asm/kvm-x86-pmu-ops.h @@ -9,21 +9,17 @@ BUILD_BUG_ON(1) * "static_call_update()" calls. * * KVM_X86_PMU_OP_OPTIONAL() can be used for those functions that can have - * a NULL definition, for example if "static_call_cond()" will be used - * at the call sites. + * a NULL definition. 
*/ -KVM_X86_PMU_OP(pmc_perf_hw_id) -KVM_X86_PMU_OP(pmc_is_enabled) -KVM_X86_PMU_OP(pmc_idx_to_pmc) KVM_X86_PMU_OP(rdpmc_ecx_to_pmc) KVM_X86_PMU_OP(msr_idx_to_pmc) -KVM_X86_PMU_OP(is_valid_rdpmc_ecx) +KVM_X86_PMU_OP_OPTIONAL(check_rdpmc_early) KVM_X86_PMU_OP(is_valid_msr) KVM_X86_PMU_OP(get_msr) KVM_X86_PMU_OP(set_msr) KVM_X86_PMU_OP(refresh) KVM_X86_PMU_OP(init) -KVM_X86_PMU_OP(reset) +KVM_X86_PMU_OP_OPTIONAL(reset) KVM_X86_PMU_OP_OPTIONAL(deliver_pmi) KVM_X86_PMU_OP_OPTIONAL(cleanup) diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index 9217bd6cf0d1..b4a391929cdb 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -24,21 +24,35 @@ #include <linux/pvclock_gtod.h> #include <linux/clocksource.h> #include <linux/irqbypass.h> -#include <linux/hyperv.h> +#include <linux/kfifo.h> +#include <linux/sched/vhost_task.h> +#include <linux/call_once.h> +#include <linux/atomic.h> #include <asm/apic.h> #include <asm/pvclock-abi.h> #include <asm/desc.h> #include <asm/mtrr.h> #include <asm/msr-index.h> +#include <asm/msr.h> #include <asm/asm.h> +#include <asm/irq_remapping.h> #include <asm/kvm_page_track.h> #include <asm/kvm_vcpu_regs.h> -#include <asm/hyperv-tlfs.h> +#include <asm/reboot.h> +#include <hyperv/hvhdk.h> #define __KVM_HAVE_ARCH_VCPU_DEBUGFS +/* + * CONFIG_KVM_MAX_NR_VCPUS is defined iff CONFIG_KVM!=n, provide a dummy max if + * KVM is disabled (arbitrarily use the default from CONFIG_KVM_MAX_NR_VCPUS). 
+ */ +#ifdef CONFIG_KVM_MAX_NR_VCPUS +#define KVM_MAX_VCPUS CONFIG_KVM_MAX_NR_VCPUS +#else #define KVM_MAX_VCPUS 1024 +#endif /* * In x86, the VCPU ID corresponds to the APIC ID, and APIC IDs @@ -53,7 +67,7 @@ #define KVM_MAX_VCPU_IDS (KVM_MAX_VCPUS * KVM_VCPU_ID_RATIO) /* memory slots that are not exposed to userspace */ -#define KVM_PRIVATE_MEM_SLOTS 3 +#define KVM_INTERNAL_MEM_SLOTS 3 #define KVM_HALT_POLL_NS_DEFAULT 200000 @@ -65,6 +79,9 @@ #define KVM_BUS_LOCK_DETECTION_VALID_MODE (KVM_BUS_LOCK_DETECTION_OFF | \ KVM_BUS_LOCK_DETECTION_EXIT) +#define KVM_X86_NOTIFY_VMEXIT_VALID_BITS (KVM_X86_NOTIFY_VMEXIT_ENABLED | \ + KVM_X86_NOTIFY_VMEXIT_USER) + /* x86-specific vcpu->requests bit members */ #define KVM_REQ_MIGRATE_TIMER KVM_ARCH_REQ(0) #define KVM_REQ_REPORT_TPR_ACCESS KVM_ARCH_REQ(1) @@ -78,7 +95,9 @@ #define KVM_REQ_NMI KVM_ARCH_REQ(9) #define KVM_REQ_PMU KVM_ARCH_REQ(10) #define KVM_REQ_PMI KVM_ARCH_REQ(11) +#ifdef CONFIG_KVM_SMM #define KVM_REQ_SMI KVM_ARCH_REQ(12) +#endif #define KVM_REQ_MASTERCLOCK_UPDATE KVM_ARCH_REQ(13) #define KVM_REQ_MCLOCK_INPROGRESS \ KVM_ARCH_REQ_FLAGS(14, KVM_REQUEST_WAIT | KVM_REQUEST_NO_WAKEUP) @@ -105,6 +124,10 @@ KVM_ARCH_REQ_FLAGS(30, KVM_REQUEST_WAIT | KVM_REQUEST_NO_WAKEUP) #define KVM_REQ_MMU_FREE_OBSOLETE_ROOTS \ KVM_ARCH_REQ_FLAGS(31, KVM_REQUEST_WAIT | KVM_REQUEST_NO_WAKEUP) +#define KVM_REQ_HV_TLB_FLUSH \ + KVM_ARCH_REQ_FLAGS(32, KVM_REQUEST_WAIT | KVM_REQUEST_NO_WAKEUP) +#define KVM_REQ_UPDATE_PROTECTED_GUEST_STATE \ + KVM_ARCH_REQ_FLAGS(34, KVM_REQUEST_WAIT) #define CR0_RESERVED_BITS \ (~(unsigned long)(X86_CR0_PE | X86_CR0_MP | X86_CR0_EM | X86_CR0_TS \ @@ -117,7 +140,8 @@ | X86_CR4_PGE | X86_CR4_PCE | X86_CR4_OSFXSR | X86_CR4_PCIDE \ | X86_CR4_OSXSAVE | X86_CR4_SMEP | X86_CR4_FSGSBASE \ | X86_CR4_OSXMMEXCPT | X86_CR4_LA57 | X86_CR4_VMXE \ - | X86_CR4_SMAP | X86_CR4_PKE | X86_CR4_UMIP)) + | X86_CR4_SMAP | X86_CR4_PKE | X86_CR4_UMIP \ + | X86_CR4_LAM_SUP)) #define CR8_RESERVED_BITS (~(unsigned long)X86_CR8_TPR) @@ 
-126,9 +150,6 @@ #define INVALID_PAGE (~(hpa_t)0) #define VALID_PAGE(x) ((x) != INVALID_PAGE) -#define UNMAPPED_GVA (~(gpa_t)0) -#define INVALID_GPA (~(gpa_t)0) - /* KVM Hugepage definitions for x86 */ #define KVM_MAX_HUGEPAGE_LEVEL PG_LEVEL_1G #define KVM_NR_PAGE_SIZES (KVM_MAX_HUGEPAGE_LEVEL - PG_LEVEL_4K + 1) @@ -145,7 +166,6 @@ #define KVM_MIN_FREE_MMU_PAGES 5 #define KVM_REFILL_PAGES 25 #define KVM_MAX_CPUID_ENTRIES 256 -#define KVM_NR_FIXED_MTRR_REGION 88 #define KVM_NR_VAR_MTRR 8 #define ASYNC_PF_PER_VCPU 64 @@ -197,11 +217,13 @@ enum exit_fastpath_completion { EXIT_FASTPATH_NONE, EXIT_FASTPATH_REENTER_GUEST, EXIT_FASTPATH_EXIT_HANDLED, + EXIT_FASTPATH_EXIT_USERSPACE, }; typedef enum exit_fastpath_completion fastpath_t; struct x86_emulate_ctxt; struct x86_exception; +union kvm_smram; enum x86_intercept; enum x86_intercept_stage; @@ -239,32 +261,31 @@ enum x86_intercept_stage; KVM_GUESTDBG_INJECT_DB | \ KVM_GUESTDBG_BLOCKIRQ) +#define PFERR_PRESENT_MASK BIT(0) +#define PFERR_WRITE_MASK BIT(1) +#define PFERR_USER_MASK BIT(2) +#define PFERR_RSVD_MASK BIT(3) +#define PFERR_FETCH_MASK BIT(4) +#define PFERR_PK_MASK BIT(5) +#define PFERR_SGX_MASK BIT(15) +#define PFERR_GUEST_RMP_MASK BIT_ULL(31) +#define PFERR_GUEST_FINAL_MASK BIT_ULL(32) +#define PFERR_GUEST_PAGE_MASK BIT_ULL(33) +#define PFERR_GUEST_ENC_MASK BIT_ULL(34) +#define PFERR_GUEST_SIZEM_MASK BIT_ULL(35) +#define PFERR_GUEST_VMPL_MASK BIT_ULL(36) -#define PFERR_PRESENT_BIT 0 -#define PFERR_WRITE_BIT 1 -#define PFERR_USER_BIT 2 -#define PFERR_RSVD_BIT 3 -#define PFERR_FETCH_BIT 4 -#define PFERR_PK_BIT 5 -#define PFERR_SGX_BIT 15 -#define PFERR_GUEST_FINAL_BIT 32 -#define PFERR_GUEST_PAGE_BIT 33 -#define PFERR_IMPLICIT_ACCESS_BIT 48 - -#define PFERR_PRESENT_MASK (1U << PFERR_PRESENT_BIT) -#define PFERR_WRITE_MASK (1U << PFERR_WRITE_BIT) -#define PFERR_USER_MASK (1U << PFERR_USER_BIT) -#define PFERR_RSVD_MASK (1U << PFERR_RSVD_BIT) -#define PFERR_FETCH_MASK (1U << PFERR_FETCH_BIT) -#define PFERR_PK_MASK (1U 
<< PFERR_PK_BIT) -#define PFERR_SGX_MASK (1U << PFERR_SGX_BIT) -#define PFERR_GUEST_FINAL_MASK (1ULL << PFERR_GUEST_FINAL_BIT) -#define PFERR_GUEST_PAGE_MASK (1ULL << PFERR_GUEST_PAGE_BIT) -#define PFERR_IMPLICIT_ACCESS (1ULL << PFERR_IMPLICIT_ACCESS_BIT) - -#define PFERR_NESTED_GUEST_PAGE (PFERR_GUEST_PAGE_MASK | \ - PFERR_WRITE_MASK | \ - PFERR_PRESENT_MASK) +/* + * IMPLICIT_ACCESS is a KVM-defined flag used to correctly perform SMAP checks + * when emulating instructions that trigger implicit access. + */ +#define PFERR_IMPLICIT_ACCESS BIT_ULL(48) +/* + * PRIVATE_ACCESS is a KVM-defined flag used to indicate that a fault occurred + * when the guest was accessing private memory. + */ +#define PFERR_PRIVATE_ACCESS BIT_ULL(49) +#define PFERR_SYNTHETIC_MASK (PFERR_IMPLICIT_ACCESS | PFERR_PRIVATE_ACCESS) /* apic attention bits */ #define KVM_APIC_CHECK_VAPIC 0 @@ -282,13 +303,13 @@ struct kvm_kernel_irq_routing_entry; * kvm_mmu_page_role tracks the properties of a shadow page (where shadow page * also includes TDP pages) to determine whether or not a page can be used in * the given MMU context. This is a subset of the overall kvm_cpu_role to - * minimize the size of kvm_memory_slot.arch.gfn_track, i.e. allows allocating - * 2 bytes per gfn instead of 4 bytes per gfn. + * minimize the size of kvm_memory_slot.arch.gfn_write_track, i.e. allows + * allocating 2 bytes per gfn instead of 4 bytes per gfn. * * Upper-level shadow pages having gptes are tracked for write-protection via - * gfn_track. As above, gfn_track is a 16 bit counter, so KVM must not create - * more than 2^16-1 upper-level shadow pages at a single gfn, otherwise - * gfn_track will overflow and explosions will ensure. + * gfn_write_track. As above, gfn_write_track is a 16 bit counter, so KVM must + * not create more than 2^16-1 upper-level shadow pages at a single gfn, + * otherwise gfn_write_track will overflow and explosions will ensue. 
* * A unique shadow page (SP) for a gfn is created if and only if an existing SP * cannot be reused. The ability to reuse a SP is tracked by its role, which @@ -296,10 +317,11 @@ struct kvm_kernel_irq_routing_entry; * the number of unique SPs that can theoretically be created is 2^n, where n * is the number of bits that are used to compute the role. * - * But, even though there are 19 bits in the mask below, not all combinations + * But, even though there are 20 bits in the mask below, not all combinations * of modes and flags are possible: * - * - invalid shadow pages are not accounted, so the bits are effectively 18 + * - invalid shadow pages are not accounted, mirror pages are not shadowed, + * so the bits are effectively 18. * * - quadrant will only be used if has_4_byte_gpte=1 (non-PAE paging); * execonly and ad_disabled are only used for nested EPT which has @@ -332,7 +354,8 @@ union kvm_mmu_page_role { unsigned ad_disabled:1; unsigned guest_mode:1; unsigned passthrough:1; - unsigned :5; + unsigned is_mirror:1; + unsigned :4; /* * This is left at the top of the word so that @@ -386,11 +409,10 @@ union kvm_cpu_role { }; struct kvm_rmap_head { - unsigned long val; + atomic_long_t val; }; struct kvm_pio_request { - unsigned long linear_rip; unsigned long count; int in; int port; @@ -414,6 +436,10 @@ struct kvm_mmu_root_info { #define KVM_MMU_NUM_PREV_ROOTS 3 +#define KVM_MMU_ROOT_CURRENT BIT(0) +#define KVM_MMU_ROOT_PREVIOUS(i) BIT(1+i) +#define KVM_MMU_ROOTS_ALL (BIT(1 + KVM_MMU_NUM_PREV_ROOTS) - 1) + #define KVM_HAVE_MMU_RWLOCK struct kvm_mmu_page; @@ -433,10 +459,10 @@ struct kvm_mmu { gpa_t (*gva_to_gpa)(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu, gpa_t gva_or_gpa, u64 access, struct x86_exception *exception); - int (*sync_page)(struct kvm_vcpu *vcpu, - struct kvm_mmu_page *sp); - void (*invlpg)(struct kvm_vcpu *vcpu, gva_t gva, hpa_t root_hpa); + int (*sync_spte)(struct kvm_vcpu *vcpu, + struct kvm_mmu_page *sp, int i); struct kvm_mmu_root_info root; + 
hpa_t mirror_root_hpa; union kvm_cpu_role cpu_role; union kvm_mmu_page_role root_role; @@ -473,11 +499,6 @@ struct kvm_mmu { u64 pdptrs[4]; /* pae */ }; -struct kvm_tlb_range { - u64 start_gfn; - u64 pages; -}; - enum pmc_type { KVM_PMC_GP = 0, KVM_PMC_FIXED, @@ -486,40 +507,92 @@ enum pmc_type { struct kvm_pmc { enum pmc_type type; u8 idx; + bool is_paused; + bool intr; + /* + * Base value of the PMC counter, relative to the *consumed* count in + * the associated perf_event. This value includes counter updates from + * the perf_event and emulated_count since the last time the counter + * was reprogrammed, but it is *not* the current value as seen by the + * guest or userspace. + * + * The count is relative to the associated perf_event so that KVM + * doesn't need to reprogram the perf_event every time the guest writes + * to the counter. + */ u64 counter; + /* + * PMC events triggered by KVM emulation that haven't been fully + * processed, i.e. haven't undergone overflow detection. + */ + u64 emulated_counter; u64 eventsel; struct perf_event *perf_event; struct kvm_vcpu *vcpu; /* + * only for creating or reusing perf_event, * eventsel value for general purpose counters, * ctrl value for fixed counters. */ u64 current_config; - bool is_paused; - bool intr; }; -#define KVM_PMC_MAX_FIXED 3 +/* More counters may conflict with other existing Architectural MSRs */ +#define KVM_MAX(a, b) ((a) >= (b) ? 
(a) : (b)) +#define KVM_MAX_NR_INTEL_GP_COUNTERS 8 +#define KVM_MAX_NR_AMD_GP_COUNTERS 6 +#define KVM_MAX_NR_GP_COUNTERS KVM_MAX(KVM_MAX_NR_INTEL_GP_COUNTERS, \ + KVM_MAX_NR_AMD_GP_COUNTERS) + +#define KVM_MAX_NR_INTEL_FIXED_COUTNERS 3 +#define KVM_MAX_NR_AMD_FIXED_COUTNERS 0 +#define KVM_MAX_NR_FIXED_COUNTERS KVM_MAX(KVM_MAX_NR_INTEL_FIXED_COUTNERS, \ + KVM_MAX_NR_AMD_FIXED_COUTNERS) + struct kvm_pmu { + u8 version; unsigned nr_arch_gp_counters; unsigned nr_arch_fixed_counters; unsigned available_event_types; u64 fixed_ctr_ctrl; + u64 fixed_ctr_ctrl_rsvd; u64 global_ctrl; u64 global_status; u64 counter_bitmask[2]; - u64 global_ctrl_mask; - u64 global_ovf_ctrl_mask; + u64 global_ctrl_rsvd; + u64 global_status_rsvd; u64 reserved_bits; u64 raw_event_mask; - u8 version; - struct kvm_pmc gp_counters[INTEL_PMC_MAX_GENERIC]; - struct kvm_pmc fixed_counters[KVM_PMC_MAX_FIXED]; - struct irq_work irq_work; - DECLARE_BITMAP(reprogram_pmi, X86_PMC_IDX_MAX); + struct kvm_pmc gp_counters[KVM_MAX_NR_GP_COUNTERS]; + struct kvm_pmc fixed_counters[KVM_MAX_NR_FIXED_COUNTERS]; + + /* + * Overlay the bitmap with a 64-bit atomic so that all bits can be + * set in a single access, e.g. to reprogram all counters when the PMU + * filter changes. + */ + union { + DECLARE_BITMAP(reprogram_pmi, X86_PMC_IDX_MAX); + atomic64_t __reprogram_pmi; + }; DECLARE_BITMAP(all_valid_pmc_idx, X86_PMC_IDX_MAX); DECLARE_BITMAP(pmc_in_use, X86_PMC_IDX_MAX); + u64 ds_area; + u64 pebs_enable; + u64 pebs_enable_rsvd; + u64 pebs_data_cfg; + u64 pebs_data_cfg_rsvd; + + /* + * If a guest counter is cross-mapped to host counter with different + * index, its PEBS capability will be temporarily disabled. + * + * The user should make sure that this mask is updated + * after disabling interrupts and before perf_guest_get_msrs(); + */ + u64 host_cross_mapped_mask; + /* * The gate to release perf_events not marked in * pmc_in_use only once in a vcpu time slice. 
@@ -536,22 +609,23 @@ struct kvm_pmu { struct kvm_pmu_ops; enum { - KVM_DEBUGREG_BP_ENABLED = 1, - KVM_DEBUGREG_WONT_EXIT = 2, -}; - -struct kvm_mtrr_range { - u64 base; - u64 mask; - struct list_head node; + KVM_DEBUGREG_BP_ENABLED = BIT(0), + KVM_DEBUGREG_WONT_EXIT = BIT(1), + /* + * Guest debug registers (DR0-3, DR6 and DR7) are saved/restored by + * hardware on exit from or enter to guest. KVM needn't switch them. + * DR0-3, DR6 and DR7 are set to their architectural INIT value on VM + * exit, host values need to be restored. + */ + KVM_DEBUGREG_AUTO_SWITCH = BIT(2), }; struct kvm_mtrr { - struct kvm_mtrr_range var_ranges[KVM_NR_VAR_MTRR]; - mtrr_type fixed_ranges[KVM_NR_FIXED_MTRR_REGION]; + u64 var[KVM_NR_VAR_MTRR * 2]; + u64 fixed_64k; + u64 fixed_16k[2]; + u64 fixed_4k[8]; u64 deftype; - - struct list_head head; }; /* Hyper-V SynIC timer */ @@ -579,6 +653,29 @@ struct kvm_vcpu_hv_synic { bool dont_zero_synic_pages; }; +/* The maximum number of entries on the TLB flush fifo. */ +#define KVM_HV_TLB_FLUSH_FIFO_SIZE (16) +/* + * Note: the following 'magic' entry is made up by KVM to avoid putting + * anything besides GVA on the TLB flush fifo. It is theoretically possible + * to observe a request to flush 4095 PFNs starting from 0xfffffffffffff000 + * which will look identical. KVM's action to 'flush everything' instead of + * flushing these particular addresses is, however, fully legitimate as + * flushing more than requested is always OK. 
+ */ +#define KVM_HV_TLB_FLUSHALL_ENTRY ((u64)-1) + +enum hv_tlb_flush_fifos { + HV_L1_TLB_FLUSH_FIFO, + HV_L2_TLB_FLUSH_FIFO, + HV_NR_TLB_FLUSH_FIFOS, +}; + +struct kvm_vcpu_hv_tlb_flush_fifo { + spinlock_t write_lock; + DECLARE_KFIFO(entries, u64, KVM_HV_TLB_FLUSH_FIFO_SIZE); +}; + /* Hyper-V per vcpu emulation context */ struct kvm_vcpu_hv { struct kvm_vcpu *vcpu; @@ -597,9 +694,30 @@ struct kvm_vcpu_hv { u32 enlightenments_eax; /* HYPERV_CPUID_ENLIGHTMENT_INFO.EAX */ u32 enlightenments_ebx; /* HYPERV_CPUID_ENLIGHTMENT_INFO.EBX */ u32 syndbg_cap_eax; /* HYPERV_CPUID_SYNDBG_PLATFORM_CAPABILITIES.EAX */ + u32 nested_eax; /* HYPERV_CPUID_NESTED_FEATURES.EAX */ + u32 nested_ebx; /* HYPERV_CPUID_NESTED_FEATURES.EBX */ } cpuid_cache; + + struct kvm_vcpu_hv_tlb_flush_fifo tlb_flush_fifo[HV_NR_TLB_FLUSH_FIFOS]; + + /* Preallocated buffer for handling hypercalls passing sparse vCPU set */ + u64 sparse_banks[HV_MAX_SPARSE_VCPU_BANKS]; + + struct hv_vp_assist_page vp_assist_page; + + struct { + u64 pa_page_gpa; + u64 vm_id; + u32 vp_id; + } nested; }; +struct kvm_hypervisor_cpuid { + u32 base; + u32 limit; +}; + +#ifdef CONFIG_KVM_XEN /* Xen HVM per vcpu emulation context */ struct kvm_vcpu_xen { u64 hypercall_rip; @@ -608,6 +726,7 @@ struct kvm_vcpu_xen { struct gfn_to_pfn_cache vcpu_info_cache; struct gfn_to_pfn_cache vcpu_time_info_cache; struct gfn_to_pfn_cache runstate_cache; + struct gfn_to_pfn_cache runstate2_cache; u64 last_steal; u64 runstate_entry_time; u64 runstate_times[4]; @@ -619,6 +738,35 @@ struct kvm_vcpu_xen { struct hrtimer timer; int poll_evtchn; struct timer_list poll_timer; + struct kvm_hypervisor_cpuid cpuid; +}; +#endif + +struct kvm_queued_exception { + bool pending; + bool injected; + bool has_error_code; + u8 vector; + u32 error_code; + unsigned long payload; + bool has_payload; +}; + +/* + * Hardware-defined CPUID leafs that are either scattered by the kernel or are + * unknown to the kernel, but need to be directly used by KVM. 
Note, these + * word values conflict with the kernel's "bug" caps, but KVM doesn't use those. + */ +enum kvm_only_cpuid_leafs { + CPUID_12_EAX = NCAPINTS, + CPUID_7_1_EDX, + CPUID_8000_0007_EDX, + CPUID_8000_0022_EAX, + CPUID_7_2_EDX, + CPUID_24_0_EBX, + NR_KVM_CPU_CAPS, + + NKVMCAPINTS = NR_KVM_CPU_CAPS - NCAPINTS, }; struct kvm_vcpu_arch { @@ -642,9 +790,9 @@ struct kvm_vcpu_arch { u32 pkru; u32 hflags; u64 efer; + u64 host_debugctl; u64 apic_base; struct kvm_lapic *apic; /* kernel irqchip context */ - bool apicv_active; bool load_eoi_exitmap_pending; DECLARE_BITMAP(ioapic_handled_vectors, 256); unsigned long apic_attention; @@ -655,7 +803,6 @@ struct kvm_vcpu_arch { u64 smi_count; bool at_instruction_boundary; bool tpr_access_reporting; - bool xsaves_enabled; bool xfd_no_write_intercept; u64 ia32_xss; u64 microcode_version; @@ -695,8 +842,13 @@ struct kvm_vcpu_arch { struct kvm_mmu_memory_cache mmu_pte_list_desc_cache; struct kvm_mmu_memory_cache mmu_shadow_page_cache; - struct kvm_mmu_memory_cache mmu_gfn_array_cache; + struct kvm_mmu_memory_cache mmu_shadowed_info_cache; struct kvm_mmu_memory_cache mmu_page_header_cache; + /* + * This cache is to allocate external page table. E.g. private EPT used + * by the TDX module. + */ + struct kvm_mmu_memory_cache mmu_external_spt_cache; /* * QEMU userspace and the guest each have their own FPU state. @@ -712,6 +864,7 @@ struct kvm_vcpu_arch { struct fpu_guest guest_fpu; u64 xcr0; + u64 guest_supported_xcr0; struct kvm_pio_request pio; void *pio_data; @@ -720,16 +873,12 @@ struct kvm_vcpu_arch { u8 event_exit_inst_len; - struct kvm_queued_exception { - bool pending; - bool injected; - bool has_error_code; - u8 nr; - u32 error_code; - unsigned long payload; - bool has_payload; - u8 nested_apf; - } exception; + bool exception_from_userspace; + + /* Exceptions to be injected to the guest. */ + struct kvm_queued_exception exception; + /* Exception VM-Exits to be synthesized to L1. 
*/ + struct kvm_queued_exception exception_vmexit; struct kvm_queued_interrupt { bool injected; @@ -741,7 +890,24 @@ struct kvm_vcpu_arch { int cpuid_nent; struct kvm_cpuid_entry2 *cpuid_entries; - u32 kvm_cpuid_base; + bool cpuid_dynamic_bits_dirty; + bool is_amd_compatible; + + /* + * cpu_caps holds the effective guest capabilities, i.e. the features + * the vCPU is allowed to use. Typically, but not always, features can + * be used by the guest if and only if both KVM and userspace want to + * expose the feature to the guest. + * + * A common exception is for virtualization holes, i.e. when KVM can't + * prevent the guest from using a feature, in which case the vCPU "has" + * the feature regardless of what KVM or userspace desires. + * + * Note, features that don't require KVM involvement in any way are + * NOT enforced/sanitized by KVM, i.e. are taken verbatim from the + * guest CPUID provided by userspace. + */ + u32 cpu_caps[NR_KVM_CPU_CAPS]; u64 reserved_gpa_bits; int maxphyaddr; @@ -752,9 +918,11 @@ struct kvm_vcpu_arch { bool emulate_regs_need_sync_to_vcpu; bool emulate_regs_need_sync_from_vcpu; int (*complete_userspace_io)(struct kvm_vcpu *vcpu); + unsigned long cui_linear_rip; gpa_t time; - struct pvclock_vcpu_time_info hv_clock; + s8 pvclock_tsc_shift; + u32 pvclock_tsc_mul; unsigned int hw_tsc_khz; struct gfn_to_pfn_cache pv_time; /* set guest stopped flag in pvclock flags field */ @@ -786,7 +954,8 @@ struct kvm_vcpu_arch { u64 tsc_scaling_ratio; /* current scaling ratio */ atomic_t nmi_queued; /* unprocessed asynchronous NMIs */ - unsigned nmi_pending; /* NMI queued after currently running handler */ + /* Number of NMIs pending injection, not including hardware vNMIs. 
*/ + unsigned int nmi_pending; bool nmi_injected; /* Trying to inject an NMI this entry */ bool smi_pending; /* SMI queued after currently running handler */ u8 handling_intr_from_guest; @@ -808,6 +977,7 @@ struct kvm_vcpu_arch { u64 mcg_ctl; u64 mcg_ext_ctl; u64 *mce_banks; + u64 *mci_ctl2_banks; /* Cache MMIO info */ u64 mmio_gva; @@ -820,10 +990,13 @@ struct kvm_vcpu_arch { /* used for guest single stepping over the given code position */ unsigned long singlestep_rip; +#ifdef CONFIG_KVM_HYPERV bool hyperv_enabled; struct kvm_vcpu_hv *hyperv; +#endif +#ifdef CONFIG_KVM_XEN struct kvm_vcpu_xen xen; - +#endif cpumask_var_t wbinvd_dirty_mask; unsigned long last_retry_eip; @@ -837,9 +1010,8 @@ struct kvm_vcpu_arch { u64 msr_int_val; /* MSR_KVM_ASYNC_PF_INT */ u16 vec; u32 id; - bool send_user_only; u32 host_apf_flags; - unsigned long nested_apf_token; + bool send_always; bool delivery_as_pf_vmexit; bool pageready_pending; } apf; @@ -857,26 +1029,6 @@ struct kvm_vcpu_arch { u64 msr_kvm_poll_control; - /* - * Indicates the guest is trying to write a gfn that contains one or - * more of the PTEs used to translate the write itself, i.e. the access - * is changing its own translation in the guest page tables. KVM exits - * to userspace if emulation of the faulting instruction fails and this - * flag is set, as KVM cannot make forward progress. - * - * If emulation fails for a write to guest page tables, KVM unprotects - * (zaps) the shadow page for the target gfn and resumes the guest to - * retry the non-emulatable instruction (on hardware). Unprotecting the - * gfn doesn't allow forward progress for a self-changing access because - * doing so also zaps the translation for the gfn, i.e. retrying the - * instruction will hit a !PRESENT fault, which results in a new shadow - * page and sends KVM back to square one. 
- */ - bool write_fault_to_shadow_pgtable; - - /* set at EPT violation at this point */ - unsigned long exit_qualification; - /* pv related host specific info */ struct { bool pv_unhalted; @@ -884,6 +1036,7 @@ struct kvm_vcpu_arch { int pending_ioapic_eoi; int pending_external_vector; + int highest_stale_pending_ioapic_eoi; /* be preempted when it's in kernel-mode(cpl=0) */ bool preempted_in_kernel; @@ -914,6 +1067,7 @@ struct kvm_vcpu_arch { /* Protected Guests */ bool guest_state_protected; + bool guest_tsc_protected; /* * Set when PDPTS were loaded directly by the userspace without @@ -933,23 +1087,34 @@ struct kvm_lpage_info { struct kvm_arch_memory_slot { struct kvm_rmap_head *rmap[KVM_NR_PAGE_SIZES]; struct kvm_lpage_info *lpage_info[KVM_NR_PAGE_SIZES - 1]; - unsigned short *gfn_track[KVM_PAGE_TRACK_MAX]; + unsigned short *gfn_write_track; }; /* - * We use as the mode the number of bits allocated in the LDR for the - * logical processor ID. It happens that these are all powers of two. - * This makes it is very easy to detect cases where the APICs are - * configured for multiple modes; in that case, we cannot use the map and - * hence cannot use kvm_irq_delivery_to_apic_fast either. + * Track the mode of the optimized logical map, as the rules for decoding the + * destination vary per mode. Enabling the optimized logical map requires all + * software-enabled local APICs to be in the same mode, each addressable APIC to + * be mapped to only one MDA, and each MDA to map to at most one APIC. */ -#define KVM_APIC_MODE_XAPIC_CLUSTER 4 -#define KVM_APIC_MODE_XAPIC_FLAT 8 -#define KVM_APIC_MODE_X2APIC 16 +enum kvm_apic_logical_mode { + /* All local APICs are software disabled. */ + KVM_APIC_MODE_SW_DISABLED, + /* All software enabled local APICs in xAPIC cluster addressing mode. */ + KVM_APIC_MODE_XAPIC_CLUSTER, + /* All software enabled local APICs in xAPIC flat addressing mode. */ + KVM_APIC_MODE_XAPIC_FLAT, + /* All software enabled local APICs in x2APIC mode. 
*/ + KVM_APIC_MODE_X2APIC, + /* + * Optimized map disabled, e.g. not all local APICs in the same logical + * mode, same logical ID assigned to multiple APICs, etc. + */ + KVM_APIC_MODE_MAP_DISABLED, +}; struct kvm_apic_map { struct rcu_head rcu; - u8 mode; + enum kvm_apic_logical_mode logical_mode; u32 max_apic_id; union { struct kvm_lapic *xapic_flat_map[8]; @@ -984,6 +1149,7 @@ enum hv_tsc_page_status { HV_TSC_PAGE_BROKEN, }; +#ifdef CONFIG_KVM_HYPERV /* Hyper-V emulation context */ struct kvm_hv { struct mutex hv_lock; @@ -1003,6 +1169,7 @@ struct kvm_hv { u64 hv_reenlightenment_control; u64 hv_tsc_emulation_control; u64 hv_tsc_emulation_status; + u64 hv_invtsc_control; /* How many vCPUs have VP index != vCPU index */ atomic_t num_mismatched_vp_indexes; @@ -1013,9 +1180,11 @@ struct kvm_hv { */ unsigned int synic_auto_eoi_used; - struct hv_partition_assist_pg *hv_pa_pg; struct kvm_hv_syndbg hv_syndbg; + + bool xsaves_xsavec_checked; }; +#endif struct msr_bitmap_range { u32 flags; @@ -1024,15 +1193,21 @@ struct msr_bitmap_range { unsigned long *bitmap; }; +#ifdef CONFIG_KVM_XEN /* Xen emulation context */ struct kvm_xen { + struct mutex xen_lock; u32 xen_version; bool long_mode; + bool runstate_update_flag; u8 upcall_vector; struct gfn_to_pfn_cache shinfo_cache; struct idr evtchn_ports; unsigned long poll_mask[BITS_TO_LONGS(KVM_MAX_VCPUS)]; + + struct kvm_xen_hvm_config hvm_config; }; +#endif enum kvm_irqchip_mode { KVM_IRQCHIP_NONE, @@ -1046,6 +1221,18 @@ struct kvm_x86_msr_filter { struct msr_bitmap_range ranges[16]; }; +struct kvm_x86_pmu_event_filter { + __u32 action; + __u32 nevents; + __u32 fixed_counter_bitmap; + __u32 flags; + __u32 nr_includes; + __u32 nr_excludes; + __u64 *includes; + __u64 *excludes; + __u64 events[]; +}; + enum kvm_apicv_inhibit { /********************************************************************/ @@ -1056,7 +1243,7 @@ enum kvm_apicv_inhibit { * APIC acceleration is disabled by a module parameter * and/or not supported in hardware. 
*/ - APICV_INHIBIT_REASON_DISABLE, + APICV_INHIBIT_REASON_DISABLED, /* * APIC acceleration is inhibited because AutoEOI feature is @@ -1077,6 +1264,12 @@ enum kvm_apicv_inhibit { APICV_INHIBIT_REASON_BLOCKIRQ, /* + * APICv is disabled because not all vCPUs have a 1:1 mapping between + * APIC ID and vCPU, _and_ KVM is not applying its x2APIC hotplug hack. + */ + APICV_INHIBIT_REASON_PHYSICAL_ID_ALIASED, + + /* * For simplicity, the APIC acceleration is inhibited * first time either APIC ID or APIC base are changed by the guest * from their reset values. @@ -1111,28 +1304,63 @@ enum kvm_apicv_inhibit { APICV_INHIBIT_REASON_PIT_REINJ, /* - * AVIC is inhibited because the guest has x2apic in its CPUID. + * AVIC is disabled because SEV doesn't support it. */ - APICV_INHIBIT_REASON_X2APIC, + APICV_INHIBIT_REASON_SEV, /* - * AVIC is disabled because SEV doesn't support it. + * AVIC is disabled because not all vCPUs with a valid LDR have a 1:1 + * mapping between logical ID and vCPU. */ - APICV_INHIBIT_REASON_SEV, + APICV_INHIBIT_REASON_LOGICAL_ID_ALIASED, + + NR_APICV_INHIBIT_REASONS, }; +#define __APICV_INHIBIT_REASON(reason) \ + { BIT(APICV_INHIBIT_REASON_##reason), #reason } + +#define APICV_INHIBIT_REASONS \ + __APICV_INHIBIT_REASON(DISABLED), \ + __APICV_INHIBIT_REASON(HYPERV), \ + __APICV_INHIBIT_REASON(ABSENT), \ + __APICV_INHIBIT_REASON(BLOCKIRQ), \ + __APICV_INHIBIT_REASON(PHYSICAL_ID_ALIASED), \ + __APICV_INHIBIT_REASON(APIC_ID_MODIFIED), \ + __APICV_INHIBIT_REASON(APIC_BASE_MODIFIED), \ + __APICV_INHIBIT_REASON(NESTED), \ + __APICV_INHIBIT_REASON(IRQWIN), \ + __APICV_INHIBIT_REASON(PIT_REINJ), \ + __APICV_INHIBIT_REASON(SEV), \ + __APICV_INHIBIT_REASON(LOGICAL_ID_ALIASED) + struct kvm_arch { unsigned long n_used_mmu_pages; unsigned long n_requested_mmu_pages; unsigned long n_max_mmu_pages; unsigned int indirect_shadow_pages; u8 mmu_valid_gen; + u8 vm_type; + bool has_private_mem; + bool has_protected_state; + bool pre_fault_allowed; struct hlist_head 
mmu_page_hash[KVM_NUM_MMU_PAGES]; struct list_head active_mmu_pages; - struct list_head zapped_obsolete_pages; - struct list_head lpage_disallowed_mmu_pages; - struct kvm_page_track_notifier_node mmu_sp_tracker; + /* + * A list of kvm_mmu_page structs that, if zapped, could possibly be + * replaced by an NX huge page. A shadow page is on this list if its + * existence disallows an NX huge page (nx_huge_page_disallowed is set) + * and there are no other conditions that prevent a huge page, e.g. + * the backing host page is huge, dirtly logging is not enabled for its + * memslot, etc... Note, zapping shadow pages on this list doesn't + * guarantee an NX huge page will be created in its stead, e.g. if the + * guest attempts to execute from the region then KVM obviously can't + * create an NX huge page (without hanging the guest). + */ + struct list_head possible_nx_huge_pages; +#ifdef CONFIG_KVM_EXTERNAL_WRITE_TRACKING struct kvm_page_track_notifier_head track_notifier_head; +#endif /* * Protects marking pages unsync during page faults, as TDP MMU page * faults only take mmu_lock for read. 
For simplicity, the unsync @@ -1141,9 +1369,8 @@ struct kvm_arch { */ spinlock_t mmu_unsync_pages_lock; - struct list_head assigned_dev_head; - struct iommu_domain *iommu_domain; - bool iommu_noncoherent; + u64 shadow_mmio_value; + #define __KVM_HAVE_ARCH_NONCOHERENT_DMA atomic_t noncoherent_dma_count; #define __KVM_HAVE_ARCH_ASSIGNED_DEVICE @@ -1156,10 +1383,11 @@ struct kvm_arch { struct kvm_apic_map __rcu *apic_map; atomic_t apic_map_dirty; - /* Protects apic_access_memslot_enabled and apicv_inhibit_reasons */ - struct rw_semaphore apicv_update_lock; - bool apic_access_memslot_enabled; + bool apic_access_memslot_inhibited; + + /* Protects apicv_inhibit_reasons */ + struct rw_semaphore apicv_update_lock; unsigned long apicv_inhibit_reasons; gpa_t wall_clock; @@ -1188,6 +1416,8 @@ struct kvm_arch { int nr_vcpus_matched_tsc; u32 default_tsc_khz; + bool user_set_tsc; + u64 apic_bus_cycle_ns; seqcount_raw_spinlock_t pvclock_sc; bool use_master_clock; @@ -1196,20 +1426,22 @@ struct kvm_arch { struct delayed_work kvmclock_update_work; struct delayed_work kvmclock_sync_work; - struct kvm_xen_hvm_config xen_hvm_config; - /* reads protected by irq_srcu, writes by irq_lock */ struct hlist_head mask_notifier_list; +#ifdef CONFIG_KVM_HYPERV struct kvm_hv hyperv; +#endif + +#ifdef CONFIG_KVM_XEN struct kvm_xen xen; +#endif bool backwards_tsc_observed; bool boot_vcpu_runs_old_kvmclock; u32 bsp_vcpu_id; u64 disabled_quirks; - int cpu_dirty_logging_count; enum kvm_irqchip_mode irqchip_mode; u8 nr_reserved_ioapic_pins; @@ -1222,8 +1454,13 @@ struct kvm_arch { bool guest_can_read_msr_platform_info; bool exception_payload_enabled; + bool triple_fault_event; + bool bus_lock_detection_enabled; bool enable_pmu; + + u32 notify_window; + u32 notify_vmexit_flags; /* * If exit_on_emulation_error is set, and the in-kernel instruction * emulator fails to emulate an instruction, allow userspace @@ -1240,18 +1477,19 @@ struct kvm_arch { /* Guest can access the SGX PROVISIONKEY. 
*/ bool sgx_provisioning_allowed; - struct kvm_pmu_event_filter __rcu *pmu_event_filter; - struct task_struct *nx_lpage_recovery_thread; + struct kvm_x86_pmu_event_filter __rcu *pmu_event_filter; + struct vhost_task *nx_huge_page_recovery_thread; + u64 nx_huge_page_last; + struct once nx_once; #ifdef CONFIG_X86_64 +#ifdef CONFIG_KVM_PROVE_MMU /* - * Whether the TDP MMU is enabled for this VM. This contains a - * snapshot of the TDP MMU module parameter from when the VM was - * created and remains unchanged for the life of the VM. If this is - * true, TDP MMU handler functions will run for various MMU - * operations. + * The number of TDP MMU pages across all roots. Used only to sanity + * check that KVM isn't leaking TDP MMU pages. */ - bool tdp_mmu_enabled; + atomic64_t tdp_mmu_pages; +#endif /* * List of struct kvm_mmu_pages being used as roots. @@ -1259,12 +1497,12 @@ struct kvm_arch { * tdp_mmu_page set. * * For reads, this list is protected by: + * RCU alone or * the MMU lock in read mode + RCU or * the MMU lock in write mode * - * For writes, this list is protected by: - * the MMU lock in read mode + the tdp_mmu_pages_lock or - * the MMU lock in write mode + * For writes, this list is protected by tdp_mmu_pages_lock; see + * below for the details. * * Roots will remain in the list until their tdp_mmu_root_count * drops to zero, at which point the thread that decremented the @@ -1274,26 +1512,19 @@ struct kvm_arch { struct list_head tdp_mmu_roots; /* - * List of struct kvmp_mmu_pages not being used as roots. - * All struct kvm_mmu_pages in the list should have - * tdp_mmu_page set and a tdp_mmu_root_count of 0. 
- */ - struct list_head tdp_mmu_pages; - - /* * Protects accesses to the following fields when the MMU lock * is held in read mode: * - tdp_mmu_roots (above) - * - tdp_mmu_pages (above) - * - the link field of struct kvm_mmu_pages used by the TDP MMU - * - lpage_disallowed_mmu_pages - * - the lpage_disallowed_link field of struct kvm_mmu_pages used + * - the link field of kvm_mmu_page structs used by the TDP MMU + * - possible_nx_huge_pages; + * - the possible_nx_huge_page_link field of kvm_mmu_page structs used * by the TDP MMU - * It is acceptable, but not necessary, to acquire this lock when - * the thread holds the MMU lock in write mode. + * Because the lock is only taken within the MMU lock, strictly + * speaking it is redundant to acquire this lock when the thread + * holds the MMU lock in write mode. However it often simplifies + * the code to do so. */ spinlock_t tdp_mmu_pages_lock; - struct workqueue_struct *tdp_mmu_zap_wq; #endif /* CONFIG_X86_64 */ /* @@ -1303,10 +1534,59 @@ struct kvm_arch { */ bool shadow_root_allocated; +#ifdef CONFIG_KVM_EXTERNAL_WRITE_TRACKING + /* + * If set, the VM has (or had) an external write tracking user, and + * thus all write tracking metadata has been allocated, even if KVM + * itself isn't using write tracking. + */ + bool external_write_tracking_enabled; +#endif + #if IS_ENABLED(CONFIG_HYPERV) hpa_t hv_root_tdp; spinlock_t hv_root_tdp_lock; + struct hv_partition_assist_pg *hv_pa_pg; #endif + /* + * VM-scope maximum vCPU ID. Used to determine the size of structures + * that increase along with the maximum vCPU ID, in which case, using + * the global KVM_MAX_VCPU_IDS may lead to significant memory waste. + */ + u32 max_vcpu_ids; + + bool disable_nx_huge_pages; + + /* + * Memory caches used to allocate shadow pages when performing eager + * page splitting. No need for a shadowed_info_cache since eager page + * splitting only allocates direct shadow pages. + * + * Protected by kvm->slots_lock. 
+ */ + struct kvm_mmu_memory_cache split_shadow_page_cache; + struct kvm_mmu_memory_cache split_page_header_cache; + + /* + * Memory cache used to allocate pte_list_desc structs while splitting + * huge pages. In the worst case, to split one huge page, 512 + * pte_list_desc structs are needed to add each lower level leaf sptep + * to the rmap plus 1 to extend the parent_ptes rmap of the lower level + * page table. + * + * Protected by kvm->slots_lock. + */ +#define SPLIT_DESC_CACHE_MIN_NR_OBJECTS (SPTE_ENT_PER_PAGE + 1) + struct kvm_mmu_memory_cache split_desc_cache; + + gfn_t gfn_direct_bits; + + /* + * Size of the CPU's dirty log buffer, i.e. VMX's PML buffer. A Zero + * value indicates CPU dirty logging is unsupported or disabled in + * current VM. + */ + int cpu_dirty_log_size; }; struct kvm_vm_stat { @@ -1367,6 +1647,7 @@ struct kvm_vcpu_stat { u64 preemption_reported; u64 preemption_other; u64 guest_mode; + u64 notify_window_exits; }; struct x86_instruction_info; @@ -1396,8 +1677,12 @@ static inline u16 kvm_lapic_irq_dest_mode(bool dest_mode_logical) struct kvm_x86_ops { const char *name; - int (*hardware_enable)(void); - void (*hardware_disable)(void); + int (*check_processor_compatibility)(void); + + int (*enable_virtualization_cpu)(void); + void (*disable_virtualization_cpu)(void); + cpu_emergency_virt_cb *emergency_disable_virtualization_cpu; + void (*hardware_unsetup)(void); bool (*has_emulated_msr)(struct kvm *kvm, u32 index); void (*vcpu_after_set_cpuid)(struct kvm_vcpu *vcpu); @@ -1405,8 +1690,10 @@ struct kvm_x86_ops { unsigned int vm_size; int (*vm_init)(struct kvm *kvm); void (*vm_destroy)(struct kvm *kvm); + void (*vm_pre_destroy)(struct kvm *kvm); /* Create, but do not attach this VCPU */ + int (*vcpu_precreate)(struct kvm *kvm); int (*vcpu_create)(struct kvm_vcpu *vcpu); void (*vcpu_free)(struct kvm_vcpu *vcpu); void (*vcpu_reset)(struct kvm_vcpu *vcpu, bool init_event); @@ -1422,12 +1709,14 @@ struct kvm_x86_ops { void (*get_segment)(struct 
kvm_vcpu *vcpu, struct kvm_segment *var, int seg); int (*get_cpl)(struct kvm_vcpu *vcpu); + int (*get_cpl_no_cache)(struct kvm_vcpu *vcpu); void (*set_segment)(struct kvm_vcpu *vcpu, struct kvm_segment *var, int seg); void (*get_cs_db_l_bits)(struct kvm_vcpu *vcpu, int *db, int *l); + bool (*is_valid_cr0)(struct kvm_vcpu *vcpu, unsigned long cr0); void (*set_cr0)(struct kvm_vcpu *vcpu, unsigned long cr0); void (*post_set_cr3)(struct kvm_vcpu *vcpu, unsigned long cr3); - bool (*is_valid_cr4)(struct kvm_vcpu *vcpu, unsigned long cr0); + bool (*is_valid_cr4)(struct kvm_vcpu *vcpu, unsigned long cr4); void (*set_cr4)(struct kvm_vcpu *vcpu, unsigned long cr4); int (*set_efer)(struct kvm_vcpu *vcpu, u64 efer); void (*get_idt)(struct kvm_vcpu *vcpu, struct desc_ptr *dt); @@ -1435,6 +1724,7 @@ struct kvm_x86_ops { void (*get_gdt)(struct kvm_vcpu *vcpu, struct desc_ptr *dt); void (*set_gdt)(struct kvm_vcpu *vcpu, struct desc_ptr *dt); void (*sync_dirty_debug_regs)(struct kvm_vcpu *vcpu); + void (*set_dr6)(struct kvm_vcpu *vcpu, unsigned long value); void (*set_dr7)(struct kvm_vcpu *vcpu, unsigned long value); void (*cache_reg)(struct kvm_vcpu *vcpu, enum kvm_reg reg); unsigned long (*get_rflags)(struct kvm_vcpu *vcpu); @@ -1443,9 +1733,11 @@ struct kvm_x86_ops { void (*flush_tlb_all)(struct kvm_vcpu *vcpu); void (*flush_tlb_current)(struct kvm_vcpu *vcpu); - int (*tlb_remote_flush)(struct kvm *kvm); - int (*tlb_remote_flush_with_range)(struct kvm *kvm, - struct kvm_tlb_range *range); +#if IS_ENABLED(CONFIG_HYPERV) + int (*flush_remote_tlbs)(struct kvm *kvm); + int (*flush_remote_tlbs_range)(struct kvm *kvm, gfn_t gfn, + gfn_t nr_pages); +#endif /* * Flush any TLB entries associated with the given GVA. 
@@ -1462,7 +1754,8 @@ struct kvm_x86_ops { void (*flush_tlb_guest)(struct kvm_vcpu *vcpu); int (*vcpu_pre_run)(struct kvm_vcpu *vcpu); - enum exit_fastpath_completion (*vcpu_run)(struct kvm_vcpu *vcpu); + enum exit_fastpath_completion (*vcpu_run)(struct kvm_vcpu *vcpu, + bool force_immediate_exit); int (*handle_exit)(struct kvm_vcpu *vcpu, enum exit_fastpath_completion exit_fastpath); int (*skip_emulated_instruction)(struct kvm_vcpu *vcpu); @@ -1471,22 +1764,30 @@ struct kvm_x86_ops { u32 (*get_interrupt_shadow)(struct kvm_vcpu *vcpu); void (*patch_hypercall)(struct kvm_vcpu *vcpu, unsigned char *hypercall_addr); - void (*inject_irq)(struct kvm_vcpu *vcpu); + void (*inject_irq)(struct kvm_vcpu *vcpu, bool reinjected); void (*inject_nmi)(struct kvm_vcpu *vcpu); - void (*queue_exception)(struct kvm_vcpu *vcpu); + void (*inject_exception)(struct kvm_vcpu *vcpu); void (*cancel_injection)(struct kvm_vcpu *vcpu); int (*interrupt_allowed)(struct kvm_vcpu *vcpu, bool for_injection); int (*nmi_allowed)(struct kvm_vcpu *vcpu, bool for_injection); bool (*get_nmi_mask)(struct kvm_vcpu *vcpu); void (*set_nmi_mask)(struct kvm_vcpu *vcpu, bool masked); + /* Whether or not a virtual NMI is pending in hardware. */ + bool (*is_vnmi_pending)(struct kvm_vcpu *vcpu); + /* + * Attempt to pend a virtual NMI in hardware. Returns %true on success + * to allow using static_call_ret0 as the fallback. 
+ */ + bool (*set_vnmi_pending)(struct kvm_vcpu *vcpu); void (*enable_nmi_window)(struct kvm_vcpu *vcpu); void (*enable_irq_window)(struct kvm_vcpu *vcpu); void (*update_cr8_intercept)(struct kvm_vcpu *vcpu, int tpr, int irr); - bool (*check_apicv_inhibit_reasons)(enum kvm_apicv_inhibit reason); + + const bool x2apic_icr_is_split; + const unsigned long required_apicv_inhibits; + bool allow_apicv_in_x2apic_without_x2apic_virtualization; void (*refresh_apicv_exec_ctrl)(struct kvm_vcpu *vcpu); - void (*hwapic_irr_update)(struct kvm_vcpu *vcpu, int max_irr); void (*hwapic_isr_update)(struct kvm_vcpu *vcpu, int isr); - bool (*guest_apic_has_interrupt)(struct kvm_vcpu *vcpu); void (*load_eoi_exitmap)(struct kvm_vcpu *vcpu, u64 *eoi_exit_bitmap); void (*set_virtual_apic_mode)(struct kvm_vcpu *vcpu); void (*set_apic_access_page_addr)(struct kvm_vcpu *vcpu); @@ -1495,25 +1796,43 @@ struct kvm_x86_ops { int (*sync_pir_to_irr)(struct kvm_vcpu *vcpu); int (*set_tss_addr)(struct kvm *kvm, unsigned int addr); int (*set_identity_map_addr)(struct kvm *kvm, u64 ident_addr); - u64 (*get_mt_mask)(struct kvm_vcpu *vcpu, gfn_t gfn, bool is_mmio); + u8 (*get_mt_mask)(struct kvm_vcpu *vcpu, gfn_t gfn, bool is_mmio); void (*load_mmu_pgd)(struct kvm_vcpu *vcpu, hpa_t root_hpa, int root_level); + /* Update external mapping with page table link. */ + int (*link_external_spt)(struct kvm *kvm, gfn_t gfn, enum pg_level level, + void *external_spt); + /* Update the external page table from spte getting set. */ + int (*set_external_spte)(struct kvm *kvm, gfn_t gfn, enum pg_level level, + kvm_pfn_t pfn_for_gfn); + + /* Update external page tables for page table about to be freed. */ + int (*free_external_spt)(struct kvm *kvm, gfn_t gfn, enum pg_level level, + void *external_spt); + + /* Update external page table from spte getting removed, and flush TLB. 
*/ + int (*remove_external_spte)(struct kvm *kvm, gfn_t gfn, enum pg_level level, + kvm_pfn_t pfn_for_gfn); + bool (*has_wbinvd_exit)(void); u64 (*get_l2_tsc_offset)(struct kvm_vcpu *vcpu); u64 (*get_l2_tsc_multiplier)(struct kvm_vcpu *vcpu); - void (*write_tsc_offset)(struct kvm_vcpu *vcpu, u64 offset); - void (*write_tsc_multiplier)(struct kvm_vcpu *vcpu, u64 multiplier); + void (*write_tsc_offset)(struct kvm_vcpu *vcpu); + void (*write_tsc_multiplier)(struct kvm_vcpu *vcpu); /* - * Retrieve somewhat arbitrary exit information. Intended to + * Retrieve somewhat arbitrary exit/entry information. Intended to * be used only from within tracepoints or error paths. */ void (*get_exit_info)(struct kvm_vcpu *vcpu, u32 *reason, u64 *info1, u64 *info2, - u32 *exit_int_info, u32 *exit_int_info_err_code); + u32 *intr_info, u32 *error_code); + + void (*get_entry_info)(struct kvm_vcpu *vcpu, + u32 *intr_info, u32 *error_code); int (*check_intercept)(struct kvm_vcpu *vcpu, struct x86_instruction_info *info, @@ -1521,15 +1840,6 @@ struct kvm_x86_ops { struct x86_exception *exception); void (*handle_exit_irqoff)(struct kvm_vcpu *vcpu); - void (*request_immediate_exit)(struct kvm_vcpu *vcpu); - - void (*sched_in)(struct kvm_vcpu *kvm, int cpu); - - /* - * Size of the CPU's dirty log buffer, i.e. VMX's PML buffer. A zero - * value indicates CPU dirty logging is unsupported or disabled. 
- */ - int cpu_dirty_log_size; void (*update_cpu_dirty_logging)(struct kvm_vcpu *vcpu); const struct kvm_x86_nested_ops *nested_ops; @@ -1540,8 +1850,10 @@ struct kvm_x86_ops { int (*pi_update_irte)(struct kvm *kvm, unsigned int host_irq, uint32_t guest_irq, bool set); void (*pi_start_assignment)(struct kvm *kvm); + void (*apicv_pre_state_restore)(struct kvm_vcpu *vcpu); void (*apicv_post_state_restore)(struct kvm_vcpu *vcpu); bool (*dy_apicv_has_pending_interrupt)(struct kvm_vcpu *vcpu); + bool (*protected_apic_has_interrupt)(struct kvm_vcpu *vcpu); int (*set_hv_timer)(struct kvm_vcpu *vcpu, u64 guest_deadline_tsc, bool *expired); @@ -1549,25 +1861,29 @@ struct kvm_x86_ops { void (*setup_mce)(struct kvm_vcpu *vcpu); +#ifdef CONFIG_KVM_SMM int (*smi_allowed)(struct kvm_vcpu *vcpu, bool for_injection); - int (*enter_smm)(struct kvm_vcpu *vcpu, char *smstate); - int (*leave_smm)(struct kvm_vcpu *vcpu, const char *smstate); + int (*enter_smm)(struct kvm_vcpu *vcpu, union kvm_smram *smram); + int (*leave_smm)(struct kvm_vcpu *vcpu, const union kvm_smram *smram); void (*enable_smi_window)(struct kvm_vcpu *vcpu); +#endif + int (*dev_get_attr)(u32 group, u64 attr, u64 *val); int (*mem_enc_ioctl)(struct kvm *kvm, void __user *argp); + int (*vcpu_mem_enc_ioctl)(struct kvm_vcpu *vcpu, void __user *argp); int (*mem_enc_register_region)(struct kvm *kvm, struct kvm_enc_region *argp); int (*mem_enc_unregister_region)(struct kvm *kvm, struct kvm_enc_region *argp); int (*vm_copy_enc_context_from)(struct kvm *kvm, unsigned int source_fd); int (*vm_move_enc_context_from)(struct kvm *kvm, unsigned int source_fd); void (*guest_memory_reclaimed)(struct kvm *kvm); - int (*get_msr_feature)(struct kvm_msr_entry *entry); + int (*get_feature_msr)(u32 msr, u64 *data); - bool (*can_emulate_instruction)(struct kvm_vcpu *vcpu, int emul_type, - void *insn, int insn_len); + int (*check_emulate_instruction)(struct kvm_vcpu *vcpu, int emul_type, + void *insn, int insn_len); bool 
(*apic_init_signal_blocked)(struct kvm_vcpu *vcpu); - int (*enable_direct_tlbflush)(struct kvm_vcpu *vcpu); + int (*enable_l2_tlb_flush)(struct kvm_vcpu *vcpu); void (*migrate_timers)(struct kvm_vcpu *vcpu); void (*msr_filter_changed)(struct kvm_vcpu *vcpu); @@ -1579,14 +1895,20 @@ struct kvm_x86_ops { * Returns vCPU specific APICv inhibit reasons */ unsigned long (*vcpu_get_apicv_inhibit_reasons)(struct kvm_vcpu *vcpu); + + gva_t (*get_untagged_addr)(struct kvm_vcpu *vcpu, gva_t gva, unsigned int flags); + void *(*alloc_apic_backing_page)(struct kvm_vcpu *vcpu); + int (*gmem_prepare)(struct kvm *kvm, kvm_pfn_t pfn, gfn_t gfn, int max_order); + void (*gmem_invalidate)(kvm_pfn_t start, kvm_pfn_t end); + int (*private_max_mapping_level)(struct kvm *kvm, kvm_pfn_t pfn); }; struct kvm_x86_nested_ops { void (*leave_nested)(struct kvm_vcpu *vcpu); + bool (*is_exception_vmexit)(struct kvm_vcpu *vcpu, u8 vector, + u32 error_code); int (*check_events)(struct kvm_vcpu *vcpu); - bool (*handle_page_fault_workaround)(struct kvm_vcpu *vcpu, - struct x86_exception *fault); - bool (*hv_timer_pending)(struct kvm_vcpu *vcpu); + bool (*has_events)(struct kvm_vcpu *vcpu, bool for_injection); void (*triple_fault)(struct kvm_vcpu *vcpu); int (*get_state)(struct kvm_vcpu *vcpu, struct kvm_nested_state __user *user_kvm_nested_state, @@ -1600,12 +1922,10 @@ struct kvm_x86_nested_ops { int (*enable_evmcs)(struct kvm_vcpu *vcpu, uint16_t *vmcs_version); uint16_t (*get_evmcs_version)(struct kvm_vcpu *vcpu); + void (*hv_inject_synthetic_vmexit_post_tlb_flush)(struct kvm_vcpu *vcpu); }; struct kvm_x86_init_ops { - int (*cpu_has_kvm_support)(void); - int (*disabled_by_bios)(void); - int (*check_processor_compatibility)(void); int (*hardware_setup)(void); unsigned int (*handle_intel_pt_intr)(void); @@ -1618,20 +1938,27 @@ struct kvm_arch_async_pf { gfn_t gfn; unsigned long cr3; bool direct_map; + u64 error_code; }; extern u32 __read_mostly kvm_nr_uret_msrs; -extern u64 __read_mostly host_efer; 
extern bool __read_mostly allow_smaller_maxphyaddr; extern bool __read_mostly enable_apicv; +extern bool __read_mostly enable_device_posted_irqs; extern struct kvm_x86_ops kvm_x86_ops; +#define kvm_x86_call(func) static_call(kvm_x86_##func) +#define kvm_pmu_call(func) static_call(kvm_x86_pmu_##func) + #define KVM_X86_OP(func) \ DECLARE_STATIC_CALL(kvm_x86_##func, *(((struct kvm_x86_ops *)0)->func)); #define KVM_X86_OP_OPTIONAL KVM_X86_OP #define KVM_X86_OP_OPTIONAL_RET0 KVM_X86_OP #include <asm/kvm-x86-ops.h> +int kvm_x86_vendor_init(struct kvm_x86_init_ops *ops); +void kvm_x86_vendor_exit(void); + #define __KVM_HAVE_ARCH_VM_ALLOC static inline struct kvm *kvm_arch_alloc_vm(void) { @@ -1641,28 +1968,51 @@ static inline struct kvm *kvm_arch_alloc_vm(void) #define __KVM_HAVE_ARCH_VM_FREE void kvm_arch_free_vm(struct kvm *kvm); -#define __KVM_HAVE_ARCH_FLUSH_REMOTE_TLB -static inline int kvm_arch_flush_remote_tlb(struct kvm *kvm) +#if IS_ENABLED(CONFIG_HYPERV) +#define __KVM_HAVE_ARCH_FLUSH_REMOTE_TLBS +static inline int kvm_arch_flush_remote_tlbs(struct kvm *kvm) { - if (kvm_x86_ops.tlb_remote_flush && - !static_call(kvm_x86_tlb_remote_flush)(kvm)) + if (kvm_x86_ops.flush_remote_tlbs && + !kvm_x86_call(flush_remote_tlbs)(kvm)) return 0; else return -ENOTSUPP; } +#define __KVM_HAVE_ARCH_FLUSH_REMOTE_TLBS_RANGE +static inline int kvm_arch_flush_remote_tlbs_range(struct kvm *kvm, gfn_t gfn, + u64 nr_pages) +{ + if (!kvm_x86_ops.flush_remote_tlbs_range) + return -EOPNOTSUPP; + + return kvm_x86_call(flush_remote_tlbs_range)(kvm, gfn, nr_pages); +} +#endif /* CONFIG_HYPERV */ + +enum kvm_intr_type { + /* Values are arbitrary, but must be non-zero. */ + KVM_HANDLING_IRQ = 1, + KVM_HANDLING_NMI, +}; + +/* Enable perf NMI and timer modes to work, and minimise false positives. 
*/ #define kvm_arch_pmi_in_guest(vcpu) \ - ((vcpu) && (vcpu)->arch.handling_intr_from_guest) + ((vcpu) && (vcpu)->arch.handling_intr_from_guest && \ + (!!in_nmi() == ((vcpu)->arch.handling_intr_from_guest == KVM_HANDLING_NMI))) -void kvm_mmu_x86_module_init(void); +void __init kvm_mmu_x86_module_init(void); int kvm_mmu_vendor_module_init(void); void kvm_mmu_vendor_module_exit(void); void kvm_mmu_destroy(struct kvm_vcpu *vcpu); int kvm_mmu_create(struct kvm_vcpu *vcpu); -int kvm_mmu_init_vm(struct kvm *kvm); +void kvm_mmu_init_vm(struct kvm *kvm); void kvm_mmu_uninit_vm(struct kvm *kvm); +void kvm_mmu_init_memslot_memory_attributes(struct kvm *kvm, + struct kvm_memory_slot *slot); + void kvm_mmu_after_set_cpuid(struct kvm_vcpu *vcpu); void kvm_mmu_reset_context(struct kvm_vcpu *vcpu); void kvm_mmu_slot_remove_write_access(struct kvm *kvm, @@ -1675,13 +2025,13 @@ void kvm_mmu_try_split_huge_pages(struct kvm *kvm, const struct kvm_memory_slot *memslot, u64 start, u64 end, int target_level); -void kvm_mmu_zap_collapsible_sptes(struct kvm *kvm, - const struct kvm_memory_slot *memslot); +void kvm_mmu_recover_huge_pages(struct kvm *kvm, + const struct kvm_memory_slot *memslot); void kvm_mmu_slot_leaf_clear_dirty(struct kvm *kvm, const struct kvm_memory_slot *memslot); -void kvm_mmu_zap_all(struct kvm *kvm); void kvm_mmu_invalidate_mmio_sptes(struct kvm *kvm, u64 gen); void kvm_mmu_change_mmu_pages(struct kvm *kvm, unsigned long kvm_nr_mmu_pages); +void kvm_zap_gfn_range(struct kvm *kvm, gfn_t gfn_start, gfn_t gfn_end); int load_pdptrs(struct kvm_vcpu *vcpu, unsigned long cr3); @@ -1705,21 +2055,6 @@ extern bool tdp_enabled; u64 vcpu_tsc_khz(struct kvm_vcpu *vcpu); -/* control of guest tsc rate supported? 
*/ -extern bool kvm_has_tsc_control; -/* maximum supported tsc_khz for guests */ -extern u32 kvm_max_guest_tsc_khz; -/* number of bits of the fractional part of the TSC scaling ratio */ -extern u8 kvm_tsc_scaling_ratio_frac_bits; -/* maximum allowed value of TSC scaling ratio */ -extern u64 kvm_max_tsc_scaling_ratio; -/* 1ull << kvm_tsc_scaling_ratio_frac_bits */ -extern u64 kvm_default_tsc_scaling_ratio; -/* bus lock detection supported? */ -extern bool kvm_has_bus_lock_exit; - -extern u64 kvm_mce_cap_supported; - /* * EMULTYPE_NO_DECODE - Set when re-emulating an instruction (after completing * userspace I/O) to indicate that the emulation context @@ -1752,12 +2087,31 @@ extern u64 kvm_mce_cap_supported; * VMware backdoor emulation handles select instructions * and reinjects the #GP for all other cases. * - * EMULTYPE_PF - Set when emulating MMIO by way of an intercepted #PF, in which - * case the CR2/GPA value pass on the stack is valid. + * EMULTYPE_PF - Set when an intercepted #PF triggers the emulation, in which case + * the CR2/GPA value pass on the stack is valid. * * EMULTYPE_COMPLETE_USER_EXIT - Set when the emulator should update interruptibility * state and inject single-step #DBs after skipping * an instruction (after completing userspace I/O). + * + * EMULTYPE_WRITE_PF_TO_SP - Set when emulating an intercepted page fault that + * is attempting to write a gfn that contains one or + * more of the PTEs used to translate the write itself, + * and the owning page table is being shadowed by KVM. + * If emulation of the faulting instruction fails and + * this flag is set, KVM will exit to userspace instead + * of retrying emulation as KVM cannot make forward + * progress. + * + * If emulation fails for a write to guest page tables, + * KVM unprotects (zaps) the shadow page for the target + * gfn and resumes the guest to retry the non-emulatable + * instruction (on hardware). 
Unprotecting the gfn + * doesn't allow forward progress for a self-changing + * access because doing so also zaps the translation for + * the gfn, i.e. retrying the instruction will hit a + * !PRESENT fault, which results in a new shadow page + * and sends KVM back to square one. */ #define EMULTYPE_NO_DECODE (1 << 0) #define EMULTYPE_TRAP_UD (1 << 1) @@ -1767,6 +2121,12 @@ extern u64 kvm_mce_cap_supported; #define EMULTYPE_VMWARE_GP (1 << 5) #define EMULTYPE_PF (1 << 6) #define EMULTYPE_COMPLETE_USER_EXIT (1 << 7) +#define EMULTYPE_WRITE_PF_TO_SP (1 << 8) + +static inline bool kvm_can_emulate_event_vectoring(int emul_type) +{ + return !(emul_type & EMULTYPE_PF); +} int kvm_emulate_instruction(struct kvm_vcpu *vcpu, int emulation_type); int kvm_emulate_instruction_from_buffer(struct kvm_vcpu *vcpu, @@ -1775,8 +2135,12 @@ void __kvm_prepare_emulation_failure_exit(struct kvm_vcpu *vcpu, u64 *data, u8 ndata); void kvm_prepare_emulation_failure_exit(struct kvm_vcpu *vcpu); +void kvm_prepare_event_vectoring_exit(struct kvm_vcpu *vcpu, gpa_t gpa); + void kvm_enable_efer_bits(u64); bool kvm_valid_efer(struct kvm_vcpu *vcpu, u64 efer); +int kvm_get_msr_with_filter(struct kvm_vcpu *vcpu, u32 index, u64 *data); +int kvm_set_msr_with_filter(struct kvm_vcpu *vcpu, u32 index, u64 data); int __kvm_get_msr(struct kvm_vcpu *vcpu, u32 index, u64 *data, bool host_initiated); int kvm_get_msr(struct kvm_vcpu *vcpu, u32 index, u64 *data); int kvm_set_msr(struct kvm_vcpu *vcpu, u32 index, u64 data); @@ -1796,6 +2160,7 @@ int kvm_emulate_ap_reset_hold(struct kvm_vcpu *vcpu); int kvm_emulate_wbinvd(struct kvm_vcpu *vcpu); void kvm_get_segment(struct kvm_vcpu *vcpu, struct kvm_segment *var, int seg); +void kvm_set_segment(struct kvm_vcpu *vcpu, struct kvm_segment *var, int seg); int kvm_load_segment_descriptor(struct kvm_vcpu *vcpu, u16 selector, int seg); void kvm_vcpu_deliver_sipi_vector(struct kvm_vcpu *vcpu, u8 vector); @@ -1809,7 +2174,7 @@ int kvm_set_cr3(struct kvm_vcpu *vcpu, 
unsigned long cr3); int kvm_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4); int kvm_set_cr8(struct kvm_vcpu *vcpu, unsigned long cr8); int kvm_set_dr(struct kvm_vcpu *vcpu, int dr, unsigned long val); -void kvm_get_dr(struct kvm_vcpu *vcpu, int dr, unsigned long *val); +unsigned long kvm_get_dr(struct kvm_vcpu *vcpu, int dr); unsigned long kvm_get_cr8(struct kvm_vcpu *vcpu); void kvm_lmsw(struct kvm_vcpu *vcpu, unsigned long msw); int kvm_emulate_xsetbv(struct kvm_vcpu *vcpu); @@ -1824,10 +2189,10 @@ int kvm_emulate_rdpmc(struct kvm_vcpu *vcpu); void kvm_queue_exception(struct kvm_vcpu *vcpu, unsigned nr); void kvm_queue_exception_e(struct kvm_vcpu *vcpu, unsigned nr, u32 error_code); void kvm_queue_exception_p(struct kvm_vcpu *vcpu, unsigned nr, unsigned long payload); -void kvm_requeue_exception(struct kvm_vcpu *vcpu, unsigned nr); -void kvm_requeue_exception_e(struct kvm_vcpu *vcpu, unsigned nr, u32 error_code); +void kvm_requeue_exception(struct kvm_vcpu *vcpu, unsigned int nr, + bool has_error_code, u32 error_code); void kvm_inject_page_fault(struct kvm_vcpu *vcpu, struct x86_exception *fault); -bool kvm_inject_emulated_page_fault(struct kvm_vcpu *vcpu, +void kvm_inject_emulated_page_fault(struct kvm_vcpu *vcpu, struct x86_exception *fault); bool kvm_require_cpl(struct kvm_vcpu *vcpu, int required_cpl); bool kvm_require_dr(struct kvm_vcpu *vcpu, int dr); @@ -1844,25 +2209,28 @@ static inline int __kvm_irq_line_state(unsigned long *irq_state, return !!(*irq_state); } -#define KVM_MMU_ROOT_CURRENT BIT(0) -#define KVM_MMU_ROOT_PREVIOUS(i) BIT(1+i) -#define KVM_MMU_ROOTS_ALL (~0UL) - int kvm_pic_set_irq(struct kvm_pic *pic, int irq, int irq_source_id, int level); void kvm_pic_clear_all(struct kvm_pic *pic, int irq_source_id); void kvm_inject_nmi(struct kvm_vcpu *vcpu); +int kvm_get_nr_pending_nmis(struct kvm_vcpu *vcpu); void kvm_update_dr7(struct kvm_vcpu *vcpu); -int kvm_mmu_unprotect_page(struct kvm *kvm, gfn_t gfn); +bool 
__kvm_mmu_unprotect_gfn_and_retry(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa, + bool always_retry); + +static inline bool kvm_mmu_unprotect_gfn_and_retry(struct kvm_vcpu *vcpu, + gpa_t cr2_or_gpa) +{ + return __kvm_mmu_unprotect_gfn_and_retry(vcpu, cr2_or_gpa, false); +} + void kvm_mmu_free_roots(struct kvm *kvm, struct kvm_mmu *mmu, ulong roots_to_free); void kvm_mmu_free_guest_mode_roots(struct kvm *kvm, struct kvm_mmu *mmu); gpa_t kvm_mmu_gva_to_gpa_read(struct kvm_vcpu *vcpu, gva_t gva, struct x86_exception *exception); -gpa_t kvm_mmu_gva_to_gpa_fetch(struct kvm_vcpu *vcpu, gva_t gva, - struct x86_exception *exception); gpa_t kvm_mmu_gva_to_gpa_write(struct kvm_vcpu *vcpu, gva_t gva, struct x86_exception *exception); gpa_t kvm_mmu_gva_to_gpa_system(struct kvm_vcpu *vcpu, gva_t gva, @@ -1870,7 +2238,7 @@ gpa_t kvm_mmu_gva_to_gpa_system(struct kvm_vcpu *vcpu, gva_t gva, bool kvm_apicv_activated(struct kvm *kvm); bool kvm_vcpu_apicv_activated(struct kvm_vcpu *vcpu); -void kvm_vcpu_update_apicv(struct kvm_vcpu *vcpu); +void __kvm_vcpu_update_apicv(struct kvm_vcpu *vcpu); void __kvm_set_or_clear_apicv_inhibit(struct kvm *kvm, enum kvm_apicv_inhibit reason, bool set); void kvm_set_or_clear_apicv_inhibit(struct kvm *kvm, @@ -1888,19 +2256,27 @@ static inline void kvm_clear_apicv_inhibit(struct kvm *kvm, kvm_set_or_clear_apicv_inhibit(kvm, reason, false); } -int kvm_emulate_hypercall(struct kvm_vcpu *vcpu); - int kvm_mmu_page_fault(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa, u64 error_code, void *insn, int insn_len); +void kvm_mmu_print_sptes(struct kvm_vcpu *vcpu, gpa_t gpa, const char *msg); void kvm_mmu_invlpg(struct kvm_vcpu *vcpu, gva_t gva); -void kvm_mmu_invalidate_gva(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu, - gva_t gva, hpa_t root_hpa); +void kvm_mmu_invalidate_addr(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu, + u64 addr, unsigned long roots); void kvm_mmu_invpcid_gva(struct kvm_vcpu *vcpu, gva_t gva, unsigned long pcid); void kvm_mmu_new_pgd(struct kvm_vcpu 
*vcpu, gpa_t new_pgd); void kvm_configure_mmu(bool enable_tdp, int tdp_forced_root_level, int tdp_max_root_level, int tdp_huge_page_level); + +#ifdef CONFIG_KVM_PRIVATE_MEM +#define kvm_arch_has_private_mem(kvm) ((kvm)->arch.has_private_mem) +#else +#define kvm_arch_has_private_mem(kvm) false +#endif + +#define kvm_arch_has_readonly_mem(kvm) (!(kvm)->arch.has_protected_state) + static inline u16 kvm_read_ldt(void) { u16 ldt; @@ -1918,7 +2294,7 @@ static inline unsigned long read_msr(unsigned long msr) { u64 value; - rdmsrl(msr, value); + rdmsrq(msr, value); return value; } #endif @@ -1942,25 +2318,26 @@ enum { TASK_SWITCH_GATE = 3, }; -#define HF_GIF_MASK (1 << 0) -#define HF_NMI_MASK (1 << 3) -#define HF_IRET_MASK (1 << 4) -#define HF_GUEST_MASK (1 << 5) /* VCPU is in guest-mode */ -#define HF_SMM_MASK (1 << 6) -#define HF_SMM_INSIDE_NMI_MASK (1 << 7) +#define HF_GUEST_MASK (1 << 0) /* VCPU is in guest-mode */ -#define __KVM_VCPU_MULTIPLE_ADDRESS_SPACE -#define KVM_ADDRESS_SPACE_NUM 2 +#ifdef CONFIG_KVM_SMM +#define HF_SMM_MASK (1 << 1) +#define HF_SMM_INSIDE_NMI_MASK (1 << 2) -#define kvm_arch_vcpu_memslots_id(vcpu) ((vcpu)->arch.hflags & HF_SMM_MASK ? 1 : 0) -#define kvm_memslots_for_spte_role(kvm, role) __kvm_memslots(kvm, (role).smm) - -#define KVM_ARCH_WANT_MMU_NOTIFIER +# define KVM_MAX_NR_ADDRESS_SPACES 2 +/* SMM is currently unsupported for guests with private memory. */ +# define kvm_arch_nr_memslot_as_ids(kvm) (kvm_arch_has_private_mem(kvm) ? 1 : 2) +# define kvm_arch_vcpu_memslots_id(vcpu) ((vcpu)->arch.hflags & HF_SMM_MASK ? 
1 : 0) +# define kvm_memslots_for_spte_role(kvm, role) __kvm_memslots(kvm, (role).smm) +#else +# define kvm_memslots_for_spte_role(kvm, role) __kvm_memslots(kvm, 0) +#endif int kvm_cpu_has_injectable_intr(struct kvm_vcpu *v); int kvm_cpu_has_interrupt(struct kvm_vcpu *vcpu); int kvm_cpu_has_extint(struct kvm_vcpu *v); int kvm_arch_interrupt_allowed(struct kvm_vcpu *vcpu); +int kvm_cpu_get_extint(struct kvm_vcpu *v); int kvm_cpu_get_interrupt(struct kvm_vcpu *v); void kvm_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event); @@ -1971,6 +2348,7 @@ int kvm_pv_send_ipi(struct kvm *kvm, unsigned long ipi_bitmap_low, int kvm_add_user_return_msr(u32 msr); int kvm_find_user_return_msr(u32 msr); int kvm_set_user_return_msr(unsigned index, u64 val, u64 mask); +void kvm_user_return_msr_update_cache(unsigned int index, u64 val); static inline bool kvm_is_supported_user_return_msr(u32 msr) { @@ -2001,7 +2379,6 @@ extern bool kvm_find_async_pf_gfn(struct kvm_vcpu *vcpu, gfn_t gfn); int kvm_skip_emulated_instruction(struct kvm_vcpu *vcpu); int kvm_complete_insn_gp(struct kvm_vcpu *vcpu, int err); -void __kvm_request_immediate_exit(struct kvm_vcpu *vcpu); void __user *__x86_set_memory_region(struct kvm *kvm, int id, gpa_t gpa, u32 size); @@ -2023,12 +2400,12 @@ static inline bool kvm_irq_is_postable(struct kvm_lapic_irq *irq) static inline void kvm_arch_vcpu_blocking(struct kvm_vcpu *vcpu) { - static_call_cond(kvm_x86_vcpu_blocking)(vcpu); + kvm_x86_call(vcpu_blocking)(vcpu); } static inline void kvm_arch_vcpu_unblocking(struct kvm_vcpu *vcpu) { - static_call_cond(kvm_x86_vcpu_unblocking)(vcpu); + kvm_x86_call(vcpu_unblocking)(vcpu); } static inline int kvm_cpu_get_apicid(int mps_cpu) @@ -2041,14 +2418,6 @@ static inline int kvm_cpu_get_apicid(int mps_cpu) #endif } -#define put_smstate(type, buf, offset, val) \ - *(type *)((buf) + (offset) - 0x7e00) = val - -#define GET_SMSTATE(type, buf, offset) \ - (*(type *)((buf) + (offset) - 0x7e00)) - -int kvm_cpu_dirty_log_size(void); - int 
memslot_rmap_alloc(struct kvm_memory_slot *slot, unsigned long npages); #define KVM_CLOCK_VALID_FLAGS \ @@ -2060,6 +2429,26 @@ int memslot_rmap_alloc(struct kvm_memory_slot *slot, unsigned long npages); KVM_X86_QUIRK_LAPIC_MMIO_HOLE | \ KVM_X86_QUIRK_OUT_7E_INC_RIP | \ KVM_X86_QUIRK_MISC_ENABLE_NO_MWAIT | \ - KVM_X86_QUIRK_FIX_HYPERCALL_INSN) + KVM_X86_QUIRK_FIX_HYPERCALL_INSN | \ + KVM_X86_QUIRK_MWAIT_NEVER_UD_FAULTS | \ + KVM_X86_QUIRK_SLOT_ZAP_ALL | \ + KVM_X86_QUIRK_STUFF_FEATURE_MSRS | \ + KVM_X86_QUIRK_IGNORE_GUEST_PAT) + +#define KVM_X86_CONDITIONAL_QUIRKS \ + (KVM_X86_QUIRK_CD_NW_CLEARED | \ + KVM_X86_QUIRK_IGNORE_GUEST_PAT) + +/* + * KVM previously used a u32 field in kvm_run to indicate the hypercall was + * initiated from long mode. KVM now sets bit 0 to indicate long mode, but the + * remaining 31 lower bits must be 0 to preserve ABI. + */ +#define KVM_EXIT_HYPERCALL_MBZ GENMASK_ULL(31, 1) + +static inline bool kvm_arch_has_irq_bypass(void) +{ + return enable_device_posted_irqs; +} #endif /* _ASM_X86_KVM_HOST_H */ diff --git a/arch/x86/include/asm/kvm_page_track.h b/arch/x86/include/asm/kvm_page_track.h index eb186bc57f6a..3d040741044b 100644 --- a/arch/x86/include/asm/kvm_page_track.h +++ b/arch/x86/include/asm/kvm_page_track.h @@ -2,11 +2,9 @@ #ifndef _ASM_X86_KVM_PAGE_TRACK_H #define _ASM_X86_KVM_PAGE_TRACK_H -enum kvm_page_track_mode { - KVM_PAGE_TRACK_WRITE, - KVM_PAGE_TRACK_MAX, -}; +#include <linux/kvm_types.h> +#ifdef CONFIG_KVM_EXTERNAL_WRITE_TRACKING /* * The notifier represented by @kvm_page_track_notifier_node is linked into * the head which will be notified when guest is triggering the track event. @@ -26,54 +24,39 @@ struct kvm_page_track_notifier_node { * It is called when guest is writing the write-tracked page * and write emulation is finished at that time. * - * @vcpu: the vcpu where the write access happened. * @gpa: the physical address written by guest. * @new: the data was written to the address. * @bytes: the written length. 
* @node: this node */ - void (*track_write)(struct kvm_vcpu *vcpu, gpa_t gpa, const u8 *new, - int bytes, struct kvm_page_track_notifier_node *node); + void (*track_write)(gpa_t gpa, const u8 *new, int bytes, + struct kvm_page_track_notifier_node *node); + /* - * It is called when memory slot is being moved or removed - * users can drop write-protection for the pages in that memory slot + * Invoked when a memory region is removed from the guest. Or in KVM + * terms, when a memslot is deleted. * - * @kvm: the kvm where memory slot being moved or removed - * @slot: the memory slot being moved or removed - * @node: this node + * @gfn: base gfn of the region being removed + * @nr_pages: number of pages in the to-be-removed region + * @node: this node */ - void (*track_flush_slot)(struct kvm *kvm, struct kvm_memory_slot *slot, - struct kvm_page_track_notifier_node *node); + void (*track_remove_region)(gfn_t gfn, unsigned long nr_pages, + struct kvm_page_track_notifier_node *node); }; -int kvm_page_track_init(struct kvm *kvm); -void kvm_page_track_cleanup(struct kvm *kvm); +int kvm_page_track_register_notifier(struct kvm *kvm, + struct kvm_page_track_notifier_node *n); +void kvm_page_track_unregister_notifier(struct kvm *kvm, + struct kvm_page_track_notifier_node *n); -bool kvm_page_track_write_tracking_enabled(struct kvm *kvm); -int kvm_page_track_write_tracking_alloc(struct kvm_memory_slot *slot); - -void kvm_page_track_free_memslot(struct kvm_memory_slot *slot); -int kvm_page_track_create_memslot(struct kvm *kvm, - struct kvm_memory_slot *slot, - unsigned long npages); - -void kvm_slot_page_track_add_page(struct kvm *kvm, - struct kvm_memory_slot *slot, gfn_t gfn, - enum kvm_page_track_mode mode); -void kvm_slot_page_track_remove_page(struct kvm *kvm, - struct kvm_memory_slot *slot, gfn_t gfn, - enum kvm_page_track_mode mode); -bool kvm_slot_page_track_is_active(struct kvm *kvm, - const struct kvm_memory_slot *slot, - gfn_t gfn, enum kvm_page_track_mode mode); +int 
kvm_write_track_add_gfn(struct kvm *kvm, gfn_t gfn); +int kvm_write_track_remove_gfn(struct kvm *kvm, gfn_t gfn); +#else +/* + * Allow defining a node in a structure even if page tracking is disabled, e.g. + * to play nice with testing headers via direct inclusion from the command line. + */ +struct kvm_page_track_notifier_node {}; +#endif /* CONFIG_KVM_EXTERNAL_WRITE_TRACKING */ -void -kvm_page_track_register_notifier(struct kvm *kvm, - struct kvm_page_track_notifier_node *n); -void -kvm_page_track_unregister_notifier(struct kvm *kvm, - struct kvm_page_track_notifier_node *n); -void kvm_page_track_write(struct kvm_vcpu *vcpu, gpa_t gpa, const u8 *new, - int bytes); -void kvm_page_track_flush_slot(struct kvm *kvm, struct kvm_memory_slot *slot); #endif diff --git a/arch/x86/include/asm/kvmclock.h b/arch/x86/include/asm/kvmclock.h index 6c5765192102..f163176d6f7f 100644 --- a/arch/x86/include/asm/kvmclock.h +++ b/arch/x86/include/asm/kvmclock.h @@ -4,11 +4,9 @@ #include <linux/percpu.h> -extern struct clocksource kvm_clock; - DECLARE_PER_CPU(struct pvclock_vsyscall_time_info *, hv_clock_per_cpu); -static inline struct pvclock_vcpu_time_info *this_cpu_pvti(void) +static __always_inline struct pvclock_vcpu_time_info *this_cpu_pvti(void) { return &this_cpu_read(hv_clock_per_cpu)->pvti; } diff --git a/arch/x86/include/asm/linkage.h b/arch/x86/include/asm/linkage.h index 73ca20049835..9d38ae744a2e 100644 --- a/arch/x86/include/asm/linkage.h +++ b/arch/x86/include/asm/linkage.h @@ -8,70 +8,148 @@ #undef notrace #define notrace __attribute__((no_instrument_function)) +#ifdef CONFIG_64BIT +/* + * The generic version tends to create spurious ENDBR instructions under + * certain conditions. 
+ */ +#define _THIS_IP_ ({ unsigned long __here; asm ("lea 0(%%rip), %0" : "=r" (__here)); __here; }) +#endif + #ifdef CONFIG_X86_32 #define asmlinkage CPP_ASMLINKAGE __attribute__((regparm(0))) #endif /* CONFIG_X86_32 */ -#ifdef __ASSEMBLY__ - -#if defined(CONFIG_X86_64) || defined(CONFIG_X86_ALIGNMENT_16) -#define __ALIGN .p2align 4, 0x90 +#define __ALIGN .balign CONFIG_FUNCTION_ALIGNMENT, 0x90; #define __ALIGN_STR __stringify(__ALIGN) + +#if defined(CONFIG_CALL_PADDING) && !defined(__DISABLE_EXPORTS) && !defined(BUILD_VDSO) +#define FUNCTION_PADDING .skip CONFIG_FUNCTION_ALIGNMENT, 0x90; +#else +#define FUNCTION_PADDING +#endif + +#if (CONFIG_FUNCTION_ALIGNMENT > 8) && !defined(__DISABLE_EXPORTS) && !defined(BUILD_VDSO) +# define __FUNC_ALIGN __ALIGN; FUNCTION_PADDING +#else +# define __FUNC_ALIGN __ALIGN #endif -#if defined(CONFIG_RETHUNK) && !defined(__DISABLE_EXPORTS) && !defined(BUILD_VDSO) +#define ASM_FUNC_ALIGN __stringify(__FUNC_ALIGN) +#define SYM_F_ALIGN __FUNC_ALIGN + +#ifdef __ASSEMBLER__ + +#if defined(CONFIG_MITIGATION_RETHUNK) && !defined(__DISABLE_EXPORTS) && !defined(BUILD_VDSO) #define RET jmp __x86_return_thunk -#else /* CONFIG_RETPOLINE */ -#ifdef CONFIG_SLS +#else /* CONFIG_MITIGATION_RETPOLINE */ +#ifdef CONFIG_MITIGATION_SLS #define RET ret; int3 #else #define RET ret #endif -#endif /* CONFIG_RETPOLINE */ +#endif /* CONFIG_MITIGATION_RETPOLINE */ -#else /* __ASSEMBLY__ */ +#else /* __ASSEMBLER__ */ -#if defined(CONFIG_RETHUNK) && !defined(__DISABLE_EXPORTS) && !defined(BUILD_VDSO) +#if defined(CONFIG_MITIGATION_RETHUNK) && !defined(__DISABLE_EXPORTS) && !defined(BUILD_VDSO) #define ASM_RET "jmp __x86_return_thunk\n\t" -#else /* CONFIG_RETPOLINE */ -#ifdef CONFIG_SLS +#else /* CONFIG_MITIGATION_RETPOLINE */ +#ifdef CONFIG_MITIGATION_SLS #define ASM_RET "ret; int3\n\t" #else #define ASM_RET "ret\n\t" #endif -#endif /* CONFIG_RETPOLINE */ +#endif /* CONFIG_MITIGATION_RETPOLINE */ + +#endif /* __ASSEMBLER__ */ + +/* + * Depending on 
-fpatchable-function-entry=N,N usage (CONFIG_CALL_PADDING) the + * CFI symbol layout changes. + * + * Without CALL_THUNKS: + * + * .align FUNCTION_ALIGNMENT + * __cfi_##name: + * .skip FUNCTION_PADDING, 0x90 + * .byte 0xb8 + * .long __kcfi_typeid_##name + * name: + * + * With CALL_THUNKS: + * + * .align FUNCTION_ALIGNMENT + * __cfi_##name: + * .byte 0xb8 + * .long __kcfi_typeid_##name + * .skip FUNCTION_PADDING, 0x90 + * name: + * + * In both cases the whole thing is FUNCTION_ALIGNMENT aligned and sized. + */ + +#ifdef CONFIG_CALL_PADDING +#define CFI_PRE_PADDING +#define CFI_POST_PADDING .skip CONFIG_FUNCTION_PADDING_BYTES, 0x90; +#else +#define CFI_PRE_PADDING .skip CONFIG_FUNCTION_PADDING_BYTES, 0x90; +#define CFI_POST_PADDING +#endif -#endif /* __ASSEMBLY__ */ +#define __CFI_TYPE(name) \ + SYM_START(__cfi_##name, SYM_L_LOCAL, SYM_A_NONE) \ + CFI_PRE_PADDING \ + .byte 0xb8 ASM_NL \ + .long __kcfi_typeid_##name ASM_NL \ + CFI_POST_PADDING \ + SYM_FUNC_END(__cfi_##name) + +/* UML needs to be able to override memcpy() and friends for KASAN. 
*/ +#ifdef CONFIG_UML +# define SYM_FUNC_ALIAS_MEMFUNC SYM_FUNC_ALIAS_WEAK +#else +# define SYM_FUNC_ALIAS_MEMFUNC SYM_FUNC_ALIAS +#endif + +/* SYM_TYPED_FUNC_START -- use for indirectly called globals, w/ CFI type */ +#define SYM_TYPED_FUNC_START(name) \ + SYM_TYPED_START(name, SYM_L_GLOBAL, SYM_F_ALIGN) \ + ENDBR /* SYM_FUNC_START -- use for global functions */ #define SYM_FUNC_START(name) \ - SYM_START(name, SYM_L_GLOBAL, SYM_A_ALIGN) \ - ENDBR + SYM_START(name, SYM_L_GLOBAL, SYM_F_ALIGN) /* SYM_FUNC_START_NOALIGN -- use for global functions, w/o alignment */ #define SYM_FUNC_START_NOALIGN(name) \ - SYM_START(name, SYM_L_GLOBAL, SYM_A_NONE) \ - ENDBR + SYM_START(name, SYM_L_GLOBAL, SYM_A_NONE) /* SYM_FUNC_START_LOCAL -- use for local functions */ #define SYM_FUNC_START_LOCAL(name) \ - SYM_START(name, SYM_L_LOCAL, SYM_A_ALIGN) \ - ENDBR + SYM_START(name, SYM_L_LOCAL, SYM_F_ALIGN) /* SYM_FUNC_START_LOCAL_NOALIGN -- use for local functions, w/o alignment */ #define SYM_FUNC_START_LOCAL_NOALIGN(name) \ - SYM_START(name, SYM_L_LOCAL, SYM_A_NONE) \ - ENDBR + SYM_START(name, SYM_L_LOCAL, SYM_A_NONE) /* SYM_FUNC_START_WEAK -- use for weak functions */ #define SYM_FUNC_START_WEAK(name) \ - SYM_START(name, SYM_L_WEAK, SYM_A_ALIGN) \ - ENDBR + SYM_START(name, SYM_L_WEAK, SYM_F_ALIGN) /* SYM_FUNC_START_WEAK_NOALIGN -- use for weak functions, w/o alignment */ #define SYM_FUNC_START_WEAK_NOALIGN(name) \ - SYM_START(name, SYM_L_WEAK, SYM_A_NONE) \ - ENDBR + SYM_START(name, SYM_L_WEAK, SYM_A_NONE) + +/* + * Expose 'sym' to the startup code in arch/x86/boot/startup/, by emitting an + * alias prefixed with __pi_ + */ +#ifdef __ASSEMBLER__ +#define SYM_PIC_ALIAS(sym) SYM_ALIAS(__pi_ ## sym, sym, SYM_L_GLOBAL) +#else +#define SYM_PIC_ALIAS(sym) extern typeof(sym) __PASTE(__pi_, sym) __alias(sym) +#endif #endif /* _ASM_X86_LINKAGE_H */ diff --git a/arch/x86/include/asm/local.h b/arch/x86/include/asm/local.h index 349a47acaa4a..59aa966dc212 100644 --- a/arch/x86/include/asm/local.h 
+++ b/arch/x86/include/asm/local.h @@ -120,34 +120,54 @@ static inline long local_sub_return(long i, local_t *l) #define local_inc_return(l) (local_add_return(1, l)) #define local_dec_return(l) (local_sub_return(1, l)) -#define local_cmpxchg(l, o, n) \ - (cmpxchg_local(&((l)->a.counter), (o), (n))) -/* Always has a lock prefix */ -#define local_xchg(l, n) (xchg(&((l)->a.counter), (n))) +static inline long local_cmpxchg(local_t *l, long old, long new) +{ + return cmpxchg_local(&l->a.counter, old, new); +} + +static inline bool local_try_cmpxchg(local_t *l, long *old, long new) +{ + return try_cmpxchg_local(&l->a.counter, + (typeof(l->a.counter) *) old, new); +} + +/* + * Implement local_xchg using CMPXCHG instruction without the LOCK prefix. + * XCHG is expensive due to the implied LOCK prefix. The processor + * cannot prefetch cachelines if XCHG is used. + */ +static __always_inline long +local_xchg(local_t *l, long n) +{ + long c = local_read(l); + + do { } while (!local_try_cmpxchg(l, &c, n)); + + return c; +} /** - * local_add_unless - add unless the number is a given value + * local_add_unless - add unless the number is already a given value * @l: pointer of type local_t * @a: the amount to add to l... * @u: ...unless l is equal to u. * - * Atomically adds @a to @l, so long as it was not @u. - * Returns non-zero if @l was not @u, and zero otherwise. + * Atomically adds @a to @l, if @v was not already @u. + * Returns true if the addition was done. 
*/ -#define local_add_unless(l, a, u) \ -({ \ - long c, old; \ - c = local_read((l)); \ - for (;;) { \ - if (unlikely(c == (u))) \ - break; \ - old = local_cmpxchg((l), c, c + (a)); \ - if (likely(old == c)) \ - break; \ - c = old; \ - } \ - c != (u); \ -}) +static __always_inline bool +local_add_unless(local_t *l, long a, long u) +{ + long c = local_read(l); + + do { + if (unlikely(c == u)) + return false; + } while (!local_try_cmpxchg(l, &c, c + a)); + + return true; +} + #define local_inc_not_zero(l) local_add_unless((l), 1, 0) /* On x86_32, these are no better than the atomic variants. diff --git a/arch/x86/include/asm/mc146818rtc.h b/arch/x86/include/asm/mc146818rtc.h index 97198001e567..6115bb3d5795 100644 --- a/arch/x86/include/asm/mc146818rtc.h +++ b/arch/x86/include/asm/mc146818rtc.h @@ -95,7 +95,7 @@ static inline unsigned char current_lock_cmos_reg(void) unsigned char rtc_cmos_read(unsigned char addr); void rtc_cmos_write(unsigned char val, unsigned char addr); -extern int mach_set_rtc_mmss(const struct timespec64 *now); +extern int mach_set_cmos_time(const struct timespec64 *now); extern void mach_get_cmos_time(struct timespec64 *now); #define RTC_IRQ 8 diff --git a/arch/x86/include/asm/mce.h b/arch/x86/include/asm/mce.h index cc73061e7255..6c77c03139f7 100644 --- a/arch/x86/include/asm/mce.h +++ b/arch/x86/include/asm/mce.h @@ -13,6 +13,7 @@ #define MCG_CTL_P BIT_ULL(8) /* MCG_CTL register available */ #define MCG_EXT_P BIT_ULL(9) /* Extended registers available */ #define MCG_CMCI_P BIT_ULL(10) /* CMCI supported */ +#define MCG_SEAM_NR BIT_ULL(12) /* MCG_STATUS_SEAM_NR supported */ #define MCG_EXT_CNT_MASK 0xff0000 /* Number of Extended registers */ #define MCG_EXT_CNT_SHIFT 16 #define MCG_EXT_CNT(c) (((c) & MCG_EXT_CNT_MASK) >> MCG_EXT_CNT_SHIFT) @@ -25,6 +26,7 @@ #define MCG_STATUS_EIPV BIT_ULL(1) /* ip points to correct instruction */ #define MCG_STATUS_MCIP BIT_ULL(2) /* machine check in progress */ #define MCG_STATUS_LMCES BIT_ULL(3) /* LMCE 
signaled */ +#define MCG_STATUS_SEAM_NR BIT_ULL(12) /* Machine check inside SEAM non-root mode */ /* MCG_EXT_CTL register defines */ #define MCG_EXT_CTL_LMCE_EN BIT_ULL(0) /* Enable LMCE */ @@ -42,6 +44,7 @@ #define MCI_STATUS_CEC_SHIFT 38 /* Corrected Error Count */ #define MCI_STATUS_CEC_MASK GENMASK_ULL(52,38) #define MCI_STATUS_CEC(c) (((c) & MCI_STATUS_CEC_MASK) >> MCI_STATUS_CEC_SHIFT) +#define MCI_STATUS_MSCOD(m) (((m) >> 16) & 0xffff) /* AMD-specific bits */ #define MCI_STATUS_TCC BIT_ULL(55) /* Task context corrupt */ @@ -58,6 +61,7 @@ * - TCC bit is present in MCx_STATUS. */ #define MCI_CONFIG_MCAX 0x1 +#define MCI_CONFIG_FRUTEXT BIT_ULL(9) #define MCI_IPID_MCATYPE 0xFFFF0000 #define MCI_IPID_HWID 0xFFF @@ -87,6 +91,9 @@ #define MCI_MISC_ADDR_MEM 3 /* memory address */ #define MCI_MISC_ADDR_GENERIC 7 /* generic */ +/* MCi_ADDR register defines */ +#define MCI_ADDR_PHYSADDR GENMASK_ULL(boot_cpu_data.x86_phys_bits - 1, 0) + /* CTL2 register defines */ #define MCI_CTL2_CMCI_EN BIT_ULL(30) #define MCI_CTL2_CMCI_THRESHOLD_MASK 0x7fffULL @@ -116,6 +123,9 @@ #define MSR_AMD64_SMCA_MC0_DESTAT 0xc0002008 #define MSR_AMD64_SMCA_MC0_DEADDR 0xc0002009 #define MSR_AMD64_SMCA_MC0_MISC1 0xc000200a +/* Registers MISC2 to MISC4 are at offsets B to D. 
*/ +#define MSR_AMD64_SMCA_MC0_SYND1 0xc000200e +#define MSR_AMD64_SMCA_MC0_SYND2 0xc000200f #define MSR_AMD64_SMCA_MCx_CTL(x) (MSR_AMD64_SMCA_MC0_CTL + 0x10*(x)) #define MSR_AMD64_SMCA_MCx_STATUS(x) (MSR_AMD64_SMCA_MC0_STATUS + 0x10*(x)) #define MSR_AMD64_SMCA_MCx_ADDR(x) (MSR_AMD64_SMCA_MC0_ADDR + 0x10*(x)) @@ -126,6 +136,8 @@ #define MSR_AMD64_SMCA_MCx_DESTAT(x) (MSR_AMD64_SMCA_MC0_DESTAT + 0x10*(x)) #define MSR_AMD64_SMCA_MCx_DEADDR(x) (MSR_AMD64_SMCA_MC0_DEADDR + 0x10*(x)) #define MSR_AMD64_SMCA_MCx_MISCy(x, y) ((MSR_AMD64_SMCA_MC0_MISC1 + y) + (0x10*(x))) +#define MSR_AMD64_SMCA_MCx_SYND1(x) (MSR_AMD64_SMCA_MC0_SYND1 + 0x10*(x)) +#define MSR_AMD64_SMCA_MCx_SYND2(x) (MSR_AMD64_SMCA_MC0_SYND2 + 0x10*(x)) #define XEC(x, mask) (((x) >> 16) & mask) @@ -181,6 +193,32 @@ enum mce_notifier_prios { MCE_PRIO_HIGHEST = MCE_PRIO_CEC }; +/** + * struct mce_hw_err - Hardware Error Record. + * @m: Machine Check record. + * @vendor: Vendor-specific error information. + * + * Vendor-specific fields should not be added to struct mce. Instead, vendors + * should export their vendor-specific data through their structure in the + * vendor union below. + * + * AMD's vendor data is parsed by error decoding tools for supplemental error + * information. Thus, current offsets of existing fields must be maintained. + * Only add new fields at the end of AMD's vendor structure. 
+ */ +struct mce_hw_err { + struct mce m; + + union vendor_info { + struct { + u64 synd1; /* MCA_SYND1 MSR */ + u64 synd2; /* MCA_SYND2 MSR */ + } amd; + } vendor; +}; + +#define to_mce_hw_err(mce) container_of(mce, struct mce_hw_err, m) + struct notifier_block; extern void mce_register_decode_chain(struct notifier_block *nb); extern void mce_unregister_decode_chain(struct notifier_block *nb); @@ -215,8 +253,8 @@ static inline int apei_smca_report_x86_error(struct cper_ia_proc_ctx *ctx_info, u64 lapic_id) { return -EINVAL; } #endif -void mce_setup(struct mce *m); -void mce_log(struct mce *m); +void mce_prep_record(struct mce_hw_err *err); +void mce_log(struct mce_hw_err *err); DECLARE_PER_CPU(struct device *, mce_device); /* Maximum number of MCA banks per CPU. */ @@ -238,10 +276,10 @@ static inline void cmci_rediscover(void) {} static inline void cmci_recheck(void) {} #endif -int mce_available(struct cpuinfo_x86 *c); +bool mce_available(struct cpuinfo_x86 *c); bool mce_is_memory_error(struct mce *m); bool mce_is_correctable(struct mce *m); -int mce_usable_address(struct mce *m); +bool mce_usable_address(struct mce *m); DECLARE_PER_CPU(unsigned, mce_exception_count); DECLARE_PER_CPU(unsigned, mce_poll_count); @@ -255,9 +293,8 @@ enum mcp_flags { MCP_DONTLOG = BIT(2), /* only clear, don't log */ MCP_QUEUE_LOG = BIT(3), /* only queue to genpool */ }; -bool machine_check_poll(enum mcp_flags flags, mce_banks_t *b); -int mce_notify_irq(void); +void machine_check_poll(enum mcp_flags flags, mce_banks_t *b); DECLARE_PER_CPU(struct mce, injectm); @@ -307,6 +344,7 @@ enum smca_bank_types { SMCA_PIE, /* Power, Interrupts, etc. 
*/ SMCA_UMC, /* Unified Memory Controller */ SMCA_UMC_V2, + SMCA_MA_LLC, /* Memory Attached Last Level Cache */ SMCA_PB, /* Parameter Block */ SMCA_PSP, /* Platform Security Processor */ SMCA_PSP_V2, @@ -322,6 +360,8 @@ enum smca_bank_types { SMCA_SHUB, /* System HUB Unit */ SMCA_SATA, /* SATA Unit */ SMCA_USB, /* USB Unit */ + SMCA_USR_DP, /* Ultra Short Reach Data Plane Controller */ + SMCA_USR_CP, /* Ultra Short Reach Control Plane Controller */ SMCA_GMI_PCS, /* GMI PCS Unit */ SMCA_XGMI_PHY, /* xGMI PHY Unit */ SMCA_WAFL_PHY, /* WAFL PHY Unit */ @@ -329,7 +369,6 @@ enum smca_bank_types { N_SMCA_BANK_TYPES }; -extern const char *smca_get_long_name(enum smca_bank_types t); extern bool amd_mce_is_memory_error(struct mce *m); extern int mce_threshold_create_device(unsigned int cpu); @@ -345,5 +384,6 @@ static inline bool amd_mce_is_memory_error(struct mce *m) { return false; }; static inline void mce_amd_feature_init(struct cpuinfo_x86 *c) { } #endif -static inline void mce_hygon_feature_init(struct cpuinfo_x86 *c) { return mce_amd_feature_init(c); } +unsigned long copy_mc_fragile_handle_tail(char *to, char *from, unsigned len); + #endif /* _ASM_X86_MCE_H */ diff --git a/arch/x86/include/asm/mem_encrypt.h b/arch/x86/include/asm/mem_encrypt.h index 88ceaf3648b3..ea6494628cb0 100644 --- a/arch/x86/include/asm/mem_encrypt.h +++ b/arch/x86/include/asm/mem_encrypt.h @@ -10,12 +10,21 @@ #ifndef __X86_MEM_ENCRYPT_H__ #define __X86_MEM_ENCRYPT_H__ -#ifndef __ASSEMBLY__ +#ifndef __ASSEMBLER__ #include <linux/init.h> #include <linux/cc_platform.h> -#include <asm/bootparam.h> +#include <asm/asm.h> +struct boot_params; + +#ifdef CONFIG_X86_MEM_ENCRYPT +void __init mem_encrypt_init(void); +void __init mem_encrypt_setup_arch(void); +#else +static inline void mem_encrypt_init(void) { } +static inline void __init mem_encrypt_setup_arch(void) { } +#endif #ifdef CONFIG_AMD_MEM_ENCRYPT @@ -37,25 +46,30 @@ void __init sme_map_bootdata(char *real_mode_data); void __init 
sme_unmap_bootdata(char *real_mode_data); void __init sme_early_init(void); -void __init sev_setup_arch(void); -void __init sme_encrypt_kernel(struct boot_params *bp); -void __init sme_enable(struct boot_params *bp); +void sme_encrypt_kernel(struct boot_params *bp); +void sme_enable(struct boot_params *bp); int __init early_set_memory_decrypted(unsigned long vaddr, unsigned long size); int __init early_set_memory_encrypted(unsigned long vaddr, unsigned long size); -void __init early_set_mem_enc_dec_hypercall(unsigned long vaddr, int npages, - bool enc); +void __init early_set_mem_enc_dec_hypercall(unsigned long vaddr, + unsigned long size, bool enc); void __init mem_encrypt_free_decrypted_mem(void); void __init sev_es_init_vc_handling(void); +static inline u64 sme_get_me_mask(void) +{ + return sme_me_mask; +} + #define __bss_decrypted __section(".bss..decrypted") #else /* !CONFIG_AMD_MEM_ENCRYPT */ #define sme_me_mask 0ULL +#define sev_status 0ULL static inline void __init sme_early_encrypt(resource_size_t paddr, unsigned long size) { } @@ -66,10 +80,9 @@ static inline void __init sme_map_bootdata(char *real_mode_data) { } static inline void __init sme_unmap_bootdata(char *real_mode_data) { } static inline void __init sme_early_init(void) { } -static inline void __init sev_setup_arch(void) { } -static inline void __init sme_encrypt_kernel(struct boot_params *bp) { } -static inline void __init sme_enable(struct boot_params *bp) { } +static inline void sme_encrypt_kernel(struct boot_params *bp) { } +static inline void sme_enable(struct boot_params *bp) { } static inline void sev_es_init_vc_handling(void) { } @@ -78,16 +91,17 @@ early_set_memory_decrypted(unsigned long vaddr, unsigned long size) { return 0; static inline int __init early_set_memory_encrypted(unsigned long vaddr, unsigned long size) { return 0; } static inline void __init -early_set_mem_enc_dec_hypercall(unsigned long vaddr, int npages, bool enc) {} +early_set_mem_enc_dec_hypercall(unsigned long vaddr, 
unsigned long size, bool enc) {} static inline void mem_encrypt_free_decrypted_mem(void) { } +static inline u64 sme_get_me_mask(void) { return 0; } + #define __bss_decrypted #endif /* CONFIG_AMD_MEM_ENCRYPT */ -/* Architecture __weak replacement functions */ -void __init mem_encrypt_init(void); +void add_encrypt_protection_map(void); /* * The __sme_pa() and __sme_pa_nodebug() macros are meant for use when @@ -100,11 +114,6 @@ void __init mem_encrypt_init(void); extern char __start_bss_decrypted[], __end_bss_decrypted[], __start_bss_decrypted_unused[]; -static inline u64 sme_get_me_mask(void) -{ - return sme_me_mask; -} - -#endif /* __ASSEMBLY__ */ +#endif /* __ASSEMBLER__ */ #endif /* __X86_MEM_ENCRYPT_H__ */ diff --git a/arch/x86/include/asm/memtype.h b/arch/x86/include/asm/memtype.h index 9ca760e430b9..113b2fa51849 100644 --- a/arch/x86/include/asm/memtype.h +++ b/arch/x86/include/asm/memtype.h @@ -6,9 +6,8 @@ #include <asm/pgtable_types.h> extern bool pat_enabled(void); -extern void pat_disable(const char *reason); -extern void pat_init(void); -extern void init_cache_modes(void); +extern void pat_bp_init(void); +extern void pat_cpu_init(void); extern int memtype_reserve(u64 start, u64 end, enum page_cache_mode req_pcm, enum page_cache_mode *ret_pcm); diff --git a/arch/x86/include/asm/microcode.h b/arch/x86/include/asm/microcode.h index 0c3d3440fe27..8b41f26f003b 100644 --- a/arch/x86/include/asm/microcode.h +++ b/arch/x86/include/asm/microcode.h @@ -2,18 +2,7 @@ #ifndef _ASM_X86_MICROCODE_H #define _ASM_X86_MICROCODE_H -#include <asm/cpu.h> -#include <linux/earlycpio.h> -#include <linux/initrd.h> - -struct ucode_patch { - struct list_head plist; - void *data; /* Intel uses only this one */ - u32 patch_id; - u16 equiv_cpu; -}; - -extern struct list_head microcode_cache; +#include <asm/msr.h> struct cpu_signature { unsigned int sig; @@ -21,122 +10,84 @@ struct cpu_signature { unsigned int rev; }; -struct device; - -enum ucode_state { - UCODE_OK = 0, - UCODE_NEW, - 
UCODE_UPDATED, - UCODE_NFOUND, - UCODE_ERROR, +struct ucode_cpu_info { + struct cpu_signature cpu_sig; + void *mc; }; -struct microcode_ops { - enum ucode_state (*request_microcode_user) (int cpu, - const void __user *buf, size_t size); - - enum ucode_state (*request_microcode_fw) (int cpu, struct device *, - bool refresh_fw); - - void (*microcode_fini_cpu) (int cpu); +#ifdef CONFIG_MICROCODE +void load_ucode_bsp(void); +void load_ucode_ap(void); +void microcode_bsp_resume(void); +bool __init microcode_loader_disabled(void); +#else +static inline void load_ucode_bsp(void) { } +static inline void load_ucode_ap(void) { } +static inline void microcode_bsp_resume(void) { } +static inline bool __init microcode_loader_disabled(void) { return false; } +#endif - /* - * The generic 'microcode_core' part guarantees that - * the callbacks below run on a target cpu when they - * are being called. - * See also the "Synchronization" section in microcode_core.c. - */ - enum ucode_state (*apply_microcode) (int cpu); - int (*collect_cpu_info) (int cpu, struct cpu_signature *csig); +extern unsigned long initrd_start_early; + +#ifdef CONFIG_CPU_SUP_INTEL +/* Intel specific microcode defines. 
Public for IFS */ +struct microcode_header_intel { + unsigned int hdrver; + unsigned int rev; + unsigned int date; + unsigned int sig; + unsigned int cksum; + unsigned int ldrver; + unsigned int pf; + unsigned int datasize; + unsigned int totalsize; + unsigned int metasize; + unsigned int min_req_ver; + unsigned int reserved; }; -struct ucode_cpu_info { - struct cpu_signature cpu_sig; - int valid; - void *mc; +struct microcode_intel { + struct microcode_header_intel hdr; + unsigned int bits[]; }; -extern struct ucode_cpu_info ucode_cpu_info[]; -struct cpio_data find_microcode_in_initrd(const char *path, bool use_pa); -#ifdef CONFIG_MICROCODE_INTEL -extern struct microcode_ops * __init init_intel_microcode(void); -#else -static inline struct microcode_ops * __init init_intel_microcode(void) -{ - return NULL; -} -#endif /* CONFIG_MICROCODE_INTEL */ +#define DEFAULT_UCODE_DATASIZE (2000) +#define MC_HEADER_SIZE (sizeof(struct microcode_header_intel)) +#define MC_HEADER_TYPE_MICROCODE 1 +#define MC_HEADER_TYPE_IFS 2 -#ifdef CONFIG_MICROCODE_AMD -extern struct microcode_ops * __init init_amd_microcode(void); -extern void __exit exit_amd_microcode(void); -#else -static inline struct microcode_ops * __init init_amd_microcode(void) +static inline int intel_microcode_get_datasize(struct microcode_header_intel *hdr) { - return NULL; + return hdr->datasize ? 
: DEFAULT_UCODE_DATASIZE; } -static inline void __exit exit_amd_microcode(void) {} -#endif -#define MAX_UCODE_COUNT 128 - -#define QCHAR(a, b, c, d) ((a) + ((b) << 8) + ((c) << 16) + ((d) << 24)) -#define CPUID_INTEL1 QCHAR('G', 'e', 'n', 'u') -#define CPUID_INTEL2 QCHAR('i', 'n', 'e', 'I') -#define CPUID_INTEL3 QCHAR('n', 't', 'e', 'l') -#define CPUID_AMD1 QCHAR('A', 'u', 't', 'h') -#define CPUID_AMD2 QCHAR('e', 'n', 't', 'i') -#define CPUID_AMD3 QCHAR('c', 'A', 'M', 'D') - -#define CPUID_IS(a, b, c, ebx, ecx, edx) \ - (!((ebx ^ (a))|(edx ^ (b))|(ecx ^ (c)))) - -/* - * In early loading microcode phase on BSP, boot_cpu_data is not set up yet. - * x86_cpuid_vendor() gets vendor id for BSP. - * - * In 32 bit AP case, accessing boot_cpu_data needs linear address. To simplify - * coding, we still use x86_cpuid_vendor() to get vendor id for AP. - * - * x86_cpuid_vendor() gets vendor information directly from CPUID. - */ -static inline int x86_cpuid_vendor(void) +static inline u32 intel_get_microcode_revision(void) { - u32 eax = 0x00000000; - u32 ebx, ecx = 0, edx; + u32 rev, dummy; - native_cpuid(&eax, &ebx, &ecx, &edx); + native_wrmsrq(MSR_IA32_UCODE_REV, 0); - if (CPUID_IS(CPUID_INTEL1, CPUID_INTEL2, CPUID_INTEL3, ebx, ecx, edx)) - return X86_VENDOR_INTEL; + /* As documented in the SDM: Do a CPUID 1 here */ + native_cpuid_eax(1); - if (CPUID_IS(CPUID_AMD1, CPUID_AMD2, CPUID_AMD3, ebx, ecx, edx)) - return X86_VENDOR_AMD; + /* get the current revision from MSR 0x8B */ + native_rdmsr(MSR_IA32_UCODE_REV, dummy, rev); - return X86_VENDOR_UNKNOWN; + return rev; } +#endif /* !CONFIG_CPU_SUP_INTEL */ -static inline unsigned int x86_cpuid_family(void) -{ - u32 eax = 0x00000001; - u32 ebx, ecx = 0, edx; - - native_cpuid(&eax, &ebx, &ecx, &edx); +bool microcode_nmi_handler(void); +void microcode_offline_nmi_handler(void); - return x86_family(eax); +#ifdef CONFIG_MICROCODE_LATE_LOADING +DECLARE_STATIC_KEY_FALSE(microcode_nmi_handler_enable); +static __always_inline bool 
microcode_nmi_handler_enabled(void) +{ + return static_branch_unlikely(&microcode_nmi_handler_enable); } - -#ifdef CONFIG_MICROCODE -extern void __init load_ucode_bsp(void); -extern void load_ucode_ap(void); -void reload_early_microcode(void); -extern bool initrd_gone; -void microcode_bsp_resume(void); #else -static inline void __init load_ucode_bsp(void) { } -static inline void load_ucode_ap(void) { } -static inline void reload_early_microcode(void) { } -static inline void microcode_bsp_resume(void) { } +static __always_inline bool microcode_nmi_handler_enabled(void) { return false; } #endif #endif /* _ASM_X86_MICROCODE_H */ diff --git a/arch/x86/include/asm/microcode_amd.h b/arch/x86/include/asm/microcode_amd.h deleted file mode 100644 index ac31f9140d07..000000000000 --- a/arch/x86/include/asm/microcode_amd.h +++ /dev/null @@ -1,58 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -#ifndef _ASM_X86_MICROCODE_AMD_H -#define _ASM_X86_MICROCODE_AMD_H - -#include <asm/microcode.h> - -#define UCODE_MAGIC 0x00414d44 -#define UCODE_EQUIV_CPU_TABLE_TYPE 0x00000000 -#define UCODE_UCODE_TYPE 0x00000001 - -#define SECTION_HDR_SIZE 8 -#define CONTAINER_HDR_SZ 12 - -struct equiv_cpu_entry { - u32 installed_cpu; - u32 fixed_errata_mask; - u32 fixed_errata_compare; - u16 equiv_cpu; - u16 res; -} __attribute__((packed)); - -struct microcode_header_amd { - u32 data_code; - u32 patch_id; - u16 mc_patch_data_id; - u8 mc_patch_data_len; - u8 init_flag; - u32 mc_patch_data_checksum; - u32 nb_dev_id; - u32 sb_dev_id; - u16 processor_rev_id; - u8 nb_rev_id; - u8 sb_rev_id; - u8 bios_api_rev; - u8 reserved1[3]; - u32 match_reg[8]; -} __attribute__((packed)); - -struct microcode_amd { - struct microcode_header_amd hdr; - unsigned int mpb[]; -}; - -#define PATCH_MAX_SIZE (3 * PAGE_SIZE) - -#ifdef CONFIG_MICROCODE_AMD -extern void __init load_ucode_amd_bsp(unsigned int family); -extern void load_ucode_amd_ap(unsigned int family); -extern int __init save_microcode_in_initrd_amd(unsigned int
family); -void reload_ucode_amd(void); -#else -static inline void __init load_ucode_amd_bsp(unsigned int family) {} -static inline void load_ucode_amd_ap(unsigned int family) {} -static inline int __init -save_microcode_in_initrd_amd(unsigned int family) { return -EINVAL; } -static inline void reload_ucode_amd(void) {} -#endif -#endif /* _ASM_X86_MICROCODE_AMD_H */ diff --git a/arch/x86/include/asm/microcode_intel.h b/arch/x86/include/asm/microcode_intel.h deleted file mode 100644 index 4c92cea7e4b5..000000000000 --- a/arch/x86/include/asm/microcode_intel.h +++ /dev/null @@ -1,85 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -#ifndef _ASM_X86_MICROCODE_INTEL_H -#define _ASM_X86_MICROCODE_INTEL_H - -#include <asm/microcode.h> - -struct microcode_header_intel { - unsigned int hdrver; - unsigned int rev; - unsigned int date; - unsigned int sig; - unsigned int cksum; - unsigned int ldrver; - unsigned int pf; - unsigned int datasize; - unsigned int totalsize; - unsigned int reserved[3]; -}; - -struct microcode_intel { - struct microcode_header_intel hdr; - unsigned int bits[]; -}; - -/* microcode format is extended from prescott processors */ -struct extended_signature { - unsigned int sig; - unsigned int pf; - unsigned int cksum; -}; - -struct extended_sigtable { - unsigned int count; - unsigned int cksum; - unsigned int reserved[3]; - struct extended_signature sigs[]; -}; - -#define DEFAULT_UCODE_DATASIZE (2000) -#define MC_HEADER_SIZE (sizeof(struct microcode_header_intel)) -#define DEFAULT_UCODE_TOTALSIZE (DEFAULT_UCODE_DATASIZE + MC_HEADER_SIZE) -#define EXT_HEADER_SIZE (sizeof(struct extended_sigtable)) -#define EXT_SIGNATURE_SIZE (sizeof(struct extended_signature)) - -#define get_totalsize(mc) \ - (((struct microcode_intel *)mc)->hdr.datasize ? \ - ((struct microcode_intel *)mc)->hdr.totalsize : \ - DEFAULT_UCODE_TOTALSIZE) - -#define get_datasize(mc) \ - (((struct microcode_intel *)mc)->hdr.datasize ? 
\ - ((struct microcode_intel *)mc)->hdr.datasize : DEFAULT_UCODE_DATASIZE) - -#define exttable_size(et) ((et)->count * EXT_SIGNATURE_SIZE + EXT_HEADER_SIZE) - -static inline u32 intel_get_microcode_revision(void) -{ - u32 rev, dummy; - - native_wrmsrl(MSR_IA32_UCODE_REV, 0); - - /* As documented in the SDM: Do a CPUID 1 here */ - native_cpuid_eax(1); - - /* get the current revision from MSR 0x8B */ - native_rdmsr(MSR_IA32_UCODE_REV, dummy, rev); - - return rev; -} - -#ifdef CONFIG_MICROCODE_INTEL -extern void __init load_ucode_intel_bsp(void); -extern void load_ucode_intel_ap(void); -extern void show_ucode_info_early(void); -extern int __init save_microcode_in_initrd_intel(void); -void reload_ucode_intel(void); -#else -static inline __init void load_ucode_intel_bsp(void) {} -static inline void load_ucode_intel_ap(void) {} -static inline void show_ucode_info_early(void) {} -static inline int __init save_microcode_in_initrd_intel(void) { return -EINVAL; } -static inline void reload_ucode_intel(void) {} -#endif - -#endif /* _ASM_X86_MICROCODE_INTEL_H */ diff --git a/arch/x86/include/asm/mman.h b/arch/x86/include/asm/mman.h new file mode 100644 index 000000000000..12b820259b9f --- /dev/null +++ b/arch/x86/include/asm/mman.h @@ -0,0 +1,15 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef __ASM_MMAN_H__ +#define __ASM_MMAN_H__ + +#ifdef CONFIG_X86_INTEL_MEMORY_PROTECTION_KEYS +#define arch_calc_vm_prot_bits(prot, key) ( \ + ((key) & 0x1 ? VM_PKEY_BIT0 : 0) | \ + ((key) & 0x2 ? VM_PKEY_BIT1 : 0) | \ + ((key) & 0x4 ? VM_PKEY_BIT2 : 0) | \ + ((key) & 0x8 ? 
VM_PKEY_BIT3 : 0)) +#endif + +#include <uapi/asm/mman.h> + +#endif /* __ASM_MMAN_H__ */ diff --git a/arch/x86/include/asm/mmu.h b/arch/x86/include/asm/mmu.h index 5d7494631ea9..0fe9c569d171 100644 --- a/arch/x86/include/asm/mmu.h +++ b/arch/x86/include/asm/mmu.h @@ -9,9 +9,15 @@ #include <linux/bits.h> /* Uprobes on this MM assume 32-bit code */ -#define MM_CONTEXT_UPROBE_IA32 BIT(0) +#define MM_CONTEXT_UPROBE_IA32 0 /* vsyscall page is accessible on this MM */ -#define MM_CONTEXT_HAS_VSYSCALL BIT(1) +#define MM_CONTEXT_HAS_VSYSCALL 1 +/* Do not allow changing LAM mode */ +#define MM_CONTEXT_LOCK_LAM 2 +/* Allow LAM and SVA coexisting */ +#define MM_CONTEXT_FORCE_TAGGED_SVA 3 +/* Tracks mm_cpumask */ +#define MM_CONTEXT_NOTRACK 4 /* * x86 has arch-specific MMU state beyond what lives in mm_struct. @@ -33,13 +39,21 @@ typedef struct { */ atomic64_t tlb_gen; + unsigned long next_trim_cpumask; + #ifdef CONFIG_MODIFY_LDT_SYSCALL struct rw_semaphore ldt_usr_sem; struct ldt_struct *ldt; #endif -#ifdef CONFIG_X86_64 - unsigned short flags; + unsigned long flags; + +#ifdef CONFIG_ADDRESS_MASKING + /* Active LAM mode: X86_CR3_LAM_U48 or X86_CR3_LAM_U57 or 0 (disabled) */ + unsigned long lam_cr3_mask; + + /* Significant bits of the virtual address. Excludes tag bits. */ + u64 untag_mask; #endif struct mutex lock; @@ -55,6 +69,18 @@ typedef struct { u16 pkey_allocation_map; s16 execute_only_pkey; #endif + +#ifdef CONFIG_BROADCAST_TLB_FLUSH + /* + * The global ASID will be a non-zero value when the process has + * the same ASID across all CPUs, allowing it to make use of + * hardware-assisted remote TLB invalidation like AMD INVLPGB. + */ + u16 global_asid; + + /* The process is transitioning to a new global ASID number. 
*/ + bool asid_transition; +#endif } mm_context_t; #define INIT_MM_CONTEXT(mm) \ @@ -63,7 +89,7 @@ typedef struct { .lock = __MUTEX_INITIALIZER(mm.context.lock), \ } -void leave_mm(int cpu); +void leave_mm(void); #define leave_mm leave_mm #endif /* _ASM_X86_MMU_H */ diff --git a/arch/x86/include/asm/mmu_context.h b/arch/x86/include/asm/mmu_context.h index b8d40ddeab00..73bf3b1b44e8 100644 --- a/arch/x86/include/asm/mmu_context.h +++ b/arch/x86/include/asm/mmu_context.h @@ -2,7 +2,6 @@ #ifndef _ASM_X86_MMU_CONTEXT_H #define _ASM_X86_MMU_CONTEXT_H -#include <asm/desc.h> #include <linux/atomic.h> #include <linux/mm_types.h> #include <linux/pkeys.h> @@ -12,16 +11,11 @@ #include <asm/tlbflush.h> #include <asm/paravirt.h> #include <asm/debugreg.h> +#include <asm/gsseg.h> +#include <asm/desc.h> extern atomic64_t last_mm_ctx_id; -#ifndef CONFIG_PARAVIRT_XXL -static inline void paravirt_activate_mm(struct mm_struct *prev, - struct mm_struct *next) -{ -} -#endif /* !CONFIG_PARAVIRT_XXL */ - #ifdef CONFIG_PERF_EVENTS DECLARE_STATIC_KEY_FALSE(rdpmc_never_available_key); DECLARE_STATIC_KEY_FALSE(rdpmc_always_available_key); @@ -91,9 +85,65 @@ static inline void switch_ldt(struct mm_struct *prev, struct mm_struct *next) } #endif +#ifdef CONFIG_ADDRESS_MASKING +static inline unsigned long mm_lam_cr3_mask(struct mm_struct *mm) +{ + /* + * When switch_mm_irqs_off() is called for a kthread, it may race with + * LAM enablement. switch_mm_irqs_off() uses the LAM mask to do two + * things: populate CR3 and populate 'cpu_tlbstate.lam'. Make sure it + * reads a single value for both. 
+ */ + return READ_ONCE(mm->context.lam_cr3_mask); +} + +static inline void dup_lam(struct mm_struct *oldmm, struct mm_struct *mm) +{ + mm->context.lam_cr3_mask = oldmm->context.lam_cr3_mask; + mm->context.untag_mask = oldmm->context.untag_mask; +} + +#define mm_untag_mask mm_untag_mask +static inline unsigned long mm_untag_mask(struct mm_struct *mm) +{ + return mm->context.untag_mask; +} + +static inline void mm_reset_untag_mask(struct mm_struct *mm) +{ + mm->context.untag_mask = -1UL; +} + +#define arch_pgtable_dma_compat arch_pgtable_dma_compat +static inline bool arch_pgtable_dma_compat(struct mm_struct *mm) +{ + return !mm_lam_cr3_mask(mm) || + test_bit(MM_CONTEXT_FORCE_TAGGED_SVA, &mm->context.flags); +} +#else + +static inline unsigned long mm_lam_cr3_mask(struct mm_struct *mm) +{ + return 0; +} + +static inline void dup_lam(struct mm_struct *oldmm, struct mm_struct *mm) +{ +} + +static inline void mm_reset_untag_mask(struct mm_struct *mm) +{ +} +#endif + #define enter_lazy_tlb enter_lazy_tlb extern void enter_lazy_tlb(struct mm_struct *mm, struct task_struct *tsk); +#define mm_init_global_asid mm_init_global_asid +extern void mm_init_global_asid(struct mm_struct *mm); + +extern void mm_free_global_asid(struct mm_struct *mm); + /* * Init a new mm. Used on mm copies, like at fork() * and on mm's that are brand-new, like at execve(). 
@@ -106,6 +156,7 @@ static inline int init_new_context(struct task_struct *tsk, mm->context.ctx_id = atomic64_inc_return(&last_mm_ctx_id); atomic64_set(&mm->context.tlb_gen, 0); + mm->context.next_trim_cpumask = jiffies + HZ; #ifdef CONFIG_X86_INTEL_MEMORY_PROTECTION_KEYS if (cpu_feature_enabled(X86_FEATURE_OSPKE)) { @@ -115,6 +166,9 @@ static inline int init_new_context(struct task_struct *tsk, mm->context.execute_only_pkey = -1; } #endif + + mm_init_global_asid(mm); + mm_reset_untag_mask(mm); init_new_context_ldt(mm); return 0; } @@ -123,6 +177,7 @@ static inline int init_new_context(struct task_struct *tsk, static inline void destroy_context(struct mm_struct *mm) { destroy_context_ldt(mm); + mm_free_global_asid(mm); } extern void switch_mm(struct mm_struct *prev, struct mm_struct *next, @@ -134,8 +189,8 @@ extern void switch_mm_irqs_off(struct mm_struct *prev, struct mm_struct *next, #define activate_mm(prev, next) \ do { \ - paravirt_activate_mm((prev), (next)); \ - switch_mm((prev), (next), NULL); \ + paravirt_enter_mmap(next); \ + switch_mm_irqs_off((prev), (next), NULL); \ } while (0); #ifdef CONFIG_X86_32 @@ -146,6 +201,7 @@ do { \ #else #define deactivate_mm(tsk, mm) \ do { \ + shstk_free(tsk); \ load_gs_index(0); \ loadsegment(fs, 0); \ } while (0) @@ -167,7 +223,8 @@ static inline void arch_dup_pkeys(struct mm_struct *oldmm, static inline int arch_dup_mmap(struct mm_struct *oldmm, struct mm_struct *mm) { arch_dup_pkeys(oldmm, mm); - paravirt_arch_dup_mmap(oldmm, mm); + paravirt_enter_mmap(mm); + dup_lam(oldmm, mm); return ldt_dup_context(oldmm, mm); } @@ -181,7 +238,7 @@ static inline void arch_exit_mmap(struct mm_struct *mm) static inline bool is_64bit_mm(struct mm_struct *mm) { return !IS_ENABLED(CONFIG_IA32_EMULATION) || - !(mm->context.flags & MM_CONTEXT_UPROBE_IA32); + !test_bit(MM_CONTEXT_UPROBE_IA32, &mm->context.flags); } #else static inline bool is_64bit_mm(struct mm_struct *mm) @@ -190,9 +247,14 @@ static inline bool is_64bit_mm(struct 
mm_struct *mm) } #endif -static inline void arch_unmap(struct mm_struct *mm, unsigned long start, - unsigned long end) +static inline bool is_notrack_mm(struct mm_struct *mm) { + return test_bit(MM_CONTEXT_NOTRACK, &mm->context.flags); +} + +static inline void set_notrack_mm(struct mm_struct *mm) +{ + set_bit(MM_CONTEXT_NOTRACK, &mm->context.flags); } /* @@ -220,4 +282,7 @@ unsigned long __get_current_cr3_fast(void); #include <asm-generic/mmu_context.h> +extern struct mm_struct *use_temporary_mm(struct mm_struct *temp_mm); +extern void unuse_temporary_mm(struct mm_struct *prev_mm); + #endif /* _ASM_X86_MMU_CONTEXT_H */ diff --git a/arch/x86/include/asm/mmzone.h b/arch/x86/include/asm/mmzone.h deleted file mode 100644 index c41b41edd691..000000000000 --- a/arch/x86/include/asm/mmzone.h +++ /dev/null @@ -1,6 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -#ifdef CONFIG_X86_32 -# include <asm/mmzone_32.h> -#else -# include <asm/mmzone_64.h> -#endif diff --git a/arch/x86/include/asm/mmzone_32.h b/arch/x86/include/asm/mmzone_32.h deleted file mode 100644 index 2d4515e8b7df..000000000000 --- a/arch/x86/include/asm/mmzone_32.h +++ /dev/null @@ -1,17 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -/* - * Written by Pat Gaughen (gone@us.ibm.com) Mar 2002 - * - */ - -#ifndef _ASM_X86_MMZONE_32_H -#define _ASM_X86_MMZONE_32_H - -#include <asm/smp.h> - -#ifdef CONFIG_NUMA -extern struct pglist_data *node_data[]; -#define NODE_DATA(nid) (node_data[nid]) -#endif /* CONFIG_NUMA */ - -#endif /* _ASM_X86_MMZONE_32_H */ diff --git a/arch/x86/include/asm/mmzone_64.h b/arch/x86/include/asm/mmzone_64.h deleted file mode 100644 index 0c585046f744..000000000000 --- a/arch/x86/include/asm/mmzone_64.h +++ /dev/null @@ -1,18 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -/* K8 NUMA support */ -/* Copyright 2002,2003 by Andi Kleen, SuSE Labs */ -/* 2.5 Version loosely based on the NUMAQ Code by Pat Gaughen. 
*/ -#ifndef _ASM_X86_MMZONE_64_H -#define _ASM_X86_MMZONE_64_H - -#ifdef CONFIG_NUMA - -#include <linux/mmdebug.h> -#include <asm/smp.h> - -extern struct pglist_data *node_data[]; - -#define NODE_DATA(nid) (node_data[nid]) - -#endif -#endif /* _ASM_X86_MMZONE_64_H */ diff --git a/arch/x86/include/asm/module.h b/arch/x86/include/asm/module.h index e988bac0a4a1..3c2de4ce3b10 100644 --- a/arch/x86/include/asm/module.h +++ b/arch/x86/include/asm/module.h @@ -5,12 +5,20 @@ #include <asm-generic/module.h> #include <asm/orc_types.h> +struct its_array { +#ifdef CONFIG_MITIGATION_ITS + void **pages; + int num; +#endif +}; + struct mod_arch_specific { #ifdef CONFIG_UNWINDER_ORC unsigned int num_orcs; int *orc_unwind_ip; struct orc_entry *orc_unwind; #endif + struct its_array its_pages; }; #endif /* _ASM_X86_MODULE_H */ diff --git a/arch/x86/include/asm/mpspec.h b/arch/x86/include/asm/mpspec.h index e90ac7e9ae2c..d593e52e6635 100644 --- a/arch/x86/include/asm/mpspec.h +++ b/arch/x86/include/asm/mpspec.h @@ -2,6 +2,7 @@ #ifndef _ASM_X86_MPSPEC_H #define _ASM_X86_MPSPEC_H +#include <linux/types.h> #include <asm/mpspec_def.h> #include <asm/x86_init.h> @@ -15,16 +16,14 @@ extern int pic_mode; * Summit or generic (i.e. installer) kernels need lots of bus entries. * Maximum 256 PCI busses, plus 1 ISA bus in each of 4 cabinets. 
*/ -#if CONFIG_BASE_SMALL == 0 -# define MAX_MP_BUSSES 260 -#else +#ifdef CONFIG_BASE_SMALL # define MAX_MP_BUSSES 32 +#else +# define MAX_MP_BUSSES 260 #endif #define MAX_IRQ_SOURCES 256 -extern unsigned int def_to_bigsmp; - #else /* CONFIG_X86_64: */ #define MAX_MP_BUSSES 256 @@ -39,9 +38,8 @@ extern int mp_bus_id_to_type[MAX_MP_BUSSES]; extern DECLARE_BITMAP(mp_bus_not_pci, MAX_MP_BUSSES); -extern unsigned int boot_cpu_physical_apicid; +extern u32 boot_cpu_physical_apicid; extern u8 boot_cpu_apic_version; -extern unsigned long mp_lapic_addr; #ifdef CONFIG_X86_LOCAL_APIC extern int smp_found_config; @@ -49,96 +47,31 @@ extern int smp_found_config; # define smp_found_config 0 #endif -static inline void get_smp_config(void) -{ - x86_init.mpparse.get_smp_config(0); -} - -static inline void early_get_smp_config(void) -{ - x86_init.mpparse.get_smp_config(1); -} - -static inline void find_smp_config(void) -{ - x86_init.mpparse.find_smp_config(); -} - #ifdef CONFIG_X86_MPPARSE extern void e820__memblock_alloc_reserved_mpc_new(void); extern int enable_update_mptable; -extern void default_find_smp_config(void); -extern void default_get_smp_config(unsigned int early); +extern void mpparse_find_mptable(void); +extern void mpparse_parse_early_smp_config(void); +extern void mpparse_parse_smp_config(void); #else static inline void e820__memblock_alloc_reserved_mpc_new(void) { } -#define enable_update_mptable 0 -#define default_find_smp_config x86_init_noop -#define default_get_smp_config x86_init_uint_noop +#define enable_update_mptable 0 +#define mpparse_find_mptable x86_init_noop +#define mpparse_parse_early_smp_config x86_init_noop +#define mpparse_parse_smp_config x86_init_noop #endif -int generic_processor_info(int apicid, int version); - -#define PHYSID_ARRAY_SIZE BITS_TO_LONGS(MAX_LOCAL_APIC) - -struct physid_mask { - unsigned long mask[PHYSID_ARRAY_SIZE]; -}; +extern DECLARE_BITMAP(phys_cpu_present_map, MAX_LOCAL_APIC); -typedef struct physid_mask physid_mask_t; - 
-#define physid_set(physid, map) set_bit(physid, (map).mask) -#define physid_clear(physid, map) clear_bit(physid, (map).mask) -#define physid_isset(physid, map) test_bit(physid, (map).mask) -#define physid_test_and_set(physid, map) \ - test_and_set_bit(physid, (map).mask) - -#define physids_and(dst, src1, src2) \ - bitmap_and((dst).mask, (src1).mask, (src2).mask, MAX_LOCAL_APIC) - -#define physids_or(dst, src1, src2) \ - bitmap_or((dst).mask, (src1).mask, (src2).mask, MAX_LOCAL_APIC) - -#define physids_clear(map) \ - bitmap_zero((map).mask, MAX_LOCAL_APIC) - -#define physids_complement(dst, src) \ - bitmap_complement((dst).mask, (src).mask, MAX_LOCAL_APIC) - -#define physids_empty(map) \ - bitmap_empty((map).mask, MAX_LOCAL_APIC) - -#define physids_equal(map1, map2) \ - bitmap_equal((map1).mask, (map2).mask, MAX_LOCAL_APIC) - -#define physids_weight(map) \ - bitmap_weight((map).mask, MAX_LOCAL_APIC) - -#define physids_shift_right(d, s, n) \ - bitmap_shift_right((d).mask, (s).mask, n, MAX_LOCAL_APIC) - -#define physids_shift_left(d, s, n) \ - bitmap_shift_left((d).mask, (s).mask, n, MAX_LOCAL_APIC) - -static inline unsigned long physids_coerce(physid_mask_t *map) -{ - return map->mask[0]; -} - -static inline void physids_promote(unsigned long physids, physid_mask_t *map) +static inline void reset_phys_cpu_present_map(u32 apicid) { - physids_clear(*map); - map->mask[0] = physids; + bitmap_zero(phys_cpu_present_map, MAX_LOCAL_APIC); + set_bit(apicid, phys_cpu_present_map); } -static inline void physid_set_mask_of_physid(int physid, physid_mask_t *map) +static inline void copy_phys_cpu_present_map(unsigned long *dst) { - physids_clear(*map); - physid_set(physid, *map); + bitmap_copy(dst, phys_cpu_present_map, MAX_LOCAL_APIC); } -#define PHYSID_MASK_ALL { {[0 ... PHYSID_ARRAY_SIZE-1] = ~0UL} } -#define PHYSID_MASK_NONE { {[0 ... 
PHYSID_ARRAY_SIZE-1] = 0UL} } - -extern physid_mask_t phys_cpu_present_map; - #endif /* _ASM_X86_MPSPEC_H */ diff --git a/arch/x86/include/asm/mshyperv.h b/arch/x86/include/asm/mshyperv.h index 61f0c206bff0..e1752ba47e67 100644 --- a/arch/x86/include/asm/mshyperv.h +++ b/arch/x86/include/asm/mshyperv.h @@ -5,37 +5,62 @@ #include <linux/types.h> #include <linux/nmi.h> #include <linux/msi.h> -#include <asm/io.h> -#include <asm/hyperv-tlfs.h> +#include <linux/io.h> #include <asm/nospec-branch.h> #include <asm/paravirt.h> -#include <asm/mshyperv.h> +#include <asm/msr.h> +#include <hyperv/hvhdk.h> + +/* + * Hyper-V always provides a single IO-APIC at this MMIO address. + * Ideally, the value should be looked up in ACPI tables, but it + * is needed for mapping the IO-APIC early in boot on Confidential + * VMs, before ACPI functions can be used. + */ +#define HV_IOAPIC_BASE_ADDRESS 0xfec00000 + +#define HV_VTL_NORMAL 0x0 +#define HV_VTL_SECURE 0x1 +#define HV_VTL_MGMT 0x2 union hv_ghcb; DECLARE_STATIC_KEY_FALSE(isolation_type_snp); +DECLARE_STATIC_KEY_FALSE(isolation_type_tdx); typedef int (*hyperv_fill_flush_list_func)( struct hv_guest_mapping_flush_list *flush, void *data); -#define hv_get_raw_timer() rdtsc_ordered() - void hyperv_vector_handler(struct pt_regs *regs); +static inline unsigned char hv_get_nmi_reason(void) +{ + return 0; +} + #if IS_ENABLED(CONFIG_HYPERV) -extern int hyperv_init_cpuhp; +extern bool hyperv_paravisor_present; extern void *hv_hypercall_pg; -extern u64 hv_current_partition_id; - extern union hv_ghcb * __percpu *hv_ghcb_pg; -int hv_call_deposit_pages(int node, u64 partition_id, u32 num_pages); -int hv_call_add_logical_proc(int node, u32 lp_index, u32 acpi_id); -int hv_call_create_vp(int node, u64 partition_id, u32 vp_index, u32 flags); - +bool hv_isolation_type_snp(void); +bool hv_isolation_type_tdx(void); +u64 hv_tdx_hypercall(u64 control, u64 param1, u64 param2); + +/* + * DEFAULT INIT GPAT and SEGMENT LIMIT value in struct VMSA + * to start 
AP in enlightened SEV guest. + */ +#define HV_AP_INIT_GPAT_DEFAULT 0x0007040600070406ULL +#define HV_AP_SEGMENT_LIMIT 0xffffffff + +/* + * If the hypercall involves no input or output parameters, the hypervisor + * ignores the corresponding GPA pointer. + */ static inline u64 hv_do_hypercall(u64 control, void *input, void *output) { u64 input_address = input ? virt_to_phys(input) : 0; @@ -43,15 +68,28 @@ static inline u64 hv_do_hypercall(u64 control, void *input, void *output) u64 hv_status; #ifdef CONFIG_X86_64 + if (hv_isolation_type_tdx() && !hyperv_paravisor_present) + return hv_tdx_hypercall(control, input_address, output_address); + + if (hv_isolation_type_snp() && !hyperv_paravisor_present) { + __asm__ __volatile__("mov %[output_address], %%r8\n" + "vmmcall" + : "=a" (hv_status), ASM_CALL_CONSTRAINT, + "+c" (control), "+d" (input_address) + : [output_address] "r" (output_address) + : "cc", "memory", "r8", "r9", "r10", "r11"); + return hv_status; + } + if (!hv_hypercall_pg) return U64_MAX; - __asm__ __volatile__("mov %4, %%r8\n" + __asm__ __volatile__("mov %[output_address], %%r8\n" CALL_NOSPEC : "=a" (hv_status), ASM_CALL_CONSTRAINT, "+c" (control), "+d" (input_address) - : "r" (output_address), - THUNK_TARGET(hv_hypercall_pg) + : [output_address] "r" (output_address), + THUNK_TARGET(hv_hypercall_pg) : "cc", "memory", "r8", "r9", "r10", "r11"); #else u32 input_address_hi = upper_32_bits(input_address); @@ -74,13 +112,28 @@ static inline u64 hv_do_hypercall(u64 control, void *input, void *output) return hv_status; } +/* Hypercall to the L0 hypervisor */ +static inline u64 hv_do_nested_hypercall(u64 control, void *input, void *output) +{ + return hv_do_hypercall(control | HV_HYPERCALL_NESTED, input, output); +} + /* Fast hypercall with 8 bytes of input and no output */ -static inline u64 hv_do_fast_hypercall8(u16 code, u64 input1) +static inline u64 _hv_do_fast_hypercall8(u64 control, u64 input1) { - u64 hv_status, control = (u64)code | HV_HYPERCALL_FAST_BIT; 
+ u64 hv_status; #ifdef CONFIG_X86_64 - { + if (hv_isolation_type_tdx() && !hyperv_paravisor_present) + return hv_tdx_hypercall(control, input1, 0); + + if (hv_isolation_type_snp() && !hyperv_paravisor_present) { + __asm__ __volatile__( + "vmmcall" + : "=a" (hv_status), ASM_CALL_CONSTRAINT, + "+c" (control), "+d" (input1) + :: "cc", "r8", "r9", "r10", "r11"); + } else { __asm__ __volatile__(CALL_NOSPEC : "=a" (hv_status), ASM_CALL_CONSTRAINT, "+c" (control), "+d" (input1) @@ -102,21 +155,45 @@ static inline u64 hv_do_fast_hypercall8(u16 code, u64 input1) : "cc", "edi", "esi"); } #endif - return hv_status; + return hv_status; +} + +static inline u64 hv_do_fast_hypercall8(u16 code, u64 input1) +{ + u64 control = (u64)code | HV_HYPERCALL_FAST_BIT; + + return _hv_do_fast_hypercall8(control, input1); +} + +static inline u64 hv_do_fast_nested_hypercall8(u16 code, u64 input1) +{ + u64 control = (u64)code | HV_HYPERCALL_FAST_BIT | HV_HYPERCALL_NESTED; + + return _hv_do_fast_hypercall8(control, input1); } /* Fast hypercall with 16 bytes of input */ -static inline u64 hv_do_fast_hypercall16(u16 code, u64 input1, u64 input2) +static inline u64 _hv_do_fast_hypercall16(u64 control, u64 input1, u64 input2) { - u64 hv_status, control = (u64)code | HV_HYPERCALL_FAST_BIT; + u64 hv_status; #ifdef CONFIG_X86_64 - { - __asm__ __volatile__("mov %4, %%r8\n" + if (hv_isolation_type_tdx() && !hyperv_paravisor_present) + return hv_tdx_hypercall(control, input1, input2); + + if (hv_isolation_type_snp() && !hyperv_paravisor_present) { + __asm__ __volatile__("mov %[input2], %%r8\n" + "vmmcall" + : "=a" (hv_status), ASM_CALL_CONSTRAINT, + "+c" (control), "+d" (input1) + : [input2] "r" (input2) + : "cc", "r8", "r9", "r10", "r11"); + } else { + __asm__ __volatile__("mov %[input2], %%r8\n" CALL_NOSPEC : "=a" (hv_status), ASM_CALL_CONSTRAINT, "+c" (control), "+d" (input1) - : "r" (input2), + : [input2] "r" (input2), THUNK_TARGET(hv_hypercall_pg) : "cc", "r8", "r9", "r10", "r11"); } @@ -139,6 
+216,20 @@ static inline u64 hv_do_fast_hypercall16(u16 code, u64 input1, u64 input2) return hv_status; } +static inline u64 hv_do_fast_hypercall16(u16 code, u64 input1, u64 input2) +{ + u64 control = (u64)code | HV_HYPERCALL_FAST_BIT; + + return _hv_do_fast_hypercall16(control, input1, input2); +} + +static inline u64 hv_do_fast_nested_hypercall16(u16 code, u64 input1, u64 input2) +{ + u64 control = (u64)code | HV_HYPERCALL_FAST_BIT | HV_HYPERCALL_NESTED; + + return _hv_do_fast_hypercall16(control, input1, input2); +} + extern struct hv_vp_assist_page **hv_vp_assist_page; static inline struct hv_vp_assist_page *hv_get_vp_assist_page(unsigned int cpu) @@ -174,54 +265,50 @@ struct irq_domain *hv_create_pci_msi_domain(void); int hv_map_ioapic_interrupt(int ioapic_id, bool level, int vcpu, int vector, struct hv_interrupt_entry *entry); int hv_unmap_ioapic_interrupt(int ioapic_id, struct hv_interrupt_entry *entry); -int hv_set_mem_host_visibility(unsigned long addr, int numpages, bool visible); #ifdef CONFIG_AMD_MEM_ENCRYPT -void hv_ghcb_msr_write(u64 msr, u64 value); -void hv_ghcb_msr_read(u64 msr, u64 *value); bool hv_ghcb_negotiate_protocol(void); -void hv_ghcb_terminate(unsigned int set, unsigned int reason); +void __noreturn hv_ghcb_terminate(unsigned int set, unsigned int reason); +int hv_snp_boot_ap(u32 apic_id, unsigned long start_ip, unsigned int cpu); #else -static inline void hv_ghcb_msr_write(u64 msr, u64 value) {} -static inline void hv_ghcb_msr_read(u64 msr, u64 *value) {} static inline bool hv_ghcb_negotiate_protocol(void) { return false; } static inline void hv_ghcb_terminate(unsigned int set, unsigned int reason) {} +static inline int hv_snp_boot_ap(u32 apic_id, unsigned long start_ip, + unsigned int cpu) { return 0; } #endif -extern bool hv_isolation_type_snp(void); +#if defined(CONFIG_AMD_MEM_ENCRYPT) || defined(CONFIG_INTEL_TDX_GUEST) +void hv_vtom_init(void); +void hv_ivm_msr_write(u64 msr, u64 value); +void hv_ivm_msr_read(u64 msr, u64 *value); 
+#else +static inline void hv_vtom_init(void) {} +static inline void hv_ivm_msr_write(u64 msr, u64 value) {} +static inline void hv_ivm_msr_read(u64 msr, u64 *value) {} +#endif -static inline bool hv_is_synic_reg(unsigned int reg) +static inline bool hv_is_synic_msr(unsigned int reg) { - if ((reg >= HV_REGISTER_SCONTROL) && - (reg <= HV_REGISTER_SINT15)) - return true; - return false; + return (reg >= HV_X64_MSR_SCONTROL) && + (reg <= HV_X64_MSR_SINT15); } -static inline u64 hv_get_register(unsigned int reg) +static inline bool hv_is_sint_msr(unsigned int reg) { - u64 value; - - if (hv_is_synic_reg(reg) && hv_isolation_type_snp()) - hv_ghcb_msr_read(reg, &value); - else - rdmsrl(reg, value); - return value; + return (reg >= HV_X64_MSR_SINT0) && + (reg <= HV_X64_MSR_SINT15); } -static inline void hv_set_register(unsigned int reg, u64 value) -{ - if (hv_is_synic_reg(reg) && hv_isolation_type_snp()) { - hv_ghcb_msr_write(reg, value); +u64 hv_get_msr(unsigned int reg); +void hv_set_msr(unsigned int reg, u64 value); +u64 hv_get_non_nested_msr(unsigned int reg); +void hv_set_non_nested_msr(unsigned int reg, u64 value); - /* Write proxy bit via wrmsl instruction */ - if (reg >= HV_REGISTER_SINT0 && - reg <= HV_REGISTER_SINT15) - wrmsrl(reg, value | 1 << 20); - } else { - wrmsrl(reg, value); - } +static __always_inline u64 hv_raw_get_msr(unsigned int reg) +{ + return native_rdmsrq(reg); } +int hv_apicid_to_vp_index(u32 apic_id); #else /* CONFIG_HYPERV */ static inline void hyperv_init(void) {} @@ -239,16 +326,22 @@ static inline int hyperv_flush_guest_mapping_range(u64 as, { return -1; } -static inline void hv_set_register(unsigned int reg, u64 value) { } -static inline u64 hv_get_register(unsigned int reg) { return 0; } -static inline int hv_set_mem_host_visibility(unsigned long addr, int numpages, - bool visible) -{ - return -1; -} +static inline void hv_set_msr(unsigned int reg, u64 value) { } +static inline u64 hv_get_msr(unsigned int reg) { return 0; } +static inline 
void hv_set_non_nested_msr(unsigned int reg, u64 value) { } +static inline u64 hv_get_non_nested_msr(unsigned int reg) { return 0; } +static inline int hv_apicid_to_vp_index(u32 apic_id) { return -EINVAL; } #endif /* CONFIG_HYPERV */ +#ifdef CONFIG_HYPERV_VTL_MODE +void __init hv_vtl_init_platform(void); +int __init hv_vtl_early_init(void); +#else +static inline void __init hv_vtl_init_platform(void) {} +static inline int __init hv_vtl_early_init(void) { return 0; } +#endif + #include <asm-generic/mshyperv.h> #endif diff --git a/arch/x86/include/asm/msi.h b/arch/x86/include/asm/msi.h index d71c7e8b738d..935c6d470341 100644 --- a/arch/x86/include/asm/msi.h +++ b/arch/x86/include/asm/msi.h @@ -62,4 +62,10 @@ typedef struct x86_msi_addr_hi { struct msi_msg; u32 x86_msi_msg_get_destid(struct msi_msg *msg, bool extid); +#define X86_VECTOR_MSI_FLAGS_SUPPORTED \ + (MSI_GENERIC_FLAGS_MASK | MSI_FLAG_PCI_MSIX | MSI_FLAG_PCI_MSIX_ALLOC_DYN) + +#define X86_VECTOR_MSI_FLAGS_REQUIRED \ + (MSI_FLAG_USE_DEF_DOM_OPS | MSI_FLAG_USE_DEF_CHIP_OPS) + #endif /* _ASM_X86_MSI_H */ diff --git a/arch/x86/include/asm/msr-index.h b/arch/x86/include/asm/msr-index.h index 1ac0f9bf4b90..b7dded3c8113 100644 --- a/arch/x86/include/asm/msr-index.h +++ b/arch/x86/include/asm/msr-index.h @@ -4,12 +4,7 @@ #include <linux/bits.h> -/* - * CPU model specific register (MSR) numbers. - * - * Do not add new entries to this file unless the definitions are shared - * between multiple compilation units. - */ +/* CPU model specific register (MSR) numbers. 
*/ /* x86-64 specific MSRs */ #define MSR_EFER 0xc0000080 /* extended feature register */ @@ -30,6 +25,8 @@ #define _EFER_SVME 12 /* Enable virtualization */ #define _EFER_LMSLE 13 /* Long Mode Segment Limit Enable */ #define _EFER_FFXSR 14 /* Enable Fast FXSAVE/FXRSTOR */ +#define _EFER_TCE 15 /* Enable Translation Cache Extensions */ +#define _EFER_AUTOIBRS 21 /* Enable Automatic IBRS */ #define EFER_SCE (1<<_EFER_SCE) #define EFER_LME (1<<_EFER_LME) @@ -38,9 +35,36 @@ #define EFER_SVME (1<<_EFER_SVME) #define EFER_LMSLE (1<<_EFER_LMSLE) #define EFER_FFXSR (1<<_EFER_FFXSR) +#define EFER_TCE (1<<_EFER_TCE) +#define EFER_AUTOIBRS (1<<_EFER_AUTOIBRS) -/* Intel MSRs. Some also available on other CPUs */ +/* + * Architectural memory types that are common to MTRRs, PAT, VMX MSRs, etc. + * Most MSRs support/allow only a subset of memory types, but the values + * themselves are common across all relevant MSRs. + */ +#define X86_MEMTYPE_UC 0ull /* Uncacheable, a.k.a. Strong Uncacheable */ +#define X86_MEMTYPE_WC 1ull /* Write Combining */ +/* RESERVED 2 */ +/* RESERVED 3 */ +#define X86_MEMTYPE_WT 4ull /* Write Through */ +#define X86_MEMTYPE_WP 5ull /* Write Protected */ +#define X86_MEMTYPE_WB 6ull /* Write Back */ +#define X86_MEMTYPE_UC_MINUS 7ull /* Weak Uncacheabled (PAT only) */ + +/* FRED MSRs */ +#define MSR_IA32_FRED_RSP0 0x1cc /* Level 0 stack pointer */ +#define MSR_IA32_FRED_RSP1 0x1cd /* Level 1 stack pointer */ +#define MSR_IA32_FRED_RSP2 0x1ce /* Level 2 stack pointer */ +#define MSR_IA32_FRED_RSP3 0x1cf /* Level 3 stack pointer */ +#define MSR_IA32_FRED_STKLVLS 0x1d0 /* Exception stack levels */ +#define MSR_IA32_FRED_SSP0 MSR_IA32_PL0_SSP /* Level 0 shadow stack pointer */ +#define MSR_IA32_FRED_SSP1 0x1d1 /* Level 1 shadow stack pointer */ +#define MSR_IA32_FRED_SSP2 0x1d2 /* Level 2 shadow stack pointer */ +#define MSR_IA32_FRED_SSP3 0x1d3 /* Level 3 shadow stack pointer */ +#define MSR_IA32_FRED_CONFIG 0x1d4 /* Entrypoint and interrupt stack level */ 
+/* Intel MSRs. Some also available on other CPUs */ #define MSR_TEST_CTRL 0x00000033 #define MSR_TEST_CTRL_SPLIT_LOCK_DETECT_BIT 29 #define MSR_TEST_CTRL_SPLIT_LOCK_DETECT BIT(MSR_TEST_CTRL_SPLIT_LOCK_DETECT_BIT) @@ -53,9 +77,17 @@ #define SPEC_CTRL_SSBD BIT(SPEC_CTRL_SSBD_SHIFT) /* Speculative Store Bypass Disable */ #define SPEC_CTRL_RRSBA_DIS_S_SHIFT 6 /* Disable RRSBA behavior */ #define SPEC_CTRL_RRSBA_DIS_S BIT(SPEC_CTRL_RRSBA_DIS_S_SHIFT) +#define SPEC_CTRL_BHI_DIS_S_SHIFT 10 /* Disable Branch History Injection behavior */ +#define SPEC_CTRL_BHI_DIS_S BIT(SPEC_CTRL_BHI_DIS_S_SHIFT) + +/* A mask for bits which the kernel toggles when controlling mitigations */ +#define SPEC_CTRL_MITIGATIONS_MASK (SPEC_CTRL_IBRS | SPEC_CTRL_STIBP | SPEC_CTRL_SSBD \ + | SPEC_CTRL_RRSBA_DIS_S \ + | SPEC_CTRL_BHI_DIS_S) #define MSR_IA32_PRED_CMD 0x00000049 /* Prediction Command */ #define PRED_CMD_IBPB BIT(0) /* Indirect Branch Prediction Barrier */ +#define PRED_CMD_SBPB BIT(7) /* Selective Branch Prediction Barrier */ #define MSR_PPIN_CTL 0x0000004e #define MSR_PPIN 0x0000004f @@ -150,6 +182,43 @@ * are restricted to targets in * kernel. */ +#define ARCH_CAP_BHI_NO BIT(20) /* + * CPU is not affected by Branch + * History Injection. + */ +#define ARCH_CAP_XAPIC_DISABLE BIT(21) /* + * IA32_XAPIC_DISABLE_STATUS MSR + * supported + */ +#define ARCH_CAP_PBRSB_NO BIT(24) /* + * Not susceptible to Post-Barrier + * Return Stack Buffer Predictions. + */ +#define ARCH_CAP_GDS_CTRL BIT(25) /* + * CPU is vulnerable to Gather + * Data Sampling (GDS) and + * has controls for mitigation. + */ +#define ARCH_CAP_GDS_NO BIT(26) /* + * CPU is not vulnerable to Gather + * Data Sampling (GDS). + */ +#define ARCH_CAP_RFDS_NO BIT(27) /* + * Not susceptible to Register + * File Data Sampling. + */ +#define ARCH_CAP_RFDS_CLEAR BIT(28) /* + * VERW clears CPU Register + * File. + */ +#define ARCH_CAP_ITS_NO BIT_ULL(62) /* + * Not susceptible to + * Indirect Target Selection. 
+ * This bit is not set by + * HW, but is synthesized by + * VMMs for guests to know + * their affected status. + */ #define MSR_IA32_FLUSH_CMD 0x0000010b #define L1D_FLUSH BIT(0) /* @@ -168,6 +237,8 @@ #define RNGDS_MITG_DIS BIT(0) /* SRBDS support */ #define RTM_ALLOW BIT(1) /* TSX development mode */ #define FB_CLEAR_DIS BIT(3) /* CPU Fill buffer clear disable */ +#define GDS_MITG_DIS BIT(4) /* Disable GDS mitigation */ +#define GDS_MITG_LOCKED BIT(5) /* GDS mitigation locked */ #define MSR_IA32_SYSENTER_CS 0x00000174 #define MSR_IA32_SYSENTER_ESP 0x00000175 @@ -185,6 +256,9 @@ #define MSR_TURBO_RATIO_LIMIT1 0x000001ae #define MSR_TURBO_RATIO_LIMIT2 0x000001af +#define MSR_SNOOP_RSP_0 0x00001328 +#define MSR_SNOOP_RSP_1 0x00001329 + #define MSR_LBR_SELECT 0x000001c8 #define MSR_LBR_TOS 0x000001c9 @@ -193,8 +267,13 @@ /* Abbreviated from Intel SDM name IA32_INTEGRITY_CAPABILITIES */ #define MSR_INTEGRITY_CAPS 0x000002d9 +#define MSR_INTEGRITY_CAPS_ARRAY_BIST_BIT 2 +#define MSR_INTEGRITY_CAPS_ARRAY_BIST BIT(MSR_INTEGRITY_CAPS_ARRAY_BIST_BIT) #define MSR_INTEGRITY_CAPS_PERIODIC_BIST_BIT 4 #define MSR_INTEGRITY_CAPS_PERIODIC_BIST BIT(MSR_INTEGRITY_CAPS_PERIODIC_BIST_BIT) +#define MSR_INTEGRITY_CAPS_SBAF_BIT 8 +#define MSR_INTEGRITY_CAPS_SBAF BIT(MSR_INTEGRITY_CAPS_SBAF_BIT) +#define MSR_INTEGRITY_CAPS_SAF_GEN_MASK GENMASK_ULL(10, 9) #define MSR_LBR_NHM_FROM 0x00000680 #define MSR_LBR_NHM_TO 0x000006c0 @@ -209,6 +288,11 @@ #define LBR_INFO_CYCLES 0xffff #define LBR_INFO_BR_TYPE_OFFSET 56 #define LBR_INFO_BR_TYPE (0xfull << LBR_INFO_BR_TYPE_OFFSET) +#define LBR_INFO_BR_CNTR_OFFSET 32 +#define LBR_INFO_BR_CNTR_NUM 4 +#define LBR_INFO_BR_CNTR_BITS 2 +#define LBR_INFO_BR_CNTR_MASK GENMASK_ULL(LBR_INFO_BR_CNTR_BITS - 1, 0) +#define LBR_INFO_BR_CNTR_FULL_MASK GENMASK_ULL(LBR_INFO_BR_CNTR_NUM * LBR_INFO_BR_CNTR_BITS - 1, 0) #define MSR_ARCH_LBR_CTL 0x000014ce #define ARCH_LBR_CTL_LBREN BIT(0) @@ -231,6 +315,12 @@ #define PERF_CAP_PT_IDX 16 #define MSR_PEBS_LD_LAT_THRESHOLD 
0x000003f6 +#define PERF_CAP_PEBS_TRAP BIT_ULL(6) +#define PERF_CAP_ARCH_REG BIT_ULL(7) +#define PERF_CAP_PEBS_FORMAT 0xf00 +#define PERF_CAP_PEBS_BASELINE BIT_ULL(14) +#define PERF_CAP_PEBS_MASK (PERF_CAP_PEBS_TRAP | PERF_CAP_ARCH_REG | \ + PERF_CAP_PEBS_FORMAT | PERF_CAP_PEBS_BASELINE) #define MSR_IA32_RTIT_CTL 0x00000570 #define RTIT_CTL_TRACEEN BIT(0) @@ -299,6 +389,12 @@ #define MSR_IA32_CR_PAT 0x00000277 +#define PAT_VALUE(p0, p1, p2, p3, p4, p5, p6, p7) \ + ((X86_MEMTYPE_ ## p0) | (X86_MEMTYPE_ ## p1 << 8) | \ + (X86_MEMTYPE_ ## p2 << 16) | (X86_MEMTYPE_ ## p3 << 24) | \ + (X86_MEMTYPE_ ## p4 << 32) | (X86_MEMTYPE_ ## p5 << 40) | \ + (X86_MEMTYPE_ ## p6 << 48) | (X86_MEMTYPE_ ## p7 << 56)) + #define MSR_IA32_DEBUGCTLMSR 0x000001d9 #define MSR_IA32_LASTBRANCHFROMIP 0x000001db #define MSR_IA32_LASTBRANCHTOIP 0x000001dc @@ -309,7 +405,8 @@ #define MSR_IA32_PASID_VALID BIT_ULL(31) /* DEBUGCTLMSR bits (others vary by model): */ -#define DEBUGCTLMSR_LBR (1UL << 0) /* last branch recording */ +#define DEBUGCTLMSR_LBR_BIT 0 /* last branch recording */ +#define DEBUGCTLMSR_LBR (1UL << DEBUGCTLMSR_LBR_BIT) #define DEBUGCTLMSR_BTF_SHIFT 1 #define DEBUGCTLMSR_BTF (1UL << 1) /* single-step on branches */ #define DEBUGCTLMSR_BUS_LOCK_DETECT (1UL << 2) @@ -436,7 +533,7 @@ #define MSR_HWP_CAPABILITIES 0x00000771 #define MSR_HWP_REQUEST_PKG 0x00000772 #define MSR_HWP_INTERRUPT 0x00000773 -#define MSR_HWP_REQUEST 0x00000774 +#define MSR_HWP_REQUEST 0x00000774 #define MSR_HWP_STATUS 0x00000777 /* CPUID.6.EAX */ @@ -453,16 +550,16 @@ #define HWP_LOWEST_PERF(x) (((x) >> 24) & 0xff) /* IA32_HWP_REQUEST */ -#define HWP_MIN_PERF(x) (x & 0xff) -#define HWP_MAX_PERF(x) ((x & 0xff) << 8) +#define HWP_MIN_PERF(x) (x & 0xff) +#define HWP_MAX_PERF(x) ((x & 0xff) << 8) #define HWP_DESIRED_PERF(x) ((x & 0xff) << 16) -#define HWP_ENERGY_PERF_PREFERENCE(x) (((unsigned long long) x & 0xff) << 24) +#define HWP_ENERGY_PERF_PREFERENCE(x) (((u64)x & 0xff) << 24) #define HWP_EPP_PERFORMANCE 0x00 
#define HWP_EPP_BALANCE_PERFORMANCE 0x80 #define HWP_EPP_BALANCE_POWERSAVE 0xC0 #define HWP_EPP_POWERSAVE 0xFF -#define HWP_ACTIVITY_WINDOW(x) ((unsigned long long)(x & 0xff3) << 32) -#define HWP_PACKAGE_CONTROL(x) ((unsigned long long)(x & 0x1) << 42) +#define HWP_ACTIVITY_WINDOW(x) ((u64)(x & 0xff3) << 32) +#define HWP_PACKAGE_CONTROL(x) ((u64)(x & 0x1) << 42) /* IA32_HWP_STATUS */ #define HWP_GUARANTEED_CHANGE(x) (x & 0x1) @@ -502,6 +599,19 @@ #define MSR_RELOAD_PMC0 0x000014c1 #define MSR_RELOAD_FIXED_CTR0 0x00001309 +/* V6 PMON MSR range */ +#define MSR_IA32_PMC_V6_GP0_CTR 0x1900 +#define MSR_IA32_PMC_V6_GP0_CFG_A 0x1901 +#define MSR_IA32_PMC_V6_GP0_CFG_B 0x1902 +#define MSR_IA32_PMC_V6_GP0_CFG_C 0x1903 +#define MSR_IA32_PMC_V6_FX0_CTR 0x1980 +#define MSR_IA32_PMC_V6_FX0_CFG_B 0x1982 +#define MSR_IA32_PMC_V6_FX0_CFG_C 0x1983 +#define MSR_IA32_PMC_V6_STEP 4 + +/* KeyID partitioning between MKTME and TDX */ +#define MSR_IA32_MKTME_KEYID_PARTITIONING 0x00000087 + /* * AMD64 MSRs. Not complete. See the architecture manual for a more * complete list. 
@@ -513,6 +623,7 @@ #define MSR_AMD_PERF_CTL 0xc0010062 #define MSR_AMD_PERF_STATUS 0xc0010063 #define MSR_AMD_PSTATE_DEF_BASE 0xc0010064 +#define MSR_AMD64_GUEST_TSC_FREQ 0xc0010134 #define MSR_AMD64_OSVW_ID_LENGTH 0xc0010140 #define MSR_AMD64_OSVW_STATUS 0xc0010141 #define MSR_AMD_PPIN_CTL 0xc00102f0 @@ -520,6 +631,13 @@ #define MSR_AMD64_CPUID_FN_1 0xc0011004 #define MSR_AMD64_LS_CFG 0xc0011020 #define MSR_AMD64_DC_CFG 0xc0011022 +#define MSR_AMD64_TW_CFG 0xc0011023 + +#define MSR_AMD64_DE_CFG 0xc0011029 +#define MSR_AMD64_DE_CFG_LFENCE_SERIALIZE_BIT 1 +#define MSR_AMD64_DE_CFG_LFENCE_SERIALIZE BIT_ULL(MSR_AMD64_DE_CFG_LFENCE_SERIALIZE_BIT) +#define MSR_AMD64_DE_CFG_ZEN2_FP_BACKUP_FIX_BIT 9 + #define MSR_AMD64_BU_CFG2 0xc001102a #define MSR_AMD64_IBSFETCHCTL 0xc0011030 #define MSR_AMD64_IBSFETCHLINAD 0xc0011031 @@ -542,16 +660,53 @@ #define MSR_AMD64_IBS_REG_COUNT_MAX 8 /* includes MSR_AMD64_IBSBRTARGET */ #define MSR_AMD64_SVM_AVIC_DOORBELL 0xc001011b #define MSR_AMD64_VM_PAGE_FLUSH 0xc001011e +#define MSR_AMD64_VIRT_SPEC_CTRL 0xc001011f #define MSR_AMD64_SEV_ES_GHCB 0xc0010130 #define MSR_AMD64_SEV 0xc0010131 #define MSR_AMD64_SEV_ENABLED_BIT 0 -#define MSR_AMD64_SEV_ES_ENABLED_BIT 1 -#define MSR_AMD64_SEV_SNP_ENABLED_BIT 2 #define MSR_AMD64_SEV_ENABLED BIT_ULL(MSR_AMD64_SEV_ENABLED_BIT) +#define MSR_AMD64_SEV_ES_ENABLED_BIT 1 #define MSR_AMD64_SEV_ES_ENABLED BIT_ULL(MSR_AMD64_SEV_ES_ENABLED_BIT) +#define MSR_AMD64_SEV_SNP_ENABLED_BIT 2 #define MSR_AMD64_SEV_SNP_ENABLED BIT_ULL(MSR_AMD64_SEV_SNP_ENABLED_BIT) - -#define MSR_AMD64_VIRT_SPEC_CTRL 0xc001011f +#define MSR_AMD64_SNP_VTOM_BIT 3 +#define MSR_AMD64_SNP_VTOM BIT_ULL(MSR_AMD64_SNP_VTOM_BIT) +#define MSR_AMD64_SNP_REFLECT_VC_BIT 4 +#define MSR_AMD64_SNP_REFLECT_VC BIT_ULL(MSR_AMD64_SNP_REFLECT_VC_BIT) +#define MSR_AMD64_SNP_RESTRICTED_INJ_BIT 5 +#define MSR_AMD64_SNP_RESTRICTED_INJ BIT_ULL(MSR_AMD64_SNP_RESTRICTED_INJ_BIT) +#define MSR_AMD64_SNP_ALT_INJ_BIT 6 +#define MSR_AMD64_SNP_ALT_INJ 
BIT_ULL(MSR_AMD64_SNP_ALT_INJ_BIT) +#define MSR_AMD64_SNP_DEBUG_SWAP_BIT 7 +#define MSR_AMD64_SNP_DEBUG_SWAP BIT_ULL(MSR_AMD64_SNP_DEBUG_SWAP_BIT) +#define MSR_AMD64_SNP_PREVENT_HOST_IBS_BIT 8 +#define MSR_AMD64_SNP_PREVENT_HOST_IBS BIT_ULL(MSR_AMD64_SNP_PREVENT_HOST_IBS_BIT) +#define MSR_AMD64_SNP_BTB_ISOLATION_BIT 9 +#define MSR_AMD64_SNP_BTB_ISOLATION BIT_ULL(MSR_AMD64_SNP_BTB_ISOLATION_BIT) +#define MSR_AMD64_SNP_VMPL_SSS_BIT 10 +#define MSR_AMD64_SNP_VMPL_SSS BIT_ULL(MSR_AMD64_SNP_VMPL_SSS_BIT) +#define MSR_AMD64_SNP_SECURE_TSC_BIT 11 +#define MSR_AMD64_SNP_SECURE_TSC BIT_ULL(MSR_AMD64_SNP_SECURE_TSC_BIT) +#define MSR_AMD64_SNP_VMGEXIT_PARAM_BIT 12 +#define MSR_AMD64_SNP_VMGEXIT_PARAM BIT_ULL(MSR_AMD64_SNP_VMGEXIT_PARAM_BIT) +#define MSR_AMD64_SNP_RESERVED_BIT13 BIT_ULL(13) +#define MSR_AMD64_SNP_IBS_VIRT_BIT 14 +#define MSR_AMD64_SNP_IBS_VIRT BIT_ULL(MSR_AMD64_SNP_IBS_VIRT_BIT) +#define MSR_AMD64_SNP_RESERVED_BIT15 BIT_ULL(15) +#define MSR_AMD64_SNP_VMSA_REG_PROT_BIT 16 +#define MSR_AMD64_SNP_VMSA_REG_PROT BIT_ULL(MSR_AMD64_SNP_VMSA_REG_PROT_BIT) +#define MSR_AMD64_SNP_SMT_PROT_BIT 17 +#define MSR_AMD64_SNP_SMT_PROT BIT_ULL(MSR_AMD64_SNP_SMT_PROT_BIT) +#define MSR_AMD64_SNP_RESV_BIT 18 +#define MSR_AMD64_SNP_RESERVED_MASK GENMASK_ULL(63, MSR_AMD64_SNP_RESV_BIT) +#define MSR_AMD64_RMP_BASE 0xc0010132 +#define MSR_AMD64_RMP_END 0xc0010133 +#define MSR_AMD64_RMP_CFG 0xc0010136 +#define MSR_AMD64_SEG_RMP_ENABLED_BIT 0 +#define MSR_AMD64_SEG_RMP_ENABLED BIT_ULL(MSR_AMD64_SEG_RMP_ENABLED_BIT) +#define MSR_AMD64_RMP_SEGMENT_SHIFT(x) (((x) & GENMASK_ULL(13, 8)) >> 8) + +#define MSR_SVSM_CAA 0xc001f000 /* AMD Collaborative Processor Performance Control MSRs */ #define MSR_AMD_CPPC_CAP1 0xc00102b0 @@ -560,27 +715,42 @@ #define MSR_AMD_CPPC_REQ 0xc00102b3 #define MSR_AMD_CPPC_STATUS 0xc00102b4 -#define AMD_CPPC_LOWEST_PERF(x) (((x) >> 0) & 0xff) -#define AMD_CPPC_LOWNONLIN_PERF(x) (((x) >> 8) & 0xff) -#define AMD_CPPC_NOMINAL_PERF(x) (((x) >> 16) & 0xff) -#define 
AMD_CPPC_HIGHEST_PERF(x) (((x) >> 24) & 0xff) +/* Masks for use with MSR_AMD_CPPC_CAP1 */ +#define AMD_CPPC_LOWEST_PERF_MASK GENMASK(7, 0) +#define AMD_CPPC_LOWNONLIN_PERF_MASK GENMASK(15, 8) +#define AMD_CPPC_NOMINAL_PERF_MASK GENMASK(23, 16) +#define AMD_CPPC_HIGHEST_PERF_MASK GENMASK(31, 24) -#define AMD_CPPC_MAX_PERF(x) (((x) & 0xff) << 0) -#define AMD_CPPC_MIN_PERF(x) (((x) & 0xff) << 8) -#define AMD_CPPC_DES_PERF(x) (((x) & 0xff) << 16) -#define AMD_CPPC_ENERGY_PERF_PREF(x) (((x) & 0xff) << 24) +/* Masks for use with MSR_AMD_CPPC_REQ */ +#define AMD_CPPC_MAX_PERF_MASK GENMASK(7, 0) +#define AMD_CPPC_MIN_PERF_MASK GENMASK(15, 8) +#define AMD_CPPC_DES_PERF_MASK GENMASK(23, 16) +#define AMD_CPPC_EPP_PERF_MASK GENMASK(31, 24) /* AMD Performance Counter Global Status and Control MSRs */ #define MSR_AMD64_PERF_CNTR_GLOBAL_STATUS 0xc0000300 #define MSR_AMD64_PERF_CNTR_GLOBAL_CTL 0xc0000301 #define MSR_AMD64_PERF_CNTR_GLOBAL_STATUS_CLR 0xc0000302 +/* AMD Last Branch Record MSRs */ +#define MSR_AMD64_LBR_SELECT 0xc000010e + +/* Zen4 */ +#define MSR_ZEN4_BP_CFG 0xc001102e +#define MSR_ZEN4_BP_CFG_BP_SPEC_REDUCE_BIT 4 +#define MSR_ZEN4_BP_CFG_SHARED_BTB_FIX_BIT 5 + +/* Fam 19h MSRs */ +#define MSR_F19H_UMC_PERF_CTL 0xc0010800 +#define MSR_F19H_UMC_PERF_CTR 0xc0010801 + +/* Zen 2 */ +#define MSR_ZEN2_SPECTRAL_CHICKEN 0xc00110e3 +#define MSR_ZEN2_SPECTRAL_CHICKEN_BIT BIT_ULL(1) + /* Fam 17h MSRs */ #define MSR_F17H_IRPERF 0xc00000e9 -#define MSR_ZEN2_SPECTRAL_CHICKEN 0xc00110e3 -#define MSR_ZEN2_SPECTRAL_CHICKEN_BIT BIT_ULL(1) - /* Fam 16h MSRs */ #define MSR_F16H_L2I_PERF_CTL 0xc0010230 #define MSR_F16H_L2I_PERF_CTR 0xc0010231 @@ -622,16 +792,20 @@ #define FAM10H_MMIO_CONF_BASE_MASK 0xfffffffULL #define FAM10H_MMIO_CONF_BASE_SHIFT 20 #define MSR_FAM10H_NODE_ID 0xc001100c -#define MSR_F10H_DECFG 0xc0011029 -#define MSR_F10H_DECFG_LFENCE_SERIALIZE_BIT 1 -#define MSR_F10H_DECFG_LFENCE_SERIALIZE BIT_ULL(MSR_F10H_DECFG_LFENCE_SERIALIZE_BIT) /* K8 MSRs */ #define 
MSR_K8_TOP_MEM1 0xc001001a #define MSR_K8_TOP_MEM2 0xc001001d #define MSR_AMD64_SYSCFG 0xc0010010 -#define MSR_AMD64_SYSCFG_MEM_ENCRYPT_BIT 23 +#define MSR_AMD64_SYSCFG_MEM_ENCRYPT_BIT 23 #define MSR_AMD64_SYSCFG_MEM_ENCRYPT BIT_ULL(MSR_AMD64_SYSCFG_MEM_ENCRYPT_BIT) +#define MSR_AMD64_SYSCFG_SNP_EN_BIT 24 +#define MSR_AMD64_SYSCFG_SNP_EN BIT_ULL(MSR_AMD64_SYSCFG_SNP_EN_BIT) +#define MSR_AMD64_SYSCFG_SNP_VMPL_EN_BIT 25 +#define MSR_AMD64_SYSCFG_SNP_VMPL_EN BIT_ULL(MSR_AMD64_SYSCFG_SNP_VMPL_EN_BIT) +#define MSR_AMD64_SYSCFG_MFDM_BIT 19 +#define MSR_AMD64_SYSCFG_MFDM BIT_ULL(MSR_AMD64_SYSCFG_MFDM_BIT) + #define MSR_K8_INT_PENDING_MSG 0xc0010055 /* C1E active bits in int pending message */ #define K8_INTP_C1E_ACTIVE_MASK 0x18000000 @@ -658,6 +832,8 @@ #define MSR_K7_HWCR_IRPERF_EN BIT_ULL(MSR_K7_HWCR_IRPERF_EN_BIT) #define MSR_K7_FID_VID_CTL 0xc0010041 #define MSR_K7_FID_VID_STATUS 0xc0010042 +#define MSR_K7_HWCR_CPB_DIS_BIT 25 +#define MSR_K7_HWCR_CPB_DIS BIT_ULL(MSR_K7_HWCR_CPB_DIS_BIT) /* K6 MSRs */ #define MSR_K6_WHCR 0xc0000082 @@ -746,6 +922,8 @@ #define MSR_AMD_DBG_EXTN_CFG 0xc000010f #define MSR_AMD_SAMP_BR_FROM 0xc0010300 +#define DBG_EXTN_CFG_LBRV2EN BIT_ULL(6) + #define MSR_IA32_MPERF 0x000000e7 #define MSR_IA32_APERF 0x000000e8 @@ -776,6 +954,7 @@ #define ENERGY_PERF_BIAS_PERFORMANCE 0 #define ENERGY_PERF_BIAS_BALANCE_PERFORMANCE 4 #define ENERGY_PERF_BIAS_NORMAL 6 +#define ENERGY_PERF_BIAS_NORMAL_POWERSAVE 7 #define ENERGY_PERF_BIAS_BALANCE_POWERSAVE 8 #define ENERGY_PERF_BIAS_POWERSAVE 15 @@ -1019,28 +1198,44 @@ #define MSR_IA32_VMX_TRUE_EXIT_CTLS 0x0000048f #define MSR_IA32_VMX_TRUE_ENTRY_CTLS 0x00000490 #define MSR_IA32_VMX_VMFUNC 0x00000491 +#define MSR_IA32_VMX_PROCBASED_CTLS3 0x00000492 + +/* Resctrl MSRs: */ +/* - Intel: */ +#define MSR_IA32_L3_QOS_CFG 0xc81 +#define MSR_IA32_L2_QOS_CFG 0xc82 +#define MSR_IA32_QM_EVTSEL 0xc8d +#define MSR_IA32_QM_CTR 0xc8e +#define MSR_IA32_PQR_ASSOC 0xc8f +#define MSR_IA32_L3_CBM_BASE 0xc90 +#define 
MSR_RMID_SNC_CONFIG 0xca0 +#define MSR_IA32_L2_CBM_BASE 0xd10 +#define MSR_IA32_MBA_THRTL_BASE 0xd50 + +/* - AMD: */ +#define MSR_IA32_MBA_BW_BASE 0xc0000200 +#define MSR_IA32_SMBA_BW_BASE 0xc0000280 +#define MSR_IA32_EVT_CFG_BASE 0xc0000400 -/* VMX_BASIC bits and bitmasks */ -#define VMX_BASIC_VMCS_SIZE_SHIFT 32 -#define VMX_BASIC_TRUE_CTLS (1ULL << 55) -#define VMX_BASIC_64 0x0001000000000000LLU -#define VMX_BASIC_MEM_TYPE_SHIFT 50 -#define VMX_BASIC_MEM_TYPE_MASK 0x003c000000000000LLU -#define VMX_BASIC_MEM_TYPE_WB 6LLU -#define VMX_BASIC_INOUT 0x0040000000000000LLU - -/* MSR_IA32_VMX_MISC bits */ -#define MSR_IA32_VMX_MISC_INTEL_PT (1ULL << 14) -#define MSR_IA32_VMX_MISC_VMWRITE_SHADOW_RO_FIELDS (1ULL << 29) -#define MSR_IA32_VMX_MISC_PREEMPTION_TIMER_SCALE 0x1F /* AMD-V MSRs */ - #define MSR_VM_CR 0xc0010114 #define MSR_VM_IGNNE 0xc0010115 #define MSR_VM_HSAVE_PA 0xc0010117 +#define SVM_VM_CR_VALID_MASK 0x001fULL +#define SVM_VM_CR_SVM_LOCK_MASK 0x0008ULL +#define SVM_VM_CR_SVM_DIS_MASK 0x0010ULL + /* Hardware Feedback Interface */ #define MSR_IA32_HW_FEEDBACK_PTR 0x17d0 #define MSR_IA32_HW_FEEDBACK_CONFIG 0x17d1 +/* x2APIC locked status */ +#define MSR_IA32_XAPIC_DISABLE_STATUS 0xBD +#define LEGACY_XAPIC_DISABLED BIT(0) /* + * x2APIC mode is locked and + * disabling x2APIC will cause + * a #GP + */ + #endif /* _ASM_X86_MSR_INDEX_H */ diff --git a/arch/x86/include/asm/msr.h b/arch/x86/include/asm/msr.h index 65ec1965cd28..9c2ea29e12a9 100644 --- a/arch/x86/include/asm/msr.h +++ b/arch/x86/include/asm/msr.h @@ -4,7 +4,7 @@ #include "msr-index.h" -#ifndef __ASSEMBLY__ +#ifndef __ASSEMBLER__ #include <asm/asm.h> #include <asm/errno.h> @@ -12,11 +12,14 @@ #include <uapi/asm/msr.h> #include <asm/shared/msr.h> +#include <linux/types.h> +#include <linux/percpu.h> + struct msr_info { - u32 msr_no; - struct msr reg; - struct msr *msrs; - int err; + u32 msr_no; + struct msr reg; + struct msr __percpu *msrs; + int err; }; struct msr_regs_info { @@ -35,23 +38,6 @@ struct 
saved_msrs { }; /* - * both i386 and x86_64 returns 64-bit value in edx:eax, but gcc's "A" - * constraint has different meanings. For i386, "A" means exactly - * edx:eax, while for x86_64 it doesn't mean rdx:rax or edx:eax. Instead, - * it means rax *or* rdx. - */ -#ifdef CONFIG_X86_64 -/* Using 64-bit values saves one instruction clearing the high half of low */ -#define DECLARE_ARGS(val, low, high) unsigned long low, high -#define EAX_EDX_VAL(val, low, high) ((low) | (high) << 32) -#define EAX_EDX_RET(val, low, high) "=a" (low), "=d" (high) -#else -#define DECLARE_ARGS(val, low, high) unsigned long long val -#define EAX_EDX_VAL(val, low, high) (val) -#define EAX_EDX_RET(val, low, high) "=A" (val) -#endif - -/* * Be very careful with includes. This header is prone to include loops. */ #include <asm/atomic.h> @@ -61,13 +47,13 @@ struct saved_msrs { DECLARE_TRACEPOINT(read_msr); DECLARE_TRACEPOINT(write_msr); DECLARE_TRACEPOINT(rdpmc); -extern void do_trace_write_msr(unsigned int msr, u64 val, int failed); -extern void do_trace_read_msr(unsigned int msr, u64 val, int failed); -extern void do_trace_rdpmc(unsigned int msr, u64 val, int failed); +extern void do_trace_write_msr(u32 msr, u64 val, int failed); +extern void do_trace_read_msr(u32 msr, u64 val, int failed); +extern void do_trace_rdpmc(u32 msr, u64 val, int failed); #else -static inline void do_trace_write_msr(unsigned int msr, u64 val, int failed) {} -static inline void do_trace_read_msr(unsigned int msr, u64 val, int failed) {} -static inline void do_trace_rdpmc(unsigned int msr, u64 val, int failed) {} +static inline void do_trace_write_msr(u32 msr, u64 val, int failed) {} +static inline void do_trace_read_msr(u32 msr, u64 val, int failed) {} +static inline void do_trace_rdpmc(u32 msr, u64 val, int failed) {} #endif /* @@ -77,9 +63,9 @@ static inline void do_trace_rdpmc(unsigned int msr, u64 val, int failed) {} * think of extending them - you will be slapped with a stinking trout or a frozen * shark will 
reach you, wherever you are! You've been warned. */ -static __always_inline unsigned long long __rdmsr(unsigned int msr) +static __always_inline u64 __rdmsr(u32 msr) { - DECLARE_ARGS(val, low, high); + EAX_EDX_DECLARE_ARGS(val, low, high); asm volatile("1: rdmsr\n" "2:\n" @@ -89,12 +75,12 @@ static __always_inline unsigned long long __rdmsr(unsigned int msr) return EAX_EDX_VAL(val, low, high); } -static __always_inline void __wrmsr(unsigned int msr, u32 low, u32 high) +static __always_inline void __wrmsrq(u32 msr, u64 val) { asm volatile("1: wrmsr\n" "2:\n" _ASM_EXTABLE_TYPE(1b, 2b, EX_TYPE_WRMSR) - : : "c" (msr), "a"(low), "d" (high) : "memory"); + : : "c" (msr), "a" ((u32)val), "d" ((u32)(val >> 32)) : "memory"); } #define native_rdmsr(msr, val1, val2) \ @@ -104,16 +90,20 @@ do { \ (void)((val2) = (u32)(__val >> 32)); \ } while (0) +static __always_inline u64 native_rdmsrq(u32 msr) +{ + return __rdmsr(msr); +} + #define native_wrmsr(msr, low, high) \ - __wrmsr(msr, low, high) + __wrmsrq((msr), (u64)(high) << 32 | (low)) -#define native_wrmsrl(msr, val) \ - __wrmsr((msr), (u32)((u64)(val)), \ - (u32)((u64)(val) >> 32)) +#define native_wrmsrq(msr, val) \ + __wrmsrq((msr), (val)) -static inline unsigned long long native_read_msr(unsigned int msr) +static inline u64 native_read_msr(u32 msr) { - unsigned long long val; + u64 val; val = __rdmsr(msr); @@ -123,34 +113,35 @@ static inline unsigned long long native_read_msr(unsigned int msr) return val; } -static inline unsigned long long native_read_msr_safe(unsigned int msr, - int *err) +static inline int native_read_msr_safe(u32 msr, u64 *p) { - DECLARE_ARGS(val, low, high); + int err; + EAX_EDX_DECLARE_ARGS(val, low, high); asm volatile("1: rdmsr ; xor %[err],%[err]\n" "2:\n\t" _ASM_EXTABLE_TYPE_REG(1b, 2b, EX_TYPE_RDMSR_SAFE, %[err]) - : [err] "=r" (*err), EAX_EDX_RET(val, low, high) + : [err] "=r" (err), EAX_EDX_RET(val, low, high) : "c" (msr)); if (tracepoint_enabled(read_msr)) - do_trace_read_msr(msr, 
EAX_EDX_VAL(val, low, high), *err); - return EAX_EDX_VAL(val, low, high); + do_trace_read_msr(msr, EAX_EDX_VAL(val, low, high), err); + + *p = EAX_EDX_VAL(val, low, high); + + return err; } /* Can be uninlined because referenced by paravirt */ -static inline void notrace -native_write_msr(unsigned int msr, u32 low, u32 high) +static inline void notrace native_write_msr(u32 msr, u64 val) { - __wrmsr(msr, low, high); + native_wrmsrq(msr, val); if (tracepoint_enabled(write_msr)) - do_trace_write_msr(msr, ((u64)high << 32 | low), 0); + do_trace_write_msr(msr, val, 0); } /* Can be uninlined because referenced by paravirt */ -static inline int notrace -native_write_msr_safe(unsigned int msr, u32 low, u32 high) +static inline int notrace native_write_msr_safe(u32 msr, u64 val) { int err; @@ -158,73 +149,19 @@ native_write_msr_safe(unsigned int msr, u32 low, u32 high) "2:\n\t" _ASM_EXTABLE_TYPE_REG(1b, 2b, EX_TYPE_WRMSR_SAFE, %[err]) : [err] "=a" (err) - : "c" (msr), "0" (low), "d" (high) + : "c" (msr), "0" ((u32)val), "d" ((u32)(val >> 32)) : "memory"); if (tracepoint_enabled(write_msr)) - do_trace_write_msr(msr, ((u64)high << 32 | low), err); + do_trace_write_msr(msr, val, err); return err; } extern int rdmsr_safe_regs(u32 regs[8]); extern int wrmsr_safe_regs(u32 regs[8]); -/** - * rdtsc() - returns the current TSC without ordering constraints - * - * rdtsc() returns the result of RDTSC as a 64-bit integer. The - * only ordering constraint it supplies is the ordering implied by - * "asm volatile": it will put the RDTSC in the place you expect. The - * CPU can and will speculatively execute that RDTSC, though, so the - * results can be non-monotonic if compared on different CPUs. 
- */ -static __always_inline unsigned long long rdtsc(void) -{ - DECLARE_ARGS(val, low, high); - - asm volatile("rdtsc" : EAX_EDX_RET(val, low, high)); - - return EAX_EDX_VAL(val, low, high); -} - -/** - * rdtsc_ordered() - read the current TSC in program order - * - * rdtsc_ordered() returns the result of RDTSC as a 64-bit integer. - * It is ordered like a load to a global in-memory counter. It should - * be impossible to observe non-monotonic rdtsc_unordered() behavior - * across multiple CPUs as long as the TSC is synced. - */ -static __always_inline unsigned long long rdtsc_ordered(void) +static inline u64 native_read_pmc(int counter) { - DECLARE_ARGS(val, low, high); - - /* - * The RDTSC instruction is not ordered relative to memory - * access. The Intel SDM and the AMD APM are both vague on this - * point, but empirically an RDTSC instruction can be - * speculatively executed before prior loads. An RDTSC - * immediately after an appropriate barrier appears to be - * ordered as a normal load, that is, it provides the same - * ordering guarantees as reading from a global memory location - * that some other imaginary CPU is updating continuously with a - * time stamp. - * - * Thus, use the preferred barrier on the respective CPU, aiming for - * RDTSCP as the default. - */ - asm volatile(ALTERNATIVE_2("rdtsc", - "lfence; rdtsc", X86_FEATURE_LFENCE_RDTSC, - "rdtscp", X86_FEATURE_RDTSCP) - : EAX_EDX_RET(val, low, high) - /* RDTSCP clobbers ECX with MSR_TSC_AUX. 
*/ - :: "ecx"); - - return EAX_EDX_VAL(val, low, high); -} - -static inline unsigned long long native_read_pmc(int counter) -{ - DECLARE_ARGS(val, low, high); + EAX_EDX_DECLARE_ARGS(val, low, high); asm volatile("rdpmc" : EAX_EDX_RET(val, low, high) : "c" (counter)); if (tracepoint_enabled(rdpmc)) @@ -249,78 +186,86 @@ do { \ (void)((high) = (u32)(__val >> 32)); \ } while (0) -static inline void wrmsr(unsigned int msr, u32 low, u32 high) +static inline void wrmsr(u32 msr, u32 low, u32 high) { - native_write_msr(msr, low, high); + native_write_msr(msr, (u64)high << 32 | low); } -#define rdmsrl(msr, val) \ +#define rdmsrq(msr, val) \ ((val) = native_read_msr((msr))) -static inline void wrmsrl(unsigned int msr, u64 val) +static inline void wrmsrq(u32 msr, u64 val) { - native_write_msr(msr, (u32)(val & 0xffffffffULL), (u32)(val >> 32)); + native_write_msr(msr, val); } /* wrmsr with exception handling */ -static inline int wrmsr_safe(unsigned int msr, u32 low, u32 high) +static inline int wrmsrq_safe(u32 msr, u64 val) { - return native_write_msr_safe(msr, low, high); + return native_write_msr_safe(msr, val); } /* rdmsr with exception handling */ #define rdmsr_safe(msr, low, high) \ ({ \ - int __err; \ - u64 __val = native_read_msr_safe((msr), &__err); \ + u64 __val; \ + int __err = native_read_msr_safe((msr), &__val); \ (*low) = (u32)__val; \ (*high) = (u32)(__val >> 32); \ __err; \ }) -static inline int rdmsrl_safe(unsigned int msr, unsigned long long *p) +static inline int rdmsrq_safe(u32 msr, u64 *p) { - int err; + return native_read_msr_safe(msr, p); +} - *p = native_read_msr_safe(msr, &err); - return err; +static __always_inline u64 rdpmc(int counter) +{ + return native_read_pmc(counter); } -#define rdpmc(counter, low, high) \ -do { \ - u64 _l = native_read_pmc((counter)); \ - (low) = (u32)_l; \ - (high) = (u32)(_l >> 32); \ -} while (0) +#endif /* !CONFIG_PARAVIRT_XXL */ -#define rdpmcl(counter, val) ((val) = native_read_pmc(counter)) +/* Instruction opcode for 
WRMSRNS supported in binutils >= 2.40 */ +#define ASM_WRMSRNS _ASM_BYTES(0x0f,0x01,0xc6) -#endif /* !CONFIG_PARAVIRT_XXL */ +/* Non-serializing WRMSR, when available. Falls back to a serializing WRMSR. */ +static __always_inline void wrmsrns(u32 msr, u64 val) +{ + /* + * WRMSR is 2 bytes. WRMSRNS is 3 bytes. Pad WRMSR with a redundant + * DS prefix to avoid a trailing NOP. + */ + asm volatile("1: " ALTERNATIVE("ds wrmsr", ASM_WRMSRNS, X86_FEATURE_WRMSRNS) + "2: " _ASM_EXTABLE_TYPE(1b, 2b, EX_TYPE_WRMSR) + : : "c" (msr), "a" ((u32)val), "d" ((u32)(val >> 32))); +} /* - * 64-bit version of wrmsr_safe(): + * Dual u32 version of wrmsrq_safe(): */ -static inline int wrmsrl_safe(u32 msr, u64 val) +static inline int wrmsr_safe(u32 msr, u32 low, u32 high) { - return wrmsr_safe(msr, (u32)val, (u32)(val >> 32)); + return wrmsrq_safe(msr, (u64)high << 32 | low); } -struct msr *msrs_alloc(void); -void msrs_free(struct msr *msrs); +struct msr __percpu *msrs_alloc(void); +void msrs_free(struct msr __percpu *msrs); int msr_set_bit(u32 msr, u8 bit); int msr_clear_bit(u32 msr, u8 bit); #ifdef CONFIG_SMP int rdmsr_on_cpu(unsigned int cpu, u32 msr_no, u32 *l, u32 *h); int wrmsr_on_cpu(unsigned int cpu, u32 msr_no, u32 l, u32 h); -int rdmsrl_on_cpu(unsigned int cpu, u32 msr_no, u64 *q); -int wrmsrl_on_cpu(unsigned int cpu, u32 msr_no, u64 q); -void rdmsr_on_cpus(const struct cpumask *mask, u32 msr_no, struct msr *msrs); -void wrmsr_on_cpus(const struct cpumask *mask, u32 msr_no, struct msr *msrs); +int rdmsrq_on_cpu(unsigned int cpu, u32 msr_no, u64 *q); +int wrmsrq_on_cpu(unsigned int cpu, u32 msr_no, u64 q); +void rdmsr_on_cpus(const struct cpumask *mask, u32 msr_no, struct msr __percpu *msrs); +void wrmsr_on_cpus(const struct cpumask *mask, u32 msr_no, struct msr __percpu *msrs); int rdmsr_safe_on_cpu(unsigned int cpu, u32 msr_no, u32 *l, u32 *h); int wrmsr_safe_on_cpu(unsigned int cpu, u32 msr_no, u32 l, u32 h); -int rdmsrl_safe_on_cpu(unsigned int cpu, u32 msr_no, u64 *q); -int 
wrmsrl_safe_on_cpu(unsigned int cpu, u32 msr_no, u64 q); +int rdmsrq_safe_on_cpu(unsigned int cpu, u32 msr_no, u64 *q); +int wrmsrq_safe_on_cpu(unsigned int cpu, u32 msr_no, u64 q); int rdmsr_safe_regs_on_cpu(unsigned int cpu, u32 regs[8]); int wrmsr_safe_regs_on_cpu(unsigned int cpu, u32 regs[8]); #else /* CONFIG_SMP */ @@ -334,25 +279,25 @@ static inline int wrmsr_on_cpu(unsigned int cpu, u32 msr_no, u32 l, u32 h) wrmsr(msr_no, l, h); return 0; } -static inline int rdmsrl_on_cpu(unsigned int cpu, u32 msr_no, u64 *q) +static inline int rdmsrq_on_cpu(unsigned int cpu, u32 msr_no, u64 *q) { - rdmsrl(msr_no, *q); + rdmsrq(msr_no, *q); return 0; } -static inline int wrmsrl_on_cpu(unsigned int cpu, u32 msr_no, u64 q) +static inline int wrmsrq_on_cpu(unsigned int cpu, u32 msr_no, u64 q) { - wrmsrl(msr_no, q); + wrmsrq(msr_no, q); return 0; } static inline void rdmsr_on_cpus(const struct cpumask *m, u32 msr_no, - struct msr *msrs) + struct msr __percpu *msrs) { - rdmsr_on_cpu(0, msr_no, &(msrs[0].l), &(msrs[0].h)); + rdmsr_on_cpu(0, msr_no, raw_cpu_ptr(&msrs->l), raw_cpu_ptr(&msrs->h)); } static inline void wrmsr_on_cpus(const struct cpumask *m, u32 msr_no, - struct msr *msrs) + struct msr __percpu *msrs) { - wrmsr_on_cpu(0, msr_no, msrs[0].l, msrs[0].h); + wrmsr_on_cpu(0, msr_no, raw_cpu_read(msrs->l), raw_cpu_read(msrs->h)); } static inline int rdmsr_safe_on_cpu(unsigned int cpu, u32 msr_no, u32 *l, u32 *h) @@ -363,13 +308,13 @@ static inline int wrmsr_safe_on_cpu(unsigned int cpu, u32 msr_no, u32 l, u32 h) { return wrmsr_safe(msr_no, l, h); } -static inline int rdmsrl_safe_on_cpu(unsigned int cpu, u32 msr_no, u64 *q) +static inline int rdmsrq_safe_on_cpu(unsigned int cpu, u32 msr_no, u64 *q) { - return rdmsrl_safe(msr_no, q); + return rdmsrq_safe(msr_no, q); } -static inline int wrmsrl_safe_on_cpu(unsigned int cpu, u32 msr_no, u64 q) +static inline int wrmsrq_safe_on_cpu(unsigned int cpu, u32 msr_no, u64 q) { - return wrmsrl_safe(msr_no, q); + return 
wrmsrq_safe(msr_no, q); } static inline int rdmsr_safe_regs_on_cpu(unsigned int cpu, u32 regs[8]) { @@ -380,5 +325,11 @@ static inline int wrmsr_safe_regs_on_cpu(unsigned int cpu, u32 regs[8]) return wrmsr_safe_regs(regs); } #endif /* CONFIG_SMP */ -#endif /* __ASSEMBLY__ */ + +/* Compatibility wrappers: */ +#define rdmsrl(msr, val) rdmsrq(msr, val) +#define wrmsrl(msr, val) wrmsrq(msr, val) +#define rdmsrl_on_cpu(cpu, msr, q) rdmsrq_on_cpu(cpu, msr, q) + +#endif /* __ASSEMBLER__ */ #endif /* _ASM_X86_MSR_H */ diff --git a/arch/x86/include/asm/mtrr.h b/arch/x86/include/asm/mtrr.h index 76d726074c16..c69e269937c5 100644 --- a/arch/x86/include/asm/mtrr.h +++ b/arch/x86/include/asm/mtrr.h @@ -23,15 +23,43 @@ #ifndef _ASM_X86_MTRR_H #define _ASM_X86_MTRR_H +#include <linux/bits.h> #include <uapi/asm/mtrr.h> -void mtrr_bp_init(void); +/* Defines for hardware MTRR registers. */ +#define MTRR_CAP_VCNT GENMASK(7, 0) +#define MTRR_CAP_FIX BIT_MASK(8) +#define MTRR_CAP_WC BIT_MASK(10) + +#define MTRR_DEF_TYPE_TYPE GENMASK(7, 0) +#define MTRR_DEF_TYPE_FE BIT_MASK(10) +#define MTRR_DEF_TYPE_E BIT_MASK(11) + +#define MTRR_DEF_TYPE_ENABLE (MTRR_DEF_TYPE_FE | MTRR_DEF_TYPE_E) +#define MTRR_DEF_TYPE_DISABLE ~(MTRR_DEF_TYPE_TYPE | MTRR_DEF_TYPE_ENABLE) + +#define MTRR_PHYSBASE_TYPE GENMASK(7, 0) +#define MTRR_PHYSBASE_RSVD GENMASK(11, 8) + +#define MTRR_PHYSMASK_RSVD GENMASK(10, 0) +#define MTRR_PHYSMASK_V BIT_MASK(11) + +struct mtrr_state_type { + struct mtrr_var_range var_ranges[MTRR_MAX_VAR_RANGES]; + mtrr_type fixed_ranges[MTRR_NUM_FIXED_RANGES]; + unsigned char enabled; + bool have_fixed; + mtrr_type def_type; +}; /* * The following functions are for use by other drivers that cannot use * arch_phys_wc_add and arch_phys_wc_del. 
*/ # ifdef CONFIG_MTRR +void mtrr_bp_init(void); +void guest_force_mtrr_state(struct mtrr_var_range *var, unsigned int num_var, + mtrr_type def_type); extern u8 mtrr_type_lookup(u64 addr, u64 end, u8 *uniform); extern void mtrr_save_fixed_ranges(void *); extern void mtrr_save_state(void); @@ -41,20 +69,27 @@ extern int mtrr_add_page(unsigned long base, unsigned long size, unsigned int type, bool increment); extern int mtrr_del(int reg, unsigned long base, unsigned long size); extern int mtrr_del_page(int reg, unsigned long base, unsigned long size); -extern void mtrr_centaur_report_mcr(int mcr, u32 lo, u32 hi); -extern void mtrr_ap_init(void); -extern void set_mtrr_aps_delayed_init(void); -extern void mtrr_aps_init(void); -extern void mtrr_bp_restore(void); extern int mtrr_trim_uncached_memory(unsigned long end_pfn); extern int amd_special_default_mtrr(void); +void mtrr_disable(void); +void mtrr_enable(void); +void mtrr_generic_set_state(void); # else +static inline void guest_force_mtrr_state(struct mtrr_var_range *var, + unsigned int num_var, + mtrr_type def_type) +{ +} + static inline u8 mtrr_type_lookup(u64 addr, u64 end, u8 *uniform) { /* - * Return no-MTRRs: + * Return the default MTRR type, without any known other types in + * that range. 
*/ - return MTRR_TYPE_INVALID; + *uniform = 1; + + return MTRR_TYPE_UNCACHABLE; } #define mtrr_save_fixed_ranges(arg) do {} while (0) #define mtrr_save_state() do {} while (0) @@ -80,13 +115,10 @@ static inline int mtrr_trim_uncached_memory(unsigned long end_pfn) { return 0; } -static inline void mtrr_centaur_report_mcr(int mcr, u32 lo, u32 hi) -{ -} -#define mtrr_ap_init() do {} while (0) -#define set_mtrr_aps_delayed_init() do {} while (0) -#define mtrr_aps_init() do {} while (0) -#define mtrr_bp_restore() do {} while (0) +#define mtrr_bp_init() do {} while (0) +#define mtrr_disable() do {} while (0) +#define mtrr_enable() do {} while (0) +#define mtrr_generic_set_state() do {} while (0) # endif #ifdef CONFIG_COMPAT @@ -121,7 +153,8 @@ struct mtrr_gentry32 { #endif /* CONFIG_COMPAT */ /* Bit fields for enabled in struct mtrr_state_type */ -#define MTRR_STATE_MTRR_FIXED_ENABLED 0x01 -#define MTRR_STATE_MTRR_ENABLED 0x02 +#define MTRR_STATE_SHIFT 10 +#define MTRR_STATE_MTRR_FIXED_ENABLED (MTRR_DEF_TYPE_FE >> MTRR_STATE_SHIFT) +#define MTRR_STATE_MTRR_ENABLED (MTRR_DEF_TYPE_E >> MTRR_STATE_SHIFT) #endif /* _ASM_X86_MTRR_H */ diff --git a/arch/x86/include/asm/mwait.h b/arch/x86/include/asm/mwait.h index 3a8fdf881313..dd2b129b0418 100644 --- a/arch/x86/include/asm/mwait.h +++ b/arch/x86/include/asm/mwait.h @@ -15,7 +15,6 @@ #define MWAIT_HINT2SUBSTATE(hint) ((hint) & MWAIT_CSTATE_MASK) #define MWAIT_C1_SUBSTATE_MASK 0xf0 -#define CPUID_MWAIT_LEAF 5 #define CPUID5_ECX_EXTENSIONS_SUPPORTED 0x1 #define CPUID5_ECX_INTERRUPT_BREAK 0x2 @@ -26,29 +25,31 @@ #define TPAUSE_C01_STATE 1 #define TPAUSE_C02_STATE 0 -static inline void __monitor(const void *eax, unsigned long ecx, - unsigned long edx) +static __always_inline void __monitor(const void *eax, u32 ecx, u32 edx) { - /* "monitor %eax, %ecx, %edx;" */ - asm volatile(".byte 0x0f, 0x01, 0xc8;" - :: "a" (eax), "c" (ecx), "d"(edx)); + /* + * Use the instruction mnemonic with implicit operands, as the LLVM + * assembler fails 
to assemble the mnemonic with explicit operands: + */ + asm volatile("monitor" :: "a" (eax), "c" (ecx), "d" (edx)); } -static inline void __monitorx(const void *eax, unsigned long ecx, - unsigned long edx) +static __always_inline void __monitorx(const void *eax, u32 ecx, u32 edx) { - /* "monitorx %eax, %ecx, %edx;" */ - asm volatile(".byte 0x0f, 0x01, 0xfa;" + /* "monitorx %eax, %ecx, %edx" */ + asm volatile(".byte 0x0f, 0x01, 0xfa" :: "a" (eax), "c" (ecx), "d"(edx)); } -static inline void __mwait(unsigned long eax, unsigned long ecx) +static __always_inline void __mwait(u32 eax, u32 ecx) { mds_idle_clear_cpu_buffers(); - /* "mwait %eax, %ecx;" */ - asm volatile(".byte 0x0f, 0x01, 0xc9;" - :: "a" (eax), "c" (ecx)); + /* + * Use the instruction mnemonic with implicit operands, as the LLVM + * assembler fails to assemble the mnemonic with explicit operands: + */ + asm volatile("mwait" :: "a" (eax), "c" (ecx)); } /* @@ -77,22 +78,29 @@ static inline void __mwait(unsigned long eax, unsigned long ecx) * EAX (logical) address to monitor * ECX #GP if not zero */ -static inline void __mwaitx(unsigned long eax, unsigned long ebx, - unsigned long ecx) +static __always_inline void __mwaitx(u32 eax, u32 ebx, u32 ecx) { /* No MDS buffer clear as this is AMD/HYGON only */ - /* "mwaitx %eax, %ebx, %ecx;" */ - asm volatile(".byte 0x0f, 0x01, 0xfb;" + /* "mwaitx %eax, %ebx, %ecx" */ + asm volatile(".byte 0x0f, 0x01, 0xfb" :: "a" (eax), "b" (ebx), "c" (ecx)); } -static inline void __sti_mwait(unsigned long eax, unsigned long ecx) +/* + * Re-enable interrupts right upon calling mwait in such a way that + * no interrupt can fire _before_ the execution of mwait, ie: no + * instruction must be placed between "sti" and "mwait". + * + * This is necessary because if an interrupt queues a timer before + * executing mwait, it would otherwise go unnoticed and the next tick + * would not be reprogrammed accordingly before mwait ever wakes up. 
+ */ +static __always_inline void __sti_mwait(u32 eax, u32 ecx) { mds_idle_clear_cpu_buffers(); - /* "mwait %eax, %ecx;" */ - asm volatile("sti; .byte 0x0f, 0x01, 0xc9;" - :: "a" (eax), "c" (ecx)); + + asm volatile("sti; mwait" :: "a" (eax), "c" (ecx)); } /* @@ -105,18 +113,22 @@ static inline void __sti_mwait(unsigned long eax, unsigned long ecx) * New with Core Duo processors, MWAIT can take some hints based on CPU * capability. */ -static inline void mwait_idle_with_hints(unsigned long eax, unsigned long ecx) +static __always_inline void mwait_idle_with_hints(u32 eax, u32 ecx) { if (static_cpu_has_bug(X86_BUG_MONITOR) || !current_set_polling_and_test()) { - if (static_cpu_has_bug(X86_BUG_CLFLUSH_MONITOR)) { - mb(); - clflush((void *)&current_thread_info()->flags); - mb(); + const void *addr = &current_thread_info()->flags; + + alternative_input("", "clflush (%[addr])", X86_BUG_CLFLUSH_MONITOR, [addr] "a" (addr)); + __monitor(addr, 0, 0); + + if (!need_resched()) { + if (ecx & 1) { + __mwait(eax, ecx); + } else { + __sti_mwait(eax, ecx); + raw_local_irq_disable(); + } } - - __monitor((void *)&current_thread_info()->flags, 0, 0); - if (!need_resched()) - __mwait(eax, ecx); } current_clr_polling(); } @@ -129,16 +141,9 @@ static inline void mwait_idle_with_hints(unsigned long eax, unsigned long ecx) */ static inline void __tpause(u32 ecx, u32 edx, u32 eax) { - /* "tpause %ecx, %edx, %eax;" */ - #ifdef CONFIG_AS_TPAUSE - asm volatile("tpause %%ecx\n" - : - : "c"(ecx), "d"(edx), "a"(eax)); - #else - asm volatile(".byte 0x66, 0x0f, 0xae, 0xf1\t\n" - : - : "c"(ecx), "d"(edx), "a"(eax)); - #endif + /* "tpause %ecx" */ + asm volatile(".byte 0x66, 0x0f, 0xae, 0xf1" + :: "c" (ecx), "d" (edx), "a" (eax)); } #endif /* _ASM_X86_MWAIT_H */ diff --git a/arch/x86/include/asm/nmi.h b/arch/x86/include/asm/nmi.h index 5c5f1e56c404..79d88d12c8fb 100644 --- a/arch/x86/include/asm/nmi.h +++ b/arch/x86/include/asm/nmi.h @@ -14,15 +14,26 @@ extern void release_perfctr_nmi(unsigned int); extern int 
reserve_evntsel_nmi(unsigned int); extern void release_evntsel_nmi(unsigned int); -struct ctl_table; -extern int proc_nmi_enabled(struct ctl_table *, int , - void __user *, size_t *, loff_t *); -extern int unknown_nmi_panic; - #endif /* CONFIG_X86_LOCAL_APIC */ +extern int unknown_nmi_panic; +extern int panic_on_unrecovered_nmi; +extern int panic_on_io_nmi; + +/* NMI handler flags */ #define NMI_FLAG_FIRST 1 +/** + * enum - NMI types. + * @NMI_LOCAL: Local NMI, CPU-specific NMI generated by the Local APIC. + * @NMI_UNKNOWN: Unknown NMI, the source of the NMI may not be identified. + * @NMI_SERR: System Error NMI, typically triggered by PCI errors. + * @NMI_IO_CHECK: I/O Check NMI, related to I/O errors. + * @NMI_MAX: Maximum value for NMI types. + * + * NMI types are used to categorize NMIs and to dispatch them to the + * appropriate handler. + */ enum { NMI_LOCAL=0, NMI_UNKNOWN, @@ -31,6 +42,7 @@ enum { NMI_MAX }; +/* NMI handler return values */ #define NMI_DONE 0 #define NMI_HANDLED 1 @@ -44,6 +56,25 @@ struct nmiaction { const char *name; }; +/** + * register_nmi_handler - Register a handler for a specific NMI type + * @t: NMI type (e.g. NMI_LOCAL) + * @fn: The NMI handler + * @fg: Flags associated with the NMI handler + * @n: Name of the NMI handler + * @init: Optional __init* attributes for struct nmiaction + * + * Adds the provided handler to the list of handlers for the specified + * NMI type. Handlers flagged with NMI_FLAG_FIRST would be executed first. + * + * Sometimes the source of an NMI can't be reliably determined which + * results in an NMI being tagged as "unknown". Register an additional + * handler using the NMI type - NMI_UNKNOWN to handle such cases. The + * caller would get one last chance to assume responsibility for the + * NMI. + * + * Return: 0 on success, or an error code on failure. + */ #define register_nmi_handler(t, fn, fg, n, init...) 
\ ({ \ static struct nmiaction init fn##_na = { \ @@ -57,7 +88,18 @@ struct nmiaction { int __register_nmi_handler(unsigned int, struct nmiaction *); -void unregister_nmi_handler(unsigned int, const char *); +/** + * unregister_nmi_handler - Unregister a handler for a specific NMI type + * @type: NMI type (e.g. NMI_LOCAL) + * @name: Name of the NMI handler used during registration + * + * Removes the handler associated with the specified NMI type from the + * NMI handler list. The "name" is used as a lookup key to identify the + * handler. + */ +void unregister_nmi_handler(unsigned int type, const char *name); + +void set_emergency_nmi_handler(unsigned int type, nmi_handler_t handler); void stop_nmi(void); void restart_nmi(void); diff --git a/arch/x86/include/asm/nops.h b/arch/x86/include/asm/nops.h index c5573eaa5bb9..cd94221d8335 100644 --- a/arch/x86/include/asm/nops.h +++ b/arch/x86/include/asm/nops.h @@ -34,6 +34,8 @@ #define BYTES_NOP7 0x8d,0xb4,0x26,0x00,0x00,0x00,0x00 #define BYTES_NOP8 0x3e,BYTES_NOP7 +#define ASM_NOP_MAX 8 + #else /* @@ -47,6 +49,9 @@ * 6: osp nopl 0x00(%eax,%eax,1) * 7: nopl 0x00000000(%eax) * 8: nopl 0x00000000(%eax,%eax,1) + * 9: cs nopl 0x00000000(%eax,%eax,1) + * 10: osp cs nopl 0x00000000(%eax,%eax,1) + * 11: osp osp cs nopl 0x00000000(%eax,%eax,1) */ #define BYTES_NOP1 0x90 #define BYTES_NOP2 0x66,BYTES_NOP1 @@ -56,6 +61,15 @@ #define BYTES_NOP6 0x66,BYTES_NOP5 #define BYTES_NOP7 0x0f,0x1f,0x80,0x00,0x00,0x00,0x00 #define BYTES_NOP8 0x0f,0x1f,0x84,0x00,0x00,0x00,0x00,0x00 +#define BYTES_NOP9 0x2e,BYTES_NOP8 +#define BYTES_NOP10 0x66,BYTES_NOP9 +#define BYTES_NOP11 0x66,BYTES_NOP10 + +#define ASM_NOP9 _ASM_BYTES(BYTES_NOP9) +#define ASM_NOP10 _ASM_BYTES(BYTES_NOP10) +#define ASM_NOP11 _ASM_BYTES(BYTES_NOP11) + +#define ASM_NOP_MAX 11 #endif /* CONFIG_64BIT */ @@ -68,9 +82,7 @@ #define ASM_NOP7 _ASM_BYTES(BYTES_NOP7) #define ASM_NOP8 _ASM_BYTES(BYTES_NOP8) -#define ASM_NOP_MAX 8 - -#ifndef __ASSEMBLY__ +#ifndef __ASSEMBLER__ extern 
const unsigned char * const x86_nops[]; #endif diff --git a/arch/x86/include/asm/nospec-branch.h b/arch/x86/include/asm/nospec-branch.h index cba942006ffe..20d754b98f3f 100644 --- a/arch/x86/include/asm/nospec-branch.h +++ b/arch/x86/include/asm/nospec-branch.h @@ -13,7 +13,93 @@ #include <asm/unwind_hints.h> #include <asm/percpu.h> -#define RETPOLINE_THUNK_SIZE 32 +/* + * Call depth tracking for Intel SKL CPUs to address the RSB underflow + * issue in software. + * + * The tracking does not use a counter. It uses arithmetic shift + * right on call entry and logical shift left on return. + * + * The depth tracking variable is initialized to 0x8000.... when the call + * depth is zero. The arithmetic shift right sign extends the MSB and + * saturates after the 12th call. The shift count is 5 for both directions + * so the tracking covers 12 nested calls. + * + * Call + * 0: 0x8000000000000000 0x0000000000000000 + * 1: 0xfc00000000000000 0xf000000000000000 + * ... + * 11: 0xfffffffffffffff8 0xfffffffffffffc00 + * 12: 0xffffffffffffffff 0xffffffffffffffe0 + * + * After a return buffer fill the depth is credited 12 calls before the + * next stuffing has to take place. + * + * There is an inaccuracy for situations like this: + * + * 10 calls + * 5 returns + * 3 calls + * 4 returns + * 3 calls + * .... + * + * The shift count might cause this to be off by one in either direction, + * but there is still a cushion vs. the RSB depth. The algorithm does not + * claim to be perfect and it can be speculated around by the CPU, but it + * is considered that it obfuscates the problem enough to make exploitation + * extremely difficult. 
+ */ +#define RET_DEPTH_SHIFT 5 +#define RSB_RET_STUFF_LOOPS 16 +#define RET_DEPTH_INIT 0x8000000000000000ULL +#define RET_DEPTH_INIT_FROM_CALL 0xfc00000000000000ULL +#define RET_DEPTH_CREDIT 0xffffffffffffffffULL + +#ifdef CONFIG_CALL_THUNKS_DEBUG +# define CALL_THUNKS_DEBUG_INC_CALLS \ + incq PER_CPU_VAR(__x86_call_count); +# define CALL_THUNKS_DEBUG_INC_RETS \ + incq PER_CPU_VAR(__x86_ret_count); +# define CALL_THUNKS_DEBUG_INC_STUFFS \ + incq PER_CPU_VAR(__x86_stuffs_count); +# define CALL_THUNKS_DEBUG_INC_CTXSW \ + incq PER_CPU_VAR(__x86_ctxsw_count); +#else +# define CALL_THUNKS_DEBUG_INC_CALLS +# define CALL_THUNKS_DEBUG_INC_RETS +# define CALL_THUNKS_DEBUG_INC_STUFFS +# define CALL_THUNKS_DEBUG_INC_CTXSW +#endif + +#if defined(CONFIG_MITIGATION_CALL_DEPTH_TRACKING) && !defined(COMPILE_OFFSETS) + +#include <asm/asm-offsets.h> + +#define CREDIT_CALL_DEPTH \ + movq $-1, PER_CPU_VAR(__x86_call_depth); + +#define RESET_CALL_DEPTH \ + xor %eax, %eax; \ + bts $63, %rax; \ + movq %rax, PER_CPU_VAR(__x86_call_depth); + +#define RESET_CALL_DEPTH_FROM_CALL \ + movb $0xfc, %al; \ + shl $56, %rax; \ + movq %rax, PER_CPU_VAR(__x86_call_depth); \ + CALL_THUNKS_DEBUG_INC_CALLS + +#define INCREMENT_CALL_DEPTH \ + sarq $5, PER_CPU_VAR(__x86_call_depth); \ + CALL_THUNKS_DEBUG_INC_CALLS + +#else +#define CREDIT_CALL_DEPTH +#define RESET_CALL_DEPTH +#define RESET_CALL_DEPTH_FROM_CALL +#define INCREMENT_CALL_DEPTH +#endif /* * Fill the CPU return stack buffer. @@ -32,49 +118,65 @@ * from C via asm(".include <asm/nospec-branch.h>") but let's not go there. */ +#define RETPOLINE_THUNK_SIZE 32 #define RSB_CLEAR_LOOPS 32 /* To forcibly overwrite all entries */ /* + * Common helper for __FILL_RETURN_BUFFER and __FILL_ONE_RETURN. + */ +#define __FILL_RETURN_SLOT \ + ANNOTATE_INTRA_FUNCTION_CALL; \ + call 772f; \ + int3; \ +772: + +/* + * Stuff the entire RSB. 
+ * * Google experimented with loop-unrolling and this turned out to be * the optimal version - two calls, each with their own speculation * trap should their return address end up getting used, in a loop. */ -#define __FILL_RETURN_BUFFER(reg, nr, sp) \ - mov $(nr/2), reg; \ -771: \ - ANNOTATE_INTRA_FUNCTION_CALL; \ - call 772f; \ -773: /* speculation trap */ \ - UNWIND_HINT_EMPTY; \ - pause; \ - lfence; \ - jmp 773b; \ -772: \ - ANNOTATE_INTRA_FUNCTION_CALL; \ - call 774f; \ -775: /* speculation trap */ \ - UNWIND_HINT_EMPTY; \ - pause; \ - lfence; \ - jmp 775b; \ -774: \ - add $(BITS_PER_LONG/8) * 2, sp; \ - dec reg; \ - jnz 771b; - -#ifdef __ASSEMBLY__ +#ifdef CONFIG_X86_64 +#define __FILL_RETURN_BUFFER(reg, nr) \ + mov $(nr/2), reg; \ +771: \ + __FILL_RETURN_SLOT \ + __FILL_RETURN_SLOT \ + add $(BITS_PER_LONG/8) * 2, %_ASM_SP; \ + dec reg; \ + jnz 771b; \ + /* barrier for jnz misprediction */ \ + lfence; \ + CREDIT_CALL_DEPTH \ + CALL_THUNKS_DEBUG_INC_CTXSW +#else +/* + * i386 doesn't unconditionally have LFENCE, as such it can't + * do a loop. + */ +#define __FILL_RETURN_BUFFER(reg, nr) \ + .rept nr; \ + __FILL_RETURN_SLOT; \ + .endr; \ + add $(BITS_PER_LONG/8) * nr, %_ASM_SP; +#endif /* - * This should be used immediately before an indirect jump/call. It tells - * objtool the subsequent indirect jump/call is vouched safe for retpoline - * builds. + * Stuff a single RSB slot. + * + * To mitigate Post-Barrier RSB speculation, one CALL instruction must be + * forced to retire before letting a RET instruction execute. + * + * On PBRSB-vulnerable CPUs, it is not safe for a RET to be executed + * before this point. 
*/ -.macro ANNOTATE_RETPOLINE_SAFE - .Lannotate_\@: - .pushsection .discard.retpoline_safe - _ASM_PTR .Lannotate_\@ - .popsection -.endm +#define __FILL_ONE_RETURN \ + __FILL_RETURN_SLOT \ + add $(BITS_PER_LONG/8), %_ASM_SP; \ + lfence; + +#ifdef __ASSEMBLER__ /* * (ab)use RETPOLINE_SAFE on RET to annotate away 'bare' RET instructions @@ -84,19 +186,19 @@ /* * Abuse ANNOTATE_RETPOLINE_SAFE on a NOP to indicate UNRET_END, should - * eventually turn into it's own annotation. + * eventually turn into its own annotation. */ -.macro ANNOTATE_UNRET_END -#ifdef CONFIG_DEBUG_ENTRY +.macro VALIDATE_UNRET_END +#if defined(CONFIG_NOINSTR_VALIDATION) && \ + (defined(CONFIG_MITIGATION_UNRET_ENTRY) || defined(CONFIG_MITIGATION_SRSO)) ANNOTATE_RETPOLINE_SAFE nop #endif .endm /* - * Equivalent to -mindirect-branch-cs-prefix; emit the 5 byte jmp/call - * to the retpoline thunk with a CS prefix when the register requires - * a RAX prefix byte to encode. Also see apply_retpolines(). + * Emits a conditional CS prefix that is compatible with + * -mindirect-branch-cs-prefix. */ .macro __CS_PREFIX reg:req .irp rs,r8,r9,r10,r11,r12,r13,r14,r15 @@ -110,9 +212,13 @@ * JMP_NOSPEC and CALL_NOSPEC macros can be used instead of a simple * indirect jmp/call which may be susceptible to the Spectre variant 2 * attack. + * + * NOTE: these do not take kCFI into account and are thus not comparable to C + * indirect calls, take care when using. The target of these should be an ENDBR + * instruction irrespective of kCFI. */ .macro JMP_NOSPEC reg:req -#ifdef CONFIG_RETPOLINE +#ifdef CONFIG_MITIGATION_RETPOLINE __CS_PREFIX \reg jmp __x86_indirect_thunk_\reg #else @@ -122,7 +228,7 @@ .endm .macro CALL_NOSPEC reg:req -#ifdef CONFIG_RETPOLINE +#ifdef CONFIG_MITIGATION_RETPOLINE __CS_PREFIX \reg call __x86_indirect_thunk_\reg #else @@ -134,76 +240,213 @@ * A simpler FILL_RETURN_BUFFER macro. Don't make people use the CPP * monstrosity above, manually. 
*/ -.macro FILL_RETURN_BUFFER reg:req nr:req ftr:req - ALTERNATIVE "jmp .Lskip_rsb_\@", "", \ftr - __FILL_RETURN_BUFFER(\reg,\nr,%_ASM_SP) +.macro FILL_RETURN_BUFFER reg:req nr:req ftr:req ftr2=ALT_NOT(X86_FEATURE_ALWAYS) + ALTERNATIVE_2 "jmp .Lskip_rsb_\@", \ + __stringify(__FILL_RETURN_BUFFER(\reg,\nr)), \ftr, \ + __stringify(nop;nop;__FILL_ONE_RETURN), \ftr2 + .Lskip_rsb_\@: .endm -#ifdef CONFIG_CPU_UNRET_ENTRY -#define CALL_ZEN_UNTRAIN_RET "call zen_untrain_ret" -#else -#define CALL_ZEN_UNTRAIN_RET "" +/* + * The CALL to srso_alias_untrain_ret() must be patched in directly at + * the spot where untraining must be done, ie., srso_alias_untrain_ret() + * must be the target of a CALL instruction instead of indirectly + * jumping to a wrapper which then calls it. Therefore, this macro is + * called outside of __UNTRAIN_RET below, for the time being, before the + * kernel can support nested alternatives with arbitrary nesting. + */ +.macro CALL_UNTRAIN_RET +#if defined(CONFIG_MITIGATION_UNRET_ENTRY) || defined(CONFIG_MITIGATION_SRSO) + ALTERNATIVE_2 "", "call entry_untrain_ret", X86_FEATURE_UNRET, \ + "call srso_alias_untrain_ret", X86_FEATURE_SRSO_ALIAS #endif +.endm /* * Mitigate RETBleed for AMD/Hygon Zen uarch. Requires KERNEL CR3 because the * return thunk isn't mapped into the userspace tables (then again, AMD * typically has NO_MELTDOWN). * - * While zen_untrain_ret() doesn't clobber anything but requires stack, - * entry_ibpb() will clobber AX, CX, DX. + * While retbleed_untrain_ret() doesn't clobber anything but requires stack, + * write_ibpb() will clobber AX, CX, DX. * * As such, this must be placed after every *SWITCH_TO_KERNEL_CR3 at a point * where we have a stack but before any RET instruction. 
*/ -.macro UNTRAIN_RET -#if defined(CONFIG_CPU_UNRET_ENTRY) || defined(CONFIG_CPU_IBPB_ENTRY) - ANNOTATE_UNRET_END +.macro __UNTRAIN_RET ibpb_feature, call_depth_insns +#if defined(CONFIG_MITIGATION_RETHUNK) || defined(CONFIG_MITIGATION_IBPB_ENTRY) + VALIDATE_UNRET_END + CALL_UNTRAIN_RET ALTERNATIVE_2 "", \ - CALL_ZEN_UNTRAIN_RET, X86_FEATURE_UNRET, \ - "call entry_ibpb", X86_FEATURE_ENTRY_IBPB + "call write_ibpb", \ibpb_feature, \ + __stringify(\call_depth_insns), X86_FEATURE_CALL_DEPTH +#endif +.endm + +#define UNTRAIN_RET \ + __UNTRAIN_RET X86_FEATURE_ENTRY_IBPB, __stringify(RESET_CALL_DEPTH) + +#define UNTRAIN_RET_VM \ + __UNTRAIN_RET X86_FEATURE_IBPB_ON_VMEXIT, __stringify(RESET_CALL_DEPTH) + +#define UNTRAIN_RET_FROM_CALL \ + __UNTRAIN_RET X86_FEATURE_ENTRY_IBPB, __stringify(RESET_CALL_DEPTH_FROM_CALL) + + +.macro CALL_DEPTH_ACCOUNT +#ifdef CONFIG_MITIGATION_CALL_DEPTH_TRACKING + ALTERNATIVE "", \ + __stringify(INCREMENT_CALL_DEPTH), X86_FEATURE_CALL_DEPTH +#endif +.endm + +/* + * Macro to execute VERW instruction that mitigate transient data sampling + * attacks such as MDS. On affected systems a microcode update overloaded VERW + * instruction to also clear the CPU buffers. VERW clobbers CFLAGS.ZF. + * + * Note: Only the memory operand variant of VERW clears the CPU buffers. + */ +.macro CLEAR_CPU_BUFFERS +#ifdef CONFIG_X86_64 + ALTERNATIVE "", "verw mds_verw_sel(%rip)", X86_FEATURE_CLEAR_CPU_BUF +#else + /* + * In 32bit mode, the memory operand must be a %cs reference. The data + * segments may not be usable (vm86 mode), and the stack segment may not + * be flat (ESPFIX32). 
+ */ + ALTERNATIVE "", "verw %cs:mds_verw_sel", X86_FEATURE_CLEAR_CPU_BUF #endif .endm -#else /* __ASSEMBLY__ */ +#ifdef CONFIG_X86_64 +.macro CLEAR_BRANCH_HISTORY + ALTERNATIVE "", "call clear_bhb_loop", X86_FEATURE_CLEAR_BHB_LOOP +.endm + +.macro CLEAR_BRANCH_HISTORY_VMEXIT + ALTERNATIVE "", "call clear_bhb_loop", X86_FEATURE_CLEAR_BHB_VMEXIT +.endm +#else +#define CLEAR_BRANCH_HISTORY +#define CLEAR_BRANCH_HISTORY_VMEXIT +#endif -#define ANNOTATE_RETPOLINE_SAFE \ - "999:\n\t" \ - ".pushsection .discard.retpoline_safe\n\t" \ - _ASM_PTR " 999b\n\t" \ - ".popsection\n\t" +#else /* __ASSEMBLER__ */ + +#define ITS_THUNK_SIZE 64 typedef u8 retpoline_thunk_t[RETPOLINE_THUNK_SIZE]; +typedef u8 its_thunk_t[ITS_THUNK_SIZE]; extern retpoline_thunk_t __x86_indirect_thunk_array[]; +extern retpoline_thunk_t __x86_indirect_call_thunk_array[]; +extern retpoline_thunk_t __x86_indirect_jump_thunk_array[]; +extern its_thunk_t __x86_indirect_its_thunk_array[]; +#ifdef CONFIG_MITIGATION_RETHUNK extern void __x86_return_thunk(void); -extern void zen_untrain_ret(void); -extern void entry_ibpb(void); +#else +static inline void __x86_return_thunk(void) {} +#endif + +#ifdef CONFIG_MITIGATION_UNRET_ENTRY +extern void retbleed_return_thunk(void); +#else +static inline void retbleed_return_thunk(void) {} +#endif + +extern void srso_alias_untrain_ret(void); + +#ifdef CONFIG_MITIGATION_SRSO +extern void srso_return_thunk(void); +extern void srso_alias_return_thunk(void); +#else +static inline void srso_return_thunk(void) {} +static inline void srso_alias_return_thunk(void) {} +#endif + +#ifdef CONFIG_MITIGATION_ITS +extern void its_return_thunk(void); +#else +static inline void its_return_thunk(void) {} +#endif + +extern void retbleed_return_thunk(void); +extern void srso_return_thunk(void); +extern void srso_alias_return_thunk(void); + +extern void entry_untrain_ret(void); +extern void write_ibpb(void); + +#ifdef CONFIG_X86_64 +extern void clear_bhb_loop(void); +#endif + +extern void 
(*x86_return_thunk)(void); + +extern void __warn_thunk(void); + +#ifdef CONFIG_MITIGATION_CALL_DEPTH_TRACKING +extern void call_depth_return_thunk(void); + +#define CALL_DEPTH_ACCOUNT \ + ALTERNATIVE("", \ + __stringify(INCREMENT_CALL_DEPTH), \ + X86_FEATURE_CALL_DEPTH) + +DECLARE_PER_CPU_CACHE_HOT(u64, __x86_call_depth); + +#ifdef CONFIG_CALL_THUNKS_DEBUG +DECLARE_PER_CPU(u64, __x86_call_count); +DECLARE_PER_CPU(u64, __x86_ret_count); +DECLARE_PER_CPU(u64, __x86_stuffs_count); +DECLARE_PER_CPU(u64, __x86_ctxsw_count); +#endif +#else /* !CONFIG_MITIGATION_CALL_DEPTH_TRACKING */ + +static inline void call_depth_return_thunk(void) {} +#define CALL_DEPTH_ACCOUNT "" -#ifdef CONFIG_RETPOLINE +#endif /* CONFIG_MITIGATION_CALL_DEPTH_TRACKING */ + +#ifdef CONFIG_MITIGATION_RETPOLINE #define GEN(reg) \ extern retpoline_thunk_t __x86_indirect_thunk_ ## reg; #include <asm/GEN-for-each-reg.h> #undef GEN +#define GEN(reg) \ + extern retpoline_thunk_t __x86_indirect_call_thunk_ ## reg; +#include <asm/GEN-for-each-reg.h> +#undef GEN + +#define GEN(reg) \ + extern retpoline_thunk_t __x86_indirect_jump_thunk_ ## reg; +#include <asm/GEN-for-each-reg.h> +#undef GEN + #ifdef CONFIG_X86_64 /* + * Emits a conditional CS prefix that is compatible with + * -mindirect-branch-cs-prefix. + */ +#define __CS_PREFIX(reg) \ + ".irp rs,r8,r9,r10,r11,r12,r13,r14,r15\n" \ + ".ifc \\rs," reg "\n" \ + ".byte 0x2e\n" \ + ".endif\n" \ + ".endr\n" + +/* * Inline asm uses the %V modifier which is only in newer GCC - * which is ensured when CONFIG_RETPOLINE is defined. + * which is ensured when CONFIG_MITIGATION_RETPOLINE is defined. 
*/ -# define CALL_NOSPEC \ - ALTERNATIVE_2( \ - ANNOTATE_RETPOLINE_SAFE \ - "call *%[thunk_target]\n", \ - "call __x86_indirect_thunk_%V[thunk_target]\n", \ - X86_FEATURE_RETPOLINE, \ - "lfence;\n" \ - ANNOTATE_RETPOLINE_SAFE \ - "call *%[thunk_target]\n", \ - X86_FEATURE_RETPOLINE_LFENCE) +#define CALL_NOSPEC __CS_PREFIX("%V[thunk_target]") \ + "call __x86_indirect_thunk_%V[thunk_target]\n" # define THUNK_TARGET(addr) [thunk_target] "r" (addr) @@ -270,9 +513,6 @@ enum ssb_mitigation { SPEC_STORE_BYPASS_SECCOMP, }; -extern char __indirect_thunk_start[]; -extern char __indirect_thunk_end[]; - static __always_inline void alternative_msr_write(unsigned int msr, u64 val, unsigned int feature) { @@ -286,15 +526,15 @@ void alternative_msr_write(unsigned int msr, u64 val, unsigned int feature) static inline void indirect_branch_prediction_barrier(void) { - u64 val = PRED_CMD_IBPB; - - alternative_msr_write(MSR_IA32_PRED_CMD, val, X86_FEATURE_USE_IBPB); + asm_inline volatile(ALTERNATIVE("", "call write_ibpb", X86_FEATURE_IBPB) + : ASM_CALL_CONSTRAINT + :: "rax", "rcx", "rdx", "memory"); } /* The Intel SPEC CTRL MSR base value cache */ extern u64 x86_spec_ctrl_base; DECLARE_PER_CPU(u64, x86_spec_ctrl_current); -extern void write_spec_ctrl_current(u64 val, bool force); +extern void update_spec_ctrl_cond(u64 val); extern u64 spec_ctrl_current(void); /* @@ -325,12 +565,15 @@ DECLARE_STATIC_KEY_FALSE(switch_to_cond_stibp); DECLARE_STATIC_KEY_FALSE(switch_mm_cond_ibpb); DECLARE_STATIC_KEY_FALSE(switch_mm_always_ibpb); -DECLARE_STATIC_KEY_FALSE(mds_user_clear); +DECLARE_STATIC_KEY_FALSE(switch_vcpu_ibpb); + DECLARE_STATIC_KEY_FALSE(mds_idle_clear); DECLARE_STATIC_KEY_FALSE(switch_mm_cond_l1d_flush); -DECLARE_STATIC_KEY_FALSE(mmio_stale_data_clear); +DECLARE_STATIC_KEY_FALSE(cpu_buf_vm_clear); + +extern u16 mds_verw_sel; #include <asm/segment.h> @@ -358,27 +601,16 @@ static __always_inline void mds_clear_cpu_buffers(void) } /** - * mds_user_clear_cpu_buffers - Mitigation for MDS 
and TAA vulnerability - * - * Clear CPU buffers if the corresponding static key is enabled - */ -static __always_inline void mds_user_clear_cpu_buffers(void) -{ - if (static_branch_likely(&mds_user_clear)) - mds_clear_cpu_buffers(); -} - -/** * mds_idle_clear_cpu_buffers - Mitigation for MDS vulnerability * * Clear CPU buffers if the corresponding static key is enabled */ -static inline void mds_idle_clear_cpu_buffers(void) +static __always_inline void mds_idle_clear_cpu_buffers(void) { if (static_branch_likely(&mds_idle_clear)) mds_clear_cpu_buffers(); } -#endif /* __ASSEMBLY__ */ +#endif /* __ASSEMBLER__ */ #endif /* _ASM_X86_NOSPEC_BRANCH_H_ */ diff --git a/arch/x86/include/asm/numa.h b/arch/x86/include/asm/numa.h index e3bae2b60a0d..53ba39ce010c 100644 --- a/arch/x86/include/asm/numa.h +++ b/arch/x86/include/asm/numa.h @@ -10,15 +10,6 @@ #ifdef CONFIG_NUMA -#define NR_NODE_MEMBLKS (MAX_NUMNODES*2) - -/* - * Too small node sizes may confuse the VM badly. Usually they - * result from BIOS bugs. 
So dont recognize nodes as standalone - * NUMA entities that have less than this amount of RAM listed: - */ -#define NODE_MIN_SIZE (4*1024*1024) - extern int numa_off; /* @@ -32,9 +23,6 @@ extern int numa_off; extern s16 __apicid_to_node[MAX_LOCAL_APIC]; extern nodemask_t numa_nodes_parsed __initdata; -extern int __init numa_add_memblk(int nodeid, u64 start, u64 end); -extern void __init numa_set_distance(int from, int to, int distance); - static inline void set_apicid_to_node(int apicid, s16 node) { __apicid_to_node[apicid] = node; @@ -53,39 +41,24 @@ static inline int numa_cpu_node(int cpu) } #endif /* CONFIG_NUMA */ -#ifdef CONFIG_X86_32 -# include <asm/numa_32.h> -#endif - #ifdef CONFIG_NUMA extern void numa_set_node(int cpu, int node); extern void numa_clear_node(int cpu); extern void __init init_cpu_to_node(void); -extern void numa_add_cpu(int cpu); -extern void numa_remove_cpu(int cpu); +extern void numa_add_cpu(unsigned int cpu); +extern void numa_remove_cpu(unsigned int cpu); extern void init_gi_nodes(void); #else /* CONFIG_NUMA */ static inline void numa_set_node(int cpu, int node) { } static inline void numa_clear_node(int cpu) { } static inline void init_cpu_to_node(void) { } -static inline void numa_add_cpu(int cpu) { } -static inline void numa_remove_cpu(int cpu) { } +static inline void numa_add_cpu(unsigned int cpu) { } +static inline void numa_remove_cpu(unsigned int cpu) { } static inline void init_gi_nodes(void) { } #endif /* CONFIG_NUMA */ #ifdef CONFIG_DEBUG_PER_CPU_MAPS -void debug_cpumask_set_cpu(int cpu, int node, bool enable); +void debug_cpumask_set_cpu(unsigned int cpu, int node, bool enable); #endif -#ifdef CONFIG_NUMA_EMU -#define FAKE_NODE_MIN_SIZE ((u64)32 << 20) -#define FAKE_NODE_MIN_HASH_MASK (~(FAKE_NODE_MIN_SIZE - 1UL)) -int numa_emu_cmdline(char *str); -#else /* CONFIG_NUMA_EMU */ -static inline int numa_emu_cmdline(char *str) -{ - return -EINVAL; -} -#endif /* CONFIG_NUMA_EMU */ - #endif /* _ASM_X86_NUMA_H */ diff --git 
a/arch/x86/include/asm/numa_32.h b/arch/x86/include/asm/numa_32.h deleted file mode 100644 index 9c8e9e85be77..000000000000 --- a/arch/x86/include/asm/numa_32.h +++ /dev/null @@ -1,13 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -#ifndef _ASM_X86_NUMA_32_H -#define _ASM_X86_NUMA_32_H - -#ifdef CONFIG_HIGHMEM -extern void set_highmem_pages_init(void); -#else -static inline void set_highmem_pages_init(void) -{ -} -#endif - -#endif /* _ASM_X86_NUMA_32_H */ diff --git a/arch/x86/include/asm/orc_header.h b/arch/x86/include/asm/orc_header.h new file mode 100644 index 000000000000..07bacf3e160e --- /dev/null +++ b/arch/x86/include/asm/orc_header.h @@ -0,0 +1,19 @@ +/* SPDX-License-Identifier: GPL-2.0-or-later */ +/* Copyright (c) Meta Platforms, Inc. and affiliates. */ + +#ifndef _ORC_HEADER_H +#define _ORC_HEADER_H + +#include <linux/types.h> +#include <linux/compiler.h> +#include <asm/orc_hash.h> + +/* + * The header is currently a 20-byte hash of the ORC entry definition; see + * scripts/orc_hash.sh. 
+ */ +#define ORC_HEADER \ + __used __section(".orc_header") __aligned(4) \ + static const u8 orc_header[] = { ORC_HASH } + +#endif /* _ORC_HEADER_H */ diff --git a/arch/x86/include/asm/orc_types.h b/arch/x86/include/asm/orc_types.h index 5a2baf28a1dc..e0125afa53fb 100644 --- a/arch/x86/include/asm/orc_types.h +++ b/arch/x86/include/asm/orc_types.h @@ -39,7 +39,13 @@ #define ORC_REG_SP_INDIRECT 9 #define ORC_REG_MAX 15 -#ifndef __ASSEMBLY__ +#define ORC_TYPE_UNDEFINED 0 +#define ORC_TYPE_END_OF_STACK 1 +#define ORC_TYPE_CALL 2 +#define ORC_TYPE_REGS 3 +#define ORC_TYPE_REGS_PARTIAL 4 + +#ifndef __ASSEMBLER__ #include <asm/byteorder.h> /* @@ -56,17 +62,17 @@ struct orc_entry { #if defined(__LITTLE_ENDIAN_BITFIELD) unsigned sp_reg:4; unsigned bp_reg:4; - unsigned type:2; - unsigned end:1; + unsigned type:3; + unsigned signal:1; #elif defined(__BIG_ENDIAN_BITFIELD) unsigned bp_reg:4; unsigned sp_reg:4; - unsigned unused:5; - unsigned end:1; - unsigned type:2; + unsigned unused:4; + unsigned signal:1; + unsigned type:3; #endif } __packed; -#endif /* __ASSEMBLY__ */ +#endif /* __ASSEMBLER__ */ #endif /* _ORC_TYPES_H */ diff --git a/arch/x86/include/asm/page.h b/arch/x86/include/asm/page.h index 9cc82f305f4b..9265f2fca99a 100644 --- a/arch/x86/include/asm/page.h +++ b/arch/x86/include/asm/page.h @@ -14,7 +14,7 @@ #include <asm/page_32.h> #endif /* CONFIG_X86_64 */ -#ifndef __ASSEMBLY__ +#ifndef __ASSEMBLER__ struct page; @@ -34,9 +34,8 @@ static inline void copy_user_page(void *to, void *from, unsigned long vaddr, copy_page(to, from); } -#define alloc_zeroed_user_highpage_movable(vma, vaddr) \ - alloc_page_vma(GFP_HIGHUSER_MOVABLE | __GFP_ZERO, vma, vaddr) -#define __HAVE_ARCH_ALLOC_ZEROED_USER_HIGHPAGE_MOVABLE +#define vma_alloc_zeroed_movable_folio(vma, vaddr) \ + vma_alloc_folio(GFP_HIGHUSER_MOVABLE | __GFP_ZERO, 0, vma, vaddr) #ifndef __pa #define __pa(x) __phys_addr((unsigned long)(x)) @@ -67,10 +66,14 @@ static inline void copy_user_page(void *to, void *from, 
unsigned long vaddr, * virt_addr_valid(kaddr) returns true. */ #define virt_to_page(kaddr) pfn_to_page(__pa(kaddr) >> PAGE_SHIFT) -#define pfn_to_kaddr(pfn) __va((pfn) << PAGE_SHIFT) extern bool __virt_addr_valid(unsigned long kaddr); #define virt_addr_valid(kaddr) __virt_addr_valid((unsigned long) (kaddr)) +static __always_inline void *pfn_to_kaddr(unsigned long pfn) +{ + return __va(pfn << PAGE_SHIFT); +} + static __always_inline u64 __canonical_address(u64 vaddr, u8 vaddr_bits) { return ((s64)vaddr << (64 - vaddr_bits)) >> (64 - vaddr_bits); @@ -81,7 +84,7 @@ static __always_inline u64 __is_canonical_address(u64 vaddr, u8 vaddr_bits) return __canonical_address(vaddr, vaddr_bits) == vaddr; } -#endif /* __ASSEMBLY__ */ +#endif /* __ASSEMBLER__ */ #include <asm-generic/memory_model.h> #include <asm-generic/getorder.h> diff --git a/arch/x86/include/asm/page_32.h b/arch/x86/include/asm/page_32.h index df42f8aa99e4..0c623706cb7e 100644 --- a/arch/x86/include/asm/page_32.h +++ b/arch/x86/include/asm/page_32.h @@ -4,7 +4,7 @@ #include <asm/page_32_types.h> -#ifndef __ASSEMBLY__ +#ifndef __ASSEMBLER__ #define __phys_addr_nodebug(x) ((x) - PAGE_OFFSET) #ifdef CONFIG_DEBUG_VIRTUAL @@ -15,10 +15,6 @@ extern unsigned long __phys_addr(unsigned long); #define __phys_addr_symbol(x) __phys_addr(x) #define __phys_reloc_hide(x) RELOC_HIDE((x), 0) -#ifdef CONFIG_FLATMEM -#define pfn_valid(pfn) ((pfn) < max_mapnr) -#endif /* CONFIG_FLATMEM */ - #include <linux/string.h> static inline void clear_page(void *page) @@ -30,6 +26,6 @@ static inline void copy_page(void *to, void *from) { memcpy(to, from, PAGE_SIZE); } -#endif /* !__ASSEMBLY__ */ +#endif /* !__ASSEMBLER__ */ #endif /* _ASM_X86_PAGE_32_H */ diff --git a/arch/x86/include/asm/page_32_types.h b/arch/x86/include/asm/page_32_types.h index faf9cc1c14bb..623f1e9f493e 100644 --- a/arch/x86/include/asm/page_32_types.h +++ b/arch/x86/include/asm/page_32_types.h @@ -11,8 +11,8 @@ * a virtual address space of one gigabyte, which limits 
the * amount of physical memory you can use to about 950MB. * - * If you want more physical memory than this then see the CONFIG_HIGHMEM4G - * and CONFIG_HIGHMEM64G options in the kernel configuration. + * If you want more physical memory than this then see the CONFIG_VMSPLIT_2G + * and CONFIG_HIGHMEM4G options in the kernel configuration. */ #define __PAGE_OFFSET_BASE _AC(CONFIG_PAGE_OFFSET, UL) #define __PAGE_OFFSET __PAGE_OFFSET_BASE @@ -63,7 +63,7 @@ */ #define KERNEL_IMAGE_SIZE (512 * 1024 * 1024) -#ifndef __ASSEMBLY__ +#ifndef __ASSEMBLER__ /* * This much address space is reserved for vmalloc() and iomap() @@ -73,8 +73,7 @@ extern unsigned int __VMALLOC_RESERVE; extern int sysctl_legacy_va_layout; extern void find_low_pfn_range(void); -extern void setup_bootmem_allocator(void); -#endif /* !__ASSEMBLY__ */ +#endif /* !__ASSEMBLER__ */ #endif /* _ASM_X86_PAGE_32_DEFS_H */ diff --git a/arch/x86/include/asm/page_64.h b/arch/x86/include/asm/page_64.h index baa70451b8df..015d23f3e01f 100644 --- a/arch/x86/include/asm/page_64.h +++ b/arch/x86/include/asm/page_64.h @@ -4,10 +4,12 @@ #include <asm/page_64_types.h> -#ifndef __ASSEMBLY__ +#ifndef __ASSEMBLER__ #include <asm/cpufeatures.h> #include <asm/alternative.h> +#include <linux/kmsan-checks.h> + /* duplicated to the one in bootmem.h */ extern unsigned long max_pfn; extern unsigned long phys_base; @@ -15,6 +17,7 @@ extern unsigned long phys_base; extern unsigned long page_offset_base; extern unsigned long vmalloc_base; extern unsigned long vmemmap_base; +extern unsigned long direct_map_physmem_end; static __always_inline unsigned long __phys_addr_nodebug(unsigned long x) { @@ -37,27 +40,28 @@ extern unsigned long __phys_addr_symbol(unsigned long); #define __phys_reloc_hide(x) (x) -#ifdef CONFIG_FLATMEM -#define pfn_valid(pfn) ((pfn) < max_pfn) -#endif - void clear_page_orig(void *page); void clear_page_rep(void *page); void clear_page_erms(void *page); static inline void clear_page(void *page) { + /* + * Clean up 
KMSAN metadata for the page being cleared. The assembly call + * below clobbers @page, so we perform unpoisoning before it. + */ + kmsan_unpoison_memory(page, PAGE_SIZE); alternative_call_2(clear_page_orig, clear_page_rep, X86_FEATURE_REP_GOOD, clear_page_erms, X86_FEATURE_ERMS, "=D" (page), - "0" (page) - : "cc", "memory", "rax", "rcx"); + "D" (page), + "cc", "memory", "rax", "rcx"); } void copy_page(void *to, void *from); +KCFI_REFERENCE(copy_page); -#ifdef CONFIG_X86_5LEVEL /* * User space process size. This is the first address outside the user range. * There are a few constraints that determine this: @@ -88,9 +92,8 @@ static __always_inline unsigned long task_size_max(void) return ret; } -#endif /* CONFIG_X86_5LEVEL */ -#endif /* !__ASSEMBLY__ */ +#endif /* !__ASSEMBLER__ */ #ifdef CONFIG_X86_VSYSCALL_EMULATION # define __HAVE_ARCH_GATE_AREA 1 diff --git a/arch/x86/include/asm/page_64_types.h b/arch/x86/include/asm/page_64_types.h index e9e2c3ba5923..7400dab373fe 100644 --- a/arch/x86/include/asm/page_64_types.h +++ b/arch/x86/include/asm/page_64_types.h @@ -2,7 +2,7 @@ #ifndef _ASM_X86_PAGE_64_DEFS_H #define _ASM_X86_PAGE_64_DEFS_H -#ifndef __ASSEMBLY__ +#ifndef __ASSEMBLER__ #include <asm/kaslr.h> #endif @@ -41,25 +41,14 @@ #define __PAGE_OFFSET_BASE_L5 _AC(0xff11000000000000, UL) #define __PAGE_OFFSET_BASE_L4 _AC(0xffff888000000000, UL) -#ifdef CONFIG_DYNAMIC_MEMORY_LAYOUT #define __PAGE_OFFSET page_offset_base -#else -#define __PAGE_OFFSET __PAGE_OFFSET_BASE_L4 -#endif /* CONFIG_DYNAMIC_MEMORY_LAYOUT */ #define __START_KERNEL_map _AC(0xffffffff80000000, UL) -/* See Documentation/x86/x86_64/mm.rst for a description of the memory map. */ +/* See Documentation/arch/x86/x86_64/mm.rst for a description of the memory map. */ #define __PHYSICAL_MASK_SHIFT 52 - -#ifdef CONFIG_X86_5LEVEL #define __VIRTUAL_MASK_SHIFT (pgtable_l5_enabled() ? 
56 : 47) -/* See task_size_max() in <asm/page_64.h> */ -#else -#define __VIRTUAL_MASK_SHIFT 47 -#define task_size_max() ((_AC(1,UL) << __VIRTUAL_MASK_SHIFT) - PAGE_SIZE) -#endif #define TASK_SIZE_MAX task_size_max() #define DEFAULT_MAP_WINDOW ((1UL << 47) - PAGE_SIZE) diff --git a/arch/x86/include/asm/page_types.h b/arch/x86/include/asm/page_types.h index a506a411474d..018a8d906ca3 100644 --- a/arch/x86/include/asm/page_types.h +++ b/arch/x86/include/asm/page_types.h @@ -6,25 +6,16 @@ #include <linux/types.h> #include <linux/mem_encrypt.h> -/* PAGE_SHIFT determines the page size */ -#define PAGE_SHIFT 12 -#define PAGE_SIZE (_AC(1,UL) << PAGE_SHIFT) -#define PAGE_MASK (~(PAGE_SIZE-1)) - -#define PMD_PAGE_SIZE (_AC(1, UL) << PMD_SHIFT) -#define PMD_PAGE_MASK (~(PMD_PAGE_SIZE-1)) - -#define PUD_PAGE_SIZE (_AC(1, UL) << PUD_SHIFT) -#define PUD_PAGE_MASK (~(PUD_PAGE_SIZE-1)) +#include <vdso/page.h> #define __VIRTUAL_MASK ((1UL << __VIRTUAL_MASK_SHIFT) - 1) -/* Cast *PAGE_MASK to a signed type so that it is sign-extended if +/* Cast P*D_MASK to a signed type so that it is sign-extended if virtual addresses are 32-bits but physical addresses are larger (ie, 32-bit PAE). */ #define PHYSICAL_PAGE_MASK (((signed long)PAGE_MASK) & __PHYSICAL_MASK) -#define PHYSICAL_PMD_PAGE_MASK (((signed long)PMD_PAGE_MASK) & __PHYSICAL_MASK) -#define PHYSICAL_PUD_PAGE_MASK (((signed long)PUD_PAGE_MASK) & __PHYSICAL_MASK) +#define PHYSICAL_PMD_PAGE_MASK (((signed long)PMD_MASK) & __PHYSICAL_MASK) +#define PHYSICAL_PUD_PAGE_MASK (((signed long)PUD_MASK) & __PHYSICAL_MASK) #define HPAGE_SHIFT PMD_SHIFT #define HPAGE_SIZE (_AC(1,UL) << HPAGE_SHIFT) @@ -37,10 +28,10 @@ #define VM_DATA_DEFAULT_FLAGS VM_DATA_FLAGS_TSK_EXEC -#define __PHYSICAL_START ALIGN(CONFIG_PHYSICAL_START, \ - CONFIG_PHYSICAL_ALIGN) +/* Physical address where kernel should be loaded. 
*/ +#define LOAD_PHYSICAL_ADDR __ALIGN_KERNEL_MASK(CONFIG_PHYSICAL_START, CONFIG_PHYSICAL_ALIGN - 1) -#define __START_KERNEL (__START_KERNEL_map + __PHYSICAL_START) +#define __START_KERNEL (__START_KERNEL_map + LOAD_PHYSICAL_ADDR) #ifdef CONFIG_X86_64 #include <asm/page_64_types.h> @@ -50,7 +41,7 @@ #define IOREMAP_MAX_ORDER (PMD_SHIFT) #endif /* CONFIG_X86_64 */ -#ifndef __ASSEMBLY__ +#ifndef __ASSEMBLER__ #ifdef CONFIG_DYNAMIC_PHYSICAL_MASK extern phys_addr_t physical_mask; @@ -73,6 +64,6 @@ bool pfn_range_is_mapped(unsigned long start_pfn, unsigned long end_pfn); extern void initmem_init(void); -#endif /* !__ASSEMBLY__ */ +#endif /* !__ASSEMBLER__ */ #endif /* _ASM_X86_PAGE_DEFS_H */ diff --git a/arch/x86/include/asm/paravirt.h b/arch/x86/include/asm/paravirt.h index 964442b99245..b5e59a7ba0d0 100644 --- a/arch/x86/include/asm/paravirt.h +++ b/arch/x86/include/asm/paravirt.h @@ -4,14 +4,18 @@ /* Various instructions on x86 need to be replaced for * para-virtualization: those hooks are defined here. 
*/ +#include <asm/paravirt_types.h> + +#ifndef __ASSEMBLER__ +struct mm_struct; +#endif + #ifdef CONFIG_PARAVIRT #include <asm/pgtable_types.h> #include <asm/asm.h> #include <asm/nospec-branch.h> -#include <asm/paravirt_types.h> - -#ifndef __ASSEMBLY__ +#ifndef __ASSEMBLER__ #include <linux/bug.h> #include <linux/types.h> #include <linux/cpumask.h> @@ -26,7 +30,7 @@ DECLARE_STATIC_CALL(pv_sched_clock, dummy_sched_clock); void paravirt_set_sched_clock(u64 (*func)(void)); -static inline u64 paravirt_sched_clock(void) +static __always_inline u64 paravirt_sched_clock(void) { return static_call(pv_sched_clock)(); } @@ -87,11 +91,6 @@ static inline void __flush_tlb_multi(const struct cpumask *cpumask, PVOP_VCALL2(mmu.flush_tlb_multi, cpumask, info); } -static inline void paravirt_tlb_remove_table(struct mmu_gather *tlb, void *table) -{ - PVOP_VCALL2(mmu.tlb_remove_table, tlb, table); -} - static inline void paravirt_arch_exit_mmap(struct mm_struct *mm) { PVOP_VCALL1(mmu.exit_mmap, mm); @@ -103,6 +102,16 @@ static inline void notify_page_enc_status_changed(unsigned long pfn, PVOP_VCALL3(mmu.notify_page_enc_status_changed, pfn, npages, enc); } +static __always_inline void arch_safe_halt(void) +{ + PVOP_VCALL0(irq.safe_halt); +} + +static inline void halt(void) +{ + PVOP_VCALL0(irq.halt); +} + #ifdef CONFIG_PARAVIRT_XXL static inline void load_sp0(unsigned long sp0) { @@ -142,8 +151,7 @@ static inline void write_cr0(unsigned long x) static __always_inline unsigned long read_cr2(void) { return PVOP_ALT_CALLEE0(unsigned long, mmu.read_cr2, - "mov %%cr2, %%rax;", - ALT_NOT(X86_FEATURE_XENPV)); + "mov %%cr2, %%rax;", ALT_NOT_XEN); } static __always_inline void write_cr2(unsigned long x) @@ -154,13 +162,12 @@ static __always_inline void write_cr2(unsigned long x) static inline unsigned long __read_cr3(void) { return PVOP_ALT_CALL0(unsigned long, mmu.read_cr3, - "mov %%cr3, %%rax;", ALT_NOT(X86_FEATURE_XENPV)); + "mov %%cr3, %%rax;", ALT_NOT_XEN); } static inline void 
write_cr3(unsigned long x) { - PVOP_ALT_VCALL1(mmu.write_cr3, x, - "mov %%rdi, %%cr3", ALT_NOT(X86_FEATURE_XENPV)); + PVOP_ALT_VCALL1(mmu.write_cr3, x, "mov %%rdi, %%cr3", ALT_NOT_XEN); } static inline void __write_cr4(unsigned long x) @@ -168,41 +175,24 @@ static inline void __write_cr4(unsigned long x) PVOP_VCALL1(cpu.write_cr4, x); } -static inline void arch_safe_halt(void) -{ - PVOP_VCALL0(irq.safe_halt); -} - -static inline void halt(void) -{ - PVOP_VCALL0(irq.halt); -} - -static inline void wbinvd(void) -{ - PVOP_ALT_VCALL0(cpu.wbinvd, "wbinvd", ALT_NOT(X86_FEATURE_XENPV)); -} - -static inline u64 paravirt_read_msr(unsigned msr) +static inline u64 paravirt_read_msr(u32 msr) { return PVOP_CALL1(u64, cpu.read_msr, msr); } -static inline void paravirt_write_msr(unsigned msr, - unsigned low, unsigned high) +static inline void paravirt_write_msr(u32 msr, u64 val) { - PVOP_VCALL3(cpu.write_msr, msr, low, high); + PVOP_VCALL2(cpu.write_msr, msr, val); } -static inline u64 paravirt_read_msr_safe(unsigned msr, int *err) +static inline int paravirt_read_msr_safe(u32 msr, u64 *val) { - return PVOP_CALL2(u64, cpu.read_msr_safe, msr, err); + return PVOP_CALL2(int, cpu.read_msr_safe, msr, val); } -static inline int paravirt_write_msr_safe(unsigned msr, - unsigned low, unsigned high) +static inline int paravirt_write_msr_safe(u32 msr, u64 val) { - return PVOP_CALL3(int, cpu.write_msr_safe, msr, low, high); + return PVOP_CALL2(int, cpu.write_msr_safe, msr, val); } #define rdmsr(msr, val1, val2) \ @@ -212,55 +202,46 @@ do { \ val2 = _l >> 32; \ } while (0) -#define wrmsr(msr, val1, val2) \ -do { \ - paravirt_write_msr(msr, val1, val2); \ -} while (0) +static __always_inline void wrmsr(u32 msr, u32 low, u32 high) +{ + paravirt_write_msr(msr, (u64)high << 32 | low); +} -#define rdmsrl(msr, val) \ +#define rdmsrq(msr, val) \ do { \ val = paravirt_read_msr(msr); \ } while (0) -static inline void wrmsrl(unsigned msr, u64 val) +static inline void wrmsrq(u32 msr, u64 val) { - 
wrmsr(msr, (u32)val, (u32)(val>>32)); + paravirt_write_msr(msr, val); } -#define wrmsr_safe(msr, a, b) paravirt_write_msr_safe(msr, a, b) +static inline int wrmsrq_safe(u32 msr, u64 val) +{ + return paravirt_write_msr_safe(msr, val); +} /* rdmsr with exception handling */ #define rdmsr_safe(msr, a, b) \ ({ \ - int _err; \ - u64 _l = paravirt_read_msr_safe(msr, &_err); \ + u64 _l; \ + int _err = paravirt_read_msr_safe((msr), &_l); \ (*a) = (u32)_l; \ - (*b) = _l >> 32; \ + (*b) = (u32)(_l >> 32); \ _err; \ }) -static inline int rdmsrl_safe(unsigned msr, unsigned long long *p) +static __always_inline int rdmsrq_safe(u32 msr, u64 *p) { - int err; - - *p = paravirt_read_msr_safe(msr, &err); - return err; + return paravirt_read_msr_safe(msr, p); } -static inline unsigned long long paravirt_read_pmc(int counter) +static __always_inline u64 rdpmc(int counter) { return PVOP_CALL1(u64, cpu.read_pmc, counter); } -#define rdpmc(counter, low, high) \ -do { \ - u64 _l = paravirt_read_pmc(counter); \ - low = (u32)_l; \ - high = _l >> 32; \ -} while (0) - -#define rdpmcl(counter, val) ((val) = paravirt_read_pmc(counter)) - static inline void paravirt_alloc_ldt(struct desc_struct *ldt, unsigned entries) { PVOP_VCALL2(cpu.alloc_ldt, ldt, entries); @@ -332,16 +313,9 @@ static inline void tss_update_io_bitmap(void) } #endif -static inline void paravirt_activate_mm(struct mm_struct *prev, - struct mm_struct *next) +static inline void paravirt_enter_mmap(struct mm_struct *next) { - PVOP_VCALL2(mmu.activate_mm, prev, next); -} - -static inline void paravirt_arch_dup_mmap(struct mm_struct *oldmm, - struct mm_struct *mm) -{ - PVOP_VCALL2(mmu.dup_mmap, oldmm, mm); + PVOP_VCALL1(mmu.enter_mmap, next); } static inline int paravirt_pgd_alloc(struct mm_struct *mm) @@ -395,27 +369,25 @@ static inline void paravirt_release_p4d(unsigned long pfn) static inline pte_t __pte(pteval_t val) { return (pte_t) { PVOP_ALT_CALLEE1(pteval_t, mmu.make_pte, val, - "mov %%rdi, %%rax", - 
ALT_NOT(X86_FEATURE_XENPV)) }; + "mov %%rdi, %%rax", ALT_NOT_XEN) }; } static inline pteval_t pte_val(pte_t pte) { return PVOP_ALT_CALLEE1(pteval_t, mmu.pte_val, pte.pte, - "mov %%rdi, %%rax", ALT_NOT(X86_FEATURE_XENPV)); + "mov %%rdi, %%rax", ALT_NOT_XEN); } static inline pgd_t __pgd(pgdval_t val) { return (pgd_t) { PVOP_ALT_CALLEE1(pgdval_t, mmu.make_pgd, val, - "mov %%rdi, %%rax", - ALT_NOT(X86_FEATURE_XENPV)) }; + "mov %%rdi, %%rax", ALT_NOT_XEN) }; } static inline pgdval_t pgd_val(pgd_t pgd) { return PVOP_ALT_CALLEE1(pgdval_t, mmu.pgd_val, pgd.pgd, - "mov %%rdi, %%rax", ALT_NOT(X86_FEATURE_XENPV)); + "mov %%rdi, %%rax", ALT_NOT_XEN); } #define __HAVE_ARCH_PTEP_MODIFY_PROT_TRANSACTION @@ -449,14 +421,13 @@ static inline void set_pmd(pmd_t *pmdp, pmd_t pmd) static inline pmd_t __pmd(pmdval_t val) { return (pmd_t) { PVOP_ALT_CALLEE1(pmdval_t, mmu.make_pmd, val, - "mov %%rdi, %%rax", - ALT_NOT(X86_FEATURE_XENPV)) }; + "mov %%rdi, %%rax", ALT_NOT_XEN) }; } static inline pmdval_t pmd_val(pmd_t pmd) { return PVOP_ALT_CALLEE1(pmdval_t, mmu.pmd_val, pmd.pmd, - "mov %%rdi, %%rax", ALT_NOT(X86_FEATURE_XENPV)); + "mov %%rdi, %%rax", ALT_NOT_XEN); } static inline void set_pud(pud_t *pudp, pud_t pud) @@ -469,7 +440,7 @@ static inline pud_t __pud(pudval_t val) pudval_t ret; ret = PVOP_ALT_CALLEE1(pudval_t, mmu.make_pud, val, - "mov %%rdi, %%rax", ALT_NOT(X86_FEATURE_XENPV)); + "mov %%rdi, %%rax", ALT_NOT_XEN); return (pud_t) { ret }; } @@ -477,7 +448,7 @@ static inline pud_t __pud(pudval_t val) static inline pudval_t pud_val(pud_t pud) { return PVOP_ALT_CALLEE1(pudval_t, mmu.pud_val, pud.pud, - "mov %%rdi, %%rax", ALT_NOT(X86_FEATURE_XENPV)); + "mov %%rdi, %%rax", ALT_NOT_XEN); } static inline void pud_clear(pud_t *pudp) @@ -492,13 +463,10 @@ static inline void set_p4d(p4d_t *p4dp, p4d_t p4d) PVOP_VCALL2(mmu.set_p4d, p4dp, val); } -#if CONFIG_PGTABLE_LEVELS >= 5 - static inline p4d_t __p4d(p4dval_t val) { p4dval_t ret = PVOP_ALT_CALLEE1(p4dval_t, mmu.make_p4d, val, - "mov 
%%rdi, %%rax", - ALT_NOT(X86_FEATURE_XENPV)); + "mov %%rdi, %%rax", ALT_NOT_XEN); return (p4d_t) { ret }; } @@ -506,7 +474,7 @@ static inline p4d_t __p4d(p4dval_t val) static inline p4dval_t p4d_val(p4d_t p4d) { return PVOP_ALT_CALLEE1(p4dval_t, mmu.p4d_val, p4d.p4d, - "mov %%rdi, %%rax", ALT_NOT(X86_FEATURE_XENPV)); + "mov %%rdi, %%rax", ALT_NOT_XEN); } static inline void __set_pgd(pgd_t *pgdp, pgd_t pgd) @@ -526,8 +494,6 @@ static inline void __set_pgd(pgd_t *pgdp, pgd_t pgd) set_pgd(pgdp, native_make_pgd(0)); \ } while (0) -#endif /* CONFIG_PGTABLE_LEVELS == 5 */ - static inline void p4d_clear(p4d_t *p4dp) { set_p4d(p4dp, native_make_p4d(0)); @@ -665,6 +631,7 @@ bool __raw_callee_save___native_vcpu_is_preempted(long cpu); asm(".pushsection " section ", \"ax\";" \ ".globl " PV_THUNK_NAME(func) ";" \ ".type " PV_THUNK_NAME(func) ", @function;" \ + ASM_FUNC_ALIGN \ PV_THUNK_NAME(func) ":" \ ASM_ENDBR \ FRAME_BEGIN \ @@ -691,17 +658,17 @@ bool __raw_callee_save___native_vcpu_is_preempted(long cpu); static __always_inline unsigned long arch_local_save_flags(void) { return PVOP_ALT_CALLEE0(unsigned long, irq.save_fl, "pushf; pop %%rax;", - ALT_NOT(X86_FEATURE_XENPV)); + ALT_NOT_XEN); } static __always_inline void arch_local_irq_disable(void) { - PVOP_ALT_VCALLEE0(irq.irq_disable, "cli;", ALT_NOT(X86_FEATURE_XENPV)); + PVOP_ALT_VCALLEE0(irq.irq_disable, "cli;", ALT_NOT_XEN); } static __always_inline void arch_local_irq_enable(void) { - PVOP_ALT_VCALLEE0(irq.irq_enable, "sti;", ALT_NOT(X86_FEATURE_XENPV)); + PVOP_ALT_VCALLEE0(irq.irq_enable, "sti;", ALT_NOT_XEN); } static __always_inline unsigned long arch_local_irq_save(void) @@ -731,50 +698,42 @@ static __always_inline unsigned long arch_local_irq_save(void) #undef PVOP_CALL4 extern void default_banner(void); +void native_pv_lock_init(void) __init; -#else /* __ASSEMBLY__ */ - -#define _PVSITE(ptype, ops, word, algn) \ -771:; \ - ops; \ -772:; \ - .pushsection .parainstructions,"a"; \ - .align algn; \ - word 771b; \ - 
.byte ptype; \ - .byte 772b-771b; \ - .popsection - +#else /* __ASSEMBLER__ */ #ifdef CONFIG_X86_64 #ifdef CONFIG_PARAVIRT_XXL +#ifdef CONFIG_DEBUG_ENTRY -#define PARA_PATCH(off) ((off) / 8) -#define PARA_SITE(ptype, ops) _PVSITE(ptype, ops, .quad, 8) #define PARA_INDIRECT(addr) *addr(%rip) -#ifdef CONFIG_DEBUG_ENTRY .macro PARA_IRQ_save_fl - PARA_SITE(PARA_PATCH(PV_IRQ_save_fl), - ANNOTATE_RETPOLINE_SAFE; - call PARA_INDIRECT(pv_ops+PV_IRQ_save_fl);) + ANNOTATE_RETPOLINE_SAFE; + call PARA_INDIRECT(pv_ops+PV_IRQ_save_fl); .endm -#define SAVE_FLAGS ALTERNATIVE "PARA_IRQ_save_fl;", "pushf; pop %rax;", \ - ALT_NOT(X86_FEATURE_XENPV) +#define SAVE_FLAGS ALTERNATIVE_2 "PARA_IRQ_save_fl;", \ + "ALT_CALL_INSTR;", ALT_CALL_ALWAYS, \ + "pushf; pop %rax;", ALT_NOT_XEN #endif #endif /* CONFIG_PARAVIRT_XXL */ #endif /* CONFIG_X86_64 */ -#endif /* __ASSEMBLY__ */ +#endif /* __ASSEMBLER__ */ #else /* CONFIG_PARAVIRT */ # define default_banner x86_init_noop + +#ifndef __ASSEMBLER__ +static inline void native_pv_lock_init(void) +{ +} +#endif #endif /* !CONFIG_PARAVIRT */ -#ifndef __ASSEMBLY__ +#ifndef __ASSEMBLER__ #ifndef CONFIG_PARAVIRT_XXL -static inline void paravirt_arch_dup_mmap(struct mm_struct *oldmm, - struct mm_struct *mm) +static inline void paravirt_enter_mmap(struct mm_struct *mm) { } #endif @@ -790,5 +749,5 @@ static inline void paravirt_set_cap(void) { } #endif -#endif /* __ASSEMBLY__ */ +#endif /* __ASSEMBLER__ */ #endif /* _ASM_X86_PARAVIRT_H */ diff --git a/arch/x86/include/asm/paravirt_types.h b/arch/x86/include/asm/paravirt_types.h index 89df6c6617f5..37a8627d8277 100644 --- a/arch/x86/include/asm/paravirt_types.h +++ b/arch/x86/include/asm/paravirt_types.h @@ -2,38 +2,10 @@ #ifndef _ASM_X86_PARAVIRT_TYPES_H #define _ASM_X86_PARAVIRT_TYPES_H -/* Bitmask of what can be clobbered: usually at least eax. 
*/ -#define CLBR_EAX (1 << 0) -#define CLBR_ECX (1 << 1) -#define CLBR_EDX (1 << 2) -#define CLBR_EDI (1 << 3) +#ifdef CONFIG_PARAVIRT -#ifdef CONFIG_X86_32 -/* CLBR_ANY should match all regs platform has. For i386, that's just it */ -#define CLBR_ANY ((1 << 4) - 1) - -#define CLBR_ARG_REGS (CLBR_EAX | CLBR_EDX | CLBR_ECX) -#define CLBR_RET_REG (CLBR_EAX | CLBR_EDX) -#else -#define CLBR_RAX CLBR_EAX -#define CLBR_RCX CLBR_ECX -#define CLBR_RDX CLBR_EDX -#define CLBR_RDI CLBR_EDI -#define CLBR_RSI (1 << 4) -#define CLBR_R8 (1 << 5) -#define CLBR_R9 (1 << 6) -#define CLBR_R10 (1 << 7) -#define CLBR_R11 (1 << 8) - -#define CLBR_ANY ((1 << 9) - 1) - -#define CLBR_ARG_REGS (CLBR_RDI | CLBR_RSI | CLBR_RDX | \ - CLBR_RCX | CLBR_R8 | CLBR_R9) -#define CLBR_RET_REG (CLBR_RAX) - -#endif /* X86_64 */ - -#ifndef __ASSEMBLY__ +#ifndef __ASSEMBLER__ +#include <linux/types.h> #include <asm/desc_defs.h> #include <asm/pgtable_types.h> @@ -114,22 +86,20 @@ struct pv_cpu_ops { void (*update_io_bitmap)(void); #endif - void (*wbinvd)(void); - /* cpuid emulation, mostly so that caps bits can be disabled */ void (*cpuid)(unsigned int *eax, unsigned int *ebx, unsigned int *ecx, unsigned int *edx); /* Unsafe MSR operations. These will warn or panic on failure. */ - u64 (*read_msr)(unsigned int msr); - void (*write_msr)(unsigned int msr, unsigned low, unsigned high); + u64 (*read_msr)(u32 msr); + void (*write_msr)(u32 msr, u64 val); /* * Safe MSR operations. - * read sets err to 0 or -EIO. write returns 0 or -EIO. + * Returns 0 or -EIO. 
*/ - u64 (*read_msr_safe)(unsigned int msr, int *err); - int (*write_msr_safe)(unsigned int msr, unsigned low, unsigned high); + int (*read_msr_safe)(u32 msr, u64 *val); + int (*write_msr_safe)(u32 msr, u64 val); u64 (*read_pmc)(int counter); @@ -150,10 +120,9 @@ struct pv_irq_ops { struct paravirt_callee_save save_fl; struct paravirt_callee_save irq_disable; struct paravirt_callee_save irq_enable; - +#endif void (*safe_halt)(void); void (*halt)(void); -#endif } __no_randomize_layout; struct pv_mmu_ops { @@ -164,8 +133,6 @@ struct pv_mmu_ops { void (*flush_tlb_multi)(const struct cpumask *cpus, const struct flush_tlb_info *info); - void (*tlb_remove_table)(struct mmu_gather *tlb, void *table); - /* Hook for intercepting the destruction of an mm_struct. */ void (*exit_mmap)(struct mm_struct *mm); void (*notify_page_enc_status_changed)(unsigned long pfn, int npages, bool enc); @@ -177,11 +144,8 @@ struct pv_mmu_ops { unsigned long (*read_cr3)(void); void (*write_cr3)(unsigned long); - /* Hooks for intercepting the creation/use of an mm_struct. */ - void (*activate_mm)(struct mm_struct *prev, - struct mm_struct *next); - void (*dup_mmap)(struct mm_struct *oldmm, - struct mm_struct *mm); + /* Hook for intercepting the creation/use of an mm_struct. 
*/ + void (*enter_mmap)(struct mm_struct *mm); /* Hooks for allocating and freeing a pagetable top-level */ int (*pgd_alloc)(struct mm_struct *mm); @@ -225,12 +189,10 @@ struct pv_mmu_ops { void (*set_p4d)(p4d_t *p4dp, p4d_t p4dval); -#if CONFIG_PGTABLE_LEVELS >= 5 struct paravirt_callee_save p4d_val; struct paravirt_callee_save make_p4d; void (*set_pgd)(pgd_t *pgdp, pgd_t pgdval); -#endif /* CONFIG_PGTABLE_LEVELS >= 5 */ struct pv_lazy_ops lazy_mode; @@ -273,45 +235,18 @@ struct paravirt_patch_template { extern struct pv_info pv_info; extern struct paravirt_patch_template pv_ops; -#define PARAVIRT_PATCH(x) \ - (offsetof(struct paravirt_patch_template, x) / sizeof(void *)) - -#define paravirt_type(op) \ - [paravirt_typenum] "i" (PARAVIRT_PATCH(op)), \ - [paravirt_opptr] "m" (pv_ops.op) -#define paravirt_clobber(clobber) \ - [paravirt_clobber] "i" (clobber) - -/* - * Generate some code, and mark it as patchable by the - * apply_paravirt() alternate instruction patcher. - */ -#define _paravirt_alt(insn_string, type, clobber) \ - "771:\n\t" insn_string "\n" "772:\n" \ - ".pushsection .parainstructions,\"a\"\n" \ - _ASM_ALIGN "\n" \ - _ASM_PTR " 771b\n" \ - " .byte " type "\n" \ - " .byte 772b-771b\n" \ - " .short " clobber "\n" \ - ".popsection\n" - -/* Generate patchable code, with the default asm parameters. */ -#define paravirt_alt(insn_string) \ - _paravirt_alt(insn_string, "%c[paravirt_typenum]", "%c[paravirt_clobber]") - -/* Simple instruction patching code. */ -#define NATIVE_LABEL(a,x,b) "\n\t.globl " a #x "_" #b "\n" a #x "_" #b ":\n\t" - -unsigned int paravirt_patch(u8 type, void *insn_buff, unsigned long addr, unsigned int len); - -int paravirt_disable_iospace(void); +#define paravirt_ptr(op) [paravirt_opptr] "m" (pv_ops.op) /* * This generates an indirect call based on the operation type number. 
- * The type number, computed in PARAVIRT_PATCH, is derived from the - * offset into the paravirt_patch_template structure, and can therefore be - * freely converted back into a structure offset. + * + * Since alternatives run after enabling CET/IBT -- the latter setting/clearing + * capabilities and the former requiring all capabilities being finalized -- + * these indirect calls are subject to IBT and the paravirt stubs should have + * ENDBR on. + * + * OTOH since this is effectively a __nocfi indirect call, the paravirt stubs + * don't need to bother with CFI prefixes. */ #define PARAVIRT_CALL \ ANNOTATE_RETPOLINE_SAFE \ @@ -328,7 +263,7 @@ int paravirt_disable_iospace(void); * Unfortunately, this is a relatively slow operation for modern CPUs, * because it cannot necessarily determine what the destination * address is. In this case, the address is a runtime constant, so at - * the very least we can patch the call to e a simple direct call, or + * the very least we can patch the call to a simple direct call, or, * ideally, patch an inline implementation into the callsite. (Direct * calls are essentially free, because the call and return addresses * are completely predictable.) @@ -339,18 +274,12 @@ int paravirt_disable_iospace(void); * on the stack. All caller-save registers (eax,edx,ecx) are expected * to be modified (either clobbered or used for return values). * X86_64, on the other hand, already specifies a register-based calling - * conventions, returning at %rax, with parameters going on %rdi, %rsi, + * conventions, returning at %rax, with parameters going in %rdi, %rsi, * %rdx, and %rcx. Note that for this reason, x86_64 does not need any * special handling for dealing with 4 arguments, unlike i386. 
- * However, x86_64 also have to clobber all caller saved registers, which + * However, x86_64 also has to clobber all caller saved registers, which * unfortunately, are quite a bit (r8 - r11) * - * The call instruction itself is marked by placing its start address - * and size into the .parainstructions section, so that - * apply_paravirt() in arch/i386/kernel/alternative.c can do the - * appropriate patching under the control of the backend pv_init_ops - * implementation. - * * Unfortunately there's no way to get gcc to generate the args setup * for the call, and then allow the call itself to be generated by an * inline asm. Because of this, we must do the complete arg setup and @@ -360,22 +289,22 @@ int paravirt_disable_iospace(void); * There are 5 sets of PVOP_* macros for dealing with 0-4 arguments. * It could be extended to more arguments, but there would be little * to be gained from that. For each number of arguments, there are - * the two VCALL and CALL variants for void and non-void functions. + * two VCALL and CALL variants for void and non-void functions. * * When there is a return value, the invoker of the macro must specify * the return type. The macro then uses sizeof() on that type to - * determine whether its a 32 or 64 bit value, and places the return + * determine whether it's a 32 or 64 bit value and places the return * in the right register(s) (just %eax for 32-bit, and %edx:%eax for - * 64-bit). For x86_64 machines, it just returns at %rax regardless of + * 64-bit). For x86_64 machines, it just returns in %rax regardless of * the return value size. * - * 64-bit arguments are passed as a pair of adjacent 32-bit arguments + * 64-bit arguments are passed as a pair of adjacent 32-bit arguments; * i386 also passes 64-bit arguments as a pair of adjacent 32-bit arguments * in low,high order * * Small structures are passed and returned in registers. 
The macro * calling convention can't directly deal with this, so the wrapper - * functions must do this. + * functions must do it. * * These PVOP_* macros are only defined within this header. This * means that all uses must be wrapped in inline functions. This also @@ -414,8 +343,17 @@ int paravirt_disable_iospace(void); "=c" (__ecx) #define PVOP_CALL_CLOBBERS PVOP_VCALL_CLOBBERS, "=a" (__eax) -/* void functions are still allowed [re]ax for scratch */ +/* + * void functions are still allowed [re]ax for scratch. + * + * The ZERO_CALL_USED REGS feature may end up zeroing out callee-saved + * registers. Make sure we model this with the appropriate clobbers. + */ +#ifdef CONFIG_ZERO_CALL_USED_REGS +#define PVOP_VCALLEE_CLOBBERS "=a" (__eax), PVOP_VCALL_CLOBBERS +#else #define PVOP_VCALLEE_CLOBBERS "=a" (__eax) +#endif #define PVOP_CALLEE_CLOBBERS PVOP_VCALLEE_CLOBBERS #define EXTRA_CLOBBERS , "r8", "r9", "r10", "r11" @@ -440,68 +378,80 @@ int paravirt_disable_iospace(void); __mask & __eax; \ }) - -#define ____PVOP_CALL(ret, op, clbr, call_clbr, extra_clbr, ...) \ +/* + * Use alternative patching for paravirt calls: + * - For replacing an indirect call with a direct one, use the "normal" + * ALTERNATIVE() macro with the indirect call as the initial code sequence, + * which will be replaced with the related direct call by using the + * ALT_FLAG_DIRECT_CALL special case and the "always on" feature. + * - In case the replacement is either a direct call or a short code sequence + * depending on a feature bit, the ALTERNATIVE_2() macro is being used. + * The indirect call is the initial code sequence again, while the special + * code sequence is selected with the specified feature bit. In case the + * feature is not active, the direct call is used as above via the + * ALT_FLAG_DIRECT_CALL special case and the "always on" feature. + */ +#define ____PVOP_CALL(ret, op, call_clbr, extra_clbr, ...) 
\ ({ \ PVOP_CALL_ARGS; \ PVOP_TEST_NULL(op); \ - asm volatile(paravirt_alt(PARAVIRT_CALL) \ + asm volatile(ALTERNATIVE(PARAVIRT_CALL, ALT_CALL_INSTR, \ + ALT_CALL_ALWAYS) \ : call_clbr, ASM_CALL_CONSTRAINT \ - : paravirt_type(op), \ - paravirt_clobber(clbr), \ + : paravirt_ptr(op), \ ##__VA_ARGS__ \ : "memory", "cc" extra_clbr); \ ret; \ }) -#define ____PVOP_ALT_CALL(ret, op, alt, cond, clbr, call_clbr, \ +#define ____PVOP_ALT_CALL(ret, op, alt, cond, call_clbr, \ extra_clbr, ...) \ ({ \ PVOP_CALL_ARGS; \ PVOP_TEST_NULL(op); \ - asm volatile(ALTERNATIVE(paravirt_alt(PARAVIRT_CALL), \ - alt, cond) \ + asm volatile(ALTERNATIVE_2(PARAVIRT_CALL, \ + ALT_CALL_INSTR, ALT_CALL_ALWAYS, \ + alt, cond) \ : call_clbr, ASM_CALL_CONSTRAINT \ - : paravirt_type(op), \ - paravirt_clobber(clbr), \ + : paravirt_ptr(op), \ ##__VA_ARGS__ \ : "memory", "cc" extra_clbr); \ ret; \ }) #define __PVOP_CALL(rettype, op, ...) \ - ____PVOP_CALL(PVOP_RETVAL(rettype), op, CLBR_ANY, \ + ____PVOP_CALL(PVOP_RETVAL(rettype), op, \ PVOP_CALL_CLOBBERS, EXTRA_CLOBBERS, ##__VA_ARGS__) #define __PVOP_ALT_CALL(rettype, op, alt, cond, ...) \ - ____PVOP_ALT_CALL(PVOP_RETVAL(rettype), op, alt, cond, CLBR_ANY,\ + ____PVOP_ALT_CALL(PVOP_RETVAL(rettype), op, alt, cond, \ PVOP_CALL_CLOBBERS, EXTRA_CLOBBERS, \ ##__VA_ARGS__) #define __PVOP_CALLEESAVE(rettype, op, ...) \ - ____PVOP_CALL(PVOP_RETVAL(rettype), op.func, CLBR_RET_REG, \ + ____PVOP_CALL(PVOP_RETVAL(rettype), op.func, \ PVOP_CALLEE_CLOBBERS, , ##__VA_ARGS__) #define __PVOP_ALT_CALLEESAVE(rettype, op, alt, cond, ...) \ ____PVOP_ALT_CALL(PVOP_RETVAL(rettype), op.func, alt, cond, \ - CLBR_RET_REG, PVOP_CALLEE_CLOBBERS, , ##__VA_ARGS__) + PVOP_CALLEE_CLOBBERS, , ##__VA_ARGS__) #define __PVOP_VCALL(op, ...) \ - (void)____PVOP_CALL(, op, CLBR_ANY, PVOP_VCALL_CLOBBERS, \ + (void)____PVOP_CALL(, op, PVOP_VCALL_CLOBBERS, \ VEXTRA_CLOBBERS, ##__VA_ARGS__) #define __PVOP_ALT_VCALL(op, alt, cond, ...) 
\ - (void)____PVOP_ALT_CALL(, op, alt, cond, CLBR_ANY, \ + (void)____PVOP_ALT_CALL(, op, alt, cond, \ PVOP_VCALL_CLOBBERS, VEXTRA_CLOBBERS, \ ##__VA_ARGS__) #define __PVOP_VCALLEESAVE(op, ...) \ - (void)____PVOP_CALL(, op.func, CLBR_RET_REG, \ + (void)____PVOP_CALL(, op.func, \ PVOP_VCALLEE_CLOBBERS, , ##__VA_ARGS__) #define __PVOP_ALT_VCALLEESAVE(op, alt, cond, ...) \ - (void)____PVOP_ALT_CALL(, op.func, alt, cond, CLBR_RET_REG, \ + (void)____PVOP_ALT_CALL(, op.func, alt, cond, \ PVOP_VCALLEE_CLOBBERS, , ##__VA_ARGS__) @@ -561,38 +511,20 @@ int paravirt_disable_iospace(void); __PVOP_VCALL(op, PVOP_CALL_ARG1(arg1), PVOP_CALL_ARG2(arg2), \ PVOP_CALL_ARG3(arg3), PVOP_CALL_ARG4(arg4)) -/* Lazy mode for batching updates / context switch */ -enum paravirt_lazy_mode { - PARAVIRT_LAZY_NONE, - PARAVIRT_LAZY_MMU, - PARAVIRT_LAZY_CPU, -}; - -enum paravirt_lazy_mode paravirt_get_lazy_mode(void); -void paravirt_start_context_switch(struct task_struct *prev); -void paravirt_end_context_switch(struct task_struct *next); - -void paravirt_enter_lazy_mmu(void); -void paravirt_leave_lazy_mmu(void); -void paravirt_flush_lazy_mmu(void); - -void _paravirt_nop(void); -void paravirt_BUG(void); -u64 _paravirt_ident_64(u64); unsigned long paravirt_ret0(void); +#ifdef CONFIG_PARAVIRT_XXL +u64 _paravirt_ident_64(u64); +unsigned long pv_native_save_fl(void); +void pv_native_irq_disable(void); +void pv_native_irq_enable(void); +unsigned long pv_native_read_cr2(void); +#endif -#define paravirt_nop ((void *)_paravirt_nop) - -/* These all sit in the .parainstructions section to tell us what to patch. 
*/ -struct paravirt_patch_site { - u8 *instr; /* original instructions */ - u8 type; /* type of this instruction */ - u8 len; /* length of original instruction */ -}; +#define paravirt_nop ((void *)nop_func) -extern struct paravirt_patch_site __parainstructions[], - __parainstructions_end[]; +#endif /* __ASSEMBLER__ */ -#endif /* __ASSEMBLY__ */ +#define ALT_NOT_XEN ALT_NOT(X86_FEATURE_XENPV) +#endif /* CONFIG_PARAVIRT */ #endif /* _ASM_X86_PARAVIRT_TYPES_H */ diff --git a/arch/x86/include/asm/pci.h b/arch/x86/include/asm/pci.h index f3fd5928bcbb..b3ab80a03365 100644 --- a/arch/x86/include/asm/pci.h +++ b/arch/x86/include/asm/pci.h @@ -10,7 +10,6 @@ #include <linux/numa.h> #include <asm/io.h> #include <asm/memtype.h> -#include <asm/x86_init.h> struct pci_sysdata { int domain; /* PCI domain */ @@ -21,7 +20,7 @@ struct pci_sysdata { #ifdef CONFIG_X86_64 void *iommu; /* IOMMU private data */ #endif -#ifdef CONFIG_PCI_MSI_IRQ_DOMAIN +#ifdef CONFIG_PCI_MSI void *fwnode; /* IRQ domain for MSI assignment */ #endif #if IS_ENABLED(CONFIG_VMD) @@ -52,7 +51,7 @@ static inline int pci_proc_domain(struct pci_bus *bus) } #endif -#ifdef CONFIG_PCI_MSI_IRQ_DOMAIN +#ifdef CONFIG_PCI_MSI static inline void *_pci_root_bus_fwnode(struct pci_bus *bus) { return to_pci_sysdata(bus)->fwnode; @@ -92,6 +91,7 @@ void pcibios_scan_root(int bus); struct irq_routing_table *pcibios_get_irq_routing_table(void); int pcibios_set_irq_routing(struct pci_dev *dev, int pin, int irq); +bool pci_dev_has_default_msi_parent_domain(struct pci_dev *dev); #define HAVE_PCI_MMAP #define arch_can_pci_mmap_wc() pat_enabled() @@ -105,9 +105,6 @@ static inline void early_quirks(void) { } extern void pci_iommu_alloc(void); -/* generic pci stuff */ -#include <asm-generic/pci.h> - #ifdef CONFIG_NUMA /* Returns the node based on pci bus */ static inline int __pcibus_to_node(const struct pci_bus *bus) @@ -126,16 +123,4 @@ cpumask_of_pcibus(const struct pci_bus *bus) } #endif -struct pci_setup_rom { - struct setup_data 
data; - uint16_t vendor; - uint16_t devid; - uint64_t pcilen; - unsigned long segment; - unsigned long bus; - unsigned long device; - unsigned long function; - uint8_t romdata[]; -}; - #endif /* _ASM_X86_PCI_H */ diff --git a/arch/x86/include/asm/percpu.h b/arch/x86/include/asm/percpu.h index 13c0d63ed55e..b0d03b6c279b 100644 --- a/arch/x86/include/asm/percpu.h +++ b/arch/x86/include/asm/percpu.h @@ -3,131 +3,231 @@ #define _ASM_X86_PERCPU_H #ifdef CONFIG_X86_64 -#define __percpu_seg gs +# define __percpu_seg gs +# define __percpu_rel (%rip) #else -#define __percpu_seg fs +# define __percpu_seg fs +# define __percpu_rel #endif -#ifdef __ASSEMBLY__ +#ifdef __ASSEMBLER__ #ifdef CONFIG_SMP -#define PER_CPU_VAR(var) %__percpu_seg:var -#else /* ! SMP */ -#define PER_CPU_VAR(var) var -#endif /* SMP */ - -#ifdef CONFIG_X86_64_SMP -#define INIT_PER_CPU_VAR(var) init_per_cpu__##var +# define __percpu %__percpu_seg: #else -#define INIT_PER_CPU_VAR(var) var +# define __percpu #endif -#else /* ...!ASSEMBLY */ +#define PER_CPU_VAR(var) __percpu(var)__percpu_rel + +#else /* !__ASSEMBLER__: */ -#include <linux/kernel.h> +#include <linux/args.h> +#include <linux/build_bug.h> #include <linux/stringify.h> +#include <asm/asm.h> #ifdef CONFIG_SMP -#define __percpu_prefix "%%"__stringify(__percpu_seg)":" -#define __my_cpu_offset this_cpu_read(this_cpu_off) + +#define __force_percpu_prefix "%%"__stringify(__percpu_seg)":" + +#ifdef CONFIG_CC_HAS_NAMED_AS + +#ifdef __CHECKER__ +# define __seg_gs __attribute__((address_space(__seg_gs))) +# define __seg_fs __attribute__((address_space(__seg_fs))) +#endif + +#define __percpu_prefix +#define __percpu_seg_override CONCATENATE(__seg_, __percpu_seg) + +#else /* !CONFIG_CC_HAS_NAMED_AS: */ + +#define __percpu_prefix __force_percpu_prefix +#define __percpu_seg_override + +#endif /* CONFIG_CC_HAS_NAMED_AS */ /* * Compared to the generic __my_cpu_offset version, the following * saves one instruction and avoids clobbering a temp register.
*/ -#define arch_raw_cpu_ptr(ptr) \ -({ \ - unsigned long tcp_ptr__; \ - asm ("add " __percpu_arg(1) ", %0" \ - : "=r" (tcp_ptr__) \ - : "m" (this_cpu_off), "0" (ptr)); \ - (typeof(*(ptr)) __kernel __force *)tcp_ptr__; \ +#define __my_cpu_offset this_cpu_read(this_cpu_off) + +/* + * arch_raw_cpu_ptr should not be used in 32-bit VDSO for a 64-bit + * kernel, because games are played with CONFIG_X86_64 there and + * sizeof(this_cpu_off) becomes 4. + */ +#ifndef BUILD_VDSO32_64 +#define arch_raw_cpu_ptr(_ptr) \ +({ \ + unsigned long tcp_ptr__ = raw_cpu_read_long(this_cpu_off); \ + \ + tcp_ptr__ += (__force unsigned long)(_ptr); \ + (TYPEOF_UNQUAL(*(_ptr)) __force __kernel *)tcp_ptr__; \ }) #else -#define __percpu_prefix "" +#define arch_raw_cpu_ptr(_ptr) \ +({ \ + BUILD_BUG(); \ + (TYPEOF_UNQUAL(*(_ptr)) __force __kernel *)0; \ +}) #endif +#define PER_CPU_VAR(var) %__percpu_seg:(var)__percpu_rel + +#else /* !CONFIG_SMP: */ + +#define __force_percpu_prefix +#define __percpu_prefix +#define __percpu_seg_override + +#define PER_CPU_VAR(var) (var)__percpu_rel + +#endif /* CONFIG_SMP */ + +#if defined(CONFIG_USE_X86_SEG_SUPPORT) && defined(USE_TYPEOF_UNQUAL) +# define __my_cpu_type(var) typeof(var) +# define __my_cpu_ptr(ptr) (ptr) +# define __my_cpu_var(var) (var) + +# define __percpu_qual __percpu_seg_override +#else +# define __my_cpu_type(var) typeof(var) __percpu_seg_override +# define __my_cpu_ptr(ptr) (__my_cpu_type(*(ptr))*)(__force uintptr_t)(ptr) +# define __my_cpu_var(var) (*__my_cpu_ptr(&(var))) +#endif + +#define __force_percpu_arg(x) __force_percpu_prefix "%" #x #define __percpu_arg(x) __percpu_prefix "%" #x /* - * Initialized pointers to per-cpu variables needed for the boot - * processor need to use these macros to get the proper address - * offset from __per_cpu_load on SMP. - * - * There also must be an entry in vmlinux_64.lds.S + * For arch-specific code, we can use direct single-insn ops (they + * don't give an lvalue though).
*/ -#define DECLARE_INIT_PER_CPU(var) \ - extern typeof(var) init_per_cpu_var(var) -#ifdef CONFIG_X86_64_SMP -#define init_per_cpu_var(var) init_per_cpu__##var -#else -#define init_per_cpu_var(var) var -#endif +#define __pcpu_type_1 u8 +#define __pcpu_type_2 u16 +#define __pcpu_type_4 u32 +#define __pcpu_type_8 u64 + +#define __pcpu_cast_1(val) ((u8)(((unsigned long) val) & 0xff)) +#define __pcpu_cast_2(val) ((u16)(((unsigned long) val) & 0xffff)) +#define __pcpu_cast_4(val) ((u32)(((unsigned long) val) & 0xffffffff)) +#define __pcpu_cast_8(val) ((u64)(val)) + +#define __pcpu_op_1(op) op "b " +#define __pcpu_op_2(op) op "w " +#define __pcpu_op_4(op) op "l " +#define __pcpu_op_8(op) op "q " -/* For arch-specific code, we can use direct single-insn ops (they - * don't give an lvalue though). */ +#define __pcpu_reg_1(mod, x) mod "q" (x) +#define __pcpu_reg_2(mod, x) mod "r" (x) +#define __pcpu_reg_4(mod, x) mod "r" (x) +#define __pcpu_reg_8(mod, x) mod "r" (x) -#define __pcpu_type_1 u8 -#define __pcpu_type_2 u16 -#define __pcpu_type_4 u32 -#define __pcpu_type_8 u64 +#define __pcpu_reg_imm_1(x) "qi" (x) +#define __pcpu_reg_imm_2(x) "ri" (x) +#define __pcpu_reg_imm_4(x) "ri" (x) +#define __pcpu_reg_imm_8(x) "re" (x) -#define __pcpu_cast_1(val) ((u8)(((unsigned long) val) & 0xff)) -#define __pcpu_cast_2(val) ((u16)(((unsigned long) val) & 0xffff)) -#define __pcpu_cast_4(val) ((u32)(((unsigned long) val) & 0xffffffff)) -#define __pcpu_cast_8(val) ((u64)(val)) +#ifdef CONFIG_USE_X86_SEG_SUPPORT -#define __pcpu_op1_1(op, dst) op "b " dst -#define __pcpu_op1_2(op, dst) op "w " dst -#define __pcpu_op1_4(op, dst) op "l " dst -#define __pcpu_op1_8(op, dst) op "q " dst +#define __raw_cpu_read(size, qual, pcp) \ +({ \ + *(qual __my_cpu_type(pcp) *)__my_cpu_ptr(&(pcp)); \ +}) + +#define __raw_cpu_write(size, qual, pcp, val) \ +do { \ + *(qual __my_cpu_type(pcp) *)__my_cpu_ptr(&(pcp)) = (val); \ +} while (0) -#define __pcpu_op2_1(op, src, dst) op "b " src ", " dst -#define 
__pcpu_op2_2(op, src, dst) op "w " src ", " dst -#define __pcpu_op2_4(op, src, dst) op "l " src ", " dst -#define __pcpu_op2_8(op, src, dst) op "q " src ", " dst +#define __raw_cpu_read_const(pcp) __raw_cpu_read(, , pcp) -#define __pcpu_reg_1(mod, x) mod "q" (x) -#define __pcpu_reg_2(mod, x) mod "r" (x) -#define __pcpu_reg_4(mod, x) mod "r" (x) -#define __pcpu_reg_8(mod, x) mod "r" (x) +#else /* !CONFIG_USE_X86_SEG_SUPPORT: */ -#define __pcpu_reg_imm_1(x) "qi" (x) -#define __pcpu_reg_imm_2(x) "ri" (x) -#define __pcpu_reg_imm_4(x) "ri" (x) -#define __pcpu_reg_imm_8(x) "re" (x) +#define __raw_cpu_read(size, qual, _var) \ +({ \ + __pcpu_type_##size pfo_val__; \ + \ + asm qual (__pcpu_op_##size("mov") \ + __percpu_arg([var]) ", %[val]" \ + : [val] __pcpu_reg_##size("=", pfo_val__) \ + : [var] "m" (__my_cpu_var(_var))); \ + \ + (typeof(_var))(unsigned long) pfo_val__; \ +}) -#define percpu_to_op(size, qual, op, _var, _val) \ +#define __raw_cpu_write(size, qual, _var, _val) \ do { \ __pcpu_type_##size pto_val__ = __pcpu_cast_##size(_val); \ + \ if (0) { \ - typeof(_var) pto_tmp__; \ + TYPEOF_UNQUAL(_var) pto_tmp__; \ pto_tmp__ = (_val); \ (void)pto_tmp__; \ } \ - asm qual(__pcpu_op2_##size(op, "%[val]", __percpu_arg([var])) \ - : [var] "+m" (_var) \ + asm qual (__pcpu_op_##size("mov") "%[val], " \ + __percpu_arg([var]) \ + : [var] "=m" (__my_cpu_var(_var)) \ : [val] __pcpu_reg_imm_##size(pto_val__)); \ } while (0) +/* + * The generic per-CPU infrastructure is not suitable for + * reading const-qualified variables.
+ */ +#define __raw_cpu_read_const(pcp) ({ BUILD_BUG(); (typeof(pcp))0; }) + +#endif /* CONFIG_USE_X86_SEG_SUPPORT */ + +#define __raw_cpu_read_stable(size, _var) \ +({ \ + __pcpu_type_##size pfo_val__; \ + \ + asm(__pcpu_op_##size("mov") \ + __force_percpu_arg(a[var]) ", %[val]" \ + : [val] __pcpu_reg_##size("=", pfo_val__) \ + : [var] "i" (&(_var))); \ + \ + (typeof(_var))(unsigned long) pfo_val__; \ +}) + #define percpu_unary_op(size, qual, op, _var) \ ({ \ - asm qual (__pcpu_op1_##size(op, __percpu_arg([var])) \ - : [var] "+m" (_var)); \ + asm qual (__pcpu_op_##size(op) __percpu_arg([var]) \ + : [var] "+m" (__my_cpu_var(_var))); \ }) +#define percpu_binary_op(size, qual, op, _var, _val) \ +do { \ + __pcpu_type_##size pto_val__ = __pcpu_cast_##size(_val); \ + \ + if (0) { \ + TYPEOF_UNQUAL(_var) pto_tmp__; \ + pto_tmp__ = (_val); \ + (void)pto_tmp__; \ + } \ + asm qual (__pcpu_op_##size(op) "%[val], " __percpu_arg([var]) \ + : [var] "+m" (__my_cpu_var(_var)) \ + : [val] __pcpu_reg_imm_##size(pto_val__)); \ +} while (0) + /* - * Generate a percpu add to memory instruction and optimize code + * Generate a per-CPU add to memory instruction and optimize code * if one is added or subtracted. */ #define percpu_add_op(size, qual, var, val) \ do { \ - const int pao_ID__ = (__builtin_constant_p(val) && \ - ((val) == 1 || (val) == -1)) ? \ - (int)(val) : 0; \ + const int pao_ID__ = \ + (__builtin_constant_p(val) && \ + ((val) == 1 || \ + (val) == (typeof(val))-1)) ? 
(int)(val) : 0; \ + \ if (0) { \ - typeof(var) pao_tmp__; \ + TYPEOF_UNQUAL(var) pao_tmp__; \ pao_tmp__ = (val); \ (void)pao_tmp__; \ } \ @@ -136,263 +236,374 @@ do { \ else if (pao_ID__ == -1) \ percpu_unary_op(size, qual, "dec", var); \ else \ - percpu_to_op(size, qual, "add", var, val); \ + percpu_binary_op(size, qual, "add", var, val); \ } while (0) -#define percpu_from_op(size, qual, op, _var) \ -({ \ - __pcpu_type_##size pfo_val__; \ - asm qual (__pcpu_op2_##size(op, __percpu_arg([var]), "%[val]") \ - : [val] __pcpu_reg_##size("=", pfo_val__) \ - : [var] "m" (_var)); \ - (typeof(_var))(unsigned long) pfo_val__; \ -}) - -#define percpu_stable_op(size, op, _var) \ -({ \ - __pcpu_type_##size pfo_val__; \ - asm(__pcpu_op2_##size(op, __percpu_arg(P[var]), "%[val]") \ - : [val] __pcpu_reg_##size("=", pfo_val__) \ - : [var] "p" (&(_var))); \ - (typeof(_var))(unsigned long) pfo_val__; \ -}) - /* * Add return operation */ #define percpu_add_return_op(size, qual, _var, _val) \ ({ \ __pcpu_type_##size paro_tmp__ = __pcpu_cast_##size(_val); \ - asm qual (__pcpu_op2_##size("xadd", "%[tmp]", \ - __percpu_arg([var])) \ + \ + asm qual (__pcpu_op_##size("xadd") "%[tmp], " \ + __percpu_arg([var]) \ : [tmp] __pcpu_reg_##size("+", paro_tmp__), \ - [var] "+m" (_var) \ + [var] "+m" (__my_cpu_var(_var)) \ : : "memory"); \ (typeof(_var))(unsigned long) (paro_tmp__ + _val); \ }) /* - * xchg is implemented using cmpxchg without a lock prefix. xchg is - * expensive due to the implied lock prefix. The processor cannot prefetch - * cachelines if xchg is used. + * raw_cpu_xchg() can use a load-store since + * it is not required to be IRQ-safe. 
*/ -#define percpu_xchg_op(size, qual, _var, _nval) \ +#define raw_percpu_xchg_op(_var, _nval) \ ({ \ - __pcpu_type_##size pxo_old__; \ - __pcpu_type_##size pxo_new__ = __pcpu_cast_##size(_nval); \ - asm qual (__pcpu_op2_##size("mov", __percpu_arg([var]), \ - "%[oval]") \ - "\n1:\t" \ - __pcpu_op2_##size("cmpxchg", "%[nval]", \ - __percpu_arg([var])) \ - "\n\tjnz 1b" \ - : [oval] "=&a" (pxo_old__), \ - [var] "+m" (_var) \ - : [nval] __pcpu_reg_##size(, pxo_new__) \ - : "memory"); \ - (typeof(_var))(unsigned long) pxo_old__; \ + TYPEOF_UNQUAL(_var) pxo_old__ = raw_cpu_read(_var); \ + \ + raw_cpu_write(_var, _nval); \ + \ + pxo_old__; \ +}) + +/* + * this_cpu_xchg() is implemented using CMPXCHG without a LOCK prefix. + * XCHG is expensive due to the implied LOCK prefix. The processor + * cannot prefetch cachelines if XCHG is used. + */ +#define this_percpu_xchg_op(_var, _nval) \ +({ \ + TYPEOF_UNQUAL(_var) pxo_old__ = this_cpu_read(_var); \ + \ + do { } while (!this_cpu_try_cmpxchg(_var, &pxo_old__, _nval)); \ + \ + pxo_old__; \ }) /* - * cmpxchg has no such implied lock semantics as a result it is much - * more efficient for cpu local operations. + * CMPXCHG has no such implied lock semantics as a result it is much + * more efficient for CPU-local operations. */ #define percpu_cmpxchg_op(size, qual, _var, _oval, _nval) \ ({ \ __pcpu_type_##size pco_old__ = __pcpu_cast_##size(_oval); \ __pcpu_type_##size pco_new__ = __pcpu_cast_##size(_nval); \ - asm qual (__pcpu_op2_##size("cmpxchg", "%[nval]", \ - __percpu_arg([var])) \ + \ + asm qual (__pcpu_op_##size("cmpxchg") "%[nval], " \ + __percpu_arg([var]) \ : [oval] "+a" (pco_old__), \ - [var] "+m" (_var) \ + [var] "+m" (__my_cpu_var(_var)) \ : [nval] __pcpu_reg_##size(, pco_new__) \ : "memory"); \ + \ (typeof(_var))(unsigned long) pco_old__; \ }) -/* - * this_cpu_read() makes gcc load the percpu variable every time it is - * accessed while this_cpu_read_stable() allows the value to be cached. 
- * this_cpu_read_stable() is more efficient and can be used if its value - * is guaranteed to be valid across cpus. The current users include - * get_current() and get_thread_info() both of which are actually - * per-thread variables implemented as per-cpu variables and thus - * stable for the duration of the respective task. - */ -#define this_cpu_read_stable_1(pcp) percpu_stable_op(1, "mov", pcp) -#define this_cpu_read_stable_2(pcp) percpu_stable_op(2, "mov", pcp) -#define this_cpu_read_stable_4(pcp) percpu_stable_op(4, "mov", pcp) -#define this_cpu_read_stable_8(pcp) percpu_stable_op(8, "mov", pcp) -#define this_cpu_read_stable(pcp) __pcpu_size_call_return(this_cpu_read_stable_, pcp) - -#define raw_cpu_read_1(pcp) percpu_from_op(1, , "mov", pcp) -#define raw_cpu_read_2(pcp) percpu_from_op(2, , "mov", pcp) -#define raw_cpu_read_4(pcp) percpu_from_op(4, , "mov", pcp) - -#define raw_cpu_write_1(pcp, val) percpu_to_op(1, , "mov", (pcp), val) -#define raw_cpu_write_2(pcp, val) percpu_to_op(2, , "mov", (pcp), val) -#define raw_cpu_write_4(pcp, val) percpu_to_op(4, , "mov", (pcp), val) -#define raw_cpu_add_1(pcp, val) percpu_add_op(1, , (pcp), val) -#define raw_cpu_add_2(pcp, val) percpu_add_op(2, , (pcp), val) -#define raw_cpu_add_4(pcp, val) percpu_add_op(4, , (pcp), val) -#define raw_cpu_and_1(pcp, val) percpu_to_op(1, , "and", (pcp), val) -#define raw_cpu_and_2(pcp, val) percpu_to_op(2, , "and", (pcp), val) -#define raw_cpu_and_4(pcp, val) percpu_to_op(4, , "and", (pcp), val) -#define raw_cpu_or_1(pcp, val) percpu_to_op(1, , "or", (pcp), val) -#define raw_cpu_or_2(pcp, val) percpu_to_op(2, , "or", (pcp), val) -#define raw_cpu_or_4(pcp, val) percpu_to_op(4, , "or", (pcp), val) +#define percpu_try_cmpxchg_op(size, qual, _var, _ovalp, _nval) \ +({ \ + bool success; \ + __pcpu_type_##size *pco_oval__ = (__pcpu_type_##size *)(_ovalp); \ + __pcpu_type_##size pco_old__ = *pco_oval__; \ + __pcpu_type_##size pco_new__ = __pcpu_cast_##size(_nval); \ + \ + asm qual 
(__pcpu_op_##size("cmpxchg") "%[nval], " \ + __percpu_arg([var]) \ + CC_SET(z) \ + : CC_OUT(z) (success), \ + [oval] "+a" (pco_old__), \ + [var] "+m" (__my_cpu_var(_var)) \ + : [nval] __pcpu_reg_##size(, pco_new__) \ + : "memory"); \ + if (unlikely(!success)) \ + *pco_oval__ = pco_old__; \ + \ + likely(success); \ +}) -/* - * raw_cpu_xchg() can use a load-store since it is not required to be - * IRQ-safe. - */ -#define raw_percpu_xchg_op(var, nval) \ +#if defined(CONFIG_X86_32) && !defined(CONFIG_UML) + +#define percpu_cmpxchg64_op(size, qual, _var, _oval, _nval) \ ({ \ - typeof(var) pxo_ret__ = raw_cpu_read(var); \ - raw_cpu_write(var, (nval)); \ - pxo_ret__; \ + union { \ + u64 var; \ + struct { \ + u32 low, high; \ + }; \ + } old__, new__; \ + \ + old__.var = _oval; \ + new__.var = _nval; \ + \ + asm_inline qual ( \ + ALTERNATIVE("call this_cpu_cmpxchg8b_emu", \ + "cmpxchg8b " __percpu_arg([var]), X86_FEATURE_CX8) \ + : ALT_OUTPUT_SP([var] "+m" (__my_cpu_var(_var)), \ + "+a" (old__.low), "+d" (old__.high)) \ + : "b" (new__.low), "c" (new__.high), \ + "S" (&(_var)) \ + : "memory"); \ + \ + old__.var; \ }) -#define raw_cpu_xchg_1(pcp, val) raw_percpu_xchg_op(pcp, val) -#define raw_cpu_xchg_2(pcp, val) raw_percpu_xchg_op(pcp, val) -#define raw_cpu_xchg_4(pcp, val) raw_percpu_xchg_op(pcp, val) - -#define this_cpu_read_1(pcp) percpu_from_op(1, volatile, "mov", pcp) -#define this_cpu_read_2(pcp) percpu_from_op(2, volatile, "mov", pcp) -#define this_cpu_read_4(pcp) percpu_from_op(4, volatile, "mov", pcp) -#define this_cpu_write_1(pcp, val) percpu_to_op(1, volatile, "mov", (pcp), val) -#define this_cpu_write_2(pcp, val) percpu_to_op(2, volatile, "mov", (pcp), val) -#define this_cpu_write_4(pcp, val) percpu_to_op(4, volatile, "mov", (pcp), val) -#define this_cpu_add_1(pcp, val) percpu_add_op(1, volatile, (pcp), val) -#define this_cpu_add_2(pcp, val) percpu_add_op(2, volatile, (pcp), val) -#define this_cpu_add_4(pcp, val) percpu_add_op(4, volatile, (pcp), val) -#define 
this_cpu_and_1(pcp, val) percpu_to_op(1, volatile, "and", (pcp), val) -#define this_cpu_and_2(pcp, val) percpu_to_op(2, volatile, "and", (pcp), val) -#define this_cpu_and_4(pcp, val) percpu_to_op(4, volatile, "and", (pcp), val) -#define this_cpu_or_1(pcp, val) percpu_to_op(1, volatile, "or", (pcp), val) -#define this_cpu_or_2(pcp, val) percpu_to_op(2, volatile, "or", (pcp), val) -#define this_cpu_or_4(pcp, val) percpu_to_op(4, volatile, "or", (pcp), val) -#define this_cpu_xchg_1(pcp, nval) percpu_xchg_op(1, volatile, pcp, nval) -#define this_cpu_xchg_2(pcp, nval) percpu_xchg_op(2, volatile, pcp, nval) -#define this_cpu_xchg_4(pcp, nval) percpu_xchg_op(4, volatile, pcp, nval) - -#define raw_cpu_add_return_1(pcp, val) percpu_add_return_op(1, , pcp, val) -#define raw_cpu_add_return_2(pcp, val) percpu_add_return_op(2, , pcp, val) -#define raw_cpu_add_return_4(pcp, val) percpu_add_return_op(4, , pcp, val) -#define raw_cpu_cmpxchg_1(pcp, oval, nval) percpu_cmpxchg_op(1, , pcp, oval, nval) -#define raw_cpu_cmpxchg_2(pcp, oval, nval) percpu_cmpxchg_op(2, , pcp, oval, nval) -#define raw_cpu_cmpxchg_4(pcp, oval, nval) percpu_cmpxchg_op(4, , pcp, oval, nval) - -#define this_cpu_add_return_1(pcp, val) percpu_add_return_op(1, volatile, pcp, val) -#define this_cpu_add_return_2(pcp, val) percpu_add_return_op(2, volatile, pcp, val) -#define this_cpu_add_return_4(pcp, val) percpu_add_return_op(4, volatile, pcp, val) -#define this_cpu_cmpxchg_1(pcp, oval, nval) percpu_cmpxchg_op(1, volatile, pcp, oval, nval) -#define this_cpu_cmpxchg_2(pcp, oval, nval) percpu_cmpxchg_op(2, volatile, pcp, oval, nval) -#define this_cpu_cmpxchg_4(pcp, oval, nval) percpu_cmpxchg_op(4, volatile, pcp, oval, nval) - -#ifdef CONFIG_X86_CMPXCHG64 -#define percpu_cmpxchg8b_double(pcp1, pcp2, o1, o2, n1, n2) \ +#define raw_cpu_cmpxchg64(pcp, oval, nval) percpu_cmpxchg64_op(8, , pcp, oval, nval) +#define this_cpu_cmpxchg64(pcp, oval, nval) percpu_cmpxchg64_op(8, volatile, pcp, oval, nval) + +#define 
percpu_try_cmpxchg64_op(size, qual, _var, _ovalp, _nval) \ ({ \ - bool __ret; \ - typeof(pcp1) __o1 = (o1), __n1 = (n1); \ - typeof(pcp2) __o2 = (o2), __n2 = (n2); \ - asm volatile("cmpxchg8b "__percpu_arg(1) \ - CC_SET(z) \ - : CC_OUT(z) (__ret), "+m" (pcp1), "+m" (pcp2), "+a" (__o1), "+d" (__o2) \ - : "b" (__n1), "c" (__n2)); \ - __ret; \ + bool success; \ + u64 *_oval = (u64 *)(_ovalp); \ + union { \ + u64 var; \ + struct { \ + u32 low, high; \ + }; \ + } old__, new__; \ + \ + old__.var = *_oval; \ + new__.var = _nval; \ + \ + asm_inline qual ( \ + ALTERNATIVE("call this_cpu_cmpxchg8b_emu", \ + "cmpxchg8b " __percpu_arg([var]), X86_FEATURE_CX8) \ + CC_SET(z) \ + : ALT_OUTPUT_SP(CC_OUT(z) (success), \ + [var] "+m" (__my_cpu_var(_var)), \ + "+a" (old__.low), "+d" (old__.high)) \ + : "b" (new__.low), "c" (new__.high), \ + "S" (&(_var)) \ + : "memory"); \ + if (unlikely(!success)) \ + *_oval = old__.var; \ + \ + likely(success); \ }) -#define raw_cpu_cmpxchg_double_4 percpu_cmpxchg8b_double -#define this_cpu_cmpxchg_double_4 percpu_cmpxchg8b_double -#endif /* CONFIG_X86_CMPXCHG64 */ +#define raw_cpu_try_cmpxchg64(pcp, ovalp, nval) percpu_try_cmpxchg64_op(8, , pcp, ovalp, nval) +#define this_cpu_try_cmpxchg64(pcp, ovalp, nval) percpu_try_cmpxchg64_op(8, volatile, pcp, ovalp, nval) + +#endif /* defined(CONFIG_X86_32) && !defined(CONFIG_UML) */ -/* - * Per cpu atomic 64 bit operations are only available under 64 bit. - * 32 bit must fall back to generic operations. 
- */ #ifdef CONFIG_X86_64 -#define raw_cpu_read_8(pcp) percpu_from_op(8, , "mov", pcp) -#define raw_cpu_write_8(pcp, val) percpu_to_op(8, , "mov", (pcp), val) -#define raw_cpu_add_8(pcp, val) percpu_add_op(8, , (pcp), val) -#define raw_cpu_and_8(pcp, val) percpu_to_op(8, , "and", (pcp), val) -#define raw_cpu_or_8(pcp, val) percpu_to_op(8, , "or", (pcp), val) -#define raw_cpu_add_return_8(pcp, val) percpu_add_return_op(8, , pcp, val) -#define raw_cpu_xchg_8(pcp, nval) raw_percpu_xchg_op(pcp, nval) -#define raw_cpu_cmpxchg_8(pcp, oval, nval) percpu_cmpxchg_op(8, , pcp, oval, nval) - -#define this_cpu_read_8(pcp) percpu_from_op(8, volatile, "mov", pcp) -#define this_cpu_write_8(pcp, val) percpu_to_op(8, volatile, "mov", (pcp), val) -#define this_cpu_add_8(pcp, val) percpu_add_op(8, volatile, (pcp), val) -#define this_cpu_and_8(pcp, val) percpu_to_op(8, volatile, "and", (pcp), val) -#define this_cpu_or_8(pcp, val) percpu_to_op(8, volatile, "or", (pcp), val) -#define this_cpu_add_return_8(pcp, val) percpu_add_return_op(8, volatile, pcp, val) -#define this_cpu_xchg_8(pcp, nval) percpu_xchg_op(8, volatile, pcp, nval) -#define this_cpu_cmpxchg_8(pcp, oval, nval) percpu_cmpxchg_op(8, volatile, pcp, oval, nval) +#define raw_cpu_cmpxchg64(pcp, oval, nval) percpu_cmpxchg_op(8, , pcp, oval, nval); +#define this_cpu_cmpxchg64(pcp, oval, nval) percpu_cmpxchg_op(8, volatile, pcp, oval, nval); -/* - * Pretty complex macro to generate cmpxchg16 instruction. The instruction - * is not supported on early AMD64 processors so we must be able to emulate - * it in software. The address used in the cmpxchg16 instruction must be - * aligned to a 16 byte boundary. 
- */ -#define percpu_cmpxchg16b_double(pcp1, pcp2, o1, o2, n1, n2) \ +#define raw_cpu_try_cmpxchg64(pcp, ovalp, nval) percpu_try_cmpxchg_op(8, , pcp, ovalp, nval); +#define this_cpu_try_cmpxchg64(pcp, ovalp, nval) percpu_try_cmpxchg_op(8, volatile, pcp, ovalp, nval); + +#define percpu_cmpxchg128_op(size, qual, _var, _oval, _nval) \ ({ \ - bool __ret; \ - typeof(pcp1) __o1 = (o1), __n1 = (n1); \ - typeof(pcp2) __o2 = (o2), __n2 = (n2); \ - alternative_io("leaq %P1,%%rsi\n\tcall this_cpu_cmpxchg16b_emu\n\t", \ - "cmpxchg16b " __percpu_arg(1) "\n\tsetz %0\n\t", \ - X86_FEATURE_CX16, \ - ASM_OUTPUT2("=a" (__ret), "+m" (pcp1), \ - "+m" (pcp2), "+d" (__o2)), \ - "b" (__n1), "c" (__n2), "a" (__o1) : "rsi"); \ - __ret; \ + union { \ + u128 var; \ + struct { \ + u64 low, high; \ + }; \ + } old__, new__; \ + \ + old__.var = _oval; \ + new__.var = _nval; \ + \ + asm_inline qual ( \ + ALTERNATIVE("call this_cpu_cmpxchg16b_emu", \ + "cmpxchg16b " __percpu_arg([var]), X86_FEATURE_CX16) \ + : ALT_OUTPUT_SP([var] "+m" (__my_cpu_var(_var)), \ + "+a" (old__.low), "+d" (old__.high)) \ + : "b" (new__.low), "c" (new__.high), \ + "S" (&(_var)) \ + : "memory"); \ + \ + old__.var; \ }) -#define raw_cpu_cmpxchg_double_8 percpu_cmpxchg16b_double -#define this_cpu_cmpxchg_double_8 percpu_cmpxchg16b_double +#define raw_cpu_cmpxchg128(pcp, oval, nval) percpu_cmpxchg128_op(16, , pcp, oval, nval) +#define this_cpu_cmpxchg128(pcp, oval, nval) percpu_cmpxchg128_op(16, volatile, pcp, oval, nval) -#endif +#define percpu_try_cmpxchg128_op(size, qual, _var, _ovalp, _nval) \ +({ \ + bool success; \ + u128 *_oval = (u128 *)(_ovalp); \ + union { \ + u128 var; \ + struct { \ + u64 low, high; \ + }; \ + } old__, new__; \ + \ + old__.var = *_oval; \ + new__.var = _nval; \ + \ + asm_inline qual ( \ + ALTERNATIVE("call this_cpu_cmpxchg16b_emu", \ + "cmpxchg16b " __percpu_arg([var]), X86_FEATURE_CX16) \ + CC_SET(z) \ + : ALT_OUTPUT_SP(CC_OUT(z) (success), \ + [var] "+m" (__my_cpu_var(_var)), \ + "+a" 
(old__.low), "+d" (old__.high)) \ + : "b" (new__.low), "c" (new__.high), \ + "S" (&(_var)) \ + : "memory"); \ + if (unlikely(!success)) \ + *_oval = old__.var; \ + \ + likely(success); \ +}) -static __always_inline bool x86_this_cpu_constant_test_bit(unsigned int nr, - const unsigned long __percpu *addr) -{ - unsigned long __percpu *a = - (unsigned long __percpu *)addr + nr / BITS_PER_LONG; +#define raw_cpu_try_cmpxchg128(pcp, ovalp, nval) percpu_try_cmpxchg128_op(16, , pcp, ovalp, nval) +#define this_cpu_try_cmpxchg128(pcp, ovalp, nval) percpu_try_cmpxchg128_op(16, volatile, pcp, ovalp, nval) + +#endif /* CONFIG_X86_64 */ + +#define raw_cpu_read_1(pcp) __raw_cpu_read(1, , pcp) +#define raw_cpu_read_2(pcp) __raw_cpu_read(2, , pcp) +#define raw_cpu_read_4(pcp) __raw_cpu_read(4, , pcp) +#define raw_cpu_write_1(pcp, val) __raw_cpu_write(1, , pcp, val) +#define raw_cpu_write_2(pcp, val) __raw_cpu_write(2, , pcp, val) +#define raw_cpu_write_4(pcp, val) __raw_cpu_write(4, , pcp, val) + +#define this_cpu_read_1(pcp) __raw_cpu_read(1, volatile, pcp) +#define this_cpu_read_2(pcp) __raw_cpu_read(2, volatile, pcp) +#define this_cpu_read_4(pcp) __raw_cpu_read(4, volatile, pcp) +#define this_cpu_write_1(pcp, val) __raw_cpu_write(1, volatile, pcp, val) +#define this_cpu_write_2(pcp, val) __raw_cpu_write(2, volatile, pcp, val) +#define this_cpu_write_4(pcp, val) __raw_cpu_write(4, volatile, pcp, val) + +#define this_cpu_read_stable_1(pcp) __raw_cpu_read_stable(1, pcp) +#define this_cpu_read_stable_2(pcp) __raw_cpu_read_stable(2, pcp) +#define this_cpu_read_stable_4(pcp) __raw_cpu_read_stable(4, pcp) + +#define raw_cpu_add_1(pcp, val) percpu_add_op(1, , (pcp), val) +#define raw_cpu_add_2(pcp, val) percpu_add_op(2, , (pcp), val) +#define raw_cpu_add_4(pcp, val) percpu_add_op(4, , (pcp), val) +#define raw_cpu_and_1(pcp, val) percpu_binary_op(1, , "and", (pcp), val) +#define raw_cpu_and_2(pcp, val) percpu_binary_op(2, , "and", (pcp), val) +#define raw_cpu_and_4(pcp, val) 
percpu_binary_op(4, , "and", (pcp), val) +#define raw_cpu_or_1(pcp, val) percpu_binary_op(1, , "or", (pcp), val) +#define raw_cpu_or_2(pcp, val) percpu_binary_op(2, , "or", (pcp), val) +#define raw_cpu_or_4(pcp, val) percpu_binary_op(4, , "or", (pcp), val) +#define raw_cpu_xchg_1(pcp, val) raw_percpu_xchg_op(pcp, val) +#define raw_cpu_xchg_2(pcp, val) raw_percpu_xchg_op(pcp, val) +#define raw_cpu_xchg_4(pcp, val) raw_percpu_xchg_op(pcp, val) + +#define this_cpu_add_1(pcp, val) percpu_add_op(1, volatile, (pcp), val) +#define this_cpu_add_2(pcp, val) percpu_add_op(2, volatile, (pcp), val) +#define this_cpu_add_4(pcp, val) percpu_add_op(4, volatile, (pcp), val) +#define this_cpu_and_1(pcp, val) percpu_binary_op(1, volatile, "and", (pcp), val) +#define this_cpu_and_2(pcp, val) percpu_binary_op(2, volatile, "and", (pcp), val) +#define this_cpu_and_4(pcp, val) percpu_binary_op(4, volatile, "and", (pcp), val) +#define this_cpu_or_1(pcp, val) percpu_binary_op(1, volatile, "or", (pcp), val) +#define this_cpu_or_2(pcp, val) percpu_binary_op(2, volatile, "or", (pcp), val) +#define this_cpu_or_4(pcp, val) percpu_binary_op(4, volatile, "or", (pcp), val) +#define this_cpu_xchg_1(pcp, nval) this_percpu_xchg_op(pcp, nval) +#define this_cpu_xchg_2(pcp, nval) this_percpu_xchg_op(pcp, nval) +#define this_cpu_xchg_4(pcp, nval) this_percpu_xchg_op(pcp, nval) + +#define raw_cpu_add_return_1(pcp, val) percpu_add_return_op(1, , pcp, val) +#define raw_cpu_add_return_2(pcp, val) percpu_add_return_op(2, , pcp, val) +#define raw_cpu_add_return_4(pcp, val) percpu_add_return_op(4, , pcp, val) +#define raw_cpu_cmpxchg_1(pcp, oval, nval) percpu_cmpxchg_op(1, , pcp, oval, nval) +#define raw_cpu_cmpxchg_2(pcp, oval, nval) percpu_cmpxchg_op(2, , pcp, oval, nval) +#define raw_cpu_cmpxchg_4(pcp, oval, nval) percpu_cmpxchg_op(4, , pcp, oval, nval) +#define raw_cpu_try_cmpxchg_1(pcp, ovalp, nval) percpu_try_cmpxchg_op(1, , pcp, ovalp, nval) +#define raw_cpu_try_cmpxchg_2(pcp, ovalp, nval) 
percpu_try_cmpxchg_op(2, , pcp, ovalp, nval) +#define raw_cpu_try_cmpxchg_4(pcp, ovalp, nval) percpu_try_cmpxchg_op(4, , pcp, ovalp, nval) + +#define this_cpu_add_return_1(pcp, val) percpu_add_return_op(1, volatile, pcp, val) +#define this_cpu_add_return_2(pcp, val) percpu_add_return_op(2, volatile, pcp, val) +#define this_cpu_add_return_4(pcp, val) percpu_add_return_op(4, volatile, pcp, val) +#define this_cpu_cmpxchg_1(pcp, oval, nval) percpu_cmpxchg_op(1, volatile, pcp, oval, nval) +#define this_cpu_cmpxchg_2(pcp, oval, nval) percpu_cmpxchg_op(2, volatile, pcp, oval, nval) +#define this_cpu_cmpxchg_4(pcp, oval, nval) percpu_cmpxchg_op(4, volatile, pcp, oval, nval) +#define this_cpu_try_cmpxchg_1(pcp, ovalp, nval) percpu_try_cmpxchg_op(1, volatile, pcp, ovalp, nval) +#define this_cpu_try_cmpxchg_2(pcp, ovalp, nval) percpu_try_cmpxchg_op(2, volatile, pcp, ovalp, nval) +#define this_cpu_try_cmpxchg_4(pcp, ovalp, nval) percpu_try_cmpxchg_op(4, volatile, pcp, ovalp, nval) +/* + * Per-CPU atomic 64-bit operations are only available under 64-bit kernels. + * 32-bit kernels must fall back to generic operations. 
+ */ #ifdef CONFIG_X86_64 - return ((1UL << (nr % BITS_PER_LONG)) & raw_cpu_read_8(*a)) != 0; -#else - return ((1UL << (nr % BITS_PER_LONG)) & raw_cpu_read_4(*a)) != 0; -#endif -} -static inline bool x86_this_cpu_variable_test_bit(int nr, - const unsigned long __percpu *addr) -{ - bool oldbit; +#define raw_cpu_read_8(pcp) __raw_cpu_read(8, , pcp) +#define raw_cpu_write_8(pcp, val) __raw_cpu_write(8, , pcp, val) + +#define this_cpu_read_8(pcp) __raw_cpu_read(8, volatile, pcp) +#define this_cpu_write_8(pcp, val) __raw_cpu_write(8, volatile, pcp, val) + +#define this_cpu_read_stable_8(pcp) __raw_cpu_read_stable(8, pcp) + +#define raw_cpu_add_8(pcp, val) percpu_add_op(8, , (pcp), val) +#define raw_cpu_and_8(pcp, val) percpu_binary_op(8, , "and", (pcp), val) +#define raw_cpu_or_8(pcp, val) percpu_binary_op(8, , "or", (pcp), val) +#define raw_cpu_add_return_8(pcp, val) percpu_add_return_op(8, , pcp, val) +#define raw_cpu_xchg_8(pcp, nval) raw_percpu_xchg_op(pcp, nval) +#define raw_cpu_cmpxchg_8(pcp, oval, nval) percpu_cmpxchg_op(8, , pcp, oval, nval) +#define raw_cpu_try_cmpxchg_8(pcp, ovalp, nval) percpu_try_cmpxchg_op(8, , pcp, ovalp, nval) + +#define this_cpu_add_8(pcp, val) percpu_add_op(8, volatile, (pcp), val) +#define this_cpu_and_8(pcp, val) percpu_binary_op(8, volatile, "and", (pcp), val) +#define this_cpu_or_8(pcp, val) percpu_binary_op(8, volatile, "or", (pcp), val) +#define this_cpu_add_return_8(pcp, val) percpu_add_return_op(8, volatile, pcp, val) +#define this_cpu_xchg_8(pcp, nval) this_percpu_xchg_op(pcp, nval) +#define this_cpu_cmpxchg_8(pcp, oval, nval) percpu_cmpxchg_op(8, volatile, pcp, oval, nval) +#define this_cpu_try_cmpxchg_8(pcp, ovalp, nval) percpu_try_cmpxchg_op(8, volatile, pcp, ovalp, nval) + +#define raw_cpu_read_long(pcp) raw_cpu_read_8(pcp) - asm volatile("btl "__percpu_arg(2)",%1" - CC_SET(c) - : CC_OUT(c) (oldbit) - : "m" (*(unsigned long __percpu *)addr), "Ir" (nr)); +#else /* !CONFIG_X86_64: */ - return oldbit; -} +/* There is no 
generic 64-bit read stable operation for 32-bit targets. */ +#define this_cpu_read_stable_8(pcp) ({ BUILD_BUG(); (typeof(pcp))0; }) -#define x86_this_cpu_test_bit(nr, addr) \ - (__builtin_constant_p((nr)) \ - ? x86_this_cpu_constant_test_bit((nr), (addr)) \ - : x86_this_cpu_variable_test_bit((nr), (addr))) +#define raw_cpu_read_long(pcp) raw_cpu_read_4(pcp) + +#endif /* CONFIG_X86_64 */ + +#define this_cpu_read_const(pcp) __raw_cpu_read_const(pcp) + +/* + * this_cpu_read() makes the compiler load the per-CPU variable every time + * it is accessed while this_cpu_read_stable() allows the value to be cached. + * this_cpu_read_stable() is more efficient and can be used if its value + * is guaranteed to be valid across CPUs. The current users include + * current_task and cpu_current_top_of_stack, both of which are + * actually per-thread variables implemented as per-CPU variables and + * thus stable for the duration of the respective task. + */ +#define this_cpu_read_stable(pcp) __pcpu_size_call_return(this_cpu_read_stable_, pcp) + +#define x86_this_cpu_constant_test_bit(_nr, _var) \ +({ \ + unsigned long __percpu *addr__ = \ + (unsigned long __percpu *)&(_var) + ((_nr) / BITS_PER_LONG); \ + \ + !!((1UL << ((_nr) % BITS_PER_LONG)) & raw_cpu_read(*addr__)); \ +}) + +#define x86_this_cpu_variable_test_bit(_nr, _var) \ +({ \ + bool oldbit; \ + \ + asm volatile("btl %[nr], " __percpu_arg([var]) \ + CC_SET(c) \ + : CC_OUT(c) (oldbit) \ + : [var] "m" (__my_cpu_var(_var)), \ + [nr] "rI" (_nr)); \ + oldbit; \ +}) + +#define x86_this_cpu_test_bit(_nr, _var) \ + (__builtin_constant_p(_nr) \ + ? x86_this_cpu_constant_test_bit(_nr, _var) \ + : x86_this_cpu_variable_test_bit(_nr, _var)) #include <asm-generic/percpu.h> /* We can use this directly for local CPU (faster). 
*/ -DECLARE_PER_CPU_READ_MOSTLY(unsigned long, this_cpu_off); +DECLARE_PER_CPU_CACHE_HOT(unsigned long, this_cpu_off); -#endif /* !__ASSEMBLY__ */ +#endif /* !__ASSEMBLER__ */ #ifdef CONFIG_SMP @@ -414,46 +625,47 @@ DECLARE_PER_CPU_READ_MOSTLY(unsigned long, this_cpu_off); { [0 ... NR_CPUS-1] = _initvalue }; \ __typeof__(_type) *_name##_early_ptr __refdata = _name##_early_map -#define EXPORT_EARLY_PER_CPU_SYMBOL(_name) \ +#define EXPORT_EARLY_PER_CPU_SYMBOL(_name) \ EXPORT_PER_CPU_SYMBOL(_name) -#define DECLARE_EARLY_PER_CPU(_type, _name) \ - DECLARE_PER_CPU(_type, _name); \ - extern __typeof__(_type) *_name##_early_ptr; \ +#define DECLARE_EARLY_PER_CPU(_type, _name) \ + DECLARE_PER_CPU(_type, _name); \ + extern __typeof__(_type) *_name##_early_ptr; \ extern __typeof__(_type) _name##_early_map[] -#define DECLARE_EARLY_PER_CPU_READ_MOSTLY(_type, _name) \ - DECLARE_PER_CPU_READ_MOSTLY(_type, _name); \ - extern __typeof__(_type) *_name##_early_ptr; \ +#define DECLARE_EARLY_PER_CPU_READ_MOSTLY(_type, _name) \ + DECLARE_PER_CPU_READ_MOSTLY(_type, _name); \ + extern __typeof__(_type) *_name##_early_ptr; \ extern __typeof__(_type) _name##_early_map[] -#define early_per_cpu_ptr(_name) (_name##_early_ptr) -#define early_per_cpu_map(_name, _idx) (_name##_early_map[_idx]) -#define early_per_cpu(_name, _cpu) \ - *(early_per_cpu_ptr(_name) ? \ - &early_per_cpu_ptr(_name)[_cpu] : \ +#define early_per_cpu_ptr(_name) (_name##_early_ptr) +#define early_per_cpu_map(_name, _idx) (_name##_early_map[_idx]) + +#define early_per_cpu(_name, _cpu) \ + *(early_per_cpu_ptr(_name) ? 
\ + &early_per_cpu_ptr(_name)[_cpu] : \ &per_cpu(_name, _cpu)) -#else /* !CONFIG_SMP */ -#define DEFINE_EARLY_PER_CPU(_type, _name, _initvalue) \ +#else /* !CONFIG_SMP: */ +#define DEFINE_EARLY_PER_CPU(_type, _name, _initvalue) \ DEFINE_PER_CPU(_type, _name) = _initvalue #define DEFINE_EARLY_PER_CPU_READ_MOSTLY(_type, _name, _initvalue) \ DEFINE_PER_CPU_READ_MOSTLY(_type, _name) = _initvalue -#define EXPORT_EARLY_PER_CPU_SYMBOL(_name) \ +#define EXPORT_EARLY_PER_CPU_SYMBOL(_name) \ EXPORT_PER_CPU_SYMBOL(_name) -#define DECLARE_EARLY_PER_CPU(_type, _name) \ +#define DECLARE_EARLY_PER_CPU(_type, _name) \ DECLARE_PER_CPU(_type, _name) -#define DECLARE_EARLY_PER_CPU_READ_MOSTLY(_type, _name) \ +#define DECLARE_EARLY_PER_CPU_READ_MOSTLY(_type, _name) \ DECLARE_PER_CPU_READ_MOSTLY(_type, _name) -#define early_per_cpu(_name, _cpu) per_cpu(_name, _cpu) -#define early_per_cpu_ptr(_name) NULL +#define early_per_cpu(_name, _cpu) per_cpu(_name, _cpu) +#define early_per_cpu_ptr(_name) NULL /* no early_per_cpu_map() */ -#endif /* !CONFIG_SMP */ +#endif /* !CONFIG_SMP */ #endif /* _ASM_X86_PERCPU_H */ diff --git a/arch/x86/include/asm/perf_event.h b/arch/x86/include/asm/perf_event.h index 34348ae41cdb..70d1d94aca7e 100644 --- a/arch/x86/include/asm/perf_event.h +++ b/arch/x86/include/asm/perf_event.h @@ -31,12 +31,26 @@ #define ARCH_PERFMON_EVENTSEL_ENABLE (1ULL << 22) #define ARCH_PERFMON_EVENTSEL_INV (1ULL << 23) #define ARCH_PERFMON_EVENTSEL_CMASK 0xFF000000ULL +#define ARCH_PERFMON_EVENTSEL_BR_CNTR (1ULL << 35) +#define ARCH_PERFMON_EVENTSEL_EQ (1ULL << 36) +#define ARCH_PERFMON_EVENTSEL_UMASK2 (0xFFULL << 40) + +#define INTEL_FIXED_BITS_MASK 0xFULL +#define INTEL_FIXED_BITS_STRIDE 4 +#define INTEL_FIXED_0_KERNEL (1ULL << 0) +#define INTEL_FIXED_0_USER (1ULL << 1) +#define INTEL_FIXED_0_ANYTHREAD (1ULL << 2) +#define INTEL_FIXED_0_ENABLE_PMI (1ULL << 3) +#define INTEL_FIXED_3_METRICS_CLEAR (1ULL << 2) #define HSW_IN_TX (1ULL << 32) #define HSW_IN_TX_CHECKPOINTED (1ULL << 33) 
#define ICL_EVENTSEL_ADAPTIVE (1ULL << 34) #define ICL_FIXED_0_ADAPTIVE (1ULL << 32) +#define intel_fixed_bits_by_idx(_idx, _bits) \ + ((_bits) << ((_idx) * INTEL_FIXED_BITS_STRIDE)) + #define AMD64_EVENTSEL_INT_CORE_ENABLE (1ULL << 36) #define AMD64_EVENTSEL_GUESTONLY (1ULL << 40) #define AMD64_EVENTSEL_HOSTONLY (1ULL << 41) @@ -102,6 +116,13 @@ (AMD64_PERFMON_V2_EVENTSEL_EVENT_NB | \ AMD64_PERFMON_V2_EVENTSEL_UMASK_NB) +#define AMD64_PERFMON_V2_ENABLE_UMC BIT_ULL(31) +#define AMD64_PERFMON_V2_EVENTSEL_EVENT_UMC GENMASK_ULL(7, 0) +#define AMD64_PERFMON_V2_EVENTSEL_RDWRMASK_UMC GENMASK_ULL(9, 8) +#define AMD64_PERFMON_V2_RAW_EVENT_MASK_UMC \ + (AMD64_PERFMON_V2_EVENTSEL_EVENT_UMC | \ + AMD64_PERFMON_V2_EVENTSEL_RDWRMASK_UMC) + #define AMD64_NUM_COUNTERS 4 #define AMD64_NUM_COUNTERS_CORE 6 #define AMD64_NUM_COUNTERS_NB 4 @@ -120,6 +141,15 @@ #define PEBS_DATACFG_XMMS BIT_ULL(2) #define PEBS_DATACFG_LBRS BIT_ULL(3) #define PEBS_DATACFG_LBR_SHIFT 24 +#define PEBS_DATACFG_CNTR BIT_ULL(4) +#define PEBS_DATACFG_CNTR_SHIFT 32 +#define PEBS_DATACFG_CNTR_MASK GENMASK_ULL(15, 0) +#define PEBS_DATACFG_FIX_SHIFT 48 +#define PEBS_DATACFG_FIX_MASK GENMASK_ULL(7, 0) +#define PEBS_DATACFG_METRICS BIT_ULL(5) + +/* Steal the highest bit of pebs_data_cfg for SW usage */ +#define PEBS_UPDATE_DS_SW BIT_ULL(63) /* * Intel "Architectural Performance Monitoring" CPUID @@ -160,6 +190,39 @@ union cpuid10_edx { }; /* + * Intel "Architectural Performance Monitoring extension" CPUID + * detection/enumeration details: + */ +#define ARCH_PERFMON_EXT_LEAF 0x00000023 +#define ARCH_PERFMON_NUM_COUNTER_LEAF 0x1 +#define ARCH_PERFMON_ACR_LEAF 0x2 + +union cpuid35_eax { + struct { + unsigned int leaf0:1; + /* Counters Sub-Leaf */ + unsigned int cntr_subleaf:1; + /* Auto Counter Reload Sub-Leaf */ + unsigned int acr_subleaf:1; + /* Events Sub-Leaf */ + unsigned int events_subleaf:1; + unsigned int reserved:28; + } split; + unsigned int full; +}; + +union cpuid35_ebx { + struct { + /* UnitMask2 
Supported */ + unsigned int umask2:1; + /* EQ-bit Supported */ + unsigned int eq:1; + unsigned int reserved:30; + } split; + unsigned int full; +}; + +/* * Intel Architectural LBR CPUID detection/enumeration details: */ union cpuid28_eax { @@ -195,6 +258,9 @@ union cpuid28_ecx { unsigned int lbr_timed_lbr:1; /* Branch Type Field Supported */ unsigned int lbr_br_type:1; + unsigned int reserved:13; + /* Branch counters (Event Logging) Supported */ + unsigned int lbr_counters:4; } split; unsigned int full; }; @@ -207,9 +273,12 @@ union cpuid_0x80000022_ebx { struct { /* Number of Core Performance Counters */ unsigned int num_core_pmc:4; - unsigned int reserved:6; + /* Number of available LBR Stack Entries */ + unsigned int lbr_v2_stack_sz:6; /* Number of Data Fabric Counters */ unsigned int num_df_pmc:6; + /* Number of Unified Memory Controller Counters */ + unsigned int num_umc_pmc:6; } split; unsigned int full; }; @@ -222,6 +291,7 @@ struct x86_pmu_capability { int bit_width_fixed; unsigned int events_mask; int events_mask_len; + unsigned int pebs_ept :1; }; /* @@ -271,6 +341,10 @@ struct x86_pmu_capability { #define INTEL_PMC_IDX_FIXED_SLOTS (INTEL_PMC_IDX_FIXED + 3) #define INTEL_PMC_MSK_FIXED_SLOTS (1ULL << INTEL_PMC_IDX_FIXED_SLOTS) +/* TOPDOWN_BAD_SPECULATION.ALL: fixed counter 4 (Atom only) */ +/* TOPDOWN_FE_BOUND.ALL: fixed counter 5 (Atom only) */ +/* TOPDOWN_RETIRING.ALL: fixed counter 6 (Atom only) */ + static inline bool use_fixed_pseudo_encoding(u64 code) { return !(code & 0xff); @@ -328,6 +402,9 @@ static inline bool use_fixed_pseudo_encoding(u64 code) #define INTEL_TD_METRIC_MAX INTEL_TD_METRIC_MEM_BOUND #define INTEL_TD_METRIC_NUM 8 +#define INTEL_TD_CFG_METRIC_CLEAR_BIT 0 +#define INTEL_TD_CFG_METRIC_CLEAR BIT_ULL(INTEL_TD_CFG_METRIC_CLEAR_BIT) + static inline bool is_metric_idx(int idx) { return (unsigned)(idx - INTEL_PMC_IDX_METRIC_BASE) < INTEL_TD_METRIC_NUM; @@ -378,7 +455,9 @@ static inline bool is_topdown_idx(int idx) */ struct pebs_basic { - 
u64 format_size; + u64 format_group:32, + retire_latency:16, + format_size:16; u64 ip; u64 applicable_counters; u64 tsc; @@ -387,7 +466,17 @@ struct pebs_basic { struct pebs_meminfo { u64 address; u64 aux; - u64 latency; + union { + /* pre Alder Lake */ + u64 mem_latency; + /* Alder Lake and later */ + struct { + u64 instr_latency:16; + u64 pad2:16; + u64 cache_latency:16; + u64 pad3:16; + }; + }; u64 tsx_tuning; }; @@ -400,6 +489,15 @@ struct pebs_xmm { u64 xmm[16*2]; /* two entries for each register */ }; +struct pebs_cntr_header { + u32 cntr; + u32 fixed; + u32 metrics; + u32 reserved; +}; + +#define INTEL_CNTR_METRICS 0x3 + /* * AMD Extended Performance Monitoring and Debug cpuid feature detection */ @@ -427,6 +525,8 @@ struct pebs_xmm { #define IBS_CAPS_FETCHCTLEXTD (1U<<9) #define IBS_CAPS_OPDATA4 (1U<<10) #define IBS_CAPS_ZEN4 (1U<<11) +#define IBS_CAPS_OPLDLAT (1U<<12) +#define IBS_CAPS_OPDTLBPGSIZE (1U<<19) #define IBS_CAPS_DEFAULT (IBS_CAPS_AVAIL \ | IBS_CAPS_FETCHSAM \ @@ -452,8 +552,11 @@ struct pebs_xmm { * The lower 7 bits of the current count are random bits * preloaded by hardware and ignored in software */ +#define IBS_OP_LDLAT_EN (1ULL<<63) +#define IBS_OP_LDLAT_THRSH (0xFULL<<59) #define IBS_OP_CUR_CNT (0xFFF80ULL<<32) #define IBS_OP_CUR_CNT_RAND (0x0007FULL<<32) +#define IBS_OP_CUR_CNT_EXT_MASK (0x7FULL<<52) #define IBS_OP_CNT_CTL (1ULL<<19) #define IBS_OP_VAL (1ULL<<18) #define IBS_OP_ENABLE (1ULL<<17) @@ -465,8 +568,10 @@ struct pebs_xmm { #ifdef CONFIG_X86_LOCAL_APIC extern u32 get_ibs_caps(void); +extern int forward_event_to_ibs(struct perf_event *event); #else static inline u32 get_ibs_caps(void) { return 0; } +static inline int forward_event_to_ibs(struct perf_event *event) { return -ENOENT; } #endif #ifdef CONFIG_PERF_EVENTS @@ -490,15 +595,17 @@ struct x86_perf_regs { u64 *xmm_regs; }; -extern unsigned long perf_instruction_pointer(struct pt_regs *regs); -extern unsigned long perf_misc_flags(struct pt_regs *regs); -#define 
perf_misc_flags(regs) perf_misc_flags(regs) +extern unsigned long perf_arch_instruction_pointer(struct pt_regs *regs); +extern unsigned long perf_arch_misc_flags(struct pt_regs *regs); +extern unsigned long perf_arch_guest_misc_flags(struct pt_regs *regs); +#define perf_arch_misc_flags(regs) perf_arch_misc_flags(regs) +#define perf_arch_guest_misc_flags(regs) perf_arch_guest_misc_flags(regs) #include <asm/stacktrace.h> /* - * We abuse bit 3 from flags to pass exact information, see perf_misc_flags - * and the comment with PERF_EFLAGS_EXACT. + * We abuse bit 3 from flags to pass exact information, see + * perf_arch_misc_flags() and the comment with PERF_EFLAGS_EXACT. */ #define perf_arch_fetch_caller_regs(regs, __ip) { \ (regs)->ip = (__ip); \ @@ -517,9 +624,11 @@ struct x86_pmu_lbr { unsigned int from; unsigned int to; unsigned int info; + bool has_callstack; }; extern void perf_get_x86_pmu_capability(struct x86_pmu_capability *cap); +extern u64 perf_get_hw_event_config(int hw_event); extern void perf_check_microcode(void); extern void perf_clear_dirty_counters(void); extern int x86_perf_rdpmc_index(struct perf_event *event); @@ -529,18 +638,23 @@ static inline void perf_get_x86_pmu_capability(struct x86_pmu_capability *cap) memset(cap, 0, sizeof(*cap)); } +static inline u64 perf_get_hw_event_config(int hw_event) +{ + return 0; +} + static inline void perf_events_lapic_init(void) { } static inline void perf_check_microcode(void) { } #endif #if defined(CONFIG_PERF_EVENTS) && defined(CONFIG_CPU_SUP_INTEL) -extern struct perf_guest_switch_msr *perf_guest_get_msrs(int *nr); -extern int x86_perf_get_lbr(struct x86_pmu_lbr *lbr); +extern struct perf_guest_switch_msr *perf_guest_get_msrs(int *nr, void *data); +extern void x86_perf_get_lbr(struct x86_pmu_lbr *lbr); #else -struct perf_guest_switch_msr *perf_guest_get_msrs(int *nr); -static inline int x86_perf_get_lbr(struct x86_pmu_lbr *lbr) +struct perf_guest_switch_msr *perf_guest_get_msrs(int *nr, void *data); +static 
inline void x86_perf_get_lbr(struct x86_pmu_lbr *lbr) { - return -1; + memset(lbr, 0, sizeof(*lbr)); } #endif @@ -570,7 +684,7 @@ extern void perf_amd_brs_lopwr_cb(bool lopwr_in); DECLARE_STATIC_CALL(perf_lopwr_cb, perf_amd_brs_lopwr_cb); -static inline void perf_lopwr_cb(bool lopwr_in) +static __always_inline void perf_lopwr_cb(bool lopwr_in) { static_call_mod(perf_lopwr_cb)(lopwr_in); } diff --git a/arch/x86/include/asm/perf_event_p4.h b/arch/x86/include/asm/perf_event_p4.h index 94de1a05aeba..d65e338b6a5f 100644 --- a/arch/x86/include/asm/perf_event_p4.h +++ b/arch/x86/include/asm/perf_event_p4.h @@ -181,7 +181,7 @@ static inline u64 p4_clear_ht_bit(u64 config) static inline int p4_ht_active(void) { #ifdef CONFIG_SMP - return smp_num_siblings > 1; + return __max_threads_per_core > 1; #endif return 0; } @@ -189,7 +189,7 @@ static inline int p4_ht_active(void) static inline int p4_ht_thread(int cpu) { #ifdef CONFIG_SMP - if (smp_num_siblings == 2) + if (__max_threads_per_core == 2) return cpu != cpumask_first(this_cpu_cpumask_var_ptr(cpu_sibling_map)); #endif return 0; diff --git a/arch/x86/include/asm/pgalloc.h b/arch/x86/include/asm/pgalloc.h index c7ec5bb88334..c88691b15f3c 100644 --- a/arch/x86/include/asm/pgalloc.h +++ b/arch/x86/include/asm/pgalloc.h @@ -6,6 +6,8 @@ #include <linux/mm.h> /* for struct page */ #include <linux/pagemap.h> +#include <asm/cpufeature.h> + #define __HAVE_ARCH_PTE_ALLOC_ONE #define __HAVE_ARCH_PGD_FREE #include <asm-generic/pgalloc.h> @@ -30,20 +32,16 @@ static inline void paravirt_release_p4d(unsigned long pfn) {} #endif /* - * Flags to use when allocating a user page table page. + * In case of Page Table Isolation active, we acquire two PGDs instead of one. + * Being order-1, it is both 8k in size and 8k-aligned. That lets us just + * flip bit 12 in a pointer to swap between the two 4k halves. */ -extern gfp_t __userpte_alloc_gfp; - -#ifdef CONFIG_PAGE_TABLE_ISOLATION -/* - * Instead of one PGD, we acquire two PGDs. 
Being order-1, it is - * both 8k in size and 8k-aligned. That lets us just flip bit 12 - * in a pointer to swap between the two 4k halves. - */ -#define PGD_ALLOCATION_ORDER 1 -#else -#define PGD_ALLOCATION_ORDER 0 -#endif +static inline unsigned int pgd_allocation_order(void) +{ + if (cpu_feature_enabled(X86_FEATURE_PTI)) + return 1; + return 0; +} /* * Allocate and free page tables. @@ -147,24 +145,6 @@ static inline void pgd_populate_safe(struct mm_struct *mm, pgd_t *pgd, p4d_t *p4 set_pgd_safe(pgd, __pgd(_PAGE_TABLE | __pa(p4d))); } -static inline p4d_t *p4d_alloc_one(struct mm_struct *mm, unsigned long addr) -{ - gfp_t gfp = GFP_KERNEL_ACCOUNT; - - if (mm == &init_mm) - gfp &= ~__GFP_ACCOUNT; - return (p4d_t *)get_zeroed_page(gfp); -} - -static inline void p4d_free(struct mm_struct *mm, p4d_t *p4d) -{ - if (!pgtable_l5_enabled()) - return; - - BUG_ON((unsigned long)p4d & (PAGE_SIZE-1)); - free_page((unsigned long)p4d); -} - extern void ___p4d_free_tlb(struct mmu_gather *tlb, p4d_t *p4d); static inline void __p4d_free_tlb(struct mmu_gather *tlb, p4d_t *p4d, diff --git a/arch/x86/include/asm/pgtable-2level.h b/arch/x86/include/asm/pgtable-2level.h index 60d0f9015317..e9482a11ac52 100644 --- a/arch/x86/include/asm/pgtable-2level.h +++ b/arch/x86/include/asm/pgtable-2level.h @@ -80,21 +80,37 @@ static inline unsigned long pte_bitop(unsigned long value, unsigned int rightshi return ((value >> rightshift) & mask) << leftshift; } -/* Encode and de-code a swap entry */ +/* + * Encode/decode swap entries and swap PTEs. Swap PTEs are all PTEs that + * are !pte_none() && !pte_present(). + * + * Format of swap PTEs: + * + * 3 3 2 2 2 2 2 2 2 2 2 2 1 1 1 1 1 1 1 1 1 1 + * 1 0 9 8 7 6 5 4 3 2 1 0 9 8 7 6 5 4 3 2 1 0 9 8 7 6 5 4 3 2 1 0 + * <----------------- offset ------------------> 0 E <- type --> 0 + * + * E is the exclusive marker that is not stored in swap entries. 
+ */ #define SWP_TYPE_BITS 5 +#define _SWP_TYPE_MASK ((1U << SWP_TYPE_BITS) - 1) +#define _SWP_TYPE_SHIFT (_PAGE_BIT_PRESENT + 1) #define SWP_OFFSET_SHIFT (_PAGE_BIT_PROTNONE + 1) -#define MAX_SWAPFILES_CHECK() BUILD_BUG_ON(MAX_SWAPFILES_SHIFT > SWP_TYPE_BITS) +#define MAX_SWAPFILES_CHECK() BUILD_BUG_ON(MAX_SWAPFILES_SHIFT > 5) -#define __swp_type(x) (((x).val >> (_PAGE_BIT_PRESENT + 1)) \ - & ((1U << SWP_TYPE_BITS) - 1)) +#define __swp_type(x) (((x).val >> _SWP_TYPE_SHIFT) \ + & _SWP_TYPE_MASK) #define __swp_offset(x) ((x).val >> SWP_OFFSET_SHIFT) #define __swp_entry(type, offset) ((swp_entry_t) { \ - ((type) << (_PAGE_BIT_PRESENT + 1)) \ + (((type) & _SWP_TYPE_MASK) << _SWP_TYPE_SHIFT) \ | ((offset) << SWP_OFFSET_SHIFT) }) #define __pte_to_swp_entry(pte) ((swp_entry_t) { (pte).pte_low }) #define __swp_entry_to_pte(x) ((pte_t) { .pte = (x).val }) +/* We borrow bit 7 to store the exclusive marker in swap PTEs. */ +#define _PAGE_SWP_EXCLUSIVE _PAGE_PSE + /* No inverted PFNs on 2 level page tables */ static inline u64 protnone_mask(u64 val) diff --git a/arch/x86/include/asm/pgtable-2level_types.h b/arch/x86/include/asm/pgtable-2level_types.h index 7f6ccff0ba72..54690bd4ddbe 100644 --- a/arch/x86/include/asm/pgtable-2level_types.h +++ b/arch/x86/include/asm/pgtable-2level_types.h @@ -2,7 +2,7 @@ #ifndef _ASM_X86_PGTABLE_2LEVEL_DEFS_H #define _ASM_X86_PGTABLE_2LEVEL_DEFS_H -#ifndef __ASSEMBLY__ +#ifndef __ASSEMBLER__ #include <linux/types.h> typedef unsigned long pteval_t; @@ -16,24 +16,22 @@ typedef union { pteval_t pte; pteval_t pte_low; } pte_t; -#endif /* !__ASSEMBLY__ */ - -#define SHARED_KERNEL_PMD 0 +#endif /* !__ASSEMBLER__ */ #define ARCH_PAGE_TABLE_SYNC_MASK PGTBL_PMD_MODIFIED /* - * traditional i386 two-level paging structure: + * Traditional i386 two-level paging structure: */ #define PGDIR_SHIFT 22 #define PTRS_PER_PGD 1024 - /* - * the i386 is two-level, so we don't really have any - * PMD directory physically. 
+ * The i386 is two-level, so we don't really have any + * PMD directory physically: */ +#define PTRS_PER_PMD 1 #define PTRS_PER_PTE 1024 diff --git a/arch/x86/include/asm/pgtable-3level.h b/arch/x86/include/asm/pgtable-3level.h index e896ebef8c24..dabafba957ea 100644 --- a/arch/x86/include/asm/pgtable-3level.h +++ b/arch/x86/include/asm/pgtable-3level.h @@ -2,8 +2,6 @@ #ifndef _ASM_X86_PGTABLE_3LEVEL_H #define _ASM_X86_PGTABLE_3LEVEL_H -#include <asm/atomic64_32.h> - /* * Intel Physical Address Extension (PAE) Mode - three-level page * tables on PPro+ CPUs. @@ -21,7 +19,15 @@ pr_err("%s:%d: bad pgd %p(%016Lx)\n", \ __FILE__, __LINE__, &(e), pgd_val(e)) -/* Rules for using set_pte: the pte being assigned *must* be +#define pxx_xchg64(_pxx, _ptr, _val) ({ \ + _pxx##val_t *_p = (_pxx##val_t *)_ptr; \ + _pxx##val_t _o = *_p; \ + do { } while (!try_cmpxchg64(_p, &_o, (_val))); \ + native_make_##_pxx(_o); \ +}) + +/* + * Rules for using set_pte: the pte being assigned *must* be * either not present or in a state where the hardware will * not attempt to update the pte. In places where this is * not possible, use pte_get_and_clear to obtain the old pte @@ -29,83 +35,27 @@ */ static inline void native_set_pte(pte_t *ptep, pte_t pte) { - ptep->pte_high = pte.pte_high; + WRITE_ONCE(ptep->pte_high, pte.pte_high); smp_wmb(); - ptep->pte_low = pte.pte_low; -} - -#define pmd_read_atomic pmd_read_atomic -/* - * pte_offset_map_lock() on 32-bit PAE kernels was reading the pmd_t with - * a "*pmdp" dereference done by GCC. Problem is, in certain places - * where pte_offset_map_lock() is called, concurrent page faults are - * allowed, if the mmap_lock is hold for reading. An example is mincore - * vs page faults vs MADV_DONTNEED. On the page fault side - * pmd_populate() rightfully does a set_64bit(), but if we're reading the - * pmd_t with a "*pmdp" on the mincore side, a SMP race can happen - * because GCC will not read the 64-bit value of the pmd atomically. 
- * - * To fix this all places running pte_offset_map_lock() while holding the - * mmap_lock in read mode, shall read the pmdp pointer using this - * function to know if the pmd is null or not, and in turn to know if - * they can run pte_offset_map_lock() or pmd_trans_huge() or other pmd - * operations. - * - * Without THP if the mmap_lock is held for reading, the pmd can only - * transition from null to not null while pmd_read_atomic() runs. So - * we can always return atomic pmd values with this function. - * - * With THP if the mmap_lock is held for reading, the pmd can become - * trans_huge or none or point to a pte (and in turn become "stable") - * at any time under pmd_read_atomic(). We could read it truly - * atomically here with an atomic64_read() for the THP enabled case (and - * it would be a whole lot simpler), but to avoid using cmpxchg8b we - * only return an atomic pmdval if the low part of the pmdval is later - * found to be stable (i.e. pointing to a pte). We are also returning a - * 'none' (zero) pmdval if the low part of the pmd is zero. - * - * In some cases the high and low part of the pmdval returned may not be - * consistent if THP is enabled (the low part may point to previously - * mapped hugepage, while the high part may point to a more recently - * mapped hugepage), but pmd_none_or_trans_huge_or_clear_bad() only - * needs the low part of the pmd to be read atomically to decide if the - * pmd is unstable or not, with the only exception when the low part - * of the pmd is zero, in which case we return a 'none' pmd. - */ -static inline pmd_t pmd_read_atomic(pmd_t *pmdp) -{ - pmdval_t ret; - u32 *tmp = (u32 *)pmdp; - - ret = (pmdval_t) (*tmp); - if (ret) { - /* - * If the low part is null, we must not read the high part - * or we can end up with a partial pmd. 
- */ - smp_rmb(); - ret |= ((pmdval_t)*(tmp + 1)) << 32; - } - - return (pmd_t) { ret }; + WRITE_ONCE(ptep->pte_low, pte.pte_low); } static inline void native_set_pte_atomic(pte_t *ptep, pte_t pte) { - set_64bit((unsigned long long *)(ptep), native_pte_val(pte)); + pxx_xchg64(pte, ptep, native_pte_val(pte)); } static inline void native_set_pmd(pmd_t *pmdp, pmd_t pmd) { - set_64bit((unsigned long long *)(pmdp), native_pmd_val(pmd)); + pxx_xchg64(pmd, pmdp, native_pmd_val(pmd)); } static inline void native_set_pud(pud_t *pudp, pud_t pud) { -#ifdef CONFIG_PAGE_TABLE_ISOLATION +#ifdef CONFIG_MITIGATION_PAGE_TABLE_ISOLATION pud.p4d.pgd = pti_set_user_pgtbl(&pudp->p4d.pgd, pud.p4d.pgd); #endif - set_64bit((unsigned long long *)(pudp), native_pud_val(pud)); + pxx_xchg64(pud, pudp, native_pud_val(pud)); } /* @@ -116,17 +66,16 @@ static inline void native_set_pud(pud_t *pudp, pud_t pud) static inline void native_pte_clear(struct mm_struct *mm, unsigned long addr, pte_t *ptep) { - ptep->pte_low = 0; + WRITE_ONCE(ptep->pte_low, 0); smp_wmb(); - ptep->pte_high = 0; + WRITE_ONCE(ptep->pte_high, 0); } -static inline void native_pmd_clear(pmd_t *pmd) +static inline void native_pmd_clear(pmd_t *pmdp) { - u32 *tmp = (u32 *)pmd; - *tmp = 0; + WRITE_ONCE(pmdp->pmd_low, 0); smp_wmb(); - *(tmp + 1) = 0; + WRITE_ONCE(pmdp->pmd_high, 0); } static inline void native_pud_clear(pud_t *pudp) @@ -149,41 +98,26 @@ static inline void pud_clear(pud_t *pudp) */ } + #ifdef CONFIG_SMP static inline pte_t native_ptep_get_and_clear(pte_t *ptep) { - pte_t res; - - res.pte = (pteval_t)arch_atomic64_xchg((atomic64_t *)ptep, 0); - - return res; + return pxx_xchg64(pte, ptep, 0ULL); } -#else -#define native_ptep_get_and_clear(xp) native_local_ptep_get_and_clear(xp) -#endif -union split_pmd { - struct { - u32 pmd_low; - u32 pmd_high; - }; - pmd_t pmd; -}; - -#ifdef CONFIG_SMP static inline pmd_t native_pmdp_get_and_clear(pmd_t *pmdp) { - union split_pmd res, *orig = (union split_pmd *)pmdp; - - /* xchg 
acts as a barrier before setting of the high bits */ - res.pmd_low = xchg(&orig->pmd_low, 0); - res.pmd_high = orig->pmd_high; - orig->pmd_high = 0; + return pxx_xchg64(pmd, pmdp, 0ULL); +} - return res.pmd; +static inline pud_t native_pudp_get_and_clear(pud_t *pudp) +{ + return pxx_xchg64(pud, pudp, 0ULL); } #else +#define native_ptep_get_and_clear(xp) native_local_ptep_get_and_clear(xp) #define native_pmdp_get_and_clear(xp) native_local_pmdp_get_and_clear(xp) +#define native_pudp_get_and_clear(xp) native_local_pudp_get_and_clear(xp) #endif #ifndef pmdp_establish @@ -199,67 +133,47 @@ static inline pmd_t pmdp_establish(struct vm_area_struct *vma, * anybody. */ if (!(pmd_val(pmd) & _PAGE_PRESENT)) { - union split_pmd old, new, *ptr; - - ptr = (union split_pmd *)pmdp; - - new.pmd = pmd; - /* xchg acts as a barrier before setting of the high bits */ - old.pmd_low = xchg(&ptr->pmd_low, new.pmd_low); - old.pmd_high = ptr->pmd_high; - ptr->pmd_high = new.pmd_high; - return old.pmd; - } - - do { - old = *pmdp; - } while (cmpxchg64(&pmdp->pmd, old.pmd, pmd.pmd) != old.pmd); + old.pmd_low = xchg(&pmdp->pmd_low, pmd.pmd_low); + old.pmd_high = READ_ONCE(pmdp->pmd_high); + WRITE_ONCE(pmdp->pmd_high, pmd.pmd_high); - return old; -} -#endif - -#ifdef CONFIG_SMP -union split_pud { - struct { - u32 pud_low; - u32 pud_high; - }; - pud_t pud; -}; - -static inline pud_t native_pudp_get_and_clear(pud_t *pudp) -{ - union split_pud res, *orig = (union split_pud *)pudp; - -#ifdef CONFIG_PAGE_TABLE_ISOLATION - pti_set_user_pgtbl(&pudp->p4d.pgd, __pgd(0)); -#endif - - /* xchg acts as a barrier before setting of the high bits */ - res.pud_low = xchg(&orig->pud_low, 0); - res.pud_high = orig->pud_high; - orig->pud_high = 0; + return old; + } - return res.pud; + return pxx_xchg64(pmd, pmdp, pmd.pmd); } -#else -#define native_pudp_get_and_clear(xp) native_local_pudp_get_and_clear(xp) #endif -/* Encode and de-code a swap entry */ +/* + * Encode/decode swap entries and swap PTEs. 
Swap PTEs are all PTEs that + * are !pte_none() && !pte_present(). + * + * Format of swap PTEs: + * + * 6 6 6 6 5 5 5 5 5 5 5 5 5 5 4 4 4 4 4 4 4 4 4 4 3 3 3 3 3 3 3 3 + * 3 2 1 0 9 8 7 6 5 4 3 2 1 0 9 8 7 6 5 4 3 2 1 0 9 8 7 6 5 4 3 2 + * < type -> <---------------------- offset ---------------------- + * + * 3 3 2 2 2 2 2 2 2 2 2 2 1 1 1 1 1 1 1 1 1 1 + * 1 0 9 8 7 6 5 4 3 2 1 0 9 8 7 6 5 4 3 2 1 0 9 8 7 6 5 4 3 2 1 0 + * --------------------------------------------> 0 E 0 0 0 0 0 0 0 + * + * E is the exclusive marker that is not stored in swap entries. + */ #define SWP_TYPE_BITS 5 +#define _SWP_TYPE_MASK ((1U << SWP_TYPE_BITS) - 1) #define SWP_OFFSET_FIRST_BIT (_PAGE_BIT_PROTNONE + 1) /* We always extract/encode the offset by shifting it all the way up, and then down again */ #define SWP_OFFSET_SHIFT (SWP_OFFSET_FIRST_BIT + SWP_TYPE_BITS) -#define MAX_SWAPFILES_CHECK() BUILD_BUG_ON(MAX_SWAPFILES_SHIFT > 5) -#define __swp_type(x) (((x).val) & 0x1f) -#define __swp_offset(x) ((x).val >> 5) -#define __swp_entry(type, offset) ((swp_entry_t){(type) | (offset) << 5}) +#define MAX_SWAPFILES_CHECK() BUILD_BUG_ON(MAX_SWAPFILES_SHIFT > SWP_TYPE_BITS) +#define __swp_type(x) (((x).val) & _SWP_TYPE_MASK) +#define __swp_offset(x) ((x).val >> SWP_TYPE_BITS) +#define __swp_entry(type, offset) ((swp_entry_t){((type) & _SWP_TYPE_MASK) \ + | (offset) << SWP_TYPE_BITS}) /* * Normally, __swp_entry() converts from arch-independent swp_entry_t to @@ -287,6 +201,9 @@ static inline pud_t native_pudp_get_and_clear(pud_t *pudp) #define __pte_to_swp_entry(pte) (__swp_entry(__pteval_swp_type(pte), \ __pteval_swp_offset(pte))) +/* We borrow bit 7 to store the exclusive marker in swap PTEs. 
*/ +#define _PAGE_SWP_EXCLUSIVE _PAGE_PSE + #include <asm/pgtable-invert.h> #endif /* _ASM_X86_PGTABLE_3LEVEL_H */ diff --git a/arch/x86/include/asm/pgtable-3level_types.h b/arch/x86/include/asm/pgtable-3level_types.h index 56baf43befb4..580b09bf6a45 100644 --- a/arch/x86/include/asm/pgtable-3level_types.h +++ b/arch/x86/include/asm/pgtable-3level_types.h @@ -2,7 +2,7 @@ #ifndef _ASM_X86_PGTABLE_3LEVEL_DEFS_H #define _ASM_X86_PGTABLE_3LEVEL_DEFS_H -#ifndef __ASSEMBLY__ +#ifndef __ASSEMBLER__ #include <linux/types.h> typedef u64 pteval_t; @@ -18,11 +18,16 @@ typedef union { }; pteval_t pte; } pte_t; -#endif /* !__ASSEMBLY__ */ -#define SHARED_KERNEL_PMD (!static_cpu_has(X86_FEATURE_PTI)) +typedef union { + struct { + unsigned long pmd_low, pmd_high; + }; + pmdval_t pmd; +} pmd_t; +#endif /* !__ASSEMBLER__ */ -#define ARCH_PAGE_TABLE_SYNC_MASK (SHARED_KERNEL_PMD ? 0 : PGTBL_PMD_MODIFIED) +#define ARCH_PAGE_TABLE_SYNC_MASK PGTBL_PMD_MODIFIED /* * PGDIR_SHIFT determines what a top-level page table entry can map diff --git a/arch/x86/include/asm/pgtable-invert.h b/arch/x86/include/asm/pgtable-invert.h index a0c1525f1b6f..e12e52ae8083 100644 --- a/arch/x86/include/asm/pgtable-invert.h +++ b/arch/x86/include/asm/pgtable-invert.h @@ -2,7 +2,7 @@ #ifndef _ASM_PGTABLE_INVERT_H #define _ASM_PGTABLE_INVERT_H 1 -#ifndef __ASSEMBLY__ +#ifndef __ASSEMBLER__ /* * A clear pte value is special, and doesn't get inverted. 
@@ -36,6 +36,6 @@ static inline u64 flip_protnone_guard(u64 oldval, u64 val, u64 mask) return val; } -#endif /* __ASSEMBLY__ */ +#endif /* __ASSEMBLER__ */ #endif diff --git a/arch/x86/include/asm/pgtable.h b/arch/x86/include/asm/pgtable.h index 44e2d6f1dbaa..97954c936c54 100644 --- a/arch/x86/include/asm/pgtable.h +++ b/arch/x86/include/asm/pgtable.h @@ -15,7 +15,7 @@ cachemode2protval(_PAGE_CACHE_MODE_UC_MINUS))) \ : (prot)) -#ifndef __ASSEMBLY__ +#ifndef __ASSEMBLER__ #include <linux/spinlock.h> #include <asm/x86_init.h> #include <asm/pkru.h> @@ -27,10 +27,12 @@ extern pgd_t early_top_pgt[PTRS_PER_PGD]; bool __init __early_make_pgtable(unsigned long address, pmdval_t pmd); +struct seq_file; void ptdump_walk_pgd_level(struct seq_file *m, struct mm_struct *mm); void ptdump_walk_pgd_level_debugfs(struct seq_file *m, struct mm_struct *mm, bool user); -void ptdump_walk_pgd_level_checkwx(void); +bool ptdump_walk_pgd_level_checkwx(void); +#define ptdump_check_wx ptdump_walk_pgd_level_checkwx void ptdump_walk_user_pgd_level_checkwx(void); /* @@ -40,10 +42,8 @@ void ptdump_walk_user_pgd_level_checkwx(void); #define pgprot_decrypted(prot) __pgprot(cc_mkdec(pgprot_val(prot))) #ifdef CONFIG_DEBUG_WX -#define debug_checkwx() ptdump_walk_pgd_level_checkwx() #define debug_checkwx_user() ptdump_walk_user_pgd_level_checkwx() #else -#define debug_checkwx() do { } while (0) #define debug_checkwx_user() do { } while (0) #endif @@ -120,13 +120,47 @@ extern pmdval_t early_pmd_flags; #define arch_end_context_switch(prev) do {} while(0) #endif /* CONFIG_PARAVIRT_XXL */ +static inline pmd_t pmd_set_flags(pmd_t pmd, pmdval_t set) +{ + pmdval_t v = native_pmd_val(pmd); + + return native_make_pmd(v | set); +} + +static inline pmd_t pmd_clear_flags(pmd_t pmd, pmdval_t clear) +{ + pmdval_t v = native_pmd_val(pmd); + + return native_make_pmd(v & ~clear); +} + +static inline pud_t pud_set_flags(pud_t pud, pudval_t set) +{ + pudval_t v = native_pud_val(pud); + + return native_make_pud(v | set); 
+} + +static inline pud_t pud_clear_flags(pud_t pud, pudval_t clear) +{ + pudval_t v = native_pud_val(pud); + + return native_make_pud(v & ~clear); +} + /* * The following only work if pte_present() is true. * Undefined behaviour if not.. */ -static inline int pte_dirty(pte_t pte) +static inline bool pte_dirty(pte_t pte) { - return pte_flags(pte) & _PAGE_DIRTY; + return pte_flags(pte) & _PAGE_DIRTY_BITS; +} + +static inline bool pte_shstk(pte_t pte) +{ + return cpu_feature_enabled(X86_FEATURE_SHSTK) && + (pte_flags(pte) & (_PAGE_RW | _PAGE_DIRTY)) == _PAGE_DIRTY; } static inline int pte_young(pte_t pte) @@ -134,19 +168,33 @@ static inline int pte_young(pte_t pte) return pte_flags(pte) & _PAGE_ACCESSED; } -static inline int pmd_dirty(pmd_t pmd) +static inline bool pte_decrypted(pte_t pte) +{ + return cc_mkdec(pte_val(pte)) == pte_val(pte); +} + +#define pmd_dirty pmd_dirty +static inline bool pmd_dirty(pmd_t pmd) +{ + return pmd_flags(pmd) & _PAGE_DIRTY_BITS; +} + +static inline bool pmd_shstk(pmd_t pmd) { - return pmd_flags(pmd) & _PAGE_DIRTY; + return cpu_feature_enabled(X86_FEATURE_SHSTK) && + (pmd_flags(pmd) & (_PAGE_RW | _PAGE_DIRTY | _PAGE_PSE)) == + (_PAGE_DIRTY | _PAGE_PSE); } +#define pmd_young pmd_young static inline int pmd_young(pmd_t pmd) { return pmd_flags(pmd) & _PAGE_ACCESSED; } -static inline int pud_dirty(pud_t pud) +static inline bool pud_dirty(pud_t pud) { - return pud_flags(pud) & _PAGE_DIRTY; + return pud_flags(pud) & _PAGE_DIRTY_BITS; } static inline int pud_young(pud_t pud) @@ -154,9 +202,36 @@ static inline int pud_young(pud_t pud) return pud_flags(pud) & _PAGE_ACCESSED; } +static inline bool pud_shstk(pud_t pud) +{ + return cpu_feature_enabled(X86_FEATURE_SHSTK) && + (pud_flags(pud) & (_PAGE_RW | _PAGE_DIRTY | _PAGE_PSE)) == + (_PAGE_DIRTY | _PAGE_PSE); +} + static inline int pte_write(pte_t pte) { - return pte_flags(pte) & _PAGE_RW; + /* + * Shadow stack pages are logically writable, but do not have + * _PAGE_RW. 
Check for them separately from _PAGE_RW itself. + */ + return (pte_flags(pte) & _PAGE_RW) || pte_shstk(pte); +} + +#define pmd_write pmd_write +static inline int pmd_write(pmd_t pmd) +{ + /* + * Shadow stack pages are logically writable, but do not have + * _PAGE_RW. Check for them separately from _PAGE_RW itself. + */ + return (pmd_flags(pmd) & _PAGE_RW) || pmd_shstk(pmd); +} + +#define pud_write pud_write +static inline int pud_write(pud_t pud) +{ + return pud_flags(pud) & _PAGE_RW; } static inline int pte_huge(pte_t pte) @@ -183,6 +258,8 @@ static inline int pte_special(pte_t pte) static inline u64 protnone_mask(u64 val); +#define PFN_PTE_SHIFT PAGE_SHIFT + static inline unsigned long pte_pfn(pte_t pte) { phys_addr_t pfn = pte_val(pte); @@ -197,6 +274,7 @@ static inline unsigned long pmd_pfn(pmd_t pmd) return (pfn & pmd_pfn_mask(pmd)) >> PAGE_SHIFT; } +#define pud_pfn pud_pfn static inline unsigned long pud_pfn(pud_t pud) { phys_addr_t pfn = pud_val(pud); @@ -214,23 +292,16 @@ static inline unsigned long pgd_pfn(pgd_t pgd) return (pgd_val(pgd) & PTE_PFN_MASK) >> PAGE_SHIFT; } -#define p4d_leaf p4d_large -static inline int p4d_large(p4d_t p4d) -{ - /* No 512 GiB pages yet */ - return 0; -} - #define pte_page(pte) pfn_to_page(pte_pfn(pte)) -#define pmd_leaf pmd_large -static inline int pmd_large(pmd_t pte) +#define pmd_leaf pmd_leaf +static inline bool pmd_leaf(pmd_t pte) { return pmd_flags(pte) & _PAGE_PSE; } #ifdef CONFIG_TRANSPARENT_HUGEPAGE -/* NOTE: when predicate huge page, consider also pmd_devmap, or use pmd_large */ +/* NOTE: when predicate huge page, consider also pmd_devmap, or use pmd_leaf */ static inline int pmd_trans_huge(pmd_t pmd) { return (pmd_val(pmd) & (_PAGE_PSE|_PAGE_DEVMAP)) == _PAGE_PSE; @@ -267,6 +338,30 @@ static inline int pud_devmap(pud_t pud) } #endif +#ifdef CONFIG_ARCH_SUPPORTS_PMD_PFNMAP +static inline bool pmd_special(pmd_t pmd) +{ + return pmd_flags(pmd) & _PAGE_SPECIAL; +} + +static inline pmd_t pmd_mkspecial(pmd_t pmd) +{ + 
return pmd_set_flags(pmd, _PAGE_SPECIAL); +} +#endif /* CONFIG_ARCH_SUPPORTS_PMD_PFNMAP */ + +#ifdef CONFIG_ARCH_SUPPORTS_PUD_PFNMAP +static inline bool pud_special(pud_t pud) +{ + return pud_flags(pud) & _PAGE_SPECIAL; +} + +static inline pud_t pud_mkspecial(pud_t pud) +{ + return pud_set_flags(pud, _PAGE_SPECIAL); +} +#endif /* CONFIG_ARCH_SUPPORTS_PUD_PFNMAP */ + static inline int pgd_devmap(pgd_t pgd) { return 0; @@ -288,6 +383,65 @@ static inline pte_t pte_clear_flags(pte_t pte, pteval_t clear) return native_make_pte(v & ~clear); } +/* + * Write protection operations can result in Dirty=1,Write=0 PTEs. But in the + * case of X86_FEATURE_USER_SHSTK, these PTEs denote shadow stack memory. So + * when creating dirty, write-protected memory, a software bit is used: + * _PAGE_BIT_SAVED_DIRTY. The following functions take a PTE and transition the + * Dirty bit to SavedDirty, and vice versa. + * + * This shifting is only done if needed. In the case of shifting + * Dirty->SavedDirty, the condition is if the PTE is Write=0. In the case of + * shifting SavedDirty->Dirty, the condition is Write=1. 
+ */ +static inline pgprotval_t mksaveddirty_shift(pgprotval_t v) +{ + pgprotval_t cond = (~v >> _PAGE_BIT_RW) & 1; + + v |= ((v >> _PAGE_BIT_DIRTY) & cond) << _PAGE_BIT_SAVED_DIRTY; + v &= ~(cond << _PAGE_BIT_DIRTY); + + return v; +} + +static inline pgprotval_t clear_saveddirty_shift(pgprotval_t v) +{ + pgprotval_t cond = (v >> _PAGE_BIT_RW) & 1; + + v |= ((v >> _PAGE_BIT_SAVED_DIRTY) & cond) << _PAGE_BIT_DIRTY; + v &= ~(cond << _PAGE_BIT_SAVED_DIRTY); + + return v; +} + +static inline pte_t pte_mksaveddirty(pte_t pte) +{ + pteval_t v = native_pte_val(pte); + + v = mksaveddirty_shift(v); + return native_make_pte(v); +} + +static inline pte_t pte_clear_saveddirty(pte_t pte) +{ + pteval_t v = native_pte_val(pte); + + v = clear_saveddirty_shift(v); + return native_make_pte(v); +} + +static inline pte_t pte_wrprotect(pte_t pte) +{ + pte = pte_clear_flags(pte, _PAGE_RW); + + /* + * Blindly clearing _PAGE_RW might accidentally create + * a shadow stack PTE (Write=0,Dirty=1). Move the hardware + * dirty value to the software bit, if present. 
+ */ + return pte_mksaveddirty(pte); +} + #ifdef CONFIG_HAVE_ARCH_USERFAULTFD_WP static inline int pte_uffd_wp(pte_t pte) { @@ -296,7 +450,7 @@ static inline int pte_uffd_wp(pte_t pte) static inline pte_t pte_mkuffd_wp(pte_t pte) { - return pte_set_flags(pte, _PAGE_UFFD_WP); + return pte_wrprotect(pte_set_flags(pte, _PAGE_UFFD_WP)); } static inline pte_t pte_clear_uffd_wp(pte_t pte) @@ -307,7 +461,7 @@ static inline pte_t pte_clear_uffd_wp(pte_t pte) static inline pte_t pte_mkclean(pte_t pte) { - return pte_clear_flags(pte, _PAGE_DIRTY); + return pte_clear_flags(pte, _PAGE_DIRTY_BITS); } static inline pte_t pte_mkold(pte_t pte) @@ -315,11 +469,6 @@ static inline pte_t pte_mkold(pte_t pte) return pte_clear_flags(pte, _PAGE_ACCESSED); } -static inline pte_t pte_wrprotect(pte_t pte) -{ - return pte_clear_flags(pte, _PAGE_RW); -} - static inline pte_t pte_mkexec(pte_t pte) { return pte_clear_flags(pte, _PAGE_NX); @@ -327,7 +476,16 @@ static inline pte_t pte_mkexec(pte_t pte) static inline pte_t pte_mkdirty(pte_t pte) { - return pte_set_flags(pte, _PAGE_DIRTY | _PAGE_SOFT_DIRTY); + pte = pte_set_flags(pte, _PAGE_DIRTY | _PAGE_SOFT_DIRTY); + + return pte_mksaveddirty(pte); +} + +static inline pte_t pte_mkwrite_shstk(pte_t pte) +{ + pte = pte_clear_flags(pte, _PAGE_RW); + + return pte_set_flags(pte, _PAGE_DIRTY); } static inline pte_t pte_mkyoung(pte_t pte) @@ -335,11 +493,15 @@ static inline pte_t pte_mkyoung(pte_t pte) return pte_set_flags(pte, _PAGE_ACCESSED); } -static inline pte_t pte_mkwrite(pte_t pte) +static inline pte_t pte_mkwrite_novma(pte_t pte) { return pte_set_flags(pte, _PAGE_RW); } +struct vm_area_struct; +pte_t pte_mkwrite(pte_t pte, struct vm_area_struct *vma); +#define pte_mkwrite pte_mkwrite + static inline pte_t pte_mkhuge(pte_t pte) { return pte_set_flags(pte, _PAGE_PSE); @@ -370,18 +532,34 @@ static inline pte_t pte_mkdevmap(pte_t pte) return pte_set_flags(pte, _PAGE_SPECIAL|_PAGE_DEVMAP); } -static inline pmd_t pmd_set_flags(pmd_t pmd, pmdval_t 
set) +/* See comments above mksaveddirty_shift() */ +static inline pmd_t pmd_mksaveddirty(pmd_t pmd) { pmdval_t v = native_pmd_val(pmd); - return native_make_pmd(v | set); + v = mksaveddirty_shift(v); + return native_make_pmd(v); } -static inline pmd_t pmd_clear_flags(pmd_t pmd, pmdval_t clear) +/* See comments above mksaveddirty_shift() */ +static inline pmd_t pmd_clear_saveddirty(pmd_t pmd) { pmdval_t v = native_pmd_val(pmd); - return native_make_pmd(v & ~clear); + v = clear_saveddirty_shift(v); + return native_make_pmd(v); +} + +static inline pmd_t pmd_wrprotect(pmd_t pmd) +{ + pmd = pmd_clear_flags(pmd, _PAGE_RW); + + /* + * Blindly clearing _PAGE_RW might accidentally create + * a shadow stack PMD (RW=0, Dirty=1). Move the hardware + * dirty value to the software bit. + */ + return pmd_mksaveddirty(pmd); } #ifdef CONFIG_HAVE_ARCH_USERFAULTFD_WP @@ -392,7 +570,7 @@ static inline int pmd_uffd_wp(pmd_t pmd) static inline pmd_t pmd_mkuffd_wp(pmd_t pmd) { - return pmd_set_flags(pmd, _PAGE_UFFD_WP); + return pmd_wrprotect(pmd_set_flags(pmd, _PAGE_UFFD_WP)); } static inline pmd_t pmd_clear_uffd_wp(pmd_t pmd) @@ -408,17 +586,21 @@ static inline pmd_t pmd_mkold(pmd_t pmd) static inline pmd_t pmd_mkclean(pmd_t pmd) { - return pmd_clear_flags(pmd, _PAGE_DIRTY); + return pmd_clear_flags(pmd, _PAGE_DIRTY_BITS); } -static inline pmd_t pmd_wrprotect(pmd_t pmd) +static inline pmd_t pmd_mkdirty(pmd_t pmd) { - return pmd_clear_flags(pmd, _PAGE_RW); + pmd = pmd_set_flags(pmd, _PAGE_DIRTY | _PAGE_SOFT_DIRTY); + + return pmd_mksaveddirty(pmd); } -static inline pmd_t pmd_mkdirty(pmd_t pmd) +static inline pmd_t pmd_mkwrite_shstk(pmd_t pmd) { - return pmd_set_flags(pmd, _PAGE_DIRTY | _PAGE_SOFT_DIRTY); + pmd = pmd_clear_flags(pmd, _PAGE_RW); + + return pmd_set_flags(pmd, _PAGE_DIRTY); } static inline pmd_t pmd_mkdevmap(pmd_t pmd) @@ -436,23 +618,30 @@ static inline pmd_t pmd_mkyoung(pmd_t pmd) return pmd_set_flags(pmd, _PAGE_ACCESSED); } -static inline pmd_t pmd_mkwrite(pmd_t pmd) 
+static inline pmd_t pmd_mkwrite_novma(pmd_t pmd) { return pmd_set_flags(pmd, _PAGE_RW); } -static inline pud_t pud_set_flags(pud_t pud, pudval_t set) +pmd_t pmd_mkwrite(pmd_t pmd, struct vm_area_struct *vma); +#define pmd_mkwrite pmd_mkwrite + +/* See comments above mksaveddirty_shift() */ +static inline pud_t pud_mksaveddirty(pud_t pud) { pudval_t v = native_pud_val(pud); - return native_make_pud(v | set); + v = mksaveddirty_shift(v); + return native_make_pud(v); } -static inline pud_t pud_clear_flags(pud_t pud, pudval_t clear) +/* See comments above mksaveddirty_shift() */ +static inline pud_t pud_clear_saveddirty(pud_t pud) { pudval_t v = native_pud_val(pud); - return native_make_pud(v & ~clear); + v = clear_saveddirty_shift(v); + return native_make_pud(v); } static inline pud_t pud_mkold(pud_t pud) @@ -462,17 +651,26 @@ static inline pud_t pud_mkold(pud_t pud) static inline pud_t pud_mkclean(pud_t pud) { - return pud_clear_flags(pud, _PAGE_DIRTY); + return pud_clear_flags(pud, _PAGE_DIRTY_BITS); } static inline pud_t pud_wrprotect(pud_t pud) { - return pud_clear_flags(pud, _PAGE_RW); + pud = pud_clear_flags(pud, _PAGE_RW); + + /* + * Blindly clearing _PAGE_RW might accidentally create + * a shadow stack PUD (RW=0, Dirty=1). Move the hardware + * dirty value to the software bit. 
+ */ + return pud_mksaveddirty(pud); } static inline pud_t pud_mkdirty(pud_t pud) { - return pud_set_flags(pud, _PAGE_DIRTY | _PAGE_SOFT_DIRTY); + pud = pud_set_flags(pud, _PAGE_DIRTY | _PAGE_SOFT_DIRTY); + + return pud_mksaveddirty(pud); } static inline pud_t pud_mkdevmap(pud_t pud) @@ -492,7 +690,9 @@ static inline pud_t pud_mkyoung(pud_t pud) static inline pud_t pud_mkwrite(pud_t pud) { - return pud_set_flags(pud, _PAGE_RW); + pud = pud_set_flags(pud, _PAGE_RW); + + return pud_clear_saveddirty(pud); } #ifdef CONFIG_HAVE_ARCH_SOFT_DIRTY @@ -577,6 +777,9 @@ static inline pgprotval_t check_pgprot(pgprot_t pgprot) static inline pte_t pfn_pte(unsigned long page_nr, pgprot_t pgprot) { phys_addr_t pfn = (phys_addr_t)page_nr << PAGE_SHIFT; + /* This bit combination is used to mark shadow stacks */ + WARN_ON_ONCE((pgprot_val(pgprot) & (_PAGE_DIRTY | _PAGE_RW)) == + _PAGE_DIRTY); pfn ^= protnone_mask(pgprot_val(pgprot)); pfn &= PTE_PFN_MASK; return __pte(pfn | check_pgprot(pgprot)); @@ -604,11 +807,18 @@ static inline pmd_t pmd_mkinvalid(pmd_t pmd) __pgprot(pmd_flags(pmd) & ~(_PAGE_PRESENT|_PAGE_PROTNONE))); } +static inline pud_t pud_mkinvalid(pud_t pud) +{ + return pfn_pud(pud_pfn(pud), + __pgprot(pud_flags(pud) & ~(_PAGE_PRESENT|_PAGE_PROTNONE))); +} + static inline u64 flip_protnone_guard(u64 oldval, u64 val, u64 mask); static inline pte_t pte_modify(pte_t pte, pgprot_t newprot) { pteval_t val = pte_val(pte), oldval = val; + pte_t pte_result; /* * Chop off the NX bit (if present), and add the NX portion of @@ -617,17 +827,71 @@ static inline pte_t pte_modify(pte_t pte, pgprot_t newprot) val &= _PAGE_CHG_MASK; val |= check_pgprot(newprot) & ~_PAGE_CHG_MASK; val = flip_protnone_guard(oldval, val, PTE_PFN_MASK); - return __pte(val); + + pte_result = __pte(val); + + /* + * To avoid creating Write=0,Dirty=1 PTEs, pte_modify() needs to avoid: + * 1. Marking Write=0 PTEs Dirty=1 + * 2. 
Marking Dirty=1 PTEs Write=0 + * + * The first case cannot happen because the _PAGE_CHG_MASK will filter + * out any Dirty bit passed in newprot. Handle the second case by + * going through the mksaveddirty exercise. Only do this if the old + * value was Write=1 to avoid doing this on Shadow Stack PTEs. + */ + if (oldval & _PAGE_RW) + pte_result = pte_mksaveddirty(pte_result); + else + pte_result = pte_clear_saveddirty(pte_result); + + return pte_result; } static inline pmd_t pmd_modify(pmd_t pmd, pgprot_t newprot) { pmdval_t val = pmd_val(pmd), oldval = val; + pmd_t pmd_result; - val &= _HPAGE_CHG_MASK; + val &= (_HPAGE_CHG_MASK & ~_PAGE_DIRTY); val |= check_pgprot(newprot) & ~_HPAGE_CHG_MASK; val = flip_protnone_guard(oldval, val, PHYSICAL_PMD_PAGE_MASK); - return __pmd(val); + + pmd_result = __pmd(val); + + /* + * Avoid creating shadow stack PMD by accident. See comment in + * pte_modify(). + */ + if (oldval & _PAGE_RW) + pmd_result = pmd_mksaveddirty(pmd_result); + else + pmd_result = pmd_clear_saveddirty(pmd_result); + + return pmd_result; +} + +static inline pud_t pud_modify(pud_t pud, pgprot_t newprot) +{ + pudval_t val = pud_val(pud), oldval = val; + pud_t pud_result; + + val &= _HPAGE_CHG_MASK; + val |= check_pgprot(newprot) & ~_HPAGE_CHG_MASK; + val = flip_protnone_guard(oldval, val, PHYSICAL_PUD_PAGE_MASK); + + pud_result = __pud(val); + + /* + * Avoid creating shadow stack PUD by accident. See comment in + * pte_modify(). 
+ */ + if (oldval & _PAGE_RW) + pud_result = pud_mksaveddirty(pud_result); + else + pud_result = pud_clear_saveddirty(pud_result); + + return pud_result; } /* @@ -684,7 +948,7 @@ static inline int is_new_memtype_allowed(u64 paddr, unsigned long size, pmd_t *populate_extra_pmd(unsigned long vaddr); pte_t *populate_extra_pte(unsigned long vaddr); -#ifdef CONFIG_PAGE_TABLE_ISOLATION +#ifdef CONFIG_MITIGATION_PAGE_TABLE_ISOLATION pgd_t __pti_set_user_pgtbl(pgd_t *pgdp, pgd_t pgd); /* @@ -698,14 +962,14 @@ static inline pgd_t pti_set_user_pgtbl(pgd_t *pgdp, pgd_t pgd) return pgd; return __pti_set_user_pgtbl(pgdp, pgd); } -#else /* CONFIG_PAGE_TABLE_ISOLATION */ +#else /* CONFIG_MITIGATION_PAGE_TABLE_ISOLATION */ static inline pgd_t pti_set_user_pgtbl(pgd_t *pgdp, pgd_t pgd) { return pgd; } -#endif /* CONFIG_PAGE_TABLE_ISOLATION */ +#endif /* CONFIG_MITIGATION_PAGE_TABLE_ISOLATION */ -#endif /* __ASSEMBLY__ */ +#endif /* __ASSEMBLER__ */ #ifdef CONFIG_X86_32 @@ -714,7 +978,7 @@ static inline pgd_t pti_set_user_pgtbl(pgd_t *pgdp, pgd_t pgd) # include <asm/pgtable_64.h> #endif -#ifndef __ASSEMBLY__ +#ifndef __ASSEMBLER__ #include <linux/mm_types.h> #include <linux/mmdebug.h> #include <linux/log2.h> @@ -731,6 +995,14 @@ static inline int pte_same(pte_t a, pte_t b) return a.pte == b.pte; } +static inline pte_t pte_advance_pfn(pte_t pte, unsigned long nr) +{ + if (__pte_needs_invert(pte_val(pte))) + return __pte(pte_val(pte) - (nr << PFN_PTE_SHIFT)); + return __pte(pte_val(pte) + (nr << PFN_PTE_SHIFT)); +} +#define pte_advance_pfn pte_advance_pfn + static inline int pte_present(pte_t a) { return pte_flags(a) & (_PAGE_PRESENT | _PAGE_PROTNONE); @@ -804,18 +1076,10 @@ static inline unsigned long pmd_page_vaddr(pmd_t pmd) */ #define pmd_page(pmd) pfn_to_page(pmd_pfn(pmd)) -/* - * Conversion functions: convert a page and protection to a page entry, - * and a page entry and page directory to the page they refer to. 
- * - * (Currently stuck as a macro because of indirect forward reference - * to linux/mm.h:page_to_nid()) - */ -#define mk_pte(page, pgprot) pfn_pte(page_to_pfn(page), (pgprot)) - static inline int pmd_bad(pmd_t pmd) { - return (pmd_flags(pmd) & ~_PAGE_USER) != _KERNPG_TABLE; + return (pmd_flags(pmd) & ~(_PAGE_USER | _PAGE_ACCESSED)) != + (_KERNPG_TABLE & ~_PAGE_ACCESSED); } static inline unsigned long pages_to_mb(unsigned long npg) @@ -845,23 +1109,16 @@ static inline pmd_t *pud_pgtable(pud_t pud) */ #define pud_page(pud) pfn_to_page(pud_pfn(pud)) -#define pud_leaf pud_large -static inline int pud_large(pud_t pud) +#define pud_leaf pud_leaf +static inline bool pud_leaf(pud_t pud) { - return (pud_val(pud) & (_PAGE_PSE | _PAGE_PRESENT)) == - (_PAGE_PSE | _PAGE_PRESENT); + return pud_val(pud) & _PAGE_PSE; } static inline int pud_bad(pud_t pud) { return (pud_flags(pud) & ~(_KERNPG_TABLE | _PAGE_USER)) != 0; } -#else -#define pud_leaf pud_large -static inline int pud_large(pud_t pud) -{ - return 0; -} #endif /* CONFIG_PGTABLE_LEVELS > 2 */ #if CONFIG_PGTABLE_LEVELS > 3 @@ -890,7 +1147,7 @@ static inline int p4d_bad(p4d_t p4d) { unsigned long ignore_flags = _KERNPG_TABLE | _PAGE_USER; - if (IS_ENABLED(CONFIG_PAGE_TABLE_ISOLATION)) + if (IS_ENABLED(CONFIG_MITIGATION_PAGE_TABLE_ISOLATION)) ignore_flags |= _PAGE_NX; return (p4d_flags(p4d) & ~ignore_flags) != 0; @@ -936,7 +1193,7 @@ static inline int pgd_bad(pgd_t pgd) if (!pgtable_l5_enabled()) return 0; - if (IS_ENABLED(CONFIG_PAGE_TABLE_ISOLATION)) + if (IS_ENABLED(CONFIG_MITIGATION_PAGE_TABLE_ISOLATION)) ignore_flags |= _PAGE_NX; return (pgd_flags(pgd) & ~ignore_flags) != _KERNPG_TABLE; @@ -956,17 +1213,16 @@ static inline int pgd_none(pgd_t pgd) } #endif /* CONFIG_PGTABLE_LEVELS > 4 */ -#endif /* __ASSEMBLY__ */ +#endif /* __ASSEMBLER__ */ #define KERNEL_PGD_BOUNDARY pgd_index(PAGE_OFFSET) #define KERNEL_PGD_PTRS (PTRS_PER_PGD - KERNEL_PGD_BOUNDARY) -#ifndef __ASSEMBLY__ +#ifndef __ASSEMBLER__ extern int 
direct_gbpages; void init_mem_mapping(void); void early_alloc_pgt_buf(void); -extern void memblock_find_dma_reserve(void); void __init poking_init(void); unsigned long init_memory_mapping(unsigned long start, unsigned long end, pgprot_t prot); @@ -1001,24 +1257,17 @@ static inline pud_t native_local_pudp_get_and_clear(pud_t *pudp) return res; } -static inline void set_pte_at(struct mm_struct *mm, unsigned long addr, - pte_t *ptep, pte_t pte) -{ - page_table_check_pte_set(mm, addr, ptep, pte); - set_pte(ptep, pte); -} - static inline void set_pmd_at(struct mm_struct *mm, unsigned long addr, pmd_t *pmdp, pmd_t pmd) { - page_table_check_pmd_set(mm, addr, pmdp, pmd); + page_table_check_pmd_set(mm, pmdp, pmd); set_pmd(pmdp, pmd); } static inline void set_pud_at(struct mm_struct *mm, unsigned long addr, pud_t *pudp, pud_t pud) { - page_table_check_pud_set(mm, addr, pudp, pud); + page_table_check_pud_set(mm, pudp, pud); native_set_pud(pudp, pud); } @@ -1049,7 +1298,7 @@ static inline pte_t ptep_get_and_clear(struct mm_struct *mm, unsigned long addr, pte_t *ptep) { pte_t pte = native_ptep_get_and_clear(ptep); - page_table_check_pte_clear(mm, addr, pte); + page_table_check_pte_clear(mm, pte); return pte; } @@ -1065,7 +1314,7 @@ static inline pte_t ptep_get_and_clear_full(struct mm_struct *mm, * care about updates and native needs no locking */ pte = native_local_ptep_get_and_clear(ptep); - page_table_check_pte_clear(mm, addr, pte); + page_table_check_pte_clear(mm, pte); } else { pte = ptep_get_and_clear(mm, addr, ptep); } @@ -1076,12 +1325,20 @@ static inline pte_t ptep_get_and_clear_full(struct mm_struct *mm, static inline void ptep_set_wrprotect(struct mm_struct *mm, unsigned long addr, pte_t *ptep) { - clear_bit(_PAGE_BIT_RW, (unsigned long *)&ptep->pte); -} + /* + * Avoid accidentally creating shadow stack PTEs + * (Write=0,Dirty=1). Use cmpxchg() to prevent races with + * the hardware setting Dirty=1. 
+ */ + pte_t old_pte, new_pte; -#define flush_tlb_fix_spurious_fault(vma, address) do { } while (0) + old_pte = READ_ONCE(*ptep); + do { + new_pte = pte_wrprotect(old_pte); + } while (!try_cmpxchg((long *)&ptep->pte, (long *)&old_pte, *(long *)&new_pte)); +} -#define mk_pmd(page, pgprot) pfn_pmd(page_to_pfn(page), (pgprot)) +#define flush_tlb_fix_spurious_fault(vma, address, ptep) do { } while (0) #define __HAVE_ARCH_PMDP_SET_ACCESS_FLAGS extern int pmdp_set_access_flags(struct vm_area_struct *vma, @@ -1102,19 +1359,13 @@ extern int pmdp_clear_flush_young(struct vm_area_struct *vma, unsigned long address, pmd_t *pmdp); -#define pmd_write pmd_write -static inline int pmd_write(pmd_t pmd) -{ - return pmd_flags(pmd) & _PAGE_RW; -} - #define __HAVE_ARCH_PMDP_HUGE_GET_AND_CLEAR static inline pmd_t pmdp_huge_get_and_clear(struct mm_struct *mm, unsigned long addr, pmd_t *pmdp) { pmd_t pmd = native_pmdp_get_and_clear(pmdp); - page_table_check_pmd_clear(mm, addr, pmd); + page_table_check_pmd_clear(mm, pmd); return pmd; } @@ -1125,7 +1376,7 @@ static inline pud_t pudp_huge_get_and_clear(struct mm_struct *mm, { pud_t pud = native_pudp_get_and_clear(pudp); - page_table_check_pud_clear(mm, addr, pud); + page_table_check_pud_clear(mm, pud); return pud; } @@ -1134,13 +1385,17 @@ static inline pud_t pudp_huge_get_and_clear(struct mm_struct *mm, static inline void pmdp_set_wrprotect(struct mm_struct *mm, unsigned long addr, pmd_t *pmdp) { - clear_bit(_PAGE_BIT_RW, (unsigned long *)pmdp); -} + /* + * Avoid accidentally creating shadow stack PMDs + * (Write=0,Dirty=1). Use cmpxchg() to prevent races with + * the hardware setting Dirty=1. 
+ */ + pmd_t old_pmd, new_pmd; -#define pud_write pud_write -static inline int pud_write(pud_t pud) -{ - return pud_flags(pud) & _PAGE_RW; + old_pmd = READ_ONCE(*pmdp); + do { + new_pmd = pmd_wrprotect(old_pmd); + } while (!try_cmpxchg((long *)pmdp, (long *)&old_pmd, *(long *)&new_pmd)); } #ifndef pmdp_establish @@ -1148,7 +1403,7 @@ static inline int pud_write(pud_t pud) static inline pmd_t pmdp_establish(struct vm_area_struct *vma, unsigned long address, pmd_t *pmdp, pmd_t pmd) { - page_table_check_pmd_set(vma->vm_mm, address, pmdp, pmd); + page_table_check_pmd_set(vma->vm_mm, pmdp, pmd); if (IS_ENABLED(CONFIG_SMP)) { return xchg(pmdp, pmd); } else { @@ -1159,10 +1414,28 @@ static inline pmd_t pmdp_establish(struct vm_area_struct *vma, } #endif +#ifdef CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD +static inline pud_t pudp_establish(struct vm_area_struct *vma, + unsigned long address, pud_t *pudp, pud_t pud) +{ + page_table_check_pud_set(vma->vm_mm, pudp, pud); + if (IS_ENABLED(CONFIG_SMP)) { + return xchg(pudp, pud); + } else { + pud_t old = *pudp; + WRITE_ONCE(*pudp, pud); + return old; + } +} +#endif + #define __HAVE_ARCH_PMDP_INVALIDATE_AD extern pmd_t pmdp_invalidate_ad(struct vm_area_struct *vma, unsigned long address, pmd_t *pmdp); +pud_t pudp_invalidate(struct vm_area_struct *vma, unsigned long address, + pud_t *pudp); + /* * Page table pages are page-aligned. The lower half of the top * level is used for userspace and the top half for the kernel. @@ -1177,12 +1450,9 @@ static inline bool pgdp_maps_userspace(void *__ptr) return (((ptr & ~PAGE_MASK) / sizeof(pgd_t)) < PGD_KERNEL_START); } -#define pgd_leaf pgd_large -static inline int pgd_large(pgd_t pgd) { return 0; } - -#ifdef CONFIG_PAGE_TABLE_ISOLATION +#ifdef CONFIG_MITIGATION_PAGE_TABLE_ISOLATION /* - * All top-level PAGE_TABLE_ISOLATION page tables are order-1 pages + * All top-level MITIGATION_PAGE_TABLE_ISOLATION page tables are order-1 pages * (8k-aligned and 8k in size). 
The kernel one is at the beginning 4k and * the user one is in the last 4k. To switch between them, you * just need to flip the 12th bit in their addresses. @@ -1227,7 +1497,7 @@ static inline p4d_t *user_to_kernel_p4dp(p4d_t *p4dp) { return ptr_clear_bit(p4dp, PTI_PGTABLE_SWITCH_BIT); } -#endif /* CONFIG_PAGE_TABLE_ISOLATION */ +#endif /* CONFIG_MITIGATION_PAGE_TABLE_ISOLATION */ /* * clone_pgd_range(pgd_t *dst, pgd_t *src, int count); @@ -1242,7 +1512,7 @@ static inline p4d_t *user_to_kernel_p4dp(p4d_t *p4dp) static inline void clone_pgd_range(pgd_t *dst, pgd_t *src, int count) { memcpy(dst, src, count * sizeof(pgd_t)); -#ifdef CONFIG_PAGE_TABLE_ISOLATION +#ifdef CONFIG_MITIGATION_PAGE_TABLE_ISOLATION if (!static_cpu_has(X86_FEATURE_PTI)) return; /* Clone the user space pgd as well */ @@ -1273,6 +1543,11 @@ static inline void update_mmu_cache(struct vm_area_struct *vma, unsigned long addr, pte_t *ptep) { } +static inline void update_mmu_cache_range(struct vm_fault *vmf, + struct vm_area_struct *vma, unsigned long addr, + pte_t *ptep, unsigned int nr) +{ +} static inline void update_mmu_cache_pmd(struct vm_area_struct *vma, unsigned long addr, pmd_t *pmd) { @@ -1281,14 +1556,12 @@ static inline void update_mmu_cache_pud(struct vm_area_struct *vma, unsigned long addr, pud_t *pud) { } -#ifdef _PAGE_SWP_EXCLUSIVE -#define __HAVE_ARCH_PTE_SWP_EXCLUSIVE static inline pte_t pte_swp_mkexclusive(pte_t pte) { return pte_set_flags(pte, _PAGE_SWP_EXCLUSIVE); } -static inline int pte_swp_exclusive(pte_t pte) +static inline bool pte_swp_exclusive(pte_t pte) { return pte_flags(pte) & _PAGE_SWP_EXCLUSIVE; } @@ -1297,7 +1570,6 @@ static inline pte_t pte_swp_clear_exclusive(pte_t pte) { return pte_clear_flags(pte, _PAGE_SWP_EXCLUSIVE); } -#endif /* _PAGE_SWP_EXCLUSIVE */ #ifdef CONFIG_HAVE_ARCH_SOFT_DIRTY static inline pte_t pte_swp_mksoft_dirty(pte_t pte) @@ -1396,6 +1668,11 @@ static inline bool __pte_access_permitted(unsigned long pteval, bool write) { unsigned long 
need_pte_bits = _PAGE_PRESENT|_PAGE_USER; + /* + * Write=0,Dirty=1 PTEs are shadow stack, which the kernel + * shouldn't generally allow access to, but since they + * are already Write=0, the below logic covers both cases. + */ if (write) need_pte_bits |= _PAGE_RW; @@ -1431,11 +1708,22 @@ static inline bool arch_has_pfn_modify_check(void) return boot_cpu_has_bug(X86_BUG_L1TF); } -#define arch_faults_on_old_pte arch_faults_on_old_pte -static inline bool arch_faults_on_old_pte(void) +#define arch_check_zapped_pte arch_check_zapped_pte +void arch_check_zapped_pte(struct vm_area_struct *vma, pte_t pte); + +#define arch_check_zapped_pmd arch_check_zapped_pmd +void arch_check_zapped_pmd(struct vm_area_struct *vma, pmd_t pmd); + +#define arch_check_zapped_pud arch_check_zapped_pud +void arch_check_zapped_pud(struct vm_area_struct *vma, pud_t pud); + +#ifdef CONFIG_XEN_PV +#define arch_has_hw_nonleaf_pmd_young arch_has_hw_nonleaf_pmd_young +static inline bool arch_has_hw_nonleaf_pmd_young(void) { - return false; + return !cpu_feature_enabled(X86_FEATURE_XENPV); } +#endif #ifdef CONFIG_PAGE_TABLE_CHECK static inline bool pte_user_accessible_page(pte_t pte) @@ -1454,6 +1742,51 @@ static inline bool pud_user_accessible_page(pud_t pud) } #endif -#endif /* __ASSEMBLY__ */ +#ifdef CONFIG_X86_SGX +int arch_memory_failure(unsigned long pfn, int flags); +#define arch_memory_failure arch_memory_failure + +bool arch_is_platform_page(u64 paddr); +#define arch_is_platform_page arch_is_platform_page +#endif + +/* + * Use set_p*_safe(), and elide TLB flushing, when confident that *no* + * TLB flush will be required as a result of the "set". For example, use + * in scenarios where it is known ahead of time that the routine is + * setting non-present entries, or re-setting an existing entry to the + * same value. Otherwise, use the typical "set" helpers and flush the + * TLB. 
+ */ +#define set_pte_safe(ptep, pte) \ +({ \ + WARN_ON_ONCE(pte_present(*ptep) && !pte_same(*ptep, pte)); \ + set_pte(ptep, pte); \ +}) + +#define set_pmd_safe(pmdp, pmd) \ +({ \ + WARN_ON_ONCE(pmd_present(*pmdp) && !pmd_same(*pmdp, pmd)); \ + set_pmd(pmdp, pmd); \ +}) + +#define set_pud_safe(pudp, pud) \ +({ \ + WARN_ON_ONCE(pud_present(*pudp) && !pud_same(*pudp, pud)); \ + set_pud(pudp, pud); \ +}) + +#define set_p4d_safe(p4dp, p4d) \ +({ \ + WARN_ON_ONCE(p4d_present(*p4dp) && !p4d_same(*p4dp, p4d)); \ + set_p4d(p4dp, p4d); \ +}) + +#define set_pgd_safe(pgdp, pgd) \ +({ \ + WARN_ON_ONCE(pgd_present(*pgdp) && !pgd_same(*pgdp, pgd)); \ + set_pgd(pgdp, pgd); \ +}) +#endif /* __ASSEMBLER__ */ #endif /* _ASM_X86_PGTABLE_H */ diff --git a/arch/x86/include/asm/pgtable_32.h b/arch/x86/include/asm/pgtable_32.h index 7c9c968a42ef..b612cc57a4d3 100644 --- a/arch/x86/include/asm/pgtable_32.h +++ b/arch/x86/include/asm/pgtable_32.h @@ -13,7 +13,7 @@ * This file contains the functions and defines necessary to modify and use * the i386 page table tree. */ -#ifndef __ASSEMBLY__ +#ifndef __ASSEMBLER__ #include <asm/processor.h> #include <linux/threads.h> #include <asm/paravirt.h> @@ -45,16 +45,7 @@ do { \ flush_tlb_one_kernel((vaddr)); \ } while (0) -#endif /* !__ASSEMBLY__ */ - -/* - * kern_addr_valid() is (1) for FLATMEM and (0) for SPARSEMEM - */ -#ifdef CONFIG_FLATMEM -#define kern_addr_valid(addr) (1) -#else -#define kern_addr_valid(kaddr) (0) -#endif +#endif /* !__ASSEMBLER__ */ /* * This is used to calculate the .brk reservation for initial pagetables. 
diff --git a/arch/x86/include/asm/pgtable_32_areas.h b/arch/x86/include/asm/pgtable_32_areas.h index b6355416a15a..921148b42967 100644 --- a/arch/x86/include/asm/pgtable_32_areas.h +++ b/arch/x86/include/asm/pgtable_32_areas.h @@ -13,7 +13,7 @@ */ #define VMALLOC_OFFSET (8 * 1024 * 1024) -#ifndef __ASSEMBLY__ +#ifndef __ASSEMBLER__ extern bool __vmalloc_start_set; /* set once high_memory is set */ #endif diff --git a/arch/x86/include/asm/pgtable_64.h b/arch/x86/include/asm/pgtable_64.h index e479491da8d5..f06e5d6a2747 100644 --- a/arch/x86/include/asm/pgtable_64.h +++ b/arch/x86/include/asm/pgtable_64.h @@ -5,7 +5,7 @@ #include <linux/const.h> #include <asm/pgtable_64_types.h> -#ifndef __ASSEMBLY__ +#ifndef __ASSEMBLER__ /* * This file contains the functions and defines necessary to modify and use @@ -41,11 +41,9 @@ static inline void sync_initial_page_table(void) { } pr_err("%s:%d: bad pud %p(%016lx)\n", \ __FILE__, __LINE__, &(e), pud_val(e)) -#if CONFIG_PGTABLE_LEVELS >= 5 #define p4d_ERROR(e) \ pr_err("%s:%d: bad p4d %p(%016lx)\n", \ __FILE__, __LINE__, &(e), p4d_val(e)) -#endif #define pgd_ERROR(e) \ pr_err("%s:%d: bad pgd %p(%016lx)\n", \ @@ -143,7 +141,8 @@ static inline void native_set_p4d(p4d_t *p4dp, p4d_t p4d) { pgd_t pgd; - if (pgtable_l5_enabled() || !IS_ENABLED(CONFIG_PAGE_TABLE_ISOLATION)) { + if (pgtable_l5_enabled() || + !IS_ENABLED(CONFIG_MITIGATION_PAGE_TABLE_ISOLATION)) { WRITE_ONCE(*p4dp, p4d); return; } @@ -203,7 +202,7 @@ static inline void native_pgd_clear(pgd_t *pgd) * F (2) in swp entry is used to record when a pagetable is * writeprotected by userfaultfd WP support. * - * E (3) in swp entry is used to rememeber PG_anon_exclusive. + * E (3) in swp entry is used to remember PG_anon_exclusive. * * Bit 7 in swp entry should be 0 because pmd_present checks not only P, * but also L and G. 
@@ -237,10 +236,9 @@ static inline void native_pgd_clear(pgd_t *pgd) #define __pte_to_swp_entry(pte) ((swp_entry_t) { pte_val((pte)) }) #define __pmd_to_swp_entry(pmd) ((swp_entry_t) { pmd_val((pmd)) }) -#define __swp_entry_to_pte(x) ((pte_t) { .pte = (x).val }) -#define __swp_entry_to_pmd(x) ((pmd_t) { .pmd = (x).val }) +#define __swp_entry_to_pte(x) (__pte((x).val)) +#define __swp_entry_to_pmd(x) (__pmd((x).val)) -extern int kern_addr_valid(unsigned long addr); extern void cleanup_highmap(void); #define HAVE_ARCH_UNMAPPED_AREA @@ -270,5 +268,26 @@ static inline bool gup_fast_permitted(unsigned long start, unsigned long end) #include <asm/pgtable-invert.h> -#endif /* !__ASSEMBLY__ */ +#else /* __ASSEMBLER__ */ + +#define l4_index(x) (((x) >> 39) & 511) +#define pud_index(x) (((x) >> PUD_SHIFT) & (PTRS_PER_PUD - 1)) + +L4_PAGE_OFFSET = l4_index(__PAGE_OFFSET_BASE_L4) +L4_START_KERNEL = l4_index(__START_KERNEL_map) + +L3_START_KERNEL = pud_index(__START_KERNEL_map) + +#define SYM_DATA_START_PAGE_ALIGNED(name) \ + SYM_START(name, SYM_L_GLOBAL, .balign PAGE_SIZE) + +/* Automate the creation of 1 to 1 mapping pmd entries */ +#define PMDS(START, PERM, COUNT) \ + i = 0 ; \ + .rept (COUNT) ; \ + .quad (START) + (i << PMD_SHIFT) + (PERM) ; \ + i = i + 1 ; \ + .endr + +#endif /* __ASSEMBLER__ */ #endif /* _ASM_X86_PGTABLE_64_H */ diff --git a/arch/x86/include/asm/pgtable_64_types.h b/arch/x86/include/asm/pgtable_64_types.h index 70e360a2e5fb..4604f924d8b8 100644 --- a/arch/x86/include/asm/pgtable_64_types.h +++ b/arch/x86/include/asm/pgtable_64_types.h @@ -4,7 +4,7 @@ #include <asm/sparsemem.h> -#ifndef __ASSEMBLY__ +#ifndef __ASSEMBLER__ #include <linux/types.h> #include <asm/kaslr.h> @@ -19,8 +19,8 @@ typedef unsigned long pgdval_t; typedef unsigned long pgprotval_t; typedef struct { pteval_t pte; } pte_t; +typedef struct { pmdval_t pmd; } pmd_t; -#ifdef CONFIG_X86_5LEVEL extern unsigned int __pgtable_l5_enabled; #ifdef USE_EARLY_PGTABLE_L5 @@ -36,18 +36,10 @@ static 
inline bool pgtable_l5_enabled(void) #define pgtable_l5_enabled() cpu_feature_enabled(X86_FEATURE_LA57) #endif /* USE_EARLY_PGTABLE_L5 */ -#else -#define pgtable_l5_enabled() 0 -#endif /* CONFIG_X86_5LEVEL */ - extern unsigned int pgdir_shift; extern unsigned int ptrs_per_p4d; -#endif /* !__ASSEMBLY__ */ - -#define SHARED_KERNEL_PMD 0 - -#ifdef CONFIG_X86_5LEVEL +#endif /* !__ASSEMBLER__ */ /* * PGDIR_SHIFT determines what a top-level page table entry can map @@ -66,17 +58,6 @@ extern unsigned int ptrs_per_p4d; #define MAX_POSSIBLE_PHYSMEM_BITS 52 -#else /* CONFIG_X86_5LEVEL */ - -/* - * PGDIR_SHIFT determines what a top-level page table entry can map - */ -#define PGDIR_SHIFT 39 -#define PTRS_PER_PGD 512 -#define MAX_PTRS_PER_P4D 1 - -#endif /* CONFIG_X86_5LEVEL */ - /* * 3rd level page */ @@ -103,7 +84,7 @@ extern unsigned int ptrs_per_p4d; #define PGDIR_MASK (~(PGDIR_SIZE - 1)) /* - * See Documentation/x86/x86_64/mm.rst for a description of the memory map. + * See Documentation/arch/x86/x86_64/mm.rst for a description of the memory map. * * Be very careful vs. KASLR when changing anything here. The KASLR address * range must not overlap with anything except the KASAN shadow area, which @@ -129,17 +110,60 @@ extern unsigned int ptrs_per_p4d; #define __VMEMMAP_BASE_L4 0xffffea0000000000UL #define __VMEMMAP_BASE_L5 0xffd4000000000000UL -#ifdef CONFIG_DYNAMIC_MEMORY_LAYOUT # define VMALLOC_START vmalloc_base # define VMALLOC_SIZE_TB (pgtable_l5_enabled() ? VMALLOC_SIZE_TB_L5 : VMALLOC_SIZE_TB_L4) # define VMEMMAP_START vmemmap_base + +#ifdef CONFIG_RANDOMIZE_MEMORY +# define DIRECT_MAP_PHYSMEM_END direct_map_physmem_end +#endif + +/* + * End of the region for which vmalloc page tables are pre-allocated. + * For non-KMSAN builds, this is the same as VMALLOC_END. + * For KMSAN builds, VMALLOC_START..VMEMORY_END is 4 times bigger than + * VMALLOC_START..VMALLOC_END (see below). 
+ */ +#define VMEMORY_END (VMALLOC_START + (VMALLOC_SIZE_TB << 40) - 1) + +#ifndef CONFIG_KMSAN +#define VMALLOC_END VMEMORY_END #else -# define VMALLOC_START __VMALLOC_BASE_L4 -# define VMALLOC_SIZE_TB VMALLOC_SIZE_TB_L4 -# define VMEMMAP_START __VMEMMAP_BASE_L4 -#endif /* CONFIG_DYNAMIC_MEMORY_LAYOUT */ +/* + * In KMSAN builds vmalloc area is four times smaller, and the remaining 3/4 + * are used to keep the metadata for virtual pages. The memory formerly + * belonging to vmalloc area is now laid out as follows: + * + * 1st quarter: VMALLOC_START to VMALLOC_END - new vmalloc area + * 2nd quarter: KMSAN_VMALLOC_SHADOW_START to + * VMALLOC_END+KMSAN_VMALLOC_SHADOW_OFFSET - vmalloc area shadow + * 3rd quarter: KMSAN_VMALLOC_ORIGIN_START to + * VMALLOC_END+KMSAN_VMALLOC_ORIGIN_OFFSET - vmalloc area origins + * 4th quarter: KMSAN_MODULES_SHADOW_START to KMSAN_MODULES_ORIGIN_START + * - shadow for modules, + * KMSAN_MODULES_ORIGIN_START to + * KMSAN_MODULES_ORIGIN_START + MODULES_LEN - origins for modules. + */ +#define VMALLOC_QUARTER_SIZE ((VMALLOC_SIZE_TB << 40) >> 2) +#define VMALLOC_END (VMALLOC_START + VMALLOC_QUARTER_SIZE - 1) -#define VMALLOC_END (VMALLOC_START + (VMALLOC_SIZE_TB << 40) - 1) +/* + * vmalloc metadata addresses are calculated by adding shadow/origin offsets + * to vmalloc address. + */ +#define KMSAN_VMALLOC_SHADOW_OFFSET VMALLOC_QUARTER_SIZE +#define KMSAN_VMALLOC_ORIGIN_OFFSET (VMALLOC_QUARTER_SIZE << 1) + +#define KMSAN_VMALLOC_SHADOW_START (VMALLOC_START + KMSAN_VMALLOC_SHADOW_OFFSET) +#define KMSAN_VMALLOC_ORIGIN_START (VMALLOC_START + KMSAN_VMALLOC_ORIGIN_OFFSET) + +/* + * The shadow/origin for modules are placed one by one in the last 1/4 of + * vmalloc space. 
+ */ +#define KMSAN_MODULES_SHADOW_START (VMALLOC_END + KMSAN_VMALLOC_ORIGIN_OFFSET + 1) +#define KMSAN_MODULES_ORIGIN_START (KMSAN_MODULES_SHADOW_START + MODULES_LEN) +#endif /* CONFIG_KMSAN */ #define MODULES_VADDR (__START_KERNEL_map + KERNEL_IMAGE_SIZE) /* The module sections ends with the start of the fixmap */ diff --git a/arch/x86/include/asm/pgtable_areas.h b/arch/x86/include/asm/pgtable_areas.h index d34cce1b995c..4f056fb88174 100644 --- a/arch/x86/include/asm/pgtable_areas.h +++ b/arch/x86/include/asm/pgtable_areas.h @@ -11,6 +11,12 @@ #define CPU_ENTRY_AREA_RO_IDT_VADDR ((void *)CPU_ENTRY_AREA_RO_IDT) -#define CPU_ENTRY_AREA_MAP_SIZE (CPU_ENTRY_AREA_PER_CPU + CPU_ENTRY_AREA_ARRAY_SIZE - CPU_ENTRY_AREA_BASE) +#ifdef CONFIG_X86_32 +#define CPU_ENTRY_AREA_MAP_SIZE (CPU_ENTRY_AREA_PER_CPU + \ + (CPU_ENTRY_AREA_SIZE * NR_CPUS) - \ + CPU_ENTRY_AREA_BASE) +#else +#define CPU_ENTRY_AREA_MAP_SIZE P4D_SIZE +#endif #endif /* _ASM_X86_PGTABLE_AREAS_H */ diff --git a/arch/x86/include/asm/pgtable_types.h b/arch/x86/include/asm/pgtable_types.h index bdaf8391e2e0..b74ec5c3643b 100644 --- a/arch/x86/include/asm/pgtable_types.h +++ b/arch/x86/include/asm/pgtable_types.h @@ -21,7 +21,8 @@ #define _PAGE_BIT_SOFTW2 10 /* " */ #define _PAGE_BIT_SOFTW3 11 /* " */ #define _PAGE_BIT_PAT_LARGE 12 /* On 2MB or 1GB pages */ -#define _PAGE_BIT_SOFTW4 58 /* available for programmer */ +#define _PAGE_BIT_SOFTW4 57 /* available for programmer */ +#define _PAGE_BIT_SOFTW5 58 /* available for programmer */ #define _PAGE_BIT_PKEY_BIT0 59 /* Protection Keys, bit 1/4 */ #define _PAGE_BIT_PKEY_BIT1 60 /* Protection Keys, bit 2/4 */ #define _PAGE_BIT_PKEY_BIT2 61 /* Protection Keys, bit 3/4 */ @@ -32,8 +33,18 @@ #define _PAGE_BIT_CPA_TEST _PAGE_BIT_SOFTW1 #define _PAGE_BIT_UFFD_WP _PAGE_BIT_SOFTW2 /* userfaultfd wrprotected */ #define _PAGE_BIT_SOFT_DIRTY _PAGE_BIT_SOFTW3 /* software dirty tracking */ +#define _PAGE_BIT_KERNEL_4K _PAGE_BIT_SOFTW3 /* page must not be converted to large */ 
#define _PAGE_BIT_DEVMAP _PAGE_BIT_SOFTW4 +#ifdef CONFIG_X86_64 +#define _PAGE_BIT_SAVED_DIRTY _PAGE_BIT_SOFTW5 /* Saved Dirty bit (leaf) */ +#define _PAGE_BIT_NOPTISHADOW _PAGE_BIT_SOFTW5 /* No PTI shadow (root PGD) */ +#else +/* Shared with _PAGE_BIT_UFFD_WP which is not supported on 32 bit */ +#define _PAGE_BIT_SAVED_DIRTY _PAGE_BIT_SOFTW2 /* Saved Dirty bit (leaf) */ +#define _PAGE_BIT_NOPTISHADOW _PAGE_BIT_SOFTW2 /* No PTI shadow (root PGD) */ +#endif + /* If _PAGE_BIT_PRESENT is clear, we use these: */ /* - if the user mapped it with PROT_NONE; pte_present gives true */ #define _PAGE_BIT_PROTNONE _PAGE_BIT_GLOBAL @@ -54,6 +65,7 @@ #define _PAGE_PAT_LARGE (_AT(pteval_t, 1) << _PAGE_BIT_PAT_LARGE) #define _PAGE_SPECIAL (_AT(pteval_t, 1) << _PAGE_BIT_SPECIAL) #define _PAGE_CPA_TEST (_AT(pteval_t, 1) << _PAGE_BIT_CPA_TEST) +#define _PAGE_KERNEL_4K (_AT(pteval_t, 1) << _PAGE_BIT_KERNEL_4K) #ifdef CONFIG_X86_INTEL_MEMORY_PROTECTION_KEYS #define _PAGE_PKEY_BIT0 (_AT(pteval_t, 1) << _PAGE_BIT_PKEY_BIT0) #define _PAGE_PKEY_BIT1 (_AT(pteval_t, 1) << _PAGE_BIT_PKEY_BIT1) @@ -117,19 +129,34 @@ #define _PAGE_SOFTW4 (_AT(pteval_t, 0)) #endif +/* + * The hardware requires shadow stack to be Write=0,Dirty=1. However, + * there are valid cases where the kernel might create read-only PTEs that + * are dirty (e.g., fork(), mprotect(), uffd-wp(), soft-dirty tracking). In + * this case, the _PAGE_SAVED_DIRTY bit is used instead of the HW-dirty bit, + * to avoid creating a wrong "shadow stack" PTEs. Such PTEs have + * (Write=0,SavedDirty=1,Dirty=0) set. + */ +#define _PAGE_SAVED_DIRTY (_AT(pteval_t, 1) << _PAGE_BIT_SAVED_DIRTY) + +#define _PAGE_DIRTY_BITS (_PAGE_DIRTY | _PAGE_SAVED_DIRTY) + #define _PAGE_PROTNONE (_AT(pteval_t, 1) << _PAGE_BIT_PROTNONE) +#define _PAGE_NOPTISHADOW (_AT(pteval_t, 1) << _PAGE_BIT_NOPTISHADOW) + /* * Set of bits not changed in pte_modify. 
The pte's * protection key is treated like _PAGE_RW, for * instance, and is *not* included in this mask since * pte_modify() does modify it. */ -#define _PAGE_CHG_MASK (PTE_PFN_MASK | _PAGE_PCD | _PAGE_PWT | \ - _PAGE_SPECIAL | _PAGE_ACCESSED | _PAGE_DIRTY | \ - _PAGE_SOFT_DIRTY | _PAGE_DEVMAP | _PAGE_ENC | \ - _PAGE_UFFD_WP) -#define _HPAGE_CHG_MASK (_PAGE_CHG_MASK | _PAGE_PSE) +#define _COMMON_PAGE_CHG_MASK (PTE_PFN_MASK | _PAGE_PCD | _PAGE_PWT | \ + _PAGE_SPECIAL | _PAGE_ACCESSED | \ + _PAGE_DIRTY_BITS | _PAGE_SOFT_DIRTY | \ + _PAGE_DEVMAP | _PAGE_CC | _PAGE_UFFD_WP) +#define _PAGE_CHG_MASK (_COMMON_PAGE_CHG_MASK | _PAGE_PAT) +#define _HPAGE_CHG_MASK (_COMMON_PAGE_CHG_MASK | _PAGE_PSE | _PAGE_PAT_LARGE) /* * The cache modes defined here are used to translate between pure SW usage @@ -139,7 +166,7 @@ * to have the WB mode at index 0 (all bits clear). This is the default * right now and likely would break too much if changed. */ -#ifndef __ASSEMBLY__ +#ifndef __ASSEMBLER__ enum page_cache_mode { _PAGE_CACHE_MODE_WB = 0, _PAGE_CACHE_MODE_WC = 1, @@ -152,6 +179,7 @@ enum page_cache_mode { }; #endif +#define _PAGE_CC (_AT(pteval_t, cc_get_mask())) #define _PAGE_ENC (_AT(pteval_t, sme_me_mask)) #define _PAGE_CACHE_MASK (_PAGE_PWT | _PAGE_PCD | _PAGE_PAT) @@ -188,14 +216,22 @@ enum page_cache_mode { #define __PAGE_KERNEL (__PP|__RW| 0|___A|__NX|___D| 0|___G) #define __PAGE_KERNEL_EXEC (__PP|__RW| 0|___A| 0|___D| 0|___G) + +/* + * Page tables need to have Write=1 in order for any lower PTEs to be + * writable.
This includes shadow stack memory (Write=0, Dirty=1) + */ #define _KERNPG_TABLE_NOENC (__PP|__RW| 0|___A| 0|___D| 0| 0) #define _KERNPG_TABLE (__PP|__RW| 0|___A| 0|___D| 0| 0| _ENC) #define _PAGE_TABLE_NOENC (__PP|__RW|_USR|___A| 0|___D| 0| 0) #define _PAGE_TABLE (__PP|__RW|_USR|___A| 0|___D| 0| 0| _ENC) -#define __PAGE_KERNEL_RO (__PP| 0| 0|___A|__NX|___D| 0|___G) -#define __PAGE_KERNEL_ROX (__PP| 0| 0|___A| 0|___D| 0|___G) + +#define __PAGE_KERNEL_RO (__PP| 0| 0|___A|__NX| 0| 0|___G) +#define __PAGE_KERNEL_ROX (__PP| 0| 0|___A| 0| 0| 0|___G) +#define __PAGE_KERNEL (__PP|__RW| 0|___A|__NX|___D| 0|___G) +#define __PAGE_KERNEL_EXEC (__PP|__RW| 0|___A| 0|___D| 0|___G) #define __PAGE_KERNEL_NOCACHE (__PP|__RW| 0|___A|__NX|___D| 0|___G| __NC) -#define __PAGE_KERNEL_VVAR (__PP| 0|_USR|___A|__NX|___D| 0|___G) +#define __PAGE_KERNEL_VVAR (__PP| 0|_USR|___A|__NX| 0| 0|___G) #define __PAGE_KERNEL_LARGE (__PP|__RW| 0|___A|__NX|___D|_PSE|___G) #define __PAGE_KERNEL_LARGE_EXEC (__PP|__RW| 0|___A| 0|___D|_PSE|___G) #define __PAGE_KERNEL_WP (__PP|__RW| 0|___A|__NX|___D| 0|___G| __WP) @@ -205,7 +241,7 @@ enum page_cache_mode { #define __PAGE_KERNEL_IO_NOCACHE __PAGE_KERNEL_NOCACHE -#ifndef __ASSEMBLY__ +#ifndef __ASSEMBLER__ #define __PAGE_KERNEL_ENC (__PAGE_KERNEL | _ENC) #define __PAGE_KERNEL_ENC_WP (__PAGE_KERNEL_WP | _ENC) @@ -228,26 +264,7 @@ enum page_cache_mode { #define PAGE_KERNEL_IO __pgprot_mask(__PAGE_KERNEL_IO) #define PAGE_KERNEL_IO_NOCACHE __pgprot_mask(__PAGE_KERNEL_IO_NOCACHE) -#endif /* __ASSEMBLY__ */ - -/* xwr */ -#define __P000 PAGE_NONE -#define __P001 PAGE_READONLY -#define __P010 PAGE_COPY -#define __P011 PAGE_COPY -#define __P100 PAGE_READONLY_EXEC -#define __P101 PAGE_READONLY_EXEC -#define __P110 PAGE_COPY_EXEC -#define __P111 PAGE_COPY_EXEC - -#define __S000 PAGE_NONE -#define __S001 PAGE_READONLY -#define __S010 PAGE_SHARED -#define __S011 PAGE_SHARED -#define __S100 PAGE_READONLY_EXEC -#define __S101 PAGE_READONLY_EXEC -#define __S110 
PAGE_SHARED_EXEC -#define __S111 PAGE_SHARED_EXEC +#endif /* __ASSEMBLER__ */ /* * early identity mapping pte attrib macros. @@ -266,7 +283,7 @@ enum page_cache_mode { # include <asm/pgtable_64_types.h> #endif -#ifndef __ASSEMBLY__ +#ifndef __ASSEMBLER__ #include <linux/types.h> @@ -380,11 +397,9 @@ static inline pudval_t native_pud_val(pud_t pud) #endif #if CONFIG_PGTABLE_LEVELS > 2 -typedef struct { pmdval_t pmd; } pmd_t; - static inline pmd_t native_make_pmd(pmdval_t val) { - return (pmd_t) { val }; + return (pmd_t) { .pmd = val }; } static inline pmdval_t native_pmd_val(pmd_t pmd) @@ -508,8 +523,6 @@ typedef struct page *pgtable_t; extern pteval_t __supported_pte_mask; extern pteval_t __default_kernel_pte_mask; -extern void set_nx(void); -extern int nx_enabled; #define pgprot_writecombine pgprot_writecombine extern pgprot_t pgprot_writecombine(pgprot_t prot); @@ -534,15 +547,13 @@ extern void native_pagetable_init(void); #define native_pagetable_init paging_init #endif -struct seq_file; -extern void arch_report_meminfo(struct seq_file *m); - enum pg_level { PG_LEVEL_NONE, PG_LEVEL_4K, PG_LEVEL_2M, PG_LEVEL_1G, PG_LEVEL_512G, + PG_LEVEL_256T, PG_LEVEL_NUM }; @@ -561,6 +572,8 @@ static inline void update_page_count(int level, unsigned long pages) { } extern pte_t *lookup_address(unsigned long address, unsigned int *level); extern pte_t *lookup_address_in_pgd(pgd_t *pgd, unsigned long address, unsigned int *level); +pte_t *lookup_address_in_pgd_attr(pgd_t *pgd, unsigned long address, + unsigned int *level, bool *nx, bool *rw); extern pmd_t *lookup_pmd_address(unsigned long address); extern phys_addr_t slow_virt_to_phys(void *__address); extern int __init kernel_map_pages_in_pgd(pgd_t *pgd, u64 pfn, @@ -569,6 +582,6 @@ extern int __init kernel_map_pages_in_pgd(pgd_t *pgd, u64 pfn, unsigned long page_flags); extern int __init kernel_unmap_pages_in_pgd(pgd_t *pgd, unsigned long address, unsigned long numpages); -#endif /* !__ASSEMBLY__ */ +#endif /* !__ASSEMBLER__ */ 
#endif /* _ASM_X86_PGTABLE_DEFS_H */ diff --git a/arch/x86/include/asm/posted_intr.h b/arch/x86/include/asm/posted_intr.h new file mode 100644 index 000000000000..a5f761fbf45b --- /dev/null +++ b/arch/x86/include/asm/posted_intr.h @@ -0,0 +1,187 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _X86_POSTED_INTR_H +#define _X86_POSTED_INTR_H + +#include <asm/cmpxchg.h> +#include <asm/rwonce.h> +#include <asm/irq_vectors.h> + +#include <linux/bitmap.h> + +#define POSTED_INTR_ON 0 +#define POSTED_INTR_SN 1 + +#define PID_TABLE_ENTRY_VALID 1 + +#define NR_PIR_VECTORS 256 +#define NR_PIR_WORDS (NR_PIR_VECTORS / BITS_PER_LONG) + +/* Posted-Interrupt Descriptor */ +struct pi_desc { + unsigned long pir[NR_PIR_WORDS]; /* Posted interrupt requested */ + union { + struct { + u16 notifications; /* Suppress and outstanding bits */ + u8 nv; + u8 rsvd_2; + u32 ndst; + }; + u64 control; + }; + u32 rsvd[6]; +} __aligned(64); + +/* + * De-multiplexing posted interrupts is on the performance path, the code + * below is written to optimize the cache performance based on the following + * considerations: + * 1.Posted interrupt descriptor (PID) fits in a cache line that is frequently + * accessed by both CPU and IOMMU. + * 2.During software processing of posted interrupts, the CPU needs to do + * natural width read and xchg for checking and clearing posted interrupt + * request (PIR), a 256 bit field within the PID. + * 3.On the other side, the IOMMU does atomic swaps of the entire PID cache + * line when posting interrupts and setting control bits. + * 4.The CPU can access the cache line a magnitude faster than the IOMMU. + * 5.Each time the IOMMU does interrupt posting to the PIR will evict the PID + * cache line. The cache line states after each operation are as follows, + * assuming a 64-bit kernel: + * CPU IOMMU PID Cache line state + * --------------------------------------------------------------- + *...read64 exclusive + *...lock xchg64 modified + *... 
post/atomic swap invalid + *...------------------------------------------------------------- + * + * To reduce L1 data cache miss, it is important to avoid contention with + * IOMMU's interrupt posting/atomic swap. Therefore, a copy of PIR is used + * when processing posted interrupts in software, e.g. to dispatch interrupt + * handlers for posted MSIs, or to move interrupts from the PIR to the vIRR + * in KVM. + * + * In addition, the code is trying to keep the cache line state consistent + * as much as possible. e.g. when making a copy and clearing the PIR + * (assuming non-zero PIR bits are present in the entire PIR), it does: + * read, read, read, read, xchg, xchg, xchg, xchg + * instead of: + * read, xchg, read, xchg, read, xchg, read, xchg + */ +static __always_inline bool pi_harvest_pir(unsigned long *pir, + unsigned long *pir_vals) +{ + unsigned long pending = 0; + int i; + + for (i = 0; i < NR_PIR_WORDS; i++) { + pir_vals[i] = READ_ONCE(pir[i]); + pending |= pir_vals[i]; + } + + if (!pending) + return false; + + for (i = 0; i < NR_PIR_WORDS; i++) { + if (!pir_vals[i]) + continue; + + pir_vals[i] = arch_xchg(&pir[i], 0); + } + + return true; +} + +static inline bool pi_test_and_set_on(struct pi_desc *pi_desc) +{ + return test_and_set_bit(POSTED_INTR_ON, (unsigned long *)&pi_desc->control); +} + +static inline bool pi_test_and_clear_on(struct pi_desc *pi_desc) +{ + return test_and_clear_bit(POSTED_INTR_ON, (unsigned long *)&pi_desc->control); +} + +static inline bool pi_test_and_clear_sn(struct pi_desc *pi_desc) +{ + return test_and_clear_bit(POSTED_INTR_SN, (unsigned long *)&pi_desc->control); +} + +static inline bool pi_test_and_set_pir(int vector, struct pi_desc *pi_desc) +{ + return test_and_set_bit(vector, pi_desc->pir); +} + +static inline bool pi_is_pir_empty(struct pi_desc *pi_desc) +{ + return bitmap_empty(pi_desc->pir, NR_VECTORS); +} + +static inline void pi_set_sn(struct pi_desc *pi_desc) +{ + set_bit(POSTED_INTR_SN, (unsigned long 
*)&pi_desc->control); +} + +static inline void pi_set_on(struct pi_desc *pi_desc) +{ + set_bit(POSTED_INTR_ON, (unsigned long *)&pi_desc->control); +} + +static inline void pi_clear_on(struct pi_desc *pi_desc) +{ + clear_bit(POSTED_INTR_ON, (unsigned long *)&pi_desc->control); +} + +static inline void pi_clear_sn(struct pi_desc *pi_desc) +{ + clear_bit(POSTED_INTR_SN, (unsigned long *)&pi_desc->control); +} + +static inline bool pi_test_on(struct pi_desc *pi_desc) +{ + return test_bit(POSTED_INTR_ON, (unsigned long *)&pi_desc->control); +} + +static inline bool pi_test_sn(struct pi_desc *pi_desc) +{ + return test_bit(POSTED_INTR_SN, (unsigned long *)&pi_desc->control); +} + +static inline bool pi_test_pir(int vector, struct pi_desc *pi_desc) +{ + return test_bit(vector, (unsigned long *)pi_desc->pir); +} + +/* Non-atomic helpers */ +static inline void __pi_set_sn(struct pi_desc *pi_desc) +{ + pi_desc->notifications |= BIT(POSTED_INTR_SN); +} + +static inline void __pi_clear_sn(struct pi_desc *pi_desc) +{ + pi_desc->notifications &= ~BIT(POSTED_INTR_SN); +} + +#ifdef CONFIG_X86_POSTED_MSI +/* + * Not all external vectors are subject to interrupt remapping, e.g. IOMMU's + * own interrupts. Here we do not distinguish them since those vector bits in + * PIR will always be zero. 
+ */ +static inline bool pi_pending_this_cpu(unsigned int vector) +{ + struct pi_desc *pid = this_cpu_ptr(&posted_msi_pi_desc); + + /* pir[] is scanned as an NR_VECTORS-bit bitmap (see pi_is_pir_empty()), so NR_VECTORS itself is out of range. */ + if (WARN_ON_ONCE(vector >= NR_VECTORS || vector < FIRST_EXTERNAL_VECTOR)) + return false; + + return test_bit(vector, pid->pir); +} + +extern void intel_posted_msi_init(void); +#else +static inline bool pi_pending_this_cpu(unsigned int vector) { return false; } + +static inline void intel_posted_msi_init(void) {} +#endif /* CONFIG_X86_POSTED_MSI */ + +#endif /* _X86_POSTED_INTR_H */ diff --git a/arch/x86/include/asm/preempt.h b/arch/x86/include/asm/preempt.h index 5f6daea1ee24..578441db09f0 100644 --- a/arch/x86/include/asm/preempt.h +++ b/arch/x86/include/asm/preempt.h @@ -4,10 +4,10 @@ #include <asm/rmwcc.h> #include <asm/percpu.h> -#include <linux/thread_info.h> + #include <linux/static_call_types.h> -DECLARE_PER_CPU(int, __preempt_count); +DECLARE_PER_CPU_CACHE_HOT(int, __preempt_count); /* We use the MSB mostly because its available */ #define PREEMPT_NEED_RESCHED 0x80000000 @@ -31,11 +31,11 @@ static __always_inline void preempt_count_set(int pc) { int old, new; + old = raw_cpu_read_4(__preempt_count); do { - old = raw_cpu_read_4(__preempt_count); new = (old & PREEMPT_NEED_RESCHED) | (pc & ~PREEMPT_NEED_RESCHED); - } while (raw_cpu_cmpxchg_4(__preempt_count, old, new) != old); + } while (!raw_cpu_try_cmpxchg_4(__preempt_count, &old, new)); } /* @@ -92,7 +92,8 @@ static __always_inline void __preempt_count_sub(int val) */ static __always_inline bool __preempt_count_dec_and_test(void) { - return GEN_UNARY_RMWcc("decl", __preempt_count, e, __percpu_arg([var])); + return GEN_UNARY_RMWcc("decl", __my_cpu_var(__preempt_count), e, + __percpu_arg([var])); } /* diff --git a/arch/x86/include/asm/processor-flags.h b/arch/x86/include/asm/processor-flags.h index 02c2cbda4a74..e5f204b9b33d 100644 --- a/arch/x86/include/asm/processor-flags.h +++ b/arch/x86/include/asm/processor-flags.h @@ -28,6 +28,8 @@ * On systems with SME, one bit (in a variable
position!) is stolen to indicate * that the top-level paging structure is encrypted. * + * On systems with LAM, bits 61 and 62 are used to indicate LAM mode. + * * All of the remaining bits indicate the physical address of the top-level * paging structure. * @@ -35,7 +37,7 @@ */ #ifdef CONFIG_X86_64 /* Mask off the address space ID and SME encryption bits. */ -#define CR3_ADDR_MASK __sme_clr(0x7FFFFFFFFFFFF000ull) +#define CR3_ADDR_MASK __sme_clr(PHYSICAL_PAGE_MASK) #define CR3_PCID_MASK 0xFFFull #define CR3_NOFLUSH BIT_ULL(63) @@ -49,7 +51,7 @@ #define CR3_NOFLUSH 0 #endif -#ifdef CONFIG_PAGE_TABLE_ISOLATION +#ifdef CONFIG_MITIGATION_PAGE_TABLE_ISOLATION # define X86_CR3_PTI_PCID_USER_BIT 11 #endif diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h index 356308c73951..bde58f6510ac 100644 --- a/arch/x86/include/asm/processor.h +++ b/arch/x86/include/asm/processor.h @@ -16,10 +16,10 @@ struct vm86; #include <uapi/asm/sigcontext.h> #include <asm/current.h> #include <asm/cpufeatures.h> +#include <asm/cpuid/api.h> #include <asm/page.h> #include <asm/pgtable_types.h> #include <asm/percpu.h> -#include <asm/msr.h> #include <asm/desc_defs.h> #include <asm/nops.h> #include <asm/special_insns.h> @@ -27,6 +27,7 @@ struct vm86; #include <asm/unwind_hints.h> #include <asm/vmxfeatures.h> #include <asm/vdso/processor.h> +#include <asm/shstk.h> #include <linux/personality.h> #include <linux/cache.h> @@ -59,29 +60,85 @@ struct vm86; # define ARCH_MIN_MMSTRUCT_ALIGN 0 #endif -enum tlb_infos { - ENTRIES, - NR_INFO -}; - -extern u16 __read_mostly tlb_lli_4k[NR_INFO]; -extern u16 __read_mostly tlb_lli_2m[NR_INFO]; -extern u16 __read_mostly tlb_lli_4m[NR_INFO]; -extern u16 __read_mostly tlb_lld_4k[NR_INFO]; -extern u16 __read_mostly tlb_lld_2m[NR_INFO]; -extern u16 __read_mostly tlb_lld_4m[NR_INFO]; -extern u16 __read_mostly tlb_lld_1g[NR_INFO]; +extern u16 __read_mostly tlb_lli_4k; +extern u16 __read_mostly tlb_lli_2m; +extern u16 __read_mostly
tlb_lli_4m; +extern u16 __read_mostly tlb_lld_4k; +extern u16 __read_mostly tlb_lld_2m; +extern u16 __read_mostly tlb_lld_4m; +extern u16 __read_mostly tlb_lld_1g; /* - * CPU type and hardware bug flags. Kept separately for each CPU. - * Members of this structure are referenced in head_32.S, so think twice - * before touching them. [mj] + * CPU type and hardware bug flags. Kept separately for each CPU. */ +struct cpuinfo_topology { + // Real APIC ID read from the local APIC + u32 apicid; + // The initial APIC ID provided by CPUID + u32 initial_apicid; + + // Physical package ID + u32 pkg_id; + + // Physical die ID on AMD, Relative on Intel + u32 die_id; + + // Compute unit ID - AMD specific + u32 cu_id; + + // Core ID relative to the package + u32 core_id; + + // Logical ID mappings + u32 logical_pkg_id; + u32 logical_die_id; + u32 logical_core_id; + + // AMD Node ID and Nodes per Package info + u32 amd_node_id; + + // Cache level topology IDs + u32 llc_id; + u32 l2c_id; + + // Hardware defined CPU-type + union { + u32 cpu_type; + struct { + // CPUID.1A.EAX[23-0] + u32 intel_native_model_id :24; + // CPUID.1A.EAX[31-24] + u32 intel_type :8; + }; + struct { + // CPUID 0x80000026.EBX + u32 amd_num_processors :16, + amd_power_eff_ranking :8, + amd_native_model_id :4, + amd_type :4; + }; + }; +}; + struct cpuinfo_x86 { - __u8 x86; /* CPU family */ - __u8 x86_vendor; /* CPU vendor */ - __u8 x86_model; + union { + /* + * The particular ordering (low-to-high) of (vendor, + * family, model) is done in case range of models, like + * it is usually done on AMD, need to be compared. 
+ */ + struct { + __u8 x86_model; + /* CPU family */ + __u8 x86; + /* CPU vendor */ + __u8 x86_vendor; + __u8 x86_reserved; + }; + /* combined vendor, family, model */ + __u32 x86_vfm; + }; __u8 x86_stepping; #ifdef CONFIG_X86_64 /* Number of 4K pages in DTLB/ITLB combined(in pages): */ @@ -92,9 +149,6 @@ struct cpuinfo_x86 { #endif __u8 x86_virt_bits; __u8 x86_phys_bits; - /* CPUID returned core id bits: */ - __u8 x86_coreid_bits; - __u8 cu_id; /* Max extended CPUID function supported: */ __u32 extended_cpuid_level; /* Maximum supported CPUID level, -1=no CPUID: */ @@ -110,6 +164,7 @@ struct cpuinfo_x86 { }; char x86_vendor_id[16]; char x86_model_id[64]; + struct cpuinfo_topology topo; /* in KB - valid for CPUS which support this call: */ unsigned int x86_cache_size; int x86_cache_alignment; /* In bytes */ @@ -121,21 +176,9 @@ struct cpuinfo_x86 { unsigned long loops_per_jiffy; /* protected processor identification number */ u64 ppin; - /* cpuid returned max cores value: */ - u16 x86_max_cores; - u16 apicid; - u16 initial_apicid; u16 x86_clflush_size; /* number of cores as seen by the OS: */ u16 booted_cores; - /* Physical processor id: */ - u16 phys_proc_id; - /* Logical processor id: */ - u16 logical_proc_id; - /* Core id: */ - u16 cpu_core_id; - u16 cpu_die_id; - u16 logical_die_id; /* Index into per_cpu list: */ u16 cpu_index; /* Is SMT active on this core? 
*/ @@ -146,17 +189,6 @@ struct cpuinfo_x86 { unsigned initialized : 1; } __randomize_layout; -struct cpuid_regs { - u32 eax, ebx, ecx, edx; -}; - -enum cpuid_regs_idx { - CPUID_EAX = 0, - CPUID_EBX, - CPUID_ECX, - CPUID_EDX, -}; - #define X86_VENDOR_INTEL 0 #define X86_VENDOR_CYRIX 1 #define X86_VENDOR_AMD 2 @@ -180,13 +212,8 @@ extern struct cpuinfo_x86 new_cpu_data; extern __u32 cpu_caps_cleared[NCAPINTS + NBUGINTS]; extern __u32 cpu_caps_set[NCAPINTS + NBUGINTS]; -#ifdef CONFIG_SMP DECLARE_PER_CPU_READ_MOSTLY(struct cpuinfo_x86, cpu_info); #define cpu_data(cpu) per_cpu(cpu_info, cpu) -#else -#define cpu_info boot_cpu_data -#define cpu_data(cpu) boot_cpu_data -#endif extern const struct seq_operations cpuinfo_op; @@ -199,51 +226,13 @@ static inline unsigned long long l1tf_pfn_limit(void) return BIT_ULL(boot_cpu_data.x86_cache_bits - 1 - PAGE_SHIFT); } +void init_cpu_devs(void); +void get_cpu_vendor(struct cpuinfo_x86 *c); extern void early_cpu_init(void); -extern void identify_boot_cpu(void); -extern void identify_secondary_cpu(struct cpuinfo_x86 *); +extern void identify_secondary_cpu(unsigned int cpu); extern void print_cpu_info(struct cpuinfo_x86 *); void print_cpu_msr(struct cpuinfo_x86 *); -#ifdef CONFIG_X86_32 -extern int have_cpuid_p(void); -#else -static inline int have_cpuid_p(void) -{ - return 1; -} -#endif -static inline void native_cpuid(unsigned int *eax, unsigned int *ebx, - unsigned int *ecx, unsigned int *edx) -{ - /* ecx is often an input as well as an output. */ - asm volatile("cpuid" - : "=a" (*eax), - "=b" (*ebx), - "=c" (*ecx), - "=d" (*edx) - : "0" (*eax), "2" (*ecx) - : "memory"); -} - -#define native_cpuid_reg(reg) \ -static inline unsigned int native_cpuid_##reg(unsigned int op) \ -{ \ - unsigned int eax = op, ebx, ecx = 0, edx; \ - \ - native_cpuid(&eax, &ebx, &ecx, &edx); \ - \ - return reg; \ -} - -/* - * Native CPUID functions returning a single datum. 
- */ -native_cpuid_reg(eax) -native_cpuid_reg(ebx) -native_cpuid_reg(ecx) -native_cpuid_reg(edx) - /* * Friendlier CR3 helpers. */ @@ -426,43 +415,33 @@ struct irq_stack { char stack[IRQ_STACK_SIZE]; } __aligned(IRQ_STACK_SIZE); -DECLARE_PER_CPU(unsigned long, cpu_current_top_of_stack); - +DECLARE_PER_CPU_CACHE_HOT(struct irq_stack *, hardirq_stack_ptr); #ifdef CONFIG_X86_64 -struct fixed_percpu_data { - /* - * GCC hardcodes the stack canary as %gs:40. Since the - * irq_stack is the object at %gs:0, we reserve the bottom - * 48 bytes of the irq stack for the canary. - * - * Once we are willing to require -mstack-protector-guard-symbol= - * support for x86_64 stackprotector, we can get rid of this. - */ - char gs_base[40]; - unsigned long stack_canary; -}; +DECLARE_PER_CPU_CACHE_HOT(bool, hardirq_stack_inuse); +#else +DECLARE_PER_CPU_CACHE_HOT(struct irq_stack *, softirq_stack_ptr); +#endif -DECLARE_PER_CPU_FIRST(struct fixed_percpu_data, fixed_percpu_data) __visible; -DECLARE_INIT_PER_CPU(fixed_percpu_data); +DECLARE_PER_CPU_CACHE_HOT(unsigned long, cpu_current_top_of_stack); +/* const-qualified alias provided by the linker. 
*/ +DECLARE_PER_CPU_CACHE_HOT(const unsigned long __percpu_seg_override, + const_cpu_current_top_of_stack); +#ifdef CONFIG_X86_64 static inline unsigned long cpu_kernelmode_gs_base(int cpu) { - return (unsigned long)per_cpu(fixed_percpu_data.gs_base, cpu); +#ifdef CONFIG_SMP + return per_cpu_offset(cpu); +#else + return 0; +#endif } -DECLARE_PER_CPU(void *, hardirq_stack_ptr); -DECLARE_PER_CPU(bool, hardirq_stack_inuse); -extern asmlinkage void ignore_sysret(void); +extern asmlinkage void entry_SYSCALL32_ignore(void); /* Save actual FS/GS selectors and bases to current->thread */ void current_save_fsgs(void); -#else /* X86_64 */ -#ifdef CONFIG_STACKPROTECTOR -DECLARE_PER_CPU(unsigned long, __stack_chk_guard); -#endif -DECLARE_PER_CPU(struct irq_stack *, hardirq_stack_ptr); -DECLARE_PER_CPU(struct irq_stack *, softirq_stack_ptr); -#endif /* !X86_64 */ +#endif /* X86_64 */ struct perf_event; @@ -519,7 +498,6 @@ struct thread_struct { unsigned long iopl_emul; unsigned int iopl_warn:1; - unsigned int sig_on_uaccess_err:1; /* * Protection Keys Register for Userspace. Loaded immediately on @@ -530,14 +508,20 @@ struct thread_struct { */ u32 pkru; - /* Floating point and extended processor state */ - struct fpu fpu; - /* - * WARNING: 'fpu' is dynamically-sized. It *MUST* be at - * the end. - */ +#ifdef CONFIG_X86_USER_SHADOW_STACK + unsigned long features; + unsigned long features_locked; + + struct thread_shstk shstk; +#endif }; +#ifdef CONFIG_X86_DEBUG_FPU +extern struct fpu *x86_task_fpu(struct task_struct *task); +#else +# define x86_task_fpu(task) ((struct fpu *)((void *)(task) + sizeof(*(task)))) +#endif + extern void fpu_thread_struct_whitelist(unsigned long *offset, unsigned long *size); static inline void arch_thread_struct_whitelist(unsigned long *offset, @@ -566,6 +550,9 @@ static __always_inline unsigned long current_top_of_stack(void) * and around vm86 mode and sp0 on x86_64 is special because of the * entry trampoline. 
*/ + if (IS_ENABLED(CONFIG_USE_X86_SEG_SUPPORT)) + return this_cpu_read_const(const_cpu_current_top_of_stack); + return this_cpu_read_stable(cpu_current_top_of_stack); } @@ -578,7 +565,6 @@ static __always_inline bool on_thread_stack(void) #ifdef CONFIG_PARAVIRT_XXL #include <asm/paravirt.h> #else -#define __cpuid native_cpuid static inline void load_sp0(unsigned long sp0) { @@ -587,75 +573,9 @@ static inline void load_sp0(unsigned long sp0) #endif /* CONFIG_PARAVIRT_XXL */ -/* Free all resources held by a thread. */ -extern void release_thread(struct task_struct *); - unsigned long __get_wchan(struct task_struct *p); -/* - * Generic CPUID function - * clear %ecx since some cpus (Cyrix MII) do not set or clear %ecx - * resulting in stale register contents being returned. - */ -static inline void cpuid(unsigned int op, - unsigned int *eax, unsigned int *ebx, - unsigned int *ecx, unsigned int *edx) -{ - *eax = op; - *ecx = 0; - __cpuid(eax, ebx, ecx, edx); -} - -/* Some CPUID calls want 'count' to be placed in ecx */ -static inline void cpuid_count(unsigned int op, int count, - unsigned int *eax, unsigned int *ebx, - unsigned int *ecx, unsigned int *edx) -{ - *eax = op; - *ecx = count; - __cpuid(eax, ebx, ecx, edx); -} - -/* - * CPUID functions returning a single datum - */ -static inline unsigned int cpuid_eax(unsigned int op) -{ - unsigned int eax, ebx, ecx, edx; - - cpuid(op, &eax, &ebx, &ecx, &edx); - - return eax; -} - -static inline unsigned int cpuid_ebx(unsigned int op) -{ - unsigned int eax, ebx, ecx, edx; - - cpuid(op, &eax, &ebx, &ecx, &edx); - - return ebx; -} - -static inline unsigned int cpuid_ecx(unsigned int op) -{ - unsigned int eax, ebx, ecx, edx; - - cpuid(op, &eax, &ebx, &ecx, &edx); - - return ecx; -} - -static inline unsigned int cpuid_edx(unsigned int op) -{ - unsigned int eax, ebx, ecx, edx; - - cpuid(op, &eax, &ebx, &ecx, &edx); - - return edx; -} - -extern void select_idle_routine(const struct cpuinfo_x86 *c); +extern void 
select_idle_routine(void); extern void amd_e400_c1e_apic_setup(void); extern unsigned long boot_option_idle_override; @@ -664,43 +584,19 @@ enum idle_boot_override {IDLE_NO_OVERRIDE=0, IDLE_HALT, IDLE_NOMWAIT, IDLE_POLL}; extern void enable_sep_cpu(void); -extern int sysenter_setup(void); /* Defined in head.S */ extern struct desc_ptr early_gdt_descr; -extern void switch_to_new_gdt(int); +extern void switch_gdt_and_percpu_base(int); extern void load_direct_gdt(int); extern void load_fixmap_gdt(int); -extern void load_percpu_segment(int); extern void cpu_init(void); -extern void cpu_init_secondary(void); -extern void cpu_init_exception_handling(void); +extern void cpu_init_exception_handling(bool boot_cpu); +extern void cpu_init_replace_early_idt(void); extern void cr4_init(void); -static inline unsigned long get_debugctlmsr(void) -{ - unsigned long debugctlmsr = 0; - -#ifndef CONFIG_X86_DEBUGCTLMSR - if (boot_cpu_data.x86 < 6) - return 0; -#endif - rdmsrl(MSR_IA32_DEBUGCTLMSR, debugctlmsr); - - return debugctlmsr; -} - -static inline void update_debugctlmsr(unsigned long debugctlmsr) -{ -#ifndef CONFIG_X86_DEBUGCTLMSR - if (boot_cpu_data.x86 < 6) - return; -#endif - wrmsrl(MSR_IA32_DEBUGCTLMSR, debugctlmsr); -} - extern void set_task_blockstep(struct task_struct *task, bool on); /* Boot loader type from the setup header: */ @@ -711,13 +607,12 @@ extern char ignore_fpu_irq; #define HAVE_ARCH_PICK_MMAP_LAYOUT 1 #define ARCH_HAS_PREFETCHW -#define ARCH_HAS_SPINLOCK_PREFETCH #ifdef CONFIG_X86_32 # define BASE_PREFETCH "" # define ARCH_HAS_PREFETCH #else -# define BASE_PREFETCH "prefetcht0 %P1" +# define BASE_PREFETCH "prefetcht0 %1" #endif /* @@ -728,7 +623,7 @@ extern char ignore_fpu_irq; */ static inline void prefetch(const void *x) { - alternative_input(BASE_PREFETCH, "prefetchnta %P1", + alternative_input(BASE_PREFETCH, "prefetchnta %1", X86_FEATURE_XMM, "m" (*(const char *)x)); } @@ -740,16 +635,11 @@ static inline void prefetch(const void *x) */ static 
__always_inline void prefetchw(const void *x) { - alternative_input(BASE_PREFETCH, "prefetchw %P1", + alternative_input(BASE_PREFETCH, "prefetchw %1", X86_FEATURE_3DNOWPREFETCH, "m" (*(const char *)x)); } -static inline void spin_lock_prefetch(const void *x) -{ - prefetchw(x); -} - #define TOP_OF_INIT_STACK ((unsigned long)&init_stack + sizeof(init_stack) - \ TOP_OF_KERNEL_STACK_PADDING) @@ -768,12 +658,12 @@ static inline void spin_lock_prefetch(const void *x) .sysenter_cs = __KERNEL_CS, \ } -#define KSTK_ESP(task) (task_pt_regs(task)->sp) - #else -#define INIT_THREAD { } +extern unsigned long __top_init_kernel_stack[]; -extern unsigned long KSTK_ESP(struct task_struct *task); +#define INIT_THREAD { \ + .sp = (unsigned long)&__top_init_kernel_stack, \ +} #endif /* CONFIG_X86_64 */ @@ -788,6 +678,7 @@ extern void start_thread(struct pt_regs *regs, unsigned long new_ip, #define TASK_UNMAPPED_BASE __TASK_UNMAPPED_BASE(TASK_SIZE_LOW) #define KSTK_EIP(task) (task_pt_regs(task)->ip) +#define KSTK_ESP(task) (task_pt_regs(task)->sp) /* Get/set a process' ability to use the timestamp counter instruction */ #define GET_TSC_CTL(adr) get_tsc_mode((adr)) @@ -798,34 +689,33 @@ extern int set_tsc_mode(unsigned int val); DECLARE_PER_CPU(u64, msr_misc_features_shadow); -extern u16 get_llc_id(unsigned int cpu); - -#ifdef CONFIG_CPU_SUP_AMD -extern u32 amd_get_nodes_per_socket(void); -extern u32 amd_get_highest_perf(void); -#else -static inline u32 amd_get_nodes_per_socket(void) { return 0; } -static inline u32 amd_get_highest_perf(void) { return 0; } -#endif - -#define for_each_possible_hypervisor_cpuid_base(function) \ - for (function = 0x40000000; function < 0x40010000; function += 0x100) - -static inline uint32_t hypervisor_cpuid_base(const char *sig, uint32_t leaves) +static inline u32 per_cpu_llc_id(unsigned int cpu) { - uint32_t base, eax, signature[3]; - - for_each_possible_hypervisor_cpuid_base(base) { - cpuid(base, &eax, &signature[0], &signature[1], &signature[2]); + 
return per_cpu(cpu_info.topo.llc_id, cpu); +} - if (!memcmp(sig, signature, 12) && - (leaves == 0 || ((eax - base) >= leaves))) - return base; - } +static inline u32 per_cpu_l2c_id(unsigned int cpu) +{ + return per_cpu(cpu_info.topo.l2c_id, cpu); +} - return 0; +#ifdef CONFIG_CPU_SUP_AMD +/* + * Issue a DIV 0/1 insn to clear any division data from previous DIV + * operations. + */ +static __always_inline void amd_clear_divider(void) +{ + asm volatile(ALTERNATIVE("", "div %2\n\t", X86_BUG_DIV0) + :: "a" (0), "d" (0), "r" (1)); } +extern void amd_check_microcode(void); +#else +static inline void amd_clear_divider(void) { } +static inline void amd_check_microcode(void) { } +#endif + extern unsigned long arch_align_stack(unsigned long sp); void free_init_pages(const char *what, unsigned long begin, unsigned long end); extern void free_kernel_image_pages(const char *what, void *begin, void *end); @@ -838,10 +728,12 @@ bool xen_set_default_idle(void); #endif void __noreturn stop_this_cpu(void *dummy); -void microcode_check(void); +void microcode_check(struct cpuinfo_x86 *prev_info); +void store_cpu_caps(struct cpuinfo_x86 *info); enum l1tf_mitigations { L1TF_MITIGATION_OFF, + L1TF_MITIGATION_AUTO, L1TF_MITIGATION_FLUSH_NOWARN, L1TF_MITIGATION_FLUSH, L1TF_MITIGATION_FLUSH_NOSMT, @@ -853,16 +745,29 @@ extern enum l1tf_mitigations l1tf_mitigation; enum mds_mitigations { MDS_MITIGATION_OFF, + MDS_MITIGATION_AUTO, MDS_MITIGATION_FULL, MDS_MITIGATION_VMWERV, }; -#ifdef CONFIG_X86_SGX -int arch_memory_failure(unsigned long pfn, int flags); -#define arch_memory_failure arch_memory_failure +extern bool gds_ucode_mitigated(void); -bool arch_is_platform_page(u64 paddr); -#define arch_is_platform_page arch_is_platform_page -#endif +/* + * Make previous memory operations globally visible before + * a WRMSR. + * + * MFENCE makes writes visible, but only affects load/store + * instructions. WRMSR is unfortunately not a load/store + * instruction and is unaffected by MFENCE. 
The LFENCE ensures + * that the WRMSR is not reordered. + * + * Most WRMSRs are full serializing instructions themselves and + * do not require this barrier. This is only required for the + * IA32_TSC_DEADLINE and X2APIC MSRs. + */ +static inline void weak_wrmsr_fence(void) +{ + alternative("mfence; lfence", "", ALT_NOT(X86_FEATURE_APIC_MSRS_FENCE)); +} #endif /* _ASM_X86_PROCESSOR_H */ diff --git a/arch/x86/include/asm/prom.h b/arch/x86/include/asm/prom.h index b716d291d0d4..5d0dbab85264 100644 --- a/arch/x86/include/asm/prom.h +++ b/arch/x86/include/asm/prom.h @@ -8,7 +8,7 @@ #ifndef _ASM_X86_PROM_H #define _ASM_X86_PROM_H -#ifndef __ASSEMBLY__ +#ifndef __ASSEMBLER__ #include <linux/of.h> #include <linux/types.h> @@ -23,15 +23,15 @@ extern int of_ioapic; extern u64 initial_dtb; extern void add_dtb(u64 data); void x86_of_pci_init(void); -void x86_dtb_init(void); +void x86_flattree_get_config(void); #else static inline void add_dtb(u64 data) { } static inline void x86_of_pci_init(void) { } -static inline void x86_dtb_init(void) { } +static inline void x86_flattree_get_config(void) { } #define of_ioapic 0 #endif extern char cmd_line[COMMAND_LINE_SIZE]; -#endif /* __ASSEMBLY__ */ +#endif /* __ASSEMBLER__ */ #endif diff --git a/arch/x86/include/asm/proto.h b/arch/x86/include/asm/proto.h index 12ef86b19910..05224a695872 100644 --- a/arch/x86/include/asm/proto.h +++ b/arch/x86/include/asm/proto.h @@ -15,7 +15,6 @@ void entry_SYSCALL_64(void); void entry_SYSCALL_64_safe_stack(void); void entry_SYSRETQ_unsafe_stack(void); void entry_SYSRETQ_end(void); -long do_arch_prctl_64(struct task_struct *task, int option, unsigned long arg2); #endif #ifdef CONFIG_X86_32 @@ -32,16 +31,15 @@ void entry_SYSCALL_compat(void); void entry_SYSCALL_compat_safe_stack(void); void entry_SYSRETL_compat_unsafe_stack(void); void entry_SYSRETL_compat_end(void); -void entry_INT80_compat(void); -#ifdef CONFIG_XEN_PV -void xen_entry_INT80_compat(void); -#endif +#else /* !CONFIG_IA32_EMULATION */ 
+#define entry_SYSCALL_compat NULL +#define entry_SYSENTER_compat NULL #endif void x86_configure_nx(void); extern int reboot_force; -long do_arch_prctl_common(int option, unsigned long arg2); +long do_arch_prctl_64(struct task_struct *task, int option, unsigned long arg2); #endif /* _ASM_X86_PROTO_H */ diff --git a/arch/x86/include/asm/pti.h b/arch/x86/include/asm/pti.h index 07375b476c4f..88d0a1ab1f77 100644 --- a/arch/x86/include/asm/pti.h +++ b/arch/x86/include/asm/pti.h @@ -1,9 +1,9 @@ /* SPDX-License-Identifier: GPL-2.0 */ #ifndef _ASM_X86_PTI_H #define _ASM_X86_PTI_H -#ifndef __ASSEMBLY__ +#ifndef __ASSEMBLER__ -#ifdef CONFIG_PAGE_TABLE_ISOLATION +#ifdef CONFIG_MITIGATION_PAGE_TABLE_ISOLATION extern void pti_init(void); extern void pti_check_boottime_disable(void); extern void pti_finalize(void); @@ -11,5 +11,5 @@ extern void pti_finalize(void); static inline void pti_check_boottime_disable(void) { } #endif -#endif /* __ASSEMBLY__ */ +#endif /* __ASSEMBLER__ */ #endif /* _ASM_X86_PTI_H */ diff --git a/arch/x86/include/asm/ptrace.h b/arch/x86/include/asm/ptrace.h index f4db78b09c8f..50f75467f73d 100644 --- a/arch/x86/include/asm/ptrace.h +++ b/arch/x86/include/asm/ptrace.h @@ -6,7 +6,7 @@ #include <asm/page_types.h> #include <uapi/asm/ptrace.h> -#ifndef __ASSEMBLY__ +#ifndef __ASSEMBLER__ #ifdef __i386__ struct pt_regs { @@ -56,18 +56,64 @@ struct pt_regs { #else /* __i386__ */ +struct fred_cs { + /* CS selector */ + u64 cs : 16, + /* Stack level at event time */ + sl : 2, + /* IBT in WAIT_FOR_ENDBRANCH state */ + wfe : 1, + : 45; +}; + +struct fred_ss { + /* SS selector */ + u64 ss : 16, + /* STI state */ + sti : 1, + /* Set if syscall, sysenter or INT n */ + swevent : 1, + /* Event is NMI type */ + nmi : 1, + : 13, + /* Event vector */ + vector : 8, + : 8, + /* Event type */ + type : 4, + : 4, + /* Event was incident to enclave execution */ + enclave : 1, + /* CPU was in long mode */ + lm : 1, + /* + * Nested exception during FRED delivery, not set + * for 
#DF. + */ + nested : 1, + : 1, + /* + * The length of the instruction causing the event. + * Only set for INTO, INT1, INT3, INT n, SYSCALL + * and SYSENTER. 0 otherwise. + */ + insnlen : 4; +}; + struct pt_regs { -/* - * C ABI says these regs are callee-preserved. They aren't saved on kernel entry - * unless syscall needs a complete, fully filled "struct pt_regs". - */ + /* + * C ABI says these regs are callee-preserved. They aren't saved on + * kernel entry unless syscall needs a complete, fully filled + * "struct pt_regs". + */ unsigned long r15; unsigned long r14; unsigned long r13; unsigned long r12; unsigned long bp; unsigned long bx; -/* These regs are callee-clobbered. Always saved on kernel entry. */ + + /* These regs are callee-clobbered. Always saved on kernel entry. */ unsigned long r11; unsigned long r10; unsigned long r9; @@ -77,18 +123,50 @@ struct pt_regs { unsigned long dx; unsigned long si; unsigned long di; -/* - * On syscall entry, this is syscall#. On CPU exception, this is error code. - * On hw interrupt, it's IRQ number: - */ + + /* + * orig_ax is used on entry for: + * - the syscall number (syscall, sysenter, int80) + * - error_code stored by the CPU on traps and exceptions + * - the interrupt number for device interrupts + * + * A FRED stack frame starts here: + * 1) It _always_ includes an error code; + * + * 2) The return frame for ERET[US] starts here, but + * the content of orig_ax is ignored. 
+ */ unsigned long orig_ax; -/* Return frame for iretq */ + + /* The IRETQ return frame starts here */ unsigned long ip; - unsigned long cs; + + union { + /* CS selector */ + u16 cs; + /* The extended 64-bit data slot containing CS */ + u64 csx; + /* The FRED CS extension */ + struct fred_cs fred_cs; + }; + unsigned long flags; unsigned long sp; - unsigned long ss; -/* top of stack page */ + + union { + /* SS selector */ + u16 ss; + /* The extended 64-bit data slot containing SS */ + u64 ssx; + /* The FRED SS extension */ + struct fred_ss fred_ss; + }; + + /* + * Top of stack on IDT systems, while FRED systems have extra fields + * defined above for storing exception related information, e.g. CR2 or + * DR6. + */ }; #endif /* !__i386__ */ @@ -391,5 +469,5 @@ extern int do_set_thread_area(struct task_struct *p, int idx, # define do_set_thread_area_64(p, s, t) (0) #endif -#endif /* !__ASSEMBLY__ */ +#endif /* !__ASSEMBLER__ */ #endif /* _ASM_X86_PTRACE_H */ diff --git a/arch/x86/include/asm/purgatory.h b/arch/x86/include/asm/purgatory.h index 5528e9325049..2fee5e9f1ccc 100644 --- a/arch/x86/include/asm/purgatory.h +++ b/arch/x86/include/asm/purgatory.h @@ -2,10 +2,10 @@ #ifndef _ASM_X86_PURGATORY_H #define _ASM_X86_PURGATORY_H -#ifndef __ASSEMBLY__ +#ifndef __ASSEMBLER__ #include <linux/purgatory.h> extern void purgatory(void); -#endif /* __ASSEMBLY__ */ +#endif /* __ASSEMBLER__ */ #endif /* _ASM_PURGATORY_H */ diff --git a/arch/x86/include/asm/pvclock-abi.h b/arch/x86/include/asm/pvclock-abi.h index 1436226efe3e..b9fece5fc96d 100644 --- a/arch/x86/include/asm/pvclock-abi.h +++ b/arch/x86/include/asm/pvclock-abi.h @@ -1,7 +1,7 @@ /* SPDX-License-Identifier: GPL-2.0 */ #ifndef _ASM_X86_PVCLOCK_ABI_H #define _ASM_X86_PVCLOCK_ABI_H -#ifndef __ASSEMBLY__ +#ifndef __ASSEMBLER__ /* * These structs MUST NOT be changed. @@ -44,5 +44,5 @@ struct pvclock_wall_clock { #define PVCLOCK_GUEST_STOPPED (1 << 1) /* PVCLOCK_COUNTS_FROM_ZERO broke ABI and can't be used anymore. 
*/ #define PVCLOCK_COUNTS_FROM_ZERO (1 << 2) -#endif /* __ASSEMBLY__ */ +#endif /* __ASSEMBLER__ */ #endif /* _ASM_X86_PVCLOCK_ABI_H */ diff --git a/arch/x86/include/asm/pvclock.h b/arch/x86/include/asm/pvclock.h index 19b695ff2c68..6e4f8fae3ce9 100644 --- a/arch/x86/include/asm/pvclock.h +++ b/arch/x86/include/asm/pvclock.h @@ -5,8 +5,10 @@ #include <asm/clocksource.h> #include <asm/pvclock-abi.h> +struct timespec64; /* some helper functions for xen and kvm pv clock sources */ u64 pvclock_clocksource_read(struct pvclock_vcpu_time_info *src); +u64 pvclock_clocksource_read_nowd(struct pvclock_vcpu_time_info *src); u8 pvclock_read_flags(struct pvclock_vcpu_time_info *src); void pvclock_set_flags(u8 flags); unsigned long pvclock_tsc_khz(struct pvclock_vcpu_time_info *src); @@ -39,7 +41,7 @@ bool pvclock_read_retry(const struct pvclock_vcpu_time_info *src, * Scale a 64-bit delta by scaling and multiplying by a 32-bit fraction, * yielding a 64-bit result. */ -static inline u64 pvclock_scale_delta(u64 delta, u32 mul_frac, int shift) +static __always_inline u64 pvclock_scale_delta(u64 delta, u32 mul_frac, int shift) { u64 product; #ifdef __i386__ diff --git a/arch/x86/include/asm/qspinlock.h b/arch/x86/include/asm/qspinlock.h index d87451df480b..68da67df304d 100644 --- a/arch/x86/include/asm/qspinlock.h +++ b/arch/x86/include/asm/qspinlock.h @@ -66,15 +66,15 @@ static inline bool vcpu_is_preempted(long cpu) #ifdef CONFIG_PARAVIRT /* - * virt_spin_lock_key - enables (by default) the virt_spin_lock() hijack. + * virt_spin_lock_key - disables by default the virt_spin_lock() hijack. * - * Native (and PV wanting native due to vCPU pinning) should disable this key. - * It is done in this backwards fashion to only have a single direction change, - * which removes ordering between native_pv_spin_init() and HV setup. + * Native (and PV wanting native due to vCPU pinning) should keep this key + * disabled. Native does not touch the key. 
+ * + * When in a guest then native_pv_lock_init() enables the key first and + * KVM/XEN might conditionally disable it later in the boot process again. */ -DECLARE_STATIC_KEY_TRUE(virt_spin_lock_key); - -void native_pv_lock_init(void) __init; +DECLARE_STATIC_KEY_FALSE(virt_spin_lock_key); /* * Shortcut for the queued_spin_lock_slowpath() function that allows @@ -87,6 +87,8 @@ void native_pv_lock_init(void) __init; #define virt_spin_lock virt_spin_lock static inline bool virt_spin_lock(struct qspinlock *lock) { + int val; + if (!static_branch_likely(&virt_spin_lock_key)) return false; @@ -96,17 +98,17 @@ static inline bool virt_spin_lock(struct qspinlock *lock) * horrible lock 'holder' preemption issues. */ - do { - while (atomic_read(&lock->val) != 0) - cpu_relax(); - } while (atomic_cmpxchg(&lock->val, 0, _Q_LOCKED_VAL) != 0); + __retry: + val = atomic_read(&lock->val); + + if (val || !atomic_try_cmpxchg(&lock->val, &val, _Q_LOCKED_VAL)) { + cpu_relax(); + goto __retry; + } return true; } -#else -static inline void native_pv_lock_init(void) -{ -} + #endif /* CONFIG_PARAVIRT */ #include <asm-generic/qspinlock.h> diff --git a/arch/x86/include/asm/qspinlock_paravirt.h b/arch/x86/include/asm/qspinlock_paravirt.h index 892fd8c3a6f7..0a985784be9b 100644 --- a/arch/x86/include/asm/qspinlock_paravirt.h +++ b/arch/x86/include/asm/qspinlock_paravirt.h @@ -4,6 +4,8 @@ #include <asm/ibt.h> +void __lockfunc __pv_queued_spin_unlock_slowpath(struct qspinlock *lock, u8 locked); + /* * For x86-64, PV_CALLEE_SAVE_REGS_THUNK() saves and restores 8 64-bit * registers. 
For i386, however, only 1 32-bit register needs to be saved @@ -12,21 +14,20 @@ */ #ifdef CONFIG_64BIT -PV_CALLEE_SAVE_REGS_THUNK(__pv_queued_spin_unlock_slowpath); +__PV_CALLEE_SAVE_REGS_THUNK(__pv_queued_spin_unlock_slowpath, ".spinlock.text"); #define __pv_queued_spin_unlock __pv_queued_spin_unlock -#define PV_UNLOCK "__raw_callee_save___pv_queued_spin_unlock" -#define PV_UNLOCK_SLOWPATH "__raw_callee_save___pv_queued_spin_unlock_slowpath" /* * Optimized assembly version of __raw_callee_save___pv_queued_spin_unlock * which combines the registers saving trunk and the body of the following - * C code: + * C code. Note that it puts the code in the .spinlock.text section which + * is equivalent to adding __lockfunc in the C code: * - * void __pv_queued_spin_unlock(struct qspinlock *lock) + * void __lockfunc __pv_queued_spin_unlock(struct qspinlock *lock) * { - * u8 lockval = cmpxchg(&lock->locked, _Q_LOCKED_VAL, 0); + * u8 lockval = _Q_LOCKED_VAL; * - * if (likely(lockval == _Q_LOCKED_VAL)) + * if (try_cmpxchg(&lock->locked, &lockval, 0)) * return; * pv_queued_spin_unlock_slowpath(lock, lockval); * } @@ -36,37 +37,31 @@ PV_CALLEE_SAVE_REGS_THUNK(__pv_queued_spin_unlock_slowpath); * rsi = lockval (second argument) * rdx = internal variable (set to 0) */ -asm (".pushsection .text;" - ".globl " PV_UNLOCK ";" - ".type " PV_UNLOCK ", @function;" - ".align 4,0x90;" - PV_UNLOCK ": " - ASM_ENDBR - FRAME_BEGIN - "push %rdx;" - "mov $0x1,%eax;" - "xor %edx,%edx;" - LOCK_PREFIX "cmpxchg %dl,(%rdi);" - "cmp $0x1,%al;" - "jne .slowpath;" - "pop %rdx;" - FRAME_END - ASM_RET - ".slowpath: " - "push %rsi;" - "movzbl %al,%esi;" - "call " PV_UNLOCK_SLOWPATH ";" - "pop %rsi;" - "pop %rdx;" +#define PV_UNLOCK_ASM \ + FRAME_BEGIN \ + "push %rdx\n\t" \ + "mov $" __stringify(_Q_LOCKED_VAL) ",%eax\n\t" \ + "xor %edx,%edx\n\t" \ + LOCK_PREFIX "cmpxchg %dl,(%rdi)\n\t" \ + "jne .slowpath\n\t" \ + "pop %rdx\n\t" \ + FRAME_END \ + ASM_RET \ + ".slowpath:\n\t" \ + "push %rsi\n\t" \ + "movzbl 
%al,%esi\n\t" \ + "call __raw_callee_save___pv_queued_spin_unlock_slowpath\n\t" \ + "pop %rsi\n\t" \ + "pop %rdx\n\t" \ FRAME_END - ASM_RET - ".size " PV_UNLOCK ", .-" PV_UNLOCK ";" - ".popsection"); + +DEFINE_ASM_FUNC(__raw_callee_save___pv_queued_spin_unlock, + PV_UNLOCK_ASM, .spinlock.text); #else /* CONFIG_64BIT */ -extern void __pv_queued_spin_unlock(struct qspinlock *lock); -PV_CALLEE_SAVE_REGS_THUNK(__pv_queued_spin_unlock); +extern void __lockfunc __pv_queued_spin_unlock(struct qspinlock *lock); +__PV_CALLEE_SAVE_REGS_THUNK(__pv_queued_spin_unlock, ".spinlock.text"); #endif /* CONFIG_64BIT */ #endif diff --git a/arch/x86/include/asm/realmode.h b/arch/x86/include/asm/realmode.h index fd6f6e5b755a..f607081a022a 100644 --- a/arch/x86/include/asm/realmode.h +++ b/arch/x86/include/asm/realmode.h @@ -9,7 +9,7 @@ #define TH_FLAGS_SME_ACTIVE_BIT 0 #define TH_FLAGS_SME_ACTIVE BIT(TH_FLAGS_SME_ACTIVE_BIT) -#ifndef __ASSEMBLY__ +#ifndef __ASSEMBLER__ #include <linux/types.h> #include <asm/io.h> @@ -52,6 +52,7 @@ struct trampoline_header { u64 efer; u32 cr4; u32 flags; + u32 lock; #endif }; @@ -59,12 +60,13 @@ extern struct real_mode_header *real_mode_header; extern unsigned char real_mode_blob_end[]; extern unsigned long initial_code; -extern unsigned long initial_gs; extern unsigned long initial_stack; #ifdef CONFIG_AMD_MEM_ENCRYPT extern unsigned long initial_vc_handler; #endif +extern u32 *trampoline_lock; + extern unsigned char real_mode_blob[]; extern unsigned char real_mode_relocs[]; @@ -91,7 +93,8 @@ static inline void set_real_mode_mem(phys_addr_t mem) void reserve_real_mode(void); void load_trampoline_pgtable(void); +void init_real_mode(void); -#endif /* __ASSEMBLY__ */ +#endif /* __ASSEMBLER__ */ #endif /* _ARCH_X86_REALMODE_H */ diff --git a/arch/x86/include/asm/reboot.h b/arch/x86/include/asm/reboot.h index 04c17be9b5fd..ecd58ea9a837 100644 --- a/arch/x86/include/asm/reboot.h +++ b/arch/x86/include/asm/reboot.h @@ -25,8 +25,18 @@ void __noreturn 
machine_real_restart(unsigned int type); #define MRR_BIOS 0 #define MRR_APM 1 +typedef void (cpu_emergency_virt_cb)(void); +#if IS_ENABLED(CONFIG_KVM_X86) +void cpu_emergency_register_virt_callback(cpu_emergency_virt_cb *callback); +void cpu_emergency_unregister_virt_callback(cpu_emergency_virt_cb *callback); +void cpu_emergency_disable_virtualization(void); +#else +static inline void cpu_emergency_register_virt_callback(cpu_emergency_virt_cb *callback) {} +static inline void cpu_emergency_unregister_virt_callback(cpu_emergency_virt_cb *callback) {} +static inline void cpu_emergency_disable_virtualization(void) {} +#endif /* CONFIG_KVM_X86 */ + typedef void (*nmi_shootdown_cb)(int, struct pt_regs*); -void nmi_panic_self_stop(struct pt_regs *regs); void nmi_shootdown_cpus(nmi_shootdown_cb callback); void run_crash_ipi_callback(struct pt_regs *regs); diff --git a/arch/x86/include/asm/required-features.h b/arch/x86/include/asm/required-features.h deleted file mode 100644 index aff774775c67..000000000000 --- a/arch/x86/include/asm/required-features.h +++ /dev/null @@ -1,103 +0,0 @@ -#ifndef _ASM_X86_REQUIRED_FEATURES_H -#define _ASM_X86_REQUIRED_FEATURES_H - -/* Define minimum CPUID feature set for kernel These bits are checked - really early to actually display a visible error message before the - kernel dies. Make sure to assign features to the proper mask! - - Some requirements that are not in CPUID yet are also in the - CONFIG_X86_MINIMUM_CPU_FAMILY which is checked too. 
- - The real information is in arch/x86/Kconfig.cpu, this just converts - the CONFIGs into a bitmask */ - -#ifndef CONFIG_MATH_EMULATION -# define NEED_FPU (1<<(X86_FEATURE_FPU & 31)) -#else -# define NEED_FPU 0 -#endif - -#if defined(CONFIG_X86_PAE) || defined(CONFIG_X86_64) -# define NEED_PAE (1<<(X86_FEATURE_PAE & 31)) -#else -# define NEED_PAE 0 -#endif - -#ifdef CONFIG_X86_CMPXCHG64 -# define NEED_CX8 (1<<(X86_FEATURE_CX8 & 31)) -#else -# define NEED_CX8 0 -#endif - -#if defined(CONFIG_X86_CMOV) || defined(CONFIG_X86_64) -# define NEED_CMOV (1<<(X86_FEATURE_CMOV & 31)) -#else -# define NEED_CMOV 0 -#endif - -# define NEED_3DNOW 0 - -#if defined(CONFIG_X86_P6_NOP) || defined(CONFIG_X86_64) -# define NEED_NOPL (1<<(X86_FEATURE_NOPL & 31)) -#else -# define NEED_NOPL 0 -#endif - -#ifdef CONFIG_MATOM -# define NEED_MOVBE (1<<(X86_FEATURE_MOVBE & 31)) -#else -# define NEED_MOVBE 0 -#endif - -#ifdef CONFIG_X86_64 -#ifdef CONFIG_PARAVIRT_XXL -/* Paravirtualized systems may not have PSE or PGE available */ -#define NEED_PSE 0 -#define NEED_PGE 0 -#else -#define NEED_PSE (1<<(X86_FEATURE_PSE) & 31) -#define NEED_PGE (1<<(X86_FEATURE_PGE) & 31) -#endif -#define NEED_MSR (1<<(X86_FEATURE_MSR & 31)) -#define NEED_FXSR (1<<(X86_FEATURE_FXSR & 31)) -#define NEED_XMM (1<<(X86_FEATURE_XMM & 31)) -#define NEED_XMM2 (1<<(X86_FEATURE_XMM2 & 31)) -#define NEED_LM (1<<(X86_FEATURE_LM & 31)) -#else -#define NEED_PSE 0 -#define NEED_MSR 0 -#define NEED_PGE 0 -#define NEED_FXSR 0 -#define NEED_XMM 0 -#define NEED_XMM2 0 -#define NEED_LM 0 -#endif - -#define REQUIRED_MASK0 (NEED_FPU|NEED_PSE|NEED_MSR|NEED_PAE|\ - NEED_CX8|NEED_PGE|NEED_FXSR|NEED_CMOV|\ - NEED_XMM|NEED_XMM2) -#define SSE_MASK (NEED_XMM|NEED_XMM2) - -#define REQUIRED_MASK1 (NEED_LM|NEED_3DNOW) - -#define REQUIRED_MASK2 0 -#define REQUIRED_MASK3 (NEED_NOPL) -#define REQUIRED_MASK4 (NEED_MOVBE) -#define REQUIRED_MASK5 0 -#define REQUIRED_MASK6 0 -#define REQUIRED_MASK7 0 -#define REQUIRED_MASK8 0 -#define REQUIRED_MASK9 0 
-#define REQUIRED_MASK10 0 -#define REQUIRED_MASK11 0 -#define REQUIRED_MASK12 0 -#define REQUIRED_MASK13 0 -#define REQUIRED_MASK14 0 -#define REQUIRED_MASK15 0 -#define REQUIRED_MASK16 0 -#define REQUIRED_MASK17 0 -#define REQUIRED_MASK18 0 -#define REQUIRED_MASK19 0 -#define REQUIRED_MASK_CHECK BUILD_BUG_ON_ZERO(NCAPINTS != 20) - -#endif /* _ASM_X86_REQUIRED_FEATURES_H */ diff --git a/arch/x86/include/asm/resctrl.h b/arch/x86/include/asm/resctrl.h index d60ed0668a59..feb93b50e990 100644 --- a/arch/x86/include/asm/resctrl.h +++ b/arch/x86/include/asm/resctrl.h @@ -4,10 +4,19 @@ #ifdef CONFIG_X86_CPU_RESCTRL -#include <linux/sched.h> #include <linux/jump_label.h> +#include <linux/percpu.h> +#include <linux/resctrl_types.h> +#include <linux/sched.h> -#define IA32_PQR_ASSOC 0x0c8f +#include <asm/msr.h> + +/* + * This value can never be a valid CLOSID, and is used when mapping a + * (closid, rmid) pair to an index and back. On x86 only the RMID is + * needed. The index is a software defined value. + */ +#define X86_RESCTRL_EMPTY_CLOSID ((u32)~0) /** * struct resctrl_pqr_state - State cache for the PQR MSR @@ -16,8 +25,8 @@ * @default_rmid: The user assigned Resource Monitoring ID * @default_closid: The user assigned cached Class Of Service ID * - * The upper 32 bits of IA32_PQR_ASSOC contain closid and the - * lower 10 bits rmid. The update to IA32_PQR_ASSOC always + * The upper 32 bits of MSR_IA32_PQR_ASSOC contain closid and the + * lower 10 bits rmid. The update to MSR_IA32_PQR_ASSOC always * contains both parts, so we need to cache them. This also * stores the user configured per cpu CLOSID and RMID. 
* @@ -33,10 +42,63 @@ struct resctrl_pqr_state { DECLARE_PER_CPU(struct resctrl_pqr_state, pqr_state); +extern bool rdt_alloc_capable; +extern bool rdt_mon_capable; +extern unsigned int rdt_mon_features; + DECLARE_STATIC_KEY_FALSE(rdt_enable_key); DECLARE_STATIC_KEY_FALSE(rdt_alloc_enable_key); DECLARE_STATIC_KEY_FALSE(rdt_mon_enable_key); +static inline bool resctrl_arch_alloc_capable(void) +{ + return rdt_alloc_capable; +} + +static inline void resctrl_arch_enable_alloc(void) +{ + static_branch_enable_cpuslocked(&rdt_alloc_enable_key); + static_branch_inc_cpuslocked(&rdt_enable_key); +} + +static inline void resctrl_arch_disable_alloc(void) +{ + static_branch_disable_cpuslocked(&rdt_alloc_enable_key); + static_branch_dec_cpuslocked(&rdt_enable_key); +} + +static inline bool resctrl_arch_mon_capable(void) +{ + return rdt_mon_capable; +} + +static inline void resctrl_arch_enable_mon(void) +{ + static_branch_enable_cpuslocked(&rdt_mon_enable_key); + static_branch_inc_cpuslocked(&rdt_enable_key); +} + +static inline void resctrl_arch_disable_mon(void) +{ + static_branch_disable_cpuslocked(&rdt_mon_enable_key); + static_branch_dec_cpuslocked(&rdt_enable_key); +} + +static inline bool resctrl_arch_is_llc_occupancy_enabled(void) +{ + return (rdt_mon_features & (1 << QOS_L3_OCCUP_EVENT_ID)); +} + +static inline bool resctrl_arch_is_mbm_total_enabled(void) +{ + return (rdt_mon_features & (1 << QOS_L3_MBM_TOTAL_EVENT_ID)); +} + +static inline bool resctrl_arch_is_mbm_local_enabled(void) +{ + return (rdt_mon_features & (1 << QOS_L3_MBM_LOCAL_EVENT_ID)); +} + /* * __resctrl_sched_in() - Writes the task's CLOSid/RMID to IA32_PQR_MSR * @@ -51,11 +113,11 @@ DECLARE_STATIC_KEY_FALSE(rdt_mon_enable_key); * simple as possible. * Must be called with preemption disabled. 
*/ -static void __resctrl_sched_in(void) +static inline void __resctrl_sched_in(struct task_struct *tsk) { struct resctrl_pqr_state *state = this_cpu_ptr(&pqr_state); - u32 closid = state->default_closid; - u32 rmid = state->default_rmid; + u32 closid = READ_ONCE(state->default_closid); + u32 rmid = READ_ONCE(state->default_rmid); u32 tmp; /* @@ -63,13 +125,13 @@ static void __resctrl_sched_in(void) * Else use the closid/rmid assigned to this cpu. */ if (static_branch_likely(&rdt_alloc_enable_key)) { - tmp = READ_ONCE(current->closid); + tmp = READ_ONCE(tsk->closid); if (tmp) closid = tmp; } if (static_branch_likely(&rdt_mon_enable_key)) { - tmp = READ_ONCE(current->rmid); + tmp = READ_ONCE(tsk->rmid); if (tmp) rmid = tmp; } @@ -77,21 +139,79 @@ static void __resctrl_sched_in(void) if (closid != state->cur_closid || rmid != state->cur_rmid) { state->cur_closid = closid; state->cur_rmid = rmid; - wrmsr(IA32_PQR_ASSOC, rmid, closid); + wrmsr(MSR_IA32_PQR_ASSOC, rmid, closid); } } -static inline void resctrl_sched_in(void) +static inline unsigned int resctrl_arch_round_mon_val(unsigned int val) +{ + unsigned int scale = boot_cpu_data.x86_cache_occ_scale; + + /* h/w works in units of "boot_cpu_data.x86_cache_occ_scale" */ + val /= scale; + return val * scale; +} + +static inline void resctrl_arch_set_cpu_default_closid_rmid(int cpu, u32 closid, + u32 rmid) +{ + WRITE_ONCE(per_cpu(pqr_state.default_closid, cpu), closid); + WRITE_ONCE(per_cpu(pqr_state.default_rmid, cpu), rmid); +} + +static inline void resctrl_arch_set_closid_rmid(struct task_struct *tsk, + u32 closid, u32 rmid) +{ + WRITE_ONCE(tsk->closid, closid); + WRITE_ONCE(tsk->rmid, rmid); +} + +static inline bool resctrl_arch_match_closid(struct task_struct *tsk, u32 closid) +{ + return READ_ONCE(tsk->closid) == closid; +} + +static inline bool resctrl_arch_match_rmid(struct task_struct *tsk, u32 ignored, + u32 rmid) +{ + return READ_ONCE(tsk->rmid) == rmid; +} + +static inline void resctrl_arch_sched_in(struct 
task_struct *tsk) { if (static_branch_likely(&rdt_enable_key)) - __resctrl_sched_in(); + __resctrl_sched_in(tsk); +} + +static inline void resctrl_arch_rmid_idx_decode(u32 idx, u32 *closid, u32 *rmid) +{ + *rmid = idx; + *closid = X86_RESCTRL_EMPTY_CLOSID; } +static inline u32 resctrl_arch_rmid_idx_encode(u32 ignored, u32 rmid) +{ + return rmid; +} + +/* x86 can always read an rmid, nothing needs allocating */ +struct rdt_resource; +static inline void *resctrl_arch_mon_ctx_alloc(struct rdt_resource *r, + enum resctrl_event_id evtid) +{ + might_sleep(); + return NULL; +} + +static inline void resctrl_arch_mon_ctx_free(struct rdt_resource *r, + enum resctrl_event_id evtid, + void *ctx) { } + void resctrl_cpu_detect(struct cpuinfo_x86 *c); #else -static inline void resctrl_sched_in(void) {} +static inline void resctrl_arch_sched_in(struct task_struct *tsk) {} static inline void resctrl_cpu_detect(struct cpuinfo_x86 *c) {} #endif /* CONFIG_X86_CPU_RESCTRL */ diff --git a/arch/x86/include/asm/rmwcc.h b/arch/x86/include/asm/rmwcc.h index 8a9eba191516..3821ee3fae35 100644 --- a/arch/x86/include/asm/rmwcc.h +++ b/arch/x86/include/asm/rmwcc.h @@ -2,23 +2,18 @@ #ifndef _ASM_X86_RMWcc #define _ASM_X86_RMWcc -/* This counts to 12. Any more, it will return 13th argument. */ -#define __RMWcc_ARGS(_0, _1, _2, _3, _4, _5, _6, _7, _8, _9, _10, _11, _12, _n, X...) _n -#define RMWcc_ARGS(X...) __RMWcc_ARGS(, ##X, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0) - -#define __RMWcc_CONCAT(a, b) a ## b -#define RMWcc_CONCAT(a, b) __RMWcc_CONCAT(a, b) +#include <linux/args.h> #define __CLOBBERS_MEM(clb...) "memory", ## clb -#if !defined(__GCC_ASM_FLAG_OUTPUTS__) && defined(CONFIG_CC_HAS_ASM_GOTO) +#ifndef __GCC_ASM_FLAG_OUTPUTS__ /* Use asm goto */ #define __GEN_RMWcc(fullop, _var, cc, clobbers, ...) 
\ ({ \ bool c = false; \ - asm_volatile_goto (fullop "; j" #cc " %l[cc_label]" \ + asm goto (fullop "; j" #cc " %l[cc_label]" \ : : [var] "m" (_var), ## __VA_ARGS__ \ : clobbers : cc_label); \ if (0) { \ @@ -27,20 +22,20 @@ cc_label: c = true; \ c; \ }) -#else /* defined(__GCC_ASM_FLAG_OUTPUTS__) || !defined(CONFIG_CC_HAS_ASM_GOTO) */ +#else /* defined(__GCC_ASM_FLAG_OUTPUTS__) */ /* Use flags output or a set instruction */ #define __GEN_RMWcc(fullop, _var, cc, clobbers, ...) \ ({ \ bool c; \ - asm volatile (fullop CC_SET(cc) \ + asm_inline volatile (fullop CC_SET(cc) \ : [var] "+m" (_var), CC_OUT(cc) (c) \ : __VA_ARGS__ : clobbers); \ c; \ }) -#endif /* defined(__GCC_ASM_FLAG_OUTPUTS__) || !defined(CONFIG_CC_HAS_ASM_GOTO) */ +#endif /* defined(__GCC_ASM_FLAG_OUTPUTS__) */ #define GEN_UNARY_RMWcc_4(op, var, cc, arg0) \ __GEN_RMWcc(op " " arg0, var, cc, __CLOBBERS_MEM()) @@ -48,7 +43,7 @@ cc_label: c = true; \ #define GEN_UNARY_RMWcc_3(op, var, cc) \ GEN_UNARY_RMWcc_4(op, var, cc, "%[var]") -#define GEN_UNARY_RMWcc(X...) RMWcc_CONCAT(GEN_UNARY_RMWcc_, RMWcc_ARGS(X))(X) +#define GEN_UNARY_RMWcc(X...) CONCATENATE(GEN_UNARY_RMWcc_, COUNT_ARGS(X))(X) #define GEN_BINARY_RMWcc_6(op, var, cc, vcon, _val, arg0) \ __GEN_RMWcc(op " %[val], " arg0, var, cc, \ @@ -57,7 +52,7 @@ cc_label: c = true; \ #define GEN_BINARY_RMWcc_5(op, var, cc, vcon, val) \ GEN_BINARY_RMWcc_6(op, var, cc, vcon, val, "%[var]") -#define GEN_BINARY_RMWcc(X...) RMWcc_CONCAT(GEN_BINARY_RMWcc_, RMWcc_ARGS(X))(X) +#define GEN_BINARY_RMWcc(X...) CONCATENATE(GEN_BINARY_RMWcc_, COUNT_ARGS(X))(X) #define GEN_UNARY_SUFFIXED_RMWcc(op, suffix, var, cc, clobbers...) 
\ __GEN_RMWcc(op " %[var]\n\t" suffix, var, cc, \ diff --git a/arch/x86/include/asm/rqspinlock.h b/arch/x86/include/asm/rqspinlock.h new file mode 100644 index 000000000000..24a885449ee6 --- /dev/null +++ b/arch/x86/include/asm/rqspinlock.h @@ -0,0 +1,33 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _ASM_X86_RQSPINLOCK_H +#define _ASM_X86_RQSPINLOCK_H + +#include <asm/paravirt.h> + +#ifdef CONFIG_PARAVIRT +DECLARE_STATIC_KEY_FALSE(virt_spin_lock_key); + +#define resilient_virt_spin_lock_enabled resilient_virt_spin_lock_enabled +static __always_inline bool resilient_virt_spin_lock_enabled(void) +{ + return static_branch_likely(&virt_spin_lock_key); +} + +#ifdef CONFIG_QUEUED_SPINLOCKS +typedef struct qspinlock rqspinlock_t; +#else +typedef struct rqspinlock rqspinlock_t; +#endif +extern int resilient_tas_spin_lock(rqspinlock_t *lock); + +#define resilient_virt_spin_lock resilient_virt_spin_lock +static inline int resilient_virt_spin_lock(rqspinlock_t *lock) +{ + return resilient_tas_spin_lock(lock); +} + +#endif /* CONFIG_PARAVIRT */ + +#include <asm-generic/rqspinlock.h> + +#endif /* _ASM_X86_RQSPINLOCK_H */ diff --git a/arch/x86/include/asm/runtime-const.h b/arch/x86/include/asm/runtime-const.h new file mode 100644 index 000000000000..8d983cfd06ea --- /dev/null +++ b/arch/x86/include/asm/runtime-const.h @@ -0,0 +1,74 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _ASM_RUNTIME_CONST_H +#define _ASM_RUNTIME_CONST_H + +#ifdef __ASSEMBLY__ + +.macro RUNTIME_CONST_PTR sym reg + movq $0x0123456789abcdef, %\reg + 1: + .pushsection runtime_ptr_\sym, "a" + .long 1b - 8 - . 
+ .popsection +.endm + +#else /* __ASSEMBLY__ */ + +#define runtime_const_ptr(sym) ({ \ + typeof(sym) __ret; \ + asm_inline("mov %1,%0\n1:\n" \ + ".pushsection runtime_ptr_" #sym ",\"a\"\n\t" \ + ".long 1b - %c2 - .\n" \ + ".popsection" \ + :"=r" (__ret) \ + :"i" ((unsigned long)0x0123456789abcdefull), \ + "i" (sizeof(long))); \ + __ret; }) + +// The 'typeof' will create at _least_ a 32-bit type, but +// will happily also take a bigger type and the 'shrl' will +// clear the upper bits +#define runtime_const_shift_right_32(val, sym) ({ \ + typeof(0u+(val)) __ret = (val); \ + asm_inline("shrl $12,%k0\n1:\n" \ + ".pushsection runtime_shift_" #sym ",\"a\"\n\t" \ + ".long 1b - 1 - .\n" \ + ".popsection" \ + :"+r" (__ret)); \ + __ret; }) + +#define runtime_const_init(type, sym) do { \ + extern s32 __start_runtime_##type##_##sym[]; \ + extern s32 __stop_runtime_##type##_##sym[]; \ + runtime_const_fixup(__runtime_fixup_##type, \ + (unsigned long)(sym), \ + __start_runtime_##type##_##sym, \ + __stop_runtime_##type##_##sym); \ +} while (0) + +/* + * The text patching is trivial - you can only do this at init time, + * when the text section hasn't been marked RO, and before the text + * has ever been executed. 
+ */ +static inline void __runtime_fixup_ptr(void *where, unsigned long val) +{ + *(unsigned long *)where = val; +} + +static inline void __runtime_fixup_shift(void *where, unsigned long val) +{ + *(unsigned char *)where = val; +} + +static inline void runtime_const_fixup(void (*fn)(void *, unsigned long), + unsigned long val, s32 *start, s32 *end) +{ + while (start < end) { + fn(*start + (void *)start, val); + start++; + } +} + +#endif /* __ASSEMBLY__ */ +#endif diff --git a/arch/x86/include/asm/seccomp.h b/arch/x86/include/asm/seccomp.h index fef16e398161..42bcd42d70d1 100644 --- a/arch/x86/include/asm/seccomp.h +++ b/arch/x86/include/asm/seccomp.h @@ -9,7 +9,7 @@ #endif #ifdef CONFIG_COMPAT -#include <asm/ia32_unistd.h> +#include <asm/unistd_32_ia32.h> #define __NR_seccomp_read_32 __NR_ia32_read #define __NR_seccomp_write_32 __NR_ia32_write #define __NR_seccomp_exit_32 __NR_ia32_exit diff --git a/arch/x86/include/asm/sections.h b/arch/x86/include/asm/sections.h index a6e8373a5170..30e8ee7006f9 100644 --- a/arch/x86/include/asm/sections.h +++ b/arch/x86/include/asm/sections.h @@ -2,11 +2,10 @@ #ifndef _ASM_X86_SECTIONS_H #define _ASM_X86_SECTIONS_H -#define arch_is_kernel_initmem_freed arch_is_kernel_initmem_freed - #include <asm-generic/sections.h> #include <asm/extable.h> +extern char __relocate_kernel_start[], __relocate_kernel_end[]; extern char __brk_base[], __brk_limit[]; extern char __end_rodata_aligned[]; @@ -18,20 +17,4 @@ extern char __end_of_kernel_reserve[]; extern unsigned long _brk_start, _brk_end; -static inline bool arch_is_kernel_initmem_freed(unsigned long addr) -{ - /* - * If _brk_start has not been cleared, brk allocation is incomplete, - * and we can not make assumptions about its use. - */ - if (_brk_start) - return 0; - - /* - * After brk allocation is complete, space between _brk_end and _end - * is available for allocation. 
- */ - return addr >= _brk_end && addr < (unsigned long)&_end; -} - #endif /* _ASM_X86_SECTIONS_H */ diff --git a/arch/x86/include/asm/segment.h b/arch/x86/include/asm/segment.h index 2e7890dd58a4..77d8f49b92bd 100644 --- a/arch/x86/include/asm/segment.h +++ b/arch/x86/include/asm/segment.h @@ -56,7 +56,7 @@ #define GDT_ENTRY_INVALID_SEG 0 -#ifdef CONFIG_X86_32 +#if defined(CONFIG_X86_32) && !defined(BUILD_VDSO32_64) /* * The layout of the per-CPU GDT under Linux: * @@ -96,7 +96,7 @@ * * 26 - ESPFIX small SS * 27 - per-cpu [ offset to per-cpu data area ] - * 28 - unused + * 28 - VDSO getcpu * 29 - unused * 30 - unused * 31 - TSS for double fault handler @@ -119,6 +119,7 @@ #define GDT_ENTRY_ESPFIX_SS 26 #define GDT_ENTRY_PERCPU 27 +#define GDT_ENTRY_CPUNODE 28 #define GDT_ENTRY_DOUBLEFAULT_TSS 31 @@ -135,6 +136,7 @@ #define __KERNEL_DS (GDT_ENTRY_KERNEL_DS*8) #define __USER_DS (GDT_ENTRY_DEFAULT_USER_DS*8 + 3) #define __USER_CS (GDT_ENTRY_DEFAULT_USER_CS*8 + 3) +#define __USER32_CS __USER_CS #define __ESPFIX_SS (GDT_ENTRY_ESPFIX_SS*8) /* segment for calling fn: */ @@ -158,6 +160,8 @@ # define __KERNEL_PERCPU 0 #endif +#define __CPUNODE_SEG (GDT_ENTRY_CPUNODE*8 + 3) + #else /* 64-bit: */ #include <asm/cache.h> @@ -210,7 +214,6 @@ #define __KERNEL_DS (GDT_ENTRY_KERNEL_DS*8) #define __USER32_CS (GDT_ENTRY_DEFAULT_USER32_CS*8 + 3) #define __USER_DS (GDT_ENTRY_DEFAULT_USER_DS*8 + 3) -#define __USER32_DS __USER_DS #define __USER_CS (GDT_ENTRY_DEFAULT_USER_CS*8 + 3) #define __CPUNODE_SEG (GDT_ENTRY_CPUNODE*8 + 3) @@ -226,13 +229,11 @@ #define GDT_ENTRY_TLS_ENTRIES 3 #define TLS_SIZE (GDT_ENTRY_TLS_ENTRIES* 8) -#ifdef CONFIG_X86_64 - /* Bit size and mask of CPU number stored in the per CPU data (and TSC_AUX) */ #define VDSO_CPUNODE_BITS 12 #define VDSO_CPUNODE_MASK 0xfff -#ifndef __ASSEMBLY__ +#ifndef __ASSEMBLER__ /* Helper functions to store/load CPU and node numbers */ @@ -264,8 +265,7 @@ static inline void vdso_read_cpunode(unsigned *cpu, unsigned *node) *node = (p >> 
VDSO_CPUNODE_BITS); } -#endif /* !__ASSEMBLY__ */ -#endif /* CONFIG_X86_64 */ +#endif /* !__ASSEMBLER__ */ #ifdef __KERNEL__ @@ -286,7 +286,7 @@ static inline void vdso_read_cpunode(unsigned *cpu, unsigned *node) */ #define XEN_EARLY_IDT_HANDLER_SIZE (8 + ENDBR_INSN_SIZE) -#ifndef __ASSEMBLY__ +#ifndef __ASSEMBLER__ extern const char early_idt_handler_array[NUM_EXCEPTION_VECTORS][EARLY_IDT_HANDLER_SIZE]; extern void early_ignore_irq(void); @@ -350,7 +350,7 @@ static inline void __loadsegment_fs(unsigned short value) #define savesegment(seg, value) \ asm("mov %%" #seg ",%0":"=r" (value) : : "memory") -#endif /* !__ASSEMBLY__ */ +#endif /* !__ASSEMBLER__ */ #endif /* __KERNEL__ */ #endif /* _ASM_X86_SEGMENT_H */ diff --git a/arch/x86/include/asm/set_memory.h b/arch/x86/include/asm/set_memory.h index b45c4d27fd46..61f56cdaccb5 100644 --- a/arch/x86/include/asm/set_memory.h +++ b/arch/x86/include/asm/set_memory.h @@ -2,9 +2,12 @@ #ifndef _ASM_X86_SET_MEMORY_H #define _ASM_X86_SET_MEMORY_H -#include <linux/mm.h> #include <asm/page.h> #include <asm-generic/set_memory.h> +#include <asm/pgtable.h> + +#define set_memory_rox set_memory_rox +int set_memory_rox(unsigned long addr, int numpages); /* * The set_memory_* API can be used to change various attributes of a virtual @@ -44,9 +47,13 @@ int set_memory_uc(unsigned long addr, int numpages); int set_memory_wc(unsigned long addr, int numpages); int set_memory_wb(unsigned long addr, int numpages); int set_memory_np(unsigned long addr, int numpages); +int set_memory_p(unsigned long addr, int numpages); int set_memory_4k(unsigned long addr, int numpages); + +bool set_memory_enc_stop_conversion(void); int set_memory_encrypted(unsigned long addr, int numpages); int set_memory_decrypted(unsigned long addr, int numpages); + int set_memory_np_noalias(unsigned long addr, int numpages); int set_memory_nonglobal(unsigned long addr, int numpages); int set_memory_global(unsigned long addr, int numpages); @@ -82,6 +89,7 @@ int 
set_pages_rw(struct page *page, int numpages); int set_direct_map_invalid_noflush(struct page *page); int set_direct_map_default_noflush(struct page *page); +int set_direct_map_valid_noflush(struct page *page, unsigned nr, bool valid); bool kernel_page_present(struct page *page); extern int kernel_set_to_readonly; diff --git a/arch/x86/include/asm/setup.h b/arch/x86/include/asm/setup.h index f37cbff7354c..692af46603a1 100644 --- a/arch/x86/include/asm/setup.h +++ b/arch/x86/include/asm/setup.h @@ -27,12 +27,12 @@ #define OLD_CL_ADDRESS 0x020 /* Relative to real mode data */ #define NEW_CL_POINTER 0x228 /* Relative to real mode data */ -#ifndef __ASSEMBLY__ +#ifndef __ASSEMBLER__ +#include <linux/cache.h> + #include <asm/bootparam.h> #include <asm/x86_init.h> -extern u64 relocated_ramdisk; - /* Interrupt control for vSMPowered x86_64 systems */ #ifdef CONFIG_X86_64 void vsmp_init(void); @@ -46,11 +46,13 @@ void setup_bios_corruption_check(void); void early_platform_quirks(void); extern unsigned long saved_video_mode; +extern unsigned long acpi_realmode_flags; extern void reserve_standard_io_resources(void); extern void i386_reserve_resources(void); -extern unsigned long __startup_64(unsigned long physaddr, struct boot_params *bp); -extern void startup_64_setup_env(unsigned long physbase); +extern unsigned long __startup_64(unsigned long p2v_offset, struct boot_params *bp); +extern void startup_64_setup_gdt_idt(void); +extern void startup_64_load_idt(void *vc_handler); extern void early_setup_idt(void); extern void __init do_early_exception(struct pt_regs *regs, int trapnr); @@ -66,6 +68,8 @@ extern void x86_ce4100_early_setup(void); static inline void x86_ce4100_early_setup(void) { } #endif +#include <linux/kexec_handover.h> + #ifndef _SETUP #include <asm/espfix.h> @@ -125,16 +129,23 @@ void clear_bss(void); #ifdef __i386__ -asmlinkage void __init i386_start_kernel(void); +asmlinkage void __init __noreturn i386_start_kernel(void); +void __init 
mk_early_pgtbl_32(void); #else -asmlinkage void __init x86_64_start_kernel(char *real_mode); -asmlinkage void __init x86_64_start_reservations(char *real_mode_data); +asmlinkage void __init __noreturn x86_64_start_kernel(char *real_mode); +asmlinkage void __init __noreturn x86_64_start_reservations(char *real_mode_data); #endif /* __i386__ */ #endif /* _SETUP */ -#else /* __ASSEMBLY */ +#ifdef CONFIG_CMDLINE_BOOL +extern bool builtin_cmdline_added __ro_after_init; +#else +#define builtin_cmdline_added 0 +#endif + +#else /* __ASSEMBLER__ */ .macro __RESERVE_BRK name, size .pushsection .bss..brk, "aw" @@ -146,6 +157,6 @@ SYM_DATA_END(__brk_\name) #define RESERVE_BRK(name, size) __RESERVE_BRK name, size -#endif /* __ASSEMBLY__ */ +#endif /* __ASSEMBLER__ */ #endif /* _ASM_X86_SETUP_H */ diff --git a/arch/x86/include/asm/setup_data.h b/arch/x86/include/asm/setup_data.h new file mode 100644 index 000000000000..7bb16f843c93 --- /dev/null +++ b/arch/x86/include/asm/setup_data.h @@ -0,0 +1,32 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _ASM_X86_SETUP_DATA_H +#define _ASM_X86_SETUP_DATA_H + +#include <uapi/asm/setup_data.h> + +#ifndef __ASSEMBLER__ + +struct pci_setup_rom { + struct setup_data data; + uint16_t vendor; + uint16_t devid; + uint64_t pcilen; + unsigned long segment; + unsigned long bus; + unsigned long device; + unsigned long function; + uint8_t romdata[]; +}; + +/* kexec external ABI */ +struct efi_setup_data { + u64 fw_vendor; + u64 __unused; + u64 tables; + u64 smbios; + u64 reserved[8]; +}; + +#endif /* __ASSEMBLER__ */ + +#endif /* _ASM_X86_SETUP_DATA_H */ diff --git a/arch/x86/include/asm/sev-common.h b/arch/x86/include/asm/sev-common.h index b8357d6ecd47..0020d77a0800 100644 --- a/arch/x86/include/asm/sev-common.h +++ b/arch/x86/include/asm/sev-common.h @@ -54,8 +54,18 @@ (((unsigned long)fn) << 32)) /* AP Reset Hold */ -#define GHCB_MSR_AP_RESET_HOLD_REQ 0x006 -#define GHCB_MSR_AP_RESET_HOLD_RESP 0x007 +#define GHCB_MSR_AP_RESET_HOLD_REQ 0x006 
+#define GHCB_MSR_AP_RESET_HOLD_RESP 0x007 +#define GHCB_MSR_AP_RESET_HOLD_RESULT_POS 12 +#define GHCB_MSR_AP_RESET_HOLD_RESULT_MASK GENMASK_ULL(51, 0) + +/* Preferred GHCB GPA Request */ +#define GHCB_MSR_PREF_GPA_REQ 0x010 +#define GHCB_MSR_GPA_VALUE_POS 12 +#define GHCB_MSR_GPA_VALUE_MASK GENMASK_ULL(51, 0) + +#define GHCB_MSR_PREF_GPA_RESP 0x011 +#define GHCB_MSR_PREF_GPA_NONE 0xfffffffffffff /* GHCB GPA Register */ #define GHCB_MSR_REG_GPA_REQ 0x012 @@ -91,23 +101,61 @@ enum psc_op { /* GHCBData[11:0] */ \ GHCB_MSR_PSC_REQ) +#define GHCB_MSR_PSC_REQ_TO_GFN(msr) (((msr) & GENMASK_ULL(51, 12)) >> 12) +#define GHCB_MSR_PSC_REQ_TO_OP(msr) (((msr) & GENMASK_ULL(55, 52)) >> 52) + #define GHCB_MSR_PSC_RESP 0x015 #define GHCB_MSR_PSC_RESP_VAL(val) \ /* GHCBData[63:32] */ \ (((u64)(val) & GENMASK_ULL(63, 32)) >> 32) +/* Set highest bit as a generic error response */ +#define GHCB_MSR_PSC_RESP_ERROR (BIT_ULL(63) | GHCB_MSR_PSC_RESP) + +/* GHCB Run at VMPL Request/Response */ +#define GHCB_MSR_VMPL_REQ 0x016 +#define GHCB_MSR_VMPL_REQ_LEVEL(v) \ + /* GHCBData[39:32] */ \ + ((((u64)(v) & GENMASK_ULL(7, 0)) << 32) | \ + /* GHCBDdata[11:0] */ \ + GHCB_MSR_VMPL_REQ) + +#define GHCB_MSR_VMPL_RESP 0x017 +#define GHCB_MSR_VMPL_RESP_VAL(v) \ + /* GHCBData[63:32] */ \ + (((u64)(v) & GENMASK_ULL(63, 32)) >> 32) + /* GHCB Hypervisor Feature Request/Response */ #define GHCB_MSR_HV_FT_REQ 0x080 #define GHCB_MSR_HV_FT_RESP 0x081 +#define GHCB_MSR_HV_FT_POS 12 +#define GHCB_MSR_HV_FT_MASK GENMASK_ULL(51, 0) #define GHCB_MSR_HV_FT_RESP_VAL(v) \ /* GHCBData[63:12] */ \ (((u64)(v) & GENMASK_ULL(63, 12)) >> 12) #define GHCB_HV_FT_SNP BIT_ULL(0) #define GHCB_HV_FT_SNP_AP_CREATION BIT_ULL(1) +#define GHCB_HV_FT_SNP_MULTI_VMPL BIT_ULL(5) + +/* + * SNP Page State Change NAE event + * The VMGEXIT_PSC_MAX_ENTRY determines the size of the PSC structure, which + * is a local stack variable in set_pages_state(). Do not increase this value + * without evaluating the impact to stack usage. 
+ * + * Use VMGEXIT_PSC_MAX_COUNT in cases where the actual GHCB-defined max value + * is needed, such as when processing GHCB requests on the hypervisor side. + */ +#define VMGEXIT_PSC_MAX_ENTRY 64 +#define VMGEXIT_PSC_MAX_COUNT 253 -/* SNP Page State Change NAE event */ -#define VMGEXIT_PSC_MAX_ENTRY 253 +#define VMGEXIT_PSC_ERROR_GENERIC (0x100UL << 32) +#define VMGEXIT_PSC_ERROR_INVALID_HDR ((1UL << 32) | 1) +#define VMGEXIT_PSC_ERROR_INVALID_ENTRY ((1UL << 32) | 2) + +#define VMGEXIT_PSC_OP_PRIVATE 1 +#define VMGEXIT_PSC_OP_SHARED 2 struct psc_hdr { u16 cur_entry; @@ -128,9 +176,6 @@ struct snp_psc_desc { struct psc_entry entries[VMGEXIT_PSC_MAX_ENTRY]; } __packed; -/* Guest message request error code */ -#define SNP_GUEST_REQ_INVALID_LEN BIT_ULL(32) - #define GHCB_MSR_TERM_REQ 0x100 #define GHCB_MSR_TERM_REASON_SET_POS 12 #define GHCB_MSR_TERM_REASON_SET_MASK 0xf @@ -157,12 +202,26 @@ struct snp_psc_desc { #define GHCB_TERM_NOT_VMPL0 3 /* SNP guest is not running at VMPL-0 */ #define GHCB_TERM_CPUID 4 /* CPUID-validation failure */ #define GHCB_TERM_CPUID_HV 5 /* CPUID failure during hypervisor fallback */ +#define GHCB_TERM_SECRETS_PAGE 6 /* Secrets page failure */ +#define GHCB_TERM_NO_SVSM 7 /* SVSM is not advertised in the secrets page */ +#define GHCB_TERM_SVSM_VMPL0 8 /* SVSM is present but has set VMPL to 0 */ +#define GHCB_TERM_SVSM_CAA 9 /* SVSM is present but CAA is not page aligned */ +#define GHCB_TERM_SECURE_TSC 10 /* Secure TSC initialization failed */ +#define GHCB_TERM_SVSM_CA_REMAP_FAIL 11 /* SVSM is present but CA could not be remapped */ #define GHCB_RESP_CODE(v) ((v) & GHCB_MSR_INFO_MASK) /* - * Error codes related to GHCB input that can be communicated back to the guest - * by setting the lower 32-bits of the GHCB SW_EXITINFO1 field to 2. + * GHCB-defined return codes that are communicated back to the guest via + * SW_EXITINFO1. 
+ */ +#define GHCB_HV_RESP_NO_ACTION 0 +#define GHCB_HV_RESP_ISSUE_EXCEPTION 1 +#define GHCB_HV_RESP_MALFORMED_INPUT 2 + +/* + * GHCB-defined sub-error codes for malformed input (see above) that are + * communicated back to the guest via SW_EXITINFO2[31:0]. */ #define GHCB_ERR_NOT_REGISTERED 1 #define GHCB_ERR_INVALID_USAGE 2 @@ -171,4 +230,31 @@ struct snp_psc_desc { #define GHCB_ERR_INVALID_INPUT 5 #define GHCB_ERR_INVALID_EVENT 6 +struct sev_config { + __u64 debug : 1, + + /* + * Indicates when the per-CPU GHCB has been created and registered + * and thus can be used by the BSP instead of the early boot GHCB. + * + * For APs, the per-CPU GHCB is created before they are started + * and registered upon startup, so this flag can be used globally + * for the BSP and APs. + */ + ghcbs_initialized : 1, + + /* + * Indicates when the per-CPU SVSM CA is to be used instead of the + * boot SVSM CA. + * + * For APs, the per-CPU SVSM CA is created as part of the AP + * bringup, so this flag can be used globally for the BSP and APs. + */ + use_cas : 1, + + __reserved : 61; +}; + +extern struct sev_config sev_cfg; + #endif diff --git a/arch/x86/include/asm/sev-internal.h b/arch/x86/include/asm/sev-internal.h new file mode 100644 index 000000000000..3dfd306d1c9e --- /dev/null +++ b/arch/x86/include/asm/sev-internal.h @@ -0,0 +1,105 @@ +/* SPDX-License-Identifier: GPL-2.0 */ + +#define DR7_RESET_VALUE 0x400 + +extern struct ghcb boot_ghcb_page; +extern u64 sev_hv_features; +extern u64 sev_secrets_pa; + +/* #VC handler runtime per-CPU data */ +struct sev_es_runtime_data { + struct ghcb ghcb_page; + + /* + * Reserve one page per CPU as backup storage for the unencrypted GHCB. + * It is needed when an NMI happens while the #VC handler uses the real + * GHCB, and the NMI handler itself is causing another #VC exception. In + * that case the GHCB content of the first handler needs to be backed up + * and restored. 
+ */ + struct ghcb backup_ghcb; + + /* + * Mark the per-cpu GHCBs as in-use to detect nested #VC exceptions. + * There is no need for it to be atomic, because nothing is written to + * the GHCB between the read and the write of ghcb_active. So it is safe + * to use it when a nested #VC exception happens before the write. + * + * This is necessary for example in the #VC->NMI->#VC case when the NMI + * happens while the first #VC handler uses the GHCB. When the NMI code + * raises a second #VC handler it might overwrite the contents of the + * GHCB written by the first handler. To avoid this the content of the + * GHCB is saved and restored when the GHCB is detected to be in use + * already. + */ + bool ghcb_active; + bool backup_ghcb_active; + + /* + * Cached DR7 value - write it on DR7 writes and return it on reads. + * That value will never make it to the real hardware DR7 as debugging + * is currently unsupported in SEV-ES guests. + */ + unsigned long dr7; +}; + +struct ghcb_state { + struct ghcb *ghcb; +}; + +extern struct svsm_ca boot_svsm_ca_page; + +struct ghcb *__sev_get_ghcb(struct ghcb_state *state); +void __sev_put_ghcb(struct ghcb_state *state); + +DECLARE_PER_CPU(struct sev_es_runtime_data*, runtime_data); +DECLARE_PER_CPU(struct sev_es_save_area *, sev_vmsa); + +void early_set_pages_state(unsigned long vaddr, unsigned long paddr, + unsigned long npages, enum psc_op op); + +DECLARE_PER_CPU(struct svsm_ca *, svsm_caa); +DECLARE_PER_CPU(u64, svsm_caa_pa); + +extern struct svsm_ca *boot_svsm_caa; +extern u64 boot_svsm_caa_pa; + +static __always_inline struct svsm_ca *svsm_get_caa(void) +{ + if (sev_cfg.use_cas) + return this_cpu_read(svsm_caa); + else + return boot_svsm_caa; +} + +static __always_inline u64 svsm_get_caa_pa(void) +{ + if (sev_cfg.use_cas) + return this_cpu_read(svsm_caa_pa); + else + return boot_svsm_caa_pa; +} + +int svsm_perform_call_protocol(struct svsm_call *call); + +static inline u64 sev_es_rd_ghcb_msr(void) +{ + return 
native_rdmsrq(MSR_AMD64_SEV_ES_GHCB); +} + +static __always_inline void sev_es_wr_ghcb_msr(u64 val) +{ + u32 low, high; + + low = (u32)(val); + high = (u32)(val >> 32); + + native_wrmsr(MSR_AMD64_SEV_ES_GHCB, low, high); +} + +void snp_register_ghcb_early(unsigned long paddr); +bool sev_es_negotiate_protocol(void); +bool sev_es_check_cpu_features(void); +u64 get_hv_features(void); + +const struct snp_cpuid_table *snp_cpuid_get_table(void); diff --git a/arch/x86/include/asm/sev.h b/arch/x86/include/asm/sev.h index 4a23e52fe0ee..58e028d42e41 100644 --- a/arch/x86/include/asm/sev.h +++ b/arch/x86/include/asm/sev.h @@ -9,9 +9,13 @@ #define __ASM_ENCRYPTED_STATE_H #include <linux/types.h> +#include <linux/sev-guest.h> + #include <asm/insn.h> #include <asm/sev-common.h> -#include <asm/bootparam.h> +#include <asm/coco.h> +#include <asm/set_memory.h> +#include <asm/svm.h> #define GHCB_PROTOCOL_MIN 1ULL #define GHCB_PROTOCOL_MAX 2ULL @@ -19,6 +23,8 @@ #define VMGEXIT() { asm volatile("rep; vmmcall\n\r"); } +struct boot_params; + enum es_result { ES_OK, /* All good */ ES_UNSUPPORTED, /* Requested operation not supported */ @@ -78,11 +84,62 @@ extern void vc_no_ghcb(void); extern void vc_boot_ghcb(void); extern bool handle_vc_boot_ghcb(struct pt_regs *regs); +/* + * Individual entries of the SNP CPUID table, as defined by the SNP + * Firmware ABI, Revision 0.9, Section 7.1, Table 14. + */ +struct snp_cpuid_fn { + u32 eax_in; + u32 ecx_in; + u64 xcr0_in; + u64 xss_in; + u32 eax; + u32 ebx; + u32 ecx; + u32 edx; + u64 __reserved; +} __packed; + +/* + * SNP CPUID table, as defined by the SNP Firmware ABI, Revision 0.9, + * Section 8.14.2.6. Also noted there is the SNP firmware-enforced limit + * of 64 entries per CPUID table. 
+ */ +#define SNP_CPUID_COUNT_MAX 64 + +struct snp_cpuid_table { + u32 count; + u32 __reserved1; + u64 __reserved2; + struct snp_cpuid_fn fn[SNP_CPUID_COUNT_MAX]; +} __packed; + +/* PVALIDATE return codes */ +#define PVALIDATE_FAIL_SIZEMISMATCH 6 + /* Software defined (when rFlags.CF = 1) */ #define PVALIDATE_FAIL_NOUPDATE 255 +/* RMUPDATE detected 4K page and 2MB page overlap. */ +#define RMPUPDATE_FAIL_OVERLAP 4 + +/* PSMASH failed due to concurrent access by another CPU */ +#define PSMASH_FAIL_INUSE 3 + /* RMP page size */ #define RMP_PG_SIZE_4K 0 +#define RMP_PG_SIZE_2M 1 +#define RMP_TO_PG_LEVEL(level) (((level) == RMP_PG_SIZE_4K) ? PG_LEVEL_4K : PG_LEVEL_2M) +#define PG_LEVEL_TO_RMP(level) (((level) == PG_LEVEL_4K) ? RMP_PG_SIZE_4K : RMP_PG_SIZE_2M) + +struct rmp_state { + u64 gpa; + u8 assigned; + u8 pagesize; + u8 immutable; + u8 rsvd; + u32 asid; +} __packed; #define RMPADJUST_VMSA_PAGE_BIT BIT(16) @@ -94,8 +151,92 @@ struct snp_req_data { unsigned int data_npages; }; -struct sev_guest_platform_data { - u64 secrets_gpa; +#define MAX_AUTHTAG_LEN 32 +#define AUTHTAG_LEN 16 +#define AAD_LEN 48 +#define MSG_HDR_VER 1 + +#define SNP_REQ_MAX_RETRY_DURATION (60*HZ) +#define SNP_REQ_RETRY_DELAY (2*HZ) + +/* See SNP spec SNP_GUEST_REQUEST section for the structure */ +enum msg_type { + SNP_MSG_TYPE_INVALID = 0, + SNP_MSG_CPUID_REQ, + SNP_MSG_CPUID_RSP, + SNP_MSG_KEY_REQ, + SNP_MSG_KEY_RSP, + SNP_MSG_REPORT_REQ, + SNP_MSG_REPORT_RSP, + SNP_MSG_EXPORT_REQ, + SNP_MSG_EXPORT_RSP, + SNP_MSG_IMPORT_REQ, + SNP_MSG_IMPORT_RSP, + SNP_MSG_ABSORB_REQ, + SNP_MSG_ABSORB_RSP, + SNP_MSG_VMRK_REQ, + SNP_MSG_VMRK_RSP, + + SNP_MSG_TSC_INFO_REQ = 17, + SNP_MSG_TSC_INFO_RSP, + + SNP_MSG_TYPE_MAX +}; + +enum aead_algo { + SNP_AEAD_INVALID, + SNP_AEAD_AES_256_GCM, +}; + +struct snp_guest_msg_hdr { + u8 authtag[MAX_AUTHTAG_LEN]; + u64 msg_seqno; + u8 rsvd1[8]; + u8 algo; + u8 hdr_version; + u16 hdr_sz; + u8 msg_type; + u8 msg_version; + u16 msg_sz; + u32 rsvd2; + u8 msg_vmpck; + u8 
rsvd3[35]; +} __packed; + +struct snp_guest_msg { + struct snp_guest_msg_hdr hdr; + u8 payload[PAGE_SIZE - sizeof(struct snp_guest_msg_hdr)]; +} __packed; + +#define SNP_TSC_INFO_REQ_SZ 128 + +struct snp_tsc_info_req { + u8 rsvd[SNP_TSC_INFO_REQ_SZ]; +} __packed; + +struct snp_tsc_info_resp { + u32 status; + u32 rsvd1; + u64 tsc_scale; + u64 tsc_offset; + u32 tsc_factor; + u8 rsvd2[100]; +} __packed; + +struct snp_guest_req { + void *req_buf; + size_t req_sz; + + void *resp_buf; + size_t resp_sz; + + u64 exit_code; + unsigned int vmpck_id; + u8 msg_version; + u8 msg_type; + + struct snp_req_data input; + void *certs_data; }; /* @@ -118,7 +259,7 @@ struct secrets_os_area { #define VMPCK_KEY_LEN 32 /* See the SNP spec version 0.9 for secrets page format */ -struct snp_secrets_page_layout { +struct snp_secrets_page { u32 version; u32 imien : 1, rsvd1 : 31; @@ -130,32 +271,191 @@ struct snp_secrets_page_layout { u8 vmpck2[VMPCK_KEY_LEN]; u8 vmpck3[VMPCK_KEY_LEN]; struct secrets_os_area os_area; - u8 rsvd3[3840]; + + u8 vmsa_tweak_bitmap[64]; + + /* SVSM fields */ + u64 svsm_base; + u64 svsm_size; + u64 svsm_caa; + u32 svsm_max_version; + u8 svsm_guest_vmpl; + u8 rsvd3[3]; + + /* Remainder of page */ + u8 rsvd4[3744]; } __packed; +struct snp_msg_desc { + /* request and response are in unencrypted memory */ + struct snp_guest_msg *request, *response; + + /* + * Avoid information leakage by double-buffering shared messages + * in fields that are in regular encrypted memory. + */ + struct snp_guest_msg secret_request, secret_response; + + struct snp_secrets_page *secrets; + + struct aesgcm_ctx *ctx; + + u32 *os_area_msg_seqno; + u8 *vmpck; + int vmpck_id; +}; + +/* + * The SVSM Calling Area (CA) related structures. 
+ */ +struct svsm_ca { + u8 call_pending; + u8 mem_available; + u8 rsvd1[6]; + + u8 svsm_buffer[PAGE_SIZE - 8]; +}; + +#define SVSM_SUCCESS 0 +#define SVSM_ERR_INCOMPLETE 0x80000000 +#define SVSM_ERR_UNSUPPORTED_PROTOCOL 0x80000001 +#define SVSM_ERR_UNSUPPORTED_CALL 0x80000002 +#define SVSM_ERR_INVALID_ADDRESS 0x80000003 +#define SVSM_ERR_INVALID_FORMAT 0x80000004 +#define SVSM_ERR_INVALID_PARAMETER 0x80000005 +#define SVSM_ERR_INVALID_REQUEST 0x80000006 +#define SVSM_ERR_BUSY 0x80000007 +#define SVSM_PVALIDATE_FAIL_SIZEMISMATCH 0x80001006 + +/* + * The SVSM PVALIDATE related structures + */ +struct svsm_pvalidate_entry { + u64 page_size : 2, + action : 1, + ignore_cf : 1, + rsvd : 8, + pfn : 52; +}; + +struct svsm_pvalidate_call { + u16 num_entries; + u16 cur_index; + + u8 rsvd1[4]; + + struct svsm_pvalidate_entry entry[]; +}; + +#define SVSM_PVALIDATE_MAX_COUNT ((sizeof_field(struct svsm_ca, svsm_buffer) - \ + offsetof(struct svsm_pvalidate_call, entry)) / \ + sizeof(struct svsm_pvalidate_entry)) + +/* + * The SVSM Attestation related structures + */ +struct svsm_loc_entry { + u64 pa; + u32 len; + u8 rsvd[4]; +}; + +struct svsm_attest_call { + struct svsm_loc_entry report_buf; + struct svsm_loc_entry nonce; + struct svsm_loc_entry manifest_buf; + struct svsm_loc_entry certificates_buf; + + /* For attesting a single service */ + u8 service_guid[16]; + u32 service_manifest_ver; + u8 rsvd[4]; +}; + +/* PTE descriptor used for the prepare_pte_enc() operations. 
*/ +struct pte_enc_desc { + pte_t *kpte; + int pte_level; + bool encrypt; + /* pfn of the kpte above */ + unsigned long pfn; + /* physical address of @pfn */ + unsigned long pa; + /* virtual address of @pfn */ + void *va; + /* memory covered by the pte */ + unsigned long size; + pgprot_t new_pgprot; +}; + +/* + * SVSM protocol structure + */ +struct svsm_call { + struct svsm_ca *caa; + u64 rax; + u64 rcx; + u64 rdx; + u64 r8; + u64 r9; + u64 rax_out; + u64 rcx_out; + u64 rdx_out; + u64 r8_out; + u64 r9_out; +}; + +#define SVSM_CORE_CALL(x) ((0ULL << 32) | (x)) +#define SVSM_CORE_REMAP_CA 0 +#define SVSM_CORE_PVALIDATE 1 +#define SVSM_CORE_CREATE_VCPU 2 +#define SVSM_CORE_DELETE_VCPU 3 + +#define SVSM_ATTEST_CALL(x) ((1ULL << 32) | (x)) +#define SVSM_ATTEST_SERVICES 0 +#define SVSM_ATTEST_SINGLE_SERVICE 1 + +#define SVSM_VTPM_CALL(x) ((2ULL << 32) | (x)) +#define SVSM_VTPM_QUERY 0 +#define SVSM_VTPM_CMD 1 + #ifdef CONFIG_AMD_MEM_ENCRYPT -extern struct static_key_false sev_es_enable_key; + +extern u8 snp_vmpl; + extern void __sev_es_ist_enter(struct pt_regs *regs); extern void __sev_es_ist_exit(void); static __always_inline void sev_es_ist_enter(struct pt_regs *regs) { - if (static_branch_unlikely(&sev_es_enable_key)) + if (cc_vendor == CC_VENDOR_AMD && + cc_platform_has(CC_ATTR_GUEST_STATE_ENCRYPT)) __sev_es_ist_enter(regs); } static __always_inline void sev_es_ist_exit(void) { - if (static_branch_unlikely(&sev_es_enable_key)) + if (cc_vendor == CC_VENDOR_AMD && + cc_platform_has(CC_ATTR_GUEST_STATE_ENCRYPT)) __sev_es_ist_exit(); } extern int sev_es_setup_ap_jump_table(struct real_mode_header *rmh); extern void __sev_es_nmi_complete(void); static __always_inline void sev_es_nmi_complete(void) { - if (static_branch_unlikely(&sev_es_enable_key)) + if (cc_vendor == CC_VENDOR_AMD && + cc_platform_has(CC_ATTR_GUEST_STATE_ENCRYPT)) __sev_es_nmi_complete(); } extern int __init sev_es_efi_map_ghcbs(pgd_t *pgd); +extern void sev_enable(struct boot_params *bp); +/* + * 
RMPADJUST modifies the RMP permissions of a page of a lesser- + * privileged (numerically higher) VMPL. + * + * If the guest is running at a higher-privilege than the privilege + * level the instruction is targeting, the instruction will succeed, + * otherwise, it will fail. + */ static inline int rmpadjust(unsigned long vaddr, bool rmp_psize, unsigned long attrs) { int rc; @@ -185,42 +485,142 @@ static inline int pvalidate(unsigned long vaddr, bool rmp_psize, bool validate) return rc; } + +struct snp_guest_request_ioctl; + void setup_ghcb(void); -void __init early_snp_set_memory_private(unsigned long vaddr, unsigned long paddr, - unsigned int npages); -void __init early_snp_set_memory_shared(unsigned long vaddr, unsigned long paddr, - unsigned int npages); -void __init snp_prep_memory(unsigned long paddr, unsigned int sz, enum psc_op op); -void snp_set_memory_shared(unsigned long vaddr, unsigned int npages); -void snp_set_memory_private(unsigned long vaddr, unsigned int npages); +void early_snp_set_memory_private(unsigned long vaddr, unsigned long paddr, + unsigned long npages); +void early_snp_set_memory_shared(unsigned long vaddr, unsigned long paddr, + unsigned long npages); +void snp_set_memory_shared(unsigned long vaddr, unsigned long npages); +void snp_set_memory_private(unsigned long vaddr, unsigned long npages); void snp_set_wakeup_secondary_cpu(void); bool snp_init(struct boot_params *bp); -void snp_abort(void); -int snp_issue_guest_request(u64 exit_code, struct snp_req_data *input, unsigned long *fw_err); -#else +void __noreturn snp_abort(void); +void snp_dmi_setup(void); +int snp_issue_svsm_attest_req(u64 call_id, struct svsm_call *call, struct svsm_attest_call *input); +void snp_accept_memory(phys_addr_t start, phys_addr_t end); +u64 snp_get_unsupported_features(u64 status); +u64 sev_get_status(void); +void sev_show_status(void); +void snp_update_svsm_ca(void); +int prepare_pte_enc(struct pte_enc_desc *d); +void set_pte_enc_mask(pte_t *kpte, unsigned 
long pfn, pgprot_t new_prot); +void snp_kexec_finish(void); +void snp_kexec_begin(void); + +int snp_msg_init(struct snp_msg_desc *mdesc, int vmpck_id); +struct snp_msg_desc *snp_msg_alloc(void); +void snp_msg_free(struct snp_msg_desc *mdesc); +int snp_send_guest_request(struct snp_msg_desc *mdesc, struct snp_guest_req *req, + struct snp_guest_request_ioctl *rio); + +int snp_svsm_vtpm_send_command(u8 *buffer); + +void __init snp_secure_tsc_prepare(void); +void __init snp_secure_tsc_init(void); + +static __always_inline void vc_ghcb_invalidate(struct ghcb *ghcb) +{ + ghcb->save.sw_exit_code = 0; + __builtin_memset(ghcb->save.valid_bitmap, 0, sizeof(ghcb->save.valid_bitmap)); +} + +void vc_forward_exception(struct es_em_ctxt *ctxt); + +/* I/O parameters for CPUID-related helpers */ +struct cpuid_leaf { + u32 fn; + u32 subfn; + u32 eax; + u32 ebx; + u32 ecx; + u32 edx; +}; + +int snp_cpuid(struct ghcb *ghcb, struct es_em_ctxt *ctxt, struct cpuid_leaf *leaf); + +void __noreturn sev_es_terminate(unsigned int set, unsigned int reason); +enum es_result sev_es_ghcb_hv_call(struct ghcb *ghcb, + struct es_em_ctxt *ctxt, + u64 exit_code, u64 exit_info_1, + u64 exit_info_2); + +extern struct ghcb *boot_ghcb; + +#else /* !CONFIG_AMD_MEM_ENCRYPT */ + +#define snp_vmpl 0 static inline void sev_es_ist_enter(struct pt_regs *regs) { } static inline void sev_es_ist_exit(void) { } static inline int sev_es_setup_ap_jump_table(struct real_mode_header *rmh) { return 0; } static inline void sev_es_nmi_complete(void) { } static inline int sev_es_efi_map_ghcbs(pgd_t *pgd) { return 0; } +static inline void sev_enable(struct boot_params *bp) { } static inline int pvalidate(unsigned long vaddr, bool rmp_psize, bool validate) { return 0; } static inline int rmpadjust(unsigned long vaddr, bool rmp_psize, unsigned long attrs) { return 0; } static inline void setup_ghcb(void) { } static inline void __init -early_snp_set_memory_private(unsigned long vaddr, unsigned long paddr, unsigned int npages) { 
} +early_snp_set_memory_private(unsigned long vaddr, unsigned long paddr, unsigned long npages) { } static inline void __init -early_snp_set_memory_shared(unsigned long vaddr, unsigned long paddr, unsigned int npages) { } -static inline void __init snp_prep_memory(unsigned long paddr, unsigned int sz, enum psc_op op) { } -static inline void snp_set_memory_shared(unsigned long vaddr, unsigned int npages) { } -static inline void snp_set_memory_private(unsigned long vaddr, unsigned int npages) { } +early_snp_set_memory_shared(unsigned long vaddr, unsigned long paddr, unsigned long npages) { } +static inline void snp_set_memory_shared(unsigned long vaddr, unsigned long npages) { } +static inline void snp_set_memory_private(unsigned long vaddr, unsigned long npages) { } static inline void snp_set_wakeup_secondary_cpu(void) { } static inline bool snp_init(struct boot_params *bp) { return false; } static inline void snp_abort(void) { } -static inline int snp_issue_guest_request(u64 exit_code, struct snp_req_data *input, - unsigned long *fw_err) +static inline void snp_dmi_setup(void) { } +static inline int snp_issue_svsm_attest_req(u64 call_id, struct svsm_call *call, struct svsm_attest_call *input) { return -ENOTTY; } +static inline void snp_accept_memory(phys_addr_t start, phys_addr_t end) { } +static inline u64 snp_get_unsupported_features(u64 status) { return 0; } +static inline u64 sev_get_status(void) { return 0; } +static inline void sev_show_status(void) { } +static inline void snp_update_svsm_ca(void) { } +static inline int prepare_pte_enc(struct pte_enc_desc *d) { return 0; } +static inline void set_pte_enc_mask(pte_t *kpte, unsigned long pfn, pgprot_t new_prot) { } +static inline void snp_kexec_finish(void) { } +static inline void snp_kexec_begin(void) { } +static inline int snp_msg_init(struct snp_msg_desc *mdesc, int vmpck_id) { return -1; } +static inline struct snp_msg_desc *snp_msg_alloc(void) { return NULL; } +static inline void snp_msg_free(struct 
snp_msg_desc *mdesc) { } +static inline int snp_send_guest_request(struct snp_msg_desc *mdesc, struct snp_guest_req *req, + struct snp_guest_request_ioctl *rio) { return -ENODEV; } +static inline int snp_svsm_vtpm_send_command(u8 *buffer) { return -ENODEV; } +static inline void __init snp_secure_tsc_prepare(void) { } +static inline void __init snp_secure_tsc_init(void) { } + +#endif /* CONFIG_AMD_MEM_ENCRYPT */ + +#ifdef CONFIG_KVM_AMD_SEV +bool snp_probe_rmptable_info(void); +int snp_rmptable_init(void); +int snp_lookup_rmpentry(u64 pfn, bool *assigned, int *level); +void snp_dump_hva_rmpentry(unsigned long address); +int psmash(u64 pfn); +int rmp_make_private(u64 pfn, u64 gpa, enum pg_level level, u32 asid, bool immutable); +int rmp_make_shared(u64 pfn, enum pg_level level); +void snp_leak_pages(u64 pfn, unsigned int npages); +void kdump_sev_callback(void); +void snp_fixup_e820_tables(void); +#else +static inline bool snp_probe_rmptable_info(void) { return false; } +static inline int snp_rmptable_init(void) { return -ENOSYS; } +static inline int snp_lookup_rmpentry(u64 pfn, bool *assigned, int *level) { return -ENODEV; } +static inline void snp_dump_hva_rmpentry(unsigned long address) {} +static inline int psmash(u64 pfn) { return -ENODEV; } +static inline int rmp_make_private(u64 pfn, u64 gpa, enum pg_level level, u32 asid, + bool immutable) +{ + return -ENODEV; +} +static inline int rmp_make_shared(u64 pfn, enum pg_level level) { return -ENODEV; } +static inline void snp_leak_pages(u64 pfn, unsigned int npages) {} +static inline void kdump_sev_callback(void) { } +static inline void snp_fixup_e820_tables(void) {} #endif #endif diff --git a/arch/x86/include/asm/sgx.h b/arch/x86/include/asm/sgx.h index 3f9334ef67cd..6a0069761508 100644 --- a/arch/x86/include/asm/sgx.h +++ b/arch/x86/include/asm/sgx.h @@ -65,17 +65,22 @@ enum sgx_encls_function { /** * enum sgx_return_code - The return code type for ENCLS, ENCLU and ENCLV + * %SGX_EPC_PAGE_CONFLICT: Page is being 
written by other ENCLS function. * %SGX_NOT_TRACKED: Previous ETRACK's shootdown sequence has not * been completed yet. * %SGX_CHILD_PRESENT SECS has child pages present in the EPC. * %SGX_INVALID_EINITTOKEN: EINITTOKEN is invalid and enclave signer's * public key does not match IA32_SGXLEPUBKEYHASH. + * %SGX_PAGE_NOT_MODIFIABLE: The EPC page cannot be modified because it + * is in the PENDING or MODIFIED state. * %SGX_UNMASKED_EVENT: An unmasked event, e.g. INTR, was received */ enum sgx_return_code { + SGX_EPC_PAGE_CONFLICT = 7, SGX_NOT_TRACKED = 11, SGX_CHILD_PRESENT = 13, SGX_INVALID_EINITTOKEN = 16, + SGX_PAGE_NOT_MODIFIABLE = 20, SGX_UNMASKED_EVENT = 128, }; @@ -110,17 +115,36 @@ enum sgx_miscselect { * %SGX_ATTR_EINITTOKENKEY: Allow to use token signing key that is used to * sign cryptographic tokens that can be passed to * EINIT as an authorization to run an enclave. + * %SGX_ATTR_ASYNC_EXIT_NOTIFY: Allow enclaves to be notified after an + * asynchronous exit has occurred. */ enum sgx_attribute { - SGX_ATTR_INIT = BIT(0), - SGX_ATTR_DEBUG = BIT(1), - SGX_ATTR_MODE64BIT = BIT(2), - SGX_ATTR_PROVISIONKEY = BIT(4), - SGX_ATTR_EINITTOKENKEY = BIT(5), - SGX_ATTR_KSS = BIT(7), + SGX_ATTR_INIT = BIT(0), + SGX_ATTR_DEBUG = BIT(1), + SGX_ATTR_MODE64BIT = BIT(2), + /* BIT(3) is reserved */ + SGX_ATTR_PROVISIONKEY = BIT(4), + SGX_ATTR_EINITTOKENKEY = BIT(5), + /* BIT(6) is for CET */ + SGX_ATTR_KSS = BIT(7), + /* BIT(8) is reserved */ + /* BIT(9) is reserved */ + SGX_ATTR_ASYNC_EXIT_NOTIFY = BIT(10), }; -#define SGX_ATTR_RESERVED_MASK (BIT_ULL(3) | BIT_ULL(6) | GENMASK_ULL(63, 8)) +#define SGX_ATTR_RESERVED_MASK (BIT_ULL(3) | \ + BIT_ULL(6) | \ + BIT_ULL(8) | \ + BIT_ULL(9) | \ + GENMASK_ULL(63, 11)) + +#define SGX_ATTR_UNPRIV_MASK (SGX_ATTR_DEBUG | \ + SGX_ATTR_MODE64BIT | \ + SGX_ATTR_KSS | \ + SGX_ATTR_ASYNC_EXIT_NOTIFY) + +#define SGX_ATTR_PRIV_MASK (SGX_ATTR_PROVISIONKEY | \ + SGX_ATTR_EINITTOKENKEY) /** * struct sgx_secs - SGX Enclave Control Structure (SECS) @@ 
-234,6 +258,9 @@ struct sgx_pageinfo { * %SGX_PAGE_TYPE_REG: a regular page * %SGX_PAGE_TYPE_VA: a VA page * %SGX_PAGE_TYPE_TRIM: a page in trimmed state + * + * Make sure when making changes to this enum that its values can still fit + * in the bitfield within &struct sgx_encl_page */ enum sgx_page_type { SGX_PAGE_TYPE_SECS, diff --git a/arch/x86/include/asm/shared/io.h b/arch/x86/include/asm/shared/io.h index c0ef921c0586..8009d781c2f9 100644 --- a/arch/x86/include/asm/shared/io.h +++ b/arch/x86/include/asm/shared/io.h @@ -5,13 +5,13 @@ #include <linux/types.h> #define BUILDIO(bwl, bw, type) \ -static inline void __out##bwl(type value, u16 port) \ +static __always_inline void __out##bwl(type value, u16 port) \ { \ asm volatile("out" #bwl " %" #bw "0, %w1" \ : : "a"(value), "Nd"(port)); \ } \ \ -static inline type __in##bwl(u16 port) \ +static __always_inline type __in##bwl(u16 port) \ { \ type value; \ asm volatile("in" #bwl " %w1, %" #bw "0" \ diff --git a/arch/x86/include/asm/shared/tdx.h b/arch/x86/include/asm/shared/tdx.h index e53f26228fbb..d8525e6ef50a 100644 --- a/arch/x86/include/asm/shared/tdx.h +++ b/arch/x86/include/asm/shared/tdx.h @@ -7,34 +7,184 @@ #define TDX_HYPERCALL_STANDARD 0 -#define TDX_HCALL_HAS_OUTPUT BIT(0) -#define TDX_HCALL_ISSUE_STI BIT(1) - #define TDX_CPUID_LEAF_ID 0x21 #define TDX_IDENT "IntelTDX " -#ifndef __ASSEMBLY__ +/* TDX module Call Leaf IDs */ +#define TDG_VP_VMCALL 0 +#define TDG_VP_INFO 1 +#define TDG_MR_RTMR_EXTEND 2 +#define TDG_VP_VEINFO_GET 3 +#define TDG_MR_REPORT 4 +#define TDG_MEM_PAGE_ACCEPT 6 +#define TDG_VM_RD 7 +#define TDG_VM_WR 8 + +/* TDX attributes */ +#define TDX_ATTR_DEBUG_BIT 0 +#define TDX_ATTR_DEBUG BIT_ULL(TDX_ATTR_DEBUG_BIT) +#define TDX_ATTR_HGS_PLUS_PROF_BIT 4 +#define TDX_ATTR_HGS_PLUS_PROF BIT_ULL(TDX_ATTR_HGS_PLUS_PROF_BIT) +#define TDX_ATTR_PERF_PROF_BIT 5 +#define TDX_ATTR_PERF_PROF BIT_ULL(TDX_ATTR_PERF_PROF_BIT) +#define TDX_ATTR_PMT_PROF_BIT 6 +#define TDX_ATTR_PMT_PROF 
BIT_ULL(TDX_ATTR_PMT_PROF_BIT) +#define TDX_ATTR_ICSSD_BIT 16 +#define TDX_ATTR_ICSSD BIT_ULL(TDX_ATTR_ICSSD_BIT) +#define TDX_ATTR_LASS_BIT 27 +#define TDX_ATTR_LASS BIT_ULL(TDX_ATTR_LASS_BIT) +#define TDX_ATTR_SEPT_VE_DISABLE_BIT 28 +#define TDX_ATTR_SEPT_VE_DISABLE BIT_ULL(TDX_ATTR_SEPT_VE_DISABLE_BIT) +#define TDX_ATTR_MIGRTABLE_BIT 29 +#define TDX_ATTR_MIGRTABLE BIT_ULL(TDX_ATTR_MIGRTABLE_BIT) +#define TDX_ATTR_PKS_BIT 30 +#define TDX_ATTR_PKS BIT_ULL(TDX_ATTR_PKS_BIT) +#define TDX_ATTR_KL_BIT 31 +#define TDX_ATTR_KL BIT_ULL(TDX_ATTR_KL_BIT) +#define TDX_ATTR_TPA_BIT 62 +#define TDX_ATTR_TPA BIT_ULL(TDX_ATTR_TPA_BIT) +#define TDX_ATTR_PERFMON_BIT 63 +#define TDX_ATTR_PERFMON BIT_ULL(TDX_ATTR_PERFMON_BIT) + +/* TDX TD-Scope Metadata. To be used by TDG.VM.WR and TDG.VM.RD */ +#define TDCS_CONFIG_FLAGS 0x1110000300000016 +#define TDCS_TD_CTLS 0x1110000300000017 +#define TDCS_NOTIFY_ENABLES 0x9100000000000010 +#define TDCS_TOPOLOGY_ENUM_CONFIGURED 0x9100000000000019 + +/* TDCS_CONFIG_FLAGS bits */ +#define TDCS_CONFIG_FLEXIBLE_PENDING_VE BIT_ULL(1) + +/* TDCS_TD_CTLS bits */ +#define TD_CTLS_PENDING_VE_DISABLE_BIT 0 +#define TD_CTLS_PENDING_VE_DISABLE BIT_ULL(TD_CTLS_PENDING_VE_DISABLE_BIT) +#define TD_CTLS_ENUM_TOPOLOGY_BIT 1 +#define TD_CTLS_ENUM_TOPOLOGY BIT_ULL(TD_CTLS_ENUM_TOPOLOGY_BIT) +#define TD_CTLS_VIRT_CPUID2_BIT 2 +#define TD_CTLS_VIRT_CPUID2 BIT_ULL(TD_CTLS_VIRT_CPUID2_BIT) +#define TD_CTLS_REDUCE_VE_BIT 3 +#define TD_CTLS_REDUCE_VE BIT_ULL(TD_CTLS_REDUCE_VE_BIT) +#define TD_CTLS_LOCK_BIT 63 +#define TD_CTLS_LOCK BIT_ULL(TD_CTLS_LOCK_BIT) + +/* TDX hypercall Leaf IDs */ +#define TDVMCALL_GET_TD_VM_CALL_INFO 0x10000 +#define TDVMCALL_MAP_GPA 0x10001 +#define TDVMCALL_GET_QUOTE 0x10002 +#define TDVMCALL_REPORT_FATAL_ERROR 0x10003 + +/* + * TDG.VP.VMCALL Status Codes (returned in R10) + */ +#define TDVMCALL_STATUS_SUCCESS 0x0000000000000000ULL +#define TDVMCALL_STATUS_RETRY 0x0000000000000001ULL +#define TDVMCALL_STATUS_INVALID_OPERAND 
0x8000000000000000ULL +#define TDVMCALL_STATUS_ALIGN_ERROR 0x8000000000000002ULL +#define TDVMCALL_STATUS_SUBFUNC_UNSUPPORTED 0x8000000000000003ULL + +/* + * Bitmasks of exposed registers (with VMM). + */ +#define TDX_RDX BIT(2) +#define TDX_RBX BIT(3) +#define TDX_RSI BIT(6) +#define TDX_RDI BIT(7) +#define TDX_R8 BIT(8) +#define TDX_R9 BIT(9) +#define TDX_R10 BIT(10) +#define TDX_R11 BIT(11) +#define TDX_R12 BIT(12) +#define TDX_R13 BIT(13) +#define TDX_R14 BIT(14) +#define TDX_R15 BIT(15) + +/* + * These registers are clobbered to hold arguments for each + * TDVMCALL. They are safe to expose to the VMM. + * Each bit in this mask represents a register ID. Bit field + * details can be found in TDX GHCI specification, section + * titled "TDCALL [TDG.VP.VMCALL] leaf". + */ +#define TDVMCALL_EXPOSE_REGS_MASK \ + (TDX_RDX | TDX_RBX | TDX_RSI | TDX_RDI | TDX_R8 | TDX_R9 | \ + TDX_R10 | TDX_R11 | TDX_R12 | TDX_R13 | TDX_R14 | TDX_R15) + +/* TDX supported page sizes from the TDX module ABI. */ +#define TDX_PS_4K 0 +#define TDX_PS_2M 1 +#define TDX_PS_1G 2 +#define TDX_PS_NR (TDX_PS_1G + 1) + +#ifndef __ASSEMBLER__ + +#include <linux/compiler_attributes.h> /* - * Used in __tdx_hypercall() to pass down and get back registers' values of - * the TDCALL instruction when requesting services from the VMM. - * - * This is a software only structure and not part of the TDX module/VMM ABI. + * Used in __tdcall*() to gather the input/output registers' values of the + * TDCALL instruction when requesting services from the TDX module. 
This is a + * software only structure and not part of the TDX module/VMM ABI */ -struct tdx_hypercall_args { +struct tdx_module_args { + /* callee-clobbered */ + u64 rcx; + u64 rdx; + u64 r8; + u64 r9; + /* extra callee-clobbered */ u64 r10; u64 r11; + /* callee-saved + rdi/rsi */ u64 r12; u64 r13; u64 r14; u64 r15; + u64 rbx; + u64 rdi; + u64 rsi; }; +/* Used to communicate with the TDX module */ +u64 __tdcall(u64 fn, struct tdx_module_args *args); +u64 __tdcall_ret(u64 fn, struct tdx_module_args *args); +u64 __tdcall_saved_ret(u64 fn, struct tdx_module_args *args); + /* Used to request services from the VMM */ -u64 __tdx_hypercall(struct tdx_hypercall_args *args, unsigned long flags); +u64 __tdx_hypercall(struct tdx_module_args *args); + +/* + * Wrapper for standard use of __tdx_hypercall with no output aside from + * return code. + */ +static inline u64 _tdx_hypercall(u64 fn, u64 r12, u64 r13, u64 r14, u64 r15) +{ + struct tdx_module_args args = { + .r10 = TDX_HYPERCALL_STANDARD, + .r11 = fn, + .r12 = r12, + .r13 = r13, + .r14 = r14, + .r15 = r15, + }; + + return __tdx_hypercall(&args); +} + /* Called from __tdx_hypercall() for unrecoverable failure */ -void __tdx_hypercall_failed(void); +void __noreturn __tdx_hypercall_failed(void); + +bool tdx_accept_memory(phys_addr_t start, phys_addr_t end); + +/* + * The TDG.VP.VMCALL-Instruction-execution sub-functions are defined + * independently from but are currently matched 1:1 with VMX EXIT_REASONs. + * Reusing the KVM EXIT_REASON macros makes it easier to connect the host and + * guest sides of these calls. 
+ */ +static __always_inline u64 hcall_func(u64 exit_reason) +{ + return exit_reason; +} -#endif /* !__ASSEMBLY__ */ +#endif /* !__ASSEMBLER__ */ #endif /* _ASM_X86_SHARED_TDX_H */ diff --git a/arch/x86/include/asm/shstk.h b/arch/x86/include/asm/shstk.h new file mode 100644 index 000000000000..ba6f2fe43848 --- /dev/null +++ b/arch/x86/include/asm/shstk.h @@ -0,0 +1,42 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _ASM_X86_SHSTK_H +#define _ASM_X86_SHSTK_H + +#ifndef __ASSEMBLER__ +#include <linux/types.h> + +struct task_struct; +struct ksignal; + +#ifdef CONFIG_X86_USER_SHADOW_STACK +struct thread_shstk { + u64 base; + u64 size; +}; + +long shstk_prctl(struct task_struct *task, int option, unsigned long arg2); +void reset_thread_features(void); +unsigned long shstk_alloc_thread_stack(struct task_struct *p, unsigned long clone_flags, + unsigned long stack_size); +void shstk_free(struct task_struct *p); +int setup_signal_shadow_stack(struct ksignal *ksig); +int restore_signal_shadow_stack(void); +int shstk_update_last_frame(unsigned long val); +bool shstk_is_enabled(void); +#else +static inline long shstk_prctl(struct task_struct *task, int option, + unsigned long arg2) { return -EINVAL; } +static inline void reset_thread_features(void) {} +static inline unsigned long shstk_alloc_thread_stack(struct task_struct *p, + unsigned long clone_flags, + unsigned long stack_size) { return 0; } +static inline void shstk_free(struct task_struct *p) {} +static inline int setup_signal_shadow_stack(struct ksignal *ksig) { return 0; } +static inline int restore_signal_shadow_stack(void) { return 0; } +static inline int shstk_update_last_frame(unsigned long val) { return 0; } +static inline bool shstk_is_enabled(void) { return false; } +#endif /* CONFIG_X86_USER_SHADOW_STACK */ + +#endif /* __ASSEMBLER__ */ + +#endif /* _ASM_X86_SHSTK_H */ diff --git a/arch/x86/include/asm/sigframe.h b/arch/x86/include/asm/sigframe.h index 5b1ed650b124..84eab2724875 100644 --- 
a/arch/x86/include/asm/sigframe.h +++ b/arch/x86/include/asm/sigframe.h @@ -85,6 +85,4 @@ struct rt_sigframe_x32 { #endif /* CONFIG_X86_64 */ -void __init init_sigframe_size(void); - #endif /* _ASM_X86_SIGFRAME_H */ diff --git a/arch/x86/include/asm/sighandling.h b/arch/x86/include/asm/sighandling.h index 65e667279e0f..8727c7e21dd1 100644 --- a/arch/x86/include/asm/sighandling.h +++ b/arch/x86/include/asm/sighandling.h @@ -15,4 +15,35 @@ void signal_fault(struct pt_regs *regs, void __user *frame, char *where); +void __user * +get_sigframe(struct ksignal *ksig, struct pt_regs *regs, size_t frame_size, + void __user **fpstate); + +int ia32_setup_frame(struct ksignal *ksig, struct pt_regs *regs); +int ia32_setup_rt_frame(struct ksignal *ksig, struct pt_regs *regs); +int x64_setup_rt_frame(struct ksignal *ksig, struct pt_regs *regs); +int x32_setup_rt_frame(struct ksignal *ksig, struct pt_regs *regs); + +/* + * To prevent immediate repeat of single step trap on return from SIGTRAP + * handler if the trap flag (TF) is set without an external debugger attached, + * clear the software event flag in the augmented SS, ensuring no single-step + * trap is pending upon ERETU completion. + * + * Note, this function should be called in sigreturn() before the original + * state is restored to make sure the TF is read from the entry frame. + */ +static __always_inline void prevent_single_step_upon_eretu(struct pt_regs *regs) +{ + /* + * If the trap flag (TF) is set, i.e., the sigreturn() SYSCALL instruction + * is being single-stepped, do not clear the software event flag in the + * augmented SS, thus a debugger won't skip over the following instruction. 
+ */ +#ifdef CONFIG_X86_FRED + if (!(regs->flags & X86_EFLAGS_TF)) + regs->fred_ss.swevent = 0; +#endif +} + #endif /* _ASM_X86_SIGHANDLING_H */ diff --git a/arch/x86/include/asm/signal.h b/arch/x86/include/asm/signal.h index 2dfb5fea13af..c72d46175374 100644 --- a/arch/x86/include/asm/signal.h +++ b/arch/x86/include/asm/signal.h @@ -2,7 +2,7 @@ #ifndef _ASM_X86_SIGNAL_H #define _ASM_X86_SIGNAL_H -#ifndef __ASSEMBLY__ +#ifndef __ASSEMBLER__ #include <linux/linkage.h> /* Most things should be clean enough to redefine this at will, if care @@ -28,14 +28,9 @@ typedef struct { #define SA_IA32_ABI 0x02000000u #define SA_X32_ABI 0x01000000u -#ifndef CONFIG_COMPAT -#define compat_sigset_t compat_sigset_t -typedef sigset_t compat_sigset_t; -#endif - -#endif /* __ASSEMBLY__ */ +#endif /* __ASSEMBLER__ */ #include <uapi/asm/signal.h> -#ifndef __ASSEMBLY__ +#ifndef __ASSEMBLER__ #define __ARCH_HAS_SA_RESTORER @@ -106,5 +101,5 @@ struct pt_regs; #endif /* !__i386__ */ -#endif /* __ASSEMBLY__ */ +#endif /* __ASSEMBLER__ */ #endif /* _ASM_X86_SIGNAL_H */ diff --git a/arch/x86/include/asm/simd.h b/arch/x86/include/asm/simd.h index a341c878e977..b8027b63cd7a 100644 --- a/arch/x86/include/asm/simd.h +++ b/arch/x86/include/asm/simd.h @@ -1,6 +1,10 @@ /* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _ASM_SIMD_H +#define _ASM_SIMD_H #include <asm/fpu/api.h> +#include <linux/compiler_attributes.h> +#include <linux/types.h> /* * may_use_simd - whether it is allowable at this time to issue SIMD @@ -10,3 +14,5 @@ static __must_check inline bool may_use_simd(void) { return irq_fpu_usable(); } + +#endif /* _ASM_SIMD_H */ diff --git a/arch/x86/include/asm/smap.h b/arch/x86/include/asm/smap.h index bab490379c65..4f84d421d1cf 100644 --- a/arch/x86/include/asm/smap.h +++ b/arch/x86/include/asm/smap.h @@ -13,30 +13,26 @@ #include <asm/cpufeatures.h> #include <asm/alternative.h> -/* "Raw" instruction opcodes */ -#define __ASM_CLAC ".byte 0x0f,0x01,0xca" -#define __ASM_STAC ".byte 0x0f,0x01,0xcb" - 
-#ifdef __ASSEMBLY__ +#ifdef __ASSEMBLER__ #define ASM_CLAC \ - ALTERNATIVE "", __ASM_CLAC, X86_FEATURE_SMAP + ALTERNATIVE "", "clac", X86_FEATURE_SMAP #define ASM_STAC \ - ALTERNATIVE "", __ASM_STAC, X86_FEATURE_SMAP + ALTERNATIVE "", "stac", X86_FEATURE_SMAP -#else /* __ASSEMBLY__ */ +#else /* __ASSEMBLER__ */ static __always_inline void clac(void) { /* Note: a barrier is implicit in alternative() */ - alternative("", __ASM_CLAC, X86_FEATURE_SMAP); + alternative("", "clac", X86_FEATURE_SMAP); } static __always_inline void stac(void) { /* Note: a barrier is implicit in alternative() */ - alternative("", __ASM_STAC, X86_FEATURE_SMAP); + alternative("", "stac", X86_FEATURE_SMAP); } static __always_inline unsigned long smap_save(void) @@ -44,7 +40,8 @@ static __always_inline unsigned long smap_save(void) unsigned long flags; asm volatile ("# smap_save\n\t" - ALTERNATIVE("", "pushf; pop %0; " __ASM_CLAC "\n\t", + ALTERNATIVE(ANNOTATE_IGNORE_ALTERNATIVE + "", "pushf; pop %0; clac", X86_FEATURE_SMAP) : "=rm" (flags) : : "memory", "cc"); @@ -54,17 +51,23 @@ static __always_inline unsigned long smap_save(void) static __always_inline void smap_restore(unsigned long flags) { asm volatile ("# smap_restore\n\t" - ALTERNATIVE("", "push %0; popf\n\t", + ALTERNATIVE(ANNOTATE_IGNORE_ALTERNATIVE + "", "push %0; popf", X86_FEATURE_SMAP) : : "g" (flags) : "memory", "cc"); } /* These macros can be used in asm() statements */ #define ASM_CLAC \ - ALTERNATIVE("", __ASM_CLAC, X86_FEATURE_SMAP) + ALTERNATIVE("", "clac", X86_FEATURE_SMAP) #define ASM_STAC \ - ALTERNATIVE("", __ASM_STAC, X86_FEATURE_SMAP) + ALTERNATIVE("", "stac", X86_FEATURE_SMAP) + +#define ASM_CLAC_UNSAFE \ + ALTERNATIVE("", ANNOTATE_IGNORE_ALTERNATIVE "clac", X86_FEATURE_SMAP) +#define ASM_STAC_UNSAFE \ + ALTERNATIVE("", ANNOTATE_IGNORE_ALTERNATIVE "stac", X86_FEATURE_SMAP) -#endif /* __ASSEMBLY__ */ +#endif /* __ASSEMBLER__ */ #endif /* _ASM_X86_SMAP_H */ diff --git a/arch/x86/include/asm/smp.h 
b/arch/x86/include/asm/smp.h index 81a0211a372d..0c1c68039d6f 100644 --- a/arch/x86/include/asm/smp.h +++ b/arch/x86/include/asm/smp.h @@ -1,15 +1,13 @@ /* SPDX-License-Identifier: GPL-2.0 */ #ifndef _ASM_X86_SMP_H #define _ASM_X86_SMP_H -#ifndef __ASSEMBLY__ +#ifndef __ASSEMBLER__ #include <linux/cpumask.h> -#include <asm/percpu.h> +#include <linux/thread_info.h> -#include <asm/thread_info.h> #include <asm/cpumask.h> -extern int smp_num_siblings; -extern unsigned int num_processors; +DECLARE_PER_CPU_CACHE_HOT(int, cpu_number); DECLARE_PER_CPU_READ_MOSTLY(cpumask_var_t, cpu_sibling_map); DECLARE_PER_CPU_READ_MOSTLY(cpumask_var_t, cpu_core_map); @@ -17,26 +15,9 @@ DECLARE_PER_CPU_READ_MOSTLY(cpumask_var_t, cpu_die_map); /* cpus sharing the last level cache: */ DECLARE_PER_CPU_READ_MOSTLY(cpumask_var_t, cpu_llc_shared_map); DECLARE_PER_CPU_READ_MOSTLY(cpumask_var_t, cpu_l2c_shared_map); -DECLARE_PER_CPU_READ_MOSTLY(u16, cpu_llc_id); -DECLARE_PER_CPU_READ_MOSTLY(u16, cpu_l2c_id); -DECLARE_PER_CPU_READ_MOSTLY(int, cpu_number); -static inline struct cpumask *cpu_llc_shared_mask(int cpu) -{ - return per_cpu(cpu_llc_shared_map, cpu); -} - -static inline struct cpumask *cpu_l2c_shared_mask(int cpu) -{ - return per_cpu(cpu_l2c_shared_map, cpu); -} - -DECLARE_EARLY_PER_CPU_READ_MOSTLY(u16, x86_cpu_to_apicid); +DECLARE_EARLY_PER_CPU_READ_MOSTLY(u32, x86_cpu_to_apicid); DECLARE_EARLY_PER_CPU_READ_MOSTLY(u32, x86_cpu_to_acpiid); -DECLARE_EARLY_PER_CPU_READ_MOSTLY(u16, x86_bios_cpu_apicid); -#if defined(CONFIG_X86_LOCAL_APIC) && defined(CONFIG_X86_32) -DECLARE_EARLY_PER_CPU_READ_MOSTLY(int, x86_cpu_to_logical_apicid); -#endif struct task_struct; @@ -49,10 +30,13 @@ struct smp_ops { void (*crash_stop_other_cpus)(void); void (*smp_send_reschedule)(int cpu); - int (*cpu_up)(unsigned cpu, struct task_struct *tidle); + void (*cleanup_dead_cpu)(unsigned cpu); + void (*poll_sync_state)(void); + int (*kick_ap_alive)(unsigned cpu, struct task_struct *tidle); int (*cpu_disable)(void); 
void (*cpu_die)(unsigned int cpu); void (*play_dead)(void); + void (*stop_this_cpu)(void); void (*send_call_func_ipi)(const struct cpumask *mask); void (*send_call_func_single_ipi)(int cpu); @@ -74,11 +58,6 @@ static inline void stop_other_cpus(void) smp_ops.stop_other_cpus(1); } -static inline void smp_prepare_boot_cpu(void) -{ - smp_ops.smp_prepare_boot_cpu(); -} - static inline void smp_prepare_cpus(unsigned int max_cpus) { smp_ops.smp_prepare_cpus(max_cpus); @@ -89,11 +68,6 @@ static inline void smp_cpus_done(unsigned int max_cpus) smp_ops.smp_cpus_done(max_cpus); } -static inline int __cpu_up(unsigned int cpu, struct task_struct *tidle) -{ - return smp_ops.cpu_up(cpu, tidle); -} - static inline int __cpu_disable(void) { return smp_ops.cpu_disable(); @@ -101,15 +75,17 @@ static inline int __cpu_disable(void) static inline void __cpu_die(unsigned int cpu) { - smp_ops.cpu_die(cpu); + if (smp_ops.cpu_die) + smp_ops.cpu_die(cpu); } -static inline void play_dead(void) +static inline void __noreturn play_dead(void) { smp_ops.play_dead(); + BUG(); } -static inline void smp_send_reschedule(int cpu) +static inline void arch_smp_send_reschedule(int cpu) { smp_ops.smp_send_reschedule(cpu); } @@ -128,27 +104,22 @@ void cpu_disable_common(void); void native_smp_prepare_boot_cpu(void); void smp_prepare_cpus_common(void); void native_smp_prepare_cpus(unsigned int max_cpus); -void calculate_max_logical_packages(void); void native_smp_cpus_done(unsigned int max_cpus); int common_cpu_up(unsigned int cpunum, struct task_struct *tidle); -int native_cpu_up(unsigned int cpunum, struct task_struct *tidle); +int native_kick_ap(unsigned int cpu, struct task_struct *tidle); int native_cpu_disable(void); -int common_cpu_die(unsigned int cpu); -void native_cpu_die(unsigned int cpu); -void hlt_play_dead(void); +void __noreturn hlt_play_dead(void); void native_play_dead(void); void play_dead_common(void); void wbinvd_on_cpu(int cpu); int wbinvd_on_all_cpus(void); -void 
cond_wakeup_cpu0(void); + +void smp_kick_mwait_play_dead(void); +void __noreturn mwait_play_dead(unsigned int eax_hint); void native_smp_send_reschedule(int cpu); void native_send_call_func_ipi(const struct cpumask *mask); void native_send_call_func_single_ipi(int cpu); -void x86_idle_thread_init(unsigned int cpu, struct task_struct *idle); - -void smp_store_boot_cpu_info(void); -void smp_store_cpu_info(int id); asmlinkage __visible void smp_reboot_interrupt(void); __visible void smp_reschedule_interrupt(struct pt_regs *regs); @@ -160,17 +131,20 @@ __visible void smp_call_function_single_interrupt(struct pt_regs *r); /* * This function is needed by all SMP systems. It must _always_ be valid - * from the initial startup. We map APIC_BASE very early in page_setup(), - * so this is correct in the x86 case. + * from the initial startup. */ #define raw_smp_processor_id() this_cpu_read(cpu_number) #define __smp_processor_id() __this_cpu_read(cpu_number) -#ifdef CONFIG_X86_32 -extern int safe_smp_processor_id(void); -#else -# define safe_smp_processor_id() smp_processor_id() -#endif +static inline struct cpumask *cpu_llc_shared_mask(int cpu) +{ + return per_cpu(cpu_llc_shared_map, cpu); +} + +static inline struct cpumask *cpu_l2c_shared_mask(int cpu) +{ + return per_cpu(cpu_l2c_shared_map, cpu); +} #else /* !CONFIG_SMP */ #define wbinvd_on_cpu(cpu) wbinvd() @@ -179,16 +153,14 @@ static inline int wbinvd_on_all_cpus(void) wbinvd(); return 0; } -#endif /* CONFIG_SMP */ - -extern unsigned disabled_cpus; -#ifdef CONFIG_X86_LOCAL_APIC -extern int hard_smp_processor_id(void); +static inline struct cpumask *cpu_llc_shared_mask(int cpu) +{ + return (struct cpumask *)cpumask_of(0); +} -#else /* CONFIG_X86_LOCAL_APIC */ -#define hard_smp_processor_id() 0 -#endif /* CONFIG_X86_LOCAL_APIC */ +static inline void __noreturn mwait_play_dead(unsigned int eax_hint) { BUG(); } +#endif /* CONFIG_SMP */ #ifdef CONFIG_DEBUG_NMI_SELFTEST extern void nmi_selftest(void); @@ -196,5 +168,15 @@ 
extern void nmi_selftest(void); #define nmi_selftest() do { } while (0) #endif -#endif /* __ASSEMBLY__ */ +extern unsigned int smpboot_control; +extern unsigned long apic_mmio_base; + +#endif /* !__ASSEMBLER__ */ + +/* Control bits for startup_64 */ +#define STARTUP_READ_APICID 0x80000000 + +/* Top 8 bits are reserved for control */ +#define STARTUP_PARALLEL_MASK 0xFF000000 + #endif /* _ASM_X86_SMP_H */ diff --git a/arch/x86/include/asm/sparsemem.h b/arch/x86/include/asm/sparsemem.h index 6a9ccc1b2be5..3918c7a434f5 100644 --- a/arch/x86/include/asm/sparsemem.h +++ b/arch/x86/include/asm/sparsemem.h @@ -2,6 +2,8 @@ #ifndef _ASM_X86_SPARSEMEM_H #define _ASM_X86_SPARSEMEM_H +#include <linux/types.h> + #ifdef CONFIG_SPARSEMEM /* * generic non-linear memory support: @@ -29,13 +31,4 @@ #endif /* CONFIG_SPARSEMEM */ -#ifndef __ASSEMBLY__ -#ifdef CONFIG_NUMA_KEEP_MEMINFO -extern int phys_to_target_node(phys_addr_t start); -#define phys_to_target_node phys_to_target_node -extern int memory_add_physaddr_to_nid(u64 start); -#define memory_add_physaddr_to_nid memory_add_physaddr_to_nid -#endif -#endif /* __ASSEMBLY__ */ - #endif /* _ASM_X86_SPARSEMEM_H */ diff --git a/arch/x86/include/asm/spec-ctrl.h b/arch/x86/include/asm/spec-ctrl.h index 5393babc0598..00b7e0398210 100644 --- a/arch/x86/include/asm/spec-ctrl.h +++ b/arch/x86/include/asm/spec-ctrl.h @@ -4,6 +4,7 @@ #include <linux/thread_info.h> #include <asm/nospec-branch.h> +#include <asm/msr.h> /* * On VMENTER we must preserve whatever view of the SPEC_CTRL MSR @@ -13,7 +14,7 @@ * Takes the guest view of SPEC_CTRL MSR as a parameter and also * the guest's version of VIRT_SPEC_CTRL, if emulated. 
*/ -extern void x86_virt_spec_ctrl(u64 guest_spec_ctrl, u64 guest_virt_spec_ctrl, bool guest); +extern void x86_virt_spec_ctrl(u64 guest_virt_spec_ctrl, bool guest); /** * x86_spec_ctrl_set_guest - Set speculation control registers for the guest @@ -24,9 +25,9 @@ extern void x86_virt_spec_ctrl(u64 guest_spec_ctrl, u64 guest_virt_spec_ctrl, bo * Avoids writing to the MSR if the content/bits are the same */ static inline -void x86_spec_ctrl_set_guest(u64 guest_spec_ctrl, u64 guest_virt_spec_ctrl) +void x86_spec_ctrl_set_guest(u64 guest_virt_spec_ctrl) { - x86_virt_spec_ctrl(guest_spec_ctrl, guest_virt_spec_ctrl, true); + x86_virt_spec_ctrl(guest_virt_spec_ctrl, true); } /** @@ -38,9 +39,9 @@ void x86_spec_ctrl_set_guest(u64 guest_spec_ctrl, u64 guest_virt_spec_ctrl) * Avoids writing to the MSR if the content/bits are the same */ static inline -void x86_spec_ctrl_restore_host(u64 guest_spec_ctrl, u64 guest_virt_spec_ctrl) +void x86_spec_ctrl_restore_host(u64 guest_virt_spec_ctrl) { - x86_virt_spec_ctrl(guest_spec_ctrl, guest_virt_spec_ctrl, false); + x86_virt_spec_ctrl(guest_virt_spec_ctrl, false); } /* AMD specific Speculative Store Bypass MSR data */ @@ -76,6 +77,16 @@ static inline u64 ssbd_tif_to_amd_ls_cfg(u64 tifn) return (tifn & _TIF_SSBD) ? x86_amd_ls_cfg_ssbd_mask : 0ULL; } +/* + * This can be used in noinstr functions & should only be called in bare + * metal context. 
+ */ +static __always_inline void __update_spec_ctrl(u64 val) +{ + __this_cpu_write(x86_spec_ctrl_current, val); + native_wrmsrq(MSR_IA32_SPEC_CTRL, val); +} + #ifdef CONFIG_SMP extern void speculative_store_bypass_ht_init(void); #else @@ -85,4 +96,6 @@ static inline void speculative_store_bypass_ht_init(void) { } extern void speculation_ctrl_update(unsigned long tif); extern void speculation_ctrl_update_current(void); +extern bool itlb_multihit_kvm_mitigation; + #endif diff --git a/arch/x86/include/asm/special_insns.h b/arch/x86/include/asm/special_insns.h index 35f709f619fb..ecda17efa042 100644 --- a/arch/x86/include/asm/special_insns.h +++ b/arch/x86/include/asm/special_insns.h @@ -2,38 +2,27 @@ #ifndef _ASM_X86_SPECIAL_INSNS_H #define _ASM_X86_SPECIAL_INSNS_H - #ifdef __KERNEL__ - #include <asm/nops.h> #include <asm/processor-flags.h> + +#include <linux/errno.h> #include <linux/irqflags.h> #include <linux/jump_label.h> -/* - * The compiler should not reorder volatile asm statements with respect to each - * other: they should execute in program order. However GCC 4.9.x and 5.x have - * a bug (which was fixed in 8.1, 7.3 and 6.5) where they might reorder - * volatile asm. The write functions are not affected since they have memory - * clobbers preventing reordering. To prevent reads from being reordered with - * respect to writes, use a dummy memory operand. 
- */ - -#define __FORCE_ORDER "m"(*(unsigned int *)0x1000UL) - void native_write_cr0(unsigned long val); static inline unsigned long native_read_cr0(void) { unsigned long val; - asm volatile("mov %%cr0,%0\n\t" : "=r" (val) : __FORCE_ORDER); + asm volatile("mov %%cr0,%0" : "=r" (val)); return val; } static __always_inline unsigned long native_read_cr2(void) { unsigned long val; - asm volatile("mov %%cr2,%0\n\t" : "=r" (val) : __FORCE_ORDER); + asm volatile("mov %%cr2,%0" : "=r" (val)); return val; } @@ -42,14 +31,14 @@ static __always_inline void native_write_cr2(unsigned long val) asm volatile("mov %0,%%cr2": : "r" (val) : "memory"); } -static inline unsigned long __native_read_cr3(void) +static __always_inline unsigned long __native_read_cr3(void) { unsigned long val; - asm volatile("mov %%cr3,%0\n\t" : "=r" (val) : __FORCE_ORDER); + asm volatile("mov %%cr3,%0" : "=r" (val)); return val; } -static inline void native_write_cr3(unsigned long val) +static __always_inline void native_write_cr3(unsigned long val) { asm volatile("mov %0,%%cr3": : "r" (val) : "memory"); } @@ -66,10 +55,10 @@ static inline unsigned long native_read_cr4(void) asm volatile("1: mov %%cr4, %0\n" "2:\n" _ASM_EXTABLE(1b, 2b) - : "=r" (val) : "0" (0), __FORCE_ORDER); + : "=r" (val) : "0" (0)); #else /* CR4 always exists on x86_64. 
*/ - asm volatile("mov %%cr4,%0\n\t" : "=r" (val) : __FORCE_ORDER); + asm volatile("mov %%cr4,%0" : "=r" (val)); #endif return val; } @@ -115,22 +104,11 @@ static inline void wrpkru(u32 pkru) } #endif -static inline void native_wbinvd(void) +static __always_inline void wbinvd(void) { asm volatile("wbinvd": : :"memory"); } -extern asmlinkage void asm_load_gs_index(unsigned int selector); - -static inline void native_load_gs_index(unsigned int selector) -{ - unsigned long flags; - - local_irq_save(flags); - asm_load_gs_index(selector); - local_irq_restore(flags); -} - static inline unsigned long __read_cr4(void) { return native_read_cr4(); @@ -178,34 +156,17 @@ static inline void __write_cr4(unsigned long x) { native_write_cr4(x); } - -static inline void wbinvd(void) -{ - native_wbinvd(); -} - - -static inline void load_gs_index(unsigned int selector) -{ -#ifdef CONFIG_X86_64 - native_load_gs_index(selector); -#else - loadsegment(gs, selector); -#endif -} - #endif /* CONFIG_PARAVIRT_XXL */ -static inline void clflush(volatile void *__p) +static __always_inline void clflush(volatile void *__p) { asm volatile("clflush %0" : "+m" (*(volatile char __force *)__p)); } static inline void clflushopt(volatile void *__p) { - alternative_io(".byte 0x3e; clflush %P0", - ".byte 0x66; clflush %P0", - X86_FEATURE_CLFLUSHOPT, + alternative_io("ds clflush %0", + "clflushopt %0", X86_FEATURE_CLFLUSHOPT, "+m" (*(volatile char __force *)__p)); } @@ -213,29 +174,39 @@ static inline void clwb(volatile void *__p) { volatile struct { char x[64]; } *p = __p; - asm volatile(ALTERNATIVE_2( - ".byte 0x3e; clflush (%[pax])", - ".byte 0x66; clflush (%[pax])", /* clflushopt (%%rax) */ - X86_FEATURE_CLFLUSHOPT, - ".byte 0x66, 0x0f, 0xae, 0x30", /* clwb (%%rax) */ - X86_FEATURE_CLWB) - : [p] "+m" (*p) - : [pax] "a" (p)); + asm_inline volatile(ALTERNATIVE_2( + "ds clflush %0", + "clflushopt %0", X86_FEATURE_CLFLUSHOPT, + "clwb %0", X86_FEATURE_CLWB) + : "+m" (*p)); } +#ifdef 
CONFIG_X86_USER_SHADOW_STACK +static inline int write_user_shstk_64(u64 __user *addr, u64 val) +{ + asm goto("1: wrussq %[val], %[addr]\n" + _ASM_EXTABLE(1b, %l[fail]) + :: [addr] "m" (*addr), [val] "r" (val) + :: fail); + return 0; +fail: + return -EFAULT; +} +#endif /* CONFIG_X86_USER_SHADOW_STACK */ + #define nop() asm volatile ("nop") -static inline void serialize(void) +static __always_inline void serialize(void) { /* Instruction opcode for SERIALIZE; supported in binutils >= 2.35. */ asm volatile(".byte 0xf, 0x1, 0xe8" ::: "memory"); } /* The dst parameter must be 64-bytes aligned */ -static inline void movdir64b(void __iomem *dst, const void *src) +static inline void movdir64b(void *dst, const void *src) { const struct { char _[64]; } *__src = src; - struct { char _[64]; } __iomem *__dst = dst; + struct { char _[64]; } *__dst = dst; /* * MOVDIR64B %(rdx), rax. @@ -253,6 +224,11 @@ static inline void movdir64b(void __iomem *dst, const void *src) : "m" (*__src), "a" (__dst), "d" (__src)); } +static inline void movdir64b_io(void __iomem *dst, const void *src) +{ + movdir64b((void __force *)dst, src); +} + /** * enqcmds - Enqueue a command in supervisor (CPL0) mode * @dst: destination, in MMIO space (must be 512-bit aligned) @@ -295,7 +271,7 @@ static inline int enqcmds(void __iomem *dst, const void *src) return 0; } -static inline void tile_release(void) +static __always_inline void tile_release(void) { /* * Instruction opcode for TILERELEASE; supported in binutils diff --git a/arch/x86/include/asm/sta2x11.h b/arch/x86/include/asm/sta2x11.h deleted file mode 100644 index e0975e9c4f47..000000000000 --- a/arch/x86/include/asm/sta2x11.h +++ /dev/null @@ -1,13 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -/* - * Header file for STMicroelectronics ConneXt (STA2X11) IOHub - */ -#ifndef __ASM_STA2X11_H -#define __ASM_STA2X11_H - -#include <linux/pci.h> - -/* This needs to be called from the MFD to configure its sub-devices */ -struct sta2x11_instance 
*sta2x11_get_instance(struct pci_dev *pdev); - -#endif /* __ASM_STA2X11_H */ diff --git a/arch/x86/include/asm/stackprotector.h b/arch/x86/include/asm/stackprotector.h index 24a8d6c4fb18..cd761b14eb02 100644 --- a/arch/x86/include/asm/stackprotector.h +++ b/arch/x86/include/asm/stackprotector.h @@ -2,26 +2,10 @@ /* * GCC stack protector support. * - * Stack protector works by putting predefined pattern at the start of + * Stack protector works by putting a predefined pattern at the start of * the stack frame and verifying that it hasn't been overwritten when - * returning from the function. The pattern is called stack canary - * and unfortunately gcc historically required it to be at a fixed offset - * from the percpu segment base. On x86_64, the offset is 40 bytes. - * - * The same segment is shared by percpu area and stack canary. On - * x86_64, percpu symbols are zero based and %gs (64-bit) points to the - * base of percpu area. The first occupant of the percpu area is always - * fixed_percpu_data which contains stack_canary at the appropriate - * offset. On x86_32, the stack canary is just a regular percpu - * variable. - * - * Putting percpu data in %fs on 32-bit is a minor optimization compared to - * using %gs. Since 32-bit userspace normally has %fs == 0, we are likely - * to load 0 into %fs on exit to usermode, whereas with percpu data in - * %gs, we are likely to load a non-null %gs on return to user mode. - * - * Once we are willing to require GCC 8.1 or better for 64-bit stackprotector - * support, we can remove some of this complexity. + * returning from the function. The pattern is called the stack canary + * and is a unique value for each task. */ #ifndef _ASM_STACKPROTECTOR_H @@ -34,9 +18,10 @@ #include <asm/percpu.h> #include <asm/desc.h> -#include <linux/random.h> #include <linux/sched.h> +DECLARE_PER_CPU_CACHE_HOT(unsigned long, __stack_chk_guard); + /* * Initialize the stackprotector canary value. 
* @@ -50,38 +35,15 @@ */ static __always_inline void boot_init_stack_canary(void) { - u64 canary; - u64 tsc; - -#ifdef CONFIG_X86_64 - BUILD_BUG_ON(offsetof(struct fixed_percpu_data, stack_canary) != 40); -#endif - /* - * We both use the random pool and the current TSC as a source - * of randomness. The TSC only matters for very early init, - * there it already has some randomness on most systems. Later - * on during the bootup the random pool has true entropy too. - */ - get_random_bytes(&canary, sizeof(canary)); - tsc = rdtsc(); - canary += tsc + (tsc << 32UL); - canary &= CANARY_MASK; + unsigned long canary = get_random_canary(); current->stack_canary = canary; -#ifdef CONFIG_X86_64 - this_cpu_write(fixed_percpu_data.stack_canary, canary); -#else this_cpu_write(__stack_chk_guard, canary); -#endif } static inline void cpu_init_stack_canary(int cpu, struct task_struct *idle) { -#ifdef CONFIG_X86_64 - per_cpu(fixed_percpu_data.stack_canary, cpu) = idle->stack_canary; -#else per_cpu(__stack_chk_guard, cpu) = idle->stack_canary; -#endif } #else /* STACKPROTECTOR */ diff --git a/arch/x86/include/asm/static_call.h b/arch/x86/include/asm/static_call.h index 343b722ccaf2..41502bd2afd6 100644 --- a/arch/x86/include/asm/static_call.h +++ b/arch/x86/include/asm/static_call.h @@ -46,7 +46,7 @@ #define ARCH_DEFINE_STATIC_CALL_TRAMP(name, func) \ __ARCH_DEFINE_STATIC_CALL_TRAMP(name, ".byte 0xe9; .long " #func " - (. 
+ 4)") -#ifdef CONFIG_RETHUNK +#ifdef CONFIG_MITIGATION_RETHUNK #define ARCH_DEFINE_STATIC_CALL_NULL_TRAMP(name) \ __ARCH_DEFINE_STATIC_CALL_TRAMP(name, "jmp __x86_return_thunk") #else @@ -65,4 +65,19 @@ extern bool __static_call_fixup(void *tramp, u8 op, void *dest); +extern void __static_call_update_early(void *tramp, void *func); + +#define static_call_update_early(name, _func) \ +({ \ + typeof(&STATIC_CALL_TRAMP(name)) __F = (_func); \ + if (static_call_initialized) { \ + __static_call_update(&STATIC_CALL_KEY(name), \ + STATIC_CALL_TRAMP_ADDR(name), __F);\ + } else { \ + WRITE_ONCE(STATIC_CALL_KEY(name).func, _func); \ + __static_call_update_early(STATIC_CALL_TRAMP_ADDR(name),\ + __F); \ + } \ +}) + #endif /* _ASM_STATIC_CALL_H */ diff --git a/arch/x86/include/asm/string_32.h b/arch/x86/include/asm/string_32.h index 32c0d981a82a..e9cce169bb4c 100644 --- a/arch/x86/include/asm/string_32.h +++ b/arch/x86/include/asm/string_32.h @@ -33,11 +33,11 @@ extern size_t strlen(const char *s); static __always_inline void *__memcpy(void *to, const void *from, size_t n) { int d0, d1, d2; - asm volatile("rep ; movsl\n\t" + asm volatile("rep movsl\n\t" "movl %4,%%ecx\n\t" "andl $3,%%ecx\n\t" "jz 1f\n\t" - "rep ; movsb\n\t" + "rep movsb\n\t" "1:" : "=&c" (d0), "=&D" (d1), "=&S" (d2) : "0" (n / 4), "g" (n), "1" ((long)to), "2" ((long)from) @@ -89,7 +89,7 @@ static __always_inline void *__constant_memcpy(void *to, const void *from, if (n >= 5 * 4) { /* large block: use rep prefix */ int ecx; - asm volatile("rep ; movsl" + asm volatile("rep movsl" : "=&c" (ecx), "=&D" (edi), "=&S" (esi) : "0" (n / 4), "1" (edi), "2" (esi) : "memory" @@ -165,8 +165,7 @@ extern void *memchr(const void *cs, int c, size_t count); static inline void *__memset_generic(void *s, char c, size_t count) { int d0, d1; - asm volatile("rep\n\t" - "stosb" + asm volatile("rep stosb" : "=&c" (d0), "=&D" (d1) : "a" (c), "1" (s), "0" (count) : "memory"); @@ -199,8 +198,7 @@ extern void *memset(void *, int, size_t); 
static inline void *memset16(uint16_t *s, uint16_t v, size_t n) { int d0, d1; - asm volatile("rep\n\t" - "stosw" + asm volatile("rep stosw" : "=&c" (d0), "=&D" (d1) : "a" (v), "1" (s), "0" (n) : "memory"); @@ -211,8 +209,7 @@ static inline void *memset16(uint16_t *s, uint16_t v, size_t n) static inline void *memset32(uint32_t *s, uint32_t v, size_t n) { int d0, d1; - asm volatile("rep\n\t" - "stosl" + asm volatile("rep stosl" : "=&c" (d0), "=&D" (d1) : "a" (v), "1" (s), "0" (n) : "memory"); diff --git a/arch/x86/include/asm/string_64.h b/arch/x86/include/asm/string_64.h index 6e450827f677..79e9695dc13e 100644 --- a/arch/x86/include/asm/string_64.h +++ b/arch/x86/include/asm/string_64.h @@ -10,6 +10,10 @@ /* Even with __builtin_ the compiler may decide to use the out of line function. */ +#if defined(__SANITIZE_MEMORY__) && defined(__NO_FORTIFY) +#include <linux/kmsan_string.h> +#endif + #define __HAVE_ARCH_MEMCPY 1 extern void *memcpy(void *to, const void *from, size_t len); extern void *__memcpy(void *to, const void *from, size_t len); @@ -17,46 +21,57 @@ extern void *__memcpy(void *to, const void *from, size_t len); #define __HAVE_ARCH_MEMSET void *memset(void *s, int c, size_t n); void *__memset(void *s, int c, size_t n); +KCFI_REFERENCE(__memset); +/* + * KMSAN needs to instrument as much code as possible. Use C versions of + * memsetXX() from lib/string.c under KMSAN. 
+ */ +#if !defined(CONFIG_KMSAN) #define __HAVE_ARCH_MEMSET16 static inline void *memset16(uint16_t *s, uint16_t v, size_t n) { - long d0, d1; - asm volatile("rep\n\t" - "stosw" - : "=&c" (d0), "=&D" (d1) - : "a" (v), "1" (s), "0" (n) - : "memory"); - return s; + const __auto_type s0 = s; + asm volatile ( + "rep stosw" + : "+D" (s), "+c" (n) + : "a" (v) + : "memory" + ); + return s0; } #define __HAVE_ARCH_MEMSET32 static inline void *memset32(uint32_t *s, uint32_t v, size_t n) { - long d0, d1; - asm volatile("rep\n\t" - "stosl" - : "=&c" (d0), "=&D" (d1) - : "a" (v), "1" (s), "0" (n) - : "memory"); - return s; + const __auto_type s0 = s; + asm volatile ( + "rep stosl" + : "+D" (s), "+c" (n) + : "a" (v) + : "memory" + ); + return s0; } #define __HAVE_ARCH_MEMSET64 static inline void *memset64(uint64_t *s, uint64_t v, size_t n) { - long d0, d1; - asm volatile("rep\n\t" - "stosq" - : "=&c" (d0), "=&D" (d1) - : "a" (v), "1" (s), "0" (n) - : "memory"); - return s; + const __auto_type s0 = s; + asm volatile ( + "rep stosq" + : "+D" (s), "+c" (n) + : "a" (v) + : "memory" + ); + return s0; } +#endif #define __HAVE_ARCH_MEMMOVE void *memmove(void *dest, const void *src, size_t count); void *__memmove(void *dest, const void *src, size_t count); +KCFI_REFERENCE(__memmove); int memcmp(const void *cs, const void *ct, size_t count); size_t strlen(const char *s); @@ -64,24 +79,6 @@ char *strcpy(char *dest, const char *src); char *strcat(char *dest, const char *src); int strcmp(const char *cs, const char *ct); -#if defined(CONFIG_KASAN) && !defined(__SANITIZE_ADDRESS__) - -/* - * For files that not instrumented (e.g. mm/slub.c) we - * should use not instrumented version of mem* functions. - */ - -#undef memcpy -#define memcpy(dst, src, len) __memcpy(dst, src, len) -#define memmove(dst, src, len) __memmove(dst, src, len) -#define memset(s, c, n) __memset(s, c, n) - -#ifndef __NO_FORTIFY -#define __NO_FORTIFY /* FORTIFY_SOURCE uses __builtin_memcpy, etc. 
*/ -#endif - -#endif - #ifdef CONFIG_ARCH_HAS_UACCESS_FLUSHCACHE #define __HAVE_ARCH_MEMCPY_FLUSHCACHE 1 void __memcpy_flushcache(void *dst, const void *src, size_t cnt); diff --git a/arch/x86/include/asm/suspend_32.h b/arch/x86/include/asm/suspend_32.h index a800abb1a992..e8e5aab06255 100644 --- a/arch/x86/include/asm/suspend_32.h +++ b/arch/x86/include/asm/suspend_32.h @@ -9,14 +9,10 @@ #include <asm/desc.h> #include <asm/fpu/api.h> +#include <asm/msr.h> /* image of the saved processor state */ struct saved_context { - /* - * On x86_32, all segment registers except gs are saved at kernel - * entry in pt_regs. - */ - u16 gs; unsigned long cr0, cr2, cr3, cr4; u64 misc_enable; struct saved_msrs saved_msrs; @@ -27,6 +23,11 @@ struct saved_context { unsigned long tr; unsigned long safety; unsigned long return_address; + /* + * On x86_32, all segment registers except gs are saved at kernel + * entry in pt_regs. + */ + u16 gs; bool misc_enable_saved; } __attribute__((packed)); diff --git a/arch/x86/include/asm/suspend_64.h b/arch/x86/include/asm/suspend_64.h index 54df06687d83..b512f9665f78 100644 --- a/arch/x86/include/asm/suspend_64.h +++ b/arch/x86/include/asm/suspend_64.h @@ -9,6 +9,7 @@ #include <asm/desc.h> #include <asm/fpu/api.h> +#include <asm/msr.h> /* * Image of the saved processor state, used by the low level ACPI suspend to diff --git a/arch/x86/include/asm/svm.h b/arch/x86/include/asm/svm.h index 1b07fba11704..ad954a1a6656 100644 --- a/arch/x86/include/asm/svm.h +++ b/arch/x86/include/asm/svm.h @@ -5,6 +5,8 @@ #include <uapi/asm/svm.h> #include <uapi/asm/kvm.h> +#include <hyperv/hvhdk.h> + /* * 32-bit intercept words in the VMCB Control Area, starting * at Byte offset 000h. 
@@ -114,6 +116,8 @@ enum { INTERCEPT_INVPCID, INTERCEPT_MCOMMIT, INTERCEPT_TLBSYNC, + INTERCEPT_BUSLOCK, + INTERCEPT_IDLE_HLT = 166, }; @@ -156,12 +160,20 @@ struct __attribute__ ((__packed__)) vmcb_control_area { u64 avic_physical_id; /* Offset 0xf8 */ u8 reserved_7[8]; u64 vmsa_pa; /* Used for an SEV-ES guest */ - u8 reserved_8[720]; + u8 reserved_8[16]; + u16 bus_lock_counter; /* Offset 0x120 */ + u8 reserved_9[22]; + u64 allowed_sev_features; /* Offset 0x138 */ + u64 guest_sev_features; /* Offset 0x140 */ + u8 reserved_10[664]; /* * Offset 0x3e0, 32 bytes reserved * for use by hypervisor/software. */ - u8 reserved_sw[32]; + union { + struct hv_vmcb_enlightenments hv_enlightenments; + u8 reserved_sw[32]; + }; }; @@ -178,6 +190,12 @@ struct __attribute__ ((__packed__)) vmcb_control_area { #define V_GIF_SHIFT 9 #define V_GIF_MASK (1 << V_GIF_SHIFT) +#define V_NMI_PENDING_SHIFT 11 +#define V_NMI_PENDING_MASK (1 << V_NMI_PENDING_SHIFT) + +#define V_NMI_BLOCKING_SHIFT 12 +#define V_NMI_BLOCKING_MASK (1 << V_NMI_BLOCKING_SHIFT) + #define V_INTR_PRIO_SHIFT 16 #define V_INTR_PRIO_MASK (0x0f << V_INTR_PRIO_SHIFT) @@ -192,9 +210,15 @@ struct __attribute__ ((__packed__)) vmcb_control_area { #define V_GIF_ENABLE_SHIFT 25 #define V_GIF_ENABLE_MASK (1 << V_GIF_ENABLE_SHIFT) +#define V_NMI_ENABLE_SHIFT 26 +#define V_NMI_ENABLE_MASK (1 << V_NMI_ENABLE_SHIFT) + #define AVIC_ENABLE_SHIFT 31 #define AVIC_ENABLE_MASK (1 << AVIC_ENABLE_SHIFT) +#define X2APIC_MODE_SHIFT 30 +#define X2APIC_MODE_MASK (1 << X2APIC_MODE_SHIFT) + #define LBR_CTL_ENABLE_MASK BIT_ULL(0) #define VIRTUAL_VMLOAD_VMSAVE_ENABLE_MASK BIT_ULL(1) @@ -212,10 +236,6 @@ struct __attribute__ ((__packed__)) vmcb_control_area { #define SVM_IOIO_SIZE_MASK (7 << SVM_IOIO_SIZE_SHIFT) #define SVM_IOIO_ASIZE_MASK (7 << SVM_IOIO_ASIZE_SHIFT) -#define SVM_VM_CR_VALID_MASK 0x001fULL -#define SVM_VM_CR_SVM_LOCK_MASK 0x0008ULL -#define SVM_VM_CR_SVM_DIS_MASK 0x0010ULL - #define SVM_NESTED_CTL_NP_ENABLE BIT(0) #define 
SVM_NESTED_CTL_SEV_ENABLE BIT(1) #define SVM_NESTED_CTL_SEV_ES_ENABLE BIT(2) @@ -251,18 +271,33 @@ enum avic_ipi_failure_cause { AVIC_IPI_FAILURE_TARGET_NOT_RUNNING, AVIC_IPI_FAILURE_INVALID_TARGET, AVIC_IPI_FAILURE_INVALID_BACKING_PAGE, + AVIC_IPI_FAILURE_INVALID_IPI_VECTOR, }; +#define AVIC_PHYSICAL_MAX_INDEX_MASK GENMASK_ULL(8, 0) /* - * 0xff is broadcast, so the max index allowed for physical APIC ID - * table is 0xfe. APIC IDs above 0xff are reserved. + * For AVIC, the max index allowed for physical APIC ID table is 0xfe (254), as + * 0xff is a broadcast to all CPUs, i.e. can't be targeted individually. */ -#define AVIC_MAX_PHYSICAL_ID_COUNT 0xff +#define AVIC_MAX_PHYSICAL_ID 0XFEULL + +/* + * For x2AVIC, the max index allowed for physical APIC ID table is 0x1ff (511). + */ +#define X2AVIC_MAX_PHYSICAL_ID 0x1FFUL + +static_assert((AVIC_MAX_PHYSICAL_ID & AVIC_PHYSICAL_MAX_INDEX_MASK) == AVIC_MAX_PHYSICAL_ID); +static_assert((X2AVIC_MAX_PHYSICAL_ID & AVIC_PHYSICAL_MAX_INDEX_MASK) == X2AVIC_MAX_PHYSICAL_ID); #define AVIC_HPA_MASK ~((0xFFFULL << 52) | 0xFFF) -#define VMCB_AVIC_APIC_BAR_MASK 0xFFFFFFFFFF000ULL +#define SVM_SEV_FEAT_SNP_ACTIVE BIT(0) +#define SVM_SEV_FEAT_RESTRICTED_INJECTION BIT(3) +#define SVM_SEV_FEAT_ALTERNATE_INJECTION BIT(4) +#define SVM_SEV_FEAT_DEBUG_SWAP BIT(5) + +#define VMCB_ALLOWED_SEV_FEATURES_VALID BIT_ULL(63) struct vmcb_seg { u16 selector; @@ -283,12 +318,13 @@ struct vmcb_save_area { struct vmcb_seg ldtr; struct vmcb_seg idtr; struct vmcb_seg tr; - u8 reserved_1[42]; + /* Reserved fields are named following their struct offset */ + u8 reserved_0xa0[42]; u8 vmpl; u8 cpl; - u8 reserved_2[4]; + u8 reserved_0xcc[4]; u64 efer; - u8 reserved_3[112]; + u8 reserved_0xd8[112]; u64 cr4; u64 cr3; u64 cr0; @@ -296,7 +332,7 @@ struct vmcb_save_area { u64 dr6; u64 rflags; u64 rip; - u8 reserved_4[88]; + u8 reserved_0x180[88]; u64 rsp; u64 s_cet; u64 ssp; @@ -311,15 +347,15 @@ struct vmcb_save_area { u64 sysenter_esp; u64 sysenter_eip; u64 cr2; - 
u8 reserved_5[32]; + u8 reserved_0x248[32]; u64 g_pat; u64 dbgctl; u64 br_from; u64 br_to; u64 last_excp_from; u64 last_excp_to; - u8 reserved_6[72]; - u32 spec_ctrl; /* Guest version of SPEC_CTRL at 0x2E0 */ + u8 reserved_0x298[72]; + u64 spec_ctrl; /* Guest version of SPEC_CTRL at 0x2E0 */ } __packed; /* Save area definition for SEV-ES and SEV-SNP guests */ @@ -334,17 +370,17 @@ struct sev_es_save_area { struct vmcb_seg ldtr; struct vmcb_seg idtr; struct vmcb_seg tr; - u64 vmpl0_ssp; - u64 vmpl1_ssp; - u64 vmpl2_ssp; - u64 vmpl3_ssp; + u64 pl0_ssp; + u64 pl1_ssp; + u64 pl2_ssp; + u64 pl3_ssp; u64 u_cet; - u8 reserved_1[2]; + u8 reserved_0xc8[2]; u8 vmpl; u8 cpl; - u8 reserved_2[4]; + u8 reserved_0xcc[4]; u64 efer; - u8 reserved_3[104]; + u8 reserved_0xd8[104]; u64 xss; u64 cr4; u64 cr3; @@ -361,7 +397,7 @@ struct sev_es_save_area { u64 dr1_addr_mask; u64 dr2_addr_mask; u64 dr3_addr_mask; - u8 reserved_4[24]; + u8 reserved_0x1c0[24]; u64 rsp; u64 s_cet; u64 ssp; @@ -376,21 +412,23 @@ struct sev_es_save_area { u64 sysenter_esp; u64 sysenter_eip; u64 cr2; - u8 reserved_5[32]; + u8 reserved_0x248[32]; u64 g_pat; u64 dbgctl; u64 br_from; u64 br_to; u64 last_excp_from; u64 last_excp_to; - u8 reserved_7[80]; + u8 reserved_0x298[80]; u32 pkru; - u8 reserved_8[20]; - u64 reserved_9; /* rax already available at 0x01f8 */ + u32 tsc_aux; + u64 tsc_scale; + u64 tsc_offset; + u8 reserved_0x300[8]; u64 rcx; u64 rdx; u64 rbx; - u64 reserved_10; /* rsp already available at 0x01d8 */ + u64 reserved_0x320; /* rsp already available at 0x01d8 */ u64 rbp; u64 rsi; u64 rdi; @@ -402,7 +440,7 @@ struct sev_es_save_area { u64 r13; u64 r14; u64 r15; - u8 reserved_11[16]; + u8 reserved_0x380[16]; u64 guest_exit_info_1; u64 guest_exit_info_2; u64 guest_exit_int_info; @@ -415,7 +453,7 @@ struct sev_es_save_area { u64 pcpu_id; u64 event_inj; u64 xcr0; - u8 reserved_12[16]; + u8 reserved_0x3f0[16]; /* Floating point area */ u64 x87_dp; @@ -433,23 +471,23 @@ struct sev_es_save_area { } __packed; 
struct ghcb_save_area { - u8 reserved_1[203]; + u8 reserved_0x0[203]; u8 cpl; - u8 reserved_2[116]; + u8 reserved_0xcc[116]; u64 xss; - u8 reserved_3[24]; + u8 reserved_0x148[24]; u64 dr7; - u8 reserved_4[16]; + u8 reserved_0x168[16]; u64 rip; - u8 reserved_5[88]; + u8 reserved_0x180[88]; u64 rsp; - u8 reserved_6[24]; + u8 reserved_0x1e0[24]; u64 rax; - u8 reserved_7[264]; + u8 reserved_0x200[264]; u64 rcx; u64 rdx; u64 rbx; - u8 reserved_8[8]; + u8 reserved_0x320[8]; u64 rbp; u64 rsi; u64 rdi; @@ -461,12 +499,12 @@ struct ghcb_save_area { u64 r13; u64 r14; u64 r15; - u8 reserved_9[16]; + u8 reserved_0x380[16]; u64 sw_exit_code; u64 sw_exit_info_1; u64 sw_exit_info_2; u64 sw_scratch; - u8 reserved_10[56]; + u8 reserved_0x3b0[56]; u64 xcr0; u8 valid_bitmap[16]; u64 x87_state_gpa; @@ -480,36 +518,80 @@ struct ghcb { u8 shared_buffer[GHCB_SHARED_BUF_SIZE]; - u8 reserved_1[10]; + u8 reserved_0xff0[10]; u16 protocol_version; /* negotiated SEV-ES/GHCB protocol version */ u32 ghcb_usage; } __packed; +struct vmcb { + struct vmcb_control_area control; + union { + struct vmcb_save_area save; + + /* + * For SEV-ES VMs, the save area in the VMCB is used only to + * save/load host state. Guest state resides in a separate + * page, the aptly named VM Save Area (VMSA), that is encrypted + * with the guest's private key. 
+ */ + struct sev_es_save_area host_sev_es_save; + }; +} __packed; -#define EXPECTED_VMCB_SAVE_AREA_SIZE 740 +#define EXPECTED_VMCB_SAVE_AREA_SIZE 744 #define EXPECTED_GHCB_SAVE_AREA_SIZE 1032 #define EXPECTED_SEV_ES_SAVE_AREA_SIZE 1648 #define EXPECTED_VMCB_CONTROL_AREA_SIZE 1024 #define EXPECTED_GHCB_SIZE PAGE_SIZE +#define BUILD_BUG_RESERVED_OFFSET(x, y) \ + ASSERT_STRUCT_OFFSET(struct x, reserved ## _ ## y, y) + static inline void __unused_size_checks(void) { BUILD_BUG_ON(sizeof(struct vmcb_save_area) != EXPECTED_VMCB_SAVE_AREA_SIZE); BUILD_BUG_ON(sizeof(struct ghcb_save_area) != EXPECTED_GHCB_SAVE_AREA_SIZE); BUILD_BUG_ON(sizeof(struct sev_es_save_area) != EXPECTED_SEV_ES_SAVE_AREA_SIZE); BUILD_BUG_ON(sizeof(struct vmcb_control_area) != EXPECTED_VMCB_CONTROL_AREA_SIZE); + BUILD_BUG_ON(offsetof(struct vmcb, save) != EXPECTED_VMCB_CONTROL_AREA_SIZE); BUILD_BUG_ON(sizeof(struct ghcb) != EXPECTED_GHCB_SIZE); -} -struct vmcb { - struct vmcb_control_area control; - struct vmcb_save_area save; -} __packed; + /* Check offsets of reserved fields */ + + BUILD_BUG_RESERVED_OFFSET(vmcb_save_area, 0xa0); + BUILD_BUG_RESERVED_OFFSET(vmcb_save_area, 0xcc); + BUILD_BUG_RESERVED_OFFSET(vmcb_save_area, 0xd8); + BUILD_BUG_RESERVED_OFFSET(vmcb_save_area, 0x180); + BUILD_BUG_RESERVED_OFFSET(vmcb_save_area, 0x248); + BUILD_BUG_RESERVED_OFFSET(vmcb_save_area, 0x298); + + BUILD_BUG_RESERVED_OFFSET(sev_es_save_area, 0xc8); + BUILD_BUG_RESERVED_OFFSET(sev_es_save_area, 0xcc); + BUILD_BUG_RESERVED_OFFSET(sev_es_save_area, 0xd8); + BUILD_BUG_RESERVED_OFFSET(sev_es_save_area, 0x1c0); + BUILD_BUG_RESERVED_OFFSET(sev_es_save_area, 0x248); + BUILD_BUG_RESERVED_OFFSET(sev_es_save_area, 0x298); + BUILD_BUG_RESERVED_OFFSET(sev_es_save_area, 0x300); + BUILD_BUG_RESERVED_OFFSET(sev_es_save_area, 0x320); + BUILD_BUG_RESERVED_OFFSET(sev_es_save_area, 0x380); + BUILD_BUG_RESERVED_OFFSET(sev_es_save_area, 0x3f0); + + BUILD_BUG_RESERVED_OFFSET(ghcb_save_area, 0x0); + 
BUILD_BUG_RESERVED_OFFSET(ghcb_save_area, 0xcc); + BUILD_BUG_RESERVED_OFFSET(ghcb_save_area, 0x148); + BUILD_BUG_RESERVED_OFFSET(ghcb_save_area, 0x168); + BUILD_BUG_RESERVED_OFFSET(ghcb_save_area, 0x180); + BUILD_BUG_RESERVED_OFFSET(ghcb_save_area, 0x1e0); + BUILD_BUG_RESERVED_OFFSET(ghcb_save_area, 0x200); + BUILD_BUG_RESERVED_OFFSET(ghcb_save_area, 0x320); + BUILD_BUG_RESERVED_OFFSET(ghcb_save_area, 0x380); + BUILD_BUG_RESERVED_OFFSET(ghcb_save_area, 0x3b0); + + BUILD_BUG_RESERVED_OFFSET(ghcb, 0xff0); +} #define SVM_CPUID_FUNC 0x8000000a -#define SVM_VM_CR_SVM_DISABLE 4 - #define SVM_SELECTOR_S_SHIFT 4 #define SVM_SELECTOR_DPL_SHIFT 5 #define SVM_SELECTOR_P_SHIFT 7 diff --git a/arch/x86/include/asm/switch_to.h b/arch/x86/include/asm/switch_to.h index c08eb0fdd11f..499b1c15cc8b 100644 --- a/arch/x86/include/asm/switch_to.h +++ b/arch/x86/include/asm/switch_to.h @@ -12,7 +12,9 @@ struct task_struct *__switch_to_asm(struct task_struct *prev, __visible struct task_struct *__switch_to(struct task_struct *prev, struct task_struct *next); -asmlinkage void ret_from_fork(void); +asmlinkage void ret_from_fork_asm(void); +__visible void ret_from_fork(struct task_struct *prev, struct pt_regs *regs, + int (*fn)(void *), void *fn_arg); /* * This is the structure pointed to by thread.sp for an inactive task. 
The @@ -50,6 +52,8 @@ do { \ } while (0) #ifdef CONFIG_X86_32 +#include <asm/msr.h> + static inline void refresh_sysenter_cs(struct thread_struct *thread) { /* Only happens when SEP is enabled, no need to test "SEP"arately: */ @@ -57,7 +61,7 @@ static inline void refresh_sysenter_cs(struct thread_struct *thread) return; this_cpu_write(cpu_tss_rw.x86_tss.ss1, thread->sysenter_cs); - wrmsr(MSR_IA32_SYSENTER_CS, thread->sysenter_cs, 0); + wrmsrq(MSR_IA32_SYSENTER_CS, thread->sysenter_cs); } #endif @@ -66,13 +70,10 @@ static inline void update_task_stack(struct task_struct *task) { /* sp0 always points to the entry trampoline stack, which is constant: */ #ifdef CONFIG_X86_32 - if (static_cpu_has(X86_FEATURE_XENPV)) - load_sp0(task->thread.sp0); - else - this_cpu_write(cpu_tss_rw.x86_tss.sp1, task->thread.sp0); + this_cpu_write(cpu_tss_rw.x86_tss.sp1, task->thread.sp0); #else - /* Xen PV enters the kernel on the thread stack. */ - if (static_cpu_has(X86_FEATURE_XENPV)) + if (!cpu_feature_enabled(X86_FEATURE_FRED) && cpu_feature_enabled(X86_FEATURE_XENPV)) + /* Xen PV enters the kernel on the thread stack. 
*/ load_sp0(task_top_of_stack(task)); #endif } diff --git a/arch/x86/include/asm/sync_bitops.h b/arch/x86/include/asm/sync_bitops.h index 6d8d6bc183b7..cd21a0405ac5 100644 --- a/arch/x86/include/asm/sync_bitops.h +++ b/arch/x86/include/asm/sync_bitops.h @@ -31,7 +31,7 @@ */ static inline void sync_set_bit(long nr, volatile unsigned long *addr) { - asm volatile("lock; " __ASM_SIZE(bts) " %1,%0" + asm volatile("lock " __ASM_SIZE(bts) " %1,%0" : "+m" (ADDR) : "Ir" (nr) : "memory"); @@ -49,7 +49,7 @@ static inline void sync_set_bit(long nr, volatile unsigned long *addr) */ static inline void sync_clear_bit(long nr, volatile unsigned long *addr) { - asm volatile("lock; " __ASM_SIZE(btr) " %1,%0" + asm volatile("lock " __ASM_SIZE(btr) " %1,%0" : "+m" (ADDR) : "Ir" (nr) : "memory"); @@ -66,7 +66,7 @@ static inline void sync_clear_bit(long nr, volatile unsigned long *addr) */ static inline void sync_change_bit(long nr, volatile unsigned long *addr) { - asm volatile("lock; " __ASM_SIZE(btc) " %1,%0" + asm volatile("lock " __ASM_SIZE(btc) " %1,%0" : "+m" (ADDR) : "Ir" (nr) : "memory"); @@ -82,7 +82,7 @@ static inline void sync_change_bit(long nr, volatile unsigned long *addr) */ static inline bool sync_test_and_set_bit(long nr, volatile unsigned long *addr) { - return GEN_BINARY_RMWcc("lock; " __ASM_SIZE(bts), *addr, c, "Ir", nr); + return GEN_BINARY_RMWcc("lock " __ASM_SIZE(bts), *addr, c, "Ir", nr); } /** @@ -95,7 +95,7 @@ static inline bool sync_test_and_set_bit(long nr, volatile unsigned long *addr) */ static inline int sync_test_and_clear_bit(long nr, volatile unsigned long *addr) { - return GEN_BINARY_RMWcc("lock; " __ASM_SIZE(btr), *addr, c, "Ir", nr); + return GEN_BINARY_RMWcc("lock " __ASM_SIZE(btr), *addr, c, "Ir", nr); } /** @@ -108,7 +108,7 @@ static inline int sync_test_and_clear_bit(long nr, volatile unsigned long *addr) */ static inline int sync_test_and_change_bit(long nr, volatile unsigned long *addr) { - return GEN_BINARY_RMWcc("lock; " __ASM_SIZE(btc), 
*addr, c, "Ir", nr); + return GEN_BINARY_RMWcc("lock " __ASM_SIZE(btc), *addr, c, "Ir", nr); } #define sync_test_bit(nr, addr) test_bit(nr, addr) diff --git a/arch/x86/include/asm/sync_core.h b/arch/x86/include/asm/sync_core.h index ab7382f92aff..96bda43538ee 100644 --- a/arch/x86/include/asm/sync_core.h +++ b/arch/x86/include/asm/sync_core.h @@ -8,7 +8,7 @@ #include <asm/special_insns.h> #ifdef CONFIG_X86_32 -static inline void iret_to_self(void) +static __always_inline void iret_to_self(void) { asm volatile ( "pushfl\n\t" @@ -19,7 +19,7 @@ static inline void iret_to_self(void) : ASM_CALL_CONSTRAINT : : "memory"); } #else -static inline void iret_to_self(void) +static __always_inline void iret_to_self(void) { unsigned int tmp; @@ -55,7 +55,7 @@ static inline void iret_to_self(void) * Like all of Linux's memory ordering operations, this is a * compiler barrier as well. */ -static inline void sync_core(void) +static __always_inline void sync_core(void) { /* * The SERIALIZE instruction is the most straightforward way to diff --git a/arch/x86/include/asm/syscall.h b/arch/x86/include/asm/syscall.h index 5b85987a5e97..c10dbb74cd00 100644 --- a/arch/x86/include/asm/syscall.h +++ b/arch/x86/include/asm/syscall.h @@ -16,19 +16,17 @@ #include <asm/thread_info.h> /* for TS_COMPAT */ #include <asm/unistd.h> +/* This is used purely for kernel/trace/trace_syscalls.c */ typedef long (*sys_call_ptr_t)(const struct pt_regs *); extern const sys_call_ptr_t sys_call_table[]; -#if defined(CONFIG_X86_32) -#define ia32_sys_call_table sys_call_table -#else /* * These may not exist, but still put the prototypes in so we * can use IS_ENABLED(). 
*/ -extern const sys_call_ptr_t ia32_sys_call_table[]; -extern const sys_call_ptr_t x32_sys_call_table[]; -#endif +extern long ia32_sys_call(const struct pt_regs *, unsigned int nr); +extern long x32_sys_call(const struct pt_regs *, unsigned int nr); +extern long x64_sys_call(const struct pt_regs *, unsigned int nr); /* * Only the low 32 bits of orig_ax are meaningful, so we return int. @@ -40,6 +38,13 @@ static inline int syscall_get_nr(struct task_struct *task, struct pt_regs *regs) return regs->orig_ax; } +static inline void syscall_set_nr(struct task_struct *task, + struct pt_regs *regs, + int nr) +{ + regs->orig_ax = nr; +} + static inline void syscall_rollback(struct task_struct *task, struct pt_regs *regs) { @@ -84,7 +89,24 @@ static inline void syscall_get_arguments(struct task_struct *task, struct pt_regs *regs, unsigned long *args) { - memcpy(args, &regs->bx, 6 * sizeof(args[0])); + args[0] = regs->bx; + args[1] = regs->cx; + args[2] = regs->dx; + args[3] = regs->si; + args[4] = regs->di; + args[5] = regs->bp; +} + +static inline void syscall_set_arguments(struct task_struct *task, + struct pt_regs *regs, + const unsigned long *args) +{ + regs->bx = args[0]; + regs->cx = args[1]; + regs->dx = args[2]; + regs->si = args[3]; + regs->di = args[4]; + regs->bp = args[5]; } static inline int syscall_get_arch(struct task_struct *task) @@ -118,6 +140,30 @@ static inline void syscall_get_arguments(struct task_struct *task, } } +static inline void syscall_set_arguments(struct task_struct *task, + struct pt_regs *regs, + const unsigned long *args) +{ +# ifdef CONFIG_IA32_EMULATION + if (task->thread_info.status & TS_COMPAT) { + regs->bx = *args++; + regs->cx = *args++; + regs->dx = *args++; + regs->si = *args++; + regs->di = *args++; + regs->bp = *args; + } else +# endif + { + regs->di = *args++; + regs->si = *args++; + regs->dx = *args++; + regs->r10 = *args++; + regs->r8 = *args++; + regs->r9 = *args; + } +} + static inline int syscall_get_arch(struct task_struct 
*task) { /* x32 tasks should be considered AUDIT_ARCH_X86_64. */ @@ -126,10 +172,13 @@ static inline int syscall_get_arch(struct task_struct *task) ? AUDIT_ARCH_I386 : AUDIT_ARCH_X86_64; } -void do_syscall_64(struct pt_regs *regs, int nr); -void do_int80_syscall_32(struct pt_regs *regs); -long do_fast_syscall_32(struct pt_regs *regs); +bool do_syscall_64(struct pt_regs *regs, int nr); +void do_int80_emulation(struct pt_regs *regs); #endif /* CONFIG_X86_32 */ +void do_int80_syscall_32(struct pt_regs *regs); +bool do_fast_syscall_32(struct pt_regs *regs); +bool do_SYSENTER_32(struct pt_regs *regs); + #endif /* _ASM_X86_SYSCALL_H */ diff --git a/arch/x86/include/asm/syscall_wrapper.h b/arch/x86/include/asm/syscall_wrapper.h index 59358d1bf880..7e88705e907f 100644 --- a/arch/x86/include/asm/syscall_wrapper.h +++ b/arch/x86/include/asm/syscall_wrapper.h @@ -6,7 +6,7 @@ #ifndef _ASM_X86_SYSCALL_WRAPPER_H #define _ASM_X86_SYSCALL_WRAPPER_H -struct pt_regs; +#include <asm/ptrace.h> extern long __x64_sys_ni_syscall(const struct pt_regs *regs); extern long __ia32_sys_ni_syscall(const struct pt_regs *regs); @@ -58,12 +58,29 @@ extern long __ia32_sys_ni_syscall(const struct pt_regs *regs); ,,regs->di,,regs->si,,regs->dx \ ,,regs->r10,,regs->r8,,regs->r9) \ + +/* SYSCALL_PT_ARGS is Adapted from s390x */ +#define SYSCALL_PT_ARG6(m, t1, t2, t3, t4, t5, t6) \ + SYSCALL_PT_ARG5(m, t1, t2, t3, t4, t5), m(t6, (regs->bp)) +#define SYSCALL_PT_ARG5(m, t1, t2, t3, t4, t5) \ + SYSCALL_PT_ARG4(m, t1, t2, t3, t4), m(t5, (regs->di)) +#define SYSCALL_PT_ARG4(m, t1, t2, t3, t4) \ + SYSCALL_PT_ARG3(m, t1, t2, t3), m(t4, (regs->si)) +#define SYSCALL_PT_ARG3(m, t1, t2, t3) \ + SYSCALL_PT_ARG2(m, t1, t2), m(t3, (regs->dx)) +#define SYSCALL_PT_ARG2(m, t1, t2) \ + SYSCALL_PT_ARG1(m, t1), m(t2, (regs->cx)) +#define SYSCALL_PT_ARG1(m, t1) m(t1, (regs->bx)) +#define SYSCALL_PT_ARGS(x, ...) 
SYSCALL_PT_ARG##x(__VA_ARGS__) + +#define __SC_COMPAT_CAST(t, a) \ + (__typeof(__builtin_choose_expr(__TYPE_IS_L(t), 0, 0U))) \ + (unsigned int)a + /* Mapping of registers to parameters for syscalls on i386 */ #define SC_IA32_REGS_TO_ARGS(x, ...) \ - __MAP(x,__SC_ARGS \ - ,,(unsigned int)regs->bx,,(unsigned int)regs->cx \ - ,,(unsigned int)regs->dx,,(unsigned int)regs->si \ - ,,(unsigned int)regs->di,,(unsigned int)regs->bp) + SYSCALL_PT_ARGS(x, __SC_COMPAT_CAST, \ + __MAP(x, __SC_TYPE, __VA_ARGS__)) \ #define __SYS_STUB0(abi, name) \ long __##abi##_##name(const struct pt_regs *regs); \ @@ -86,9 +103,6 @@ extern long __ia32_sys_ni_syscall(const struct pt_regs *regs); return sys_ni_syscall(); \ } -#define __SYS_NI(abi, name) \ - SYSCALL_ALIAS(__##abi##_##name, sys_ni_posix_timers); - #ifdef CONFIG_X86_64 #define __X64_SYS_STUB0(name) \ __SYS_STUB0(x64, sys_##name) @@ -100,13 +114,10 @@ extern long __ia32_sys_ni_syscall(const struct pt_regs *regs); #define __X64_COND_SYSCALL(name) \ __COND_SYSCALL(x64, sys_##name) -#define __X64_SYS_NI(name) \ - __SYS_NI(x64, sys_##name) #else /* CONFIG_X86_64 */ #define __X64_SYS_STUB0(name) #define __X64_SYS_STUBx(x, name, ...) #define __X64_COND_SYSCALL(name) -#define __X64_SYS_NI(name) #endif /* CONFIG_X86_64 */ #if defined(CONFIG_X86_32) || defined(CONFIG_IA32_EMULATION) @@ -120,13 +131,10 @@ extern long __ia32_sys_ni_syscall(const struct pt_regs *regs); #define __IA32_COND_SYSCALL(name) \ __COND_SYSCALL(ia32, sys_##name) -#define __IA32_SYS_NI(name) \ - __SYS_NI(ia32, sys_##name) #else /* CONFIG_X86_32 || CONFIG_IA32_EMULATION */ #define __IA32_SYS_STUB0(name) #define __IA32_SYS_STUBx(x, name, ...) 
#define __IA32_COND_SYSCALL(name) -#define __IA32_SYS_NI(name) #endif /* CONFIG_X86_32 || CONFIG_IA32_EMULATION */ #ifdef CONFIG_IA32_EMULATION @@ -135,8 +143,7 @@ extern long __ia32_sys_ni_syscall(const struct pt_regs *regs); * additional wrappers (aptly named __ia32_sys_xyzzy) which decode the * ia32 regs in the proper order for shared or "common" syscalls. As some * syscalls may not be implemented, we need to expand COND_SYSCALL in - * kernel/sys_ni.c and SYS_NI in kernel/time/posix-stubs.c to cover this - * case as well. + * kernel/sys_ni.c to cover this case as well. */ #define __IA32_COMPAT_SYS_STUB0(name) \ __SYS_STUB0(ia32, compat_sys_##name) @@ -148,14 +155,10 @@ extern long __ia32_sys_ni_syscall(const struct pt_regs *regs); #define __IA32_COMPAT_COND_SYSCALL(name) \ __COND_SYSCALL(ia32, compat_sys_##name) -#define __IA32_COMPAT_SYS_NI(name) \ - __SYS_NI(ia32, compat_sys_##name) - #else /* CONFIG_IA32_EMULATION */ #define __IA32_COMPAT_SYS_STUB0(name) #define __IA32_COMPAT_SYS_STUBx(x, name, ...) #define __IA32_COMPAT_COND_SYSCALL(name) -#define __IA32_COMPAT_SYS_NI(name) #endif /* CONFIG_IA32_EMULATION */ @@ -175,13 +178,10 @@ extern long __ia32_sys_ni_syscall(const struct pt_regs *regs); #define __X32_COMPAT_COND_SYSCALL(name) \ __COND_SYSCALL(x64, compat_sys_##name) -#define __X32_COMPAT_SYS_NI(name) \ - __SYS_NI(x64, compat_sys_##name) #else /* CONFIG_X86_X32_ABI */ #define __X32_COMPAT_SYS_STUB0(name) #define __X32_COMPAT_SYS_STUBx(x, name, ...) #define __X32_COMPAT_COND_SYSCALL(name) -#define __X32_COMPAT_SYS_NI(name) #endif /* CONFIG_X86_X32_ABI */ @@ -212,17 +212,12 @@ extern long __ia32_sys_ni_syscall(const struct pt_regs *regs); /* * As some compat syscalls may not be implemented, we need to expand - * COND_SYSCALL_COMPAT in kernel/sys_ni.c and COMPAT_SYS_NI in - * kernel/time/posix-stubs.c to cover this case as well. + * COND_SYSCALL_COMPAT in kernel/sys_ni.c to cover this case as well. 
*/ #define COND_SYSCALL_COMPAT(name) \ __IA32_COMPAT_COND_SYSCALL(name) \ __X32_COMPAT_COND_SYSCALL(name) -#define COMPAT_SYS_NI(name) \ - __IA32_COMPAT_SYS_NI(name) \ - __X32_COMPAT_SYS_NI(name) - #endif /* CONFIG_COMPAT */ #define __SYSCALL_DEFINEx(x, name, ...) \ @@ -243,8 +238,8 @@ extern long __ia32_sys_ni_syscall(const struct pt_regs *regs); * As the generic SYSCALL_DEFINE0() macro does not decode any parameters for * obvious reasons, and passing struct pt_regs *regs to it in %rdi does not * hurt, we only need to re-define it here to keep the naming congruent to - * SYSCALL_DEFINEx() -- which is essential for the COND_SYSCALL() and SYS_NI() - * macros to work correctly. + * SYSCALL_DEFINEx() -- which is essential for the COND_SYSCALL() macro + * to work correctly. */ #define SYSCALL_DEFINE0(sname) \ SYSCALL_METADATA(_##sname, 0); \ @@ -257,10 +252,6 @@ extern long __ia32_sys_ni_syscall(const struct pt_regs *regs); __X64_COND_SYSCALL(name) \ __IA32_COND_SYSCALL(name) -#define SYS_NI(name) \ - __X64_SYS_NI(name) \ - __IA32_SYS_NI(name) - /* * For VSYSCALLS, we need to declare these three syscalls with the new diff --git a/arch/x86/include/asm/tdx.h b/arch/x86/include/asm/tdx.h index 020c81a7c729..7ddef3a69866 100644 --- a/arch/x86/include/asm/tdx.h +++ b/arch/x86/include/asm/tdx.h @@ -5,7 +5,11 @@ #include <linux/init.h> #include <linux/bits.h> +#include <linux/mmzone.h> + +#include <asm/errno.h> #include <asm/ptrace.h> +#include <asm/trapnr.h> #include <asm/shared/tdx.h> /* @@ -15,25 +19,24 @@ * TDX module. 
*/ #define TDX_ERROR _BITUL(63) +#define TDX_NON_RECOVERABLE _BITUL(62) #define TDX_SW_ERROR (TDX_ERROR | GENMASK_ULL(47, 40)) #define TDX_SEAMCALL_VMFAILINVALID (TDX_SW_ERROR | _UL(0xFFFF0000)) -#ifndef __ASSEMBLY__ +#define TDX_SEAMCALL_GP (TDX_SW_ERROR | X86_TRAP_GP) +#define TDX_SEAMCALL_UD (TDX_SW_ERROR | X86_TRAP_UD) /* - * Used to gather the output registers values of the TDCALL and SEAMCALL - * instructions when requesting services from the TDX module. - * - * This is a software only structure and not part of the TDX module/VMM ABI. + * TDX module SEAMCALL leaf function error codes */ -struct tdx_module_output { - u64 rcx; - u64 rdx; - u64 r8; - u64 r9; - u64 r10; - u64 r11; -}; +#define TDX_SUCCESS 0ULL +#define TDX_RND_NO_ENTROPY 0x8000020300000000ULL + +#ifndef __ASSEMBLER__ + +#include <uapi/asm/mce.h> +#include <asm/tdx_global_metadata.h> +#include <linux/pgtable.h> /* * Used by the #VE exception handler to gather the #VE exception @@ -55,22 +58,27 @@ struct ve_info { void __init tdx_early_init(void); -/* Used to communicate with the TDX module */ -u64 __tdx_module_call(u64 fn, u64 rcx, u64 rdx, u64 r8, u64 r9, - struct tdx_module_output *out); - void tdx_get_ve_info(struct ve_info *ve); bool tdx_handle_virt_exception(struct pt_regs *regs, struct ve_info *ve); -void tdx_safe_halt(void); +void tdx_halt(void); bool tdx_early_handle_ve(struct pt_regs *regs); +int tdx_mcall_get_report0(u8 *reportdata, u8 *tdreport); + +int tdx_mcall_extend_rtmr(u8 index, u8 *data); + +u64 tdx_hcall_get_quote(u8 *buf, size_t size); + +void __init tdx_dump_attributes(u64 td_attr); +void __init tdx_dump_td_ctls(u64 td_ctls); + #else static inline void tdx_early_init(void) { }; -static inline void tdx_safe_halt(void) { }; +static inline void tdx_halt(void) { }; static inline bool tdx_early_handle_ve(struct pt_regs *regs) { return false; } @@ -87,5 +95,113 @@ static inline long tdx_kvm_hypercall(unsigned int nr, unsigned long p1, return -ENODEV; } #endif /* 
CONFIG_INTEL_TDX_GUEST && CONFIG_KVM_GUEST */ -#endif /* !__ASSEMBLY__ */ + +#ifdef CONFIG_INTEL_TDX_HOST +u64 __seamcall(u64 fn, struct tdx_module_args *args); +u64 __seamcall_ret(u64 fn, struct tdx_module_args *args); +u64 __seamcall_saved_ret(u64 fn, struct tdx_module_args *args); +void tdx_init(void); + +#include <asm/archrandom.h> + +typedef u64 (*sc_func_t)(u64 fn, struct tdx_module_args *args); + +static __always_inline u64 sc_retry(sc_func_t func, u64 fn, + struct tdx_module_args *args) +{ + int retry = RDRAND_RETRY_LOOPS; + u64 ret; + + do { + ret = func(fn, args); + } while (ret == TDX_RND_NO_ENTROPY && --retry); + + return ret; +} + +#define seamcall(_fn, _args) sc_retry(__seamcall, (_fn), (_args)) +#define seamcall_ret(_fn, _args) sc_retry(__seamcall_ret, (_fn), (_args)) +#define seamcall_saved_ret(_fn, _args) sc_retry(__seamcall_saved_ret, (_fn), (_args)) +int tdx_cpu_enable(void); +int tdx_enable(void); +const char *tdx_dump_mce_info(struct mce *m); +const struct tdx_sys_info *tdx_get_sysinfo(void); + +int tdx_guest_keyid_alloc(void); +u32 tdx_get_nr_guest_keyids(void); +void tdx_guest_keyid_free(unsigned int keyid); + +struct tdx_td { + /* TD root structure: */ + struct page *tdr_page; + + int tdcs_nr_pages; + /* TD control structure: */ + struct page **tdcs_pages; + + /* Size of `tdcx_pages` in struct tdx_vp */ + int tdcx_nr_pages; +}; + +struct tdx_vp { + /* TDVP root page */ + struct page *tdvpr_page; + + /* TD vCPU control structure: */ + struct page **tdcx_pages; +}; + +static inline u64 mk_keyed_paddr(u16 hkid, struct page *page) +{ + u64 ret; + + ret = page_to_phys(page); + /* KeyID bits are just above the physical address bits: */ + ret |= (u64)hkid << boot_cpu_data.x86_phys_bits; + + return ret; +} + +static inline int pg_level_to_tdx_sept_level(enum pg_level level) +{ + WARN_ON_ONCE(level == PG_LEVEL_NONE); + return level - 1; +} + +u64 tdh_vp_enter(struct tdx_vp *vp, struct tdx_module_args *args); +u64 tdh_mng_addcx(struct tdx_td *td, 
struct page *tdcs_page); +u64 tdh_mem_page_add(struct tdx_td *td, u64 gpa, struct page *page, struct page *source, u64 *ext_err1, u64 *ext_err2); +u64 tdh_mem_sept_add(struct tdx_td *td, u64 gpa, int level, struct page *page, u64 *ext_err1, u64 *ext_err2); +u64 tdh_vp_addcx(struct tdx_vp *vp, struct page *tdcx_page); +u64 tdh_mem_page_aug(struct tdx_td *td, u64 gpa, int level, struct page *page, u64 *ext_err1, u64 *ext_err2); +u64 tdh_mem_range_block(struct tdx_td *td, u64 gpa, int level, u64 *ext_err1, u64 *ext_err2); +u64 tdh_mng_key_config(struct tdx_td *td); +u64 tdh_mng_create(struct tdx_td *td, u16 hkid); +u64 tdh_vp_create(struct tdx_td *td, struct tdx_vp *vp); +u64 tdh_mng_rd(struct tdx_td *td, u64 field, u64 *data); +u64 tdh_mr_extend(struct tdx_td *td, u64 gpa, u64 *ext_err1, u64 *ext_err2); +u64 tdh_mr_finalize(struct tdx_td *td); +u64 tdh_vp_flush(struct tdx_vp *vp); +u64 tdh_mng_vpflushdone(struct tdx_td *td); +u64 tdh_mng_key_freeid(struct tdx_td *td); +u64 tdh_mng_init(struct tdx_td *td, u64 td_params, u64 *extended_err); +u64 tdh_vp_init(struct tdx_vp *vp, u64 initial_rcx, u32 x2apicid); +u64 tdh_vp_rd(struct tdx_vp *vp, u64 field, u64 *data); +u64 tdh_vp_wr(struct tdx_vp *vp, u64 field, u64 data, u64 mask); +u64 tdh_phymem_page_reclaim(struct page *page, u64 *tdx_pt, u64 *tdx_owner, u64 *tdx_size); +u64 tdh_mem_track(struct tdx_td *tdr); +u64 tdh_mem_page_remove(struct tdx_td *td, u64 gpa, u64 level, u64 *ext_err1, u64 *ext_err2); +u64 tdh_phymem_cache_wb(bool resume); +u64 tdh_phymem_page_wbinvd_tdr(struct tdx_td *td); +u64 tdh_phymem_page_wbinvd_hkid(u64 hkid, struct page *page); +#else +static inline void tdx_init(void) { } +static inline int tdx_cpu_enable(void) { return -ENODEV; } +static inline int tdx_enable(void) { return -ENODEV; } +static inline u32 tdx_get_nr_guest_keyids(void) { return 0; } +static inline const char *tdx_dump_mce_info(struct mce *m) { return NULL; } +static inline const struct tdx_sys_info *tdx_get_sysinfo(void) { 
return NULL; } +#endif /* CONFIG_INTEL_TDX_HOST */ + +#endif /* !__ASSEMBLER__ */ #endif /* _ASM_X86_TDX_H */ diff --git a/arch/x86/include/asm/tdx_global_metadata.h b/arch/x86/include/asm/tdx_global_metadata.h new file mode 100644 index 000000000000..060a2ad744bf --- /dev/null +++ b/arch/x86/include/asm/tdx_global_metadata.h @@ -0,0 +1,44 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* Automatically generated TDX global metadata structures. */ +#ifndef _X86_VIRT_TDX_AUTO_GENERATED_TDX_GLOBAL_METADATA_H +#define _X86_VIRT_TDX_AUTO_GENERATED_TDX_GLOBAL_METADATA_H + +#include <linux/types.h> + +struct tdx_sys_info_features { + u64 tdx_features0; +}; + +struct tdx_sys_info_tdmr { + u16 max_tdmrs; + u16 max_reserved_per_tdmr; + u16 pamt_4k_entry_size; + u16 pamt_2m_entry_size; + u16 pamt_1g_entry_size; +}; + +struct tdx_sys_info_td_ctrl { + u16 tdr_base_size; + u16 tdcs_base_size; + u16 tdvps_base_size; +}; + +struct tdx_sys_info_td_conf { + u64 attributes_fixed0; + u64 attributes_fixed1; + u64 xfam_fixed0; + u64 xfam_fixed1; + u16 num_cpuid_config; + u16 max_vcpus_per_td; + u64 cpuid_config_leaves[128]; + u64 cpuid_config_values[128][2]; +}; + +struct tdx_sys_info { + struct tdx_sys_info_features features; + struct tdx_sys_info_tdmr tdmr; + struct tdx_sys_info_td_ctrl td_ctrl; + struct tdx_sys_info_td_conf td_conf; +}; + +#endif diff --git a/arch/x86/include/asm/text-patching.h b/arch/x86/include/asm/text-patching.h index 1cc15528ce29..5337f1be18f6 100644 --- a/arch/x86/include/asm/text-patching.h +++ b/arch/x86/include/asm/text-patching.h @@ -6,27 +6,17 @@ #include <linux/stddef.h> #include <asm/ptrace.h> -struct paravirt_patch_site; -#ifdef CONFIG_PARAVIRT -void apply_paravirt(struct paravirt_patch_site *start, - struct paravirt_patch_site *end); -#else -static inline void apply_paravirt(struct paravirt_patch_site *start, - struct paravirt_patch_site *end) -{} -#define __parainstructions NULL -#define __parainstructions_end NULL -#endif - /* * Currently, the max 
observed size in the kernel code is * JUMP_LABEL_NOP_SIZE/RELATIVEJUMP_SIZE, which are 5. * Raise it if needed. */ -#define POKE_MAX_OPCODE_SIZE 5 +#define TEXT_POKE_MAX_OPCODE_SIZE 5 extern void text_poke_early(void *addr, const void *opcode, size_t len); +extern void text_poke_apply_relocation(u8 *buf, const u8 * const instr, size_t instrlen, u8 *repl, size_t repl_len); + /* * Clear and restore the kernel write-protection flag on the local CPU. * Allows the kernel to edit read-only pages. @@ -42,15 +32,17 @@ extern void text_poke_early(void *addr, const void *opcode, size_t len); * an inconsistent instruction while you patch. */ extern void *text_poke(void *addr, const void *opcode, size_t len); -extern void text_poke_sync(void); +extern void smp_text_poke_sync_each_cpu(void); extern void *text_poke_kgdb(void *addr, const void *opcode, size_t len); extern void *text_poke_copy(void *addr, const void *opcode, size_t len); +#define text_poke_copy text_poke_copy +extern void *text_poke_copy_locked(void *addr, const void *opcode, size_t len, bool core_ok); extern void *text_poke_set(void *addr, int c, size_t len); -extern int poke_int3_handler(struct pt_regs *regs); -extern void text_poke_bp(void *addr, const void *opcode, size_t len, const void *emulate); +extern int smp_text_poke_int3_handler(struct pt_regs *regs); +extern void smp_text_poke_single(void *addr, const void *opcode, size_t len, const void *emulate); -extern void text_poke_queue(void *addr, const void *opcode, size_t len, const void *emulate); -extern void text_poke_finish(void); +extern void smp_text_poke_batch_add(void *addr, const void *opcode, size_t len, const void *emulate); +extern void smp_text_poke_batch_finish(void); #define INT3_INSN_SIZE 1 #define INT3_INSN_OPCODE 0xCC @@ -90,7 +82,7 @@ static __always_inline int text_opcode_size(u8 opcode) } union text_poke_insn { - u8 text[POKE_MAX_OPCODE_SIZE]; + u8 text[TEXT_POKE_MAX_OPCODE_SIZE]; struct { u8 opcode; s32 disp; @@ -136,8 +128,8 @@ void 
*text_gen_insn(u8 opcode, const void *addr, const void *dest) } extern int after_bootmem; -extern __ro_after_init struct mm_struct *poking_mm; -extern __ro_after_init unsigned long poking_addr; +extern __ro_after_init struct mm_struct *text_poke_mm; +extern __ro_after_init unsigned long text_poke_mm_addr; #ifndef CONFIG_UML_X86 static __always_inline @@ -150,13 +142,14 @@ static __always_inline void int3_emulate_push(struct pt_regs *regs, unsigned long val) { /* - * The int3 handler in entry_64.S adds a gap between the + * The INT3 handler in entry_64.S adds a gap between the * stack where the break point happened, and the saving of * pt_regs. We can extend the original stack because of - * this gap. See the idtentry macro's create_gap option. + * this gap. See the idtentry macro's X86_TRAP_BP logic. * - * Similarly entry_32.S will have a gap on the stack for (any) hardware - * exception and pt_regs; see FIXUP_FRAME. + * Similarly, entry_32.S will have a gap on the stack for + * (any) hardware exception and pt_regs; see the + * FIXUP_FRAME macro. 
*/ regs->sp -= sizeof(unsigned long); *(unsigned long *)regs->sp = val; @@ -183,6 +176,37 @@ void int3_emulate_ret(struct pt_regs *regs) unsigned long ip = int3_emulate_pop(regs); int3_emulate_jmp(regs, ip); } + +static __always_inline +void int3_emulate_jcc(struct pt_regs *regs, u8 cc, unsigned long ip, unsigned long disp) +{ + static const unsigned long jcc_mask[6] = { + [0] = X86_EFLAGS_OF, + [1] = X86_EFLAGS_CF, + [2] = X86_EFLAGS_ZF, + [3] = X86_EFLAGS_CF | X86_EFLAGS_ZF, + [4] = X86_EFLAGS_SF, + [5] = X86_EFLAGS_PF, + }; + + bool invert = cc & 1; + bool match; + + if (cc < 0xc) { + match = regs->flags & jcc_mask[cc >> 1]; + } else { + match = ((regs->flags & X86_EFLAGS_SF) >> X86_EFLAGS_SF_BIT) ^ + ((regs->flags & X86_EFLAGS_OF) >> X86_EFLAGS_OF_BIT); + if (cc >= 0xe) + match = match || (regs->flags & X86_EFLAGS_ZF); + } + + if ((match && !invert) || (!match && invert)) + ip += disp; + + int3_emulate_jmp(regs, ip); +} + #endif /* !CONFIG_UML_X86 */ #endif /* _ASM_X86_TEXT_PATCHING_H */ diff --git a/arch/x86/include/asm/thread_info.h b/arch/x86/include/asm/thread_info.h index f0cb881c1d69..9282465eea21 100644 --- a/arch/x86/include/asm/thread_info.h +++ b/arch/x86/include/asm/thread_info.h @@ -31,7 +31,9 @@ * In vm86 mode, the hardware frame is much longer still, so add 16 * bytes to make room for the real-mode segments. * - * x86_64 has a fixed-length stack frame. + * x86-64 has a fixed-length stack frame, but it depends on whether + * or not FRED is enabled. Future versions of FRED might make this + * dynamic, but for now it is always 2 words longer. 
*/ #ifdef CONFIG_X86_32 # ifdef CONFIG_VM86 @@ -39,8 +41,12 @@ # else # define TOP_OF_KERNEL_STACK_PADDING 8 # endif -#else -# define TOP_OF_KERNEL_STACK_PADDING 0 +#else /* x86-64 */ +# ifdef CONFIG_X86_FRED +# define TOP_OF_KERNEL_STACK_PADDING (2 * 8) +# else +# define TOP_OF_KERNEL_STACK_PADDING 0 +# endif #endif /* @@ -48,7 +54,7 @@ * - this struct should fit entirely inside of one cache line * - this struct shares the supervisor stack pages */ -#ifndef __ASSEMBLY__ +#ifndef __ASSEMBLER__ struct task_struct; #include <asm/cpufeature.h> #include <linux/atomic.h> @@ -67,7 +73,7 @@ struct thread_info { .flags = 0, \ } -#else /* !__ASSEMBLY__ */ +#else /* !__ASSEMBLER__ */ #include <asm/asm-offsets.h> @@ -81,8 +87,9 @@ struct thread_info { #define TIF_NOTIFY_RESUME 1 /* callback before returning to user */ #define TIF_SIGPENDING 2 /* signal pending */ #define TIF_NEED_RESCHED 3 /* rescheduling necessary */ -#define TIF_SINGLESTEP 4 /* reenable singlestep on user return*/ -#define TIF_SSBD 5 /* Speculative store bypass disable */ +#define TIF_NEED_RESCHED_LAZY 4 /* Lazy rescheduling needed */ +#define TIF_SINGLESTEP 5 /* reenable singlestep on user return*/ +#define TIF_SSBD 6 /* Speculative store bypass disable */ #define TIF_SPEC_IB 9 /* Indirect branch speculation mitigation */ #define TIF_SPEC_L1D_FLUSH 10 /* Flush L1D on mm switches (processes) */ #define TIF_USER_RETURN_NOTIFY 11 /* notify kernel of userspace return */ @@ -104,6 +111,7 @@ struct thread_info { #define _TIF_NOTIFY_RESUME (1 << TIF_NOTIFY_RESUME) #define _TIF_SIGPENDING (1 << TIF_SIGPENDING) #define _TIF_NEED_RESCHED (1 << TIF_NEED_RESCHED) +#define _TIF_NEED_RESCHED_LAZY (1 << TIF_NEED_RESCHED_LAZY) #define _TIF_SINGLESTEP (1 << TIF_SINGLESTEP) #define _TIF_SSBD (1 << TIF_SSBD) #define _TIF_SPEC_IB (1 << TIF_SPEC_IB) @@ -153,7 +161,7 @@ struct thread_info { * * preempt_count needs to be 1 initially, until the scheduler is functional. 
*/ -#ifndef __ASSEMBLY__ +#ifndef __ASSEMBLER__ /* * Walks up the stack frames to make sure that the specified object is @@ -163,7 +171,12 @@ struct thread_info { * GOOD_FRAME if within a frame * BAD_STACK if placed across a frame boundary (or outside stack) * NOT_STACK unable to determine (no frame pointers, etc) + * + * This function reads pointers from the stack and dereferences them. The + * pointers may not have their KMSAN shadow set up properly, which may result + * in false positive reports. Disable instrumentation to avoid those. */ +__no_kmsan_checks static inline int arch_within_stack_frames(const void * const stack, const void * const stackend, const void *obj, unsigned long len) @@ -200,7 +213,7 @@ static inline int arch_within_stack_frames(const void * const stack, #endif } -#endif /* !__ASSEMBLY__ */ +#endif /* !__ASSEMBLER__ */ /* * Thread-synchronous status. @@ -211,7 +224,7 @@ static inline int arch_within_stack_frames(const void * const stack, */ #define TS_COMPAT 0x0002 /* 32bit syscall active (64BIT)*/ -#ifndef __ASSEMBLY__ +#ifndef __ASSEMBLER__ #ifdef CONFIG_COMPAT #define TS_I386_REGS_POKED 0x0004 /* regs poked by 32-bit ptracer */ @@ -227,11 +240,8 @@ static inline int arch_within_stack_frames(const void * const stack, current_thread_info()->status & TS_COMPAT) #endif -extern void arch_task_cache_init(void); -extern int arch_dup_task_struct(struct task_struct *dst, struct task_struct *src); -extern void arch_release_task_struct(struct task_struct *tsk); extern void arch_setup_new_exec(void); #define arch_setup_new_exec arch_setup_new_exec -#endif /* !__ASSEMBLY__ */ +#endif /* !__ASSEMBLER__ */ #endif /* _ASM_X86_THREAD_INFO_H */ diff --git a/arch/x86/include/asm/time.h b/arch/x86/include/asm/time.h index 8ac563abb567..f360104ed172 100644 --- a/arch/x86/include/asm/time.h +++ b/arch/x86/include/asm/time.h @@ -6,8 +6,8 @@ #include <asm/mc146818rtc.h> extern void hpet_time_init(void); -extern void time_init(void); extern bool 
pit_timer_init(void); +extern bool tsc_clocksource_watchdog_disabled(void); extern struct clock_event_device *global_clock_event; diff --git a/arch/x86/include/asm/timer.h b/arch/x86/include/asm/timer.h index 7365dd4acffb..23baf8c9b34c 100644 --- a/arch/x86/include/asm/timer.h +++ b/arch/x86/include/asm/timer.h @@ -6,8 +6,6 @@ #include <linux/interrupt.h> #include <linux/math64.h> -#define TICK_SIZE (tick_nsec / 1000) - unsigned long long native_sched_clock(void); extern void recalibrate_cpu_khz(void); diff --git a/arch/x86/include/asm/tlb.h b/arch/x86/include/asm/tlb.h index 580636cdc257..866ea78ba156 100644 --- a/arch/x86/include/asm/tlb.h +++ b/arch/x86/include/asm/tlb.h @@ -6,6 +6,9 @@ static inline void tlb_flush(struct mmu_gather *tlb); #include <asm-generic/tlb.h> +#include <linux/kernel.h> +#include <vdso/bits.h> +#include <vdso/page.h> static inline void tlb_flush(struct mmu_gather *tlb) { @@ -20,18 +23,144 @@ static inline void tlb_flush(struct mmu_gather *tlb) flush_tlb_mm_range(tlb->mm, start, end, stride_shift, tlb->freed_tables); } +static inline void invlpg(unsigned long addr) +{ + asm volatile("invlpg (%0)" ::"r" (addr) : "memory"); +} + +enum addr_stride { + PTE_STRIDE = 0, + PMD_STRIDE = 1 +}; + +/* + * INVLPGB can be targeted by virtual address, PCID, ASID, or any combination + * of the three. For example: + * - FLAG_VA | FLAG_INCLUDE_GLOBAL: invalidate all TLB entries at the address + * - FLAG_PCID: invalidate all TLB entries matching the PCID + * + * The first is used to invalidate (kernel) mappings at a particular + * address across all processes. + * + * The latter invalidates all TLB entries matching a PCID. 
+ */ +#define INVLPGB_FLAG_VA BIT(0) +#define INVLPGB_FLAG_PCID BIT(1) +#define INVLPGB_FLAG_ASID BIT(2) +#define INVLPGB_FLAG_INCLUDE_GLOBAL BIT(3) +#define INVLPGB_FLAG_FINAL_ONLY BIT(4) +#define INVLPGB_FLAG_INCLUDE_NESTED BIT(5) + +/* The implied mode when all bits are clear: */ +#define INVLPGB_MODE_ALL_NONGLOBALS 0UL + +#ifdef CONFIG_BROADCAST_TLB_FLUSH /* - * While x86 architecture in general requires an IPI to perform TLB - * shootdown, enablement code for several hypervisors overrides - * .flush_tlb_others hook in pv_mmu_ops and implements it by issuing - * a hypercall. To keep software pagetable walkers safe in this case we - * switch to RCU based table free (MMU_GATHER_RCU_TABLE_FREE). See the comment - * below 'ifdef CONFIG_MMU_GATHER_RCU_TABLE_FREE' in include/asm-generic/tlb.h - * for more details. + * INVLPGB does broadcast TLB invalidation across all the CPUs in the system. + * + * The INVLPGB instruction is weakly ordered, and a batch of invalidations can + * be done in a parallel fashion. + * + * The instruction takes the number of extra pages to invalidate, beyond the + * first page, while __invlpgb gets the more human readable number of pages to + * invalidate. + * + * The bits in rax[0:2] determine respectively which components of the address + * (VA, PCID, ASID) get compared when flushing. If neither bits are set, *any* + * address in the specified range matches. + * + * Since it is desired to only flush TLB entries for the ASID that is executing + * the instruction (a host/hypervisor or a guest), the ASID valid bit should + * always be set. On a host/hypervisor, the hardware will use the ASID value + * specified in EDX[15:0] (which should be 0). On a guest, the hardware will + * use the actual ASID value of the guest. + * + * TLBSYNC is used to ensure that pending INVLPGB invalidations initiated from + * this CPU have completed. 
*/ -static inline void __tlb_remove_table(void *table) +static inline void __invlpgb(unsigned long asid, unsigned long pcid, + unsigned long addr, u16 nr_pages, + enum addr_stride stride, u8 flags) +{ + u64 rax = addr | flags | INVLPGB_FLAG_ASID; + u32 ecx = (stride << 31) | (nr_pages - 1); + u32 edx = (pcid << 16) | asid; + + /* The low bits in rax are for flags. Verify addr is clean. */ + VM_WARN_ON_ONCE(addr & ~PAGE_MASK); + + /* INVLPGB; supported in binutils >= 2.36. */ + asm volatile(".byte 0x0f, 0x01, 0xfe" :: "a" (rax), "c" (ecx), "d" (edx)); +} + +static inline void __invlpgb_all(unsigned long asid, unsigned long pcid, u8 flags) +{ + __invlpgb(asid, pcid, 0, 1, 0, flags); +} + +static inline void __tlbsync(void) { - free_page_and_swap_cache(table); + /* + * TLBSYNC waits for INVLPGB instructions originating on the same CPU + * to have completed. Print a warning if the task has been migrated, + * and might not be waiting on all the INVLPGBs issued during this TLB + * invalidation sequence. + */ + cant_migrate(); + + /* TLBSYNC: supported in binutils >= 2.36. */ + asm volatile(".byte 0x0f, 0x01, 0xff" ::: "memory"); +} +#else +/* Some compilers (I'm looking at you clang!) simply can't do DCE */ +static inline void __invlpgb(unsigned long asid, unsigned long pcid, + unsigned long addr, u16 nr_pages, + enum addr_stride s, u8 flags) { } +static inline void __invlpgb_all(unsigned long asid, unsigned long pcid, u8 flags) { } +static inline void __tlbsync(void) { } +#endif + +static inline void invlpgb_flush_user_nr_nosync(unsigned long pcid, + unsigned long addr, + u16 nr, bool stride) +{ + enum addr_stride str = stride ? PMD_STRIDE : PTE_STRIDE; + u8 flags = INVLPGB_FLAG_PCID | INVLPGB_FLAG_VA; + + __invlpgb(0, pcid, addr, nr, str, flags); +} + +/* Flush all mappings for a given PCID, not including globals.
*/ +static inline void invlpgb_flush_single_pcid_nosync(unsigned long pcid) +{ + __invlpgb_all(0, pcid, INVLPGB_FLAG_PCID); } +/* Flush all mappings, including globals, for all PCIDs. */ +static inline void invlpgb_flush_all(void) +{ + /* + * TLBSYNC at the end needs to make sure all flushes done on the + * current CPU have been executed system-wide. Therefore, make + * sure nothing gets migrated in-between but disable preemption + * as it is cheaper. + */ + guard(preempt)(); + __invlpgb_all(0, 0, INVLPGB_FLAG_INCLUDE_GLOBAL); + __tlbsync(); +} + +/* Flush addr, including globals, for all PCIDs. */ +static inline void invlpgb_flush_addr_nosync(unsigned long addr, u16 nr) +{ + __invlpgb(0, 0, addr, nr, PTE_STRIDE, INVLPGB_FLAG_INCLUDE_GLOBAL); +} + +/* Flush all mappings for all PCIDs except globals. */ +static inline void invlpgb_flush_all_nonglobals(void) +{ + guard(preempt)(); + __invlpgb_all(0, 0, INVLPGB_MODE_ALL_NONGLOBALS); + __tlbsync(); +} #endif /* _ASM_X86_TLB_H */ diff --git a/arch/x86/include/asm/tlbbatch.h b/arch/x86/include/asm/tlbbatch.h index 1ad56eb3e8a8..80aaf64ff25f 100644 --- a/arch/x86/include/asm/tlbbatch.h +++ b/arch/x86/include/asm/tlbbatch.h @@ -10,6 +10,11 @@ struct arch_tlbflush_unmap_batch { * the PFNs being flushed.. */ struct cpumask cpumask; + /* + * Set if pages were unmapped from any MM, even one that does not + * have active CPUs in its cpumask. 
+ */ + bool unmapped_pages; }; #endif /* _ARCH_X86_TLBBATCH_H */ diff --git a/arch/x86/include/asm/tlbflush.h b/arch/x86/include/asm/tlbflush.h index cda3118f3b27..e9b81876ebe4 100644 --- a/arch/x86/include/asm/tlbflush.h +++ b/arch/x86/include/asm/tlbflush.h @@ -2,9 +2,11 @@ #ifndef _ASM_X86_TLBFLUSH_H #define _ASM_X86_TLBFLUSH_H -#include <linux/mm.h> +#include <linux/mm_types.h> +#include <linux/mmu_notifier.h> #include <linux/sched.h> +#include <asm/barrier.h> #include <asm/processor.h> #include <asm/cpufeature.h> #include <asm/special_insns.h> @@ -12,6 +14,9 @@ #include <asm/invpcid.h> #include <asm/pti.h> #include <asm/processor-flags.h> +#include <asm/pgtable.h> + +DECLARE_PER_CPU(u64, tlbstate_untag_mask); void __flush_tlb_all(void); @@ -101,6 +106,16 @@ struct tlb_state { */ bool invalidate_other; +#ifdef CONFIG_ADDRESS_MASKING + /* + * Active LAM mode. + * + * X86_CR3_LAM_U57/U48 shifted right by X86_CR3_LAM_U57_BIT or 0 if LAM + * disabled. + */ + u8 lam; +#endif + /* * Mask that contains TLB_NR_DYN_ASIDS+1 bits to indicate * the corresponding user PCID needs a flush next time we @@ -169,6 +184,9 @@ static inline void cr4_init_shadow(void) extern unsigned long mmu_cr4_features; extern u32 *trampoline_cr4_features; +/* How many pages can be invalidated with one INVLPGB. 
*/ +extern u16 invlpgb_count_max; + extern void initialize_tlbstate_and_flush(void); /* @@ -208,6 +226,7 @@ struct flush_tlb_info { unsigned int initiating_cpu; u8 stride_shift; u8 freed_tables; + u8 trim_cpumask; }; void flush_tlb_local(void); @@ -216,6 +235,71 @@ void flush_tlb_one_kernel(unsigned long addr); void flush_tlb_multi(const struct cpumask *cpumask, const struct flush_tlb_info *info); +static inline bool is_dyn_asid(u16 asid) +{ + return asid < TLB_NR_DYN_ASIDS; +} + +static inline bool is_global_asid(u16 asid) +{ + return !is_dyn_asid(asid); +} + +#ifdef CONFIG_BROADCAST_TLB_FLUSH +static inline u16 mm_global_asid(struct mm_struct *mm) +{ + u16 asid; + + if (!cpu_feature_enabled(X86_FEATURE_INVLPGB)) + return 0; + + asid = smp_load_acquire(&mm->context.global_asid); + + /* mm->context.global_asid is either 0, or a global ASID */ + VM_WARN_ON_ONCE(asid && is_dyn_asid(asid)); + + return asid; +} + +static inline void mm_init_global_asid(struct mm_struct *mm) +{ + if (cpu_feature_enabled(X86_FEATURE_INVLPGB)) { + mm->context.global_asid = 0; + mm->context.asid_transition = false; + } +} + +static inline void mm_assign_global_asid(struct mm_struct *mm, u16 asid) +{ + /* + * Notably flush_tlb_mm_range() -> broadcast_tlb_flush() -> + * finish_asid_transition() needs to observe asid_transition = true + * once it observes global_asid. 
+ */ + mm->context.asid_transition = true; + smp_store_release(&mm->context.global_asid, asid); +} + +static inline void mm_clear_asid_transition(struct mm_struct *mm) +{ + WRITE_ONCE(mm->context.asid_transition, false); +} + +static inline bool mm_in_asid_transition(struct mm_struct *mm) +{ + if (!cpu_feature_enabled(X86_FEATURE_INVLPGB)) + return false; + + return mm && READ_ONCE(mm->context.asid_transition); +} +#else +static inline u16 mm_global_asid(struct mm_struct *mm) { return 0; } +static inline void mm_init_global_asid(struct mm_struct *mm) { } +static inline void mm_assign_global_asid(struct mm_struct *mm, u16 asid) { } +static inline void mm_clear_asid_transition(struct mm_struct *mm) { } +static inline bool mm_in_asid_transition(struct mm_struct *mm) { return false; } +#endif /* CONFIG_BROADCAST_TLB_FLUSH */ + #ifdef CONFIG_PARAVIRT #include <asm/paravirt.h> #endif @@ -227,7 +311,7 @@ void flush_tlb_multi(const struct cpumask *cpumask, flush_tlb_mm_range((vma)->vm_mm, start, end, \ ((vma)->vm_flags & VM_HUGETLB) \ ? 
huge_page_shift(hstate_vma(vma)) \ - : PAGE_SHIFT, false) + : PAGE_SHIFT, true) extern void flush_tlb_all(void); extern void flush_tlb_mm_range(struct mm_struct *mm, unsigned long start, @@ -240,6 +324,18 @@ static inline void flush_tlb_page(struct vm_area_struct *vma, unsigned long a) flush_tlb_mm_range(vma->vm_mm, a, a + PAGE_SIZE, PAGE_SHIFT, false); } +static inline bool arch_tlbbatch_should_defer(struct mm_struct *mm) +{ + bool should_defer = false; + + /* If remote CPUs need to be flushed then defer batch the flush */ + if (cpumask_any_but(mm_cpumask(mm), get_cpu()) < nr_cpu_ids) + should_defer = true; + put_cpu(); + + return should_defer; +} + static inline u64 inc_mm_tlb_gen(struct mm_struct *mm) { /* @@ -251,11 +347,18 @@ static inline u64 inc_mm_tlb_gen(struct mm_struct *mm) return atomic64_inc_return(&mm->context.tlb_gen); } -static inline void arch_tlbbatch_add_mm(struct arch_tlbflush_unmap_batch *batch, - struct mm_struct *mm) +static inline void arch_tlbbatch_add_pending(struct arch_tlbflush_unmap_batch *batch, + struct mm_struct *mm, unsigned long start, unsigned long end) { inc_mm_tlb_gen(mm); cpumask_or(&batch->cpumask, &batch->cpumask, mm_cpumask(mm)); + batch->unmapped_pages = true; + mmu_notifier_arch_invalidate_secondary_tlbs(mm, 0, -1UL); +} + +static inline void arch_flush_tlb_batched_pending(struct mm_struct *mm) +{ + flush_tlb_mm(mm); } extern void arch_tlbbatch_flush(struct arch_tlbflush_unmap_batch *batch); @@ -273,7 +376,8 @@ static inline bool pte_flags_need_flush(unsigned long oldflags, const pteval_t flush_on_clear = _PAGE_DIRTY | _PAGE_PRESENT | _PAGE_ACCESSED; const pteval_t software_flags = _PAGE_SOFTW1 | _PAGE_SOFTW2 | - _PAGE_SOFTW3 | _PAGE_SOFTW4; + _PAGE_SOFTW3 | _PAGE_SOFTW4 | + _PAGE_SAVED_DIRTY; const pteval_t flush_on_change = _PAGE_RW | _PAGE_USER | _PAGE_PWT | _PAGE_PCD | _PAGE_PSE | _PAGE_GLOBAL | _PAGE_PAT | _PAGE_PAT_LARGE | _PAGE_PKEY_BIT0 | _PAGE_PKEY_BIT1 | @@ -357,6 +461,31 @@ static inline bool 
huge_pmd_needs_flush(pmd_t oldpmd, pmd_t newpmd) } #define huge_pmd_needs_flush huge_pmd_needs_flush +#ifdef CONFIG_ADDRESS_MASKING +static inline u64 tlbstate_lam_cr3_mask(void) +{ + u64 lam = this_cpu_read(cpu_tlbstate.lam); + + return lam << X86_CR3_LAM_U57_BIT; +} + +static inline void cpu_tlbstate_update_lam(unsigned long lam, u64 untag_mask) +{ + this_cpu_write(cpu_tlbstate.lam, lam >> X86_CR3_LAM_U57_BIT); + this_cpu_write(tlbstate_untag_mask, untag_mask); +} + +#else + +static inline u64 tlbstate_lam_cr3_mask(void) +{ + return 0; +} + +static inline void cpu_tlbstate_update_lam(unsigned long lam, u64 untag_mask) +{ +} +#endif #endif /* !MODULE */ static inline void __native_tlb_flush_global(unsigned long cr4) diff --git a/arch/x86/include/asm/topology.h b/arch/x86/include/asm/topology.h index 458c891a8273..6c79ee7c0957 100644 --- a/arch/x86/include/asm/topology.h +++ b/arch/x86/include/asm/topology.h @@ -31,9 +31,9 @@ * CONFIG_NUMA. */ #include <linux/numa.h> +#include <linux/cpumask.h> #ifdef CONFIG_NUMA -#include <linux/cpumask.h> #include <asm/mpspec.h> #include <asm/percpu.h> @@ -102,31 +102,104 @@ static inline void setup_node_to_cpumask_map(void) { } #include <asm-generic/topology.h> +/* Topology information */ +enum x86_topology_domains { + TOPO_SMT_DOMAIN, + TOPO_CORE_DOMAIN, + TOPO_MODULE_DOMAIN, + TOPO_TILE_DOMAIN, + TOPO_DIE_DOMAIN, + TOPO_DIEGRP_DOMAIN, + TOPO_PKG_DOMAIN, + TOPO_MAX_DOMAIN, +}; + +enum x86_topology_cpu_type { + TOPO_CPU_TYPE_PERFORMANCE, + TOPO_CPU_TYPE_EFFICIENCY, + TOPO_CPU_TYPE_UNKNOWN, +}; + +struct x86_topology_system { + unsigned int dom_shifts[TOPO_MAX_DOMAIN]; + unsigned int dom_size[TOPO_MAX_DOMAIN]; +}; + +extern struct x86_topology_system x86_topo_system; + +static inline unsigned int topology_get_domain_size(enum x86_topology_domains dom) +{ + return x86_topo_system.dom_size[dom]; +} + +static inline unsigned int topology_get_domain_shift(enum x86_topology_domains dom) +{ + return dom == TOPO_SMT_DOMAIN ? 
0 : x86_topo_system.dom_shifts[dom - 1]; +} + extern const struct cpumask *cpu_coregroup_mask(int cpu); extern const struct cpumask *cpu_clustergroup_mask(int cpu); -#define topology_logical_package_id(cpu) (cpu_data(cpu).logical_proc_id) -#define topology_physical_package_id(cpu) (cpu_data(cpu).phys_proc_id) -#define topology_logical_die_id(cpu) (cpu_data(cpu).logical_die_id) -#define topology_die_id(cpu) (cpu_data(cpu).cpu_die_id) -#define topology_core_id(cpu) (cpu_data(cpu).cpu_core_id) +#define topology_logical_package_id(cpu) (cpu_data(cpu).topo.logical_pkg_id) +#define topology_physical_package_id(cpu) (cpu_data(cpu).topo.pkg_id) +#define topology_logical_die_id(cpu) (cpu_data(cpu).topo.logical_die_id) +#define topology_logical_core_id(cpu) (cpu_data(cpu).topo.logical_core_id) +#define topology_die_id(cpu) (cpu_data(cpu).topo.die_id) +#define topology_core_id(cpu) (cpu_data(cpu).topo.core_id) #define topology_ppin(cpu) (cpu_data(cpu).ppin) -extern unsigned int __max_die_per_package; +#define topology_amd_node_id(cpu) (cpu_data(cpu).topo.amd_node_id) + +extern unsigned int __max_dies_per_package; +extern unsigned int __max_logical_packages; +extern unsigned int __max_threads_per_core; +extern unsigned int __num_threads_per_package; +extern unsigned int __num_cores_per_package; + +const char *get_topology_cpu_type_name(struct cpuinfo_x86 *c); +enum x86_topology_cpu_type get_topology_cpu_type(struct cpuinfo_x86 *c); + +static inline unsigned int topology_max_packages(void) +{ + return __max_logical_packages; +} + +static inline unsigned int topology_max_dies_per_package(void) +{ + return __max_dies_per_package; +} + +static inline unsigned int topology_num_cores_per_package(void) +{ + return __num_cores_per_package; +} + +static inline unsigned int topology_num_threads_per_package(void) +{ + return __num_threads_per_package; +} + +#ifdef CONFIG_X86_LOCAL_APIC +int topology_get_logical_id(u32 apicid, enum x86_topology_domains at_level); +#else +static inline int 
topology_get_logical_id(u32 apicid, enum x86_topology_domains at_level) +{ + return 0; +} +#endif #ifdef CONFIG_SMP -#define topology_cluster_id(cpu) (per_cpu(cpu_l2c_id, cpu)) +#define topology_cluster_id(cpu) (cpu_data(cpu).topo.l2c_id) #define topology_die_cpumask(cpu) (per_cpu(cpu_die_map, cpu)) #define topology_cluster_cpumask(cpu) (cpu_clustergroup_mask(cpu)) #define topology_core_cpumask(cpu) (per_cpu(cpu_core_map, cpu)) #define topology_sibling_cpumask(cpu) (per_cpu(cpu_sibling_map, cpu)) -extern unsigned int __max_logical_packages; -#define topology_max_packages() (__max_logical_packages) -static inline int topology_max_die_per_package(void) +static inline int topology_phys_to_logical_pkg(unsigned int pkg) { - return __max_die_per_package; + return topology_get_logical_id(pkg << x86_topo_system.dom_shifts[TOPO_PKG_DOMAIN], + TOPO_PKG_DOMAIN); } extern int __max_smt_threads; @@ -136,26 +209,33 @@ static inline int topology_max_smt_threads(void) return __max_smt_threads; } -int topology_update_package_map(unsigned int apicid, unsigned int cpu); -int topology_update_die_map(unsigned int dieid, unsigned int cpu); -int topology_phys_to_logical_pkg(unsigned int pkg); -int topology_phys_to_logical_die(unsigned int die, unsigned int cpu); -bool topology_is_primary_thread(unsigned int cpu); -bool topology_smt_supported(void); -#else -#define topology_max_packages() (1) -static inline int -topology_update_package_map(unsigned int apicid, unsigned int cpu) { return 0; } -static inline int -topology_update_die_map(unsigned int dieid, unsigned int cpu) { return 0; } +#include <linux/cpu_smt.h> + +extern unsigned int __amd_nodes_per_pkg; + +static inline unsigned int topology_amd_nodes_per_pkg(void) +{ + return __amd_nodes_per_pkg; +} + +extern struct cpumask __cpu_primary_thread_mask; +#define cpu_primary_thread_mask ((const struct cpumask *)&__cpu_primary_thread_mask) + +/** + * topology_is_primary_thread - Check whether CPU is the primary SMT thread + * @cpu: CPU to 
check + */ +static inline bool topology_is_primary_thread(unsigned int cpu) +{ + return cpumask_test_cpu(cpu, cpu_primary_thread_mask); +} +#define topology_is_primary_thread topology_is_primary_thread + +#else /* CONFIG_SMP */ static inline int topology_phys_to_logical_pkg(unsigned int pkg) { return 0; } -static inline int topology_phys_to_logical_die(unsigned int die, - unsigned int cpu) { return 0; } -static inline int topology_max_die_per_package(void) { return 1; } static inline int topology_max_smt_threads(void) { return 1; } -static inline bool topology_is_primary_thread(unsigned int cpu) { return true; } -static inline bool topology_smt_supported(void) { return false; } -#endif +static inline unsigned int topology_amd_nodes_per_pkg(void) { return 1; } +#endif /* !CONFIG_SMP */ static inline void arch_fix_phys_package_id(int num, u32 slot) { @@ -171,7 +251,7 @@ extern bool x86_topology_update; #include <asm/percpu.h> DECLARE_PER_CPU_READ_MOSTLY(int, sched_core_priority); -extern unsigned int __read_mostly sysctl_sched_itmt_enabled; +extern bool __read_mostly sysctl_sched_itmt_enabled; /* Interface to set priority of a cpu */ void sched_set_itmt_core_prio(int prio, int core_cpu); @@ -184,7 +264,7 @@ void sched_clear_itmt_support(void); #else /* CONFIG_SCHED_MC_PRIO */ -#define sysctl_sched_itmt_enabled 0 +#define sysctl_sched_itmt_enabled false static inline void sched_set_itmt_core_prio(int prio, int core_cpu) { } @@ -212,9 +292,22 @@ static inline long arch_scale_freq_capacity(int cpu) } #define arch_scale_freq_capacity arch_scale_freq_capacity +bool arch_enable_hybrid_capacity_scale(void); +void arch_set_cpu_capacity(int cpu, unsigned long cap, unsigned long max_cap, + unsigned long cap_freq, unsigned long base_freq); + +unsigned long arch_scale_cpu_capacity(int cpu); +#define arch_scale_cpu_capacity arch_scale_cpu_capacity + extern void arch_set_max_freq_ratio(bool turbo_disabled); extern void freq_invariance_set_perf_ratio(u64 ratio, bool 
turbo_disabled); #else +static inline bool arch_enable_hybrid_capacity_scale(void) { return false; } +static inline void arch_set_cpu_capacity(int cpu, unsigned long cap, + unsigned long max_cap, + unsigned long cap_freq, + unsigned long base_freq) { } + static inline void arch_set_max_freq_ratio(bool turbo_disabled) { } static inline void freq_invariance_set_perf_ratio(u64 ratio, bool turbo_disabled) { } #endif @@ -222,9 +315,4 @@ static inline void freq_invariance_set_perf_ratio(u64 ratio, bool turbo_disabled extern void arch_scale_freq_tick(void); #define arch_scale_freq_tick arch_scale_freq_tick -#ifdef CONFIG_ACPI_CPPC_LIB -void init_freq_invariance_cppc(void); -#define arch_init_invariance_cppc init_freq_invariance_cppc -#endif - #endif /* _ASM_X86_TOPOLOGY_H */ diff --git a/arch/x86/include/asm/trace/common.h b/arch/x86/include/asm/trace/common.h deleted file mode 100644 index f0f9bcdb74d9..000000000000 --- a/arch/x86/include/asm/trace/common.h +++ /dev/null @@ -1,12 +0,0 @@ -#ifndef _ASM_TRACE_COMMON_H -#define _ASM_TRACE_COMMON_H - -#ifdef CONFIG_TRACING -DECLARE_STATIC_KEY_FALSE(trace_pagefault_key); -#define trace_pagefault_enabled() \ - static_branch_unlikely(&trace_pagefault_key) -#else -static inline bool trace_pagefault_enabled(void) { return false; } -#endif - -#endif diff --git a/arch/x86/include/asm/trace/exceptions.h b/arch/x86/include/asm/trace/exceptions.h deleted file mode 100644 index 6b1e87194809..000000000000 --- a/arch/x86/include/asm/trace/exceptions.h +++ /dev/null @@ -1,54 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -#undef TRACE_SYSTEM -#define TRACE_SYSTEM exceptions - -#if !defined(_TRACE_PAGE_FAULT_H) || defined(TRACE_HEADER_MULTI_READ) -#define _TRACE_PAGE_FAULT_H - -#include <linux/tracepoint.h> -#include <asm/trace/common.h> - -extern int trace_pagefault_reg(void); -extern void trace_pagefault_unreg(void); - -DECLARE_EVENT_CLASS(x86_exceptions, - - TP_PROTO(unsigned long address, struct pt_regs *regs, - unsigned long 
error_code), - - TP_ARGS(address, regs, error_code), - - TP_STRUCT__entry( - __field( unsigned long, address ) - __field( unsigned long, ip ) - __field( unsigned long, error_code ) - ), - - TP_fast_assign( - __entry->address = address; - __entry->ip = regs->ip; - __entry->error_code = error_code; - ), - - TP_printk("address=%ps ip=%ps error_code=0x%lx", - (void *)__entry->address, (void *)__entry->ip, - __entry->error_code) ); - -#define DEFINE_PAGE_FAULT_EVENT(name) \ -DEFINE_EVENT_FN(x86_exceptions, name, \ - TP_PROTO(unsigned long address, struct pt_regs *regs, \ - unsigned long error_code), \ - TP_ARGS(address, regs, error_code), \ - trace_pagefault_reg, trace_pagefault_unreg); - -DEFINE_PAGE_FAULT_EVENT(page_fault_user); -DEFINE_PAGE_FAULT_EVENT(page_fault_kernel); - -#undef TRACE_INCLUDE_PATH -#undef TRACE_INCLUDE_FILE -#define TRACE_INCLUDE_PATH . -#define TRACE_INCLUDE_FILE exceptions -#endif /* _TRACE_PAGE_FAULT_H */ - -/* This part must be outside protection */ -#include <trace/define_trace.h> diff --git a/arch/x86/include/asm/trace/fpu.h b/arch/x86/include/asm/trace/fpu.h index 4645a6334063..721b408d9a67 100644 --- a/arch/x86/include/asm/trace/fpu.h +++ b/arch/x86/include/asm/trace/fpu.h @@ -44,16 +44,6 @@ DEFINE_EVENT(x86_fpu, x86_fpu_after_save, TP_ARGS(fpu) ); -DEFINE_EVENT(x86_fpu, x86_fpu_before_restore, - TP_PROTO(struct fpu *fpu), - TP_ARGS(fpu) -); - -DEFINE_EVENT(x86_fpu, x86_fpu_after_restore, - TP_PROTO(struct fpu *fpu), - TP_ARGS(fpu) -); - DEFINE_EVENT(x86_fpu, x86_fpu_regs_activated, TP_PROTO(struct fpu *fpu), TP_ARGS(fpu) @@ -64,21 +54,11 @@ DEFINE_EVENT(x86_fpu, x86_fpu_regs_deactivated, TP_ARGS(fpu) ); -DEFINE_EVENT(x86_fpu, x86_fpu_init_state, - TP_PROTO(struct fpu *fpu), - TP_ARGS(fpu) -); - DEFINE_EVENT(x86_fpu, x86_fpu_dropped, TP_PROTO(struct fpu *fpu), TP_ARGS(fpu) ); -DEFINE_EVENT(x86_fpu, x86_fpu_copy_src, - TP_PROTO(struct fpu *fpu), - TP_ARGS(fpu) -); - DEFINE_EVENT(x86_fpu, x86_fpu_copy_dst, TP_PROTO(struct fpu *fpu), 
TP_ARGS(fpu) diff --git a/arch/x86/include/asm/trace/irq_vectors.h b/arch/x86/include/asm/trace/irq_vectors.h index 88e7f0f3bf62..7408bebdfde0 100644 --- a/arch/x86/include/asm/trace/irq_vectors.h +++ b/arch/x86/include/asm/trace/irq_vectors.h @@ -6,7 +6,6 @@ #define _TRACE_IRQ_VECTORS_H #include <linux/tracepoint.h> -#include <asm/trace/common.h> #ifdef CONFIG_X86_LOCAL_APIC diff --git a/arch/x86/include/asm/trap_pf.h b/arch/x86/include/asm/trap_pf.h index 10b1de500ab1..a23a7b707b64 100644 --- a/arch/x86/include/asm/trap_pf.h +++ b/arch/x86/include/asm/trap_pf.h @@ -2,6 +2,8 @@ #ifndef _ASM_X86_TRAP_PF_H #define _ASM_X86_TRAP_PF_H +#include <linux/bits.h> + /* * Page fault error code bits: * @@ -11,16 +13,20 @@ * bit 3 == 1: use of reserved bit detected * bit 4 == 1: fault was an instruction fetch * bit 5 == 1: protection keys block access + * bit 6 == 1: shadow stack access fault * bit 15 == 1: SGX MMU page-fault + * bit 31 == 1: fault was due to RMP violation */ enum x86_pf_error_code { - X86_PF_PROT = 1 << 0, - X86_PF_WRITE = 1 << 1, - X86_PF_USER = 1 << 2, - X86_PF_RSVD = 1 << 3, - X86_PF_INSTR = 1 << 4, - X86_PF_PK = 1 << 5, - X86_PF_SGX = 1 << 15, + X86_PF_PROT = BIT(0), + X86_PF_WRITE = BIT(1), + X86_PF_USER = BIT(2), + X86_PF_RSVD = BIT(3), + X86_PF_INSTR = BIT(4), + X86_PF_PK = BIT(5), + X86_PF_SHSTK = BIT(6), + X86_PF_SGX = BIT(15), + X86_PF_RMP = BIT(31), }; #endif /* _ASM_X86_TRAP_PF_H */ diff --git a/arch/x86/include/asm/trapnr.h b/arch/x86/include/asm/trapnr.h index f5d2325aa0b7..8d1154cdf787 100644 --- a/arch/x86/include/asm/trapnr.h +++ b/arch/x86/include/asm/trapnr.h @@ -2,6 +2,18 @@ #ifndef _ASM_X86_TRAPNR_H #define _ASM_X86_TRAPNR_H +/* + * Event type codes used by FRED, Intel VT-x and AMD SVM + */ +#define EVENT_TYPE_EXTINT 0 // External interrupt +#define EVENT_TYPE_RESERVED 1 +#define EVENT_TYPE_NMI 2 // NMI +#define EVENT_TYPE_HWEXC 3 // Hardware originated traps, exceptions +#define EVENT_TYPE_SWINT 4 // INT n +#define EVENT_TYPE_PRIV_SWEXC 
5 // INT1 +#define EVENT_TYPE_SWEXC 6 // INTO, INT3 +#define EVENT_TYPE_OTHER 7 // FRED SYSCALL/SYSENTER, VT-x MTF + /* Interrupts/Exceptions */ #define X86_TRAP_DE 0 /* Divide-by-zero */ diff --git a/arch/x86/include/asm/traps.h b/arch/x86/include/asm/traps.h index 47ecfff2c83d..869b88061801 100644 --- a/arch/x86/include/asm/traps.h +++ b/arch/x86/include/asm/traps.h @@ -14,11 +14,11 @@ asmlinkage __visible notrace struct pt_regs *sync_regs(struct pt_regs *eregs); asmlinkage __visible notrace struct pt_regs *fixup_bad_iret(struct pt_regs *bad_regs); -void __init trap_init(void); asmlinkage __visible noinstr struct pt_regs *vc_switch_off_ist(struct pt_regs *eregs); #endif -extern bool ibt_selftest(void); +extern int ibt_selftest(void); +extern int ibt_selftest_noendbr(void); #ifdef CONFIG_X86_F00F_BUG /* For handling the FOOF bug */ @@ -35,8 +35,6 @@ static inline int get_si_code(unsigned long condition) return TRAP_BRKPT; } -extern int panic_on_unrecovered_nmi; - void math_emulate(struct math_emu_info *); bool fault_in_kernel_space(unsigned long address); @@ -47,4 +45,16 @@ void __noreturn handle_stack_overflow(struct pt_regs *regs, struct stack_info *info); #endif +static inline void cond_local_irq_enable(struct pt_regs *regs) +{ + if (regs->flags & X86_EFLAGS_IF) + local_irq_enable(); +} + +static inline void cond_local_irq_disable(struct pt_regs *regs) +{ + if (regs->flags & X86_EFLAGS_IF) + local_irq_disable(); +} + #endif /* _ASM_X86_TRAPS_H */ diff --git a/arch/x86/include/asm/tsc.h b/arch/x86/include/asm/tsc.h index fbdc3d951494..4f7f09f50552 100644 --- a/arch/x86/include/asm/tsc.h +++ b/arch/x86/include/asm/tsc.h @@ -5,8 +5,64 @@ #ifndef _ASM_X86_TSC_H #define _ASM_X86_TSC_H -#include <asm/processor.h> +#include <asm/asm.h> #include <asm/cpufeature.h> +#include <asm/processor.h> +#include <asm/msr.h> + +/** + * rdtsc() - returns the current TSC without ordering constraints + * + * rdtsc() returns the result of RDTSC as a 64-bit integer. 
The + * only ordering constraint it supplies is the ordering implied by + * "asm volatile": it will put the RDTSC in the place you expect. The + * CPU can and will speculatively execute that RDTSC, though, so the + * results can be non-monotonic if compared on different CPUs. + */ +static __always_inline u64 rdtsc(void) +{ + EAX_EDX_DECLARE_ARGS(val, low, high); + + asm volatile("rdtsc" : EAX_EDX_RET(val, low, high)); + + return EAX_EDX_VAL(val, low, high); +} + +/** + * rdtsc_ordered() - read the current TSC in program order + * + * rdtsc_ordered() returns the result of RDTSC as a 64-bit integer. + * It is ordered like a load to a global in-memory counter. It should + * be impossible to observe non-monotonic rdtsc_ordered() behavior + * across multiple CPUs as long as the TSC is synced. + */ +static __always_inline u64 rdtsc_ordered(void) +{ + EAX_EDX_DECLARE_ARGS(val, low, high); + + /* + * The RDTSC instruction is not ordered relative to memory + * access. The Intel SDM and the AMD APM are both vague on this + * point, but empirically an RDTSC instruction can be + * speculatively executed before prior loads. An RDTSC + * immediately after an appropriate barrier appears to be + * ordered as a normal load, that is, it provides the same + * ordering guarantees as reading from a global memory location + * that some other imaginary CPU is updating continuously with a + * time stamp. + * + * Thus, use the preferred barrier on the respective CPU, aiming for + * RDTSCP as the default. + */ + asm volatile(ALTERNATIVE_2("rdtsc", + "lfence; rdtsc", X86_FEATURE_LFENCE_RDTSC, + "rdtscp", X86_FEATURE_RDTSCP) + : EAX_EDX_RET(val, low, high) + /* RDTSCP clobbers ECX with MSR_TSC_AUX. */ + :: "ecx"); + + return EAX_EDX_VAL(val, low, high); +} /* * Standard way to access the cycle counter. 
@@ -27,12 +83,8 @@ static inline cycles_t get_cycles(void) } #define get_cycles get_cycles -extern struct system_counterval_t convert_art_to_tsc(u64 art); -extern struct system_counterval_t convert_art_ns_to_tsc(u64 art_ns); - extern void tsc_early_init(void); extern void tsc_init(void); -extern unsigned long calibrate_delay_is_known(void); extern void mark_tsc_unstable(char *reason); extern int unsynchronized_tsc(void); extern int check_tsc_unstable(void); @@ -55,12 +107,10 @@ extern bool tsc_async_resets; #ifdef CONFIG_X86_TSC extern bool tsc_store_and_check_tsc_adjust(bool bootcpu); extern void tsc_verify_tsc_adjust(bool resume); -extern void check_tsc_sync_source(int cpu); extern void check_tsc_sync_target(void); #else static inline bool tsc_store_and_check_tsc_adjust(bool bootcpu) { return false; } static inline void tsc_verify_tsc_adjust(bool resume) { } -static inline void check_tsc_sync_source(int cpu) { } static inline void check_tsc_sync_target(void) { } #endif diff --git a/arch/x86/include/asm/uaccess.h b/arch/x86/include/asm/uaccess.h index 913e593a3b45..3a7755c1a441 100644 --- a/arch/x86/include/asm/uaccess.h +++ b/arch/x86/include/asm/uaccess.h @@ -5,44 +5,23 @@ * User space memory access functions */ #include <linux/compiler.h> +#include <linux/instrumented.h> #include <linux/kasan-checks.h> +#include <linux/mm_types.h> #include <linux/string.h> +#include <linux/mmap_lock.h> #include <asm/asm.h> #include <asm/page.h> #include <asm/smap.h> #include <asm/extable.h> +#include <asm/tlbflush.h> -#ifdef CONFIG_DEBUG_ATOMIC_SLEEP -static inline bool pagefault_disabled(void); -# define WARN_ON_IN_IRQ() \ - WARN_ON_ONCE(!in_task() && !pagefault_disabled()) +#ifdef CONFIG_X86_32 +# include <asm/uaccess_32.h> #else -# define WARN_ON_IN_IRQ() +# include <asm/uaccess_64.h> #endif -/** - * access_ok - Checks if a user space pointer is valid - * @addr: User space pointer to start of block to check - * @size: Size of block to check - * - * Context: User context 
only. This function may sleep if pagefaults are - * enabled. - * - * Checks if a pointer to a block of memory in user space is valid. - * - * Note that, depending on architecture, this function probably just - * checks that the pointer is in the user space range - after calling - * this function, memory access functions may still return -EFAULT. - * - * Return: true (nonzero) if the memory block may be valid, false (zero) - * if it is definitely invalid. - */ -#define access_ok(addr, size) \ -({ \ - WARN_ON_IN_IRQ(); \ - likely(__access_ok(addr, size)); \ -}) - #include <asm-generic/access_ok.h> extern int __get_user_1(void); @@ -99,10 +78,11 @@ extern int __get_user_bad(void); int __ret_gu; \ register __inttype(*(ptr)) __val_gu asm("%"_ASM_DX); \ __chk_user_ptr(ptr); \ - asm volatile("call __" #fn "_%P4" \ + asm volatile("call __" #fn "_%c[size]" \ : "=a" (__ret_gu), "=r" (__val_gu), \ ASM_CALL_CONSTRAINT \ - : "0" (ptr), "i" (sizeof(*(ptr)))); \ + : "0" (ptr), [size] "i" (sizeof(*(ptr)))); \ + instrument_get_user(__val_gu); \ (x) = (__force __typeof__(*(ptr))) __val_gu; \ __builtin_expect(__ret_gu, 0); \ }) @@ -153,7 +133,7 @@ extern int __get_user_bad(void); #ifdef CONFIG_X86_32 #define __put_user_goto_u64(x, addr, label) \ - asm_volatile_goto("\n" \ + asm goto("\n" \ "1: movl %%eax,0(%1)\n" \ "2: movl %%edx,4(%1)\n" \ _ASM_EXTABLE_UA(1b, %l2) \ @@ -192,16 +172,19 @@ extern void __put_user_nocheck_8(void); int __ret_pu; \ void __user *__ptr_pu; \ register __typeof__(*(ptr)) __val_pu asm("%"_ASM_AX); \ - __chk_user_ptr(ptr); \ - __ptr_pu = (ptr); \ - __val_pu = (x); \ - asm volatile("call __" #fn "_%P[size]" \ + __typeof__(*(ptr)) __x = (x); /* eval x once */ \ + __typeof__(ptr) __ptr = (ptr); /* eval ptr once */ \ + __chk_user_ptr(__ptr); \ + __ptr_pu = __ptr; \ + __val_pu = __x; \ + asm volatile("call __" #fn "_%c[size]" \ : "=c" (__ret_pu), \ ASM_CALL_CONSTRAINT \ : "0" (__ptr_pu), \ "r" (__val_pu), \ [size] "i" (sizeof(*(ptr))) \ :"ebx"); \ + 
instrument_put_user(__x, __ptr, sizeof(*(ptr))); \ __builtin_expect(__ret_pu, 0); \ }) @@ -248,23 +231,26 @@ extern void __put_user_nocheck_8(void); #define __put_user_size(x, ptr, size, label) \ do { \ - __chk_user_ptr(ptr); \ + __typeof__(*(ptr)) __x = (x); /* eval x once */ \ + __typeof__(ptr) __ptr = (ptr); /* eval ptr once */ \ + __chk_user_ptr(__ptr); \ switch (size) { \ case 1: \ - __put_user_goto(x, ptr, "b", "iq", label); \ + __put_user_goto(__x, __ptr, "b", "iq", label); \ break; \ case 2: \ - __put_user_goto(x, ptr, "w", "ir", label); \ + __put_user_goto(__x, __ptr, "w", "ir", label); \ break; \ case 4: \ - __put_user_goto(x, ptr, "l", "ir", label); \ + __put_user_goto(__x, __ptr, "l", "ir", label); \ break; \ case 8: \ - __put_user_goto_u64(x, ptr, label); \ + __put_user_goto_u64(__x, __ptr, label); \ break; \ default: \ __put_user_bad(); \ } \ + instrument_put_user(__x, __ptr, size); \ } while (0) #ifdef CONFIG_CC_HAS_ASM_GOTO_OUTPUT @@ -305,10 +291,11 @@ do { \ default: \ (x) = __get_user_bad(); \ } \ + instrument_get_user(x); \ } while (0) #define __get_user_asm(x, addr, itype, ltype, label) \ - asm_volatile_goto("\n" \ + asm_goto_output("\n" \ "1: mov"itype" %[umem],%[output]\n" \ _ASM_EXTABLE_UA(1b, %l2) \ : [output] ltype(x) \ @@ -388,7 +375,7 @@ do { \ __typeof__(_ptr) _old = (__typeof__(_ptr))(_pold); \ __typeof__(*(_ptr)) __old = *_old; \ __typeof__(*(_ptr)) __new = (_new); \ - asm_volatile_goto("\n" \ + asm_goto_output("\n" \ "1: " LOCK_PREFIX "cmpxchg"itype" %[new], %[ptr]\n"\ _ASM_EXTABLE_UA(1b, %l[label]) \ : CC_OUT(z) (success), \ @@ -407,7 +394,7 @@ do { \ __typeof__(_ptr) _old = (__typeof__(_ptr))(_pold); \ __typeof__(*(_ptr)) __old = *_old; \ __typeof__(*(_ptr)) __new = (_new); \ - asm_volatile_goto("\n" \ + asm_goto_output("\n" \ "1: " LOCK_PREFIX "cmpxchg8b %[ptr]\n" \ _ASM_EXTABLE_UA(1b, %l[label]) \ : CC_OUT(z) (success), \ @@ -448,7 +435,7 @@ do { \ #ifdef CONFIG_X86_32 /* - * Unlike the normal CMPXCHG, hardcode ECX for both 
success/fail and error. + * Unlike the normal CMPXCHG, use output GPR for both success/fail and error. * There are only six GPRs available and four (EAX, EBX, ECX, and EDX) are * hardcoded by CMPXCHG8B, leaving only ESI and EDI. If the compiler uses * both ESI and EDI for the memory operand, compilation will fail if the error @@ -461,11 +448,12 @@ do { \ __typeof__(*(_ptr)) __new = (_new); \ asm volatile("\n" \ "1: " LOCK_PREFIX "cmpxchg8b %[ptr]\n" \ - "mov $0, %%ecx\n\t" \ - "setz %%cl\n" \ + "mov $0, %[result]\n\t" \ + "setz %b[result]\n" \ "2:\n" \ - _ASM_EXTABLE_TYPE_REG(1b, 2b, EX_TYPE_EFAULT_REG, %%ecx) \ - : [result]"=c" (__result), \ + _ASM_EXTABLE_TYPE_REG(1b, 2b, EX_TYPE_EFAULT_REG, \ + %[result]) \ + : [result] "=q" (__result), \ "+A" (__old), \ [ptr] "+m" (*_ptr) \ : "b" ((u32)__new), \ @@ -489,7 +477,7 @@ struct __large_struct { unsigned long buf[100]; }; * aliasing issues. */ #define __put_user_goto(x, addr, itype, ltype, label) \ - asm_volatile_goto("\n" \ + asm goto("\n" \ "1: mov"itype" %0,%1\n" \ _ASM_EXTABLE_UA(1b, %l2) \ : : ltype(x), "m" (__m(addr)) \ @@ -502,16 +490,13 @@ strncpy_from_user(char *dst, const char __user *src, long count); extern __must_check long strnlen_user(const char __user *str, long n); -unsigned long __must_check clear_user(void __user *mem, unsigned long len); -unsigned long __must_check __clear_user(void __user *mem, unsigned long len); - #ifdef CONFIG_ARCH_HAS_COPY_MC unsigned long __must_check copy_mc_to_kernel(void *to, const void *from, unsigned len); #define copy_mc_to_kernel copy_mc_to_kernel unsigned long __must_check -copy_mc_to_user(void *to, const void *from, unsigned len); +copy_mc_to_user(void __user *to, const void *from, unsigned len); #endif /* @@ -525,12 +510,6 @@ extern struct movsl_mask { #define ARCH_HAS_NOCACHE_UACCESS 1 -#ifdef CONFIG_X86_32 -# include <asm/uaccess_32.h> -#else -# include <asm/uaccess_64.h> -#endif - /* * The "unsafe" user accesses aren't really "unsafe", but the naming * is a big 
fat warning: you have to not only do the access_ok() diff --git a/arch/x86/include/asm/uaccess_32.h b/arch/x86/include/asm/uaccess_32.h index 388a40660c7b..40379a1adbb8 100644 --- a/arch/x86/include/asm/uaccess_32.h +++ b/arch/x86/include/asm/uaccess_32.h @@ -33,4 +33,7 @@ __copy_from_user_inatomic_nocache(void *to, const void __user *from, return __copy_from_user_ll_nocache_nozero(to, from, n); } +unsigned long __must_check clear_user(void __user *mem, unsigned long len); +unsigned long __must_check __clear_user(void __user *mem, unsigned long len); + #endif /* _ASM_X86_UACCESS_32_H */ diff --git a/arch/x86/include/asm/uaccess_64.h b/arch/x86/include/asm/uaccess_64.h index 45697e04d771..c8a5ae35c871 100644 --- a/arch/x86/include/asm/uaccess_64.h +++ b/arch/x86/include/asm/uaccess_64.h @@ -11,6 +11,101 @@ #include <asm/alternative.h> #include <asm/cpufeatures.h> #include <asm/page.h> +#include <asm/percpu.h> +#include <asm/runtime-const.h> + +/* + * Virtual variable: there's no actual backing store for this, + * it can purely be used as 'runtime_const_ptr(USER_PTR_MAX)' + */ +extern unsigned long USER_PTR_MAX; + +#ifdef CONFIG_ADDRESS_MASKING +/* + * Mask out tag bits from the address. 
+ */ +static inline unsigned long __untagged_addr(unsigned long addr) +{ + asm_inline (ALTERNATIVE("", "and " __percpu_arg([mask]) ", %[addr]", + X86_FEATURE_LAM) + : [addr] "+r" (addr) + : [mask] "m" (__my_cpu_var(tlbstate_untag_mask))); + + return addr; +} + +#define untagged_addr(addr) ({ \ + unsigned long __addr = (__force unsigned long)(addr); \ + (__force __typeof__(addr))__untagged_addr(__addr); \ +}) + +static inline unsigned long __untagged_addr_remote(struct mm_struct *mm, + unsigned long addr) +{ + mmap_assert_locked(mm); + return addr & (mm)->context.untag_mask; +} + +#define untagged_addr_remote(mm, addr) ({ \ + unsigned long __addr = (__force unsigned long)(addr); \ + (__force __typeof__(addr))__untagged_addr_remote(mm, __addr); \ +}) + +#endif + +#define valid_user_address(x) \ + likely((__force unsigned long)(x) <= runtime_const_ptr(USER_PTR_MAX)) + +/* + * Masking the user address is an alternative to a conditional + * user_access_begin that can avoid the fencing. This only works + * for dense accesses starting at the address. + */ +static inline void __user *mask_user_address(const void __user *ptr) +{ + void __user *ret; + asm("cmp %1,%0\n\t" + "cmova %1,%0" + :"=r" (ret) + :"r" (runtime_const_ptr(USER_PTR_MAX)), + "0" (ptr)); + return ret; +} +#define masked_user_access_begin(x) ({ \ + __auto_type __masked_ptr = (x); \ + __masked_ptr = mask_user_address(__masked_ptr); \ + __uaccess_begin(); __masked_ptr; }) + +/* + * User pointers can have tag bits on x86-64. This scheme tolerates + * arbitrary values in those bits rather than masking them off. + * + * Enforce two rules: + * 1. 'ptr' must be in the user part of the address space + * 2. 'ptr+size' must not overflow into kernel addresses + * + * Note that we always have at least one guard page between the + * max user address and the non-canonical gap, allowing us to + * ignore small sizes entirely. 
+ * + * In fact, we could probably remove the size check entirely, since + * any kernel accesses will be in increasing address order starting + * at 'ptr'. + * + * That's a separate optimization, for now just handle the small + * constant case. + */ +static inline bool __access_ok(const void __user *ptr, unsigned long size) +{ + if (__builtin_constant_p(size <= PAGE_SIZE) && size <= PAGE_SIZE) { + return valid_user_address(ptr); + } else { + unsigned long sum = size + (__force unsigned long)ptr; + + return valid_user_address(sum) && sum >= (__force unsigned long)ptr; + } +} +#define __access_ok __access_ok /* * Copy To/From Userspace @@ -18,32 +113,26 @@ /* Handles exceptions in both to and from, but doesn't do access_ok */ __must_check unsigned long -copy_user_enhanced_fast_string(void *to, const void *from, unsigned len); -__must_check unsigned long -copy_user_generic_string(void *to, const void *from, unsigned len); -__must_check unsigned long -copy_user_generic_unrolled(void *to, const void *from, unsigned len); +rep_movs_alternative(void *to, const void *from, unsigned len); static __always_inline __must_check unsigned long -copy_user_generic(void *to, const void *from, unsigned len) +copy_user_generic(void *to, const void *from, unsigned long len) { - unsigned ret; - + stac(); /* - * If CPU has ERMS feature, use copy_user_enhanced_fast_string. - * Otherwise, if CPU has rep_good feature, use copy_user_generic_string. - * Otherwise, use copy_user_generic_unrolled. + * If CPU has FSRM feature, use 'rep movs'. + * Otherwise, use rep_movs_alternative. 
*/ - alternative_call_2(copy_user_generic_unrolled, - copy_user_generic_string, - X86_FEATURE_REP_GOOD, - copy_user_enhanced_fast_string, - X86_FEATURE_ERMS, - ASM_OUTPUT2("=a" (ret), "=D" (to), "=S" (from), - "=d" (len)), - "1" (to), "2" (from), "3" (len) - : "memory", "rcx", "r8", "r9", "r10", "r11"); - return ret; + asm volatile( + "1:\n\t" + ALTERNATIVE("rep movsb", + "call rep_movs_alternative", ALT_NOT(X86_FEATURE_FSRM)) + "2:\n" + _ASM_EXTABLE_UA(1b, 2b) + :"+c" (len), "+D" (to), "+S" (from), ASM_CALL_CONSTRAINT + : : "memory", "rax"); + clac(); + return len; } static __always_inline __must_check unsigned long @@ -58,19 +147,19 @@ raw_copy_to_user(void __user *dst, const void *src, unsigned long size) return copy_user_generic((__force void *)dst, src, size); } -extern long __copy_user_nocache(void *dst, const void __user *src, - unsigned size, int zerorest); - +extern long __copy_user_nocache(void *dst, const void __user *src, unsigned size); extern long __copy_user_flushcache(void *dst, const void __user *src, unsigned size); -extern void memcpy_page_flushcache(char *to, struct page *page, size_t offset, - size_t len); static inline int __copy_from_user_inatomic_nocache(void *dst, const void __user *src, unsigned size) { + long ret; kasan_check_write(dst, size); - return __copy_user_nocache(dst, src, size, 0); + stac(); + ret = __copy_user_nocache(dst, src, size); + clac(); + return ret; } static inline int @@ -79,4 +168,41 @@ __copy_from_user_flushcache(void *dst, const void __user *src, unsigned size) kasan_check_write(dst, size); return __copy_user_flushcache(dst, src, size); } + +/* + * Zero Userspace. + */ + +__must_check unsigned long +rep_stos_alternative(void __user *addr, unsigned long len); + +static __always_inline __must_check unsigned long __clear_user(void __user *addr, unsigned long size) +{ + might_fault(); + stac(); + + /* + * No memory constraint because it doesn't change any memory gcc + * knows about. 
+ */ + asm volatile( + "1:\n\t" + ALTERNATIVE("rep stosb", + "call rep_stos_alternative", ALT_NOT(X86_FEATURE_FSRS)) + "2:\n" + _ASM_EXTABLE_UA(1b, 2b) + : "+c" (size), "+D" (addr), ASM_CALL_CONSTRAINT + : "a" (0)); + + clac(); + + return size; +} + +static __always_inline unsigned long clear_user(void __user *to, unsigned long n) +{ + if (__access_ok(to, n)) + return __clear_user(to, n); + return n; +} #endif /* _ASM_X86_UACCESS_64_H */ diff --git a/arch/x86/include/asm/unaccepted_memory.h b/arch/x86/include/asm/unaccepted_memory.h new file mode 100644 index 000000000000..f5937e9866ac --- /dev/null +++ b/arch/x86/include/asm/unaccepted_memory.h @@ -0,0 +1,27 @@ +#ifndef _ASM_X86_UNACCEPTED_MEMORY_H +#define _ASM_X86_UNACCEPTED_MEMORY_H + +#include <linux/efi.h> +#include <asm/tdx.h> +#include <asm/sev.h> + +static inline void arch_accept_memory(phys_addr_t start, phys_addr_t end) +{ + /* Platform-specific memory-acceptance call goes here */ + if (cpu_feature_enabled(X86_FEATURE_TDX_GUEST)) { + if (!tdx_accept_memory(start, end)) + panic("TDX: Failed to accept memory\n"); + } else if (cc_platform_has(CC_ATTR_GUEST_SEV_SNP)) { + snp_accept_memory(start, end); + } else { + panic("Cannot accept memory: unknown platform\n"); + } +} + +static inline struct efi_unaccepted_memory *efi_get_unaccepted_table(void) +{ + if (efi.unaccepted == EFI_INVALID_TABLE_ADDR) + return NULL; + return __va(efi.unaccepted); +} +#endif diff --git a/arch/x86/include/asm/unistd.h b/arch/x86/include/asm/unistd.h index 761173ccc33c..6c9e5bdd3916 100644 --- a/arch/x86/include/asm/unistd.h +++ b/arch/x86/include/asm/unistd.h @@ -56,6 +56,5 @@ # define __ARCH_WANT_SYS_FORK # define __ARCH_WANT_SYS_VFORK # define __ARCH_WANT_SYS_CLONE -# define __ARCH_WANT_SYS_CLONE3 #endif /* _ASM_X86_UNISTD_H */ diff --git a/arch/x86/include/asm/unwind_hints.h b/arch/x86/include/asm/unwind_hints.h index f66fbe6537dd..8f4579c5a6f8 100644 --- a/arch/x86/include/asm/unwind_hints.h +++ 
b/arch/x86/include/asm/unwind_hints.h @@ -5,17 +5,22 @@ #include "orc_types.h" -#ifdef __ASSEMBLY__ +#ifdef __ASSEMBLER__ -.macro UNWIND_HINT_EMPTY - UNWIND_HINT type=UNWIND_HINT_TYPE_CALL end=1 +.macro UNWIND_HINT_END_OF_STACK + UNWIND_HINT type=UNWIND_HINT_TYPE_END_OF_STACK +.endm + +.macro UNWIND_HINT_UNDEFINED + UNWIND_HINT type=UNWIND_HINT_TYPE_UNDEFINED .endm .macro UNWIND_HINT_ENTRY - UNWIND_HINT type=UNWIND_HINT_TYPE_ENTRY end=1 + VALIDATE_UNRET_BEGIN + UNWIND_HINT_END_OF_STACK .endm -.macro UNWIND_HINT_REGS base=%rsp offset=0 indirect=0 extra=1 partial=0 +.macro UNWIND_HINT_REGS base=%rsp offset=0 indirect=0 extra=1 partial=0 signal=1 .if \base == %rsp .if \indirect .set sp_reg, ORC_REG_SP_INDIRECT @@ -45,11 +50,16 @@ .set type, UNWIND_HINT_TYPE_REGS .endif - UNWIND_HINT sp_reg=sp_reg sp_offset=sp_offset type=type + UNWIND_HINT sp_reg=sp_reg sp_offset=sp_offset type=type signal=\signal +.endm + +.macro UNWIND_HINT_IRET_REGS base=%rsp offset=0 signal=1 + UNWIND_HINT_REGS base=\base offset=\offset partial=1 signal=\signal .endm -.macro UNWIND_HINT_IRET_REGS base=%rsp offset=0 - UNWIND_HINT_REGS base=\base offset=\offset partial=1 +.macro UNWIND_HINT_IRET_ENTRY base=%rsp offset=0 signal=1 + VALIDATE_UNRET_BEGIN + UNWIND_HINT_IRET_REGS base=\base offset=\offset signal=\signal .endm .macro UNWIND_HINT_FUNC @@ -66,9 +76,18 @@ #else +#define UNWIND_HINT_UNDEFINED \ + UNWIND_HINT(UNWIND_HINT_TYPE_UNDEFINED, 0, 0, 0) + #define UNWIND_HINT_FUNC \ - UNWIND_HINT(ORC_REG_SP, 8, UNWIND_HINT_TYPE_FUNC, 0) + UNWIND_HINT(UNWIND_HINT_TYPE_FUNC, ORC_REG_SP, 8, 0) + +#define UNWIND_HINT_SAVE \ + UNWIND_HINT(UNWIND_HINT_TYPE_SAVE, 0, 0, 0) + +#define UNWIND_HINT_RESTORE \ + UNWIND_HINT(UNWIND_HINT_TYPE_RESTORE, 0, 0, 0) -#endif /* __ASSEMBLY__ */ +#endif /* __ASSEMBLER__ */ #endif /* _ASM_X86_UNWIND_HINTS_H */ diff --git a/arch/x86/include/asm/uv/bios.h b/arch/x86/include/asm/uv/bios.h index 1b6455f881f9..6989b824fd32 100644 --- a/arch/x86/include/asm/uv/bios.h +++ 
b/arch/x86/include/asm/uv/bios.h @@ -10,6 +10,7 @@ * Copyright (c) Russ Anderson <rja@sgi.com> */ +#include <linux/efi.h> #include <linux/rtc.h> /* @@ -115,7 +116,8 @@ struct uv_arch_type_entry { struct uv_systab { char signature[4]; /* must be UV_SYSTAB_SIG */ u32 revision; /* distinguish different firmware revs */ - u64 function; /* BIOS runtime callback function ptr */ + u64 (__efiapi *function)(enum uv_bios_cmd, ...); + /* BIOS runtime callback function ptr */ u32 size; /* systab size (starting with _VERSION_UV4) */ struct { u32 type:8; /* type of entry */ diff --git a/arch/x86/include/asm/uv/uv_hub.h b/arch/x86/include/asm/uv/uv_hub.h index d3e3197917be..ea877fd83114 100644 --- a/arch/x86/include/asm/uv/uv_hub.h +++ b/arch/x86/include/asm/uv/uv_hub.h @@ -177,6 +177,7 @@ struct uv_hub_info_s { unsigned short nr_possible_cpus; unsigned short nr_online_cpus; short memory_nid; + unsigned short *node_to_socket; }; /* CPU specific info with a pointer to the hub common info struct */ @@ -519,25 +520,30 @@ static inline int uv_socket_to_node(int socket) return _uv_socket_to_node(socket, uv_hub_info->socket_to_node); } +static inline int uv_pnode_to_socket(int pnode) +{ + unsigned short *p2s = uv_hub_info->pnode_to_socket; + + return p2s ? p2s[pnode - uv_hub_info->min_pnode] : pnode; +} + /* pnode, offset --> socket virtual */ static inline void *uv_pnode_offset_to_vaddr(int pnode, unsigned long offset) { unsigned int m_val = uv_hub_info->m_val; unsigned long base; - unsigned short sockid, node, *p2s; + unsigned short sockid; if (m_val) return __va(((unsigned long)pnode << m_val) | offset); - p2s = uv_hub_info->pnode_to_socket; - sockid = p2s ? 
p2s[pnode - uv_hub_info->min_pnode] : pnode; - node = uv_socket_to_node(sockid); + sockid = uv_pnode_to_socket(pnode); /* limit address of previous socket is our base, except node 0 is 0 */ - if (!node) + if (sockid == 0) return __va((unsigned long)offset); - base = (unsigned long)(uv_hub_info->gr_table[node - 1].limit); + base = (unsigned long)(uv_hub_info->gr_table[sockid - 1].limit); return __va(base << UV_GAM_RANGE_SHFT | offset); } @@ -644,10 +650,10 @@ static inline int uv_cpu_blade_processor_id(int cpu) /* Blade number to Node number (UV2..UV4 is 1:1) */ static inline int uv_blade_to_node(int blade) { - return blade; + return uv_socket_to_node(blade); } -/* Blade number of current cpu. Numnbered 0 .. <#blades -1> */ +/* Blade number of current cpu. Numbered 0 .. <#blades -1> */ static inline int uv_numa_blade_id(void) { return uv_hub_info->numa_blade_id; @@ -656,23 +662,27 @@ static inline int uv_numa_blade_id(void) /* * Convert linux node number to the UV blade number. * .. Currently for UV2 thru UV4 the node and the blade are identical. - * .. If this changes then you MUST check references to this function! + * .. UV5 needs conversion when sub-numa clustering is enabled. */ static inline int uv_node_to_blade_id(int nid) { - return nid; + unsigned short *n2s = uv_hub_info->node_to_socket; + + return n2s ? n2s[nid] : nid; } /* Convert a CPU number to the UV blade number */ static inline int uv_cpu_to_blade_id(int cpu) { - return uv_node_to_blade_id(cpu_to_node(cpu)); + return uv_cpu_hub_info(cpu)->numa_blade_id; } /* Convert a blade id to the PNODE of the blade */ static inline int uv_blade_to_pnode(int bid) { - return uv_hub_info_list(uv_blade_to_node(bid))->pnode; + unsigned short *s2p = uv_hub_info->socket_to_pnode; + + return s2p ? s2p[bid] : bid; } /* Nid of memory node on blade. 
-1 if no blade-local memory */ diff --git a/arch/x86/include/asm/uv/uv_irq.h b/arch/x86/include/asm/uv/uv_irq.h index d6b17c760622..1876b5edd142 100644 --- a/arch/x86/include/asm/uv/uv_irq.h +++ b/arch/x86/include/asm/uv/uv_irq.h @@ -31,7 +31,6 @@ enum { UV_AFFINITY_CPU }; -extern int uv_irq_2_mmr_info(int, unsigned long *, int *); extern int uv_setup_irq(char *, int, int, unsigned long, int); extern void uv_teardown_irq(unsigned int); diff --git a/arch/x86/include/asm/uv/uv_mmrs.h b/arch/x86/include/asm/uv/uv_mmrs.h index 57fa67373262..bb45812889dd 100644 --- a/arch/x86/include/asm/uv/uv_mmrs.h +++ b/arch/x86/include/asm/uv/uv_mmrs.h @@ -4199,6 +4199,13 @@ union uvh_rh_gam_mmioh_overlay_config1_u { #define UV3H_RH_GAM_MMIOH_REDIRECT_CONFIG0_NASID_SHFT 0 #define UV3H_RH_GAM_MMIOH_REDIRECT_CONFIG0_NASID_MASK 0x0000000000007fffUL +/* UVH common defines */ +#define UVH_RH_GAM_MMIOH_REDIRECT_CONFIG0_NASID_MASK ( \ + is_uv(UV4A) ? UV4AH_RH_GAM_MMIOH_REDIRECT_CONFIG0_NASID_MASK : \ + is_uv(UV4) ? UV4H_RH_GAM_MMIOH_REDIRECT_CONFIG0_NASID_MASK : \ + is_uv(UV3) ? UV3H_RH_GAM_MMIOH_REDIRECT_CONFIG0_NASID_MASK : \ + 0) + union uvh_rh_gam_mmioh_redirect_config0_u { unsigned long v; @@ -4247,8 +4254,8 @@ union uvh_rh_gam_mmioh_redirect_config0_u { 0) /* UV4A unique defines */ -#define UV4AH_RH_GAM_MMIOH_REDIRECT_CONFIG0_NASID_SHFT 0 -#define UV4AH_RH_GAM_MMIOH_REDIRECT_CONFIG0_NASID_MASK 0x0000000000000fffUL +#define UV4AH_RH_GAM_MMIOH_REDIRECT_CONFIG1_NASID_SHFT 0 +#define UV4AH_RH_GAM_MMIOH_REDIRECT_CONFIG1_NASID_MASK 0x0000000000000fffUL /* UV4 unique defines */ #define UV4H_RH_GAM_MMIOH_REDIRECT_CONFIG1_NASID_SHFT 0 @@ -4258,6 +4265,13 @@ union uvh_rh_gam_mmioh_redirect_config0_u { #define UV3H_RH_GAM_MMIOH_REDIRECT_CONFIG1_NASID_SHFT 0 #define UV3H_RH_GAM_MMIOH_REDIRECT_CONFIG1_NASID_MASK 0x0000000000007fffUL +/* UVH common defines */ +#define UVH_RH_GAM_MMIOH_REDIRECT_CONFIG1_NASID_MASK ( \ + is_uv(UV4A) ? UV4AH_RH_GAM_MMIOH_REDIRECT_CONFIG1_NASID_MASK : \ + is_uv(UV4) ? 
UV4H_RH_GAM_MMIOH_REDIRECT_CONFIG1_NASID_MASK : \ + is_uv(UV3) ? UV3H_RH_GAM_MMIOH_REDIRECT_CONFIG1_NASID_MASK : \ + 0) + union uvh_rh_gam_mmioh_redirect_config1_u { unsigned long v; diff --git a/arch/x86/include/asm/vdso.h b/arch/x86/include/asm/vdso.h index 2963a2f5dbc4..b7253ef3205a 100644 --- a/arch/x86/include/asm/vdso.h +++ b/arch/x86/include/asm/vdso.h @@ -18,12 +18,6 @@ struct vdso_image { unsigned long extable_base, extable_len; const void *extable; - long sym_vvar_start; /* Negative offset to the vvar area */ - - long sym_vvar_page; - long sym_pvclock_page; - long sym_hvclock_page; - long sym_timens_page; long sym_VDSO32_NOTE_MASK; long sym___kernel_sigreturn; long sym___kernel_rt_sigreturn; @@ -33,19 +27,11 @@ struct vdso_image { long sym_vdso32_rt_sigreturn_landing_pad; }; -#ifdef CONFIG_X86_64 extern const struct vdso_image vdso_image_64; -#endif - -#ifdef CONFIG_X86_X32_ABI extern const struct vdso_image vdso_image_x32; -#endif - -#if defined CONFIG_X86_32 || defined CONFIG_COMPAT extern const struct vdso_image vdso_image_32; -#endif -extern void __init init_vdso_image(const struct vdso_image *image); +extern int __init init_vdso_image(const struct vdso_image *image); extern int map_vdso_once(const struct vdso_image *image, unsigned long addr); diff --git a/arch/x86/include/asm/vdso/getrandom.h b/arch/x86/include/asm/vdso/getrandom.h new file mode 100644 index 000000000000..ff1c11b9fa27 --- /dev/null +++ b/arch/x86/include/asm/vdso/getrandom.h @@ -0,0 +1,32 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Copyright (C) 2022-2024 Jason A. Donenfeld <Jason@zx2c4.com>. All Rights Reserved. + */ +#ifndef __ASM_VDSO_GETRANDOM_H +#define __ASM_VDSO_GETRANDOM_H + +#ifndef __ASSEMBLER__ + +#include <asm/unistd.h> + +/** + * getrandom_syscall - Invoke the getrandom() syscall. + * @buffer: Destination buffer to fill with random bytes. + * @len: Size of @buffer in bytes. + * @flags: Zero or more GRND_* flags. 
+ * Returns: The number of random bytes written to @buffer, or a negative value indicating an error. + */ +static __always_inline ssize_t getrandom_syscall(void *buffer, size_t len, unsigned int flags) +{ + long ret; + + asm ("syscall" : "=a" (ret) : + "0" (__NR_getrandom), "D" (buffer), "S" (len), "d" (flags) : + "rcx", "r11", "memory"); + + return ret; +} + +#endif /* !__ASSEMBLER__ */ + +#endif /* __ASM_VDSO_GETRANDOM_H */ diff --git a/arch/x86/include/asm/vdso/gettimeofday.h b/arch/x86/include/asm/vdso/gettimeofday.h index 1936f21ed8cd..73b2e7ee8f0f 100644 --- a/arch/x86/include/asm/vdso/gettimeofday.h +++ b/arch/x86/include/asm/vdso/gettimeofday.h @@ -10,19 +10,15 @@ #ifndef __ASM_VDSO_GETTIMEOFDAY_H #define __ASM_VDSO_GETTIMEOFDAY_H -#ifndef __ASSEMBLY__ +#ifndef __ASSEMBLER__ #include <uapi/linux/time.h> #include <asm/vgtod.h> -#include <asm/vvar.h> #include <asm/unistd.h> #include <asm/msr.h> #include <asm/pvclock.h> #include <clocksource/hyperv_timer.h> -#define __vdso_data (VVAR(_vdso_data)) -#define __timens_vdso_data (TIMENS(_vdso_data)) - #define VDSO_HAS_TIME 1 #define VDSO_HAS_CLOCK_GETRES 1 @@ -57,14 +53,6 @@ extern struct ms_hyperv_tsc_page hvclock_page __attribute__((visibility("hidden"))); #endif -#ifdef CONFIG_TIME_NS -static __always_inline -const struct vdso_data *__arch_get_timens_vdso_data(const struct vdso_data *vd) -{ - return __timens_vdso_data; -} -#endif - #ifndef BUILD_VDSO32 static __always_inline @@ -231,22 +219,27 @@ static u64 vread_pvclock(void) ret = __pvclock_read_cycles(pvti, rdtsc_ordered()); } while (pvclock_read_retry(pvti, version)); - return ret; + return ret & S64_MAX; } #endif #ifdef CONFIG_HYPERV_TIMER static u64 vread_hvclock(void) { - return hv_read_tsc_page(&hvclock_page); + u64 tsc, time; + + if (hv_read_tsc_page_tsc(&hvclock_page, &tsc, &time)) + return time & S64_MAX; + + return U64_MAX; } #endif static inline u64 __arch_get_hw_counter(s32 clock_mode, - const struct vdso_data *vd) + const struct vdso_time_data 
*vd) { if (likely(clock_mode == VDSO_CLOCKMODE_TSC)) - return (u64)rdtsc_ordered(); + return (u64)rdtsc_ordered() & S64_MAX; /* * For any memory-mapped vclock type, we need to make sure that gcc * doesn't cleverly hoist a load before the mode check. Otherwise we @@ -268,12 +261,7 @@ static inline u64 __arch_get_hw_counter(s32 clock_mode, return U64_MAX; } -static __always_inline const struct vdso_data *__arch_get_vdso_data(void) -{ - return __vdso_data; -} - -static inline bool arch_vdso_clocksource_ok(const struct vdso_data *vd) +static inline bool arch_vdso_clocksource_ok(const struct vdso_clock *vc) { return true; } @@ -284,6 +272,9 @@ static inline bool arch_vdso_clocksource_ok(const struct vdso_data *vd) * which can be invalidated asynchronously and indicate invalidation by * returning U64_MAX, which can be effectively tested by checking for a * negative value after casting it to s64. + * + * This effectively forces a S64_MAX mask on the calculations, unlike the + * U64_MAX mask normally used by x86 clocksources. */ static inline bool arch_vdso_cycles_ok(u64 cycles) { @@ -292,7 +283,7 @@ static inline bool arch_vdso_cycles_ok(u64 cycles) #define vdso_cycles_ok arch_vdso_cycles_ok /* - * x86 specific delta calculation. + * x86 specific calculation of nanoseconds for the current cycle count * * The regular implementation assumes that clocksource reads are globally * monotonic. The TSC can be slightly off across sockets which can cause @@ -300,24 +291,46 @@ static inline bool arch_vdso_cycles_ok(u64 cycles) * jump. * * Therefore it needs to be verified that @cycles are greater than - * @last. If not then use @last, which is the base time of the current - * conversion period. + * @vd->cycles_last. If not then use @vd->cycles_last, which is the base + * time of the current conversion period. 
* - * This variant also removes the masking of the subtraction because the - * clocksource mask of all VDSO capable clocksources on x86 is U64_MAX - * which would result in a pointless operation. The compiler cannot - * optimize it away as the mask comes from the vdso data and is not compile - * time constant. + * This variant also uses a custom mask because while the clocksource mask of + * all the VDSO capable clocksources on x86 is U64_MAX, the above code uses + * U64_MASK as an exception value, additionally arch_vdso_cycles_ok() above + * declares everything with the MSB/Sign-bit set as invalid. Therefore the + * effective mask is S64_MAX. */ -static __always_inline -u64 vdso_calc_delta(u64 cycles, u64 last, u64 mask, u32 mult) +static __always_inline u64 vdso_calc_ns(const struct vdso_clock *vc, u64 cycles, u64 base) { - if (cycles > last) - return (cycles - last) * mult; - return 0; + u64 delta = cycles - vc->cycle_last; + + /* + * Negative motion and deltas which can cause multiplication + * overflow require special treatment. This check covers both as + * negative motion is guaranteed to be greater than @vc::max_cycles + * due to unsigned comparison. + * + * Due to the MSB/Sign-bit being used as invalid marker (see + * arch_vdso_cycles_ok() above), the effective mask is S64_MAX, but that + * case is also unlikely and will also take the unlikely path here. + */ + if (unlikely(delta > vc->max_cycles)) { + /* + * Due to the above mentioned TSC wobbles, filter out + * negative motion. Per the above masking, the effective + * sign bit is now bit 62. 
+ */ + if (delta & (1ULL << 62)) + return base >> vc->shift; + + /* Handle multiplication overflow gracefully */ + return mul_u64_u32_add_u64_shr(delta & S64_MAX, vc->mult, base, vc->shift); + } + + return ((delta * vc->mult) + base) >> vc->shift; } -#define vdso_calc_delta vdso_calc_delta +#define vdso_calc_ns vdso_calc_ns -#endif /* !__ASSEMBLY__ */ +#endif /* !__ASSEMBLER__ */ #endif /* __ASM_VDSO_GETTIMEOFDAY_H */ diff --git a/arch/x86/include/asm/vdso/processor.h b/arch/x86/include/asm/vdso/processor.h index 57b1a7034c64..7000aeb59aa2 100644 --- a/arch/x86/include/asm/vdso/processor.h +++ b/arch/x86/include/asm/vdso/processor.h @@ -5,19 +5,23 @@ #ifndef __ASM_VDSO_PROCESSOR_H #define __ASM_VDSO_PROCESSOR_H -#ifndef __ASSEMBLY__ +#ifndef __ASSEMBLER__ -/* REP NOP (PAUSE) is a good thing to insert into busy-wait loops. */ -static __always_inline void rep_nop(void) +/* PAUSE is a good thing to insert into busy-wait loops. */ +static __always_inline void native_pause(void) { - asm volatile("rep; nop" ::: "memory"); + asm volatile("pause" ::: "memory"); } static __always_inline void cpu_relax(void) { - rep_nop(); + native_pause(); } -#endif /* __ASSEMBLY__ */ +struct getcpu_cache; + +notrace long __vdso_getcpu(unsigned *cpu, unsigned *node, struct getcpu_cache *unused); + +#endif /* __ASSEMBLER__ */ #endif /* __ASM_VDSO_PROCESSOR_H */ diff --git a/arch/x86/include/asm/vdso/vsyscall.h b/arch/x86/include/asm/vdso/vsyscall.h index be199a9b2676..4aa311a923f2 100644 --- a/arch/x86/include/asm/vdso/vsyscall.h +++ b/arch/x86/include/asm/vdso/vsyscall.h @@ -2,28 +2,21 @@ #ifndef __ASM_VDSO_VSYSCALL_H #define __ASM_VDSO_VSYSCALL_H -#ifndef __ASSEMBLY__ +#define __VDSO_PAGES 6 + +#define VDSO_NR_VCLOCK_PAGES 2 +#define VDSO_VCLOCK_PAGES_START(_b) ((_b) + (__VDSO_PAGES - VDSO_NR_VCLOCK_PAGES) * PAGE_SIZE) +#define VDSO_PAGE_PVCLOCK_OFFSET 0 +#define VDSO_PAGE_HVCLOCK_OFFSET 1 + +#ifndef __ASSEMBLER__ -#include <linux/hrtimer.h> -#include <linux/timekeeper_internal.h> #include 
<vdso/datapage.h> #include <asm/vgtod.h> -#include <asm/vvar.h> - -DEFINE_VVAR(struct vdso_data, _vdso_data); -/* - * Update the vDSO data page to keep in sync with kernel timekeeping. - */ -static __always_inline -struct vdso_data *__x86_get_k_vdso_data(void) -{ - return _vdso_data; -} -#define __arch_get_k_vdso_data __x86_get_k_vdso_data /* The asm-generic header needs to be included after the definitions above */ #include <asm-generic/vdso/vsyscall.h> -#endif /* !__ASSEMBLY__ */ +#endif /* !__ASSEMBLER__ */ #endif /* __ASM_VDSO_VSYSCALL_H */ diff --git a/arch/x86/include/asm/vermagic.h b/arch/x86/include/asm/vermagic.h index 75884d2cdec3..5d471253c755 100644 --- a/arch/x86/include/asm/vermagic.h +++ b/arch/x86/include/asm/vermagic.h @@ -15,8 +15,6 @@ #define MODULE_PROC_FAMILY "586TSC " #elif defined CONFIG_M586MMX #define MODULE_PROC_FAMILY "586MMX " -#elif defined CONFIG_MCORE2 -#define MODULE_PROC_FAMILY "CORE2 " #elif defined CONFIG_MATOM #define MODULE_PROC_FAMILY "ATOM " #elif defined CONFIG_M686 @@ -33,8 +31,6 @@ #define MODULE_PROC_FAMILY "K6 " #elif defined CONFIG_MK7 #define MODULE_PROC_FAMILY "K7 " -#elif defined CONFIG_MK8 -#define MODULE_PROC_FAMILY "K8 " #elif defined CONFIG_MELAN #define MODULE_PROC_FAMILY "ELAN " #elif defined CONFIG_MCRUSOE diff --git a/arch/x86/include/asm/vgtod.h b/arch/x86/include/asm/vgtod.h index 7aa38b2ad8a9..a0ce291abcae 100644 --- a/arch/x86/include/asm/vgtod.h +++ b/arch/x86/include/asm/vgtod.h @@ -14,11 +14,6 @@ #include <uapi/linux/time.h> -#ifdef BUILD_VDSO32_64 -typedef u64 gtod_long_t; -#else -typedef unsigned long gtod_long_t; -#endif #endif /* CONFIG_GENERIC_GETTIMEOFDAY */ #endif /* _ASM_X86_VGTOD_H */ diff --git a/arch/x86/include/asm/video.h b/arch/x86/include/asm/video.h new file mode 100644 index 000000000000..0950c9535fae --- /dev/null +++ b/arch/x86/include/asm/video.h @@ -0,0 +1,21 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _ASM_X86_VIDEO_H +#define _ASM_X86_VIDEO_H + +#include <linux/types.h> + 
+#include <asm/page.h> + +struct device; + +pgprot_t pgprot_framebuffer(pgprot_t prot, + unsigned long vm_start, unsigned long vm_end, + unsigned long offset); +#define pgprot_framebuffer pgprot_framebuffer + +bool video_is_primary_device(struct device *dev); +#define video_is_primary_device video_is_primary_device + +#include <asm-generic/video.h> + +#endif /* _ASM_X86_VIDEO_H */ diff --git a/arch/x86/include/asm/virtext.h b/arch/x86/include/asm/virtext.h deleted file mode 100644 index 8757078d4442..000000000000 --- a/arch/x86/include/asm/virtext.h +++ /dev/null @@ -1,140 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0-only */ -/* CPU virtualization extensions handling - * - * This should carry the code for handling CPU virtualization extensions - * that needs to live in the kernel core. - * - * Author: Eduardo Habkost <ehabkost@redhat.com> - * - * Copyright (C) 2008, Red Hat Inc. - * - * Contains code from KVM, Copyright (C) 2006 Qumranet, Inc. - */ -#ifndef _ASM_X86_VIRTEX_H -#define _ASM_X86_VIRTEX_H - -#include <asm/processor.h> - -#include <asm/vmx.h> -#include <asm/svm.h> -#include <asm/tlbflush.h> - -/* - * VMX functions: - */ - -static inline int cpu_has_vmx(void) -{ - unsigned long ecx = cpuid_ecx(1); - return test_bit(5, &ecx); /* CPUID.1:ECX.VMX[bit 5] -> VT */ -} - - -/** - * cpu_vmxoff() - Disable VMX on the current CPU - * - * Disable VMX and clear CR4.VMXE (even if VMXOFF faults) - * - * Note, VMXOFF causes a #UD if the CPU is !post-VMXON, but it's impossible to - * atomically track post-VMXON state, e.g. this may be called in NMI context. - * Eat all faults as all other faults on VMXOFF faults are mode related, i.e. - * faults are guaranteed to be due to the !post-VMXON check unless the CPU is - * magically in RM, VM86, compat mode, or at CPL>0. 
- */ -static inline int cpu_vmxoff(void) -{ - asm_volatile_goto("1: vmxoff\n\t" - _ASM_EXTABLE(1b, %l[fault]) - ::: "cc", "memory" : fault); - - cr4_clear_bits(X86_CR4_VMXE); - return 0; - -fault: - cr4_clear_bits(X86_CR4_VMXE); - return -EIO; -} - -static inline int cpu_vmx_enabled(void) -{ - return __read_cr4() & X86_CR4_VMXE; -} - -/** Disable VMX if it is enabled on the current CPU - * - * You shouldn't call this if cpu_has_vmx() returns 0. - */ -static inline void __cpu_emergency_vmxoff(void) -{ - if (cpu_vmx_enabled()) - cpu_vmxoff(); -} - -/** Disable VMX if it is supported and enabled on the current CPU - */ -static inline void cpu_emergency_vmxoff(void) -{ - if (cpu_has_vmx()) - __cpu_emergency_vmxoff(); -} - - - - -/* - * SVM functions: - */ - -/** Check if the CPU has SVM support - * - * You can use the 'msg' arg to get a message describing the problem, - * if the function returns zero. Simply pass NULL if you are not interested - * on the messages; gcc should take care of not generating code for - * the messages on this case. - */ -static inline int cpu_has_svm(const char **msg) -{ - if (boot_cpu_data.x86_vendor != X86_VENDOR_AMD && - boot_cpu_data.x86_vendor != X86_VENDOR_HYGON) { - if (msg) - *msg = "not amd or hygon"; - return 0; - } - - if (boot_cpu_data.extended_cpuid_level < SVM_CPUID_FUNC) { - if (msg) - *msg = "can't execute cpuid_8000000a"; - return 0; - } - - if (!boot_cpu_has(X86_FEATURE_SVM)) { - if (msg) - *msg = "svm not available"; - return 0; - } - return 1; -} - - -/** Disable SVM on the current CPU - * - * You should call this only if cpu_has_svm() returned true. 
- */ -static inline void cpu_svm_disable(void) -{ - uint64_t efer; - - wrmsrl(MSR_VM_HSAVE_PA, 0); - rdmsrl(MSR_EFER, efer); - wrmsrl(MSR_EFER, efer & ~EFER_SVME); -} - -/** Makes sure SVM is disabled, if it is supported on the CPU - */ -static inline void cpu_emergency_svm_disable(void) -{ - if (cpu_has_svm(NULL)) - cpu_svm_disable(); -} - -#endif /* _ASM_X86_VIRTEX_H */ diff --git a/arch/x86/include/asm/vm86.h b/arch/x86/include/asm/vm86.h index 9e8ac5073ecb..62ee19909903 100644 --- a/arch/x86/include/asm/vm86.h +++ b/arch/x86/include/asm/vm86.h @@ -84,7 +84,7 @@ static inline int handle_vm86_trap(struct kernel_vm86_regs *a, long b, int c) static inline void save_v86_state(struct kernel_vm86_regs *a, int b) { } -#define free_vm86(t) do { } while(0) +#define free_vm86(task) do { (void)(task); } while(0) #endif /* CONFIG_VM86 */ diff --git a/arch/x86/include/asm/vmware.h b/arch/x86/include/asm/vmware.h index ac9fc51e2b18..c9cf43d5ef23 100644 --- a/arch/x86/include/asm/vmware.h +++ b/arch/x86/include/asm/vmware.h @@ -7,51 +7,321 @@ #include <linux/stringify.h> /* - * The hypercall definitions differ in the low word of the %edx argument - * in the following way: the old port base interface uses the port - * number to distinguish between high- and low bandwidth versions. + * VMware hypercall ABI. + * + * - Low bandwidth (LB) hypercalls (I/O port based, vmcall and vmmcall) + * have up to 6 input and 6 output arguments passed and returned using + * registers: %eax (arg0), %ebx (arg1), %ecx (arg2), %edx (arg3), + * %esi (arg4), %edi (arg5). + * The following input arguments must be initialized by the caller: + * arg0 - VMWARE_HYPERVISOR_MAGIC + * arg2 - Hypercall command + * arg3 bits [15:0] - Port number, LB and direction flags + * + * - Low bandwidth TDX hypercalls (x86_64 only) are similar to LB + * hypercalls. 
They also have up to 6 input and 6 output on registers + * arguments, with different argument to register mapping: + * %r12 (arg0), %rbx (arg1), %r13 (arg2), %rdx (arg3), + * %rsi (arg4), %rdi (arg5). + * + * - High bandwidth (HB) hypercalls are I/O port based only. They have + * up to 7 input and 7 output arguments passed and returned using + * registers: %eax (arg0), %ebx (arg1), %ecx (arg2), %edx (arg3), + * %esi (arg4), %edi (arg5), %ebp (arg6). + * The following input arguments must be initialized by the caller: + * arg0 - VMWARE_HYPERVISOR_MAGIC + * arg1 - Hypercall command + * arg3 bits [15:0] - Port number, HB and direction flags + * + * For compatibility purposes, x86_64 systems use only lower 32 bits + * for input and output arguments. + * + * The hypercall definitions differ in the low word of the %edx (arg3) + * in the following way: the old I/O port based interface uses the port + * number to distinguish between high- and low bandwidth versions, and + * uses IN/OUT instructions to define transfer direction. * * The new vmcall interface instead uses a set of flags to select * bandwidth mode and transfer direction. The flags should be loaded - * into %dx by any user and are automatically replaced by the port - * number if the VMWARE_HYPERVISOR_PORT method is used. - * - * In short, new driver code should strictly use the new definition of - * %dx content. + * into arg3 by any user and are automatically replaced by the port + * number if the I/O port method is used. */ -/* Old port-based version */ -#define VMWARE_HYPERVISOR_PORT 0x5658 -#define VMWARE_HYPERVISOR_PORT_HB 0x5659 +#define VMWARE_HYPERVISOR_HB BIT(0) +#define VMWARE_HYPERVISOR_OUT BIT(1) -/* Current vmcall / vmmcall version */ -#define VMWARE_HYPERVISOR_HB BIT(0) -#define VMWARE_HYPERVISOR_OUT BIT(1) +#define VMWARE_HYPERVISOR_PORT 0x5658 +#define VMWARE_HYPERVISOR_PORT_HB (VMWARE_HYPERVISOR_PORT | \ + VMWARE_HYPERVISOR_HB) -/* The low bandwidth call. The low word of edx is presumed clear. 
*/ -#define VMWARE_HYPERCALL \ - ALTERNATIVE_2("movw $" __stringify(VMWARE_HYPERVISOR_PORT) ", %%dx; " \ - "inl (%%dx), %%eax", \ - "vmcall", X86_FEATURE_VMCALL, \ - "vmmcall", X86_FEATURE_VMW_VMMCALL) +#define VMWARE_HYPERVISOR_MAGIC 0x564d5868U +#define VMWARE_CMD_GETVERSION 10 +#define VMWARE_CMD_GETHZ 45 +#define VMWARE_CMD_GETVCPU_INFO 68 +#define VMWARE_CMD_STEALCLOCK 91 /* - * The high bandwidth out call. The low word of edx is presumed to have the - * HB and OUT bits set. + * Hypercall command mask: + * bits [6:0] command, range [0, 127] + * bits [19:16] sub-command, range [0, 15] */ -#define VMWARE_HYPERCALL_HB_OUT \ - ALTERNATIVE_2("movw $" __stringify(VMWARE_HYPERVISOR_PORT_HB) ", %%dx; " \ - "rep outsb", \ - "vmcall", X86_FEATURE_VMCALL, \ - "vmmcall", X86_FEATURE_VMW_VMMCALL) +#define VMWARE_CMD_MASK 0xf007fU + +#define CPUID_VMWARE_FEATURES_ECX_VMMCALL BIT(0) +#define CPUID_VMWARE_FEATURES_ECX_VMCALL BIT(1) + +extern unsigned long vmware_hypercall_slow(unsigned long cmd, + unsigned long in1, unsigned long in3, + unsigned long in4, unsigned long in5, + u32 *out1, u32 *out2, u32 *out3, + u32 *out4, u32 *out5); + +#define VMWARE_TDX_VENDOR_LEAF 0x1af7e4909ULL +#define VMWARE_TDX_HCALL_FUNC 1 + +extern unsigned long vmware_tdx_hypercall(unsigned long cmd, + unsigned long in1, unsigned long in3, + unsigned long in4, unsigned long in5, + u32 *out1, u32 *out2, u32 *out3, + u32 *out4, u32 *out5); /* - * The high bandwidth in call. The low word of edx is presumed to have the - * HB bit set. + * The low bandwidth call. The low word of %edx is presumed to have OUT bit + * set. The high word of %edx may contain input data from the caller. 
*/ -#define VMWARE_HYPERCALL_HB_IN \ - ALTERNATIVE_2("movw $" __stringify(VMWARE_HYPERVISOR_PORT_HB) ", %%dx; " \ - "rep insb", \ - "vmcall", X86_FEATURE_VMCALL, \ +#define VMWARE_HYPERCALL \ + ALTERNATIVE_2("movw %[port], %%dx\n\t" \ + "inl (%%dx), %%eax", \ + "vmcall", X86_FEATURE_VMCALL, \ "vmmcall", X86_FEATURE_VMW_VMMCALL) + +static inline +unsigned long vmware_hypercall1(unsigned long cmd, unsigned long in1) +{ + unsigned long out0; + + if (cpu_feature_enabled(X86_FEATURE_TDX_GUEST)) + return vmware_tdx_hypercall(cmd, in1, 0, 0, 0, + NULL, NULL, NULL, NULL, NULL); + + if (unlikely(!alternatives_patched) && !__is_defined(MODULE)) + return vmware_hypercall_slow(cmd, in1, 0, 0, 0, + NULL, NULL, NULL, NULL, NULL); + + asm_inline volatile (VMWARE_HYPERCALL + : "=a" (out0) + : [port] "i" (VMWARE_HYPERVISOR_PORT), + "a" (VMWARE_HYPERVISOR_MAGIC), + "b" (in1), + "c" (cmd), + "d" (0) + : "cc", "memory"); + return out0; +} + +static inline +unsigned long vmware_hypercall3(unsigned long cmd, unsigned long in1, + u32 *out1, u32 *out2) +{ + unsigned long out0; + + if (cpu_feature_enabled(X86_FEATURE_TDX_GUEST)) + return vmware_tdx_hypercall(cmd, in1, 0, 0, 0, + out1, out2, NULL, NULL, NULL); + + if (unlikely(!alternatives_patched) && !__is_defined(MODULE)) + return vmware_hypercall_slow(cmd, in1, 0, 0, 0, + out1, out2, NULL, NULL, NULL); + + asm_inline volatile (VMWARE_HYPERCALL + : "=a" (out0), "=b" (*out1), "=c" (*out2) + : [port] "i" (VMWARE_HYPERVISOR_PORT), + "a" (VMWARE_HYPERVISOR_MAGIC), + "b" (in1), + "c" (cmd), + "d" (0) + : "cc", "memory"); + return out0; +} + +static inline +unsigned long vmware_hypercall4(unsigned long cmd, unsigned long in1, + u32 *out1, u32 *out2, u32 *out3) +{ + unsigned long out0; + + if (cpu_feature_enabled(X86_FEATURE_TDX_GUEST)) + return vmware_tdx_hypercall(cmd, in1, 0, 0, 0, + out1, out2, out3, NULL, NULL); + + if (unlikely(!alternatives_patched) && !__is_defined(MODULE)) + return vmware_hypercall_slow(cmd, in1, 0, 0, 0, + out1, out2, 
out3, NULL, NULL); + + asm_inline volatile (VMWARE_HYPERCALL + : "=a" (out0), "=b" (*out1), "=c" (*out2), "=d" (*out3) + : [port] "i" (VMWARE_HYPERVISOR_PORT), + "a" (VMWARE_HYPERVISOR_MAGIC), + "b" (in1), + "c" (cmd), + "d" (0) + : "cc", "memory"); + return out0; +} + +static inline +unsigned long vmware_hypercall5(unsigned long cmd, unsigned long in1, + unsigned long in3, unsigned long in4, + unsigned long in5, u32 *out2) +{ + unsigned long out0; + + if (cpu_feature_enabled(X86_FEATURE_TDX_GUEST)) + return vmware_tdx_hypercall(cmd, in1, in3, in4, in5, + NULL, out2, NULL, NULL, NULL); + + if (unlikely(!alternatives_patched) && !__is_defined(MODULE)) + return vmware_hypercall_slow(cmd, in1, in3, in4, in5, + NULL, out2, NULL, NULL, NULL); + + asm_inline volatile (VMWARE_HYPERCALL + : "=a" (out0), "=c" (*out2) + : [port] "i" (VMWARE_HYPERVISOR_PORT), + "a" (VMWARE_HYPERVISOR_MAGIC), + "b" (in1), + "c" (cmd), + "d" (in3), + "S" (in4), + "D" (in5) + : "cc", "memory"); + return out0; +} + +static inline +unsigned long vmware_hypercall6(unsigned long cmd, unsigned long in1, + unsigned long in3, u32 *out2, + u32 *out3, u32 *out4, u32 *out5) +{ + unsigned long out0; + + if (cpu_feature_enabled(X86_FEATURE_TDX_GUEST)) + return vmware_tdx_hypercall(cmd, in1, in3, 0, 0, + NULL, out2, out3, out4, out5); + + if (unlikely(!alternatives_patched) && !__is_defined(MODULE)) + return vmware_hypercall_slow(cmd, in1, in3, 0, 0, + NULL, out2, out3, out4, out5); + + asm_inline volatile (VMWARE_HYPERCALL + : "=a" (out0), "=c" (*out2), "=d" (*out3), "=S" (*out4), + "=D" (*out5) + : [port] "i" (VMWARE_HYPERVISOR_PORT), + "a" (VMWARE_HYPERVISOR_MAGIC), + "b" (in1), + "c" (cmd), + "d" (in3) + : "cc", "memory"); + return out0; +} + +static inline +unsigned long vmware_hypercall7(unsigned long cmd, unsigned long in1, + unsigned long in3, unsigned long in4, + unsigned long in5, u32 *out1, + u32 *out2, u32 *out3) +{ + unsigned long out0; + + if (cpu_feature_enabled(X86_FEATURE_TDX_GUEST)) + 
return vmware_tdx_hypercall(cmd, in1, in3, in4, in5, + out1, out2, out3, NULL, NULL); + + if (unlikely(!alternatives_patched) && !__is_defined(MODULE)) + return vmware_hypercall_slow(cmd, in1, in3, in4, in5, + out1, out2, out3, NULL, NULL); + + asm_inline volatile (VMWARE_HYPERCALL + : "=a" (out0), "=b" (*out1), "=c" (*out2), "=d" (*out3) + : [port] "i" (VMWARE_HYPERVISOR_PORT), + "a" (VMWARE_HYPERVISOR_MAGIC), + "b" (in1), + "c" (cmd), + "d" (in3), + "S" (in4), + "D" (in5) + : "cc", "memory"); + return out0; +} + +#ifdef CONFIG_X86_64 +#define VMW_BP_CONSTRAINT "r" +#else +#define VMW_BP_CONSTRAINT "m" +#endif + +/* + * High bandwidth calls are not supported on encrypted memory guests. + * The caller should check cc_platform_has(CC_ATTR_MEM_ENCRYPT) and use + * low bandwidth hypercall if memory encryption is set. + * This assumption simplifies HB hypercall implementation to just I/O port + * based approach without alternative patching. + */ +static inline +unsigned long vmware_hypercall_hb_out(unsigned long cmd, unsigned long in2, + unsigned long in3, unsigned long in4, + unsigned long in5, unsigned long in6, + u32 *out1) +{ + unsigned long out0; + + asm_inline volatile ( + UNWIND_HINT_SAVE + "push %%" _ASM_BP "\n\t" + UNWIND_HINT_UNDEFINED + "mov %[in6], %%" _ASM_BP "\n\t" + "rep outsb\n\t" + "pop %%" _ASM_BP "\n\t" + UNWIND_HINT_RESTORE + : "=a" (out0), "=b" (*out1) + : "a" (VMWARE_HYPERVISOR_MAGIC), + "b" (cmd), + "c" (in2), + "d" (in3 | VMWARE_HYPERVISOR_PORT_HB), + "S" (in4), + "D" (in5), + [in6] VMW_BP_CONSTRAINT (in6) + : "cc", "memory"); + return out0; +} + +static inline +unsigned long vmware_hypercall_hb_in(unsigned long cmd, unsigned long in2, + unsigned long in3, unsigned long in4, + unsigned long in5, unsigned long in6, + u32 *out1) +{ + unsigned long out0; + + asm_inline volatile ( + UNWIND_HINT_SAVE + "push %%" _ASM_BP "\n\t" + UNWIND_HINT_UNDEFINED + "mov %[in6], %%" _ASM_BP "\n\t" + "rep insb\n\t" + "pop %%" _ASM_BP "\n\t" + UNWIND_HINT_RESTORE + 
: "=a" (out0), "=b" (*out1) + : "a" (VMWARE_HYPERVISOR_MAGIC), + "b" (cmd), + "c" (in2), + "d" (in3 | VMWARE_HYPERVISOR_PORT_HB), + "S" (in4), + "D" (in5), + [in6] VMW_BP_CONSTRAINT (in6) + : "cc", "memory"); + return out0; +} +#undef VMW_BP_CONSTRAINT +#undef VMWARE_HYPERCALL + #endif diff --git a/arch/x86/include/asm/vmx.h b/arch/x86/include/asm/vmx.h index 6c343c6a1855..cca7d6641287 100644 --- a/arch/x86/include/asm/vmx.h +++ b/arch/x86/include/asm/vmx.h @@ -13,8 +13,11 @@ #include <linux/bitops.h> +#include <linux/bug.h> #include <linux/types.h> + #include <uapi/asm/vmx.h> +#include <asm/trapnr.h> #include <asm/vmxfeatures.h> #define VMCS_CONTROL_BIT(x) BIT(VMX_FEATURE_##x & 0x1f) @@ -31,6 +34,7 @@ #define CPU_BASED_RDTSC_EXITING VMCS_CONTROL_BIT(RDTSC_EXITING) #define CPU_BASED_CR3_LOAD_EXITING VMCS_CONTROL_BIT(CR3_LOAD_EXITING) #define CPU_BASED_CR3_STORE_EXITING VMCS_CONTROL_BIT(CR3_STORE_EXITING) +#define CPU_BASED_ACTIVATE_TERTIARY_CONTROLS VMCS_CONTROL_BIT(TERTIARY_CONTROLS) #define CPU_BASED_CR8_LOAD_EXITING VMCS_CONTROL_BIT(CR8_LOAD_EXITING) #define CPU_BASED_CR8_STORE_EXITING VMCS_CONTROL_BIT(CR8_STORE_EXITING) #define CPU_BASED_TPR_SHADOW VMCS_CONTROL_BIT(VIRTUAL_TPR) @@ -67,13 +71,20 @@ #define SECONDARY_EXEC_ENCLS_EXITING VMCS_CONTROL_BIT(ENCLS_EXITING) #define SECONDARY_EXEC_RDSEED_EXITING VMCS_CONTROL_BIT(RDSEED_EXITING) #define SECONDARY_EXEC_ENABLE_PML VMCS_CONTROL_BIT(PAGE_MOD_LOGGING) +#define SECONDARY_EXEC_EPT_VIOLATION_VE VMCS_CONTROL_BIT(EPT_VIOLATION_VE) #define SECONDARY_EXEC_PT_CONCEAL_VMX VMCS_CONTROL_BIT(PT_CONCEAL_VMX) -#define SECONDARY_EXEC_XSAVES VMCS_CONTROL_BIT(XSAVES) +#define SECONDARY_EXEC_ENABLE_XSAVES VMCS_CONTROL_BIT(XSAVES) #define SECONDARY_EXEC_MODE_BASED_EPT_EXEC VMCS_CONTROL_BIT(MODE_BASED_EPT_EXEC) #define SECONDARY_EXEC_PT_USE_GPA VMCS_CONTROL_BIT(PT_USE_GPA) #define SECONDARY_EXEC_TSC_SCALING VMCS_CONTROL_BIT(TSC_SCALING) #define SECONDARY_EXEC_ENABLE_USR_WAIT_PAUSE VMCS_CONTROL_BIT(USR_WAIT_PAUSE) #define 
SECONDARY_EXEC_BUS_LOCK_DETECTION VMCS_CONTROL_BIT(BUS_LOCK_DETECTION) +#define SECONDARY_EXEC_NOTIFY_VM_EXITING VMCS_CONTROL_BIT(NOTIFY_VM_EXITING) + +/* + * Definitions of Tertiary Processor-Based VM-Execution Controls. + */ +#define TERTIARY_EXEC_IPI_VIRT VMCS_CONTROL_BIT(IPI_VIRT) #define PIN_BASED_EXT_INTR_MASK VMCS_CONTROL_BIT(INTR_EXITING) #define PIN_BASED_NMI_EXITING VMCS_CONTROL_BIT(NMI_EXITING) @@ -111,19 +122,17 @@ #define VM_ENTRY_ALWAYSON_WITHOUT_TRUE_MSR 0x000011ff -#define VMX_MISC_PREEMPTION_TIMER_RATE_MASK 0x0000001f -#define VMX_MISC_SAVE_EFER_LMA 0x00000020 -#define VMX_MISC_ACTIVITY_HLT 0x00000040 -#define VMX_MISC_ACTIVITY_WAIT_SIPI 0x00000100 -#define VMX_MISC_ZERO_LEN_INS 0x40000000 -#define VMX_MISC_MSR_LIST_MULTIPLIER 512 - /* VMFUNC functions */ #define VMFUNC_CONTROL_BIT(x) BIT((VMX_FEATURE_##x & 0x1f) - 28) #define VMX_VMFUNC_EPTP_SWITCHING VMFUNC_CONTROL_BIT(EPTP_SWITCHING) #define VMFUNC_EPTP_ENTRIES 512 +#define VMX_BASIC_32BIT_PHYS_ADDR_ONLY BIT_ULL(48) +#define VMX_BASIC_DUAL_MONITOR_TREATMENT BIT_ULL(49) +#define VMX_BASIC_INOUT BIT_ULL(54) +#define VMX_BASIC_TRUE_CTLS BIT_ULL(55) + static inline u32 vmx_basic_vmcs_revision_id(u64 vmx_basic) { return vmx_basic & GENMASK_ULL(30, 0); @@ -134,9 +143,30 @@ static inline u32 vmx_basic_vmcs_size(u64 vmx_basic) return (vmx_basic & GENMASK_ULL(44, 32)) >> 32; } +static inline u32 vmx_basic_vmcs_mem_type(u64 vmx_basic) +{ + return (vmx_basic & GENMASK_ULL(53, 50)) >> 50; +} + +static inline u64 vmx_basic_encode_vmcs_info(u32 revision, u16 size, u8 memtype) +{ + return revision | ((u64)size << 32) | ((u64)memtype << 50); +} + +#define VMX_MISC_SAVE_EFER_LMA BIT_ULL(5) +#define VMX_MISC_ACTIVITY_HLT BIT_ULL(6) +#define VMX_MISC_ACTIVITY_SHUTDOWN BIT_ULL(7) +#define VMX_MISC_ACTIVITY_WAIT_SIPI BIT_ULL(8) +#define VMX_MISC_INTEL_PT BIT_ULL(14) +#define VMX_MISC_RDMSR_IN_SMM BIT_ULL(15) +#define VMX_MISC_VMXOFF_BLOCK_SMI BIT_ULL(28) +#define VMX_MISC_VMWRITE_SHADOW_RO_FIELDS BIT_ULL(29) 
+#define VMX_MISC_ZERO_LEN_INS BIT_ULL(30) +#define VMX_MISC_MSR_LIST_MULTIPLIER 512 + static inline int vmx_misc_preemption_timer_rate(u64 vmx_misc) { - return vmx_misc & VMX_MISC_PREEMPTION_TIMER_RATE_MASK; + return vmx_misc & GENMASK_ULL(4, 0); } static inline int vmx_misc_cr3_count(u64 vmx_misc) @@ -158,6 +188,7 @@ static inline int vmx_misc_mseg_revid(u64 vmx_misc) enum vmcs_field { VIRTUAL_PROCESSOR_ID = 0x00000000, POSTED_INTR_NV = 0x00000002, + LAST_PID_POINTER_INDEX = 0x00000008, GUEST_ES_SELECTOR = 0x00000800, GUEST_CS_SELECTOR = 0x00000802, GUEST_SS_SELECTOR = 0x00000804, @@ -215,12 +246,19 @@ enum vmcs_field { VMREAD_BITMAP_HIGH = 0x00002027, VMWRITE_BITMAP = 0x00002028, VMWRITE_BITMAP_HIGH = 0x00002029, + VE_INFORMATION_ADDRESS = 0x0000202A, + VE_INFORMATION_ADDRESS_HIGH = 0x0000202B, XSS_EXIT_BITMAP = 0x0000202C, XSS_EXIT_BITMAP_HIGH = 0x0000202D, ENCLS_EXITING_BITMAP = 0x0000202E, ENCLS_EXITING_BITMAP_HIGH = 0x0000202F, TSC_MULTIPLIER = 0x00002032, TSC_MULTIPLIER_HIGH = 0x00002033, + TERTIARY_VM_EXEC_CONTROL = 0x00002034, + TERTIARY_VM_EXEC_CONTROL_HIGH = 0x00002035, + SHARED_EPT_POINTER = 0x0000203C, + PID_POINTER_TABLE = 0x00002042, + PID_POINTER_TABLE_HIGH = 0x00002043, GUEST_PHYSICAL_ADDRESS = 0x00002400, GUEST_PHYSICAL_ADDRESS_HIGH = 0x00002401, VMCS_LINK_POINTER = 0x00002800, @@ -269,6 +307,7 @@ enum vmcs_field { SECONDARY_VM_EXEC_CONTROL = 0x0000401e, PLE_GAP = 0x00004020, PLE_WINDOW = 0x00004022, + NOTIFY_WINDOW = 0x00004024, VM_INSTRUCTION_ERROR = 0x00004400, VM_EXIT_REASON = 0x00004402, VM_EXIT_INTR_INFO = 0x00004404, @@ -296,7 +335,7 @@ enum vmcs_field { GUEST_LDTR_AR_BYTES = 0x00004820, GUEST_TR_AR_BYTES = 0x00004822, GUEST_INTERRUPTIBILITY_INFO = 0x00004824, - GUEST_ACTIVITY_STATE = 0X00004826, + GUEST_ACTIVITY_STATE = 0x00004826, GUEST_SYSENTER_CS = 0x0000482A, VMX_PREEMPTION_TIMER_VALUE = 0x0000482E, HOST_IA32_SYSENTER_CS = 0x00004c00, @@ -359,14 +398,14 @@ enum vmcs_field { #define VECTORING_INFO_DELIVER_CODE_MASK 
INTR_INFO_DELIVER_CODE_MASK #define VECTORING_INFO_VALID_MASK INTR_INFO_VALID_MASK -#define INTR_TYPE_EXT_INTR (0 << 8) /* external interrupt */ -#define INTR_TYPE_RESERVED (1 << 8) /* reserved */ -#define INTR_TYPE_NMI_INTR (2 << 8) /* NMI */ -#define INTR_TYPE_HARD_EXCEPTION (3 << 8) /* processor exception */ -#define INTR_TYPE_SOFT_INTR (4 << 8) /* software interrupt */ -#define INTR_TYPE_PRIV_SW_EXCEPTION (5 << 8) /* ICE breakpoint - undocumented */ -#define INTR_TYPE_SOFT_EXCEPTION (6 << 8) /* software exception */ -#define INTR_TYPE_OTHER_EVENT (7 << 8) /* other event */ +#define INTR_TYPE_EXT_INTR (EVENT_TYPE_EXTINT << 8) /* external interrupt */ +#define INTR_TYPE_RESERVED (EVENT_TYPE_RESERVED << 8) /* reserved */ +#define INTR_TYPE_NMI_INTR (EVENT_TYPE_NMI << 8) /* NMI */ +#define INTR_TYPE_HARD_EXCEPTION (EVENT_TYPE_HWEXC << 8) /* processor exception */ +#define INTR_TYPE_SOFT_INTR (EVENT_TYPE_SWINT << 8) /* software interrupt */ +#define INTR_TYPE_PRIV_SW_EXCEPTION (EVENT_TYPE_PRIV_SWEXC << 8) /* ICE breakpoint */ +#define INTR_TYPE_SOFT_EXCEPTION (EVENT_TYPE_SWEXC << 8) /* software exception */ +#define INTR_TYPE_OTHER_EVENT (EVENT_TYPE_OTHER << 8) /* other event */ /* GUEST_INTERRUPTIBILITY_INFO flags. */ #define GUEST_INTR_STATE_STI 0x00000001 @@ -489,15 +528,17 @@ enum vmcs_field { #define VMX_EPTP_PWL_4 0x18ull #define VMX_EPTP_PWL_5 0x20ull #define VMX_EPTP_AD_ENABLE_BIT (1ull << 6) +/* The EPTP memtype is encoded in bits 2:0, i.e. doesn't need to be shifted. 
*/ #define VMX_EPTP_MT_MASK 0x7ull -#define VMX_EPTP_MT_WB 0x6ull -#define VMX_EPTP_MT_UC 0x0ull +#define VMX_EPTP_MT_WB X86_MEMTYPE_WB +#define VMX_EPTP_MT_UC X86_MEMTYPE_UC #define VMX_EPT_READABLE_MASK 0x1ull #define VMX_EPT_WRITABLE_MASK 0x2ull #define VMX_EPT_EXECUTABLE_MASK 0x4ull #define VMX_EPT_IPAT_BIT (1ull << 6) #define VMX_EPT_ACCESS_BIT (1ull << 8) #define VMX_EPT_DIRTY_BIT (1ull << 9) +#define VMX_EPT_SUPPRESS_VE_BIT (1ull << 63) #define VMX_EPT_RWX_MASK (VMX_EPT_READABLE_MASK | \ VMX_EPT_WRITABLE_MASK | \ VMX_EPT_EXECUTABLE_MASK) @@ -540,18 +581,28 @@ enum vm_entry_failure_code { /* * Exit Qualifications for EPT Violations */ -#define EPT_VIOLATION_ACC_READ_BIT 0 -#define EPT_VIOLATION_ACC_WRITE_BIT 1 -#define EPT_VIOLATION_ACC_INSTR_BIT 2 -#define EPT_VIOLATION_RWX_SHIFT 3 -#define EPT_VIOLATION_GVA_IS_VALID_BIT 7 -#define EPT_VIOLATION_GVA_TRANSLATED_BIT 8 -#define EPT_VIOLATION_ACC_READ (1 << EPT_VIOLATION_ACC_READ_BIT) -#define EPT_VIOLATION_ACC_WRITE (1 << EPT_VIOLATION_ACC_WRITE_BIT) -#define EPT_VIOLATION_ACC_INSTR (1 << EPT_VIOLATION_ACC_INSTR_BIT) -#define EPT_VIOLATION_RWX_MASK (VMX_EPT_RWX_MASK << EPT_VIOLATION_RWX_SHIFT) -#define EPT_VIOLATION_GVA_IS_VALID (1 << EPT_VIOLATION_GVA_IS_VALID_BIT) -#define EPT_VIOLATION_GVA_TRANSLATED (1 << EPT_VIOLATION_GVA_TRANSLATED_BIT) +#define EPT_VIOLATION_ACC_READ BIT(0) +#define EPT_VIOLATION_ACC_WRITE BIT(1) +#define EPT_VIOLATION_ACC_INSTR BIT(2) +#define EPT_VIOLATION_PROT_READ BIT(3) +#define EPT_VIOLATION_PROT_WRITE BIT(4) +#define EPT_VIOLATION_PROT_EXEC BIT(5) +#define EPT_VIOLATION_EXEC_FOR_RING3_LIN BIT(6) +#define EPT_VIOLATION_PROT_MASK (EPT_VIOLATION_PROT_READ | \ + EPT_VIOLATION_PROT_WRITE | \ + EPT_VIOLATION_PROT_EXEC) +#define EPT_VIOLATION_GVA_IS_VALID BIT(7) +#define EPT_VIOLATION_GVA_TRANSLATED BIT(8) + +#define EPT_VIOLATION_RWX_TO_PROT(__epte) (((__epte) & VMX_EPT_RWX_MASK) << 3) + +static_assert(EPT_VIOLATION_RWX_TO_PROT(VMX_EPT_RWX_MASK) == + (EPT_VIOLATION_PROT_READ | 
EPT_VIOLATION_PROT_WRITE | EPT_VIOLATION_PROT_EXEC)); + +/* + * Exit Qualifications for NOTIFY VM EXIT + */ +#define NOTIFY_VM_CONTEXT_INVALID BIT(0) /* * VM-instruction error numbers @@ -609,4 +660,13 @@ enum vmx_l1d_flush_state { extern enum vmx_l1d_flush_state l1tf_vmx_mitigation; +struct vmx_ve_information { + u32 exit_reason; + u32 delivery; + u64 exit_qualification; + u64 guest_linear_address; + u64 guest_physical_address; + u16 eptp_index; +}; + #endif diff --git a/arch/x86/include/asm/vmxfeatures.h b/arch/x86/include/asm/vmxfeatures.h index d9a74681a77d..09b1d7e607c1 100644 --- a/arch/x86/include/asm/vmxfeatures.h +++ b/arch/x86/include/asm/vmxfeatures.h @@ -5,84 +5,89 @@ /* * Defines VMX CPU feature bits */ -#define NVMXINTS 3 /* N 32-bit words worth of info */ +#define NVMXINTS 5 /* N 32-bit words worth of info */ /* * Note: If the comment begins with a quoted string, that string is used - * in /proc/cpuinfo instead of the macro name. If the string is "", - * this feature bit is not displayed in /proc/cpuinfo at all. + * in /proc/cpuinfo instead of the macro name. Otherwise, this feature bit + * is not displayed in /proc/cpuinfo at all. 
*/ /* Pin-Based VM-Execution Controls, EPT/VPID, APIC and VM-Functions, word 0 */ -#define VMX_FEATURE_INTR_EXITING ( 0*32+ 0) /* "" VM-Exit on vectored interrupts */ -#define VMX_FEATURE_NMI_EXITING ( 0*32+ 3) /* "" VM-Exit on NMIs */ +#define VMX_FEATURE_INTR_EXITING ( 0*32+ 0) /* VM-Exit on vectored interrupts */ +#define VMX_FEATURE_NMI_EXITING ( 0*32+ 3) /* VM-Exit on NMIs */ #define VMX_FEATURE_VIRTUAL_NMIS ( 0*32+ 5) /* "vnmi" NMI virtualization */ -#define VMX_FEATURE_PREEMPTION_TIMER ( 0*32+ 6) /* VMX Preemption Timer */ -#define VMX_FEATURE_POSTED_INTR ( 0*32+ 7) /* Posted Interrupts */ +#define VMX_FEATURE_PREEMPTION_TIMER ( 0*32+ 6) /* "preemption_timer" VMX Preemption Timer */ +#define VMX_FEATURE_POSTED_INTR ( 0*32+ 7) /* "posted_intr" Posted Interrupts */ /* EPT/VPID features, scattered to bits 16-23 */ -#define VMX_FEATURE_INVVPID ( 0*32+ 16) /* INVVPID is supported */ +#define VMX_FEATURE_INVVPID ( 0*32+ 16) /* "invvpid" INVVPID is supported */ #define VMX_FEATURE_EPT_EXECUTE_ONLY ( 0*32+ 17) /* "ept_x_only" EPT entries can be execute only */ -#define VMX_FEATURE_EPT_AD ( 0*32+ 18) /* EPT Accessed/Dirty bits */ -#define VMX_FEATURE_EPT_1GB ( 0*32+ 19) /* 1GB EPT pages */ +#define VMX_FEATURE_EPT_AD ( 0*32+ 18) /* "ept_ad" EPT Accessed/Dirty bits */ +#define VMX_FEATURE_EPT_1GB ( 0*32+ 19) /* "ept_1gb" 1GB EPT pages */ +#define VMX_FEATURE_EPT_5LEVEL ( 0*32+ 20) /* "ept_5level" 5-level EPT paging */ /* Aggregated APIC features 24-27 */ -#define VMX_FEATURE_FLEXPRIORITY ( 0*32+ 24) /* TPR shadow + virt APIC */ -#define VMX_FEATURE_APICV ( 0*32+ 25) /* TPR shadow + APIC reg virt + virt intr delivery + posted interrupts */ +#define VMX_FEATURE_FLEXPRIORITY ( 0*32+ 24) /* "flexpriority" TPR shadow + virt APIC */ +#define VMX_FEATURE_APICV ( 0*32+ 25) /* "apicv" TPR shadow + APIC reg virt + virt intr delivery + posted interrupts */ /* VM-Functions, shifted to bits 28-31 */ -#define VMX_FEATURE_EPTP_SWITCHING ( 0*32+ 28) /* EPTP switching (in guest) */ 
+#define VMX_FEATURE_EPTP_SWITCHING ( 0*32+ 28) /* "eptp_switching" EPTP switching (in guest) */ /* Primary Processor-Based VM-Execution Controls, word 1 */ -#define VMX_FEATURE_INTR_WINDOW_EXITING ( 1*32+ 2) /* "" VM-Exit if INTRs are unblocked in guest */ +#define VMX_FEATURE_INTR_WINDOW_EXITING ( 1*32+ 2) /* VM-Exit if INTRs are unblocked in guest */ #define VMX_FEATURE_USE_TSC_OFFSETTING ( 1*32+ 3) /* "tsc_offset" Offset hardware TSC when read in guest */ -#define VMX_FEATURE_HLT_EXITING ( 1*32+ 7) /* "" VM-Exit on HLT */ -#define VMX_FEATURE_INVLPG_EXITING ( 1*32+ 9) /* "" VM-Exit on INVLPG */ -#define VMX_FEATURE_MWAIT_EXITING ( 1*32+ 10) /* "" VM-Exit on MWAIT */ -#define VMX_FEATURE_RDPMC_EXITING ( 1*32+ 11) /* "" VM-Exit on RDPMC */ -#define VMX_FEATURE_RDTSC_EXITING ( 1*32+ 12) /* "" VM-Exit on RDTSC */ -#define VMX_FEATURE_CR3_LOAD_EXITING ( 1*32+ 15) /* "" VM-Exit on writes to CR3 */ -#define VMX_FEATURE_CR3_STORE_EXITING ( 1*32+ 16) /* "" VM-Exit on reads from CR3 */ -#define VMX_FEATURE_CR8_LOAD_EXITING ( 1*32+ 19) /* "" VM-Exit on writes to CR8 */ -#define VMX_FEATURE_CR8_STORE_EXITING ( 1*32+ 20) /* "" VM-Exit on reads from CR8 */ +#define VMX_FEATURE_HLT_EXITING ( 1*32+ 7) /* VM-Exit on HLT */ +#define VMX_FEATURE_INVLPG_EXITING ( 1*32+ 9) /* VM-Exit on INVLPG */ +#define VMX_FEATURE_MWAIT_EXITING ( 1*32+ 10) /* VM-Exit on MWAIT */ +#define VMX_FEATURE_RDPMC_EXITING ( 1*32+ 11) /* VM-Exit on RDPMC */ +#define VMX_FEATURE_RDTSC_EXITING ( 1*32+ 12) /* VM-Exit on RDTSC */ +#define VMX_FEATURE_CR3_LOAD_EXITING ( 1*32+ 15) /* VM-Exit on writes to CR3 */ +#define VMX_FEATURE_CR3_STORE_EXITING ( 1*32+ 16) /* VM-Exit on reads from CR3 */ +#define VMX_FEATURE_TERTIARY_CONTROLS ( 1*32+ 17) /* Enable Tertiary VM-Execution Controls */ +#define VMX_FEATURE_CR8_LOAD_EXITING ( 1*32+ 19) /* VM-Exit on writes to CR8 */ +#define VMX_FEATURE_CR8_STORE_EXITING ( 1*32+ 20) /* VM-Exit on reads from CR8 */ #define VMX_FEATURE_VIRTUAL_TPR ( 1*32+ 21) /* "vtpr" TPR 
virtualization, a.k.a. TPR shadow */ -#define VMX_FEATURE_NMI_WINDOW_EXITING ( 1*32+ 22) /* "" VM-Exit if NMIs are unblocked in guest */ -#define VMX_FEATURE_MOV_DR_EXITING ( 1*32+ 23) /* "" VM-Exit on accesses to debug registers */ -#define VMX_FEATURE_UNCOND_IO_EXITING ( 1*32+ 24) /* "" VM-Exit on *all* IN{S} and OUT{S}*/ -#define VMX_FEATURE_USE_IO_BITMAPS ( 1*32+ 25) /* "" VM-Exit based on I/O port */ +#define VMX_FEATURE_NMI_WINDOW_EXITING ( 1*32+ 22) /* VM-Exit if NMIs are unblocked in guest */ +#define VMX_FEATURE_MOV_DR_EXITING ( 1*32+ 23) /* VM-Exit on accesses to debug registers */ +#define VMX_FEATURE_UNCOND_IO_EXITING ( 1*32+ 24) /* VM-Exit on *all* IN{S} and OUT{S}*/ +#define VMX_FEATURE_USE_IO_BITMAPS ( 1*32+ 25) /* VM-Exit based on I/O port */ #define VMX_FEATURE_MONITOR_TRAP_FLAG ( 1*32+ 27) /* "mtf" VMX single-step VM-Exits */ -#define VMX_FEATURE_USE_MSR_BITMAPS ( 1*32+ 28) /* "" VM-Exit based on MSR index */ -#define VMX_FEATURE_MONITOR_EXITING ( 1*32+ 29) /* "" VM-Exit on MONITOR (MWAIT's accomplice) */ -#define VMX_FEATURE_PAUSE_EXITING ( 1*32+ 30) /* "" VM-Exit on PAUSE (unconditionally) */ -#define VMX_FEATURE_SEC_CONTROLS ( 1*32+ 31) /* "" Enable Secondary VM-Execution Controls */ +#define VMX_FEATURE_USE_MSR_BITMAPS ( 1*32+ 28) /* VM-Exit based on MSR index */ +#define VMX_FEATURE_MONITOR_EXITING ( 1*32+ 29) /* VM-Exit on MONITOR (MWAIT's accomplice) */ +#define VMX_FEATURE_PAUSE_EXITING ( 1*32+ 30) /* VM-Exit on PAUSE (unconditionally) */ +#define VMX_FEATURE_SEC_CONTROLS ( 1*32+ 31) /* Enable Secondary VM-Execution Controls */ /* Secondary Processor-Based VM-Execution Controls, word 2 */ #define VMX_FEATURE_VIRT_APIC_ACCESSES ( 2*32+ 0) /* "vapic" Virtualize memory mapped APIC accesses */ -#define VMX_FEATURE_EPT ( 2*32+ 1) /* Extended Page Tables, a.k.a. 
Two-Dimensional Paging */ -#define VMX_FEATURE_DESC_EXITING ( 2*32+ 2) /* "" VM-Exit on {S,L}*DT instructions */ -#define VMX_FEATURE_RDTSCP ( 2*32+ 3) /* "" Enable RDTSCP in guest */ -#define VMX_FEATURE_VIRTUAL_X2APIC ( 2*32+ 4) /* "" Virtualize X2APIC for the guest */ -#define VMX_FEATURE_VPID ( 2*32+ 5) /* Virtual Processor ID (TLB ASID modifier) */ -#define VMX_FEATURE_WBINVD_EXITING ( 2*32+ 6) /* "" VM-Exit on WBINVD */ -#define VMX_FEATURE_UNRESTRICTED_GUEST ( 2*32+ 7) /* Allow Big Real Mode and other "invalid" states */ +#define VMX_FEATURE_EPT ( 2*32+ 1) /* "ept" Extended Page Tables, a.k.a. Two-Dimensional Paging */ +#define VMX_FEATURE_DESC_EXITING ( 2*32+ 2) /* VM-Exit on {S,L}*DT instructions */ +#define VMX_FEATURE_RDTSCP ( 2*32+ 3) /* Enable RDTSCP in guest */ +#define VMX_FEATURE_VIRTUAL_X2APIC ( 2*32+ 4) /* Virtualize X2APIC for the guest */ +#define VMX_FEATURE_VPID ( 2*32+ 5) /* "vpid" Virtual Processor ID (TLB ASID modifier) */ +#define VMX_FEATURE_WBINVD_EXITING ( 2*32+ 6) /* VM-Exit on WBINVD */ +#define VMX_FEATURE_UNRESTRICTED_GUEST ( 2*32+ 7) /* "unrestricted_guest" Allow Big Real Mode and other "invalid" states */ #define VMX_FEATURE_APIC_REGISTER_VIRT ( 2*32+ 8) /* "vapic_reg" Hardware emulation of reads to the virtual-APIC */ #define VMX_FEATURE_VIRT_INTR_DELIVERY ( 2*32+ 9) /* "vid" Evaluation and delivery of pending virtual interrupts */ #define VMX_FEATURE_PAUSE_LOOP_EXITING ( 2*32+ 10) /* "ple" Conditionally VM-Exit on PAUSE at CPL0 */ -#define VMX_FEATURE_RDRAND_EXITING ( 2*32+ 11) /* "" VM-Exit on RDRAND*/ -#define VMX_FEATURE_INVPCID ( 2*32+ 12) /* "" Enable INVPCID in guest */ -#define VMX_FEATURE_VMFUNC ( 2*32+ 13) /* "" Enable VM-Functions (leaf dependent) */ -#define VMX_FEATURE_SHADOW_VMCS ( 2*32+ 14) /* VMREAD/VMWRITE in guest can access shadow VMCS */ -#define VMX_FEATURE_ENCLS_EXITING ( 2*32+ 15) /* "" VM-Exit on ENCLS (leaf dependent) */ -#define VMX_FEATURE_RDSEED_EXITING ( 2*32+ 16) /* "" VM-Exit on RDSEED */ +#define 
VMX_FEATURE_RDRAND_EXITING ( 2*32+ 11) /* VM-Exit on RDRAND*/ +#define VMX_FEATURE_INVPCID ( 2*32+ 12) /* Enable INVPCID in guest */ +#define VMX_FEATURE_VMFUNC ( 2*32+ 13) /* Enable VM-Functions (leaf dependent) */ +#define VMX_FEATURE_SHADOW_VMCS ( 2*32+ 14) /* "shadow_vmcs" VMREAD/VMWRITE in guest can access shadow VMCS */ +#define VMX_FEATURE_ENCLS_EXITING ( 2*32+ 15) /* VM-Exit on ENCLS (leaf dependent) */ +#define VMX_FEATURE_RDSEED_EXITING ( 2*32+ 16) /* VM-Exit on RDSEED */ #define VMX_FEATURE_PAGE_MOD_LOGGING ( 2*32+ 17) /* "pml" Log dirty pages into buffer */ -#define VMX_FEATURE_EPT_VIOLATION_VE ( 2*32+ 18) /* "" Conditionally reflect EPT violations as #VE exceptions */ -#define VMX_FEATURE_PT_CONCEAL_VMX ( 2*32+ 19) /* "" Suppress VMX indicators in Processor Trace */ -#define VMX_FEATURE_XSAVES ( 2*32+ 20) /* "" Enable XSAVES and XRSTORS in guest */ +#define VMX_FEATURE_EPT_VIOLATION_VE ( 2*32+ 18) /* "ept_violation_ve" Conditionally reflect EPT violations as #VE exceptions */ +#define VMX_FEATURE_PT_CONCEAL_VMX ( 2*32+ 19) /* Suppress VMX indicators in Processor Trace */ +#define VMX_FEATURE_XSAVES ( 2*32+ 20) /* Enable XSAVES and XRSTORS in guest */ #define VMX_FEATURE_MODE_BASED_EPT_EXEC ( 2*32+ 22) /* "ept_mode_based_exec" Enable separate EPT EXEC bits for supervisor vs. 
user */ -#define VMX_FEATURE_PT_USE_GPA ( 2*32+ 24) /* "" Processor Trace logs GPAs */ -#define VMX_FEATURE_TSC_SCALING ( 2*32+ 25) /* Scale hardware TSC when read in guest */ -#define VMX_FEATURE_USR_WAIT_PAUSE ( 2*32+ 26) /* Enable TPAUSE, UMONITOR, UMWAIT in guest */ -#define VMX_FEATURE_ENCLV_EXITING ( 2*32+ 28) /* "" VM-Exit on ENCLV (leaf dependent) */ -#define VMX_FEATURE_BUS_LOCK_DETECTION ( 2*32+ 30) /* "" VM-Exit when bus lock caused */ +#define VMX_FEATURE_PT_USE_GPA ( 2*32+ 24) /* Processor Trace logs GPAs */ +#define VMX_FEATURE_TSC_SCALING ( 2*32+ 25) /* "tsc_scaling" Scale hardware TSC when read in guest */ +#define VMX_FEATURE_USR_WAIT_PAUSE ( 2*32+ 26) /* "usr_wait_pause" Enable TPAUSE, UMONITOR, UMWAIT in guest */ +#define VMX_FEATURE_ENCLV_EXITING ( 2*32+ 28) /* VM-Exit on ENCLV (leaf dependent) */ +#define VMX_FEATURE_BUS_LOCK_DETECTION ( 2*32+ 30) /* VM-Exit when bus lock caused */ +#define VMX_FEATURE_NOTIFY_VM_EXITING ( 2*32+ 31) /* "notify_vm_exiting" VM-Exit when no event windows after notify window */ +/* Tertiary Processor-Based VM-Execution Controls, word 3 */ +#define VMX_FEATURE_IPI_VIRT ( 3*32+ 4) /* "ipi_virt" Enable IPI virtualization */ #endif /* _ASM_X86_VMXFEATURES_H */ diff --git a/arch/x86/include/asm/vsyscall.h b/arch/x86/include/asm/vsyscall.h index ab60a71a8dcb..472f0263dbc6 100644 --- a/arch/x86/include/asm/vsyscall.h +++ b/arch/x86/include/asm/vsyscall.h @@ -4,6 +4,7 @@ #include <linux/seqlock.h> #include <uapi/asm/vsyscall.h> +#include <asm/page_types.h> #ifdef CONFIG_X86_VSYSCALL_EMULATION extern void map_vsyscall(void); @@ -24,4 +25,13 @@ static inline bool emulate_vsyscall(unsigned long error_code, } #endif +/* + * The (legacy) vsyscall page is the long page in the kernel portion + * of the address space that has user-accessible permissions. 
+ */ +static inline bool is_vsyscall_vaddr(unsigned long vaddr) +{ + return unlikely((vaddr & PAGE_MASK) == VSYSCALL_ADDR); +} + #endif /* _ASM_X86_VSYSCALL_H */ diff --git a/arch/x86/include/asm/vvar.h b/arch/x86/include/asm/vvar.h deleted file mode 100644 index 183e98e49ab9..000000000000 --- a/arch/x86/include/asm/vvar.h +++ /dev/null @@ -1,55 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0-only */ -/* - * vvar.h: Shared vDSO/kernel variable declarations - * Copyright (c) 2011 Andy Lutomirski - * - * A handful of variables are accessible (read-only) from userspace - * code in the vsyscall page and the vdso. They are declared here. - * Some other file must define them with DEFINE_VVAR. - * - * In normal kernel code, they are used like any other variable. - * In user code, they are accessed through the VVAR macro. - * - * These variables live in a page of kernel data that has an extra RO - * mapping for userspace. Each variable needs a unique offset within - * that page; specify that offset with the DECLARE_VVAR macro. (If - * you mess up, the linker will catch it.) - */ - -#ifndef _ASM_X86_VVAR_H -#define _ASM_X86_VVAR_H - -#ifdef EMIT_VVAR -/* - * EMIT_VVAR() is used by the kernel linker script to put vvars in the - * right place. Also, it's used by kernel code to import offsets values. 
- */ -#define DECLARE_VVAR(offset, type, name) \ - EMIT_VVAR(name, offset) - -#else - -extern char __vvar_page; - -#define DECLARE_VVAR(offset, type, name) \ - extern type vvar_ ## name[CS_BASES] \ - __attribute__((visibility("hidden"))); \ - extern type timens_ ## name[CS_BASES] \ - __attribute__((visibility("hidden"))); \ - -#define VVAR(name) (vvar_ ## name) -#define TIMENS(name) (timens_ ## name) - -#define DEFINE_VVAR(type, name) \ - type name[CS_BASES] \ - __attribute__((section(".vvar_" #name), aligned(16))) __visible - -#endif - -/* DECLARE_VVAR(offset, type, name) */ - -DECLARE_VVAR(128, struct vdso_data, _vdso_data) - -#undef DECLARE_VVAR - -#endif diff --git a/arch/x86/include/asm/word-at-a-time.h b/arch/x86/include/asm/word-at-a-time.h index 8338b0432b50..422a47746657 100644 --- a/arch/x86/include/asm/word-at-a-time.h +++ b/arch/x86/include/asm/word-at-a-time.h @@ -2,47 +2,15 @@ #ifndef _ASM_WORD_AT_A_TIME_H #define _ASM_WORD_AT_A_TIME_H -#include <linux/kernel.h> +#include <linux/bitops.h> +#include <linux/wordpart.h> -/* - * This is largely generic for little-endian machines, but the - * optimal byte mask counting is probably going to be something - * that is architecture-specific. If you have a reliably fast - * bit count instruction, that might be better than the multiply - * and shift, for example. - */ struct word_at_a_time { const unsigned long one_bits, high_bits; }; #define WORD_AT_A_TIME_CONSTANTS { REPEAT_BYTE(0x01), REPEAT_BYTE(0x80) } -#ifdef CONFIG_64BIT - -/* - * Jan Achrenius on G+: microoptimized version of - * the simpler "(mask & ONEBYTES) * ONEBYTES >> 56" - * that works for the bytemasks without having to - * mask them first. 
- */ -static inline long count_masked_bytes(unsigned long mask) -{ - return mask*0x0001020304050608ul >> 56; -} - -#else /* 32-bit case */ - -/* Carl Chatfield / Jan Achrenius G+ version for 32-bit */ -static inline long count_masked_bytes(long mask) -{ - /* (000000 0000ff 00ffff ffffff) -> ( 1 1 2 3 ) */ - long a = (0x0ff0001+mask) >> 23; - /* Fix the 1 for 00 case */ - return a & mask; -} - -#endif - /* Return nonzero if it has a zero */ static inline unsigned long has_zero(unsigned long a, unsigned long *bits, const struct word_at_a_time *c) { @@ -56,6 +24,22 @@ static inline unsigned long prep_zero_mask(unsigned long a, unsigned long bits, return bits; } +#ifdef CONFIG_64BIT + +/* Keep the initial has_zero() value for both bitmask and size calc */ +#define create_zero_mask(bits) (bits) + +static inline unsigned long zero_bytemask(unsigned long bits) +{ + bits = (bits - 1) & ~bits; + return bits >> 7; +} + +#define find_zero(bits) (__ffs(bits) >> 3) + +#else + +/* Create the final mask for both bytemask and size */ static inline unsigned long create_zero_mask(unsigned long bits) { bits = (bits - 1) & ~bits; @@ -65,11 +49,17 @@ static inline unsigned long create_zero_mask(unsigned long bits) /* The mask we created is directly usable as a bytemask */ #define zero_bytemask(mask) (mask) +/* Carl Chatfield / Jan Achrenius G+ version for 32-bit */ static inline unsigned long find_zero(unsigned long mask) { - return count_masked_bytes(mask); + /* (000000 0000ff 00ffff ffffff) -> ( 1 1 2 3 ) */ + long a = (0x0ff0001+mask) >> 23; + /* Fix the 1 for 00 case */ + return a & mask; } +#endif + /* * Load an unaligned word from kernel space. * @@ -77,58 +67,18 @@ static inline unsigned long find_zero(unsigned long mask) * and the next page not being mapped, take the exception and * return zeroes in the non-existing part. 
*/ -#ifdef CONFIG_CC_HAS_ASM_GOTO_OUTPUT - static inline unsigned long load_unaligned_zeropad(const void *addr) { - unsigned long offset, data; unsigned long ret; - asm_volatile_goto( + asm volatile( "1: mov %[mem], %[ret]\n" - - _ASM_EXTABLE(1b, %l[do_exception]) - - : [ret] "=r" (ret) - : [mem] "m" (*(unsigned long *)addr) - : : do_exception); - - return ret; - -do_exception: - offset = (unsigned long)addr & (sizeof(long) - 1); - addr = (void *)((unsigned long)addr & ~(sizeof(long) - 1)); - data = *(unsigned long *)addr; - ret = data >> offset * 8; - - return ret; -} - -#else /* !CONFIG_CC_HAS_ASM_GOTO_OUTPUT */ - -static inline unsigned long load_unaligned_zeropad(const void *addr) -{ - unsigned long offset, data; - unsigned long ret, err = 0; - - asm( "1: mov %[mem], %[ret]\n" "2:\n" - - _ASM_EXTABLE_FAULT(1b, 2b) - - : [ret] "=&r" (ret), "+a" (err) + _ASM_EXTABLE_TYPE(1b, 2b, EX_TYPE_ZEROPAD) + : [ret] "=r" (ret) : [mem] "m" (*(unsigned long *)addr)); - if (unlikely(err)) { - offset = (unsigned long)addr & (sizeof(long) - 1); - addr = (void *)((unsigned long)addr & ~(sizeof(long) - 1)); - data = *(unsigned long *)addr; - ret = data >> offset * 8; - } - return ret; } -#endif /* CONFIG_CC_HAS_ASM_GOTO_OUTPUT */ - #endif /* _ASM_WORD_AT_A_TIME_H */ diff --git a/arch/x86/include/asm/x86_init.h b/arch/x86/include/asm/x86_init.h index e9170457697e..36698cc9fb44 100644 --- a/arch/x86/include/asm/x86_init.h +++ b/arch/x86/include/asm/x86_init.h @@ -2,8 +2,6 @@ #ifndef _ASM_X86_PLATFORM_H #define _ASM_X86_PLATFORM_H -#include <asm/bootparam.h> - struct ghcb; struct mpc_bus; struct mpc_cpu; @@ -15,13 +13,15 @@ struct irq_domain; /** * struct x86_init_mpparse - platform specific mpparse ops * @setup_ioapic_ids: platform specific ioapic id override - * @find_smp_config: find the smp configuration - * @get_smp_config: get the smp configuration + * @find_mptable: Find MPTABLE early to reserve the memory region + * @early_parse_smp_cfg: Parse the SMP configuration data early 
before initmem_init() + * @parse_smp_cfg: Parse the SMP configuration data */ struct x86_init_mpparse { void (*setup_ioapic_ids)(void); - void (*find_smp_config)(void); - void (*get_smp_config)(unsigned int early); + void (*find_mptable)(void); + void (*early_parse_smp_cfg)(void); + void (*parse_smp_cfg)(void); }; /** @@ -30,12 +30,13 @@ struct x86_init_mpparse { * @reserve_resources: reserve the standard resources for the * platform * @memory_setup: platform specific memory setup - * + * @dmi_setup: platform specific DMI setup */ struct x86_init_resources { void (*probe_roms)(void); void (*reserve_resources)(void); char *(*memory_setup)(void); + void (*dmi_setup)(void); }; /** @@ -148,12 +149,22 @@ struct x86_init_acpi { * @enc_status_change_finish Notify HV after the encryption status of a range is changed * @enc_tlb_flush_required Returns true if a TLB flush is needed before changing page encryption status * @enc_cache_flush_required Returns true if a cache flush is needed before changing page encryption status + * @enc_kexec_begin Begin the two-step process of converting shared memory back + * to private. It stops the new conversions from being started + * and waits in-flight conversions to finish, if possible. + * @enc_kexec_finish Finish the two-step process of converting shared memory to + * private. All memory is private after the call when + * the function returns. + * It is called on only one CPU while the others are shut down + * and with interrupts disabled. 
*/ struct x86_guest { - void (*enc_status_change_prepare)(unsigned long vaddr, int npages, bool enc); - bool (*enc_status_change_finish)(unsigned long vaddr, int npages, bool enc); + int (*enc_status_change_prepare)(unsigned long vaddr, int npages, bool enc); + int (*enc_status_change_finish)(unsigned long vaddr, int npages, bool enc); bool (*enc_tlb_flush_required)(bool enc); bool (*enc_cache_flush_required)(void); + void (*enc_kexec_begin)(void); + void (*enc_kexec_finish)(void); }; /** @@ -177,11 +188,14 @@ struct x86_init_ops { * struct x86_cpuinit_ops - platform specific cpu hotplug setups * @setup_percpu_clockev: set up the per cpu clock event device * @early_percpu_clock_init: early init of the per cpu clock event device + * @fixup_cpu_id: fixup function for cpuinfo_x86::topo.pkg_id + * @parallel_bringup: Parallel bringup control */ struct x86_cpuinit_ops { void (*setup_percpu_clockev)(void); void (*early_percpu_clock_init)(void); void (*fixup_cpu_id)(struct cpuinfo_x86 *c, int node); + bool parallel_bringup; }; struct timespec64; @@ -259,11 +273,15 @@ struct x86_legacy_features { * VMMCALL under SEV-ES. Needs to return 'false' * if the checks fail. Called from the #VC * exception handler. + * @is_private_mmio: For CoCo VMs, must map MMIO address as private. + * Used when device is emulated by a paravisor + * layer in the VM context. 
*/ struct x86_hyper_runtime { void (*pin_vcpu)(int cpu); void (*sev_es_hcall_prepare)(struct ghcb *ghcb, struct pt_regs *regs); bool (*sev_es_hcall_finish)(struct ghcb *ghcb, struct pt_regs *regs); + bool (*is_private_mmio)(u64 addr); }; /** @@ -274,6 +292,7 @@ struct x86_hyper_runtime { * @set_wallclock: set time back to HW clock * @is_untracked_pat_range exclude from PAT logic * @nmi_init enable NMI on cpus + * @get_nmi_reason get the reason an NMI was received * @save_sched_clock_state: save state for sched_clock() on suspend * @restore_sched_clock_state: restore state for sched_clock() on resume * @apic_post_init: adjust apic if needed @@ -285,6 +304,8 @@ struct x86_hyper_runtime { * possible in x86_early_init_platform_quirks() by * only using the current x86_hardware_subarch * semantics. + * @realmode_reserve: reserve memory for realmode trampoline + * @realmode_init: initialize realmode trampoline * @hyper: x86 hypervisor specific runtime callbacks */ struct x86_platform_ops { @@ -301,6 +322,8 @@ struct x86_platform_ops { void (*apic_post_init)(void); struct x86_legacy_features legacy; void (*set_legacy_features)(void); + void (*realmode_reserve)(void); + void (*realmode_init)(void); struct x86_hyper_runtime hyper; struct x86_guest guest; }; @@ -322,5 +345,7 @@ extern void x86_init_uint_noop(unsigned int unused); extern bool bool_x86_init_noop(void); extern void x86_op_int_noop(int cpu); extern bool x86_pnpbios_disabled(void); +extern int set_rtc_noop(const struct timespec64 *now); +extern void get_rtc_noop(struct timespec64 *now); #endif diff --git a/arch/x86/include/asm/xen/cpuid.h b/arch/x86/include/asm/xen/cpuid.h index 78e667a31d6c..a3c29b1496c8 100644 --- a/arch/x86/include/asm/xen/cpuid.h +++ b/arch/x86/include/asm/xen/cpuid.h @@ -89,11 +89,21 @@ * Sub-leaf 2: EAX: host tsc frequency in kHz */ +#define XEN_CPUID_TSC_EMULATED (1u << 0) +#define XEN_CPUID_HOST_TSC_RELIABLE (1u << 1) +#define XEN_CPUID_RDTSCP_INSTR_AVAIL (1u << 2) + +#define 
XEN_CPUID_TSC_MODE_DEFAULT (0) +#define XEN_CPUID_TSC_MODE_ALWAYS_EMULATE (1u) +#define XEN_CPUID_TSC_MODE_NEVER_EMULATE (2u) +#define XEN_CPUID_TSC_MODE_PVRDTSCP (3u) + /* * Leaf 5 (0x40000x04) * HVM-specific features * Sub-leaf 0: EAX: Features * Sub-leaf 0: EBX: vcpu id (iff EAX has XEN_HVM_CPUID_VCPU_ID_PRESENT flag) + * Sub-leaf 0: ECX: domain id (iff EAX has XEN_HVM_CPUID_DOMID_PRESENT flag) */ #define XEN_HVM_CPUID_APIC_ACCESS_VIRT (1u << 0) /* Virtualized APIC registers */ #define XEN_HVM_CPUID_X2APIC_VIRT (1u << 1) /* Virtualized x2APIC accesses */ @@ -102,11 +112,17 @@ #define XEN_HVM_CPUID_VCPU_ID_PRESENT (1u << 3) /* vcpu id is present in EBX */ #define XEN_HVM_CPUID_DOMID_PRESENT (1u << 4) /* domid is present in ECX */ /* - * Bits 55:49 from the IO-APIC RTE and bits 11:5 from the MSI address can be - * used to store high bits for the Destination ID. This expands the Destination - * ID field from 8 to 15 bits, allowing to target APIC IDs up 32768. + * With interrupt format set to 0 (non-remappable) bits 55:49 from the + * IO-APIC RTE and bits 11:5 from the MSI address can be used to store + * high bits for the Destination ID. This expands the Destination ID + * field from 8 to 15 bits, allowing to target APIC IDs up 32768. */ #define XEN_HVM_CPUID_EXT_DEST_ID (1u << 5) +/* + * Per-vCPU event channel upcalls work correctly with physical IRQs + * bound to event channels. + */ +#define XEN_HVM_CPUID_UPCALL_VECTOR (1u << 6) /* * Leaf 6 (0x40000x05) diff --git a/arch/x86/include/asm/xen/events.h b/arch/x86/include/asm/xen/events.h index 068d9b067c83..62bdceb594f1 100644 --- a/arch/x86/include/asm/xen/events.h +++ b/arch/x86/include/asm/xen/events.h @@ -23,7 +23,7 @@ static inline int xen_irqs_disabled(struct pt_regs *regs) /* No need for a barrier -- XCHG is a barrier on x86. 
*/ #define xchg_xen_ulong(ptr, val) xchg((ptr), (val)) -extern int xen_have_vector_callback; +extern bool xen_have_vector_callback; /* * Events delivered via platform PCI interrupts are always @@ -34,4 +34,5 @@ static inline bool xen_support_evtchn_rebind(void) return (!xen_hvm_domain() || xen_have_vector_callback); } +extern bool xen_percpu_upcall; #endif /* _ASM_X86_XEN_EVENTS_H */ diff --git a/arch/x86/include/asm/xen/hypercall.h b/arch/x86/include/asm/xen/hypercall.h index e5e0fe10c692..59a62c3780a2 100644 --- a/arch/x86/include/asm/xen/hypercall.h +++ b/arch/x86/include/asm/xen/hypercall.h @@ -39,9 +39,11 @@ #include <linux/string.h> #include <linux/types.h> #include <linux/pgtable.h> +#include <linux/instrumentation.h> #include <trace/events/xen.h> +#include <asm/alternative.h> #include <asm/page.h> #include <asm/smap.h> #include <asm/nospec-branch.h> @@ -86,11 +88,20 @@ struct xen_dm_op_buf; * there aren't more than 5 arguments...) */ -extern struct { char _entry[32]; } hypercall_page[]; +void xen_hypercall_func(void); +DECLARE_STATIC_CALL(xen_hypercall, xen_hypercall_func); -#define __HYPERCALL "call hypercall_page+%c[offset]" -#define __HYPERCALL_ENTRY(x) \ - [offset] "i" (__HYPERVISOR_##x * sizeof(hypercall_page[0])) +#ifdef MODULE +#define __ADDRESSABLE_xen_hypercall +#else +#define __ADDRESSABLE_xen_hypercall __ADDRESSABLE_ASM_STR(__SCK__xen_hypercall) +#endif + +#define __HYPERCALL \ + __ADDRESSABLE_xen_hypercall \ + "call __SCT__xen_hypercall" + +#define __HYPERCALL_ENTRY(x) "a" (x) #ifdef CONFIG_X86_32 #define __HYPERCALL_RETREG "eax" @@ -148,7 +159,7 @@ extern struct { char _entry[32]; } hypercall_page[]; __HYPERCALL_0ARG(); \ asm volatile (__HYPERCALL \ : __HYPERCALL_0PARAM \ - : __HYPERCALL_ENTRY(name) \ + : __HYPERCALL_ENTRY(__HYPERVISOR_ ## name) \ : __HYPERCALL_CLOBBER0); \ (type)__res; \ }) @@ -159,7 +170,7 @@ extern struct { char _entry[32]; } hypercall_page[]; __HYPERCALL_1ARG(a1); \ asm volatile (__HYPERCALL \ : __HYPERCALL_1PARAM \ - : 
__HYPERCALL_ENTRY(name) \ + : __HYPERCALL_ENTRY(__HYPERVISOR_ ## name) \ : __HYPERCALL_CLOBBER1); \ (type)__res; \ }) @@ -170,7 +181,7 @@ extern struct { char _entry[32]; } hypercall_page[]; __HYPERCALL_2ARG(a1, a2); \ asm volatile (__HYPERCALL \ : __HYPERCALL_2PARAM \ - : __HYPERCALL_ENTRY(name) \ + : __HYPERCALL_ENTRY(__HYPERVISOR_ ## name) \ : __HYPERCALL_CLOBBER2); \ (type)__res; \ }) @@ -181,7 +192,7 @@ extern struct { char _entry[32]; } hypercall_page[]; __HYPERCALL_3ARG(a1, a2, a3); \ asm volatile (__HYPERCALL \ : __HYPERCALL_3PARAM \ - : __HYPERCALL_ENTRY(name) \ + : __HYPERCALL_ENTRY(__HYPERVISOR_ ## name) \ : __HYPERCALL_CLOBBER3); \ (type)__res; \ }) @@ -192,7 +203,7 @@ extern struct { char _entry[32]; } hypercall_page[]; __HYPERCALL_4ARG(a1, a2, a3, a4); \ asm volatile (__HYPERCALL \ : __HYPERCALL_4PARAM \ - : __HYPERCALL_ENTRY(name) \ + : __HYPERCALL_ENTRY(__HYPERVISOR_ ## name) \ : __HYPERCALL_CLOBBER4); \ (type)__res; \ }) @@ -206,12 +217,9 @@ xen_single_call(unsigned int call, __HYPERCALL_DECLS; __HYPERCALL_5ARG(a1, a2, a3, a4, a5); - if (call >= PAGE_SIZE / sizeof(hypercall_page[0])) - return -EINVAL; - - asm volatile(CALL_NOSPEC + asm volatile(__HYPERCALL : __HYPERCALL_5PARAM - : [thunk_target] "a" (&hypercall_page[call]) + : __HYPERCALL_ENTRY(call) : __HYPERCALL_CLOBBER5); return (long)__res; @@ -223,14 +231,12 @@ static __always_inline void __xen_stac(void) * Suppress objtool seeing the STAC/CLAC and getting confused about it * calling random code with AC=1. 
*/ - asm volatile(ANNOTATE_IGNORE_ALTERNATIVE - ASM_STAC ::: "memory", "flags"); + asm volatile(ASM_STAC_UNSAFE ::: "memory", "flags"); } static __always_inline void __xen_clac(void) { - asm volatile(ANNOTATE_IGNORE_ALTERNATIVE - ASM_CLAC ::: "memory", "flags"); + asm volatile(ASM_CLAC_UNSAFE ::: "memory", "flags"); } static inline long @@ -382,7 +388,7 @@ MULTI_stack_switch(struct multicall_entry *mcl, } #endif -static inline int +static __always_inline int HYPERVISOR_sched_op(int cmd, void *arg) { return _hypercall2(int, sched_op, cmd, arg); diff --git a/arch/x86/include/asm/xen/hypervisor.h b/arch/x86/include/asm/xen/hypervisor.h index 16f548a661cf..c2fc7869b996 100644 --- a/arch/x86/include/asm/xen/hypervisor.h +++ b/arch/x86/include/asm/xen/hypervisor.h @@ -36,11 +36,14 @@ extern struct shared_info *HYPERVISOR_shared_info; extern struct start_info *xen_start_info; +#include <asm/bug.h> #include <asm/processor.h> +#define XEN_SIGNATURE "XenVMMXenVMM" + static inline uint32_t xen_cpuid_base(void) { - return hypervisor_cpuid_base("XenVMMXenVMM", 2); + return cpuid_base_hypervisor(XEN_SIGNATURE, 2); } struct pci_dev; @@ -61,4 +64,38 @@ void __init xen_pvh_init(struct boot_params *boot_params); void __init mem_map_via_hcall(struct boot_params *boot_params_p); #endif +/* Lazy mode for batching updates / context switch */ +enum xen_lazy_mode { + XEN_LAZY_NONE, + XEN_LAZY_MMU, + XEN_LAZY_CPU, +}; + +DECLARE_PER_CPU(enum xen_lazy_mode, xen_lazy_mode); + +static inline void enter_lazy(enum xen_lazy_mode mode) +{ + BUG_ON(this_cpu_read(xen_lazy_mode) != XEN_LAZY_NONE); + + this_cpu_write(xen_lazy_mode, mode); +} + +static inline void leave_lazy(enum xen_lazy_mode mode) +{ + BUG_ON(this_cpu_read(xen_lazy_mode) != mode); + + this_cpu_write(xen_lazy_mode, XEN_LAZY_NONE); +} + +enum xen_lazy_mode xen_get_lazy_mode(void); + +#if defined(CONFIG_XEN_DOM0) && defined(CONFIG_ACPI) +void xen_sanitize_proc_cap_bits(uint32_t *buf); +#else +static inline void 
xen_sanitize_proc_cap_bits(uint32_t *buf) +{ + BUG(); +} +#endif + #endif /* _ASM_X86_XEN_HYPERVISOR_H */ diff --git a/arch/x86/include/asm/xen/interface.h b/arch/x86/include/asm/xen/interface.h index baca0b00ef76..a078a2b0f032 100644 --- a/arch/x86/include/asm/xen/interface.h +++ b/arch/x86/include/asm/xen/interface.h @@ -72,7 +72,7 @@ #endif #endif -#ifndef __ASSEMBLY__ +#ifndef __ASSEMBLER__ /* Explicitly size integers that represent pfns in the public interface * with Xen so that on ARM we can have one ABI that works for 32 and 64 * bit guests. */ @@ -137,7 +137,7 @@ DEFINE_GUEST_HANDLE(xen_ulong_t); #define TI_SET_DPL(_ti, _dpl) ((_ti)->flags |= (_dpl)) #define TI_SET_IF(_ti, _if) ((_ti)->flags |= ((!!(_if))<<2)) -#ifndef __ASSEMBLY__ +#ifndef __ASSEMBLER__ struct trap_info { uint8_t vector; /* exception vector */ uint8_t flags; /* 0-3: privilege level; 4: clear event enable? */ @@ -186,7 +186,7 @@ struct arch_shared_info { uint32_t wc_sec_hi; #endif }; -#endif /* !__ASSEMBLY__ */ +#endif /* !__ASSEMBLER__ */ #ifdef CONFIG_X86_32 #include <asm/xen/interface_32.h> @@ -196,7 +196,7 @@ struct arch_shared_info { #include <asm/pvclock-abi.h> -#ifndef __ASSEMBLY__ +#ifndef __ASSEMBLER__ /* * The following is all CPU context. Note that the fpu_ctxt block is filled * in by FXSAVE if the CPU has feature FXSR; otherwise FSAVE is used. @@ -376,7 +376,7 @@ struct xen_pmu_arch { } c; }; -#endif /* !__ASSEMBLY__ */ +#endif /* !__ASSEMBLER__ */ /* * Prefix forces emulation of some non-trapping instructions. 
diff --git a/arch/x86/include/asm/xen/interface_32.h b/arch/x86/include/asm/xen/interface_32.h index dc40578abded..74d9768a9cf7 100644 --- a/arch/x86/include/asm/xen/interface_32.h +++ b/arch/x86/include/asm/xen/interface_32.h @@ -44,7 +44,7 @@ */ #define __HYPERVISOR_VIRT_START 0xF5800000 -#ifndef __ASSEMBLY__ +#ifndef __ASSEMBLER__ struct cpu_user_regs { uint32_t ebx; @@ -85,7 +85,7 @@ typedef struct xen_callback xen_callback_t; #define XEN_CALLBACK(__cs, __eip) \ ((struct xen_callback){ .cs = (__cs), .eip = (unsigned long)(__eip) }) -#endif /* !__ASSEMBLY__ */ +#endif /* !__ASSEMBLER__ */ /* diff --git a/arch/x86/include/asm/xen/interface_64.h b/arch/x86/include/asm/xen/interface_64.h index c599ec269a25..38a19edb81a3 100644 --- a/arch/x86/include/asm/xen/interface_64.h +++ b/arch/x86/include/asm/xen/interface_64.h @@ -61,7 +61,7 @@ * RING1 -> RING3 kernel mode. * RING2 -> RING3 kernel mode. * RING3 -> RING3 user mode. - * However RING0 indicates that the guest kernel should return to iteself + * However RING0 indicates that the guest kernel should return to itself * directly with * orb $3,1*8(%rsp) * iretq @@ -77,7 +77,7 @@ #define VGCF_in_syscall (1<<_VGCF_in_syscall) #define VGCF_IN_SYSCALL VGCF_in_syscall -#ifndef __ASSEMBLY__ +#ifndef __ASSEMBLER__ struct iret_context { /* Top of stack (%rsp at point of hypercall). 
*/ @@ -143,7 +143,7 @@ typedef unsigned long xen_callback_t; #define XEN_CALLBACK(__cs, __rip) \ ((unsigned long)(__rip)) -#endif /* !__ASSEMBLY__ */ +#endif /* !__ASSEMBLER__ */ #endif /* _ASM_X86_XEN_INTERFACE_64_H */ diff --git a/arch/x86/include/asm/xen/page.h b/arch/x86/include/asm/xen/page.h index fa9ec20783fa..85e63d58c074 100644 --- a/arch/x86/include/asm/xen/page.h +++ b/arch/x86/include/asm/xen/page.h @@ -295,7 +295,10 @@ static inline unsigned long bfn_to_local_pfn(unsigned long mfn) /* VIRT <-> MACHINE conversion */ #define virt_to_machine(v) (phys_to_machine(XPADDR(__pa(v)))) -#define virt_to_pfn(v) (PFN_DOWN(__pa(v))) +static inline unsigned long virt_to_pfn(const void *v) +{ + return PFN_DOWN(__pa(v)); +} #define virt_to_mfn(v) (pfn_to_mfn(virt_to_pfn(v))) #define mfn_to_virt(m) (__va(mfn_to_pfn(m) << PAGE_SHIFT)) diff --git a/arch/x86/include/asm/xen/swiotlb-xen.h b/arch/x86/include/asm/xen/swiotlb-xen.h index 77a2d19cc990..abde0f44df57 100644 --- a/arch/x86/include/asm/xen/swiotlb-xen.h +++ b/arch/x86/include/asm/xen/swiotlb-xen.h @@ -2,12 +2,6 @@ #ifndef _ASM_X86_SWIOTLB_XEN_H #define _ASM_X86_SWIOTLB_XEN_H -#ifdef CONFIG_SWIOTLB_XEN -extern int pci_xen_swiotlb_init_late(void); -#else -static inline int pci_xen_swiotlb_init_late(void) { return -ENXIO; } -#endif - int xen_swiotlb_fixup(void *buf, unsigned long nslabs); int xen_create_contiguous_region(phys_addr_t pstart, unsigned int order, unsigned int address_bits, diff --git a/arch/x86/include/uapi/asm/amd_hsmp.h b/arch/x86/include/uapi/asm/amd_hsmp.h index 769b939444ae..92d8f256d096 100644 --- a/arch/x86/include/uapi/asm/amd_hsmp.h +++ b/arch/x86/include/uapi/asm/amd_hsmp.h @@ -47,6 +47,15 @@ enum hsmp_message_ids { HSMP_SET_PCI_RATE, /* 20h Control link rate on PCIe devices */ HSMP_SET_POWER_MODE, /* 21h Select power efficiency profile policy */ HSMP_SET_PSTATE_MAX_MIN, /* 22h Set the max and min DF P-State */ + HSMP_GET_METRIC_TABLE_VER, /* 23h Get metrics table version */ + 
HSMP_GET_METRIC_TABLE, /* 24h Get metrics table */ + HSMP_GET_METRIC_TABLE_DRAM_ADDR,/* 25h Get metrics table dram address */ + HSMP_SET_XGMI_PSTATE_RANGE, /* 26h Set xGMI P-state range */ + HSMP_CPU_RAIL_ISO_FREQ_POLICY, /* 27h Get/Set Cpu Iso frequency policy */ + HSMP_DFC_ENABLE_CTRL, /* 28h Enable/Disable DF C-state */ + HSMP_GET_RAPL_UNITS = 0x30, /* 30h Get scaling factor for energy */ + HSMP_GET_RAPL_CORE_COUNTER, /* 31h Get core energy counter value */ + HSMP_GET_RAPL_PACKAGE_COUNTER, /* 32h Get package energy counter value */ HSMP_MSG_ID_MAX, }; @@ -62,6 +71,16 @@ enum hsmp_msg_type { HSMP_RSVD = -1, HSMP_SET = 0, HSMP_GET = 1, + HSMP_SET_GET = 2, +}; + +enum hsmp_proto_versions { + HSMP_PROTO_VER2 = 2, + HSMP_PROTO_VER3, + HSMP_PROTO_VER4, + HSMP_PROTO_VER5, + HSMP_PROTO_VER6, + HSMP_PROTO_VER7 }; struct hsmp_msg_desc { @@ -77,7 +96,8 @@ struct hsmp_msg_desc { * * Not supported messages would return -ENOMSG. */ -static const struct hsmp_msg_desc hsmp_msg_desc_table[] = { +static const struct hsmp_msg_desc hsmp_msg_desc_table[] + __attribute__((unused)) = { /* RESERVED */ {0, 0, HSMP_RSVD}, @@ -227,7 +247,7 @@ static const struct hsmp_msg_desc hsmp_msg_desc_table[] = { /* * HSMP_GET_DIMM_THERMAL, num_args = 1, response_sz = 1 * input: args[0] = DIMM address[7:0] - * output: args[0] = temperature in degree celcius[31:21] + update rate in ms[16:8] + + * output: args[0] = temperature in degree celsius[31:21] + update rate in ms[16:8] + * DIMM address[7:0] */ {1, 1, HSMP_GET}, @@ -288,13 +308,163 @@ static const struct hsmp_msg_desc hsmp_msg_desc_table[] = { * HSMP_SET_POWER_MODE, num_args = 1, response_sz = 0 * input: args[0] = power efficiency mode[2:0] */ - {1, 0, HSMP_SET}, + {1, 1, HSMP_SET_GET}, /* * HSMP_SET_PSTATE_MAX_MIN, num_args = 1, response_sz = 0 * input: args[0] = min df pstate[15:8] + max df pstate[7:0] */ {1, 0, HSMP_SET}, + + /* + * HSMP_GET_METRIC_TABLE_VER, num_args = 0, response_sz = 1 + * output: args[0] = metrics table version + */ + {0, 
1, HSMP_GET}, + + /* + * HSMP_GET_METRIC_TABLE, num_args = 0, response_sz = 0 + */ + {0, 0, HSMP_GET}, + + /* + * HSMP_GET_METRIC_TABLE_DRAM_ADDR, num_args = 0, response_sz = 2 + * output: args[0] = lower 32 bits of the address + * output: args[1] = upper 32 bits of the address + */ + {0, 2, HSMP_GET}, + + /* + * HSMP_SET_XGMI_PSTATE_RANGE, num_args = 1, response_sz = 0 + * input: args[0] = min xGMI p-state[15:8] + max xGMI p-state[7:0] + */ + {1, 0, HSMP_SET}, + + /* + * HSMP_CPU_RAIL_ISO_FREQ_POLICY, num_args = 1, response_sz = 1 + * input: args[0] = set/get policy[31] + + * disable/enable independent control[0] + * output: args[0] = current policy[0] + */ + {1, 1, HSMP_SET_GET}, + + /* + * HSMP_DFC_ENABLE_CTRL, num_args = 1, response_sz = 1 + * input: args[0] = set/get policy[31] + enable/disable DFC[0] + * output: args[0] = current policy[0] + */ + {1, 1, HSMP_SET_GET}, + + /* RESERVED(0x29-0x2f) */ + {0, 0, HSMP_RSVD}, + {0, 0, HSMP_RSVD}, + {0, 0, HSMP_RSVD}, + {0, 0, HSMP_RSVD}, + {0, 0, HSMP_RSVD}, + {0, 0, HSMP_RSVD}, + {0, 0, HSMP_RSVD}, + + /* + * HSMP_GET_RAPL_UNITS, response_sz = 1 + * output: args[0] = tu value[19:16] + esu value[12:8] + */ + {0, 1, HSMP_GET}, + + /* + * HSMP_GET_RAPL_CORE_COUNTER, num_args = 1, response_sz = 1 + * input: args[0] = apic id[15:0] + * output: args[0] = lower 32 bits of energy + * output: args[1] = upper 32 bits of energy + */ + {1, 2, HSMP_GET}, + + /* + * HSMP_GET_RAPL_PACKAGE_COUNTER, num_args = 0, response_sz = 1 + * output: args[0] = lower 32 bits of energy + * output: args[1] = upper 32 bits of energy + */ + {0, 2, HSMP_GET}, + +}; + +/* Metrics table (supported only with proto version 6) */ +struct hsmp_metric_table { + __u32 accumulation_counter; + + /* TEMPERATURE */ + __u32 max_socket_temperature; + __u32 max_vr_temperature; + __u32 max_hbm_temperature; + __u64 max_socket_temperature_acc; + __u64 max_vr_temperature_acc; + __u64 max_hbm_temperature_acc; + + /* POWER */ + __u32 socket_power_limit; + __u32 
max_socket_power_limit; + __u32 socket_power; + + /* ENERGY */ + __u64 timestamp; + __u64 socket_energy_acc; + __u64 ccd_energy_acc; + __u64 xcd_energy_acc; + __u64 aid_energy_acc; + __u64 hbm_energy_acc; + + /* FREQUENCY */ + __u32 cclk_frequency_limit; + __u32 gfxclk_frequency_limit; + __u32 fclk_frequency; + __u32 uclk_frequency; + __u32 socclk_frequency[4]; + __u32 vclk_frequency[4]; + __u32 dclk_frequency[4]; + __u32 lclk_frequency[4]; + __u64 gfxclk_frequency_acc[8]; + __u64 cclk_frequency_acc[96]; + + /* FREQUENCY RANGE */ + __u32 max_cclk_frequency; + __u32 min_cclk_frequency; + __u32 max_gfxclk_frequency; + __u32 min_gfxclk_frequency; + __u32 fclk_frequency_table[4]; + __u32 uclk_frequency_table[4]; + __u32 socclk_frequency_table[4]; + __u32 vclk_frequency_table[4]; + __u32 dclk_frequency_table[4]; + __u32 lclk_frequency_table[4]; + __u32 max_lclk_dpm_range; + __u32 min_lclk_dpm_range; + + /* XGMI */ + __u32 xgmi_width; + __u32 xgmi_bitrate; + __u64 xgmi_read_bandwidth_acc[8]; + __u64 xgmi_write_bandwidth_acc[8]; + + /* ACTIVITY */ + __u32 socket_c0_residency; + __u32 socket_gfx_busy; + __u32 dram_bandwidth_utilization; + __u64 socket_c0_residency_acc; + __u64 socket_gfx_busy_acc; + __u64 dram_bandwidth_acc; + __u32 max_dram_bandwidth; + __u64 dram_bandwidth_utilization_acc; + __u64 pcie_bandwidth_acc[4]; + + /* THROTTLERS */ + __u32 prochot_residency_acc; + __u32 ppt_residency_acc; + __u32 socket_thm_residency_acc; + __u32 vr_thm_residency_acc; + __u32 hbm_thm_residency_acc; + __u32 spare; + + /* New items at the end to maintain driver compatibility */ + __u32 gfxclk_frequency[8]; }; /* Reset to default packing */ diff --git a/arch/x86/include/uapi/asm/bootparam.h b/arch/x86/include/uapi/asm/bootparam.h index 01d19fc22346..dafbf581c515 100644 --- a/arch/x86/include/uapi/asm/bootparam.h +++ b/arch/x86/include/uapi/asm/bootparam.h @@ -2,21 +2,7 @@ #ifndef _ASM_X86_BOOTPARAM_H #define _ASM_X86_BOOTPARAM_H -/* setup_data/setup_indirect types */ -#define 
SETUP_NONE 0 -#define SETUP_E820_EXT 1 -#define SETUP_DTB 2 -#define SETUP_PCI 3 -#define SETUP_EFI 4 -#define SETUP_APPLE_PROPERTIES 5 -#define SETUP_JAILHOUSE 6 -#define SETUP_CC_BLOB 7 -#define SETUP_IMA 8 -#define SETUP_RNG_SEED 9 -#define SETUP_ENUM_MAX SETUP_RNG_SEED - -#define SETUP_INDIRECT (1<<31) -#define SETUP_TYPE_MAX (SETUP_ENUM_MAX | SETUP_INDIRECT) +#include <asm/setup_data.h> /* ram_size flags */ #define RAMDISK_IMAGE_START_MASK 0x07FF @@ -38,8 +24,9 @@ #define XLF_EFI_KEXEC (1<<4) #define XLF_5LEVEL (1<<5) #define XLF_5LEVEL_ENABLED (1<<6) +#define XLF_MEM_ENCRYPTION (1<<7) -#ifndef __ASSEMBLY__ +#ifndef __ASSEMBLER__ #include <linux/types.h> #include <linux/screen_info.h> @@ -48,22 +35,6 @@ #include <asm/ist.h> #include <video/edid.h> -/* extensible setup data list node */ -struct setup_data { - __u64 next; - __u32 type; - __u32 len; - __u8 data[]; -}; - -/* extensible setup indirect data node */ -struct setup_indirect { - __u32 type; - __u32 reserved; /* Reserved, must be set to zero. */ - __u64 len; - __u64 addr; -}; - struct setup_header { __u8 setup_sects; __u16 root_flags; @@ -137,50 +108,10 @@ struct efi_info { #define E820_MAX_ENTRIES_ZEROPAGE 128 /* - * The E820 memory region entry of the boot protocol ABI: - */ -struct boot_e820_entry { - __u64 addr; - __u64 size; - __u32 type; -} __attribute__((packed)); - -/* * Smallest compatible version of jailhouse_setup_data required by this kernel. */ #define JAILHOUSE_SETUP_REQUIRED_VERSION 1 -/* - * The boot loader is passing platform information via this Jailhouse-specific - * setup data structure. 
- */ -struct jailhouse_setup_data { - struct { - __u16 version; - __u16 compatible_version; - } __attribute__((packed)) hdr; - struct { - __u16 pm_timer_address; - __u16 num_cpus; - __u64 pci_mmconfig_base; - __u32 tsc_khz; - __u32 apic_khz; - __u8 standard_ioapic; - __u8 cpu_ids[255]; - } __attribute__((packed)) v1; - struct { - __u32 flags; - } __attribute__((packed)) v2; -} __attribute__((packed)); - -/* - * IMA buffer setup data information from the previous kernel during kexec - */ -struct ima_setup_data { - __u64 addr; - __u64 size; -} __attribute__((packed)); - /* The so-called "zeropage" */ struct boot_params { struct screen_info screen_info; /* 0x000 */ @@ -279,6 +210,6 @@ enum x86_hardware_subarch { X86_NR_SUBARCHS, }; -#endif /* __ASSEMBLY__ */ +#endif /* __ASSEMBLER__ */ #endif /* _ASM_X86_BOOTPARAM_H */ diff --git a/arch/x86/include/uapi/asm/e820.h b/arch/x86/include/uapi/asm/e820.h index 2f491efe3a12..55bc66867156 100644 --- a/arch/x86/include/uapi/asm/e820.h +++ b/arch/x86/include/uapi/asm/e820.h @@ -54,7 +54,7 @@ */ #define E820_RESERVED_KERN 128 -#ifndef __ASSEMBLY__ +#ifndef __ASSEMBLER__ #include <linux/types.h> struct e820entry { __u64 addr; /* start of memory segment */ @@ -76,7 +76,7 @@ struct e820map { #define BIOS_ROM_BASE 0xffe00000 #define BIOS_ROM_END 0xffffffff -#endif /* __ASSEMBLY__ */ +#endif /* __ASSEMBLER__ */ #endif /* _UAPI_ASM_X86_E820_H */ diff --git a/arch/x86/include/uapi/asm/elf.h b/arch/x86/include/uapi/asm/elf.h new file mode 100644 index 000000000000..468e135fa285 --- /dev/null +++ b/arch/x86/include/uapi/asm/elf.h @@ -0,0 +1,16 @@ +/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */ +#ifndef _UAPI_ASM_X86_ELF_H +#define _UAPI_ASM_X86_ELF_H + +#include <linux/types.h> + +struct x86_xfeat_component { + __u32 type; + __u32 size; + __u32 offset; + __u32 flags; +} __packed; + +_Static_assert(sizeof(struct x86_xfeat_component) % 4 == 0, "x86_xfeat_component is not aligned"); + +#endif /* _UAPI_ASM_X86_ELF_H */ diff 
--git a/arch/x86/include/uapi/asm/kvm.h b/arch/x86/include/uapi/asm/kvm.h index ec53c9fa1da9..6f3499507c5e 100644 --- a/arch/x86/include/uapi/asm/kvm.h +++ b/arch/x86/include/uapi/asm/kvm.h @@ -7,8 +7,11 @@ * */ +#include <linux/const.h> +#include <linux/bits.h> #include <linux/types.h> #include <linux/ioctl.h> +#include <linux/stddef.h> #define KVM_PIO_PAGE_OFFSET 1 #define KVM_COALESCED_MMIO_PAGE_OFFSET 2 @@ -39,7 +42,6 @@ #define __KVM_HAVE_IRQ_LINE #define __KVM_HAVE_MSI #define __KVM_HAVE_USER_NMI -#define __KVM_HAVE_GUEST_DEBUG #define __KVM_HAVE_MSIX #define __KVM_HAVE_MCE #define __KVM_HAVE_PIT_STATE2 @@ -48,19 +50,10 @@ #define __KVM_HAVE_DEBUGREGS #define __KVM_HAVE_XSAVE #define __KVM_HAVE_XCRS -#define __KVM_HAVE_READONLY_MEM /* Architectural interrupt line count. */ #define KVM_NR_INTERRUPTS 256 -struct kvm_memory_alias { - __u32 slot; /* this has a different namespace than memory slots */ - __u32 flags; - __u64 guest_phys_addr; - __u64 memory_size; - __u64 target_phys_addr; -}; - /* for KVM_GET_IRQCHIP and KVM_SET_IRQCHIP */ struct kvm_pic_state { __u8 last_irr; /* edge detection */ @@ -113,6 +106,7 @@ struct kvm_ioapic_state { #define KVM_RUN_X86_SMM (1 << 0) #define KVM_RUN_X86_BUS_LOCK (1 << 1) +#define KVM_RUN_X86_GUEST_MODE (1 << 2) /* for KVM_GET_REGS and KVM_SET_REGS */ struct kvm_regs { @@ -214,6 +208,8 @@ struct kvm_msr_list { struct kvm_msr_filter_range { #define KVM_MSR_FILTER_READ (1 << 0) #define KVM_MSR_FILTER_WRITE (1 << 1) +#define KVM_MSR_FILTER_RANGE_VALID_MASK (KVM_MSR_FILTER_READ | \ + KVM_MSR_FILTER_WRITE) __u32 flags; __u32 nmsrs; /* number of msrs in bitmap */ __u32 base; /* MSR index the bitmap starts at */ @@ -222,8 +218,11 @@ struct kvm_msr_filter_range { #define KVM_MSR_FILTER_MAX_RANGES 16 struct kvm_msr_filter { +#ifndef __KERNEL__ #define KVM_MSR_FILTER_DEFAULT_ALLOW (0 << 0) +#endif #define KVM_MSR_FILTER_DEFAULT_DENY (1 << 0) +#define KVM_MSR_FILTER_VALID_MASK (KVM_MSR_FILTER_DEFAULT_DENY) __u32 flags; struct 
kvm_msr_filter_range ranges[KVM_MSR_FILTER_MAX_RANGES]; }; @@ -306,7 +305,8 @@ struct kvm_pit_state { struct kvm_pit_channel_state channels[3]; }; -#define KVM_PIT_FLAGS_HPET_LEGACY 0x00000001 +#define KVM_PIT_FLAGS_HPET_LEGACY 0x00000001 +#define KVM_PIT_FLAGS_SPEAKER_DATA_ON 0x00000002 struct kvm_pit_state2 { struct kvm_pit_channel_state channels[3]; @@ -325,6 +325,7 @@ struct kvm_reinject_control { #define KVM_VCPUEVENT_VALID_SHADOW 0x00000004 #define KVM_VCPUEVENT_VALID_SMM 0x00000008 #define KVM_VCPUEVENT_VALID_PAYLOAD 0x00000010 +#define KVM_VCPUEVENT_VALID_TRIPLE_FAULT 0x00000020 /* Interrupt shadow states */ #define KVM_X86_SHADOW_INT_MOV_SS 0x01 @@ -359,7 +360,10 @@ struct kvm_vcpu_events { __u8 smm_inside_nmi; __u8 latched_init; } smi; - __u8 reserved[27]; + struct { + __u8 pending; + } triple_fault; + __u8 reserved[26]; __u8 exception_has_payload; __u64 exception_payload; }; @@ -434,6 +438,10 @@ struct kvm_sync_regs { #define KVM_X86_QUIRK_OUT_7E_INC_RIP (1 << 3) #define KVM_X86_QUIRK_MISC_ENABLE_NO_MWAIT (1 << 4) #define KVM_X86_QUIRK_FIX_HYPERCALL_INSN (1 << 5) +#define KVM_X86_QUIRK_MWAIT_NEVER_UD_FAULTS (1 << 6) +#define KVM_X86_QUIRK_SLOT_ZAP_ALL (1 << 7) +#define KVM_X86_QUIRK_STUFF_FEATURE_MSRS (1 << 8) +#define KVM_X86_QUIRK_IGNORE_GUEST_PAT (1 << 9) #define KVM_STATE_NESTED_FORMAT_VMX 0 #define KVM_STATE_NESTED_FORMAT_SVM 1 @@ -453,8 +461,13 @@ struct kvm_sync_regs { #define KVM_STATE_VMX_PREEMPTION_TIMER_DEADLINE 0x00000001 -/* attributes for system fd (group 0) */ -#define KVM_X86_XCOMP_GUEST_SUPP 0 +/* vendor-independent attributes for system fd (group 0) */ +#define KVM_X86_GRP_SYSTEM 0 +# define KVM_X86_XCOMP_GUEST_SUPP 0 + +/* vendor-specific groups and attributes for system fd */ +#define KVM_X86_GRP_SEV 1 +# define KVM_X86_SEV_VMSA_FEATURES 0 struct kvm_vmx_nested_state_data { __u8 vmcs12[KVM_STATE_NESTED_VMX_VMCS_SIZE]; @@ -504,8 +517,8 @@ struct kvm_nested_state { * KVM_{GET,PUT}_NESTED_STATE ioctl values. 
*/ union { - struct kvm_vmx_nested_state_data vmx[0]; - struct kvm_svm_nested_state_data svm[0]; + __DECLARE_FLEX_ARRAY(struct kvm_vmx_nested_state_data, vmx); + __DECLARE_FLEX_ARRAY(struct kvm_svm_nested_state_data, svm); } data; }; @@ -522,8 +535,471 @@ struct kvm_pmu_event_filter { #define KVM_PMU_EVENT_ALLOW 0 #define KVM_PMU_EVENT_DENY 1 +#define KVM_PMU_EVENT_FLAG_MASKED_EVENTS _BITUL(0) +#define KVM_PMU_EVENT_FLAGS_VALID_MASK (KVM_PMU_EVENT_FLAG_MASKED_EVENTS) + +/* for KVM_CAP_MCE */ +struct kvm_x86_mce { + __u64 status; + __u64 addr; + __u64 misc; + __u64 mcg_status; + __u8 bank; + __u8 pad1[7]; + __u64 pad2[3]; +}; + +/* for KVM_CAP_XEN_HVM */ +#define KVM_XEN_HVM_CONFIG_HYPERCALL_MSR (1 << 0) +#define KVM_XEN_HVM_CONFIG_INTERCEPT_HCALL (1 << 1) +#define KVM_XEN_HVM_CONFIG_SHARED_INFO (1 << 2) +#define KVM_XEN_HVM_CONFIG_RUNSTATE (1 << 3) +#define KVM_XEN_HVM_CONFIG_EVTCHN_2LEVEL (1 << 4) +#define KVM_XEN_HVM_CONFIG_EVTCHN_SEND (1 << 5) +#define KVM_XEN_HVM_CONFIG_RUNSTATE_UPDATE_FLAG (1 << 6) +#define KVM_XEN_HVM_CONFIG_PVCLOCK_TSC_UNSTABLE (1 << 7) +#define KVM_XEN_HVM_CONFIG_SHARED_INFO_HVA (1 << 8) + +#define KVM_XEN_MSR_MIN_INDEX 0x40000000u +#define KVM_XEN_MSR_MAX_INDEX 0x4fffffffu + +struct kvm_xen_hvm_config { + __u32 flags; + __u32 msr; + __u64 blob_addr_32; + __u64 blob_addr_64; + __u8 blob_size_32; + __u8 blob_size_64; + __u8 pad2[30]; +}; + +struct kvm_xen_hvm_attr { + __u16 type; + __u16 pad[3]; + union { + __u8 long_mode; + __u8 vector; + __u8 runstate_update_flag; + union { + __u64 gfn; +#define KVM_XEN_INVALID_GFN ((__u64)-1) + __u64 hva; + } shared_info; + struct { + __u32 send_port; + __u32 type; /* EVTCHNSTAT_ipi / EVTCHNSTAT_interdomain */ + __u32 flags; +#define KVM_XEN_EVTCHN_DEASSIGN (1 << 0) +#define KVM_XEN_EVTCHN_UPDATE (1 << 1) +#define KVM_XEN_EVTCHN_RESET (1 << 2) + /* + * Events sent by the guest are either looped back to + * the guest itself (potentially on a different port#) + * or signalled via an eventfd. 
+ */ + union { + struct { + __u32 port; + __u32 vcpu; + __u32 priority; + } port; + struct { + __u32 port; /* Zero for eventfd */ + __s32 fd; + } eventfd; + __u32 padding[4]; + } deliver; + } evtchn; + __u32 xen_version; + __u64 pad[8]; + } u; +}; + + +/* Available with KVM_CAP_XEN_HVM / KVM_XEN_HVM_CONFIG_SHARED_INFO */ +#define KVM_XEN_ATTR_TYPE_LONG_MODE 0x0 +#define KVM_XEN_ATTR_TYPE_SHARED_INFO 0x1 +#define KVM_XEN_ATTR_TYPE_UPCALL_VECTOR 0x2 +/* Available with KVM_CAP_XEN_HVM / KVM_XEN_HVM_CONFIG_EVTCHN_SEND */ +#define KVM_XEN_ATTR_TYPE_EVTCHN 0x3 +#define KVM_XEN_ATTR_TYPE_XEN_VERSION 0x4 +/* Available with KVM_CAP_XEN_HVM / KVM_XEN_HVM_CONFIG_RUNSTATE_UPDATE_FLAG */ +#define KVM_XEN_ATTR_TYPE_RUNSTATE_UPDATE_FLAG 0x5 +/* Available with KVM_CAP_XEN_HVM / KVM_XEN_HVM_CONFIG_SHARED_INFO_HVA */ +#define KVM_XEN_ATTR_TYPE_SHARED_INFO_HVA 0x6 + +struct kvm_xen_vcpu_attr { + __u16 type; + __u16 pad[3]; + union { + __u64 gpa; +#define KVM_XEN_INVALID_GPA ((__u64)-1) + __u64 hva; + __u64 pad[8]; + struct { + __u64 state; + __u64 state_entry_time; + __u64 time_running; + __u64 time_runnable; + __u64 time_blocked; + __u64 time_offline; + } runstate; + __u32 vcpu_id; + struct { + __u32 port; + __u32 priority; + __u64 expires_ns; + } timer; + __u8 vector; + } u; +}; + +/* Available with KVM_CAP_XEN_HVM / KVM_XEN_HVM_CONFIG_SHARED_INFO */ +#define KVM_XEN_VCPU_ATTR_TYPE_VCPU_INFO 0x0 +#define KVM_XEN_VCPU_ATTR_TYPE_VCPU_TIME_INFO 0x1 +#define KVM_XEN_VCPU_ATTR_TYPE_RUNSTATE_ADDR 0x2 +#define KVM_XEN_VCPU_ATTR_TYPE_RUNSTATE_CURRENT 0x3 +#define KVM_XEN_VCPU_ATTR_TYPE_RUNSTATE_DATA 0x4 +#define KVM_XEN_VCPU_ATTR_TYPE_RUNSTATE_ADJUST 0x5 +/* Available with KVM_CAP_XEN_HVM / KVM_XEN_HVM_CONFIG_EVTCHN_SEND */ +#define KVM_XEN_VCPU_ATTR_TYPE_VCPU_ID 0x6 +#define KVM_XEN_VCPU_ATTR_TYPE_TIMER 0x7 +#define KVM_XEN_VCPU_ATTR_TYPE_UPCALL_VECTOR 0x8 +/* Available with KVM_CAP_XEN_HVM / KVM_XEN_HVM_CONFIG_SHARED_INFO_HVA */ +#define KVM_XEN_VCPU_ATTR_TYPE_VCPU_INFO_HVA 0x9 + +/* 
Secure Encrypted Virtualization command */ +enum sev_cmd_id { + /* Guest initialization commands */ + KVM_SEV_INIT = 0, + KVM_SEV_ES_INIT, + /* Guest launch commands */ + KVM_SEV_LAUNCH_START, + KVM_SEV_LAUNCH_UPDATE_DATA, + KVM_SEV_LAUNCH_UPDATE_VMSA, + KVM_SEV_LAUNCH_SECRET, + KVM_SEV_LAUNCH_MEASURE, + KVM_SEV_LAUNCH_FINISH, + /* Guest migration commands (outgoing) */ + KVM_SEV_SEND_START, + KVM_SEV_SEND_UPDATE_DATA, + KVM_SEV_SEND_UPDATE_VMSA, + KVM_SEV_SEND_FINISH, + /* Guest migration commands (incoming) */ + KVM_SEV_RECEIVE_START, + KVM_SEV_RECEIVE_UPDATE_DATA, + KVM_SEV_RECEIVE_UPDATE_VMSA, + KVM_SEV_RECEIVE_FINISH, + /* Guest status and debug commands */ + KVM_SEV_GUEST_STATUS, + KVM_SEV_DBG_DECRYPT, + KVM_SEV_DBG_ENCRYPT, + /* Guest certificates commands */ + KVM_SEV_CERT_EXPORT, + /* Attestation report */ + KVM_SEV_GET_ATTESTATION_REPORT, + /* Guest Migration Extension */ + KVM_SEV_SEND_CANCEL, + + /* Second time is the charm; improved versions of the above ioctls. */ + KVM_SEV_INIT2, + + /* SNP-specific commands */ + KVM_SEV_SNP_LAUNCH_START = 100, + KVM_SEV_SNP_LAUNCH_UPDATE, + KVM_SEV_SNP_LAUNCH_FINISH, + + KVM_SEV_NR_MAX, +}; + +struct kvm_sev_cmd { + __u32 id; + __u32 pad0; + __u64 data; + __u32 error; + __u32 sev_fd; +}; + +struct kvm_sev_init { + __u64 vmsa_features; + __u32 flags; + __u16 ghcb_version; + __u16 pad1; + __u32 pad2[8]; +}; + +struct kvm_sev_launch_start { + __u32 handle; + __u32 policy; + __u64 dh_uaddr; + __u32 dh_len; + __u32 pad0; + __u64 session_uaddr; + __u32 session_len; + __u32 pad1; +}; + +struct kvm_sev_launch_update_data { + __u64 uaddr; + __u32 len; + __u32 pad0; +}; + + +struct kvm_sev_launch_secret { + __u64 hdr_uaddr; + __u32 hdr_len; + __u32 pad0; + __u64 guest_uaddr; + __u32 guest_len; + __u32 pad1; + __u64 trans_uaddr; + __u32 trans_len; + __u32 pad2; +}; + +struct kvm_sev_launch_measure { + __u64 uaddr; + __u32 len; + __u32 pad0; +}; + +struct kvm_sev_guest_status { + __u32 handle; + __u32 policy; + __u32 state; +}; 
+ +struct kvm_sev_dbg { + __u64 src_uaddr; + __u64 dst_uaddr; + __u32 len; + __u32 pad0; +}; + +struct kvm_sev_attestation_report { + __u8 mnonce[16]; + __u64 uaddr; + __u32 len; + __u32 pad0; +}; + +struct kvm_sev_send_start { + __u32 policy; + __u32 pad0; + __u64 pdh_cert_uaddr; + __u32 pdh_cert_len; + __u32 pad1; + __u64 plat_certs_uaddr; + __u32 plat_certs_len; + __u32 pad2; + __u64 amd_certs_uaddr; + __u32 amd_certs_len; + __u32 pad3; + __u64 session_uaddr; + __u32 session_len; + __u32 pad4; +}; + +struct kvm_sev_send_update_data { + __u64 hdr_uaddr; + __u32 hdr_len; + __u32 pad0; + __u64 guest_uaddr; + __u32 guest_len; + __u32 pad1; + __u64 trans_uaddr; + __u32 trans_len; + __u32 pad2; +}; + +struct kvm_sev_receive_start { + __u32 handle; + __u32 policy; + __u64 pdh_uaddr; + __u32 pdh_len; + __u32 pad0; + __u64 session_uaddr; + __u32 session_len; + __u32 pad1; +}; + +struct kvm_sev_receive_update_data { + __u64 hdr_uaddr; + __u32 hdr_len; + __u32 pad0; + __u64 guest_uaddr; + __u32 guest_len; + __u32 pad1; + __u64 trans_uaddr; + __u32 trans_len; + __u32 pad2; +}; + +struct kvm_sev_snp_launch_start { + __u64 policy; + __u8 gosvw[16]; + __u16 flags; + __u8 pad0[6]; + __u64 pad1[4]; +}; + +/* Kept in sync with firmware values for simplicity. 
*/ +#define KVM_SEV_PAGE_TYPE_INVALID 0x0 +#define KVM_SEV_SNP_PAGE_TYPE_NORMAL 0x1 +#define KVM_SEV_SNP_PAGE_TYPE_ZERO 0x3 +#define KVM_SEV_SNP_PAGE_TYPE_UNMEASURED 0x4 +#define KVM_SEV_SNP_PAGE_TYPE_SECRETS 0x5 +#define KVM_SEV_SNP_PAGE_TYPE_CPUID 0x6 + +struct kvm_sev_snp_launch_update { + __u64 gfn_start; + __u64 uaddr; + __u64 len; + __u8 type; + __u8 pad0; + __u16 flags; + __u32 pad1; + __u64 pad2[4]; +}; + +#define KVM_SEV_SNP_ID_BLOCK_SIZE 96 +#define KVM_SEV_SNP_ID_AUTH_SIZE 4096 +#define KVM_SEV_SNP_FINISH_DATA_SIZE 32 + +struct kvm_sev_snp_launch_finish { + __u64 id_block_uaddr; + __u64 id_auth_uaddr; + __u8 id_block_en; + __u8 auth_key_en; + __u8 vcek_disabled; + __u8 host_data[KVM_SEV_SNP_FINISH_DATA_SIZE]; + __u8 pad0[3]; + __u16 flags; + __u64 pad1[4]; +}; + +#define KVM_X2APIC_API_USE_32BIT_IDS (1ULL << 0) +#define KVM_X2APIC_API_DISABLE_BROADCAST_QUIRK (1ULL << 1) + +struct kvm_hyperv_eventfd { + __u32 conn_id; + __s32 fd; + __u32 flags; + __u32 padding[3]; +}; + +#define KVM_HYPERV_CONN_ID_MASK 0x00ffffff +#define KVM_HYPERV_EVENTFD_DEASSIGN (1 << 0) + +/* + * Masked event layout. 
+ * Bits Description + * ---- ----------- + * 7:0 event select (low bits) + * 15:8 umask match + * 31:16 unused + * 35:32 event select (high bits) + * 36:54 unused + * 55 exclude bit + * 63:56 umask mask + */ + +#define KVM_PMU_ENCODE_MASKED_ENTRY(event_select, mask, match, exclude) \ + (((event_select) & 0xFFULL) | (((event_select) & 0XF00ULL) << 24) | \ + (((mask) & 0xFFULL) << 56) | \ + (((match) & 0xFFULL) << 8) | \ + ((__u64)(!!(exclude)) << 55)) + +#define KVM_PMU_MASKED_ENTRY_EVENT_SELECT \ + (__GENMASK_ULL(7, 0) | __GENMASK_ULL(35, 32)) +#define KVM_PMU_MASKED_ENTRY_UMASK_MASK (__GENMASK_ULL(63, 56)) +#define KVM_PMU_MASKED_ENTRY_UMASK_MATCH (__GENMASK_ULL(15, 8)) +#define KVM_PMU_MASKED_ENTRY_EXCLUDE (_BITULL(55)) +#define KVM_PMU_MASKED_ENTRY_UMASK_MASK_SHIFT (56) + /* for KVM_{GET,SET,HAS}_DEVICE_ATTR */ #define KVM_VCPU_TSC_CTRL 0 /* control group for the timestamp counter (TSC) */ #define KVM_VCPU_TSC_OFFSET 0 /* attribute for the TSC offset */ +/* x86-specific KVM_EXIT_HYPERCALL flags. */ +#define KVM_EXIT_HYPERCALL_LONG_MODE _BITULL(0) + +#define KVM_X86_DEFAULT_VM 0 +#define KVM_X86_SW_PROTECTED_VM 1 +#define KVM_X86_SEV_VM 2 +#define KVM_X86_SEV_ES_VM 3 +#define KVM_X86_SNP_VM 4 +#define KVM_X86_TDX_VM 5 + +/* Trust Domain eXtension sub-ioctl() commands. */ +enum kvm_tdx_cmd_id { + KVM_TDX_CAPABILITIES = 0, + KVM_TDX_INIT_VM, + KVM_TDX_INIT_VCPU, + KVM_TDX_INIT_MEM_REGION, + KVM_TDX_FINALIZE_VM, + KVM_TDX_GET_CPUID, + + KVM_TDX_CMD_NR_MAX, +}; + +struct kvm_tdx_cmd { + /* enum kvm_tdx_cmd_id */ + __u32 id; + /* flags for sub-commend. If sub-command doesn't use this, set zero. */ + __u32 flags; + /* + * data for each sub-command. An immediate or a pointer to the actual + * data in process virtual address. If sub-command doesn't use it, + * set zero. + */ + __u64 data; + /* + * Auxiliary error code. The sub-command may return TDX SEAMCALL + * status code in addition to -Exxx. 
+ */ + __u64 hw_error; +}; + +struct kvm_tdx_capabilities { + __u64 supported_attrs; + __u64 supported_xfam; + __u64 reserved[254]; + + /* Configurable CPUID bits for userspace */ + struct kvm_cpuid2 cpuid; +}; + +struct kvm_tdx_init_vm { + __u64 attributes; + __u64 xfam; + __u64 mrconfigid[6]; /* sha384 digest */ + __u64 mrowner[6]; /* sha384 digest */ + __u64 mrownerconfig[6]; /* sha384 digest */ + + /* The total space for TD_PARAMS before the CPUIDs is 256 bytes */ + __u64 reserved[12]; + + /* + * Call KVM_TDX_INIT_VM before vcpu creation, thus before + * KVM_SET_CPUID2. + * This configuration supersedes KVM_SET_CPUID2s for VCPUs because the + * TDX module directly virtualizes those CPUIDs without VMM. The user + * space VMM, e.g. qemu, should make KVM_SET_CPUID2 consistent with + * those values. If it doesn't, KVM may have wrong idea of vCPUIDs of + * the guest, and KVM may wrongly emulate CPUIDs or MSRs that the TDX + * module doesn't virtualize. + */ + struct kvm_cpuid2 cpuid; +}; + +#define KVM_TDX_MEASURE_MEMORY_REGION _BITULL(0) + +struct kvm_tdx_init_mem_region { + __u64 source_addr; + __u64 gpa; + __u64 nr_pages; +}; + #endif /* _ASM_X86_KVM_H */ diff --git a/arch/x86/include/uapi/asm/kvm_para.h b/arch/x86/include/uapi/asm/kvm_para.h index 6e64b27b2c1e..a1efa7907a0b 100644 --- a/arch/x86/include/uapi/asm/kvm_para.h +++ b/arch/x86/include/uapi/asm/kvm_para.h @@ -92,7 +92,7 @@ struct kvm_clock_pairing { #define KVM_ASYNC_PF_DELIVERY_AS_INT (1 << 3) /* MSR_KVM_ASYNC_PF_INT */ -#define KVM_ASYNC_PF_VEC_MASK GENMASK(7, 0) +#define KVM_ASYNC_PF_VEC_MASK __GENMASK(7, 0) /* MSR_KVM_MIGRATION_CONTROL */ #define KVM_MIGRATION_READY (1 << 0) @@ -142,7 +142,6 @@ struct kvm_vcpu_pv_apf_data { __u32 token; __u8 pad[56]; - __u32 enabled; }; #define KVM_PV_EOI_BIT 0 diff --git a/arch/x86/include/uapi/asm/ldt.h b/arch/x86/include/uapi/asm/ldt.h index d62ac5db093b..a82c039d8e6a 100644 --- a/arch/x86/include/uapi/asm/ldt.h +++ b/arch/x86/include/uapi/asm/ldt.h @@ -12,7 
+12,7 @@ /* The size of each LDT entry. */ #define LDT_ENTRY_SIZE 8 -#ifndef __ASSEMBLY__ +#ifndef __ASSEMBLER__ /* * Note on 64bit base and limit is ignored and you cannot set DS/ES/CS * not to the default values if you still want to do syscalls. This @@ -44,5 +44,5 @@ struct user_desc { #define MODIFY_LDT_CONTENTS_STACK 1 #define MODIFY_LDT_CONTENTS_CODE 2 -#endif /* !__ASSEMBLY__ */ +#endif /* !__ASSEMBLER__ */ #endif /* _ASM_X86_LDT_H */ diff --git a/arch/x86/include/uapi/asm/mce.h b/arch/x86/include/uapi/asm/mce.h index db9adc081c5a..cb6b48a7c22b 100644 --- a/arch/x86/include/uapi/asm/mce.h +++ b/arch/x86/include/uapi/asm/mce.h @@ -8,7 +8,8 @@ /* * Fields are zero when not available. Also, this struct is shared with * userspace mcelog and thus must keep existing fields at current offsets. - * Only add new fields to the end of the structure + * Only add new, shared fields to the end of the structure. + * Do not add vendor-specific fields. */ struct mce { __u64 status; /* Bank's MCi_STATUS MSR */ diff --git a/arch/x86/include/uapi/asm/mman.h b/arch/x86/include/uapi/asm/mman.h index 775dbd3aff73..ac1e6277212b 100644 --- a/arch/x86/include/uapi/asm/mman.h +++ b/arch/x86/include/uapi/asm/mman.h @@ -3,14 +3,7 @@ #define _ASM_X86_MMAN_H #define MAP_32BIT 0x40 /* only give out 32bit addresses */ - -#ifdef CONFIG_X86_INTEL_MEMORY_PROTECTION_KEYS -#define arch_calc_vm_prot_bits(prot, key) ( \ - ((key) & 0x1 ? VM_PKEY_BIT0 : 0) | \ - ((key) & 0x2 ? VM_PKEY_BIT1 : 0) | \ - ((key) & 0x4 ? VM_PKEY_BIT2 : 0) | \ - ((key) & 0x8 ? 
VM_PKEY_BIT3 : 0)) -#endif +#define MAP_ABOVE4G 0x80 /* only map above 4GB */ #include <asm-generic/mman.h> diff --git a/arch/x86/include/uapi/asm/msr.h b/arch/x86/include/uapi/asm/msr.h index e7516b402a00..4b8917ca28fe 100644 --- a/arch/x86/include/uapi/asm/msr.h +++ b/arch/x86/include/uapi/asm/msr.h @@ -2,7 +2,7 @@ #ifndef _UAPI_ASM_X86_MSR_H #define _UAPI_ASM_X86_MSR_H -#ifndef __ASSEMBLY__ +#ifndef __ASSEMBLER__ #include <linux/types.h> #include <linux/ioctl.h> @@ -10,5 +10,5 @@ #define X86_IOC_RDMSR_REGS _IOWR('c', 0xA0, __u32[8]) #define X86_IOC_WRMSR_REGS _IOWR('c', 0xA1, __u32[8]) -#endif /* __ASSEMBLY__ */ +#endif /* __ASSEMBLER__ */ #endif /* _UAPI_ASM_X86_MSR_H */ diff --git a/arch/x86/include/uapi/asm/mtrr.h b/arch/x86/include/uapi/asm/mtrr.h index 376563f2bac1..3a8a8eb8ac3a 100644 --- a/arch/x86/include/uapi/asm/mtrr.h +++ b/arch/x86/include/uapi/asm/mtrr.h @@ -81,14 +81,6 @@ typedef __u8 mtrr_type; #define MTRR_NUM_FIXED_RANGES 88 #define MTRR_MAX_VAR_RANGES 256 -struct mtrr_state_type { - struct mtrr_var_range var_ranges[MTRR_MAX_VAR_RANGES]; - mtrr_type fixed_ranges[MTRR_NUM_FIXED_RANGES]; - unsigned char enabled; - unsigned char have_fixed; - mtrr_type def_type; -}; - #define MTRRphysBase_MSR(reg) (0x200 + 2 * (reg)) #define MTRRphysMask_MSR(reg) (0x200 + 2 * (reg) + 1) @@ -115,9 +107,9 @@ struct mtrr_state_type { #define MTRR_NUM_TYPES 7 /* - * Invalid MTRR memory type. mtrr_type_lookup() returns this value when - * MTRRs are disabled. Note, this value is allocated from the reserved - * values (0x7-0xff) of the MTRR memory types. + * Invalid MTRR memory type. No longer used outside of MTRR code. + * Note, this value is allocated from the reserved values (0x7-0xff) of + * the MTRR memory types. 
*/ #define MTRR_TYPE_INVALID 0xff diff --git a/arch/x86/include/uapi/asm/prctl.h b/arch/x86/include/uapi/asm/prctl.h index 500b96e71f18..384e2cc6ac19 100644 --- a/arch/x86/include/uapi/asm/prctl.h +++ b/arch/x86/include/uapi/asm/prctl.h @@ -16,8 +16,28 @@ #define ARCH_GET_XCOMP_GUEST_PERM 0x1024 #define ARCH_REQ_XCOMP_GUEST_PERM 0x1025 +#define ARCH_XCOMP_TILECFG 17 +#define ARCH_XCOMP_TILEDATA 18 + #define ARCH_MAP_VDSO_X32 0x2001 #define ARCH_MAP_VDSO_32 0x2002 #define ARCH_MAP_VDSO_64 0x2003 +/* Don't use 0x3001-0x3004 because of old glibcs */ + +#define ARCH_GET_UNTAG_MASK 0x4001 +#define ARCH_ENABLE_TAGGED_ADDR 0x4002 +#define ARCH_GET_MAX_TAG_BITS 0x4003 +#define ARCH_FORCE_TAGGED_SVA 0x4004 + +#define ARCH_SHSTK_ENABLE 0x5001 +#define ARCH_SHSTK_DISABLE 0x5002 +#define ARCH_SHSTK_LOCK 0x5003 +#define ARCH_SHSTK_UNLOCK 0x5004 +#define ARCH_SHSTK_STATUS 0x5005 + +/* ARCH_SHSTK_ features bits */ +#define ARCH_SHSTK_SHSTK (1ULL << 0) +#define ARCH_SHSTK_WRSS (1ULL << 1) + #endif /* _ASM_X86_PRCTL_H */ diff --git a/arch/x86/include/uapi/asm/processor-flags.h b/arch/x86/include/uapi/asm/processor-flags.h index c47cc7f2feeb..f1a4adc78272 100644 --- a/arch/x86/include/uapi/asm/processor-flags.h +++ b/arch/x86/include/uapi/asm/processor-flags.h @@ -82,6 +82,10 @@ #define X86_CR3_PCID_BITS 12 #define X86_CR3_PCID_MASK (_AC((1UL << X86_CR3_PCID_BITS) - 1, UL)) +#define X86_CR3_LAM_U57_BIT 61 /* Activate LAM for userspace, 62:57 bits masked */ +#define X86_CR3_LAM_U57 _BITULL(X86_CR3_LAM_U57_BIT) +#define X86_CR3_LAM_U48_BIT 62 /* Activate LAM for userspace, 62:48 bits masked */ +#define X86_CR3_LAM_U48 _BITULL(X86_CR3_LAM_U48_BIT) #define X86_CR3_PCID_NOFLUSH_BIT 63 /* Preserve old PCID */ #define X86_CR3_PCID_NOFLUSH _BITULL(X86_CR3_PCID_NOFLUSH_BIT) @@ -132,6 +136,15 @@ #define X86_CR4_PKE _BITUL(X86_CR4_PKE_BIT) #define X86_CR4_CET_BIT 23 /* enable Control-flow Enforcement Technology */ #define X86_CR4_CET _BITUL(X86_CR4_CET_BIT) +#define X86_CR4_LAM_SUP_BIT 28 /* 
LAM for supervisor pointers */ +#define X86_CR4_LAM_SUP _BITUL(X86_CR4_LAM_SUP_BIT) + +#ifdef __x86_64__ +#define X86_CR4_FRED_BIT 32 /* enable FRED kernel entry */ +#define X86_CR4_FRED _BITUL(X86_CR4_FRED_BIT) +#else +#define X86_CR4_FRED (0) +#endif /* * x86-64 Task Priority Register, CR8 diff --git a/arch/x86/include/uapi/asm/ptrace-abi.h b/arch/x86/include/uapi/asm/ptrace-abi.h index 16074b9c93bb..5823584dea13 100644 --- a/arch/x86/include/uapi/asm/ptrace-abi.h +++ b/arch/x86/include/uapi/asm/ptrace-abi.h @@ -25,7 +25,7 @@ #else /* __i386__ */ -#if defined(__ASSEMBLY__) || defined(__FRAME_OFFSETS) +#if defined(__ASSEMBLER__) || defined(__FRAME_OFFSETS) /* * C ABI says these regs are callee-preserved. They aren't saved on kernel entry * unless syscall needs a complete, fully filled "struct pt_regs". @@ -57,7 +57,7 @@ #define EFLAGS 144 #define RSP 152 #define SS 160 -#endif /* __ASSEMBLY__ */ +#endif /* __ASSEMBLER__ */ /* top of stack page */ #define FRAME_SIZE 168 @@ -87,7 +87,7 @@ #define PTRACE_SINGLEBLOCK 33 /* resume execution until next branch */ -#ifndef __ASSEMBLY__ +#ifndef __ASSEMBLER__ #include <linux/types.h> #endif diff --git a/arch/x86/include/uapi/asm/ptrace.h b/arch/x86/include/uapi/asm/ptrace.h index 85165c0edafc..e0b5b4f6226b 100644 --- a/arch/x86/include/uapi/asm/ptrace.h +++ b/arch/x86/include/uapi/asm/ptrace.h @@ -7,7 +7,7 @@ #include <asm/processor-flags.h> -#ifndef __ASSEMBLY__ +#ifndef __ASSEMBLER__ #ifdef __i386__ /* this struct defines the way the registers are stored on the @@ -81,6 +81,6 @@ struct pt_regs { -#endif /* !__ASSEMBLY__ */ +#endif /* !__ASSEMBLER__ */ #endif /* _UAPI_ASM_X86_PTRACE_H */ diff --git a/arch/x86/include/uapi/asm/setup_data.h b/arch/x86/include/uapi/asm/setup_data.h new file mode 100644 index 000000000000..2671c4e1b3a0 --- /dev/null +++ b/arch/x86/include/uapi/asm/setup_data.h @@ -0,0 +1,94 @@ +/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */ +#ifndef _UAPI_ASM_X86_SETUP_DATA_H +#define 
_UAPI_ASM_X86_SETUP_DATA_H + +/* setup_data/setup_indirect types */ +#define SETUP_NONE 0 +#define SETUP_E820_EXT 1 +#define SETUP_DTB 2 +#define SETUP_PCI 3 +#define SETUP_EFI 4 +#define SETUP_APPLE_PROPERTIES 5 +#define SETUP_JAILHOUSE 6 +#define SETUP_CC_BLOB 7 +#define SETUP_IMA 8 +#define SETUP_RNG_SEED 9 +#define SETUP_KEXEC_KHO 10 +#define SETUP_ENUM_MAX SETUP_KEXEC_KHO + +#define SETUP_INDIRECT (1<<31) +#define SETUP_TYPE_MAX (SETUP_ENUM_MAX | SETUP_INDIRECT) + +#ifndef __ASSEMBLER__ + +#include <linux/types.h> + +/* extensible setup data list node */ +struct setup_data { + __u64 next; + __u32 type; + __u32 len; + __u8 data[]; +}; + +/* extensible setup indirect data node */ +struct setup_indirect { + __u32 type; + __u32 reserved; /* Reserved, must be set to zero. */ + __u64 len; + __u64 addr; +}; + +/* + * The E820 memory region entry of the boot protocol ABI: + */ +struct boot_e820_entry { + __u64 addr; + __u64 size; + __u32 type; +} __attribute__((packed)); + +/* + * The boot loader is passing platform information via this Jailhouse-specific + * setup data structure. 
+ */ +struct jailhouse_setup_data { + struct { + __u16 version; + __u16 compatible_version; + } __attribute__((packed)) hdr; + struct { + __u16 pm_timer_address; + __u16 num_cpus; + __u64 pci_mmconfig_base; + __u32 tsc_khz; + __u32 apic_khz; + __u8 standard_ioapic; + __u8 cpu_ids[255]; + } __attribute__((packed)) v1; + struct { + __u32 flags; + } __attribute__((packed)) v2; +} __attribute__((packed)); + +/* + * IMA buffer setup data information from the previous kernel during kexec + */ +struct ima_setup_data { + __u64 addr; + __u64 size; +} __attribute__((packed)); + +/* + * Locations of kexec handover metadata + */ +struct kho_data { + __u64 fdt_addr; + __u64 fdt_size; + __u64 scratch_addr; + __u64 scratch_size; +} __attribute__((packed)); + +#endif /* __ASSEMBLER__ */ + +#endif /* _UAPI_ASM_X86_SETUP_DATA_H */ diff --git a/arch/x86/include/uapi/asm/sgx.h b/arch/x86/include/uapi/asm/sgx.h index f4b81587e90b..2dd35bbdc822 100644 --- a/arch/x86/include/uapi/asm/sgx.h +++ b/arch/x86/include/uapi/asm/sgx.h @@ -29,6 +29,12 @@ enum sgx_page_flags { _IOW(SGX_MAGIC, 0x03, struct sgx_enclave_provision) #define SGX_IOC_VEPC_REMOVE_ALL \ _IO(SGX_MAGIC, 0x04) +#define SGX_IOC_ENCLAVE_RESTRICT_PERMISSIONS \ + _IOWR(SGX_MAGIC, 0x05, struct sgx_enclave_restrict_permissions) +#define SGX_IOC_ENCLAVE_MODIFY_TYPES \ + _IOWR(SGX_MAGIC, 0x06, struct sgx_enclave_modify_types) +#define SGX_IOC_ENCLAVE_REMOVE_PAGES \ + _IOWR(SGX_MAGIC, 0x07, struct sgx_enclave_remove_pages) /** * struct sgx_enclave_create - parameter structure for the @@ -76,6 +82,62 @@ struct sgx_enclave_provision { __u64 fd; }; +/** + * struct sgx_enclave_restrict_permissions - parameters for ioctl + * %SGX_IOC_ENCLAVE_RESTRICT_PERMISSIONS + * @offset: starting page offset (page aligned relative to enclave base + * address defined in SECS) + * @length: length of memory (multiple of the page size) + * @permissions:new permission bits for pages in range described by @offset + * and @length + * @result: (output) SGX 
result code of ENCLS[EMODPR] function + * @count: (output) bytes successfully changed (multiple of page size) + */ +struct sgx_enclave_restrict_permissions { + __u64 offset; + __u64 length; + __u64 permissions; + __u64 result; + __u64 count; +}; + +/** + * struct sgx_enclave_modify_types - parameters for ioctl + * %SGX_IOC_ENCLAVE_MODIFY_TYPES + * @offset: starting page offset (page aligned relative to enclave base + * address defined in SECS) + * @length: length of memory (multiple of the page size) + * @page_type: new type for pages in range described by @offset and @length + * @result: (output) SGX result code of ENCLS[EMODT] function + * @count: (output) bytes successfully changed (multiple of page size) + */ +struct sgx_enclave_modify_types { + __u64 offset; + __u64 length; + __u64 page_type; + __u64 result; + __u64 count; +}; + +/** + * struct sgx_enclave_remove_pages - %SGX_IOC_ENCLAVE_REMOVE_PAGES parameters + * @offset: starting page offset (page aligned relative to enclave base + * address defined in SECS) + * @length: length of memory (multiple of the page size) + * @count: (output) bytes successfully changed (multiple of page size) + * + * Regular (PT_REG) or TCS (PT_TCS) can be removed from an initialized + * enclave if the system supports SGX2. First, the %SGX_IOC_ENCLAVE_MODIFY_TYPES + * ioctl() should be used to change the page type to PT_TRIM. After that + * succeeds ENCLU[EACCEPT] should be run from within the enclave and then + * %SGX_IOC_ENCLAVE_REMOVE_PAGES can be used to complete the page removal. 
+ */ +struct sgx_enclave_remove_pages { + __u64 offset; + __u64 length; + __u64 count; +}; + struct sgx_enclave_run; /** diff --git a/arch/x86/include/uapi/asm/signal.h b/arch/x86/include/uapi/asm/signal.h index 777c3a0f4e23..1067efabf18b 100644 --- a/arch/x86/include/uapi/asm/signal.h +++ b/arch/x86/include/uapi/asm/signal.h @@ -2,9 +2,8 @@ #ifndef _UAPI_ASM_X86_SIGNAL_H #define _UAPI_ASM_X86_SIGNAL_H -#ifndef __ASSEMBLY__ +#ifndef __ASSEMBLER__ #include <linux/types.h> -#include <linux/time.h> #include <linux/compiler.h> /* Avoid too many header ordering problems. */ @@ -17,7 +16,7 @@ struct siginfo; typedef unsigned long sigset_t; #endif /* __KERNEL__ */ -#endif /* __ASSEMBLY__ */ +#endif /* __ASSEMBLER__ */ #define SIGHUP 1 @@ -69,7 +68,7 @@ typedef unsigned long sigset_t; #include <asm-generic/signal-defs.h> -#ifndef __ASSEMBLY__ +#ifndef __ASSEMBLER__ # ifndef __KERNEL__ @@ -107,6 +106,6 @@ typedef struct sigaltstack { __kernel_size_t ss_size; } stack_t; -#endif /* __ASSEMBLY__ */ +#endif /* __ASSEMBLER__ */ #endif /* _UAPI_ASM_X86_SIGNAL_H */ diff --git a/arch/x86/include/uapi/asm/svm.h b/arch/x86/include/uapi/asm/svm.h index f69c168391aa..9c640a521a67 100644 --- a/arch/x86/include/uapi/asm/svm.h +++ b/arch/x86/include/uapi/asm/svm.h @@ -95,6 +95,8 @@ #define SVM_EXIT_CR14_WRITE_TRAP 0x09e #define SVM_EXIT_CR15_WRITE_TRAP 0x09f #define SVM_EXIT_INVPCID 0x0a2 +#define SVM_EXIT_BUS_LOCK 0x0a5 +#define SVM_EXIT_IDLE_HLT 0x0a6 #define SVM_EXIT_NPF 0x400 #define SVM_EXIT_AVIC_INCOMPLETE_IPI 0x401 #define SVM_EXIT_AVIC_UNACCELERATED_ACCESS 0x402 @@ -115,7 +117,14 @@ #define SVM_VMGEXIT_AP_CREATE_ON_INIT 0 #define SVM_VMGEXIT_AP_CREATE 1 #define SVM_VMGEXIT_AP_DESTROY 2 +#define SVM_VMGEXIT_SNP_RUN_VMPL 0x80000018 #define SVM_VMGEXIT_HV_FEATURES 0x8000fffd +#define SVM_VMGEXIT_TERM_REQUEST 0x8000fffe +#define SVM_VMGEXIT_TERM_REASON(reason_set, reason_code) \ + /* SW_EXITINFO1[3:0] */ \ + (((((u64)reason_set) & 0xf)) | \ + /* SW_EXITINFO1[11:4] */ \ + 
((((u64)reason_code) & 0xff) << 4)) #define SVM_VMGEXIT_UNSUPPORTED_EVENT 0x8000ffff /* Exit code reserved for hypervisor/software use */ @@ -217,6 +226,8 @@ { SVM_EXIT_CR4_WRITE_TRAP, "write_cr4_trap" }, \ { SVM_EXIT_CR8_WRITE_TRAP, "write_cr8_trap" }, \ { SVM_EXIT_INVPCID, "invpcid" }, \ + { SVM_EXIT_BUS_LOCK, "buslock" }, \ + { SVM_EXIT_IDLE_HLT, "idle-halt" }, \ { SVM_EXIT_NPF, "npf" }, \ { SVM_EXIT_AVIC_INCOMPLETE_IPI, "avic_incomplete_ipi" }, \ { SVM_EXIT_AVIC_UNACCELERATED_ACCESS, "avic_unaccelerated_access" }, \ diff --git a/arch/x86/include/uapi/asm/vmx.h b/arch/x86/include/uapi/asm/vmx.h index 946d761adbd3..f0f4a4cf84a7 100644 --- a/arch/x86/include/uapi/asm/vmx.h +++ b/arch/x86/include/uapi/asm/vmx.h @@ -34,6 +34,7 @@ #define EXIT_REASON_TRIPLE_FAULT 2 #define EXIT_REASON_INIT_SIGNAL 3 #define EXIT_REASON_SIPI_SIGNAL 4 +#define EXIT_REASON_OTHER_SMI 6 #define EXIT_REASON_INTERRUPT_WINDOW 7 #define EXIT_REASON_NMI_WINDOW 8 @@ -91,6 +92,8 @@ #define EXIT_REASON_UMWAIT 67 #define EXIT_REASON_TPAUSE 68 #define EXIT_REASON_BUS_LOCK 74 +#define EXIT_REASON_NOTIFY 75 +#define EXIT_REASON_TDCALL 77 #define VMX_EXIT_REASONS \ { EXIT_REASON_EXCEPTION_NMI, "EXCEPTION_NMI" }, \ @@ -153,7 +156,9 @@ { EXIT_REASON_XRSTORS, "XRSTORS" }, \ { EXIT_REASON_UMWAIT, "UMWAIT" }, \ { EXIT_REASON_TPAUSE, "TPAUSE" }, \ - { EXIT_REASON_BUS_LOCK, "BUS_LOCK" } + { EXIT_REASON_BUS_LOCK, "BUS_LOCK" }, \ + { EXIT_REASON_NOTIFY, "NOTIFY" }, \ + { EXIT_REASON_TDCALL, "TDCALL" } #define VMX_EXIT_REASON_FLAGS \ { VMX_EXIT_REASONS_FAILED_VMENTRY, "FAILED_VMENTRY" } |