diff options
Diffstat (limited to 'arch/x86')
56 files changed, 1207 insertions, 827 deletions
diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index 7399327d1eff..8910b09b5601 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -269,6 +269,7 @@ config X86 select HAVE_ARCH_KCSAN if X86_64 select X86_FEATURE_NAMES if PROC_FS select PROC_PID_ARCH_STATUS if PROC_FS + select HAVE_ARCH_NODE_DEV_GROUP if X86_SGX imply IMA_SECURE_AND_OR_TRUSTED_BOOT if EFI config INSTRUCTION_DECODER @@ -1523,16 +1524,20 @@ config X86_CPA_STATISTICS helps to determine the effectiveness of preserving large and huge page mappings when mapping protections are changed. +config X86_MEM_ENCRYPT + select ARCH_HAS_FORCE_DMA_UNENCRYPTED + select DYNAMIC_PHYSICAL_MASK + select ARCH_HAS_RESTRICTED_VIRTIO_MEMORY_ACCESS + def_bool n + config AMD_MEM_ENCRYPT bool "AMD Secure Memory Encryption (SME) support" depends on X86_64 && CPU_SUP_AMD select DMA_COHERENT_POOL - select DYNAMIC_PHYSICAL_MASK select ARCH_USE_MEMREMAP_PROT - select ARCH_HAS_FORCE_DMA_UNENCRYPTED select INSTRUCTION_DECODER - select ARCH_HAS_RESTRICTED_VIRTIO_MEMORY_ACCESS select ARCH_HAS_CC_PLATFORM + select X86_MEM_ENCRYPT help Say yes to enable support for the encryption of system memory. This requires an AMD processor that supports Secure Memory @@ -1917,6 +1922,7 @@ config X86_SGX select SRCU select MMU_NOTIFIER select NUMA_KEEP_MEMINFO if NUMA + select XARRAY_MULTI help Intel(R) Software Guard eXtensions (SGX) is a set of CPU instructions that can be used by applications to set aside private regions of code @@ -1932,6 +1938,7 @@ config EFI depends on ACPI select UCS2_STRING select EFI_RUNTIME_WRAPPERS + select ARCH_USE_MEMREMAP_PROT help This enables the kernel to use EFI runtime services that are available (such as the EFI variable services). diff --git a/arch/x86/boot/compressed/sev.c b/arch/x86/boot/compressed/sev.c index 670e998fe930..28bcf04c022e 100644 --- a/arch/x86/boot/compressed/sev.c +++ b/arch/x86/boot/compressed/sev.c @@ -122,7 +122,7 @@ static enum es_result vc_read_mem(struct es_em_ctxt *ctxt, static bool early_setup_sev_es(void) { if (!sev_es_negotiate_protocol()) - sev_es_terminate(GHCB_SEV_ES_REASON_PROTOCOL_UNSUPPORTED); + sev_es_terminate(GHCB_SEV_ES_PROT_UNSUPPORTED); if (set_page_decrypted((unsigned long)&boot_ghcb_page)) return false; @@ -175,7 +175,7 @@ void do_boot_stage2_vc(struct pt_regs *regs, unsigned long exit_code) enum es_result result; if (!boot_ghcb && !early_setup_sev_es()) - sev_es_terminate(GHCB_SEV_ES_REASON_GENERAL_REQUEST); + sev_es_terminate(GHCB_SEV_ES_GEN_REQ); vc_ghcb_invalidate(boot_ghcb); result = vc_init_em_ctxt(&ctxt, regs, exit_code); @@ -202,5 +202,5 @@ finish: if (result == ES_OK) vc_finish_insn(&ctxt); else if (result != ES_RETRY) - sev_es_terminate(GHCB_SEV_ES_REASON_GENERAL_REQUEST); + sev_es_terminate(GHCB_SEV_ES_GEN_REQ); } diff --git a/arch/x86/events/core.c b/arch/x86/events/core.c index 38b2c779146f..68dea7ce6a22 100644 --- a/arch/x86/events/core.c +++ b/arch/x86/events/core.c @@ -2476,7 +2476,7 @@ static int x86_pmu_event_init(struct perf_event *event) if (READ_ONCE(x86_pmu.attr_rdpmc) && !(event->hw.flags & PERF_X86_EVENT_LARGE_PEBS)) - event->hw.flags |= PERF_X86_EVENT_RDPMC_ALLOWED; + event->hw.flags |= PERF_EVENT_FLAG_USER_READ_CNT; return err; } @@ -2510,7 +2510,7 @@ void perf_clear_dirty_counters(void) static void x86_pmu_event_mapped(struct perf_event *event, struct mm_struct *mm) { - if (!(event->hw.flags & PERF_X86_EVENT_RDPMC_ALLOWED)) + if (!(event->hw.flags & PERF_EVENT_FLAG_USER_READ_CNT)) return; /* @@ -2531,7 +2531,7 @@ static void x86_pmu_event_mapped(struct perf_event *event, struct mm_struct *mm) static void x86_pmu_event_unmapped(struct perf_event *event, struct mm_struct *mm) { - if (!(event->hw.flags & PERF_X86_EVENT_RDPMC_ALLOWED)) + if (!(event->hw.flags & PERF_EVENT_FLAG_USER_READ_CNT)) return; if (atomic_dec_and_test(&mm->context.perf_rdpmc_allowed)) @@ -2542,7 +2542,7 @@ static int x86_pmu_event_idx(struct perf_event *event) { struct hw_perf_event *hwc = &event->hw; - if (!(hwc->flags & PERF_X86_EVENT_RDPMC_ALLOWED)) + if (!(hwc->flags & PERF_EVENT_FLAG_USER_READ_CNT)) return 0; if (is_metric_idx(hwc->idx)) @@ -2725,7 +2725,7 @@ void arch_perf_update_userpage(struct perf_event *event, userpg->cap_user_time = 0; userpg->cap_user_time_zero = 0; userpg->cap_user_rdpmc = - !!(event->hw.flags & PERF_X86_EVENT_RDPMC_ALLOWED); + !!(event->hw.flags & PERF_EVENT_FLAG_USER_READ_CNT); userpg->pmc_width = x86_pmu.cntval_bits; if (!using_native_sched_clock() || !sched_clock_stable()) diff --git a/arch/x86/events/perf_event.h b/arch/x86/events/perf_event.h index 5480db242083..9d376e528dfc 100644 --- a/arch/x86/events/perf_event.h +++ b/arch/x86/events/perf_event.h @@ -74,7 +74,7 @@ static inline bool constraint_match(struct event_constraint *c, u64 ecode) #define PERF_X86_EVENT_PEBS_NA_HSW 0x0010 /* haswell style datala, unknown */ #define PERF_X86_EVENT_EXCL 0x0020 /* HT exclusivity on counter */ #define PERF_X86_EVENT_DYNAMIC 0x0040 /* dynamic alloc'd constraint */ -#define PERF_X86_EVENT_RDPMC_ALLOWED 0x0080 /* grant rdpmc permission */ + #define PERF_X86_EVENT_EXCL_ACCT 0x0100 /* accounted EXCL event */ #define PERF_X86_EVENT_AUTO_RELOAD 0x0200 /* use PEBS auto-reload */ #define PERF_X86_EVENT_LARGE_PEBS 0x0400 /* use large PEBS */ diff --git a/arch/x86/include/asm/efi.h b/arch/x86/include/asm/efi.h index 4d0b126835b8..63158fd55856 100644 --- a/arch/x86/include/asm/efi.h +++ b/arch/x86/include/asm/efi.h @@ -197,8 +197,6 @@ static inline bool efi_runtime_supported(void) extern void parse_efi_setup(u64 phys_addr, u32 data_len); -extern void efifb_setup_from_dmi(struct screen_info *si, const char *opt); - extern void efi_thunk_runtime_setup(void); efi_status_t efi_set_virtual_address_map(unsigned long memory_map_size, unsigned long descriptor_size, diff --git a/arch/x86/include/asm/insn-eval.h b/arch/x86/include/asm/insn-eval.h index 4ec3613551e3..43785ee363f1 100644 --- a/arch/x86/include/asm/insn-eval.h +++ b/arch/x86/include/asm/insn-eval.h @@ -19,6 +19,7 @@ bool insn_has_rep_prefix(struct insn *insn); void __user *insn_get_addr_ref(struct insn *insn, struct pt_regs *regs); int insn_get_modrm_rm_off(struct insn *insn, struct pt_regs *regs); int insn_get_modrm_reg_off(struct insn *insn, struct pt_regs *regs); +unsigned long *insn_get_modrm_reg_ptr(struct insn *insn, struct pt_regs *regs); unsigned long insn_get_seg_base(struct pt_regs *regs, int seg_reg_idx); int insn_get_code_seg_params(struct pt_regs *regs); int insn_get_effective_ip(struct pt_regs *regs, unsigned long *ip); @@ -29,4 +30,16 @@ int insn_fetch_from_user_inatomic(struct pt_regs *regs, bool insn_decode_from_regs(struct insn *insn, struct pt_regs *regs, unsigned char buf[MAX_INSN_SIZE], int buf_size); +enum mmio_type { + MMIO_DECODE_FAILED, + MMIO_WRITE, + MMIO_WRITE_IMM, + MMIO_READ, + MMIO_READ_ZERO_EXTEND, + MMIO_READ_SIGN_EXTEND, + MMIO_MOVS, +}; + +enum mmio_type insn_decode_mmio(struct insn *insn, int *bytes); + #endif /* _ASM_X86_INSN_EVAL_H */ diff --git a/arch/x86/include/asm/io.h b/arch/x86/include/asm/io.h index 5c6a4af0b911..f6d91ecb8026 100644 --- a/arch/x86/include/asm/io.h +++ b/arch/x86/include/asm/io.h @@ -40,6 +40,7 @@ #include <linux/string.h> #include <linux/compiler.h> +#include <linux/cc_platform.h> #include <asm/page.h> #include <asm/early_ioremap.h> #include <asm/pgtable_types.h> @@ -256,21 +257,6 @@ static inline void slow_down_io(void) #endif -#ifdef CONFIG_AMD_MEM_ENCRYPT -#include <linux/jump_label.h> - -extern struct static_key_false sev_enable_key; -static inline bool sev_key_active(void) -{ - return static_branch_unlikely(&sev_enable_key); -} - -#else /* !CONFIG_AMD_MEM_ENCRYPT */ - -static inline bool sev_key_active(void) { return false; } - -#endif /* CONFIG_AMD_MEM_ENCRYPT */ - #define BUILDIO(bwl, bw, type) \ static inline void out##bwl(unsigned type value, int port) \ { \ @@ -301,7 +287,7 @@ static inline unsigned type in##bwl##_p(int port) \ \ static inline void outs##bwl(int port, const void *addr, unsigned long count) \ { \ - if (sev_key_active()) { \ + if (cc_platform_has(CC_ATTR_GUEST_UNROLL_STRING_IO)) { \ unsigned type *value = (unsigned type *)addr; \ while (count) { \ out##bwl(*value, port); \ @@ -317,7 +303,7 @@ static inline void outs##bwl(int port, const void *addr, unsigned long count) \ \ static inline void ins##bwl(int port, void *addr, unsigned long count) \ { \ - if (sev_key_active()) { \ + if (cc_platform_has(CC_ATTR_GUEST_UNROLL_STRING_IO)) { \ unsigned type *value = (unsigned type *)addr; \ while (count) { \ *value = in##bwl(port); \ diff --git a/arch/x86/include/asm/irqflags.h b/arch/x86/include/asm/irqflags.h index c5ce9845c999..87761396e8cc 100644 --- a/arch/x86/include/asm/irqflags.h +++ b/arch/x86/include/asm/irqflags.h @@ -114,8 +114,6 @@ static __always_inline unsigned long arch_local_irq_save(void) #define SAVE_FLAGS pushfq; popq %rax #endif -#define INTERRUPT_RETURN jmp native_iret - #endif #endif /* __ASSEMBLY__ */ @@ -143,8 +141,13 @@ static __always_inline void arch_local_irq_restore(unsigned long flags) #ifdef CONFIG_X86_64 #ifdef CONFIG_XEN_PV #define SWAPGS ALTERNATIVE "swapgs", "", X86_FEATURE_XENPV +#define INTERRUPT_RETURN \ + ANNOTATE_RETPOLINE_SAFE; \ + ALTERNATIVE_TERNARY("jmp *paravirt_iret(%rip);", \ + X86_FEATURE_XENPV, "jmp xen_iret;", "jmp native_iret;") #else #define SWAPGS swapgs +#define INTERRUPT_RETURN jmp native_iret #endif #endif #endif /* !__ASSEMBLY__ */ diff --git a/arch/x86/include/asm/kvm-x86-ops.h b/arch/x86/include/asm/kvm-x86-ops.h index cefe1d81e2e8..9e50da3ed01a 100644 --- a/arch/x86/include/asm/kvm-x86-ops.h +++ b/arch/x86/include/asm/kvm-x86-ops.h @@ -47,6 +47,7 @@ KVM_X86_OP(set_dr7) KVM_X86_OP(cache_reg) KVM_X86_OP(get_rflags) KVM_X86_OP(set_rflags) +KVM_X86_OP(get_if_flag) KVM_X86_OP(tlb_flush_all) KVM_X86_OP(tlb_flush_current) KVM_X86_OP_NULL(tlb_remote_flush) diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index 860ed500580c..555f4de47ef2 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -97,7 +97,7 @@ KVM_ARCH_REQ_FLAGS(25, KVM_REQUEST_WAIT | KVM_REQUEST_NO_WAKEUP) #define KVM_REQ_TLB_FLUSH_CURRENT KVM_ARCH_REQ(26) #define KVM_REQ_TLB_FLUSH_GUEST \ - KVM_ARCH_REQ_FLAGS(27, KVM_REQUEST_NO_WAKEUP) + KVM_ARCH_REQ_FLAGS(27, KVM_REQUEST_WAIT | KVM_REQUEST_NO_WAKEUP) #define KVM_REQ_APF_READY KVM_ARCH_REQ(28) #define KVM_REQ_MSR_FILTER_CHANGED KVM_ARCH_REQ(29) #define KVM_REQ_UPDATE_CPU_DIRTY_LOGGING \ @@ -1349,6 +1349,7 @@ struct kvm_x86_ops { void (*cache_reg)(struct kvm_vcpu *vcpu, enum kvm_reg reg); unsigned long (*get_rflags)(struct kvm_vcpu *vcpu); void (*set_rflags)(struct kvm_vcpu *vcpu, unsigned long rflags); + bool (*get_if_flag)(struct kvm_vcpu *vcpu); void (*tlb_flush_all)(struct kvm_vcpu *vcpu); void (*tlb_flush_current)(struct kvm_vcpu *vcpu); diff --git a/arch/x86/include/asm/mtrr.h b/arch/x86/include/asm/mtrr.h index 829df26fd7a3..76d726074c16 100644 --- a/arch/x86/include/asm/mtrr.h +++ b/arch/x86/include/asm/mtrr.h @@ -24,8 +24,8 @@ #define _ASM_X86_MTRR_H #include <uapi/asm/mtrr.h> -#include <asm/memtype.h> +void mtrr_bp_init(void); /* * The following functions are for use by other drivers that cannot use @@ -43,7 +43,6 @@ extern int mtrr_del(int reg, unsigned long base, unsigned long size); extern int mtrr_del_page(int reg, unsigned long base, unsigned long size); extern void mtrr_centaur_report_mcr(int mcr, u32 lo, u32 hi); extern void mtrr_ap_init(void); -extern void mtrr_bp_init(void); extern void set_mtrr_aps_delayed_init(void); extern void mtrr_aps_init(void); extern void mtrr_bp_restore(void); @@ -84,11 +83,6 @@ static inline int mtrr_trim_uncached_memory(unsigned long end_pfn) static inline void mtrr_centaur_report_mcr(int mcr, u32 lo, u32 hi) { } -static inline void mtrr_bp_init(void) -{ - pat_disable("PAT support disabled because CONFIG_MTRR is disabled in the kernel."); -} - #define mtrr_ap_init() do {} while (0) #define set_mtrr_aps_delayed_init() do {} while (0) #define mtrr_aps_init() do {} while (0) diff --git a/arch/x86/include/asm/page_64.h b/arch/x86/include/asm/page_64.h index 4bde0dc66100..e9c86299b835 100644 --- a/arch/x86/include/asm/page_64.h +++ b/arch/x86/include/asm/page_64.h @@ -5,6 +5,7 @@ #include <asm/page_64_types.h> #ifndef __ASSEMBLY__ +#include <asm/cpufeatures.h> #include <asm/alternative.h> /* duplicated to the one in bootmem.h */ diff --git a/arch/x86/include/asm/paravirt.h b/arch/x86/include/asm/paravirt.h index 21c4a694ca11..27d276232c80 100644 --- a/arch/x86/include/asm/paravirt.h +++ b/arch/x86/include/asm/paravirt.h @@ -752,11 +752,6 @@ extern void default_banner(void); #define PARA_SITE(ptype, ops) _PVSITE(ptype, ops, .quad, 8) #define PARA_INDIRECT(addr) *addr(%rip) -#define INTERRUPT_RETURN \ - ANNOTATE_RETPOLINE_SAFE; \ - ALTERNATIVE_TERNARY("jmp *paravirt_iret(%rip);", \ - X86_FEATURE_XENPV, "jmp xen_iret;", "jmp native_iret;") - #ifdef CONFIG_DEBUG_ENTRY .macro PARA_IRQ_save_fl PARA_SITE(PARA_PATCH(PV_IRQ_save_fl), diff --git a/arch/x86/include/asm/pgtable.h b/arch/x86/include/asm/pgtable.h index 448cd01eb3ec..a34430b7af4a 100644 --- a/arch/x86/include/asm/pgtable.h +++ b/arch/x86/include/asm/pgtable.h @@ -22,6 +22,7 @@ #define pgprot_decrypted(prot) __pgprot(__sme_clr(pgprot_val(prot))) #ifndef __ASSEMBLY__ +#include <linux/spinlock.h> #include <asm/x86_init.h> #include <asm/pkru.h> #include <asm/fpu/api.h> diff --git a/arch/x86/include/asm/pkru.h b/arch/x86/include/asm/pkru.h index 4cd49afa0ca4..74f0a2d34ffd 100644 --- a/arch/x86/include/asm/pkru.h +++ b/arch/x86/include/asm/pkru.h @@ -4,8 +4,8 @@ #include <asm/cpufeature.h> -#define PKRU_AD_BIT 0x1 -#define PKRU_WD_BIT 0x2 +#define PKRU_AD_BIT 0x1u +#define PKRU_WD_BIT 0x2u #define PKRU_BITS_PER_PKEY 2 #ifdef CONFIG_X86_INTEL_MEMORY_PROTECTION_KEYS diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h index 355d38c0cf60..2c5f12ae7d04 100644 --- a/arch/x86/include/asm/processor.h +++ b/arch/x86/include/asm/processor.h @@ -855,4 +855,12 @@ enum mds_mitigations { MDS_MITIGATION_VMWERV, }; +#ifdef CONFIG_X86_SGX +int arch_memory_failure(unsigned long pfn, int flags); +#define arch_memory_failure arch_memory_failure + +bool arch_is_platform_page(u64 paddr); +#define arch_is_platform_page arch_is_platform_page +#endif + #endif /* _ASM_X86_PROCESSOR_H */ diff --git a/arch/x86/include/asm/realmode.h b/arch/x86/include/asm/realmode.h index 5db5d083c873..331474b150f1 100644 --- a/arch/x86/include/asm/realmode.h +++ b/arch/x86/include/asm/realmode.h @@ -89,6 +89,7 @@ static inline void set_real_mode_mem(phys_addr_t mem) } void reserve_real_mode(void); +void load_trampoline_pgtable(void); #endif /* __ASSEMBLY__ */ diff --git a/arch/x86/include/asm/set_memory.h b/arch/x86/include/asm/set_memory.h index 872617542bbc..ff0f2d90338a 100644 --- a/arch/x86/include/asm/set_memory.h +++ b/arch/x86/include/asm/set_memory.h @@ -2,6 +2,7 @@ #ifndef _ASM_X86_SET_MEMORY_H #define _ASM_X86_SET_MEMORY_H +#include <linux/mm.h> #include <asm/page.h> #include <asm-generic/set_memory.h> @@ -99,6 +100,9 @@ static inline int set_mce_nospec(unsigned long pfn, bool unmap) unsigned long decoy_addr; int rc; + /* SGX pages are not in the 1:1 map */ + if (arch_is_platform_page(pfn << PAGE_SHIFT)) + return 0; /* * We would like to just call: * set_memory_XX((unsigned long)pfn_to_kaddr(pfn), 1); diff --git a/arch/x86/include/asm/sev-common.h b/arch/x86/include/asm/sev-common.h index 6acaf5af0a3d..1b2fd32b42fe 100644 --- a/arch/x86/include/asm/sev-common.h +++ b/arch/x86/include/asm/sev-common.h @@ -18,20 +18,19 @@ /* SEV Information Request/Response */ #define GHCB_MSR_SEV_INFO_RESP 0x001 #define GHCB_MSR_SEV_INFO_REQ 0x002 -#define GHCB_MSR_VER_MAX_POS 48 -#define GHCB_MSR_VER_MAX_MASK 0xffff -#define GHCB_MSR_VER_MIN_POS 32 -#define GHCB_MSR_VER_MIN_MASK 0xffff -#define GHCB_MSR_CBIT_POS 24 -#define GHCB_MSR_CBIT_MASK 0xff -#define GHCB_MSR_SEV_INFO(_max, _min, _cbit) \ - ((((_max) & GHCB_MSR_VER_MAX_MASK) << GHCB_MSR_VER_MAX_POS) | \ - (((_min) & GHCB_MSR_VER_MIN_MASK) << GHCB_MSR_VER_MIN_POS) | \ - (((_cbit) & GHCB_MSR_CBIT_MASK) << GHCB_MSR_CBIT_POS) | \ + +#define GHCB_MSR_SEV_INFO(_max, _min, _cbit) \ + /* GHCBData[63:48] */ \ + ((((_max) & 0xffff) << 48) | \ + /* GHCBData[47:32] */ \ + (((_min) & 0xffff) << 32) | \ + /* GHCBData[31:24] */ \ + (((_cbit) & 0xff) << 24) | \ GHCB_MSR_SEV_INFO_RESP) + #define GHCB_MSR_INFO(v) ((v) & 0xfffUL) -#define GHCB_MSR_PROTO_MAX(v) (((v) >> GHCB_MSR_VER_MAX_POS) & GHCB_MSR_VER_MAX_MASK) -#define GHCB_MSR_PROTO_MIN(v) (((v) >> GHCB_MSR_VER_MIN_POS) & GHCB_MSR_VER_MIN_MASK) +#define GHCB_MSR_PROTO_MAX(v) (((v) >> 48) & 0xffff) +#define GHCB_MSR_PROTO_MIN(v) (((v) >> 32) & 0xffff) /* CPUID Request/Response */ #define GHCB_MSR_CPUID_REQ 0x004 @@ -46,30 +45,36 @@ #define GHCB_CPUID_REQ_EBX 1 #define GHCB_CPUID_REQ_ECX 2 #define GHCB_CPUID_REQ_EDX 3 -#define GHCB_CPUID_REQ(fn, reg) \ - (GHCB_MSR_CPUID_REQ | \ - (((unsigned long)reg & GHCB_MSR_CPUID_REG_MASK) << GHCB_MSR_CPUID_REG_POS) | \ - (((unsigned long)fn) << GHCB_MSR_CPUID_FUNC_POS)) +#define GHCB_CPUID_REQ(fn, reg) \ + /* GHCBData[11:0] */ \ + (GHCB_MSR_CPUID_REQ | \ + /* GHCBData[31:12] */ \ + (((unsigned long)(reg) & 0x3) << 30) | \ + /* GHCBData[63:32] */ \ + (((unsigned long)fn) << 32)) /* AP Reset Hold */ -#define GHCB_MSR_AP_RESET_HOLD_REQ 0x006 -#define GHCB_MSR_AP_RESET_HOLD_RESP 0x007 +#define GHCB_MSR_AP_RESET_HOLD_REQ 0x006 +#define GHCB_MSR_AP_RESET_HOLD_RESP 0x007 /* GHCB Hypervisor Feature Request/Response */ -#define GHCB_MSR_HV_FT_REQ 0x080 -#define GHCB_MSR_HV_FT_RESP 0x081 +#define GHCB_MSR_HV_FT_REQ 0x080 +#define GHCB_MSR_HV_FT_RESP 0x081 #define GHCB_MSR_TERM_REQ 0x100 #define GHCB_MSR_TERM_REASON_SET_POS 12 #define GHCB_MSR_TERM_REASON_SET_MASK 0xf #define GHCB_MSR_TERM_REASON_POS 16 #define GHCB_MSR_TERM_REASON_MASK 0xff -#define GHCB_SEV_TERM_REASON(reason_set, reason_val) \ - (((((u64)reason_set) & GHCB_MSR_TERM_REASON_SET_MASK) << GHCB_MSR_TERM_REASON_SET_POS) | \ - ((((u64)reason_val) & GHCB_MSR_TERM_REASON_MASK) << GHCB_MSR_TERM_REASON_POS)) -#define GHCB_SEV_ES_REASON_GENERAL_REQUEST 0 -#define GHCB_SEV_ES_REASON_PROTOCOL_UNSUPPORTED 1 +#define GHCB_SEV_TERM_REASON(reason_set, reason_val) \ + /* GHCBData[15:12] */ \ + (((((u64)reason_set) & 0xf) << 12) | \ + /* GHCBData[23:16] */ \ + ((((u64)reason_val) & 0xff) << 16)) + +#define GHCB_SEV_ES_GEN_REQ 0 +#define GHCB_SEV_ES_PROT_UNSUPPORTED 1 #define GHCB_RESP_CODE(v) ((v) & GHCB_MSR_INFO_MASK) diff --git a/arch/x86/include/asm/tlbflush.h b/arch/x86/include/asm/tlbflush.h index b587a9ee9cb2..98fa0a114074 100644 --- a/arch/x86/include/asm/tlbflush.h +++ b/arch/x86/include/asm/tlbflush.h @@ -261,4 +261,9 @@ extern void arch_tlbbatch_flush(struct arch_tlbflush_unmap_batch *batch); #endif /* !MODULE */ +static inline void __native_tlb_flush_global(unsigned long cr4) +{ + native_write_cr4(cr4 ^ X86_CR4_PGE); + native_write_cr4(cr4); +} #endif /* _ASM_X86_TLBFLUSH_H */ diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile index 2ff3e600f426..6aef9ee28a39 100644 --- a/arch/x86/kernel/Makefile +++ b/arch/x86/kernel/Makefile @@ -84,7 +84,7 @@ obj-$(CONFIG_IA32_EMULATION) += tls.o obj-y += step.o obj-$(CONFIG_INTEL_TXT) += tboot.o obj-$(CONFIG_ISA_DMA_API) += i8237.o -obj-$(CONFIG_STACKTRACE) += stacktrace.o +obj-y += stacktrace.o obj-y += cpu/ obj-y += acpi/ obj-y += reboot.o diff --git a/arch/x86/kernel/cc_platform.c b/arch/x86/kernel/cc_platform.c index 03bb2f343ddb..8a25b1c0d480 100644 --- a/arch/x86/kernel/cc_platform.c +++ b/arch/x86/kernel/cc_platform.c @@ -50,6 +50,14 @@ static bool amd_cc_platform_has(enum cc_attr attr) case CC_ATTR_GUEST_STATE_ENCRYPT: return sev_status & MSR_AMD64_SEV_ES_ENABLED; + /* + * With SEV, the rep string I/O instructions need to be unrolled + * but SEV-ES supports them through the #VC handler. + */ + case CC_ATTR_GUEST_UNROLL_STRING_IO: + return (sev_status & MSR_AMD64_SEV_ENABLED) && + !(sev_status & MSR_AMD64_SEV_ES_ENABLED); + default: return false; } diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c index 0083464de5e3..79b3d67addcc 100644 --- a/arch/x86/kernel/cpu/common.c +++ b/arch/x86/kernel/cpu/common.c @@ -384,7 +384,7 @@ set_register: } EXPORT_SYMBOL(native_write_cr0); -void native_write_cr4(unsigned long val) +void __no_profile native_write_cr4(unsigned long val) { unsigned long bits_changed = 0; diff --git a/arch/x86/kernel/cpu/resctrl/monitor.c b/arch/x86/kernel/cpu/resctrl/monitor.c index c9f0f3d63f75..eaf25a234ff5 100644 --- a/arch/x86/kernel/cpu/resctrl/monitor.c +++ b/arch/x86/kernel/cpu/resctrl/monitor.c @@ -282,7 +282,7 @@ static u64 mbm_overflow_count(u64 prev_msr, u64 cur_msr, unsigned int width) u64 shift = 64 - width, chunks; chunks = (cur_msr << shift) - (prev_msr << shift); - return chunks >>= shift; + return chunks >> shift; } static u64 __mon_event_count(u32 rmid, struct rmid_read *rr) diff --git a/arch/x86/kernel/cpu/sgx/main.c b/arch/x86/kernel/cpu/sgx/main.c index 8471a8b9b48e..4b41efc9e367 100644 --- a/arch/x86/kernel/cpu/sgx/main.c +++ b/arch/x86/kernel/cpu/sgx/main.c @@ -6,11 +6,13 @@ #include <linux/highmem.h> #include <linux/kthread.h> #include <linux/miscdevice.h> +#include <linux/node.h> #include <linux/pagemap.h> #include <linux/ratelimit.h> #include <linux/sched/mm.h> #include <linux/sched/signal.h> #include <linux/slab.h> +#include <linux/sysfs.h> #include <asm/sgx.h> #include "driver.h" #include "encl.h" @@ -20,6 +22,7 @@ struct sgx_epc_section sgx_epc_sections[SGX_MAX_EPC_SECTIONS]; static int sgx_nr_epc_sections; static struct task_struct *ksgxd_tsk; static DECLARE_WAIT_QUEUE_HEAD(ksgxd_waitq); +static DEFINE_XARRAY(sgx_epc_address_space); /* * These variables are part of the state of the reclaimer, and must be accessed @@ -60,6 +63,24 @@ static void __sgx_sanitize_pages(struct list_head *dirty_page_list) page = list_first_entry(dirty_page_list, struct sgx_epc_page, list); + /* + * Checking page->poison without holding the node->lock + * is racy, but losing the race (i.e. poison is set just + * after the check) just means __eremove() will be uselessly + * called for a page that sgx_free_epc_page() will put onto + * the node->sgx_poison_page_list later. + */ + if (page->poison) { + struct sgx_epc_section *section = &sgx_epc_sections[page->section]; + struct sgx_numa_node *node = section->node; + + spin_lock(&node->lock); + list_move(&page->list, &node->sgx_poison_page_list); + spin_unlock(&node->lock); + + continue; + } + ret = __eremove(sgx_get_epc_virt_addr(page)); if (!ret) { /* @@ -471,6 +492,7 @@ static struct sgx_epc_page *__sgx_alloc_epc_page_from_node(int nid) page = list_first_entry(&node->free_page_list, struct sgx_epc_page, list); list_del_init(&page->list); + page->flags = 0; spin_unlock(&node->lock); atomic_long_dec(&sgx_nr_free_pages); @@ -624,7 +646,12 @@ void sgx_free_epc_page(struct sgx_epc_page *page) spin_lock(&node->lock); - list_add_tail(&page->list, &node->free_page_list); + page->owner = NULL; + if (page->poison) + list_add(&page->list, &node->sgx_poison_page_list); + else + list_add_tail(&page->list, &node->free_page_list); + page->flags = SGX_EPC_PAGE_IS_FREE; spin_unlock(&node->lock); atomic_long_inc(&sgx_nr_free_pages); @@ -648,17 +675,102 @@ static bool __init sgx_setup_epc_section(u64 phys_addr, u64 size, } section->phys_addr = phys_addr; + xa_store_range(&sgx_epc_address_space, section->phys_addr, + phys_addr + size - 1, section, GFP_KERNEL); for (i = 0; i < nr_pages; i++) { section->pages[i].section = index; section->pages[i].flags = 0; section->pages[i].owner = NULL; + section->pages[i].poison = 0; list_add_tail(§ion->pages[i].list, &sgx_dirty_page_list); } return true; } +bool arch_is_platform_page(u64 paddr) +{ + return !!xa_load(&sgx_epc_address_space, paddr); +} +EXPORT_SYMBOL_GPL(arch_is_platform_page); + +static struct sgx_epc_page *sgx_paddr_to_page(u64 paddr) +{ + struct sgx_epc_section *section; + + section = xa_load(&sgx_epc_address_space, paddr); + if (!section) + return NULL; + + return §ion->pages[PFN_DOWN(paddr - section->phys_addr)]; +} + +/* + * Called in process context to handle a hardware reported + * error in an SGX EPC page. + * If the MF_ACTION_REQUIRED bit is set in flags, then the + * context is the task that consumed the poison data. Otherwise + * this is called from a kernel thread unrelated to the page. + */ +int arch_memory_failure(unsigned long pfn, int flags) +{ + struct sgx_epc_page *page = sgx_paddr_to_page(pfn << PAGE_SHIFT); + struct sgx_epc_section *section; + struct sgx_numa_node *node; + + /* + * mm/memory-failure.c calls this routine for all errors + * where there isn't a "struct page" for the address. But that + * includes other address ranges besides SGX. + */ + if (!page) + return -ENXIO; + + /* + * If poison was consumed synchronously. Send a SIGBUS to + * the task. Hardware has already exited the SGX enclave and + * will not allow re-entry to an enclave that has a memory + * error. The signal may help the task understand why the + * enclave is broken. + */ + if (flags & MF_ACTION_REQUIRED) + force_sig(SIGBUS); + + section = &sgx_epc_sections[page->section]; + node = section->node; + + spin_lock(&node->lock); + + /* Already poisoned? Nothing more to do */ + if (page->poison) + goto out; + + page->poison = 1; + + /* + * If the page is on a free list, move it to the per-node + * poison page list. + */ + if (page->flags & SGX_EPC_PAGE_IS_FREE) { + list_move(&page->list, &node->sgx_poison_page_list); + goto out; + } + + /* + * TBD: Add additional plumbing to enable pre-emptive + * action for asynchronous poison notification. Until + * then just hope that the poison: + * a) is not accessed - sgx_free_epc_page() will deal with it + * when the user gives it back + * b) results in a recoverable machine check rather than + * a fatal one + */ +out: + spin_unlock(&node->lock); + return 0; +} + /** * A section metric is concatenated in a way that @low bits 12-31 define the * bits 12-31 of the metric and @high bits 0-19 define the bits 32-51 of the @@ -670,6 +782,48 @@ static inline u64 __init sgx_calc_section_metric(u64 low, u64 high) ((high & GENMASK_ULL(19, 0)) << 32); } +#ifdef CONFIG_NUMA +static ssize_t sgx_total_bytes_show(struct device *dev, struct device_attribute *attr, char *buf) +{ + return sysfs_emit(buf, "%lu\n", sgx_numa_nodes[dev->id].size); +} +static DEVICE_ATTR_RO(sgx_total_bytes); + +static umode_t arch_node_attr_is_visible(struct kobject *kobj, + struct attribute *attr, int idx) +{ + /* Make all x86/ attributes invisible when SGX is not initialized: */ + if (nodes_empty(sgx_numa_mask)) + return 0; + + return attr->mode; +} + +static struct attribute *arch_node_dev_attrs[] = { + &dev_attr_sgx_total_bytes.attr, + NULL, +}; + +const struct attribute_group arch_node_dev_group = { + .name = "x86", + .attrs = arch_node_dev_attrs, + .is_visible = arch_node_attr_is_visible, +}; + +static void __init arch_update_sysfs_visibility(int nid) +{ + struct node *node = node_devices[nid]; + int ret; + + ret = sysfs_update_group(&node->dev.kobj, &arch_node_dev_group); + + if (ret) + pr_err("sysfs update failed (%d), files may be invisible", ret); +} +#else /* !CONFIG_NUMA */ +static void __init arch_update_sysfs_visibility(int nid) {} +#endif + static bool __init sgx_page_cache_init(void) { u32 eax, ebx, ecx, edx, type; @@ -713,10 +867,16 @@ static bool __init sgx_page_cache_init(void) if (!node_isset(nid, sgx_numa_mask)) { spin_lock_init(&sgx_numa_nodes[nid].lock); INIT_LIST_HEAD(&sgx_numa_nodes[nid].free_page_list); + INIT_LIST_HEAD(&sgx_numa_nodes[nid].sgx_poison_page_list); node_set(nid, sgx_numa_mask); + sgx_numa_nodes[nid].size = 0; + + /* Make SGX-specific node sysfs files visible: */ + arch_update_sysfs_visibility(nid); } sgx_epc_sections[i].node = &sgx_numa_nodes[nid]; + sgx_numa_nodes[nid].size += size; sgx_nr_epc_sections++; } diff --git a/arch/x86/kernel/cpu/sgx/sgx.h b/arch/x86/kernel/cpu/sgx/sgx.h index 4628acec0009..0f17def9fe6f 100644 --- a/arch/x86/kernel/cpu/sgx/sgx.h +++ b/arch/x86/kernel/cpu/sgx/sgx.h @@ -26,9 +26,13 @@ /* Pages, which are being tracked by the page reclaimer. */ #define SGX_EPC_PAGE_RECLAIMER_TRACKED BIT(0) +/* Pages on free list */ +#define SGX_EPC_PAGE_IS_FREE BIT(1) + struct sgx_epc_page { unsigned int section; - unsigned int flags; + u16 flags; + u16 poison; struct sgx_encl_page *owner; struct list_head list; }; @@ -39,6 +43,8 @@ struct sgx_epc_page { */ struct sgx_numa_node { struct list_head free_page_list; + struct list_head sgx_poison_page_list; + unsigned long size; spinlock_t lock; }; diff --git a/arch/x86/kernel/fpu/core.c b/arch/x86/kernel/fpu/core.c index 8ea306b1bf8e..dd3777ac0443 100644 --- a/arch/x86/kernel/fpu/core.c +++ b/arch/x86/kernel/fpu/core.c @@ -99,6 +99,19 @@ bool irq_fpu_usable(void) EXPORT_SYMBOL(irq_fpu_usable); /* + * Track AVX512 state use because it is known to slow the max clock + * speed of the core. + */ +static void update_avx_timestamp(struct fpu *fpu) +{ + +#define AVX512_TRACKING_MASK (XFEATURE_MASK_ZMM_Hi256 | XFEATURE_MASK_Hi16_ZMM) + + if (fpu->fpstate->regs.xsave.header.xfeatures & AVX512_TRACKING_MASK) + fpu->avx512_timestamp = jiffies; +} + +/* * Save the FPU register state in fpu->fpstate->regs. The register state is * preserved. * @@ -116,13 +129,7 @@ void save_fpregs_to_fpstate(struct fpu *fpu) { if (likely(use_xsave())) { os_xsave(fpu->fpstate); - - /* - * AVX512 state is tracked here because its use is - * known to slow the max clock speed of the core. - */ - if (fpu->fpstate->regs.xsave.header.xfeatures & XFEATURE_MASK_AVX512) - fpu->avx512_timestamp = jiffies; + update_avx_timestamp(fpu); return; } diff --git a/arch/x86/kernel/head64.c b/arch/x86/kernel/head64.c index fc5371a7e9d1..de563db9cdcd 100644 --- a/arch/x86/kernel/head64.c +++ b/arch/x86/kernel/head64.c @@ -126,6 +126,36 @@ static bool __head check_la57_support(unsigned long physaddr) } #endif +static unsigned long sme_postprocess_startup(struct boot_params *bp, pmdval_t *pmd) +{ + unsigned long vaddr, vaddr_end; + int i; + + /* Encrypt the kernel and related (if SME is active) */ + sme_encrypt_kernel(bp); + + /* + * Clear the memory encryption mask from the .bss..decrypted section. + * The bss section will be memset to zero later in the initialization so + * there is no need to zero it after changing the memory encryption + * attribute. + */ + if (sme_get_me_mask()) { + vaddr = (unsigned long)__start_bss_decrypted; + vaddr_end = (unsigned long)__end_bss_decrypted; + for (; vaddr < vaddr_end; vaddr += PMD_SIZE) { + i = pmd_index(vaddr); + pmd[i] -= sme_get_me_mask(); + } + } + + /* + * Return the SME encryption mask (if SME is active) to be used as a + * modifier for the initial pgdir entry programmed into CR3. + */ + return sme_get_me_mask(); +} + /* Code in __startup_64() can be relocated during execution, but the compiler * doesn't have to generate PC-relative relocations when accessing globals from * that function. Clang actually does not generate them, which leads to @@ -135,7 +165,6 @@ static bool __head check_la57_support(unsigned long physaddr) unsigned long __head __startup_64(unsigned long physaddr, struct boot_params *bp) { - unsigned long vaddr, vaddr_end; unsigned long load_delta, *p; unsigned long pgtable_flags; pgdval_t *pgd; @@ -276,34 +305,7 @@ unsigned long __head __startup_64(unsigned long physaddr, */ *fixup_long(&phys_base, physaddr) += load_delta - sme_get_me_mask(); - /* Encrypt the kernel and related (if SME is active) */ - sme_encrypt_kernel(bp); - - /* - * Clear the memory encryption mask from the .bss..decrypted section. - * The bss section will be memset to zero later in the initialization so - * there is no need to zero it after changing the memory encryption - * attribute. - * - * This is early code, use an open coded check for SME instead of - * using cc_platform_has(). This eliminates worries about removing - * instrumentation or checking boot_cpu_data in the cc_platform_has() - * function. - */ - if (sme_get_me_mask()) { - vaddr = (unsigned long)__start_bss_decrypted; - vaddr_end = (unsigned long)__end_bss_decrypted; - for (; vaddr < vaddr_end; vaddr += PMD_SIZE) { - i = pmd_index(vaddr); - pmd[i] -= sme_get_me_mask(); - } - } - - /* - * Return the SME encryption mask (if SME is active) to be used as a - * modifier for the initial pgdir entry programmed into CR3. - */ - return sme_get_me_mask(); + return sme_postprocess_startup(bp, pmd); } unsigned long __startup_secondary_64(void) @@ -485,6 +487,10 @@ asmlinkage __visible void __init x86_64_start_kernel(char * real_mode_data) clear_bss(); + /* + * This needs to happen *before* kasan_early_init() because latter maps stuff + * into that page. + */ clear_page(init_top_pgt); /* @@ -496,6 +502,16 @@ asmlinkage __visible void __init x86_64_start_kernel(char * real_mode_data) kasan_early_init(); + /* + * Flush global TLB entries which could be left over from the trampoline page + * table. + * + * This needs to happen *after* kasan_early_init() as KASAN-enabled .configs + * instrument native_write_cr4() so KASAN must be initialized for that + * instrumentation to work. + */ + __native_tlb_flush_global(this_cpu_read(cpu_tlbstate.cr4)); + idt_setup_early_handler(); copy_bootdata(__va(real_mode_data)); diff --git a/arch/x86/kernel/head_64.S b/arch/x86/kernel/head_64.S index d8b3ebd2bb85..9c63fc5988cd 100644 --- a/arch/x86/kernel/head_64.S +++ b/arch/x86/kernel/head_64.S @@ -166,9 +166,26 @@ SYM_INNER_LABEL(secondary_startup_64_no_verify, SYM_L_GLOBAL) call sev_verify_cbit popq %rsi - /* Switch to new page-table */ + /* + * Switch to new page-table + * + * For the boot CPU this switches to early_top_pgt which still has the + * indentity mappings present. The secondary CPUs will switch to the + * init_top_pgt here, away from the trampoline_pgd and unmap the + * indentity mapped ranges. + */ movq %rax, %cr3 + /* + * Do a global TLB flush after the CR3 switch to make sure the TLB + * entries from the identity mapping are flushed. + */ + movq %cr4, %rcx + movq %rcx, %rax + xorq $X86_CR4_PGE, %rcx + movq %rcx, %cr4 + movq %rax, %cr4 + /* Ensure I am executing from virtual addresses */ movq $1f, %rax ANNOTATE_RETPOLINE_SAFE diff --git a/arch/x86/kernel/reboot.c b/arch/x86/kernel/reboot.c index 0a40df66a40d..fa700b46588e 100644 --- a/arch/x86/kernel/reboot.c +++ b/arch/x86/kernel/reboot.c @@ -113,17 +113,9 @@ void __noreturn machine_real_restart(unsigned int type) spin_unlock(&rtc_lock); /* - * Switch back to the initial page table. + * Switch to the trampoline page table. */ -#ifdef CONFIG_X86_32 - load_cr3(initial_page_table); -#else - write_cr3(real_mode_header->trampoline_pgd); - - /* Exiting long mode will fail if CR4.PCIDE is set. */ - if (boot_cpu_has(X86_FEATURE_PCID)) - cr4_clear_bits(X86_CR4_PCIDE); -#endif + load_trampoline_pgtable(); /* Jump to the identity-mapped low memory code */ #ifdef CONFIG_X86_32 diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c index 6a190c7f4d71..f7a132eb794d 100644 --- a/arch/x86/kernel/setup.c +++ b/arch/x86/kernel/setup.c @@ -40,6 +40,7 @@ #include <asm/kasan.h> #include <asm/kaslr.h> #include <asm/mce.h> +#include <asm/memtype.h> #include <asm/mtrr.h> #include <asm/realmode.h> #include <asm/olpc_ofw.h> @@ -713,9 +714,6 @@ static void __init early_reserve_memory(void) early_reserve_initrd(); - if (efi_enabled(EFI_BOOT)) - efi_memblock_x86_reserve_range(); - memblock_x86_reserve_range_setup_data(); reserve_ibft_region(); @@ -742,28 +740,6 @@ dump_kernel_offset(struct notifier_block *self, unsigned long v, void *p) return 0; } -static char * __init prepare_command_line(void) -{ -#ifdef CONFIG_CMDLINE_BOOL -#ifdef CONFIG_CMDLINE_OVERRIDE - strlcpy(boot_command_line, builtin_cmdline, COMMAND_LINE_SIZE); -#else - if (builtin_cmdline[0]) { - /* append boot loader cmdline to builtin */ - strlcat(builtin_cmdline, " ", COMMAND_LINE_SIZE); - strlcat(builtin_cmdline, boot_command_line, COMMAND_LINE_SIZE); - strlcpy(boot_command_line, builtin_cmdline, COMMAND_LINE_SIZE); - } -#endif -#endif - - strlcpy(command_line, boot_command_line, COMMAND_LINE_SIZE); - - parse_early_param(); - - return command_line; -} - /* * Determine if we were loaded by an EFI loader. If so, then we have also been * passed the efi memmap, systab, etc., so we should use these data structures @@ -853,23 +829,6 @@ void __init setup_arch(char **cmdline_p) x86_init.oem.arch_setup(); /* - * x86_configure_nx() is called before parse_early_param() (called by - * prepare_command_line()) to detect whether hardware doesn't support - * NX (so that the early EHCI debug console setup can safely call - * set_fixmap()). It may then be called again from within noexec_setup() - * during parsing early parameters to honor the respective command line - * option. - */ - x86_configure_nx(); - - /* - * This parses early params and it needs to run before - * early_reserve_memory() because latter relies on such settings - * supplied as early params. - */ - *cmdline_p = prepare_command_line(); - - /* * Do some memory reservations *before* memory is added to memblock, so * memblock allocations won't overwrite it. * @@ -902,6 +861,36 @@ void __init setup_arch(char **cmdline_p) bss_resource.start = __pa_symbol(__bss_start); bss_resource.end = __pa_symbol(__bss_stop)-1; +#ifdef CONFIG_CMDLINE_BOOL +#ifdef CONFIG_CMDLINE_OVERRIDE + strlcpy(boot_command_line, builtin_cmdline, COMMAND_LINE_SIZE); +#else + if (builtin_cmdline[0]) { + /* append boot loader cmdline to builtin */ + strlcat(builtin_cmdline, " ", COMMAND_LINE_SIZE); + strlcat(builtin_cmdline, boot_command_line, COMMAND_LINE_SIZE); + strlcpy(boot_command_line, builtin_cmdline, COMMAND_LINE_SIZE); + } +#endif +#endif + + strlcpy(command_line, boot_command_line, COMMAND_LINE_SIZE); + *cmdline_p = command_line; + + /* + * x86_configure_nx() is called before parse_early_param() to detect + * whether hardware doesn't support NX (so that the early EHCI debug + * console setup can safely call set_fixmap()). It may then be called + * again from within noexec_setup() during parsing early parameters + * to honor the respective command line option. + */ + x86_configure_nx(); + + parse_early_param(); + + if (efi_enabled(EFI_BOOT)) + efi_memblock_x86_reserve_range(); + #ifdef CONFIG_MEMORY_HOTPLUG /* * Memory used by the kernel cannot be hot-removed because Linux @@ -979,7 +968,11 @@ void __init setup_arch(char **cmdline_p) max_pfn = e820__end_of_ram_pfn(); /* update e820 for memory not covered by WB MTRRs */ - mtrr_bp_init(); + if (IS_ENABLED(CONFIG_MTRR)) + mtrr_bp_init(); + else + pat_disable("PAT support disabled because CONFIG_MTRR is disabled in the kernel."); + if (mtrr_trim_uncached_memory(max_pfn)) max_pfn = e820__end_of_ram_pfn(); diff --git a/arch/x86/kernel/sev-shared.c b/arch/x86/kernel/sev-shared.c index 787dc5f568b5..ce987688bbc0 100644 --- a/arch/x86/kernel/sev-shared.c +++ b/arch/x86/kernel/sev-shared.c @@ -221,7 +221,7 @@ void __init do_vc_no_ghcb(struct pt_regs *regs, unsigned long exit_code) fail: /* Terminate the guest */ - sev_es_terminate(GHCB_SEV_ES_REASON_GENERAL_REQUEST); + sev_es_terminate(GHCB_SEV_ES_GEN_REQ); } static enum es_result vc_insn_string_read(struct es_em_ctxt *ctxt, diff --git a/arch/x86/kernel/sev.c b/arch/x86/kernel/sev.c index a9fc2ac7a8bd..e6d316a01fdd 100644 --- a/arch/x86/kernel/sev.c +++ b/arch/x86/kernel/sev.c @@ -26,6 +26,7 @@ #include <asm/fpu/xcr.h> #include <asm/processor.h> #include <asm/realmode.h> +#include <asm/setup.h> #include <asm/traps.h> #include <asm/svm.h> #include <asm/smp.h> @@ -86,9 +87,6 @@ struct ghcb_state { static DEFINE_PER_CPU(struct sev_es_runtime_data*, runtime_data); DEFINE_STATIC_KEY_FALSE(sev_es_enable_key); -/* Needed in vc_early_forward_exception */ -void do_early_exception(struct pt_regs *regs, int trapnr); - static __always_inline bool on_vc_stack(struct pt_regs *regs) { unsigned long sp = regs->sp; @@ -209,9 +207,6 @@ static noinstr struct ghcb *__sev_get_ghcb(struct ghcb_state *state) return ghcb; } -/* Needed in vc_early_forward_exception */ -void do_early_exception(struct pt_regs *regs, int trapnr); - static inline u64 sev_es_rd_ghcb_msr(void) { return __rdmsr(MSR_AMD64_SEV_ES_GHCB); @@ -797,22 +792,6 @@ static void __init vc_early_forward_exception(struct es_em_ctxt *ctxt) do_early_exception(ctxt->regs, trapnr); } -static long *vc_insn_get_reg(struct es_em_ctxt *ctxt) -{ - long *reg_array; - int offset; - - reg_array = (long *)ctxt->regs; - offset = insn_get_modrm_reg_off(&ctxt->insn, ctxt->regs); - - if (offset < 0) - return NULL; - - offset /= sizeof(long); - - return reg_array + offset; -} - static long *vc_insn_get_rm(struct es_em_ctxt *ctxt) { long *reg_array; @@ -860,76 +839,6 @@ static enum es_result vc_do_mmio(struct ghcb *ghcb, struct es_em_ctxt *ctxt, return sev_es_ghcb_hv_call(ghcb, true, ctxt, exit_code, exit_info_1, exit_info_2); } -static enum es_result vc_handle_mmio_twobyte_ops(struct ghcb *ghcb, - struct es_em_ctxt *ctxt) -{ - struct insn *insn = &ctxt->insn; - unsigned int bytes = 0; - enum es_result ret; - int sign_byte; - long *reg_data; - - switch (insn->opcode.bytes[1]) { - /* MMIO Read w/ zero-extension */ - case 0xb6: - bytes = 1; - fallthrough; - case 0xb7: - if (!bytes) - bytes = 2; - - ret = vc_do_mmio(ghcb, ctxt, bytes, true); - if (ret) - break; - - /* Zero extend based on operand size */ - reg_data = vc_insn_get_reg(ctxt); - if (!reg_data) - return ES_DECODE_FAILED; - - memset(reg_data, 0, insn->opnd_bytes); - - memcpy(reg_data, ghcb->shared_buffer, bytes); - break; - - /* MMIO Read w/ sign-extension */ - case 0xbe: - bytes = 1; - fallthrough; - case 0xbf: - if (!bytes) - bytes = 2; - - ret = vc_do_mmio(ghcb, ctxt, bytes, true); - if (ret) - break; - - /* Sign extend based on operand size */ - reg_data = vc_insn_get_reg(ctxt); - if (!reg_data) - return ES_DECODE_FAILED; - - if (bytes == 1) { - u8 *val = (u8 *)ghcb->shared_buffer; - - sign_byte = (*val & 0x80) ? 0xff : 0x00; - } else { - u16 *val = (u16 *)ghcb->shared_buffer; - - sign_byte = (*val & 0x8000) ? 0xff : 0x00; - } - memset(reg_data, sign_byte, insn->opnd_bytes); - - memcpy(reg_data, ghcb->shared_buffer, bytes); - break; - - default: - ret = ES_UNSUPPORTED; - } - - return ret; -} - /* * The MOVS instruction has two memory operands, which raises the * problem that it is not known whether the access to the source or the @@ -997,83 +906,79 @@ static enum es_result vc_handle_mmio_movs(struct es_em_ctxt *ctxt, return ES_RETRY; } -static enum es_result vc_handle_mmio(struct ghcb *ghcb, - struct es_em_ctxt *ctxt) +static enum es_result vc_handle_mmio(struct ghcb *ghcb, struct es_em_ctxt *ctxt) { struct insn *insn = &ctxt->insn; unsigned int bytes = 0; + enum mmio_type mmio; enum es_result ret; + u8 sign_byte; long *reg_data; - switch (insn->opcode.bytes[0]) { - /* MMIO Write */ - case 0x88: - bytes = 1; - fallthrough; - case 0x89: - if (!bytes) - bytes = insn->opnd_bytes; + mmio = insn_decode_mmio(insn, &bytes); + if (mmio == MMIO_DECODE_FAILED) + return ES_DECODE_FAILED; - reg_data = vc_insn_get_reg(ctxt); + if (mmio != MMIO_WRITE_IMM && mmio != MMIO_MOVS) { + reg_data = insn_get_modrm_reg_ptr(insn, ctxt->regs); if (!reg_data) return ES_DECODE_FAILED; + } + switch (mmio) { + case MMIO_WRITE: memcpy(ghcb->shared_buffer, reg_data, bytes); - ret = vc_do_mmio(ghcb, ctxt, bytes, false); break; - - case 0xc6: - bytes = 1; - fallthrough; - case 0xc7: - if (!bytes) - bytes = insn->opnd_bytes; - + case MMIO_WRITE_IMM: memcpy(ghcb->shared_buffer, insn->immediate1.bytes, bytes); - ret = vc_do_mmio(ghcb, ctxt, bytes, false); break; - - /* MMIO Read */ - case 0x8a: - bytes = 1; - fallthrough; - case 0x8b: - if (!bytes) - bytes = insn->opnd_bytes; - + case MMIO_READ: ret = vc_do_mmio(ghcb, ctxt, bytes, true); if (ret) break; - reg_data = vc_insn_get_reg(ctxt); - if (!reg_data) - return ES_DECODE_FAILED; - /* Zero-extend for 32-bit operation */ if (bytes == 4) *reg_data = 0; memcpy(reg_data, ghcb->shared_buffer, bytes); break; + case MMIO_READ_ZERO_EXTEND: + ret = vc_do_mmio(ghcb, ctxt, bytes, true); + if (ret) + break; - /* MOVS instruction */ - case 0xa4: - bytes = 1; - fallthrough; - case 0xa5: - if (!bytes) - bytes = insn->opnd_bytes; + /* Zero extend based on operand size */ + memset(reg_data, 0, insn->opnd_bytes); + memcpy(reg_data, ghcb->shared_buffer, bytes); + break; + case MMIO_READ_SIGN_EXTEND: + ret = vc_do_mmio(ghcb, ctxt, bytes, true); + if (ret) + break; - ret = vc_handle_mmio_movs(ctxt, bytes); + if (bytes == 1) { + u8 *val = (u8 *)ghcb->shared_buffer; + + sign_byte = (*val & 0x80) ? 0xff : 0x00; + } else { + u16 *val = (u16 *)ghcb->shared_buffer; + + sign_byte = (*val & 0x8000) ? 0xff : 0x00; + } + + /* Sign extend based on operand size */ + memset(reg_data, sign_byte, insn->opnd_bytes); + memcpy(reg_data, ghcb->shared_buffer, bytes); break; - /* Two-Byte Opcodes */ - case 0x0f: - ret = vc_handle_mmio_twobyte_ops(ghcb, ctxt); + case MMIO_MOVS: + ret = vc_handle_mmio_movs(ctxt, bytes); break; default: ret = ES_UNSUPPORTED; + break; } return ret; @@ -1432,7 +1337,7 @@ DEFINE_IDTENTRY_VC_KERNEL(exc_vmm_communication) show_regs(regs); /* Ask hypervisor to sev_es_terminate */ - sev_es_terminate(GHCB_SEV_ES_REASON_GENERAL_REQUEST); + sev_es_terminate(GHCB_SEV_ES_GEN_REQ); /* If that fails and we get here - just panic */ panic("Returned from Terminate-Request to Hypervisor\n"); @@ -1480,7 +1385,7 @@ bool __init handle_vc_boot_ghcb(struct pt_regs *regs) /* Do initial setup or terminate the guest */ if (unlikely(boot_ghcb == NULL && !sev_es_setup_ghcb())) - sev_es_terminate(GHCB_SEV_ES_REASON_GENERAL_REQUEST); + sev_es_terminate(GHCB_SEV_ES_GEN_REQ); vc_ghcb_invalidate(boot_ghcb); diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c index ac2909f0cab3..617012f4619f 100644 --- a/arch/x86/kernel/smpboot.c +++ b/arch/x86/kernel/smpboot.c @@ -579,6 +579,17 @@ static struct sched_domain_topology_level x86_numa_in_package_topology[] = { { NULL, }, }; +static struct sched_domain_topology_level x86_hybrid_topology[] = { +#ifdef CONFIG_SCHED_SMT + { cpu_smt_mask, x86_smt_flags, SD_INIT_NAME(SMT) }, +#endif +#ifdef CONFIG_SCHED_MC + { cpu_coregroup_mask, x86_core_flags, SD_INIT_NAME(MC) }, +#endif + { cpu_cpu_mask, SD_INIT_NAME(DIE) }, + { NULL, }, +}; + static struct sched_domain_topology_level x86_topology[] = { #ifdef CONFIG_SCHED_SMT { cpu_smt_mask, x86_smt_flags, SD_INIT_NAME(SMT) }, @@ -1469,8 +1480,11 @@ void __init native_smp_cpus_done(unsigned int max_cpus) calculate_max_logical_packages(); + /* XXX for now assume numa-in-package and hybrid don't overlap */ if (x86_has_numa_in_package) set_sched_topology(x86_numa_in_package_topology); + if (cpu_feature_enabled(X86_FEATURE_HYBRID_CPU)) + set_sched_topology(x86_hybrid_topology); nmi_selftest(); impress_friends(); diff --git a/arch/x86/kvm/debugfs.c b/arch/x86/kvm/debugfs.c index 54a83a744538..f33c804a922a 100644 --- a/arch/x86/kvm/debugfs.c +++ b/arch/x86/kvm/debugfs.c @@ -95,6 +95,9 @@ static int kvm_mmu_rmaps_stat_show(struct seq_file *m, void *v) unsigned int *log[KVM_NR_PAGE_SIZES], *cur; int i, j, k, l, ret; + if (!kvm_memslots_have_rmaps(kvm)) + return 0; + ret = -ENOMEM; memset(log, 0, sizeof(log)); for (i = 0; i < KVM_NR_PAGE_SIZES; i++) { diff --git a/arch/x86/kvm/hyperv.c b/arch/x86/kvm/hyperv.c index 5e19e6e4c2ce..8d8c1cc7cb53 100644 --- a/arch/x86/kvm/hyperv.c +++ b/arch/x86/kvm/hyperv.c @@ -1922,11 +1922,13 @@ static u64 kvm_hv_send_ipi(struct kvm_vcpu *vcpu, struct kvm_hv_hcall *hc, bool all_cpus = send_ipi_ex.vp_set.format == HV_GENERIC_SET_ALL; + if (all_cpus) + goto check_and_send_ipi; + if (!sparse_banks_len) goto ret_success; - if (!all_cpus && - kvm_read_guest(kvm, + if (kvm_read_guest(kvm, hc->ingpa + offsetof(struct hv_send_ipi_ex, vp_set.bank_contents), sparse_banks, @@ -1934,6 +1936,7 @@ static u64 kvm_hv_send_ipi(struct kvm_vcpu *vcpu, struct kvm_hv_hcall *hc, bool return HV_STATUS_INVALID_HYPERCALL_INPUT; } +check_and_send_ipi: if ((vector < HV_IPI_LOW_VECTOR) || (vector > HV_IPI_HIGH_VECTOR)) return HV_STATUS_INVALID_HYPERCALL_INPUT; diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c index e2e1d012df22..fcdf3f8bb59a 100644 --- a/arch/x86/kvm/mmu/mmu.c +++ b/arch/x86/kvm/mmu/mmu.c @@ -3987,7 +3987,21 @@ out_retry: static bool is_page_fault_stale(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault, int mmu_seq) { - if (is_obsolete_sp(vcpu->kvm, to_shadow_page(vcpu->arch.mmu->root_hpa))) + struct kvm_mmu_page *sp = to_shadow_page(vcpu->arch.mmu->root_hpa); + + /* Special roots, e.g. pae_root, are not backed by shadow pages. */ + if (sp && is_obsolete_sp(vcpu->kvm, sp)) + return true; + + /* + * Roots without an associated shadow page are considered invalid if + * there is a pending request to free obsolete roots. The request is + * only a hint that the current root _may_ be obsolete and needs to be + * reloaded, e.g. if the guest frees a PGD that KVM is tracking as a + * previous root, then __kvm_mmu_prepare_zap_page() signals all vCPUs + * to reload even if no vCPU is actively using the root. + */ + if (!sp && kvm_test_request(KVM_REQ_MMU_RELOAD, vcpu)) return true; return fault->slot && diff --git a/arch/x86/kvm/mmu/spte.c b/arch/x86/kvm/mmu/spte.c index 0c76c45fdb68..fad546df0bba 100644 --- a/arch/x86/kvm/mmu/spte.c +++ b/arch/x86/kvm/mmu/spte.c @@ -16,6 +16,7 @@ #include "spte.h" #include <asm/e820/api.h> +#include <asm/memtype.h> #include <asm/vmx.h> static bool __read_mostly enable_mmio_caching = true; diff --git a/arch/x86/kvm/mmu/tdp_iter.c b/arch/x86/kvm/mmu/tdp_iter.c index b3ed302c1a35..caa96c270b95 100644 --- a/arch/x86/kvm/mmu/tdp_iter.c +++ b/arch/x86/kvm/mmu/tdp_iter.c @@ -26,6 +26,7 @@ static gfn_t round_gfn_for_level(gfn_t gfn, int level) */ void tdp_iter_restart(struct tdp_iter *iter) { + iter->yielded = false; iter->yielded_gfn = iter->next_last_level_gfn; iter->level = iter->root_level; @@ -160,6 +161,11 @@ static bool try_step_up(struct tdp_iter *iter) */ void tdp_iter_next(struct tdp_iter *iter) { + if (iter->yielded) { + tdp_iter_restart(iter); + return; + } + if (try_step_down(iter)) return; diff --git a/arch/x86/kvm/mmu/tdp_iter.h b/arch/x86/kvm/mmu/tdp_iter.h index b1748b988d3a..e19cabbcb65c 100644 --- a/arch/x86/kvm/mmu/tdp_iter.h +++ b/arch/x86/kvm/mmu/tdp_iter.h @@ -45,6 +45,12 @@ struct tdp_iter { * iterator walks off the end of the paging structure. */ bool valid; + /* + * True if KVM dropped mmu_lock and yielded in the middle of a walk, in + * which case tdp_iter_next() needs to restart the walk at the root + * level instead of advancing to the next entry. + */ + bool yielded; }; /* diff --git a/arch/x86/kvm/mmu/tdp_mmu.c b/arch/x86/kvm/mmu/tdp_mmu.c index 1db8496259ad..1beb4ca90560 100644 --- a/arch/x86/kvm/mmu/tdp_mmu.c +++ b/arch/x86/kvm/mmu/tdp_mmu.c @@ -502,6 +502,8 @@ static inline bool tdp_mmu_set_spte_atomic(struct kvm *kvm, struct tdp_iter *iter, u64 new_spte) { + WARN_ON_ONCE(iter->yielded); + lockdep_assert_held_read(&kvm->mmu_lock); /* @@ -575,6 +577,8 @@ static inline void __tdp_mmu_set_spte(struct kvm *kvm, struct tdp_iter *iter, u64 new_spte, bool record_acc_track, bool record_dirty_log) { + WARN_ON_ONCE(iter->yielded); + lockdep_assert_held_write(&kvm->mmu_lock); /* @@ -640,18 +644,19 @@ static inline void tdp_mmu_set_spte_no_dirty_log(struct kvm *kvm, * If this function should yield and flush is set, it will perform a remote * TLB flush before yielding. * - * If this function yields, it will also reset the tdp_iter's walk over the - * paging structure and the calling function should skip to the next - * iteration to allow the iterator to continue its traversal from the - * paging structure root. + * If this function yields, iter->yielded is set and the caller must skip to + * the next iteration, where tdp_iter_next() will reset the tdp_iter's walk + * over the paging structures to allow the iterator to continue its traversal + * from the paging structure root. * - * Return true if this function yielded and the iterator's traversal was reset. - * Return false if a yield was not needed. + * Returns true if this function yielded. */ -static inline bool tdp_mmu_iter_cond_resched(struct kvm *kvm, - struct tdp_iter *iter, bool flush, - bool shared) +static inline bool __must_check tdp_mmu_iter_cond_resched(struct kvm *kvm, + struct tdp_iter *iter, + bool flush, bool shared) { + WARN_ON(iter->yielded); + /* Ensure forward progress has been made before yielding. */ if (iter->next_last_level_gfn == iter->yielded_gfn) return false; @@ -671,12 +676,10 @@ static inline bool tdp_mmu_iter_cond_resched(struct kvm *kvm, WARN_ON(iter->gfn > iter->next_last_level_gfn); - tdp_iter_restart(iter); - - return true; + iter->yielded = true; } - return false; + return iter->yielded; } /* diff --git a/arch/x86/kvm/svm/sev.c b/arch/x86/kvm/svm/sev.c index 7656a2c5662a..be2883141220 100644 --- a/arch/x86/kvm/svm/sev.c +++ b/arch/x86/kvm/svm/sev.c @@ -1565,7 +1565,7 @@ static int sev_lock_two_vms(struct kvm *dst_kvm, struct kvm *src_kvm) r = -EINTR; if (mutex_lock_killable(&dst_kvm->lock)) goto release_src; - if (mutex_lock_killable(&src_kvm->lock)) + if (mutex_lock_killable_nested(&src_kvm->lock, SINGLE_DEPTH_NESTING)) goto unlock_dst; return 0; diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c index d0f68d11ec70..5151efa424ac 100644 --- a/arch/x86/kvm/svm/svm.c +++ b/arch/x86/kvm/svm/svm.c @@ -1585,6 +1585,15 @@ static void svm_set_rflags(struct kvm_vcpu *vcpu, unsigned long rflags) to_svm(vcpu)->vmcb->save.rflags = rflags; } +static bool svm_get_if_flag(struct kvm_vcpu *vcpu) +{ + struct vmcb *vmcb = to_svm(vcpu)->vmcb; + + return sev_es_guest(vcpu->kvm) + ? vmcb->control.int_state & SVM_GUEST_INTERRUPT_MASK + : kvm_get_rflags(vcpu) & X86_EFLAGS_IF; +} + static void svm_cache_reg(struct kvm_vcpu *vcpu, enum kvm_reg reg) { switch (reg) { @@ -3568,14 +3577,7 @@ bool svm_interrupt_blocked(struct kvm_vcpu *vcpu) if (!gif_set(svm)) return true; - if (sev_es_guest(vcpu->kvm)) { - /* - * SEV-ES guests to not expose RFLAGS. Use the VMCB interrupt mask - * bit to determine the state of the IF flag. - */ - if (!(vmcb->control.int_state & SVM_GUEST_INTERRUPT_MASK)) - return true; - } else if (is_guest_mode(vcpu)) { + if (is_guest_mode(vcpu)) { /* As long as interrupts are being delivered... */ if ((svm->nested.ctl.int_ctl & V_INTR_MASKING_MASK) ? !(svm->vmcb01.ptr->save.rflags & X86_EFLAGS_IF) @@ -3586,7 +3588,7 @@ bool svm_interrupt_blocked(struct kvm_vcpu *vcpu) if (nested_exit_on_intr(svm)) return false; } else { - if (!(kvm_get_rflags(vcpu) & X86_EFLAGS_IF)) + if (!svm_get_if_flag(vcpu)) return true; } @@ -4621,6 +4623,7 @@ static struct kvm_x86_ops svm_x86_ops __initdata = { .cache_reg = svm_cache_reg, .get_rflags = svm_get_rflags, .set_rflags = svm_set_rflags, + .get_if_flag = svm_get_if_flag, .tlb_flush_all = svm_flush_tlb, .tlb_flush_current = svm_flush_tlb, diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c index 9453743ce0c4..0dbf94eb954f 100644 --- a/arch/x86/kvm/vmx/vmx.c +++ b/arch/x86/kvm/vmx/vmx.c @@ -1363,6 +1363,11 @@ void vmx_set_rflags(struct kvm_vcpu *vcpu, unsigned long rflags) vmx->emulation_required = vmx_emulation_required(vcpu); } +static bool vmx_get_if_flag(struct kvm_vcpu *vcpu) +{ + return vmx_get_rflags(vcpu) & X86_EFLAGS_IF; +} + u32 vmx_get_interrupt_shadow(struct kvm_vcpu *vcpu) { u32 interruptibility = vmcs_read32(GUEST_INTERRUPTIBILITY_INFO); @@ -2646,15 +2651,6 @@ int alloc_loaded_vmcs(struct loaded_vmcs *loaded_vmcs) if (!loaded_vmcs->msr_bitmap) goto out_vmcs; memset(loaded_vmcs->msr_bitmap, 0xff, PAGE_SIZE); - - if (IS_ENABLED(CONFIG_HYPERV) && - static_branch_unlikely(&enable_evmcs) && - (ms_hyperv.nested_features & HV_X64_NESTED_MSR_BITMAP)) { - struct hv_enlightened_vmcs *evmcs = - (struct hv_enlightened_vmcs *)loaded_vmcs->vmcs; - - evmcs->hv_enlightenments_control.msr_bitmap = 1; - } } memset(&loaded_vmcs->host_state, 0, sizeof(struct vmcs_host_state)); @@ -3968,8 +3964,7 @@ static int vmx_deliver_posted_interrupt(struct kvm_vcpu *vcpu, int vector) if (pi_test_and_set_on(&vmx->pi_desc)) return 0; - if (vcpu != kvm_get_running_vcpu() && - !kvm_vcpu_trigger_posted_interrupt(vcpu, false)) + if (!kvm_vcpu_trigger_posted_interrupt(vcpu, false)) kvm_vcpu_kick(vcpu); return 0; @@ -5886,18 +5881,14 @@ static int __vmx_handle_exit(struct kvm_vcpu *vcpu, fastpath_t exit_fastpath) vmx_flush_pml_buffer(vcpu); /* - * We should never reach this point with a pending nested VM-Enter, and - * more specifically emulation of L2 due to invalid guest state (see - * below) should never happen as that means we incorrectly allowed a - * nested VM-Enter with an invalid vmcs12. + * KVM should never reach this point with a pending nested VM-Enter. + * More specifically, short-circuiting VM-Entry to emulate L2 due to + * invalid guest state should never happen as that means KVM knowingly + * allowed a nested VM-Enter with an invalid vmcs12. More below. */ if (KVM_BUG_ON(vmx->nested.nested_run_pending, vcpu->kvm)) return -EIO; - /* If guest state is invalid, start emulating */ - if (vmx->emulation_required) - return handle_invalid_guest_state(vcpu); - if (is_guest_mode(vcpu)) { /* * PML is never enabled when running L2, bail immediately if a @@ -5919,10 +5910,30 @@ static int __vmx_handle_exit(struct kvm_vcpu *vcpu, fastpath_t exit_fastpath) */ nested_mark_vmcs12_pages_dirty(vcpu); + /* + * Synthesize a triple fault if L2 state is invalid. In normal + * operation, nested VM-Enter rejects any attempt to enter L2 + * with invalid state. However, those checks are skipped if + * state is being stuffed via RSM or KVM_SET_NESTED_STATE. If + * L2 state is invalid, it means either L1 modified SMRAM state + * or userspace provided bad state. Synthesize TRIPLE_FAULT as + * doing so is architecturally allowed in the RSM case, and is + * the least awful solution for the userspace case without + * risking false positives. + */ + if (vmx->emulation_required) { + nested_vmx_vmexit(vcpu, EXIT_REASON_TRIPLE_FAULT, 0, 0); + return 1; + } + if (nested_vmx_reflect_vmexit(vcpu)) return 1; } + /* If guest state is invalid, start emulating. L2 is handled above. */ + if (vmx->emulation_required) + return handle_invalid_guest_state(vcpu); + if (exit_reason.failed_vmentry) { dump_vmcs(vcpu); vcpu->run->exit_reason = KVM_EXIT_FAIL_ENTRY; @@ -6617,9 +6628,7 @@ static fastpath_t vmx_vcpu_run(struct kvm_vcpu *vcpu) * consistency check VM-Exit due to invalid guest state and bail. */ if (unlikely(vmx->emulation_required)) { - - /* We don't emulate invalid state of a nested guest */ - vmx->fail = is_guest_mode(vcpu); + vmx->fail = 0; vmx->exit_reason.full = EXIT_REASON_INVALID_STATE; vmx->exit_reason.failed_vmentry = 1; @@ -6842,6 +6851,19 @@ static int vmx_create_vcpu(struct kvm_vcpu *vcpu) if (err < 0) goto free_pml; + /* + * Use Hyper-V 'Enlightened MSR Bitmap' feature when KVM runs as a + * nested (L1) hypervisor and Hyper-V in L0 supports it. Enable the + * feature only for vmcs01, KVM currently isn't equipped to realize any + * performance benefits from enabling it for vmcs02. + */ + if (IS_ENABLED(CONFIG_HYPERV) && static_branch_unlikely(&enable_evmcs) && + (ms_hyperv.nested_features & HV_X64_NESTED_MSR_BITMAP)) { + struct hv_enlightened_vmcs *evmcs = (void *)vmx->vmcs01.vmcs; + + evmcs->hv_enlightenments_control.msr_bitmap = 1; + } + /* The MSR bitmap starts with all ones */ bitmap_fill(vmx->shadow_msr_intercept.read, MAX_POSSIBLE_PASSTHROUGH_MSRS); bitmap_fill(vmx->shadow_msr_intercept.write, MAX_POSSIBLE_PASSTHROUGH_MSRS); @@ -7575,6 +7597,7 @@ static struct kvm_x86_ops vmx_x86_ops __initdata = { .cache_reg = vmx_cache_reg, .get_rflags = vmx_get_rflags, .set_rflags = vmx_set_rflags, + .get_if_flag = vmx_get_if_flag, .tlb_flush_all = vmx_flush_tlb_all, .tlb_flush_current = vmx_flush_tlb_current, diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index e0aa4dd53c7f..e50e97ac4408 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -890,7 +890,8 @@ int kvm_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0) !load_pdptrs(vcpu, vcpu->arch.walk_mmu, kvm_read_cr3(vcpu))) return 1; - if (!(cr0 & X86_CR0_PG) && kvm_read_cr4_bits(vcpu, X86_CR4_PCIDE)) + if (!(cr0 & X86_CR0_PG) && + (is_64_bit_mode(vcpu) || kvm_read_cr4_bits(vcpu, X86_CR4_PCIDE))) return 1; static_call(kvm_x86_set_cr0)(vcpu, cr0); @@ -1330,7 +1331,7 @@ static const u32 msrs_to_save_all[] = { MSR_IA32_UMWAIT_CONTROL, MSR_ARCH_PERFMON_FIXED_CTR0, MSR_ARCH_PERFMON_FIXED_CTR1, - MSR_ARCH_PERFMON_FIXED_CTR0 + 2, MSR_ARCH_PERFMON_FIXED_CTR0 + 3, + MSR_ARCH_PERFMON_FIXED_CTR0 + 2, MSR_CORE_PERF_FIXED_CTR_CTRL, MSR_CORE_PERF_GLOBAL_STATUS, MSR_CORE_PERF_GLOBAL_CTRL, MSR_CORE_PERF_GLOBAL_OVF_CTRL, MSR_ARCH_PERFMON_PERFCTR0, MSR_ARCH_PERFMON_PERFCTR1, @@ -3412,7 +3413,7 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info) if (!msr_info->host_initiated) return 1; - if (guest_cpuid_has(vcpu, X86_FEATURE_PDCM) && kvm_get_msr_feature(&msr_ent)) + if (kvm_get_msr_feature(&msr_ent)) return 1; if (data & ~msr_ent.data) return 1; @@ -7121,7 +7122,13 @@ static int emulator_pio_in(struct kvm_vcpu *vcpu, int size, unsigned short port, void *val, unsigned int count) { if (vcpu->arch.pio.count) { - /* Complete previous iteration. */ + /* + * Complete a previous iteration that required userspace I/O. + * Note, @count isn't guaranteed to match pio.count as userspace + * can modify ECX before rerunning the vCPU. Ignore any such + * shenanigans as KVM doesn't support modifying the rep count, + * and the emulator ensures @count doesn't overflow the buffer. + */ } else { int r = __emulator_pio_in(vcpu, size, port, count); if (!r) @@ -7130,7 +7137,6 @@ static int emulator_pio_in(struct kvm_vcpu *vcpu, int size, /* Results already available, fall through. */ } - WARN_ON(count != vcpu->arch.pio.count); complete_emulator_pio_in(vcpu, val); return 1; } @@ -8995,14 +9001,7 @@ static void post_kvm_run_save(struct kvm_vcpu *vcpu) { struct kvm_run *kvm_run = vcpu->run; - /* - * if_flag is obsolete and useless, so do not bother - * setting it for SEV-ES guests. Userspace can just - * use kvm_run->ready_for_interrupt_injection. - */ - kvm_run->if_flag = !vcpu->arch.guest_state_protected - && (kvm_get_rflags(vcpu) & X86_EFLAGS_IF) != 0; - + kvm_run->if_flag = static_call(kvm_x86_get_if_flag)(vcpu); kvm_run->cr8 = kvm_get_cr8(vcpu); kvm_run->apic_base = kvm_get_apic_base(vcpu); diff --git a/arch/x86/lib/insn-eval.c b/arch/x86/lib/insn-eval.c index eb3ccffb9b9d..53e57ef5925c 100644 --- a/arch/x86/lib/insn-eval.c +++ b/arch/x86/lib/insn-eval.c @@ -37,8 +37,6 @@ enum reg_type { */ static bool is_string_insn(struct insn *insn) { - insn_get_opcode(insn); - /* All string instructions have a 1-byte opcode. */ if (insn->opcode.nbytes != 1) return false; @@ -851,6 +849,26 @@ int insn_get_modrm_reg_off(struct insn *insn, struct pt_regs *regs) } /** + * insn_get_modrm_reg_ptr() - Obtain register pointer based on ModRM byte + * @insn: Instruction containing the ModRM byte + * @regs: Register values as seen when entering kernel mode + * + * Returns: + * + * The register indicated by the reg part of the ModRM byte. + * The register is obtained as a pointer within pt_regs. + */ +unsigned long *insn_get_modrm_reg_ptr(struct insn *insn, struct pt_regs *regs) +{ + int offset; + + offset = insn_get_modrm_reg_off(insn, regs); + if (offset < 0) + return NULL; + return (void *)regs + offset; +} + +/** * get_seg_base_limit() - obtain base address and limit of a segment * @insn: Instruction. Must be valid. * @regs: Register values as seen when entering kernel mode @@ -1405,6 +1423,9 @@ void __user *insn_get_addr_ref(struct insn *insn, struct pt_regs *regs) if (!insn || !regs) return (void __user *)-1L; + if (insn_get_opcode(insn)) + return (void __user *)-1L; + switch (insn->addr_bytes) { case 2: return get_addr_ref_16(insn, regs); @@ -1539,3 +1560,87 @@ bool insn_decode_from_regs(struct insn *insn, struct pt_regs *regs, return true; } + +/** + * insn_decode_mmio() - Decode a MMIO instruction + * @insn: Structure to store decoded instruction + * @bytes: Returns size of memory operand + * + * Decodes instruction that used for Memory-mapped I/O. + * + * Returns: + * + * Type of the instruction. Size of the memory operand is stored in + * @bytes. If decode failed, MMIO_DECODE_FAILED returned. + */ +enum mmio_type insn_decode_mmio(struct insn *insn, int *bytes) +{ + enum mmio_type type = MMIO_DECODE_FAILED; + + *bytes = 0; + + if (insn_get_opcode(insn)) + return MMIO_DECODE_FAILED; + + switch (insn->opcode.bytes[0]) { + case 0x88: /* MOV m8,r8 */ + *bytes = 1; + fallthrough; + case 0x89: /* MOV m16/m32/m64, r16/m32/m64 */ + if (!*bytes) + *bytes = insn->opnd_bytes; + type = MMIO_WRITE; + break; + + case 0xc6: /* MOV m8, imm8 */ + *bytes = 1; + fallthrough; + case 0xc7: /* MOV m16/m32/m64, imm16/imm32/imm64 */ + if (!*bytes) + *bytes = insn->opnd_bytes; + type = MMIO_WRITE_IMM; + break; + + case 0x8a: /* MOV r8, m8 */ + *bytes = 1; + fallthrough; + case 0x8b: /* MOV r16/r32/r64, m16/m32/m64 */ + if (!*bytes) + *bytes = insn->opnd_bytes; + type = MMIO_READ; + break; + + case 0xa4: /* MOVS m8, m8 */ + *bytes = 1; + fallthrough; + case 0xa5: /* MOVS m16/m32/m64, m16/m32/m64 */ + if (!*bytes) + *bytes = insn->opnd_bytes; + type = MMIO_MOVS; + break; + + case 0x0f: /* Two-byte instruction */ + switch (insn->opcode.bytes[1]) { + case 0xb6: /* MOVZX r16/r32/r64, m8 */ + *bytes = 1; + fallthrough; + case 0xb7: /* MOVZX r32/r64, m16 */ + if (!*bytes) + *bytes = 2; + type = MMIO_READ_ZERO_EXTEND; + break; + + case 0xbe: /* MOVSX r16/r32/r64, m8 */ + *bytes = 1; + fallthrough; + case 0xbf: /* MOVSX r32/r64, m16 */ + if (!*bytes) + *bytes = 2; + type = MMIO_READ_SIGN_EXTEND; + break; + } + break; + } + + return type; +} diff --git a/arch/x86/mm/Makefile b/arch/x86/mm/Makefile index 5864219221ca..fe3d3061fc11 100644 --- a/arch/x86/mm/Makefile +++ b/arch/x86/mm/Makefile @@ -2,9 +2,11 @@ # Kernel does not boot with instrumentation of tlb.c and mem_encrypt*.c KCOV_INSTRUMENT_tlb.o := n KCOV_INSTRUMENT_mem_encrypt.o := n +KCOV_INSTRUMENT_mem_encrypt_amd.o := n KCOV_INSTRUMENT_mem_encrypt_identity.o := n KASAN_SANITIZE_mem_encrypt.o := n +KASAN_SANITIZE_mem_encrypt_amd.o := n KASAN_SANITIZE_mem_encrypt_identity.o := n # Disable KCSAN entirely, because otherwise we get warnings that some functions @@ -13,6 +15,7 @@ KCSAN_SANITIZE := n ifdef CONFIG_FUNCTION_TRACER CFLAGS_REMOVE_mem_encrypt.o = -pg +CFLAGS_REMOVE_mem_encrypt_amd.o = -pg CFLAGS_REMOVE_mem_encrypt_identity.o = -pg endif @@ -52,6 +55,8 @@ obj-$(CONFIG_X86_INTEL_MEMORY_PROTECTION_KEYS) += pkeys.o obj-$(CONFIG_RANDOMIZE_MEMORY) += kaslr.o obj-$(CONFIG_PAGE_TABLE_ISOLATION) += pti.o -obj-$(CONFIG_AMD_MEM_ENCRYPT) += mem_encrypt.o +obj-$(CONFIG_X86_MEM_ENCRYPT) += mem_encrypt.o +obj-$(CONFIG_AMD_MEM_ENCRYPT) += mem_encrypt_amd.o + obj-$(CONFIG_AMD_MEM_ENCRYPT) += mem_encrypt_identity.o obj-$(CONFIG_AMD_MEM_ENCRYPT) += mem_encrypt_boot.o diff --git a/arch/x86/mm/init.c b/arch/x86/mm/init.c index 1895986842b9..4ba024d5b63a 100644 --- a/arch/x86/mm/init.c +++ b/arch/x86/mm/init.c @@ -714,6 +714,11 @@ static void __init memory_map_bottom_up(unsigned long map_start, static void __init init_trampoline(void) { #ifdef CONFIG_X86_64 + /* + * The code below will alias kernel page-tables in the user-range of the + * address space, including the Global bit. So global TLB entries will + * be created when using the trampoline page-table. + */ if (!kaslr_memory_enabled()) trampoline_pgd_entry = init_top_pgt[pgd_index(__PAGE_OFFSET)]; else diff --git a/arch/x86/mm/mem_encrypt.c b/arch/x86/mm/mem_encrypt.c index 35487305d8af..50d209939c66 100644 --- a/arch/x86/mm/mem_encrypt.c +++ b/arch/x86/mm/mem_encrypt.c @@ -1,419 +1,18 @@ // SPDX-License-Identifier: GPL-2.0-only /* - * AMD Memory Encryption Support + * Memory Encryption Support Common Code * * Copyright (C) 2016 Advanced Micro Devices, Inc. * * Author: Tom Lendacky <thomas.lendacky@amd.com> */ -#define DISABLE_BRANCH_PROFILING - -#include <linux/linkage.h> -#include <linux/init.h> -#include <linux/mm.h> #include <linux/dma-direct.h> +#include <linux/dma-mapping.h> #include <linux/swiotlb.h> +#include <linux/cc_platform.h> #include <linux/mem_encrypt.h> -#include <linux/device.h> -#include <linux/kernel.h> -#include <linux/bitops.h> -#include <linux/dma-mapping.h> #include <linux/virtio_config.h> -#include <linux/cc_platform.h> - -#include <asm/tlbflush.h> -#include <asm/fixmap.h> -#include <asm/setup.h> -#include <asm/bootparam.h> -#include <asm/set_memory.h> -#include <asm/cacheflush.h> -#include <asm/processor-flags.h> -#include <asm/msr.h> -#include <asm/cmdline.h> - -#include "mm_internal.h" - -/* - * Since SME related variables are set early in the boot process they must - * reside in the .data section so as not to be zeroed out when the .bss - * section is later cleared. - */ -u64 sme_me_mask __section(".data") = 0; -u64 sev_status __section(".data") = 0; -u64 sev_check_data __section(".data") = 0; -EXPORT_SYMBOL(sme_me_mask); -DEFINE_STATIC_KEY_FALSE(sev_enable_key); -EXPORT_SYMBOL_GPL(sev_enable_key); - -/* Buffer used for early in-place encryption by BSP, no locking needed */ -static char sme_early_buffer[PAGE_SIZE] __initdata __aligned(PAGE_SIZE); - -/* - * This routine does not change the underlying encryption setting of the - * page(s) that map this memory. It assumes that eventually the memory is - * meant to be accessed as either encrypted or decrypted but the contents - * are currently not in the desired state. - * - * This routine follows the steps outlined in the AMD64 Architecture - * Programmer's Manual Volume 2, Section 7.10.8 Encrypt-in-Place. - */ -static void __init __sme_early_enc_dec(resource_size_t paddr, - unsigned long size, bool enc) -{ - void *src, *dst; - size_t len; - - if (!sme_me_mask) - return; - - wbinvd(); - - /* - * There are limited number of early mapping slots, so map (at most) - * one page at time. - */ - while (size) { - len = min_t(size_t, sizeof(sme_early_buffer), size); - - /* - * Create mappings for the current and desired format of - * the memory. Use a write-protected mapping for the source. - */ - src = enc ? early_memremap_decrypted_wp(paddr, len) : - early_memremap_encrypted_wp(paddr, len); - - dst = enc ? early_memremap_encrypted(paddr, len) : - early_memremap_decrypted(paddr, len); - - /* - * If a mapping can't be obtained to perform the operation, - * then eventual access of that area in the desired mode - * will cause a crash. - */ - BUG_ON(!src || !dst); - - /* - * Use a temporary buffer, of cache-line multiple size, to - * avoid data corruption as documented in the APM. - */ - memcpy(sme_early_buffer, src, len); - memcpy(dst, sme_early_buffer, len); - - early_memunmap(dst, len); - early_memunmap(src, len); - - paddr += len; - size -= len; - } -} - -void __init sme_early_encrypt(resource_size_t paddr, unsigned long size) -{ - __sme_early_enc_dec(paddr, size, true); -} - -void __init sme_early_decrypt(resource_size_t paddr, unsigned long size) -{ - __sme_early_enc_dec(paddr, size, false); -} - -static void __init __sme_early_map_unmap_mem(void *vaddr, unsigned long size, - bool map) -{ - unsigned long paddr = (unsigned long)vaddr - __PAGE_OFFSET; - pmdval_t pmd_flags, pmd; - - /* Use early_pmd_flags but remove the encryption mask */ - pmd_flags = __sme_clr(early_pmd_flags); - - do { - pmd = map ? (paddr & PMD_MASK) + pmd_flags : 0; - __early_make_pgtable((unsigned long)vaddr, pmd); - - vaddr += PMD_SIZE; - paddr += PMD_SIZE; - size = (size <= PMD_SIZE) ? 0 : size - PMD_SIZE; - } while (size); - - flush_tlb_local(); -} - -void __init sme_unmap_bootdata(char *real_mode_data) -{ - struct boot_params *boot_data; - unsigned long cmdline_paddr; - - if (!cc_platform_has(CC_ATTR_HOST_MEM_ENCRYPT)) - return; - - /* Get the command line address before unmapping the real_mode_data */ - boot_data = (struct boot_params *)real_mode_data; - cmdline_paddr = boot_data->hdr.cmd_line_ptr | ((u64)boot_data->ext_cmd_line_ptr << 32); - - __sme_early_map_unmap_mem(real_mode_data, sizeof(boot_params), false); - - if (!cmdline_paddr) - return; - - __sme_early_map_unmap_mem(__va(cmdline_paddr), COMMAND_LINE_SIZE, false); -} - -void __init sme_map_bootdata(char *real_mode_data) -{ - struct boot_params *boot_data; - unsigned long cmdline_paddr; - - if (!cc_platform_has(CC_ATTR_HOST_MEM_ENCRYPT)) - return; - - __sme_early_map_unmap_mem(real_mode_data, sizeof(boot_params), true); - - /* Get the command line address after mapping the real_mode_data */ - boot_data = (struct boot_params *)real_mode_data; - cmdline_paddr = boot_data->hdr.cmd_line_ptr | ((u64)boot_data->ext_cmd_line_ptr << 32); - - if (!cmdline_paddr) - return; - - __sme_early_map_unmap_mem(__va(cmdline_paddr), COMMAND_LINE_SIZE, true); -} - -void __init sme_early_init(void) -{ - unsigned int i; - - if (!sme_me_mask) - return; - - early_pmd_flags = __sme_set(early_pmd_flags); - - __supported_pte_mask = __sme_set(__supported_pte_mask); - - /* Update the protection map with memory encryption mask */ - for (i = 0; i < ARRAY_SIZE(protection_map); i++) - protection_map[i] = pgprot_encrypted(protection_map[i]); - - if (cc_platform_has(CC_ATTR_GUEST_MEM_ENCRYPT)) - swiotlb_force = SWIOTLB_FORCE; -} - -void __init sev_setup_arch(void) -{ - phys_addr_t total_mem = memblock_phys_mem_size(); - unsigned long size; - - if (!cc_platform_has(CC_ATTR_GUEST_MEM_ENCRYPT)) - return; - - /* - * For SEV, all DMA has to occur via shared/unencrypted pages. - * SEV uses SWIOTLB to make this happen without changing device - * drivers. However, depending on the workload being run, the - * default 64MB of SWIOTLB may not be enough and SWIOTLB may - * run out of buffers for DMA, resulting in I/O errors and/or - * performance degradation especially with high I/O workloads. - * - * Adjust the default size of SWIOTLB for SEV guests using - * a percentage of guest memory for SWIOTLB buffers. - * Also, as the SWIOTLB bounce buffer memory is allocated - * from low memory, ensure that the adjusted size is within - * the limits of low available memory. - * - * The percentage of guest memory used here for SWIOTLB buffers - * is more of an approximation of the static adjustment which - * 64MB for <1G, and ~128M to 256M for 1G-to-4G, i.e., the 6% - */ - size = total_mem * 6 / 100; - size = clamp_val(size, IO_TLB_DEFAULT_SIZE, SZ_1G); - swiotlb_adjust_size(size); -} - -static unsigned long pg_level_to_pfn(int level, pte_t *kpte, pgprot_t *ret_prot) -{ - unsigned long pfn = 0; - pgprot_t prot; - - switch (level) { - case PG_LEVEL_4K: - pfn = pte_pfn(*kpte); - prot = pte_pgprot(*kpte); - break; - case PG_LEVEL_2M: - pfn = pmd_pfn(*(pmd_t *)kpte); - prot = pmd_pgprot(*(pmd_t *)kpte); - break; - case PG_LEVEL_1G: - pfn = pud_pfn(*(pud_t *)kpte); - prot = pud_pgprot(*(pud_t *)kpte); - break; - default: - WARN_ONCE(1, "Invalid level for kpte\n"); - return 0; - } - - if (ret_prot) - *ret_prot = prot; - - return pfn; -} - -void notify_range_enc_status_changed(unsigned long vaddr, int npages, bool enc) -{ -#ifdef CONFIG_PARAVIRT - unsigned long sz = npages << PAGE_SHIFT; - unsigned long vaddr_end = vaddr + sz; - - while (vaddr < vaddr_end) { - int psize, pmask, level; - unsigned long pfn; - pte_t *kpte; - - kpte = lookup_address(vaddr, &level); - if (!kpte || pte_none(*kpte)) { - WARN_ONCE(1, "kpte lookup for vaddr\n"); - return; - } - - pfn = pg_level_to_pfn(level, kpte, NULL); - if (!pfn) - continue; - - psize = page_level_size(level); - pmask = page_level_mask(level); - - notify_page_enc_status_changed(pfn, psize >> PAGE_SHIFT, enc); - - vaddr = (vaddr & pmask) + psize; - } -#endif -} - -static void __init __set_clr_pte_enc(pte_t *kpte, int level, bool enc) -{ - pgprot_t old_prot, new_prot; - unsigned long pfn, pa, size; - pte_t new_pte; - - pfn = pg_level_to_pfn(level, kpte, &old_prot); - if (!pfn) - return; - - new_prot = old_prot; - if (enc) - pgprot_val(new_prot) |= _PAGE_ENC; - else - pgprot_val(new_prot) &= ~_PAGE_ENC; - - /* If prot is same then do nothing. */ - if (pgprot_val(old_prot) == pgprot_val(new_prot)) - return; - - pa = pfn << PAGE_SHIFT; - size = page_level_size(level); - - /* - * We are going to perform in-place en-/decryption and change the - * physical page attribute from C=1 to C=0 or vice versa. Flush the - * caches to ensure that data gets accessed with the correct C-bit. - */ - clflush_cache_range(__va(pa), size); - - /* Encrypt/decrypt the contents in-place */ - if (enc) - sme_early_encrypt(pa, size); - else - sme_early_decrypt(pa, size); - - /* Change the page encryption mask. */ - new_pte = pfn_pte(pfn, new_prot); - set_pte_atomic(kpte, new_pte); -} - -static int __init early_set_memory_enc_dec(unsigned long vaddr, - unsigned long size, bool enc) -{ - unsigned long vaddr_end, vaddr_next, start; - unsigned long psize, pmask; - int split_page_size_mask; - int level, ret; - pte_t *kpte; - - start = vaddr; - vaddr_next = vaddr; - vaddr_end = vaddr + size; - - for (; vaddr < vaddr_end; vaddr = vaddr_next) { - kpte = lookup_address(vaddr, &level); - if (!kpte || pte_none(*kpte)) { - ret = 1; - goto out; - } - - if (level == PG_LEVEL_4K) { - __set_clr_pte_enc(kpte, level, enc); - vaddr_next = (vaddr & PAGE_MASK) + PAGE_SIZE; - continue; - } - - psize = page_level_size(level); - pmask = page_level_mask(level); - - /* - * Check whether we can change the large page in one go. - * We request a split when the address is not aligned and - * the number of pages to set/clear encryption bit is smaller - * than the number of pages in the large page. - */ - if (vaddr == (vaddr & pmask) && - ((vaddr_end - vaddr) >= psize)) { - __set_clr_pte_enc(kpte, level, enc); - vaddr_next = (vaddr & pmask) + psize; - continue; - } - - /* - * The virtual address is part of a larger page, create the next - * level page table mapping (4K or 2M). If it is part of a 2M - * page then we request a split of the large page into 4K - * chunks. A 1GB large page is split into 2M pages, resp. - */ - if (level == PG_LEVEL_2M) - split_page_size_mask = 0; - else - split_page_size_mask = 1 << PG_LEVEL_2M; - - /* - * kernel_physical_mapping_change() does not flush the TLBs, so - * a TLB flush is required after we exit from the for loop. - */ - kernel_physical_mapping_change(__pa(vaddr & pmask), - __pa((vaddr_end & pmask) + psize), - split_page_size_mask); - } - - ret = 0; - - notify_range_enc_status_changed(start, PAGE_ALIGN(size) >> PAGE_SHIFT, enc); -out: - __flush_tlb_all(); - return ret; -} - -int __init early_set_memory_decrypted(unsigned long vaddr, unsigned long size) -{ - return early_set_memory_enc_dec(vaddr, size, false); -} - -int __init early_set_memory_encrypted(unsigned long vaddr, unsigned long size) -{ - return early_set_memory_enc_dec(vaddr, size, true); -} - -void __init early_set_mem_enc_dec_hypercall(unsigned long vaddr, int npages, bool enc) -{ - notify_range_enc_status_changed(vaddr, npages, enc); -} /* Override for DMA direct allocation check - ARCH_HAS_FORCE_DMA_UNENCRYPTED */ bool force_dma_unencrypted(struct device *dev) @@ -441,30 +40,6 @@ bool force_dma_unencrypted(struct device *dev) return false; } -void __init mem_encrypt_free_decrypted_mem(void) -{ - unsigned long vaddr, vaddr_end, npages; - int r; - - vaddr = (unsigned long)__start_bss_decrypted_unused; - vaddr_end = (unsigned long)__end_bss_decrypted; - npages = (vaddr_end - vaddr) >> PAGE_SHIFT; - - /* - * The unused memory range was mapped decrypted, change the encryption - * attribute from decrypted to encrypted before freeing it. - */ - if (cc_platform_has(CC_ATTR_MEM_ENCRYPT)) { - r = set_memory_encrypted(vaddr, npages); - if (r) { - pr_warn("failed to free unused decrypted pages\n"); - return; - } - } - - free_init_pages("unused decrypted", vaddr, vaddr_end); -} - static void print_mem_encrypt_feature_info(void) { pr_info("AMD Memory Encryption Features active:"); @@ -493,20 +68,12 @@ static void print_mem_encrypt_feature_info(void) /* Architecture __weak replacement functions */ void __init mem_encrypt_init(void) { - if (!sme_me_mask) + if (!cc_platform_has(CC_ATTR_MEM_ENCRYPT)) return; /* Call into SWIOTLB to update the SWIOTLB DMA buffers */ swiotlb_update_mem_attributes(); - /* - * With SEV, we need to unroll the rep string I/O instructions, - * but SEV-ES supports them through the #VC handler. - */ - if (cc_platform_has(CC_ATTR_GUEST_MEM_ENCRYPT) && - !cc_platform_has(CC_ATTR_GUEST_STATE_ENCRYPT)) - static_branch_enable(&sev_enable_key); - print_mem_encrypt_feature_info(); } diff --git a/arch/x86/mm/mem_encrypt_amd.c b/arch/x86/mm/mem_encrypt_amd.c new file mode 100644 index 000000000000..2b2d018ea345 --- /dev/null +++ b/arch/x86/mm/mem_encrypt_amd.c @@ -0,0 +1,438 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * AMD Memory Encryption Support + * + * Copyright (C) 2016 Advanced Micro Devices, Inc. + * + * Author: Tom Lendacky <thomas.lendacky@amd.com> + */ + +#define DISABLE_BRANCH_PROFILING + +#include <linux/linkage.h> +#include <linux/init.h> +#include <linux/mm.h> +#include <linux/dma-direct.h> +#include <linux/swiotlb.h> +#include <linux/mem_encrypt.h> +#include <linux/device.h> +#include <linux/kernel.h> +#include <linux/bitops.h> +#include <linux/dma-mapping.h> +#include <linux/virtio_config.h> +#include <linux/cc_platform.h> + +#include <asm/tlbflush.h> +#include <asm/fixmap.h> +#include <asm/setup.h> +#include <asm/bootparam.h> +#include <asm/set_memory.h> +#include <asm/cacheflush.h> +#include <asm/processor-flags.h> +#include <asm/msr.h> +#include <asm/cmdline.h> + +#include "mm_internal.h" + +/* + * Since SME related variables are set early in the boot process they must + * reside in the .data section so as not to be zeroed out when the .bss + * section is later cleared. + */ +u64 sme_me_mask __section(".data") = 0; +u64 sev_status __section(".data") = 0; +u64 sev_check_data __section(".data") = 0; +EXPORT_SYMBOL(sme_me_mask); + +/* Buffer used for early in-place encryption by BSP, no locking needed */ +static char sme_early_buffer[PAGE_SIZE] __initdata __aligned(PAGE_SIZE); + +/* + * This routine does not change the underlying encryption setting of the + * page(s) that map this memory. It assumes that eventually the memory is + * meant to be accessed as either encrypted or decrypted but the contents + * are currently not in the desired state. + * + * This routine follows the steps outlined in the AMD64 Architecture + * Programmer's Manual Volume 2, Section 7.10.8 Encrypt-in-Place. + */ +static void __init __sme_early_enc_dec(resource_size_t paddr, + unsigned long size, bool enc) +{ + void *src, *dst; + size_t len; + + if (!sme_me_mask) + return; + + wbinvd(); + + /* + * There are limited number of early mapping slots, so map (at most) + * one page at time. + */ + while (size) { + len = min_t(size_t, sizeof(sme_early_buffer), size); + + /* + * Create mappings for the current and desired format of + * the memory. Use a write-protected mapping for the source. + */ + src = enc ? early_memremap_decrypted_wp(paddr, len) : + early_memremap_encrypted_wp(paddr, len); + + dst = enc ? early_memremap_encrypted(paddr, len) : + early_memremap_decrypted(paddr, len); + + /* + * If a mapping can't be obtained to perform the operation, + * then eventual access of that area in the desired mode + * will cause a crash. + */ + BUG_ON(!src || !dst); + + /* + * Use a temporary buffer, of cache-line multiple size, to + * avoid data corruption as documented in the APM. + */ + memcpy(sme_early_buffer, src, len); + memcpy(dst, sme_early_buffer, len); + + early_memunmap(dst, len); + early_memunmap(src, len); + + paddr += len; + size -= len; + } +} + +void __init sme_early_encrypt(resource_size_t paddr, unsigned long size) +{ + __sme_early_enc_dec(paddr, size, true); +} + +void __init sme_early_decrypt(resource_size_t paddr, unsigned long size) +{ + __sme_early_enc_dec(paddr, size, false); +} + +static void __init __sme_early_map_unmap_mem(void *vaddr, unsigned long size, + bool map) +{ + unsigned long paddr = (unsigned long)vaddr - __PAGE_OFFSET; + pmdval_t pmd_flags, pmd; + + /* Use early_pmd_flags but remove the encryption mask */ + pmd_flags = __sme_clr(early_pmd_flags); + + do { + pmd = map ? (paddr & PMD_MASK) + pmd_flags : 0; + __early_make_pgtable((unsigned long)vaddr, pmd); + + vaddr += PMD_SIZE; + paddr += PMD_SIZE; + size = (size <= PMD_SIZE) ? 0 : size - PMD_SIZE; + } while (size); + + flush_tlb_local(); +} + +void __init sme_unmap_bootdata(char *real_mode_data) +{ + struct boot_params *boot_data; + unsigned long cmdline_paddr; + + if (!cc_platform_has(CC_ATTR_HOST_MEM_ENCRYPT)) + return; + + /* Get the command line address before unmapping the real_mode_data */ + boot_data = (struct boot_params *)real_mode_data; + cmdline_paddr = boot_data->hdr.cmd_line_ptr | ((u64)boot_data->ext_cmd_line_ptr << 32); + + __sme_early_map_unmap_mem(real_mode_data, sizeof(boot_params), false); + + if (!cmdline_paddr) + return; + + __sme_early_map_unmap_mem(__va(cmdline_paddr), COMMAND_LINE_SIZE, false); +} + +void __init sme_map_bootdata(char *real_mode_data) +{ + struct boot_params *boot_data; + unsigned long cmdline_paddr; + + if (!cc_platform_has(CC_ATTR_HOST_MEM_ENCRYPT)) + return; + + __sme_early_map_unmap_mem(real_mode_data, sizeof(boot_params), true); + + /* Get the command line address after mapping the real_mode_data */ + boot_data = (struct boot_params *)real_mode_data; + cmdline_paddr = boot_data->hdr.cmd_line_ptr | ((u64)boot_data->ext_cmd_line_ptr << 32); + + if (!cmdline_paddr) + return; + + __sme_early_map_unmap_mem(__va(cmdline_paddr), COMMAND_LINE_SIZE, true); +} + +void __init sme_early_init(void) +{ + unsigned int i; + + if (!sme_me_mask) + return; + + early_pmd_flags = __sme_set(early_pmd_flags); + + __supported_pte_mask = __sme_set(__supported_pte_mask); + + /* Update the protection map with memory encryption mask */ + for (i = 0; i < ARRAY_SIZE(protection_map); i++) + protection_map[i] = pgprot_encrypted(protection_map[i]); + + if (cc_platform_has(CC_ATTR_GUEST_MEM_ENCRYPT)) + swiotlb_force = SWIOTLB_FORCE; +} + +void __init sev_setup_arch(void) +{ + phys_addr_t total_mem = memblock_phys_mem_size(); + unsigned long size; + + if (!cc_platform_has(CC_ATTR_GUEST_MEM_ENCRYPT)) + return; + + /* + * For SEV, all DMA has to occur via shared/unencrypted pages. + * SEV uses SWIOTLB to make this happen without changing device + * drivers. However, depending on the workload being run, the + * default 64MB of SWIOTLB may not be enough and SWIOTLB may + * run out of buffers for DMA, resulting in I/O errors and/or + * performance degradation especially with high I/O workloads. + * + * Adjust the default size of SWIOTLB for SEV guests using + * a percentage of guest memory for SWIOTLB buffers. + * Also, as the SWIOTLB bounce buffer memory is allocated + * from low memory, ensure that the adjusted size is within + * the limits of low available memory. + * + * The percentage of guest memory used here for SWIOTLB buffers + * is more of an approximation of the static adjustment which + * 64MB for <1G, and ~128M to 256M for 1G-to-4G, i.e., the 6% + */ + size = total_mem * 6 / 100; + size = clamp_val(size, IO_TLB_DEFAULT_SIZE, SZ_1G); + swiotlb_adjust_size(size); +} + +static unsigned long pg_level_to_pfn(int level, pte_t *kpte, pgprot_t *ret_prot) +{ + unsigned long pfn = 0; + pgprot_t prot; + + switch (level) { + case PG_LEVEL_4K: + pfn = pte_pfn(*kpte); + prot = pte_pgprot(*kpte); + break; + case PG_LEVEL_2M: + pfn = pmd_pfn(*(pmd_t *)kpte); + prot = pmd_pgprot(*(pmd_t *)kpte); + break; + case PG_LEVEL_1G: + pfn = pud_pfn(*(pud_t *)kpte); + prot = pud_pgprot(*(pud_t *)kpte); + break; + default: + WARN_ONCE(1, "Invalid level for kpte\n"); + return 0; + } + + if (ret_prot) + *ret_prot = prot; + + return pfn; +} + +void notify_range_enc_status_changed(unsigned long vaddr, int npages, bool enc) +{ +#ifdef CONFIG_PARAVIRT + unsigned long sz = npages << PAGE_SHIFT; + unsigned long vaddr_end = vaddr + sz; + + while (vaddr < vaddr_end) { + int psize, pmask, level; + unsigned long pfn; + pte_t *kpte; + + kpte = lookup_address(vaddr, &level); + if (!kpte || pte_none(*kpte)) { + WARN_ONCE(1, "kpte lookup for vaddr\n"); + return; + } + + pfn = pg_level_to_pfn(level, kpte, NULL); + if (!pfn) + continue; + + psize = page_level_size(level); + pmask = page_level_mask(level); + + notify_page_enc_status_changed(pfn, psize >> PAGE_SHIFT, enc); + + vaddr = (vaddr & pmask) + psize; + } +#endif +} + +static void __init __set_clr_pte_enc(pte_t *kpte, int level, bool enc) +{ + pgprot_t old_prot, new_prot; + unsigned long pfn, pa, size; + pte_t new_pte; + + pfn = pg_level_to_pfn(level, kpte, &old_prot); + if (!pfn) + return; + + new_prot = old_prot; + if (enc) + pgprot_val(new_prot) |= _PAGE_ENC; + else + pgprot_val(new_prot) &= ~_PAGE_ENC; + + /* If prot is same then do nothing. */ + if (pgprot_val(old_prot) == pgprot_val(new_prot)) + return; + + pa = pfn << PAGE_SHIFT; + size = page_level_size(level); + + /* + * We are going to perform in-place en-/decryption and change the + * physical page attribute from C=1 to C=0 or vice versa. Flush the + * caches to ensure that data gets accessed with the correct C-bit. + */ + clflush_cache_range(__va(pa), size); + + /* Encrypt/decrypt the contents in-place */ + if (enc) + sme_early_encrypt(pa, size); + else + sme_early_decrypt(pa, size); + + /* Change the page encryption mask. */ + new_pte = pfn_pte(pfn, new_prot); + set_pte_atomic(kpte, new_pte); +} + +static int __init early_set_memory_enc_dec(unsigned long vaddr, + unsigned long size, bool enc) +{ + unsigned long vaddr_end, vaddr_next, start; + unsigned long psize, pmask; + int split_page_size_mask; + int level, ret; + pte_t *kpte; + + start = vaddr; + vaddr_next = vaddr; + vaddr_end = vaddr + size; + + for (; vaddr < vaddr_end; vaddr = vaddr_next) { + kpte = lookup_address(vaddr, &level); + if (!kpte || pte_none(*kpte)) { + ret = 1; + goto out; + } + + if (level == PG_LEVEL_4K) { + __set_clr_pte_enc(kpte, level, enc); + vaddr_next = (vaddr & PAGE_MASK) + PAGE_SIZE; + continue; + } + + psize = page_level_size(level); + pmask = page_level_mask(level); + + /* + * Check whether we can change the large page in one go. + * We request a split when the address is not aligned and + * the number of pages to set/clear encryption bit is smaller + * than the number of pages in the large page. + */ + if (vaddr == (vaddr & pmask) && + ((vaddr_end - vaddr) >= psize)) { + __set_clr_pte_enc(kpte, level, enc); + vaddr_next = (vaddr & pmask) + psize; + continue; + } + + /* + * The virtual address is part of a larger page, create the next + * level page table mapping (4K or 2M). If it is part of a 2M + * page then we request a split of the large page into 4K + * chunks. A 1GB large page is split into 2M pages, resp. + */ + if (level == PG_LEVEL_2M) + split_page_size_mask = 0; + else + split_page_size_mask = 1 << PG_LEVEL_2M; + + /* + * kernel_physical_mapping_change() does not flush the TLBs, so + * a TLB flush is required after we exit from the for loop. + */ + kernel_physical_mapping_change(__pa(vaddr & pmask), + __pa((vaddr_end & pmask) + psize), + split_page_size_mask); + } + + ret = 0; + + notify_range_enc_status_changed(start, PAGE_ALIGN(size) >> PAGE_SHIFT, enc); +out: + __flush_tlb_all(); + return ret; +} + +int __init early_set_memory_decrypted(unsigned long vaddr, unsigned long size) +{ + return early_set_memory_enc_dec(vaddr, size, false); +} + +int __init early_set_memory_encrypted(unsigned long vaddr, unsigned long size) +{ + return early_set_memory_enc_dec(vaddr, size, true); +} + +void __init early_set_mem_enc_dec_hypercall(unsigned long vaddr, int npages, bool enc) +{ + notify_range_enc_status_changed(vaddr, npages, enc); +} + +void __init mem_encrypt_free_decrypted_mem(void) +{ + unsigned long vaddr, vaddr_end, npages; + int r; + + vaddr = (unsigned long)__start_bss_decrypted_unused; + vaddr_end = (unsigned long)__end_bss_decrypted; + npages = (vaddr_end - vaddr) >> PAGE_SHIFT; + + /* + * The unused memory range was mapped decrypted, change the encryption + * attribute from decrypted to encrypted before freeing it. + */ + if (cc_platform_has(CC_ATTR_MEM_ENCRYPT)) { + r = set_memory_encrypted(vaddr, npages); + if (r) { + pr_warn("failed to free unused decrypted pages\n"); + return; + } + } + + free_init_pages("unused decrypted", vaddr, vaddr_end); +} diff --git a/arch/x86/mm/tlb.c b/arch/x86/mm/tlb.c index 59ba2968af1b..1e6513f95133 100644 --- a/arch/x86/mm/tlb.c +++ b/arch/x86/mm/tlb.c @@ -1148,7 +1148,7 @@ void flush_tlb_one_user(unsigned long addr) */ STATIC_NOPV void native_flush_tlb_global(void) { - unsigned long cr4, flags; + unsigned long flags; if (static_cpu_has(X86_FEATURE_INVPCID)) { /* @@ -1168,11 +1168,7 @@ STATIC_NOPV void native_flush_tlb_global(void) */ raw_local_irq_save(flags); - cr4 = this_cpu_read(cpu_tlbstate.cr4); - /* toggle PGE */ - native_write_cr4(cr4 ^ X86_CR4_PGE); - /* write old PGE again and flush TLBs */ - native_write_cr4(cr4); + __native_tlb_flush_global(this_cpu_read(cpu_tlbstate.cr4)); raw_local_irq_restore(flags); } diff --git a/arch/x86/net/bpf_jit_comp.c b/arch/x86/net/bpf_jit_comp.c index 726700fabca6..bafe36e69227 100644 --- a/arch/x86/net/bpf_jit_comp.c +++ b/arch/x86/net/bpf_jit_comp.c @@ -1252,19 +1252,54 @@ st: if (is_imm8(insn->off)) case BPF_LDX | BPF_MEM | BPF_DW: case BPF_LDX | BPF_PROBE_MEM | BPF_DW: if (BPF_MODE(insn->code) == BPF_PROBE_MEM) { - /* test src_reg, src_reg */ - maybe_emit_mod(&prog, src_reg, src_reg, true); /* always 1 byte */ - EMIT2(0x85, add_2reg(0xC0, src_reg, src_reg)); - /* jne start_of_ldx */ - EMIT2(X86_JNE, 0); + /* Though the verifier prevents negative insn->off in BPF_PROBE_MEM + * add abs(insn->off) to the limit to make sure that negative + * offset won't be an issue. + * insn->off is s16, so it won't affect valid pointers. + */ + u64 limit = TASK_SIZE_MAX + PAGE_SIZE + abs(insn->off); + u8 *end_of_jmp1, *end_of_jmp2; + + /* Conservatively check that src_reg + insn->off is a kernel address: + * 1. src_reg + insn->off >= limit + * 2. src_reg + insn->off doesn't become small positive. + * Cannot do src_reg + insn->off >= limit in one branch, + * since it needs two spare registers, but JIT has only one. + */ + + /* movabsq r11, limit */ + EMIT2(add_1mod(0x48, AUX_REG), add_1reg(0xB8, AUX_REG)); + EMIT((u32)limit, 4); + EMIT(limit >> 32, 4); + /* cmp src_reg, r11 */ + maybe_emit_mod(&prog, src_reg, AUX_REG, true); + EMIT2(0x39, add_2reg(0xC0, src_reg, AUX_REG)); + /* if unsigned '<' goto end_of_jmp2 */ + EMIT2(X86_JB, 0); + end_of_jmp1 = prog; + + /* mov r11, src_reg */ + emit_mov_reg(&prog, true, AUX_REG, src_reg); + /* add r11, insn->off */ + maybe_emit_1mod(&prog, AUX_REG, true); + EMIT2_off32(0x81, add_1reg(0xC0, AUX_REG), insn->off); + /* jmp if not carry to start_of_ldx + * Otherwise ERR_PTR(-EINVAL) + 128 will be the user addr + * that has to be rejected. + */ + EMIT2(0x73 /* JNC */, 0); + end_of_jmp2 = prog; + /* xor dst_reg, dst_reg */ emit_mov_imm32(&prog, false, dst_reg, 0); /* jmp byte_after_ldx */ EMIT2(0xEB, 0); - /* populate jmp_offset for JNE above */ - temp[4] = prog - temp - 5 /* sizeof(test + jne) */; + /* populate jmp_offset for JB above to jump to xor dst_reg */ + end_of_jmp1[-1] = end_of_jmp2 - end_of_jmp1; + /* populate jmp_offset for JNC above to jump to start_of_ldx */ start_of_ldx = prog; + end_of_jmp2[-1] = start_of_ldx - end_of_jmp2; } emit_ldx(&prog, BPF_SIZE(insn->code), dst_reg, src_reg, insn->off); if (BPF_MODE(insn->code) == BPF_PROBE_MEM) { @@ -1305,7 +1340,7 @@ st: if (is_imm8(insn->off)) * End result: x86 insn "mov rbx, qword ptr [rax+0x14]" * of 4 bytes will be ignored and rbx will be zero inited. */ - ex->fixup = (prog - temp) | (reg2pt_regs[dst_reg] << 8); + ex->fixup = (prog - start_of_ldx) | (reg2pt_regs[dst_reg] << 8); } break; diff --git a/arch/x86/platform/ce4100/falconfalls.dts b/arch/x86/platform/ce4100/falconfalls.dts index 0ac3d4357136..65fa3d866226 100644 --- a/arch/x86/platform/ce4100/falconfalls.dts +++ b/arch/x86/platform/ce4100/falconfalls.dts @@ -249,7 +249,7 @@ gpio@26 { #gpio-cells = <2>; - compatible = "ti,pcf8575"; + compatible = "nxp,pcf8575"; reg = <0x26>; gpio-controller; }; @@ -263,7 +263,7 @@ gpio@26 { #gpio-cells = <2>; - compatible = "ti,pcf8575"; + compatible = "nxp,pcf8575"; reg = <0x26>; gpio-controller; }; diff --git a/arch/x86/platform/efi/quirks.c b/arch/x86/platform/efi/quirks.c index b15ebfe40a73..b0b848d6933a 100644 --- a/arch/x86/platform/efi/quirks.c +++ b/arch/x86/platform/efi/quirks.c @@ -277,7 +277,8 @@ void __init efi_arch_mem_reserve(phys_addr_t addr, u64 size) return; } - new = early_memremap(data.phys_map, data.size); + new = early_memremap_prot(data.phys_map, data.size, + pgprot_val(pgprot_encrypted(FIXMAP_PAGE_NORMAL))); if (!new) { pr_err("Failed to map new boot services memmap\n"); return; diff --git a/arch/x86/realmode/init.c b/arch/x86/realmode/init.c index 38d24d2ab38b..c5e29db02a46 100644 --- a/arch/x86/realmode/init.c +++ b/arch/x86/realmode/init.c @@ -17,6 +17,32 @@ u32 *trampoline_cr4_features; /* Hold the pgd entry used on booting additional CPUs */ pgd_t trampoline_pgd_entry; +void load_trampoline_pgtable(void) +{ +#ifdef CONFIG_X86_32 + load_cr3(initial_page_table); +#else + /* + * This function is called before exiting to real-mode and that will + * fail with CR4.PCIDE still set. + */ + if (boot_cpu_has(X86_FEATURE_PCID)) + cr4_clear_bits(X86_CR4_PCIDE); + + write_cr3(real_mode_header->trampoline_pgd); +#endif + + /* + * The CR3 write above will not flush global TLB entries. + * Stale, global entries from previous page tables may still be + * present. Flush those stale entries. + * + * This ensures that memory accessed while running with + * trampoline_pgd is *actually* mapped into trampoline_pgd. + */ + __flush_tlb_all(); +} + void __init reserve_real_mode(void) { phys_addr_t mem; diff --git a/arch/x86/tools/relocs.c b/arch/x86/tools/relocs.c index c736cf2ac76b..e2c5b296120d 100644 --- a/arch/x86/tools/relocs.c +++ b/arch/x86/tools/relocs.c @@ -68,7 +68,7 @@ static const char * const sym_regex_kernel[S_NSYMTYPES] = { "(__parainstructions|__alt_instructions)(_end)?|" "(__iommu_table|__apicdrivers|__smp_locks)(_end)?|" "__(start|end)_pci_.*|" -#if CONFIG_FW_LOADER_BUILTIN +#if CONFIG_FW_LOADER "__(start|end)_builtin_fw|" #endif "__(start|stop)___ksymtab(_gpl)?|" |