aboutsummaryrefslogtreecommitdiffstats
path: root/arch/x86
diff options
context:
space:
mode:
Diffstat (limited to 'arch/x86')
-rw-r--r--arch/x86/Kconfig13
-rw-r--r--arch/x86/boot/compressed/sev.c6
-rw-r--r--arch/x86/events/core.c10
-rw-r--r--arch/x86/events/perf_event.h2
-rw-r--r--arch/x86/include/asm/efi.h2
-rw-r--r--arch/x86/include/asm/insn-eval.h13
-rw-r--r--arch/x86/include/asm/io.h20
-rw-r--r--arch/x86/include/asm/irqflags.h7
-rw-r--r--arch/x86/include/asm/kvm-x86-ops.h1
-rw-r--r--arch/x86/include/asm/kvm_host.h3
-rw-r--r--arch/x86/include/asm/mtrr.h8
-rw-r--r--arch/x86/include/asm/page_64.h1
-rw-r--r--arch/x86/include/asm/paravirt.h5
-rw-r--r--arch/x86/include/asm/pgtable.h1
-rw-r--r--arch/x86/include/asm/pkru.h4
-rw-r--r--arch/x86/include/asm/processor.h8
-rw-r--r--arch/x86/include/asm/realmode.h1
-rw-r--r--arch/x86/include/asm/set_memory.h4
-rw-r--r--arch/x86/include/asm/sev-common.h55
-rw-r--r--arch/x86/include/asm/tlbflush.h5
-rw-r--r--arch/x86/kernel/Makefile2
-rw-r--r--arch/x86/kernel/cc_platform.c8
-rw-r--r--arch/x86/kernel/cpu/common.c2
-rw-r--r--arch/x86/kernel/cpu/resctrl/monitor.c2
-rw-r--r--arch/x86/kernel/cpu/sgx/main.c162
-rw-r--r--arch/x86/kernel/cpu/sgx/sgx.h8
-rw-r--r--arch/x86/kernel/fpu/core.c21
-rw-r--r--arch/x86/kernel/head64.c74
-rw-r--r--arch/x86/kernel/head_64.S19
-rw-r--r--arch/x86/kernel/reboot.c12
-rw-r--r--arch/x86/kernel/setup.c79
-rw-r--r--arch/x86/kernel/sev-shared.c2
-rw-r--r--arch/x86/kernel/sev.c183
-rw-r--r--arch/x86/kernel/smpboot.c14
-rw-r--r--arch/x86/kvm/debugfs.c3
-rw-r--r--arch/x86/kvm/hyperv.c7
-rw-r--r--arch/x86/kvm/mmu/mmu.c16
-rw-r--r--arch/x86/kvm/mmu/spte.c1
-rw-r--r--arch/x86/kvm/mmu/tdp_iter.c6
-rw-r--r--arch/x86/kvm/mmu/tdp_iter.h6
-rw-r--r--arch/x86/kvm/mmu/tdp_mmu.c29
-rw-r--r--arch/x86/kvm/svm/sev.c2
-rw-r--r--arch/x86/kvm/svm/svm.c21
-rw-r--r--arch/x86/kvm/vmx/vmx.c67
-rw-r--r--arch/x86/kvm/x86.c25
-rw-r--r--arch/x86/lib/insn-eval.c109
-rw-r--r--arch/x86/mm/Makefile7
-rw-r--r--arch/x86/mm/init.c5
-rw-r--r--arch/x86/mm/mem_encrypt.c441
-rw-r--r--arch/x86/mm/mem_encrypt_amd.c438
-rw-r--r--arch/x86/mm/tlb.c8
-rw-r--r--arch/x86/net/bpf_jit_comp.c51
-rw-r--r--arch/x86/platform/ce4100/falconfalls.dts4
-rw-r--r--arch/x86/platform/efi/quirks.c3
-rw-r--r--arch/x86/realmode/init.c26
-rw-r--r--arch/x86/tools/relocs.c2
56 files changed, 1207 insertions, 827 deletions
diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index 7399327d1eff..8910b09b5601 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -269,6 +269,7 @@ config X86
select HAVE_ARCH_KCSAN if X86_64
select X86_FEATURE_NAMES if PROC_FS
select PROC_PID_ARCH_STATUS if PROC_FS
+ select HAVE_ARCH_NODE_DEV_GROUP if X86_SGX
imply IMA_SECURE_AND_OR_TRUSTED_BOOT if EFI
config INSTRUCTION_DECODER
@@ -1523,16 +1524,20 @@ config X86_CPA_STATISTICS
helps to determine the effectiveness of preserving large and huge
page mappings when mapping protections are changed.
+config X86_MEM_ENCRYPT
+ select ARCH_HAS_FORCE_DMA_UNENCRYPTED
+ select DYNAMIC_PHYSICAL_MASK
+ select ARCH_HAS_RESTRICTED_VIRTIO_MEMORY_ACCESS
+ def_bool n
+
config AMD_MEM_ENCRYPT
bool "AMD Secure Memory Encryption (SME) support"
depends on X86_64 && CPU_SUP_AMD
select DMA_COHERENT_POOL
- select DYNAMIC_PHYSICAL_MASK
select ARCH_USE_MEMREMAP_PROT
- select ARCH_HAS_FORCE_DMA_UNENCRYPTED
select INSTRUCTION_DECODER
- select ARCH_HAS_RESTRICTED_VIRTIO_MEMORY_ACCESS
select ARCH_HAS_CC_PLATFORM
+ select X86_MEM_ENCRYPT
help
Say yes to enable support for the encryption of system memory.
This requires an AMD processor that supports Secure Memory
@@ -1917,6 +1922,7 @@ config X86_SGX
select SRCU
select MMU_NOTIFIER
select NUMA_KEEP_MEMINFO if NUMA
+ select XARRAY_MULTI
help
Intel(R) Software Guard eXtensions (SGX) is a set of CPU instructions
that can be used by applications to set aside private regions of code
@@ -1932,6 +1938,7 @@ config EFI
depends on ACPI
select UCS2_STRING
select EFI_RUNTIME_WRAPPERS
+ select ARCH_USE_MEMREMAP_PROT
help
This enables the kernel to use EFI runtime services that are
available (such as the EFI variable services).
diff --git a/arch/x86/boot/compressed/sev.c b/arch/x86/boot/compressed/sev.c
index 670e998fe930..28bcf04c022e 100644
--- a/arch/x86/boot/compressed/sev.c
+++ b/arch/x86/boot/compressed/sev.c
@@ -122,7 +122,7 @@ static enum es_result vc_read_mem(struct es_em_ctxt *ctxt,
static bool early_setup_sev_es(void)
{
if (!sev_es_negotiate_protocol())
- sev_es_terminate(GHCB_SEV_ES_REASON_PROTOCOL_UNSUPPORTED);
+ sev_es_terminate(GHCB_SEV_ES_PROT_UNSUPPORTED);
if (set_page_decrypted((unsigned long)&boot_ghcb_page))
return false;
@@ -175,7 +175,7 @@ void do_boot_stage2_vc(struct pt_regs *regs, unsigned long exit_code)
enum es_result result;
if (!boot_ghcb && !early_setup_sev_es())
- sev_es_terminate(GHCB_SEV_ES_REASON_GENERAL_REQUEST);
+ sev_es_terminate(GHCB_SEV_ES_GEN_REQ);
vc_ghcb_invalidate(boot_ghcb);
result = vc_init_em_ctxt(&ctxt, regs, exit_code);
@@ -202,5 +202,5 @@ finish:
if (result == ES_OK)
vc_finish_insn(&ctxt);
else if (result != ES_RETRY)
- sev_es_terminate(GHCB_SEV_ES_REASON_GENERAL_REQUEST);
+ sev_es_terminate(GHCB_SEV_ES_GEN_REQ);
}
diff --git a/arch/x86/events/core.c b/arch/x86/events/core.c
index 38b2c779146f..68dea7ce6a22 100644
--- a/arch/x86/events/core.c
+++ b/arch/x86/events/core.c
@@ -2476,7 +2476,7 @@ static int x86_pmu_event_init(struct perf_event *event)
if (READ_ONCE(x86_pmu.attr_rdpmc) &&
!(event->hw.flags & PERF_X86_EVENT_LARGE_PEBS))
- event->hw.flags |= PERF_X86_EVENT_RDPMC_ALLOWED;
+ event->hw.flags |= PERF_EVENT_FLAG_USER_READ_CNT;
return err;
}
@@ -2510,7 +2510,7 @@ void perf_clear_dirty_counters(void)
static void x86_pmu_event_mapped(struct perf_event *event, struct mm_struct *mm)
{
- if (!(event->hw.flags & PERF_X86_EVENT_RDPMC_ALLOWED))
+ if (!(event->hw.flags & PERF_EVENT_FLAG_USER_READ_CNT))
return;
/*
@@ -2531,7 +2531,7 @@ static void x86_pmu_event_mapped(struct perf_event *event, struct mm_struct *mm)
static void x86_pmu_event_unmapped(struct perf_event *event, struct mm_struct *mm)
{
- if (!(event->hw.flags & PERF_X86_EVENT_RDPMC_ALLOWED))
+ if (!(event->hw.flags & PERF_EVENT_FLAG_USER_READ_CNT))
return;
if (atomic_dec_and_test(&mm->context.perf_rdpmc_allowed))
@@ -2542,7 +2542,7 @@ static int x86_pmu_event_idx(struct perf_event *event)
{
struct hw_perf_event *hwc = &event->hw;
- if (!(hwc->flags & PERF_X86_EVENT_RDPMC_ALLOWED))
+ if (!(hwc->flags & PERF_EVENT_FLAG_USER_READ_CNT))
return 0;
if (is_metric_idx(hwc->idx))
@@ -2725,7 +2725,7 @@ void arch_perf_update_userpage(struct perf_event *event,
userpg->cap_user_time = 0;
userpg->cap_user_time_zero = 0;
userpg->cap_user_rdpmc =
- !!(event->hw.flags & PERF_X86_EVENT_RDPMC_ALLOWED);
+ !!(event->hw.flags & PERF_EVENT_FLAG_USER_READ_CNT);
userpg->pmc_width = x86_pmu.cntval_bits;
if (!using_native_sched_clock() || !sched_clock_stable())
diff --git a/arch/x86/events/perf_event.h b/arch/x86/events/perf_event.h
index 5480db242083..9d376e528dfc 100644
--- a/arch/x86/events/perf_event.h
+++ b/arch/x86/events/perf_event.h
@@ -74,7 +74,7 @@ static inline bool constraint_match(struct event_constraint *c, u64 ecode)
#define PERF_X86_EVENT_PEBS_NA_HSW 0x0010 /* haswell style datala, unknown */
#define PERF_X86_EVENT_EXCL 0x0020 /* HT exclusivity on counter */
#define PERF_X86_EVENT_DYNAMIC 0x0040 /* dynamic alloc'd constraint */
-#define PERF_X86_EVENT_RDPMC_ALLOWED 0x0080 /* grant rdpmc permission */
+
#define PERF_X86_EVENT_EXCL_ACCT 0x0100 /* accounted EXCL event */
#define PERF_X86_EVENT_AUTO_RELOAD 0x0200 /* use PEBS auto-reload */
#define PERF_X86_EVENT_LARGE_PEBS 0x0400 /* use large PEBS */
diff --git a/arch/x86/include/asm/efi.h b/arch/x86/include/asm/efi.h
index 4d0b126835b8..63158fd55856 100644
--- a/arch/x86/include/asm/efi.h
+++ b/arch/x86/include/asm/efi.h
@@ -197,8 +197,6 @@ static inline bool efi_runtime_supported(void)
extern void parse_efi_setup(u64 phys_addr, u32 data_len);
-extern void efifb_setup_from_dmi(struct screen_info *si, const char *opt);
-
extern void efi_thunk_runtime_setup(void);
efi_status_t efi_set_virtual_address_map(unsigned long memory_map_size,
unsigned long descriptor_size,
diff --git a/arch/x86/include/asm/insn-eval.h b/arch/x86/include/asm/insn-eval.h
index 4ec3613551e3..43785ee363f1 100644
--- a/arch/x86/include/asm/insn-eval.h
+++ b/arch/x86/include/asm/insn-eval.h
@@ -19,6 +19,7 @@ bool insn_has_rep_prefix(struct insn *insn);
void __user *insn_get_addr_ref(struct insn *insn, struct pt_regs *regs);
int insn_get_modrm_rm_off(struct insn *insn, struct pt_regs *regs);
int insn_get_modrm_reg_off(struct insn *insn, struct pt_regs *regs);
+unsigned long *insn_get_modrm_reg_ptr(struct insn *insn, struct pt_regs *regs);
unsigned long insn_get_seg_base(struct pt_regs *regs, int seg_reg_idx);
int insn_get_code_seg_params(struct pt_regs *regs);
int insn_get_effective_ip(struct pt_regs *regs, unsigned long *ip);
@@ -29,4 +30,16 @@ int insn_fetch_from_user_inatomic(struct pt_regs *regs,
bool insn_decode_from_regs(struct insn *insn, struct pt_regs *regs,
unsigned char buf[MAX_INSN_SIZE], int buf_size);
+enum mmio_type {
+ MMIO_DECODE_FAILED,
+ MMIO_WRITE,
+ MMIO_WRITE_IMM,
+ MMIO_READ,
+ MMIO_READ_ZERO_EXTEND,
+ MMIO_READ_SIGN_EXTEND,
+ MMIO_MOVS,
+};
+
+enum mmio_type insn_decode_mmio(struct insn *insn, int *bytes);
+
#endif /* _ASM_X86_INSN_EVAL_H */
diff --git a/arch/x86/include/asm/io.h b/arch/x86/include/asm/io.h
index 5c6a4af0b911..f6d91ecb8026 100644
--- a/arch/x86/include/asm/io.h
+++ b/arch/x86/include/asm/io.h
@@ -40,6 +40,7 @@
#include <linux/string.h>
#include <linux/compiler.h>
+#include <linux/cc_platform.h>
#include <asm/page.h>
#include <asm/early_ioremap.h>
#include <asm/pgtable_types.h>
@@ -256,21 +257,6 @@ static inline void slow_down_io(void)
#endif
-#ifdef CONFIG_AMD_MEM_ENCRYPT
-#include <linux/jump_label.h>
-
-extern struct static_key_false sev_enable_key;
-static inline bool sev_key_active(void)
-{
- return static_branch_unlikely(&sev_enable_key);
-}
-
-#else /* !CONFIG_AMD_MEM_ENCRYPT */
-
-static inline bool sev_key_active(void) { return false; }
-
-#endif /* CONFIG_AMD_MEM_ENCRYPT */
-
#define BUILDIO(bwl, bw, type) \
static inline void out##bwl(unsigned type value, int port) \
{ \
@@ -301,7 +287,7 @@ static inline unsigned type in##bwl##_p(int port) \
\
static inline void outs##bwl(int port, const void *addr, unsigned long count) \
{ \
- if (sev_key_active()) { \
+ if (cc_platform_has(CC_ATTR_GUEST_UNROLL_STRING_IO)) { \
unsigned type *value = (unsigned type *)addr; \
while (count) { \
out##bwl(*value, port); \
@@ -317,7 +303,7 @@ static inline void outs##bwl(int port, const void *addr, unsigned long count) \
\
static inline void ins##bwl(int port, void *addr, unsigned long count) \
{ \
- if (sev_key_active()) { \
+ if (cc_platform_has(CC_ATTR_GUEST_UNROLL_STRING_IO)) { \
unsigned type *value = (unsigned type *)addr; \
while (count) { \
*value = in##bwl(port); \
diff --git a/arch/x86/include/asm/irqflags.h b/arch/x86/include/asm/irqflags.h
index c5ce9845c999..87761396e8cc 100644
--- a/arch/x86/include/asm/irqflags.h
+++ b/arch/x86/include/asm/irqflags.h
@@ -114,8 +114,6 @@ static __always_inline unsigned long arch_local_irq_save(void)
#define SAVE_FLAGS pushfq; popq %rax
#endif
-#define INTERRUPT_RETURN jmp native_iret
-
#endif
#endif /* __ASSEMBLY__ */
@@ -143,8 +141,13 @@ static __always_inline void arch_local_irq_restore(unsigned long flags)
#ifdef CONFIG_X86_64
#ifdef CONFIG_XEN_PV
#define SWAPGS ALTERNATIVE "swapgs", "", X86_FEATURE_XENPV
+#define INTERRUPT_RETURN \
+ ANNOTATE_RETPOLINE_SAFE; \
+ ALTERNATIVE_TERNARY("jmp *paravirt_iret(%rip);", \
+ X86_FEATURE_XENPV, "jmp xen_iret;", "jmp native_iret;")
#else
#define SWAPGS swapgs
+#define INTERRUPT_RETURN jmp native_iret
#endif
#endif
#endif /* !__ASSEMBLY__ */
diff --git a/arch/x86/include/asm/kvm-x86-ops.h b/arch/x86/include/asm/kvm-x86-ops.h
index cefe1d81e2e8..9e50da3ed01a 100644
--- a/arch/x86/include/asm/kvm-x86-ops.h
+++ b/arch/x86/include/asm/kvm-x86-ops.h
@@ -47,6 +47,7 @@ KVM_X86_OP(set_dr7)
KVM_X86_OP(cache_reg)
KVM_X86_OP(get_rflags)
KVM_X86_OP(set_rflags)
+KVM_X86_OP(get_if_flag)
KVM_X86_OP(tlb_flush_all)
KVM_X86_OP(tlb_flush_current)
KVM_X86_OP_NULL(tlb_remote_flush)
diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index 860ed500580c..555f4de47ef2 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -97,7 +97,7 @@
KVM_ARCH_REQ_FLAGS(25, KVM_REQUEST_WAIT | KVM_REQUEST_NO_WAKEUP)
#define KVM_REQ_TLB_FLUSH_CURRENT KVM_ARCH_REQ(26)
#define KVM_REQ_TLB_FLUSH_GUEST \
- KVM_ARCH_REQ_FLAGS(27, KVM_REQUEST_NO_WAKEUP)
+ KVM_ARCH_REQ_FLAGS(27, KVM_REQUEST_WAIT | KVM_REQUEST_NO_WAKEUP)
#define KVM_REQ_APF_READY KVM_ARCH_REQ(28)
#define KVM_REQ_MSR_FILTER_CHANGED KVM_ARCH_REQ(29)
#define KVM_REQ_UPDATE_CPU_DIRTY_LOGGING \
@@ -1349,6 +1349,7 @@ struct kvm_x86_ops {
void (*cache_reg)(struct kvm_vcpu *vcpu, enum kvm_reg reg);
unsigned long (*get_rflags)(struct kvm_vcpu *vcpu);
void (*set_rflags)(struct kvm_vcpu *vcpu, unsigned long rflags);
+ bool (*get_if_flag)(struct kvm_vcpu *vcpu);
void (*tlb_flush_all)(struct kvm_vcpu *vcpu);
void (*tlb_flush_current)(struct kvm_vcpu *vcpu);
diff --git a/arch/x86/include/asm/mtrr.h b/arch/x86/include/asm/mtrr.h
index 829df26fd7a3..76d726074c16 100644
--- a/arch/x86/include/asm/mtrr.h
+++ b/arch/x86/include/asm/mtrr.h
@@ -24,8 +24,8 @@
#define _ASM_X86_MTRR_H
#include <uapi/asm/mtrr.h>
-#include <asm/memtype.h>
+void mtrr_bp_init(void);
/*
* The following functions are for use by other drivers that cannot use
@@ -43,7 +43,6 @@ extern int mtrr_del(int reg, unsigned long base, unsigned long size);
extern int mtrr_del_page(int reg, unsigned long base, unsigned long size);
extern void mtrr_centaur_report_mcr(int mcr, u32 lo, u32 hi);
extern void mtrr_ap_init(void);
-extern void mtrr_bp_init(void);
extern void set_mtrr_aps_delayed_init(void);
extern void mtrr_aps_init(void);
extern void mtrr_bp_restore(void);
@@ -84,11 +83,6 @@ static inline int mtrr_trim_uncached_memory(unsigned long end_pfn)
static inline void mtrr_centaur_report_mcr(int mcr, u32 lo, u32 hi)
{
}
-static inline void mtrr_bp_init(void)
-{
- pat_disable("PAT support disabled because CONFIG_MTRR is disabled in the kernel.");
-}
-
#define mtrr_ap_init() do {} while (0)
#define set_mtrr_aps_delayed_init() do {} while (0)
#define mtrr_aps_init() do {} while (0)
diff --git a/arch/x86/include/asm/page_64.h b/arch/x86/include/asm/page_64.h
index 4bde0dc66100..e9c86299b835 100644
--- a/arch/x86/include/asm/page_64.h
+++ b/arch/x86/include/asm/page_64.h
@@ -5,6 +5,7 @@
#include <asm/page_64_types.h>
#ifndef __ASSEMBLY__
+#include <asm/cpufeatures.h>
#include <asm/alternative.h>
/* duplicated to the one in bootmem.h */
diff --git a/arch/x86/include/asm/paravirt.h b/arch/x86/include/asm/paravirt.h
index 21c4a694ca11..27d276232c80 100644
--- a/arch/x86/include/asm/paravirt.h
+++ b/arch/x86/include/asm/paravirt.h
@@ -752,11 +752,6 @@ extern void default_banner(void);
#define PARA_SITE(ptype, ops) _PVSITE(ptype, ops, .quad, 8)
#define PARA_INDIRECT(addr) *addr(%rip)
-#define INTERRUPT_RETURN \
- ANNOTATE_RETPOLINE_SAFE; \
- ALTERNATIVE_TERNARY("jmp *paravirt_iret(%rip);", \
- X86_FEATURE_XENPV, "jmp xen_iret;", "jmp native_iret;")
-
#ifdef CONFIG_DEBUG_ENTRY
.macro PARA_IRQ_save_fl
PARA_SITE(PARA_PATCH(PV_IRQ_save_fl),
diff --git a/arch/x86/include/asm/pgtable.h b/arch/x86/include/asm/pgtable.h
index 448cd01eb3ec..a34430b7af4a 100644
--- a/arch/x86/include/asm/pgtable.h
+++ b/arch/x86/include/asm/pgtable.h
@@ -22,6 +22,7 @@
#define pgprot_decrypted(prot) __pgprot(__sme_clr(pgprot_val(prot)))
#ifndef __ASSEMBLY__
+#include <linux/spinlock.h>
#include <asm/x86_init.h>
#include <asm/pkru.h>
#include <asm/fpu/api.h>
diff --git a/arch/x86/include/asm/pkru.h b/arch/x86/include/asm/pkru.h
index 4cd49afa0ca4..74f0a2d34ffd 100644
--- a/arch/x86/include/asm/pkru.h
+++ b/arch/x86/include/asm/pkru.h
@@ -4,8 +4,8 @@
#include <asm/cpufeature.h>
-#define PKRU_AD_BIT 0x1
-#define PKRU_WD_BIT 0x2
+#define PKRU_AD_BIT 0x1u
+#define PKRU_WD_BIT 0x2u
#define PKRU_BITS_PER_PKEY 2
#ifdef CONFIG_X86_INTEL_MEMORY_PROTECTION_KEYS
diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h
index 355d38c0cf60..2c5f12ae7d04 100644
--- a/arch/x86/include/asm/processor.h
+++ b/arch/x86/include/asm/processor.h
@@ -855,4 +855,12 @@ enum mds_mitigations {
MDS_MITIGATION_VMWERV,
};
+#ifdef CONFIG_X86_SGX
+int arch_memory_failure(unsigned long pfn, int flags);
+#define arch_memory_failure arch_memory_failure
+
+bool arch_is_platform_page(u64 paddr);
+#define arch_is_platform_page arch_is_platform_page
+#endif
+
#endif /* _ASM_X86_PROCESSOR_H */
diff --git a/arch/x86/include/asm/realmode.h b/arch/x86/include/asm/realmode.h
index 5db5d083c873..331474b150f1 100644
--- a/arch/x86/include/asm/realmode.h
+++ b/arch/x86/include/asm/realmode.h
@@ -89,6 +89,7 @@ static inline void set_real_mode_mem(phys_addr_t mem)
}
void reserve_real_mode(void);
+void load_trampoline_pgtable(void);
#endif /* __ASSEMBLY__ */
diff --git a/arch/x86/include/asm/set_memory.h b/arch/x86/include/asm/set_memory.h
index 872617542bbc..ff0f2d90338a 100644
--- a/arch/x86/include/asm/set_memory.h
+++ b/arch/x86/include/asm/set_memory.h
@@ -2,6 +2,7 @@
#ifndef _ASM_X86_SET_MEMORY_H
#define _ASM_X86_SET_MEMORY_H
+#include <linux/mm.h>
#include <asm/page.h>
#include <asm-generic/set_memory.h>
@@ -99,6 +100,9 @@ static inline int set_mce_nospec(unsigned long pfn, bool unmap)
unsigned long decoy_addr;
int rc;
+ /* SGX pages are not in the 1:1 map */
+ if (arch_is_platform_page(pfn << PAGE_SHIFT))
+ return 0;
/*
* We would like to just call:
* set_memory_XX((unsigned long)pfn_to_kaddr(pfn), 1);
diff --git a/arch/x86/include/asm/sev-common.h b/arch/x86/include/asm/sev-common.h
index 6acaf5af0a3d..1b2fd32b42fe 100644
--- a/arch/x86/include/asm/sev-common.h
+++ b/arch/x86/include/asm/sev-common.h
@@ -18,20 +18,19 @@
/* SEV Information Request/Response */
#define GHCB_MSR_SEV_INFO_RESP 0x001
#define GHCB_MSR_SEV_INFO_REQ 0x002
-#define GHCB_MSR_VER_MAX_POS 48
-#define GHCB_MSR_VER_MAX_MASK 0xffff
-#define GHCB_MSR_VER_MIN_POS 32
-#define GHCB_MSR_VER_MIN_MASK 0xffff
-#define GHCB_MSR_CBIT_POS 24
-#define GHCB_MSR_CBIT_MASK 0xff
-#define GHCB_MSR_SEV_INFO(_max, _min, _cbit) \
- ((((_max) & GHCB_MSR_VER_MAX_MASK) << GHCB_MSR_VER_MAX_POS) | \
- (((_min) & GHCB_MSR_VER_MIN_MASK) << GHCB_MSR_VER_MIN_POS) | \
- (((_cbit) & GHCB_MSR_CBIT_MASK) << GHCB_MSR_CBIT_POS) | \
+
+#define GHCB_MSR_SEV_INFO(_max, _min, _cbit) \
+ /* GHCBData[63:48] */ \
+ ((((_max) & 0xffff) << 48) | \
+ /* GHCBData[47:32] */ \
+ (((_min) & 0xffff) << 32) | \
+ /* GHCBData[31:24] */ \
+ (((_cbit) & 0xff) << 24) | \
GHCB_MSR_SEV_INFO_RESP)
+
#define GHCB_MSR_INFO(v) ((v) & 0xfffUL)
-#define GHCB_MSR_PROTO_MAX(v) (((v) >> GHCB_MSR_VER_MAX_POS) & GHCB_MSR_VER_MAX_MASK)
-#define GHCB_MSR_PROTO_MIN(v) (((v) >> GHCB_MSR_VER_MIN_POS) & GHCB_MSR_VER_MIN_MASK)
+#define GHCB_MSR_PROTO_MAX(v) (((v) >> 48) & 0xffff)
+#define GHCB_MSR_PROTO_MIN(v) (((v) >> 32) & 0xffff)
/* CPUID Request/Response */
#define GHCB_MSR_CPUID_REQ 0x004
@@ -46,30 +45,36 @@
#define GHCB_CPUID_REQ_EBX 1
#define GHCB_CPUID_REQ_ECX 2
#define GHCB_CPUID_REQ_EDX 3
-#define GHCB_CPUID_REQ(fn, reg) \
- (GHCB_MSR_CPUID_REQ | \
- (((unsigned long)reg & GHCB_MSR_CPUID_REG_MASK) << GHCB_MSR_CPUID_REG_POS) | \
- (((unsigned long)fn) << GHCB_MSR_CPUID_FUNC_POS))
+#define GHCB_CPUID_REQ(fn, reg) \
+ /* GHCBData[11:0] */ \
+ (GHCB_MSR_CPUID_REQ | \
+ /* GHCBData[31:12] */ \
+ (((unsigned long)(reg) & 0x3) << 30) | \
+ /* GHCBData[63:32] */ \
+ (((unsigned long)fn) << 32))
/* AP Reset Hold */
-#define GHCB_MSR_AP_RESET_HOLD_REQ 0x006
-#define GHCB_MSR_AP_RESET_HOLD_RESP 0x007
+#define GHCB_MSR_AP_RESET_HOLD_REQ 0x006
+#define GHCB_MSR_AP_RESET_HOLD_RESP 0x007
/* GHCB Hypervisor Feature Request/Response */
-#define GHCB_MSR_HV_FT_REQ 0x080
-#define GHCB_MSR_HV_FT_RESP 0x081
+#define GHCB_MSR_HV_FT_REQ 0x080
+#define GHCB_MSR_HV_FT_RESP 0x081
#define GHCB_MSR_TERM_REQ 0x100
#define GHCB_MSR_TERM_REASON_SET_POS 12
#define GHCB_MSR_TERM_REASON_SET_MASK 0xf
#define GHCB_MSR_TERM_REASON_POS 16
#define GHCB_MSR_TERM_REASON_MASK 0xff
-#define GHCB_SEV_TERM_REASON(reason_set, reason_val) \
- (((((u64)reason_set) & GHCB_MSR_TERM_REASON_SET_MASK) << GHCB_MSR_TERM_REASON_SET_POS) | \
- ((((u64)reason_val) & GHCB_MSR_TERM_REASON_MASK) << GHCB_MSR_TERM_REASON_POS))
-#define GHCB_SEV_ES_REASON_GENERAL_REQUEST 0
-#define GHCB_SEV_ES_REASON_PROTOCOL_UNSUPPORTED 1
+#define GHCB_SEV_TERM_REASON(reason_set, reason_val) \
+ /* GHCBData[15:12] */ \
+ (((((u64)reason_set) & 0xf) << 12) | \
+ /* GHCBData[23:16] */ \
+ ((((u64)reason_val) & 0xff) << 16))
+
+#define GHCB_SEV_ES_GEN_REQ 0
+#define GHCB_SEV_ES_PROT_UNSUPPORTED 1
#define GHCB_RESP_CODE(v) ((v) & GHCB_MSR_INFO_MASK)
diff --git a/arch/x86/include/asm/tlbflush.h b/arch/x86/include/asm/tlbflush.h
index b587a9ee9cb2..98fa0a114074 100644
--- a/arch/x86/include/asm/tlbflush.h
+++ b/arch/x86/include/asm/tlbflush.h
@@ -261,4 +261,9 @@ extern void arch_tlbbatch_flush(struct arch_tlbflush_unmap_batch *batch);
#endif /* !MODULE */
+static inline void __native_tlb_flush_global(unsigned long cr4)
+{
+ native_write_cr4(cr4 ^ X86_CR4_PGE);
+ native_write_cr4(cr4);
+}
#endif /* _ASM_X86_TLBFLUSH_H */
diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile
index 2ff3e600f426..6aef9ee28a39 100644
--- a/arch/x86/kernel/Makefile
+++ b/arch/x86/kernel/Makefile
@@ -84,7 +84,7 @@ obj-$(CONFIG_IA32_EMULATION) += tls.o
obj-y += step.o
obj-$(CONFIG_INTEL_TXT) += tboot.o
obj-$(CONFIG_ISA_DMA_API) += i8237.o
-obj-$(CONFIG_STACKTRACE) += stacktrace.o
+obj-y += stacktrace.o
obj-y += cpu/
obj-y += acpi/
obj-y += reboot.o
diff --git a/arch/x86/kernel/cc_platform.c b/arch/x86/kernel/cc_platform.c
index 03bb2f343ddb..8a25b1c0d480 100644
--- a/arch/x86/kernel/cc_platform.c
+++ b/arch/x86/kernel/cc_platform.c
@@ -50,6 +50,14 @@ static bool amd_cc_platform_has(enum cc_attr attr)
case CC_ATTR_GUEST_STATE_ENCRYPT:
return sev_status & MSR_AMD64_SEV_ES_ENABLED;
+ /*
+ * With SEV, the rep string I/O instructions need to be unrolled
+ * but SEV-ES supports them through the #VC handler.
+ */
+ case CC_ATTR_GUEST_UNROLL_STRING_IO:
+ return (sev_status & MSR_AMD64_SEV_ENABLED) &&
+ !(sev_status & MSR_AMD64_SEV_ES_ENABLED);
+
default:
return false;
}
diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c
index 0083464de5e3..79b3d67addcc 100644
--- a/arch/x86/kernel/cpu/common.c
+++ b/arch/x86/kernel/cpu/common.c
@@ -384,7 +384,7 @@ set_register:
}
EXPORT_SYMBOL(native_write_cr0);
-void native_write_cr4(unsigned long val)
+void __no_profile native_write_cr4(unsigned long val)
{
unsigned long bits_changed = 0;
diff --git a/arch/x86/kernel/cpu/resctrl/monitor.c b/arch/x86/kernel/cpu/resctrl/monitor.c
index c9f0f3d63f75..eaf25a234ff5 100644
--- a/arch/x86/kernel/cpu/resctrl/monitor.c
+++ b/arch/x86/kernel/cpu/resctrl/monitor.c
@@ -282,7 +282,7 @@ static u64 mbm_overflow_count(u64 prev_msr, u64 cur_msr, unsigned int width)
u64 shift = 64 - width, chunks;
chunks = (cur_msr << shift) - (prev_msr << shift);
- return chunks >>= shift;
+ return chunks >> shift;
}
static u64 __mon_event_count(u32 rmid, struct rmid_read *rr)
diff --git a/arch/x86/kernel/cpu/sgx/main.c b/arch/x86/kernel/cpu/sgx/main.c
index 8471a8b9b48e..4b41efc9e367 100644
--- a/arch/x86/kernel/cpu/sgx/main.c
+++ b/arch/x86/kernel/cpu/sgx/main.c
@@ -6,11 +6,13 @@
#include <linux/highmem.h>
#include <linux/kthread.h>
#include <linux/miscdevice.h>
+#include <linux/node.h>
#include <linux/pagemap.h>
#include <linux/ratelimit.h>
#include <linux/sched/mm.h>
#include <linux/sched/signal.h>
#include <linux/slab.h>
+#include <linux/sysfs.h>
#include <asm/sgx.h>
#include "driver.h"
#include "encl.h"
@@ -20,6 +22,7 @@ struct sgx_epc_section sgx_epc_sections[SGX_MAX_EPC_SECTIONS];
static int sgx_nr_epc_sections;
static struct task_struct *ksgxd_tsk;
static DECLARE_WAIT_QUEUE_HEAD(ksgxd_waitq);
+static DEFINE_XARRAY(sgx_epc_address_space);
/*
* These variables are part of the state of the reclaimer, and must be accessed
@@ -60,6 +63,24 @@ static void __sgx_sanitize_pages(struct list_head *dirty_page_list)
page = list_first_entry(dirty_page_list, struct sgx_epc_page, list);
+ /*
+ * Checking page->poison without holding the node->lock
+ * is racy, but losing the race (i.e. poison is set just
+ * after the check) just means __eremove() will be uselessly
+ * called for a page that sgx_free_epc_page() will put onto
+ * the node->sgx_poison_page_list later.
+ */
+ if (page->poison) {
+ struct sgx_epc_section *section = &sgx_epc_sections[page->section];
+ struct sgx_numa_node *node = section->node;
+
+ spin_lock(&node->lock);
+ list_move(&page->list, &node->sgx_poison_page_list);
+ spin_unlock(&node->lock);
+
+ continue;
+ }
+
ret = __eremove(sgx_get_epc_virt_addr(page));
if (!ret) {
/*
@@ -471,6 +492,7 @@ static struct sgx_epc_page *__sgx_alloc_epc_page_from_node(int nid)
page = list_first_entry(&node->free_page_list, struct sgx_epc_page, list);
list_del_init(&page->list);
+ page->flags = 0;
spin_unlock(&node->lock);
atomic_long_dec(&sgx_nr_free_pages);
@@ -624,7 +646,12 @@ void sgx_free_epc_page(struct sgx_epc_page *page)
spin_lock(&node->lock);
- list_add_tail(&page->list, &node->free_page_list);
+ page->owner = NULL;
+ if (page->poison)
+ list_add(&page->list, &node->sgx_poison_page_list);
+ else
+ list_add_tail(&page->list, &node->free_page_list);
+ page->flags = SGX_EPC_PAGE_IS_FREE;
spin_unlock(&node->lock);
atomic_long_inc(&sgx_nr_free_pages);
@@ -648,17 +675,102 @@ static bool __init sgx_setup_epc_section(u64 phys_addr, u64 size,
}
section->phys_addr = phys_addr;
+ xa_store_range(&sgx_epc_address_space, section->phys_addr,
+ phys_addr + size - 1, section, GFP_KERNEL);
for (i = 0; i < nr_pages; i++) {
section->pages[i].section = index;
section->pages[i].flags = 0;
section->pages[i].owner = NULL;
+ section->pages[i].poison = 0;
list_add_tail(&section->pages[i].list, &sgx_dirty_page_list);
}
return true;
}
+bool arch_is_platform_page(u64 paddr)
+{
+ return !!xa_load(&sgx_epc_address_space, paddr);
+}
+EXPORT_SYMBOL_GPL(arch_is_platform_page);
+
+static struct sgx_epc_page *sgx_paddr_to_page(u64 paddr)
+{
+ struct sgx_epc_section *section;
+
+ section = xa_load(&sgx_epc_address_space, paddr);
+ if (!section)
+ return NULL;
+
+ return &section->pages[PFN_DOWN(paddr - section->phys_addr)];
+}
+
+/*
+ * Called in process context to handle a hardware reported
+ * error in an SGX EPC page.
+ * If the MF_ACTION_REQUIRED bit is set in flags, then the
+ * context is the task that consumed the poison data. Otherwise
+ * this is called from a kernel thread unrelated to the page.
+ */
+int arch_memory_failure(unsigned long pfn, int flags)
+{
+ struct sgx_epc_page *page = sgx_paddr_to_page(pfn << PAGE_SHIFT);
+ struct sgx_epc_section *section;
+ struct sgx_numa_node *node;
+
+ /*
+ * mm/memory-failure.c calls this routine for all errors
+ * where there isn't a "struct page" for the address. But that
+ * includes other address ranges besides SGX.
+ */
+ if (!page)
+ return -ENXIO;
+
+ /*
+ * If poison was consumed synchronously. Send a SIGBUS to
+ * the task. Hardware has already exited the SGX enclave and
+ * will not allow re-entry to an enclave that has a memory
+ * error. The signal may help the task understand why the
+ * enclave is broken.
+ */
+ if (flags & MF_ACTION_REQUIRED)
+ force_sig(SIGBUS);
+
+ section = &sgx_epc_sections[page->section];
+ node = section->node;
+
+ spin_lock(&node->lock);
+
+ /* Already poisoned? Nothing more to do */
+ if (page->poison)
+ goto out;
+
+ page->poison = 1;
+
+ /*
+ * If the page is on a free list, move it to the per-node
+ * poison page list.
+ */
+ if (page->flags & SGX_EPC_PAGE_IS_FREE) {
+ list_move(&page->list, &node->sgx_poison_page_list);
+ goto out;
+ }
+
+ /*
+ * TBD: Add additional plumbing to enable pre-emptive
+ * action for asynchronous poison notification. Until
+ * then just hope that the poison:
+ * a) is not accessed - sgx_free_epc_page() will deal with it
+ * when the user gives it back
+ * b) results in a recoverable machine check rather than
+ * a fatal one
+ */
+out:
+ spin_unlock(&node->lock);
+ return 0;
+}
+
/**
* A section metric is concatenated in a way that @low bits 12-31 define the
* bits 12-31 of the metric and @high bits 0-19 define the bits 32-51 of the
@@ -670,6 +782,48 @@ static inline u64 __init sgx_calc_section_metric(u64 low, u64 high)
((high & GENMASK_ULL(19, 0)) << 32);
}
+#ifdef CONFIG_NUMA
+static ssize_t sgx_total_bytes_show(struct device *dev, struct device_attribute *attr, char *buf)
+{
+ return sysfs_emit(buf, "%lu\n", sgx_numa_nodes[dev->id].size);
+}
+static DEVICE_ATTR_RO(sgx_total_bytes);
+
+static umode_t arch_node_attr_is_visible(struct kobject *kobj,
+ struct attribute *attr, int idx)
+{
+ /* Make all x86/ attributes invisible when SGX is not initialized: */
+ if (nodes_empty(sgx_numa_mask))
+ return 0;
+
+ return attr->mode;
+}
+
+static struct attribute *arch_node_dev_attrs[] = {
+ &dev_attr_sgx_total_bytes.attr,
+ NULL,
+};
+
+const struct attribute_group arch_node_dev_group = {
+ .name = "x86",
+ .attrs = arch_node_dev_attrs,
+ .is_visible = arch_node_attr_is_visible,
+};
+
+static void __init arch_update_sysfs_visibility(int nid)
+{
+ struct node *node = node_devices[nid];
+ int ret;
+
+ ret = sysfs_update_group(&node->dev.kobj, &arch_node_dev_group);
+
+ if (ret)
+ pr_err("sysfs update failed (%d), files may be invisible", ret);
+}
+#else /* !CONFIG_NUMA */
+static void __init arch_update_sysfs_visibility(int nid) {}
+#endif
+
static bool __init sgx_page_cache_init(void)
{
u32 eax, ebx, ecx, edx, type;
@@ -713,10 +867,16 @@ static bool __init sgx_page_cache_init(void)
if (!node_isset(nid, sgx_numa_mask)) {
spin_lock_init(&sgx_numa_nodes[nid].lock);
INIT_LIST_HEAD(&sgx_numa_nodes[nid].free_page_list);
+ INIT_LIST_HEAD(&sgx_numa_nodes[nid].sgx_poison_page_list);
node_set(nid, sgx_numa_mask);
+ sgx_numa_nodes[nid].size = 0;
+
+ /* Make SGX-specific node sysfs files visible: */
+ arch_update_sysfs_visibility(nid);
}
sgx_epc_sections[i].node = &sgx_numa_nodes[nid];
+ sgx_numa_nodes[nid].size += size;
sgx_nr_epc_sections++;
}
diff --git a/arch/x86/kernel/cpu/sgx/sgx.h b/arch/x86/kernel/cpu/sgx/sgx.h
index 4628acec0009..0f17def9fe6f 100644
--- a/arch/x86/kernel/cpu/sgx/sgx.h
+++ b/arch/x86/kernel/cpu/sgx/sgx.h
@@ -26,9 +26,13 @@
/* Pages, which are being tracked by the page reclaimer. */
#define SGX_EPC_PAGE_RECLAIMER_TRACKED BIT(0)
+/* Pages on free list */
+#define SGX_EPC_PAGE_IS_FREE BIT(1)
+
struct sgx_epc_page {
unsigned int section;
- unsigned int flags;
+ u16 flags;
+ u16 poison;
struct sgx_encl_page *owner;
struct list_head list;
};
@@ -39,6 +43,8 @@ struct sgx_epc_page {
*/
struct sgx_numa_node {
struct list_head free_page_list;
+ struct list_head sgx_poison_page_list;
+ unsigned long size;
spinlock_t lock;
};
diff --git a/arch/x86/kernel/fpu/core.c b/arch/x86/kernel/fpu/core.c
index 8ea306b1bf8e..dd3777ac0443 100644
--- a/arch/x86/kernel/fpu/core.c
+++ b/arch/x86/kernel/fpu/core.c
@@ -99,6 +99,19 @@ bool irq_fpu_usable(void)
EXPORT_SYMBOL(irq_fpu_usable);
/*
+ * Track AVX512 state use because it is known to slow the max clock
+ * speed of the core.
+ */
+static void update_avx_timestamp(struct fpu *fpu)
+{
+
+#define AVX512_TRACKING_MASK (XFEATURE_MASK_ZMM_Hi256 | XFEATURE_MASK_Hi16_ZMM)
+
+ if (fpu->fpstate->regs.xsave.header.xfeatures & AVX512_TRACKING_MASK)
+ fpu->avx512_timestamp = jiffies;
+}
+
+/*
* Save the FPU register state in fpu->fpstate->regs. The register state is
* preserved.
*
@@ -116,13 +129,7 @@ void save_fpregs_to_fpstate(struct fpu *fpu)
{
if (likely(use_xsave())) {
os_xsave(fpu->fpstate);
-
- /*
- * AVX512 state is tracked here because its use is
- * known to slow the max clock speed of the core.
- */
- if (fpu->fpstate->regs.xsave.header.xfeatures & XFEATURE_MASK_AVX512)
- fpu->avx512_timestamp = jiffies;
+ update_avx_timestamp(fpu);
return;
}
diff --git a/arch/x86/kernel/head64.c b/arch/x86/kernel/head64.c
index fc5371a7e9d1..de563db9cdcd 100644
--- a/arch/x86/kernel/head64.c
+++ b/arch/x86/kernel/head64.c
@@ -126,6 +126,36 @@ static bool __head check_la57_support(unsigned long physaddr)
}
#endif
+static unsigned long sme_postprocess_startup(struct boot_params *bp, pmdval_t *pmd)
+{
+ unsigned long vaddr, vaddr_end;
+ int i;
+
+ /* Encrypt the kernel and related (if SME is active) */
+ sme_encrypt_kernel(bp);
+
+ /*
+ * Clear the memory encryption mask from the .bss..decrypted section.
+ * The bss section will be memset to zero later in the initialization so
+ * there is no need to zero it after changing the memory encryption
+ * attribute.
+ */
+ if (sme_get_me_mask()) {
+ vaddr = (unsigned long)__start_bss_decrypted;
+ vaddr_end = (unsigned long)__end_bss_decrypted;
+ for (; vaddr < vaddr_end; vaddr += PMD_SIZE) {
+ i = pmd_index(vaddr);
+ pmd[i] -= sme_get_me_mask();
+ }
+ }
+
+ /*
+ * Return the SME encryption mask (if SME is active) to be used as a
+ * modifier for the initial pgdir entry programmed into CR3.
+ */
+ return sme_get_me_mask();
+}
+
/* Code in __startup_64() can be relocated during execution, but the compiler
* doesn't have to generate PC-relative relocations when accessing globals from
* that function. Clang actually does not generate them, which leads to
@@ -135,7 +165,6 @@ static bool __head check_la57_support(unsigned long physaddr)
unsigned long __head __startup_64(unsigned long physaddr,
struct boot_params *bp)
{
- unsigned long vaddr, vaddr_end;
unsigned long load_delta, *p;
unsigned long pgtable_flags;
pgdval_t *pgd;
@@ -276,34 +305,7 @@ unsigned long __head __startup_64(unsigned long physaddr,
*/
*fixup_long(&phys_base, physaddr) += load_delta - sme_get_me_mask();
- /* Encrypt the kernel and related (if SME is active) */
- sme_encrypt_kernel(bp);
-
- /*
- * Clear the memory encryption mask from the .bss..decrypted section.
- * The bss section will be memset to zero later in the initialization so
- * there is no need to zero it after changing the memory encryption
- * attribute.
- *
- * This is early code, use an open coded check for SME instead of
- * using cc_platform_has(). This eliminates worries about removing
- * instrumentation or checking boot_cpu_data in the cc_platform_has()
- * function.
- */
- if (sme_get_me_mask()) {
- vaddr = (unsigned long)__start_bss_decrypted;
- vaddr_end = (unsigned long)__end_bss_decrypted;
- for (; vaddr < vaddr_end; vaddr += PMD_SIZE) {
- i = pmd_index(vaddr);
- pmd[i] -= sme_get_me_mask();
- }
- }
-
- /*
- * Return the SME encryption mask (if SME is active) to be used as a
- * modifier for the initial pgdir entry programmed into CR3.
- */
- return sme_get_me_mask();
+ return sme_postprocess_startup(bp, pmd);
}
unsigned long __startup_secondary_64(void)
@@ -485,6 +487,10 @@ asmlinkage __visible void __init x86_64_start_kernel(char * real_mode_data)
clear_bss();
+ /*
+ * This needs to happen *before* kasan_early_init() because latter maps stuff
+ * into that page.
+ */
clear_page(init_top_pgt);
/*
@@ -496,6 +502,16 @@ asmlinkage __visible void __init x86_64_start_kernel(char * real_mode_data)
kasan_early_init();
+ /*
+ * Flush global TLB entries which could be left over from the trampoline page
+ * table.
+ *
+ * This needs to happen *after* kasan_early_init() as KASAN-enabled .configs
+ * instrument native_write_cr4() so KASAN must be initialized for that
+ * instrumentation to work.
+ */
+ __native_tlb_flush_global(this_cpu_read(cpu_tlbstate.cr4));
+
idt_setup_early_handler();
copy_bootdata(__va(real_mode_data));
diff --git a/arch/x86/kernel/head_64.S b/arch/x86/kernel/head_64.S
index d8b3ebd2bb85..9c63fc5988cd 100644
--- a/arch/x86/kernel/head_64.S
+++ b/arch/x86/kernel/head_64.S
@@ -166,9 +166,26 @@ SYM_INNER_LABEL(secondary_startup_64_no_verify, SYM_L_GLOBAL)
call sev_verify_cbit
popq %rsi
- /* Switch to new page-table */
+ /*
+ * Switch to new page-table
+ *
+ * For the boot CPU this switches to early_top_pgt which still has the
+ * indentity mappings present. The secondary CPUs will switch to the
+ * init_top_pgt here, away from the trampoline_pgd and unmap the
+ * indentity mapped ranges.
+ */
movq %rax, %cr3
+ /*
+ * Do a global TLB flush after the CR3 switch to make sure the TLB
+ * entries from the identity mapping are flushed.
+ */
+ movq %cr4, %rcx
+ movq %rcx, %rax
+ xorq $X86_CR4_PGE, %rcx
+ movq %rcx, %cr4
+ movq %rax, %cr4
+
/* Ensure I am executing from virtual addresses */
movq $1f, %rax
ANNOTATE_RETPOLINE_SAFE
diff --git a/arch/x86/kernel/reboot.c b/arch/x86/kernel/reboot.c
index 0a40df66a40d..fa700b46588e 100644
--- a/arch/x86/kernel/reboot.c
+++ b/arch/x86/kernel/reboot.c
@@ -113,17 +113,9 @@ void __noreturn machine_real_restart(unsigned int type)
spin_unlock(&rtc_lock);
/*
- * Switch back to the initial page table.
+ * Switch to the trampoline page table.
*/
-#ifdef CONFIG_X86_32
- load_cr3(initial_page_table);
-#else
- write_cr3(real_mode_header->trampoline_pgd);
-
- /* Exiting long mode will fail if CR4.PCIDE is set. */
- if (boot_cpu_has(X86_FEATURE_PCID))
- cr4_clear_bits(X86_CR4_PCIDE);
-#endif
+ load_trampoline_pgtable();
/* Jump to the identity-mapped low memory code */
#ifdef CONFIG_X86_32
diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c
index 6a190c7f4d71..f7a132eb794d 100644
--- a/arch/x86/kernel/setup.c
+++ b/arch/x86/kernel/setup.c
@@ -40,6 +40,7 @@
#include <asm/kasan.h>
#include <asm/kaslr.h>
#include <asm/mce.h>
+#include <asm/memtype.h>
#include <asm/mtrr.h>
#include <asm/realmode.h>
#include <asm/olpc_ofw.h>
@@ -713,9 +714,6 @@ static void __init early_reserve_memory(void)
early_reserve_initrd();
- if (efi_enabled(EFI_BOOT))
- efi_memblock_x86_reserve_range();
-
memblock_x86_reserve_range_setup_data();
reserve_ibft_region();
@@ -742,28 +740,6 @@ dump_kernel_offset(struct notifier_block *self, unsigned long v, void *p)
return 0;
}
-static char * __init prepare_command_line(void)
-{
-#ifdef CONFIG_CMDLINE_BOOL
-#ifdef CONFIG_CMDLINE_OVERRIDE
- strlcpy(boot_command_line, builtin_cmdline, COMMAND_LINE_SIZE);
-#else
- if (builtin_cmdline[0]) {
- /* append boot loader cmdline to builtin */
- strlcat(builtin_cmdline, " ", COMMAND_LINE_SIZE);
- strlcat(builtin_cmdline, boot_command_line, COMMAND_LINE_SIZE);
- strlcpy(boot_command_line, builtin_cmdline, COMMAND_LINE_SIZE);
- }
-#endif
-#endif
-
- strlcpy(command_line, boot_command_line, COMMAND_LINE_SIZE);
-
- parse_early_param();
-
- return command_line;
-}
-
/*
* Determine if we were loaded by an EFI loader. If so, then we have also been
* passed the efi memmap, systab, etc., so we should use these data structures
@@ -853,23 +829,6 @@ void __init setup_arch(char **cmdline_p)
x86_init.oem.arch_setup();
/*
- * x86_configure_nx() is called before parse_early_param() (called by
- * prepare_command_line()) to detect whether hardware doesn't support
- * NX (so that the early EHCI debug console setup can safely call
- * set_fixmap()). It may then be called again from within noexec_setup()
- * during parsing early parameters to honor the respective command line
- * option.
- */
- x86_configure_nx();
-
- /*
- * This parses early params and it needs to run before
- * early_reserve_memory() because latter relies on such settings
- * supplied as early params.
- */
- *cmdline_p = prepare_command_line();
-
- /*
* Do some memory reservations *before* memory is added to memblock, so
* memblock allocations won't overwrite it.
*
@@ -902,6 +861,36 @@ void __init setup_arch(char **cmdline_p)
bss_resource.start = __pa_symbol(__bss_start);
bss_resource.end = __pa_symbol(__bss_stop)-1;
+#ifdef CONFIG_CMDLINE_BOOL
+#ifdef CONFIG_CMDLINE_OVERRIDE
+ strlcpy(boot_command_line, builtin_cmdline, COMMAND_LINE_SIZE);
+#else
+ if (builtin_cmdline[0]) {
+ /* append boot loader cmdline to builtin */
+ strlcat(builtin_cmdline, " ", COMMAND_LINE_SIZE);
+ strlcat(builtin_cmdline, boot_command_line, COMMAND_LINE_SIZE);
+ strlcpy(boot_command_line, builtin_cmdline, COMMAND_LINE_SIZE);
+ }
+#endif
+#endif
+
+ strlcpy(command_line, boot_command_line, COMMAND_LINE_SIZE);
+ *cmdline_p = command_line;
+
+ /*
+ * x86_configure_nx() is called before parse_early_param() to detect
+ * whether hardware doesn't support NX (so that the early EHCI debug
+ * console setup can safely call set_fixmap()). It may then be called
+ * again from within noexec_setup() during parsing early parameters
+ * to honor the respective command line option.
+ */
+ x86_configure_nx();
+
+ parse_early_param();
+
+ if (efi_enabled(EFI_BOOT))
+ efi_memblock_x86_reserve_range();
+
#ifdef CONFIG_MEMORY_HOTPLUG
/*
* Memory used by the kernel cannot be hot-removed because Linux
@@ -979,7 +968,11 @@ void __init setup_arch(char **cmdline_p)
max_pfn = e820__end_of_ram_pfn();
/* update e820 for memory not covered by WB MTRRs */
- mtrr_bp_init();
+ if (IS_ENABLED(CONFIG_MTRR))
+ mtrr_bp_init();
+ else
+ pat_disable("PAT support disabled because CONFIG_MTRR is disabled in the kernel.");
+
if (mtrr_trim_uncached_memory(max_pfn))
max_pfn = e820__end_of_ram_pfn();
diff --git a/arch/x86/kernel/sev-shared.c b/arch/x86/kernel/sev-shared.c
index 787dc5f568b5..ce987688bbc0 100644
--- a/arch/x86/kernel/sev-shared.c
+++ b/arch/x86/kernel/sev-shared.c
@@ -221,7 +221,7 @@ void __init do_vc_no_ghcb(struct pt_regs *regs, unsigned long exit_code)
fail:
/* Terminate the guest */
- sev_es_terminate(GHCB_SEV_ES_REASON_GENERAL_REQUEST);
+ sev_es_terminate(GHCB_SEV_ES_GEN_REQ);
}
static enum es_result vc_insn_string_read(struct es_em_ctxt *ctxt,
diff --git a/arch/x86/kernel/sev.c b/arch/x86/kernel/sev.c
index a9fc2ac7a8bd..e6d316a01fdd 100644
--- a/arch/x86/kernel/sev.c
+++ b/arch/x86/kernel/sev.c
@@ -26,6 +26,7 @@
#include <asm/fpu/xcr.h>
#include <asm/processor.h>
#include <asm/realmode.h>
+#include <asm/setup.h>
#include <asm/traps.h>
#include <asm/svm.h>
#include <asm/smp.h>
@@ -86,9 +87,6 @@ struct ghcb_state {
static DEFINE_PER_CPU(struct sev_es_runtime_data*, runtime_data);
DEFINE_STATIC_KEY_FALSE(sev_es_enable_key);
-/* Needed in vc_early_forward_exception */
-void do_early_exception(struct pt_regs *regs, int trapnr);
-
static __always_inline bool on_vc_stack(struct pt_regs *regs)
{
unsigned long sp = regs->sp;
@@ -209,9 +207,6 @@ static noinstr struct ghcb *__sev_get_ghcb(struct ghcb_state *state)
return ghcb;
}
-/* Needed in vc_early_forward_exception */
-void do_early_exception(struct pt_regs *regs, int trapnr);
-
static inline u64 sev_es_rd_ghcb_msr(void)
{
return __rdmsr(MSR_AMD64_SEV_ES_GHCB);
@@ -797,22 +792,6 @@ static void __init vc_early_forward_exception(struct es_em_ctxt *ctxt)
do_early_exception(ctxt->regs, trapnr);
}
-static long *vc_insn_get_reg(struct es_em_ctxt *ctxt)
-{
- long *reg_array;
- int offset;
-
- reg_array = (long *)ctxt->regs;
- offset = insn_get_modrm_reg_off(&ctxt->insn, ctxt->regs);
-
- if (offset < 0)
- return NULL;
-
- offset /= sizeof(long);
-
- return reg_array + offset;
-}
-
static long *vc_insn_get_rm(struct es_em_ctxt *ctxt)
{
long *reg_array;
@@ -860,76 +839,6 @@ static enum es_result vc_do_mmio(struct ghcb *ghcb, struct es_em_ctxt *ctxt,
return sev_es_ghcb_hv_call(ghcb, true, ctxt, exit_code, exit_info_1, exit_info_2);
}
-static enum es_result vc_handle_mmio_twobyte_ops(struct ghcb *ghcb,
- struct es_em_ctxt *ctxt)
-{
- struct insn *insn = &ctxt->insn;
- unsigned int bytes = 0;
- enum es_result ret;
- int sign_byte;
- long *reg_data;
-
- switch (insn->opcode.bytes[1]) {
- /* MMIO Read w/ zero-extension */
- case 0xb6:
- bytes = 1;
- fallthrough;
- case 0xb7:
- if (!bytes)
- bytes = 2;
-
- ret = vc_do_mmio(ghcb, ctxt, bytes, true);
- if (ret)
- break;
-
- /* Zero extend based on operand size */
- reg_data = vc_insn_get_reg(ctxt);
- if (!reg_data)
- return ES_DECODE_FAILED;
-
- memset(reg_data, 0, insn->opnd_bytes);
-
- memcpy(reg_data, ghcb->shared_buffer, bytes);
- break;
-
- /* MMIO Read w/ sign-extension */
- case 0xbe:
- bytes = 1;
- fallthrough;
- case 0xbf:
- if (!bytes)
- bytes = 2;
-
- ret = vc_do_mmio(ghcb, ctxt, bytes, true);
- if (ret)
- break;
-
- /* Sign extend based on operand size */
- reg_data = vc_insn_get_reg(ctxt);
- if (!reg_data)
- return ES_DECODE_FAILED;
-
- if (bytes == 1) {
- u8 *val = (u8 *)ghcb->shared_buffer;
-
- sign_byte = (*val & 0x80) ? 0xff : 0x00;
- } else {
- u16 *val = (u16 *)ghcb->shared_buffer;
-
- sign_byte = (*val & 0x8000) ? 0xff : 0x00;
- }
- memset(reg_data, sign_byte, insn->opnd_bytes);
-
- memcpy(reg_data, ghcb->shared_buffer, bytes);
- break;
-
- default:
- ret = ES_UNSUPPORTED;
- }
-
- return ret;
-}
-
/*
* The MOVS instruction has two memory operands, which raises the
* problem that it is not known whether the access to the source or the
@@ -997,83 +906,79 @@ static enum es_result vc_handle_mmio_movs(struct es_em_ctxt *ctxt,
return ES_RETRY;
}
-static enum es_result vc_handle_mmio(struct ghcb *ghcb,
- struct es_em_ctxt *ctxt)
+static enum es_result vc_handle_mmio(struct ghcb *ghcb, struct es_em_ctxt *ctxt)
{
struct insn *insn = &ctxt->insn;
unsigned int bytes = 0;
+ enum mmio_type mmio;
enum es_result ret;
+ u8 sign_byte;
long *reg_data;
- switch (insn->opcode.bytes[0]) {
- /* MMIO Write */
- case 0x88:
- bytes = 1;
- fallthrough;
- case 0x89:
- if (!bytes)
- bytes = insn->opnd_bytes;
+ mmio = insn_decode_mmio(insn, &bytes);
+ if (mmio == MMIO_DECODE_FAILED)
+ return ES_DECODE_FAILED;
- reg_data = vc_insn_get_reg(ctxt);
+ if (mmio != MMIO_WRITE_IMM && mmio != MMIO_MOVS) {
+ reg_data = insn_get_modrm_reg_ptr(insn, ctxt->regs);
if (!reg_data)
return ES_DECODE_FAILED;
+ }
+ switch (mmio) {
+ case MMIO_WRITE:
memcpy(ghcb->shared_buffer, reg_data, bytes);
-
ret = vc_do_mmio(ghcb, ctxt, bytes, false);
break;
-
- case 0xc6:
- bytes = 1;
- fallthrough;
- case 0xc7:
- if (!bytes)
- bytes = insn->opnd_bytes;
-
+ case MMIO_WRITE_IMM:
memcpy(ghcb->shared_buffer, insn->immediate1.bytes, bytes);
-
ret = vc_do_mmio(ghcb, ctxt, bytes, false);
break;
-
- /* MMIO Read */
- case 0x8a:
- bytes = 1;
- fallthrough;
- case 0x8b:
- if (!bytes)
- bytes = insn->opnd_bytes;
-
+ case MMIO_READ:
ret = vc_do_mmio(ghcb, ctxt, bytes, true);
if (ret)
break;
- reg_data = vc_insn_get_reg(ctxt);
- if (!reg_data)
- return ES_DECODE_FAILED;
-
/* Zero-extend for 32-bit operation */
if (bytes == 4)
*reg_data = 0;
memcpy(reg_data, ghcb->shared_buffer, bytes);
break;
+ case MMIO_READ_ZERO_EXTEND:
+ ret = vc_do_mmio(ghcb, ctxt, bytes, true);
+ if (ret)
+ break;
- /* MOVS instruction */
- case 0xa4:
- bytes = 1;
- fallthrough;
- case 0xa5:
- if (!bytes)
- bytes = insn->opnd_bytes;
+ /* Zero extend based on operand size */
+ memset(reg_data, 0, insn->opnd_bytes);
+ memcpy(reg_data, ghcb->shared_buffer, bytes);
+ break;
+ case MMIO_READ_SIGN_EXTEND:
+ ret = vc_do_mmio(ghcb, ctxt, bytes, true);
+ if (ret)
+ break;
- ret = vc_handle_mmio_movs(ctxt, bytes);
+ if (bytes == 1) {
+ u8 *val = (u8 *)ghcb->shared_buffer;
+
+ sign_byte = (*val & 0x80) ? 0xff : 0x00;
+ } else {
+ u16 *val = (u16 *)ghcb->shared_buffer;
+
+ sign_byte = (*val & 0x8000) ? 0xff : 0x00;
+ }
+
+ /* Sign extend based on operand size */
+ memset(reg_data, sign_byte, insn->opnd_bytes);
+ memcpy(reg_data, ghcb->shared_buffer, bytes);
break;
- /* Two-Byte Opcodes */
- case 0x0f:
- ret = vc_handle_mmio_twobyte_ops(ghcb, ctxt);
+ case MMIO_MOVS:
+ ret = vc_handle_mmio_movs(ctxt, bytes);
break;
default:
ret = ES_UNSUPPORTED;
+ break;
}
return ret;
@@ -1432,7 +1337,7 @@ DEFINE_IDTENTRY_VC_KERNEL(exc_vmm_communication)
show_regs(regs);
/* Ask hypervisor to sev_es_terminate */
- sev_es_terminate(GHCB_SEV_ES_REASON_GENERAL_REQUEST);
+ sev_es_terminate(GHCB_SEV_ES_GEN_REQ);
/* If that fails and we get here - just panic */
panic("Returned from Terminate-Request to Hypervisor\n");
@@ -1480,7 +1385,7 @@ bool __init handle_vc_boot_ghcb(struct pt_regs *regs)
/* Do initial setup or terminate the guest */
if (unlikely(boot_ghcb == NULL && !sev_es_setup_ghcb()))
- sev_es_terminate(GHCB_SEV_ES_REASON_GENERAL_REQUEST);
+ sev_es_terminate(GHCB_SEV_ES_GEN_REQ);
vc_ghcb_invalidate(boot_ghcb);
diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c
index ac2909f0cab3..617012f4619f 100644
--- a/arch/x86/kernel/smpboot.c
+++ b/arch/x86/kernel/smpboot.c
@@ -579,6 +579,17 @@ static struct sched_domain_topology_level x86_numa_in_package_topology[] = {
{ NULL, },
};
+static struct sched_domain_topology_level x86_hybrid_topology[] = {
+#ifdef CONFIG_SCHED_SMT
+ { cpu_smt_mask, x86_smt_flags, SD_INIT_NAME(SMT) },
+#endif
+#ifdef CONFIG_SCHED_MC
+ { cpu_coregroup_mask, x86_core_flags, SD_INIT_NAME(MC) },
+#endif
+ { cpu_cpu_mask, SD_INIT_NAME(DIE) },
+ { NULL, },
+};
+
static struct sched_domain_topology_level x86_topology[] = {
#ifdef CONFIG_SCHED_SMT
{ cpu_smt_mask, x86_smt_flags, SD_INIT_NAME(SMT) },
@@ -1469,8 +1480,11 @@ void __init native_smp_cpus_done(unsigned int max_cpus)
calculate_max_logical_packages();
+ /* XXX for now assume numa-in-package and hybrid don't overlap */
if (x86_has_numa_in_package)
set_sched_topology(x86_numa_in_package_topology);
+ if (cpu_feature_enabled(X86_FEATURE_HYBRID_CPU))
+ set_sched_topology(x86_hybrid_topology);
nmi_selftest();
impress_friends();
diff --git a/arch/x86/kvm/debugfs.c b/arch/x86/kvm/debugfs.c
index 54a83a744538..f33c804a922a 100644
--- a/arch/x86/kvm/debugfs.c
+++ b/arch/x86/kvm/debugfs.c
@@ -95,6 +95,9 @@ static int kvm_mmu_rmaps_stat_show(struct seq_file *m, void *v)
unsigned int *log[KVM_NR_PAGE_SIZES], *cur;
int i, j, k, l, ret;
+ if (!kvm_memslots_have_rmaps(kvm))
+ return 0;
+
ret = -ENOMEM;
memset(log, 0, sizeof(log));
for (i = 0; i < KVM_NR_PAGE_SIZES; i++) {
diff --git a/arch/x86/kvm/hyperv.c b/arch/x86/kvm/hyperv.c
index 5e19e6e4c2ce..8d8c1cc7cb53 100644
--- a/arch/x86/kvm/hyperv.c
+++ b/arch/x86/kvm/hyperv.c
@@ -1922,11 +1922,13 @@ static u64 kvm_hv_send_ipi(struct kvm_vcpu *vcpu, struct kvm_hv_hcall *hc, bool
all_cpus = send_ipi_ex.vp_set.format == HV_GENERIC_SET_ALL;
+ if (all_cpus)
+ goto check_and_send_ipi;
+
if (!sparse_banks_len)
goto ret_success;
- if (!all_cpus &&
- kvm_read_guest(kvm,
+ if (kvm_read_guest(kvm,
hc->ingpa + offsetof(struct hv_send_ipi_ex,
vp_set.bank_contents),
sparse_banks,
@@ -1934,6 +1936,7 @@ static u64 kvm_hv_send_ipi(struct kvm_vcpu *vcpu, struct kvm_hv_hcall *hc, bool
return HV_STATUS_INVALID_HYPERCALL_INPUT;
}
+check_and_send_ipi:
if ((vector < HV_IPI_LOW_VECTOR) || (vector > HV_IPI_HIGH_VECTOR))
return HV_STATUS_INVALID_HYPERCALL_INPUT;
diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c
index e2e1d012df22..fcdf3f8bb59a 100644
--- a/arch/x86/kvm/mmu/mmu.c
+++ b/arch/x86/kvm/mmu/mmu.c
@@ -3987,7 +3987,21 @@ out_retry:
static bool is_page_fault_stale(struct kvm_vcpu *vcpu,
struct kvm_page_fault *fault, int mmu_seq)
{
- if (is_obsolete_sp(vcpu->kvm, to_shadow_page(vcpu->arch.mmu->root_hpa)))
+ struct kvm_mmu_page *sp = to_shadow_page(vcpu->arch.mmu->root_hpa);
+
+ /* Special roots, e.g. pae_root, are not backed by shadow pages. */
+ if (sp && is_obsolete_sp(vcpu->kvm, sp))
+ return true;
+
+ /*
+ * Roots without an associated shadow page are considered invalid if
+ * there is a pending request to free obsolete roots. The request is
+ * only a hint that the current root _may_ be obsolete and needs to be
+ * reloaded, e.g. if the guest frees a PGD that KVM is tracking as a
+ * previous root, then __kvm_mmu_prepare_zap_page() signals all vCPUs
+ * to reload even if no vCPU is actively using the root.
+ */
+ if (!sp && kvm_test_request(KVM_REQ_MMU_RELOAD, vcpu))
return true;
return fault->slot &&
diff --git a/arch/x86/kvm/mmu/spte.c b/arch/x86/kvm/mmu/spte.c
index 0c76c45fdb68..fad546df0bba 100644
--- a/arch/x86/kvm/mmu/spte.c
+++ b/arch/x86/kvm/mmu/spte.c
@@ -16,6 +16,7 @@
#include "spte.h"
#include <asm/e820/api.h>
+#include <asm/memtype.h>
#include <asm/vmx.h>
static bool __read_mostly enable_mmio_caching = true;
diff --git a/arch/x86/kvm/mmu/tdp_iter.c b/arch/x86/kvm/mmu/tdp_iter.c
index b3ed302c1a35..caa96c270b95 100644
--- a/arch/x86/kvm/mmu/tdp_iter.c
+++ b/arch/x86/kvm/mmu/tdp_iter.c
@@ -26,6 +26,7 @@ static gfn_t round_gfn_for_level(gfn_t gfn, int level)
*/
void tdp_iter_restart(struct tdp_iter *iter)
{
+ iter->yielded = false;
iter->yielded_gfn = iter->next_last_level_gfn;
iter->level = iter->root_level;
@@ -160,6 +161,11 @@ static bool try_step_up(struct tdp_iter *iter)
*/
void tdp_iter_next(struct tdp_iter *iter)
{
+ if (iter->yielded) {
+ tdp_iter_restart(iter);
+ return;
+ }
+
if (try_step_down(iter))
return;
diff --git a/arch/x86/kvm/mmu/tdp_iter.h b/arch/x86/kvm/mmu/tdp_iter.h
index b1748b988d3a..e19cabbcb65c 100644
--- a/arch/x86/kvm/mmu/tdp_iter.h
+++ b/arch/x86/kvm/mmu/tdp_iter.h
@@ -45,6 +45,12 @@ struct tdp_iter {
* iterator walks off the end of the paging structure.
*/
bool valid;
+ /*
+ * True if KVM dropped mmu_lock and yielded in the middle of a walk, in
+ * which case tdp_iter_next() needs to restart the walk at the root
+ * level instead of advancing to the next entry.
+ */
+ bool yielded;
};
/*
diff --git a/arch/x86/kvm/mmu/tdp_mmu.c b/arch/x86/kvm/mmu/tdp_mmu.c
index 1db8496259ad..1beb4ca90560 100644
--- a/arch/x86/kvm/mmu/tdp_mmu.c
+++ b/arch/x86/kvm/mmu/tdp_mmu.c
@@ -502,6 +502,8 @@ static inline bool tdp_mmu_set_spte_atomic(struct kvm *kvm,
struct tdp_iter *iter,
u64 new_spte)
{
+ WARN_ON_ONCE(iter->yielded);
+
lockdep_assert_held_read(&kvm->mmu_lock);
/*
@@ -575,6 +577,8 @@ static inline void __tdp_mmu_set_spte(struct kvm *kvm, struct tdp_iter *iter,
u64 new_spte, bool record_acc_track,
bool record_dirty_log)
{
+ WARN_ON_ONCE(iter->yielded);
+
lockdep_assert_held_write(&kvm->mmu_lock);
/*
@@ -640,18 +644,19 @@ static inline void tdp_mmu_set_spte_no_dirty_log(struct kvm *kvm,
* If this function should yield and flush is set, it will perform a remote
* TLB flush before yielding.
*
- * If this function yields, it will also reset the tdp_iter's walk over the
- * paging structure and the calling function should skip to the next
- * iteration to allow the iterator to continue its traversal from the
- * paging structure root.
+ * If this function yields, iter->yielded is set and the caller must skip to
+ * the next iteration, where tdp_iter_next() will reset the tdp_iter's walk
+ * over the paging structures to allow the iterator to continue its traversal
+ * from the paging structure root.
*
- * Return true if this function yielded and the iterator's traversal was reset.
- * Return false if a yield was not needed.
+ * Returns true if this function yielded.
*/
-static inline bool tdp_mmu_iter_cond_resched(struct kvm *kvm,
- struct tdp_iter *iter, bool flush,
- bool shared)
+static inline bool __must_check tdp_mmu_iter_cond_resched(struct kvm *kvm,
+ struct tdp_iter *iter,
+ bool flush, bool shared)
{
+ WARN_ON(iter->yielded);
+
/* Ensure forward progress has been made before yielding. */
if (iter->next_last_level_gfn == iter->yielded_gfn)
return false;
@@ -671,12 +676,10 @@ static inline bool tdp_mmu_iter_cond_resched(struct kvm *kvm,
WARN_ON(iter->gfn > iter->next_last_level_gfn);
- tdp_iter_restart(iter);
-
- return true;
+ iter->yielded = true;
}
- return false;
+ return iter->yielded;
}
/*
diff --git a/arch/x86/kvm/svm/sev.c b/arch/x86/kvm/svm/sev.c
index 7656a2c5662a..be2883141220 100644
--- a/arch/x86/kvm/svm/sev.c
+++ b/arch/x86/kvm/svm/sev.c
@@ -1565,7 +1565,7 @@ static int sev_lock_two_vms(struct kvm *dst_kvm, struct kvm *src_kvm)
r = -EINTR;
if (mutex_lock_killable(&dst_kvm->lock))
goto release_src;
- if (mutex_lock_killable(&src_kvm->lock))
+ if (mutex_lock_killable_nested(&src_kvm->lock, SINGLE_DEPTH_NESTING))
goto unlock_dst;
return 0;
diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c
index d0f68d11ec70..5151efa424ac 100644
--- a/arch/x86/kvm/svm/svm.c
+++ b/arch/x86/kvm/svm/svm.c
@@ -1585,6 +1585,15 @@ static void svm_set_rflags(struct kvm_vcpu *vcpu, unsigned long rflags)
to_svm(vcpu)->vmcb->save.rflags = rflags;
}
+static bool svm_get_if_flag(struct kvm_vcpu *vcpu)
+{
+ struct vmcb *vmcb = to_svm(vcpu)->vmcb;
+
+ return sev_es_guest(vcpu->kvm)
+ ? vmcb->control.int_state & SVM_GUEST_INTERRUPT_MASK
+ : kvm_get_rflags(vcpu) & X86_EFLAGS_IF;
+}
+
static void svm_cache_reg(struct kvm_vcpu *vcpu, enum kvm_reg reg)
{
switch (reg) {
@@ -3568,14 +3577,7 @@ bool svm_interrupt_blocked(struct kvm_vcpu *vcpu)
if (!gif_set(svm))
return true;
- if (sev_es_guest(vcpu->kvm)) {
- /*
- * SEV-ES guests to not expose RFLAGS. Use the VMCB interrupt mask
- * bit to determine the state of the IF flag.
- */
- if (!(vmcb->control.int_state & SVM_GUEST_INTERRUPT_MASK))
- return true;
- } else if (is_guest_mode(vcpu)) {
+ if (is_guest_mode(vcpu)) {
/* As long as interrupts are being delivered... */
if ((svm->nested.ctl.int_ctl & V_INTR_MASKING_MASK)
? !(svm->vmcb01.ptr->save.rflags & X86_EFLAGS_IF)
@@ -3586,7 +3588,7 @@ bool svm_interrupt_blocked(struct kvm_vcpu *vcpu)
if (nested_exit_on_intr(svm))
return false;
} else {
- if (!(kvm_get_rflags(vcpu) & X86_EFLAGS_IF))
+ if (!svm_get_if_flag(vcpu))
return true;
}
@@ -4621,6 +4623,7 @@ static struct kvm_x86_ops svm_x86_ops __initdata = {
.cache_reg = svm_cache_reg,
.get_rflags = svm_get_rflags,
.set_rflags = svm_set_rflags,
+ .get_if_flag = svm_get_if_flag,
.tlb_flush_all = svm_flush_tlb,
.tlb_flush_current = svm_flush_tlb,
diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c
index 9453743ce0c4..0dbf94eb954f 100644
--- a/arch/x86/kvm/vmx/vmx.c
+++ b/arch/x86/kvm/vmx/vmx.c
@@ -1363,6 +1363,11 @@ void vmx_set_rflags(struct kvm_vcpu *vcpu, unsigned long rflags)
vmx->emulation_required = vmx_emulation_required(vcpu);
}
+static bool vmx_get_if_flag(struct kvm_vcpu *vcpu)
+{
+ return vmx_get_rflags(vcpu) & X86_EFLAGS_IF;
+}
+
u32 vmx_get_interrupt_shadow(struct kvm_vcpu *vcpu)
{
u32 interruptibility = vmcs_read32(GUEST_INTERRUPTIBILITY_INFO);
@@ -2646,15 +2651,6 @@ int alloc_loaded_vmcs(struct loaded_vmcs *loaded_vmcs)
if (!loaded_vmcs->msr_bitmap)
goto out_vmcs;
memset(loaded_vmcs->msr_bitmap, 0xff, PAGE_SIZE);
-
- if (IS_ENABLED(CONFIG_HYPERV) &&
- static_branch_unlikely(&enable_evmcs) &&
- (ms_hyperv.nested_features & HV_X64_NESTED_MSR_BITMAP)) {
- struct hv_enlightened_vmcs *evmcs =
- (struct hv_enlightened_vmcs *)loaded_vmcs->vmcs;
-
- evmcs->hv_enlightenments_control.msr_bitmap = 1;
- }
}
memset(&loaded_vmcs->host_state, 0, sizeof(struct vmcs_host_state));
@@ -3968,8 +3964,7 @@ static int vmx_deliver_posted_interrupt(struct kvm_vcpu *vcpu, int vector)
if (pi_test_and_set_on(&vmx->pi_desc))
return 0;
- if (vcpu != kvm_get_running_vcpu() &&
- !kvm_vcpu_trigger_posted_interrupt(vcpu, false))
+ if (!kvm_vcpu_trigger_posted_interrupt(vcpu, false))
kvm_vcpu_kick(vcpu);
return 0;
@@ -5886,18 +5881,14 @@ static int __vmx_handle_exit(struct kvm_vcpu *vcpu, fastpath_t exit_fastpath)
vmx_flush_pml_buffer(vcpu);
/*
- * We should never reach this point with a pending nested VM-Enter, and
- * more specifically emulation of L2 due to invalid guest state (see
- * below) should never happen as that means we incorrectly allowed a
- * nested VM-Enter with an invalid vmcs12.
+ * KVM should never reach this point with a pending nested VM-Enter.
+ * More specifically, short-circuiting VM-Entry to emulate L2 due to
+ * invalid guest state should never happen as that means KVM knowingly
+ * allowed a nested VM-Enter with an invalid vmcs12. More below.
*/
if (KVM_BUG_ON(vmx->nested.nested_run_pending, vcpu->kvm))
return -EIO;
- /* If guest state is invalid, start emulating */
- if (vmx->emulation_required)
- return handle_invalid_guest_state(vcpu);
-
if (is_guest_mode(vcpu)) {
/*
* PML is never enabled when running L2, bail immediately if a
@@ -5919,10 +5910,30 @@ static int __vmx_handle_exit(struct kvm_vcpu *vcpu, fastpath_t exit_fastpath)
*/
nested_mark_vmcs12_pages_dirty(vcpu);
+ /*
+ * Synthesize a triple fault if L2 state is invalid. In normal
+ * operation, nested VM-Enter rejects any attempt to enter L2
+ * with invalid state. However, those checks are skipped if
+ * state is being stuffed via RSM or KVM_SET_NESTED_STATE. If
+ * L2 state is invalid, it means either L1 modified SMRAM state
+ * or userspace provided bad state. Synthesize TRIPLE_FAULT as
+ * doing so is architecturally allowed in the RSM case, and is
+ * the least awful solution for the userspace case without
+ * risking false positives.
+ */
+ if (vmx->emulation_required) {
+ nested_vmx_vmexit(vcpu, EXIT_REASON_TRIPLE_FAULT, 0, 0);
+ return 1;
+ }
+
if (nested_vmx_reflect_vmexit(vcpu))
return 1;
}
+ /* If guest state is invalid, start emulating. L2 is handled above. */
+ if (vmx->emulation_required)
+ return handle_invalid_guest_state(vcpu);
+
if (exit_reason.failed_vmentry) {
dump_vmcs(vcpu);
vcpu->run->exit_reason = KVM_EXIT_FAIL_ENTRY;
@@ -6617,9 +6628,7 @@ static fastpath_t vmx_vcpu_run(struct kvm_vcpu *vcpu)
* consistency check VM-Exit due to invalid guest state and bail.
*/
if (unlikely(vmx->emulation_required)) {
-
- /* We don't emulate invalid state of a nested guest */
- vmx->fail = is_guest_mode(vcpu);
+ vmx->fail = 0;
vmx->exit_reason.full = EXIT_REASON_INVALID_STATE;
vmx->exit_reason.failed_vmentry = 1;
@@ -6842,6 +6851,19 @@ static int vmx_create_vcpu(struct kvm_vcpu *vcpu)
if (err < 0)
goto free_pml;
+ /*
+ * Use Hyper-V 'Enlightened MSR Bitmap' feature when KVM runs as a
+ * nested (L1) hypervisor and Hyper-V in L0 supports it. Enable the
+ * feature only for vmcs01, KVM currently isn't equipped to realize any
+ * performance benefits from enabling it for vmcs02.
+ */
+ if (IS_ENABLED(CONFIG_HYPERV) && static_branch_unlikely(&enable_evmcs) &&
+ (ms_hyperv.nested_features & HV_X64_NESTED_MSR_BITMAP)) {
+ struct hv_enlightened_vmcs *evmcs = (void *)vmx->vmcs01.vmcs;
+
+ evmcs->hv_enlightenments_control.msr_bitmap = 1;
+ }
+
/* The MSR bitmap starts with all ones */
bitmap_fill(vmx->shadow_msr_intercept.read, MAX_POSSIBLE_PASSTHROUGH_MSRS);
bitmap_fill(vmx->shadow_msr_intercept.write, MAX_POSSIBLE_PASSTHROUGH_MSRS);
@@ -7575,6 +7597,7 @@ static struct kvm_x86_ops vmx_x86_ops __initdata = {
.cache_reg = vmx_cache_reg,
.get_rflags = vmx_get_rflags,
.set_rflags = vmx_set_rflags,
+ .get_if_flag = vmx_get_if_flag,
.tlb_flush_all = vmx_flush_tlb_all,
.tlb_flush_current = vmx_flush_tlb_current,
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index e0aa4dd53c7f..e50e97ac4408 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -890,7 +890,8 @@ int kvm_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0)
!load_pdptrs(vcpu, vcpu->arch.walk_mmu, kvm_read_cr3(vcpu)))
return 1;
- if (!(cr0 & X86_CR0_PG) && kvm_read_cr4_bits(vcpu, X86_CR4_PCIDE))
+ if (!(cr0 & X86_CR0_PG) &&
+ (is_64_bit_mode(vcpu) || kvm_read_cr4_bits(vcpu, X86_CR4_PCIDE)))
return 1;
static_call(kvm_x86_set_cr0)(vcpu, cr0);
@@ -1330,7 +1331,7 @@ static const u32 msrs_to_save_all[] = {
MSR_IA32_UMWAIT_CONTROL,
MSR_ARCH_PERFMON_FIXED_CTR0, MSR_ARCH_PERFMON_FIXED_CTR1,
- MSR_ARCH_PERFMON_FIXED_CTR0 + 2, MSR_ARCH_PERFMON_FIXED_CTR0 + 3,
+ MSR_ARCH_PERFMON_FIXED_CTR0 + 2,
MSR_CORE_PERF_FIXED_CTR_CTRL, MSR_CORE_PERF_GLOBAL_STATUS,
MSR_CORE_PERF_GLOBAL_CTRL, MSR_CORE_PERF_GLOBAL_OVF_CTRL,
MSR_ARCH_PERFMON_PERFCTR0, MSR_ARCH_PERFMON_PERFCTR1,
@@ -3412,7 +3413,7 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
if (!msr_info->host_initiated)
return 1;
- if (guest_cpuid_has(vcpu, X86_FEATURE_PDCM) && kvm_get_msr_feature(&msr_ent))
+ if (kvm_get_msr_feature(&msr_ent))
return 1;
if (data & ~msr_ent.data)
return 1;
@@ -7121,7 +7122,13 @@ static int emulator_pio_in(struct kvm_vcpu *vcpu, int size,
unsigned short port, void *val, unsigned int count)
{
if (vcpu->arch.pio.count) {
- /* Complete previous iteration. */
+ /*
+ * Complete a previous iteration that required userspace I/O.
+ * Note, @count isn't guaranteed to match pio.count as userspace
+ * can modify ECX before rerunning the vCPU. Ignore any such
+ * shenanigans as KVM doesn't support modifying the rep count,
+ * and the emulator ensures @count doesn't overflow the buffer.
+ */
} else {
int r = __emulator_pio_in(vcpu, size, port, count);
if (!r)
@@ -7130,7 +7137,6 @@ static int emulator_pio_in(struct kvm_vcpu *vcpu, int size,
/* Results already available, fall through. */
}
- WARN_ON(count != vcpu->arch.pio.count);
complete_emulator_pio_in(vcpu, val);
return 1;
}
@@ -8995,14 +9001,7 @@ static void post_kvm_run_save(struct kvm_vcpu *vcpu)
{
struct kvm_run *kvm_run = vcpu->run;
- /*
- * if_flag is obsolete and useless, so do not bother
- * setting it for SEV-ES guests. Userspace can just
- * use kvm_run->ready_for_interrupt_injection.
- */
- kvm_run->if_flag = !vcpu->arch.guest_state_protected
- && (kvm_get_rflags(vcpu) & X86_EFLAGS_IF) != 0;
-
+ kvm_run->if_flag = static_call(kvm_x86_get_if_flag)(vcpu);
kvm_run->cr8 = kvm_get_cr8(vcpu);
kvm_run->apic_base = kvm_get_apic_base(vcpu);
diff --git a/arch/x86/lib/insn-eval.c b/arch/x86/lib/insn-eval.c
index eb3ccffb9b9d..53e57ef5925c 100644
--- a/arch/x86/lib/insn-eval.c
+++ b/arch/x86/lib/insn-eval.c
@@ -37,8 +37,6 @@ enum reg_type {
*/
static bool is_string_insn(struct insn *insn)
{
- insn_get_opcode(insn);
-
/* All string instructions have a 1-byte opcode. */
if (insn->opcode.nbytes != 1)
return false;
@@ -851,6 +849,26 @@ int insn_get_modrm_reg_off(struct insn *insn, struct pt_regs *regs)
}
/**
+ * insn_get_modrm_reg_ptr() - Obtain register pointer based on ModRM byte
+ * @insn: Instruction containing the ModRM byte
+ * @regs: Register values as seen when entering kernel mode
+ *
+ * Returns:
+ *
+ * The register indicated by the reg part of the ModRM byte.
+ * The register is obtained as a pointer within pt_regs.
+ */
+unsigned long *insn_get_modrm_reg_ptr(struct insn *insn, struct pt_regs *regs)
+{
+ int offset;
+
+ offset = insn_get_modrm_reg_off(insn, regs);
+ if (offset < 0)
+ return NULL;
+ return (void *)regs + offset;
+}
+
+/**
* get_seg_base_limit() - obtain base address and limit of a segment
* @insn: Instruction. Must be valid.
* @regs: Register values as seen when entering kernel mode
@@ -1405,6 +1423,9 @@ void __user *insn_get_addr_ref(struct insn *insn, struct pt_regs *regs)
if (!insn || !regs)
return (void __user *)-1L;
+ if (insn_get_opcode(insn))
+ return (void __user *)-1L;
+
switch (insn->addr_bytes) {
case 2:
return get_addr_ref_16(insn, regs);
@@ -1539,3 +1560,87 @@ bool insn_decode_from_regs(struct insn *insn, struct pt_regs *regs,
return true;
}
+
+/**
+ * insn_decode_mmio() - Decode a MMIO instruction
+ * @insn: Structure to store decoded instruction
+ * @bytes: Returns size of memory operand
+ *
+ * Decodes instruction that used for Memory-mapped I/O.
+ *
+ * Returns:
+ *
+ * Type of the instruction. Size of the memory operand is stored in
+ * @bytes. If decode failed, MMIO_DECODE_FAILED returned.
+ */
+enum mmio_type insn_decode_mmio(struct insn *insn, int *bytes)
+{
+ enum mmio_type type = MMIO_DECODE_FAILED;
+
+ *bytes = 0;
+
+ if (insn_get_opcode(insn))
+ return MMIO_DECODE_FAILED;
+
+ switch (insn->opcode.bytes[0]) {
+ case 0x88: /* MOV m8,r8 */
+ *bytes = 1;
+ fallthrough;
+ case 0x89: /* MOV m16/m32/m64, r16/m32/m64 */
+ if (!*bytes)
+ *bytes = insn->opnd_bytes;
+ type = MMIO_WRITE;
+ break;
+
+ case 0xc6: /* MOV m8, imm8 */
+ *bytes = 1;
+ fallthrough;
+ case 0xc7: /* MOV m16/m32/m64, imm16/imm32/imm64 */
+ if (!*bytes)
+ *bytes = insn->opnd_bytes;
+ type = MMIO_WRITE_IMM;
+ break;
+
+ case 0x8a: /* MOV r8, m8 */
+ *bytes = 1;
+ fallthrough;
+ case 0x8b: /* MOV r16/r32/r64, m16/m32/m64 */
+ if (!*bytes)
+ *bytes = insn->opnd_bytes;
+ type = MMIO_READ;
+ break;
+
+ case 0xa4: /* MOVS m8, m8 */
+ *bytes = 1;
+ fallthrough;
+ case 0xa5: /* MOVS m16/m32/m64, m16/m32/m64 */
+ if (!*bytes)
+ *bytes = insn->opnd_bytes;
+ type = MMIO_MOVS;
+ break;
+
+ case 0x0f: /* Two-byte instruction */
+ switch (insn->opcode.bytes[1]) {
+ case 0xb6: /* MOVZX r16/r32/r64, m8 */
+ *bytes = 1;
+ fallthrough;
+ case 0xb7: /* MOVZX r32/r64, m16 */
+ if (!*bytes)
+ *bytes = 2;
+ type = MMIO_READ_ZERO_EXTEND;
+ break;
+
+ case 0xbe: /* MOVSX r16/r32/r64, m8 */
+ *bytes = 1;
+ fallthrough;
+ case 0xbf: /* MOVSX r32/r64, m16 */
+ if (!*bytes)
+ *bytes = 2;
+ type = MMIO_READ_SIGN_EXTEND;
+ break;
+ }
+ break;
+ }
+
+ return type;
+}
diff --git a/arch/x86/mm/Makefile b/arch/x86/mm/Makefile
index 5864219221ca..fe3d3061fc11 100644
--- a/arch/x86/mm/Makefile
+++ b/arch/x86/mm/Makefile
@@ -2,9 +2,11 @@
# Kernel does not boot with instrumentation of tlb.c and mem_encrypt*.c
KCOV_INSTRUMENT_tlb.o := n
KCOV_INSTRUMENT_mem_encrypt.o := n
+KCOV_INSTRUMENT_mem_encrypt_amd.o := n
KCOV_INSTRUMENT_mem_encrypt_identity.o := n
KASAN_SANITIZE_mem_encrypt.o := n
+KASAN_SANITIZE_mem_encrypt_amd.o := n
KASAN_SANITIZE_mem_encrypt_identity.o := n
# Disable KCSAN entirely, because otherwise we get warnings that some functions
@@ -13,6 +15,7 @@ KCSAN_SANITIZE := n
ifdef CONFIG_FUNCTION_TRACER
CFLAGS_REMOVE_mem_encrypt.o = -pg
+CFLAGS_REMOVE_mem_encrypt_amd.o = -pg
CFLAGS_REMOVE_mem_encrypt_identity.o = -pg
endif
@@ -52,6 +55,8 @@ obj-$(CONFIG_X86_INTEL_MEMORY_PROTECTION_KEYS) += pkeys.o
obj-$(CONFIG_RANDOMIZE_MEMORY) += kaslr.o
obj-$(CONFIG_PAGE_TABLE_ISOLATION) += pti.o
-obj-$(CONFIG_AMD_MEM_ENCRYPT) += mem_encrypt.o
+obj-$(CONFIG_X86_MEM_ENCRYPT) += mem_encrypt.o
+obj-$(CONFIG_AMD_MEM_ENCRYPT) += mem_encrypt_amd.o
+
obj-$(CONFIG_AMD_MEM_ENCRYPT) += mem_encrypt_identity.o
obj-$(CONFIG_AMD_MEM_ENCRYPT) += mem_encrypt_boot.o
diff --git a/arch/x86/mm/init.c b/arch/x86/mm/init.c
index 1895986842b9..4ba024d5b63a 100644
--- a/arch/x86/mm/init.c
+++ b/arch/x86/mm/init.c
@@ -714,6 +714,11 @@ static void __init memory_map_bottom_up(unsigned long map_start,
static void __init init_trampoline(void)
{
#ifdef CONFIG_X86_64
+ /*
+ * The code below will alias kernel page-tables in the user-range of the
+ * address space, including the Global bit. So global TLB entries will
+ * be created when using the trampoline page-table.
+ */
if (!kaslr_memory_enabled())
trampoline_pgd_entry = init_top_pgt[pgd_index(__PAGE_OFFSET)];
else
diff --git a/arch/x86/mm/mem_encrypt.c b/arch/x86/mm/mem_encrypt.c
index 35487305d8af..50d209939c66 100644
--- a/arch/x86/mm/mem_encrypt.c
+++ b/arch/x86/mm/mem_encrypt.c
@@ -1,419 +1,18 @@
// SPDX-License-Identifier: GPL-2.0-only
/*
- * AMD Memory Encryption Support
+ * Memory Encryption Support Common Code
*
* Copyright (C) 2016 Advanced Micro Devices, Inc.
*
* Author: Tom Lendacky <thomas.lendacky@amd.com>
*/
-#define DISABLE_BRANCH_PROFILING
-
-#include <linux/linkage.h>
-#include <linux/init.h>
-#include <linux/mm.h>
#include <linux/dma-direct.h>
+#include <linux/dma-mapping.h>
#include <linux/swiotlb.h>
+#include <linux/cc_platform.h>
#include <linux/mem_encrypt.h>
-#include <linux/device.h>
-#include <linux/kernel.h>
-#include <linux/bitops.h>
-#include <linux/dma-mapping.h>
#include <linux/virtio_config.h>
-#include <linux/cc_platform.h>
-
-#include <asm/tlbflush.h>
-#include <asm/fixmap.h>
-#include <asm/setup.h>
-#include <asm/bootparam.h>
-#include <asm/set_memory.h>
-#include <asm/cacheflush.h>
-#include <asm/processor-flags.h>
-#include <asm/msr.h>
-#include <asm/cmdline.h>
-
-#include "mm_internal.h"
-
-/*
- * Since SME related variables are set early in the boot process they must
- * reside in the .data section so as not to be zeroed out when the .bss
- * section is later cleared.
- */
-u64 sme_me_mask __section(".data") = 0;
-u64 sev_status __section(".data") = 0;
-u64 sev_check_data __section(".data") = 0;
-EXPORT_SYMBOL(sme_me_mask);
-DEFINE_STATIC_KEY_FALSE(sev_enable_key);
-EXPORT_SYMBOL_GPL(sev_enable_key);
-
-/* Buffer used for early in-place encryption by BSP, no locking needed */
-static char sme_early_buffer[PAGE_SIZE] __initdata __aligned(PAGE_SIZE);
-
-/*
- * This routine does not change the underlying encryption setting of the
- * page(s) that map this memory. It assumes that eventually the memory is
- * meant to be accessed as either encrypted or decrypted but the contents
- * are currently not in the desired state.
- *
- * This routine follows the steps outlined in the AMD64 Architecture
- * Programmer's Manual Volume 2, Section 7.10.8 Encrypt-in-Place.
- */
-static void __init __sme_early_enc_dec(resource_size_t paddr,
- unsigned long size, bool enc)
-{
- void *src, *dst;
- size_t len;
-
- if (!sme_me_mask)
- return;
-
- wbinvd();
-
- /*
- * There are limited number of early mapping slots, so map (at most)
- * one page at time.
- */
- while (size) {
- len = min_t(size_t, sizeof(sme_early_buffer), size);
-
- /*
- * Create mappings for the current and desired format of
- * the memory. Use a write-protected mapping for the source.
- */
- src = enc ? early_memremap_decrypted_wp(paddr, len) :
- early_memremap_encrypted_wp(paddr, len);
-
- dst = enc ? early_memremap_encrypted(paddr, len) :
- early_memremap_decrypted(paddr, len);
-
- /*
- * If a mapping can't be obtained to perform the operation,
- * then eventual access of that area in the desired mode
- * will cause a crash.
- */
- BUG_ON(!src || !dst);
-
- /*
- * Use a temporary buffer, of cache-line multiple size, to
- * avoid data corruption as documented in the APM.
- */
- memcpy(sme_early_buffer, src, len);
- memcpy(dst, sme_early_buffer, len);
-
- early_memunmap(dst, len);
- early_memunmap(src, len);
-
- paddr += len;
- size -= len;
- }
-}
-
-void __init sme_early_encrypt(resource_size_t paddr, unsigned long size)
-{
- __sme_early_enc_dec(paddr, size, true);
-}
-
-void __init sme_early_decrypt(resource_size_t paddr, unsigned long size)
-{
- __sme_early_enc_dec(paddr, size, false);
-}
-
-static void __init __sme_early_map_unmap_mem(void *vaddr, unsigned long size,
- bool map)
-{
- unsigned long paddr = (unsigned long)vaddr - __PAGE_OFFSET;
- pmdval_t pmd_flags, pmd;
-
- /* Use early_pmd_flags but remove the encryption mask */
- pmd_flags = __sme_clr(early_pmd_flags);
-
- do {
- pmd = map ? (paddr & PMD_MASK) + pmd_flags : 0;
- __early_make_pgtable((unsigned long)vaddr, pmd);
-
- vaddr += PMD_SIZE;
- paddr += PMD_SIZE;
- size = (size <= PMD_SIZE) ? 0 : size - PMD_SIZE;
- } while (size);
-
- flush_tlb_local();
-}
-
-void __init sme_unmap_bootdata(char *real_mode_data)
-{
- struct boot_params *boot_data;
- unsigned long cmdline_paddr;
-
- if (!cc_platform_has(CC_ATTR_HOST_MEM_ENCRYPT))
- return;
-
- /* Get the command line address before unmapping the real_mode_data */
- boot_data = (struct boot_params *)real_mode_data;
- cmdline_paddr = boot_data->hdr.cmd_line_ptr | ((u64)boot_data->ext_cmd_line_ptr << 32);
-
- __sme_early_map_unmap_mem(real_mode_data, sizeof(boot_params), false);
-
- if (!cmdline_paddr)
- return;
-
- __sme_early_map_unmap_mem(__va(cmdline_paddr), COMMAND_LINE_SIZE, false);
-}
-
-void __init sme_map_bootdata(char *real_mode_data)
-{
- struct boot_params *boot_data;
- unsigned long cmdline_paddr;
-
- if (!cc_platform_has(CC_ATTR_HOST_MEM_ENCRYPT))
- return;
-
- __sme_early_map_unmap_mem(real_mode_data, sizeof(boot_params), true);
-
- /* Get the command line address after mapping the real_mode_data */
- boot_data = (struct boot_params *)real_mode_data;
- cmdline_paddr = boot_data->hdr.cmd_line_ptr | ((u64)boot_data->ext_cmd_line_ptr << 32);
-
- if (!cmdline_paddr)
- return;
-
- __sme_early_map_unmap_mem(__va(cmdline_paddr), COMMAND_LINE_SIZE, true);
-}
-
-void __init sme_early_init(void)
-{
- unsigned int i;
-
- if (!sme_me_mask)
- return;
-
- early_pmd_flags = __sme_set(early_pmd_flags);
-
- __supported_pte_mask = __sme_set(__supported_pte_mask);
-
- /* Update the protection map with memory encryption mask */
- for (i = 0; i < ARRAY_SIZE(protection_map); i++)
- protection_map[i] = pgprot_encrypted(protection_map[i]);
-
- if (cc_platform_has(CC_ATTR_GUEST_MEM_ENCRYPT))
- swiotlb_force = SWIOTLB_FORCE;
-}
-
-void __init sev_setup_arch(void)
-{
- phys_addr_t total_mem = memblock_phys_mem_size();
- unsigned long size;
-
- if (!cc_platform_has(CC_ATTR_GUEST_MEM_ENCRYPT))
- return;
-
- /*
- * For SEV, all DMA has to occur via shared/unencrypted pages.
- * SEV uses SWIOTLB to make this happen without changing device
- * drivers. However, depending on the workload being run, the
- * default 64MB of SWIOTLB may not be enough and SWIOTLB may
- * run out of buffers for DMA, resulting in I/O errors and/or
- * performance degradation especially with high I/O workloads.
- *
- * Adjust the default size of SWIOTLB for SEV guests using
- * a percentage of guest memory for SWIOTLB buffers.
- * Also, as the SWIOTLB bounce buffer memory is allocated
- * from low memory, ensure that the adjusted size is within
- * the limits of low available memory.
- *
- * The percentage of guest memory used here for SWIOTLB buffers
- * is more of an approximation of the static adjustment which
- * 64MB for <1G, and ~128M to 256M for 1G-to-4G, i.e., the 6%
- */
- size = total_mem * 6 / 100;
- size = clamp_val(size, IO_TLB_DEFAULT_SIZE, SZ_1G);
- swiotlb_adjust_size(size);
-}
-
-static unsigned long pg_level_to_pfn(int level, pte_t *kpte, pgprot_t *ret_prot)
-{
- unsigned long pfn = 0;
- pgprot_t prot;
-
- switch (level) {
- case PG_LEVEL_4K:
- pfn = pte_pfn(*kpte);
- prot = pte_pgprot(*kpte);
- break;
- case PG_LEVEL_2M:
- pfn = pmd_pfn(*(pmd_t *)kpte);
- prot = pmd_pgprot(*(pmd_t *)kpte);
- break;
- case PG_LEVEL_1G:
- pfn = pud_pfn(*(pud_t *)kpte);
- prot = pud_pgprot(*(pud_t *)kpte);
- break;
- default:
- WARN_ONCE(1, "Invalid level for kpte\n");
- return 0;
- }
-
- if (ret_prot)
- *ret_prot = prot;
-
- return pfn;
-}
-
-void notify_range_enc_status_changed(unsigned long vaddr, int npages, bool enc)
-{
-#ifdef CONFIG_PARAVIRT
- unsigned long sz = npages << PAGE_SHIFT;
- unsigned long vaddr_end = vaddr + sz;
-
- while (vaddr < vaddr_end) {
- int psize, pmask, level;
- unsigned long pfn;
- pte_t *kpte;
-
- kpte = lookup_address(vaddr, &level);
- if (!kpte || pte_none(*kpte)) {
- WARN_ONCE(1, "kpte lookup for vaddr\n");
- return;
- }
-
- pfn = pg_level_to_pfn(level, kpte, NULL);
- if (!pfn)
- continue;
-
- psize = page_level_size(level);
- pmask = page_level_mask(level);
-
- notify_page_enc_status_changed(pfn, psize >> PAGE_SHIFT, enc);
-
- vaddr = (vaddr & pmask) + psize;
- }
-#endif
-}
-
-static void __init __set_clr_pte_enc(pte_t *kpte, int level, bool enc)
-{
- pgprot_t old_prot, new_prot;
- unsigned long pfn, pa, size;
- pte_t new_pte;
-
- pfn = pg_level_to_pfn(level, kpte, &old_prot);
- if (!pfn)
- return;
-
- new_prot = old_prot;
- if (enc)
- pgprot_val(new_prot) |= _PAGE_ENC;
- else
- pgprot_val(new_prot) &= ~_PAGE_ENC;
-
- /* If prot is same then do nothing. */
- if (pgprot_val(old_prot) == pgprot_val(new_prot))
- return;
-
- pa = pfn << PAGE_SHIFT;
- size = page_level_size(level);
-
- /*
- * We are going to perform in-place en-/decryption and change the
- * physical page attribute from C=1 to C=0 or vice versa. Flush the
- * caches to ensure that data gets accessed with the correct C-bit.
- */
- clflush_cache_range(__va(pa), size);
-
- /* Encrypt/decrypt the contents in-place */
- if (enc)
- sme_early_encrypt(pa, size);
- else
- sme_early_decrypt(pa, size);
-
- /* Change the page encryption mask. */
- new_pte = pfn_pte(pfn, new_prot);
- set_pte_atomic(kpte, new_pte);
-}
-
-static int __init early_set_memory_enc_dec(unsigned long vaddr,
- unsigned long size, bool enc)
-{
- unsigned long vaddr_end, vaddr_next, start;
- unsigned long psize, pmask;
- int split_page_size_mask;
- int level, ret;
- pte_t *kpte;
-
- start = vaddr;
- vaddr_next = vaddr;
- vaddr_end = vaddr + size;
-
- for (; vaddr < vaddr_end; vaddr = vaddr_next) {
- kpte = lookup_address(vaddr, &level);
- if (!kpte || pte_none(*kpte)) {
- ret = 1;
- goto out;
- }
-
- if (level == PG_LEVEL_4K) {
- __set_clr_pte_enc(kpte, level, enc);
- vaddr_next = (vaddr & PAGE_MASK) + PAGE_SIZE;
- continue;
- }
-
- psize = page_level_size(level);
- pmask = page_level_mask(level);
-
- /*
- * Check whether we can change the large page in one go.
- * We request a split when the address is not aligned and
- * the number of pages to set/clear encryption bit is smaller
- * than the number of pages in the large page.
- */
- if (vaddr == (vaddr & pmask) &&
- ((vaddr_end - vaddr) >= psize)) {
- __set_clr_pte_enc(kpte, level, enc);
- vaddr_next = (vaddr & pmask) + psize;
- continue;
- }
-
- /*
- * The virtual address is part of a larger page, create the next
- * level page table mapping (4K or 2M). If it is part of a 2M
- * page then we request a split of the large page into 4K
- * chunks. A 1GB large page is split into 2M pages, resp.
- */
- if (level == PG_LEVEL_2M)
- split_page_size_mask = 0;
- else
- split_page_size_mask = 1 << PG_LEVEL_2M;
-
- /*
- * kernel_physical_mapping_change() does not flush the TLBs, so
- * a TLB flush is required after we exit from the for loop.
- */
- kernel_physical_mapping_change(__pa(vaddr & pmask),
- __pa((vaddr_end & pmask) + psize),
- split_page_size_mask);
- }
-
- ret = 0;
-
- notify_range_enc_status_changed(start, PAGE_ALIGN(size) >> PAGE_SHIFT, enc);
-out:
- __flush_tlb_all();
- return ret;
-}
-
-int __init early_set_memory_decrypted(unsigned long vaddr, unsigned long size)
-{
- return early_set_memory_enc_dec(vaddr, size, false);
-}
-
-int __init early_set_memory_encrypted(unsigned long vaddr, unsigned long size)
-{
- return early_set_memory_enc_dec(vaddr, size, true);
-}
-
-void __init early_set_mem_enc_dec_hypercall(unsigned long vaddr, int npages, bool enc)
-{
- notify_range_enc_status_changed(vaddr, npages, enc);
-}
/* Override for DMA direct allocation check - ARCH_HAS_FORCE_DMA_UNENCRYPTED */
bool force_dma_unencrypted(struct device *dev)
@@ -441,30 +40,6 @@ bool force_dma_unencrypted(struct device *dev)
return false;
}
-void __init mem_encrypt_free_decrypted_mem(void)
-{
- unsigned long vaddr, vaddr_end, npages;
- int r;
-
- vaddr = (unsigned long)__start_bss_decrypted_unused;
- vaddr_end = (unsigned long)__end_bss_decrypted;
- npages = (vaddr_end - vaddr) >> PAGE_SHIFT;
-
- /*
- * The unused memory range was mapped decrypted, change the encryption
- * attribute from decrypted to encrypted before freeing it.
- */
- if (cc_platform_has(CC_ATTR_MEM_ENCRYPT)) {
- r = set_memory_encrypted(vaddr, npages);
- if (r) {
- pr_warn("failed to free unused decrypted pages\n");
- return;
- }
- }
-
- free_init_pages("unused decrypted", vaddr, vaddr_end);
-}
-
static void print_mem_encrypt_feature_info(void)
{
pr_info("AMD Memory Encryption Features active:");
@@ -493,20 +68,12 @@ static void print_mem_encrypt_feature_info(void)
/* Architecture __weak replacement functions */
void __init mem_encrypt_init(void)
{
- if (!sme_me_mask)
+ if (!cc_platform_has(CC_ATTR_MEM_ENCRYPT))
return;
/* Call into SWIOTLB to update the SWIOTLB DMA buffers */
swiotlb_update_mem_attributes();
- /*
- * With SEV, we need to unroll the rep string I/O instructions,
- * but SEV-ES supports them through the #VC handler.
- */
- if (cc_platform_has(CC_ATTR_GUEST_MEM_ENCRYPT) &&
- !cc_platform_has(CC_ATTR_GUEST_STATE_ENCRYPT))
- static_branch_enable(&sev_enable_key);
-
print_mem_encrypt_feature_info();
}
diff --git a/arch/x86/mm/mem_encrypt_amd.c b/arch/x86/mm/mem_encrypt_amd.c
new file mode 100644
index 000000000000..2b2d018ea345
--- /dev/null
+++ b/arch/x86/mm/mem_encrypt_amd.c
@@ -0,0 +1,438 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * AMD Memory Encryption Support
+ *
+ * Copyright (C) 2016 Advanced Micro Devices, Inc.
+ *
+ * Author: Tom Lendacky <thomas.lendacky@amd.com>
+ */
+
+#define DISABLE_BRANCH_PROFILING
+
+#include <linux/linkage.h>
+#include <linux/init.h>
+#include <linux/mm.h>
+#include <linux/dma-direct.h>
+#include <linux/swiotlb.h>
+#include <linux/mem_encrypt.h>
+#include <linux/device.h>
+#include <linux/kernel.h>
+#include <linux/bitops.h>
+#include <linux/dma-mapping.h>
+#include <linux/virtio_config.h>
+#include <linux/cc_platform.h>
+
+#include <asm/tlbflush.h>
+#include <asm/fixmap.h>
+#include <asm/setup.h>
+#include <asm/bootparam.h>
+#include <asm/set_memory.h>
+#include <asm/cacheflush.h>
+#include <asm/processor-flags.h>
+#include <asm/msr.h>
+#include <asm/cmdline.h>
+
+#include "mm_internal.h"
+
+/*
+ * Since SME related variables are set early in the boot process they must
+ * reside in the .data section so as not to be zeroed out when the .bss
+ * section is later cleared.
+ */
+u64 sme_me_mask __section(".data") = 0;
+u64 sev_status __section(".data") = 0;
+u64 sev_check_data __section(".data") = 0;
+EXPORT_SYMBOL(sme_me_mask);
+
+/* Buffer used for early in-place encryption by BSP, no locking needed */
+static char sme_early_buffer[PAGE_SIZE] __initdata __aligned(PAGE_SIZE);
+
+/*
+ * This routine does not change the underlying encryption setting of the
+ * page(s) that map this memory. It assumes that eventually the memory is
+ * meant to be accessed as either encrypted or decrypted but the contents
+ * are currently not in the desired state.
+ *
+ * This routine follows the steps outlined in the AMD64 Architecture
+ * Programmer's Manual Volume 2, Section 7.10.8 Encrypt-in-Place.
+ */
+static void __init __sme_early_enc_dec(resource_size_t paddr,
+ unsigned long size, bool enc)
+{
+ void *src, *dst;
+ size_t len;
+
+ if (!sme_me_mask)
+ return;
+
+ wbinvd();
+
+ /*
+ * There are limited number of early mapping slots, so map (at most)
+ * one page at time.
+ */
+ while (size) {
+ len = min_t(size_t, sizeof(sme_early_buffer), size);
+
+ /*
+ * Create mappings for the current and desired format of
+ * the memory. Use a write-protected mapping for the source.
+ */
+ src = enc ? early_memremap_decrypted_wp(paddr, len) :
+ early_memremap_encrypted_wp(paddr, len);
+
+ dst = enc ? early_memremap_encrypted(paddr, len) :
+ early_memremap_decrypted(paddr, len);
+
+ /*
+ * If a mapping can't be obtained to perform the operation,
+ * then eventual access of that area in the desired mode
+ * will cause a crash.
+ */
+ BUG_ON(!src || !dst);
+
+ /*
+ * Use a temporary buffer, of cache-line multiple size, to
+ * avoid data corruption as documented in the APM.
+ */
+ memcpy(sme_early_buffer, src, len);
+ memcpy(dst, sme_early_buffer, len);
+
+ early_memunmap(dst, len);
+ early_memunmap(src, len);
+
+ paddr += len;
+ size -= len;
+ }
+}
+
+void __init sme_early_encrypt(resource_size_t paddr, unsigned long size)
+{
+ __sme_early_enc_dec(paddr, size, true);
+}
+
+void __init sme_early_decrypt(resource_size_t paddr, unsigned long size)
+{
+ __sme_early_enc_dec(paddr, size, false);
+}
+
+static void __init __sme_early_map_unmap_mem(void *vaddr, unsigned long size,
+ bool map)
+{
+ unsigned long paddr = (unsigned long)vaddr - __PAGE_OFFSET;
+ pmdval_t pmd_flags, pmd;
+
+ /* Use early_pmd_flags but remove the encryption mask */
+ pmd_flags = __sme_clr(early_pmd_flags);
+
+ do {
+ pmd = map ? (paddr & PMD_MASK) + pmd_flags : 0;
+ __early_make_pgtable((unsigned long)vaddr, pmd);
+
+ vaddr += PMD_SIZE;
+ paddr += PMD_SIZE;
+ size = (size <= PMD_SIZE) ? 0 : size - PMD_SIZE;
+ } while (size);
+
+ flush_tlb_local();
+}
+
+void __init sme_unmap_bootdata(char *real_mode_data)
+{
+ struct boot_params *boot_data;
+ unsigned long cmdline_paddr;
+
+ if (!cc_platform_has(CC_ATTR_HOST_MEM_ENCRYPT))
+ return;
+
+ /* Get the command line address before unmapping the real_mode_data */
+ boot_data = (struct boot_params *)real_mode_data;
+ cmdline_paddr = boot_data->hdr.cmd_line_ptr | ((u64)boot_data->ext_cmd_line_ptr << 32);
+
+ __sme_early_map_unmap_mem(real_mode_data, sizeof(boot_params), false);
+
+ if (!cmdline_paddr)
+ return;
+
+ __sme_early_map_unmap_mem(__va(cmdline_paddr), COMMAND_LINE_SIZE, false);
+}
+
+void __init sme_map_bootdata(char *real_mode_data)
+{
+ struct boot_params *boot_data;
+ unsigned long cmdline_paddr;
+
+ if (!cc_platform_has(CC_ATTR_HOST_MEM_ENCRYPT))
+ return;
+
+ __sme_early_map_unmap_mem(real_mode_data, sizeof(boot_params), true);
+
+ /* Get the command line address after mapping the real_mode_data */
+ boot_data = (struct boot_params *)real_mode_data;
+ cmdline_paddr = boot_data->hdr.cmd_line_ptr | ((u64)boot_data->ext_cmd_line_ptr << 32);
+
+ if (!cmdline_paddr)
+ return;
+
+ __sme_early_map_unmap_mem(__va(cmdline_paddr), COMMAND_LINE_SIZE, true);
+}
+
+void __init sme_early_init(void)
+{
+ unsigned int i;
+
+ if (!sme_me_mask)
+ return;
+
+ early_pmd_flags = __sme_set(early_pmd_flags);
+
+ __supported_pte_mask = __sme_set(__supported_pte_mask);
+
+ /* Update the protection map with memory encryption mask */
+ for (i = 0; i < ARRAY_SIZE(protection_map); i++)
+ protection_map[i] = pgprot_encrypted(protection_map[i]);
+
+ if (cc_platform_has(CC_ATTR_GUEST_MEM_ENCRYPT))
+ swiotlb_force = SWIOTLB_FORCE;
+}
+
+void __init sev_setup_arch(void)
+{
+ phys_addr_t total_mem = memblock_phys_mem_size();
+ unsigned long size;
+
+ if (!cc_platform_has(CC_ATTR_GUEST_MEM_ENCRYPT))
+ return;
+
+ /*
+ * For SEV, all DMA has to occur via shared/unencrypted pages.
+ * SEV uses SWIOTLB to make this happen without changing device
+ * drivers. However, depending on the workload being run, the
+ * default 64MB of SWIOTLB may not be enough and SWIOTLB may
+ * run out of buffers for DMA, resulting in I/O errors and/or
+ * performance degradation especially with high I/O workloads.
+ *
+ * Adjust the default size of SWIOTLB for SEV guests using
+ * a percentage of guest memory for SWIOTLB buffers.
+ * Also, as the SWIOTLB bounce buffer memory is allocated
+ * from low memory, ensure that the adjusted size is within
+ * the limits of low available memory.
+ *
+ * The percentage of guest memory used here for SWIOTLB buffers
+ * is more of an approximation of the static adjustment which
+ * 64MB for <1G, and ~128M to 256M for 1G-to-4G, i.e., the 6%
+ */
+ size = total_mem * 6 / 100;
+ size = clamp_val(size, IO_TLB_DEFAULT_SIZE, SZ_1G);
+ swiotlb_adjust_size(size);
+}
+
+static unsigned long pg_level_to_pfn(int level, pte_t *kpte, pgprot_t *ret_prot)
+{
+ unsigned long pfn = 0;
+ pgprot_t prot;
+
+ switch (level) {
+ case PG_LEVEL_4K:
+ pfn = pte_pfn(*kpte);
+ prot = pte_pgprot(*kpte);
+ break;
+ case PG_LEVEL_2M:
+ pfn = pmd_pfn(*(pmd_t *)kpte);
+ prot = pmd_pgprot(*(pmd_t *)kpte);
+ break;
+ case PG_LEVEL_1G:
+ pfn = pud_pfn(*(pud_t *)kpte);
+ prot = pud_pgprot(*(pud_t *)kpte);
+ break;
+ default:
+ WARN_ONCE(1, "Invalid level for kpte\n");
+ return 0;
+ }
+
+ if (ret_prot)
+ *ret_prot = prot;
+
+ return pfn;
+}
+
+void notify_range_enc_status_changed(unsigned long vaddr, int npages, bool enc)
+{
+#ifdef CONFIG_PARAVIRT
+ unsigned long sz = npages << PAGE_SHIFT;
+ unsigned long vaddr_end = vaddr + sz;
+
+ while (vaddr < vaddr_end) {
+ int psize, pmask, level;
+ unsigned long pfn;
+ pte_t *kpte;
+
+ kpte = lookup_address(vaddr, &level);
+ if (!kpte || pte_none(*kpte)) {
+ WARN_ONCE(1, "kpte lookup for vaddr\n");
+ return;
+ }
+
+ pfn = pg_level_to_pfn(level, kpte, NULL);
+ if (!pfn)
+ continue;
+
+ psize = page_level_size(level);
+ pmask = page_level_mask(level);
+
+ notify_page_enc_status_changed(pfn, psize >> PAGE_SHIFT, enc);
+
+ vaddr = (vaddr & pmask) + psize;
+ }
+#endif
+}
+
+static void __init __set_clr_pte_enc(pte_t *kpte, int level, bool enc)
+{
+ pgprot_t old_prot, new_prot;
+ unsigned long pfn, pa, size;
+ pte_t new_pte;
+
+ pfn = pg_level_to_pfn(level, kpte, &old_prot);
+ if (!pfn)
+ return;
+
+ new_prot = old_prot;
+ if (enc)
+ pgprot_val(new_prot) |= _PAGE_ENC;
+ else
+ pgprot_val(new_prot) &= ~_PAGE_ENC;
+
+ /* If prot is same then do nothing. */
+ if (pgprot_val(old_prot) == pgprot_val(new_prot))
+ return;
+
+ pa = pfn << PAGE_SHIFT;
+ size = page_level_size(level);
+
+ /*
+ * We are going to perform in-place en-/decryption and change the
+ * physical page attribute from C=1 to C=0 or vice versa. Flush the
+ * caches to ensure that data gets accessed with the correct C-bit.
+ */
+ clflush_cache_range(__va(pa), size);
+
+ /* Encrypt/decrypt the contents in-place */
+ if (enc)
+ sme_early_encrypt(pa, size);
+ else
+ sme_early_decrypt(pa, size);
+
+ /* Change the page encryption mask. */
+ new_pte = pfn_pte(pfn, new_prot);
+ set_pte_atomic(kpte, new_pte);
+}
+
+static int __init early_set_memory_enc_dec(unsigned long vaddr,
+ unsigned long size, bool enc)
+{
+ unsigned long vaddr_end, vaddr_next, start;
+ unsigned long psize, pmask;
+ int split_page_size_mask;
+ int level, ret;
+ pte_t *kpte;
+
+ start = vaddr;
+ vaddr_next = vaddr;
+ vaddr_end = vaddr + size;
+
+ for (; vaddr < vaddr_end; vaddr = vaddr_next) {
+ kpte = lookup_address(vaddr, &level);
+ if (!kpte || pte_none(*kpte)) {
+ ret = 1;
+ goto out;
+ }
+
+ if (level == PG_LEVEL_4K) {
+ __set_clr_pte_enc(kpte, level, enc);
+ vaddr_next = (vaddr & PAGE_MASK) + PAGE_SIZE;
+ continue;
+ }
+
+ psize = page_level_size(level);
+ pmask = page_level_mask(level);
+
+ /*
+ * Check whether we can change the large page in one go.
+ * We request a split when the address is not aligned and
+ * the number of pages to set/clear encryption bit is smaller
+ * than the number of pages in the large page.
+ */
+ if (vaddr == (vaddr & pmask) &&
+ ((vaddr_end - vaddr) >= psize)) {
+ __set_clr_pte_enc(kpte, level, enc);
+ vaddr_next = (vaddr & pmask) + psize;
+ continue;
+ }
+
+ /*
+ * The virtual address is part of a larger page, create the next
+ * level page table mapping (4K or 2M). If it is part of a 2M
+ * page then we request a split of the large page into 4K
+ * chunks. A 1GB large page is split into 2M pages, resp.
+ */
+ if (level == PG_LEVEL_2M)
+ split_page_size_mask = 0;
+ else
+ split_page_size_mask = 1 << PG_LEVEL_2M;
+
+ /*
+ * kernel_physical_mapping_change() does not flush the TLBs, so
+ * a TLB flush is required after we exit from the for loop.
+ */
+ kernel_physical_mapping_change(__pa(vaddr & pmask),
+ __pa((vaddr_end & pmask) + psize),
+ split_page_size_mask);
+ }
+
+ ret = 0;
+
+ notify_range_enc_status_changed(start, PAGE_ALIGN(size) >> PAGE_SHIFT, enc);
+out:
+ __flush_tlb_all();
+ return ret;
+}
+
+int __init early_set_memory_decrypted(unsigned long vaddr, unsigned long size)
+{
+ return early_set_memory_enc_dec(vaddr, size, false);
+}
+
+int __init early_set_memory_encrypted(unsigned long vaddr, unsigned long size)
+{
+ return early_set_memory_enc_dec(vaddr, size, true);
+}
+
+void __init early_set_mem_enc_dec_hypercall(unsigned long vaddr, int npages, bool enc)
+{
+ notify_range_enc_status_changed(vaddr, npages, enc);
+}
+
+void __init mem_encrypt_free_decrypted_mem(void)
+{
+ unsigned long vaddr, vaddr_end, npages;
+ int r;
+
+ vaddr = (unsigned long)__start_bss_decrypted_unused;
+ vaddr_end = (unsigned long)__end_bss_decrypted;
+ npages = (vaddr_end - vaddr) >> PAGE_SHIFT;
+
+ /*
+ * The unused memory range was mapped decrypted, change the encryption
+ * attribute from decrypted to encrypted before freeing it.
+ */
+ if (cc_platform_has(CC_ATTR_MEM_ENCRYPT)) {
+ r = set_memory_encrypted(vaddr, npages);
+ if (r) {
+ pr_warn("failed to free unused decrypted pages\n");
+ return;
+ }
+ }
+
+ free_init_pages("unused decrypted", vaddr, vaddr_end);
+}
diff --git a/arch/x86/mm/tlb.c b/arch/x86/mm/tlb.c
index 59ba2968af1b..1e6513f95133 100644
--- a/arch/x86/mm/tlb.c
+++ b/arch/x86/mm/tlb.c
@@ -1148,7 +1148,7 @@ void flush_tlb_one_user(unsigned long addr)
*/
STATIC_NOPV void native_flush_tlb_global(void)
{
- unsigned long cr4, flags;
+ unsigned long flags;
if (static_cpu_has(X86_FEATURE_INVPCID)) {
/*
@@ -1168,11 +1168,7 @@ STATIC_NOPV void native_flush_tlb_global(void)
*/
raw_local_irq_save(flags);
- cr4 = this_cpu_read(cpu_tlbstate.cr4);
- /* toggle PGE */
- native_write_cr4(cr4 ^ X86_CR4_PGE);
- /* write old PGE again and flush TLBs */
- native_write_cr4(cr4);
+ __native_tlb_flush_global(this_cpu_read(cpu_tlbstate.cr4));
raw_local_irq_restore(flags);
}
diff --git a/arch/x86/net/bpf_jit_comp.c b/arch/x86/net/bpf_jit_comp.c
index 726700fabca6..bafe36e69227 100644
--- a/arch/x86/net/bpf_jit_comp.c
+++ b/arch/x86/net/bpf_jit_comp.c
@@ -1252,19 +1252,54 @@ st: if (is_imm8(insn->off))
case BPF_LDX | BPF_MEM | BPF_DW:
case BPF_LDX | BPF_PROBE_MEM | BPF_DW:
if (BPF_MODE(insn->code) == BPF_PROBE_MEM) {
- /* test src_reg, src_reg */
- maybe_emit_mod(&prog, src_reg, src_reg, true); /* always 1 byte */
- EMIT2(0x85, add_2reg(0xC0, src_reg, src_reg));
- /* jne start_of_ldx */
- EMIT2(X86_JNE, 0);
+ /* Though the verifier prevents negative insn->off in BPF_PROBE_MEM
+ * add abs(insn->off) to the limit to make sure that negative
+ * offset won't be an issue.
+ * insn->off is s16, so it won't affect valid pointers.
+ */
+ u64 limit = TASK_SIZE_MAX + PAGE_SIZE + abs(insn->off);
+ u8 *end_of_jmp1, *end_of_jmp2;
+
+ /* Conservatively check that src_reg + insn->off is a kernel address:
+ * 1. src_reg + insn->off >= limit
+ * 2. src_reg + insn->off doesn't become small positive.
+ * Cannot do src_reg + insn->off >= limit in one branch,
+ * since it needs two spare registers, but JIT has only one.
+ */
+
+ /* movabsq r11, limit */
+ EMIT2(add_1mod(0x48, AUX_REG), add_1reg(0xB8, AUX_REG));
+ EMIT((u32)limit, 4);
+ EMIT(limit >> 32, 4);
+ /* cmp src_reg, r11 */
+ maybe_emit_mod(&prog, src_reg, AUX_REG, true);
+ EMIT2(0x39, add_2reg(0xC0, src_reg, AUX_REG));
+ /* if unsigned '<' goto end_of_jmp2 */
+ EMIT2(X86_JB, 0);
+ end_of_jmp1 = prog;
+
+ /* mov r11, src_reg */
+ emit_mov_reg(&prog, true, AUX_REG, src_reg);
+ /* add r11, insn->off */
+ maybe_emit_1mod(&prog, AUX_REG, true);
+ EMIT2_off32(0x81, add_1reg(0xC0, AUX_REG), insn->off);
+ /* jmp if not carry to start_of_ldx
+ * Otherwise ERR_PTR(-EINVAL) + 128 will be the user addr
+ * that has to be rejected.
+ */
+ EMIT2(0x73 /* JNC */, 0);
+ end_of_jmp2 = prog;
+
/* xor dst_reg, dst_reg */
emit_mov_imm32(&prog, false, dst_reg, 0);
/* jmp byte_after_ldx */
EMIT2(0xEB, 0);
- /* populate jmp_offset for JNE above */
- temp[4] = prog - temp - 5 /* sizeof(test + jne) */;
+ /* populate jmp_offset for JB above to jump to xor dst_reg */
+ end_of_jmp1[-1] = end_of_jmp2 - end_of_jmp1;
+ /* populate jmp_offset for JNC above to jump to start_of_ldx */
start_of_ldx = prog;
+ end_of_jmp2[-1] = start_of_ldx - end_of_jmp2;
}
emit_ldx(&prog, BPF_SIZE(insn->code), dst_reg, src_reg, insn->off);
if (BPF_MODE(insn->code) == BPF_PROBE_MEM) {
@@ -1305,7 +1340,7 @@ st: if (is_imm8(insn->off))
* End result: x86 insn "mov rbx, qword ptr [rax+0x14]"
* of 4 bytes will be ignored and rbx will be zero inited.
*/
- ex->fixup = (prog - temp) | (reg2pt_regs[dst_reg] << 8);
+ ex->fixup = (prog - start_of_ldx) | (reg2pt_regs[dst_reg] << 8);
}
break;
diff --git a/arch/x86/platform/ce4100/falconfalls.dts b/arch/x86/platform/ce4100/falconfalls.dts
index 0ac3d4357136..65fa3d866226 100644
--- a/arch/x86/platform/ce4100/falconfalls.dts
+++ b/arch/x86/platform/ce4100/falconfalls.dts
@@ -249,7 +249,7 @@
gpio@26 {
#gpio-cells = <2>;
- compatible = "ti,pcf8575";
+ compatible = "nxp,pcf8575";
reg = <0x26>;
gpio-controller;
};
@@ -263,7 +263,7 @@
gpio@26 {
#gpio-cells = <2>;
- compatible = "ti,pcf8575";
+ compatible = "nxp,pcf8575";
reg = <0x26>;
gpio-controller;
};
diff --git a/arch/x86/platform/efi/quirks.c b/arch/x86/platform/efi/quirks.c
index b15ebfe40a73..b0b848d6933a 100644
--- a/arch/x86/platform/efi/quirks.c
+++ b/arch/x86/platform/efi/quirks.c
@@ -277,7 +277,8 @@ void __init efi_arch_mem_reserve(phys_addr_t addr, u64 size)
return;
}
- new = early_memremap(data.phys_map, data.size);
+ new = early_memremap_prot(data.phys_map, data.size,
+ pgprot_val(pgprot_encrypted(FIXMAP_PAGE_NORMAL)));
if (!new) {
pr_err("Failed to map new boot services memmap\n");
return;
diff --git a/arch/x86/realmode/init.c b/arch/x86/realmode/init.c
index 38d24d2ab38b..c5e29db02a46 100644
--- a/arch/x86/realmode/init.c
+++ b/arch/x86/realmode/init.c
@@ -17,6 +17,32 @@ u32 *trampoline_cr4_features;
/* Hold the pgd entry used on booting additional CPUs */
pgd_t trampoline_pgd_entry;
+void load_trampoline_pgtable(void)
+{
+#ifdef CONFIG_X86_32
+ load_cr3(initial_page_table);
+#else
+ /*
+ * This function is called before exiting to real-mode and that will
+ * fail with CR4.PCIDE still set.
+ */
+ if (boot_cpu_has(X86_FEATURE_PCID))
+ cr4_clear_bits(X86_CR4_PCIDE);
+
+ write_cr3(real_mode_header->trampoline_pgd);
+#endif
+
+ /*
+ * The CR3 write above will not flush global TLB entries.
+ * Stale, global entries from previous page tables may still be
+ * present. Flush those stale entries.
+ *
+ * This ensures that memory accessed while running with
+ * trampoline_pgd is *actually* mapped into trampoline_pgd.
+ */
+ __flush_tlb_all();
+}
+
void __init reserve_real_mode(void)
{
phys_addr_t mem;
diff --git a/arch/x86/tools/relocs.c b/arch/x86/tools/relocs.c
index c736cf2ac76b..e2c5b296120d 100644
--- a/arch/x86/tools/relocs.c
+++ b/arch/x86/tools/relocs.c
@@ -68,7 +68,7 @@ static const char * const sym_regex_kernel[S_NSYMTYPES] = {
"(__parainstructions|__alt_instructions)(_end)?|"
"(__iommu_table|__apicdrivers|__smp_locks)(_end)?|"
"__(start|end)_pci_.*|"
-#if CONFIG_FW_LOADER_BUILTIN
+#if CONFIG_FW_LOADER
"__(start|end)_builtin_fw|"
#endif
"__(start|stop)___ksymtab(_gpl)?|"