diff options
Diffstat (limited to 'arch/x86/kvm')
74 files changed, 15462 insertions, 6376 deletions
diff --git a/arch/x86/kvm/Kconfig b/arch/x86/kvm/Kconfig index 3aaf7e86a859..2eeffcec5382 100644 --- a/arch/x86/kvm/Kconfig +++ b/arch/x86/kvm/Kconfig @@ -17,12 +17,12 @@ menuconfig VIRTUALIZATION if VIRTUALIZATION -config KVM - tristate "Kernel-based Virtual Machine (KVM) support" - depends on HIGH_RES_TIMERS - depends on X86_LOCAL_APIC +config KVM_X86 + def_tristate KVM if (KVM_INTEL != n || KVM_AMD != n) select KVM_COMMON select KVM_GENERIC_MMU_NOTIFIER + select KVM_ELIDE_TLB_FLUSH_IF_YOUNG + select KVM_MMU_LOCKLESS_AGING select HAVE_KVM_IRQCHIP select HAVE_KVM_PFNCACHE select HAVE_KVM_DIRTY_RING_TSO @@ -30,6 +30,7 @@ config KVM select HAVE_KVM_IRQ_BYPASS select HAVE_KVM_IRQ_ROUTING select HAVE_KVM_READONLY_MEM + select VHOST_TASK select KVM_ASYNC_PF select USER_RETURN_NOTIFIER select KVM_MMIO @@ -44,6 +45,13 @@ config KVM select KVM_VFIO select HAVE_KVM_PM_NOTIFIER if PM select KVM_GENERIC_HARDWARE_ENABLING + select KVM_GENERIC_PRE_FAULT_MEMORY + select KVM_GENERIC_PRIVATE_MEM if KVM_SW_PROTECTED_VM + select KVM_WERROR if WERROR + +config KVM + tristate "Kernel-based Virtual Machine (KVM) support" + depends on X86_LOCAL_APIC help Support hosting fully virtualized guest machines using hardware virtualization extensions. You will need a fairly recent @@ -66,7 +74,7 @@ config KVM_WERROR # FRAME_WARN, i.e. KVM_WERROR=y with KASAN=y requires special tuning. # Building KVM with -Werror and KASAN is still doable via enabling # the kernel-wide WERROR=y. - depends on KVM && EXPERT && !KASAN + depends on KVM && ((EXPERT && !KASAN) || WERROR) help Add -Werror to the build flags for KVM. @@ -76,7 +84,6 @@ config KVM_SW_PROTECTED_VM bool "Enable support for KVM software-protected VMs" depends on EXPERT depends on KVM && X86_64 - select KVM_GENERIC_PRIVATE_MEM help Enable support for KVM software-protected VMs. Currently, software- protected VMs are purely a development and testing vehicle for @@ -88,6 +95,8 @@ config KVM_SW_PROTECTED_VM config KVM_INTEL tristate "KVM for Intel (and compatible) processors support" depends on KVM && IA32_FEAT_CTL + select KVM_GENERIC_PRIVATE_MEM if INTEL_TDX_HOST + select KVM_GENERIC_MEMORY_ATTRIBUTES if INTEL_TDX_HOST help Provides support for KVM on processors equipped with Intel's VT extensions, a.k.a. Virtual Machine Extensions (VMX). @@ -95,6 +104,21 @@ config KVM_INTEL To compile this as a module, choose M here: the module will be called kvm-intel. +config KVM_INTEL_PROVE_VE + bool "Check that guests do not receive #VE exceptions" + depends on KVM_INTEL && EXPERT + help + Checks that KVM's page table management code will not incorrectly + let guests receive a virtualization exception. Virtualization + exceptions will be trapped by the hypervisor rather than injected + in the guest. + + Note: some CPUs appear to generate spurious EPT Violations #VEs + that trigger KVM's WARN, in particular with eptad=0 and/or nested + virtualization. + + If unsure, say N. + config X86_SGX_KVM bool "Software Guard eXtensions (SGX) Virtualization" depends on X86_SGX && KVM_INTEL @@ -107,6 +131,16 @@ config X86_SGX_KVM If unsure, say N. +config KVM_INTEL_TDX + bool "Intel Trust Domain Extensions (TDX) support" + default y + depends on INTEL_TDX_HOST + help + Provides support for launching Intel Trust Domain Extensions (TDX) + confidential VMs on Intel processors. + + If unsure, say N. + config KVM_AMD tristate "KVM for AMD processors support" depends on KVM && (CPU_SUP_AMD || CPU_SUP_HYGON) @@ -122,9 +156,15 @@ config KVM_AMD_SEV default y depends on KVM_AMD && X86_64 depends on CRYPTO_DEV_SP_PSP && !(KVM_AMD=y && CRYPTO_DEV_CCP_DD=m) + select ARCH_HAS_CC_PLATFORM + select KVM_GENERIC_PRIVATE_MEM + select HAVE_KVM_ARCH_GMEM_PREPARE + select HAVE_KVM_ARCH_GMEM_INVALIDATE help - Provides support for launching Encrypted VMs (SEV) and Encrypted VMs - with Encrypted State (SEV-ES) on AMD processors. + Provides support for launching encrypted VMs which use Secure + Encrypted Virtualization (SEV), Secure Encrypted Virtualization with + Encrypted State (SEV-ES), and Secure Encrypted Virtualization with + Secure Nested Paging (SEV-SNP) technologies on AMD processors. config KVM_SMM bool "System Management Mode emulation" diff --git a/arch/x86/kvm/Makefile b/arch/x86/kvm/Makefile index a88bb14266b6..a5d362c7b504 100644 --- a/arch/x86/kvm/Makefile +++ b/arch/x86/kvm/Makefile @@ -3,11 +3,6 @@ ccflags-y += -I $(srctree)/arch/x86/kvm ccflags-$(CONFIG_KVM_WERROR) += -Werror -ifeq ($(CONFIG_FRAME_POINTER),y) -OBJECT_FILES_NON_STANDARD_vmx/vmenter.o := y -OBJECT_FILES_NON_STANDARD_svm/vmenter.o := y -endif - include $(srctree)/virt/kvm/Makefile.kvm kvm-y += x86.o emulate.o i8259.o irq.o lapic.o \ @@ -21,14 +16,16 @@ kvm-$(CONFIG_KVM_XEN) += xen.o kvm-$(CONFIG_KVM_SMM) += smm.o kvm-intel-y += vmx/vmx.o vmx/vmenter.o vmx/pmu_intel.o vmx/vmcs12.o \ - vmx/nested.o vmx/posted_intr.o + vmx/nested.o vmx/posted_intr.o vmx/main.o kvm-intel-$(CONFIG_X86_SGX_KVM) += vmx/sgx.o kvm-intel-$(CONFIG_KVM_HYPERV) += vmx/hyperv.o vmx/hyperv_evmcs.o +kvm-intel-$(CONFIG_KVM_INTEL_TDX) += vmx/tdx.o + +kvm-amd-y += svm/svm.o svm/vmenter.o svm/pmu.o svm/nested.o svm/avic.o -kvm-amd-y += svm/svm.o svm/vmenter.o svm/pmu.o svm/nested.o svm/avic.o \ - svm/sev.o -kvm-amd-$(CONFIG_KVM_HYPERV) += svm/hyperv.o +kvm-amd-$(CONFIG_KVM_AMD_SEV) += svm/sev.o +kvm-amd-$(CONFIG_KVM_HYPERV) += svm/hyperv.o ifdef CONFIG_HYPERV kvm-y += kvm_onhyperv.o @@ -36,7 +33,7 @@ kvm-intel-y += vmx/vmx_onhyperv.o vmx/hyperv_evmcs.o kvm-amd-y += svm/svm_onhyperv.o endif -obj-$(CONFIG_KVM) += kvm.o +obj-$(CONFIG_KVM_X86) += kvm.o obj-$(CONFIG_KVM_INTEL) += kvm-intel.o obj-$(CONFIG_KVM_AMD) += kvm-amd.o diff --git a/arch/x86/kvm/cpuid.c b/arch/x86/kvm/cpuid.c index adba49afb5fe..b2d006756e02 100644 --- a/arch/x86/kvm/cpuid.c +++ b/arch/x86/kvm/cpuid.c @@ -21,7 +21,7 @@ #include <asm/user.h> #include <asm/fpu/xstate.h> #include <asm/sgx.h> -#include <asm/cpuid.h> +#include <asm/cpuid/api.h> #include "cpuid.h" #include "lapic.h" #include "mmu.h" @@ -36,50 +36,52 @@ u32 kvm_cpu_caps[NR_KVM_CPU_CAPS] __read_mostly; EXPORT_SYMBOL_GPL(kvm_cpu_caps); +struct cpuid_xstate_sizes { + u32 eax; + u32 ebx; + u32 ecx; +}; + +static struct cpuid_xstate_sizes xstate_sizes[XFEATURE_MAX] __ro_after_init; + +void __init kvm_init_xstate_sizes(void) +{ + u32 ign; + int i; + + for (i = XFEATURE_YMM; i < ARRAY_SIZE(xstate_sizes); i++) { + struct cpuid_xstate_sizes *xs = &xstate_sizes[i]; + + cpuid_count(0xD, i, &xs->eax, &xs->ebx, &xs->ecx, &ign); + } +} + u32 xstate_required_size(u64 xstate_bv, bool compacted) { - int feature_bit = 0; u32 ret = XSAVE_HDR_SIZE + XSAVE_HDR_OFFSET; + int i; xstate_bv &= XFEATURE_MASK_EXTEND; - while (xstate_bv) { - if (xstate_bv & 0x1) { - u32 eax, ebx, ecx, edx, offset; - cpuid_count(0xD, feature_bit, &eax, &ebx, &ecx, &edx); - /* ECX[1]: 64B alignment in compacted form */ - if (compacted) - offset = (ecx & 0x2) ? ALIGN(ret, 64) : ret; - else - offset = ebx; - ret = max(ret, offset + eax); - } + for (i = XFEATURE_YMM; i < ARRAY_SIZE(xstate_sizes) && xstate_bv; i++) { + struct cpuid_xstate_sizes *xs = &xstate_sizes[i]; + u32 offset; + + if (!(xstate_bv & BIT_ULL(i))) + continue; - xstate_bv >>= 1; - feature_bit++; + /* ECX[1]: 64B alignment in compacted form */ + if (compacted) + offset = (xs->ecx & 0x2) ? ALIGN(ret, 64) : ret; + else + offset = xs->ebx; + ret = max(ret, offset + xs->eax); + xstate_bv &= ~BIT_ULL(i); } return ret; } -#define F feature_bit - -/* Scattered Flag - For features that are scattered by cpufeatures.h. */ -#define SF(name) \ -({ \ - BUILD_BUG_ON(X86_FEATURE_##name >= MAX_CPU_FEATURES); \ - (boot_cpu_has(X86_FEATURE_##name) ? F(name) : 0); \ -}) - -/* - * Magic value used by KVM when querying userspace-provided CPUID entries and - * doesn't care about the CPIUD index because the index of the function in - * question is not significant. Note, this magic value must have at least one - * bit set in bits[63:32] and must be consumed as a u64 by cpuid_entry2_find() - * to avoid false positives when processing guest CPUID input. - */ -#define KVM_CPUID_INDEX_NOT_SIGNIFICANT -1ull - -static inline struct kvm_cpuid_entry2 *cpuid_entry2_find( +struct kvm_cpuid_entry2 *kvm_find_cpuid_entry2( struct kvm_cpuid_entry2 *entries, int nent, u32 function, u64 index) { struct kvm_cpuid_entry2 *e; @@ -129,10 +131,9 @@ static inline struct kvm_cpuid_entry2 *cpuid_entry2_find( return NULL; } +EXPORT_SYMBOL_GPL(kvm_find_cpuid_entry2); -static int kvm_check_cpuid(struct kvm_vcpu *vcpu, - struct kvm_cpuid_entry2 *entries, - int nent) +static int kvm_check_cpuid(struct kvm_vcpu *vcpu) { struct kvm_cpuid_entry2 *best; u64 xfeatures; @@ -141,8 +142,7 @@ static int kvm_check_cpuid(struct kvm_vcpu *vcpu, * The existing code assumes virtual address is 48-bit or 57-bit in the * canonical address checks; exit if it is ever changed. */ - best = cpuid_entry2_find(entries, nent, 0x80000008, - KVM_CPUID_INDEX_NOT_SIGNIFICANT); + best = kvm_find_cpuid_entry(vcpu, 0x80000008); if (best) { int vaddr_bits = (best->eax & 0xff00) >> 8; @@ -154,7 +154,7 @@ static int kvm_check_cpuid(struct kvm_vcpu *vcpu, * Exposing dynamic xfeatures to the guest requires additional * enabling in the FPU, e.g. to expand the guest XSAVE state size. */ - best = cpuid_entry2_find(entries, nent, 0xd, 0); + best = kvm_find_cpuid_entry_index(vcpu, 0xd, 0); if (!best) return 0; @@ -166,6 +166,9 @@ static int kvm_check_cpuid(struct kvm_vcpu *vcpu, return fpu_enable_guest_xfd_features(&vcpu->arch.guest_fpu, xfeatures); } +static u32 kvm_apply_cpuid_pv_features_quirk(struct kvm_vcpu *vcpu); +static void kvm_update_cpuid_runtime(struct kvm_vcpu *vcpu); + /* Check whether the supplied CPUID data is equal to what is already set for the vCPU. */ static int kvm_cpuid_check_equal(struct kvm_vcpu *vcpu, struct kvm_cpuid_entry2 *e2, int nent) @@ -173,6 +176,15 @@ static int kvm_cpuid_check_equal(struct kvm_vcpu *vcpu, struct kvm_cpuid_entry2 struct kvm_cpuid_entry2 *orig; int i; + /* + * Apply runtime CPUID updates to the incoming CPUID entries to avoid + * false positives due mismatches on KVM-owned feature flags. + * + * Note! @e2 and @nent track the _old_ CPUID entries! + */ + kvm_update_cpuid_runtime(vcpu); + kvm_apply_cpuid_pv_features_quirk(vcpu); + if (nent != vcpu->arch.cpuid_nent) return -EINVAL; @@ -196,7 +208,7 @@ static struct kvm_hypervisor_cpuid kvm_get_hypervisor_cpuid(struct kvm_vcpu *vcp struct kvm_cpuid_entry2 *entry; u32 base; - for_each_possible_hypervisor_cpuid_base(base) { + for_each_possible_cpuid_base_hypervisor(base) { entry = kvm_find_cpuid_entry(vcpu, base); if (entry) { @@ -217,123 +229,174 @@ static struct kvm_hypervisor_cpuid kvm_get_hypervisor_cpuid(struct kvm_vcpu *vcp return cpuid; } -static struct kvm_cpuid_entry2 *__kvm_find_kvm_cpuid_features(struct kvm_vcpu *vcpu, - struct kvm_cpuid_entry2 *entries, int nent) +static u32 kvm_apply_cpuid_pv_features_quirk(struct kvm_vcpu *vcpu) { - u32 base = vcpu->arch.kvm_cpuid.base; - - if (!base) - return NULL; + struct kvm_hypervisor_cpuid kvm_cpuid; + struct kvm_cpuid_entry2 *best; - return cpuid_entry2_find(entries, nent, base | KVM_CPUID_FEATURES, - KVM_CPUID_INDEX_NOT_SIGNIFICANT); -} + kvm_cpuid = kvm_get_hypervisor_cpuid(vcpu, KVM_SIGNATURE); + if (!kvm_cpuid.base) + return 0; -static struct kvm_cpuid_entry2 *kvm_find_kvm_cpuid_features(struct kvm_vcpu *vcpu) -{ - return __kvm_find_kvm_cpuid_features(vcpu, vcpu->arch.cpuid_entries, - vcpu->arch.cpuid_nent); -} + best = kvm_find_cpuid_entry(vcpu, kvm_cpuid.base | KVM_CPUID_FEATURES); + if (!best) + return 0; -void kvm_update_pv_runtime(struct kvm_vcpu *vcpu) -{ - struct kvm_cpuid_entry2 *best = kvm_find_kvm_cpuid_features(vcpu); + if (kvm_hlt_in_guest(vcpu->kvm)) + best->eax &= ~(1 << KVM_FEATURE_PV_UNHALT); - /* - * save the feature bitmap to avoid cpuid lookup for every PV - * operation - */ - if (best) - vcpu->arch.pv_cpuid.features = best->eax; + return best->eax; } /* * Calculate guest's supported XCR0 taking into account guest CPUID data and * KVM's supported XCR0 (comprised of host's XCR0 and KVM_SUPPORTED_XCR0). */ -static u64 cpuid_get_supported_xcr0(struct kvm_cpuid_entry2 *entries, int nent) +static u64 cpuid_get_supported_xcr0(struct kvm_vcpu *vcpu) { struct kvm_cpuid_entry2 *best; - best = cpuid_entry2_find(entries, nent, 0xd, 0); + best = kvm_find_cpuid_entry_index(vcpu, 0xd, 0); if (!best) return 0; return (best->eax | ((u64)best->edx << 32)) & kvm_caps.supported_xcr0; } -static void __kvm_update_cpuid_runtime(struct kvm_vcpu *vcpu, struct kvm_cpuid_entry2 *entries, - int nent) +static __always_inline void kvm_update_feature_runtime(struct kvm_vcpu *vcpu, + struct kvm_cpuid_entry2 *entry, + unsigned int x86_feature, + bool has_feature) +{ + cpuid_entry_change(entry, x86_feature, has_feature); + guest_cpu_cap_change(vcpu, x86_feature, has_feature); +} + +static void kvm_update_cpuid_runtime(struct kvm_vcpu *vcpu) { struct kvm_cpuid_entry2 *best; - best = cpuid_entry2_find(entries, nent, 1, KVM_CPUID_INDEX_NOT_SIGNIFICANT); + vcpu->arch.cpuid_dynamic_bits_dirty = false; + + best = kvm_find_cpuid_entry(vcpu, 1); if (best) { - /* Update OSXSAVE bit */ - if (boot_cpu_has(X86_FEATURE_XSAVE)) - cpuid_entry_change(best, X86_FEATURE_OSXSAVE, + kvm_update_feature_runtime(vcpu, best, X86_FEATURE_OSXSAVE, kvm_is_cr4_bit_set(vcpu, X86_CR4_OSXSAVE)); - cpuid_entry_change(best, X86_FEATURE_APIC, - vcpu->arch.apic_base & MSR_IA32_APICBASE_ENABLE); + kvm_update_feature_runtime(vcpu, best, X86_FEATURE_APIC, + vcpu->arch.apic_base & MSR_IA32_APICBASE_ENABLE); + + if (!kvm_check_has_quirk(vcpu->kvm, KVM_X86_QUIRK_MISC_ENABLE_NO_MWAIT)) + kvm_update_feature_runtime(vcpu, best, X86_FEATURE_MWAIT, + vcpu->arch.ia32_misc_enable_msr & + MSR_IA32_MISC_ENABLE_MWAIT); } - best = cpuid_entry2_find(entries, nent, 7, 0); - if (best && boot_cpu_has(X86_FEATURE_PKU) && best->function == 0x7) - cpuid_entry_change(best, X86_FEATURE_OSPKE, - kvm_is_cr4_bit_set(vcpu, X86_CR4_PKE)); + best = kvm_find_cpuid_entry_index(vcpu, 7, 0); + if (best) + kvm_update_feature_runtime(vcpu, best, X86_FEATURE_OSPKE, + kvm_is_cr4_bit_set(vcpu, X86_CR4_PKE)); + - best = cpuid_entry2_find(entries, nent, 0xD, 0); + best = kvm_find_cpuid_entry_index(vcpu, 0xD, 0); if (best) best->ebx = xstate_required_size(vcpu->arch.xcr0, false); - best = cpuid_entry2_find(entries, nent, 0xD, 1); + best = kvm_find_cpuid_entry_index(vcpu, 0xD, 1); if (best && (cpuid_entry_has(best, X86_FEATURE_XSAVES) || cpuid_entry_has(best, X86_FEATURE_XSAVEC))) best->ebx = xstate_required_size(vcpu->arch.xcr0, true); - - best = __kvm_find_kvm_cpuid_features(vcpu, entries, nent); - if (kvm_hlt_in_guest(vcpu->kvm) && best && - (best->eax & (1 << KVM_FEATURE_PV_UNHALT))) - best->eax &= ~(1 << KVM_FEATURE_PV_UNHALT); - - if (!kvm_check_has_quirk(vcpu->kvm, KVM_X86_QUIRK_MISC_ENABLE_NO_MWAIT)) { - best = cpuid_entry2_find(entries, nent, 0x1, KVM_CPUID_INDEX_NOT_SIGNIFICANT); - if (best) - cpuid_entry_change(best, X86_FEATURE_MWAIT, - vcpu->arch.ia32_misc_enable_msr & - MSR_IA32_MISC_ENABLE_MWAIT); - } } -void kvm_update_cpuid_runtime(struct kvm_vcpu *vcpu) -{ - __kvm_update_cpuid_runtime(vcpu, vcpu->arch.cpuid_entries, vcpu->arch.cpuid_nent); -} -EXPORT_SYMBOL_GPL(kvm_update_cpuid_runtime); - -static bool kvm_cpuid_has_hyperv(struct kvm_cpuid_entry2 *entries, int nent) +static bool kvm_cpuid_has_hyperv(struct kvm_vcpu *vcpu) { #ifdef CONFIG_KVM_HYPERV struct kvm_cpuid_entry2 *entry; - entry = cpuid_entry2_find(entries, nent, HYPERV_CPUID_INTERFACE, - KVM_CPUID_INDEX_NOT_SIGNIFICANT); + entry = kvm_find_cpuid_entry(vcpu, HYPERV_CPUID_INTERFACE); return entry && entry->eax == HYPERV_CPUID_SIGNATURE_EAX; #else return false; #endif } -static void kvm_vcpu_after_set_cpuid(struct kvm_vcpu *vcpu) +static bool guest_cpuid_is_amd_or_hygon(struct kvm_vcpu *vcpu) +{ + struct kvm_cpuid_entry2 *entry; + + entry = kvm_find_cpuid_entry(vcpu, 0); + if (!entry) + return false; + + return is_guest_vendor_amd(entry->ebx, entry->ecx, entry->edx) || + is_guest_vendor_hygon(entry->ebx, entry->ecx, entry->edx); +} + +/* + * This isn't truly "unsafe", but except for the cpu_caps initialization code, + * all register lookups should use __cpuid_entry_get_reg(), which provides + * compile-time validation of the input. + */ +static u32 cpuid_get_reg_unsafe(struct kvm_cpuid_entry2 *entry, u32 reg) +{ + switch (reg) { + case CPUID_EAX: + return entry->eax; + case CPUID_EBX: + return entry->ebx; + case CPUID_ECX: + return entry->ecx; + case CPUID_EDX: + return entry->edx; + default: + WARN_ON_ONCE(1); + return 0; + } +} + +static int cpuid_func_emulated(struct kvm_cpuid_entry2 *entry, u32 func, + bool include_partially_emulated); + +void kvm_vcpu_after_set_cpuid(struct kvm_vcpu *vcpu) { struct kvm_lapic *apic = vcpu->arch.apic; struct kvm_cpuid_entry2 *best; + struct kvm_cpuid_entry2 *entry; bool allow_gbpages; + int i; - BUILD_BUG_ON(KVM_NR_GOVERNED_FEATURES > KVM_MAX_NR_GOVERNED_FEATURES); - bitmap_zero(vcpu->arch.governed_features.enabled, - KVM_MAX_NR_GOVERNED_FEATURES); + memset(vcpu->arch.cpu_caps, 0, sizeof(vcpu->arch.cpu_caps)); + BUILD_BUG_ON(ARRAY_SIZE(reverse_cpuid) != NR_KVM_CPU_CAPS); + + /* + * Reset guest capabilities to userspace's guest CPUID definition, i.e. + * honor userspace's definition for features that don't require KVM or + * hardware management/support (or that KVM simply doesn't care about). + */ + for (i = 0; i < NR_KVM_CPU_CAPS; i++) { + const struct cpuid_reg cpuid = reverse_cpuid[i]; + struct kvm_cpuid_entry2 emulated; + + if (!cpuid.function) + continue; + + entry = kvm_find_cpuid_entry_index(vcpu, cpuid.function, cpuid.index); + if (!entry) + continue; + + cpuid_func_emulated(&emulated, cpuid.function, true); + + /* + * A vCPU has a feature if it's supported by KVM and is enabled + * in guest CPUID. Note, this includes features that are + * supported by KVM but aren't advertised to userspace! + */ + vcpu->arch.cpu_caps[i] = kvm_cpu_caps[i] | + cpuid_get_reg_unsafe(&emulated, cpuid.reg); + vcpu->arch.cpu_caps[i] &= cpuid_get_reg_unsafe(entry, cpuid.reg); + } + + kvm_update_cpuid_runtime(vcpu); /* * If TDP is enabled, let the guest use GBPAGES if they're supported in @@ -347,9 +410,8 @@ static void kvm_vcpu_after_set_cpuid(struct kvm_vcpu *vcpu) * and can install smaller shadow pages if the host lacks 1GiB support. */ allow_gbpages = tdp_enabled ? boot_cpu_has(X86_FEATURE_GBPAGES) : - guest_cpuid_has(vcpu, X86_FEATURE_GBPAGES); - if (allow_gbpages) - kvm_governed_feature_set(vcpu, X86_FEATURE_GBPAGES); + guest_cpu_cap_has(vcpu, X86_FEATURE_GBPAGES); + guest_cpu_cap_change(vcpu, X86_FEATURE_GBPAGES, allow_gbpages); best = kvm_find_cpuid_entry(vcpu, 1); if (best && apic) { @@ -361,23 +423,25 @@ static void kvm_vcpu_after_set_cpuid(struct kvm_vcpu *vcpu) kvm_apic_set_version(vcpu); } - vcpu->arch.guest_supported_xcr0 = - cpuid_get_supported_xcr0(vcpu->arch.cpuid_entries, vcpu->arch.cpuid_nent); + vcpu->arch.guest_supported_xcr0 = cpuid_get_supported_xcr0(vcpu); - kvm_update_pv_runtime(vcpu); + vcpu->arch.pv_cpuid.features = kvm_apply_cpuid_pv_features_quirk(vcpu); + vcpu->arch.is_amd_compatible = guest_cpuid_is_amd_or_hygon(vcpu); vcpu->arch.maxphyaddr = cpuid_query_maxphyaddr(vcpu); vcpu->arch.reserved_gpa_bits = kvm_vcpu_reserved_gpa_bits_raw(vcpu); kvm_pmu_refresh(vcpu); - vcpu->arch.cr4_guest_rsvd_bits = - __cr4_reserved_bits(guest_cpuid_has, vcpu); - kvm_hv_set_cpuid(vcpu, kvm_cpuid_has_hyperv(vcpu->arch.cpuid_entries, - vcpu->arch.cpuid_nent)); +#define __kvm_cpu_cap_has(UNUSED_, f) kvm_cpu_cap_has(f) + vcpu->arch.cr4_guest_rsvd_bits = __cr4_reserved_bits(__kvm_cpu_cap_has, UNUSED_) | + __cr4_reserved_bits(guest_cpu_cap_has, vcpu); +#undef __kvm_cpu_cap_has + + kvm_hv_set_cpuid(vcpu, kvm_cpuid_has_hyperv(vcpu)); /* Invoke the vendor callback only after the above state is updated. */ - static_call(kvm_x86_vcpu_after_set_cpuid)(vcpu); + kvm_x86_call(vcpu_after_set_cpuid)(vcpu); /* * Except for the MMU, which needs to do its thing any vendor specific @@ -400,6 +464,20 @@ not_found: return 36; } +int cpuid_query_maxguestphyaddr(struct kvm_vcpu *vcpu) +{ + struct kvm_cpuid_entry2 *best; + + best = kvm_find_cpuid_entry(vcpu, 0x80000000); + if (!best || best->eax < 0x80000008) + goto not_found; + best = kvm_find_cpuid_entry(vcpu, 0x80000008); + if (best) + return (best->eax >> 16) & 0xff; +not_found: + return 0; +} + /* * This "raw" version returns the reserved GPA bits without any adjustments for * encryption technologies that usurp bits. The raw mask should be used if and @@ -413,9 +491,25 @@ u64 kvm_vcpu_reserved_gpa_bits_raw(struct kvm_vcpu *vcpu) static int kvm_set_cpuid(struct kvm_vcpu *vcpu, struct kvm_cpuid_entry2 *e2, int nent) { + u32 vcpu_caps[NR_KVM_CPU_CAPS]; int r; - __kvm_update_cpuid_runtime(vcpu, e2, nent); + /* + * Swap the existing (old) entries with the incoming (new) entries in + * order to massage the new entries, e.g. to account for dynamic bits + * that KVM controls, without clobbering the current guest CPUID, which + * KVM needs to preserve in order to unwind on failure. + * + * Similarly, save the vCPU's current cpu_caps so that the capabilities + * can be updated alongside the CPUID entries when performing runtime + * updates. Full initialization is done if and only if the vCPU hasn't + * run, i.e. only if userspace is potentially changing CPUID features. + */ + swap(vcpu->arch.cpuid_entries, e2); + swap(vcpu->arch.cpuid_nent, nent); + + memcpy(vcpu_caps, vcpu->arch.cpu_caps, sizeof(vcpu_caps)); + BUILD_BUG_ON(sizeof(vcpu_caps) != sizeof(vcpu->arch.cpu_caps)); /* * KVM does not correctly handle changing guest CPUID after KVM_RUN, as @@ -431,35 +525,36 @@ static int kvm_set_cpuid(struct kvm_vcpu *vcpu, struct kvm_cpuid_entry2 *e2, if (kvm_vcpu_has_run(vcpu)) { r = kvm_cpuid_check_equal(vcpu, e2, nent); if (r) - return r; - - kvfree(e2); - return 0; + goto err; + goto success; } #ifdef CONFIG_KVM_HYPERV - if (kvm_cpuid_has_hyperv(e2, nent)) { + if (kvm_cpuid_has_hyperv(vcpu)) { r = kvm_hv_vcpu_init(vcpu); if (r) - return r; + goto err; } #endif - r = kvm_check_cpuid(vcpu, e2, nent); + r = kvm_check_cpuid(vcpu); if (r) - return r; - - kvfree(vcpu->arch.cpuid_entries); - vcpu->arch.cpuid_entries = e2; - vcpu->arch.cpuid_nent = nent; + goto err; - vcpu->arch.kvm_cpuid = kvm_get_hypervisor_cpuid(vcpu, KVM_SIGNATURE); #ifdef CONFIG_KVM_XEN vcpu->arch.xen.cpuid = kvm_get_hypervisor_cpuid(vcpu, XEN_SIGNATURE); #endif kvm_vcpu_after_set_cpuid(vcpu); +success: + kvfree(e2); return 0; + +err: + memcpy(vcpu->arch.cpu_caps, vcpu_caps, sizeof(vcpu_caps)); + swap(vcpu->arch.cpuid_entries, e2); + swap(vcpu->arch.cpuid_nent, nent); + return r; } /* when an old userspace process fills a new kernel module */ @@ -538,6 +633,9 @@ int kvm_vcpu_ioctl_get_cpuid2(struct kvm_vcpu *vcpu, if (cpuid->nent < vcpu->arch.cpuid_nent) return -E2BIG; + if (vcpu->arch.cpuid_dynamic_bits_dirty) + kvm_update_cpuid_runtime(vcpu); + if (copy_to_user(entries, vcpu->arch.cpuid_entries, vcpu->arch.cpuid_nent * sizeof(struct kvm_cpuid_entry2))) return -EFAULT; @@ -546,107 +644,294 @@ int kvm_vcpu_ioctl_get_cpuid2(struct kvm_vcpu *vcpu, return 0; } -/* Mask kvm_cpu_caps for @leaf with the raw CPUID capabilities of this CPU. */ -static __always_inline void __kvm_cpu_cap_mask(unsigned int leaf) +static __always_inline u32 raw_cpuid_get(struct cpuid_reg cpuid) { - const struct cpuid_reg cpuid = x86_feature_cpuid(leaf * 32); struct kvm_cpuid_entry2 entry; + u32 base; + + /* + * KVM only supports features defined by Intel (0x0), AMD (0x80000000), + * and Centaur (0xc0000000). WARN if a feature for new vendor base is + * defined, as this and other code would need to be updated. + */ + base = cpuid.function & 0xffff0000; + if (WARN_ON_ONCE(base && base != 0x80000000 && base != 0xc0000000)) + return 0; - reverse_cpuid_check(leaf); + if (cpuid_eax(base) < cpuid.function) + return 0; cpuid_count(cpuid.function, cpuid.index, &entry.eax, &entry.ebx, &entry.ecx, &entry.edx); - kvm_cpu_caps[leaf] &= *__cpuid_entry_get_reg(&entry, cpuid.reg); + return *__cpuid_entry_get_reg(&entry, cpuid.reg); } -static __always_inline -void kvm_cpu_cap_init_kvm_defined(enum kvm_only_cpuid_leafs leaf, u32 mask) -{ - /* Use kvm_cpu_cap_mask for leafs that aren't KVM-only. */ - BUILD_BUG_ON(leaf < NCAPINTS); +/* + * For kernel-defined leafs, mask KVM's supported feature set with the kernel's + * capabilities as well as raw CPUID. For KVM-defined leafs, consult only raw + * CPUID, as KVM is the one and only authority (in the kernel). + */ +#define kvm_cpu_cap_init(leaf, feature_initializers...) \ +do { \ + const struct cpuid_reg cpuid = x86_feature_cpuid(leaf * 32); \ + const u32 __maybe_unused kvm_cpu_cap_init_in_progress = leaf; \ + const u32 *kernel_cpu_caps = boot_cpu_data.x86_capability; \ + u32 kvm_cpu_cap_passthrough = 0; \ + u32 kvm_cpu_cap_synthesized = 0; \ + u32 kvm_cpu_cap_emulated = 0; \ + u32 kvm_cpu_cap_features = 0; \ + \ + feature_initializers \ + \ + kvm_cpu_caps[leaf] = kvm_cpu_cap_features; \ + \ + if (leaf < NCAPINTS) \ + kvm_cpu_caps[leaf] &= kernel_cpu_caps[leaf]; \ + \ + kvm_cpu_caps[leaf] |= kvm_cpu_cap_passthrough; \ + kvm_cpu_caps[leaf] &= (raw_cpuid_get(cpuid) | \ + kvm_cpu_cap_synthesized); \ + kvm_cpu_caps[leaf] |= kvm_cpu_cap_emulated; \ +} while (0) - kvm_cpu_caps[leaf] = mask; +/* + * Assert that the feature bit being declared, e.g. via F(), is in the CPUID + * word that's being initialized. Exempt 0x8000_0001.EDX usage of 0x1.EDX + * features, as AMD duplicated many 0x1.EDX features into 0x8000_0001.EDX. + */ +#define KVM_VALIDATE_CPU_CAP_USAGE(name) \ +do { \ + u32 __leaf = __feature_leaf(X86_FEATURE_##name); \ + \ + BUILD_BUG_ON(__leaf != kvm_cpu_cap_init_in_progress); \ +} while (0) + +#define F(name) \ +({ \ + KVM_VALIDATE_CPU_CAP_USAGE(name); \ + kvm_cpu_cap_features |= feature_bit(name); \ +}) - __kvm_cpu_cap_mask(leaf); -} +/* Scattered Flag - For features that are scattered by cpufeatures.h. */ +#define SCATTERED_F(name) \ +({ \ + BUILD_BUG_ON(X86_FEATURE_##name >= MAX_CPU_FEATURES); \ + KVM_VALIDATE_CPU_CAP_USAGE(name); \ + if (boot_cpu_has(X86_FEATURE_##name)) \ + F(name); \ +}) -static __always_inline void kvm_cpu_cap_mask(enum cpuid_leafs leaf, u32 mask) -{ - /* Use kvm_cpu_cap_init_kvm_defined for KVM-only leafs. */ - BUILD_BUG_ON(leaf >= NCAPINTS); +/* Features that KVM supports only on 64-bit kernels. */ +#define X86_64_F(name) \ +({ \ + KVM_VALIDATE_CPU_CAP_USAGE(name); \ + if (IS_ENABLED(CONFIG_X86_64)) \ + F(name); \ +}) - kvm_cpu_caps[leaf] &= mask; +/* + * Emulated Feature - For features that KVM emulates in software irrespective + * of host CPU/kernel support. + */ +#define EMULATED_F(name) \ +({ \ + kvm_cpu_cap_emulated |= feature_bit(name); \ + F(name); \ +}) - __kvm_cpu_cap_mask(leaf); -} +/* + * Synthesized Feature - For features that are synthesized into boot_cpu_data, + * i.e. may not be present in the raw CPUID, but can still be advertised to + * userspace. Primarily used for mitigation related feature flags. + */ +#define SYNTHESIZED_F(name) \ +({ \ + kvm_cpu_cap_synthesized |= feature_bit(name); \ + F(name); \ +}) + +/* + * Passthrough Feature - For features that KVM supports based purely on raw + * hardware CPUID, i.e. that KVM virtualizes even if the host kernel doesn't + * use the feature. Simply force set the feature in KVM's capabilities, raw + * CPUID support will be factored in by kvm_cpu_cap_mask(). + */ +#define PASSTHROUGH_F(name) \ +({ \ + kvm_cpu_cap_passthrough |= feature_bit(name); \ + F(name); \ +}) + +/* + * Aliased Features - For features in 0x8000_0001.EDX that are duplicates of + * identical 0x1.EDX features, and thus are aliased from 0x1 to 0x8000_0001. + */ +#define ALIASED_1_EDX_F(name) \ +({ \ + BUILD_BUG_ON(__feature_leaf(X86_FEATURE_##name) != CPUID_1_EDX); \ + BUILD_BUG_ON(kvm_cpu_cap_init_in_progress != CPUID_8000_0001_EDX); \ + kvm_cpu_cap_features |= feature_bit(name); \ +}) + +/* + * Vendor Features - For features that KVM supports, but are added in later + * because they require additional vendor enabling. + */ +#define VENDOR_F(name) \ +({ \ + KVM_VALIDATE_CPU_CAP_USAGE(name); \ +}) + +/* + * Runtime Features - For features that KVM dynamically sets/clears at runtime, + * e.g. when CR4 changes, but which are never advertised to userspace. + */ +#define RUNTIME_F(name) \ +({ \ + KVM_VALIDATE_CPU_CAP_USAGE(name); \ +}) + +/* + * Undefine the MSR bit macro to avoid token concatenation issues when + * processing X86_FEATURE_SPEC_CTRL_SSBD. + */ +#undef SPEC_CTRL_SSBD + +/* DS is defined by ptrace-abi.h on 32-bit builds. */ +#undef DS void kvm_set_cpu_caps(void) { -#ifdef CONFIG_X86_64 - unsigned int f_gbpages = F(GBPAGES); - unsigned int f_lm = F(LM); - unsigned int f_xfd = F(XFD); -#else - unsigned int f_gbpages = 0; - unsigned int f_lm = 0; - unsigned int f_xfd = 0; -#endif memset(kvm_cpu_caps, 0, sizeof(kvm_cpu_caps)); BUILD_BUG_ON(sizeof(kvm_cpu_caps) - (NKVMCAPINTS * sizeof(*kvm_cpu_caps)) > sizeof(boot_cpu_data.x86_capability)); - memcpy(&kvm_cpu_caps, &boot_cpu_data.x86_capability, - sizeof(kvm_cpu_caps) - (NKVMCAPINTS * sizeof(*kvm_cpu_caps))); - - kvm_cpu_cap_mask(CPUID_1_ECX, + kvm_cpu_cap_init(CPUID_1_ECX, + F(XMM3), + F(PCLMULQDQ), + VENDOR_F(DTES64), /* * NOTE: MONITOR (and MWAIT) are emulated as NOP, but *not* - * advertised to guests via CPUID! + * advertised to guests via CPUID! MWAIT is also technically a + * runtime flag thanks to IA32_MISC_ENABLES; mark it as such so + * that KVM is aware that it's a known, unadvertised flag. */ - F(XMM3) | F(PCLMULQDQ) | 0 /* DTES64, MONITOR */ | - 0 /* DS-CPL, VMX, SMX, EST */ | - 0 /* TM2 */ | F(SSSE3) | 0 /* CNXT-ID */ | 0 /* Reserved */ | - F(FMA) | F(CX16) | 0 /* xTPR Update */ | F(PDCM) | - F(PCID) | 0 /* Reserved, DCA */ | F(XMM4_1) | - F(XMM4_2) | F(X2APIC) | F(MOVBE) | F(POPCNT) | - 0 /* Reserved*/ | F(AES) | F(XSAVE) | 0 /* OSXSAVE */ | F(AVX) | - F(F16C) | F(RDRAND) + RUNTIME_F(MWAIT), + /* DS-CPL */ + VENDOR_F(VMX), + /* SMX, EST */ + /* TM2 */ + F(SSSE3), + /* CNXT-ID */ + /* Reserved */ + F(FMA), + F(CX16), + /* xTPR Update */ + F(PDCM), + F(PCID), + /* Reserved, DCA */ + F(XMM4_1), + F(XMM4_2), + EMULATED_F(X2APIC), + F(MOVBE), + F(POPCNT), + EMULATED_F(TSC_DEADLINE_TIMER), + F(AES), + F(XSAVE), + RUNTIME_F(OSXSAVE), + F(AVX), + F(F16C), + F(RDRAND), + EMULATED_F(HYPERVISOR), ); - /* KVM emulates x2apic in software irrespective of host support. */ - kvm_cpu_cap_set(X86_FEATURE_X2APIC); - - kvm_cpu_cap_mask(CPUID_1_EDX, - F(FPU) | F(VME) | F(DE) | F(PSE) | - F(TSC) | F(MSR) | F(PAE) | F(MCE) | - F(CX8) | F(APIC) | 0 /* Reserved */ | F(SEP) | - F(MTRR) | F(PGE) | F(MCA) | F(CMOV) | - F(PAT) | F(PSE36) | 0 /* PSN */ | F(CLFLUSH) | - 0 /* Reserved, DS, ACPI */ | F(MMX) | - F(FXSR) | F(XMM) | F(XMM2) | F(SELFSNOOP) | - 0 /* HTT, TM, Reserved, PBE */ + + kvm_cpu_cap_init(CPUID_1_EDX, + F(FPU), + F(VME), + F(DE), + F(PSE), + F(TSC), + F(MSR), + F(PAE), + F(MCE), + F(CX8), + F(APIC), + /* Reserved */ + F(SEP), + F(MTRR), + F(PGE), + F(MCA), + F(CMOV), + F(PAT), + F(PSE36), + /* PSN */ + F(CLFLUSH), + /* Reserved */ + VENDOR_F(DS), + /* ACPI */ + F(MMX), + F(FXSR), + F(XMM), + F(XMM2), + F(SELFSNOOP), + /* HTT, TM, Reserved, PBE */ ); - kvm_cpu_cap_mask(CPUID_7_0_EBX, - F(FSGSBASE) | F(SGX) | F(BMI1) | F(HLE) | F(AVX2) | - F(FDP_EXCPTN_ONLY) | F(SMEP) | F(BMI2) | F(ERMS) | F(INVPCID) | - F(RTM) | F(ZERO_FCS_FDS) | 0 /*MPX*/ | F(AVX512F) | - F(AVX512DQ) | F(RDSEED) | F(ADX) | F(SMAP) | F(AVX512IFMA) | - F(CLFLUSHOPT) | F(CLWB) | 0 /*INTEL_PT*/ | F(AVX512PF) | - F(AVX512ER) | F(AVX512CD) | F(SHA_NI) | F(AVX512BW) | - F(AVX512VL)); - - kvm_cpu_cap_mask(CPUID_7_ECX, - F(AVX512VBMI) | F(LA57) | F(PKU) | 0 /*OSPKE*/ | F(RDPID) | - F(AVX512_VPOPCNTDQ) | F(UMIP) | F(AVX512_VBMI2) | F(GFNI) | - F(VAES) | F(VPCLMULQDQ) | F(AVX512_VNNI) | F(AVX512_BITALG) | - F(CLDEMOTE) | F(MOVDIRI) | F(MOVDIR64B) | 0 /*WAITPKG*/ | - F(SGX_LC) | F(BUS_LOCK_DETECT) + kvm_cpu_cap_init(CPUID_7_0_EBX, + F(FSGSBASE), + EMULATED_F(TSC_ADJUST), + F(SGX), + F(BMI1), + F(HLE), + F(AVX2), + F(FDP_EXCPTN_ONLY), + F(SMEP), + F(BMI2), + F(ERMS), + F(INVPCID), + F(RTM), + F(ZERO_FCS_FDS), + VENDOR_F(MPX), + F(AVX512F), + F(AVX512DQ), + F(RDSEED), + F(ADX), + F(SMAP), + F(AVX512IFMA), + F(CLFLUSHOPT), + F(CLWB), + VENDOR_F(INTEL_PT), + F(AVX512PF), + F(AVX512ER), + F(AVX512CD), + F(SHA_NI), + F(AVX512BW), + F(AVX512VL), + ); + + kvm_cpu_cap_init(CPUID_7_ECX, + F(AVX512VBMI), + PASSTHROUGH_F(LA57), + F(PKU), + RUNTIME_F(OSPKE), + F(RDPID), + F(AVX512_VPOPCNTDQ), + F(UMIP), + F(AVX512_VBMI2), + F(GFNI), + F(VAES), + F(VPCLMULQDQ), + F(AVX512_VNNI), + F(AVX512_BITALG), + F(CLDEMOTE), + F(MOVDIRI), + F(MOVDIR64B), + VENDOR_F(WAITPKG), + F(SGX_LC), + F(BUS_LOCK_DETECT), ); - /* Set LA57 based on hardware capability. */ - if (cpuid_ecx(7) & F(LA57)) - kvm_cpu_cap_set(X86_FEATURE_LA57); /* * PKU not yet implemented for shadow paging and requires OSPKE @@ -655,80 +940,163 @@ void kvm_set_cpu_caps(void) if (!tdp_enabled || !boot_cpu_has(X86_FEATURE_OSPKE)) kvm_cpu_cap_clear(X86_FEATURE_PKU); - kvm_cpu_cap_mask(CPUID_7_EDX, - F(AVX512_4VNNIW) | F(AVX512_4FMAPS) | F(SPEC_CTRL) | - F(SPEC_CTRL_SSBD) | F(ARCH_CAPABILITIES) | F(INTEL_STIBP) | - F(MD_CLEAR) | F(AVX512_VP2INTERSECT) | F(FSRM) | - F(SERIALIZE) | F(TSXLDTRK) | F(AVX512_FP16) | - F(AMX_TILE) | F(AMX_INT8) | F(AMX_BF16) | F(FLUSH_L1D) + kvm_cpu_cap_init(CPUID_7_EDX, + F(AVX512_4VNNIW), + F(AVX512_4FMAPS), + F(SPEC_CTRL), + F(SPEC_CTRL_SSBD), + EMULATED_F(ARCH_CAPABILITIES), + F(INTEL_STIBP), + F(MD_CLEAR), + F(AVX512_VP2INTERSECT), + F(FSRM), + F(SERIALIZE), + F(TSXLDTRK), + F(AVX512_FP16), + F(AMX_TILE), + F(AMX_INT8), + F(AMX_BF16), + F(FLUSH_L1D), ); - /* TSC_ADJUST and ARCH_CAPABILITIES are emulated in software. */ - kvm_cpu_cap_set(X86_FEATURE_TSC_ADJUST); - kvm_cpu_cap_set(X86_FEATURE_ARCH_CAPABILITIES); - - if (boot_cpu_has(X86_FEATURE_IBPB) && boot_cpu_has(X86_FEATURE_IBRS)) + if (boot_cpu_has(X86_FEATURE_AMD_IBPB_RET) && + boot_cpu_has(X86_FEATURE_AMD_IBPB) && + boot_cpu_has(X86_FEATURE_AMD_IBRS)) kvm_cpu_cap_set(X86_FEATURE_SPEC_CTRL); if (boot_cpu_has(X86_FEATURE_STIBP)) kvm_cpu_cap_set(X86_FEATURE_INTEL_STIBP); if (boot_cpu_has(X86_FEATURE_AMD_SSBD)) kvm_cpu_cap_set(X86_FEATURE_SPEC_CTRL_SSBD); - kvm_cpu_cap_mask(CPUID_7_1_EAX, - F(AVX_VNNI) | F(AVX512_BF16) | F(CMPCCXADD) | - F(FZRM) | F(FSRS) | F(FSRC) | - F(AMX_FP16) | F(AVX_IFMA) | F(LAM) + kvm_cpu_cap_init(CPUID_7_1_EAX, + F(SHA512), + F(SM3), + F(SM4), + F(AVX_VNNI), + F(AVX512_BF16), + F(CMPCCXADD), + F(FZRM), + F(FSRS), + F(FSRC), + F(WRMSRNS), + F(AMX_FP16), + F(AVX_IFMA), + F(LAM), + ); + + kvm_cpu_cap_init(CPUID_7_1_EDX, + F(AVX_VNNI_INT8), + F(AVX_NE_CONVERT), + F(AMX_COMPLEX), + F(AVX_VNNI_INT16), + F(PREFETCHITI), + F(AVX10), ); - kvm_cpu_cap_init_kvm_defined(CPUID_7_1_EDX, - F(AVX_VNNI_INT8) | F(AVX_NE_CONVERT) | F(PREFETCHITI) | - F(AMX_COMPLEX) + kvm_cpu_cap_init(CPUID_7_2_EDX, + F(INTEL_PSFD), + F(IPRED_CTRL), + F(RRSBA_CTRL), + F(DDPD_U), + F(BHI_CTRL), + F(MCDT_NO), ); - kvm_cpu_cap_init_kvm_defined(CPUID_7_2_EDX, - F(INTEL_PSFD) | F(IPRED_CTRL) | F(RRSBA_CTRL) | F(DDPD_U) | - F(BHI_CTRL) | F(MCDT_NO) + kvm_cpu_cap_init(CPUID_D_1_EAX, + F(XSAVEOPT), + F(XSAVEC), + F(XGETBV1), + F(XSAVES), + X86_64_F(XFD), ); - kvm_cpu_cap_mask(CPUID_D_1_EAX, - F(XSAVEOPT) | F(XSAVEC) | F(XGETBV1) | F(XSAVES) | f_xfd + kvm_cpu_cap_init(CPUID_12_EAX, + SCATTERED_F(SGX1), + SCATTERED_F(SGX2), + SCATTERED_F(SGX_EDECCSSA), ); - kvm_cpu_cap_init_kvm_defined(CPUID_12_EAX, - SF(SGX1) | SF(SGX2) | SF(SGX_EDECCSSA) + kvm_cpu_cap_init(CPUID_24_0_EBX, + F(AVX10_128), + F(AVX10_256), + F(AVX10_512), ); - kvm_cpu_cap_mask(CPUID_8000_0001_ECX, - F(LAHF_LM) | F(CMP_LEGACY) | 0 /*SVM*/ | 0 /* ExtApicSpace */ | - F(CR8_LEGACY) | F(ABM) | F(SSE4A) | F(MISALIGNSSE) | - F(3DNOWPREFETCH) | F(OSVW) | 0 /* IBS */ | F(XOP) | - 0 /* SKINIT, WDT, LWP */ | F(FMA4) | F(TBM) | - F(TOPOEXT) | 0 /* PERFCTR_CORE */ + kvm_cpu_cap_init(CPUID_8000_0001_ECX, + F(LAHF_LM), + F(CMP_LEGACY), + VENDOR_F(SVM), + /* ExtApicSpace */ + F(CR8_LEGACY), + F(ABM), + F(SSE4A), + F(MISALIGNSSE), + F(3DNOWPREFETCH), + F(OSVW), + /* IBS */ + F(XOP), + /* SKINIT, WDT, LWP */ + F(FMA4), + F(TBM), + F(TOPOEXT), + VENDOR_F(PERFCTR_CORE), ); - kvm_cpu_cap_mask(CPUID_8000_0001_EDX, - F(FPU) | F(VME) | F(DE) | F(PSE) | - F(TSC) | F(MSR) | F(PAE) | F(MCE) | - F(CX8) | F(APIC) | 0 /* Reserved */ | F(SYSCALL) | - F(MTRR) | F(PGE) | F(MCA) | F(CMOV) | - F(PAT) | F(PSE36) | 0 /* Reserved */ | - F(NX) | 0 /* Reserved */ | F(MMXEXT) | F(MMX) | - F(FXSR) | F(FXSR_OPT) | f_gbpages | F(RDTSCP) | - 0 /* Reserved */ | f_lm | F(3DNOWEXT) | F(3DNOW) + kvm_cpu_cap_init(CPUID_8000_0001_EDX, + ALIASED_1_EDX_F(FPU), + ALIASED_1_EDX_F(VME), + ALIASED_1_EDX_F(DE), + ALIASED_1_EDX_F(PSE), + ALIASED_1_EDX_F(TSC), + ALIASED_1_EDX_F(MSR), + ALIASED_1_EDX_F(PAE), + ALIASED_1_EDX_F(MCE), + ALIASED_1_EDX_F(CX8), + ALIASED_1_EDX_F(APIC), + /* Reserved */ + F(SYSCALL), + ALIASED_1_EDX_F(MTRR), + ALIASED_1_EDX_F(PGE), + ALIASED_1_EDX_F(MCA), + ALIASED_1_EDX_F(CMOV), + ALIASED_1_EDX_F(PAT), + ALIASED_1_EDX_F(PSE36), + /* Reserved */ + F(NX), + /* Reserved */ + F(MMXEXT), + ALIASED_1_EDX_F(MMX), + ALIASED_1_EDX_F(FXSR), + F(FXSR_OPT), + X86_64_F(GBPAGES), + F(RDTSCP), + /* Reserved */ + X86_64_F(LM), + F(3DNOWEXT), + F(3DNOW), ); if (!tdp_enabled && IS_ENABLED(CONFIG_X86_64)) kvm_cpu_cap_set(X86_FEATURE_GBPAGES); - kvm_cpu_cap_init_kvm_defined(CPUID_8000_0007_EDX, - SF(CONSTANT_TSC) + kvm_cpu_cap_init(CPUID_8000_0007_EDX, + SCATTERED_F(CONSTANT_TSC), ); - kvm_cpu_cap_mask(CPUID_8000_0008_EBX, - F(CLZERO) | F(XSAVEERPTR) | - F(WBNOINVD) | F(AMD_IBPB) | F(AMD_IBRS) | F(AMD_SSBD) | F(VIRT_SSBD) | - F(AMD_SSB_NO) | F(AMD_STIBP) | F(AMD_STIBP_ALWAYS_ON) | - F(AMD_PSFD) + kvm_cpu_cap_init(CPUID_8000_0008_EBX, + F(CLZERO), + F(XSAVEERPTR), + F(WBNOINVD), + F(AMD_IBPB), + F(AMD_IBRS), + F(AMD_SSBD), + F(VIRT_SSBD), + F(AMD_SSB_NO), + F(AMD_STIBP), + F(AMD_STIBP_ALWAYS_ON), + F(AMD_IBRS_SAME_MODE), + F(AMD_PSFD), + F(AMD_IBPB_RET), ); /* @@ -736,8 +1104,12 @@ void kvm_set_cpu_caps(void) * arch/x86/kernel/cpu/bugs.c is kind enough to * record that in cpufeatures so use them. */ - if (boot_cpu_has(X86_FEATURE_IBPB)) + if (boot_cpu_has(X86_FEATURE_IBPB)) { kvm_cpu_cap_set(X86_FEATURE_AMD_IBPB); + if (boot_cpu_has(X86_FEATURE_SPEC_CTRL) && + !boot_cpu_has_bug(X86_BUG_EIBRS_PBRSB)) + kvm_cpu_cap_set(X86_FEATURE_AMD_IBPB_RET); + } if (boot_cpu_has(X86_FEATURE_IBRS)) kvm_cpu_cap_set(X86_FEATURE_AMD_IBRS); if (boot_cpu_has(X86_FEATURE_STIBP)) @@ -754,50 +1126,77 @@ void kvm_set_cpu_caps(void) !boot_cpu_has(X86_FEATURE_AMD_SSBD)) kvm_cpu_cap_set(X86_FEATURE_VIRT_SSBD); - /* - * Hide all SVM features by default, SVM will set the cap bits for - * features it emulates and/or exposes for L1. - */ - kvm_cpu_cap_mask(CPUID_8000_000A_EDX, 0); - - kvm_cpu_cap_mask(CPUID_8000_001F_EAX, - 0 /* SME */ | F(SEV) | 0 /* VM_PAGE_FLUSH */ | F(SEV_ES) | - F(SME_COHERENT)); + /* All SVM features required additional vendor module enabling. */ + kvm_cpu_cap_init(CPUID_8000_000A_EDX, + VENDOR_F(NPT), + VENDOR_F(VMCBCLEAN), + VENDOR_F(FLUSHBYASID), + VENDOR_F(NRIPS), + VENDOR_F(TSCRATEMSR), + VENDOR_F(V_VMSAVE_VMLOAD), + VENDOR_F(LBRV), + VENDOR_F(PAUSEFILTER), + VENDOR_F(PFTHRESHOLD), + VENDOR_F(VGIF), + VENDOR_F(VNMI), + VENDOR_F(SVME_ADDR_CHK), + ); - kvm_cpu_cap_mask(CPUID_8000_0021_EAX, - F(NO_NESTED_DATA_BP) | F(LFENCE_RDTSC) | 0 /* SmmPgCfgLock */ | - F(NULL_SEL_CLR_BASE) | F(AUTOIBRS) | 0 /* PrefetchCtlMsr */ | - F(WRMSR_XX_BASE_NS) + kvm_cpu_cap_init(CPUID_8000_001F_EAX, + VENDOR_F(SME), + VENDOR_F(SEV), + /* VM_PAGE_FLUSH */ + VENDOR_F(SEV_ES), + F(SME_COHERENT), ); - kvm_cpu_cap_check_and_set(X86_FEATURE_SBPB); - kvm_cpu_cap_check_and_set(X86_FEATURE_IBPB_BRTYPE); - kvm_cpu_cap_check_and_set(X86_FEATURE_SRSO_NO); + kvm_cpu_cap_init(CPUID_8000_0021_EAX, + F(NO_NESTED_DATA_BP), + F(WRMSR_XX_BASE_NS), + /* + * Synthesize "LFENCE is serializing" into the AMD-defined entry + * in KVM's supported CPUID, i.e. if the feature is reported as + * supported by the kernel. LFENCE_RDTSC was a Linux-defined + * synthetic feature long before AMD joined the bandwagon, e.g. + * LFENCE is serializing on most CPUs that support SSE2. On + * CPUs that don't support AMD's leaf, ANDing with the raw host + * CPUID will drop the flags, and reporting support in AMD's + * leaf can make it easier for userspace to detect the feature. + */ + SYNTHESIZED_F(LFENCE_RDTSC), + /* SmmPgCfgLock */ + F(NULL_SEL_CLR_BASE), + /* UpperAddressIgnore */ + F(AUTOIBRS), + F(PREFETCHI), + EMULATED_F(NO_SMM_CTL_MSR), + /* PrefetchCtlMsr */ + /* GpOnUserCpuid */ + /* EPSF */ + SYNTHESIZED_F(SBPB), + SYNTHESIZED_F(IBPB_BRTYPE), + SYNTHESIZED_F(SRSO_NO), + F(SRSO_USER_KERNEL_NO), + ); - kvm_cpu_cap_init_kvm_defined(CPUID_8000_0022_EAX, - F(PERFMON_V2) + kvm_cpu_cap_init(CPUID_8000_0022_EAX, + F(PERFMON_V2), ); - /* - * Synthesize "LFENCE is serializing" into the AMD-defined entry in - * KVM's supported CPUID if the feature is reported as supported by the - * kernel. LFENCE_RDTSC was a Linux-defined synthetic feature long - * before AMD joined the bandwagon, e.g. LFENCE is serializing on most - * CPUs that support SSE2. On CPUs that don't support AMD's leaf, - * kvm_cpu_cap_mask() will unfortunately drop the flag due to ANDing - * the mask with the raw host CPUID, and reporting support in AMD's - * leaf can make it easier for userspace to detect the feature. - */ - if (cpu_feature_enabled(X86_FEATURE_LFENCE_RDTSC)) - kvm_cpu_cap_set(X86_FEATURE_LFENCE_RDTSC); if (!static_cpu_has_bug(X86_BUG_NULL_SEG)) kvm_cpu_cap_set(X86_FEATURE_NULL_SEL_CLR_BASE); - kvm_cpu_cap_set(X86_FEATURE_NO_SMM_CTL_MSR); - kvm_cpu_cap_mask(CPUID_C000_0001_EDX, - F(XSTORE) | F(XSTORE_EN) | F(XCRYPT) | F(XCRYPT_EN) | - F(ACE2) | F(ACE2_EN) | F(PHE) | F(PHE_EN) | - F(PMM) | F(PMM_EN) + kvm_cpu_cap_init(CPUID_C000_0001_EDX, + F(XSTORE), + F(XSTORE_EN), + F(XCRYPT), + F(XCRYPT_EN), + F(ACE2), + F(ACE2_EN), + F(PHE), + F(PHE_EN), + F(PMM), + F(PMM_EN), ); /* @@ -817,6 +1216,16 @@ void kvm_set_cpu_caps(void) } EXPORT_SYMBOL_GPL(kvm_set_cpu_caps); +#undef F +#undef SCATTERED_F +#undef X86_64_F +#undef EMULATED_F +#undef SYNTHESIZED_F +#undef PASSTHROUGH_F +#undef ALIASED_1_EDX_F +#undef VENDOR_F +#undef RUNTIME_F + struct kvm_cpuid_array { struct kvm_cpuid_entry2 *entries; int maxnent; @@ -874,14 +1283,11 @@ static struct kvm_cpuid_entry2 *do_host_cpuid(struct kvm_cpuid_array *array, return entry; } -static int __do_cpuid_func_emulated(struct kvm_cpuid_array *array, u32 func) +static int cpuid_func_emulated(struct kvm_cpuid_entry2 *entry, u32 func, + bool include_partially_emulated) { - struct kvm_cpuid_entry2 *entry; - - if (array->nent >= array->maxnent) - return -E2BIG; + memset(entry, 0, sizeof(*entry)); - entry = &array->entries[array->nent]; entry->function = func; entry->index = 0; entry->flags = 0; @@ -889,23 +1295,37 @@ static int __do_cpuid_func_emulated(struct kvm_cpuid_array *array, u32 func) switch (func) { case 0: entry->eax = 7; - ++array->nent; - break; + return 1; case 1: - entry->ecx = F(MOVBE); - ++array->nent; - break; + entry->ecx = feature_bit(MOVBE); + /* + * KVM allows userspace to enumerate MONITOR+MWAIT support to + * the guest, but the MWAIT feature flag is never advertised + * to userspace because MONITOR+MWAIT aren't virtualized by + * hardware, can't be faithfully emulated in software (KVM + * emulates them as NOPs), and allowing the guest to execute + * them natively requires enabling a per-VM capability. + */ + if (include_partially_emulated) + entry->ecx |= feature_bit(MWAIT); + return 1; case 7: entry->flags |= KVM_CPUID_FLAG_SIGNIFCANT_INDEX; entry->eax = 0; if (kvm_cpu_cap_has(X86_FEATURE_RDTSCP)) - entry->ecx = F(RDPID); - ++array->nent; - break; + entry->ecx = feature_bit(RDPID); + return 1; default: - break; + return 0; } +} +static int __do_cpuid_func_emulated(struct kvm_cpuid_array *array, u32 func) +{ + if (array->nent >= array->maxnent) + return -E2BIG; + + array->nent += cpuid_func_emulated(&array->entries[array->nent], func, false); return 0; } @@ -926,7 +1346,7 @@ static inline int __do_cpuid_func(struct kvm_cpuid_array *array, u32 function) switch (function) { case 0: /* Limited to the highest leaf implemented in KVM. */ - entry->eax = min(entry->eax, 0x1fU); + entry->eax = min(entry->eax, 0x24U); break; case 1: cpuid_entry_override(entry, CPUID_1_EDX); @@ -999,8 +1419,8 @@ static inline int __do_cpuid_func(struct kvm_cpuid_array *array, u32 function) } break; case 0xa: { /* Architectural Performance Monitoring */ - union cpuid10_eax eax; - union cpuid10_edx edx; + union cpuid10_eax eax = { }; + union cpuid10_edx edx = { }; if (!enable_pmu || !static_cpu_has(X86_FEATURE_ARCH_PERFMON)) { entry->eax = entry->ebx = entry->ecx = entry->edx = 0; @@ -1016,8 +1436,6 @@ static inline int __do_cpuid_func(struct kvm_cpuid_array *array, u32 function) if (kvm_pmu_cap.version) edx.split.anythread_deprecated = 1; - edx.split.reserved1 = 0; - edx.split.reserved2 = 0; entry->eax = eax.full; entry->ebx = kvm_pmu_cap.events_mask; @@ -1049,7 +1467,7 @@ static inline int __do_cpuid_func(struct kvm_cpuid_array *array, u32 function) goto out; cpuid_entry_override(entry, CPUID_D_1_EAX); - if (entry->eax & (F(XSAVES)|F(XSAVEC))) + if (entry->eax & (feature_bit(XSAVES) | feature_bit(XSAVEC))) entry->ebx = xstate_required_size(permitted_xcr0 | permitted_xss, true); else { @@ -1151,6 +1569,28 @@ static inline int __do_cpuid_func(struct kvm_cpuid_array *array, u32 function) break; } break; + case 0x24: { + u8 avx10_version; + + if (!kvm_cpu_cap_has(X86_FEATURE_AVX10)) { + entry->eax = entry->ebx = entry->ecx = entry->edx = 0; + break; + } + + /* + * The AVX10 version is encoded in EBX[7:0]. Note, the version + * is guaranteed to be >=1 if AVX10 is supported. Note #2, the + * version needs to be captured before overriding EBX features! + */ + avx10_version = min_t(u8, entry->ebx & 0xff, 1); + cpuid_entry_override(entry, CPUID_24_0_EBX); + entry->ebx |= avx10_version; + + entry->eax = 0; + entry->ecx = 0; + entry->edx = 0; + break; + } case KVM_CPUID_SIGNATURE: { const u32 *sigptr = (const u32 *)KVM_SIGNATURE; entry->eax = KVM_CPUID_FEATURES; @@ -1221,9 +1661,22 @@ static inline int __do_cpuid_func(struct kvm_cpuid_array *array, u32 function) entry->eax = entry->ebx = entry->ecx = 0; break; case 0x80000008: { - unsigned g_phys_as = (entry->eax >> 16) & 0xff; - unsigned virt_as = max((entry->eax >> 8) & 0xff, 48U); - unsigned phys_as = entry->eax & 0xff; + /* + * GuestPhysAddrSize (EAX[23:16]) is intended for software + * use. + * + * KVM's ABI is to report the effective MAXPHYADDR for the + * guest in PhysAddrSize (phys_as), and the maximum + * *addressable* GPA in GuestPhysAddrSize (g_phys_as). + * + * GuestPhysAddrSize is valid if and only if TDP is enabled, + * in which case the max GPA that can be addressed by KVM may + * be less than the max GPA that can be legally generated by + * the guest, e.g. if MAXPHYADDR>48 but the CPU doesn't + * support 5-level TDP. + */ + unsigned int virt_as = max((entry->eax >> 8) & 0xff, 48U); + unsigned int phys_as, g_phys_as; /* * If TDP (NPT) is disabled use the adjusted host MAXPHYADDR as @@ -1231,16 +1684,24 @@ static inline int __do_cpuid_func(struct kvm_cpuid_array *array, u32 function) * reductions in MAXPHYADDR for memory encryption affect shadow * paging, too. * - * If TDP is enabled but an explicit guest MAXPHYADDR is not - * provided, use the raw bare metal MAXPHYADDR as reductions to - * the HPAs do not affect GPAs. + * If TDP is enabled, use the raw bare metal MAXPHYADDR as + * reductions to the HPAs do not affect GPAs. The max + * addressable GPA is the same as the max effective GPA, except + * that it's capped at 48 bits if 5-level TDP isn't supported + * (hardware processes bits 51:48 only when walking the fifth + * level page table). */ - if (!tdp_enabled) - g_phys_as = boot_cpu_data.x86_phys_bits; - else if (!g_phys_as) + if (!tdp_enabled) { + phys_as = boot_cpu_data.x86_phys_bits; + g_phys_as = 0; + } else { + phys_as = entry->eax & 0xff; g_phys_as = phys_as; + if (kvm_mmu_get_max_tdp_level() < 5) + g_phys_as = min(g_phys_as, 48U); + } - entry->eax = g_phys_as | (virt_as << 8); + entry->eax = phys_as | (virt_as << 8) | (g_phys_as << 16); entry->ecx &= ~(GENMASK(31, 16) | GENMASK(11, 8)); entry->edx = 0; cpuid_entry_override(entry, CPUID_8000_0008_EBX); @@ -1292,23 +1753,17 @@ static inline int __do_cpuid_func(struct kvm_cpuid_array *array, u32 function) break; /* AMD Extended Performance Monitoring and Debug */ case 0x80000022: { - union cpuid_0x80000022_ebx ebx; + union cpuid_0x80000022_ebx ebx = { }; entry->ecx = entry->edx = 0; if (!enable_pmu || !kvm_cpu_cap_has(X86_FEATURE_PERFMON_V2)) { - entry->eax = entry->ebx; + entry->eax = entry->ebx = 0; break; } cpuid_entry_override(entry, CPUID_8000_0022_EAX); - if (kvm_cpu_cap_has(X86_FEATURE_PERFMON_V2)) - ebx.split.num_core_pmc = kvm_pmu_cap.num_counters_gp; - else if (kvm_cpu_cap_has(X86_FEATURE_PERFCTR_CORE)) - ebx.split.num_core_pmc = AMD64_NUM_COUNTERS_CORE; - else - ebx.split.num_core_pmc = AMD64_NUM_COUNTERS; - + ebx.split.num_core_pmc = kvm_pmu_cap.num_counters_gp; entry->ebx = ebx.full; break; } @@ -1443,22 +1898,6 @@ out_free: return r; } -struct kvm_cpuid_entry2 *kvm_find_cpuid_entry_index(struct kvm_vcpu *vcpu, - u32 function, u32 index) -{ - return cpuid_entry2_find(vcpu->arch.cpuid_entries, vcpu->arch.cpuid_nent, - function, index); -} -EXPORT_SYMBOL_GPL(kvm_find_cpuid_entry_index); - -struct kvm_cpuid_entry2 *kvm_find_cpuid_entry(struct kvm_vcpu *vcpu, - u32 function) -{ - return cpuid_entry2_find(vcpu->arch.cpuid_entries, vcpu->arch.cpuid_nent, - function, KVM_CPUID_INDEX_NOT_SIGNIFICANT); -} -EXPORT_SYMBOL_GPL(kvm_find_cpuid_entry); - /* * Intel CPUID semantics treats any query for an out-of-range leaf as if the * highest basic leaf (i.e. CPUID.0H:EAX) were requested. AMD CPUID semantics @@ -1534,6 +1973,9 @@ bool kvm_cpuid(struct kvm_vcpu *vcpu, u32 *eax, u32 *ebx, struct kvm_cpuid_entry2 *entry; bool exact, used_max_basic = false; + if (vcpu->arch.cpuid_dynamic_bits_dirty) + kvm_update_cpuid_runtime(vcpu); + entry = kvm_find_cpuid_entry_index(vcpu, function, index); exact = !!entry; @@ -1549,12 +1991,29 @@ bool kvm_cpuid(struct kvm_vcpu *vcpu, u32 *eax, u32 *ebx, *edx = entry->edx; if (function == 7 && index == 0) { u64 data; - if (!__kvm_get_msr(vcpu, MSR_IA32_TSX_CTRL, &data, true) && + if ((*ebx & (feature_bit(RTM) | feature_bit(HLE))) && + !__kvm_get_msr(vcpu, MSR_IA32_TSX_CTRL, &data, true) && (data & TSX_CTRL_CPUID_CLEAR)) - *ebx &= ~(F(RTM) | F(HLE)); + *ebx &= ~(feature_bit(RTM) | feature_bit(HLE)); } else if (function == 0x80000007) { if (kvm_hv_invtsc_suppressed(vcpu)) - *edx &= ~SF(CONSTANT_TSC); + *edx &= ~feature_bit(CONSTANT_TSC); + } else if (IS_ENABLED(CONFIG_KVM_XEN) && + kvm_xen_is_tsc_leaf(vcpu, function)) { + /* + * Update guest TSC frequency information if necessary. + * Ignore failures, there is no sane value that can be + * provided if KVM can't get the TSC frequency. + */ + if (kvm_check_request(KVM_REQ_CLOCK_UPDATE, vcpu)) + kvm_guest_time_update(vcpu); + + if (index == 1) { + *ecx = vcpu->arch.pvclock_tsc_mul; + *edx = vcpu->arch.pvclock_tsc_shift; + } else if (index == 2) { + *eax = vcpu->arch.hw_tsc_khz; + } } } else { *eax = *ebx = *ecx = *edx = 0; diff --git a/arch/x86/kvm/cpuid.h b/arch/x86/kvm/cpuid.h index 856e3037e74f..d3f5ae15a7ca 100644 --- a/arch/x86/kvm/cpuid.h +++ b/arch/x86/kvm/cpuid.h @@ -2,7 +2,6 @@ #ifndef ARCH_X86_KVM_CPUID_H #define ARCH_X86_KVM_CPUID_H -#include "x86.h" #include "reverse_cpuid.h" #include <asm/cpu.h> #include <asm/processor.h> @@ -11,12 +10,35 @@ extern u32 kvm_cpu_caps[NR_KVM_CPU_CAPS] __read_mostly; void kvm_set_cpu_caps(void); -void kvm_update_cpuid_runtime(struct kvm_vcpu *vcpu); -void kvm_update_pv_runtime(struct kvm_vcpu *vcpu); -struct kvm_cpuid_entry2 *kvm_find_cpuid_entry_index(struct kvm_vcpu *vcpu, - u32 function, u32 index); -struct kvm_cpuid_entry2 *kvm_find_cpuid_entry(struct kvm_vcpu *vcpu, - u32 function); +void kvm_vcpu_after_set_cpuid(struct kvm_vcpu *vcpu); +struct kvm_cpuid_entry2 *kvm_find_cpuid_entry2(struct kvm_cpuid_entry2 *entries, + int nent, u32 function, u64 index); +/* + * Magic value used by KVM when querying userspace-provided CPUID entries and + * doesn't care about the CPIUD index because the index of the function in + * question is not significant. Note, this magic value must have at least one + * bit set in bits[63:32] and must be consumed as a u64 by kvm_find_cpuid_entry2() + * to avoid false positives when processing guest CPUID input. + * + * KVM_CPUID_INDEX_NOT_SIGNIFICANT should never be used directly outside of + * kvm_find_cpuid_entry2() and kvm_find_cpuid_entry(). + */ +#define KVM_CPUID_INDEX_NOT_SIGNIFICANT -1ull + +static inline struct kvm_cpuid_entry2 *kvm_find_cpuid_entry_index(struct kvm_vcpu *vcpu, + u32 function, u32 index) +{ + return kvm_find_cpuid_entry2(vcpu->arch.cpuid_entries, vcpu->arch.cpuid_nent, + function, index); +} + +static inline struct kvm_cpuid_entry2 *kvm_find_cpuid_entry(struct kvm_vcpu *vcpu, + u32 function) +{ + return kvm_find_cpuid_entry2(vcpu->arch.cpuid_entries, vcpu->arch.cpuid_nent, + function, KVM_CPUID_INDEX_NOT_SIGNIFICANT); +} + int kvm_dev_ioctl_get_cpuid(struct kvm_cpuid2 *cpuid, struct kvm_cpuid_entry2 __user *entries, unsigned int type); @@ -32,9 +54,11 @@ int kvm_vcpu_ioctl_get_cpuid2(struct kvm_vcpu *vcpu, bool kvm_cpuid(struct kvm_vcpu *vcpu, u32 *eax, u32 *ebx, u32 *ecx, u32 *edx, bool exact_only); +void __init kvm_init_xstate_sizes(void); u32 xstate_required_size(u64 xstate_bv, bool compacted); int cpuid_query_maxphyaddr(struct kvm_vcpu *vcpu); +int cpuid_query_maxguestphyaddr(struct kvm_vcpu *vcpu); u64 kvm_vcpu_reserved_gpa_bits_raw(struct kvm_vcpu *vcpu); static inline int cpuid_maxphyaddr(struct kvm_vcpu *vcpu) @@ -67,57 +91,48 @@ static __always_inline void cpuid_entry_override(struct kvm_cpuid_entry2 *entry, *reg = kvm_cpu_caps[leaf]; } -static __always_inline u32 *guest_cpuid_get_register(struct kvm_vcpu *vcpu, - unsigned int x86_feature) +static __always_inline bool guest_cpuid_has(struct kvm_vcpu *vcpu, + unsigned int x86_feature) { const struct cpuid_reg cpuid = x86_feature_cpuid(x86_feature); struct kvm_cpuid_entry2 *entry; + u32 *reg; + + /* + * XSAVES is a special snowflake. Due to lack of a dedicated intercept + * on SVM, KVM must assume that XSAVES (and thus XRSTORS) is usable by + * the guest if the host supports XSAVES and *XSAVE* is exposed to the + * guest. Because the guest can execute XSAVES and XRSTORS, i.e. can + * indirectly consume XSS, KVM must ensure XSS is zeroed when running + * the guest, i.e. must set XSAVES in vCPU capabilities. But to reject + * direct XSS reads and writes (to minimize the virtualization hole and + * honor userspace's CPUID), KVM needs to check the raw guest CPUID, + * not KVM's view of guest capabilities. + * + * For all other features, guest capabilities are accurate. Expand + * this allowlist with extreme vigilance. + */ + BUILD_BUG_ON(x86_feature != X86_FEATURE_XSAVES); entry = kvm_find_cpuid_entry_index(vcpu, cpuid.function, cpuid.index); if (!entry) return NULL; - return __cpuid_entry_get_reg(entry, cpuid.reg); -} - -static __always_inline bool guest_cpuid_has(struct kvm_vcpu *vcpu, - unsigned int x86_feature) -{ - u32 *reg; - - reg = guest_cpuid_get_register(vcpu, x86_feature); + reg = __cpuid_entry_get_reg(entry, cpuid.reg); if (!reg) return false; return *reg & __feature_bit(x86_feature); } -static __always_inline void guest_cpuid_clear(struct kvm_vcpu *vcpu, - unsigned int x86_feature) +static inline bool guest_cpuid_is_amd_compatible(struct kvm_vcpu *vcpu) { - u32 *reg; - - reg = guest_cpuid_get_register(vcpu, x86_feature); - if (reg) - *reg &= ~__feature_bit(x86_feature); + return vcpu->arch.is_amd_compatible; } -static inline bool guest_cpuid_is_amd_or_hygon(struct kvm_vcpu *vcpu) +static inline bool guest_cpuid_is_intel_compatible(struct kvm_vcpu *vcpu) { - struct kvm_cpuid_entry2 *best; - - best = kvm_find_cpuid_entry(vcpu, 0); - return best && - (is_guest_vendor_amd(best->ebx, best->ecx, best->edx) || - is_guest_vendor_hygon(best->ebx, best->ecx, best->edx)); -} - -static inline bool guest_cpuid_is_intel(struct kvm_vcpu *vcpu) -{ - struct kvm_cpuid_entry2 *best; - - best = kvm_find_cpuid_entry(vcpu, 0); - return best && is_guest_vendor_intel(best->ebx, best->ecx, best->edx); + return !guest_cpuid_is_amd_compatible(vcpu); } static inline int guest_cpuid_family(struct kvm_vcpu *vcpu) @@ -158,21 +173,6 @@ static inline int guest_cpuid_stepping(struct kvm_vcpu *vcpu) return x86_stepping(best->eax); } -static inline bool guest_has_spec_ctrl_msr(struct kvm_vcpu *vcpu) -{ - return (guest_cpuid_has(vcpu, X86_FEATURE_SPEC_CTRL) || - guest_cpuid_has(vcpu, X86_FEATURE_AMD_STIBP) || - guest_cpuid_has(vcpu, X86_FEATURE_AMD_IBRS) || - guest_cpuid_has(vcpu, X86_FEATURE_AMD_SSBD)); -} - -static inline bool guest_has_pred_cmd_msr(struct kvm_vcpu *vcpu) -{ - return (guest_cpuid_has(vcpu, X86_FEATURE_SPEC_CTRL) || - guest_cpuid_has(vcpu, X86_FEATURE_AMD_IBPB) || - guest_cpuid_has(vcpu, X86_FEATURE_SBPB)); -} - static inline bool supports_cpuid_fault(struct kvm_vcpu *vcpu) { return vcpu->arch.msr_platform_info & MSR_PLATFORM_INFO_CPUID_FAULT; @@ -188,7 +188,6 @@ static __always_inline void kvm_cpu_cap_clear(unsigned int x86_feature) { unsigned int x86_leaf = __feature_leaf(x86_feature); - reverse_cpuid_check(x86_leaf); kvm_cpu_caps[x86_leaf] &= ~__feature_bit(x86_feature); } @@ -196,7 +195,6 @@ static __always_inline void kvm_cpu_cap_set(unsigned int x86_feature) { unsigned int x86_leaf = __feature_leaf(x86_feature); - reverse_cpuid_check(x86_leaf); kvm_cpu_caps[x86_leaf] |= __feature_bit(x86_feature); } @@ -204,7 +202,6 @@ static __always_inline u32 kvm_cpu_cap_get(unsigned int x86_feature) { unsigned int x86_leaf = __feature_leaf(x86_feature); - reverse_cpuid_check(x86_leaf); return kvm_cpu_caps[x86_leaf] & __feature_bit(x86_feature); } @@ -228,58 +225,69 @@ static __always_inline bool guest_pv_has(struct kvm_vcpu *vcpu, return vcpu->arch.pv_cpuid.features & (1u << kvm_feature); } -enum kvm_governed_features { -#define KVM_GOVERNED_FEATURE(x) KVM_GOVERNED_##x, -#include "governed_features.h" - KVM_NR_GOVERNED_FEATURES -}; - -static __always_inline int kvm_governed_feature_index(unsigned int x86_feature) +static __always_inline void guest_cpu_cap_set(struct kvm_vcpu *vcpu, + unsigned int x86_feature) { - switch (x86_feature) { -#define KVM_GOVERNED_FEATURE(x) case x: return KVM_GOVERNED_##x; -#include "governed_features.h" - default: - return -1; - } -} + unsigned int x86_leaf = __feature_leaf(x86_feature); -static __always_inline bool kvm_is_governed_feature(unsigned int x86_feature) -{ - return kvm_governed_feature_index(x86_feature) >= 0; + vcpu->arch.cpu_caps[x86_leaf] |= __feature_bit(x86_feature); } -static __always_inline void kvm_governed_feature_set(struct kvm_vcpu *vcpu, - unsigned int x86_feature) +static __always_inline void guest_cpu_cap_clear(struct kvm_vcpu *vcpu, + unsigned int x86_feature) { - BUILD_BUG_ON(!kvm_is_governed_feature(x86_feature)); + unsigned int x86_leaf = __feature_leaf(x86_feature); - __set_bit(kvm_governed_feature_index(x86_feature), - vcpu->arch.governed_features.enabled); + vcpu->arch.cpu_caps[x86_leaf] &= ~__feature_bit(x86_feature); } -static __always_inline void kvm_governed_feature_check_and_set(struct kvm_vcpu *vcpu, - unsigned int x86_feature) +static __always_inline void guest_cpu_cap_change(struct kvm_vcpu *vcpu, + unsigned int x86_feature, + bool guest_has_cap) { - if (kvm_cpu_cap_has(x86_feature) && guest_cpuid_has(vcpu, x86_feature)) - kvm_governed_feature_set(vcpu, x86_feature); + if (guest_has_cap) + guest_cpu_cap_set(vcpu, x86_feature); + else + guest_cpu_cap_clear(vcpu, x86_feature); } -static __always_inline bool guest_can_use(struct kvm_vcpu *vcpu, - unsigned int x86_feature) +static __always_inline bool guest_cpu_cap_has(struct kvm_vcpu *vcpu, + unsigned int x86_feature) { - BUILD_BUG_ON(!kvm_is_governed_feature(x86_feature)); + unsigned int x86_leaf = __feature_leaf(x86_feature); - return test_bit(kvm_governed_feature_index(x86_feature), - vcpu->arch.governed_features.enabled); + /* + * Except for MWAIT, querying dynamic feature bits is disallowed, so + * that KVM can defer runtime updates until the next CPUID emulation. + */ + BUILD_BUG_ON(x86_feature == X86_FEATURE_APIC || + x86_feature == X86_FEATURE_OSXSAVE || + x86_feature == X86_FEATURE_OSPKE); + + return vcpu->arch.cpu_caps[x86_leaf] & __feature_bit(x86_feature); } static inline bool kvm_vcpu_is_legal_cr3(struct kvm_vcpu *vcpu, unsigned long cr3) { - if (guest_can_use(vcpu, X86_FEATURE_LAM)) + if (guest_cpu_cap_has(vcpu, X86_FEATURE_LAM)) cr3 &= ~(X86_CR3_LAM_U48 | X86_CR3_LAM_U57); return kvm_vcpu_is_legal_gpa(vcpu, cr3); } +static inline bool guest_has_spec_ctrl_msr(struct kvm_vcpu *vcpu) +{ + return (guest_cpu_cap_has(vcpu, X86_FEATURE_SPEC_CTRL) || + guest_cpu_cap_has(vcpu, X86_FEATURE_AMD_STIBP) || + guest_cpu_cap_has(vcpu, X86_FEATURE_AMD_IBRS) || + guest_cpu_cap_has(vcpu, X86_FEATURE_AMD_SSBD)); +} + +static inline bool guest_has_pred_cmd_msr(struct kvm_vcpu *vcpu) +{ + return (guest_cpu_cap_has(vcpu, X86_FEATURE_SPEC_CTRL) || + guest_cpu_cap_has(vcpu, X86_FEATURE_AMD_IBPB) || + guest_cpu_cap_has(vcpu, X86_FEATURE_SBPB)); +} + #endif diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c index 5d4c86133453..1349e278cd2a 100644 --- a/arch/x86/kvm/emulate.c +++ b/arch/x86/kvm/emulate.c @@ -477,8 +477,11 @@ static int emulator_check_intercept(struct x86_emulate_ctxt *ctxt, .dst_val = ctxt->dst.val64, .src_bytes = ctxt->src.bytes, .dst_bytes = ctxt->dst.bytes, + .src_type = ctxt->src.type, + .dst_type = ctxt->dst.type, .ad_bytes = ctxt->ad_bytes, - .next_rip = ctxt->eip, + .rip = ctxt->eip, + .next_rip = ctxt->_eip, }; return ctxt->ops->intercept(ctxt, &info, stage); @@ -651,9 +654,10 @@ static inline u8 ctxt_virt_addr_bits(struct x86_emulate_ctxt *ctxt) } static inline bool emul_is_noncanonical_address(u64 la, - struct x86_emulate_ctxt *ctxt) + struct x86_emulate_ctxt *ctxt, + unsigned int flags) { - return !__is_canonical_address(la, ctxt_virt_addr_bits(ctxt)); + return !ctxt->ops->is_canonical_addr(ctxt, la, flags); } /* @@ -1069,7 +1073,7 @@ static __always_inline u8 test_cc(unsigned int condition, unsigned long flags) flags = (flags & EFLAGS_MASK) | X86_EFLAGS_IF; asm("push %[flags]; popf; " CALL_NOSPEC - : "=a"(rc) : [thunk_target]"r"(fop), [flags]"r"(flags)); + : "=a"(rc), ASM_CALL_CONSTRAINT : [thunk_target]"r"(fop), [flags]"r"(flags)); return rc; } @@ -1733,7 +1737,8 @@ static int __load_segment_descriptor(struct x86_emulate_ctxt *ctxt, if (ret != X86EMUL_CONTINUE) return ret; if (emul_is_noncanonical_address(get_desc_base(&seg_desc) | - ((u64)base3 << 32), ctxt)) + ((u64)base3 << 32), ctxt, + X86EMUL_F_DT_LOAD)) return emulate_gp(ctxt, err_code); } @@ -2354,50 +2359,6 @@ setup_syscalls_segments(struct desc_struct *cs, struct desc_struct *ss) ss->avl = 0; } -static bool vendor_intel(struct x86_emulate_ctxt *ctxt) -{ - u32 eax, ebx, ecx, edx; - - eax = ecx = 0; - ctxt->ops->get_cpuid(ctxt, &eax, &ebx, &ecx, &edx, true); - return is_guest_vendor_intel(ebx, ecx, edx); -} - -static bool em_syscall_is_enabled(struct x86_emulate_ctxt *ctxt) -{ - const struct x86_emulate_ops *ops = ctxt->ops; - u32 eax, ebx, ecx, edx; - - /* - * syscall should always be enabled in longmode - so only become - * vendor specific (cpuid) if other modes are active... - */ - if (ctxt->mode == X86EMUL_MODE_PROT64) - return true; - - eax = 0x00000000; - ecx = 0x00000000; - ops->get_cpuid(ctxt, &eax, &ebx, &ecx, &edx, true); - /* - * remark: Intel CPUs only support "syscall" in 64bit longmode. Also a - * 64bit guest with a 32bit compat-app running will #UD !! While this - * behaviour can be fixed (by emulating) into AMD response - CPUs of - * AMD can't behave like Intel. - */ - if (is_guest_vendor_intel(ebx, ecx, edx)) - return false; - - if (is_guest_vendor_amd(ebx, ecx, edx) || - is_guest_vendor_hygon(ebx, ecx, edx)) - return true; - - /* - * default: (not Intel, not AMD, not Hygon), apply Intel's - * stricter rules... - */ - return false; -} - static int em_syscall(struct x86_emulate_ctxt *ctxt) { const struct x86_emulate_ops *ops = ctxt->ops; @@ -2411,7 +2372,15 @@ static int em_syscall(struct x86_emulate_ctxt *ctxt) ctxt->mode == X86EMUL_MODE_VM86) return emulate_ud(ctxt); - if (!(em_syscall_is_enabled(ctxt))) + /* + * Intel compatible CPUs only support SYSCALL in 64-bit mode, whereas + * AMD allows SYSCALL in any flavor of protected mode. Note, it's + * infeasible to emulate Intel behavior when running on AMD hardware, + * as SYSCALL won't fault in the "wrong" mode, i.e. there is no #UD + * for KVM to trap-and-emulate, unlike emulating AMD on Intel. + */ + if (ctxt->mode != X86EMUL_MODE_PROT64 && + ctxt->ops->guest_cpuid_is_intel_compatible(ctxt)) return emulate_ud(ctxt); ops->get_msr(ctxt, MSR_EFER, &efer); @@ -2471,11 +2440,11 @@ static int em_sysenter(struct x86_emulate_ctxt *ctxt) return emulate_gp(ctxt, 0); /* - * Not recognized on AMD in compat mode (but is recognized in legacy - * mode). + * Intel's architecture allows SYSENTER in compatibility mode, but AMD + * does not. Note, AMD does allow SYSENTER in legacy protected mode. */ - if ((ctxt->mode != X86EMUL_MODE_PROT64) && (efer & EFER_LMA) - && !vendor_intel(ctxt)) + if ((ctxt->mode != X86EMUL_MODE_PROT64) && (efer & EFER_LMA) && + !ctxt->ops->guest_cpuid_is_intel_compatible(ctxt)) return emulate_ud(ctxt); /* sysenter/sysexit have not been tested in 64bit mode. */ @@ -2552,8 +2521,8 @@ static int em_sysexit(struct x86_emulate_ctxt *ctxt) ss_sel = cs_sel + 8; cs.d = 0; cs.l = 1; - if (emul_is_noncanonical_address(rcx, ctxt) || - emul_is_noncanonical_address(rdx, ctxt)) + if (emul_is_noncanonical_address(rcx, ctxt, 0) || + emul_is_noncanonical_address(rdx, ctxt, 0)) return emulate_gp(ctxt, 0); break; } @@ -2647,7 +2616,14 @@ static void string_registers_quirk(struct x86_emulate_ctxt *ctxt) * manner when ECX is zero due to REP-string optimizations. */ #ifdef CONFIG_X86_64 - if (ctxt->ad_bytes != 4 || !vendor_intel(ctxt)) + u32 eax, ebx, ecx, edx; + + if (ctxt->ad_bytes != 4) + return; + + eax = ecx = 0; + ctxt->ops->get_cpuid(ctxt, &eax, &ebx, &ecx, &edx, true); + if (!is_guest_vendor_intel(ebx, ecx, edx)) return; *reg_write(ctxt, VCPU_REGS_RCX) = 0; @@ -3523,7 +3499,8 @@ static int em_lgdt_lidt(struct x86_emulate_ctxt *ctxt, bool lgdt) if (rc != X86EMUL_CONTINUE) return rc; if (ctxt->mode == X86EMUL_MODE_PROT64 && - emul_is_noncanonical_address(desc_ptr.address, ctxt)) + emul_is_noncanonical_address(desc_ptr.address, ctxt, + X86EMUL_F_DT_LOAD)) return emulate_gp(ctxt, 0); if (lgdt) ctxt->ops->set_gdt(ctxt, &desc_ptr); diff --git a/arch/x86/kvm/governed_features.h b/arch/x86/kvm/governed_features.h deleted file mode 100644 index ad463b1ed4e4..000000000000 --- a/arch/x86/kvm/governed_features.h +++ /dev/null @@ -1,22 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -#if !defined(KVM_GOVERNED_FEATURE) || defined(KVM_GOVERNED_X86_FEATURE) -BUILD_BUG() -#endif - -#define KVM_GOVERNED_X86_FEATURE(x) KVM_GOVERNED_FEATURE(X86_FEATURE_##x) - -KVM_GOVERNED_X86_FEATURE(GBPAGES) -KVM_GOVERNED_X86_FEATURE(XSAVES) -KVM_GOVERNED_X86_FEATURE(VMX) -KVM_GOVERNED_X86_FEATURE(NRIPS) -KVM_GOVERNED_X86_FEATURE(TSCRATEMSR) -KVM_GOVERNED_X86_FEATURE(V_VMSAVE_VMLOAD) -KVM_GOVERNED_X86_FEATURE(LBRV) -KVM_GOVERNED_X86_FEATURE(PAUSEFILTER) -KVM_GOVERNED_X86_FEATURE(PFTHRESHOLD) -KVM_GOVERNED_X86_FEATURE(VGIF) -KVM_GOVERNED_X86_FEATURE(VNMI) -KVM_GOVERNED_X86_FEATURE(LAM) - -#undef KVM_GOVERNED_X86_FEATURE -#undef KVM_GOVERNED_FEATURE diff --git a/arch/x86/kvm/hyperv.c b/arch/x86/kvm/hyperv.c index 8a47f8541eab..24f0318c50d7 100644 --- a/arch/x86/kvm/hyperv.c +++ b/arch/x86/kvm/hyperv.c @@ -952,8 +952,7 @@ static void stimer_init(struct kvm_vcpu_hv_stimer *stimer, int timer_index) { memset(stimer, 0, sizeof(*stimer)); stimer->index = timer_index; - hrtimer_init(&stimer->timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS); - stimer->timer.function = stimer_timer_callback; + hrtimer_setup(&stimer->timer, stimer_timer_callback, CLOCK_MONOTONIC, HRTIMER_MODE_ABS); stimer_prepare_msg(stimer); } @@ -1352,7 +1351,7 @@ static void __kvm_hv_xsaves_xsavec_maybe_warn(struct kvm_vcpu *vcpu) return; if (guest_cpuid_has(vcpu, X86_FEATURE_XSAVES) || - !guest_cpuid_has(vcpu, X86_FEATURE_XSAVEC)) + !guest_cpu_cap_has(vcpu, X86_FEATURE_XSAVEC)) return; pr_notice_ratelimited("Booting SMP Windows KVM VM with !XSAVES && XSAVEC. " @@ -1417,7 +1416,7 @@ static int kvm_hv_set_msr_pw(struct kvm_vcpu *vcpu, u32 msr, u64 data, } /* vmcall/vmmcall */ - static_call(kvm_x86_patch_hypercall)(vcpu, instructions + i); + kvm_x86_call(patch_hypercall)(vcpu, instructions + i); i += 3; /* ret */ @@ -1737,7 +1736,8 @@ static int kvm_hv_get_msr(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata, data = (u64)vcpu->arch.virtual_tsc_khz * 1000; break; case HV_X64_MSR_APIC_FREQUENCY: - data = APIC_BUS_FREQUENCY; + data = div64_u64(1000000000ULL, + vcpu->kvm->arch.apic_bus_cycle_ns); break; default: kvm_pr_unimpl_rdmsr(vcpu, msr); @@ -1985,7 +1985,7 @@ int kvm_hv_vcpu_flush_tlb(struct kvm_vcpu *vcpu) */ gva = entries[i] & PAGE_MASK; for (j = 0; j < (entries[i] & ~PAGE_MASK) + 1; j++) - static_call(kvm_x86_flush_tlb_gva)(vcpu, gva + j * PAGE_SIZE); + kvm_x86_call(flush_tlb_gva)(vcpu, gva + j * PAGE_SIZE); ++vcpu->stat.tlb_flush; } @@ -2225,6 +2225,9 @@ static u64 kvm_hv_send_ipi(struct kvm_vcpu *vcpu, struct kvm_hv_hcall *hc) u32 vector; bool all_cpus; + if (!lapic_in_kernel(vcpu)) + return HV_STATUS_INVALID_HYPERCALL_INPUT; + if (hc->code == HVCALL_SEND_IPI) { if (!hc->fast) { if (unlikely(kvm_read_guest(kvm, hc->ingpa, &send_ipi, @@ -2526,7 +2529,7 @@ int kvm_hv_hypercall(struct kvm_vcpu *vcpu) * hypercall generates UD from non zero cpl and real mode * per HYPER-V spec */ - if (static_call(kvm_x86_get_cpl)(vcpu) != 0 || !is_protmode(vcpu)) { + if (kvm_x86_call(get_cpl)(vcpu) != 0 || !is_protmode(vcpu)) { kvm_queue_exception(vcpu, UD_VECTOR); return 1; } @@ -2851,7 +2854,8 @@ int kvm_get_hv_cpuid(struct kvm_vcpu *vcpu, struct kvm_cpuid2 *cpuid, ent->eax |= HV_X64_REMOTE_TLB_FLUSH_RECOMMENDED; ent->eax |= HV_X64_APIC_ACCESS_RECOMMENDED; ent->eax |= HV_X64_RELAXED_TIMING_RECOMMENDED; - ent->eax |= HV_X64_CLUSTER_IPI_RECOMMENDED; + if (!vcpu || lapic_in_kernel(vcpu)) + ent->eax |= HV_X64_CLUSTER_IPI_RECOMMENDED; ent->eax |= HV_X64_EX_PROCESSOR_MASKS_RECOMMENDED; if (evmcs_ver) ent->eax |= HV_X64_ENLIGHTENED_VMCS_RECOMMENDED; diff --git a/arch/x86/kvm/hyperv.h b/arch/x86/kvm/hyperv.h index 923e64903da9..913bfc96959c 100644 --- a/arch/x86/kvm/hyperv.h +++ b/arch/x86/kvm/hyperv.h @@ -286,7 +286,6 @@ static inline int kvm_hv_hypercall(struct kvm_vcpu *vcpu) return HV_STATUS_ACCESS_DENIED; } static inline void kvm_hv_vcpu_purge_flush_tlb(struct kvm_vcpu *vcpu) {} -static inline void kvm_hv_free_pa_page(struct kvm *kvm) {} static inline bool kvm_hv_synic_has_vector(struct kvm_vcpu *vcpu, int vector) { return false; diff --git a/arch/x86/kvm/i8254.c b/arch/x86/kvm/i8254.c index cd57a517d04a..739aa6c0d0c3 100644 --- a/arch/x86/kvm/i8254.c +++ b/arch/x86/kvm/i8254.c @@ -681,7 +681,7 @@ struct kvm_pit *kvm_create_pit(struct kvm *kvm, u32 flags) pid_nr = pid_vnr(pid); put_pid(pid); - pit->worker = kthread_create_worker(0, "kvm-pit/%d", pid_nr); + pit->worker = kthread_run_worker(0, "kvm-pit/%d", pid_nr); if (IS_ERR(pit->worker)) goto fail_kthread; @@ -690,8 +690,7 @@ struct kvm_pit *kvm_create_pit(struct kvm *kvm, u32 flags) pit->kvm = kvm; pit_state = &pit->pit_state; - hrtimer_init(&pit_state->timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS); - pit_state->timer.function = pit_timer_fn; + hrtimer_setup(&pit_state->timer, pit_timer_fn, CLOCK_MONOTONIC, HRTIMER_MODE_ABS); pit_state->irq_ack_notifier.gsi = 0; pit_state->irq_ack_notifier.irq_acked = kvm_pit_ack_irq; diff --git a/arch/x86/kvm/i8259.c b/arch/x86/kvm/i8259.c index 8dec646e764b..a8fb19940975 100644 --- a/arch/x86/kvm/i8259.c +++ b/arch/x86/kvm/i8259.c @@ -567,7 +567,7 @@ static void pic_irq_request(struct kvm *kvm, int level) { struct kvm_pic *s = kvm->arch.vpic; - if (!s->output) + if (!s->output && level) s->wakeup_needed = true; s->output = level; } diff --git a/arch/x86/kvm/ioapic.c b/arch/x86/kvm/ioapic.c index 995eb5054360..45dae2d5d2f1 100644 --- a/arch/x86/kvm/ioapic.c +++ b/arch/x86/kvm/ioapic.c @@ -296,11 +296,8 @@ void kvm_ioapic_scan_entry(struct kvm_vcpu *vcpu, ulong *ioapic_handled_vectors) index == RTC_GSI) { u16 dm = kvm_lapic_irq_dest_mode(!!e->fields.dest_mode); - if (kvm_apic_match_dest(vcpu, NULL, APIC_DEST_NOSHORT, - e->fields.dest_id, dm) || - kvm_apic_pending_eoi(vcpu, e->fields.vector)) - __set_bit(e->fields.vector, - ioapic_handled_vectors); + kvm_scan_ioapic_irq(vcpu, e->fields.dest_id, dm, + e->fields.vector, ioapic_handled_vectors); } } spin_unlock(&ioapic->lock); diff --git a/arch/x86/kvm/ioapic.h b/arch/x86/kvm/ioapic.h index 539333ac4b38..aa8cb4ac0479 100644 --- a/arch/x86/kvm/ioapic.h +++ b/arch/x86/kvm/ioapic.h @@ -120,4 +120,6 @@ void kvm_ioapic_scan_entry(struct kvm_vcpu *vcpu, ulong *ioapic_handled_vectors); void kvm_scan_ioapic_routes(struct kvm_vcpu *vcpu, ulong *ioapic_handled_vectors); +void kvm_scan_ioapic_irq(struct kvm_vcpu *vcpu, u32 dest_id, u16 dest_mode, + u8 vector, unsigned long *ioapic_handled_vectors); #endif diff --git a/arch/x86/kvm/irq.c b/arch/x86/kvm/irq.c index ad9ca8a60144..97d68d837929 100644 --- a/arch/x86/kvm/irq.c +++ b/arch/x86/kvm/irq.c @@ -100,6 +100,9 @@ int kvm_cpu_has_interrupt(struct kvm_vcpu *v) if (kvm_cpu_has_extint(v)) return 1; + if (lapic_in_kernel(v) && v->arch.apic->guest_apic_protected) + return kvm_x86_call(protected_apic_has_interrupt)(v); + return kvm_apic_has_interrupt(v) != -1; /* LAPIC */ } EXPORT_SYMBOL_GPL(kvm_cpu_has_interrupt); @@ -108,7 +111,7 @@ EXPORT_SYMBOL_GPL(kvm_cpu_has_interrupt); * Read pending interrupt(from non-APIC source) * vector and intack. */ -static int kvm_cpu_get_extint(struct kvm_vcpu *v) +int kvm_cpu_get_extint(struct kvm_vcpu *v) { if (!kvm_cpu_has_extint(v)) { WARN_ON(!lapic_in_kernel(v)); @@ -131,6 +134,7 @@ static int kvm_cpu_get_extint(struct kvm_vcpu *v) } else return kvm_pic_read_irq(v->kvm); /* PIC */ } +EXPORT_SYMBOL_GPL(kvm_cpu_get_extint); /* * Read pending interrupt vector and intack. @@ -141,9 +145,12 @@ int kvm_cpu_get_interrupt(struct kvm_vcpu *v) if (vector != -1) return vector; /* PIC */ - return kvm_get_apic_interrupt(v); /* APIC */ + vector = kvm_apic_has_interrupt(v); /* APIC */ + if (vector != -1) + kvm_apic_ack_interrupt(v, vector); + + return vector; } -EXPORT_SYMBOL_GPL(kvm_cpu_get_interrupt); void kvm_inject_pending_timer_irqs(struct kvm_vcpu *vcpu) { @@ -157,7 +164,7 @@ void __kvm_migrate_timers(struct kvm_vcpu *vcpu) { __kvm_migrate_apic_timer(vcpu); __kvm_migrate_pit_timer(vcpu); - static_call_cond(kvm_x86_migrate_timers)(vcpu); + kvm_x86_call(migrate_timers)(vcpu); } bool kvm_arch_irqfd_allowed(struct kvm *kvm, struct kvm_irqfd *args) diff --git a/arch/x86/kvm/irq.h b/arch/x86/kvm/irq.h index c2d7cfe82d00..76d46b2f41dd 100644 --- a/arch/x86/kvm/irq.h +++ b/arch/x86/kvm/irq.h @@ -106,7 +106,6 @@ void __kvm_migrate_timers(struct kvm_vcpu *vcpu); int apic_has_pending_timer(struct kvm_vcpu *vcpu); int kvm_setup_default_irq_routing(struct kvm *kvm); -int kvm_setup_empty_irq_routing(struct kvm *kvm); int kvm_irq_delivery_to_apic(struct kvm *kvm, struct kvm_lapic *src, struct kvm_lapic_irq *irq, struct dest_map *dest_map); diff --git a/arch/x86/kvm/irq_comm.c b/arch/x86/kvm/irq_comm.c index 68f3f6c26046..d6d792b5d1bd 100644 --- a/arch/x86/kvm/irq_comm.c +++ b/arch/x86/kvm/irq_comm.c @@ -395,13 +395,6 @@ int kvm_setup_default_irq_routing(struct kvm *kvm) ARRAY_SIZE(default_routing), 0); } -static const struct kvm_irq_routing_entry empty_routing[] = {}; - -int kvm_setup_empty_irq_routing(struct kvm *kvm) -{ - return kvm_set_irq_routing(kvm, empty_routing, 0, 0); -} - void kvm_arch_post_irq_routing_update(struct kvm *kvm) { if (!irqchip_split(kvm)) @@ -409,6 +402,33 @@ void kvm_arch_post_irq_routing_update(struct kvm *kvm) kvm_make_scan_ioapic_request(kvm); } +void kvm_scan_ioapic_irq(struct kvm_vcpu *vcpu, u32 dest_id, u16 dest_mode, + u8 vector, unsigned long *ioapic_handled_vectors) +{ + /* + * Intercept EOI if the vCPU is the target of the new IRQ routing, or + * the vCPU has a pending IRQ from the old routing, i.e. if the vCPU + * may receive a level-triggered IRQ in the future, or already received + * level-triggered IRQ. The EOI needs to be intercepted and forwarded + * to I/O APIC emulation so that the IRQ can be de-asserted. + */ + if (kvm_apic_match_dest(vcpu, NULL, APIC_DEST_NOSHORT, dest_id, dest_mode)) { + __set_bit(vector, ioapic_handled_vectors); + } else if (kvm_apic_pending_eoi(vcpu, vector)) { + __set_bit(vector, ioapic_handled_vectors); + + /* + * Track the highest pending EOI for which the vCPU is NOT the + * target in the new routing. Only the EOI for the IRQ that is + * in-flight (for the old routing) needs to be intercepted, any + * future IRQs that arrive on this vCPU will be coincidental to + * the level-triggered routing and don't need to be intercepted. + */ + if ((int)vector > vcpu->arch.highest_stale_pending_ioapic_eoi) + vcpu->arch.highest_stale_pending_ioapic_eoi = vector; + } +} + void kvm_scan_ioapic_routes(struct kvm_vcpu *vcpu, ulong *ioapic_handled_vectors) { @@ -431,11 +451,11 @@ void kvm_scan_ioapic_routes(struct kvm_vcpu *vcpu, kvm_set_msi_irq(vcpu->kvm, entry, &irq); - if (irq.trig_mode && - (kvm_apic_match_dest(vcpu, NULL, APIC_DEST_NOSHORT, - irq.dest_id, irq.dest_mode) || - kvm_apic_pending_eoi(vcpu, irq.vector))) - __set_bit(irq.vector, ioapic_handled_vectors); + if (!irq.trig_mode) + continue; + + kvm_scan_ioapic_irq(vcpu, irq.dest_id, irq.dest_mode, + irq.vector, ioapic_handled_vectors); } } srcu_read_unlock(&kvm->irq_srcu, idx); diff --git a/arch/x86/kvm/kvm_cache_regs.h b/arch/x86/kvm/kvm_cache_regs.h index 75eae9c4998a..36a8786db291 100644 --- a/arch/x86/kvm/kvm_cache_regs.h +++ b/arch/x86/kvm/kvm_cache_regs.h @@ -44,6 +44,18 @@ BUILD_KVM_GPR_ACCESSORS(r15, R15) #endif /* + * Using the register cache from interrupt context is generally not allowed, as + * caching a register and marking it available/dirty can't be done atomically, + * i.e. accesses from interrupt context may clobber state or read stale data if + * the vCPU task is in the process of updating the cache. The exception is if + * KVM is handling a PMI IRQ/NMI VM-Exit, as that bound code sequence doesn't + * touch the cache, it runs after the cache is reset (post VM-Exit), and PMIs + * need to access several registers that are cacheable. + */ +#define kvm_assert_register_caching_allowed(vcpu) \ + lockdep_assert_once(in_task() || kvm_arch_pmi_in_guest(vcpu)) + +/* * avail dirty * 0 0 register in VMCS/VMCB * 0 1 *INVALID* @@ -53,24 +65,28 @@ BUILD_KVM_GPR_ACCESSORS(r15, R15) static inline bool kvm_register_is_available(struct kvm_vcpu *vcpu, enum kvm_reg reg) { + kvm_assert_register_caching_allowed(vcpu); return test_bit(reg, (unsigned long *)&vcpu->arch.regs_avail); } static inline bool kvm_register_is_dirty(struct kvm_vcpu *vcpu, enum kvm_reg reg) { + kvm_assert_register_caching_allowed(vcpu); return test_bit(reg, (unsigned long *)&vcpu->arch.regs_dirty); } static inline void kvm_register_mark_available(struct kvm_vcpu *vcpu, enum kvm_reg reg) { + kvm_assert_register_caching_allowed(vcpu); __set_bit(reg, (unsigned long *)&vcpu->arch.regs_avail); } static inline void kvm_register_mark_dirty(struct kvm_vcpu *vcpu, enum kvm_reg reg) { + kvm_assert_register_caching_allowed(vcpu); __set_bit(reg, (unsigned long *)&vcpu->arch.regs_avail); __set_bit(reg, (unsigned long *)&vcpu->arch.regs_dirty); } @@ -84,6 +100,7 @@ static inline void kvm_register_mark_dirty(struct kvm_vcpu *vcpu, static __always_inline bool kvm_register_test_and_mark_available(struct kvm_vcpu *vcpu, enum kvm_reg reg) { + kvm_assert_register_caching_allowed(vcpu); return arch___test_and_set_bit(reg, (unsigned long *)&vcpu->arch.regs_avail); } @@ -98,7 +115,7 @@ static inline unsigned long kvm_register_read_raw(struct kvm_vcpu *vcpu, int reg return 0; if (!kvm_register_is_available(vcpu, reg)) - static_call(kvm_x86_cache_reg)(vcpu, reg); + kvm_x86_call(cache_reg)(vcpu, reg); return vcpu->arch.regs[reg]; } @@ -138,7 +155,7 @@ static inline u64 kvm_pdptr_read(struct kvm_vcpu *vcpu, int index) might_sleep(); /* on svm */ if (!kvm_register_is_available(vcpu, VCPU_EXREG_PDPTR)) - static_call(kvm_x86_cache_reg)(vcpu, VCPU_EXREG_PDPTR); + kvm_x86_call(cache_reg)(vcpu, VCPU_EXREG_PDPTR); return vcpu->arch.walk_mmu->pdptrs[index]; } @@ -153,7 +170,7 @@ static inline ulong kvm_read_cr0_bits(struct kvm_vcpu *vcpu, ulong mask) ulong tmask = mask & KVM_POSSIBLE_CR0_GUEST_BITS; if ((tmask & vcpu->arch.cr0_guest_owned_bits) && !kvm_register_is_available(vcpu, VCPU_EXREG_CR0)) - static_call(kvm_x86_cache_reg)(vcpu, VCPU_EXREG_CR0); + kvm_x86_call(cache_reg)(vcpu, VCPU_EXREG_CR0); return vcpu->arch.cr0 & mask; } @@ -175,7 +192,7 @@ static inline ulong kvm_read_cr4_bits(struct kvm_vcpu *vcpu, ulong mask) ulong tmask = mask & KVM_POSSIBLE_CR4_GUEST_BITS; if ((tmask & vcpu->arch.cr4_guest_owned_bits) && !kvm_register_is_available(vcpu, VCPU_EXREG_CR4)) - static_call(kvm_x86_cache_reg)(vcpu, VCPU_EXREG_CR4); + kvm_x86_call(cache_reg)(vcpu, VCPU_EXREG_CR4); return vcpu->arch.cr4 & mask; } @@ -190,7 +207,7 @@ static __always_inline bool kvm_is_cr4_bit_set(struct kvm_vcpu *vcpu, static inline ulong kvm_read_cr3(struct kvm_vcpu *vcpu) { if (!kvm_register_is_available(vcpu, VCPU_EXREG_CR3)) - static_call(kvm_x86_cache_reg)(vcpu, VCPU_EXREG_CR3); + kvm_x86_call(cache_reg)(vcpu, VCPU_EXREG_CR3); return vcpu->arch.cr3; } diff --git a/arch/x86/kvm/kvm_emulate.h b/arch/x86/kvm/kvm_emulate.h index 5382646162a3..c1df5acfacaf 100644 --- a/arch/x86/kvm/kvm_emulate.h +++ b/arch/x86/kvm/kvm_emulate.h @@ -26,6 +26,7 @@ struct x86_exception { bool nested_page_fault; u64 address; /* cr2 or nested page fault gpa */ u8 async_page_fault; + unsigned long exit_qualification; }; /* @@ -43,7 +44,10 @@ struct x86_instruction_info { u64 dst_val; /* value of destination operand */ u8 src_bytes; /* size of source operand */ u8 dst_bytes; /* size of destination operand */ + u8 src_type; /* type of source operand */ + u8 dst_type; /* type of destination operand */ u8 ad_bytes; /* size of src/dst address */ + u64 rip; /* rip of the instruction */ u64 next_rip; /* rip following the instruction */ }; @@ -87,12 +91,16 @@ struct x86_instruction_info { #define X86EMUL_CMPXCHG_FAILED 4 /* cmpxchg did not see expected value */ #define X86EMUL_IO_NEEDED 5 /* IO is needed to complete emulation */ #define X86EMUL_INTERCEPTED 6 /* Intercepted by nested VMCB/VMCS */ +/* Emulation during event vectoring is unhandleable. */ +#define X86EMUL_UNHANDLEABLE_VECTORING 7 /* x86-specific emulation flags */ #define X86EMUL_F_WRITE BIT(0) #define X86EMUL_F_FETCH BIT(1) #define X86EMUL_F_IMPLICIT BIT(2) #define X86EMUL_F_INVLPG BIT(3) +#define X86EMUL_F_MSR BIT(4) +#define X86EMUL_F_DT_LOAD BIT(5) struct x86_emulate_ops { void (*vm_bugged)(struct x86_emulate_ctxt *ctxt); @@ -222,6 +230,7 @@ struct x86_emulate_ops { bool (*guest_has_movbe)(struct x86_emulate_ctxt *ctxt); bool (*guest_has_fxsr)(struct x86_emulate_ctxt *ctxt); bool (*guest_has_rdpid)(struct x86_emulate_ctxt *ctxt); + bool (*guest_cpuid_is_intel_compatible)(struct x86_emulate_ctxt *ctxt); void (*set_nmi_mask)(struct x86_emulate_ctxt *ctxt, bool masked); @@ -233,6 +242,9 @@ struct x86_emulate_ops { gva_t (*get_untagged_addr)(struct x86_emulate_ctxt *ctxt, gva_t addr, unsigned int flags); + + bool (*is_canonical_addr)(struct x86_emulate_ctxt *ctxt, gva_t addr, + unsigned int flags); }; /* Type, address-of, and value of an instruction's operand. */ @@ -263,8 +275,10 @@ struct operand { }; }; +#define X86_MAX_INSTRUCTION_LENGTH 15 + struct fetch_cache { - u8 data[15]; + u8 data[X86_MAX_INSTRUCTION_LENGTH]; u8 *ptr; u8 *end; }; diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c index cf37586f0466..73418dc0ebb2 100644 --- a/arch/x86/kvm/lapic.c +++ b/arch/x86/kvm/lapic.c @@ -59,7 +59,17 @@ #define MAX_APIC_VECTOR 256 #define APIC_VECTORS_PER_REG 32 -static bool lapic_timer_advance_dynamic __read_mostly; +/* + * Enable local APIC timer advancement (tscdeadline mode only) with adaptive + * tuning. When enabled, KVM programs the host timer event to fire early, i.e. + * before the deadline expires, to account for the delay between taking the + * VM-Exit (to inject the guest event) and the subsequent VM-Enter to resume + * the guest, i.e. so that the interrupt arrives in the guest with minimal + * latency relative to the deadline programmed by the guest. + */ +static bool lapic_timer_advance __read_mostly = true; +module_param(lapic_timer_advance, bool, 0444); + #define LAPIC_TIMER_ADVANCE_ADJUST_MIN 100 /* clock cycles */ #define LAPIC_TIMER_ADVANCE_ADJUST_MAX 10000 /* clock cycles */ #define LAPIC_TIMER_ADVANCE_NS_INIT 1000 @@ -211,13 +221,6 @@ static inline bool kvm_apic_map_get_logical_dest(struct kvm_apic_map *map, } } -static void kvm_apic_map_free(struct rcu_head *rcu) -{ - struct kvm_apic_map *map = container_of(rcu, struct kvm_apic_map, rcu); - - kvfree(map); -} - static int kvm_recalculate_phys_map(struct kvm_apic_map *new, struct kvm_vcpu *vcpu, bool *xapic_id_mismatch) @@ -341,10 +344,8 @@ static void kvm_recalculate_logical_map(struct kvm_apic_map *new, * reversing the LDR calculation to get cluster of APICs, i.e. no * additional work is required. */ - if (apic_x2apic_mode(apic)) { - WARN_ON_ONCE(ldr != kvm_apic_calc_x2apic_ldr(kvm_x2apic_id(apic))); + if (apic_x2apic_mode(apic)) return; - } if (WARN_ON_ONCE(!kvm_apic_map_get_logical_dest(new, ldr, &cluster, &mask))) { @@ -374,7 +375,7 @@ enum { DIRTY }; -void kvm_recalculate_apic_map(struct kvm *kvm) +static void kvm_recalculate_apic_map(struct kvm *kvm) { struct kvm_apic_map *new, *old = NULL; struct kvm_vcpu *vcpu; @@ -481,7 +482,7 @@ out: mutex_unlock(&kvm->arch.apic_map_lock); if (old) - call_rcu(&old->rcu, kvm_apic_map_free); + kvfree_rcu(old, rcu); kvm_make_scan_ioapic_request(kvm); } @@ -590,7 +591,7 @@ void kvm_apic_set_version(struct kvm_vcpu *vcpu) * version first and level-triggered interrupts never get EOIed in * IOAPIC. */ - if (guest_cpuid_has(vcpu, X86_FEATURE_X2APIC) && + if (guest_cpu_cap_has(vcpu, X86_FEATURE_X2APIC) && !ioapic_in_kernel(vcpu->kvm)) v |= APIC_LVR_DIRECTED_EOI; kvm_lapic_set_reg(apic, APIC_LVR, v); @@ -654,27 +655,29 @@ static u8 count_vectors(void *bitmap) return count; } -bool __kvm_apic_update_irr(u32 *pir, void *regs, int *max_irr) +bool __kvm_apic_update_irr(unsigned long *pir, void *regs, int *max_irr) { + unsigned long pir_vals[NR_PIR_WORDS]; + u32 *__pir = (void *)pir_vals; u32 i, vec; - u32 pir_val, irr_val, prev_irr_val; + u32 irr_val, prev_irr_val; int max_updated_irr; max_updated_irr = -1; *max_irr = -1; + if (!pi_harvest_pir(pir, pir_vals)) + return false; + for (i = vec = 0; i <= 7; i++, vec += 32) { u32 *p_irr = (u32 *)(regs + APIC_IRR + i * 0x10); - irr_val = *p_irr; - pir_val = READ_ONCE(pir[i]); - - if (pir_val) { - pir_val = xchg(&pir[i], 0); + irr_val = READ_ONCE(*p_irr); + if (__pir[i]) { prev_irr_val = irr_val; do { - irr_val = prev_irr_val | pir_val; + irr_val = prev_irr_val | __pir[i]; } while (prev_irr_val != irr_val && !try_cmpxchg(p_irr, &prev_irr_val, irr_val)); @@ -690,7 +693,7 @@ bool __kvm_apic_update_irr(u32 *pir, void *regs, int *max_irr) } EXPORT_SYMBOL_GPL(__kvm_apic_update_irr); -bool kvm_apic_update_irr(struct kvm_vcpu *vcpu, u32 *pir, int *max_irr) +bool kvm_apic_update_irr(struct kvm_vcpu *vcpu, unsigned long *pir, int *max_irr) { struct kvm_lapic *apic = vcpu->arch.apic; bool irr_updated = __kvm_apic_update_irr(pir, apic->regs, max_irr); @@ -726,10 +729,7 @@ static inline int apic_find_highest_irr(struct kvm_lapic *apic) static inline void apic_clear_irr(int vec, struct kvm_lapic *apic) { if (unlikely(apic->apicv_active)) { - /* need to update RVI */ kvm_lapic_clear_vector(vec, apic->regs + APIC_IRR); - static_call_cond(kvm_x86_hwapic_irr_update)(apic->vcpu, - apic_find_highest_irr(apic)); } else { apic->irr_pending = false; kvm_lapic_clear_vector(vec, apic->regs + APIC_IRR); @@ -755,7 +755,7 @@ static inline void apic_set_isr(int vec, struct kvm_lapic *apic) * just set SVI. */ if (unlikely(apic->apicv_active)) - static_call_cond(kvm_x86_hwapic_isr_update)(vec); + kvm_x86_call(hwapic_isr_update)(apic->vcpu, vec); else { ++apic->isr_count; BUG_ON(apic->isr_count > MAX_APIC_VECTOR); @@ -800,7 +800,7 @@ static inline void apic_clear_isr(int vec, struct kvm_lapic *apic) * and must be left alone. */ if (unlikely(apic->apicv_active)) - static_call_cond(kvm_x86_hwapic_isr_update)(apic_find_highest_isr(apic)); + kvm_x86_call(hwapic_isr_update)(apic->vcpu, apic_find_highest_isr(apic)); else { --apic->isr_count; BUG_ON(apic->isr_count < 0); @@ -808,6 +808,17 @@ static inline void apic_clear_isr(int vec, struct kvm_lapic *apic) } } +void kvm_apic_update_hwapic_isr(struct kvm_vcpu *vcpu) +{ + struct kvm_lapic *apic = vcpu->arch.apic; + + if (WARN_ON_ONCE(!lapic_in_kernel(vcpu)) || !apic->apicv_active) + return; + + kvm_x86_call(hwapic_isr_update)(vcpu, apic_find_highest_isr(apic)); +} +EXPORT_SYMBOL_GPL(kvm_apic_update_hwapic_isr); + int kvm_lapic_find_highest_irr(struct kvm_vcpu *vcpu) { /* This may race with setting of irr in __apic_accept_irq() and @@ -936,7 +947,7 @@ static int apic_has_interrupt_for_ppr(struct kvm_lapic *apic, u32 ppr) { int highest_irr; if (kvm_x86_ops.sync_pir_to_irr) - highest_irr = static_call(kvm_x86_sync_pir_to_irr)(apic->vcpu); + highest_irr = kvm_x86_call(sync_pir_to_irr)(apic->vcpu); else highest_irr = apic_find_highest_irr(apic); if (highest_irr == -1 || (highest_irr & 0xF0) <= ppr) @@ -1328,8 +1339,8 @@ static int __apic_accept_irq(struct kvm_lapic *apic, int delivery_mode, apic->regs + APIC_TMR); } - static_call(kvm_x86_deliver_interrupt)(apic, delivery_mode, - trig_mode, vector); + kvm_x86_call(deliver_interrupt)(apic, delivery_mode, + trig_mode, vector); break; case APIC_DM_REMRD: @@ -1450,6 +1461,14 @@ static void kvm_ioapic_send_eoi(struct kvm_lapic *apic, int vector) if (!kvm_ioapic_handles_vector(apic, vector)) return; + /* + * If the intercepted EOI is for an IRQ that was pending from previous + * routing, then re-scan the I/O APIC routes as EOIs for the IRQ likely + * no longer need to be intercepted. + */ + if (apic->vcpu->arch.highest_stale_pending_ioapic_eoi == vector) + kvm_make_request(KVM_REQ_SCAN_IOAPIC, apic->vcpu); + /* Request a KVM exit to inform the userspace IOAPIC. */ if (irqchip_split(apic->vcpu->kvm)) { apic->vcpu->arch.pending_ioapic_eoi = vector; @@ -1547,7 +1566,8 @@ static u32 apic_get_tmcct(struct kvm_lapic *apic) remaining = 0; ns = mod_64(ktime_to_ns(remaining), apic->lapic_timer.period); - return div64_u64(ns, (APIC_BUS_CYCLE_NS * apic->divide_count)); + return div64_u64(ns, (apic->vcpu->kvm->arch.apic_bus_cycle_ns * + apic->divide_count)); } static void __report_tpr_access(struct kvm_lapic *apic, bool write) @@ -1732,7 +1752,7 @@ static void limit_periodic_timer_frequency(struct kvm_lapic *apic) s64 min_period = min_timer_period_us * 1000LL; if (apic->lapic_timer.period < min_period) { - pr_info_ratelimited( + pr_info_once( "vcpu %i: requested %lld ns " "lapic timer period limited to %lld ns\n", apic->vcpu->vcpu_id, @@ -1780,8 +1800,17 @@ static void apic_update_lvtt(struct kvm_lapic *apic) static bool lapic_timer_int_injected(struct kvm_vcpu *vcpu) { struct kvm_lapic *apic = vcpu->arch.apic; - u32 reg = kvm_lapic_get_reg(apic, APIC_LVTT); + u32 reg; + + /* + * Assume a timer IRQ was "injected" if the APIC is protected. KVM's + * copy of the vIRR is bogus, it's the responsibility of the caller to + * precisely check whether or not a timer IRQ is pending. + */ + if (apic->guest_apic_protected) + return true; + reg = kvm_lapic_get_reg(apic, APIC_LVTT); if (kvm_apic_hw_enabled(apic)) { int vec = reg & APIC_VECTOR_MASK; void *bitmap = apic->regs + APIC_ISR; @@ -1854,16 +1883,14 @@ static void __kvm_wait_lapic_expire(struct kvm_vcpu *vcpu) guest_tsc = kvm_read_l1_tsc(vcpu, rdtsc()); trace_kvm_wait_lapic_expire(vcpu->vcpu_id, guest_tsc - tsc_deadline); - if (lapic_timer_advance_dynamic) { - adjust_lapic_timer_advance(vcpu, guest_tsc - tsc_deadline); - /* - * If the timer fired early, reread the TSC to account for the - * overhead of the above adjustment to avoid waiting longer - * than is necessary. - */ - if (guest_tsc < tsc_deadline) - guest_tsc = kvm_read_l1_tsc(vcpu, rdtsc()); - } + adjust_lapic_timer_advance(vcpu, guest_tsc - tsc_deadline); + + /* + * If the timer fired early, reread the TSC to account for the overhead + * of the above adjustment to avoid waiting longer than is necessary. + */ + if (guest_tsc < tsc_deadline) + guest_tsc = kvm_read_l1_tsc(vcpu, rdtsc()); if (guest_tsc < tsc_deadline) __wait_lapic_expire(vcpu, tsc_deadline - guest_tsc); @@ -1937,7 +1964,7 @@ static void start_sw_tscdeadline(struct kvm_lapic *apic) u64 ns = 0; ktime_t expire; struct kvm_vcpu *vcpu = apic->vcpu; - unsigned long this_tsc_khz = vcpu->arch.virtual_tsc_khz; + u32 this_tsc_khz = vcpu->arch.virtual_tsc_khz; unsigned long flags; ktime_t now; @@ -1965,7 +1992,8 @@ static void start_sw_tscdeadline(struct kvm_lapic *apic) static inline u64 tmict_to_ns(struct kvm_lapic *apic, u32 tmict) { - return (u64)tmict * APIC_BUS_CYCLE_NS * (u64)apic->divide_count; + return (u64)tmict * apic->vcpu->kvm->arch.apic_bus_cycle_ns * + (u64)apic->divide_count; } static void update_target_expiration(struct kvm_lapic *apic, uint32_t old_divisor) @@ -2095,7 +2123,7 @@ static void cancel_hv_timer(struct kvm_lapic *apic) { WARN_ON(preemptible()); WARN_ON(!apic->lapic_timer.hv_timer_in_use); - static_call(kvm_x86_cancel_hv_timer)(apic->vcpu); + kvm_x86_call(cancel_hv_timer)(apic->vcpu); apic->lapic_timer.hv_timer_in_use = false; } @@ -2112,7 +2140,7 @@ static bool start_hv_timer(struct kvm_lapic *apic) if (!ktimer->tscdeadline) return false; - if (static_call(kvm_x86_set_hv_timer)(vcpu, ktimer->tscdeadline, &expired)) + if (kvm_x86_call(set_hv_timer)(vcpu, ktimer->tscdeadline, &expired)) return false; ktimer->hv_timer_in_use = true; @@ -2349,7 +2377,7 @@ static int kvm_lapic_reg_write(struct kvm_lapic *apic, u32 reg, u32 val) case APIC_LVTT: if (!kvm_apic_sw_enabled(apic)) val |= APIC_LVT_MASKED; - val &= (apic_lvt_mask[0] | apic->lapic_timer.timer_mode_mask); + val &= (apic_lvt_mask[LVT_TIMER] | apic->lapic_timer.timer_mode_mask); kvm_lapic_set_reg(apic, APIC_LVTT, val); apic_update_lvtt(apic); break; @@ -2445,6 +2473,43 @@ void kvm_lapic_set_eoi(struct kvm_vcpu *vcpu) } EXPORT_SYMBOL_GPL(kvm_lapic_set_eoi); +#define X2APIC_ICR_RESERVED_BITS (GENMASK_ULL(31, 20) | GENMASK_ULL(17, 16) | BIT(13)) + +int kvm_x2apic_icr_write(struct kvm_lapic *apic, u64 data) +{ + if (data & X2APIC_ICR_RESERVED_BITS) + return 1; + + /* + * The BUSY bit is reserved on both Intel and AMD in x2APIC mode, but + * only AMD requires it to be zero, Intel essentially just ignores the + * bit. And if IPI virtualization (Intel) or x2AVIC (AMD) is enabled, + * the CPU performs the reserved bits checks, i.e. the underlying CPU + * behavior will "win". Arbitrarily clear the BUSY bit, as there is no + * sane way to provide consistent behavior with respect to hardware. + */ + data &= ~APIC_ICR_BUSY; + + kvm_apic_send_ipi(apic, (u32)data, (u32)(data >> 32)); + if (kvm_x86_ops.x2apic_icr_is_split) { + kvm_lapic_set_reg(apic, APIC_ICR, data); + kvm_lapic_set_reg(apic, APIC_ICR2, data >> 32); + } else { + kvm_lapic_set_reg64(apic, APIC_ICR, data); + } + trace_kvm_apic_write(APIC_ICR, data); + return 0; +} + +static u64 kvm_x2apic_icr_read(struct kvm_lapic *apic) +{ + if (kvm_x86_ops.x2apic_icr_is_split) + return (u64)kvm_lapic_get_reg(apic, APIC_ICR) | + (u64)kvm_lapic_get_reg(apic, APIC_ICR2) << 32; + + return kvm_lapic_get_reg64(apic, APIC_ICR); +} + /* emulate APIC access in a trap manner */ void kvm_apic_write_nodecode(struct kvm_vcpu *vcpu, u32 offset) { @@ -2462,7 +2527,7 @@ void kvm_apic_write_nodecode(struct kvm_vcpu *vcpu, u32 offset) * maybe-unecessary write, and both are in the noise anyways. */ if (apic_x2apic_mode(apic) && offset == APIC_ICR) - kvm_x2apic_icr_write(apic, kvm_lapic_get_reg64(apic, APIC_ICR)); + WARN_ON_ONCE(kvm_x2apic_icr_write(apic, kvm_x2apic_icr_read(apic))); else kvm_lapic_reg_write(apic, offset, kvm_lapic_get_reg(apic, offset)); } @@ -2532,7 +2597,7 @@ u64 kvm_lapic_get_cr8(struct kvm_vcpu *vcpu) return (tpr & 0xf0) >> 4; } -void kvm_lapic_set_base(struct kvm_vcpu *vcpu, u64 value) +static void __kvm_apic_set_base(struct kvm_vcpu *vcpu, u64 value) { u64 old_value = vcpu->arch.apic_base; struct kvm_lapic *apic = vcpu->arch.apic; @@ -2540,7 +2605,7 @@ void kvm_lapic_set_base(struct kvm_vcpu *vcpu, u64 value) vcpu->arch.apic_base = value; if ((old_value ^ value) & MSR_IA32_APICBASE_ENABLE) - kvm_update_cpuid_runtime(vcpu); + vcpu->arch.cpuid_dynamic_bits_dirty = true; if (!apic) return; @@ -2567,7 +2632,7 @@ void kvm_lapic_set_base(struct kvm_vcpu *vcpu, u64 value) if ((old_value ^ value) & (MSR_IA32_APICBASE_ENABLE | X2APIC_ENABLE)) { kvm_make_request(KVM_REQ_APICV_UPDATE, vcpu); - static_call_cond(kvm_x86_set_virtual_apic_mode)(vcpu); + kvm_x86_call(set_virtual_apic_mode)(vcpu); } apic->base_address = apic->vcpu->arch.apic_base & @@ -2580,29 +2645,61 @@ void kvm_lapic_set_base(struct kvm_vcpu *vcpu, u64 value) } } +int kvm_apic_set_base(struct kvm_vcpu *vcpu, u64 value, bool host_initiated) +{ + enum lapic_mode old_mode = kvm_get_apic_mode(vcpu); + enum lapic_mode new_mode = kvm_apic_mode(value); + + if (vcpu->arch.apic_base == value) + return 0; + + u64 reserved_bits = kvm_vcpu_reserved_gpa_bits_raw(vcpu) | 0x2ff | + (guest_cpu_cap_has(vcpu, X86_FEATURE_X2APIC) ? 0 : X2APIC_ENABLE); + + if ((value & reserved_bits) != 0 || new_mode == LAPIC_MODE_INVALID) + return 1; + if (!host_initiated) { + if (old_mode == LAPIC_MODE_X2APIC && new_mode == LAPIC_MODE_XAPIC) + return 1; + if (old_mode == LAPIC_MODE_DISABLED && new_mode == LAPIC_MODE_X2APIC) + return 1; + } + + __kvm_apic_set_base(vcpu, value); + kvm_recalculate_apic_map(vcpu->kvm); + return 0; +} +EXPORT_SYMBOL_GPL(kvm_apic_set_base); + void kvm_apic_update_apicv(struct kvm_vcpu *vcpu) { struct kvm_lapic *apic = vcpu->arch.apic; - if (apic->apicv_active) { - /* irr_pending is always true when apicv is activated. */ - apic->irr_pending = true; + /* + * When APICv is enabled, KVM must always search the IRR for a pending + * IRQ, as other vCPUs and devices can set IRR bits even if the vCPU + * isn't running. If APICv is disabled, KVM _should_ search the IRR + * for a pending IRQ. But KVM currently doesn't ensure *all* hardware, + * e.g. CPUs and IOMMUs, has seen the change in state, i.e. searching + * the IRR at this time could race with IRQ delivery from hardware that + * still sees APICv as being enabled. + * + * FIXME: Ensure other vCPUs and devices observe the change in APICv + * state prior to updating KVM's metadata caches, so that KVM + * can safely search the IRR and set irr_pending accordingly. + */ + apic->irr_pending = true; + + if (apic->apicv_active) apic->isr_count = 1; - } else { - /* - * Don't clear irr_pending, searching the IRR can race with - * updates from the CPU as APICv is still active from hardware's - * perspective. The flag will be cleared as appropriate when - * KVM injects the interrupt. - */ + else apic->isr_count = count_vectors(apic->regs + APIC_ISR); - } + apic->highest_isr_cache = -1; } int kvm_alloc_apic_access_page(struct kvm *kvm) { - struct page *page; void __user *hva; int ret = 0; @@ -2618,17 +2715,6 @@ int kvm_alloc_apic_access_page(struct kvm *kvm) goto out; } - page = gfn_to_page(kvm, APIC_DEFAULT_PHYS_BASE >> PAGE_SHIFT); - if (is_error_page(page)) { - ret = -EFAULT; - goto out; - } - - /* - * Do not pin the page in memory, so that memory hot-unplug - * is able to migrate it. - */ - put_page(page); kvm->arch.apic_access_memslot_enabled = true; out: mutex_unlock(&kvm->slots_lock); @@ -2677,13 +2763,20 @@ void kvm_lapic_reset(struct kvm_vcpu *vcpu, bool init_event) u64 msr_val; int i; - static_call_cond(kvm_x86_apicv_pre_state_restore)(vcpu); + kvm_x86_call(apicv_pre_state_restore)(vcpu); if (!init_event) { msr_val = APIC_DEFAULT_PHYS_BASE | MSR_IA32_APICBASE_ENABLE; if (kvm_vcpu_is_reset_bsp(vcpu)) msr_val |= MSR_IA32_APICBASE_BSP; - kvm_lapic_set_base(vcpu, msr_val); + + /* + * Use the inner helper to avoid an extra recalcuation of the + * optimized APIC map if some other task has dirtied the map. + * The recalculation needed for this vCPU will be done after + * all APIC state has been initialized (see below). + */ + __kvm_apic_set_base(vcpu, msr_val); } if (!apic) @@ -2732,9 +2825,8 @@ void kvm_lapic_reset(struct kvm_vcpu *vcpu, bool init_event) vcpu->arch.pv_eoi.msr_val = 0; apic_update_ppr(apic); if (apic->apicv_active) { - static_call_cond(kvm_x86_apicv_post_state_restore)(vcpu); - static_call_cond(kvm_x86_hwapic_irr_update)(vcpu, -1); - static_call_cond(kvm_x86_hwapic_isr_update)(-1); + kvm_x86_call(apicv_post_state_restore)(vcpu); + kvm_x86_call(hwapic_isr_update)(vcpu, -1); } vcpu->arch.apic_arb_prio = 0; @@ -2776,7 +2868,8 @@ int kvm_apic_local_deliver(struct kvm_lapic *apic, int lvt_type) trig_mode = reg & APIC_LVT_LEVEL_TRIGGER; r = __apic_accept_irq(apic, mode, vector, 1, trig_mode, NULL); - if (r && lvt_type == APIC_LVTPC) + if (r && lvt_type == APIC_LVTPC && + guest_cpuid_is_intel_compatible(apic->vcpu)) kvm_lapic_set_reg(apic, APIC_LVTPC, reg | APIC_LVT_MASKED); return r; } @@ -2811,7 +2904,7 @@ static enum hrtimer_restart apic_timer_fn(struct hrtimer *data) return HRTIMER_NORESTART; } -int kvm_create_lapic(struct kvm_vcpu *vcpu, int timer_advance_ns) +int kvm_create_lapic(struct kvm_vcpu *vcpu) { struct kvm_lapic *apic; @@ -2829,7 +2922,7 @@ int kvm_create_lapic(struct kvm_vcpu *vcpu, int timer_advance_ns) vcpu->arch.apic = apic; if (kvm_x86_ops.alloc_apic_backing_page) - apic->regs = static_call(kvm_x86_alloc_apic_backing_page)(vcpu); + apic->regs = kvm_x86_call(alloc_apic_backing_page)(vcpu); else apic->regs = (void *)get_zeroed_page(GFP_KERNEL_ACCOUNT); if (!apic->regs) { @@ -2841,16 +2934,10 @@ int kvm_create_lapic(struct kvm_vcpu *vcpu, int timer_advance_ns) apic->nr_lvt_entries = kvm_apic_calc_nr_lvt_entries(vcpu); - hrtimer_init(&apic->lapic_timer.timer, CLOCK_MONOTONIC, - HRTIMER_MODE_ABS_HARD); - apic->lapic_timer.timer.function = apic_timer_fn; - if (timer_advance_ns == -1) { + hrtimer_setup(&apic->lapic_timer.timer, apic_timer_fn, CLOCK_MONOTONIC, + HRTIMER_MODE_ABS_HARD); + if (lapic_timer_advance) apic->lapic_timer.timer_advance_ns = LAPIC_TIMER_ADVANCE_NS_INIT; - lapic_timer_advance_dynamic = true; - } else { - apic->lapic_timer.timer_advance_ns = timer_advance_ns; - lapic_timer_advance_dynamic = false; - } /* * Stuff the APIC ENABLE bit in lieu of temporarily incrementing @@ -2891,6 +2978,9 @@ int kvm_apic_has_interrupt(struct kvm_vcpu *vcpu) if (!kvm_apic_present(vcpu)) return -1; + if (apic->guest_apic_protected) + return -1; + __apic_update_ppr(apic, &ppr); return apic_has_interrupt_for_ppr(apic, ppr); } @@ -2918,14 +3008,13 @@ void kvm_inject_apic_timer_irqs(struct kvm_vcpu *vcpu) } } -int kvm_get_apic_interrupt(struct kvm_vcpu *vcpu) +void kvm_apic_ack_interrupt(struct kvm_vcpu *vcpu, int vector) { - int vector = kvm_apic_has_interrupt(vcpu); struct kvm_lapic *apic = vcpu->arch.apic; u32 ppr; - if (vector == -1) - return -1; + if (WARN_ON_ONCE(vector < 0 || !apic)) + return; /* * We get here even with APIC virtualization enabled, if doing @@ -2953,41 +3042,55 @@ int kvm_get_apic_interrupt(struct kvm_vcpu *vcpu) __apic_update_ppr(apic, &ppr); } - return vector; } +EXPORT_SYMBOL_GPL(kvm_apic_ack_interrupt); static int kvm_apic_state_fixup(struct kvm_vcpu *vcpu, struct kvm_lapic_state *s, bool set) { if (apic_x2apic_mode(vcpu->arch.apic)) { + u32 x2apic_id = kvm_x2apic_id(vcpu->arch.apic); u32 *id = (u32 *)(s->regs + APIC_ID); u32 *ldr = (u32 *)(s->regs + APIC_LDR); u64 icr; if (vcpu->kvm->arch.x2apic_format) { - if (*id != vcpu->vcpu_id) + if (*id != x2apic_id) return -EINVAL; } else { + /* + * Ignore the userspace value when setting APIC state. + * KVM's model is that the x2APIC ID is readonly, e.g. + * KVM only supports delivering interrupts to KVM's + * version of the x2APIC ID. However, for backwards + * compatibility, don't reject attempts to set a + * mismatched ID for userspace that hasn't opted into + * x2apic_format. + */ if (set) - *id >>= 24; + *id = x2apic_id; else - *id <<= 24; + *id = x2apic_id << 24; } /* * In x2APIC mode, the LDR is fixed and based on the id. And - * ICR is internally a single 64-bit register, but needs to be - * split to ICR+ICR2 in userspace for backwards compatibility. + * if the ICR is _not_ split, ICR is internally a single 64-bit + * register, but needs to be split to ICR+ICR2 in userspace for + * backwards compatibility. */ - if (set) { - *ldr = kvm_apic_calc_x2apic_ldr(*id); - - icr = __kvm_lapic_get_reg(s->regs, APIC_ICR) | - (u64)__kvm_lapic_get_reg(s->regs, APIC_ICR2) << 32; - __kvm_lapic_set_reg64(s->regs, APIC_ICR, icr); - } else { - icr = __kvm_lapic_get_reg64(s->regs, APIC_ICR); - __kvm_lapic_set_reg(s->regs, APIC_ICR2, icr >> 32); + if (set) + *ldr = kvm_apic_calc_x2apic_ldr(x2apic_id); + + if (!kvm_x86_ops.x2apic_icr_is_split) { + if (set) { + icr = __kvm_lapic_get_reg(s->regs, APIC_ICR) | + (u64)__kvm_lapic_get_reg(s->regs, APIC_ICR2) << 32; + __kvm_lapic_set_reg64(s->regs, APIC_ICR, icr); + } else { + icr = __kvm_lapic_get_reg64(s->regs, APIC_ICR); + __kvm_lapic_set_reg(s->regs, APIC_ICR2, icr >> 32); + } } } @@ -3013,9 +3116,8 @@ int kvm_apic_set_state(struct kvm_vcpu *vcpu, struct kvm_lapic_state *s) struct kvm_lapic *apic = vcpu->arch.apic; int r; - static_call_cond(kvm_x86_apicv_pre_state_restore)(vcpu); + kvm_x86_call(apicv_pre_state_restore)(vcpu); - kvm_lapic_set_base(vcpu, vcpu->arch.apic_base); /* set SPIV separately to get count of SW disabled APICs right */ apic_set_spiv(apic, *((u32 *)(s->regs + APIC_SPIV))); @@ -3040,9 +3142,8 @@ int kvm_apic_set_state(struct kvm_vcpu *vcpu, struct kvm_lapic_state *s) kvm_lapic_set_reg(apic, APIC_TMCCT, 0); kvm_apic_update_apicv(vcpu); if (apic->apicv_active) { - static_call_cond(kvm_x86_apicv_post_state_restore)(vcpu); - static_call_cond(kvm_x86_hwapic_irr_update)(vcpu, apic_find_highest_irr(apic)); - static_call_cond(kvm_x86_hwapic_isr_update)(apic_find_highest_isr(apic)); + kvm_x86_call(apicv_post_state_restore)(vcpu); + kvm_x86_call(hwapic_isr_update)(vcpu, apic_find_highest_isr(apic)); } kvm_make_request(KVM_REQ_EVENT, vcpu); if (ioapic_in_kernel(vcpu->kvm)) @@ -3179,22 +3280,12 @@ int kvm_lapic_set_vapic_addr(struct kvm_vcpu *vcpu, gpa_t vapic_addr) return 0; } -int kvm_x2apic_icr_write(struct kvm_lapic *apic, u64 data) -{ - data &= ~APIC_ICR_BUSY; - - kvm_apic_send_ipi(apic, (u32)data, (u32)(data >> 32)); - kvm_lapic_set_reg64(apic, APIC_ICR, data); - trace_kvm_apic_write(APIC_ICR, data); - return 0; -} - static int kvm_lapic_msr_read(struct kvm_lapic *apic, u32 reg, u64 *data) { u32 low; if (reg == APIC_ICR) { - *data = kvm_lapic_get_reg64(apic, APIC_ICR); + *data = kvm_x2apic_icr_read(apic); return 0; } @@ -3321,17 +3412,18 @@ int kvm_apic_accept_events(struct kvm_vcpu *vcpu) if (test_and_clear_bit(KVM_APIC_INIT, &apic->pending_events)) { kvm_vcpu_reset(vcpu, true); if (kvm_vcpu_is_bsp(apic->vcpu)) - vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE; + kvm_set_mp_state(vcpu, KVM_MP_STATE_RUNNABLE); else - vcpu->arch.mp_state = KVM_MP_STATE_INIT_RECEIVED; + kvm_set_mp_state(vcpu, KVM_MP_STATE_INIT_RECEIVED); } if (test_and_clear_bit(KVM_APIC_SIPI, &apic->pending_events)) { if (vcpu->arch.mp_state == KVM_MP_STATE_INIT_RECEIVED) { /* evaluate pending_events before reading the vector */ smp_rmb(); sipi_vector = apic->sipi_vector; - static_call(kvm_x86_vcpu_deliver_sipi_vector)(vcpu, sipi_vector); - vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE; + kvm_x86_call(vcpu_deliver_sipi_vector)(vcpu, + sipi_vector); + kvm_set_mp_state(vcpu, KVM_MP_STATE_RUNNABLE); } } return 0; diff --git a/arch/x86/kvm/lapic.h b/arch/x86/kvm/lapic.h index 0a0ea4b5dd8c..4ce30db65828 100644 --- a/arch/x86/kvm/lapic.h +++ b/arch/x86/kvm/lapic.h @@ -16,8 +16,7 @@ #define APIC_DEST_NOSHORT 0x0 #define APIC_DEST_MASK 0x800 -#define APIC_BUS_CYCLE_NS 1 -#define APIC_BUS_FREQUENCY (1000000000ULL / APIC_BUS_CYCLE_NS) +#define APIC_BUS_CYCLE_NS_DEFAULT 1 #define APIC_BROADCAST 0xFF #define X2APIC_BROADCAST 0xFFFFFFFFul @@ -66,6 +65,8 @@ struct kvm_lapic { bool sw_enabled; bool irr_pending; bool lvt0_in_nmi_mode; + /* Select registers in the vAPIC cannot be read/written. */ + bool guest_apic_protected; /* Number of bits set in ISR. */ s16 isr_count; /* The highest vector set in ISR; if -1 - invalid, must scan ISR. */ @@ -85,28 +86,25 @@ struct kvm_lapic { struct dest_map; -int kvm_create_lapic(struct kvm_vcpu *vcpu, int timer_advance_ns); +int kvm_create_lapic(struct kvm_vcpu *vcpu); void kvm_free_lapic(struct kvm_vcpu *vcpu); int kvm_apic_has_interrupt(struct kvm_vcpu *vcpu); +void kvm_apic_ack_interrupt(struct kvm_vcpu *vcpu, int vector); int kvm_apic_accept_pic_intr(struct kvm_vcpu *vcpu); -int kvm_get_apic_interrupt(struct kvm_vcpu *vcpu); int kvm_apic_accept_events(struct kvm_vcpu *vcpu); void kvm_lapic_reset(struct kvm_vcpu *vcpu, bool init_event); u64 kvm_lapic_get_cr8(struct kvm_vcpu *vcpu); void kvm_lapic_set_tpr(struct kvm_vcpu *vcpu, unsigned long cr8); void kvm_lapic_set_eoi(struct kvm_vcpu *vcpu); -void kvm_lapic_set_base(struct kvm_vcpu *vcpu, u64 value); -u64 kvm_lapic_get_base(struct kvm_vcpu *vcpu); -void kvm_recalculate_apic_map(struct kvm *kvm); void kvm_apic_set_version(struct kvm_vcpu *vcpu); void kvm_apic_after_set_mcg_cap(struct kvm_vcpu *vcpu); bool kvm_apic_match_dest(struct kvm_vcpu *vcpu, struct kvm_lapic *source, int shorthand, unsigned int dest, int dest_mode); int kvm_apic_compare_prio(struct kvm_vcpu *vcpu1, struct kvm_vcpu *vcpu2); void kvm_apic_clear_irr(struct kvm_vcpu *vcpu, int vec); -bool __kvm_apic_update_irr(u32 *pir, void *regs, int *max_irr); -bool kvm_apic_update_irr(struct kvm_vcpu *vcpu, u32 *pir, int *max_irr); +bool __kvm_apic_update_irr(unsigned long *pir, void *regs, int *max_irr); +bool kvm_apic_update_irr(struct kvm_vcpu *vcpu, unsigned long *pir, int *max_irr); void kvm_apic_update_ppr(struct kvm_vcpu *vcpu); int kvm_apic_set_irq(struct kvm_vcpu *vcpu, struct kvm_lapic_irq *irq, struct dest_map *dest_map); @@ -119,11 +117,10 @@ bool kvm_irq_delivery_to_apic_fast(struct kvm *kvm, struct kvm_lapic *src, struct kvm_lapic_irq *irq, int *r, struct dest_map *dest_map); void kvm_apic_send_ipi(struct kvm_lapic *apic, u32 icr_low, u32 icr_high); -u64 kvm_get_apic_base(struct kvm_vcpu *vcpu); -int kvm_set_apic_base(struct kvm_vcpu *vcpu, struct msr_data *msr_info); +int kvm_apic_set_base(struct kvm_vcpu *vcpu, u64 value, bool host_initiated); int kvm_apic_get_state(struct kvm_vcpu *vcpu, struct kvm_lapic_state *s); int kvm_apic_set_state(struct kvm_vcpu *vcpu, struct kvm_lapic_state *s); -enum lapic_mode kvm_get_apic_mode(struct kvm_vcpu *vcpu); +void kvm_apic_update_hwapic_isr(struct kvm_vcpu *vcpu); int kvm_lapic_find_highest_irr(struct kvm_vcpu *vcpu); u64 kvm_get_lapic_tscdeadline_msr(struct kvm_vcpu *vcpu); @@ -236,7 +233,7 @@ static inline bool kvm_apic_has_pending_init_or_sipi(struct kvm_vcpu *vcpu) static inline bool kvm_apic_init_sipi_allowed(struct kvm_vcpu *vcpu) { return !is_smm(vcpu) && - !static_call(kvm_x86_apic_init_signal_blocked)(vcpu); + !kvm_x86_call(apic_init_signal_blocked)(vcpu); } static inline bool kvm_lowest_prio_delivery(struct kvm_lapic_irq *irq) @@ -273,6 +270,11 @@ static inline enum lapic_mode kvm_apic_mode(u64 apic_base) return apic_base & (MSR_IA32_APICBASE_ENABLE | X2APIC_ENABLE); } +static inline enum lapic_mode kvm_get_apic_mode(struct kvm_vcpu *vcpu) +{ + return kvm_apic_mode(vcpu->arch.apic_base); +} + static inline u8 kvm_xapic_id(struct kvm_lapic *apic) { return kvm_lapic_get_reg(apic, APIC_ID) >> 24; diff --git a/arch/x86/kvm/mmu.h b/arch/x86/kvm/mmu.h index 60f21bb4c27b..b4b6860ab971 100644 --- a/arch/x86/kvm/mmu.h +++ b/arch/x86/kvm/mmu.h @@ -4,6 +4,7 @@ #include <linux/kvm_host.h> #include "kvm_cache_regs.h" +#include "x86.h" #include "cpuid.h" extern bool __read_mostly enable_mmio_caching; @@ -57,12 +58,6 @@ static __always_inline u64 rsvd_bits(int s, int e) return ((2ULL << (e - s)) - 1) << s; } -/* - * The number of non-reserved physical address bits irrespective of features - * that repurpose legal bits, e.g. MKTME. - */ -extern u8 __read_mostly shadow_phys_bits; - static inline gfn_t kvm_mmu_max_gfn(void) { /* @@ -76,31 +71,15 @@ static inline gfn_t kvm_mmu_max_gfn(void) * than hardware's real MAXPHYADDR. Using the host MAXPHYADDR * disallows such SPTEs entirely and simplifies the TDP MMU. */ - int max_gpa_bits = likely(tdp_enabled) ? shadow_phys_bits : 52; + int max_gpa_bits = likely(tdp_enabled) ? kvm_host.maxphyaddr : 52; return (1ULL << (max_gpa_bits - PAGE_SHIFT)) - 1; } -static inline u8 kvm_get_shadow_phys_bits(void) -{ - /* - * boot_cpu_data.x86_phys_bits is reduced when MKTME or SME are detected - * in CPU detection code, but the processor treats those reduced bits as - * 'keyID' thus they are not reserved bits. Therefore KVM needs to look at - * the physical address bits reported by CPUID. - */ - if (likely(boot_cpu_data.extended_cpuid_level >= 0x80000008)) - return cpuid_eax(0x80000008) & 0xff; - - /* - * Quite weird to have VMX or SVM but not MAXPHYADDR; probably a VM with - * custom CPUID. Proceed with whatever the kernel found since these features - * aren't virtualizable (SME/SEV also require CPUIDs higher than 0x80000008). - */ - return boot_cpu_data.x86_phys_bits; -} +u8 kvm_mmu_get_max_tdp_level(void); void kvm_mmu_set_mmio_spte_mask(u64 mmio_value, u64 mmio_mask, u64 access_mask); +void kvm_mmu_set_mmio_spte_value(struct kvm *kvm, u64 mmio_value); void kvm_mmu_set_me_spte_mask(u64 me_value, u64 me_mask); void kvm_mmu_set_ept_masks(bool has_ad_bits, bool has_exec_only); @@ -126,6 +105,18 @@ void kvm_mmu_track_write(struct kvm_vcpu *vcpu, gpa_t gpa, const u8 *new, static inline int kvm_mmu_reload(struct kvm_vcpu *vcpu) { + if (kvm_check_request(KVM_REQ_MMU_FREE_OBSOLETE_ROOTS, vcpu)) + kvm_mmu_free_obsolete_roots(vcpu); + + /* + * Checking root.hpa is sufficient even when KVM has mirror root. + * We can have either: + * (1) mirror_root_hpa = INVALID_PAGE, root.hpa = INVALID_PAGE + * (2) mirror_root_hpa = root, root.hpa = INVALID_PAGE + * (3) mirror_root_hpa = root1, root.hpa = root2 + * We don't ever have: + * mirror_root_hpa = INVALID_PAGE, root.hpa = root + */ if (likely(vcpu->arch.mmu->root.hpa != INVALID_PAGE)) return 0; @@ -148,7 +139,7 @@ static inline unsigned long kvm_get_active_pcid(struct kvm_vcpu *vcpu) static inline unsigned long kvm_get_active_cr3_lam_bits(struct kvm_vcpu *vcpu) { - if (!guest_can_use(vcpu, X86_FEATURE_LAM)) + if (!guest_cpu_cap_has(vcpu, X86_FEATURE_LAM)) return 0; return kvm_read_cr3(vcpu) & (X86_CR3_LAM_U48 | X86_CR3_LAM_U57); @@ -161,8 +152,8 @@ static inline void kvm_mmu_load_pgd(struct kvm_vcpu *vcpu) if (!VALID_PAGE(root_hpa)) return; - static_call(kvm_x86_load_mmu_pgd)(vcpu, root_hpa, - vcpu->arch.mmu->root_role.level); + kvm_x86_call(load_mmu_pgd)(vcpu, root_hpa, + vcpu->arch.mmu->root_role.level); } static inline void kvm_mmu_refresh_passthrough_bits(struct kvm_vcpu *vcpu, @@ -197,7 +188,7 @@ static inline u8 permission_fault(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu, { /* strip nested paging fault error codes */ unsigned int pfec = access; - unsigned long rflags = static_call(kvm_x86_get_rflags)(vcpu); + unsigned long rflags = kvm_x86_call(get_rflags)(vcpu); /* * For explicit supervisor accesses, SMAP is disabled if EFLAGS.AC = 1. @@ -213,7 +204,7 @@ static inline u8 permission_fault(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu, */ u64 implicit_access = access & PFERR_IMPLICIT_ACCESS; bool not_smap = ((rflags & X86_EFLAGS_AC) | implicit_access) == X86_EFLAGS_AC; - int index = (pfec + (not_smap << PFERR_RSVD_BIT)) >> 1; + int index = (pfec | (not_smap ? PFERR_RSVD_MASK : 0)) >> 1; u32 errcode = PFERR_PRESENT_MASK; bool fault; @@ -234,8 +225,7 @@ static inline u8 permission_fault(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu, pkru_bits = (vcpu->arch.pkru >> (pte_pkey * 2)) & 3; /* clear present bit, replace PFEC.RSVD with ACC_USER_MASK. */ - offset = (pfec & ~1) + - ((pte_access & PT_USER_MASK) << (PFERR_RSVD_BIT - PT_USER_SHIFT)); + offset = (pfec & ~1) | ((pte_access & PT_USER_MASK) ? PFERR_RSVD_MASK : 0); pkru_bits &= mmu->pkru_mask >> offset; errcode |= -pkru_bits & PFERR_PK_MASK; @@ -245,16 +235,7 @@ static inline u8 permission_fault(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu, return -(u32)fault & errcode; } -bool __kvm_mmu_honors_guest_mtrrs(bool vm_has_noncoherent_dma); - -static inline bool kvm_mmu_honors_guest_mtrrs(struct kvm *kvm) -{ - return __kvm_mmu_honors_guest_mtrrs(kvm_arch_has_noncoherent_dma(kvm)); -} - -void kvm_zap_gfn_range(struct kvm *kvm, gfn_t gfn_start, gfn_t gfn_end); - -int kvm_arch_write_log_dirty(struct kvm_vcpu *vcpu); +bool kvm_mmu_may_ignore_guest_pat(struct kvm *kvm); int kvm_mmu_post_init_vm(struct kvm *kvm); void kvm_mmu_pre_destroy_vm(struct kvm *kvm); @@ -276,6 +257,9 @@ extern bool tdp_mmu_enabled; #define tdp_mmu_enabled false #endif +bool kvm_tdp_mmu_gpa_is_mapped(struct kvm_vcpu *vcpu, u64 gpa); +int kvm_tdp_map_page(struct kvm_vcpu *vcpu, gpa_t gpa, u64 error_code, u8 *level); + static inline bool kvm_memslots_have_rmaps(struct kvm *kvm) { return !tdp_mmu_enabled || kvm_shadow_root_allocated(kvm); @@ -319,4 +303,26 @@ static inline gpa_t kvm_translate_gpa(struct kvm_vcpu *vcpu, return gpa; return translate_nested_gpa(vcpu, gpa, access, exception); } + +static inline bool kvm_has_mirrored_tdp(const struct kvm *kvm) +{ + return kvm->arch.vm_type == KVM_X86_TDX_VM; +} + +static inline gfn_t kvm_gfn_direct_bits(const struct kvm *kvm) +{ + return kvm->arch.gfn_direct_bits; +} + +static inline bool kvm_is_addr_direct(struct kvm *kvm, gpa_t gpa) +{ + gpa_t gpa_direct_bits = gfn_to_gpa(kvm_gfn_direct_bits(kvm)); + + return !gpa_direct_bits || (gpa & gpa_direct_bits); +} + +static inline bool kvm_is_gfn_alias(struct kvm *kvm, gfn_t gfn) +{ + return gfn & kvm_gfn_direct_bits(kvm); +} #endif diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c index 992e651540e8..4e06e2e89a8f 100644 --- a/arch/x86/kvm/mmu/mmu.c +++ b/arch/x86/kvm/mmu/mmu.c @@ -110,6 +110,7 @@ static bool __ro_after_init tdp_mmu_allowed; #ifdef CONFIG_X86_64 bool __read_mostly tdp_mmu_enabled = true; module_param_named(tdp_mmu, tdp_mmu_enabled, bool, 0444); +EXPORT_SYMBOL_GPL(tdp_mmu_enabled); #endif static int max_huge_page_level __read_mostly; @@ -179,7 +180,6 @@ struct kvm_shadow_walk_iterator { static struct kmem_cache *pte_list_desc_cache; struct kmem_cache *mmu_page_header_cache; -static struct percpu_counter kvm_total_used_mmu_pages; static void mmu_spte_set(u64 *sptep, u64 spte); @@ -336,16 +336,19 @@ static int is_cpuid_PSE36(void) #ifdef CONFIG_X86_64 static void __set_spte(u64 *sptep, u64 spte) { + KVM_MMU_WARN_ON(is_ept_ve_possible(spte)); WRITE_ONCE(*sptep, spte); } static void __update_clear_spte_fast(u64 *sptep, u64 spte) { + KVM_MMU_WARN_ON(is_ept_ve_possible(spte)); WRITE_ONCE(*sptep, spte); } static u64 __update_clear_spte_slow(u64 *sptep, u64 spte) { + KVM_MMU_WARN_ON(is_ept_ve_possible(spte)); return xchg(sptep, spte); } @@ -432,8 +435,8 @@ static u64 __update_clear_spte_slow(u64 *sptep, u64 spte) * The idea using the light way get the spte on x86_32 guest is from * gup_get_pte (mm/gup.c). * - * An spte tlb flush may be pending, because kvm_set_pte_rmap - * coalesces them and we are running out of the MMU lock. Therefore + * An spte tlb flush may be pending, because they are coalesced and + * we are running out of the MMU lock. Therefore * we need to protect against in-progress updates of the spte. * * Reading the spte while an update is in progress may get the old value @@ -482,11 +485,12 @@ static void mmu_spte_set(u64 *sptep, u64 new_spte) __set_spte(sptep, new_spte); } -/* - * Update the SPTE (excluding the PFN), but do not track changes in its - * accessed/dirty status. +/* Rules for using mmu_spte_update: + * Update the state bits, it means the mapped pfn is not changed. + * + * Returns true if the TLB needs to be flushed */ -static u64 mmu_spte_update_no_track(u64 *sptep, u64 new_spte) +static bool mmu_spte_update(u64 *sptep, u64 new_spte) { u64 old_spte = *sptep; @@ -495,61 +499,18 @@ static u64 mmu_spte_update_no_track(u64 *sptep, u64 new_spte) if (!is_shadow_present_pte(old_spte)) { mmu_spte_set(sptep, new_spte); - return old_spte; + return false; } - if (!spte_has_volatile_bits(old_spte)) + if (!spte_needs_atomic_update(old_spte)) __update_clear_spte_fast(sptep, new_spte); else old_spte = __update_clear_spte_slow(sptep, new_spte); - WARN_ON_ONCE(spte_to_pfn(old_spte) != spte_to_pfn(new_spte)); - - return old_spte; -} - -/* Rules for using mmu_spte_update: - * Update the state bits, it means the mapped pfn is not changed. - * - * Whenever an MMU-writable SPTE is overwritten with a read-only SPTE, remote - * TLBs must be flushed. Otherwise rmap_write_protect will find a read-only - * spte, even though the writable spte might be cached on a CPU's TLB. - * - * Returns true if the TLB needs to be flushed - */ -static bool mmu_spte_update(u64 *sptep, u64 new_spte) -{ - bool flush = false; - u64 old_spte = mmu_spte_update_no_track(sptep, new_spte); - - if (!is_shadow_present_pte(old_spte)) - return false; - - /* - * For the spte updated out of mmu-lock is safe, since - * we always atomically update it, see the comments in - * spte_has_volatile_bits(). - */ - if (is_mmu_writable_spte(old_spte) && - !is_writable_pte(new_spte)) - flush = true; - - /* - * Flush TLB when accessed/dirty states are changed in the page tables, - * to guarantee consistency between TLB and page tables. - */ - - if (is_accessed_spte(old_spte) && !is_accessed_spte(new_spte)) { - flush = true; - kvm_set_pfn_accessed(spte_to_pfn(old_spte)); - } + WARN_ON_ONCE(!is_shadow_present_pte(old_spte) || + spte_to_pfn(old_spte) != spte_to_pfn(new_spte)); - if (is_dirty_spte(old_spte) && !is_dirty_spte(new_spte)) { - flush = true; - kvm_set_pfn_dirty(spte_to_pfn(old_spte)); - } - - return flush; + return leaf_spte_change_needs_tlb_flush(old_spte, new_spte); } /* @@ -560,39 +521,19 @@ static bool mmu_spte_update(u64 *sptep, u64 new_spte) */ static u64 mmu_spte_clear_track_bits(struct kvm *kvm, u64 *sptep) { - kvm_pfn_t pfn; u64 old_spte = *sptep; int level = sptep_to_sp(sptep)->role.level; - struct page *page; if (!is_shadow_present_pte(old_spte) || - !spte_has_volatile_bits(old_spte)) - __update_clear_spte_fast(sptep, 0ull); + !spte_needs_atomic_update(old_spte)) + __update_clear_spte_fast(sptep, SHADOW_NONPRESENT_VALUE); else - old_spte = __update_clear_spte_slow(sptep, 0ull); + old_spte = __update_clear_spte_slow(sptep, SHADOW_NONPRESENT_VALUE); if (!is_shadow_present_pte(old_spte)) return old_spte; kvm_update_page_stats(kvm, level, -1); - - pfn = spte_to_pfn(old_spte); - - /* - * KVM doesn't hold a reference to any pages mapped into the guest, and - * instead uses the mmu_notifier to ensure that KVM unmaps any pages - * before they are reclaimed. Sanity check that, if the pfn is backed - * by a refcounted page, the refcount is elevated. - */ - page = kvm_pfn_to_refcounted_page(pfn); - WARN_ON_ONCE(page && !page_count(page)); - - if (is_accessed_spte(old_spte)) - kvm_set_pfn_accessed(pfn); - - if (is_dirty_spte(old_spte)) - kvm_set_pfn_dirty(pfn); - return old_spte; } @@ -603,7 +544,7 @@ static u64 mmu_spte_clear_track_bits(struct kvm *kvm, u64 *sptep) */ static void mmu_spte_clear_no_track(u64 *sptep) { - __update_clear_spte_fast(sptep, 0ull); + __update_clear_spte_fast(sptep, SHADOW_NONPRESENT_VALUE); } static u64 mmu_spte_get_lockless(u64 *sptep) @@ -611,32 +552,6 @@ static u64 mmu_spte_get_lockless(u64 *sptep) return __get_spte_lockless(sptep); } -/* Returns the Accessed status of the PTE and resets it at the same time. */ -static bool mmu_spte_age(u64 *sptep) -{ - u64 spte = mmu_spte_get_lockless(sptep); - - if (!is_accessed_spte(spte)) - return false; - - if (spte_ad_enabled(spte)) { - clear_bit((ffs(shadow_accessed_mask) - 1), - (unsigned long *)sptep); - } else { - /* - * Capture the dirty status of the page, so that it doesn't get - * lost when the SPTE is marked for access tracking. - */ - if (is_writable_pte(spte)) - kvm_set_pfn_dirty(spte_to_pfn(spte)); - - spte = mark_spte_for_access_track(spte); - mmu_spte_update_no_track(sptep, spte); - } - - return true; -} - static inline bool is_tdp_mmu_active(struct kvm_vcpu *vcpu) { return tdp_mmu_enabled && vcpu->arch.mmu->root_role.direct; @@ -685,6 +600,12 @@ static int mmu_topup_memory_caches(struct kvm_vcpu *vcpu, bool maybe_indirect) 1 + PT64_ROOT_MAX_LEVEL + PTE_PREFETCH_NUM); if (r) return r; + if (kvm_has_mirrored_tdp(vcpu->kvm)) { + r = kvm_mmu_topup_memory_cache(&vcpu->arch.mmu_external_spt_cache, + PT64_ROOT_MAX_LEVEL); + if (r) + return r; + } r = kvm_mmu_topup_memory_cache(&vcpu->arch.mmu_shadow_page_cache, PT64_ROOT_MAX_LEVEL); if (r) @@ -704,6 +625,7 @@ static void mmu_free_memory_caches(struct kvm_vcpu *vcpu) kvm_mmu_free_memory_cache(&vcpu->arch.mmu_pte_list_desc_cache); kvm_mmu_free_memory_cache(&vcpu->arch.mmu_shadow_page_cache); kvm_mmu_free_memory_cache(&vcpu->arch.mmu_shadowed_info_cache); + kvm_mmu_free_memory_cache(&vcpu->arch.mmu_external_spt_cache); kvm_mmu_free_memory_cache(&vcpu->arch.mmu_page_header_cache); } @@ -719,7 +641,7 @@ static gfn_t kvm_mmu_page_get_gfn(struct kvm_mmu_page *sp, int index) if (sp->role.passthrough) return sp->gfn; - if (!sp->role.direct) + if (sp->shadowed_translation) return sp->shadowed_translation[index] >> PAGE_SHIFT; return sp->gfn + (index << ((sp->role.level - 1) * SPTE_LEVEL_BITS)); @@ -733,7 +655,7 @@ static gfn_t kvm_mmu_page_get_gfn(struct kvm_mmu_page *sp, int index) */ static u32 kvm_mmu_page_get_access(struct kvm_mmu_page *sp, int index) { - if (sp_has_gptes(sp)) + if (sp->shadowed_translation) return sp->shadowed_translation[index] & ACC_ALL; /* @@ -754,7 +676,7 @@ static u32 kvm_mmu_page_get_access(struct kvm_mmu_page *sp, int index) static void kvm_mmu_page_set_translation(struct kvm_mmu_page *sp, int index, gfn_t gfn, unsigned int access) { - if (sp_has_gptes(sp)) { + if (sp->shadowed_translation) { sp->shadowed_translation[index] = (gfn << PAGE_SHIFT) | access; return; } @@ -831,6 +753,15 @@ static void account_shadowed(struct kvm *kvm, struct kvm_mmu_page *sp) gfn_t gfn; kvm->arch.indirect_shadow_pages++; + /* + * Ensure indirect_shadow_pages is elevated prior to re-reading guest + * child PTEs in FNAME(gpte_changed), i.e. guarantee either in-flight + * emulated writes are visible before re-reading guest PTEs, or that + * an emulated write will see the elevated count and acquire mmu_lock + * to update SPTEs. Pairs with the smp_mb() in kvm_mmu_track_write(). + */ + smp_mb(); + gfn = sp->gfn; slots = kvm_memslots_for_spte_role(kvm, sp->role); slot = __gfn_to_memslot(slots, gfn); @@ -923,31 +854,173 @@ static struct kvm_memory_slot *gfn_to_memslot_dirty_bitmap(struct kvm_vcpu *vcpu * About rmap_head encoding: * * If the bit zero of rmap_head->val is clear, then it points to the only spte - * in this rmap chain. Otherwise, (rmap_head->val & ~1) points to a struct + * in this rmap chain. Otherwise, (rmap_head->val & ~3) points to a struct * pte_list_desc containing more mappings. */ +#define KVM_RMAP_MANY BIT(0) + +/* + * rmaps and PTE lists are mostly protected by mmu_lock (the shadow MMU always + * operates with mmu_lock held for write), but rmaps can be walked without + * holding mmu_lock so long as the caller can tolerate SPTEs in the rmap chain + * being zapped/dropped _while the rmap is locked_. + * + * Other than the KVM_RMAP_LOCKED flag, modifications to rmap entries must be + * done while holding mmu_lock for write. This allows a task walking rmaps + * without holding mmu_lock to concurrently walk the same entries as a task + * that is holding mmu_lock but _not_ the rmap lock. Neither task will modify + * the rmaps, thus the walks are stable. + * + * As alluded to above, SPTEs in rmaps are _not_ protected by KVM_RMAP_LOCKED, + * only the rmap chains themselves are protected. E.g. holding an rmap's lock + * ensures all "struct pte_list_desc" fields are stable. + */ +#define KVM_RMAP_LOCKED BIT(1) + +static unsigned long __kvm_rmap_lock(struct kvm_rmap_head *rmap_head) +{ + unsigned long old_val, new_val; + + lockdep_assert_preemption_disabled(); + + /* + * Elide the lock if the rmap is empty, as lockless walkers (read-only + * mode) don't need to (and can't) walk an empty rmap, nor can they add + * entries to the rmap. I.e. the only paths that process empty rmaps + * do so while holding mmu_lock for write, and are mutually exclusive. + */ + old_val = atomic_long_read(&rmap_head->val); + if (!old_val) + return 0; + + do { + /* + * If the rmap is locked, wait for it to be unlocked before + * trying acquire the lock, e.g. to avoid bouncing the cache + * line. + */ + while (old_val & KVM_RMAP_LOCKED) { + cpu_relax(); + old_val = atomic_long_read(&rmap_head->val); + } + + /* + * Recheck for an empty rmap, it may have been purged by the + * task that held the lock. + */ + if (!old_val) + return 0; + + new_val = old_val | KVM_RMAP_LOCKED; + /* + * Use try_cmpxchg_acquire() to prevent reads and writes to the rmap + * from being reordered outside of the critical section created by + * __kvm_rmap_lock(). + * + * Pairs with the atomic_long_set_release() in kvm_rmap_unlock(). + * + * For the !old_val case, no ordering is needed, as there is no rmap + * to walk. + */ + } while (!atomic_long_try_cmpxchg_acquire(&rmap_head->val, &old_val, new_val)); + + /* + * Return the old value, i.e. _without_ the LOCKED bit set. It's + * impossible for the return value to be 0 (see above), i.e. the read- + * only unlock flow can't get a false positive and fail to unlock. + */ + return old_val; +} + +static unsigned long kvm_rmap_lock(struct kvm *kvm, + struct kvm_rmap_head *rmap_head) +{ + lockdep_assert_held_write(&kvm->mmu_lock); + + return __kvm_rmap_lock(rmap_head); +} + +static void __kvm_rmap_unlock(struct kvm_rmap_head *rmap_head, + unsigned long val) +{ + KVM_MMU_WARN_ON(val & KVM_RMAP_LOCKED); + /* + * Ensure that all accesses to the rmap have completed before unlocking + * the rmap. + * + * Pairs with the atomic_long_try_cmpxchg_acquire() in __kvm_rmap_lock(). + */ + atomic_long_set_release(&rmap_head->val, val); +} + +static void kvm_rmap_unlock(struct kvm *kvm, + struct kvm_rmap_head *rmap_head, + unsigned long new_val) +{ + lockdep_assert_held_write(&kvm->mmu_lock); + + __kvm_rmap_unlock(rmap_head, new_val); +} + +static unsigned long kvm_rmap_get(struct kvm_rmap_head *rmap_head) +{ + return atomic_long_read(&rmap_head->val) & ~KVM_RMAP_LOCKED; +} + +/* + * If mmu_lock isn't held, rmaps can only be locked in read-only mode. The + * actual locking is the same, but the caller is disallowed from modifying the + * rmap, and so the unlock flow is a nop if the rmap is/was empty. + */ +static unsigned long kvm_rmap_lock_readonly(struct kvm_rmap_head *rmap_head) +{ + unsigned long rmap_val; + + preempt_disable(); + rmap_val = __kvm_rmap_lock(rmap_head); + + if (!rmap_val) + preempt_enable(); + + return rmap_val; +} + +static void kvm_rmap_unlock_readonly(struct kvm_rmap_head *rmap_head, + unsigned long old_val) +{ + if (!old_val) + return; + + KVM_MMU_WARN_ON(old_val != kvm_rmap_get(rmap_head)); + + __kvm_rmap_unlock(rmap_head, old_val); + preempt_enable(); +} /* * Returns the number of pointers in the rmap chain, not counting the new one. */ -static int pte_list_add(struct kvm_mmu_memory_cache *cache, u64 *spte, - struct kvm_rmap_head *rmap_head) +static int pte_list_add(struct kvm *kvm, struct kvm_mmu_memory_cache *cache, + u64 *spte, struct kvm_rmap_head *rmap_head) { + unsigned long old_val, new_val; struct pte_list_desc *desc; int count = 0; - if (!rmap_head->val) { - rmap_head->val = (unsigned long)spte; - } else if (!(rmap_head->val & 1)) { + old_val = kvm_rmap_lock(kvm, rmap_head); + + if (!old_val) { + new_val = (unsigned long)spte; + } else if (!(old_val & KVM_RMAP_MANY)) { desc = kvm_mmu_memory_cache_alloc(cache); - desc->sptes[0] = (u64 *)rmap_head->val; + desc->sptes[0] = (u64 *)old_val; desc->sptes[1] = spte; desc->spte_count = 2; desc->tail_count = 0; - rmap_head->val = (unsigned long)desc | 1; + new_val = (unsigned long)desc | KVM_RMAP_MANY; ++count; } else { - desc = (struct pte_list_desc *)(rmap_head->val & ~1ul); + desc = (struct pte_list_desc *)(old_val & ~KVM_RMAP_MANY); count = desc->tail_count + desc->spte_count; /* @@ -956,21 +1029,25 @@ static int pte_list_add(struct kvm_mmu_memory_cache *cache, u64 *spte, */ if (desc->spte_count == PTE_LIST_EXT) { desc = kvm_mmu_memory_cache_alloc(cache); - desc->more = (struct pte_list_desc *)(rmap_head->val & ~1ul); + desc->more = (struct pte_list_desc *)(old_val & ~KVM_RMAP_MANY); desc->spte_count = 0; desc->tail_count = count; - rmap_head->val = (unsigned long)desc | 1; + new_val = (unsigned long)desc | KVM_RMAP_MANY; + } else { + new_val = old_val; } desc->sptes[desc->spte_count++] = spte; } + + kvm_rmap_unlock(kvm, rmap_head, new_val); + return count; } -static void pte_list_desc_remove_entry(struct kvm *kvm, - struct kvm_rmap_head *rmap_head, +static void pte_list_desc_remove_entry(struct kvm *kvm, unsigned long *rmap_val, struct pte_list_desc *desc, int i) { - struct pte_list_desc *head_desc = (struct pte_list_desc *)(rmap_head->val & ~1ul); + struct pte_list_desc *head_desc = (struct pte_list_desc *)(*rmap_val & ~KVM_RMAP_MANY); int j = head_desc->spte_count - 1; /* @@ -997,9 +1074,9 @@ static void pte_list_desc_remove_entry(struct kvm *kvm, * head at the next descriptor, i.e. the new head. */ if (!head_desc->more) - rmap_head->val = 0; + *rmap_val = 0; else - rmap_head->val = (unsigned long)head_desc->more | 1; + *rmap_val = (unsigned long)head_desc->more | KVM_RMAP_MANY; mmu_free_pte_list_desc(head_desc); } @@ -1007,24 +1084,26 @@ static void pte_list_remove(struct kvm *kvm, u64 *spte, struct kvm_rmap_head *rmap_head) { struct pte_list_desc *desc; + unsigned long rmap_val; int i; - if (KVM_BUG_ON_DATA_CORRUPTION(!rmap_head->val, kvm)) - return; + rmap_val = kvm_rmap_lock(kvm, rmap_head); + if (KVM_BUG_ON_DATA_CORRUPTION(!rmap_val, kvm)) + goto out; - if (!(rmap_head->val & 1)) { - if (KVM_BUG_ON_DATA_CORRUPTION((u64 *)rmap_head->val != spte, kvm)) - return; + if (!(rmap_val & KVM_RMAP_MANY)) { + if (KVM_BUG_ON_DATA_CORRUPTION((u64 *)rmap_val != spte, kvm)) + goto out; - rmap_head->val = 0; + rmap_val = 0; } else { - desc = (struct pte_list_desc *)(rmap_head->val & ~1ul); + desc = (struct pte_list_desc *)(rmap_val & ~KVM_RMAP_MANY); while (desc) { for (i = 0; i < desc->spte_count; ++i) { if (desc->sptes[i] == spte) { - pte_list_desc_remove_entry(kvm, rmap_head, + pte_list_desc_remove_entry(kvm, &rmap_val, desc, i); - return; + goto out; } } desc = desc->more; @@ -1032,6 +1111,9 @@ static void pte_list_remove(struct kvm *kvm, u64 *spte, KVM_BUG_ON_DATA_CORRUPTION(true, kvm); } + +out: + kvm_rmap_unlock(kvm, rmap_head, rmap_val); } static void kvm_zap_one_rmap_spte(struct kvm *kvm, @@ -1046,17 +1128,19 @@ static bool kvm_zap_all_rmap_sptes(struct kvm *kvm, struct kvm_rmap_head *rmap_head) { struct pte_list_desc *desc, *next; + unsigned long rmap_val; int i; - if (!rmap_head->val) + rmap_val = kvm_rmap_lock(kvm, rmap_head); + if (!rmap_val) return false; - if (!(rmap_head->val & 1)) { - mmu_spte_clear_track_bits(kvm, (u64 *)rmap_head->val); + if (!(rmap_val & KVM_RMAP_MANY)) { + mmu_spte_clear_track_bits(kvm, (u64 *)rmap_val); goto out; } - desc = (struct pte_list_desc *)(rmap_head->val & ~1ul); + desc = (struct pte_list_desc *)(rmap_val & ~KVM_RMAP_MANY); for (; desc; desc = next) { for (i = 0; i < desc->spte_count; i++) @@ -1066,20 +1150,21 @@ static bool kvm_zap_all_rmap_sptes(struct kvm *kvm, } out: /* rmap_head is meaningless now, remember to reset it */ - rmap_head->val = 0; + kvm_rmap_unlock(kvm, rmap_head, 0); return true; } unsigned int pte_list_count(struct kvm_rmap_head *rmap_head) { + unsigned long rmap_val = kvm_rmap_get(rmap_head); struct pte_list_desc *desc; - if (!rmap_head->val) + if (!rmap_val) return 0; - else if (!(rmap_head->val & 1)) + else if (!(rmap_val & KVM_RMAP_MANY)) return 1; - desc = (struct pte_list_desc *)(rmap_head->val & ~1ul); + desc = (struct pte_list_desc *)(rmap_val & ~KVM_RMAP_MANY); return desc->tail_count + desc->spte_count; } @@ -1122,6 +1207,7 @@ static void rmap_remove(struct kvm *kvm, u64 *spte) */ struct rmap_iterator { /* private fields */ + struct rmap_head *head; struct pte_list_desc *desc; /* holds the sptep if not NULL */ int pos; /* index of the sptep */ }; @@ -1136,23 +1222,19 @@ struct rmap_iterator { static u64 *rmap_get_first(struct kvm_rmap_head *rmap_head, struct rmap_iterator *iter) { - u64 *sptep; + unsigned long rmap_val = kvm_rmap_get(rmap_head); - if (!rmap_head->val) + if (!rmap_val) return NULL; - if (!(rmap_head->val & 1)) { + if (!(rmap_val & KVM_RMAP_MANY)) { iter->desc = NULL; - sptep = (u64 *)rmap_head->val; - goto out; + return (u64 *)rmap_val; } - iter->desc = (struct pte_list_desc *)(rmap_head->val & ~1ul); + iter->desc = (struct pte_list_desc *)(rmap_val & ~KVM_RMAP_MANY); iter->pos = 0; - sptep = iter->desc->sptes[iter->pos]; -out: - BUG_ON(!is_shadow_present_pte(*sptep)); - return sptep; + return iter->desc->sptes[iter->pos]; } /* @@ -1162,14 +1244,11 @@ out: */ static u64 *rmap_get_next(struct rmap_iterator *iter) { - u64 *sptep; - if (iter->desc) { if (iter->pos < PTE_LIST_EXT - 1) { ++iter->pos; - sptep = iter->desc->sptes[iter->pos]; - if (sptep) - goto out; + if (iter->desc->sptes[iter->pos]) + return iter->desc->sptes[iter->pos]; } iter->desc = iter->desc->more; @@ -1177,20 +1256,24 @@ static u64 *rmap_get_next(struct rmap_iterator *iter) if (iter->desc) { iter->pos = 0; /* desc->sptes[0] cannot be NULL */ - sptep = iter->desc->sptes[iter->pos]; - goto out; + return iter->desc->sptes[iter->pos]; } } return NULL; -out: - BUG_ON(!is_shadow_present_pte(*sptep)); - return sptep; } -#define for_each_rmap_spte(_rmap_head_, _iter_, _spte_) \ - for (_spte_ = rmap_get_first(_rmap_head_, _iter_); \ - _spte_; _spte_ = rmap_get_next(_iter_)) +#define __for_each_rmap_spte(_rmap_head_, _iter_, _sptep_) \ + for (_sptep_ = rmap_get_first(_rmap_head_, _iter_); \ + _sptep_; _sptep_ = rmap_get_next(_iter_)) + +#define for_each_rmap_spte(_rmap_head_, _iter_, _sptep_) \ + __for_each_rmap_spte(_rmap_head_, _iter_, _sptep_) \ + if (!WARN_ON_ONCE(!is_shadow_present_pte(*(_sptep_)))) \ + +#define for_each_rmap_spte_lockless(_rmap_head_, _iter_, _sptep_, _spte_) \ + __for_each_rmap_spte(_rmap_head_, _iter_, _sptep_) \ + if (is_shadow_present_pte(_spte_ = mmu_spte_get_lockless(sptep))) static void drop_spte(struct kvm *kvm, u64 *sptep) { @@ -1263,16 +1346,6 @@ static bool spte_clear_dirty(u64 *sptep) return mmu_spte_update(sptep, spte); } -static bool spte_wrprot_for_clear_dirty(u64 *sptep) -{ - bool was_writable = test_and_clear_bit(PT_WRITABLE_SHIFT, - (unsigned long *)sptep); - if (was_writable && !spte_ad_enabled(*sptep)) - kvm_set_pfn_dirty(spte_to_pfn(*sptep)); - - return was_writable; -} - /* * Gets the GFN ready for another round of dirty logging by clearing the * - D bit on ad-enabled SPTEs, and @@ -1286,24 +1359,17 @@ static bool __rmap_clear_dirty(struct kvm *kvm, struct kvm_rmap_head *rmap_head, struct rmap_iterator iter; bool flush = false; - for_each_rmap_spte(rmap_head, &iter, sptep) + for_each_rmap_spte(rmap_head, &iter, sptep) { if (spte_ad_need_write_protect(*sptep)) - flush |= spte_wrprot_for_clear_dirty(sptep); + flush |= test_and_clear_bit(PT_WRITABLE_SHIFT, + (unsigned long *)sptep); else flush |= spte_clear_dirty(sptep); + } return flush; } -/** - * kvm_mmu_write_protect_pt_masked - write protect selected PT level pages - * @kvm: kvm instance - * @slot: slot to protect - * @gfn_offset: start of the BITS_PER_LONG pages we care about - * @mask: indicates which pages we should protect - * - * Used when we do not need to care about huge page mappings. - */ static void kvm_mmu_write_protect_pt_masked(struct kvm *kvm, struct kvm_memory_slot *slot, gfn_t gfn_offset, unsigned long mask) @@ -1327,16 +1393,6 @@ static void kvm_mmu_write_protect_pt_masked(struct kvm *kvm, } } -/** - * kvm_mmu_clear_dirty_pt_masked - clear MMU D-bit for PT level pages, or write - * protect the page if the D-bit isn't supported. - * @kvm: kvm instance - * @slot: slot to clear D-bit - * @gfn_offset: start of the BITS_PER_LONG pages we care about - * @mask: indicates which pages we should clear D-bit - * - * Used for PML to re-log the dirty GPAs after userspace querying dirty_bitmap. - */ static void kvm_mmu_clear_dirty_pt_masked(struct kvm *kvm, struct kvm_memory_slot *slot, gfn_t gfn_offset, unsigned long mask) @@ -1360,24 +1416,16 @@ static void kvm_mmu_clear_dirty_pt_masked(struct kvm *kvm, } } -/** - * kvm_arch_mmu_enable_log_dirty_pt_masked - enable dirty logging for selected - * PT level pages. - * - * It calls kvm_mmu_write_protect_pt_masked to write protect selected pages to - * enable dirty logging for them. - * - * We need to care about huge page mappings: e.g. during dirty logging we may - * have such mappings. - */ void kvm_arch_mmu_enable_log_dirty_pt_masked(struct kvm *kvm, struct kvm_memory_slot *slot, gfn_t gfn_offset, unsigned long mask) { /* - * Huge pages are NOT write protected when we start dirty logging in - * initially-all-set mode; must write protect them here so that they - * are split to 4K on the first write. + * If the slot was assumed to be "initially all dirty", write-protect + * huge pages to ensure they are split to 4KiB on the first write (KVM + * dirty logs at 4KiB granularity). If eager page splitting is enabled, + * immediately try to split huge pages, e.g. so that vCPUs don't get + * saddled with the cost of splitting. * * The gfn_offset is guaranteed to be aligned to 64, but the base_gfn * of memslot has no such restriction, so the range can cross two large @@ -1399,16 +1447,25 @@ void kvm_arch_mmu_enable_log_dirty_pt_masked(struct kvm *kvm, PG_LEVEL_2M); } - /* Now handle 4K PTEs. */ - if (kvm_x86_ops.cpu_dirty_log_size) + /* + * (Re)Enable dirty logging for all 4KiB SPTEs that map the GFNs in + * mask. If PML is enabled and the GFN doesn't need to be write- + * protected for other reasons, e.g. shadow paging, clear the Dirty bit. + * Otherwise clear the Writable bit. + * + * Note that kvm_mmu_clear_dirty_pt_masked() is called whenever PML is + * enabled but it chooses between clearing the Dirty bit and Writeable + * bit based on the context. + */ + if (kvm->arch.cpu_dirty_log_size) kvm_mmu_clear_dirty_pt_masked(kvm, slot, gfn_offset, mask); else kvm_mmu_write_protect_pt_masked(kvm, slot, gfn_offset, mask); } -int kvm_cpu_dirty_log_size(void) +int kvm_cpu_dirty_log_size(struct kvm *kvm) { - return kvm_x86_ops.cpu_dirty_log_size; + return kvm->arch.cpu_dirty_log_size; } bool kvm_mmu_slot_gfn_write_protect(struct kvm *kvm, @@ -1441,54 +1498,10 @@ static bool kvm_vcpu_write_protect_gfn(struct kvm_vcpu *vcpu, u64 gfn) return kvm_mmu_slot_gfn_write_protect(vcpu->kvm, slot, gfn, PG_LEVEL_4K); } -static bool __kvm_zap_rmap(struct kvm *kvm, struct kvm_rmap_head *rmap_head, - const struct kvm_memory_slot *slot) -{ - return kvm_zap_all_rmap_sptes(kvm, rmap_head); -} - static bool kvm_zap_rmap(struct kvm *kvm, struct kvm_rmap_head *rmap_head, - struct kvm_memory_slot *slot, gfn_t gfn, int level, - pte_t unused) -{ - return __kvm_zap_rmap(kvm, rmap_head, slot); -} - -static bool kvm_set_pte_rmap(struct kvm *kvm, struct kvm_rmap_head *rmap_head, - struct kvm_memory_slot *slot, gfn_t gfn, int level, - pte_t pte) + const struct kvm_memory_slot *slot) { - u64 *sptep; - struct rmap_iterator iter; - bool need_flush = false; - u64 new_spte; - kvm_pfn_t new_pfn; - - WARN_ON_ONCE(pte_huge(pte)); - new_pfn = pte_pfn(pte); - -restart: - for_each_rmap_spte(rmap_head, &iter, sptep) { - need_flush = true; - - if (pte_write(pte)) { - kvm_zap_one_rmap_spte(kvm, rmap_head, sptep); - goto restart; - } else { - new_spte = kvm_mmu_changed_pte_notifier_make_spte( - *sptep, new_pfn); - - mmu_spte_clear_track_bits(kvm, sptep); - mmu_spte_set(sptep, new_spte); - } - } - - if (need_flush && kvm_available_flush_remote_tlbs_range()) { - kvm_flush_remote_tlbs_gfn(kvm, gfn, level); - return false; - } - - return need_flush; + return kvm_zap_all_rmap_sptes(kvm, rmap_head); } struct slot_rmap_walk_iterator { @@ -1539,9 +1552,9 @@ static bool slot_rmap_walk_okay(struct slot_rmap_walk_iterator *iterator) static void slot_rmap_walk_next(struct slot_rmap_walk_iterator *iterator) { while (++iterator->rmap <= iterator->end_rmap) { - iterator->gfn += (1UL << KVM_HPAGE_GFN_SHIFT(iterator->level)); + iterator->gfn += KVM_PAGES_PER_HPAGE(iterator->level); - if (iterator->rmap->val) + if (atomic_long_read(&iterator->rmap->val)) return; } @@ -1560,80 +1573,101 @@ static void slot_rmap_walk_next(struct slot_rmap_walk_iterator *iterator) slot_rmap_walk_okay(_iter_); \ slot_rmap_walk_next(_iter_)) -typedef bool (*rmap_handler_t)(struct kvm *kvm, struct kvm_rmap_head *rmap_head, - struct kvm_memory_slot *slot, gfn_t gfn, - int level, pte_t pte); +/* The return value indicates if tlb flush on all vcpus is needed. */ +typedef bool (*slot_rmaps_handler) (struct kvm *kvm, + struct kvm_rmap_head *rmap_head, + const struct kvm_memory_slot *slot); -static __always_inline bool kvm_handle_gfn_range(struct kvm *kvm, - struct kvm_gfn_range *range, - rmap_handler_t handler) +static __always_inline bool __walk_slot_rmaps(struct kvm *kvm, + const struct kvm_memory_slot *slot, + slot_rmaps_handler fn, + int start_level, int end_level, + gfn_t start_gfn, gfn_t end_gfn, + bool can_yield, bool flush_on_yield, + bool flush) { struct slot_rmap_walk_iterator iterator; - bool ret = false; - for_each_slot_rmap_range(range->slot, PG_LEVEL_4K, KVM_MAX_HUGEPAGE_LEVEL, - range->start, range->end - 1, &iterator) - ret |= handler(kvm, iterator.rmap, range->slot, iterator.gfn, - iterator.level, range->arg.pte); - - return ret; -} - -bool kvm_unmap_gfn_range(struct kvm *kvm, struct kvm_gfn_range *range) -{ - bool flush = false; + lockdep_assert_held_write(&kvm->mmu_lock); - if (kvm_memslots_have_rmaps(kvm)) - flush = kvm_handle_gfn_range(kvm, range, kvm_zap_rmap); + for_each_slot_rmap_range(slot, start_level, end_level, start_gfn, + end_gfn, &iterator) { + if (iterator.rmap) + flush |= fn(kvm, iterator.rmap, slot); - if (tdp_mmu_enabled) - flush = kvm_tdp_mmu_unmap_gfn_range(kvm, range, flush); + if (!can_yield) + continue; - if (kvm_x86_ops.set_apic_access_page_addr && - range->slot->id == APIC_ACCESS_PAGE_PRIVATE_MEMSLOT) - kvm_make_all_cpus_request(kvm, KVM_REQ_APIC_PAGE_RELOAD); + if (need_resched() || rwlock_needbreak(&kvm->mmu_lock)) { + if (flush && flush_on_yield) { + kvm_flush_remote_tlbs_range(kvm, start_gfn, + iterator.gfn - start_gfn + 1); + flush = false; + } + cond_resched_rwlock_write(&kvm->mmu_lock); + } + } return flush; } -bool kvm_set_spte_gfn(struct kvm *kvm, struct kvm_gfn_range *range) +static __always_inline bool walk_slot_rmaps(struct kvm *kvm, + const struct kvm_memory_slot *slot, + slot_rmaps_handler fn, + int start_level, int end_level, + bool flush_on_yield) { - bool flush = false; - - if (kvm_memslots_have_rmaps(kvm)) - flush = kvm_handle_gfn_range(kvm, range, kvm_set_pte_rmap); + return __walk_slot_rmaps(kvm, slot, fn, start_level, end_level, + slot->base_gfn, slot->base_gfn + slot->npages - 1, + true, flush_on_yield, false); +} - if (tdp_mmu_enabled) - flush |= kvm_tdp_mmu_set_spte_gfn(kvm, range); +static __always_inline bool walk_slot_rmaps_4k(struct kvm *kvm, + const struct kvm_memory_slot *slot, + slot_rmaps_handler fn, + bool flush_on_yield) +{ + return walk_slot_rmaps(kvm, slot, fn, PG_LEVEL_4K, PG_LEVEL_4K, flush_on_yield); +} - return flush; +static bool __kvm_rmap_zap_gfn_range(struct kvm *kvm, + const struct kvm_memory_slot *slot, + gfn_t start, gfn_t end, bool can_yield, + bool flush) +{ + return __walk_slot_rmaps(kvm, slot, kvm_zap_rmap, + PG_LEVEL_4K, KVM_MAX_HUGEPAGE_LEVEL, + start, end - 1, can_yield, true, flush); } -static bool kvm_age_rmap(struct kvm *kvm, struct kvm_rmap_head *rmap_head, - struct kvm_memory_slot *slot, gfn_t gfn, int level, - pte_t unused) +bool kvm_unmap_gfn_range(struct kvm *kvm, struct kvm_gfn_range *range) { - u64 *sptep; - struct rmap_iterator iter; - int young = 0; + bool flush = false; - for_each_rmap_spte(rmap_head, &iter, sptep) - young |= mmu_spte_age(sptep); + /* + * To prevent races with vCPUs faulting in a gfn using stale data, + * zapping a gfn range must be protected by mmu_invalidate_in_progress + * (and mmu_invalidate_seq). The only exception is memslot deletion; + * in that case, SRCU synchronization ensures that SPTEs are zapped + * after all vCPUs have unlocked SRCU, guaranteeing that vCPUs see the + * invalid slot. + */ + lockdep_assert_once(kvm->mmu_invalidate_in_progress || + lockdep_is_held(&kvm->slots_lock)); - return young; -} + if (kvm_memslots_have_rmaps(kvm)) + flush = __kvm_rmap_zap_gfn_range(kvm, range->slot, + range->start, range->end, + range->may_block, flush); -static bool kvm_test_age_rmap(struct kvm *kvm, struct kvm_rmap_head *rmap_head, - struct kvm_memory_slot *slot, gfn_t gfn, - int level, pte_t unused) -{ - u64 *sptep; - struct rmap_iterator iter; + if (tdp_mmu_enabled) + flush = kvm_tdp_mmu_unmap_gfn_range(kvm, range, flush); - for_each_rmap_spte(rmap_head, &iter, sptep) - if (is_accessed_spte(*sptep)) - return true; - return false; + if (kvm_x86_ops.set_apic_access_page_addr && + range->slot->id == APIC_ACCESS_PAGE_PRIVATE_MEMSLOT) + kvm_make_all_cpus_request(kvm, KVM_REQ_APIC_PAGE_RELOAD); + + return flush; } #define RMAP_RECYCLE_THRESHOLD 1000 @@ -1652,7 +1686,7 @@ static void __rmap_add(struct kvm *kvm, kvm_update_page_stats(kvm, sp->role.level, 1); rmap_head = gfn_to_rmap(gfn, sp->role.level, slot); - rmap_count = pte_list_add(cache, spte, rmap_head); + rmap_count = pte_list_add(kvm, cache, spte, rmap_head); if (rmap_count > kvm->stat.max_mmu_rmap_size) kvm->stat.max_mmu_rmap_size = rmap_count; @@ -1670,15 +1704,68 @@ static void rmap_add(struct kvm_vcpu *vcpu, const struct kvm_memory_slot *slot, __rmap_add(vcpu->kvm, cache, slot, spte, gfn, access); } -bool kvm_age_gfn(struct kvm *kvm, struct kvm_gfn_range *range) +static bool kvm_rmap_age_gfn_range(struct kvm *kvm, + struct kvm_gfn_range *range, + bool test_only) { + struct kvm_rmap_head *rmap_head; + struct rmap_iterator iter; + unsigned long rmap_val; bool young = false; + u64 *sptep; + gfn_t gfn; + int level; + u64 spte; - if (kvm_memslots_have_rmaps(kvm)) - young = kvm_handle_gfn_range(kvm, range, kvm_age_rmap); + for (level = PG_LEVEL_4K; level <= KVM_MAX_HUGEPAGE_LEVEL; level++) { + for (gfn = range->start; gfn < range->end; + gfn += KVM_PAGES_PER_HPAGE(level)) { + rmap_head = gfn_to_rmap(gfn, level, range->slot); + rmap_val = kvm_rmap_lock_readonly(rmap_head); + + for_each_rmap_spte_lockless(rmap_head, &iter, sptep, spte) { + if (!is_accessed_spte(spte)) + continue; + + if (test_only) { + kvm_rmap_unlock_readonly(rmap_head, rmap_val); + return true; + } + + if (spte_ad_enabled(spte)) + clear_bit((ffs(shadow_accessed_mask) - 1), + (unsigned long *)sptep); + else + /* + * If the following cmpxchg fails, the + * spte is being concurrently modified + * and should most likely stay young. + */ + cmpxchg64(sptep, spte, + mark_spte_for_access_track(spte)); + young = true; + } + + kvm_rmap_unlock_readonly(rmap_head, rmap_val); + } + } + return young; +} + +static bool kvm_may_have_shadow_mmu_sptes(struct kvm *kvm) +{ + return !tdp_mmu_enabled || READ_ONCE(kvm->arch.indirect_shadow_pages); +} + +bool kvm_age_gfn(struct kvm *kvm, struct kvm_gfn_range *range) +{ + bool young = false; if (tdp_mmu_enabled) - young |= kvm_tdp_mmu_age_gfn_range(kvm, range); + young = kvm_tdp_mmu_age_gfn_range(kvm, range); + + if (kvm_may_have_shadow_mmu_sptes(kvm)) + young |= kvm_rmap_age_gfn_range(kvm, range, false); return young; } @@ -1687,11 +1774,14 @@ bool kvm_test_age_gfn(struct kvm *kvm, struct kvm_gfn_range *range) { bool young = false; - if (kvm_memslots_have_rmaps(kvm)) - young = kvm_handle_gfn_range(kvm, range, kvm_test_age_rmap); - if (tdp_mmu_enabled) - young |= kvm_tdp_mmu_test_age_gfn(kvm, range); + young = kvm_tdp_mmu_test_age_gfn(kvm, range); + + if (young) + return young; + + if (kvm_may_have_shadow_mmu_sptes(kvm)) + young |= kvm_rmap_age_gfn_range(kvm, range, true); return young; } @@ -1710,27 +1800,15 @@ static void kvm_mmu_check_sptes_at_free(struct kvm_mmu_page *sp) #endif } -/* - * This value is the sum of all of the kvm instances's - * kvm->arch.n_used_mmu_pages values. We need a global, - * aggregate version in order to make the slab shrinker - * faster - */ -static inline void kvm_mod_used_mmu_pages(struct kvm *kvm, long nr) -{ - kvm->arch.n_used_mmu_pages += nr; - percpu_counter_add(&kvm_total_used_mmu_pages, nr); -} - static void kvm_account_mmu_page(struct kvm *kvm, struct kvm_mmu_page *sp) { - kvm_mod_used_mmu_pages(kvm, +1); + kvm->arch.n_used_mmu_pages++; kvm_account_pgtable_pages((void *)sp->spt, +1); } static void kvm_unaccount_mmu_page(struct kvm *kvm, struct kvm_mmu_page *sp) { - kvm_mod_used_mmu_pages(kvm, -1); + kvm->arch.n_used_mmu_pages--; kvm_account_pgtable_pages((void *)sp->spt, -1); } @@ -1741,8 +1819,7 @@ static void kvm_mmu_free_shadow_page(struct kvm_mmu_page *sp) hlist_del(&sp->hash_link); list_del(&sp->link); free_page((unsigned long)sp->spt); - if (!sp->role.direct) - free_page((unsigned long)sp->shadowed_translation); + free_page((unsigned long)sp->shadowed_translation); kmem_cache_free(mmu_page_header_cache, sp); } @@ -1751,13 +1828,14 @@ static unsigned kvm_page_table_hashfn(gfn_t gfn) return hash_64(gfn, KVM_MMU_HASH_SHIFT); } -static void mmu_page_add_parent_pte(struct kvm_mmu_memory_cache *cache, +static void mmu_page_add_parent_pte(struct kvm *kvm, + struct kvm_mmu_memory_cache *cache, struct kvm_mmu_page *sp, u64 *parent_pte) { if (!parent_pte) return; - pte_list_add(cache, parent_pte, &sp->parent_ptes); + pte_list_add(kvm, cache, parent_pte, &sp->parent_ptes); } static void mmu_page_remove_parent_pte(struct kvm *kvm, struct kvm_mmu_page *sp, @@ -1950,7 +2028,8 @@ static bool kvm_sync_page_check(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp) static int kvm_sync_spte(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp, int i) { - if (!sp->spt[i]) + /* sp->spt[i] has initial value of shadow page table allocation */ + if (sp->spt[i] == SHADOW_NONPRESENT_VALUE) return 0; return vcpu->arch.mmu->sync_spte(vcpu, sp, i); @@ -2243,7 +2322,7 @@ static struct kvm_mmu_page *kvm_mmu_alloc_shadow_page(struct kvm *kvm, sp = kvm_mmu_memory_cache_alloc(caches->page_header_cache); sp->spt = kvm_mmu_memory_cache_alloc(caches->shadow_page_cache); - if (!role.direct) + if (!role.direct && role.level <= KVM_MAX_HUGEPAGE_LEVEL) sp->shadowed_translation = kvm_mmu_memory_cache_alloc(caches->shadowed_info_cache); set_page_private(virt_to_page(sp->spt), (unsigned long)sp); @@ -2446,7 +2525,7 @@ static void __link_shadow_page(struct kvm *kvm, mmu_spte_set(sptep, spte); - mmu_page_add_parent_pte(cache, sp, sptep); + mmu_page_add_parent_pte(kvm, cache, sp, sptep); /* * The non-direct sub-pagetable must be updated before linking. For @@ -2510,11 +2589,12 @@ static int mmu_page_zap_pte(struct kvm *kvm, struct kvm_mmu_page *sp, * avoids retaining a large number of stale nested SPs. */ if (tdp_enabled && invalid_list && - child->role.guest_mode && !child->parent_ptes.val) + child->role.guest_mode && + !atomic_long_read(&child->parent_ptes.val)) return kvm_mmu_prepare_zap_page(kvm, child, invalid_list); } - } else if (is_mmio_spte(pte)) { + } else if (is_mmio_spte(kvm, pte)) { mmu_spte_clear_no_track(spte); } return 0; @@ -2754,36 +2834,49 @@ void kvm_mmu_change_mmu_pages(struct kvm *kvm, unsigned long goal_nr_mmu_pages) write_unlock(&kvm->mmu_lock); } -int kvm_mmu_unprotect_page(struct kvm *kvm, gfn_t gfn) +bool __kvm_mmu_unprotect_gfn_and_retry(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa, + bool always_retry) { - struct kvm_mmu_page *sp; + struct kvm *kvm = vcpu->kvm; LIST_HEAD(invalid_list); - int r; + struct kvm_mmu_page *sp; + gpa_t gpa = cr2_or_gpa; + bool r = false; + + /* + * Bail early if there aren't any write-protected shadow pages to avoid + * unnecessarily taking mmu_lock lock, e.g. if the gfn is write-tracked + * by a third party. Reading indirect_shadow_pages without holding + * mmu_lock is safe, as this is purely an optimization, i.e. a false + * positive is benign, and a false negative will simply result in KVM + * skipping the unprotect+retry path, which is also an optimization. + */ + if (!READ_ONCE(kvm->arch.indirect_shadow_pages)) + goto out; + + if (!vcpu->arch.mmu->root_role.direct) { + gpa = kvm_mmu_gva_to_gpa_write(vcpu, cr2_or_gpa, NULL); + if (gpa == INVALID_GPA) + goto out; + } - r = 0; write_lock(&kvm->mmu_lock); - for_each_gfn_valid_sp_with_gptes(kvm, sp, gfn) { - r = 1; + for_each_gfn_valid_sp_with_gptes(kvm, sp, gpa_to_gfn(gpa)) kvm_mmu_prepare_zap_page(kvm, sp, &invalid_list); - } + + /* + * Snapshot the result before zapping, as zapping will remove all list + * entries, i.e. checking the list later would yield a false negative. + */ + r = !list_empty(&invalid_list); kvm_mmu_commit_zap_page(kvm, &invalid_list); write_unlock(&kvm->mmu_lock); - return r; -} - -static int kvm_mmu_unprotect_page_virt(struct kvm_vcpu *vcpu, gva_t gva) -{ - gpa_t gpa; - int r; - - if (vcpu->arch.mmu->root_role.direct) - return 0; - - gpa = kvm_mmu_gva_to_gpa_read(vcpu, gva, NULL); - - r = kvm_mmu_unprotect_page(vcpu->kvm, gpa >> PAGE_SHIFT); - +out: + if (r || always_retry) { + vcpu->arch.last_retry_eip = kvm_rip_read(vcpu); + vcpu->arch.last_retry_addr = cr2_or_gpa; + } return r; } @@ -2803,7 +2896,7 @@ static void kvm_unsync_page(struct kvm *kvm, struct kvm_mmu_page *sp) * be write-protected. */ int mmu_try_to_unsync_pages(struct kvm *kvm, const struct kvm_memory_slot *slot, - gfn_t gfn, bool can_unsync, bool prefetch) + gfn_t gfn, bool synchronizing, bool prefetch) { struct kvm_mmu_page *sp; bool locked = false; @@ -2818,12 +2911,12 @@ int mmu_try_to_unsync_pages(struct kvm *kvm, const struct kvm_memory_slot *slot, /* * The page is not write-tracked, mark existing shadow pages unsync - * unless KVM is synchronizing an unsync SP (can_unsync = false). In - * that case, KVM must complete emulation of the guest TLB flush before - * allowing shadow pages to become unsync (writable by the guest). + * unless KVM is synchronizing an unsync SP. In that case, KVM must + * complete emulation of the guest TLB flush before allowing shadow + * pages to become unsync (writable by the guest). */ for_each_gfn_valid_sp_with_gptes(kvm, sp, gfn) { - if (!can_unsync) + if (synchronizing) return -EPERM; if (sp->unsync) @@ -2927,6 +3020,10 @@ static int mmu_set_spte(struct kvm_vcpu *vcpu, struct kvm_memory_slot *slot, } if (is_shadow_present_pte(*sptep)) { + if (prefetch && is_last_spte(*sptep, level) && + pfn == spte_to_pfn(*sptep)) + return RET_PF_SPURIOUS; + /* * If we overwrite a PTE page pointer with a 2MB PMD, unlink * the parent of the now unreachable PTE. @@ -2938,7 +3035,7 @@ static int mmu_set_spte(struct kvm_vcpu *vcpu, struct kvm_memory_slot *slot, child = spte_to_child_sp(pte); drop_parent_pte(vcpu->kvm, child, sptep); flush = true; - } else if (pfn != spte_to_pfn(*sptep)) { + } else if (WARN_ON_ONCE(pfn != spte_to_pfn(*sptep))) { drop_spte(vcpu->kvm, sptep); flush = true; } else @@ -2946,7 +3043,7 @@ static int mmu_set_spte(struct kvm_vcpu *vcpu, struct kvm_memory_slot *slot, } wrprot = make_spte(vcpu, sp, slot, pte_access, gfn, pfn, *sptep, prefetch, - true, host_writable, &spte); + false, host_writable, &spte); if (*sptep == spte) { ret = RET_PF_SPURIOUS; @@ -2955,10 +3052,8 @@ static int mmu_set_spte(struct kvm_vcpu *vcpu, struct kvm_memory_slot *slot, trace_kvm_mmu_set_spte(level, gfn, sptep); } - if (wrprot) { - if (write_fault) - ret = RET_PF_EMULATE; - } + if (wrprot && write_fault) + ret = RET_PF_WRITE_PROTECTED; if (flush) kvm_flush_remote_tlbs_gfn(vcpu->kvm, gfn, level); @@ -2974,32 +3069,51 @@ static int mmu_set_spte(struct kvm_vcpu *vcpu, struct kvm_memory_slot *slot, return ret; } -static int direct_pte_prefetch_many(struct kvm_vcpu *vcpu, - struct kvm_mmu_page *sp, - u64 *start, u64 *end) +static bool kvm_mmu_prefetch_sptes(struct kvm_vcpu *vcpu, gfn_t gfn, u64 *sptep, + int nr_pages, unsigned int access) { struct page *pages[PTE_PREFETCH_NUM]; struct kvm_memory_slot *slot; - unsigned int access = sp->role.access; - int i, ret; - gfn_t gfn; + int i; + + if (WARN_ON_ONCE(nr_pages > PTE_PREFETCH_NUM)) + return false; - gfn = kvm_mmu_page_get_gfn(sp, spte_index(start)); slot = gfn_to_memslot_dirty_bitmap(vcpu, gfn, access & ACC_WRITE_MASK); if (!slot) - return -1; + return false; - ret = gfn_to_page_many_atomic(slot, gfn, pages, end - start); - if (ret <= 0) - return -1; + nr_pages = kvm_prefetch_pages(slot, gfn, pages, nr_pages); + if (nr_pages <= 0) + return false; - for (i = 0; i < ret; i++, gfn++, start++) { - mmu_set_spte(vcpu, slot, start, access, gfn, + for (i = 0; i < nr_pages; i++, gfn++, sptep++) { + mmu_set_spte(vcpu, slot, sptep, access, gfn, page_to_pfn(pages[i]), NULL); - put_page(pages[i]); + + /* + * KVM always prefetches writable pages from the primary MMU, + * and KVM can make its SPTE writable in the fast page handler, + * without notifying the primary MMU. Mark pages/folios dirty + * now to ensure file data is written back if it ends up being + * written by the guest. Because KVM's prefetching GUPs + * writable PTEs, the probability of unnecessary writeback is + * extremely low. + */ + kvm_release_page_dirty(pages[i]); } - return 0; + return true; +} + +static bool direct_pte_prefetch_many(struct kvm_vcpu *vcpu, + struct kvm_mmu_page *sp, + u64 *start, u64 *end) +{ + gfn_t gfn = kvm_mmu_page_get_gfn(sp, spte_index(start)); + unsigned int access = sp->role.access; + + return kvm_mmu_prefetch_sptes(vcpu, gfn, start, end - start, access); } static void __direct_pte_prefetch(struct kvm_vcpu *vcpu, @@ -3017,8 +3131,9 @@ static void __direct_pte_prefetch(struct kvm_vcpu *vcpu, if (is_shadow_present_pte(*spte) || spte == sptep) { if (!start) continue; - if (direct_pte_prefetch_many(vcpu, sp, start, spte) < 0) + if (!direct_pte_prefetch_many(vcpu, sp, start, spte)) return; + start = NULL; } else if (!start) start = spte; @@ -3168,13 +3283,12 @@ static int __kvm_mmu_max_mapping_level(struct kvm *kvm, } int kvm_mmu_max_mapping_level(struct kvm *kvm, - const struct kvm_memory_slot *slot, gfn_t gfn, - int max_level) + const struct kvm_memory_slot *slot, gfn_t gfn) { bool is_private = kvm_slot_can_be_private(slot) && kvm_mem_is_private(kvm, gfn); - return __kvm_mmu_max_mapping_level(kvm, slot, gfn, max_level, is_private); + return __kvm_mmu_max_mapping_level(kvm, slot, gfn, PG_LEVEL_NUM, is_private); } void kvm_mmu_hugepage_adjust(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault) @@ -3314,9 +3428,18 @@ static int kvm_handle_noslot_fault(struct kvm_vcpu *vcpu, { gva_t gva = fault->is_tdp ? 0 : fault->addr; + if (fault->is_private) { + kvm_mmu_prepare_memory_fault_exit(vcpu, fault); + return -EFAULT; + } + vcpu_cache_mmio_info(vcpu, gva, fault->gfn, access & shadow_mmio_access_mask); + fault->slot = NULL; + fault->pfn = KVM_PFN_NOSLOT; + fault->map_writable = false; + /* * If MMIO caching is disabled, emulate immediately without * touching the shadow page tables as attempting to install an @@ -3338,7 +3461,7 @@ static int kvm_handle_noslot_fault(struct kvm_vcpu *vcpu, return RET_PF_CONTINUE; } -static bool page_fault_can_be_fast(struct kvm_page_fault *fault) +static bool page_fault_can_be_fast(struct kvm *kvm, struct kvm_page_fault *fault) { /* * Page faults with reserved bits set, i.e. faults on MMIO SPTEs, only @@ -3350,6 +3473,26 @@ static bool page_fault_can_be_fast(struct kvm_page_fault *fault) return false; /* + * For hardware-protected VMs, certain conditions like attempting to + * perform a write to a page which is not in the state that the guest + * expects it to be in can result in a nested/extended #PF. In this + * case, the below code might misconstrue this situation as being the + * result of a write-protected access, and treat it as a spurious case + * rather than taking any action to satisfy the real source of the #PF + * such as generating a KVM_EXIT_MEMORY_FAULT. This can lead to the + * guest spinning on a #PF indefinitely, so don't attempt the fast path + * in this case. + * + * Note that the kvm_mem_is_private() check might race with an + * attribute update, but this will either result in the guest spinning + * on RET_PF_SPURIOUS until the update completes, or an actual spurious + * case might go down the slow path. Either case will resolve itself. + */ + if (kvm->arch.has_private_mem && + fault->is_private != kvm_mem_is_private(kvm, fault->gfn)) + return false; + + /* * #PF can be fast if: * * 1. The shadow page table entry is not present and A/D bits are @@ -3365,7 +3508,7 @@ static bool page_fault_can_be_fast(struct kvm_page_fault *fault) * by setting the Writable bit, which can be done out of mmu_lock. */ if (!fault->present) - return !kvm_ad_enabled(); + return !kvm_ad_enabled; /* * Note, instruction fetches and writes are mutually exclusive, ignore @@ -3392,7 +3535,7 @@ static bool fast_pf_fix_direct_spte(struct kvm_vcpu *vcpu, * harm. This also avoids the TLB flush needed after setting dirty bit * so non-PML cases won't be impacted. * - * Compare with set_spte where instead shadow_dirty_mask is set. + * Compare with make_spte() where instead shadow_dirty_mask is set. */ if (!try_cmpxchg64(sptep, &old_spte, new_spte)) return false; @@ -3403,18 +3546,6 @@ static bool fast_pf_fix_direct_spte(struct kvm_vcpu *vcpu, return true; } -static bool is_access_allowed(struct kvm_page_fault *fault, u64 spte) -{ - if (fault->exec) - return is_executable_pte(spte); - - if (fault->write) - return is_writable_pte(spte); - - /* Fault was on Read access */ - return spte & PT_PRESENT_MASK; -} - /* * Returns the last level spte pointer of the shadow page walk for the given * gpa, and sets *spte to the spte value. This spte may be non-preset. If no @@ -3449,7 +3580,7 @@ static int fast_page_fault(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault) u64 *sptep; uint retry_count = 0; - if (!page_fault_can_be_fast(fault)) + if (!page_fault_can_be_fast(vcpu->kvm, fault)) return ret; walk_shadow_page_lockless_begin(vcpu); @@ -3458,7 +3589,7 @@ static int fast_page_fault(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault) u64 new_spte; if (tdp_mmu_enabled) - sptep = kvm_tdp_mmu_fast_pf_get_last_sptep(vcpu, fault->addr, &spte); + sptep = kvm_tdp_mmu_fast_pf_get_last_sptep(vcpu, fault->gfn, &spte); else sptep = fast_pf_get_last_sptep(vcpu, fault->addr, &spte); @@ -3468,7 +3599,7 @@ static int fast_page_fault(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault) * available as the vCPU holds a reference to its root(s). */ if (WARN_ON_ONCE(!sptep)) - spte = REMOVED_SPTE; + spte = FROZEN_SPTE; if (!is_shadow_present_pte(spte)) break; @@ -3500,8 +3631,9 @@ static int fast_page_fault(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault) * uses A/D bits for non-nested MMUs. Thus, if A/D bits are * enabled, the SPTE can't be an access-tracked SPTE. */ - if (unlikely(!kvm_ad_enabled()) && is_access_track_spte(spte)) - new_spte = restore_acc_track_spte(new_spte); + if (unlikely(!kvm_ad_enabled) && is_access_track_spte(spte)) + new_spte = restore_acc_track_spte(new_spte) | + shadow_accessed_mask; /* * To keep things simple, only SPTEs that are MMU-writable can @@ -3706,8 +3838,13 @@ static int mmu_alloc_direct_roots(struct kvm_vcpu *vcpu) unsigned i; int r; - if (tdp_mmu_enabled) - return kvm_tdp_mmu_alloc_root(vcpu); + if (tdp_mmu_enabled) { + if (kvm_has_mirrored_tdp(vcpu->kvm) && + !VALID_PAGE(mmu->mirror_root_hpa)) + kvm_tdp_mmu_alloc_root(vcpu, true); + kvm_tdp_mmu_alloc_root(vcpu, false); + return 0; + } write_lock(&vcpu->kvm->mmu_lock); r = make_mmu_pages_available(vcpu); @@ -4134,23 +4271,31 @@ static int get_walk(struct kvm_vcpu *vcpu, u64 addr, u64 *sptes, int *root_level return leaf; } -/* return true if reserved bit(s) are detected on a valid, non-MMIO SPTE. */ -static bool get_mmio_spte(struct kvm_vcpu *vcpu, u64 addr, u64 *sptep) +static int get_sptes_lockless(struct kvm_vcpu *vcpu, u64 addr, u64 *sptes, + int *root_level) { - u64 sptes[PT64_ROOT_MAX_LEVEL + 1]; - struct rsvd_bits_validate *rsvd_check; - int root, leaf, level; - bool reserved = false; + int leaf; walk_shadow_page_lockless_begin(vcpu); if (is_tdp_mmu_active(vcpu)) - leaf = kvm_tdp_mmu_get_walk(vcpu, addr, sptes, &root); + leaf = kvm_tdp_mmu_get_walk(vcpu, addr, sptes, root_level); else - leaf = get_walk(vcpu, addr, sptes, &root); + leaf = get_walk(vcpu, addr, sptes, root_level); walk_shadow_page_lockless_end(vcpu); + return leaf; +} +/* return true if reserved bit(s) are detected on a valid, non-MMIO SPTE. */ +static bool get_mmio_spte(struct kvm_vcpu *vcpu, u64 addr, u64 *sptep) +{ + u64 sptes[PT64_ROOT_MAX_LEVEL + 1]; + struct rsvd_bits_validate *rsvd_check; + int root, leaf, level; + bool reserved = false; + + leaf = get_sptes_lockless(vcpu, addr, sptes, &root); if (unlikely(leaf < 0)) { *sptep = 0ull; return reserved; @@ -4196,7 +4341,7 @@ static int handle_mmio_page_fault(struct kvm_vcpu *vcpu, u64 addr, bool direct) if (WARN_ON_ONCE(reserved)) return -EINVAL; - if (is_mmio_spte(spte)) { + if (is_mmio_spte(vcpu->kvm, spte)) { gfn_t gfn = get_mmio_spte_gfn(spte); unsigned int access = get_mmio_spte_access(spte); @@ -4259,24 +4404,28 @@ static u32 alloc_apf_token(struct kvm_vcpu *vcpu) return (vcpu->arch.apf.id++ << 12) | vcpu->vcpu_id; } -static bool kvm_arch_setup_async_pf(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa, - gfn_t gfn) +static bool kvm_arch_setup_async_pf(struct kvm_vcpu *vcpu, + struct kvm_page_fault *fault) { struct kvm_arch_async_pf arch; arch.token = alloc_apf_token(vcpu); - arch.gfn = gfn; + arch.gfn = fault->gfn; + arch.error_code = fault->error_code; arch.direct_map = vcpu->arch.mmu->root_role.direct; arch.cr3 = kvm_mmu_get_guest_pgd(vcpu, vcpu->arch.mmu); - return kvm_setup_async_pf(vcpu, cr2_or_gpa, - kvm_vcpu_gfn_to_hva(vcpu, gfn), &arch); + return kvm_setup_async_pf(vcpu, fault->addr, + kvm_vcpu_gfn_to_hva(vcpu, fault->gfn), &arch); } void kvm_arch_async_page_ready(struct kvm_vcpu *vcpu, struct kvm_async_pf *work) { int r; + if (WARN_ON_ONCE(work->arch.error_code & PFERR_PRIVATE_ACCESS)) + return; + if ((vcpu->arch.mmu->root_role.direct != work->arch.direct_map) || work->wakeup_all) return; @@ -4289,7 +4438,16 @@ void kvm_arch_async_page_ready(struct kvm_vcpu *vcpu, struct kvm_async_pf *work) work->arch.cr3 != kvm_mmu_get_guest_pgd(vcpu, vcpu->arch.mmu)) return; - kvm_mmu_do_page_fault(vcpu, work->cr2_or_gpa, 0, true, NULL); + r = kvm_mmu_do_page_fault(vcpu, work->cr2_or_gpa, work->arch.error_code, + true, NULL, NULL); + + /* + * Account fixed page faults, otherwise they'll never be counted, but + * ignore stats for all other return times. Page-ready "faults" aren't + * truly spurious and never trigger emulation + */ + if (r == RET_PF_FIXED) + vcpu->stat.pf_fixed++; } static inline u8 kvm_max_level_for_order(int order) @@ -4309,16 +4467,34 @@ static inline u8 kvm_max_level_for_order(int order) return PG_LEVEL_4K; } -static void kvm_mmu_prepare_memory_fault_exit(struct kvm_vcpu *vcpu, - struct kvm_page_fault *fault) +static u8 kvm_max_private_mapping_level(struct kvm *kvm, kvm_pfn_t pfn, + u8 max_level, int gmem_order) { - kvm_prepare_memory_fault_exit(vcpu, fault->gfn << PAGE_SHIFT, - PAGE_SIZE, fault->write, fault->exec, - fault->is_private); + u8 req_max_level; + + if (max_level == PG_LEVEL_4K) + return PG_LEVEL_4K; + + max_level = min(kvm_max_level_for_order(gmem_order), max_level); + if (max_level == PG_LEVEL_4K) + return PG_LEVEL_4K; + + req_max_level = kvm_x86_call(private_max_mapping_level)(kvm, pfn); + if (req_max_level) + max_level = min(max_level, req_max_level); + + return max_level; +} + +static void kvm_mmu_finish_page_fault(struct kvm_vcpu *vcpu, + struct kvm_page_fault *fault, int r) +{ + kvm_release_faultin_page(vcpu->kvm, fault->refcounted_page, + r == RET_PF_RETRY, fault->map_writable); } -static int kvm_faultin_pfn_private(struct kvm_vcpu *vcpu, - struct kvm_page_fault *fault) +static int kvm_mmu_faultin_pfn_private(struct kvm_vcpu *vcpu, + struct kvm_page_fault *fault) { int max_order, r; @@ -4328,65 +4504,39 @@ static int kvm_faultin_pfn_private(struct kvm_vcpu *vcpu, } r = kvm_gmem_get_pfn(vcpu->kvm, fault->slot, fault->gfn, &fault->pfn, - &max_order); + &fault->refcounted_page, &max_order); if (r) { kvm_mmu_prepare_memory_fault_exit(vcpu, fault); return r; } - fault->max_level = min(kvm_max_level_for_order(max_order), - fault->max_level); fault->map_writable = !(fault->slot->flags & KVM_MEM_READONLY); + fault->max_level = kvm_max_private_mapping_level(vcpu->kvm, fault->pfn, + fault->max_level, max_order); return RET_PF_CONTINUE; } -static int __kvm_faultin_pfn(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault) +static int __kvm_mmu_faultin_pfn(struct kvm_vcpu *vcpu, + struct kvm_page_fault *fault) { - struct kvm_memory_slot *slot = fault->slot; - bool async; - - /* - * Retry the page fault if the gfn hit a memslot that is being deleted - * or moved. This ensures any existing SPTEs for the old memslot will - * be zapped before KVM inserts a new MMIO SPTE for the gfn. - */ - if (slot && (slot->flags & KVM_MEMSLOT_INVALID)) - return RET_PF_RETRY; - - if (!kvm_is_visible_memslot(slot)) { - /* Don't expose private memslots to L2. */ - if (is_guest_mode(vcpu)) { - fault->slot = NULL; - fault->pfn = KVM_PFN_NOSLOT; - fault->map_writable = false; - return RET_PF_CONTINUE; - } - /* - * If the APIC access page exists but is disabled, go directly - * to emulation without caching the MMIO access or creating a - * MMIO SPTE. That way the cache doesn't need to be purged - * when the AVIC is re-enabled. - */ - if (slot && slot->id == APIC_ACCESS_PAGE_PRIVATE_MEMSLOT && - !kvm_apicv_activated(vcpu->kvm)) - return RET_PF_EMULATE; - } - - if (fault->is_private != kvm_mem_is_private(vcpu->kvm, fault->gfn)) { - kvm_mmu_prepare_memory_fault_exit(vcpu, fault); - return -EFAULT; - } + unsigned int foll = fault->write ? FOLL_WRITE : 0; if (fault->is_private) - return kvm_faultin_pfn_private(vcpu, fault); + return kvm_mmu_faultin_pfn_private(vcpu, fault); + + foll |= FOLL_NOWAIT; + fault->pfn = __kvm_faultin_pfn(fault->slot, fault->gfn, foll, + &fault->map_writable, &fault->refcounted_page); - async = false; - fault->pfn = __gfn_to_pfn_memslot(slot, fault->gfn, false, false, &async, - fault->write, &fault->map_writable, - &fault->hva); - if (!async) - return RET_PF_CONTINUE; /* *pfn has correct page already */ + /* + * If resolving the page failed because I/O is needed to fault-in the + * page, then either set up an asynchronous #PF to do the I/O, or if + * doing an async #PF isn't possible, retry with I/O allowed. All + * other failures are terminal, i.e. retrying won't help. + */ + if (fault->pfn != KVM_PFN_ERR_NEEDS_IO) + return RET_PF_CONTINUE; if (!fault->prefetch && kvm_can_do_async_pf(vcpu)) { trace_kvm_try_async_get_page(fault->addr, fault->gfn); @@ -4394,7 +4544,7 @@ static int __kvm_faultin_pfn(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault trace_kvm_async_pf_repeated_fault(fault->addr, fault->gfn); kvm_make_request(KVM_REQ_APF_HALT, vcpu); return RET_PF_RETRY; - } else if (kvm_arch_setup_async_pf(vcpu, fault->addr, fault->gfn)) { + } else if (kvm_arch_setup_async_pf(vcpu, fault)) { return RET_PF_RETRY; } } @@ -4404,21 +4554,79 @@ static int __kvm_faultin_pfn(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault * to wait for IO. Note, gup always bails if it is unable to quickly * get a page and a fatal signal, i.e. SIGKILL, is pending. */ - fault->pfn = __gfn_to_pfn_memslot(slot, fault->gfn, false, true, NULL, - fault->write, &fault->map_writable, - &fault->hva); + foll |= FOLL_INTERRUPTIBLE; + foll &= ~FOLL_NOWAIT; + fault->pfn = __kvm_faultin_pfn(fault->slot, fault->gfn, foll, + &fault->map_writable, &fault->refcounted_page); + return RET_PF_CONTINUE; } -static int kvm_faultin_pfn(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault, - unsigned int access) +static int kvm_mmu_faultin_pfn(struct kvm_vcpu *vcpu, + struct kvm_page_fault *fault, unsigned int access) { + struct kvm_memory_slot *slot = fault->slot; + struct kvm *kvm = vcpu->kvm; int ret; + if (KVM_BUG_ON(kvm_is_gfn_alias(kvm, fault->gfn), kvm)) + return -EFAULT; + + /* + * Note that the mmu_invalidate_seq also serves to detect a concurrent + * change in attributes. is_page_fault_stale() will detect an + * invalidation relate to fault->fn and resume the guest without + * installing a mapping in the page tables. + */ fault->mmu_seq = vcpu->kvm->mmu_invalidate_seq; smp_rmb(); /* + * Now that we have a snapshot of mmu_invalidate_seq we can check for a + * private vs. shared mismatch. + */ + if (fault->is_private != kvm_mem_is_private(kvm, fault->gfn)) { + kvm_mmu_prepare_memory_fault_exit(vcpu, fault); + return -EFAULT; + } + + if (unlikely(!slot)) + return kvm_handle_noslot_fault(vcpu, fault, access); + + /* + * Retry the page fault if the gfn hit a memslot that is being deleted + * or moved. This ensures any existing SPTEs for the old memslot will + * be zapped before KVM inserts a new MMIO SPTE for the gfn. + */ + if (slot->flags & KVM_MEMSLOT_INVALID) + return RET_PF_RETRY; + + if (slot->id == APIC_ACCESS_PAGE_PRIVATE_MEMSLOT) { + /* + * Don't map L1's APIC access page into L2, KVM doesn't support + * using APICv/AVIC to accelerate L2 accesses to L1's APIC, + * i.e. the access needs to be emulated. Emulating access to + * L1's APIC is also correct if L1 is accelerating L2's own + * virtual APIC, but for some reason L1 also maps _L1's_ APIC + * into L2. Note, vcpu_is_mmio_gpa() always treats access to + * the APIC as MMIO. Allow an MMIO SPTE to be created, as KVM + * uses different roots for L1 vs. L2, i.e. there is no danger + * of breaking APICv/AVIC for L1. + */ + if (is_guest_mode(vcpu)) + return kvm_handle_noslot_fault(vcpu, fault, access); + + /* + * If the APIC access page exists but is disabled, go directly + * to emulation without caching the MMIO access or creating a + * MMIO SPTE. That way the cache doesn't need to be purged + * when the AVIC is re-enabled. + */ + if (!kvm_apicv_activated(vcpu->kvm)) + return RET_PF_EMULATE; + } + + /* * Check for a relevant mmu_notifier invalidation event before getting * the pfn from the primary MMU, and before acquiring mmu_lock. * @@ -4439,18 +4647,17 @@ static int kvm_faultin_pfn(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault, * *guaranteed* to need to retry, i.e. waiting until mmu_lock is held * to detect retry guarantees the worst case latency for the vCPU. */ - if (fault->slot && - mmu_invalidate_retry_gfn_unsafe(vcpu->kvm, fault->mmu_seq, fault->gfn)) + if (mmu_invalidate_retry_gfn_unsafe(kvm, fault->mmu_seq, fault->gfn)) return RET_PF_RETRY; - ret = __kvm_faultin_pfn(vcpu, fault); + ret = __kvm_mmu_faultin_pfn(vcpu, fault); if (ret != RET_PF_CONTINUE) return ret; if (unlikely(is_error_pfn(fault->pfn))) return kvm_handle_error_pfn(vcpu, fault); - if (unlikely(!fault->slot)) + if (WARN_ON_ONCE(!fault->slot || is_noslot_pfn(fault->pfn))) return kvm_handle_noslot_fault(vcpu, fault, access); /* @@ -4460,8 +4667,8 @@ static int kvm_faultin_pfn(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault, * overall cost of failing to detect the invalidation until after * mmu_lock is acquired. */ - if (mmu_invalidate_retry_gfn_unsafe(vcpu->kvm, fault->mmu_seq, fault->gfn)) { - kvm_release_pfn_clean(fault->pfn); + if (mmu_invalidate_retry_gfn_unsafe(kvm, fault->mmu_seq, fault->gfn)) { + kvm_mmu_finish_page_fault(vcpu, fault, RET_PF_RETRY); return RET_PF_RETRY; } @@ -4510,7 +4717,7 @@ static int direct_page_fault(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault return RET_PF_RETRY; if (page_fault_handle_page_track(vcpu, fault)) - return RET_PF_EMULATE; + return RET_PF_WRITE_PROTECTED; r = fast_page_fault(vcpu, fault); if (r != RET_PF_INVALID) @@ -4520,7 +4727,7 @@ static int direct_page_fault(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault if (r) return r; - r = kvm_faultin_pfn(vcpu, fault, ACC_ALL); + r = kvm_mmu_faultin_pfn(vcpu, fault, ACC_ALL); if (r != RET_PF_CONTINUE) return r; @@ -4537,8 +4744,8 @@ static int direct_page_fault(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault r = direct_map(vcpu, fault); out_unlock: + kvm_mmu_finish_page_fault(vcpu, fault, r); write_unlock(&vcpu->kvm->mmu_lock); - kvm_release_pfn_clean(fault->pfn); return r; } @@ -4561,13 +4768,24 @@ int kvm_handle_page_fault(struct kvm_vcpu *vcpu, u64 error_code, if (WARN_ON_ONCE(fault_address >> 32)) return -EFAULT; #endif + /* + * Legacy #PF exception only have a 32-bit error code. Simply drop the + * upper bits as KVM doesn't use them for #PF (because they are never + * set), and to ensure there are no collisions with KVM-defined bits. + */ + if (WARN_ON_ONCE(error_code >> 32)) + error_code = lower_32_bits(error_code); + + /* + * Restrict KVM-defined flags to bits 63:32 so that it's impossible for + * them to conflict with #PF error codes, which are limited to 32 bits. + */ + BUILD_BUG_ON(lower_32_bits(PFERR_SYNTHETIC_MASK)); vcpu->arch.l1tf_flush_l1d = true; if (!flags) { trace_kvm_page_fault(vcpu, fault_address, error_code); - if (kvm_event_needs_reinjection(vcpu)) - kvm_mmu_unprotect_page_virt(vcpu, fault_address); r = kvm_mmu_page_fault(vcpu, fault_address, error_code, insn, insn_len); } else if (flags & KVM_PV_REASON_PAGE_NOT_PRESENT) { @@ -4590,7 +4808,7 @@ static int kvm_tdp_mmu_page_fault(struct kvm_vcpu *vcpu, int r; if (page_fault_handle_page_track(vcpu, fault)) - return RET_PF_EMULATE; + return RET_PF_WRITE_PROTECTED; r = fast_page_fault(vcpu, fault); if (r != RET_PF_INVALID) @@ -4600,7 +4818,7 @@ static int kvm_tdp_mmu_page_fault(struct kvm_vcpu *vcpu, if (r) return r; - r = kvm_faultin_pfn(vcpu, fault, ACC_ALL); + r = kvm_mmu_faultin_pfn(vcpu, fault, ACC_ALL); if (r != RET_PF_CONTINUE) return r; @@ -4613,50 +4831,110 @@ static int kvm_tdp_mmu_page_fault(struct kvm_vcpu *vcpu, r = kvm_tdp_mmu_map(vcpu, fault); out_unlock: + kvm_mmu_finish_page_fault(vcpu, fault, r); read_unlock(&vcpu->kvm->mmu_lock); - kvm_release_pfn_clean(fault->pfn); return r; } #endif -bool __kvm_mmu_honors_guest_mtrrs(bool vm_has_noncoherent_dma) +int kvm_tdp_page_fault(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault) +{ +#ifdef CONFIG_X86_64 + if (tdp_mmu_enabled) + return kvm_tdp_mmu_page_fault(vcpu, fault); +#endif + + return direct_page_fault(vcpu, fault); +} + +int kvm_tdp_map_page(struct kvm_vcpu *vcpu, gpa_t gpa, u64 error_code, u8 *level) { + int r; + /* - * If host MTRRs are ignored (shadow_memtype_mask is non-zero), and the - * VM has non-coherent DMA (DMA doesn't snoop CPU caches), KVM's ABI is - * to honor the memtype from the guest's MTRRs so that guest accesses - * to memory that is DMA'd aren't cached against the guest's wishes. - * - * Note, KVM may still ultimately ignore guest MTRRs for certain PFNs, - * e.g. KVM will force UC memtype for host MMIO. + * Restrict to TDP page fault, since that's the only case where the MMU + * is indexed by GPA. */ - return vm_has_noncoherent_dma && shadow_memtype_mask; + if (vcpu->arch.mmu->page_fault != kvm_tdp_page_fault) + return -EOPNOTSUPP; + + do { + if (signal_pending(current)) + return -EINTR; + + if (kvm_check_request(KVM_REQ_VM_DEAD, vcpu)) + return -EIO; + + cond_resched(); + r = kvm_mmu_do_page_fault(vcpu, gpa, error_code, true, NULL, level); + } while (r == RET_PF_RETRY); + + if (r < 0) + return r; + + switch (r) { + case RET_PF_FIXED: + case RET_PF_SPURIOUS: + case RET_PF_WRITE_PROTECTED: + return 0; + + case RET_PF_EMULATE: + return -ENOENT; + + case RET_PF_RETRY: + case RET_PF_CONTINUE: + case RET_PF_INVALID: + default: + WARN_ONCE(1, "could not fix page fault during prefault"); + return -EIO; + } } +EXPORT_SYMBOL_GPL(kvm_tdp_map_page); -int kvm_tdp_page_fault(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault) +long kvm_arch_vcpu_pre_fault_memory(struct kvm_vcpu *vcpu, + struct kvm_pre_fault_memory *range) { + u64 error_code = PFERR_GUEST_FINAL_MASK; + u8 level = PG_LEVEL_4K; + u64 direct_bits; + u64 end; + int r; + + if (!vcpu->kvm->arch.pre_fault_allowed) + return -EOPNOTSUPP; + + if (kvm_is_gfn_alias(vcpu->kvm, gpa_to_gfn(range->gpa))) + return -EINVAL; + /* - * If the guest's MTRRs may be used to compute the "real" memtype, - * restrict the mapping level to ensure KVM uses a consistent memtype - * across the entire mapping. + * reload is efficient when called repeatedly, so we can do it on + * every iteration. */ - if (kvm_mmu_honors_guest_mtrrs(vcpu->kvm)) { - for ( ; fault->max_level > PG_LEVEL_4K; --fault->max_level) { - int page_num = KVM_PAGES_PER_HPAGE(fault->max_level); - gfn_t base = gfn_round_for_level(fault->gfn, - fault->max_level); + r = kvm_mmu_reload(vcpu); + if (r) + return r; - if (kvm_mtrr_check_gfn_range_consistency(vcpu, base, page_num)) - break; - } - } + direct_bits = 0; + if (kvm_arch_has_private_mem(vcpu->kvm) && + kvm_mem_is_private(vcpu->kvm, gpa_to_gfn(range->gpa))) + error_code |= PFERR_PRIVATE_ACCESS; + else + direct_bits = gfn_to_gpa(kvm_gfn_direct_bits(vcpu->kvm)); -#ifdef CONFIG_X86_64 - if (tdp_mmu_enabled) - return kvm_tdp_mmu_page_fault(vcpu, fault); -#endif + /* + * Shadow paging uses GVA for kvm page fault, so restrict to + * two-dimensional paging. + */ + r = kvm_tdp_map_page(vcpu, range->gpa | direct_bits, error_code, &level); + if (r < 0) + return r; - return direct_page_fault(vcpu, fault); + /* + * If the mapping that covers range->gpa can use a huge page, it + * may start below it or end after range->gpa + range->size. + */ + end = (range->gpa & KVM_HPAGE_MASK(level)) + KVM_HPAGE_SIZE(level); + return min(range->size, end - range->gpa); } static void nonpaging_init_context(struct kvm_mmu *context) @@ -4812,7 +5090,7 @@ EXPORT_SYMBOL_GPL(kvm_mmu_new_pgd); static bool sync_mmio_spte(struct kvm_vcpu *vcpu, u64 *sptep, gfn_t gfn, unsigned int access) { - if (unlikely(is_mmio_spte(*sptep))) { + if (unlikely(is_mmio_spte(vcpu->kvm, *sptep))) { if (gfn != get_mmio_spte_gfn(*sptep)) { mmu_spte_clear_no_track(sptep); return true; @@ -4933,9 +5211,9 @@ static void reset_guest_rsvds_bits_mask(struct kvm_vcpu *vcpu, __reset_rsvds_bits_mask(&context->guest_rsvd_check, vcpu->arch.reserved_gpa_bits, context->cpu_role.base.level, is_efer_nx(context), - guest_can_use(vcpu, X86_FEATURE_GBPAGES), + guest_cpu_cap_has(vcpu, X86_FEATURE_GBPAGES), is_cr4_pse(context), - guest_cpuid_is_amd_or_hygon(vcpu)); + guest_cpuid_is_amd_compatible(vcpu)); } static void __reset_rsvds_bits_mask_ept(struct rsvd_bits_validate *rsvd_check, @@ -4986,7 +5264,7 @@ static void reset_rsvds_bits_mask_ept(struct kvm_vcpu *vcpu, static inline u64 reserved_hpa_bits(void) { - return rsvd_bits(shadow_phys_bits, 63); + return rsvd_bits(kvm_host.maxphyaddr, 63); } /* @@ -5010,7 +5288,7 @@ static void reset_shadow_zero_bits_mask(struct kvm_vcpu *vcpu, __reset_rsvds_bits_mask(shadow_zero_check, reserved_hpa_bits(), context->root_role.level, context->root_role.efer_nx, - guest_can_use(vcpu, X86_FEATURE_GBPAGES), + guest_cpu_cap_has(vcpu, X86_FEATURE_GBPAGES), is_pse, is_amd); if (!shadow_me_mask) @@ -5311,17 +5589,29 @@ void __kvm_mmu_refresh_passthrough_bits(struct kvm_vcpu *vcpu, static inline int kvm_mmu_get_tdp_level(struct kvm_vcpu *vcpu) { + int maxpa; + + if (vcpu->kvm->arch.vm_type == KVM_X86_TDX_VM) + maxpa = cpuid_query_maxguestphyaddr(vcpu); + else + maxpa = cpuid_maxphyaddr(vcpu); + /* tdp_root_level is architecture forced level, use it if nonzero */ if (tdp_root_level) return tdp_root_level; /* Use 5-level TDP if and only if it's useful/necessary. */ - if (max_tdp_level == 5 && cpuid_maxphyaddr(vcpu) <= 48) + if (max_tdp_level == 5 && maxpa <= 48) return 4; return max_tdp_level; } +u8 kvm_mmu_get_max_tdp_level(void) +{ + return tdp_root_level ? tdp_root_level : max_tdp_level; +} + static union kvm_mmu_page_role kvm_calc_tdp_mmu_root_page_role(struct kvm_vcpu *vcpu, union kvm_cpu_role cpu_role) @@ -5333,7 +5623,7 @@ kvm_calc_tdp_mmu_root_page_role(struct kvm_vcpu *vcpu, role.efer_nx = true; role.smm = cpu_role.base.smm; role.guest_mode = cpu_role.base.guest_mode; - role.ad_disabled = !kvm_ad_enabled(); + role.ad_disabled = !kvm_ad_enabled; role.level = kvm_mmu_get_tdp_level(vcpu); role.direct = true; role.has_4_byte_gpte = false; @@ -5430,7 +5720,7 @@ void kvm_init_shadow_npt_mmu(struct kvm_vcpu *vcpu, unsigned long cr0, union kvm_mmu_page_role root_role; /* NPT requires CR0.PG=1. */ - WARN_ON_ONCE(cpu_role.base.direct); + WARN_ON_ONCE(cpu_role.base.direct || !cpu_role.base.guest_mode); root_role = cpu_role.base; root_role.level = kvm_mmu_get_tdp_level(vcpu); @@ -5576,9 +5866,9 @@ void kvm_mmu_after_set_cpuid(struct kvm_vcpu *vcpu) * that problem is swept under the rug; KVM's CPUID API is horrific and * it's all but impossible to solve it without introducing a new API. */ - vcpu->arch.root_mmu.root_role.word = 0; - vcpu->arch.guest_mmu.root_role.word = 0; - vcpu->arch.nested_mmu.root_role.word = 0; + vcpu->arch.root_mmu.root_role.invalid = 1; + vcpu->arch.guest_mmu.root_role.invalid = 1; + vcpu->arch.nested_mmu.root_role.invalid = 1; vcpu->arch.root_mmu.cpu_role.ext.valid = 0; vcpu->arch.guest_mmu.cpu_role.ext.valid = 0; vcpu->arch.nested_mmu.cpu_role.ext.valid = 0; @@ -5626,10 +5916,11 @@ int kvm_mmu_load(struct kvm_vcpu *vcpu) * stale entries. Flushing on alloc also allows KVM to skip the TLB * flush when freeing a root (see kvm_tdp_mmu_put_root()). */ - static_call(kvm_x86_flush_tlb_current)(vcpu); + kvm_x86_call(flush_tlb_current)(vcpu); out: return r; } +EXPORT_SYMBOL_GPL(kvm_mmu_load); void kvm_mmu_unload(struct kvm_vcpu *vcpu) { @@ -5691,6 +5982,7 @@ void kvm_mmu_free_obsolete_roots(struct kvm_vcpu *vcpu) __kvm_mmu_free_obsolete_roots(vcpu->kvm, &vcpu->arch.root_mmu); __kvm_mmu_free_obsolete_roots(vcpu->kvm, &vcpu->arch.guest_mmu); } +EXPORT_SYMBOL_GPL(kvm_mmu_free_obsolete_roots); static u64 mmu_pte_write_fetch_gpte(struct kvm_vcpu *vcpu, gpa_t *gpa, int *bytes) @@ -5802,10 +6094,15 @@ void kvm_mmu_track_write(struct kvm_vcpu *vcpu, gpa_t gpa, const u8 *new, bool flush = false; /* - * If we don't have indirect shadow pages, it means no page is - * write-protected, so we can exit simply. + * When emulating guest writes, ensure the written value is visible to + * any task that is handling page faults before checking whether or not + * KVM is shadowing a guest PTE. This ensures either KVM will create + * the correct SPTE in the page fault handler, or this task will see + * a non-zero indirect_shadow_pages. Pairs with the smp_mb() in + * account_shadowed(). */ - if (!READ_ONCE(vcpu->kvm->arch.indirect_shadow_pages)) + smp_mb(); + if (!vcpu->kvm->arch.indirect_shadow_pages) return; write_lock(&vcpu->kvm->mmu_lock); @@ -5840,78 +6137,195 @@ void kvm_mmu_track_write(struct kvm_vcpu *vcpu, gpa_t gpa, const u8 *new, write_unlock(&vcpu->kvm->mmu_lock); } -int noinline kvm_mmu_page_fault(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa, u64 error_code, - void *insn, int insn_len) +static bool is_write_to_guest_page_table(u64 error_code) +{ + const u64 mask = PFERR_GUEST_PAGE_MASK | PFERR_WRITE_MASK | PFERR_PRESENT_MASK; + + return (error_code & mask) == mask; +} + +static int kvm_mmu_write_protect_fault(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa, + u64 error_code, int *emulation_type) { - int r, emulation_type = EMULTYPE_PF; bool direct = vcpu->arch.mmu->root_role.direct; /* - * IMPLICIT_ACCESS is a KVM-defined flag used to correctly perform SMAP - * checks when emulating instructions that triggers implicit access. - * WARN if hardware generates a fault with an error code that collides - * with the KVM-defined value. Clear the flag and continue on, i.e. - * don't terminate the VM, as KVM can't possibly be relying on a flag - * that KVM doesn't know about. + * Do not try to unprotect and retry if the vCPU re-faulted on the same + * RIP with the same address that was previously unprotected, as doing + * so will likely put the vCPU into an infinite. E.g. if the vCPU uses + * a non-page-table modifying instruction on the PDE that points to the + * instruction, then unprotecting the gfn will unmap the instruction's + * code, i.e. make it impossible for the instruction to ever complete. + */ + if (vcpu->arch.last_retry_eip == kvm_rip_read(vcpu) && + vcpu->arch.last_retry_addr == cr2_or_gpa) + return RET_PF_EMULATE; + + /* + * Reset the unprotect+retry values that guard against infinite loops. + * The values will be refreshed if KVM explicitly unprotects a gfn and + * retries, in all other cases it's safe to retry in the future even if + * the next page fault happens on the same RIP+address. + */ + vcpu->arch.last_retry_eip = 0; + vcpu->arch.last_retry_addr = 0; + + /* + * It should be impossible to reach this point with an MMIO cache hit, + * as RET_PF_WRITE_PROTECTED is returned if and only if there's a valid, + * writable memslot, and creating a memslot should invalidate the MMIO + * cache by way of changing the memslot generation. WARN and disallow + * retry if MMIO is detected, as retrying MMIO emulation is pointless + * and could put the vCPU into an infinite loop because the processor + * will keep faulting on the non-existent MMIO address. */ - if (WARN_ON_ONCE(error_code & PFERR_IMPLICIT_ACCESS)) - error_code &= ~PFERR_IMPLICIT_ACCESS; + if (WARN_ON_ONCE(mmio_info_in_cache(vcpu, cr2_or_gpa, direct))) + return RET_PF_EMULATE; + + /* + * Before emulating the instruction, check to see if the access was due + * to a read-only violation while the CPU was walking non-nested NPT + * page tables, i.e. for a direct MMU, for _guest_ page tables in L1. + * If L1 is sharing (a subset of) its page tables with L2, e.g. by + * having nCR3 share lower level page tables with hCR3, then when KVM + * (L0) write-protects the nested NPTs, i.e. npt12 entries, KVM is also + * unknowingly write-protecting L1's guest page tables, which KVM isn't + * shadowing. + * + * Because the CPU (by default) walks NPT page tables using a write + * access (to ensure the CPU can do A/D updates), page walks in L1 can + * trigger write faults for the above case even when L1 isn't modifying + * PTEs. As a result, KVM will unnecessarily emulate (or at least, try + * to emulate) an excessive number of L1 instructions; because L1's MMU + * isn't shadowed by KVM, there is no need to write-protect L1's gPTEs + * and thus no need to emulate in order to guarantee forward progress. + * + * Try to unprotect the gfn, i.e. zap any shadow pages, so that L1 can + * proceed without triggering emulation. If one or more shadow pages + * was zapped, skip emulation and resume L1 to let it natively execute + * the instruction. If no shadow pages were zapped, then the write- + * fault is due to something else entirely, i.e. KVM needs to emulate, + * as resuming the guest will put it into an infinite loop. + * + * Note, this code also applies to Intel CPUs, even though it is *very* + * unlikely that an L1 will share its page tables (IA32/PAE/paging64 + * format) with L2's page tables (EPT format). + * + * For indirect MMUs, i.e. if KVM is shadowing the current MMU, try to + * unprotect the gfn and retry if an event is awaiting reinjection. If + * KVM emulates multiple instructions before completing event injection, + * the event could be delayed beyond what is architecturally allowed, + * e.g. KVM could inject an IRQ after the TPR has been raised. + */ + if (((direct && is_write_to_guest_page_table(error_code)) || + (!direct && kvm_event_needs_reinjection(vcpu))) && + kvm_mmu_unprotect_gfn_and_retry(vcpu, cr2_or_gpa)) + return RET_PF_RETRY; + + /* + * The gfn is write-protected, but if KVM detects its emulating an + * instruction that is unlikely to be used to modify page tables, or if + * emulation fails, KVM can try to unprotect the gfn and let the CPU + * re-execute the instruction that caused the page fault. Do not allow + * retrying an instruction from a nested guest as KVM is only explicitly + * shadowing L1's page tables, i.e. unprotecting something for L1 isn't + * going to magically fix whatever issue caused L2 to fail. + */ + if (!is_guest_mode(vcpu)) + *emulation_type |= EMULTYPE_ALLOW_RETRY_PF; + + return RET_PF_EMULATE; +} + +int noinline kvm_mmu_page_fault(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa, u64 error_code, + void *insn, int insn_len) +{ + int r, emulation_type = EMULTYPE_PF; + bool direct = vcpu->arch.mmu->root_role.direct; if (WARN_ON_ONCE(!VALID_PAGE(vcpu->arch.mmu->root.hpa))) return RET_PF_RETRY; + /* + * Except for reserved faults (emulated MMIO is shared-only), set the + * PFERR_PRIVATE_ACCESS flag for software-protected VMs based on the gfn's + * current attributes, which are the source of truth for such VMs. Note, + * this wrong for nested MMUs as the GPA is an L2 GPA, but KVM doesn't + * currently supported nested virtualization (among many other things) + * for software-protected VMs. + */ + if (IS_ENABLED(CONFIG_KVM_SW_PROTECTED_VM) && + !(error_code & PFERR_RSVD_MASK) && + vcpu->kvm->arch.vm_type == KVM_X86_SW_PROTECTED_VM && + kvm_mem_is_private(vcpu->kvm, gpa_to_gfn(cr2_or_gpa))) + error_code |= PFERR_PRIVATE_ACCESS; + r = RET_PF_INVALID; if (unlikely(error_code & PFERR_RSVD_MASK)) { + if (WARN_ON_ONCE(error_code & PFERR_PRIVATE_ACCESS)) + return -EFAULT; + r = handle_mmio_page_fault(vcpu, cr2_or_gpa, direct); if (r == RET_PF_EMULATE) goto emulate; } if (r == RET_PF_INVALID) { - r = kvm_mmu_do_page_fault(vcpu, cr2_or_gpa, - lower_32_bits(error_code), false, - &emulation_type); + vcpu->stat.pf_taken++; + + r = kvm_mmu_do_page_fault(vcpu, cr2_or_gpa, error_code, false, + &emulation_type, NULL); if (KVM_BUG_ON(r == RET_PF_INVALID, vcpu->kvm)) return -EIO; } if (r < 0) return r; - if (r != RET_PF_EMULATE) - return 1; + + if (r == RET_PF_WRITE_PROTECTED) + r = kvm_mmu_write_protect_fault(vcpu, cr2_or_gpa, error_code, + &emulation_type); + + if (r == RET_PF_FIXED) + vcpu->stat.pf_fixed++; + else if (r == RET_PF_EMULATE) + vcpu->stat.pf_emulate++; + else if (r == RET_PF_SPURIOUS) + vcpu->stat.pf_spurious++; /* - * Before emulating the instruction, check if the error code - * was due to a RO violation while translating the guest page. - * This can occur when using nested virtualization with nested - * paging in both guests. If true, we simply unprotect the page - * and resume the guest. + * None of handle_mmio_page_fault(), kvm_mmu_do_page_fault(), or + * kvm_mmu_write_protect_fault() return RET_PF_CONTINUE. + * kvm_mmu_do_page_fault() only uses RET_PF_CONTINUE internally to + * indicate continuing the page fault handling until to the final + * page table mapping phase. */ - if (vcpu->arch.mmu->root_role.direct && - (error_code & PFERR_NESTED_GUEST_PAGE) == PFERR_NESTED_GUEST_PAGE) { - kvm_mmu_unprotect_page(vcpu->kvm, gpa_to_gfn(cr2_or_gpa)); - return 1; - } + WARN_ON_ONCE(r == RET_PF_CONTINUE); + if (r != RET_PF_EMULATE) + return r; - /* - * vcpu->arch.mmu.page_fault returned RET_PF_EMULATE, but we can still - * optimistically try to just unprotect the page and let the processor - * re-execute the instruction that caused the page fault. Do not allow - * retrying MMIO emulation, as it's not only pointless but could also - * cause us to enter an infinite loop because the processor will keep - * faulting on the non-existent MMIO address. Retrying an instruction - * from a nested guest is also pointless and dangerous as we are only - * explicitly shadowing L1's page tables, i.e. unprotecting something - * for L1 isn't going to magically fix whatever issue cause L2 to fail. - */ - if (!mmio_info_in_cache(vcpu, cr2_or_gpa, direct) && !is_guest_mode(vcpu)) - emulation_type |= EMULTYPE_ALLOW_RETRY_PF; emulate: return x86_emulate_instruction(vcpu, cr2_or_gpa, emulation_type, insn, insn_len); } EXPORT_SYMBOL_GPL(kvm_mmu_page_fault); +void kvm_mmu_print_sptes(struct kvm_vcpu *vcpu, gpa_t gpa, const char *msg) +{ + u64 sptes[PT64_ROOT_MAX_LEVEL + 1]; + int root_level, leaf, level; + + leaf = get_sptes_lockless(vcpu, gpa, sptes, &root_level); + if (unlikely(leaf < 0)) + return; + + pr_err("%s %llx", msg, gpa); + for (level = root_level; level >= leaf; level--) + pr_cont(", spte[%d] = 0x%llx", level, sptes[level]); + pr_cont("\n"); +} +EXPORT_SYMBOL_GPL(kvm_mmu_print_sptes); + static void __kvm_mmu_invalidate_addr(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu, u64 addr, hpa_t root_hpa) { @@ -5959,10 +6373,10 @@ void kvm_mmu_invalidate_addr(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu, /* It's actually a GPA for vcpu->arch.guest_mmu. */ if (mmu != &vcpu->arch.guest_mmu) { /* INVLPG on a non-canonical address is a NOP according to the SDM. */ - if (is_noncanonical_address(addr, vcpu)) + if (is_noncanonical_invlpg_address(addr, vcpu)) return; - static_call(kvm_x86_flush_tlb_gva)(vcpu, addr); + kvm_x86_call(flush_tlb_gva)(vcpu, addr); } if (!mmu->sync_spte) @@ -6048,59 +6462,6 @@ void kvm_configure_mmu(bool enable_tdp, int tdp_forced_root_level, } EXPORT_SYMBOL_GPL(kvm_configure_mmu); -/* The return value indicates if tlb flush on all vcpus is needed. */ -typedef bool (*slot_rmaps_handler) (struct kvm *kvm, - struct kvm_rmap_head *rmap_head, - const struct kvm_memory_slot *slot); - -static __always_inline bool __walk_slot_rmaps(struct kvm *kvm, - const struct kvm_memory_slot *slot, - slot_rmaps_handler fn, - int start_level, int end_level, - gfn_t start_gfn, gfn_t end_gfn, - bool flush_on_yield, bool flush) -{ - struct slot_rmap_walk_iterator iterator; - - lockdep_assert_held_write(&kvm->mmu_lock); - - for_each_slot_rmap_range(slot, start_level, end_level, start_gfn, - end_gfn, &iterator) { - if (iterator.rmap) - flush |= fn(kvm, iterator.rmap, slot); - - if (need_resched() || rwlock_needbreak(&kvm->mmu_lock)) { - if (flush && flush_on_yield) { - kvm_flush_remote_tlbs_range(kvm, start_gfn, - iterator.gfn - start_gfn + 1); - flush = false; - } - cond_resched_rwlock_write(&kvm->mmu_lock); - } - } - - return flush; -} - -static __always_inline bool walk_slot_rmaps(struct kvm *kvm, - const struct kvm_memory_slot *slot, - slot_rmaps_handler fn, - int start_level, int end_level, - bool flush_on_yield) -{ - return __walk_slot_rmaps(kvm, slot, fn, start_level, end_level, - slot->base_gfn, slot->base_gfn + slot->npages - 1, - flush_on_yield, false); -} - -static __always_inline bool walk_slot_rmaps_4k(struct kvm *kvm, - const struct kvm_memory_slot *slot, - slot_rmaps_handler fn, - bool flush_on_yield) -{ - return walk_slot_rmaps(kvm, slot, fn, PG_LEVEL_4K, PG_LEVEL_4K, flush_on_yield); -} - static void free_mmu_pages(struct kvm_mmu *mmu) { if (!tdp_enabled && mmu->pae_root) @@ -6117,6 +6478,7 @@ static int __kvm_mmu_create(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu) mmu->root.hpa = INVALID_PAGE; mmu->root.pgd = 0; + mmu->mirror_root_hpa = INVALID_PAGE; for (i = 0; i < KVM_MMU_NUM_PREV_ROOTS; i++) mmu->prev_roots[i] = KVM_MMU_ROOT_INFO_INVALID; @@ -6173,7 +6535,10 @@ int kvm_mmu_create(struct kvm_vcpu *vcpu) vcpu->arch.mmu_page_header_cache.kmem_cache = mmu_page_header_cache; vcpu->arch.mmu_page_header_cache.gfp_zero = __GFP_ZERO; - vcpu->arch.mmu_shadow_page_cache.gfp_zero = __GFP_ZERO; + vcpu->arch.mmu_shadow_page_cache.init_value = + SHADOW_NONPRESENT_VALUE; + if (!vcpu->arch.mmu_shadow_page_cache.init_value) + vcpu->arch.mmu_shadow_page_cache.gfp_zero = __GFP_ZERO; vcpu->arch.mmu = &vcpu->arch.root_mmu; vcpu->arch.walk_mmu = &vcpu->arch.root_mmu; @@ -6197,8 +6562,11 @@ static void kvm_zap_obsolete_pages(struct kvm *kvm) { struct kvm_mmu_page *sp, *node; int nr_zapped, batch = 0; + LIST_HEAD(invalid_list); bool unstable; + lockdep_assert_held(&kvm->slots_lock); + restart: list_for_each_entry_safe_reverse(sp, node, &kvm->arch.active_mmu_pages, link) { @@ -6230,7 +6598,7 @@ restart: } unstable = __kvm_mmu_prepare_zap_page(kvm, sp, - &kvm->arch.zapped_obsolete_pages, &nr_zapped); + &invalid_list, &nr_zapped); batch += nr_zapped; if (unstable) @@ -6246,7 +6614,7 @@ restart: * kvm_mmu_load()), and the reload in the caller ensure no vCPUs are * running with an obsolete MMU. */ - kvm_mmu_commit_zap_page(kvm, &kvm->arch.zapped_obsolete_pages); + kvm_mmu_commit_zap_page(kvm, &invalid_list); } /* @@ -6280,8 +6648,13 @@ static void kvm_mmu_zap_all_fast(struct kvm *kvm) * write and in the same critical section as making the reload request, * e.g. before kvm_zap_obsolete_pages() could drop mmu_lock and yield. */ - if (tdp_mmu_enabled) - kvm_tdp_mmu_invalidate_all_roots(kvm); + if (tdp_mmu_enabled) { + /* + * External page tables don't support fast zapping, therefore + * their mirrors must be invalidated separately by the caller. + */ + kvm_tdp_mmu_invalidate_roots(kvm, KVM_DIRECT_ROOTS); + } /* * Notify all vcpus to reload its shadow page table and flush TLB. @@ -6306,18 +6679,13 @@ static void kvm_mmu_zap_all_fast(struct kvm *kvm) * lead to use-after-free. */ if (tdp_mmu_enabled) - kvm_tdp_mmu_zap_invalidated_roots(kvm); -} - -static bool kvm_has_zapped_obsolete_pages(struct kvm *kvm) -{ - return unlikely(!list_empty_careful(&kvm->arch.zapped_obsolete_pages)); + kvm_tdp_mmu_zap_invalidated_roots(kvm, true); } void kvm_mmu_init_vm(struct kvm *kvm) { + kvm->arch.shadow_mmio_value = shadow_mmio_value; INIT_LIST_HEAD(&kvm->arch.active_mmu_pages); - INIT_LIST_HEAD(&kvm->arch.zapped_obsolete_pages); INIT_LIST_HEAD(&kvm->arch.possible_nx_huge_pages); spin_lock_init(&kvm->arch.mmu_unsync_pages_lock); @@ -6370,9 +6738,8 @@ static bool kvm_rmap_zap_gfn_range(struct kvm *kvm, gfn_t gfn_start, gfn_t gfn_e if (WARN_ON_ONCE(start >= end)) continue; - flush = __walk_slot_rmaps(kvm, memslot, __kvm_zap_rmap, - PG_LEVEL_4K, KVM_MAX_HUGEPAGE_LEVEL, - start, end - 1, true, flush); + flush = __kvm_rmap_zap_gfn_range(kvm, memslot, start, + end, true, flush); } } @@ -6552,7 +6919,7 @@ static void shadow_mmu_split_huge_page(struct kvm *kvm, continue; } - spte = make_huge_page_split_spte(kvm, huge_spte, sp->role, index); + spte = make_small_spte(kvm, huge_spte, sp->role, index); mmu_spte_set(sptep, spte); __rmap_add(kvm, cache, slot, sptep, gfn, sp->role.access); } @@ -6660,7 +7027,7 @@ static void kvm_shadow_mmu_try_split_huge_pages(struct kvm *kvm, */ for (level = KVM_MAX_HUGEPAGE_LEVEL; level > target_level; level--) __walk_slot_rmaps(kvm, slot, shadow_mmu_try_split_huge_pages, - level, level, start, end - 1, true, false); + level, level, start, end - 1, true, true, false); } /* Must be called with the mmu_lock held in write-mode. */ @@ -6735,8 +7102,7 @@ restart: * mapping if the indirect sp has level = 1. */ if (sp->role.direct && - sp->role.level < kvm_mmu_max_mapping_level(kvm, slot, sp->gfn, - PG_LEVEL_NUM)) { + sp->role.level < kvm_mmu_max_mapping_level(kvm, slot, sp->gfn)) { kvm_zap_one_rmap_spte(kvm, rmap_head, sptep); if (kvm_available_flush_remote_tlbs_range()) @@ -6750,6 +7116,7 @@ restart: return need_tlb_flush; } +EXPORT_SYMBOL_GPL(kvm_zap_gfn_range); static void kvm_rmap_zap_collapsible_sptes(struct kvm *kvm, const struct kvm_memory_slot *slot) @@ -6763,8 +7130,8 @@ static void kvm_rmap_zap_collapsible_sptes(struct kvm *kvm, kvm_flush_remote_tlbs_memslot(kvm, slot); } -void kvm_mmu_zap_collapsible_sptes(struct kvm *kvm, - const struct kvm_memory_slot *slot) +void kvm_mmu_recover_huge_pages(struct kvm *kvm, + const struct kvm_memory_slot *slot) { if (kvm_memslots_have_rmaps(kvm)) { write_lock(&kvm->mmu_lock); @@ -6774,7 +7141,7 @@ void kvm_mmu_zap_collapsible_sptes(struct kvm *kvm, if (tdp_mmu_enabled) { read_lock(&kvm->mmu_lock); - kvm_tdp_mmu_zap_collapsible_sptes(kvm, slot); + kvm_tdp_mmu_recover_huge_pages(kvm, slot); read_unlock(&kvm->mmu_lock); } } @@ -6838,10 +7205,71 @@ void kvm_arch_flush_shadow_all(struct kvm *kvm) kvm_mmu_zap_all(kvm); } +static void kvm_mmu_zap_memslot_pages_and_flush(struct kvm *kvm, + struct kvm_memory_slot *slot, + bool flush) +{ + LIST_HEAD(invalid_list); + unsigned long i; + + if (list_empty(&kvm->arch.active_mmu_pages)) + goto out_flush; + + /* + * Since accounting information is stored in struct kvm_arch_memory_slot, + * all MMU pages that are shadowing guest PTEs must be zapped before the + * memslot is deleted, as freeing such pages after the memslot is freed + * will result in use-after-free, e.g. in unaccount_shadowed(). + */ + for (i = 0; i < slot->npages; i++) { + struct kvm_mmu_page *sp; + gfn_t gfn = slot->base_gfn + i; + + for_each_gfn_valid_sp_with_gptes(kvm, sp, gfn) + kvm_mmu_prepare_zap_page(kvm, sp, &invalid_list); + + if (need_resched() || rwlock_needbreak(&kvm->mmu_lock)) { + kvm_mmu_remote_flush_or_zap(kvm, &invalid_list, flush); + flush = false; + cond_resched_rwlock_write(&kvm->mmu_lock); + } + } + +out_flush: + kvm_mmu_remote_flush_or_zap(kvm, &invalid_list, flush); +} + +static void kvm_mmu_zap_memslot(struct kvm *kvm, + struct kvm_memory_slot *slot) +{ + struct kvm_gfn_range range = { + .slot = slot, + .start = slot->base_gfn, + .end = slot->base_gfn + slot->npages, + .may_block = true, + .attr_filter = KVM_FILTER_PRIVATE | KVM_FILTER_SHARED, + }; + bool flush; + + write_lock(&kvm->mmu_lock); + flush = kvm_unmap_gfn_range(kvm, &range); + kvm_mmu_zap_memslot_pages_and_flush(kvm, slot, flush); + write_unlock(&kvm->mmu_lock); +} + +static inline bool kvm_memslot_flush_zap_all(struct kvm *kvm) +{ + return kvm->arch.vm_type == KVM_X86_DEFAULT_VM && + kvm_check_has_quirk(kvm, KVM_X86_QUIRK_SLOT_ZAP_ALL); +} + void kvm_arch_flush_shadow_memslot(struct kvm *kvm, struct kvm_memory_slot *slot) { - kvm_mmu_zap_all_fast(kvm); + if (kvm_memslot_flush_zap_all(kvm)) + kvm_mmu_zap_all_fast(kvm); + else + kvm_mmu_zap_memslot(kvm, slot); } void kvm_mmu_invalidate_mmio_sptes(struct kvm *kvm, u64 gen) @@ -6869,77 +7297,23 @@ void kvm_mmu_invalidate_mmio_sptes(struct kvm *kvm, u64 gen) } } -static unsigned long mmu_shrink_scan(struct shrinker *shrink, - struct shrink_control *sc) +static void mmu_destroy_caches(void) { - struct kvm *kvm; - int nr_to_scan = sc->nr_to_scan; - unsigned long freed = 0; - - mutex_lock(&kvm_lock); - - list_for_each_entry(kvm, &vm_list, vm_list) { - int idx; - LIST_HEAD(invalid_list); - - /* - * Never scan more than sc->nr_to_scan VM instances. - * Will not hit this condition practically since we do not try - * to shrink more than one VM and it is very unlikely to see - * !n_used_mmu_pages so many times. - */ - if (!nr_to_scan--) - break; - /* - * n_used_mmu_pages is accessed without holding kvm->mmu_lock - * here. We may skip a VM instance errorneosly, but we do not - * want to shrink a VM that only started to populate its MMU - * anyway. - */ - if (!kvm->arch.n_used_mmu_pages && - !kvm_has_zapped_obsolete_pages(kvm)) - continue; - - idx = srcu_read_lock(&kvm->srcu); - write_lock(&kvm->mmu_lock); - - if (kvm_has_zapped_obsolete_pages(kvm)) { - kvm_mmu_commit_zap_page(kvm, - &kvm->arch.zapped_obsolete_pages); - goto unlock; - } - - freed = kvm_mmu_zap_oldest_mmu_pages(kvm, sc->nr_to_scan); - -unlock: - write_unlock(&kvm->mmu_lock); - srcu_read_unlock(&kvm->srcu, idx); - - /* - * unfair on small ones - * per-vm shrinkers cry out - * sadness comes quickly - */ - list_move_tail(&kvm->vm_list, &vm_list); - break; - } - - mutex_unlock(&kvm_lock); - return freed; + kmem_cache_destroy(pte_list_desc_cache); + kmem_cache_destroy(mmu_page_header_cache); } -static unsigned long mmu_shrink_count(struct shrinker *shrink, - struct shrink_control *sc) +static void kvm_wake_nx_recovery_thread(struct kvm *kvm) { - return percpu_counter_read_positive(&kvm_total_used_mmu_pages); -} - -static struct shrinker *mmu_shrinker; + /* + * The NX recovery thread is spawned on-demand at the first KVM_RUN and + * may not be valid even though the VM is globally visible. Do nothing, + * as such a VM can't have any possible NX huge pages. + */ + struct vhost_task *nx_thread = READ_ONCE(kvm->arch.nx_huge_page_recovery_thread); -static void mmu_destroy_caches(void) -{ - kmem_cache_destroy(pte_list_desc_cache); - kmem_cache_destroy(mmu_page_header_cache); + if (nx_thread) + vhost_task_wake(nx_thread); } static int get_nx_huge_pages(char *buffer, const struct kernel_param *kp) @@ -7002,7 +7376,7 @@ static int set_nx_huge_pages(const char *val, const struct kernel_param *kp) kvm_mmu_zap_all_fast(kvm); mutex_unlock(&kvm->slots_lock); - wake_up_process(kvm->arch.nx_huge_page_recovery_thread); + kvm_wake_nx_recovery_thread(kvm); } mutex_unlock(&kvm_lock); } @@ -7062,23 +7436,8 @@ int kvm_mmu_vendor_module_init(void) if (!mmu_page_header_cache) goto out; - if (percpu_counter_init(&kvm_total_used_mmu_pages, 0, GFP_KERNEL)) - goto out; - - mmu_shrinker = shrinker_alloc(0, "x86-mmu"); - if (!mmu_shrinker) - goto out_shrinker; - - mmu_shrinker->count_objects = mmu_shrink_count; - mmu_shrinker->scan_objects = mmu_shrink_scan; - mmu_shrinker->seeks = DEFAULT_SEEKS * 10; - - shrinker_register(mmu_shrinker); - return 0; -out_shrinker: - percpu_counter_destroy(&kvm_total_used_mmu_pages); out: mmu_destroy_caches(); return ret; @@ -7087,6 +7446,12 @@ out: void kvm_mmu_destroy(struct kvm_vcpu *vcpu) { kvm_mmu_unload(vcpu); + if (tdp_mmu_enabled) { + read_lock(&vcpu->kvm->mmu_lock); + mmu_free_root_page(vcpu->kvm, &vcpu->arch.mmu->mirror_root_hpa, + NULL); + read_unlock(&vcpu->kvm->mmu_lock); + } free_mmu_pages(&vcpu->arch.root_mmu); free_mmu_pages(&vcpu->arch.guest_mmu); mmu_free_memory_caches(vcpu); @@ -7095,8 +7460,6 @@ void kvm_mmu_destroy(struct kvm_vcpu *vcpu) void kvm_mmu_vendor_module_exit(void) { mmu_destroy_caches(); - percpu_counter_destroy(&kvm_total_used_mmu_pages); - shrinker_free(mmu_shrinker); } /* @@ -7148,7 +7511,7 @@ static int set_nx_huge_pages_recovery_param(const char *val, const struct kernel mutex_lock(&kvm_lock); list_for_each_entry(kvm, &vm_list, vm_list) - wake_up_process(kvm->arch.nx_huge_page_recovery_thread); + kvm_wake_nx_recovery_thread(kvm); mutex_unlock(&kvm_lock); } @@ -7251,68 +7614,95 @@ static void kvm_recover_nx_huge_pages(struct kvm *kvm) srcu_read_unlock(&kvm->srcu, rcu_idx); } -static long get_nx_huge_page_recovery_timeout(u64 start_time) +static void kvm_nx_huge_page_recovery_worker_kill(void *data) { +} + +static bool kvm_nx_huge_page_recovery_worker(void *data) +{ + struct kvm *kvm = data; bool enabled; uint period; + long remaining_time; enabled = calc_nx_huge_pages_recovery_period(&period); + if (!enabled) + return false; + + remaining_time = kvm->arch.nx_huge_page_last + msecs_to_jiffies(period) + - get_jiffies_64(); + if (remaining_time > 0) { + schedule_timeout(remaining_time); + /* check for signals and come back */ + return true; + } - return enabled ? start_time + msecs_to_jiffies(period) - get_jiffies_64() - : MAX_SCHEDULE_TIMEOUT; + __set_current_state(TASK_RUNNING); + kvm_recover_nx_huge_pages(kvm); + kvm->arch.nx_huge_page_last = get_jiffies_64(); + return true; } -static int kvm_nx_huge_page_recovery_worker(struct kvm *kvm, uintptr_t data) +static int kvm_mmu_start_lpage_recovery(struct once *once) { - u64 start_time; - long remaining_time; - - while (true) { - start_time = get_jiffies_64(); - remaining_time = get_nx_huge_page_recovery_timeout(start_time); + struct kvm_arch *ka = container_of(once, struct kvm_arch, nx_once); + struct kvm *kvm = container_of(ka, struct kvm, arch); + struct vhost_task *nx_thread; - set_current_state(TASK_INTERRUPTIBLE); - while (!kthread_should_stop() && remaining_time > 0) { - schedule_timeout(remaining_time); - remaining_time = get_nx_huge_page_recovery_timeout(start_time); - set_current_state(TASK_INTERRUPTIBLE); - } + kvm->arch.nx_huge_page_last = get_jiffies_64(); + nx_thread = vhost_task_create(kvm_nx_huge_page_recovery_worker, + kvm_nx_huge_page_recovery_worker_kill, + kvm, "kvm-nx-lpage-recovery"); - set_current_state(TASK_RUNNING); + if (IS_ERR(nx_thread)) + return PTR_ERR(nx_thread); - if (kthread_should_stop()) - return 0; + vhost_task_start(nx_thread); - kvm_recover_nx_huge_pages(kvm); - } + /* Make the task visible only once it is fully started. */ + WRITE_ONCE(kvm->arch.nx_huge_page_recovery_thread, nx_thread); + return 0; } int kvm_mmu_post_init_vm(struct kvm *kvm) { - int err; - if (nx_hugepage_mitigation_hard_disabled) return 0; - err = kvm_vm_create_worker_thread(kvm, kvm_nx_huge_page_recovery_worker, 0, - "kvm-nx-lpage-recovery", - &kvm->arch.nx_huge_page_recovery_thread); - if (!err) - kthread_unpark(kvm->arch.nx_huge_page_recovery_thread); - - return err; + return call_once(&kvm->arch.nx_once, kvm_mmu_start_lpage_recovery); } void kvm_mmu_pre_destroy_vm(struct kvm *kvm) { if (kvm->arch.nx_huge_page_recovery_thread) - kthread_stop(kvm->arch.nx_huge_page_recovery_thread); + vhost_task_stop(kvm->arch.nx_huge_page_recovery_thread); } #ifdef CONFIG_KVM_GENERIC_MEMORY_ATTRIBUTES +static bool hugepage_test_mixed(struct kvm_memory_slot *slot, gfn_t gfn, + int level) +{ + return lpage_info_slot(gfn, slot, level)->disallow_lpage & KVM_LPAGE_MIXED_FLAG; +} + +static void hugepage_clear_mixed(struct kvm_memory_slot *slot, gfn_t gfn, + int level) +{ + lpage_info_slot(gfn, slot, level)->disallow_lpage &= ~KVM_LPAGE_MIXED_FLAG; +} + +static void hugepage_set_mixed(struct kvm_memory_slot *slot, gfn_t gfn, + int level) +{ + lpage_info_slot(gfn, slot, level)->disallow_lpage |= KVM_LPAGE_MIXED_FLAG; +} + bool kvm_arch_pre_set_memory_attributes(struct kvm *kvm, struct kvm_gfn_range *range) { + struct kvm_memory_slot *slot = range->slot; + int level; + /* * Zap SPTEs even if the slot can't be mapped PRIVATE. KVM x86 only * supports KVM_MEMORY_ATTRIBUTE_PRIVATE, and so it *seems* like KVM @@ -7327,27 +7717,49 @@ bool kvm_arch_pre_set_memory_attributes(struct kvm *kvm, if (WARN_ON_ONCE(!kvm_arch_has_private_mem(kvm))) return false; - return kvm_unmap_gfn_range(kvm, range); -} + if (WARN_ON_ONCE(range->end <= range->start)) + return false; -static bool hugepage_test_mixed(struct kvm_memory_slot *slot, gfn_t gfn, - int level) -{ - return lpage_info_slot(gfn, slot, level)->disallow_lpage & KVM_LPAGE_MIXED_FLAG; -} + /* + * If the head and tail pages of the range currently allow a hugepage, + * i.e. reside fully in the slot and don't have mixed attributes, then + * add each corresponding hugepage range to the ongoing invalidation, + * e.g. to prevent KVM from creating a hugepage in response to a fault + * for a gfn whose attributes aren't changing. Note, only the range + * of gfns whose attributes are being modified needs to be explicitly + * unmapped, as that will unmap any existing hugepages. + */ + for (level = PG_LEVEL_2M; level <= KVM_MAX_HUGEPAGE_LEVEL; level++) { + gfn_t start = gfn_round_for_level(range->start, level); + gfn_t end = gfn_round_for_level(range->end - 1, level); + gfn_t nr_pages = KVM_PAGES_PER_HPAGE(level); -static void hugepage_clear_mixed(struct kvm_memory_slot *slot, gfn_t gfn, - int level) -{ - lpage_info_slot(gfn, slot, level)->disallow_lpage &= ~KVM_LPAGE_MIXED_FLAG; -} + if ((start != range->start || start + nr_pages > range->end) && + start >= slot->base_gfn && + start + nr_pages <= slot->base_gfn + slot->npages && + !hugepage_test_mixed(slot, start, level)) + kvm_mmu_invalidate_range_add(kvm, start, start + nr_pages); -static void hugepage_set_mixed(struct kvm_memory_slot *slot, gfn_t gfn, - int level) -{ - lpage_info_slot(gfn, slot, level)->disallow_lpage |= KVM_LPAGE_MIXED_FLAG; + if (end == start) + continue; + + if ((end + nr_pages) > range->end && + (end + nr_pages) <= (slot->base_gfn + slot->npages) && + !hugepage_test_mixed(slot, end, level)) + kvm_mmu_invalidate_range_add(kvm, end, end + nr_pages); + } + + /* Unmap the old attribute page. */ + if (range->arg.attributes & KVM_MEMORY_ATTRIBUTE_PRIVATE) + range->attr_filter = KVM_FILTER_SHARED; + else + range->attr_filter = KVM_FILTER_PRIVATE; + + return kvm_unmap_gfn_range(kvm, range); } + + static bool hugepage_has_attrs(struct kvm *kvm, struct kvm_memory_slot *slot, gfn_t gfn, int level, unsigned long attrs) { @@ -7355,7 +7767,7 @@ static bool hugepage_has_attrs(struct kvm *kvm, struct kvm_memory_slot *slot, const unsigned long end = start + KVM_PAGES_PER_HPAGE(level); if (level == PG_LEVEL_2M) - return kvm_range_has_memory_attributes(kvm, start, end, attrs); + return kvm_range_has_memory_attributes(kvm, start, end, ~0, attrs); for (gfn = start; gfn < end; gfn += KVM_PAGES_PER_HPAGE(level - 1)) { if (hugepage_test_mixed(slot, gfn, level - 1) || @@ -7399,7 +7811,8 @@ bool kvm_arch_post_set_memory_attributes(struct kvm *kvm, * by the memslot, KVM can't use a hugepage due to the * misaligned address regardless of memory attributes. */ - if (gfn >= slot->base_gfn) { + if (gfn >= slot->base_gfn && + gfn + nr_pages <= slot->base_gfn + slot->npages) { if (hugepage_has_attrs(kvm, slot, gfn, level, attrs)) hugepage_clear_mixed(slot, gfn, level); else diff --git a/arch/x86/kvm/mmu/mmu_internal.h b/arch/x86/kvm/mmu/mmu_internal.h index 5390a591a571..db8f33e4de62 100644 --- a/arch/x86/kvm/mmu/mmu_internal.h +++ b/arch/x86/kvm/mmu/mmu_internal.h @@ -6,6 +6,8 @@ #include <linux/kvm_host.h> #include <asm/kvm_host.h> +#include "mmu.h" + #ifdef CONFIG_KVM_PROVE_MMU #define KVM_MMU_WARN_ON(x) WARN_ON_ONCE(x) #else @@ -101,7 +103,22 @@ struct kvm_mmu_page { int root_count; refcount_t tdp_mmu_root_count; }; - unsigned int unsync_children; + union { + /* These two members aren't used for TDP MMU */ + struct { + unsigned int unsync_children; + /* + * Number of writes since the last time traversal + * visited this page. + */ + atomic_t write_flooding_count; + }; + /* + * Page table page of external PT. + * Passed to TDX module, not accessed by KVM. + */ + void *external_spt; + }; union { struct kvm_rmap_head parent_ptes; /* rmap pointers to parent sptes */ tdp_ptep_t ptep; @@ -124,9 +141,6 @@ struct kvm_mmu_page { int clear_spte_count; #endif - /* Number of writes since the last time traversal visited this page. */ - atomic_t write_flooding_count; - #ifdef CONFIG_X86_64 /* Used for freeing the page asynchronously if it is a TDP MMU page. */ struct rcu_head rcu_head; @@ -145,7 +159,36 @@ static inline int kvm_mmu_page_as_id(struct kvm_mmu_page *sp) return kvm_mmu_role_as_id(sp->role); } -static inline bool kvm_mmu_page_ad_need_write_protect(struct kvm_mmu_page *sp) +static inline bool is_mirror_sp(const struct kvm_mmu_page *sp) +{ + return sp->role.is_mirror; +} + +static inline void kvm_mmu_alloc_external_spt(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp) +{ + /* + * external_spt is allocated for TDX module to hold private EPT mappings, + * TDX module will initialize the page by itself. + * Therefore, KVM does not need to initialize or access external_spt. + * KVM only interacts with sp->spt for private EPT operations. + */ + sp->external_spt = kvm_mmu_memory_cache_alloc(&vcpu->arch.mmu_external_spt_cache); +} + +static inline gfn_t kvm_gfn_root_bits(const struct kvm *kvm, const struct kvm_mmu_page *root) +{ + /* + * Since mirror SPs are used only for TDX, which maps private memory + * at its "natural" GFN, no mask needs to be applied to them - and, dually, + * we expect that the bits is only used for the shared PT. + */ + if (is_mirror_sp(root)) + return 0; + return kvm_gfn_direct_bits(kvm); +} + +static inline bool kvm_mmu_page_ad_need_write_protect(struct kvm *kvm, + struct kvm_mmu_page *sp) { /* * When using the EPT page-modification log, the GPAs in the CPU dirty @@ -155,7 +198,7 @@ static inline bool kvm_mmu_page_ad_need_write_protect(struct kvm_mmu_page *sp) * being enabled is mandatory as the bits used to denote WP-only SPTEs * are reserved for PAE paging (32-bit KVM). */ - return kvm_x86_ops.cpu_dirty_log_size && sp->role.guest_mode; + return kvm->arch.cpu_dirty_log_size && sp->role.guest_mode; } static inline gfn_t gfn_round_for_level(gfn_t gfn, int level) @@ -164,7 +207,7 @@ static inline gfn_t gfn_round_for_level(gfn_t gfn, int level) } int mmu_try_to_unsync_pages(struct kvm *kvm, const struct kvm_memory_slot *slot, - gfn_t gfn, bool can_unsync, bool prefetch); + gfn_t gfn, bool synchronizing, bool prefetch); void kvm_mmu_gfn_disallow_lpage(const struct kvm_memory_slot *slot, gfn_t gfn); void kvm_mmu_gfn_allow_lpage(const struct kvm_memory_slot *slot, gfn_t gfn); @@ -190,7 +233,7 @@ static inline bool is_nx_huge_page_enabled(struct kvm *kvm) struct kvm_page_fault { /* arguments to kvm_mmu_do_page_fault. */ const gpa_t addr; - const u32 error_code; + const u64 error_code; const bool prefetch; /* Derived from error_code. */ @@ -229,16 +272,21 @@ struct kvm_page_fault { */ u8 goal_level; - /* Shifted addr, or result of guest page table walk if addr is a gva. */ + /* + * Shifted addr, or result of guest page table walk if addr is a gva. In + * the case of VM where memslot's can be mapped at multiple GPA aliases + * (i.e. TDX), the gfn field does not contain the bit that selects between + * the aliases (i.e. the shared bit for TDX). + */ gfn_t gfn; /* The memslot containing gfn. May be NULL. */ struct kvm_memory_slot *slot; - /* Outputs of kvm_faultin_pfn. */ + /* Outputs of kvm_mmu_faultin_pfn(). */ unsigned long mmu_seq; kvm_pfn_t pfn; - hva_t hva; + struct page *refcounted_page; bool map_writable; /* @@ -258,6 +306,8 @@ int kvm_tdp_page_fault(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault); * RET_PF_CONTINUE: So far, so good, keep handling the page fault. * RET_PF_RETRY: let CPU fault again on the address. * RET_PF_EMULATE: mmio page fault, emulate the instruction directly. + * RET_PF_WRITE_PROTECTED: the gfn is write-protected, either unprotected the + * gfn and retry, or emulate the instruction directly. * RET_PF_INVALID: the spte is invalid, let the real page fault path update it. * RET_PF_FIXED: The faulting entry has been fixed. * RET_PF_SPURIOUS: The faulting entry was already fixed, e.g. by another vCPU. @@ -266,21 +316,37 @@ int kvm_tdp_page_fault(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault); * tracepoints via TRACE_DEFINE_ENUM() in mmutrace.h * * Note, all values must be greater than or equal to zero so as not to encroach - * on -errno return values. Somewhat arbitrarily use '0' for CONTINUE, which - * will allow for efficient machine code when checking for CONTINUE, e.g. - * "TEST %rax, %rax, JNZ", as all "stop!" values are non-zero. + * on -errno return values. */ enum { RET_PF_CONTINUE = 0, RET_PF_RETRY, RET_PF_EMULATE, + RET_PF_WRITE_PROTECTED, RET_PF_INVALID, RET_PF_FIXED, RET_PF_SPURIOUS, }; +/* + * Define RET_PF_CONTINUE as 0 to allow for + * - efficient machine code when checking for CONTINUE, e.g. + * "TEST %rax, %rax, JNZ", as all "stop!" values are non-zero, + * - kvm_mmu_do_page_fault() to return other RET_PF_* as a positive value. + */ +static_assert(RET_PF_CONTINUE == 0); + +static inline void kvm_mmu_prepare_memory_fault_exit(struct kvm_vcpu *vcpu, + struct kvm_page_fault *fault) +{ + kvm_prepare_memory_fault_exit(vcpu, fault->gfn << PAGE_SHIFT, + PAGE_SIZE, fault->write, fault->exec, + fault->is_private); +} + static inline int kvm_mmu_do_page_fault(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa, - u32 err, bool prefetch, int *emulation_type) + u64 err, bool prefetch, + int *emulation_type, u8 *level) { struct kvm_page_fault fault = { .addr = cr2_or_gpa, @@ -298,55 +364,55 @@ static inline int kvm_mmu_do_page_fault(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa, .max_level = KVM_MAX_HUGEPAGE_LEVEL, .req_level = PG_LEVEL_4K, .goal_level = PG_LEVEL_4K, - .is_private = kvm_mem_is_private(vcpu->kvm, cr2_or_gpa >> PAGE_SHIFT), + .is_private = err & PFERR_PRIVATE_ACCESS, + + .pfn = KVM_PFN_ERR_FAULT, }; int r; if (vcpu->arch.mmu->root_role.direct) { - fault.gfn = fault.addr >> PAGE_SHIFT; + /* + * Things like memslots don't understand the concept of a shared + * bit. Strip it so that the GFN can be used like normal, and the + * fault.addr can be used when the shared bit is needed. + */ + fault.gfn = gpa_to_gfn(fault.addr) & ~kvm_gfn_direct_bits(vcpu->kvm); fault.slot = kvm_vcpu_gfn_to_memslot(vcpu, fault.gfn); } /* - * Async #PF "faults", a.k.a. prefetch faults, are not faults from the - * guest perspective and have already been counted at the time of the - * original fault. + * With retpoline being active an indirect call is rather expensive, + * so do a direct call in the most common case. */ - if (!prefetch) - vcpu->stat.pf_taken++; - if (IS_ENABLED(CONFIG_MITIGATION_RETPOLINE) && fault.is_tdp) r = kvm_tdp_page_fault(vcpu, &fault); else r = vcpu->arch.mmu->page_fault(vcpu, &fault); + /* + * Not sure what's happening, but punt to userspace and hope that + * they can fix it by changing memory to shared, or they can + * provide a better error. + */ + if (r == RET_PF_EMULATE && fault.is_private) { + pr_warn_ratelimited("kvm: unexpected emulation request on private memory\n"); + kvm_mmu_prepare_memory_fault_exit(vcpu, &fault); + return -EFAULT; + } + if (fault.write_fault_to_shadow_pgtable && emulation_type) *emulation_type |= EMULTYPE_WRITE_PF_TO_SP; + if (level) + *level = fault.goal_level; - /* - * Similar to above, prefetch faults aren't truly spurious, and the - * async #PF path doesn't do emulation. Do count faults that are fixed - * by the async #PF handler though, otherwise they'll never be counted. - */ - if (r == RET_PF_FIXED) - vcpu->stat.pf_fixed++; - else if (prefetch) - ; - else if (r == RET_PF_EMULATE) - vcpu->stat.pf_emulate++; - else if (r == RET_PF_SPURIOUS) - vcpu->stat.pf_spurious++; return r; } int kvm_mmu_max_mapping_level(struct kvm *kvm, - const struct kvm_memory_slot *slot, gfn_t gfn, - int max_level); + const struct kvm_memory_slot *slot, gfn_t gfn); void kvm_mmu_hugepage_adjust(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault); void disallowed_hugepage_adjust(struct kvm_page_fault *fault, u64 spte, int cur_level); -void *mmu_memory_cache_alloc(struct kvm_mmu_memory_cache *mc); - void track_possible_nx_huge_page(struct kvm *kvm, struct kvm_mmu_page *sp); void untrack_possible_nx_huge_page(struct kvm *kvm, struct kvm_mmu_page *sp); diff --git a/arch/x86/kvm/mmu/mmutrace.h b/arch/x86/kvm/mmu/mmutrace.h index ae86820cef69..f35a830ce469 100644 --- a/arch/x86/kvm/mmu/mmutrace.h +++ b/arch/x86/kvm/mmu/mmutrace.h @@ -57,6 +57,7 @@ TRACE_DEFINE_ENUM(RET_PF_CONTINUE); TRACE_DEFINE_ENUM(RET_PF_RETRY); TRACE_DEFINE_ENUM(RET_PF_EMULATE); +TRACE_DEFINE_ENUM(RET_PF_WRITE_PROTECTED); TRACE_DEFINE_ENUM(RET_PF_INVALID); TRACE_DEFINE_ENUM(RET_PF_FIXED); TRACE_DEFINE_ENUM(RET_PF_SPURIOUS); @@ -260,7 +261,7 @@ TRACE_EVENT( TP_STRUCT__entry( __field(int, vcpu_id) __field(gpa_t, cr2_or_gpa) - __field(u32, error_code) + __field(u64, error_code) __field(u64 *, sptep) __field(u64, old_spte) __field(u64, new_spte) diff --git a/arch/x86/kvm/mmu/page_track.c b/arch/x86/kvm/mmu/page_track.c index f6448284c18e..1b17b12393a8 100644 --- a/arch/x86/kvm/mmu/page_track.c +++ b/arch/x86/kvm/mmu/page_track.c @@ -41,7 +41,7 @@ bool kvm_page_track_write_tracking_enabled(struct kvm *kvm) void kvm_page_track_free_memslot(struct kvm_memory_slot *slot) { - kvfree(slot->arch.gfn_write_track); + vfree(slot->arch.gfn_write_track); slot->arch.gfn_write_track = NULL; } @@ -172,6 +172,9 @@ static int kvm_enable_external_write_tracking(struct kvm *kvm) struct kvm_memory_slot *slot; int r = 0, i, bkt; + if (kvm->arch.vm_type == KVM_X86_TDX_VM) + return -EOPNOTSUPP; + mutex_lock(&kvm->slots_arch_lock); /* diff --git a/arch/x86/kvm/mmu/paging_tmpl.h b/arch/x86/kvm/mmu/paging_tmpl.h index 4d4e98fe4f35..68e323568e95 100644 --- a/arch/x86/kvm/mmu/paging_tmpl.h +++ b/arch/x86/kvm/mmu/paging_tmpl.h @@ -497,21 +497,20 @@ error: * The other bits are set to 0. */ if (!(errcode & PFERR_RSVD_MASK)) { - vcpu->arch.exit_qualification &= (EPT_VIOLATION_GVA_IS_VALID | - EPT_VIOLATION_GVA_TRANSLATED); + walker->fault.exit_qualification = 0; + if (write_fault) - vcpu->arch.exit_qualification |= EPT_VIOLATION_ACC_WRITE; + walker->fault.exit_qualification |= EPT_VIOLATION_ACC_WRITE; if (user_fault) - vcpu->arch.exit_qualification |= EPT_VIOLATION_ACC_READ; + walker->fault.exit_qualification |= EPT_VIOLATION_ACC_READ; if (fetch_fault) - vcpu->arch.exit_qualification |= EPT_VIOLATION_ACC_INSTR; + walker->fault.exit_qualification |= EPT_VIOLATION_ACC_INSTR; /* * Note, pte_access holds the raw RWX bits from the EPTE, not * ACC_*_MASK flags! */ - vcpu->arch.exit_qualification |= (pte_access & VMX_EPT_RWX_MASK) << - EPT_VIOLATION_RWX_SHIFT; + walker->fault.exit_qualification |= EPT_VIOLATION_RWX_TO_PROT(pte_access); } #endif walker->fault.address = addr; @@ -533,10 +532,8 @@ static bool FNAME(prefetch_gpte)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp, u64 *spte, pt_element_t gpte) { - struct kvm_memory_slot *slot; unsigned pte_access; gfn_t gfn; - kvm_pfn_t pfn; if (FNAME(prefetch_invalid_gpte)(vcpu, sp, spte, gpte)) return false; @@ -545,17 +542,7 @@ FNAME(prefetch_gpte)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp, pte_access = sp->role.access & FNAME(gpte_access)(gpte); FNAME(protect_clean_gpte)(vcpu->arch.mmu, &pte_access, gpte); - slot = gfn_to_memslot_dirty_bitmap(vcpu, gfn, pte_access & ACC_WRITE_MASK); - if (!slot) - return false; - - pfn = gfn_to_pfn_memslot_atomic(slot, gfn); - if (is_error_pfn(pfn)) - return false; - - mmu_set_spte(vcpu, slot, spte, pte_access, gfn, pfn, NULL); - kvm_release_pfn_clean(pfn); - return true; + return kvm_mmu_prefetch_sptes(vcpu, gfn, spte, 1, pte_access); } static bool FNAME(gpte_changed)(struct kvm_vcpu *vcpu, @@ -646,10 +633,10 @@ static int FNAME(fetch)(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault, * really care if it changes underneath us after this point). */ if (FNAME(gpte_changed)(vcpu, gw, top_level)) - goto out_gpte_changed; + return RET_PF_RETRY; if (WARN_ON_ONCE(!VALID_PAGE(vcpu->arch.mmu->root.hpa))) - goto out_gpte_changed; + return RET_PF_RETRY; /* * Load a new root and retry the faulting instruction in the extremely @@ -659,7 +646,7 @@ static int FNAME(fetch)(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault, */ if (unlikely(kvm_mmu_is_dummy_root(vcpu->arch.mmu->root.hpa))) { kvm_make_request(KVM_REQ_MMU_FREE_OBSOLETE_ROOTS, vcpu); - goto out_gpte_changed; + return RET_PF_RETRY; } for_each_shadow_entry(vcpu, fault->addr, it) { @@ -674,34 +661,38 @@ static int FNAME(fetch)(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault, sp = kvm_mmu_get_child_sp(vcpu, it.sptep, table_gfn, false, access); - if (sp != ERR_PTR(-EEXIST)) { - /* - * We must synchronize the pagetable before linking it - * because the guest doesn't need to flush tlb when - * the gpte is changed from non-present to present. - * Otherwise, the guest may use the wrong mapping. - * - * For PG_LEVEL_4K, kvm_mmu_get_page() has already - * synchronized it transiently via kvm_sync_page(). - * - * For higher level pagetable, we synchronize it via - * the slower mmu_sync_children(). If it needs to - * break, some progress has been made; return - * RET_PF_RETRY and retry on the next #PF. - * KVM_REQ_MMU_SYNC is not necessary but it - * expedites the process. - */ - if (sp->unsync_children && - mmu_sync_children(vcpu, sp, false)) - return RET_PF_RETRY; - } + /* + * Synchronize the new page before linking it, as the CPU (KVM) + * is architecturally disallowed from inserting non-present + * entries into the TLB, i.e. the guest isn't required to flush + * the TLB when changing the gPTE from non-present to present. + * + * For PG_LEVEL_4K, kvm_mmu_find_shadow_page() has already + * synchronized the page via kvm_sync_page(). + * + * For higher level pages, which cannot be unsync themselves + * but can have unsync children, synchronize via the slower + * mmu_sync_children(). If KVM needs to drop mmu_lock due to + * contention or to reschedule, instruct the caller to retry + * the #PF (mmu_sync_children() ensures forward progress will + * be made). + */ + if (sp != ERR_PTR(-EEXIST) && sp->unsync_children && + mmu_sync_children(vcpu, sp, false)) + return RET_PF_RETRY; /* - * Verify that the gpte in the page we've just write - * protected is still there. + * Verify that the gpte in the page, which is now either + * write-protected or unsync, wasn't modified between the fault + * and acquiring mmu_lock. This needs to be done even when + * reusing an existing shadow page to ensure the information + * gathered by the walker matches the information stored in the + * shadow page (which could have been modified by a different + * vCPU even if the page was already linked). Holding mmu_lock + * prevents the shadow page from changing after this point. */ if (FNAME(gpte_changed)(vcpu, gw, it.level - 1)) - goto out_gpte_changed; + return RET_PF_RETRY; if (sp != ERR_PTR(-EEXIST)) link_shadow_page(vcpu, it.sptep, sp); @@ -755,9 +746,6 @@ static int FNAME(fetch)(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault, FNAME(pte_prefetch)(vcpu, gw, it.sptep); return ret; - -out_gpte_changed: - return RET_PF_RETRY; } /* @@ -805,14 +793,14 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault if (page_fault_handle_page_track(vcpu, fault)) { shadow_page_table_clear_flood(vcpu, fault->addr); - return RET_PF_EMULATE; + return RET_PF_WRITE_PROTECTED; } r = mmu_topup_memory_caches(vcpu, true); if (r) return r; - r = kvm_faultin_pfn(vcpu, fault, walker.pte_access); + r = kvm_mmu_faultin_pfn(vcpu, fault, walker.pte_access); if (r != RET_PF_CONTINUE) return r; @@ -847,8 +835,8 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault r = FNAME(fetch)(vcpu, fault, &walker); out_unlock: + kvm_mmu_finish_page_fault(vcpu, fault, r); write_unlock(&vcpu->kvm->mmu_lock); - kvm_release_pfn_clean(fault->pfn); return r; } @@ -891,9 +879,9 @@ static gpa_t FNAME(gva_to_gpa)(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu, /* * Using the information in sp->shadowed_translation (kvm_mmu_page_get_gfn()) is - * safe because: - * - The spte has a reference to the struct page, so the pfn for a given gfn - * can't change unless all sptes pointing to it are nuked first. + * safe because SPTEs are protected by mmu_notifiers and memslot generations, so + * the pfn for a given gfn can't change unless all SPTEs pointing to the gfn are + * nuked first. * * Returns * < 0: failed to sync spte @@ -911,7 +899,8 @@ static int FNAME(sync_spte)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp, int gpa_t pte_gpa; gfn_t gfn; - if (WARN_ON_ONCE(!sp->spt[i])) + if (WARN_ON_ONCE(sp->spt[i] == SHADOW_NONPRESENT_VALUE || + !sp->shadowed_translation)) return 0; first_pte_gpa = FNAME(get_level1_sp_gpa)(sp); @@ -933,13 +922,13 @@ static int FNAME(sync_spte)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp, int return 0; /* - * Drop the SPTE if the new protections would result in a RWX=0 - * SPTE or if the gfn is changing. The RWX=0 case only affects - * EPT with execute-only support, i.e. EPT without an effective - * "present" bit, as all other paging modes will create a - * read-only SPTE if pte_access is zero. + * Drop the SPTE if the new protections result in no effective + * "present" bit or if the gfn is changing. The former case + * only affects EPT with execute-only support with pte_access==0; + * all other paging modes will create a read-only SPTE if + * pte_access is zero. */ - if ((!pte_access && !shadow_present_mask) || + if ((pte_access | shadow_present_mask) == SHADOW_NONPRESENT_VALUE || gfn != kvm_mmu_page_get_gfn(sp, i)) { drop_spte(vcpu->kvm, &sp->spt[i]); return 1; @@ -961,9 +950,14 @@ static int FNAME(sync_spte)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp, int host_writable = spte & shadow_host_writable_mask; slot = kvm_vcpu_gfn_to_memslot(vcpu, gfn); make_spte(vcpu, sp, slot, pte_access, gfn, - spte_to_pfn(spte), spte, true, false, + spte_to_pfn(spte), spte, true, true, host_writable, &spte); + /* + * There is no need to mark the pfn dirty, as the new protections must + * be a subset of the old protections, i.e. synchronizing a SPTE cannot + * change the SPTE from read-only to writable. + */ return mmu_spte_update(sptep, spte); } diff --git a/arch/x86/kvm/mmu/spte.c b/arch/x86/kvm/mmu/spte.c index 4a599130e9c9..cfce03d8f123 100644 --- a/arch/x86/kvm/mmu/spte.c +++ b/arch/x86/kvm/mmu/spte.c @@ -24,6 +24,8 @@ static bool __ro_after_init allow_mmio_caching; module_param_named(mmio_caching, enable_mmio_caching, bool, 0444); EXPORT_SYMBOL_GPL(enable_mmio_caching); +bool __read_mostly kvm_ad_enabled; + u64 __read_mostly shadow_host_writable_mask; u64 __read_mostly shadow_mmu_writable_mask; u64 __read_mostly shadow_nx_mask; @@ -35,7 +37,6 @@ u64 __read_mostly shadow_mmio_value; u64 __read_mostly shadow_mmio_mask; u64 __read_mostly shadow_mmio_access_mask; u64 __read_mostly shadow_present_mask; -u64 __read_mostly shadow_memtype_mask; u64 __read_mostly shadow_me_value; u64 __read_mostly shadow_me_mask; u64 __read_mostly shadow_acc_track_mask; @@ -43,7 +44,25 @@ u64 __read_mostly shadow_acc_track_mask; u64 __read_mostly shadow_nonpresent_or_rsvd_mask; u64 __read_mostly shadow_nonpresent_or_rsvd_lower_gfn_mask; -u8 __read_mostly shadow_phys_bits; +static u8 __init kvm_get_host_maxphyaddr(void) +{ + /* + * boot_cpu_data.x86_phys_bits is reduced when MKTME or SME are detected + * in CPU detection code, but the processor treats those reduced bits as + * 'keyID' thus they are not reserved bits. Therefore KVM needs to look at + * the physical address bits reported by CPUID, i.e. the raw MAXPHYADDR, + * when reasoning about CPU behavior with respect to MAXPHYADDR. + */ + if (likely(boot_cpu_data.extended_cpuid_level >= 0x80000008)) + return cpuid_eax(0x80000008) & 0xff; + + /* + * Quite weird to have VMX or SVM but not MAXPHYADDR; probably a VM with + * custom CPUID. Proceed with whatever the kernel found since these features + * aren't virtualizable (SME/SEV also require CPUIDs higher than 0x80000008). + */ + return boot_cpu_data.x86_phys_bits; +} void __init kvm_mmu_spte_module_init(void) { @@ -55,6 +74,8 @@ void __init kvm_mmu_spte_module_init(void) * will change when the vendor module is (re)loaded. */ allow_mmio_caching = enable_mmio_caching; + + kvm_host.maxphyaddr = kvm_get_host_maxphyaddr(); } static u64 generation_mmio_spte_mask(u64 gen) @@ -74,10 +95,8 @@ u64 make_mmio_spte(struct kvm_vcpu *vcpu, u64 gfn, unsigned int access) u64 spte = generation_mmio_spte_mask(gen); u64 gpa = gfn << PAGE_SHIFT; - WARN_ON_ONCE(!shadow_mmio_value); - access &= shadow_mmio_access_mask; - spte |= shadow_mmio_value | access; + spte |= vcpu->kvm->arch.shadow_mmio_value | access; spte |= gpa | shadow_nonpresent_or_rsvd_mask; spte |= (gpa & shadow_nonpresent_or_rsvd_mask) << SHADOW_NONPRESENT_OR_RSVD_MASK_LEN; @@ -107,59 +126,60 @@ static bool kvm_is_mmio_pfn(kvm_pfn_t pfn) } /* - * Returns true if the SPTE has bits that may be set without holding mmu_lock. - * The caller is responsible for checking if the SPTE is shadow-present, and - * for determining whether or not the caller cares about non-leaf SPTEs. + * Returns true if the SPTE needs to be updated atomically due to having bits + * that may be changed without holding mmu_lock, and for which KVM must not + * lose information. E.g. KVM must not drop Dirty bit information. The caller + * is responsible for checking if the SPTE is shadow-present, and for + * determining whether or not the caller cares about non-leaf SPTEs. */ -bool spte_has_volatile_bits(u64 spte) +bool spte_needs_atomic_update(u64 spte) { - /* - * Always atomically update spte if it can be updated - * out of mmu-lock, it can ensure dirty bit is not lost, - * also, it can help us to get a stable is_writable_pte() - * to ensure tlb flush is not missed. - */ + /* SPTEs can be made Writable bit by KVM's fast page fault handler. */ if (!is_writable_pte(spte) && is_mmu_writable_spte(spte)) return true; - if (is_access_track_spte(spte)) + /* + * A/D-disabled SPTEs can be access-tracked by aging, and access-tracked + * SPTEs can be restored by KVM's fast page fault handler. + */ + if (!spte_ad_enabled(spte)) return true; - if (spte_ad_enabled(spte)) { - if (!(spte & shadow_accessed_mask) || - (is_writable_pte(spte) && !(spte & shadow_dirty_mask))) - return true; - } - - return false; + /* + * Dirty and Accessed bits can be set by the CPU. Ignore the Accessed + * bit, as KVM tolerates false negatives/positives, e.g. KVM doesn't + * invalidate TLBs when aging SPTEs, and so it's safe to clobber the + * Accessed bit (and rare in practice). + */ + return is_writable_pte(spte) && !(spte & shadow_dirty_mask); } bool make_spte(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp, const struct kvm_memory_slot *slot, unsigned int pte_access, gfn_t gfn, kvm_pfn_t pfn, - u64 old_spte, bool prefetch, bool can_unsync, + u64 old_spte, bool prefetch, bool synchronizing, bool host_writable, u64 *new_spte) { int level = sp->role.level; u64 spte = SPTE_MMU_PRESENT_MASK; bool wrprot = false; - WARN_ON_ONCE(!pte_access && !shadow_present_mask); + /* + * For the EPT case, shadow_present_mask has no RWX bits set if + * exec-only page table entries are supported. In that case, + * ACC_USER_MASK and shadow_user_mask are used to represent + * read access. See FNAME(gpte_access) in paging_tmpl.h. + */ + WARN_ON_ONCE((pte_access | shadow_present_mask) == SHADOW_NONPRESENT_VALUE); if (sp->role.ad_disabled) spte |= SPTE_TDP_AD_DISABLED; - else if (kvm_mmu_page_ad_need_write_protect(sp)) + else if (kvm_mmu_page_ad_need_write_protect(vcpu->kvm, sp)) spte |= SPTE_TDP_AD_WRPROT_ONLY; - /* - * For the EPT case, shadow_present_mask is 0 if hardware - * supports exec-only page table entries. In that case, - * ACC_USER_MASK and shadow_user_mask are used to represent - * read access. See FNAME(gpte_access) in paging_tmpl.h. - */ spte |= shadow_present_mask; - if (!prefetch) - spte |= spte_shadow_accessed_mask(spte); + if (!prefetch || synchronizing) + spte |= shadow_accessed_mask; /* * For simplicity, enforce the NX huge page mitigation even if not @@ -189,9 +209,7 @@ bool make_spte(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp, if (level > PG_LEVEL_4K) spte |= PT_PAGE_SIZE_MASK; - if (shadow_memtype_mask) - spte |= static_call(kvm_x86_get_mt_mask)(vcpu, gfn, - kvm_is_mmio_pfn(pfn)); + spte |= kvm_x86_call(get_mt_mask)(vcpu, gfn, kvm_is_mmio_pfn(pfn)); if (host_writable) spte |= shadow_host_writable_mask; else @@ -203,41 +221,39 @@ bool make_spte(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp, spte |= (u64)pfn << PAGE_SHIFT; if (pte_access & ACC_WRITE_MASK) { - spte |= PT_WRITABLE_MASK | shadow_mmu_writable_mask; - - /* - * Optimization: for pte sync, if spte was writable the hash - * lookup is unnecessary (and expensive). Write protection - * is responsibility of kvm_mmu_get_page / kvm_mmu_sync_roots. - * Same reasoning can be applied to dirty page accounting. - */ - if (is_writable_pte(old_spte)) - goto out; - /* * Unsync shadow pages that are reachable by the new, writable * SPTE. Write-protect the SPTE if the page can't be unsync'd, * e.g. it's write-tracked (upper-level SPs) or has one or more * shadow pages and unsync'ing pages is not allowed. + * + * When overwriting an existing leaf SPTE, and the old SPTE was + * writable, skip trying to unsync shadow pages as any relevant + * shadow pages must already be unsync, i.e. the hash lookup is + * unnecessary (and expensive). Note, this relies on KVM not + * changing PFNs without first zapping the old SPTE, which is + * guaranteed by both the shadow MMU and the TDP MMU. */ - if (mmu_try_to_unsync_pages(vcpu->kvm, slot, gfn, can_unsync, prefetch)) { + if ((!is_last_spte(old_spte, level) || !is_writable_pte(old_spte)) && + mmu_try_to_unsync_pages(vcpu->kvm, slot, gfn, synchronizing, prefetch)) wrprot = true; - pte_access &= ~ACC_WRITE_MASK; - spte &= ~(PT_WRITABLE_MASK | shadow_mmu_writable_mask); - } + else + spte |= PT_WRITABLE_MASK | shadow_mmu_writable_mask | + shadow_dirty_mask; } - if (pte_access & ACC_WRITE_MASK) - spte |= spte_shadow_dirty_mask(spte); - -out: - if (prefetch) + if (prefetch && !synchronizing) spte = mark_spte_for_access_track(spte); WARN_ONCE(is_rsvd_spte(&vcpu->arch.mmu->shadow_zero_check, spte, level), "spte = 0x%llx, level = %d, rsvd bits = 0x%llx", spte, level, get_rsvd_bits(&vcpu->arch.mmu->shadow_zero_check, spte, level)); + /* + * Mark the memslot dirty *after* modifying it for access tracking. + * Unlike folios, memslots can be safely marked dirty out of mmu_lock, + * i.e. in the fast page fault handler. + */ if ((spte & PT_WRITABLE_MASK) && kvm_slot_dirty_track_enabled(slot)) { /* Enforced by kvm_mmu_hugepage_adjust. */ WARN_ON_ONCE(level > PG_LEVEL_4K); @@ -248,15 +264,15 @@ out: return wrprot; } -static u64 make_spte_executable(u64 spte) +static u64 modify_spte_protections(u64 spte, u64 set, u64 clear) { bool is_access_track = is_access_track_spte(spte); if (is_access_track) spte = restore_acc_track_spte(spte); - spte &= ~shadow_nx_mask; - spte |= shadow_x_mask; + KVM_MMU_WARN_ON(set & clear); + spte = (spte | set) & ~clear; if (is_access_track) spte = mark_spte_for_access_track(spte); @@ -264,6 +280,16 @@ static u64 make_spte_executable(u64 spte) return spte; } +static u64 make_spte_executable(u64 spte) +{ + return modify_spte_protections(spte, shadow_x_mask, shadow_nx_mask); +} + +static u64 make_spte_nonexecutable(u64 spte) +{ + return modify_spte_protections(spte, shadow_nx_mask, shadow_x_mask); +} + /* * Construct an SPTE that maps a sub-page of the given huge page SPTE where * `index` identifies which sub-page. @@ -271,18 +297,12 @@ static u64 make_spte_executable(u64 spte) * This is used during huge page splitting to build the SPTEs that make up the * new page table. */ -u64 make_huge_page_split_spte(struct kvm *kvm, u64 huge_spte, union kvm_mmu_page_role role, - int index) +u64 make_small_spte(struct kvm *kvm, u64 huge_spte, + union kvm_mmu_page_role role, int index) { - u64 child_spte; - - if (WARN_ON_ONCE(!is_shadow_present_pte(huge_spte))) - return 0; + u64 child_spte = huge_spte; - if (WARN_ON_ONCE(!is_large_pte(huge_spte))) - return 0; - - child_spte = huge_spte; + KVM_BUG_ON(!is_shadow_present_pte(huge_spte) || !is_large_pte(huge_spte), kvm); /* * The child_spte already has the base address of the huge page being @@ -306,6 +326,26 @@ u64 make_huge_page_split_spte(struct kvm *kvm, u64 huge_spte, union kvm_mmu_page return child_spte; } +u64 make_huge_spte(struct kvm *kvm, u64 small_spte, int level) +{ + u64 huge_spte; + + KVM_BUG_ON(!is_shadow_present_pte(small_spte) || level == PG_LEVEL_4K, kvm); + + huge_spte = small_spte | PT_PAGE_SIZE_MASK; + + /* + * huge_spte already has the address of the sub-page being collapsed + * from small_spte, so just clear the lower address bits to create the + * huge page address. + */ + huge_spte &= KVM_HPAGE_MASK(level) | ~PAGE_MASK; + + if (is_nx_huge_page_enabled(kvm)) + huge_spte = make_spte_nonexecutable(huge_spte); + + return huge_spte; +} u64 make_nonleaf_spte(u64 *child_pt, bool ad_disabled) { @@ -322,22 +362,6 @@ u64 make_nonleaf_spte(u64 *child_pt, bool ad_disabled) return spte; } -u64 kvm_mmu_changed_pte_notifier_make_spte(u64 old_spte, kvm_pfn_t new_pfn) -{ - u64 new_spte; - - new_spte = old_spte & ~SPTE_BASE_ADDR_MASK; - new_spte |= (u64)new_pfn << PAGE_SHIFT; - - new_spte &= ~PT_WRITABLE_MASK; - new_spte &= ~shadow_host_writable_mask; - new_spte &= ~shadow_mmu_writable_mask; - - new_spte = mark_spte_for_access_track(new_spte); - - return new_spte; -} - u64 mark_spte_for_access_track(u64 spte) { if (spte_ad_enabled(spte)) @@ -354,7 +378,7 @@ u64 mark_spte_for_access_track(u64 spte) spte |= (spte & SHADOW_ACC_TRACK_SAVED_BITS_MASK) << SHADOW_ACC_TRACK_SAVED_BITS_SHIFT; - spte &= ~shadow_acc_track_mask; + spte &= ~(shadow_acc_track_mask | shadow_accessed_mask); return spte; } @@ -393,13 +417,13 @@ void kvm_mmu_set_mmio_spte_mask(u64 mmio_value, u64 mmio_mask, u64 access_mask) mmio_value = 0; /* - * The masked MMIO value must obviously match itself and a removed SPTE - * must not get a false positive. Removed SPTEs and MMIO SPTEs should - * never collide as MMIO must set some RWX bits, and removed SPTEs must + * The masked MMIO value must obviously match itself and a frozen SPTE + * must not get a false positive. Frozen SPTEs and MMIO SPTEs should + * never collide as MMIO must set some RWX bits, and frozen SPTEs must * not set any RWX bits. */ if (WARN_ON((mmio_value & mmio_mask) != mmio_value) || - WARN_ON(mmio_value && (REMOVED_SPTE & mmio_mask) == mmio_value)) + WARN_ON(mmio_value && (FROZEN_SPTE & mmio_mask) == mmio_value)) mmio_value = 0; if (!mmio_value) @@ -411,6 +435,12 @@ void kvm_mmu_set_mmio_spte_mask(u64 mmio_value, u64 mmio_mask, u64 access_mask) } EXPORT_SYMBOL_GPL(kvm_mmu_set_mmio_spte_mask); +void kvm_mmu_set_mmio_spte_value(struct kvm *kvm, u64 mmio_value) +{ + kvm->arch.shadow_mmio_value = mmio_value; +} +EXPORT_SYMBOL_GPL(kvm_mmu_set_mmio_spte_value); + void kvm_mmu_set_me_spte_mask(u64 me_value, u64 me_mask) { /* shadow_me_value must be a subset of shadow_me_mask */ @@ -424,19 +454,17 @@ EXPORT_SYMBOL_GPL(kvm_mmu_set_me_spte_mask); void kvm_mmu_set_ept_masks(bool has_ad_bits, bool has_exec_only) { + kvm_ad_enabled = has_ad_bits; + shadow_user_mask = VMX_EPT_READABLE_MASK; - shadow_accessed_mask = has_ad_bits ? VMX_EPT_ACCESS_BIT : 0ull; - shadow_dirty_mask = has_ad_bits ? VMX_EPT_DIRTY_BIT : 0ull; + shadow_accessed_mask = VMX_EPT_ACCESS_BIT; + shadow_dirty_mask = VMX_EPT_DIRTY_BIT; shadow_nx_mask = 0ull; shadow_x_mask = VMX_EPT_EXECUTABLE_MASK; - shadow_present_mask = has_exec_only ? 0ull : VMX_EPT_READABLE_MASK; - /* - * EPT overrides the host MTRRs, and so KVM must program the desired - * memtype directly into the SPTEs. Note, this mask is just the mask - * of all bits that factor into the memtype, the actual memtype must be - * dynamically calculated, e.g. to ensure host MMIO is mapped UC. - */ - shadow_memtype_mask = VMX_EPT_MT_MASK | VMX_EPT_IPAT_BIT; + /* VMX_EPT_SUPPRESS_VE_BIT is needed for W or X violation. */ + shadow_present_mask = + (has_exec_only ? 0ull : VMX_EPT_READABLE_MASK) | VMX_EPT_SUPPRESS_VE_BIT; + shadow_acc_track_mask = VMX_EPT_RWX_MASK; shadow_host_writable_mask = EPT_SPTE_HOST_WRITABLE; shadow_mmu_writable_mask = EPT_SPTE_MMU_WRITABLE; @@ -446,7 +474,7 @@ void kvm_mmu_set_ept_masks(bool has_ad_bits, bool has_exec_only) * of an EPT paging-structure entry is 110b (write/execute). */ kvm_mmu_set_mmio_spte_mask(VMX_EPT_MISCONFIG_WX_VALUE, - VMX_EPT_RWX_MASK, 0); + VMX_EPT_RWX_MASK | VMX_EPT_SUPPRESS_VE_BIT, 0); } EXPORT_SYMBOL_GPL(kvm_mmu_set_ept_masks); @@ -455,7 +483,7 @@ void kvm_mmu_reset_all_pte_masks(void) u8 low_phys_bits; u64 mask; - shadow_phys_bits = kvm_get_shadow_phys_bits(); + kvm_ad_enabled = true; /* * If the CPU has 46 or less physical address bits, then set an @@ -488,12 +516,6 @@ void kvm_mmu_reset_all_pte_masks(void) shadow_x_mask = 0; shadow_present_mask = PT_PRESENT_MASK; - /* - * For shadow paging and NPT, KVM uses PAT entry '0' to encode WB - * memtype in the SPTEs, i.e. relies on host MTRRs to provide the - * correct memtype (WB is the "weakest" memtype). - */ - shadow_memtype_mask = 0; shadow_acc_track_mask = 0; shadow_me_mask = 0; shadow_me_value = 0; @@ -508,7 +530,7 @@ void kvm_mmu_reset_all_pte_masks(void) * 52-bit physical addresses then there are no reserved PA bits in the * PTEs and so the reserved PA approach must be disabled. */ - if (shadow_phys_bits < 52) + if (kvm_host.maxphyaddr < 52) mask = BIT_ULL(51) | PT_PRESENT_MASK; else mask = 0; diff --git a/arch/x86/kvm/mmu/spte.h b/arch/x86/kvm/mmu/spte.h index a129951c9a88..1e94f081bdaf 100644 --- a/arch/x86/kvm/mmu/spte.h +++ b/arch/x86/kvm/mmu/spte.h @@ -3,6 +3,8 @@ #ifndef KVM_X86_MMU_SPTE_H #define KVM_X86_MMU_SPTE_H +#include <asm/vmx.h> + #include "mmu.h" #include "mmu_internal.h" @@ -149,6 +151,31 @@ static_assert(MMIO_SPTE_GEN_LOW_BITS == 8 && MMIO_SPTE_GEN_HIGH_BITS == 11); #define MMIO_SPTE_GEN_MASK GENMASK_ULL(MMIO_SPTE_GEN_LOW_BITS + MMIO_SPTE_GEN_HIGH_BITS - 1, 0) +/* + * Non-present SPTE value needs to set bit 63 for TDX, in order to suppress + * #VE and get EPT violations on non-present PTEs. We can use the + * same value also without TDX for both VMX and SVM: + * + * For SVM NPT, for non-present spte (bit 0 = 0), other bits are ignored. + * For VMX EPT, bit 63 is ignored if #VE is disabled. (EPT_VIOLATION_VE=0) + * bit 63 is #VE suppress if #VE is enabled. (EPT_VIOLATION_VE=1) + */ +#ifdef CONFIG_X86_64 +#define SHADOW_NONPRESENT_VALUE BIT_ULL(63) +static_assert(!(SHADOW_NONPRESENT_VALUE & SPTE_MMU_PRESENT_MASK)); +#else +#define SHADOW_NONPRESENT_VALUE 0ULL +#endif + + +/* + * True if A/D bits are supported in hardware and are enabled by KVM. When + * enabled, KVM uses A/D bits for all non-nested MMUs. Because L1 can disable + * A/D bits in EPTP12, SP and SPTE variants are needed to handle the scenario + * where KVM is using A/D bits for L1, but not L2. + */ +extern bool __read_mostly kvm_ad_enabled; + extern u64 __read_mostly shadow_host_writable_mask; extern u64 __read_mostly shadow_mmu_writable_mask; extern u64 __read_mostly shadow_nx_mask; @@ -160,7 +187,6 @@ extern u64 __read_mostly shadow_mmio_value; extern u64 __read_mostly shadow_mmio_mask; extern u64 __read_mostly shadow_mmio_access_mask; extern u64 __read_mostly shadow_present_mask; -extern u64 __read_mostly shadow_memtype_mask; extern u64 __read_mostly shadow_me_value; extern u64 __read_mostly shadow_me_mask; @@ -184,24 +210,24 @@ extern u64 __read_mostly shadow_nonpresent_or_rsvd_mask; /* * If a thread running without exclusive control of the MMU lock must perform a - * multi-part operation on an SPTE, it can set the SPTE to REMOVED_SPTE as a + * multi-part operation on an SPTE, it can set the SPTE to FROZEN_SPTE as a * non-present intermediate value. Other threads which encounter this value * should not modify the SPTE. * * Use a semi-arbitrary value that doesn't set RWX bits, i.e. is not-present on * both AMD and Intel CPUs, and doesn't set PFN bits, i.e. doesn't create a L1TF - * vulnerability. Use only low bits to avoid 64-bit immediates. + * vulnerability. * * Only used by the TDP MMU. */ -#define REMOVED_SPTE 0x5a0ULL +#define FROZEN_SPTE (SHADOW_NONPRESENT_VALUE | 0x5a0ULL) -/* Removed SPTEs must not be misconstrued as shadow present PTEs. */ -static_assert(!(REMOVED_SPTE & SPTE_MMU_PRESENT_MASK)); +/* Frozen SPTEs must not be misconstrued as shadow present PTEs. */ +static_assert(!(FROZEN_SPTE & SPTE_MMU_PRESENT_MASK)); -static inline bool is_removed_spte(u64 spte) +static inline bool is_frozen_spte(u64 spte) { - return spte == REMOVED_SPTE; + return spte == FROZEN_SPTE; } /* Get an SPTE's index into its parent's page table (and the spt array). */ @@ -249,9 +275,14 @@ static inline struct kvm_mmu_page *root_to_sp(hpa_t root) return spte_to_child_sp(root); } -static inline bool is_mmio_spte(u64 spte) +static inline bool is_mirror_sptep(tdp_ptep_t sptep) { - return (spte & shadow_mmio_mask) == shadow_mmio_value && + return is_mirror_sp(sptep_to_sp(rcu_dereference(sptep))); +} + +static inline bool is_mmio_spte(struct kvm *kvm, u64 spte) +{ + return (spte & shadow_mmio_mask) == kvm->arch.shadow_mmio_value && likely(enable_mmio_caching); } @@ -260,15 +291,11 @@ static inline bool is_shadow_present_pte(u64 pte) return !!(pte & SPTE_MMU_PRESENT_MASK); } -/* - * Returns true if A/D bits are supported in hardware and are enabled by KVM. - * When enabled, KVM uses A/D bits for all non-nested MMUs. Because L1 can - * disable A/D bits in EPTP12, SP and SPTE variants are needed to handle the - * scenario where KVM is using A/D bits for L1, but not L2. - */ -static inline bool kvm_ad_enabled(void) +static inline bool is_ept_ve_possible(u64 spte) { - return !!shadow_accessed_mask; + return (shadow_present_mask & VMX_EPT_SUPPRESS_VE_BIT) && + !(spte & VMX_EPT_SUPPRESS_VE_BIT) && + (spte & VMX_EPT_RWX_MASK) != VMX_EPT_MISCONFIG_WX_VALUE; } static inline bool sp_ad_disabled(struct kvm_mmu_page *sp) @@ -293,18 +320,6 @@ static inline bool spte_ad_need_write_protect(u64 spte) return (spte & SPTE_TDP_AD_MASK) != SPTE_TDP_AD_ENABLED; } -static inline u64 spte_shadow_accessed_mask(u64 spte) -{ - KVM_MMU_WARN_ON(!is_shadow_present_pte(spte)); - return spte_ad_enabled(spte) ? shadow_accessed_mask : 0; -} - -static inline u64 spte_shadow_dirty_mask(u64 spte) -{ - KVM_MMU_WARN_ON(!is_shadow_present_pte(spte)); - return spte_ad_enabled(spte) ? shadow_dirty_mask : 0; -} - static inline bool is_access_track_spte(u64 spte) { return !spte_ad_enabled(spte) && (spte & shadow_acc_track_mask) == 0; @@ -332,17 +347,7 @@ static inline kvm_pfn_t spte_to_pfn(u64 pte) static inline bool is_accessed_spte(u64 spte) { - u64 accessed_mask = spte_shadow_accessed_mask(spte); - - return accessed_mask ? spte & accessed_mask - : !is_access_track_spte(spte); -} - -static inline bool is_dirty_spte(u64 spte) -{ - u64 dirty_mask = spte_shadow_dirty_mask(spte); - - return dirty_mask ? spte & dirty_mask : spte & PT_WRITABLE_MASK; + return spte & shadow_accessed_mask; } static inline u64 get_rsvd_bits(struct rsvd_bits_validate *rsvd_check, u64 pte, @@ -460,6 +465,50 @@ static inline bool is_mmu_writable_spte(u64 spte) return spte & shadow_mmu_writable_mask; } +/* + * Returns true if the access indicated by @fault is allowed by the existing + * SPTE protections. Note, the caller is responsible for checking that the + * SPTE is a shadow-present, leaf SPTE (either before or after). + */ +static inline bool is_access_allowed(struct kvm_page_fault *fault, u64 spte) +{ + if (fault->exec) + return is_executable_pte(spte); + + if (fault->write) + return is_writable_pte(spte); + + /* Fault was on Read access */ + return spte & PT_PRESENT_MASK; +} + +/* + * If the MMU-writable flag is cleared, i.e. the SPTE is write-protected for + * write-tracking, remote TLBs must be flushed, even if the SPTE was read-only, + * as KVM allows stale Writable TLB entries to exist. When dirty logging, KVM + * flushes TLBs based on whether or not dirty bitmap/ring entries were reaped, + * not whether or not SPTEs were modified, i.e. only the write-tracking case + * needs to flush at the time the SPTEs is modified, before dropping mmu_lock. + * + * Don't flush if the Accessed bit is cleared, as access tracking tolerates + * false negatives, e.g. KVM x86 omits TLB flushes even when aging SPTEs for a + * mmu_notifier.clear_flush_young() event. + * + * Lastly, don't flush if the Dirty bit is cleared, as KVM unconditionally + * flushes when enabling dirty logging (see kvm_mmu_slot_apply_flags()), and + * when clearing dirty logs, KVM flushes based on whether or not dirty entries + * were reaped from the bitmap/ring, not whether or not dirty SPTEs were found. + * + * Note, this logic only applies to shadow-present leaf SPTEs. The caller is + * responsible for checking that the old SPTE is shadow-present, and is also + * responsible for determining whether or not a TLB flush is required when + * modifying a shadow-present non-leaf SPTE. + */ +static inline bool leaf_spte_change_needs_tlb_flush(u64 old_spte, u64 new_spte) +{ + return is_mmu_writable_spte(old_spte) && !is_mmu_writable_spte(new_spte); +} + static inline u64 get_mmio_spte_generation(u64 spte) { u64 gen; @@ -469,15 +518,16 @@ static inline u64 get_mmio_spte_generation(u64 spte) return gen; } -bool spte_has_volatile_bits(u64 spte); +bool spte_needs_atomic_update(u64 spte); bool make_spte(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp, const struct kvm_memory_slot *slot, unsigned int pte_access, gfn_t gfn, kvm_pfn_t pfn, - u64 old_spte, bool prefetch, bool can_unsync, + u64 old_spte, bool prefetch, bool synchronizing, bool host_writable, u64 *new_spte); -u64 make_huge_page_split_spte(struct kvm *kvm, u64 huge_spte, - union kvm_mmu_page_role role, int index); +u64 make_small_spte(struct kvm *kvm, u64 huge_spte, + union kvm_mmu_page_role role, int index); +u64 make_huge_spte(struct kvm *kvm, u64 small_spte, int level); u64 make_nonleaf_spte(u64 *child_pt, bool ad_disabled); u64 make_mmio_spte(struct kvm_vcpu *vcpu, u64 gfn, unsigned int access); u64 mark_spte_for_access_track(u64 spte); @@ -496,8 +546,6 @@ static inline u64 restore_acc_track_spte(u64 spte) return spte; } -u64 kvm_mmu_changed_pte_notifier_make_spte(u64 old_spte, kvm_pfn_t new_pfn); - void __init kvm_mmu_spte_module_init(void); void kvm_mmu_reset_all_pte_masks(void); diff --git a/arch/x86/kvm/mmu/tdp_iter.c b/arch/x86/kvm/mmu/tdp_iter.c index 04c247bfe318..9e17bfa80901 100644 --- a/arch/x86/kvm/mmu/tdp_iter.c +++ b/arch/x86/kvm/mmu/tdp_iter.c @@ -12,7 +12,7 @@ static void tdp_iter_refresh_sptep(struct tdp_iter *iter) { iter->sptep = iter->pt_path[iter->level - 1] + - SPTE_INDEX(iter->gfn << PAGE_SHIFT, iter->level); + SPTE_INDEX((iter->gfn | iter->gfn_bits) << PAGE_SHIFT, iter->level); iter->old_spte = kvm_tdp_mmu_read_spte(iter->sptep); } @@ -37,15 +37,17 @@ void tdp_iter_restart(struct tdp_iter *iter) * rooted at root_pt, starting with the walk to translate next_last_level_gfn. */ void tdp_iter_start(struct tdp_iter *iter, struct kvm_mmu_page *root, - int min_level, gfn_t next_last_level_gfn) + int min_level, gfn_t next_last_level_gfn, gfn_t gfn_bits) { if (WARN_ON_ONCE(!root || (root->role.level < 1) || - (root->role.level > PT64_ROOT_MAX_LEVEL))) { + (root->role.level > PT64_ROOT_MAX_LEVEL) || + (gfn_bits && next_last_level_gfn >= gfn_bits))) { iter->valid = false; return; } iter->next_last_level_gfn = next_last_level_gfn; + iter->gfn_bits = gfn_bits; iter->root_level = root->role.level; iter->min_level = min_level; iter->pt_path[iter->root_level - 1] = (tdp_ptep_t)root->spt; @@ -113,7 +115,7 @@ static bool try_step_side(struct tdp_iter *iter) * Check if the iterator is already at the end of the current page * table. */ - if (SPTE_INDEX(iter->gfn << PAGE_SHIFT, iter->level) == + if (SPTE_INDEX((iter->gfn | iter->gfn_bits) << PAGE_SHIFT, iter->level) == (SPTE_ENT_PER_PAGE - 1)) return false; diff --git a/arch/x86/kvm/mmu/tdp_iter.h b/arch/x86/kvm/mmu/tdp_iter.h index fae559559a80..364c5da6c499 100644 --- a/arch/x86/kvm/mmu/tdp_iter.h +++ b/arch/x86/kvm/mmu/tdp_iter.h @@ -21,37 +21,39 @@ static inline u64 kvm_tdp_mmu_read_spte(tdp_ptep_t sptep) static inline u64 kvm_tdp_mmu_write_spte_atomic(tdp_ptep_t sptep, u64 new_spte) { + KVM_MMU_WARN_ON(is_ept_ve_possible(new_spte)); return xchg(rcu_dereference(sptep), new_spte); } +static inline u64 tdp_mmu_clear_spte_bits_atomic(tdp_ptep_t sptep, u64 mask) +{ + atomic64_t *sptep_atomic = (atomic64_t *)rcu_dereference(sptep); + + return (u64)atomic64_fetch_and(~mask, sptep_atomic); +} + static inline void __kvm_tdp_mmu_write_spte(tdp_ptep_t sptep, u64 new_spte) { + KVM_MMU_WARN_ON(is_ept_ve_possible(new_spte)); WRITE_ONCE(*rcu_dereference(sptep), new_spte); } /* - * SPTEs must be modified atomically if they are shadow-present, leaf - * SPTEs, and have volatile bits, i.e. has bits that can be set outside - * of mmu_lock. The Writable bit can be set by KVM's fast page fault - * handler, and Accessed and Dirty bits can be set by the CPU. - * - * Note, non-leaf SPTEs do have Accessed bits and those bits are - * technically volatile, but KVM doesn't consume the Accessed bit of - * non-leaf SPTEs, i.e. KVM doesn't care if it clobbers the bit. This - * logic needs to be reassessed if KVM were to use non-leaf Accessed - * bits, e.g. to skip stepping down into child SPTEs when aging SPTEs. + * SPTEs must be modified atomically if they are shadow-present, leaf SPTEs, + * and have volatile bits (bits that can be set outside of mmu_lock) that + * must not be clobbered. */ -static inline bool kvm_tdp_mmu_spte_need_atomic_write(u64 old_spte, int level) +static inline bool kvm_tdp_mmu_spte_need_atomic_update(u64 old_spte, int level) { return is_shadow_present_pte(old_spte) && is_last_spte(old_spte, level) && - spte_has_volatile_bits(old_spte); + spte_needs_atomic_update(old_spte); } static inline u64 kvm_tdp_mmu_write_spte(tdp_ptep_t sptep, u64 old_spte, u64 new_spte, int level) { - if (kvm_tdp_mmu_spte_need_atomic_write(old_spte, level)) + if (kvm_tdp_mmu_spte_need_atomic_update(old_spte, level)) return kvm_tdp_mmu_write_spte_atomic(sptep, new_spte); __kvm_tdp_mmu_write_spte(sptep, new_spte); @@ -61,12 +63,8 @@ static inline u64 kvm_tdp_mmu_write_spte(tdp_ptep_t sptep, u64 old_spte, static inline u64 tdp_mmu_clear_spte_bits(tdp_ptep_t sptep, u64 old_spte, u64 mask, int level) { - atomic64_t *sptep_atomic; - - if (kvm_tdp_mmu_spte_need_atomic_write(old_spte, level)) { - sptep_atomic = (atomic64_t *)rcu_dereference(sptep); - return (u64)atomic64_fetch_and(~mask, sptep_atomic); - } + if (kvm_tdp_mmu_spte_need_atomic_update(old_spte, level)) + return tdp_mmu_clear_spte_bits_atomic(sptep, mask); __kvm_tdp_mmu_write_spte(sptep, old_spte & ~mask); return old_spte; @@ -91,8 +89,10 @@ struct tdp_iter { tdp_ptep_t pt_path[PT64_ROOT_MAX_LEVEL]; /* A pointer to the current SPTE */ tdp_ptep_t sptep; - /* The lowest GFN mapped by the current SPTE */ + /* The lowest GFN (mask bits excluded) mapped by the current SPTE */ gfn_t gfn; + /* Mask applied to convert the GFN to the mapping GPA */ + gfn_t gfn_bits; /* The level of the root page given to the iterator */ int root_level; /* The lowest level the iterator should traverse to */ @@ -120,18 +120,23 @@ struct tdp_iter { * Iterates over every SPTE mapping the GFN range [start, end) in a * preorder traversal. */ -#define for_each_tdp_pte_min_level(iter, root, min_level, start, end) \ - for (tdp_iter_start(&iter, root, min_level, start); \ - iter.valid && iter.gfn < end; \ +#define for_each_tdp_pte_min_level(iter, kvm, root, min_level, start, end) \ + for (tdp_iter_start(&iter, root, min_level, start, kvm_gfn_root_bits(kvm, root)); \ + iter.valid && iter.gfn < end; \ tdp_iter_next(&iter)) -#define for_each_tdp_pte(iter, root, start, end) \ - for_each_tdp_pte_min_level(iter, root, PG_LEVEL_4K, start, end) +#define for_each_tdp_pte_min_level_all(iter, root, min_level) \ + for (tdp_iter_start(&iter, root, min_level, 0, 0); \ + iter.valid && iter.gfn < tdp_mmu_max_gfn_exclusive(); \ + tdp_iter_next(&iter)) + +#define for_each_tdp_pte(iter, kvm, root, start, end) \ + for_each_tdp_pte_min_level(iter, kvm, root, PG_LEVEL_4K, start, end) tdp_ptep_t spte_to_child_pt(u64 pte, int level); void tdp_iter_start(struct tdp_iter *iter, struct kvm_mmu_page *root, - int min_level, gfn_t next_last_level_gfn); + int min_level, gfn_t next_last_level_gfn, gfn_t gfn_bits); void tdp_iter_next(struct tdp_iter *iter); void tdp_iter_restart(struct tdp_iter *iter); diff --git a/arch/x86/kvm/mmu/tdp_mmu.c b/arch/x86/kvm/mmu/tdp_mmu.c index d078157e62aa..7f3d7229b2c1 100644 --- a/arch/x86/kvm/mmu/tdp_mmu.c +++ b/arch/x86/kvm/mmu/tdp_mmu.c @@ -37,10 +37,12 @@ void kvm_mmu_uninit_tdp_mmu(struct kvm *kvm) * for zapping and thus puts the TDP MMU's reference to each root, i.e. * ultimately frees all roots. */ - kvm_tdp_mmu_invalidate_all_roots(kvm); - kvm_tdp_mmu_zap_invalidated_roots(kvm); + kvm_tdp_mmu_invalidate_roots(kvm, KVM_VALID_ROOTS); + kvm_tdp_mmu_zap_invalidated_roots(kvm, false); - WARN_ON(atomic64_read(&kvm->arch.tdp_mmu_pages)); +#ifdef CONFIG_KVM_PROVE_MMU + KVM_MMU_WARN_ON(atomic64_read(&kvm->arch.tdp_mmu_pages)); +#endif WARN_ON(!list_empty(&kvm->arch.tdp_mmu_roots)); /* @@ -53,6 +55,7 @@ void kvm_mmu_uninit_tdp_mmu(struct kvm *kvm) static void tdp_mmu_free_sp(struct kvm_mmu_page *sp) { + free_page((unsigned long)sp->external_spt); free_page((unsigned long)sp->spt); kmem_cache_free(mmu_page_header_cache, sp); } @@ -91,19 +94,33 @@ void kvm_tdp_mmu_put_root(struct kvm *kvm, struct kvm_mmu_page *root) call_rcu(&root->rcu_head, tdp_mmu_free_sp_rcu_callback); } +static bool tdp_mmu_root_match(struct kvm_mmu_page *root, + enum kvm_tdp_mmu_root_types types) +{ + if (WARN_ON_ONCE(!(types & KVM_VALID_ROOTS))) + return false; + + if (root->role.invalid && !(types & KVM_INVALID_ROOTS)) + return false; + + if (likely(!is_mirror_sp(root))) + return types & KVM_DIRECT_ROOTS; + return types & KVM_MIRROR_ROOTS; +} + /* * Returns the next root after @prev_root (or the first root if @prev_root is - * NULL). A reference to the returned root is acquired, and the reference to - * @prev_root is released (the caller obviously must hold a reference to - * @prev_root if it's non-NULL). + * NULL) that matches with @types. A reference to the returned root is + * acquired, and the reference to @prev_root is released (the caller obviously + * must hold a reference to @prev_root if it's non-NULL). * - * If @only_valid is true, invalid roots are skipped. + * Roots that doesn't match with @types are skipped. * * Returns NULL if the end of tdp_mmu_roots was reached. */ static struct kvm_mmu_page *tdp_mmu_next_root(struct kvm *kvm, struct kvm_mmu_page *prev_root, - bool only_valid) + enum kvm_tdp_mmu_root_types types) { struct kvm_mmu_page *next_root; @@ -124,7 +141,7 @@ static struct kvm_mmu_page *tdp_mmu_next_root(struct kvm *kvm, typeof(*next_root), link); while (next_root) { - if ((!only_valid || !next_root->role.invalid) && + if (tdp_mmu_root_match(next_root, types) && kvm_tdp_mmu_get_root(next_root)) break; @@ -149,20 +166,20 @@ static struct kvm_mmu_page *tdp_mmu_next_root(struct kvm *kvm, * If shared is set, this function is operating under the MMU lock in read * mode. */ -#define __for_each_tdp_mmu_root_yield_safe(_kvm, _root, _as_id, _only_valid) \ - for (_root = tdp_mmu_next_root(_kvm, NULL, _only_valid); \ +#define __for_each_tdp_mmu_root_yield_safe(_kvm, _root, _as_id, _types) \ + for (_root = tdp_mmu_next_root(_kvm, NULL, _types); \ ({ lockdep_assert_held(&(_kvm)->mmu_lock); }), _root; \ - _root = tdp_mmu_next_root(_kvm, _root, _only_valid)) \ + _root = tdp_mmu_next_root(_kvm, _root, _types)) \ if (_as_id >= 0 && kvm_mmu_page_as_id(_root) != _as_id) { \ } else #define for_each_valid_tdp_mmu_root_yield_safe(_kvm, _root, _as_id) \ - __for_each_tdp_mmu_root_yield_safe(_kvm, _root, _as_id, true) + __for_each_tdp_mmu_root_yield_safe(_kvm, _root, _as_id, KVM_VALID_ROOTS) #define for_each_tdp_mmu_root_yield_safe(_kvm, _root) \ - for (_root = tdp_mmu_next_root(_kvm, NULL, false); \ + for (_root = tdp_mmu_next_root(_kvm, NULL, KVM_ALL_ROOTS); \ ({ lockdep_assert_held(&(_kvm)->mmu_lock); }), _root; \ - _root = tdp_mmu_next_root(_kvm, _root, false)) + _root = tdp_mmu_next_root(_kvm, _root, KVM_ALL_ROOTS)) /* * Iterate over all TDP MMU roots. Requires that mmu_lock be held for write, @@ -171,18 +188,28 @@ static struct kvm_mmu_page *tdp_mmu_next_root(struct kvm *kvm, * Holding mmu_lock for write obviates the need for RCU protection as the list * is guaranteed to be stable. */ -#define __for_each_tdp_mmu_root(_kvm, _root, _as_id, _only_valid) \ +#define __for_each_tdp_mmu_root(_kvm, _root, _as_id, _types) \ list_for_each_entry(_root, &_kvm->arch.tdp_mmu_roots, link) \ if (kvm_lockdep_assert_mmu_lock_held(_kvm, false) && \ ((_as_id >= 0 && kvm_mmu_page_as_id(_root) != _as_id) || \ - ((_only_valid) && (_root)->role.invalid))) { \ + !tdp_mmu_root_match((_root), (_types)))) { \ } else -#define for_each_tdp_mmu_root(_kvm, _root, _as_id) \ - __for_each_tdp_mmu_root(_kvm, _root, _as_id, false) +/* + * Iterate over all TDP MMU roots in an RCU read-side critical section. + * It is safe to iterate over the SPTEs under the root, but their values will + * be unstable, so all writes must be atomic. As this routine is meant to be + * used without holding the mmu_lock at all, any bits that are flipped must + * be reflected in kvm_tdp_mmu_spte_need_atomic_write(). + */ +#define for_each_tdp_mmu_root_rcu(_kvm, _root, _as_id, _types) \ + list_for_each_entry_rcu(_root, &_kvm->arch.tdp_mmu_roots, link) \ + if ((_as_id >= 0 && kvm_mmu_page_as_id(_root) != _as_id) || \ + !tdp_mmu_root_match((_root), (_types))) { \ + } else #define for_each_valid_tdp_mmu_root(_kvm, _root, _as_id) \ - __for_each_tdp_mmu_root(_kvm, _root, _as_id, true) + __for_each_tdp_mmu_root(_kvm, _root, _as_id, KVM_VALID_ROOTS) static struct kvm_mmu_page *tdp_mmu_alloc_sp(struct kvm_vcpu *vcpu) { @@ -223,7 +250,7 @@ static void tdp_mmu_init_child_sp(struct kvm_mmu_page *child_sp, tdp_mmu_init_sp(child_sp, iter->sptep, iter->gfn, role); } -int kvm_tdp_mmu_alloc_root(struct kvm_vcpu *vcpu) +void kvm_tdp_mmu_alloc_root(struct kvm_vcpu *vcpu, bool mirror) { struct kvm_mmu *mmu = vcpu->arch.mmu; union kvm_mmu_page_role role = mmu->root_role; @@ -231,6 +258,9 @@ int kvm_tdp_mmu_alloc_root(struct kvm_vcpu *vcpu) struct kvm *kvm = vcpu->kvm; struct kvm_mmu_page *root; + if (mirror) + role.is_mirror = true; + /* * Check for an existing root before acquiring the pages lock to avoid * unnecessary serialization if multiple vCPUs are loading a new root. @@ -282,9 +312,12 @@ out_read_unlock: * and actually consuming the root if it's invalidated after dropping * mmu_lock, and the root can't be freed as this vCPU holds a reference. */ - mmu->root.hpa = __pa(root->spt); - mmu->root.pgd = 0; - return 0; + if (mirror) { + mmu->mirror_root_hpa = __pa(root->spt); + } else { + mmu->root.hpa = __pa(root->spt); + mmu->root.pgd = 0; + } } static void handle_changed_spte(struct kvm *kvm, int as_id, gfn_t gfn, @@ -294,13 +327,17 @@ static void handle_changed_spte(struct kvm *kvm, int as_id, gfn_t gfn, static void tdp_account_mmu_page(struct kvm *kvm, struct kvm_mmu_page *sp) { kvm_account_pgtable_pages((void *)sp->spt, +1); +#ifdef CONFIG_KVM_PROVE_MMU atomic64_inc(&kvm->arch.tdp_mmu_pages); +#endif } static void tdp_unaccount_mmu_page(struct kvm *kvm, struct kvm_mmu_page *sp) { kvm_account_pgtable_pages((void *)sp->spt, -1); +#ifdef CONFIG_KVM_PROVE_MMU atomic64_dec(&kvm->arch.tdp_mmu_pages); +#endif } /** @@ -322,6 +359,29 @@ static void tdp_mmu_unlink_sp(struct kvm *kvm, struct kvm_mmu_page *sp) spin_unlock(&kvm->arch.tdp_mmu_pages_lock); } +static void remove_external_spte(struct kvm *kvm, gfn_t gfn, u64 old_spte, + int level) +{ + kvm_pfn_t old_pfn = spte_to_pfn(old_spte); + int ret; + + /* + * External (TDX) SPTEs are limited to PG_LEVEL_4K, and external + * PTs are removed in a special order, involving free_external_spt(). + * But remove_external_spte() will be called on non-leaf PTEs via + * __tdp_mmu_zap_root(), so avoid the error the former would return + * in this case. + */ + if (!is_last_spte(old_spte, level)) + return; + + /* Zapping leaf spte is allowed only when write lock is held. */ + lockdep_assert_held_write(&kvm->mmu_lock); + /* Because write lock is held, operation should success. */ + ret = kvm_x86_call(remove_external_spte)(kvm, gfn, level, old_pfn); + KVM_BUG_ON(ret, kvm); +} + /** * handle_removed_pt() - handle a page table removed from the TDP structure * @@ -359,14 +419,14 @@ static void handle_removed_pt(struct kvm *kvm, tdp_ptep_t pt, bool shared) /* * Set the SPTE to a nonpresent value that other * threads will not overwrite. If the SPTE was - * already marked as removed then another thread + * already marked as frozen then another thread * handling a page fault could overwrite it, so * set the SPTE until it is set from some other - * value to the removed SPTE value. + * value to the frozen SPTE value. */ for (;;) { - old_spte = kvm_tdp_mmu_write_spte_atomic(sptep, REMOVED_SPTE); - if (!is_removed_spte(old_spte)) + old_spte = kvm_tdp_mmu_write_spte_atomic(sptep, FROZEN_SPTE); + if (!is_frozen_spte(old_spte)) break; cpu_relax(); } @@ -397,11 +457,11 @@ static void handle_removed_pt(struct kvm *kvm, tdp_ptep_t pt, bool shared) * No retry is needed in the atomic update path as the * sole concern is dropping a Dirty bit, i.e. no other * task can zap/remove the SPTE as mmu_lock is held for - * write. Marking the SPTE as a removed SPTE is not + * write. Marking the SPTE as a frozen SPTE is not * strictly necessary for the same reason, but using - * the remove SPTE value keeps the shared/exclusive + * the frozen SPTE value keeps the shared/exclusive * paths consistent and allows the handle_changed_spte() - * call below to hardcode the new value to REMOVED_SPTE. + * call below to hardcode the new value to FROZEN_SPTE. * * Note, even though dropping a Dirty bit is the only * scenario where a non-atomic update could result in a @@ -413,15 +473,85 @@ static void handle_removed_pt(struct kvm *kvm, tdp_ptep_t pt, bool shared) * it here. */ old_spte = kvm_tdp_mmu_write_spte(sptep, old_spte, - REMOVED_SPTE, level); + FROZEN_SPTE, level); } handle_changed_spte(kvm, kvm_mmu_page_as_id(sp), gfn, - old_spte, REMOVED_SPTE, level, shared); + old_spte, FROZEN_SPTE, level, shared); + + if (is_mirror_sp(sp)) { + KVM_BUG_ON(shared, kvm); + remove_external_spte(kvm, gfn, old_spte, level); + } + } + + if (is_mirror_sp(sp) && + WARN_ON(kvm_x86_call(free_external_spt)(kvm, base_gfn, sp->role.level, + sp->external_spt))) { + /* + * Failed to free page table page in mirror page table and + * there is nothing to do further. + * Intentionally leak the page to prevent the kernel from + * accessing the encrypted page. + */ + sp->external_spt = NULL; } call_rcu(&sp->rcu_head, tdp_mmu_free_sp_rcu_callback); } +static void *get_external_spt(gfn_t gfn, u64 new_spte, int level) +{ + if (is_shadow_present_pte(new_spte) && !is_last_spte(new_spte, level)) { + struct kvm_mmu_page *sp = spte_to_child_sp(new_spte); + + WARN_ON_ONCE(sp->role.level + 1 != level); + WARN_ON_ONCE(sp->gfn != gfn); + return sp->external_spt; + } + + return NULL; +} + +static int __must_check set_external_spte_present(struct kvm *kvm, tdp_ptep_t sptep, + gfn_t gfn, u64 old_spte, + u64 new_spte, int level) +{ + bool was_present = is_shadow_present_pte(old_spte); + bool is_present = is_shadow_present_pte(new_spte); + bool is_leaf = is_present && is_last_spte(new_spte, level); + kvm_pfn_t new_pfn = spte_to_pfn(new_spte); + int ret = 0; + + KVM_BUG_ON(was_present, kvm); + + lockdep_assert_held(&kvm->mmu_lock); + /* + * We need to lock out other updates to the SPTE until the external + * page table has been modified. Use FROZEN_SPTE similar to + * the zapping case. + */ + if (!try_cmpxchg64(rcu_dereference(sptep), &old_spte, FROZEN_SPTE)) + return -EBUSY; + + /* + * Use different call to either set up middle level + * external page table, or leaf. + */ + if (is_leaf) { + ret = kvm_x86_call(set_external_spte)(kvm, gfn, level, new_pfn); + } else { + void *external_spt = get_external_spt(gfn, new_spte, level); + + KVM_BUG_ON(!external_spt, kvm); + ret = kvm_x86_call(link_external_spt)(kvm, gfn, level, external_spt); + } + if (ret) + __kvm_tdp_mmu_write_spte(sptep, old_spte); + else + __kvm_tdp_mmu_write_spte(sptep, new_spte); + return ret; +} + /** * handle_changed_spte - handle bookkeeping associated with an SPTE change * @kvm: kvm instance @@ -490,19 +620,19 @@ static void handle_changed_spte(struct kvm *kvm, int as_id, gfn_t gfn, */ if (!was_present && !is_present) { /* - * If this change does not involve a MMIO SPTE or removed SPTE, + * If this change does not involve a MMIO SPTE or frozen SPTE, * it is unexpected. Log the change, though it should not * impact the guest since both the former and current SPTEs * are nonpresent. */ - if (WARN_ON_ONCE(!is_mmio_spte(old_spte) && - !is_mmio_spte(new_spte) && - !is_removed_spte(new_spte))) + if (WARN_ON_ONCE(!is_mmio_spte(kvm, old_spte) && + !is_mmio_spte(kvm, new_spte) && + !is_frozen_spte(new_spte))) pr_err("Unexpected SPTE change! Nonpresent SPTEs\n" "should not be replaced with another,\n" "different nonpresent SPTE, unless one or both\n" "are MMIO SPTEs, or the new SPTE is\n" - "a temporary removed SPTE.\n" + "a temporary frozen SPTE.\n" "as_id: %d gfn: %llx old_spte: %llx new_spte: %llx level: %d", as_id, gfn, old_spte, new_spte, level); return; @@ -511,10 +641,6 @@ static void handle_changed_spte(struct kvm *kvm, int as_id, gfn_t gfn, if (is_leaf != was_leaf) kvm_update_page_stats(kvm, level, is_leaf ? 1 : -1); - if (was_leaf && is_dirty_spte(old_spte) && - (!is_present || !is_dirty_spte(new_spte) || pfn_changed)) - kvm_set_pfn_dirty(spte_to_pfn(old_spte)); - /* * Recursively handle child PTs if the change removed a subtree from * the paging structure. Note the WARN on the PFN changing without the @@ -524,10 +650,50 @@ static void handle_changed_spte(struct kvm *kvm, int as_id, gfn_t gfn, if (was_present && !was_leaf && (is_leaf || !is_present || WARN_ON_ONCE(pfn_changed))) handle_removed_pt(kvm, spte_to_child_pt(old_spte, level), shared); +} + +static inline int __must_check __tdp_mmu_set_spte_atomic(struct kvm *kvm, + struct tdp_iter *iter, + u64 new_spte) +{ + /* + * The caller is responsible for ensuring the old SPTE is not a FROZEN + * SPTE. KVM should never attempt to zap or manipulate a FROZEN SPTE, + * and pre-checking before inserting a new SPTE is advantageous as it + * avoids unnecessary work. + */ + WARN_ON_ONCE(iter->yielded || is_frozen_spte(iter->old_spte)); + + if (is_mirror_sptep(iter->sptep) && !is_frozen_spte(new_spte)) { + int ret; + + /* + * Users of atomic zapping don't operate on mirror roots, + * so don't handle it and bug the VM if it's seen. + */ + if (KVM_BUG_ON(!is_shadow_present_pte(new_spte), kvm)) + return -EBUSY; + + ret = set_external_spte_present(kvm, iter->sptep, iter->gfn, + iter->old_spte, new_spte, iter->level); + if (ret) + return ret; + } else { + u64 *sptep = rcu_dereference(iter->sptep); + + /* + * Note, fast_pf_fix_direct_spte() can also modify TDP MMU SPTEs + * and does not hold the mmu_lock. On failure, i.e. if a + * different logical CPU modified the SPTE, try_cmpxchg64() + * updates iter->old_spte with the current value, so the caller + * operates on fresh data, e.g. if it retries + * tdp_mmu_set_spte_atomic() + */ + if (!try_cmpxchg64(sptep, &iter->old_spte, new_spte)) + return -EBUSY; + } - if (was_leaf && is_accessed_spte(old_spte) && - (!is_present || !is_accessed_spte(new_spte) || pfn_changed)) - kvm_set_pfn_accessed(spte_to_pfn(old_spte)); + return 0; } /* @@ -547,68 +713,24 @@ static void handle_changed_spte(struct kvm *kvm, int as_id, gfn_t gfn, * no side-effects other than setting iter->old_spte to the last * known value of the spte. */ -static inline int tdp_mmu_set_spte_atomic(struct kvm *kvm, - struct tdp_iter *iter, - u64 new_spte) +static inline int __must_check tdp_mmu_set_spte_atomic(struct kvm *kvm, + struct tdp_iter *iter, + u64 new_spte) { - u64 *sptep = rcu_dereference(iter->sptep); - - /* - * The caller is responsible for ensuring the old SPTE is not a REMOVED - * SPTE. KVM should never attempt to zap or manipulate a REMOVED SPTE, - * and pre-checking before inserting a new SPTE is advantageous as it - * avoids unnecessary work. - */ - WARN_ON_ONCE(iter->yielded || is_removed_spte(iter->old_spte)); + int ret; lockdep_assert_held_read(&kvm->mmu_lock); - /* - * Note, fast_pf_fix_direct_spte() can also modify TDP MMU SPTEs and - * does not hold the mmu_lock. On failure, i.e. if a different logical - * CPU modified the SPTE, try_cmpxchg64() updates iter->old_spte with - * the current value, so the caller operates on fresh data, e.g. if it - * retries tdp_mmu_set_spte_atomic() - */ - if (!try_cmpxchg64(sptep, &iter->old_spte, new_spte)) - return -EBUSY; - - handle_changed_spte(kvm, iter->as_id, iter->gfn, iter->old_spte, - new_spte, iter->level, true); - - return 0; -} - -static inline int tdp_mmu_zap_spte_atomic(struct kvm *kvm, - struct tdp_iter *iter) -{ - int ret; - - /* - * Freeze the SPTE by setting it to a special, - * non-present value. This will stop other threads from - * immediately installing a present entry in its place - * before the TLBs are flushed. - */ - ret = tdp_mmu_set_spte_atomic(kvm, iter, REMOVED_SPTE); + ret = __tdp_mmu_set_spte_atomic(kvm, iter, new_spte); if (ret) return ret; - kvm_flush_remote_tlbs_gfn(kvm, iter->gfn, iter->level); - - /* - * No other thread can overwrite the removed SPTE as they must either - * wait on the MMU lock or use tdp_mmu_set_spte_atomic() which will not - * overwrite the special removed SPTE value. No bookkeeping is needed - * here since the SPTE is going from non-present to non-present. Use - * the raw write helper to avoid an unnecessary check on volatile bits. - */ - __kvm_tdp_mmu_write_spte(iter->sptep, 0); + handle_changed_spte(kvm, iter->as_id, iter->gfn, iter->old_spte, + new_spte, iter->level, true); return 0; } - /* * tdp_mmu_set_spte - Set a TDP MMU SPTE and handle the associated bookkeeping * @kvm: KVM instance @@ -629,16 +751,26 @@ static u64 tdp_mmu_set_spte(struct kvm *kvm, int as_id, tdp_ptep_t sptep, /* * No thread should be using this function to set SPTEs to or from the - * temporary removed SPTE value. + * temporary frozen SPTE value. * If operating under the MMU lock in read mode, tdp_mmu_set_spte_atomic * should be used. If operating under the MMU lock in write mode, the - * use of the removed SPTE should not be necessary. + * use of the frozen SPTE should not be necessary. */ - WARN_ON_ONCE(is_removed_spte(old_spte) || is_removed_spte(new_spte)); + WARN_ON_ONCE(is_frozen_spte(old_spte) || is_frozen_spte(new_spte)); old_spte = kvm_tdp_mmu_write_spte(sptep, old_spte, new_spte, level); handle_changed_spte(kvm, as_id, gfn, old_spte, new_spte, level, false); + + /* + * Users that do non-atomic setting of PTEs don't operate on mirror + * roots, so don't handle it and bug the VM if it's seen. + */ + if (is_mirror_sptep(sptep)) { + KVM_BUG_ON(is_shadow_present_pte(new_spte), kvm); + remove_external_spte(kvm, gfn, old_spte, level); + } + return old_spte; } @@ -651,18 +783,25 @@ static inline void tdp_mmu_iter_set_spte(struct kvm *kvm, struct tdp_iter *iter, iter->gfn, iter->level); } -#define tdp_root_for_each_pte(_iter, _root, _start, _end) \ - for_each_tdp_pte(_iter, _root, _start, _end) +#define tdp_root_for_each_pte(_iter, _kvm, _root, _start, _end) \ + for_each_tdp_pte(_iter, _kvm, _root, _start, _end) -#define tdp_root_for_each_leaf_pte(_iter, _root, _start, _end) \ - tdp_root_for_each_pte(_iter, _root, _start, _end) \ +#define tdp_root_for_each_leaf_pte(_iter, _kvm, _root, _start, _end) \ + tdp_root_for_each_pte(_iter, _kvm, _root, _start, _end) \ if (!is_shadow_present_pte(_iter.old_spte) || \ !is_last_spte(_iter.old_spte, _iter.level)) \ continue; \ else -#define tdp_mmu_for_each_pte(_iter, _mmu, _start, _end) \ - for_each_tdp_pte(_iter, root_to_sp(_mmu->root.hpa), _start, _end) +static inline bool __must_check tdp_mmu_iter_need_resched(struct kvm *kvm, + struct tdp_iter *iter) +{ + if (!need_resched() && !rwlock_needbreak(&kvm->mmu_lock)) + return false; + + /* Ensure forward progress has been made before yielding. */ + return iter->next_last_level_gfn != iter->yielded_gfn; +} /* * Yield if the MMU lock is contended or this thread needs to return control @@ -682,31 +821,27 @@ static inline bool __must_check tdp_mmu_iter_cond_resched(struct kvm *kvm, struct tdp_iter *iter, bool flush, bool shared) { - WARN_ON_ONCE(iter->yielded); + KVM_MMU_WARN_ON(iter->yielded); - /* Ensure forward progress has been made before yielding. */ - if (iter->next_last_level_gfn == iter->yielded_gfn) + if (!tdp_mmu_iter_need_resched(kvm, iter)) return false; - if (need_resched() || rwlock_needbreak(&kvm->mmu_lock)) { - if (flush) - kvm_flush_remote_tlbs(kvm); + if (flush) + kvm_flush_remote_tlbs(kvm); - rcu_read_unlock(); - - if (shared) - cond_resched_rwlock_read(&kvm->mmu_lock); - else - cond_resched_rwlock_write(&kvm->mmu_lock); + rcu_read_unlock(); - rcu_read_lock(); + if (shared) + cond_resched_rwlock_read(&kvm->mmu_lock); + else + cond_resched_rwlock_write(&kvm->mmu_lock); - WARN_ON_ONCE(iter->gfn > iter->next_last_level_gfn); + rcu_read_lock(); - iter->yielded = true; - } + WARN_ON_ONCE(iter->gfn > iter->next_last_level_gfn); - return iter->yielded; + iter->yielded = true; + return true; } static inline gfn_t tdp_mmu_max_gfn_exclusive(void) @@ -725,10 +860,7 @@ static void __tdp_mmu_zap_root(struct kvm *kvm, struct kvm_mmu_page *root, { struct tdp_iter iter; - gfn_t end = tdp_mmu_max_gfn_exclusive(); - gfn_t start = 0; - - for_each_tdp_pte_min_level(iter, root, zap_level, start, end) { + for_each_tdp_pte_min_level_all(iter, root, zap_level) { retry: if (tdp_mmu_iter_cond_resched(kvm, &iter, false, shared)) continue; @@ -740,8 +872,8 @@ retry: continue; if (!shared) - tdp_mmu_iter_set_spte(kvm, &iter, 0); - else if (tdp_mmu_set_spte_atomic(kvm, &iter, 0)) + tdp_mmu_iter_set_spte(kvm, &iter, SHADOW_NONPRESENT_VALUE); + else if (tdp_mmu_set_spte_atomic(kvm, &iter, SHADOW_NONPRESENT_VALUE)) goto retry; } } @@ -808,8 +940,8 @@ bool kvm_tdp_mmu_zap_sp(struct kvm *kvm, struct kvm_mmu_page *sp) if (WARN_ON_ONCE(!is_shadow_present_pte(old_spte))) return false; - tdp_mmu_set_spte(kvm, kvm_mmu_page_as_id(sp), sp->ptep, old_spte, 0, - sp->gfn, sp->role.level + 1); + tdp_mmu_set_spte(kvm, kvm_mmu_page_as_id(sp), sp->ptep, old_spte, + SHADOW_NONPRESENT_VALUE, sp->gfn, sp->role.level + 1); return true; } @@ -832,7 +964,7 @@ static bool tdp_mmu_zap_leafs(struct kvm *kvm, struct kvm_mmu_page *root, rcu_read_lock(); - for_each_tdp_pte_min_level(iter, root, PG_LEVEL_4K, start, end) { + for_each_tdp_pte_min_level(iter, kvm, root, PG_LEVEL_4K, start, end) { if (can_yield && tdp_mmu_iter_cond_resched(kvm, &iter, flush, false)) { flush = false; @@ -843,7 +975,7 @@ static bool tdp_mmu_zap_leafs(struct kvm *kvm, struct kvm_mmu_page *root, !is_last_spte(iter.old_spte, iter.level)) continue; - tdp_mmu_iter_set_spte(kvm, &iter, 0); + tdp_mmu_iter_set_spte(kvm, &iter, SHADOW_NONPRESENT_VALUE); /* * Zappings SPTEs in invalid roots doesn't require a TLB flush, @@ -883,19 +1015,21 @@ void kvm_tdp_mmu_zap_all(struct kvm *kvm) struct kvm_mmu_page *root; /* - * Zap all roots, including invalid roots, as all SPTEs must be dropped - * before returning to the caller. Zap directly even if the root is - * also being zapped by a worker. Walking zapped top-level SPTEs isn't - * all that expensive and mmu_lock is already held, which means the - * worker has yielded, i.e. flushing the work instead of zapping here - * isn't guaranteed to be any faster. + * Zap all direct roots, including invalid direct roots, as all direct + * SPTEs must be dropped before returning to the caller. For TDX, mirror + * roots don't need handling in response to the mmu notifier (the caller). + * + * Zap directly even if the root is also being zapped by a concurrent + * "fast zap". Walking zapped top-level SPTEs isn't all that expensive + * and mmu_lock is already held, which means the other thread has yielded. * * A TLB flush is unnecessary, KVM zaps everything if and only the VM * is being destroyed or the userspace VMM has exited. In both cases, * KVM_RUN is unreachable, i.e. no vCPUs will ever service the request. */ lockdep_assert_held_write(&kvm->mmu_lock); - for_each_tdp_mmu_root_yield_safe(kvm, root) + __for_each_tdp_mmu_root_yield_safe(kvm, root, -1, + KVM_DIRECT_ROOTS | KVM_INVALID_ROOTS) tdp_mmu_zap_root(kvm, root, false); } @@ -903,11 +1037,14 @@ void kvm_tdp_mmu_zap_all(struct kvm *kvm) * Zap all invalidated roots to ensure all SPTEs are dropped before the "fast * zap" completes. */ -void kvm_tdp_mmu_zap_invalidated_roots(struct kvm *kvm) +void kvm_tdp_mmu_zap_invalidated_roots(struct kvm *kvm, bool shared) { struct kvm_mmu_page *root; - read_lock(&kvm->mmu_lock); + if (shared) + read_lock(&kvm->mmu_lock); + else + write_lock(&kvm->mmu_lock); for_each_tdp_mmu_root_yield_safe(kvm, root) { if (!root->tdp_mmu_scheduled_root_to_zap) @@ -925,7 +1062,7 @@ void kvm_tdp_mmu_zap_invalidated_roots(struct kvm *kvm) * that may be zapped, as such entries are associated with the * ASID on both VMX and SVM. */ - tdp_mmu_zap_root(kvm, root, true); + tdp_mmu_zap_root(kvm, root, shared); /* * The referenced needs to be put *after* zapping the root, as @@ -935,7 +1072,10 @@ void kvm_tdp_mmu_zap_invalidated_roots(struct kvm *kvm) kvm_tdp_mmu_put_root(kvm, root); } - read_unlock(&kvm->mmu_lock); + if (shared) + read_unlock(&kvm->mmu_lock); + else + write_unlock(&kvm->mmu_lock); } /* @@ -948,11 +1088,19 @@ void kvm_tdp_mmu_zap_invalidated_roots(struct kvm *kvm) * Note, kvm_tdp_mmu_zap_invalidated_roots() is gifted the TDP MMU's reference. * See kvm_tdp_mmu_alloc_root(). */ -void kvm_tdp_mmu_invalidate_all_roots(struct kvm *kvm) +void kvm_tdp_mmu_invalidate_roots(struct kvm *kvm, + enum kvm_tdp_mmu_root_types root_types) { struct kvm_mmu_page *root; /* + * Invalidating invalid roots doesn't make sense, prevent developers from + * having to think about it. + */ + if (WARN_ON_ONCE(root_types & KVM_INVALID_ROOTS)) + root_types &= ~KVM_INVALID_ROOTS; + + /* * mmu_lock must be held for write to ensure that a root doesn't become * invalid while there are active readers (invalidating a root while * there are active readers may or may not be problematic in practice, @@ -973,6 +1121,9 @@ void kvm_tdp_mmu_invalidate_all_roots(struct kvm *kvm) * or get/put references to roots. */ list_for_each_entry(root, &kvm->arch.tdp_mmu_roots, link) { + if (!tdp_mmu_root_match(root, root_types)) + continue; + /* * Note, invalid roots can outlive a memslot update! Invalid * roots must be *zapped* before the memslot update completes, @@ -1002,19 +1153,27 @@ static int tdp_mmu_map_handle_target_level(struct kvm_vcpu *vcpu, if (WARN_ON_ONCE(sp->role.level != fault->goal_level)) return RET_PF_RETRY; + if (is_shadow_present_pte(iter->old_spte) && + (fault->prefetch || is_access_allowed(fault, iter->old_spte)) && + is_last_spte(iter->old_spte, iter->level)) { + WARN_ON_ONCE(fault->pfn != spte_to_pfn(iter->old_spte)); + return RET_PF_SPURIOUS; + } + if (unlikely(!fault->slot)) new_spte = make_mmio_spte(vcpu, iter->gfn, ACC_ALL); else wrprot = make_spte(vcpu, sp, fault->slot, ACC_ALL, iter->gfn, - fault->pfn, iter->old_spte, fault->prefetch, true, - fault->map_writable, &new_spte); + fault->pfn, iter->old_spte, fault->prefetch, + false, fault->map_writable, &new_spte); if (new_spte == iter->old_spte) ret = RET_PF_SPURIOUS; else if (tdp_mmu_set_spte_atomic(vcpu->kvm, iter, new_spte)) return RET_PF_RETRY; else if (is_shadow_present_pte(iter->old_spte) && - !is_last_spte(iter->old_spte, iter->level)) + (!is_last_spte(iter->old_spte, iter->level) || + WARN_ON_ONCE(leaf_spte_change_needs_tlb_flush(iter->old_spte, new_spte)))) kvm_flush_remote_tlbs_gfn(vcpu->kvm, iter->gfn, iter->level); /* @@ -1022,13 +1181,11 @@ static int tdp_mmu_map_handle_target_level(struct kvm_vcpu *vcpu, * protected, emulation is needed. If the emulation was skipped, * the vCPU would have the same fault again. */ - if (wrprot) { - if (fault->write) - ret = RET_PF_EMULATE; - } + if (wrprot && fault->write) + ret = RET_PF_WRITE_PROTECTED; /* If a MMIO SPTE is installed, the MMIO will need to be emulated. */ - if (unlikely(is_mmio_spte(new_spte))) { + if (unlikely(is_mmio_spte(vcpu->kvm, new_spte))) { vcpu->stat.pf_mmio_spte_created++; trace_mark_mmio_spte(rcu_dereference(iter->sptep), iter->gfn, new_spte); @@ -1056,7 +1213,7 @@ static int tdp_mmu_map_handle_target_level(struct kvm_vcpu *vcpu, static int tdp_mmu_link_sp(struct kvm *kvm, struct tdp_iter *iter, struct kvm_mmu_page *sp, bool shared) { - u64 spte = make_nonleaf_spte(sp->spt, !kvm_ad_enabled()); + u64 spte = make_nonleaf_spte(sp->spt, !kvm_ad_enabled); int ret = 0; if (shared) { @@ -1081,7 +1238,7 @@ static int tdp_mmu_split_huge_page(struct kvm *kvm, struct tdp_iter *iter, */ int kvm_tdp_mmu_map(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault) { - struct kvm_mmu *mmu = vcpu->arch.mmu; + struct kvm_mmu_page *root = tdp_mmu_get_root_for_fault(vcpu, fault); struct kvm *kvm = vcpu->kvm; struct tdp_iter iter; struct kvm_mmu_page *sp; @@ -1093,7 +1250,7 @@ int kvm_tdp_mmu_map(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault) rcu_read_lock(); - tdp_mmu_for_each_pte(iter, mmu, fault->gfn, fault->gfn + 1) { + for_each_tdp_pte(iter, kvm, root, fault->gfn, fault->gfn + 1) { int r; if (fault->nx_huge_page_workaround_enabled) @@ -1103,7 +1260,7 @@ int kvm_tdp_mmu_map(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault) * If SPTE has been frozen by another thread, just give up and * retry, avoiding unnecessary page table allocation and free. */ - if (is_removed_spte(iter.old_spte)) + if (is_frozen_spte(iter.old_spte)) goto retry; if (iter.level == fault->goal_level) @@ -1120,13 +1277,18 @@ int kvm_tdp_mmu_map(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault) */ sp = tdp_mmu_alloc_sp(vcpu); tdp_mmu_init_child_sp(sp, &iter); + if (is_mirror_sp(sp)) + kvm_mmu_alloc_external_spt(vcpu, sp); sp->nx_huge_page_disallowed = fault->huge_page_disallowed; - if (is_shadow_present_pte(iter.old_spte)) + if (is_shadow_present_pte(iter.old_spte)) { + /* Don't support large page for mirrored roots (TDX) */ + KVM_BUG_ON(is_mirror_sptep(iter.sptep), vcpu->kvm); r = tdp_mmu_split_huge_page(kvm, &iter, sp, true); - else + } else { r = tdp_mmu_link_sp(kvm, &iter, sp, true); + } /* * Force the guest to retry if installing an upper level SPTE @@ -1161,45 +1323,22 @@ retry: return ret; } +/* Used by mmu notifier via kvm_unmap_gfn_range() */ bool kvm_tdp_mmu_unmap_gfn_range(struct kvm *kvm, struct kvm_gfn_range *range, bool flush) { + enum kvm_tdp_mmu_root_types types; struct kvm_mmu_page *root; - __for_each_tdp_mmu_root_yield_safe(kvm, root, range->slot->as_id, false) + types = kvm_gfn_range_filter_to_root_types(kvm, range->attr_filter) | KVM_INVALID_ROOTS; + + __for_each_tdp_mmu_root_yield_safe(kvm, root, range->slot->as_id, types) flush = tdp_mmu_zap_leafs(kvm, root, range->start, range->end, range->may_block, flush); return flush; } -typedef bool (*tdp_handler_t)(struct kvm *kvm, struct tdp_iter *iter, - struct kvm_gfn_range *range); - -static __always_inline bool kvm_tdp_mmu_handle_gfn(struct kvm *kvm, - struct kvm_gfn_range *range, - tdp_handler_t handler) -{ - struct kvm_mmu_page *root; - struct tdp_iter iter; - bool ret = false; - - /* - * Don't support rescheduling, none of the MMU notifiers that funnel - * into this helper allow blocking; it'd be dead, wasteful code. - */ - for_each_tdp_mmu_root(kvm, root, range->slot->as_id) { - rcu_read_lock(); - - tdp_root_for_each_leaf_pte(iter, root, range->start, range->end) - ret |= handler(kvm, &iter, range); - - rcu_read_unlock(); - } - - return ret; -} - /* * Mark the SPTEs range of GFNs [start, end) unaccessed and return non-zero * if any of the GFNs in the range have been accessed. @@ -1208,100 +1347,72 @@ static __always_inline bool kvm_tdp_mmu_handle_gfn(struct kvm *kvm, * from the clear_young() or clear_flush_young() notifier, which uses the * return value to determine if the page has been accessed. */ -static bool age_gfn_range(struct kvm *kvm, struct tdp_iter *iter, - struct kvm_gfn_range *range) +static void kvm_tdp_mmu_age_spte(struct kvm *kvm, struct tdp_iter *iter) { u64 new_spte; - /* If we have a non-accessed entry we don't need to change the pte. */ - if (!is_accessed_spte(iter->old_spte)) - return false; - if (spte_ad_enabled(iter->old_spte)) { - iter->old_spte = tdp_mmu_clear_spte_bits(iter->sptep, - iter->old_spte, - shadow_accessed_mask, - iter->level); + iter->old_spte = tdp_mmu_clear_spte_bits_atomic(iter->sptep, + shadow_accessed_mask); new_spte = iter->old_spte & ~shadow_accessed_mask; } else { + new_spte = mark_spte_for_access_track(iter->old_spte); /* - * Capture the dirty status of the page, so that it doesn't get - * lost when the SPTE is marked for access tracking. + * It is safe for the following cmpxchg to fail. Leave the + * Accessed bit set, as the spte is most likely young anyway. */ - if (is_writable_pte(iter->old_spte)) - kvm_set_pfn_dirty(spte_to_pfn(iter->old_spte)); - - new_spte = mark_spte_for_access_track(iter->old_spte); - iter->old_spte = kvm_tdp_mmu_write_spte(iter->sptep, - iter->old_spte, new_spte, - iter->level); + if (__tdp_mmu_set_spte_atomic(kvm, iter, new_spte)) + return; } trace_kvm_tdp_mmu_spte_changed(iter->as_id, iter->gfn, iter->level, iter->old_spte, new_spte); - return true; } -bool kvm_tdp_mmu_age_gfn_range(struct kvm *kvm, struct kvm_gfn_range *range) -{ - return kvm_tdp_mmu_handle_gfn(kvm, range, age_gfn_range); -} - -static bool test_age_gfn(struct kvm *kvm, struct tdp_iter *iter, - struct kvm_gfn_range *range) -{ - return is_accessed_spte(iter->old_spte); -} - -bool kvm_tdp_mmu_test_age_gfn(struct kvm *kvm, struct kvm_gfn_range *range) +static bool __kvm_tdp_mmu_age_gfn_range(struct kvm *kvm, + struct kvm_gfn_range *range, + bool test_only) { - return kvm_tdp_mmu_handle_gfn(kvm, range, test_age_gfn); -} - -static bool set_spte_gfn(struct kvm *kvm, struct tdp_iter *iter, - struct kvm_gfn_range *range) -{ - u64 new_spte; - - /* Huge pages aren't expected to be modified without first being zapped. */ - WARN_ON_ONCE(pte_huge(range->arg.pte) || range->start + 1 != range->end); + enum kvm_tdp_mmu_root_types types; + struct kvm_mmu_page *root; + struct tdp_iter iter; + bool ret = false; - if (iter->level != PG_LEVEL_4K || - !is_shadow_present_pte(iter->old_spte)) - return false; + types = kvm_gfn_range_filter_to_root_types(kvm, range->attr_filter); /* - * Note, when changing a read-only SPTE, it's not strictly necessary to - * zero the SPTE before setting the new PFN, but doing so preserves the - * invariant that the PFN of a present * leaf SPTE can never change. - * See handle_changed_spte(). + * Don't support rescheduling, none of the MMU notifiers that funnel + * into this helper allow blocking; it'd be dead, wasteful code. Note, + * this helper must NOT be used to unmap GFNs, as it processes only + * valid roots! */ - tdp_mmu_iter_set_spte(kvm, iter, 0); + WARN_ON(types & ~KVM_VALID_ROOTS); + + guard(rcu)(); + for_each_tdp_mmu_root_rcu(kvm, root, range->slot->as_id, types) { + tdp_root_for_each_leaf_pte(iter, kvm, root, range->start, range->end) { + if (!is_accessed_spte(iter.old_spte)) + continue; - if (!pte_write(range->arg.pte)) { - new_spte = kvm_mmu_changed_pte_notifier_make_spte(iter->old_spte, - pte_pfn(range->arg.pte)); + if (test_only) + return true; - tdp_mmu_iter_set_spte(kvm, iter, new_spte); + ret = true; + kvm_tdp_mmu_age_spte(kvm, &iter); + } } - return true; + return ret; } -/* - * Handle the changed_pte MMU notifier for the TDP MMU. - * data is a pointer to the new pte_t mapping the HVA specified by the MMU - * notifier. - * Returns non-zero if a flush is needed before releasing the MMU lock. - */ -bool kvm_tdp_mmu_set_spte_gfn(struct kvm *kvm, struct kvm_gfn_range *range) +bool kvm_tdp_mmu_age_gfn_range(struct kvm *kvm, struct kvm_gfn_range *range) { - /* - * No need to handle the remote TLB flush under RCU protection, the - * target SPTE _must_ be a leaf SPTE, i.e. cannot result in freeing a - * shadow page. See the WARN on pfn_changed in handle_changed_spte(). - */ - return kvm_tdp_mmu_handle_gfn(kvm, range, set_spte_gfn); + return __kvm_tdp_mmu_age_gfn_range(kvm, range, false); +} + +bool kvm_tdp_mmu_test_age_gfn(struct kvm *kvm, struct kvm_gfn_range *range) +{ + return __kvm_tdp_mmu_age_gfn_range(kvm, range, true); } /* @@ -1320,7 +1431,7 @@ static bool wrprot_gfn_range(struct kvm *kvm, struct kvm_mmu_page *root, BUG_ON(min_level > KVM_MAX_HUGEPAGE_LEVEL); - for_each_tdp_pte_min_level(iter, root, min_level, start, end) { + for_each_tdp_pte_min_level(iter, kvm, root, min_level, start, end) { retry: if (tdp_mmu_iter_cond_resched(kvm, &iter, false, true)) continue; @@ -1362,17 +1473,15 @@ bool kvm_tdp_mmu_wrprot_slot(struct kvm *kvm, return spte_set; } -static struct kvm_mmu_page *__tdp_mmu_alloc_sp_for_split(gfp_t gfp) +static struct kvm_mmu_page *tdp_mmu_alloc_sp_for_split(void) { struct kvm_mmu_page *sp; - gfp |= __GFP_ZERO; - - sp = kmem_cache_alloc(mmu_page_header_cache, gfp); + sp = kmem_cache_zalloc(mmu_page_header_cache, GFP_KERNEL_ACCOUNT); if (!sp) return NULL; - sp->spt = (void *)__get_free_page(gfp); + sp->spt = (void *)get_zeroed_page(GFP_KERNEL_ACCOUNT); if (!sp->spt) { kmem_cache_free(mmu_page_header_cache, sp); return NULL; @@ -1381,47 +1490,6 @@ static struct kvm_mmu_page *__tdp_mmu_alloc_sp_for_split(gfp_t gfp) return sp; } -static struct kvm_mmu_page *tdp_mmu_alloc_sp_for_split(struct kvm *kvm, - struct tdp_iter *iter, - bool shared) -{ - struct kvm_mmu_page *sp; - - kvm_lockdep_assert_mmu_lock_held(kvm, shared); - - /* - * Since we are allocating while under the MMU lock we have to be - * careful about GFP flags. Use GFP_NOWAIT to avoid blocking on direct - * reclaim and to avoid making any filesystem callbacks (which can end - * up invoking KVM MMU notifiers, resulting in a deadlock). - * - * If this allocation fails we drop the lock and retry with reclaim - * allowed. - */ - sp = __tdp_mmu_alloc_sp_for_split(GFP_NOWAIT | __GFP_ACCOUNT); - if (sp) - return sp; - - rcu_read_unlock(); - - if (shared) - read_unlock(&kvm->mmu_lock); - else - write_unlock(&kvm->mmu_lock); - - iter->yielded = true; - sp = __tdp_mmu_alloc_sp_for_split(GFP_KERNEL_ACCOUNT); - - if (shared) - read_lock(&kvm->mmu_lock); - else - write_lock(&kvm->mmu_lock); - - rcu_read_lock(); - - return sp; -} - /* Note, the caller is responsible for initializing @sp. */ static int tdp_mmu_split_huge_page(struct kvm *kvm, struct tdp_iter *iter, struct kvm_mmu_page *sp, bool shared) @@ -1435,7 +1503,7 @@ static int tdp_mmu_split_huge_page(struct kvm *kvm, struct tdp_iter *iter, * not been linked in yet and thus is not reachable from any other CPU. */ for (i = 0; i < SPTE_ENT_PER_PAGE; i++) - sp->spt[i] = make_huge_page_split_spte(kvm, huge_spte, sp->role, i); + sp->spt[i] = make_small_spte(kvm, huge_spte, sp->role, i); /* * Replace the huge spte with a pointer to the populated lower level @@ -1468,7 +1536,6 @@ static int tdp_mmu_split_huge_pages_root(struct kvm *kvm, { struct kvm_mmu_page *sp = NULL; struct tdp_iter iter; - int ret = 0; rcu_read_lock(); @@ -1483,7 +1550,7 @@ static int tdp_mmu_split_huge_pages_root(struct kvm *kvm, * level above the target level (e.g. splitting a 1GB to 512 2MB pages, * and then splitting each of those to 512 4KB pages). */ - for_each_tdp_pte_min_level(iter, root, target_level + 1, start, end) { + for_each_tdp_pte_min_level(iter, kvm, root, target_level + 1, start, end) { retry: if (tdp_mmu_iter_cond_resched(kvm, &iter, false, shared)) continue; @@ -1492,17 +1559,31 @@ retry: continue; if (!sp) { - sp = tdp_mmu_alloc_sp_for_split(kvm, &iter, shared); + rcu_read_unlock(); + + if (shared) + read_unlock(&kvm->mmu_lock); + else + write_unlock(&kvm->mmu_lock); + + sp = tdp_mmu_alloc_sp_for_split(); + + if (shared) + read_lock(&kvm->mmu_lock); + else + write_lock(&kvm->mmu_lock); + if (!sp) { - ret = -ENOMEM; trace_kvm_mmu_split_huge_page(iter.gfn, iter.old_spte, - iter.level, ret); - break; + iter.level, -ENOMEM); + return -ENOMEM; } - if (iter.yielded) - continue; + rcu_read_lock(); + + iter.yielded = true; + continue; } tdp_mmu_init_child_sp(sp, &iter); @@ -1523,7 +1604,7 @@ retry: if (sp) tdp_mmu_free_sp(sp); - return ret; + return 0; } @@ -1548,23 +1629,26 @@ void kvm_tdp_mmu_try_split_huge_pages(struct kvm *kvm, } } -/* - * Clear the dirty status of all the SPTEs mapping GFNs in the memslot. If - * AD bits are enabled, this will involve clearing the dirty bit on each SPTE. - * If AD bits are not enabled, this will require clearing the writable bit on - * each SPTE. Returns true if an SPTE has been changed and the TLBs need to - * be flushed. - */ -static bool clear_dirty_gfn_range(struct kvm *kvm, struct kvm_mmu_page *root, - gfn_t start, gfn_t end) +static bool tdp_mmu_need_write_protect(struct kvm *kvm, struct kvm_mmu_page *sp) { - u64 dbit = kvm_ad_enabled() ? shadow_dirty_mask : PT_WRITABLE_MASK; + /* + * All TDP MMU shadow pages share the same role as their root, aside + * from level, so it is valid to key off any shadow page to determine if + * write protection is needed for an entire tree. + */ + return kvm_mmu_page_ad_need_write_protect(kvm, sp) || !kvm_ad_enabled; +} + +static void clear_dirty_gfn_range(struct kvm *kvm, struct kvm_mmu_page *root, + gfn_t start, gfn_t end) +{ + const u64 dbit = tdp_mmu_need_write_protect(kvm, root) ? + PT_WRITABLE_MASK : shadow_dirty_mask; struct tdp_iter iter; - bool spte_set = false; rcu_read_lock(); - tdp_root_for_each_pte(iter, root, start, end) { + tdp_root_for_each_pte(iter, kvm, root, start, end) { retry: if (!is_shadow_present_pte(iter.old_spte) || !is_last_spte(iter.old_spte, iter.level)) @@ -1573,7 +1657,7 @@ retry: if (tdp_mmu_iter_cond_resched(kvm, &iter, false, true)) continue; - KVM_MMU_WARN_ON(kvm_ad_enabled() && + KVM_MMU_WARN_ON(dbit == shadow_dirty_mask && spte_ad_need_write_protect(iter.old_spte)); if (!(iter.old_spte & dbit)) @@ -1581,59 +1665,43 @@ retry: if (tdp_mmu_set_spte_atomic(kvm, &iter, iter.old_spte & ~dbit)) goto retry; - - spte_set = true; } rcu_read_unlock(); - return spte_set; } /* - * Clear the dirty status of all the SPTEs mapping GFNs in the memslot. If - * AD bits are enabled, this will involve clearing the dirty bit on each SPTE. - * If AD bits are not enabled, this will require clearing the writable bit on - * each SPTE. Returns true if an SPTE has been changed and the TLBs need to - * be flushed. + * Clear the dirty status (D-bit or W-bit) of all the SPTEs mapping GFNs in the + * memslot. */ -bool kvm_tdp_mmu_clear_dirty_slot(struct kvm *kvm, +void kvm_tdp_mmu_clear_dirty_slot(struct kvm *kvm, const struct kvm_memory_slot *slot) { struct kvm_mmu_page *root; - bool spte_set = false; lockdep_assert_held_read(&kvm->mmu_lock); for_each_valid_tdp_mmu_root_yield_safe(kvm, root, slot->as_id) - spte_set |= clear_dirty_gfn_range(kvm, root, slot->base_gfn, - slot->base_gfn + slot->npages); - - return spte_set; + clear_dirty_gfn_range(kvm, root, slot->base_gfn, + slot->base_gfn + slot->npages); } -/* - * Clears the dirty status of all the 4k SPTEs mapping GFNs for which a bit is - * set in mask, starting at gfn. The given memslot is expected to contain all - * the GFNs represented by set bits in the mask. If AD bits are enabled, - * clearing the dirty status will involve clearing the dirty bit on each SPTE - * or, if AD bits are not enabled, clearing the writable bit on each SPTE. - */ static void clear_dirty_pt_masked(struct kvm *kvm, struct kvm_mmu_page *root, gfn_t gfn, unsigned long mask, bool wrprot) { - u64 dbit = (wrprot || !kvm_ad_enabled()) ? PT_WRITABLE_MASK : - shadow_dirty_mask; + const u64 dbit = (wrprot || tdp_mmu_need_write_protect(kvm, root)) ? + PT_WRITABLE_MASK : shadow_dirty_mask; struct tdp_iter iter; lockdep_assert_held_write(&kvm->mmu_lock); rcu_read_lock(); - tdp_root_for_each_leaf_pte(iter, root, gfn + __ffs(mask), + tdp_root_for_each_leaf_pte(iter, kvm, root, gfn + __ffs(mask), gfn + BITS_PER_LONG) { if (!mask) break; - KVM_MMU_WARN_ON(kvm_ad_enabled() && + KVM_MMU_WARN_ON(dbit == shadow_dirty_mask && spte_ad_need_write_protect(iter.old_spte)); if (iter.level > PG_LEVEL_4K || @@ -1652,18 +1720,15 @@ static void clear_dirty_pt_masked(struct kvm *kvm, struct kvm_mmu_page *root, trace_kvm_tdp_mmu_spte_changed(iter.as_id, iter.gfn, iter.level, iter.old_spte, iter.old_spte & ~dbit); - kvm_set_pfn_dirty(spte_to_pfn(iter.old_spte)); } rcu_read_unlock(); } /* - * Clears the dirty status of all the 4k SPTEs mapping GFNs for which a bit is - * set in mask, starting at gfn. The given memslot is expected to contain all - * the GFNs represented by set bits in the mask. If AD bits are enabled, - * clearing the dirty status will involve clearing the dirty bit on each SPTE - * or, if AD bits are not enabled, clearing the writable bit on each SPTE. + * Clear the dirty status (D-bit or W-bit) of all the 4k SPTEs mapping GFNs for + * which a bit is set in mask, starting at gfn. The given memslot is expected to + * contain all the GFNs represented by set bits in the mask. */ void kvm_tdp_mmu_clear_dirty_pt_masked(struct kvm *kvm, struct kvm_memory_slot *slot, @@ -1676,21 +1741,55 @@ void kvm_tdp_mmu_clear_dirty_pt_masked(struct kvm *kvm, clear_dirty_pt_masked(kvm, root, gfn, mask, wrprot); } -static void zap_collapsible_spte_range(struct kvm *kvm, - struct kvm_mmu_page *root, - const struct kvm_memory_slot *slot) +static int tdp_mmu_make_huge_spte(struct kvm *kvm, + struct tdp_iter *parent, + u64 *huge_spte) +{ + struct kvm_mmu_page *root = spte_to_child_sp(parent->old_spte); + gfn_t start = parent->gfn; + gfn_t end = start + KVM_PAGES_PER_HPAGE(parent->level); + struct tdp_iter iter; + + tdp_root_for_each_leaf_pte(iter, kvm, root, start, end) { + /* + * Use the parent iterator when checking for forward progress so + * that KVM doesn't get stuck continuously trying to yield (i.e. + * returning -EAGAIN here and then failing the forward progress + * check in the caller ad nauseam). + */ + if (tdp_mmu_iter_need_resched(kvm, parent)) + return -EAGAIN; + + *huge_spte = make_huge_spte(kvm, iter.old_spte, parent->level); + return 0; + } + + return -ENOENT; +} + +static void recover_huge_pages_range(struct kvm *kvm, + struct kvm_mmu_page *root, + const struct kvm_memory_slot *slot) { gfn_t start = slot->base_gfn; gfn_t end = start + slot->npages; struct tdp_iter iter; int max_mapping_level; + bool flush = false; + u64 huge_spte; + int r; + + if (WARN_ON_ONCE(kvm_slot_dirty_track_enabled(slot))) + return; rcu_read_lock(); - for_each_tdp_pte_min_level(iter, root, PG_LEVEL_2M, start, end) { + for_each_tdp_pte_min_level(iter, kvm, root, PG_LEVEL_2M, start, end) { retry: - if (tdp_mmu_iter_cond_resched(kvm, &iter, false, true)) + if (tdp_mmu_iter_cond_resched(kvm, &iter, flush, true)) { + flush = false; continue; + } if (iter.level > KVM_MAX_HUGEPAGE_LEVEL || !is_shadow_present_pte(iter.old_spte)) @@ -1714,31 +1813,40 @@ retry: if (iter.gfn < start || iter.gfn >= end) continue; - max_mapping_level = kvm_mmu_max_mapping_level(kvm, slot, - iter.gfn, PG_LEVEL_NUM); + max_mapping_level = kvm_mmu_max_mapping_level(kvm, slot, iter.gfn); if (max_mapping_level < iter.level) continue; - /* Note, a successful atomic zap also does a remote TLB flush. */ - if (tdp_mmu_zap_spte_atomic(kvm, &iter)) + r = tdp_mmu_make_huge_spte(kvm, &iter, &huge_spte); + if (r == -EAGAIN) goto retry; + else if (r) + continue; + + if (tdp_mmu_set_spte_atomic(kvm, &iter, huge_spte)) + goto retry; + + flush = true; } + if (flush) + kvm_flush_remote_tlbs_memslot(kvm, slot); + rcu_read_unlock(); } /* - * Zap non-leaf SPTEs (and free their associated page tables) which could - * be replaced by huge pages, for GFNs within the slot. + * Recover huge page mappings within the slot by replacing non-leaf SPTEs with + * huge SPTEs where possible. */ -void kvm_tdp_mmu_zap_collapsible_sptes(struct kvm *kvm, - const struct kvm_memory_slot *slot) +void kvm_tdp_mmu_recover_huge_pages(struct kvm *kvm, + const struct kvm_memory_slot *slot) { struct kvm_mmu_page *root; lockdep_assert_held_read(&kvm->mmu_lock); for_each_valid_tdp_mmu_root_yield_safe(kvm, root, slot->as_id) - zap_collapsible_spte_range(kvm, root, slot); + recover_huge_pages_range(kvm, root, slot); } /* @@ -1757,7 +1865,7 @@ static bool write_protect_gfn(struct kvm *kvm, struct kvm_mmu_page *root, rcu_read_lock(); - for_each_tdp_pte_min_level(iter, root, min_level, gfn, gfn + 1) { + for_each_tdp_pte_min_level(iter, kvm, root, min_level, gfn, gfn + 1) { if (!is_shadow_present_pte(iter.old_spte) || !is_last_spte(iter.old_spte, iter.level)) continue; @@ -1802,17 +1910,14 @@ bool kvm_tdp_mmu_write_protect_gfn(struct kvm *kvm, * * Must be called between kvm_tdp_mmu_walk_lockless_{begin,end}. */ -int kvm_tdp_mmu_get_walk(struct kvm_vcpu *vcpu, u64 addr, u64 *sptes, - int *root_level) +static int __kvm_tdp_mmu_get_walk(struct kvm_vcpu *vcpu, u64 addr, u64 *sptes, + struct kvm_mmu_page *root) { struct tdp_iter iter; - struct kvm_mmu *mmu = vcpu->arch.mmu; gfn_t gfn = addr >> PAGE_SHIFT; int leaf = -1; - *root_level = vcpu->arch.mmu->root_role.level; - - tdp_mmu_for_each_pte(iter, mmu, gfn, gfn + 1) { + for_each_tdp_pte(iter, vcpu->kvm, root, gfn, gfn + 1) { leaf = iter.level; sptes[leaf] = iter.old_spte; } @@ -1820,6 +1925,36 @@ int kvm_tdp_mmu_get_walk(struct kvm_vcpu *vcpu, u64 addr, u64 *sptes, return leaf; } +int kvm_tdp_mmu_get_walk(struct kvm_vcpu *vcpu, u64 addr, u64 *sptes, + int *root_level) +{ + struct kvm_mmu_page *root = root_to_sp(vcpu->arch.mmu->root.hpa); + *root_level = vcpu->arch.mmu->root_role.level; + + return __kvm_tdp_mmu_get_walk(vcpu, addr, sptes, root); +} + +bool kvm_tdp_mmu_gpa_is_mapped(struct kvm_vcpu *vcpu, u64 gpa) +{ + struct kvm *kvm = vcpu->kvm; + bool is_direct = kvm_is_addr_direct(kvm, gpa); + hpa_t root = is_direct ? vcpu->arch.mmu->root.hpa : + vcpu->arch.mmu->mirror_root_hpa; + u64 sptes[PT64_ROOT_MAX_LEVEL + 1], spte; + int leaf; + + lockdep_assert_held(&kvm->mmu_lock); + rcu_read_lock(); + leaf = __kvm_tdp_mmu_get_walk(vcpu, gpa, sptes, root_to_sp(root)); + rcu_read_unlock(); + if (leaf < 0) + return false; + + spte = sptes[leaf]; + return is_shadow_present_pte(spte) && is_last_spte(spte, leaf); +} +EXPORT_SYMBOL_GPL(kvm_tdp_mmu_gpa_is_mapped); + /* * Returns the last level spte pointer of the shadow page walk for the given * gpa, and sets *spte to the spte value. This spte may be non-preset. If no @@ -1831,15 +1966,15 @@ int kvm_tdp_mmu_get_walk(struct kvm_vcpu *vcpu, u64 addr, u64 *sptes, * * WARNING: This function is only intended to be called during fast_page_fault. */ -u64 *kvm_tdp_mmu_fast_pf_get_last_sptep(struct kvm_vcpu *vcpu, u64 addr, +u64 *kvm_tdp_mmu_fast_pf_get_last_sptep(struct kvm_vcpu *vcpu, gfn_t gfn, u64 *spte) { + /* Fast pf is not supported for mirrored roots */ + struct kvm_mmu_page *root = tdp_mmu_get_root(vcpu, KVM_DIRECT_ROOTS); struct tdp_iter iter; - struct kvm_mmu *mmu = vcpu->arch.mmu; - gfn_t gfn = addr >> PAGE_SHIFT; tdp_ptep_t sptep = NULL; - tdp_mmu_for_each_pte(iter, mmu, gfn, gfn + 1) { + for_each_tdp_pte(iter, vcpu->kvm, root, gfn, gfn + 1) { *spte = iter.old_spte; sptep = iter.sptep; } diff --git a/arch/x86/kvm/mmu/tdp_mmu.h b/arch/x86/kvm/mmu/tdp_mmu.h index 6e1ea04ca885..52acf99d40a0 100644 --- a/arch/x86/kvm/mmu/tdp_mmu.h +++ b/arch/x86/kvm/mmu/tdp_mmu.h @@ -10,7 +10,7 @@ void kvm_mmu_init_tdp_mmu(struct kvm *kvm); void kvm_mmu_uninit_tdp_mmu(struct kvm *kvm); -int kvm_tdp_mmu_alloc_root(struct kvm_vcpu *vcpu); +void kvm_tdp_mmu_alloc_root(struct kvm_vcpu *vcpu, bool private); __must_check static inline bool kvm_tdp_mmu_get_root(struct kvm_mmu_page *root) { @@ -19,11 +19,56 @@ __must_check static inline bool kvm_tdp_mmu_get_root(struct kvm_mmu_page *root) void kvm_tdp_mmu_put_root(struct kvm *kvm, struct kvm_mmu_page *root); +enum kvm_tdp_mmu_root_types { + KVM_INVALID_ROOTS = BIT(0), + KVM_DIRECT_ROOTS = BIT(1), + KVM_MIRROR_ROOTS = BIT(2), + KVM_VALID_ROOTS = KVM_DIRECT_ROOTS | KVM_MIRROR_ROOTS, + KVM_ALL_ROOTS = KVM_VALID_ROOTS | KVM_INVALID_ROOTS, +}; + +static inline enum kvm_tdp_mmu_root_types kvm_gfn_range_filter_to_root_types(struct kvm *kvm, + enum kvm_gfn_range_filter process) +{ + enum kvm_tdp_mmu_root_types ret = 0; + + if (!kvm_has_mirrored_tdp(kvm)) + return KVM_DIRECT_ROOTS; + + if (process & KVM_FILTER_PRIVATE) + ret |= KVM_MIRROR_ROOTS; + if (process & KVM_FILTER_SHARED) + ret |= KVM_DIRECT_ROOTS; + + WARN_ON_ONCE(!ret); + + return ret; +} + +static inline struct kvm_mmu_page *tdp_mmu_get_root_for_fault(struct kvm_vcpu *vcpu, + struct kvm_page_fault *fault) +{ + if (unlikely(!kvm_is_addr_direct(vcpu->kvm, fault->addr))) + return root_to_sp(vcpu->arch.mmu->mirror_root_hpa); + + return root_to_sp(vcpu->arch.mmu->root.hpa); +} + +static inline struct kvm_mmu_page *tdp_mmu_get_root(struct kvm_vcpu *vcpu, + enum kvm_tdp_mmu_root_types type) +{ + if (unlikely(type == KVM_MIRROR_ROOTS)) + return root_to_sp(vcpu->arch.mmu->mirror_root_hpa); + + return root_to_sp(vcpu->arch.mmu->root.hpa); +} + bool kvm_tdp_mmu_zap_leafs(struct kvm *kvm, gfn_t start, gfn_t end, bool flush); bool kvm_tdp_mmu_zap_sp(struct kvm *kvm, struct kvm_mmu_page *sp); void kvm_tdp_mmu_zap_all(struct kvm *kvm); -void kvm_tdp_mmu_invalidate_all_roots(struct kvm *kvm); -void kvm_tdp_mmu_zap_invalidated_roots(struct kvm *kvm); +void kvm_tdp_mmu_invalidate_roots(struct kvm *kvm, + enum kvm_tdp_mmu_root_types root_types); +void kvm_tdp_mmu_zap_invalidated_roots(struct kvm *kvm, bool shared); int kvm_tdp_mmu_map(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault); @@ -31,18 +76,17 @@ bool kvm_tdp_mmu_unmap_gfn_range(struct kvm *kvm, struct kvm_gfn_range *range, bool flush); bool kvm_tdp_mmu_age_gfn_range(struct kvm *kvm, struct kvm_gfn_range *range); bool kvm_tdp_mmu_test_age_gfn(struct kvm *kvm, struct kvm_gfn_range *range); -bool kvm_tdp_mmu_set_spte_gfn(struct kvm *kvm, struct kvm_gfn_range *range); bool kvm_tdp_mmu_wrprot_slot(struct kvm *kvm, const struct kvm_memory_slot *slot, int min_level); -bool kvm_tdp_mmu_clear_dirty_slot(struct kvm *kvm, +void kvm_tdp_mmu_clear_dirty_slot(struct kvm *kvm, const struct kvm_memory_slot *slot); void kvm_tdp_mmu_clear_dirty_pt_masked(struct kvm *kvm, struct kvm_memory_slot *slot, gfn_t gfn, unsigned long mask, bool wrprot); -void kvm_tdp_mmu_zap_collapsible_sptes(struct kvm *kvm, - const struct kvm_memory_slot *slot); +void kvm_tdp_mmu_recover_huge_pages(struct kvm *kvm, + const struct kvm_memory_slot *slot); bool kvm_tdp_mmu_write_protect_gfn(struct kvm *kvm, struct kvm_memory_slot *slot, gfn_t gfn, @@ -65,7 +109,7 @@ static inline void kvm_tdp_mmu_walk_lockless_end(void) int kvm_tdp_mmu_get_walk(struct kvm_vcpu *vcpu, u64 addr, u64 *sptes, int *root_level); -u64 *kvm_tdp_mmu_fast_pf_get_last_sptep(struct kvm_vcpu *vcpu, u64 addr, +u64 *kvm_tdp_mmu_fast_pf_get_last_sptep(struct kvm_vcpu *vcpu, gfn_t gfn, u64 *spte); #ifdef CONFIG_X86_64 diff --git a/arch/x86/kvm/mtrr.c b/arch/x86/kvm/mtrr.c index a67c28a56417..6f74e2b27c1e 100644 --- a/arch/x86/kvm/mtrr.c +++ b/arch/x86/kvm/mtrr.c @@ -19,33 +19,22 @@ #include <asm/mtrr.h> #include "cpuid.h" -#include "mmu.h" +#include "x86.h" -#define IA32_MTRR_DEF_TYPE_E (1ULL << 11) -#define IA32_MTRR_DEF_TYPE_FE (1ULL << 10) -#define IA32_MTRR_DEF_TYPE_TYPE_MASK (0xff) - -static bool is_mtrr_base_msr(unsigned int msr) -{ - /* MTRR base MSRs use even numbers, masks use odd numbers. */ - return !(msr & 0x1); -} - -static struct kvm_mtrr_range *var_mtrr_msr_to_range(struct kvm_vcpu *vcpu, - unsigned int msr) +static u64 *find_mtrr(struct kvm_vcpu *vcpu, unsigned int msr) { - int index = (msr - MTRRphysBase_MSR(0)) / 2; - - return &vcpu->arch.mtrr_state.var_ranges[index]; -} + int index; -static bool msr_mtrr_valid(unsigned msr) -{ switch (msr) { case MTRRphysBase_MSR(0) ... MTRRphysMask_MSR(KVM_NR_VAR_MTRR - 1): + index = msr - MTRRphysBase_MSR(0); + return &vcpu->arch.mtrr_state.var[index]; case MSR_MTRRfix64K_00000: + return &vcpu->arch.mtrr_state.fixed_64k; case MSR_MTRRfix16K_80000: case MSR_MTRRfix16K_A0000: + index = msr - MSR_MTRRfix16K_80000; + return &vcpu->arch.mtrr_state.fixed_16k[index]; case MSR_MTRRfix4K_C0000: case MSR_MTRRfix4K_C8000: case MSR_MTRRfix4K_D0000: @@ -54,10 +43,14 @@ static bool msr_mtrr_valid(unsigned msr) case MSR_MTRRfix4K_E8000: case MSR_MTRRfix4K_F0000: case MSR_MTRRfix4K_F8000: + index = msr - MSR_MTRRfix4K_C0000; + return &vcpu->arch.mtrr_state.fixed_4k[index]; case MSR_MTRRdefType: - return true; + return &vcpu->arch.mtrr_state.deftype; + default: + break; } - return false; + return NULL; } static bool valid_mtrr_type(unsigned t) @@ -70,9 +63,6 @@ static bool kvm_mtrr_valid(struct kvm_vcpu *vcpu, u32 msr, u64 data) int i; u64 mask; - if (!msr_mtrr_valid(msr)) - return false; - if (msr == MSR_MTRRdefType) { if (data & ~0xcff) return false; @@ -85,8 +75,9 @@ static bool kvm_mtrr_valid(struct kvm_vcpu *vcpu, u32 msr, u64 data) } /* variable MTRRs */ - WARN_ON(!(msr >= MTRRphysBase_MSR(0) && - msr <= MTRRphysMask_MSR(KVM_NR_VAR_MTRR - 1))); + if (WARN_ON_ONCE(!(msr >= MTRRphysBase_MSR(0) && + msr <= MTRRphysMask_MSR(KVM_NR_VAR_MTRR - 1)))) + return false; mask = kvm_vcpu_reserved_gpa_bits_raw(vcpu); if ((msr & 1) == 0) { @@ -94,309 +85,32 @@ static bool kvm_mtrr_valid(struct kvm_vcpu *vcpu, u32 msr, u64 data) if (!valid_mtrr_type(data & 0xff)) return false; mask |= 0xf00; - } else + } else { /* MTRR mask */ mask |= 0x7ff; - - return (data & mask) == 0; -} - -static bool mtrr_is_enabled(struct kvm_mtrr *mtrr_state) -{ - return !!(mtrr_state->deftype & IA32_MTRR_DEF_TYPE_E); -} - -static bool fixed_mtrr_is_enabled(struct kvm_mtrr *mtrr_state) -{ - return !!(mtrr_state->deftype & IA32_MTRR_DEF_TYPE_FE); -} - -static u8 mtrr_default_type(struct kvm_mtrr *mtrr_state) -{ - return mtrr_state->deftype & IA32_MTRR_DEF_TYPE_TYPE_MASK; -} - -static u8 mtrr_disabled_type(struct kvm_vcpu *vcpu) -{ - /* - * Intel SDM 11.11.2.2: all MTRRs are disabled when - * IA32_MTRR_DEF_TYPE.E bit is cleared, and the UC - * memory type is applied to all of physical memory. - * - * However, virtual machines can be run with CPUID such that - * there are no MTRRs. In that case, the firmware will never - * enable MTRRs and it is obviously undesirable to run the - * guest entirely with UC memory and we use WB. - */ - if (guest_cpuid_has(vcpu, X86_FEATURE_MTRR)) - return MTRR_TYPE_UNCACHABLE; - else - return MTRR_TYPE_WRBACK; -} - -/* -* Three terms are used in the following code: -* - segment, it indicates the address segments covered by fixed MTRRs. -* - unit, it corresponds to the MSR entry in the segment. -* - range, a range is covered in one memory cache type. -*/ -struct fixed_mtrr_segment { - u64 start; - u64 end; - - int range_shift; - - /* the start position in kvm_mtrr.fixed_ranges[]. */ - int range_start; -}; - -static struct fixed_mtrr_segment fixed_seg_table[] = { - /* MSR_MTRRfix64K_00000, 1 unit. 64K fixed mtrr. */ - { - .start = 0x0, - .end = 0x80000, - .range_shift = 16, /* 64K */ - .range_start = 0, - }, - - /* - * MSR_MTRRfix16K_80000 ... MSR_MTRRfix16K_A0000, 2 units, - * 16K fixed mtrr. - */ - { - .start = 0x80000, - .end = 0xc0000, - .range_shift = 14, /* 16K */ - .range_start = 8, - }, - - /* - * MSR_MTRRfix4K_C0000 ... MSR_MTRRfix4K_F8000, 8 units, - * 4K fixed mtrr. - */ - { - .start = 0xc0000, - .end = 0x100000, - .range_shift = 12, /* 12K */ - .range_start = 24, - } -}; - -/* - * The size of unit is covered in one MSR, one MSR entry contains - * 8 ranges so that unit size is always 8 * 2^range_shift. - */ -static u64 fixed_mtrr_seg_unit_size(int seg) -{ - return 8 << fixed_seg_table[seg].range_shift; -} - -static bool fixed_msr_to_seg_unit(u32 msr, int *seg, int *unit) -{ - switch (msr) { - case MSR_MTRRfix64K_00000: - *seg = 0; - *unit = 0; - break; - case MSR_MTRRfix16K_80000 ... MSR_MTRRfix16K_A0000: - *seg = 1; - *unit = array_index_nospec( - msr - MSR_MTRRfix16K_80000, - MSR_MTRRfix16K_A0000 - MSR_MTRRfix16K_80000 + 1); - break; - case MSR_MTRRfix4K_C0000 ... MSR_MTRRfix4K_F8000: - *seg = 2; - *unit = array_index_nospec( - msr - MSR_MTRRfix4K_C0000, - MSR_MTRRfix4K_F8000 - MSR_MTRRfix4K_C0000 + 1); - break; - default: - return false; } - return true; -} - -static void fixed_mtrr_seg_unit_range(int seg, int unit, u64 *start, u64 *end) -{ - struct fixed_mtrr_segment *mtrr_seg = &fixed_seg_table[seg]; - u64 unit_size = fixed_mtrr_seg_unit_size(seg); - - *start = mtrr_seg->start + unit * unit_size; - *end = *start + unit_size; - WARN_ON(*end > mtrr_seg->end); -} - -static int fixed_mtrr_seg_unit_range_index(int seg, int unit) -{ - struct fixed_mtrr_segment *mtrr_seg = &fixed_seg_table[seg]; - - WARN_ON(mtrr_seg->start + unit * fixed_mtrr_seg_unit_size(seg) - > mtrr_seg->end); - - /* each unit has 8 ranges. */ - return mtrr_seg->range_start + 8 * unit; -} - -static int fixed_mtrr_seg_end_range_index(int seg) -{ - struct fixed_mtrr_segment *mtrr_seg = &fixed_seg_table[seg]; - int n; - - n = (mtrr_seg->end - mtrr_seg->start) >> mtrr_seg->range_shift; - return mtrr_seg->range_start + n - 1; -} - -static bool fixed_msr_to_range(u32 msr, u64 *start, u64 *end) -{ - int seg, unit; - - if (!fixed_msr_to_seg_unit(msr, &seg, &unit)) - return false; - - fixed_mtrr_seg_unit_range(seg, unit, start, end); - return true; -} - -static int fixed_msr_to_range_index(u32 msr) -{ - int seg, unit; - - if (!fixed_msr_to_seg_unit(msr, &seg, &unit)) - return -1; - - return fixed_mtrr_seg_unit_range_index(seg, unit); -} - -static int fixed_mtrr_addr_to_seg(u64 addr) -{ - struct fixed_mtrr_segment *mtrr_seg; - int seg, seg_num = ARRAY_SIZE(fixed_seg_table); - - for (seg = 0; seg < seg_num; seg++) { - mtrr_seg = &fixed_seg_table[seg]; - if (mtrr_seg->start <= addr && addr < mtrr_seg->end) - return seg; - } - - return -1; -} - -static int fixed_mtrr_addr_seg_to_range_index(u64 addr, int seg) -{ - struct fixed_mtrr_segment *mtrr_seg; - int index; - - mtrr_seg = &fixed_seg_table[seg]; - index = mtrr_seg->range_start; - index += (addr - mtrr_seg->start) >> mtrr_seg->range_shift; - return index; -} - -static u64 fixed_mtrr_range_end_addr(int seg, int index) -{ - struct fixed_mtrr_segment *mtrr_seg = &fixed_seg_table[seg]; - int pos = index - mtrr_seg->range_start; - - return mtrr_seg->start + ((pos + 1) << mtrr_seg->range_shift); -} - -static void var_mtrr_range(struct kvm_mtrr_range *range, u64 *start, u64 *end) -{ - u64 mask; - - *start = range->base & PAGE_MASK; - - mask = range->mask & PAGE_MASK; - - /* This cannot overflow because writing to the reserved bits of - * variable MTRRs causes a #GP. - */ - *end = (*start | ~mask) + 1; -} - -static void update_mtrr(struct kvm_vcpu *vcpu, u32 msr) -{ - struct kvm_mtrr *mtrr_state = &vcpu->arch.mtrr_state; - gfn_t start, end; - - if (!kvm_mmu_honors_guest_mtrrs(vcpu->kvm)) - return; - - if (!mtrr_is_enabled(mtrr_state) && msr != MSR_MTRRdefType) - return; - - /* fixed MTRRs. */ - if (fixed_msr_to_range(msr, &start, &end)) { - if (!fixed_mtrr_is_enabled(mtrr_state)) - return; - } else if (msr == MSR_MTRRdefType) { - start = 0x0; - end = ~0ULL; - } else { - /* variable range MTRRs. */ - var_mtrr_range(var_mtrr_msr_to_range(vcpu, msr), &start, &end); - } - - kvm_zap_gfn_range(vcpu->kvm, gpa_to_gfn(start), gpa_to_gfn(end)); -} - -static bool var_mtrr_range_is_valid(struct kvm_mtrr_range *range) -{ - return (range->mask & (1 << 11)) != 0; -} - -static void set_var_mtrr_msr(struct kvm_vcpu *vcpu, u32 msr, u64 data) -{ - struct kvm_mtrr *mtrr_state = &vcpu->arch.mtrr_state; - struct kvm_mtrr_range *tmp, *cur; - - cur = var_mtrr_msr_to_range(vcpu, msr); - - /* remove the entry if it's in the list. */ - if (var_mtrr_range_is_valid(cur)) - list_del(&cur->node); - - /* - * Set all illegal GPA bits in the mask, since those bits must - * implicitly be 0. The bits are then cleared when reading them. - */ - if (is_mtrr_base_msr(msr)) - cur->base = data; - else - cur->mask = data | kvm_vcpu_reserved_gpa_bits_raw(vcpu); - - /* add it to the list if it's enabled. */ - if (var_mtrr_range_is_valid(cur)) { - list_for_each_entry(tmp, &mtrr_state->head, node) - if (cur->base >= tmp->base) - break; - list_add_tail(&cur->node, &tmp->node); - } + return (data & mask) == 0; } int kvm_mtrr_set_msr(struct kvm_vcpu *vcpu, u32 msr, u64 data) { - int index; + u64 *mtrr; - if (!kvm_mtrr_valid(vcpu, msr, data)) + mtrr = find_mtrr(vcpu, msr); + if (!mtrr) return 1; - index = fixed_msr_to_range_index(msr); - if (index >= 0) - *(u64 *)&vcpu->arch.mtrr_state.fixed_ranges[index] = data; - else if (msr == MSR_MTRRdefType) - vcpu->arch.mtrr_state.deftype = data; - else - set_var_mtrr_msr(vcpu, msr, data); + if (!kvm_mtrr_valid(vcpu, msr, data)) + return 1; - update_mtrr(vcpu, msr); + *mtrr = data; return 0; } int kvm_mtrr_get_msr(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata) { - int index; + u64 *mtrr; /* MSR_MTRRcap is a readonly MSR. */ if (msr == MSR_MTRRcap) { @@ -410,311 +124,10 @@ int kvm_mtrr_get_msr(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata) return 0; } - if (!msr_mtrr_valid(msr)) + mtrr = find_mtrr(vcpu, msr); + if (!mtrr) return 1; - index = fixed_msr_to_range_index(msr); - if (index >= 0) { - *pdata = *(u64 *)&vcpu->arch.mtrr_state.fixed_ranges[index]; - } else if (msr == MSR_MTRRdefType) { - *pdata = vcpu->arch.mtrr_state.deftype; - } else { - /* Variable MTRRs */ - if (is_mtrr_base_msr(msr)) - *pdata = var_mtrr_msr_to_range(vcpu, msr)->base; - else - *pdata = var_mtrr_msr_to_range(vcpu, msr)->mask; - - *pdata &= ~kvm_vcpu_reserved_gpa_bits_raw(vcpu); - } - + *pdata = *mtrr; return 0; } - -void kvm_vcpu_mtrr_init(struct kvm_vcpu *vcpu) -{ - INIT_LIST_HEAD(&vcpu->arch.mtrr_state.head); -} - -struct mtrr_iter { - /* input fields. */ - struct kvm_mtrr *mtrr_state; - u64 start; - u64 end; - - /* output fields. */ - int mem_type; - /* mtrr is completely disabled? */ - bool mtrr_disabled; - /* [start, end) is not fully covered in MTRRs? */ - bool partial_map; - - /* private fields. */ - union { - /* used for fixed MTRRs. */ - struct { - int index; - int seg; - }; - - /* used for var MTRRs. */ - struct { - struct kvm_mtrr_range *range; - /* max address has been covered in var MTRRs. */ - u64 start_max; - }; - }; - - bool fixed; -}; - -static bool mtrr_lookup_fixed_start(struct mtrr_iter *iter) -{ - int seg, index; - - if (!fixed_mtrr_is_enabled(iter->mtrr_state)) - return false; - - seg = fixed_mtrr_addr_to_seg(iter->start); - if (seg < 0) - return false; - - iter->fixed = true; - index = fixed_mtrr_addr_seg_to_range_index(iter->start, seg); - iter->index = index; - iter->seg = seg; - return true; -} - -static bool match_var_range(struct mtrr_iter *iter, - struct kvm_mtrr_range *range) -{ - u64 start, end; - - var_mtrr_range(range, &start, &end); - if (!(start >= iter->end || end <= iter->start)) { - iter->range = range; - - /* - * the function is called when we do kvm_mtrr.head walking. - * Range has the minimum base address which interleaves - * [looker->start_max, looker->end). - */ - iter->partial_map |= iter->start_max < start; - - /* update the max address has been covered. */ - iter->start_max = max(iter->start_max, end); - return true; - } - - return false; -} - -static void __mtrr_lookup_var_next(struct mtrr_iter *iter) -{ - struct kvm_mtrr *mtrr_state = iter->mtrr_state; - - list_for_each_entry_continue(iter->range, &mtrr_state->head, node) - if (match_var_range(iter, iter->range)) - return; - - iter->range = NULL; - iter->partial_map |= iter->start_max < iter->end; -} - -static void mtrr_lookup_var_start(struct mtrr_iter *iter) -{ - struct kvm_mtrr *mtrr_state = iter->mtrr_state; - - iter->fixed = false; - iter->start_max = iter->start; - iter->range = NULL; - iter->range = list_prepare_entry(iter->range, &mtrr_state->head, node); - - __mtrr_lookup_var_next(iter); -} - -static void mtrr_lookup_fixed_next(struct mtrr_iter *iter) -{ - /* terminate the lookup. */ - if (fixed_mtrr_range_end_addr(iter->seg, iter->index) >= iter->end) { - iter->fixed = false; - iter->range = NULL; - return; - } - - iter->index++; - - /* have looked up for all fixed MTRRs. */ - if (iter->index >= ARRAY_SIZE(iter->mtrr_state->fixed_ranges)) - return mtrr_lookup_var_start(iter); - - /* switch to next segment. */ - if (iter->index > fixed_mtrr_seg_end_range_index(iter->seg)) - iter->seg++; -} - -static void mtrr_lookup_var_next(struct mtrr_iter *iter) -{ - __mtrr_lookup_var_next(iter); -} - -static void mtrr_lookup_start(struct mtrr_iter *iter) -{ - if (!mtrr_is_enabled(iter->mtrr_state)) { - iter->mtrr_disabled = true; - return; - } - - if (!mtrr_lookup_fixed_start(iter)) - mtrr_lookup_var_start(iter); -} - -static void mtrr_lookup_init(struct mtrr_iter *iter, - struct kvm_mtrr *mtrr_state, u64 start, u64 end) -{ - iter->mtrr_state = mtrr_state; - iter->start = start; - iter->end = end; - iter->mtrr_disabled = false; - iter->partial_map = false; - iter->fixed = false; - iter->range = NULL; - - mtrr_lookup_start(iter); -} - -static bool mtrr_lookup_okay(struct mtrr_iter *iter) -{ - if (iter->fixed) { - iter->mem_type = iter->mtrr_state->fixed_ranges[iter->index]; - return true; - } - - if (iter->range) { - iter->mem_type = iter->range->base & 0xff; - return true; - } - - return false; -} - -static void mtrr_lookup_next(struct mtrr_iter *iter) -{ - if (iter->fixed) - mtrr_lookup_fixed_next(iter); - else - mtrr_lookup_var_next(iter); -} - -#define mtrr_for_each_mem_type(_iter_, _mtrr_, _gpa_start_, _gpa_end_) \ - for (mtrr_lookup_init(_iter_, _mtrr_, _gpa_start_, _gpa_end_); \ - mtrr_lookup_okay(_iter_); mtrr_lookup_next(_iter_)) - -u8 kvm_mtrr_get_guest_memory_type(struct kvm_vcpu *vcpu, gfn_t gfn) -{ - struct kvm_mtrr *mtrr_state = &vcpu->arch.mtrr_state; - struct mtrr_iter iter; - u64 start, end; - int type = -1; - const int wt_wb_mask = (1 << MTRR_TYPE_WRBACK) - | (1 << MTRR_TYPE_WRTHROUGH); - - start = gfn_to_gpa(gfn); - end = start + PAGE_SIZE; - - mtrr_for_each_mem_type(&iter, mtrr_state, start, end) { - int curr_type = iter.mem_type; - - /* - * Please refer to Intel SDM Volume 3: 11.11.4.1 MTRR - * Precedences. - */ - - if (type == -1) { - type = curr_type; - continue; - } - - /* - * If two or more variable memory ranges match and the - * memory types are identical, then that memory type is - * used. - */ - if (type == curr_type) - continue; - - /* - * If two or more variable memory ranges match and one of - * the memory types is UC, the UC memory type used. - */ - if (curr_type == MTRR_TYPE_UNCACHABLE) - return MTRR_TYPE_UNCACHABLE; - - /* - * If two or more variable memory ranges match and the - * memory types are WT and WB, the WT memory type is used. - */ - if (((1 << type) & wt_wb_mask) && - ((1 << curr_type) & wt_wb_mask)) { - type = MTRR_TYPE_WRTHROUGH; - continue; - } - - /* - * For overlaps not defined by the above rules, processor - * behavior is undefined. - */ - - /* We use WB for this undefined behavior. :( */ - return MTRR_TYPE_WRBACK; - } - - if (iter.mtrr_disabled) - return mtrr_disabled_type(vcpu); - - /* not contained in any MTRRs. */ - if (type == -1) - return mtrr_default_type(mtrr_state); - - /* - * We just check one page, partially covered by MTRRs is - * impossible. - */ - WARN_ON(iter.partial_map); - - return type; -} -EXPORT_SYMBOL_GPL(kvm_mtrr_get_guest_memory_type); - -bool kvm_mtrr_check_gfn_range_consistency(struct kvm_vcpu *vcpu, gfn_t gfn, - int page_num) -{ - struct kvm_mtrr *mtrr_state = &vcpu->arch.mtrr_state; - struct mtrr_iter iter; - u64 start, end; - int type = -1; - - start = gfn_to_gpa(gfn); - end = gfn_to_gpa(gfn + page_num); - mtrr_for_each_mem_type(&iter, mtrr_state, start, end) { - if (type == -1) { - type = iter.mem_type; - continue; - } - - if (type != iter.mem_type) - return false; - } - - if (iter.mtrr_disabled) - return true; - - if (!iter.partial_map) - return true; - - if (type == -1) - return true; - - return type == mtrr_default_type(mtrr_state); -} diff --git a/arch/x86/kvm/pmu.c b/arch/x86/kvm/pmu.c index c397b28e3d1b..75e9cfc689f8 100644 --- a/arch/x86/kvm/pmu.c +++ b/arch/x86/kvm/pmu.c @@ -34,16 +34,16 @@ EXPORT_SYMBOL_GPL(kvm_pmu_eventsel); /* Precise Distribution of Instructions Retired (PDIR) */ static const struct x86_cpu_id vmx_pebs_pdir_cpu[] = { - X86_MATCH_INTEL_FAM6_MODEL(ICELAKE_D, NULL), - X86_MATCH_INTEL_FAM6_MODEL(ICELAKE_X, NULL), + X86_MATCH_VFM(INTEL_ICELAKE_D, NULL), + X86_MATCH_VFM(INTEL_ICELAKE_X, NULL), /* Instruction-Accurate PDIR (PDIR++) */ - X86_MATCH_INTEL_FAM6_MODEL(SAPPHIRERAPIDS_X, NULL), + X86_MATCH_VFM(INTEL_SAPPHIRERAPIDS_X, NULL), {} }; /* Precise Distribution (PDist) */ static const struct x86_cpu_id vmx_pebs_pdist_cpu[] = { - X86_MATCH_INTEL_FAM6_MODEL(SAPPHIRERAPIDS_X, NULL), + X86_MATCH_VFM(INTEL_SAPPHIRERAPIDS_X, NULL), {} }; @@ -69,7 +69,7 @@ static const struct x86_cpu_id vmx_pebs_pdist_cpu[] = { * code. Each pmc, stored in kvm_pmc.idx field, is unique across * all perf counters (both gp and fixed). The mapping relationship * between pmc and perf counters is as the following: - * * Intel: [0 .. KVM_INTEL_PMC_MAX_GENERIC-1] <=> gp counters + * * Intel: [0 .. KVM_MAX_NR_INTEL_GP_COUNTERS-1] <=> gp counters * [KVM_FIXED_PMC_BASE_IDX .. KVM_FIXED_PMC_BASE_IDX + 2] <=> fixed * * AMD: [0 .. AMD64_NUM_COUNTERS-1] and, for families 15H * and later, [0 .. AMD64_NUM_COUNTERS_CORE-1] <=> gp counters @@ -194,7 +194,7 @@ static int pmc_reprogram_counter(struct kvm_pmc *pmc, u32 type, u64 config, attr.sample_period = get_sample_period(pmc, pmc->counter); if ((attr.config & HSW_IN_TX_CHECKPOINTED) && - guest_cpuid_is_intel(pmc->vcpu)) { + (boot_cpu_has(X86_FEATURE_RTM) || boot_cpu_has(X86_FEATURE_HLE))) { /* * HSW_IN_TX_CHECKPOINTED is not supported with nonzero * period. Just clear the sample period so at least @@ -469,11 +469,11 @@ static int reprogram_counter(struct kvm_pmc *pmc) if (pmc_is_fixed(pmc)) { fixed_ctr_ctrl = fixed_ctrl_field(pmu->fixed_ctr_ctrl, pmc->idx - KVM_FIXED_PMC_BASE_IDX); - if (fixed_ctr_ctrl & 0x1) + if (fixed_ctr_ctrl & INTEL_FIXED_0_KERNEL) eventsel |= ARCH_PERFMON_EVENTSEL_OS; - if (fixed_ctr_ctrl & 0x2) + if (fixed_ctr_ctrl & INTEL_FIXED_0_USER) eventsel |= ARCH_PERFMON_EVENTSEL_USR; - if (fixed_ctr_ctrl & 0x8) + if (fixed_ctr_ctrl & INTEL_FIXED_0_ENABLE_PMI) eventsel |= ARCH_PERFMON_EVENTSEL_INT; new_config = (u64)fixed_ctr_ctrl; } @@ -521,9 +521,9 @@ void kvm_pmu_handle_event(struct kvm_vcpu *vcpu) } /* - * Unused perf_events are only released if the corresponding MSRs - * weren't accessed during the last vCPU time slice. kvm_arch_sched_in - * triggers KVM_REQ_PMU if cleanup is needed. + * Release unused perf_events if the corresponding guest MSRs weren't + * accessed during the last vCPU time slice (need_cleanup is set when + * the vCPU is scheduled back in). */ if (unlikely(pmu->need_cleanup)) kvm_pmu_cleanup(vcpu); @@ -542,7 +542,7 @@ int kvm_pmu_check_rdpmc_early(struct kvm_vcpu *vcpu, unsigned int idx) if (!kvm_pmu_ops.check_rdpmc_early) return 0; - return static_call(kvm_x86_pmu_check_rdpmc_early)(vcpu, idx); + return kvm_pmu_call(check_rdpmc_early)(vcpu, idx); } bool is_vmware_backdoor_pmc(u32 pmc_idx) @@ -591,12 +591,12 @@ int kvm_pmu_rdpmc(struct kvm_vcpu *vcpu, unsigned idx, u64 *data) if (is_vmware_backdoor_pmc(idx)) return kvm_pmu_rdpmc_vmware(vcpu, idx, data); - pmc = static_call(kvm_x86_pmu_rdpmc_ecx_to_pmc)(vcpu, idx, &mask); + pmc = kvm_pmu_call(rdpmc_ecx_to_pmc)(vcpu, idx, &mask); if (!pmc) return 1; if (!kvm_is_cr4_bit_set(vcpu, X86_CR4_PCE) && - (static_call(kvm_x86_get_cpl)(vcpu) != 0) && + (kvm_x86_call(get_cpl)(vcpu) != 0) && kvm_is_cr0_bit_set(vcpu, X86_CR0_PE)) return 1; @@ -607,7 +607,7 @@ int kvm_pmu_rdpmc(struct kvm_vcpu *vcpu, unsigned idx, u64 *data) void kvm_pmu_deliver_pmi(struct kvm_vcpu *vcpu) { if (lapic_in_kernel(vcpu)) { - static_call_cond(kvm_x86_pmu_deliver_pmi)(vcpu); + kvm_pmu_call(deliver_pmi)(vcpu); kvm_apic_local_deliver(vcpu->arch.apic, APIC_LVTPC); } } @@ -622,14 +622,14 @@ bool kvm_pmu_is_valid_msr(struct kvm_vcpu *vcpu, u32 msr) default: break; } - return static_call(kvm_x86_pmu_msr_idx_to_pmc)(vcpu, msr) || - static_call(kvm_x86_pmu_is_valid_msr)(vcpu, msr); + return kvm_pmu_call(msr_idx_to_pmc)(vcpu, msr) || + kvm_pmu_call(is_valid_msr)(vcpu, msr); } static void kvm_pmu_mark_pmc_in_use(struct kvm_vcpu *vcpu, u32 msr) { struct kvm_pmu *pmu = vcpu_to_pmu(vcpu); - struct kvm_pmc *pmc = static_call(kvm_x86_pmu_msr_idx_to_pmc)(vcpu, msr); + struct kvm_pmc *pmc = kvm_pmu_call(msr_idx_to_pmc)(vcpu, msr); if (pmc) __set_bit(pmc->idx, pmu->pmc_in_use); @@ -654,7 +654,7 @@ int kvm_pmu_get_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info) msr_info->data = 0; break; default: - return static_call(kvm_x86_pmu_get_msr)(vcpu, msr_info); + return kvm_pmu_call(get_msr)(vcpu, msr_info); } return 0; @@ -681,13 +681,13 @@ int kvm_pmu_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info) if (!msr_info->host_initiated) break; - if (data & pmu->global_status_mask) + if (data & pmu->global_status_rsvd) return 1; pmu->global_status = data; break; case MSR_AMD64_PERF_CNTR_GLOBAL_CTL: - data &= ~pmu->global_ctrl_mask; + data &= ~pmu->global_ctrl_rsvd; fallthrough; case MSR_CORE_PERF_GLOBAL_CTRL: if (!kvm_valid_perf_global_ctrl(pmu, data)) @@ -704,7 +704,7 @@ int kvm_pmu_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info) * GLOBAL_OVF_CTRL, a.k.a. GLOBAL STATUS_RESET, clears bits in * GLOBAL_STATUS, and so the set of reserved bits is the same. */ - if (data & pmu->global_status_mask) + if (data & pmu->global_status_rsvd) return 1; fallthrough; case MSR_AMD64_PERF_CNTR_GLOBAL_STATUS_CLR: @@ -713,7 +713,7 @@ int kvm_pmu_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info) break; default: kvm_pmu_mark_pmc_in_use(vcpu, msr_info->index); - return static_call(kvm_x86_pmu_set_msr)(vcpu, msr_info); + return kvm_pmu_call(set_msr)(vcpu, msr_info); } return 0; @@ -740,7 +740,7 @@ static void kvm_pmu_reset(struct kvm_vcpu *vcpu) pmu->fixed_ctr_ctrl = pmu->global_ctrl = pmu->global_status = 0; - static_call_cond(kvm_x86_pmu_reset)(vcpu); + kvm_pmu_call(reset)(vcpu); } @@ -768,15 +768,27 @@ void kvm_pmu_refresh(struct kvm_vcpu *vcpu) pmu->counter_bitmask[KVM_PMC_FIXED] = 0; pmu->reserved_bits = 0xffffffff00200000ull; pmu->raw_event_mask = X86_RAW_EVENT_MASK; - pmu->global_ctrl_mask = ~0ull; - pmu->global_status_mask = ~0ull; - pmu->fixed_ctr_ctrl_mask = ~0ull; - pmu->pebs_enable_mask = ~0ull; - pmu->pebs_data_cfg_mask = ~0ull; + pmu->global_ctrl_rsvd = ~0ull; + pmu->global_status_rsvd = ~0ull; + pmu->fixed_ctr_ctrl_rsvd = ~0ull; + pmu->pebs_enable_rsvd = ~0ull; + pmu->pebs_data_cfg_rsvd = ~0ull; bitmap_zero(pmu->all_valid_pmc_idx, X86_PMC_IDX_MAX); - if (vcpu->kvm->arch.enable_pmu) - static_call(kvm_x86_pmu_refresh)(vcpu); + if (!vcpu->kvm->arch.enable_pmu) + return; + + kvm_pmu_call(refresh)(vcpu); + + /* + * At RESET, both Intel and AMD CPUs set all enable bits for general + * purpose counters in IA32_PERF_GLOBAL_CTRL (so that software that + * was written for v1 PMUs don't unknowingly leave GP counters disabled + * in the global controls). Emulate that behavior when refreshing the + * PMU so that userspace doesn't need to manually set PERF_GLOBAL_CTRL. + */ + if (kvm_pmu_has_perf_global_ctrl(pmu) && pmu->nr_arch_gp_counters) + pmu->global_ctrl = GENMASK_ULL(pmu->nr_arch_gp_counters - 1, 0); } void kvm_pmu_init(struct kvm_vcpu *vcpu) @@ -784,8 +796,7 @@ void kvm_pmu_init(struct kvm_vcpu *vcpu) struct kvm_pmu *pmu = vcpu_to_pmu(vcpu); memset(pmu, 0, sizeof(*pmu)); - static_call(kvm_x86_pmu_init)(vcpu); - kvm_pmu_refresh(vcpu); + kvm_pmu_call(init)(vcpu); } /* Release perf_events for vPMCs that have been unused for a full time slice. */ @@ -806,7 +817,7 @@ void kvm_pmu_cleanup(struct kvm_vcpu *vcpu) pmc_stop_counter(pmc); } - static_call_cond(kvm_x86_pmu_cleanup)(vcpu); + kvm_pmu_call(cleanup)(vcpu); bitmap_zero(pmu->pmc_in_use, X86_PMC_IDX_MAX); } @@ -834,8 +845,8 @@ static inline bool cpl_is_matched(struct kvm_pmc *pmc) } else { config = fixed_ctrl_field(pmc_to_pmu(pmc)->fixed_ctr_ctrl, pmc->idx - KVM_FIXED_PMC_BASE_IDX); - select_os = config & 0x1; - select_user = config & 0x2; + select_os = config & INTEL_FIXED_0_KERNEL; + select_user = config & INTEL_FIXED_0_USER; } /* @@ -845,7 +856,8 @@ static inline bool cpl_is_matched(struct kvm_pmc *pmc) if (select_os == select_user) return select_os; - return (static_call(kvm_x86_get_cpl)(pmc->vcpu) == 0) ? select_os : select_user; + return (kvm_x86_call(get_cpl)(pmc->vcpu) == 0) ? select_os : + select_user; } void kvm_pmu_trigger_event(struct kvm_vcpu *vcpu, u64 eventsel) diff --git a/arch/x86/kvm/pmu.h b/arch/x86/kvm/pmu.h index 4d52b0b539ba..ad89d0bd6005 100644 --- a/arch/x86/kvm/pmu.h +++ b/arch/x86/kvm/pmu.h @@ -14,7 +14,8 @@ MSR_IA32_MISC_ENABLE_BTS_UNAVAIL) /* retrieve the 4 bits for EN and PMI out of IA32_FIXED_CTR_CTRL */ -#define fixed_ctrl_field(ctrl_reg, idx) (((ctrl_reg) >> ((idx)*4)) & 0xf) +#define fixed_ctrl_field(ctrl_reg, idx) \ + (((ctrl_reg) >> ((idx) * INTEL_FIXED_BITS_STRIDE)) & INTEL_FIXED_BITS_MASK) #define VMWARE_BACKDOOR_PMC_HOST_TSC 0x10000 #define VMWARE_BACKDOOR_PMC_REAL_TIME 0x10001 @@ -129,7 +130,7 @@ static inline bool pmc_is_fixed(struct kvm_pmc *pmc) static inline bool kvm_valid_perf_global_ctrl(struct kvm_pmu *pmu, u64 data) { - return !(pmu->global_ctrl_mask & data); + return !(pmu->global_ctrl_rsvd & data); } /* returns general purpose PMC with the specified MSR. Note that it can be @@ -170,7 +171,8 @@ static inline bool pmc_speculative_in_use(struct kvm_pmc *pmc) if (pmc_is_fixed(pmc)) return fixed_ctrl_field(pmu->fixed_ctr_ctrl, - pmc->idx - KVM_FIXED_PMC_BASE_IDX) & 0x3; + pmc->idx - KVM_FIXED_PMC_BASE_IDX) & + (INTEL_FIXED_0_KERNEL | INTEL_FIXED_0_USER); return pmc->eventsel & ARCH_PERFMON_EVENTSEL_ENABLE; } @@ -217,7 +219,7 @@ static inline void kvm_init_pmu_capability(const struct kvm_pmu_ops *pmu_ops) kvm_pmu_cap.num_counters_gp = min(kvm_pmu_cap.num_counters_gp, pmu_ops->MAX_NR_GP_COUNTERS); kvm_pmu_cap.num_counters_fixed = min(kvm_pmu_cap.num_counters_fixed, - KVM_PMC_MAX_FIXED); + KVM_MAX_NR_FIXED_COUNTERS); kvm_pmu_eventsel.INSTRUCTIONS_RETIRED = perf_get_hw_event_config(PERF_COUNT_HW_INSTRUCTIONS); diff --git a/arch/x86/kvm/reverse_cpuid.h b/arch/x86/kvm/reverse_cpuid.h index aadefcaa9561..fde0ae986003 100644 --- a/arch/x86/kvm/reverse_cpuid.h +++ b/arch/x86/kvm/reverse_cpuid.h @@ -7,22 +7,6 @@ #include <asm/cpufeatures.h> /* - * Hardware-defined CPUID leafs that are either scattered by the kernel or are - * unknown to the kernel, but need to be directly used by KVM. Note, these - * word values conflict with the kernel's "bug" caps, but KVM doesn't use those. - */ -enum kvm_only_cpuid_leafs { - CPUID_12_EAX = NCAPINTS, - CPUID_7_1_EDX, - CPUID_8000_0007_EDX, - CPUID_8000_0022_EAX, - CPUID_7_2_EDX, - NR_KVM_CPU_CAPS, - - NKVMCAPINTS = NR_KVM_CPU_CAPS - NCAPINTS, -}; - -/* * Define a KVM-only feature flag. * * For features that are scattered by cpufeatures.h, __feature_translate() also @@ -45,16 +29,23 @@ enum kvm_only_cpuid_leafs { #define X86_FEATURE_AVX_VNNI_INT8 KVM_X86_FEATURE(CPUID_7_1_EDX, 4) #define X86_FEATURE_AVX_NE_CONVERT KVM_X86_FEATURE(CPUID_7_1_EDX, 5) #define X86_FEATURE_AMX_COMPLEX KVM_X86_FEATURE(CPUID_7_1_EDX, 8) +#define X86_FEATURE_AVX_VNNI_INT16 KVM_X86_FEATURE(CPUID_7_1_EDX, 10) #define X86_FEATURE_PREFETCHITI KVM_X86_FEATURE(CPUID_7_1_EDX, 14) +#define X86_FEATURE_AVX10 KVM_X86_FEATURE(CPUID_7_1_EDX, 19) /* Intel-defined sub-features, CPUID level 0x00000007:2 (EDX) */ #define X86_FEATURE_INTEL_PSFD KVM_X86_FEATURE(CPUID_7_2_EDX, 0) #define X86_FEATURE_IPRED_CTRL KVM_X86_FEATURE(CPUID_7_2_EDX, 1) #define KVM_X86_FEATURE_RRSBA_CTRL KVM_X86_FEATURE(CPUID_7_2_EDX, 2) #define X86_FEATURE_DDPD_U KVM_X86_FEATURE(CPUID_7_2_EDX, 3) -#define X86_FEATURE_BHI_CTRL KVM_X86_FEATURE(CPUID_7_2_EDX, 4) +#define KVM_X86_FEATURE_BHI_CTRL KVM_X86_FEATURE(CPUID_7_2_EDX, 4) #define X86_FEATURE_MCDT_NO KVM_X86_FEATURE(CPUID_7_2_EDX, 5) +/* Intel-defined sub-features, CPUID level 0x00000024:0 (EBX) */ +#define X86_FEATURE_AVX10_128 KVM_X86_FEATURE(CPUID_24_0_EBX, 16) +#define X86_FEATURE_AVX10_256 KVM_X86_FEATURE(CPUID_24_0_EBX, 17) +#define X86_FEATURE_AVX10_512 KVM_X86_FEATURE(CPUID_24_0_EBX, 18) + /* CPUID level 0x80000007 (EDX). */ #define KVM_X86_FEATURE_CONSTANT_TSC KVM_X86_FEATURE(CPUID_8000_0007_EDX, 8) @@ -90,6 +81,7 @@ static const struct cpuid_reg reverse_cpuid[] = { [CPUID_8000_0021_EAX] = {0x80000021, 0, CPUID_EAX}, [CPUID_8000_0022_EAX] = {0x80000022, 0, CPUID_EAX}, [CPUID_7_2_EDX] = { 7, 2, CPUID_EDX}, + [CPUID_24_0_EBX] = { 0x24, 0, CPUID_EBX}, }; /* @@ -102,10 +94,12 @@ static const struct cpuid_reg reverse_cpuid[] = { */ static __always_inline void reverse_cpuid_check(unsigned int x86_leaf) { + BUILD_BUG_ON(NR_CPUID_WORDS != NCAPINTS); BUILD_BUG_ON(x86_leaf == CPUID_LNX_1); BUILD_BUG_ON(x86_leaf == CPUID_LNX_2); BUILD_BUG_ON(x86_leaf == CPUID_LNX_3); BUILD_BUG_ON(x86_leaf == CPUID_LNX_4); + BUILD_BUG_ON(x86_leaf == CPUID_LNX_5); BUILD_BUG_ON(x86_leaf >= ARRAY_SIZE(reverse_cpuid)); BUILD_BUG_ON(reverse_cpuid[x86_leaf].function == 0); } @@ -126,6 +120,7 @@ static __always_inline u32 __feature_translate(int x86_feature) KVM_X86_TRANSLATE_FEATURE(CONSTANT_TSC); KVM_X86_TRANSLATE_FEATURE(PERFMON_V2); KVM_X86_TRANSLATE_FEATURE(RRSBA_CTRL); + KVM_X86_TRANSLATE_FEATURE(BHI_CTRL); default: return x86_feature; } @@ -133,7 +128,10 @@ static __always_inline u32 __feature_translate(int x86_feature) static __always_inline u32 __feature_leaf(int x86_feature) { - return __feature_translate(x86_feature) / 32; + u32 x86_leaf = __feature_translate(x86_feature) / 32; + + reverse_cpuid_check(x86_leaf); + return x86_leaf; } /* @@ -156,7 +154,6 @@ static __always_inline struct cpuid_reg x86_feature_cpuid(unsigned int x86_featu { unsigned int x86_leaf = __feature_leaf(x86_feature); - reverse_cpuid_check(x86_leaf); return reverse_cpuid[x86_leaf]; } diff --git a/arch/x86/kvm/smm.c b/arch/x86/kvm/smm.c index d06d43d8d2aa..9864c057187d 100644 --- a/arch/x86/kvm/smm.c +++ b/arch/x86/kvm/smm.c @@ -131,6 +131,7 @@ void kvm_smm_changed(struct kvm_vcpu *vcpu, bool entering_smm) kvm_mmu_reset_context(vcpu); } +EXPORT_SYMBOL_GPL(kvm_smm_changed); void process_smi(struct kvm_vcpu *vcpu) { @@ -200,11 +201,11 @@ static void enter_smm_save_state_32(struct kvm_vcpu *vcpu, enter_smm_save_seg_32(vcpu, &smram->tr, &smram->tr_sel, VCPU_SREG_TR); enter_smm_save_seg_32(vcpu, &smram->ldtr, &smram->ldtr_sel, VCPU_SREG_LDTR); - static_call(kvm_x86_get_gdt)(vcpu, &dt); + kvm_x86_call(get_gdt)(vcpu, &dt); smram->gdtr.base = dt.address; smram->gdtr.limit = dt.size; - static_call(kvm_x86_get_idt)(vcpu, &dt); + kvm_x86_call(get_idt)(vcpu, &dt); smram->idtr.base = dt.address; smram->idtr.limit = dt.size; @@ -220,7 +221,7 @@ static void enter_smm_save_state_32(struct kvm_vcpu *vcpu, smram->smm_revision = 0x00020000; smram->smbase = vcpu->arch.smbase; - smram->int_shadow = static_call(kvm_x86_get_interrupt_shadow)(vcpu); + smram->int_shadow = kvm_x86_call(get_interrupt_shadow)(vcpu); } #ifdef CONFIG_X86_64 @@ -250,13 +251,13 @@ static void enter_smm_save_state_64(struct kvm_vcpu *vcpu, enter_smm_save_seg_64(vcpu, &smram->tr, VCPU_SREG_TR); - static_call(kvm_x86_get_idt)(vcpu, &dt); + kvm_x86_call(get_idt)(vcpu, &dt); smram->idtr.limit = dt.size; smram->idtr.base = dt.address; enter_smm_save_seg_64(vcpu, &smram->ldtr, VCPU_SREG_LDTR); - static_call(kvm_x86_get_gdt)(vcpu, &dt); + kvm_x86_call(get_gdt)(vcpu, &dt); smram->gdtr.limit = dt.size; smram->gdtr.base = dt.address; @@ -267,7 +268,7 @@ static void enter_smm_save_state_64(struct kvm_vcpu *vcpu, enter_smm_save_seg_64(vcpu, &smram->fs, VCPU_SREG_FS); enter_smm_save_seg_64(vcpu, &smram->gs, VCPU_SREG_GS); - smram->int_shadow = static_call(kvm_x86_get_interrupt_shadow)(vcpu); + smram->int_shadow = kvm_x86_call(get_interrupt_shadow)(vcpu); } #endif @@ -283,7 +284,7 @@ void enter_smm(struct kvm_vcpu *vcpu) memset(smram.bytes, 0, sizeof(smram.bytes)); #ifdef CONFIG_X86_64 - if (guest_cpuid_has(vcpu, X86_FEATURE_LM)) + if (guest_cpu_cap_has(vcpu, X86_FEATURE_LM)) enter_smm_save_state_64(vcpu, &smram.smram64); else #endif @@ -297,7 +298,7 @@ void enter_smm(struct kvm_vcpu *vcpu) * Kill the VM in the unlikely case of failure, because the VM * can be in undefined state in this case. */ - if (static_call(kvm_x86_enter_smm)(vcpu, &smram)) + if (kvm_x86_call(enter_smm)(vcpu, &smram)) goto error; kvm_smm_changed(vcpu, true); @@ -305,24 +306,24 @@ void enter_smm(struct kvm_vcpu *vcpu) if (kvm_vcpu_write_guest(vcpu, vcpu->arch.smbase + 0xfe00, &smram, sizeof(smram))) goto error; - if (static_call(kvm_x86_get_nmi_mask)(vcpu)) + if (kvm_x86_call(get_nmi_mask)(vcpu)) vcpu->arch.hflags |= HF_SMM_INSIDE_NMI_MASK; else - static_call(kvm_x86_set_nmi_mask)(vcpu, true); + kvm_x86_call(set_nmi_mask)(vcpu, true); kvm_set_rflags(vcpu, X86_EFLAGS_FIXED); kvm_rip_write(vcpu, 0x8000); - static_call(kvm_x86_set_interrupt_shadow)(vcpu, 0); + kvm_x86_call(set_interrupt_shadow)(vcpu, 0); cr0 = vcpu->arch.cr0 & ~(X86_CR0_PE | X86_CR0_EM | X86_CR0_TS | X86_CR0_PG); - static_call(kvm_x86_set_cr0)(vcpu, cr0); + kvm_x86_call(set_cr0)(vcpu, cr0); - static_call(kvm_x86_set_cr4)(vcpu, 0); + kvm_x86_call(set_cr4)(vcpu, 0); /* Undocumented: IDT limit is set to zero on entry to SMM. */ dt.address = dt.size = 0; - static_call(kvm_x86_set_idt)(vcpu, &dt); + kvm_x86_call(set_idt)(vcpu, &dt); if (WARN_ON_ONCE(kvm_set_dr(vcpu, 7, DR7_FIXED_1))) goto error; @@ -353,12 +354,12 @@ void enter_smm(struct kvm_vcpu *vcpu) kvm_set_segment(vcpu, &ds, VCPU_SREG_SS); #ifdef CONFIG_X86_64 - if (guest_cpuid_has(vcpu, X86_FEATURE_LM)) - if (static_call(kvm_x86_set_efer)(vcpu, 0)) + if (guest_cpu_cap_has(vcpu, X86_FEATURE_LM)) + if (kvm_x86_call(set_efer)(vcpu, 0)) goto error; #endif - kvm_update_cpuid_runtime(vcpu); + vcpu->arch.cpuid_dynamic_bits_dirty = true; kvm_mmu_reset_context(vcpu); return; error: @@ -479,11 +480,11 @@ static int rsm_load_state_32(struct x86_emulate_ctxt *ctxt, dt.address = smstate->gdtr.base; dt.size = smstate->gdtr.limit; - static_call(kvm_x86_set_gdt)(vcpu, &dt); + kvm_x86_call(set_gdt)(vcpu, &dt); dt.address = smstate->idtr.base; dt.size = smstate->idtr.limit; - static_call(kvm_x86_set_idt)(vcpu, &dt); + kvm_x86_call(set_idt)(vcpu, &dt); rsm_load_seg_32(vcpu, &smstate->es, smstate->es_sel, VCPU_SREG_ES); rsm_load_seg_32(vcpu, &smstate->cs, smstate->cs_sel, VCPU_SREG_CS); @@ -501,7 +502,7 @@ static int rsm_load_state_32(struct x86_emulate_ctxt *ctxt, if (r != X86EMUL_CONTINUE) return r; - static_call(kvm_x86_set_interrupt_shadow)(vcpu, 0); + kvm_x86_call(set_interrupt_shadow)(vcpu, 0); ctxt->interruptibility = (u8)smstate->int_shadow; return r; @@ -535,13 +536,13 @@ static int rsm_load_state_64(struct x86_emulate_ctxt *ctxt, dt.size = smstate->idtr.limit; dt.address = smstate->idtr.base; - static_call(kvm_x86_set_idt)(vcpu, &dt); + kvm_x86_call(set_idt)(vcpu, &dt); rsm_load_seg_64(vcpu, &smstate->ldtr, VCPU_SREG_LDTR); dt.size = smstate->gdtr.limit; dt.address = smstate->gdtr.base; - static_call(kvm_x86_set_gdt)(vcpu, &dt); + kvm_x86_call(set_gdt)(vcpu, &dt); r = rsm_enter_protected_mode(vcpu, smstate->cr0, smstate->cr3, smstate->cr4); if (r != X86EMUL_CONTINUE) @@ -554,7 +555,7 @@ static int rsm_load_state_64(struct x86_emulate_ctxt *ctxt, rsm_load_seg_64(vcpu, &smstate->fs, VCPU_SREG_FS); rsm_load_seg_64(vcpu, &smstate->gs, VCPU_SREG_GS); - static_call(kvm_x86_set_interrupt_shadow)(vcpu, 0); + kvm_x86_call(set_interrupt_shadow)(vcpu, 0); ctxt->interruptibility = (u8)smstate->int_shadow; return X86EMUL_CONTINUE; @@ -576,7 +577,7 @@ int emulator_leave_smm(struct x86_emulate_ctxt *ctxt) return X86EMUL_UNHANDLEABLE; if ((vcpu->arch.hflags & HF_SMM_INSIDE_NMI_MASK) == 0) - static_call(kvm_x86_set_nmi_mask)(vcpu, false); + kvm_x86_call(set_nmi_mask)(vcpu, false); kvm_smm_changed(vcpu, false); @@ -586,7 +587,7 @@ int emulator_leave_smm(struct x86_emulate_ctxt *ctxt) * supports long mode. */ #ifdef CONFIG_X86_64 - if (guest_cpuid_has(vcpu, X86_FEATURE_LM)) { + if (guest_cpu_cap_has(vcpu, X86_FEATURE_LM)) { struct kvm_segment cs_desc; unsigned long cr4; @@ -609,7 +610,7 @@ int emulator_leave_smm(struct x86_emulate_ctxt *ctxt) kvm_set_cr0(vcpu, cr0 & ~(X86_CR0_PG | X86_CR0_PE)); #ifdef CONFIG_X86_64 - if (guest_cpuid_has(vcpu, X86_FEATURE_LM)) { + if (guest_cpu_cap_has(vcpu, X86_FEATURE_LM)) { unsigned long cr4, efer; /* Clear CR4.PAE before clearing EFER.LME. */ @@ -624,17 +625,31 @@ int emulator_leave_smm(struct x86_emulate_ctxt *ctxt) #endif /* - * Give leave_smm() a chance to make ISA-specific changes to the vCPU - * state (e.g. enter guest mode) before loading state from the SMM - * state-save area. + * FIXME: When resuming L2 (a.k.a. guest mode), the transition to guest + * mode should happen _after_ loading state from SMRAM. However, KVM + * piggybacks the nested VM-Enter flows (which is wrong for many other + * reasons), and so nSVM/nVMX would clobber state that is loaded from + * SMRAM and from the VMCS/VMCB. */ - if (static_call(kvm_x86_leave_smm)(vcpu, &smram)) + if (kvm_x86_call(leave_smm)(vcpu, &smram)) return X86EMUL_UNHANDLEABLE; #ifdef CONFIG_X86_64 - if (guest_cpuid_has(vcpu, X86_FEATURE_LM)) - return rsm_load_state_64(ctxt, &smram.smram64); + if (guest_cpu_cap_has(vcpu, X86_FEATURE_LM)) + ret = rsm_load_state_64(ctxt, &smram.smram64); else #endif - return rsm_load_state_32(ctxt, &smram.smram32); + ret = rsm_load_state_32(ctxt, &smram.smram32); + + /* + * If RSM fails and triggers shutdown, architecturally the shutdown + * occurs *before* the transition to guest mode. But due to KVM's + * flawed handling of RSM to L2 (see above), the vCPU may already be + * in_guest_mode(). Force the vCPU out of guest mode before delivering + * the shutdown, so that L1 enters shutdown instead of seeing a VM-Exit + * that architecturally shouldn't be possible. + */ + if (ret != X86EMUL_CONTINUE && is_guest_mode(vcpu)) + kvm_leave_nested(vcpu); + return ret; } diff --git a/arch/x86/kvm/smm.h b/arch/x86/kvm/smm.h index a1cf2ac5bd78..551703fbe200 100644 --- a/arch/x86/kvm/smm.h +++ b/arch/x86/kvm/smm.h @@ -142,6 +142,9 @@ union kvm_smram { static inline int kvm_inject_smi(struct kvm_vcpu *vcpu) { + if (!kvm_x86_call(has_emulated_msr)(vcpu->kvm, MSR_IA32_SMBASE)) + return -ENOTTY; + kvm_make_request(KVM_REQ_SMI, vcpu); return 0; } diff --git a/arch/x86/kvm/svm/avic.c b/arch/x86/kvm/svm/avic.c index 4b74ea91f4e6..067f8e3f5a0d 100644 --- a/arch/x86/kvm/svm/avic.c +++ b/arch/x86/kvm/svm/avic.c @@ -20,6 +20,7 @@ #include <linux/kvm_host.h> #include <asm/irq_remapping.h> +#include <asm/msr.h> #include "trace.h" #include "lapic.h" @@ -330,7 +331,7 @@ void avic_ring_doorbell(struct kvm_vcpu *vcpu) int cpu = READ_ONCE(vcpu->cpu); if (cpu != get_cpu()) { - wrmsrl(MSR_AMD64_SVM_AVIC_DOORBELL, kvm_cpu_get_apicid(cpu)); + wrmsrq(MSR_AMD64_SVM_AVIC_DOORBELL, kvm_cpu_get_apicid(cpu)); trace_kvm_avic_doorbell(vcpu->vcpu_id, kvm_cpu_get_apicid(cpu)); } put_cpu(); @@ -796,12 +797,15 @@ static int svm_ir_list_add(struct vcpu_svm *svm, struct amd_iommu_pi_data *pi) struct amd_svm_iommu_ir *ir; u64 entry; + if (WARN_ON_ONCE(!pi->ir_data)) + return -EINVAL; + /** * In some cases, the existing irte is updated and re-set, * so we need to check here if it's already been * added * to the ir_list. */ - if (pi->ir_data && (pi->prev_ga_tag != 0)) { + if (pi->prev_ga_tag) { struct kvm *kvm = svm->vcpu.kvm; u32 vcpu_id = AVIC_GATAG_TO_VCPUID(pi->prev_ga_tag); struct kvm_vcpu *prev_vcpu = kvm_get_vcpu_by_id(kvm, vcpu_id); @@ -820,7 +824,7 @@ static int svm_ir_list_add(struct vcpu_svm *svm, struct amd_iommu_pi_data *pi) * Allocating new amd_iommu_pi_data, which will get * add to the per-vcpu ir_list. */ - ir = kzalloc(sizeof(struct amd_svm_iommu_ir), GFP_KERNEL_ACCOUNT); + ir = kzalloc(sizeof(struct amd_svm_iommu_ir), GFP_ATOMIC | __GFP_ACCOUNT); if (!ir) { ret = -ENOMEM; goto out; @@ -896,10 +900,10 @@ int avic_pi_update_irte(struct kvm *kvm, unsigned int host_irq, { struct kvm_kernel_irq_routing_entry *e; struct kvm_irq_routing_table *irq_rt; + bool enable_remapped_mode = true; int idx, ret = 0; - if (!kvm_arch_has_assigned_device(kvm) || - !irq_remapping_cap(IRQ_POSTING_CAP)) + if (!kvm_arch_has_assigned_device(kvm) || !kvm_arch_has_irq_bypass()) return 0; pr_debug("SVM: %s: host_irq=%#x, guest_irq=%#x, set=%#x\n", @@ -933,6 +937,8 @@ int avic_pi_update_irte(struct kvm *kvm, unsigned int host_irq, kvm_vcpu_apicv_active(&svm->vcpu)) { struct amd_iommu_pi_data pi; + enable_remapped_mode = false; + /* Try to enable guest_mode in IRTE */ pi.base = __sme_set(page_to_phys(svm->avic_backing_page) & AVIC_HPA_MASK); @@ -951,33 +957,6 @@ int avic_pi_update_irte(struct kvm *kvm, unsigned int host_irq, */ if (!ret && pi.is_guest_mode) svm_ir_list_add(svm, &pi); - } else { - /* Use legacy mode in IRTE */ - struct amd_iommu_pi_data pi; - - /** - * Here, pi is used to: - * - Tell IOMMU to use legacy mode for this interrupt. - * - Retrieve ga_tag of prior interrupt remapping data. - */ - pi.prev_ga_tag = 0; - pi.is_guest_mode = false; - ret = irq_set_vcpu_affinity(host_irq, &pi); - - /** - * Check if the posted interrupt was previously - * setup with the guest_mode by checking if the ga_tag - * was cached. If so, we need to clean up the per-vcpu - * ir_list. - */ - if (!ret && pi.prev_ga_tag) { - int id = AVIC_GATAG_TO_VCPUID(pi.prev_ga_tag); - struct kvm_vcpu *vcpu; - - vcpu = kvm_get_vcpu_by_id(kvm, id); - if (vcpu) - svm_ir_list_del(to_svm(vcpu), &pi); - } } if (!ret && svm) { @@ -993,6 +972,34 @@ int avic_pi_update_irte(struct kvm *kvm, unsigned int host_irq, } ret = 0; + if (enable_remapped_mode) { + /* Use legacy mode in IRTE */ + struct amd_iommu_pi_data pi; + + /** + * Here, pi is used to: + * - Tell IOMMU to use legacy mode for this interrupt. + * - Retrieve ga_tag of prior interrupt remapping data. + */ + pi.prev_ga_tag = 0; + pi.is_guest_mode = false; + ret = irq_set_vcpu_affinity(host_irq, &pi); + + /** + * Check if the posted interrupt was previously + * setup with the guest_mode by checking if the ga_tag + * was cached. If so, we need to clean up the per-vcpu + * ir_list. + */ + if (!ret && pi.prev_ga_tag) { + int id = AVIC_GATAG_TO_VCPUID(pi.prev_ga_tag); + struct kvm_vcpu *vcpu; + + vcpu = kvm_get_vcpu_by_id(kvm, id); + if (vcpu) + svm_ir_list_del(to_svm(vcpu), &pi); + } + } out: srcu_read_unlock(&kvm->irq_srcu, idx); return ret; @@ -1199,6 +1206,12 @@ bool avic_hardware_setup(void) return false; } + if (cc_platform_has(CC_ATTR_HOST_SEV_SNP) && + !boot_cpu_has(X86_FEATURE_HV_INUSE_WR_ALLOWED)) { + pr_warn("AVIC disabled: missing HvInUseWrAllowed on SNP-enabled system\n"); + return false; + } + if (boot_cpu_has(X86_FEATURE_AVIC)) { pr_info("AVIC enabled\n"); } else if (force_avic) { diff --git a/arch/x86/kvm/svm/nested.c b/arch/x86/kvm/svm/nested.c index 55b9a6d96bcf..8427a48b8b7a 100644 --- a/arch/x86/kvm/svm/nested.c +++ b/arch/x86/kvm/svm/nested.c @@ -63,8 +63,12 @@ static u64 nested_svm_get_tdp_pdptr(struct kvm_vcpu *vcpu, int index) u64 pdpte; int ret; + /* + * Note, nCR3 is "assumed" to be 32-byte aligned, i.e. the CPU ignores + * nCR3[4:0] when loading PDPTEs from memory. + */ ret = kvm_vcpu_read_guest_page(vcpu, gpa_to_gfn(cr3), &pdpte, - offset_in_page(cr3) + index * 8, 8); + (cr3 & GENMASK(11, 5)) + index * 8, 8); if (ret) return 0; return pdpte; @@ -107,7 +111,7 @@ static void nested_svm_uninit_mmu_context(struct kvm_vcpu *vcpu) static bool nested_vmcb_needs_vls_intercept(struct vcpu_svm *svm) { - if (!guest_can_use(&svm->vcpu, X86_FEATURE_V_VMSAVE_VMLOAD)) + if (!guest_cpu_cap_has(&svm->vcpu, X86_FEATURE_V_VMSAVE_VMLOAD)) return true; if (!nested_npt_enabled(svm)) @@ -590,7 +594,7 @@ static void nested_vmcb02_prepare_save(struct vcpu_svm *svm, struct vmcb *vmcb12 vmcb_mark_dirty(vmcb02, VMCB_DR); } - if (unlikely(guest_can_use(vcpu, X86_FEATURE_LBRV) && + if (unlikely(guest_cpu_cap_has(vcpu, X86_FEATURE_LBRV) && (svm->nested.ctl.virt_ext & LBR_CTL_ENABLE_MASK))) { /* * Reserved bits of DEBUGCTL are ignored. Be consistent with @@ -642,12 +646,17 @@ static void nested_vmcb02_prepare_control(struct vcpu_svm *svm, u32 pause_count12; u32 pause_thresh12; + nested_svm_transition_tlb_flush(vcpu); + + /* Enter Guest-Mode */ + enter_guest_mode(vcpu); + /* * Filled at exit: exit_code, exit_code_hi, exit_info_1, exit_info_2, * exit_int_info, exit_int_info_err, next_rip, insn_len, insn_bytes. */ - if (guest_can_use(vcpu, X86_FEATURE_VGIF) && + if (guest_cpu_cap_has(vcpu, X86_FEATURE_VGIF) && (svm->nested.ctl.int_ctl & V_GIF_ENABLE_MASK)) int_ctl_vmcb12_bits |= (V_GIF_MASK | V_GIF_ENABLE_MASK); else @@ -669,6 +678,33 @@ static void nested_vmcb02_prepare_control(struct vcpu_svm *svm, vmcb02->control.iopm_base_pa = vmcb01->control.iopm_base_pa; vmcb02->control.msrpm_base_pa = vmcb01->control.msrpm_base_pa; + /* + * Stash vmcb02's counter if the guest hasn't moved past the guilty + * instruction; otherwise, reset the counter to '0'. + * + * In order to detect if L2 has made forward progress or not, track the + * RIP at which a bus lock has occurred on a per-vmcb12 basis. If RIP + * is changed, guest has clearly made forward progress, bus_lock_counter + * still remained '1', so reset bus_lock_counter to '0'. Eg. In the + * scenario, where a buslock happened in L1 before VMRUN, the bus lock + * firmly happened on an instruction in the past. Even if vmcb01's + * counter is still '1', (because the guilty instruction got patched), + * the vCPU has clearly made forward progress and so KVM should reset + * vmcb02's counter to '0'. + * + * If the RIP hasn't changed, stash the bus lock counter at nested VMRUN + * to prevent the same guilty instruction from triggering a VM-Exit. Eg. + * if userspace rate-limits the vCPU, then it's entirely possible that + * L1's tick interrupt is pending by the time userspace re-runs the + * vCPU. If KVM unconditionally clears the counter on VMRUN, then when + * L1 re-enters L2, the same instruction will trigger a VM-Exit and the + * entire cycle start over. + */ + if (vmcb02->save.rip && (svm->nested.ctl.bus_lock_rip == vmcb02->save.rip)) + vmcb02->control.bus_lock_counter = 1; + else + vmcb02->control.bus_lock_counter = 0; + /* Done at vmrun: asid. */ /* Also overwritten later if necessary. */ @@ -685,7 +721,7 @@ static void nested_vmcb02_prepare_control(struct vcpu_svm *svm, vmcb02->control.tsc_offset = vcpu->arch.tsc_offset; - if (guest_can_use(vcpu, X86_FEATURE_TSCRATEMSR) && + if (guest_cpu_cap_has(vcpu, X86_FEATURE_TSCRATEMSR) && svm->tsc_ratio_msr != kvm_caps.default_tsc_scaling_ratio) nested_svm_update_tsc_ratio_msr(vcpu); @@ -706,7 +742,7 @@ static void nested_vmcb02_prepare_control(struct vcpu_svm *svm, * what a nrips=0 CPU would do (L1 is responsible for advancing RIP * prior to injecting the event). */ - if (guest_can_use(vcpu, X86_FEATURE_NRIPS)) + if (guest_cpu_cap_has(vcpu, X86_FEATURE_NRIPS)) vmcb02->control.next_rip = svm->nested.ctl.next_rip; else if (boot_cpu_has(X86_FEATURE_NRIPS)) vmcb02->control.next_rip = vmcb12_rip; @@ -716,7 +752,7 @@ static void nested_vmcb02_prepare_control(struct vcpu_svm *svm, svm->soft_int_injected = true; svm->soft_int_csbase = vmcb12_csbase; svm->soft_int_old_rip = vmcb12_rip; - if (guest_can_use(vcpu, X86_FEATURE_NRIPS)) + if (guest_cpu_cap_has(vcpu, X86_FEATURE_NRIPS)) svm->soft_int_next_rip = svm->nested.ctl.next_rip; else svm->soft_int_next_rip = vmcb12_rip; @@ -724,18 +760,18 @@ static void nested_vmcb02_prepare_control(struct vcpu_svm *svm, vmcb02->control.virt_ext = vmcb01->control.virt_ext & LBR_CTL_ENABLE_MASK; - if (guest_can_use(vcpu, X86_FEATURE_LBRV)) + if (guest_cpu_cap_has(vcpu, X86_FEATURE_LBRV)) vmcb02->control.virt_ext |= (svm->nested.ctl.virt_ext & LBR_CTL_ENABLE_MASK); if (!nested_vmcb_needs_vls_intercept(svm)) vmcb02->control.virt_ext |= VIRTUAL_VMLOAD_VMSAVE_ENABLE_MASK; - if (guest_can_use(vcpu, X86_FEATURE_PAUSEFILTER)) + if (guest_cpu_cap_has(vcpu, X86_FEATURE_PAUSEFILTER)) pause_count12 = svm->nested.ctl.pause_filter_count; else pause_count12 = 0; - if (guest_can_use(vcpu, X86_FEATURE_PFTHRESHOLD)) + if (guest_cpu_cap_has(vcpu, X86_FEATURE_PFTHRESHOLD)) pause_thresh12 = svm->nested.ctl.pause_filter_thresh; else pause_thresh12 = 0; @@ -758,11 +794,6 @@ static void nested_vmcb02_prepare_control(struct vcpu_svm *svm, } } - nested_svm_transition_tlb_flush(vcpu); - - /* Enter Guest-Mode */ - enter_guest_mode(vcpu); - /* * Merge guest and host intercepts - must be called with vcpu in * guest-mode to take effect. @@ -922,7 +953,7 @@ out_exit_err: nested_svm_vmexit(svm); out: - kvm_vcpu_unmap(vcpu, &map, true); + kvm_vcpu_unmap(vcpu, &map); return ret; } @@ -990,7 +1021,7 @@ int nested_svm_vmexit(struct vcpu_svm *svm) kvm_clear_request(KVM_REQ_GET_NESTED_STATE_PAGES, vcpu); /* in case we halted in L2 */ - svm->vcpu.arch.mp_state = KVM_MP_STATE_RUNNABLE; + kvm_set_mp_state(vcpu, KVM_MP_STATE_RUNNABLE); /* Give the current vmcb to the guest */ @@ -1022,7 +1053,7 @@ int nested_svm_vmexit(struct vcpu_svm *svm) if (vmcb12->control.exit_code != SVM_EXIT_ERR) nested_save_pending_event_to_vmcb12(svm, vmcb12); - if (guest_can_use(vcpu, X86_FEATURE_NRIPS)) + if (guest_cpu_cap_has(vcpu, X86_FEATURE_NRIPS)) vmcb12->control.next_rip = vmcb02->control.next_rip; vmcb12->control.int_ctl = svm->nested.ctl.int_ctl; @@ -1035,8 +1066,17 @@ int nested_svm_vmexit(struct vcpu_svm *svm) } + /* + * Invalidate bus_lock_rip unless KVM is still waiting for the guest + * to make forward progress before re-enabling bus lock detection. + */ + if (!vmcb02->control.bus_lock_counter) + svm->nested.ctl.bus_lock_rip = INVALID_GPA; + nested_svm_copy_common_state(svm->nested.vmcb02.ptr, svm->vmcb01.ptr); + kvm_nested_vmexit_handle_ibrs(vcpu); + svm_switch_vmcb(svm, &svm->vmcb01); /* @@ -1061,7 +1101,7 @@ int nested_svm_vmexit(struct vcpu_svm *svm) if (!nested_exit_on_intr(svm)) kvm_make_request(KVM_REQ_EVENT, &svm->vcpu); - if (unlikely(guest_can_use(vcpu, X86_FEATURE_LBRV) && + if (unlikely(guest_cpu_cap_has(vcpu, X86_FEATURE_LBRV) && (svm->nested.ctl.virt_ext & LBR_CTL_ENABLE_MASK))) { svm_copy_lbrs(vmcb12, vmcb02); svm_update_lbrv(vcpu); @@ -1126,7 +1166,7 @@ int nested_svm_vmexit(struct vcpu_svm *svm) vmcb12->control.exit_int_info_err, KVM_ISA_SVM); - kvm_vcpu_unmap(vcpu, &map, true); + kvm_vcpu_unmap(vcpu, &map); nested_svm_transition_tlb_flush(vcpu); @@ -1181,7 +1221,7 @@ int svm_allocate_nested(struct vcpu_svm *svm) if (svm->nested.initialized) return 0; - vmcb02_page = snp_safe_alloc_page(&svm->vcpu); + vmcb02_page = snp_safe_alloc_page(); if (!vmcb02_page) return -ENOMEM; svm->nested.vmcb02.ptr = page_address(vmcb02_page); @@ -1693,8 +1733,8 @@ static int svm_set_nested_state(struct kvm_vcpu *vcpu, return -EINVAL; ret = -ENOMEM; - ctl = kzalloc(sizeof(*ctl), GFP_KERNEL_ACCOUNT); - save = kzalloc(sizeof(*save), GFP_KERNEL_ACCOUNT); + ctl = kzalloc(sizeof(*ctl), GFP_KERNEL); + save = kzalloc(sizeof(*save), GFP_KERNEL); if (!ctl || !save) goto out_free; diff --git a/arch/x86/kvm/svm/pmu.c b/arch/x86/kvm/svm/pmu.c index dfcc38bd97d3..288f7f2a46f2 100644 --- a/arch/x86/kvm/svm/pmu.c +++ b/arch/x86/kvm/svm/pmu.c @@ -46,7 +46,7 @@ static inline struct kvm_pmc *get_gp_pmc_amd(struct kvm_pmu *pmu, u32 msr, switch (msr) { case MSR_F15H_PERF_CTL0 ... MSR_F15H_PERF_CTR5: - if (!guest_cpuid_has(vcpu, X86_FEATURE_PERFCTR_CORE)) + if (!guest_cpu_cap_has(vcpu, X86_FEATURE_PERFCTR_CORE)) return NULL; /* * Each PMU counter has a pair of CTL and CTR MSRs. CTLn @@ -109,7 +109,7 @@ static bool amd_is_valid_msr(struct kvm_vcpu *vcpu, u32 msr) case MSR_K7_EVNTSEL0 ... MSR_K7_PERFCTR3: return pmu->version > 0; case MSR_F15H_PERF_CTL0 ... MSR_F15H_PERF_CTR5: - return guest_cpuid_has(vcpu, X86_FEATURE_PERFCTR_CORE); + return guest_cpu_cap_has(vcpu, X86_FEATURE_PERFCTR_CORE); case MSR_AMD64_PERF_CNTR_GLOBAL_STATUS: case MSR_AMD64_PERF_CNTR_GLOBAL_CTL: case MSR_AMD64_PERF_CNTR_GLOBAL_STATUS_CLR: @@ -179,7 +179,7 @@ static void amd_pmu_refresh(struct kvm_vcpu *vcpu) union cpuid_0x80000022_ebx ebx; pmu->version = 1; - if (guest_cpuid_has(vcpu, X86_FEATURE_PERFMON_V2)) { + if (guest_cpu_cap_has(vcpu, X86_FEATURE_PERFMON_V2)) { pmu->version = 2; /* * Note, PERFMON_V2 is also in 0x80000022.0x0, i.e. the guest @@ -189,7 +189,7 @@ static void amd_pmu_refresh(struct kvm_vcpu *vcpu) x86_feature_cpuid(X86_FEATURE_PERFMON_V2).index); ebx.full = kvm_find_cpuid_entry_index(vcpu, 0x80000022, 0)->ebx; pmu->nr_arch_gp_counters = ebx.split.num_core_pmc; - } else if (guest_cpuid_has(vcpu, X86_FEATURE_PERFCTR_CORE)) { + } else if (guest_cpu_cap_has(vcpu, X86_FEATURE_PERFCTR_CORE)) { pmu->nr_arch_gp_counters = AMD64_NUM_COUNTERS_CORE; } else { pmu->nr_arch_gp_counters = AMD64_NUM_COUNTERS; @@ -199,8 +199,8 @@ static void amd_pmu_refresh(struct kvm_vcpu *vcpu) kvm_pmu_cap.num_counters_gp); if (pmu->version > 1) { - pmu->global_ctrl_mask = ~((1ull << pmu->nr_arch_gp_counters) - 1); - pmu->global_status_mask = pmu->global_ctrl_mask; + pmu->global_ctrl_rsvd = ~((1ull << pmu->nr_arch_gp_counters) - 1); + pmu->global_status_rsvd = pmu->global_ctrl_rsvd; } pmu->counter_bitmask[KVM_PMC_GP] = ((u64)1 << 48) - 1; @@ -217,10 +217,9 @@ static void amd_pmu_init(struct kvm_vcpu *vcpu) struct kvm_pmu *pmu = vcpu_to_pmu(vcpu); int i; - BUILD_BUG_ON(KVM_AMD_PMC_MAX_GENERIC > AMD64_NUM_COUNTERS_CORE); - BUILD_BUG_ON(KVM_AMD_PMC_MAX_GENERIC > INTEL_PMC_MAX_GENERIC); + BUILD_BUG_ON(KVM_MAX_NR_AMD_GP_COUNTERS > AMD64_NUM_COUNTERS_CORE); - for (i = 0; i < KVM_AMD_PMC_MAX_GENERIC ; i++) { + for (i = 0; i < KVM_MAX_NR_AMD_GP_COUNTERS; i++) { pmu->gp_counters[i].type = KVM_PMC_GP; pmu->gp_counters[i].vcpu = vcpu; pmu->gp_counters[i].idx = i; @@ -238,6 +237,6 @@ struct kvm_pmu_ops amd_pmu_ops __initdata = { .refresh = amd_pmu_refresh, .init = amd_pmu_init, .EVENTSEL_EVENT = AMD64_EVENTSEL_EVENT, - .MAX_NR_GP_COUNTERS = KVM_AMD_PMC_MAX_GENERIC, + .MAX_NR_GP_COUNTERS = KVM_MAX_NR_AMD_GP_COUNTERS, .MIN_NR_GP_COUNTERS = AMD64_NUM_COUNTERS, }; diff --git a/arch/x86/kvm/svm/sev.c b/arch/x86/kvm/svm/sev.c index ae0ac12382b9..459c3b791fd4 100644 --- a/arch/x86/kvm/svm/sev.c +++ b/arch/x86/kvm/svm/sev.c @@ -19,11 +19,15 @@ #include <linux/misc_cgroup.h> #include <linux/processor.h> #include <linux/trace_events.h> +#include <uapi/linux/sev-guest.h> #include <asm/pkru.h> #include <asm/trapnr.h> #include <asm/fpu/xcr.h> +#include <asm/fpu/xstate.h> #include <asm/debugreg.h> +#include <asm/msr.h> +#include <asm/sev.h> #include "mmu.h" #include "x86.h" @@ -32,22 +36,12 @@ #include "cpuid.h" #include "trace.h" -#ifndef CONFIG_KVM_AMD_SEV -/* - * When this config is not defined, SEV feature is not supported and APIs in - * this file are not used but this file still gets compiled into the KVM AMD - * module. - * - * We will not have MISC_CG_RES_SEV and MISC_CG_RES_SEV_ES entries in the enum - * misc_res_type {} defined in linux/misc_cgroup.h. - * - * Below macros allow compilation to succeed. - */ -#define MISC_CG_RES_SEV MISC_CG_RES_TYPES -#define MISC_CG_RES_SEV_ES MISC_CG_RES_TYPES -#endif +#define GHCB_VERSION_MAX 2ULL +#define GHCB_VERSION_DEFAULT 2ULL +#define GHCB_VERSION_MIN 1ULL + +#define GHCB_HV_FT_SUPPORTED (GHCB_HV_FT_SNP | GHCB_HV_FT_SNP_AP_CREATION) -#ifdef CONFIG_KVM_AMD_SEV /* enable/disable SEV support */ static bool sev_enabled = true; module_param_named(sev, sev_enabled, bool, 0444); @@ -56,14 +50,35 @@ module_param_named(sev, sev_enabled, bool, 0444); static bool sev_es_enabled = true; module_param_named(sev_es, sev_es_enabled, bool, 0444); +/* enable/disable SEV-SNP support */ +static bool sev_snp_enabled = true; +module_param_named(sev_snp, sev_snp_enabled, bool, 0444); + /* enable/disable SEV-ES DebugSwap support */ -static bool sev_es_debug_swap_enabled = false; +static bool sev_es_debug_swap_enabled = true; module_param_named(debug_swap, sev_es_debug_swap_enabled, bool, 0444); -#else -#define sev_enabled false -#define sev_es_enabled false -#define sev_es_debug_swap_enabled false -#endif /* CONFIG_KVM_AMD_SEV */ +static u64 sev_supported_vmsa_features; + +#define AP_RESET_HOLD_NONE 0 +#define AP_RESET_HOLD_NAE_EVENT 1 +#define AP_RESET_HOLD_MSR_PROTO 2 + +/* As defined by SEV-SNP Firmware ABI, under "Guest Policy". */ +#define SNP_POLICY_MASK_API_MINOR GENMASK_ULL(7, 0) +#define SNP_POLICY_MASK_API_MAJOR GENMASK_ULL(15, 8) +#define SNP_POLICY_MASK_SMT BIT_ULL(16) +#define SNP_POLICY_MASK_RSVD_MBO BIT_ULL(17) +#define SNP_POLICY_MASK_DEBUG BIT_ULL(19) +#define SNP_POLICY_MASK_SINGLE_SOCKET BIT_ULL(20) + +#define SNP_POLICY_MASK_VALID (SNP_POLICY_MASK_API_MINOR | \ + SNP_POLICY_MASK_API_MAJOR | \ + SNP_POLICY_MASK_SMT | \ + SNP_POLICY_MASK_RSVD_MBO | \ + SNP_POLICY_MASK_DEBUG | \ + SNP_POLICY_MASK_SINGLE_SOCKET) + +#define INITIAL_VMSA_GPA 0xFFFFFFFFF000 static u8 sev_enc_bit; static DECLARE_RWSEM(sev_deactivate_lock); @@ -75,6 +90,8 @@ static unsigned int nr_asids; static unsigned long *sev_asid_bitmap; static unsigned long *sev_reclaim_asid_bitmap; +static int snp_decommission_context(struct kvm *kvm); + struct enc_region { struct list_head list; unsigned long npages; @@ -84,9 +101,10 @@ struct enc_region { }; /* Called with the sev_bitmap_lock held, or on shutdown */ -static int sev_flush_asids(int min_asid, int max_asid) +static int sev_flush_asids(unsigned int min_asid, unsigned int max_asid) { - int ret, asid, error = 0; + int ret, error = 0; + unsigned int asid; /* Check if there are any ASIDs to reclaim before performing a flush */ asid = find_next_bit(sev_reclaim_asid_bitmap, nr_asids, min_asid); @@ -100,23 +118,36 @@ static int sev_flush_asids(int min_asid, int max_asid) down_write(&sev_deactivate_lock); wbinvd_on_all_cpus(); - ret = sev_guest_df_flush(&error); + + if (sev_snp_enabled) + ret = sev_do_cmd(SEV_CMD_SNP_DF_FLUSH, NULL, &error); + else + ret = sev_guest_df_flush(&error); up_write(&sev_deactivate_lock); if (ret) - pr_err("SEV: DF_FLUSH failed, ret=%d, error=%#x\n", ret, error); + pr_err("SEV%s: DF_FLUSH failed, ret=%d, error=%#x\n", + sev_snp_enabled ? "-SNP" : "", ret, error); return ret; } static inline bool is_mirroring_enc_context(struct kvm *kvm) { - return !!to_kvm_svm(kvm)->sev_info.enc_context_owner; + return !!to_kvm_sev_info(kvm)->enc_context_owner; +} + +static bool sev_vcpu_has_debug_swap(struct vcpu_svm *svm) +{ + struct kvm_vcpu *vcpu = &svm->vcpu; + struct kvm_sev_info *sev = to_kvm_sev_info(vcpu->kvm); + + return sev->vmsa_features & SVM_SEV_FEAT_DEBUG_SWAP; } /* Must be called with the sev_bitmap_lock held */ -static bool __sev_recycle_asids(int min_asid, int max_asid) +static bool __sev_recycle_asids(unsigned int min_asid, unsigned int max_asid) { if (sev_flush_asids(min_asid, max_asid)) return false; @@ -143,8 +174,20 @@ static void sev_misc_cg_uncharge(struct kvm_sev_info *sev) static int sev_asid_new(struct kvm_sev_info *sev) { - int asid, min_asid, max_asid, ret; + /* + * SEV-enabled guests must use asid from min_sev_asid to max_sev_asid. + * SEV-ES-enabled guest can use from 1 to min_sev_asid - 1. + * Note: min ASID can end up larger than the max if basic SEV support is + * effectively disabled by disallowing use of ASIDs for SEV guests. + */ + unsigned int min_asid = sev->es_active ? 1 : min_sev_asid; + unsigned int max_asid = sev->es_active ? min_sev_asid - 1 : max_sev_asid; + unsigned int asid; bool retry = true; + int ret; + + if (min_asid > max_asid) + return -ENOTTY; WARN_ON(sev->misc_cg); sev->misc_cg = get_current_misc_cg(); @@ -157,12 +200,6 @@ static int sev_asid_new(struct kvm_sev_info *sev) mutex_lock(&sev_bitmap_lock); - /* - * SEV-enabled guests must use asid from min_sev_asid to max_sev_asid. - * SEV-ES-enabled guest can use from 1 to min_sev_asid - 1. - */ - min_asid = sev->es_active ? 1 : min_sev_asid; - max_asid = sev->es_active ? min_sev_asid - 1 : max_sev_asid; again: asid = find_next_zero_bit(sev_asid_bitmap, max_asid + 1, min_asid); if (asid > max_asid) { @@ -179,7 +216,8 @@ again: mutex_unlock(&sev_bitmap_lock); - return asid; + sev->asid = asid; + return 0; e_uncharge: sev_misc_cg_uncharge(sev); put_misc_cg(sev->misc_cg); @@ -187,11 +225,9 @@ e_uncharge: return ret; } -static int sev_get_asid(struct kvm *kvm) +static unsigned int sev_get_asid(struct kvm *kvm) { - struct kvm_sev_info *sev = &to_kvm_svm(kvm)->sev_info; - - return sev->asid; + return to_kvm_sev_info(kvm)->asid; } static void sev_asid_free(struct kvm_sev_info *sev) @@ -226,6 +262,53 @@ static void sev_decommission(unsigned int handle) sev_guest_decommission(&decommission, NULL); } +/* + * Transition a page to hypervisor-owned/shared state in the RMP table. This + * should not fail under normal conditions, but leak the page should that + * happen since it will no longer be usable by the host due to RMP protections. + */ +static int kvm_rmp_make_shared(struct kvm *kvm, u64 pfn, enum pg_level level) +{ + if (KVM_BUG_ON(rmp_make_shared(pfn, level), kvm)) { + snp_leak_pages(pfn, page_level_size(level) >> PAGE_SHIFT); + return -EIO; + } + + return 0; +} + +/* + * Certain page-states, such as Pre-Guest and Firmware pages (as documented + * in Chapter 5 of the SEV-SNP Firmware ABI under "Page States") cannot be + * directly transitioned back to normal/hypervisor-owned state via RMPUPDATE + * unless they are reclaimed first. + * + * Until they are reclaimed and subsequently transitioned via RMPUPDATE, they + * might not be usable by the host due to being set as immutable or still + * being associated with a guest ASID. + * + * Bug the VM and leak the page if reclaim fails, or if the RMP entry can't be + * converted back to shared, as the page is no longer usable due to RMP + * protections, and it's infeasible for the guest to continue on. + */ +static int snp_page_reclaim(struct kvm *kvm, u64 pfn) +{ + struct sev_data_snp_page_reclaim data = {0}; + int fw_err, rc; + + data.paddr = __sme_set(pfn << PAGE_SHIFT); + rc = sev_do_cmd(SEV_CMD_SNP_PAGE_RECLAIM, &data, &fw_err); + if (KVM_BUG(rc, kvm, "Failed to reclaim PFN %llx, rc %d fw_err %d", pfn, rc, fw_err)) { + snp_leak_pages(pfn, 1); + return -EIO; + } + + if (kvm_rmp_make_shared(kvm, pfn, PG_LEVEL_4K)) + return -EIO; + + return rc; +} + static void sev_unbind_asid(struct kvm *kvm, unsigned int handle) { struct sev_data_deactivate deactivate; @@ -243,33 +326,138 @@ static void sev_unbind_asid(struct kvm *kvm, unsigned int handle) sev_decommission(handle); } -static int sev_guest_init(struct kvm *kvm, struct kvm_sev_cmd *argp) +/* + * This sets up bounce buffers/firmware pages to handle SNP Guest Request + * messages (e.g. attestation requests). See "SNP Guest Request" in the GHCB + * 2.0 specification for more details. + * + * Technically, when an SNP Guest Request is issued, the guest will provide its + * own request/response pages, which could in theory be passed along directly + * to firmware rather than using bounce pages. However, these pages would need + * special care: + * + * - Both pages are from shared guest memory, so they need to be protected + * from migration/etc. occurring while firmware reads/writes to them. At a + * minimum, this requires elevating the ref counts and potentially needing + * an explicit pinning of the memory. This places additional restrictions + * on what type of memory backends userspace can use for shared guest + * memory since there is some reliance on using refcounted pages. + * + * - The response page needs to be switched to Firmware-owned[1] state + * before the firmware can write to it, which can lead to potential + * host RMP #PFs if the guest is misbehaved and hands the host a + * guest page that KVM might write to for other reasons (e.g. virtio + * buffers/etc.). + * + * Both of these issues can be avoided completely by using separately-allocated + * bounce pages for both the request/response pages and passing those to + * firmware instead. So that's what is being set up here. + * + * Guest requests rely on message sequence numbers to ensure requests are + * issued to firmware in the order the guest issues them, so concurrent guest + * requests generally shouldn't happen. But a misbehaved guest could issue + * concurrent guest requests in theory, so a mutex is used to serialize + * access to the bounce buffers. + * + * [1] See the "Page States" section of the SEV-SNP Firmware ABI for more + * details on Firmware-owned pages, along with "RMP and VMPL Access Checks" + * in the APM for details on the related RMP restrictions. + */ +static int snp_guest_req_init(struct kvm *kvm) +{ + struct kvm_sev_info *sev = to_kvm_sev_info(kvm); + struct page *req_page; + + req_page = alloc_page(GFP_KERNEL_ACCOUNT | __GFP_ZERO); + if (!req_page) + return -ENOMEM; + + sev->guest_resp_buf = snp_alloc_firmware_page(GFP_KERNEL_ACCOUNT | __GFP_ZERO); + if (!sev->guest_resp_buf) { + __free_page(req_page); + return -EIO; + } + + sev->guest_req_buf = page_address(req_page); + mutex_init(&sev->guest_req_mutex); + + return 0; +} + +static void snp_guest_req_cleanup(struct kvm *kvm) { - struct kvm_sev_info *sev = &to_kvm_svm(kvm)->sev_info; + struct kvm_sev_info *sev = to_kvm_sev_info(kvm); + + if (sev->guest_resp_buf) + snp_free_firmware_page(sev->guest_resp_buf); + + if (sev->guest_req_buf) + __free_page(virt_to_page(sev->guest_req_buf)); + + sev->guest_req_buf = NULL; + sev->guest_resp_buf = NULL; +} + +static int __sev_guest_init(struct kvm *kvm, struct kvm_sev_cmd *argp, + struct kvm_sev_init *data, + unsigned long vm_type) +{ + struct kvm_sev_info *sev = to_kvm_sev_info(kvm); struct sev_platform_init_args init_args = {0}; - int asid, ret; + bool es_active = vm_type != KVM_X86_SEV_VM; + u64 valid_vmsa_features = es_active ? sev_supported_vmsa_features : 0; + int ret; if (kvm->created_vcpus) return -EINVAL; - ret = -EBUSY; + if (data->flags) + return -EINVAL; + + if (data->vmsa_features & ~valid_vmsa_features) + return -EINVAL; + + if (data->ghcb_version > GHCB_VERSION_MAX || (!es_active && data->ghcb_version)) + return -EINVAL; + if (unlikely(sev->active)) - return ret; + return -EINVAL; sev->active = true; - sev->es_active = argp->id == KVM_SEV_ES_INIT; - asid = sev_asid_new(sev); - if (asid < 0) + sev->es_active = es_active; + sev->vmsa_features = data->vmsa_features; + sev->ghcb_version = data->ghcb_version; + + /* + * Currently KVM supports the full range of mandatory features defined + * by version 2 of the GHCB protocol, so default to that for SEV-ES + * guests created via KVM_SEV_INIT2. + */ + if (sev->es_active && !sev->ghcb_version) + sev->ghcb_version = GHCB_VERSION_DEFAULT; + + if (vm_type == KVM_X86_SNP_VM) + sev->vmsa_features |= SVM_SEV_FEAT_SNP_ACTIVE; + + ret = sev_asid_new(sev); + if (ret) goto e_no_asid; - sev->asid = asid; init_args.probe = false; ret = sev_platform_init(&init_args); if (ret) goto e_free; + /* This needs to happen after SEV/SNP firmware initialization. */ + if (vm_type == KVM_X86_SNP_VM) { + ret = snp_guest_req_init(kvm); + if (ret) + goto e_free; + } + INIT_LIST_HEAD(&sev->regions_list); INIT_LIST_HEAD(&sev->mirror_vms); + sev->need_init = false; kvm_set_apicv_inhibit(kvm, APICV_INHIBIT_REASON_SEV); @@ -280,15 +468,57 @@ e_free: sev_asid_free(sev); sev->asid = 0; e_no_asid: + sev->vmsa_features = 0; sev->es_active = false; sev->active = false; return ret; } +static int sev_guest_init(struct kvm *kvm, struct kvm_sev_cmd *argp) +{ + struct kvm_sev_init data = { + .vmsa_features = 0, + .ghcb_version = 0, + }; + unsigned long vm_type; + + if (kvm->arch.vm_type != KVM_X86_DEFAULT_VM) + return -EINVAL; + + vm_type = (argp->id == KVM_SEV_INIT ? KVM_X86_SEV_VM : KVM_X86_SEV_ES_VM); + + /* + * KVM_SEV_ES_INIT has been deprecated by KVM_SEV_INIT2, so it will + * continue to only ever support the minimal GHCB protocol version. + */ + if (vm_type == KVM_X86_SEV_ES_VM) + data.ghcb_version = GHCB_VERSION_MIN; + + return __sev_guest_init(kvm, argp, &data, vm_type); +} + +static int sev_guest_init2(struct kvm *kvm, struct kvm_sev_cmd *argp) +{ + struct kvm_sev_init data; + + if (!to_kvm_sev_info(kvm)->need_init) + return -EINVAL; + + if (kvm->arch.vm_type != KVM_X86_SEV_VM && + kvm->arch.vm_type != KVM_X86_SEV_ES_VM && + kvm->arch.vm_type != KVM_X86_SNP_VM) + return -EINVAL; + + if (copy_from_user(&data, u64_to_user_ptr(argp->data), sizeof(data))) + return -EFAULT; + + return __sev_guest_init(kvm, argp, &data, kvm->arch.vm_type); +} + static int sev_bind_asid(struct kvm *kvm, unsigned int handle, int *error) { + unsigned int asid = sev_get_asid(kvm); struct sev_data_activate activate; - int asid = sev_get_asid(kvm); int ret; /* activate ASID on the given handle */ @@ -301,29 +531,24 @@ static int sev_bind_asid(struct kvm *kvm, unsigned int handle, int *error) static int __sev_issue_cmd(int fd, int id, void *data, int *error) { - struct fd f; - int ret; + CLASS(fd, f)(fd); - f = fdget(fd); - if (!f.file) + if (fd_empty(f)) return -EBADF; - ret = sev_issue_cmd_external_user(f.file, id, data, error); - - fdput(f); - return ret; + return sev_issue_cmd_external_user(fd_file(f), id, data, error); } static int sev_issue_cmd(struct kvm *kvm, int id, void *data, int *error) { - struct kvm_sev_info *sev = &to_kvm_svm(kvm)->sev_info; + struct kvm_sev_info *sev = to_kvm_sev_info(kvm); return __sev_issue_cmd(sev->fd, id, data, error); } static int sev_launch_start(struct kvm *kvm, struct kvm_sev_cmd *argp) { - struct kvm_sev_info *sev = &to_kvm_svm(kvm)->sev_info; + struct kvm_sev_info *sev = to_kvm_sev_info(kvm); struct sev_data_launch_start start; struct kvm_sev_launch_start params; void *dh_blob, *session_blob; @@ -333,9 +558,11 @@ static int sev_launch_start(struct kvm *kvm, struct kvm_sev_cmd *argp) if (!sev_guest(kvm)) return -ENOTTY; - if (copy_from_user(¶ms, (void __user *)(uintptr_t)argp->data, sizeof(params))) + if (copy_from_user(¶ms, u64_to_user_ptr(argp->data), sizeof(params))) return -EFAULT; + sev->policy = params.policy; + memset(&start, 0, sizeof(start)); dh_blob = NULL; @@ -377,7 +604,7 @@ static int sev_launch_start(struct kvm *kvm, struct kvm_sev_cmd *argp) /* return handle to userspace */ params.handle = start.handle; - if (copy_to_user((void __user *)(uintptr_t)argp->data, ¶ms, sizeof(params))) { + if (copy_to_user(u64_to_user_ptr(argp->data), ¶ms, sizeof(params))) { sev_unbind_asid(kvm, start.handle); ret = -EFAULT; goto e_free_session; @@ -395,9 +622,9 @@ e_free_dh: static struct page **sev_pin_memory(struct kvm *kvm, unsigned long uaddr, unsigned long ulen, unsigned long *n, - int write) + unsigned int flags) { - struct kvm_sev_info *sev = &to_kvm_svm(kvm)->sev_info; + struct kvm_sev_info *sev = to_kvm_sev_info(kvm); unsigned long npages, size; int npinned; unsigned long locked, lock_limit; @@ -428,7 +655,7 @@ static struct page **sev_pin_memory(struct kvm *kvm, unsigned long uaddr, /* Avoid using vmalloc for smaller buffers. */ size = npages * sizeof(struct page *); if (size > PAGE_SIZE) - pages = __vmalloc(size, GFP_KERNEL_ACCOUNT | __GFP_ZERO); + pages = __vmalloc(size, GFP_KERNEL_ACCOUNT); else pages = kmalloc(size, GFP_KERNEL_ACCOUNT); @@ -436,7 +663,7 @@ static struct page **sev_pin_memory(struct kvm *kvm, unsigned long uaddr, return ERR_PTR(-ENOMEM); /* Pin the user virtual address. */ - npinned = pin_user_pages_fast(uaddr, npages, write ? FOLL_WRITE : 0, pages); + npinned = pin_user_pages_fast(uaddr, npages, flags, pages); if (npinned != npages) { pr_err("SEV: Failure locking %lu pages.\n", npages); ret = -ENOMEM; @@ -459,11 +686,9 @@ err: static void sev_unpin_memory(struct kvm *kvm, struct page **pages, unsigned long npages) { - struct kvm_sev_info *sev = &to_kvm_svm(kvm)->sev_info; - unpin_user_pages(pages, npages); kvfree(pages); - sev->pages_locked -= npages; + to_kvm_sev_info(kvm)->pages_locked -= npages; } static void sev_clflush_pages(struct page *pages[], unsigned long npages) @@ -507,7 +732,6 @@ static unsigned long get_num_contig_pages(unsigned long idx, static int sev_launch_update_data(struct kvm *kvm, struct kvm_sev_cmd *argp) { unsigned long vaddr, vaddr_end, next_vaddr, npages, pages, size, i; - struct kvm_sev_info *sev = &to_kvm_svm(kvm)->sev_info; struct kvm_sev_launch_update_data params; struct sev_data_launch_update_data data; struct page **inpages; @@ -516,7 +740,7 @@ static int sev_launch_update_data(struct kvm *kvm, struct kvm_sev_cmd *argp) if (!sev_guest(kvm)) return -ENOTTY; - if (copy_from_user(¶ms, (void __user *)(uintptr_t)argp->data, sizeof(params))) + if (copy_from_user(¶ms, u64_to_user_ptr(argp->data), sizeof(params))) return -EFAULT; vaddr = params.uaddr; @@ -524,7 +748,7 @@ static int sev_launch_update_data(struct kvm *kvm, struct kvm_sev_cmd *argp) vaddr_end = vaddr + size; /* Lock the user memory. */ - inpages = sev_pin_memory(kvm, vaddr, size, &npages, 1); + inpages = sev_pin_memory(kvm, vaddr, size, &npages, FOLL_WRITE); if (IS_ERR(inpages)) return PTR_ERR(inpages); @@ -535,7 +759,7 @@ static int sev_launch_update_data(struct kvm *kvm, struct kvm_sev_cmd *argp) sev_clflush_pages(inpages, npages); data.reserved = 0; - data.handle = sev->handle; + data.handle = to_kvm_sev_info(kvm)->handle; for (i = 0; vaddr < vaddr_end; vaddr = next_vaddr, i += pages) { int offset, len; @@ -574,7 +798,13 @@ e_unpin: static int sev_es_sync_vmsa(struct vcpu_svm *svm) { + struct kvm_vcpu *vcpu = &svm->vcpu; + struct kvm_sev_info *sev = to_kvm_sev_info(vcpu->kvm); struct sev_es_save_area *save = svm->sev_es.vmsa; + struct xregs_state *xsave; + const u8 *s; + u8 *d; + int i; /* Check some debug related fields before encrypting the VMSA */ if (svm->vcpu.guest_debug || (svm->vmcb->save.dr7 & ~DR7_FIXED_1)) @@ -615,10 +845,44 @@ static int sev_es_sync_vmsa(struct vcpu_svm *svm) save->xss = svm->vcpu.arch.ia32_xss; save->dr6 = svm->vcpu.arch.dr6; - if (sev_es_debug_swap_enabled) { - save->sev_features |= SVM_SEV_FEAT_DEBUG_SWAP; - pr_warn_once("Enabling DebugSwap with KVM_SEV_ES_INIT. " - "This will not work starting with Linux 6.10\n"); + save->sev_features = sev->vmsa_features; + + /* + * Skip FPU and AVX setup with KVM_SEV_ES_INIT to avoid + * breaking older measurements. + */ + if (vcpu->kvm->arch.vm_type != KVM_X86_DEFAULT_VM) { + xsave = &vcpu->arch.guest_fpu.fpstate->regs.xsave; + save->x87_dp = xsave->i387.rdp; + save->mxcsr = xsave->i387.mxcsr; + save->x87_ftw = xsave->i387.twd; + save->x87_fsw = xsave->i387.swd; + save->x87_fcw = xsave->i387.cwd; + save->x87_fop = xsave->i387.fop; + save->x87_ds = 0; + save->x87_cs = 0; + save->x87_rip = xsave->i387.rip; + + for (i = 0; i < 8; i++) { + /* + * The format of the x87 save area is undocumented and + * definitely not what you would expect. It consists of + * an 8*8 bytes area with bytes 0-7, and an 8*2 bytes + * area with bytes 8-9 of each register. + */ + d = save->fpreg_x87 + i * 8; + s = ((u8 *)xsave->i387.st_space) + i * 16; + memcpy(d, s, 8); + save->fpreg_x87[64 + i * 2] = s[8]; + save->fpreg_x87[64 + i * 2 + 1] = s[9]; + } + memcpy(save->fpreg_xmm, xsave->i387.xmm_space, 256); + + s = get_xsave_addr(xsave, XFEATURE_YMM); + if (s) + memcpy(save->fpreg_ymm, s, 256); + else + memset(save->fpreg_ymm, 0, 256); } pr_debug("Virtual Machine Save Area (VMSA):\n"); @@ -652,14 +916,29 @@ static int __sev_launch_update_vmsa(struct kvm *kvm, struct kvm_vcpu *vcpu, clflush_cache_range(svm->sev_es.vmsa, PAGE_SIZE); vmsa.reserved = 0; - vmsa.handle = to_kvm_svm(kvm)->sev_info.handle; + vmsa.handle = to_kvm_sev_info(kvm)->handle; vmsa.address = __sme_pa(svm->sev_es.vmsa); vmsa.len = PAGE_SIZE; ret = sev_issue_cmd(kvm, SEV_CMD_LAUNCH_UPDATE_VMSA, &vmsa, error); if (ret) return ret; + /* + * SEV-ES guests maintain an encrypted version of their FPU + * state which is restored and saved on VMRUN and VMEXIT. + * Mark vcpu->arch.guest_fpu->fpstate as scratch so it won't + * do xsave/xrstor on it. + */ + fpstate_set_confidential(&vcpu->arch.guest_fpu); vcpu->arch.guest_state_protected = true; + + /* + * SEV-ES guest mandates LBR Virtualization to be _always_ ON. Enable it + * only after setting guest_state_protected because KVM_SET_MSRS allows + * dynamic toggling of LBRV (for performance reason) on write access to + * MSR_IA32_DEBUGCTLMSR when guest_state_protected is not set. + */ + svm_enable_lbrv(vcpu); return 0; } @@ -689,8 +968,7 @@ static int sev_launch_update_vmsa(struct kvm *kvm, struct kvm_sev_cmd *argp) static int sev_launch_measure(struct kvm *kvm, struct kvm_sev_cmd *argp) { - void __user *measure = (void __user *)(uintptr_t)argp->data; - struct kvm_sev_info *sev = &to_kvm_svm(kvm)->sev_info; + void __user *measure = u64_to_user_ptr(argp->data); struct sev_data_launch_measure data; struct kvm_sev_launch_measure params; void __user *p = NULL; @@ -709,7 +987,7 @@ static int sev_launch_measure(struct kvm *kvm, struct kvm_sev_cmd *argp) if (!params.len) goto cmd; - p = (void __user *)(uintptr_t)params.uaddr; + p = u64_to_user_ptr(params.uaddr); if (p) { if (params.len > SEV_FW_BLOB_MAX_SIZE) return -EINVAL; @@ -723,7 +1001,7 @@ static int sev_launch_measure(struct kvm *kvm, struct kvm_sev_cmd *argp) } cmd: - data.handle = sev->handle; + data.handle = to_kvm_sev_info(kvm)->handle; ret = sev_issue_cmd(kvm, SEV_CMD_LAUNCH_MEASURE, &data, &argp->error); /* @@ -751,19 +1029,17 @@ e_free_blob: static int sev_launch_finish(struct kvm *kvm, struct kvm_sev_cmd *argp) { - struct kvm_sev_info *sev = &to_kvm_svm(kvm)->sev_info; struct sev_data_launch_finish data; if (!sev_guest(kvm)) return -ENOTTY; - data.handle = sev->handle; + data.handle = to_kvm_sev_info(kvm)->handle; return sev_issue_cmd(kvm, SEV_CMD_LAUNCH_FINISH, &data, &argp->error); } static int sev_guest_status(struct kvm *kvm, struct kvm_sev_cmd *argp) { - struct kvm_sev_info *sev = &to_kvm_svm(kvm)->sev_info; struct kvm_sev_guest_status params; struct sev_data_guest_status data; int ret; @@ -773,7 +1049,7 @@ static int sev_guest_status(struct kvm *kvm, struct kvm_sev_cmd *argp) memset(&data, 0, sizeof(data)); - data.handle = sev->handle; + data.handle = to_kvm_sev_info(kvm)->handle; ret = sev_issue_cmd(kvm, SEV_CMD_GUEST_STATUS, &data, &argp->error); if (ret) return ret; @@ -782,7 +1058,7 @@ static int sev_guest_status(struct kvm *kvm, struct kvm_sev_cmd *argp) params.state = data.state; params.handle = data.handle; - if (copy_to_user((void __user *)(uintptr_t)argp->data, ¶ms, sizeof(params))) + if (copy_to_user(u64_to_user_ptr(argp->data), ¶ms, sizeof(params))) ret = -EFAULT; return ret; @@ -792,11 +1068,10 @@ static int __sev_issue_dbg_cmd(struct kvm *kvm, unsigned long src, unsigned long dst, int size, int *error, bool enc) { - struct kvm_sev_info *sev = &to_kvm_svm(kvm)->sev_info; struct sev_data_dbg data; data.reserved = 0; - data.handle = sev->handle; + data.handle = to_kvm_sev_info(kvm)->handle; data.dst_addr = dst; data.src_addr = src; data.len = size; @@ -947,7 +1222,7 @@ static int sev_dbg_crypt(struct kvm *kvm, struct kvm_sev_cmd *argp, bool dec) if (!sev_guest(kvm)) return -ENOTTY; - if (copy_from_user(&debug, (void __user *)(uintptr_t)argp->data, sizeof(debug))) + if (copy_from_user(&debug, u64_to_user_ptr(argp->data), sizeof(debug))) return -EFAULT; if (!debug.len || debug.src_uaddr + debug.len < debug.src_uaddr) @@ -968,7 +1243,7 @@ static int sev_dbg_crypt(struct kvm *kvm, struct kvm_sev_cmd *argp, bool dec) if (IS_ERR(src_p)) return PTR_ERR(src_p); - dst_p = sev_pin_memory(kvm, dst_vaddr & PAGE_MASK, PAGE_SIZE, &n, 1); + dst_p = sev_pin_memory(kvm, dst_vaddr & PAGE_MASK, PAGE_SIZE, &n, FOLL_WRITE); if (IS_ERR(dst_p)) { sev_unpin_memory(kvm, src_p, n); return PTR_ERR(dst_p); @@ -1020,7 +1295,6 @@ err: static int sev_launch_secret(struct kvm *kvm, struct kvm_sev_cmd *argp) { - struct kvm_sev_info *sev = &to_kvm_svm(kvm)->sev_info; struct sev_data_launch_secret data; struct kvm_sev_launch_secret params; struct page **pages; @@ -1031,10 +1305,10 @@ static int sev_launch_secret(struct kvm *kvm, struct kvm_sev_cmd *argp) if (!sev_guest(kvm)) return -ENOTTY; - if (copy_from_user(¶ms, (void __user *)(uintptr_t)argp->data, sizeof(params))) + if (copy_from_user(¶ms, u64_to_user_ptr(argp->data), sizeof(params))) return -EFAULT; - pages = sev_pin_memory(kvm, params.guest_uaddr, params.guest_len, &n, 1); + pages = sev_pin_memory(kvm, params.guest_uaddr, params.guest_len, &n, FOLL_WRITE); if (IS_ERR(pages)) return PTR_ERR(pages); @@ -1076,7 +1350,7 @@ static int sev_launch_secret(struct kvm *kvm, struct kvm_sev_cmd *argp) data.hdr_address = __psp_pa(hdr); data.hdr_len = params.hdr_len; - data.handle = sev->handle; + data.handle = to_kvm_sev_info(kvm)->handle; ret = sev_issue_cmd(kvm, SEV_CMD_LAUNCH_UPDATE_SECRET, &data, &argp->error); kfree(hdr); @@ -1095,8 +1369,7 @@ e_unpin_memory: static int sev_get_attestation_report(struct kvm *kvm, struct kvm_sev_cmd *argp) { - void __user *report = (void __user *)(uintptr_t)argp->data; - struct kvm_sev_info *sev = &to_kvm_svm(kvm)->sev_info; + void __user *report = u64_to_user_ptr(argp->data); struct sev_data_attestation_report data; struct kvm_sev_attestation_report params; void __user *p; @@ -1106,7 +1379,7 @@ static int sev_get_attestation_report(struct kvm *kvm, struct kvm_sev_cmd *argp) if (!sev_guest(kvm)) return -ENOTTY; - if (copy_from_user(¶ms, (void __user *)(uintptr_t)argp->data, sizeof(params))) + if (copy_from_user(¶ms, u64_to_user_ptr(argp->data), sizeof(params))) return -EFAULT; memset(&data, 0, sizeof(data)); @@ -1115,7 +1388,7 @@ static int sev_get_attestation_report(struct kvm *kvm, struct kvm_sev_cmd *argp) if (!params.len) goto cmd; - p = (void __user *)(uintptr_t)params.uaddr; + p = u64_to_user_ptr(params.uaddr); if (p) { if (params.len > SEV_FW_BLOB_MAX_SIZE) return -EINVAL; @@ -1129,7 +1402,7 @@ static int sev_get_attestation_report(struct kvm *kvm, struct kvm_sev_cmd *argp) memcpy(data.mnonce, params.mnonce, sizeof(params.mnonce)); } cmd: - data.handle = sev->handle; + data.handle = to_kvm_sev_info(kvm)->handle; ret = sev_issue_cmd(kvm, SEV_CMD_ATTESTATION_REPORT, &data, &argp->error); /* * If we query the session length, FW responded with expected data. @@ -1159,16 +1432,15 @@ static int __sev_send_start_query_session_length(struct kvm *kvm, struct kvm_sev_cmd *argp, struct kvm_sev_send_start *params) { - struct kvm_sev_info *sev = &to_kvm_svm(kvm)->sev_info; struct sev_data_send_start data; int ret; memset(&data, 0, sizeof(data)); - data.handle = sev->handle; + data.handle = to_kvm_sev_info(kvm)->handle; ret = sev_issue_cmd(kvm, SEV_CMD_SEND_START, &data, &argp->error); params->session_len = data.session_len; - if (copy_to_user((void __user *)(uintptr_t)argp->data, params, + if (copy_to_user(u64_to_user_ptr(argp->data), params, sizeof(struct kvm_sev_send_start))) ret = -EFAULT; @@ -1177,7 +1449,6 @@ __sev_send_start_query_session_length(struct kvm *kvm, struct kvm_sev_cmd *argp, static int sev_send_start(struct kvm *kvm, struct kvm_sev_cmd *argp) { - struct kvm_sev_info *sev = &to_kvm_svm(kvm)->sev_info; struct sev_data_send_start data; struct kvm_sev_send_start params; void *amd_certs, *session_data; @@ -1187,7 +1458,7 @@ static int sev_send_start(struct kvm *kvm, struct kvm_sev_cmd *argp) if (!sev_guest(kvm)) return -ENOTTY; - if (copy_from_user(¶ms, (void __user *)(uintptr_t)argp->data, + if (copy_from_user(¶ms, u64_to_user_ptr(argp->data), sizeof(struct kvm_sev_send_start))) return -EFAULT; @@ -1238,11 +1509,11 @@ static int sev_send_start(struct kvm *kvm, struct kvm_sev_cmd *argp) data.amd_certs_len = params.amd_certs_len; data.session_address = __psp_pa(session_data); data.session_len = params.session_len; - data.handle = sev->handle; + data.handle = to_kvm_sev_info(kvm)->handle; ret = sev_issue_cmd(kvm, SEV_CMD_SEND_START, &data, &argp->error); - if (!ret && copy_to_user((void __user *)(uintptr_t)params.session_uaddr, + if (!ret && copy_to_user(u64_to_user_ptr(params.session_uaddr), session_data, params.session_len)) { ret = -EFAULT; goto e_free_amd_cert; @@ -1250,7 +1521,7 @@ static int sev_send_start(struct kvm *kvm, struct kvm_sev_cmd *argp) params.policy = data.policy; params.session_len = data.session_len; - if (copy_to_user((void __user *)(uintptr_t)argp->data, ¶ms, + if (copy_to_user(u64_to_user_ptr(argp->data), ¶ms, sizeof(struct kvm_sev_send_start))) ret = -EFAULT; @@ -1270,18 +1541,17 @@ static int __sev_send_update_data_query_lengths(struct kvm *kvm, struct kvm_sev_cmd *argp, struct kvm_sev_send_update_data *params) { - struct kvm_sev_info *sev = &to_kvm_svm(kvm)->sev_info; struct sev_data_send_update_data data; int ret; memset(&data, 0, sizeof(data)); - data.handle = sev->handle; + data.handle = to_kvm_sev_info(kvm)->handle; ret = sev_issue_cmd(kvm, SEV_CMD_SEND_UPDATE_DATA, &data, &argp->error); params->hdr_len = data.hdr_len; params->trans_len = data.trans_len; - if (copy_to_user((void __user *)(uintptr_t)argp->data, params, + if (copy_to_user(u64_to_user_ptr(argp->data), params, sizeof(struct kvm_sev_send_update_data))) ret = -EFAULT; @@ -1290,7 +1560,6 @@ __sev_send_update_data_query_lengths(struct kvm *kvm, struct kvm_sev_cmd *argp, static int sev_send_update_data(struct kvm *kvm, struct kvm_sev_cmd *argp) { - struct kvm_sev_info *sev = &to_kvm_svm(kvm)->sev_info; struct sev_data_send_update_data data; struct kvm_sev_send_update_data params; void *hdr, *trans_data; @@ -1301,7 +1570,7 @@ static int sev_send_update_data(struct kvm *kvm, struct kvm_sev_cmd *argp) if (!sev_guest(kvm)) return -ENOTTY; - if (copy_from_user(¶ms, (void __user *)(uintptr_t)argp->data, + if (copy_from_user(¶ms, u64_to_user_ptr(argp->data), sizeof(struct kvm_sev_send_update_data))) return -EFAULT; @@ -1326,11 +1595,11 @@ static int sev_send_update_data(struct kvm *kvm, struct kvm_sev_cmd *argp) /* allocate memory for header and transport buffer */ ret = -ENOMEM; - hdr = kzalloc(params.hdr_len, GFP_KERNEL_ACCOUNT); + hdr = kzalloc(params.hdr_len, GFP_KERNEL); if (!hdr) goto e_unpin; - trans_data = kzalloc(params.trans_len, GFP_KERNEL_ACCOUNT); + trans_data = kzalloc(params.trans_len, GFP_KERNEL); if (!trans_data) goto e_free_hdr; @@ -1344,7 +1613,7 @@ static int sev_send_update_data(struct kvm *kvm, struct kvm_sev_cmd *argp) data.guest_address = (page_to_pfn(guest_page[0]) << PAGE_SHIFT) + offset; data.guest_address |= sev_me_mask; data.guest_len = params.guest_len; - data.handle = sev->handle; + data.handle = to_kvm_sev_info(kvm)->handle; ret = sev_issue_cmd(kvm, SEV_CMD_SEND_UPDATE_DATA, &data, &argp->error); @@ -1352,14 +1621,14 @@ static int sev_send_update_data(struct kvm *kvm, struct kvm_sev_cmd *argp) goto e_free_trans_data; /* copy transport buffer to user space */ - if (copy_to_user((void __user *)(uintptr_t)params.trans_uaddr, + if (copy_to_user(u64_to_user_ptr(params.trans_uaddr), trans_data, params.trans_len)) { ret = -EFAULT; goto e_free_trans_data; } /* Copy packet header to userspace. */ - if (copy_to_user((void __user *)(uintptr_t)params.hdr_uaddr, hdr, + if (copy_to_user(u64_to_user_ptr(params.hdr_uaddr), hdr, params.hdr_len)) ret = -EFAULT; @@ -1375,31 +1644,29 @@ e_unpin: static int sev_send_finish(struct kvm *kvm, struct kvm_sev_cmd *argp) { - struct kvm_sev_info *sev = &to_kvm_svm(kvm)->sev_info; struct sev_data_send_finish data; if (!sev_guest(kvm)) return -ENOTTY; - data.handle = sev->handle; + data.handle = to_kvm_sev_info(kvm)->handle; return sev_issue_cmd(kvm, SEV_CMD_SEND_FINISH, &data, &argp->error); } static int sev_send_cancel(struct kvm *kvm, struct kvm_sev_cmd *argp) { - struct kvm_sev_info *sev = &to_kvm_svm(kvm)->sev_info; struct sev_data_send_cancel data; if (!sev_guest(kvm)) return -ENOTTY; - data.handle = sev->handle; + data.handle = to_kvm_sev_info(kvm)->handle; return sev_issue_cmd(kvm, SEV_CMD_SEND_CANCEL, &data, &argp->error); } static int sev_receive_start(struct kvm *kvm, struct kvm_sev_cmd *argp) { - struct kvm_sev_info *sev = &to_kvm_svm(kvm)->sev_info; + struct kvm_sev_info *sev = to_kvm_sev_info(kvm); struct sev_data_receive_start start; struct kvm_sev_receive_start params; int *error = &argp->error; @@ -1411,7 +1678,7 @@ static int sev_receive_start(struct kvm *kvm, struct kvm_sev_cmd *argp) return -ENOTTY; /* Get parameter from the userspace */ - if (copy_from_user(¶ms, (void __user *)(uintptr_t)argp->data, + if (copy_from_user(¶ms, u64_to_user_ptr(argp->data), sizeof(struct kvm_sev_receive_start))) return -EFAULT; @@ -1453,7 +1720,7 @@ static int sev_receive_start(struct kvm *kvm, struct kvm_sev_cmd *argp) } params.handle = start.handle; - if (copy_to_user((void __user *)(uintptr_t)argp->data, + if (copy_to_user(u64_to_user_ptr(argp->data), ¶ms, sizeof(struct kvm_sev_receive_start))) { ret = -EFAULT; sev_unbind_asid(kvm, start.handle); @@ -1473,7 +1740,6 @@ e_free_pdh: static int sev_receive_update_data(struct kvm *kvm, struct kvm_sev_cmd *argp) { - struct kvm_sev_info *sev = &to_kvm_svm(kvm)->sev_info; struct kvm_sev_receive_update_data params; struct sev_data_receive_update_data data; void *hdr = NULL, *trans = NULL; @@ -1484,7 +1750,7 @@ static int sev_receive_update_data(struct kvm *kvm, struct kvm_sev_cmd *argp) if (!sev_guest(kvm)) return -EINVAL; - if (copy_from_user(¶ms, (void __user *)(uintptr_t)argp->data, + if (copy_from_user(¶ms, u64_to_user_ptr(argp->data), sizeof(struct kvm_sev_receive_update_data))) return -EFAULT; @@ -1516,7 +1782,7 @@ static int sev_receive_update_data(struct kvm *kvm, struct kvm_sev_cmd *argp) /* Pin guest memory */ guest_page = sev_pin_memory(kvm, params.guest_uaddr & PAGE_MASK, - PAGE_SIZE, &n, 1); + PAGE_SIZE, &n, FOLL_WRITE); if (IS_ERR(guest_page)) { ret = PTR_ERR(guest_page); goto e_free_trans; @@ -1533,7 +1799,7 @@ static int sev_receive_update_data(struct kvm *kvm, struct kvm_sev_cmd *argp) data.guest_address = (page_to_pfn(guest_page[0]) << PAGE_SHIFT) + offset; data.guest_address |= sev_me_mask; data.guest_len = params.guest_len; - data.handle = sev->handle; + data.handle = to_kvm_sev_info(kvm)->handle; ret = sev_issue_cmd(kvm, SEV_CMD_RECEIVE_UPDATE_DATA, &data, &argp->error); @@ -1550,13 +1816,12 @@ e_free_hdr: static int sev_receive_finish(struct kvm *kvm, struct kvm_sev_cmd *argp) { - struct kvm_sev_info *sev = &to_kvm_svm(kvm)->sev_info; struct sev_data_receive_finish data; if (!sev_guest(kvm)) return -ENOTTY; - data.handle = sev->handle; + data.handle = to_kvm_sev_info(kvm)->handle; return sev_issue_cmd(kvm, SEV_CMD_RECEIVE_FINISH, &data, &argp->error); } @@ -1576,8 +1841,8 @@ static bool is_cmd_allowed_from_mirror(u32 cmd_id) static int sev_lock_two_vms(struct kvm *dst_kvm, struct kvm *src_kvm) { - struct kvm_sev_info *dst_sev = &to_kvm_svm(dst_kvm)->sev_info; - struct kvm_sev_info *src_sev = &to_kvm_svm(src_kvm)->sev_info; + struct kvm_sev_info *dst_sev = to_kvm_sev_info(dst_kvm); + struct kvm_sev_info *src_sev = to_kvm_sev_info(src_kvm); int r = -EBUSY; if (dst_kvm == src_kvm) @@ -1611,8 +1876,8 @@ release_dst: static void sev_unlock_two_vms(struct kvm *dst_kvm, struct kvm *src_kvm) { - struct kvm_sev_info *dst_sev = &to_kvm_svm(dst_kvm)->sev_info; - struct kvm_sev_info *src_sev = &to_kvm_svm(src_kvm)->sev_info; + struct kvm_sev_info *dst_sev = to_kvm_sev_info(dst_kvm); + struct kvm_sev_info *src_sev = to_kvm_sev_info(src_kvm); mutex_unlock(&dst_kvm->lock); mutex_unlock(&src_kvm->lock); @@ -1620,74 +1885,10 @@ static void sev_unlock_two_vms(struct kvm *dst_kvm, struct kvm *src_kvm) atomic_set_release(&src_sev->migration_in_progress, 0); } -/* vCPU mutex subclasses. */ -enum sev_migration_role { - SEV_MIGRATION_SOURCE = 0, - SEV_MIGRATION_TARGET, - SEV_NR_MIGRATION_ROLES, -}; - -static int sev_lock_vcpus_for_migration(struct kvm *kvm, - enum sev_migration_role role) -{ - struct kvm_vcpu *vcpu; - unsigned long i, j; - - kvm_for_each_vcpu(i, vcpu, kvm) { - if (mutex_lock_killable_nested(&vcpu->mutex, role)) - goto out_unlock; - -#ifdef CONFIG_PROVE_LOCKING - if (!i) - /* - * Reset the role to one that avoids colliding with - * the role used for the first vcpu mutex. - */ - role = SEV_NR_MIGRATION_ROLES; - else - mutex_release(&vcpu->mutex.dep_map, _THIS_IP_); -#endif - } - - return 0; - -out_unlock: - - kvm_for_each_vcpu(j, vcpu, kvm) { - if (i == j) - break; - -#ifdef CONFIG_PROVE_LOCKING - if (j) - mutex_acquire(&vcpu->mutex.dep_map, role, 0, _THIS_IP_); -#endif - - mutex_unlock(&vcpu->mutex); - } - return -EINTR; -} - -static void sev_unlock_vcpus_for_migration(struct kvm *kvm) -{ - struct kvm_vcpu *vcpu; - unsigned long i; - bool first = true; - - kvm_for_each_vcpu(i, vcpu, kvm) { - if (first) - first = false; - else - mutex_acquire(&vcpu->mutex.dep_map, - SEV_NR_MIGRATION_ROLES, 0, _THIS_IP_); - - mutex_unlock(&vcpu->mutex); - } -} - static void sev_migrate_from(struct kvm *dst_kvm, struct kvm *src_kvm) { - struct kvm_sev_info *dst = &to_kvm_svm(dst_kvm)->sev_info; - struct kvm_sev_info *src = &to_kvm_svm(src_kvm)->sev_info; + struct kvm_sev_info *dst = to_kvm_sev_info(dst_kvm); + struct kvm_sev_info *src = to_kvm_sev_info(src_kvm); struct kvm_vcpu *dst_vcpu, *src_vcpu; struct vcpu_svm *dst_svm, *src_svm; struct kvm_sev_info *mirror; @@ -1699,6 +1900,7 @@ static void sev_migrate_from(struct kvm *dst_kvm, struct kvm *src_kvm) dst->pages_locked = src->pages_locked; dst->enc_context_owner = src->enc_context_owner; dst->es_active = src->es_active; + dst->vmsa_features = src->vmsa_features; src->asid = 0; src->active = false; @@ -1726,8 +1928,7 @@ static void sev_migrate_from(struct kvm *dst_kvm, struct kvm *src_kvm) * and add the new mirror to the list. */ if (is_mirroring_enc_context(dst_kvm)) { - struct kvm_sev_info *owner_sev_info = - &to_kvm_svm(dst->enc_context_owner)->sev_info; + struct kvm_sev_info *owner_sev_info = to_kvm_sev_info(dst->enc_context_owner); list_del(&src->mirror_entry); list_add_tail(&dst->mirror_entry, &owner_sev_info->mirror_vms); @@ -1786,32 +1987,31 @@ static int sev_check_source_vcpus(struct kvm *dst, struct kvm *src) int sev_vm_move_enc_context_from(struct kvm *kvm, unsigned int source_fd) { - struct kvm_sev_info *dst_sev = &to_kvm_svm(kvm)->sev_info; + struct kvm_sev_info *dst_sev = to_kvm_sev_info(kvm); struct kvm_sev_info *src_sev, *cg_cleanup_sev; - struct fd f = fdget(source_fd); + CLASS(fd, f)(source_fd); struct kvm *source_kvm; bool charged = false; int ret; - if (!f.file) + if (fd_empty(f)) return -EBADF; - if (!file_is_kvm(f.file)) { - ret = -EBADF; - goto out_fput; - } + if (!file_is_kvm(fd_file(f))) + return -EBADF; - source_kvm = f.file->private_data; + source_kvm = fd_file(f)->private_data; ret = sev_lock_two_vms(kvm, source_kvm); if (ret) - goto out_fput; + return ret; - if (sev_guest(kvm) || !sev_guest(source_kvm)) { + if (kvm->arch.vm_type != source_kvm->arch.vm_type || + sev_guest(kvm) || !sev_guest(source_kvm)) { ret = -EINVAL; goto out_unlock; } - src_sev = &to_kvm_svm(source_kvm)->sev_info; + src_sev = to_kvm_sev_info(source_kvm); dst_sev->misc_cg = get_current_misc_cg(); cg_cleanup_sev = dst_sev; @@ -1822,10 +2022,10 @@ int sev_vm_move_enc_context_from(struct kvm *kvm, unsigned int source_fd) charged = true; } - ret = sev_lock_vcpus_for_migration(kvm, SEV_MIGRATION_SOURCE); + ret = kvm_lock_all_vcpus(kvm); if (ret) goto out_dst_cgroup; - ret = sev_lock_vcpus_for_migration(source_kvm, SEV_MIGRATION_TARGET); + ret = kvm_lock_all_vcpus(source_kvm); if (ret) goto out_dst_vcpu; @@ -1839,9 +2039,9 @@ int sev_vm_move_enc_context_from(struct kvm *kvm, unsigned int source_fd) ret = 0; out_source_vcpu: - sev_unlock_vcpus_for_migration(source_kvm); + kvm_unlock_all_vcpus(source_kvm); out_dst_vcpu: - sev_unlock_vcpus_for_migration(kvm); + kvm_unlock_all_vcpus(kvm); out_dst_cgroup: /* Operates on the source on success, on the destination on failure. */ if (charged) @@ -1850,8 +2050,429 @@ out_dst_cgroup: cg_cleanup_sev->misc_cg = NULL; out_unlock: sev_unlock_two_vms(kvm, source_kvm); -out_fput: - fdput(f); + return ret; +} + +int sev_dev_get_attr(u32 group, u64 attr, u64 *val) +{ + if (group != KVM_X86_GRP_SEV) + return -ENXIO; + + switch (attr) { + case KVM_X86_SEV_VMSA_FEATURES: + *val = sev_supported_vmsa_features; + return 0; + + default: + return -ENXIO; + } +} + +/* + * The guest context contains all the information, keys and metadata + * associated with the guest that the firmware tracks to implement SEV + * and SNP features. The firmware stores the guest context in hypervisor + * provide page via the SNP_GCTX_CREATE command. + */ +static void *snp_context_create(struct kvm *kvm, struct kvm_sev_cmd *argp) +{ + struct sev_data_snp_addr data = {}; + void *context; + int rc; + + /* Allocate memory for context page */ + context = snp_alloc_firmware_page(GFP_KERNEL_ACCOUNT); + if (!context) + return NULL; + + data.address = __psp_pa(context); + rc = __sev_issue_cmd(argp->sev_fd, SEV_CMD_SNP_GCTX_CREATE, &data, &argp->error); + if (rc) { + pr_warn("Failed to create SEV-SNP context, rc %d fw_error %d", + rc, argp->error); + snp_free_firmware_page(context); + return NULL; + } + + return context; +} + +static int snp_bind_asid(struct kvm *kvm, int *error) +{ + struct kvm_sev_info *sev = to_kvm_sev_info(kvm); + struct sev_data_snp_activate data = {0}; + + data.gctx_paddr = __psp_pa(sev->snp_context); + data.asid = sev_get_asid(kvm); + return sev_issue_cmd(kvm, SEV_CMD_SNP_ACTIVATE, &data, error); +} + +static int snp_launch_start(struct kvm *kvm, struct kvm_sev_cmd *argp) +{ + struct kvm_sev_info *sev = to_kvm_sev_info(kvm); + struct sev_data_snp_launch_start start = {0}; + struct kvm_sev_snp_launch_start params; + int rc; + + if (!sev_snp_guest(kvm)) + return -ENOTTY; + + if (copy_from_user(¶ms, u64_to_user_ptr(argp->data), sizeof(params))) + return -EFAULT; + + /* Don't allow userspace to allocate memory for more than 1 SNP context. */ + if (sev->snp_context) + return -EINVAL; + + if (params.flags) + return -EINVAL; + + if (params.policy & ~SNP_POLICY_MASK_VALID) + return -EINVAL; + + /* Check for policy bits that must be set */ + if (!(params.policy & SNP_POLICY_MASK_RSVD_MBO) || + !(params.policy & SNP_POLICY_MASK_SMT)) + return -EINVAL; + + if (params.policy & SNP_POLICY_MASK_SINGLE_SOCKET) + return -EINVAL; + + sev->policy = params.policy; + + sev->snp_context = snp_context_create(kvm, argp); + if (!sev->snp_context) + return -ENOTTY; + + start.gctx_paddr = __psp_pa(sev->snp_context); + start.policy = params.policy; + memcpy(start.gosvw, params.gosvw, sizeof(params.gosvw)); + rc = __sev_issue_cmd(argp->sev_fd, SEV_CMD_SNP_LAUNCH_START, &start, &argp->error); + if (rc) { + pr_debug("%s: SEV_CMD_SNP_LAUNCH_START firmware command failed, rc %d\n", + __func__, rc); + goto e_free_context; + } + + sev->fd = argp->sev_fd; + rc = snp_bind_asid(kvm, &argp->error); + if (rc) { + pr_debug("%s: Failed to bind ASID to SEV-SNP context, rc %d\n", + __func__, rc); + goto e_free_context; + } + + return 0; + +e_free_context: + snp_decommission_context(kvm); + + return rc; +} + +struct sev_gmem_populate_args { + __u8 type; + int sev_fd; + int fw_error; +}; + +static int sev_gmem_post_populate(struct kvm *kvm, gfn_t gfn_start, kvm_pfn_t pfn, + void __user *src, int order, void *opaque) +{ + struct sev_gmem_populate_args *sev_populate_args = opaque; + struct kvm_sev_info *sev = to_kvm_sev_info(kvm); + int n_private = 0, ret, i; + int npages = (1 << order); + gfn_t gfn; + + if (WARN_ON_ONCE(sev_populate_args->type != KVM_SEV_SNP_PAGE_TYPE_ZERO && !src)) + return -EINVAL; + + for (gfn = gfn_start, i = 0; gfn < gfn_start + npages; gfn++, i++) { + struct sev_data_snp_launch_update fw_args = {0}; + bool assigned = false; + int level; + + ret = snp_lookup_rmpentry((u64)pfn + i, &assigned, &level); + if (ret || assigned) { + pr_debug("%s: Failed to ensure GFN 0x%llx RMP entry is initial shared state, ret: %d assigned: %d\n", + __func__, gfn, ret, assigned); + ret = ret ? -EINVAL : -EEXIST; + goto err; + } + + if (src) { + void *vaddr = kmap_local_pfn(pfn + i); + + if (copy_from_user(vaddr, src + i * PAGE_SIZE, PAGE_SIZE)) { + ret = -EFAULT; + goto err; + } + kunmap_local(vaddr); + } + + ret = rmp_make_private(pfn + i, gfn << PAGE_SHIFT, PG_LEVEL_4K, + sev_get_asid(kvm), true); + if (ret) + goto err; + + n_private++; + + fw_args.gctx_paddr = __psp_pa(sev->snp_context); + fw_args.address = __sme_set(pfn_to_hpa(pfn + i)); + fw_args.page_size = PG_LEVEL_TO_RMP(PG_LEVEL_4K); + fw_args.page_type = sev_populate_args->type; + + ret = __sev_issue_cmd(sev_populate_args->sev_fd, SEV_CMD_SNP_LAUNCH_UPDATE, + &fw_args, &sev_populate_args->fw_error); + if (ret) + goto fw_err; + } + + return 0; + +fw_err: + /* + * If the firmware command failed handle the reclaim and cleanup of that + * PFN specially vs. prior pages which can be cleaned up below without + * needing to reclaim in advance. + * + * Additionally, when invalid CPUID function entries are detected, + * firmware writes the expected values into the page and leaves it + * unencrypted so it can be used for debugging and error-reporting. + * + * Copy this page back into the source buffer so userspace can use this + * information to provide information on which CPUID leaves/fields + * failed CPUID validation. + */ + if (!snp_page_reclaim(kvm, pfn + i) && + sev_populate_args->type == KVM_SEV_SNP_PAGE_TYPE_CPUID && + sev_populate_args->fw_error == SEV_RET_INVALID_PARAM) { + void *vaddr = kmap_local_pfn(pfn + i); + + if (copy_to_user(src + i * PAGE_SIZE, vaddr, PAGE_SIZE)) + pr_debug("Failed to write CPUID page back to userspace\n"); + + kunmap_local(vaddr); + } + + /* pfn + i is hypervisor-owned now, so skip below cleanup for it. */ + n_private--; + +err: + pr_debug("%s: exiting with error ret %d (fw_error %d), restoring %d gmem PFNs to shared.\n", + __func__, ret, sev_populate_args->fw_error, n_private); + for (i = 0; i < n_private; i++) + kvm_rmp_make_shared(kvm, pfn + i, PG_LEVEL_4K); + + return ret; +} + +static int snp_launch_update(struct kvm *kvm, struct kvm_sev_cmd *argp) +{ + struct kvm_sev_info *sev = to_kvm_sev_info(kvm); + struct sev_gmem_populate_args sev_populate_args = {0}; + struct kvm_sev_snp_launch_update params; + struct kvm_memory_slot *memslot; + long npages, count; + void __user *src; + int ret = 0; + + if (!sev_snp_guest(kvm) || !sev->snp_context) + return -EINVAL; + + if (copy_from_user(¶ms, u64_to_user_ptr(argp->data), sizeof(params))) + return -EFAULT; + + pr_debug("%s: GFN start 0x%llx length 0x%llx type %d flags %d\n", __func__, + params.gfn_start, params.len, params.type, params.flags); + + if (!PAGE_ALIGNED(params.len) || params.flags || + (params.type != KVM_SEV_SNP_PAGE_TYPE_NORMAL && + params.type != KVM_SEV_SNP_PAGE_TYPE_ZERO && + params.type != KVM_SEV_SNP_PAGE_TYPE_UNMEASURED && + params.type != KVM_SEV_SNP_PAGE_TYPE_SECRETS && + params.type != KVM_SEV_SNP_PAGE_TYPE_CPUID)) + return -EINVAL; + + npages = params.len / PAGE_SIZE; + + /* + * For each GFN that's being prepared as part of the initial guest + * state, the following pre-conditions are verified: + * + * 1) The backing memslot is a valid private memslot. + * 2) The GFN has been set to private via KVM_SET_MEMORY_ATTRIBUTES + * beforehand. + * 3) The PFN of the guest_memfd has not already been set to private + * in the RMP table. + * + * The KVM MMU relies on kvm->mmu_invalidate_seq to retry nested page + * faults if there's a race between a fault and an attribute update via + * KVM_SET_MEMORY_ATTRIBUTES, and a similar approach could be utilized + * here. However, kvm->slots_lock guards against both this as well as + * concurrent memslot updates occurring while these checks are being + * performed, so use that here to make it easier to reason about the + * initial expected state and better guard against unexpected + * situations. + */ + mutex_lock(&kvm->slots_lock); + + memslot = gfn_to_memslot(kvm, params.gfn_start); + if (!kvm_slot_can_be_private(memslot)) { + ret = -EINVAL; + goto out; + } + + sev_populate_args.sev_fd = argp->sev_fd; + sev_populate_args.type = params.type; + src = params.type == KVM_SEV_SNP_PAGE_TYPE_ZERO ? NULL : u64_to_user_ptr(params.uaddr); + + count = kvm_gmem_populate(kvm, params.gfn_start, src, npages, + sev_gmem_post_populate, &sev_populate_args); + if (count < 0) { + argp->error = sev_populate_args.fw_error; + pr_debug("%s: kvm_gmem_populate failed, ret %ld (fw_error %d)\n", + __func__, count, argp->error); + ret = -EIO; + } else { + params.gfn_start += count; + params.len -= count * PAGE_SIZE; + if (params.type != KVM_SEV_SNP_PAGE_TYPE_ZERO) + params.uaddr += count * PAGE_SIZE; + + ret = 0; + if (copy_to_user(u64_to_user_ptr(argp->data), ¶ms, sizeof(params))) + ret = -EFAULT; + } + +out: + mutex_unlock(&kvm->slots_lock); + + return ret; +} + +static int snp_launch_update_vmsa(struct kvm *kvm, struct kvm_sev_cmd *argp) +{ + struct kvm_sev_info *sev = to_kvm_sev_info(kvm); + struct sev_data_snp_launch_update data = {}; + struct kvm_vcpu *vcpu; + unsigned long i; + int ret; + + data.gctx_paddr = __psp_pa(sev->snp_context); + data.page_type = SNP_PAGE_TYPE_VMSA; + + kvm_for_each_vcpu(i, vcpu, kvm) { + struct vcpu_svm *svm = to_svm(vcpu); + u64 pfn = __pa(svm->sev_es.vmsa) >> PAGE_SHIFT; + + ret = sev_es_sync_vmsa(svm); + if (ret) + return ret; + + /* Transition the VMSA page to a firmware state. */ + ret = rmp_make_private(pfn, INITIAL_VMSA_GPA, PG_LEVEL_4K, sev->asid, true); + if (ret) + return ret; + + /* Issue the SNP command to encrypt the VMSA */ + data.address = __sme_pa(svm->sev_es.vmsa); + ret = __sev_issue_cmd(argp->sev_fd, SEV_CMD_SNP_LAUNCH_UPDATE, + &data, &argp->error); + if (ret) { + snp_page_reclaim(kvm, pfn); + + return ret; + } + + svm->vcpu.arch.guest_state_protected = true; + /* + * SEV-ES (and thus SNP) guest mandates LBR Virtualization to + * be _always_ ON. Enable it only after setting + * guest_state_protected because KVM_SET_MSRS allows dynamic + * toggling of LBRV (for performance reason) on write access to + * MSR_IA32_DEBUGCTLMSR when guest_state_protected is not set. + */ + svm_enable_lbrv(vcpu); + } + + return 0; +} + +static int snp_launch_finish(struct kvm *kvm, struct kvm_sev_cmd *argp) +{ + struct kvm_sev_info *sev = to_kvm_sev_info(kvm); + struct kvm_sev_snp_launch_finish params; + struct sev_data_snp_launch_finish *data; + void *id_block = NULL, *id_auth = NULL; + int ret; + + if (!sev_snp_guest(kvm)) + return -ENOTTY; + + if (!sev->snp_context) + return -EINVAL; + + if (copy_from_user(¶ms, u64_to_user_ptr(argp->data), sizeof(params))) + return -EFAULT; + + if (params.flags) + return -EINVAL; + + /* Measure all vCPUs using LAUNCH_UPDATE before finalizing the launch flow. */ + ret = snp_launch_update_vmsa(kvm, argp); + if (ret) + return ret; + + data = kzalloc(sizeof(*data), GFP_KERNEL_ACCOUNT); + if (!data) + return -ENOMEM; + + if (params.id_block_en) { + id_block = psp_copy_user_blob(params.id_block_uaddr, KVM_SEV_SNP_ID_BLOCK_SIZE); + if (IS_ERR(id_block)) { + ret = PTR_ERR(id_block); + goto e_free; + } + + data->id_block_en = 1; + data->id_block_paddr = __sme_pa(id_block); + + id_auth = psp_copy_user_blob(params.id_auth_uaddr, KVM_SEV_SNP_ID_AUTH_SIZE); + if (IS_ERR(id_auth)) { + ret = PTR_ERR(id_auth); + goto e_free_id_block; + } + + data->id_auth_paddr = __sme_pa(id_auth); + + if (params.auth_key_en) + data->auth_key_en = 1; + } + + data->vcek_disabled = params.vcek_disabled; + + memcpy(data->host_data, params.host_data, KVM_SEV_SNP_FINISH_DATA_SIZE); + data->gctx_paddr = __psp_pa(sev->snp_context); + ret = sev_issue_cmd(kvm, SEV_CMD_SNP_LAUNCH_FINISH, data, &argp->error); + + /* + * Now that there will be no more SNP_LAUNCH_UPDATE ioctls, private pages + * can be given to the guest simply by marking the RMP entry as private. + * This can happen on first access and also with KVM_PRE_FAULT_MEMORY. + */ + if (!ret) + kvm->arch.pre_fault_allowed = true; + + kfree(id_auth); + +e_free_id_block: + kfree(id_block); + +e_free: + kfree(data); + return ret; } @@ -1878,6 +2499,15 @@ int sev_mem_enc_ioctl(struct kvm *kvm, void __user *argp) goto out; } + /* + * Once KVM_SEV_INIT2 initializes a KVM instance as an SNP guest, only + * allow the use of SNP-specific commands. + */ + if (sev_snp_guest(kvm) && sev_cmd.id < KVM_SEV_SNP_LAUNCH_START) { + r = -EPERM; + goto out; + } + switch (sev_cmd.id) { case KVM_SEV_ES_INIT: if (!sev_es_enabled) { @@ -1888,6 +2518,9 @@ int sev_mem_enc_ioctl(struct kvm *kvm, void __user *argp) case KVM_SEV_INIT: r = sev_guest_init(kvm, &sev_cmd); break; + case KVM_SEV_INIT2: + r = sev_guest_init2(kvm, &sev_cmd); + break; case KVM_SEV_LAUNCH_START: r = sev_launch_start(kvm, &sev_cmd); break; @@ -1939,6 +2572,15 @@ int sev_mem_enc_ioctl(struct kvm *kvm, void __user *argp) case KVM_SEV_RECEIVE_FINISH: r = sev_receive_finish(kvm, &sev_cmd); break; + case KVM_SEV_SNP_LAUNCH_START: + r = snp_launch_start(kvm, &sev_cmd); + break; + case KVM_SEV_SNP_LAUNCH_UPDATE: + r = snp_launch_update(kvm, &sev_cmd); + break; + case KVM_SEV_SNP_LAUNCH_FINISH: + r = snp_launch_finish(kvm, &sev_cmd); + break; default: r = -EINVAL; goto out; @@ -1955,7 +2597,7 @@ out: int sev_mem_enc_register_region(struct kvm *kvm, struct kvm_enc_region *range) { - struct kvm_sev_info *sev = &to_kvm_svm(kvm)->sev_info; + struct kvm_sev_info *sev = to_kvm_sev_info(kvm); struct enc_region *region; int ret = 0; @@ -1974,7 +2616,8 @@ int sev_mem_enc_register_region(struct kvm *kvm, return -ENOMEM; mutex_lock(&kvm->lock); - region->pages = sev_pin_memory(kvm, range->addr, range->size, ®ion->npages, 1); + region->pages = sev_pin_memory(kvm, range->addr, range->size, ®ion->npages, + FOLL_WRITE | FOLL_LONGTERM); if (IS_ERR(region->pages)) { ret = PTR_ERR(region->pages); mutex_unlock(&kvm->lock); @@ -2007,7 +2650,7 @@ e_free: static struct enc_region * find_enc_region(struct kvm *kvm, struct kvm_enc_region *range) { - struct kvm_sev_info *sev = &to_kvm_svm(kvm)->sev_info; + struct kvm_sev_info *sev = to_kvm_sev_info(kvm); struct list_head *head = &sev->regions_list; struct enc_region *i; @@ -2070,23 +2713,21 @@ failed: int sev_vm_copy_enc_context_from(struct kvm *kvm, unsigned int source_fd) { - struct fd f = fdget(source_fd); + CLASS(fd, f)(source_fd); struct kvm *source_kvm; struct kvm_sev_info *source_sev, *mirror_sev; int ret; - if (!f.file) + if (fd_empty(f)) return -EBADF; - if (!file_is_kvm(f.file)) { - ret = -EBADF; - goto e_source_fput; - } + if (!file_is_kvm(fd_file(f))) + return -EBADF; - source_kvm = f.file->private_data; + source_kvm = fd_file(f)->private_data; ret = sev_lock_two_vms(kvm, source_kvm); if (ret) - goto e_source_fput; + return ret; /* * Mirrors of mirrors should work, but let's not get silly. Also @@ -2104,9 +2745,9 @@ int sev_vm_copy_enc_context_from(struct kvm *kvm, unsigned int source_fd) * The mirror kvm holds an enc_context_owner ref so its asid can't * disappear until we're done with it */ - source_sev = &to_kvm_svm(source_kvm)->sev_info; + source_sev = to_kvm_sev_info(source_kvm); kvm_get_kvm(source_kvm); - mirror_sev = &to_kvm_svm(kvm)->sev_info; + mirror_sev = to_kvm_sev_info(kvm); list_add_tail(&mirror_sev->mirror_entry, &source_sev->mirror_vms); /* Set enc_context_owner and copy its encryption context over */ @@ -2115,6 +2756,7 @@ int sev_vm_copy_enc_context_from(struct kvm *kvm, unsigned int source_fd) mirror_sev->asid = source_sev->asid; mirror_sev->fd = source_sev->fd; mirror_sev->es_active = source_sev->es_active; + mirror_sev->need_init = false; mirror_sev->handle = source_sev->handle; INIT_LIST_HEAD(&mirror_sev->regions_list); INIT_LIST_HEAD(&mirror_sev->mirror_vms); @@ -2128,14 +2770,37 @@ int sev_vm_copy_enc_context_from(struct kvm *kvm, unsigned int source_fd) e_unlock: sev_unlock_two_vms(kvm, source_kvm); -e_source_fput: - fdput(f); return ret; } +static int snp_decommission_context(struct kvm *kvm) +{ + struct kvm_sev_info *sev = to_kvm_sev_info(kvm); + struct sev_data_snp_addr data = {}; + int ret; + + /* If context is not created then do nothing */ + if (!sev->snp_context) + return 0; + + /* Do the decommision, which will unbind the ASID from the SNP context */ + data.address = __sme_pa(sev->snp_context); + down_write(&sev_deactivate_lock); + ret = sev_do_cmd(SEV_CMD_SNP_DECOMMISSION, &data, NULL); + up_write(&sev_deactivate_lock); + + if (WARN_ONCE(ret, "Failed to release guest context, ret %d", ret)) + return ret; + + snp_free_firmware_page(sev->snp_context); + sev->snp_context = NULL; + + return 0; +} + void sev_vm_destroy(struct kvm *kvm) { - struct kvm_sev_info *sev = &to_kvm_svm(kvm)->sev_info; + struct kvm_sev_info *sev = to_kvm_sev_info(kvm); struct list_head *head = &sev->regions_list; struct list_head *pos, *q; @@ -2174,22 +2839,70 @@ void sev_vm_destroy(struct kvm *kvm) } } - sev_unbind_asid(kvm, sev->handle); + if (sev_snp_guest(kvm)) { + snp_guest_req_cleanup(kvm); + + /* + * Decomission handles unbinding of the ASID. If it fails for + * some unexpected reason, just leak the ASID. + */ + if (snp_decommission_context(kvm)) + return; + } else { + sev_unbind_asid(kvm, sev->handle); + } + sev_asid_free(sev); } void __init sev_set_cpu_caps(void) { - if (!sev_enabled) - kvm_cpu_cap_clear(X86_FEATURE_SEV); - if (!sev_es_enabled) - kvm_cpu_cap_clear(X86_FEATURE_SEV_ES); + if (sev_enabled) { + kvm_cpu_cap_set(X86_FEATURE_SEV); + kvm_caps.supported_vm_types |= BIT(KVM_X86_SEV_VM); + } + if (sev_es_enabled) { + kvm_cpu_cap_set(X86_FEATURE_SEV_ES); + kvm_caps.supported_vm_types |= BIT(KVM_X86_SEV_ES_VM); + } + if (sev_snp_enabled) { + kvm_cpu_cap_set(X86_FEATURE_SEV_SNP); + kvm_caps.supported_vm_types |= BIT(KVM_X86_SNP_VM); + } +} + +static bool is_sev_snp_initialized(void) +{ + struct sev_user_data_snp_status *status; + struct sev_data_snp_addr buf; + bool initialized = false; + int ret, error = 0; + + status = snp_alloc_firmware_page(GFP_KERNEL | __GFP_ZERO); + if (!status) + return false; + + buf.address = __psp_pa(status); + ret = sev_do_cmd(SEV_CMD_SNP_PLATFORM_STATUS, &buf, &error); + if (ret) { + pr_err("SEV: SNP_PLATFORM_STATUS failed ret=%d, fw_error=%d (%#x)\n", + ret, error, error); + goto out; + } + + initialized = !!status->state; + +out: + snp_free_firmware_page(status); + + return initialized; } void __init sev_hardware_setup(void) { -#ifdef CONFIG_KVM_AMD_SEV unsigned int eax, ebx, ecx, edx, sev_asid_count, sev_es_asid_count; + struct sev_platform_init_args init_args = {0}; + bool sev_snp_supported = false; bool sev_es_supported = false; bool sev_supported = false; @@ -2208,6 +2921,16 @@ void __init sev_hardware_setup(void) WARN_ON_ONCE(!boot_cpu_has(X86_FEATURE_FLUSHBYASID))) goto out; + /* + * The kernel's initcall infrastructure lacks the ability to express + * dependencies between initcalls, whereas the modules infrastructure + * automatically handles dependencies via symbol loading. Ensure the + * PSP SEV driver is initialized before proceeding if KVM is built-in, + * as the dependency isn't handled by the initcall infrastructure. + */ + if (IS_BUILTIN(CONFIG_KVM_AMD) && sev_module_init()) + goto out; + /* Retrieve SEV CPUID information */ cpuid(0x8000001f, &eax, &ebx, &ecx, &edx); @@ -2240,8 +2963,10 @@ void __init sev_hardware_setup(void) goto out; } - sev_asid_count = max_sev_asid - min_sev_asid + 1; - WARN_ON_ONCE(misc_cg_set_capacity(MISC_CG_RES_SEV, sev_asid_count)); + if (min_sev_asid <= max_sev_asid) { + sev_asid_count = max_sev_asid - min_sev_asid + 1; + WARN_ON_ONCE(misc_cg_set_capacity(MISC_CG_RES_SEV, sev_asid_count)); + } sev_supported = true; /* SEV-ES support requested? */ @@ -2261,6 +2986,12 @@ void __init sev_hardware_setup(void) if (!boot_cpu_has(X86_FEATURE_SEV_ES)) goto out; + if (!lbrv) { + WARN_ONCE(!boot_cpu_has(X86_FEATURE_LBRV), + "LBRV must be present for SEV-ES support"); + goto out; + } + /* Has the system been allocated ASIDs for SEV-ES? */ if (min_sev_asid == 1) goto out; @@ -2268,23 +2999,43 @@ void __init sev_hardware_setup(void) sev_es_asid_count = min_sev_asid - 1; WARN_ON_ONCE(misc_cg_set_capacity(MISC_CG_RES_SEV_ES, sev_es_asid_count)); sev_es_supported = true; + sev_snp_supported = sev_snp_enabled && cc_platform_has(CC_ATTR_HOST_SEV_SNP); out: + if (sev_enabled) { + init_args.probe = true; + if (sev_platform_init(&init_args)) + sev_supported = sev_es_supported = sev_snp_supported = false; + else if (sev_snp_supported) + sev_snp_supported = is_sev_snp_initialized(); + } + if (boot_cpu_has(X86_FEATURE_SEV)) pr_info("SEV %s (ASIDs %u - %u)\n", - sev_supported ? "enabled" : "disabled", + sev_supported ? min_sev_asid <= max_sev_asid ? "enabled" : + "unusable" : + "disabled", min_sev_asid, max_sev_asid); if (boot_cpu_has(X86_FEATURE_SEV_ES)) pr_info("SEV-ES %s (ASIDs %u - %u)\n", - sev_es_supported ? "enabled" : "disabled", + str_enabled_disabled(sev_es_supported), + min_sev_asid > 1 ? 1 : 0, min_sev_asid - 1); + if (boot_cpu_has(X86_FEATURE_SEV_SNP)) + pr_info("SEV-SNP %s (ASIDs %u - %u)\n", + str_enabled_disabled(sev_snp_supported), min_sev_asid > 1 ? 1 : 0, min_sev_asid - 1); sev_enabled = sev_supported; sev_es_enabled = sev_es_supported; + sev_snp_enabled = sev_snp_supported; + if (!sev_es_enabled || !cpu_feature_enabled(X86_FEATURE_DEBUG_SWAP) || !cpu_feature_enabled(X86_FEATURE_NO_NESTED_DATA_BP)) sev_es_debug_swap_enabled = false; -#endif + + sev_supported_vmsa_features = 0; + if (sev_es_debug_swap_enabled) + sev_supported_vmsa_features |= SVM_SEV_FEAT_DEBUG_SWAP; } void sev_hardware_unsetup(void) @@ -2300,6 +3051,8 @@ void sev_hardware_unsetup(void) misc_cg_set_capacity(MISC_CG_RES_SEV, 0); misc_cg_set_capacity(MISC_CG_RES_SEV_ES, 0); + + sev_platform_shutdown(); } int sev_cpu_init(struct svm_cpu_data *sd) @@ -2320,7 +3073,7 @@ int sev_cpu_init(struct svm_cpu_data *sd) */ static void sev_flush_encrypted_page(struct kvm_vcpu *vcpu, void *va) { - int asid = to_kvm_svm(vcpu->kvm)->sev_info.asid; + unsigned int asid = sev_get_asid(vcpu->kvm); /* * Note! The address must be a kernel address, as regular page walk @@ -2345,7 +3098,7 @@ static void sev_flush_encrypted_page(struct kvm_vcpu *vcpu, void *va) * back to WBINVD if this faults so as not to make any problems worse * by leaving stale encrypted data in the cache. */ - if (WARN_ON_ONCE(wrmsrl_safe(MSR_AMD64_VM_PAGE_FLUSH, addr | asid))) + if (WARN_ON_ONCE(wrmsrq_safe(MSR_AMD64_VM_PAGE_FLUSH, addr | asid))) goto do_wbinvd; return; @@ -2356,7 +3109,13 @@ do_wbinvd: void sev_guest_memory_reclaimed(struct kvm *kvm) { - if (!sev_guest(kvm)) + /* + * With SNP+gmem, private/encrypted memory is unreachable via the + * hva-based mmu notifiers, so these events are only actually + * pertaining to shared pages where there is no need to perform + * the WBINVD to flush associated caches. + */ + if (!sev_guest(kvm) || sev_snp_guest(kvm)) return; wbinvd_on_all_cpus(); @@ -2371,18 +3130,36 @@ void sev_free_vcpu(struct kvm_vcpu *vcpu) svm = to_svm(vcpu); + /* + * If it's an SNP guest, then the VMSA was marked in the RMP table as + * a guest-owned page. Transition the page to hypervisor state before + * releasing it back to the system. + */ + if (sev_snp_guest(vcpu->kvm)) { + u64 pfn = __pa(svm->sev_es.vmsa) >> PAGE_SHIFT; + + if (kvm_rmp_make_shared(vcpu->kvm, pfn, PG_LEVEL_4K)) + goto skip_vmsa_free; + } + if (vcpu->arch.guest_state_protected) sev_flush_encrypted_page(vcpu, svm->sev_es.vmsa); __free_page(virt_to_page(svm->sev_es.vmsa)); +skip_vmsa_free: if (svm->sev_es.ghcb_sa_free) kvfree(svm->sev_es.ghcb_sa); } +static u64 kvm_ghcb_get_sw_exit_code(struct vmcb_control_area *control) +{ + return (((u64)control->exit_code_hi) << 32) | control->exit_code; +} + static void dump_ghcb(struct vcpu_svm *svm) { - struct ghcb *ghcb = svm->sev_es.ghcb; + struct vmcb_control_area *control = &svm->vmcb->control; unsigned int nbits; /* Re-use the dump_invalid_vmcb module parameter */ @@ -2391,18 +3168,24 @@ static void dump_ghcb(struct vcpu_svm *svm) return; } - nbits = sizeof(ghcb->save.valid_bitmap) * 8; + nbits = sizeof(svm->sev_es.valid_bitmap) * 8; - pr_err("GHCB (GPA=%016llx):\n", svm->vmcb->control.ghcb_gpa); + /* + * Print KVM's snapshot of the GHCB values that were (unsuccessfully) + * used to handle the exit. If the guest has since modified the GHCB + * itself, dumping the raw GHCB won't help debug why KVM was unable to + * handle the VMGEXIT that KVM observed. + */ + pr_err("GHCB (GPA=%016llx) snapshot:\n", svm->vmcb->control.ghcb_gpa); pr_err("%-20s%016llx is_valid: %u\n", "sw_exit_code", - ghcb->save.sw_exit_code, ghcb_sw_exit_code_is_valid(ghcb)); + kvm_ghcb_get_sw_exit_code(control), kvm_ghcb_sw_exit_code_is_valid(svm)); pr_err("%-20s%016llx is_valid: %u\n", "sw_exit_info_1", - ghcb->save.sw_exit_info_1, ghcb_sw_exit_info_1_is_valid(ghcb)); + control->exit_info_1, kvm_ghcb_sw_exit_info_1_is_valid(svm)); pr_err("%-20s%016llx is_valid: %u\n", "sw_exit_info_2", - ghcb->save.sw_exit_info_2, ghcb_sw_exit_info_2_is_valid(ghcb)); + control->exit_info_2, kvm_ghcb_sw_exit_info_2_is_valid(svm)); pr_err("%-20s%016llx is_valid: %u\n", "sw_scratch", - ghcb->save.sw_scratch, ghcb_sw_scratch_is_valid(ghcb)); - pr_err("%-20s%*pb\n", "valid_bitmap", nbits, ghcb->save.valid_bitmap); + svm->sev_es.sw_scratch, kvm_ghcb_sw_scratch_is_valid(svm)); + pr_err("%-20s%*pb\n", "valid_bitmap", nbits, svm->sev_es.valid_bitmap); } static void sev_es_sync_to_ghcb(struct vcpu_svm *svm) @@ -2458,7 +3241,7 @@ static void sev_es_sync_from_ghcb(struct vcpu_svm *svm) if (kvm_ghcb_xcr0_is_valid(svm)) { vcpu->arch.xcr0 = ghcb_get_xcr0(ghcb); - kvm_update_cpuid_runtime(vcpu); + vcpu->arch.cpuid_dynamic_bits_dirty = true; } /* Copy the GHCB exit information into the VMCB fields */ @@ -2473,11 +3256,6 @@ static void sev_es_sync_from_ghcb(struct vcpu_svm *svm) memset(ghcb->save.valid_bitmap, 0, sizeof(ghcb->save.valid_bitmap)); } -static u64 kvm_ghcb_get_sw_exit_code(struct vmcb_control_area *control) -{ - return (((u64)control->exit_code_hi) << 32) | control->exit_code; -} - static int sev_es_validate_vmgexit(struct vcpu_svm *svm) { struct vmcb_control_area *control = &svm->vmcb->control; @@ -2571,10 +3349,31 @@ static int sev_es_validate_vmgexit(struct vcpu_svm *svm) if (!kvm_ghcb_sw_scratch_is_valid(svm)) goto vmgexit_err; break; + case SVM_VMGEXIT_AP_CREATION: + if (!sev_snp_guest(vcpu->kvm)) + goto vmgexit_err; + if (lower_32_bits(control->exit_info_1) != SVM_VMGEXIT_AP_DESTROY) + if (!kvm_ghcb_rax_is_valid(svm)) + goto vmgexit_err; + break; case SVM_VMGEXIT_NMI_COMPLETE: case SVM_VMGEXIT_AP_HLT_LOOP: case SVM_VMGEXIT_AP_JUMP_TABLE: case SVM_VMGEXIT_UNSUPPORTED_EVENT: + case SVM_VMGEXIT_HV_FEATURES: + case SVM_VMGEXIT_TERM_REQUEST: + break; + case SVM_VMGEXIT_PSC: + if (!sev_snp_guest(vcpu->kvm) || !kvm_ghcb_sw_scratch_is_valid(svm)) + goto vmgexit_err; + break; + case SVM_VMGEXIT_GUEST_REQUEST: + case SVM_VMGEXIT_EXT_GUEST_REQUEST: + if (!sev_snp_guest(vcpu->kvm) || + !PAGE_ALIGNED(control->exit_info_1) || + !PAGE_ALIGNED(control->exit_info_2) || + control->exit_info_1 == control->exit_info_2) + goto vmgexit_err; break; default: reason = GHCB_ERR_INVALID_EVENT; @@ -2596,8 +3395,7 @@ vmgexit_err: dump_ghcb(svm); } - ghcb_set_sw_exit_info_1(svm->sev_es.ghcb, 2); - ghcb_set_sw_exit_info_2(svm->sev_es.ghcb, reason); + svm_vmgexit_bad_input(svm, reason); /* Resume the guest to "return" the error code. */ return 1; @@ -2605,6 +3403,9 @@ vmgexit_err: void sev_es_unmap_ghcb(struct vcpu_svm *svm) { + /* Clear any indication that the vCPU is in a type of AP Reset Hold */ + svm->sev_es.ap_reset_hold_type = AP_RESET_HOLD_NONE; + if (!svm->sev_es.ghcb) return; @@ -2631,14 +3432,23 @@ void sev_es_unmap_ghcb(struct vcpu_svm *svm) sev_es_sync_to_ghcb(svm); - kvm_vcpu_unmap(&svm->vcpu, &svm->sev_es.ghcb_map, true); + kvm_vcpu_unmap(&svm->vcpu, &svm->sev_es.ghcb_map); svm->sev_es.ghcb = NULL; } -void pre_sev_run(struct vcpu_svm *svm, int cpu) +int pre_sev_run(struct vcpu_svm *svm, int cpu) { struct svm_cpu_data *sd = per_cpu_ptr(&svm_data, cpu); - int asid = sev_get_asid(svm->vcpu.kvm); + struct kvm *kvm = svm->vcpu.kvm; + unsigned int asid = sev_get_asid(kvm); + + /* + * Reject KVM_RUN if userspace attempts to run the vCPU with an invalid + * VMSA, e.g. if userspace forces the vCPU to be RUNNABLE after an SNP + * AP Destroy event. + */ + if (sev_es_guest(kvm) && !VALID_PAGE(svm->vmcb->control.vmsa_pa)) + return -EINVAL; /* Assign the asid allocated with this SEV guest */ svm->asid = asid; @@ -2651,11 +3461,12 @@ void pre_sev_run(struct vcpu_svm *svm, int cpu) */ if (sd->sev_vmcbs[asid] == svm->vmcb && svm->vcpu.arch.last_vmentry_cpu == cpu) - return; + return 0; sd->sev_vmcbs[asid] = svm->vmcb; svm->vmcb->control.tlb_ctl = TLB_CONTROL_FLUSH_ASID; vmcb_mark_dirty(svm->vmcb, VMCB_ASID); + return 0; } #define GHCB_SCRATCH_AREA_LIMIT (16ULL * PAGE_SIZE) @@ -2737,8 +3548,7 @@ static int setup_vmgexit_scratch(struct vcpu_svm *svm, bool sync, u64 len) return 0; e_scratch: - ghcb_set_sw_exit_info_1(svm->sev_es.ghcb, 2); - ghcb_set_sw_exit_info_2(svm->sev_es.ghcb, GHCB_ERR_INVALID_SCRATCH_AREA); + svm_vmgexit_bad_input(svm, GHCB_ERR_INVALID_SCRATCH_AREA); return 1; } @@ -2760,10 +3570,517 @@ static void set_ghcb_msr(struct vcpu_svm *svm, u64 value) svm->vmcb->control.ghcb_gpa = value; } +static int snp_rmptable_psmash(kvm_pfn_t pfn) +{ + int ret; + + pfn = pfn & ~(KVM_PAGES_PER_HPAGE(PG_LEVEL_2M) - 1); + + /* + * PSMASH_FAIL_INUSE indicates another processor is modifying the + * entry, so retry until that's no longer the case. + */ + do { + ret = psmash(pfn); + } while (ret == PSMASH_FAIL_INUSE); + + return ret; +} + +static int snp_complete_psc_msr(struct kvm_vcpu *vcpu) +{ + struct vcpu_svm *svm = to_svm(vcpu); + + if (vcpu->run->hypercall.ret) + set_ghcb_msr(svm, GHCB_MSR_PSC_RESP_ERROR); + else + set_ghcb_msr(svm, GHCB_MSR_PSC_RESP); + + return 1; /* resume guest */ +} + +static int snp_begin_psc_msr(struct vcpu_svm *svm, u64 ghcb_msr) +{ + u64 gpa = gfn_to_gpa(GHCB_MSR_PSC_REQ_TO_GFN(ghcb_msr)); + u8 op = GHCB_MSR_PSC_REQ_TO_OP(ghcb_msr); + struct kvm_vcpu *vcpu = &svm->vcpu; + + if (op != SNP_PAGE_STATE_PRIVATE && op != SNP_PAGE_STATE_SHARED) { + set_ghcb_msr(svm, GHCB_MSR_PSC_RESP_ERROR); + return 1; /* resume guest */ + } + + if (!user_exit_on_hypercall(vcpu->kvm, KVM_HC_MAP_GPA_RANGE)) { + set_ghcb_msr(svm, GHCB_MSR_PSC_RESP_ERROR); + return 1; /* resume guest */ + } + + vcpu->run->exit_reason = KVM_EXIT_HYPERCALL; + vcpu->run->hypercall.nr = KVM_HC_MAP_GPA_RANGE; + /* + * In principle this should have been -KVM_ENOSYS, but userspace (QEMU <=9.2) + * assumed that vcpu->run->hypercall.ret is never changed by KVM and thus that + * it was always zero on KVM_EXIT_HYPERCALL. Since KVM is now overwriting + * vcpu->run->hypercall.ret, ensuring that it is zero to not break QEMU. + */ + vcpu->run->hypercall.ret = 0; + vcpu->run->hypercall.args[0] = gpa; + vcpu->run->hypercall.args[1] = 1; + vcpu->run->hypercall.args[2] = (op == SNP_PAGE_STATE_PRIVATE) + ? KVM_MAP_GPA_RANGE_ENCRYPTED + : KVM_MAP_GPA_RANGE_DECRYPTED; + vcpu->run->hypercall.args[2] |= KVM_MAP_GPA_RANGE_PAGE_SZ_4K; + + vcpu->arch.complete_userspace_io = snp_complete_psc_msr; + + return 0; /* forward request to userspace */ +} + +struct psc_buffer { + struct psc_hdr hdr; + struct psc_entry entries[]; +} __packed; + +static int snp_begin_psc(struct vcpu_svm *svm, struct psc_buffer *psc); + +static void snp_complete_psc(struct vcpu_svm *svm, u64 psc_ret) +{ + svm->sev_es.psc_inflight = 0; + svm->sev_es.psc_idx = 0; + svm->sev_es.psc_2m = false; + + /* + * PSC requests always get a "no action" response in SW_EXITINFO1, with + * a PSC-specific return code in SW_EXITINFO2 that provides the "real" + * return code. E.g. if the PSC request was interrupted, the need to + * retry is communicated via SW_EXITINFO2, not SW_EXITINFO1. + */ + svm_vmgexit_no_action(svm, psc_ret); +} + +static void __snp_complete_one_psc(struct vcpu_svm *svm) +{ + struct psc_buffer *psc = svm->sev_es.ghcb_sa; + struct psc_entry *entries = psc->entries; + struct psc_hdr *hdr = &psc->hdr; + __u16 idx; + + /* + * Everything in-flight has been processed successfully. Update the + * corresponding entries in the guest's PSC buffer and zero out the + * count of in-flight PSC entries. + */ + for (idx = svm->sev_es.psc_idx; svm->sev_es.psc_inflight; + svm->sev_es.psc_inflight--, idx++) { + struct psc_entry *entry = &entries[idx]; + + entry->cur_page = entry->pagesize ? 512 : 1; + } + + hdr->cur_entry = idx; +} + +static int snp_complete_one_psc(struct kvm_vcpu *vcpu) +{ + struct vcpu_svm *svm = to_svm(vcpu); + struct psc_buffer *psc = svm->sev_es.ghcb_sa; + + if (vcpu->run->hypercall.ret) { + snp_complete_psc(svm, VMGEXIT_PSC_ERROR_GENERIC); + return 1; /* resume guest */ + } + + __snp_complete_one_psc(svm); + + /* Handle the next range (if any). */ + return snp_begin_psc(svm, psc); +} + +static int snp_begin_psc(struct vcpu_svm *svm, struct psc_buffer *psc) +{ + struct psc_entry *entries = psc->entries; + struct kvm_vcpu *vcpu = &svm->vcpu; + struct psc_hdr *hdr = &psc->hdr; + struct psc_entry entry_start; + u16 idx, idx_start, idx_end; + int npages; + bool huge; + u64 gfn; + + if (!user_exit_on_hypercall(vcpu->kvm, KVM_HC_MAP_GPA_RANGE)) { + snp_complete_psc(svm, VMGEXIT_PSC_ERROR_GENERIC); + return 1; + } + +next_range: + /* There should be no other PSCs in-flight at this point. */ + if (WARN_ON_ONCE(svm->sev_es.psc_inflight)) { + snp_complete_psc(svm, VMGEXIT_PSC_ERROR_GENERIC); + return 1; + } + + /* + * The PSC descriptor buffer can be modified by a misbehaved guest after + * validation, so take care to only use validated copies of values used + * for things like array indexing. + */ + idx_start = hdr->cur_entry; + idx_end = hdr->end_entry; + + if (idx_end >= VMGEXIT_PSC_MAX_COUNT) { + snp_complete_psc(svm, VMGEXIT_PSC_ERROR_INVALID_HDR); + return 1; + } + + /* Find the start of the next range which needs processing. */ + for (idx = idx_start; idx <= idx_end; idx++, hdr->cur_entry++) { + entry_start = entries[idx]; + + gfn = entry_start.gfn; + huge = entry_start.pagesize; + npages = huge ? 512 : 1; + + if (entry_start.cur_page > npages || !IS_ALIGNED(gfn, npages)) { + snp_complete_psc(svm, VMGEXIT_PSC_ERROR_INVALID_ENTRY); + return 1; + } + + if (entry_start.cur_page) { + /* + * If this is a partially-completed 2M range, force 4K handling + * for the remaining pages since they're effectively split at + * this point. Subsequent code should ensure this doesn't get + * combined with adjacent PSC entries where 2M handling is still + * possible. + */ + npages -= entry_start.cur_page; + gfn += entry_start.cur_page; + huge = false; + } + + if (npages) + break; + } + + if (idx > idx_end) { + /* Nothing more to process. */ + snp_complete_psc(svm, 0); + return 1; + } + + svm->sev_es.psc_2m = huge; + svm->sev_es.psc_idx = idx; + svm->sev_es.psc_inflight = 1; + + /* + * Find all subsequent PSC entries that contain adjacent GPA + * ranges/operations and can be combined into a single + * KVM_HC_MAP_GPA_RANGE exit. + */ + while (++idx <= idx_end) { + struct psc_entry entry = entries[idx]; + + if (entry.operation != entry_start.operation || + entry.gfn != entry_start.gfn + npages || + entry.cur_page || !!entry.pagesize != huge) + break; + + svm->sev_es.psc_inflight++; + npages += huge ? 512 : 1; + } + + switch (entry_start.operation) { + case VMGEXIT_PSC_OP_PRIVATE: + case VMGEXIT_PSC_OP_SHARED: + vcpu->run->exit_reason = KVM_EXIT_HYPERCALL; + vcpu->run->hypercall.nr = KVM_HC_MAP_GPA_RANGE; + /* + * In principle this should have been -KVM_ENOSYS, but userspace (QEMU <=9.2) + * assumed that vcpu->run->hypercall.ret is never changed by KVM and thus that + * it was always zero on KVM_EXIT_HYPERCALL. Since KVM is now overwriting + * vcpu->run->hypercall.ret, ensuring that it is zero to not break QEMU. + */ + vcpu->run->hypercall.ret = 0; + vcpu->run->hypercall.args[0] = gfn_to_gpa(gfn); + vcpu->run->hypercall.args[1] = npages; + vcpu->run->hypercall.args[2] = entry_start.operation == VMGEXIT_PSC_OP_PRIVATE + ? KVM_MAP_GPA_RANGE_ENCRYPTED + : KVM_MAP_GPA_RANGE_DECRYPTED; + vcpu->run->hypercall.args[2] |= entry_start.pagesize + ? KVM_MAP_GPA_RANGE_PAGE_SZ_2M + : KVM_MAP_GPA_RANGE_PAGE_SZ_4K; + vcpu->arch.complete_userspace_io = snp_complete_one_psc; + return 0; /* forward request to userspace */ + default: + /* + * Only shared/private PSC operations are currently supported, so if the + * entire range consists of unsupported operations (e.g. SMASH/UNSMASH), + * then consider the entire range completed and avoid exiting to + * userspace. In theory snp_complete_psc() can always be called directly + * at this point to complete the current range and start the next one, + * but that could lead to unexpected levels of recursion. + */ + __snp_complete_one_psc(svm); + goto next_range; + } + + BUG(); +} + +/* + * Invoked as part of svm_vcpu_reset() processing of an init event. + */ +void sev_snp_init_protected_guest_state(struct kvm_vcpu *vcpu) +{ + struct vcpu_svm *svm = to_svm(vcpu); + struct kvm_memory_slot *slot; + struct page *page; + kvm_pfn_t pfn; + gfn_t gfn; + + if (!sev_snp_guest(vcpu->kvm)) + return; + + guard(mutex)(&svm->sev_es.snp_vmsa_mutex); + + if (!svm->sev_es.snp_ap_waiting_for_reset) + return; + + svm->sev_es.snp_ap_waiting_for_reset = false; + + /* Mark the vCPU as offline and not runnable */ + vcpu->arch.pv.pv_unhalted = false; + kvm_set_mp_state(vcpu, KVM_MP_STATE_HALTED); + + /* Clear use of the VMSA */ + svm->vmcb->control.vmsa_pa = INVALID_PAGE; + + /* + * When replacing the VMSA during SEV-SNP AP creation, + * mark the VMCB dirty so that full state is always reloaded. + */ + vmcb_mark_all_dirty(svm->vmcb); + + if (!VALID_PAGE(svm->sev_es.snp_vmsa_gpa)) + return; + + gfn = gpa_to_gfn(svm->sev_es.snp_vmsa_gpa); + svm->sev_es.snp_vmsa_gpa = INVALID_PAGE; + + slot = gfn_to_memslot(vcpu->kvm, gfn); + if (!slot) + return; + + /* + * The new VMSA will be private memory guest memory, so retrieve the + * PFN from the gmem backend. + */ + if (kvm_gmem_get_pfn(vcpu->kvm, slot, gfn, &pfn, &page, NULL)) + return; + + /* + * From this point forward, the VMSA will always be a guest-mapped page + * rather than the initial one allocated by KVM in svm->sev_es.vmsa. In + * theory, svm->sev_es.vmsa could be free'd and cleaned up here, but + * that involves cleanups like wbinvd_on_all_cpus() which would ideally + * be handled during teardown rather than guest boot. Deferring that + * also allows the existing logic for SEV-ES VMSAs to be re-used with + * minimal SNP-specific changes. + */ + svm->sev_es.snp_has_guest_vmsa = true; + + /* Use the new VMSA */ + svm->vmcb->control.vmsa_pa = pfn_to_hpa(pfn); + + /* Mark the vCPU as runnable */ + kvm_set_mp_state(vcpu, KVM_MP_STATE_RUNNABLE); + + /* + * gmem pages aren't currently migratable, but if this ever changes + * then care should be taken to ensure svm->sev_es.vmsa is pinned + * through some other means. + */ + kvm_release_page_clean(page); +} + +static int sev_snp_ap_creation(struct vcpu_svm *svm) +{ + struct kvm_sev_info *sev = to_kvm_sev_info(svm->vcpu.kvm); + struct kvm_vcpu *vcpu = &svm->vcpu; + struct kvm_vcpu *target_vcpu; + struct vcpu_svm *target_svm; + unsigned int request; + unsigned int apic_id; + + request = lower_32_bits(svm->vmcb->control.exit_info_1); + apic_id = upper_32_bits(svm->vmcb->control.exit_info_1); + + /* Validate the APIC ID */ + target_vcpu = kvm_get_vcpu_by_id(vcpu->kvm, apic_id); + if (!target_vcpu) { + vcpu_unimpl(vcpu, "vmgexit: invalid AP APIC ID [%#x] from guest\n", + apic_id); + return -EINVAL; + } + + target_svm = to_svm(target_vcpu); + + guard(mutex)(&target_svm->sev_es.snp_vmsa_mutex); + + switch (request) { + case SVM_VMGEXIT_AP_CREATE_ON_INIT: + case SVM_VMGEXIT_AP_CREATE: + if (vcpu->arch.regs[VCPU_REGS_RAX] != sev->vmsa_features) { + vcpu_unimpl(vcpu, "vmgexit: mismatched AP sev_features [%#lx] != [%#llx] from guest\n", + vcpu->arch.regs[VCPU_REGS_RAX], sev->vmsa_features); + return -EINVAL; + } + + if (!page_address_valid(vcpu, svm->vmcb->control.exit_info_2)) { + vcpu_unimpl(vcpu, "vmgexit: invalid AP VMSA address [%#llx] from guest\n", + svm->vmcb->control.exit_info_2); + return -EINVAL; + } + + /* + * Malicious guest can RMPADJUST a large page into VMSA which + * will hit the SNP erratum where the CPU will incorrectly signal + * an RMP violation #PF if a hugepage collides with the RMP entry + * of VMSA page, reject the AP CREATE request if VMSA address from + * guest is 2M aligned. + */ + if (IS_ALIGNED(svm->vmcb->control.exit_info_2, PMD_SIZE)) { + vcpu_unimpl(vcpu, + "vmgexit: AP VMSA address [%llx] from guest is unsafe as it is 2M aligned\n", + svm->vmcb->control.exit_info_2); + return -EINVAL; + } + + target_svm->sev_es.snp_vmsa_gpa = svm->vmcb->control.exit_info_2; + break; + case SVM_VMGEXIT_AP_DESTROY: + target_svm->sev_es.snp_vmsa_gpa = INVALID_PAGE; + break; + default: + vcpu_unimpl(vcpu, "vmgexit: invalid AP creation request [%#x] from guest\n", + request); + return -EINVAL; + } + + target_svm->sev_es.snp_ap_waiting_for_reset = true; + + /* + * Unless Creation is deferred until INIT, signal the vCPU to update + * its state. + */ + if (request != SVM_VMGEXIT_AP_CREATE_ON_INIT) + kvm_make_request_and_kick(KVM_REQ_UPDATE_PROTECTED_GUEST_STATE, target_vcpu); + + return 0; +} + +static int snp_handle_guest_req(struct vcpu_svm *svm, gpa_t req_gpa, gpa_t resp_gpa) +{ + struct sev_data_snp_guest_request data = {0}; + struct kvm *kvm = svm->vcpu.kvm; + struct kvm_sev_info *sev = to_kvm_sev_info(kvm); + sev_ret_code fw_err = 0; + int ret; + + if (!sev_snp_guest(kvm)) + return -EINVAL; + + mutex_lock(&sev->guest_req_mutex); + + if (kvm_read_guest(kvm, req_gpa, sev->guest_req_buf, PAGE_SIZE)) { + ret = -EIO; + goto out_unlock; + } + + data.gctx_paddr = __psp_pa(sev->snp_context); + data.req_paddr = __psp_pa(sev->guest_req_buf); + data.res_paddr = __psp_pa(sev->guest_resp_buf); + + /* + * Firmware failures are propagated on to guest, but any other failure + * condition along the way should be reported to userspace. E.g. if + * the PSP is dead and commands are timing out. + */ + ret = sev_issue_cmd(kvm, SEV_CMD_SNP_GUEST_REQUEST, &data, &fw_err); + if (ret && !fw_err) + goto out_unlock; + + if (kvm_write_guest(kvm, resp_gpa, sev->guest_resp_buf, PAGE_SIZE)) { + ret = -EIO; + goto out_unlock; + } + + /* No action is requested *from KVM* if there was a firmware error. */ + svm_vmgexit_no_action(svm, SNP_GUEST_ERR(0, fw_err)); + + ret = 1; /* resume guest */ + +out_unlock: + mutex_unlock(&sev->guest_req_mutex); + return ret; +} + +static int snp_handle_ext_guest_req(struct vcpu_svm *svm, gpa_t req_gpa, gpa_t resp_gpa) +{ + struct kvm *kvm = svm->vcpu.kvm; + u8 msg_type; + + if (!sev_snp_guest(kvm)) + return -EINVAL; + + if (kvm_read_guest(kvm, req_gpa + offsetof(struct snp_guest_msg_hdr, msg_type), + &msg_type, 1)) + return -EIO; + + /* + * As per GHCB spec, requests of type MSG_REPORT_REQ also allow for + * additional certificate data to be provided alongside the attestation + * report via the guest-provided data pages indicated by RAX/RBX. The + * certificate data is optional and requires additional KVM enablement + * to provide an interface for userspace to provide it, but KVM still + * needs to be able to handle extended guest requests either way. So + * provide a stub implementation that will always return an empty + * certificate table in the guest-provided data pages. + */ + if (msg_type == SNP_MSG_REPORT_REQ) { + struct kvm_vcpu *vcpu = &svm->vcpu; + u64 data_npages; + gpa_t data_gpa; + + if (!kvm_ghcb_rax_is_valid(svm) || !kvm_ghcb_rbx_is_valid(svm)) + goto request_invalid; + + data_gpa = vcpu->arch.regs[VCPU_REGS_RAX]; + data_npages = vcpu->arch.regs[VCPU_REGS_RBX]; + + if (!PAGE_ALIGNED(data_gpa)) + goto request_invalid; + + /* + * As per GHCB spec (see "SNP Extended Guest Request"), the + * certificate table is terminated by 24-bytes of zeroes. + */ + if (data_npages && kvm_clear_guest(kvm, data_gpa, 24)) + return -EIO; + } + + return snp_handle_guest_req(svm, req_gpa, resp_gpa); + +request_invalid: + svm_vmgexit_bad_input(svm, GHCB_ERR_INVALID_INPUT); + return 1; /* resume guest */ +} + static int sev_handle_vmgexit_msr_protocol(struct vcpu_svm *svm) { struct vmcb_control_area *control = &svm->vmcb->control; struct kvm_vcpu *vcpu = &svm->vcpu; + struct kvm_sev_info *sev = to_kvm_sev_info(vcpu->kvm); u64 ghcb_info; int ret = 1; @@ -2774,7 +4091,7 @@ static int sev_handle_vmgexit_msr_protocol(struct vcpu_svm *svm) switch (ghcb_info) { case GHCB_MSR_SEV_INFO_REQ: - set_ghcb_msr(svm, GHCB_MSR_SEV_INFO(GHCB_VERSION_MAX, + set_ghcb_msr(svm, GHCB_MSR_SEV_INFO((__u64)sev->ghcb_version, GHCB_VERSION_MIN, sev_enc_bit)); break; @@ -2816,6 +4133,60 @@ static int sev_handle_vmgexit_msr_protocol(struct vcpu_svm *svm) GHCB_MSR_INFO_POS); break; } + case GHCB_MSR_AP_RESET_HOLD_REQ: + svm->sev_es.ap_reset_hold_type = AP_RESET_HOLD_MSR_PROTO; + ret = kvm_emulate_ap_reset_hold(&svm->vcpu); + + /* + * Preset the result to a non-SIPI return and then only set + * the result to non-zero when delivering a SIPI. + */ + set_ghcb_msr_bits(svm, 0, + GHCB_MSR_AP_RESET_HOLD_RESULT_MASK, + GHCB_MSR_AP_RESET_HOLD_RESULT_POS); + + set_ghcb_msr_bits(svm, GHCB_MSR_AP_RESET_HOLD_RESP, + GHCB_MSR_INFO_MASK, + GHCB_MSR_INFO_POS); + break; + case GHCB_MSR_HV_FT_REQ: + set_ghcb_msr_bits(svm, GHCB_HV_FT_SUPPORTED, + GHCB_MSR_HV_FT_MASK, GHCB_MSR_HV_FT_POS); + set_ghcb_msr_bits(svm, GHCB_MSR_HV_FT_RESP, + GHCB_MSR_INFO_MASK, GHCB_MSR_INFO_POS); + break; + case GHCB_MSR_PREF_GPA_REQ: + if (!sev_snp_guest(vcpu->kvm)) + goto out_terminate; + + set_ghcb_msr_bits(svm, GHCB_MSR_PREF_GPA_NONE, GHCB_MSR_GPA_VALUE_MASK, + GHCB_MSR_GPA_VALUE_POS); + set_ghcb_msr_bits(svm, GHCB_MSR_PREF_GPA_RESP, GHCB_MSR_INFO_MASK, + GHCB_MSR_INFO_POS); + break; + case GHCB_MSR_REG_GPA_REQ: { + u64 gfn; + + if (!sev_snp_guest(vcpu->kvm)) + goto out_terminate; + + gfn = get_ghcb_msr_bits(svm, GHCB_MSR_GPA_VALUE_MASK, + GHCB_MSR_GPA_VALUE_POS); + + svm->sev_es.ghcb_registered_gpa = gfn_to_gpa(gfn); + + set_ghcb_msr_bits(svm, gfn, GHCB_MSR_GPA_VALUE_MASK, + GHCB_MSR_GPA_VALUE_POS); + set_ghcb_msr_bits(svm, GHCB_MSR_REG_GPA_RESP, GHCB_MSR_INFO_MASK, + GHCB_MSR_INFO_POS); + break; + } + case GHCB_MSR_PSC_REQ: + if (!sev_snp_guest(vcpu->kvm)) + goto out_terminate; + + ret = snp_begin_psc_msr(svm, control->ghcb_gpa); + break; case GHCB_MSR_TERM_REQ: { u64 reason_set, reason_code; @@ -2828,12 +4199,7 @@ static int sev_handle_vmgexit_msr_protocol(struct vcpu_svm *svm) pr_info("SEV-ES guest requested termination: %#llx:%#llx\n", reason_set, reason_code); - vcpu->run->exit_reason = KVM_EXIT_SYSTEM_EVENT; - vcpu->run->system_event.type = KVM_SYSTEM_EVENT_SEV_TERM; - vcpu->run->system_event.ndata = 1; - vcpu->run->system_event.data[0] = control->ghcb_gpa; - - return 0; + goto out_terminate; } default: /* Error, keep GHCB MSR value as-is */ @@ -2844,6 +4210,14 @@ static int sev_handle_vmgexit_msr_protocol(struct vcpu_svm *svm) control->ghcb_gpa, ret); return ret; + +out_terminate: + vcpu->run->exit_reason = KVM_EXIT_SYSTEM_EVENT; + vcpu->run->system_event.type = KVM_SYSTEM_EVENT_SEV_TERM; + vcpu->run->system_event.ndata = 1; + vcpu->run->system_event.data[0] = control->ghcb_gpa; + + return 0; } int sev_handle_vmgexit(struct kvm_vcpu *vcpu) @@ -2879,12 +4253,18 @@ int sev_handle_vmgexit(struct kvm_vcpu *vcpu) trace_kvm_vmgexit_enter(vcpu->vcpu_id, svm->sev_es.ghcb); sev_es_sync_from_ghcb(svm); + + /* SEV-SNP guest requires that the GHCB GPA must be registered */ + if (sev_snp_guest(svm->vcpu.kvm) && !ghcb_gpa_is_registered(svm, ghcb_gpa)) { + vcpu_unimpl(&svm->vcpu, "vmgexit: GHCB GPA [%#llx] is not registered.\n", ghcb_gpa); + return -EINVAL; + } + ret = sev_es_validate_vmgexit(svm); if (ret) return ret; - ghcb_set_sw_exit_info_1(svm->sev_es.ghcb, 0); - ghcb_set_sw_exit_info_2(svm->sev_es.ghcb, 0); + svm_vmgexit_success(svm, 0); exit_code = kvm_ghcb_get_sw_exit_code(control); switch (exit_code) { @@ -2915,10 +4295,11 @@ int sev_handle_vmgexit(struct kvm_vcpu *vcpu) ret = 1; break; case SVM_VMGEXIT_AP_HLT_LOOP: + svm->sev_es.ap_reset_hold_type = AP_RESET_HOLD_NAE_EVENT; ret = kvm_emulate_ap_reset_hold(vcpu); break; case SVM_VMGEXIT_AP_JUMP_TABLE: { - struct kvm_sev_info *sev = &to_kvm_svm(vcpu->kvm)->sev_info; + struct kvm_sev_info *sev = to_kvm_sev_info(vcpu->kvm); switch (control->exit_info_1) { case 0: @@ -2927,18 +4308,50 @@ int sev_handle_vmgexit(struct kvm_vcpu *vcpu) break; case 1: /* Get AP jump table address */ - ghcb_set_sw_exit_info_2(svm->sev_es.ghcb, sev->ap_jump_table); + svm_vmgexit_success(svm, sev->ap_jump_table); break; default: pr_err("svm: vmgexit: unsupported AP jump table request - exit_info_1=%#llx\n", control->exit_info_1); - ghcb_set_sw_exit_info_1(svm->sev_es.ghcb, 2); - ghcb_set_sw_exit_info_2(svm->sev_es.ghcb, GHCB_ERR_INVALID_INPUT); + svm_vmgexit_bad_input(svm, GHCB_ERR_INVALID_INPUT); } ret = 1; break; } + case SVM_VMGEXIT_HV_FEATURES: + svm_vmgexit_success(svm, GHCB_HV_FT_SUPPORTED); + ret = 1; + break; + case SVM_VMGEXIT_TERM_REQUEST: + pr_info("SEV-ES guest requested termination: reason %#llx info %#llx\n", + control->exit_info_1, control->exit_info_2); + vcpu->run->exit_reason = KVM_EXIT_SYSTEM_EVENT; + vcpu->run->system_event.type = KVM_SYSTEM_EVENT_SEV_TERM; + vcpu->run->system_event.ndata = 1; + vcpu->run->system_event.data[0] = control->ghcb_gpa; + break; + case SVM_VMGEXIT_PSC: + ret = setup_vmgexit_scratch(svm, true, control->exit_info_2); + if (ret) + break; + + ret = snp_begin_psc(svm, svm->sev_es.ghcb_sa); + break; + case SVM_VMGEXIT_AP_CREATION: + ret = sev_snp_ap_creation(svm); + if (ret) { + svm_vmgexit_bad_input(svm, GHCB_ERR_INVALID_INPUT); + } + + ret = 1; + break; + case SVM_VMGEXIT_GUEST_REQUEST: + ret = snp_handle_guest_req(svm, control->exit_info_1, control->exit_info_2); + break; + case SVM_VMGEXIT_EXT_GUEST_REQUEST: + ret = snp_handle_ext_guest_req(svm, control->exit_info_1, control->exit_info_2); + break; case SVM_VMGEXIT_UNSUPPORTED_EVENT: vcpu_unimpl(vcpu, "vmgexit: unsupported event - exit_info_1=%#llx, exit_info_2=%#llx\n", @@ -2978,8 +4391,8 @@ static void sev_es_vcpu_after_set_cpuid(struct vcpu_svm *svm) struct kvm_vcpu *vcpu = &svm->vcpu; if (boot_cpu_has(X86_FEATURE_V_TSC_AUX)) { - bool v_tsc_aux = guest_cpuid_has(vcpu, X86_FEATURE_RDTSCP) || - guest_cpuid_has(vcpu, X86_FEATURE_RDPID); + bool v_tsc_aux = guest_cpu_cap_has(vcpu, X86_FEATURE_RDTSCP) || + guest_cpu_cap_has(vcpu, X86_FEATURE_RDPID); set_msr_interception(vcpu, svm->msrpm, MSR_TSC_AUX, v_tsc_aux, v_tsc_aux); } @@ -2988,16 +4401,15 @@ static void sev_es_vcpu_after_set_cpuid(struct vcpu_svm *svm) * For SEV-ES, accesses to MSR_IA32_XSS should not be intercepted if * the host/guest supports its use. * - * guest_can_use() checks a number of requirements on the host/guest to - * ensure that MSR_IA32_XSS is available, but it might report true even - * if X86_FEATURE_XSAVES isn't configured in the guest to ensure host - * MSR_IA32_XSS is always properly restored. For SEV-ES, it is better - * to further check that the guest CPUID actually supports - * X86_FEATURE_XSAVES so that accesses to MSR_IA32_XSS by misbehaved - * guests will still get intercepted and caught in the normal - * kvm_emulate_rdmsr()/kvm_emulated_wrmsr() paths. - */ - if (guest_can_use(vcpu, X86_FEATURE_XSAVES) && + * KVM treats the guest as being capable of using XSAVES even if XSAVES + * isn't enabled in guest CPUID as there is no intercept for XSAVES, + * i.e. the guest can use XSAVES/XRSTOR to read/write XSS if XSAVE is + * exposed to the guest and XSAVES is supported in hardware. Condition + * full XSS passthrough on the guest being able to use XSAVES *and* + * XSAVES being exposed to the guest so that KVM can at least honor + * guest CPUID for RDMSR and WRMSR. + */ + if (guest_cpu_cap_has(vcpu, X86_FEATURE_XSAVES) && guest_cpuid_has(vcpu, X86_FEATURE_XSAVES)) set_msr_interception(vcpu, svm->msrpm, MSR_IA32_XSS, 1, 1); else @@ -3020,11 +4432,11 @@ void sev_vcpu_after_set_cpuid(struct vcpu_svm *svm) static void sev_es_init_vmcb(struct vcpu_svm *svm) { + struct kvm_sev_info *sev = to_kvm_sev_info(svm->vcpu.kvm); struct vmcb *vmcb = svm->vmcb01.ptr; struct kvm_vcpu *vcpu = &svm->vcpu; svm->vmcb->control.nested_ctl |= SVM_NESTED_CTL_SEV_ES_ENABLE; - svm->vmcb->control.virt_ext |= LBR_CTL_ENABLE_MASK; /* * An SEV-ES guest requires a VMSA area that is a separate from the @@ -3033,9 +4445,13 @@ static void sev_es_init_vmcb(struct vcpu_svm *svm) * the VMSA will be NULL if this vCPU is the destination for intrahost * migration, and will be copied later. */ - if (svm->sev_es.vmsa) + if (svm->sev_es.vmsa && !svm->sev_es.snp_has_guest_vmsa) svm->vmcb->control.vmsa_pa = __pa(svm->sev_es.vmsa); + if (cpu_feature_enabled(X86_FEATURE_ALLOWED_SEV_FEATURES)) + svm->vmcb->control.allowed_sev_features = sev->vmsa_features | + VMCB_ALLOWED_SEV_FEATURES_VALID; + /* Can't intercept CR register access, HV can't modify CR registers */ svm_clr_intercept(svm, INTERCEPT_CR0_READ); svm_clr_intercept(svm, INTERCEPT_CR4_READ); @@ -3053,7 +4469,7 @@ static void sev_es_init_vmcb(struct vcpu_svm *svm) svm_set_intercept(svm, TRAP_CR8_WRITE); vmcb->control.intercepts[INTERCEPT_DR] = 0; - if (!sev_es_debug_swap_enabled) { + if (!sev_vcpu_has_debug_swap(svm)) { vmcb_set_intercept(&vmcb->control, INTERCEPT_DR7_READ); vmcb_set_intercept(&vmcb->control, INTERCEPT_DR7_WRITE); recalc_intercepts(svm); @@ -3076,10 +4492,6 @@ static void sev_es_init_vmcb(struct vcpu_svm *svm) /* Clear intercepts on selected MSRs */ set_msr_interception(vcpu, svm->msrpm, MSR_EFER, 1, 1); set_msr_interception(vcpu, svm->msrpm, MSR_IA32_CR_PAT, 1, 1); - set_msr_interception(vcpu, svm->msrpm, MSR_IA32_LASTBRANCHFROMIP, 1, 1); - set_msr_interception(vcpu, svm->msrpm, MSR_IA32_LASTBRANCHTOIP, 1, 1); - set_msr_interception(vcpu, svm->msrpm, MSR_IA32_LASTINTFROMIP, 1, 1); - set_msr_interception(vcpu, svm->msrpm, MSR_IA32_LASTINTTOIP, 1, 1); } void sev_init_vmcb(struct vcpu_svm *svm) @@ -3099,17 +4511,24 @@ void sev_init_vmcb(struct vcpu_svm *svm) void sev_es_vcpu_reset(struct vcpu_svm *svm) { + struct kvm_vcpu *vcpu = &svm->vcpu; + struct kvm_sev_info *sev = to_kvm_sev_info(vcpu->kvm); + /* * Set the GHCB MSR value as per the GHCB specification when emulating * vCPU RESET for an SEV-ES guest. */ - set_ghcb_msr(svm, GHCB_MSR_SEV_INFO(GHCB_VERSION_MAX, + set_ghcb_msr(svm, GHCB_MSR_SEV_INFO((__u64)sev->ghcb_version, GHCB_VERSION_MIN, sev_enc_bit)); + + mutex_init(&svm->sev_es.snp_vmsa_mutex); } -void sev_es_prepare_switch_to_guest(struct sev_es_save_area *hostsa) +void sev_es_prepare_switch_to_guest(struct vcpu_svm *svm, struct sev_es_save_area *hostsa) { + struct kvm *kvm = svm->vcpu.kvm; + /* * All host state for SEV-ES guests is categorized into three swap types * based on how it is handled by hardware during a world switch: @@ -3127,20 +4546,28 @@ void sev_es_prepare_switch_to_guest(struct sev_es_save_area *hostsa) * isn't saved by VMRUN, that isn't already saved by VMSAVE (performed * by common SVM code). */ - hostsa->xcr0 = xgetbv(XCR_XFEATURE_ENABLED_MASK); + hostsa->xcr0 = kvm_host.xcr0; hostsa->pkru = read_pkru(); - hostsa->xss = host_xss; + hostsa->xss = kvm_host.xss; /* * If DebugSwap is enabled, debug registers are loaded but NOT saved by - * the CPU (Type-B). If DebugSwap is disabled/unsupported, the CPU both - * saves and loads debug registers (Type-A). - */ - if (sev_es_debug_swap_enabled) { - hostsa->dr0 = native_get_debugreg(0); - hostsa->dr1 = native_get_debugreg(1); - hostsa->dr2 = native_get_debugreg(2); - hostsa->dr3 = native_get_debugreg(3); + * the CPU (Type-B). If DebugSwap is disabled/unsupported, the CPU does + * not save or load debug registers. Sadly, KVM can't prevent SNP + * guests from lying about DebugSwap on secondary vCPUs, i.e. the + * SEV_FEATURES provided at "AP Create" isn't guaranteed to match what + * the guest has actually enabled (or not!) in the VMSA. + * + * If DebugSwap is *possible*, save the masks so that they're restored + * if the guest enables DebugSwap. But for the DRs themselves, do NOT + * rely on the CPU to restore the host values; KVM will restore them as + * needed in common code, via hw_breakpoint_restore(). Note, KVM does + * NOT support virtualizing Breakpoint Extensions, i.e. the mask MSRs + * don't need to be restored per se, KVM just needs to ensure they are + * loaded with the correct values *if* the CPU writes the MSRs. + */ + if (sev_vcpu_has_debug_swap(svm) || + (sev_snp_guest(kvm) && cpu_feature_enabled(X86_FEATURE_DEBUG_SWAP))) { hostsa->dr0_addr_mask = amd_get_dr_addr_mask(0); hostsa->dr1_addr_mask = amd_get_dr_addr_mask(1); hostsa->dr2_addr_mask = amd_get_dr_addr_mask(2); @@ -3158,24 +4585,40 @@ void sev_vcpu_deliver_sipi_vector(struct kvm_vcpu *vcpu, u8 vector) return; } - /* - * Subsequent SIPI: Return from an AP Reset Hold VMGEXIT, where - * the guest will set the CS and RIP. Set SW_EXIT_INFO_2 to a - * non-zero value. - */ - if (!svm->sev_es.ghcb) - return; + /* Subsequent SIPI */ + switch (svm->sev_es.ap_reset_hold_type) { + case AP_RESET_HOLD_NAE_EVENT: + /* + * Return from an AP Reset Hold VMGEXIT, where the guest will + * set the CS and RIP. Set SW_EXIT_INFO_2 to a non-zero value. + */ + svm_vmgexit_success(svm, 1); + break; + case AP_RESET_HOLD_MSR_PROTO: + /* + * Return from an AP Reset Hold VMGEXIT, where the guest will + * set the CS and RIP. Set GHCB data field to a non-zero value. + */ + set_ghcb_msr_bits(svm, 1, + GHCB_MSR_AP_RESET_HOLD_RESULT_MASK, + GHCB_MSR_AP_RESET_HOLD_RESULT_POS); - ghcb_set_sw_exit_info_2(svm->sev_es.ghcb, 1); + set_ghcb_msr_bits(svm, GHCB_MSR_AP_RESET_HOLD_RESP, + GHCB_MSR_INFO_MASK, + GHCB_MSR_INFO_POS); + break; + default: + break; + } } -struct page *snp_safe_alloc_page(struct kvm_vcpu *vcpu) +struct page *snp_safe_alloc_page_node(int node, gfp_t gfp) { unsigned long pfn; struct page *p; - if (!cpu_feature_enabled(X86_FEATURE_SEV_SNP)) - return alloc_page(GFP_KERNEL_ACCOUNT | __GFP_ZERO); + if (!cc_platform_has(CC_ATTR_HOST_SEV_SNP)) + return alloc_pages_node(node, gfp | __GFP_ZERO, 0); /* * Allocate an SNP-safe page to workaround the SNP erratum where @@ -3186,7 +4629,7 @@ struct page *snp_safe_alloc_page(struct kvm_vcpu *vcpu) * Allocate one extra page, choose a page which is not * 2MB-aligned, and free the other. */ - p = alloc_pages(GFP_KERNEL_ACCOUNT | __GFP_ZERO, 1); + p = alloc_pages_node(node, gfp | __GFP_ZERO, 1); if (!p) return NULL; @@ -3200,3 +4643,366 @@ struct page *snp_safe_alloc_page(struct kvm_vcpu *vcpu) return p; } + +void sev_handle_rmp_fault(struct kvm_vcpu *vcpu, gpa_t gpa, u64 error_code) +{ + struct kvm_memory_slot *slot; + struct kvm *kvm = vcpu->kvm; + int order, rmp_level, ret; + struct page *page; + bool assigned; + kvm_pfn_t pfn; + gfn_t gfn; + + gfn = gpa >> PAGE_SHIFT; + + /* + * The only time RMP faults occur for shared pages is when the guest is + * triggering an RMP fault for an implicit page-state change from + * shared->private. Implicit page-state changes are forwarded to + * userspace via KVM_EXIT_MEMORY_FAULT events, however, so RMP faults + * for shared pages should not end up here. + */ + if (!kvm_mem_is_private(kvm, gfn)) { + pr_warn_ratelimited("SEV: Unexpected RMP fault for non-private GPA 0x%llx\n", + gpa); + return; + } + + slot = gfn_to_memslot(kvm, gfn); + if (!kvm_slot_can_be_private(slot)) { + pr_warn_ratelimited("SEV: Unexpected RMP fault, non-private slot for GPA 0x%llx\n", + gpa); + return; + } + + ret = kvm_gmem_get_pfn(kvm, slot, gfn, &pfn, &page, &order); + if (ret) { + pr_warn_ratelimited("SEV: Unexpected RMP fault, no backing page for private GPA 0x%llx\n", + gpa); + return; + } + + ret = snp_lookup_rmpentry(pfn, &assigned, &rmp_level); + if (ret || !assigned) { + pr_warn_ratelimited("SEV: Unexpected RMP fault, no assigned RMP entry found for GPA 0x%llx PFN 0x%llx error %d\n", + gpa, pfn, ret); + goto out_no_trace; + } + + /* + * There are 2 cases where a PSMASH may be needed to resolve an #NPF + * with PFERR_GUEST_RMP_BIT set: + * + * 1) RMPADJUST/PVALIDATE can trigger an #NPF with PFERR_GUEST_SIZEM + * bit set if the guest issues them with a smaller granularity than + * what is indicated by the page-size bit in the 2MB RMP entry for + * the PFN that backs the GPA. + * + * 2) Guest access via NPT can trigger an #NPF if the NPT mapping is + * smaller than what is indicated by the 2MB RMP entry for the PFN + * that backs the GPA. + * + * In both these cases, the corresponding 2M RMP entry needs to + * be PSMASH'd to 512 4K RMP entries. If the RMP entry is already + * split into 4K RMP entries, then this is likely a spurious case which + * can occur when there are concurrent accesses by the guest to a 2MB + * GPA range that is backed by a 2MB-aligned PFN who's RMP entry is in + * the process of being PMASH'd into 4K entries. These cases should + * resolve automatically on subsequent accesses, so just ignore them + * here. + */ + if (rmp_level == PG_LEVEL_4K) + goto out; + + ret = snp_rmptable_psmash(pfn); + if (ret) { + /* + * Look it up again. If it's 4K now then the PSMASH may have + * raced with another process and the issue has already resolved + * itself. + */ + if (!snp_lookup_rmpentry(pfn, &assigned, &rmp_level) && + assigned && rmp_level == PG_LEVEL_4K) + goto out; + + pr_warn_ratelimited("SEV: Unable to split RMP entry for GPA 0x%llx PFN 0x%llx ret %d\n", + gpa, pfn, ret); + } + + kvm_zap_gfn_range(kvm, gfn, gfn + PTRS_PER_PMD); +out: + trace_kvm_rmp_fault(vcpu, gpa, pfn, error_code, rmp_level, ret); +out_no_trace: + kvm_release_page_unused(page); +} + +static bool is_pfn_range_shared(kvm_pfn_t start, kvm_pfn_t end) +{ + kvm_pfn_t pfn = start; + + while (pfn < end) { + int ret, rmp_level; + bool assigned; + + ret = snp_lookup_rmpentry(pfn, &assigned, &rmp_level); + if (ret) { + pr_warn_ratelimited("SEV: Failed to retrieve RMP entry: PFN 0x%llx GFN start 0x%llx GFN end 0x%llx RMP level %d error %d\n", + pfn, start, end, rmp_level, ret); + return false; + } + + if (assigned) { + pr_debug("%s: overlap detected, PFN 0x%llx start 0x%llx end 0x%llx RMP level %d\n", + __func__, pfn, start, end, rmp_level); + return false; + } + + pfn++; + } + + return true; +} + +static u8 max_level_for_order(int order) +{ + if (order >= KVM_HPAGE_GFN_SHIFT(PG_LEVEL_2M)) + return PG_LEVEL_2M; + + return PG_LEVEL_4K; +} + +static bool is_large_rmp_possible(struct kvm *kvm, kvm_pfn_t pfn, int order) +{ + kvm_pfn_t pfn_aligned = ALIGN_DOWN(pfn, PTRS_PER_PMD); + + /* + * If this is a large folio, and the entire 2M range containing the + * PFN is currently shared, then the entire 2M-aligned range can be + * set to private via a single 2M RMP entry. + */ + if (max_level_for_order(order) > PG_LEVEL_4K && + is_pfn_range_shared(pfn_aligned, pfn_aligned + PTRS_PER_PMD)) + return true; + + return false; +} + +int sev_gmem_prepare(struct kvm *kvm, kvm_pfn_t pfn, gfn_t gfn, int max_order) +{ + struct kvm_sev_info *sev = to_kvm_sev_info(kvm); + kvm_pfn_t pfn_aligned; + gfn_t gfn_aligned; + int level, rc; + bool assigned; + + if (!sev_snp_guest(kvm)) + return 0; + + rc = snp_lookup_rmpentry(pfn, &assigned, &level); + if (rc) { + pr_err_ratelimited("SEV: Failed to look up RMP entry: GFN %llx PFN %llx error %d\n", + gfn, pfn, rc); + return -ENOENT; + } + + if (assigned) { + pr_debug("%s: already assigned: gfn %llx pfn %llx max_order %d level %d\n", + __func__, gfn, pfn, max_order, level); + return 0; + } + + if (is_large_rmp_possible(kvm, pfn, max_order)) { + level = PG_LEVEL_2M; + pfn_aligned = ALIGN_DOWN(pfn, PTRS_PER_PMD); + gfn_aligned = ALIGN_DOWN(gfn, PTRS_PER_PMD); + } else { + level = PG_LEVEL_4K; + pfn_aligned = pfn; + gfn_aligned = gfn; + } + + rc = rmp_make_private(pfn_aligned, gfn_to_gpa(gfn_aligned), level, sev->asid, false); + if (rc) { + pr_err_ratelimited("SEV: Failed to update RMP entry: GFN %llx PFN %llx level %d error %d\n", + gfn, pfn, level, rc); + return -EINVAL; + } + + pr_debug("%s: updated: gfn %llx pfn %llx pfn_aligned %llx max_order %d level %d\n", + __func__, gfn, pfn, pfn_aligned, max_order, level); + + return 0; +} + +void sev_gmem_invalidate(kvm_pfn_t start, kvm_pfn_t end) +{ + kvm_pfn_t pfn; + + if (!cc_platform_has(CC_ATTR_HOST_SEV_SNP)) + return; + + pr_debug("%s: PFN start 0x%llx PFN end 0x%llx\n", __func__, start, end); + + for (pfn = start; pfn < end;) { + bool use_2m_update = false; + int rc, rmp_level; + bool assigned; + + rc = snp_lookup_rmpentry(pfn, &assigned, &rmp_level); + if (rc || !assigned) + goto next_pfn; + + use_2m_update = IS_ALIGNED(pfn, PTRS_PER_PMD) && + end >= (pfn + PTRS_PER_PMD) && + rmp_level > PG_LEVEL_4K; + + /* + * If an unaligned PFN corresponds to a 2M region assigned as a + * large page in the RMP table, PSMASH the region into individual + * 4K RMP entries before attempting to convert a 4K sub-page. + */ + if (!use_2m_update && rmp_level > PG_LEVEL_4K) { + /* + * This shouldn't fail, but if it does, report it, but + * still try to update RMP entry to shared and pray this + * was a spurious error that can be addressed later. + */ + rc = snp_rmptable_psmash(pfn); + WARN_ONCE(rc, "SEV: Failed to PSMASH RMP entry for PFN 0x%llx error %d\n", + pfn, rc); + } + + rc = rmp_make_shared(pfn, use_2m_update ? PG_LEVEL_2M : PG_LEVEL_4K); + if (WARN_ONCE(rc, "SEV: Failed to update RMP entry for PFN 0x%llx error %d\n", + pfn, rc)) + goto next_pfn; + + /* + * SEV-ES avoids host/guest cache coherency issues through + * WBINVD hooks issued via MMU notifiers during run-time, and + * KVM's VM destroy path at shutdown. Those MMU notifier events + * don't cover gmem since there is no requirement to map pages + * to a HVA in order to use them for a running guest. While the + * shutdown path would still likely cover things for SNP guests, + * userspace may also free gmem pages during run-time via + * hole-punching operations on the guest_memfd, so flush the + * cache entries for these pages before free'ing them back to + * the host. + */ + clflush_cache_range(__va(pfn_to_hpa(pfn)), + use_2m_update ? PMD_SIZE : PAGE_SIZE); +next_pfn: + pfn += use_2m_update ? PTRS_PER_PMD : 1; + cond_resched(); + } +} + +int sev_private_max_mapping_level(struct kvm *kvm, kvm_pfn_t pfn) +{ + int level, rc; + bool assigned; + + if (!sev_snp_guest(kvm)) + return 0; + + rc = snp_lookup_rmpentry(pfn, &assigned, &level); + if (rc || !assigned) + return PG_LEVEL_4K; + + return level; +} + +struct vmcb_save_area *sev_decrypt_vmsa(struct kvm_vcpu *vcpu) +{ + struct vcpu_svm *svm = to_svm(vcpu); + struct vmcb_save_area *vmsa; + struct kvm_sev_info *sev; + int error = 0; + int ret; + + if (!sev_es_guest(vcpu->kvm)) + return NULL; + + /* + * If the VMSA has not yet been encrypted, return a pointer to the + * current un-encrypted VMSA. + */ + if (!vcpu->arch.guest_state_protected) + return (struct vmcb_save_area *)svm->sev_es.vmsa; + + sev = to_kvm_sev_info(vcpu->kvm); + + /* Check if the SEV policy allows debugging */ + if (sev_snp_guest(vcpu->kvm)) { + if (!(sev->policy & SNP_POLICY_DEBUG)) + return NULL; + } else { + if (sev->policy & SEV_POLICY_NODBG) + return NULL; + } + + if (sev_snp_guest(vcpu->kvm)) { + struct sev_data_snp_dbg dbg = {0}; + + vmsa = snp_alloc_firmware_page(__GFP_ZERO); + if (!vmsa) + return NULL; + + dbg.gctx_paddr = __psp_pa(sev->snp_context); + dbg.src_addr = svm->vmcb->control.vmsa_pa; + dbg.dst_addr = __psp_pa(vmsa); + + ret = sev_do_cmd(SEV_CMD_SNP_DBG_DECRYPT, &dbg, &error); + + /* + * Return the target page to a hypervisor page no matter what. + * If this fails, the page can't be used, so leak it and don't + * try to use it. + */ + if (snp_page_reclaim(vcpu->kvm, PHYS_PFN(__pa(vmsa)))) + return NULL; + + if (ret) { + pr_err("SEV: SNP_DBG_DECRYPT failed ret=%d, fw_error=%d (%#x)\n", + ret, error, error); + free_page((unsigned long)vmsa); + + return NULL; + } + } else { + struct sev_data_dbg dbg = {0}; + struct page *vmsa_page; + + vmsa_page = alloc_page(GFP_KERNEL); + if (!vmsa_page) + return NULL; + + vmsa = page_address(vmsa_page); + + dbg.handle = sev->handle; + dbg.src_addr = svm->vmcb->control.vmsa_pa; + dbg.dst_addr = __psp_pa(vmsa); + dbg.len = PAGE_SIZE; + + ret = sev_do_cmd(SEV_CMD_DBG_DECRYPT, &dbg, &error); + if (ret) { + pr_err("SEV: SEV_CMD_DBG_DECRYPT failed ret=%d, fw_error=%d (0x%x)\n", + ret, error, error); + __free_page(vmsa_page); + + return NULL; + } + } + + return vmsa; +} + +void sev_free_decrypted_vmsa(struct kvm_vcpu *vcpu, struct vmcb_save_area *vmsa) +{ + /* If the VMSA has not yet been encrypted, nothing was allocated */ + if (!vcpu->arch.guest_state_protected || !vmsa) + return; + + free_page((unsigned long)vmsa); +} diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c index d1a9f9951635..ab9b947dbf4f 100644 --- a/arch/x86/kvm/svm/svm.c +++ b/arch/x86/kvm/svm/svm.c @@ -28,8 +28,11 @@ #include <linux/rwsem.h> #include <linux/cc_platform.h> #include <linux/smp.h> +#include <linux/string_choices.h> +#include <linux/mutex.h> #include <asm/apic.h> +#include <asm/msr.h> #include <asm/perf_event.h> #include <asm/tlbflush.h> #include <asm/desc.h> @@ -53,6 +56,7 @@ #include "svm_onhyperv.h" MODULE_AUTHOR("Qumranet"); +MODULE_DESCRIPTION("KVM support for SVM (AMD-V) extensions"); MODULE_LICENSE("GPL"); #ifdef MODULE @@ -99,6 +103,7 @@ static const struct svm_direct_access_msrs { { .index = MSR_IA32_SPEC_CTRL, .always = false }, { .index = MSR_IA32_PRED_CMD, .always = false }, { .index = MSR_IA32_FLUSH_CMD, .always = false }, + { .index = MSR_IA32_DEBUGCTLMSR, .always = false }, { .index = MSR_IA32_LASTBRANCHFROMIP, .always = false }, { .index = MSR_IA32_LASTBRANCHTOIP, .always = false }, { .index = MSR_IA32_LASTINTFROMIP, .always = false }, @@ -215,7 +220,7 @@ int vgif = true; module_param(vgif, int, 0444); /* enable/disable LBR virtualization */ -static int lbrv = true; +int lbrv = true; module_param(lbrv, int, 0444); static int tsc_scaling = true; @@ -228,6 +233,8 @@ module_param(tsc_scaling, int, 0444); static bool avic; module_param(avic, bool, 0444); +module_param(enable_device_posted_irqs, bool, 0444); + bool __read_mostly dump_invalid_vmcb; module_param(dump_invalid_vmcb, bool, 0644); @@ -246,6 +253,8 @@ static unsigned long iopm_base; DEFINE_PER_CPU(struct svm_cpu_data, svm_data); +static DEFINE_MUTEX(vmcb_dump_mutex); + /* * Only MSR_TSC_AUX is switched via the user return hook. EFER is switched via * the VMCB, and the SYSCALL/SYSENTER MSRs are handled by VMLOAD/VMSAVE. @@ -282,8 +291,6 @@ u32 svm_msrpm_offset(u32 msr) return MSR_INVALID; } -static void svm_flush_tlb_current(struct kvm_vcpu *vcpu); - static int get_npt_level(void) { #ifdef CONFIG_X86_64 @@ -474,24 +481,18 @@ static void svm_inject_exception(struct kvm_vcpu *vcpu) static void svm_init_erratum_383(void) { - u32 low, high; - int err; u64 val; if (!static_cpu_has_bug(X86_BUG_AMD_TLB_MMATCH)) return; /* Use _safe variants to not break nested virtualization */ - val = native_read_msr_safe(MSR_AMD64_DC_CFG, &err); - if (err) + if (native_read_msr_safe(MSR_AMD64_DC_CFG, &val)) return; val |= (1ULL << 47); - low = lower_32_bits(val); - high = upper_32_bits(val); - - native_write_msr_safe(MSR_AMD64_DC_CFG, low, high); + native_write_msr_safe(MSR_AMD64_DC_CFG, val); erratum_383_found = true; } @@ -565,34 +566,39 @@ static void __svm_write_tsc_multiplier(u64 multiplier) if (multiplier == __this_cpu_read(current_tsc_ratio)) return; - wrmsrl(MSR_AMD64_TSC_RATIO, multiplier); + wrmsrq(MSR_AMD64_TSC_RATIO, multiplier); __this_cpu_write(current_tsc_ratio, multiplier); } +static __always_inline struct sev_es_save_area *sev_es_host_save_area(struct svm_cpu_data *sd) +{ + return &sd->save_area->host_sev_es_save; +} + static inline void kvm_cpu_svm_disable(void) { uint64_t efer; - wrmsrl(MSR_VM_HSAVE_PA, 0); - rdmsrl(MSR_EFER, efer); + wrmsrq(MSR_VM_HSAVE_PA, 0); + rdmsrq(MSR_EFER, efer); if (efer & EFER_SVME) { /* * Force GIF=1 prior to disabling SVM, e.g. to ensure INIT and * NMI aren't blocked. */ stgi(); - wrmsrl(MSR_EFER, efer & ~EFER_SVME); + wrmsrq(MSR_EFER, efer & ~EFER_SVME); } } -static void svm_emergency_disable(void) +static void svm_emergency_disable_virtualization_cpu(void) { kvm_rebooting = true; kvm_cpu_svm_disable(); } -static void svm_hardware_disable(void) +static void svm_disable_virtualization_cpu(void) { /* Make sure we clean up behind us */ if (tsc_scaling) @@ -603,14 +609,14 @@ static void svm_hardware_disable(void) amd_pmu_disable_virt(); } -static int svm_hardware_enable(void) +static int svm_enable_virtualization_cpu(void) { struct svm_cpu_data *sd; uint64_t efer; int me = raw_smp_processor_id(); - rdmsrl(MSR_EFER, efer); + rdmsrq(MSR_EFER, efer); if (efer & EFER_SVME) return -EBUSY; @@ -620,9 +626,9 @@ static int svm_hardware_enable(void) sd->next_asid = sd->max_asid + 1; sd->min_asid = max_sev_asid + 1; - wrmsrl(MSR_EFER, efer | EFER_SVME); + wrmsrq(MSR_EFER, efer | EFER_SVME); - wrmsrl(MSR_VM_HSAVE_PA, sd->save_area_pa); + wrmsrq(MSR_VM_HSAVE_PA, sd->save_area_pa); if (static_cpu_has(X86_FEATURE_TSCRATEMSR)) { /* @@ -643,13 +649,12 @@ static int svm_hardware_enable(void) * erratum is present everywhere). */ if (cpu_has(&boot_cpu_data, X86_FEATURE_OSVW)) { - uint64_t len, status = 0; + u64 len, status = 0; int err; - len = native_read_msr_safe(MSR_AMD64_OSVW_ID_LENGTH, &err); + err = native_read_msr_safe(MSR_AMD64_OSVW_ID_LENGTH, &len); if (!err) - status = native_read_msr_safe(MSR_AMD64_OSVW_STATUS, - &err); + err = native_read_msr_safe(MSR_AMD64_OSVW_STATUS, &status); if (err) osvw_status = osvw_len = 0; @@ -673,12 +678,9 @@ static int svm_hardware_enable(void) * TSC_AUX field now to avoid a RDMSR on every vCPU run. */ if (boot_cpu_has(X86_FEATURE_V_TSC_AUX)) { - struct sev_es_save_area *hostsa; u32 __maybe_unused msr_hi; - hostsa = (struct sev_es_save_area *)(page_address(sd->save_area) + 0x400); - - rdmsr(MSR_TSC_AUX, hostsa->tsc_aux, msr_hi); + rdmsr(MSR_TSC_AUX, sev_es_host_save_area(sd)->tsc_aux, msr_hi); } return 0; @@ -692,7 +694,7 @@ static void svm_cpu_uninit(int cpu) return; kfree(sd->sev_vmcbs); - __free_page(sd->save_area); + __free_page(__sme_pa_to_page(sd->save_area_pa)); sd->save_area_pa = 0; sd->save_area = NULL; } @@ -700,23 +702,24 @@ static void svm_cpu_uninit(int cpu) static int svm_cpu_init(int cpu) { struct svm_cpu_data *sd = per_cpu_ptr(&svm_data, cpu); + struct page *save_area_page; int ret = -ENOMEM; memset(sd, 0, sizeof(struct svm_cpu_data)); - sd->save_area = snp_safe_alloc_page(NULL); - if (!sd->save_area) + save_area_page = snp_safe_alloc_page_node(cpu_to_node(cpu), GFP_KERNEL); + if (!save_area_page) return ret; ret = sev_cpu_init(sd); if (ret) goto free_save_area; - sd->save_area_pa = __sme_page_pa(sd->save_area); + sd->save_area = page_address(save_area_page); + sd->save_area_pa = __sme_page_pa(save_area_page); return 0; free_save_area: - __free_page(sd->save_area); - sd->save_area = NULL; + __free_page(save_area_page); return ret; } @@ -990,7 +993,7 @@ void svm_copy_lbrs(struct vmcb *to_vmcb, struct vmcb *from_vmcb) vmcb_mark_dirty(to_vmcb, VMCB_LBR); } -static void svm_enable_lbrv(struct kvm_vcpu *vcpu) +void svm_enable_lbrv(struct kvm_vcpu *vcpu) { struct vcpu_svm *svm = to_svm(vcpu); @@ -1000,6 +1003,9 @@ static void svm_enable_lbrv(struct kvm_vcpu *vcpu) set_msr_interception(vcpu, svm->msrpm, MSR_IA32_LASTINTFROMIP, 1, 1); set_msr_interception(vcpu, svm->msrpm, MSR_IA32_LASTINTTOIP, 1, 1); + if (sev_es_guest(vcpu->kvm)) + set_msr_interception(vcpu, svm->msrpm, MSR_IA32_DEBUGCTLMSR, 1, 1); + /* Move the LBR msrs to the vmcb02 so that the guest can see them. */ if (is_guest_mode(vcpu)) svm_copy_lbrs(svm->vmcb, svm->vmcb01.ptr); @@ -1009,6 +1015,8 @@ static void svm_disable_lbrv(struct kvm_vcpu *vcpu) { struct vcpu_svm *svm = to_svm(vcpu); + KVM_BUG_ON(sev_es_guest(vcpu->kvm), vcpu->kvm); + svm->vmcb->control.virt_ext &= ~LBR_CTL_ENABLE_MASK; set_msr_interception(vcpu, svm->msrpm, MSR_IA32_LASTBRANCHFROMIP, 0, 0); set_msr_interception(vcpu, svm->msrpm, MSR_IA32_LASTBRANCHTOIP, 0, 0); @@ -1039,7 +1047,7 @@ void svm_update_lbrv(struct kvm_vcpu *vcpu) struct vcpu_svm *svm = to_svm(vcpu); bool current_enable_lbrv = svm->vmcb->control.virt_ext & LBR_CTL_ENABLE_MASK; bool enable_lbrv = (svm_get_lbr_vmcb(svm)->save.dbgctl & DEBUGCTLMSR_LBR) || - (is_guest_mode(vcpu) && guest_can_use(vcpu, X86_FEATURE_LBRV) && + (is_guest_mode(vcpu) && guest_cpu_cap_has(vcpu, X86_FEATURE_LBRV) && (svm->nested.ctl.virt_ext & LBR_CTL_ENABLE_MASK)); if (enable_lbrv == current_enable_lbrv) @@ -1115,8 +1123,7 @@ static void svm_hardware_unsetup(void) for_each_possible_cpu(cpu) svm_cpu_uninit(cpu); - __free_pages(pfn_to_page(iopm_base >> PAGE_SHIFT), - get_order(IOPM_SIZE)); + __free_pages(__sme_pa_to_page(iopm_base), get_order(IOPM_SIZE)); iopm_base = 0; } @@ -1178,14 +1185,14 @@ static void svm_recalc_instruction_intercepts(struct kvm_vcpu *vcpu, */ if (kvm_cpu_cap_has(X86_FEATURE_INVPCID)) { if (!npt_enabled || - !guest_cpuid_has(&svm->vcpu, X86_FEATURE_INVPCID)) + !guest_cpu_cap_has(&svm->vcpu, X86_FEATURE_INVPCID)) svm_set_intercept(svm, INTERCEPT_INVPCID); else svm_clr_intercept(svm, INTERCEPT_INVPCID); } if (kvm_cpu_cap_has(X86_FEATURE_RDTSCP)) { - if (guest_cpuid_has(vcpu, X86_FEATURE_RDTSCP)) + if (guest_cpu_cap_has(vcpu, X86_FEATURE_RDTSCP)) svm_clr_intercept(svm, INTERCEPT_RDTSCP); else svm_set_intercept(svm, INTERCEPT_RDTSCP); @@ -1196,7 +1203,7 @@ static inline void init_vmcb_after_set_cpuid(struct kvm_vcpu *vcpu) { struct vcpu_svm *svm = to_svm(vcpu); - if (guest_cpuid_is_intel(vcpu)) { + if (guest_cpuid_is_intel_compatible(vcpu)) { /* * We must intercept SYSENTER_EIP and SYSENTER_ESP * accesses because the processor only stores 32 bits. @@ -1289,10 +1296,14 @@ static void init_vmcb(struct kvm_vcpu *vcpu) svm_set_intercept(svm, INTERCEPT_MWAIT); } - if (!kvm_hlt_in_guest(vcpu->kvm)) - svm_set_intercept(svm, INTERCEPT_HLT); + if (!kvm_hlt_in_guest(vcpu->kvm)) { + if (cpu_feature_enabled(X86_FEATURE_IDLE_HLT)) + svm_set_intercept(svm, INTERCEPT_IDLE_HLT); + else + svm_set_intercept(svm, INTERCEPT_HLT); + } - control->iopm_base_pa = __sme_set(iopm_base); + control->iopm_base_pa = iopm_base; control->msrpm_base_pa = __sme_set(__pa(svm->msrpm)); control->int_ctl = V_INTR_MASKING_MASK; @@ -1363,6 +1374,9 @@ static void init_vmcb(struct kvm_vcpu *vcpu) svm->vmcb->control.int_ctl |= V_GIF_ENABLE_MASK; } + if (vcpu->kvm->arch.bus_lock_detection_enabled) + svm_set_intercept(svm, INTERCEPT_BUSLOCK); + if (sev_guest(vcpu->kvm)) sev_init_vmcb(svm); @@ -1381,7 +1395,9 @@ static void __svm_vcpu_reset(struct kvm_vcpu *vcpu) svm_vcpu_init_msrpm(vcpu, svm->msrpm); svm_init_osvw(vcpu); - vcpu->arch.microcode_version = 0x01000065; + + if (kvm_check_has_quirk(vcpu->kvm, KVM_X86_QUIRK_STUFF_FEATURE_MSRS)) + vcpu->arch.microcode_version = 0x01000065; svm->tsc_ratio_msr = kvm_caps.default_tsc_scaling_ratio; svm->nmi_masked = false; @@ -1398,6 +1414,9 @@ static void svm_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event) svm->spec_ctrl = 0; svm->virt_spec_ctrl = 0; + if (init_event) + sev_snp_init_protected_guest_state(vcpu); + init_vmcb(vcpu); if (!init_event) @@ -1421,7 +1440,7 @@ static int svm_vcpu_create(struct kvm_vcpu *vcpu) svm = to_svm(vcpu); err = -ENOMEM; - vmcb01_page = snp_safe_alloc_page(vcpu); + vmcb01_page = snp_safe_alloc_page(); if (!vmcb01_page) goto out; @@ -1430,17 +1449,9 @@ static int svm_vcpu_create(struct kvm_vcpu *vcpu) * SEV-ES guests require a separate VMSA page used to contain * the encrypted register state of the guest. */ - vmsa_page = snp_safe_alloc_page(vcpu); + vmsa_page = snp_safe_alloc_page(); if (!vmsa_page) goto error_free_vmcb_page; - - /* - * SEV-ES guests maintain an encrypted version of their FPU - * state which is restored and saved on VMRUN and VMEXIT. - * Mark vcpu->arch.guest_fpu->fpstate as scratch so it won't - * do xsave/xrstor on it. - */ - fpstate_set_confidential(&vcpu->arch.guest_fpu); } err = avic_init_vcpu(svm); @@ -1475,33 +1486,75 @@ out: return err; } -static void svm_clear_current_vmcb(struct vmcb *vmcb) +static void svm_vcpu_free(struct kvm_vcpu *vcpu) { - int i; + struct vcpu_svm *svm = to_svm(vcpu); + + svm_leave_nested(vcpu); + svm_free_nested(svm); + + sev_free_vcpu(vcpu); - for_each_online_cpu(i) - cmpxchg(per_cpu_ptr(&svm_data.current_vmcb, i), vmcb, NULL); + __free_page(__sme_pa_to_page(svm->vmcb01.pa)); + __free_pages(virt_to_page(svm->msrpm), get_order(MSRPM_SIZE)); } -static void svm_vcpu_free(struct kvm_vcpu *vcpu) +#ifdef CONFIG_CPU_MITIGATIONS +static DEFINE_SPINLOCK(srso_lock); +static atomic_t srso_nr_vms; + +static void svm_srso_clear_bp_spec_reduce(void *ign) { - struct vcpu_svm *svm = to_svm(vcpu); + struct svm_cpu_data *sd = this_cpu_ptr(&svm_data); + + if (!sd->bp_spec_reduce_set) + return; + + msr_clear_bit(MSR_ZEN4_BP_CFG, MSR_ZEN4_BP_CFG_BP_SPEC_REDUCE_BIT); + sd->bp_spec_reduce_set = false; +} + +static void svm_srso_vm_destroy(void) +{ + if (!cpu_feature_enabled(X86_FEATURE_SRSO_BP_SPEC_REDUCE)) + return; + + if (atomic_dec_return(&srso_nr_vms)) + return; + + guard(spinlock)(&srso_lock); /* - * The vmcb page can be recycled, causing a false negative in - * svm_vcpu_load(). So, ensure that no logical CPU has this - * vmcb page recorded as its current vmcb. + * Verify a new VM didn't come along, acquire the lock, and increment + * the count before this task acquired the lock. */ - svm_clear_current_vmcb(svm->vmcb); + if (atomic_read(&srso_nr_vms)) + return; - svm_leave_nested(vcpu); - svm_free_nested(svm); + on_each_cpu(svm_srso_clear_bp_spec_reduce, NULL, 1); +} - sev_free_vcpu(vcpu); +static void svm_srso_vm_init(void) +{ + if (!cpu_feature_enabled(X86_FEATURE_SRSO_BP_SPEC_REDUCE)) + return; - __free_page(pfn_to_page(__sme_clr(svm->vmcb01.pa) >> PAGE_SHIFT)); - __free_pages(virt_to_page(svm->msrpm), get_order(MSRPM_SIZE)); + /* + * Acquire the lock on 0 => 1 transitions to ensure a potential 1 => 0 + * transition, i.e. destroying the last VM, is fully complete, e.g. so + * that a delayed IPI doesn't clear BP_SPEC_REDUCE after a vCPU runs. + */ + if (atomic_inc_not_zero(&srso_nr_vms)) + return; + + guard(spinlock)(&srso_lock); + + atomic_inc(&srso_nr_vms); } +#else +static void svm_srso_vm_init(void) { } +static void svm_srso_vm_destroy(void) { } +#endif static void svm_prepare_switch_to_guest(struct kvm_vcpu *vcpu) { @@ -1519,12 +1572,8 @@ static void svm_prepare_switch_to_guest(struct kvm_vcpu *vcpu) * or subsequent vmload of host save area. */ vmsave(sd->save_area_pa); - if (sev_es_guest(vcpu->kvm)) { - struct sev_es_save_area *hostsa; - hostsa = (struct sev_es_save_area *)(page_address(sd->save_area) + 0x400); - - sev_es_prepare_switch_to_guest(hostsa); - } + if (sev_es_guest(vcpu->kvm)) + sev_es_prepare_switch_to_guest(svm, sev_es_host_save_area(sd)); if (tsc_scaling) __svm_write_tsc_multiplier(vcpu->arch.tsc_scaling_ratio); @@ -1533,12 +1582,17 @@ static void svm_prepare_switch_to_guest(struct kvm_vcpu *vcpu) * TSC_AUX is always virtualized for SEV-ES guests when the feature is * available. The user return MSR support is not required in this case * because TSC_AUX is restored on #VMEXIT from the host save area - * (which has been initialized in svm_hardware_enable()). + * (which has been initialized in svm_enable_virtualization_cpu()). */ if (likely(tsc_aux_uret_slot >= 0) && (!boot_cpu_has(X86_FEATURE_V_TSC_AUX) || !sev_es_guest(vcpu->kvm))) kvm_set_user_return_msr(tsc_aux_uret_slot, svm->tsc_aux, -1ull); + if (cpu_feature_enabled(X86_FEATURE_SRSO_BP_SPEC_REDUCE) && + !sd->bp_spec_reduce_set) { + sd->bp_spec_reduce_set = true; + msr_set_bit(MSR_ZEN4_BP_CFG, MSR_ZEN4_BP_CFG_BP_SPEC_REDUCE_BIT); + } svm->guest_state_loaded = true; } @@ -1549,15 +1603,9 @@ static void svm_prepare_host_switch(struct kvm_vcpu *vcpu) static void svm_vcpu_load(struct kvm_vcpu *vcpu, int cpu) { - struct vcpu_svm *svm = to_svm(vcpu); - struct svm_cpu_data *sd = per_cpu_ptr(&svm_data, cpu); - - if (sd->current_vmcb != svm->vmcb) { - sd->current_vmcb = svm->vmcb; + if (vcpu->scheduled_out && !kvm_pause_in_guest(vcpu->kvm)) + shrink_ple_window(vcpu); - if (!cpu_feature_enabled(X86_FEATURE_IBPB_ON_VMEXIT)) - indirect_branch_prediction_barrier(); - } if (kvm_vcpu_apicv_active(vcpu)) avic_vcpu_load(vcpu, cpu); } @@ -1916,9 +1964,6 @@ void svm_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4) unsigned long host_cr4_mce = cr4_read_shadow() & X86_CR4_MCE; unsigned long old_cr4 = vcpu->arch.cr4; - if (npt_enabled && ((old_cr4 ^ cr4) & X86_CR4_PGE)) - svm_flush_tlb_current(vcpu); - vcpu->arch.cr4 = cr4; if (!npt_enabled) { cr4 |= X86_CR4_PAE; @@ -1931,7 +1976,7 @@ void svm_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4) vmcb_mark_dirty(to_svm(vcpu)->vmcb, VMCB_CR); if ((cr4 ^ old_cr4) & (X86_CR4_OSXSAVE | X86_CR4_PKE)) - kvm_update_cpuid_runtime(vcpu); + vcpu->arch.cpuid_dynamic_bits_dirty = true; } static void svm_set_segment(struct kvm_vcpu *vcpu, @@ -1990,11 +2035,11 @@ static void new_asid(struct vcpu_svm *svm, struct svm_cpu_data *sd) svm->asid = sd->next_asid++; } -static void svm_set_dr6(struct vcpu_svm *svm, unsigned long value) +static void svm_set_dr6(struct kvm_vcpu *vcpu, unsigned long value) { - struct vmcb *vmcb = svm->vmcb; + struct vmcb *vmcb = to_svm(vcpu)->vmcb; - if (svm->vcpu.arch.guest_state_protected) + if (vcpu->arch.guest_state_protected) return; if (unlikely(value != vmcb->save.dr6)) { @@ -2051,15 +2096,33 @@ static int pf_interception(struct kvm_vcpu *vcpu) static int npf_interception(struct kvm_vcpu *vcpu) { struct vcpu_svm *svm = to_svm(vcpu); + int rc; u64 fault_address = svm->vmcb->control.exit_info_2; u64 error_code = svm->vmcb->control.exit_info_1; + /* + * WARN if hardware generates a fault with an error code that collides + * with KVM-defined sythentic flags. Clear the flags and continue on, + * i.e. don't terminate the VM, as KVM can't possibly be relying on a + * flag that KVM doesn't know about. + */ + if (WARN_ON_ONCE(error_code & PFERR_SYNTHETIC_MASK)) + error_code &= ~PFERR_SYNTHETIC_MASK; + + if (sev_snp_guest(vcpu->kvm) && (error_code & PFERR_GUEST_ENC_MASK)) + error_code |= PFERR_PRIVATE_ACCESS; + trace_kvm_page_fault(vcpu, fault_address, error_code); - return kvm_mmu_page_fault(vcpu, fault_address, error_code, - static_cpu_has(X86_FEATURE_DECODEASSISTS) ? - svm->vmcb->control.insn_bytes : NULL, - svm->vmcb->control.insn_len); + rc = kvm_mmu_page_fault(vcpu, fault_address, error_code, + static_cpu_has(X86_FEATURE_DECODEASSISTS) ? + svm->vmcb->control.insn_bytes : NULL, + svm->vmcb->control.insn_len); + + if (rc > 0 && error_code & PFERR_GUEST_RMP_MASK) + sev_handle_rmp_fault(vcpu, fault_address, error_code); + + return rc; } static int db_interception(struct kvm_vcpu *vcpu) @@ -2119,14 +2182,13 @@ static int ac_interception(struct kvm_vcpu *vcpu) static bool is_erratum_383(void) { - int err, i; + int i; u64 value; if (!erratum_383_found) return false; - value = native_read_msr_safe(MSR_IA32_MC0_STATUS, &err); - if (err) + if (native_read_msr_safe(MSR_IA32_MC0_STATUS, &value)) return false; /* Bit 62 may or may not be set for this mce */ @@ -2137,17 +2199,11 @@ static bool is_erratum_383(void) /* Clear MCi_STATUS registers */ for (i = 0; i < 6; ++i) - native_write_msr_safe(MSR_IA32_MCx_STATUS(i), 0, 0); - - value = native_read_msr_safe(MSR_IA32_MCG_STATUS, &err); - if (!err) { - u32 low, high; + native_write_msr_safe(MSR_IA32_MCx_STATUS(i), 0); + if (!native_read_msr_safe(MSR_IA32_MCG_STATUS, &value)) { value &= ~(1ULL << 2); - low = lower_32_bits(value); - high = upper_32_bits(value); - - native_write_msr_safe(MSR_IA32_MCG_STATUS, low, high); + native_write_msr_safe(MSR_IA32_MCG_STATUS, value); } /* Flush tlb to evict multi-match entries */ @@ -2201,6 +2257,10 @@ static int shutdown_interception(struct kvm_vcpu *vcpu) */ if (!sev_es_guest(vcpu->kvm)) { clear_page(svm->vmcb); +#ifdef CONFIG_KVM_SMM + if (is_smm(vcpu)) + kvm_smm_changed(vcpu, false); +#endif kvm_vcpu_reset(vcpu, true); } @@ -2278,7 +2338,7 @@ static int vmload_vmsave_interception(struct kvm_vcpu *vcpu, bool vmload) svm_copy_vmloadsave_state(vmcb12, svm->vmcb); } - kvm_vcpu_unmap(vcpu, &map, true); + kvm_vcpu_unmap(vcpu, &map); return ret; } @@ -2804,30 +2864,44 @@ static int efer_trap(struct kvm_vcpu *vcpu) return kvm_complete_insn_gp(vcpu, ret); } -static int svm_get_msr_feature(struct kvm_msr_entry *msr) +static int svm_get_feature_msr(u32 msr, u64 *data) { - msr->data = 0; + *data = 0; - switch (msr->index) { + switch (msr) { case MSR_AMD64_DE_CFG: if (cpu_feature_enabled(X86_FEATURE_LFENCE_RDTSC)) - msr->data |= MSR_AMD64_DE_CFG_LFENCE_SERIALIZE; + *data |= MSR_AMD64_DE_CFG_LFENCE_SERIALIZE; break; default: - return KVM_MSR_RET_INVALID; + return KVM_MSR_RET_UNSUPPORTED; } return 0; } +static bool +sev_es_prevent_msr_access(struct kvm_vcpu *vcpu, struct msr_data *msr_info) +{ + return sev_es_guest(vcpu->kvm) && + vcpu->arch.guest_state_protected && + svm_msrpm_offset(msr_info->index) != MSR_INVALID && + !msr_write_intercepted(vcpu, msr_info->index); +} + static int svm_get_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info) { struct vcpu_svm *svm = to_svm(vcpu); + if (sev_es_prevent_msr_access(vcpu, msr_info)) { + msr_info->data = 0; + return vcpu->kvm->arch.has_protected_state ? -EINVAL : 0; + } + switch (msr_info->index) { case MSR_AMD64_TSC_RATIO: if (!msr_info->host_initiated && - !guest_can_use(vcpu, X86_FEATURE_TSCRATEMSR)) + !guest_cpu_cap_has(vcpu, X86_FEATURE_TSCRATEMSR)) return 1; msr_info->data = svm->tsc_ratio_msr; break; @@ -2841,6 +2915,12 @@ static int svm_get_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info) case MSR_CSTAR: msr_info->data = svm->vmcb01.ptr->save.cstar; break; + case MSR_GS_BASE: + msr_info->data = svm->vmcb01.ptr->save.gs.base; + break; + case MSR_FS_BASE: + msr_info->data = svm->vmcb01.ptr->save.fs.base; + break; case MSR_KERNEL_GS_BASE: msr_info->data = svm->vmcb01.ptr->save.kernel_gs_base; break; @@ -2853,12 +2933,12 @@ static int svm_get_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info) break; case MSR_IA32_SYSENTER_EIP: msr_info->data = (u32)svm->vmcb01.ptr->save.sysenter_eip; - if (guest_cpuid_is_intel(vcpu)) + if (guest_cpuid_is_intel_compatible(vcpu)) msr_info->data |= (u64)svm->sysenter_eip_hi << 32; break; case MSR_IA32_SYSENTER_ESP: msr_info->data = svm->vmcb01.ptr->save.sysenter_esp; - if (guest_cpuid_is_intel(vcpu)) + if (guest_cpuid_is_intel_compatible(vcpu)) msr_info->data |= (u64)svm->sysenter_esp_hi << 32; break; case MSR_TSC_AUX: @@ -2897,7 +2977,7 @@ static int svm_get_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info) break; case MSR_AMD64_VIRT_SPEC_CTRL: if (!msr_info->host_initiated && - !guest_cpuid_has(vcpu, X86_FEATURE_VIRT_SSBD)) + !guest_cpu_cap_has(vcpu, X86_FEATURE_VIRT_SSBD)) return 1; msr_info->data = svm->virt_spec_ctrl; @@ -2934,11 +3014,7 @@ static int svm_complete_emulated_msr(struct kvm_vcpu *vcpu, int err) if (!err || !sev_es_guest(vcpu->kvm) || WARN_ON_ONCE(!svm->sev_es.ghcb)) return kvm_complete_insn_gp(vcpu, err); - ghcb_set_sw_exit_info_1(svm->sev_es.ghcb, 1); - ghcb_set_sw_exit_info_2(svm->sev_es.ghcb, - X86_TRAP_GP | - SVM_EVTINJ_TYPE_EXEPT | - SVM_EVTINJ_VALID); + svm_vmgexit_inject_exception(svm, X86_TRAP_GP); return 1; } @@ -2974,10 +3050,14 @@ static int svm_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr) u32 ecx = msr->index; u64 data = msr->data; + + if (sev_es_prevent_msr_access(vcpu, msr)) + return vcpu->kvm->arch.has_protected_state ? -EINVAL : 0; + switch (ecx) { case MSR_AMD64_TSC_RATIO: - if (!guest_can_use(vcpu, X86_FEATURE_TSCRATEMSR)) { + if (!guest_cpu_cap_has(vcpu, X86_FEATURE_TSCRATEMSR)) { if (!msr->host_initiated) return 1; @@ -2999,7 +3079,7 @@ static int svm_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr) svm->tsc_ratio_msr = data; - if (guest_can_use(vcpu, X86_FEATURE_TSCRATEMSR) && + if (guest_cpu_cap_has(vcpu, X86_FEATURE_TSCRATEMSR) && is_guest_mode(vcpu)) nested_svm_update_tsc_ratio_msr(vcpu); @@ -3044,7 +3124,7 @@ static int svm_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr) break; case MSR_AMD64_VIRT_SPEC_CTRL: if (!msr->host_initiated && - !guest_cpuid_has(vcpu, X86_FEATURE_VIRT_SSBD)) + !guest_cpu_cap_has(vcpu, X86_FEATURE_VIRT_SSBD)) return 1; if (data & ~SPEC_CTRL_SSBD) @@ -3062,6 +3142,12 @@ static int svm_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr) case MSR_CSTAR: svm->vmcb01.ptr->save.cstar = data; break; + case MSR_GS_BASE: + svm->vmcb01.ptr->save.gs.base = data; + break; + case MSR_FS_BASE: + svm->vmcb01.ptr->save.fs.base = data; + break; case MSR_KERNEL_GS_BASE: svm->vmcb01.ptr->save.kernel_gs_base = data; break; @@ -3081,11 +3167,11 @@ static int svm_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr) * 32 bit part of these msrs to support Intel's * implementation of SYSENTER/SYSEXIT. */ - svm->sysenter_eip_hi = guest_cpuid_is_intel(vcpu) ? (data >> 32) : 0; + svm->sysenter_eip_hi = guest_cpuid_is_intel_compatible(vcpu) ? (data >> 32) : 0; break; case MSR_IA32_SYSENTER_ESP: svm->vmcb01.ptr->save.sysenter_esp = (u32)data; - svm->sysenter_esp_hi = guest_cpuid_is_intel(vcpu) ? (data >> 32) : 0; + svm->sysenter_esp_hi = guest_cpuid_is_intel_compatible(vcpu) ? (data >> 32) : 0; break; case MSR_TSC_AUX: /* @@ -3093,7 +3179,7 @@ static int svm_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr) * feature is available. The user return MSR support is not * required in this case because TSC_AUX is restored on #VMEXIT * from the host save area (which has been initialized in - * svm_hardware_enable()). + * svm_enable_virtualization_cpu()). */ if (boot_cpu_has(X86_FEATURE_V_TSC_AUX) && sev_es_guest(vcpu->kvm)) break; @@ -3116,6 +3202,16 @@ static int svm_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr) kvm_pr_unimpl_wrmsr(vcpu, ecx, data); break; } + + /* + * Suppress BTF as KVM doesn't virtualize BTF, but there's no + * way to communicate lack of support to the guest. + */ + if (data & DEBUGCTLMSR_BTF) { + kvm_pr_unimpl_wrmsr(vcpu, MSR_IA32_DEBUGCTLMSR, data); + data &= ~DEBUGCTLMSR_BTF; + } + if (data & DEBUGCTL_RESERVED_BITS) return 1; @@ -3140,18 +3236,12 @@ static int svm_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr) kvm_pr_unimpl_wrmsr(vcpu, ecx, data); break; case MSR_AMD64_DE_CFG: { - struct kvm_msr_entry msr_entry; - - msr_entry.index = msr->index; - if (svm_get_msr_feature(&msr_entry)) - return 1; + u64 supported_de_cfg; - /* Check the supported bits */ - if (data & ~msr_entry.data) + if (svm_get_feature_msr(ecx, &supported_de_cfg)) return 1; - /* Don't allow the guest to change a bit, #GP */ - if (!msr->host_initiated && (data ^ msr_entry.data)) + if (data & ~supported_de_cfg) return 1; svm->msr_decfg = data; @@ -3216,7 +3306,7 @@ static int invpcid_interception(struct kvm_vcpu *vcpu) unsigned long type; gva_t gva; - if (!guest_cpuid_has(vcpu, X86_FEATURE_INVPCID)) { + if (!guest_cpu_cap_has(vcpu, X86_FEATURE_INVPCID)) { kvm_queue_exception(vcpu, UD_VECTOR); return 1; } @@ -3229,9 +3319,51 @@ static int invpcid_interception(struct kvm_vcpu *vcpu) type = svm->vmcb->control.exit_info_2; gva = svm->vmcb->control.exit_info_1; + /* + * FIXME: Perform segment checks for 32-bit mode, and inject #SS if the + * stack segment is used. The intercept takes priority over all + * #GP checks except CPL>0, but somehow still generates a linear + * address? The APM is sorely lacking. + */ + if (is_noncanonical_address(gva, vcpu, 0)) { + kvm_queue_exception_e(vcpu, GP_VECTOR, 0); + return 1; + } + return kvm_handle_invpcid(vcpu, type, gva); } +static inline int complete_userspace_buslock(struct kvm_vcpu *vcpu) +{ + struct vcpu_svm *svm = to_svm(vcpu); + + /* + * If userspace has NOT changed RIP, then KVM's ABI is to let the guest + * execute the bus-locking instruction. Set the bus lock counter to '1' + * to effectively step past the bus lock. + */ + if (kvm_is_linear_rip(vcpu, vcpu->arch.cui_linear_rip)) + svm->vmcb->control.bus_lock_counter = 1; + + return 1; +} + +static int bus_lock_exit(struct kvm_vcpu *vcpu) +{ + struct vcpu_svm *svm = to_svm(vcpu); + + vcpu->run->exit_reason = KVM_EXIT_X86_BUS_LOCK; + vcpu->run->flags |= KVM_RUN_X86_BUS_LOCK; + + vcpu->arch.cui_linear_rip = kvm_get_linear_rip(vcpu); + vcpu->arch.complete_userspace_io = complete_userspace_buslock; + + if (is_guest_mode(vcpu)) + svm->nested.ctl.bus_lock_rip = vcpu->arch.cui_linear_rip; + + return 0; +} + static int (*const svm_exit_handlers[])(struct kvm_vcpu *vcpu) = { [SVM_EXIT_READ_CR0] = cr_interception, [SVM_EXIT_READ_CR3] = cr_interception, @@ -3299,11 +3431,15 @@ static int (*const svm_exit_handlers[])(struct kvm_vcpu *vcpu) = { [SVM_EXIT_CR4_WRITE_TRAP] = cr_trap, [SVM_EXIT_CR8_WRITE_TRAP] = cr_trap, [SVM_EXIT_INVPCID] = invpcid_interception, + [SVM_EXIT_IDLE_HLT] = kvm_emulate_halt, [SVM_EXIT_NPF] = npf_interception, + [SVM_EXIT_BUS_LOCK] = bus_lock_exit, [SVM_EXIT_RSM] = rsm_interception, [SVM_EXIT_AVIC_INCOMPLETE_IPI] = avic_incomplete_ipi_interception, [SVM_EXIT_AVIC_UNACCELERATED_ACCESS] = avic_unaccelerated_access_interception, +#ifdef CONFIG_KVM_AMD_SEV [SVM_EXIT_VMGEXIT] = sev_handle_vmgexit, +#endif }; static void dump_vmcb(struct kvm_vcpu *vcpu) @@ -3312,14 +3448,21 @@ static void dump_vmcb(struct kvm_vcpu *vcpu) struct vmcb_control_area *control = &svm->vmcb->control; struct vmcb_save_area *save = &svm->vmcb->save; struct vmcb_save_area *save01 = &svm->vmcb01.ptr->save; + char *vm_type; if (!dump_invalid_vmcb) { pr_warn_ratelimited("set kvm_amd.dump_invalid_vmcb=1 to dump internal KVM state.\n"); return; } - pr_err("VMCB %p, last attempted VMRUN on CPU %d\n", - svm->current_vmcb->ptr, vcpu->arch.last_vmentry_cpu); + guard(mutex)(&vmcb_dump_mutex); + + vm_type = sev_snp_guest(vcpu->kvm) ? "SEV-SNP" : + sev_es_guest(vcpu->kvm) ? "SEV-ES" : + sev_guest(vcpu->kvm) ? "SEV" : "SVM"; + + pr_err("%s vCPU%u VMCB %p, last attempted VMRUN on CPU %d\n", + vm_type, vcpu->vcpu_id, svm->current_vmcb->ptr, vcpu->arch.last_vmentry_cpu); pr_err("VMCB Control Area:\n"); pr_err("%-20s%04x\n", "cr_read:", control->intercepts[INTERCEPT_CR] & 0xffff); pr_err("%-20s%04x\n", "cr_write:", control->intercepts[INTERCEPT_CR] >> 16); @@ -3357,6 +3500,17 @@ static void dump_vmcb(struct kvm_vcpu *vcpu) pr_err("%-20s%016llx\n", "avic_logical_id:", control->avic_logical_id); pr_err("%-20s%016llx\n", "avic_physical_id:", control->avic_physical_id); pr_err("%-20s%016llx\n", "vmsa_pa:", control->vmsa_pa); + pr_err("%-20s%016llx\n", "allowed_sev_features:", control->allowed_sev_features); + pr_err("%-20s%016llx\n", "guest_sev_features:", control->guest_sev_features); + + if (sev_es_guest(vcpu->kvm)) { + save = sev_decrypt_vmsa(vcpu); + if (!save) + goto no_vmsa; + + save01 = save; + } + pr_err("VMCB State Save Area:\n"); pr_err("%-5s s: %04x a: %04x l: %08x b: %016llx\n", "es:", @@ -3427,6 +3581,63 @@ static void dump_vmcb(struct kvm_vcpu *vcpu) pr_err("%-15s %016llx %-13s %016llx\n", "excp_from:", save->last_excp_from, "excp_to:", save->last_excp_to); + + if (sev_es_guest(vcpu->kvm)) { + struct sev_es_save_area *vmsa = (struct sev_es_save_area *)save; + + pr_err("%-15s %016llx\n", + "sev_features", vmsa->sev_features); + + pr_err("%-15s %016llx %-13s %016llx\n", + "rax:", vmsa->rax, "rbx:", vmsa->rbx); + pr_err("%-15s %016llx %-13s %016llx\n", + "rcx:", vmsa->rcx, "rdx:", vmsa->rdx); + pr_err("%-15s %016llx %-13s %016llx\n", + "rsi:", vmsa->rsi, "rdi:", vmsa->rdi); + pr_err("%-15s %016llx %-13s %016llx\n", + "rbp:", vmsa->rbp, "rsp:", vmsa->rsp); + pr_err("%-15s %016llx %-13s %016llx\n", + "r8:", vmsa->r8, "r9:", vmsa->r9); + pr_err("%-15s %016llx %-13s %016llx\n", + "r10:", vmsa->r10, "r11:", vmsa->r11); + pr_err("%-15s %016llx %-13s %016llx\n", + "r12:", vmsa->r12, "r13:", vmsa->r13); + pr_err("%-15s %016llx %-13s %016llx\n", + "r14:", vmsa->r14, "r15:", vmsa->r15); + pr_err("%-15s %016llx %-13s %016llx\n", + "xcr0:", vmsa->xcr0, "xss:", vmsa->xss); + } else { + pr_err("%-15s %016llx %-13s %016lx\n", + "rax:", save->rax, "rbx:", + vcpu->arch.regs[VCPU_REGS_RBX]); + pr_err("%-15s %016lx %-13s %016lx\n", + "rcx:", vcpu->arch.regs[VCPU_REGS_RCX], + "rdx:", vcpu->arch.regs[VCPU_REGS_RDX]); + pr_err("%-15s %016lx %-13s %016lx\n", + "rsi:", vcpu->arch.regs[VCPU_REGS_RSI], + "rdi:", vcpu->arch.regs[VCPU_REGS_RDI]); + pr_err("%-15s %016lx %-13s %016llx\n", + "rbp:", vcpu->arch.regs[VCPU_REGS_RBP], + "rsp:", save->rsp); +#ifdef CONFIG_X86_64 + pr_err("%-15s %016lx %-13s %016lx\n", + "r8:", vcpu->arch.regs[VCPU_REGS_R8], + "r9:", vcpu->arch.regs[VCPU_REGS_R9]); + pr_err("%-15s %016lx %-13s %016lx\n", + "r10:", vcpu->arch.regs[VCPU_REGS_R10], + "r11:", vcpu->arch.regs[VCPU_REGS_R11]); + pr_err("%-15s %016lx %-13s %016lx\n", + "r12:", vcpu->arch.regs[VCPU_REGS_R12], + "r13:", vcpu->arch.regs[VCPU_REGS_R13]); + pr_err("%-15s %016lx %-13s %016lx\n", + "r14:", vcpu->arch.regs[VCPU_REGS_R14], + "r15:", vcpu->arch.regs[VCPU_REGS_R15]); +#endif + } + +no_vmsa: + if (sev_es_guest(vcpu->kvm)) + sev_free_decrypted_vmsa(vcpu, save); } static bool svm_check_exit_valid(u64 exit_code) @@ -3459,10 +3670,14 @@ int svm_invoke_exit_handler(struct kvm_vcpu *vcpu, u64 exit_code) return interrupt_window_interception(vcpu); else if (exit_code == SVM_EXIT_INTR) return intr_interception(vcpu); - else if (exit_code == SVM_EXIT_HLT) + else if (exit_code == SVM_EXIT_HLT || exit_code == SVM_EXIT_IDLE_HLT) return kvm_emulate_halt(vcpu); else if (exit_code == SVM_EXIT_NPF) return npf_interception(vcpu); +#ifdef CONFIG_KVM_AMD_SEV + else if (exit_code == SVM_EXIT_VMGEXIT) + return sev_handle_vmgexit(vcpu); +#endif #endif return svm_exit_handlers[exit_code](vcpu); } @@ -3484,6 +3699,21 @@ static void svm_get_exit_info(struct kvm_vcpu *vcpu, u32 *reason, *error_code = 0; } +static void svm_get_entry_info(struct kvm_vcpu *vcpu, u32 *intr_info, + u32 *error_code) +{ + struct vmcb_control_area *control = &to_svm(vcpu)->vmcb->control; + + *intr_info = control->event_inj; + + if ((*intr_info & SVM_EXITINTINFO_VALID) && + (*intr_info & SVM_EXITINTINFO_VALID_ERR)) + *error_code = control->event_inj_err; + else + *error_code = 0; + +} + static int svm_handle_exit(struct kvm_vcpu *vcpu, fastpath_t exit_fastpath) { struct vcpu_svm *svm = to_svm(vcpu); @@ -3527,7 +3757,7 @@ static int svm_handle_exit(struct kvm_vcpu *vcpu, fastpath_t exit_fastpath) return svm_invoke_exit_handler(vcpu, exit_code); } -static void pre_svm_run(struct kvm_vcpu *vcpu) +static int pre_svm_run(struct kvm_vcpu *vcpu) { struct svm_cpu_data *sd = per_cpu_ptr(&svm_data, vcpu->cpu); struct vcpu_svm *svm = to_svm(vcpu); @@ -3549,6 +3779,8 @@ static void pre_svm_run(struct kvm_vcpu *vcpu) /* FIXME: handle wraparound of asid_generation */ if (svm->current_vmcb->asid_generation != sd->asid_generation) new_asid(svm, sd); + + return 0; } static void svm_inject_nmi(struct kvm_vcpu *vcpu) @@ -3842,16 +4074,27 @@ static void svm_enable_nmi_window(struct kvm_vcpu *vcpu) struct vcpu_svm *svm = to_svm(vcpu); /* - * KVM should never request an NMI window when vNMI is enabled, as KVM - * allows at most one to-be-injected NMI and one pending NMI, i.e. if - * two NMIs arrive simultaneously, KVM will inject one and set - * V_NMI_PENDING for the other. WARN, but continue with the standard - * single-step approach to try and salvage the pending NMI. + * If NMIs are outright masked, i.e. the vCPU is already handling an + * NMI, and KVM has not yet intercepted an IRET, then there is nothing + * more to do at this time as KVM has already enabled IRET intercepts. + * If KVM has already intercepted IRET, then single-step over the IRET, + * as NMIs aren't architecturally unmasked until the IRET completes. + * + * If vNMI is enabled, KVM should never request an NMI window if NMIs + * are masked, as KVM allows at most one to-be-injected NMI and one + * pending NMI. If two NMIs arrive simultaneously, KVM will inject one + * NMI and set V_NMI_PENDING for the other, but if and only if NMIs are + * unmasked. KVM _will_ request an NMI window in some situations, e.g. + * if the vCPU is in an STI shadow or if GIF=0, KVM can't immediately + * inject the NMI. In those situations, KVM needs to single-step over + * the STI shadow or intercept STGI. */ - WARN_ON_ONCE(is_vnmi_enabled(svm)); + if (svm_get_nmi_mask(vcpu)) { + WARN_ON_ONCE(is_vnmi_enabled(svm)); - if (svm_get_nmi_mask(vcpu) && !svm->awaiting_iret_completion) - return; /* IRET will cause a vm exit */ + if (!svm->awaiting_iret_completion) + return; /* IRET will cause a vm exit */ + } /* * SEV-ES guests are responsible for signaling when a vCPU is ready to @@ -4045,20 +4288,23 @@ static void svm_complete_interrupts(struct kvm_vcpu *vcpu) vcpu->arch.nmi_injected = true; svm->nmi_l1_to_l2 = nmi_l1_to_l2; break; - case SVM_EXITINTINFO_TYPE_EXEPT: + case SVM_EXITINTINFO_TYPE_EXEPT: { + u32 error_code = 0; + /* * Never re-inject a #VC exception. */ if (vector == X86_TRAP_VC) break; - if (exitintinfo & SVM_EXITINTINFO_VALID_ERR) { - u32 err = svm->vmcb->control.exit_int_info_err; - kvm_requeue_exception_e(vcpu, vector, err); + if (exitintinfo & SVM_EXITINTINFO_VALID_ERR) + error_code = svm->vmcb->control.exit_int_info_err; - } else - kvm_requeue_exception(vcpu, vector); + kvm_requeue_exception(vcpu, vector, + exitintinfo & SVM_EXITINTINFO_VALID_ERR, + error_code); break; + } case SVM_EXITINTINFO_TYPE_INTR: kvm_queue_interrupt(vcpu, vector, false); break; @@ -4084,34 +4330,62 @@ static void svm_cancel_injection(struct kvm_vcpu *vcpu) static int svm_vcpu_pre_run(struct kvm_vcpu *vcpu) { + if (to_kvm_sev_info(vcpu->kvm)->need_init) + return -EINVAL; + return 1; } static fastpath_t svm_exit_handlers_fastpath(struct kvm_vcpu *vcpu) { + struct vcpu_svm *svm = to_svm(vcpu); + if (is_guest_mode(vcpu)) return EXIT_FASTPATH_NONE; - if (to_svm(vcpu)->vmcb->control.exit_code == SVM_EXIT_MSR && - to_svm(vcpu)->vmcb->control.exit_info_1) + switch (svm->vmcb->control.exit_code) { + case SVM_EXIT_MSR: + if (!svm->vmcb->control.exit_info_1) + break; return handle_fastpath_set_msr_irqoff(vcpu); + case SVM_EXIT_HLT: + return handle_fastpath_hlt(vcpu); + default: + break; + } return EXIT_FASTPATH_NONE; } static noinstr void svm_vcpu_enter_exit(struct kvm_vcpu *vcpu, bool spec_ctrl_intercepted) { + struct svm_cpu_data *sd = per_cpu_ptr(&svm_data, vcpu->cpu); struct vcpu_svm *svm = to_svm(vcpu); guest_state_enter_irqoff(); + /* + * Set RFLAGS.IF prior to VMRUN, as the host's RFLAGS.IF at the time of + * VMRUN controls whether or not physical IRQs are masked (KVM always + * runs with V_INTR_MASKING_MASK). Toggle RFLAGS.IF here to avoid the + * temptation to do STI+VMRUN+CLI, as AMD CPUs bleed the STI shadow + * into guest state if delivery of an event during VMRUN triggers a + * #VMEXIT, and the guest_state transitions already tell lockdep that + * IRQs are being enabled/disabled. Note! GIF=0 for the entirety of + * this path, so IRQs aren't actually unmasked while running host code. + */ + raw_local_irq_enable(); + amd_clear_divider(); if (sev_es_guest(vcpu->kvm)) - __svm_sev_es_vcpu_run(svm, spec_ctrl_intercepted); + __svm_sev_es_vcpu_run(svm, spec_ctrl_intercepted, + sev_es_host_save_area(sd)); else __svm_vcpu_run(svm, spec_ctrl_intercepted); + raw_local_irq_disable(); + guest_state_exit_irqoff(); } @@ -4146,7 +4420,12 @@ static __no_kcsan fastpath_t svm_vcpu_run(struct kvm_vcpu *vcpu, if (force_immediate_exit) smp_send_reschedule(vcpu->cpu); - pre_svm_run(vcpu); + if (pre_svm_run(vcpu)) { + vcpu->run->exit_reason = KVM_EXIT_FAIL_ENTRY; + vcpu->run->fail_entry.hardware_entry_failure_reason = SVM_EXIT_ERR; + vcpu->run->fail_entry.cpu = vcpu->cpu; + return EXIT_FASTPATH_EXIT_USERSPACE; + } sync_lapic_to_cr8(vcpu); @@ -4162,14 +4441,22 @@ static __no_kcsan fastpath_t svm_vcpu_run(struct kvm_vcpu *vcpu, * Run with all-zero DR6 unless needed, so that we can get the exact cause * of a #DB. */ - if (unlikely(vcpu->arch.switch_db_regs & KVM_DEBUGREG_WONT_EXIT)) - svm_set_dr6(svm, vcpu->arch.dr6); - else - svm_set_dr6(svm, DR6_ACTIVE_LOW); + if (likely(!(vcpu->arch.switch_db_regs & KVM_DEBUGREG_WONT_EXIT))) + svm_set_dr6(vcpu, DR6_ACTIVE_LOW); clgi(); kvm_load_guest_xsave_state(vcpu); + /* + * Hardware only context switches DEBUGCTL if LBR virtualization is + * enabled. Manually load DEBUGCTL if necessary (and restore it after + * VM-Exit), as running with the host's DEBUGCTL can negatively affect + * guest state and can even be fatal, e.g. due to Bus Lock Detect. + */ + if (!(svm->vmcb->control.virt_ext & LBR_CTL_ENABLE_MASK) && + vcpu->arch.host_debugctl != svm->vmcb->save.dbgctl) + update_debugctlmsr(svm->vmcb->save.dbgctl); + kvm_wait_lapic_expire(vcpu); /* @@ -4197,6 +4484,10 @@ static __no_kcsan fastpath_t svm_vcpu_run(struct kvm_vcpu *vcpu, if (unlikely(svm->vmcb->control.exit_code == SVM_EXIT_NMI)) kvm_before_interrupt(vcpu, KVM_HANDLING_NMI); + if (!(svm->vmcb->control.virt_ext & LBR_CTL_ENABLE_MASK) && + vcpu->arch.host_debugctl != svm->vmcb->save.dbgctl) + update_debugctlmsr(vcpu->arch.host_debugctl); + kvm_load_host_xsave_state(vcpu); stgi(); @@ -4318,27 +4609,17 @@ static void svm_vcpu_after_set_cpuid(struct kvm_vcpu *vcpu) * XSS on VM-Enter/VM-Exit. Failure to do so would effectively give * the guest read/write access to the host's XSS. */ - if (boot_cpu_has(X86_FEATURE_XSAVE) && - boot_cpu_has(X86_FEATURE_XSAVES) && - guest_cpuid_has(vcpu, X86_FEATURE_XSAVE)) - kvm_governed_feature_set(vcpu, X86_FEATURE_XSAVES); - - kvm_governed_feature_check_and_set(vcpu, X86_FEATURE_NRIPS); - kvm_governed_feature_check_and_set(vcpu, X86_FEATURE_TSCRATEMSR); - kvm_governed_feature_check_and_set(vcpu, X86_FEATURE_LBRV); + guest_cpu_cap_change(vcpu, X86_FEATURE_XSAVES, + boot_cpu_has(X86_FEATURE_XSAVES) && + guest_cpu_cap_has(vcpu, X86_FEATURE_XSAVE)); /* - * Intercept VMLOAD if the vCPU mode is Intel in order to emulate that + * Intercept VMLOAD if the vCPU model is Intel in order to emulate that * VMLOAD drops bits 63:32 of SYSENTER (ignoring the fact that exposing * SVM on Intel is bonkers and extremely unlikely to work). */ - if (!guest_cpuid_is_intel(vcpu)) - kvm_governed_feature_check_and_set(vcpu, X86_FEATURE_V_VMSAVE_VMLOAD); - - kvm_governed_feature_check_and_set(vcpu, X86_FEATURE_PAUSEFILTER); - kvm_governed_feature_check_and_set(vcpu, X86_FEATURE_PFTHRESHOLD); - kvm_governed_feature_check_and_set(vcpu, X86_FEATURE_VGIF); - kvm_governed_feature_check_and_set(vcpu, X86_FEATURE_VNMI); + if (guest_cpuid_is_intel_compatible(vcpu)) + guest_cpu_cap_clear(vcpu, X86_FEATURE_V_VMSAVE_VMLOAD); svm_recalc_instruction_intercepts(vcpu, svm); @@ -4348,7 +4629,7 @@ static void svm_vcpu_after_set_cpuid(struct kvm_vcpu *vcpu) if (boot_cpu_has(X86_FEATURE_FLUSH_L1D)) set_msr_interception(vcpu, svm->msrpm, MSR_IA32_FLUSH_CMD, 0, - !!guest_cpuid_has(vcpu, X86_FEATURE_FLUSH_L1D)); + !!guest_cpu_cap_has(vcpu, X86_FEATURE_FLUSH_L1D)); if (sev_guest(vcpu->kvm)) sev_vcpu_after_set_cpuid(svm); @@ -4551,12 +4832,6 @@ static void svm_handle_exit_irqoff(struct kvm_vcpu *vcpu) vcpu->arch.at_instruction_boundary = true; } -static void svm_sched_in(struct kvm_vcpu *vcpu, int cpu) -{ - if (!kvm_pause_in_guest(vcpu->kvm)) - shrink_ple_window(vcpu); -} - static void svm_setup_mce(struct kvm_vcpu *vcpu) { /* [63:9] are reserved. */ @@ -4605,7 +4880,7 @@ static int svm_enter_smm(struct kvm_vcpu *vcpu, union kvm_smram *smram) * responsible for ensuring nested SVM and SMIs are mutually exclusive. */ - if (!guest_cpuid_has(vcpu, X86_FEATURE_LM)) + if (!guest_cpu_cap_has(vcpu, X86_FEATURE_LM)) return 1; smram->smram64.svm_guest_flag = 1; @@ -4639,7 +4914,7 @@ static int svm_enter_smm(struct kvm_vcpu *vcpu, union kvm_smram *smram) svm_copy_vmrun_state(map_save.hva + 0x400, &svm->vmcb01.ptr->save); - kvm_vcpu_unmap(vcpu, &map_save, true); + kvm_vcpu_unmap(vcpu, &map_save); return 0; } @@ -4652,14 +4927,14 @@ static int svm_leave_smm(struct kvm_vcpu *vcpu, const union kvm_smram *smram) const struct kvm_smram_state_64 *smram64 = &smram->smram64; - if (!guest_cpuid_has(vcpu, X86_FEATURE_LM)) + if (!guest_cpu_cap_has(vcpu, X86_FEATURE_LM)) return 0; /* Non-zero if SMI arrived while vCPU was in guest mode. */ if (!smram64->svm_guest_flag) return 0; - if (!guest_cpuid_has(vcpu, X86_FEATURE_SVM)) + if (!guest_cpu_cap_has(vcpu, X86_FEATURE_SVM)) return 1; if (!(smram64->efer & EFER_SVME)) @@ -4699,9 +4974,9 @@ static int svm_leave_smm(struct kvm_vcpu *vcpu, const union kvm_smram *smram) svm->nested.nested_run_pending = 1; unmap_save: - kvm_vcpu_unmap(vcpu, &map_save, true); + kvm_vcpu_unmap(vcpu, &map_save); unmap_map: - kvm_vcpu_unmap(vcpu, &map, true); + kvm_vcpu_unmap(vcpu, &map); return ret; } @@ -4722,9 +4997,15 @@ static void svm_enable_smi_window(struct kvm_vcpu *vcpu) static int svm_check_emulate_instruction(struct kvm_vcpu *vcpu, int emul_type, void *insn, int insn_len) { + struct vcpu_svm *svm = to_svm(vcpu); bool smep, smap, is_user; u64 error_code; + /* Check that emulation is possible during event vectoring */ + if ((svm->vmcb->control.exit_int_info & SVM_EXITINTINFO_TYPE_MASK) && + !kvm_can_emulate_event_vectoring(emul_type)) + return X86EMUL_UNHANDLEABLE_VECTORING; + /* Emulation is always possible when KVM has access to all guest state. */ if (!sev_guest(vcpu->kvm)) return X86EMUL_CONTINUE; @@ -4821,7 +5102,7 @@ static int svm_check_emulate_instruction(struct kvm_vcpu *vcpu, int emul_type, * In addition, don't apply the erratum workaround if the #NPF occurred * while translating guest page tables (see below). */ - error_code = to_svm(vcpu)->vmcb->control.exit_info_1; + error_code = svm->vmcb->control.exit_info_1; if (error_code & (PFERR_GUEST_PAGE_MASK | PFERR_FETCH_MASK)) goto resume_guest; @@ -4885,10 +5166,24 @@ static void svm_vm_destroy(struct kvm *kvm) { avic_vm_destroy(kvm); sev_vm_destroy(kvm); + + svm_srso_vm_destroy(); } static int svm_vm_init(struct kvm *kvm) { + int type = kvm->arch.vm_type; + + if (type != KVM_X86_DEFAULT_VM && + type != KVM_X86_SW_PROTECTED_VM) { + kvm->arch.has_protected_state = + (type == KVM_X86_SEV_ES_VM || type == KVM_X86_SNP_VM); + to_kvm_sev_info(kvm)->need_init = true; + + kvm->arch.has_private_mem = (type == KVM_X86_SNP_VM); + kvm->arch.pre_fault_allowed = !kvm->arch.has_private_mem; + } + if (!pause_filter_count || !pause_filter_thresh) kvm->arch.pause_in_guest = true; @@ -4898,12 +5193,13 @@ static int svm_vm_init(struct kvm *kvm) return ret; } + svm_srso_vm_init(); return 0; } static void *svm_alloc_apic_backing_page(struct kvm_vcpu *vcpu) { - struct page *page = snp_safe_alloc_page(vcpu); + struct page *page = snp_safe_alloc_page(); if (!page) return NULL; @@ -4917,8 +5213,9 @@ static struct kvm_x86_ops svm_x86_ops __initdata = { .check_processor_compatibility = svm_check_processor_compat, .hardware_unsetup = svm_hardware_unsetup, - .hardware_enable = svm_hardware_enable, - .hardware_disable = svm_hardware_disable, + .enable_virtualization_cpu = svm_enable_virtualization_cpu, + .disable_virtualization_cpu = svm_disable_virtualization_cpu, + .emergency_disable_virtualization_cpu = svm_emergency_disable_virtualization_cpu, .has_emulated_msr = svm_has_emulated_msr, .vcpu_create = svm_vcpu_create, @@ -4936,13 +5233,14 @@ static struct kvm_x86_ops svm_x86_ops __initdata = { .vcpu_unblocking = avic_vcpu_unblocking, .update_exception_bitmap = svm_update_exception_bitmap, - .get_msr_feature = svm_get_msr_feature, + .get_feature_msr = svm_get_feature_msr, .get_msr = svm_get_msr, .set_msr = svm_set_msr, .get_segment_base = svm_get_segment_base, .get_segment = svm_get_segment, .set_segment = svm_set_segment, .get_cpl = svm_get_cpl, + .get_cpl_no_cache = svm_get_cpl, .get_cs_db_l_bits = svm_get_cs_db_l_bits, .is_valid_cr0 = svm_is_valid_cr0, .set_cr0 = svm_set_cr0, @@ -4954,6 +5252,7 @@ static struct kvm_x86_ops svm_x86_ops __initdata = { .set_idt = svm_set_idt, .get_gdt = svm_get_gdt, .set_gdt = svm_set_gdt, + .set_dr6 = svm_set_dr6, .set_dr7 = svm_set_dr7, .sync_dirty_debug_regs = svm_sync_dirty_debug_regs, .cache_reg = svm_cache_reg, @@ -4987,12 +5286,15 @@ static struct kvm_x86_ops svm_x86_ops __initdata = { .enable_nmi_window = svm_enable_nmi_window, .enable_irq_window = svm_enable_irq_window, .update_cr8_intercept = svm_update_cr8_intercept, + + .x2apic_icr_is_split = true, .set_virtual_apic_mode = avic_refresh_virtual_apic_mode, .refresh_apicv_exec_ctrl = avic_refresh_apicv_exec_ctrl, .apicv_post_state_restore = avic_apicv_post_state_restore, .required_apicv_inhibits = AVIC_REQUIRED_APICV_INHIBITS, .get_exit_info = svm_get_exit_info, + .get_entry_info = svm_get_entry_info, .vcpu_after_set_cpuid = svm_vcpu_after_set_cpuid, @@ -5008,8 +5310,6 @@ static struct kvm_x86_ops svm_x86_ops __initdata = { .check_intercept = svm_check_intercept, .handle_exit_irqoff = svm_handle_exit_irqoff, - .sched_in = svm_sched_in, - .nested_ops = &svm_nested_ops, .deliver_interrupt = svm_deliver_interrupt, @@ -5023,6 +5323,8 @@ static struct kvm_x86_ops svm_x86_ops __initdata = { .enable_smi_window = svm_enable_smi_window, #endif +#ifdef CONFIG_KVM_AMD_SEV + .dev_get_attr = sev_dev_get_attr, .mem_enc_ioctl = sev_mem_enc_ioctl, .mem_enc_register_region = sev_mem_enc_register_region, .mem_enc_unregister_region = sev_mem_enc_unregister_region, @@ -5030,7 +5332,7 @@ static struct kvm_x86_ops svm_x86_ops __initdata = { .vm_copy_enc_context_from = sev_vm_copy_enc_context_from, .vm_move_enc_context_from = sev_vm_move_enc_context_from, - +#endif .check_emulate_instruction = svm_check_emulate_instruction, .apic_init_signal_blocked = svm_apic_init_signal_blocked, @@ -5041,6 +5343,10 @@ static struct kvm_x86_ops svm_x86_ops __initdata = { .vcpu_deliver_sipi_vector = svm_vcpu_deliver_sipi_vector, .vcpu_get_apicv_inhibit_reasons = avic_vcpu_get_apicv_inhibit_reasons, .alloc_apic_backing_page = svm_alloc_apic_backing_page, + + .gmem_prepare = sev_gmem_prepare, + .gmem_invalidate = sev_gmem_invalidate, + .private_max_mapping_level = sev_private_max_mapping_level, }; /* @@ -5059,7 +5365,7 @@ static __init void svm_adjust_mmio_mask(void) return; /* If memory encryption is not enabled, use existing mask */ - rdmsrl(MSR_AMD64_SYSCFG, msr); + rdmsrq(MSR_AMD64_SYSCFG, msr); if (!(msr & MSR_AMD64_SYSCFG_MEM_ENCRYPT)) return; @@ -5133,6 +5439,9 @@ static __init void svm_set_cpu_caps(void) kvm_cpu_cap_set(X86_FEATURE_SVME_ADDR_CHK); } + if (cpu_feature_enabled(X86_FEATURE_BUS_LOCK_THRESHOLD)) + kvm_caps.has_bus_lock_exit = true; + /* CPUID 0x80000008 */ if (boot_cpu_has(X86_FEATURE_LS_CFG_SSBD) || boot_cpu_has(X86_FEATURE_AMD_SSBD)) @@ -5157,6 +5466,9 @@ static __init void svm_set_cpu_caps(void) /* CPUID 0x8000001F (SME/SEV features) */ sev_set_cpu_caps(); + + /* Don't advertise Bus Lock Detect to guest if SVM support is absent */ + kvm_cpu_cap_clear(X86_FEATURE_BUS_LOCK_DETECT); } static __init int svm_hardware_setup(void) @@ -5184,7 +5496,7 @@ static __init int svm_hardware_setup(void) iopm_va = page_address(iopm_pages); memset(iopm_va, 0xff, PAGE_SIZE * (1 << order)); - iopm_base = page_to_pfn(iopm_pages) << PAGE_SHIFT; + iopm_base = __sme_page_pa(iopm_pages); init_msrpm_offsets(); @@ -5237,7 +5549,7 @@ static __init int svm_hardware_setup(void) /* Force VM NPT level equal to the host's paging level */ kvm_configure_mmu(npt_enabled, get_npt_level(), get_npt_level(), PG_LEVEL_1G); - pr_info("Nested Paging %sabled\n", npt_enabled ? "en" : "dis"); + pr_info("Nested Paging %s\n", str_enabled_disabled(npt_enabled)); /* Setup shadow_me_value and shadow_me_mask */ kvm_mmu_set_me_spte_mask(sme_me_mask, sme_me_mask); @@ -5246,6 +5558,12 @@ static __init int svm_hardware_setup(void) nrips = nrips && boot_cpu_has(X86_FEATURE_NRIPS); + if (lbrv) { + if (!boot_cpu_has(X86_FEATURE_LBRV)) + lbrv = false; + else + pr_info("LBR virtualization supported\n"); + } /* * Note, SEV setup consumes npt_enabled and enable_mmio_caching (which * may be modified by svm_adjust_mmio_mask()), as well as nrips. @@ -5299,14 +5617,6 @@ static __init int svm_hardware_setup(void) svm_x86_ops.set_vnmi_pending = NULL; } - - if (lbrv) { - if (!boot_cpu_has(X86_FEATURE_LBRV)) - lbrv = false; - else - pr_info("LBR virtualization supported\n"); - } - if (!enable_pmu) pr_info("PMU virtualization is disabled\n"); @@ -5327,6 +5637,7 @@ static __init int svm_hardware_setup(void) */ allow_smaller_maxphyaddr = !npt_enabled; + kvm_caps.inapplicable_quirks &= ~KVM_X86_QUIRK_CD_NW_CLEARED; return 0; err: @@ -5345,8 +5656,6 @@ static struct kvm_x86_init_ops svm_init_ops __initdata = { static void __svm_exit(void) { kvm_x86_vendor_exit(); - - cpu_emergency_unregister_virt_callback(svm_emergency_disable); } static int __init svm_init(void) @@ -5362,8 +5671,6 @@ static int __init svm_init(void) if (r) return r; - cpu_emergency_register_virt_callback(svm_emergency_disable); - /* * Common KVM initialization _must_ come last, after this, /dev/kvm is * exposed to userspace! diff --git a/arch/x86/kvm/svm/svm.h b/arch/x86/kvm/svm/svm.h index 7f1fbd874c45..e6f3c6a153a0 100644 --- a/arch/x86/kvm/svm/svm.h +++ b/arch/x86/kvm/svm/svm.h @@ -25,12 +25,26 @@ #include "cpuid.h" #include "kvm_cache_regs.h" -#define __sme_page_pa(x) __sme_set(page_to_pfn(x) << PAGE_SHIFT) +/* + * Helpers to convert to/from physical addresses for pages whose address is + * consumed directly by hardware. Even though it's a physical address, SVM + * often restricts the address to the natural width, hence 'unsigned long' + * instead of 'hpa_t'. + */ +static inline unsigned long __sme_page_pa(struct page *page) +{ + return __sme_set(page_to_pfn(page) << PAGE_SHIFT); +} + +static inline struct page *__sme_pa_to_page(unsigned long pa) +{ + return pfn_to_page(__sme_clr(pa) >> PAGE_SHIFT); +} #define IOPM_SIZE PAGE_SIZE * 3 #define MSRPM_SIZE PAGE_SIZE * 2 -#define MAX_DIRECT_ACCESS_MSRS 47 +#define MAX_DIRECT_ACCESS_MSRS 48 #define MSRPM_OFFSETS 32 extern u32 msrpm_offsets[MSRPM_OFFSETS] __read_mostly; extern bool npt_enabled; @@ -39,6 +53,7 @@ extern int vgif; extern bool intercept_smi; extern bool x2avic_enabled; extern bool vnmi; +extern int lbrv; /* * Clean bits in VMCB. @@ -79,19 +94,30 @@ enum { struct kvm_sev_info { bool active; /* SEV enabled guest */ bool es_active; /* SEV-ES enabled guest */ + bool need_init; /* waiting for SEV_INIT2 */ unsigned int asid; /* ASID used for this guest */ unsigned int handle; /* SEV firmware handle */ int fd; /* SEV device fd */ + unsigned long policy; unsigned long pages_locked; /* Number of pages locked */ struct list_head regions_list; /* List of registered regions */ u64 ap_jump_table; /* SEV-ES AP Jump Table address */ + u64 vmsa_features; + u16 ghcb_version; /* Highest guest GHCB protocol version allowed */ struct kvm *enc_context_owner; /* Owner of copied encryption context */ struct list_head mirror_vms; /* List of VMs mirroring */ struct list_head mirror_entry; /* Use as a list entry of mirrors */ struct misc_cg *misc_cg; /* For misc cgroup accounting */ atomic_t migration_in_progress; + void *snp_context; /* SNP guest context page */ + void *guest_req_buf; /* Bounce buffer for SNP Guest Request input */ + void *guest_resp_buf; /* Bounce buffer for SNP Guest Request output */ + struct mutex guest_req_mutex; /* Must acquire before using bounce buffers */ }; +#define SEV_POLICY_NODBG BIT_ULL(0) +#define SNP_POLICY_DEBUG BIT_ULL(19) + struct kvm_svm { struct kvm kvm; @@ -147,6 +173,7 @@ struct vmcb_ctrl_area_cached { u64 nested_cr3; u64 virt_ext; u32 clean; + u64 bus_lock_rip; union { #if IS_ENABLED(CONFIG_HYPERV) || IS_ENABLED(CONFIG_KVM_HYPERV) struct hv_vmcb_enlightenments hv_enlightenments; @@ -197,6 +224,7 @@ struct vcpu_sev_es_state { u8 valid_bitmap[16]; struct kvm_host_map ghcb_map; bool received_first_sipi; + unsigned int ap_reset_hold_type; /* SEV-ES scratch area support */ u64 sw_scratch; @@ -204,6 +232,18 @@ struct vcpu_sev_es_state { u32 ghcb_sa_len; bool ghcb_sa_sync; bool ghcb_sa_free; + + /* SNP Page-State-Change buffer entries currently being processed */ + u16 psc_idx; + u16 psc_inflight; + bool psc_2m; + + u64 ghcb_registered_gpa; + + struct mutex snp_vmsa_mutex; /* Used to handle concurrent updates of VMSA. */ + gpa_t snp_vmsa_gpa; + bool snp_ap_waiting_for_reset; + bool snp_has_guest_vmsa; }; struct vcpu_svm { @@ -300,10 +340,10 @@ struct svm_cpu_data { u32 next_asid; u32 min_asid; - struct page *save_area; - unsigned long save_area_pa; + bool bp_spec_reduce_set; - struct vmcb *current_vmcb; + struct vmcb *save_area; + unsigned long save_area_pa; /* index = sev_asid, value = vmcb pointer */ struct vmcb **sev_vmcbs; @@ -318,26 +358,39 @@ static __always_inline struct kvm_svm *to_kvm_svm(struct kvm *kvm) return container_of(kvm, struct kvm_svm, kvm); } -static __always_inline bool sev_guest(struct kvm *kvm) +static __always_inline struct kvm_sev_info *to_kvm_sev_info(struct kvm *kvm) { -#ifdef CONFIG_KVM_AMD_SEV - struct kvm_sev_info *sev = &to_kvm_svm(kvm)->sev_info; - - return sev->active; -#else - return false; -#endif + return &to_kvm_svm(kvm)->sev_info; } +#ifdef CONFIG_KVM_AMD_SEV +static __always_inline bool sev_guest(struct kvm *kvm) +{ + return to_kvm_sev_info(kvm)->active; +} static __always_inline bool sev_es_guest(struct kvm *kvm) { -#ifdef CONFIG_KVM_AMD_SEV - struct kvm_sev_info *sev = &to_kvm_svm(kvm)->sev_info; + struct kvm_sev_info *sev = to_kvm_sev_info(kvm); return sev->es_active && !WARN_ON_ONCE(!sev->active); +} + +static __always_inline bool sev_snp_guest(struct kvm *kvm) +{ + struct kvm_sev_info *sev = to_kvm_sev_info(kvm); + + return (sev->vmsa_features & SVM_SEV_FEAT_SNP_ACTIVE) && + !WARN_ON_ONCE(!sev_es_guest(kvm)); +} #else - return false; +#define sev_guest(kvm) false +#define sev_es_guest(kvm) false +#define sev_snp_guest(kvm) false #endif + +static inline bool ghcb_gpa_is_registered(struct vcpu_svm *svm, u64 val) +{ + return svm->sev_es.ghcb_registered_gpa == val; } static inline void vmcb_mark_all_dirty(struct vmcb *vmcb) @@ -445,7 +498,7 @@ static inline bool svm_is_intercept(struct vcpu_svm *svm, int bit) static inline bool nested_vgif_enabled(struct vcpu_svm *svm) { - return guest_can_use(&svm->vcpu, X86_FEATURE_VGIF) && + return guest_cpu_cap_has(&svm->vcpu, X86_FEATURE_VGIF) && (svm->nested.ctl.int_ctl & V_GIF_ENABLE_MASK); } @@ -497,7 +550,7 @@ static inline bool nested_npt_enabled(struct vcpu_svm *svm) static inline bool nested_vnmi_enabled(struct vcpu_svm *svm) { - return guest_can_use(&svm->vcpu, X86_FEATURE_VNMI) && + return guest_cpu_cap_has(&svm->vcpu, X86_FEATURE_VNMI) && (svm->nested.ctl.int_ctl & V_NMI_ENABLE_MASK); } @@ -531,10 +584,39 @@ static inline bool is_vnmi_enabled(struct vcpu_svm *svm) return false; } +static inline void svm_vmgexit_set_return_code(struct vcpu_svm *svm, + u64 response, u64 data) +{ + ghcb_set_sw_exit_info_1(svm->sev_es.ghcb, response); + ghcb_set_sw_exit_info_2(svm->sev_es.ghcb, data); +} + +static inline void svm_vmgexit_inject_exception(struct vcpu_svm *svm, u8 vector) +{ + u64 data = SVM_EVTINJ_VALID | SVM_EVTINJ_TYPE_EXEPT | vector; + + svm_vmgexit_set_return_code(svm, GHCB_HV_RESP_ISSUE_EXCEPTION, data); +} + +static inline void svm_vmgexit_bad_input(struct vcpu_svm *svm, u64 suberror) +{ + svm_vmgexit_set_return_code(svm, GHCB_HV_RESP_MALFORMED_INPUT, suberror); +} + +static inline void svm_vmgexit_success(struct vcpu_svm *svm, u64 data) +{ + svm_vmgexit_set_return_code(svm, GHCB_HV_RESP_NO_ACTION, data); +} + +static inline void svm_vmgexit_no_action(struct vcpu_svm *svm, u64 data) +{ + svm_vmgexit_set_return_code(svm, GHCB_HV_RESP_NO_ACTION, data); +} + /* svm.c */ #define MSR_INVALID 0xffffffffU -#define DEBUGCTL_RESERVED_BITS (~(0x3fULL)) +#define DEBUGCTL_RESERVED_BITS (~DEBUGCTLMSR_LBR) extern bool dump_invalid_vmcb; @@ -543,6 +625,7 @@ u32 *svm_vcpu_alloc_msrpm(void); void svm_vcpu_init_msrpm(struct kvm_vcpu *vcpu, u32 *msrpm); void svm_vcpu_free_msrpm(u32 *msrpm); void svm_copy_lbrs(struct vmcb *to_vmcb, struct vmcb *from_vmcb); +void svm_enable_lbrv(struct kvm_vcpu *vcpu); void svm_update_lbrv(struct kvm_vcpu *vcpu); int svm_set_efer(struct kvm_vcpu *vcpu, u64 efer); @@ -627,7 +710,7 @@ extern struct kvm_x86_nested_ops svm_nested_ops; /* avic.c */ #define AVIC_REQUIRED_APICV_INHIBITS \ ( \ - BIT(APICV_INHIBIT_REASON_DISABLE) | \ + BIT(APICV_INHIBIT_REASON_DISABLED) | \ BIT(APICV_INHIBIT_REASON_ABSENT) | \ BIT(APICV_INHIBIT_REASON_HYPERV) | \ BIT(APICV_INHIBIT_REASON_NESTED) | \ @@ -664,13 +747,16 @@ void avic_refresh_virtual_apic_mode(struct kvm_vcpu *vcpu); /* sev.c */ -#define GHCB_VERSION_MAX 1ULL -#define GHCB_VERSION_MIN 1ULL - - -extern unsigned int max_sev_asid; +int pre_sev_run(struct vcpu_svm *svm, int cpu); +void sev_init_vmcb(struct vcpu_svm *svm); +void sev_vcpu_after_set_cpuid(struct vcpu_svm *svm); +int sev_es_string_io(struct vcpu_svm *svm, int size, unsigned int port, int in); +void sev_es_vcpu_reset(struct vcpu_svm *svm); +void sev_vcpu_deliver_sipi_vector(struct kvm_vcpu *vcpu, u8 vector); +void sev_es_prepare_switch_to_guest(struct vcpu_svm *svm, struct sev_es_save_area *hostsa); +void sev_es_unmap_ghcb(struct vcpu_svm *svm); -void sev_vm_destroy(struct kvm *kvm); +#ifdef CONFIG_KVM_AMD_SEV int sev_mem_enc_ioctl(struct kvm *kvm, void __user *argp); int sev_mem_enc_register_region(struct kvm *kvm, struct kvm_enc_region *range); @@ -679,26 +765,73 @@ int sev_mem_enc_unregister_region(struct kvm *kvm, int sev_vm_copy_enc_context_from(struct kvm *kvm, unsigned int source_fd); int sev_vm_move_enc_context_from(struct kvm *kvm, unsigned int source_fd); void sev_guest_memory_reclaimed(struct kvm *kvm); +int sev_handle_vmgexit(struct kvm_vcpu *vcpu); -void pre_sev_run(struct vcpu_svm *svm, int cpu); +/* These symbols are used in common code and are stubbed below. */ + +struct page *snp_safe_alloc_page_node(int node, gfp_t gfp); +static inline struct page *snp_safe_alloc_page(void) +{ + return snp_safe_alloc_page_node(numa_node_id(), GFP_KERNEL_ACCOUNT); +} + +void sev_free_vcpu(struct kvm_vcpu *vcpu); +void sev_vm_destroy(struct kvm *kvm); void __init sev_set_cpu_caps(void); void __init sev_hardware_setup(void); void sev_hardware_unsetup(void); int sev_cpu_init(struct svm_cpu_data *sd); -void sev_init_vmcb(struct vcpu_svm *svm); -void sev_vcpu_after_set_cpuid(struct vcpu_svm *svm); -void sev_free_vcpu(struct kvm_vcpu *vcpu); -int sev_handle_vmgexit(struct kvm_vcpu *vcpu); -int sev_es_string_io(struct vcpu_svm *svm, int size, unsigned int port, int in); -void sev_es_vcpu_reset(struct vcpu_svm *svm); -void sev_vcpu_deliver_sipi_vector(struct kvm_vcpu *vcpu, u8 vector); -void sev_es_prepare_switch_to_guest(struct sev_es_save_area *hostsa); -void sev_es_unmap_ghcb(struct vcpu_svm *svm); -struct page *snp_safe_alloc_page(struct kvm_vcpu *vcpu); +int sev_dev_get_attr(u32 group, u64 attr, u64 *val); +extern unsigned int max_sev_asid; +void sev_handle_rmp_fault(struct kvm_vcpu *vcpu, gpa_t gpa, u64 error_code); +void sev_snp_init_protected_guest_state(struct kvm_vcpu *vcpu); +int sev_gmem_prepare(struct kvm *kvm, kvm_pfn_t pfn, gfn_t gfn, int max_order); +void sev_gmem_invalidate(kvm_pfn_t start, kvm_pfn_t end); +int sev_private_max_mapping_level(struct kvm *kvm, kvm_pfn_t pfn); +struct vmcb_save_area *sev_decrypt_vmsa(struct kvm_vcpu *vcpu); +void sev_free_decrypted_vmsa(struct kvm_vcpu *vcpu, struct vmcb_save_area *vmsa); +#else +static inline struct page *snp_safe_alloc_page_node(int node, gfp_t gfp) +{ + return alloc_pages_node(node, gfp | __GFP_ZERO, 0); +} + +static inline struct page *snp_safe_alloc_page(void) +{ + return snp_safe_alloc_page_node(numa_node_id(), GFP_KERNEL_ACCOUNT); +} + +static inline void sev_free_vcpu(struct kvm_vcpu *vcpu) {} +static inline void sev_vm_destroy(struct kvm *kvm) {} +static inline void __init sev_set_cpu_caps(void) {} +static inline void __init sev_hardware_setup(void) {} +static inline void sev_hardware_unsetup(void) {} +static inline int sev_cpu_init(struct svm_cpu_data *sd) { return 0; } +static inline int sev_dev_get_attr(u32 group, u64 attr, u64 *val) { return -ENXIO; } +#define max_sev_asid 0 +static inline void sev_handle_rmp_fault(struct kvm_vcpu *vcpu, gpa_t gpa, u64 error_code) {} +static inline void sev_snp_init_protected_guest_state(struct kvm_vcpu *vcpu) {} +static inline int sev_gmem_prepare(struct kvm *kvm, kvm_pfn_t pfn, gfn_t gfn, int max_order) +{ + return 0; +} +static inline void sev_gmem_invalidate(kvm_pfn_t start, kvm_pfn_t end) {} +static inline int sev_private_max_mapping_level(struct kvm *kvm, kvm_pfn_t pfn) +{ + return 0; +} + +static inline struct vmcb_save_area *sev_decrypt_vmsa(struct kvm_vcpu *vcpu) +{ + return NULL; +} +static inline void sev_free_decrypted_vmsa(struct kvm_vcpu *vcpu, struct vmcb_save_area *vmsa) {} +#endif /* vmenter.S */ -void __svm_sev_es_vcpu_run(struct vcpu_svm *svm, bool spec_ctrl_intercepted); +void __svm_sev_es_vcpu_run(struct vcpu_svm *svm, bool spec_ctrl_intercepted, + struct sev_es_save_area *hostsa); void __svm_vcpu_run(struct vcpu_svm *svm, bool spec_ctrl_intercepted); #define DEFINE_KVM_GHCB_ACCESSORS(field) \ diff --git a/arch/x86/kvm/svm/vmenter.S b/arch/x86/kvm/svm/vmenter.S index 187018c424bf..0c61153b275f 100644 --- a/arch/x86/kvm/svm/vmenter.S +++ b/arch/x86/kvm/svm/vmenter.S @@ -3,6 +3,7 @@ #include <asm/asm.h> #include <asm/asm-offsets.h> #include <asm/bitsperlong.h> +#include <asm/frame.h> #include <asm/kvm_vcpu_regs.h> #include <asm/nospec-branch.h> #include "kvm-asm-offsets.h" @@ -67,7 +68,7 @@ "", X86_FEATURE_V_SPEC_CTRL 901: .endm -.macro RESTORE_HOST_SPEC_CTRL_BODY +.macro RESTORE_HOST_SPEC_CTRL_BODY spec_ctrl_intercepted:req 900: /* Same for after vmexit. */ mov $MSR_IA32_SPEC_CTRL, %ecx @@ -76,7 +77,7 @@ * Load the value that the guest had written into MSR_IA32_SPEC_CTRL, * if it was not intercepted during guest execution. */ - cmpb $0, (%_ASM_SP) + cmpb $0, \spec_ctrl_intercepted jnz 998f rdmsr movl %eax, SVM_spec_ctrl(%_ASM_DI) @@ -99,6 +100,7 @@ */ SYM_FUNC_START(__svm_vcpu_run) push %_ASM_BP + mov %_ASM_SP, %_ASM_BP #ifdef CONFIG_X86_64 push %r15 push %r14 @@ -168,12 +170,8 @@ SYM_FUNC_START(__svm_vcpu_run) mov VCPU_RDI(%_ASM_DI), %_ASM_DI /* Enter guest mode */ - sti - 3: vmrun %_ASM_AX 4: - cli - /* Pop @svm to RAX while it's the only available register. */ pop %_ASM_AX @@ -207,10 +205,8 @@ SYM_FUNC_START(__svm_vcpu_run) 7: vmload %_ASM_AX 8: -#ifdef CONFIG_MITIGATION_RETPOLINE /* IMPORTANT: Stuff the RSB immediately after VM-Exit, before RET! */ - FILL_RETURN_BUFFER %_ASM_AX, RSB_CLEAR_LOOPS, X86_FEATURE_RETPOLINE -#endif + FILL_RETURN_BUFFER %_ASM_AX, RSB_CLEAR_LOOPS, X86_FEATURE_RSB_VMEXIT /* Clobbers RAX, RCX, RDX. */ RESTORE_HOST_SPEC_CTRL @@ -268,7 +264,7 @@ SYM_FUNC_START(__svm_vcpu_run) RET RESTORE_GUEST_SPEC_CTRL_BODY - RESTORE_HOST_SPEC_CTRL_BODY + RESTORE_HOST_SPEC_CTRL_BODY (%_ASM_SP) 10: cmpb $0, _ASM_RIP(kvm_rebooting) jne 2b @@ -290,66 +286,62 @@ SYM_FUNC_START(__svm_vcpu_run) SYM_FUNC_END(__svm_vcpu_run) +#ifdef CONFIG_KVM_AMD_SEV + + +#ifdef CONFIG_X86_64 +#define SEV_ES_GPRS_BASE 0x300 +#define SEV_ES_RBX (SEV_ES_GPRS_BASE + __VCPU_REGS_RBX * WORD_SIZE) +#define SEV_ES_RBP (SEV_ES_GPRS_BASE + __VCPU_REGS_RBP * WORD_SIZE) +#define SEV_ES_RSI (SEV_ES_GPRS_BASE + __VCPU_REGS_RSI * WORD_SIZE) +#define SEV_ES_RDI (SEV_ES_GPRS_BASE + __VCPU_REGS_RDI * WORD_SIZE) +#define SEV_ES_R12 (SEV_ES_GPRS_BASE + __VCPU_REGS_R12 * WORD_SIZE) +#define SEV_ES_R13 (SEV_ES_GPRS_BASE + __VCPU_REGS_R13 * WORD_SIZE) +#define SEV_ES_R14 (SEV_ES_GPRS_BASE + __VCPU_REGS_R14 * WORD_SIZE) +#define SEV_ES_R15 (SEV_ES_GPRS_BASE + __VCPU_REGS_R15 * WORD_SIZE) +#endif + /** * __svm_sev_es_vcpu_run - Run a SEV-ES vCPU via a transition to SVM guest mode * @svm: struct vcpu_svm * * @spec_ctrl_intercepted: bool */ SYM_FUNC_START(__svm_sev_es_vcpu_run) - push %_ASM_BP -#ifdef CONFIG_X86_64 - push %r15 - push %r14 - push %r13 - push %r12 -#else - push %edi - push %esi -#endif - push %_ASM_BX + FRAME_BEGIN /* - * Save variables needed after vmexit on the stack, in inverse - * order compared to when they are needed. + * Save non-volatile (callee-saved) registers to the host save area. + * Except for RAX and RSP, all GPRs are restored on #VMEXIT, but not + * saved on VMRUN. */ + mov %rbp, SEV_ES_RBP (%rdx) + mov %r15, SEV_ES_R15 (%rdx) + mov %r14, SEV_ES_R14 (%rdx) + mov %r13, SEV_ES_R13 (%rdx) + mov %r12, SEV_ES_R12 (%rdx) + mov %rbx, SEV_ES_RBX (%rdx) - /* Accessed directly from the stack in RESTORE_HOST_SPEC_CTRL. */ - push %_ASM_ARG2 - - /* Save @svm. */ - push %_ASM_ARG1 - -.ifnc _ASM_ARG1, _ASM_DI /* - * Stash @svm in RDI early. On 32-bit, arguments are in RAX, RCX - * and RDX which are clobbered by RESTORE_GUEST_SPEC_CTRL. + * Save volatile registers that hold arguments that are needed after + * #VMEXIT (RDI=@svm and RSI=@spec_ctrl_intercepted). */ - mov %_ASM_ARG1, %_ASM_DI -.endif + mov %rdi, SEV_ES_RDI (%rdx) + mov %rsi, SEV_ES_RSI (%rdx) - /* Clobbers RAX, RCX, RDX. */ + /* Clobbers RAX, RCX, RDX (@hostsa). */ RESTORE_GUEST_SPEC_CTRL /* Get svm->current_vmcb->pa into RAX. */ - mov SVM_current_vmcb(%_ASM_DI), %_ASM_AX - mov KVM_VMCB_pa(%_ASM_AX), %_ASM_AX + mov SVM_current_vmcb(%rdi), %rax + mov KVM_VMCB_pa(%rax), %rax /* Enter guest mode */ - sti - -1: vmrun %_ASM_AX - -2: cli - - /* Pop @svm to RDI, guest registers have been saved already. */ - pop %_ASM_DI - -#ifdef CONFIG_MITIGATION_RETPOLINE +1: vmrun %rax +2: /* IMPORTANT: Stuff the RSB immediately after VM-Exit, before RET! */ - FILL_RETURN_BUFFER %_ASM_AX, RSB_CLEAR_LOOPS, X86_FEATURE_RETPOLINE -#endif + FILL_RETURN_BUFFER %rax, RSB_CLEAR_LOOPS, X86_FEATURE_RSB_VMEXIT - /* Clobbers RAX, RCX, RDX. */ + /* Clobbers RAX, RCX, RDX, consumes RDI (@svm) and RSI (@spec_ctrl_intercepted). */ RESTORE_HOST_SPEC_CTRL /* @@ -361,30 +353,17 @@ SYM_FUNC_START(__svm_sev_es_vcpu_run) */ UNTRAIN_RET_VM - /* "Pop" @spec_ctrl_intercepted. */ - pop %_ASM_BX - - pop %_ASM_BX - -#ifdef CONFIG_X86_64 - pop %r12 - pop %r13 - pop %r14 - pop %r15 -#else - pop %esi - pop %edi -#endif - pop %_ASM_BP + FRAME_END RET RESTORE_GUEST_SPEC_CTRL_BODY - RESTORE_HOST_SPEC_CTRL_BODY + RESTORE_HOST_SPEC_CTRL_BODY %sil -3: cmpb $0, _ASM_RIP(kvm_rebooting) +3: cmpb $0, kvm_rebooting(%rip) jne 2b ud2 _ASM_EXTABLE(1b, 3b) SYM_FUNC_END(__svm_sev_es_vcpu_run) +#endif /* CONFIG_KVM_AMD_SEV */ diff --git a/arch/x86/kvm/trace.h b/arch/x86/kvm/trace.h index 88659de4d2a7..ba736cbb0587 100644 --- a/arch/x86/kvm/trace.h +++ b/arch/x86/kvm/trace.h @@ -11,6 +11,13 @@ #undef TRACE_SYSTEM #define TRACE_SYSTEM kvm +#ifdef CREATE_TRACE_POINTS +#define tracing_kvm_rip_read(vcpu) ({ \ + typeof(vcpu) __vcpu = vcpu; \ + __vcpu->arch.guest_state_protected ? 0 : kvm_rip_read(__vcpu); \ + }) +#endif + /* * Tracepoint for guest mode entry. */ @@ -22,15 +29,22 @@ TRACE_EVENT(kvm_entry, __field( unsigned int, vcpu_id ) __field( unsigned long, rip ) __field( bool, immediate_exit ) + __field( u32, intr_info ) + __field( u32, error_code ) ), TP_fast_assign( __entry->vcpu_id = vcpu->vcpu_id; - __entry->rip = kvm_rip_read(vcpu); + __entry->rip = tracing_kvm_rip_read(vcpu); __entry->immediate_exit = force_immediate_exit; + + kvm_x86_call(get_entry_info)(vcpu, &__entry->intr_info, + &__entry->error_code); ), - TP_printk("vcpu %u, rip 0x%lx%s", __entry->vcpu_id, __entry->rip, + TP_printk("vcpu %u, rip 0x%lx intr_info 0x%08x error_code 0x%08x%s", + __entry->vcpu_id, __entry->rip, + __entry->intr_info, __entry->error_code, __entry->immediate_exit ? "[immediate exit]" : "") ); @@ -308,26 +322,30 @@ TRACE_EVENT(name, \ __field( u32, intr_info ) \ __field( u32, error_code ) \ __field( unsigned int, vcpu_id ) \ + __field( u64, requests ) \ ), \ \ TP_fast_assign( \ - __entry->guest_rip = kvm_rip_read(vcpu); \ + __entry->guest_rip = tracing_kvm_rip_read(vcpu); \ __entry->isa = isa; \ __entry->vcpu_id = vcpu->vcpu_id; \ - static_call(kvm_x86_get_exit_info)(vcpu, \ - &__entry->exit_reason, \ - &__entry->info1, \ - &__entry->info2, \ - &__entry->intr_info, \ - &__entry->error_code); \ + __entry->requests = READ_ONCE(vcpu->requests); \ + kvm_x86_call(get_exit_info)(vcpu, \ + &__entry->exit_reason, \ + &__entry->info1, \ + &__entry->info2, \ + &__entry->intr_info, \ + &__entry->error_code); \ ), \ \ TP_printk("vcpu %u reason %s%s%s rip 0x%lx info1 0x%016llx " \ - "info2 0x%016llx intr_info 0x%08x error_code 0x%08x", \ + "info2 0x%016llx intr_info 0x%08x error_code 0x%08x " \ + "requests 0x%016llx", \ __entry->vcpu_id, \ kvm_print_exit_reason(__entry->exit_reason, __entry->isa), \ __entry->guest_rip, __entry->info1, __entry->info2, \ - __entry->intr_info, __entry->error_code) \ + __entry->intr_info, __entry->error_code, \ + __entry->requests) \ ) /* @@ -412,7 +430,7 @@ TRACE_EVENT(kvm_page_fault, TP_fast_assign( __entry->vcpu_id = vcpu->vcpu_id; - __entry->guest_rip = kvm_rip_read(vcpu); + __entry->guest_rip = tracing_kvm_rip_read(vcpu); __entry->fault_address = fault_address; __entry->error_code = error_code; ), @@ -735,13 +753,13 @@ TRACE_EVENT(kvm_nested_intr_vmexit, * Tracepoint for nested #vmexit because of interrupt pending */ TRACE_EVENT(kvm_invlpga, - TP_PROTO(__u64 rip, int asid, u64 address), + TP_PROTO(__u64 rip, unsigned int asid, u64 address), TP_ARGS(rip, asid, address), TP_STRUCT__entry( - __field( __u64, rip ) - __field( int, asid ) - __field( __u64, address ) + __field( __u64, rip ) + __field( unsigned int, asid ) + __field( __u64, address ) ), TP_fast_assign( @@ -750,7 +768,7 @@ TRACE_EVENT(kvm_invlpga, __entry->address = address; ), - TP_printk("rip: 0x%016llx asid: %d address: 0x%016llx", + TP_printk("rip: 0x%016llx asid: %u address: 0x%016llx", __entry->rip, __entry->asid, __entry->address) ); @@ -819,22 +837,23 @@ TRACE_EVENT(kvm_emulate_insn, TP_ARGS(vcpu, failed), TP_STRUCT__entry( - __field( __u64, rip ) - __field( __u32, csbase ) - __field( __u8, len ) - __array( __u8, insn, 15 ) - __field( __u8, flags ) - __field( __u8, failed ) + __field( __u64, rip ) + __field( __u32, csbase ) + __field( __u8, len ) + __array( __u8, insn, X86_MAX_INSTRUCTION_LENGTH ) + __field( __u8, flags ) + __field( __u8, failed ) ), TP_fast_assign( - __entry->csbase = static_call(kvm_x86_get_segment_base)(vcpu, VCPU_SREG_CS); + __entry->csbase = kvm_x86_call(get_segment_base)(vcpu, + VCPU_SREG_CS); __entry->len = vcpu->arch.emulate_ctxt->fetch.ptr - vcpu->arch.emulate_ctxt->fetch.data; __entry->rip = vcpu->arch.emulate_ctxt->_eip - __entry->len; memcpy(__entry->insn, vcpu->arch.emulate_ctxt->fetch.data, - 15); + X86_MAX_INSTRUCTION_LENGTH); __entry->flags = kei_decode_mode(vcpu->arch.emulate_ctxt->mode); __entry->failed = failed; ), @@ -1074,7 +1093,7 @@ TRACE_EVENT(kvm_smm_transition, ); /* - * Tracepoint for VT-d posted-interrupts. + * Tracepoint for VT-d posted-interrupts and AMD-Vi Guest Virtual APIC. */ TRACE_EVENT(kvm_pi_irte_update, TP_PROTO(unsigned int host_irq, unsigned int vcpu_id, @@ -1100,7 +1119,7 @@ TRACE_EVENT(kvm_pi_irte_update, __entry->set = set; ), - TP_printk("VT-d PI is %s for irq %u, vcpu %u, gsi: 0x%x, " + TP_printk("PI is %s for irq %u, vcpu %u, gsi: 0x%x, " "gvec: 0x%x, pi_desc_addr: 0x%llx", __entry->set ? "enabled and being updated" : "disabled", __entry->host_irq, @@ -1375,6 +1394,10 @@ TRACE_EVENT(kvm_hv_stimer_cleanup, __entry->vcpu_id, __entry->timer_index) ); +#define kvm_print_apicv_inhibit_reasons(inhibits) \ + (inhibits), (inhibits) ? " " : "", \ + (inhibits) ? __print_flags(inhibits, "|", APICV_INHIBIT_REASONS) : "" + TRACE_EVENT(kvm_apicv_inhibit_changed, TP_PROTO(int reason, bool set, unsigned long inhibits), TP_ARGS(reason, set, inhibits), @@ -1391,9 +1414,10 @@ TRACE_EVENT(kvm_apicv_inhibit_changed, __entry->inhibits = inhibits; ), - TP_printk("%s reason=%u, inhibits=0x%lx", + TP_printk("%s reason=%u, inhibits=0x%lx%s%s", __entry->set ? "set" : "cleared", - __entry->reason, __entry->inhibits) + __entry->reason, + kvm_print_apicv_inhibit_reasons(__entry->inhibits)) ); TRACE_EVENT(kvm_apicv_accept_irq, @@ -1678,7 +1702,7 @@ TRACE_EVENT(kvm_nested_vmenter_failed, ), TP_fast_assign( - __assign_str(msg, msg); + __assign_str(msg); __entry->err = err; ), @@ -1834,6 +1858,37 @@ TRACE_EVENT(kvm_vmgexit_msr_protocol_exit, __entry->vcpu_id, __entry->ghcb_gpa, __entry->result) ); +/* + * Tracepoint for #NPFs due to RMP faults. + */ +TRACE_EVENT(kvm_rmp_fault, + TP_PROTO(struct kvm_vcpu *vcpu, u64 gpa, u64 pfn, u64 error_code, + int rmp_level, int psmash_ret), + TP_ARGS(vcpu, gpa, pfn, error_code, rmp_level, psmash_ret), + + TP_STRUCT__entry( + __field(unsigned int, vcpu_id) + __field(u64, gpa) + __field(u64, pfn) + __field(u64, error_code) + __field(int, rmp_level) + __field(int, psmash_ret) + ), + + TP_fast_assign( + __entry->vcpu_id = vcpu->vcpu_id; + __entry->gpa = gpa; + __entry->pfn = pfn; + __entry->error_code = error_code; + __entry->rmp_level = rmp_level; + __entry->psmash_ret = psmash_ret; + ), + + TP_printk("vcpu %u gpa %016llx pfn 0x%llx error_code 0x%llx rmp_level %d psmash_ret %d", + __entry->vcpu_id, __entry->gpa, __entry->pfn, + __entry->error_code, __entry->rmp_level, __entry->psmash_ret) +); + #endif /* _TRACE_KVM_H */ #undef TRACE_INCLUDE_PATH diff --git a/arch/x86/kvm/vmx/capabilities.h b/arch/x86/kvm/vmx/capabilities.h index 41a4533f9989..cb6588238f46 100644 --- a/arch/x86/kvm/vmx/capabilities.h +++ b/arch/x86/kvm/vmx/capabilities.h @@ -54,9 +54,7 @@ struct nested_vmx_msrs { }; struct vmcs_config { - int size; - u32 basic_cap; - u32 revision_id; + u64 basic; u32 pin_based_exec_ctrl; u32 cpu_based_exec_ctrl; u32 cpu_based_2nd_exec_ctrl; @@ -76,7 +74,7 @@ extern struct vmx_capability vmx_capability __ro_after_init; static inline bool cpu_has_vmx_basic_inout(void) { - return (((u64)vmcs_config.basic_cap << 32) & VMX_BASIC_INOUT); + return vmcs_config.basic & VMX_BASIC_INOUT; } static inline bool cpu_has_virtual_nmis(void) @@ -225,7 +223,7 @@ static inline bool cpu_has_vmx_vmfunc(void) static inline bool cpu_has_vmx_shadow_vmcs(void) { /* check if the cpu supports writing r/o exit information fields */ - if (!(vmcs_config.misc & MSR_IA32_VMX_MISC_VMWRITE_SHADOW_RO_FIELDS)) + if (!(vmcs_config.misc & VMX_MISC_VMWRITE_SHADOW_RO_FIELDS)) return false; return vmcs_config.cpu_based_2nd_exec_ctrl & @@ -367,7 +365,7 @@ static inline bool cpu_has_vmx_invvpid_global(void) static inline bool cpu_has_vmx_intel_pt(void) { - return (vmcs_config.misc & MSR_IA32_VMX_MISC_INTEL_PT) && + return (vmcs_config.misc & VMX_MISC_INTEL_PT) && (vmcs_config.cpu_based_2nd_exec_ctrl & SECONDARY_EXEC_PT_USE_GPA) && (vmcs_config.vmentry_ctrl & VM_ENTRY_LOAD_IA32_RTIT_CTL); } diff --git a/arch/x86/kvm/vmx/common.h b/arch/x86/kvm/vmx/common.h new file mode 100644 index 000000000000..a0c5e8781c33 --- /dev/null +++ b/arch/x86/kvm/vmx/common.h @@ -0,0 +1,182 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +#ifndef __KVM_X86_VMX_COMMON_H +#define __KVM_X86_VMX_COMMON_H + +#include <linux/kvm_host.h> +#include <asm/posted_intr.h> + +#include "mmu.h" + +union vmx_exit_reason { + struct { + u32 basic : 16; + u32 reserved16 : 1; + u32 reserved17 : 1; + u32 reserved18 : 1; + u32 reserved19 : 1; + u32 reserved20 : 1; + u32 reserved21 : 1; + u32 reserved22 : 1; + u32 reserved23 : 1; + u32 reserved24 : 1; + u32 reserved25 : 1; + u32 bus_lock_detected : 1; + u32 enclave_mode : 1; + u32 smi_pending_mtf : 1; + u32 smi_from_vmx_root : 1; + u32 reserved30 : 1; + u32 failed_vmentry : 1; + }; + u32 full; +}; + +struct vcpu_vt { + /* Posted interrupt descriptor */ + struct pi_desc pi_desc; + + /* Used if this vCPU is waiting for PI notification wakeup. */ + struct list_head pi_wakeup_list; + + union vmx_exit_reason exit_reason; + + unsigned long exit_qualification; + u32 exit_intr_info; + + /* + * If true, guest state has been loaded into hardware, and host state + * saved into vcpu_{vt,vmx,tdx}. If false, host state is loaded into + * hardware. + */ + bool guest_state_loaded; + bool emulation_required; + +#ifdef CONFIG_X86_64 + u64 msr_host_kernel_gs_base; +#endif + + unsigned long host_debugctlmsr; +}; + +#ifdef CONFIG_KVM_INTEL_TDX + +static __always_inline bool is_td(struct kvm *kvm) +{ + return kvm->arch.vm_type == KVM_X86_TDX_VM; +} + +static __always_inline bool is_td_vcpu(struct kvm_vcpu *vcpu) +{ + return is_td(vcpu->kvm); +} + +#else + +static __always_inline bool is_td(struct kvm *kvm) { return false; } +static __always_inline bool is_td_vcpu(struct kvm_vcpu *vcpu) { return false; } + +#endif + +static inline bool vt_is_tdx_private_gpa(struct kvm *kvm, gpa_t gpa) +{ + /* For TDX the direct mask is the shared mask. */ + return !kvm_is_addr_direct(kvm, gpa); +} + +static inline int __vmx_handle_ept_violation(struct kvm_vcpu *vcpu, gpa_t gpa, + unsigned long exit_qualification) +{ + u64 error_code; + + /* Is it a read fault? */ + error_code = (exit_qualification & EPT_VIOLATION_ACC_READ) + ? PFERR_USER_MASK : 0; + /* Is it a write fault? */ + error_code |= (exit_qualification & EPT_VIOLATION_ACC_WRITE) + ? PFERR_WRITE_MASK : 0; + /* Is it a fetch fault? */ + error_code |= (exit_qualification & EPT_VIOLATION_ACC_INSTR) + ? PFERR_FETCH_MASK : 0; + /* ept page table entry is present? */ + error_code |= (exit_qualification & EPT_VIOLATION_PROT_MASK) + ? PFERR_PRESENT_MASK : 0; + + if (error_code & EPT_VIOLATION_GVA_IS_VALID) + error_code |= (exit_qualification & EPT_VIOLATION_GVA_TRANSLATED) ? + PFERR_GUEST_FINAL_MASK : PFERR_GUEST_PAGE_MASK; + + if (vt_is_tdx_private_gpa(vcpu->kvm, gpa)) + error_code |= PFERR_PRIVATE_ACCESS; + + return kvm_mmu_page_fault(vcpu, gpa, error_code, NULL, 0); +} + +static inline void kvm_vcpu_trigger_posted_interrupt(struct kvm_vcpu *vcpu, + int pi_vec) +{ +#ifdef CONFIG_SMP + if (vcpu->mode == IN_GUEST_MODE) { + /* + * The vector of the virtual has already been set in the PIR. + * Send a notification event to deliver the virtual interrupt + * unless the vCPU is the currently running vCPU, i.e. the + * event is being sent from a fastpath VM-Exit handler, in + * which case the PIR will be synced to the vIRR before + * re-entering the guest. + * + * When the target is not the running vCPU, the following + * possibilities emerge: + * + * Case 1: vCPU stays in non-root mode. Sending a notification + * event posts the interrupt to the vCPU. + * + * Case 2: vCPU exits to root mode and is still runnable. The + * PIR will be synced to the vIRR before re-entering the guest. + * Sending a notification event is ok as the host IRQ handler + * will ignore the spurious event. + * + * Case 3: vCPU exits to root mode and is blocked. vcpu_block() + * has already synced PIR to vIRR and never blocks the vCPU if + * the vIRR is not empty. Therefore, a blocked vCPU here does + * not wait for any requested interrupts in PIR, and sending a + * notification event also results in a benign, spurious event. + */ + + if (vcpu != kvm_get_running_vcpu()) + __apic_send_IPI_mask(get_cpu_mask(vcpu->cpu), pi_vec); + return; + } +#endif + /* + * The vCPU isn't in the guest; wake the vCPU in case it is blocking, + * otherwise do nothing as KVM will grab the highest priority pending + * IRQ via ->sync_pir_to_irr() in vcpu_enter_guest(). + */ + kvm_vcpu_wake_up(vcpu); +} + +/* + * Post an interrupt to a vCPU's PIR and trigger the vCPU to process the + * interrupt if necessary. + */ +static inline void __vmx_deliver_posted_interrupt(struct kvm_vcpu *vcpu, + struct pi_desc *pi_desc, int vector) +{ + if (pi_test_and_set_pir(vector, pi_desc)) + return; + + /* If a previous notification has sent the IPI, nothing to do. */ + if (pi_test_and_set_on(pi_desc)) + return; + + /* + * The implied barrier in pi_test_and_set_on() pairs with the smp_mb_*() + * after setting vcpu->mode in vcpu_enter_guest(), thus the vCPU is + * guaranteed to see PID.ON=1 and sync the PIR to IRR if triggering a + * posted interrupt "fails" because vcpu->mode != IN_GUEST_MODE. + */ + kvm_vcpu_trigger_posted_interrupt(vcpu, POSTED_INTR_VECTOR); +} + +noinstr void vmx_handle_nmi(struct kvm_vcpu *vcpu); + +#endif /* __KVM_X86_VMX_COMMON_H */ diff --git a/arch/x86/kvm/vmx/hyperv.c b/arch/x86/kvm/vmx/hyperv.c index fab6a1ad98dc..fa41d036acd4 100644 --- a/arch/x86/kvm/vmx/hyperv.c +++ b/arch/x86/kvm/vmx/hyperv.c @@ -4,6 +4,7 @@ #include <linux/errno.h> #include <linux/smp.h> +#include "x86.h" #include "../cpuid.h" #include "hyperv.h" #include "nested.h" diff --git a/arch/x86/kvm/vmx/hyperv.h b/arch/x86/kvm/vmx/hyperv.h index a87407412615..11a339009781 100644 --- a/arch/x86/kvm/vmx/hyperv.h +++ b/arch/x86/kvm/vmx/hyperv.h @@ -42,7 +42,7 @@ static inline struct hv_enlightened_vmcs *nested_vmx_evmcs(struct vcpu_vmx *vmx) return vmx->nested.hv_evmcs; } -static inline bool guest_cpuid_has_evmcs(struct kvm_vcpu *vcpu) +static inline bool guest_cpu_cap_has_evmcs(struct kvm_vcpu *vcpu) { /* * eVMCS is exposed to the guest if Hyper-V is enabled in CPUID and diff --git a/arch/x86/kvm/vmx/hyperv_evmcs.h b/arch/x86/kvm/vmx/hyperv_evmcs.h index a543fccfc574..6536290f4274 100644 --- a/arch/x86/kvm/vmx/hyperv_evmcs.h +++ b/arch/x86/kvm/vmx/hyperv_evmcs.h @@ -6,7 +6,7 @@ #ifndef __KVM_X86_VMX_HYPERV_EVMCS_H #define __KVM_X86_VMX_HYPERV_EVMCS_H -#include <asm/hyperv-tlfs.h> +#include <hyperv/hvhdk.h> #include "capabilities.h" #include "vmcs12.h" diff --git a/arch/x86/kvm/vmx/main.c b/arch/x86/kvm/vmx/main.c new file mode 100644 index 000000000000..d1e02e567b57 --- /dev/null +++ b/arch/x86/kvm/vmx/main.c @@ -0,0 +1,1111 @@ +// SPDX-License-Identifier: GPL-2.0 +#include <linux/moduleparam.h> + +#include "x86_ops.h" +#include "vmx.h" +#include "mmu.h" +#include "nested.h" +#include "pmu.h" +#include "posted_intr.h" +#include "tdx.h" +#include "tdx_arch.h" + +#ifdef CONFIG_KVM_INTEL_TDX +static_assert(offsetof(struct vcpu_vmx, vt) == offsetof(struct vcpu_tdx, vt)); + +static void vt_disable_virtualization_cpu(void) +{ + /* Note, TDX *and* VMX need to be disabled if TDX is enabled. */ + if (enable_tdx) + tdx_disable_virtualization_cpu(); + vmx_disable_virtualization_cpu(); +} + +static __init int vt_hardware_setup(void) +{ + int ret; + + ret = vmx_hardware_setup(); + if (ret) + return ret; + + /* + * Update vt_x86_ops::vm_size here so it is ready before + * kvm_ops_update() is called in kvm_x86_vendor_init(). + * + * Note, the actual bringing up of TDX must be done after + * kvm_ops_update() because enabling TDX requires enabling + * hardware virtualization first, i.e., all online CPUs must + * be in post-VMXON state. This means the @vm_size here + * may be updated to TDX's size but TDX may fail to enable + * at later time. + * + * The VMX/VT code could update kvm_x86_ops::vm_size again + * after bringing up TDX, but this would require exporting + * either kvm_x86_ops or kvm_ops_update() from the base KVM + * module, which looks overkill. Anyway, the worst case here + * is KVM may allocate couple of more bytes than needed for + * each VM. + */ + if (enable_tdx) { + vt_x86_ops.vm_size = max_t(unsigned int, vt_x86_ops.vm_size, + sizeof(struct kvm_tdx)); + /* + * Note, TDX may fail to initialize in a later time in + * vt_init(), in which case it is not necessary to setup + * those callbacks. But making them valid here even + * when TDX fails to init later is fine because those + * callbacks won't be called if the VM isn't TDX guest. + */ + vt_x86_ops.link_external_spt = tdx_sept_link_private_spt; + vt_x86_ops.set_external_spte = tdx_sept_set_private_spte; + vt_x86_ops.free_external_spt = tdx_sept_free_private_spt; + vt_x86_ops.remove_external_spte = tdx_sept_remove_private_spte; + vt_x86_ops.protected_apic_has_interrupt = tdx_protected_apic_has_interrupt; + } + + return 0; +} + +static int vt_vm_init(struct kvm *kvm) +{ + if (is_td(kvm)) + return tdx_vm_init(kvm); + + return vmx_vm_init(kvm); +} + +static void vt_vm_pre_destroy(struct kvm *kvm) +{ + if (is_td(kvm)) + return tdx_mmu_release_hkid(kvm); +} + +static void vt_vm_destroy(struct kvm *kvm) +{ + if (is_td(kvm)) + return tdx_vm_destroy(kvm); + + vmx_vm_destroy(kvm); +} + +static int vt_vcpu_precreate(struct kvm *kvm) +{ + if (is_td(kvm)) + return 0; + + return vmx_vcpu_precreate(kvm); +} + +static int vt_vcpu_create(struct kvm_vcpu *vcpu) +{ + if (is_td_vcpu(vcpu)) + return tdx_vcpu_create(vcpu); + + return vmx_vcpu_create(vcpu); +} + +static void vt_vcpu_free(struct kvm_vcpu *vcpu) +{ + if (is_td_vcpu(vcpu)) { + tdx_vcpu_free(vcpu); + return; + } + + vmx_vcpu_free(vcpu); +} + +static void vt_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event) +{ + if (is_td_vcpu(vcpu)) { + tdx_vcpu_reset(vcpu, init_event); + return; + } + + vmx_vcpu_reset(vcpu, init_event); +} + +static void vt_vcpu_load(struct kvm_vcpu *vcpu, int cpu) +{ + if (is_td_vcpu(vcpu)) { + tdx_vcpu_load(vcpu, cpu); + return; + } + + vmx_vcpu_load(vcpu, cpu); +} + +static void vt_update_cpu_dirty_logging(struct kvm_vcpu *vcpu) +{ + /* + * Basic TDX does not support feature PML. KVM does not enable PML in + * TD's VMCS, nor does it allocate or flush PML buffer for TDX. + */ + if (WARN_ON_ONCE(is_td_vcpu(vcpu))) + return; + + vmx_update_cpu_dirty_logging(vcpu); +} + +static void vt_prepare_switch_to_guest(struct kvm_vcpu *vcpu) +{ + if (is_td_vcpu(vcpu)) { + tdx_prepare_switch_to_guest(vcpu); + return; + } + + vmx_prepare_switch_to_guest(vcpu); +} + +static void vt_vcpu_put(struct kvm_vcpu *vcpu) +{ + if (is_td_vcpu(vcpu)) { + tdx_vcpu_put(vcpu); + return; + } + + vmx_vcpu_put(vcpu); +} + +static int vt_vcpu_pre_run(struct kvm_vcpu *vcpu) +{ + if (is_td_vcpu(vcpu)) + return tdx_vcpu_pre_run(vcpu); + + return vmx_vcpu_pre_run(vcpu); +} + +static fastpath_t vt_vcpu_run(struct kvm_vcpu *vcpu, bool force_immediate_exit) +{ + if (is_td_vcpu(vcpu)) + return tdx_vcpu_run(vcpu, force_immediate_exit); + + return vmx_vcpu_run(vcpu, force_immediate_exit); +} + +static int vt_handle_exit(struct kvm_vcpu *vcpu, + enum exit_fastpath_completion fastpath) +{ + if (is_td_vcpu(vcpu)) + return tdx_handle_exit(vcpu, fastpath); + + return vmx_handle_exit(vcpu, fastpath); +} + +static int vt_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info) +{ + if (unlikely(is_td_vcpu(vcpu))) + return tdx_set_msr(vcpu, msr_info); + + return vmx_set_msr(vcpu, msr_info); +} + +/* + * The kvm parameter can be NULL (module initialization, or invocation before + * VM creation). Be sure to check the kvm parameter before using it. + */ +static bool vt_has_emulated_msr(struct kvm *kvm, u32 index) +{ + if (kvm && is_td(kvm)) + return tdx_has_emulated_msr(index); + + return vmx_has_emulated_msr(kvm, index); +} + +static int vt_get_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info) +{ + if (unlikely(is_td_vcpu(vcpu))) + return tdx_get_msr(vcpu, msr_info); + + return vmx_get_msr(vcpu, msr_info); +} + +static void vt_msr_filter_changed(struct kvm_vcpu *vcpu) +{ + /* + * TDX doesn't allow VMM to configure interception of MSR accesses. + * TDX guest requests MSR accesses by calling TDVMCALL. The MSR + * filters will be applied when handling the TDVMCALL for RDMSR/WRMSR + * if the userspace has set any. + */ + if (is_td_vcpu(vcpu)) + return; + + vmx_msr_filter_changed(vcpu); +} + +static int vt_complete_emulated_msr(struct kvm_vcpu *vcpu, int err) +{ + if (is_td_vcpu(vcpu)) + return tdx_complete_emulated_msr(vcpu, err); + + return vmx_complete_emulated_msr(vcpu, err); +} + +#ifdef CONFIG_KVM_SMM +static int vt_smi_allowed(struct kvm_vcpu *vcpu, bool for_injection) +{ + if (KVM_BUG_ON(is_td_vcpu(vcpu), vcpu->kvm)) + return 0; + + return vmx_smi_allowed(vcpu, for_injection); +} + +static int vt_enter_smm(struct kvm_vcpu *vcpu, union kvm_smram *smram) +{ + if (KVM_BUG_ON(is_td_vcpu(vcpu), vcpu->kvm)) + return 0; + + return vmx_enter_smm(vcpu, smram); +} + +static int vt_leave_smm(struct kvm_vcpu *vcpu, const union kvm_smram *smram) +{ + if (KVM_BUG_ON(is_td_vcpu(vcpu), vcpu->kvm)) + return 0; + + return vmx_leave_smm(vcpu, smram); +} + +static void vt_enable_smi_window(struct kvm_vcpu *vcpu) +{ + if (KVM_BUG_ON(is_td_vcpu(vcpu), vcpu->kvm)) + return; + + /* RSM will cause a vmexit anyway. */ + vmx_enable_smi_window(vcpu); +} +#endif + +static int vt_check_emulate_instruction(struct kvm_vcpu *vcpu, int emul_type, + void *insn, int insn_len) +{ + /* + * For TDX, this can only be triggered for MMIO emulation. Let the + * guest retry after installing the SPTE with suppress #VE bit cleared, + * so that the guest will receive #VE when retry. The guest is expected + * to call TDG.VP.VMCALL<MMIO> to request VMM to do MMIO emulation on + * #VE. + */ + if (is_td_vcpu(vcpu)) + return X86EMUL_RETRY_INSTR; + + return vmx_check_emulate_instruction(vcpu, emul_type, insn, insn_len); +} + +static bool vt_apic_init_signal_blocked(struct kvm_vcpu *vcpu) +{ + /* + * INIT and SIPI are always blocked for TDX, i.e., INIT handling and + * the OP vcpu_deliver_sipi_vector() won't be called. + */ + if (is_td_vcpu(vcpu)) + return true; + + return vmx_apic_init_signal_blocked(vcpu); +} + +static void vt_set_virtual_apic_mode(struct kvm_vcpu *vcpu) +{ + /* Only x2APIC mode is supported for TD. */ + if (is_td_vcpu(vcpu)) + return; + + return vmx_set_virtual_apic_mode(vcpu); +} + +static void vt_hwapic_isr_update(struct kvm_vcpu *vcpu, int max_isr) +{ + if (is_td_vcpu(vcpu)) + return; + + return vmx_hwapic_isr_update(vcpu, max_isr); +} + +static int vt_sync_pir_to_irr(struct kvm_vcpu *vcpu) +{ + if (is_td_vcpu(vcpu)) + return -1; + + return vmx_sync_pir_to_irr(vcpu); +} + +static void vt_deliver_interrupt(struct kvm_lapic *apic, int delivery_mode, + int trig_mode, int vector) +{ + if (is_td_vcpu(apic->vcpu)) { + tdx_deliver_interrupt(apic, delivery_mode, trig_mode, + vector); + return; + } + + vmx_deliver_interrupt(apic, delivery_mode, trig_mode, vector); +} + +static void vt_vcpu_after_set_cpuid(struct kvm_vcpu *vcpu) +{ + if (is_td_vcpu(vcpu)) + return; + + vmx_vcpu_after_set_cpuid(vcpu); +} + +static void vt_update_exception_bitmap(struct kvm_vcpu *vcpu) +{ + if (is_td_vcpu(vcpu)) + return; + + vmx_update_exception_bitmap(vcpu); +} + +static u64 vt_get_segment_base(struct kvm_vcpu *vcpu, int seg) +{ + if (is_td_vcpu(vcpu)) + return 0; + + return vmx_get_segment_base(vcpu, seg); +} + +static void vt_get_segment(struct kvm_vcpu *vcpu, struct kvm_segment *var, + int seg) +{ + if (is_td_vcpu(vcpu)) { + memset(var, 0, sizeof(*var)); + return; + } + + vmx_get_segment(vcpu, var, seg); +} + +static void vt_set_segment(struct kvm_vcpu *vcpu, struct kvm_segment *var, + int seg) +{ + if (is_td_vcpu(vcpu)) + return; + + vmx_set_segment(vcpu, var, seg); +} + +static int vt_get_cpl(struct kvm_vcpu *vcpu) +{ + if (is_td_vcpu(vcpu)) + return 0; + + return vmx_get_cpl(vcpu); +} + +static int vt_get_cpl_no_cache(struct kvm_vcpu *vcpu) +{ + if (is_td_vcpu(vcpu)) + return 0; + + return vmx_get_cpl_no_cache(vcpu); +} + +static void vt_get_cs_db_l_bits(struct kvm_vcpu *vcpu, int *db, int *l) +{ + if (is_td_vcpu(vcpu)) { + *db = 0; + *l = 0; + return; + } + + vmx_get_cs_db_l_bits(vcpu, db, l); +} + +static bool vt_is_valid_cr0(struct kvm_vcpu *vcpu, unsigned long cr0) +{ + if (is_td_vcpu(vcpu)) + return true; + + return vmx_is_valid_cr0(vcpu, cr0); +} + +static void vt_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0) +{ + if (is_td_vcpu(vcpu)) + return; + + vmx_set_cr0(vcpu, cr0); +} + +static bool vt_is_valid_cr4(struct kvm_vcpu *vcpu, unsigned long cr4) +{ + if (is_td_vcpu(vcpu)) + return true; + + return vmx_is_valid_cr4(vcpu, cr4); +} + +static void vt_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4) +{ + if (is_td_vcpu(vcpu)) + return; + + vmx_set_cr4(vcpu, cr4); +} + +static int vt_set_efer(struct kvm_vcpu *vcpu, u64 efer) +{ + if (is_td_vcpu(vcpu)) + return 0; + + return vmx_set_efer(vcpu, efer); +} + +static void vt_get_idt(struct kvm_vcpu *vcpu, struct desc_ptr *dt) +{ + if (is_td_vcpu(vcpu)) { + memset(dt, 0, sizeof(*dt)); + return; + } + + vmx_get_idt(vcpu, dt); +} + +static void vt_set_idt(struct kvm_vcpu *vcpu, struct desc_ptr *dt) +{ + if (is_td_vcpu(vcpu)) + return; + + vmx_set_idt(vcpu, dt); +} + +static void vt_get_gdt(struct kvm_vcpu *vcpu, struct desc_ptr *dt) +{ + if (is_td_vcpu(vcpu)) { + memset(dt, 0, sizeof(*dt)); + return; + } + + vmx_get_gdt(vcpu, dt); +} + +static void vt_set_gdt(struct kvm_vcpu *vcpu, struct desc_ptr *dt) +{ + if (is_td_vcpu(vcpu)) + return; + + vmx_set_gdt(vcpu, dt); +} + +static void vt_set_dr6(struct kvm_vcpu *vcpu, unsigned long val) +{ + if (is_td_vcpu(vcpu)) + return; + + vmx_set_dr6(vcpu, val); +} + +static void vt_set_dr7(struct kvm_vcpu *vcpu, unsigned long val) +{ + if (is_td_vcpu(vcpu)) + return; + + vmx_set_dr7(vcpu, val); +} + +static void vt_sync_dirty_debug_regs(struct kvm_vcpu *vcpu) +{ + /* + * MOV-DR exiting is always cleared for TD guest, even in debug mode. + * Thus KVM_DEBUGREG_WONT_EXIT can never be set and it should never + * reach here for TD vcpu. + */ + if (is_td_vcpu(vcpu)) + return; + + vmx_sync_dirty_debug_regs(vcpu); +} + +static void vt_cache_reg(struct kvm_vcpu *vcpu, enum kvm_reg reg) +{ + if (WARN_ON_ONCE(is_td_vcpu(vcpu))) + return; + + vmx_cache_reg(vcpu, reg); +} + +static unsigned long vt_get_rflags(struct kvm_vcpu *vcpu) +{ + if (is_td_vcpu(vcpu)) + return 0; + + return vmx_get_rflags(vcpu); +} + +static void vt_set_rflags(struct kvm_vcpu *vcpu, unsigned long rflags) +{ + if (is_td_vcpu(vcpu)) + return; + + vmx_set_rflags(vcpu, rflags); +} + +static bool vt_get_if_flag(struct kvm_vcpu *vcpu) +{ + if (is_td_vcpu(vcpu)) + return false; + + return vmx_get_if_flag(vcpu); +} + +static void vt_flush_tlb_all(struct kvm_vcpu *vcpu) +{ + if (is_td_vcpu(vcpu)) { + tdx_flush_tlb_all(vcpu); + return; + } + + vmx_flush_tlb_all(vcpu); +} + +static void vt_flush_tlb_current(struct kvm_vcpu *vcpu) +{ + if (is_td_vcpu(vcpu)) { + tdx_flush_tlb_current(vcpu); + return; + } + + vmx_flush_tlb_current(vcpu); +} + +static void vt_flush_tlb_gva(struct kvm_vcpu *vcpu, gva_t addr) +{ + if (is_td_vcpu(vcpu)) + return; + + vmx_flush_tlb_gva(vcpu, addr); +} + +static void vt_flush_tlb_guest(struct kvm_vcpu *vcpu) +{ + if (is_td_vcpu(vcpu)) + return; + + vmx_flush_tlb_guest(vcpu); +} + +static void vt_inject_nmi(struct kvm_vcpu *vcpu) +{ + if (is_td_vcpu(vcpu)) { + tdx_inject_nmi(vcpu); + return; + } + + vmx_inject_nmi(vcpu); +} + +static int vt_nmi_allowed(struct kvm_vcpu *vcpu, bool for_injection) +{ + /* + * The TDX module manages NMI windows and NMI reinjection, and hides NMI + * blocking, all KVM can do is throw an NMI over the wall. + */ + if (is_td_vcpu(vcpu)) + return true; + + return vmx_nmi_allowed(vcpu, for_injection); +} + +static bool vt_get_nmi_mask(struct kvm_vcpu *vcpu) +{ + /* + * KVM can't get NMI blocking status for TDX guest, assume NMIs are + * always unmasked. + */ + if (is_td_vcpu(vcpu)) + return false; + + return vmx_get_nmi_mask(vcpu); +} + +static void vt_set_nmi_mask(struct kvm_vcpu *vcpu, bool masked) +{ + if (is_td_vcpu(vcpu)) + return; + + vmx_set_nmi_mask(vcpu, masked); +} + +static void vt_enable_nmi_window(struct kvm_vcpu *vcpu) +{ + /* Refer to the comments in tdx_inject_nmi(). */ + if (is_td_vcpu(vcpu)) + return; + + vmx_enable_nmi_window(vcpu); +} + +static void vt_load_mmu_pgd(struct kvm_vcpu *vcpu, hpa_t root_hpa, + int pgd_level) +{ + if (is_td_vcpu(vcpu)) { + tdx_load_mmu_pgd(vcpu, root_hpa, pgd_level); + return; + } + + vmx_load_mmu_pgd(vcpu, root_hpa, pgd_level); +} + +static void vt_set_interrupt_shadow(struct kvm_vcpu *vcpu, int mask) +{ + if (is_td_vcpu(vcpu)) + return; + + vmx_set_interrupt_shadow(vcpu, mask); +} + +static u32 vt_get_interrupt_shadow(struct kvm_vcpu *vcpu) +{ + if (is_td_vcpu(vcpu)) + return 0; + + return vmx_get_interrupt_shadow(vcpu); +} + +static void vt_patch_hypercall(struct kvm_vcpu *vcpu, + unsigned char *hypercall) +{ + /* + * Because guest memory is protected, guest can't be patched. TD kernel + * is modified to use TDG.VP.VMCALL for hypercall. + */ + if (is_td_vcpu(vcpu)) + return; + + vmx_patch_hypercall(vcpu, hypercall); +} + +static void vt_inject_irq(struct kvm_vcpu *vcpu, bool reinjected) +{ + if (is_td_vcpu(vcpu)) + return; + + vmx_inject_irq(vcpu, reinjected); +} + +static void vt_inject_exception(struct kvm_vcpu *vcpu) +{ + if (is_td_vcpu(vcpu)) + return; + + vmx_inject_exception(vcpu); +} + +static void vt_cancel_injection(struct kvm_vcpu *vcpu) +{ + if (is_td_vcpu(vcpu)) + return; + + vmx_cancel_injection(vcpu); +} + +static int vt_interrupt_allowed(struct kvm_vcpu *vcpu, bool for_injection) +{ + if (is_td_vcpu(vcpu)) + return tdx_interrupt_allowed(vcpu); + + return vmx_interrupt_allowed(vcpu, for_injection); +} + +static void vt_enable_irq_window(struct kvm_vcpu *vcpu) +{ + if (is_td_vcpu(vcpu)) + return; + + vmx_enable_irq_window(vcpu); +} + +static void vt_get_entry_info(struct kvm_vcpu *vcpu, u32 *intr_info, u32 *error_code) +{ + *intr_info = 0; + *error_code = 0; + + if (is_td_vcpu(vcpu)) + return; + + vmx_get_entry_info(vcpu, intr_info, error_code); +} + +static void vt_get_exit_info(struct kvm_vcpu *vcpu, u32 *reason, + u64 *info1, u64 *info2, u32 *intr_info, u32 *error_code) +{ + if (is_td_vcpu(vcpu)) { + tdx_get_exit_info(vcpu, reason, info1, info2, intr_info, + error_code); + return; + } + + vmx_get_exit_info(vcpu, reason, info1, info2, intr_info, error_code); +} + +static void vt_update_cr8_intercept(struct kvm_vcpu *vcpu, int tpr, int irr) +{ + if (is_td_vcpu(vcpu)) + return; + + vmx_update_cr8_intercept(vcpu, tpr, irr); +} + +static void vt_set_apic_access_page_addr(struct kvm_vcpu *vcpu) +{ + if (is_td_vcpu(vcpu)) + return; + + vmx_set_apic_access_page_addr(vcpu); +} + +static void vt_refresh_apicv_exec_ctrl(struct kvm_vcpu *vcpu) +{ + if (is_td_vcpu(vcpu)) { + KVM_BUG_ON(!kvm_vcpu_apicv_active(vcpu), vcpu->kvm); + return; + } + + vmx_refresh_apicv_exec_ctrl(vcpu); +} + +static void vt_load_eoi_exitmap(struct kvm_vcpu *vcpu, u64 *eoi_exit_bitmap) +{ + if (is_td_vcpu(vcpu)) + return; + + vmx_load_eoi_exitmap(vcpu, eoi_exit_bitmap); +} + +static int vt_set_tss_addr(struct kvm *kvm, unsigned int addr) +{ + if (is_td(kvm)) + return 0; + + return vmx_set_tss_addr(kvm, addr); +} + +static int vt_set_identity_map_addr(struct kvm *kvm, u64 ident_addr) +{ + if (is_td(kvm)) + return 0; + + return vmx_set_identity_map_addr(kvm, ident_addr); +} + +static u64 vt_get_l2_tsc_offset(struct kvm_vcpu *vcpu) +{ + /* TDX doesn't support L2 guest at the moment. */ + if (is_td_vcpu(vcpu)) + return 0; + + return vmx_get_l2_tsc_offset(vcpu); +} + +static u64 vt_get_l2_tsc_multiplier(struct kvm_vcpu *vcpu) +{ + /* TDX doesn't support L2 guest at the moment. */ + if (is_td_vcpu(vcpu)) + return 0; + + return vmx_get_l2_tsc_multiplier(vcpu); +} + +static void vt_write_tsc_offset(struct kvm_vcpu *vcpu) +{ + /* In TDX, tsc offset can't be changed. */ + if (is_td_vcpu(vcpu)) + return; + + vmx_write_tsc_offset(vcpu); +} + +static void vt_write_tsc_multiplier(struct kvm_vcpu *vcpu) +{ + /* In TDX, tsc multiplier can't be changed. */ + if (is_td_vcpu(vcpu)) + return; + + vmx_write_tsc_multiplier(vcpu); +} + +#ifdef CONFIG_X86_64 +static int vt_set_hv_timer(struct kvm_vcpu *vcpu, u64 guest_deadline_tsc, + bool *expired) +{ + /* VMX-preemption timer isn't available for TDX. */ + if (is_td_vcpu(vcpu)) + return -EINVAL; + + return vmx_set_hv_timer(vcpu, guest_deadline_tsc, expired); +} + +static void vt_cancel_hv_timer(struct kvm_vcpu *vcpu) +{ + /* VMX-preemption timer can't be set. See vt_set_hv_timer(). */ + if (is_td_vcpu(vcpu)) + return; + + vmx_cancel_hv_timer(vcpu); +} +#endif + +static void vt_setup_mce(struct kvm_vcpu *vcpu) +{ + if (is_td_vcpu(vcpu)) + return; + + vmx_setup_mce(vcpu); +} + +static int vt_mem_enc_ioctl(struct kvm *kvm, void __user *argp) +{ + if (!is_td(kvm)) + return -ENOTTY; + + return tdx_vm_ioctl(kvm, argp); +} + +static int vt_vcpu_mem_enc_ioctl(struct kvm_vcpu *vcpu, void __user *argp) +{ + if (!is_td_vcpu(vcpu)) + return -EINVAL; + + return tdx_vcpu_ioctl(vcpu, argp); +} + +static int vt_gmem_private_max_mapping_level(struct kvm *kvm, kvm_pfn_t pfn) +{ + if (is_td(kvm)) + return tdx_gmem_private_max_mapping_level(kvm, pfn); + + return 0; +} + +#define vt_op(name) vt_##name +#define vt_op_tdx_only(name) vt_##name +#else /* CONFIG_KVM_INTEL_TDX */ +#define vt_op(name) vmx_##name +#define vt_op_tdx_only(name) NULL +#endif /* CONFIG_KVM_INTEL_TDX */ + +#define VMX_REQUIRED_APICV_INHIBITS \ + (BIT(APICV_INHIBIT_REASON_DISABLED) | \ + BIT(APICV_INHIBIT_REASON_ABSENT) | \ + BIT(APICV_INHIBIT_REASON_HYPERV) | \ + BIT(APICV_INHIBIT_REASON_BLOCKIRQ) | \ + BIT(APICV_INHIBIT_REASON_PHYSICAL_ID_ALIASED) | \ + BIT(APICV_INHIBIT_REASON_APIC_ID_MODIFIED) | \ + BIT(APICV_INHIBIT_REASON_APIC_BASE_MODIFIED)) + +struct kvm_x86_ops vt_x86_ops __initdata = { + .name = KBUILD_MODNAME, + + .check_processor_compatibility = vmx_check_processor_compat, + + .hardware_unsetup = vmx_hardware_unsetup, + + .enable_virtualization_cpu = vmx_enable_virtualization_cpu, + .disable_virtualization_cpu = vt_op(disable_virtualization_cpu), + .emergency_disable_virtualization_cpu = vmx_emergency_disable_virtualization_cpu, + + .has_emulated_msr = vt_op(has_emulated_msr), + + .vm_size = sizeof(struct kvm_vmx), + + .vm_init = vt_op(vm_init), + .vm_destroy = vt_op(vm_destroy), + .vm_pre_destroy = vt_op_tdx_only(vm_pre_destroy), + + .vcpu_precreate = vt_op(vcpu_precreate), + .vcpu_create = vt_op(vcpu_create), + .vcpu_free = vt_op(vcpu_free), + .vcpu_reset = vt_op(vcpu_reset), + + .prepare_switch_to_guest = vt_op(prepare_switch_to_guest), + .vcpu_load = vt_op(vcpu_load), + .vcpu_put = vt_op(vcpu_put), + + .update_exception_bitmap = vt_op(update_exception_bitmap), + .get_feature_msr = vmx_get_feature_msr, + .get_msr = vt_op(get_msr), + .set_msr = vt_op(set_msr), + + .get_segment_base = vt_op(get_segment_base), + .get_segment = vt_op(get_segment), + .set_segment = vt_op(set_segment), + .get_cpl = vt_op(get_cpl), + .get_cpl_no_cache = vt_op(get_cpl_no_cache), + .get_cs_db_l_bits = vt_op(get_cs_db_l_bits), + .is_valid_cr0 = vt_op(is_valid_cr0), + .set_cr0 = vt_op(set_cr0), + .is_valid_cr4 = vt_op(is_valid_cr4), + .set_cr4 = vt_op(set_cr4), + .set_efer = vt_op(set_efer), + .get_idt = vt_op(get_idt), + .set_idt = vt_op(set_idt), + .get_gdt = vt_op(get_gdt), + .set_gdt = vt_op(set_gdt), + .set_dr6 = vt_op(set_dr6), + .set_dr7 = vt_op(set_dr7), + .sync_dirty_debug_regs = vt_op(sync_dirty_debug_regs), + .cache_reg = vt_op(cache_reg), + .get_rflags = vt_op(get_rflags), + .set_rflags = vt_op(set_rflags), + .get_if_flag = vt_op(get_if_flag), + + .flush_tlb_all = vt_op(flush_tlb_all), + .flush_tlb_current = vt_op(flush_tlb_current), + .flush_tlb_gva = vt_op(flush_tlb_gva), + .flush_tlb_guest = vt_op(flush_tlb_guest), + + .vcpu_pre_run = vt_op(vcpu_pre_run), + .vcpu_run = vt_op(vcpu_run), + .handle_exit = vt_op(handle_exit), + .skip_emulated_instruction = vmx_skip_emulated_instruction, + .update_emulated_instruction = vmx_update_emulated_instruction, + .set_interrupt_shadow = vt_op(set_interrupt_shadow), + .get_interrupt_shadow = vt_op(get_interrupt_shadow), + .patch_hypercall = vt_op(patch_hypercall), + .inject_irq = vt_op(inject_irq), + .inject_nmi = vt_op(inject_nmi), + .inject_exception = vt_op(inject_exception), + .cancel_injection = vt_op(cancel_injection), + .interrupt_allowed = vt_op(interrupt_allowed), + .nmi_allowed = vt_op(nmi_allowed), + .get_nmi_mask = vt_op(get_nmi_mask), + .set_nmi_mask = vt_op(set_nmi_mask), + .enable_nmi_window = vt_op(enable_nmi_window), + .enable_irq_window = vt_op(enable_irq_window), + .update_cr8_intercept = vt_op(update_cr8_intercept), + + .x2apic_icr_is_split = false, + .set_virtual_apic_mode = vt_op(set_virtual_apic_mode), + .set_apic_access_page_addr = vt_op(set_apic_access_page_addr), + .refresh_apicv_exec_ctrl = vt_op(refresh_apicv_exec_ctrl), + .load_eoi_exitmap = vt_op(load_eoi_exitmap), + .apicv_pre_state_restore = pi_apicv_pre_state_restore, + .required_apicv_inhibits = VMX_REQUIRED_APICV_INHIBITS, + .hwapic_isr_update = vt_op(hwapic_isr_update), + .sync_pir_to_irr = vt_op(sync_pir_to_irr), + .deliver_interrupt = vt_op(deliver_interrupt), + .dy_apicv_has_pending_interrupt = pi_has_pending_interrupt, + + .set_tss_addr = vt_op(set_tss_addr), + .set_identity_map_addr = vt_op(set_identity_map_addr), + .get_mt_mask = vmx_get_mt_mask, + + .get_exit_info = vt_op(get_exit_info), + .get_entry_info = vt_op(get_entry_info), + + .vcpu_after_set_cpuid = vt_op(vcpu_after_set_cpuid), + + .has_wbinvd_exit = cpu_has_vmx_wbinvd_exit, + + .get_l2_tsc_offset = vt_op(get_l2_tsc_offset), + .get_l2_tsc_multiplier = vt_op(get_l2_tsc_multiplier), + .write_tsc_offset = vt_op(write_tsc_offset), + .write_tsc_multiplier = vt_op(write_tsc_multiplier), + + .load_mmu_pgd = vt_op(load_mmu_pgd), + + .check_intercept = vmx_check_intercept, + .handle_exit_irqoff = vmx_handle_exit_irqoff, + + .update_cpu_dirty_logging = vt_op(update_cpu_dirty_logging), + + .nested_ops = &vmx_nested_ops, + + .pi_update_irte = vmx_pi_update_irte, + .pi_start_assignment = vmx_pi_start_assignment, + +#ifdef CONFIG_X86_64 + .set_hv_timer = vt_op(set_hv_timer), + .cancel_hv_timer = vt_op(cancel_hv_timer), +#endif + + .setup_mce = vt_op(setup_mce), + +#ifdef CONFIG_KVM_SMM + .smi_allowed = vt_op(smi_allowed), + .enter_smm = vt_op(enter_smm), + .leave_smm = vt_op(leave_smm), + .enable_smi_window = vt_op(enable_smi_window), +#endif + + .check_emulate_instruction = vt_op(check_emulate_instruction), + .apic_init_signal_blocked = vt_op(apic_init_signal_blocked), + .migrate_timers = vmx_migrate_timers, + + .msr_filter_changed = vt_op(msr_filter_changed), + .complete_emulated_msr = vt_op(complete_emulated_msr), + + .vcpu_deliver_sipi_vector = kvm_vcpu_deliver_sipi_vector, + + .get_untagged_addr = vmx_get_untagged_addr, + + .mem_enc_ioctl = vt_op_tdx_only(mem_enc_ioctl), + .vcpu_mem_enc_ioctl = vt_op_tdx_only(vcpu_mem_enc_ioctl), + + .private_max_mapping_level = vt_op_tdx_only(gmem_private_max_mapping_level) +}; + +struct kvm_x86_init_ops vt_init_ops __initdata = { + .hardware_setup = vt_op(hardware_setup), + .handle_intel_pt_intr = NULL, + + .runtime_ops = &vt_x86_ops, + .pmu_ops = &intel_pmu_ops, +}; + +static void __exit vt_exit(void) +{ + kvm_exit(); + tdx_cleanup(); + vmx_exit(); +} +module_exit(vt_exit); + +static int __init vt_init(void) +{ + unsigned vcpu_size, vcpu_align; + int r; + + r = vmx_init(); + if (r) + return r; + + /* tdx_init() has been taken */ + r = tdx_bringup(); + if (r) + goto err_tdx_bringup; + + /* + * TDX and VMX have different vCPU structures. Calculate the + * maximum size/align so that kvm_init() can use the larger + * values to create the kmem_vcpu_cache. + */ + vcpu_size = sizeof(struct vcpu_vmx); + vcpu_align = __alignof__(struct vcpu_vmx); + if (enable_tdx) { + vcpu_size = max_t(unsigned, vcpu_size, + sizeof(struct vcpu_tdx)); + vcpu_align = max_t(unsigned, vcpu_align, + __alignof__(struct vcpu_tdx)); + kvm_caps.supported_vm_types |= BIT(KVM_X86_TDX_VM); + } + + /* + * Common KVM initialization _must_ come last, after this, /dev/kvm is + * exposed to userspace! + */ + r = kvm_init(vcpu_size, vcpu_align, THIS_MODULE); + if (r) + goto err_kvm_init; + + return 0; + +err_kvm_init: + tdx_cleanup(); +err_tdx_bringup: + vmx_exit(); + return r; +} +module_init(vt_init); diff --git a/arch/x86/kvm/vmx/nested.c b/arch/x86/kvm/vmx/nested.c index d05ddf751491..7211c71d4241 100644 --- a/arch/x86/kvm/vmx/nested.c +++ b/arch/x86/kvm/vmx/nested.c @@ -6,16 +6,18 @@ #include <asm/debugreg.h> #include <asm/mmu_context.h> +#include <asm/msr.h> +#include "x86.h" #include "cpuid.h" #include "hyperv.h" #include "mmu.h" #include "nested.h" #include "pmu.h" +#include "posted_intr.h" #include "sgx.h" #include "trace.h" #include "vmx.h" -#include "x86.h" #include "smm.h" static bool __read_mostly enable_shadow_vmcs = 1; @@ -230,11 +232,8 @@ static inline void nested_release_evmcs(struct kvm_vcpu *vcpu) struct kvm_vcpu_hv *hv_vcpu = to_hv_vcpu(vcpu); struct vcpu_vmx *vmx = to_vmx(vcpu); - if (nested_vmx_is_evmptr12_valid(vmx)) { - kvm_vcpu_unmap(vcpu, &vmx->nested.hv_evmcs_map, true); - vmx->nested.hv_evmcs = NULL; - } - + kvm_vcpu_unmap(vcpu, &vmx->nested.hv_evmcs_map); + vmx->nested.hv_evmcs = NULL; vmx->nested.hv_evmcs_vmptr = EVMPTR_INVALID; if (hv_vcpu) { @@ -259,7 +258,7 @@ static bool nested_evmcs_handle_vmclear(struct kvm_vcpu *vcpu, gpa_t vmptr) * state. It is possible that the area will stay mapped as * vmx->nested.hv_evmcs but this shouldn't be a problem. */ - if (!guest_cpuid_has_evmcs(vcpu) || + if (!guest_cpu_cap_has_evmcs(vcpu) || !evmptr_is_valid(nested_get_evmptr(vcpu))) return false; @@ -277,7 +276,7 @@ static void vmx_sync_vmcs_host_state(struct vcpu_vmx *vmx, { struct vmcs_host_state *dest, *src; - if (unlikely(!vmx->guest_state_loaded)) + if (unlikely(!vmx->vt.guest_state_loaded)) return; src = &prev->host_state; @@ -303,7 +302,7 @@ static void vmx_switch_vmcs(struct kvm_vcpu *vcpu, struct loaded_vmcs *vmcs) cpu = get_cpu(); prev = vmx->loaded_vmcs; vmx->loaded_vmcs = vmcs; - vmx_vcpu_load_vmcs(vcpu, cpu, prev); + vmx_vcpu_load_vmcs(vcpu, cpu); vmx_sync_vmcs_host_state(vmx, prev); put_cpu(); @@ -316,6 +315,16 @@ static void vmx_switch_vmcs(struct kvm_vcpu *vcpu, struct loaded_vmcs *vmcs) vcpu->arch.regs_dirty = 0; } +static void nested_put_vmcs12_pages(struct kvm_vcpu *vcpu) +{ + struct vcpu_vmx *vmx = to_vmx(vcpu); + + kvm_vcpu_unmap(vcpu, &vmx->nested.apic_access_page_map); + kvm_vcpu_unmap(vcpu, &vmx->nested.virtual_apic_map); + kvm_vcpu_unmap(vcpu, &vmx->nested.pi_desc_map); + vmx->nested.pi_desc = NULL; +} + /* * Free whatever needs to be freed from vmx->nested when L1 goes down, or * just stops using VMX. @@ -348,15 +357,8 @@ static void free_nested(struct kvm_vcpu *vcpu) vmx->nested.cached_vmcs12 = NULL; kfree(vmx->nested.cached_shadow_vmcs12); vmx->nested.cached_shadow_vmcs12 = NULL; - /* - * Unpin physical memory we referred to in the vmcs02. The APIC access - * page's backing page (yeah, confusing) shouldn't actually be accessed, - * and if it is written, the contents are irrelevant. - */ - kvm_vcpu_unmap(vcpu, &vmx->nested.apic_access_page_map, false); - kvm_vcpu_unmap(vcpu, &vmx->nested.virtual_apic_map, true); - kvm_vcpu_unmap(vcpu, &vmx->nested.pi_desc_map, true); - vmx->nested.pi_desc = NULL; + + nested_put_vmcs12_pages(vcpu); kvm_mmu_free_roots(vcpu->kvm, &vcpu->arch.guest_mmu, KVM_MMU_ROOTS_ALL); @@ -409,18 +411,40 @@ static void nested_ept_inject_page_fault(struct kvm_vcpu *vcpu, { struct vmcs12 *vmcs12 = get_vmcs12(vcpu); struct vcpu_vmx *vmx = to_vmx(vcpu); + unsigned long exit_qualification; u32 vm_exit_reason; - unsigned long exit_qualification = vcpu->arch.exit_qualification; if (vmx->nested.pml_full) { vm_exit_reason = EXIT_REASON_PML_FULL; vmx->nested.pml_full = false; - exit_qualification &= INTR_INFO_UNBLOCK_NMI; + + /* + * It should be impossible to trigger a nested PML Full VM-Exit + * for anything other than an EPT Violation from L2. KVM *can* + * trigger nEPT page fault injection in response to an EPT + * Misconfig, e.g. if the MMIO SPTE was stale and L1's EPT + * tables also changed, but KVM should not treat EPT Misconfig + * VM-Exits as writes. + */ + WARN_ON_ONCE(vmx->vt.exit_reason.basic != EXIT_REASON_EPT_VIOLATION); + + /* + * PML Full and EPT Violation VM-Exits both use bit 12 to report + * "NMI unblocking due to IRET", i.e. the bit can be propagated + * as-is from the original EXIT_QUALIFICATION. + */ + exit_qualification = vmx_get_exit_qual(vcpu) & INTR_INFO_UNBLOCK_NMI; } else { - if (fault->error_code & PFERR_RSVD_MASK) + if (fault->error_code & PFERR_RSVD_MASK) { vm_exit_reason = EXIT_REASON_EPT_MISCONFIG; - else + exit_qualification = 0; + } else { + exit_qualification = fault->exit_qualification; + exit_qualification |= vmx_get_exit_qual(vcpu) & + (EPT_VIOLATION_GVA_IS_VALID | + EPT_VIOLATION_GVA_TRANSLATED); vm_exit_reason = EXIT_REASON_EPT_VIOLATION; + } /* * Although the caller (kvm_inject_emulated_page_fault) would @@ -601,7 +625,7 @@ static inline bool nested_vmx_prepare_msr_bitmap(struct kvm_vcpu *vcpu, int msr; unsigned long *msr_bitmap_l1; unsigned long *msr_bitmap_l0 = vmx->nested.vmcs02.msr_bitmap; - struct kvm_host_map *map = &vmx->nested.msr_bitmap_map; + struct kvm_host_map map; /* Nothing to do if the MSR bitmap is not in use. */ if (!cpu_has_vmx_msr_bitmap() || @@ -624,10 +648,10 @@ static inline bool nested_vmx_prepare_msr_bitmap(struct kvm_vcpu *vcpu, return true; } - if (kvm_vcpu_map(vcpu, gpa_to_gfn(vmcs12->msr_bitmap), map)) + if (kvm_vcpu_map_readonly(vcpu, gpa_to_gfn(vmcs12->msr_bitmap), &map)) return false; - msr_bitmap_l1 = (unsigned long *)map->hva; + msr_bitmap_l1 = (unsigned long *)map.hva; /* * To keep the control flow simple, pay eight 8-byte writes (sixteen @@ -691,7 +715,7 @@ static inline bool nested_vmx_prepare_msr_bitmap(struct kvm_vcpu *vcpu, nested_vmx_set_intercept_for_msr(vmx, msr_bitmap_l1, msr_bitmap_l0, MSR_IA32_FLUSH_CMD, MSR_TYPE_W); - kvm_vcpu_unmap(vcpu, &vmx->nested.msr_bitmap_map, false); + kvm_vcpu_unmap(vcpu, &map); vmx->nested.force_msr_bitmap_recalc = false; @@ -801,12 +825,30 @@ static int nested_vmx_check_apicv_controls(struct kvm_vcpu *vcpu, return 0; } +static u32 nested_vmx_max_atomic_switch_msrs(struct kvm_vcpu *vcpu) +{ + struct vcpu_vmx *vmx = to_vmx(vcpu); + u64 vmx_misc = vmx_control_msr(vmx->nested.msrs.misc_low, + vmx->nested.msrs.misc_high); + + return (vmx_misc_max_msr(vmx_misc) + 1) * VMX_MISC_MSR_LIST_MULTIPLIER; +} + static int nested_vmx_check_msr_switch(struct kvm_vcpu *vcpu, u32 count, u64 addr) { if (count == 0) return 0; + /* + * Exceeding the limit results in architecturally _undefined_ behavior, + * i.e. KVM is allowed to do literally anything in response to a bad + * limit. Immediately generate a consistency check so that code that + * consumes the count doesn't need to worry about extreme edge cases. + */ + if (count > nested_vmx_max_atomic_switch_msrs(vcpu)) + return -EINVAL; + if (!kvm_vcpu_is_legal_aligned_gpa(vcpu, addr, 16) || !kvm_vcpu_is_legal_gpa(vcpu, (addr + count * sizeof(struct vmx_msr_entry) - 1))) return -EINVAL; @@ -917,15 +959,6 @@ static int nested_vmx_store_msr_check(struct kvm_vcpu *vcpu, return 0; } -static u32 nested_vmx_max_atomic_switch_msrs(struct kvm_vcpu *vcpu) -{ - struct vcpu_vmx *vmx = to_vmx(vcpu); - u64 vmx_misc = vmx_control_msr(vmx->nested.msrs.misc_low, - vmx->nested.msrs.misc_high); - - return (vmx_misc_max_msr(vmx_misc) + 1) * VMX_MISC_MSR_LIST_MULTIPLIER; -} - /* * Load guest's/host's msr at nested entry/exit. * return 0 for success, entry index for failure. @@ -942,7 +975,7 @@ static u32 nested_vmx_load_msr(struct kvm_vcpu *vcpu, u64 gpa, u32 count) u32 max_msr_list_size = nested_vmx_max_atomic_switch_msrs(vcpu); for (i = 0; i < count; i++) { - if (unlikely(i >= max_msr_list_size)) + if (WARN_ON_ONCE(i >= max_msr_list_size)) goto fail; if (kvm_vcpu_read_guest(vcpu, gpa + i * sizeof(e), @@ -958,7 +991,7 @@ static u32 nested_vmx_load_msr(struct kvm_vcpu *vcpu, u64 gpa, u32 count) __func__, i, e.index, e.reserved); goto fail; } - if (kvm_set_msr(vcpu, e.index, e.value)) { + if (kvm_set_msr_with_filter(vcpu, e.index, e.value)) { pr_debug_ratelimited( "%s cannot write MSR (%u, 0x%x, 0x%llx)\n", __func__, i, e.index, e.value); @@ -994,7 +1027,7 @@ static bool nested_vmx_get_vmexit_msr_value(struct kvm_vcpu *vcpu, } } - if (kvm_get_msr(vcpu, msr_index, data)) { + if (kvm_get_msr_with_filter(vcpu, msr_index, data)) { pr_debug_ratelimited("%s cannot read MSR (0x%x)\n", __func__, msr_index); return false; @@ -1030,7 +1063,7 @@ static int nested_vmx_store_msr(struct kvm_vcpu *vcpu, u64 gpa, u32 count) u32 max_msr_list_size = nested_vmx_max_atomic_switch_msrs(vcpu); for (i = 0; i < count; i++) { - if (unlikely(i >= max_msr_list_size)) + if (WARN_ON_ONCE(i >= max_msr_list_size)) return -EINVAL; if (!read_and_check_msr_entry(vcpu, gpa, i, &e)) @@ -1089,9 +1122,9 @@ static void prepare_vmx_msr_autostore_list(struct kvm_vcpu *vcpu, /* * Emulated VMEntry does not fail here. Instead a less * accurate value will be returned by - * nested_vmx_get_vmexit_msr_value() using kvm_get_msr() - * instead of reading the value from the vmcs02 VMExit - * MSR-store area. + * nested_vmx_get_vmexit_msr_value() by reading KVM's + * internal MSR state instead of reading the value from + * the vmcs02 VMExit MSR-store area. */ pr_warn_ratelimited( "Not enough msr entries in msr_autostore. Can't add msr %x\n", @@ -1174,11 +1207,14 @@ static void nested_vmx_transition_tlb_flush(struct kvm_vcpu *vcpu, kvm_hv_nested_transtion_tlb_flush(vcpu, enable_ept); /* - * If vmcs12 doesn't use VPID, L1 expects linear and combined mappings - * for *all* contexts to be flushed on VM-Enter/VM-Exit, i.e. it's a - * full TLB flush from the guest's perspective. This is required even - * if VPID is disabled in the host as KVM may need to synchronize the - * MMU in response to the guest TLB flush. + * If VPID is disabled, then guest TLB accesses use VPID=0, i.e. the + * same VPID as the host, and so architecturally, linear and combined + * mappings for VPID=0 must be flushed at VM-Enter and VM-Exit. KVM + * emulates L2 sharing L1's VPID=0 by using vpid01 while running L2, + * and so KVM must also emulate TLB flush of VPID=0, i.e. vpid01. This + * is required if VPID is disabled in KVM, as a TLB flush (there are no + * VPIDs) still occurs from L1's perspective, and KVM may need to + * synchronize the MMU in response to the guest TLB flush. * * Note, using TLB_FLUSH_GUEST is correct even if nested EPT is in use. * EPT is a special snowflake, as guest-physical mappings aren't @@ -1228,21 +1264,32 @@ static bool is_bitwise_subset(u64 superset, u64 subset, u64 mask) static int vmx_restore_vmx_basic(struct vcpu_vmx *vmx, u64 data) { - const u64 feature_and_reserved = - /* feature (except bit 48; see below) */ - BIT_ULL(49) | BIT_ULL(54) | BIT_ULL(55) | - /* reserved */ - BIT_ULL(31) | GENMASK_ULL(47, 45) | GENMASK_ULL(63, 56); + const u64 feature_bits = VMX_BASIC_DUAL_MONITOR_TREATMENT | + VMX_BASIC_INOUT | + VMX_BASIC_TRUE_CTLS; + + const u64 reserved_bits = GENMASK_ULL(63, 56) | + GENMASK_ULL(47, 45) | + BIT_ULL(31); + u64 vmx_basic = vmcs_config.nested.basic; - if (!is_bitwise_subset(vmx_basic, data, feature_and_reserved)) + BUILD_BUG_ON(feature_bits & reserved_bits); + + /* + * Except for 32BIT_PHYS_ADDR_ONLY, which is an anti-feature bit (has + * inverted polarity), the incoming value must not set feature bits or + * reserved bits that aren't allowed/supported by KVM. Fields, i.e. + * multi-bit values, are explicitly checked below. + */ + if (!is_bitwise_subset(vmx_basic, data, feature_bits | reserved_bits)) return -EINVAL; /* * KVM does not emulate a version of VMX that constrains physical * addresses of VMX structures (e.g. VMCS) to 32-bits. */ - if (data & BIT_ULL(48)) + if (data & VMX_BASIC_32BIT_PHYS_ADDR_ONLY) return -EINVAL; if (vmx_basic_vmcs_revision_id(vmx_basic) != @@ -1311,16 +1358,29 @@ vmx_restore_control_msr(struct vcpu_vmx *vmx, u32 msr_index, u64 data) static int vmx_restore_vmx_misc(struct vcpu_vmx *vmx, u64 data) { - const u64 feature_and_reserved_bits = - /* feature */ - BIT_ULL(5) | GENMASK_ULL(8, 6) | BIT_ULL(14) | BIT_ULL(15) | - BIT_ULL(28) | BIT_ULL(29) | BIT_ULL(30) | - /* reserved */ - GENMASK_ULL(13, 9) | BIT_ULL(31); + const u64 feature_bits = VMX_MISC_SAVE_EFER_LMA | + VMX_MISC_ACTIVITY_HLT | + VMX_MISC_ACTIVITY_SHUTDOWN | + VMX_MISC_ACTIVITY_WAIT_SIPI | + VMX_MISC_INTEL_PT | + VMX_MISC_RDMSR_IN_SMM | + VMX_MISC_VMWRITE_SHADOW_RO_FIELDS | + VMX_MISC_VMXOFF_BLOCK_SMI | + VMX_MISC_ZERO_LEN_INS; + + const u64 reserved_bits = BIT_ULL(31) | GENMASK_ULL(13, 9); + u64 vmx_misc = vmx_control_msr(vmcs_config.nested.misc_low, vmcs_config.nested.misc_high); - if (!is_bitwise_subset(vmx_misc, data, feature_and_reserved_bits)) + BUILD_BUG_ON(feature_bits & reserved_bits); + + /* + * The incoming value must not set feature bits or reserved bits that + * aren't allowed/supported by KVM. Fields, i.e. multi-bit values, are + * explicitly checked below. + */ + if (!is_bitwise_subset(vmx_misc, data, feature_bits | reserved_bits)) return -EINVAL; if ((vmx->nested.msrs.pinbased_ctls_high & @@ -2039,7 +2099,7 @@ static enum nested_evmptrld_status nested_vmx_handle_enlightened_vmptrld( bool evmcs_gpa_changed = false; u64 evmcs_gpa; - if (likely(!guest_cpuid_has_evmcs(vcpu))) + if (likely(!guest_cpu_cap_has_evmcs(vcpu))) return EVMPTRLD_DISABLED; evmcs_gpa = nested_get_evmptr(vcpu); @@ -2220,6 +2280,9 @@ static void prepare_vmcs02_constant_state(struct vcpu_vmx *vmx) vmcs_write64(EPT_POINTER, construct_eptp(&vmx->vcpu, 0, PT64_ROOT_4LEVEL)); + if (vmx->ve_info) + vmcs_write64(VE_INFORMATION_ADDRESS, __pa(vmx->ve_info)); + /* All VMFUNCs are currently emulated through L0 vmexits. */ if (cpu_has_vmx_vmfunc()) vmcs_write64(VM_FUNCTION_CONTROL, 0); @@ -2265,6 +2328,17 @@ static void prepare_vmcs02_early_rare(struct vcpu_vmx *vmx, vmcs_write64(VMCS_LINK_POINTER, INVALID_GPA); + /* + * If VPID is disabled, then guest TLB accesses use VPID=0, i.e. the + * same VPID as the host. Emulate this behavior by using vpid01 for L2 + * if VPID is disabled in vmcs12. Note, if VPID is disabled, VM-Enter + * and VM-Exit are architecturally required to flush VPID=0, but *only* + * VPID=0. I.e. using vpid02 would be ok (so long as KVM emulates the + * required flushes), but doing so would cause KVM to over-flush. E.g. + * if L1 runs L2 X with VPID12=1, then runs L2 Y with VPID12 disabled, + * and then runs L2 X again, then KVM can and should retain TLB entries + * for VPID12=1. + */ if (enable_vpid) { if (nested_cpu_has_vpid(vmcs12) && vmx->nested.vpid02) vmcs_write16(VIRTUAL_PROCESSOR_ID, vmx->nested.vpid02); @@ -2291,10 +2365,12 @@ static void prepare_vmcs02_early(struct vcpu_vmx *vmx, struct loaded_vmcs *vmcs0 /* Posted interrupts setting is only taken from vmcs12. */ vmx->nested.pi_pending = false; - if (nested_cpu_has_posted_intr(vmcs12)) + if (nested_cpu_has_posted_intr(vmcs12)) { vmx->nested.posted_intr_nv = vmcs12->posted_intr_nv; - else + } else { + vmx->nested.posted_intr_nv = -1; exec_control &= ~PIN_BASED_POSTED_INTR; + } pin_controls_set(vmx, exec_control); /* @@ -2400,7 +2476,7 @@ static void prepare_vmcs02_early(struct vcpu_vmx *vmx, struct loaded_vmcs *vmcs0 if (cpu_has_load_ia32_efer()) { if (guest_efer & EFER_LMA) exec_control |= VM_ENTRY_IA32E_MODE; - if (guest_efer != host_efer) + if (guest_efer != kvm_host.efer) exec_control |= VM_ENTRY_LOAD_IA32_EFER; } vm_entry_controls_set(vmx, exec_control); @@ -2413,7 +2489,7 @@ static void prepare_vmcs02_early(struct vcpu_vmx *vmx, struct loaded_vmcs *vmcs0 * bits may be modified by vmx_set_efer() in prepare_vmcs02(). */ exec_control = __vm_exit_controls_get(vmcs01); - if (cpu_has_load_ia32_efer() && guest_efer != host_efer) + if (cpu_has_load_ia32_efer() && guest_efer != kvm_host.efer) exec_control |= VM_EXIT_LOAD_IA32_EFER; else exec_control &= ~VM_EXIT_LOAD_IA32_EFER; @@ -2444,6 +2520,7 @@ static void prepare_vmcs02_rare(struct vcpu_vmx *vmx, struct vmcs12 *vmcs12) if (!hv_evmcs || !(hv_evmcs->hv_clean_fields & HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2)) { + vmcs_write16(GUEST_ES_SELECTOR, vmcs12->guest_es_selector); vmcs_write16(GUEST_CS_SELECTOR, vmcs12->guest_cs_selector); vmcs_write16(GUEST_SS_SELECTOR, vmcs12->guest_ss_selector); @@ -2481,7 +2558,7 @@ static void prepare_vmcs02_rare(struct vcpu_vmx *vmx, struct vmcs12 *vmcs12) vmcs_writel(GUEST_GDTR_BASE, vmcs12->guest_gdtr_base); vmcs_writel(GUEST_IDTR_BASE, vmcs12->guest_idtr_base); - vmx->segment_cache.bitmask = 0; + vmx_segment_cache_clear(vmx); } if (!hv_evmcs || !(hv_evmcs->hv_clean_fields & @@ -2903,7 +2980,7 @@ static int nested_check_vm_entry_controls(struct kvm_vcpu *vcpu, case INTR_TYPE_SOFT_EXCEPTION: case INTR_TYPE_SOFT_INTR: case INTR_TYPE_PRIV_SW_EXCEPTION: - if (CC(vmcs12->vm_entry_instruction_len > 15) || + if (CC(vmcs12->vm_entry_instruction_len > X86_MAX_INSTRUCTION_LENGTH) || CC(vmcs12->vm_entry_instruction_len == 0 && CC(!nested_cpu_has_zero_length_injection(vcpu)))) return -EINVAL; @@ -2925,7 +3002,7 @@ static int nested_vmx_check_controls(struct kvm_vcpu *vcpu, return -EINVAL; #ifdef CONFIG_KVM_HYPERV - if (guest_cpuid_has_evmcs(vcpu)) + if (guest_cpu_cap_has_evmcs(vcpu)) return nested_evmcs_check_controls(vmcs12); #endif @@ -2943,6 +3020,17 @@ static int nested_vmx_check_address_space_size(struct kvm_vcpu *vcpu, return 0; } +static bool is_l1_noncanonical_address_on_vmexit(u64 la, struct vmcs12 *vmcs12) +{ + /* + * Check that the given linear address is canonical after a VM exit + * from L2, based on HOST_CR4.LA57 value that will be loaded for L1. + */ + u8 l1_address_bits_on_exit = (vmcs12->host_cr4 & X86_CR4_LA57) ? 57 : 48; + + return !__is_canonical_address(la, l1_address_bits_on_exit); +} + static int nested_vmx_check_host_state(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12) { @@ -2953,8 +3041,8 @@ static int nested_vmx_check_host_state(struct kvm_vcpu *vcpu, CC(!kvm_vcpu_is_legal_cr3(vcpu, vmcs12->host_cr3))) return -EINVAL; - if (CC(is_noncanonical_address(vmcs12->host_ia32_sysenter_esp, vcpu)) || - CC(is_noncanonical_address(vmcs12->host_ia32_sysenter_eip, vcpu))) + if (CC(is_noncanonical_msr_address(vmcs12->host_ia32_sysenter_esp, vcpu)) || + CC(is_noncanonical_msr_address(vmcs12->host_ia32_sysenter_eip, vcpu))) return -EINVAL; if ((vmcs12->vm_exit_controls & VM_EXIT_LOAD_IA32_PAT) && @@ -2988,12 +3076,12 @@ static int nested_vmx_check_host_state(struct kvm_vcpu *vcpu, CC(vmcs12->host_ss_selector == 0 && !ia32e)) return -EINVAL; - if (CC(is_noncanonical_address(vmcs12->host_fs_base, vcpu)) || - CC(is_noncanonical_address(vmcs12->host_gs_base, vcpu)) || - CC(is_noncanonical_address(vmcs12->host_gdtr_base, vcpu)) || - CC(is_noncanonical_address(vmcs12->host_idtr_base, vcpu)) || - CC(is_noncanonical_address(vmcs12->host_tr_base, vcpu)) || - CC(is_noncanonical_address(vmcs12->host_rip, vcpu))) + if (CC(is_noncanonical_base_address(vmcs12->host_fs_base, vcpu)) || + CC(is_noncanonical_base_address(vmcs12->host_gs_base, vcpu)) || + CC(is_noncanonical_base_address(vmcs12->host_gdtr_base, vcpu)) || + CC(is_noncanonical_base_address(vmcs12->host_idtr_base, vcpu)) || + CC(is_noncanonical_base_address(vmcs12->host_tr_base, vcpu)) || + CC(is_l1_noncanonical_address_on_vmexit(vmcs12->host_rip, vmcs12))) return -EINVAL; /* @@ -3111,7 +3199,7 @@ static int nested_vmx_check_guest_state(struct kvm_vcpu *vcpu, } if ((vmcs12->vm_entry_controls & VM_ENTRY_LOAD_BNDCFGS) && - (CC(is_noncanonical_address(vmcs12->guest_bndcfgs & PAGE_MASK, vcpu)) || + (CC(is_noncanonical_msr_address(vmcs12->guest_bndcfgs & PAGE_MASK, vcpu)) || CC((vmcs12->guest_bndcfgs & MSR_IA32_BNDCFGS_RSVD)))) return -EINVAL; @@ -3209,7 +3297,7 @@ static bool nested_get_evmcs_page(struct kvm_vcpu *vcpu) * L2 was running), map it here to make sure vmcs12 changes are * properly reflected. */ - if (guest_cpuid_has_evmcs(vcpu) && + if (guest_cpu_cap_has_evmcs(vcpu) && vmx->nested.hv_evmcs_vmptr == EVMPTR_MAP_PENDING) { enum nested_evmptrld_status evmptrld_status = nested_vmx_handle_enlightened_vmptrld(vcpu, false); @@ -3364,7 +3452,7 @@ static int nested_vmx_write_pml_buffer(struct kvm_vcpu *vcpu, gpa_t gpa) if (!nested_cpu_has_pml(vmcs12)) return 0; - if (vmcs12->guest_pml_index >= PML_ENTITY_NUM) { + if (vmcs12->guest_pml_index >= PML_LOG_NR_ENTRIES) { vmx->nested.pml_full = true; return 1; } @@ -3403,14 +3491,6 @@ static int nested_vmx_check_permission(struct kvm_vcpu *vcpu) return 1; } -static u8 vmx_has_apicv_interrupt(struct kvm_vcpu *vcpu) -{ - u8 rvi = vmx_get_rvi(); - u8 vppr = kvm_lapic_get_reg(vcpu->arch.apic, APIC_PROCPRI); - - return ((rvi & 0xf0) > (vppr & 0xf0)); -} - static void load_vmcs12_host_state(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12); @@ -3430,7 +3510,6 @@ enum nvmx_vmentry_status nested_vmx_enter_non_root_mode(struct kvm_vcpu *vcpu, struct vcpu_vmx *vmx = to_vmx(vcpu); struct vmcs12 *vmcs12 = get_vmcs12(vcpu); enum vm_entry_failure_code entry_failure_code; - bool evaluate_pending_interrupts; union vmx_exit_reason exit_reason = { .basic = EXIT_REASON_INVALID_STATE, .failed_vmentry = 1, @@ -3449,13 +3528,6 @@ enum nvmx_vmentry_status nested_vmx_enter_non_root_mode(struct kvm_vcpu *vcpu, kvm_service_local_tlb_flush_requests(vcpu); - evaluate_pending_interrupts = exec_controls_get(vmx) & - (CPU_BASED_INTR_WINDOW_EXITING | CPU_BASED_NMI_WINDOW_EXITING); - if (likely(!evaluate_pending_interrupts) && kvm_vcpu_apicv_active(vcpu)) - evaluate_pending_interrupts |= vmx_has_apicv_interrupt(vcpu); - if (!evaluate_pending_interrupts) - evaluate_pending_interrupts |= kvm_apic_has_pending_init_or_sipi(vcpu); - if (!vmx->nested.nested_run_pending || !(vmcs12->vm_entry_controls & VM_ENTRY_LOAD_DEBUG_CONTROLS)) vmx->nested.pre_vmenter_debugctl = vmcs_read64(GUEST_IA32_DEBUGCTL); @@ -3538,9 +3610,13 @@ enum nvmx_vmentry_status nested_vmx_enter_non_root_mode(struct kvm_vcpu *vcpu, * Re-evaluate pending events if L1 had a pending IRQ/NMI/INIT/SIPI * when it executed VMLAUNCH/VMRESUME, as entering non-root mode can * effectively unblock various events, e.g. INIT/SIPI cause VM-Exit - * unconditionally. + * unconditionally. Take care to pull data from vmcs01 as appropriate, + * e.g. when checking for interrupt windows, as vmcs02 is now loaded. */ - if (unlikely(evaluate_pending_interrupts)) + if ((__exec_controls_get(&vmx->vmcs01) & (CPU_BASED_INTR_WINDOW_EXITING | + CPU_BASED_NMI_WINDOW_EXITING)) || + kvm_apic_has_pending_init_or_sipi(vcpu) || + kvm_apic_has_interrupt(vcpu)) kvm_make_request(KVM_REQ_EVENT, vcpu); /* @@ -3673,14 +3749,6 @@ static int nested_vmx_run(struct kvm_vcpu *vcpu, bool launch) if (unlikely(status != NVMX_VMENTRY_SUCCESS)) goto vmentry_failed; - /* Emulate processing of posted interrupts on VM-Enter. */ - if (nested_cpu_has_posted_intr(vmcs12) && - kvm_apic_has_interrupt(vcpu) == vmx->nested.posted_intr_nv) { - vmx->nested.pi_pending = true; - kvm_make_request(KVM_REQ_EVENT, vcpu); - kvm_apic_clear_irr(vcpu, vmx->nested.posted_intr_nv); - } - /* Hide L1D cache contents from the nested guest. */ vmx->vcpu.arch.l1tf_flush_l1d = true; @@ -3713,7 +3781,7 @@ static int nested_vmx_run(struct kvm_vcpu *vcpu, bool launch) break; case GUEST_ACTIVITY_WAIT_SIPI: vmx->nested.nested_run_pending = 0; - vcpu->arch.mp_state = KVM_MP_STATE_INIT_RECEIVED; + kvm_set_mp_state(vcpu, KVM_MP_STATE_INIT_RECEIVED); break; default: break; @@ -3874,8 +3942,8 @@ static int vmx_complete_nested_posted_interrupt(struct kvm_vcpu *vcpu) if (!pi_test_and_clear_on(vmx->nested.pi_desc)) return 0; - max_irr = find_last_bit((unsigned long *)vmx->nested.pi_desc->pir, 256); - if (max_irr != 256) { + max_irr = pi_find_highest_vector(vmx->nested.pi_desc); + if (max_irr > 0) { vapic_page = vmx->nested.virtual_apic_map.hva; if (!vapic_page) goto mmio_needed; @@ -4006,10 +4074,46 @@ static bool nested_vmx_preemption_timer_pending(struct kvm_vcpu *vcpu) to_vmx(vcpu)->nested.preemption_timer_expired; } -static bool vmx_has_nested_events(struct kvm_vcpu *vcpu) +static bool vmx_has_nested_events(struct kvm_vcpu *vcpu, bool for_injection) { - return nested_vmx_preemption_timer_pending(vcpu) || - to_vmx(vcpu)->nested.mtf_pending; + struct vcpu_vmx *vmx = to_vmx(vcpu); + void *vapic = vmx->nested.virtual_apic_map.hva; + int max_irr, vppr; + + if (nested_vmx_preemption_timer_pending(vcpu) || + vmx->nested.mtf_pending) + return true; + + /* + * Virtual Interrupt Delivery doesn't require manual injection. Either + * the interrupt is already in GUEST_RVI and will be recognized by CPU + * at VM-Entry, or there is a KVM_REQ_EVENT pending and KVM will move + * the interrupt from the PIR to RVI prior to entering the guest. + */ + if (for_injection) + return false; + + if (!nested_cpu_has_vid(get_vmcs12(vcpu)) || + __vmx_interrupt_blocked(vcpu)) + return false; + + if (!vapic) + return false; + + vppr = *((u32 *)(vapic + APIC_PROCPRI)); + + max_irr = vmx_get_rvi(); + if ((max_irr & 0xf0) > (vppr & 0xf0)) + return true; + + if (vmx->nested.pi_pending && vmx->nested.pi_desc && + pi_test_on(vmx->nested.pi_desc)) { + max_irr = pi_find_highest_vector(vmx->nested.pi_desc); + if (max_irr > 0 && (max_irr & 0xf0) > (vppr & 0xf0)) + return true; + } + + return false; } /* @@ -4106,13 +4210,25 @@ static int vmx_check_nested_events(struct kvm_vcpu *vcpu) */ bool block_nested_exceptions = vmx->nested.nested_run_pending; /* - * New events (not exceptions) are only recognized at instruction + * Events that don't require injection, i.e. that are virtualized by + * hardware, aren't blocked by a pending VM-Enter as KVM doesn't need + * to regain control in order to deliver the event, and hardware will + * handle event ordering, e.g. with respect to injected exceptions. + * + * But, new events (not exceptions) are only recognized at instruction * boundaries. If an event needs reinjection, then KVM is handling a - * VM-Exit that occurred _during_ instruction execution; new events are - * blocked until the instruction completes. + * VM-Exit that occurred _during_ instruction execution; new events, + * irrespective of whether or not they're injected, are blocked until + * the instruction completes. + */ + bool block_non_injected_events = kvm_event_needs_reinjection(vcpu); + /* + * Inject events are blocked by nested VM-Enter, as KVM is responsible + * for managing priority between concurrent events, i.e. KVM needs to + * wait until after VM-Enter completes to deliver injected events. */ bool block_nested_events = block_nested_exceptions || - kvm_event_needs_reinjection(vcpu); + block_non_injected_events; if (lapic_in_kernel(vcpu) && test_bit(KVM_APIC_INIT, &apic->pending_events)) { @@ -4222,11 +4338,71 @@ static int vmx_check_nested_events(struct kvm_vcpu *vcpu) } if (kvm_cpu_has_interrupt(vcpu) && !vmx_interrupt_blocked(vcpu)) { + int irq; + + if (!nested_exit_on_intr(vcpu)) { + if (block_nested_events) + return -EBUSY; + + goto no_vmexit; + } + + if (!nested_exit_intr_ack_set(vcpu)) { + if (block_nested_events) + return -EBUSY; + + nested_vmx_vmexit(vcpu, EXIT_REASON_EXTERNAL_INTERRUPT, 0, 0); + return 0; + } + + irq = kvm_cpu_get_extint(vcpu); + if (irq != -1) { + if (block_nested_events) + return -EBUSY; + + nested_vmx_vmexit(vcpu, EXIT_REASON_EXTERNAL_INTERRUPT, + INTR_INFO_VALID_MASK | INTR_TYPE_EXT_INTR | irq, 0); + return 0; + } + + irq = kvm_apic_has_interrupt(vcpu); + if (WARN_ON_ONCE(irq < 0)) + goto no_vmexit; + + /* + * If the IRQ is L2's PI notification vector, process posted + * interrupts for L2 instead of injecting VM-Exit, as the + * detection/morphing architecturally occurs when the IRQ is + * delivered to the CPU. Note, only interrupts that are routed + * through the local APIC trigger posted interrupt processing, + * and enabling posted interrupts requires ACK-on-exit. + */ + if (irq == vmx->nested.posted_intr_nv) { + /* + * Nested posted interrupts are delivered via RVI, i.e. + * aren't injected by KVM, and so can be queued even if + * manual event injection is disallowed. + */ + if (block_non_injected_events) + return -EBUSY; + + vmx->nested.pi_pending = true; + kvm_apic_clear_irr(vcpu, irq); + goto no_vmexit; + } + if (block_nested_events) return -EBUSY; - if (!nested_exit_on_intr(vcpu)) - goto no_vmexit; - nested_vmx_vmexit(vcpu, EXIT_REASON_EXTERNAL_INTERRUPT, 0, 0); + + nested_vmx_vmexit(vcpu, EXIT_REASON_EXTERNAL_INTERRUPT, + INTR_INFO_VALID_MASK | INTR_TYPE_EXT_INTR | irq, 0); + + /* + * ACK the interrupt _after_ emulating VM-Exit, as the IRQ must + * be marked as in-service in vmcs01.GUEST_INTERRUPT_STATUS.SVI + * if APICv is active. + */ + kvm_apic_ack_interrupt(vcpu, irq); return 0; } @@ -4354,12 +4530,12 @@ static void copy_vmcs02_to_vmcs12_rare(struct kvm_vcpu *vcpu, cpu = get_cpu(); vmx->loaded_vmcs = &vmx->nested.vmcs02; - vmx_vcpu_load_vmcs(vcpu, cpu, &vmx->vmcs01); + vmx_vcpu_load_vmcs(vcpu, cpu); sync_vmcs02_to_vmcs12_rare(vcpu, vmcs12); vmx->loaded_vmcs = &vmx->vmcs01; - vmx_vcpu_load_vmcs(vcpu, cpu, &vmx->nested.vmcs02); + vmx_vcpu_load_vmcs(vcpu, cpu); put_cpu(); } @@ -4452,11 +4628,11 @@ static void sync_vmcs02_to_vmcs12(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12) */ static void prepare_vmcs12(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12, u32 vm_exit_reason, u32 exit_intr_info, - unsigned long exit_qualification) + unsigned long exit_qualification, u32 exit_insn_len) { /* update exit information fields: */ vmcs12->vm_exit_reason = vm_exit_reason; - if (to_vmx(vcpu)->exit_reason.enclave_mode) + if (vmx_get_exit_reason(vcpu).enclave_mode) vmcs12->vm_exit_reason |= VMX_EXIT_REASONS_SGX_ENCLAVE_MODE; vmcs12->exit_qualification = exit_qualification; @@ -4480,7 +4656,7 @@ static void prepare_vmcs12(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12, vm_exit_reason, exit_intr_info); vmcs12->vm_exit_intr_info = exit_intr_info; - vmcs12->vm_exit_instruction_len = vmcs_read32(VM_EXIT_INSTRUCTION_LEN); + vmcs12->vm_exit_instruction_len = exit_insn_len; vmcs12->vmx_instruction_info = vmcs_read32(VMX_INSTRUCTION_INFO); /* @@ -4628,7 +4804,7 @@ static void load_vmcs12_host_state(struct kvm_vcpu *vcpu, vmcs12->vm_exit_msr_load_count)) nested_vmx_abort(vcpu, VMX_ABORT_LOAD_HOST_MSR_FAIL); - to_vmx(vcpu)->emulation_required = vmx_emulation_required(vcpu); + to_vt(vcpu)->emulation_required = vmx_emulation_required(vcpu); } static inline u64 nested_vmx_get_vmcs01_guest_efer(struct vcpu_vmx *vmx) @@ -4640,7 +4816,7 @@ static inline u64 nested_vmx_get_vmcs01_guest_efer(struct vcpu_vmx *vmx) return vmcs_read64(GUEST_IA32_EFER); if (cpu_has_load_ia32_efer()) - return host_efer; + return kvm_host.efer; for (i = 0; i < vmx->msr_autoload.guest.nr; ++i) { if (vmx->msr_autoload.guest.val[i].index == MSR_EFER) @@ -4651,7 +4827,7 @@ static inline u64 nested_vmx_get_vmcs01_guest_efer(struct vcpu_vmx *vmx) if (efer_msr) return efer_msr->data; - return host_efer; + return kvm_host.efer; } static void nested_vmx_restore_host_state(struct kvm_vcpu *vcpu) @@ -4744,7 +4920,7 @@ static void nested_vmx_restore_host_state(struct kvm_vcpu *vcpu) goto vmabort; } - if (kvm_set_msr(vcpu, h.index, h.value)) { + if (kvm_set_msr_with_filter(vcpu, h.index, h.value)) { pr_debug_ratelimited( "%s WRMSR failed (%u, 0x%x, 0x%llx)\n", __func__, j, h.index, h.value); @@ -4764,8 +4940,9 @@ vmabort: * and modify vmcs12 to make it see what it would expect to see there if * L2 was its real guest. Must only be called when in L2 (is_guest_mode()) */ -void nested_vmx_vmexit(struct kvm_vcpu *vcpu, u32 vm_exit_reason, - u32 exit_intr_info, unsigned long exit_qualification) +void __nested_vmx_vmexit(struct kvm_vcpu *vcpu, u32 vm_exit_reason, + u32 exit_intr_info, unsigned long exit_qualification, + u32 exit_insn_len) { struct vcpu_vmx *vmx = to_vmx(vcpu); struct vmcs12 *vmcs12 = get_vmcs12(vcpu); @@ -4815,7 +4992,8 @@ void nested_vmx_vmexit(struct kvm_vcpu *vcpu, u32 vm_exit_reason, if (vm_exit_reason != -1) prepare_vmcs12(vcpu, vmcs12, vm_exit_reason, - exit_intr_info, exit_qualification); + exit_intr_info, exit_qualification, + exit_insn_len); /* * Must happen outside of sync_vmcs02_to_vmcs12() as it will @@ -4852,16 +5030,7 @@ void nested_vmx_vmexit(struct kvm_vcpu *vcpu, u32 vm_exit_reason, vmx_switch_vmcs(vcpu, &vmx->vmcs01); - /* - * If IBRS is advertised to the vCPU, KVM must flush the indirect - * branch predictors when transitioning from L2 to L1, as L1 expects - * hardware (KVM in this case) to provide separate predictor modes. - * Bare metal isolates VMX root (host) from VMX non-root (guest), but - * doesn't isolate different VMCSs, i.e. in this case, doesn't provide - * separate modes for L2 vs L1. - */ - if (guest_cpuid_has(vcpu, X86_FEATURE_SPEC_CTRL)) - indirect_branch_prediction_barrier(); + kvm_nested_vmexit_handle_ibrs(vcpu); /* Update any VMCS fields that might have changed while L2 ran */ vmcs_write32(VM_EXIT_MSR_LOAD_COUNT, vmx->msr_autoload.host.nr); @@ -4883,11 +5052,7 @@ void nested_vmx_vmexit(struct kvm_vcpu *vcpu, u32 vm_exit_reason, vmx_update_cpu_dirty_logging(vcpu); } - /* Unpin physical memory we referred to in vmcs02 */ - kvm_vcpu_unmap(vcpu, &vmx->nested.apic_access_page_map, false); - kvm_vcpu_unmap(vcpu, &vmx->nested.virtual_apic_map, true); - kvm_vcpu_unmap(vcpu, &vmx->nested.pi_desc_map, true); - vmx->nested.pi_desc = NULL; + nested_put_vmcs12_pages(vcpu); if (vmx->nested.reload_vmcs01_apic_access_page) { vmx->nested.reload_vmcs01_apic_access_page = false; @@ -4899,22 +5064,19 @@ void nested_vmx_vmexit(struct kvm_vcpu *vcpu, u32 vm_exit_reason, kvm_make_request(KVM_REQ_APICV_UPDATE, vcpu); } + if (vmx->nested.update_vmcs01_hwapic_isr) { + vmx->nested.update_vmcs01_hwapic_isr = false; + kvm_apic_update_hwapic_isr(vcpu); + } + if ((vm_exit_reason != -1) && (enable_shadow_vmcs || nested_vmx_is_evmptr12_valid(vmx))) vmx->nested.need_vmcs12_to_shadow_sync = true; /* in case we halted in L2 */ - vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE; + kvm_set_mp_state(vcpu, KVM_MP_STATE_RUNNABLE); if (likely(!vmx->fail)) { - if ((u16)vm_exit_reason == EXIT_REASON_EXTERNAL_INTERRUPT && - nested_exit_intr_ack_set(vcpu)) { - int irq = kvm_cpu_get_interrupt(vcpu); - WARN_ON(irq < 0); - vmcs12->vm_exit_intr_info = irq | - INTR_INFO_VALID_MASK | INTR_TYPE_EXT_INTR; - } - if (vm_exit_reason != -1) trace_kvm_nested_vmexit_inject(vmcs12->vm_exit_reason, vmcs12->exit_qualification, @@ -4925,6 +5087,17 @@ void nested_vmx_vmexit(struct kvm_vcpu *vcpu, u32 vm_exit_reason, load_vmcs12_host_state(vcpu, vmcs12); + /* + * Process events if an injectable IRQ or NMI is pending, even + * if the event is blocked (RFLAGS.IF is cleared on VM-Exit). + * If an event became pending while L2 was active, KVM needs to + * either inject the event or request an IRQ/NMI window. SMIs + * don't need to be processed as SMM is mutually exclusive with + * non-root mode. INIT/SIPI don't need to be checked as INIT + * is blocked post-VMXON, and SIPIs are ignored. + */ + if (kvm_cpu_has_injectable_intr(vcpu) || vcpu->arch.nmi_pending) + kvm_make_request(KVM_REQ_EVENT, vcpu); return; } @@ -5031,7 +5204,7 @@ int get_vmx_mem_address(struct kvm_vcpu *vcpu, unsigned long exit_qualification, * non-canonical form. This is the only check on the memory * destination for long mode! */ - exn = is_noncanonical_address(*ret, vcpu); + exn = is_noncanonical_address(*ret, vcpu, 0); } else { /* * When not in long mode, the virtual/linear address is @@ -5157,9 +5330,8 @@ static int enter_vmx_operation(struct kvm_vcpu *vcpu) if (enable_shadow_vmcs && !alloc_shadow_vmcs(vcpu)) goto out_shadow_vmcs; - hrtimer_init(&vmx->nested.preemption_timer, CLOCK_MONOTONIC, - HRTIMER_MODE_ABS_PINNED); - vmx->nested.preemption_timer.function = vmx_preemption_timer_fn; + hrtimer_setup(&vmx->nested.preemption_timer, vmx_preemption_timer_fn, CLOCK_MONOTONIC, + HRTIMER_MODE_ABS_PINNED); vmx->nested.vpid02 = allocate_vpid(); @@ -5828,6 +6000,12 @@ static int handle_invvpid(struct kvm_vcpu *vcpu) return nested_vmx_fail(vcpu, VMXERR_INVALID_OPERAND_TO_INVEPT_INVVPID); + /* + * Always flush the effective vpid02, i.e. never flush the current VPID + * and never explicitly flush vpid01. INVVPID targets a VPID, not a + * VMCS, and so whether or not the current vmcs12 has VPID enabled is + * irrelevant (and there may not be a loaded vmcs12). + */ vpid02 = nested_get_vpid02(vcpu); switch (type) { case VMX_VPID_EXTENT_INDIVIDUAL_ADDR: @@ -5836,7 +6014,7 @@ static int handle_invvpid(struct kvm_vcpu *vcpu) * invalidation. */ if (!operand.vpid || - is_noncanonical_address(operand.gla, vcpu)) + is_noncanonical_invlpg_address(operand.gla, vcpu)) return nested_vmx_fail(vcpu, VMXERR_INVALID_OPERAND_TO_INVEPT_INVVPID); vpid_sync_vcpu_addr(vpid02, operand.gla); @@ -5950,7 +6128,7 @@ fail: * nested VM-Exit. Pass the original exit reason, i.e. don't hardcode * EXIT_REASON_VMFUNC as the exit reason. */ - nested_vmx_vmexit(vcpu, vmx->exit_reason.full, + nested_vmx_vmexit(vcpu, vmx->vt.exit_reason.full, vmx_get_intr_info(vcpu), vmx_get_exit_qual(vcpu)); return 1; @@ -6130,7 +6308,7 @@ static bool nested_vmx_exit_handled_encls(struct kvm_vcpu *vcpu, { u32 encls_leaf; - if (!guest_cpuid_has(vcpu, X86_FEATURE_SGX) || + if (!guest_cpu_cap_has(vcpu, X86_FEATURE_SGX) || !nested_cpu_has2(vmcs12, SECONDARY_EXEC_ENCLS_EXITING)) return false; @@ -6208,6 +6386,8 @@ static bool nested_vmx_l0_wants_exit(struct kvm_vcpu *vcpu, else if (is_alignment_check(intr_info) && !vmx_guest_inject_ac(vcpu)) return true; + else if (is_ve_fault(intr_info)) + return true; return false; case EXIT_REASON_EXTERNAL_INTERRUPT: return true; @@ -6393,7 +6573,7 @@ static bool nested_vmx_l1_wants_exit(struct kvm_vcpu *vcpu, bool nested_vmx_reflect_vmexit(struct kvm_vcpu *vcpu) { struct vcpu_vmx *vmx = to_vmx(vcpu); - union vmx_exit_reason exit_reason = vmx->exit_reason; + union vmx_exit_reason exit_reason = vmx->vt.exit_reason; unsigned long exit_qual; u32 exit_intr_info; @@ -6466,7 +6646,7 @@ static int vmx_get_nested_state(struct kvm_vcpu *vcpu, vmx = to_vmx(vcpu); vmcs12 = get_vmcs12(vcpu); - if (guest_can_use(vcpu, X86_FEATURE_VMX) && + if (guest_cpu_cap_has(vcpu, X86_FEATURE_VMX) && (vmx->nested.vmxon || vmx->nested.smm.vmxon)) { kvm_state.hdr.vmx.vmxon_pa = vmx->nested.vmxon_ptr; kvm_state.hdr.vmx.vmcs12_pa = vmx->nested.current_vmptr; @@ -6607,7 +6787,7 @@ static int vmx_set_nested_state(struct kvm_vcpu *vcpu, if (kvm_state->flags & ~KVM_STATE_NESTED_EVMCS) return -EINVAL; } else { - if (!guest_can_use(vcpu, X86_FEATURE_VMX)) + if (!guest_cpu_cap_has(vcpu, X86_FEATURE_VMX)) return -EINVAL; if (!page_address_valid(vcpu, kvm_state->hdr.vmx.vmxon_pa)) @@ -6641,7 +6821,7 @@ static int vmx_set_nested_state(struct kvm_vcpu *vcpu, return -EINVAL; if ((kvm_state->flags & KVM_STATE_NESTED_EVMCS) && - (!guest_can_use(vcpu, X86_FEATURE_VMX) || + (!guest_cpu_cap_has(vcpu, X86_FEATURE_VMX) || !vmx->nested.enlightened_vmcs_enabled)) return -EINVAL; @@ -6987,7 +7167,7 @@ static void nested_vmx_setup_misc_data(struct vmcs_config *vmcs_conf, { msrs->misc_low = (u32)vmcs_conf->misc & VMX_MISC_SAVE_EFER_LMA; msrs->misc_low |= - MSR_IA32_VMX_MISC_VMWRITE_SHADOW_RO_FIELDS | + VMX_MISC_VMWRITE_SHADOW_RO_FIELDS | VMX_MISC_EMULATED_PREEMPTION_TIMER_RATE | VMX_MISC_ACTIVITY_HLT | VMX_MISC_ACTIVITY_WAIT_SIPI; @@ -7002,12 +7182,10 @@ static void nested_vmx_setup_basic(struct nested_vmx_msrs *msrs) * guest, and the VMCS structure we give it - not about the * VMX support of the underlying hardware. */ - msrs->basic = - VMCS12_REVISION | - VMX_BASIC_TRUE_CTLS | - ((u64)VMCS12_SIZE << VMX_BASIC_VMCS_SIZE_SHIFT) | - (VMX_BASIC_MEM_TYPE_WB << VMX_BASIC_MEM_TYPE_SHIFT); + msrs->basic = vmx_basic_encode_vmcs_info(VMCS12_REVISION, VMCS12_SIZE, + X86_MEMTYPE_WB); + msrs->basic |= VMX_BASIC_TRUE_CTLS; if (cpu_has_vmx_basic_inout()) msrs->basic |= VMX_BASIC_INOUT; } @@ -7025,8 +7203,8 @@ static void nested_vmx_setup_cr_fixed(struct nested_vmx_msrs *msrs) msrs->cr4_fixed0 = VMXON_CR4_ALWAYSON; /* These MSRs specify bits which the guest must keep fixed off. */ - rdmsrl(MSR_IA32_VMX_CR0_FIXED1, msrs->cr0_fixed1); - rdmsrl(MSR_IA32_VMX_CR4_FIXED1, msrs->cr4_fixed1); + rdmsrq(MSR_IA32_VMX_CR0_FIXED1, msrs->cr0_fixed1); + rdmsrq(MSR_IA32_VMX_CR4_FIXED1, msrs->cr4_fixed1); if (vmx_umip_emulated()) msrs->cr4_fixed1 |= X86_CR4_UMIP; diff --git a/arch/x86/kvm/vmx/nested.h b/arch/x86/kvm/vmx/nested.h index cce4e2aa30fb..6eedcfc91070 100644 --- a/arch/x86/kvm/vmx/nested.h +++ b/arch/x86/kvm/vmx/nested.h @@ -26,8 +26,26 @@ void nested_vmx_free_vcpu(struct kvm_vcpu *vcpu); enum nvmx_vmentry_status nested_vmx_enter_non_root_mode(struct kvm_vcpu *vcpu, bool from_vmentry); bool nested_vmx_reflect_vmexit(struct kvm_vcpu *vcpu); -void nested_vmx_vmexit(struct kvm_vcpu *vcpu, u32 vm_exit_reason, - u32 exit_intr_info, unsigned long exit_qualification); +void __nested_vmx_vmexit(struct kvm_vcpu *vcpu, u32 vm_exit_reason, + u32 exit_intr_info, unsigned long exit_qualification, + u32 exit_insn_len); + +static inline void nested_vmx_vmexit(struct kvm_vcpu *vcpu, u32 vm_exit_reason, + u32 exit_intr_info, + unsigned long exit_qualification) +{ + u32 exit_insn_len; + + if (to_vmx(vcpu)->fail || vm_exit_reason == -1 || + (vm_exit_reason & VMX_EXIT_REASONS_FAILED_VMENTRY)) + exit_insn_len = 0; + else + exit_insn_len = vmcs_read32(VM_EXIT_INSTRUCTION_LEN); + + __nested_vmx_vmexit(vcpu, vm_exit_reason, exit_intr_info, + exit_qualification, exit_insn_len); +} + void nested_sync_vmcs12_to_shadow(struct kvm_vcpu *vcpu); int vmx_set_vmx_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 data); int vmx_get_vmx_msr(struct nested_vmx_msrs *msrs, u32 msr_index, u64 *pdata); @@ -39,11 +57,17 @@ bool nested_vmx_check_io_bitmaps(struct kvm_vcpu *vcpu, unsigned int port, static inline struct vmcs12 *get_vmcs12(struct kvm_vcpu *vcpu) { + lockdep_assert_once(lockdep_is_held(&vcpu->mutex) || + !refcount_read(&vcpu->kvm->users_count)); + return to_vmx(vcpu)->nested.cached_vmcs12; } static inline struct vmcs12 *get_shadow_vmcs12(struct kvm_vcpu *vcpu) { + lockdep_assert_once(lockdep_is_held(&vcpu->mutex) || + !refcount_read(&vcpu->kvm->users_count)); + return to_vmx(vcpu)->nested.cached_shadow_vmcs12; } @@ -109,7 +133,7 @@ static inline unsigned nested_cpu_vmx_misc_cr3_count(struct kvm_vcpu *vcpu) static inline bool nested_cpu_has_vmwrite_any_field(struct kvm_vcpu *vcpu) { return to_vmx(vcpu)->nested.msrs.misc_low & - MSR_IA32_VMX_MISC_VMWRITE_SHADOW_RO_FIELDS; + VMX_MISC_VMWRITE_SHADOW_RO_FIELDS; } static inline bool nested_cpu_has_zero_length_injection(struct kvm_vcpu *vcpu) diff --git a/arch/x86/kvm/vmx/pmu_intel.c b/arch/x86/kvm/vmx/pmu_intel.c index 12ade343a17e..bbf4509f32d0 100644 --- a/arch/x86/kvm/vmx/pmu_intel.c +++ b/arch/x86/kvm/vmx/pmu_intel.c @@ -13,12 +13,14 @@ #include <linux/types.h> #include <linux/kvm_host.h> #include <linux/perf_event.h> +#include <asm/msr.h> #include <asm/perf_event.h> #include "x86.h" #include "cpuid.h" #include "lapic.h" #include "nested.h" #include "pmu.h" +#include "tdx.h" /* * Perf's "BASE" is wildly misleading, architectural PMUs use bits 31:16 of ECX @@ -34,6 +36,24 @@ #define MSR_PMC_FULL_WIDTH_BIT (MSR_IA32_PMC0 - MSR_IA32_PERFCTR0) +static struct lbr_desc *vcpu_to_lbr_desc(struct kvm_vcpu *vcpu) +{ + if (is_td_vcpu(vcpu)) + return NULL; + + return &to_vmx(vcpu)->lbr_desc; +} + +static struct x86_pmu_lbr *vcpu_to_lbr_records(struct kvm_vcpu *vcpu) +{ + if (is_td_vcpu(vcpu)) + return NULL; + + return &to_vmx(vcpu)->lbr_desc.records; +} + +#pragma GCC poison to_vmx + static void reprogram_fixed_counters(struct kvm_pmu *pmu, u64 data) { struct kvm_pmc *pmc; @@ -110,7 +130,7 @@ static struct kvm_pmc *intel_rdpmc_ecx_to_pmc(struct kvm_vcpu *vcpu, static inline u64 vcpu_get_perf_capabilities(struct kvm_vcpu *vcpu) { - if (!guest_cpuid_has(vcpu, X86_FEATURE_PDCM)) + if (!guest_cpu_cap_has(vcpu, X86_FEATURE_PDCM)) return 0; return vcpu->arch.perf_capabilities; @@ -129,6 +149,22 @@ static inline struct kvm_pmc *get_fw_gp_pmc(struct kvm_pmu *pmu, u32 msr) return get_gp_pmc(pmu, msr, MSR_IA32_PMC0); } +static bool intel_pmu_lbr_is_compatible(struct kvm_vcpu *vcpu) +{ + if (is_td_vcpu(vcpu)) + return false; + + return cpuid_model_is_consistent(vcpu); +} + +bool intel_pmu_lbr_is_enabled(struct kvm_vcpu *vcpu) +{ + if (is_td_vcpu(vcpu)) + return false; + + return !!vcpu_to_lbr_records(vcpu)->nr; +} + static bool intel_pmu_is_valid_lbr_msr(struct kvm_vcpu *vcpu, u32 index) { struct x86_pmu_lbr *records = vcpu_to_lbr_records(vcpu); @@ -160,7 +196,7 @@ static bool intel_is_valid_msr(struct kvm_vcpu *vcpu, u32 msr) ret = vcpu_get_perf_capabilities(vcpu) & PERF_CAP_PEBS_FORMAT; break; case MSR_IA32_DS_AREA: - ret = guest_cpuid_has(vcpu, X86_FEATURE_DS); + ret = guest_cpu_cap_has(vcpu, X86_FEATURE_DS); break; case MSR_PEBS_DATA_CFG: perf_capabilities = vcpu_get_perf_capabilities(vcpu); @@ -194,6 +230,9 @@ static inline void intel_pmu_release_guest_lbr_event(struct kvm_vcpu *vcpu) { struct lbr_desc *lbr_desc = vcpu_to_lbr_desc(vcpu); + if (!lbr_desc) + return; + if (lbr_desc->event) { perf_event_release_kernel(lbr_desc->event); lbr_desc->event = NULL; @@ -235,6 +274,9 @@ int intel_pmu_create_guest_lbr_event(struct kvm_vcpu *vcpu) PERF_SAMPLE_BRANCH_USER, }; + if (WARN_ON_ONCE(!lbr_desc)) + return 0; + if (unlikely(lbr_desc->event)) { __set_bit(INTEL_PMC_IDX_FIXED_VLBR, pmu->pmc_in_use); return 0; @@ -279,9 +321,9 @@ static bool intel_pmu_handle_lbr_msrs_access(struct kvm_vcpu *vcpu, local_irq_disable(); if (lbr_desc->event->state == PERF_EVENT_STATE_ACTIVE) { if (read) - rdmsrl(index, msr_info->data); + rdmsrq(index, msr_info->data); else - wrmsrl(index, msr_info->data); + wrmsrq(index, msr_info->data); __set_bit(INTEL_PMC_IDX_FIXED_VLBR, vcpu_to_pmu(vcpu)->pmc_in_use); local_irq_enable(); return true; @@ -348,14 +390,14 @@ static int intel_pmu_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info) switch (msr) { case MSR_CORE_PERF_FIXED_CTR_CTRL: - if (data & pmu->fixed_ctr_ctrl_mask) + if (data & pmu->fixed_ctr_ctrl_rsvd) return 1; if (pmu->fixed_ctr_ctrl != data) reprogram_fixed_counters(pmu, data); break; case MSR_IA32_PEBS_ENABLE: - if (data & pmu->pebs_enable_mask) + if (data & pmu->pebs_enable_rsvd) return 1; if (pmu->pebs_enable != data) { @@ -365,13 +407,13 @@ static int intel_pmu_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info) } break; case MSR_IA32_DS_AREA: - if (is_noncanonical_address(data, vcpu)) + if (is_noncanonical_msr_address(data, vcpu)) return 1; pmu->ds_area = data; break; case MSR_PEBS_DATA_CFG: - if (data & pmu->pebs_data_cfg_mask) + if (data & pmu->pebs_data_cfg_rsvd) return 1; pmu->pebs_data_cfg = data; @@ -436,8 +478,8 @@ static __always_inline u64 intel_get_fixed_pmc_eventsel(unsigned int index) }; u64 eventsel; - BUILD_BUG_ON(ARRAY_SIZE(fixed_pmc_perf_ids) != KVM_PMC_MAX_FIXED); - BUILD_BUG_ON(index >= KVM_PMC_MAX_FIXED); + BUILD_BUG_ON(ARRAY_SIZE(fixed_pmc_perf_ids) != KVM_MAX_NR_INTEL_FIXED_COUTNERS); + BUILD_BUG_ON(index >= KVM_MAX_NR_INTEL_FIXED_COUTNERS); /* * Yell if perf reports support for a fixed counter but perf doesn't @@ -448,6 +490,14 @@ static __always_inline u64 intel_get_fixed_pmc_eventsel(unsigned int index) return eventsel; } +static void intel_pmu_enable_fixed_counter_bits(struct kvm_pmu *pmu, u64 bits) +{ + int i; + + for (i = 0; i < pmu->nr_arch_fixed_counters; i++) + pmu->fixed_ctr_ctrl_rsvd &= ~intel_fixed_bits_by_idx(i, bits); +} + static void intel_pmu_refresh(struct kvm_vcpu *vcpu) { struct kvm_pmu *pmu = vcpu_to_pmu(vcpu); @@ -456,8 +506,10 @@ static void intel_pmu_refresh(struct kvm_vcpu *vcpu) union cpuid10_eax eax; union cpuid10_edx edx; u64 perf_capabilities; - u64 counter_mask; - int i; + u64 counter_rsvd; + + if (!lbr_desc) + return; memset(&lbr_desc->records, 0, sizeof(lbr_desc->records)); @@ -501,22 +553,24 @@ static void intel_pmu_refresh(struct kvm_vcpu *vcpu) ((u64)1 << edx.split.bit_width_fixed) - 1; } - for (i = 0; i < pmu->nr_arch_fixed_counters; i++) - pmu->fixed_ctr_ctrl_mask &= ~(0xbull << (i * 4)); - counter_mask = ~(((1ull << pmu->nr_arch_gp_counters) - 1) | + intel_pmu_enable_fixed_counter_bits(pmu, INTEL_FIXED_0_KERNEL | + INTEL_FIXED_0_USER | + INTEL_FIXED_0_ENABLE_PMI); + + counter_rsvd = ~(((1ull << pmu->nr_arch_gp_counters) - 1) | (((1ull << pmu->nr_arch_fixed_counters) - 1) << KVM_FIXED_PMC_BASE_IDX)); - pmu->global_ctrl_mask = counter_mask; + pmu->global_ctrl_rsvd = counter_rsvd; /* * GLOBAL_STATUS and GLOBAL_OVF_CONTROL (a.k.a. GLOBAL_STATUS_RESET) * share reserved bit definitions. The kernel just happens to use * OVF_CTRL for the names. */ - pmu->global_status_mask = pmu->global_ctrl_mask + pmu->global_status_rsvd = pmu->global_ctrl_rsvd & ~(MSR_CORE_PERF_GLOBAL_OVF_CTRL_OVF_BUF | MSR_CORE_PERF_GLOBAL_OVF_CTRL_COND_CHGD); if (vmx_pt_mode_is_host_guest()) - pmu->global_status_mask &= + pmu->global_status_rsvd &= ~MSR_CORE_PERF_GLOBAL_OVF_CTRL_TRACE_TOPA_PMI; entry = kvm_find_cpuid_entry_index(vcpu, 7, 0); @@ -533,9 +587,9 @@ static void intel_pmu_refresh(struct kvm_vcpu *vcpu) INTEL_PMC_MAX_GENERIC, pmu->nr_arch_fixed_counters); perf_capabilities = vcpu_get_perf_capabilities(vcpu); - if (cpuid_model_is_consistent(vcpu) && + if (intel_pmu_lbr_is_compatible(vcpu) && (perf_capabilities & PMU_CAP_LBR_FMT)) - x86_perf_get_lbr(&lbr_desc->records); + memcpy(&lbr_desc->records, &vmx_lbr_caps, sizeof(vmx_lbr_caps)); else lbr_desc->records.nr = 0; @@ -544,15 +598,12 @@ static void intel_pmu_refresh(struct kvm_vcpu *vcpu) if (perf_capabilities & PERF_CAP_PEBS_FORMAT) { if (perf_capabilities & PERF_CAP_PEBS_BASELINE) { - pmu->pebs_enable_mask = counter_mask; + pmu->pebs_enable_rsvd = counter_rsvd; pmu->reserved_bits &= ~ICL_EVENTSEL_ADAPTIVE; - for (i = 0; i < pmu->nr_arch_fixed_counters; i++) { - pmu->fixed_ctr_ctrl_mask &= - ~(1ULL << (KVM_FIXED_PMC_BASE_IDX + i * 4)); - } - pmu->pebs_data_cfg_mask = ~0xff00000full; + pmu->pebs_data_cfg_rsvd = ~0xff00000full; + intel_pmu_enable_fixed_counter_bits(pmu, ICL_FIXED_0_ADAPTIVE); } else { - pmu->pebs_enable_mask = + pmu->pebs_enable_rsvd = ~((1ull << pmu->nr_arch_gp_counters) - 1); } } @@ -564,14 +615,17 @@ static void intel_pmu_init(struct kvm_vcpu *vcpu) struct kvm_pmu *pmu = vcpu_to_pmu(vcpu); struct lbr_desc *lbr_desc = vcpu_to_lbr_desc(vcpu); - for (i = 0; i < KVM_INTEL_PMC_MAX_GENERIC; i++) { + if (!lbr_desc) + return; + + for (i = 0; i < KVM_MAX_NR_INTEL_GP_COUNTERS; i++) { pmu->gp_counters[i].type = KVM_PMC_GP; pmu->gp_counters[i].vcpu = vcpu; pmu->gp_counters[i].idx = i; pmu->gp_counters[i].current_config = 0; } - for (i = 0; i < KVM_PMC_MAX_FIXED; i++) { + for (i = 0; i < KVM_MAX_NR_INTEL_FIXED_COUTNERS; i++) { pmu->fixed_counters[i].type = KVM_PMC_FIXED; pmu->fixed_counters[i].vcpu = vcpu; pmu->fixed_counters[i].idx = i + KVM_FIXED_PMC_BASE_IDX; @@ -671,6 +725,9 @@ void vmx_passthrough_lbr_msrs(struct kvm_vcpu *vcpu) struct kvm_pmu *pmu = vcpu_to_pmu(vcpu); struct lbr_desc *lbr_desc = vcpu_to_lbr_desc(vcpu); + if (WARN_ON_ONCE(!lbr_desc)) + return; + if (!lbr_desc->event) { vmx_disable_lbr_msrs_passthrough(vcpu); if (vmcs_read64(GUEST_IA32_DEBUGCTL) & DEBUGCTLMSR_LBR) @@ -731,6 +788,6 @@ struct kvm_pmu_ops intel_pmu_ops __initdata = { .deliver_pmi = intel_pmu_deliver_pmi, .cleanup = intel_pmu_cleanup, .EVENTSEL_EVENT = ARCH_PERFMON_EVENTSEL_EVENT, - .MAX_NR_GP_COUNTERS = KVM_INTEL_PMC_MAX_GENERIC, + .MAX_NR_GP_COUNTERS = KVM_MAX_NR_INTEL_GP_COUNTERS, .MIN_NR_GP_COUNTERS = 1, }; diff --git a/arch/x86/kvm/vmx/pmu_intel.h b/arch/x86/kvm/vmx/pmu_intel.h new file mode 100644 index 000000000000..5620d0882cdc --- /dev/null +++ b/arch/x86/kvm/vmx/pmu_intel.h @@ -0,0 +1,28 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef __KVM_X86_VMX_PMU_INTEL_H +#define __KVM_X86_VMX_PMU_INTEL_H + +#include <linux/kvm_host.h> + +bool intel_pmu_lbr_is_enabled(struct kvm_vcpu *vcpu); +int intel_pmu_create_guest_lbr_event(struct kvm_vcpu *vcpu); + +struct lbr_desc { + /* Basic info about guest LBR records. */ + struct x86_pmu_lbr records; + + /* + * Emulate LBR feature via passthrough LBR registers when the + * per-vcpu guest LBR event is scheduled on the current pcpu. + * + * The records may be inaccurate if the host reclaims the LBR. + */ + struct perf_event *event; + + /* True if LBRs are marked as not intercepted in the MSR bitmap */ + bool msr_passthrough; +}; + +extern struct x86_pmu_lbr vmx_lbr_caps; + +#endif /* __KVM_X86_VMX_PMU_INTEL_H */ diff --git a/arch/x86/kvm/vmx/posted_intr.c b/arch/x86/kvm/vmx/posted_intr.c index af662312fd07..5c615e5845bf 100644 --- a/arch/x86/kvm/vmx/posted_intr.c +++ b/arch/x86/kvm/vmx/posted_intr.c @@ -11,6 +11,7 @@ #include "posted_intr.h" #include "trace.h" #include "vmx.h" +#include "tdx.h" /* * Maintain a per-CPU list of vCPUs that need to be awakened by wakeup_handler() @@ -31,9 +32,11 @@ static DEFINE_PER_CPU(struct list_head, wakeup_vcpus_on_cpu); */ static DEFINE_PER_CPU(raw_spinlock_t, wakeup_vcpus_on_cpu_lock); -static inline struct pi_desc *vcpu_to_pi_desc(struct kvm_vcpu *vcpu) +#define PI_LOCK_SCHED_OUT SINGLE_DEPTH_NESTING + +static struct pi_desc *vcpu_to_pi_desc(struct kvm_vcpu *vcpu) { - return &(to_vmx(vcpu)->pi_desc); + return &(to_vt(vcpu)->pi_desc); } static int pi_try_set_control(struct pi_desc *pi_desc, u64 *pold, u64 new) @@ -53,7 +56,7 @@ static int pi_try_set_control(struct pi_desc *pi_desc, u64 *pold, u64 new) void vmx_vcpu_pi_load(struct kvm_vcpu *vcpu, int cpu) { struct pi_desc *pi_desc = vcpu_to_pi_desc(vcpu); - struct vcpu_vmx *vmx = to_vmx(vcpu); + struct vcpu_vt *vt = to_vt(vcpu); struct pi_desc old, new; unsigned long flags; unsigned int dest; @@ -89,9 +92,20 @@ void vmx_vcpu_pi_load(struct kvm_vcpu *vcpu, int cpu) * current pCPU if the task was migrated. */ if (pi_desc->nv == POSTED_INTR_WAKEUP_VECTOR) { - raw_spin_lock(&per_cpu(wakeup_vcpus_on_cpu_lock, vcpu->cpu)); - list_del(&vmx->pi_wakeup_list); - raw_spin_unlock(&per_cpu(wakeup_vcpus_on_cpu_lock, vcpu->cpu)); + raw_spinlock_t *spinlock = &per_cpu(wakeup_vcpus_on_cpu_lock, vcpu->cpu); + + /* + * In addition to taking the wakeup lock for the regular/IRQ + * context, tell lockdep it is being taken for the "sched out" + * context as well. vCPU loads happens in task context, and + * this is taking the lock of the *previous* CPU, i.e. can race + * with both the scheduler and the wakeup handler. + */ + raw_spin_lock(spinlock); + spin_acquire(&spinlock->dep_map, PI_LOCK_SCHED_OUT, 0, _RET_IP_); + list_del(&vt->pi_wakeup_list); + spin_release(&spinlock->dep_map, _RET_IP_); + raw_spin_unlock(spinlock); } dest = cpu_physical_id(cpu); @@ -107,7 +121,7 @@ void vmx_vcpu_pi_load(struct kvm_vcpu *vcpu, int cpu) * handle task migration (@cpu != vcpu->cpu). */ new.ndst = dest; - new.sn = 0; + __pi_clear_sn(&new); /* * Restore the notification vector; in the blocking case, the @@ -134,9 +148,8 @@ after_clear_sn: static bool vmx_can_use_vtd_pi(struct kvm *kvm) { - return irqchip_in_kernel(kvm) && enable_apicv && - kvm_arch_has_assigned_device(kvm) && - irq_remapping_cap(IRQ_POSTING_CAP); + return irqchip_in_kernel(kvm) && kvm_arch_has_irq_bypass() && + kvm_arch_has_assigned_device(kvm); } /* @@ -146,18 +159,30 @@ static bool vmx_can_use_vtd_pi(struct kvm *kvm) static void pi_enable_wakeup_handler(struct kvm_vcpu *vcpu) { struct pi_desc *pi_desc = vcpu_to_pi_desc(vcpu); - struct vcpu_vmx *vmx = to_vmx(vcpu); + struct vcpu_vt *vt = to_vt(vcpu); struct pi_desc old, new; - unsigned long flags; - local_irq_save(flags); + lockdep_assert_irqs_disabled(); - raw_spin_lock(&per_cpu(wakeup_vcpus_on_cpu_lock, vcpu->cpu)); - list_add_tail(&vmx->pi_wakeup_list, + /* + * Acquire the wakeup lock using the "sched out" context to workaround + * a lockdep false positive. When this is called, schedule() holds + * various per-CPU scheduler locks. When the wakeup handler runs, it + * holds this CPU's wakeup lock while calling try_to_wake_up(), which + * can eventually take the aforementioned scheduler locks, which causes + * lockdep to assume there is deadlock. + * + * Deadlock can't actually occur because IRQs are disabled for the + * entirety of the sched_out critical section, i.e. the wakeup handler + * can't run while the scheduler locks are held. + */ + raw_spin_lock_nested(&per_cpu(wakeup_vcpus_on_cpu_lock, vcpu->cpu), + PI_LOCK_SCHED_OUT); + list_add_tail(&vt->pi_wakeup_list, &per_cpu(wakeup_vcpus_on_cpu, vcpu->cpu)); raw_spin_unlock(&per_cpu(wakeup_vcpus_on_cpu_lock, vcpu->cpu)); - WARN(pi_desc->sn, "PI descriptor SN field set before blocking"); + WARN(pi_test_sn(pi_desc), "PI descriptor SN field set before blocking"); old.control = READ_ONCE(pi_desc->control); do { @@ -176,8 +201,6 @@ static void pi_enable_wakeup_handler(struct kvm_vcpu *vcpu) */ if (pi_test_on(&new)) __apic_send_IPI_self(POSTED_INTR_WAKEUP_VECTOR); - - local_irq_restore(flags); } static bool vmx_needs_pi_wakeup(struct kvm_vcpu *vcpu) @@ -190,7 +213,8 @@ static bool vmx_needs_pi_wakeup(struct kvm_vcpu *vcpu) * notification vector is switched to the one that calls * back to the pi_wakeup_handler() function. */ - return vmx_can_use_ipiv(vcpu) || vmx_can_use_vtd_pi(vcpu->kvm); + return (vmx_can_use_ipiv(vcpu) && !is_td_vcpu(vcpu)) || + vmx_can_use_vtd_pi(vcpu->kvm); } void vmx_vcpu_pi_put(struct kvm_vcpu *vcpu) @@ -200,7 +224,9 @@ void vmx_vcpu_pi_put(struct kvm_vcpu *vcpu) if (!vmx_needs_pi_wakeup(vcpu)) return; - if (kvm_vcpu_is_blocking(vcpu) && !vmx_interrupt_blocked(vcpu)) + if (kvm_vcpu_is_blocking(vcpu) && + ((is_td_vcpu(vcpu) && tdx_interrupt_allowed(vcpu)) || + (!is_td_vcpu(vcpu) && !vmx_interrupt_blocked(vcpu)))) pi_enable_wakeup_handler(vcpu); /* @@ -220,13 +246,13 @@ void pi_wakeup_handler(void) int cpu = smp_processor_id(); struct list_head *wakeup_list = &per_cpu(wakeup_vcpus_on_cpu, cpu); raw_spinlock_t *spinlock = &per_cpu(wakeup_vcpus_on_cpu_lock, cpu); - struct vcpu_vmx *vmx; + struct vcpu_vt *vt; raw_spin_lock(spinlock); - list_for_each_entry(vmx, wakeup_list, pi_wakeup_list) { + list_for_each_entry(vt, wakeup_list, pi_wakeup_list) { - if (pi_test_on(&vmx->pi_desc)) - kvm_vcpu_wake_up(&vmx->vcpu); + if (pi_test_on(&vt->pi_desc)) + kvm_vcpu_wake_up(vt_to_vcpu(vt)); } raw_spin_unlock(spinlock); } @@ -237,6 +263,14 @@ void __init pi_init_cpu(int cpu) raw_spin_lock_init(&per_cpu(wakeup_vcpus_on_cpu_lock, cpu)); } +void pi_apicv_pre_state_restore(struct kvm_vcpu *vcpu) +{ + struct pi_desc *pi = vcpu_to_pi_desc(vcpu); + + pi_clear_on(pi); + memset(pi->pir, 0, sizeof(pi->pir)); +} + bool pi_has_pending_interrupt(struct kvm_vcpu *vcpu) { struct pi_desc *pi_desc = vcpu_to_pi_desc(vcpu); @@ -254,7 +288,7 @@ bool pi_has_pending_interrupt(struct kvm_vcpu *vcpu) */ void vmx_pi_start_assignment(struct kvm *kvm) { - if (!irq_remapping_cap(IRQ_POSTING_CAP)) + if (!kvm_arch_has_irq_bypass()) return; kvm_make_all_cpus_request(kvm, KVM_REQ_UNBLOCK); @@ -274,6 +308,7 @@ int vmx_pi_update_irte(struct kvm *kvm, unsigned int host_irq, { struct kvm_kernel_irq_routing_entry *e; struct kvm_irq_routing_table *irq_rt; + bool enable_remapped_mode = true; struct kvm_lapic_irq irq; struct kvm_vcpu *vcpu; struct vcpu_data vcpu_info; @@ -312,21 +347,8 @@ int vmx_pi_update_irte(struct kvm *kvm, unsigned int host_irq, kvm_set_msi_irq(kvm, e, &irq); if (!kvm_intr_is_single_vcpu(kvm, &irq, &vcpu) || - !kvm_irq_is_postable(&irq)) { - /* - * Make sure the IRTE is in remapped mode if - * we don't handle it in posted mode. - */ - ret = irq_set_vcpu_affinity(host_irq, NULL); - if (ret < 0) { - printk(KERN_INFO - "failed to back to remapped mode, irq: %u\n", - host_irq); - goto out; - } - + !kvm_irq_is_postable(&irq)) continue; - } vcpu_info.pi_desc_addr = __pa(vcpu_to_pi_desc(vcpu)); vcpu_info.vector = irq.vector; @@ -334,11 +356,12 @@ int vmx_pi_update_irte(struct kvm *kvm, unsigned int host_irq, trace_kvm_pi_irte_update(host_irq, vcpu->vcpu_id, e->gsi, vcpu_info.vector, vcpu_info.pi_desc_addr, set); - if (set) - ret = irq_set_vcpu_affinity(host_irq, &vcpu_info); - else - ret = irq_set_vcpu_affinity(host_irq, NULL); + if (!set) + continue; + + enable_remapped_mode = false; + ret = irq_set_vcpu_affinity(host_irq, &vcpu_info); if (ret < 0) { printk(KERN_INFO "%s: failed to update PI IRTE\n", __func__); @@ -346,6 +369,9 @@ int vmx_pi_update_irte(struct kvm *kvm, unsigned int host_irq, } } + if (enable_remapped_mode) + ret = irq_set_vcpu_affinity(host_irq, NULL); + ret = 0; out: srcu_read_unlock(&kvm->irq_srcu, idx); diff --git a/arch/x86/kvm/vmx/posted_intr.h b/arch/x86/kvm/vmx/posted_intr.h index 26992076552e..80499ea0e674 100644 --- a/arch/x86/kvm/vmx/posted_intr.h +++ b/arch/x86/kvm/vmx/posted_intr.h @@ -2,105 +2,25 @@ #ifndef __KVM_X86_VMX_POSTED_INTR_H #define __KVM_X86_VMX_POSTED_INTR_H -#define POSTED_INTR_ON 0 -#define POSTED_INTR_SN 1 - -#define PID_TABLE_ENTRY_VALID 1 - -/* Posted-Interrupt Descriptor */ -struct pi_desc { - u32 pir[8]; /* Posted interrupt requested */ - union { - struct { - /* bit 256 - Outstanding Notification */ - u16 on : 1, - /* bit 257 - Suppress Notification */ - sn : 1, - /* bit 271:258 - Reserved */ - rsvd_1 : 14; - /* bit 279:272 - Notification Vector */ - u8 nv; - /* bit 287:280 - Reserved */ - u8 rsvd_2; - /* bit 319:288 - Notification Destination */ - u32 ndst; - }; - u64 control; - }; - u32 rsvd[6]; -} __aligned(64); - -static inline bool pi_test_and_set_on(struct pi_desc *pi_desc) -{ - return test_and_set_bit(POSTED_INTR_ON, - (unsigned long *)&pi_desc->control); -} - -static inline bool pi_test_and_clear_on(struct pi_desc *pi_desc) -{ - return test_and_clear_bit(POSTED_INTR_ON, - (unsigned long *)&pi_desc->control); -} - -static inline bool pi_test_and_clear_sn(struct pi_desc *pi_desc) -{ - return test_and_clear_bit(POSTED_INTR_SN, - (unsigned long *)&pi_desc->control); -} - -static inline bool pi_test_and_set_pir(int vector, struct pi_desc *pi_desc) -{ - return test_and_set_bit(vector, (unsigned long *)pi_desc->pir); -} - -static inline bool pi_is_pir_empty(struct pi_desc *pi_desc) -{ - return bitmap_empty((unsigned long *)pi_desc->pir, NR_VECTORS); -} - -static inline void pi_set_sn(struct pi_desc *pi_desc) -{ - set_bit(POSTED_INTR_SN, - (unsigned long *)&pi_desc->control); -} - -static inline void pi_set_on(struct pi_desc *pi_desc) -{ - set_bit(POSTED_INTR_ON, - (unsigned long *)&pi_desc->control); -} - -static inline void pi_clear_on(struct pi_desc *pi_desc) -{ - clear_bit(POSTED_INTR_ON, - (unsigned long *)&pi_desc->control); -} - -static inline void pi_clear_sn(struct pi_desc *pi_desc) -{ - clear_bit(POSTED_INTR_SN, - (unsigned long *)&pi_desc->control); -} - -static inline bool pi_test_on(struct pi_desc *pi_desc) -{ - return test_bit(POSTED_INTR_ON, - (unsigned long *)&pi_desc->control); -} - -static inline bool pi_test_sn(struct pi_desc *pi_desc) -{ - return test_bit(POSTED_INTR_SN, - (unsigned long *)&pi_desc->control); -} +#include <linux/bitmap.h> +#include <asm/posted_intr.h> void vmx_vcpu_pi_load(struct kvm_vcpu *vcpu, int cpu); void vmx_vcpu_pi_put(struct kvm_vcpu *vcpu); void pi_wakeup_handler(void); void __init pi_init_cpu(int cpu); +void pi_apicv_pre_state_restore(struct kvm_vcpu *vcpu); bool pi_has_pending_interrupt(struct kvm_vcpu *vcpu); int vmx_pi_update_irte(struct kvm *kvm, unsigned int host_irq, uint32_t guest_irq, bool set); void vmx_pi_start_assignment(struct kvm *kvm); +static inline int pi_find_highest_vector(struct pi_desc *pi_desc) +{ + int vec; + + vec = find_last_bit(pi_desc->pir, 256); + return vec < 256 ? vec : -1; +} + #endif /* __KVM_X86_VMX_POSTED_INTR_H */ diff --git a/arch/x86/kvm/vmx/sgx.c b/arch/x86/kvm/vmx/sgx.c index 6fef01e0536e..df1d0cf76947 100644 --- a/arch/x86/kvm/vmx/sgx.c +++ b/arch/x86/kvm/vmx/sgx.c @@ -2,14 +2,14 @@ /* Copyright(c) 2021 Intel Corporation. */ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt +#include <asm/msr.h> #include <asm/sgx.h> -#include "cpuid.h" +#include "x86.h" #include "kvm_cache_regs.h" #include "nested.h" #include "sgx.h" #include "vmx.h" -#include "x86.h" bool __read_mostly enable_sgx = 1; module_param_named(sgx, enable_sgx, bool, 0444); @@ -38,7 +38,7 @@ static int sgx_get_encls_gva(struct kvm_vcpu *vcpu, unsigned long offset, fault = true; } else if (likely(is_64_bit_mode(vcpu))) { *gva = vmx_get_untagged_addr(vcpu, *gva, 0); - fault = is_noncanonical_address(*gva, vcpu); + fault = is_noncanonical_address(*gva, vcpu, 0); } else { *gva &= 0xffffffff; fault = (s.unusable) || @@ -123,7 +123,7 @@ static int sgx_inject_fault(struct kvm_vcpu *vcpu, gva_t gva, int trapnr) * likely than a bad userspace address. */ if ((trapnr == PF_VECTOR || !boot_cpu_has(X86_FEATURE_SGX2)) && - guest_cpuid_has(vcpu, X86_FEATURE_SGX2)) { + guest_cpu_cap_has(vcpu, X86_FEATURE_SGX2)) { memset(&ex, 0, sizeof(ex)); ex.vector = PF_VECTOR; ex.error_code = PFERR_PRESENT_MASK | PFERR_WRITE_MASK | @@ -274,7 +274,7 @@ static int handle_encls_ecreate(struct kvm_vcpu *vcpu) * simultaneously set SGX_ATTR_PROVISIONKEY to bypass the check to * enforce restriction of access to the PROVISIONKEY. */ - contents = (struct sgx_secs *)__get_free_page(GFP_KERNEL_ACCOUNT); + contents = (struct sgx_secs *)__get_free_page(GFP_KERNEL); if (!contents) return -ENOMEM; @@ -366,7 +366,7 @@ static inline bool encls_leaf_enabled_in_guest(struct kvm_vcpu *vcpu, u32 leaf) return true; if (leaf >= EAUG && leaf <= EMODT) - return guest_cpuid_has(vcpu, X86_FEATURE_SGX2); + return guest_cpu_cap_has(vcpu, X86_FEATURE_SGX2); return false; } @@ -382,8 +382,8 @@ int handle_encls(struct kvm_vcpu *vcpu) { u32 leaf = (u32)kvm_rax_read(vcpu); - if (!enable_sgx || !guest_cpuid_has(vcpu, X86_FEATURE_SGX) || - !guest_cpuid_has(vcpu, X86_FEATURE_SGX1)) { + if (!enable_sgx || !guest_cpu_cap_has(vcpu, X86_FEATURE_SGX) || + !guest_cpu_cap_has(vcpu, X86_FEATURE_SGX1)) { kvm_queue_exception(vcpu, UD_VECTOR); } else if (!encls_leaf_enabled_in_guest(vcpu, leaf) || !sgx_enabled_in_guest_bios(vcpu) || !is_paging(vcpu)) { @@ -412,16 +412,16 @@ void setup_default_sgx_lepubkeyhash(void) * MSRs exist but are read-only (locked and not writable). */ if (!enable_sgx || boot_cpu_has(X86_FEATURE_SGX_LC) || - rdmsrl_safe(MSR_IA32_SGXLEPUBKEYHASH0, &sgx_pubkey_hash[0])) { + rdmsrq_safe(MSR_IA32_SGXLEPUBKEYHASH0, &sgx_pubkey_hash[0])) { sgx_pubkey_hash[0] = 0xa6053e051270b7acULL; sgx_pubkey_hash[1] = 0x6cfbe8ba8b3b413dULL; sgx_pubkey_hash[2] = 0xc4916d99f2b3735dULL; sgx_pubkey_hash[3] = 0xd4f8c05909f9bb3bULL; } else { /* MSR_IA32_SGXLEPUBKEYHASH0 is read above */ - rdmsrl(MSR_IA32_SGXLEPUBKEYHASH1, sgx_pubkey_hash[1]); - rdmsrl(MSR_IA32_SGXLEPUBKEYHASH2, sgx_pubkey_hash[2]); - rdmsrl(MSR_IA32_SGXLEPUBKEYHASH3, sgx_pubkey_hash[3]); + rdmsrq(MSR_IA32_SGXLEPUBKEYHASH1, sgx_pubkey_hash[1]); + rdmsrq(MSR_IA32_SGXLEPUBKEYHASH2, sgx_pubkey_hash[2]); + rdmsrq(MSR_IA32_SGXLEPUBKEYHASH3, sgx_pubkey_hash[3]); } } @@ -480,15 +480,15 @@ void vmx_write_encls_bitmap(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12) if (!cpu_has_vmx_encls_vmexit()) return; - if (guest_cpuid_has(vcpu, X86_FEATURE_SGX) && + if (guest_cpu_cap_has(vcpu, X86_FEATURE_SGX) && sgx_enabled_in_guest_bios(vcpu)) { - if (guest_cpuid_has(vcpu, X86_FEATURE_SGX1)) { + if (guest_cpu_cap_has(vcpu, X86_FEATURE_SGX1)) { bitmap &= ~GENMASK_ULL(ETRACK, ECREATE); if (sgx_intercept_encls_ecreate(vcpu)) bitmap |= (1 << ECREATE); } - if (guest_cpuid_has(vcpu, X86_FEATURE_SGX2)) + if (guest_cpu_cap_has(vcpu, X86_FEATURE_SGX2)) bitmap &= ~GENMASK_ULL(EMODT, EAUG); /* diff --git a/arch/x86/kvm/vmx/tdx.c b/arch/x86/kvm/vmx/tdx.c new file mode 100644 index 000000000000..1ad20c273f3b --- /dev/null +++ b/arch/x86/kvm/vmx/tdx.c @@ -0,0 +1,3595 @@ +// SPDX-License-Identifier: GPL-2.0 +#include <linux/cleanup.h> +#include <linux/cpu.h> +#include <asm/cpufeature.h> +#include <asm/fpu/xcr.h> +#include <linux/misc_cgroup.h> +#include <linux/mmu_context.h> +#include <asm/tdx.h> +#include "capabilities.h" +#include "mmu.h" +#include "x86_ops.h" +#include "lapic.h" +#include "tdx.h" +#include "vmx.h" +#include "mmu/spte.h" +#include "common.h" +#include "posted_intr.h" +#include "irq.h" +#include <trace/events/kvm.h> +#include "trace.h" + +#pragma GCC poison to_vmx + +#undef pr_fmt +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + +#define pr_tdx_error(__fn, __err) \ + pr_err_ratelimited("SEAMCALL %s failed: 0x%llx\n", #__fn, __err) + +#define __pr_tdx_error_N(__fn_str, __err, __fmt, ...) \ + pr_err_ratelimited("SEAMCALL " __fn_str " failed: 0x%llx, " __fmt, __err, __VA_ARGS__) + +#define pr_tdx_error_1(__fn, __err, __rcx) \ + __pr_tdx_error_N(#__fn, __err, "rcx 0x%llx\n", __rcx) + +#define pr_tdx_error_2(__fn, __err, __rcx, __rdx) \ + __pr_tdx_error_N(#__fn, __err, "rcx 0x%llx, rdx 0x%llx\n", __rcx, __rdx) + +#define pr_tdx_error_3(__fn, __err, __rcx, __rdx, __r8) \ + __pr_tdx_error_N(#__fn, __err, "rcx 0x%llx, rdx 0x%llx, r8 0x%llx\n", __rcx, __rdx, __r8) + +bool enable_tdx __ro_after_init; +module_param_named(tdx, enable_tdx, bool, 0444); + +#define TDX_SHARED_BIT_PWL_5 gpa_to_gfn(BIT_ULL(51)) +#define TDX_SHARED_BIT_PWL_4 gpa_to_gfn(BIT_ULL(47)) + +static enum cpuhp_state tdx_cpuhp_state; + +static const struct tdx_sys_info *tdx_sysinfo; + +void tdh_vp_rd_failed(struct vcpu_tdx *tdx, char *uclass, u32 field, u64 err) +{ + KVM_BUG_ON(1, tdx->vcpu.kvm); + pr_err("TDH_VP_RD[%s.0x%x] failed 0x%llx\n", uclass, field, err); +} + +void tdh_vp_wr_failed(struct vcpu_tdx *tdx, char *uclass, char *op, u32 field, + u64 val, u64 err) +{ + KVM_BUG_ON(1, tdx->vcpu.kvm); + pr_err("TDH_VP_WR[%s.0x%x]%s0x%llx failed: 0x%llx\n", uclass, field, op, val, err); +} + +#define KVM_SUPPORTED_TD_ATTRS (TDX_TD_ATTR_SEPT_VE_DISABLE) + +static __always_inline struct kvm_tdx *to_kvm_tdx(struct kvm *kvm) +{ + return container_of(kvm, struct kvm_tdx, kvm); +} + +static __always_inline struct vcpu_tdx *to_tdx(struct kvm_vcpu *vcpu) +{ + return container_of(vcpu, struct vcpu_tdx, vcpu); +} + +static u64 tdx_get_supported_attrs(const struct tdx_sys_info_td_conf *td_conf) +{ + u64 val = KVM_SUPPORTED_TD_ATTRS; + + if ((val & td_conf->attributes_fixed1) != td_conf->attributes_fixed1) + return 0; + + val &= td_conf->attributes_fixed0; + + return val; +} + +static u64 tdx_get_supported_xfam(const struct tdx_sys_info_td_conf *td_conf) +{ + u64 val = kvm_caps.supported_xcr0 | kvm_caps.supported_xss; + + if ((val & td_conf->xfam_fixed1) != td_conf->xfam_fixed1) + return 0; + + val &= td_conf->xfam_fixed0; + + return val; +} + +static int tdx_get_guest_phys_addr_bits(const u32 eax) +{ + return (eax & GENMASK(23, 16)) >> 16; +} + +static u32 tdx_set_guest_phys_addr_bits(const u32 eax, int addr_bits) +{ + return (eax & ~GENMASK(23, 16)) | (addr_bits & 0xff) << 16; +} + +#define TDX_FEATURE_TSX (__feature_bit(X86_FEATURE_HLE) | __feature_bit(X86_FEATURE_RTM)) + +static bool has_tsx(const struct kvm_cpuid_entry2 *entry) +{ + return entry->function == 7 && entry->index == 0 && + (entry->ebx & TDX_FEATURE_TSX); +} + +static void clear_tsx(struct kvm_cpuid_entry2 *entry) +{ + entry->ebx &= ~TDX_FEATURE_TSX; +} + +static bool has_waitpkg(const struct kvm_cpuid_entry2 *entry) +{ + return entry->function == 7 && entry->index == 0 && + (entry->ecx & __feature_bit(X86_FEATURE_WAITPKG)); +} + +static void clear_waitpkg(struct kvm_cpuid_entry2 *entry) +{ + entry->ecx &= ~__feature_bit(X86_FEATURE_WAITPKG); +} + +static void tdx_clear_unsupported_cpuid(struct kvm_cpuid_entry2 *entry) +{ + if (has_tsx(entry)) + clear_tsx(entry); + + if (has_waitpkg(entry)) + clear_waitpkg(entry); +} + +static bool tdx_unsupported_cpuid(const struct kvm_cpuid_entry2 *entry) +{ + return has_tsx(entry) || has_waitpkg(entry); +} + +#define KVM_TDX_CPUID_NO_SUBLEAF ((__u32)-1) + +static void td_init_cpuid_entry2(struct kvm_cpuid_entry2 *entry, unsigned char idx) +{ + const struct tdx_sys_info_td_conf *td_conf = &tdx_sysinfo->td_conf; + + entry->function = (u32)td_conf->cpuid_config_leaves[idx]; + entry->index = td_conf->cpuid_config_leaves[idx] >> 32; + entry->eax = (u32)td_conf->cpuid_config_values[idx][0]; + entry->ebx = td_conf->cpuid_config_values[idx][0] >> 32; + entry->ecx = (u32)td_conf->cpuid_config_values[idx][1]; + entry->edx = td_conf->cpuid_config_values[idx][1] >> 32; + + if (entry->index == KVM_TDX_CPUID_NO_SUBLEAF) + entry->index = 0; + + /* + * The TDX module doesn't allow configuring the guest phys addr bits + * (EAX[23:16]). However, KVM uses it as an interface to the userspace + * to configure the GPAW. Report these bits as configurable. + */ + if (entry->function == 0x80000008) + entry->eax = tdx_set_guest_phys_addr_bits(entry->eax, 0xff); + + tdx_clear_unsupported_cpuid(entry); +} + +static int init_kvm_tdx_caps(const struct tdx_sys_info_td_conf *td_conf, + struct kvm_tdx_capabilities *caps) +{ + int i; + + caps->supported_attrs = tdx_get_supported_attrs(td_conf); + if (!caps->supported_attrs) + return -EIO; + + caps->supported_xfam = tdx_get_supported_xfam(td_conf); + if (!caps->supported_xfam) + return -EIO; + + caps->cpuid.nent = td_conf->num_cpuid_config; + + for (i = 0; i < td_conf->num_cpuid_config; i++) + td_init_cpuid_entry2(&caps->cpuid.entries[i], i); + + return 0; +} + +/* + * Some SEAMCALLs acquire the TDX module globally, and can fail with + * TDX_OPERAND_BUSY. Use a global mutex to serialize these SEAMCALLs. + */ +static DEFINE_MUTEX(tdx_lock); + +static atomic_t nr_configured_hkid; + +static bool tdx_operand_busy(u64 err) +{ + return (err & TDX_SEAMCALL_STATUS_MASK) == TDX_OPERAND_BUSY; +} + + +/* + * A per-CPU list of TD vCPUs associated with a given CPU. + * Protected by interrupt mask. Only manipulated by the CPU owning this per-CPU + * list. + * - When a vCPU is loaded onto a CPU, it is removed from the per-CPU list of + * the old CPU during the IPI callback running on the old CPU, and then added + * to the per-CPU list of the new CPU. + * - When a TD is tearing down, all vCPUs are disassociated from their current + * running CPUs and removed from the per-CPU list during the IPI callback + * running on those CPUs. + * - When a CPU is brought down, traverse the per-CPU list to disassociate all + * associated TD vCPUs and remove them from the per-CPU list. + */ +static DEFINE_PER_CPU(struct list_head, associated_tdvcpus); + +static __always_inline unsigned long tdvmcall_exit_type(struct kvm_vcpu *vcpu) +{ + return to_tdx(vcpu)->vp_enter_args.r10; +} + +static __always_inline unsigned long tdvmcall_leaf(struct kvm_vcpu *vcpu) +{ + return to_tdx(vcpu)->vp_enter_args.r11; +} + +static __always_inline void tdvmcall_set_return_code(struct kvm_vcpu *vcpu, + long val) +{ + to_tdx(vcpu)->vp_enter_args.r10 = val; +} + +static __always_inline void tdvmcall_set_return_val(struct kvm_vcpu *vcpu, + unsigned long val) +{ + to_tdx(vcpu)->vp_enter_args.r11 = val; +} + +static inline void tdx_hkid_free(struct kvm_tdx *kvm_tdx) +{ + tdx_guest_keyid_free(kvm_tdx->hkid); + kvm_tdx->hkid = -1; + atomic_dec(&nr_configured_hkid); + misc_cg_uncharge(MISC_CG_RES_TDX, kvm_tdx->misc_cg, 1); + put_misc_cg(kvm_tdx->misc_cg); + kvm_tdx->misc_cg = NULL; +} + +static inline bool is_hkid_assigned(struct kvm_tdx *kvm_tdx) +{ + return kvm_tdx->hkid > 0; +} + +static inline void tdx_disassociate_vp(struct kvm_vcpu *vcpu) +{ + lockdep_assert_irqs_disabled(); + + list_del(&to_tdx(vcpu)->cpu_list); + + /* + * Ensure tdx->cpu_list is updated before setting vcpu->cpu to -1, + * otherwise, a different CPU can see vcpu->cpu = -1 and add the vCPU + * to its list before it's deleted from this CPU's list. + */ + smp_wmb(); + + vcpu->cpu = -1; +} + +static void tdx_clear_page(struct page *page) +{ + const void *zero_page = (const void *) page_to_virt(ZERO_PAGE(0)); + void *dest = page_to_virt(page); + unsigned long i; + + /* + * The page could have been poisoned. MOVDIR64B also clears + * the poison bit so the kernel can safely use the page again. + */ + for (i = 0; i < PAGE_SIZE; i += 64) + movdir64b(dest + i, zero_page); + /* + * MOVDIR64B store uses WC buffer. Prevent following memory reads + * from seeing potentially poisoned cache. + */ + __mb(); +} + +static void tdx_no_vcpus_enter_start(struct kvm *kvm) +{ + struct kvm_tdx *kvm_tdx = to_kvm_tdx(kvm); + + lockdep_assert_held_write(&kvm->mmu_lock); + + WRITE_ONCE(kvm_tdx->wait_for_sept_zap, true); + + kvm_make_all_cpus_request(kvm, KVM_REQ_OUTSIDE_GUEST_MODE); +} + +static void tdx_no_vcpus_enter_stop(struct kvm *kvm) +{ + struct kvm_tdx *kvm_tdx = to_kvm_tdx(kvm); + + lockdep_assert_held_write(&kvm->mmu_lock); + + WRITE_ONCE(kvm_tdx->wait_for_sept_zap, false); +} + +/* TDH.PHYMEM.PAGE.RECLAIM is allowed only when destroying the TD. */ +static int __tdx_reclaim_page(struct page *page) +{ + u64 err, rcx, rdx, r8; + + err = tdh_phymem_page_reclaim(page, &rcx, &rdx, &r8); + + /* + * No need to check for TDX_OPERAND_BUSY; all TD pages are freed + * before the HKID is released and control pages have also been + * released at this point, so there is no possibility of contention. + */ + if (WARN_ON_ONCE(err)) { + pr_tdx_error_3(TDH_PHYMEM_PAGE_RECLAIM, err, rcx, rdx, r8); + return -EIO; + } + return 0; +} + +static int tdx_reclaim_page(struct page *page) +{ + int r; + + r = __tdx_reclaim_page(page); + if (!r) + tdx_clear_page(page); + return r; +} + + +/* + * Reclaim the TD control page(s) which are crypto-protected by TDX guest's + * private KeyID. Assume the cache associated with the TDX private KeyID has + * been flushed. + */ +static void tdx_reclaim_control_page(struct page *ctrl_page) +{ + /* + * Leak the page if the kernel failed to reclaim the page. + * The kernel cannot use it safely anymore. + */ + if (tdx_reclaim_page(ctrl_page)) + return; + + __free_page(ctrl_page); +} + +struct tdx_flush_vp_arg { + struct kvm_vcpu *vcpu; + u64 err; +}; + +static void tdx_flush_vp(void *_arg) +{ + struct tdx_flush_vp_arg *arg = _arg; + struct kvm_vcpu *vcpu = arg->vcpu; + u64 err; + + arg->err = 0; + lockdep_assert_irqs_disabled(); + + /* Task migration can race with CPU offlining. */ + if (unlikely(vcpu->cpu != raw_smp_processor_id())) + return; + + /* + * No need to do TDH_VP_FLUSH if the vCPU hasn't been initialized. The + * list tracking still needs to be updated so that it's correct if/when + * the vCPU does get initialized. + */ + if (to_tdx(vcpu)->state != VCPU_TD_STATE_UNINITIALIZED) { + /* + * No need to retry. TDX Resources needed for TDH.VP.FLUSH are: + * TDVPR as exclusive, TDR as shared, and TDCS as shared. This + * vp flush function is called when destructing vCPU/TD or vCPU + * migration. No other thread uses TDVPR in those cases. + */ + err = tdh_vp_flush(&to_tdx(vcpu)->vp); + if (unlikely(err && err != TDX_VCPU_NOT_ASSOCIATED)) { + /* + * This function is called in IPI context. Do not use + * printk to avoid console semaphore. + * The caller prints out the error message, instead. + */ + if (err) + arg->err = err; + } + } + + tdx_disassociate_vp(vcpu); +} + +static void tdx_flush_vp_on_cpu(struct kvm_vcpu *vcpu) +{ + struct tdx_flush_vp_arg arg = { + .vcpu = vcpu, + }; + int cpu = vcpu->cpu; + + if (unlikely(cpu == -1)) + return; + + smp_call_function_single(cpu, tdx_flush_vp, &arg, 1); + if (KVM_BUG_ON(arg.err, vcpu->kvm)) + pr_tdx_error(TDH_VP_FLUSH, arg.err); +} + +void tdx_disable_virtualization_cpu(void) +{ + int cpu = raw_smp_processor_id(); + struct list_head *tdvcpus = &per_cpu(associated_tdvcpus, cpu); + struct tdx_flush_vp_arg arg; + struct vcpu_tdx *tdx, *tmp; + unsigned long flags; + + local_irq_save(flags); + /* Safe variant needed as tdx_disassociate_vp() deletes the entry. */ + list_for_each_entry_safe(tdx, tmp, tdvcpus, cpu_list) { + arg.vcpu = &tdx->vcpu; + tdx_flush_vp(&arg); + } + local_irq_restore(flags); +} + +#define TDX_SEAMCALL_RETRIES 10000 + +static void smp_func_do_phymem_cache_wb(void *unused) +{ + u64 err = 0; + bool resume; + int i; + + /* + * TDH.PHYMEM.CACHE.WB flushes caches associated with any TDX private + * KeyID on the package or core. The TDX module may not finish the + * cache flush but return TDX_INTERRUPTED_RESUMEABLE instead. The + * kernel should retry it until it returns success w/o rescheduling. + */ + for (i = TDX_SEAMCALL_RETRIES; i > 0; i--) { + resume = !!err; + err = tdh_phymem_cache_wb(resume); + switch (err) { + case TDX_INTERRUPTED_RESUMABLE: + continue; + case TDX_NO_HKID_READY_TO_WBCACHE: + err = TDX_SUCCESS; /* Already done by other thread */ + fallthrough; + default: + goto out; + } + } + +out: + if (WARN_ON_ONCE(err)) + pr_tdx_error(TDH_PHYMEM_CACHE_WB, err); +} + +void tdx_mmu_release_hkid(struct kvm *kvm) +{ + bool packages_allocated, targets_allocated; + struct kvm_tdx *kvm_tdx = to_kvm_tdx(kvm); + cpumask_var_t packages, targets; + struct kvm_vcpu *vcpu; + unsigned long j; + int i; + u64 err; + + if (!is_hkid_assigned(kvm_tdx)) + return; + + packages_allocated = zalloc_cpumask_var(&packages, GFP_KERNEL); + targets_allocated = zalloc_cpumask_var(&targets, GFP_KERNEL); + cpus_read_lock(); + + kvm_for_each_vcpu(j, vcpu, kvm) + tdx_flush_vp_on_cpu(vcpu); + + /* + * TDH.PHYMEM.CACHE.WB tries to acquire the TDX module global lock + * and can fail with TDX_OPERAND_BUSY when it fails to get the lock. + * Multiple TDX guests can be destroyed simultaneously. Take the + * mutex to prevent it from getting error. + */ + mutex_lock(&tdx_lock); + + /* + * Releasing HKID is in vm_destroy(). + * After the above flushing vps, there should be no more vCPU + * associations, as all vCPU fds have been released at this stage. + */ + err = tdh_mng_vpflushdone(&kvm_tdx->td); + if (err == TDX_FLUSHVP_NOT_DONE) + goto out; + if (KVM_BUG_ON(err, kvm)) { + pr_tdx_error(TDH_MNG_VPFLUSHDONE, err); + pr_err("tdh_mng_vpflushdone() failed. HKID %d is leaked.\n", + kvm_tdx->hkid); + goto out; + } + + for_each_online_cpu(i) { + if (packages_allocated && + cpumask_test_and_set_cpu(topology_physical_package_id(i), + packages)) + continue; + if (targets_allocated) + cpumask_set_cpu(i, targets); + } + if (targets_allocated) + on_each_cpu_mask(targets, smp_func_do_phymem_cache_wb, NULL, true); + else + on_each_cpu(smp_func_do_phymem_cache_wb, NULL, true); + /* + * In the case of error in smp_func_do_phymem_cache_wb(), the following + * tdh_mng_key_freeid() will fail. + */ + err = tdh_mng_key_freeid(&kvm_tdx->td); + if (KVM_BUG_ON(err, kvm)) { + pr_tdx_error(TDH_MNG_KEY_FREEID, err); + pr_err("tdh_mng_key_freeid() failed. HKID %d is leaked.\n", + kvm_tdx->hkid); + } else { + tdx_hkid_free(kvm_tdx); + } + +out: + mutex_unlock(&tdx_lock); + cpus_read_unlock(); + free_cpumask_var(targets); + free_cpumask_var(packages); +} + +static void tdx_reclaim_td_control_pages(struct kvm *kvm) +{ + struct kvm_tdx *kvm_tdx = to_kvm_tdx(kvm); + u64 err; + int i; + + /* + * tdx_mmu_release_hkid() failed to reclaim HKID. Something went wrong + * heavily with TDX module. Give up freeing TD pages. As the function + * already warned, don't warn it again. + */ + if (is_hkid_assigned(kvm_tdx)) + return; + + if (kvm_tdx->td.tdcs_pages) { + for (i = 0; i < kvm_tdx->td.tdcs_nr_pages; i++) { + if (!kvm_tdx->td.tdcs_pages[i]) + continue; + + tdx_reclaim_control_page(kvm_tdx->td.tdcs_pages[i]); + } + kfree(kvm_tdx->td.tdcs_pages); + kvm_tdx->td.tdcs_pages = NULL; + } + + if (!kvm_tdx->td.tdr_page) + return; + + if (__tdx_reclaim_page(kvm_tdx->td.tdr_page)) + return; + + /* + * Use a SEAMCALL to ask the TDX module to flush the cache based on the + * KeyID. TDX module may access TDR while operating on TD (Especially + * when it is reclaiming TDCS). + */ + err = tdh_phymem_page_wbinvd_tdr(&kvm_tdx->td); + if (KVM_BUG_ON(err, kvm)) { + pr_tdx_error(TDH_PHYMEM_PAGE_WBINVD, err); + return; + } + tdx_clear_page(kvm_tdx->td.tdr_page); + + __free_page(kvm_tdx->td.tdr_page); + kvm_tdx->td.tdr_page = NULL; +} + +void tdx_vm_destroy(struct kvm *kvm) +{ + struct kvm_tdx *kvm_tdx = to_kvm_tdx(kvm); + + tdx_reclaim_td_control_pages(kvm); + + kvm_tdx->state = TD_STATE_UNINITIALIZED; +} + +static int tdx_do_tdh_mng_key_config(void *param) +{ + struct kvm_tdx *kvm_tdx = param; + u64 err; + + /* TDX_RND_NO_ENTROPY related retries are handled by sc_retry() */ + err = tdh_mng_key_config(&kvm_tdx->td); + + if (KVM_BUG_ON(err, &kvm_tdx->kvm)) { + pr_tdx_error(TDH_MNG_KEY_CONFIG, err); + return -EIO; + } + + return 0; +} + +int tdx_vm_init(struct kvm *kvm) +{ + struct kvm_tdx *kvm_tdx = to_kvm_tdx(kvm); + + kvm->arch.has_protected_state = true; + kvm->arch.has_private_mem = true; + kvm->arch.disabled_quirks |= KVM_X86_QUIRK_IGNORE_GUEST_PAT; + + /* + * Because guest TD is protected, VMM can't parse the instruction in TD. + * Instead, guest uses MMIO hypercall. For unmodified device driver, + * #VE needs to be injected for MMIO and #VE handler in TD converts MMIO + * instruction into MMIO hypercall. + * + * SPTE value for MMIO needs to be setup so that #VE is injected into + * TD instead of triggering EPT MISCONFIG. + * - RWX=0 so that EPT violation is triggered. + * - suppress #VE bit is cleared to inject #VE. + */ + kvm_mmu_set_mmio_spte_value(kvm, 0); + + /* + * TDX has its own limit of maximum vCPUs it can support for all + * TDX guests in addition to KVM_MAX_VCPUS. TDX module reports + * such limit via the MAX_VCPU_PER_TD global metadata. In + * practice, it reflects the number of logical CPUs that ALL + * platforms that the TDX module supports can possibly have. + * + * Limit TDX guest's maximum vCPUs to the number of logical CPUs + * the platform has. Simply forwarding the MAX_VCPU_PER_TD to + * userspace would result in an unpredictable ABI. + */ + kvm->max_vcpus = min_t(int, kvm->max_vcpus, num_present_cpus()); + + kvm_tdx->state = TD_STATE_UNINITIALIZED; + + return 0; +} + +int tdx_vcpu_create(struct kvm_vcpu *vcpu) +{ + struct kvm_tdx *kvm_tdx = to_kvm_tdx(vcpu->kvm); + struct vcpu_tdx *tdx = to_tdx(vcpu); + + if (kvm_tdx->state != TD_STATE_INITIALIZED) + return -EIO; + + /* + * TDX module mandates APICv, which requires an in-kernel local APIC. + * Disallow an in-kernel I/O APIC, because level-triggered interrupts + * and thus the I/O APIC as a whole can't be faithfully emulated in KVM. + */ + if (!irqchip_split(vcpu->kvm)) + return -EINVAL; + + fpstate_set_confidential(&vcpu->arch.guest_fpu); + vcpu->arch.apic->guest_apic_protected = true; + INIT_LIST_HEAD(&tdx->vt.pi_wakeup_list); + + vcpu->arch.efer = EFER_SCE | EFER_LME | EFER_LMA | EFER_NX; + + vcpu->arch.switch_db_regs = KVM_DEBUGREG_AUTO_SWITCH; + vcpu->arch.cr0_guest_owned_bits = -1ul; + vcpu->arch.cr4_guest_owned_bits = -1ul; + + /* KVM can't change TSC offset/multiplier as TDX module manages them. */ + vcpu->arch.guest_tsc_protected = true; + vcpu->arch.tsc_offset = kvm_tdx->tsc_offset; + vcpu->arch.l1_tsc_offset = vcpu->arch.tsc_offset; + vcpu->arch.tsc_scaling_ratio = kvm_tdx->tsc_multiplier; + vcpu->arch.l1_tsc_scaling_ratio = kvm_tdx->tsc_multiplier; + + vcpu->arch.guest_state_protected = + !(to_kvm_tdx(vcpu->kvm)->attributes & TDX_TD_ATTR_DEBUG); + + if ((kvm_tdx->xfam & XFEATURE_MASK_XTILE) == XFEATURE_MASK_XTILE) + vcpu->arch.xfd_no_write_intercept = true; + + tdx->vt.pi_desc.nv = POSTED_INTR_VECTOR; + __pi_set_sn(&tdx->vt.pi_desc); + + tdx->state = VCPU_TD_STATE_UNINITIALIZED; + + return 0; +} + +void tdx_vcpu_load(struct kvm_vcpu *vcpu, int cpu) +{ + struct vcpu_tdx *tdx = to_tdx(vcpu); + + vmx_vcpu_pi_load(vcpu, cpu); + if (vcpu->cpu == cpu || !is_hkid_assigned(to_kvm_tdx(vcpu->kvm))) + return; + + tdx_flush_vp_on_cpu(vcpu); + + KVM_BUG_ON(cpu != raw_smp_processor_id(), vcpu->kvm); + local_irq_disable(); + /* + * Pairs with the smp_wmb() in tdx_disassociate_vp() to ensure + * vcpu->cpu is read before tdx->cpu_list. + */ + smp_rmb(); + + list_add(&tdx->cpu_list, &per_cpu(associated_tdvcpus, cpu)); + local_irq_enable(); +} + +bool tdx_interrupt_allowed(struct kvm_vcpu *vcpu) +{ + /* + * KVM can't get the interrupt status of TDX guest and it assumes + * interrupt is always allowed unless TDX guest calls TDVMCALL with HLT, + * which passes the interrupt blocked flag. + */ + return vmx_get_exit_reason(vcpu).basic != EXIT_REASON_HLT || + !to_tdx(vcpu)->vp_enter_args.r12; +} + +bool tdx_protected_apic_has_interrupt(struct kvm_vcpu *vcpu) +{ + u64 vcpu_state_details; + + if (pi_has_pending_interrupt(vcpu)) + return true; + + /* + * Only check RVI pending for HALTED case with IRQ enabled. + * For non-HLT cases, KVM doesn't care about STI/SS shadows. And if the + * interrupt was pending before TD exit, then it _must_ be blocked, + * otherwise the interrupt would have been serviced at the instruction + * boundary. + */ + if (vmx_get_exit_reason(vcpu).basic != EXIT_REASON_HLT || + to_tdx(vcpu)->vp_enter_args.r12) + return false; + + vcpu_state_details = + td_state_non_arch_read64(to_tdx(vcpu), TD_VCPU_STATE_DETAILS_NON_ARCH); + + return tdx_vcpu_state_details_intr_pending(vcpu_state_details); +} + +/* + * Compared to vmx_prepare_switch_to_guest(), there is not much to do + * as SEAMCALL/SEAMRET calls take care of most of save and restore. + */ +void tdx_prepare_switch_to_guest(struct kvm_vcpu *vcpu) +{ + struct vcpu_vt *vt = to_vt(vcpu); + + if (vt->guest_state_loaded) + return; + + if (likely(is_64bit_mm(current->mm))) + vt->msr_host_kernel_gs_base = current->thread.gsbase; + else + vt->msr_host_kernel_gs_base = read_msr(MSR_KERNEL_GS_BASE); + + vt->host_debugctlmsr = get_debugctlmsr(); + + vt->guest_state_loaded = true; +} + +struct tdx_uret_msr { + u32 msr; + unsigned int slot; + u64 defval; +}; + +static struct tdx_uret_msr tdx_uret_msrs[] = { + {.msr = MSR_SYSCALL_MASK, .defval = 0x20200 }, + {.msr = MSR_STAR,}, + {.msr = MSR_LSTAR,}, + {.msr = MSR_TSC_AUX,}, +}; + +static void tdx_user_return_msr_update_cache(void) +{ + int i; + + for (i = 0; i < ARRAY_SIZE(tdx_uret_msrs); i++) + kvm_user_return_msr_update_cache(tdx_uret_msrs[i].slot, + tdx_uret_msrs[i].defval); +} + +static void tdx_prepare_switch_to_host(struct kvm_vcpu *vcpu) +{ + struct vcpu_vt *vt = to_vt(vcpu); + struct vcpu_tdx *tdx = to_tdx(vcpu); + + if (!vt->guest_state_loaded) + return; + + ++vcpu->stat.host_state_reload; + wrmsrl(MSR_KERNEL_GS_BASE, vt->msr_host_kernel_gs_base); + + if (tdx->guest_entered) { + tdx_user_return_msr_update_cache(); + tdx->guest_entered = false; + } + + vt->guest_state_loaded = false; +} + +void tdx_vcpu_put(struct kvm_vcpu *vcpu) +{ + vmx_vcpu_pi_put(vcpu); + tdx_prepare_switch_to_host(vcpu); +} + +void tdx_vcpu_free(struct kvm_vcpu *vcpu) +{ + struct kvm_tdx *kvm_tdx = to_kvm_tdx(vcpu->kvm); + struct vcpu_tdx *tdx = to_tdx(vcpu); + int i; + + /* + * It is not possible to reclaim pages while hkid is assigned. It might + * be assigned if: + * 1. the TD VM is being destroyed but freeing hkid failed, in which + * case the pages are leaked + * 2. TD VCPU creation failed and this on the error path, in which case + * there is nothing to do anyway + */ + if (is_hkid_assigned(kvm_tdx)) + return; + + if (tdx->vp.tdcx_pages) { + for (i = 0; i < kvm_tdx->td.tdcx_nr_pages; i++) { + if (tdx->vp.tdcx_pages[i]) + tdx_reclaim_control_page(tdx->vp.tdcx_pages[i]); + } + kfree(tdx->vp.tdcx_pages); + tdx->vp.tdcx_pages = NULL; + } + if (tdx->vp.tdvpr_page) { + tdx_reclaim_control_page(tdx->vp.tdvpr_page); + tdx->vp.tdvpr_page = 0; + } + + tdx->state = VCPU_TD_STATE_UNINITIALIZED; +} + +int tdx_vcpu_pre_run(struct kvm_vcpu *vcpu) +{ + if (unlikely(to_tdx(vcpu)->state != VCPU_TD_STATE_INITIALIZED || + to_kvm_tdx(vcpu->kvm)->state != TD_STATE_RUNNABLE)) + return -EINVAL; + + return 1; +} + +static __always_inline u32 tdcall_to_vmx_exit_reason(struct kvm_vcpu *vcpu) +{ + switch (tdvmcall_leaf(vcpu)) { + case EXIT_REASON_CPUID: + case EXIT_REASON_HLT: + case EXIT_REASON_IO_INSTRUCTION: + case EXIT_REASON_MSR_READ: + case EXIT_REASON_MSR_WRITE: + return tdvmcall_leaf(vcpu); + case EXIT_REASON_EPT_VIOLATION: + return EXIT_REASON_EPT_MISCONFIG; + default: + break; + } + + return EXIT_REASON_TDCALL; +} + +static __always_inline u32 tdx_to_vmx_exit_reason(struct kvm_vcpu *vcpu) +{ + struct vcpu_tdx *tdx = to_tdx(vcpu); + u32 exit_reason; + + switch (tdx->vp_enter_ret & TDX_SEAMCALL_STATUS_MASK) { + case TDX_SUCCESS: + case TDX_NON_RECOVERABLE_VCPU: + case TDX_NON_RECOVERABLE_TD: + case TDX_NON_RECOVERABLE_TD_NON_ACCESSIBLE: + case TDX_NON_RECOVERABLE_TD_WRONG_APIC_MODE: + break; + default: + return -1u; + } + + exit_reason = tdx->vp_enter_ret; + + switch (exit_reason) { + case EXIT_REASON_TDCALL: + if (tdvmcall_exit_type(vcpu)) + return EXIT_REASON_VMCALL; + + return tdcall_to_vmx_exit_reason(vcpu); + case EXIT_REASON_EPT_MISCONFIG: + /* + * Defer KVM_BUG_ON() until tdx_handle_exit() because this is in + * non-instrumentable code with interrupts disabled. + */ + return -1u; + default: + break; + } + + return exit_reason; +} + +static noinstr void tdx_vcpu_enter_exit(struct kvm_vcpu *vcpu) +{ + struct vcpu_tdx *tdx = to_tdx(vcpu); + struct vcpu_vt *vt = to_vt(vcpu); + + guest_state_enter_irqoff(); + + tdx->vp_enter_ret = tdh_vp_enter(&tdx->vp, &tdx->vp_enter_args); + + vt->exit_reason.full = tdx_to_vmx_exit_reason(vcpu); + + vt->exit_qualification = tdx->vp_enter_args.rcx; + tdx->ext_exit_qualification = tdx->vp_enter_args.rdx; + tdx->exit_gpa = tdx->vp_enter_args.r8; + vt->exit_intr_info = tdx->vp_enter_args.r9; + + vmx_handle_nmi(vcpu); + + guest_state_exit_irqoff(); +} + +static bool tdx_failed_vmentry(struct kvm_vcpu *vcpu) +{ + return vmx_get_exit_reason(vcpu).failed_vmentry && + vmx_get_exit_reason(vcpu).full != -1u; +} + +static fastpath_t tdx_exit_handlers_fastpath(struct kvm_vcpu *vcpu) +{ + u64 vp_enter_ret = to_tdx(vcpu)->vp_enter_ret; + + /* + * TDX_OPERAND_BUSY could be returned for SEPT due to 0-step mitigation + * or for TD EPOCH due to contention with TDH.MEM.TRACK on TDH.VP.ENTER. + * + * When KVM requests KVM_REQ_OUTSIDE_GUEST_MODE, which has both + * KVM_REQUEST_WAIT and KVM_REQUEST_NO_ACTION set, it requires target + * vCPUs leaving fastpath so that interrupt can be enabled to ensure the + * IPIs can be delivered. Return EXIT_FASTPATH_EXIT_HANDLED instead of + * EXIT_FASTPATH_REENTER_GUEST to exit fastpath, otherwise, the + * requester may be blocked endlessly. + */ + if (unlikely(tdx_operand_busy(vp_enter_ret))) + return EXIT_FASTPATH_EXIT_HANDLED; + + return EXIT_FASTPATH_NONE; +} + +#define TDX_REGS_AVAIL_SET (BIT_ULL(VCPU_EXREG_EXIT_INFO_1) | \ + BIT_ULL(VCPU_EXREG_EXIT_INFO_2) | \ + BIT_ULL(VCPU_REGS_RAX) | \ + BIT_ULL(VCPU_REGS_RBX) | \ + BIT_ULL(VCPU_REGS_RCX) | \ + BIT_ULL(VCPU_REGS_RDX) | \ + BIT_ULL(VCPU_REGS_RBP) | \ + BIT_ULL(VCPU_REGS_RSI) | \ + BIT_ULL(VCPU_REGS_RDI) | \ + BIT_ULL(VCPU_REGS_R8) | \ + BIT_ULL(VCPU_REGS_R9) | \ + BIT_ULL(VCPU_REGS_R10) | \ + BIT_ULL(VCPU_REGS_R11) | \ + BIT_ULL(VCPU_REGS_R12) | \ + BIT_ULL(VCPU_REGS_R13) | \ + BIT_ULL(VCPU_REGS_R14) | \ + BIT_ULL(VCPU_REGS_R15)) + +static void tdx_load_host_xsave_state(struct kvm_vcpu *vcpu) +{ + struct kvm_tdx *kvm_tdx = to_kvm_tdx(vcpu->kvm); + + /* + * All TDX hosts support PKRU; but even if they didn't, + * vcpu->arch.host_pkru would be 0 and the wrpkru would be + * skipped. + */ + if (vcpu->arch.host_pkru != 0) + wrpkru(vcpu->arch.host_pkru); + + if (kvm_host.xcr0 != (kvm_tdx->xfam & kvm_caps.supported_xcr0)) + xsetbv(XCR_XFEATURE_ENABLED_MASK, kvm_host.xcr0); + + /* + * Likewise, even if a TDX hosts didn't support XSS both arms of + * the comparison would be 0 and the wrmsrl would be skipped. + */ + if (kvm_host.xss != (kvm_tdx->xfam & kvm_caps.supported_xss)) + wrmsrl(MSR_IA32_XSS, kvm_host.xss); +} + +#define TDX_DEBUGCTL_PRESERVED (DEBUGCTLMSR_BTF | \ + DEBUGCTLMSR_FREEZE_PERFMON_ON_PMI | \ + DEBUGCTLMSR_FREEZE_IN_SMM) + +fastpath_t tdx_vcpu_run(struct kvm_vcpu *vcpu, bool force_immediate_exit) +{ + struct vcpu_tdx *tdx = to_tdx(vcpu); + struct vcpu_vt *vt = to_vt(vcpu); + + /* + * force_immediate_exit requires vCPU entering for events injection with + * an immediately exit followed. But The TDX module doesn't guarantee + * entry, it's already possible for KVM to _think_ it completely entry + * to the guest without actually having done so. + * Since KVM never needs to force an immediate exit for TDX, and can't + * do direct injection, just warn on force_immediate_exit. + */ + WARN_ON_ONCE(force_immediate_exit); + + /* + * Wait until retry of SEPT-zap-related SEAMCALL completes before + * allowing vCPU entry to avoid contention with tdh_vp_enter() and + * TDCALLs. + */ + if (unlikely(READ_ONCE(to_kvm_tdx(vcpu->kvm)->wait_for_sept_zap))) + return EXIT_FASTPATH_EXIT_HANDLED; + + trace_kvm_entry(vcpu, force_immediate_exit); + + if (pi_test_on(&vt->pi_desc)) { + apic->send_IPI_self(POSTED_INTR_VECTOR); + + if (pi_test_pir(kvm_lapic_get_reg(vcpu->arch.apic, APIC_LVTT) & + APIC_VECTOR_MASK, &vt->pi_desc)) + kvm_wait_lapic_expire(vcpu); + } + + tdx_vcpu_enter_exit(vcpu); + + if (vt->host_debugctlmsr & ~TDX_DEBUGCTL_PRESERVED) + update_debugctlmsr(vt->host_debugctlmsr); + + tdx_load_host_xsave_state(vcpu); + tdx->guest_entered = true; + + vcpu->arch.regs_avail &= TDX_REGS_AVAIL_SET; + + if (unlikely(tdx->vp_enter_ret == EXIT_REASON_EPT_MISCONFIG)) + return EXIT_FASTPATH_NONE; + + if (unlikely((tdx->vp_enter_ret & TDX_SW_ERROR) == TDX_SW_ERROR)) + return EXIT_FASTPATH_NONE; + + if (unlikely(vmx_get_exit_reason(vcpu).basic == EXIT_REASON_MCE_DURING_VMENTRY)) + kvm_machine_check(); + + trace_kvm_exit(vcpu, KVM_ISA_VMX); + + if (unlikely(tdx_failed_vmentry(vcpu))) + return EXIT_FASTPATH_NONE; + + return tdx_exit_handlers_fastpath(vcpu); +} + +void tdx_inject_nmi(struct kvm_vcpu *vcpu) +{ + ++vcpu->stat.nmi_injections; + td_management_write8(to_tdx(vcpu), TD_VCPU_PEND_NMI, 1); + /* + * From KVM's perspective, NMI injection is completed right after + * writing to PEND_NMI. KVM doesn't care whether an NMI is injected by + * the TDX module or not. + */ + vcpu->arch.nmi_injected = false; + /* + * TDX doesn't support KVM to request NMI window exit. If there is + * still a pending vNMI, KVM is not able to inject it along with the + * one pending in TDX module in a back-to-back way. Since the previous + * vNMI is still pending in TDX module, i.e. it has not been delivered + * to TDX guest yet, it's OK to collapse the pending vNMI into the + * previous one. The guest is expected to handle all the NMI sources + * when handling the first vNMI. + */ + vcpu->arch.nmi_pending = 0; +} + +static int tdx_handle_exception_nmi(struct kvm_vcpu *vcpu) +{ + u32 intr_info = vmx_get_intr_info(vcpu); + + /* + * Machine checks are handled by handle_exception_irqoff(), or by + * tdx_handle_exit() with TDX_NON_RECOVERABLE set if a #MC occurs on + * VM-Entry. NMIs are handled by tdx_vcpu_enter_exit(). + */ + if (is_nmi(intr_info) || is_machine_check(intr_info)) + return 1; + + vcpu->run->exit_reason = KVM_EXIT_EXCEPTION; + vcpu->run->ex.exception = intr_info & INTR_INFO_VECTOR_MASK; + vcpu->run->ex.error_code = 0; + + return 0; +} + +static int complete_hypercall_exit(struct kvm_vcpu *vcpu) +{ + tdvmcall_set_return_code(vcpu, vcpu->run->hypercall.ret); + return 1; +} + +static int tdx_emulate_vmcall(struct kvm_vcpu *vcpu) +{ + kvm_rax_write(vcpu, to_tdx(vcpu)->vp_enter_args.r10); + kvm_rbx_write(vcpu, to_tdx(vcpu)->vp_enter_args.r11); + kvm_rcx_write(vcpu, to_tdx(vcpu)->vp_enter_args.r12); + kvm_rdx_write(vcpu, to_tdx(vcpu)->vp_enter_args.r13); + kvm_rsi_write(vcpu, to_tdx(vcpu)->vp_enter_args.r14); + + return __kvm_emulate_hypercall(vcpu, 0, complete_hypercall_exit); +} + +/* + * Split into chunks and check interrupt pending between chunks. This allows + * for timely injection of interrupts to prevent issues with guest lockup + * detection. + */ +#define TDX_MAP_GPA_MAX_LEN (2 * 1024 * 1024) +static void __tdx_map_gpa(struct vcpu_tdx *tdx); + +static int tdx_complete_vmcall_map_gpa(struct kvm_vcpu *vcpu) +{ + struct vcpu_tdx *tdx = to_tdx(vcpu); + + if (vcpu->run->hypercall.ret) { + tdvmcall_set_return_code(vcpu, TDVMCALL_STATUS_INVALID_OPERAND); + tdx->vp_enter_args.r11 = tdx->map_gpa_next; + return 1; + } + + tdx->map_gpa_next += TDX_MAP_GPA_MAX_LEN; + if (tdx->map_gpa_next >= tdx->map_gpa_end) + return 1; + + /* + * Stop processing the remaining part if there is a pending interrupt, + * which could be qualified to deliver. Skip checking pending RVI for + * TDVMCALL_MAP_GPA, see comments in tdx_protected_apic_has_interrupt(). + */ + if (kvm_vcpu_has_events(vcpu)) { + tdvmcall_set_return_code(vcpu, TDVMCALL_STATUS_RETRY); + tdx->vp_enter_args.r11 = tdx->map_gpa_next; + return 1; + } + + __tdx_map_gpa(tdx); + return 0; +} + +static void __tdx_map_gpa(struct vcpu_tdx *tdx) +{ + u64 gpa = tdx->map_gpa_next; + u64 size = tdx->map_gpa_end - tdx->map_gpa_next; + + if (size > TDX_MAP_GPA_MAX_LEN) + size = TDX_MAP_GPA_MAX_LEN; + + tdx->vcpu.run->exit_reason = KVM_EXIT_HYPERCALL; + tdx->vcpu.run->hypercall.nr = KVM_HC_MAP_GPA_RANGE; + /* + * In principle this should have been -KVM_ENOSYS, but userspace (QEMU <=9.2) + * assumed that vcpu->run->hypercall.ret is never changed by KVM and thus that + * it was always zero on KVM_EXIT_HYPERCALL. Since KVM is now overwriting + * vcpu->run->hypercall.ret, ensuring that it is zero to not break QEMU. + */ + tdx->vcpu.run->hypercall.ret = 0; + tdx->vcpu.run->hypercall.args[0] = gpa & ~gfn_to_gpa(kvm_gfn_direct_bits(tdx->vcpu.kvm)); + tdx->vcpu.run->hypercall.args[1] = size / PAGE_SIZE; + tdx->vcpu.run->hypercall.args[2] = vt_is_tdx_private_gpa(tdx->vcpu.kvm, gpa) ? + KVM_MAP_GPA_RANGE_ENCRYPTED : + KVM_MAP_GPA_RANGE_DECRYPTED; + tdx->vcpu.run->hypercall.flags = KVM_EXIT_HYPERCALL_LONG_MODE; + + tdx->vcpu.arch.complete_userspace_io = tdx_complete_vmcall_map_gpa; +} + +static int tdx_map_gpa(struct kvm_vcpu *vcpu) +{ + struct vcpu_tdx *tdx = to_tdx(vcpu); + u64 gpa = tdx->vp_enter_args.r12; + u64 size = tdx->vp_enter_args.r13; + u64 ret; + + /* + * Converting TDVMCALL_MAP_GPA to KVM_HC_MAP_GPA_RANGE requires + * userspace to enable KVM_CAP_EXIT_HYPERCALL with KVM_HC_MAP_GPA_RANGE + * bit set. This is a base call so it should always be supported, but + * KVM has no way to ensure that userspace implements the GHCI correctly. + * So if KVM_HC_MAP_GPA_RANGE does not cause a VMEXIT, return an error + * to the guest. + */ + if (!user_exit_on_hypercall(vcpu->kvm, KVM_HC_MAP_GPA_RANGE)) { + ret = TDVMCALL_STATUS_SUBFUNC_UNSUPPORTED; + goto error; + } + + if (gpa + size <= gpa || !kvm_vcpu_is_legal_gpa(vcpu, gpa) || + !kvm_vcpu_is_legal_gpa(vcpu, gpa + size - 1) || + (vt_is_tdx_private_gpa(vcpu->kvm, gpa) != + vt_is_tdx_private_gpa(vcpu->kvm, gpa + size - 1))) { + ret = TDVMCALL_STATUS_INVALID_OPERAND; + goto error; + } + + if (!PAGE_ALIGNED(gpa) || !PAGE_ALIGNED(size)) { + ret = TDVMCALL_STATUS_ALIGN_ERROR; + goto error; + } + + tdx->map_gpa_end = gpa + size; + tdx->map_gpa_next = gpa; + + __tdx_map_gpa(tdx); + return 0; + +error: + tdvmcall_set_return_code(vcpu, ret); + tdx->vp_enter_args.r11 = gpa; + return 1; +} + +static int tdx_report_fatal_error(struct kvm_vcpu *vcpu) +{ + struct vcpu_tdx *tdx = to_tdx(vcpu); + u64 *regs = vcpu->run->system_event.data; + u64 *module_regs = &tdx->vp_enter_args.r8; + int index = VCPU_REGS_RAX; + + vcpu->run->exit_reason = KVM_EXIT_SYSTEM_EVENT; + vcpu->run->system_event.type = KVM_SYSTEM_EVENT_TDX_FATAL; + vcpu->run->system_event.ndata = 16; + + /* Dump 16 general-purpose registers to userspace in ascending order. */ + regs[index++] = tdx->vp_enter_ret; + regs[index++] = tdx->vp_enter_args.rcx; + regs[index++] = tdx->vp_enter_args.rdx; + regs[index++] = tdx->vp_enter_args.rbx; + regs[index++] = 0; + regs[index++] = 0; + regs[index++] = tdx->vp_enter_args.rsi; + regs[index] = tdx->vp_enter_args.rdi; + for (index = 0; index < 8; index++) + regs[VCPU_REGS_R8 + index] = module_regs[index]; + + return 0; +} + +static int tdx_emulate_cpuid(struct kvm_vcpu *vcpu) +{ + u32 eax, ebx, ecx, edx; + struct vcpu_tdx *tdx = to_tdx(vcpu); + + /* EAX and ECX for cpuid is stored in R12 and R13. */ + eax = tdx->vp_enter_args.r12; + ecx = tdx->vp_enter_args.r13; + + kvm_cpuid(vcpu, &eax, &ebx, &ecx, &edx, false); + + tdx->vp_enter_args.r12 = eax; + tdx->vp_enter_args.r13 = ebx; + tdx->vp_enter_args.r14 = ecx; + tdx->vp_enter_args.r15 = edx; + + return 1; +} + +static int tdx_complete_pio_out(struct kvm_vcpu *vcpu) +{ + vcpu->arch.pio.count = 0; + return 1; +} + +static int tdx_complete_pio_in(struct kvm_vcpu *vcpu) +{ + struct x86_emulate_ctxt *ctxt = vcpu->arch.emulate_ctxt; + unsigned long val = 0; + int ret; + + ret = ctxt->ops->pio_in_emulated(ctxt, vcpu->arch.pio.size, + vcpu->arch.pio.port, &val, 1); + + WARN_ON_ONCE(!ret); + + tdvmcall_set_return_val(vcpu, val); + + return 1; +} + +static int tdx_emulate_io(struct kvm_vcpu *vcpu) +{ + struct vcpu_tdx *tdx = to_tdx(vcpu); + struct x86_emulate_ctxt *ctxt = vcpu->arch.emulate_ctxt; + unsigned long val = 0; + unsigned int port; + u64 size, write; + int ret; + + ++vcpu->stat.io_exits; + + size = tdx->vp_enter_args.r12; + write = tdx->vp_enter_args.r13; + port = tdx->vp_enter_args.r14; + + if ((write != 0 && write != 1) || (size != 1 && size != 2 && size != 4)) { + tdvmcall_set_return_code(vcpu, TDVMCALL_STATUS_INVALID_OPERAND); + return 1; + } + + if (write) { + val = tdx->vp_enter_args.r15; + ret = ctxt->ops->pio_out_emulated(ctxt, size, port, &val, 1); + } else { + ret = ctxt->ops->pio_in_emulated(ctxt, size, port, &val, 1); + } + + if (!ret) + vcpu->arch.complete_userspace_io = write ? tdx_complete_pio_out : + tdx_complete_pio_in; + else if (!write) + tdvmcall_set_return_val(vcpu, val); + + return ret; +} + +static int tdx_complete_mmio_read(struct kvm_vcpu *vcpu) +{ + unsigned long val = 0; + gpa_t gpa; + int size; + + gpa = vcpu->mmio_fragments[0].gpa; + size = vcpu->mmio_fragments[0].len; + + memcpy(&val, vcpu->run->mmio.data, size); + tdvmcall_set_return_val(vcpu, val); + trace_kvm_mmio(KVM_TRACE_MMIO_READ, size, gpa, &val); + return 1; +} + +static inline int tdx_mmio_write(struct kvm_vcpu *vcpu, gpa_t gpa, int size, + unsigned long val) +{ + if (!kvm_io_bus_write(vcpu, KVM_FAST_MMIO_BUS, gpa, 0, NULL)) { + trace_kvm_fast_mmio(gpa); + return 0; + } + + trace_kvm_mmio(KVM_TRACE_MMIO_WRITE, size, gpa, &val); + if (kvm_io_bus_write(vcpu, KVM_MMIO_BUS, gpa, size, &val)) + return -EOPNOTSUPP; + + return 0; +} + +static inline int tdx_mmio_read(struct kvm_vcpu *vcpu, gpa_t gpa, int size) +{ + unsigned long val; + + if (kvm_io_bus_read(vcpu, KVM_MMIO_BUS, gpa, size, &val)) + return -EOPNOTSUPP; + + tdvmcall_set_return_val(vcpu, val); + trace_kvm_mmio(KVM_TRACE_MMIO_READ, size, gpa, &val); + return 0; +} + +static int tdx_emulate_mmio(struct kvm_vcpu *vcpu) +{ + struct vcpu_tdx *tdx = to_tdx(vcpu); + int size, write, r; + unsigned long val; + gpa_t gpa; + + size = tdx->vp_enter_args.r12; + write = tdx->vp_enter_args.r13; + gpa = tdx->vp_enter_args.r14; + val = write ? tdx->vp_enter_args.r15 : 0; + + if (size != 1 && size != 2 && size != 4 && size != 8) + goto error; + if (write != 0 && write != 1) + goto error; + + /* + * TDG.VP.VMCALL<MMIO> allows only shared GPA, it makes no sense to + * do MMIO emulation for private GPA. + */ + if (vt_is_tdx_private_gpa(vcpu->kvm, gpa) || + vt_is_tdx_private_gpa(vcpu->kvm, gpa + size - 1)) + goto error; + + gpa = gpa & ~gfn_to_gpa(kvm_gfn_direct_bits(vcpu->kvm)); + + if (write) + r = tdx_mmio_write(vcpu, gpa, size, val); + else + r = tdx_mmio_read(vcpu, gpa, size); + if (!r) + /* Kernel completed device emulation. */ + return 1; + + /* Request the device emulation to userspace device model. */ + vcpu->mmio_is_write = write; + if (!write) + vcpu->arch.complete_userspace_io = tdx_complete_mmio_read; + + vcpu->run->mmio.phys_addr = gpa; + vcpu->run->mmio.len = size; + vcpu->run->mmio.is_write = write; + vcpu->run->exit_reason = KVM_EXIT_MMIO; + + if (write) { + memcpy(vcpu->run->mmio.data, &val, size); + } else { + vcpu->mmio_fragments[0].gpa = gpa; + vcpu->mmio_fragments[0].len = size; + trace_kvm_mmio(KVM_TRACE_MMIO_READ_UNSATISFIED, size, gpa, NULL); + } + return 0; + +error: + tdvmcall_set_return_code(vcpu, TDVMCALL_STATUS_INVALID_OPERAND); + return 1; +} + +static int tdx_complete_get_td_vm_call_info(struct kvm_vcpu *vcpu) +{ + struct vcpu_tdx *tdx = to_tdx(vcpu); + + tdvmcall_set_return_code(vcpu, vcpu->run->tdx.get_tdvmcall_info.ret); + + /* + * For now, there is no TDVMCALL beyond GHCI base API supported by KVM + * directly without the support from userspace, just set the value + * returned from userspace. + */ + tdx->vp_enter_args.r11 = vcpu->run->tdx.get_tdvmcall_info.r11; + tdx->vp_enter_args.r12 = vcpu->run->tdx.get_tdvmcall_info.r12; + tdx->vp_enter_args.r13 = vcpu->run->tdx.get_tdvmcall_info.r13; + tdx->vp_enter_args.r14 = vcpu->run->tdx.get_tdvmcall_info.r14; + + return 1; +} + +static int tdx_get_td_vm_call_info(struct kvm_vcpu *vcpu) +{ + struct vcpu_tdx *tdx = to_tdx(vcpu); + + switch (tdx->vp_enter_args.r12) { + case 0: + tdx->vp_enter_args.r11 = 0; + tdx->vp_enter_args.r12 = 0; + tdx->vp_enter_args.r13 = 0; + tdx->vp_enter_args.r14 = 0; + tdvmcall_set_return_code(vcpu, TDVMCALL_STATUS_SUCCESS); + return 1; + case 1: + vcpu->run->tdx.get_tdvmcall_info.leaf = tdx->vp_enter_args.r12; + vcpu->run->exit_reason = KVM_EXIT_TDX; + vcpu->run->tdx.flags = 0; + vcpu->run->tdx.nr = TDVMCALL_GET_TD_VM_CALL_INFO; + vcpu->run->tdx.get_tdvmcall_info.ret = TDVMCALL_STATUS_SUCCESS; + vcpu->run->tdx.get_tdvmcall_info.r11 = 0; + vcpu->run->tdx.get_tdvmcall_info.r12 = 0; + vcpu->run->tdx.get_tdvmcall_info.r13 = 0; + vcpu->run->tdx.get_tdvmcall_info.r14 = 0; + vcpu->arch.complete_userspace_io = tdx_complete_get_td_vm_call_info; + return 0; + default: + tdvmcall_set_return_code(vcpu, TDVMCALL_STATUS_INVALID_OPERAND); + return 1; + } +} + +static int tdx_complete_simple(struct kvm_vcpu *vcpu) +{ + tdvmcall_set_return_code(vcpu, vcpu->run->tdx.unknown.ret); + return 1; +} + +static int tdx_get_quote(struct kvm_vcpu *vcpu) +{ + struct vcpu_tdx *tdx = to_tdx(vcpu); + u64 gpa = tdx->vp_enter_args.r12; + u64 size = tdx->vp_enter_args.r13; + + /* The gpa of buffer must have shared bit set. */ + if (vt_is_tdx_private_gpa(vcpu->kvm, gpa)) { + tdvmcall_set_return_code(vcpu, TDVMCALL_STATUS_INVALID_OPERAND); + return 1; + } + + vcpu->run->exit_reason = KVM_EXIT_TDX; + vcpu->run->tdx.flags = 0; + vcpu->run->tdx.nr = TDVMCALL_GET_QUOTE; + vcpu->run->tdx.get_quote.ret = TDVMCALL_STATUS_SUBFUNC_UNSUPPORTED; + vcpu->run->tdx.get_quote.gpa = gpa & ~gfn_to_gpa(kvm_gfn_direct_bits(tdx->vcpu.kvm)); + vcpu->run->tdx.get_quote.size = size; + + vcpu->arch.complete_userspace_io = tdx_complete_simple; + + return 0; +} + +static int handle_tdvmcall(struct kvm_vcpu *vcpu) +{ + switch (tdvmcall_leaf(vcpu)) { + case TDVMCALL_MAP_GPA: + return tdx_map_gpa(vcpu); + case TDVMCALL_REPORT_FATAL_ERROR: + return tdx_report_fatal_error(vcpu); + case TDVMCALL_GET_TD_VM_CALL_INFO: + return tdx_get_td_vm_call_info(vcpu); + case TDVMCALL_GET_QUOTE: + return tdx_get_quote(vcpu); + default: + break; + } + + tdvmcall_set_return_code(vcpu, TDVMCALL_STATUS_SUBFUNC_UNSUPPORTED); + return 1; +} + +void tdx_load_mmu_pgd(struct kvm_vcpu *vcpu, hpa_t root_hpa, int pgd_level) +{ + u64 shared_bit = (pgd_level == 5) ? TDX_SHARED_BIT_PWL_5 : + TDX_SHARED_BIT_PWL_4; + + if (KVM_BUG_ON(shared_bit != kvm_gfn_direct_bits(vcpu->kvm), vcpu->kvm)) + return; + + td_vmcs_write64(to_tdx(vcpu), SHARED_EPT_POINTER, root_hpa); +} + +static void tdx_unpin(struct kvm *kvm, struct page *page) +{ + put_page(page); +} + +static int tdx_mem_page_aug(struct kvm *kvm, gfn_t gfn, + enum pg_level level, struct page *page) +{ + int tdx_level = pg_level_to_tdx_sept_level(level); + struct kvm_tdx *kvm_tdx = to_kvm_tdx(kvm); + gpa_t gpa = gfn_to_gpa(gfn); + u64 entry, level_state; + u64 err; + + err = tdh_mem_page_aug(&kvm_tdx->td, gpa, tdx_level, page, &entry, &level_state); + if (unlikely(tdx_operand_busy(err))) { + tdx_unpin(kvm, page); + return -EBUSY; + } + + if (KVM_BUG_ON(err, kvm)) { + pr_tdx_error_2(TDH_MEM_PAGE_AUG, err, entry, level_state); + tdx_unpin(kvm, page); + return -EIO; + } + + return 0; +} + +/* + * KVM_TDX_INIT_MEM_REGION calls kvm_gmem_populate() to map guest pages; the + * callback tdx_gmem_post_populate() then maps pages into private memory. + * through the a seamcall TDH.MEM.PAGE.ADD(). The SEAMCALL also requires the + * private EPT structures for the page to have been built before, which is + * done via kvm_tdp_map_page(). nr_premapped counts the number of pages that + * were added to the EPT structures but not added with TDH.MEM.PAGE.ADD(). + * The counter has to be zero on KVM_TDX_FINALIZE_VM, to ensure that there + * are no half-initialized shared EPT pages. + */ +static int tdx_mem_page_record_premap_cnt(struct kvm *kvm, gfn_t gfn, + enum pg_level level, kvm_pfn_t pfn) +{ + struct kvm_tdx *kvm_tdx = to_kvm_tdx(kvm); + + if (KVM_BUG_ON(kvm->arch.pre_fault_allowed, kvm)) + return -EINVAL; + + /* nr_premapped will be decreased when tdh_mem_page_add() is called. */ + atomic64_inc(&kvm_tdx->nr_premapped); + return 0; +} + +int tdx_sept_set_private_spte(struct kvm *kvm, gfn_t gfn, + enum pg_level level, kvm_pfn_t pfn) +{ + struct kvm_tdx *kvm_tdx = to_kvm_tdx(kvm); + struct page *page = pfn_to_page(pfn); + + /* TODO: handle large pages. */ + if (KVM_BUG_ON(level != PG_LEVEL_4K, kvm)) + return -EINVAL; + + /* + * Because guest_memfd doesn't support page migration with + * a_ops->migrate_folio (yet), no callback is triggered for KVM on page + * migration. Until guest_memfd supports page migration, prevent page + * migration. + * TODO: Once guest_memfd introduces callback on page migration, + * implement it and remove get_page/put_page(). + */ + get_page(page); + + /* + * Read 'pre_fault_allowed' before 'kvm_tdx->state'; see matching + * barrier in tdx_td_finalize(). + */ + smp_rmb(); + if (likely(kvm_tdx->state == TD_STATE_RUNNABLE)) + return tdx_mem_page_aug(kvm, gfn, level, page); + + return tdx_mem_page_record_premap_cnt(kvm, gfn, level, pfn); +} + +static int tdx_sept_drop_private_spte(struct kvm *kvm, gfn_t gfn, + enum pg_level level, struct page *page) +{ + int tdx_level = pg_level_to_tdx_sept_level(level); + struct kvm_tdx *kvm_tdx = to_kvm_tdx(kvm); + gpa_t gpa = gfn_to_gpa(gfn); + u64 err, entry, level_state; + + /* TODO: handle large pages. */ + if (KVM_BUG_ON(level != PG_LEVEL_4K, kvm)) + return -EINVAL; + + if (KVM_BUG_ON(!is_hkid_assigned(kvm_tdx), kvm)) + return -EINVAL; + + /* + * When zapping private page, write lock is held. So no race condition + * with other vcpu sept operation. + * Race with TDH.VP.ENTER due to (0-step mitigation) and Guest TDCALLs. + */ + err = tdh_mem_page_remove(&kvm_tdx->td, gpa, tdx_level, &entry, + &level_state); + + if (unlikely(tdx_operand_busy(err))) { + /* + * The second retry is expected to succeed after kicking off all + * other vCPUs and prevent them from invoking TDH.VP.ENTER. + */ + tdx_no_vcpus_enter_start(kvm); + err = tdh_mem_page_remove(&kvm_tdx->td, gpa, tdx_level, &entry, + &level_state); + tdx_no_vcpus_enter_stop(kvm); + } + + if (KVM_BUG_ON(err, kvm)) { + pr_tdx_error_2(TDH_MEM_PAGE_REMOVE, err, entry, level_state); + return -EIO; + } + + err = tdh_phymem_page_wbinvd_hkid((u16)kvm_tdx->hkid, page); + + if (KVM_BUG_ON(err, kvm)) { + pr_tdx_error(TDH_PHYMEM_PAGE_WBINVD, err); + return -EIO; + } + tdx_clear_page(page); + tdx_unpin(kvm, page); + return 0; +} + +int tdx_sept_link_private_spt(struct kvm *kvm, gfn_t gfn, + enum pg_level level, void *private_spt) +{ + int tdx_level = pg_level_to_tdx_sept_level(level); + gpa_t gpa = gfn_to_gpa(gfn); + struct page *page = virt_to_page(private_spt); + u64 err, entry, level_state; + + err = tdh_mem_sept_add(&to_kvm_tdx(kvm)->td, gpa, tdx_level, page, &entry, + &level_state); + if (unlikely(tdx_operand_busy(err))) + return -EBUSY; + + if (KVM_BUG_ON(err, kvm)) { + pr_tdx_error_2(TDH_MEM_SEPT_ADD, err, entry, level_state); + return -EIO; + } + + return 0; +} + +/* + * Check if the error returned from a SEPT zap SEAMCALL is due to that a page is + * mapped by KVM_TDX_INIT_MEM_REGION without tdh_mem_page_add() being called + * successfully. + * + * Since tdh_mem_sept_add() must have been invoked successfully before a + * non-leaf entry present in the mirrored page table, the SEPT ZAP related + * SEAMCALLs should not encounter err TDX_EPT_WALK_FAILED. They should instead + * find TDX_EPT_ENTRY_STATE_INCORRECT due to an empty leaf entry found in the + * SEPT. + * + * Further check if the returned entry from SEPT walking is with RWX permissions + * to filter out anything unexpected. + * + * Note: @level is pg_level, not the tdx_level. The tdx_level extracted from + * level_state returned from a SEAMCALL error is the same as that passed into + * the SEAMCALL. + */ +static int tdx_is_sept_zap_err_due_to_premap(struct kvm_tdx *kvm_tdx, u64 err, + u64 entry, int level) +{ + if (!err || kvm_tdx->state == TD_STATE_RUNNABLE) + return false; + + if (err != (TDX_EPT_ENTRY_STATE_INCORRECT | TDX_OPERAND_ID_RCX)) + return false; + + if ((is_last_spte(entry, level) && (entry & VMX_EPT_RWX_MASK))) + return false; + + return true; +} + +static int tdx_sept_zap_private_spte(struct kvm *kvm, gfn_t gfn, + enum pg_level level, struct page *page) +{ + int tdx_level = pg_level_to_tdx_sept_level(level); + struct kvm_tdx *kvm_tdx = to_kvm_tdx(kvm); + gpa_t gpa = gfn_to_gpa(gfn) & KVM_HPAGE_MASK(level); + u64 err, entry, level_state; + + /* For now large page isn't supported yet. */ + WARN_ON_ONCE(level != PG_LEVEL_4K); + + err = tdh_mem_range_block(&kvm_tdx->td, gpa, tdx_level, &entry, &level_state); + + if (unlikely(tdx_operand_busy(err))) { + /* After no vCPUs enter, the second retry is expected to succeed */ + tdx_no_vcpus_enter_start(kvm); + err = tdh_mem_range_block(&kvm_tdx->td, gpa, tdx_level, &entry, &level_state); + tdx_no_vcpus_enter_stop(kvm); + } + if (tdx_is_sept_zap_err_due_to_premap(kvm_tdx, err, entry, level) && + !KVM_BUG_ON(!atomic64_read(&kvm_tdx->nr_premapped), kvm)) { + atomic64_dec(&kvm_tdx->nr_premapped); + tdx_unpin(kvm, page); + return 0; + } + + if (KVM_BUG_ON(err, kvm)) { + pr_tdx_error_2(TDH_MEM_RANGE_BLOCK, err, entry, level_state); + return -EIO; + } + return 1; +} + +/* + * Ensure shared and private EPTs to be flushed on all vCPUs. + * tdh_mem_track() is the only caller that increases TD epoch. An increase in + * the TD epoch (e.g., to value "N + 1") is successful only if no vCPUs are + * running in guest mode with the value "N - 1". + * + * A successful execution of tdh_mem_track() ensures that vCPUs can only run in + * guest mode with TD epoch value "N" if no TD exit occurs after the TD epoch + * being increased to "N + 1". + * + * Kicking off all vCPUs after that further results in no vCPUs can run in guest + * mode with TD epoch value "N", which unblocks the next tdh_mem_track() (e.g. + * to increase TD epoch to "N + 2"). + * + * TDX module will flush EPT on the next TD enter and make vCPUs to run in + * guest mode with TD epoch value "N + 1". + * + * kvm_make_all_cpus_request() guarantees all vCPUs are out of guest mode by + * waiting empty IPI handler ack_kick(). + * + * No action is required to the vCPUs being kicked off since the kicking off + * occurs certainly after TD epoch increment and before the next + * tdh_mem_track(). + */ +static void tdx_track(struct kvm *kvm) +{ + struct kvm_tdx *kvm_tdx = to_kvm_tdx(kvm); + u64 err; + + /* If TD isn't finalized, it's before any vcpu running. */ + if (unlikely(kvm_tdx->state != TD_STATE_RUNNABLE)) + return; + + lockdep_assert_held_write(&kvm->mmu_lock); + + err = tdh_mem_track(&kvm_tdx->td); + if (unlikely(tdx_operand_busy(err))) { + /* After no vCPUs enter, the second retry is expected to succeed */ + tdx_no_vcpus_enter_start(kvm); + err = tdh_mem_track(&kvm_tdx->td); + tdx_no_vcpus_enter_stop(kvm); + } + + if (KVM_BUG_ON(err, kvm)) + pr_tdx_error(TDH_MEM_TRACK, err); + + kvm_make_all_cpus_request(kvm, KVM_REQ_OUTSIDE_GUEST_MODE); +} + +int tdx_sept_free_private_spt(struct kvm *kvm, gfn_t gfn, + enum pg_level level, void *private_spt) +{ + struct kvm_tdx *kvm_tdx = to_kvm_tdx(kvm); + + /* + * free_external_spt() is only called after hkid is freed when TD is + * tearing down. + * KVM doesn't (yet) zap page table pages in mirror page table while + * TD is active, though guest pages mapped in mirror page table could be + * zapped during TD is active, e.g. for shared <-> private conversion + * and slot move/deletion. + */ + if (KVM_BUG_ON(is_hkid_assigned(kvm_tdx), kvm)) + return -EINVAL; + + /* + * The HKID assigned to this TD was already freed and cache was + * already flushed. We don't have to flush again. + */ + return tdx_reclaim_page(virt_to_page(private_spt)); +} + +int tdx_sept_remove_private_spte(struct kvm *kvm, gfn_t gfn, + enum pg_level level, kvm_pfn_t pfn) +{ + struct page *page = pfn_to_page(pfn); + int ret; + + /* + * HKID is released after all private pages have been removed, and set + * before any might be populated. Warn if zapping is attempted when + * there can't be anything populated in the private EPT. + */ + if (KVM_BUG_ON(!is_hkid_assigned(to_kvm_tdx(kvm)), kvm)) + return -EINVAL; + + ret = tdx_sept_zap_private_spte(kvm, gfn, level, page); + if (ret <= 0) + return ret; + + /* + * TDX requires TLB tracking before dropping private page. Do + * it here, although it is also done later. + */ + tdx_track(kvm); + + return tdx_sept_drop_private_spte(kvm, gfn, level, page); +} + +void tdx_deliver_interrupt(struct kvm_lapic *apic, int delivery_mode, + int trig_mode, int vector) +{ + struct kvm_vcpu *vcpu = apic->vcpu; + struct vcpu_tdx *tdx = to_tdx(vcpu); + + /* TDX supports only posted interrupt. No lapic emulation. */ + __vmx_deliver_posted_interrupt(vcpu, &tdx->vt.pi_desc, vector); + + trace_kvm_apicv_accept_irq(vcpu->vcpu_id, delivery_mode, trig_mode, vector); +} + +static inline bool tdx_is_sept_violation_unexpected_pending(struct kvm_vcpu *vcpu) +{ + u64 eeq_type = to_tdx(vcpu)->ext_exit_qualification & TDX_EXT_EXIT_QUAL_TYPE_MASK; + u64 eq = vmx_get_exit_qual(vcpu); + + if (eeq_type != TDX_EXT_EXIT_QUAL_TYPE_PENDING_EPT_VIOLATION) + return false; + + return !(eq & EPT_VIOLATION_PROT_MASK) && !(eq & EPT_VIOLATION_EXEC_FOR_RING3_LIN); +} + +static int tdx_handle_ept_violation(struct kvm_vcpu *vcpu) +{ + unsigned long exit_qual; + gpa_t gpa = to_tdx(vcpu)->exit_gpa; + bool local_retry = false; + int ret; + + if (vt_is_tdx_private_gpa(vcpu->kvm, gpa)) { + if (tdx_is_sept_violation_unexpected_pending(vcpu)) { + pr_warn("Guest access before accepting 0x%llx on vCPU %d\n", + gpa, vcpu->vcpu_id); + kvm_vm_dead(vcpu->kvm); + return -EIO; + } + /* + * Always treat SEPT violations as write faults. Ignore the + * EXIT_QUALIFICATION reported by TDX-SEAM for SEPT violations. + * TD private pages are always RWX in the SEPT tables, + * i.e. they're always mapped writable. Just as importantly, + * treating SEPT violations as write faults is necessary to + * avoid COW allocations, which will cause TDAUGPAGE failures + * due to aliasing a single HPA to multiple GPAs. + */ + exit_qual = EPT_VIOLATION_ACC_WRITE; + + /* Only private GPA triggers zero-step mitigation */ + local_retry = true; + } else { + exit_qual = vmx_get_exit_qual(vcpu); + /* + * EPT violation due to instruction fetch should never be + * triggered from shared memory in TDX guest. If such EPT + * violation occurs, treat it as broken hardware. + */ + if (KVM_BUG_ON(exit_qual & EPT_VIOLATION_ACC_INSTR, vcpu->kvm)) + return -EIO; + } + + trace_kvm_page_fault(vcpu, gpa, exit_qual); + + /* + * To minimize TDH.VP.ENTER invocations, retry locally for private GPA + * mapping in TDX. + * + * KVM may return RET_PF_RETRY for private GPA due to + * - contentions when atomically updating SPTEs of the mirror page table + * - in-progress GFN invalidation or memslot removal. + * - TDX_OPERAND_BUSY error from TDH.MEM.PAGE.AUG or TDH.MEM.SEPT.ADD, + * caused by contentions with TDH.VP.ENTER (with zero-step mitigation) + * or certain TDCALLs. + * + * If TDH.VP.ENTER is invoked more times than the threshold set by the + * TDX module before KVM resolves the private GPA mapping, the TDX + * module will activate zero-step mitigation during TDH.VP.ENTER. This + * process acquires an SEPT tree lock in the TDX module, leading to + * further contentions with TDH.MEM.PAGE.AUG or TDH.MEM.SEPT.ADD + * operations on other vCPUs. + * + * Breaking out of local retries for kvm_vcpu_has_events() is for + * interrupt injection. kvm_vcpu_has_events() should not see pending + * events for TDX. Since KVM can't determine if IRQs (or NMIs) are + * blocked by TDs, false positives are inevitable i.e., KVM may re-enter + * the guest even if the IRQ/NMI can't be delivered. + * + * Note: even without breaking out of local retries, zero-step + * mitigation may still occur due to + * - invoking of TDH.VP.ENTER after KVM_EXIT_MEMORY_FAULT, + * - a single RIP causing EPT violations for more GFNs than the + * threshold count. + * This is safe, as triggering zero-step mitigation only introduces + * contentions to page installation SEAMCALLs on other vCPUs, which will + * handle retries locally in their EPT violation handlers. + */ + while (1) { + ret = __vmx_handle_ept_violation(vcpu, gpa, exit_qual); + + if (ret != RET_PF_RETRY || !local_retry) + break; + + if (kvm_vcpu_has_events(vcpu) || signal_pending(current)) + break; + + if (kvm_check_request(KVM_REQ_VM_DEAD, vcpu)) { + ret = -EIO; + break; + } + + cond_resched(); + } + return ret; +} + +int tdx_complete_emulated_msr(struct kvm_vcpu *vcpu, int err) +{ + if (err) { + tdvmcall_set_return_code(vcpu, TDVMCALL_STATUS_INVALID_OPERAND); + return 1; + } + + if (vmx_get_exit_reason(vcpu).basic == EXIT_REASON_MSR_READ) + tdvmcall_set_return_val(vcpu, kvm_read_edx_eax(vcpu)); + + return 1; +} + + +int tdx_handle_exit(struct kvm_vcpu *vcpu, fastpath_t fastpath) +{ + struct vcpu_tdx *tdx = to_tdx(vcpu); + u64 vp_enter_ret = tdx->vp_enter_ret; + union vmx_exit_reason exit_reason = vmx_get_exit_reason(vcpu); + + if (fastpath != EXIT_FASTPATH_NONE) + return 1; + + if (unlikely(vp_enter_ret == EXIT_REASON_EPT_MISCONFIG)) { + KVM_BUG_ON(1, vcpu->kvm); + return -EIO; + } + + /* + * Handle TDX SW errors, including TDX_SEAMCALL_UD, TDX_SEAMCALL_GP and + * TDX_SEAMCALL_VMFAILINVALID. + */ + if (unlikely((vp_enter_ret & TDX_SW_ERROR) == TDX_SW_ERROR)) { + KVM_BUG_ON(!kvm_rebooting, vcpu->kvm); + goto unhandled_exit; + } + + if (unlikely(tdx_failed_vmentry(vcpu))) { + /* + * If the guest state is protected, that means off-TD debug is + * not enabled, TDX_NON_RECOVERABLE must be set. + */ + WARN_ON_ONCE(vcpu->arch.guest_state_protected && + !(vp_enter_ret & TDX_NON_RECOVERABLE)); + vcpu->run->exit_reason = KVM_EXIT_FAIL_ENTRY; + vcpu->run->fail_entry.hardware_entry_failure_reason = exit_reason.full; + vcpu->run->fail_entry.cpu = vcpu->arch.last_vmentry_cpu; + return 0; + } + + if (unlikely(vp_enter_ret & (TDX_ERROR | TDX_NON_RECOVERABLE)) && + exit_reason.basic != EXIT_REASON_TRIPLE_FAULT) { + kvm_pr_unimpl("TD vp_enter_ret 0x%llx\n", vp_enter_ret); + goto unhandled_exit; + } + + WARN_ON_ONCE(exit_reason.basic != EXIT_REASON_TRIPLE_FAULT && + (vp_enter_ret & TDX_SEAMCALL_STATUS_MASK) != TDX_SUCCESS); + + switch (exit_reason.basic) { + case EXIT_REASON_TRIPLE_FAULT: + vcpu->run->exit_reason = KVM_EXIT_SHUTDOWN; + vcpu->mmio_needed = 0; + return 0; + case EXIT_REASON_EXCEPTION_NMI: + return tdx_handle_exception_nmi(vcpu); + case EXIT_REASON_EXTERNAL_INTERRUPT: + ++vcpu->stat.irq_exits; + return 1; + case EXIT_REASON_CPUID: + return tdx_emulate_cpuid(vcpu); + case EXIT_REASON_HLT: + return kvm_emulate_halt_noskip(vcpu); + case EXIT_REASON_TDCALL: + return handle_tdvmcall(vcpu); + case EXIT_REASON_VMCALL: + return tdx_emulate_vmcall(vcpu); + case EXIT_REASON_IO_INSTRUCTION: + return tdx_emulate_io(vcpu); + case EXIT_REASON_MSR_READ: + kvm_rcx_write(vcpu, tdx->vp_enter_args.r12); + return kvm_emulate_rdmsr(vcpu); + case EXIT_REASON_MSR_WRITE: + kvm_rcx_write(vcpu, tdx->vp_enter_args.r12); + kvm_rax_write(vcpu, tdx->vp_enter_args.r13 & -1u); + kvm_rdx_write(vcpu, tdx->vp_enter_args.r13 >> 32); + return kvm_emulate_wrmsr(vcpu); + case EXIT_REASON_EPT_MISCONFIG: + return tdx_emulate_mmio(vcpu); + case EXIT_REASON_EPT_VIOLATION: + return tdx_handle_ept_violation(vcpu); + case EXIT_REASON_OTHER_SMI: + /* + * Unlike VMX, SMI in SEAM non-root mode (i.e. when + * TD guest vCPU is running) will cause VM exit to TDX module, + * then SEAMRET to KVM. Once it exits to KVM, SMI is delivered + * and handled by kernel handler right away. + * + * The Other SMI exit can also be caused by the SEAM non-root + * machine check delivered via Machine Check System Management + * Interrupt (MSMI), but it has already been handled by the + * kernel machine check handler, i.e., the memory page has been + * marked as poisoned and it won't be freed to the free list + * when the TDX guest is terminated (the TDX module marks the + * guest as dead and prevent it from further running when + * machine check happens in SEAM non-root). + * + * - A MSMI will not reach here, it's handled as non_recoverable + * case above. + * - If it's not an MSMI, no need to do anything here. + */ + return 1; + default: + break; + } + +unhandled_exit: + vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR; + vcpu->run->internal.suberror = KVM_INTERNAL_ERROR_UNEXPECTED_EXIT_REASON; + vcpu->run->internal.ndata = 2; + vcpu->run->internal.data[0] = vp_enter_ret; + vcpu->run->internal.data[1] = vcpu->arch.last_vmentry_cpu; + return 0; +} + +void tdx_get_exit_info(struct kvm_vcpu *vcpu, u32 *reason, + u64 *info1, u64 *info2, u32 *intr_info, u32 *error_code) +{ + struct vcpu_tdx *tdx = to_tdx(vcpu); + + *reason = tdx->vt.exit_reason.full; + if (*reason != -1u) { + *info1 = vmx_get_exit_qual(vcpu); + *info2 = tdx->ext_exit_qualification; + *intr_info = vmx_get_intr_info(vcpu); + } else { + *info1 = 0; + *info2 = 0; + *intr_info = 0; + } + + *error_code = 0; +} + +bool tdx_has_emulated_msr(u32 index) +{ + switch (index) { + case MSR_IA32_UCODE_REV: + case MSR_IA32_ARCH_CAPABILITIES: + case MSR_IA32_POWER_CTL: + case MSR_IA32_CR_PAT: + case MSR_MTRRcap: + case MTRRphysBase_MSR(0) ... MSR_MTRRfix4K_F8000: + case MSR_MTRRdefType: + case MSR_IA32_TSC_DEADLINE: + case MSR_IA32_MISC_ENABLE: + case MSR_PLATFORM_INFO: + case MSR_MISC_FEATURES_ENABLES: + case MSR_IA32_APICBASE: + case MSR_EFER: + case MSR_IA32_FEAT_CTL: + case MSR_IA32_MCG_CAP: + case MSR_IA32_MCG_STATUS: + case MSR_IA32_MCG_CTL: + case MSR_IA32_MCG_EXT_CTL: + case MSR_IA32_MC0_CTL ... MSR_IA32_MCx_CTL(KVM_MAX_MCE_BANKS) - 1: + case MSR_IA32_MC0_CTL2 ... MSR_IA32_MCx_CTL2(KVM_MAX_MCE_BANKS) - 1: + /* MSR_IA32_MCx_{CTL, STATUS, ADDR, MISC, CTL2} */ + case MSR_KVM_POLL_CONTROL: + return true; + case APIC_BASE_MSR ... APIC_BASE_MSR + 0xff: + /* + * x2APIC registers that are virtualized by the CPU can't be + * emulated, KVM doesn't have access to the virtual APIC page. + */ + switch (index) { + case X2APIC_MSR(APIC_TASKPRI): + case X2APIC_MSR(APIC_PROCPRI): + case X2APIC_MSR(APIC_EOI): + case X2APIC_MSR(APIC_ISR) ... X2APIC_MSR(APIC_ISR + APIC_ISR_NR): + case X2APIC_MSR(APIC_TMR) ... X2APIC_MSR(APIC_TMR + APIC_ISR_NR): + case X2APIC_MSR(APIC_IRR) ... X2APIC_MSR(APIC_IRR + APIC_ISR_NR): + return false; + default: + return true; + } + default: + return false; + } +} + +static bool tdx_is_read_only_msr(u32 index) +{ + return index == MSR_IA32_APICBASE || index == MSR_EFER || + index == MSR_IA32_FEAT_CTL; +} + +int tdx_get_msr(struct kvm_vcpu *vcpu, struct msr_data *msr) +{ + switch (msr->index) { + case MSR_IA32_FEAT_CTL: + /* + * MCE and MCA are advertised via cpuid. Guest kernel could + * check if LMCE is enabled or not. + */ + msr->data = FEAT_CTL_LOCKED; + if (vcpu->arch.mcg_cap & MCG_LMCE_P) + msr->data |= FEAT_CTL_LMCE_ENABLED; + return 0; + case MSR_IA32_MCG_EXT_CTL: + if (!msr->host_initiated && !(vcpu->arch.mcg_cap & MCG_LMCE_P)) + return 1; + msr->data = vcpu->arch.mcg_ext_ctl; + return 0; + default: + if (!tdx_has_emulated_msr(msr->index)) + return 1; + + return kvm_get_msr_common(vcpu, msr); + } +} + +int tdx_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr) +{ + switch (msr->index) { + case MSR_IA32_MCG_EXT_CTL: + if ((!msr->host_initiated && !(vcpu->arch.mcg_cap & MCG_LMCE_P)) || + (msr->data & ~MCG_EXT_CTL_LMCE_EN)) + return 1; + vcpu->arch.mcg_ext_ctl = msr->data; + return 0; + default: + if (tdx_is_read_only_msr(msr->index)) + return 1; + + if (!tdx_has_emulated_msr(msr->index)) + return 1; + + return kvm_set_msr_common(vcpu, msr); + } +} + +static int tdx_get_capabilities(struct kvm_tdx_cmd *cmd) +{ + const struct tdx_sys_info_td_conf *td_conf = &tdx_sysinfo->td_conf; + struct kvm_tdx_capabilities __user *user_caps; + struct kvm_tdx_capabilities *caps = NULL; + int ret = 0; + + /* flags is reserved for future use */ + if (cmd->flags) + return -EINVAL; + + caps = kmalloc(sizeof(*caps) + + sizeof(struct kvm_cpuid_entry2) * td_conf->num_cpuid_config, + GFP_KERNEL); + if (!caps) + return -ENOMEM; + + user_caps = u64_to_user_ptr(cmd->data); + if (copy_from_user(caps, user_caps, sizeof(*caps))) { + ret = -EFAULT; + goto out; + } + + if (caps->cpuid.nent < td_conf->num_cpuid_config) { + ret = -E2BIG; + goto out; + } + + ret = init_kvm_tdx_caps(td_conf, caps); + if (ret) + goto out; + + if (copy_to_user(user_caps, caps, sizeof(*caps))) { + ret = -EFAULT; + goto out; + } + + if (copy_to_user(user_caps->cpuid.entries, caps->cpuid.entries, + caps->cpuid.nent * + sizeof(caps->cpuid.entries[0]))) + ret = -EFAULT; + +out: + /* kfree() accepts NULL. */ + kfree(caps); + return ret; +} + +/* + * KVM reports guest physical address in CPUID.0x800000008.EAX[23:16], which is + * similar to TDX's GPAW. Use this field as the interface for userspace to + * configure the GPAW and EPT level for TDs. + * + * Only values 48 and 52 are supported. Value 52 means GPAW-52 and EPT level + * 5, Value 48 means GPAW-48 and EPT level 4. For value 48, GPAW-48 is always + * supported. Value 52 is only supported when the platform supports 5 level + * EPT. + */ +static int setup_tdparams_eptp_controls(struct kvm_cpuid2 *cpuid, + struct td_params *td_params) +{ + const struct kvm_cpuid_entry2 *entry; + int guest_pa; + + entry = kvm_find_cpuid_entry2(cpuid->entries, cpuid->nent, 0x80000008, 0); + if (!entry) + return -EINVAL; + + guest_pa = tdx_get_guest_phys_addr_bits(entry->eax); + + if (guest_pa != 48 && guest_pa != 52) + return -EINVAL; + + if (guest_pa == 52 && !cpu_has_vmx_ept_5levels()) + return -EINVAL; + + td_params->eptp_controls = VMX_EPTP_MT_WB; + if (guest_pa == 52) { + td_params->eptp_controls |= VMX_EPTP_PWL_5; + td_params->config_flags |= TDX_CONFIG_FLAGS_MAX_GPAW; + } else { + td_params->eptp_controls |= VMX_EPTP_PWL_4; + } + + return 0; +} + +static int setup_tdparams_cpuids(struct kvm_cpuid2 *cpuid, + struct td_params *td_params) +{ + const struct tdx_sys_info_td_conf *td_conf = &tdx_sysinfo->td_conf; + const struct kvm_cpuid_entry2 *entry; + struct tdx_cpuid_value *value; + int i, copy_cnt = 0; + + /* + * td_params.cpuid_values: The number and the order of cpuid_value must + * be same to the one of struct tdsysinfo.{num_cpuid_config, cpuid_configs} + * It's assumed that td_params was zeroed. + */ + for (i = 0; i < td_conf->num_cpuid_config; i++) { + struct kvm_cpuid_entry2 tmp; + + td_init_cpuid_entry2(&tmp, i); + + entry = kvm_find_cpuid_entry2(cpuid->entries, cpuid->nent, + tmp.function, tmp.index); + if (!entry) + continue; + + if (tdx_unsupported_cpuid(entry)) + return -EINVAL; + + copy_cnt++; + + value = &td_params->cpuid_values[i]; + value->eax = entry->eax; + value->ebx = entry->ebx; + value->ecx = entry->ecx; + value->edx = entry->edx; + + /* + * TDX module does not accept nonzero bits 16..23 for the + * CPUID[0x80000008].EAX, see setup_tdparams_eptp_controls(). + */ + if (tmp.function == 0x80000008) + value->eax = tdx_set_guest_phys_addr_bits(value->eax, 0); + } + + /* + * Rely on the TDX module to reject invalid configuration, but it can't + * check of leafs that don't have a proper slot in td_params->cpuid_values + * to stick then. So fail if there were entries that didn't get copied to + * td_params. + */ + if (copy_cnt != cpuid->nent) + return -EINVAL; + + return 0; +} + +static int setup_tdparams(struct kvm *kvm, struct td_params *td_params, + struct kvm_tdx_init_vm *init_vm) +{ + const struct tdx_sys_info_td_conf *td_conf = &tdx_sysinfo->td_conf; + struct kvm_cpuid2 *cpuid = &init_vm->cpuid; + int ret; + + if (kvm->created_vcpus) + return -EBUSY; + + if (init_vm->attributes & ~tdx_get_supported_attrs(td_conf)) + return -EINVAL; + + if (init_vm->xfam & ~tdx_get_supported_xfam(td_conf)) + return -EINVAL; + + td_params->max_vcpus = kvm->max_vcpus; + td_params->attributes = init_vm->attributes | td_conf->attributes_fixed1; + td_params->xfam = init_vm->xfam | td_conf->xfam_fixed1; + + td_params->config_flags = TDX_CONFIG_FLAGS_NO_RBP_MOD; + td_params->tsc_frequency = TDX_TSC_KHZ_TO_25MHZ(kvm->arch.default_tsc_khz); + + ret = setup_tdparams_eptp_controls(cpuid, td_params); + if (ret) + return ret; + + ret = setup_tdparams_cpuids(cpuid, td_params); + if (ret) + return ret; + +#define MEMCPY_SAME_SIZE(dst, src) \ + do { \ + BUILD_BUG_ON(sizeof(dst) != sizeof(src)); \ + memcpy((dst), (src), sizeof(dst)); \ + } while (0) + + MEMCPY_SAME_SIZE(td_params->mrconfigid, init_vm->mrconfigid); + MEMCPY_SAME_SIZE(td_params->mrowner, init_vm->mrowner); + MEMCPY_SAME_SIZE(td_params->mrownerconfig, init_vm->mrownerconfig); + + return 0; +} + +static int __tdx_td_init(struct kvm *kvm, struct td_params *td_params, + u64 *seamcall_err) +{ + struct kvm_tdx *kvm_tdx = to_kvm_tdx(kvm); + cpumask_var_t packages; + struct page **tdcs_pages = NULL; + struct page *tdr_page; + int ret, i; + u64 err, rcx; + + *seamcall_err = 0; + ret = tdx_guest_keyid_alloc(); + if (ret < 0) + return ret; + kvm_tdx->hkid = ret; + kvm_tdx->misc_cg = get_current_misc_cg(); + ret = misc_cg_try_charge(MISC_CG_RES_TDX, kvm_tdx->misc_cg, 1); + if (ret) + goto free_hkid; + + ret = -ENOMEM; + + atomic_inc(&nr_configured_hkid); + + tdr_page = alloc_page(GFP_KERNEL); + if (!tdr_page) + goto free_hkid; + + kvm_tdx->td.tdcs_nr_pages = tdx_sysinfo->td_ctrl.tdcs_base_size / PAGE_SIZE; + /* TDVPS = TDVPR(4K page) + TDCX(multiple 4K pages), -1 for TDVPR. */ + kvm_tdx->td.tdcx_nr_pages = tdx_sysinfo->td_ctrl.tdvps_base_size / PAGE_SIZE - 1; + tdcs_pages = kcalloc(kvm_tdx->td.tdcs_nr_pages, sizeof(*kvm_tdx->td.tdcs_pages), + GFP_KERNEL | __GFP_ZERO); + if (!tdcs_pages) + goto free_tdr; + + for (i = 0; i < kvm_tdx->td.tdcs_nr_pages; i++) { + tdcs_pages[i] = alloc_page(GFP_KERNEL); + if (!tdcs_pages[i]) + goto free_tdcs; + } + + if (!zalloc_cpumask_var(&packages, GFP_KERNEL)) + goto free_tdcs; + + cpus_read_lock(); + + /* + * Need at least one CPU of the package to be online in order to + * program all packages for host key id. Check it. + */ + for_each_present_cpu(i) + cpumask_set_cpu(topology_physical_package_id(i), packages); + for_each_online_cpu(i) + cpumask_clear_cpu(topology_physical_package_id(i), packages); + if (!cpumask_empty(packages)) { + ret = -EIO; + /* + * Because it's hard for human operator to figure out the + * reason, warn it. + */ +#define MSG_ALLPKG "All packages need to have online CPU to create TD. Online CPU and retry.\n" + pr_warn_ratelimited(MSG_ALLPKG); + goto free_packages; + } + + /* + * TDH.MNG.CREATE tries to grab the global TDX module and fails + * with TDX_OPERAND_BUSY when it fails to grab. Take the global + * lock to prevent it from failure. + */ + mutex_lock(&tdx_lock); + kvm_tdx->td.tdr_page = tdr_page; + err = tdh_mng_create(&kvm_tdx->td, kvm_tdx->hkid); + mutex_unlock(&tdx_lock); + + if (err == TDX_RND_NO_ENTROPY) { + ret = -EAGAIN; + goto free_packages; + } + + if (WARN_ON_ONCE(err)) { + pr_tdx_error(TDH_MNG_CREATE, err); + ret = -EIO; + goto free_packages; + } + + for_each_online_cpu(i) { + int pkg = topology_physical_package_id(i); + + if (cpumask_test_and_set_cpu(pkg, packages)) + continue; + + /* + * Program the memory controller in the package with an + * encryption key associated to a TDX private host key id + * assigned to this TDR. Concurrent operations on same memory + * controller results in TDX_OPERAND_BUSY. No locking needed + * beyond the cpus_read_lock() above as it serializes against + * hotplug and the first online CPU of the package is always + * used. We never have two CPUs in the same socket trying to + * program the key. + */ + ret = smp_call_on_cpu(i, tdx_do_tdh_mng_key_config, + kvm_tdx, true); + if (ret) + break; + } + cpus_read_unlock(); + free_cpumask_var(packages); + if (ret) { + i = 0; + goto teardown; + } + + kvm_tdx->td.tdcs_pages = tdcs_pages; + for (i = 0; i < kvm_tdx->td.tdcs_nr_pages; i++) { + err = tdh_mng_addcx(&kvm_tdx->td, tdcs_pages[i]); + if (err == TDX_RND_NO_ENTROPY) { + /* Here it's hard to allow userspace to retry. */ + ret = -EAGAIN; + goto teardown; + } + if (WARN_ON_ONCE(err)) { + pr_tdx_error(TDH_MNG_ADDCX, err); + ret = -EIO; + goto teardown; + } + } + + err = tdh_mng_init(&kvm_tdx->td, __pa(td_params), &rcx); + if ((err & TDX_SEAMCALL_STATUS_MASK) == TDX_OPERAND_INVALID) { + /* + * Because a user gives operands, don't warn. + * Return a hint to the user because it's sometimes hard for the + * user to figure out which operand is invalid. SEAMCALL status + * code includes which operand caused invalid operand error. + */ + *seamcall_err = err; + ret = -EINVAL; + goto teardown; + } else if (WARN_ON_ONCE(err)) { + pr_tdx_error_1(TDH_MNG_INIT, err, rcx); + ret = -EIO; + goto teardown; + } + + return 0; + + /* + * The sequence for freeing resources from a partially initialized TD + * varies based on where in the initialization flow failure occurred. + * Simply use the full teardown and destroy, which naturally play nice + * with partial initialization. + */ +teardown: + /* Only free pages not yet added, so start at 'i' */ + for (; i < kvm_tdx->td.tdcs_nr_pages; i++) { + if (tdcs_pages[i]) { + __free_page(tdcs_pages[i]); + tdcs_pages[i] = NULL; + } + } + if (!kvm_tdx->td.tdcs_pages) + kfree(tdcs_pages); + + tdx_mmu_release_hkid(kvm); + tdx_reclaim_td_control_pages(kvm); + + return ret; + +free_packages: + cpus_read_unlock(); + free_cpumask_var(packages); + +free_tdcs: + for (i = 0; i < kvm_tdx->td.tdcs_nr_pages; i++) { + if (tdcs_pages[i]) + __free_page(tdcs_pages[i]); + } + kfree(tdcs_pages); + kvm_tdx->td.tdcs_pages = NULL; + +free_tdr: + if (tdr_page) + __free_page(tdr_page); + kvm_tdx->td.tdr_page = 0; + +free_hkid: + tdx_hkid_free(kvm_tdx); + + return ret; +} + +static u64 tdx_td_metadata_field_read(struct kvm_tdx *tdx, u64 field_id, + u64 *data) +{ + u64 err; + + err = tdh_mng_rd(&tdx->td, field_id, data); + + return err; +} + +#define TDX_MD_UNREADABLE_LEAF_MASK GENMASK(30, 7) +#define TDX_MD_UNREADABLE_SUBLEAF_MASK GENMASK(31, 7) + +static int tdx_read_cpuid(struct kvm_vcpu *vcpu, u32 leaf, u32 sub_leaf, + bool sub_leaf_set, int *entry_index, + struct kvm_cpuid_entry2 *out) +{ + struct kvm_tdx *kvm_tdx = to_kvm_tdx(vcpu->kvm); + u64 field_id = TD_MD_FIELD_ID_CPUID_VALUES; + u64 ebx_eax, edx_ecx; + u64 err = 0; + + if (sub_leaf > 0b1111111) + return -EINVAL; + + if (*entry_index >= KVM_MAX_CPUID_ENTRIES) + return -EINVAL; + + if (leaf & TDX_MD_UNREADABLE_LEAF_MASK || + sub_leaf & TDX_MD_UNREADABLE_SUBLEAF_MASK) + return -EINVAL; + + /* + * bit 23:17, REVSERVED: reserved, must be 0; + * bit 16, LEAF_31: leaf number bit 31; + * bit 15:9, LEAF_6_0: leaf number bits 6:0, leaf bits 30:7 are + * implicitly 0; + * bit 8, SUBLEAF_NA: sub-leaf not applicable flag; + * bit 7:1, SUBLEAF_6_0: sub-leaf number bits 6:0. If SUBLEAF_NA is 1, + * the SUBLEAF_6_0 is all-1. + * sub-leaf bits 31:7 are implicitly 0; + * bit 0, ELEMENT_I: Element index within field; + */ + field_id |= ((leaf & 0x80000000) ? 1 : 0) << 16; + field_id |= (leaf & 0x7f) << 9; + if (sub_leaf_set) + field_id |= (sub_leaf & 0x7f) << 1; + else + field_id |= 0x1fe; + + err = tdx_td_metadata_field_read(kvm_tdx, field_id, &ebx_eax); + if (err) //TODO check for specific errors + goto err_out; + + out->eax = (u32) ebx_eax; + out->ebx = (u32) (ebx_eax >> 32); + + field_id++; + err = tdx_td_metadata_field_read(kvm_tdx, field_id, &edx_ecx); + /* + * It's weird that reading edx_ecx fails while reading ebx_eax + * succeeded. + */ + if (WARN_ON_ONCE(err)) + goto err_out; + + out->ecx = (u32) edx_ecx; + out->edx = (u32) (edx_ecx >> 32); + + out->function = leaf; + out->index = sub_leaf; + out->flags |= sub_leaf_set ? KVM_CPUID_FLAG_SIGNIFCANT_INDEX : 0; + + /* + * Work around missing support on old TDX modules, fetch + * guest maxpa from gfn_direct_bits. + */ + if (leaf == 0x80000008) { + gpa_t gpa_bits = gfn_to_gpa(kvm_gfn_direct_bits(vcpu->kvm)); + unsigned int g_maxpa = __ffs(gpa_bits) + 1; + + out->eax = tdx_set_guest_phys_addr_bits(out->eax, g_maxpa); + } + + (*entry_index)++; + + return 0; + +err_out: + out->eax = 0; + out->ebx = 0; + out->ecx = 0; + out->edx = 0; + + return -EIO; +} + +static int tdx_td_init(struct kvm *kvm, struct kvm_tdx_cmd *cmd) +{ + struct kvm_tdx *kvm_tdx = to_kvm_tdx(kvm); + struct kvm_tdx_init_vm *init_vm; + struct td_params *td_params = NULL; + int ret; + + BUILD_BUG_ON(sizeof(*init_vm) != 256 + sizeof_field(struct kvm_tdx_init_vm, cpuid)); + BUILD_BUG_ON(sizeof(struct td_params) != 1024); + + if (kvm_tdx->state != TD_STATE_UNINITIALIZED) + return -EINVAL; + + if (cmd->flags) + return -EINVAL; + + init_vm = kmalloc(sizeof(*init_vm) + + sizeof(init_vm->cpuid.entries[0]) * KVM_MAX_CPUID_ENTRIES, + GFP_KERNEL); + if (!init_vm) + return -ENOMEM; + + if (copy_from_user(init_vm, u64_to_user_ptr(cmd->data), sizeof(*init_vm))) { + ret = -EFAULT; + goto out; + } + + if (init_vm->cpuid.nent > KVM_MAX_CPUID_ENTRIES) { + ret = -E2BIG; + goto out; + } + + if (copy_from_user(init_vm->cpuid.entries, + u64_to_user_ptr(cmd->data) + sizeof(*init_vm), + flex_array_size(init_vm, cpuid.entries, init_vm->cpuid.nent))) { + ret = -EFAULT; + goto out; + } + + if (memchr_inv(init_vm->reserved, 0, sizeof(init_vm->reserved))) { + ret = -EINVAL; + goto out; + } + + if (init_vm->cpuid.padding) { + ret = -EINVAL; + goto out; + } + + td_params = kzalloc(sizeof(struct td_params), GFP_KERNEL); + if (!td_params) { + ret = -ENOMEM; + goto out; + } + + ret = setup_tdparams(kvm, td_params, init_vm); + if (ret) + goto out; + + ret = __tdx_td_init(kvm, td_params, &cmd->hw_error); + if (ret) + goto out; + + kvm_tdx->tsc_offset = td_tdcs_exec_read64(kvm_tdx, TD_TDCS_EXEC_TSC_OFFSET); + kvm_tdx->tsc_multiplier = td_tdcs_exec_read64(kvm_tdx, TD_TDCS_EXEC_TSC_MULTIPLIER); + kvm_tdx->attributes = td_params->attributes; + kvm_tdx->xfam = td_params->xfam; + + if (td_params->config_flags & TDX_CONFIG_FLAGS_MAX_GPAW) + kvm->arch.gfn_direct_bits = TDX_SHARED_BIT_PWL_5; + else + kvm->arch.gfn_direct_bits = TDX_SHARED_BIT_PWL_4; + + kvm_tdx->state = TD_STATE_INITIALIZED; +out: + /* kfree() accepts NULL. */ + kfree(init_vm); + kfree(td_params); + + return ret; +} + +void tdx_flush_tlb_current(struct kvm_vcpu *vcpu) +{ + /* + * flush_tlb_current() is invoked when the first time for the vcpu to + * run or when root of shared EPT is invalidated. + * KVM only needs to flush shared EPT because the TDX module handles TLB + * invalidation for private EPT in tdh_vp_enter(); + * + * A single context invalidation for shared EPT can be performed here. + * However, this single context invalidation requires the private EPTP + * rather than the shared EPTP to flush shared EPT, as shared EPT uses + * private EPTP as its ASID for TLB invalidation. + * + * To avoid reading back private EPTP, perform a global invalidation for + * shared EPT instead to keep this function simple. + */ + ept_sync_global(); +} + +void tdx_flush_tlb_all(struct kvm_vcpu *vcpu) +{ + /* + * TDX has called tdx_track() in tdx_sept_remove_private_spte() to + * ensure that private EPT will be flushed on the next TD enter. No need + * to call tdx_track() here again even when this callback is a result of + * zapping private EPT. + * + * Due to the lack of the context to determine which EPT has been + * affected by zapping, invoke invept() directly here for both shared + * EPT and private EPT for simplicity, though it's not necessary for + * private EPT. + */ + ept_sync_global(); +} + +static int tdx_td_finalize(struct kvm *kvm, struct kvm_tdx_cmd *cmd) +{ + struct kvm_tdx *kvm_tdx = to_kvm_tdx(kvm); + + guard(mutex)(&kvm->slots_lock); + + if (!is_hkid_assigned(kvm_tdx) || kvm_tdx->state == TD_STATE_RUNNABLE) + return -EINVAL; + /* + * Pages are pending for KVM_TDX_INIT_MEM_REGION to issue + * TDH.MEM.PAGE.ADD(). + */ + if (atomic64_read(&kvm_tdx->nr_premapped)) + return -EINVAL; + + cmd->hw_error = tdh_mr_finalize(&kvm_tdx->td); + if (tdx_operand_busy(cmd->hw_error)) + return -EBUSY; + if (KVM_BUG_ON(cmd->hw_error, kvm)) { + pr_tdx_error(TDH_MR_FINALIZE, cmd->hw_error); + return -EIO; + } + + kvm_tdx->state = TD_STATE_RUNNABLE; + /* TD_STATE_RUNNABLE must be set before 'pre_fault_allowed' */ + smp_wmb(); + kvm->arch.pre_fault_allowed = true; + return 0; +} + +int tdx_vm_ioctl(struct kvm *kvm, void __user *argp) +{ + struct kvm_tdx_cmd tdx_cmd; + int r; + + if (copy_from_user(&tdx_cmd, argp, sizeof(struct kvm_tdx_cmd))) + return -EFAULT; + + /* + * Userspace should never set hw_error. It is used to fill + * hardware-defined error by the kernel. + */ + if (tdx_cmd.hw_error) + return -EINVAL; + + mutex_lock(&kvm->lock); + + switch (tdx_cmd.id) { + case KVM_TDX_CAPABILITIES: + r = tdx_get_capabilities(&tdx_cmd); + break; + case KVM_TDX_INIT_VM: + r = tdx_td_init(kvm, &tdx_cmd); + break; + case KVM_TDX_FINALIZE_VM: + r = tdx_td_finalize(kvm, &tdx_cmd); + break; + default: + r = -EINVAL; + goto out; + } + + if (copy_to_user(argp, &tdx_cmd, sizeof(struct kvm_tdx_cmd))) + r = -EFAULT; + +out: + mutex_unlock(&kvm->lock); + return r; +} + +/* VMM can pass one 64bit auxiliary data to vcpu via RCX for guest BIOS. */ +static int tdx_td_vcpu_init(struct kvm_vcpu *vcpu, u64 vcpu_rcx) +{ + struct kvm_tdx *kvm_tdx = to_kvm_tdx(vcpu->kvm); + struct vcpu_tdx *tdx = to_tdx(vcpu); + struct page *page; + int ret, i; + u64 err; + + page = alloc_page(GFP_KERNEL); + if (!page) + return -ENOMEM; + tdx->vp.tdvpr_page = page; + + tdx->vp.tdcx_pages = kcalloc(kvm_tdx->td.tdcx_nr_pages, sizeof(*tdx->vp.tdcx_pages), + GFP_KERNEL); + if (!tdx->vp.tdcx_pages) { + ret = -ENOMEM; + goto free_tdvpr; + } + + for (i = 0; i < kvm_tdx->td.tdcx_nr_pages; i++) { + page = alloc_page(GFP_KERNEL); + if (!page) { + ret = -ENOMEM; + goto free_tdcx; + } + tdx->vp.tdcx_pages[i] = page; + } + + err = tdh_vp_create(&kvm_tdx->td, &tdx->vp); + if (KVM_BUG_ON(err, vcpu->kvm)) { + ret = -EIO; + pr_tdx_error(TDH_VP_CREATE, err); + goto free_tdcx; + } + + for (i = 0; i < kvm_tdx->td.tdcx_nr_pages; i++) { + err = tdh_vp_addcx(&tdx->vp, tdx->vp.tdcx_pages[i]); + if (KVM_BUG_ON(err, vcpu->kvm)) { + pr_tdx_error(TDH_VP_ADDCX, err); + /* + * Pages already added are reclaimed by the vcpu_free + * method, but the rest are freed here. + */ + for (; i < kvm_tdx->td.tdcx_nr_pages; i++) { + __free_page(tdx->vp.tdcx_pages[i]); + tdx->vp.tdcx_pages[i] = NULL; + } + return -EIO; + } + } + + err = tdh_vp_init(&tdx->vp, vcpu_rcx, vcpu->vcpu_id); + if (KVM_BUG_ON(err, vcpu->kvm)) { + pr_tdx_error(TDH_VP_INIT, err); + return -EIO; + } + + vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE; + + return 0; + +free_tdcx: + for (i = 0; i < kvm_tdx->td.tdcx_nr_pages; i++) { + if (tdx->vp.tdcx_pages[i]) + __free_page(tdx->vp.tdcx_pages[i]); + tdx->vp.tdcx_pages[i] = NULL; + } + kfree(tdx->vp.tdcx_pages); + tdx->vp.tdcx_pages = NULL; + +free_tdvpr: + if (tdx->vp.tdvpr_page) + __free_page(tdx->vp.tdvpr_page); + tdx->vp.tdvpr_page = 0; + + return ret; +} + +/* Sometimes reads multipple subleafs. Return how many enties were written. */ +static int tdx_vcpu_get_cpuid_leaf(struct kvm_vcpu *vcpu, u32 leaf, int *entry_index, + struct kvm_cpuid_entry2 *output_e) +{ + int sub_leaf = 0; + int ret; + + /* First try without a subleaf */ + ret = tdx_read_cpuid(vcpu, leaf, 0, false, entry_index, output_e); + + /* If success, or invalid leaf, just give up */ + if (ret != -EIO) + return ret; + + /* + * If the try without a subleaf failed, try reading subleafs until + * failure. The TDX module only supports 6 bits of subleaf index. + */ + while (1) { + /* Keep reading subleafs until there is a failure. */ + if (tdx_read_cpuid(vcpu, leaf, sub_leaf, true, entry_index, output_e)) + return !sub_leaf; + + sub_leaf++; + output_e++; + } + + return 0; +} + +static int tdx_vcpu_get_cpuid(struct kvm_vcpu *vcpu, struct kvm_tdx_cmd *cmd) +{ + struct kvm_cpuid2 __user *output, *td_cpuid; + int r = 0, i = 0, leaf; + u32 level; + + output = u64_to_user_ptr(cmd->data); + td_cpuid = kzalloc(sizeof(*td_cpuid) + + sizeof(output->entries[0]) * KVM_MAX_CPUID_ENTRIES, + GFP_KERNEL); + if (!td_cpuid) + return -ENOMEM; + + if (copy_from_user(td_cpuid, output, sizeof(*output))) { + r = -EFAULT; + goto out; + } + + /* Read max CPUID for normal range */ + if (tdx_vcpu_get_cpuid_leaf(vcpu, 0, &i, &td_cpuid->entries[i])) { + r = -EIO; + goto out; + } + level = td_cpuid->entries[0].eax; + + for (leaf = 1; leaf <= level; leaf++) + tdx_vcpu_get_cpuid_leaf(vcpu, leaf, &i, &td_cpuid->entries[i]); + + /* Read max CPUID for extended range */ + if (tdx_vcpu_get_cpuid_leaf(vcpu, 0x80000000, &i, &td_cpuid->entries[i])) { + r = -EIO; + goto out; + } + level = td_cpuid->entries[i - 1].eax; + + for (leaf = 0x80000001; leaf <= level; leaf++) + tdx_vcpu_get_cpuid_leaf(vcpu, leaf, &i, &td_cpuid->entries[i]); + + if (td_cpuid->nent < i) + r = -E2BIG; + td_cpuid->nent = i; + + if (copy_to_user(output, td_cpuid, sizeof(*output))) { + r = -EFAULT; + goto out; + } + + if (r == -E2BIG) + goto out; + + if (copy_to_user(output->entries, td_cpuid->entries, + td_cpuid->nent * sizeof(struct kvm_cpuid_entry2))) + r = -EFAULT; + +out: + kfree(td_cpuid); + + return r; +} + +static int tdx_vcpu_init(struct kvm_vcpu *vcpu, struct kvm_tdx_cmd *cmd) +{ + u64 apic_base; + struct vcpu_tdx *tdx = to_tdx(vcpu); + int ret; + + if (cmd->flags) + return -EINVAL; + + if (tdx->state != VCPU_TD_STATE_UNINITIALIZED) + return -EINVAL; + + /* + * TDX requires X2APIC, userspace is responsible for configuring guest + * CPUID accordingly. + */ + apic_base = APIC_DEFAULT_PHYS_BASE | LAPIC_MODE_X2APIC | + (kvm_vcpu_is_reset_bsp(vcpu) ? MSR_IA32_APICBASE_BSP : 0); + if (kvm_apic_set_base(vcpu, apic_base, true)) + return -EINVAL; + + ret = tdx_td_vcpu_init(vcpu, (u64)cmd->data); + if (ret) + return ret; + + td_vmcs_write16(tdx, POSTED_INTR_NV, POSTED_INTR_VECTOR); + td_vmcs_write64(tdx, POSTED_INTR_DESC_ADDR, __pa(&tdx->vt.pi_desc)); + td_vmcs_setbit32(tdx, PIN_BASED_VM_EXEC_CONTROL, PIN_BASED_POSTED_INTR); + + tdx->state = VCPU_TD_STATE_INITIALIZED; + + return 0; +} + +void tdx_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event) +{ + /* + * Yell on INIT, as TDX doesn't support INIT, i.e. KVM should drop all + * INIT events. + * + * Defer initializing vCPU for RESET state until KVM_TDX_INIT_VCPU, as + * userspace needs to define the vCPU model before KVM can initialize + * vCPU state, e.g. to enable x2APIC. + */ + WARN_ON_ONCE(init_event); +} + +struct tdx_gmem_post_populate_arg { + struct kvm_vcpu *vcpu; + __u32 flags; +}; + +static int tdx_gmem_post_populate(struct kvm *kvm, gfn_t gfn, kvm_pfn_t pfn, + void __user *src, int order, void *_arg) +{ + u64 error_code = PFERR_GUEST_FINAL_MASK | PFERR_PRIVATE_ACCESS; + struct kvm_tdx *kvm_tdx = to_kvm_tdx(kvm); + struct tdx_gmem_post_populate_arg *arg = _arg; + struct kvm_vcpu *vcpu = arg->vcpu; + gpa_t gpa = gfn_to_gpa(gfn); + u8 level = PG_LEVEL_4K; + struct page *src_page; + int ret, i; + u64 err, entry, level_state; + + /* + * Get the source page if it has been faulted in. Return failure if the + * source page has been swapped out or unmapped in primary memory. + */ + ret = get_user_pages_fast((unsigned long)src, 1, 0, &src_page); + if (ret < 0) + return ret; + if (ret != 1) + return -ENOMEM; + + ret = kvm_tdp_map_page(vcpu, gpa, error_code, &level); + if (ret < 0) + goto out; + + /* + * The private mem cannot be zapped after kvm_tdp_map_page() + * because all paths are covered by slots_lock and the + * filemap invalidate lock. Check that they are indeed enough. + */ + if (IS_ENABLED(CONFIG_KVM_PROVE_MMU)) { + scoped_guard(read_lock, &kvm->mmu_lock) { + if (KVM_BUG_ON(!kvm_tdp_mmu_gpa_is_mapped(vcpu, gpa), kvm)) { + ret = -EIO; + goto out; + } + } + } + + ret = 0; + err = tdh_mem_page_add(&kvm_tdx->td, gpa, pfn_to_page(pfn), + src_page, &entry, &level_state); + if (err) { + ret = unlikely(tdx_operand_busy(err)) ? -EBUSY : -EIO; + goto out; + } + + if (!KVM_BUG_ON(!atomic64_read(&kvm_tdx->nr_premapped), kvm)) + atomic64_dec(&kvm_tdx->nr_premapped); + + if (arg->flags & KVM_TDX_MEASURE_MEMORY_REGION) { + for (i = 0; i < PAGE_SIZE; i += TDX_EXTENDMR_CHUNKSIZE) { + err = tdh_mr_extend(&kvm_tdx->td, gpa + i, &entry, + &level_state); + if (err) { + ret = -EIO; + break; + } + } + } + +out: + put_page(src_page); + return ret; +} + +static int tdx_vcpu_init_mem_region(struct kvm_vcpu *vcpu, struct kvm_tdx_cmd *cmd) +{ + struct vcpu_tdx *tdx = to_tdx(vcpu); + struct kvm *kvm = vcpu->kvm; + struct kvm_tdx *kvm_tdx = to_kvm_tdx(kvm); + struct kvm_tdx_init_mem_region region; + struct tdx_gmem_post_populate_arg arg; + long gmem_ret; + int ret; + + if (tdx->state != VCPU_TD_STATE_INITIALIZED) + return -EINVAL; + + guard(mutex)(&kvm->slots_lock); + + /* Once TD is finalized, the initial guest memory is fixed. */ + if (kvm_tdx->state == TD_STATE_RUNNABLE) + return -EINVAL; + + if (cmd->flags & ~KVM_TDX_MEASURE_MEMORY_REGION) + return -EINVAL; + + if (copy_from_user(®ion, u64_to_user_ptr(cmd->data), sizeof(region))) + return -EFAULT; + + if (!PAGE_ALIGNED(region.source_addr) || !PAGE_ALIGNED(region.gpa) || + !region.nr_pages || + region.gpa + (region.nr_pages << PAGE_SHIFT) <= region.gpa || + !vt_is_tdx_private_gpa(kvm, region.gpa) || + !vt_is_tdx_private_gpa(kvm, region.gpa + (region.nr_pages << PAGE_SHIFT) - 1)) + return -EINVAL; + + kvm_mmu_reload(vcpu); + ret = 0; + while (region.nr_pages) { + if (signal_pending(current)) { + ret = -EINTR; + break; + } + + arg = (struct tdx_gmem_post_populate_arg) { + .vcpu = vcpu, + .flags = cmd->flags, + }; + gmem_ret = kvm_gmem_populate(kvm, gpa_to_gfn(region.gpa), + u64_to_user_ptr(region.source_addr), + 1, tdx_gmem_post_populate, &arg); + if (gmem_ret < 0) { + ret = gmem_ret; + break; + } + + if (gmem_ret != 1) { + ret = -EIO; + break; + } + + region.source_addr += PAGE_SIZE; + region.gpa += PAGE_SIZE; + region.nr_pages--; + + cond_resched(); + } + + if (copy_to_user(u64_to_user_ptr(cmd->data), ®ion, sizeof(region))) + ret = -EFAULT; + return ret; +} + +int tdx_vcpu_ioctl(struct kvm_vcpu *vcpu, void __user *argp) +{ + struct kvm_tdx *kvm_tdx = to_kvm_tdx(vcpu->kvm); + struct kvm_tdx_cmd cmd; + int ret; + + if (!is_hkid_assigned(kvm_tdx) || kvm_tdx->state == TD_STATE_RUNNABLE) + return -EINVAL; + + if (copy_from_user(&cmd, argp, sizeof(cmd))) + return -EFAULT; + + if (cmd.hw_error) + return -EINVAL; + + switch (cmd.id) { + case KVM_TDX_INIT_VCPU: + ret = tdx_vcpu_init(vcpu, &cmd); + break; + case KVM_TDX_INIT_MEM_REGION: + ret = tdx_vcpu_init_mem_region(vcpu, &cmd); + break; + case KVM_TDX_GET_CPUID: + ret = tdx_vcpu_get_cpuid(vcpu, &cmd); + break; + default: + ret = -EINVAL; + break; + } + + return ret; +} + +int tdx_gmem_private_max_mapping_level(struct kvm *kvm, kvm_pfn_t pfn) +{ + return PG_LEVEL_4K; +} + +static int tdx_online_cpu(unsigned int cpu) +{ + unsigned long flags; + int r; + + /* Sanity check CPU is already in post-VMXON */ + WARN_ON_ONCE(!(cr4_read_shadow() & X86_CR4_VMXE)); + + local_irq_save(flags); + r = tdx_cpu_enable(); + local_irq_restore(flags); + + return r; +} + +static int tdx_offline_cpu(unsigned int cpu) +{ + int i; + + /* No TD is running. Allow any cpu to be offline. */ + if (!atomic_read(&nr_configured_hkid)) + return 0; + + /* + * In order to reclaim TDX HKID, (i.e. when deleting guest TD), need to + * call TDH.PHYMEM.PAGE.WBINVD on all packages to program all memory + * controller with pconfig. If we have active TDX HKID, refuse to + * offline the last online cpu. + */ + for_each_online_cpu(i) { + /* + * Found another online cpu on the same package. + * Allow to offline. + */ + if (i != cpu && topology_physical_package_id(i) == + topology_physical_package_id(cpu)) + return 0; + } + + /* + * This is the last cpu of this package. Don't offline it. + * + * Because it's hard for human operator to understand the + * reason, warn it. + */ +#define MSG_ALLPKG_ONLINE \ + "TDX requires all packages to have an online CPU. Delete all TDs in order to offline all CPUs of a package.\n" + pr_warn_ratelimited(MSG_ALLPKG_ONLINE); + return -EBUSY; +} + +static void __do_tdx_cleanup(void) +{ + /* + * Once TDX module is initialized, it cannot be disabled and + * re-initialized again w/o runtime update (which isn't + * supported by kernel). Only need to remove the cpuhp here. + * The TDX host core code tracks TDX status and can handle + * 'multiple enabling' scenario. + */ + WARN_ON_ONCE(!tdx_cpuhp_state); + cpuhp_remove_state_nocalls_cpuslocked(tdx_cpuhp_state); + tdx_cpuhp_state = 0; +} + +static void __tdx_cleanup(void) +{ + cpus_read_lock(); + __do_tdx_cleanup(); + cpus_read_unlock(); +} + +static int __init __do_tdx_bringup(void) +{ + int r; + + /* + * TDX-specific cpuhp callback to call tdx_cpu_enable() on all + * online CPUs before calling tdx_enable(), and on any new + * going-online CPU to make sure it is ready for TDX guest. + */ + r = cpuhp_setup_state_cpuslocked(CPUHP_AP_ONLINE_DYN, + "kvm/cpu/tdx:online", + tdx_online_cpu, tdx_offline_cpu); + if (r < 0) + return r; + + tdx_cpuhp_state = r; + + r = tdx_enable(); + if (r) + __do_tdx_cleanup(); + + return r; +} + +static int __init __tdx_bringup(void) +{ + const struct tdx_sys_info_td_conf *td_conf; + int r, i; + + for (i = 0; i < ARRAY_SIZE(tdx_uret_msrs); i++) { + /* + * Check if MSRs (tdx_uret_msrs) can be saved/restored + * before returning to user space. + * + * this_cpu_ptr(user_return_msrs)->registered isn't checked + * because the registration is done at vcpu runtime by + * tdx_user_return_msr_update_cache(). + */ + tdx_uret_msrs[i].slot = kvm_find_user_return_msr(tdx_uret_msrs[i].msr); + if (tdx_uret_msrs[i].slot == -1) { + /* If any MSR isn't supported, it is a KVM bug */ + pr_err("MSR %x isn't included by kvm_find_user_return_msr\n", + tdx_uret_msrs[i].msr); + return -EIO; + } + } + + /* + * Enabling TDX requires enabling hardware virtualization first, + * as making SEAMCALLs requires CPU being in post-VMXON state. + */ + r = kvm_enable_virtualization(); + if (r) + return r; + + cpus_read_lock(); + r = __do_tdx_bringup(); + cpus_read_unlock(); + + if (r) + goto tdx_bringup_err; + + /* Get TDX global information for later use */ + tdx_sysinfo = tdx_get_sysinfo(); + if (WARN_ON_ONCE(!tdx_sysinfo)) { + r = -EINVAL; + goto get_sysinfo_err; + } + + /* Check TDX module and KVM capabilities */ + if (!tdx_get_supported_attrs(&tdx_sysinfo->td_conf) || + !tdx_get_supported_xfam(&tdx_sysinfo->td_conf)) + goto get_sysinfo_err; + + if (!(tdx_sysinfo->features.tdx_features0 & MD_FIELD_ID_FEATURES0_TOPOLOGY_ENUM)) + goto get_sysinfo_err; + + /* + * TDX has its own limit of maximum vCPUs it can support for all + * TDX guests in addition to KVM_MAX_VCPUS. Userspace needs to + * query TDX guest's maximum vCPUs by checking KVM_CAP_MAX_VCPU + * extension on per-VM basis. + * + * TDX module reports such limit via the MAX_VCPU_PER_TD global + * metadata. Different modules may report different values. + * Some old module may also not support this metadata (in which + * case this limit is U16_MAX). + * + * In practice, the reported value reflects the maximum logical + * CPUs that ALL the platforms that the module supports can + * possibly have. + * + * Simply forwarding the MAX_VCPU_PER_TD to userspace could + * result in an unpredictable ABI. KVM instead always advertise + * the number of logical CPUs the platform has as the maximum + * vCPUs for TDX guests. + * + * Make sure MAX_VCPU_PER_TD reported by TDX module is not + * smaller than the number of logical CPUs, otherwise KVM will + * report an unsupported value to userspace. + * + * Note, a platform with TDX enabled in the BIOS cannot support + * physical CPU hotplug, and TDX requires the BIOS has marked + * all logical CPUs in MADT table as enabled. Just use + * num_present_cpus() for the number of logical CPUs. + */ + td_conf = &tdx_sysinfo->td_conf; + if (td_conf->max_vcpus_per_td < num_present_cpus()) { + pr_err("Disable TDX: MAX_VCPU_PER_TD (%u) smaller than number of logical CPUs (%u).\n", + td_conf->max_vcpus_per_td, num_present_cpus()); + r = -EINVAL; + goto get_sysinfo_err; + } + + if (misc_cg_set_capacity(MISC_CG_RES_TDX, tdx_get_nr_guest_keyids())) { + r = -EINVAL; + goto get_sysinfo_err; + } + + /* + * Leave hardware virtualization enabled after TDX is enabled + * successfully. TDX CPU hotplug depends on this. + */ + return 0; + +get_sysinfo_err: + __tdx_cleanup(); +tdx_bringup_err: + kvm_disable_virtualization(); + return r; +} + +void tdx_cleanup(void) +{ + if (enable_tdx) { + misc_cg_set_capacity(MISC_CG_RES_TDX, 0); + __tdx_cleanup(); + kvm_disable_virtualization(); + } +} + +int __init tdx_bringup(void) +{ + int r, i; + + /* tdx_disable_virtualization_cpu() uses associated_tdvcpus. */ + for_each_possible_cpu(i) + INIT_LIST_HEAD(&per_cpu(associated_tdvcpus, i)); + + if (!enable_tdx) + return 0; + + if (!enable_ept) { + pr_err("EPT is required for TDX\n"); + goto success_disable_tdx; + } + + if (!tdp_mmu_enabled || !enable_mmio_caching || !enable_ept_ad_bits) { + pr_err("TDP MMU and MMIO caching and EPT A/D bit is required for TDX\n"); + goto success_disable_tdx; + } + + if (!enable_apicv) { + pr_err("APICv is required for TDX\n"); + goto success_disable_tdx; + } + + if (!cpu_feature_enabled(X86_FEATURE_OSXSAVE)) { + pr_err("tdx: OSXSAVE is required for TDX\n"); + goto success_disable_tdx; + } + + if (!cpu_feature_enabled(X86_FEATURE_MOVDIR64B)) { + pr_err("tdx: MOVDIR64B is required for TDX\n"); + goto success_disable_tdx; + } + + if (!cpu_feature_enabled(X86_FEATURE_SELFSNOOP)) { + pr_err("Self-snoop is required for TDX\n"); + goto success_disable_tdx; + } + + if (!cpu_feature_enabled(X86_FEATURE_TDX_HOST_PLATFORM)) { + pr_err("tdx: no TDX private KeyIDs available\n"); + goto success_disable_tdx; + } + + if (!enable_virt_at_load) { + pr_err("tdx: tdx requires kvm.enable_virt_at_load=1\n"); + goto success_disable_tdx; + } + + /* + * Ideally KVM should probe whether TDX module has been loaded + * first and then try to bring it up. But TDX needs to use SEAMCALL + * to probe whether the module is loaded (there is no CPUID or MSR + * for that), and making SEAMCALL requires enabling virtualization + * first, just like the rest steps of bringing up TDX module. + * + * So, for simplicity do everything in __tdx_bringup(); the first + * SEAMCALL will return -ENODEV when the module is not loaded. The + * only complication is having to make sure that initialization + * SEAMCALLs don't return TDX_SEAMCALL_VMFAILINVALID in other + * cases. + */ + r = __tdx_bringup(); + if (r) { + /* + * Disable TDX only but don't fail to load module if + * the TDX module could not be loaded. No need to print + * message saying "module is not loaded" because it was + * printed when the first SEAMCALL failed. + */ + if (r == -ENODEV) + goto success_disable_tdx; + + enable_tdx = 0; + } + + return r; + +success_disable_tdx: + enable_tdx = 0; + return 0; +} diff --git a/arch/x86/kvm/vmx/tdx.h b/arch/x86/kvm/vmx/tdx.h new file mode 100644 index 000000000000..51f98443e8a2 --- /dev/null +++ b/arch/x86/kvm/vmx/tdx.h @@ -0,0 +1,204 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef __KVM_X86_VMX_TDX_H +#define __KVM_X86_VMX_TDX_H + +#include "tdx_arch.h" +#include "tdx_errno.h" + +#ifdef CONFIG_KVM_INTEL_TDX +#include "common.h" + +int tdx_bringup(void); +void tdx_cleanup(void); + +extern bool enable_tdx; + +/* TDX module hardware states. These follow the TDX module OP_STATEs. */ +enum kvm_tdx_state { + TD_STATE_UNINITIALIZED = 0, + TD_STATE_INITIALIZED, + TD_STATE_RUNNABLE, +}; + +struct kvm_tdx { + struct kvm kvm; + + struct misc_cg *misc_cg; + int hkid; + enum kvm_tdx_state state; + + u64 attributes; + u64 xfam; + + u64 tsc_offset; + u64 tsc_multiplier; + + struct tdx_td td; + + /* For KVM_TDX_INIT_MEM_REGION. */ + atomic64_t nr_premapped; + + /* + * Prevent vCPUs from TD entry to ensure SEPT zap related SEAMCALLs do + * not contend with tdh_vp_enter() and TDCALLs. + * Set/unset is protected with kvm->mmu_lock. + */ + bool wait_for_sept_zap; +}; + +/* TDX module vCPU states */ +enum vcpu_tdx_state { + VCPU_TD_STATE_UNINITIALIZED = 0, + VCPU_TD_STATE_INITIALIZED, +}; + +struct vcpu_tdx { + struct kvm_vcpu vcpu; + struct vcpu_vt vt; + u64 ext_exit_qualification; + gpa_t exit_gpa; + struct tdx_module_args vp_enter_args; + + struct tdx_vp vp; + + struct list_head cpu_list; + + u64 vp_enter_ret; + + enum vcpu_tdx_state state; + bool guest_entered; + + u64 map_gpa_next; + u64 map_gpa_end; +}; + +void tdh_vp_rd_failed(struct vcpu_tdx *tdx, char *uclass, u32 field, u64 err); +void tdh_vp_wr_failed(struct vcpu_tdx *tdx, char *uclass, char *op, u32 field, + u64 val, u64 err); + +static __always_inline u64 td_tdcs_exec_read64(struct kvm_tdx *kvm_tdx, u32 field) +{ + u64 err, data; + + err = tdh_mng_rd(&kvm_tdx->td, TDCS_EXEC(field), &data); + if (unlikely(err)) { + pr_err("TDH_MNG_RD[EXEC.0x%x] failed: 0x%llx\n", field, err); + return 0; + } + return data; +} + +static __always_inline void tdvps_vmcs_check(u32 field, u8 bits) +{ +#define VMCS_ENC_ACCESS_TYPE_MASK 0x1UL +#define VMCS_ENC_ACCESS_TYPE_FULL 0x0UL +#define VMCS_ENC_ACCESS_TYPE_HIGH 0x1UL +#define VMCS_ENC_ACCESS_TYPE(field) ((field) & VMCS_ENC_ACCESS_TYPE_MASK) + + /* TDX is 64bit only. HIGH field isn't supported. */ + BUILD_BUG_ON_MSG(__builtin_constant_p(field) && + VMCS_ENC_ACCESS_TYPE(field) == VMCS_ENC_ACCESS_TYPE_HIGH, + "Read/Write to TD VMCS *_HIGH fields not supported"); + + BUILD_BUG_ON(bits != 16 && bits != 32 && bits != 64); + +#define VMCS_ENC_WIDTH_MASK GENMASK(14, 13) +#define VMCS_ENC_WIDTH_16BIT (0UL << 13) +#define VMCS_ENC_WIDTH_64BIT (1UL << 13) +#define VMCS_ENC_WIDTH_32BIT (2UL << 13) +#define VMCS_ENC_WIDTH_NATURAL (3UL << 13) +#define VMCS_ENC_WIDTH(field) ((field) & VMCS_ENC_WIDTH_MASK) + + /* TDX is 64bit only. i.e. natural width = 64bit. */ + BUILD_BUG_ON_MSG(bits != 64 && __builtin_constant_p(field) && + (VMCS_ENC_WIDTH(field) == VMCS_ENC_WIDTH_64BIT || + VMCS_ENC_WIDTH(field) == VMCS_ENC_WIDTH_NATURAL), + "Invalid TD VMCS access for 64-bit field"); + BUILD_BUG_ON_MSG(bits != 32 && __builtin_constant_p(field) && + VMCS_ENC_WIDTH(field) == VMCS_ENC_WIDTH_32BIT, + "Invalid TD VMCS access for 32-bit field"); + BUILD_BUG_ON_MSG(bits != 16 && __builtin_constant_p(field) && + VMCS_ENC_WIDTH(field) == VMCS_ENC_WIDTH_16BIT, + "Invalid TD VMCS access for 16-bit field"); +} + +static __always_inline void tdvps_management_check(u64 field, u8 bits) {} +static __always_inline void tdvps_state_non_arch_check(u64 field, u8 bits) {} + +#define TDX_BUILD_TDVPS_ACCESSORS(bits, uclass, lclass) \ +static __always_inline u##bits td_##lclass##_read##bits(struct vcpu_tdx *tdx, \ + u32 field) \ +{ \ + u64 err, data; \ + \ + tdvps_##lclass##_check(field, bits); \ + err = tdh_vp_rd(&tdx->vp, TDVPS_##uclass(field), &data); \ + if (unlikely(err)) { \ + tdh_vp_rd_failed(tdx, #uclass, field, err); \ + return 0; \ + } \ + return (u##bits)data; \ +} \ +static __always_inline void td_##lclass##_write##bits(struct vcpu_tdx *tdx, \ + u32 field, u##bits val) \ +{ \ + u64 err; \ + \ + tdvps_##lclass##_check(field, bits); \ + err = tdh_vp_wr(&tdx->vp, TDVPS_##uclass(field), val, \ + GENMASK_ULL(bits - 1, 0)); \ + if (unlikely(err)) \ + tdh_vp_wr_failed(tdx, #uclass, " = ", field, (u64)val, err); \ +} \ +static __always_inline void td_##lclass##_setbit##bits(struct vcpu_tdx *tdx, \ + u32 field, u64 bit) \ +{ \ + u64 err; \ + \ + tdvps_##lclass##_check(field, bits); \ + err = tdh_vp_wr(&tdx->vp, TDVPS_##uclass(field), bit, bit); \ + if (unlikely(err)) \ + tdh_vp_wr_failed(tdx, #uclass, " |= ", field, bit, err); \ +} \ +static __always_inline void td_##lclass##_clearbit##bits(struct vcpu_tdx *tdx, \ + u32 field, u64 bit) \ +{ \ + u64 err; \ + \ + tdvps_##lclass##_check(field, bits); \ + err = tdh_vp_wr(&tdx->vp, TDVPS_##uclass(field), 0, bit); \ + if (unlikely(err)) \ + tdh_vp_wr_failed(tdx, #uclass, " &= ~", field, bit, err);\ +} + + +bool tdx_interrupt_allowed(struct kvm_vcpu *vcpu); +int tdx_complete_emulated_msr(struct kvm_vcpu *vcpu, int err); + +TDX_BUILD_TDVPS_ACCESSORS(16, VMCS, vmcs); +TDX_BUILD_TDVPS_ACCESSORS(32, VMCS, vmcs); +TDX_BUILD_TDVPS_ACCESSORS(64, VMCS, vmcs); + +TDX_BUILD_TDVPS_ACCESSORS(8, MANAGEMENT, management); +TDX_BUILD_TDVPS_ACCESSORS(64, STATE_NON_ARCH, state_non_arch); + +#else +static inline int tdx_bringup(void) { return 0; } +static inline void tdx_cleanup(void) {} + +#define enable_tdx 0 + +struct kvm_tdx { + struct kvm kvm; +}; + +struct vcpu_tdx { + struct kvm_vcpu vcpu; +}; + +static inline bool tdx_interrupt_allowed(struct kvm_vcpu *vcpu) { return false; } +static inline int tdx_complete_emulated_msr(struct kvm_vcpu *vcpu, int err) { return 0; } + +#endif + +#endif diff --git a/arch/x86/kvm/vmx/tdx_arch.h b/arch/x86/kvm/vmx/tdx_arch.h new file mode 100644 index 000000000000..a30e880849e3 --- /dev/null +++ b/arch/x86/kvm/vmx/tdx_arch.h @@ -0,0 +1,167 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* architectural constants/data definitions for TDX SEAMCALLs */ + +#ifndef __KVM_X86_TDX_ARCH_H +#define __KVM_X86_TDX_ARCH_H + +#include <linux/types.h> + +/* TDX control structure (TDR/TDCS/TDVPS) field access codes */ +#define TDX_NON_ARCH BIT_ULL(63) +#define TDX_CLASS_SHIFT 56 +#define TDX_FIELD_MASK GENMASK_ULL(31, 0) + +#define __BUILD_TDX_FIELD(non_arch, class, field) \ + (((non_arch) ? TDX_NON_ARCH : 0) | \ + ((u64)(class) << TDX_CLASS_SHIFT) | \ + ((u64)(field) & TDX_FIELD_MASK)) + +#define BUILD_TDX_FIELD(class, field) \ + __BUILD_TDX_FIELD(false, (class), (field)) + +#define BUILD_TDX_FIELD_NON_ARCH(class, field) \ + __BUILD_TDX_FIELD(true, (class), (field)) + + +/* Class code for TD */ +#define TD_CLASS_EXECUTION_CONTROLS 17ULL + +/* Class code for TDVPS */ +#define TDVPS_CLASS_VMCS 0ULL +#define TDVPS_CLASS_GUEST_GPR 16ULL +#define TDVPS_CLASS_OTHER_GUEST 17ULL +#define TDVPS_CLASS_MANAGEMENT 32ULL + +enum tdx_tdcs_execution_control { + TD_TDCS_EXEC_TSC_OFFSET = 10, + TD_TDCS_EXEC_TSC_MULTIPLIER = 11, +}; + +enum tdx_vcpu_guest_other_state { + TD_VCPU_STATE_DETAILS_NON_ARCH = 0x100, +}; + +#define TDX_VCPU_STATE_DETAILS_INTR_PENDING BIT_ULL(0) + +static inline bool tdx_vcpu_state_details_intr_pending(u64 vcpu_state_details) +{ + return !!(vcpu_state_details & TDX_VCPU_STATE_DETAILS_INTR_PENDING); +} + +/* @field is any of enum tdx_tdcs_execution_control */ +#define TDCS_EXEC(field) BUILD_TDX_FIELD(TD_CLASS_EXECUTION_CONTROLS, (field)) + +/* @field is the VMCS field encoding */ +#define TDVPS_VMCS(field) BUILD_TDX_FIELD(TDVPS_CLASS_VMCS, (field)) + +/* @field is any of enum tdx_guest_other_state */ +#define TDVPS_STATE(field) BUILD_TDX_FIELD(TDVPS_CLASS_OTHER_GUEST, (field)) +#define TDVPS_STATE_NON_ARCH(field) BUILD_TDX_FIELD_NON_ARCH(TDVPS_CLASS_OTHER_GUEST, (field)) + +/* Management class fields */ +enum tdx_vcpu_guest_management { + TD_VCPU_PEND_NMI = 11, +}; + +/* @field is any of enum tdx_vcpu_guest_management */ +#define TDVPS_MANAGEMENT(field) BUILD_TDX_FIELD(TDVPS_CLASS_MANAGEMENT, (field)) + +#define TDX_EXTENDMR_CHUNKSIZE 256 + +struct tdx_cpuid_value { + u32 eax; + u32 ebx; + u32 ecx; + u32 edx; +} __packed; + +#define TDX_TD_ATTR_DEBUG BIT_ULL(0) +#define TDX_TD_ATTR_SEPT_VE_DISABLE BIT_ULL(28) +#define TDX_TD_ATTR_PKS BIT_ULL(30) +#define TDX_TD_ATTR_KL BIT_ULL(31) +#define TDX_TD_ATTR_PERFMON BIT_ULL(63) + +#define TDX_EXT_EXIT_QUAL_TYPE_MASK GENMASK(3, 0) +#define TDX_EXT_EXIT_QUAL_TYPE_PENDING_EPT_VIOLATION 6 +/* + * TD_PARAMS is provided as an input to TDH_MNG_INIT, the size of which is 1024B. + */ +struct td_params { + u64 attributes; + u64 xfam; + u16 max_vcpus; + u8 reserved0[6]; + + u64 eptp_controls; + u64 config_flags; + u16 tsc_frequency; + u8 reserved1[38]; + + u64 mrconfigid[6]; + u64 mrowner[6]; + u64 mrownerconfig[6]; + u64 reserved2[4]; + + union { + DECLARE_FLEX_ARRAY(struct tdx_cpuid_value, cpuid_values); + u8 reserved3[768]; + }; +} __packed __aligned(1024); + +/* + * Guest uses MAX_PA for GPAW when set. + * 0: GPA.SHARED bit is GPA[47] + * 1: GPA.SHARED bit is GPA[51] + */ +#define TDX_CONFIG_FLAGS_MAX_GPAW BIT_ULL(0) + +/* + * TDH.VP.ENTER, TDG.VP.VMCALL preserves RBP + * 0: RBP can be used for TDG.VP.VMCALL input. RBP is clobbered. + * 1: RBP can't be used for TDG.VP.VMCALL input. RBP is preserved. + */ +#define TDX_CONFIG_FLAGS_NO_RBP_MOD BIT_ULL(2) + + +/* + * TDX requires the frequency to be defined in units of 25MHz, which is the + * frequency of the core crystal clock on TDX-capable platforms, i.e. the TDX + * module can only program frequencies that are multiples of 25MHz. The + * frequency must be between 100mhz and 10ghz (inclusive). + */ +#define TDX_TSC_KHZ_TO_25MHZ(tsc_in_khz) ((tsc_in_khz) / (25 * 1000)) +#define TDX_TSC_25MHZ_TO_KHZ(tsc_in_25mhz) ((tsc_in_25mhz) * (25 * 1000)) +#define TDX_MIN_TSC_FREQUENCY_KHZ (100 * 1000) +#define TDX_MAX_TSC_FREQUENCY_KHZ (10 * 1000 * 1000) + +/* Additional Secure EPT entry information */ +#define TDX_SEPT_LEVEL_MASK GENMASK_ULL(2, 0) +#define TDX_SEPT_STATE_MASK GENMASK_ULL(15, 8) +#define TDX_SEPT_STATE_SHIFT 8 + +enum tdx_sept_entry_state { + TDX_SEPT_FREE = 0, + TDX_SEPT_BLOCKED = 1, + TDX_SEPT_PENDING = 2, + TDX_SEPT_PENDING_BLOCKED = 3, + TDX_SEPT_PRESENT = 4, +}; + +static inline u8 tdx_get_sept_level(u64 sept_entry_info) +{ + return sept_entry_info & TDX_SEPT_LEVEL_MASK; +} + +static inline u8 tdx_get_sept_state(u64 sept_entry_info) +{ + return (sept_entry_info & TDX_SEPT_STATE_MASK) >> TDX_SEPT_STATE_SHIFT; +} + +#define MD_FIELD_ID_FEATURES0_TOPOLOGY_ENUM BIT_ULL(20) + +/* + * TD scope metadata field ID. + */ +#define TD_MD_FIELD_ID_CPUID_VALUES 0x9410000300000000ULL + +#endif /* __KVM_X86_TDX_ARCH_H */ diff --git a/arch/x86/kvm/vmx/tdx_errno.h b/arch/x86/kvm/vmx/tdx_errno.h new file mode 100644 index 000000000000..6ff4672c4181 --- /dev/null +++ b/arch/x86/kvm/vmx/tdx_errno.h @@ -0,0 +1,40 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* architectural status code for SEAMCALL */ + +#ifndef __KVM_X86_TDX_ERRNO_H +#define __KVM_X86_TDX_ERRNO_H + +#define TDX_SEAMCALL_STATUS_MASK 0xFFFFFFFF00000000ULL + +/* + * TDX SEAMCALL Status Codes (returned in RAX) + */ +#define TDX_NON_RECOVERABLE_VCPU 0x4000000100000000ULL +#define TDX_NON_RECOVERABLE_TD 0x4000000200000000ULL +#define TDX_NON_RECOVERABLE_TD_NON_ACCESSIBLE 0x6000000500000000ULL +#define TDX_NON_RECOVERABLE_TD_WRONG_APIC_MODE 0x6000000700000000ULL +#define TDX_INTERRUPTED_RESUMABLE 0x8000000300000000ULL +#define TDX_OPERAND_INVALID 0xC000010000000000ULL +#define TDX_OPERAND_BUSY 0x8000020000000000ULL +#define TDX_PREVIOUS_TLB_EPOCH_BUSY 0x8000020100000000ULL +#define TDX_PAGE_METADATA_INCORRECT 0xC000030000000000ULL +#define TDX_VCPU_NOT_ASSOCIATED 0x8000070200000000ULL +#define TDX_KEY_GENERATION_FAILED 0x8000080000000000ULL +#define TDX_KEY_STATE_INCORRECT 0xC000081100000000ULL +#define TDX_KEY_CONFIGURED 0x0000081500000000ULL +#define TDX_NO_HKID_READY_TO_WBCACHE 0x0000082100000000ULL +#define TDX_FLUSHVP_NOT_DONE 0x8000082400000000ULL +#define TDX_EPT_WALK_FAILED 0xC0000B0000000000ULL +#define TDX_EPT_ENTRY_STATE_INCORRECT 0xC0000B0D00000000ULL +#define TDX_METADATA_FIELD_NOT_READABLE 0xC0000C0200000000ULL + +/* + * TDX module operand ID, appears in 31:0 part of error code as + * detail information + */ +#define TDX_OPERAND_ID_RCX 0x01 +#define TDX_OPERAND_ID_TDR 0x80 +#define TDX_OPERAND_ID_SEPT 0x92 +#define TDX_OPERAND_ID_TD_EPOCH 0xa9 + +#endif /* __KVM_X86_TDX_ERRNO_H */ diff --git a/arch/x86/kvm/vmx/vmcs.h b/arch/x86/kvm/vmx/vmcs.h index 7c1996b433e2..b25625314658 100644 --- a/arch/x86/kvm/vmx/vmcs.h +++ b/arch/x86/kvm/vmx/vmcs.h @@ -140,6 +140,11 @@ static inline bool is_nm_fault(u32 intr_info) return is_exception_n(intr_info, NM_VECTOR); } +static inline bool is_ve_fault(u32 intr_info) +{ + return is_exception_n(intr_info, VE_VECTOR); +} + /* Undocumented: icebp/int1 */ static inline bool is_icebp(u32 intr_info) { diff --git a/arch/x86/kvm/vmx/vmcs12.h b/arch/x86/kvm/vmx/vmcs12.h index 01936013428b..56fd150a6f24 100644 --- a/arch/x86/kvm/vmx/vmcs12.h +++ b/arch/x86/kvm/vmx/vmcs12.h @@ -188,12 +188,13 @@ struct __packed vmcs12 { }; /* - * VMCS12_REVISION is an arbitrary id that should be changed if the content or - * layout of struct vmcs12 is changed. MSR_IA32_VMX_BASIC returns this id, and - * VMPTRLD verifies that the VMCS region that L1 is loading contains this id. + * VMCS12_REVISION is KVM's arbitrary ID for the layout of struct vmcs12. KVM + * enumerates this value to L1 via MSR_IA32_VMX_BASIC, and checks the revision + * ID during nested VMPTRLD to verify that L1 is loading a VMCS that adhere's + * to KVM's virtual CPU definition. * - * IMPORTANT: Changing this value will break save/restore compatibility with - * older kvm releases. + * DO NOT change this value, as it will break save/restore compatibility with + * older KVM releases. */ #define VMCS12_REVISION 0x11e57ed0 @@ -206,7 +207,8 @@ struct __packed vmcs12 { #define VMCS12_SIZE KVM_STATE_NESTED_VMX_VMCS_SIZE /* - * For save/restore compatibility, the vmcs12 field offsets must not change. + * For save/restore compatibility, the vmcs12 field offsets must not change, + * although appending fields and/or filling gaps is obviously allowed. */ #define CHECK_OFFSET(field, loc) \ ASSERT_STRUCT_OFFSET(struct vmcs12, field, loc) diff --git a/arch/x86/kvm/vmx/vmenter.S b/arch/x86/kvm/vmx/vmenter.S index 2bfbf758d061..0a6cf5bff2aa 100644 --- a/arch/x86/kvm/vmx/vmenter.S +++ b/arch/x86/kvm/vmx/vmenter.S @@ -59,8 +59,7 @@ * without the explicit restore, thinks the stack is getting walloped. * Using an unwind hint is problematic due to x86-64's dynamic alignment. */ - mov %_ASM_BP, %_ASM_SP - pop %_ASM_BP + leave RET .endm @@ -275,6 +274,8 @@ SYM_INNER_LABEL_ALIGN(vmx_vmexit, SYM_L_GLOBAL) call vmx_spec_ctrl_restore_host + CLEAR_BRANCH_HISTORY_VMEXIT + /* Put return value in AX */ mov %_ASM_BX, %_ASM_AX diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c index c37a89eda90f..4953846cb30d 100644 --- a/arch/x86/kvm/vmx/vmx.c +++ b/arch/x86/kvm/vmx/vmx.c @@ -46,6 +46,7 @@ #include <asm/perf_event.h> #include <asm/mmu_context.h> #include <asm/mshyperv.h> +#include <asm/msr.h> #include <asm/mwait.h> #include <asm/spec-ctrl.h> #include <asm/vmx.h> @@ -53,6 +54,7 @@ #include <trace/events/ipi.h> #include "capabilities.h" +#include "common.h" #include "cpuid.h" #include "hyperv.h" #include "kvm_onhyperv.h" @@ -68,10 +70,13 @@ #include "vmcs12.h" #include "vmx.h" #include "x86.h" +#include "x86_ops.h" #include "smm.h" #include "vmx_onhyperv.h" +#include "posted_intr.h" MODULE_AUTHOR("Qumranet"); +MODULE_DESCRIPTION("KVM support for VMX (Intel VT-x) extensions"); MODULE_LICENSE("GPL"); #ifdef MODULE @@ -112,6 +117,8 @@ module_param(enable_apicv, bool, 0444); bool __read_mostly enable_ipiv = true; module_param(enable_ipiv, bool, 0444); +module_param(enable_device_posted_irqs, bool, 0444); + /* * If nested=1, nested virtualization is supported, i.e., guests may use * VMX and be a hypervisor for its own guests. If nested=0, guests may not @@ -214,9 +221,13 @@ module_param(ple_window_shrink, uint, 0444); static unsigned int ple_window_max = KVM_VMX_DEFAULT_PLE_WINDOW_MAX; module_param(ple_window_max, uint, 0444); -/* Default is SYSTEM mode, 1 for host-guest mode */ +/* Default is SYSTEM mode, 1 for host-guest mode (which is BROKEN) */ int __read_mostly pt_mode = PT_MODE_SYSTEM; +#ifdef CONFIG_BROKEN module_param(pt_mode, int, S_IRUGO); +#endif + +struct x86_pmu_lbr __ro_after_init vmx_lbr_caps; static DEFINE_STATIC_KEY_FALSE(vmx_l1d_should_flush); static DEFINE_STATIC_KEY_FALSE(vmx_l1d_flush_cond); @@ -255,7 +266,7 @@ static int vmx_setup_l1d_flush(enum vmx_l1d_flush_state l1tf) return 0; } - if (host_arch_capabilities & ARCH_CAP_SKIP_VMENTRY_L1DFLUSH) { + if (kvm_host.arch_capabilities & ARCH_CAP_SKIP_VMENTRY_L1DFLUSH) { l1tf_vmx_mitigation = VMENTER_L1D_FLUSH_NOT_REQUIRED; return 0; } @@ -266,6 +277,7 @@ static int vmx_setup_l1d_flush(enum vmx_l1d_flush_state l1tf) case L1TF_MITIGATION_OFF: l1tf = VMENTER_L1D_FLUSH_NEVER; break; + case L1TF_MITIGATION_AUTO: case L1TF_MITIGATION_FLUSH_NOWARN: case L1TF_MITIGATION_FLUSH: case L1TF_MITIGATION_FLUSH_NOSMT: @@ -373,9 +385,9 @@ static __always_inline void vmx_disable_fb_clear(struct vcpu_vmx *vmx) if (!vmx->disable_fb_clear) return; - msr = __rdmsr(MSR_IA32_MCU_OPT_CTRL); + msr = native_rdmsrq(MSR_IA32_MCU_OPT_CTRL); msr |= FB_CLEAR_DIS; - native_wrmsrl(MSR_IA32_MCU_OPT_CTRL, msr); + native_wrmsrq(MSR_IA32_MCU_OPT_CTRL, msr); /* Cache the MSR value to avoid reading it later */ vmx->msr_ia32_mcu_opt_ctrl = msr; } @@ -386,7 +398,7 @@ static __always_inline void vmx_enable_fb_clear(struct vcpu_vmx *vmx) return; vmx->msr_ia32_mcu_opt_ctrl &= ~FB_CLEAR_DIS; - native_wrmsrl(MSR_IA32_MCU_OPT_CTRL, vmx->msr_ia32_mcu_opt_ctrl); + native_wrmsrq(MSR_IA32_MCU_OPT_CTRL, vmx->msr_ia32_mcu_opt_ctrl); } static void vmx_update_fb_clear_dis(struct kvm_vcpu *vcpu, struct vcpu_vmx *vmx) @@ -400,7 +412,7 @@ static void vmx_update_fb_clear_dis(struct kvm_vcpu *vcpu, struct vcpu_vmx *vmx) * and VM-Exit. */ vmx->disable_fb_clear = !cpu_feature_enabled(X86_FEATURE_CLEAR_CPU_BUF) && - (host_arch_capabilities & ARCH_CAP_FB_CLEAR_CTRL) && + (kvm_host.arch_capabilities & ARCH_CAP_FB_CLEAR_CTRL) && !boot_cpu_has_bug(X86_BUG_MDS) && !boot_cpu_has_bug(X86_BUG_TAA); @@ -476,10 +488,9 @@ noinline void invvpid_error(unsigned long ext, u16 vpid, gva_t gva) ext, vpid, gva); } -noinline void invept_error(unsigned long ext, u64 eptp, gpa_t gpa) +noinline void invept_error(unsigned long ext, u64 eptp) { - vmx_insn_failed("invept failed: ext=0x%lx eptp=%llx gpa=0x%llx\n", - ext, eptp, gpa); + vmx_insn_failed("invept failed: ext=0x%lx eptp=%llx\n", ext, eptp); } static DEFINE_PER_CPU(struct vmcs *, vmxarea); @@ -520,16 +531,10 @@ static const struct kvm_vmx_segment_field { VMX_SEGMENT_FIELD(LDTR), }; -static inline void vmx_segment_cache_clear(struct vcpu_vmx *vmx) -{ - vmx->segment_cache.bitmask = 0; -} static unsigned long host_idt_base; #if IS_ENABLED(CONFIG_HYPERV) -static struct kvm_x86_ops vmx_x86_ops __initdata; - static bool __read_mostly enlightened_vmcs = true; module_param(enlightened_vmcs, bool, 0444); @@ -579,9 +584,8 @@ static __init void hv_init_evmcs(void) } if (ms_hyperv.nested_features & HV_X64_NESTED_DIRECT_FLUSH) - vmx_x86_ops.enable_l2_tlb_flush + vt_x86_ops.enable_l2_tlb_flush = hv_enable_l2_tlb_flush; - } else { enlightened_vmcs = false; } @@ -753,7 +757,7 @@ fault: return -EIO; } -static void vmx_emergency_disable(void) +void vmx_emergency_disable_virtualization_cpu(void) { int cpu = raw_smp_processor_id(); struct loaded_vmcs *v; @@ -770,8 +774,11 @@ static void vmx_emergency_disable(void) return; list_for_each_entry(v, &per_cpu(loaded_vmcss_on_cpu, cpu), - loaded_vmcss_on_cpu_link) + loaded_vmcss_on_cpu_link) { vmcs_clear(v->vmcs); + if (v->shadow_vmcs) + vmcs_clear(v->shadow_vmcs); + } kvm_cpu_vmxoff(); } @@ -872,6 +879,12 @@ void vmx_update_exception_bitmap(struct kvm_vcpu *vcpu) eb = (1u << PF_VECTOR) | (1u << UD_VECTOR) | (1u << MC_VECTOR) | (1u << DB_VECTOR) | (1u << AC_VECTOR); /* + * #VE isn't used for VMX. To test against unexpected changes + * related to #VE for VMX, intercept unexpected #VE and warn on it. + */ + if (IS_ENABLED(CONFIG_KVM_INTEL_PROVE_VE)) + eb |= 1u << VE_VECTOR; + /* * Guest access to VMware backdoor ports could legitimately * trigger #GP because of TSS I/O permission bitmap. * We intercept those #GP and allow access to them anyway @@ -1058,7 +1071,7 @@ static void add_atomic_switch_msr(struct vcpu_vmx *vmx, unsigned msr, * provide that period, so a CPU could write host's record into * guest's memory. */ - wrmsrl(MSR_IA32_PEBS_ENABLE, 0); + wrmsrq(MSR_IA32_PEBS_ENABLE, 0); } i = vmx_find_loadstore_msr_slot(&m->guest, msr); @@ -1116,12 +1129,12 @@ static bool update_transition_efer(struct vcpu_vmx *vmx) * atomically, since it's faster than switching it manually. */ if (cpu_has_load_ia32_efer() || - (enable_ept && ((vmx->vcpu.arch.efer ^ host_efer) & EFER_NX))) { + (enable_ept && ((vmx->vcpu.arch.efer ^ kvm_host.efer) & EFER_NX))) { if (!(guest_efer & EFER_LMA)) guest_efer &= ~EFER_LME; - if (guest_efer != host_efer) + if (guest_efer != kvm_host.efer) add_atomic_switch_msr(vmx, MSR_EFER, - guest_efer, host_efer, false); + guest_efer, kvm_host.efer, false); else clear_atomic_switch_msr(vmx, MSR_EFER); return false; @@ -1134,7 +1147,7 @@ static bool update_transition_efer(struct vcpu_vmx *vmx) clear_atomic_switch_msr(vmx, MSR_EFER); guest_efer &= ~ignore_bits; - guest_efer |= host_efer & ignore_bits; + guest_efer |= kvm_host.efer & ignore_bits; vmx->guest_uret_msrs[i].data = guest_efer; vmx->guest_uret_msrs[i].mask = ~ignore_bits; @@ -1187,13 +1200,13 @@ static inline void pt_load_msr(struct pt_ctx *ctx, u32 addr_range) { u32 i; - wrmsrl(MSR_IA32_RTIT_STATUS, ctx->status); - wrmsrl(MSR_IA32_RTIT_OUTPUT_BASE, ctx->output_base); - wrmsrl(MSR_IA32_RTIT_OUTPUT_MASK, ctx->output_mask); - wrmsrl(MSR_IA32_RTIT_CR3_MATCH, ctx->cr3_match); + wrmsrq(MSR_IA32_RTIT_STATUS, ctx->status); + wrmsrq(MSR_IA32_RTIT_OUTPUT_BASE, ctx->output_base); + wrmsrq(MSR_IA32_RTIT_OUTPUT_MASK, ctx->output_mask); + wrmsrq(MSR_IA32_RTIT_CR3_MATCH, ctx->cr3_match); for (i = 0; i < addr_range; i++) { - wrmsrl(MSR_IA32_RTIT_ADDR0_A + i * 2, ctx->addr_a[i]); - wrmsrl(MSR_IA32_RTIT_ADDR0_B + i * 2, ctx->addr_b[i]); + wrmsrq(MSR_IA32_RTIT_ADDR0_A + i * 2, ctx->addr_a[i]); + wrmsrq(MSR_IA32_RTIT_ADDR0_B + i * 2, ctx->addr_b[i]); } } @@ -1201,13 +1214,13 @@ static inline void pt_save_msr(struct pt_ctx *ctx, u32 addr_range) { u32 i; - rdmsrl(MSR_IA32_RTIT_STATUS, ctx->status); - rdmsrl(MSR_IA32_RTIT_OUTPUT_BASE, ctx->output_base); - rdmsrl(MSR_IA32_RTIT_OUTPUT_MASK, ctx->output_mask); - rdmsrl(MSR_IA32_RTIT_CR3_MATCH, ctx->cr3_match); + rdmsrq(MSR_IA32_RTIT_STATUS, ctx->status); + rdmsrq(MSR_IA32_RTIT_OUTPUT_BASE, ctx->output_base); + rdmsrq(MSR_IA32_RTIT_OUTPUT_MASK, ctx->output_mask); + rdmsrq(MSR_IA32_RTIT_CR3_MATCH, ctx->cr3_match); for (i = 0; i < addr_range; i++) { - rdmsrl(MSR_IA32_RTIT_ADDR0_A + i * 2, ctx->addr_a[i]); - rdmsrl(MSR_IA32_RTIT_ADDR0_B + i * 2, ctx->addr_b[i]); + rdmsrq(MSR_IA32_RTIT_ADDR0_A + i * 2, ctx->addr_a[i]); + rdmsrq(MSR_IA32_RTIT_ADDR0_B + i * 2, ctx->addr_b[i]); } } @@ -1220,9 +1233,9 @@ static void pt_guest_enter(struct vcpu_vmx *vmx) * GUEST_IA32_RTIT_CTL is already set in the VMCS. * Save host state before VM entry. */ - rdmsrl(MSR_IA32_RTIT_CTL, vmx->pt_desc.host.ctl); + rdmsrq(MSR_IA32_RTIT_CTL, vmx->pt_desc.host.ctl); if (vmx->pt_desc.guest.ctl & RTIT_CTL_TRACEEN) { - wrmsrl(MSR_IA32_RTIT_CTL, 0); + wrmsrq(MSR_IA32_RTIT_CTL, 0); pt_save_msr(&vmx->pt_desc.host, vmx->pt_desc.num_address_ranges); pt_load_msr(&vmx->pt_desc.guest, vmx->pt_desc.num_address_ranges); } @@ -1243,7 +1256,7 @@ static void pt_guest_exit(struct vcpu_vmx *vmx) * i.e. RTIT_CTL is always cleared on VM-Exit. Restore it if necessary. */ if (vmx->pt_desc.host.ctl) - wrmsrl(MSR_IA32_RTIT_CTL, vmx->pt_desc.host.ctl); + wrmsrq(MSR_IA32_RTIT_CTL, vmx->pt_desc.host.ctl); } void vmx_set_host_fs_gs(struct vmcs_host_state *host, u16 fs_sel, u16 gs_sel, @@ -1276,6 +1289,7 @@ void vmx_set_host_fs_gs(struct vmcs_host_state *host, u16 fs_sel, u16 gs_sel, void vmx_prepare_switch_to_guest(struct kvm_vcpu *vcpu) { struct vcpu_vmx *vmx = to_vmx(vcpu); + struct vcpu_vt *vt = to_vt(vcpu); struct vmcs_host_state *host_state; #ifdef CONFIG_X86_64 int cpu = raw_smp_processor_id(); @@ -1304,7 +1318,7 @@ void vmx_prepare_switch_to_guest(struct kvm_vcpu *vcpu) if (vmx->nested.need_vmcs12_to_shadow_sync) nested_sync_vmcs12_to_shadow(vcpu); - if (vmx->guest_state_loaded) + if (vt->guest_state_loaded) return; host_state = &vmx->loaded_vmcs->host_state; @@ -1325,15 +1339,15 @@ void vmx_prepare_switch_to_guest(struct kvm_vcpu *vcpu) fs_sel = current->thread.fsindex; gs_sel = current->thread.gsindex; fs_base = current->thread.fsbase; - vmx->msr_host_kernel_gs_base = current->thread.gsbase; + vt->msr_host_kernel_gs_base = current->thread.gsbase; } else { savesegment(fs, fs_sel); savesegment(gs, gs_sel); fs_base = read_msr(MSR_FS_BASE); - vmx->msr_host_kernel_gs_base = read_msr(MSR_KERNEL_GS_BASE); + vt->msr_host_kernel_gs_base = read_msr(MSR_KERNEL_GS_BASE); } - wrmsrl(MSR_KERNEL_GS_BASE, vmx->msr_guest_kernel_gs_base); + wrmsrq(MSR_KERNEL_GS_BASE, vmx->msr_guest_kernel_gs_base); #else savesegment(fs, fs_sel); savesegment(gs, gs_sel); @@ -1342,14 +1356,14 @@ void vmx_prepare_switch_to_guest(struct kvm_vcpu *vcpu) #endif vmx_set_host_fs_gs(host_state, fs_sel, gs_sel, fs_base, gs_base); - vmx->guest_state_loaded = true; + vt->guest_state_loaded = true; } static void vmx_prepare_switch_to_host(struct vcpu_vmx *vmx) { struct vmcs_host_state *host_state; - if (!vmx->guest_state_loaded) + if (!vmx->vt.guest_state_loaded) return; host_state = &vmx->loaded_vmcs->host_state; @@ -1357,7 +1371,7 @@ static void vmx_prepare_switch_to_host(struct vcpu_vmx *vmx) ++vmx->vcpu.stat.host_state_reload; #ifdef CONFIG_X86_64 - rdmsrl(MSR_KERNEL_GS_BASE, vmx->msr_guest_kernel_gs_base); + rdmsrq(MSR_KERNEL_GS_BASE, vmx->msr_guest_kernel_gs_base); #endif if (host_state->ldt_sel || (host_state->gs_sel & 7)) { kvm_load_ldt(host_state->ldt_sel); @@ -1377,10 +1391,10 @@ static void vmx_prepare_switch_to_host(struct vcpu_vmx *vmx) #endif invalidate_tss_limit(); #ifdef CONFIG_X86_64 - wrmsrl(MSR_KERNEL_GS_BASE, vmx->msr_host_kernel_gs_base); + wrmsrq(MSR_KERNEL_GS_BASE, vmx->vt.msr_host_kernel_gs_base); #endif load_fixmap_gdt(raw_smp_processor_id()); - vmx->guest_state_loaded = false; + vmx->vt.guest_state_loaded = false; vmx->guest_uret_msrs_loaded = false; } @@ -1388,8 +1402,8 @@ static void vmx_prepare_switch_to_host(struct vcpu_vmx *vmx) static u64 vmx_read_guest_kernel_gs_base(struct vcpu_vmx *vmx) { preempt_disable(); - if (vmx->guest_state_loaded) - rdmsrl(MSR_KERNEL_GS_BASE, vmx->msr_guest_kernel_gs_base); + if (vmx->vt.guest_state_loaded) + rdmsrq(MSR_KERNEL_GS_BASE, vmx->msr_guest_kernel_gs_base); preempt_enable(); return vmx->msr_guest_kernel_gs_base; } @@ -1397,15 +1411,46 @@ static u64 vmx_read_guest_kernel_gs_base(struct vcpu_vmx *vmx) static void vmx_write_guest_kernel_gs_base(struct vcpu_vmx *vmx, u64 data) { preempt_disable(); - if (vmx->guest_state_loaded) - wrmsrl(MSR_KERNEL_GS_BASE, data); + if (vmx->vt.guest_state_loaded) + wrmsrq(MSR_KERNEL_GS_BASE, data); preempt_enable(); vmx->msr_guest_kernel_gs_base = data; } #endif -void vmx_vcpu_load_vmcs(struct kvm_vcpu *vcpu, int cpu, - struct loaded_vmcs *buddy) +static void grow_ple_window(struct kvm_vcpu *vcpu) +{ + struct vcpu_vmx *vmx = to_vmx(vcpu); + unsigned int old = vmx->ple_window; + + vmx->ple_window = __grow_ple_window(old, ple_window, + ple_window_grow, + ple_window_max); + + if (vmx->ple_window != old) { + vmx->ple_window_dirty = true; + trace_kvm_ple_window_update(vcpu->vcpu_id, + vmx->ple_window, old); + } +} + +static void shrink_ple_window(struct kvm_vcpu *vcpu) +{ + struct vcpu_vmx *vmx = to_vmx(vcpu); + unsigned int old = vmx->ple_window; + + vmx->ple_window = __shrink_ple_window(old, ple_window, + ple_window_shrink, + ple_window); + + if (vmx->ple_window != old) { + vmx->ple_window_dirty = true; + trace_kvm_ple_window_update(vcpu->vcpu_id, + vmx->ple_window, old); + } +} + +void vmx_vcpu_load_vmcs(struct kvm_vcpu *vcpu, int cpu) { struct vcpu_vmx *vmx = to_vmx(vcpu); bool already_loaded = vmx->loaded_vmcs->cpu == cpu; @@ -1432,16 +1477,6 @@ void vmx_vcpu_load_vmcs(struct kvm_vcpu *vcpu, int cpu, if (prev != vmx->loaded_vmcs->vmcs) { per_cpu(current_vmcs, cpu) = vmx->loaded_vmcs->vmcs; vmcs_load(vmx->loaded_vmcs->vmcs); - - /* - * No indirect branch prediction barrier needed when switching - * the active VMCS within a vCPU, unless IBRS is advertised to - * the vCPU. To minimize the number of IBPBs executed, KVM - * performs IBPB on nested VM-Exit (a single nested transition - * may switch the active VMCS multiple times). - */ - if (!buddy || WARN_ON_ONCE(buddy->vmcs != prev)) - indirect_branch_prediction_barrier(); } if (!already_loaded) { @@ -1475,18 +1510,17 @@ void vmx_vcpu_load_vmcs(struct kvm_vcpu *vcpu, int cpu, * Switches to specified vcpu, until a matching vcpu_put(), but assumes * vcpu mutex is already taken. */ -static void vmx_vcpu_load(struct kvm_vcpu *vcpu, int cpu) +void vmx_vcpu_load(struct kvm_vcpu *vcpu, int cpu) { - struct vcpu_vmx *vmx = to_vmx(vcpu); + if (vcpu->scheduled_out && !kvm_pause_in_guest(vcpu->kvm)) + shrink_ple_window(vcpu); - vmx_vcpu_load_vmcs(vcpu, cpu, NULL); + vmx_vcpu_load_vmcs(vcpu, cpu); vmx_vcpu_pi_load(vcpu, cpu); - - vmx->host_debugctlmsr = get_debugctlmsr(); } -static void vmx_vcpu_put(struct kvm_vcpu *vcpu) +void vmx_vcpu_put(struct kvm_vcpu *vcpu) { vmx_vcpu_pi_put(vcpu); @@ -1542,10 +1576,10 @@ void vmx_set_rflags(struct kvm_vcpu *vcpu, unsigned long rflags) vmcs_writel(GUEST_RFLAGS, rflags); if ((old_rflags ^ vmx->rflags) & X86_EFLAGS_VM) - vmx->emulation_required = vmx_emulation_required(vcpu); + vmx->vt.emulation_required = vmx_emulation_required(vcpu); } -static bool vmx_get_if_flag(struct kvm_vcpu *vcpu) +bool vmx_get_if_flag(struct kvm_vcpu *vcpu) { return vmx_get_rflags(vcpu) & X86_EFLAGS_IF; } @@ -1596,7 +1630,8 @@ static int vmx_rtit_ctl_check(struct kvm_vcpu *vcpu, u64 data) * result in a #GP unless the same write also clears TraceEn. */ if ((vmx->pt_desc.guest.ctl & RTIT_CTL_TRACEEN) && - ((vmx->pt_desc.guest.ctl ^ data) & ~RTIT_CTL_TRACEEN)) + (data & RTIT_CTL_TRACEEN) && + data != vmx->pt_desc.guest.ctl) return 1; /* @@ -1651,8 +1686,8 @@ static int vmx_rtit_ctl_check(struct kvm_vcpu *vcpu, u64 data) return 0; } -static int vmx_check_emulate_instruction(struct kvm_vcpu *vcpu, int emul_type, - void *insn, int insn_len) +int vmx_check_emulate_instruction(struct kvm_vcpu *vcpu, int emul_type, + void *insn, int insn_len) { /* * Emulation of instructions in SGX enclaves is impossible as RIP does @@ -1661,16 +1696,22 @@ static int vmx_check_emulate_instruction(struct kvm_vcpu *vcpu, int emul_type, * so that guest userspace can't DoS the guest simply by triggering * emulation (enclaves are CPL3 only). */ - if (to_vmx(vcpu)->exit_reason.enclave_mode) { + if (vmx_get_exit_reason(vcpu).enclave_mode) { kvm_queue_exception(vcpu, UD_VECTOR); return X86EMUL_PROPAGATE_FAULT; } + + /* Check that emulation is possible during event vectoring */ + if ((to_vmx(vcpu)->idt_vectoring_info & VECTORING_INFO_VALID_MASK) && + !kvm_can_emulate_event_vectoring(emul_type)) + return X86EMUL_UNHANDLEABLE_VECTORING; + return X86EMUL_CONTINUE; } static int skip_emulated_instruction(struct kvm_vcpu *vcpu) { - union vmx_exit_reason exit_reason = to_vmx(vcpu)->exit_reason; + union vmx_exit_reason exit_reason = vmx_get_exit_reason(vcpu); unsigned long rip, orig_rip; u32 instr_len; @@ -1736,7 +1777,7 @@ rip_updated: * Recognizes a pending MTF VM-exit and records the nested state for later * delivery. */ -static void vmx_update_emulated_instruction(struct kvm_vcpu *vcpu) +void vmx_update_emulated_instruction(struct kvm_vcpu *vcpu) { struct vmcs12 *vmcs12 = get_vmcs12(vcpu); struct vcpu_vmx *vmx = to_vmx(vcpu); @@ -1767,7 +1808,7 @@ static void vmx_update_emulated_instruction(struct kvm_vcpu *vcpu) } } -static int vmx_skip_emulated_instruction(struct kvm_vcpu *vcpu) +int vmx_skip_emulated_instruction(struct kvm_vcpu *vcpu) { vmx_update_emulated_instruction(vcpu); return skip_emulated_instruction(vcpu); @@ -1786,7 +1827,7 @@ static void vmx_clear_hlt(struct kvm_vcpu *vcpu) vmcs_write32(GUEST_ACTIVITY_STATE, GUEST_ACTIVITY_ACTIVE); } -static void vmx_inject_exception(struct kvm_vcpu *vcpu) +void vmx_inject_exception(struct kvm_vcpu *vcpu) { struct kvm_queued_exception *ex = &vcpu->arch.exception; u32 intr_info = ex->vector | INTR_INFO_VALID_MASK; @@ -1817,7 +1858,7 @@ static void vmx_inject_exception(struct kvm_vcpu *vcpu) return; } - WARN_ON_ONCE(vmx->emulation_required); + WARN_ON_ONCE(vmx->vt.emulation_required); if (kvm_exception_is_soft(ex->vector)) { vmcs_write32(VM_ENTRY_INSTRUCTION_LEN, @@ -1868,8 +1909,8 @@ static void vmx_setup_uret_msrs(struct vcpu_vmx *vmx) vmx_setup_uret_msr(vmx, MSR_EFER, update_transition_efer(vmx)); vmx_setup_uret_msr(vmx, MSR_TSC_AUX, - guest_cpuid_has(&vmx->vcpu, X86_FEATURE_RDTSCP) || - guest_cpuid_has(&vmx->vcpu, X86_FEATURE_RDPID)); + guest_cpu_cap_has(&vmx->vcpu, X86_FEATURE_RDTSCP) || + guest_cpu_cap_has(&vmx->vcpu, X86_FEATURE_RDPID)); /* * hle=0, rtm=0, tsx_ctrl=1 can be found with some combinations of new @@ -1907,12 +1948,12 @@ u64 vmx_get_l2_tsc_multiplier(struct kvm_vcpu *vcpu) return kvm_caps.default_tsc_scaling_ratio; } -static void vmx_write_tsc_offset(struct kvm_vcpu *vcpu) +void vmx_write_tsc_offset(struct kvm_vcpu *vcpu) { vmcs_write64(TSC_OFFSET, vcpu->arch.tsc_offset); } -static void vmx_write_tsc_multiplier(struct kvm_vcpu *vcpu) +void vmx_write_tsc_multiplier(struct kvm_vcpu *vcpu) { vmcs_write64(TSC_MULTIPLIER, vcpu->arch.tsc_scaling_ratio); } @@ -1955,15 +1996,15 @@ static inline bool is_vmx_feature_control_msr_valid(struct vcpu_vmx *vmx, return !(msr->data & ~valid_bits); } -static int vmx_get_msr_feature(struct kvm_msr_entry *msr) +int vmx_get_feature_msr(u32 msr, u64 *data) { - switch (msr->index) { + switch (msr) { case KVM_FIRST_EMULATED_VMX_MSR ... KVM_LAST_EMULATED_VMX_MSR: if (!nested) return 1; - return vmx_get_vmx_msr(&vmcs_config.nested, msr->index, &msr->data); + return vmx_get_vmx_msr(&vmcs_config.nested, msr, data); default: - return KVM_MSR_RET_INVALID; + return KVM_MSR_RET_UNSUPPORTED; } } @@ -1972,7 +2013,7 @@ static int vmx_get_msr_feature(struct kvm_msr_entry *msr) * Returns 0 on success, non-0 otherwise. * Assumes vcpu_load() was already called. */ -static int vmx_get_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info) +int vmx_get_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info) { struct vcpu_vmx *vmx = to_vmx(vcpu); struct vmx_uret_msr *msr; @@ -2022,7 +2063,7 @@ static int vmx_get_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info) case MSR_IA32_BNDCFGS: if (!kvm_mpx_supported() || (!msr_info->host_initiated && - !guest_cpuid_has(vcpu, X86_FEATURE_MPX))) + !guest_cpu_cap_has(vcpu, X86_FEATURE_MPX))) return 1; msr_info->data = vmcs_read64(GUEST_BNDCFGS); break; @@ -2038,13 +2079,13 @@ static int vmx_get_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info) break; case MSR_IA32_SGXLEPUBKEYHASH0 ... MSR_IA32_SGXLEPUBKEYHASH3: if (!msr_info->host_initiated && - !guest_cpuid_has(vcpu, X86_FEATURE_SGX_LC)) + !guest_cpu_cap_has(vcpu, X86_FEATURE_SGX_LC)) return 1; msr_info->data = to_vmx(vcpu)->msr_ia32_sgxlepubkeyhash [msr_info->index - MSR_IA32_SGXLEPUBKEYHASH0]; break; case KVM_FIRST_EMULATED_VMX_MSR ... KVM_LAST_EMULATED_VMX_MSR: - if (!guest_can_use(vcpu, X86_FEATURE_VMX)) + if (!guest_cpu_cap_has(vcpu, X86_FEATURE_VMX)) return 1; if (vmx_get_vmx_msr(&vmx->nested.msrs, msr_info->index, &msr_info->data)) @@ -2057,7 +2098,7 @@ static int vmx_get_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info) * sanity checking and refuse to boot. Filter all unsupported * features out. */ - if (!msr_info->host_initiated && guest_cpuid_has_evmcs(vcpu)) + if (!msr_info->host_initiated && guest_cpu_cap_has_evmcs(vcpu)) nested_evmcs_filter_control_msr(vcpu, msr_info->index, &msr_info->data); #endif @@ -2127,7 +2168,7 @@ static u64 nested_vmx_truncate_sysenter_addr(struct kvm_vcpu *vcpu, u64 data) { #ifdef CONFIG_X86_64 - if (!guest_cpuid_has(vcpu, X86_FEATURE_LM)) + if (!guest_cpu_cap_has(vcpu, X86_FEATURE_LM)) return (u32)data; #endif return (unsigned long)data; @@ -2138,7 +2179,7 @@ static u64 vmx_get_supported_debugctl(struct kvm_vcpu *vcpu, bool host_initiated u64 debugctl = 0; if (boot_cpu_has(X86_FEATURE_BUS_LOCK_DETECT) && - (host_initiated || guest_cpuid_has(vcpu, X86_FEATURE_BUS_LOCK_DETECT))) + (host_initiated || guest_cpu_cap_has(vcpu, X86_FEATURE_BUS_LOCK_DETECT))) debugctl |= DEBUGCTLMSR_BUS_LOCK_DETECT; if ((kvm_caps.supported_perf_cap & PMU_CAP_LBR_FMT) && @@ -2153,7 +2194,7 @@ static u64 vmx_get_supported_debugctl(struct kvm_vcpu *vcpu, bool host_initiated * Returns 0 on success, non-0 otherwise. * Assumes vcpu_load() was already called. */ -static int vmx_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info) +int vmx_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info) { struct vcpu_vmx *vmx = to_vmx(vcpu); struct vmx_uret_msr *msr; @@ -2242,9 +2283,9 @@ static int vmx_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info) case MSR_IA32_BNDCFGS: if (!kvm_mpx_supported() || (!msr_info->host_initiated && - !guest_cpuid_has(vcpu, X86_FEATURE_MPX))) + !guest_cpu_cap_has(vcpu, X86_FEATURE_MPX))) return 1; - if (is_noncanonical_address(data & PAGE_MASK, vcpu) || + if (is_noncanonical_msr_address(data & PAGE_MASK, vcpu) || (data & MSR_IA32_BNDCFGS_RSVD)) return 1; @@ -2344,7 +2385,7 @@ static int vmx_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info) * behavior, but it's close enough. */ if (!msr_info->host_initiated && - (!guest_cpuid_has(vcpu, X86_FEATURE_SGX_LC) || + (!guest_cpu_cap_has(vcpu, X86_FEATURE_SGX_LC) || ((vmx->msr_ia32_feature_control & FEAT_CTL_LOCKED) && !(vmx->msr_ia32_feature_control & FEAT_CTL_SGX_LC_ENABLED)))) return 1; @@ -2354,7 +2395,7 @@ static int vmx_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info) case KVM_FIRST_EMULATED_VMX_MSR ... KVM_LAST_EMULATED_VMX_MSR: if (!msr_info->host_initiated) return 1; /* they are read-only */ - if (!guest_can_use(vcpu, X86_FEATURE_VMX)) + if (!guest_cpu_cap_has(vcpu, X86_FEATURE_VMX)) return 1; return vmx_set_vmx_msr(vcpu, msr_index, data); case MSR_IA32_RTIT_CTL: @@ -2409,7 +2450,7 @@ static int vmx_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info) index = msr_info->index - MSR_IA32_RTIT_ADDR0_A; if (index >= 2 * vmx->pt_desc.num_address_ranges) return 1; - if (is_noncanonical_address(data, vcpu)) + if (is_noncanonical_msr_address(data, vcpu)) return 1; if (index % 2) vmx->pt_desc.guest.addr_b[index / 2] = data; @@ -2417,8 +2458,6 @@ static int vmx_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info) vmx->pt_desc.guest.addr_a[index / 2] = data; break; case MSR_IA32_PERF_CAPABILITIES: - if (data && !vcpu_to_pmu(vcpu)->version) - return 1; if (data & PMU_CAP_LBR_FMT) { if ((data & PMU_CAP_LBR_FMT) != (kvm_caps.supported_perf_cap & PMU_CAP_LBR_FMT)) @@ -2430,9 +2469,9 @@ static int vmx_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info) if ((data & PERF_CAP_PEBS_MASK) != (kvm_caps.supported_perf_cap & PERF_CAP_PEBS_MASK)) return 1; - if (!guest_cpuid_has(vcpu, X86_FEATURE_DS)) + if (!guest_cpu_cap_has(vcpu, X86_FEATURE_DS)) return 1; - if (!guest_cpuid_has(vcpu, X86_FEATURE_DTES64)) + if (!guest_cpu_cap_has(vcpu, X86_FEATURE_DTES64)) return 1; if (!cpuid_model_is_consistent(vcpu)) return 1; @@ -2456,7 +2495,7 @@ static int vmx_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info) return ret; } -static void vmx_cache_reg(struct kvm_vcpu *vcpu, enum kvm_reg reg) +void vmx_cache_reg(struct kvm_vcpu *vcpu, enum kvm_reg reg) { unsigned long guest_owned_bits; @@ -2510,30 +2549,6 @@ static bool cpu_has_sgx(void) return cpuid_eax(0) >= 0x12 && (cpuid_eax(0x12) & BIT(0)); } -/* - * Some cpus support VM_{ENTRY,EXIT}_IA32_PERF_GLOBAL_CTRL but they - * can't be used due to errata where VM Exit may incorrectly clear - * IA32_PERF_GLOBAL_CTRL[34:32]. Work around the errata by using the - * MSR load mechanism to switch IA32_PERF_GLOBAL_CTRL. - */ -static bool cpu_has_perf_global_ctrl_bug(void) -{ - if (boot_cpu_data.x86 == 0x6) { - switch (boot_cpu_data.x86_model) { - case INTEL_FAM6_NEHALEM_EP: /* AAK155 */ - case INTEL_FAM6_NEHALEM: /* AAP115 */ - case INTEL_FAM6_WESTMERE: /* AAT100 */ - case INTEL_FAM6_WESTMERE_EP: /* BC86,AAY89,BD102 */ - case INTEL_FAM6_NEHALEM_EX: /* BA97 */ - return true; - default: - break; - } - } - - return false; -} - static int adjust_vmx_controls(u32 ctl_min, u32 ctl_opt, u32 msr, u32 *result) { u32 vmx_msr_low, vmx_msr_high; @@ -2556,23 +2571,50 @@ static u64 adjust_vmx_controls64(u64 ctl_opt, u32 msr) { u64 allowed; - rdmsrl(msr, allowed); + rdmsrq(msr, allowed); return ctl_opt & allowed; } +#define vmx_check_entry_exit_pairs(pairs, entry_controls, exit_controls) \ +({ \ + int i, r = 0; \ + \ + BUILD_BUG_ON(sizeof(pairs[0].entry_control) != sizeof(entry_controls)); \ + BUILD_BUG_ON(sizeof(pairs[0].exit_control) != sizeof(exit_controls)); \ + \ + for (i = 0; i < ARRAY_SIZE(pairs); i++) { \ + typeof(entry_controls) n_ctrl = pairs[i].entry_control; \ + typeof(exit_controls) x_ctrl = pairs[i].exit_control; \ + \ + if (!(entry_controls & n_ctrl) == !(exit_controls & x_ctrl)) \ + continue; \ + \ + pr_warn_once("Inconsistent VM-Entry/VM-Exit pair, " \ + "entry = %llx (%llx), exit = %llx (%llx)\n", \ + (u64)(entry_controls & n_ctrl), (u64)n_ctrl, \ + (u64)(exit_controls & x_ctrl), (u64)x_ctrl); \ + \ + if (error_on_inconsistent_vmcs_config) \ + r = -EIO; \ + \ + entry_controls &= ~n_ctrl; \ + exit_controls &= ~x_ctrl; \ + } \ + r; \ +}) + static int setup_vmcs_config(struct vmcs_config *vmcs_conf, struct vmx_capability *vmx_cap) { - u32 vmx_msr_low, vmx_msr_high; u32 _pin_based_exec_control = 0; u32 _cpu_based_exec_control = 0; u32 _cpu_based_2nd_exec_control = 0; u64 _cpu_based_3rd_exec_control = 0; u32 _vmexit_control = 0; u32 _vmentry_control = 0; + u64 basic_msr; u64 misc_msr; - int i; /* * LOAD/SAVE_DEBUG_CONTROLS are absent because both are mandatory. @@ -2604,6 +2646,9 @@ static int setup_vmcs_config(struct vmcs_config *vmcs_conf, &_cpu_based_2nd_exec_control)) return -EIO; } + if (!IS_ENABLED(CONFIG_KVM_INTEL_PROVE_VE)) + _cpu_based_2nd_exec_control &= ~SECONDARY_EXEC_EPT_VIOLATION_VE; + #ifndef CONFIG_X86_64 if (!(_cpu_based_2nd_exec_control & SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES)) @@ -2628,6 +2673,7 @@ static int setup_vmcs_config(struct vmcs_config *vmcs_conf, return -EIO; vmx_cap->ept = 0; + _cpu_based_2nd_exec_control &= ~SECONDARY_EXEC_EPT_VIOLATION_VE; } if (!(_cpu_based_2nd_exec_control & SECONDARY_EXEC_ENABLE_VPID) && vmx_cap->vpid) { @@ -2672,46 +2718,54 @@ static int setup_vmcs_config(struct vmcs_config *vmcs_conf, &_vmentry_control)) return -EIO; - for (i = 0; i < ARRAY_SIZE(vmcs_entry_exit_pairs); i++) { - u32 n_ctrl = vmcs_entry_exit_pairs[i].entry_control; - u32 x_ctrl = vmcs_entry_exit_pairs[i].exit_control; - - if (!(_vmentry_control & n_ctrl) == !(_vmexit_control & x_ctrl)) - continue; - - pr_warn_once("Inconsistent VM-Entry/VM-Exit pair, entry = %x, exit = %x\n", - _vmentry_control & n_ctrl, _vmexit_control & x_ctrl); - - if (error_on_inconsistent_vmcs_config) - return -EIO; + if (vmx_check_entry_exit_pairs(vmcs_entry_exit_pairs, + _vmentry_control, _vmexit_control)) + return -EIO; - _vmentry_control &= ~n_ctrl; - _vmexit_control &= ~x_ctrl; + /* + * Some cpus support VM_{ENTRY,EXIT}_IA32_PERF_GLOBAL_CTRL but they + * can't be used due to an errata where VM Exit may incorrectly clear + * IA32_PERF_GLOBAL_CTRL[34:32]. Workaround the errata by using the + * MSR load mechanism to switch IA32_PERF_GLOBAL_CTRL. + */ + switch (boot_cpu_data.x86_vfm) { + case INTEL_NEHALEM_EP: /* AAK155 */ + case INTEL_NEHALEM: /* AAP115 */ + case INTEL_WESTMERE: /* AAT100 */ + case INTEL_WESTMERE_EP: /* BC86,AAY89,BD102 */ + case INTEL_NEHALEM_EX: /* BA97 */ + _vmentry_control &= ~VM_ENTRY_LOAD_IA32_PERF_GLOBAL_CTRL; + _vmexit_control &= ~VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL; + pr_warn_once("VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL " + "does not work properly. Using workaround\n"); + break; + default: + break; } - rdmsr(MSR_IA32_VMX_BASIC, vmx_msr_low, vmx_msr_high); + rdmsrq(MSR_IA32_VMX_BASIC, basic_msr); /* IA-32 SDM Vol 3B: VMCS size is never greater than 4kB. */ - if ((vmx_msr_high & 0x1fff) > PAGE_SIZE) + if (vmx_basic_vmcs_size(basic_msr) > PAGE_SIZE) return -EIO; #ifdef CONFIG_X86_64 - /* IA-32 SDM Vol 3B: 64-bit CPUs always have VMX_BASIC_MSR[48]==0. */ - if (vmx_msr_high & (1u<<16)) + /* + * KVM expects to be able to shove all legal physical addresses into + * VMCS fields for 64-bit kernels, and per the SDM, "This bit is always + * 0 for processors that support Intel 64 architecture". + */ + if (basic_msr & VMX_BASIC_32BIT_PHYS_ADDR_ONLY) return -EIO; #endif /* Require Write-Back (WB) memory type for VMCS accesses. */ - if (((vmx_msr_high >> 18) & 15) != 6) + if (vmx_basic_vmcs_mem_type(basic_msr) != X86_MEMTYPE_WB) return -EIO; - rdmsrl(MSR_IA32_VMX_MISC, misc_msr); - - vmcs_conf->size = vmx_msr_high & 0x1fff; - vmcs_conf->basic_cap = vmx_msr_high & ~0x1fff; - - vmcs_conf->revision_id = vmx_msr_low; + rdmsrq(MSR_IA32_VMX_MISC, misc_msr); + vmcs_conf->basic = basic_msr; vmcs_conf->pin_based_exec_ctrl = _pin_based_exec_control; vmcs_conf->cpu_based_exec_ctrl = _cpu_based_exec_control; vmcs_conf->cpu_based_2nd_exec_ctrl = _cpu_based_2nd_exec_control; @@ -2757,7 +2811,7 @@ static bool kvm_is_vmx_supported(void) return supported; } -static int vmx_check_processor_compat(void) +int vmx_check_processor_compat(void) { int cpu = raw_smp_processor_id(); struct vmcs_config vmcs_conf; @@ -2793,13 +2847,13 @@ static int kvm_cpu_vmxon(u64 vmxon_pointer) fault: WARN_ONCE(1, "VMXON faulted, MSR_IA32_FEAT_CTL (0x3a) = 0x%llx\n", - rdmsrl_safe(MSR_IA32_FEAT_CTL, &msr) ? 0xdeadbeef : msr); + rdmsrq_safe(MSR_IA32_FEAT_CTL, &msr) ? 0xdeadbeef : msr); cr4_clear_bits(X86_CR4_VMXE); return -EFAULT; } -static int vmx_hardware_enable(void) +int vmx_enable_virtualization_cpu(void) { int cpu = raw_smp_processor_id(); u64 phys_addr = __pa(per_cpu(vmxarea, cpu)); @@ -2823,9 +2877,6 @@ static int vmx_hardware_enable(void) return r; } - if (enable_ept) - ept_sync_global(); - return 0; } @@ -2839,7 +2890,7 @@ static void vmclear_local_loaded_vmcss(void) __loaded_vmcs_clear(v); } -static void vmx_hardware_disable(void) +void vmx_disable_virtualization_cpu(void) { vmclear_local_loaded_vmcss(); @@ -2861,13 +2912,13 @@ struct vmcs *alloc_vmcs_cpu(bool shadow, int cpu, gfp_t flags) if (!pages) return NULL; vmcs = page_address(pages); - memset(vmcs, 0, vmcs_config.size); + memset(vmcs, 0, vmx_basic_vmcs_size(vmcs_config.basic)); /* KVM supports Enlightened VMCS v1 only */ if (kvm_is_using_evmcs()) vmcs->hdr.revision_id = KVM_EVMCS_VERSION; else - vmcs->hdr.revision_id = vmcs_config.revision_id; + vmcs->hdr.revision_id = vmx_basic_vmcs_revision_id(vmcs_config.basic); if (shadow) vmcs->hdr.shadow_vmcs = 1; @@ -2960,7 +3011,7 @@ static __init int alloc_kvm_area(void) * physical CPU. */ if (kvm_is_using_evmcs()) - vmcs->hdr.revision_id = vmcs_config.revision_id; + vmcs->hdr.revision_id = vmx_basic_vmcs_revision_id(vmcs_config.basic); per_cpu(vmxarea, cpu) = vmcs; } @@ -3153,7 +3204,7 @@ static void exit_lmode(struct kvm_vcpu *vcpu) #endif -static void vmx_flush_tlb_all(struct kvm_vcpu *vcpu) +void vmx_flush_tlb_all(struct kvm_vcpu *vcpu) { struct vcpu_vmx *vmx = to_vmx(vcpu); @@ -3178,12 +3229,12 @@ static void vmx_flush_tlb_all(struct kvm_vcpu *vcpu) static inline int vmx_get_current_vpid(struct kvm_vcpu *vcpu) { - if (is_guest_mode(vcpu)) + if (is_guest_mode(vcpu) && nested_cpu_has_vpid(get_vmcs12(vcpu))) return nested_get_vpid02(vcpu); return to_vmx(vcpu)->vpid; } -static void vmx_flush_tlb_current(struct kvm_vcpu *vcpu) +void vmx_flush_tlb_current(struct kvm_vcpu *vcpu) { struct kvm_mmu *mmu = vcpu->arch.mmu; u64 root_hpa = mmu->root.hpa; @@ -3199,7 +3250,7 @@ static void vmx_flush_tlb_current(struct kvm_vcpu *vcpu) vpid_sync_context(vmx_get_current_vpid(vcpu)); } -static void vmx_flush_tlb_gva(struct kvm_vcpu *vcpu, gva_t addr) +void vmx_flush_tlb_gva(struct kvm_vcpu *vcpu, gva_t addr) { /* * vpid_sync_vcpu_addr() is a nop if vpid==0, see the comment in @@ -3208,7 +3259,7 @@ static void vmx_flush_tlb_gva(struct kvm_vcpu *vcpu, gva_t addr) vpid_sync_vcpu_addr(vmx_get_current_vpid(vcpu), addr); } -static void vmx_flush_tlb_guest(struct kvm_vcpu *vcpu) +void vmx_flush_tlb_guest(struct kvm_vcpu *vcpu) { /* * vpid_sync_context() is a nop if vpid==0, e.g. if enable_vpid==0 or a @@ -3253,7 +3304,7 @@ void ept_save_pdptrs(struct kvm_vcpu *vcpu) #define CR3_EXITING_BITS (CPU_BASED_CR3_LOAD_EXITING | \ CPU_BASED_CR3_STORE_EXITING) -static bool vmx_is_valid_cr0(struct kvm_vcpu *vcpu, unsigned long cr0) +bool vmx_is_valid_cr0(struct kvm_vcpu *vcpu, unsigned long cr0) { if (is_guest_mode(vcpu)) return nested_guest_cr0_valid(vcpu, cr0); @@ -3350,7 +3401,7 @@ void vmx_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0) } /* depends on vcpu->arch.cr0 to be set to a new value */ - vmx->emulation_required = vmx_emulation_required(vcpu); + vmx->vt.emulation_required = vmx_emulation_required(vcpu); } static int vmx_get_max_ept_level(void) @@ -3374,8 +3425,7 @@ u64 construct_eptp(struct kvm_vcpu *vcpu, hpa_t root_hpa, int root_level) return eptp; } -static void vmx_load_mmu_pgd(struct kvm_vcpu *vcpu, hpa_t root_hpa, - int root_level) +void vmx_load_mmu_pgd(struct kvm_vcpu *vcpu, hpa_t root_hpa, int root_level) { struct kvm *kvm = vcpu->kvm; bool update_guest_cr3 = true; @@ -3404,8 +3454,7 @@ static void vmx_load_mmu_pgd(struct kvm_vcpu *vcpu, hpa_t root_hpa, vmcs_writel(GUEST_CR3, guest_cr3); } - -static bool vmx_is_valid_cr4(struct kvm_vcpu *vcpu, unsigned long cr4) +bool vmx_is_valid_cr4(struct kvm_vcpu *vcpu, unsigned long cr4) { /* * We operate under the default treatment of SMM, so VMX cannot be @@ -3482,7 +3531,7 @@ void vmx_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4) vmcs_writel(GUEST_CR4, hw_cr4); if ((cr4 ^ old_cr4) & (X86_CR4_OSXSAVE | X86_CR4_PKE)) - kvm_update_cpuid_runtime(vcpu); + vcpu->arch.cpuid_dynamic_bits_dirty = true; } void vmx_get_segment(struct kvm_vcpu *vcpu, struct kvm_segment *var, int seg) @@ -3521,7 +3570,7 @@ void vmx_get_segment(struct kvm_vcpu *vcpu, struct kvm_segment *var, int seg) var->g = (ar >> 15) & 1; } -static u64 vmx_get_segment_base(struct kvm_vcpu *vcpu, int seg) +u64 vmx_get_segment_base(struct kvm_vcpu *vcpu, int seg) { struct kvm_segment s; @@ -3532,16 +3581,29 @@ static u64 vmx_get_segment_base(struct kvm_vcpu *vcpu, int seg) return vmx_read_guest_seg_base(to_vmx(vcpu), seg); } -int vmx_get_cpl(struct kvm_vcpu *vcpu) +static int __vmx_get_cpl(struct kvm_vcpu *vcpu, bool no_cache) { struct vcpu_vmx *vmx = to_vmx(vcpu); + int ar; if (unlikely(vmx->rmode.vm86_active)) return 0; - else { - int ar = vmx_read_guest_seg_ar(vmx, VCPU_SREG_SS); - return VMX_AR_DPL(ar); - } + + if (no_cache) + ar = vmcs_read32(GUEST_SS_AR_BYTES); + else + ar = vmx_read_guest_seg_ar(vmx, VCPU_SREG_SS); + return VMX_AR_DPL(ar); +} + +int vmx_get_cpl(struct kvm_vcpu *vcpu) +{ + return __vmx_get_cpl(vcpu, false); +} + +int vmx_get_cpl_no_cache(struct kvm_vcpu *vcpu) +{ + return __vmx_get_cpl(vcpu, true); } static u32 vmx_segment_access_rights(struct kvm_segment *var) @@ -3598,14 +3660,14 @@ void __vmx_set_segment(struct kvm_vcpu *vcpu, struct kvm_segment *var, int seg) vmcs_write32(sf->ar_bytes, vmx_segment_access_rights(var)); } -static void vmx_set_segment(struct kvm_vcpu *vcpu, struct kvm_segment *var, int seg) +void vmx_set_segment(struct kvm_vcpu *vcpu, struct kvm_segment *var, int seg) { __vmx_set_segment(vcpu, var, seg); - to_vmx(vcpu)->emulation_required = vmx_emulation_required(vcpu); + to_vmx(vcpu)->vt.emulation_required = vmx_emulation_required(vcpu); } -static void vmx_get_cs_db_l_bits(struct kvm_vcpu *vcpu, int *db, int *l) +void vmx_get_cs_db_l_bits(struct kvm_vcpu *vcpu, int *db, int *l) { u32 ar = vmx_read_guest_seg_ar(to_vmx(vcpu), VCPU_SREG_CS); @@ -3613,25 +3675,25 @@ static void vmx_get_cs_db_l_bits(struct kvm_vcpu *vcpu, int *db, int *l) *l = (ar >> 13) & 1; } -static void vmx_get_idt(struct kvm_vcpu *vcpu, struct desc_ptr *dt) +void vmx_get_idt(struct kvm_vcpu *vcpu, struct desc_ptr *dt) { dt->size = vmcs_read32(GUEST_IDTR_LIMIT); dt->address = vmcs_readl(GUEST_IDTR_BASE); } -static void vmx_set_idt(struct kvm_vcpu *vcpu, struct desc_ptr *dt) +void vmx_set_idt(struct kvm_vcpu *vcpu, struct desc_ptr *dt) { vmcs_write32(GUEST_IDTR_LIMIT, dt->size); vmcs_writel(GUEST_IDTR_BASE, dt->address); } -static void vmx_get_gdt(struct kvm_vcpu *vcpu, struct desc_ptr *dt) +void vmx_get_gdt(struct kvm_vcpu *vcpu, struct desc_ptr *dt) { dt->size = vmcs_read32(GUEST_GDTR_LIMIT); dt->address = vmcs_readl(GUEST_GDTR_BASE); } -static void vmx_set_gdt(struct kvm_vcpu *vcpu, struct desc_ptr *dt) +void vmx_set_gdt(struct kvm_vcpu *vcpu, struct desc_ptr *dt) { vmcs_write32(GUEST_GDTR_LIMIT, dt->size); vmcs_writel(GUEST_GDTR_BASE, dt->address); @@ -4099,27 +4161,7 @@ void pt_update_intercept_for_msr(struct kvm_vcpu *vcpu) } } -static bool vmx_guest_apic_has_interrupt(struct kvm_vcpu *vcpu) -{ - struct vcpu_vmx *vmx = to_vmx(vcpu); - void *vapic_page; - u32 vppr; - int rvi; - - if (WARN_ON_ONCE(!is_guest_mode(vcpu)) || - !nested_cpu_has_vid(get_vmcs12(vcpu)) || - WARN_ON_ONCE(!vmx->nested.virtual_apic_map.gfn)) - return false; - - rvi = vmx_get_rvi(); - - vapic_page = vmx->nested.virtual_apic_map.hva; - vppr = *((u32 *)(vapic_page + APIC_PROCPRI)); - - return ((rvi & 0xf0) > (vppr & 0xf0)); -} - -static void vmx_msr_filter_changed(struct kvm_vcpu *vcpu) +void vmx_msr_filter_changed(struct kvm_vcpu *vcpu) { struct vcpu_vmx *vmx = to_vmx(vcpu); u32 i; @@ -4150,55 +4192,18 @@ static void vmx_msr_filter_changed(struct kvm_vcpu *vcpu) pt_update_intercept_for_msr(vcpu); } -static inline void kvm_vcpu_trigger_posted_interrupt(struct kvm_vcpu *vcpu, - int pi_vec) -{ -#ifdef CONFIG_SMP - if (vcpu->mode == IN_GUEST_MODE) { - /* - * The vector of the virtual has already been set in the PIR. - * Send a notification event to deliver the virtual interrupt - * unless the vCPU is the currently running vCPU, i.e. the - * event is being sent from a fastpath VM-Exit handler, in - * which case the PIR will be synced to the vIRR before - * re-entering the guest. - * - * When the target is not the running vCPU, the following - * possibilities emerge: - * - * Case 1: vCPU stays in non-root mode. Sending a notification - * event posts the interrupt to the vCPU. - * - * Case 2: vCPU exits to root mode and is still runnable. The - * PIR will be synced to the vIRR before re-entering the guest. - * Sending a notification event is ok as the host IRQ handler - * will ignore the spurious event. - * - * Case 3: vCPU exits to root mode and is blocked. vcpu_block() - * has already synced PIR to vIRR and never blocks the vCPU if - * the vIRR is not empty. Therefore, a blocked vCPU here does - * not wait for any requested interrupts in PIR, and sending a - * notification event also results in a benign, spurious event. - */ - - if (vcpu != kvm_get_running_vcpu()) - __apic_send_IPI_mask(get_cpu_mask(vcpu->cpu), pi_vec); - return; - } -#endif - /* - * The vCPU isn't in the guest; wake the vCPU in case it is blocking, - * otherwise do nothing as KVM will grab the highest priority pending - * IRQ via ->sync_pir_to_irr() in vcpu_enter_guest(). - */ - kvm_vcpu_wake_up(vcpu); -} - static int vmx_deliver_nested_posted_interrupt(struct kvm_vcpu *vcpu, int vector) { struct vcpu_vmx *vmx = to_vmx(vcpu); + /* + * DO NOT query the vCPU's vmcs12, as vmcs12 is dynamically allocated + * and freed, and must not be accessed outside of vcpu->mutex. The + * vCPU's cached PI NV is valid if and only if posted interrupts + * enabled in its vmcs12, i.e. checking the vector also checks that + * L1 has enabled posted interrupts for L2. + */ if (is_guest_mode(vcpu) && vector == vmx->nested.posted_intr_nv) { /* @@ -4235,7 +4240,7 @@ static int vmx_deliver_nested_posted_interrupt(struct kvm_vcpu *vcpu, */ static int vmx_deliver_posted_interrupt(struct kvm_vcpu *vcpu, int vector) { - struct vcpu_vmx *vmx = to_vmx(vcpu); + struct vcpu_vt *vt = to_vt(vcpu); int r; r = vmx_deliver_nested_posted_interrupt(vcpu, vector); @@ -4246,25 +4251,12 @@ static int vmx_deliver_posted_interrupt(struct kvm_vcpu *vcpu, int vector) if (!vcpu->arch.apic->apicv_active) return -1; - if (pi_test_and_set_pir(vector, &vmx->pi_desc)) - return 0; - - /* If a previous notification has sent the IPI, nothing to do. */ - if (pi_test_and_set_on(&vmx->pi_desc)) - return 0; - - /* - * The implied barrier in pi_test_and_set_on() pairs with the smp_mb_*() - * after setting vcpu->mode in vcpu_enter_guest(), thus the vCPU is - * guaranteed to see PID.ON=1 and sync the PIR to IRR if triggering a - * posted interrupt "fails" because vcpu->mode != IN_GUEST_MODE. - */ - kvm_vcpu_trigger_posted_interrupt(vcpu, POSTED_INTR_VECTOR); + __vmx_deliver_posted_interrupt(vcpu, &vt->pi_desc, vector); return 0; } -static void vmx_deliver_interrupt(struct kvm_lapic *apic, int delivery_mode, - int trig_mode, int vector) +void vmx_deliver_interrupt(struct kvm_lapic *apic, int delivery_mode, + int trig_mode, int vector) { struct kvm_vcpu *vcpu = apic->vcpu; @@ -4339,7 +4331,7 @@ void vmx_set_constant_host_state(struct vcpu_vmx *vmx) if (!IS_ENABLED(CONFIG_IA32_EMULATION) && !IS_ENABLED(CONFIG_X86_32)) vmcs_writel(HOST_IA32_SYSENTER_ESP, 0); - rdmsrl(MSR_IA32_SYSENTER_EIP, tmpl); + rdmsrq(MSR_IA32_SYSENTER_EIP, tmpl); vmcs_writel(HOST_IA32_SYSENTER_EIP, tmpl); /* 22.2.3 */ if (vmcs_config.vmexit_ctrl & VM_EXIT_LOAD_IA32_PAT) { @@ -4348,7 +4340,7 @@ void vmx_set_constant_host_state(struct vcpu_vmx *vmx) } if (cpu_has_load_ia32_efer()) - vmcs_write64(HOST_IA32_EFER, host_efer); + vmcs_write64(HOST_IA32_EFER, kvm_host.efer); } void set_cr4_guest_host_mask(struct vcpu_vmx *vmx) @@ -4397,9 +4389,6 @@ static u32 vmx_vmentry_ctrl(void) VM_ENTRY_LOAD_IA32_EFER | VM_ENTRY_IA32E_MODE); - if (cpu_has_perf_global_ctrl_bug()) - vmentry_ctrl &= ~VM_ENTRY_LOAD_IA32_PERF_GLOBAL_CTRL; - return vmentry_ctrl; } @@ -4417,16 +4406,12 @@ static u32 vmx_vmexit_ctrl(void) if (vmx_pt_mode_is_system()) vmexit_ctrl &= ~(VM_EXIT_PT_CONCEAL_PIP | VM_EXIT_CLEAR_IA32_RTIT_CTL); - - if (cpu_has_perf_global_ctrl_bug()) - vmexit_ctrl &= ~VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL; - /* Loading of EFER and PERF_GLOBAL_CTRL are toggled dynamically */ return vmexit_ctrl & ~(VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL | VM_EXIT_LOAD_IA32_EFER); } -static void vmx_refresh_apicv_exec_ctrl(struct kvm_vcpu *vcpu) +void vmx_refresh_apicv_exec_ctrl(struct kvm_vcpu *vcpu) { struct vcpu_vmx *vmx = to_vmx(vcpu); @@ -4536,7 +4521,8 @@ vmx_adjust_secondary_exec_control(struct vcpu_vmx *vmx, u32 *exec_control, * Update the nested MSR settings so that a nested VMM can/can't set * controls for features that are/aren't exposed to the guest. */ - if (nested) { + if (nested && + kvm_check_has_quirk(vmx->vcpu.kvm, KVM_X86_QUIRK_STUFF_FEATURE_MSRS)) { /* * All features that can be added or removed to VMX MSRs must * be supported in the first place for nested virtualization. @@ -4562,10 +4548,7 @@ vmx_adjust_secondary_exec_control(struct vcpu_vmx *vmx, u32 *exec_control, bool __enabled; \ \ if (cpu_has_vmx_##name()) { \ - if (kvm_is_governed_feature(X86_FEATURE_##feat_name)) \ - __enabled = guest_can_use(__vcpu, X86_FEATURE_##feat_name); \ - else \ - __enabled = guest_cpuid_has(__vcpu, X86_FEATURE_##feat_name); \ + __enabled = guest_cpu_cap_has(__vcpu, X86_FEATURE_##feat_name); \ vmx_adjust_secondary_exec_control(vmx, exec_control, SECONDARY_EXEC_##ctrl_name,\ __enabled, exiting); \ } \ @@ -4592,6 +4575,7 @@ static u32 vmx_secondary_exec_control(struct vcpu_vmx *vmx) exec_control &= ~SECONDARY_EXEC_ENABLE_VPID; if (!enable_ept) { exec_control &= ~SECONDARY_EXEC_ENABLE_EPT; + exec_control &= ~SECONDARY_EXEC_EPT_VIOLATION_VE; enable_unrestricted_guest = 0; } if (!enable_unrestricted_guest) @@ -4640,8 +4624,8 @@ static u32 vmx_secondary_exec_control(struct vcpu_vmx *vmx) */ if (cpu_has_vmx_rdtscp()) { bool rdpid_or_rdtscp_enabled = - guest_cpuid_has(vcpu, X86_FEATURE_RDTSCP) || - guest_cpuid_has(vcpu, X86_FEATURE_RDPID); + guest_cpu_cap_has(vcpu, X86_FEATURE_RDTSCP) || + guest_cpu_cap_has(vcpu, X86_FEATURE_RDPID); vmx_adjust_secondary_exec_control(vmx, &exec_control, SECONDARY_EXEC_ENABLE_RDTSCP, @@ -4690,7 +4674,7 @@ static int vmx_alloc_ipiv_pid_table(struct kvm *kvm) return 0; } -static int vmx_vcpu_precreate(struct kvm *kvm) +int vmx_vcpu_precreate(struct kvm *kvm) { return vmx_alloc_ipiv_pid_table(kvm); } @@ -4715,8 +4699,12 @@ static void init_vmcs(struct vcpu_vmx *vmx) exec_controls_set(vmx, vmx_exec_control(vmx)); - if (cpu_has_secondary_exec_ctrls()) + if (cpu_has_secondary_exec_ctrls()) { secondary_exec_controls_set(vmx, vmx_secondary_exec_control(vmx)); + if (vmx->ve_info) + vmcs_write64(VE_INFORMATION_ADDRESS, + __pa(vmx->ve_info)); + } if (cpu_has_tertiary_exec_ctrls()) tertiary_exec_controls_set(vmx, vmx_tertiary_exec_control(vmx)); @@ -4730,7 +4718,7 @@ static void init_vmcs(struct vcpu_vmx *vmx) vmcs_write16(GUEST_INTR_STATUS, 0); vmcs_write16(POSTED_INTR_NV, POSTED_INTR_VECTOR); - vmcs_write64(POSTED_INTR_DESC_ADDR, __pa((&vmx->pi_desc))); + vmcs_write64(POSTED_INTR_DESC_ADDR, __pa((&vmx->vt.pi_desc))); } if (vmx_can_use_ipiv(&vmx->vcpu)) { @@ -4787,7 +4775,7 @@ static void init_vmcs(struct vcpu_vmx *vmx) if (enable_pml) { vmcs_write64(PML_ADDRESS, page_to_phys(vmx->pml_pg)); - vmcs_write16(GUEST_PML_INDEX, PML_ENTITY_NUM - 1); + vmcs_write16(GUEST_PML_INDEX, PML_HEAD_INDEX); } vmx_write_encls_bitmap(&vmx->vcpu, NULL); @@ -4821,7 +4809,8 @@ static void __vmx_vcpu_reset(struct kvm_vcpu *vcpu) init_vmcs(vmx); - if (nested) + if (nested && + kvm_check_has_quirk(vcpu->kvm, KVM_X86_QUIRK_STUFF_FEATURE_MSRS)) memcpy(&vmx->nested.msrs, &vmcs_config.nested, sizeof(vmx->nested.msrs)); vcpu_setup_sgx_lepubkeyhash(vcpu); @@ -4834,18 +4823,19 @@ static void __vmx_vcpu_reset(struct kvm_vcpu *vcpu) vmx->nested.hv_evmcs_vmptr = EVMPTR_INVALID; #endif - vcpu->arch.microcode_version = 0x100000000ULL; + if (kvm_check_has_quirk(vcpu->kvm, KVM_X86_QUIRK_STUFF_FEATURE_MSRS)) + vcpu->arch.microcode_version = 0x100000000ULL; vmx->msr_ia32_feature_control_valid_bits = FEAT_CTL_LOCKED; /* * Enforce invariant: pi_desc.nv is always either POSTED_INTR_VECTOR * or POSTED_INTR_WAKEUP_VECTOR. */ - vmx->pi_desc.nv = POSTED_INTR_VECTOR; - vmx->pi_desc.sn = 1; + vmx->vt.pi_desc.nv = POSTED_INTR_VECTOR; + __pi_set_sn(&vmx->vt.pi_desc); } -static void vmx_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event) +void vmx_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event) { struct vcpu_vmx *vmx = to_vmx(vcpu); @@ -4860,9 +4850,6 @@ static void vmx_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event) vmx->hv_deadline_tsc = -1; kvm_set_cr8(vcpu, 0); - vmx_segment_cache_clear(vmx); - kvm_register_mark_available(vcpu, VCPU_EXREG_SEGMENTS); - seg_setup(VCPU_SREG_CS); vmcs_write16(GUEST_CS_SELECTOR, 0xf000); vmcs_writel(GUEST_CS_BASE, 0xffff0000ul); @@ -4889,6 +4876,9 @@ static void vmx_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event) vmcs_writel(GUEST_IDTR_BASE, 0); vmcs_write32(GUEST_IDTR_LIMIT, 0xffff); + vmx_segment_cache_clear(vmx); + kvm_register_mark_available(vcpu, VCPU_EXREG_SEGMENTS); + vmcs_write32(GUEST_ACTIVITY_STATE, GUEST_ACTIVITY_ACTIVE); vmcs_write32(GUEST_INTERRUPTIBILITY_INFO, 0); vmcs_writel(GUEST_PENDING_DBG_EXCEPTIONS, 0); @@ -4904,12 +4894,12 @@ static void vmx_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event) vmx_update_fb_clear_dis(vcpu, vmx); } -static void vmx_enable_irq_window(struct kvm_vcpu *vcpu) +void vmx_enable_irq_window(struct kvm_vcpu *vcpu) { exec_controls_setbit(to_vmx(vcpu), CPU_BASED_INTR_WINDOW_EXITING); } -static void vmx_enable_nmi_window(struct kvm_vcpu *vcpu) +void vmx_enable_nmi_window(struct kvm_vcpu *vcpu) { if (!enable_vnmi || vmcs_read32(GUEST_INTERRUPTIBILITY_INFO) & GUEST_INTR_STATE_STI) { @@ -4920,7 +4910,7 @@ static void vmx_enable_nmi_window(struct kvm_vcpu *vcpu) exec_controls_setbit(to_vmx(vcpu), CPU_BASED_NMI_WINDOW_EXITING); } -static void vmx_inject_irq(struct kvm_vcpu *vcpu, bool reinjected) +void vmx_inject_irq(struct kvm_vcpu *vcpu, bool reinjected) { struct vcpu_vmx *vmx = to_vmx(vcpu); uint32_t intr; @@ -4948,7 +4938,7 @@ static void vmx_inject_irq(struct kvm_vcpu *vcpu, bool reinjected) vmx_clear_hlt(vcpu); } -static void vmx_inject_nmi(struct kvm_vcpu *vcpu) +void vmx_inject_nmi(struct kvm_vcpu *vcpu) { struct vcpu_vmx *vmx = to_vmx(vcpu); @@ -5026,7 +5016,7 @@ bool vmx_nmi_blocked(struct kvm_vcpu *vcpu) GUEST_INTR_STATE_NMI)); } -static int vmx_nmi_allowed(struct kvm_vcpu *vcpu, bool for_injection) +int vmx_nmi_allowed(struct kvm_vcpu *vcpu, bool for_injection) { if (to_vmx(vcpu)->nested.nested_run_pending) return -EBUSY; @@ -5038,17 +5028,22 @@ static int vmx_nmi_allowed(struct kvm_vcpu *vcpu, bool for_injection) return !vmx_nmi_blocked(vcpu); } +bool __vmx_interrupt_blocked(struct kvm_vcpu *vcpu) +{ + return !(vmx_get_rflags(vcpu) & X86_EFLAGS_IF) || + (vmcs_read32(GUEST_INTERRUPTIBILITY_INFO) & + (GUEST_INTR_STATE_STI | GUEST_INTR_STATE_MOV_SS)); +} + bool vmx_interrupt_blocked(struct kvm_vcpu *vcpu) { if (is_guest_mode(vcpu) && nested_exit_on_intr(vcpu)) return false; - return !(vmx_get_rflags(vcpu) & X86_EFLAGS_IF) || - (vmcs_read32(GUEST_INTERRUPTIBILITY_INFO) & - (GUEST_INTR_STATE_STI | GUEST_INTR_STATE_MOV_SS)); + return __vmx_interrupt_blocked(vcpu); } -static int vmx_interrupt_allowed(struct kvm_vcpu *vcpu, bool for_injection) +int vmx_interrupt_allowed(struct kvm_vcpu *vcpu, bool for_injection) { if (to_vmx(vcpu)->nested.nested_run_pending) return -EBUSY; @@ -5063,7 +5058,7 @@ static int vmx_interrupt_allowed(struct kvm_vcpu *vcpu, bool for_injection) return !vmx_interrupt_blocked(vcpu); } -static int vmx_set_tss_addr(struct kvm *kvm, unsigned int addr) +int vmx_set_tss_addr(struct kvm *kvm, unsigned int addr) { void __user *ret; @@ -5083,7 +5078,7 @@ static int vmx_set_tss_addr(struct kvm *kvm, unsigned int addr) return init_rmode_tss(kvm, ret); } -static int vmx_set_identity_map_addr(struct kvm *kvm, u64 ident_addr) +int vmx_set_identity_map_addr(struct kvm *kvm, u64 ident_addr) { to_kvm_vmx(kvm)->ept_identity_map_addr = ident_addr; return 0; @@ -5171,6 +5166,12 @@ bool vmx_guest_inject_ac(struct kvm_vcpu *vcpu) (kvm_get_rflags(vcpu) & X86_EFLAGS_AC); } +static bool is_xfd_nm_fault(struct kvm_vcpu *vcpu) +{ + return vcpu->arch.guest_fpu.fpstate->xfd && + !kvm_is_cr0_bit_set(vcpu, X86_CR0_TS); +} + static int handle_exception_nmi(struct kvm_vcpu *vcpu) { struct vcpu_vmx *vmx = to_vmx(vcpu); @@ -5197,13 +5198,24 @@ static int handle_exception_nmi(struct kvm_vcpu *vcpu) * point. */ if (is_nm_fault(intr_info)) { - kvm_queue_exception(vcpu, NM_VECTOR); + kvm_queue_exception_p(vcpu, NM_VECTOR, + is_xfd_nm_fault(vcpu) ? vcpu->arch.guest_fpu.xfd_err : 0); return 1; } if (is_invalid_opcode(intr_info)) return handle_ud(vcpu); + if (WARN_ON_ONCE(is_ve_fault(intr_info))) { + struct vmx_ve_information *ve_info = vmx->ve_info; + + WARN_ONCE(ve_info->exit_reason != EXIT_REASON_EPT_VIOLATION, + "Unexpected #VE on VM-Exit reason 0x%x", ve_info->exit_reason); + dump_vmcs(vcpu); + kvm_mmu_print_sptes(vcpu, ve_info->guest_physical_address, "#VE"); + return 1; + } + error_code = 0; if (intr_info & INTR_INFO_DELIVER_CODE_MASK) error_code = vmcs_read32(VM_EXIT_INTR_ERROR_CODE); @@ -5369,8 +5381,7 @@ static int handle_io(struct kvm_vcpu *vcpu) return kvm_fast_pio(vcpu, size, port, in); } -static void -vmx_patch_hypercall(struct kvm_vcpu *vcpu, unsigned char *hypercall) +void vmx_patch_hypercall(struct kvm_vcpu *vcpu, unsigned char *hypercall) { /* * Patch in the VMCALL instruction: @@ -5576,7 +5587,7 @@ out: return kvm_complete_insn_gp(vcpu, err); } -static void vmx_sync_dirty_debug_regs(struct kvm_vcpu *vcpu) +void vmx_sync_dirty_debug_regs(struct kvm_vcpu *vcpu) { get_debugreg(vcpu->arch.db[0], 0); get_debugreg(vcpu->arch.db[1], 1); @@ -5595,7 +5606,13 @@ static void vmx_sync_dirty_debug_regs(struct kvm_vcpu *vcpu) set_debugreg(DR6_RESERVED, 6); } -static void vmx_set_dr7(struct kvm_vcpu *vcpu, unsigned long val) +void vmx_set_dr6(struct kvm_vcpu *vcpu, unsigned long val) +{ + lockdep_assert_irqs_disabled(); + set_debugreg(vcpu->arch.dr6, 6); +} + +void vmx_set_dr7(struct kvm_vcpu *vcpu, unsigned long val) { vmcs_writel(GUEST_DR7, val); } @@ -5732,11 +5749,8 @@ static int handle_task_switch(struct kvm_vcpu *vcpu) static int handle_ept_violation(struct kvm_vcpu *vcpu) { - unsigned long exit_qualification; + unsigned long exit_qualification = vmx_get_exit_qual(vcpu); gpa_t gpa; - u64 error_code; - - exit_qualification = vmx_get_exit_qual(vcpu); /* * EPT violation happened while executing iret from NMI, @@ -5752,24 +5766,6 @@ static int handle_ept_violation(struct kvm_vcpu *vcpu) gpa = vmcs_read64(GUEST_PHYSICAL_ADDRESS); trace_kvm_page_fault(vcpu, gpa, exit_qualification); - /* Is it a read fault? */ - error_code = (exit_qualification & EPT_VIOLATION_ACC_READ) - ? PFERR_USER_MASK : 0; - /* Is it a write fault? */ - error_code |= (exit_qualification & EPT_VIOLATION_ACC_WRITE) - ? PFERR_WRITE_MASK : 0; - /* Is it a fetch fault? */ - error_code |= (exit_qualification & EPT_VIOLATION_ACC_INSTR) - ? PFERR_FETCH_MASK : 0; - /* ept page table entry is present? */ - error_code |= (exit_qualification & EPT_VIOLATION_RWX_MASK) - ? PFERR_PRESENT_MASK : 0; - - error_code |= (exit_qualification & EPT_VIOLATION_GVA_TRANSLATED) != 0 ? - PFERR_GUEST_FINAL_MASK : PFERR_GUEST_PAGE_MASK; - - vcpu->arch.exit_qualification = exit_qualification; - /* * Check that the GPA doesn't exceed physical memory limits, as that is * a guest page fault. We have to emulate the instruction here, because @@ -5781,7 +5777,7 @@ static int handle_ept_violation(struct kvm_vcpu *vcpu) if (unlikely(allow_smaller_maxphyaddr && !kvm_vcpu_is_legal_gpa(vcpu, gpa))) return kvm_emulate_instruction(vcpu, 0); - return kvm_mmu_page_fault(vcpu, gpa, error_code, NULL, 0); + return __vmx_handle_ept_violation(vcpu, gpa, exit_qualification); } static int handle_ept_misconfig(struct kvm_vcpu *vcpu) @@ -5817,11 +5813,35 @@ static int handle_nmi_window(struct kvm_vcpu *vcpu) return 1; } -static bool vmx_emulation_required_with_pending_exception(struct kvm_vcpu *vcpu) +/* + * Returns true if emulation is required (due to the vCPU having invalid state + * with unsrestricted guest mode disabled) and KVM can't faithfully emulate the + * current vCPU state. + */ +static bool vmx_unhandleable_emulation_required(struct kvm_vcpu *vcpu) { struct vcpu_vmx *vmx = to_vmx(vcpu); - return vmx->emulation_required && !vmx->rmode.vm86_active && + if (!vmx->vt.emulation_required) + return false; + + /* + * It is architecturally impossible for emulation to be required when a + * nested VM-Enter is pending completion, as VM-Enter will VM-Fail if + * guest state is invalid and unrestricted guest is disabled, i.e. KVM + * should synthesize VM-Fail instead emulation L2 code. This path is + * only reachable if userspace modifies L2 guest state after KVM has + * performed the nested VM-Enter consistency checks. + */ + if (vmx->nested.nested_run_pending) + return true; + + /* + * KVM only supports emulating exceptions if the vCPU is in Real Mode. + * If emulation is required, KVM can't perform a successful VM-Enter to + * inject the exception. + */ + return !vmx->rmode.vm86_active && (kvm_is_exception_pending(vcpu) || vcpu->arch.exception.injected); } @@ -5834,7 +5854,7 @@ static int handle_invalid_guest_state(struct kvm_vcpu *vcpu) intr_window_requested = exec_controls_get(vmx) & CPU_BASED_INTR_WINDOW_EXITING; - while (vmx->emulation_required && count-- != 0) { + while (vmx->vt.emulation_required && count-- != 0) { if (intr_window_requested && !vmx_interrupt_blocked(vcpu)) return handle_interrupt_window(&vmx->vcpu); @@ -5844,7 +5864,7 @@ static int handle_invalid_guest_state(struct kvm_vcpu *vcpu) if (!kvm_emulate_instruction(vcpu, 0)) return 0; - if (vmx_emulation_required_with_pending_exception(vcpu)) { + if (vmx_unhandleable_emulation_required(vcpu)) { kvm_prepare_emulation_failure_exit(vcpu); return 0; } @@ -5866,9 +5886,9 @@ static int handle_invalid_guest_state(struct kvm_vcpu *vcpu) return 1; } -static int vmx_vcpu_pre_run(struct kvm_vcpu *vcpu) +int vmx_vcpu_pre_run(struct kvm_vcpu *vcpu) { - if (vmx_emulation_required_with_pending_exception(vcpu)) { + if (vmx_unhandleable_emulation_required(vcpu)) { kvm_prepare_emulation_failure_exit(vcpu); return 0; } @@ -5876,38 +5896,6 @@ static int vmx_vcpu_pre_run(struct kvm_vcpu *vcpu) return 1; } -static void grow_ple_window(struct kvm_vcpu *vcpu) -{ - struct vcpu_vmx *vmx = to_vmx(vcpu); - unsigned int old = vmx->ple_window; - - vmx->ple_window = __grow_ple_window(old, ple_window, - ple_window_grow, - ple_window_max); - - if (vmx->ple_window != old) { - vmx->ple_window_dirty = true; - trace_kvm_ple_window_update(vcpu->vcpu_id, - vmx->ple_window, old); - } -} - -static void shrink_ple_window(struct kvm_vcpu *vcpu) -{ - struct vcpu_vmx *vmx = to_vmx(vcpu); - unsigned int old = vmx->ple_window; - - vmx->ple_window = __shrink_ple_window(old, ple_window, - ple_window_shrink, - ple_window); - - if (vmx->ple_window != old) { - vmx->ple_window_dirty = true; - trace_kvm_ple_window_update(vcpu->vcpu_id, - vmx->ple_window, old); - } -} - /* * Indicate a busy-waiting vcpu in spinlock. We do not enable the PAUSE * exiting, so only get here on cpu with PAUSE-Loop-Exiting. @@ -5943,7 +5931,7 @@ static int handle_invpcid(struct kvm_vcpu *vcpu) } operand; int gpr_index; - if (!guest_cpuid_has(vcpu, X86_FEATURE_INVPCID)) { + if (!guest_cpu_cap_has(vcpu, X86_FEATURE_INVPCID)) { kvm_queue_exception(vcpu, UD_VECTOR); return 1; } @@ -6033,7 +6021,7 @@ static int handle_preemption_timer(struct kvm_vcpu *vcpu) /* * When nested=0, all VMX instruction VM Exits filter here. The handlers - * are overwritten by nested_vmx_setup() when nested=1. + * are overwritten by nested_vmx_hardware_setup() when nested=1. */ static int handle_vmx_instruction(struct kvm_vcpu *vcpu) { @@ -6061,7 +6049,7 @@ static int handle_bus_lock_vmexit(struct kvm_vcpu *vcpu) * VM-Exits. Unconditionally set the flag here and leave the handling to * vmx_handle_exit(). */ - to_vmx(vcpu)->exit_reason.bus_lock_detected = true; + to_vt(vcpu)->exit_reason.bus_lock_detected = true; return 1; } @@ -6154,15 +6142,14 @@ static int (*kvm_vmx_exit_handlers[])(struct kvm_vcpu *vcpu) = { static const int kvm_vmx_max_exit_handlers = ARRAY_SIZE(kvm_vmx_exit_handlers); -static void vmx_get_exit_info(struct kvm_vcpu *vcpu, u32 *reason, - u64 *info1, u64 *info2, - u32 *intr_info, u32 *error_code) +void vmx_get_exit_info(struct kvm_vcpu *vcpu, u32 *reason, + u64 *info1, u64 *info2, u32 *intr_info, u32 *error_code) { struct vcpu_vmx *vmx = to_vmx(vcpu); - *reason = vmx->exit_reason.full; + *reason = vmx->vt.exit_reason.full; *info1 = vmx_get_exit_qual(vcpu); - if (!(vmx->exit_reason.failed_vmentry)) { + if (!(vmx->vt.exit_reason.failed_vmentry)) { *info2 = vmx->idt_vectoring_info; *intr_info = vmx_get_intr_info(vcpu); if (is_exception_with_error_code(*intr_info)) @@ -6176,6 +6163,15 @@ static void vmx_get_exit_info(struct kvm_vcpu *vcpu, u32 *reason, } } +void vmx_get_entry_info(struct kvm_vcpu *vcpu, u32 *intr_info, u32 *error_code) +{ + *intr_info = vmcs_read32(VM_ENTRY_INTR_INFO_FIELD); + if (is_exception_with_error_code(*intr_info)) + *error_code = vmcs_read32(VM_ENTRY_EXCEPTION_ERROR_CODE); + else + *error_code = 0; +} + static void vmx_destroy_pml_buffer(struct vcpu_vmx *vmx) { if (vmx->pml_pg) { @@ -6187,32 +6183,40 @@ static void vmx_destroy_pml_buffer(struct vcpu_vmx *vmx) static void vmx_flush_pml_buffer(struct kvm_vcpu *vcpu) { struct vcpu_vmx *vmx = to_vmx(vcpu); + u16 pml_idx, pml_tail_index; u64 *pml_buf; - u16 pml_idx; + int i; pml_idx = vmcs_read16(GUEST_PML_INDEX); /* Do nothing if PML buffer is empty */ - if (pml_idx == (PML_ENTITY_NUM - 1)) + if (pml_idx == PML_HEAD_INDEX) return; + /* + * PML index always points to the next available PML buffer entity + * unless PML log has just overflowed. + */ + pml_tail_index = (pml_idx >= PML_LOG_NR_ENTRIES) ? 0 : pml_idx + 1; - /* PML index always points to next available PML buffer entity */ - if (pml_idx >= PML_ENTITY_NUM) - pml_idx = 0; - else - pml_idx++; - + /* + * PML log is written backwards: the CPU first writes the entry 511 + * then the entry 510, and so on. + * + * Read the entries in the same order they were written, to ensure that + * the dirty ring is filled in the same order the CPU wrote them. + */ pml_buf = page_address(vmx->pml_pg); - for (; pml_idx < PML_ENTITY_NUM; pml_idx++) { + + for (i = PML_HEAD_INDEX; i >= pml_tail_index; i--) { u64 gpa; - gpa = pml_buf[pml_idx]; + gpa = pml_buf[i]; WARN_ON(gpa & (PAGE_SIZE - 1)); kvm_vcpu_mark_page_dirty(vcpu, gpa >> PAGE_SHIFT); } /* reset PML index */ - vmcs_write16(GUEST_PML_INDEX, PML_ENTITY_NUM - 1); + vmcs_write16(GUEST_PML_INDEX, PML_HEAD_INDEX); } static void vmx_dump_sel(char *name, uint32_t sel) @@ -6414,6 +6418,24 @@ void dump_vmcs(struct kvm_vcpu *vcpu) if (secondary_exec_control & SECONDARY_EXEC_ENABLE_VPID) pr_err("Virtual processor ID = 0x%04x\n", vmcs_read16(VIRTUAL_PROCESSOR_ID)); + if (secondary_exec_control & SECONDARY_EXEC_EPT_VIOLATION_VE) { + struct vmx_ve_information *ve_info = vmx->ve_info; + u64 ve_info_pa = vmcs_read64(VE_INFORMATION_ADDRESS); + + /* + * If KVM is dumping the VMCS, then something has gone wrong + * already. Derefencing an address from the VMCS, which could + * very well be corrupted, is a terrible idea. The virtual + * address is known so use it. + */ + pr_err("VE info address = 0x%016llx%s\n", ve_info_pa, + ve_info_pa == __pa(ve_info) ? "" : "(corrupted!)"); + pr_err("ve_info: 0x%08x 0x%08x 0x%016llx 0x%016llx 0x%016llx 0x%04x\n", + ve_info->exit_reason, ve_info->delivery, + ve_info->exit_qualification, + ve_info->guest_linear_address, + ve_info->guest_physical_address, ve_info->eptp_index); + } } /* @@ -6423,7 +6445,7 @@ void dump_vmcs(struct kvm_vcpu *vcpu) static int __vmx_handle_exit(struct kvm_vcpu *vcpu, fastpath_t exit_fastpath) { struct vcpu_vmx *vmx = to_vmx(vcpu); - union vmx_exit_reason exit_reason = vmx->exit_reason; + union vmx_exit_reason exit_reason = vmx_get_exit_reason(vcpu); u32 vectoring_info = vmx->idt_vectoring_info; u16 exit_handler_index; @@ -6479,7 +6501,7 @@ static int __vmx_handle_exit(struct kvm_vcpu *vcpu, fastpath_t exit_fastpath) * the least awful solution for the userspace case without * risking false positives. */ - if (vmx->emulation_required) { + if (vmx->vt.emulation_required) { nested_vmx_vmexit(vcpu, EXIT_REASON_TRIPLE_FAULT, 0, 0); return 1; } @@ -6489,7 +6511,7 @@ static int __vmx_handle_exit(struct kvm_vcpu *vcpu, fastpath_t exit_fastpath) } /* If guest state is invalid, start emulating. L2 is handled above. */ - if (vmx->emulation_required) + if (vmx->vt.emulation_required) return handle_invalid_guest_state(vcpu); if (exit_reason.failed_vmentry) { @@ -6510,33 +6532,15 @@ static int __vmx_handle_exit(struct kvm_vcpu *vcpu, fastpath_t exit_fastpath) return 0; } - /* - * Note: - * Do not try to fix EXIT_REASON_EPT_MISCONFIG if it caused by - * delivery event since it indicates guest is accessing MMIO. - * The vm-exit can be triggered again after return to guest that - * will cause infinite loop. - */ if ((vectoring_info & VECTORING_INFO_VALID_MASK) && (exit_reason.basic != EXIT_REASON_EXCEPTION_NMI && exit_reason.basic != EXIT_REASON_EPT_VIOLATION && exit_reason.basic != EXIT_REASON_PML_FULL && exit_reason.basic != EXIT_REASON_APIC_ACCESS && exit_reason.basic != EXIT_REASON_TASK_SWITCH && - exit_reason.basic != EXIT_REASON_NOTIFY)) { - int ndata = 3; - - vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR; - vcpu->run->internal.suberror = KVM_INTERNAL_ERROR_DELIVERY_EV; - vcpu->run->internal.data[0] = vectoring_info; - vcpu->run->internal.data[1] = exit_reason.full; - vcpu->run->internal.data[2] = vmx_get_exit_qual(vcpu); - if (exit_reason.basic == EXIT_REASON_EPT_MISCONFIG) { - vcpu->run->internal.data[ndata++] = - vmcs_read64(GUEST_PHYSICAL_ADDRESS); - } - vcpu->run->internal.data[ndata++] = vcpu->arch.last_vmentry_cpu; - vcpu->run->internal.ndata = ndata; + exit_reason.basic != EXIT_REASON_NOTIFY && + exit_reason.basic != EXIT_REASON_EPT_MISCONFIG)) { + kvm_prepare_event_vectoring_exit(vcpu, INVALID_GPA); return 0; } @@ -6599,7 +6603,7 @@ unexpected_vmexit: return 0; } -static int vmx_handle_exit(struct kvm_vcpu *vcpu, fastpath_t exit_fastpath) +int vmx_handle_exit(struct kvm_vcpu *vcpu, fastpath_t exit_fastpath) { int ret = __vmx_handle_exit(vcpu, exit_fastpath); @@ -6607,7 +6611,7 @@ static int vmx_handle_exit(struct kvm_vcpu *vcpu, fastpath_t exit_fastpath) * Exit to user space when bus lock detected to inform that there is * a bus lock in guest. */ - if (to_vmx(vcpu)->exit_reason.bus_lock_detected) { + if (vmx_get_exit_reason(vcpu).bus_lock_detected) { if (ret > 0) vcpu->run->exit_reason = KVM_EXIT_X86_BUS_LOCK; @@ -6639,9 +6643,10 @@ static noinstr void vmx_l1d_flush(struct kvm_vcpu *vcpu) bool flush_l1d; /* - * Clear the per-vcpu flush bit, it gets set again - * either from vcpu_run() or from one of the unsafe - * VMEXIT handlers. + * Clear the per-vcpu flush bit, it gets set again if the vCPU + * is reloaded, i.e. if the vCPU is scheduled out or if KVM + * exits to userspace, or if KVM reaches one of the unsafe + * VMEXIT handlers, e.g. if KVM calls into the emulator. */ flush_l1d = vcpu->arch.l1tf_flush_l1d; vcpu->arch.l1tf_flush_l1d = false; @@ -6660,7 +6665,7 @@ static noinstr void vmx_l1d_flush(struct kvm_vcpu *vcpu) vcpu->stat.l1d_flush++; if (static_cpu_has(X86_FEATURE_FLUSH_L1D)) { - native_wrmsrl(MSR_IA32_FLUSH_CMD, L1D_FLUSH); + native_wrmsrq(MSR_IA32_FLUSH_CMD, L1D_FLUSH); return; } @@ -6687,7 +6692,7 @@ static noinstr void vmx_l1d_flush(struct kvm_vcpu *vcpu) : "eax", "ebx", "ecx", "edx"); } -static void vmx_update_cr8_intercept(struct kvm_vcpu *vcpu, int tpr, int irr) +void vmx_update_cr8_intercept(struct kvm_vcpu *vcpu, int tpr, int irr) { struct vmcs12 *vmcs12 = get_vmcs12(vcpu); int tpr_threshold; @@ -6757,14 +6762,16 @@ void vmx_set_virtual_apic_mode(struct kvm_vcpu *vcpu) vmx_update_msr_bitmap_x2apic(vcpu); } -static void vmx_set_apic_access_page_addr(struct kvm_vcpu *vcpu) +void vmx_set_apic_access_page_addr(struct kvm_vcpu *vcpu) { const gfn_t gfn = APIC_DEFAULT_PHYS_BASE >> PAGE_SHIFT; struct kvm *kvm = vcpu->kvm; struct kvm_memslots *slots = kvm_memslots(kvm); struct kvm_memory_slot *slot; + struct page *refcounted_page; unsigned long mmu_seq; kvm_pfn_t pfn; + bool writable; /* Defer reload until vmcs01 is the current VMCS. */ if (is_guest_mode(vcpu)) { @@ -6800,37 +6807,58 @@ static void vmx_set_apic_access_page_addr(struct kvm_vcpu *vcpu) * controls the APIC-access page memslot, and only deletes the memslot * if APICv is permanently inhibited, i.e. the memslot won't reappear. */ - pfn = gfn_to_pfn_memslot(slot, gfn); + pfn = __kvm_faultin_pfn(slot, gfn, FOLL_WRITE, &writable, &refcounted_page); if (is_error_noslot_pfn(pfn)) return; read_lock(&vcpu->kvm->mmu_lock); - if (mmu_invalidate_retry_gfn(kvm, mmu_seq, gfn)) { + if (mmu_invalidate_retry_gfn(kvm, mmu_seq, gfn)) kvm_make_request(KVM_REQ_APIC_PAGE_RELOAD, vcpu); - read_unlock(&vcpu->kvm->mmu_lock); - goto out; - } + else + vmcs_write64(APIC_ACCESS_ADDR, pfn_to_hpa(pfn)); - vmcs_write64(APIC_ACCESS_ADDR, pfn_to_hpa(pfn)); - read_unlock(&vcpu->kvm->mmu_lock); + /* + * Do not pin the APIC access page in memory so that it can be freely + * migrated, the MMU notifier will call us again if it is migrated or + * swapped out. KVM backs the memslot with anonymous memory, the pfn + * should always point at a refcounted page (if the pfn is valid). + */ + if (!WARN_ON_ONCE(!refcounted_page)) + kvm_release_page_clean(refcounted_page); /* * No need for a manual TLB flush at this point, KVM has already done a * flush if there were SPTEs pointing at the previous page. */ -out: - /* - * Do not pin apic access page in memory, the MMU notifier - * will call us again if it is migrated or swapped out. - */ - kvm_release_pfn_clean(pfn); + read_unlock(&vcpu->kvm->mmu_lock); } -static void vmx_hwapic_isr_update(int max_isr) +void vmx_hwapic_isr_update(struct kvm_vcpu *vcpu, int max_isr) { u16 status; u8 old; + /* + * If L2 is active, defer the SVI update until vmcs01 is loaded, as SVI + * is only relevant for if and only if Virtual Interrupt Delivery is + * enabled in vmcs12, and if VID is enabled then L2 EOIs affect L2's + * vAPIC, not L1's vAPIC. KVM must update vmcs01 on the next nested + * VM-Exit, otherwise L1 with run with a stale SVI. + */ + if (is_guest_mode(vcpu)) { + /* + * KVM is supposed to forward intercepted L2 EOIs to L1 if VID + * is enabled in vmcs12; as above, the EOIs affect L2's vAPIC. + * Note, userspace can stuff state while L2 is active; assert + * that VID is disabled if and only if the vCPU is in KVM_RUN + * to avoid false positives if userspace is setting APIC state. + */ + WARN_ON_ONCE(vcpu->wants_to_run && + nested_cpu_has_vid(get_vmcs12(vcpu))); + to_vmx(vcpu)->nested.update_vmcs01_hwapic_isr = true; + return; + } + if (max_isr == -1) max_isr = 0; @@ -6860,38 +6888,24 @@ static void vmx_set_rvi(int vector) } } -static void vmx_hwapic_irr_update(struct kvm_vcpu *vcpu, int max_irr) +int vmx_sync_pir_to_irr(struct kvm_vcpu *vcpu) { - /* - * When running L2, updating RVI is only relevant when - * vmcs12 virtual-interrupt-delivery enabled. - * However, it can be enabled only when L1 also - * intercepts external-interrupts and in that case - * we should not update vmcs02 RVI but instead intercept - * interrupt. Therefore, do nothing when running L2. - */ - if (!is_guest_mode(vcpu)) - vmx_set_rvi(max_irr); -} - -static int vmx_sync_pir_to_irr(struct kvm_vcpu *vcpu) -{ - struct vcpu_vmx *vmx = to_vmx(vcpu); + struct vcpu_vt *vt = to_vt(vcpu); int max_irr; bool got_posted_interrupt; if (KVM_BUG_ON(!enable_apicv, vcpu->kvm)) return -EIO; - if (pi_test_on(&vmx->pi_desc)) { - pi_clear_on(&vmx->pi_desc); + if (pi_test_on(&vt->pi_desc)) { + pi_clear_on(&vt->pi_desc); /* * IOMMU can write to PID.ON, so the barrier matters even on UP. * But on x86 this is just a compiler barrier anyway. */ smp_mb__after_atomic(); got_posted_interrupt = - kvm_apic_update_irr(vcpu, vmx->pi_desc.pir, &max_irr); + kvm_apic_update_irr(vcpu, vt->pi_desc.pir, &max_irr); } else { max_irr = kvm_lapic_find_highest_irr(vcpu); got_posted_interrupt = false; @@ -6920,7 +6934,7 @@ static int vmx_sync_pir_to_irr(struct kvm_vcpu *vcpu) return max_irr; } -static void vmx_load_eoi_exitmap(struct kvm_vcpu *vcpu, u64 *eoi_exit_bitmap) +void vmx_load_eoi_exitmap(struct kvm_vcpu *vcpu, u64 *eoi_exit_bitmap) { if (!kvm_vcpu_apicv_active(vcpu)) return; @@ -6931,14 +6945,6 @@ static void vmx_load_eoi_exitmap(struct kvm_vcpu *vcpu, u64 *eoi_exit_bitmap) vmcs_write64(EOI_EXIT_BITMAP3, eoi_exit_bitmap[3]); } -static void vmx_apicv_pre_state_restore(struct kvm_vcpu *vcpu) -{ - struct vcpu_vmx *vmx = to_vmx(vcpu); - - pi_clear_on(&vmx->pi_desc); - memset(vmx->pi_desc.pir, 0, sizeof(vmx->pi_desc.pir)); -} - void vmx_do_interrupt_irqoff(unsigned long entry); void vmx_do_nmi_irqoff(void); @@ -6949,37 +6955,34 @@ static void handle_nm_fault_irqoff(struct kvm_vcpu *vcpu) * MSR value is not clobbered by the host activity before the guest * has chance to consume it. * - * Do not blindly read xfd_err here, since this exception might - * be caused by L1 interception on a platform which doesn't - * support xfd at all. - * - * Do it conditionally upon guest_fpu::xfd. xfd_err matters - * only when xfd contains a non-zero value. + * Update the guest's XFD_ERR if and only if XFD is enabled, as the #NM + * interception may have been caused by L1 interception. Per the SDM, + * XFD_ERR is not modified for non-XFD #NM, i.e. if CR0.TS=1. * - * Queuing exception is done in vmx_handle_exit. See comment there. + * Note, XFD_ERR is updated _before_ the #NM interception check, i.e. + * unlike CR2 and DR6, the value is not a payload that is attached to + * the #NM exception. */ - if (vcpu->arch.guest_fpu.fpstate->xfd) - rdmsrl(MSR_IA32_XFD_ERR, vcpu->arch.guest_fpu.xfd_err); + if (is_xfd_nm_fault(vcpu)) + rdmsrq(MSR_IA32_XFD_ERR, vcpu->arch.guest_fpu.xfd_err); } -static void handle_exception_irqoff(struct vcpu_vmx *vmx) +static void handle_exception_irqoff(struct kvm_vcpu *vcpu, u32 intr_info) { - u32 intr_info = vmx_get_intr_info(&vmx->vcpu); - /* if exit due to PF check for async PF */ if (is_page_fault(intr_info)) - vmx->vcpu.arch.apf.host_apf_flags = kvm_read_and_reset_apf_flags(); + vcpu->arch.apf.host_apf_flags = kvm_read_and_reset_apf_flags(); /* if exit due to NM, handle before interrupts are enabled */ else if (is_nm_fault(intr_info)) - handle_nm_fault_irqoff(&vmx->vcpu); + handle_nm_fault_irqoff(vcpu); /* Handle machine checks before interrupts are enabled */ else if (is_machine_check(intr_info)) kvm_machine_check(); } -static void handle_external_interrupt_irqoff(struct kvm_vcpu *vcpu) +static void handle_external_interrupt_irqoff(struct kvm_vcpu *vcpu, + u32 intr_info) { - u32 intr_info = vmx_get_intr_info(vcpu); unsigned int vector = intr_info & INTR_INFO_VECTOR_MASK; if (KVM_BUG(!is_external_intr(intr_info), vcpu->kvm, @@ -6996,24 +6999,22 @@ static void handle_external_interrupt_irqoff(struct kvm_vcpu *vcpu) vcpu->arch.at_instruction_boundary = true; } -static void vmx_handle_exit_irqoff(struct kvm_vcpu *vcpu) +void vmx_handle_exit_irqoff(struct kvm_vcpu *vcpu) { - struct vcpu_vmx *vmx = to_vmx(vcpu); - - if (vmx->emulation_required) + if (to_vt(vcpu)->emulation_required) return; - if (vmx->exit_reason.basic == EXIT_REASON_EXTERNAL_INTERRUPT) - handle_external_interrupt_irqoff(vcpu); - else if (vmx->exit_reason.basic == EXIT_REASON_EXCEPTION_NMI) - handle_exception_irqoff(vmx); + if (vmx_get_exit_reason(vcpu).basic == EXIT_REASON_EXTERNAL_INTERRUPT) + handle_external_interrupt_irqoff(vcpu, vmx_get_intr_info(vcpu)); + else if (vmx_get_exit_reason(vcpu).basic == EXIT_REASON_EXCEPTION_NMI) + handle_exception_irqoff(vcpu, vmx_get_intr_info(vcpu)); } /* * The kvm parameter can be NULL (module initialization, or invocation before * VM creation). Be sure to check the kvm parameter before using it. */ -static bool vmx_has_emulated_msr(struct kvm *kvm, u32 index) +bool vmx_has_emulated_msr(struct kvm *kvm, u32 index) { switch (index) { case MSR_IA32_SMBASE: @@ -7111,13 +7112,17 @@ static void __vmx_complete_interrupts(struct kvm_vcpu *vcpu, case INTR_TYPE_SOFT_EXCEPTION: vcpu->arch.event_exit_inst_len = vmcs_read32(instr_len_field); fallthrough; - case INTR_TYPE_HARD_EXCEPTION: - if (idt_vectoring_info & VECTORING_INFO_DELIVER_CODE_MASK) { - u32 err = vmcs_read32(error_code_field); - kvm_requeue_exception_e(vcpu, vector, err); - } else - kvm_requeue_exception(vcpu, vector); + case INTR_TYPE_HARD_EXCEPTION: { + u32 error_code = 0; + + if (idt_vectoring_info & VECTORING_INFO_DELIVER_CODE_MASK) + error_code = vmcs_read32(error_code_field); + + kvm_requeue_exception(vcpu, vector, + idt_vectoring_info & VECTORING_INFO_DELIVER_CODE_MASK, + error_code); break; + } case INTR_TYPE_SOFT_INTR: vcpu->arch.event_exit_inst_len = vmcs_read32(instr_len_field); fallthrough; @@ -7136,7 +7141,7 @@ static void vmx_complete_interrupts(struct vcpu_vmx *vmx) IDT_VECTORING_ERROR_CODE); } -static void vmx_cancel_injection(struct kvm_vcpu *vcpu) +void vmx_cancel_injection(struct kvm_vcpu *vcpu) { __vmx_complete_interrupts(vcpu, vmcs_read32(VM_ENTRY_INTR_INFO_FIELD), @@ -7212,7 +7217,7 @@ void noinstr vmx_spec_ctrl_restore_host(struct vcpu_vmx *vmx, return; if (flags & VMX_RUN_SAVE_SPEC_CTRL) - vmx->spec_ctrl = __rdmsr(MSR_IA32_SPEC_CTRL); + vmx->spec_ctrl = native_rdmsrq(MSR_IA32_SPEC_CTRL); /* * If the guest/host SPEC_CTRL values differ, restore the host value. @@ -7223,7 +7228,7 @@ void noinstr vmx_spec_ctrl_restore_host(struct vcpu_vmx *vmx, */ if (cpu_feature_enabled(X86_FEATURE_KERNEL_IBRS) || vmx->spec_ctrl != hostval) - native_wrmsrl(MSR_IA32_SPEC_CTRL, hostval); + native_wrmsrq(MSR_IA32_SPEC_CTRL, hostval); barrier_nospec(); } @@ -7236,19 +7241,35 @@ static fastpath_t vmx_exit_handlers_fastpath(struct kvm_vcpu *vcpu, * the fastpath even, all other exits must use the slow path. */ if (is_guest_mode(vcpu) && - to_vmx(vcpu)->exit_reason.basic != EXIT_REASON_PREEMPTION_TIMER) + vmx_get_exit_reason(vcpu).basic != EXIT_REASON_PREEMPTION_TIMER) return EXIT_FASTPATH_NONE; - switch (to_vmx(vcpu)->exit_reason.basic) { + switch (vmx_get_exit_reason(vcpu).basic) { case EXIT_REASON_MSR_WRITE: return handle_fastpath_set_msr_irqoff(vcpu); case EXIT_REASON_PREEMPTION_TIMER: return handle_fastpath_preemption_timer(vcpu, force_immediate_exit); + case EXIT_REASON_HLT: + return handle_fastpath_hlt(vcpu); default: return EXIT_FASTPATH_NONE; } } +noinstr void vmx_handle_nmi(struct kvm_vcpu *vcpu) +{ + if ((u16)vmx_get_exit_reason(vcpu).basic != EXIT_REASON_EXCEPTION_NMI || + !is_nmi(vmx_get_intr_info(vcpu))) + return; + + kvm_before_interrupt(vcpu, KVM_HANDLING_NMI); + if (cpu_feature_enabled(X86_FEATURE_FRED)) + fred_entry_from_kvm(EVENT_TYPE_NMI, NMI_VECTOR); + else + vmx_do_nmi_irqoff(); + kvm_after_interrupt(vcpu); +} + static noinstr void vmx_vcpu_enter_exit(struct kvm_vcpu *vcpu, unsigned int flags) { @@ -7261,10 +7282,14 @@ static noinstr void vmx_vcpu_enter_exit(struct kvm_vcpu *vcpu, * mitigation for MDS is done late in VMentry and is still * executed in spite of L1D Flush. This is because an extra VERW * should not matter much after the big hammer L1D Flush. + * + * cpu_buf_vm_clear is used when system is not vulnerable to MDS/TAA, + * and is affected by MMIO Stale Data. In such cases mitigation in only + * needed against an MMIO capable guest. */ if (static_branch_unlikely(&vmx_l1d_should_flush)) vmx_l1d_flush(vcpu); - else if (static_branch_unlikely(&mmio_stale_data_clear) && + else if (static_branch_unlikely(&cpu_buf_vm_clear) && kvm_arch_has_assigned_device(vcpu->kvm)) mds_clear_cpu_buffers(); @@ -7284,29 +7309,21 @@ static noinstr void vmx_vcpu_enter_exit(struct kvm_vcpu *vcpu, vmx_enable_fb_clear(vmx); if (unlikely(vmx->fail)) { - vmx->exit_reason.full = 0xdead; + vmx->vt.exit_reason.full = 0xdead; goto out; } - vmx->exit_reason.full = vmcs_read32(VM_EXIT_REASON); - if (likely(!vmx->exit_reason.failed_vmentry)) + vmx->vt.exit_reason.full = vmcs_read32(VM_EXIT_REASON); + if (likely(!vmx_get_exit_reason(vcpu).failed_vmentry)) vmx->idt_vectoring_info = vmcs_read32(IDT_VECTORING_INFO_FIELD); - if ((u16)vmx->exit_reason.basic == EXIT_REASON_EXCEPTION_NMI && - is_nmi(vmx_get_intr_info(vcpu))) { - kvm_before_interrupt(vcpu, KVM_HANDLING_NMI); - if (cpu_feature_enabled(X86_FEATURE_FRED)) - fred_entry_from_kvm(EVENT_TYPE_NMI, NMI_VECTOR); - else - vmx_do_nmi_irqoff(); - kvm_after_interrupt(vcpu); - } + vmx_handle_nmi(vcpu); out: guest_state_exit_irqoff(); } -static fastpath_t vmx_vcpu_run(struct kvm_vcpu *vcpu, bool force_immediate_exit) +fastpath_t vmx_vcpu_run(struct kvm_vcpu *vcpu, bool force_immediate_exit) { struct vcpu_vmx *vmx = to_vmx(vcpu); unsigned long cr3, cr4; @@ -7321,15 +7338,15 @@ static fastpath_t vmx_vcpu_run(struct kvm_vcpu *vcpu, bool force_immediate_exit) * start emulation until we arrive back to a valid state. Synthesize a * consistency check VM-Exit due to invalid guest state and bail. */ - if (unlikely(vmx->emulation_required)) { + if (unlikely(vmx->vt.emulation_required)) { vmx->fail = 0; - vmx->exit_reason.full = EXIT_REASON_INVALID_STATE; - vmx->exit_reason.failed_vmentry = 1; + vmx->vt.exit_reason.full = EXIT_REASON_INVALID_STATE; + vmx->vt.exit_reason.failed_vmentry = 1; kvm_register_mark_available(vcpu, VCPU_EXREG_EXIT_INFO_1); - vmx->exit_qualification = ENTRY_FAIL_DEFAULT; + vmx->vt.exit_qualification = ENTRY_FAIL_DEFAULT; kvm_register_mark_available(vcpu, VCPU_EXREG_EXIT_INFO_2); - vmx->exit_intr_info = 0; + vmx->vt.exit_intr_info = 0; return EXIT_FASTPATH_NONE; } @@ -7371,10 +7388,6 @@ static fastpath_t vmx_vcpu_run(struct kvm_vcpu *vcpu, bool force_immediate_exit) vmx->loaded_vmcs->host_state.cr4 = cr4; } - /* When KVM_DEBUGREG_WONT_EXIT, dr6 is accessible in guest. */ - if (unlikely(vcpu->arch.switch_db_regs & KVM_DEBUGREG_WONT_EXIT)) - set_debugreg(vcpu->arch.dr6, 6); - /* When single-stepping over STI and MOV SS, we must clear the * corresponding interruptibility bits in the guest state. Otherwise * vmentry fails as it then expects bit 14 (BS) in pending debug @@ -7410,8 +7423,8 @@ static fastpath_t vmx_vcpu_run(struct kvm_vcpu *vcpu, bool force_immediate_exit) } /* MSR_IA32_DEBUGCTLMSR is zeroed on vmexit. Restore it if needed */ - if (vmx->host_debugctlmsr) - update_debugctlmsr(vmx->host_debugctlmsr); + if (vcpu->arch.host_debugctl) + update_debugctlmsr(vcpu->arch.host_debugctl); #ifndef CONFIG_X86_64 /* @@ -7436,7 +7449,7 @@ static fastpath_t vmx_vcpu_run(struct kvm_vcpu *vcpu, bool force_immediate_exit) * checking. */ if (vmx->nested.nested_run_pending && - !vmx->exit_reason.failed_vmentry) + !vmx_get_exit_reason(vcpu).failed_vmentry) ++vcpu->stat.nested_run; vmx->nested.nested_run_pending = 0; @@ -7445,12 +7458,12 @@ static fastpath_t vmx_vcpu_run(struct kvm_vcpu *vcpu, bool force_immediate_exit) if (unlikely(vmx->fail)) return EXIT_FASTPATH_NONE; - if (unlikely((u16)vmx->exit_reason.basic == EXIT_REASON_MCE_DURING_VMENTRY)) + if (unlikely((u16)vmx_get_exit_reason(vcpu).basic == EXIT_REASON_MCE_DURING_VMENTRY)) kvm_machine_check(); trace_kvm_exit(vcpu, KVM_ISA_VMX); - if (unlikely(vmx->exit_reason.failed_vmentry)) + if (unlikely(vmx_get_exit_reason(vcpu).failed_vmentry)) return EXIT_FASTPATH_NONE; vmx->loaded_vmcs->launched = 1; @@ -7461,7 +7474,7 @@ static fastpath_t vmx_vcpu_run(struct kvm_vcpu *vcpu, bool force_immediate_exit) return vmx_exit_handlers_fastpath(vcpu, force_immediate_exit); } -static void vmx_vcpu_free(struct kvm_vcpu *vcpu) +void vmx_vcpu_free(struct kvm_vcpu *vcpu) { struct vcpu_vmx *vmx = to_vmx(vcpu); @@ -7470,9 +7483,10 @@ static void vmx_vcpu_free(struct kvm_vcpu *vcpu) free_vpid(vmx->vpid); nested_vmx_free_vcpu(vcpu); free_loaded_vmcs(vmx->loaded_vmcs); + free_page((unsigned long)vmx->ve_info); } -static int vmx_vcpu_create(struct kvm_vcpu *vcpu) +int vmx_vcpu_create(struct kvm_vcpu *vcpu) { struct vmx_uret_msr *tsx_ctrl; struct vcpu_vmx *vmx; @@ -7481,7 +7495,7 @@ static int vmx_vcpu_create(struct kvm_vcpu *vcpu) BUILD_BUG_ON(offsetof(struct vcpu_vmx, vcpu) != 0); vmx = to_vmx(vcpu); - INIT_LIST_HEAD(&vmx->pi_wakeup_list); + INIT_LIST_HEAD(&vmx->vt.pi_wakeup_list); err = -ENOMEM; @@ -7563,9 +7577,23 @@ static int vmx_vcpu_create(struct kvm_vcpu *vcpu) goto free_vmcs; } + err = -ENOMEM; + if (vmcs_config.cpu_based_2nd_exec_ctrl & SECONDARY_EXEC_EPT_VIOLATION_VE) { + struct page *page; + + BUILD_BUG_ON(sizeof(*vmx->ve_info) > PAGE_SIZE); + + /* ve_info must be page aligned. */ + page = alloc_page(GFP_KERNEL_ACCOUNT | __GFP_ZERO); + if (!page) + goto free_vmcs; + + vmx->ve_info = page_to_virt(page); + } + if (vmx_can_use_ipiv(vcpu)) WRITE_ONCE(to_kvm_vmx(vcpu->kvm)->pid_table[vcpu->vcpu_id], - __pa(&vmx->pi_desc) | PID_TABLE_ENTRY_VALID); + __pa(&vmx->vt.pi_desc) | PID_TABLE_ENTRY_VALID); return 0; @@ -7581,7 +7609,7 @@ free_vpid: #define L1TF_MSG_SMT "L1TF CPU bug present and SMT on, data leak possible. See CVE-2018-3646 and https://www.kernel.org/doc/html/latest/admin-guide/hw-vuln/l1tf.html for details.\n" #define L1TF_MSG_L1D "L1TF CPU bug present and virtualization mitigation disabled, data leak possible. See CVE-2018-3646 and https://www.kernel.org/doc/html/latest/admin-guide/hw-vuln/l1tf.html for details.\n" -static int vmx_vm_init(struct kvm *kvm) +int vmx_vm_init(struct kvm *kvm) { if (!ple_gap) kvm->arch.pause_in_guest = true; @@ -7592,6 +7620,7 @@ static int vmx_vm_init(struct kvm *kvm) case L1TF_MITIGATION_FLUSH_NOWARN: /* 'I explicitly don't care' is set */ break; + case L1TF_MITIGATION_AUTO: case L1TF_MITIGATION_FLUSH: case L1TF_MITIGATION_FLUSH_NOSMT: case L1TF_MITIGATION_FULL: @@ -7609,44 +7638,37 @@ static int vmx_vm_init(struct kvm *kvm) break; } } + + if (enable_pml) + kvm->arch.cpu_dirty_log_size = PML_LOG_NR_ENTRIES; return 0; } -static u8 vmx_get_mt_mask(struct kvm_vcpu *vcpu, gfn_t gfn, bool is_mmio) +static inline bool vmx_ignore_guest_pat(struct kvm *kvm) { - /* We wanted to honor guest CD/MTRR/PAT, but doing so could result in - * memory aliases with conflicting memory types and sometimes MCEs. - * We have to be careful as to what are honored and when. - * - * For MMIO, guest CD/MTRR are ignored. The EPT memory type is set to - * UC. The effective memory type is UC or WC depending on guest PAT. - * This was historically the source of MCEs and we want to be - * conservative. - * - * When there is no need to deal with noncoherent DMA (e.g., no VT-d - * or VT-d has snoop control), guest CD/MTRR/PAT are all ignored. The - * EPT memory type is set to WB. The effective memory type is forced - * WB. - * - * Otherwise, we trust guest. Guest CD/MTRR/PAT are all honored. The - * EPT memory type is used to emulate guest CD/MTRR. + /* + * Non-coherent DMA devices need the guest to flush CPU properly. + * In that case it is not possible to map all guest RAM as WB, so + * always trust guest PAT. */ + return !kvm_arch_has_noncoherent_dma(kvm) && + kvm_check_has_quirk(kvm, KVM_X86_QUIRK_IGNORE_GUEST_PAT); +} +u8 vmx_get_mt_mask(struct kvm_vcpu *vcpu, gfn_t gfn, bool is_mmio) +{ + /* + * Force UC for host MMIO regions, as allowing the guest to access MMIO + * with cacheable accesses will result in Machine Checks. + */ if (is_mmio) return MTRR_TYPE_UNCACHABLE << VMX_EPT_MT_EPTE_SHIFT; - if (!kvm_arch_has_noncoherent_dma(vcpu->kvm)) + /* Force WB if ignoring guest PAT */ + if (vmx_ignore_guest_pat(vcpu->kvm)) return (MTRR_TYPE_WRBACK << VMX_EPT_MT_EPTE_SHIFT) | VMX_EPT_IPAT_BIT; - if (kvm_read_cr0_bits(vcpu, X86_CR0_CD)) { - if (kvm_check_has_quirk(vcpu->kvm, KVM_X86_QUIRK_CD_NW_CLEARED)) - return MTRR_TYPE_WRBACK << VMX_EPT_MT_EPTE_SHIFT; - else - return (MTRR_TYPE_UNCACHABLE << VMX_EPT_MT_EPTE_SHIFT) | - VMX_EPT_IPAT_BIT; - } - - return kvm_mtrr_get_guest_memory_type(vcpu, gfn) << VMX_EPT_MT_EPTE_SHIFT; + return (MTRR_TYPE_WRBACK << VMX_EPT_MT_EPTE_SHIFT); } static void vmcs_set_secondary_exec_control(struct vcpu_vmx *vmx, u32 new_ctl) @@ -7784,7 +7806,7 @@ static void update_intel_pt_cfg(struct kvm_vcpu *vcpu) vmx->pt_desc.ctl_bitmask &= ~(0xfULL << (32 + i * 4)); } -static void vmx_vcpu_after_set_cpuid(struct kvm_vcpu *vcpu) +void vmx_vcpu_after_set_cpuid(struct kvm_vcpu *vcpu) { struct vcpu_vmx *vmx = to_vmx(vcpu); @@ -7793,12 +7815,8 @@ static void vmx_vcpu_after_set_cpuid(struct kvm_vcpu *vcpu) * to the guest. XSAVES depends on CR4.OSXSAVE, and CR4.OSXSAVE can be * set if and only if XSAVE is supported. */ - if (boot_cpu_has(X86_FEATURE_XSAVE) && - guest_cpuid_has(vcpu, X86_FEATURE_XSAVE)) - kvm_governed_feature_check_and_set(vcpu, X86_FEATURE_XSAVES); - - kvm_governed_feature_check_and_set(vcpu, X86_FEATURE_VMX); - kvm_governed_feature_check_and_set(vcpu, X86_FEATURE_LAM); + if (!guest_cpu_cap_has(vcpu, X86_FEATURE_XSAVE)) + guest_cpu_cap_clear(vcpu, X86_FEATURE_XSAVES); vmx_setup_uret_msrs(vmx); @@ -7806,7 +7824,7 @@ static void vmx_vcpu_after_set_cpuid(struct kvm_vcpu *vcpu) vmcs_set_secondary_exec_control(vmx, vmx_secondary_exec_control(vmx)); - if (guest_can_use(vcpu, X86_FEATURE_VMX)) + if (guest_cpu_cap_has(vcpu, X86_FEATURE_VMX)) vmx->msr_ia32_feature_control_valid_bits |= FEAT_CTL_VMX_ENABLED_INSIDE_SMX | FEAT_CTL_VMX_ENABLED_OUTSIDE_SMX; @@ -7815,25 +7833,25 @@ static void vmx_vcpu_after_set_cpuid(struct kvm_vcpu *vcpu) ~(FEAT_CTL_VMX_ENABLED_INSIDE_SMX | FEAT_CTL_VMX_ENABLED_OUTSIDE_SMX); - if (guest_can_use(vcpu, X86_FEATURE_VMX)) + if (guest_cpu_cap_has(vcpu, X86_FEATURE_VMX)) nested_vmx_cr_fixed1_bits_update(vcpu); if (boot_cpu_has(X86_FEATURE_INTEL_PT) && - guest_cpuid_has(vcpu, X86_FEATURE_INTEL_PT)) + guest_cpu_cap_has(vcpu, X86_FEATURE_INTEL_PT)) update_intel_pt_cfg(vcpu); if (boot_cpu_has(X86_FEATURE_RTM)) { struct vmx_uret_msr *msr; msr = vmx_find_uret_msr(vmx, MSR_IA32_TSX_CTRL); if (msr) { - bool enabled = guest_cpuid_has(vcpu, X86_FEATURE_RTM); + bool enabled = guest_cpu_cap_has(vcpu, X86_FEATURE_RTM); vmx_set_guest_uret_msr(vmx, msr, enabled ? 0 : TSX_CTRL_RTM_DISABLE); } } if (kvm_cpu_cap_has(X86_FEATURE_XFD)) vmx_set_intercept_for_msr(vcpu, MSR_IA32_XFD_ERR, MSR_TYPE_R, - !guest_cpuid_has(vcpu, X86_FEATURE_XFD)); + !guest_cpu_cap_has(vcpu, X86_FEATURE_XFD)); if (boot_cpu_has(X86_FEATURE_IBPB)) vmx_set_intercept_for_msr(vcpu, MSR_IA32_PRED_CMD, MSR_TYPE_W, @@ -7841,17 +7859,17 @@ static void vmx_vcpu_after_set_cpuid(struct kvm_vcpu *vcpu) if (boot_cpu_has(X86_FEATURE_FLUSH_L1D)) vmx_set_intercept_for_msr(vcpu, MSR_IA32_FLUSH_CMD, MSR_TYPE_W, - !guest_cpuid_has(vcpu, X86_FEATURE_FLUSH_L1D)); + !guest_cpu_cap_has(vcpu, X86_FEATURE_FLUSH_L1D)); set_cr4_guest_host_mask(vmx); vmx_write_encls_bitmap(vcpu, NULL); - if (guest_cpuid_has(vcpu, X86_FEATURE_SGX)) + if (guest_cpu_cap_has(vcpu, X86_FEATURE_SGX)) vmx->msr_ia32_feature_control_valid_bits |= FEAT_CTL_SGX_ENABLED; else vmx->msr_ia32_feature_control_valid_bits &= ~FEAT_CTL_SGX_ENABLED; - if (guest_cpuid_has(vcpu, X86_FEATURE_SGX_LC)) + if (guest_cpu_cap_has(vcpu, X86_FEATURE_SGX_LC)) vmx->msr_ia32_feature_control_valid_bits |= FEAT_CTL_SGX_LC_ENABLED; else @@ -7862,28 +7880,55 @@ static void vmx_vcpu_after_set_cpuid(struct kvm_vcpu *vcpu) vmx_update_exception_bitmap(vcpu); } -static u64 vmx_get_perf_capabilities(void) +static __init u64 vmx_get_perf_capabilities(void) { u64 perf_cap = PMU_CAP_FW_WRITES; - struct x86_pmu_lbr lbr; u64 host_perf_cap = 0; if (!enable_pmu) return 0; if (boot_cpu_has(X86_FEATURE_PDCM)) - rdmsrl(MSR_IA32_PERF_CAPABILITIES, host_perf_cap); + rdmsrq(MSR_IA32_PERF_CAPABILITIES, host_perf_cap); if (!cpu_feature_enabled(X86_FEATURE_ARCH_LBR)) { - x86_perf_get_lbr(&lbr); - if (lbr.nr) + x86_perf_get_lbr(&vmx_lbr_caps); + + /* + * KVM requires LBR callstack support, as the overhead due to + * context switching LBRs without said support is too high. + * See intel_pmu_create_guest_lbr_event() for more info. + */ + if (!vmx_lbr_caps.has_callstack) + memset(&vmx_lbr_caps, 0, sizeof(vmx_lbr_caps)); + else if (vmx_lbr_caps.nr) perf_cap |= host_perf_cap & PMU_CAP_LBR_FMT; } if (vmx_pebs_supported()) { perf_cap |= host_perf_cap & PERF_CAP_PEBS_MASK; - if ((perf_cap & PERF_CAP_PEBS_FORMAT) < 4) - perf_cap &= ~PERF_CAP_PEBS_BASELINE; + + /* + * Disallow adaptive PEBS as it is functionally broken, can be + * used by the guest to read *host* LBRs, and can be used to + * bypass userspace event filters. To correctly and safely + * support adaptive PEBS, KVM needs to: + * + * 1. Account for the ADAPTIVE flag when (re)programming fixed + * counters. + * + * 2. Gain support from perf (or take direct control of counter + * programming) to support events without adaptive PEBS + * enabled for the hardware counter. + * + * 3. Ensure LBR MSRs cannot hold host data on VM-Entry with + * adaptive PEBS enabled and MSR_PEBS_DATA_CFG.LBRS=1. + * + * 4. Document which PMU events are effectively exposed to the + * guest via adaptive PEBS, and make adaptive PEBS mutually + * exclusive with KVM_SET_PMU_EVENT_FILTER if necessary. + */ + perf_cap &= ~PERF_CAP_PEBS_BASELINE; } return perf_cap; @@ -7918,6 +7963,7 @@ static __init void vmx_set_cpu_caps(void) kvm_cpu_cap_clear(X86_FEATURE_SGX_LC); kvm_cpu_cap_clear(X86_FEATURE_SGX1); kvm_cpu_cap_clear(X86_FEATURE_SGX2); + kvm_cpu_cap_clear(X86_FEATURE_SGX_EDECCSSA); } if (vmx_umip_emulated()) @@ -7938,66 +7984,86 @@ static __init void vmx_set_cpu_caps(void) kvm_cpu_cap_check_and_set(X86_FEATURE_WAITPKG); } -static int vmx_check_intercept_io(struct kvm_vcpu *vcpu, - struct x86_instruction_info *info) +static bool vmx_is_io_intercepted(struct kvm_vcpu *vcpu, + struct x86_instruction_info *info, + unsigned long *exit_qualification) { struct vmcs12 *vmcs12 = get_vmcs12(vcpu); unsigned short port; - bool intercept; int size; + bool imm; + + /* + * If the 'use IO bitmaps' VM-execution control is 0, IO instruction + * VM-exits depend on the 'unconditional IO exiting' VM-execution + * control. + * + * Otherwise, IO instruction VM-exits are controlled by the IO bitmaps. + */ + if (!nested_cpu_has(vmcs12, CPU_BASED_USE_IO_BITMAPS)) + return nested_cpu_has(vmcs12, CPU_BASED_UNCOND_IO_EXITING); if (info->intercept == x86_intercept_in || info->intercept == x86_intercept_ins) { port = info->src_val; size = info->dst_bytes; + imm = info->src_type == OP_IMM; } else { port = info->dst_val; size = info->src_bytes; + imm = info->dst_type == OP_IMM; } - /* - * If the 'use IO bitmaps' VM-execution control is 0, IO instruction - * VM-exits depend on the 'unconditional IO exiting' VM-execution - * control. - * - * Otherwise, IO instruction VM-exits are controlled by the IO bitmaps. - */ - if (!nested_cpu_has(vmcs12, CPU_BASED_USE_IO_BITMAPS)) - intercept = nested_cpu_has(vmcs12, - CPU_BASED_UNCOND_IO_EXITING); - else - intercept = nested_vmx_check_io_bitmaps(vcpu, port, size); - /* FIXME: produce nested vmexit and return X86EMUL_INTERCEPTED. */ - return intercept ? X86EMUL_UNHANDLEABLE : X86EMUL_CONTINUE; + *exit_qualification = ((unsigned long)port << 16) | (size - 1); + + if (info->intercept == x86_intercept_ins || + info->intercept == x86_intercept_outs) + *exit_qualification |= BIT(4); + + if (info->rep_prefix) + *exit_qualification |= BIT(5); + + if (imm) + *exit_qualification |= BIT(6); + + return nested_vmx_check_io_bitmaps(vcpu, port, size); } -static int vmx_check_intercept(struct kvm_vcpu *vcpu, - struct x86_instruction_info *info, - enum x86_intercept_stage stage, - struct x86_exception *exception) +int vmx_check_intercept(struct kvm_vcpu *vcpu, + struct x86_instruction_info *info, + enum x86_intercept_stage stage, + struct x86_exception *exception) { struct vmcs12 *vmcs12 = get_vmcs12(vcpu); + unsigned long exit_qualification = 0; + u32 vm_exit_reason; + u64 exit_insn_len; switch (info->intercept) { - /* - * RDPID causes #UD if disabled through secondary execution controls. - * Because it is marked as EmulateOnUD, we need to intercept it here. - * Note, RDPID is hidden behind ENABLE_RDTSCP. - */ case x86_intercept_rdpid: + /* + * RDPID causes #UD if not enabled through secondary execution + * controls (ENABLE_RDTSCP). Note, the implicit MSR access to + * TSC_AUX is NOT subject to interception, i.e. checking only + * the dedicated execution control is architecturally correct. + */ if (!nested_cpu_has2(vmcs12, SECONDARY_EXEC_ENABLE_RDTSCP)) { exception->vector = UD_VECTOR; exception->error_code_valid = false; return X86EMUL_PROPAGATE_FAULT; } - break; + return X86EMUL_CONTINUE; case x86_intercept_in: case x86_intercept_ins: case x86_intercept_out: case x86_intercept_outs: - return vmx_check_intercept_io(vcpu, info); + if (!vmx_is_io_intercepted(vcpu, info, &exit_qualification)) + return X86EMUL_CONTINUE; + + vm_exit_reason = EXIT_REASON_IO_INSTRUCTION; + break; case x86_intercept_lgdt: case x86_intercept_lidt: @@ -8010,7 +8076,24 @@ static int vmx_check_intercept(struct kvm_vcpu *vcpu, if (!nested_cpu_has2(vmcs12, SECONDARY_EXEC_DESC)) return X86EMUL_CONTINUE; - /* FIXME: produce nested vmexit and return X86EMUL_INTERCEPTED. */ + if (info->intercept == x86_intercept_lldt || + info->intercept == x86_intercept_ltr || + info->intercept == x86_intercept_sldt || + info->intercept == x86_intercept_str) + vm_exit_reason = EXIT_REASON_LDTR_TR; + else + vm_exit_reason = EXIT_REASON_GDTR_IDTR; + /* + * FIXME: Decode the ModR/M to generate the correct exit + * qualification for memory operands. + */ + break; + + case x86_intercept_hlt: + if (!nested_cpu_has(vmcs12, CPU_BASED_HLT_EXITING)) + return X86EMUL_CONTINUE; + + vm_exit_reason = EXIT_REASON_HLT; break; case x86_intercept_pause: @@ -8023,17 +8106,24 @@ static int vmx_check_intercept(struct kvm_vcpu *vcpu, * the PAUSE. */ if ((info->rep_prefix != REPE_PREFIX) || - !nested_cpu_has2(vmcs12, CPU_BASED_PAUSE_EXITING)) + !nested_cpu_has(vmcs12, CPU_BASED_PAUSE_EXITING)) return X86EMUL_CONTINUE; + vm_exit_reason = EXIT_REASON_PAUSE_INSTRUCTION; break; /* TODO: check more intercepts... */ default: - break; + return X86EMUL_UNHANDLEABLE; } - return X86EMUL_UNHANDLEABLE; + exit_insn_len = abs_diff((s64)info->next_rip, (s64)info->rip); + if (!exit_insn_len || exit_insn_len > X86_MAX_INSTRUCTION_LENGTH) + return X86EMUL_UNHANDLEABLE; + + __nested_vmx_vmexit(vcpu, vm_exit_reason, 0, exit_qualification, + exit_insn_len); + return X86EMUL_INTERCEPTED; } #ifdef CONFIG_X86_64 @@ -8055,8 +8145,8 @@ static inline int u64_shl_div_u64(u64 a, unsigned int shift, return 0; } -static int vmx_set_hv_timer(struct kvm_vcpu *vcpu, u64 guest_deadline_tsc, - bool *expired) +int vmx_set_hv_timer(struct kvm_vcpu *vcpu, u64 guest_deadline_tsc, + bool *expired) { struct vcpu_vmx *vmx; u64 tscl, guest_tscl, delta_tsc, lapic_timer_advance_cycles; @@ -8095,18 +8185,12 @@ static int vmx_set_hv_timer(struct kvm_vcpu *vcpu, u64 guest_deadline_tsc, return 0; } -static void vmx_cancel_hv_timer(struct kvm_vcpu *vcpu) +void vmx_cancel_hv_timer(struct kvm_vcpu *vcpu) { to_vmx(vcpu)->hv_deadline_tsc = -1; } #endif -static void vmx_sched_in(struct kvm_vcpu *vcpu, int cpu) -{ - if (!kvm_pause_in_guest(vcpu->kvm)) - shrink_ple_window(vcpu); -} - void vmx_update_cpu_dirty_logging(struct kvm_vcpu *vcpu) { struct vcpu_vmx *vmx = to_vmx(vcpu); @@ -8130,7 +8214,7 @@ void vmx_update_cpu_dirty_logging(struct kvm_vcpu *vcpu) secondary_exec_controls_clearbit(vmx, SECONDARY_EXEC_ENABLE_PML); } -static void vmx_setup_mce(struct kvm_vcpu *vcpu) +void vmx_setup_mce(struct kvm_vcpu *vcpu) { if (vcpu->arch.mcg_cap & MCG_LMCE_P) to_vmx(vcpu)->msr_ia32_feature_control_valid_bits |= @@ -8141,7 +8225,7 @@ static void vmx_setup_mce(struct kvm_vcpu *vcpu) } #ifdef CONFIG_KVM_SMM -static int vmx_smi_allowed(struct kvm_vcpu *vcpu, bool for_injection) +int vmx_smi_allowed(struct kvm_vcpu *vcpu, bool for_injection) { /* we need a nested vmexit to enter SMM, postpone if run is pending */ if (to_vmx(vcpu)->nested.nested_run_pending) @@ -8149,7 +8233,7 @@ static int vmx_smi_allowed(struct kvm_vcpu *vcpu, bool for_injection) return !is_smm(vcpu); } -static int vmx_enter_smm(struct kvm_vcpu *vcpu, union kvm_smram *smram) +int vmx_enter_smm(struct kvm_vcpu *vcpu, union kvm_smram *smram) { struct vcpu_vmx *vmx = to_vmx(vcpu); @@ -8170,7 +8254,7 @@ static int vmx_enter_smm(struct kvm_vcpu *vcpu, union kvm_smram *smram) return 0; } -static int vmx_leave_smm(struct kvm_vcpu *vcpu, const union kvm_smram *smram) +int vmx_leave_smm(struct kvm_vcpu *vcpu, const union kvm_smram *smram) { struct vcpu_vmx *vmx = to_vmx(vcpu); int ret; @@ -8191,18 +8275,18 @@ static int vmx_leave_smm(struct kvm_vcpu *vcpu, const union kvm_smram *smram) return 0; } -static void vmx_enable_smi_window(struct kvm_vcpu *vcpu) +void vmx_enable_smi_window(struct kvm_vcpu *vcpu) { /* RSM will cause a vmexit anyway. */ } #endif -static bool vmx_apic_init_signal_blocked(struct kvm_vcpu *vcpu) +bool vmx_apic_init_signal_blocked(struct kvm_vcpu *vcpu) { return to_vmx(vcpu)->nested.vmxon && !is_guest_mode(vcpu); } -static void vmx_migrate_timers(struct kvm_vcpu *vcpu) +void vmx_migrate_timers(struct kvm_vcpu *vcpu) { if (is_guest_mode(vcpu)) { struct hrtimer *timer = &to_vmx(vcpu)->nested.preemption_timer; @@ -8212,7 +8296,7 @@ static void vmx_migrate_timers(struct kvm_vcpu *vcpu) } } -static void vmx_hardware_unsetup(void) +void vmx_hardware_unsetup(void) { kvm_set_posted_intr_wakeup_handler(NULL); @@ -8222,18 +8306,7 @@ static void vmx_hardware_unsetup(void) free_kvm_area(); } -#define VMX_REQUIRED_APICV_INHIBITS \ -( \ - BIT(APICV_INHIBIT_REASON_DISABLE)| \ - BIT(APICV_INHIBIT_REASON_ABSENT) | \ - BIT(APICV_INHIBIT_REASON_HYPERV) | \ - BIT(APICV_INHIBIT_REASON_BLOCKIRQ) | \ - BIT(APICV_INHIBIT_REASON_PHYSICAL_ID_ALIASED) | \ - BIT(APICV_INHIBIT_REASON_APIC_ID_MODIFIED) | \ - BIT(APICV_INHIBIT_REASON_APIC_BASE_MODIFIED) \ -) - -static void vmx_vm_destroy(struct kvm *kvm) +void vmx_vm_destroy(struct kvm *kvm) { struct kvm_vmx *kvm_vmx = to_kvm_vmx(kvm); @@ -8284,148 +8357,6 @@ gva_t vmx_get_untagged_addr(struct kvm_vcpu *vcpu, gva_t gva, unsigned int flags return (sign_extend64(gva, lam_bit) & ~BIT_ULL(63)) | (gva & BIT_ULL(63)); } -static struct kvm_x86_ops vmx_x86_ops __initdata = { - .name = KBUILD_MODNAME, - - .check_processor_compatibility = vmx_check_processor_compat, - - .hardware_unsetup = vmx_hardware_unsetup, - - .hardware_enable = vmx_hardware_enable, - .hardware_disable = vmx_hardware_disable, - .has_emulated_msr = vmx_has_emulated_msr, - - .vm_size = sizeof(struct kvm_vmx), - .vm_init = vmx_vm_init, - .vm_destroy = vmx_vm_destroy, - - .vcpu_precreate = vmx_vcpu_precreate, - .vcpu_create = vmx_vcpu_create, - .vcpu_free = vmx_vcpu_free, - .vcpu_reset = vmx_vcpu_reset, - - .prepare_switch_to_guest = vmx_prepare_switch_to_guest, - .vcpu_load = vmx_vcpu_load, - .vcpu_put = vmx_vcpu_put, - - .update_exception_bitmap = vmx_update_exception_bitmap, - .get_msr_feature = vmx_get_msr_feature, - .get_msr = vmx_get_msr, - .set_msr = vmx_set_msr, - .get_segment_base = vmx_get_segment_base, - .get_segment = vmx_get_segment, - .set_segment = vmx_set_segment, - .get_cpl = vmx_get_cpl, - .get_cs_db_l_bits = vmx_get_cs_db_l_bits, - .is_valid_cr0 = vmx_is_valid_cr0, - .set_cr0 = vmx_set_cr0, - .is_valid_cr4 = vmx_is_valid_cr4, - .set_cr4 = vmx_set_cr4, - .set_efer = vmx_set_efer, - .get_idt = vmx_get_idt, - .set_idt = vmx_set_idt, - .get_gdt = vmx_get_gdt, - .set_gdt = vmx_set_gdt, - .set_dr7 = vmx_set_dr7, - .sync_dirty_debug_regs = vmx_sync_dirty_debug_regs, - .cache_reg = vmx_cache_reg, - .get_rflags = vmx_get_rflags, - .set_rflags = vmx_set_rflags, - .get_if_flag = vmx_get_if_flag, - - .flush_tlb_all = vmx_flush_tlb_all, - .flush_tlb_current = vmx_flush_tlb_current, - .flush_tlb_gva = vmx_flush_tlb_gva, - .flush_tlb_guest = vmx_flush_tlb_guest, - - .vcpu_pre_run = vmx_vcpu_pre_run, - .vcpu_run = vmx_vcpu_run, - .handle_exit = vmx_handle_exit, - .skip_emulated_instruction = vmx_skip_emulated_instruction, - .update_emulated_instruction = vmx_update_emulated_instruction, - .set_interrupt_shadow = vmx_set_interrupt_shadow, - .get_interrupt_shadow = vmx_get_interrupt_shadow, - .patch_hypercall = vmx_patch_hypercall, - .inject_irq = vmx_inject_irq, - .inject_nmi = vmx_inject_nmi, - .inject_exception = vmx_inject_exception, - .cancel_injection = vmx_cancel_injection, - .interrupt_allowed = vmx_interrupt_allowed, - .nmi_allowed = vmx_nmi_allowed, - .get_nmi_mask = vmx_get_nmi_mask, - .set_nmi_mask = vmx_set_nmi_mask, - .enable_nmi_window = vmx_enable_nmi_window, - .enable_irq_window = vmx_enable_irq_window, - .update_cr8_intercept = vmx_update_cr8_intercept, - .set_virtual_apic_mode = vmx_set_virtual_apic_mode, - .set_apic_access_page_addr = vmx_set_apic_access_page_addr, - .refresh_apicv_exec_ctrl = vmx_refresh_apicv_exec_ctrl, - .load_eoi_exitmap = vmx_load_eoi_exitmap, - .apicv_pre_state_restore = vmx_apicv_pre_state_restore, - .required_apicv_inhibits = VMX_REQUIRED_APICV_INHIBITS, - .hwapic_irr_update = vmx_hwapic_irr_update, - .hwapic_isr_update = vmx_hwapic_isr_update, - .guest_apic_has_interrupt = vmx_guest_apic_has_interrupt, - .sync_pir_to_irr = vmx_sync_pir_to_irr, - .deliver_interrupt = vmx_deliver_interrupt, - .dy_apicv_has_pending_interrupt = pi_has_pending_interrupt, - - .set_tss_addr = vmx_set_tss_addr, - .set_identity_map_addr = vmx_set_identity_map_addr, - .get_mt_mask = vmx_get_mt_mask, - - .get_exit_info = vmx_get_exit_info, - - .vcpu_after_set_cpuid = vmx_vcpu_after_set_cpuid, - - .has_wbinvd_exit = cpu_has_vmx_wbinvd_exit, - - .get_l2_tsc_offset = vmx_get_l2_tsc_offset, - .get_l2_tsc_multiplier = vmx_get_l2_tsc_multiplier, - .write_tsc_offset = vmx_write_tsc_offset, - .write_tsc_multiplier = vmx_write_tsc_multiplier, - - .load_mmu_pgd = vmx_load_mmu_pgd, - - .check_intercept = vmx_check_intercept, - .handle_exit_irqoff = vmx_handle_exit_irqoff, - - .sched_in = vmx_sched_in, - - .cpu_dirty_log_size = PML_ENTITY_NUM, - .update_cpu_dirty_logging = vmx_update_cpu_dirty_logging, - - .nested_ops = &vmx_nested_ops, - - .pi_update_irte = vmx_pi_update_irte, - .pi_start_assignment = vmx_pi_start_assignment, - -#ifdef CONFIG_X86_64 - .set_hv_timer = vmx_set_hv_timer, - .cancel_hv_timer = vmx_cancel_hv_timer, -#endif - - .setup_mce = vmx_setup_mce, - -#ifdef CONFIG_KVM_SMM - .smi_allowed = vmx_smi_allowed, - .enter_smm = vmx_enter_smm, - .leave_smm = vmx_leave_smm, - .enable_smi_window = vmx_enable_smi_window, -#endif - - .check_emulate_instruction = vmx_check_emulate_instruction, - .apic_init_signal_blocked = vmx_apic_init_signal_blocked, - .migrate_timers = vmx_migrate_timers, - - .msr_filter_changed = vmx_msr_filter_changed, - .complete_emulated_msr = kvm_complete_insn_gp, - - .vcpu_deliver_sipi_vector = kvm_vcpu_deliver_sipi_vector, - - .get_untagged_addr = vmx_get_untagged_addr, -}; - static unsigned int vmx_handle_intel_pt_intr(void) { struct kvm_vcpu *vcpu = kvm_get_running_vcpu(); @@ -8471,18 +8402,16 @@ static void __init vmx_setup_me_spte_mask(void) u64 me_mask = 0; /* - * kvm_get_shadow_phys_bits() returns shadow_phys_bits. Use - * the former to avoid exposing shadow_phys_bits. - * * On pre-MKTME system, boot_cpu_data.x86_phys_bits equals to - * shadow_phys_bits. On MKTME and/or TDX capable systems, + * kvm_host.maxphyaddr. On MKTME and/or TDX capable systems, * boot_cpu_data.x86_phys_bits holds the actual physical address - * w/o the KeyID bits, and shadow_phys_bits equals to MAXPHYADDR - * reported by CPUID. Those bits between are KeyID bits. + * w/o the KeyID bits, and kvm_host.maxphyaddr equals to + * MAXPHYADDR reported by CPUID. Those bits between are KeyID bits. */ - if (boot_cpu_data.x86_phys_bits != kvm_get_shadow_phys_bits()) + if (boot_cpu_data.x86_phys_bits != kvm_host.maxphyaddr) me_mask = rsvd_bits(boot_cpu_data.x86_phys_bits, - kvm_get_shadow_phys_bits() - 1); + kvm_host.maxphyaddr - 1); + /* * Unlike SME, host kernel doesn't support setting up any * MKTME KeyID on Intel platforms. No memory encryption @@ -8491,9 +8420,7 @@ static void __init vmx_setup_me_spte_mask(void) kvm_mmu_set_me_spte_mask(0, me_mask); } -static struct kvm_x86_init_ops vmx_init_ops __initdata; - -static __init int hardware_setup(void) +__init int vmx_hardware_setup(void) { unsigned long host_bndcfgs; struct desc_ptr dt; @@ -8507,15 +8434,11 @@ static __init int hardware_setup(void) if (setup_vmcs_config(&vmcs_config, &vmx_capability) < 0) return -EIO; - if (cpu_has_perf_global_ctrl_bug()) - pr_warn_once("VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL " - "does not work properly. Using workaround\n"); - if (boot_cpu_has(X86_FEATURE_NX)) kvm_enable_efer_bits(EFER_NX); if (boot_cpu_has(X86_FEATURE_MPX)) { - rdmsrl(MSR_IA32_BNDCFGS, host_bndcfgs); + rdmsrq(MSR_IA32_BNDCFGS, host_bndcfgs); WARN_ONCE(host_bndcfgs, "BNDCFGS in host will be lost"); } @@ -8562,16 +8485,16 @@ static __init int hardware_setup(void) * using the APIC_ACCESS_ADDR VMCS field. */ if (!flexpriority_enabled) - vmx_x86_ops.set_apic_access_page_addr = NULL; + vt_x86_ops.set_apic_access_page_addr = NULL; if (!cpu_has_vmx_tpr_shadow()) - vmx_x86_ops.update_cr8_intercept = NULL; + vt_x86_ops.update_cr8_intercept = NULL; #if IS_ENABLED(CONFIG_HYPERV) if (ms_hyperv.nested_features & HV_X64_NESTED_GUEST_MAPPING_FLUSH && enable_ept) { - vmx_x86_ops.flush_remote_tlbs = hv_flush_remote_tlbs; - vmx_x86_ops.flush_remote_tlbs_range = hv_flush_remote_tlbs_range; + vt_x86_ops.flush_remote_tlbs = hv_flush_remote_tlbs; + vt_x86_ops.flush_remote_tlbs_range = hv_flush_remote_tlbs_range; } #endif @@ -8586,7 +8509,7 @@ static __init int hardware_setup(void) if (!cpu_has_vmx_apicv()) enable_apicv = 0; if (!enable_apicv) - vmx_x86_ops.sync_pir_to_irr = NULL; + vt_x86_ops.sync_pir_to_irr = NULL; if (!enable_apicv || !cpu_has_vmx_ipiv()) enable_ipiv = false; @@ -8604,6 +8527,8 @@ static __init int hardware_setup(void) if (enable_ept) kvm_mmu_set_ept_masks(enable_ept_ad_bits, cpu_has_vmx_ept_execute_only()); + else + vt_x86_ops.get_mt_mask = NULL; /* * Setup shadow_me_value/shadow_me_mask to include MKTME KeyID @@ -8621,9 +8546,6 @@ static __init int hardware_setup(void) if (!enable_ept || !enable_ept_ad_bits || !cpu_has_vmx_pml()) enable_pml = 0; - if (!enable_pml) - vmx_x86_ops.cpu_dirty_log_size = 0; - if (!cpu_has_vmx_preemption_timer()) enable_preemption_timer = false; @@ -8631,7 +8553,7 @@ static __init int hardware_setup(void) u64 use_timer_freq = 5000ULL * 1000 * 1000; cpu_preemption_timer_multi = - vmcs_config.misc & VMX_MISC_PREEMPTION_TIMER_RATE_MASK; + vmx_misc_preemption_timer_rate(vmcs_config.misc); if (tsc_khz) use_timer_freq = (u64)tsc_khz * 1000; @@ -8647,8 +8569,8 @@ static __init int hardware_setup(void) } if (!enable_preemption_timer) { - vmx_x86_ops.set_hv_timer = NULL; - vmx_x86_ops.cancel_hv_timer = NULL; + vt_x86_ops.set_hv_timer = NULL; + vt_x86_ops.cancel_hv_timer = NULL; } kvm_caps.supported_mce_cap |= MCG_LMCE_P; @@ -8659,9 +8581,9 @@ static __init int hardware_setup(void) if (!enable_ept || !enable_pmu || !cpu_has_vmx_intel_pt()) pt_mode = PT_MODE_SYSTEM; if (pt_mode == PT_MODE_HOST_GUEST) - vmx_init_ops.handle_intel_pt_intr = vmx_handle_intel_pt_intr; + vt_init_ops.handle_intel_pt_intr = vmx_handle_intel_pt_intr; else - vmx_init_ops.handle_intel_pt_intr = NULL; + vt_init_ops.handle_intel_pt_intr = NULL; setup_default_sgx_lepubkeyhash(); @@ -8681,17 +8603,30 @@ static __init int hardware_setup(void) kvm_set_posted_intr_wakeup_handler(pi_wakeup_handler); + /* + * On Intel CPUs that lack self-snoop feature, letting the guest control + * memory types may result in unexpected behavior. So always ignore guest + * PAT on those CPUs and map VM as writeback, not allowing userspace to + * disable the quirk. + * + * On certain Intel CPUs (e.g. SPR, ICX), though self-snoop feature is + * supported, UC is slow enough to cause issues with some older guests (e.g. + * an old version of bochs driver uses ioremap() instead of ioremap_wc() to + * map the video RAM, causing wayland desktop to fail to get started + * correctly). To avoid breaking those older guests that rely on KVM to force + * memory type to WB, provide KVM_X86_QUIRK_IGNORE_GUEST_PAT to preserve the + * safer (for performance) default behavior. + * + * On top of this, non-coherent DMA devices need the guest to flush CPU + * caches properly. This also requires honoring guest PAT, and is forced + * independent of the quirk in vmx_ignore_guest_pat(). + */ + if (!static_cpu_has(X86_FEATURE_SELFSNOOP)) + kvm_caps.supported_quirks &= ~KVM_X86_QUIRK_IGNORE_GUEST_PAT; + kvm_caps.inapplicable_quirks &= ~KVM_X86_QUIRK_IGNORE_GUEST_PAT; return r; } -static struct kvm_x86_init_ops vmx_init_ops __initdata = { - .hardware_setup = hardware_setup, - .handle_intel_pt_intr = NULL, - - .runtime_ops = &vmx_x86_ops, - .pmu_ops = &intel_pmu_ops, -}; - static void vmx_cleanup_l1d_flush(void) { if (vmx_l1d_flush_pages) { @@ -8702,25 +8637,16 @@ static void vmx_cleanup_l1d_flush(void) l1tf_vmx_mitigation = VMENTER_L1D_FLUSH_AUTO; } -static void __vmx_exit(void) +void vmx_exit(void) { allow_smaller_maxphyaddr = false; - cpu_emergency_unregister_virt_callback(vmx_emergency_disable); - vmx_cleanup_l1d_flush(); -} -static void vmx_exit(void) -{ - kvm_exit(); kvm_x86_vendor_exit(); - - __vmx_exit(); } -module_exit(vmx_exit); -static int __init vmx_init(void) +int __init vmx_init(void) { int r, cpu; @@ -8733,7 +8659,7 @@ static int __init vmx_init(void) */ hv_init_evmcs(); - r = kvm_x86_vendor_init(&vmx_init_ops); + r = kvm_x86_vendor_init(&vt_init_ops); if (r) return r; @@ -8754,8 +8680,6 @@ static int __init vmx_init(void) pi_init_cpu(cpu); } - cpu_emergency_register_virt_callback(vmx_emergency_disable); - vmx_check_vmcs12_offsets(); /* @@ -8766,21 +8690,9 @@ static int __init vmx_init(void) if (!enable_ept) allow_smaller_maxphyaddr = true; - /* - * Common KVM initialization _must_ come last, after this, /dev/kvm is - * exposed to userspace! - */ - r = kvm_init(sizeof(struct vcpu_vmx), __alignof__(struct vcpu_vmx), - THIS_MODULE); - if (r) - goto err_kvm_init; - return 0; -err_kvm_init: - __vmx_exit(); err_l1d_flush: kvm_x86_vendor_exit(); return r; } -module_init(vmx_init); diff --git a/arch/x86/kvm/vmx/vmx.h b/arch/x86/kvm/vmx/vmx.h index 65786dbe7d60..b5758c33c60f 100644 --- a/arch/x86/kvm/vmx/vmx.h +++ b/arch/x86/kvm/vmx/vmx.h @@ -7,18 +7,17 @@ #include <asm/kvm.h> #include <asm/intel_pt.h> #include <asm/perf_event.h> +#include <asm/posted_intr.h> #include "capabilities.h" #include "../kvm_cache_regs.h" -#include "posted_intr.h" +#include "pmu_intel.h" #include "vmcs.h" #include "vmx_ops.h" #include "../cpuid.h" #include "run_flags.h" - -#define MSR_TYPE_R 1 -#define MSR_TYPE_W 2 -#define MSR_TYPE_RW 3 +#include "../mmu.h" +#include "common.h" #define X2APIC_MSR(r) (APIC_BASE_MSR + ((r) >> 4)) @@ -70,45 +69,6 @@ struct pt_desc { struct pt_ctx guest; }; -union vmx_exit_reason { - struct { - u32 basic : 16; - u32 reserved16 : 1; - u32 reserved17 : 1; - u32 reserved18 : 1; - u32 reserved19 : 1; - u32 reserved20 : 1; - u32 reserved21 : 1; - u32 reserved22 : 1; - u32 reserved23 : 1; - u32 reserved24 : 1; - u32 reserved25 : 1; - u32 bus_lock_detected : 1; - u32 enclave_mode : 1; - u32 smi_pending_mtf : 1; - u32 smi_from_vmx_root : 1; - u32 reserved30 : 1; - u32 failed_vmentry : 1; - }; - u32 full; -}; - -struct lbr_desc { - /* Basic info about guest LBR records. */ - struct x86_pmu_lbr records; - - /* - * Emulate LBR feature via passthrough LBR registers when the - * per-vcpu guest LBR event is scheduled on the current pcpu. - * - * The records may be inaccurate if the host reclaims the LBR. - */ - struct perf_event *event; - - /* True if LBRs are marked as not intercepted in the MSR bitmap */ - bool msr_passthrough; -}; - /* * The nested_vmx structure is part of vcpu_vmx, and holds information we need * for correct emulation of VMX (i.e., nested VMX) on this vcpu. @@ -177,6 +137,7 @@ struct nested_vmx { bool reload_vmcs01_apic_access_page; bool update_vmcs01_cpu_dirty_logging; bool update_vmcs01_apicv_status; + bool update_vmcs01_hwapic_isr; /* * Enlightened VMCS has been enabled. It does not mean that L1 has to @@ -201,8 +162,6 @@ struct nested_vmx { struct kvm_host_map virtual_apic_map; struct kvm_host_map pi_desc_map; - struct kvm_host_map msr_bitmap_map; - struct pi_desc *pi_desc; bool pi_pending; u16 posted_intr_nv; @@ -250,20 +209,10 @@ struct nested_vmx { struct vcpu_vmx { struct kvm_vcpu vcpu; + struct vcpu_vt vt; u8 fail; u8 x2apic_msr_bitmap_mode; - /* - * If true, host state has been stored in vmx->loaded_vmcs for - * the CPU registers that only need to be switched when transitioning - * to/from the kernel, and the registers have been loaded with guest - * values. If false, host state is loaded in the CPU registers - * and vmx->loaded_vmcs->host_state is invalid. - */ - bool guest_state_loaded; - - unsigned long exit_qualification; - u32 exit_intr_info; u32 idt_vectoring_info; ulong rflags; @@ -276,7 +225,6 @@ struct vcpu_vmx { struct vmx_uret_msr guest_uret_msrs[MAX_NR_USER_RETURN_MSRS]; bool guest_uret_msrs_loaded; #ifdef CONFIG_X86_64 - u64 msr_host_kernel_gs_base; u64 msr_guest_kernel_gs_base; #endif @@ -315,15 +263,6 @@ struct vcpu_vmx { } seg[8]; } segment_cache; int vpid; - bool emulation_required; - - union vmx_exit_reason exit_reason; - - /* Posted interrupt descriptor */ - struct pi_desc pi_desc; - - /* Used if this vCPU is waiting for PI notification wakeup. */ - struct list_head pi_wakeup_list; /* Support for a guest hypervisor (nested VMX) */ struct nested_vmx nested; @@ -333,14 +272,15 @@ struct vcpu_vmx { bool ple_window_dirty; /* Support for PML */ -#define PML_ENTITY_NUM 512 +#define PML_LOG_NR_ENTRIES 512 + /* PML is written backwards: this is the first entry written by the CPU */ +#define PML_HEAD_INDEX (PML_LOG_NR_ENTRIES-1) + struct page *pml_pg; /* apic deadline value in host tsc */ u64 hv_deadline_tsc; - unsigned long host_debugctlmsr; - /* * Only bits masked by msr_ia32_feature_control_valid_bits can be set in * msr_ia32_feature_control. FEAT_CTL_LOCKED is always included @@ -362,6 +302,9 @@ struct vcpu_vmx { DECLARE_BITMAP(read, MAX_POSSIBLE_PASSTHROUGH_MSRS); DECLARE_BITMAP(write, MAX_POSSIBLE_PASSTHROUGH_MSRS); } shadow_msr_intercept; + + /* ve_info must be page aligned. */ + struct vmx_ve_information *ve_info; }; struct kvm_vmx { @@ -374,8 +317,44 @@ struct kvm_vmx { u64 *pid_table; }; -void vmx_vcpu_load_vmcs(struct kvm_vcpu *vcpu, int cpu, - struct loaded_vmcs *buddy); +static __always_inline struct vcpu_vt *to_vt(struct kvm_vcpu *vcpu) +{ + return &(container_of(vcpu, struct vcpu_vmx, vcpu)->vt); +} + +static __always_inline struct kvm_vcpu *vt_to_vcpu(struct vcpu_vt *vt) +{ + return &(container_of(vt, struct vcpu_vmx, vt)->vcpu); +} + +static __always_inline union vmx_exit_reason vmx_get_exit_reason(struct kvm_vcpu *vcpu) +{ + return to_vt(vcpu)->exit_reason; +} + +static __always_inline unsigned long vmx_get_exit_qual(struct kvm_vcpu *vcpu) +{ + struct vcpu_vt *vt = to_vt(vcpu); + + if (!kvm_register_test_and_mark_available(vcpu, VCPU_EXREG_EXIT_INFO_1) && + !WARN_ON_ONCE(is_td_vcpu(vcpu))) + vt->exit_qualification = vmcs_readl(EXIT_QUALIFICATION); + + return vt->exit_qualification; +} + +static __always_inline u32 vmx_get_intr_info(struct kvm_vcpu *vcpu) +{ + struct vcpu_vt *vt = to_vt(vcpu); + + if (!kvm_register_test_and_mark_available(vcpu, VCPU_EXREG_EXIT_INFO_2) && + !WARN_ON_ONCE(is_td_vcpu(vcpu))) + vt->exit_intr_info = vmcs_read32(VM_EXIT_INTR_INFO); + + return vt->exit_intr_info; +} + +void vmx_vcpu_load_vmcs(struct kvm_vcpu *vcpu, int cpu); int allocate_vpid(void); void free_vpid(int vpid); void vmx_set_constant_host_state(struct vcpu_vmx *vmx); @@ -383,6 +362,7 @@ void vmx_prepare_switch_to_guest(struct kvm_vcpu *vcpu); void vmx_set_host_fs_gs(struct vmcs_host_state *host, u16 fs_sel, u16 gs_sel, unsigned long fs_base, unsigned long gs_base); int vmx_get_cpl(struct kvm_vcpu *vcpu); +int vmx_get_cpl_no_cache(struct kvm_vcpu *vcpu); bool vmx_emulation_required(struct kvm_vcpu *vcpu); unsigned long vmx_get_rflags(struct kvm_vcpu *vcpu); void vmx_set_rflags(struct kvm_vcpu *vcpu, unsigned long rflags); @@ -400,6 +380,7 @@ u64 construct_eptp(struct kvm_vcpu *vcpu, hpa_t root_hpa, int root_level); bool vmx_guest_inject_ac(struct kvm_vcpu *vcpu); void vmx_update_exception_bitmap(struct kvm_vcpu *vcpu); bool vmx_nmi_blocked(struct kvm_vcpu *vcpu); +bool __vmx_interrupt_blocked(struct kvm_vcpu *vcpu); bool vmx_interrupt_blocked(struct kvm_vcpu *vcpu); bool vmx_get_nmi_mask(struct kvm_vcpu *vcpu); void vmx_set_nmi_mask(struct kvm_vcpu *vcpu, bool masked); @@ -574,7 +555,8 @@ static inline u8 vmx_get_rvi(void) SECONDARY_EXEC_ENABLE_VMFUNC | \ SECONDARY_EXEC_BUS_LOCK_DETECTION | \ SECONDARY_EXEC_NOTIFY_VM_EXITING | \ - SECONDARY_EXEC_ENCLS_EXITING) + SECONDARY_EXEC_ENCLS_EXITING | \ + SECONDARY_EXEC_EPT_VIOLATION_VE) #define KVM_REQUIRED_VMX_TERTIARY_VM_EXEC_CONTROL 0 #define KVM_OPTIONAL_VMX_TERTIARY_VM_EXEC_CONTROL \ @@ -657,45 +639,10 @@ static __always_inline struct vcpu_vmx *to_vmx(struct kvm_vcpu *vcpu) return container_of(vcpu, struct vcpu_vmx, vcpu); } -static inline struct lbr_desc *vcpu_to_lbr_desc(struct kvm_vcpu *vcpu) -{ - return &to_vmx(vcpu)->lbr_desc; -} - -static inline struct x86_pmu_lbr *vcpu_to_lbr_records(struct kvm_vcpu *vcpu) -{ - return &vcpu_to_lbr_desc(vcpu)->records; -} - -static inline bool intel_pmu_lbr_is_enabled(struct kvm_vcpu *vcpu) -{ - return !!vcpu_to_lbr_records(vcpu)->nr; -} - void intel_pmu_cross_mapped_check(struct kvm_pmu *pmu); int intel_pmu_create_guest_lbr_event(struct kvm_vcpu *vcpu); void vmx_passthrough_lbr_msrs(struct kvm_vcpu *vcpu); -static __always_inline unsigned long vmx_get_exit_qual(struct kvm_vcpu *vcpu) -{ - struct vcpu_vmx *vmx = to_vmx(vcpu); - - if (!kvm_register_test_and_mark_available(vcpu, VCPU_EXREG_EXIT_INFO_1)) - vmx->exit_qualification = vmcs_readl(EXIT_QUALIFICATION); - - return vmx->exit_qualification; -} - -static __always_inline u32 vmx_get_intr_info(struct kvm_vcpu *vcpu) -{ - struct vcpu_vmx *vmx = to_vmx(vcpu); - - if (!kvm_register_test_and_mark_available(vcpu, VCPU_EXREG_EXIT_INFO_2)) - vmx->exit_intr_info = vmcs_read32(VM_EXIT_INTR_INFO); - - return vmx->exit_intr_info; -} - struct vmcs *alloc_vmcs_cpu(bool shadow, int cpu, gfp_t flags); void free_vmcs(struct vmcs *vmcs); int alloc_loaded_vmcs(struct loaded_vmcs *loaded_vmcs); @@ -719,7 +666,8 @@ static inline bool vmx_need_pf_intercept(struct kvm_vcpu *vcpu) if (!enable_ept) return true; - return allow_smaller_maxphyaddr && cpuid_maxphyaddr(vcpu) < boot_cpu_data.x86_phys_bits; + return allow_smaller_maxphyaddr && + cpuid_maxphyaddr(vcpu) < kvm_host.maxphyaddr; } static inline bool is_unrestricted_guest(struct kvm_vcpu *vcpu) @@ -747,4 +695,12 @@ static inline bool vmx_can_use_ipiv(struct kvm_vcpu *vcpu) return lapic_in_kernel(vcpu) && enable_ipiv; } +static inline void vmx_segment_cache_clear(struct vcpu_vmx *vmx) +{ + vmx->segment_cache.bitmask = 0; +} + +int vmx_init(void); +void vmx_exit(void); + #endif /* __KVM_X86_VMX_H */ diff --git a/arch/x86/kvm/vmx/vmx_onhyperv.h b/arch/x86/kvm/vmx/vmx_onhyperv.h index eb48153bfd73..cdf8cbb69209 100644 --- a/arch/x86/kvm/vmx/vmx_onhyperv.h +++ b/arch/x86/kvm/vmx/vmx_onhyperv.h @@ -3,7 +3,7 @@ #ifndef __ARCH_X86_KVM_VMX_ONHYPERV_H__ #define __ARCH_X86_KVM_VMX_ONHYPERV_H__ -#include <asm/hyperv-tlfs.h> +#include <hyperv/hvhdk.h> #include <asm/mshyperv.h> #include <linux/jump_label.h> @@ -104,6 +104,14 @@ static inline void evmcs_load(u64 phys_addr) struct hv_vp_assist_page *vp_ap = hv_get_vp_assist_page(smp_processor_id()); + /* + * When enabling eVMCS, KVM verifies that every CPU has a valid hv_vp_assist_page() + * and aborts enabling the feature otherwise. CPU onlining path is also checked in + * vmx_hardware_enable(). + */ + if (KVM_BUG_ON(!vp_ap, kvm_get_running_vcpu()->kvm)) + return; + if (current_evmcs->hv_enlightenments_control.nested_flush_hypercall) vp_ap->nested_control.features.directhypercall = 1; vp_ap->current_nested_vmcs = phys_addr; diff --git a/arch/x86/kvm/vmx/vmx_ops.h b/arch/x86/kvm/vmx/vmx_ops.h index 8060e5fc6dbd..96677576c836 100644 --- a/arch/x86/kvm/vmx/vmx_ops.h +++ b/arch/x86/kvm/vmx/vmx_ops.h @@ -15,7 +15,7 @@ void vmwrite_error(unsigned long field, unsigned long value); void vmclear_error(struct vmcs *vmcs, u64 phys_addr); void vmptrld_error(struct vmcs *vmcs, u64 phys_addr); void invvpid_error(unsigned long ext, u16 vpid, gva_t gva); -void invept_error(unsigned long ext, u64 eptp, gpa_t gpa); +void invept_error(unsigned long ext, u64 eptp); #ifndef CONFIG_CC_HAS_ASM_GOTO_OUTPUT /* @@ -47,7 +47,7 @@ static __always_inline void vmcs_check16(unsigned long field) BUILD_BUG_ON_MSG(__builtin_constant_p(field) && ((field) & 0x6001) == 0x2001, "16-bit accessor invalid for 64-bit high field"); BUILD_BUG_ON_MSG(__builtin_constant_p(field) && ((field) & 0x6000) == 0x4000, - "16-bit accessor invalid for 32-bit high field"); + "16-bit accessor invalid for 32-bit field"); BUILD_BUG_ON_MSG(__builtin_constant_p(field) && ((field) & 0x6000) == 0x6000, "16-bit accessor invalid for natural width field"); } @@ -118,7 +118,7 @@ do_exception: #else /* !CONFIG_CC_HAS_ASM_GOTO_OUTPUT */ - asm volatile("1: vmread %2, %1\n\t" + asm volatile("1: vmread %[field], %[output]\n\t" ".byte 0x3e\n\t" /* branch taken hint */ "ja 3f\n\t" @@ -127,24 +127,26 @@ do_exception: * @field, and bounce through the trampoline to preserve * volatile registers. */ - "xorl %k1, %k1\n\t" + "xorl %k[output], %k[output]\n\t" "2:\n\t" - "push %1\n\t" - "push %2\n\t" + "push %[output]\n\t" + "push %[field]\n\t" "call vmread_error_trampoline\n\t" /* * Unwind the stack. Note, the trampoline zeros out the * memory for @fault so that the result is '0' on error. */ - "pop %2\n\t" - "pop %1\n\t" + "pop %[field]\n\t" + "pop %[output]\n\t" "3:\n\t" /* VMREAD faulted. As above, except push '1' for @fault. */ - _ASM_EXTABLE_TYPE_REG(1b, 2b, EX_TYPE_ONE_REG, %1) + _ASM_EXTABLE_TYPE_REG(1b, 2b, EX_TYPE_ONE_REG, %[output]) - : ASM_CALL_CONSTRAINT, "=&r"(value) : "r"(field) : "cc"); + : ASM_CALL_CONSTRAINT, [output] "=&r" (value) + : [field] "r" (field) + : "cc"); return value; #endif /* CONFIG_CC_HAS_ASM_GOTO_OUTPUT */ @@ -312,13 +314,13 @@ static inline void __invvpid(unsigned long ext, u16 vpid, gva_t gva) vmx_asm2(invvpid, "r"(ext), "m"(operand), ext, vpid, gva); } -static inline void __invept(unsigned long ext, u64 eptp, gpa_t gpa) +static inline void __invept(unsigned long ext, u64 eptp) { struct { - u64 eptp, gpa; - } operand = {eptp, gpa}; - - vmx_asm2(invept, "r"(ext), "m"(operand), ext, eptp, gpa); + u64 eptp; + u64 reserved_0; + } operand = { eptp, 0 }; + vmx_asm2(invept, "r"(ext), "m"(operand), ext, eptp); } static inline void vpid_sync_vcpu_single(int vpid) @@ -355,13 +357,13 @@ static inline void vpid_sync_vcpu_addr(int vpid, gva_t addr) static inline void ept_sync_global(void) { - __invept(VMX_EPT_EXTENT_GLOBAL, 0, 0); + __invept(VMX_EPT_EXTENT_GLOBAL, 0); } static inline void ept_sync_context(u64 eptp) { if (cpu_has_vmx_invept_context()) - __invept(VMX_EPT_EXTENT_CONTEXT, eptp, 0); + __invept(VMX_EPT_EXTENT_CONTEXT, eptp); else ept_sync_global(); } diff --git a/arch/x86/kvm/vmx/x86_ops.h b/arch/x86/kvm/vmx/x86_ops.h new file mode 100644 index 000000000000..b4596f651232 --- /dev/null +++ b/arch/x86/kvm/vmx/x86_ops.h @@ -0,0 +1,169 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef __KVM_X86_VMX_X86_OPS_H +#define __KVM_X86_VMX_X86_OPS_H + +#include <linux/kvm_host.h> + +#include "x86.h" + +__init int vmx_hardware_setup(void); + +extern struct kvm_x86_ops vt_x86_ops __initdata; +extern struct kvm_x86_init_ops vt_init_ops __initdata; + +void vmx_hardware_unsetup(void); +int vmx_check_processor_compat(void); +int vmx_enable_virtualization_cpu(void); +void vmx_disable_virtualization_cpu(void); +void vmx_emergency_disable_virtualization_cpu(void); +int vmx_vm_init(struct kvm *kvm); +void vmx_vm_destroy(struct kvm *kvm); +int vmx_vcpu_precreate(struct kvm *kvm); +int vmx_vcpu_create(struct kvm_vcpu *vcpu); +int vmx_vcpu_pre_run(struct kvm_vcpu *vcpu); +fastpath_t vmx_vcpu_run(struct kvm_vcpu *vcpu, bool force_immediate_exit); +void vmx_vcpu_free(struct kvm_vcpu *vcpu); +void vmx_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event); +void vmx_vcpu_load(struct kvm_vcpu *vcpu, int cpu); +void vmx_vcpu_put(struct kvm_vcpu *vcpu); +int vmx_handle_exit(struct kvm_vcpu *vcpu, fastpath_t exit_fastpath); +void vmx_handle_exit_irqoff(struct kvm_vcpu *vcpu); +int vmx_skip_emulated_instruction(struct kvm_vcpu *vcpu); +void vmx_update_emulated_instruction(struct kvm_vcpu *vcpu); +int vmx_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info); +#ifdef CONFIG_KVM_SMM +int vmx_smi_allowed(struct kvm_vcpu *vcpu, bool for_injection); +int vmx_enter_smm(struct kvm_vcpu *vcpu, union kvm_smram *smram); +int vmx_leave_smm(struct kvm_vcpu *vcpu, const union kvm_smram *smram); +void vmx_enable_smi_window(struct kvm_vcpu *vcpu); +#endif +int vmx_check_emulate_instruction(struct kvm_vcpu *vcpu, int emul_type, + void *insn, int insn_len); +int vmx_check_intercept(struct kvm_vcpu *vcpu, + struct x86_instruction_info *info, + enum x86_intercept_stage stage, + struct x86_exception *exception); +bool vmx_apic_init_signal_blocked(struct kvm_vcpu *vcpu); +void vmx_migrate_timers(struct kvm_vcpu *vcpu); +void vmx_set_virtual_apic_mode(struct kvm_vcpu *vcpu); +void vmx_hwapic_isr_update(struct kvm_vcpu *vcpu, int max_isr); +int vmx_sync_pir_to_irr(struct kvm_vcpu *vcpu); +void vmx_deliver_interrupt(struct kvm_lapic *apic, int delivery_mode, + int trig_mode, int vector); +void vmx_vcpu_after_set_cpuid(struct kvm_vcpu *vcpu); +bool vmx_has_emulated_msr(struct kvm *kvm, u32 index); +void vmx_msr_filter_changed(struct kvm_vcpu *vcpu); +void vmx_prepare_switch_to_guest(struct kvm_vcpu *vcpu); +void vmx_update_exception_bitmap(struct kvm_vcpu *vcpu); +int vmx_get_feature_msr(u32 msr, u64 *data); +int vmx_get_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info); +#define vmx_complete_emulated_msr kvm_complete_insn_gp +u64 vmx_get_segment_base(struct kvm_vcpu *vcpu, int seg); +void vmx_get_segment(struct kvm_vcpu *vcpu, struct kvm_segment *var, int seg); +void vmx_set_segment(struct kvm_vcpu *vcpu, struct kvm_segment *var, int seg); +int vmx_get_cpl(struct kvm_vcpu *vcpu); +void vmx_get_cs_db_l_bits(struct kvm_vcpu *vcpu, int *db, int *l); +bool vmx_is_valid_cr0(struct kvm_vcpu *vcpu, unsigned long cr0); +void vmx_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0); +void vmx_load_mmu_pgd(struct kvm_vcpu *vcpu, hpa_t root_hpa, int root_level); +void vmx_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4); +bool vmx_is_valid_cr4(struct kvm_vcpu *vcpu, unsigned long cr4); +int vmx_set_efer(struct kvm_vcpu *vcpu, u64 efer); +void vmx_get_idt(struct kvm_vcpu *vcpu, struct desc_ptr *dt); +void vmx_set_idt(struct kvm_vcpu *vcpu, struct desc_ptr *dt); +void vmx_get_gdt(struct kvm_vcpu *vcpu, struct desc_ptr *dt); +void vmx_set_gdt(struct kvm_vcpu *vcpu, struct desc_ptr *dt); +void vmx_set_dr6(struct kvm_vcpu *vcpu, unsigned long val); +void vmx_set_dr7(struct kvm_vcpu *vcpu, unsigned long val); +void vmx_sync_dirty_debug_regs(struct kvm_vcpu *vcpu); +void vmx_cache_reg(struct kvm_vcpu *vcpu, enum kvm_reg reg); +unsigned long vmx_get_rflags(struct kvm_vcpu *vcpu); +void vmx_set_rflags(struct kvm_vcpu *vcpu, unsigned long rflags); +bool vmx_get_if_flag(struct kvm_vcpu *vcpu); +void vmx_flush_tlb_all(struct kvm_vcpu *vcpu); +void vmx_flush_tlb_current(struct kvm_vcpu *vcpu); +void vmx_flush_tlb_gva(struct kvm_vcpu *vcpu, gva_t addr); +void vmx_flush_tlb_guest(struct kvm_vcpu *vcpu); +void vmx_set_interrupt_shadow(struct kvm_vcpu *vcpu, int mask); +u32 vmx_get_interrupt_shadow(struct kvm_vcpu *vcpu); +void vmx_patch_hypercall(struct kvm_vcpu *vcpu, unsigned char *hypercall); +void vmx_inject_irq(struct kvm_vcpu *vcpu, bool reinjected); +void vmx_inject_nmi(struct kvm_vcpu *vcpu); +void vmx_inject_exception(struct kvm_vcpu *vcpu); +void vmx_cancel_injection(struct kvm_vcpu *vcpu); +int vmx_interrupt_allowed(struct kvm_vcpu *vcpu, bool for_injection); +int vmx_nmi_allowed(struct kvm_vcpu *vcpu, bool for_injection); +bool vmx_get_nmi_mask(struct kvm_vcpu *vcpu); +void vmx_set_nmi_mask(struct kvm_vcpu *vcpu, bool masked); +void vmx_enable_nmi_window(struct kvm_vcpu *vcpu); +void vmx_enable_irq_window(struct kvm_vcpu *vcpu); +void vmx_update_cr8_intercept(struct kvm_vcpu *vcpu, int tpr, int irr); +void vmx_set_apic_access_page_addr(struct kvm_vcpu *vcpu); +void vmx_refresh_apicv_exec_ctrl(struct kvm_vcpu *vcpu); +void vmx_load_eoi_exitmap(struct kvm_vcpu *vcpu, u64 *eoi_exit_bitmap); +int vmx_set_tss_addr(struct kvm *kvm, unsigned int addr); +int vmx_set_identity_map_addr(struct kvm *kvm, u64 ident_addr); +u8 vmx_get_mt_mask(struct kvm_vcpu *vcpu, gfn_t gfn, bool is_mmio); + +void vmx_get_exit_info(struct kvm_vcpu *vcpu, u32 *reason, + u64 *info1, u64 *info2, u32 *intr_info, u32 *error_code); +void vmx_get_entry_info(struct kvm_vcpu *vcpu, u32 *intr_info, u32 *error_code); + +u64 vmx_get_l2_tsc_offset(struct kvm_vcpu *vcpu); +u64 vmx_get_l2_tsc_multiplier(struct kvm_vcpu *vcpu); +void vmx_write_tsc_offset(struct kvm_vcpu *vcpu); +void vmx_write_tsc_multiplier(struct kvm_vcpu *vcpu); +void vmx_update_cpu_dirty_logging(struct kvm_vcpu *vcpu); +#ifdef CONFIG_X86_64 +int vmx_set_hv_timer(struct kvm_vcpu *vcpu, u64 guest_deadline_tsc, + bool *expired); +void vmx_cancel_hv_timer(struct kvm_vcpu *vcpu); +#endif +void vmx_setup_mce(struct kvm_vcpu *vcpu); + +#ifdef CONFIG_KVM_INTEL_TDX +void tdx_disable_virtualization_cpu(void); +int tdx_vm_init(struct kvm *kvm); +void tdx_mmu_release_hkid(struct kvm *kvm); +void tdx_vm_destroy(struct kvm *kvm); +int tdx_vm_ioctl(struct kvm *kvm, void __user *argp); + +int tdx_vcpu_create(struct kvm_vcpu *vcpu); +void tdx_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event); +void tdx_vcpu_free(struct kvm_vcpu *vcpu); +void tdx_vcpu_load(struct kvm_vcpu *vcpu, int cpu); +int tdx_vcpu_pre_run(struct kvm_vcpu *vcpu); +fastpath_t tdx_vcpu_run(struct kvm_vcpu *vcpu, bool force_immediate_exit); +void tdx_prepare_switch_to_guest(struct kvm_vcpu *vcpu); +void tdx_vcpu_put(struct kvm_vcpu *vcpu); +bool tdx_protected_apic_has_interrupt(struct kvm_vcpu *vcpu); +int tdx_handle_exit(struct kvm_vcpu *vcpu, + enum exit_fastpath_completion fastpath); + +void tdx_deliver_interrupt(struct kvm_lapic *apic, int delivery_mode, + int trig_mode, int vector); +void tdx_inject_nmi(struct kvm_vcpu *vcpu); +void tdx_get_exit_info(struct kvm_vcpu *vcpu, u32 *reason, + u64 *info1, u64 *info2, u32 *intr_info, u32 *error_code); +bool tdx_has_emulated_msr(u32 index); +int tdx_get_msr(struct kvm_vcpu *vcpu, struct msr_data *msr); +int tdx_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr); + +int tdx_vcpu_ioctl(struct kvm_vcpu *vcpu, void __user *argp); + +int tdx_sept_link_private_spt(struct kvm *kvm, gfn_t gfn, + enum pg_level level, void *private_spt); +int tdx_sept_free_private_spt(struct kvm *kvm, gfn_t gfn, + enum pg_level level, void *private_spt); +int tdx_sept_set_private_spte(struct kvm *kvm, gfn_t gfn, + enum pg_level level, kvm_pfn_t pfn); +int tdx_sept_remove_private_spte(struct kvm *kvm, gfn_t gfn, + enum pg_level level, kvm_pfn_t pfn); + +void tdx_flush_tlb_current(struct kvm_vcpu *vcpu); +void tdx_flush_tlb_all(struct kvm_vcpu *vcpu); +void tdx_load_mmu_pgd(struct kvm_vcpu *vcpu, hpa_t root_hpa, int root_level); +int tdx_gmem_private_max_mapping_level(struct kvm *kvm, kvm_pfn_t pfn); +#endif + +#endif /* __KVM_X86_VMX_X86_OPS_H */ diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 47d9f03b7778..b58a74c1722d 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -90,13 +90,18 @@ #include "trace.h" #define MAX_IO_MSRS 256 -#define KVM_MAX_MCE_BANKS 32 -struct kvm_caps kvm_caps __read_mostly = { - .supported_mce_cap = MCG_CTL_P | MCG_SER_P, -}; +/* + * Note, kvm_caps fields should *never* have default values, all fields must be + * recomputed from scratch during vendor module load, e.g. to account for a + * vendor module being reloaded with different module parameters. + */ +struct kvm_caps kvm_caps __read_mostly; EXPORT_SYMBOL_GPL(kvm_caps); +struct kvm_host_values kvm_host __read_mostly; +EXPORT_SYMBOL_GPL(kvm_host); + #define ERR_PTR_USR(e) ((void __user *)ERR_PTR(e)) #define emul_to_vcpu(ctxt) \ @@ -113,8 +118,6 @@ u64 __read_mostly efer_reserved_bits = ~((u64)(EFER_SCE | EFER_LME | EFER_LMA)); static u64 __read_mostly efer_reserved_bits = ~((u64)EFER_SCE); #endif -static u64 __read_mostly cr4_reserved_bits = CR4_RESERVED_BITS; - #define KVM_EXIT_HYPERCALL_VALID_MASK (1 << KVM_HC_MAP_GPA_RANGE) #define KVM_CAP_PMU_VALID_MASK KVM_PMU_CAP_DISABLE @@ -161,15 +164,6 @@ module_param(kvmclock_periodic_sync, bool, 0444); static u32 __read_mostly tsc_tolerance_ppm = 250; module_param(tsc_tolerance_ppm, uint, 0644); -/* - * lapic timer advance (tscdeadline mode only) in nanoseconds. '-1' enables - * adaptive tuning starting from default advancement of 1000ns. '0' disables - * advancement entirely. Any other value is used as-is and disables adaptive - * tuning, i.e. allows privileged userspace to set an exact advancement time. - */ -static int __read_mostly lapic_timer_advance_ns = -1; -module_param(lapic_timer_advance_ns, int, 0644); - static bool __read_mostly vector_hashing = true; module_param(vector_hashing, bool, 0444); @@ -226,20 +220,14 @@ static struct kvm_user_return_msrs __percpu *user_return_msrs; | XFEATURE_MASK_BNDCSR | XFEATURE_MASK_AVX512 \ | XFEATURE_MASK_PKRU | XFEATURE_MASK_XTILE) -u64 __read_mostly host_efer; -EXPORT_SYMBOL_GPL(host_efer); - bool __read_mostly allow_smaller_maxphyaddr = 0; EXPORT_SYMBOL_GPL(allow_smaller_maxphyaddr); bool __read_mostly enable_apicv = true; EXPORT_SYMBOL_GPL(enable_apicv); -u64 __read_mostly host_xss; -EXPORT_SYMBOL_GPL(host_xss); - -u64 __read_mostly host_arch_capabilities; -EXPORT_SYMBOL_GPL(host_arch_capabilities); +bool __read_mostly enable_device_posted_irqs = true; +EXPORT_SYMBOL_GPL(enable_device_posted_irqs); const struct _kvm_stats_desc kvm_vm_stats_desc[] = { KVM_GENERIC_VM_STATS(), @@ -314,29 +302,241 @@ const struct kvm_stats_header kvm_vcpu_stats_header = { sizeof(kvm_vcpu_stats_desc), }; -u64 __read_mostly host_xcr0; - static struct kmem_cache *x86_emulator_cache; /* - * When called, it means the previous get/set msr reached an invalid msr. - * Return true if we want to ignore/silent this failed msr access. + * The three MSR lists(msrs_to_save, emulated_msrs, msr_based_features) track + * the set of MSRs that KVM exposes to userspace through KVM_GET_MSRS, + * KVM_SET_MSRS, and KVM_GET_MSR_INDEX_LIST. msrs_to_save holds MSRs that + * require host support, i.e. should be probed via RDMSR. emulated_msrs holds + * MSRs that KVM emulates without strictly requiring host support. + * msr_based_features holds MSRs that enumerate features, i.e. are effectively + * CPUID leafs. Note, msr_based_features isn't mutually exclusive with + * msrs_to_save and emulated_msrs. */ -static bool kvm_msr_ignored_check(u32 msr, u64 data, bool write) + +static const u32 msrs_to_save_base[] = { + MSR_IA32_SYSENTER_CS, MSR_IA32_SYSENTER_ESP, MSR_IA32_SYSENTER_EIP, + MSR_STAR, +#ifdef CONFIG_X86_64 + MSR_CSTAR, MSR_KERNEL_GS_BASE, MSR_SYSCALL_MASK, MSR_LSTAR, +#endif + MSR_IA32_TSC, MSR_IA32_CR_PAT, MSR_VM_HSAVE_PA, + MSR_IA32_FEAT_CTL, MSR_IA32_BNDCFGS, MSR_TSC_AUX, + MSR_IA32_SPEC_CTRL, MSR_IA32_TSX_CTRL, + MSR_IA32_RTIT_CTL, MSR_IA32_RTIT_STATUS, MSR_IA32_RTIT_CR3_MATCH, + MSR_IA32_RTIT_OUTPUT_BASE, MSR_IA32_RTIT_OUTPUT_MASK, + MSR_IA32_RTIT_ADDR0_A, MSR_IA32_RTIT_ADDR0_B, + MSR_IA32_RTIT_ADDR1_A, MSR_IA32_RTIT_ADDR1_B, + MSR_IA32_RTIT_ADDR2_A, MSR_IA32_RTIT_ADDR2_B, + MSR_IA32_RTIT_ADDR3_A, MSR_IA32_RTIT_ADDR3_B, + MSR_IA32_UMWAIT_CONTROL, + + MSR_IA32_XFD, MSR_IA32_XFD_ERR, +}; + +static const u32 msrs_to_save_pmu[] = { + MSR_ARCH_PERFMON_FIXED_CTR0, MSR_ARCH_PERFMON_FIXED_CTR1, + MSR_ARCH_PERFMON_FIXED_CTR0 + 2, + MSR_CORE_PERF_FIXED_CTR_CTRL, MSR_CORE_PERF_GLOBAL_STATUS, + MSR_CORE_PERF_GLOBAL_CTRL, + MSR_IA32_PEBS_ENABLE, MSR_IA32_DS_AREA, MSR_PEBS_DATA_CFG, + + /* This part of MSRs should match KVM_MAX_NR_INTEL_GP_COUNTERS. */ + MSR_ARCH_PERFMON_PERFCTR0, MSR_ARCH_PERFMON_PERFCTR1, + MSR_ARCH_PERFMON_PERFCTR0 + 2, MSR_ARCH_PERFMON_PERFCTR0 + 3, + MSR_ARCH_PERFMON_PERFCTR0 + 4, MSR_ARCH_PERFMON_PERFCTR0 + 5, + MSR_ARCH_PERFMON_PERFCTR0 + 6, MSR_ARCH_PERFMON_PERFCTR0 + 7, + MSR_ARCH_PERFMON_EVENTSEL0, MSR_ARCH_PERFMON_EVENTSEL1, + MSR_ARCH_PERFMON_EVENTSEL0 + 2, MSR_ARCH_PERFMON_EVENTSEL0 + 3, + MSR_ARCH_PERFMON_EVENTSEL0 + 4, MSR_ARCH_PERFMON_EVENTSEL0 + 5, + MSR_ARCH_PERFMON_EVENTSEL0 + 6, MSR_ARCH_PERFMON_EVENTSEL0 + 7, + + MSR_K7_EVNTSEL0, MSR_K7_EVNTSEL1, MSR_K7_EVNTSEL2, MSR_K7_EVNTSEL3, + MSR_K7_PERFCTR0, MSR_K7_PERFCTR1, MSR_K7_PERFCTR2, MSR_K7_PERFCTR3, + + /* This part of MSRs should match KVM_MAX_NR_AMD_GP_COUNTERS. */ + MSR_F15H_PERF_CTL0, MSR_F15H_PERF_CTL1, MSR_F15H_PERF_CTL2, + MSR_F15H_PERF_CTL3, MSR_F15H_PERF_CTL4, MSR_F15H_PERF_CTL5, + MSR_F15H_PERF_CTR0, MSR_F15H_PERF_CTR1, MSR_F15H_PERF_CTR2, + MSR_F15H_PERF_CTR3, MSR_F15H_PERF_CTR4, MSR_F15H_PERF_CTR5, + + MSR_AMD64_PERF_CNTR_GLOBAL_CTL, + MSR_AMD64_PERF_CNTR_GLOBAL_STATUS, + MSR_AMD64_PERF_CNTR_GLOBAL_STATUS_CLR, +}; + +static u32 msrs_to_save[ARRAY_SIZE(msrs_to_save_base) + + ARRAY_SIZE(msrs_to_save_pmu)]; +static unsigned num_msrs_to_save; + +static const u32 emulated_msrs_all[] = { + MSR_KVM_SYSTEM_TIME, MSR_KVM_WALL_CLOCK, + MSR_KVM_SYSTEM_TIME_NEW, MSR_KVM_WALL_CLOCK_NEW, + +#ifdef CONFIG_KVM_HYPERV + HV_X64_MSR_GUEST_OS_ID, HV_X64_MSR_HYPERCALL, + HV_X64_MSR_TIME_REF_COUNT, HV_X64_MSR_REFERENCE_TSC, + HV_X64_MSR_TSC_FREQUENCY, HV_X64_MSR_APIC_FREQUENCY, + HV_X64_MSR_CRASH_P0, HV_X64_MSR_CRASH_P1, HV_X64_MSR_CRASH_P2, + HV_X64_MSR_CRASH_P3, HV_X64_MSR_CRASH_P4, HV_X64_MSR_CRASH_CTL, + HV_X64_MSR_RESET, + HV_X64_MSR_VP_INDEX, + HV_X64_MSR_VP_RUNTIME, + HV_X64_MSR_SCONTROL, + HV_X64_MSR_STIMER0_CONFIG, + HV_X64_MSR_VP_ASSIST_PAGE, + HV_X64_MSR_REENLIGHTENMENT_CONTROL, HV_X64_MSR_TSC_EMULATION_CONTROL, + HV_X64_MSR_TSC_EMULATION_STATUS, HV_X64_MSR_TSC_INVARIANT_CONTROL, + HV_X64_MSR_SYNDBG_OPTIONS, + HV_X64_MSR_SYNDBG_CONTROL, HV_X64_MSR_SYNDBG_STATUS, + HV_X64_MSR_SYNDBG_SEND_BUFFER, HV_X64_MSR_SYNDBG_RECV_BUFFER, + HV_X64_MSR_SYNDBG_PENDING_BUFFER, +#endif + + MSR_KVM_ASYNC_PF_EN, MSR_KVM_STEAL_TIME, + MSR_KVM_PV_EOI_EN, MSR_KVM_ASYNC_PF_INT, MSR_KVM_ASYNC_PF_ACK, + + MSR_IA32_TSC_ADJUST, + MSR_IA32_TSC_DEADLINE, + MSR_IA32_ARCH_CAPABILITIES, + MSR_IA32_PERF_CAPABILITIES, + MSR_IA32_MISC_ENABLE, + MSR_IA32_MCG_STATUS, + MSR_IA32_MCG_CTL, + MSR_IA32_MCG_EXT_CTL, + MSR_IA32_SMBASE, + MSR_SMI_COUNT, + MSR_PLATFORM_INFO, + MSR_MISC_FEATURES_ENABLES, + MSR_AMD64_VIRT_SPEC_CTRL, + MSR_AMD64_TSC_RATIO, + MSR_IA32_POWER_CTL, + MSR_IA32_UCODE_REV, + + /* + * KVM always supports the "true" VMX control MSRs, even if the host + * does not. The VMX MSRs as a whole are considered "emulated" as KVM + * doesn't strictly require them to exist in the host (ignoring that + * KVM would refuse to load in the first place if the core set of MSRs + * aren't supported). + */ + MSR_IA32_VMX_BASIC, + MSR_IA32_VMX_TRUE_PINBASED_CTLS, + MSR_IA32_VMX_TRUE_PROCBASED_CTLS, + MSR_IA32_VMX_TRUE_EXIT_CTLS, + MSR_IA32_VMX_TRUE_ENTRY_CTLS, + MSR_IA32_VMX_MISC, + MSR_IA32_VMX_CR0_FIXED0, + MSR_IA32_VMX_CR4_FIXED0, + MSR_IA32_VMX_VMCS_ENUM, + MSR_IA32_VMX_PROCBASED_CTLS2, + MSR_IA32_VMX_EPT_VPID_CAP, + MSR_IA32_VMX_VMFUNC, + + MSR_K7_HWCR, + MSR_KVM_POLL_CONTROL, +}; + +static u32 emulated_msrs[ARRAY_SIZE(emulated_msrs_all)]; +static unsigned num_emulated_msrs; + +/* + * List of MSRs that control the existence of MSR-based features, i.e. MSRs + * that are effectively CPUID leafs. VMX MSRs are also included in the set of + * feature MSRs, but are handled separately to allow expedited lookups. + */ +static const u32 msr_based_features_all_except_vmx[] = { + MSR_AMD64_DE_CFG, + MSR_IA32_UCODE_REV, + MSR_IA32_ARCH_CAPABILITIES, + MSR_IA32_PERF_CAPABILITIES, + MSR_PLATFORM_INFO, +}; + +static u32 msr_based_features[ARRAY_SIZE(msr_based_features_all_except_vmx) + + (KVM_LAST_EMULATED_VMX_MSR - KVM_FIRST_EMULATED_VMX_MSR + 1)]; +static unsigned int num_msr_based_features; + +/* + * All feature MSRs except uCode revID, which tracks the currently loaded uCode + * patch, are immutable once the vCPU model is defined. + */ +static bool kvm_is_immutable_feature_msr(u32 msr) { - const char *op = write ? "wrmsr" : "rdmsr"; + int i; - if (ignore_msrs) { - if (report_ignored_msrs) - kvm_pr_unimpl("ignored %s: 0x%x data 0x%llx\n", - op, msr, data); - /* Mask the error */ + if (msr >= KVM_FIRST_EMULATED_VMX_MSR && msr <= KVM_LAST_EMULATED_VMX_MSR) return true; - } else { + + for (i = 0; i < ARRAY_SIZE(msr_based_features_all_except_vmx); i++) { + if (msr == msr_based_features_all_except_vmx[i]) + return msr != MSR_IA32_UCODE_REV; + } + + return false; +} + +static bool kvm_is_advertised_msr(u32 msr_index) +{ + unsigned int i; + + for (i = 0; i < num_msrs_to_save; i++) { + if (msrs_to_save[i] == msr_index) + return true; + } + + for (i = 0; i < num_emulated_msrs; i++) { + if (emulated_msrs[i] == msr_index) + return true; + } + + return false; +} + +typedef int (*msr_access_t)(struct kvm_vcpu *vcpu, u32 index, u64 *data, + bool host_initiated); + +static __always_inline int kvm_do_msr_access(struct kvm_vcpu *vcpu, u32 msr, + u64 *data, bool host_initiated, + enum kvm_msr_access rw, + msr_access_t msr_access_fn) +{ + const char *op = rw == MSR_TYPE_W ? "wrmsr" : "rdmsr"; + int ret; + + BUILD_BUG_ON(rw != MSR_TYPE_R && rw != MSR_TYPE_W); + + /* + * Zero the data on read failures to avoid leaking stack data to the + * guest and/or userspace, e.g. if the failure is ignored below. + */ + ret = msr_access_fn(vcpu, msr, data, host_initiated); + if (ret && rw == MSR_TYPE_R) + *data = 0; + + if (ret != KVM_MSR_RET_UNSUPPORTED) + return ret; + + /* + * Userspace is allowed to read MSRs, and write '0' to MSRs, that KVM + * advertises to userspace, even if an MSR isn't fully supported. + * Simply check that @data is '0', which covers both the write '0' case + * and all reads (in which case @data is zeroed on failure; see above). + */ + if (host_initiated && !*data && kvm_is_advertised_msr(msr)) + return 0; + + if (!ignore_msrs) { kvm_debug_ratelimited("unhandled %s: 0x%x data 0x%llx\n", - op, msr, data); - return false; + op, msr, *data); + return ret; } + + if (report_ignored_msrs) + kvm_pr_unimpl("ignored %s: 0x%x data 0x%llx\n", op, msr, *data); + + return 0; } static struct kmem_cache *kvm_alloc_emulator_cache(void) @@ -369,7 +569,7 @@ static void kvm_on_user_return(struct user_return_notifier *urn) /* * Disabling irqs at this point since the following code could be - * interrupted and executed through kvm_arch_hardware_disable() + * interrupted and executed through kvm_arch_disable_virtualization_cpu() */ local_irq_save(flags); if (msrs->registered) { @@ -380,7 +580,7 @@ static void kvm_on_user_return(struct user_return_notifier *urn) for (slot = 0; slot < kvm_nr_uret_msrs; ++slot) { values = &msrs->values[slot]; if (values->host != values->curr) { - wrmsrl(kvm_uret_msrs_list[slot], values->host); + wrmsrq(kvm_uret_msrs_list[slot], values->host); values->curr = values->host; } } @@ -392,10 +592,10 @@ static int kvm_probe_user_return_msr(u32 msr) int ret; preempt_disable(); - ret = rdmsrl_safe(msr, &val); + ret = rdmsrq_safe(msr, &val); if (ret) goto out; - ret = wrmsrl_safe(msr, val); + ret = wrmsrq_safe(msr, val); out: preempt_enable(); return ret; @@ -427,80 +627,59 @@ EXPORT_SYMBOL_GPL(kvm_find_user_return_msr); static void kvm_user_return_msr_cpu_online(void) { - unsigned int cpu = smp_processor_id(); - struct kvm_user_return_msrs *msrs = per_cpu_ptr(user_return_msrs, cpu); + struct kvm_user_return_msrs *msrs = this_cpu_ptr(user_return_msrs); u64 value; int i; for (i = 0; i < kvm_nr_uret_msrs; ++i) { - rdmsrl_safe(kvm_uret_msrs_list[i], &value); + rdmsrq_safe(kvm_uret_msrs_list[i], &value); msrs->values[i].host = value; msrs->values[i].curr = value; } } +static void kvm_user_return_register_notifier(struct kvm_user_return_msrs *msrs) +{ + if (!msrs->registered) { + msrs->urn.on_user_return = kvm_on_user_return; + user_return_notifier_register(&msrs->urn); + msrs->registered = true; + } +} + int kvm_set_user_return_msr(unsigned slot, u64 value, u64 mask) { - unsigned int cpu = smp_processor_id(); - struct kvm_user_return_msrs *msrs = per_cpu_ptr(user_return_msrs, cpu); + struct kvm_user_return_msrs *msrs = this_cpu_ptr(user_return_msrs); int err; value = (value & mask) | (msrs->values[slot].host & ~mask); if (value == msrs->values[slot].curr) return 0; - err = wrmsrl_safe(kvm_uret_msrs_list[slot], value); + err = wrmsrq_safe(kvm_uret_msrs_list[slot], value); if (err) return 1; msrs->values[slot].curr = value; - if (!msrs->registered) { - msrs->urn.on_user_return = kvm_on_user_return; - user_return_notifier_register(&msrs->urn); - msrs->registered = true; - } + kvm_user_return_register_notifier(msrs); return 0; } EXPORT_SYMBOL_GPL(kvm_set_user_return_msr); -static void drop_user_return_notifiers(void) -{ - unsigned int cpu = smp_processor_id(); - struct kvm_user_return_msrs *msrs = per_cpu_ptr(user_return_msrs, cpu); - - if (msrs->registered) - kvm_on_user_return(&msrs->urn); -} - -u64 kvm_get_apic_base(struct kvm_vcpu *vcpu) +void kvm_user_return_msr_update_cache(unsigned int slot, u64 value) { - return vcpu->arch.apic_base; -} + struct kvm_user_return_msrs *msrs = this_cpu_ptr(user_return_msrs); -enum lapic_mode kvm_get_apic_mode(struct kvm_vcpu *vcpu) -{ - return kvm_apic_mode(kvm_get_apic_base(vcpu)); + msrs->values[slot].curr = value; + kvm_user_return_register_notifier(msrs); } -EXPORT_SYMBOL_GPL(kvm_get_apic_mode); +EXPORT_SYMBOL_GPL(kvm_user_return_msr_update_cache); -int kvm_set_apic_base(struct kvm_vcpu *vcpu, struct msr_data *msr_info) +static void drop_user_return_notifiers(void) { - enum lapic_mode old_mode = kvm_get_apic_mode(vcpu); - enum lapic_mode new_mode = kvm_apic_mode(msr_info->data); - u64 reserved_bits = kvm_vcpu_reserved_gpa_bits_raw(vcpu) | 0x2ff | - (guest_cpuid_has(vcpu, X86_FEATURE_X2APIC) ? 0 : X2APIC_ENABLE); + struct kvm_user_return_msrs *msrs = this_cpu_ptr(user_return_msrs); - if ((msr_info->data & reserved_bits) != 0 || new_mode == LAPIC_MODE_INVALID) - return 1; - if (!msr_info->host_initiated) { - if (old_mode == LAPIC_MODE_X2APIC && new_mode == LAPIC_MODE_XAPIC) - return 1; - if (old_mode == LAPIC_MODE_DISABLED && new_mode == LAPIC_MODE_X2APIC) - return 1; - } - - kvm_lapic_set_base(vcpu, msr_info->data); - kvm_recalculate_apic_map(vcpu->kvm); - return 0; + if (msrs->registered) + kvm_on_user_return(&msrs->urn); } /* @@ -637,15 +816,9 @@ static void kvm_queue_exception_vmexit(struct kvm_vcpu *vcpu, unsigned int vecto ex->payload = payload; } -/* Forcibly leave the nested mode in cases like a vCPU reset */ -static void kvm_leave_nested(struct kvm_vcpu *vcpu) -{ - kvm_x86_ops.nested_ops->leave_nested(vcpu); -} - -static void kvm_multiple_exception(struct kvm_vcpu *vcpu, - unsigned nr, bool has_error, u32 error_code, - bool has_payload, unsigned long payload, bool reinject) +static void kvm_multiple_exception(struct kvm_vcpu *vcpu, unsigned int nr, + bool has_error, u32 error_code, + bool has_payload, unsigned long payload) { u32 prev_nr; int class1, class2; @@ -653,13 +826,10 @@ static void kvm_multiple_exception(struct kvm_vcpu *vcpu, kvm_make_request(KVM_REQ_EVENT, vcpu); /* - * If the exception is destined for L2 and isn't being reinjected, - * morph it to a VM-Exit if L1 wants to intercept the exception. A - * previously injected exception is not checked because it was checked - * when it was original queued, and re-checking is incorrect if _L1_ - * injected the exception, in which case it's exempt from interception. + * If the exception is destined for L2, morph it to a VM-Exit if L1 + * wants to intercept the exception. */ - if (!reinject && is_guest_mode(vcpu) && + if (is_guest_mode(vcpu) && kvm_x86_ops.nested_ops->is_exception_vmexit(vcpu, nr, error_code)) { kvm_queue_exception_vmexit(vcpu, nr, has_error, error_code, has_payload, payload); @@ -668,28 +838,9 @@ static void kvm_multiple_exception(struct kvm_vcpu *vcpu, if (!vcpu->arch.exception.pending && !vcpu->arch.exception.injected) { queue: - if (reinject) { - /* - * On VM-Entry, an exception can be pending if and only - * if event injection was blocked by nested_run_pending. - * In that case, however, vcpu_enter_guest() requests an - * immediate exit, and the guest shouldn't proceed far - * enough to need reinjection. - */ - WARN_ON_ONCE(kvm_is_exception_pending(vcpu)); - vcpu->arch.exception.injected = true; - if (WARN_ON_ONCE(has_payload)) { - /* - * A reinjected event has already - * delivered its payload. - */ - has_payload = false; - payload = 0; - } - } else { - vcpu->arch.exception.pending = true; - vcpu->arch.exception.injected = false; - } + vcpu->arch.exception.pending = true; + vcpu->arch.exception.injected = false; + vcpu->arch.exception.has_error_code = has_error; vcpu->arch.exception.vector = nr; vcpu->arch.exception.error_code = error_code; @@ -730,30 +881,53 @@ static void kvm_multiple_exception(struct kvm_vcpu *vcpu, void kvm_queue_exception(struct kvm_vcpu *vcpu, unsigned nr) { - kvm_multiple_exception(vcpu, nr, false, 0, false, 0, false); + kvm_multiple_exception(vcpu, nr, false, 0, false, 0); } EXPORT_SYMBOL_GPL(kvm_queue_exception); -void kvm_requeue_exception(struct kvm_vcpu *vcpu, unsigned nr) -{ - kvm_multiple_exception(vcpu, nr, false, 0, false, 0, true); -} -EXPORT_SYMBOL_GPL(kvm_requeue_exception); void kvm_queue_exception_p(struct kvm_vcpu *vcpu, unsigned nr, unsigned long payload) { - kvm_multiple_exception(vcpu, nr, false, 0, true, payload, false); + kvm_multiple_exception(vcpu, nr, false, 0, true, payload); } EXPORT_SYMBOL_GPL(kvm_queue_exception_p); static void kvm_queue_exception_e_p(struct kvm_vcpu *vcpu, unsigned nr, u32 error_code, unsigned long payload) { - kvm_multiple_exception(vcpu, nr, true, error_code, - true, payload, false); + kvm_multiple_exception(vcpu, nr, true, error_code, true, payload); } +void kvm_requeue_exception(struct kvm_vcpu *vcpu, unsigned int nr, + bool has_error_code, u32 error_code) +{ + + /* + * On VM-Entry, an exception can be pending if and only if event + * injection was blocked by nested_run_pending. In that case, however, + * vcpu_enter_guest() requests an immediate exit, and the guest + * shouldn't proceed far enough to need reinjection. + */ + WARN_ON_ONCE(kvm_is_exception_pending(vcpu)); + + /* + * Do not check for interception when injecting an event for L2, as the + * exception was checked for intercept when it was original queued, and + * re-checking is incorrect if _L1_ injected the exception, in which + * case it's exempt from interception. + */ + kvm_make_request(KVM_REQ_EVENT, vcpu); + + vcpu->arch.exception.injected = true; + vcpu->arch.exception.has_error_code = has_error_code; + vcpu->arch.exception.vector = nr; + vcpu->arch.exception.error_code = error_code; + vcpu->arch.exception.has_payload = false; + vcpu->arch.exception.payload = 0; +} +EXPORT_SYMBOL_GPL(kvm_requeue_exception); + int kvm_complete_insn_gp(struct kvm_vcpu *vcpu, int err) { if (err) @@ -823,23 +997,17 @@ void kvm_inject_nmi(struct kvm_vcpu *vcpu) void kvm_queue_exception_e(struct kvm_vcpu *vcpu, unsigned nr, u32 error_code) { - kvm_multiple_exception(vcpu, nr, true, error_code, false, 0, false); + kvm_multiple_exception(vcpu, nr, true, error_code, false, 0); } EXPORT_SYMBOL_GPL(kvm_queue_exception_e); -void kvm_requeue_exception_e(struct kvm_vcpu *vcpu, unsigned nr, u32 error_code) -{ - kvm_multiple_exception(vcpu, nr, true, error_code, false, 0, true); -} -EXPORT_SYMBOL_GPL(kvm_requeue_exception_e); - /* * Checks if cpl <= required_cpl; if true, return true. Otherwise queue * a #GP and return false. */ bool kvm_require_cpl(struct kvm_vcpu *vcpu, int required_cpl) { - if (static_call(kvm_x86_get_cpl)(vcpu) <= required_cpl) + if (kvm_x86_call(get_cpl)(vcpu) <= required_cpl) return true; kvm_queue_exception_e(vcpu, GP_VECTOR, 0); return false; @@ -923,7 +1091,7 @@ static bool kvm_is_valid_cr0(struct kvm_vcpu *vcpu, unsigned long cr0) if ((cr0 & X86_CR0_PG) && !(cr0 & X86_CR0_PE)) return false; - return static_call(kvm_x86_is_valid_cr0)(vcpu, cr0); + return kvm_x86_call(is_valid_cr0)(vcpu, cr0); } void kvm_post_set_cr0(struct kvm_vcpu *vcpu, unsigned long old_cr0, unsigned long cr0) @@ -960,11 +1128,6 @@ void kvm_post_set_cr0(struct kvm_vcpu *vcpu, unsigned long old_cr0, unsigned lon if ((cr0 ^ old_cr0) & KVM_MMU_CR0_ROLE_BITS) kvm_mmu_reset_context(vcpu); - - if (((cr0 ^ old_cr0) & X86_CR0_CD) && - kvm_mmu_honors_guest_mtrrs(vcpu->kvm) && - !kvm_check_has_quirk(vcpu->kvm, KVM_X86_QUIRK_CD_NW_CLEARED)) - kvm_zap_gfn_range(vcpu->kvm, 0, ~0ULL); } EXPORT_SYMBOL_GPL(kvm_post_set_cr0); @@ -987,7 +1150,7 @@ int kvm_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0) if (!is_pae(vcpu)) return 1; - static_call(kvm_x86_get_cs_db_l_bits)(vcpu, &cs_db, &cs_l); + kvm_x86_call(get_cs_db_l_bits)(vcpu, &cs_db, &cs_l); if (cs_l) return 1; } @@ -1001,7 +1164,7 @@ int kvm_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0) (is_64_bit_mode(vcpu) || kvm_is_cr4_bit_set(vcpu, X86_CR4_PCIDE))) return 1; - static_call(kvm_x86_set_cr0)(vcpu, cr0); + kvm_x86_call(set_cr0)(vcpu, cr0); kvm_post_set_cr0(vcpu, old_cr0, cr0); @@ -1022,19 +1185,19 @@ void kvm_load_guest_xsave_state(struct kvm_vcpu *vcpu) if (kvm_is_cr4_bit_set(vcpu, X86_CR4_OSXSAVE)) { - if (vcpu->arch.xcr0 != host_xcr0) + if (vcpu->arch.xcr0 != kvm_host.xcr0) xsetbv(XCR_XFEATURE_ENABLED_MASK, vcpu->arch.xcr0); - if (guest_can_use(vcpu, X86_FEATURE_XSAVES) && - vcpu->arch.ia32_xss != host_xss) - wrmsrl(MSR_IA32_XSS, vcpu->arch.ia32_xss); + if (guest_cpu_cap_has(vcpu, X86_FEATURE_XSAVES) && + vcpu->arch.ia32_xss != kvm_host.xss) + wrmsrq(MSR_IA32_XSS, vcpu->arch.ia32_xss); } if (cpu_feature_enabled(X86_FEATURE_PKU) && vcpu->arch.pkru != vcpu->arch.host_pkru && ((vcpu->arch.xcr0 & XFEATURE_MASK_PKRU) || kvm_is_cr4_bit_set(vcpu, X86_CR4_PKE))) - write_pkru(vcpu->arch.pkru); + wrpkru(vcpu->arch.pkru); } EXPORT_SYMBOL_GPL(kvm_load_guest_xsave_state); @@ -1048,17 +1211,17 @@ void kvm_load_host_xsave_state(struct kvm_vcpu *vcpu) kvm_is_cr4_bit_set(vcpu, X86_CR4_PKE))) { vcpu->arch.pkru = rdpkru(); if (vcpu->arch.pkru != vcpu->arch.host_pkru) - write_pkru(vcpu->arch.host_pkru); + wrpkru(vcpu->arch.host_pkru); } if (kvm_is_cr4_bit_set(vcpu, X86_CR4_OSXSAVE)) { - if (vcpu->arch.xcr0 != host_xcr0) - xsetbv(XCR_XFEATURE_ENABLED_MASK, host_xcr0); + if (vcpu->arch.xcr0 != kvm_host.xcr0) + xsetbv(XCR_XFEATURE_ENABLED_MASK, kvm_host.xcr0); - if (guest_can_use(vcpu, X86_FEATURE_XSAVES) && - vcpu->arch.ia32_xss != host_xss) - wrmsrl(MSR_IA32_XSS, host_xss); + if (guest_cpu_cap_has(vcpu, X86_FEATURE_XSAVES) && + vcpu->arch.ia32_xss != kvm_host.xss) + wrmsrq(MSR_IA32_XSS, kvm_host.xss); } } @@ -1112,14 +1275,14 @@ static int __kvm_set_xcr(struct kvm_vcpu *vcpu, u32 index, u64 xcr) vcpu->arch.xcr0 = xcr0; if ((xcr0 ^ old_xcr0) & XFEATURE_MASK_EXTEND) - kvm_update_cpuid_runtime(vcpu); + vcpu->arch.cpuid_dynamic_bits_dirty = true; return 0; } int kvm_emulate_xsetbv(struct kvm_vcpu *vcpu) { /* Note, #UD due to CR4.OSXSAVE=0 has priority over the intercept. */ - if (static_call(kvm_x86_get_cpl)(vcpu) != 0 || + if (kvm_x86_call(get_cpl)(vcpu) != 0 || __kvm_set_xcr(vcpu, kvm_rcx_read(vcpu), kvm_read_edx_eax(vcpu))) { kvm_inject_gp(vcpu, 0); return 1; @@ -1129,22 +1292,10 @@ int kvm_emulate_xsetbv(struct kvm_vcpu *vcpu) } EXPORT_SYMBOL_GPL(kvm_emulate_xsetbv); -bool __kvm_is_valid_cr4(struct kvm_vcpu *vcpu, unsigned long cr4) -{ - if (cr4 & cr4_reserved_bits) - return false; - - if (cr4 & vcpu->arch.cr4_guest_rsvd_bits) - return false; - - return true; -} -EXPORT_SYMBOL_GPL(__kvm_is_valid_cr4); - static bool kvm_is_valid_cr4(struct kvm_vcpu *vcpu, unsigned long cr4) { return __kvm_is_valid_cr4(vcpu, cr4) && - static_call(kvm_x86_is_valid_cr4)(vcpu, cr4); + kvm_x86_call(is_valid_cr4)(vcpu, cr4); } void kvm_post_set_cr4(struct kvm_vcpu *vcpu, unsigned long old_cr4, unsigned long cr4) @@ -1212,7 +1363,7 @@ int kvm_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4) return 1; } - static_call(kvm_x86_set_cr4)(vcpu, cr4); + kvm_x86_call(set_cr4)(vcpu, cr4); kvm_post_set_cr4(vcpu, old_cr4, cr4); @@ -1351,7 +1502,7 @@ void kvm_update_dr7(struct kvm_vcpu *vcpu) dr7 = vcpu->arch.guest_debug_dr7; else dr7 = vcpu->arch.dr7; - static_call(kvm_x86_set_dr7)(vcpu, dr7); + kvm_x86_call(set_dr7)(vcpu, dr7); vcpu->arch.switch_db_regs &= ~KVM_DEBUGREG_BP_ENABLED; if (dr7 & DR7_BP_EN_MASK) vcpu->arch.switch_db_regs |= KVM_DEBUGREG_BP_ENABLED; @@ -1362,10 +1513,10 @@ static u64 kvm_dr6_fixed(struct kvm_vcpu *vcpu) { u64 fixed = DR6_FIXED_1; - if (!guest_cpuid_has(vcpu, X86_FEATURE_RTM)) + if (!guest_cpu_cap_has(vcpu, X86_FEATURE_RTM)) fixed |= DR6_RTM; - if (!guest_cpuid_has(vcpu, X86_FEATURE_BUS_LOCK_DETECT)) + if (!guest_cpu_cap_has(vcpu, X86_FEATURE_BUS_LOCK_DETECT)) fixed |= DR6_BUS_LOCK; return fixed; } @@ -1433,178 +1584,6 @@ int kvm_emulate_rdpmc(struct kvm_vcpu *vcpu) EXPORT_SYMBOL_GPL(kvm_emulate_rdpmc); /* - * The three MSR lists(msrs_to_save, emulated_msrs, msr_based_features) track - * the set of MSRs that KVM exposes to userspace through KVM_GET_MSRS, - * KVM_SET_MSRS, and KVM_GET_MSR_INDEX_LIST. msrs_to_save holds MSRs that - * require host support, i.e. should be probed via RDMSR. emulated_msrs holds - * MSRs that KVM emulates without strictly requiring host support. - * msr_based_features holds MSRs that enumerate features, i.e. are effectively - * CPUID leafs. Note, msr_based_features isn't mutually exclusive with - * msrs_to_save and emulated_msrs. - */ - -static const u32 msrs_to_save_base[] = { - MSR_IA32_SYSENTER_CS, MSR_IA32_SYSENTER_ESP, MSR_IA32_SYSENTER_EIP, - MSR_STAR, -#ifdef CONFIG_X86_64 - MSR_CSTAR, MSR_KERNEL_GS_BASE, MSR_SYSCALL_MASK, MSR_LSTAR, -#endif - MSR_IA32_TSC, MSR_IA32_CR_PAT, MSR_VM_HSAVE_PA, - MSR_IA32_FEAT_CTL, MSR_IA32_BNDCFGS, MSR_TSC_AUX, - MSR_IA32_SPEC_CTRL, MSR_IA32_TSX_CTRL, - MSR_IA32_RTIT_CTL, MSR_IA32_RTIT_STATUS, MSR_IA32_RTIT_CR3_MATCH, - MSR_IA32_RTIT_OUTPUT_BASE, MSR_IA32_RTIT_OUTPUT_MASK, - MSR_IA32_RTIT_ADDR0_A, MSR_IA32_RTIT_ADDR0_B, - MSR_IA32_RTIT_ADDR1_A, MSR_IA32_RTIT_ADDR1_B, - MSR_IA32_RTIT_ADDR2_A, MSR_IA32_RTIT_ADDR2_B, - MSR_IA32_RTIT_ADDR3_A, MSR_IA32_RTIT_ADDR3_B, - MSR_IA32_UMWAIT_CONTROL, - - MSR_IA32_XFD, MSR_IA32_XFD_ERR, -}; - -static const u32 msrs_to_save_pmu[] = { - MSR_ARCH_PERFMON_FIXED_CTR0, MSR_ARCH_PERFMON_FIXED_CTR1, - MSR_ARCH_PERFMON_FIXED_CTR0 + 2, - MSR_CORE_PERF_FIXED_CTR_CTRL, MSR_CORE_PERF_GLOBAL_STATUS, - MSR_CORE_PERF_GLOBAL_CTRL, MSR_CORE_PERF_GLOBAL_OVF_CTRL, - MSR_IA32_PEBS_ENABLE, MSR_IA32_DS_AREA, MSR_PEBS_DATA_CFG, - - /* This part of MSRs should match KVM_INTEL_PMC_MAX_GENERIC. */ - MSR_ARCH_PERFMON_PERFCTR0, MSR_ARCH_PERFMON_PERFCTR1, - MSR_ARCH_PERFMON_PERFCTR0 + 2, MSR_ARCH_PERFMON_PERFCTR0 + 3, - MSR_ARCH_PERFMON_PERFCTR0 + 4, MSR_ARCH_PERFMON_PERFCTR0 + 5, - MSR_ARCH_PERFMON_PERFCTR0 + 6, MSR_ARCH_PERFMON_PERFCTR0 + 7, - MSR_ARCH_PERFMON_EVENTSEL0, MSR_ARCH_PERFMON_EVENTSEL1, - MSR_ARCH_PERFMON_EVENTSEL0 + 2, MSR_ARCH_PERFMON_EVENTSEL0 + 3, - MSR_ARCH_PERFMON_EVENTSEL0 + 4, MSR_ARCH_PERFMON_EVENTSEL0 + 5, - MSR_ARCH_PERFMON_EVENTSEL0 + 6, MSR_ARCH_PERFMON_EVENTSEL0 + 7, - - MSR_K7_EVNTSEL0, MSR_K7_EVNTSEL1, MSR_K7_EVNTSEL2, MSR_K7_EVNTSEL3, - MSR_K7_PERFCTR0, MSR_K7_PERFCTR1, MSR_K7_PERFCTR2, MSR_K7_PERFCTR3, - - /* This part of MSRs should match KVM_AMD_PMC_MAX_GENERIC. */ - MSR_F15H_PERF_CTL0, MSR_F15H_PERF_CTL1, MSR_F15H_PERF_CTL2, - MSR_F15H_PERF_CTL3, MSR_F15H_PERF_CTL4, MSR_F15H_PERF_CTL5, - MSR_F15H_PERF_CTR0, MSR_F15H_PERF_CTR1, MSR_F15H_PERF_CTR2, - MSR_F15H_PERF_CTR3, MSR_F15H_PERF_CTR4, MSR_F15H_PERF_CTR5, - - MSR_AMD64_PERF_CNTR_GLOBAL_CTL, - MSR_AMD64_PERF_CNTR_GLOBAL_STATUS, - MSR_AMD64_PERF_CNTR_GLOBAL_STATUS_CLR, -}; - -static u32 msrs_to_save[ARRAY_SIZE(msrs_to_save_base) + - ARRAY_SIZE(msrs_to_save_pmu)]; -static unsigned num_msrs_to_save; - -static const u32 emulated_msrs_all[] = { - MSR_KVM_SYSTEM_TIME, MSR_KVM_WALL_CLOCK, - MSR_KVM_SYSTEM_TIME_NEW, MSR_KVM_WALL_CLOCK_NEW, - -#ifdef CONFIG_KVM_HYPERV - HV_X64_MSR_GUEST_OS_ID, HV_X64_MSR_HYPERCALL, - HV_X64_MSR_TIME_REF_COUNT, HV_X64_MSR_REFERENCE_TSC, - HV_X64_MSR_TSC_FREQUENCY, HV_X64_MSR_APIC_FREQUENCY, - HV_X64_MSR_CRASH_P0, HV_X64_MSR_CRASH_P1, HV_X64_MSR_CRASH_P2, - HV_X64_MSR_CRASH_P3, HV_X64_MSR_CRASH_P4, HV_X64_MSR_CRASH_CTL, - HV_X64_MSR_RESET, - HV_X64_MSR_VP_INDEX, - HV_X64_MSR_VP_RUNTIME, - HV_X64_MSR_SCONTROL, - HV_X64_MSR_STIMER0_CONFIG, - HV_X64_MSR_VP_ASSIST_PAGE, - HV_X64_MSR_REENLIGHTENMENT_CONTROL, HV_X64_MSR_TSC_EMULATION_CONTROL, - HV_X64_MSR_TSC_EMULATION_STATUS, HV_X64_MSR_TSC_INVARIANT_CONTROL, - HV_X64_MSR_SYNDBG_OPTIONS, - HV_X64_MSR_SYNDBG_CONTROL, HV_X64_MSR_SYNDBG_STATUS, - HV_X64_MSR_SYNDBG_SEND_BUFFER, HV_X64_MSR_SYNDBG_RECV_BUFFER, - HV_X64_MSR_SYNDBG_PENDING_BUFFER, -#endif - - MSR_KVM_ASYNC_PF_EN, MSR_KVM_STEAL_TIME, - MSR_KVM_PV_EOI_EN, MSR_KVM_ASYNC_PF_INT, MSR_KVM_ASYNC_PF_ACK, - - MSR_IA32_TSC_ADJUST, - MSR_IA32_TSC_DEADLINE, - MSR_IA32_ARCH_CAPABILITIES, - MSR_IA32_PERF_CAPABILITIES, - MSR_IA32_MISC_ENABLE, - MSR_IA32_MCG_STATUS, - MSR_IA32_MCG_CTL, - MSR_IA32_MCG_EXT_CTL, - MSR_IA32_SMBASE, - MSR_SMI_COUNT, - MSR_PLATFORM_INFO, - MSR_MISC_FEATURES_ENABLES, - MSR_AMD64_VIRT_SPEC_CTRL, - MSR_AMD64_TSC_RATIO, - MSR_IA32_POWER_CTL, - MSR_IA32_UCODE_REV, - - /* - * KVM always supports the "true" VMX control MSRs, even if the host - * does not. The VMX MSRs as a whole are considered "emulated" as KVM - * doesn't strictly require them to exist in the host (ignoring that - * KVM would refuse to load in the first place if the core set of MSRs - * aren't supported). - */ - MSR_IA32_VMX_BASIC, - MSR_IA32_VMX_TRUE_PINBASED_CTLS, - MSR_IA32_VMX_TRUE_PROCBASED_CTLS, - MSR_IA32_VMX_TRUE_EXIT_CTLS, - MSR_IA32_VMX_TRUE_ENTRY_CTLS, - MSR_IA32_VMX_MISC, - MSR_IA32_VMX_CR0_FIXED0, - MSR_IA32_VMX_CR4_FIXED0, - MSR_IA32_VMX_VMCS_ENUM, - MSR_IA32_VMX_PROCBASED_CTLS2, - MSR_IA32_VMX_EPT_VPID_CAP, - MSR_IA32_VMX_VMFUNC, - - MSR_K7_HWCR, - MSR_KVM_POLL_CONTROL, -}; - -static u32 emulated_msrs[ARRAY_SIZE(emulated_msrs_all)]; -static unsigned num_emulated_msrs; - -/* - * List of MSRs that control the existence of MSR-based features, i.e. MSRs - * that are effectively CPUID leafs. VMX MSRs are also included in the set of - * feature MSRs, but are handled separately to allow expedited lookups. - */ -static const u32 msr_based_features_all_except_vmx[] = { - MSR_AMD64_DE_CFG, - MSR_IA32_UCODE_REV, - MSR_IA32_ARCH_CAPABILITIES, - MSR_IA32_PERF_CAPABILITIES, -}; - -static u32 msr_based_features[ARRAY_SIZE(msr_based_features_all_except_vmx) + - (KVM_LAST_EMULATED_VMX_MSR - KVM_FIRST_EMULATED_VMX_MSR + 1)]; -static unsigned int num_msr_based_features; - -/* - * All feature MSRs except uCode revID, which tracks the currently loaded uCode - * patch, are immutable once the vCPU model is defined. - */ -static bool kvm_is_immutable_feature_msr(u32 msr) -{ - int i; - - if (msr >= KVM_FIRST_EMULATED_VMX_MSR && msr <= KVM_LAST_EMULATED_VMX_MSR) - return true; - - for (i = 0; i < ARRAY_SIZE(msr_based_features_all_except_vmx); i++) { - if (msr == msr_based_features_all_except_vmx[i]) - return msr != MSR_IA32_UCODE_REV; - } - - return false; -} - -/* * Some IA32_ARCH_CAPABILITIES bits have dependencies on MSRs that KVM * does not yet virtualize. These include: * 10 - MISC_PACKAGE_CTRLS @@ -1621,11 +1600,11 @@ static bool kvm_is_immutable_feature_msr(u32 msr) ARCH_CAP_PSCHANGE_MC_NO | ARCH_CAP_TSX_CTRL_MSR | ARCH_CAP_TAA_NO | \ ARCH_CAP_SBDR_SSDP_NO | ARCH_CAP_FBSDP_NO | ARCH_CAP_PSDP_NO | \ ARCH_CAP_FB_CLEAR | ARCH_CAP_RRSBA | ARCH_CAP_PBRSB_NO | ARCH_CAP_GDS_NO | \ - ARCH_CAP_RFDS_NO | ARCH_CAP_RFDS_CLEAR) + ARCH_CAP_RFDS_NO | ARCH_CAP_RFDS_CLEAR | ARCH_CAP_BHI_NO | ARCH_CAP_ITS_NO) static u64 kvm_get_arch_capabilities(void) { - u64 data = host_arch_capabilities & KVM_SUPPORTED_ARCH_CAP; + u64 data = kvm_host.arch_capabilities & KVM_SUPPORTED_ARCH_CAP; /* * If nx_huge_pages is enabled, KVM's shadow paging will ensure that @@ -1655,6 +1634,8 @@ static u64 kvm_get_arch_capabilities(void) data |= ARCH_CAP_MDS_NO; if (!boot_cpu_has_bug(X86_BUG_RFDS)) data |= ARCH_CAP_RFDS_NO; + if (!boot_cpu_has_bug(X86_BUG_ITS)) + data |= ARCH_CAP_ITS_NO; if (!boot_cpu_has(X86_FEATURE_RTM)) { /* @@ -1681,58 +1662,52 @@ static u64 kvm_get_arch_capabilities(void) return data; } -static int kvm_get_msr_feature(struct kvm_msr_entry *msr) +static int kvm_get_feature_msr(struct kvm_vcpu *vcpu, u32 index, u64 *data, + bool host_initiated) { - switch (msr->index) { + WARN_ON_ONCE(!host_initiated); + + switch (index) { case MSR_IA32_ARCH_CAPABILITIES: - msr->data = kvm_get_arch_capabilities(); + *data = kvm_get_arch_capabilities(); break; case MSR_IA32_PERF_CAPABILITIES: - msr->data = kvm_caps.supported_perf_cap; + *data = kvm_caps.supported_perf_cap; + break; + case MSR_PLATFORM_INFO: + *data = MSR_PLATFORM_INFO_CPUID_FAULT; break; case MSR_IA32_UCODE_REV: - rdmsrl_safe(msr->index, &msr->data); + rdmsrq_safe(index, data); break; default: - return static_call(kvm_x86_get_msr_feature)(msr); + return kvm_x86_call(get_feature_msr)(index, data); } return 0; } -static int do_get_msr_feature(struct kvm_vcpu *vcpu, unsigned index, u64 *data) +static int do_get_feature_msr(struct kvm_vcpu *vcpu, unsigned index, u64 *data) { - struct kvm_msr_entry msr; - int r; - - /* Unconditionally clear the output for simplicity */ - msr.data = 0; - msr.index = index; - r = kvm_get_msr_feature(&msr); - - if (r == KVM_MSR_RET_INVALID && kvm_msr_ignored_check(index, 0, false)) - r = 0; - - *data = msr.data; - - return r; + return kvm_do_msr_access(vcpu, index, data, true, MSR_TYPE_R, + kvm_get_feature_msr); } static bool __kvm_valid_efer(struct kvm_vcpu *vcpu, u64 efer) { - if (efer & EFER_AUTOIBRS && !guest_cpuid_has(vcpu, X86_FEATURE_AUTOIBRS)) + if (efer & EFER_AUTOIBRS && !guest_cpu_cap_has(vcpu, X86_FEATURE_AUTOIBRS)) return false; - if (efer & EFER_FFXSR && !guest_cpuid_has(vcpu, X86_FEATURE_FXSR_OPT)) + if (efer & EFER_FFXSR && !guest_cpu_cap_has(vcpu, X86_FEATURE_FXSR_OPT)) return false; - if (efer & EFER_SVME && !guest_cpuid_has(vcpu, X86_FEATURE_SVM)) + if (efer & EFER_SVME && !guest_cpu_cap_has(vcpu, X86_FEATURE_SVM)) return false; if (efer & (EFER_LME | EFER_LMA) && - !guest_cpuid_has(vcpu, X86_FEATURE_LM)) + !guest_cpu_cap_has(vcpu, X86_FEATURE_LM)) return false; - if (efer & EFER_NX && !guest_cpuid_has(vcpu, X86_FEATURE_NX)) + if (efer & EFER_NX && !guest_cpu_cap_has(vcpu, X86_FEATURE_NX)) return false; return true; @@ -1768,7 +1743,7 @@ static int set_efer(struct kvm_vcpu *vcpu, struct msr_data *msr_info) efer &= ~EFER_LMA; efer |= vcpu->arch.efer & EFER_LMA; - r = static_call(kvm_x86_set_efer)(vcpu, efer); + r = kvm_x86_call(set_efer)(vcpu, efer); if (r) { WARN_ON(r > 0); return r; @@ -1850,7 +1825,7 @@ static int __kvm_set_msr(struct kvm_vcpu *vcpu, u32 index, u64 data, case MSR_KERNEL_GS_BASE: case MSR_CSTAR: case MSR_LSTAR: - if (is_noncanonical_address(data, vcpu)) + if (is_noncanonical_msr_address(data, vcpu)) return 1; break; case MSR_IA32_SYSENTER_EIP: @@ -1867,15 +1842,15 @@ static int __kvm_set_msr(struct kvm_vcpu *vcpu, u32 index, u64 data, * value, and that something deterministic happens if the guest * invokes 64-bit SYSENTER. */ - data = __canonical_address(data, vcpu_virt_addr_bits(vcpu)); + data = __canonical_address(data, max_host_virt_addr_bits()); break; case MSR_TSC_AUX: if (!kvm_is_supported_user_return_msr(MSR_TSC_AUX)) return 1; if (!host_initiated && - !guest_cpuid_has(vcpu, X86_FEATURE_RDTSCP) && - !guest_cpuid_has(vcpu, X86_FEATURE_RDPID)) + !guest_cpu_cap_has(vcpu, X86_FEATURE_RDTSCP) && + !guest_cpu_cap_has(vcpu, X86_FEATURE_RDPID)) return 1; /* @@ -1883,11 +1858,11 @@ static int __kvm_set_msr(struct kvm_vcpu *vcpu, u32 index, u64 data, * incomplete and conflicting architectural behavior. Current * AMD CPUs completely ignore bits 63:32, i.e. they aren't * reserved and always read as zeros. Enforce Intel's reserved - * bits check if and only if the guest CPU is Intel, and clear - * the bits in all other cases. This ensures cross-vendor - * migration will provide consistent behavior for the guest. + * bits check if the guest CPU is Intel compatible, otherwise + * clear the bits. This ensures cross-vendor migration will + * provide consistent behavior for the guest. */ - if (guest_cpuid_is_intel(vcpu) && (data >> 32) != 0) + if (guest_cpuid_is_intel_compatible(vcpu) && (data >> 32) != 0) return 1; data = (u32)data; @@ -1898,19 +1873,20 @@ static int __kvm_set_msr(struct kvm_vcpu *vcpu, u32 index, u64 data, msr.index = index; msr.host_initiated = host_initiated; - return static_call(kvm_x86_set_msr)(vcpu, &msr); + return kvm_x86_call(set_msr)(vcpu, &msr); +} + +static int _kvm_set_msr(struct kvm_vcpu *vcpu, u32 index, u64 *data, + bool host_initiated) +{ + return __kvm_set_msr(vcpu, index, *data, host_initiated); } static int kvm_set_msr_ignored_check(struct kvm_vcpu *vcpu, u32 index, u64 data, bool host_initiated) { - int ret = __kvm_set_msr(vcpu, index, data, host_initiated); - - if (ret == KVM_MSR_RET_INVALID) - if (kvm_msr_ignored_check(index, data, true)) - ret = 0; - - return ret; + return kvm_do_msr_access(vcpu, index, &data, host_initiated, MSR_TYPE_W, + _kvm_set_msr); } /* @@ -1931,8 +1907,8 @@ int __kvm_get_msr(struct kvm_vcpu *vcpu, u32 index, u64 *data, return 1; if (!host_initiated && - !guest_cpuid_has(vcpu, X86_FEATURE_RDTSCP) && - !guest_cpuid_has(vcpu, X86_FEATURE_RDPID)) + !guest_cpu_cap_has(vcpu, X86_FEATURE_RDTSCP) && + !guest_cpu_cap_has(vcpu, X86_FEATURE_RDPID)) return 1; break; } @@ -1940,7 +1916,7 @@ int __kvm_get_msr(struct kvm_vcpu *vcpu, u32 index, u64 *data, msr.index = index; msr.host_initiated = host_initiated; - ret = static_call(kvm_x86_get_msr)(vcpu, &msr); + ret = kvm_x86_call(get_msr)(vcpu, &msr); if (!ret) *data = msr.data; return ret; @@ -1949,31 +1925,25 @@ int __kvm_get_msr(struct kvm_vcpu *vcpu, u32 index, u64 *data, static int kvm_get_msr_ignored_check(struct kvm_vcpu *vcpu, u32 index, u64 *data, bool host_initiated) { - int ret = __kvm_get_msr(vcpu, index, data, host_initiated); - - if (ret == KVM_MSR_RET_INVALID) { - /* Unconditionally clear *data for simplicity */ - *data = 0; - if (kvm_msr_ignored_check(index, 0, false)) - ret = 0; - } - - return ret; + return kvm_do_msr_access(vcpu, index, data, host_initiated, MSR_TYPE_R, + __kvm_get_msr); } -static int kvm_get_msr_with_filter(struct kvm_vcpu *vcpu, u32 index, u64 *data) +int kvm_get_msr_with_filter(struct kvm_vcpu *vcpu, u32 index, u64 *data) { if (!kvm_msr_allowed(vcpu, index, KVM_MSR_FILTER_READ)) return KVM_MSR_RET_FILTERED; return kvm_get_msr_ignored_check(vcpu, index, data, false); } +EXPORT_SYMBOL_GPL(kvm_get_msr_with_filter); -static int kvm_set_msr_with_filter(struct kvm_vcpu *vcpu, u32 index, u64 data) +int kvm_set_msr_with_filter(struct kvm_vcpu *vcpu, u32 index, u64 data) { if (!kvm_msr_allowed(vcpu, index, KVM_MSR_FILTER_WRITE)) return KVM_MSR_RET_FILTERED; return kvm_set_msr_ignored_check(vcpu, index, data, false); } +EXPORT_SYMBOL_GPL(kvm_set_msr_with_filter); int kvm_get_msr(struct kvm_vcpu *vcpu, u32 index, u64 *data) { @@ -2008,7 +1978,7 @@ static int complete_emulated_rdmsr(struct kvm_vcpu *vcpu) static int complete_fast_msr_access(struct kvm_vcpu *vcpu) { - return static_call(kvm_x86_complete_emulated_msr)(vcpu, vcpu->run->msr.error); + return kvm_x86_call(complete_emulated_msr)(vcpu, vcpu->run->msr.error); } static int complete_fast_rdmsr(struct kvm_vcpu *vcpu) @@ -2020,7 +1990,7 @@ static int complete_fast_rdmsr(struct kvm_vcpu *vcpu) static u64 kvm_msr_reason(int r) { switch (r) { - case KVM_MSR_RET_INVALID: + case KVM_MSR_RET_UNSUPPORTED: return KVM_MSR_EXIT_REASON_UNKNOWN; case KVM_MSR_RET_FILTERED: return KVM_MSR_EXIT_REASON_FILTER; @@ -2072,7 +2042,7 @@ int kvm_emulate_rdmsr(struct kvm_vcpu *vcpu) trace_kvm_msr_read_ex(ecx); } - return static_call(kvm_x86_complete_emulated_msr)(vcpu, r); + return kvm_x86_call(complete_emulated_msr)(vcpu, r); } EXPORT_SYMBOL_GPL(kvm_emulate_rdmsr); @@ -2097,7 +2067,7 @@ int kvm_emulate_wrmsr(struct kvm_vcpu *vcpu) trace_kvm_msr_write_ex(ecx, data); } - return static_call(kvm_x86_complete_emulated_msr)(vcpu, r); + return kvm_x86_call(complete_emulated_msr)(vcpu, r); } EXPORT_SYMBOL_GPL(kvm_emulate_wrmsr); @@ -2123,10 +2093,20 @@ EXPORT_SYMBOL_GPL(kvm_handle_invalid_op); static int kvm_emulate_monitor_mwait(struct kvm_vcpu *vcpu, const char *insn) { - if (!kvm_check_has_quirk(vcpu->kvm, KVM_X86_QUIRK_MWAIT_NEVER_UD_FAULTS) && - !guest_cpuid_has(vcpu, X86_FEATURE_MWAIT)) + bool enabled; + + if (kvm_check_has_quirk(vcpu->kvm, KVM_X86_QUIRK_MWAIT_NEVER_UD_FAULTS)) + goto emulate_as_nop; + + if (kvm_check_has_quirk(vcpu->kvm, KVM_X86_QUIRK_MISC_ENABLE_NO_MWAIT)) + enabled = guest_cpu_cap_has(vcpu, X86_FEATURE_MWAIT); + else + enabled = vcpu->arch.ia32_misc_enable_msr & MSR_IA32_MISC_ENABLE_MWAIT; + + if (!enabled) return kvm_handle_invalid_op(vcpu); +emulate_as_nop: pr_warn_once("%s instruction emulated as NOP!\n", insn); return kvm_emulate_as_nop(vcpu); } @@ -2145,8 +2125,9 @@ EXPORT_SYMBOL_GPL(kvm_emulate_monitor); static inline bool kvm_vcpu_exit_request(struct kvm_vcpu *vcpu) { xfer_to_guest_mode_prepare(); - return vcpu->mode == EXITING_GUEST_MODE || kvm_request_pending(vcpu) || - xfer_to_guest_mode_work_pending(); + + return READ_ONCE(vcpu->mode) == EXITING_GUEST_MODE || + kvm_request_pending(vcpu) || xfer_to_guest_mode_work_pending(); } /* @@ -2183,31 +2164,34 @@ fastpath_t handle_fastpath_set_msr_irqoff(struct kvm_vcpu *vcpu) { u32 msr = kvm_rcx_read(vcpu); u64 data; - fastpath_t ret = EXIT_FASTPATH_NONE; + fastpath_t ret; + bool handled; kvm_vcpu_srcu_read_lock(vcpu); switch (msr) { case APIC_BASE_MSR + (APIC_ICR >> 4): data = kvm_read_edx_eax(vcpu); - if (!handle_fastpath_set_x2apic_icr_irqoff(vcpu, data)) { - kvm_skip_emulated_instruction(vcpu); - ret = EXIT_FASTPATH_EXIT_HANDLED; - } + handled = !handle_fastpath_set_x2apic_icr_irqoff(vcpu, data); break; case MSR_IA32_TSC_DEADLINE: data = kvm_read_edx_eax(vcpu); - if (!handle_fastpath_set_tscdeadline(vcpu, data)) { - kvm_skip_emulated_instruction(vcpu); - ret = EXIT_FASTPATH_REENTER_GUEST; - } + handled = !handle_fastpath_set_tscdeadline(vcpu, data); break; default: + handled = false; break; } - if (ret != EXIT_FASTPATH_NONE) + if (handled) { + if (!kvm_skip_emulated_instruction(vcpu)) + ret = EXIT_FASTPATH_EXIT_USERSPACE; + else + ret = EXIT_FASTPATH_REENTER_GUEST; trace_kvm_msr_write(msr, data); + } else { + ret = EXIT_FASTPATH_NONE; + } kvm_vcpu_srcu_read_unlock(vcpu); @@ -2230,16 +2214,13 @@ static int do_set_msr(struct kvm_vcpu *vcpu, unsigned index, u64 *data) /* * Disallow writes to immutable feature MSRs after KVM_RUN. KVM does * not support modifying the guest vCPU model on the fly, e.g. changing - * the nVMX capabilities while L2 is running is nonsensical. Ignore + * the nVMX capabilities while L2 is running is nonsensical. Allow * writes of the same value, e.g. to allow userspace to blindly stuff * all MSRs when emulating RESET. */ - if (kvm_vcpu_has_run(vcpu) && kvm_is_immutable_feature_msr(index)) { - if (do_get_msr(vcpu, index, &val) || *data != val) - return -EINVAL; - - return 0; - } + if (kvm_vcpu_has_run(vcpu) && kvm_is_immutable_feature_msr(index) && + (do_get_msr(vcpu, index, &val) || *data != val)) + return -EINVAL; return kvm_set_msr_ignored_check(vcpu, index, *data, true); } @@ -2611,6 +2592,9 @@ EXPORT_SYMBOL_GPL(kvm_calc_nested_tsc_multiplier); static void kvm_vcpu_write_tsc_offset(struct kvm_vcpu *vcpu, u64 l1_offset) { + if (vcpu->arch.guest_tsc_protected) + return; + trace_kvm_write_tsc_offset(vcpu->vcpu_id, vcpu->arch.l1_tsc_offset, l1_offset); @@ -2625,12 +2609,12 @@ static void kvm_vcpu_write_tsc_offset(struct kvm_vcpu *vcpu, u64 l1_offset) if (is_guest_mode(vcpu)) vcpu->arch.tsc_offset = kvm_calc_nested_tsc_offset( l1_offset, - static_call(kvm_x86_get_l2_tsc_offset)(vcpu), - static_call(kvm_x86_get_l2_tsc_multiplier)(vcpu)); + kvm_x86_call(get_l2_tsc_offset)(vcpu), + kvm_x86_call(get_l2_tsc_multiplier)(vcpu)); else vcpu->arch.tsc_offset = l1_offset; - static_call(kvm_x86_write_tsc_offset)(vcpu); + kvm_x86_call(write_tsc_offset)(vcpu); } static void kvm_vcpu_write_tsc_multiplier(struct kvm_vcpu *vcpu, u64 l1_multiplier) @@ -2641,12 +2625,12 @@ static void kvm_vcpu_write_tsc_multiplier(struct kvm_vcpu *vcpu, u64 l1_multipli if (is_guest_mode(vcpu)) vcpu->arch.tsc_scaling_ratio = kvm_calc_nested_tsc_multiplier( l1_multiplier, - static_call(kvm_x86_get_l2_tsc_multiplier)(vcpu)); + kvm_x86_call(get_l2_tsc_multiplier)(vcpu)); else vcpu->arch.tsc_scaling_ratio = l1_multiplier; if (kvm_caps.has_tsc_control) - static_call(kvm_x86_write_tsc_multiplier)(vcpu); + kvm_x86_call(write_tsc_multiplier)(vcpu); } static inline bool kvm_check_tsc_unstable(void) @@ -2668,12 +2652,18 @@ static inline bool kvm_check_tsc_unstable(void) * participates in. */ static void __kvm_synchronize_tsc(struct kvm_vcpu *vcpu, u64 offset, u64 tsc, - u64 ns, bool matched) + u64 ns, bool matched, bool user_set_tsc) { struct kvm *kvm = vcpu->kvm; lockdep_assert_held(&kvm->arch.tsc_write_lock); + if (vcpu->arch.guest_tsc_protected) + return; + + if (user_set_tsc) + vcpu->kvm->arch.user_set_tsc = true; + /* * We also track th most recent recorded KHZ, write and time to * allow the matching interval to be extended at each write. @@ -2759,8 +2749,6 @@ static void kvm_synchronize_tsc(struct kvm_vcpu *vcpu, u64 *user_value) } } - if (user_value) - kvm->arch.user_set_tsc = true; /* * For a reliable TSC, we can match TSC offsets, and for an unstable @@ -2780,7 +2768,7 @@ static void kvm_synchronize_tsc(struct kvm_vcpu *vcpu, u64 *user_value) matched = true; } - __kvm_synchronize_tsc(vcpu, offset, data, ns, matched); + __kvm_synchronize_tsc(vcpu, offset, data, ns, matched, !!user_value); raw_spin_unlock_irqrestore(&kvm->arch.tsc_write_lock, flags); } @@ -3158,15 +3146,17 @@ u64 get_kvmclock_ns(struct kvm *kvm) return data.clock; } -static void kvm_setup_guest_pvclock(struct kvm_vcpu *v, +static void kvm_setup_guest_pvclock(struct pvclock_vcpu_time_info *ref_hv_clock, + struct kvm_vcpu *vcpu, struct gfn_to_pfn_cache *gpc, - unsigned int offset, - bool force_tsc_unstable) + unsigned int offset) { - struct kvm_vcpu_arch *vcpu = &v->arch; struct pvclock_vcpu_time_info *guest_hv_clock; + struct pvclock_vcpu_time_info hv_clock; unsigned long flags; + memcpy(&hv_clock, ref_hv_clock, sizeof(hv_clock)); + read_lock_irqsave(&gpc->lock, flags); while (!kvm_gpc_check(gpc, offset + sizeof(*guest_hv_clock))) { read_unlock_irqrestore(&gpc->lock, flags); @@ -3186,52 +3176,34 @@ static void kvm_setup_guest_pvclock(struct kvm_vcpu *v, * it is consistent. */ - guest_hv_clock->version = vcpu->hv_clock.version = (guest_hv_clock->version + 1) | 1; + guest_hv_clock->version = hv_clock.version = (guest_hv_clock->version + 1) | 1; smp_wmb(); /* retain PVCLOCK_GUEST_STOPPED if set in guest copy */ - vcpu->hv_clock.flags |= (guest_hv_clock->flags & PVCLOCK_GUEST_STOPPED); - - if (vcpu->pvclock_set_guest_stopped_request) { - vcpu->hv_clock.flags |= PVCLOCK_GUEST_STOPPED; - vcpu->pvclock_set_guest_stopped_request = false; - } + hv_clock.flags |= (guest_hv_clock->flags & PVCLOCK_GUEST_STOPPED); - memcpy(guest_hv_clock, &vcpu->hv_clock, sizeof(*guest_hv_clock)); - - if (force_tsc_unstable) - guest_hv_clock->flags &= ~PVCLOCK_TSC_STABLE_BIT; + memcpy(guest_hv_clock, &hv_clock, sizeof(*guest_hv_clock)); smp_wmb(); - guest_hv_clock->version = ++vcpu->hv_clock.version; + guest_hv_clock->version = ++hv_clock.version; kvm_gpc_mark_dirty_in_slot(gpc); read_unlock_irqrestore(&gpc->lock, flags); - trace_kvm_pvclock_update(v->vcpu_id, &vcpu->hv_clock); + trace_kvm_pvclock_update(vcpu->vcpu_id, &hv_clock); } -static int kvm_guest_time_update(struct kvm_vcpu *v) +int kvm_guest_time_update(struct kvm_vcpu *v) { + struct pvclock_vcpu_time_info hv_clock = {}; unsigned long flags, tgt_tsc_khz; unsigned seq; struct kvm_vcpu_arch *vcpu = &v->arch; struct kvm_arch *ka = &v->kvm->arch; s64 kernel_ns; u64 tsc_timestamp, host_tsc; - u8 pvclock_flags; bool use_master_clock; -#ifdef CONFIG_KVM_XEN - /* - * For Xen guests we may need to override PVCLOCK_TSC_STABLE_BIT as unless - * explicitly told to use TSC as its clocksource Xen will not set this bit. - * This default behaviour led to bugs in some guest kernels which cause - * problems if they observe PVCLOCK_TSC_STABLE_BIT in the pvclock flags. - */ - bool xen_pvclock_tsc_unstable = - ka->xen_hvm_config.flags & KVM_XEN_HVM_CONFIG_PVCLOCK_TSC_UNSTABLE; -#endif kernel_ns = 0; host_tsc = 0; @@ -3292,35 +3264,57 @@ static int kvm_guest_time_update(struct kvm_vcpu *v) if (unlikely(vcpu->hw_tsc_khz != tgt_tsc_khz)) { kvm_get_time_scale(NSEC_PER_SEC, tgt_tsc_khz * 1000LL, - &vcpu->hv_clock.tsc_shift, - &vcpu->hv_clock.tsc_to_system_mul); + &vcpu->pvclock_tsc_shift, + &vcpu->pvclock_tsc_mul); vcpu->hw_tsc_khz = tgt_tsc_khz; - kvm_xen_update_tsc_info(v); } - vcpu->hv_clock.tsc_timestamp = tsc_timestamp; - vcpu->hv_clock.system_time = kernel_ns + v->kvm->arch.kvmclock_offset; + hv_clock.tsc_shift = vcpu->pvclock_tsc_shift; + hv_clock.tsc_to_system_mul = vcpu->pvclock_tsc_mul; + hv_clock.tsc_timestamp = tsc_timestamp; + hv_clock.system_time = kernel_ns + v->kvm->arch.kvmclock_offset; vcpu->last_guest_tsc = tsc_timestamp; /* If the host uses TSC clocksource, then it is stable */ - pvclock_flags = 0; + hv_clock.flags = 0; if (use_master_clock) - pvclock_flags |= PVCLOCK_TSC_STABLE_BIT; + hv_clock.flags |= PVCLOCK_TSC_STABLE_BIT; + + if (vcpu->pv_time.active) { + /* + * GUEST_STOPPED is only supported by kvmclock, and KVM's + * historic behavior is to only process the request if kvmclock + * is active/enabled. + */ + if (vcpu->pvclock_set_guest_stopped_request) { + hv_clock.flags |= PVCLOCK_GUEST_STOPPED; + vcpu->pvclock_set_guest_stopped_request = false; + } + kvm_setup_guest_pvclock(&hv_clock, v, &vcpu->pv_time, 0); + + hv_clock.flags &= ~PVCLOCK_GUEST_STOPPED; + } - vcpu->hv_clock.flags = pvclock_flags; + kvm_hv_setup_tsc_page(v->kvm, &hv_clock); - if (vcpu->pv_time.active) - kvm_setup_guest_pvclock(v, &vcpu->pv_time, 0, false); #ifdef CONFIG_KVM_XEN + /* + * For Xen guests we may need to override PVCLOCK_TSC_STABLE_BIT as unless + * explicitly told to use TSC as its clocksource Xen will not set this bit. + * This default behaviour led to bugs in some guest kernels which cause + * problems if they observe PVCLOCK_TSC_STABLE_BIT in the pvclock flags. + * + * Note! Clear TSC_STABLE only for Xen clocks, i.e. the order matters! + */ + if (ka->xen.hvm_config.flags & KVM_XEN_HVM_CONFIG_PVCLOCK_TSC_UNSTABLE) + hv_clock.flags &= ~PVCLOCK_TSC_STABLE_BIT; + if (vcpu->xen.vcpu_info_cache.active) - kvm_setup_guest_pvclock(v, &vcpu->xen.vcpu_info_cache, - offsetof(struct compat_vcpu_info, time), - xen_pvclock_tsc_unstable); + kvm_setup_guest_pvclock(&hv_clock, v, &vcpu->xen.vcpu_info_cache, + offsetof(struct compat_vcpu_info, time)); if (vcpu->xen.vcpu_time_info_cache.active) - kvm_setup_guest_pvclock(v, &vcpu->xen.vcpu_time_info_cache, 0, - xen_pvclock_tsc_unstable); + kvm_setup_guest_pvclock(&hv_clock, v, &vcpu->xen.vcpu_time_info_cache, 0); #endif - kvm_hv_setup_tsc_page(v->kvm, &vcpu->hv_clock); return 0; } @@ -3470,7 +3464,7 @@ static bool is_mci_status_msr(u32 msr) static bool can_set_mci_status(struct kvm_vcpu *vcpu) { /* McStatusWrEn enabled? */ - if (guest_cpuid_is_amd_or_hygon(vcpu)) + if (guest_cpuid_is_amd_compatible(vcpu)) return !!(vcpu->arch.msr_hwcr & BIT_ULL(18)); return false; @@ -3586,7 +3580,7 @@ static int kvm_pv_enable_async_pf(struct kvm_vcpu *vcpu, u64 data) sizeof(u64))) return 1; - vcpu->arch.apf.send_user_only = !(data & KVM_ASYNC_PF_SEND_ALWAYS); + vcpu->arch.apf.send_always = (data & KVM_ASYNC_PF_SEND_ALWAYS); vcpu->arch.apf.delivery_as_pf_vmexit = data & KVM_ASYNC_PF_DELIVERY_AS_PF_VMEXIT; kvm_async_pf_wakeup_all(vcpu); @@ -3619,7 +3613,7 @@ static void kvmclock_reset(struct kvm_vcpu *vcpu) static void kvm_vcpu_flush_tlb_all(struct kvm_vcpu *vcpu) { ++vcpu->stat.tlb_flush; - static_call(kvm_x86_flush_tlb_all)(vcpu); + kvm_x86_call(flush_tlb_all)(vcpu); /* Flushing all ASIDs flushes the current ASID... */ kvm_clear_request(KVM_REQ_TLB_FLUSH_CURRENT, vcpu); @@ -3640,7 +3634,7 @@ static void kvm_vcpu_flush_tlb_guest(struct kvm_vcpu *vcpu) kvm_mmu_sync_prev_roots(vcpu); } - static_call(kvm_x86_flush_tlb_guest)(vcpu); + kvm_x86_call(flush_tlb_guest)(vcpu); /* * Flushing all "guest" TLB is always a superset of Hyper-V's fine @@ -3653,7 +3647,7 @@ static void kvm_vcpu_flush_tlb_guest(struct kvm_vcpu *vcpu) static inline void kvm_vcpu_flush_tlb_current(struct kvm_vcpu *vcpu) { ++vcpu->stat.tlb_flush; - static_call(kvm_x86_flush_tlb_current)(vcpu); + kvm_x86_call(flush_tlb_current)(vcpu); } /* @@ -3770,24 +3764,18 @@ static void record_steal_time(struct kvm_vcpu *vcpu) mark_page_dirty_in_slot(vcpu->kvm, ghc->memslot, gpa_to_gfn(ghc->gpa)); } -static bool kvm_is_msr_to_save(u32 msr_index) -{ - unsigned int i; - - for (i = 0; i < num_msrs_to_save; i++) { - if (msrs_to_save[i] == msr_index) - return true; - } - - return false; -} - int kvm_set_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info) { u32 msr = msr_info->index; u64 data = msr_info->data; - if (msr && msr == vcpu->kvm->arch.xen_hvm_config.msr) + /* + * Do not allow host-initiated writes to trigger the Xen hypercall + * page setup; it could incur locking paths which are not expected + * if userspace sets the MSR in an unusual location. + */ + if (kvm_xen_is_hypercall_page_msr(vcpu->kvm, msr) && + !msr_info->host_initiated) return kvm_xen_write_hypercall_page(vcpu, data); switch (msr) { @@ -3806,13 +3794,16 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info) vcpu->arch.microcode_version = data; break; case MSR_IA32_ARCH_CAPABILITIES: - if (!msr_info->host_initiated) - return 1; + if (!msr_info->host_initiated || + !guest_cpu_cap_has(vcpu, X86_FEATURE_ARCH_CAPABILITIES)) + return KVM_MSR_RET_UNSUPPORTED; vcpu->arch.arch_capabilities = data; break; case MSR_IA32_PERF_CAPABILITIES: - if (!msr_info->host_initiated) - return 1; + if (!msr_info->host_initiated || + !guest_cpu_cap_has(vcpu, X86_FEATURE_PDCM)) + return KVM_MSR_RET_UNSUPPORTED; + if (data & ~kvm_caps.supported_perf_cap) return 1; @@ -3834,11 +3825,11 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info) if ((!guest_has_pred_cmd_msr(vcpu))) return 1; - if (!guest_cpuid_has(vcpu, X86_FEATURE_SPEC_CTRL) && - !guest_cpuid_has(vcpu, X86_FEATURE_AMD_IBPB)) + if (!guest_cpu_cap_has(vcpu, X86_FEATURE_SPEC_CTRL) && + !guest_cpu_cap_has(vcpu, X86_FEATURE_AMD_IBPB)) reserved_bits |= PRED_CMD_IBPB; - if (!guest_cpuid_has(vcpu, X86_FEATURE_SBPB)) + if (!guest_cpu_cap_has(vcpu, X86_FEATURE_SBPB)) reserved_bits |= PRED_CMD_SBPB; } @@ -3854,12 +3845,12 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info) if (!data) break; - wrmsrl(MSR_IA32_PRED_CMD, data); + wrmsrq(MSR_IA32_PRED_CMD, data); break; } case MSR_IA32_FLUSH_CMD: if (!msr_info->host_initiated && - !guest_cpuid_has(vcpu, X86_FEATURE_FLUSH_L1D)) + !guest_cpu_cap_has(vcpu, X86_FEATURE_FLUSH_L1D)) return 1; if (!boot_cpu_has(X86_FEATURE_FLUSH_L1D) || (data & ~L1D_FLUSH)) @@ -3867,7 +3858,7 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info) if (!data) break; - wrmsrl(MSR_IA32_FLUSH_CMD, L1D_FLUSH); + wrmsrq(MSR_IA32_FLUSH_CMD, L1D_FLUSH); break; case MSR_EFER: return set_efer(vcpu, msr_info); @@ -3903,14 +3894,14 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info) case MSR_MTRRdefType: return kvm_mtrr_set_msr(vcpu, msr, data); case MSR_IA32_APICBASE: - return kvm_set_apic_base(vcpu, msr_info); + return kvm_apic_set_base(vcpu, data, msr_info->host_initiated); case APIC_BASE_MSR ... APIC_BASE_MSR + 0xff: return kvm_x2apic_msr_write(vcpu, msr, data); case MSR_IA32_TSC_DEADLINE: kvm_set_lapic_tscdeadline_msr(vcpu, data); break; case MSR_IA32_TSC_ADJUST: - if (guest_cpuid_has(vcpu, X86_FEATURE_TSC_ADJUST)) { + if (guest_cpu_cap_has(vcpu, X86_FEATURE_TSC_ADJUST)) { if (!msr_info->host_initiated) { s64 adj = data - vcpu->arch.ia32_tsc_adjust_msr; adjust_tsc_offset_guest(vcpu, adj); @@ -3937,10 +3928,10 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info) if (!kvm_check_has_quirk(vcpu->kvm, KVM_X86_QUIRK_MISC_ENABLE_NO_MWAIT) && ((old_val ^ data) & MSR_IA32_MISC_ENABLE_MWAIT)) { - if (!guest_cpuid_has(vcpu, X86_FEATURE_XMM3)) + if (!guest_cpu_cap_has(vcpu, X86_FEATURE_XMM3)) return 1; vcpu->arch.ia32_misc_enable_msr = data; - kvm_update_cpuid_runtime(vcpu); + vcpu->arch.cpuid_dynamic_bits_dirty = true; } else { vcpu->arch.ia32_misc_enable_msr = data; } @@ -3957,7 +3948,7 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info) case MSR_IA32_TSC: if (msr_info->host_initiated) { kvm_synchronize_tsc(vcpu, &data); - } else { + } else if (!vcpu->arch.guest_tsc_protected) { u64 adj = kvm_compute_l1_tsc_offset(vcpu, data) - vcpu->arch.l1_tsc_offset; adjust_tsc_offset_guest(vcpu, adj); vcpu->arch.ia32_tsc_adjust_msr += adj; @@ -3975,7 +3966,7 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info) if (data & ~kvm_caps.supported_xss) return 1; vcpu->arch.ia32_xss = data; - kvm_update_cpuid_runtime(vcpu); + vcpu->arch.cpuid_dynamic_bits_dirty = true; break; case MSR_SMI_COUNT: if (!msr_info->host_initiated) @@ -4114,19 +4105,17 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info) kvm_pr_unimpl_wrmsr(vcpu, msr, data); break; case MSR_AMD64_OSVW_ID_LENGTH: - if (!guest_cpuid_has(vcpu, X86_FEATURE_OSVW)) + if (!guest_cpu_cap_has(vcpu, X86_FEATURE_OSVW)) return 1; vcpu->arch.osvw.length = data; break; case MSR_AMD64_OSVW_STATUS: - if (!guest_cpuid_has(vcpu, X86_FEATURE_OSVW)) + if (!guest_cpu_cap_has(vcpu, X86_FEATURE_OSVW)) return 1; vcpu->arch.osvw.status = data; break; case MSR_PLATFORM_INFO: - if (!msr_info->host_initiated || - (!(data & MSR_PLATFORM_INFO_CPUID_FAULT) && - cpuid_fault_enabled(vcpu))) + if (!msr_info->host_initiated) return 1; vcpu->arch.msr_platform_info = data; break; @@ -4140,7 +4129,7 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info) #ifdef CONFIG_X86_64 case MSR_IA32_XFD: if (!msr_info->host_initiated && - !guest_cpuid_has(vcpu, X86_FEATURE_XFD)) + !guest_cpu_cap_has(vcpu, X86_FEATURE_XFD)) return 1; if (data & ~kvm_guest_supported_xfd(vcpu)) @@ -4150,7 +4139,7 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info) break; case MSR_IA32_XFD_ERR: if (!msr_info->host_initiated && - !guest_cpuid_has(vcpu, X86_FEATURE_XFD)) + !guest_cpu_cap_has(vcpu, X86_FEATURE_XFD)) return 1; if (data & ~kvm_guest_supported_xfd(vcpu)) @@ -4163,15 +4152,7 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info) if (kvm_pmu_is_valid_msr(vcpu, msr)) return kvm_pmu_set_msr(vcpu, msr_info); - /* - * Userspace is allowed to write '0' to MSRs that KVM reports - * as to-be-saved, even if an MSRs isn't fully supported. - */ - if (msr_info->host_initiated && !data && - kvm_is_msr_to_save(msr)) - break; - - return KVM_MSR_RET_INVALID; + return KVM_MSR_RET_UNSUPPORTED; } return 0; } @@ -4273,15 +4254,13 @@ int kvm_get_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info) msr_info->data = vcpu->arch.microcode_version; break; case MSR_IA32_ARCH_CAPABILITIES: - if (!msr_info->host_initiated && - !guest_cpuid_has(vcpu, X86_FEATURE_ARCH_CAPABILITIES)) - return 1; + if (!guest_cpu_cap_has(vcpu, X86_FEATURE_ARCH_CAPABILITIES)) + return KVM_MSR_RET_UNSUPPORTED; msr_info->data = vcpu->arch.arch_capabilities; break; case MSR_IA32_PERF_CAPABILITIES: - if (!msr_info->host_initiated && - !guest_cpuid_has(vcpu, X86_FEATURE_PDCM)) - return 1; + if (!guest_cpu_cap_has(vcpu, X86_FEATURE_PDCM)) + return KVM_MSR_RET_UNSUPPORTED; msr_info->data = vcpu->arch.perf_capabilities; break; case MSR_IA32_POWER_CTL: @@ -4335,7 +4314,7 @@ int kvm_get_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info) msr_info->data = 1 << 24; break; case MSR_IA32_APICBASE: - msr_info->data = kvm_get_apic_base(vcpu); + msr_info->data = vcpu->arch.apic_base; break; case APIC_BASE_MSR ... APIC_BASE_MSR + 0xff: return kvm_x2apic_msr_read(vcpu, msr_info->index, &msr_info->data); @@ -4481,12 +4460,12 @@ int kvm_get_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info) msr_info->data = 0xbe702111; break; case MSR_AMD64_OSVW_ID_LENGTH: - if (!guest_cpuid_has(vcpu, X86_FEATURE_OSVW)) + if (!guest_cpu_cap_has(vcpu, X86_FEATURE_OSVW)) return 1; msr_info->data = vcpu->arch.osvw.length; break; case MSR_AMD64_OSVW_STATUS: - if (!guest_cpuid_has(vcpu, X86_FEATURE_OSVW)) + if (!guest_cpu_cap_has(vcpu, X86_FEATURE_OSVW)) return 1; msr_info->data = vcpu->arch.osvw.status; break; @@ -4505,14 +4484,14 @@ int kvm_get_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info) #ifdef CONFIG_X86_64 case MSR_IA32_XFD: if (!msr_info->host_initiated && - !guest_cpuid_has(vcpu, X86_FEATURE_XFD)) + !guest_cpu_cap_has(vcpu, X86_FEATURE_XFD)) return 1; msr_info->data = vcpu->arch.guest_fpu.fpstate->xfd; break; case MSR_IA32_XFD_ERR: if (!msr_info->host_initiated && - !guest_cpuid_has(vcpu, X86_FEATURE_XFD)) + !guest_cpu_cap_has(vcpu, X86_FEATURE_XFD)) return 1; msr_info->data = vcpu->arch.guest_fpu.xfd_err; @@ -4522,17 +4501,7 @@ int kvm_get_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info) if (kvm_pmu_is_valid_msr(vcpu, msr_info->index)) return kvm_pmu_get_msr(vcpu, msr_info); - /* - * Userspace is allowed to read MSRs that KVM reports as - * to-be-saved, even if an MSR isn't fully supported. - */ - if (msr_info->host_initiated && - kvm_is_msr_to_save(msr_info->index)) { - msr_info->data = 0; - break; - } - - return KVM_MSR_RET_INVALID; + return KVM_MSR_RET_UNSUPPORTED; } return 0; } @@ -4604,6 +4573,20 @@ static inline bool kvm_can_mwait_in_guest(void) boot_cpu_has(X86_FEATURE_ARAT); } +static u64 kvm_get_allowed_disable_exits(void) +{ + u64 r = KVM_X86_DISABLE_EXITS_PAUSE; + + if (!mitigate_smt_rsb) { + r |= KVM_X86_DISABLE_EXITS_HLT | + KVM_X86_DISABLE_EXITS_CSTATE; + + if (kvm_can_mwait_in_guest()) + r |= KVM_X86_DISABLE_EXITS_MWAIT; + } + return r; +} + #ifdef CONFIG_KVM_HYPERV static int kvm_ioctl_get_supported_hv_cpuid(struct kvm_vcpu *vcpu, struct kvm_cpuid2 __user *cpuid_arg) @@ -4629,9 +4612,12 @@ static int kvm_ioctl_get_supported_hv_cpuid(struct kvm_vcpu *vcpu, static bool kvm_is_vm_type_supported(unsigned long type) { - return type == KVM_X86_DEFAULT_VM || - (type == KVM_X86_SW_PROTECTED_VM && - IS_ENABLED(CONFIG_KVM_SW_PROTECTED_VM) && tdp_mmu_enabled); + return type < 32 && (kvm_caps.supported_vm_types & BIT(type)); +} + +static inline u64 kvm_sync_valid_fields(struct kvm *kvm) +{ + return kvm && kvm->arch.has_protected_state ? 0 : KVM_SYNC_X86_VALID_FIELDS; } int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext) @@ -4682,7 +4668,6 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext) case KVM_CAP_ASYNC_PF_INT: case KVM_CAP_GET_TSC_KHZ: case KVM_CAP_KVMCLOCK_CTRL: - case KVM_CAP_READONLY_MEM: case KVM_CAP_IOAPIC_POLARITY_IGNORED: case KVM_CAP_TSC_DEADLINE_TIMER: case KVM_CAP_DISABLE_QUIRKS: @@ -4714,8 +4699,15 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext) case KVM_CAP_VM_DISABLE_NX_HUGE_PAGES: case KVM_CAP_IRQFD_RESAMPLE: case KVM_CAP_MEMORY_FAULT_INFO: + case KVM_CAP_X86_GUEST_MODE: r = 1; break; + case KVM_CAP_PRE_FAULT_MEMORY: + r = tdp_enabled; + break; + case KVM_CAP_X86_APIC_BUS_CYCLES_NS: + r = APIC_BUS_CYCLE_NS_DEFAULT; + break; case KVM_CAP_EXIT_HYPERCALL: r = KVM_EXIT_HYPERCALL_VALID_MASK; break; @@ -4736,21 +4728,13 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext) break; #endif case KVM_CAP_SYNC_REGS: - r = KVM_SYNC_X86_VALID_FIELDS; + r = kvm_sync_valid_fields(kvm); break; case KVM_CAP_ADJUST_CLOCK: r = KVM_CLOCK_VALID_FLAGS; break; case KVM_CAP_X86_DISABLE_EXITS: - r = KVM_X86_DISABLE_EXITS_PAUSE; - - if (!mitigate_smt_rsb) { - r |= KVM_X86_DISABLE_EXITS_HLT | - KVM_X86_DISABLE_EXITS_CSTATE; - - if (kvm_can_mwait_in_guest()) - r |= KVM_X86_DISABLE_EXITS_MWAIT; - } + r = kvm_get_allowed_disable_exits(); break; case KVM_CAP_X86_SMM: if (!IS_ENABLED(CONFIG_KVM_SMM)) @@ -4764,13 +4748,15 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext) * fringe case that is not enabled except via specific settings * of the module parameters. */ - r = static_call(kvm_x86_has_emulated_msr)(kvm, MSR_IA32_SMBASE); + r = kvm_x86_call(has_emulated_msr)(kvm, MSR_IA32_SMBASE); break; case KVM_CAP_NR_VCPUS: r = min_t(unsigned int, num_online_cpus(), KVM_MAX_VCPUS); break; case KVM_CAP_MAX_VCPUS: r = KVM_MAX_VCPUS; + if (kvm) + r = kvm->max_vcpus; break; case KVM_CAP_MAX_VCPU_ID: r = KVM_MAX_VCPU_IDS; @@ -4826,15 +4812,16 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext) r = enable_pmu ? KVM_CAP_PMU_VALID_MASK : 0; break; case KVM_CAP_DISABLE_QUIRKS2: - r = KVM_X86_VALID_QUIRKS; + r = kvm_caps.supported_quirks; break; case KVM_CAP_X86_NOTIFY_VMEXIT: r = kvm_caps.has_notify_vmexit; break; case KVM_CAP_VM_TYPES: - r = BIT(KVM_X86_DEFAULT_VM); - if (kvm_is_vm_type_supported(KVM_X86_SW_PROTECTED_VM)) - r |= BIT(KVM_X86_SW_PROTECTED_VM); + r = kvm_caps.supported_vm_types; + break; + case KVM_CAP_READONLY_MEM: + r = kvm ? kvm_arch_has_readonly_mem(kvm) : 1; break; default: break; @@ -4842,46 +4829,44 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext) return r; } -static inline void __user *kvm_get_attr_addr(struct kvm_device_attr *attr) +static int __kvm_x86_dev_get_attr(struct kvm_device_attr *attr, u64 *val) { - void __user *uaddr = (void __user*)(unsigned long)attr->addr; - - if ((u64)(unsigned long)uaddr != attr->addr) - return ERR_PTR_USR(-EFAULT); - return uaddr; -} - -static int kvm_x86_dev_get_attr(struct kvm_device_attr *attr) -{ - u64 __user *uaddr = kvm_get_attr_addr(attr); - - if (attr->group) + if (attr->group) { + if (kvm_x86_ops.dev_get_attr) + return kvm_x86_call(dev_get_attr)(attr->group, attr->attr, val); return -ENXIO; - - if (IS_ERR(uaddr)) - return PTR_ERR(uaddr); + } switch (attr->attr) { case KVM_X86_XCOMP_GUEST_SUPP: - if (put_user(kvm_caps.supported_xcr0, uaddr)) - return -EFAULT; + *val = kvm_caps.supported_xcr0; return 0; default: return -ENXIO; } } +static int kvm_x86_dev_get_attr(struct kvm_device_attr *attr) +{ + u64 __user *uaddr = u64_to_user_ptr(attr->addr); + int r; + u64 val; + + r = __kvm_x86_dev_get_attr(attr, &val); + if (r < 0) + return r; + + if (put_user(val, uaddr)) + return -EFAULT; + + return 0; +} + static int kvm_x86_dev_has_attr(struct kvm_device_attr *attr) { - if (attr->group) - return -ENXIO; + u64 val; - switch (attr->attr) { - case KVM_X86_XCOMP_GUEST_SUPP: - return 0; - default: - return -ENXIO; - } + return __kvm_x86_dev_get_attr(attr, &val); } long kvm_arch_dev_ioctl(struct file *filp, @@ -4967,7 +4952,7 @@ long kvm_arch_dev_ioctl(struct file *filp, break; } case KVM_GET_MSRS: - r = msr_io(NULL, argp, do_get_msr_feature, 1); + r = msr_io(NULL, argp, do_get_feature_msr, 1); break; #ifdef CONFIG_KVM_HYPERV case KVM_GET_SUPPORTED_HV_CPUID: @@ -5008,18 +4993,42 @@ static bool need_emulate_wbinvd(struct kvm_vcpu *vcpu) return kvm_arch_has_noncoherent_dma(vcpu->kvm); } +static DEFINE_PER_CPU(struct kvm_vcpu *, last_vcpu); + void kvm_arch_vcpu_load(struct kvm_vcpu *vcpu, int cpu) { + struct kvm_pmu *pmu = vcpu_to_pmu(vcpu); + + vcpu->arch.l1tf_flush_l1d = true; + + if (vcpu->scheduled_out && pmu->version && pmu->event_count) { + pmu->need_cleanup = true; + kvm_make_request(KVM_REQ_PMU, vcpu); + } + /* Address WBINVD may be executed by guest */ if (need_emulate_wbinvd(vcpu)) { - if (static_call(kvm_x86_has_wbinvd_exit)()) + if (kvm_x86_call(has_wbinvd_exit)()) cpumask_set_cpu(cpu, vcpu->arch.wbinvd_dirty_mask); else if (vcpu->cpu != -1 && vcpu->cpu != cpu) smp_call_function_single(vcpu->cpu, wbinvd_ipi, NULL, 1); } - static_call(kvm_x86_vcpu_load)(vcpu, cpu); + kvm_x86_call(vcpu_load)(vcpu, cpu); + + if (vcpu != per_cpu(last_vcpu, cpu)) { + /* + * Flush the branch predictor when switching vCPUs on the same + * physical CPU, as each vCPU needs its own branch prediction + * domain. No IBPB is needed when switching between L1 and L2 + * on the same vCPU unless IBRS is advertised to the vCPU; that + * is handled on the nested VM-Exit path. + */ + if (static_branch_likely(&switch_vcpu_ibpb)) + indirect_branch_prediction_barrier(); + per_cpu(last_vcpu, cpu) = vcpu; + } /* Save host pkru register if supported */ vcpu->arch.host_pkru = read_pkru(); @@ -5041,7 +5050,8 @@ void kvm_arch_vcpu_load(struct kvm_vcpu *vcpu, int cpu) u64 offset = kvm_compute_l1_tsc_offset(vcpu, vcpu->arch.last_guest_tsc); kvm_vcpu_write_tsc_offset(vcpu, offset); - vcpu->arch.tsc_catchup = 1; + if (!vcpu->arch.guest_tsc_protected) + vcpu->arch.tsc_catchup = 1; } if (kvm_lapic_hv_timer_in_use(vcpu)) @@ -5113,7 +5123,13 @@ void kvm_arch_vcpu_put(struct kvm_vcpu *vcpu) int idx; if (vcpu->preempted) { - vcpu->arch.preempted_in_kernel = kvm_arch_vcpu_in_kernel(vcpu); + /* + * Assume protected guests are in-kernel. Inefficient yielding + * due to false positives is preferable to never yielding due + * to false negatives. + */ + vcpu->arch.preempted_in_kernel = vcpu->arch.guest_state_protected || + !kvm_x86_call(get_cpl_no_cache)(vcpu); /* * Take the srcu lock as memslots will be accessed to check the gfn @@ -5127,14 +5143,17 @@ void kvm_arch_vcpu_put(struct kvm_vcpu *vcpu) srcu_read_unlock(&vcpu->kvm->srcu, idx); } - static_call(kvm_x86_vcpu_put)(vcpu); + kvm_x86_call(vcpu_put)(vcpu); vcpu->arch.last_host_tsc = rdtsc(); } static int kvm_vcpu_ioctl_get_lapic(struct kvm_vcpu *vcpu, struct kvm_lapic_state *s) { - static_call_cond(kvm_x86_sync_pir_to_irr)(vcpu); + if (vcpu->arch.apic->guest_apic_protected) + return -EINVAL; + + kvm_x86_call(sync_pir_to_irr)(vcpu); return kvm_apic_get_state(vcpu, s); } @@ -5144,6 +5163,9 @@ static int kvm_vcpu_ioctl_set_lapic(struct kvm_vcpu *vcpu, { int r; + if (vcpu->arch.apic->guest_apic_protected) + return -EINVAL; + r = kvm_apic_set_state(vcpu, s); if (r) return r; @@ -5251,7 +5273,7 @@ static int kvm_vcpu_ioctl_x86_setup_mce(struct kvm_vcpu *vcpu, kvm_apic_after_set_mcg_cap(vcpu); - static_call(kvm_x86_setup_mce)(vcpu); + kvm_x86_call(setup_mce)(vcpu); out: return r; } @@ -5411,11 +5433,11 @@ static void kvm_vcpu_ioctl_x86_get_vcpu_events(struct kvm_vcpu *vcpu, events->interrupt.injected = vcpu->arch.interrupt.injected && !vcpu->arch.interrupt.soft; events->interrupt.nr = vcpu->arch.interrupt.nr; - events->interrupt.shadow = static_call(kvm_x86_get_interrupt_shadow)(vcpu); + events->interrupt.shadow = kvm_x86_call(get_interrupt_shadow)(vcpu); events->nmi.injected = vcpu->arch.nmi_injected; events->nmi.pending = kvm_get_nr_pending_nmis(vcpu); - events->nmi.masked = static_call(kvm_x86_get_nmi_mask)(vcpu); + events->nmi.masked = kvm_x86_call(get_nmi_mask)(vcpu); /* events->sipi_vector is never valid when reporting to user space */ @@ -5497,8 +5519,8 @@ static int kvm_vcpu_ioctl_x86_set_vcpu_events(struct kvm_vcpu *vcpu, vcpu->arch.interrupt.nr = events->interrupt.nr; vcpu->arch.interrupt.soft = events->interrupt.soft; if (events->flags & KVM_VCPUEVENT_VALID_SHADOW) - static_call(kvm_x86_set_interrupt_shadow)(vcpu, - events->interrupt.shadow); + kvm_x86_call(set_interrupt_shadow)(vcpu, + events->interrupt.shadow); vcpu->arch.nmi_injected = events->nmi.injected; if (events->flags & KVM_VCPUEVENT_VALID_NMI_PENDING) { @@ -5507,7 +5529,7 @@ static int kvm_vcpu_ioctl_x86_set_vcpu_events(struct kvm_vcpu *vcpu, if (events->nmi.pending) kvm_make_request(KVM_REQ_NMI, vcpu); } - static_call(kvm_x86_set_nmi_mask)(vcpu, events->nmi.masked); + kvm_x86_call(set_nmi_mask)(vcpu, events->nmi.masked); if (events->flags & KVM_VCPUEVENT_VALID_SIPI_VECTOR && lapic_in_kernel(vcpu)) @@ -5557,11 +5579,15 @@ static int kvm_vcpu_ioctl_x86_set_vcpu_events(struct kvm_vcpu *vcpu, return 0; } -static void kvm_vcpu_ioctl_x86_get_debugregs(struct kvm_vcpu *vcpu, - struct kvm_debugregs *dbgregs) +static int kvm_vcpu_ioctl_x86_get_debugregs(struct kvm_vcpu *vcpu, + struct kvm_debugregs *dbgregs) { unsigned int i; + if (vcpu->kvm->arch.has_protected_state && + vcpu->arch.guest_state_protected) + return -EINVAL; + memset(dbgregs, 0, sizeof(*dbgregs)); BUILD_BUG_ON(ARRAY_SIZE(vcpu->arch.db) != ARRAY_SIZE(dbgregs->db)); @@ -5570,6 +5596,7 @@ static void kvm_vcpu_ioctl_x86_get_debugregs(struct kvm_vcpu *vcpu, dbgregs->dr6 = vcpu->arch.dr6; dbgregs->dr7 = vcpu->arch.dr7; + return 0; } static int kvm_vcpu_ioctl_x86_set_debugregs(struct kvm_vcpu *vcpu, @@ -5577,6 +5604,10 @@ static int kvm_vcpu_ioctl_x86_set_debugregs(struct kvm_vcpu *vcpu, { unsigned int i; + if (vcpu->kvm->arch.has_protected_state && + vcpu->arch.guest_state_protected) + return -EINVAL; + if (dbgregs->flags) return -EINVAL; @@ -5597,8 +5628,8 @@ static int kvm_vcpu_ioctl_x86_set_debugregs(struct kvm_vcpu *vcpu, } -static void kvm_vcpu_ioctl_x86_get_xsave2(struct kvm_vcpu *vcpu, - u8 *state, unsigned int size) +static int kvm_vcpu_ioctl_x86_get_xsave2(struct kvm_vcpu *vcpu, + u8 *state, unsigned int size) { /* * Only copy state for features that are enabled for the guest. The @@ -5616,24 +5647,25 @@ static void kvm_vcpu_ioctl_x86_get_xsave2(struct kvm_vcpu *vcpu, XFEATURE_MASK_FPSSE; if (fpstate_is_confidential(&vcpu->arch.guest_fpu)) - return; + return vcpu->kvm->arch.has_protected_state ? -EINVAL : 0; fpu_copy_guest_fpstate_to_uabi(&vcpu->arch.guest_fpu, state, size, supported_xcr0, vcpu->arch.pkru); + return 0; } -static void kvm_vcpu_ioctl_x86_get_xsave(struct kvm_vcpu *vcpu, - struct kvm_xsave *guest_xsave) +static int kvm_vcpu_ioctl_x86_get_xsave(struct kvm_vcpu *vcpu, + struct kvm_xsave *guest_xsave) { - kvm_vcpu_ioctl_x86_get_xsave2(vcpu, (void *)guest_xsave->region, - sizeof(guest_xsave->region)); + return kvm_vcpu_ioctl_x86_get_xsave2(vcpu, (void *)guest_xsave->region, + sizeof(guest_xsave->region)); } static int kvm_vcpu_ioctl_x86_set_xsave(struct kvm_vcpu *vcpu, struct kvm_xsave *guest_xsave) { if (fpstate_is_confidential(&vcpu->arch.guest_fpu)) - return 0; + return vcpu->kvm->arch.has_protected_state ? -EINVAL : 0; return fpu_copy_uabi_to_guest_fpstate(&vcpu->arch.guest_fpu, guest_xsave->region, @@ -5641,18 +5673,23 @@ static int kvm_vcpu_ioctl_x86_set_xsave(struct kvm_vcpu *vcpu, &vcpu->arch.pkru); } -static void kvm_vcpu_ioctl_x86_get_xcrs(struct kvm_vcpu *vcpu, - struct kvm_xcrs *guest_xcrs) +static int kvm_vcpu_ioctl_x86_get_xcrs(struct kvm_vcpu *vcpu, + struct kvm_xcrs *guest_xcrs) { + if (vcpu->kvm->arch.has_protected_state && + vcpu->arch.guest_state_protected) + return -EINVAL; + if (!boot_cpu_has(X86_FEATURE_XSAVE)) { guest_xcrs->nr_xcrs = 0; - return; + return 0; } guest_xcrs->nr_xcrs = 1; guest_xcrs->flags = 0; guest_xcrs->xcrs[0].xcr = XCR_XFEATURE_ENABLED_MASK; guest_xcrs->xcrs[0].value = vcpu->arch.xcr0; + return 0; } static int kvm_vcpu_ioctl_x86_set_xcrs(struct kvm_vcpu *vcpu, @@ -5660,6 +5697,10 @@ static int kvm_vcpu_ioctl_x86_set_xcrs(struct kvm_vcpu *vcpu, { int i, r = 0; + if (vcpu->kvm->arch.has_protected_state && + vcpu->arch.guest_state_protected) + return -EINVAL; + if (!boot_cpu_has(X86_FEATURE_XSAVE)) return -EINVAL; @@ -5712,12 +5753,9 @@ static int kvm_arch_tsc_has_attr(struct kvm_vcpu *vcpu, static int kvm_arch_tsc_get_attr(struct kvm_vcpu *vcpu, struct kvm_device_attr *attr) { - u64 __user *uaddr = kvm_get_attr_addr(attr); + u64 __user *uaddr = u64_to_user_ptr(attr->addr); int r; - if (IS_ERR(uaddr)) - return PTR_ERR(uaddr); - switch (attr->attr) { case KVM_VCPU_TSC_OFFSET: r = -EFAULT; @@ -5735,13 +5773,10 @@ static int kvm_arch_tsc_get_attr(struct kvm_vcpu *vcpu, static int kvm_arch_tsc_set_attr(struct kvm_vcpu *vcpu, struct kvm_device_attr *attr) { - u64 __user *uaddr = kvm_get_attr_addr(attr); + u64 __user *uaddr = u64_to_user_ptr(attr->addr); struct kvm *kvm = vcpu->kvm; int r; - if (IS_ERR(uaddr)) - return PTR_ERR(uaddr); - switch (attr->attr) { case KVM_VCPU_TSC_OFFSET: { u64 offset, tsc, ns; @@ -5761,8 +5796,7 @@ static int kvm_arch_tsc_set_attr(struct kvm_vcpu *vcpu, tsc = kvm_scale_tsc(rdtsc(), vcpu->arch.l1_tsc_scaling_ratio) + offset; ns = get_kvmclock_base_ns(); - kvm->arch.user_set_tsc = true; - __kvm_synchronize_tsc(vcpu, offset, tsc, ns, matched); + __kvm_synchronize_tsc(vcpu, offset, tsc, ns, matched, true); raw_spin_unlock_irqrestore(&kvm->arch.tsc_write_lock, flags); r = 0; @@ -5842,7 +5876,7 @@ static int kvm_vcpu_ioctl_enable_cap(struct kvm_vcpu *vcpu, if (!kvm_x86_ops.enable_l2_tlb_flush) return -ENOTTY; - return static_call(kvm_x86_enable_l2_tlb_flush)(vcpu); + return kvm_x86_call(enable_l2_tlb_flush)(vcpu); case KVM_CAP_HYPERV_ENFORCE_CPUID: return kvm_hv_set_enforce_cpuid(vcpu, cap->args[0]); @@ -5850,9 +5884,6 @@ static int kvm_vcpu_ioctl_enable_cap(struct kvm_vcpu *vcpu, case KVM_CAP_ENFORCE_PV_FEATURE_CPUID: vcpu->arch.pv_cpuid.enforce = cap->args[0]; - if (vcpu->arch.pv_cpuid.enforce) - kvm_update_pv_runtime(vcpu); - return 0; default: return -EINVAL; @@ -5881,8 +5912,7 @@ long kvm_arch_vcpu_ioctl(struct file *filp, r = -EINVAL; if (!lapic_in_kernel(vcpu)) goto out; - u.lapic = kzalloc(sizeof(struct kvm_lapic_state), - GFP_KERNEL_ACCOUNT); + u.lapic = kzalloc(sizeof(struct kvm_lapic_state), GFP_KERNEL); r = -ENOMEM; if (!u.lapic) @@ -6042,13 +6072,17 @@ long kvm_arch_vcpu_ioctl(struct file *filp, if (copy_from_user(&events, argp, sizeof(struct kvm_vcpu_events))) break; + kvm_vcpu_srcu_read_lock(vcpu); r = kvm_vcpu_ioctl_x86_set_vcpu_events(vcpu, &events); + kvm_vcpu_srcu_read_unlock(vcpu); break; } case KVM_GET_DEBUGREGS: { struct kvm_debugregs dbgregs; - kvm_vcpu_ioctl_x86_get_debugregs(vcpu, &dbgregs); + r = kvm_vcpu_ioctl_x86_get_debugregs(vcpu, &dbgregs); + if (r < 0) + break; r = -EFAULT; if (copy_to_user(argp, &dbgregs, @@ -6073,12 +6107,14 @@ long kvm_arch_vcpu_ioctl(struct file *filp, if (vcpu->arch.guest_fpu.uabi_size > sizeof(struct kvm_xsave)) break; - u.xsave = kzalloc(sizeof(struct kvm_xsave), GFP_KERNEL_ACCOUNT); + u.xsave = kzalloc(sizeof(struct kvm_xsave), GFP_KERNEL); r = -ENOMEM; if (!u.xsave) break; - kvm_vcpu_ioctl_x86_get_xsave(vcpu, u.xsave); + r = kvm_vcpu_ioctl_x86_get_xsave(vcpu, u.xsave); + if (r < 0) + break; r = -EFAULT; if (copy_to_user(argp, u.xsave, sizeof(struct kvm_xsave))) @@ -6102,12 +6138,14 @@ long kvm_arch_vcpu_ioctl(struct file *filp, case KVM_GET_XSAVE2: { int size = vcpu->arch.guest_fpu.uabi_size; - u.xsave = kzalloc(size, GFP_KERNEL_ACCOUNT); + u.xsave = kzalloc(size, GFP_KERNEL); r = -ENOMEM; if (!u.xsave) break; - kvm_vcpu_ioctl_x86_get_xsave2(vcpu, u.buffer, size); + r = kvm_vcpu_ioctl_x86_get_xsave2(vcpu, u.buffer, size); + if (r < 0) + break; r = -EFAULT; if (copy_to_user(argp, u.xsave, size)) @@ -6118,12 +6156,14 @@ long kvm_arch_vcpu_ioctl(struct file *filp, } case KVM_GET_XCRS: { - u.xcrs = kzalloc(sizeof(struct kvm_xcrs), GFP_KERNEL_ACCOUNT); + u.xcrs = kzalloc(sizeof(struct kvm_xcrs), GFP_KERNEL); r = -ENOMEM; if (!u.xcrs) break; - kvm_vcpu_ioctl_x86_get_xcrs(vcpu, u.xcrs); + r = kvm_vcpu_ioctl_x86_get_xcrs(vcpu, u.xcrs); + if (r < 0) + break; r = -EFAULT; if (copy_to_user(argp, u.xcrs, @@ -6267,6 +6307,11 @@ long kvm_arch_vcpu_ioctl(struct file *filp, } #endif case KVM_GET_SREGS2: { + r = -EINVAL; + if (vcpu->kvm->arch.has_protected_state && + vcpu->arch.guest_state_protected) + goto out; + u.sregs2 = kzalloc(sizeof(struct kvm_sregs2), GFP_KERNEL); r = -ENOMEM; if (!u.sregs2) @@ -6279,6 +6324,11 @@ long kvm_arch_vcpu_ioctl(struct file *filp, break; } case KVM_SET_SREGS2: { + r = -EINVAL; + if (vcpu->kvm->arch.has_protected_state && + vcpu->arch.guest_state_protected) + goto out; + u.sregs2 = memdup_user(argp, sizeof(struct kvm_sregs2)); if (IS_ERR(u.sregs2)) { r = PTR_ERR(u.sregs2); @@ -6293,6 +6343,12 @@ long kvm_arch_vcpu_ioctl(struct file *filp, case KVM_SET_DEVICE_ATTR: r = kvm_vcpu_ioctl_device_attr(vcpu, ioctl, argp); break; + case KVM_MEMORY_ENCRYPT_OP: + r = -ENOTTY; + if (!kvm_x86_ops.vcpu_mem_enc_ioctl) + goto out; + r = kvm_x86_ops.vcpu_mem_enc_ioctl(vcpu, argp); + break; default: r = -EINVAL; } @@ -6314,14 +6370,14 @@ static int kvm_vm_ioctl_set_tss_addr(struct kvm *kvm, unsigned long addr) if (addr > (unsigned int)(-3 * PAGE_SIZE)) return -EINVAL; - ret = static_call(kvm_x86_set_tss_addr)(kvm, addr); + ret = kvm_x86_call(set_tss_addr)(kvm, addr); return ret; } static int kvm_vm_ioctl_set_identity_map_addr(struct kvm *kvm, u64 ident_addr) { - return static_call(kvm_x86_set_identity_map_addr)(kvm, ident_addr); + return kvm_x86_call(set_identity_map_addr)(kvm, ident_addr); } static int kvm_vm_ioctl_set_nr_mmu_pages(struct kvm *kvm, @@ -6480,7 +6536,7 @@ void kvm_arch_sync_dirty_log(struct kvm *kvm, struct kvm_memory_slot *memslot) struct kvm_vcpu *vcpu; unsigned long i; - if (!kvm_x86_ops.cpu_dirty_log_size) + if (!kvm->arch.cpu_dirty_log_size) return; kvm_for_each_vcpu(i, vcpu, kvm) @@ -6510,11 +6566,11 @@ int kvm_vm_ioctl_enable_cap(struct kvm *kvm, switch (cap->cap) { case KVM_CAP_DISABLE_QUIRKS2: r = -EINVAL; - if (cap->args[0] & ~KVM_X86_VALID_QUIRKS) + if (cap->args[0] & ~kvm_caps.supported_quirks) break; fallthrough; case KVM_CAP_DISABLE_QUIRKS: - kvm->arch.disabled_quirks = cap->args[0]; + kvm->arch.disabled_quirks |= cap->args[0] & kvm_caps.supported_quirks; r = 0; break; case KVM_CAP_SPLIT_IRQCHIP: { @@ -6527,9 +6583,6 @@ int kvm_vm_ioctl_enable_cap(struct kvm *kvm, goto split_irqchip_unlock; if (kvm->created_vcpus) goto split_irqchip_unlock; - r = kvm_setup_empty_irq_routing(kvm); - if (r) - goto split_irqchip_unlock; /* Pairs with irqchip_in_kernel. */ smp_wmb(); kvm->arch.irqchip_mode = KVM_IRQCHIP_SPLIT; @@ -6554,30 +6607,32 @@ split_irqchip_unlock: break; case KVM_CAP_X86_DISABLE_EXITS: r = -EINVAL; - if (cap->args[0] & ~KVM_X86_DISABLE_VALID_EXITS) + if (cap->args[0] & ~kvm_get_allowed_disable_exits()) break; - if (cap->args[0] & KVM_X86_DISABLE_EXITS_PAUSE) - kvm->arch.pause_in_guest = true; + mutex_lock(&kvm->lock); + if (kvm->created_vcpus) + goto disable_exits_unlock; #define SMT_RSB_MSG "This processor is affected by the Cross-Thread Return Predictions vulnerability. " \ "KVM_CAP_X86_DISABLE_EXITS should only be used with SMT disabled or trusted guests." - if (!mitigate_smt_rsb) { - if (boot_cpu_has_bug(X86_BUG_SMT_RSB) && cpu_smt_possible() && - (cap->args[0] & ~KVM_X86_DISABLE_EXITS_PAUSE)) - pr_warn_once(SMT_RSB_MSG); - - if ((cap->args[0] & KVM_X86_DISABLE_EXITS_MWAIT) && - kvm_can_mwait_in_guest()) - kvm->arch.mwait_in_guest = true; - if (cap->args[0] & KVM_X86_DISABLE_EXITS_HLT) - kvm->arch.hlt_in_guest = true; - if (cap->args[0] & KVM_X86_DISABLE_EXITS_CSTATE) - kvm->arch.cstate_in_guest = true; - } + if (!mitigate_smt_rsb && boot_cpu_has_bug(X86_BUG_SMT_RSB) && + cpu_smt_possible() && + (cap->args[0] & ~KVM_X86_DISABLE_EXITS_PAUSE)) + pr_warn_once(SMT_RSB_MSG); + if (cap->args[0] & KVM_X86_DISABLE_EXITS_PAUSE) + kvm->arch.pause_in_guest = true; + if (cap->args[0] & KVM_X86_DISABLE_EXITS_MWAIT) + kvm->arch.mwait_in_guest = true; + if (cap->args[0] & KVM_X86_DISABLE_EXITS_HLT) + kvm->arch.hlt_in_guest = true; + if (cap->args[0] & KVM_X86_DISABLE_EXITS_CSTATE) + kvm->arch.cstate_in_guest = true; r = 0; +disable_exits_unlock: + mutex_unlock(&kvm->lock); break; case KVM_CAP_MSR_PLATFORM_INFO: kvm->arch.guest_can_read_msr_platform_info = cap->args[0]; @@ -6634,14 +6689,14 @@ split_irqchip_unlock: if (!kvm_x86_ops.vm_copy_enc_context_from) break; - r = static_call(kvm_x86_vm_copy_enc_context_from)(kvm, cap->args[0]); + r = kvm_x86_call(vm_copy_enc_context_from)(kvm, cap->args[0]); break; case KVM_CAP_VM_MOVE_ENC_CONTEXT_FROM: r = -EINVAL; if (!kvm_x86_ops.vm_move_enc_context_from) break; - r = static_call(kvm_x86_vm_move_enc_context_from)(kvm, cap->args[0]); + r = kvm_x86_call(vm_move_enc_context_from)(kvm, cap->args[0]); break; case KVM_CAP_EXIT_HYPERCALL: if (cap->args[0] & ~KVM_EXIT_HYPERCALL_VALID_MASK) { @@ -6676,7 +6731,9 @@ split_irqchip_unlock: break; mutex_lock(&kvm->lock); - if (kvm->arch.max_vcpu_ids == cap->args[0]) { + if (kvm->arch.bsp_vcpu_id > cap->args[0]) { + ; + } else if (kvm->arch.max_vcpu_ids == cap->args[0]) { r = 0; } else if (!kvm->arch.max_vcpu_ids) { kvm->arch.max_vcpu_ids = cap->args[0]; @@ -6729,6 +6786,30 @@ split_irqchip_unlock: } mutex_unlock(&kvm->lock); break; + case KVM_CAP_X86_APIC_BUS_CYCLES_NS: { + u64 bus_cycle_ns = cap->args[0]; + u64 unused; + + /* + * Guard against overflow in tmict_to_ns(). 128 is the highest + * divide value that can be programmed in APIC_TDCR. + */ + r = -EINVAL; + if (!bus_cycle_ns || + check_mul_overflow((u64)U32_MAX * 128, bus_cycle_ns, &unused)) + break; + + r = 0; + mutex_lock(&kvm->lock); + if (!irqchip_in_kernel(kvm)) + r = -ENXIO; + else if (kvm->created_vcpus) + r = -EINVAL; + else + kvm->arch.apic_bus_cycle_ns = bus_cycle_ns; + mutex_unlock(&kvm->lock); + break; + } default: r = -EINVAL; break; @@ -6900,23 +6981,15 @@ static int kvm_arch_suspend_notifier(struct kvm *kvm) { struct kvm_vcpu *vcpu; unsigned long i; - int ret = 0; - - mutex_lock(&kvm->lock); - kvm_for_each_vcpu(i, vcpu, kvm) { - if (!vcpu->arch.pv_time.active) - continue; - ret = kvm_set_guest_paused(vcpu); - if (ret) { - kvm_err("Failed to pause guest VCPU%d: %d\n", - vcpu->vcpu_id, ret); - break; - } - } - mutex_unlock(&kvm->lock); + /* + * Ignore the return, marking the guest paused only "fails" if the vCPU + * isn't using kvmclock; continuing on is correct and desirable. + */ + kvm_for_each_vcpu(i, vcpu, kvm) + (void)kvm_set_guest_paused(vcpu); - return ret ? NOTIFY_BAD : NOTIFY_DONE; + return NOTIFY_DONE; } int kvm_arch_pm_notifier(struct kvm *kvm, unsigned long state) @@ -7197,6 +7270,9 @@ set_pit2_out: mutex_lock(&kvm->lock); if (kvm->created_vcpus) r = -EBUSY; + else if (arg > KVM_MAX_VCPU_IDS || + (kvm->arch.max_vcpu_ids && arg > kvm->arch.max_vcpu_ids)) + r = -EINVAL; else kvm->arch.bsp_vcpu_id = arg; mutex_unlock(&kvm->lock); @@ -7268,14 +7344,13 @@ set_pit2_out: r = READ_ONCE(kvm->arch.default_tsc_khz); goto out; } - case KVM_MEMORY_ENCRYPT_OP: { + case KVM_MEMORY_ENCRYPT_OP: r = -ENOTTY; if (!kvm_x86_ops.mem_enc_ioctl) goto out; - r = static_call(kvm_x86_mem_enc_ioctl)(kvm, argp); + r = kvm_x86_call(mem_enc_ioctl)(kvm, argp); break; - } case KVM_MEMORY_ENCRYPT_REG_REGION: { struct kvm_enc_region region; @@ -7287,7 +7362,7 @@ set_pit2_out: if (!kvm_x86_ops.mem_enc_register_region) goto out; - r = static_call(kvm_x86_mem_enc_register_region)(kvm, ®ion); + r = kvm_x86_call(mem_enc_register_region)(kvm, ®ion); break; } case KVM_MEMORY_ENCRYPT_UNREG_REGION: { @@ -7301,7 +7376,7 @@ set_pit2_out: if (!kvm_x86_ops.mem_enc_unregister_region) goto out; - r = static_call(kvm_x86_mem_enc_unregister_region)(kvm, ®ion); + r = kvm_x86_call(mem_enc_unregister_region)(kvm, ®ion); break; } #ifdef CONFIG_KVM_HYPERV @@ -7337,11 +7412,9 @@ out: static void kvm_probe_feature_msr(u32 msr_index) { - struct kvm_msr_entry msr = { - .index = msr_index, - }; + u64 data; - if (kvm_get_msr_feature(&msr)) + if (kvm_get_feature_msr(NULL, msr_index, &data, true)) return; msr_based_features[num_msr_based_features++] = msr_index; @@ -7395,17 +7468,20 @@ static void kvm_probe_msr_to_save(u32 msr_index) intel_pt_validate_hw_cap(PT_CAP_num_address_ranges) * 2)) return; break; - case MSR_ARCH_PERFMON_PERFCTR0 ... MSR_ARCH_PERFMON_PERFCTR_MAX: + case MSR_ARCH_PERFMON_PERFCTR0 ... + MSR_ARCH_PERFMON_PERFCTR0 + KVM_MAX_NR_GP_COUNTERS - 1: if (msr_index - MSR_ARCH_PERFMON_PERFCTR0 >= kvm_pmu_cap.num_counters_gp) return; break; - case MSR_ARCH_PERFMON_EVENTSEL0 ... MSR_ARCH_PERFMON_EVENTSEL_MAX: + case MSR_ARCH_PERFMON_EVENTSEL0 ... + MSR_ARCH_PERFMON_EVENTSEL0 + KVM_MAX_NR_GP_COUNTERS - 1: if (msr_index - MSR_ARCH_PERFMON_EVENTSEL0 >= kvm_pmu_cap.num_counters_gp) return; break; - case MSR_ARCH_PERFMON_FIXED_CTR0 ... MSR_ARCH_PERFMON_FIXED_CTR_MAX: + case MSR_ARCH_PERFMON_FIXED_CTR0 ... + MSR_ARCH_PERFMON_FIXED_CTR0 + KVM_MAX_NR_FIXED_COUNTERS - 1: if (msr_index - MSR_ARCH_PERFMON_FIXED_CTR0 >= kvm_pmu_cap.num_counters_fixed) return; @@ -7436,7 +7512,7 @@ static void kvm_init_msr_lists(void) { unsigned i; - BUILD_BUG_ON_MSG(KVM_PMC_MAX_FIXED != 3, + BUILD_BUG_ON_MSG(KVM_MAX_NR_FIXED_COUNTERS != 3, "Please update the fixed PMCs in msrs_to_save_pmu[]"); num_msrs_to_save = 0; @@ -7452,7 +7528,8 @@ static void kvm_init_msr_lists(void) } for (i = 0; i < ARRAY_SIZE(emulated_msrs_all); i++) { - if (!static_call(kvm_x86_has_emulated_msr)(NULL, emulated_msrs_all[i])) + if (!kvm_x86_call(has_emulated_msr)(NULL, + emulated_msrs_all[i])) continue; emulated_msrs[num_emulated_msrs++] = emulated_msrs_all[i]; @@ -7511,13 +7588,13 @@ static int vcpu_mmio_read(struct kvm_vcpu *vcpu, gpa_t addr, int len, void *v) void kvm_set_segment(struct kvm_vcpu *vcpu, struct kvm_segment *var, int seg) { - static_call(kvm_x86_set_segment)(vcpu, var, seg); + kvm_x86_call(set_segment)(vcpu, var, seg); } void kvm_get_segment(struct kvm_vcpu *vcpu, struct kvm_segment *var, int seg) { - static_call(kvm_x86_get_segment)(vcpu, var, seg); + kvm_x86_call(get_segment)(vcpu, var, seg); } gpa_t translate_nested_gpa(struct kvm_vcpu *vcpu, gpa_t gpa, u64 access, @@ -7540,7 +7617,7 @@ gpa_t kvm_mmu_gva_to_gpa_read(struct kvm_vcpu *vcpu, gva_t gva, { struct kvm_mmu *mmu = vcpu->arch.walk_mmu; - u64 access = (static_call(kvm_x86_get_cpl)(vcpu) == 3) ? PFERR_USER_MASK : 0; + u64 access = (kvm_x86_call(get_cpl)(vcpu) == 3) ? PFERR_USER_MASK : 0; return mmu->gva_to_gpa(vcpu, mmu, gva, access, exception); } EXPORT_SYMBOL_GPL(kvm_mmu_gva_to_gpa_read); @@ -7550,7 +7627,7 @@ gpa_t kvm_mmu_gva_to_gpa_write(struct kvm_vcpu *vcpu, gva_t gva, { struct kvm_mmu *mmu = vcpu->arch.walk_mmu; - u64 access = (static_call(kvm_x86_get_cpl)(vcpu) == 3) ? PFERR_USER_MASK : 0; + u64 access = (kvm_x86_call(get_cpl)(vcpu) == 3) ? PFERR_USER_MASK : 0; access |= PFERR_WRITE_MASK; return mmu->gva_to_gpa(vcpu, mmu, gva, access, exception); } @@ -7603,7 +7680,7 @@ static int kvm_fetch_guest_virt(struct x86_emulate_ctxt *ctxt, { struct kvm_vcpu *vcpu = emul_to_vcpu(ctxt); struct kvm_mmu *mmu = vcpu->arch.walk_mmu; - u64 access = (static_call(kvm_x86_get_cpl)(vcpu) == 3) ? PFERR_USER_MASK : 0; + u64 access = (kvm_x86_call(get_cpl)(vcpu) == 3) ? PFERR_USER_MASK : 0; unsigned offset; int ret; @@ -7628,7 +7705,7 @@ int kvm_read_guest_virt(struct kvm_vcpu *vcpu, gva_t addr, void *val, unsigned int bytes, struct x86_exception *exception) { - u64 access = (static_call(kvm_x86_get_cpl)(vcpu) == 3) ? PFERR_USER_MASK : 0; + u64 access = (kvm_x86_call(get_cpl)(vcpu) == 3) ? PFERR_USER_MASK : 0; /* * FIXME: this should call handle_emulation_failure if X86EMUL_IO_NEEDED @@ -7651,7 +7728,7 @@ static int emulator_read_std(struct x86_emulate_ctxt *ctxt, if (system) access |= PFERR_IMPLICIT_ACCESS; - else if (static_call(kvm_x86_get_cpl)(vcpu) == 3) + else if (kvm_x86_call(get_cpl)(vcpu) == 3) access |= PFERR_USER_MASK; return kvm_read_guest_virt_helper(addr, val, bytes, vcpu, access, exception); @@ -7696,7 +7773,7 @@ static int emulator_write_std(struct x86_emulate_ctxt *ctxt, gva_t addr, void *v if (system) access |= PFERR_IMPLICIT_ACCESS; - else if (static_call(kvm_x86_get_cpl)(vcpu) == 3) + else if (kvm_x86_call(get_cpl)(vcpu) == 3) access |= PFERR_USER_MASK; return kvm_write_guest_virt_helper(addr, val, bytes, vcpu, @@ -7717,8 +7794,8 @@ EXPORT_SYMBOL_GPL(kvm_write_guest_virt_system); static int kvm_check_emulate_insn(struct kvm_vcpu *vcpu, int emul_type, void *insn, int insn_len) { - return static_call(kvm_x86_check_emulate_instruction)(vcpu, emul_type, - insn, insn_len); + return kvm_x86_call(check_emulate_instruction)(vcpu, emul_type, + insn, insn_len); } int handle_ud(struct kvm_vcpu *vcpu) @@ -7768,8 +7845,8 @@ static int vcpu_mmio_gva_to_gpa(struct kvm_vcpu *vcpu, unsigned long gva, bool write) { struct kvm_mmu *mmu = vcpu->arch.walk_mmu; - u64 access = ((static_call(kvm_x86_get_cpl)(vcpu) == 3) ? PFERR_USER_MASK : 0) - | (write ? PFERR_WRITE_MASK : 0); + u64 access = ((kvm_x86_call(get_cpl)(vcpu) == 3) ? PFERR_USER_MASK : 0) + | (write ? PFERR_WRITE_MASK : 0); /* * currently PKRU is only applied to ept enabled guest so @@ -7967,7 +8044,7 @@ static int emulator_read_write(struct x86_emulate_ctxt *ctxt, return rc; if (!vcpu->mmio_nr_fragments) - return rc; + return X86EMUL_CONTINUE; gpa = vcpu->mmio_fragments[0].gpa; @@ -8195,7 +8272,7 @@ static int emulator_pio_out_emulated(struct x86_emulate_ctxt *ctxt, static unsigned long get_segment_base(struct kvm_vcpu *vcpu, int seg) { - return static_call(kvm_x86_get_segment_base)(vcpu, seg); + return kvm_x86_call(get_segment_base)(vcpu, seg); } static void emulator_invlpg(struct x86_emulate_ctxt *ctxt, ulong address) @@ -8208,7 +8285,7 @@ static int kvm_emulate_wbinvd_noskip(struct kvm_vcpu *vcpu) if (!need_emulate_wbinvd(vcpu)) return X86EMUL_CONTINUE; - if (static_call(kvm_x86_has_wbinvd_exit)()) { + if (kvm_x86_call(has_wbinvd_exit)()) { int cpu = get_cpu(); cpumask_set_cpu(cpu, vcpu->arch.wbinvd_dirty_mask); @@ -8312,27 +8389,27 @@ static int emulator_set_cr(struct x86_emulate_ctxt *ctxt, int cr, ulong val) static int emulator_get_cpl(struct x86_emulate_ctxt *ctxt) { - return static_call(kvm_x86_get_cpl)(emul_to_vcpu(ctxt)); + return kvm_x86_call(get_cpl)(emul_to_vcpu(ctxt)); } static void emulator_get_gdt(struct x86_emulate_ctxt *ctxt, struct desc_ptr *dt) { - static_call(kvm_x86_get_gdt)(emul_to_vcpu(ctxt), dt); + kvm_x86_call(get_gdt)(emul_to_vcpu(ctxt), dt); } static void emulator_get_idt(struct x86_emulate_ctxt *ctxt, struct desc_ptr *dt) { - static_call(kvm_x86_get_idt)(emul_to_vcpu(ctxt), dt); + kvm_x86_call(get_idt)(emul_to_vcpu(ctxt), dt); } static void emulator_set_gdt(struct x86_emulate_ctxt *ctxt, struct desc_ptr *dt) { - static_call(kvm_x86_set_gdt)(emul_to_vcpu(ctxt), dt); + kvm_x86_call(set_gdt)(emul_to_vcpu(ctxt), dt); } static void emulator_set_idt(struct x86_emulate_ctxt *ctxt, struct desc_ptr *dt) { - static_call(kvm_x86_set_idt)(emul_to_vcpu(ctxt), dt); + kvm_x86_call(set_idt)(emul_to_vcpu(ctxt), dt); } static unsigned long emulator_get_cached_segment_base( @@ -8479,8 +8556,8 @@ static int emulator_intercept(struct x86_emulate_ctxt *ctxt, struct x86_instruction_info *info, enum x86_intercept_stage stage) { - return static_call(kvm_x86_check_intercept)(emul_to_vcpu(ctxt), info, stage, - &ctxt->exception); + return kvm_x86_call(check_intercept)(emul_to_vcpu(ctxt), info, stage, + &ctxt->exception); } static bool emulator_get_cpuid(struct x86_emulate_ctxt *ctxt, @@ -8492,17 +8569,22 @@ static bool emulator_get_cpuid(struct x86_emulate_ctxt *ctxt, static bool emulator_guest_has_movbe(struct x86_emulate_ctxt *ctxt) { - return guest_cpuid_has(emul_to_vcpu(ctxt), X86_FEATURE_MOVBE); + return guest_cpu_cap_has(emul_to_vcpu(ctxt), X86_FEATURE_MOVBE); } static bool emulator_guest_has_fxsr(struct x86_emulate_ctxt *ctxt) { - return guest_cpuid_has(emul_to_vcpu(ctxt), X86_FEATURE_FXSR); + return guest_cpu_cap_has(emul_to_vcpu(ctxt), X86_FEATURE_FXSR); } static bool emulator_guest_has_rdpid(struct x86_emulate_ctxt *ctxt) { - return guest_cpuid_has(emul_to_vcpu(ctxt), X86_FEATURE_RDPID); + return guest_cpu_cap_has(emul_to_vcpu(ctxt), X86_FEATURE_RDPID); +} + +static bool emulator_guest_cpuid_is_intel_compatible(struct x86_emulate_ctxt *ctxt) +{ + return guest_cpuid_is_intel_compatible(emul_to_vcpu(ctxt)); } static ulong emulator_read_gpr(struct x86_emulate_ctxt *ctxt, unsigned reg) @@ -8517,7 +8599,7 @@ static void emulator_write_gpr(struct x86_emulate_ctxt *ctxt, unsigned reg, ulon static void emulator_set_nmi_mask(struct x86_emulate_ctxt *ctxt, bool masked) { - static_call(kvm_x86_set_nmi_mask)(emul_to_vcpu(ctxt), masked); + kvm_x86_call(set_nmi_mask)(emul_to_vcpu(ctxt), masked); } static bool emulator_is_smm(struct x86_emulate_ctxt *ctxt) @@ -8562,7 +8644,14 @@ static gva_t emulator_get_untagged_addr(struct x86_emulate_ctxt *ctxt, if (!kvm_x86_ops.get_untagged_addr) return addr; - return static_call(kvm_x86_get_untagged_addr)(emul_to_vcpu(ctxt), addr, flags); + return kvm_x86_call(get_untagged_addr)(emul_to_vcpu(ctxt), + addr, flags); +} + +static bool emulator_is_canonical_addr(struct x86_emulate_ctxt *ctxt, + gva_t addr, unsigned int flags) +{ + return !is_noncanonical_address(addr, emul_to_vcpu(ctxt), flags); } static const struct x86_emulate_ops emulate_ops = { @@ -8603,6 +8692,7 @@ static const struct x86_emulate_ops emulate_ops = { .guest_has_movbe = emulator_guest_has_movbe, .guest_has_fxsr = emulator_guest_has_fxsr, .guest_has_rdpid = emulator_guest_has_rdpid, + .guest_cpuid_is_intel_compatible = emulator_guest_cpuid_is_intel_compatible, .set_nmi_mask = emulator_set_nmi_mask, .is_smm = emulator_is_smm, .is_guest_mode = emulator_is_guest_mode, @@ -8610,11 +8700,12 @@ static const struct x86_emulate_ops emulate_ops = { .triple_fault = emulator_triple_fault, .set_xcr = emulator_set_xcr, .get_untagged_addr = emulator_get_untagged_addr, + .is_canonical_addr = emulator_is_canonical_addr, }; static void toggle_interruptibility(struct kvm_vcpu *vcpu, u32 mask) { - u32 int_shadow = static_call(kvm_x86_get_interrupt_shadow)(vcpu); + u32 int_shadow = kvm_x86_call(get_interrupt_shadow)(vcpu); /* * an sti; sti; sequence only disable interrupts for the first * instruction. So, if the last instruction, be it emulated or @@ -8625,7 +8716,7 @@ static void toggle_interruptibility(struct kvm_vcpu *vcpu, u32 mask) if (int_shadow & mask) mask = 0; if (unlikely(int_shadow || mask)) { - static_call(kvm_x86_set_interrupt_shadow)(vcpu, mask); + kvm_x86_call(set_interrupt_shadow)(vcpu, mask); if (!mask) kvm_make_request(KVM_REQ_EVENT, vcpu); } @@ -8666,7 +8757,7 @@ static void init_emulate_ctxt(struct kvm_vcpu *vcpu) struct x86_emulate_ctxt *ctxt = vcpu->arch.emulate_ctxt; int cs_db, cs_l; - static_call(kvm_x86_get_cs_db_l_bits)(vcpu, &cs_db, &cs_l); + kvm_x86_call(get_cs_db_l_bits)(vcpu, &cs_db, &cs_l); ctxt->gpa_available = false; ctxt->eflags = kvm_get_rflags(vcpu); @@ -8722,9 +8813,8 @@ static void prepare_emulation_failure_exit(struct kvm_vcpu *vcpu, u64 *data, */ memset(&info, 0, sizeof(info)); - static_call(kvm_x86_get_exit_info)(vcpu, (u32 *)&info[0], &info[1], - &info[2], (u32 *)&info[3], - (u32 *)&info[4]); + kvm_x86_call(get_exit_info)(vcpu, (u32 *)&info[0], &info[1], &info[2], + (u32 *)&info[3], (u32 *)&info[4]); run->exit_reason = KVM_EXIT_INTERNAL_ERROR; run->emulation_failure.suberror = KVM_INTERNAL_ERROR_EMULATION; @@ -8781,6 +8871,28 @@ void kvm_prepare_emulation_failure_exit(struct kvm_vcpu *vcpu) } EXPORT_SYMBOL_GPL(kvm_prepare_emulation_failure_exit); +void kvm_prepare_event_vectoring_exit(struct kvm_vcpu *vcpu, gpa_t gpa) +{ + u32 reason, intr_info, error_code; + struct kvm_run *run = vcpu->run; + u64 info1, info2; + int ndata = 0; + + kvm_x86_call(get_exit_info)(vcpu, &reason, &info1, &info2, + &intr_info, &error_code); + + run->internal.data[ndata++] = info2; + run->internal.data[ndata++] = reason; + run->internal.data[ndata++] = info1; + run->internal.data[ndata++] = gpa; + run->internal.data[ndata++] = vcpu->arch.last_vmentry_cpu; + + run->exit_reason = KVM_EXIT_INTERNAL_ERROR; + run->internal.suberror = KVM_INTERNAL_ERROR_DELIVERY_EV; + run->internal.ndata = ndata; +} +EXPORT_SYMBOL_GPL(kvm_prepare_event_vectoring_exit); + static int handle_emulation_failure(struct kvm_vcpu *vcpu, int emulation_type) { struct kvm *kvm = vcpu->kvm; @@ -8801,7 +8913,7 @@ static int handle_emulation_failure(struct kvm_vcpu *vcpu, int emulation_type) kvm_queue_exception(vcpu, UD_VECTOR); - if (!is_guest_mode(vcpu) && static_call(kvm_x86_get_cpl)(vcpu) == 0) { + if (!is_guest_mode(vcpu) && kvm_x86_call(get_cpl)(vcpu) == 0) { prepare_emulation_ctxt_failure_exit(vcpu); return 0; } @@ -8809,60 +8921,13 @@ static int handle_emulation_failure(struct kvm_vcpu *vcpu, int emulation_type) return 1; } -static bool reexecute_instruction(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa, - int emulation_type) +static bool kvm_unprotect_and_retry_on_failure(struct kvm_vcpu *vcpu, + gpa_t cr2_or_gpa, + int emulation_type) { - gpa_t gpa = cr2_or_gpa; - kvm_pfn_t pfn; - if (!(emulation_type & EMULTYPE_ALLOW_RETRY_PF)) return false; - if (WARN_ON_ONCE(is_guest_mode(vcpu)) || - WARN_ON_ONCE(!(emulation_type & EMULTYPE_PF))) - return false; - - if (!vcpu->arch.mmu->root_role.direct) { - /* - * Write permission should be allowed since only - * write access need to be emulated. - */ - gpa = kvm_mmu_gva_to_gpa_write(vcpu, cr2_or_gpa, NULL); - - /* - * If the mapping is invalid in guest, let cpu retry - * it to generate fault. - */ - if (gpa == INVALID_GPA) - return true; - } - - /* - * Do not retry the unhandleable instruction if it faults on the - * readonly host memory, otherwise it will goto a infinite loop: - * retry instruction -> write #PF -> emulation fail -> retry - * instruction -> ... - */ - pfn = gfn_to_pfn(vcpu->kvm, gpa_to_gfn(gpa)); - - /* - * If the instruction failed on the error pfn, it can not be fixed, - * report the error to userspace. - */ - if (is_error_noslot_pfn(pfn)) - return false; - - kvm_release_pfn_clean(pfn); - - /* - * If emulation may have been triggered by a write to a shadowed page - * table, unprotect the gfn (zap any relevant SPTEs) and re-enter the - * guest to let the CPU re-execute the instruction in the hope that the - * CPU can cleanly execute the instruction that KVM failed to emulate. - */ - if (vcpu->kvm->arch.indirect_shadow_pages) - kvm_mmu_unprotect_page(vcpu->kvm, gpa_to_gfn(gpa)); - /* * If the failed instruction faulted on an access to page tables that * are used to translate any part of the instruction, KVM can't resolve @@ -8873,54 +8938,24 @@ static bool reexecute_instruction(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa, * then zap the SPTE to unprotect the gfn, and then do it all over * again. Report the error to userspace. */ - return !(emulation_type & EMULTYPE_WRITE_PF_TO_SP); -} - -static bool retry_instruction(struct x86_emulate_ctxt *ctxt, - gpa_t cr2_or_gpa, int emulation_type) -{ - struct kvm_vcpu *vcpu = emul_to_vcpu(ctxt); - unsigned long last_retry_eip, last_retry_addr, gpa = cr2_or_gpa; - - last_retry_eip = vcpu->arch.last_retry_eip; - last_retry_addr = vcpu->arch.last_retry_addr; + if (emulation_type & EMULTYPE_WRITE_PF_TO_SP) + return false; /* - * If the emulation is caused by #PF and it is non-page_table - * writing instruction, it means the VM-EXIT is caused by shadow - * page protected, we can zap the shadow page and retry this - * instruction directly. - * - * Note: if the guest uses a non-page-table modifying instruction - * on the PDE that points to the instruction, then we will unmap - * the instruction and go to an infinite loop. So, we cache the - * last retried eip and the last fault address, if we meet the eip - * and the address again, we can break out of the potential infinite - * loop. + * If emulation may have been triggered by a write to a shadowed page + * table, unprotect the gfn (zap any relevant SPTEs) and re-enter the + * guest to let the CPU re-execute the instruction in the hope that the + * CPU can cleanly execute the instruction that KVM failed to emulate. */ - vcpu->arch.last_retry_eip = vcpu->arch.last_retry_addr = 0; - - if (!(emulation_type & EMULTYPE_ALLOW_RETRY_PF)) - return false; - - if (WARN_ON_ONCE(is_guest_mode(vcpu)) || - WARN_ON_ONCE(!(emulation_type & EMULTYPE_PF))) - return false; - - if (x86_page_table_writing_insn(ctxt)) - return false; - - if (ctxt->eip == last_retry_eip && last_retry_addr == cr2_or_gpa) - return false; - - vcpu->arch.last_retry_eip = ctxt->eip; - vcpu->arch.last_retry_addr = cr2_or_gpa; - - if (!vcpu->arch.mmu->root_role.direct) - gpa = kvm_mmu_gva_to_gpa_write(vcpu, cr2_or_gpa, NULL); - - kvm_mmu_unprotect_page(vcpu->kvm, gpa_to_gfn(gpa)); + __kvm_mmu_unprotect_gfn_and_retry(vcpu, cr2_or_gpa, true); + /* + * Retry even if _this_ vCPU didn't unprotect the gfn, as it's possible + * all SPTEs were already zapped by a different task. The alternative + * is to report the error to userspace and likely terminate the guest, + * and the last_retry_{eip,addr} checks will prevent retrying the page + * fault indefinitely, i.e. there's nothing to lose by retrying. + */ return true; } @@ -8959,10 +8994,10 @@ static int kvm_vcpu_do_singlestep(struct kvm_vcpu *vcpu) int kvm_skip_emulated_instruction(struct kvm_vcpu *vcpu) { - unsigned long rflags = static_call(kvm_x86_get_rflags)(vcpu); + unsigned long rflags = kvm_x86_call(get_rflags)(vcpu); int r; - r = static_call(kvm_x86_skip_emulated_instruction)(vcpu); + r = kvm_x86_call(skip_emulated_instruction)(vcpu); if (unlikely(!r)) return 0; @@ -8984,19 +9019,17 @@ EXPORT_SYMBOL_GPL(kvm_skip_emulated_instruction); static bool kvm_is_code_breakpoint_inhibited(struct kvm_vcpu *vcpu) { - u32 shadow; - if (kvm_get_rflags(vcpu) & X86_EFLAGS_RF) return true; /* - * Intel CPUs inhibit code #DBs when MOV/POP SS blocking is active, - * but AMD CPUs do not. MOV/POP SS blocking is rare, check that first - * to avoid the relatively expensive CPUID lookup. + * Intel compatible CPUs inhibit code #DBs when MOV/POP SS blocking is + * active, but AMD compatible CPUs do not. */ - shadow = static_call(kvm_x86_get_interrupt_shadow)(vcpu); - return (shadow & KVM_X86_SHADOW_INT_MOV_SS) && - guest_cpuid_is_intel(vcpu); + if (!guest_cpuid_is_intel_compatible(vcpu)) + return false; + + return kvm_x86_call(get_interrupt_shadow)(vcpu) & KVM_X86_SHADOW_INT_MOV_SS; } static bool kvm_vcpu_check_code_breakpoint(struct kvm_vcpu *vcpu, @@ -9122,11 +9155,25 @@ int x86_emulate_instruction(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa, struct x86_emulate_ctxt *ctxt = vcpu->arch.emulate_ctxt; bool writeback = true; + if ((emulation_type & EMULTYPE_ALLOW_RETRY_PF) && + (WARN_ON_ONCE(is_guest_mode(vcpu)) || + WARN_ON_ONCE(!(emulation_type & EMULTYPE_PF)))) + emulation_type &= ~EMULTYPE_ALLOW_RETRY_PF; + r = kvm_check_emulate_insn(vcpu, emulation_type, insn, insn_len); if (r != X86EMUL_CONTINUE) { if (r == X86EMUL_RETRY_INSTR || r == X86EMUL_PROPAGATE_FAULT) return 1; + if (kvm_unprotect_and_retry_on_failure(vcpu, cr2_or_gpa, + emulation_type)) + return 1; + + if (r == X86EMUL_UNHANDLEABLE_VECTORING) { + kvm_prepare_event_vectoring_exit(vcpu, cr2_or_gpa); + return 0; + } + WARN_ON_ONCE(r != X86EMUL_UNHANDLEABLE); return handle_emulation_failure(vcpu, emulation_type); } @@ -9152,8 +9199,8 @@ int x86_emulate_instruction(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa, kvm_queue_exception(vcpu, UD_VECTOR); return 1; } - if (reexecute_instruction(vcpu, cr2_or_gpa, - emulation_type)) + if (kvm_unprotect_and_retry_on_failure(vcpu, cr2_or_gpa, + emulation_type)) return 1; if (ctxt->have_exception && @@ -9200,7 +9247,15 @@ int x86_emulate_instruction(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa, return 1; } - if (retry_instruction(ctxt, cr2_or_gpa, emulation_type)) + /* + * If emulation was caused by a write-protection #PF on a non-page_table + * writing instruction, try to unprotect the gfn, i.e. zap shadow pages, + * and retry the instruction, as the vCPU is likely no longer using the + * gfn as a page table. + */ + if ((emulation_type & EMULTYPE_ALLOW_RETRY_PF) && + !x86_page_table_writing_insn(ctxt) && + kvm_mmu_unprotect_gfn_and_retry(vcpu, cr2_or_gpa)) return 1; /* this is needed for vmware backdoor interface to work since it @@ -9231,7 +9286,8 @@ restart: return 1; if (r == EMULATION_FAILED) { - if (reexecute_instruction(vcpu, cr2_or_gpa, emulation_type)) + if (kvm_unprotect_and_retry_on_failure(vcpu, cr2_or_gpa, + emulation_type)) return 1; return handle_emulation_failure(vcpu, emulation_type); @@ -9268,7 +9324,7 @@ restart: writeback: if (writeback) { - unsigned long rflags = static_call(kvm_x86_get_rflags)(vcpu); + unsigned long rflags = kvm_x86_call(get_rflags)(vcpu); toggle_interruptibility(vcpu, ctxt->interruptibility); vcpu->arch.emulate_regs_need_sync_to_vcpu = false; @@ -9285,7 +9341,7 @@ writeback: kvm_rip_write(vcpu, ctxt->eip); if (r && (ctxt->tf || (vcpu->guest_debug & KVM_GUESTDBG_SINGLESTEP))) r = kvm_vcpu_do_singlestep(vcpu); - static_call_cond(kvm_x86_update_emulated_instruction)(vcpu); + kvm_x86_call(update_emulated_instruction)(vcpu); __kvm_set_rflags(vcpu, ctxt->eflags); } @@ -9326,7 +9382,7 @@ static int complete_fast_pio_out(struct kvm_vcpu *vcpu) { vcpu->arch.pio.count = 0; - if (unlikely(!kvm_is_linear_rip(vcpu, vcpu->arch.pio.linear_rip))) + if (unlikely(!kvm_is_linear_rip(vcpu, vcpu->arch.cui_linear_rip))) return 1; return kvm_skip_emulated_instruction(vcpu); @@ -9351,7 +9407,7 @@ static int kvm_fast_pio_out(struct kvm_vcpu *vcpu, int size, complete_fast_pio_out_port_0x7e; kvm_skip_emulated_instruction(vcpu); } else { - vcpu->arch.pio.linear_rip = kvm_get_linear_rip(vcpu); + vcpu->arch.cui_linear_rip = kvm_get_linear_rip(vcpu); vcpu->arch.complete_userspace_io = complete_fast_pio_out; } return 0; @@ -9364,7 +9420,7 @@ static int complete_fast_pio_in(struct kvm_vcpu *vcpu) /* We should only ever be called with arch.pio.count equal to 1 */ BUG_ON(vcpu->arch.pio.count != 1); - if (unlikely(!kvm_is_linear_rip(vcpu, vcpu->arch.pio.linear_rip))) { + if (unlikely(!kvm_is_linear_rip(vcpu, vcpu->arch.cui_linear_rip))) { vcpu->arch.pio.count = 0; return 1; } @@ -9393,7 +9449,7 @@ static int kvm_fast_pio_in(struct kvm_vcpu *vcpu, int size, return ret; } - vcpu->arch.pio.linear_rip = kvm_get_linear_rip(vcpu); + vcpu->arch.cui_linear_rip = kvm_get_linear_rip(vcpu); vcpu->arch.complete_userspace_io = complete_fast_pio_in; return 0; @@ -9684,7 +9740,7 @@ static int kvm_x86_check_processor_compatibility(void) __cr4_reserved_bits(cpu_has, &boot_cpu_data)) return -EIO; - return static_call(kvm_x86_check_processor_compatibility)(); + return kvm_x86_call(check_processor_compatibility)(); } static void kvm_x86_check_cpu_compat(void *ret) @@ -9699,7 +9755,7 @@ int kvm_x86_vendor_init(struct kvm_x86_init_ops *ops) guard(mutex)(&vendor_module_lock); - if (kvm_x86_ops.hardware_enable) { + if (kvm_x86_ops.enable_virtualization_cpu) { pr_err("already loaded vendor module '%s'\n", kvm_x86_ops.name); return -EEXIST; } @@ -9726,12 +9782,14 @@ int kvm_x86_vendor_init(struct kvm_x86_init_ops *ops) * with an exception. PAT[0] is set to WB on RESET and also by the * kernel, i.e. failure indicates a kernel bug or broken firmware. */ - if (rdmsrl_safe(MSR_IA32_CR_PAT, &host_pat) || + if (rdmsrq_safe(MSR_IA32_CR_PAT, &host_pat) || (host_pat & GENMASK(2, 0)) != 6) { pr_err("host PAT[0] is not WB\n"); return -EIO; } + memset(&kvm_caps, 0, sizeof(kvm_caps)); + x86_emulator_cache = kvm_alloc_emulator_cache(); if (!x86_emulator_cache) { pr_err("failed to allocate cache for x86 emulator\n"); @@ -9750,25 +9808,33 @@ int kvm_x86_vendor_init(struct kvm_x86_init_ops *ops) if (r) goto out_free_percpu; + kvm_caps.supported_vm_types = BIT(KVM_X86_DEFAULT_VM); + kvm_caps.supported_mce_cap = MCG_CTL_P | MCG_SER_P; + if (boot_cpu_has(X86_FEATURE_XSAVE)) { - host_xcr0 = xgetbv(XCR_XFEATURE_ENABLED_MASK); - kvm_caps.supported_xcr0 = host_xcr0 & KVM_SUPPORTED_XCR0; + kvm_host.xcr0 = xgetbv(XCR_XFEATURE_ENABLED_MASK); + kvm_caps.supported_xcr0 = kvm_host.xcr0 & KVM_SUPPORTED_XCR0; } + kvm_caps.supported_quirks = KVM_X86_VALID_QUIRKS; + kvm_caps.inapplicable_quirks = KVM_X86_CONDITIONAL_QUIRKS; - rdmsrl_safe(MSR_EFER, &host_efer); + rdmsrq_safe(MSR_EFER, &kvm_host.efer); if (boot_cpu_has(X86_FEATURE_XSAVES)) - rdmsrl(MSR_IA32_XSS, host_xss); + rdmsrq(MSR_IA32_XSS, kvm_host.xss); kvm_init_pmu_capability(ops->pmu_ops); if (boot_cpu_has(X86_FEATURE_ARCH_CAPABILITIES)) - rdmsrl(MSR_IA32_ARCH_CAPABILITIES, host_arch_capabilities); + rdmsrq(MSR_IA32_ARCH_CAPABILITIES, kvm_host.arch_capabilities); r = ops->hardware_setup(); if (r != 0) goto out_mmu_exit; + enable_device_posted_irqs &= enable_apicv && + irq_remapping_cap(IRQ_POSTING_CAP); + kvm_ops_update(ops); for_each_online_cpu(cpu) { @@ -9795,13 +9861,16 @@ int kvm_x86_vendor_init(struct kvm_x86_init_ops *ops) kvm_register_perf_callbacks(ops->handle_intel_pt_intr); + if (IS_ENABLED(CONFIG_KVM_SW_PROTECTED_VM) && tdp_mmu_enabled) + kvm_caps.supported_vm_types |= BIT(KVM_X86_SW_PROTECTED_VM); + + /* KVM always ignores guest PAT for shadow paging. */ + if (!tdp_enabled) + kvm_caps.supported_quirks &= ~KVM_X86_QUIRK_IGNORE_GUEST_PAT; + if (!kvm_cpu_cap_has(X86_FEATURE_XSAVES)) kvm_caps.supported_xss = 0; -#define __kvm_cpu_cap_has(UNUSED_, f) kvm_cpu_cap_has(f) - cr4_reserved_bits = __cr4_reserved_bits(__kvm_cpu_cap_has, UNUSED_); -#undef __kvm_cpu_cap_has - if (kvm_caps.has_tsc_control) { /* * Make sure the user can only configure tsc_khz values that @@ -9818,8 +9887,8 @@ int kvm_x86_vendor_init(struct kvm_x86_init_ops *ops) return 0; out_unwind_ops: - kvm_x86_ops.hardware_enable = NULL; - static_call(kvm_x86_hardware_unsetup)(); + kvm_x86_ops.enable_virtualization_cpu = NULL; + kvm_x86_call(hardware_unsetup)(); out_mmu_exit: kvm_mmu_vendor_module_exit(); out_free_percpu: @@ -9850,7 +9919,7 @@ void kvm_x86_vendor_exit(void) irq_work_sync(&pvclock_irq_work); cancel_work_sync(&pvclock_gtod_work); #endif - static_call(kvm_x86_hardware_unsetup)(); + kvm_x86_call(hardware_unsetup)(); kvm_mmu_vendor_module_exit(); free_percpu(user_return_msrs); kmem_cache_destroy(x86_emulator_cache); @@ -9859,56 +9928,11 @@ void kvm_x86_vendor_exit(void) WARN_ON(static_branch_unlikely(&kvm_xen_enabled.key)); #endif mutex_lock(&vendor_module_lock); - kvm_x86_ops.hardware_enable = NULL; + kvm_x86_ops.enable_virtualization_cpu = NULL; mutex_unlock(&vendor_module_lock); } EXPORT_SYMBOL_GPL(kvm_x86_vendor_exit); -static int __kvm_emulate_halt(struct kvm_vcpu *vcpu, int state, int reason) -{ - /* - * The vCPU has halted, e.g. executed HLT. Update the run state if the - * local APIC is in-kernel, the run loop will detect the non-runnable - * state and halt the vCPU. Exit to userspace if the local APIC is - * managed by userspace, in which case userspace is responsible for - * handling wake events. - */ - ++vcpu->stat.halt_exits; - if (lapic_in_kernel(vcpu)) { - vcpu->arch.mp_state = state; - return 1; - } else { - vcpu->run->exit_reason = reason; - return 0; - } -} - -int kvm_emulate_halt_noskip(struct kvm_vcpu *vcpu) -{ - return __kvm_emulate_halt(vcpu, KVM_MP_STATE_HALTED, KVM_EXIT_HLT); -} -EXPORT_SYMBOL_GPL(kvm_emulate_halt_noskip); - -int kvm_emulate_halt(struct kvm_vcpu *vcpu) -{ - int ret = kvm_skip_emulated_instruction(vcpu); - /* - * TODO: we might be squashing a GUESTDBG_SINGLESTEP-triggered - * KVM_EXIT_DEBUG here. - */ - return kvm_emulate_halt_noskip(vcpu) && ret; -} -EXPORT_SYMBOL_GPL(kvm_emulate_halt); - -int kvm_emulate_ap_reset_hold(struct kvm_vcpu *vcpu) -{ - int ret = kvm_skip_emulated_instruction(vcpu); - - return __kvm_emulate_halt(vcpu, KVM_MP_STATE_AP_RESET_HOLD, - KVM_EXIT_AP_RESET_HOLD) && ret; -} -EXPORT_SYMBOL_GPL(kvm_emulate_ap_reset_hold); - #ifdef CONFIG_X86_64 static int kvm_pv_clock_pairing(struct kvm_vcpu *vcpu, gpa_t paddr, unsigned long clock_type) @@ -9976,7 +10000,8 @@ EXPORT_SYMBOL_GPL(kvm_apicv_activated); bool kvm_vcpu_apicv_activated(struct kvm_vcpu *vcpu) { ulong vm_reasons = READ_ONCE(vcpu->kvm->arch.apicv_inhibit_reasons); - ulong vcpu_reasons = static_call(kvm_x86_vcpu_get_apicv_inhibit_reasons)(vcpu); + ulong vcpu_reasons = + kvm_x86_call(vcpu_get_apicv_inhibit_reasons)(vcpu); return (vm_reasons | vcpu_reasons) == 0; } @@ -9985,6 +10010,10 @@ EXPORT_SYMBOL_GPL(kvm_vcpu_apicv_activated); static void set_or_clear_apicv_inhibit(unsigned long *inhibits, enum kvm_apicv_inhibit reason, bool set) { + const struct trace_print_flags apicv_inhibits[] = { APICV_INHIBIT_REASONS }; + + BUILD_BUG_ON(ARRAY_SIZE(apicv_inhibits) != NR_APICV_INHIBIT_REASONS); + if (set) __set_bit(reason, inhibits); else @@ -9995,15 +10024,12 @@ static void set_or_clear_apicv_inhibit(unsigned long *inhibits, static void kvm_apicv_init(struct kvm *kvm) { - unsigned long *inhibits = &kvm->arch.apicv_inhibit_reasons; - - init_rwsem(&kvm->arch.apicv_update_lock); + enum kvm_apicv_inhibit reason = enable_apicv ? APICV_INHIBIT_REASON_ABSENT : + APICV_INHIBIT_REASON_DISABLED; - set_or_clear_apicv_inhibit(inhibits, APICV_INHIBIT_REASON_ABSENT, true); + set_or_clear_apicv_inhibit(&kvm->arch.apicv_inhibit_reasons, reason, true); - if (!enable_apicv) - set_or_clear_apicv_inhibit(inhibits, - APICV_INHIBIT_REASON_DISABLE, true); + init_rwsem(&kvm->arch.apicv_update_lock); } static void kvm_sched_yield(struct kvm_vcpu *vcpu, unsigned long dest_id) @@ -10044,33 +10070,27 @@ static int complete_hypercall_exit(struct kvm_vcpu *vcpu) { u64 ret = vcpu->run->hypercall.ret; - if (!is_64_bit_mode(vcpu)) + if (!is_64_bit_hypercall(vcpu)) ret = (u32)ret; kvm_rax_write(vcpu, ret); - ++vcpu->stat.hypercalls; return kvm_skip_emulated_instruction(vcpu); } -int kvm_emulate_hypercall(struct kvm_vcpu *vcpu) +int ____kvm_emulate_hypercall(struct kvm_vcpu *vcpu, int cpl, + int (*complete_hypercall)(struct kvm_vcpu *)) { - unsigned long nr, a0, a1, a2, a3, ret; - int op_64_bit; + unsigned long ret; + unsigned long nr = kvm_rax_read(vcpu); + unsigned long a0 = kvm_rbx_read(vcpu); + unsigned long a1 = kvm_rcx_read(vcpu); + unsigned long a2 = kvm_rdx_read(vcpu); + unsigned long a3 = kvm_rsi_read(vcpu); + int op_64_bit = is_64_bit_hypercall(vcpu); - if (kvm_xen_hypercall_enabled(vcpu->kvm)) - return kvm_xen_hypercall(vcpu); - - if (kvm_hv_hypercall_enabled(vcpu)) - return kvm_hv_hypercall(vcpu); - - nr = kvm_rax_read(vcpu); - a0 = kvm_rbx_read(vcpu); - a1 = kvm_rcx_read(vcpu); - a2 = kvm_rdx_read(vcpu); - a3 = kvm_rsi_read(vcpu); + ++vcpu->stat.hypercalls; trace_kvm_hypercall(nr, a0, a1, a2, a3); - op_64_bit = is_64_bit_hypercall(vcpu); if (!op_64_bit) { nr &= 0xFFFFFFFF; a0 &= 0xFFFFFFFF; @@ -10079,7 +10099,7 @@ int kvm_emulate_hypercall(struct kvm_vcpu *vcpu) a3 &= 0xFFFFFFFF; } - if (static_call(kvm_x86_get_cpl)(vcpu) != 0) { + if (cpl) { ret = -KVM_EPERM; goto out; } @@ -10120,7 +10140,7 @@ int kvm_emulate_hypercall(struct kvm_vcpu *vcpu) u64 gpa = a0, npages = a1, attrs = a2; ret = -KVM_ENOSYS; - if (!(vcpu->kvm->arch.hypercall_exit_enabled & (1 << KVM_HC_MAP_GPA_RANGE))) + if (!user_exit_on_hypercall(vcpu->kvm, KVM_HC_MAP_GPA_RANGE)) break; if (!PAGE_ALIGNED(gpa) || !npages || @@ -10131,6 +10151,13 @@ int kvm_emulate_hypercall(struct kvm_vcpu *vcpu) vcpu->run->exit_reason = KVM_EXIT_HYPERCALL; vcpu->run->hypercall.nr = KVM_HC_MAP_GPA_RANGE; + /* + * In principle this should have been -KVM_ENOSYS, but userspace (QEMU <=9.2) + * assumed that vcpu->run->hypercall.ret is never changed by KVM and thus that + * it was always zero on KVM_EXIT_HYPERCALL. Since KVM is now overwriting + * vcpu->run->hypercall.ret, ensuring that it is zero to not break QEMU. + */ + vcpu->run->hypercall.ret = 0; vcpu->run->hypercall.args[0] = gpa; vcpu->run->hypercall.args[1] = npages; vcpu->run->hypercall.args[2] = attrs; @@ -10139,20 +10166,30 @@ int kvm_emulate_hypercall(struct kvm_vcpu *vcpu) vcpu->run->hypercall.flags |= KVM_EXIT_HYPERCALL_LONG_MODE; WARN_ON_ONCE(vcpu->run->hypercall.flags & KVM_EXIT_HYPERCALL_MBZ); - vcpu->arch.complete_userspace_io = complete_hypercall_exit; + vcpu->arch.complete_userspace_io = complete_hypercall; return 0; } default: ret = -KVM_ENOSYS; break; } + out: - if (!op_64_bit) - ret = (u32)ret; - kvm_rax_write(vcpu, ret); + vcpu->run->hypercall.ret = ret; + return 1; +} +EXPORT_SYMBOL_GPL(____kvm_emulate_hypercall); - ++vcpu->stat.hypercalls; - return kvm_skip_emulated_instruction(vcpu); +int kvm_emulate_hypercall(struct kvm_vcpu *vcpu) +{ + if (kvm_xen_hypercall_enabled(vcpu->kvm)) + return kvm_xen_hypercall(vcpu); + + if (kvm_hv_hypercall_enabled(vcpu)) + return kvm_hv_hypercall(vcpu); + + return __kvm_emulate_hypercall(vcpu, kvm_x86_call(get_cpl)(vcpu), + complete_hypercall_exit); } EXPORT_SYMBOL_GPL(kvm_emulate_hypercall); @@ -10173,7 +10210,7 @@ static int emulator_fix_hypercall(struct x86_emulate_ctxt *ctxt) return X86EMUL_PROPAGATE_FAULT; } - static_call(kvm_x86_patch_hypercall)(vcpu, instruction); + kvm_x86_call(patch_hypercall)(vcpu, instruction); return emulator_write_emulated(ctxt, rip, instruction, 3, &ctxt->exception); @@ -10190,9 +10227,9 @@ static void post_kvm_run_save(struct kvm_vcpu *vcpu) { struct kvm_run *kvm_run = vcpu->run; - kvm_run->if_flag = static_call(kvm_x86_get_if_flag)(vcpu); + kvm_run->if_flag = kvm_x86_call(get_if_flag)(vcpu); kvm_run->cr8 = kvm_get_cr8(vcpu); - kvm_run->apic_base = kvm_get_apic_base(vcpu); + kvm_run->apic_base = vcpu->arch.apic_base; kvm_run->ready_for_interrupt_injection = pic_in_kernel(vcpu->kvm) || @@ -10200,6 +10237,8 @@ static void post_kvm_run_save(struct kvm_vcpu *vcpu) if (is_smm(vcpu)) kvm_run->flags |= KVM_RUN_X86_SMM; + if (is_guest_mode(vcpu)) + kvm_run->flags |= KVM_RUN_X86_GUEST_MODE; } static void update_cr8_intercept(struct kvm_vcpu *vcpu) @@ -10225,7 +10264,7 @@ static void update_cr8_intercept(struct kvm_vcpu *vcpu) tpr = kvm_lapic_get_cr8(vcpu); - static_call(kvm_x86_update_cr8_intercept)(vcpu, tpr, max_irr); + kvm_x86_call(update_cr8_intercept)(vcpu, tpr, max_irr); } @@ -10255,7 +10294,7 @@ static void kvm_inject_exception(struct kvm_vcpu *vcpu) vcpu->arch.exception.error_code, vcpu->arch.exception.injected); - static_call(kvm_x86_inject_exception)(vcpu); + kvm_x86_call(inject_exception)(vcpu); } /* @@ -10341,9 +10380,9 @@ static int kvm_check_and_inject_events(struct kvm_vcpu *vcpu, else if (kvm_is_exception_pending(vcpu)) ; /* see above */ else if (vcpu->arch.nmi_injected) - static_call(kvm_x86_inject_nmi)(vcpu); + kvm_x86_call(inject_nmi)(vcpu); else if (vcpu->arch.interrupt.injected) - static_call(kvm_x86_inject_irq)(vcpu, true); + kvm_x86_call(inject_irq)(vcpu, true); /* * Exceptions that morph to VM-Exits are handled above, and pending @@ -10428,7 +10467,8 @@ static int kvm_check_and_inject_events(struct kvm_vcpu *vcpu, */ #ifdef CONFIG_KVM_SMM if (vcpu->arch.smi_pending) { - r = can_inject ? static_call(kvm_x86_smi_allowed)(vcpu, true) : -EBUSY; + r = can_inject ? kvm_x86_call(smi_allowed)(vcpu, true) : + -EBUSY; if (r < 0) goto out; if (r) { @@ -10437,27 +10477,29 @@ static int kvm_check_and_inject_events(struct kvm_vcpu *vcpu, enter_smm(vcpu); can_inject = false; } else - static_call(kvm_x86_enable_smi_window)(vcpu); + kvm_x86_call(enable_smi_window)(vcpu); } #endif if (vcpu->arch.nmi_pending) { - r = can_inject ? static_call(kvm_x86_nmi_allowed)(vcpu, true) : -EBUSY; + r = can_inject ? kvm_x86_call(nmi_allowed)(vcpu, true) : + -EBUSY; if (r < 0) goto out; if (r) { --vcpu->arch.nmi_pending; vcpu->arch.nmi_injected = true; - static_call(kvm_x86_inject_nmi)(vcpu); + kvm_x86_call(inject_nmi)(vcpu); can_inject = false; - WARN_ON(static_call(kvm_x86_nmi_allowed)(vcpu, true) < 0); + WARN_ON(kvm_x86_call(nmi_allowed)(vcpu, true) < 0); } if (vcpu->arch.nmi_pending) - static_call(kvm_x86_enable_nmi_window)(vcpu); + kvm_x86_call(enable_nmi_window)(vcpu); } if (kvm_cpu_has_injectable_intr(vcpu)) { - r = can_inject ? static_call(kvm_x86_interrupt_allowed)(vcpu, true) : -EBUSY; + r = can_inject ? kvm_x86_call(interrupt_allowed)(vcpu, true) : + -EBUSY; if (r < 0) goto out; if (r) { @@ -10465,17 +10507,17 @@ static int kvm_check_and_inject_events(struct kvm_vcpu *vcpu, if (!WARN_ON_ONCE(irq == -1)) { kvm_queue_interrupt(vcpu, irq, false); - static_call(kvm_x86_inject_irq)(vcpu, false); - WARN_ON(static_call(kvm_x86_interrupt_allowed)(vcpu, true) < 0); + kvm_x86_call(inject_irq)(vcpu, false); + WARN_ON(kvm_x86_call(interrupt_allowed)(vcpu, true) < 0); } } if (kvm_cpu_has_injectable_intr(vcpu)) - static_call(kvm_x86_enable_irq_window)(vcpu); + kvm_x86_call(enable_irq_window)(vcpu); } if (is_guest_mode(vcpu) && kvm_x86_ops.nested_ops->has_events && - kvm_x86_ops.nested_ops->has_events(vcpu)) + kvm_x86_ops.nested_ops->has_events(vcpu, true)) *req_immediate_exit = true; /* @@ -10516,7 +10558,7 @@ static void process_nmi(struct kvm_vcpu *vcpu) * blocks NMIs). KVM will immediately inject one of the two NMIs, and * will request an NMI window to handle the second NMI. */ - if (static_call(kvm_x86_get_nmi_mask)(vcpu) || vcpu->arch.nmi_injected) + if (kvm_x86_call(get_nmi_mask)(vcpu) || vcpu->arch.nmi_injected) limit = 1; else limit = 2; @@ -10525,14 +10567,14 @@ static void process_nmi(struct kvm_vcpu *vcpu) * Adjust the limit to account for pending virtual NMIs, which aren't * tracked in vcpu->arch.nmi_pending. */ - if (static_call(kvm_x86_is_vnmi_pending)(vcpu)) + if (kvm_x86_call(is_vnmi_pending)(vcpu)) limit--; vcpu->arch.nmi_pending += atomic_xchg(&vcpu->arch.nmi_queued, 0); vcpu->arch.nmi_pending = min(vcpu->arch.nmi_pending, limit); if (vcpu->arch.nmi_pending && - (static_call(kvm_x86_set_vnmi_pending)(vcpu))) + (kvm_x86_call(set_vnmi_pending)(vcpu))) vcpu->arch.nmi_pending--; if (vcpu->arch.nmi_pending) @@ -10543,7 +10585,7 @@ static void process_nmi(struct kvm_vcpu *vcpu) int kvm_get_nr_pending_nmis(struct kvm_vcpu *vcpu) { return vcpu->arch.nmi_pending + - static_call(kvm_x86_is_vnmi_pending)(vcpu); + kvm_x86_call(is_vnmi_pending)(vcpu); } void kvm_make_scan_ioapic_request_mask(struct kvm *kvm, @@ -10577,7 +10619,7 @@ void __kvm_vcpu_update_apicv(struct kvm_vcpu *vcpu) apic->apicv_active = activate; kvm_apic_update_apicv(vcpu); - static_call(kvm_x86_refresh_apicv_exec_ctrl)(vcpu); + kvm_x86_call(refresh_apicv_exec_ctrl)(vcpu); /* * When APICv gets disabled, we may still have injected interrupts @@ -10604,8 +10646,8 @@ static void kvm_vcpu_update_apicv(struct kvm_vcpu *vcpu) * deleted if any vCPU has xAPIC virtualization and x2APIC enabled, but * and hardware doesn't support x2APIC virtualization. E.g. some AMD * CPUs support AVIC but not x2APIC. KVM still allows enabling AVIC in - * this case so that KVM can the AVIC doorbell to inject interrupts to - * running vCPUs, but KVM must not create SPTEs for the APIC base as + * this case so that KVM can use the AVIC doorbell to inject interrupts + * to running vCPUs, but KVM must not create SPTEs for the APIC base as * the vCPU would incorrectly be able to access the vAPIC page via MMIO * despite being in x2APIC mode. For simplicity, inhibiting the APIC * access page is sticky. @@ -10634,11 +10676,11 @@ void __kvm_set_or_clear_apicv_inhibit(struct kvm *kvm, if (!!old != !!new) { /* * Kick all vCPUs before setting apicv_inhibit_reasons to avoid - * false positives in the sanity check WARN in svm_vcpu_run(). + * false positives in the sanity check WARN in vcpu_enter_guest(). * This task will wait for all vCPUs to ack the kick IRQ before * updating apicv_inhibit_reasons, and all other vCPUs will * block on acquiring apicv_update_lock so that vCPUs can't - * redo svm_vcpu_run() without seeing the new inhibit state. + * redo vcpu_enter_guest() without seeing the new inhibit state. * * Note, holding apicv_update_lock and taking it in the read * side (handling the request) also prevents other vCPUs from @@ -10676,14 +10718,14 @@ static void vcpu_scan_ioapic(struct kvm_vcpu *vcpu) return; bitmap_zero(vcpu->arch.ioapic_handled_vectors, 256); + vcpu->arch.highest_stale_pending_ioapic_eoi = -1; + + kvm_x86_call(sync_pir_to_irr)(vcpu); if (irqchip_split(vcpu->kvm)) kvm_scan_ioapic_routes(vcpu, vcpu->arch.ioapic_handled_vectors); - else { - static_call_cond(kvm_x86_sync_pir_to_irr)(vcpu); - if (ioapic_in_kernel(vcpu->kvm)) - kvm_ioapic_scan_entry(vcpu, vcpu->arch.ioapic_handled_vectors); - } + else if (ioapic_in_kernel(vcpu->kvm)) + kvm_ioapic_scan_entry(vcpu, vcpu->arch.ioapic_handled_vectors); if (is_guest_mode(vcpu)) vcpu->arch.load_eoi_exitmap_pending = true; @@ -10703,17 +10745,17 @@ static void vcpu_load_eoi_exitmap(struct kvm_vcpu *vcpu) bitmap_or((ulong *)eoi_exit_bitmap, vcpu->arch.ioapic_handled_vectors, to_hv_synic(vcpu)->vec_bitmap, 256); - static_call_cond(kvm_x86_load_eoi_exitmap)(vcpu, eoi_exit_bitmap); + kvm_x86_call(load_eoi_exitmap)(vcpu, eoi_exit_bitmap); return; } #endif - static_call_cond(kvm_x86_load_eoi_exitmap)( + kvm_x86_call(load_eoi_exitmap)( vcpu, (u64 *)vcpu->arch.ioapic_handled_vectors); } void kvm_arch_guest_memory_reclaimed(struct kvm *kvm) { - static_call_cond(kvm_x86_guest_memory_reclaimed)(kvm); + kvm_x86_call(guest_memory_reclaimed)(kvm); } static void kvm_vcpu_reload_apic_access_page(struct kvm_vcpu *vcpu) @@ -10721,7 +10763,7 @@ static void kvm_vcpu_reload_apic_access_page(struct kvm_vcpu *vcpu) if (!lapic_in_kernel(vcpu)) return; - static_call_cond(kvm_x86_set_apic_access_page_addr)(vcpu); + kvm_x86_call(set_apic_access_page_addr)(vcpu); } /* @@ -10885,10 +10927,18 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu) if (kvm_check_request(KVM_REQ_APF_READY, vcpu)) kvm_check_async_pf_completion(vcpu); if (kvm_check_request(KVM_REQ_MSR_FILTER_CHANGED, vcpu)) - static_call(kvm_x86_msr_filter_changed)(vcpu); + kvm_x86_call(msr_filter_changed)(vcpu); if (kvm_check_request(KVM_REQ_UPDATE_CPU_DIRTY_LOGGING, vcpu)) - static_call(kvm_x86_update_cpu_dirty_logging)(vcpu); + kvm_x86_call(update_cpu_dirty_logging)(vcpu); + + if (kvm_check_request(KVM_REQ_UPDATE_PROTECTED_GUEST_STATE, vcpu)) { + kvm_vcpu_reset(vcpu, true); + if (vcpu->arch.mp_state != KVM_MP_STATE_RUNNABLE) { + r = 1; + goto out; + } + } } if (kvm_check_request(KVM_REQ_EVENT, vcpu) || req_int_win || @@ -10910,7 +10960,7 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu) goto out; } if (req_int_win) - static_call(kvm_x86_enable_irq_window)(vcpu); + kvm_x86_call(enable_irq_window)(vcpu); if (kvm_lapic_enabled(vcpu)) { update_cr8_intercept(vcpu); @@ -10925,7 +10975,7 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu) preempt_disable(); - static_call(kvm_x86_prepare_switch_to_guest)(vcpu); + kvm_x86_call(prepare_switch_to_guest)(vcpu); /* * Disable IRQs before setting IN_GUEST_MODE. Posted interrupt @@ -10961,7 +11011,7 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu) * i.e. they can post interrupts even if APICv is temporarily disabled. */ if (kvm_lapic_enabled(vcpu)) - static_call_cond(kvm_x86_sync_pir_to_irr)(vcpu); + kvm_x86_call(sync_pir_to_irr)(vcpu); if (kvm_vcpu_exit_request(vcpu)) { vcpu->mode = OUTSIDE_GUEST_MODE; @@ -10981,18 +11031,24 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu) switch_fpu_return(); if (vcpu->arch.guest_fpu.xfd_err) - wrmsrl(MSR_IA32_XFD_ERR, vcpu->arch.guest_fpu.xfd_err); + wrmsrq(MSR_IA32_XFD_ERR, vcpu->arch.guest_fpu.xfd_err); - if (unlikely(vcpu->arch.switch_db_regs)) { + if (unlikely(vcpu->arch.switch_db_regs && + !(vcpu->arch.switch_db_regs & KVM_DEBUGREG_AUTO_SWITCH))) { set_debugreg(0, 7); set_debugreg(vcpu->arch.eff_db[0], 0); set_debugreg(vcpu->arch.eff_db[1], 1); set_debugreg(vcpu->arch.eff_db[2], 2); set_debugreg(vcpu->arch.eff_db[3], 3); + /* When KVM_DEBUGREG_WONT_EXIT, dr6 is accessible in guest. */ + if (unlikely(vcpu->arch.switch_db_regs & KVM_DEBUGREG_WONT_EXIT)) + kvm_x86_call(set_dr6)(vcpu, vcpu->arch.dr6); } else if (unlikely(hw_breakpoint_active())) { set_debugreg(0, 7); } + vcpu->arch.host_debugctl = get_debugctlmsr(); + guest_timing_enter_irqoff(); for (;;) { @@ -11005,12 +11061,13 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu) WARN_ON_ONCE((kvm_vcpu_apicv_activated(vcpu) != kvm_vcpu_apicv_active(vcpu)) && (kvm_get_apic_mode(vcpu) != LAPIC_MODE_DISABLED)); - exit_fastpath = static_call(kvm_x86_vcpu_run)(vcpu, req_immediate_exit); + exit_fastpath = kvm_x86_call(vcpu_run)(vcpu, + req_immediate_exit); if (likely(exit_fastpath != EXIT_FASTPATH_REENTER_GUEST)) break; if (kvm_lapic_enabled(vcpu)) - static_call_cond(kvm_x86_sync_pir_to_irr)(vcpu); + kvm_x86_call(sync_pir_to_irr)(vcpu); if (unlikely(kvm_vcpu_exit_request(vcpu))) { exit_fastpath = EXIT_FASTPATH_EXIT_HANDLED; @@ -11029,7 +11086,8 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu) */ if (unlikely(vcpu->arch.switch_db_regs & KVM_DEBUGREG_WONT_EXIT)) { WARN_ON(vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP); - static_call(kvm_x86_sync_dirty_debug_regs)(vcpu); + WARN_ON(vcpu->arch.switch_db_regs & KVM_DEBUGREG_AUTO_SWITCH); + kvm_x86_call(sync_dirty_debug_regs)(vcpu); kvm_update_dr0123(vcpu); kvm_update_dr7(vcpu); } @@ -11058,10 +11116,10 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu) if (vcpu->arch.xfd_no_write_intercept) fpu_sync_guest_vmexit_xfd_state(); - static_call(kvm_x86_handle_exit_irqoff)(vcpu); + kvm_x86_call(handle_exit_irqoff)(vcpu); if (vcpu->arch.guest_fpu.xfd_err) - wrmsrl(MSR_IA32_XFD_ERR, 0); + wrmsrq(MSR_IA32_XFD_ERR, 0); /* * Consume any pending interrupts, including the possible source of @@ -11091,9 +11149,16 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu) kvm_vcpu_srcu_read_lock(vcpu); /* + * Call this to ensure WC buffers in guest are evicted after each VM + * Exit, so that the evicted WC writes can be snooped across all cpus + */ + smp_mb__after_srcu_read_lock(); + + /* * Profile KVM exit RIPs: */ - if (unlikely(prof_on == KVM_PROFILING)) { + if (unlikely(prof_on == KVM_PROFILING && + !vcpu->arch.guest_state_protected)) { unsigned long rip = kvm_rip_read(vcpu); profile_hit(KVM_PROFILING, (void *)rip); } @@ -11104,19 +11169,82 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu) if (vcpu->arch.apic_attention) kvm_lapic_sync_from_vapic(vcpu); - r = static_call(kvm_x86_handle_exit)(vcpu, exit_fastpath); + if (unlikely(exit_fastpath == EXIT_FASTPATH_EXIT_USERSPACE)) + return 0; + + r = kvm_x86_call(handle_exit)(vcpu, exit_fastpath); return r; cancel_injection: if (req_immediate_exit) kvm_make_request(KVM_REQ_EVENT, vcpu); - static_call(kvm_x86_cancel_injection)(vcpu); + kvm_x86_call(cancel_injection)(vcpu); if (unlikely(vcpu->arch.apic_attention)) kvm_lapic_sync_from_vapic(vcpu); out: return r; } +static bool kvm_vcpu_running(struct kvm_vcpu *vcpu) +{ + return (vcpu->arch.mp_state == KVM_MP_STATE_RUNNABLE && + !vcpu->arch.apf.halted); +} + +bool kvm_vcpu_has_events(struct kvm_vcpu *vcpu) +{ + if (!list_empty_careful(&vcpu->async_pf.done)) + return true; + + if (kvm_apic_has_pending_init_or_sipi(vcpu) && + kvm_apic_init_sipi_allowed(vcpu)) + return true; + + if (kvm_is_exception_pending(vcpu)) + return true; + + if (kvm_test_request(KVM_REQ_NMI, vcpu) || + (vcpu->arch.nmi_pending && + kvm_x86_call(nmi_allowed)(vcpu, false))) + return true; + +#ifdef CONFIG_KVM_SMM + if (kvm_test_request(KVM_REQ_SMI, vcpu) || + (vcpu->arch.smi_pending && + kvm_x86_call(smi_allowed)(vcpu, false))) + return true; +#endif + + if (kvm_test_request(KVM_REQ_PMI, vcpu)) + return true; + + if (kvm_test_request(KVM_REQ_UPDATE_PROTECTED_GUEST_STATE, vcpu)) + return true; + + if (kvm_arch_interrupt_allowed(vcpu) && kvm_cpu_has_interrupt(vcpu)) + return true; + + if (kvm_hv_has_stimer_pending(vcpu)) + return true; + + if (is_guest_mode(vcpu) && + kvm_x86_ops.nested_ops->has_events && + kvm_x86_ops.nested_ops->has_events(vcpu, false)) + return true; + + if (kvm_xen_has_pending_events(vcpu)) + return true; + + return false; +} +EXPORT_SYMBOL_GPL(kvm_vcpu_has_events); + +int kvm_arch_vcpu_runnable(struct kvm_vcpu *vcpu) +{ + return kvm_vcpu_running(vcpu) || vcpu->arch.pv.pv_unhalted || + kvm_vcpu_has_events(vcpu); +} + /* Called within kvm->srcu read side. */ static inline int vcpu_block(struct kvm_vcpu *vcpu) { @@ -11160,7 +11288,10 @@ static inline int vcpu_block(struct kvm_vcpu *vcpu) * causes a spurious wakeup from HLT). */ if (is_guest_mode(vcpu)) { - if (kvm_check_nested_events(vcpu) < 0) + int r = kvm_check_nested_events(vcpu); + + WARN_ON_ONCE(r == -EBUSY); + if (r < 0) return 0; } @@ -11169,9 +11300,7 @@ static inline int vcpu_block(struct kvm_vcpu *vcpu) switch(vcpu->arch.mp_state) { case KVM_MP_STATE_HALTED: case KVM_MP_STATE_AP_RESET_HOLD: - vcpu->arch.pv.pv_unhalted = false; - vcpu->arch.mp_state = - KVM_MP_STATE_RUNNABLE; + kvm_set_mp_state(vcpu, KVM_MP_STATE_RUNNABLE); fallthrough; case KVM_MP_STATE_RUNNABLE: vcpu->arch.apf.halted = false; @@ -11185,19 +11314,12 @@ static inline int vcpu_block(struct kvm_vcpu *vcpu) return 1; } -static inline bool kvm_vcpu_running(struct kvm_vcpu *vcpu) -{ - return (vcpu->arch.mp_state == KVM_MP_STATE_RUNNABLE && - !vcpu->arch.apf.halted); -} - /* Called within kvm->srcu read side. */ static int vcpu_run(struct kvm_vcpu *vcpu) { int r; vcpu->run->exit_reason = KVM_EXIT_UNKNOWN; - vcpu->arch.l1tf_flush_l1d = true; for (;;) { /* @@ -11243,6 +11365,97 @@ static int vcpu_run(struct kvm_vcpu *vcpu) return r; } +static int __kvm_emulate_halt(struct kvm_vcpu *vcpu, int state, int reason) +{ + /* + * The vCPU has halted, e.g. executed HLT. Update the run state if the + * local APIC is in-kernel, the run loop will detect the non-runnable + * state and halt the vCPU. Exit to userspace if the local APIC is + * managed by userspace, in which case userspace is responsible for + * handling wake events. + */ + ++vcpu->stat.halt_exits; + if (lapic_in_kernel(vcpu)) { + if (kvm_vcpu_has_events(vcpu) || vcpu->arch.pv.pv_unhalted) + state = KVM_MP_STATE_RUNNABLE; + kvm_set_mp_state(vcpu, state); + return 1; + } else { + vcpu->run->exit_reason = reason; + return 0; + } +} + +int kvm_emulate_halt_noskip(struct kvm_vcpu *vcpu) +{ + return __kvm_emulate_halt(vcpu, KVM_MP_STATE_HALTED, KVM_EXIT_HLT); +} +EXPORT_SYMBOL_GPL(kvm_emulate_halt_noskip); + +int kvm_emulate_halt(struct kvm_vcpu *vcpu) +{ + int ret = kvm_skip_emulated_instruction(vcpu); + /* + * TODO: we might be squashing a GUESTDBG_SINGLESTEP-triggered + * KVM_EXIT_DEBUG here. + */ + return kvm_emulate_halt_noskip(vcpu) && ret; +} +EXPORT_SYMBOL_GPL(kvm_emulate_halt); + +fastpath_t handle_fastpath_hlt(struct kvm_vcpu *vcpu) +{ + int ret; + + kvm_vcpu_srcu_read_lock(vcpu); + ret = kvm_emulate_halt(vcpu); + kvm_vcpu_srcu_read_unlock(vcpu); + + if (!ret) + return EXIT_FASTPATH_EXIT_USERSPACE; + + if (kvm_vcpu_running(vcpu)) + return EXIT_FASTPATH_REENTER_GUEST; + + return EXIT_FASTPATH_EXIT_HANDLED; +} +EXPORT_SYMBOL_GPL(handle_fastpath_hlt); + +int kvm_emulate_ap_reset_hold(struct kvm_vcpu *vcpu) +{ + int ret = kvm_skip_emulated_instruction(vcpu); + + return __kvm_emulate_halt(vcpu, KVM_MP_STATE_AP_RESET_HOLD, + KVM_EXIT_AP_RESET_HOLD) && ret; +} +EXPORT_SYMBOL_GPL(kvm_emulate_ap_reset_hold); + +bool kvm_arch_dy_has_pending_interrupt(struct kvm_vcpu *vcpu) +{ + return kvm_vcpu_apicv_active(vcpu) && + kvm_x86_call(dy_apicv_has_pending_interrupt)(vcpu); +} + +bool kvm_arch_vcpu_preempted_in_kernel(struct kvm_vcpu *vcpu) +{ + return vcpu->arch.preempted_in_kernel; +} + +bool kvm_arch_dy_runnable(struct kvm_vcpu *vcpu) +{ + if (READ_ONCE(vcpu->arch.pv.pv_unhalted)) + return true; + + if (kvm_test_request(KVM_REQ_NMI, vcpu) || +#ifdef CONFIG_KVM_SMM + kvm_test_request(KVM_REQ_SMI, vcpu) || +#endif + kvm_test_request(KVM_REQ_EVENT, vcpu)) + return true; + + return kvm_arch_dy_has_pending_interrupt(vcpu); +} + static inline int complete_emulated_io(struct kvm_vcpu *vcpu) { return kvm_emulate_instruction(vcpu, EMULTYPE_NO_DECODE); @@ -11338,8 +11551,13 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu) { struct kvm_queued_exception *ex = &vcpu->arch.exception; struct kvm_run *kvm_run = vcpu->run; + u64 sync_valid_fields; int r; + r = kvm_mmu_post_init_vm(vcpu->kvm); + if (r) + return r; + vcpu_load(vcpu); kvm_sigset_activate(vcpu); kvm_run->flags = 0; @@ -11347,7 +11565,7 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu) kvm_vcpu_srcu_read_lock(vcpu); if (unlikely(vcpu->arch.mp_state == KVM_MP_STATE_UNINITIALIZED)) { - if (kvm_run->immediate_exit) { + if (!vcpu->wants_to_run) { r = -EINTR; goto out; } @@ -11379,8 +11597,9 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu) goto out; } - if ((kvm_run->kvm_valid_regs & ~KVM_SYNC_X86_VALID_FIELDS) || - (kvm_run->kvm_dirty_regs & ~KVM_SYNC_X86_VALID_FIELDS)) { + sync_valid_fields = kvm_sync_valid_fields(vcpu->kvm); + if ((kvm_run->kvm_valid_regs & ~sync_valid_fields) || + (kvm_run->kvm_dirty_regs & ~sync_valid_fields)) { r = -EINVAL; goto out; } @@ -11425,12 +11644,12 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu) WARN_ON_ONCE(vcpu->mmio_needed); } - if (kvm_run->immediate_exit) { + if (!vcpu->wants_to_run) { r = -EINTR; goto out; } - r = static_call(kvm_x86_vcpu_pre_run)(vcpu); + r = kvm_x86_call(vcpu_pre_run)(vcpu); if (r <= 0) goto out; @@ -11438,7 +11657,7 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu) out: kvm_put_guest_fpu(vcpu); - if (kvm_run->kvm_valid_regs) + if (kvm_run->kvm_valid_regs && likely(!vcpu->arch.guest_state_protected)) store_regs(vcpu); post_kvm_run_save(vcpu); kvm_vcpu_srcu_read_unlock(vcpu); @@ -11486,6 +11705,10 @@ static void __get_regs(struct kvm_vcpu *vcpu, struct kvm_regs *regs) int kvm_arch_vcpu_ioctl_get_regs(struct kvm_vcpu *vcpu, struct kvm_regs *regs) { + if (vcpu->kvm->arch.has_protected_state && + vcpu->arch.guest_state_protected) + return -EINVAL; + vcpu_load(vcpu); __get_regs(vcpu, regs); vcpu_put(vcpu); @@ -11527,6 +11750,10 @@ static void __set_regs(struct kvm_vcpu *vcpu, struct kvm_regs *regs) int kvm_arch_vcpu_ioctl_set_regs(struct kvm_vcpu *vcpu, struct kvm_regs *regs) { + if (vcpu->kvm->arch.has_protected_state && + vcpu->arch.guest_state_protected) + return -EINVAL; + vcpu_load(vcpu); __set_regs(vcpu, regs); vcpu_put(vcpu); @@ -11550,10 +11777,10 @@ static void __get_sregs_common(struct kvm_vcpu *vcpu, struct kvm_sregs *sregs) kvm_get_segment(vcpu, &sregs->tr, VCPU_SREG_TR); kvm_get_segment(vcpu, &sregs->ldt, VCPU_SREG_LDTR); - static_call(kvm_x86_get_idt)(vcpu, &dt); + kvm_x86_call(get_idt)(vcpu, &dt); sregs->idt.limit = dt.size; sregs->idt.base = dt.address; - static_call(kvm_x86_get_gdt)(vcpu, &dt); + kvm_x86_call(get_gdt)(vcpu, &dt); sregs->gdt.limit = dt.size; sregs->gdt.base = dt.address; @@ -11565,7 +11792,7 @@ skip_protected_regs: sregs->cr4 = kvm_read_cr4(vcpu); sregs->cr8 = kvm_get_cr8(vcpu); sregs->efer = vcpu->arch.efer; - sregs->apic_base = kvm_get_apic_base(vcpu); + sregs->apic_base = vcpu->arch.apic_base; } static void __get_sregs(struct kvm_vcpu *vcpu, struct kvm_sregs *sregs) @@ -11599,6 +11826,10 @@ static void __get_sregs2(struct kvm_vcpu *vcpu, struct kvm_sregs2 *sregs2) int kvm_arch_vcpu_ioctl_get_sregs(struct kvm_vcpu *vcpu, struct kvm_sregs *sregs) { + if (vcpu->kvm->arch.has_protected_state && + vcpu->arch.guest_state_protected) + return -EINVAL; + vcpu_load(vcpu); __get_sregs(vcpu, sregs); vcpu_put(vcpu); @@ -11614,6 +11845,8 @@ int kvm_arch_vcpu_ioctl_get_mpstate(struct kvm_vcpu *vcpu, if (kvm_mpx_supported()) kvm_load_guest_fpu(vcpu); + kvm_vcpu_srcu_read_lock(vcpu); + r = kvm_apic_accept_events(vcpu); if (r < 0) goto out; @@ -11627,6 +11860,8 @@ int kvm_arch_vcpu_ioctl_get_mpstate(struct kvm_vcpu *vcpu, mp_state->mp_state = vcpu->arch.mp_state; out: + kvm_vcpu_srcu_read_unlock(vcpu); + if (kvm_mpx_supported()) kvm_put_guest_fpu(vcpu); vcpu_put(vcpu); @@ -11669,10 +11904,10 @@ int kvm_arch_vcpu_ioctl_set_mpstate(struct kvm_vcpu *vcpu, goto out; if (mp_state->mp_state == KVM_MP_STATE_SIPI_RECEIVED) { - vcpu->arch.mp_state = KVM_MP_STATE_INIT_RECEIVED; + kvm_set_mp_state(vcpu, KVM_MP_STATE_INIT_RECEIVED); set_bit(KVM_APIC_SIPI, &vcpu->arch.apic->pending_events); } else - vcpu->arch.mp_state = mp_state->mp_state; + kvm_set_mp_state(vcpu, mp_state->mp_state); kvm_make_request(KVM_REQ_EVENT, vcpu); ret = 0; @@ -11691,7 +11926,13 @@ int kvm_task_switch(struct kvm_vcpu *vcpu, u16 tss_selector, int idt_index, ret = emulator_task_switch(ctxt, tss_selector, idt_index, reason, has_error_code, error_code); - if (ret) { + + /* + * Report an error userspace if MMIO is needed, as KVM doesn't support + * MMIO during a task switch (or any other complex operation). + */ + if (ret || vcpu->mmio_needed) { + vcpu->mmio_needed = false; vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR; vcpu->run->internal.suberror = KVM_INTERNAL_ERROR_EMULATION; vcpu->run->internal.ndata = 0; @@ -11732,16 +11973,13 @@ static bool kvm_is_valid_sregs(struct kvm_vcpu *vcpu, struct kvm_sregs *sregs) static int __set_sregs_common(struct kvm_vcpu *vcpu, struct kvm_sregs *sregs, int *mmu_reset_needed, bool update_pdptrs) { - struct msr_data apic_base_msr; int idx; struct desc_ptr dt; if (!kvm_is_valid_sregs(vcpu, sregs)) return -EINVAL; - apic_base_msr.data = sregs->apic_base; - apic_base_msr.host_initiated = true; - if (kvm_set_apic_base(vcpu, &apic_base_msr)) + if (kvm_apic_set_base(vcpu, sregs->apic_base, true)) return -EINVAL; if (vcpu->arch.guest_state_protected) @@ -11749,27 +11987,27 @@ static int __set_sregs_common(struct kvm_vcpu *vcpu, struct kvm_sregs *sregs, dt.size = sregs->idt.limit; dt.address = sregs->idt.base; - static_call(kvm_x86_set_idt)(vcpu, &dt); + kvm_x86_call(set_idt)(vcpu, &dt); dt.size = sregs->gdt.limit; dt.address = sregs->gdt.base; - static_call(kvm_x86_set_gdt)(vcpu, &dt); + kvm_x86_call(set_gdt)(vcpu, &dt); vcpu->arch.cr2 = sregs->cr2; *mmu_reset_needed |= kvm_read_cr3(vcpu) != sregs->cr3; vcpu->arch.cr3 = sregs->cr3; kvm_register_mark_dirty(vcpu, VCPU_EXREG_CR3); - static_call_cond(kvm_x86_post_set_cr3)(vcpu, sregs->cr3); + kvm_x86_call(post_set_cr3)(vcpu, sregs->cr3); kvm_set_cr8(vcpu, sregs->cr8); *mmu_reset_needed |= vcpu->arch.efer != sregs->efer; - static_call(kvm_x86_set_efer)(vcpu, sregs->efer); + kvm_x86_call(set_efer)(vcpu, sregs->efer); *mmu_reset_needed |= kvm_read_cr0(vcpu) != sregs->cr0; - static_call(kvm_x86_set_cr0)(vcpu, sregs->cr0); + kvm_x86_call(set_cr0)(vcpu, sregs->cr0); *mmu_reset_needed |= kvm_read_cr4(vcpu) != sregs->cr4; - static_call(kvm_x86_set_cr4)(vcpu, sregs->cr4); + kvm_x86_call(set_cr4)(vcpu, sregs->cr4); if (update_pdptrs) { idx = srcu_read_lock(&vcpu->kvm->srcu); @@ -11796,7 +12034,7 @@ static int __set_sregs_common(struct kvm_vcpu *vcpu, struct kvm_sregs *sregs, if (kvm_vcpu_is_bsp(vcpu) && kvm_rip_read(vcpu) == 0xfff0 && sregs->cs.selector == 0xf000 && sregs->cs.base == 0xffff0000 && !is_protmode(vcpu)) - vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE; + kvm_set_mp_state(vcpu, KVM_MP_STATE_RUNNABLE); return 0; } @@ -11866,6 +12104,10 @@ int kvm_arch_vcpu_ioctl_set_sregs(struct kvm_vcpu *vcpu, { int ret; + if (vcpu->kvm->arch.has_protected_state && + vcpu->arch.guest_state_protected) + return -EINVAL; + vcpu_load(vcpu); ret = __set_sregs(vcpu, sregs); vcpu_put(vcpu); @@ -11943,7 +12185,7 @@ int kvm_arch_vcpu_ioctl_set_guest_debug(struct kvm_vcpu *vcpu, */ kvm_set_rflags(vcpu, rflags); - static_call(kvm_x86_update_exception_bitmap)(vcpu); + kvm_x86_call(update_exception_bitmap)(vcpu); kvm_arch_vcpu_guestdbg_update_apicv_inhibit(vcpu->kvm); @@ -11983,7 +12225,7 @@ int kvm_arch_vcpu_ioctl_get_fpu(struct kvm_vcpu *vcpu, struct kvm_fpu *fpu) struct fxregs_state *fxsave; if (fpstate_is_confidential(&vcpu->arch.guest_fpu)) - return 0; + return vcpu->kvm->arch.has_protected_state ? -EINVAL : 0; vcpu_load(vcpu); @@ -12006,7 +12248,7 @@ int kvm_arch_vcpu_ioctl_set_fpu(struct kvm_vcpu *vcpu, struct kvm_fpu *fpu) struct fxregs_state *fxsave; if (fpstate_is_confidential(&vcpu->arch.guest_fpu)) - return 0; + return vcpu->kvm->arch.has_protected_state ? -EINVAL : 0; vcpu_load(vcpu); @@ -12080,7 +12322,7 @@ int kvm_arch_vcpu_precreate(struct kvm *kvm, unsigned int id) if (id >= kvm->arch.max_vcpu_ids) return -EINVAL; - return static_call(kvm_x86_vcpu_precreate)(kvm); + return kvm_x86_call(vcpu_precreate)(kvm); } int kvm_arch_vcpu_create(struct kvm_vcpu *vcpu) @@ -12095,15 +12337,15 @@ int kvm_arch_vcpu_create(struct kvm_vcpu *vcpu) kvm_gpc_init(&vcpu->arch.pv_time, vcpu->kvm); if (!irqchip_in_kernel(vcpu->kvm) || kvm_vcpu_is_reset_bsp(vcpu)) - vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE; + kvm_set_mp_state(vcpu, KVM_MP_STATE_RUNNABLE); else - vcpu->arch.mp_state = KVM_MP_STATE_UNINITIALIZED; + kvm_set_mp_state(vcpu, KVM_MP_STATE_UNINITIALIZED); r = kvm_mmu_create(vcpu); if (r < 0) return r; - r = kvm_create_lapic(vcpu, lapic_timer_advance_ns); + r = kvm_create_lapic(vcpu); if (r < 0) goto fail_mmu_destroy; @@ -12134,14 +12376,13 @@ int kvm_arch_vcpu_create(struct kvm_vcpu *vcpu) goto free_emulate_ctxt; } - vcpu->arch.maxphyaddr = cpuid_query_maxphyaddr(vcpu); - vcpu->arch.reserved_gpa_bits = kvm_vcpu_reserved_gpa_bits_raw(vcpu); - - vcpu->arch.pat = MSR_IA32_CR_PAT_DEFAULT; - kvm_async_pf_hash_reset(vcpu); - vcpu->arch.perf_capabilities = kvm_caps.supported_perf_cap; + if (kvm_check_has_quirk(vcpu->kvm, KVM_X86_QUIRK_STUFF_FEATURE_MSRS)) { + vcpu->arch.arch_capabilities = kvm_get_arch_capabilities(); + vcpu->arch.msr_platform_info = MSR_PLATFORM_INFO_CPUID_FAULT; + vcpu->arch.perf_capabilities = kvm_caps.supported_perf_cap; + } kvm_pmu_init(vcpu); vcpu->arch.pending_external_vector = -1; @@ -12151,15 +12392,13 @@ int kvm_arch_vcpu_create(struct kvm_vcpu *vcpu) vcpu->arch.hv_root_tdp = INVALID_PAGE; #endif - r = static_call(kvm_x86_vcpu_create)(vcpu); + r = kvm_x86_call(vcpu_create)(vcpu); if (r) goto free_guest_fpu; - vcpu->arch.arch_capabilities = kvm_get_arch_capabilities(); - vcpu->arch.msr_platform_info = MSR_PLATFORM_INFO_CPUID_FAULT; kvm_xen_init_vcpu(vcpu); - kvm_vcpu_mtrr_init(vcpu); vcpu_load(vcpu); + kvm_vcpu_after_set_cpuid(vcpu); kvm_set_tsc_khz(vcpu, vcpu->kvm->arch.default_tsc_khz); kvm_vcpu_reset(vcpu, false); kvm_init_mmu(vcpu); @@ -12205,11 +12444,17 @@ void kvm_arch_vcpu_postcreate(struct kvm_vcpu *vcpu) void kvm_arch_vcpu_destroy(struct kvm_vcpu *vcpu) { - int idx; + int idx, cpu; + + kvm_clear_async_pf_completion_queue(vcpu); + kvm_mmu_unload(vcpu); kvmclock_reset(vcpu); - static_call(kvm_x86_vcpu_free)(vcpu); + for_each_possible_cpu(cpu) + cmpxchg(per_cpu_ptr(&last_vcpu, cpu), vcpu, NULL); + + kvm_x86_call(vcpu_free)(vcpu); kmem_cache_free(x86_emulator_cache, vcpu->arch.emulate_ctxt); free_cpumask_var(vcpu->arch.wbinvd_dirty_mask); @@ -12305,6 +12550,8 @@ void kvm_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event) if (!init_event) { vcpu->arch.smbase = 0x30000; + vcpu->arch.pat = MSR_IA32_CR_PAT_DEFAULT; + vcpu->arch.msr_misc_features_enables = 0; vcpu->arch.ia32_misc_enable_msr = MSR_IA32_MISC_ENABLE_PEBS_UNAVAIL | MSR_IA32_MISC_ENABLE_BTS_UNAVAIL; @@ -12327,7 +12574,7 @@ void kvm_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event) cpuid_0x1 = kvm_find_cpuid_entry(vcpu, 1); kvm_rdx_write(vcpu, cpuid_0x1 ? cpuid_0x1->eax : 0x600); - static_call(kvm_x86_vcpu_reset)(vcpu, init_event); + kvm_x86_call(vcpu_reset)(vcpu, init_event); kvm_set_rflags(vcpu, X86_EFLAGS_FIXED); kvm_rip_write(vcpu, 0xfff0); @@ -12346,10 +12593,10 @@ void kvm_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event) else new_cr0 |= X86_CR0_NW | X86_CR0_CD; - static_call(kvm_x86_set_cr0)(vcpu, new_cr0); - static_call(kvm_x86_set_cr4)(vcpu, 0); - static_call(kvm_x86_set_efer)(vcpu, 0); - static_call(kvm_x86_update_exception_bitmap)(vcpu); + kvm_x86_call(set_cr0)(vcpu, new_cr0); + kvm_x86_call(set_cr4)(vcpu, 0); + kvm_x86_call(set_efer)(vcpu, 0); + kvm_x86_call(update_exception_bitmap)(vcpu); /* * On the standard CR0/CR4/EFER modification paths, there are several @@ -12390,7 +12637,17 @@ void kvm_vcpu_deliver_sipi_vector(struct kvm_vcpu *vcpu, u8 vector) } EXPORT_SYMBOL_GPL(kvm_vcpu_deliver_sipi_vector); -int kvm_arch_hardware_enable(void) +void kvm_arch_enable_virtualization(void) +{ + cpu_emergency_register_virt_callback(kvm_x86_ops.emergency_disable_virtualization_cpu); +} + +void kvm_arch_disable_virtualization(void) +{ + cpu_emergency_unregister_virt_callback(kvm_x86_ops.emergency_disable_virtualization_cpu); +} + +int kvm_arch_enable_virtualization_cpu(void) { struct kvm *kvm; struct kvm_vcpu *vcpu; @@ -12406,7 +12663,7 @@ int kvm_arch_hardware_enable(void) if (ret) return ret; - ret = static_call(kvm_x86_hardware_enable)(); + ret = kvm_x86_call(enable_virtualization_cpu)(); if (ret != 0) return ret; @@ -12486,9 +12743,9 @@ int kvm_arch_hardware_enable(void) return 0; } -void kvm_arch_hardware_disable(void) +void kvm_arch_disable_virtualization_cpu(void) { - static_call(kvm_x86_hardware_disable)(); + kvm_x86_call(disable_virtualization_cpu)(); drop_user_return_notifiers(); } @@ -12496,24 +12753,13 @@ bool kvm_vcpu_is_reset_bsp(struct kvm_vcpu *vcpu) { return vcpu->kvm->arch.bsp_vcpu_id == vcpu->vcpu_id; } +EXPORT_SYMBOL_GPL(kvm_vcpu_is_reset_bsp); bool kvm_vcpu_is_bsp(struct kvm_vcpu *vcpu) { return (vcpu->arch.apic_base & MSR_IA32_APICBASE_BSP) != 0; } -void kvm_arch_sched_in(struct kvm_vcpu *vcpu, int cpu) -{ - struct kvm_pmu *pmu = vcpu_to_pmu(vcpu); - - vcpu->arch.l1tf_flush_l1d = true; - if (pmu->version && unlikely(pmu->event_count)) { - pmu->need_cleanup = true; - kvm_make_request(KVM_REQ_PMU, vcpu); - } - static_call(kvm_x86_sched_in)(vcpu, cpu); -} - void kvm_arch_free_vm(struct kvm *kvm) { #if IS_ENABLED(CONFIG_HYPERV) @@ -12532,6 +12778,12 @@ int kvm_arch_init_vm(struct kvm *kvm, unsigned long type) return -EINVAL; kvm->arch.vm_type = type; + kvm->arch.has_private_mem = + (type == KVM_X86_SW_PROTECTED_VM); + /* Decided by the vendor code for other VM types. */ + kvm->arch.pre_fault_allowed = + type == KVM_X86_DEFAULT_VM || type == KVM_X86_SW_PROTECTED_VM; + kvm->arch.disabled_quirks = kvm_caps.inapplicable_quirks & kvm_caps.supported_quirks; ret = kvm_page_track_init(kvm); if (ret) @@ -12539,7 +12791,7 @@ int kvm_arch_init_vm(struct kvm *kvm, unsigned long type) kvm_mmu_init_vm(kvm); - ret = static_call(kvm_x86_vm_init)(kvm); + ret = kvm_x86_call(vm_init)(kvm); if (ret) goto out_uninit_mmu; @@ -12562,6 +12814,7 @@ int kvm_arch_init_vm(struct kvm *kvm, unsigned long type) raw_spin_unlock_irqrestore(&kvm->arch.tsc_write_lock, flags); kvm->arch.default_tsc_khz = max_tsc_khz ? : tsc_khz; + kvm->arch.apic_bus_cycle_ns = APIC_BUS_CYCLE_NS_DEFAULT; kvm->arch.guest_can_read_msr_platform_info = true; kvm->arch.enable_pmu = enable_pmu; @@ -12577,6 +12830,14 @@ int kvm_arch_init_vm(struct kvm *kvm, unsigned long type) kvm_hv_init_vm(kvm); kvm_xen_init_vm(kvm); + if (ignore_msrs && !report_ignored_msrs) { + pr_warn_once("Running KVM with ignore_msrs=1 and report_ignored_msrs=0 is not a\n" + "a supported configuration. Lying to the guest about the existence of MSRs\n" + "may cause the guest operating system to hang or produce errors. If a guest\n" + "does not run without ignore_msrs=1, please report it to kvm@vger.kernel.org.\n"); + } + + once_init(&kvm->arch.nx_once); return 0; out_uninit_mmu: @@ -12586,36 +12847,6 @@ out: return ret; } -int kvm_arch_post_init_vm(struct kvm *kvm) -{ - return kvm_mmu_post_init_vm(kvm); -} - -static void kvm_unload_vcpu_mmu(struct kvm_vcpu *vcpu) -{ - vcpu_load(vcpu); - kvm_mmu_unload(vcpu); - vcpu_put(vcpu); -} - -static void kvm_unload_vcpu_mmus(struct kvm *kvm) -{ - unsigned long i; - struct kvm_vcpu *vcpu; - - kvm_for_each_vcpu(i, vcpu, kvm) { - kvm_clear_async_pf_completion_queue(vcpu); - kvm_unload_vcpu_mmu(vcpu); - } -} - -void kvm_arch_sync_events(struct kvm *kvm) -{ - cancel_delayed_work_sync(&kvm->arch.kvmclock_sync_work); - cancel_delayed_work_sync(&kvm->arch.kvmclock_update_work); - kvm_free_pit(kvm); -} - /** * __x86_set_memory_region: Setup KVM internal memory slot * @@ -12646,7 +12877,8 @@ void __user * __x86_set_memory_region(struct kvm *kvm, int id, gpa_t gpa, struct kvm_memslots *slots = kvm_memslots(kvm); struct kvm_memory_slot *slot; - /* Called with kvm->slots_lock held. */ + lockdep_assert_held(&kvm->slots_lock); + if (WARN_ON(id >= KVM_MEM_SLOTS_NUM)) return ERR_PTR_USR(-EINVAL); @@ -12679,7 +12911,7 @@ void __user * __x86_set_memory_region(struct kvm *kvm, int id, gpa_t gpa, m.guest_phys_addr = gpa; m.userspace_addr = hva; m.memory_size = size; - r = __kvm_set_memory_region(kvm, &m); + r = kvm_set_internal_memslot(kvm, &m); if (r < 0) return ERR_PTR_USR(r); } @@ -12693,7 +12925,19 @@ EXPORT_SYMBOL_GPL(__x86_set_memory_region); void kvm_arch_pre_destroy_vm(struct kvm *kvm) { + /* + * Stop all background workers and kthreads before destroying vCPUs, as + * iterating over vCPUs in a different task while vCPUs are being freed + * is unsafe, i.e. will lead to use-after-free. The PIT also needs to + * be stopped before IRQ routing is freed. + */ + cancel_delayed_work_sync(&kvm->arch.kvmclock_sync_work); + cancel_delayed_work_sync(&kvm->arch.kvmclock_update_work); + + kvm_free_pit(kvm); + kvm_mmu_pre_destroy_vm(kvm); + static_call_cond(kvm_x86_vm_pre_destroy)(kvm); } void kvm_arch_destroy_vm(struct kvm *kvm) @@ -12712,18 +12956,17 @@ void kvm_arch_destroy_vm(struct kvm *kvm) __x86_set_memory_region(kvm, TSS_PRIVATE_MEMSLOT, 0, 0); mutex_unlock(&kvm->slots_lock); } - kvm_unload_vcpu_mmus(kvm); - static_call_cond(kvm_x86_vm_destroy)(kvm); + kvm_destroy_vcpus(kvm); kvm_free_msr_filter(srcu_dereference_check(kvm->arch.msr_filter, &kvm->srcu, 1)); kvm_pic_destroy(kvm); kvm_ioapic_destroy(kvm); - kvm_destroy_vcpus(kvm); kvfree(rcu_dereference_check(kvm->arch.apic_map, 1)); kfree(srcu_dereference_check(kvm->arch.pmu_event_filter, &kvm->srcu, 1)); kvm_mmu_uninit_vm(kvm); kvm_page_track_cleanup(kvm); kvm_xen_destroy_vm(kvm); kvm_hv_destroy_vm(kvm); + kvm_x86_call(vm_destroy)(kvm); } static void memslot_rmap_free(struct kvm_memory_slot *slot) @@ -12731,7 +12974,7 @@ static void memslot_rmap_free(struct kvm_memory_slot *slot) int i; for (i = 0; i < KVM_NR_PAGE_SIZES; ++i) { - kvfree(slot->arch.rmap[i]); + vfree(slot->arch.rmap[i]); slot->arch.rmap[i] = NULL; } } @@ -12743,7 +12986,7 @@ void kvm_arch_free_memslot(struct kvm *kvm, struct kvm_memory_slot *slot) memslot_rmap_free(slot); for (i = 1; i < KVM_NR_PAGE_SIZES; ++i) { - kvfree(slot->arch.lpage_info[i - 1]); + vfree(slot->arch.lpage_info[i - 1]); slot->arch.lpage_info[i - 1] = NULL; } @@ -12780,7 +13023,7 @@ static int kvm_alloc_memslot_metadata(struct kvm *kvm, /* * Clear out the previous array pointers for the KVM_MR_MOVE case. The - * old arrays will be freed by __kvm_set_memory_region() if installing + * old arrays will be freed by kvm_set_memory_region() if installing * the new memslot is successful. */ memset(&slot->arch, 0, sizeof(slot->arch)); @@ -12835,7 +13078,7 @@ out_free: memslot_rmap_free(slot); for (i = 1; i < KVM_NR_PAGE_SIZES; ++i) { - kvfree(slot->arch.lpage_info[i - 1]); + vfree(slot->arch.lpage_info[i - 1]); slot->arch.lpage_info[i - 1] = NULL; } return -ENOMEM; @@ -12873,6 +13116,9 @@ int kvm_arch_prepare_memory_region(struct kvm *kvm, if ((new->base_gfn + new->npages - 1) > kvm_mmu_max_gfn()) return -EINVAL; + if (kvm_is_gfn_alias(kvm, new->base_gfn + new->npages - 1)) + return -EINVAL; + return kvm_alloc_memslot_metadata(kvm, new); } @@ -12889,7 +13135,7 @@ static void kvm_mmu_update_cpu_dirty_logging(struct kvm *kvm, bool enable) { int nr_slots; - if (!kvm_x86_ops.cpu_dirty_log_size) + if (!kvm->arch.cpu_dirty_log_size) return; nr_slots = atomic_read(&kvm->nr_memslots_dirty_logging); @@ -12941,19 +13187,15 @@ static void kvm_mmu_slot_apply_flags(struct kvm *kvm, if (!log_dirty_pages) { /* - * Dirty logging tracks sptes in 4k granularity, meaning that - * large sptes have to be split. If live migration succeeds, - * the guest in the source machine will be destroyed and large - * sptes will be created in the destination. However, if the - * guest continues to run in the source machine (for example if - * live migration fails), small sptes will remain around and - * cause bad performance. + * Recover huge page mappings in the slot now that dirty logging + * is disabled, i.e. now that KVM does not have to track guest + * writes at 4KiB granularity. * - * Scan sptes if dirty logging has been stopped, dropping those - * which can be collapsed into a single large-page spte. Later - * page faults will create the large-page sptes. + * Dirty logging might be disabled by userspace if an ongoing VM + * live migration is cancelled and the VM must continue running + * on the source. */ - kvm_mmu_zap_collapsible_sptes(kvm, new); + kvm_mmu_recover_huge_pages(kvm, new); } else { /* * Initially-all-set does not require write protecting any page, @@ -12965,7 +13207,7 @@ static void kvm_mmu_slot_apply_flags(struct kvm *kvm, if (READ_ONCE(eager_page_split)) kvm_mmu_slot_try_split_huge_pages(kvm, new, PG_LEVEL_4K); - if (kvm_x86_ops.cpu_dirty_log_size) { + if (kvm->arch.cpu_dirty_log_size) { kvm_mmu_slot_leaf_clear_dirty(kvm, new); kvm_mmu_slot_remove_write_access(kvm, new, PG_LEVEL_2M); } else { @@ -13042,102 +13284,23 @@ void kvm_arch_commit_memory_region(struct kvm *kvm, kvm_arch_free_memslot(kvm, old); } -static inline bool kvm_guest_apic_has_interrupt(struct kvm_vcpu *vcpu) -{ - return (is_guest_mode(vcpu) && - static_call(kvm_x86_guest_apic_has_interrupt)(vcpu)); -} - -static inline bool kvm_vcpu_has_events(struct kvm_vcpu *vcpu) +bool kvm_arch_vcpu_in_kernel(struct kvm_vcpu *vcpu) { - if (!list_empty_careful(&vcpu->async_pf.done)) - return true; - - if (kvm_apic_has_pending_init_or_sipi(vcpu) && - kvm_apic_init_sipi_allowed(vcpu)) - return true; - - if (vcpu->arch.pv.pv_unhalted) - return true; - - if (kvm_is_exception_pending(vcpu)) - return true; - - if (kvm_test_request(KVM_REQ_NMI, vcpu) || - (vcpu->arch.nmi_pending && - static_call(kvm_x86_nmi_allowed)(vcpu, false))) - return true; - -#ifdef CONFIG_KVM_SMM - if (kvm_test_request(KVM_REQ_SMI, vcpu) || - (vcpu->arch.smi_pending && - static_call(kvm_x86_smi_allowed)(vcpu, false))) - return true; -#endif - - if (kvm_test_request(KVM_REQ_PMI, vcpu)) - return true; - - if (kvm_arch_interrupt_allowed(vcpu) && - (kvm_cpu_has_interrupt(vcpu) || - kvm_guest_apic_has_interrupt(vcpu))) - return true; - - if (kvm_hv_has_stimer_pending(vcpu)) - return true; + WARN_ON_ONCE(!kvm_arch_pmi_in_guest(vcpu)); - if (is_guest_mode(vcpu) && - kvm_x86_ops.nested_ops->has_events && - kvm_x86_ops.nested_ops->has_events(vcpu)) - return true; - - if (kvm_xen_has_pending_events(vcpu)) + if (vcpu->arch.guest_state_protected) return true; - return false; -} - -int kvm_arch_vcpu_runnable(struct kvm_vcpu *vcpu) -{ - return kvm_vcpu_running(vcpu) || kvm_vcpu_has_events(vcpu); -} - -bool kvm_arch_dy_has_pending_interrupt(struct kvm_vcpu *vcpu) -{ - return kvm_vcpu_apicv_active(vcpu) && - static_call(kvm_x86_dy_apicv_has_pending_interrupt)(vcpu); + return kvm_x86_call(get_cpl)(vcpu) == 0; } -bool kvm_arch_vcpu_preempted_in_kernel(struct kvm_vcpu *vcpu) -{ - return vcpu->arch.preempted_in_kernel; -} - -bool kvm_arch_dy_runnable(struct kvm_vcpu *vcpu) +unsigned long kvm_arch_vcpu_get_ip(struct kvm_vcpu *vcpu) { - if (READ_ONCE(vcpu->arch.pv.pv_unhalted)) - return true; + WARN_ON_ONCE(!kvm_arch_pmi_in_guest(vcpu)); - if (kvm_test_request(KVM_REQ_NMI, vcpu) || -#ifdef CONFIG_KVM_SMM - kvm_test_request(KVM_REQ_SMI, vcpu) || -#endif - kvm_test_request(KVM_REQ_EVENT, vcpu)) - return true; - - return kvm_arch_dy_has_pending_interrupt(vcpu); -} - -bool kvm_arch_vcpu_in_kernel(struct kvm_vcpu *vcpu) -{ if (vcpu->arch.guest_state_protected) - return true; - - return static_call(kvm_x86_get_cpl)(vcpu) == 0; -} + return 0; -unsigned long kvm_arch_vcpu_get_ip(struct kvm_vcpu *vcpu) -{ return kvm_rip_read(vcpu); } @@ -13148,7 +13311,7 @@ int kvm_arch_vcpu_should_kick(struct kvm_vcpu *vcpu) int kvm_arch_interrupt_allowed(struct kvm_vcpu *vcpu) { - return static_call(kvm_x86_interrupt_allowed)(vcpu, false); + return kvm_x86_call(interrupt_allowed)(vcpu, false); } unsigned long kvm_get_linear_rip(struct kvm_vcpu *vcpu) @@ -13174,7 +13337,7 @@ unsigned long kvm_get_rflags(struct kvm_vcpu *vcpu) { unsigned long rflags; - rflags = static_call(kvm_x86_get_rflags)(vcpu); + rflags = kvm_x86_call(get_rflags)(vcpu); if (vcpu->guest_debug & KVM_GUESTDBG_SINGLESTEP) rflags &= ~X86_EFLAGS_TF; return rflags; @@ -13186,7 +13349,7 @@ static void __kvm_set_rflags(struct kvm_vcpu *vcpu, unsigned long rflags) if (vcpu->guest_debug & KVM_GUESTDBG_SINGLESTEP && kvm_is_linear_rip(vcpu, vcpu->arch.singlestep_rip)) rflags |= X86_EFLAGS_TF; - static_call(kvm_x86_set_rflags)(vcpu, rflags); + kvm_x86_call(set_rflags)(vcpu, rflags); } void kvm_set_rflags(struct kvm_vcpu *vcpu, unsigned long rflags) @@ -13297,8 +13460,8 @@ static bool kvm_can_deliver_async_pf(struct kvm_vcpu *vcpu) if (!kvm_pv_async_pf_enabled(vcpu)) return false; - if (vcpu->arch.apf.send_user_only && - static_call(kvm_x86_get_cpl)(vcpu) == 0) + if (!vcpu->arch.apf.send_always && + (vcpu->arch.guest_state_protected || !kvm_x86_call(get_cpl)(vcpu))) return false; if (is_guest_mode(vcpu)) { @@ -13388,7 +13551,7 @@ void kvm_arch_async_page_present(struct kvm_vcpu *vcpu, } vcpu->arch.apf.halted = false; - vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE; + kvm_set_mp_state(vcpu, KVM_MP_STATE_RUNNABLE); } void kvm_arch_async_page_present_queued(struct kvm_vcpu *vcpu) @@ -13409,7 +13572,7 @@ bool kvm_arch_can_dequeue_async_page_present(struct kvm_vcpu *vcpu) void kvm_arch_start_assignment(struct kvm *kvm) { if (atomic_inc_return(&kvm->arch.assigned_device_count) == 1) - static_call_cond(kvm_x86_pi_start_assignment)(kvm); + kvm_x86_call(pi_start_assignment)(kvm); } EXPORT_SYMBOL_GPL(kvm_arch_start_assignment); @@ -13428,13 +13591,15 @@ EXPORT_SYMBOL_GPL(kvm_arch_has_assigned_device); static void kvm_noncoherent_dma_assignment_start_or_stop(struct kvm *kvm) { /* - * Non-coherent DMA assignment and de-assignment will affect - * whether KVM honors guest MTRRs and cause changes in memtypes - * in TDP. - * So, pass %true unconditionally to indicate non-coherent DMA was, - * or will be involved, and that zapping SPTEs might be necessary. + * Non-coherent DMA assignment and de-assignment may affect whether or + * not KVM honors guest PAT, and thus may cause changes in EPT SPTEs + * due to toggling the "ignore PAT" bit. Zap all SPTEs when the first + * (or last) non-coherent device is (un)registered to so that new SPTEs + * with the correct "ignore guest PAT" setting are created. + * + * If KVM always honors guest PAT, however, there is nothing to do. */ - if (__kvm_mmu_honors_guest_mtrrs(true)) + if (kvm_check_has_quirk(kvm, KVM_X86_QUIRK_IGNORE_GUEST_PAT)) kvm_zap_gfn_range(kvm, gpa_to_gfn(0), gpa_to_gfn(~0ULL)); } @@ -13458,26 +13623,27 @@ bool kvm_arch_has_noncoherent_dma(struct kvm *kvm) } EXPORT_SYMBOL_GPL(kvm_arch_has_noncoherent_dma); -bool kvm_arch_has_irq_bypass(void) -{ - return enable_apicv && irq_remapping_cap(IRQ_POSTING_CAP); -} - int kvm_arch_irq_bypass_add_producer(struct irq_bypass_consumer *cons, struct irq_bypass_producer *prod) { struct kvm_kernel_irqfd *irqfd = container_of(cons, struct kvm_kernel_irqfd, consumer); + struct kvm *kvm = irqfd->kvm; int ret; - irqfd->producer = prod; kvm_arch_start_assignment(irqfd->kvm); - ret = static_call(kvm_x86_pi_update_irte)(irqfd->kvm, - prod->irq, irqfd->gsi, 1); + spin_lock_irq(&kvm->irqfds.lock); + irqfd->producer = prod; + + ret = kvm_x86_call(pi_update_irte)(irqfd->kvm, + prod->irq, irqfd->gsi, 1); if (ret) kvm_arch_end_assignment(irqfd->kvm); + spin_unlock_irq(&kvm->irqfds.lock); + + return ret; } @@ -13487,9 +13653,9 @@ void kvm_arch_irq_bypass_del_producer(struct irq_bypass_consumer *cons, int ret; struct kvm_kernel_irqfd *irqfd = container_of(cons, struct kvm_kernel_irqfd, consumer); + struct kvm *kvm = irqfd->kvm; WARN_ON(irqfd->producer != prod); - irqfd->producer = NULL; /* * When producer of consumer is unregistered, we change back to @@ -13497,24 +13663,32 @@ void kvm_arch_irq_bypass_del_producer(struct irq_bypass_consumer *cons, * when the irq is masked/disabled or the consumer side (KVM * int this case doesn't want to receive the interrupts. */ - ret = static_call(kvm_x86_pi_update_irte)(irqfd->kvm, prod->irq, irqfd->gsi, 0); + spin_lock_irq(&kvm->irqfds.lock); + irqfd->producer = NULL; + + ret = kvm_x86_call(pi_update_irte)(irqfd->kvm, + prod->irq, irqfd->gsi, 0); if (ret) printk(KERN_INFO "irq bypass consumer (token %p) unregistration" " fails: %d\n", irqfd->consumer.token, ret); + spin_unlock_irq(&kvm->irqfds.lock); + + kvm_arch_end_assignment(irqfd->kvm); } int kvm_arch_update_irqfd_routing(struct kvm *kvm, unsigned int host_irq, uint32_t guest_irq, bool set) { - return static_call(kvm_x86_pi_update_irte)(kvm, host_irq, guest_irq, set); + return kvm_x86_call(pi_update_irte)(kvm, host_irq, guest_irq, set); } bool kvm_arch_irqfd_route_changed(struct kvm_kernel_irq_routing_entry *old, struct kvm_kernel_irq_routing_entry *new) { - if (new->type != KVM_IRQ_ROUTING_MSI) + if (old->type != KVM_IRQ_ROUTING_MSI || + new->type != KVM_IRQ_ROUTING_MSI) return true; return !!memcmp(&old->msi, &new->msi, sizeof(new->msi)); @@ -13531,6 +13705,19 @@ bool kvm_arch_no_poll(struct kvm_vcpu *vcpu) } EXPORT_SYMBOL_GPL(kvm_arch_no_poll); +#ifdef CONFIG_HAVE_KVM_ARCH_GMEM_PREPARE +int kvm_arch_gmem_prepare(struct kvm *kvm, gfn_t gfn, kvm_pfn_t pfn, int max_order) +{ + return kvm_x86_call(gmem_prepare)(kvm, pfn, gfn, max_order); +} +#endif + +#ifdef CONFIG_HAVE_KVM_ARCH_GMEM_INVALIDATE +void kvm_arch_gmem_invalidate(kvm_pfn_t start, kvm_pfn_t end) +{ + kvm_x86_call(gmem_invalidate)(start, end); +} +#endif int kvm_spec_ctrl_test_value(u64 value) { @@ -13545,12 +13732,12 @@ int kvm_spec_ctrl_test_value(u64 value) local_irq_save(flags); - if (rdmsrl_safe(MSR_IA32_SPEC_CTRL, &saved_value)) + if (rdmsrq_safe(MSR_IA32_SPEC_CTRL, &saved_value)) ret = 1; - else if (wrmsrl_safe(MSR_IA32_SPEC_CTRL, value)) + else if (wrmsrq_safe(MSR_IA32_SPEC_CTRL, value)) ret = 1; else - wrmsrl(MSR_IA32_SPEC_CTRL, saved_value); + wrmsrq(MSR_IA32_SPEC_CTRL, saved_value); local_irq_restore(flags); @@ -13640,7 +13827,7 @@ int kvm_handle_invpcid(struct kvm_vcpu *vcpu, unsigned long type, gva_t gva) * invalidation. */ if ((!pcid_enabled && (operand.pcid != 0)) || - is_noncanonical_address(operand.gla, vcpu)) { + is_noncanonical_invlpg_address(operand.gla, vcpu)) { kvm_inject_gp(vcpu, 0); return 1; } @@ -13889,6 +14076,7 @@ EXPORT_SYMBOL_GPL(kvm_sev_es_string_io); EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_entry); EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_exit); +EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_mmio); EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_fast_mmio); EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_inj_virq); EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_page_fault); @@ -13916,9 +14104,12 @@ EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_vmgexit_enter); EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_vmgexit_exit); EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_vmgexit_msr_protocol_enter); EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_vmgexit_msr_protocol_exit); +EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_rmp_fault); static int __init kvm_x86_init(void) { + kvm_init_xstate_sizes(); + kvm_mmu_x86_module_init(); mitigate_smt_rsb &= boot_cpu_has_bug(X86_BUG_SMT_RSB) && cpu_smt_possible(); return 0; diff --git a/arch/x86/kvm/x86.h b/arch/x86/kvm/x86.h index a8b71803777b..832f0faf4779 100644 --- a/arch/x86/kvm/x86.h +++ b/arch/x86/kvm/x86.h @@ -8,6 +8,9 @@ #include <asm/pvclock.h> #include "kvm_cache_regs.h" #include "kvm_emulate.h" +#include "cpuid.h" + +#define KVM_MAX_MCE_BANKS 32 struct kvm_caps { /* control of guest tsc rate supported? */ @@ -24,11 +27,30 @@ struct kvm_caps { bool has_bus_lock_exit; /* notify VM exit supported? */ bool has_notify_vmexit; + /* bit mask of VM types */ + u32 supported_vm_types; u64 supported_mce_cap; u64 supported_xcr0; u64 supported_xss; u64 supported_perf_cap; + + u64 supported_quirks; + u64 inapplicable_quirks; +}; + +struct kvm_host_values { + /* + * The host's raw MAXPHYADDR, i.e. the number of non-reserved physical + * address bits irrespective of features that repurpose legal bits, + * e.g. MKTME. + */ + u8 maxphyaddr; + + u64 efer; + u64 xcr0; + u64 xss; + u64 arch_capabilities; }; void kvm_spurious_fault(void); @@ -87,16 +109,48 @@ static inline unsigned int __shrink_ple_window(unsigned int val, return max(val, min); } -#define MSR_IA32_CR_PAT_DEFAULT 0x0007040600070406ULL +#define MSR_IA32_CR_PAT_DEFAULT \ + PAT_VALUE(WB, WT, UC_MINUS, UC, WB, WT, UC_MINUS, UC) void kvm_service_local_tlb_flush_requests(struct kvm_vcpu *vcpu); int kvm_check_nested_events(struct kvm_vcpu *vcpu); +/* Forcibly leave the nested mode in cases like a vCPU reset */ +static inline void kvm_leave_nested(struct kvm_vcpu *vcpu) +{ + kvm_x86_ops.nested_ops->leave_nested(vcpu); +} + +/* + * If IBRS is advertised to the vCPU, KVM must flush the indirect branch + * predictors when transitioning from L2 to L1, as L1 expects hardware (KVM in + * this case) to provide separate predictor modes. Bare metal isolates the host + * from the guest, but doesn't isolate different guests from one another (in + * this case L1 and L2). The exception is if bare metal supports same mode IBRS, + * which offers protection within the same mode, and hence protects L1 from L2. + */ +static inline void kvm_nested_vmexit_handle_ibrs(struct kvm_vcpu *vcpu) +{ + if (cpu_feature_enabled(X86_FEATURE_AMD_IBRS_SAME_MODE)) + return; + + if (guest_cpu_cap_has(vcpu, X86_FEATURE_SPEC_CTRL) || + guest_cpu_cap_has(vcpu, X86_FEATURE_AMD_IBRS)) + indirect_branch_prediction_barrier(); +} + static inline bool kvm_vcpu_has_run(struct kvm_vcpu *vcpu) { return vcpu->arch.last_vmentry_cpu != -1; } +static inline void kvm_set_mp_state(struct kvm_vcpu *vcpu, int mp_state) +{ + vcpu->arch.mp_state = mp_state; + if (mp_state == KVM_MP_STATE_RUNNABLE) + vcpu->arch.pv.pv_unhalted = false; +} + static inline bool kvm_is_exception_pending(struct kvm_vcpu *vcpu) { return vcpu->arch.exception.pending || @@ -157,7 +211,7 @@ static inline bool is_64_bit_mode(struct kvm_vcpu *vcpu) if (!is_long_mode(vcpu)) return false; - static_call(kvm_x86_get_cs_db_l_bits)(vcpu, &cs_db, &cs_l); + kvm_x86_call(get_cs_db_l_bits)(vcpu, &cs_db, &cs_l); return cs_l; } @@ -210,9 +264,52 @@ static inline u8 vcpu_virt_addr_bits(struct kvm_vcpu *vcpu) return kvm_is_cr4_bit_set(vcpu, X86_CR4_LA57) ? 57 : 48; } -static inline bool is_noncanonical_address(u64 la, struct kvm_vcpu *vcpu) +static inline u8 max_host_virt_addr_bits(void) +{ + return kvm_cpu_cap_has(X86_FEATURE_LA57) ? 57 : 48; +} + +/* + * x86 MSRs which contain linear addresses, x86 hidden segment bases, and + * IDT/GDT bases have static canonicality checks, the size of which depends + * only on the CPU's support for 5-level paging, rather than on the state of + * CR4.LA57. This applies to both WRMSR and to other instructions that set + * their values, e.g. SGDT. + * + * KVM passes through most of these MSRS and also doesn't intercept the + * instructions that set the hidden segment bases. + * + * Because of this, to be consistent with hardware, even if the guest doesn't + * have LA57 enabled in its CPUID, perform canonicality checks based on *host* + * support for 5 level paging. + * + * Finally, instructions which are related to MMU invalidation of a given + * linear address, also have a similar static canonical check on address. + * This allows for example to invalidate 5-level addresses of a guest from a + * host which uses 4-level paging. + */ +static inline bool is_noncanonical_address(u64 la, struct kvm_vcpu *vcpu, + unsigned int flags) +{ + if (flags & (X86EMUL_F_INVLPG | X86EMUL_F_MSR | X86EMUL_F_DT_LOAD)) + return !__is_canonical_address(la, max_host_virt_addr_bits()); + else + return !__is_canonical_address(la, vcpu_virt_addr_bits(vcpu)); +} + +static inline bool is_noncanonical_msr_address(u64 la, struct kvm_vcpu *vcpu) +{ + return is_noncanonical_address(la, vcpu, X86EMUL_F_MSR); +} + +static inline bool is_noncanonical_base_address(u64 la, struct kvm_vcpu *vcpu) +{ + return is_noncanonical_address(la, vcpu, X86EMUL_F_DT_LOAD); +} + +static inline bool is_noncanonical_invlpg_address(u64 la, struct kvm_vcpu *vcpu) { - return !__is_canonical_address(la, vcpu_virt_addr_bits(vcpu)); + return is_noncanonical_address(la, vcpu, X86EMUL_F_INVLPG); } static inline void vcpu_cache_mmio_info(struct kvm_vcpu *vcpu, @@ -295,6 +392,7 @@ void kvm_inject_realmode_interrupt(struct kvm_vcpu *vcpu, int irq, int inc_eip); u64 get_kvmclock_ns(struct kvm *kvm); uint64_t kvm_get_wall_clock_epoch(struct kvm *kvm); bool kvm_get_monotonic_and_clockread(s64 *kernel_ns, u64 *tsc_timestamp); +int kvm_guest_time_update(struct kvm_vcpu *v); int kvm_read_guest_virt(struct kvm_vcpu *vcpu, gva_t addr, void *val, unsigned int bytes, @@ -309,12 +407,8 @@ int handle_ud(struct kvm_vcpu *vcpu); void kvm_deliver_exception_payload(struct kvm_vcpu *vcpu, struct kvm_queued_exception *ex); -void kvm_vcpu_mtrr_init(struct kvm_vcpu *vcpu); -u8 kvm_mtrr_get_guest_memory_type(struct kvm_vcpu *vcpu, gfn_t gfn); int kvm_mtrr_set_msr(struct kvm_vcpu *vcpu, u32 msr, u64 data); int kvm_mtrr_get_msr(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata); -bool kvm_mtrr_check_gfn_range_consistency(struct kvm_vcpu *vcpu, gfn_t gfn, - int page_num); bool kvm_vector_hashing_enabled(void); void kvm_fixup_and_inject_pf_error(struct kvm_vcpu *vcpu, gva_t gva, u16 error_code); int x86_decode_emulated_instruction(struct kvm_vcpu *vcpu, int emulation_type, @@ -322,12 +416,10 @@ int x86_decode_emulated_instruction(struct kvm_vcpu *vcpu, int emulation_type, int x86_emulate_instruction(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa, int emulation_type, void *insn, int insn_len); fastpath_t handle_fastpath_set_msr_irqoff(struct kvm_vcpu *vcpu); - -extern u64 host_xcr0; -extern u64 host_xss; -extern u64 host_arch_capabilities; +fastpath_t handle_fastpath_hlt(struct kvm_vcpu *vcpu); extern struct kvm_caps kvm_caps; +extern struct kvm_host_values kvm_host; extern bool enable_pmu; @@ -489,19 +581,36 @@ static inline void kvm_machine_check(void) void kvm_load_guest_xsave_state(struct kvm_vcpu *vcpu); void kvm_load_host_xsave_state(struct kvm_vcpu *vcpu); int kvm_spec_ctrl_test_value(u64 value); -bool __kvm_is_valid_cr4(struct kvm_vcpu *vcpu, unsigned long cr4); int kvm_handle_memory_failure(struct kvm_vcpu *vcpu, int r, struct x86_exception *e); int kvm_handle_invpcid(struct kvm_vcpu *vcpu, unsigned long type, gva_t gva); bool kvm_msr_allowed(struct kvm_vcpu *vcpu, u32 index, u32 type); +enum kvm_msr_access { + MSR_TYPE_R = BIT(0), + MSR_TYPE_W = BIT(1), + MSR_TYPE_RW = MSR_TYPE_R | MSR_TYPE_W, +}; + /* * Internal error codes that are used to indicate that MSR emulation encountered - * an error that should result in #GP in the guest, unless userspace - * handles it. + * an error that should result in #GP in the guest, unless userspace handles it. + * Note, '1', '0', and negative numbers are off limits, as they are used by KVM + * as part of KVM's lightly documented internal KVM_RUN return codes. + * + * UNSUPPORTED - The MSR isn't supported, either because it is completely + * unknown to KVM, or because the MSR should not exist according + * to the vCPU model. + * + * FILTERED - Access to the MSR is denied by a userspace MSR filter. */ -#define KVM_MSR_RET_INVALID 2 /* in-kernel MSR emulation #GP condition */ -#define KVM_MSR_RET_FILTERED 3 /* #GP due to userspace MSR filter */ +#define KVM_MSR_RET_UNSUPPORTED 2 +#define KVM_MSR_RET_FILTERED 3 + +static inline bool __kvm_is_valid_cr4(struct kvm_vcpu *vcpu, unsigned long cr4) +{ + return !(cr4 & vcpu->arch.cr4_guest_rsvd_bits); +} #define __cr4_reserved_bits(__cpu_has, __c) \ ({ \ @@ -538,4 +647,24 @@ int kvm_sev_es_string_io(struct kvm_vcpu *vcpu, unsigned int size, unsigned int port, void *data, unsigned int count, int in); +static inline bool user_exit_on_hypercall(struct kvm *kvm, unsigned long hc_nr) +{ + return kvm->arch.hypercall_exit_enabled & BIT(hc_nr); +} + +int ____kvm_emulate_hypercall(struct kvm_vcpu *vcpu, int cpl, + int (*complete_hypercall)(struct kvm_vcpu *)); + +#define __kvm_emulate_hypercall(_vcpu, cpl, complete_hypercall) \ +({ \ + int __ret; \ + __ret = ____kvm_emulate_hypercall(_vcpu, cpl, complete_hypercall); \ + \ + if (__ret > 0) \ + __ret = complete_hypercall(_vcpu); \ + __ret; \ +}) + +int kvm_emulate_hypercall(struct kvm_vcpu *vcpu); + #endif diff --git a/arch/x86/kvm/xen.c b/arch/x86/kvm/xen.c index f65b35a05d91..9b029bb29a16 100644 --- a/arch/x86/kvm/xen.c +++ b/arch/x86/kvm/xen.c @@ -150,11 +150,46 @@ static enum hrtimer_restart xen_timer_callback(struct hrtimer *timer) return HRTIMER_NORESTART; } +static int xen_get_guest_pvclock(struct kvm_vcpu *vcpu, + struct pvclock_vcpu_time_info *hv_clock, + struct gfn_to_pfn_cache *gpc, + unsigned int offset) +{ + unsigned long flags; + int r; + + read_lock_irqsave(&gpc->lock, flags); + while (!kvm_gpc_check(gpc, offset + sizeof(*hv_clock))) { + read_unlock_irqrestore(&gpc->lock, flags); + + r = kvm_gpc_refresh(gpc, offset + sizeof(*hv_clock)); + if (r) + return r; + + read_lock_irqsave(&gpc->lock, flags); + } + + memcpy(hv_clock, gpc->khva + offset, sizeof(*hv_clock)); + read_unlock_irqrestore(&gpc->lock, flags); + + /* + * Sanity check TSC shift+multiplier to verify the guest's view of time + * is more or less consistent. + */ + if (hv_clock->tsc_shift != vcpu->arch.pvclock_tsc_shift || + hv_clock->tsc_to_system_mul != vcpu->arch.pvclock_tsc_mul) + return -EINVAL; + + return 0; +} + static void kvm_xen_start_timer(struct kvm_vcpu *vcpu, u64 guest_abs, bool linux_wa) { + struct kvm_vcpu_xen *xen = &vcpu->arch.xen; int64_t kernel_now, delta; uint64_t guest_now; + int r = -EOPNOTSUPP; /* * The guest provides the requested timeout in absolute nanoseconds @@ -173,10 +208,29 @@ static void kvm_xen_start_timer(struct kvm_vcpu *vcpu, u64 guest_abs, * the absolute CLOCK_MONOTONIC time at which the timer should * fire. */ - if (vcpu->arch.hv_clock.version && vcpu->kvm->arch.use_master_clock && - static_cpu_has(X86_FEATURE_CONSTANT_TSC)) { + do { + struct pvclock_vcpu_time_info hv_clock; uint64_t host_tsc, guest_tsc; + if (!static_cpu_has(X86_FEATURE_CONSTANT_TSC) || + !vcpu->kvm->arch.use_master_clock) + break; + + /* + * If both Xen PV clocks are active, arbitrarily try to use the + * compat clock first, but also try to use the non-compat clock + * if the compat clock is unusable. The two PV clocks hold the + * same information, but it's possible one (or both) is stale + * and/or currently unreachable. + */ + if (xen->vcpu_info_cache.active) + r = xen_get_guest_pvclock(vcpu, &hv_clock, &xen->vcpu_info_cache, + offsetof(struct compat_vcpu_info, time)); + if (r && xen->vcpu_time_info_cache.active) + r = xen_get_guest_pvclock(vcpu, &hv_clock, &xen->vcpu_time_info_cache, 0); + if (r) + break; + if (!IS_ENABLED(CONFIG_64BIT) || !kvm_get_monotonic_and_clockread(&kernel_now, &host_tsc)) { /* @@ -197,9 +251,10 @@ static void kvm_xen_start_timer(struct kvm_vcpu *vcpu, u64 guest_abs, /* Calculate the guest kvmclock as the guest would do it. */ guest_tsc = kvm_read_l1_tsc(vcpu, host_tsc); - guest_now = __pvclock_read_cycles(&vcpu->arch.hv_clock, - guest_tsc); - } else { + guest_now = __pvclock_read_cycles(&hv_clock, guest_tsc); + } while (0); + + if (r) { /* * Without CONSTANT_TSC, get_kvmclock_ns() is the only option. * @@ -263,13 +318,6 @@ static void kvm_xen_stop_timer(struct kvm_vcpu *vcpu) atomic_set(&vcpu->arch.xen.timer_pending, 0); } -static void kvm_xen_init_timer(struct kvm_vcpu *vcpu) -{ - hrtimer_init(&vcpu->arch.xen.timer, CLOCK_MONOTONIC, - HRTIMER_MODE_ABS_HARD); - vcpu->arch.xen.timer.function = xen_timer_callback; -} - static void kvm_xen_update_runstate_guest(struct kvm_vcpu *v, bool atomic) { struct kvm_vcpu_xen *vx = &v->arch.xen; @@ -741,7 +789,7 @@ int kvm_xen_hvm_set_attr(struct kvm *kvm, struct kvm_xen_hvm_attr *data) } else { void __user * hva = u64_to_user_ptr(data->u.shared_info.hva); - if (!PAGE_ALIGNED(hva) || !access_ok(hva, PAGE_SIZE)) { + if (!PAGE_ALIGNED(hva)) { r = -EINVAL; } else if (!hva) { kvm_gpc_deactivate(&kvm->arch.xen.shinfo_cache); @@ -1070,9 +1118,6 @@ int kvm_xen_vcpu_set_attr(struct kvm_vcpu *vcpu, struct kvm_xen_vcpu_attr *data) break; } - if (!vcpu->arch.xen.timer.function) - kvm_xen_init_timer(vcpu); - /* Stop the timer (if it's running) before changing the vector */ kvm_xen_stop_timer(vcpu); vcpu->arch.xen.timer_virq = data->u.timer.port; @@ -1270,7 +1315,7 @@ int kvm_xen_write_hypercall_page(struct kvm_vcpu *vcpu, u64 data) instructions[0] = 0xb8; /* vmcall / vmmcall */ - static_call(kvm_x86_patch_hypercall)(vcpu, instructions + 5); + kvm_x86_call(patch_hypercall)(vcpu, instructions + 5); /* ret */ instructions[8] = 0xc3; @@ -1290,10 +1335,10 @@ int kvm_xen_write_hypercall_page(struct kvm_vcpu *vcpu, u64 data) * Note, truncation is a non-issue as 'lm' is guaranteed to be * false for a 32-bit kernel, i.e. when hva_t is only 4 bytes. */ - hva_t blob_addr = lm ? kvm->arch.xen_hvm_config.blob_addr_64 - : kvm->arch.xen_hvm_config.blob_addr_32; - u8 blob_size = lm ? kvm->arch.xen_hvm_config.blob_size_64 - : kvm->arch.xen_hvm_config.blob_size_32; + hva_t blob_addr = lm ? kvm->arch.xen.hvm_config.blob_addr_64 + : kvm->arch.xen.hvm_config.blob_addr_32; + u8 blob_size = lm ? kvm->arch.xen.hvm_config.blob_size_64 + : kvm->arch.xen.hvm_config.blob_size_32; u8 *page; int ret; @@ -1334,15 +1379,24 @@ int kvm_xen_hvm_config(struct kvm *kvm, struct kvm_xen_hvm_config *xhc) xhc->blob_size_32 || xhc->blob_size_64)) return -EINVAL; + /* + * Restrict the MSR to the range that is unofficially reserved for + * synthetic, virtualization-defined MSRs, e.g. to prevent confusing + * KVM by colliding with a real MSR that requires special handling. + */ + if (xhc->msr && + (xhc->msr < KVM_XEN_MSR_MIN_INDEX || xhc->msr > KVM_XEN_MSR_MAX_INDEX)) + return -EINVAL; + mutex_lock(&kvm->arch.xen.xen_lock); - if (xhc->msr && !kvm->arch.xen_hvm_config.msr) + if (xhc->msr && !kvm->arch.xen.hvm_config.msr) static_branch_inc(&kvm_xen_enabled.key); - else if (!xhc->msr && kvm->arch.xen_hvm_config.msr) + else if (!xhc->msr && kvm->arch.xen.hvm_config.msr) static_branch_slow_dec_deferred(&kvm_xen_enabled); - old_flags = kvm->arch.xen_hvm_config.flags; - memcpy(&kvm->arch.xen_hvm_config, xhc, sizeof(*xhc)); + old_flags = kvm->arch.xen.hvm_config.flags; + memcpy(&kvm->arch.xen.hvm_config, xhc, sizeof(*xhc)); mutex_unlock(&kvm->arch.xen.xen_lock); @@ -1423,7 +1477,7 @@ static bool kvm_xen_schedop_poll(struct kvm_vcpu *vcpu, bool longmode, int i; if (!lapic_in_kernel(vcpu) || - !(vcpu->kvm->arch.xen_hvm_config.flags & KVM_XEN_HVM_CONFIG_EVTCHN_SEND)) + !(vcpu->kvm->arch.xen.hvm_config.flags & KVM_XEN_HVM_CONFIG_EVTCHN_SEND)) return false; if (IS_ENABLED(CONFIG_64BIT) && !longmode) { @@ -1490,7 +1544,7 @@ static bool kvm_xen_schedop_poll(struct kvm_vcpu *vcpu, bool longmode, set_bit(vcpu->vcpu_idx, vcpu->kvm->arch.xen.poll_mask); if (!wait_pending_event(vcpu, sched_poll.nr_ports, ports)) { - vcpu->arch.mp_state = KVM_MP_STATE_HALTED; + kvm_set_mp_state(vcpu, KVM_MP_STATE_HALTED); if (sched_poll.timeout) mod_timer(&vcpu->arch.xen.poll_timer, @@ -1499,9 +1553,9 @@ static bool kvm_xen_schedop_poll(struct kvm_vcpu *vcpu, bool longmode, kvm_vcpu_halt(vcpu); if (sched_poll.timeout) - del_timer(&vcpu->arch.xen.poll_timer); + timer_delete(&vcpu->arch.xen.poll_timer); - vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE; + kvm_set_mp_state(vcpu, KVM_MP_STATE_RUNNABLE); } vcpu->arch.xen.poll_evtchn = 0; @@ -1517,7 +1571,8 @@ out: static void cancel_evtchn_poll(struct timer_list *t) { - struct kvm_vcpu *vcpu = from_timer(vcpu, t, arch.xen.poll_timer); + struct kvm_vcpu *vcpu = timer_container_of(vcpu, t, + arch.xen.poll_timer); kvm_make_request(KVM_REQ_UNBLOCK, vcpu); kvm_vcpu_kick(vcpu); @@ -1650,7 +1705,7 @@ int kvm_xen_hypercall(struct kvm_vcpu *vcpu) params[5] = (u64)kvm_r9_read(vcpu); } #endif - cpl = static_call(kvm_x86_get_cpl)(vcpu); + cpl = kvm_x86_call(get_cpl)(vcpu); trace_kvm_xen_hypercall(cpl, input, params[0], params[1], params[2], params[3], params[4], params[5]); @@ -2235,6 +2290,8 @@ void kvm_xen_init_vcpu(struct kvm_vcpu *vcpu) vcpu->arch.xen.poll_evtchn = 0; timer_setup(&vcpu->arch.xen.poll_timer, cancel_evtchn_poll, 0); + hrtimer_setup(&vcpu->arch.xen.timer, xen_timer_callback, CLOCK_MONOTONIC, + HRTIMER_MODE_ABS_HARD); kvm_gpc_init(&vcpu->arch.xen.runstate_cache, vcpu->kvm); kvm_gpc_init(&vcpu->arch.xen.runstate2_cache, vcpu->kvm); @@ -2252,30 +2309,7 @@ void kvm_xen_destroy_vcpu(struct kvm_vcpu *vcpu) kvm_gpc_deactivate(&vcpu->arch.xen.vcpu_info_cache); kvm_gpc_deactivate(&vcpu->arch.xen.vcpu_time_info_cache); - del_timer_sync(&vcpu->arch.xen.poll_timer); -} - -void kvm_xen_update_tsc_info(struct kvm_vcpu *vcpu) -{ - struct kvm_cpuid_entry2 *entry; - u32 function; - - if (!vcpu->arch.xen.cpuid.base) - return; - - function = vcpu->arch.xen.cpuid.base | XEN_CPUID_LEAF(3); - if (function > vcpu->arch.xen.cpuid.limit) - return; - - entry = kvm_find_cpuid_entry_index(vcpu, function, 1); - if (entry) { - entry->ecx = vcpu->arch.hv_clock.tsc_to_system_mul; - entry->edx = vcpu->arch.hv_clock.tsc_shift; - } - - entry = kvm_find_cpuid_entry_index(vcpu, function, 2); - if (entry) - entry->eax = vcpu->arch.hw_tsc_khz; + timer_delete_sync(&vcpu->arch.xen.poll_timer); } void kvm_xen_init_vm(struct kvm *kvm) @@ -2299,6 +2333,6 @@ void kvm_xen_destroy_vm(struct kvm *kvm) } idr_destroy(&kvm->arch.xen.evtchn_ports); - if (kvm->arch.xen_hvm_config.msr) + if (kvm->arch.xen.hvm_config.msr) static_branch_slow_dec_deferred(&kvm_xen_enabled); } diff --git a/arch/x86/kvm/xen.h b/arch/x86/kvm/xen.h index f5841d9000ae..59e6128a7bd3 100644 --- a/arch/x86/kvm/xen.h +++ b/arch/x86/kvm/xen.h @@ -9,6 +9,7 @@ #ifndef __ARCH_X86_KVM_XEN_H__ #define __ARCH_X86_KVM_XEN_H__ +#include <asm/xen/cpuid.h> #include <asm/xen/hypervisor.h> #ifdef CONFIG_KVM_XEN @@ -35,7 +36,6 @@ int kvm_xen_set_evtchn_fast(struct kvm_xen_evtchn *xe, int kvm_xen_setup_evtchn(struct kvm *kvm, struct kvm_kernel_irq_routing_entry *e, const struct kvm_irq_routing_entry *ue); -void kvm_xen_update_tsc_info(struct kvm_vcpu *vcpu); static inline void kvm_xen_sw_enable_lapic(struct kvm_vcpu *vcpu) { @@ -50,16 +50,32 @@ static inline void kvm_xen_sw_enable_lapic(struct kvm_vcpu *vcpu) kvm_xen_inject_vcpu_vector(vcpu); } +static inline bool kvm_xen_is_tsc_leaf(struct kvm_vcpu *vcpu, u32 function) +{ + return static_branch_unlikely(&kvm_xen_enabled.key) && + vcpu->arch.xen.cpuid.base && + function <= vcpu->arch.xen.cpuid.limit && + function == (vcpu->arch.xen.cpuid.base | XEN_CPUID_LEAF(3)); +} + static inline bool kvm_xen_msr_enabled(struct kvm *kvm) { return static_branch_unlikely(&kvm_xen_enabled.key) && - kvm->arch.xen_hvm_config.msr; + kvm->arch.xen.hvm_config.msr; +} + +static inline bool kvm_xen_is_hypercall_page_msr(struct kvm *kvm, u32 msr) +{ + if (!static_branch_unlikely(&kvm_xen_enabled.key)) + return false; + + return msr && msr == kvm->arch.xen.hvm_config.msr; } static inline bool kvm_xen_hypercall_enabled(struct kvm *kvm) { return static_branch_unlikely(&kvm_xen_enabled.key) && - (kvm->arch.xen_hvm_config.flags & + (kvm->arch.xen.hvm_config.flags & KVM_XEN_HVM_CONFIG_INTERCEPT_HCALL); } @@ -124,6 +140,11 @@ static inline bool kvm_xen_msr_enabled(struct kvm *kvm) return false; } +static inline bool kvm_xen_is_hypercall_page_msr(struct kvm *kvm, u32 msr) +{ + return false; +} + static inline bool kvm_xen_hypercall_enabled(struct kvm *kvm) { return false; @@ -157,8 +178,9 @@ static inline bool kvm_xen_timer_enabled(struct kvm_vcpu *vcpu) return false; } -static inline void kvm_xen_update_tsc_info(struct kvm_vcpu *vcpu) +static inline bool kvm_xen_is_tsc_leaf(struct kvm_vcpu *vcpu, u32 function) { + return false; } #endif |