Diffstat (limited to 'arch/x86/kvm/cpuid.c')
-rw-r--r--  arch/x86/kvm/cpuid.c | 470
1 file changed, 349 insertions(+), 121 deletions(-)
diff --git a/arch/x86/kvm/cpuid.c b/arch/x86/kvm/cpuid.c
index 07e9215e911d..62bc7a01cecc 100644
--- a/arch/x86/kvm/cpuid.c
+++ b/arch/x86/kvm/cpuid.c
@@ -19,6 +19,7 @@
 #include <asm/user.h>
 #include <asm/fpu/xstate.h>
 #include <asm/sgx.h>
+#include <asm/cpuid.h>
 #include "cpuid.h"
 #include "lapic.h"
 #include "mmu.h"
@@ -32,7 +33,7 @@
 u32 kvm_cpu_caps[NR_KVM_CPU_CAPS] __read_mostly;
 EXPORT_SYMBOL_GPL(kvm_cpu_caps);
 
-static u32 xstate_required_size(u64 xstate_bv, bool compacted)
+u32 xstate_required_size(u64 xstate_bv, bool compacted)
 {
         int feature_bit = 0;
         u32 ret = XSAVE_HDR_SIZE + XSAVE_HDR_OFFSET;
@@ -42,7 +43,11 @@ static u32 xstate_required_size(u64 xstate_bv, bool compacted)
                 if (xstate_bv & 0x1) {
                         u32 eax, ebx, ecx, edx, offset;
                         cpuid_count(0xD, feature_bit, &eax, &ebx, &ecx, &edx);
-                        offset = compacted ? ret : ebx;
+                        /* ECX[1]: 64B alignment in compacted form */
+                        if (compacted)
+                                offset = (ecx & 0x2) ? ALIGN(ret, 64) : ret;
+                        else
+                                offset = ebx;
                         ret = max(ret, offset + eax);
                 }
 
@@ -62,9 +67,17 @@ static u32 xstate_required_size(u64 xstate_bv, bool compacted)
 #define F feature_bit
 #define SF(name) (boot_cpu_has(X86_FEATURE_##name) ? F(name) : 0)
 
+/*
+ * Magic value used by KVM when querying userspace-provided CPUID entries and
+ * doesn't care about the CPUID index because the index of the function in
+ * question is not significant.  Note, this magic value must have at least one
+ * bit set in bits[63:32] and must be consumed as a u64 by cpuid_entry2_find()
+ * to avoid false positives when processing guest CPUID input.
+ */
+#define KVM_CPUID_INDEX_NOT_SIGNIFICANT -1ull
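The compacted-size change in xstate_required_size() above is easiest to see with numbers. A minimal user-space sketch with illustrative values (ALIGN_UP stands in for the kernel's ALIGN() macro): when CPUID.0xD.n reports ECX[1]=1, the running offset is rounded up to a 64-byte boundary before the component's size is added, so a running size of 0x348 places the next component at 0x380 rather than 0x348.

#include <stdint.h>
#include <stdio.h>

/* User-space stand-in for the kernel's ALIGN() macro. */
#define ALIGN_UP(x, a) (((x) + (a) - 1) & ~((a) - 1))

int main(void)
{
        /* Hypothetical CPUID.0xD.n output: size 0x40 (EAX), ECX[1] set. */
        uint32_t ret = 0x348, eax = 0x40, ecx = 0x2;
        uint32_t offset = (ecx & 0x2) ? ALIGN_UP(ret, 64u) : ret;

        printf("offset = %#x, new size = %#x\n", offset, offset + eax); /* 0x380, 0x3c0 */
        return 0;
}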
 
 static inline struct kvm_cpuid_entry2 *cpuid_entry2_find(
-        struct kvm_cpuid_entry2 *entries, int nent, u32 function, u32 index)
+        struct kvm_cpuid_entry2 *entries, int nent, u32 function, u64 index)
 {
         struct kvm_cpuid_entry2 *e;
         int i;
@@ -72,23 +85,49 @@ static inline struct kvm_cpuid_entry2 *cpuid_entry2_find(
         for (i = 0; i < nent; i++) {
                 e = &entries[i];
 
-                if (e->function == function &&
-                    (!(e->flags & KVM_CPUID_FLAG_SIGNIFCANT_INDEX) || e->index == index))
+                if (e->function != function)
+                        continue;
+
+                /*
+                 * If the index isn't significant, use the first entry with a
+                 * matching function.  It's userspace's responsibility to not
+                 * provide "duplicate" entries in all cases.
+                 */
+                if (!(e->flags & KVM_CPUID_FLAG_SIGNIFCANT_INDEX) || e->index == index)
+                        return e;
+
+                /*
+                 * Similarly, use the first matching entry if KVM is doing a
+                 * lookup (as opposed to emulating CPUID) for a function that's
+                 * architecturally defined as not having a significant index.
+                 */
+                if (index == KVM_CPUID_INDEX_NOT_SIGNIFICANT) {
+                        /*
+                         * Direct lookups from KVM should not diverge from what
+                         * KVM defines internally (the architectural behavior).
+                         */
+                        WARN_ON_ONCE(cpuid_function_is_indexed(function));
                         return e;
+                }
         }
 
         return NULL;
 }
 
-static int kvm_check_cpuid(struct kvm_cpuid_entry2 *entries, int nent)
+static int kvm_check_cpuid(struct kvm_vcpu *vcpu,
+                           struct kvm_cpuid_entry2 *entries,
+                           int nent)
 {
         struct kvm_cpuid_entry2 *best;
+        u64 xfeatures;
 
         /*
          * The existing code assumes virtual address is 48-bit or 57-bit in the
          * canonical address checks; exit if it is ever changed.
          */
-        best = cpuid_entry2_find(entries, nent, 0x80000008, 0);
+        best = cpuid_entry2_find(entries, nent, 0x80000008,
+                                 KVM_CPUID_INDEX_NOT_SIGNIFICANT);
         if (best) {
                 int vaddr_bits = (best->eax & 0xff00) >> 8;
 
@@ -96,6 +135,42 @@ static int kvm_check_cpuid(struct kvm_cpuid_entry2 *entries, int nent)
                         return -EINVAL;
         }
 
+        /*
+         * Exposing dynamic xfeatures to the guest requires additional
+         * enabling in the FPU, e.g. to expand the guest XSAVE state size.
+         */
+        best = cpuid_entry2_find(entries, nent, 0xd, 0);
+        if (!best)
+                return 0;
+
+        xfeatures = best->eax | ((u64)best->edx << 32);
+        xfeatures &= XFEATURE_MASK_USER_DYNAMIC;
+        if (!xfeatures)
+                return 0;
+
+        return fpu_enable_guest_xfd_features(&vcpu->arch.guest_fpu, xfeatures);
+}
+
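kvm_check_cpuid() now refuses to expose dynamically enabled xfeatures unless fpu_enable_guest_xfd_features() can expand the guest FPU state. The VMM-side counterpart, sketched below under the documented AMX flow (ARCH_REQ_XCOMP_GUEST_PERM is the real arch_prctl(2) request code; the program around it is hypothetical), has to run before KVM_SET_CPUID2 advertises such features:

#include <stdio.h>
#include <sys/syscall.h>
#include <unistd.h>

#ifndef ARCH_REQ_XCOMP_GUEST_PERM
#define ARCH_REQ_XCOMP_GUEST_PERM 0x1025
#endif
#define XFEATURE_XTILEDATA 18   /* AMX tile data, a dynamic xfeature */

int main(void)
{
        /* Request guest permission for XTILEDATA before exposing AMX. */
        if (syscall(SYS_arch_prctl, ARCH_REQ_XCOMP_GUEST_PERM,
                    XFEATURE_XTILEDATA))
                perror("ARCH_REQ_XCOMP_GUEST_PERM");
        return 0;
}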
+/* Check whether the supplied CPUID data is equal to what is already set for the vCPU. */
+static int kvm_cpuid_check_equal(struct kvm_vcpu *vcpu, struct kvm_cpuid_entry2 *e2,
+                                 int nent)
+{
+        struct kvm_cpuid_entry2 *orig;
+        int i;
+
+        if (nent != vcpu->arch.cpuid_nent)
+                return -EINVAL;
+
+        for (i = 0; i < nent; i++) {
+                orig = &vcpu->arch.cpuid_entries[i];
+                if (e2[i].function != orig->function ||
+                    e2[i].index != orig->index ||
+                    e2[i].flags != orig->flags ||
+                    e2[i].eax != orig->eax || e2[i].ebx != orig->ebx ||
+                    e2[i].ecx != orig->ecx || e2[i].edx != orig->edx)
+                        return -EINVAL;
+        }
+
         return 0;
 }
 
@@ -107,7 +182,7 @@ static void kvm_update_kvm_cpuid_base(struct kvm_vcpu *vcpu)
         vcpu->arch.kvm_cpuid_base = 0;
 
         for_each_possible_hypervisor_cpuid_base(function) {
-                entry = kvm_find_cpuid_entry(vcpu, function, 0);
+                entry = kvm_find_cpuid_entry(vcpu, function);
 
                 if (entry) {
                         u32 signature[3];
@@ -125,14 +200,22 @@ static void kvm_update_kvm_cpuid_base(struct kvm_vcpu *vcpu)
         }
 }
 
-static struct kvm_cpuid_entry2 *kvm_find_kvm_cpuid_features(struct kvm_vcpu *vcpu)
+static struct kvm_cpuid_entry2 *__kvm_find_kvm_cpuid_features(struct kvm_vcpu *vcpu,
+                                struct kvm_cpuid_entry2 *entries, int nent)
 {
         u32 base = vcpu->arch.kvm_cpuid_base;
 
         if (!base)
                 return NULL;
 
-        return kvm_find_cpuid_entry(vcpu, base | KVM_CPUID_FEATURES, 0);
+        return cpuid_entry2_find(entries, nent, base | KVM_CPUID_FEATURES,
+                                 KVM_CPUID_INDEX_NOT_SIGNIFICANT);
+}
+
+static struct kvm_cpuid_entry2 *kvm_find_kvm_cpuid_features(struct kvm_vcpu *vcpu)
+{
+        return __kvm_find_kvm_cpuid_features(vcpu, vcpu->arch.cpuid_entries,
+                                             vcpu->arch.cpuid_nent);
 }
 
 void kvm_update_pv_runtime(struct kvm_vcpu *vcpu)
@@ -147,11 +230,28 @@ void kvm_update_pv_runtime(struct kvm_vcpu *vcpu)
                 vcpu->arch.pv_cpuid.features = best->eax;
 }
 
-void kvm_update_cpuid_runtime(struct kvm_vcpu *vcpu)
+/*
+ * Calculate guest's supported XCR0 taking into account guest CPUID data and
+ * KVM's supported XCR0 (comprised of host's XCR0 and KVM_SUPPORTED_XCR0).
+ */
+static u64 cpuid_get_supported_xcr0(struct kvm_cpuid_entry2 *entries, int nent)
 {
         struct kvm_cpuid_entry2 *best;
 
-        best = kvm_find_cpuid_entry(vcpu, 1, 0);
+        best = cpuid_entry2_find(entries, nent, 0xd, 0);
+        if (!best)
+                return 0;
+
+        return (best->eax | ((u64)best->edx << 32)) & kvm_caps.supported_xcr0;
+}
+
+static void __kvm_update_cpuid_runtime(struct kvm_vcpu *vcpu, struct kvm_cpuid_entry2 *entries,
+                                       int nent)
+{
+        struct kvm_cpuid_entry2 *best;
+        u64 guest_supported_xcr0 = cpuid_get_supported_xcr0(entries, nent);
+
+        best = cpuid_entry2_find(entries, nent, 1, KVM_CPUID_INDEX_NOT_SIGNIFICANT);
         if (best) {
                 /* Update OSXSAVE bit */
                 if (boot_cpu_has(X86_FEATURE_XSAVE))
@@ -162,41 +262,70 @@ void kvm_update_cpuid_runtime(struct kvm_vcpu *vcpu)
                            vcpu->arch.apic_base & MSR_IA32_APICBASE_ENABLE);
         }
 
-        best = kvm_find_cpuid_entry(vcpu, 7, 0);
+        best = cpuid_entry2_find(entries, nent, 7, 0);
         if (best && boot_cpu_has(X86_FEATURE_PKU) && best->function == 0x7)
                 cpuid_entry_change(best, X86_FEATURE_OSPKE,
                                    kvm_read_cr4_bits(vcpu, X86_CR4_PKE));
 
-        best = kvm_find_cpuid_entry(vcpu, 0xD, 0);
+        best = cpuid_entry2_find(entries, nent, 0xD, 0);
         if (best)
                 best->ebx = xstate_required_size(vcpu->arch.xcr0, false);
 
-        best = kvm_find_cpuid_entry(vcpu, 0xD, 1);
+        best = cpuid_entry2_find(entries, nent, 0xD, 1);
         if (best && (cpuid_entry_has(best, X86_FEATURE_XSAVES) ||
                      cpuid_entry_has(best, X86_FEATURE_XSAVEC)))
                 best->ebx = xstate_required_size(vcpu->arch.xcr0, true);
 
-        best = kvm_find_kvm_cpuid_features(vcpu);
+        best = __kvm_find_kvm_cpuid_features(vcpu, entries, nent);
         if (kvm_hlt_in_guest(vcpu->kvm) && best &&
                 (best->eax & (1 << KVM_FEATURE_PV_UNHALT)))
                 best->eax &= ~(1 << KVM_FEATURE_PV_UNHALT);
 
         if (!kvm_check_has_quirk(vcpu->kvm, KVM_X86_QUIRK_MISC_ENABLE_NO_MWAIT)) {
-                best = kvm_find_cpuid_entry(vcpu, 0x1, 0);
+                best = cpuid_entry2_find(entries, nent, 0x1, KVM_CPUID_INDEX_NOT_SIGNIFICANT);
                 if (best)
                         cpuid_entry_change(best, X86_FEATURE_MWAIT,
                                            vcpu->arch.ia32_misc_enable_msr &
                                            MSR_IA32_MISC_ENABLE_MWAIT);
         }
+
+        /*
+         * Bits 127:0 of the allowed SECS.ATTRIBUTES (CPUID.0x12.0x1) enumerate
+         * the supported XSAVE Feature Request Mask (XFRM), i.e. the enclave's
+         * requested XCR0 value.  The enclave's XFRM must be a subset of XCR0
+         * at the time of EENTER, thus adjust the allowed XFRM by the guest's
+         * supported XCR0.  Similar to XCR0 handling, FP and SSE are forced to
+         * '1' even on CPUs that don't support XSAVE.
+         */
+        best = cpuid_entry2_find(entries, nent, 0x12, 0x1);
+        if (best) {
+                best->ecx &= guest_supported_xcr0 & 0xffffffff;
+                best->edx &= guest_supported_xcr0 >> 32;
+                best->ecx |= XFEATURE_MASK_FPSSE;
+        }
+}
+
+void kvm_update_cpuid_runtime(struct kvm_vcpu *vcpu)
+{
+        __kvm_update_cpuid_runtime(vcpu, vcpu->arch.cpuid_entries, vcpu->arch.cpuid_nent);
 }
 EXPORT_SYMBOL_GPL(kvm_update_cpuid_runtime);
 
+static bool kvm_cpuid_has_hyperv(struct kvm_cpuid_entry2 *entries, int nent)
+{
+        struct kvm_cpuid_entry2 *entry;
+
+        entry = cpuid_entry2_find(entries, nent, HYPERV_CPUID_INTERFACE,
                                   KVM_CPUID_INDEX_NOT_SIGNIFICANT);
+        return entry && entry->eax == HYPERV_CPUID_SIGNATURE_EAX;
+}
+
 static void kvm_vcpu_after_set_cpuid(struct kvm_vcpu *vcpu)
 {
         struct kvm_lapic *apic = vcpu->arch.apic;
         struct kvm_cpuid_entry2 *best;
 
-        best = kvm_find_cpuid_entry(vcpu, 1, 0);
+        best = kvm_find_cpuid_entry(vcpu, 1);
         if (best && apic) {
                 if (cpuid_entry_has(best, X86_FEATURE_TSC_DEADLINE_TIMER))
                         apic->lapic_timer.timer_mode_mask = 3 << 17;
@@ -206,27 +335,16 @@ static void kvm_vcpu_after_set_cpuid(struct kvm_vcpu *vcpu)
                 kvm_apic_set_version(vcpu);
         }
 
-        best = kvm_find_cpuid_entry(vcpu, 0xD, 0);
-        if (!best)
-                vcpu->arch.guest_supported_xcr0 = 0;
-        else
-                vcpu->arch.guest_supported_xcr0 =
-                        (best->eax | ((u64)best->edx << 32)) & supported_xcr0;
+        vcpu->arch.guest_supported_xcr0 =
+                cpuid_get_supported_xcr0(vcpu->arch.cpuid_entries, vcpu->arch.cpuid_nent);
 
         /*
-         * Bits 127:0 of the allowed SECS.ATTRIBUTES (CPUID.0x12.0x1) enumerate
-         * the supported XSAVE Feature Request Mask (XFRM), i.e. the enclave's
-         * requested XCR0 value.  The enclave's XFRM must be a subset of XCR0
-         * at the time of EENTER, thus adjust the allowed XFRM by the guest's
-         * supported XCR0.  Similar to XCR0 handling, FP and SSE are forced to
-         * '1' even on CPUs that don't support XSAVE.
+         * FP+SSE can always be saved/restored via KVM_{G,S}ET_XSAVE, even if
+         * XSAVE/XCR0 are not exposed to the guest, and even if XSAVE isn't
+         * supported by the host.
          */
-        best = kvm_find_cpuid_entry(vcpu, 0x12, 0x1);
-        if (best) {
-                best->ecx &= vcpu->arch.guest_supported_xcr0 & 0xffffffff;
-                best->edx &= vcpu->arch.guest_supported_xcr0 >> 32;
-                best->ecx |= XFEATURE_MASK_FPSSE;
-        }
+        vcpu->arch.guest_fpu.fpstate->user_xfeatures = vcpu->arch.guest_supported_xcr0 |
+                                                       XFEATURE_MASK_FPSSE;
 
         kvm_update_pv_runtime(vcpu);
 
@@ -237,7 +355,8 @@ static void kvm_vcpu_after_set_cpuid(struct kvm_vcpu *vcpu)
         vcpu->arch.cr4_guest_rsvd_bits =
             __cr4_reserved_bits(guest_cpuid_has, vcpu);
 
-        kvm_hv_set_cpuid(vcpu);
+        kvm_hv_set_cpuid(vcpu, kvm_cpuid_has_hyperv(vcpu->arch.cpuid_entries,
+                                                    vcpu->arch.cpuid_nent));
 
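A worked example of the SECS.ATTRIBUTES clamp introduced in __kvm_update_cpuid_runtime() above, with illustrative values: for a guest whose supported XCR0 is 0xE7 (x87, SSE, AVX and the three AVX-512 components), the host-reported XFRM in CPUID.0x12.0x1 is masked down to that set, with FP/SSE forced on.

#include <stdint.h>
#include <stdio.h>

#define XFEATURE_MASK_FPSSE 0x3u

int main(void)
{
        uint64_t guest_xcr0 = 0xE7;     /* illustrative guest XCR0 */
        uint32_t ecx = 0xffffffffu, edx = 0xffffffffu; /* host XFRM, worst case */

        ecx &= (uint32_t)guest_xcr0;
        ecx |= XFEATURE_MASK_FPSSE;
        edx &= (uint32_t)(guest_xcr0 >> 32);
        printf("allowed XFRM = %#x:%#x\n", edx, ecx);   /* 0:0xe7 */
        return 0;
}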
         /* Invoke the vendor callback only after the above state is updated.
          */
         static_call(kvm_x86_vcpu_after_set_cpuid)(vcpu);
@@ -253,10 +372,10 @@ int cpuid_query_maxphyaddr(struct kvm_vcpu *vcpu)
 {
         struct kvm_cpuid_entry2 *best;
 
-        best = kvm_find_cpuid_entry(vcpu, 0x80000000, 0);
+        best = kvm_find_cpuid_entry(vcpu, 0x80000000);
         if (!best || best->eax < 0x80000008)
                 goto not_found;
 
-        best = kvm_find_cpuid_entry(vcpu, 0x80000008, 0);
+        best = kvm_find_cpuid_entry(vcpu, 0x80000008);
         if (best)
                 return best->eax & 0xff;
 not_found:
@@ -276,21 +395,48 @@ u64 kvm_vcpu_reserved_gpa_bits_raw(struct kvm_vcpu *vcpu)
 static int kvm_set_cpuid(struct kvm_vcpu *vcpu, struct kvm_cpuid_entry2 *e2,
                          int nent)
 {
-        int r;
+        int r;
+
+        __kvm_update_cpuid_runtime(vcpu, e2, nent);
+
+        /*
+         * KVM does not correctly handle changing guest CPUID after KVM_RUN, as
+         * MAXPHYADDR, GBPAGES support, AMD reserved bit behavior, etc. aren't
+         * tracked in kvm_mmu_page_role.  As a result, KVM may miss guest page
+         * faults due to reusing SPs/SPTEs.  In practice no sane VMM mucks with
+         * the core vCPU model on the fly.  It would've been better to forbid any
+         * KVM_SET_CPUID{,2} calls after KVM_RUN altogether but unfortunately
+         * some VMMs (e.g. QEMU) reuse vCPU fds for CPU hotplug/unplug and do
+         * KVM_SET_CPUID{,2} again.  To support this legacy behavior, check
+         * whether the supplied CPUID data is equal to what's already set.
+         */
+        if (vcpu->arch.last_vmentry_cpu != -1) {
+                r = kvm_cpuid_check_equal(vcpu, e2, nent);
+                if (r)
+                        return r;
+
+                kvfree(e2);
+                return 0;
+        }
 
-        r = kvm_check_cpuid(e2, nent);
-        if (r)
-                return r;
+        if (kvm_cpuid_has_hyperv(e2, nent)) {
+                r = kvm_hv_vcpu_init(vcpu);
+                if (r)
+                        return r;
+        }
+
+        r = kvm_check_cpuid(vcpu, e2, nent);
+        if (r)
+                return r;
 
-        kvfree(vcpu->arch.cpuid_entries);
-        vcpu->arch.cpuid_entries = e2;
-        vcpu->arch.cpuid_nent = nent;
+        kvfree(vcpu->arch.cpuid_entries);
+        vcpu->arch.cpuid_entries = e2;
+        vcpu->arch.cpuid_nent = nent;
 
-        kvm_update_kvm_cpuid_base(vcpu);
-        kvm_update_cpuid_runtime(vcpu);
-        kvm_vcpu_after_set_cpuid(vcpu);
+        kvm_update_kvm_cpuid_base(vcpu);
+        kvm_vcpu_after_set_cpuid(vcpu);
 
-        return 0;
+        return 0;
 }
 
 /* when an old userspace process fills a new kernel module */
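Seen from userspace, the last_vmentry_cpu check above means that once a vCPU has entered the guest, KVM_SET_CPUID2 only succeeds with a payload identical to the current table; any change comes back as -EINVAL. A hedged sketch (vcpu_fd and cpuid are assumed to exist and be populated; this is not a complete VMM):

#include <errno.h>
#include <linux/kvm.h>
#include <stdio.h>
#include <sys/ioctl.h>

/* Re-issue KVM_SET_CPUID2, e.g. on a QEMU-style hotplug/unplug path. */
static void set_cpuid_again(int vcpu_fd, struct kvm_cpuid2 *cpuid)
{
        if (ioctl(vcpu_fd, KVM_SET_CPUID2, cpuid) < 0 && errno == EINVAL)
                fprintf(stderr, "CPUID data changed after KVM_RUN: rejected\n");
}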
@@ -422,9 +568,11 @@ void kvm_set_cpu_caps(void)
 #ifdef CONFIG_X86_64
         unsigned int f_gbpages = F(GBPAGES);
         unsigned int f_lm = F(LM);
+        unsigned int f_xfd = F(XFD);
 #else
         unsigned int f_gbpages = 0;
         unsigned int f_lm = 0;
+        unsigned int f_xfd = 0;
 #endif
 
         memset(kvm_cpu_caps, 0, sizeof(kvm_cpu_caps));
@@ -463,12 +611,13 @@ void kvm_set_cpu_caps(void)
         );
 
         kvm_cpu_cap_mask(CPUID_7_0_EBX,
-                F(FSGSBASE) | F(SGX) | F(BMI1) | F(HLE) | F(AVX2) | F(SMEP) |
-                F(BMI2) | F(ERMS) | F(INVPCID) | F(RTM) | 0 /*MPX*/ | F(RDSEED) |
-                F(ADX) | F(SMAP) | F(AVX512IFMA) | F(AVX512F) | F(AVX512PF) |
-                F(AVX512ER) | F(AVX512CD) | F(CLFLUSHOPT) | F(CLWB) | F(AVX512DQ) |
-                F(SHA_NI) | F(AVX512BW) | F(AVX512VL) | 0 /*INTEL_PT*/
-        );
+                F(FSGSBASE) | F(SGX) | F(BMI1) | F(HLE) | F(AVX2) |
+                F(FDP_EXCPTN_ONLY) | F(SMEP) | F(BMI2) | F(ERMS) | F(INVPCID) |
+                F(RTM) | F(ZERO_FCS_FDS) | 0 /*MPX*/ | F(AVX512F) |
+                F(AVX512DQ) | F(RDSEED) | F(ADX) | F(SMAP) | F(AVX512IFMA) |
+                F(CLFLUSHOPT) | F(CLWB) | 0 /*INTEL_PT*/ | F(AVX512PF) |
+                F(AVX512ER) | F(AVX512CD) | F(SHA_NI) | F(AVX512BW) |
+                F(AVX512VL));
 
         kvm_cpu_cap_mask(CPUID_7_ECX,
                 F(AVX512VBMI) | F(LA57) | F(PKU) | 0 /*OSPKE*/ | F(RDPID) |
@@ -492,7 +641,8 @@ void kvm_set_cpu_caps(void)
                 F(AVX512_4VNNIW) | F(AVX512_4FMAPS) | F(SPEC_CTRL) |
                 F(SPEC_CTRL_SSBD) | F(ARCH_CAPABILITIES) | F(INTEL_STIBP) |
                 F(MD_CLEAR) | F(AVX512_VP2INTERSECT) | F(FSRM) |
-                F(SERIALIZE) | F(TSXLDTRK) | F(AVX512_FP16)
+                F(SERIALIZE) | F(TSXLDTRK) | F(AVX512_FP16) |
+                F(AMX_TILE) | F(AMX_INT8) | F(AMX_BF16)
         );
 
         /* TSC_ADJUST and ARCH_CAPABILITIES are emulated in software. */
@@ -511,7 +661,7 @@ void kvm_set_cpu_caps(void)
         );
 
         kvm_cpu_cap_mask(CPUID_D_1_EAX,
-                F(XSAVEOPT) | F(XSAVEC) | F(XGETBV1) | F(XSAVES)
+                F(XSAVEOPT) | F(XSAVEC) | F(XGETBV1) | F(XSAVES) | f_xfd
         );
 
         kvm_cpu_cap_init_scattered(CPUID_12_EAX,
@@ -523,7 +673,7 @@ void kvm_set_cpu_caps(void)
                 F(CR8_LEGACY) | F(ABM) | F(SSE4A) | F(MISALIGNSSE) |
                 F(3DNOWPREFETCH) | F(OSVW) | 0 /* IBS */ | F(XOP) |
                 0 /* SKINIT, WDT, LWP */ | F(FMA4) | F(TBM) |
-                F(TOPOEXT) | F(PERFCTR_CORE)
+                F(TOPOEXT) | 0 /* PERFCTR_CORE */
         );
 
         kvm_cpu_cap_mask(CPUID_8000_0001_EDX,
@@ -619,29 +769,37 @@ static struct kvm_cpuid_entry2 *do_host_cpuid(struct kvm_cpuid_array *array,
 
         entry = &array->entries[array->nent++];
 
+        memset(entry, 0, sizeof(*entry));
         entry->function = function;
         entry->index = index;
-        entry->flags = 0;
+        switch (function & 0xC0000000) {
+        case 0x40000000:
+                /* Hypervisor leaves are always synthesized by __do_cpuid_func.  */
+                return entry;
+
+        case 0x80000000:
+                /*
+                 * 0x80000021 is sometimes synthesized by __do_cpuid_func, which
+                 * would result in out-of-bounds calls to do_host_cpuid.
+                 */
+                {
+                        static int max_cpuid_80000000;
+                        if (!READ_ONCE(max_cpuid_80000000))
+                                WRITE_ONCE(max_cpuid_80000000, cpuid_eax(0x80000000));
+                        if (function > READ_ONCE(max_cpuid_80000000))
+                                return entry;
+                }
+                break;
+
+        default:
+                break;
+        }
 
         cpuid_count(entry->function, entry->index,
                     &entry->eax, &entry->ebx, &entry->ecx, &entry->edx);
 
-        switch (function) {
-        case 4:
-        case 7:
-        case 0xb:
-        case 0xd:
-        case 0xf:
-        case 0x10:
-        case 0x12:
-        case 0x14:
-        case 0x17:
-        case 0x18:
-        case 0x1f:
-        case 0x8000001d:
+        if (cpuid_function_is_indexed(function))
                 entry->flags |= KVM_CPUID_FLAG_SIGNIFCANT_INDEX;
-                break;
-        }
 
         return entry;
 }
@@ -760,36 +918,29 @@ static inline int __do_cpuid_func(struct kvm_cpuid_array *array, u32 function)
                         entry->edx = 0;
                 }
                 break;
-        case 9:
-                break;
         case 0xa: { /* Architectural Performance Monitoring */
-                struct x86_pmu_capability cap;
                 union cpuid10_eax eax;
                 union cpuid10_edx edx;
 
-                perf_get_x86_pmu_capability(&cap);
-
-                /*
-                 * Only support guest architectural pmu on a host
-                 * with architectural pmu.
-                 */
-                if (!cap.version)
-                        memset(&cap, 0, sizeof(cap));
+                if (!static_cpu_has(X86_FEATURE_ARCH_PERFMON)) {
+                        entry->eax = entry->ebx = entry->ecx = entry->edx = 0;
+                        break;
+                }
 
-                eax.split.version_id = min(cap.version, 2);
-                eax.split.num_counters = cap.num_counters_gp;
-                eax.split.bit_width = cap.bit_width_gp;
-                eax.split.mask_length = cap.events_mask_len;
+                eax.split.version_id = kvm_pmu_cap.version;
+                eax.split.num_counters = kvm_pmu_cap.num_counters_gp;
+                eax.split.bit_width = kvm_pmu_cap.bit_width_gp;
+                eax.split.mask_length = kvm_pmu_cap.events_mask_len;
+                edx.split.num_counters_fixed = kvm_pmu_cap.num_counters_fixed;
+                edx.split.bit_width_fixed = kvm_pmu_cap.bit_width_fixed;
 
-                edx.split.num_counters_fixed = min(cap.num_counters_fixed, MAX_FIXED_COUNTERS);
-                edx.split.bit_width_fixed = cap.bit_width_fixed;
-                if (cap.version)
+                if (kvm_pmu_cap.version)
                         edx.split.anythread_deprecated = 1;
                 edx.split.reserved1 = 0;
                 edx.split.reserved2 = 0;
 
                 entry->eax = eax.full;
-                entry->ebx = cap.events_mask;
+                entry->ebx = kvm_pmu_cap.events_mask;
                 entry->ecx = 0;
                 entry->edx = edx.full;
                 break;
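The leaf 0xA construction above packs the kvm_pmu_cap fields into the architectural CPUID.0xA:EAX layout (version in bits 7:0, GP counter count in 15:8, counter width in 23:16, event mask length in 31:24). A self-contained sketch with illustrative values:

#include <stdint.h>
#include <stdio.h>

int main(void)
{
        /* Illustrative PMU capabilities: version 2, 8 GP counters of
         * width 48, event mask length 8. */
        uint32_t version_id = 2, num_counters = 8, bit_width = 48, mask_len = 8;
        uint32_t eax = (version_id & 0xff) |
                       ((num_counters & 0xff) << 8) |
                       ((bit_width & 0xff) << 16) |
                       ((mask_len & 0xff) << 24);

        printf("CPUID.0xA:EAX = %#010x\n", eax);        /* 0x08300802 */
        return 0;
}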
@@ -811,12 +962,15 @@ static inline int __do_cpuid_func(struct kvm_cpuid_array *array, u32 function)
                                 goto out;
                 }
                 break;
-        case 0xd:
-                entry->eax &= supported_xcr0;
-                entry->ebx = xstate_required_size(supported_xcr0, false);
+        case 0xd: {
+                u64 permitted_xcr0 = kvm_caps.supported_xcr0 & xstate_get_guest_group_perm();
+                u64 permitted_xss = kvm_caps.supported_xss;
+
+                entry->eax &= permitted_xcr0;
+                entry->ebx = xstate_required_size(permitted_xcr0, false);
                 entry->ecx = entry->ebx;
-                entry->edx &= supported_xcr0 >> 32;
-                if (!supported_xcr0)
+                entry->edx &= permitted_xcr0 >> 32;
+                if (!permitted_xcr0)
                         break;
 
                 entry = do_host_cpuid(array, function, 1);
@@ -825,20 +979,20 @@ static inline int __do_cpuid_func(struct kvm_cpuid_array *array, u32 function)
                 cpuid_entry_override(entry, CPUID_D_1_EAX);
 
                 if (entry->eax & (F(XSAVES)|F(XSAVEC)))
-                        entry->ebx = xstate_required_size(supported_xcr0 | supported_xss,
+                        entry->ebx = xstate_required_size(permitted_xcr0 | permitted_xss,
                                                           true);
                 else {
-                        WARN_ON_ONCE(supported_xss != 0);
+                        WARN_ON_ONCE(permitted_xss != 0);
                         entry->ebx = 0;
                 }
-                entry->ecx &= supported_xss;
-                entry->edx &= supported_xss >> 32;
+                entry->ecx &= permitted_xss;
+                entry->edx &= permitted_xss >> 32;
 
                 for (i = 2; i < 64; ++i) {
                         bool s_state;
-                        if (supported_xcr0 & BIT_ULL(i))
+                        if (permitted_xcr0 & BIT_ULL(i))
                                 s_state = false;
-                        else if (supported_xss & BIT_ULL(i))
+                        else if (permitted_xss & BIT_ULL(i))
                                 s_state = true;
                         else
                                 continue;
@@ -852,16 +1006,20 @@ static inline int __do_cpuid_func(struct kvm_cpuid_array *array, u32 function)
                          * invalid sub-leafs.  Only valid sub-leafs should
                          * reach this point, and they should have a non-zero
                          * save state size.  Furthermore, check whether the
-                         * processor agrees with supported_xcr0/supported_xss
+                         * processor agrees with permitted_xcr0/permitted_xss
                          * on whether this is an XCR0- or IA32_XSS-managed area.
                          */
                         if (WARN_ON_ONCE(!entry->eax || (entry->ecx & 0x1) != s_state)) {
                                 --array->nent;
                                 continue;
                         }
+
+                        if (!kvm_cpu_cap_has(X86_FEATURE_XFD))
+                                entry->ecx &= ~BIT_ULL(2);
                         entry->edx = 0;
                 }
                 break;
+        }
         case 0x12:
                 /* Intel SGX */
                 if (!kvm_cpu_cap_has(X86_FEATURE_SGX)) {
                         entry->eax = entry->ebx = entry->ecx = entry->edx = 0;
                         break;
                 }
@@ -906,6 +1064,24 @@ static inline int __do_cpuid_func(struct kvm_cpuid_array *array, u32 function)
                                 goto out;
                 }
                 break;
+        /* Intel AMX TILE */
+        case 0x1d:
+                if (!kvm_cpu_cap_has(X86_FEATURE_AMX_TILE)) {
+                        entry->eax = entry->ebx = entry->ecx = entry->edx = 0;
+                        break;
+                }
+
+                for (i = 1, max_idx = entry->eax; i <= max_idx; ++i) {
+                        if (!do_host_cpuid(array, function, i))
+                                goto out;
+                }
+                break;
+        case 0x1e: /* TMUL information */
+                if (!kvm_cpu_cap_has(X86_FEATURE_AMX_TILE)) {
+                        entry->eax = entry->ebx = entry->ecx = entry->edx = 0;
+                        break;
+                }
+                break;
         case KVM_CPUID_SIGNATURE: {
                 const u32 *sigptr = (const u32 *)KVM_SIGNATURE;
                 entry->eax = KVM_CPUID_FEATURES;
@@ -937,14 +1113,33 @@ static inline int __do_cpuid_func(struct kvm_cpuid_array *array, u32 function)
                 entry->edx = 0;
                 break;
         case 0x80000000:
-                entry->eax = min(entry->eax, 0x8000001f);
+                entry->eax = min(entry->eax, 0x80000021);
+                /*
+                 * Serializing LFENCE is reported in a multitude of ways, and
+                 * NullSegClearsBase is not reported in CPUID on Zen2; help
+                 * userspace by providing the CPUID leaf ourselves.
+                 *
+                 * However, only do it if the host has CPUID leaf 0x8000001d.
+                 * QEMU thinks that it can query the host blindly for that
+                 * CPUID leaf if KVM reports that it supports 0x8000001d or
+                 * above.  The processor merrily returns values from the
+                 * highest Intel leaf which QEMU tries to use as the guest's
+                 * 0x8000001d.  Even worse, this can result in an infinite
+                 * loop if said highest leaf has no subleaves indexed by ECX.
+                 */
+                if (entry->eax >= 0x8000001d &&
+                    (static_cpu_has(X86_FEATURE_LFENCE_RDTSC)
+                     || !static_cpu_has_bug(X86_BUG_NULL_SEG)))
+                        entry->eax = max(entry->eax, 0x80000021);
                 break;
         case 0x80000001:
+                entry->ebx &= ~GENMASK(27, 16);
                 cpuid_entry_override(entry, CPUID_8000_0001_EDX);
                 cpuid_entry_override(entry, CPUID_8000_0001_ECX);
                 break;
         case 0x80000006:
-                /* L2 cache and TLB: pass through host info. */
+                /* Drop reserved bits, pass host L2 cache and TLB info. */
+                entry->edx &= ~GENMASK(17, 16);
                 break;
         case 0x80000007: /* Advanced power management */
                 /* invariant TSC is CPUID.80000007H:EDX[8] */
@@ -974,6 +1169,7 @@ static inline int __do_cpuid_func(struct kvm_cpuid_array *array, u32 function)
                         g_phys_as = phys_as;
 
                 entry->eax = g_phys_as | (virt_as << 8);
+                entry->ecx &= ~(GENMASK(31, 16) | GENMASK(11, 8));
                 entry->edx = 0;
                 cpuid_entry_override(entry, CPUID_8000_0008_EBX);
                 break;
@@ -993,6 +1189,9 @@ static inline int __do_cpuid_func(struct kvm_cpuid_array *array, u32 function)
                 entry->ecx = entry->edx = 0;
                 break;
         case 0x8000001a:
+                entry->eax &= GENMASK(2, 0);
+                entry->ebx = entry->ecx = entry->edx = 0;
+                break;
         case 0x8000001e:
                 break;
         case 0x8000001F:
                 if (!kvm_cpu_cap_has(X86_FEATURE_SEV)) {
                         entry->eax = entry->ebx = entry->ecx = entry->edx = 0;
                 } else {
                         cpuid_entry_override(entry, CPUID_8000_001F_EAX);
-
+                        /* Clear NumVMPL since KVM does not support VMPL. */
+                        entry->ebx &= ~GENMASK(31, 12);
                         /*
                          * Enumerate '0' for "PA bits reduction", the adjusted
                          * MAXPHYADDR is enumerated directly (see 0x80000008).
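The min()/max() pair in the 0x80000000 case above is subtle: the host's max extended leaf is first clamped to 0x80000021, then raised back to 0x80000021 when KVM can synthesize that leaf itself. A worked example with an illustrative host maximum of 0x8000001e:

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

static uint32_t u32_min(uint32_t a, uint32_t b) { return a < b ? a : b; }
static uint32_t u32_max(uint32_t a, uint32_t b) { return a > b ? a : b; }

int main(void)
{
        uint32_t eax = 0x8000001e;      /* hypothetical host max extended leaf */
        bool lfence_serializing = true; /* X86_FEATURE_LFENCE_RDTSC */
        bool null_seg_bug = true;       /* X86_BUG_NULL_SEG present */

        eax = u32_min(eax, 0x80000021);
        if (eax >= 0x8000001d && (lfence_serializing || !null_seg_bug))
                eax = u32_max(eax, 0x80000021);
        printf("guest max extended leaf = %#x\n", eax); /* 0x80000021 */
        return 0;
}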
@@ -1008,6 +1208,27 @@ static inline int __do_cpuid_func(struct kvm_cpuid_array *array, u32 function)
                         entry->ebx &= ~GENMASK(11, 6);
                 }
                 break;
+        case 0x80000020:
+                entry->eax = entry->ebx = entry->ecx = entry->edx = 0;
+                break;
+        case 0x80000021:
+                entry->ebx = entry->ecx = entry->edx = 0;
+                /*
+                 * Pass down these bits:
+                 *    EAX      0      NNDBP, Processor ignores nested data breakpoints
+                 *    EAX      2      LAS, LFENCE always serializing
+                 *    EAX      6      NSCB, Null selector clear base
+                 *
+                 * Other defined bits are for MSRs that KVM does not expose:
+                 *    EAX      3      SPCL, SMM page configuration lock
+                 *    EAX      13     PCMSR, Prefetch control MSR
+                 */
+                entry->eax &= BIT(0) | BIT(2) | BIT(6);
+                if (static_cpu_has(X86_FEATURE_LFENCE_RDTSC))
+                        entry->eax |= BIT(2);
+                if (!static_cpu_has_bug(X86_BUG_NULL_SEG))
+                        entry->eax |= BIT(6);
+                break;
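A worked example of the 0x80000021 EAX synthesis above, using a hypothetical host value: only NNDBP (bit 0), LAS (bit 2) and NSCB (bit 6) survive the mask, and LAS/NSCB are then force-set from host capabilities rather than trusted from CPUID alone.

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

int main(void)
{
        uint32_t eax = 0x204d;          /* hypothetical host CPUID.0x80000021:EAX */
        bool lfence_serializing = true; /* X86_FEATURE_LFENCE_RDTSC */
        bool null_seg_fixed = false;    /* !X86_BUG_NULL_SEG would set NSCB */

        eax &= (1u << 0) | (1u << 2) | (1u << 6);
        if (lfence_serializing)
                eax |= 1u << 2;
        if (null_seg_fixed)
                eax |= 1u << 6;
        printf("guest CPUID.0x80000021:EAX = %#x\n", eax);      /* 0x45 */
        return 0;
}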
         /*Add support for Centaur's CPUID instruction*/
         case 0xC0000000:
                 /*Just support up to 0xC0000004 now*/
@@ -1117,8 +1338,7 @@ int kvm_dev_ioctl_get_cpuid(struct kvm_cpuid2 *cpuid,
         if (sanity_check_entries(entries, cpuid->nent, type))
                 return -EINVAL;
 
-        array.entries = vzalloc(array_size(sizeof(struct kvm_cpuid_entry2),
-                                           cpuid->nent));
+        array.entries = kvcalloc(cpuid->nent, sizeof(struct kvm_cpuid_entry2), GFP_KERNEL);
         if (!array.entries)
                 return -ENOMEM;
 
@@ -1136,16 +1356,24 @@ int kvm_dev_ioctl_get_cpuid(struct kvm_cpuid2 *cpuid,
                 r = -EFAULT;
 
 out_free:
-        vfree(array.entries);
+        kvfree(array.entries);
         return r;
 }
 
-struct kvm_cpuid_entry2 *kvm_find_cpuid_entry(struct kvm_vcpu *vcpu,
-                                              u32 function, u32 index)
+struct kvm_cpuid_entry2 *kvm_find_cpuid_entry_index(struct kvm_vcpu *vcpu,
+                                                    u32 function, u32 index)
 {
         return cpuid_entry2_find(vcpu->arch.cpuid_entries, vcpu->arch.cpuid_nent,
                                  function, index);
 }
+EXPORT_SYMBOL_GPL(kvm_find_cpuid_entry_index);
+
+struct kvm_cpuid_entry2 *kvm_find_cpuid_entry(struct kvm_vcpu *vcpu,
+                                              u32 function)
+{
+        return cpuid_entry2_find(vcpu->arch.cpuid_entries, vcpu->arch.cpuid_nent,
+                                 function, KVM_CPUID_INDEX_NOT_SIGNIFICANT);
+}
 EXPORT_SYMBOL_GPL(kvm_find_cpuid_entry);
 
 /*
@@ -1182,7 +1410,7 @@ get_out_of_range_cpuid_entry(struct kvm_vcpu *vcpu, u32 *fn_ptr, u32 index)
         struct kvm_cpuid_entry2 *basic, *class;
         u32 function = *fn_ptr;
 
-        basic = kvm_find_cpuid_entry(vcpu, 0, 0);
+        basic = kvm_find_cpuid_entry(vcpu, 0);
         if (!basic)
                 return NULL;
 
@@ -1191,11 +1419,11 @@ get_out_of_range_cpuid_entry(struct kvm_vcpu *vcpu, u32 *fn_ptr, u32 index)
                 return NULL;
 
         if (function >= 0x40000000 && function <= 0x4fffffff)
-                class = kvm_find_cpuid_entry(vcpu, function & 0xffffff00, 0);
+                class = kvm_find_cpuid_entry(vcpu, function & 0xffffff00);
         else if (function >= 0xc0000000)
-                class = kvm_find_cpuid_entry(vcpu, 0xc0000000, 0);
+                class = kvm_find_cpuid_entry(vcpu, 0xc0000000);
         else
-                class = kvm_find_cpuid_entry(vcpu, function & 0x80000000, 0);
+                class = kvm_find_cpuid_entry(vcpu, function & 0x80000000);
 
         if (class && function <= class->eax)
                 return NULL;
@@ -1213,7 +1441,7 @@ get_out_of_range_cpuid_entry(struct kvm_vcpu *vcpu, u32 *fn_ptr, u32 index)
          * the effective CPUID entry is the max basic leaf.  Note, the index of
          * the original requested leaf is observed!
          */
-        return kvm_find_cpuid_entry(vcpu, basic->eax, index);
+        return kvm_find_cpuid_entry_index(vcpu, basic->eax, index);
 }
 
 bool kvm_cpuid(struct kvm_vcpu *vcpu, u32 *eax, u32 *ebx,
@@ -1223,7 +1451,7 @@ bool kvm_cpuid(struct kvm_vcpu *vcpu, u32 *eax, u32 *ebx,
         struct kvm_cpuid_entry2 *entry;
         bool exact, used_max_basic = false;
 
-        entry = kvm_find_cpuid_entry(vcpu, function, index);
+        entry = kvm_find_cpuid_entry_index(vcpu, function, index);
         exact = !!entry;
 
         if (!entry && !exact_only) {
@@ -1252,7 +1480,7 @@ bool kvm_cpuid(struct kvm_vcpu *vcpu, u32 *eax, u32 *ebx,
                  * exists. EDX can be copied from any existing index.
                  */
                 if (function == 0xb || function == 0x1f) {
-                        entry = kvm_find_cpuid_entry(vcpu, function, 1);
+                        entry = kvm_find_cpuid_entry_index(vcpu, function, 1);
                         if (entry) {
                                 *ecx = index & 0xff;
                                 *edx = entry->edx;
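Taken together, the lookup changes split the old two-argument kvm_find_cpuid_entry() into an index-less form plus kvm_find_cpuid_entry_index() for indexed leaves. A minimal user-space mock of the matching rules in cpuid_entry2_find() (simplified names, not kernel code) also shows why the u64 magic index can never collide with a guest's 32-bit ECX:

#include <stdint.h>
#include <stdio.h>

#define INDEX_NOT_SIGNIFICANT (~0ull)   /* mock of KVM_CPUID_INDEX_NOT_SIGNIFICANT */
#define FLAG_SIGNIFICANT_INDEX 0x1u

struct entry { uint32_t function, index, flags, eax; };

static struct entry *find(struct entry *e, int n, uint32_t fn, uint64_t idx)
{
        for (int i = 0; i < n; i++) {
                if (e[i].function != fn)
                        continue;
                /* Index matches, or the entry says the index is irrelevant. */
                if (!(e[i].flags & FLAG_SIGNIFICANT_INDEX) || e[i].index == idx)
                        return &e[i];
                /* Direct lookup: first matching function wins.  A guest ECX
                 * widens to at most 0x00000000ffffffff, never ~0ull. */
                if (idx == INDEX_NOT_SIGNIFICANT)
                        return &e[i];
        }
        return NULL;
}

int main(void)
{
        struct entry table[] = {
                { 0xD, 0, FLAG_SIGNIFICANT_INDEX, 0x7 },
                { 0xD, 1, FLAG_SIGNIFICANT_INDEX, 0xf },
                { 0x80000008, 0, 0, 0x3028 },
        };

        printf("0xD.1 -> eax=%#x\n", find(table, 3, 0xD, 1)->eax);
        printf("0x80000008 -> eax=%#x\n",
               find(table, 3, 0x80000008, INDEX_NOT_SIGNIFICANT)->eax);
        return 0;
}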