diff options
Diffstat (limited to 'arch/x86/kvm/svm/svm.c')
-rw-r--r-- | arch/x86/kvm/svm/svm.c | 174 |
1 files changed, 102 insertions, 72 deletions
diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c index b649f92287a2..8834822c00cd 100644 --- a/arch/x86/kvm/svm/svm.c +++ b/arch/x86/kvm/svm/svm.c @@ -43,6 +43,9 @@ #include "svm.h" #include "svm_ops.h" +#include "kvm_onhyperv.h" +#include "svm_onhyperv.h" + #define __ex(x) __kvm_handle_fault_on_reboot(x) MODULE_AUTHOR("Qumranet"); @@ -185,6 +188,13 @@ module_param(vls, int, 0444); static int vgif = true; module_param(vgif, int, 0444); +/* + * enable / disable AVIC. Because the defaults differ for APICv + * support between VMX and SVM we cannot use module_param_named. + */ +static bool avic; +module_param(avic, bool, 0444); + bool __read_mostly dump_invalid_vmcb; module_param(dump_invalid_vmcb, bool, 0644); @@ -212,7 +222,7 @@ DEFINE_PER_CPU(struct svm_cpu_data *, svm_data); * RDTSCP and RDPID are not used in the kernel, specifically to allow KVM to * defer the restoration of TSC_AUX until the CPU returns to userspace. */ -#define TSC_AUX_URET_SLOT 0 +static int tsc_aux_uret_slot __read_mostly = -1; static const u32 msrpm_ranges[] = {0, 0xc0000000, 0xc0010000}; @@ -447,6 +457,11 @@ static int has_svm(void) return 0; } + if (pgtable_l5_enabled()) { + pr_info("KVM doesn't yet support 5-level paging on AMD SVM\n"); + return 0; + } + return 1; } @@ -668,6 +683,9 @@ static void set_msr_interception_bitmap(struct kvm_vcpu *vcpu, u32 *msrpm, write ? clear_bit(bit_write, &tmp) : set_bit(bit_write, &tmp); msrpm[offset] = tmp; + + svm_hv_vmcb_dirty_nested_enlightenments(vcpu); + } void set_msr_interception(struct kvm_vcpu *vcpu, u32 *msrpm, u32 msr, @@ -858,8 +876,8 @@ static __init void svm_adjust_mmio_mask(void) return; /* If memory encryption is not enabled, use existing mask */ - rdmsrl(MSR_K8_SYSCFG, msr); - if (!(msr & MSR_K8_SYSCFG_MEM_ENCRYPT)) + rdmsrl(MSR_AMD64_SYSCFG, msr); + if (!(msr & MSR_AMD64_SYSCFG_MEM_ENCRYPT)) return; enc_bit = cpuid_ebx(0x8000001f) & 0x3f; @@ -934,6 +952,16 @@ static __init int svm_hardware_setup(void) int r; unsigned int order = get_order(IOPM_SIZE); + /* + * NX is required for shadow paging and for NPT if the NX huge pages + * mitigation is enabled. + */ + if (!boot_cpu_has(X86_FEATURE_NX)) { + pr_err_ratelimited("NX (Execute Disable) not supported\n"); + return -EOPNOTSUPP; + } + kvm_enable_efer_bits(EFER_NX); + iopm_pages = alloc_pages(GFP_KERNEL, order); if (!iopm_pages) @@ -947,9 +975,6 @@ static __init int svm_hardware_setup(void) supported_xcr0 &= ~(XFEATURE_MASK_BNDREGS | XFEATURE_MASK_BNDCSR); - if (boot_cpu_has(X86_FEATURE_NX)) - kvm_enable_efer_bits(EFER_NX); - if (boot_cpu_has(X86_FEATURE_FXSR_OPT)) kvm_enable_efer_bits(EFER_FFXSR); @@ -959,8 +984,7 @@ static __init int svm_hardware_setup(void) kvm_tsc_scaling_ratio_frac_bits = 32; } - if (boot_cpu_has(X86_FEATURE_RDTSCP)) - kvm_define_user_return_msr(TSC_AUX_URET_SLOT, MSR_TSC_AUX); + tsc_aux_uret_slot = kvm_add_user_return_msr(MSR_TSC_AUX); /* Check for pause filtering support */ if (!boot_cpu_has(X86_FEATURE_PAUSEFILTER)) { @@ -992,6 +1016,8 @@ static __init int svm_hardware_setup(void) /* Note, SEV setup consumes npt_enabled. */ sev_hardware_setup(); + svm_hv_hardware_setup(); + svm_adjust_mmio_mask(); for_each_possible_cpu(cpu) { @@ -1005,16 +1031,12 @@ static __init int svm_hardware_setup(void) nrips = false; } - if (avic) { - if (!npt_enabled || - !boot_cpu_has(X86_FEATURE_AVIC) || - !IS_ENABLED(CONFIG_X86_LOCAL_APIC)) { - avic = false; - } else { - pr_info("AVIC enabled\n"); + enable_apicv = avic = avic && npt_enabled && boot_cpu_has(X86_FEATURE_AVIC); - amd_iommu_register_ga_log_notifier(&avic_ga_log_notifier); - } + if (enable_apicv) { + pr_info("AVIC enabled\n"); + + amd_iommu_register_ga_log_notifier(&avic_ga_log_notifier); } if (vls) { @@ -1078,29 +1100,35 @@ static void init_sys_seg(struct vmcb_seg *seg, uint32_t type) seg->base = 0; } -static u64 svm_write_l1_tsc_offset(struct kvm_vcpu *vcpu, u64 offset) +static u64 svm_get_l2_tsc_offset(struct kvm_vcpu *vcpu) { struct vcpu_svm *svm = to_svm(vcpu); - u64 g_tsc_offset = 0; - if (is_guest_mode(vcpu)) { - /* Write L1's TSC offset. */ - g_tsc_offset = svm->vmcb->control.tsc_offset - - svm->vmcb01.ptr->control.tsc_offset; - svm->vmcb01.ptr->control.tsc_offset = offset; - } + return svm->nested.ctl.tsc_offset; +} - trace_kvm_write_tsc_offset(vcpu->vcpu_id, - svm->vmcb->control.tsc_offset - g_tsc_offset, - offset); +static u64 svm_get_l2_tsc_multiplier(struct kvm_vcpu *vcpu) +{ + return kvm_default_tsc_scaling_ratio; +} - svm->vmcb->control.tsc_offset = offset + g_tsc_offset; +static void svm_write_tsc_offset(struct kvm_vcpu *vcpu, u64 offset) +{ + struct vcpu_svm *svm = to_svm(vcpu); + svm->vmcb01.ptr->control.tsc_offset = vcpu->arch.l1_tsc_offset; + svm->vmcb->control.tsc_offset = offset; vmcb_mark_dirty(svm->vmcb, VMCB_INTERCEPTS); - return svm->vmcb->control.tsc_offset; } -static void svm_check_invpcid(struct vcpu_svm *svm) +static void svm_write_tsc_multiplier(struct kvm_vcpu *vcpu, u64 multiplier) +{ + wrmsrl(MSR_AMD64_TSC_RATIO, multiplier); +} + +/* Evaluate instruction intercepts that depend on guest CPUID features. */ +static void svm_recalc_instruction_intercepts(struct kvm_vcpu *vcpu, + struct vcpu_svm *svm) { /* * Intercept INVPCID if shadow paging is enabled to sync/free shadow @@ -1113,6 +1141,13 @@ static void svm_check_invpcid(struct vcpu_svm *svm) else svm_clr_intercept(svm, INTERCEPT_INVPCID); } + + if (kvm_cpu_cap_has(X86_FEATURE_RDTSCP)) { + if (guest_cpuid_has(vcpu, X86_FEATURE_RDTSCP)) + svm_clr_intercept(svm, INTERCEPT_RDTSCP); + else + svm_set_intercept(svm, INTERCEPT_RDTSCP); + } } static void init_vmcb(struct kvm_vcpu *vcpu) @@ -1235,8 +1270,8 @@ static void init_vmcb(struct kvm_vcpu *vcpu) svm->current_vmcb->asid_generation = 0; svm->asid = 0; - svm->nested.vmcb12_gpa = 0; - svm->nested.last_vmcb12_gpa = 0; + svm->nested.vmcb12_gpa = INVALID_GPA; + svm->nested.last_vmcb12_gpa = INVALID_GPA; vcpu->arch.hflags = 0; if (!kvm_pause_in_guest(vcpu->kvm)) { @@ -1248,7 +1283,7 @@ static void init_vmcb(struct kvm_vcpu *vcpu) svm_clr_intercept(svm, INTERCEPT_PAUSE); } - svm_check_invpcid(svm); + svm_recalc_instruction_intercepts(vcpu, svm); /* * If the host supports V_SPEC_CTRL then disable the interception @@ -1276,6 +1311,8 @@ static void init_vmcb(struct kvm_vcpu *vcpu) } } + svm_hv_init_vmcb(svm->vmcb); + vmcb_mark_all_dirty(svm->vmcb); enable_gif(svm); @@ -1424,6 +1461,9 @@ static void svm_prepare_guest_switch(struct kvm_vcpu *vcpu) struct vcpu_svm *svm = to_svm(vcpu); struct svm_cpu_data *sd = per_cpu(svm_data, vcpu->cpu); + if (sev_es_guest(vcpu->kvm)) + sev_es_unmap_ghcb(svm); + if (svm->guest_state_loaded) return; @@ -1445,8 +1485,8 @@ static void svm_prepare_guest_switch(struct kvm_vcpu *vcpu) } } - if (static_cpu_has(X86_FEATURE_RDTSCP)) - kvm_set_user_return_msr(TSC_AUX_URET_SLOT, svm->tsc_aux, -1ull); + if (likely(tsc_aux_uret_slot >= 0)) + kvm_set_user_return_msr(tsc_aux_uret_slot, svm->tsc_aux, -1ull); svm->guest_state_loaded = true; } @@ -2655,11 +2695,6 @@ static int svm_get_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info) msr_info->data |= (u64)svm->sysenter_esp_hi << 32; break; case MSR_TSC_AUX: - if (!boot_cpu_has(X86_FEATURE_RDTSCP)) - return 1; - if (!msr_info->host_initiated && - !guest_cpuid_has(vcpu, X86_FEATURE_RDTSCP)) - return 1; msr_info->data = svm->tsc_aux; break; /* @@ -2876,30 +2911,13 @@ static int svm_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr) svm->sysenter_esp_hi = guest_cpuid_is_intel(vcpu) ? (data >> 32) : 0; break; case MSR_TSC_AUX: - if (!boot_cpu_has(X86_FEATURE_RDTSCP)) - return 1; - - if (!msr->host_initiated && - !guest_cpuid_has(vcpu, X86_FEATURE_RDTSCP)) - return 1; - - /* - * Per Intel's SDM, bits 63:32 are reserved, but AMD's APM has - * incomplete and conflicting architectural behavior. Current - * AMD CPUs completely ignore bits 63:32, i.e. they aren't - * reserved and always read as zeros. Emulate AMD CPU behavior - * to avoid explosions if the vCPU is migrated from an AMD host - * to an Intel host. - */ - data = (u32)data; - /* * TSC_AUX is usually changed only during boot and never read * directly. Intercept TSC_AUX instead of exposing it to the * guest via direct_access_msrs, and switch it via user return. */ preempt_disable(); - r = kvm_set_user_return_msr(TSC_AUX_URET_SLOT, data, -1ull); + r = kvm_set_user_return_msr(tsc_aux_uret_slot, data, -1ull); preempt_enable(); if (r) return 1; @@ -3084,6 +3102,7 @@ static int (*const svm_exit_handlers[])(struct kvm_vcpu *vcpu) = { [SVM_EXIT_STGI] = stgi_interception, [SVM_EXIT_CLGI] = clgi_interception, [SVM_EXIT_SKINIT] = skinit_interception, + [SVM_EXIT_RDTSCP] = kvm_handle_invalid_op, [SVM_EXIT_WBINVD] = kvm_emulate_wbinvd, [SVM_EXIT_MONITOR] = kvm_emulate_monitor, [SVM_EXIT_MWAIT] = kvm_emulate_mwait, @@ -3113,6 +3132,8 @@ static void dump_vmcb(struct kvm_vcpu *vcpu) return; } + pr_err("VMCB %p, last attempted VMRUN on CPU %d\n", + svm->current_vmcb->ptr, vcpu->arch.last_vmentry_cpu); pr_err("VMCB Control Area:\n"); pr_err("%-20s%04x\n", "cr_read:", control->intercepts[INTERCEPT_CR] & 0xffff); pr_err("%-20s%04x\n", "cr_write:", control->intercepts[INTERCEPT_CR] >> 16); @@ -3769,6 +3790,8 @@ static __no_kcsan fastpath_t svm_vcpu_run(struct kvm_vcpu *vcpu) } svm->vmcb->save.cr2 = vcpu->arch.cr2; + svm_hv_update_vp_id(svm->vmcb, vcpu); + /* * Run with all-zero DR6 unless needed, so that we can get the exact cause * of a #DB. @@ -3842,6 +3865,12 @@ static __no_kcsan fastpath_t svm_vcpu_run(struct kvm_vcpu *vcpu) svm->next_rip = 0; if (is_guest_mode(vcpu)) { nested_sync_control_from_vmcb02(svm); + + /* Track VMRUNs that have made past consistency checking */ + if (svm->nested.nested_run_pending && + svm->vmcb->control.exit_code != SVM_EXIT_ERR) + ++vcpu->stat.nested_run; + svm->nested.nested_run_pending = 0; } @@ -3853,10 +3882,8 @@ static __no_kcsan fastpath_t svm_vcpu_run(struct kvm_vcpu *vcpu) vcpu->arch.apf.host_apf_flags = kvm_read_and_reset_apf_flags(); - if (npt_enabled) { - vcpu->arch.regs_avail &= ~(1 << VCPU_EXREG_PDPTR); - vcpu->arch.regs_dirty &= ~(1 << VCPU_EXREG_PDPTR); - } + if (npt_enabled) + kvm_register_clear_available(vcpu, VCPU_EXREG_PDPTR); /* * We need to handle MC intercepts here before the vcpu has a chance to @@ -3884,6 +3911,8 @@ static void svm_load_mmu_pgd(struct kvm_vcpu *vcpu, hpa_t root_hpa, svm->vmcb->control.nested_cr3 = __sme_set(root_hpa); vmcb_mark_dirty(svm->vmcb, VMCB_NPT); + hv_track_root_tdp(vcpu, root_hpa); + /* Loading L2's CR3 is handled by enter_svm_guest_mode. */ if (!test_bit(VCPU_EXREG_CR3, (ulong *)&vcpu->arch.regs_avail)) return; @@ -3972,8 +4001,7 @@ static void svm_vcpu_after_set_cpuid(struct kvm_vcpu *vcpu) svm->nrips_enabled = kvm_cpu_cap_has(X86_FEATURE_NRIPS) && guest_cpuid_has(vcpu, X86_FEATURE_NRIPS); - /* Check again if INVPCID interception if required */ - svm_check_invpcid(svm); + svm_recalc_instruction_intercepts(vcpu, svm); /* For sev guests, the memory encryption bit is not reserved in CR3. */ if (sev_guest(vcpu->kvm)) { @@ -4257,7 +4285,7 @@ static int svm_smi_allowed(struct kvm_vcpu *vcpu, bool for_injection) return !svm_smi_blocked(vcpu); } -static int svm_pre_enter_smm(struct kvm_vcpu *vcpu, char *smstate) +static int svm_enter_smm(struct kvm_vcpu *vcpu, char *smstate) { struct vcpu_svm *svm = to_svm(vcpu); int ret; @@ -4279,7 +4307,7 @@ static int svm_pre_enter_smm(struct kvm_vcpu *vcpu, char *smstate) return 0; } -static int svm_pre_leave_smm(struct kvm_vcpu *vcpu, const char *smstate) +static int svm_leave_smm(struct kvm_vcpu *vcpu, const char *smstate) { struct vcpu_svm *svm = to_svm(vcpu); struct kvm_host_map map; @@ -4435,13 +4463,12 @@ static int svm_vm_init(struct kvm *kvm) if (!pause_filter_count || !pause_filter_thresh) kvm->arch.pause_in_guest = true; - if (avic) { + if (enable_apicv) { int ret = avic_vm_init(kvm); if (ret) return ret; } - kvm_apicv_init(kvm, avic); return 0; } @@ -4532,7 +4559,10 @@ static struct kvm_x86_ops svm_x86_ops __initdata = { .has_wbinvd_exit = svm_has_wbinvd_exit, - .write_l1_tsc_offset = svm_write_l1_tsc_offset, + .get_l2_tsc_offset = svm_get_l2_tsc_offset, + .get_l2_tsc_multiplier = svm_get_l2_tsc_multiplier, + .write_tsc_offset = svm_write_tsc_offset, + .write_tsc_multiplier = svm_write_tsc_multiplier, .load_mmu_pgd = svm_load_mmu_pgd, @@ -4552,8 +4582,8 @@ static struct kvm_x86_ops svm_x86_ops __initdata = { .setup_mce = svm_setup_mce, .smi_allowed = svm_smi_allowed, - .pre_enter_smm = svm_pre_enter_smm, - .pre_leave_smm = svm_pre_leave_smm, + .enter_smm = svm_enter_smm, + .leave_smm = svm_leave_smm, .enable_smi_window = svm_enable_smi_window, .mem_enc_op = svm_mem_enc_op, |