diff options
Diffstat (limited to '')
-rw-r--r-- | arch/x86/kvm/vmx/vmx.c | 1387 |
1 files changed, 946 insertions, 441 deletions
diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c index 0dbf94eb954f..63247c57c72c 100644 --- a/arch/x86/kvm/vmx/vmx.c +++ b/arch/x86/kvm/vmx/vmx.c @@ -36,6 +36,7 @@ #include <asm/debugreg.h> #include <asm/desc.h> #include <asm/fpu/api.h> +#include <asm/fpu/xstate.h> #include <asm/idtentry.h> #include <asm/io.h> #include <asm/irq_remapping.h> @@ -104,6 +105,9 @@ module_param(fasteoi, bool, S_IRUGO); module_param(enable_apicv, bool, S_IRUGO); +bool __read_mostly enable_ipiv = true; +module_param(enable_ipiv, bool, 0444); + /* * If nested=1, nested virtualization is supported, i.e., guests may use * VMX and be a hypervisor for its own guests. If nested=0, guests may not @@ -115,6 +119,9 @@ module_param(nested, bool, S_IRUGO); bool __read_mostly enable_pml = 1; module_param_named(pml, enable_pml, bool, S_IRUGO); +static bool __read_mostly error_on_inconsistent_vmcs_config = true; +module_param(error_on_inconsistent_vmcs_config, bool, 0444); + static bool __read_mostly dump_invalid_vmcs = 0; module_param(dump_invalid_vmcs, bool, 0644); @@ -161,6 +168,8 @@ static u32 vmx_possible_passthrough_msrs[MAX_POSSIBLE_PASSTHROUGH_MSRS] = { MSR_FS_BASE, MSR_GS_BASE, MSR_KERNEL_GS_BASE, + MSR_IA32_XFD, + MSR_IA32_XFD_ERR, #endif MSR_IA32_SYSENTER_CS, MSR_IA32_SYSENTER_ESP, @@ -226,6 +235,9 @@ static const struct { #define L1D_CACHE_ORDER 4 static void *vmx_l1d_flush_pages; +/* Control for disabling CPU Fill buffer clear */ +static bool __read_mostly vmx_fb_clear_ctrl_available; + static int vmx_setup_l1d_flush(enum vmx_l1d_flush_state l1tf) { struct page *page; @@ -357,6 +369,60 @@ static int vmentry_l1d_flush_get(char *s, const struct kernel_param *kp) return sprintf(s, "%s\n", vmentry_l1d_param[l1tf_vmx_mitigation].option); } +static void vmx_setup_fb_clear_ctrl(void) +{ + u64 msr; + + if (boot_cpu_has(X86_FEATURE_ARCH_CAPABILITIES) && + !boot_cpu_has_bug(X86_BUG_MDS) && + !boot_cpu_has_bug(X86_BUG_TAA)) { + rdmsrl(MSR_IA32_ARCH_CAPABILITIES, msr); + if (msr & ARCH_CAP_FB_CLEAR_CTRL) + vmx_fb_clear_ctrl_available = true; + } +} + +static __always_inline void vmx_disable_fb_clear(struct vcpu_vmx *vmx) +{ + u64 msr; + + if (!vmx->disable_fb_clear) + return; + + msr = __rdmsr(MSR_IA32_MCU_OPT_CTRL); + msr |= FB_CLEAR_DIS; + native_wrmsrl(MSR_IA32_MCU_OPT_CTRL, msr); + /* Cache the MSR value to avoid reading it later */ + vmx->msr_ia32_mcu_opt_ctrl = msr; +} + +static __always_inline void vmx_enable_fb_clear(struct vcpu_vmx *vmx) +{ + if (!vmx->disable_fb_clear) + return; + + vmx->msr_ia32_mcu_opt_ctrl &= ~FB_CLEAR_DIS; + native_wrmsrl(MSR_IA32_MCU_OPT_CTRL, vmx->msr_ia32_mcu_opt_ctrl); +} + +static void vmx_update_fb_clear_dis(struct kvm_vcpu *vcpu, struct vcpu_vmx *vmx) +{ + vmx->disable_fb_clear = vmx_fb_clear_ctrl_available; + + /* + * If guest will not execute VERW, there is no need to set FB_CLEAR_DIS + * at VMEntry. Skip the MSR read/write when a guest has no use case to + * execute VERW. + */ + if ((vcpu->arch.arch_capabilities & ARCH_CAP_FB_CLEAR) || + ((vcpu->arch.arch_capabilities & ARCH_CAP_MDS_NO) && + (vcpu->arch.arch_capabilities & ARCH_CAP_TAA_NO) && + (vcpu->arch.arch_capabilities & ARCH_CAP_PSDP_NO) && + (vcpu->arch.arch_capabilities & ARCH_CAP_FBSDP_NO) && + (vcpu->arch.arch_capabilities & ARCH_CAP_SBDR_SSDP_NO))) + vmx->disable_fb_clear = false; +} + static const struct kernel_param_ops vmentry_l1d_flush_ops = { .set = vmentry_l1d_flush_set, .get = vmentry_l1d_flush_get, @@ -373,7 +439,7 @@ do { \ pr_warn_ratelimited(fmt); \ } while (0) -asmlinkage void vmread_error(unsigned long field, bool fault) +void vmread_error(unsigned long field, bool fault) { if (fault) kvm_spurious_fault(); @@ -383,18 +449,20 @@ asmlinkage void vmread_error(unsigned long field, bool fault) noinline void vmwrite_error(unsigned long field, unsigned long value) { - vmx_insn_failed("kvm: vmwrite failed: field=%lx val=%lx err=%d\n", + vmx_insn_failed("kvm: vmwrite failed: field=%lx val=%lx err=%u\n", field, value, vmcs_read32(VM_INSTRUCTION_ERROR)); } noinline void vmclear_error(struct vmcs *vmcs, u64 phys_addr) { - vmx_insn_failed("kvm: vmclear failed: %p/%llx\n", vmcs, phys_addr); + vmx_insn_failed("kvm: vmclear failed: %p/%llx err=%u\n", + vmcs, phys_addr, vmcs_read32(VM_INSTRUCTION_ERROR)); } noinline void vmptrld_error(struct vmcs *vmcs, u64 phys_addr) { - vmx_insn_failed("kvm: vmptrld failed: %p/%llx\n", vmcs, phys_addr); + vmx_insn_failed("kvm: vmptrld failed: %p/%llx err=%u\n", + vmcs, phys_addr, vmcs_read32(VM_INSTRUCTION_ERROR)); } noinline void invvpid_error(unsigned long ext, u16 vpid, gva_t gva) @@ -538,11 +606,6 @@ static inline bool cpu_need_virtualize_apic_accesses(struct kvm_vcpu *vcpu) return flexpriority_enabled && lapic_in_kernel(vcpu); } -static inline bool report_flexpriority(void) -{ - return flexpriority_enabled; -} - static int possible_passthrough_msr_slot(u32 msr) { u32 i; @@ -602,15 +665,13 @@ static int vmx_set_guest_uret_msr(struct vcpu_vmx *vmx, unsigned int slot = msr - vmx->guest_uret_msrs; int ret = 0; - u64 old_msr_data = msr->data; - msr->data = data; if (msr->load_into_hardware) { preempt_disable(); - ret = kvm_set_user_return_msr(slot, msr->data, msr->mask); + ret = kvm_set_user_return_msr(slot, data, msr->mask); preempt_enable(); - if (ret) - msr->data = old_msr_data; } + if (!ret) + msr->data = data; return ret; } @@ -644,10 +705,10 @@ static void __loaded_vmcs_clear(void *arg) /* * Ensure all writes to loaded_vmcs, including deleting it from its - * current percpu list, complete before setting loaded_vmcs->vcpu to - * -1, otherwise a different cpu can see vcpu == -1 first and add - * loaded_vmcs to its percpu list before it's deleted from this cpu's - * list. Pairs with the smp_rmb() in vmx_vcpu_load_vmcs(). + * current percpu list, complete before setting loaded_vmcs->cpu to + * -1, otherwise a different cpu can see loaded_vmcs->cpu == -1 first + * and add loaded_vmcs to its percpu list before it's deleted from this + * cpu's list. Pairs with the smp_rmb() in vmx_vcpu_load_vmcs(). */ smp_wmb(); @@ -763,6 +824,14 @@ void vmx_update_exception_bitmap(struct kvm_vcpu *vcpu) vmcs_write32(PAGE_FAULT_ERROR_CODE_MATCH, match); } + /* + * Disabling xfd interception indicates that dynamic xfeatures + * might be used in the guest. Always trap #NM in this case + * to save guest xfd_err timely. + */ + if (vcpu->arch.xfd_no_write_intercept) + eb |= (1u << NM_VECTOR); + vmcs_write32(EXCEPTION_BITMAP, eb); } @@ -774,11 +843,28 @@ static bool msr_write_intercepted(struct vcpu_vmx *vmx, u32 msr) if (!(exec_controls_get(vmx) & CPU_BASED_USE_MSR_BITMAPS)) return true; - return vmx_test_msr_bitmap_write(vmx->loaded_vmcs->msr_bitmap, - MSR_IA32_SPEC_CTRL); + return vmx_test_msr_bitmap_write(vmx->loaded_vmcs->msr_bitmap, msr); } -static void clear_atomic_switch_msr_special(struct vcpu_vmx *vmx, +unsigned int __vmx_vcpu_run_flags(struct vcpu_vmx *vmx) +{ + unsigned int flags = 0; + + if (vmx->loaded_vmcs->launched) + flags |= VMX_RUN_VMRESUME; + + /* + * If writes to the SPEC_CTRL MSR aren't intercepted, the guest is free + * to change it directly without causing a vmexit. In that case read + * it after vmexit and store it in vmx->spec_ctrl. + */ + if (unlikely(!msr_write_intercepted(vmx, MSR_IA32_SPEC_CTRL))) + flags |= VMX_RUN_SAVE_SPEC_CTRL; + + return flags; +} + +static __always_inline void clear_atomic_switch_msr_special(struct vcpu_vmx *vmx, unsigned long entry, unsigned long exit) { vm_entry_controls_clearbit(vmx, entry); @@ -836,7 +922,7 @@ skip_guest: vmcs_write32(VM_EXIT_MSR_LOAD_COUNT, m->host.nr); } -static void add_atomic_switch_msr_special(struct vcpu_vmx *vmx, +static __always_inline void add_atomic_switch_msr_special(struct vcpu_vmx *vmx, unsigned long entry, unsigned long exit, unsigned long guest_val_vmcs, unsigned long host_val_vmcs, u64 guest_val, u64 host_val) @@ -1271,7 +1357,6 @@ void vmx_vcpu_load_vmcs(struct kvm_vcpu *vcpu, int cpu, if (!already_loaded) { void *gdt = get_current_gdt_ro(); - unsigned long sysenter_esp; /* * Flush all EPTP/VPID contexts, the new pCPU may have stale @@ -1287,8 +1372,11 @@ void vmx_vcpu_load_vmcs(struct kvm_vcpu *vcpu, int cpu, (unsigned long)&get_cpu_entry_area(cpu)->tss.x86_tss); vmcs_writel(HOST_GDTR_BASE, (unsigned long)gdt); /* 22.2.4 */ - rdmsrl(MSR_IA32_SYSENTER_ESP, sysenter_esp); - vmcs_writel(HOST_IA32_SYSENTER_ESP, sysenter_esp); /* 22.2.3 */ + if (IS_ENABLED(CONFIG_IA32_EMULATION) || IS_ENABLED(CONFIG_X86_32)) { + /* 22.2.3 */ + vmcs_writel(HOST_IA32_SYSENTER_ESP, + (unsigned long)(cpu_entry_stack(cpu) + 1)); + } vmx->loaded_vmcs->cpu = cpu; } @@ -1469,11 +1557,12 @@ static int vmx_rtit_ctl_check(struct kvm_vcpu *vcpu, u64 data) return 0; } -static bool vmx_can_emulate_instruction(struct kvm_vcpu *vcpu, void *insn, int insn_len) +static bool vmx_can_emulate_instruction(struct kvm_vcpu *vcpu, int emul_type, + void *insn, int insn_len) { /* * Emulation of instructions in SGX enclaves is impossible as RIP does - * not point tthe failing instruction, and even if it did, the code + * not point at the failing instruction, and even if it did, the code * stream is inaccessible. Inject #UD instead of exiting to userspace * so that guest userspace can't DoS the guest simply by triggering * emulation (enclaves are CPL3 only). @@ -1563,17 +1652,25 @@ static void vmx_update_emulated_instruction(struct kvm_vcpu *vcpu) /* * Per the SDM, MTF takes priority over debug-trap exceptions besides - * T-bit traps. As instruction emulation is completed (i.e. at the - * instruction boundary), any #DB exception pending delivery must be a - * debug-trap. Record the pending MTF state to be delivered in + * TSS T-bit traps and ICEBP (INT1). KVM doesn't emulate T-bit traps + * or ICEBP (in the emulator proper), and skipping of ICEBP after an + * intercepted #DB deliberately avoids single-step #DB and MTF updates + * as ICEBP is higher priority than both. As instruction emulation is + * completed at this point (i.e. KVM is at the instruction boundary), + * any #DB exception pending delivery must be a debug-trap of lower + * priority than MTF. Record the pending MTF state to be delivered in * vmx_check_nested_events(). */ if (nested_cpu_has_mtf(vmcs12) && (!vcpu->arch.exception.pending || - vcpu->arch.exception.nr == DB_VECTOR)) + vcpu->arch.exception.vector == DB_VECTOR) && + (!vcpu->arch.exception_vmexit.pending || + vcpu->arch.exception_vmexit.vector == DB_VECTOR)) { vmx->nested.mtf_pending = true; - else + kvm_make_request(KVM_REQ_EVENT, vcpu); + } else { vmx->nested.mtf_pending = false; + } } static int vmx_skip_emulated_instruction(struct kvm_vcpu *vcpu) @@ -1595,32 +1692,40 @@ static void vmx_clear_hlt(struct kvm_vcpu *vcpu) vmcs_write32(GUEST_ACTIVITY_STATE, GUEST_ACTIVITY_ACTIVE); } -static void vmx_queue_exception(struct kvm_vcpu *vcpu) +static void vmx_inject_exception(struct kvm_vcpu *vcpu) { + struct kvm_queued_exception *ex = &vcpu->arch.exception; + u32 intr_info = ex->vector | INTR_INFO_VALID_MASK; struct vcpu_vmx *vmx = to_vmx(vcpu); - unsigned nr = vcpu->arch.exception.nr; - bool has_error_code = vcpu->arch.exception.has_error_code; - u32 error_code = vcpu->arch.exception.error_code; - u32 intr_info = nr | INTR_INFO_VALID_MASK; - kvm_deliver_exception_payload(vcpu); + kvm_deliver_exception_payload(vcpu, ex); - if (has_error_code) { - vmcs_write32(VM_ENTRY_EXCEPTION_ERROR_CODE, error_code); + if (ex->has_error_code) { + /* + * Despite the error code being architecturally defined as 32 + * bits, and the VMCS field being 32 bits, Intel CPUs and thus + * VMX don't actually supporting setting bits 31:16. Hardware + * will (should) never provide a bogus error code, but AMD CPUs + * do generate error codes with bits 31:16 set, and so KVM's + * ABI lets userspace shove in arbitrary 32-bit values. Drop + * the upper bits to avoid VM-Fail, losing information that + * does't really exist is preferable to killing the VM. + */ + vmcs_write32(VM_ENTRY_EXCEPTION_ERROR_CODE, (u16)ex->error_code); intr_info |= INTR_INFO_DELIVER_CODE_MASK; } if (vmx->rmode.vm86_active) { int inc_eip = 0; - if (kvm_exception_is_soft(nr)) + if (kvm_exception_is_soft(ex->vector)) inc_eip = vcpu->arch.event_exit_inst_len; - kvm_inject_realmode_interrupt(vcpu, nr, inc_eip); + kvm_inject_realmode_interrupt(vcpu, ex->vector, inc_eip); return; } WARN_ON_ONCE(vmx->emulation_required); - if (kvm_exception_is_soft(nr)) { + if (kvm_exception_is_soft(ex->vector)) { vmcs_write32(VM_ENTRY_INSTRUCTION_LEN, vmx->vcpu.arch.event_exit_inst_len); intr_info |= INTR_TYPE_SOFT_EXCEPTION; @@ -1705,7 +1810,7 @@ u64 vmx_get_l2_tsc_multiplier(struct kvm_vcpu *vcpu) nested_cpu_has2(vmcs12, SECONDARY_EXEC_TSC_SCALING)) return vmcs12->tsc_multiplier; - return kvm_default_tsc_scaling_ratio; + return kvm_caps.default_tsc_scaling_ratio; } static void vmx_write_tsc_offset(struct kvm_vcpu *vcpu, u64 offset) @@ -1753,7 +1858,7 @@ static int vmx_get_msr_feature(struct kvm_msr_entry *msr) } /* - * Reads an msr value (of 'msr_index') into 'pdata'. + * Reads an msr value (of 'msr_info->index') into 'msr_info->data'. * Returns 0 on success, non-0 otherwise. * Assumes vcpu_load() was already called. */ @@ -1841,9 +1946,8 @@ static int vmx_get_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info) * sanity checking and refuse to boot. Filter all unsupported * features out. */ - if (!msr_info->host_initiated && - vmx->nested.enlightened_vmcs_enabled) - nested_evmcs_filter_control_msr(msr_info->index, + if (!msr_info->host_initiated && guest_cpuid_has_evmcs(vcpu)) + nested_evmcs_filter_control_msr(vcpu, msr_info->index, &msr_info->data); break; case MSR_IA32_RTIT_CTL: @@ -1917,15 +2021,17 @@ static u64 nested_vmx_truncate_sysenter_addr(struct kvm_vcpu *vcpu, return (unsigned long)data; } -static u64 vcpu_supported_debugctl(struct kvm_vcpu *vcpu) +static u64 vmx_get_supported_debugctl(struct kvm_vcpu *vcpu, bool host_initiated) { - u64 debugctl = vmx_supported_debugctl(); + u64 debugctl = 0; - if (!intel_pmu_lbr_is_enabled(vcpu)) - debugctl &= ~DEBUGCTLMSR_LBR_MASK; + if (boot_cpu_has(X86_FEATURE_BUS_LOCK_DETECT) && + (host_initiated || guest_cpuid_has(vcpu, X86_FEATURE_BUS_LOCK_DETECT))) + debugctl |= DEBUGCTLMSR_BUS_LOCK_DETECT; - if (!guest_cpuid_has(vcpu, X86_FEATURE_BUS_LOCK_DETECT)) - debugctl &= ~DEBUGCTLMSR_BUS_LOCK_DETECT; + if ((vmx_get_perf_capabilities() & PMU_CAP_LBR_FMT) && + (host_initiated || intel_pmu_lbr_is_enabled(vcpu))) + debugctl |= DEBUGCTLMSR_LBR | DEBUGCTLMSR_FREEZE_LBRS_ON_PMI; return debugctl; } @@ -1960,6 +2066,24 @@ static int vmx_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info) case MSR_KERNEL_GS_BASE: vmx_write_guest_kernel_gs_base(vmx, data); break; + case MSR_IA32_XFD: + ret = kvm_set_msr_common(vcpu, msr_info); + /* + * Always intercepting WRMSR could incur non-negligible + * overhead given xfd might be changed frequently in + * guest context switch. Disable write interception + * upon the first write with a non-zero value (indicating + * potential usage on dynamic xfeatures). Also update + * exception bitmap to trap #NM for proper virtualization + * of guest xfd_err. + */ + if (!ret && data) { + vmx_disable_intercept_for_msr(vcpu, MSR_IA32_XFD, + MSR_TYPE_RW); + vcpu->arch.xfd_no_write_intercept = true; + vmx_update_exception_bitmap(vcpu); + } + break; #endif case MSR_IA32_SYSENTER_CS: if (is_guest_mode(vcpu)) @@ -1981,7 +2105,9 @@ static int vmx_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info) vmcs_writel(GUEST_SYSENTER_ESP, data); break; case MSR_IA32_DEBUGCTLMSR: { - u64 invalid = data & ~vcpu_supported_debugctl(vcpu); + u64 invalid; + + invalid = data & ~vmx_get_supported_debugctl(vcpu, msr_info->host_initiated); if (invalid & (DEBUGCTLMSR_BTF|DEBUGCTLMSR_LBR)) { if (report_ignored_msrs) vcpu_unimpl(vcpu, "%s: BTF|LBR in IA32_DEBUGCTLMSR 0x%llx, nop\n", @@ -2011,6 +2137,12 @@ static int vmx_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info) if (is_noncanonical_address(data & PAGE_MASK, vcpu) || (data & MSR_IA32_BNDCFGS_RSVD)) return 1; + + if (is_guest_mode(vcpu) && + ((vmx->nested.msrs.entry_ctls_high & VM_ENTRY_LOAD_BNDCFGS) || + (vmx->nested.msrs.exit_ctls_high & VM_EXIT_CLEAR_BNDCFGS))) + get_vmcs12(vcpu)->guest_bndcfgs = data; + vmcs_write64(GUEST_BNDCFGS, data); break; case MSR_IA32_UMWAIT_CONTROL: @@ -2100,9 +2232,6 @@ static int vmx_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info) } ret = kvm_set_msr_common(vcpu, msr_info); break; - case MSR_IA32_TSC_ADJUST: - ret = kvm_set_msr_common(vcpu, msr_info); - break; case MSR_IA32_MCG_EXT_CTL: if ((!msr_info->host_initiated && !(to_vmx(vcpu)->msr_ia32_feature_control & @@ -2215,7 +2344,18 @@ static int vmx_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info) if ((data & PMU_CAP_LBR_FMT) != (vmx_get_perf_capabilities() & PMU_CAP_LBR_FMT)) return 1; - if (!intel_pmu_lbr_is_compatible(vcpu)) + if (!cpuid_model_is_consistent(vcpu)) + return 1; + } + if (data & PERF_CAP_PEBS_FORMAT) { + if ((data & PERF_CAP_PEBS_MASK) != + (vmx_get_perf_capabilities() & PERF_CAP_PEBS_MASK)) + return 1; + if (!guest_cpuid_has(vcpu, X86_FEATURE_DS)) + return 1; + if (!guest_cpuid_has(vcpu, X86_FEATURE_DTES64)) + return 1; + if (!cpuid_model_is_consistent(vcpu)) return 1; } ret = kvm_set_msr_common(vcpu, msr_info); @@ -2230,6 +2370,10 @@ static int vmx_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info) ret = kvm_set_msr_common(vcpu, msr_info); } + /* FB_CLEAR may have changed, also update the FB_CLEAR_DIS behavior */ + if (msr_index == MSR_IA32_ARCH_CAPABILITIES) + vmx_update_fb_clear_dis(vcpu, vmx); + return ret; } @@ -2307,7 +2451,7 @@ fault: return -EFAULT; } -static int hardware_enable(void) +static int vmx_hardware_enable(void) { int cpu = raw_smp_processor_id(); u64 phys_addr = __pa(per_cpu(vmxarea, cpu)); @@ -2348,7 +2492,7 @@ static void vmclear_local_loaded_vmcss(void) __loaded_vmcs_clear(v); } -static void hardware_disable(void) +static void vmx_hardware_disable(void) { vmclear_local_loaded_vmcss(); @@ -2369,6 +2513,30 @@ static bool cpu_has_sgx(void) return cpuid_eax(0) >= 0x12 && (cpuid_eax(0x12) & BIT(0)); } +/* + * Some cpus support VM_{ENTRY,EXIT}_IA32_PERF_GLOBAL_CTRL but they + * can't be used due to errata where VM Exit may incorrectly clear + * IA32_PERF_GLOBAL_CTRL[34:32]. Work around the errata by using the + * MSR load mechanism to switch IA32_PERF_GLOBAL_CTRL. + */ +static bool cpu_has_perf_global_ctrl_bug(void) +{ + if (boot_cpu_data.x86 == 0x6) { + switch (boot_cpu_data.x86_model) { + case INTEL_FAM6_NEHALEM_EP: /* AAK155 */ + case INTEL_FAM6_NEHALEM: /* AAP115 */ + case INTEL_FAM6_WESTMERE: /* AAT100 */ + case INTEL_FAM6_WESTMERE_EP: /* BC86,AAY89,BD102 */ + case INTEL_FAM6_NEHALEM_EX: /* BA97 */ + return true; + default: + break; + } + } + + return false; +} + static __init int adjust_vmx_controls(u32 ctl_min, u32 ctl_opt, u32 msr, u32 *result) { @@ -2388,74 +2556,56 @@ static __init int adjust_vmx_controls(u32 ctl_min, u32 ctl_opt, return 0; } +static __init u64 adjust_vmx_controls64(u64 ctl_opt, u32 msr) +{ + u64 allowed; + + rdmsrl(msr, allowed); + + return ctl_opt & allowed; +} + static __init int setup_vmcs_config(struct vmcs_config *vmcs_conf, struct vmx_capability *vmx_cap) { u32 vmx_msr_low, vmx_msr_high; - u32 min, opt, min2, opt2; u32 _pin_based_exec_control = 0; u32 _cpu_based_exec_control = 0; u32 _cpu_based_2nd_exec_control = 0; + u64 _cpu_based_3rd_exec_control = 0; u32 _vmexit_control = 0; u32 _vmentry_control = 0; + u64 misc_msr; + int i; + + /* + * LOAD/SAVE_DEBUG_CONTROLS are absent because both are mandatory. + * SAVE_IA32_PAT and SAVE_IA32_EFER are absent because KVM always + * intercepts writes to PAT and EFER, i.e. never enables those controls. + */ + struct { + u32 entry_control; + u32 exit_control; + } const vmcs_entry_exit_pairs[] = { + { VM_ENTRY_LOAD_IA32_PERF_GLOBAL_CTRL, VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL }, + { VM_ENTRY_LOAD_IA32_PAT, VM_EXIT_LOAD_IA32_PAT }, + { VM_ENTRY_LOAD_IA32_EFER, VM_EXIT_LOAD_IA32_EFER }, + { VM_ENTRY_LOAD_BNDCFGS, VM_EXIT_CLEAR_BNDCFGS }, + { VM_ENTRY_LOAD_IA32_RTIT_CTL, VM_EXIT_CLEAR_IA32_RTIT_CTL }, + }; memset(vmcs_conf, 0, sizeof(*vmcs_conf)); - min = CPU_BASED_HLT_EXITING | -#ifdef CONFIG_X86_64 - CPU_BASED_CR8_LOAD_EXITING | - CPU_BASED_CR8_STORE_EXITING | -#endif - CPU_BASED_CR3_LOAD_EXITING | - CPU_BASED_CR3_STORE_EXITING | - CPU_BASED_UNCOND_IO_EXITING | - CPU_BASED_MOV_DR_EXITING | - CPU_BASED_USE_TSC_OFFSETTING | - CPU_BASED_MWAIT_EXITING | - CPU_BASED_MONITOR_EXITING | - CPU_BASED_INVLPG_EXITING | - CPU_BASED_RDPMC_EXITING; - - opt = CPU_BASED_TPR_SHADOW | - CPU_BASED_USE_MSR_BITMAPS | - CPU_BASED_ACTIVATE_SECONDARY_CONTROLS; - if (adjust_vmx_controls(min, opt, MSR_IA32_VMX_PROCBASED_CTLS, - &_cpu_based_exec_control) < 0) + + if (adjust_vmx_controls(KVM_REQUIRED_VMX_CPU_BASED_VM_EXEC_CONTROL, + KVM_OPTIONAL_VMX_CPU_BASED_VM_EXEC_CONTROL, + MSR_IA32_VMX_PROCBASED_CTLS, + &_cpu_based_exec_control)) return -EIO; -#ifdef CONFIG_X86_64 - if ((_cpu_based_exec_control & CPU_BASED_TPR_SHADOW)) - _cpu_based_exec_control &= ~CPU_BASED_CR8_LOAD_EXITING & - ~CPU_BASED_CR8_STORE_EXITING; -#endif if (_cpu_based_exec_control & CPU_BASED_ACTIVATE_SECONDARY_CONTROLS) { - min2 = 0; - opt2 = SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES | - SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE | - SECONDARY_EXEC_WBINVD_EXITING | - SECONDARY_EXEC_ENABLE_VPID | - SECONDARY_EXEC_ENABLE_EPT | - SECONDARY_EXEC_UNRESTRICTED_GUEST | - SECONDARY_EXEC_PAUSE_LOOP_EXITING | - SECONDARY_EXEC_DESC | - SECONDARY_EXEC_ENABLE_RDTSCP | - SECONDARY_EXEC_ENABLE_INVPCID | - SECONDARY_EXEC_APIC_REGISTER_VIRT | - SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY | - SECONDARY_EXEC_SHADOW_VMCS | - SECONDARY_EXEC_XSAVES | - SECONDARY_EXEC_RDSEED_EXITING | - SECONDARY_EXEC_RDRAND_EXITING | - SECONDARY_EXEC_ENABLE_PML | - SECONDARY_EXEC_TSC_SCALING | - SECONDARY_EXEC_ENABLE_USR_WAIT_PAUSE | - SECONDARY_EXEC_PT_USE_GPA | - SECONDARY_EXEC_PT_CONCEAL_VMX | - SECONDARY_EXEC_ENABLE_VMFUNC | - SECONDARY_EXEC_BUS_LOCK_DETECTION; - if (cpu_has_sgx()) - opt2 |= SECONDARY_EXEC_ENCLS_EXITING; - if (adjust_vmx_controls(min2, opt2, + if (adjust_vmx_controls(KVM_REQUIRED_VMX_SECONDARY_VM_EXEC_CONTROL, + KVM_OPTIONAL_VMX_SECONDARY_VM_EXEC_CONTROL, MSR_IA32_VMX_PROCBASED_CTLS2, - &_cpu_based_2nd_exec_control) < 0) + &_cpu_based_2nd_exec_control)) return -EIO; } #ifndef CONFIG_X86_64 @@ -2473,43 +2623,45 @@ static __init int setup_vmcs_config(struct vmcs_config *vmcs_conf, rdmsr_safe(MSR_IA32_VMX_EPT_VPID_CAP, &vmx_cap->ept, &vmx_cap->vpid); - if (_cpu_based_2nd_exec_control & SECONDARY_EXEC_ENABLE_EPT) { - /* CR3 accesses and invlpg don't need to cause VM Exits when EPT - enabled */ - _cpu_based_exec_control &= ~(CPU_BASED_CR3_LOAD_EXITING | - CPU_BASED_CR3_STORE_EXITING | - CPU_BASED_INVLPG_EXITING); - } else if (vmx_cap->ept) { - vmx_cap->ept = 0; + if (!(_cpu_based_2nd_exec_control & SECONDARY_EXEC_ENABLE_EPT) && + vmx_cap->ept) { pr_warn_once("EPT CAP should not exist if not support " "1-setting enable EPT VM-execution control\n"); + + if (error_on_inconsistent_vmcs_config) + return -EIO; + + vmx_cap->ept = 0; } if (!(_cpu_based_2nd_exec_control & SECONDARY_EXEC_ENABLE_VPID) && - vmx_cap->vpid) { - vmx_cap->vpid = 0; + vmx_cap->vpid) { pr_warn_once("VPID CAP should not exist if not support " "1-setting enable VPID VM-execution control\n"); + + if (error_on_inconsistent_vmcs_config) + return -EIO; + + vmx_cap->vpid = 0; } - min = VM_EXIT_SAVE_DEBUG_CONTROLS | VM_EXIT_ACK_INTR_ON_EXIT; -#ifdef CONFIG_X86_64 - min |= VM_EXIT_HOST_ADDR_SPACE_SIZE; -#endif - opt = VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL | - VM_EXIT_LOAD_IA32_PAT | - VM_EXIT_LOAD_IA32_EFER | - VM_EXIT_CLEAR_BNDCFGS | - VM_EXIT_PT_CONCEAL_PIP | - VM_EXIT_CLEAR_IA32_RTIT_CTL; - if (adjust_vmx_controls(min, opt, MSR_IA32_VMX_EXIT_CTLS, - &_vmexit_control) < 0) + if (!cpu_has_sgx()) + _cpu_based_2nd_exec_control &= ~SECONDARY_EXEC_ENCLS_EXITING; + + if (_cpu_based_exec_control & CPU_BASED_ACTIVATE_TERTIARY_CONTROLS) + _cpu_based_3rd_exec_control = + adjust_vmx_controls64(KVM_OPTIONAL_VMX_TERTIARY_VM_EXEC_CONTROL, + MSR_IA32_VMX_PROCBASED_CTLS3); + + if (adjust_vmx_controls(KVM_REQUIRED_VMX_VM_EXIT_CONTROLS, + KVM_OPTIONAL_VMX_VM_EXIT_CONTROLS, + MSR_IA32_VMX_EXIT_CTLS, + &_vmexit_control)) return -EIO; - min = PIN_BASED_EXT_INTR_MASK | PIN_BASED_NMI_EXITING; - opt = PIN_BASED_VIRTUAL_NMIS | PIN_BASED_POSTED_INTR | - PIN_BASED_VMX_PREEMPTION_TIMER; - if (adjust_vmx_controls(min, opt, MSR_IA32_VMX_PINBASED_CTLS, - &_pin_based_exec_control) < 0) + if (adjust_vmx_controls(KVM_REQUIRED_VMX_PIN_BASED_VM_EXEC_CONTROL, + KVM_OPTIONAL_VMX_PIN_BASED_VM_EXEC_CONTROL, + MSR_IA32_VMX_PINBASED_CTLS, + &_pin_based_exec_control)) return -EIO; if (cpu_has_broken_vmx_preemption_timer()) @@ -2518,40 +2670,28 @@ static __init int setup_vmcs_config(struct vmcs_config *vmcs_conf, SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY)) _pin_based_exec_control &= ~PIN_BASED_POSTED_INTR; - min = VM_ENTRY_LOAD_DEBUG_CONTROLS; - opt = VM_ENTRY_LOAD_IA32_PERF_GLOBAL_CTRL | - VM_ENTRY_LOAD_IA32_PAT | - VM_ENTRY_LOAD_IA32_EFER | - VM_ENTRY_LOAD_BNDCFGS | - VM_ENTRY_PT_CONCEAL_PIP | - VM_ENTRY_LOAD_IA32_RTIT_CTL; - if (adjust_vmx_controls(min, opt, MSR_IA32_VMX_ENTRY_CTLS, - &_vmentry_control) < 0) + if (adjust_vmx_controls(KVM_REQUIRED_VMX_VM_ENTRY_CONTROLS, + KVM_OPTIONAL_VMX_VM_ENTRY_CONTROLS, + MSR_IA32_VMX_ENTRY_CTLS, + &_vmentry_control)) return -EIO; - /* - * Some cpus support VM_{ENTRY,EXIT}_IA32_PERF_GLOBAL_CTRL but they - * can't be used due to an errata where VM Exit may incorrectly clear - * IA32_PERF_GLOBAL_CTRL[34:32]. Workaround the errata by using the - * MSR load mechanism to switch IA32_PERF_GLOBAL_CTRL. - */ - if (boot_cpu_data.x86 == 0x6) { - switch (boot_cpu_data.x86_model) { - case 26: /* AAK155 */ - case 30: /* AAP115 */ - case 37: /* AAT100 */ - case 44: /* BC86,AAY89,BD102 */ - case 46: /* BA97 */ - _vmentry_control &= ~VM_ENTRY_LOAD_IA32_PERF_GLOBAL_CTRL; - _vmexit_control &= ~VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL; - pr_warn_once("kvm: VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL " - "does not work properly. Using workaround\n"); - break; - default: - break; - } - } + for (i = 0; i < ARRAY_SIZE(vmcs_entry_exit_pairs); i++) { + u32 n_ctrl = vmcs_entry_exit_pairs[i].entry_control; + u32 x_ctrl = vmcs_entry_exit_pairs[i].exit_control; + + if (!(_vmentry_control & n_ctrl) == !(_vmexit_control & x_ctrl)) + continue; + + pr_warn_once("Inconsistent VM-Entry/VM-Exit pair, entry = %x, exit = %x\n", + _vmentry_control & n_ctrl, _vmexit_control & x_ctrl); + if (error_on_inconsistent_vmcs_config) + return -EIO; + + _vmentry_control &= ~n_ctrl; + _vmexit_control &= ~x_ctrl; + } rdmsr(MSR_IA32_VMX_BASIC, vmx_msr_low, vmx_msr_high); @@ -2569,8 +2709,9 @@ static __init int setup_vmcs_config(struct vmcs_config *vmcs_conf, if (((vmx_msr_high >> 18) & 15) != 6) return -EIO; + rdmsrl(MSR_IA32_VMX_MISC, misc_msr); + vmcs_conf->size = vmx_msr_high & 0x1fff; - vmcs_conf->order = get_order(vmcs_conf->size); vmcs_conf->basic_cap = vmx_msr_high & ~0x1fff; vmcs_conf->revision_id = vmx_msr_low; @@ -2578,13 +2719,10 @@ static __init int setup_vmcs_config(struct vmcs_config *vmcs_conf, vmcs_conf->pin_based_exec_ctrl = _pin_based_exec_control; vmcs_conf->cpu_based_exec_ctrl = _cpu_based_exec_control; vmcs_conf->cpu_based_2nd_exec_ctrl = _cpu_based_2nd_exec_control; + vmcs_conf->cpu_based_3rd_exec_ctrl = _cpu_based_3rd_exec_control; vmcs_conf->vmexit_ctrl = _vmexit_control; vmcs_conf->vmentry_ctrl = _vmentry_control; - -#if IS_ENABLED(CONFIG_HYPERV) - if (enlightened_vmcs) - evmcs_sanitize_exec_ctrls(vmcs_conf); -#endif + vmcs_conf->misc = misc_msr; return 0; } @@ -2595,7 +2733,7 @@ struct vmcs *alloc_vmcs_cpu(bool shadow, int cpu, gfp_t flags) struct page *pages; struct vmcs *vmcs; - pages = __alloc_pages_node(node, flags, vmcs_config.order); + pages = __alloc_pages_node(node, flags, 0); if (!pages) return NULL; vmcs = page_address(pages); @@ -2614,7 +2752,7 @@ struct vmcs *alloc_vmcs_cpu(bool shadow, int cpu, gfp_t flags) void free_vmcs(struct vmcs *vmcs) { - free_pages((unsigned long)vmcs, vmcs_config.order); + free_page((unsigned long)vmcs); } /* @@ -2845,21 +2983,22 @@ static void enter_rmode(struct kvm_vcpu *vcpu) int vmx_set_efer(struct kvm_vcpu *vcpu, u64 efer) { struct vcpu_vmx *vmx = to_vmx(vcpu); - struct vmx_uret_msr *msr = vmx_find_uret_msr(vmx, MSR_EFER); /* Nothing to do if hardware doesn't support EFER. */ - if (!msr) + if (!vmx_find_uret_msr(vmx, MSR_EFER)) return 0; vcpu->arch.efer = efer; - if (efer & EFER_LMA) { - vm_entry_controls_setbit(to_vmx(vcpu), VM_ENTRY_IA32E_MODE); - msr->data = efer; - } else { - vm_entry_controls_clearbit(to_vmx(vcpu), VM_ENTRY_IA32E_MODE); +#ifdef CONFIG_X86_64 + if (efer & EFER_LMA) + vm_entry_controls_setbit(vmx, VM_ENTRY_IA32E_MODE); + else + vm_entry_controls_clearbit(vmx, VM_ENTRY_IA32E_MODE); +#else + if (KVM_BUG_ON(efer & EFER_LMA, vcpu->kvm)) + return 1; +#endif - msr->data = efer & ~EFER_LME; - } vmx_setup_uret_msrs(vmx); return 0; } @@ -2885,7 +3024,6 @@ static void enter_lmode(struct kvm_vcpu *vcpu) static void exit_lmode(struct kvm_vcpu *vcpu) { - vm_entry_controls_clearbit(to_vmx(vcpu), VM_ENTRY_IA32E_MODE); vmx_set_efer(vcpu, vcpu->arch.efer & ~EFER_LMA); } @@ -2924,7 +3062,7 @@ static inline int vmx_get_current_vpid(struct kvm_vcpu *vcpu) static void vmx_flush_tlb_current(struct kvm_vcpu *vcpu) { struct kvm_mmu *mmu = vcpu->arch.mmu; - u64 root_hpa = mmu->root_hpa; + u64 root_hpa = mmu->root.hpa; /* No flush required if the current context is invalid. */ if (!VALID_PAGE(root_hpa)) @@ -2932,7 +3070,7 @@ static void vmx_flush_tlb_current(struct kvm_vcpu *vcpu) if (enable_ept) ept_sync_context(construct_eptp(vcpu, root_hpa, - mmu->shadow_root_level)); + mmu->root_role.level)); else vpid_sync_context(vmx_get_current_vpid(vcpu)); } @@ -2985,7 +3123,7 @@ void ept_save_pdptrs(struct kvm_vcpu *vcpu) mmu->pdptrs[2] = vmcs_read64(GUEST_PDPTR2); mmu->pdptrs[3] = vmcs_read64(GUEST_PDPTR3); - kvm_register_mark_dirty(vcpu, VCPU_EXREG_PDPTR); + kvm_register_mark_available(vcpu, VCPU_EXREG_PDPTR); } #define CR3_EXITING_BITS (CPU_BASED_CR3_LOAD_EXITING | \ @@ -3067,6 +3205,13 @@ void vmx_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0) /* Note, vmx_set_cr4() consumes the new vcpu->arch.cr0. */ if ((old_cr0_pg ^ cr0) & X86_CR0_PG) vmx_set_cr4(vcpu, kvm_read_cr4(vcpu)); + + /* + * When !CR0_PG -> CR0_PG, vcpu->arch.cr3 becomes active, but + * GUEST_CR3 is still vmx->ept_identity_map_addr if EPT + !URG. + */ + if (!(old_cr0_pg & X86_CR0_PG) && (cr0 & X86_CR0_PG)) + kvm_register_mark_dirty(vcpu, VCPU_EXREG_CR3); } /* depends on vcpu->arch.cr0 to be set to a new value */ @@ -3110,9 +3255,9 @@ static void vmx_load_mmu_pgd(struct kvm_vcpu *vcpu, hpa_t root_hpa, if (!enable_unrestricted_guest && !is_paging(vcpu)) guest_cr3 = to_kvm_vmx(kvm)->ept_identity_map_addr; - else if (test_bit(VCPU_EXREG_CR3, (ulong *)&vcpu->arch.regs_avail)) + else if (kvm_register_is_dirty(vcpu, VCPU_EXREG_CR3)) guest_cr3 = vcpu->arch.cr3; - else /* vmcs01.GUEST_CR3 is already up-to-date. */ + else /* vmcs.GUEST_CR3 is already up-to-date. */ update_guest_cr3 = false; vmx_ept_load_pdptrs(vcpu); } else { @@ -3123,12 +3268,13 @@ static void vmx_load_mmu_pgd(struct kvm_vcpu *vcpu, hpa_t root_hpa, vmcs_writel(GUEST_CR3, guest_cr3); } + static bool vmx_is_valid_cr4(struct kvm_vcpu *vcpu, unsigned long cr4) { /* * We operate under the default treatment of SMM, so VMX cannot be - * enabled under SMM. Note, whether or not VMXE is allowed at all is - * handled by kvm_is_valid_cr4(). + * enabled under SMM. Note, whether or not VMXE is allowed at all, + * i.e. is a reserved bit, is handled by common x86 code. */ if ((cr4 & X86_CR4_VMXE) && is_smm(vcpu)) return false; @@ -3599,7 +3745,7 @@ static int init_rmode_identity_map(struct kvm *kvm) } /* Set up identity-mapping pagetable for EPT in real mode */ - for (i = 0; i < PT32_ENT_PER_PAGE; i++) { + for (i = 0; i < (PAGE_SIZE / sizeof(tmp)); i++) { tmp = (i << 22) + (_PAGE_PRESENT | _PAGE_RW | _PAGE_USER | _PAGE_ACCESSED | _PAGE_DIRTY | _PAGE_PSE); if (__copy_to_user(uaddr + i * sizeof(tmp), &tmp, sizeof(tmp))) { @@ -3687,6 +3833,19 @@ void free_vpid(int vpid) spin_unlock(&vmx_vpid_lock); } +static void vmx_msr_bitmap_l01_changed(struct vcpu_vmx *vmx) +{ + /* + * When KVM is a nested hypervisor on top of Hyper-V and uses + * 'Enlightened MSR Bitmap' feature L0 needs to know that MSR + * bitmap has changed. + */ + if (static_branch_unlikely(&enable_evmcs)) + evmcs_touch_msr_bitmap(); + + vmx->nested.force_msr_bitmap_recalc = true; +} + void vmx_disable_intercept_for_msr(struct kvm_vcpu *vcpu, u32 msr, int type) { struct vcpu_vmx *vmx = to_vmx(vcpu); @@ -3695,8 +3854,7 @@ void vmx_disable_intercept_for_msr(struct kvm_vcpu *vcpu, u32 msr, int type) if (!cpu_has_vmx_msr_bitmap()) return; - if (static_branch_unlikely(&enable_evmcs)) - evmcs_touch_msr_bitmap(); + vmx_msr_bitmap_l01_changed(vmx); /* * Mark the desired intercept state in shadow bitmap, this is needed @@ -3740,8 +3898,7 @@ void vmx_enable_intercept_for_msr(struct kvm_vcpu *vcpu, u32 msr, int type) if (!cpu_has_vmx_msr_bitmap()) return; - if (static_branch_unlikely(&enable_evmcs)) - evmcs_touch_msr_bitmap(); + vmx_msr_bitmap_l01_changed(vmx); /* * Mark the desired intercept state in shadow bitmap, this is needed @@ -3818,6 +3975,8 @@ static void vmx_update_msr_bitmap_x2apic(struct kvm_vcpu *vcpu) vmx_enable_intercept_for_msr(vcpu, X2APIC_MSR(APIC_TMCCT), MSR_TYPE_RW); vmx_disable_intercept_for_msr(vcpu, X2APIC_MSR(APIC_EOI), MSR_TYPE_W); vmx_disable_intercept_for_msr(vcpu, X2APIC_MSR(APIC_SELF_IPI), MSR_TYPE_W); + if (enable_ipiv) + vmx_disable_intercept_for_msr(vcpu, X2APIC_MSR(APIC_ICR), MSR_TYPE_RW); } } @@ -3863,59 +4022,70 @@ static void vmx_msr_filter_changed(struct kvm_vcpu *vcpu) u32 i; /* - * Set intercept permissions for all potentially passed through MSRs - * again. They will automatically get filtered through the MSR filter, - * so we are back in sync after this. + * Redo intercept permissions for MSRs that KVM is passing through to + * the guest. Disabling interception will check the new MSR filter and + * ensure that KVM enables interception if usersepace wants to filter + * the MSR. MSRs that KVM is already intercepting don't need to be + * refreshed since KVM is going to intercept them regardless of what + * userspace wants. */ for (i = 0; i < ARRAY_SIZE(vmx_possible_passthrough_msrs); i++) { u32 msr = vmx_possible_passthrough_msrs[i]; - bool read = test_bit(i, vmx->shadow_msr_intercept.read); - bool write = test_bit(i, vmx->shadow_msr_intercept.write); - vmx_set_intercept_for_msr(vcpu, msr, MSR_TYPE_R, read); - vmx_set_intercept_for_msr(vcpu, msr, MSR_TYPE_W, write); + if (!test_bit(i, vmx->shadow_msr_intercept.read)) + vmx_disable_intercept_for_msr(vcpu, msr, MSR_TYPE_R); + + if (!test_bit(i, vmx->shadow_msr_intercept.write)) + vmx_disable_intercept_for_msr(vcpu, msr, MSR_TYPE_W); } - pt_update_intercept_for_msr(vcpu); + /* PT MSRs can be passed through iff PT is exposed to the guest. */ + if (vmx_pt_mode_is_host_guest()) + pt_update_intercept_for_msr(vcpu); } -static inline bool kvm_vcpu_trigger_posted_interrupt(struct kvm_vcpu *vcpu, - bool nested) +static inline void kvm_vcpu_trigger_posted_interrupt(struct kvm_vcpu *vcpu, + int pi_vec) { #ifdef CONFIG_SMP - int pi_vec = nested ? POSTED_INTR_NESTED_VECTOR : POSTED_INTR_VECTOR; - if (vcpu->mode == IN_GUEST_MODE) { /* - * The vector of interrupt to be delivered to vcpu had - * been set in PIR before this function. + * The vector of the virtual has already been set in the PIR. + * Send a notification event to deliver the virtual interrupt + * unless the vCPU is the currently running vCPU, i.e. the + * event is being sent from a fastpath VM-Exit handler, in + * which case the PIR will be synced to the vIRR before + * re-entering the guest. * - * Following cases will be reached in this block, and - * we always send a notification event in all cases as - * explained below. + * When the target is not the running vCPU, the following + * possibilities emerge: * - * Case 1: vcpu keeps in non-root mode. Sending a - * notification event posts the interrupt to vcpu. + * Case 1: vCPU stays in non-root mode. Sending a notification + * event posts the interrupt to the vCPU. * - * Case 2: vcpu exits to root mode and is still - * runnable. PIR will be synced to vIRR before the - * next vcpu entry. Sending a notification event in - * this case has no effect, as vcpu is not in root - * mode. + * Case 2: vCPU exits to root mode and is still runnable. The + * PIR will be synced to the vIRR before re-entering the guest. + * Sending a notification event is ok as the host IRQ handler + * will ignore the spurious event. * - * Case 3: vcpu exits to root mode and is blocked. - * vcpu_block() has already synced PIR to vIRR and - * never blocks vcpu if vIRR is not cleared. Therefore, - * a blocked vcpu here does not wait for any requested - * interrupts in PIR, and sending a notification event - * which has no effect is safe here. + * Case 3: vCPU exits to root mode and is blocked. vcpu_block() + * has already synced PIR to vIRR and never blocks the vCPU if + * the vIRR is not empty. Therefore, a blocked vCPU here does + * not wait for any requested interrupts in PIR, and sending a + * notification event also results in a benign, spurious event. */ - apic->send_IPI_mask(get_cpu_mask(vcpu->cpu), pi_vec); - return true; + if (vcpu != kvm_get_running_vcpu()) + apic->send_IPI_mask(get_cpu_mask(vcpu->cpu), pi_vec); + return; } #endif - return false; + /* + * The vCPU isn't in the guest; wake the vCPU in case it is blocking, + * otherwise do nothing as KVM will grab the highest priority pending + * IRQ via ->sync_pir_to_irr() in vcpu_enter_guest(). + */ + kvm_vcpu_wake_up(vcpu); } static int vmx_deliver_nested_posted_interrupt(struct kvm_vcpu *vcpu, @@ -3931,9 +4101,21 @@ static int vmx_deliver_nested_posted_interrupt(struct kvm_vcpu *vcpu, */ vmx->nested.pi_pending = true; kvm_make_request(KVM_REQ_EVENT, vcpu); + + /* + * This pairs with the smp_mb_*() after setting vcpu->mode in + * vcpu_enter_guest() to guarantee the vCPU sees the event + * request if triggering a posted interrupt "fails" because + * vcpu->mode != IN_GUEST_MODE. The extra barrier is needed as + * the smb_wmb() in kvm_make_request() only ensures everything + * done before making the request is visible when the request + * is visible, it doesn't ensure ordering between the store to + * vcpu->requests and the load from vcpu->mode. + */ + smp_mb__after_atomic(); + /* the PIR and ON have been set by L1. */ - if (!kvm_vcpu_trigger_posted_interrupt(vcpu, true)) - kvm_vcpu_kick(vcpu); + kvm_vcpu_trigger_posted_interrupt(vcpu, POSTED_INTR_NESTED_VECTOR); return 0; } return -1; @@ -3954,7 +4136,8 @@ static int vmx_deliver_posted_interrupt(struct kvm_vcpu *vcpu, int vector) if (!r) return 0; - if (!vcpu->arch.apicv_active) + /* Note, this is called iff the local APIC is in-kernel. */ + if (!vcpu->arch.apic->apicv_active) return -1; if (pi_test_and_set_pir(vector, &vmx->pi_desc)) @@ -3964,12 +4147,31 @@ static int vmx_deliver_posted_interrupt(struct kvm_vcpu *vcpu, int vector) if (pi_test_and_set_on(&vmx->pi_desc)) return 0; - if (!kvm_vcpu_trigger_posted_interrupt(vcpu, false)) - kvm_vcpu_kick(vcpu); - + /* + * The implied barrier in pi_test_and_set_on() pairs with the smp_mb_*() + * after setting vcpu->mode in vcpu_enter_guest(), thus the vCPU is + * guaranteed to see PID.ON=1 and sync the PIR to IRR if triggering a + * posted interrupt "fails" because vcpu->mode != IN_GUEST_MODE. + */ + kvm_vcpu_trigger_posted_interrupt(vcpu, POSTED_INTR_VECTOR); return 0; } +static void vmx_deliver_interrupt(struct kvm_lapic *apic, int delivery_mode, + int trig_mode, int vector) +{ + struct kvm_vcpu *vcpu = apic->vcpu; + + if (vmx_deliver_posted_interrupt(vcpu, vector)) { + kvm_lapic_set_irr(vector, apic); + kvm_make_request(KVM_REQ_EVENT, vcpu); + kvm_vcpu_kick(vcpu); + } else { + trace_kvm_apicv_accept_irq(vcpu->vcpu_id, delivery_mode, + trig_mode, vector); + } +} + /* * Set up the vmcs's constant host-state fields, i.e., host-state fields that * will not change in the lifetime of the guest. @@ -4021,6 +4223,16 @@ void vmx_set_constant_host_state(struct vcpu_vmx *vmx) rdmsr(MSR_IA32_SYSENTER_CS, low32, high32); vmcs_write32(HOST_IA32_SYSENTER_CS, low32); + + /* + * SYSENTER is used for 32-bit system calls on either 32-bit or + * 64-bit kernels. It is always zero If neither is allowed, otherwise + * vmx_vcpu_load_vmcs loads it with the per-CPU entry stack (and may + * have already done so!). + */ + if (!IS_ENABLED(CONFIG_IA32_EMULATION) && !IS_ENABLED(CONFIG_X86_32)) + vmcs_writel(HOST_IA32_SYSENTER_ESP, 0); + rdmsrl(MSR_IA32_SYSENTER_EIP, tmpl); vmcs_writel(HOST_IA32_SYSENTER_EIP, tmpl); /* 22.2.3 */ @@ -4039,8 +4251,10 @@ void set_cr4_guest_host_mask(struct vcpu_vmx *vmx) vcpu->arch.cr4_guest_owned_bits = KVM_POSSIBLE_CR4_GUEST_BITS & ~vcpu->arch.cr4_guest_rsvd_bits; - if (!enable_ept) - vcpu->arch.cr4_guest_owned_bits &= ~X86_CR4_PGE; + if (!enable_ept) { + vcpu->arch.cr4_guest_owned_bits &= ~X86_CR4_TLBFLUSH_BITS; + vcpu->arch.cr4_guest_owned_bits &= ~X86_CR4_PDPTR_BITS; + } if (is_guest_mode(&vmx->vcpu)) vcpu->arch.cr4_guest_owned_bits &= ~get_vmcs12(vcpu)->cr4_guest_host_mask; @@ -4070,18 +4284,37 @@ static u32 vmx_vmentry_ctrl(void) if (vmx_pt_mode_is_system()) vmentry_ctrl &= ~(VM_ENTRY_PT_CONCEAL_PIP | VM_ENTRY_LOAD_IA32_RTIT_CTL); - /* Loading of EFER and PERF_GLOBAL_CTRL are toggled dynamically */ - return vmentry_ctrl & - ~(VM_ENTRY_LOAD_IA32_PERF_GLOBAL_CTRL | VM_ENTRY_LOAD_IA32_EFER); + /* + * IA32e mode, and loading of EFER and PERF_GLOBAL_CTRL are toggled dynamically. + */ + vmentry_ctrl &= ~(VM_ENTRY_LOAD_IA32_PERF_GLOBAL_CTRL | + VM_ENTRY_LOAD_IA32_EFER | + VM_ENTRY_IA32E_MODE); + + if (cpu_has_perf_global_ctrl_bug()) + vmentry_ctrl &= ~VM_ENTRY_LOAD_IA32_PERF_GLOBAL_CTRL; + + return vmentry_ctrl; } static u32 vmx_vmexit_ctrl(void) { u32 vmexit_ctrl = vmcs_config.vmexit_ctrl; + /* + * Not used by KVM and never set in vmcs01 or vmcs02, but emulated for + * nested virtualization and thus allowed to be set in vmcs12. + */ + vmexit_ctrl &= ~(VM_EXIT_SAVE_IA32_PAT | VM_EXIT_SAVE_IA32_EFER | + VM_EXIT_SAVE_VMX_PREEMPTION_TIMER); + if (vmx_pt_mode_is_system()) vmexit_ctrl &= ~(VM_EXIT_PT_CONCEAL_PIP | VM_EXIT_CLEAR_IA32_RTIT_CTL); + + if (cpu_has_perf_global_ctrl_bug()) + vmexit_ctrl &= ~VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL; + /* Loading of EFER and PERF_GLOBAL_CTRL are toggled dynamically */ return vmexit_ctrl & ~(VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL | VM_EXIT_LOAD_IA32_EFER); @@ -4091,16 +4324,25 @@ static void vmx_refresh_apicv_exec_ctrl(struct kvm_vcpu *vcpu) { struct vcpu_vmx *vmx = to_vmx(vcpu); + if (is_guest_mode(vcpu)) { + vmx->nested.update_vmcs01_apicv_status = true; + return; + } + pin_controls_set(vmx, vmx_pin_based_exec_ctrl(vmx)); - if (cpu_has_secondary_exec_ctrls()) { - if (kvm_vcpu_apicv_active(vcpu)) - secondary_exec_controls_setbit(vmx, - SECONDARY_EXEC_APIC_REGISTER_VIRT | - SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY); - else - secondary_exec_controls_clearbit(vmx, - SECONDARY_EXEC_APIC_REGISTER_VIRT | - SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY); + + if (kvm_vcpu_apicv_active(vcpu)) { + secondary_exec_controls_setbit(vmx, + SECONDARY_EXEC_APIC_REGISTER_VIRT | + SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY); + if (enable_ipiv) + tertiary_exec_controls_setbit(vmx, TERTIARY_EXEC_IPI_VIRT); + } else { + secondary_exec_controls_clearbit(vmx, + SECONDARY_EXEC_APIC_REGISTER_VIRT | + SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY); + if (enable_ipiv) + tertiary_exec_controls_clearbit(vmx, TERTIARY_EXEC_IPI_VIRT); } vmx_update_msr_bitmap_x2apic(vcpu); @@ -4110,20 +4352,38 @@ static u32 vmx_exec_control(struct vcpu_vmx *vmx) { u32 exec_control = vmcs_config.cpu_based_exec_ctrl; + /* + * Not used by KVM, but fully supported for nesting, i.e. are allowed in + * vmcs12 and propagated to vmcs02 when set in vmcs12. + */ + exec_control &= ~(CPU_BASED_RDTSC_EXITING | + CPU_BASED_USE_IO_BITMAPS | + CPU_BASED_MONITOR_TRAP_FLAG | + CPU_BASED_PAUSE_EXITING); + + /* INTR_WINDOW_EXITING and NMI_WINDOW_EXITING are toggled dynamically */ + exec_control &= ~(CPU_BASED_INTR_WINDOW_EXITING | + CPU_BASED_NMI_WINDOW_EXITING); + if (vmx->vcpu.arch.switch_db_regs & KVM_DEBUGREG_WONT_EXIT) exec_control &= ~CPU_BASED_MOV_DR_EXITING; - if (!cpu_need_tpr_shadow(&vmx->vcpu)) { + if (!cpu_need_tpr_shadow(&vmx->vcpu)) exec_control &= ~CPU_BASED_TPR_SHADOW; + #ifdef CONFIG_X86_64 + if (exec_control & CPU_BASED_TPR_SHADOW) + exec_control &= ~(CPU_BASED_CR8_LOAD_EXITING | + CPU_BASED_CR8_STORE_EXITING); + else exec_control |= CPU_BASED_CR8_STORE_EXITING | CPU_BASED_CR8_LOAD_EXITING; #endif - } - if (!enable_ept) - exec_control |= CPU_BASED_CR3_STORE_EXITING | - CPU_BASED_CR3_LOAD_EXITING | - CPU_BASED_INVLPG_EXITING; + /* No need to intercept CR3 access or INVPLG when using EPT. */ + if (enable_ept) + exec_control &= ~(CPU_BASED_CR3_LOAD_EXITING | + CPU_BASED_CR3_STORE_EXITING | + CPU_BASED_INVLPG_EXITING); if (kvm_mwait_in_guest(vmx->vcpu.kvm)) exec_control &= ~(CPU_BASED_MWAIT_EXITING | CPU_BASED_MONITOR_EXITING); @@ -4132,6 +4392,20 @@ static u32 vmx_exec_control(struct vcpu_vmx *vmx) return exec_control; } +static u64 vmx_tertiary_exec_control(struct vcpu_vmx *vmx) +{ + u64 exec_control = vmcs_config.cpu_based_3rd_exec_ctrl; + + /* + * IPI virtualization relies on APICv. Disable IPI virtualization if + * APICv is inhibited. + */ + if (!enable_ipiv || !kvm_vcpu_apicv_active(&vmx->vcpu)) + exec_control &= ~TERTIARY_EXEC_IPI_VIRT; + + return exec_control; +} + /* * Adjust a single secondary execution control bit to intercept/allow an * instruction in the guest. This is usually done based on whether or not a @@ -4274,13 +4548,48 @@ static u32 vmx_secondary_exec_control(struct vcpu_vmx *vmx) if (!vcpu->kvm->arch.bus_lock_detection_enabled) exec_control &= ~SECONDARY_EXEC_BUS_LOCK_DETECTION; + if (!kvm_notify_vmexit_enabled(vcpu->kvm)) + exec_control &= ~SECONDARY_EXEC_NOTIFY_VM_EXITING; + return exec_control; } +static inline int vmx_get_pid_table_order(struct kvm *kvm) +{ + return get_order(kvm->arch.max_vcpu_ids * sizeof(*to_kvm_vmx(kvm)->pid_table)); +} + +static int vmx_alloc_ipiv_pid_table(struct kvm *kvm) +{ + struct page *pages; + struct kvm_vmx *kvm_vmx = to_kvm_vmx(kvm); + + if (!irqchip_in_kernel(kvm) || !enable_ipiv) + return 0; + + if (kvm_vmx->pid_table) + return 0; + + pages = alloc_pages(GFP_KERNEL | __GFP_ZERO, vmx_get_pid_table_order(kvm)); + if (!pages) + return -ENOMEM; + + kvm_vmx->pid_table = (void *)page_address(pages); + return 0; +} + +static int vmx_vcpu_precreate(struct kvm *kvm) +{ + return vmx_alloc_ipiv_pid_table(kvm); +} + #define VMX_XSS_EXIT_BITMAP 0 static void init_vmcs(struct vcpu_vmx *vmx) { + struct kvm *kvm = vmx->vcpu.kvm; + struct kvm_vmx *kvm_vmx = to_kvm_vmx(kvm); + if (nested) nested_vmx_set_vmcs_shadowing_bitmap(); @@ -4297,7 +4606,10 @@ static void init_vmcs(struct vcpu_vmx *vmx) if (cpu_has_secondary_exec_ctrls()) secondary_exec_controls_set(vmx, vmx_secondary_exec_control(vmx)); - if (kvm_vcpu_apicv_active(&vmx->vcpu)) { + if (cpu_has_tertiary_exec_ctrls()) + tertiary_exec_controls_set(vmx, vmx_tertiary_exec_control(vmx)); + + if (enable_apicv && lapic_in_kernel(&vmx->vcpu)) { vmcs_write64(EOI_EXIT_BITMAP0, 0); vmcs_write64(EOI_EXIT_BITMAP1, 0); vmcs_write64(EOI_EXIT_BITMAP2, 0); @@ -4309,12 +4621,20 @@ static void init_vmcs(struct vcpu_vmx *vmx) vmcs_write64(POSTED_INTR_DESC_ADDR, __pa((&vmx->pi_desc))); } - if (!kvm_pause_in_guest(vmx->vcpu.kvm)) { + if (vmx_can_use_ipiv(&vmx->vcpu)) { + vmcs_write64(PID_POINTER_TABLE, __pa(kvm_vmx->pid_table)); + vmcs_write16(LAST_PID_POINTER_INDEX, kvm->arch.max_vcpu_ids - 1); + } + + if (!kvm_pause_in_guest(kvm)) { vmcs_write32(PLE_GAP, ple_gap); vmx->ple_window = ple_window; vmx->ple_window_dirty = true; } + if (kvm_notify_vmexit_enabled(kvm)) + vmcs_write32(NOTIFY_WINDOW, kvm->arch.notify_window); + vmcs_write32(PAGE_FAULT_ERROR_CODE_MASK, 0); vmcs_write32(PAGE_FAULT_ERROR_CODE_MATCH, 0); vmcs_write32(CR3_TARGET_COUNT, 0); /* 22.2.1 */ @@ -4465,6 +4785,8 @@ static void vmx_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event) kvm_make_request(KVM_REQ_APIC_PAGE_RELOAD, vcpu); vpid_sync_context(vmx->vpid); + + vmx_update_fb_clear_dis(vcpu, vmx); } static void vmx_enable_irq_window(struct kvm_vcpu *vcpu) @@ -4483,13 +4805,13 @@ static void vmx_enable_nmi_window(struct kvm_vcpu *vcpu) exec_controls_setbit(to_vmx(vcpu), CPU_BASED_NMI_WINDOW_EXITING); } -static void vmx_inject_irq(struct kvm_vcpu *vcpu) +static void vmx_inject_irq(struct kvm_vcpu *vcpu, bool reinjected) { struct vcpu_vmx *vmx = to_vmx(vcpu); uint32_t intr; int irq = vcpu->arch.interrupt.nr; - trace_kvm_inj_virq(irq); + trace_kvm_inj_virq(irq, vcpu->arch.interrupt.soft, reinjected); ++vcpu->stat.irq_injections; if (vmx->rmode.vm86_active) { @@ -4692,7 +5014,7 @@ static int handle_rmode_exception(struct kvm_vcpu *vcpu, if (kvm_emulate_instruction(vcpu, 0)) { if (vcpu->arch.halt_request) { vcpu->arch.halt_request = 0; - return kvm_vcpu_halt(vcpu); + return kvm_emulate_halt_noskip(vcpu); } return 1; } @@ -4748,6 +5070,17 @@ static int handle_exception_nmi(struct kvm_vcpu *vcpu) if (is_machine_check(intr_info) || is_nmi(intr_info)) return 1; /* handled by handle_exception_nmi_irqoff() */ + /* + * Queue the exception here instead of in handle_nm_fault_irqoff(). + * This ensures the nested_vmx check is not skipped so vmexit can + * be reflected to L1 (when it intercepts #NM) before reaching this + * point. + */ + if (is_nm_fault(intr_info)) { + kvm_queue_exception(vcpu, NM_VECTOR); + return 1; + } + if (is_invalid_opcode(intr_info)) return handle_ud(vcpu); @@ -4811,8 +5144,35 @@ static int handle_exception_nmi(struct kvm_vcpu *vcpu) dr6 = vmx_get_exit_qual(vcpu); if (!(vcpu->guest_debug & (KVM_GUESTDBG_SINGLESTEP | KVM_GUESTDBG_USE_HW_BP))) { + /* + * If the #DB was due to ICEBP, a.k.a. INT1, skip the + * instruction. ICEBP generates a trap-like #DB, but + * despite its interception control being tied to #DB, + * is an instruction intercept, i.e. the VM-Exit occurs + * on the ICEBP itself. Use the inner "skip" helper to + * avoid single-step #DB and MTF updates, as ICEBP is + * higher priority. Note, skipping ICEBP still clears + * STI and MOVSS blocking. + * + * For all other #DBs, set vmcs.PENDING_DBG_EXCEPTIONS.BS + * if single-step is enabled in RFLAGS and STI or MOVSS + * blocking is active, as the CPU doesn't set the bit + * on VM-Exit due to #DB interception. VM-Entry has a + * consistency check that a single-step #DB is pending + * in this scenario as the previous instruction cannot + * have toggled RFLAGS.TF 0=>1 (because STI and POP/MOV + * don't modify RFLAGS), therefore the one instruction + * delay when activating single-step breakpoints must + * have already expired. Note, the CPU sets/clears BS + * as appropriate for all other VM-Exits types. + */ if (is_icebp(intr_info)) WARN_ON(!skip_emulated_instruction(vcpu)); + else if ((vmx_get_rflags(vcpu) & X86_EFLAGS_TF) && + (vmcs_read32(GUEST_INTERRUPTIBILITY_INFO) & + (GUEST_INTR_STATE_STI | GUEST_INTR_STATE_MOV_SS))) + vmcs_writel(GUEST_PENDING_DBG_EXCEPTIONS, + vmcs_readl(GUEST_PENDING_DBG_EXCEPTIONS) | DR6_BS); kvm_queue_exception_p(vcpu, DB_VECTOR, dr6); return 1; @@ -5050,7 +5410,7 @@ static int handle_dr(struct kvm_vcpu *vcpu) if (!kvm_require_dr(vcpu, dr)) return 1; - if (kvm_x86_ops.get_cpl(vcpu) > 0) + if (vmx_get_cpl(vcpu) > 0) goto out; dr7 = vmcs_readl(GUEST_DR7); @@ -5183,9 +5543,16 @@ static int handle_apic_eoi_induced(struct kvm_vcpu *vcpu) static int handle_apic_write(struct kvm_vcpu *vcpu) { unsigned long exit_qualification = vmx_get_exit_qual(vcpu); - u32 offset = exit_qualification & 0xfff; - /* APIC-write VM exit is trap-like and thus no need to adjust IP */ + /* + * APIC-write VM-Exit is trap-like, KVM doesn't need to advance RIP and + * hardware has done any necessary aliasing, offset adjustments, etc... + * for the access. I.e. the correct value has already been written to + * the vAPIC page for the correct 16-byte chunk. KVM needs only to + * retrieve the register value and emulate the access. + */ + u32 offset = exit_qualification & 0xff0; + kvm_apic_write_nodecode(vcpu, offset); return 1; } @@ -5267,7 +5634,7 @@ static int handle_ept_violation(struct kvm_vcpu *vcpu) vmcs_set_bits(GUEST_INTERRUPTIBILITY_INFO, GUEST_INTR_STATE_NMI); gpa = vmcs_read64(GUEST_PHYSICAL_ADDRESS); - trace_kvm_page_fault(gpa, exit_qualification); + trace_kvm_page_fault(vcpu, gpa, exit_qualification); /* Is it a read fault? */ error_code = (exit_qualification & EPT_VIOLATION_ACC_READ) @@ -5279,9 +5646,7 @@ static int handle_ept_violation(struct kvm_vcpu *vcpu) error_code |= (exit_qualification & EPT_VIOLATION_ACC_INSTR) ? PFERR_FETCH_MASK : 0; /* ept page table entry is present? */ - error_code |= (exit_qualification & - (EPT_VIOLATION_READABLE | EPT_VIOLATION_WRITABLE | - EPT_VIOLATION_EXECUTABLE)) + error_code |= (exit_qualification & EPT_VIOLATION_RWX_MASK) ? PFERR_PRESENT_MASK : 0; error_code |= (exit_qualification & EPT_VIOLATION_GVA_TRANSLATED) != 0 ? @@ -5307,7 +5672,7 @@ static int handle_ept_misconfig(struct kvm_vcpu *vcpu) { gpa_t gpa; - if (!vmx_can_emulate_instruction(vcpu, NULL, 0)) + if (!vmx_can_emulate_instruction(vcpu, EMULTYPE_PF, NULL, 0)) return 1; /* @@ -5336,6 +5701,14 @@ static int handle_nmi_window(struct kvm_vcpu *vcpu) return 1; } +static bool vmx_emulation_required_with_pending_exception(struct kvm_vcpu *vcpu) +{ + struct vcpu_vmx *vmx = to_vmx(vcpu); + + return vmx->emulation_required && !vmx->rmode.vm86_active && + (kvm_is_exception_pending(vcpu) || vcpu->arch.exception.injected); +} + static int handle_invalid_guest_state(struct kvm_vcpu *vcpu) { struct vcpu_vmx *vmx = to_vmx(vcpu); @@ -5355,15 +5728,14 @@ static int handle_invalid_guest_state(struct kvm_vcpu *vcpu) if (!kvm_emulate_instruction(vcpu, 0)) return 0; - if (vmx->emulation_required && !vmx->rmode.vm86_active && - vcpu->arch.exception.pending) { + if (vmx_emulation_required_with_pending_exception(vcpu)) { kvm_prepare_emulation_failure_exit(vcpu); return 0; } if (vcpu->arch.halt_request) { vcpu->arch.halt_request = 0; - return kvm_vcpu_halt(vcpu); + return kvm_emulate_halt_noskip(vcpu); } /* @@ -5378,6 +5750,16 @@ static int handle_invalid_guest_state(struct kvm_vcpu *vcpu) return 1; } +static int vmx_vcpu_pre_run(struct kvm_vcpu *vcpu) +{ + if (vmx_emulation_required_with_pending_exception(vcpu)) { + kvm_prepare_emulation_failure_exit(vcpu); + return 0; + } + + return 1; +} + static void grow_ple_window(struct kvm_vcpu *vcpu) { struct vcpu_vmx *vmx = to_vmx(vcpu); @@ -5543,6 +5925,32 @@ static int handle_bus_lock_vmexit(struct kvm_vcpu *vcpu) return 1; } +static int handle_notify(struct kvm_vcpu *vcpu) +{ + unsigned long exit_qual = vmx_get_exit_qual(vcpu); + bool context_invalid = exit_qual & NOTIFY_VM_CONTEXT_INVALID; + + ++vcpu->stat.notify_window_exits; + + /* + * Notify VM exit happened while executing iret from NMI, + * "blocked by NMI" bit has to be set before next VM entry. + */ + if (enable_vnmi && (exit_qual & INTR_INFO_UNBLOCK_NMI)) + vmcs_set_bits(GUEST_INTERRUPTIBILITY_INFO, + GUEST_INTR_STATE_NMI); + + if (vcpu->kvm->arch.notify_vmexit_flags & KVM_X86_NOTIFY_VMEXIT_USER || + context_invalid) { + vcpu->run->exit_reason = KVM_EXIT_NOTIFY; + vcpu->run->notify.flags = context_invalid ? + KVM_NOTIFY_CONTEXT_INVALID : 0; + return 0; + } + + return 1; +} + /* * The exit handlers return 1 if the exit was handled fully and guest execution * may resume. Otherwise they set the kvm_run parameter to indicate what needs @@ -5600,6 +6008,7 @@ static int (*kvm_vmx_exit_handlers[])(struct kvm_vcpu *vcpu) = { [EXIT_REASON_PREEMPTION_TIMER] = handle_preemption_timer, [EXIT_REASON_ENCLS] = handle_encls, [EXIT_REASON_BUS_LOCK] = handle_bus_lock_vmexit, + [EXIT_REASON_NOTIFY] = handle_notify, }; static const int kvm_vmx_max_exit_handlers = @@ -5697,6 +6106,7 @@ void dump_vmcs(struct kvm_vcpu *vcpu) struct vcpu_vmx *vmx = to_vmx(vcpu); u32 vmentry_ctl, vmexit_ctl; u32 cpu_based_exec_ctrl, pin_based_exec_ctrl, secondary_exec_control; + u64 tertiary_exec_control; unsigned long cr4; int efer_slot; @@ -5710,9 +6120,16 @@ void dump_vmcs(struct kvm_vcpu *vcpu) cpu_based_exec_ctrl = vmcs_read32(CPU_BASED_VM_EXEC_CONTROL); pin_based_exec_ctrl = vmcs_read32(PIN_BASED_VM_EXEC_CONTROL); cr4 = vmcs_readl(GUEST_CR4); - secondary_exec_control = 0; + if (cpu_has_secondary_exec_ctrls()) secondary_exec_control = vmcs_read32(SECONDARY_VM_EXEC_CONTROL); + else + secondary_exec_control = 0; + + if (cpu_has_tertiary_exec_ctrls()) + tertiary_exec_control = vmcs_read64(TERTIARY_VM_EXEC_CONTROL); + else + tertiary_exec_control = 0; pr_err("VMCS %p, last attempted VM-entry on CPU %d\n", vmx->loaded_vmcs->vmcs, vcpu->arch.last_vmentry_cpu); @@ -5812,9 +6229,10 @@ void dump_vmcs(struct kvm_vcpu *vcpu) vmx_dump_msrs("host autoload", &vmx->msr_autoload.host); pr_err("*** Control State ***\n"); - pr_err("PinBased=%08x CPUBased=%08x SecondaryExec=%08x\n", - pin_based_exec_ctrl, cpu_based_exec_ctrl, secondary_exec_control); - pr_err("EntryControls=%08x ExitControls=%08x\n", vmentry_ctl, vmexit_ctl); + pr_err("CPUBased=0x%08x SecondaryExec=0x%08x TertiaryExec=0x%016llx\n", + cpu_based_exec_ctrl, secondary_exec_control, tertiary_exec_control); + pr_err("PinBased=0x%08x EntryControls=%08x ExitControls=%08x\n", + pin_based_exec_ctrl, vmentry_ctl, vmexit_ctl); pr_err("ExceptionBitmap=%08x PFECmask=%08x PFECmatch=%08x\n", vmcs_read32(EXCEPTION_BITMAP), vmcs_read32(PAGE_FAULT_ERROR_CODE_MASK), @@ -5964,7 +6382,8 @@ static int __vmx_handle_exit(struct kvm_vcpu *vcpu, fastpath_t exit_fastpath) exit_reason.basic != EXIT_REASON_EPT_VIOLATION && exit_reason.basic != EXIT_REASON_PML_FULL && exit_reason.basic != EXIT_REASON_APIC_ACCESS && - exit_reason.basic != EXIT_REASON_TASK_SWITCH)) { + exit_reason.basic != EXIT_REASON_TASK_SWITCH && + exit_reason.basic != EXIT_REASON_NOTIFY)) { int ndata = 3; vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR; @@ -6073,7 +6492,7 @@ static noinstr void vmx_l1d_flush(struct kvm_vcpu *vcpu) int size = PAGE_SIZE << L1D_CACHE_ORDER; /* - * This code is only executed when the the flush mode is 'cond' or + * This code is only executed when the flush mode is 'cond' or * 'always' */ if (static_branch_likely(&vmx_l1d_flush_cond)) { @@ -6226,7 +6645,7 @@ static void vmx_set_apic_access_page_addr(struct kvm_vcpu *vcpu) put_page(page); } -static void vmx_hwapic_isr_update(struct kvm_vcpu *vcpu, int max_isr) +static void vmx_hwapic_isr_update(int max_isr) { u16 status; u8 old; @@ -6344,11 +6763,33 @@ void vmx_do_interrupt_nmi_irqoff(unsigned long entry); static void handle_interrupt_nmi_irqoff(struct kvm_vcpu *vcpu, unsigned long entry) { - kvm_before_interrupt(vcpu); + bool is_nmi = entry == (unsigned long)asm_exc_nmi_noist; + + kvm_before_interrupt(vcpu, is_nmi ? KVM_HANDLING_NMI : KVM_HANDLING_IRQ); vmx_do_interrupt_nmi_irqoff(entry); kvm_after_interrupt(vcpu); } +static void handle_nm_fault_irqoff(struct kvm_vcpu *vcpu) +{ + /* + * Save xfd_err to guest_fpu before interrupt is enabled, so the + * MSR value is not clobbered by the host activity before the guest + * has chance to consume it. + * + * Do not blindly read xfd_err here, since this exception might + * be caused by L1 interception on a platform which doesn't + * support xfd at all. + * + * Do it conditionally upon guest_fpu::xfd. xfd_err matters + * only when xfd contains a non-zero value. + * + * Queuing exception is done in vmx_handle_exit. See comment there. + */ + if (vcpu->arch.guest_fpu.fpstate->xfd) + rdmsrl(MSR_IA32_XFD_ERR, vcpu->arch.guest_fpu.xfd_err); +} + static void handle_exception_nmi_irqoff(struct vcpu_vmx *vmx) { const unsigned long nmi_entry = (unsigned long)asm_exc_nmi_noist; @@ -6357,6 +6798,9 @@ static void handle_exception_nmi_irqoff(struct vcpu_vmx *vmx) /* if exit due to PF check for async PF */ if (is_page_fault(intr_info)) vmx->vcpu.arch.apf.host_apf_flags = kvm_read_and_reset_apf_flags(); + /* if exit due to NM, handle before interrupts are enabled */ + else if (is_nm_fault(intr_info)) + handle_nm_fault_irqoff(&vmx->vcpu); /* Handle machine checks before interrupts are enabled */ else if (is_machine_check(intr_info)) kvm_machine_check(); @@ -6376,6 +6820,7 @@ static void handle_external_interrupt_irqoff(struct kvm_vcpu *vcpu) return; handle_interrupt_nmi_irqoff(vcpu, gate_offset(desc)); + vcpu->arch.at_instruction_boundary = true; } static void vmx_handle_exit_irqoff(struct kvm_vcpu *vcpu) @@ -6530,9 +6975,14 @@ static void atomic_switch_perf_msrs(struct vcpu_vmx *vmx) { int i, nr_msrs; struct perf_guest_switch_msr *msrs; + struct kvm_pmu *pmu = vcpu_to_pmu(&vmx->vcpu); + + pmu->host_cross_mapped_mask = 0; + if (pmu->pebs_enable & pmu->global_ctrl) + intel_pmu_cross_mapped_check(pmu); /* Note, nr_msrs may be garbage if perf_guest_get_msrs() returns NULL. */ - msrs = perf_guest_get_msrs(&nr_msrs); + msrs = perf_guest_get_msrs(&nr_msrs, (void *)pmu); if (!msrs) return; @@ -6578,6 +7028,31 @@ void noinstr vmx_update_host_rsp(struct vcpu_vmx *vmx, unsigned long host_rsp) } } +void noinstr vmx_spec_ctrl_restore_host(struct vcpu_vmx *vmx, + unsigned int flags) +{ + u64 hostval = this_cpu_read(x86_spec_ctrl_current); + + if (!cpu_feature_enabled(X86_FEATURE_MSR_SPEC_CTRL)) + return; + + if (flags & VMX_RUN_SAVE_SPEC_CTRL) + vmx->spec_ctrl = __rdmsr(MSR_IA32_SPEC_CTRL); + + /* + * If the guest/host SPEC_CTRL values differ, restore the host value. + * + * For legacy IBRS, the IBRS bit always needs to be written after + * transitioning from a less privileged predictor mode, regardless of + * whether the guest/host values differ. + */ + if (cpu_feature_enabled(X86_FEATURE_KERNEL_IBRS) || + vmx->spec_ctrl != hostval) + native_wrmsrl(MSR_IA32_SPEC_CTRL, hostval); + + barrier_nospec(); +} + static fastpath_t vmx_exit_handlers_fastpath(struct kvm_vcpu *vcpu) { switch (to_vmx(vcpu)->exit_reason.basic) { @@ -6591,25 +7066,33 @@ static fastpath_t vmx_exit_handlers_fastpath(struct kvm_vcpu *vcpu) } static noinstr void vmx_vcpu_enter_exit(struct kvm_vcpu *vcpu, - struct vcpu_vmx *vmx) + struct vcpu_vmx *vmx, + unsigned long flags) { - kvm_guest_enter_irqoff(); + guest_state_enter_irqoff(); /* L1D Flush includes CPU buffer clear to mitigate MDS */ if (static_branch_unlikely(&vmx_l1d_should_flush)) vmx_l1d_flush(vcpu); else if (static_branch_unlikely(&mds_user_clear)) mds_clear_cpu_buffers(); + else if (static_branch_unlikely(&mmio_stale_data_clear) && + kvm_arch_has_assigned_device(vcpu->kvm)) + mds_clear_cpu_buffers(); + + vmx_disable_fb_clear(vmx); if (vcpu->arch.cr2 != native_read_cr2()) native_write_cr2(vcpu->arch.cr2); vmx->fail = __vmx_vcpu_run(vmx, (unsigned long *)&vcpu->arch.regs, - vmx->loaded_vmcs->launched); + flags); vcpu->arch.cr2 = native_read_cr2(); - kvm_guest_exit_irqoff(); + vmx_enable_fb_clear(vmx); + + guest_state_exit_irqoff(); } static fastpath_t vmx_vcpu_run(struct kvm_vcpu *vcpu) @@ -6656,7 +7139,15 @@ static fastpath_t vmx_vcpu_run(struct kvm_vcpu *vcpu) vmcs_writel(GUEST_RSP, vcpu->arch.regs[VCPU_REGS_RSP]); if (kvm_register_is_dirty(vcpu, VCPU_REGS_RIP)) vmcs_writel(GUEST_RIP, vcpu->arch.regs[VCPU_REGS_RIP]); + vcpu->arch.regs_dirty = 0; + /* + * Refresh vmcs.HOST_CR3 if necessary. This must be done immediately + * prior to VM-Enter, as the kernel may load a new ASID (PCID) any time + * it switches back to the current->mm, which can occur in KVM context + * when switching to a temporary mm to patch kernel code, e.g. if KVM + * toggles a static key while handling a VM-Exit. + */ cr3 = __get_current_cr3_fast(); if (unlikely(cr3 != vmx->loaded_vmcs->host_state.cr3)) { vmcs_writel(HOST_CR3, cr3); @@ -6694,36 +7185,8 @@ static fastpath_t vmx_vcpu_run(struct kvm_vcpu *vcpu) kvm_wait_lapic_expire(vcpu); - /* - * If this vCPU has touched SPEC_CTRL, restore the guest's value if - * it's non-zero. Since vmentry is serialising on affected CPUs, there - * is no need to worry about the conditional branch over the wrmsr - * being speculatively taken. - */ - x86_spec_ctrl_set_guest(vmx->spec_ctrl, 0); - /* The actual VMENTER/EXIT is in the .noinstr.text section. */ - vmx_vcpu_enter_exit(vcpu, vmx); - - /* - * We do not use IBRS in the kernel. If this vCPU has used the - * SPEC_CTRL MSR it may have left it on; save the value and - * turn it off. This is much more efficient than blindly adding - * it to the atomic save/restore list. Especially as the former - * (Saving guest MSRs on vmexit) doesn't even exist in KVM. - * - * For non-nested case: - * If the L01 MSR bitmap does not intercept the MSR, then we need to - * save it. - * - * For nested case: - * If the L02 MSR bitmap does not intercept the MSR, then we need to - * save it. - */ - if (unlikely(!msr_write_intercepted(vmx, MSR_IA32_SPEC_CTRL))) - vmx->spec_ctrl = native_read_msr(MSR_IA32_SPEC_CTRL); - - x86_spec_ctrl_restore_host(vmx->spec_ctrl, 0); + vmx_vcpu_enter_exit(vcpu, vmx, __vmx_vcpu_run_flags(vmx)); /* All fields are clean at this point */ if (static_branch_unlikely(&enable_evmcs)) { @@ -6750,7 +7213,7 @@ static fastpath_t vmx_vcpu_run(struct kvm_vcpu *vcpu) loadsegment(es, __USER_DS); #endif - vmx_register_cache_reset(vcpu); + vcpu->arch.regs_avail &= ~VMX_REGS_LAZY_LOAD_SET; pt_guest_exit(vmx); @@ -6798,7 +7261,7 @@ static fastpath_t vmx_vcpu_run(struct kvm_vcpu *vcpu) return vmx_exit_handlers_fastpath(vcpu); } -static void vmx_free_vcpu(struct kvm_vcpu *vcpu) +static void vmx_vcpu_free(struct kvm_vcpu *vcpu) { struct vcpu_vmx *vmx = to_vmx(vcpu); @@ -6809,7 +7272,7 @@ static void vmx_free_vcpu(struct kvm_vcpu *vcpu) free_loaded_vmcs(vmx->loaded_vmcs); } -static int vmx_create_vcpu(struct kvm_vcpu *vcpu) +static int vmx_vcpu_create(struct kvm_vcpu *vcpu) { struct vmx_uret_msr *tsx_ctrl; struct vcpu_vmx *vmx; @@ -6818,6 +7281,8 @@ static int vmx_create_vcpu(struct kvm_vcpu *vcpu) BUILD_BUG_ON(offsetof(struct vcpu_vmx, vcpu) != 0); vmx = to_vmx(vcpu); + INIT_LIST_HEAD(&vmx->pi_wakeup_list); + err = -ENOMEM; vmx->vpid = allocate_vpid(); @@ -6898,6 +7363,10 @@ static int vmx_create_vcpu(struct kvm_vcpu *vcpu) goto free_vmcs; } + if (vmx_can_use_ipiv(vcpu)) + WRITE_ONCE(to_kvm_vmx(vcpu->kvm)->pid_table[vcpu->vcpu_id], + __pa(&vmx->pi_desc) | PID_TABLE_ENTRY_VALID); + return 0; free_vmcs: @@ -6957,7 +7426,7 @@ static int __init vmx_check_processor_compat(void) if (setup_vmcs_config(&vmcs_conf, &vmx_cap) < 0) return -EIO; if (nested) - nested_vmx_setup_ctls_msrs(&vmcs_conf.nested, vmx_cap.ept); + nested_vmx_setup_ctls_msrs(&vmcs_conf, vmx_cap.ept); if (memcmp(&vmcs_config, &vmcs_conf, sizeof(struct vmcs_config)) != 0) { printk(KERN_ERR "kvm: CPU %d feature inconsistency!\n", smp_processor_id()); @@ -6966,10 +7435,9 @@ static int __init vmx_check_processor_compat(void) return 0; } -static u64 vmx_get_mt_mask(struct kvm_vcpu *vcpu, gfn_t gfn, bool is_mmio) +static u8 vmx_get_mt_mask(struct kvm_vcpu *vcpu, gfn_t gfn, bool is_mmio) { u8 cache; - u64 ipat = 0; /* We wanted to honor guest CD/MTRR/PAT, but doing so could result in * memory aliases with conflicting memory types and sometimes MCEs. @@ -6989,30 +7457,22 @@ static u64 vmx_get_mt_mask(struct kvm_vcpu *vcpu, gfn_t gfn, bool is_mmio) * EPT memory type is used to emulate guest CD/MTRR. */ - if (is_mmio) { - cache = MTRR_TYPE_UNCACHABLE; - goto exit; - } + if (is_mmio) + return MTRR_TYPE_UNCACHABLE << VMX_EPT_MT_EPTE_SHIFT; - if (!kvm_arch_has_noncoherent_dma(vcpu->kvm)) { - ipat = VMX_EPT_IPAT_BIT; - cache = MTRR_TYPE_WRBACK; - goto exit; - } + if (!kvm_arch_has_noncoherent_dma(vcpu->kvm)) + return (MTRR_TYPE_WRBACK << VMX_EPT_MT_EPTE_SHIFT) | VMX_EPT_IPAT_BIT; if (kvm_read_cr0(vcpu) & X86_CR0_CD) { - ipat = VMX_EPT_IPAT_BIT; if (kvm_check_has_quirk(vcpu->kvm, KVM_X86_QUIRK_CD_NW_CLEARED)) cache = MTRR_TYPE_WRBACK; else cache = MTRR_TYPE_UNCACHABLE; - goto exit; - } - cache = kvm_mtrr_get_guest_memory_type(vcpu, gfn); + return (cache << VMX_EPT_MT_EPTE_SHIFT) | VMX_EPT_IPAT_BIT; + } -exit: - return (cache << VMX_EPT_MT_EPTE_SHIFT) | ipat; + return kvm_mtrr_get_guest_memory_type(vcpu, gfn) << VMX_EPT_MT_EPTE_SHIFT; } static void vmcs_set_secondary_exec_control(struct vcpu_vmx *vmx, u32 new_ctl) @@ -7051,7 +7511,7 @@ static void nested_vmx_cr_fixed1_bits_update(struct kvm_vcpu *vcpu) vmx->nested.msrs.cr4_fixed1 |= (_cr4_mask); \ } while (0) - entry = kvm_find_cpuid_entry(vcpu, 0x1, 0); + entry = kvm_find_cpuid_entry(vcpu, 0x1); cr4_fixed1_update(X86_CR4_VME, edx, feature_bit(VME)); cr4_fixed1_update(X86_CR4_PVI, edx, feature_bit(VME)); cr4_fixed1_update(X86_CR4_TSD, edx, feature_bit(TSC)); @@ -7067,7 +7527,7 @@ static void nested_vmx_cr_fixed1_bits_update(struct kvm_vcpu *vcpu) cr4_fixed1_update(X86_CR4_PCIDE, ecx, feature_bit(PCID)); cr4_fixed1_update(X86_CR4_OSXSAVE, ecx, feature_bit(XSAVE)); - entry = kvm_find_cpuid_entry(vcpu, 0x7, 0); + entry = kvm_find_cpuid_entry_index(vcpu, 0x7, 0); cr4_fixed1_update(X86_CR4_FSGSBASE, ebx, feature_bit(FSGSBASE)); cr4_fixed1_update(X86_CR4_SMEP, ebx, feature_bit(SMEP)); cr4_fixed1_update(X86_CR4_SMAP, ebx, feature_bit(SMAP)); @@ -7078,23 +7538,6 @@ static void nested_vmx_cr_fixed1_bits_update(struct kvm_vcpu *vcpu) #undef cr4_fixed1_update } -static void nested_vmx_entry_exit_ctls_update(struct kvm_vcpu *vcpu) -{ - struct vcpu_vmx *vmx = to_vmx(vcpu); - - if (kvm_mpx_supported()) { - bool mpx_enabled = guest_cpuid_has(vcpu, X86_FEATURE_MPX); - - if (mpx_enabled) { - vmx->nested.msrs.entry_ctls_high |= VM_ENTRY_LOAD_BNDCFGS; - vmx->nested.msrs.exit_ctls_high |= VM_EXIT_CLEAR_BNDCFGS; - } else { - vmx->nested.msrs.entry_ctls_high &= ~VM_ENTRY_LOAD_BNDCFGS; - vmx->nested.msrs.exit_ctls_high &= ~VM_EXIT_CLEAR_BNDCFGS; - } - } -} - static void update_intel_pt_cfg(struct kvm_vcpu *vcpu) { struct vcpu_vmx *vmx = to_vmx(vcpu); @@ -7102,7 +7545,7 @@ static void update_intel_pt_cfg(struct kvm_vcpu *vcpu) int i; for (i = 0; i < PT_CPUID_LEAVES; i++) { - best = kvm_find_cpuid_entry(vcpu, 0x14, i); + best = kvm_find_cpuid_entry_index(vcpu, 0x14, i); if (!best) return; vmx->pt_desc.caps[CPUID_EAX + i*PT_CPUID_REGS_NUM] = best->eax; @@ -7178,18 +7621,16 @@ static void vmx_vcpu_after_set_cpuid(struct kvm_vcpu *vcpu) vmx_secondary_exec_control(vmx)); if (nested_vmx_allowed(vcpu)) - to_vmx(vcpu)->msr_ia32_feature_control_valid_bits |= + vmx->msr_ia32_feature_control_valid_bits |= FEAT_CTL_VMX_ENABLED_INSIDE_SMX | FEAT_CTL_VMX_ENABLED_OUTSIDE_SMX; else - to_vmx(vcpu)->msr_ia32_feature_control_valid_bits &= + vmx->msr_ia32_feature_control_valid_bits &= ~(FEAT_CTL_VMX_ENABLED_INSIDE_SMX | FEAT_CTL_VMX_ENABLED_OUTSIDE_SMX); - if (nested_vmx_allowed(vcpu)) { + if (nested_vmx_allowed(vcpu)) nested_vmx_cr_fixed1_bits_update(vcpu); - nested_vmx_entry_exit_ctls_update(vcpu); - } if (boot_cpu_has(X86_FEATURE_INTEL_PT) && guest_cpuid_has(vcpu, X86_FEATURE_INTEL_PT)) @@ -7204,6 +7645,11 @@ static void vmx_vcpu_after_set_cpuid(struct kvm_vcpu *vcpu) } } + if (kvm_cpu_cap_has(X86_FEATURE_XFD)) + vmx_set_intercept_for_msr(vcpu, MSR_IA32_XFD_ERR, MSR_TYPE_R, + !guest_cpuid_has(vcpu, X86_FEATURE_XFD)); + + set_cr4_guest_host_mask(vmx); vmx_write_encls_bitmap(vcpu, NULL); @@ -7238,6 +7684,13 @@ static __init void vmx_set_cpu_caps(void) kvm_cpu_cap_clear(X86_FEATURE_INVPCID); if (vmx_pt_mode_is_host_guest()) kvm_cpu_cap_check_and_set(X86_FEATURE_INTEL_PT); + if (vmx_pebs_supported()) { + kvm_cpu_cap_check_and_set(X86_FEATURE_DS); + kvm_cpu_cap_check_and_set(X86_FEATURE_DTES64); + } + + if (!enable_pmu) + kvm_cpu_cap_clear(X86_FEATURE_PDCM); if (!enable_sgx) { kvm_cpu_cap_clear(X86_FEATURE_SGX); @@ -7250,7 +7703,7 @@ static __init void vmx_set_cpu_caps(void) kvm_cpu_cap_set(X86_FEATURE_UMIP); /* CPUID 0xD.1 */ - supported_xss = 0; + kvm_caps.supported_xss = 0; if (!cpu_has_vmx_xsaves()) kvm_cpu_cap_clear(X86_FEATURE_XSAVES); @@ -7391,9 +7844,9 @@ static int vmx_set_hv_timer(struct kvm_vcpu *vcpu, u64 guest_deadline_tsc, delta_tsc = 0; /* Convert to host delta tsc if tsc scaling is enabled */ - if (vcpu->arch.l1_tsc_scaling_ratio != kvm_default_tsc_scaling_ratio && + if (vcpu->arch.l1_tsc_scaling_ratio != kvm_caps.default_tsc_scaling_ratio && delta_tsc && u64_shl_div_u64(delta_tsc, - kvm_tsc_scaling_ratio_frac_bits, + kvm_caps.tsc_scaling_ratio_frac_bits, vcpu->arch.l1_tsc_scaling_ratio, &delta_tsc)) return -ERANGE; @@ -7443,25 +7896,6 @@ void vmx_update_cpu_dirty_logging(struct kvm_vcpu *vcpu) secondary_exec_controls_clearbit(vmx, SECONDARY_EXEC_ENABLE_PML); } -static int vmx_pre_block(struct kvm_vcpu *vcpu) -{ - if (pi_pre_block(vcpu)) - return 1; - - if (kvm_lapic_hv_timer_in_use(vcpu)) - kvm_lapic_switch_to_sw_timer(vcpu); - - return 0; -} - -static void vmx_post_block(struct kvm_vcpu *vcpu) -{ - if (kvm_x86_ops.set_hv_timer) - kvm_lapic_switch_to_hv_timer(vcpu); - - pi_post_block(vcpu); -} - static void vmx_setup_mce(struct kvm_vcpu *vcpu) { if (vcpu->arch.mcg_cap & MCG_LMCE_P) @@ -7484,6 +7918,13 @@ static int vmx_enter_smm(struct kvm_vcpu *vcpu, char *smstate) { struct vcpu_vmx *vmx = to_vmx(vcpu); + /* + * TODO: Implement custom flows for forcing the vCPU out/in of L2 on + * SMI and RSM. Using the common VM-Exit + VM-Enter routines is wrong + * SMI and RSM only modify state that is saved and restored via SMRAM. + * E.g. most MSRs are left untouched, but many are modified by VM-Exit + * and VM-Enter, and thus L2's values may be corrupted on SMI+RSM. + */ vmx->nested.smm.guest_mode = is_guest_mode(vcpu); if (vmx->nested.smm.guest_mode) nested_vmx_vmexit(vcpu, -1, 0, 0); @@ -7509,6 +7950,7 @@ static int vmx_leave_smm(struct kvm_vcpu *vcpu, const char *smstate) if (ret) return ret; + vmx->nested.nested_run_pending = 1; vmx->nested.smm.guest_mode = false; } return 0; @@ -7534,7 +7976,7 @@ static void vmx_migrate_timers(struct kvm_vcpu *vcpu) } } -static void hardware_unsetup(void) +static void vmx_hardware_unsetup(void) { kvm_set_posted_intr_wakeup_handler(NULL); @@ -7544,34 +7986,44 @@ static void hardware_unsetup(void) free_kvm_area(); } -static bool vmx_check_apicv_inhibit_reasons(ulong bit) +static bool vmx_check_apicv_inhibit_reasons(enum kvm_apicv_inhibit reason) { ulong supported = BIT(APICV_INHIBIT_REASON_DISABLE) | BIT(APICV_INHIBIT_REASON_ABSENT) | BIT(APICV_INHIBIT_REASON_HYPERV) | - BIT(APICV_INHIBIT_REASON_BLOCKIRQ); + BIT(APICV_INHIBIT_REASON_BLOCKIRQ) | + BIT(APICV_INHIBIT_REASON_APIC_ID_MODIFIED) | + BIT(APICV_INHIBIT_REASON_APIC_BASE_MODIFIED); - return supported & BIT(bit); + return supported & BIT(reason); +} + +static void vmx_vm_destroy(struct kvm *kvm) +{ + struct kvm_vmx *kvm_vmx = to_kvm_vmx(kvm); + + free_pages((unsigned long)kvm_vmx->pid_table, vmx_get_pid_table_order(kvm)); } static struct kvm_x86_ops vmx_x86_ops __initdata = { .name = "kvm_intel", - .hardware_unsetup = hardware_unsetup, + .hardware_unsetup = vmx_hardware_unsetup, - .hardware_enable = hardware_enable, - .hardware_disable = hardware_disable, - .cpu_has_accelerated_tpr = report_flexpriority, + .hardware_enable = vmx_hardware_enable, + .hardware_disable = vmx_hardware_disable, .has_emulated_msr = vmx_has_emulated_msr, .vm_size = sizeof(struct kvm_vmx), .vm_init = vmx_vm_init, + .vm_destroy = vmx_vm_destroy, - .vcpu_create = vmx_create_vcpu, - .vcpu_free = vmx_free_vcpu, + .vcpu_precreate = vmx_vcpu_precreate, + .vcpu_create = vmx_vcpu_create, + .vcpu_free = vmx_vcpu_free, .vcpu_reset = vmx_vcpu_reset, - .prepare_guest_switch = vmx_prepare_switch_to_guest, + .prepare_switch_to_guest = vmx_prepare_switch_to_guest, .vcpu_load = vmx_vcpu_load, .vcpu_put = vmx_vcpu_put, @@ -7599,21 +8051,22 @@ static struct kvm_x86_ops vmx_x86_ops __initdata = { .set_rflags = vmx_set_rflags, .get_if_flag = vmx_get_if_flag, - .tlb_flush_all = vmx_flush_tlb_all, - .tlb_flush_current = vmx_flush_tlb_current, - .tlb_flush_gva = vmx_flush_tlb_gva, - .tlb_flush_guest = vmx_flush_tlb_guest, + .flush_tlb_all = vmx_flush_tlb_all, + .flush_tlb_current = vmx_flush_tlb_current, + .flush_tlb_gva = vmx_flush_tlb_gva, + .flush_tlb_guest = vmx_flush_tlb_guest, - .run = vmx_vcpu_run, + .vcpu_pre_run = vmx_vcpu_pre_run, + .vcpu_run = vmx_vcpu_run, .handle_exit = vmx_handle_exit, .skip_emulated_instruction = vmx_skip_emulated_instruction, .update_emulated_instruction = vmx_update_emulated_instruction, .set_interrupt_shadow = vmx_set_interrupt_shadow, .get_interrupt_shadow = vmx_get_interrupt_shadow, .patch_hypercall = vmx_patch_hypercall, - .set_irq = vmx_inject_irq, - .set_nmi = vmx_inject_nmi, - .queue_exception = vmx_queue_exception, + .inject_irq = vmx_inject_irq, + .inject_nmi = vmx_inject_nmi, + .inject_exception = vmx_inject_exception, .cancel_injection = vmx_cancel_injection, .interrupt_allowed = vmx_interrupt_allowed, .nmi_allowed = vmx_nmi_allowed, @@ -7632,7 +8085,7 @@ static struct kvm_x86_ops vmx_x86_ops __initdata = { .hwapic_isr_update = vmx_hwapic_isr_update, .guest_apic_has_interrupt = vmx_guest_apic_has_interrupt, .sync_pir_to_irr = vmx_sync_pir_to_irr, - .deliver_posted_interrupt = vmx_deliver_posted_interrupt, + .deliver_interrupt = vmx_deliver_interrupt, .dy_apicv_has_pending_interrupt = pi_has_pending_interrupt, .set_tss_addr = vmx_set_tss_addr, @@ -7662,14 +8115,10 @@ static struct kvm_x86_ops vmx_x86_ops __initdata = { .cpu_dirty_log_size = PML_ENTITY_NUM, .update_cpu_dirty_logging = vmx_update_cpu_dirty_logging, - .pre_block = vmx_pre_block, - .post_block = vmx_post_block, - - .pmu_ops = &intel_pmu_ops, .nested_ops = &vmx_nested_ops, - .update_pi_irte = pi_update_irte, - .start_assignment = vmx_pi_start_assignment, + .pi_update_irte = vmx_pi_update_irte, + .pi_start_assignment = vmx_pi_start_assignment, #ifdef CONFIG_X86_64 .set_hv_timer = vmx_set_hv_timer, @@ -7693,6 +8142,20 @@ static struct kvm_x86_ops vmx_x86_ops __initdata = { .vcpu_deliver_sipi_vector = kvm_vcpu_deliver_sipi_vector, }; +static unsigned int vmx_handle_intel_pt_intr(void) +{ + struct kvm_vcpu *vcpu = kvm_get_running_vcpu(); + + /* '0' on failure so that the !PT case can use a RET0 static call. */ + if (!vcpu || !kvm_handling_nmi_from_guest(vcpu)) + return 0; + + kvm_make_request(KVM_REQ_PMI, vcpu); + __set_bit(MSR_CORE_PERF_GLOBAL_OVF_CTRL_TRACE_TOPA_PMI_BIT, + (unsigned long *)&vcpu->arch.pmu.global_status); + return 1; +} + static __init void vmx_setup_user_return_msrs(void) { @@ -7719,11 +8182,38 @@ static __init void vmx_setup_user_return_msrs(void) kvm_add_user_return_msr(vmx_uret_msrs_list[i]); } +static void __init vmx_setup_me_spte_mask(void) +{ + u64 me_mask = 0; + + /* + * kvm_get_shadow_phys_bits() returns shadow_phys_bits. Use + * the former to avoid exposing shadow_phys_bits. + * + * On pre-MKTME system, boot_cpu_data.x86_phys_bits equals to + * shadow_phys_bits. On MKTME and/or TDX capable systems, + * boot_cpu_data.x86_phys_bits holds the actual physical address + * w/o the KeyID bits, and shadow_phys_bits equals to MAXPHYADDR + * reported by CPUID. Those bits between are KeyID bits. + */ + if (boot_cpu_data.x86_phys_bits != kvm_get_shadow_phys_bits()) + me_mask = rsvd_bits(boot_cpu_data.x86_phys_bits, + kvm_get_shadow_phys_bits() - 1); + /* + * Unlike SME, host kernel doesn't support setting up any + * MKTME KeyID on Intel platforms. No memory encryption + * bits should be included into the SPTE. + */ + kvm_mmu_set_me_spte_mask(0, me_mask); +} + +static struct kvm_x86_init_ops vmx_init_ops __initdata; + static __init int hardware_setup(void) { unsigned long host_bndcfgs; struct desc_ptr dt; - int r, ept_lpage_level; + int r; store_idt(&dt); host_idt_base = dt.address; @@ -7733,6 +8223,10 @@ static __init int hardware_setup(void) if (setup_vmcs_config(&vmcs_config, &vmx_capability) < 0) return -EIO; + if (cpu_has_perf_global_ctrl_bug()) + pr_warn_once("kvm: VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL " + "does not work properly. Using workaround\n"); + if (boot_cpu_has(X86_FEATURE_NX)) kvm_enable_efer_bits(EFER_NX); @@ -7742,8 +8236,8 @@ static __init int hardware_setup(void) } if (!cpu_has_vmx_mpx()) - supported_xcr0 &= ~(XFEATURE_MASK_BNDREGS | - XFEATURE_MASK_BNDCSR); + kvm_caps.supported_xcr0 &= ~(XFEATURE_MASK_BNDREGS | + XFEATURE_MASK_BNDCSR); if (!cpu_has_vmx_vpid() || !cpu_has_vmx_invvpid() || !(cpu_has_vmx_invvpid_single() || cpu_has_vmx_invvpid_global())) @@ -7773,6 +8267,11 @@ static __init int hardware_setup(void) if (!cpu_has_virtual_nmis()) enable_vnmi = 0; +#ifdef CONFIG_X86_SGX_KVM + if (!cpu_has_vmx_encls_vmexit()) + enable_sgx = false; +#endif + /* * set_apic_access_page_addr() is used to reload apic access * page upon invalidation. No need to do anything if not @@ -7806,13 +8305,16 @@ static __init int hardware_setup(void) if (!enable_apicv) vmx_x86_ops.sync_pir_to_irr = NULL; - if (cpu_has_vmx_tsc_scaling()) { - kvm_has_tsc_control = true; - kvm_max_tsc_scaling_ratio = KVM_VMX_TSC_MULTIPLIER_MAX; - kvm_tsc_scaling_ratio_frac_bits = 48; - } + if (!enable_apicv || !cpu_has_vmx_ipiv()) + enable_ipiv = false; + + if (cpu_has_vmx_tsc_scaling()) + kvm_caps.has_tsc_control = true; - kvm_has_bus_lock_exit = cpu_has_vmx_bus_lock_detection(); + kvm_caps.max_tsc_scaling_ratio = KVM_VMX_TSC_MULTIPLIER_MAX; + kvm_caps.tsc_scaling_ratio_frac_bits = 48; + kvm_caps.has_bus_lock_exit = cpu_has_vmx_bus_lock_detection(); + kvm_caps.has_notify_vmexit = cpu_has_notify_vmexit(); set_bit(0, vmx_vpid_bitmap); /* 0 is reserved for host */ @@ -7820,16 +8322,14 @@ static __init int hardware_setup(void) kvm_mmu_set_ept_masks(enable_ept_ad_bits, cpu_has_vmx_ept_execute_only()); - if (!enable_ept) - ept_lpage_level = 0; - else if (cpu_has_vmx_ept_1g_page()) - ept_lpage_level = PG_LEVEL_1G; - else if (cpu_has_vmx_ept_2m_page()) - ept_lpage_level = PG_LEVEL_2M; - else - ept_lpage_level = PG_LEVEL_4K; + /* + * Setup shadow_me_value/shadow_me_mask to include MKTME KeyID + * bits to shadow_zero_check. + */ + vmx_setup_me_spte_mask(); + kvm_configure_mmu(enable_ept, 0, vmx_get_max_tdp_level(), - ept_lpage_level); + ept_caps_to_lpage_level(vmx_capability.ept)); /* * Only enable PML when hardware supports PML feature, and both EPT @@ -7846,11 +8346,9 @@ static __init int hardware_setup(void) if (enable_preemption_timer) { u64 use_timer_freq = 5000ULL * 1000 * 1000; - u64 vmx_msr; - rdmsrl(MSR_IA32_VMX_MISC, vmx_msr); cpu_preemption_timer_multi = - vmx_msr & VMX_MISC_PREEMPTION_TIMER_RATE_MASK; + vmcs_config.misc & VMX_MISC_PREEMPTION_TIMER_RATE_MASK; if (tsc_khz) use_timer_freq = (u64)tsc_khz * 1000; @@ -7871,18 +8369,22 @@ static __init int hardware_setup(void) vmx_x86_ops.request_immediate_exit = __kvm_request_immediate_exit; } - kvm_mce_cap_supported |= MCG_LMCE_P; + kvm_caps.supported_mce_cap |= MCG_LMCE_P; + kvm_caps.supported_mce_cap |= MCG_CMCI_P; if (pt_mode != PT_MODE_SYSTEM && pt_mode != PT_MODE_HOST_GUEST) return -EINVAL; - if (!enable_ept || !cpu_has_vmx_intel_pt()) + if (!enable_ept || !enable_pmu || !cpu_has_vmx_intel_pt()) pt_mode = PT_MODE_SYSTEM; + if (pt_mode == PT_MODE_HOST_GUEST) + vmx_init_ops.handle_intel_pt_intr = vmx_handle_intel_pt_intr; + else + vmx_init_ops.handle_intel_pt_intr = NULL; setup_default_sgx_lepubkeyhash(); if (nested) { - nested_vmx_setup_ctls_msrs(&vmcs_config.nested, - vmx_capability.ept); + nested_vmx_setup_ctls_msrs(&vmcs_config, vmx_capability.ept); r = nested_vmx_hardware_setup(kvm_vmx_exit_handlers); if (r) @@ -7892,7 +8394,7 @@ static __init int hardware_setup(void) vmx_set_cpu_caps(); r = alloc_kvm_area(); - if (r) + if (r && nested) nested_vmx_hardware_unsetup(); kvm_set_posted_intr_wakeup_handler(pi_wakeup_handler); @@ -7905,8 +8407,10 @@ static struct kvm_x86_init_ops vmx_init_ops __initdata = { .disabled_by_bios = vmx_disabled_by_bios, .check_processor_compatibility = vmx_check_processor_compat, .hardware_setup = hardware_setup, + .handle_intel_pt_intr = NULL, .runtime_ops = &vmx_x86_ops, + .pmu_ops = &intel_pmu_ops, }; static void vmx_cleanup_l1d_flush(void) @@ -7971,7 +8475,6 @@ static int __init vmx_init(void) ms_hyperv.hints & HV_X64_ENLIGHTENED_VMCS_RECOMMENDED && (ms_hyperv.nested_features & HV_X64_ENLIGHTENED_VMCS_VERSION) >= KVM_EVMCS_VERSION) { - int cpu; /* Check that we have assist pages on all online CPUs */ for_each_online_cpu(cpu) { @@ -8013,6 +8516,8 @@ static int __init vmx_init(void) return r; } + vmx_setup_fb_clear_ctrl(); + for_each_possible_cpu(cpu) { INIT_LIST_HEAD(&per_cpu(loaded_vmcss_on_cpu, cpu)); |