aboutsummaryrefslogtreecommitdiffstats
path: root/arch/x86/kvm/vmx.c
diff options
context:
space:
mode:
Diffstat (limited to 'arch/x86/kvm/vmx.c')
-rw-r--r--arch/x86/kvm/vmx.c300
1 files changed, 200 insertions, 100 deletions
diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index 1d26f3c4985b..e665aa7167cf 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -121,7 +121,6 @@ module_param_named(pml, enable_pml, bool, S_IRUGO);
#define MSR_BITMAP_MODE_X2APIC 1
#define MSR_BITMAP_MODE_X2APIC_APICV 2
-#define MSR_BITMAP_MODE_LM 4
#define KVM_VMX_TSC_MULTIPLIER_MAX 0xffffffffffffffffULL
@@ -397,6 +396,7 @@ struct loaded_vmcs {
int cpu;
bool launched;
bool nmi_known_unmasked;
+ bool hv_timer_armed;
/* Support for vnmi-less CPUs */
int soft_vnmi_blocked;
ktime_t entry_time;
@@ -856,6 +856,7 @@ struct nested_vmx {
/* to migrate it to L2 if VM_ENTRY_LOAD_DEBUG_CONTROLS is off */
u64 vmcs01_debugctl;
+ u64 vmcs01_guest_bndcfgs;
u16 vpid02;
u16 last_vpid;
@@ -1019,6 +1020,8 @@ struct vcpu_vmx {
int ple_window;
bool ple_window_dirty;
+ bool req_immediate_exit;
+
/* Support for PML */
#define PML_ENTITY_NUM 512
struct page *pml_pg;
@@ -1569,8 +1572,12 @@ static int vmx_hv_remote_flush_tlb(struct kvm *kvm)
goto out;
}
+ /*
+ * FLUSH_GUEST_PHYSICAL_ADDRESS_SPACE hypercall needs the address of the
+ * base of EPT PML4 table, strip off EPT configuration information.
+ */
ret = hyperv_flush_guest_mapping(
- to_vmx(kvm_get_vcpu(kvm, 0))->ept_pointer);
+ to_vmx(kvm_get_vcpu(kvm, 0))->ept_pointer & PAGE_MASK);
out:
spin_unlock(&to_kvm_vmx(kvm)->ept_pointer_lock);
@@ -2864,6 +2871,8 @@ static void vmx_prepare_switch_to_guest(struct kvm_vcpu *vcpu)
u16 fs_sel, gs_sel;
int i;
+ vmx->req_immediate_exit = false;
+
if (vmx->loaded_cpu_state)
return;
@@ -2894,8 +2903,7 @@ static void vmx_prepare_switch_to_guest(struct kvm_vcpu *vcpu)
vmx->msr_host_kernel_gs_base = read_msr(MSR_KERNEL_GS_BASE);
}
- if (is_long_mode(&vmx->vcpu))
- wrmsrl(MSR_KERNEL_GS_BASE, vmx->msr_guest_kernel_gs_base);
+ wrmsrl(MSR_KERNEL_GS_BASE, vmx->msr_guest_kernel_gs_base);
#else
savesegment(fs, fs_sel);
savesegment(gs, gs_sel);
@@ -2946,8 +2954,7 @@ static void vmx_prepare_switch_to_host(struct vcpu_vmx *vmx)
vmx->loaded_cpu_state = NULL;
#ifdef CONFIG_X86_64
- if (is_long_mode(&vmx->vcpu))
- rdmsrl(MSR_KERNEL_GS_BASE, vmx->msr_guest_kernel_gs_base);
+ rdmsrl(MSR_KERNEL_GS_BASE, vmx->msr_guest_kernel_gs_base);
#endif
if (host_state->ldt_sel || (host_state->gs_sel & 7)) {
kvm_load_ldt(host_state->ldt_sel);
@@ -2975,24 +2982,19 @@ static void vmx_prepare_switch_to_host(struct vcpu_vmx *vmx)
#ifdef CONFIG_X86_64
static u64 vmx_read_guest_kernel_gs_base(struct vcpu_vmx *vmx)
{
- if (is_long_mode(&vmx->vcpu)) {
- preempt_disable();
- if (vmx->loaded_cpu_state)
- rdmsrl(MSR_KERNEL_GS_BASE,
- vmx->msr_guest_kernel_gs_base);
- preempt_enable();
- }
+ preempt_disable();
+ if (vmx->loaded_cpu_state)
+ rdmsrl(MSR_KERNEL_GS_BASE, vmx->msr_guest_kernel_gs_base);
+ preempt_enable();
return vmx->msr_guest_kernel_gs_base;
}
static void vmx_write_guest_kernel_gs_base(struct vcpu_vmx *vmx, u64 data)
{
- if (is_long_mode(&vmx->vcpu)) {
- preempt_disable();
- if (vmx->loaded_cpu_state)
- wrmsrl(MSR_KERNEL_GS_BASE, data);
- preempt_enable();
- }
+ preempt_disable();
+ if (vmx->loaded_cpu_state)
+ wrmsrl(MSR_KERNEL_GS_BASE, data);
+ preempt_enable();
vmx->msr_guest_kernel_gs_base = data;
}
#endif
@@ -3528,9 +3530,6 @@ static void nested_vmx_setup_ctls_msrs(struct nested_vmx_msrs *msrs, bool apicv)
VM_EXIT_LOAD_IA32_EFER | VM_EXIT_SAVE_IA32_EFER |
VM_EXIT_SAVE_VMX_PREEMPTION_TIMER | VM_EXIT_ACK_INTR_ON_EXIT;
- if (kvm_mpx_supported())
- msrs->exit_ctls_high |= VM_EXIT_CLEAR_BNDCFGS;
-
/* We support free control of debug control saving. */
msrs->exit_ctls_low &= ~VM_EXIT_SAVE_DEBUG_CONTROLS;
@@ -3547,8 +3546,6 @@ static void nested_vmx_setup_ctls_msrs(struct nested_vmx_msrs *msrs, bool apicv)
VM_ENTRY_LOAD_IA32_PAT;
msrs->entry_ctls_high |=
(VM_ENTRY_ALWAYSON_WITHOUT_TRUE_MSR | VM_ENTRY_LOAD_IA32_EFER);
- if (kvm_mpx_supported())
- msrs->entry_ctls_high |= VM_ENTRY_LOAD_BNDCFGS;
/* We support free control of debug control loading. */
msrs->entry_ctls_low &= ~VM_ENTRY_LOAD_DEBUG_CONTROLS;
@@ -3596,12 +3593,12 @@ static void nested_vmx_setup_ctls_msrs(struct nested_vmx_msrs *msrs, bool apicv)
msrs->secondary_ctls_high);
msrs->secondary_ctls_low = 0;
msrs->secondary_ctls_high &=
- SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES |
SECONDARY_EXEC_DESC |
SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE |
SECONDARY_EXEC_APIC_REGISTER_VIRT |
SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY |
SECONDARY_EXEC_WBINVD_EXITING;
+
/*
* We can emulate "VMCS shadowing," even if the hardware
* doesn't support it.
@@ -3658,6 +3655,10 @@ static void nested_vmx_setup_ctls_msrs(struct nested_vmx_msrs *msrs, bool apicv)
msrs->secondary_ctls_high |=
SECONDARY_EXEC_UNRESTRICTED_GUEST;
+ if (flexpriority_enabled)
+ msrs->secondary_ctls_high |=
+ SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES;
+
/* miscellaneous data */
rdmsr(MSR_IA32_VMX_MISC,
msrs->misc_low,
@@ -5068,19 +5069,6 @@ static void vmx_set_efer(struct kvm_vcpu *vcpu, u64 efer)
if (!msr)
return;
- /*
- * MSR_KERNEL_GS_BASE is not intercepted when the guest is in
- * 64-bit mode as a 64-bit kernel may frequently access the
- * MSR. This means we need to manually save/restore the MSR
- * when switching between guest and host state, but only if
- * the guest is in 64-bit mode. Sync our cached value if the
- * guest is transitioning to 32-bit mode and the CPU contains
- * guest state, i.e. the cache is stale.
- */
-#ifdef CONFIG_X86_64
- if (!(efer & EFER_LMA))
- (void)vmx_read_guest_kernel_gs_base(vmx);
-#endif
vcpu->arch.efer = efer;
if (efer & EFER_LMA) {
vm_entry_controls_setbit(to_vmx(vcpu), VM_ENTRY_IA32E_MODE);
@@ -5393,9 +5381,10 @@ static int vmx_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4)
* To use VMXON (and later other VMX instructions), a guest
* must first be able to turn on cr4.VMXE (see handle_vmon()).
* So basically the check on whether to allow nested VMX
- * is here.
+ * is here. We operate under the default treatment of SMM,
+ * so VMX cannot be enabled under SMM.
*/
- if (!nested_vmx_allowed(vcpu))
+ if (!nested_vmx_allowed(vcpu) || is_smm(vcpu))
return 1;
}
@@ -6072,9 +6061,6 @@ static u8 vmx_msr_bitmap_mode(struct kvm_vcpu *vcpu)
mode |= MSR_BITMAP_MODE_X2APIC_APICV;
}
- if (is_long_mode(vcpu))
- mode |= MSR_BITMAP_MODE_LM;
-
return mode;
}
@@ -6115,9 +6101,6 @@ static void vmx_update_msr_bitmap(struct kvm_vcpu *vcpu)
if (!changed)
return;
- vmx_set_intercept_for_msr(msr_bitmap, MSR_KERNEL_GS_BASE, MSR_TYPE_RW,
- !(mode & MSR_BITMAP_MODE_LM));
-
if (changed & (MSR_BITMAP_MODE_X2APIC | MSR_BITMAP_MODE_X2APIC_APICV))
vmx_update_msr_bitmap_x2apic(msr_bitmap, mode);
@@ -6183,6 +6166,32 @@ static void vmx_complete_nested_posted_interrupt(struct kvm_vcpu *vcpu)
nested_mark_vmcs12_pages_dirty(vcpu);
}
+static u8 vmx_get_rvi(void)
+{
+ return vmcs_read16(GUEST_INTR_STATUS) & 0xff;
+}
+
+static bool vmx_guest_apic_has_interrupt(struct kvm_vcpu *vcpu)
+{
+ struct vcpu_vmx *vmx = to_vmx(vcpu);
+ void *vapic_page;
+ u32 vppr;
+ int rvi;
+
+ if (WARN_ON_ONCE(!is_guest_mode(vcpu)) ||
+ !nested_cpu_has_vid(get_vmcs12(vcpu)) ||
+ WARN_ON_ONCE(!vmx->nested.virtual_apic_page))
+ return false;
+
+ rvi = vmx_get_rvi();
+
+ vapic_page = kmap(vmx->nested.virtual_apic_page);
+ vppr = *((u32 *)(vapic_page + APIC_PROCPRI));
+ kunmap(vmx->nested.virtual_apic_page);
+
+ return ((rvi & 0xf0) > (vppr & 0xf0));
+}
+
static inline bool kvm_vcpu_trigger_posted_interrupt(struct kvm_vcpu *vcpu,
bool nested)
{
@@ -6983,7 +6992,7 @@ static int handle_rmode_exception(struct kvm_vcpu *vcpu,
* Cause the #SS fault with 0 error code in VM86 mode.
*/
if (((vec == GP_VECTOR) || (vec == SS_VECTOR)) && err_code == 0) {
- if (emulate_instruction(vcpu, 0) == EMULATE_DONE) {
+ if (kvm_emulate_instruction(vcpu, 0) == EMULATE_DONE) {
if (vcpu->arch.halt_request) {
vcpu->arch.halt_request = 0;
return kvm_vcpu_halt(vcpu);
@@ -7054,7 +7063,7 @@ static int handle_exception(struct kvm_vcpu *vcpu)
if (!vmx->rmode.vm86_active && is_gp_fault(intr_info)) {
WARN_ON_ONCE(!enable_vmware_backdoor);
- er = emulate_instruction(vcpu,
+ er = kvm_emulate_instruction(vcpu,
EMULTYPE_VMWARE | EMULTYPE_NO_UD_ON_FAIL);
if (er == EMULATE_USER_EXIT)
return 0;
@@ -7157,7 +7166,7 @@ static int handle_io(struct kvm_vcpu *vcpu)
++vcpu->stat.io_exits;
if (string)
- return emulate_instruction(vcpu, 0) == EMULATE_DONE;
+ return kvm_emulate_instruction(vcpu, 0) == EMULATE_DONE;
port = exit_qualification >> 16;
size = (exit_qualification & 7) + 1;
@@ -7231,7 +7240,7 @@ static int handle_set_cr4(struct kvm_vcpu *vcpu, unsigned long val)
static int handle_desc(struct kvm_vcpu *vcpu)
{
WARN_ON(!(vcpu->arch.cr4 & X86_CR4_UMIP));
- return emulate_instruction(vcpu, 0) == EMULATE_DONE;
+ return kvm_emulate_instruction(vcpu, 0) == EMULATE_DONE;
}
static int handle_cr(struct kvm_vcpu *vcpu)
@@ -7480,7 +7489,7 @@ static int handle_vmcall(struct kvm_vcpu *vcpu)
static int handle_invd(struct kvm_vcpu *vcpu)
{
- return emulate_instruction(vcpu, 0) == EMULATE_DONE;
+ return kvm_emulate_instruction(vcpu, 0) == EMULATE_DONE;
}
static int handle_invlpg(struct kvm_vcpu *vcpu)
@@ -7547,7 +7556,7 @@ static int handle_apic_access(struct kvm_vcpu *vcpu)
return kvm_skip_emulated_instruction(vcpu);
}
}
- return emulate_instruction(vcpu, 0) == EMULATE_DONE;
+ return kvm_emulate_instruction(vcpu, 0) == EMULATE_DONE;
}
static int handle_apic_eoi_induced(struct kvm_vcpu *vcpu)
@@ -7704,8 +7713,8 @@ static int handle_ept_misconfig(struct kvm_vcpu *vcpu)
if (!static_cpu_has(X86_FEATURE_HYPERVISOR))
return kvm_skip_emulated_instruction(vcpu);
else
- return x86_emulate_instruction(vcpu, gpa, EMULTYPE_SKIP,
- NULL, 0) == EMULATE_DONE;
+ return kvm_emulate_instruction(vcpu, EMULTYPE_SKIP) ==
+ EMULATE_DONE;
}
return kvm_mmu_page_fault(vcpu, gpa, PFERR_RSVD_MASK, NULL, 0);
@@ -7748,7 +7757,7 @@ static int handle_invalid_guest_state(struct kvm_vcpu *vcpu)
if (kvm_test_request(KVM_REQ_EVENT, vcpu))
return 1;
- err = emulate_instruction(vcpu, 0);
+ err = kvm_emulate_instruction(vcpu, 0);
if (err == EMULATE_USER_EXIT) {
++vcpu->stat.mmio_exits;
@@ -7966,6 +7975,9 @@ static __init int hardware_setup(void)
kvm_x86_ops->enable_log_dirty_pt_masked = NULL;
}
+ if (!cpu_has_vmx_preemption_timer())
+ kvm_x86_ops->request_immediate_exit = __kvm_request_immediate_exit;
+
if (cpu_has_vmx_preemption_timer() && enable_preemption_timer) {
u64 vmx_msr;
@@ -9208,7 +9220,8 @@ static int handle_pml_full(struct kvm_vcpu *vcpu)
static int handle_preemption_timer(struct kvm_vcpu *vcpu)
{
- kvm_lapic_expired_hv_timer(vcpu);
+ if (!to_vmx(vcpu)->req_immediate_exit)
+ kvm_lapic_expired_hv_timer(vcpu);
return 1;
}
@@ -10214,15 +10227,16 @@ static void vmx_set_virtual_apic_mode(struct kvm_vcpu *vcpu)
if (!lapic_in_kernel(vcpu))
return;
+ if (!flexpriority_enabled &&
+ !cpu_has_vmx_virtualize_x2apic_mode())
+ return;
+
/* Postpone execution until vmcs01 is the current VMCS. */
if (is_guest_mode(vcpu)) {
to_vmx(vcpu)->nested.change_vmcs01_virtual_apic_mode = true;
return;
}
- if (!cpu_need_tpr_shadow(vcpu))
- return;
-
sec_exec_control = vmcs_read32(SECONDARY_VM_EXEC_CONTROL);
sec_exec_control &= ~(SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES |
SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE);
@@ -10344,6 +10358,14 @@ static int vmx_sync_pir_to_irr(struct kvm_vcpu *vcpu)
return max_irr;
}
+static u8 vmx_has_apicv_interrupt(struct kvm_vcpu *vcpu)
+{
+ u8 rvi = vmx_get_rvi();
+ u8 vppr = kvm_lapic_get_reg(vcpu->arch.apic, APIC_PROCPRI);
+
+ return ((rvi & 0xf0) > (vppr & 0xf0));
+}
+
static void vmx_load_eoi_exitmap(struct kvm_vcpu *vcpu, u64 *eoi_exit_bitmap)
{
if (!kvm_vcpu_apicv_active(vcpu))
@@ -10595,24 +10617,43 @@ static void atomic_switch_perf_msrs(struct vcpu_vmx *vmx)
msrs[i].host, false);
}
-static void vmx_arm_hv_timer(struct kvm_vcpu *vcpu)
+static void vmx_arm_hv_timer(struct vcpu_vmx *vmx, u32 val)
+{
+ vmcs_write32(VMX_PREEMPTION_TIMER_VALUE, val);
+ if (!vmx->loaded_vmcs->hv_timer_armed)
+ vmcs_set_bits(PIN_BASED_VM_EXEC_CONTROL,
+ PIN_BASED_VMX_PREEMPTION_TIMER);
+ vmx->loaded_vmcs->hv_timer_armed = true;
+}
+
+static void vmx_update_hv_timer(struct kvm_vcpu *vcpu)
{
struct vcpu_vmx *vmx = to_vmx(vcpu);
u64 tscl;
u32 delta_tsc;
- if (vmx->hv_deadline_tsc == -1)
+ if (vmx->req_immediate_exit) {
+ vmx_arm_hv_timer(vmx, 0);
return;
+ }
- tscl = rdtsc();
- if (vmx->hv_deadline_tsc > tscl)
- /* sure to be 32 bit only because checked on set_hv_timer */
- delta_tsc = (u32)((vmx->hv_deadline_tsc - tscl) >>
- cpu_preemption_timer_multi);
- else
- delta_tsc = 0;
+ if (vmx->hv_deadline_tsc != -1) {
+ tscl = rdtsc();
+ if (vmx->hv_deadline_tsc > tscl)
+ /* set_hv_timer ensures the delta fits in 32-bits */
+ delta_tsc = (u32)((vmx->hv_deadline_tsc - tscl) >>
+ cpu_preemption_timer_multi);
+ else
+ delta_tsc = 0;
- vmcs_write32(VMX_PREEMPTION_TIMER_VALUE, delta_tsc);
+ vmx_arm_hv_timer(vmx, delta_tsc);
+ return;
+ }
+
+ if (vmx->loaded_vmcs->hv_timer_armed)
+ vmcs_clear_bits(PIN_BASED_VM_EXEC_CONTROL,
+ PIN_BASED_VMX_PREEMPTION_TIMER);
+ vmx->loaded_vmcs->hv_timer_armed = false;
}
static void __noclone vmx_vcpu_run(struct kvm_vcpu *vcpu)
@@ -10672,7 +10713,7 @@ static void __noclone vmx_vcpu_run(struct kvm_vcpu *vcpu)
atomic_switch_perf_msrs(vmx);
- vmx_arm_hv_timer(vcpu);
+ vmx_update_hv_timer(vcpu);
/*
* If this vCPU has touched SPEC_CTRL, restore the guest's value if
@@ -11214,6 +11255,23 @@ static void nested_vmx_cr_fixed1_bits_update(struct kvm_vcpu *vcpu)
#undef cr4_fixed1_update
}
+static void nested_vmx_entry_exit_ctls_update(struct kvm_vcpu *vcpu)
+{
+ struct vcpu_vmx *vmx = to_vmx(vcpu);
+
+ if (kvm_mpx_supported()) {
+ bool mpx_enabled = guest_cpuid_has(vcpu, X86_FEATURE_MPX);
+
+ if (mpx_enabled) {
+ vmx->nested.msrs.entry_ctls_high |= VM_ENTRY_LOAD_BNDCFGS;
+ vmx->nested.msrs.exit_ctls_high |= VM_EXIT_CLEAR_BNDCFGS;
+ } else {
+ vmx->nested.msrs.entry_ctls_high &= ~VM_ENTRY_LOAD_BNDCFGS;
+ vmx->nested.msrs.exit_ctls_high &= ~VM_EXIT_CLEAR_BNDCFGS;
+ }
+ }
+}
+
static void vmx_cpuid_update(struct kvm_vcpu *vcpu)
{
struct vcpu_vmx *vmx = to_vmx(vcpu);
@@ -11230,8 +11288,10 @@ static void vmx_cpuid_update(struct kvm_vcpu *vcpu)
to_vmx(vcpu)->msr_ia32_feature_control_valid_bits &=
~FEATURE_CONTROL_VMXON_ENABLED_OUTSIDE_SMX;
- if (nested_vmx_allowed(vcpu))
+ if (nested_vmx_allowed(vcpu)) {
nested_vmx_cr_fixed1_bits_update(vcpu);
+ nested_vmx_entry_exit_ctls_update(vcpu);
+ }
}
static void vmx_set_supported_cpuid(u32 func, struct kvm_cpuid_entry2 *entry)
@@ -11427,16 +11487,18 @@ static void vmx_start_preemption_timer(struct kvm_vcpu *vcpu)
u64 preemption_timeout = get_vmcs12(vcpu)->vmx_preemption_timer_value;
struct vcpu_vmx *vmx = to_vmx(vcpu);
- if (vcpu->arch.virtual_tsc_khz == 0)
- return;
-
- /* Make sure short timeouts reliably trigger an immediate vmexit.
- * hrtimer_start does not guarantee this. */
- if (preemption_timeout <= 1) {
+ /*
+ * A timer value of zero is architecturally guaranteed to cause
+ * a VMExit prior to executing any instructions in the guest.
+ */
+ if (preemption_timeout == 0) {
vmx_preemption_timer_fn(&vmx->nested.preemption_timer);
return;
}
+ if (vcpu->arch.virtual_tsc_khz == 0)
+ return;
+
preemption_timeout <<= VMX_MISC_EMULATED_PREEMPTION_TIMER_RATE;
preemption_timeout *= 1000000;
do_div(preemption_timeout, vcpu->arch.virtual_tsc_khz);
@@ -11646,11 +11708,15 @@ static int nested_vmx_check_apicv_controls(struct kvm_vcpu *vcpu,
* bits 15:8 should be zero in posted_intr_nv,
* the descriptor address has been already checked
* in nested_get_vmcs12_pages.
+ *
+ * bits 5:0 of posted_intr_desc_addr should be zero.
*/
if (nested_cpu_has_posted_intr(vmcs12) &&
(!nested_cpu_has_vid(vmcs12) ||
!nested_exit_intr_ack_set(vcpu) ||
- vmcs12->posted_intr_nv & 0xff00))
+ (vmcs12->posted_intr_nv & 0xff00) ||
+ (vmcs12->posted_intr_desc_addr & 0x3f) ||
+ (!page_address_valid(vcpu, vmcs12->posted_intr_desc_addr))))
return -EINVAL;
/* tpr shadow is needed by all apicv features. */
@@ -11993,8 +12059,13 @@ static void prepare_vmcs02_full(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12)
set_cr4_guest_host_mask(vmx);
- if (vmx_mpx_supported())
- vmcs_write64(GUEST_BNDCFGS, vmcs12->guest_bndcfgs);
+ if (kvm_mpx_supported()) {
+ if (vmx->nested.nested_run_pending &&
+ (vmcs12->vm_entry_controls & VM_ENTRY_LOAD_BNDCFGS))
+ vmcs_write64(GUEST_BNDCFGS, vmcs12->guest_bndcfgs);
+ else
+ vmcs_write64(GUEST_BNDCFGS, vmx->nested.vmcs01_guest_bndcfgs);
+ }
if (enable_vpid) {
if (nested_cpu_has_vpid(vmcs12) && vmx->nested.vpid02)
@@ -12076,11 +12147,10 @@ static int prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12,
exec_control = vmcs12->pin_based_vm_exec_control;
- /* Preemption timer setting is only taken from vmcs01. */
- exec_control &= ~PIN_BASED_VMX_PREEMPTION_TIMER;
+ /* Preemption timer setting is computed directly in vmx_vcpu_run. */
exec_control |= vmcs_config.pin_based_exec_ctrl;
- if (vmx->hv_deadline_tsc == -1)
- exec_control &= ~PIN_BASED_VMX_PREEMPTION_TIMER;
+ exec_control &= ~PIN_BASED_VMX_PREEMPTION_TIMER;
+ vmx->loaded_vmcs->hv_timer_armed = false;
/* Posted interrupts setting is only taken from vmcs12. */
if (nested_cpu_has_posted_intr(vmcs12)) {
@@ -12318,6 +12388,9 @@ static int check_vmentry_prereqs(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12)
vmcs12->guest_activity_state != GUEST_ACTIVITY_HLT)
return VMXERR_ENTRY_INVALID_CONTROL_FIELD;
+ if (nested_cpu_has_vpid(vmcs12) && !vmcs12->virtual_processor_id)
+ return VMXERR_ENTRY_INVALID_CONTROL_FIELD;
+
if (nested_vmx_check_io_bitmap_controls(vcpu, vmcs12))
return VMXERR_ENTRY_INVALID_CONTROL_FIELD;
@@ -12537,12 +12610,21 @@ static int enter_vmx_non_root_mode(struct kvm_vcpu *vcpu, u32 *exit_qual)
struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
bool from_vmentry = !!exit_qual;
u32 dummy_exit_qual;
+ bool evaluate_pending_interrupts;
int r = 0;
+ evaluate_pending_interrupts = vmcs_read32(CPU_BASED_VM_EXEC_CONTROL) &
+ (CPU_BASED_VIRTUAL_INTR_PENDING | CPU_BASED_VIRTUAL_NMI_PENDING);
+ if (likely(!evaluate_pending_interrupts) && kvm_vcpu_apicv_active(vcpu))
+ evaluate_pending_interrupts |= vmx_has_apicv_interrupt(vcpu);
+
enter_guest_mode(vcpu);
if (!(vmcs12->vm_entry_controls & VM_ENTRY_LOAD_DEBUG_CONTROLS))
vmx->nested.vmcs01_debugctl = vmcs_read64(GUEST_IA32_DEBUGCTL);
+ if (kvm_mpx_supported() &&
+ !(vmcs12->vm_entry_controls & VM_ENTRY_LOAD_BNDCFGS))
+ vmx->nested.vmcs01_guest_bndcfgs = vmcs_read64(GUEST_BNDCFGS);
vmx_switch_vmcs(vcpu, &vmx->nested.vmcs02);
vmx_segment_cache_clear(vmx);
@@ -12575,6 +12657,23 @@ static int enter_vmx_non_root_mode(struct kvm_vcpu *vcpu, u32 *exit_qual)
}
/*
+ * If L1 had a pending IRQ/NMI until it executed
+ * VMLAUNCH/VMRESUME which wasn't delivered because it was
+ * disallowed (e.g. interrupts disabled), L0 needs to
+ * evaluate if this pending event should cause an exit from L2
+ * to L1 or delivered directly to L2 (e.g. In case L1 don't
+ * intercept EXTERNAL_INTERRUPT).
+ *
+ * Usually this would be handled by the processor noticing an
+ * IRQ/NMI window request, or checking RVI during evaluation of
+ * pending virtual interrupts. However, this setting was done
+ * on VMCS01 and now VMCS02 is active instead. Thus, we force L0
+ * to perform pending event evaluation by requesting a KVM_REQ_EVENT.
+ */
+ if (unlikely(evaluate_pending_interrupts))
+ kvm_make_request(KVM_REQ_EVENT, vcpu);
+
+ /*
* Note no nested_vmx_succeed or nested_vmx_fail here. At this point
* we are no longer running L1, and VMLAUNCH/VMRESUME has not yet
* returned as far as L1 is concerned. It will only return (and set
@@ -12841,6 +12940,11 @@ static int vmx_check_nested_events(struct kvm_vcpu *vcpu, bool external_intr)
return 0;
}
+static void vmx_request_immediate_exit(struct kvm_vcpu *vcpu)
+{
+ to_vmx(vcpu)->req_immediate_exit = true;
+}
+
static u32 vmx_get_preemption_timer_value(struct kvm_vcpu *vcpu)
{
ktime_t remaining =
@@ -13231,12 +13335,7 @@ static void nested_vmx_vmexit(struct kvm_vcpu *vcpu, u32 exit_reason,
vmcs_write32(VM_EXIT_MSR_LOAD_COUNT, vmx->msr_autoload.host.nr);
vmcs_write32(VM_ENTRY_MSR_LOAD_COUNT, vmx->msr_autoload.guest.nr);
vmcs_write64(TSC_OFFSET, vcpu->arch.tsc_offset);
- if (vmx->hv_deadline_tsc == -1)
- vmcs_clear_bits(PIN_BASED_VM_EXEC_CONTROL,
- PIN_BASED_VMX_PREEMPTION_TIMER);
- else
- vmcs_set_bits(PIN_BASED_VM_EXEC_CONTROL,
- PIN_BASED_VMX_PREEMPTION_TIMER);
+
if (kvm_has_tsc_control)
decache_tsc_multiplier(vmx);
@@ -13440,18 +13539,12 @@ static int vmx_set_hv_timer(struct kvm_vcpu *vcpu, u64 guest_deadline_tsc)
return -ERANGE;
vmx->hv_deadline_tsc = tscl + delta_tsc;
- vmcs_set_bits(PIN_BASED_VM_EXEC_CONTROL,
- PIN_BASED_VMX_PREEMPTION_TIMER);
-
return delta_tsc == 0;
}
static void vmx_cancel_hv_timer(struct kvm_vcpu *vcpu)
{
- struct vcpu_vmx *vmx = to_vmx(vcpu);
- vmx->hv_deadline_tsc = -1;
- vmcs_clear_bits(PIN_BASED_VM_EXEC_CONTROL,
- PIN_BASED_VMX_PREEMPTION_TIMER);
+ to_vmx(vcpu)->hv_deadline_tsc = -1;
}
#endif
@@ -13932,6 +14025,14 @@ static int vmx_set_nested_state(struct kvm_vcpu *vcpu,
~(KVM_STATE_NESTED_SMM_GUEST_MODE | KVM_STATE_NESTED_SMM_VMXON))
return -EINVAL;
+ /*
+ * SMM temporarily disables VMX, so we cannot be in guest mode,
+ * nor can VMLAUNCH/VMRESUME be pending. Outside SMM, SMM flags
+ * must be zero.
+ */
+ if (is_smm(vcpu) ? kvm_state->flags : kvm_state->vmx.smm.flags)
+ return -EINVAL;
+
if ((kvm_state->vmx.smm.flags & KVM_STATE_NESTED_SMM_GUEST_MODE) &&
!(kvm_state->vmx.smm.flags & KVM_STATE_NESTED_SMM_VMXON))
return -EINVAL;
@@ -13988,9 +14089,6 @@ static int vmx_set_nested_state(struct kvm_vcpu *vcpu,
check_vmentry_postreqs(vcpu, vmcs12, &exit_qual))
return -EINVAL;
- if (kvm_state->flags & KVM_STATE_NESTED_RUN_PENDING)
- vmx->nested.nested_run_pending = 1;
-
vmx->nested.dirty_vmcs12 = true;
ret = enter_vmx_non_root_mode(vcpu, NULL);
if (ret)
@@ -14078,6 +14176,7 @@ static struct kvm_x86_ops vmx_x86_ops __ro_after_init = {
.apicv_post_state_restore = vmx_apicv_post_state_restore,
.hwapic_irr_update = vmx_hwapic_irr_update,
.hwapic_isr_update = vmx_hwapic_isr_update,
+ .guest_apic_has_interrupt = vmx_guest_apic_has_interrupt,
.sync_pir_to_irr = vmx_sync_pir_to_irr,
.deliver_posted_interrupt = vmx_deliver_posted_interrupt,
@@ -14111,6 +14210,7 @@ static struct kvm_x86_ops vmx_x86_ops __ro_after_init = {
.umip_emulated = vmx_umip_emulated,
.check_nested_events = vmx_check_nested_events,
+ .request_immediate_exit = vmx_request_immediate_exit,
.sched_in = vmx_sched_in,