diff options
Diffstat (limited to 'arch/x86/kvm/svm/nested.c')
-rw-r--r-- | arch/x86/kvm/svm/nested.c | 797 |
1 files changed, 524 insertions, 273 deletions
diff --git a/arch/x86/kvm/svm/nested.c b/arch/x86/kvm/svm/nested.c index f8b7bc04b3e7..4c620999d230 100644 --- a/arch/x86/kvm/svm/nested.c +++ b/arch/x86/kvm/svm/nested.c @@ -28,6 +28,7 @@ #include "cpuid.h" #include "lapic.h" #include "svm.h" +#include "hyperv.h" #define CC KVM_NESTED_VMENTER_CONSISTENCY_CHECK @@ -35,41 +36,25 @@ static void nested_svm_inject_npf_exit(struct kvm_vcpu *vcpu, struct x86_exception *fault) { struct vcpu_svm *svm = to_svm(vcpu); + struct vmcb *vmcb = svm->vmcb; - if (svm->vmcb->control.exit_code != SVM_EXIT_NPF) { + if (vmcb->control.exit_code != SVM_EXIT_NPF) { /* * TODO: track the cause of the nested page fault, and * correctly fill in the high bits of exit_info_1. */ - svm->vmcb->control.exit_code = SVM_EXIT_NPF; - svm->vmcb->control.exit_code_hi = 0; - svm->vmcb->control.exit_info_1 = (1ULL << 32); - svm->vmcb->control.exit_info_2 = fault->address; + vmcb->control.exit_code = SVM_EXIT_NPF; + vmcb->control.exit_code_hi = 0; + vmcb->control.exit_info_1 = (1ULL << 32); + vmcb->control.exit_info_2 = fault->address; } - svm->vmcb->control.exit_info_1 &= ~0xffffffffULL; - svm->vmcb->control.exit_info_1 |= fault->error_code; + vmcb->control.exit_info_1 &= ~0xffffffffULL; + vmcb->control.exit_info_1 |= fault->error_code; nested_svm_vmexit(svm); } -static void svm_inject_page_fault_nested(struct kvm_vcpu *vcpu, struct x86_exception *fault) -{ - struct vcpu_svm *svm = to_svm(vcpu); - WARN_ON(!is_guest_mode(vcpu)); - - if (vmcb_is_intercept(&svm->nested.ctl, INTERCEPT_EXCEPTION_OFFSET + PF_VECTOR) && - !svm->nested.nested_run_pending) { - svm->vmcb->control.exit_code = SVM_EXIT_EXCP_BASE + PF_VECTOR; - svm->vmcb->control.exit_code_hi = 0; - svm->vmcb->control.exit_info_1 = fault->error_code; - svm->vmcb->control.exit_info_2 = fault->address; - nested_svm_vmexit(svm); - } else { - kvm_inject_page_fault(vcpu, fault); - } -} - static u64 nested_svm_get_tdp_pdptr(struct kvm_vcpu *vcpu, int index) { struct vcpu_svm *svm = to_svm(vcpu); @@ -119,9 +104,24 @@ static void nested_svm_uninit_mmu_context(struct kvm_vcpu *vcpu) vcpu->arch.walk_mmu = &vcpu->arch.root_mmu; } +static bool nested_vmcb_needs_vls_intercept(struct vcpu_svm *svm) +{ + if (!svm->v_vmload_vmsave_enabled) + return true; + + if (!nested_npt_enabled(svm)) + return true; + + if (!(svm->nested.ctl.virt_ext & VIRTUAL_VMLOAD_VMSAVE_ENABLE_MASK)) + return true; + + return false; +} + void recalc_intercepts(struct vcpu_svm *svm) { - struct vmcb_control_area *c, *h, *g; + struct vmcb_control_area *c, *h; + struct vmcb_ctrl_area_cached *g; unsigned int i; vmcb_mark_dirty(svm->vmcb, VMCB_INTERCEPTS); @@ -159,51 +159,45 @@ void recalc_intercepts(struct vcpu_svm *svm) if (!intercept_smi) vmcb_clr_intercept(c, INTERCEPT_SMI); - vmcb_set_intercept(c, INTERCEPT_VMLOAD); - vmcb_set_intercept(c, INTERCEPT_VMSAVE); -} - -static void copy_vmcb_control_area(struct vmcb_control_area *dst, - struct vmcb_control_area *from) -{ - unsigned int i; - - for (i = 0; i < MAX_INTERCEPT; i++) - dst->intercepts[i] = from->intercepts[i]; - - dst->iopm_base_pa = from->iopm_base_pa; - dst->msrpm_base_pa = from->msrpm_base_pa; - dst->tsc_offset = from->tsc_offset; - /* asid not copied, it is handled manually for svm->vmcb. */ - dst->tlb_ctl = from->tlb_ctl; - dst->int_ctl = from->int_ctl; - dst->int_vector = from->int_vector; - dst->int_state = from->int_state; - dst->exit_code = from->exit_code; - dst->exit_code_hi = from->exit_code_hi; - dst->exit_info_1 = from->exit_info_1; - dst->exit_info_2 = from->exit_info_2; - dst->exit_int_info = from->exit_int_info; - dst->exit_int_info_err = from->exit_int_info_err; - dst->nested_ctl = from->nested_ctl; - dst->event_inj = from->event_inj; - dst->event_inj_err = from->event_inj_err; - dst->nested_cr3 = from->nested_cr3; - dst->virt_ext = from->virt_ext; - dst->pause_filter_count = from->pause_filter_count; - dst->pause_filter_thresh = from->pause_filter_thresh; + if (nested_vmcb_needs_vls_intercept(svm)) { + /* + * If the virtual VMLOAD/VMSAVE is not enabled for the L2, + * we must intercept these instructions to correctly + * emulate them in case L1 doesn't intercept them. + */ + vmcb_set_intercept(c, INTERCEPT_VMLOAD); + vmcb_set_intercept(c, INTERCEPT_VMSAVE); + } else { + WARN_ON(!(c->virt_ext & VIRTUAL_VMLOAD_VMSAVE_ENABLE_MASK)); + } } +/* + * Merge L0's (KVM) and L1's (Nested VMCB) MSR permission bitmaps. The function + * is optimized in that it only merges the parts where KVM MSR permission bitmap + * may contain zero bits. + */ static bool nested_svm_vmrun_msrpm(struct vcpu_svm *svm) { + struct hv_enlightenments *hve = + (struct hv_enlightenments *)svm->nested.ctl.reserved_sw; + int i; + /* - * This function merges the msr permission bitmaps of kvm and the - * nested vmcb. It is optimized in that it only merges the parts where - * the kvm msr permission bitmap may contain zero bits + * MSR bitmap update can be skipped when: + * - MSR bitmap for L1 hasn't changed. + * - Nested hypervisor (L1) is attempting to launch the same L2 as + * before. + * - Nested hypervisor (L1) is using Hyper-V emulation interface and + * tells KVM (L0) there were no changes in MSR bitmap for L2. */ - int i; + if (!svm->nested.force_msr_bitmap_recalc && + kvm_hv_hypercall_enabled(&svm->vcpu) && + hve->hv_enlightenments_control.msr_bitmap && + (svm->nested.ctl.clean & BIT(VMCB_HV_NESTED_ENLIGHTENMENTS))) + goto set_msrpm_base_pa; - if (!(vmcb_is_intercept(&svm->nested.ctl, INTERCEPT_MSR_PROT))) + if (!(vmcb12_is_intercept(&svm->nested.ctl, INTERCEPT_MSR_PROT))) return true; for (i = 0; i < MSRPM_OFFSETS; i++) { @@ -214,6 +208,11 @@ static bool nested_svm_vmrun_msrpm(struct vcpu_svm *svm) break; p = msrpm_offsets[i]; + + /* x2apic msrs are intercepted always for the nested guest */ + if (is_x2apic_msrpm_offset(p)) + continue; + offset = svm->nested.ctl.msrpm_base_pa + (p * 4); if (kvm_vcpu_read_guest(&svm->vcpu, offset, &value, 4)) @@ -222,6 +221,9 @@ static bool nested_svm_vmrun_msrpm(struct vcpu_svm *svm) svm->nested.msrpm[p] = svm->msrpm[p] | value; } + svm->nested.force_msr_bitmap_recalc = false; + +set_msrpm_base_pa: svm->vmcb->control.msrpm_base_pa = __sme_set(__pa(svm->nested.msrpm)); return true; @@ -250,10 +252,10 @@ static bool nested_svm_check_tlb_ctl(struct kvm_vcpu *vcpu, u8 tlb_ctl) } } -static bool nested_vmcb_check_controls(struct kvm_vcpu *vcpu, - struct vmcb_control_area *control) +static bool __nested_vmcb_check_controls(struct kvm_vcpu *vcpu, + struct vmcb_ctrl_area_cached *control) { - if (CC(!vmcb_is_intercept(control, INTERCEPT_VMRUN))) + if (CC(!vmcb12_is_intercept(control, INTERCEPT_VMRUN))) return false; if (CC(control->asid == 0)) @@ -275,9 +277,20 @@ static bool nested_vmcb_check_controls(struct kvm_vcpu *vcpu, return true; } -static bool nested_vmcb_check_cr3_cr4(struct kvm_vcpu *vcpu, - struct vmcb_save_area *save) +/* Common checks that apply to both L1 and L2 state. */ +static bool __nested_vmcb_check_save(struct kvm_vcpu *vcpu, + struct vmcb_save_area_cached *save) { + if (CC(!(save->efer & EFER_SVME))) + return false; + + if (CC((save->cr0 & X86_CR0_CD) == 0 && (save->cr0 & X86_CR0_NW)) || + CC(save->cr0 & ~0xffffffffULL)) + return false; + + if (CC(!kvm_dr6_valid(save->dr6)) || CC(!kvm_dr7_valid(save->dr7))) + return false; + /* * These checks are also performed by KVM_SET_SREGS, * except that EFER.LMA is not checked by SVM against @@ -290,51 +303,103 @@ static bool nested_vmcb_check_cr3_cr4(struct kvm_vcpu *vcpu, return false; } - if (CC(!kvm_is_valid_cr4(vcpu, save->cr4))) + /* Note, SVM doesn't have any additional restrictions on CR4. */ + if (CC(!__kvm_is_valid_cr4(vcpu, save->cr4))) + return false; + + if (CC(!kvm_valid_efer(vcpu, save->efer))) return false; return true; } -/* Common checks that apply to both L1 and L2 state. */ -static bool nested_vmcb_valid_sregs(struct kvm_vcpu *vcpu, - struct vmcb_save_area *save) +static bool nested_vmcb_check_save(struct kvm_vcpu *vcpu) { - /* - * FIXME: these should be done after copying the fields, - * to avoid TOC/TOU races. For these save area checks - * the possible damage is limited since kvm_set_cr0 and - * kvm_set_cr4 handle failure; EFER_SVME is an exception - * so it is force-set later in nested_prepare_vmcb_save. - */ - if (CC(!(save->efer & EFER_SVME))) - return false; + struct vcpu_svm *svm = to_svm(vcpu); + struct vmcb_save_area_cached *save = &svm->nested.save; - if (CC((save->cr0 & X86_CR0_CD) == 0 && (save->cr0 & X86_CR0_NW)) || - CC(save->cr0 & ~0xffffffffULL)) - return false; + return __nested_vmcb_check_save(vcpu, save); +} - if (CC(!kvm_dr6_valid(save->dr6)) || CC(!kvm_dr7_valid(save->dr7))) - return false; +static bool nested_vmcb_check_controls(struct kvm_vcpu *vcpu) +{ + struct vcpu_svm *svm = to_svm(vcpu); + struct vmcb_ctrl_area_cached *ctl = &svm->nested.ctl; - if (!nested_vmcb_check_cr3_cr4(vcpu, save)) - return false; + return __nested_vmcb_check_controls(vcpu, ctl); +} - if (CC(!kvm_valid_efer(vcpu, save->efer))) - return false; +static +void __nested_copy_vmcb_control_to_cache(struct kvm_vcpu *vcpu, + struct vmcb_ctrl_area_cached *to, + struct vmcb_control_area *from) +{ + unsigned int i; - return true; + for (i = 0; i < MAX_INTERCEPT; i++) + to->intercepts[i] = from->intercepts[i]; + + to->iopm_base_pa = from->iopm_base_pa; + to->msrpm_base_pa = from->msrpm_base_pa; + to->tsc_offset = from->tsc_offset; + to->tlb_ctl = from->tlb_ctl; + to->int_ctl = from->int_ctl; + to->int_vector = from->int_vector; + to->int_state = from->int_state; + to->exit_code = from->exit_code; + to->exit_code_hi = from->exit_code_hi; + to->exit_info_1 = from->exit_info_1; + to->exit_info_2 = from->exit_info_2; + to->exit_int_info = from->exit_int_info; + to->exit_int_info_err = from->exit_int_info_err; + to->nested_ctl = from->nested_ctl; + to->event_inj = from->event_inj; + to->event_inj_err = from->event_inj_err; + to->next_rip = from->next_rip; + to->nested_cr3 = from->nested_cr3; + to->virt_ext = from->virt_ext; + to->pause_filter_count = from->pause_filter_count; + to->pause_filter_thresh = from->pause_filter_thresh; + + /* Copy asid here because nested_vmcb_check_controls will check it. */ + to->asid = from->asid; + to->msrpm_base_pa &= ~0x0fffULL; + to->iopm_base_pa &= ~0x0fffULL; + + /* Hyper-V extensions (Enlightened VMCB) */ + if (kvm_hv_hypercall_enabled(vcpu)) { + to->clean = from->clean; + memcpy(to->reserved_sw, from->reserved_sw, + sizeof(struct hv_enlightenments)); + } } -void nested_load_control_from_vmcb12(struct vcpu_svm *svm, - struct vmcb_control_area *control) +void nested_copy_vmcb_control_to_cache(struct vcpu_svm *svm, + struct vmcb_control_area *control) +{ + __nested_copy_vmcb_control_to_cache(&svm->vcpu, &svm->nested.ctl, control); +} + +static void __nested_copy_vmcb_save_to_cache(struct vmcb_save_area_cached *to, + struct vmcb_save_area *from) { - copy_vmcb_control_area(&svm->nested.ctl, control); + /* + * Copy only fields that are validated, as we need them + * to avoid TOC/TOU races. + */ + to->efer = from->efer; + to->cr0 = from->cr0; + to->cr3 = from->cr3; + to->cr4 = from->cr4; - /* Copy it here because nested_svm_check_controls will check it. */ - svm->nested.ctl.asid = control->asid; - svm->nested.ctl.msrpm_base_pa &= ~0x0fffULL; - svm->nested.ctl.iopm_base_pa &= ~0x0fffULL; + to->dr6 = from->dr6; + to->dr7 = from->dr7; +} + +void nested_copy_vmcb_save_to_cache(struct vcpu_svm *svm, + struct vmcb_save_area *save) +{ + __nested_copy_vmcb_save_to_cache(&svm->nested.save, save); } /* @@ -361,6 +426,10 @@ void nested_sync_control_from_vmcb02(struct vcpu_svm *svm) */ mask &= ~V_IRQ_MASK; } + + if (nested_vgif_enabled(svm)) + mask |= V_GIF_MASK; + svm->nested.ctl.int_ctl &= ~mask; svm->nested.ctl.int_ctl |= svm->vmcb->control.int_ctl & mask; } @@ -377,7 +446,7 @@ static void nested_save_pending_event_to_vmcb12(struct vcpu_svm *svm, unsigned int nr; if (vcpu->arch.exception.injected) { - nr = vcpu->arch.exception.nr; + nr = vcpu->arch.exception.vector; exit_int_info = nr | SVM_EVTINJ_VALID | SVM_EVTINJ_TYPE_EXEPT; if (vcpu->arch.exception.has_error_code) { @@ -402,11 +471,6 @@ static void nested_save_pending_event_to_vmcb12(struct vcpu_svm *svm, vmcb12->control.exit_int_info = exit_int_info; } -static inline bool nested_npt_enabled(struct vcpu_svm *svm) -{ - return svm->nested.ctl.nested_ctl & SVM_NESTED_CTL_NP_ENABLE; -} - static void nested_svm_transition_tlb_flush(struct kvm_vcpu *vcpu) { /* @@ -437,18 +501,17 @@ static int nested_svm_load_cr3(struct kvm_vcpu *vcpu, unsigned long cr3, return -EINVAL; if (reload_pdptrs && !nested_npt && is_pae_paging(vcpu) && - CC(!load_pdptrs(vcpu, vcpu->arch.walk_mmu, cr3))) + CC(!load_pdptrs(vcpu, cr3))) return -EINVAL; - if (!nested_npt) - kvm_mmu_new_pgd(vcpu, cr3); - vcpu->arch.cr3 = cr3; - kvm_register_mark_available(vcpu, VCPU_EXREG_CR3); /* Re-initialize the MMU, e.g. to pick up CR4 MMU role changes. */ kvm_init_mmu(vcpu); + if (!nested_npt) + kvm_mmu_new_pgd(vcpu, cr3); + return 0; } @@ -464,6 +527,8 @@ void nested_vmcb02_compute_g_pat(struct vcpu_svm *svm) static void nested_vmcb02_prepare_save(struct vcpu_svm *svm, struct vmcb *vmcb12) { bool new_vmcb12 = false; + struct vmcb *vmcb01 = svm->vmcb01.ptr; + struct vmcb *vmcb02 = svm->nested.vmcb02.ptr; nested_vmcb02_compute_g_pat(svm); @@ -471,34 +536,30 @@ static void nested_vmcb02_prepare_save(struct vcpu_svm *svm, struct vmcb *vmcb12 if (svm->nested.vmcb12_gpa != svm->nested.last_vmcb12_gpa) { new_vmcb12 = true; svm->nested.last_vmcb12_gpa = svm->nested.vmcb12_gpa; + svm->nested.force_msr_bitmap_recalc = true; } if (unlikely(new_vmcb12 || vmcb_is_dirty(vmcb12, VMCB_SEG))) { - svm->vmcb->save.es = vmcb12->save.es; - svm->vmcb->save.cs = vmcb12->save.cs; - svm->vmcb->save.ss = vmcb12->save.ss; - svm->vmcb->save.ds = vmcb12->save.ds; - svm->vmcb->save.cpl = vmcb12->save.cpl; - vmcb_mark_dirty(svm->vmcb, VMCB_SEG); + vmcb02->save.es = vmcb12->save.es; + vmcb02->save.cs = vmcb12->save.cs; + vmcb02->save.ss = vmcb12->save.ss; + vmcb02->save.ds = vmcb12->save.ds; + vmcb02->save.cpl = vmcb12->save.cpl; + vmcb_mark_dirty(vmcb02, VMCB_SEG); } if (unlikely(new_vmcb12 || vmcb_is_dirty(vmcb12, VMCB_DT))) { - svm->vmcb->save.gdtr = vmcb12->save.gdtr; - svm->vmcb->save.idtr = vmcb12->save.idtr; - vmcb_mark_dirty(svm->vmcb, VMCB_DT); + vmcb02->save.gdtr = vmcb12->save.gdtr; + vmcb02->save.idtr = vmcb12->save.idtr; + vmcb_mark_dirty(vmcb02, VMCB_DT); } kvm_set_rflags(&svm->vcpu, vmcb12->save.rflags | X86_EFLAGS_FIXED); - /* - * Force-set EFER_SVME even though it is checked earlier on the - * VMCB12, because the guest can flip the bit between the check - * and now. Clearing EFER_SVME would call svm_free_nested. - */ - svm_set_efer(&svm->vcpu, vmcb12->save.efer | EFER_SVME); + svm_set_efer(&svm->vcpu, svm->nested.save.efer); - svm_set_cr0(&svm->vcpu, vmcb12->save.cr0); - svm_set_cr4(&svm->vcpu, vmcb12->save.cr4); + svm_set_cr0(&svm->vcpu, svm->nested.save.cr0); + svm_set_cr4(&svm->vcpu, svm->nested.save.cr4); svm->vcpu.arch.cr2 = vmcb12->save.cr2; @@ -507,47 +568,87 @@ static void nested_vmcb02_prepare_save(struct vcpu_svm *svm, struct vmcb *vmcb12 kvm_rip_write(&svm->vcpu, vmcb12->save.rip); /* In case we don't even reach vcpu_run, the fields are not updated */ - svm->vmcb->save.rax = vmcb12->save.rax; - svm->vmcb->save.rsp = vmcb12->save.rsp; - svm->vmcb->save.rip = vmcb12->save.rip; + vmcb02->save.rax = vmcb12->save.rax; + vmcb02->save.rsp = vmcb12->save.rsp; + vmcb02->save.rip = vmcb12->save.rip; /* These bits will be set properly on the first execution when new_vmc12 is true */ if (unlikely(new_vmcb12 || vmcb_is_dirty(vmcb12, VMCB_DR))) { - svm->vmcb->save.dr7 = vmcb12->save.dr7 | DR7_FIXED_1; - svm->vcpu.arch.dr6 = vmcb12->save.dr6 | DR6_ACTIVE_LOW; - vmcb_mark_dirty(svm->vmcb, VMCB_DR); + vmcb02->save.dr7 = svm->nested.save.dr7 | DR7_FIXED_1; + svm->vcpu.arch.dr6 = svm->nested.save.dr6 | DR6_ACTIVE_LOW; + vmcb_mark_dirty(vmcb02, VMCB_DR); + } + + if (unlikely(svm->lbrv_enabled && (svm->nested.ctl.virt_ext & LBR_CTL_ENABLE_MASK))) { + /* + * Reserved bits of DEBUGCTL are ignored. Be consistent with + * svm_set_msr's definition of reserved bits. + */ + svm_copy_lbrs(vmcb02, vmcb12); + vmcb02->save.dbgctl &= ~DEBUGCTL_RESERVED_BITS; + svm_update_lbrv(&svm->vcpu); + + } else if (unlikely(vmcb01->control.virt_ext & LBR_CTL_ENABLE_MASK)) { + svm_copy_lbrs(vmcb02, vmcb01); } } -static void nested_vmcb02_prepare_control(struct vcpu_svm *svm) +static inline bool is_evtinj_soft(u32 evtinj) { - const u32 int_ctl_vmcb01_bits = - V_INTR_MASKING_MASK | V_GIF_MASK | V_GIF_ENABLE_MASK; + u32 type = evtinj & SVM_EVTINJ_TYPE_MASK; + u8 vector = evtinj & SVM_EVTINJ_VEC_MASK; - const u32 int_ctl_vmcb12_bits = V_TPR_MASK | V_IRQ_INJECTION_BITS_MASK; + if (!(evtinj & SVM_EVTINJ_VALID)) + return false; + + if (type == SVM_EVTINJ_TYPE_SOFT) + return true; + + return type == SVM_EVTINJ_TYPE_EXEPT && kvm_exception_is_soft(vector); +} + +static bool is_evtinj_nmi(u32 evtinj) +{ + u32 type = evtinj & SVM_EVTINJ_TYPE_MASK; + + if (!(evtinj & SVM_EVTINJ_VALID)) + return false; + + return type == SVM_EVTINJ_TYPE_NMI; +} + +static void nested_vmcb02_prepare_control(struct vcpu_svm *svm, + unsigned long vmcb12_rip, + unsigned long vmcb12_csbase) +{ + u32 int_ctl_vmcb01_bits = V_INTR_MASKING_MASK; + u32 int_ctl_vmcb12_bits = V_TPR_MASK | V_IRQ_INJECTION_BITS_MASK; struct kvm_vcpu *vcpu = &svm->vcpu; + struct vmcb *vmcb01 = svm->vmcb01.ptr; + struct vmcb *vmcb02 = svm->nested.vmcb02.ptr; + u32 pause_count12; + u32 pause_thresh12; /* * Filled at exit: exit_code, exit_code_hi, exit_info_1, exit_info_2, * exit_int_info, exit_int_info_err, next_rip, insn_len, insn_bytes. */ - /* - * Also covers avic_vapic_bar, avic_backing_page, avic_logical_id, - * avic_physical_id. - */ - WARN_ON(kvm_apicv_activated(svm->vcpu.kvm)); + if (svm->vgif_enabled && (svm->nested.ctl.int_ctl & V_GIF_ENABLE_MASK)) + int_ctl_vmcb12_bits |= (V_GIF_MASK | V_GIF_ENABLE_MASK); + else + int_ctl_vmcb01_bits |= (V_GIF_MASK | V_GIF_ENABLE_MASK); /* Copied from vmcb01. msrpm_base can be overwritten later. */ - svm->vmcb->control.nested_ctl = svm->vmcb01.ptr->control.nested_ctl; - svm->vmcb->control.iopm_base_pa = svm->vmcb01.ptr->control.iopm_base_pa; - svm->vmcb->control.msrpm_base_pa = svm->vmcb01.ptr->control.msrpm_base_pa; + vmcb02->control.nested_ctl = vmcb01->control.nested_ctl; + vmcb02->control.iopm_base_pa = vmcb01->control.iopm_base_pa; + vmcb02->control.msrpm_base_pa = vmcb01->control.msrpm_base_pa; /* Done at vmrun: asid. */ /* Also overwritten later if necessary. */ - svm->vmcb->control.tlb_ctl = TLB_CONTROL_DO_NOTHING; + vmcb02->control.tlb_ctl = TLB_CONTROL_DO_NOTHING; /* nested_cr3. */ if (nested_npt_enabled(svm)) @@ -558,21 +659,75 @@ static void nested_vmcb02_prepare_control(struct vcpu_svm *svm) svm->nested.ctl.tsc_offset, svm->tsc_ratio_msr); - svm->vmcb->control.tsc_offset = vcpu->arch.tsc_offset; + vmcb02->control.tsc_offset = vcpu->arch.tsc_offset; - if (svm->tsc_ratio_msr != kvm_default_tsc_scaling_ratio) { + if (svm->tsc_ratio_msr != kvm_caps.default_tsc_scaling_ratio) { WARN_ON(!svm->tsc_scaling_enabled); nested_svm_update_tsc_ratio_msr(vcpu); } - svm->vmcb->control.int_ctl = + vmcb02->control.int_ctl = (svm->nested.ctl.int_ctl & int_ctl_vmcb12_bits) | - (svm->vmcb01.ptr->control.int_ctl & int_ctl_vmcb01_bits); + (vmcb01->control.int_ctl & int_ctl_vmcb01_bits); + + vmcb02->control.int_vector = svm->nested.ctl.int_vector; + vmcb02->control.int_state = svm->nested.ctl.int_state; + vmcb02->control.event_inj = svm->nested.ctl.event_inj; + vmcb02->control.event_inj_err = svm->nested.ctl.event_inj_err; - svm->vmcb->control.int_vector = svm->nested.ctl.int_vector; - svm->vmcb->control.int_state = svm->nested.ctl.int_state; - svm->vmcb->control.event_inj = svm->nested.ctl.event_inj; - svm->vmcb->control.event_inj_err = svm->nested.ctl.event_inj_err; + /* + * next_rip is consumed on VMRUN as the return address pushed on the + * stack for injected soft exceptions/interrupts. If nrips is exposed + * to L1, take it verbatim from vmcb12. If nrips is supported in + * hardware but not exposed to L1, stuff the actual L2 RIP to emulate + * what a nrips=0 CPU would do (L1 is responsible for advancing RIP + * prior to injecting the event). + */ + if (svm->nrips_enabled) + vmcb02->control.next_rip = svm->nested.ctl.next_rip; + else if (boot_cpu_has(X86_FEATURE_NRIPS)) + vmcb02->control.next_rip = vmcb12_rip; + + svm->nmi_l1_to_l2 = is_evtinj_nmi(vmcb02->control.event_inj); + if (is_evtinj_soft(vmcb02->control.event_inj)) { + svm->soft_int_injected = true; + svm->soft_int_csbase = vmcb12_csbase; + svm->soft_int_old_rip = vmcb12_rip; + if (svm->nrips_enabled) + svm->soft_int_next_rip = svm->nested.ctl.next_rip; + else + svm->soft_int_next_rip = vmcb12_rip; + } + + vmcb02->control.virt_ext = vmcb01->control.virt_ext & + LBR_CTL_ENABLE_MASK; + if (svm->lbrv_enabled) + vmcb02->control.virt_ext |= + (svm->nested.ctl.virt_ext & LBR_CTL_ENABLE_MASK); + + if (!nested_vmcb_needs_vls_intercept(svm)) + vmcb02->control.virt_ext |= VIRTUAL_VMLOAD_VMSAVE_ENABLE_MASK; + + pause_count12 = svm->pause_filter_enabled ? svm->nested.ctl.pause_filter_count : 0; + pause_thresh12 = svm->pause_threshold_enabled ? svm->nested.ctl.pause_filter_thresh : 0; + if (kvm_pause_in_guest(svm->vcpu.kvm)) { + /* use guest values since host doesn't intercept PAUSE */ + vmcb02->control.pause_filter_count = pause_count12; + vmcb02->control.pause_filter_thresh = pause_thresh12; + + } else { + /* start from host values otherwise */ + vmcb02->control.pause_filter_count = vmcb01->control.pause_filter_count; + vmcb02->control.pause_filter_thresh = vmcb01->control.pause_filter_thresh; + + /* ... but ensure filtering is disabled if so requested. */ + if (vmcb12_is_intercept(&svm->nested.ctl, INTERCEPT_PAUSE)) { + if (!pause_count12) + vmcb02->control.pause_filter_count = 0; + if (!pause_thresh12) + vmcb02->control.pause_filter_thresh = 0; + } + } nested_svm_transition_tlb_flush(vcpu); @@ -604,11 +759,15 @@ int enter_svm_guest_mode(struct kvm_vcpu *vcpu, u64 vmcb12_gpa, struct vcpu_svm *svm = to_svm(vcpu); int ret; - trace_kvm_nested_vmrun(svm->vmcb->save.rip, vmcb12_gpa, - vmcb12->save.rip, - vmcb12->control.int_ctl, - vmcb12->control.event_inj, - vmcb12->control.nested_ctl); + trace_kvm_nested_vmenter(svm->vmcb->save.rip, + vmcb12_gpa, + vmcb12->save.rip, + vmcb12->control.int_ctl, + vmcb12->control.event_inj, + vmcb12->control.nested_ctl, + vmcb12->control.nested_cr3, + vmcb12->save.cr3, + KVM_ISA_SVM); trace_kvm_nested_intercepts(vmcb12->control.intercepts[INTERCEPT_CR] & 0xffff, vmcb12->control.intercepts[INTERCEPT_CR] >> 16, @@ -625,22 +784,22 @@ int enter_svm_guest_mode(struct kvm_vcpu *vcpu, u64 vmcb12_gpa, nested_svm_copy_common_state(svm->vmcb01.ptr, svm->nested.vmcb02.ptr); svm_switch_vmcb(svm, &svm->nested.vmcb02); - nested_vmcb02_prepare_control(svm); + nested_vmcb02_prepare_control(svm, vmcb12->save.rip, vmcb12->save.cs.base); nested_vmcb02_prepare_save(svm, vmcb12); - ret = nested_svm_load_cr3(&svm->vcpu, vmcb12->save.cr3, + ret = nested_svm_load_cr3(&svm->vcpu, svm->nested.save.cr3, nested_npt_enabled(svm), from_vmrun); if (ret) return ret; - if (!npt_enabled) - vcpu->arch.mmu->inject_page_fault = svm_inject_page_fault_nested; - if (!from_vmrun) kvm_make_request(KVM_REQ_GET_NESTED_STATE_PAGES, vcpu); svm_set_gif(svm, true); + if (kvm_vcpu_apicv_active(vcpu)) + kvm_make_request(KVM_REQ_APICV_UPDATE, vcpu); + return 0; } @@ -651,6 +810,7 @@ int nested_svm_vmrun(struct kvm_vcpu *vcpu) struct vmcb *vmcb12; struct kvm_host_map map; u64 vmcb12_gpa; + struct vmcb *vmcb01 = svm->vmcb01.ptr; if (!svm->nested.hsave_msr) { kvm_inject_gp(vcpu, 0); @@ -678,10 +838,11 @@ int nested_svm_vmrun(struct kvm_vcpu *vcpu) if (WARN_ON_ONCE(!svm->nested.initialized)) return -EINVAL; - nested_load_control_from_vmcb12(svm, &vmcb12->control); + nested_copy_vmcb_control_to_cache(svm, &vmcb12->control); + nested_copy_vmcb_save_to_cache(svm, &vmcb12->save); - if (!nested_vmcb_valid_sregs(vcpu, &vmcb12->save) || - !nested_vmcb_check_controls(vcpu, &svm->nested.ctl)) { + if (!nested_vmcb_check_save(vcpu) || + !nested_vmcb_check_controls(vcpu)) { vmcb12->control.exit_code = SVM_EXIT_ERR; vmcb12->control.exit_code_hi = 0; vmcb12->control.exit_info_1 = 0; @@ -693,14 +854,14 @@ int nested_svm_vmrun(struct kvm_vcpu *vcpu) * Since vmcb01 is not in use, we can use it to store some of the L1 * state. */ - svm->vmcb01.ptr->save.efer = vcpu->arch.efer; - svm->vmcb01.ptr->save.cr0 = kvm_read_cr0(vcpu); - svm->vmcb01.ptr->save.cr4 = vcpu->arch.cr4; - svm->vmcb01.ptr->save.rflags = kvm_get_rflags(vcpu); - svm->vmcb01.ptr->save.rip = kvm_rip_read(vcpu); + vmcb01->save.efer = vcpu->arch.efer; + vmcb01->save.cr0 = kvm_read_cr0(vcpu); + vmcb01->save.cr4 = vcpu->arch.cr4; + vmcb01->save.rflags = kvm_get_rflags(vcpu); + vmcb01->save.rip = kvm_rip_read(vcpu); if (!npt_enabled) - svm->vmcb01.ptr->save.cr3 = kvm_read_cr3(vcpu); + vmcb01->save.cr3 = kvm_read_cr3(vcpu); svm->nested.nested_run_pending = 1; @@ -712,6 +873,8 @@ int nested_svm_vmrun(struct kvm_vcpu *vcpu) out_exit_err: svm->nested.nested_run_pending = 0; + svm->nmi_l1_to_l2 = false; + svm->soft_int_injected = false; svm->vmcb->control.exit_code = SVM_EXIT_ERR; svm->vmcb->control.exit_code_hi = 0; @@ -766,14 +929,12 @@ void svm_copy_vmloadsave_state(struct vmcb *to_vmcb, struct vmcb *from_vmcb) int nested_svm_vmexit(struct vcpu_svm *svm) { struct kvm_vcpu *vcpu = &svm->vcpu; + struct vmcb *vmcb01 = svm->vmcb01.ptr; + struct vmcb *vmcb02 = svm->nested.vmcb02.ptr; struct vmcb *vmcb12; - struct vmcb *vmcb = svm->vmcb; struct kvm_host_map map; int rc; - /* Triple faults in L2 should never escape. */ - WARN_ON_ONCE(kvm_check_request(KVM_REQ_TRIPLE_FAULT, vcpu)); - rc = kvm_vcpu_map(vcpu, gpa_to_gfn(svm->nested.vmcb12_gpa), &map); if (rc) { if (rc == -EINVAL) @@ -795,63 +956,77 @@ int nested_svm_vmexit(struct vcpu_svm *svm) /* Give the current vmcb to the guest */ - vmcb12->save.es = vmcb->save.es; - vmcb12->save.cs = vmcb->save.cs; - vmcb12->save.ss = vmcb->save.ss; - vmcb12->save.ds = vmcb->save.ds; - vmcb12->save.gdtr = vmcb->save.gdtr; - vmcb12->save.idtr = vmcb->save.idtr; + vmcb12->save.es = vmcb02->save.es; + vmcb12->save.cs = vmcb02->save.cs; + vmcb12->save.ss = vmcb02->save.ss; + vmcb12->save.ds = vmcb02->save.ds; + vmcb12->save.gdtr = vmcb02->save.gdtr; + vmcb12->save.idtr = vmcb02->save.idtr; vmcb12->save.efer = svm->vcpu.arch.efer; vmcb12->save.cr0 = kvm_read_cr0(vcpu); vmcb12->save.cr3 = kvm_read_cr3(vcpu); - vmcb12->save.cr2 = vmcb->save.cr2; + vmcb12->save.cr2 = vmcb02->save.cr2; vmcb12->save.cr4 = svm->vcpu.arch.cr4; vmcb12->save.rflags = kvm_get_rflags(vcpu); vmcb12->save.rip = kvm_rip_read(vcpu); vmcb12->save.rsp = kvm_rsp_read(vcpu); vmcb12->save.rax = kvm_rax_read(vcpu); - vmcb12->save.dr7 = vmcb->save.dr7; + vmcb12->save.dr7 = vmcb02->save.dr7; vmcb12->save.dr6 = svm->vcpu.arch.dr6; - vmcb12->save.cpl = vmcb->save.cpl; + vmcb12->save.cpl = vmcb02->save.cpl; - vmcb12->control.int_state = vmcb->control.int_state; - vmcb12->control.exit_code = vmcb->control.exit_code; - vmcb12->control.exit_code_hi = vmcb->control.exit_code_hi; - vmcb12->control.exit_info_1 = vmcb->control.exit_info_1; - vmcb12->control.exit_info_2 = vmcb->control.exit_info_2; + vmcb12->control.int_state = vmcb02->control.int_state; + vmcb12->control.exit_code = vmcb02->control.exit_code; + vmcb12->control.exit_code_hi = vmcb02->control.exit_code_hi; + vmcb12->control.exit_info_1 = vmcb02->control.exit_info_1; + vmcb12->control.exit_info_2 = vmcb02->control.exit_info_2; if (vmcb12->control.exit_code != SVM_EXIT_ERR) nested_save_pending_event_to_vmcb12(svm, vmcb12); if (svm->nrips_enabled) - vmcb12->control.next_rip = vmcb->control.next_rip; + vmcb12->control.next_rip = vmcb02->control.next_rip; vmcb12->control.int_ctl = svm->nested.ctl.int_ctl; vmcb12->control.tlb_ctl = svm->nested.ctl.tlb_ctl; vmcb12->control.event_inj = svm->nested.ctl.event_inj; vmcb12->control.event_inj_err = svm->nested.ctl.event_inj_err; + if (!kvm_pause_in_guest(vcpu->kvm)) { + vmcb01->control.pause_filter_count = vmcb02->control.pause_filter_count; + vmcb_mark_dirty(vmcb01, VMCB_INTERCEPTS); + + } + nested_svm_copy_common_state(svm->nested.vmcb02.ptr, svm->vmcb01.ptr); svm_switch_vmcb(svm, &svm->vmcb01); + if (unlikely(svm->lbrv_enabled && (svm->nested.ctl.virt_ext & LBR_CTL_ENABLE_MASK))) { + svm_copy_lbrs(vmcb12, vmcb02); + svm_update_lbrv(vcpu); + } else if (unlikely(vmcb01->control.virt_ext & LBR_CTL_ENABLE_MASK)) { + svm_copy_lbrs(vmcb01, vmcb02); + svm_update_lbrv(vcpu); + } + /* * On vmexit the GIF is set to false and * no event can be injected in L1. */ svm_set_gif(svm, false); - svm->vmcb->control.exit_int_info = 0; + vmcb01->control.exit_int_info = 0; svm->vcpu.arch.tsc_offset = svm->vcpu.arch.l1_tsc_offset; - if (svm->vmcb->control.tsc_offset != svm->vcpu.arch.tsc_offset) { - svm->vmcb->control.tsc_offset = svm->vcpu.arch.tsc_offset; - vmcb_mark_dirty(svm->vmcb, VMCB_INTERCEPTS); + if (vmcb01->control.tsc_offset != svm->vcpu.arch.tsc_offset) { + vmcb01->control.tsc_offset = svm->vcpu.arch.tsc_offset; + vmcb_mark_dirty(vmcb01, VMCB_INTERCEPTS); } - if (svm->tsc_ratio_msr != kvm_default_tsc_scaling_ratio) { + if (svm->tsc_ratio_msr != kvm_caps.default_tsc_scaling_ratio) { WARN_ON(!svm->tsc_scaling_enabled); vcpu->arch.tsc_scaling_ratio = vcpu->arch.l1_tsc_scaling_ratio; - svm_write_tsc_multiplier(vcpu, vcpu->arch.tsc_scaling_ratio); + __svm_write_tsc_multiplier(vcpu->arch.tsc_scaling_ratio); } svm->nested.ctl.nested_cr3 = 0; @@ -859,13 +1034,13 @@ int nested_svm_vmexit(struct vcpu_svm *svm) /* * Restore processor state that had been saved in vmcb01 */ - kvm_set_rflags(vcpu, svm->vmcb->save.rflags); - svm_set_efer(vcpu, svm->vmcb->save.efer); - svm_set_cr0(vcpu, svm->vmcb->save.cr0 | X86_CR0_PE); - svm_set_cr4(vcpu, svm->vmcb->save.cr4); - kvm_rax_write(vcpu, svm->vmcb->save.rax); - kvm_rsp_write(vcpu, svm->vmcb->save.rsp); - kvm_rip_write(vcpu, svm->vmcb->save.rip); + kvm_set_rflags(vcpu, vmcb01->save.rflags); + svm_set_efer(vcpu, vmcb01->save.efer); + svm_set_cr0(vcpu, vmcb01->save.cr0 | X86_CR0_PE); + svm_set_cr4(vcpu, vmcb01->save.cr4); + kvm_rax_write(vcpu, vmcb01->save.rax); + kvm_rsp_write(vcpu, vmcb01->save.rsp); + kvm_rip_write(vcpu, vmcb01->save.rip); svm->vcpu.arch.dr7 = DR7_FIXED_1; kvm_update_dr7(&svm->vcpu); @@ -883,7 +1058,7 @@ int nested_svm_vmexit(struct vcpu_svm *svm) nested_svm_uninit_mmu_context(vcpu); - rc = nested_svm_load_cr3(vcpu, svm->vmcb->save.cr3, false, true); + rc = nested_svm_load_cr3(vcpu, vmcb01->save.cr3, false, true); if (rc) return 1; @@ -901,9 +1076,16 @@ int nested_svm_vmexit(struct vcpu_svm *svm) * right now so that it an be accounted for before we execute * L1's next instruction. */ - if (unlikely(svm->vmcb->save.rflags & X86_EFLAGS_TF)) + if (unlikely(vmcb01->save.rflags & X86_EFLAGS_TF)) kvm_queue_exception(&(svm->vcpu), DB_VECTOR); + /* + * Un-inhibit the AVIC right away, so that other vCPUs can start + * to benefit from it right away. + */ + if (kvm_apicv_activated(vcpu->kvm)) + kvm_vcpu_update_apicv(vcpu); + return 0; } @@ -964,9 +1146,9 @@ void svm_free_nested(struct vcpu_svm *svm) /* * Forcibly leave nested mode in order to be able to reset the VCPU later on. */ -void svm_leave_nested(struct vcpu_svm *svm) +void svm_leave_nested(struct kvm_vcpu *vcpu) { - struct kvm_vcpu *vcpu = &svm->vcpu; + struct vcpu_svm *svm = to_svm(vcpu); if (is_guest_mode(vcpu)) { svm->nested.nested_run_pending = 0; @@ -988,7 +1170,7 @@ static int nested_svm_exit_handled_msr(struct vcpu_svm *svm) u32 offset, msr, value; int write, mask; - if (!(vmcb_is_intercept(&svm->nested.ctl, INTERCEPT_MSR_PROT))) + if (!(vmcb12_is_intercept(&svm->nested.ctl, INTERCEPT_MSR_PROT))) return NESTED_EXIT_HOST; msr = svm->vcpu.arch.regs[VCPU_REGS_RCX]; @@ -1015,7 +1197,7 @@ static int nested_svm_intercept_ioio(struct vcpu_svm *svm) u8 start_bit; u64 gpa; - if (!(vmcb_is_intercept(&svm->nested.ctl, INTERCEPT_IOIO_PROT))) + if (!(vmcb12_is_intercept(&svm->nested.ctl, INTERCEPT_IOIO_PROT))) return NESTED_EXIT_HOST; port = svm->vmcb->control.exit_info_1 >> 16; @@ -1046,12 +1228,12 @@ static int nested_svm_intercept(struct vcpu_svm *svm) vmexit = nested_svm_intercept_ioio(svm); break; case SVM_EXIT_READ_CR0 ... SVM_EXIT_WRITE_CR8: { - if (vmcb_is_intercept(&svm->nested.ctl, exit_code)) + if (vmcb12_is_intercept(&svm->nested.ctl, exit_code)) vmexit = NESTED_EXIT_DONE; break; } case SVM_EXIT_READ_DR0 ... SVM_EXIT_WRITE_DR7: { - if (vmcb_is_intercept(&svm->nested.ctl, exit_code)) + if (vmcb12_is_intercept(&svm->nested.ctl, exit_code)) vmexit = NESTED_EXIT_DONE; break; } @@ -1069,7 +1251,7 @@ static int nested_svm_intercept(struct vcpu_svm *svm) break; } default: { - if (vmcb_is_intercept(&svm->nested.ctl, exit_code)) + if (vmcb12_is_intercept(&svm->nested.ctl, exit_code)) vmexit = NESTED_EXIT_DONE; } } @@ -1104,58 +1286,73 @@ int nested_svm_check_permissions(struct kvm_vcpu *vcpu) return 0; } -static bool nested_exit_on_exception(struct vcpu_svm *svm) +static bool nested_svm_is_exception_vmexit(struct kvm_vcpu *vcpu, u8 vector, + u32 error_code) { - unsigned int nr = svm->vcpu.arch.exception.nr; + struct vcpu_svm *svm = to_svm(vcpu); - return (svm->nested.ctl.intercepts[INTERCEPT_EXCEPTION] & BIT(nr)); + return (svm->nested.ctl.intercepts[INTERCEPT_EXCEPTION] & BIT(vector)); } -static void nested_svm_inject_exception_vmexit(struct vcpu_svm *svm) +static void nested_svm_inject_exception_vmexit(struct kvm_vcpu *vcpu) { - unsigned int nr = svm->vcpu.arch.exception.nr; + struct kvm_queued_exception *ex = &vcpu->arch.exception_vmexit; + struct vcpu_svm *svm = to_svm(vcpu); + struct vmcb *vmcb = svm->vmcb; - svm->vmcb->control.exit_code = SVM_EXIT_EXCP_BASE + nr; - svm->vmcb->control.exit_code_hi = 0; + vmcb->control.exit_code = SVM_EXIT_EXCP_BASE + ex->vector; + vmcb->control.exit_code_hi = 0; - if (svm->vcpu.arch.exception.has_error_code) - svm->vmcb->control.exit_info_1 = svm->vcpu.arch.exception.error_code; + if (ex->has_error_code) + vmcb->control.exit_info_1 = ex->error_code; /* * EXITINFO2 is undefined for all exception intercepts other * than #PF. */ - if (nr == PF_VECTOR) { - if (svm->vcpu.arch.exception.nested_apf) - svm->vmcb->control.exit_info_2 = svm->vcpu.arch.apf.nested_apf_token; - else if (svm->vcpu.arch.exception.has_payload) - svm->vmcb->control.exit_info_2 = svm->vcpu.arch.exception.payload; + if (ex->vector == PF_VECTOR) { + if (ex->has_payload) + vmcb->control.exit_info_2 = ex->payload; else - svm->vmcb->control.exit_info_2 = svm->vcpu.arch.cr2; - } else if (nr == DB_VECTOR) { - /* See inject_pending_event. */ - kvm_deliver_exception_payload(&svm->vcpu); - if (svm->vcpu.arch.dr7 & DR7_GD) { - svm->vcpu.arch.dr7 &= ~DR7_GD; - kvm_update_dr7(&svm->vcpu); + vmcb->control.exit_info_2 = vcpu->arch.cr2; + } else if (ex->vector == DB_VECTOR) { + /* See kvm_check_and_inject_events(). */ + kvm_deliver_exception_payload(vcpu, ex); + + if (vcpu->arch.dr7 & DR7_GD) { + vcpu->arch.dr7 &= ~DR7_GD; + kvm_update_dr7(vcpu); } - } else - WARN_ON(svm->vcpu.arch.exception.has_payload); + } else { + WARN_ON(ex->has_payload); + } nested_svm_vmexit(svm); } static inline bool nested_exit_on_init(struct vcpu_svm *svm) { - return vmcb_is_intercept(&svm->nested.ctl, INTERCEPT_INIT); + return vmcb12_is_intercept(&svm->nested.ctl, INTERCEPT_INIT); } static int svm_check_nested_events(struct kvm_vcpu *vcpu) { - struct vcpu_svm *svm = to_svm(vcpu); - bool block_nested_events = - kvm_event_needs_reinjection(vcpu) || svm->nested.nested_run_pending; struct kvm_lapic *apic = vcpu->arch.apic; + struct vcpu_svm *svm = to_svm(vcpu); + /* + * Only a pending nested run blocks a pending exception. If there is a + * previously injected event, the pending exception occurred while said + * event was being delivered and thus needs to be handled. + */ + bool block_nested_exceptions = svm->nested.nested_run_pending; + /* + * New events (not exceptions) are only recognized at instruction + * boundaries. If an event needs reinjection, then KVM is handling a + * VM-Exit that occurred _during_ instruction execution; new events are + * blocked until the instruction completes. + */ + bool block_nested_events = block_nested_exceptions || + kvm_event_needs_reinjection(vcpu); if (lapic_in_kernel(vcpu) && test_bit(KVM_APIC_INIT, &apic->pending_events)) { @@ -1167,18 +1364,16 @@ static int svm_check_nested_events(struct kvm_vcpu *vcpu) return 0; } - if (vcpu->arch.exception.pending) { - /* - * Only a pending nested run can block a pending exception. - * Otherwise an injected NMI/interrupt should either be - * lost or delivered to the nested hypervisor in the EXITINTINFO - * vmcb field, while delivering the pending exception. - */ - if (svm->nested.nested_run_pending) + if (vcpu->arch.exception_vmexit.pending) { + if (block_nested_exceptions) return -EBUSY; - if (!nested_exit_on_exception(svm)) - return 0; - nested_svm_inject_exception_vmexit(svm); + nested_svm_inject_exception_vmexit(vcpu); + return 0; + } + + if (vcpu->arch.exception.pending) { + if (block_nested_exceptions) + return -EBUSY; return 0; } @@ -1248,7 +1443,43 @@ void nested_svm_update_tsc_ratio_msr(struct kvm_vcpu *vcpu) vcpu->arch.tsc_scaling_ratio = kvm_calc_nested_tsc_multiplier(vcpu->arch.l1_tsc_scaling_ratio, svm->tsc_ratio_msr); - svm_write_tsc_multiplier(vcpu, vcpu->arch.tsc_scaling_ratio); + __svm_write_tsc_multiplier(vcpu->arch.tsc_scaling_ratio); +} + +/* Inverse operation of nested_copy_vmcb_control_to_cache(). asid is copied too. */ +static void nested_copy_vmcb_cache_to_control(struct vmcb_control_area *dst, + struct vmcb_ctrl_area_cached *from) +{ + unsigned int i; + + memset(dst, 0, sizeof(struct vmcb_control_area)); + + for (i = 0; i < MAX_INTERCEPT; i++) + dst->intercepts[i] = from->intercepts[i]; + + dst->iopm_base_pa = from->iopm_base_pa; + dst->msrpm_base_pa = from->msrpm_base_pa; + dst->tsc_offset = from->tsc_offset; + dst->asid = from->asid; + dst->tlb_ctl = from->tlb_ctl; + dst->int_ctl = from->int_ctl; + dst->int_vector = from->int_vector; + dst->int_state = from->int_state; + dst->exit_code = from->exit_code; + dst->exit_code_hi = from->exit_code_hi; + dst->exit_info_1 = from->exit_info_1; + dst->exit_info_2 = from->exit_info_2; + dst->exit_int_info = from->exit_int_info; + dst->exit_int_info_err = from->exit_int_info_err; + dst->nested_ctl = from->nested_ctl; + dst->event_inj = from->event_inj; + dst->event_inj_err = from->event_inj_err; + dst->next_rip = from->next_rip; + dst->nested_cr3 = from->nested_cr3; + dst->virt_ext = from->virt_ext; + dst->pause_filter_count = from->pause_filter_count; + dst->pause_filter_thresh = from->pause_filter_thresh; + /* 'clean' and 'reserved_sw' are not changed by KVM */ } static int svm_get_nested_state(struct kvm_vcpu *vcpu, @@ -1256,6 +1487,8 @@ static int svm_get_nested_state(struct kvm_vcpu *vcpu, u32 user_data_size) { struct vcpu_svm *svm; + struct vmcb_control_area *ctl; + unsigned long r; struct kvm_nested_state kvm_state = { .flags = 0, .format = KVM_STATE_NESTED_FORMAT_SVM, @@ -1297,9 +1530,18 @@ static int svm_get_nested_state(struct kvm_vcpu *vcpu, */ if (clear_user(user_vmcb, KVM_STATE_NESTED_SVM_VMCB_SIZE)) return -EFAULT; - if (copy_to_user(&user_vmcb->control, &svm->nested.ctl, - sizeof(user_vmcb->control))) + + ctl = kzalloc(sizeof(*ctl), GFP_KERNEL); + if (!ctl) + return -ENOMEM; + + nested_copy_vmcb_cache_to_control(ctl, &svm->nested.ctl); + r = copy_to_user(&user_vmcb->control, ctl, + sizeof(user_vmcb->control)); + kfree(ctl); + if (r) return -EFAULT; + if (copy_to_user(&user_vmcb->save, &svm->vmcb01.ptr->save, sizeof(user_vmcb->save))) return -EFAULT; @@ -1316,6 +1558,8 @@ static int svm_set_nested_state(struct kvm_vcpu *vcpu, &user_kvm_nested_state->data.svm[0]; struct vmcb_control_area *ctl; struct vmcb_save_area *save; + struct vmcb_save_area_cached save_cached; + struct vmcb_ctrl_area_cached ctl_cached; unsigned long cr0; int ret; @@ -1345,7 +1589,7 @@ static int svm_set_nested_state(struct kvm_vcpu *vcpu, return -EINVAL; if (!(kvm_state->flags & KVM_STATE_NESTED_GUEST_MODE)) { - svm_leave_nested(svm); + svm_leave_nested(vcpu); svm_set_gif(svm, !!(kvm_state->flags & KVM_STATE_NESTED_GIF_SET)); return 0; } @@ -1368,7 +1612,8 @@ static int svm_set_nested_state(struct kvm_vcpu *vcpu, goto out_free; ret = -EINVAL; - if (!nested_vmcb_check_controls(vcpu, ctl)) + __nested_copy_vmcb_control_to_cache(vcpu, &ctl_cached, ctl); + if (!__nested_vmcb_check_controls(vcpu, &ctl_cached)) goto out_free; /* @@ -1383,22 +1628,11 @@ static int svm_set_nested_state(struct kvm_vcpu *vcpu, * Validate host state saved from before VMRUN (see * nested_svm_check_permissions). */ + __nested_copy_vmcb_save_to_cache(&save_cached, save); if (!(save->cr0 & X86_CR0_PG) || !(save->cr0 & X86_CR0_PE) || (save->rflags & X86_EFLAGS_VM) || - !nested_vmcb_valid_sregs(vcpu, save)) - goto out_free; - - /* - * While the nested guest CR3 is already checked and set by - * KVM_SET_SREGS, it was set when nested state was yet loaded, - * thus MMU might not be initialized correctly. - * Set it again to fix this. - */ - - ret = nested_svm_load_cr3(&svm->vcpu, vcpu->arch.cr3, - nested_npt_enabled(svm), false); - if (WARN_ON_ONCE(ret)) + !__nested_vmcb_check_save(vcpu, &save_cached)) goto out_free; @@ -1410,7 +1644,7 @@ static int svm_set_nested_state(struct kvm_vcpu *vcpu, */ if (is_guest_mode(vcpu)) - svm_leave_nested(svm); + svm_leave_nested(vcpu); else svm->nested.vmcb02.ptr->save = svm->vmcb01.ptr->save; @@ -1422,10 +1656,25 @@ static int svm_set_nested_state(struct kvm_vcpu *vcpu, svm->nested.vmcb12_gpa = kvm_state->hdr.svm.vmcb_pa; svm_copy_vmrun_state(&svm->vmcb01.ptr->save, save); - nested_load_control_from_vmcb12(svm, ctl); + nested_copy_vmcb_control_to_cache(svm, ctl); svm_switch_vmcb(svm, &svm->nested.vmcb02); - nested_vmcb02_prepare_control(svm); + nested_vmcb02_prepare_control(svm, svm->vmcb->save.rip, svm->vmcb->save.cs.base); + + /* + * While the nested guest CR3 is already checked and set by + * KVM_SET_SREGS, it was set when nested state was yet loaded, + * thus MMU might not be initialized correctly. + * Set it again to fix this. + */ + + ret = nested_svm_load_cr3(&svm->vcpu, vcpu->arch.cr3, + nested_npt_enabled(svm), false); + if (WARN_ON_ONCE(ret)) + goto out_free; + + svm->nested.force_msr_bitmap_recalc = true; + kvm_make_request(KVM_REQ_GET_NESTED_STATE_PAGES, vcpu); ret = 0; out_free: @@ -1449,7 +1698,7 @@ static bool svm_get_nested_state_pages(struct kvm_vcpu *vcpu) * the guest CR3 might be restored prior to setting the nested * state which can lead to a load of wrong PDPTRs. */ - if (CC(!load_pdptrs(vcpu, vcpu->arch.walk_mmu, vcpu->arch.cr3))) + if (CC(!load_pdptrs(vcpu, vcpu->arch.cr3))) return false; if (!nested_svm_vmrun_msrpm(svm)) { @@ -1464,6 +1713,8 @@ static bool svm_get_nested_state_pages(struct kvm_vcpu *vcpu) } struct kvm_x86_nested_ops svm_nested_ops = { + .leave_nested = svm_leave_nested, + .is_exception_vmexit = nested_svm_is_exception_vmexit, .check_events = svm_check_nested_events, .triple_fault = nested_svm_triple_fault, .get_nested_state_pages = svm_get_nested_state_pages, |