diff options
Diffstat (limited to 'arch/x86/kvm/svm')
-rw-r--r-- | arch/x86/kvm/svm/avic.c | 595 | ||||
-rw-r--r-- | arch/x86/kvm/svm/hyperv.h | 35 | ||||
-rw-r--r-- | arch/x86/kvm/svm/nested.c | 797 | ||||
-rw-r--r-- | arch/x86/kvm/svm/pmu.c | 179 | ||||
-rw-r--r-- | arch/x86/kvm/svm/sev.c | 381 | ||||
-rw-r--r-- | arch/x86/kvm/svm/svm.c | 1600 | ||||
-rw-r--r-- | arch/x86/kvm/svm/svm.h | 265 | ||||
-rw-r--r-- | arch/x86/kvm/svm/svm_onhyperv.c | 1 | ||||
-rw-r--r-- | arch/x86/kvm/svm/svm_onhyperv.h | 37 | ||||
-rw-r--r-- | arch/x86/kvm/svm/svm_ops.h | 5 | ||||
-rw-r--r-- | arch/x86/kvm/svm/vmenter.S | 282 |
11 files changed, 2629 insertions, 1548 deletions
diff --git a/arch/x86/kvm/svm/avic.c b/arch/x86/kvm/svm/avic.c index 8f9af7b7dbbe..6919dee69f18 100644 --- a/arch/x86/kvm/svm/avic.c +++ b/arch/x86/kvm/svm/avic.c @@ -27,20 +27,6 @@ #include "irq.h" #include "svm.h" -#define SVM_AVIC_DOORBELL 0xc001011b - -#define AVIC_HPA_MASK ~((0xFFFULL << 52) | 0xFFF) - -/* - * 0xff is broadcast, so the max index allowed for physical APIC ID - * table is 0xfe. APIC IDs above 0xff are reserved. - */ -#define AVIC_MAX_PHYSICAL_ID_COUNT 255 - -#define AVIC_UNACCEL_ACCESS_WRITE_MASK 1 -#define AVIC_UNACCEL_ACCESS_OFFSET_MASK 0xFF0 -#define AVIC_UNACCEL_ACCESS_VECTOR_MASK 0xFFFFFFFF - /* AVIC GATAG is encoded using VM and VCPU IDs */ #define AVIC_VCPU_ID_BITS 8 #define AVIC_VCPU_ID_MASK ((1 << AVIC_VCPU_ID_BITS) - 1) @@ -54,6 +40,9 @@ #define AVIC_GATAG_TO_VMID(x) ((x >> AVIC_VCPU_ID_BITS) & AVIC_VM_ID_MASK) #define AVIC_GATAG_TO_VCPUID(x) (x & AVIC_VCPU_ID_MASK) +static bool force_avic; +module_param_unsafe(force_avic, bool, 0444); + /* Note: * This hash table is used to map VM_ID to a struct kvm_svm, * when handling AMD IOMMU GALOG notification to schedule in @@ -64,6 +53,7 @@ static DEFINE_HASHTABLE(svm_vm_data_hash, SVM_VM_DATA_HASH_BITS); static u32 next_vm_id = 0; static bool next_vm_id_wrapped = 0; static DEFINE_SPINLOCK(svm_vm_data_hash_lock); +enum avic_modes avic_mode; /* * This is a wrapper of struct amd_iommu_ir_data. @@ -73,12 +63,54 @@ struct amd_svm_iommu_ir { void *data; /* Storing pointer to struct amd_ir_data */ }; -enum avic_ipi_failure_cause { - AVIC_IPI_FAILURE_INVALID_INT_TYPE, - AVIC_IPI_FAILURE_TARGET_NOT_RUNNING, - AVIC_IPI_FAILURE_INVALID_TARGET, - AVIC_IPI_FAILURE_INVALID_BACKING_PAGE, -}; +static void avic_activate_vmcb(struct vcpu_svm *svm) +{ + struct vmcb *vmcb = svm->vmcb01.ptr; + + vmcb->control.int_ctl &= ~(AVIC_ENABLE_MASK | X2APIC_MODE_MASK); + vmcb->control.avic_physical_id &= ~AVIC_PHYSICAL_MAX_INDEX_MASK; + + vmcb->control.int_ctl |= AVIC_ENABLE_MASK; + + /* Note: + * KVM can support hybrid-AVIC mode, where KVM emulates x2APIC + * MSR accesses, while interrupt injection to a running vCPU + * can be achieved using AVIC doorbell. The AVIC hardware still + * accelerate MMIO accesses, but this does not cause any harm + * as the guest is not supposed to access xAPIC mmio when uses x2APIC. + */ + if (apic_x2apic_mode(svm->vcpu.arch.apic) && + avic_mode == AVIC_MODE_X2) { + vmcb->control.int_ctl |= X2APIC_MODE_MASK; + vmcb->control.avic_physical_id |= X2AVIC_MAX_PHYSICAL_ID; + /* Disabling MSR intercept for x2APIC registers */ + svm_set_x2apic_msr_interception(svm, false); + } else { + /* For xAVIC and hybrid-xAVIC modes */ + vmcb->control.avic_physical_id |= AVIC_MAX_PHYSICAL_ID; + /* Enabling MSR intercept for x2APIC registers */ + svm_set_x2apic_msr_interception(svm, true); + } +} + +static void avic_deactivate_vmcb(struct vcpu_svm *svm) +{ + struct vmcb *vmcb = svm->vmcb01.ptr; + + vmcb->control.int_ctl &= ~(AVIC_ENABLE_MASK | X2APIC_MODE_MASK); + vmcb->control.avic_physical_id &= ~AVIC_PHYSICAL_MAX_INDEX_MASK; + + /* + * If running nested and the guest uses its own MSR bitmap, there + * is no need to update L0's msr bitmap + */ + if (is_guest_mode(&svm->vcpu) && + vmcb12_is_intercept(&svm->nested.ctl, INTERCEPT_MSR_PROT)) + return; + + /* Enabling MSR intercept for x2APIC registers */ + svm_set_x2apic_msr_interception(svm, true); +} /* Note: * This function is called from IOMMU driver to notify @@ -185,9 +217,8 @@ free_avic: return err; } -void avic_init_vmcb(struct vcpu_svm *svm) +void avic_init_vmcb(struct vcpu_svm *svm, struct vmcb *vmcb) { - struct vmcb *vmcb = svm->vmcb; struct kvm_svm *kvm_svm = to_kvm_svm(svm->vcpu.kvm); phys_addr_t bpa = __sme_set(page_to_phys(svm->avic_backing_page)); phys_addr_t lpa = __sme_set(page_to_phys(kvm_svm->avic_logical_id_table_page)); @@ -196,13 +227,12 @@ void avic_init_vmcb(struct vcpu_svm *svm) vmcb->control.avic_backing_page = bpa & AVIC_HPA_MASK; vmcb->control.avic_logical_id = lpa & AVIC_HPA_MASK; vmcb->control.avic_physical_id = ppa & AVIC_HPA_MASK; - vmcb->control.avic_physical_id |= AVIC_MAX_PHYSICAL_ID_COUNT; vmcb->control.avic_vapic_bar = APIC_DEFAULT_PHYS_BASE & VMCB_AVIC_APIC_BAR_MASK; if (kvm_apicv_activated(svm->vcpu.kvm)) - vmcb->control.int_ctl |= AVIC_ENABLE_MASK; + avic_activate_vmcb(svm); else - vmcb->control.int_ctl &= ~AVIC_ENABLE_MASK; + avic_deactivate_vmcb(svm); } static u64 *avic_get_physical_id_entry(struct kvm_vcpu *vcpu, @@ -211,7 +241,8 @@ static u64 *avic_get_physical_id_entry(struct kvm_vcpu *vcpu, u64 *avic_physical_id_table; struct kvm_svm *kvm_svm = to_kvm_svm(vcpu->kvm); - if (index >= AVIC_MAX_PHYSICAL_ID_COUNT) + if ((avic_mode == AVIC_MODE_X1 && index > AVIC_MAX_PHYSICAL_ID) || + (avic_mode == AVIC_MODE_X2 && index > X2AVIC_MAX_PHYSICAL_ID)) return NULL; avic_physical_id_table = page_address(kvm_svm->avic_physical_id_table_page); @@ -258,7 +289,8 @@ static int avic_init_backing_page(struct kvm_vcpu *vcpu) int id = vcpu->vcpu_id; struct vcpu_svm *svm = to_svm(vcpu); - if (id >= AVIC_MAX_PHYSICAL_ID_COUNT) + if ((avic_mode == AVIC_MODE_X1 && id > AVIC_MAX_PHYSICAL_ID) || + (avic_mode == AVIC_MODE_X2 && id > X2AVIC_MAX_PHYSICAL_ID)) return -EINVAL; if (!vcpu->arch.apic->regs) @@ -289,20 +321,166 @@ static int avic_init_backing_page(struct kvm_vcpu *vcpu) return 0; } +void avic_ring_doorbell(struct kvm_vcpu *vcpu) +{ + /* + * Note, the vCPU could get migrated to a different pCPU at any point, + * which could result in signalling the wrong/previous pCPU. But if + * that happens the vCPU is guaranteed to do a VMRUN (after being + * migrated) and thus will process pending interrupts, i.e. a doorbell + * is not needed (and the spurious one is harmless). + */ + int cpu = READ_ONCE(vcpu->cpu); + + if (cpu != get_cpu()) { + wrmsrl(MSR_AMD64_SVM_AVIC_DOORBELL, kvm_cpu_get_apicid(cpu)); + trace_kvm_avic_doorbell(vcpu->vcpu_id, kvm_cpu_get_apicid(cpu)); + } + put_cpu(); +} + +/* + * A fast-path version of avic_kick_target_vcpus(), which attempts to match + * destination APIC ID to vCPU without looping through all vCPUs. + */ +static int avic_kick_target_vcpus_fast(struct kvm *kvm, struct kvm_lapic *source, + u32 icrl, u32 icrh, u32 index) +{ + u32 l1_physical_id, dest; + struct kvm_vcpu *target_vcpu; + int dest_mode = icrl & APIC_DEST_MASK; + int shorthand = icrl & APIC_SHORT_MASK; + struct kvm_svm *kvm_svm = to_kvm_svm(kvm); + + if (shorthand != APIC_DEST_NOSHORT) + return -EINVAL; + + if (apic_x2apic_mode(source)) + dest = icrh; + else + dest = GET_XAPIC_DEST_FIELD(icrh); + + if (dest_mode == APIC_DEST_PHYSICAL) { + /* broadcast destination, use slow path */ + if (apic_x2apic_mode(source) && dest == X2APIC_BROADCAST) + return -EINVAL; + if (!apic_x2apic_mode(source) && dest == APIC_BROADCAST) + return -EINVAL; + + l1_physical_id = dest; + + if (WARN_ON_ONCE(l1_physical_id != index)) + return -EINVAL; + + } else { + u32 bitmap, cluster; + int logid_index; + + if (apic_x2apic_mode(source)) { + /* 16 bit dest mask, 16 bit cluster id */ + bitmap = dest & 0xFFFF0000; + cluster = (dest >> 16) << 4; + } else if (kvm_lapic_get_reg(source, APIC_DFR) == APIC_DFR_FLAT) { + /* 8 bit dest mask*/ + bitmap = dest; + cluster = 0; + } else { + /* 4 bit desk mask, 4 bit cluster id */ + bitmap = dest & 0xF; + cluster = (dest >> 4) << 2; + } + + if (unlikely(!bitmap)) + /* guest bug: nobody to send the logical interrupt to */ + return 0; + + if (!is_power_of_2(bitmap)) + /* multiple logical destinations, use slow path */ + return -EINVAL; + + logid_index = cluster + __ffs(bitmap); + + if (!apic_x2apic_mode(source)) { + u32 *avic_logical_id_table = + page_address(kvm_svm->avic_logical_id_table_page); + + u32 logid_entry = avic_logical_id_table[logid_index]; + + if (WARN_ON_ONCE(index != logid_index)) + return -EINVAL; + + /* guest bug: non existing/reserved logical destination */ + if (unlikely(!(logid_entry & AVIC_LOGICAL_ID_ENTRY_VALID_MASK))) + return 0; + + l1_physical_id = logid_entry & + AVIC_LOGICAL_ID_ENTRY_GUEST_PHYSICAL_ID_MASK; + } else { + /* + * For x2APIC logical mode, cannot leverage the index. + * Instead, calculate physical ID from logical ID in ICRH. + */ + int cluster = (icrh & 0xffff0000) >> 16; + int apic = ffs(icrh & 0xffff) - 1; + + /* + * If the x2APIC logical ID sub-field (i.e. icrh[15:0]) + * contains anything but a single bit, we cannot use the + * fast path, because it is limited to a single vCPU. + */ + if (apic < 0 || icrh != (1 << apic)) + return -EINVAL; + + l1_physical_id = (cluster << 4) + apic; + } + } + + target_vcpu = kvm_get_vcpu_by_id(kvm, l1_physical_id); + if (unlikely(!target_vcpu)) + /* guest bug: non existing vCPU is a target of this IPI*/ + return 0; + + target_vcpu->arch.apic->irr_pending = true; + svm_complete_interrupt_delivery(target_vcpu, + icrl & APIC_MODE_MASK, + icrl & APIC_INT_LEVELTRIG, + icrl & APIC_VECTOR_MASK); + return 0; +} + static void avic_kick_target_vcpus(struct kvm *kvm, struct kvm_lapic *source, - u32 icrl, u32 icrh) + u32 icrl, u32 icrh, u32 index) { + unsigned long i; struct kvm_vcpu *vcpu; - int i; + if (!avic_kick_target_vcpus_fast(kvm, source, icrl, icrh, index)) + return; + + trace_kvm_avic_kick_vcpu_slowpath(icrh, icrl, index); + + /* + * Wake any target vCPUs that are blocking, i.e. waiting for a wake + * event. There's no need to signal doorbells, as hardware has handled + * vCPUs that were in guest at the time of the IPI, and vCPUs that have + * since entered the guest will have processed pending IRQs at VMRUN. + */ kvm_for_each_vcpu(i, vcpu, kvm) { - bool m = kvm_apic_match_dest(vcpu, source, - icrl & APIC_SHORT_MASK, - GET_APIC_DEST_FIELD(icrh), - icrl & APIC_DEST_MASK); + u32 dest; - if (m && !avic_vcpu_is_running(vcpu)) - kvm_vcpu_wake_up(vcpu); + if (apic_x2apic_mode(vcpu->arch.apic)) + dest = icrh; + else + dest = GET_XAPIC_DEST_FIELD(icrh); + + if (kvm_apic_match_dest(vcpu, source, icrl & APIC_SHORT_MASK, + dest, icrl & APIC_DEST_MASK)) { + vcpu->arch.apic->irr_pending = true; + svm_complete_interrupt_delivery(vcpu, + icrl & APIC_MODE_MASK, + icrl & APIC_INT_LEVELTRIG, + icrl & APIC_VECTOR_MASK); + } } } @@ -312,7 +490,7 @@ int avic_incomplete_ipi_interception(struct kvm_vcpu *vcpu) u32 icrh = svm->vmcb->control.exit_info_1 >> 32; u32 icrl = svm->vmcb->control.exit_info_1; u32 id = svm->vmcb->control.exit_info_2 >> 32; - u32 index = svm->vmcb->control.exit_info_2 & 0xFF; + u32 index = svm->vmcb->control.exit_info_2 & 0x1FF; struct kvm_lapic *apic = vcpu->arch.apic; trace_kvm_avic_incomplete_ipi(vcpu->vcpu_id, icrh, icrl, id, index); @@ -320,18 +498,18 @@ int avic_incomplete_ipi_interception(struct kvm_vcpu *vcpu) switch (id) { case AVIC_IPI_FAILURE_INVALID_INT_TYPE: /* - * AVIC hardware handles the generation of - * IPIs when the specified Message Type is Fixed - * (also known as fixed delivery mode) and - * the Trigger Mode is edge-triggered. The hardware - * also supports self and broadcast delivery modes - * specified via the Destination Shorthand(DSH) - * field of the ICRL. Logical and physical APIC ID - * formats are supported. All other IPI types cause - * a #VMEXIT, which needs to emulated. + * Emulate IPIs that are not handled by AVIC hardware, which + * only virtualizes Fixed, Edge-Triggered INTRs. The exit is + * a trap, e.g. ICR holds the correct value and RIP has been + * advanced, KVM is responsible only for emulating the IPI. + * Sadly, hardware may sometimes leave the BUSY flag set, in + * which case KVM needs to emulate the ICR write as well in + * order to clear the BUSY flag. */ - kvm_lapic_reg_write(apic, APIC_ICR2, icrh); - kvm_lapic_reg_write(apic, APIC_ICR, icrl); + if (icrl & APIC_ICR_BUSY) + kvm_apic_write_nodecode(vcpu, APIC_ICR); + else + kvm_apic_send_ipi(apic, icrl, icrh); break; case AVIC_IPI_FAILURE_TARGET_NOT_RUNNING: /* @@ -339,11 +517,9 @@ int avic_incomplete_ipi_interception(struct kvm_vcpu *vcpu) * set the appropriate IRR bits on the valid target * vcpus. So, we just need to kick the appropriate vcpu. */ - avic_kick_target_vcpus(vcpu->kvm, apic, icrl, icrh); + avic_kick_target_vcpus(vcpu->kvm, apic, icrl, icrh, index); break; case AVIC_IPI_FAILURE_INVALID_TARGET: - WARN_ONCE(1, "Invalid IPI target: index=%u, vcpu=%d, icr=%#0x:%#0x\n", - index, vcpu->vcpu_id, icrh, icrl); break; case AVIC_IPI_FAILURE_INVALID_BACKING_PAGE: WARN_ONCE(1, "Invalid backing page\n"); @@ -355,6 +531,13 @@ int avic_incomplete_ipi_interception(struct kvm_vcpu *vcpu) return 1; } +unsigned long avic_vcpu_get_apicv_inhibit_reasons(struct kvm_vcpu *vcpu) +{ + if (is_guest_mode(vcpu)) + return APICV_INHIBIT_REASON_NESTED; + return 0; +} + static u32 *avic_get_logical_id_entry(struct kvm_vcpu *vcpu, u32 ldr, bool flat) { struct kvm_svm *kvm_svm = to_kvm_svm(vcpu->kvm); @@ -407,8 +590,13 @@ static void avic_invalidate_logical_id_entry(struct kvm_vcpu *vcpu) { struct vcpu_svm *svm = to_svm(vcpu); bool flat = svm->dfr_reg == APIC_DFR_FLAT; - u32 *entry = avic_get_logical_id_entry(vcpu, svm->ldr_reg, flat); + u32 *entry; + /* Note: x2AVIC does not use logical APIC ID table */ + if (apic_x2apic_mode(vcpu->arch.apic)) + return; + + entry = avic_get_logical_id_entry(vcpu, svm->ldr_reg, flat); if (entry) clear_bit(AVIC_LOGICAL_ID_ENTRY_VALID_BIT, (unsigned long *)entry); } @@ -420,6 +608,10 @@ static int avic_handle_ldr_update(struct kvm_vcpu *vcpu) u32 ldr = kvm_lapic_get_reg(vcpu->arch.apic, APIC_LDR); u32 id = kvm_xapic_id(vcpu->arch.apic); + /* AVIC does not support LDR update for x2APIC */ + if (apic_x2apic_mode(vcpu->arch.apic)) + return 0; + if (ldr == svm->ldr_reg) return 0; @@ -434,35 +626,6 @@ static int avic_handle_ldr_update(struct kvm_vcpu *vcpu) return ret; } -static int avic_handle_apic_id_update(struct kvm_vcpu *vcpu) -{ - u64 *old, *new; - struct vcpu_svm *svm = to_svm(vcpu); - u32 id = kvm_xapic_id(vcpu->arch.apic); - - if (vcpu->vcpu_id == id) - return 0; - - old = avic_get_physical_id_entry(vcpu, vcpu->vcpu_id); - new = avic_get_physical_id_entry(vcpu, id); - if (!new || !old) - return 1; - - /* We need to move physical_id_entry to new offset */ - *new = *old; - *old = 0ULL; - to_svm(vcpu)->avic_physical_id_cache = new; - - /* - * Also update the guest physical APIC ID in the logical - * APIC ID table entry if already setup the LDR. - */ - if (svm->ldr_reg) - avic_handle_ldr_update(vcpu); - - return 0; -} - static void avic_handle_dfr_update(struct kvm_vcpu *vcpu) { struct vcpu_svm *svm = to_svm(vcpu); @@ -475,30 +638,24 @@ static void avic_handle_dfr_update(struct kvm_vcpu *vcpu) svm->dfr_reg = dfr; } -static int avic_unaccel_trap_write(struct vcpu_svm *svm) +static int avic_unaccel_trap_write(struct kvm_vcpu *vcpu) { - struct kvm_lapic *apic = svm->vcpu.arch.apic; - u32 offset = svm->vmcb->control.exit_info_1 & + u32 offset = to_svm(vcpu)->vmcb->control.exit_info_1 & AVIC_UNACCEL_ACCESS_OFFSET_MASK; switch (offset) { - case APIC_ID: - if (avic_handle_apic_id_update(&svm->vcpu)) - return 0; - break; case APIC_LDR: - if (avic_handle_ldr_update(&svm->vcpu)) + if (avic_handle_ldr_update(vcpu)) return 0; break; case APIC_DFR: - avic_handle_dfr_update(&svm->vcpu); + avic_handle_dfr_update(vcpu); break; default: break; } - kvm_lapic_reg_write(apic, offset, kvm_lapic_get_reg(apic, offset)); - + kvm_apic_write_nodecode(vcpu, offset); return 1; } @@ -548,7 +705,7 @@ int avic_unaccelerated_access_interception(struct kvm_vcpu *vcpu) if (trap) { /* Handling Trap */ WARN_ONCE(!write, "svm: Handling trap read.\n"); - ret = avic_unaccel_trap_write(svm); + ret = avic_unaccel_trap_write(vcpu); } else { /* Handling Fault */ ret = kvm_emulate_instruction(vcpu, 0); @@ -576,28 +733,25 @@ int avic_init_vcpu(struct vcpu_svm *svm) return ret; } -void avic_post_state_restore(struct kvm_vcpu *vcpu) +void avic_apicv_post_state_restore(struct kvm_vcpu *vcpu) { - if (avic_handle_apic_id_update(vcpu) != 0) - return; avic_handle_dfr_update(vcpu); avic_handle_ldr_update(vcpu); } -void svm_set_virtual_apic_mode(struct kvm_vcpu *vcpu) +void avic_set_virtual_apic_mode(struct kvm_vcpu *vcpu) { - return; -} - -void svm_hwapic_irr_update(struct kvm_vcpu *vcpu, int max_irr) -{ -} + if (!lapic_in_kernel(vcpu) || avic_mode == AVIC_MODE_NONE) + return; -void svm_hwapic_isr_update(struct kvm_vcpu *vcpu, int max_isr) -{ + if (kvm_get_apic_mode(vcpu) == LAPIC_MODE_INVALID) { + WARN_ONCE(true, "Invalid local APIC state (vcpu_id=%d)", vcpu->vcpu_id); + return; + } + avic_refresh_apicv_exec_ctrl(vcpu); } -static int svm_set_pi_irte_mode(struct kvm_vcpu *vcpu, bool activate) +static int avic_set_pi_irte_mode(struct kvm_vcpu *vcpu, bool activate) { int ret = 0; unsigned long flags; @@ -629,68 +783,6 @@ out: return ret; } -void svm_refresh_apicv_exec_ctrl(struct kvm_vcpu *vcpu) -{ - struct vcpu_svm *svm = to_svm(vcpu); - struct vmcb *vmcb = svm->vmcb01.ptr; - bool activated = kvm_vcpu_apicv_active(vcpu); - - if (!enable_apicv) - return; - - if (activated) { - /** - * During AVIC temporary deactivation, guest could update - * APIC ID, DFR and LDR registers, which would not be trapped - * by avic_unaccelerated_access_interception(). In this case, - * we need to check and update the AVIC logical APIC ID table - * accordingly before re-activating. - */ - avic_post_state_restore(vcpu); - vmcb->control.int_ctl |= AVIC_ENABLE_MASK; - } else { - vmcb->control.int_ctl &= ~AVIC_ENABLE_MASK; - } - vmcb_mark_dirty(vmcb, VMCB_AVIC); - - if (activated) - avic_vcpu_load(vcpu, vcpu->cpu); - else - avic_vcpu_put(vcpu); - - svm_set_pi_irte_mode(vcpu, activated); -} - -void svm_load_eoi_exitmap(struct kvm_vcpu *vcpu, u64 *eoi_exit_bitmap) -{ - return; -} - -int svm_deliver_avic_intr(struct kvm_vcpu *vcpu, int vec) -{ - if (!vcpu->arch.apicv_active) - return -1; - - kvm_lapic_set_irr(vec, vcpu->arch.apic); - smp_mb__after_atomic(); - - if (avic_vcpu_is_running(vcpu)) { - int cpuid = vcpu->cpu; - - if (cpuid != get_cpu()) - wrmsrl(SVM_AVIC_DOORBELL, kvm_cpu_get_apicid(cpuid)); - put_cpu(); - } else - kvm_vcpu_wake_up(vcpu); - - return 0; -} - -bool svm_dy_apicv_has_pending_interrupt(struct kvm_vcpu *vcpu) -{ - return false; -} - static void svm_ir_list_del(struct vcpu_svm *svm, struct amd_iommu_pi_data *pi) { unsigned long flags; @@ -788,7 +880,7 @@ get_pi_vcpu_info(struct kvm *kvm, struct kvm_kernel_irq_routing_entry *e, } /* - * svm_update_pi_irte - set IRTE for Posted-Interrupts + * avic_pi_update_irte - set IRTE for Posted-Interrupts * * @kvm: kvm * @host_irq: host irq of the interrupt @@ -796,12 +888,12 @@ get_pi_vcpu_info(struct kvm *kvm, struct kvm_kernel_irq_routing_entry *e, * @set: set or unset PI * returns 0 on success, < 0 on failure */ -int svm_update_pi_irte(struct kvm *kvm, unsigned int host_irq, - uint32_t guest_irq, bool set) +int avic_pi_update_irte(struct kvm *kvm, unsigned int host_irq, + uint32_t guest_irq, bool set) { struct kvm_kernel_irq_routing_entry *e; struct kvm_irq_routing_table *irq_rt; - int idx, ret = -EINVAL; + int idx, ret = 0; if (!kvm_arch_has_assigned_device(kvm) || !irq_remapping_cap(IRQ_POSTING_CAP)) @@ -812,7 +904,13 @@ int svm_update_pi_irte(struct kvm *kvm, unsigned int host_irq, idx = srcu_read_lock(&kvm->irq_srcu); irq_rt = srcu_dereference(kvm->irq_routing, &kvm->irq_srcu); - WARN_ON(guest_irq >= irq_rt->nr_rt_entries); + + if (guest_irq >= irq_rt->nr_rt_entries || + hlist_empty(&irq_rt->map[guest_irq])) { + pr_warn_once("no route for guest_irq %u/%u (broken user space?)\n", + guest_irq, irq_rt->nr_rt_entries); + goto out; + } hlist_for_each_entry(e, &irq_rt->map[guest_irq], link) { struct vcpu_data vcpu_info; @@ -897,7 +995,7 @@ out: return ret; } -bool svm_check_apicv_inhibit_reasons(ulong bit) +bool avic_check_apicv_inhibit_reasons(enum kvm_apicv_inhibit reason) { ulong supported = BIT(APICV_INHIBIT_REASON_DISABLE) | BIT(APICV_INHIBIT_REASON_ABSENT) | @@ -905,10 +1003,12 @@ bool svm_check_apicv_inhibit_reasons(ulong bit) BIT(APICV_INHIBIT_REASON_NESTED) | BIT(APICV_INHIBIT_REASON_IRQWIN) | BIT(APICV_INHIBIT_REASON_PIT_REINJ) | - BIT(APICV_INHIBIT_REASON_X2APIC) | - BIT(APICV_INHIBIT_REASON_BLOCKIRQ); + BIT(APICV_INHIBIT_REASON_BLOCKIRQ) | + BIT(APICV_INHIBIT_REASON_SEV) | + BIT(APICV_INHIBIT_REASON_APIC_ID_MODIFIED) | + BIT(APICV_INHIBIT_REASON_APIC_BASE_MODIFIED); - return supported & BIT(bit); + return supported & BIT(reason); } @@ -945,30 +1045,32 @@ out: void avic_vcpu_load(struct kvm_vcpu *vcpu, int cpu) { u64 entry; - /* ID = 0xff (broadcast), ID > 0xff (reserved) */ int h_physical_id = kvm_cpu_get_apicid(cpu); struct vcpu_svm *svm = to_svm(vcpu); + lockdep_assert_preemption_disabled(); + + if (WARN_ON(h_physical_id & ~AVIC_PHYSICAL_ID_ENTRY_HOST_PHYSICAL_ID_MASK)) + return; + /* - * Since the host physical APIC id is 8 bits, - * we can support host APIC ID upto 255. + * No need to update anything if the vCPU is blocking, i.e. if the vCPU + * is being scheduled in after being preempted. The CPU entries in the + * Physical APIC table and IRTE are consumed iff IsRun{ning} is '1'. + * If the vCPU was migrated, its new CPU value will be stuffed when the + * vCPU unblocks. */ - if (WARN_ON(h_physical_id > AVIC_PHYSICAL_ID_ENTRY_HOST_PHYSICAL_ID_MASK)) + if (kvm_vcpu_is_blocking(vcpu)) return; entry = READ_ONCE(*(svm->avic_physical_id_cache)); - WARN_ON(entry & AVIC_PHYSICAL_ID_ENTRY_IS_RUNNING_MASK); entry &= ~AVIC_PHYSICAL_ID_ENTRY_HOST_PHYSICAL_ID_MASK; entry |= (h_physical_id & AVIC_PHYSICAL_ID_ENTRY_HOST_PHYSICAL_ID_MASK); - - entry &= ~AVIC_PHYSICAL_ID_ENTRY_IS_RUNNING_MASK; - if (svm->avic_is_running) - entry |= AVIC_PHYSICAL_ID_ENTRY_IS_RUNNING_MASK; + entry |= AVIC_PHYSICAL_ID_ENTRY_IS_RUNNING_MASK; WRITE_ONCE(*(svm->avic_physical_id_cache), entry); - avic_update_iommu_vcpu_affinity(vcpu, h_physical_id, - svm->avic_is_running); + avic_update_iommu_vcpu_affinity(vcpu, h_physical_id, true); } void avic_vcpu_put(struct kvm_vcpu *vcpu) @@ -976,42 +1078,119 @@ void avic_vcpu_put(struct kvm_vcpu *vcpu) u64 entry; struct vcpu_svm *svm = to_svm(vcpu); + lockdep_assert_preemption_disabled(); + entry = READ_ONCE(*(svm->avic_physical_id_cache)); - if (entry & AVIC_PHYSICAL_ID_ENTRY_IS_RUNNING_MASK) - avic_update_iommu_vcpu_affinity(vcpu, -1, 0); + + /* Nothing to do if IsRunning == '0' due to vCPU blocking. */ + if (!(entry & AVIC_PHYSICAL_ID_ENTRY_IS_RUNNING_MASK)) + return; + + avic_update_iommu_vcpu_affinity(vcpu, -1, 0); entry &= ~AVIC_PHYSICAL_ID_ENTRY_IS_RUNNING_MASK; WRITE_ONCE(*(svm->avic_physical_id_cache), entry); } -/* - * This function is called during VCPU halt/unhalt. - */ -static void avic_set_running(struct kvm_vcpu *vcpu, bool is_run) + +void avic_refresh_apicv_exec_ctrl(struct kvm_vcpu *vcpu) { struct vcpu_svm *svm = to_svm(vcpu); - int cpu = get_cpu(); + struct vmcb *vmcb = svm->vmcb01.ptr; + bool activated = kvm_vcpu_apicv_active(vcpu); - WARN_ON(cpu != vcpu->cpu); - svm->avic_is_running = is_run; + if (!enable_apicv) + return; - if (kvm_vcpu_apicv_active(vcpu)) { - if (is_run) - avic_vcpu_load(vcpu, cpu); - else - avic_vcpu_put(vcpu); + if (activated) { + /** + * During AVIC temporary deactivation, guest could update + * APIC ID, DFR and LDR registers, which would not be trapped + * by avic_unaccelerated_access_interception(). In this case, + * we need to check and update the AVIC logical APIC ID table + * accordingly before re-activating. + */ + avic_apicv_post_state_restore(vcpu); + avic_activate_vmcb(svm); + } else { + avic_deactivate_vmcb(svm); } - put_cpu(); + vmcb_mark_dirty(vmcb, VMCB_AVIC); + + if (activated) + avic_vcpu_load(vcpu, vcpu->cpu); + else + avic_vcpu_put(vcpu); + + avic_set_pi_irte_mode(vcpu, activated); } -void svm_vcpu_blocking(struct kvm_vcpu *vcpu) +void avic_vcpu_blocking(struct kvm_vcpu *vcpu) { - avic_set_running(vcpu, false); + if (!kvm_vcpu_apicv_active(vcpu)) + return; + + /* + * Unload the AVIC when the vCPU is about to block, _before_ + * the vCPU actually blocks. + * + * Any IRQs that arrive before IsRunning=0 will not cause an + * incomplete IPI vmexit on the source, therefore vIRR will also + * be checked by kvm_vcpu_check_block() before blocking. The + * memory barrier implicit in set_current_state orders writing + * IsRunning=0 before reading the vIRR. The processor needs a + * matching memory barrier on interrupt delivery between writing + * IRR and reading IsRunning; the lack of this barrier might be + * the cause of errata #1235). + */ + avic_vcpu_put(vcpu); } -void svm_vcpu_unblocking(struct kvm_vcpu *vcpu) +void avic_vcpu_unblocking(struct kvm_vcpu *vcpu) { - if (kvm_check_request(KVM_REQ_APICV_UPDATE, vcpu)) - kvm_vcpu_update_apicv(vcpu); - avic_set_running(vcpu, true); + if (!kvm_vcpu_apicv_active(vcpu)) + return; + + avic_vcpu_load(vcpu, vcpu->cpu); +} + +/* + * Note: + * - The module param avic enable both xAPIC and x2APIC mode. + * - Hypervisor can support both xAVIC and x2AVIC in the same guest. + * - The mode can be switched at run-time. + */ +bool avic_hardware_setup(struct kvm_x86_ops *x86_ops) +{ + if (!npt_enabled) + return false; + + if (boot_cpu_has(X86_FEATURE_AVIC)) { + avic_mode = AVIC_MODE_X1; + pr_info("AVIC enabled\n"); + } else if (force_avic) { + /* + * Some older systems does not advertise AVIC support. + * See Revision Guide for specific AMD processor for more detail. + */ + avic_mode = AVIC_MODE_X1; + pr_warn("AVIC is not supported in CPUID but force enabled"); + pr_warn("Your system might crash and burn"); + } + + /* AVIC is a prerequisite for x2AVIC. */ + if (boot_cpu_has(X86_FEATURE_X2AVIC)) { + if (avic_mode == AVIC_MODE_X1) { + avic_mode = AVIC_MODE_X2; + pr_info("x2AVIC enabled\n"); + } else { + pr_warn(FW_BUG "Cannot support x2AVIC due to AVIC is disabled"); + pr_warn(FW_BUG "Try enable AVIC using force_avic option"); + } + } + + if (avic_mode != AVIC_MODE_NONE) + amd_iommu_register_ga_log_notifier(&avic_ga_log_notifier); + + return !!avic_mode; } diff --git a/arch/x86/kvm/svm/hyperv.h b/arch/x86/kvm/svm/hyperv.h new file mode 100644 index 000000000000..7d6d97968fb9 --- /dev/null +++ b/arch/x86/kvm/svm/hyperv.h @@ -0,0 +1,35 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +/* + * Common Hyper-V on KVM and KVM on Hyper-V definitions (SVM). + */ + +#ifndef __ARCH_X86_KVM_SVM_HYPERV_H__ +#define __ARCH_X86_KVM_SVM_HYPERV_H__ + +#include <asm/mshyperv.h> + +#include "../hyperv.h" + +/* + * Hyper-V uses the software reserved 32 bytes in VMCB + * control area to expose SVM enlightenments to guests. + */ +struct hv_enlightenments { + struct __packed hv_enlightenments_control { + u32 nested_flush_hypercall:1; + u32 msr_bitmap:1; + u32 enlightened_npt_tlb: 1; + u32 reserved:29; + } __packed hv_enlightenments_control; + u32 hv_vp_id; + u64 hv_vm_id; + u64 partition_assist_page; + u64 reserved; +} __packed; + +/* + * Hyper-V uses the software reserved clean bit in VMCB + */ +#define VMCB_HV_NESTED_ENLIGHTENMENTS VMCB_SW + +#endif /* __ARCH_X86_KVM_SVM_HYPERV_H__ */ diff --git a/arch/x86/kvm/svm/nested.c b/arch/x86/kvm/svm/nested.c index f8b7bc04b3e7..4c620999d230 100644 --- a/arch/x86/kvm/svm/nested.c +++ b/arch/x86/kvm/svm/nested.c @@ -28,6 +28,7 @@ #include "cpuid.h" #include "lapic.h" #include "svm.h" +#include "hyperv.h" #define CC KVM_NESTED_VMENTER_CONSISTENCY_CHECK @@ -35,41 +36,25 @@ static void nested_svm_inject_npf_exit(struct kvm_vcpu *vcpu, struct x86_exception *fault) { struct vcpu_svm *svm = to_svm(vcpu); + struct vmcb *vmcb = svm->vmcb; - if (svm->vmcb->control.exit_code != SVM_EXIT_NPF) { + if (vmcb->control.exit_code != SVM_EXIT_NPF) { /* * TODO: track the cause of the nested page fault, and * correctly fill in the high bits of exit_info_1. */ - svm->vmcb->control.exit_code = SVM_EXIT_NPF; - svm->vmcb->control.exit_code_hi = 0; - svm->vmcb->control.exit_info_1 = (1ULL << 32); - svm->vmcb->control.exit_info_2 = fault->address; + vmcb->control.exit_code = SVM_EXIT_NPF; + vmcb->control.exit_code_hi = 0; + vmcb->control.exit_info_1 = (1ULL << 32); + vmcb->control.exit_info_2 = fault->address; } - svm->vmcb->control.exit_info_1 &= ~0xffffffffULL; - svm->vmcb->control.exit_info_1 |= fault->error_code; + vmcb->control.exit_info_1 &= ~0xffffffffULL; + vmcb->control.exit_info_1 |= fault->error_code; nested_svm_vmexit(svm); } -static void svm_inject_page_fault_nested(struct kvm_vcpu *vcpu, struct x86_exception *fault) -{ - struct vcpu_svm *svm = to_svm(vcpu); - WARN_ON(!is_guest_mode(vcpu)); - - if (vmcb_is_intercept(&svm->nested.ctl, INTERCEPT_EXCEPTION_OFFSET + PF_VECTOR) && - !svm->nested.nested_run_pending) { - svm->vmcb->control.exit_code = SVM_EXIT_EXCP_BASE + PF_VECTOR; - svm->vmcb->control.exit_code_hi = 0; - svm->vmcb->control.exit_info_1 = fault->error_code; - svm->vmcb->control.exit_info_2 = fault->address; - nested_svm_vmexit(svm); - } else { - kvm_inject_page_fault(vcpu, fault); - } -} - static u64 nested_svm_get_tdp_pdptr(struct kvm_vcpu *vcpu, int index) { struct vcpu_svm *svm = to_svm(vcpu); @@ -119,9 +104,24 @@ static void nested_svm_uninit_mmu_context(struct kvm_vcpu *vcpu) vcpu->arch.walk_mmu = &vcpu->arch.root_mmu; } +static bool nested_vmcb_needs_vls_intercept(struct vcpu_svm *svm) +{ + if (!svm->v_vmload_vmsave_enabled) + return true; + + if (!nested_npt_enabled(svm)) + return true; + + if (!(svm->nested.ctl.virt_ext & VIRTUAL_VMLOAD_VMSAVE_ENABLE_MASK)) + return true; + + return false; +} + void recalc_intercepts(struct vcpu_svm *svm) { - struct vmcb_control_area *c, *h, *g; + struct vmcb_control_area *c, *h; + struct vmcb_ctrl_area_cached *g; unsigned int i; vmcb_mark_dirty(svm->vmcb, VMCB_INTERCEPTS); @@ -159,51 +159,45 @@ void recalc_intercepts(struct vcpu_svm *svm) if (!intercept_smi) vmcb_clr_intercept(c, INTERCEPT_SMI); - vmcb_set_intercept(c, INTERCEPT_VMLOAD); - vmcb_set_intercept(c, INTERCEPT_VMSAVE); -} - -static void copy_vmcb_control_area(struct vmcb_control_area *dst, - struct vmcb_control_area *from) -{ - unsigned int i; - - for (i = 0; i < MAX_INTERCEPT; i++) - dst->intercepts[i] = from->intercepts[i]; - - dst->iopm_base_pa = from->iopm_base_pa; - dst->msrpm_base_pa = from->msrpm_base_pa; - dst->tsc_offset = from->tsc_offset; - /* asid not copied, it is handled manually for svm->vmcb. */ - dst->tlb_ctl = from->tlb_ctl; - dst->int_ctl = from->int_ctl; - dst->int_vector = from->int_vector; - dst->int_state = from->int_state; - dst->exit_code = from->exit_code; - dst->exit_code_hi = from->exit_code_hi; - dst->exit_info_1 = from->exit_info_1; - dst->exit_info_2 = from->exit_info_2; - dst->exit_int_info = from->exit_int_info; - dst->exit_int_info_err = from->exit_int_info_err; - dst->nested_ctl = from->nested_ctl; - dst->event_inj = from->event_inj; - dst->event_inj_err = from->event_inj_err; - dst->nested_cr3 = from->nested_cr3; - dst->virt_ext = from->virt_ext; - dst->pause_filter_count = from->pause_filter_count; - dst->pause_filter_thresh = from->pause_filter_thresh; + if (nested_vmcb_needs_vls_intercept(svm)) { + /* + * If the virtual VMLOAD/VMSAVE is not enabled for the L2, + * we must intercept these instructions to correctly + * emulate them in case L1 doesn't intercept them. + */ + vmcb_set_intercept(c, INTERCEPT_VMLOAD); + vmcb_set_intercept(c, INTERCEPT_VMSAVE); + } else { + WARN_ON(!(c->virt_ext & VIRTUAL_VMLOAD_VMSAVE_ENABLE_MASK)); + } } +/* + * Merge L0's (KVM) and L1's (Nested VMCB) MSR permission bitmaps. The function + * is optimized in that it only merges the parts where KVM MSR permission bitmap + * may contain zero bits. + */ static bool nested_svm_vmrun_msrpm(struct vcpu_svm *svm) { + struct hv_enlightenments *hve = + (struct hv_enlightenments *)svm->nested.ctl.reserved_sw; + int i; + /* - * This function merges the msr permission bitmaps of kvm and the - * nested vmcb. It is optimized in that it only merges the parts where - * the kvm msr permission bitmap may contain zero bits + * MSR bitmap update can be skipped when: + * - MSR bitmap for L1 hasn't changed. + * - Nested hypervisor (L1) is attempting to launch the same L2 as + * before. + * - Nested hypervisor (L1) is using Hyper-V emulation interface and + * tells KVM (L0) there were no changes in MSR bitmap for L2. */ - int i; + if (!svm->nested.force_msr_bitmap_recalc && + kvm_hv_hypercall_enabled(&svm->vcpu) && + hve->hv_enlightenments_control.msr_bitmap && + (svm->nested.ctl.clean & BIT(VMCB_HV_NESTED_ENLIGHTENMENTS))) + goto set_msrpm_base_pa; - if (!(vmcb_is_intercept(&svm->nested.ctl, INTERCEPT_MSR_PROT))) + if (!(vmcb12_is_intercept(&svm->nested.ctl, INTERCEPT_MSR_PROT))) return true; for (i = 0; i < MSRPM_OFFSETS; i++) { @@ -214,6 +208,11 @@ static bool nested_svm_vmrun_msrpm(struct vcpu_svm *svm) break; p = msrpm_offsets[i]; + + /* x2apic msrs are intercepted always for the nested guest */ + if (is_x2apic_msrpm_offset(p)) + continue; + offset = svm->nested.ctl.msrpm_base_pa + (p * 4); if (kvm_vcpu_read_guest(&svm->vcpu, offset, &value, 4)) @@ -222,6 +221,9 @@ static bool nested_svm_vmrun_msrpm(struct vcpu_svm *svm) svm->nested.msrpm[p] = svm->msrpm[p] | value; } + svm->nested.force_msr_bitmap_recalc = false; + +set_msrpm_base_pa: svm->vmcb->control.msrpm_base_pa = __sme_set(__pa(svm->nested.msrpm)); return true; @@ -250,10 +252,10 @@ static bool nested_svm_check_tlb_ctl(struct kvm_vcpu *vcpu, u8 tlb_ctl) } } -static bool nested_vmcb_check_controls(struct kvm_vcpu *vcpu, - struct vmcb_control_area *control) +static bool __nested_vmcb_check_controls(struct kvm_vcpu *vcpu, + struct vmcb_ctrl_area_cached *control) { - if (CC(!vmcb_is_intercept(control, INTERCEPT_VMRUN))) + if (CC(!vmcb12_is_intercept(control, INTERCEPT_VMRUN))) return false; if (CC(control->asid == 0)) @@ -275,9 +277,20 @@ static bool nested_vmcb_check_controls(struct kvm_vcpu *vcpu, return true; } -static bool nested_vmcb_check_cr3_cr4(struct kvm_vcpu *vcpu, - struct vmcb_save_area *save) +/* Common checks that apply to both L1 and L2 state. */ +static bool __nested_vmcb_check_save(struct kvm_vcpu *vcpu, + struct vmcb_save_area_cached *save) { + if (CC(!(save->efer & EFER_SVME))) + return false; + + if (CC((save->cr0 & X86_CR0_CD) == 0 && (save->cr0 & X86_CR0_NW)) || + CC(save->cr0 & ~0xffffffffULL)) + return false; + + if (CC(!kvm_dr6_valid(save->dr6)) || CC(!kvm_dr7_valid(save->dr7))) + return false; + /* * These checks are also performed by KVM_SET_SREGS, * except that EFER.LMA is not checked by SVM against @@ -290,51 +303,103 @@ static bool nested_vmcb_check_cr3_cr4(struct kvm_vcpu *vcpu, return false; } - if (CC(!kvm_is_valid_cr4(vcpu, save->cr4))) + /* Note, SVM doesn't have any additional restrictions on CR4. */ + if (CC(!__kvm_is_valid_cr4(vcpu, save->cr4))) + return false; + + if (CC(!kvm_valid_efer(vcpu, save->efer))) return false; return true; } -/* Common checks that apply to both L1 and L2 state. */ -static bool nested_vmcb_valid_sregs(struct kvm_vcpu *vcpu, - struct vmcb_save_area *save) +static bool nested_vmcb_check_save(struct kvm_vcpu *vcpu) { - /* - * FIXME: these should be done after copying the fields, - * to avoid TOC/TOU races. For these save area checks - * the possible damage is limited since kvm_set_cr0 and - * kvm_set_cr4 handle failure; EFER_SVME is an exception - * so it is force-set later in nested_prepare_vmcb_save. - */ - if (CC(!(save->efer & EFER_SVME))) - return false; + struct vcpu_svm *svm = to_svm(vcpu); + struct vmcb_save_area_cached *save = &svm->nested.save; - if (CC((save->cr0 & X86_CR0_CD) == 0 && (save->cr0 & X86_CR0_NW)) || - CC(save->cr0 & ~0xffffffffULL)) - return false; + return __nested_vmcb_check_save(vcpu, save); +} - if (CC(!kvm_dr6_valid(save->dr6)) || CC(!kvm_dr7_valid(save->dr7))) - return false; +static bool nested_vmcb_check_controls(struct kvm_vcpu *vcpu) +{ + struct vcpu_svm *svm = to_svm(vcpu); + struct vmcb_ctrl_area_cached *ctl = &svm->nested.ctl; - if (!nested_vmcb_check_cr3_cr4(vcpu, save)) - return false; + return __nested_vmcb_check_controls(vcpu, ctl); +} - if (CC(!kvm_valid_efer(vcpu, save->efer))) - return false; +static +void __nested_copy_vmcb_control_to_cache(struct kvm_vcpu *vcpu, + struct vmcb_ctrl_area_cached *to, + struct vmcb_control_area *from) +{ + unsigned int i; - return true; + for (i = 0; i < MAX_INTERCEPT; i++) + to->intercepts[i] = from->intercepts[i]; + + to->iopm_base_pa = from->iopm_base_pa; + to->msrpm_base_pa = from->msrpm_base_pa; + to->tsc_offset = from->tsc_offset; + to->tlb_ctl = from->tlb_ctl; + to->int_ctl = from->int_ctl; + to->int_vector = from->int_vector; + to->int_state = from->int_state; + to->exit_code = from->exit_code; + to->exit_code_hi = from->exit_code_hi; + to->exit_info_1 = from->exit_info_1; + to->exit_info_2 = from->exit_info_2; + to->exit_int_info = from->exit_int_info; + to->exit_int_info_err = from->exit_int_info_err; + to->nested_ctl = from->nested_ctl; + to->event_inj = from->event_inj; + to->event_inj_err = from->event_inj_err; + to->next_rip = from->next_rip; + to->nested_cr3 = from->nested_cr3; + to->virt_ext = from->virt_ext; + to->pause_filter_count = from->pause_filter_count; + to->pause_filter_thresh = from->pause_filter_thresh; + + /* Copy asid here because nested_vmcb_check_controls will check it. */ + to->asid = from->asid; + to->msrpm_base_pa &= ~0x0fffULL; + to->iopm_base_pa &= ~0x0fffULL; + + /* Hyper-V extensions (Enlightened VMCB) */ + if (kvm_hv_hypercall_enabled(vcpu)) { + to->clean = from->clean; + memcpy(to->reserved_sw, from->reserved_sw, + sizeof(struct hv_enlightenments)); + } } -void nested_load_control_from_vmcb12(struct vcpu_svm *svm, - struct vmcb_control_area *control) +void nested_copy_vmcb_control_to_cache(struct vcpu_svm *svm, + struct vmcb_control_area *control) +{ + __nested_copy_vmcb_control_to_cache(&svm->vcpu, &svm->nested.ctl, control); +} + +static void __nested_copy_vmcb_save_to_cache(struct vmcb_save_area_cached *to, + struct vmcb_save_area *from) { - copy_vmcb_control_area(&svm->nested.ctl, control); + /* + * Copy only fields that are validated, as we need them + * to avoid TOC/TOU races. + */ + to->efer = from->efer; + to->cr0 = from->cr0; + to->cr3 = from->cr3; + to->cr4 = from->cr4; - /* Copy it here because nested_svm_check_controls will check it. */ - svm->nested.ctl.asid = control->asid; - svm->nested.ctl.msrpm_base_pa &= ~0x0fffULL; - svm->nested.ctl.iopm_base_pa &= ~0x0fffULL; + to->dr6 = from->dr6; + to->dr7 = from->dr7; +} + +void nested_copy_vmcb_save_to_cache(struct vcpu_svm *svm, + struct vmcb_save_area *save) +{ + __nested_copy_vmcb_save_to_cache(&svm->nested.save, save); } /* @@ -361,6 +426,10 @@ void nested_sync_control_from_vmcb02(struct vcpu_svm *svm) */ mask &= ~V_IRQ_MASK; } + + if (nested_vgif_enabled(svm)) + mask |= V_GIF_MASK; + svm->nested.ctl.int_ctl &= ~mask; svm->nested.ctl.int_ctl |= svm->vmcb->control.int_ctl & mask; } @@ -377,7 +446,7 @@ static void nested_save_pending_event_to_vmcb12(struct vcpu_svm *svm, unsigned int nr; if (vcpu->arch.exception.injected) { - nr = vcpu->arch.exception.nr; + nr = vcpu->arch.exception.vector; exit_int_info = nr | SVM_EVTINJ_VALID | SVM_EVTINJ_TYPE_EXEPT; if (vcpu->arch.exception.has_error_code) { @@ -402,11 +471,6 @@ static void nested_save_pending_event_to_vmcb12(struct vcpu_svm *svm, vmcb12->control.exit_int_info = exit_int_info; } -static inline bool nested_npt_enabled(struct vcpu_svm *svm) -{ - return svm->nested.ctl.nested_ctl & SVM_NESTED_CTL_NP_ENABLE; -} - static void nested_svm_transition_tlb_flush(struct kvm_vcpu *vcpu) { /* @@ -437,18 +501,17 @@ static int nested_svm_load_cr3(struct kvm_vcpu *vcpu, unsigned long cr3, return -EINVAL; if (reload_pdptrs && !nested_npt && is_pae_paging(vcpu) && - CC(!load_pdptrs(vcpu, vcpu->arch.walk_mmu, cr3))) + CC(!load_pdptrs(vcpu, cr3))) return -EINVAL; - if (!nested_npt) - kvm_mmu_new_pgd(vcpu, cr3); - vcpu->arch.cr3 = cr3; - kvm_register_mark_available(vcpu, VCPU_EXREG_CR3); /* Re-initialize the MMU, e.g. to pick up CR4 MMU role changes. */ kvm_init_mmu(vcpu); + if (!nested_npt) + kvm_mmu_new_pgd(vcpu, cr3); + return 0; } @@ -464,6 +527,8 @@ void nested_vmcb02_compute_g_pat(struct vcpu_svm *svm) static void nested_vmcb02_prepare_save(struct vcpu_svm *svm, struct vmcb *vmcb12) { bool new_vmcb12 = false; + struct vmcb *vmcb01 = svm->vmcb01.ptr; + struct vmcb *vmcb02 = svm->nested.vmcb02.ptr; nested_vmcb02_compute_g_pat(svm); @@ -471,34 +536,30 @@ static void nested_vmcb02_prepare_save(struct vcpu_svm *svm, struct vmcb *vmcb12 if (svm->nested.vmcb12_gpa != svm->nested.last_vmcb12_gpa) { new_vmcb12 = true; svm->nested.last_vmcb12_gpa = svm->nested.vmcb12_gpa; + svm->nested.force_msr_bitmap_recalc = true; } if (unlikely(new_vmcb12 || vmcb_is_dirty(vmcb12, VMCB_SEG))) { - svm->vmcb->save.es = vmcb12->save.es; - svm->vmcb->save.cs = vmcb12->save.cs; - svm->vmcb->save.ss = vmcb12->save.ss; - svm->vmcb->save.ds = vmcb12->save.ds; - svm->vmcb->save.cpl = vmcb12->save.cpl; - vmcb_mark_dirty(svm->vmcb, VMCB_SEG); + vmcb02->save.es = vmcb12->save.es; + vmcb02->save.cs = vmcb12->save.cs; + vmcb02->save.ss = vmcb12->save.ss; + vmcb02->save.ds = vmcb12->save.ds; + vmcb02->save.cpl = vmcb12->save.cpl; + vmcb_mark_dirty(vmcb02, VMCB_SEG); } if (unlikely(new_vmcb12 || vmcb_is_dirty(vmcb12, VMCB_DT))) { - svm->vmcb->save.gdtr = vmcb12->save.gdtr; - svm->vmcb->save.idtr = vmcb12->save.idtr; - vmcb_mark_dirty(svm->vmcb, VMCB_DT); + vmcb02->save.gdtr = vmcb12->save.gdtr; + vmcb02->save.idtr = vmcb12->save.idtr; + vmcb_mark_dirty(vmcb02, VMCB_DT); } kvm_set_rflags(&svm->vcpu, vmcb12->save.rflags | X86_EFLAGS_FIXED); - /* - * Force-set EFER_SVME even though it is checked earlier on the - * VMCB12, because the guest can flip the bit between the check - * and now. Clearing EFER_SVME would call svm_free_nested. - */ - svm_set_efer(&svm->vcpu, vmcb12->save.efer | EFER_SVME); + svm_set_efer(&svm->vcpu, svm->nested.save.efer); - svm_set_cr0(&svm->vcpu, vmcb12->save.cr0); - svm_set_cr4(&svm->vcpu, vmcb12->save.cr4); + svm_set_cr0(&svm->vcpu, svm->nested.save.cr0); + svm_set_cr4(&svm->vcpu, svm->nested.save.cr4); svm->vcpu.arch.cr2 = vmcb12->save.cr2; @@ -507,47 +568,87 @@ static void nested_vmcb02_prepare_save(struct vcpu_svm *svm, struct vmcb *vmcb12 kvm_rip_write(&svm->vcpu, vmcb12->save.rip); /* In case we don't even reach vcpu_run, the fields are not updated */ - svm->vmcb->save.rax = vmcb12->save.rax; - svm->vmcb->save.rsp = vmcb12->save.rsp; - svm->vmcb->save.rip = vmcb12->save.rip; + vmcb02->save.rax = vmcb12->save.rax; + vmcb02->save.rsp = vmcb12->save.rsp; + vmcb02->save.rip = vmcb12->save.rip; /* These bits will be set properly on the first execution when new_vmc12 is true */ if (unlikely(new_vmcb12 || vmcb_is_dirty(vmcb12, VMCB_DR))) { - svm->vmcb->save.dr7 = vmcb12->save.dr7 | DR7_FIXED_1; - svm->vcpu.arch.dr6 = vmcb12->save.dr6 | DR6_ACTIVE_LOW; - vmcb_mark_dirty(svm->vmcb, VMCB_DR); + vmcb02->save.dr7 = svm->nested.save.dr7 | DR7_FIXED_1; + svm->vcpu.arch.dr6 = svm->nested.save.dr6 | DR6_ACTIVE_LOW; + vmcb_mark_dirty(vmcb02, VMCB_DR); + } + + if (unlikely(svm->lbrv_enabled && (svm->nested.ctl.virt_ext & LBR_CTL_ENABLE_MASK))) { + /* + * Reserved bits of DEBUGCTL are ignored. Be consistent with + * svm_set_msr's definition of reserved bits. + */ + svm_copy_lbrs(vmcb02, vmcb12); + vmcb02->save.dbgctl &= ~DEBUGCTL_RESERVED_BITS; + svm_update_lbrv(&svm->vcpu); + + } else if (unlikely(vmcb01->control.virt_ext & LBR_CTL_ENABLE_MASK)) { + svm_copy_lbrs(vmcb02, vmcb01); } } -static void nested_vmcb02_prepare_control(struct vcpu_svm *svm) +static inline bool is_evtinj_soft(u32 evtinj) { - const u32 int_ctl_vmcb01_bits = - V_INTR_MASKING_MASK | V_GIF_MASK | V_GIF_ENABLE_MASK; + u32 type = evtinj & SVM_EVTINJ_TYPE_MASK; + u8 vector = evtinj & SVM_EVTINJ_VEC_MASK; - const u32 int_ctl_vmcb12_bits = V_TPR_MASK | V_IRQ_INJECTION_BITS_MASK; + if (!(evtinj & SVM_EVTINJ_VALID)) + return false; + + if (type == SVM_EVTINJ_TYPE_SOFT) + return true; + + return type == SVM_EVTINJ_TYPE_EXEPT && kvm_exception_is_soft(vector); +} + +static bool is_evtinj_nmi(u32 evtinj) +{ + u32 type = evtinj & SVM_EVTINJ_TYPE_MASK; + + if (!(evtinj & SVM_EVTINJ_VALID)) + return false; + + return type == SVM_EVTINJ_TYPE_NMI; +} + +static void nested_vmcb02_prepare_control(struct vcpu_svm *svm, + unsigned long vmcb12_rip, + unsigned long vmcb12_csbase) +{ + u32 int_ctl_vmcb01_bits = V_INTR_MASKING_MASK; + u32 int_ctl_vmcb12_bits = V_TPR_MASK | V_IRQ_INJECTION_BITS_MASK; struct kvm_vcpu *vcpu = &svm->vcpu; + struct vmcb *vmcb01 = svm->vmcb01.ptr; + struct vmcb *vmcb02 = svm->nested.vmcb02.ptr; + u32 pause_count12; + u32 pause_thresh12; /* * Filled at exit: exit_code, exit_code_hi, exit_info_1, exit_info_2, * exit_int_info, exit_int_info_err, next_rip, insn_len, insn_bytes. */ - /* - * Also covers avic_vapic_bar, avic_backing_page, avic_logical_id, - * avic_physical_id. - */ - WARN_ON(kvm_apicv_activated(svm->vcpu.kvm)); + if (svm->vgif_enabled && (svm->nested.ctl.int_ctl & V_GIF_ENABLE_MASK)) + int_ctl_vmcb12_bits |= (V_GIF_MASK | V_GIF_ENABLE_MASK); + else + int_ctl_vmcb01_bits |= (V_GIF_MASK | V_GIF_ENABLE_MASK); /* Copied from vmcb01. msrpm_base can be overwritten later. */ - svm->vmcb->control.nested_ctl = svm->vmcb01.ptr->control.nested_ctl; - svm->vmcb->control.iopm_base_pa = svm->vmcb01.ptr->control.iopm_base_pa; - svm->vmcb->control.msrpm_base_pa = svm->vmcb01.ptr->control.msrpm_base_pa; + vmcb02->control.nested_ctl = vmcb01->control.nested_ctl; + vmcb02->control.iopm_base_pa = vmcb01->control.iopm_base_pa; + vmcb02->control.msrpm_base_pa = vmcb01->control.msrpm_base_pa; /* Done at vmrun: asid. */ /* Also overwritten later if necessary. */ - svm->vmcb->control.tlb_ctl = TLB_CONTROL_DO_NOTHING; + vmcb02->control.tlb_ctl = TLB_CONTROL_DO_NOTHING; /* nested_cr3. */ if (nested_npt_enabled(svm)) @@ -558,21 +659,75 @@ static void nested_vmcb02_prepare_control(struct vcpu_svm *svm) svm->nested.ctl.tsc_offset, svm->tsc_ratio_msr); - svm->vmcb->control.tsc_offset = vcpu->arch.tsc_offset; + vmcb02->control.tsc_offset = vcpu->arch.tsc_offset; - if (svm->tsc_ratio_msr != kvm_default_tsc_scaling_ratio) { + if (svm->tsc_ratio_msr != kvm_caps.default_tsc_scaling_ratio) { WARN_ON(!svm->tsc_scaling_enabled); nested_svm_update_tsc_ratio_msr(vcpu); } - svm->vmcb->control.int_ctl = + vmcb02->control.int_ctl = (svm->nested.ctl.int_ctl & int_ctl_vmcb12_bits) | - (svm->vmcb01.ptr->control.int_ctl & int_ctl_vmcb01_bits); + (vmcb01->control.int_ctl & int_ctl_vmcb01_bits); + + vmcb02->control.int_vector = svm->nested.ctl.int_vector; + vmcb02->control.int_state = svm->nested.ctl.int_state; + vmcb02->control.event_inj = svm->nested.ctl.event_inj; + vmcb02->control.event_inj_err = svm->nested.ctl.event_inj_err; - svm->vmcb->control.int_vector = svm->nested.ctl.int_vector; - svm->vmcb->control.int_state = svm->nested.ctl.int_state; - svm->vmcb->control.event_inj = svm->nested.ctl.event_inj; - svm->vmcb->control.event_inj_err = svm->nested.ctl.event_inj_err; + /* + * next_rip is consumed on VMRUN as the return address pushed on the + * stack for injected soft exceptions/interrupts. If nrips is exposed + * to L1, take it verbatim from vmcb12. If nrips is supported in + * hardware but not exposed to L1, stuff the actual L2 RIP to emulate + * what a nrips=0 CPU would do (L1 is responsible for advancing RIP + * prior to injecting the event). + */ + if (svm->nrips_enabled) + vmcb02->control.next_rip = svm->nested.ctl.next_rip; + else if (boot_cpu_has(X86_FEATURE_NRIPS)) + vmcb02->control.next_rip = vmcb12_rip; + + svm->nmi_l1_to_l2 = is_evtinj_nmi(vmcb02->control.event_inj); + if (is_evtinj_soft(vmcb02->control.event_inj)) { + svm->soft_int_injected = true; + svm->soft_int_csbase = vmcb12_csbase; + svm->soft_int_old_rip = vmcb12_rip; + if (svm->nrips_enabled) + svm->soft_int_next_rip = svm->nested.ctl.next_rip; + else + svm->soft_int_next_rip = vmcb12_rip; + } + + vmcb02->control.virt_ext = vmcb01->control.virt_ext & + LBR_CTL_ENABLE_MASK; + if (svm->lbrv_enabled) + vmcb02->control.virt_ext |= + (svm->nested.ctl.virt_ext & LBR_CTL_ENABLE_MASK); + + if (!nested_vmcb_needs_vls_intercept(svm)) + vmcb02->control.virt_ext |= VIRTUAL_VMLOAD_VMSAVE_ENABLE_MASK; + + pause_count12 = svm->pause_filter_enabled ? svm->nested.ctl.pause_filter_count : 0; + pause_thresh12 = svm->pause_threshold_enabled ? svm->nested.ctl.pause_filter_thresh : 0; + if (kvm_pause_in_guest(svm->vcpu.kvm)) { + /* use guest values since host doesn't intercept PAUSE */ + vmcb02->control.pause_filter_count = pause_count12; + vmcb02->control.pause_filter_thresh = pause_thresh12; + + } else { + /* start from host values otherwise */ + vmcb02->control.pause_filter_count = vmcb01->control.pause_filter_count; + vmcb02->control.pause_filter_thresh = vmcb01->control.pause_filter_thresh; + + /* ... but ensure filtering is disabled if so requested. */ + if (vmcb12_is_intercept(&svm->nested.ctl, INTERCEPT_PAUSE)) { + if (!pause_count12) + vmcb02->control.pause_filter_count = 0; + if (!pause_thresh12) + vmcb02->control.pause_filter_thresh = 0; + } + } nested_svm_transition_tlb_flush(vcpu); @@ -604,11 +759,15 @@ int enter_svm_guest_mode(struct kvm_vcpu *vcpu, u64 vmcb12_gpa, struct vcpu_svm *svm = to_svm(vcpu); int ret; - trace_kvm_nested_vmrun(svm->vmcb->save.rip, vmcb12_gpa, - vmcb12->save.rip, - vmcb12->control.int_ctl, - vmcb12->control.event_inj, - vmcb12->control.nested_ctl); + trace_kvm_nested_vmenter(svm->vmcb->save.rip, + vmcb12_gpa, + vmcb12->save.rip, + vmcb12->control.int_ctl, + vmcb12->control.event_inj, + vmcb12->control.nested_ctl, + vmcb12->control.nested_cr3, + vmcb12->save.cr3, + KVM_ISA_SVM); trace_kvm_nested_intercepts(vmcb12->control.intercepts[INTERCEPT_CR] & 0xffff, vmcb12->control.intercepts[INTERCEPT_CR] >> 16, @@ -625,22 +784,22 @@ int enter_svm_guest_mode(struct kvm_vcpu *vcpu, u64 vmcb12_gpa, nested_svm_copy_common_state(svm->vmcb01.ptr, svm->nested.vmcb02.ptr); svm_switch_vmcb(svm, &svm->nested.vmcb02); - nested_vmcb02_prepare_control(svm); + nested_vmcb02_prepare_control(svm, vmcb12->save.rip, vmcb12->save.cs.base); nested_vmcb02_prepare_save(svm, vmcb12); - ret = nested_svm_load_cr3(&svm->vcpu, vmcb12->save.cr3, + ret = nested_svm_load_cr3(&svm->vcpu, svm->nested.save.cr3, nested_npt_enabled(svm), from_vmrun); if (ret) return ret; - if (!npt_enabled) - vcpu->arch.mmu->inject_page_fault = svm_inject_page_fault_nested; - if (!from_vmrun) kvm_make_request(KVM_REQ_GET_NESTED_STATE_PAGES, vcpu); svm_set_gif(svm, true); + if (kvm_vcpu_apicv_active(vcpu)) + kvm_make_request(KVM_REQ_APICV_UPDATE, vcpu); + return 0; } @@ -651,6 +810,7 @@ int nested_svm_vmrun(struct kvm_vcpu *vcpu) struct vmcb *vmcb12; struct kvm_host_map map; u64 vmcb12_gpa; + struct vmcb *vmcb01 = svm->vmcb01.ptr; if (!svm->nested.hsave_msr) { kvm_inject_gp(vcpu, 0); @@ -678,10 +838,11 @@ int nested_svm_vmrun(struct kvm_vcpu *vcpu) if (WARN_ON_ONCE(!svm->nested.initialized)) return -EINVAL; - nested_load_control_from_vmcb12(svm, &vmcb12->control); + nested_copy_vmcb_control_to_cache(svm, &vmcb12->control); + nested_copy_vmcb_save_to_cache(svm, &vmcb12->save); - if (!nested_vmcb_valid_sregs(vcpu, &vmcb12->save) || - !nested_vmcb_check_controls(vcpu, &svm->nested.ctl)) { + if (!nested_vmcb_check_save(vcpu) || + !nested_vmcb_check_controls(vcpu)) { vmcb12->control.exit_code = SVM_EXIT_ERR; vmcb12->control.exit_code_hi = 0; vmcb12->control.exit_info_1 = 0; @@ -693,14 +854,14 @@ int nested_svm_vmrun(struct kvm_vcpu *vcpu) * Since vmcb01 is not in use, we can use it to store some of the L1 * state. */ - svm->vmcb01.ptr->save.efer = vcpu->arch.efer; - svm->vmcb01.ptr->save.cr0 = kvm_read_cr0(vcpu); - svm->vmcb01.ptr->save.cr4 = vcpu->arch.cr4; - svm->vmcb01.ptr->save.rflags = kvm_get_rflags(vcpu); - svm->vmcb01.ptr->save.rip = kvm_rip_read(vcpu); + vmcb01->save.efer = vcpu->arch.efer; + vmcb01->save.cr0 = kvm_read_cr0(vcpu); + vmcb01->save.cr4 = vcpu->arch.cr4; + vmcb01->save.rflags = kvm_get_rflags(vcpu); + vmcb01->save.rip = kvm_rip_read(vcpu); if (!npt_enabled) - svm->vmcb01.ptr->save.cr3 = kvm_read_cr3(vcpu); + vmcb01->save.cr3 = kvm_read_cr3(vcpu); svm->nested.nested_run_pending = 1; @@ -712,6 +873,8 @@ int nested_svm_vmrun(struct kvm_vcpu *vcpu) out_exit_err: svm->nested.nested_run_pending = 0; + svm->nmi_l1_to_l2 = false; + svm->soft_int_injected = false; svm->vmcb->control.exit_code = SVM_EXIT_ERR; svm->vmcb->control.exit_code_hi = 0; @@ -766,14 +929,12 @@ void svm_copy_vmloadsave_state(struct vmcb *to_vmcb, struct vmcb *from_vmcb) int nested_svm_vmexit(struct vcpu_svm *svm) { struct kvm_vcpu *vcpu = &svm->vcpu; + struct vmcb *vmcb01 = svm->vmcb01.ptr; + struct vmcb *vmcb02 = svm->nested.vmcb02.ptr; struct vmcb *vmcb12; - struct vmcb *vmcb = svm->vmcb; struct kvm_host_map map; int rc; - /* Triple faults in L2 should never escape. */ - WARN_ON_ONCE(kvm_check_request(KVM_REQ_TRIPLE_FAULT, vcpu)); - rc = kvm_vcpu_map(vcpu, gpa_to_gfn(svm->nested.vmcb12_gpa), &map); if (rc) { if (rc == -EINVAL) @@ -795,63 +956,77 @@ int nested_svm_vmexit(struct vcpu_svm *svm) /* Give the current vmcb to the guest */ - vmcb12->save.es = vmcb->save.es; - vmcb12->save.cs = vmcb->save.cs; - vmcb12->save.ss = vmcb->save.ss; - vmcb12->save.ds = vmcb->save.ds; - vmcb12->save.gdtr = vmcb->save.gdtr; - vmcb12->save.idtr = vmcb->save.idtr; + vmcb12->save.es = vmcb02->save.es; + vmcb12->save.cs = vmcb02->save.cs; + vmcb12->save.ss = vmcb02->save.ss; + vmcb12->save.ds = vmcb02->save.ds; + vmcb12->save.gdtr = vmcb02->save.gdtr; + vmcb12->save.idtr = vmcb02->save.idtr; vmcb12->save.efer = svm->vcpu.arch.efer; vmcb12->save.cr0 = kvm_read_cr0(vcpu); vmcb12->save.cr3 = kvm_read_cr3(vcpu); - vmcb12->save.cr2 = vmcb->save.cr2; + vmcb12->save.cr2 = vmcb02->save.cr2; vmcb12->save.cr4 = svm->vcpu.arch.cr4; vmcb12->save.rflags = kvm_get_rflags(vcpu); vmcb12->save.rip = kvm_rip_read(vcpu); vmcb12->save.rsp = kvm_rsp_read(vcpu); vmcb12->save.rax = kvm_rax_read(vcpu); - vmcb12->save.dr7 = vmcb->save.dr7; + vmcb12->save.dr7 = vmcb02->save.dr7; vmcb12->save.dr6 = svm->vcpu.arch.dr6; - vmcb12->save.cpl = vmcb->save.cpl; + vmcb12->save.cpl = vmcb02->save.cpl; - vmcb12->control.int_state = vmcb->control.int_state; - vmcb12->control.exit_code = vmcb->control.exit_code; - vmcb12->control.exit_code_hi = vmcb->control.exit_code_hi; - vmcb12->control.exit_info_1 = vmcb->control.exit_info_1; - vmcb12->control.exit_info_2 = vmcb->control.exit_info_2; + vmcb12->control.int_state = vmcb02->control.int_state; + vmcb12->control.exit_code = vmcb02->control.exit_code; + vmcb12->control.exit_code_hi = vmcb02->control.exit_code_hi; + vmcb12->control.exit_info_1 = vmcb02->control.exit_info_1; + vmcb12->control.exit_info_2 = vmcb02->control.exit_info_2; if (vmcb12->control.exit_code != SVM_EXIT_ERR) nested_save_pending_event_to_vmcb12(svm, vmcb12); if (svm->nrips_enabled) - vmcb12->control.next_rip = vmcb->control.next_rip; + vmcb12->control.next_rip = vmcb02->control.next_rip; vmcb12->control.int_ctl = svm->nested.ctl.int_ctl; vmcb12->control.tlb_ctl = svm->nested.ctl.tlb_ctl; vmcb12->control.event_inj = svm->nested.ctl.event_inj; vmcb12->control.event_inj_err = svm->nested.ctl.event_inj_err; + if (!kvm_pause_in_guest(vcpu->kvm)) { + vmcb01->control.pause_filter_count = vmcb02->control.pause_filter_count; + vmcb_mark_dirty(vmcb01, VMCB_INTERCEPTS); + + } + nested_svm_copy_common_state(svm->nested.vmcb02.ptr, svm->vmcb01.ptr); svm_switch_vmcb(svm, &svm->vmcb01); + if (unlikely(svm->lbrv_enabled && (svm->nested.ctl.virt_ext & LBR_CTL_ENABLE_MASK))) { + svm_copy_lbrs(vmcb12, vmcb02); + svm_update_lbrv(vcpu); + } else if (unlikely(vmcb01->control.virt_ext & LBR_CTL_ENABLE_MASK)) { + svm_copy_lbrs(vmcb01, vmcb02); + svm_update_lbrv(vcpu); + } + /* * On vmexit the GIF is set to false and * no event can be injected in L1. */ svm_set_gif(svm, false); - svm->vmcb->control.exit_int_info = 0; + vmcb01->control.exit_int_info = 0; svm->vcpu.arch.tsc_offset = svm->vcpu.arch.l1_tsc_offset; - if (svm->vmcb->control.tsc_offset != svm->vcpu.arch.tsc_offset) { - svm->vmcb->control.tsc_offset = svm->vcpu.arch.tsc_offset; - vmcb_mark_dirty(svm->vmcb, VMCB_INTERCEPTS); + if (vmcb01->control.tsc_offset != svm->vcpu.arch.tsc_offset) { + vmcb01->control.tsc_offset = svm->vcpu.arch.tsc_offset; + vmcb_mark_dirty(vmcb01, VMCB_INTERCEPTS); } - if (svm->tsc_ratio_msr != kvm_default_tsc_scaling_ratio) { + if (svm->tsc_ratio_msr != kvm_caps.default_tsc_scaling_ratio) { WARN_ON(!svm->tsc_scaling_enabled); vcpu->arch.tsc_scaling_ratio = vcpu->arch.l1_tsc_scaling_ratio; - svm_write_tsc_multiplier(vcpu, vcpu->arch.tsc_scaling_ratio); + __svm_write_tsc_multiplier(vcpu->arch.tsc_scaling_ratio); } svm->nested.ctl.nested_cr3 = 0; @@ -859,13 +1034,13 @@ int nested_svm_vmexit(struct vcpu_svm *svm) /* * Restore processor state that had been saved in vmcb01 */ - kvm_set_rflags(vcpu, svm->vmcb->save.rflags); - svm_set_efer(vcpu, svm->vmcb->save.efer); - svm_set_cr0(vcpu, svm->vmcb->save.cr0 | X86_CR0_PE); - svm_set_cr4(vcpu, svm->vmcb->save.cr4); - kvm_rax_write(vcpu, svm->vmcb->save.rax); - kvm_rsp_write(vcpu, svm->vmcb->save.rsp); - kvm_rip_write(vcpu, svm->vmcb->save.rip); + kvm_set_rflags(vcpu, vmcb01->save.rflags); + svm_set_efer(vcpu, vmcb01->save.efer); + svm_set_cr0(vcpu, vmcb01->save.cr0 | X86_CR0_PE); + svm_set_cr4(vcpu, vmcb01->save.cr4); + kvm_rax_write(vcpu, vmcb01->save.rax); + kvm_rsp_write(vcpu, vmcb01->save.rsp); + kvm_rip_write(vcpu, vmcb01->save.rip); svm->vcpu.arch.dr7 = DR7_FIXED_1; kvm_update_dr7(&svm->vcpu); @@ -883,7 +1058,7 @@ int nested_svm_vmexit(struct vcpu_svm *svm) nested_svm_uninit_mmu_context(vcpu); - rc = nested_svm_load_cr3(vcpu, svm->vmcb->save.cr3, false, true); + rc = nested_svm_load_cr3(vcpu, vmcb01->save.cr3, false, true); if (rc) return 1; @@ -901,9 +1076,16 @@ int nested_svm_vmexit(struct vcpu_svm *svm) * right now so that it an be accounted for before we execute * L1's next instruction. */ - if (unlikely(svm->vmcb->save.rflags & X86_EFLAGS_TF)) + if (unlikely(vmcb01->save.rflags & X86_EFLAGS_TF)) kvm_queue_exception(&(svm->vcpu), DB_VECTOR); + /* + * Un-inhibit the AVIC right away, so that other vCPUs can start + * to benefit from it right away. + */ + if (kvm_apicv_activated(vcpu->kvm)) + kvm_vcpu_update_apicv(vcpu); + return 0; } @@ -964,9 +1146,9 @@ void svm_free_nested(struct vcpu_svm *svm) /* * Forcibly leave nested mode in order to be able to reset the VCPU later on. */ -void svm_leave_nested(struct vcpu_svm *svm) +void svm_leave_nested(struct kvm_vcpu *vcpu) { - struct kvm_vcpu *vcpu = &svm->vcpu; + struct vcpu_svm *svm = to_svm(vcpu); if (is_guest_mode(vcpu)) { svm->nested.nested_run_pending = 0; @@ -988,7 +1170,7 @@ static int nested_svm_exit_handled_msr(struct vcpu_svm *svm) u32 offset, msr, value; int write, mask; - if (!(vmcb_is_intercept(&svm->nested.ctl, INTERCEPT_MSR_PROT))) + if (!(vmcb12_is_intercept(&svm->nested.ctl, INTERCEPT_MSR_PROT))) return NESTED_EXIT_HOST; msr = svm->vcpu.arch.regs[VCPU_REGS_RCX]; @@ -1015,7 +1197,7 @@ static int nested_svm_intercept_ioio(struct vcpu_svm *svm) u8 start_bit; u64 gpa; - if (!(vmcb_is_intercept(&svm->nested.ctl, INTERCEPT_IOIO_PROT))) + if (!(vmcb12_is_intercept(&svm->nested.ctl, INTERCEPT_IOIO_PROT))) return NESTED_EXIT_HOST; port = svm->vmcb->control.exit_info_1 >> 16; @@ -1046,12 +1228,12 @@ static int nested_svm_intercept(struct vcpu_svm *svm) vmexit = nested_svm_intercept_ioio(svm); break; case SVM_EXIT_READ_CR0 ... SVM_EXIT_WRITE_CR8: { - if (vmcb_is_intercept(&svm->nested.ctl, exit_code)) + if (vmcb12_is_intercept(&svm->nested.ctl, exit_code)) vmexit = NESTED_EXIT_DONE; break; } case SVM_EXIT_READ_DR0 ... SVM_EXIT_WRITE_DR7: { - if (vmcb_is_intercept(&svm->nested.ctl, exit_code)) + if (vmcb12_is_intercept(&svm->nested.ctl, exit_code)) vmexit = NESTED_EXIT_DONE; break; } @@ -1069,7 +1251,7 @@ static int nested_svm_intercept(struct vcpu_svm *svm) break; } default: { - if (vmcb_is_intercept(&svm->nested.ctl, exit_code)) + if (vmcb12_is_intercept(&svm->nested.ctl, exit_code)) vmexit = NESTED_EXIT_DONE; } } @@ -1104,58 +1286,73 @@ int nested_svm_check_permissions(struct kvm_vcpu *vcpu) return 0; } -static bool nested_exit_on_exception(struct vcpu_svm *svm) +static bool nested_svm_is_exception_vmexit(struct kvm_vcpu *vcpu, u8 vector, + u32 error_code) { - unsigned int nr = svm->vcpu.arch.exception.nr; + struct vcpu_svm *svm = to_svm(vcpu); - return (svm->nested.ctl.intercepts[INTERCEPT_EXCEPTION] & BIT(nr)); + return (svm->nested.ctl.intercepts[INTERCEPT_EXCEPTION] & BIT(vector)); } -static void nested_svm_inject_exception_vmexit(struct vcpu_svm *svm) +static void nested_svm_inject_exception_vmexit(struct kvm_vcpu *vcpu) { - unsigned int nr = svm->vcpu.arch.exception.nr; + struct kvm_queued_exception *ex = &vcpu->arch.exception_vmexit; + struct vcpu_svm *svm = to_svm(vcpu); + struct vmcb *vmcb = svm->vmcb; - svm->vmcb->control.exit_code = SVM_EXIT_EXCP_BASE + nr; - svm->vmcb->control.exit_code_hi = 0; + vmcb->control.exit_code = SVM_EXIT_EXCP_BASE + ex->vector; + vmcb->control.exit_code_hi = 0; - if (svm->vcpu.arch.exception.has_error_code) - svm->vmcb->control.exit_info_1 = svm->vcpu.arch.exception.error_code; + if (ex->has_error_code) + vmcb->control.exit_info_1 = ex->error_code; /* * EXITINFO2 is undefined for all exception intercepts other * than #PF. */ - if (nr == PF_VECTOR) { - if (svm->vcpu.arch.exception.nested_apf) - svm->vmcb->control.exit_info_2 = svm->vcpu.arch.apf.nested_apf_token; - else if (svm->vcpu.arch.exception.has_payload) - svm->vmcb->control.exit_info_2 = svm->vcpu.arch.exception.payload; + if (ex->vector == PF_VECTOR) { + if (ex->has_payload) + vmcb->control.exit_info_2 = ex->payload; else - svm->vmcb->control.exit_info_2 = svm->vcpu.arch.cr2; - } else if (nr == DB_VECTOR) { - /* See inject_pending_event. */ - kvm_deliver_exception_payload(&svm->vcpu); - if (svm->vcpu.arch.dr7 & DR7_GD) { - svm->vcpu.arch.dr7 &= ~DR7_GD; - kvm_update_dr7(&svm->vcpu); + vmcb->control.exit_info_2 = vcpu->arch.cr2; + } else if (ex->vector == DB_VECTOR) { + /* See kvm_check_and_inject_events(). */ + kvm_deliver_exception_payload(vcpu, ex); + + if (vcpu->arch.dr7 & DR7_GD) { + vcpu->arch.dr7 &= ~DR7_GD; + kvm_update_dr7(vcpu); } - } else - WARN_ON(svm->vcpu.arch.exception.has_payload); + } else { + WARN_ON(ex->has_payload); + } nested_svm_vmexit(svm); } static inline bool nested_exit_on_init(struct vcpu_svm *svm) { - return vmcb_is_intercept(&svm->nested.ctl, INTERCEPT_INIT); + return vmcb12_is_intercept(&svm->nested.ctl, INTERCEPT_INIT); } static int svm_check_nested_events(struct kvm_vcpu *vcpu) { - struct vcpu_svm *svm = to_svm(vcpu); - bool block_nested_events = - kvm_event_needs_reinjection(vcpu) || svm->nested.nested_run_pending; struct kvm_lapic *apic = vcpu->arch.apic; + struct vcpu_svm *svm = to_svm(vcpu); + /* + * Only a pending nested run blocks a pending exception. If there is a + * previously injected event, the pending exception occurred while said + * event was being delivered and thus needs to be handled. + */ + bool block_nested_exceptions = svm->nested.nested_run_pending; + /* + * New events (not exceptions) are only recognized at instruction + * boundaries. If an event needs reinjection, then KVM is handling a + * VM-Exit that occurred _during_ instruction execution; new events are + * blocked until the instruction completes. + */ + bool block_nested_events = block_nested_exceptions || + kvm_event_needs_reinjection(vcpu); if (lapic_in_kernel(vcpu) && test_bit(KVM_APIC_INIT, &apic->pending_events)) { @@ -1167,18 +1364,16 @@ static int svm_check_nested_events(struct kvm_vcpu *vcpu) return 0; } - if (vcpu->arch.exception.pending) { - /* - * Only a pending nested run can block a pending exception. - * Otherwise an injected NMI/interrupt should either be - * lost or delivered to the nested hypervisor in the EXITINTINFO - * vmcb field, while delivering the pending exception. - */ - if (svm->nested.nested_run_pending) + if (vcpu->arch.exception_vmexit.pending) { + if (block_nested_exceptions) return -EBUSY; - if (!nested_exit_on_exception(svm)) - return 0; - nested_svm_inject_exception_vmexit(svm); + nested_svm_inject_exception_vmexit(vcpu); + return 0; + } + + if (vcpu->arch.exception.pending) { + if (block_nested_exceptions) + return -EBUSY; return 0; } @@ -1248,7 +1443,43 @@ void nested_svm_update_tsc_ratio_msr(struct kvm_vcpu *vcpu) vcpu->arch.tsc_scaling_ratio = kvm_calc_nested_tsc_multiplier(vcpu->arch.l1_tsc_scaling_ratio, svm->tsc_ratio_msr); - svm_write_tsc_multiplier(vcpu, vcpu->arch.tsc_scaling_ratio); + __svm_write_tsc_multiplier(vcpu->arch.tsc_scaling_ratio); +} + +/* Inverse operation of nested_copy_vmcb_control_to_cache(). asid is copied too. */ +static void nested_copy_vmcb_cache_to_control(struct vmcb_control_area *dst, + struct vmcb_ctrl_area_cached *from) +{ + unsigned int i; + + memset(dst, 0, sizeof(struct vmcb_control_area)); + + for (i = 0; i < MAX_INTERCEPT; i++) + dst->intercepts[i] = from->intercepts[i]; + + dst->iopm_base_pa = from->iopm_base_pa; + dst->msrpm_base_pa = from->msrpm_base_pa; + dst->tsc_offset = from->tsc_offset; + dst->asid = from->asid; + dst->tlb_ctl = from->tlb_ctl; + dst->int_ctl = from->int_ctl; + dst->int_vector = from->int_vector; + dst->int_state = from->int_state; + dst->exit_code = from->exit_code; + dst->exit_code_hi = from->exit_code_hi; + dst->exit_info_1 = from->exit_info_1; + dst->exit_info_2 = from->exit_info_2; + dst->exit_int_info = from->exit_int_info; + dst->exit_int_info_err = from->exit_int_info_err; + dst->nested_ctl = from->nested_ctl; + dst->event_inj = from->event_inj; + dst->event_inj_err = from->event_inj_err; + dst->next_rip = from->next_rip; + dst->nested_cr3 = from->nested_cr3; + dst->virt_ext = from->virt_ext; + dst->pause_filter_count = from->pause_filter_count; + dst->pause_filter_thresh = from->pause_filter_thresh; + /* 'clean' and 'reserved_sw' are not changed by KVM */ } static int svm_get_nested_state(struct kvm_vcpu *vcpu, @@ -1256,6 +1487,8 @@ static int svm_get_nested_state(struct kvm_vcpu *vcpu, u32 user_data_size) { struct vcpu_svm *svm; + struct vmcb_control_area *ctl; + unsigned long r; struct kvm_nested_state kvm_state = { .flags = 0, .format = KVM_STATE_NESTED_FORMAT_SVM, @@ -1297,9 +1530,18 @@ static int svm_get_nested_state(struct kvm_vcpu *vcpu, */ if (clear_user(user_vmcb, KVM_STATE_NESTED_SVM_VMCB_SIZE)) return -EFAULT; - if (copy_to_user(&user_vmcb->control, &svm->nested.ctl, - sizeof(user_vmcb->control))) + + ctl = kzalloc(sizeof(*ctl), GFP_KERNEL); + if (!ctl) + return -ENOMEM; + + nested_copy_vmcb_cache_to_control(ctl, &svm->nested.ctl); + r = copy_to_user(&user_vmcb->control, ctl, + sizeof(user_vmcb->control)); + kfree(ctl); + if (r) return -EFAULT; + if (copy_to_user(&user_vmcb->save, &svm->vmcb01.ptr->save, sizeof(user_vmcb->save))) return -EFAULT; @@ -1316,6 +1558,8 @@ static int svm_set_nested_state(struct kvm_vcpu *vcpu, &user_kvm_nested_state->data.svm[0]; struct vmcb_control_area *ctl; struct vmcb_save_area *save; + struct vmcb_save_area_cached save_cached; + struct vmcb_ctrl_area_cached ctl_cached; unsigned long cr0; int ret; @@ -1345,7 +1589,7 @@ static int svm_set_nested_state(struct kvm_vcpu *vcpu, return -EINVAL; if (!(kvm_state->flags & KVM_STATE_NESTED_GUEST_MODE)) { - svm_leave_nested(svm); + svm_leave_nested(vcpu); svm_set_gif(svm, !!(kvm_state->flags & KVM_STATE_NESTED_GIF_SET)); return 0; } @@ -1368,7 +1612,8 @@ static int svm_set_nested_state(struct kvm_vcpu *vcpu, goto out_free; ret = -EINVAL; - if (!nested_vmcb_check_controls(vcpu, ctl)) + __nested_copy_vmcb_control_to_cache(vcpu, &ctl_cached, ctl); + if (!__nested_vmcb_check_controls(vcpu, &ctl_cached)) goto out_free; /* @@ -1383,22 +1628,11 @@ static int svm_set_nested_state(struct kvm_vcpu *vcpu, * Validate host state saved from before VMRUN (see * nested_svm_check_permissions). */ + __nested_copy_vmcb_save_to_cache(&save_cached, save); if (!(save->cr0 & X86_CR0_PG) || !(save->cr0 & X86_CR0_PE) || (save->rflags & X86_EFLAGS_VM) || - !nested_vmcb_valid_sregs(vcpu, save)) - goto out_free; - - /* - * While the nested guest CR3 is already checked and set by - * KVM_SET_SREGS, it was set when nested state was yet loaded, - * thus MMU might not be initialized correctly. - * Set it again to fix this. - */ - - ret = nested_svm_load_cr3(&svm->vcpu, vcpu->arch.cr3, - nested_npt_enabled(svm), false); - if (WARN_ON_ONCE(ret)) + !__nested_vmcb_check_save(vcpu, &save_cached)) goto out_free; @@ -1410,7 +1644,7 @@ static int svm_set_nested_state(struct kvm_vcpu *vcpu, */ if (is_guest_mode(vcpu)) - svm_leave_nested(svm); + svm_leave_nested(vcpu); else svm->nested.vmcb02.ptr->save = svm->vmcb01.ptr->save; @@ -1422,10 +1656,25 @@ static int svm_set_nested_state(struct kvm_vcpu *vcpu, svm->nested.vmcb12_gpa = kvm_state->hdr.svm.vmcb_pa; svm_copy_vmrun_state(&svm->vmcb01.ptr->save, save); - nested_load_control_from_vmcb12(svm, ctl); + nested_copy_vmcb_control_to_cache(svm, ctl); svm_switch_vmcb(svm, &svm->nested.vmcb02); - nested_vmcb02_prepare_control(svm); + nested_vmcb02_prepare_control(svm, svm->vmcb->save.rip, svm->vmcb->save.cs.base); + + /* + * While the nested guest CR3 is already checked and set by + * KVM_SET_SREGS, it was set when nested state was yet loaded, + * thus MMU might not be initialized correctly. + * Set it again to fix this. + */ + + ret = nested_svm_load_cr3(&svm->vcpu, vcpu->arch.cr3, + nested_npt_enabled(svm), false); + if (WARN_ON_ONCE(ret)) + goto out_free; + + svm->nested.force_msr_bitmap_recalc = true; + kvm_make_request(KVM_REQ_GET_NESTED_STATE_PAGES, vcpu); ret = 0; out_free: @@ -1449,7 +1698,7 @@ static bool svm_get_nested_state_pages(struct kvm_vcpu *vcpu) * the guest CR3 might be restored prior to setting the nested * state which can lead to a load of wrong PDPTRs. */ - if (CC(!load_pdptrs(vcpu, vcpu->arch.walk_mmu, vcpu->arch.cr3))) + if (CC(!load_pdptrs(vcpu, vcpu->arch.cr3))) return false; if (!nested_svm_vmrun_msrpm(svm)) { @@ -1464,6 +1713,8 @@ static bool svm_get_nested_state_pages(struct kvm_vcpu *vcpu) } struct kvm_x86_nested_ops svm_nested_ops = { + .leave_nested = svm_leave_nested, + .is_exception_vmexit = nested_svm_is_exception_vmexit, .check_events = svm_check_nested_events, .triple_fault = nested_svm_triple_fault, .get_nested_state_pages = svm_get_nested_state_pages, diff --git a/arch/x86/kvm/svm/pmu.c b/arch/x86/kvm/svm/pmu.c index b4095dfeeee6..9d65cd095691 100644 --- a/arch/x86/kvm/svm/pmu.c +++ b/arch/x86/kvm/svm/pmu.c @@ -16,145 +16,64 @@ #include "cpuid.h" #include "lapic.h" #include "pmu.h" +#include "svm.h" enum pmu_type { PMU_TYPE_COUNTER = 0, PMU_TYPE_EVNTSEL, }; -enum index { - INDEX_ZERO = 0, - INDEX_ONE, - INDEX_TWO, - INDEX_THREE, - INDEX_FOUR, - INDEX_FIVE, - INDEX_ERROR, -}; - -/* duplicated from amd_perfmon_event_map, K7 and above should work. */ -static struct kvm_event_hw_type_mapping amd_event_mapping[] = { - [0] = { 0x76, 0x00, PERF_COUNT_HW_CPU_CYCLES }, - [1] = { 0xc0, 0x00, PERF_COUNT_HW_INSTRUCTIONS }, - [2] = { 0x7d, 0x07, PERF_COUNT_HW_CACHE_REFERENCES }, - [3] = { 0x7e, 0x07, PERF_COUNT_HW_CACHE_MISSES }, - [4] = { 0xc2, 0x00, PERF_COUNT_HW_BRANCH_INSTRUCTIONS }, - [5] = { 0xc3, 0x00, PERF_COUNT_HW_BRANCH_MISSES }, - [6] = { 0xd0, 0x00, PERF_COUNT_HW_STALLED_CYCLES_FRONTEND }, - [7] = { 0xd1, 0x00, PERF_COUNT_HW_STALLED_CYCLES_BACKEND }, -}; - -static unsigned int get_msr_base(struct kvm_pmu *pmu, enum pmu_type type) +static struct kvm_pmc *amd_pmc_idx_to_pmc(struct kvm_pmu *pmu, int pmc_idx) { - struct kvm_vcpu *vcpu = pmu_to_vcpu(pmu); + unsigned int num_counters = pmu->nr_arch_gp_counters; - if (guest_cpuid_has(vcpu, X86_FEATURE_PERFCTR_CORE)) { - if (type == PMU_TYPE_COUNTER) - return MSR_F15H_PERF_CTR; - else - return MSR_F15H_PERF_CTL; - } else { - if (type == PMU_TYPE_COUNTER) - return MSR_K7_PERFCTR0; - else - return MSR_K7_EVNTSEL0; - } -} + if (pmc_idx >= num_counters) + return NULL; -static enum index msr_to_index(u32 msr) -{ - switch (msr) { - case MSR_F15H_PERF_CTL0: - case MSR_F15H_PERF_CTR0: - case MSR_K7_EVNTSEL0: - case MSR_K7_PERFCTR0: - return INDEX_ZERO; - case MSR_F15H_PERF_CTL1: - case MSR_F15H_PERF_CTR1: - case MSR_K7_EVNTSEL1: - case MSR_K7_PERFCTR1: - return INDEX_ONE; - case MSR_F15H_PERF_CTL2: - case MSR_F15H_PERF_CTR2: - case MSR_K7_EVNTSEL2: - case MSR_K7_PERFCTR2: - return INDEX_TWO; - case MSR_F15H_PERF_CTL3: - case MSR_F15H_PERF_CTR3: - case MSR_K7_EVNTSEL3: - case MSR_K7_PERFCTR3: - return INDEX_THREE; - case MSR_F15H_PERF_CTL4: - case MSR_F15H_PERF_CTR4: - return INDEX_FOUR; - case MSR_F15H_PERF_CTL5: - case MSR_F15H_PERF_CTR5: - return INDEX_FIVE; - default: - return INDEX_ERROR; - } + return &pmu->gp_counters[array_index_nospec(pmc_idx, num_counters)]; } static inline struct kvm_pmc *get_gp_pmc_amd(struct kvm_pmu *pmu, u32 msr, enum pmu_type type) { struct kvm_vcpu *vcpu = pmu_to_vcpu(pmu); + unsigned int idx; + + if (!vcpu->kvm->arch.enable_pmu) + return NULL; switch (msr) { - case MSR_F15H_PERF_CTL0: - case MSR_F15H_PERF_CTL1: - case MSR_F15H_PERF_CTL2: - case MSR_F15H_PERF_CTL3: - case MSR_F15H_PERF_CTL4: - case MSR_F15H_PERF_CTL5: + case MSR_F15H_PERF_CTL0 ... MSR_F15H_PERF_CTR5: if (!guest_cpuid_has(vcpu, X86_FEATURE_PERFCTR_CORE)) return NULL; - fallthrough; + /* + * Each PMU counter has a pair of CTL and CTR MSRs. CTLn + * MSRs (accessed via EVNTSEL) are even, CTRn MSRs are odd. + */ + idx = (unsigned int)((msr - MSR_F15H_PERF_CTL0) / 2); + if (!(msr & 0x1) != (type == PMU_TYPE_EVNTSEL)) + return NULL; + break; case MSR_K7_EVNTSEL0 ... MSR_K7_EVNTSEL3: if (type != PMU_TYPE_EVNTSEL) return NULL; + idx = msr - MSR_K7_EVNTSEL0; break; - case MSR_F15H_PERF_CTR0: - case MSR_F15H_PERF_CTR1: - case MSR_F15H_PERF_CTR2: - case MSR_F15H_PERF_CTR3: - case MSR_F15H_PERF_CTR4: - case MSR_F15H_PERF_CTR5: - if (!guest_cpuid_has(vcpu, X86_FEATURE_PERFCTR_CORE)) - return NULL; - fallthrough; case MSR_K7_PERFCTR0 ... MSR_K7_PERFCTR3: if (type != PMU_TYPE_COUNTER) return NULL; + idx = msr - MSR_K7_PERFCTR0; break; default: return NULL; } - return &pmu->gp_counters[msr_to_index(msr)]; + return amd_pmc_idx_to_pmc(pmu, idx); } -static unsigned amd_find_arch_event(struct kvm_pmu *pmu, - u8 event_select, - u8 unit_mask) +static bool amd_hw_event_available(struct kvm_pmc *pmc) { - int i; - - for (i = 0; i < ARRAY_SIZE(amd_event_mapping); i++) - if (amd_event_mapping[i].eventsel == event_select - && amd_event_mapping[i].unit_mask == unit_mask) - break; - - if (i == ARRAY_SIZE(amd_event_mapping)) - return PERF_COUNT_HW_MAX; - - return amd_event_mapping[i].event_type; -} - -/* return PERF_COUNT_HW_MAX as AMD doesn't have fixed events */ -static unsigned amd_find_fixed_event(int idx) -{ - return PERF_COUNT_HW_MAX; + return true; } /* check if a PMC is enabled by comparing it against global_ctrl bits. Because @@ -165,22 +84,6 @@ static bool amd_pmc_is_enabled(struct kvm_pmc *pmc) return true; } -static struct kvm_pmc *amd_pmc_idx_to_pmc(struct kvm_pmu *pmu, int pmc_idx) -{ - unsigned int base = get_msr_base(pmu, PMU_TYPE_COUNTER); - struct kvm_vcpu *vcpu = pmu_to_vcpu(pmu); - - if (guest_cpuid_has(vcpu, X86_FEATURE_PERFCTR_CORE)) { - /* - * The idx is contiguous. The MSRs are not. The counter MSRs - * are interleaved with the event select MSRs. - */ - pmc_idx *= 2; - } - - return get_gp_pmc_amd(pmu, base + pmc_idx, PMU_TYPE_COUNTER); -} - static bool amd_is_valid_rdpmc_ecx(struct kvm_vcpu *vcpu, unsigned int idx) { struct kvm_pmu *pmu = vcpu_to_pmu(vcpu); @@ -194,15 +97,7 @@ static bool amd_is_valid_rdpmc_ecx(struct kvm_vcpu *vcpu, unsigned int idx) static struct kvm_pmc *amd_rdpmc_ecx_to_pmc(struct kvm_vcpu *vcpu, unsigned int idx, u64 *mask) { - struct kvm_pmu *pmu = vcpu_to_pmu(vcpu); - struct kvm_pmc *counters; - - idx &= ~(3u << 30); - if (idx >= pmu->nr_arch_gp_counters) - return NULL; - counters = pmu->gp_counters; - - return &counters[idx]; + return amd_pmc_idx_to_pmc(vcpu_to_pmu(vcpu), idx & ~(3u << 30)); } static bool amd_is_valid_msr(struct kvm_vcpu *vcpu, u32 msr) @@ -255,17 +150,18 @@ static int amd_pmu_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info) pmc = get_gp_pmc_amd(pmu, msr, PMU_TYPE_COUNTER); if (pmc) { pmc->counter += data - pmc_read_counter(pmc); + pmc_update_sample_period(pmc); return 0; } /* MSR_EVNTSELn */ pmc = get_gp_pmc_amd(pmu, msr, PMU_TYPE_EVNTSEL); if (pmc) { - if (data == pmc->eventsel) - return 0; - if (!(data & pmu->reserved_bits)) { - reprogram_gp_counter(pmc, data); - return 0; + data &= ~pmu->reserved_bits; + if (data != pmc->eventsel) { + pmc->eventsel = data; + reprogram_counter(pmc); } + return 0; } return 1; @@ -282,6 +178,7 @@ static void amd_pmu_refresh(struct kvm_vcpu *vcpu) pmu->counter_bitmask[KVM_PMC_GP] = ((u64)1 << 48) - 1; pmu->reserved_bits = 0xfffffff000280000ull; + pmu->raw_event_mask = AMD64_RAW_EVENT_MASK; pmu->version = 1; /* not applicable to AMD; but clean them to prevent any fall out */ pmu->counter_bitmask[KVM_PMC_FIXED] = 0; @@ -295,9 +192,10 @@ static void amd_pmu_init(struct kvm_vcpu *vcpu) struct kvm_pmu *pmu = vcpu_to_pmu(vcpu); int i; - BUILD_BUG_ON(AMD64_NUM_COUNTERS_CORE > INTEL_PMC_MAX_GENERIC); + BUILD_BUG_ON(KVM_AMD_PMC_MAX_GENERIC > AMD64_NUM_COUNTERS_CORE); + BUILD_BUG_ON(KVM_AMD_PMC_MAX_GENERIC > INTEL_PMC_MAX_GENERIC); - for (i = 0; i < AMD64_NUM_COUNTERS_CORE ; i++) { + for (i = 0; i < KVM_AMD_PMC_MAX_GENERIC ; i++) { pmu->gp_counters[i].type = KVM_PMC_GP; pmu->gp_counters[i].vcpu = vcpu; pmu->gp_counters[i].idx = i; @@ -310,7 +208,7 @@ static void amd_pmu_reset(struct kvm_vcpu *vcpu) struct kvm_pmu *pmu = vcpu_to_pmu(vcpu); int i; - for (i = 0; i < AMD64_NUM_COUNTERS_CORE; i++) { + for (i = 0; i < KVM_AMD_PMC_MAX_GENERIC; i++) { struct kvm_pmc *pmc = &pmu->gp_counters[i]; pmc_stop_counter(pmc); @@ -318,9 +216,8 @@ static void amd_pmu_reset(struct kvm_vcpu *vcpu) } } -struct kvm_pmu_ops amd_pmu_ops = { - .find_arch_event = amd_find_arch_event, - .find_fixed_event = amd_find_fixed_event, +struct kvm_pmu_ops amd_pmu_ops __initdata = { + .hw_event_available = amd_hw_event_available, .pmc_is_enabled = amd_pmc_is_enabled, .pmc_idx_to_pmc = amd_pmc_idx_to_pmc, .rdpmc_ecx_to_pmc = amd_rdpmc_ecx_to_pmc, diff --git a/arch/x86/kvm/svm/sev.c b/arch/x86/kvm/svm/sev.c index 7656a2c5662a..efaaef2b7ae1 100644 --- a/arch/x86/kvm/svm/sev.c +++ b/arch/x86/kvm/svm/sev.c @@ -22,6 +22,7 @@ #include <asm/trapnr.h> #include <asm/fpu/xcr.h> +#include "mmu.h" #include "x86.h" #include "svm.h" #include "svm_ops.h" @@ -195,7 +196,7 @@ static void sev_asid_free(struct kvm_sev_info *sev) __set_bit(sev->asid, sev_reclaim_asid_bitmap); for_each_possible_cpu(cpu) { - sd = per_cpu(svm_data, cpu); + sd = per_cpu_ptr(&svm_data, cpu); sd->sev_vmcbs[sev->asid] = NULL; } @@ -258,6 +259,9 @@ static int sev_guest_init(struct kvm *kvm, struct kvm_sev_cmd *argp) goto e_free; INIT_LIST_HEAD(&sev->regions_list); + INIT_LIST_HEAD(&sev->mirror_vms); + + kvm_set_apicv_inhibit(kvm, APICV_INHIBIT_REASON_SEV); return 0; @@ -464,6 +468,7 @@ static void sev_clflush_pages(struct page *pages[], unsigned long npages) page_virtual = kmap_atomic(pages[i]); clflush_cache_range(page_virtual, PAGE_SIZE); kunmap_atomic(page_virtual); + cond_resched(); } } @@ -558,12 +563,20 @@ e_unpin: static int sev_es_sync_vmsa(struct vcpu_svm *svm) { - struct vmcb_save_area *save = &svm->vmcb->save; + struct sev_es_save_area *save = svm->sev_es.vmsa; /* Check some debug related fields before encrypting the VMSA */ - if (svm->vcpu.guest_debug || (save->dr7 & ~DR7_FIXED_1)) + if (svm->vcpu.guest_debug || (svm->vmcb->save.dr7 & ~DR7_FIXED_1)) return -EINVAL; + /* + * SEV-ES will use a VMSA that is pointed to by the VMCB, not + * the traditional VMSA that is part of the VMCB. Copy the + * traditional VMSA as it has been built so far (in prep + * for LAUNCH_UPDATE_VMSA) to be the initial SEV-ES state. + */ + memcpy(save, &svm->vmcb->save, sizeof(svm->vmcb->save)); + /* Sync registgers */ save->rax = svm->vcpu.arch.regs[VCPU_REGS_RAX]; save->rbx = svm->vcpu.arch.regs[VCPU_REGS_RBX]; @@ -591,13 +604,8 @@ static int sev_es_sync_vmsa(struct vcpu_svm *svm) save->xss = svm->vcpu.arch.ia32_xss; save->dr6 = svm->vcpu.arch.dr6; - /* - * SEV-ES will use a VMSA that is pointed to by the VMCB, not - * the traditional VMSA that is part of the VMCB. Copy the - * traditional VMSA as it has been built so far (in prep - * for LAUNCH_UPDATE_VMSA) to be the initial SEV-ES state. - */ - memcpy(svm->sev_es.vmsa, save, sizeof(*save)); + pr_debug("Virtual Machine Save Area (VMSA):\n"); + print_hex_dump_debug("", DUMP_PREFIX_NONE, 16, 1, save, sizeof(*save), false); return 0; } @@ -636,7 +644,8 @@ static int __sev_launch_update_vmsa(struct kvm *kvm, struct kvm_vcpu *vcpu, static int sev_launch_update_vmsa(struct kvm *kvm, struct kvm_sev_cmd *argp) { struct kvm_vcpu *vcpu; - int i, ret; + unsigned long i; + int ret; if (!sev_es_guest(kvm)) return -ENOTTY; @@ -683,7 +692,7 @@ static int sev_launch_measure(struct kvm *kvm, struct kvm_sev_cmd *argp) if (params.len > SEV_FW_BLOB_MAX_SIZE) return -EINVAL; - blob = kmalloc(params.len, GFP_KERNEL_ACCOUNT); + blob = kzalloc(params.len, GFP_KERNEL_ACCOUNT); if (!blob) return -ENOMEM; @@ -803,7 +812,7 @@ static int __sev_dbg_decrypt_user(struct kvm *kvm, unsigned long paddr, if (!IS_ALIGNED(dst_paddr, 16) || !IS_ALIGNED(paddr, 16) || !IS_ALIGNED(size, 16)) { - tpage = (void *)alloc_page(GFP_KERNEL); + tpage = (void *)alloc_page(GFP_KERNEL | __GFP_ZERO); if (!tpage) return -ENOMEM; @@ -839,7 +848,7 @@ static int __sev_dbg_encrypt_user(struct kvm *kvm, unsigned long paddr, /* If source buffer is not aligned then use an intermediate buffer */ if (!IS_ALIGNED((unsigned long)vaddr, 16)) { - src_tpage = alloc_page(GFP_KERNEL); + src_tpage = alloc_page(GFP_KERNEL_ACCOUNT); if (!src_tpage) return -ENOMEM; @@ -860,7 +869,7 @@ static int __sev_dbg_encrypt_user(struct kvm *kvm, unsigned long paddr, if (!IS_ALIGNED((unsigned long)dst_vaddr, 16) || !IS_ALIGNED(size, 16)) { int dst_offset; - dst_tpage = alloc_page(GFP_KERNEL); + dst_tpage = alloc_page(GFP_KERNEL_ACCOUNT); if (!dst_tpage) { ret = -ENOMEM; goto e_free; @@ -1089,7 +1098,7 @@ static int sev_get_attestation_report(struct kvm *kvm, struct kvm_sev_cmd *argp) if (params.len > SEV_FW_BLOB_MAX_SIZE) return -EINVAL; - blob = kmalloc(params.len, GFP_KERNEL_ACCOUNT); + blob = kzalloc(params.len, GFP_KERNEL_ACCOUNT); if (!blob) return -ENOMEM; @@ -1171,7 +1180,7 @@ static int sev_send_start(struct kvm *kvm, struct kvm_sev_cmd *argp) return -EINVAL; /* allocate the memory to hold the session data blob */ - session_data = kmalloc(params.session_len, GFP_KERNEL_ACCOUNT); + session_data = kzalloc(params.session_len, GFP_KERNEL_ACCOUNT); if (!session_data) return -ENOMEM; @@ -1295,11 +1304,11 @@ static int sev_send_update_data(struct kvm *kvm, struct kvm_sev_cmd *argp) /* allocate memory for header and transport buffer */ ret = -ENOMEM; - hdr = kmalloc(params.hdr_len, GFP_KERNEL_ACCOUNT); + hdr = kzalloc(params.hdr_len, GFP_KERNEL_ACCOUNT); if (!hdr) goto e_unpin; - trans_data = kmalloc(params.trans_len, GFP_KERNEL_ACCOUNT); + trans_data = kzalloc(params.trans_len, GFP_KERNEL_ACCOUNT); if (!trans_data) goto e_free_hdr; @@ -1565,7 +1574,7 @@ static int sev_lock_two_vms(struct kvm *dst_kvm, struct kvm *src_kvm) r = -EINTR; if (mutex_lock_killable(&dst_kvm->lock)) goto release_src; - if (mutex_lock_killable(&src_kvm->lock)) + if (mutex_lock_killable_nested(&src_kvm->lock, SINGLE_DEPTH_NESTING)) goto unlock_dst; return 0; @@ -1589,24 +1598,48 @@ static void sev_unlock_two_vms(struct kvm *dst_kvm, struct kvm *src_kvm) atomic_set_release(&src_sev->migration_in_progress, 0); } +/* vCPU mutex subclasses. */ +enum sev_migration_role { + SEV_MIGRATION_SOURCE = 0, + SEV_MIGRATION_TARGET, + SEV_NR_MIGRATION_ROLES, +}; -static int sev_lock_vcpus_for_migration(struct kvm *kvm) +static int sev_lock_vcpus_for_migration(struct kvm *kvm, + enum sev_migration_role role) { struct kvm_vcpu *vcpu; - int i, j; + unsigned long i, j; kvm_for_each_vcpu(i, vcpu, kvm) { - if (mutex_lock_killable(&vcpu->mutex)) + if (mutex_lock_killable_nested(&vcpu->mutex, role)) goto out_unlock; + +#ifdef CONFIG_PROVE_LOCKING + if (!i) + /* + * Reset the role to one that avoids colliding with + * the role used for the first vcpu mutex. + */ + role = SEV_NR_MIGRATION_ROLES; + else + mutex_release(&vcpu->mutex.dep_map, _THIS_IP_); +#endif } return 0; out_unlock: + kvm_for_each_vcpu(j, vcpu, kvm) { if (i == j) break; +#ifdef CONFIG_PROVE_LOCKING + if (j) + mutex_acquire(&vcpu->mutex.dep_map, role, 0, _THIS_IP_); +#endif + mutex_unlock(&vcpu->mutex); } return -EINTR; @@ -1615,50 +1648,84 @@ out_unlock: static void sev_unlock_vcpus_for_migration(struct kvm *kvm) { struct kvm_vcpu *vcpu; - int i; + unsigned long i; + bool first = true; kvm_for_each_vcpu(i, vcpu, kvm) { + if (first) + first = false; + else + mutex_acquire(&vcpu->mutex.dep_map, + SEV_NR_MIGRATION_ROLES, 0, _THIS_IP_); + mutex_unlock(&vcpu->mutex); } } -static void sev_migrate_from(struct kvm_sev_info *dst, - struct kvm_sev_info *src) +static void sev_migrate_from(struct kvm *dst_kvm, struct kvm *src_kvm) { + struct kvm_sev_info *dst = &to_kvm_svm(dst_kvm)->sev_info; + struct kvm_sev_info *src = &to_kvm_svm(src_kvm)->sev_info; + struct kvm_vcpu *dst_vcpu, *src_vcpu; + struct vcpu_svm *dst_svm, *src_svm; + struct kvm_sev_info *mirror; + unsigned long i; + dst->active = true; dst->asid = src->asid; dst->handle = src->handle; dst->pages_locked = src->pages_locked; dst->enc_context_owner = src->enc_context_owner; + dst->es_active = src->es_active; src->asid = 0; src->active = false; src->handle = 0; src->pages_locked = 0; src->enc_context_owner = NULL; + src->es_active = false; list_cut_before(&dst->regions_list, &src->regions_list, &src->regions_list); -} -static int sev_es_migrate_from(struct kvm *dst, struct kvm *src) -{ - int i; - struct kvm_vcpu *dst_vcpu, *src_vcpu; - struct vcpu_svm *dst_svm, *src_svm; + /* + * If this VM has mirrors, "transfer" each mirror's refcount of the + * source to the destination (this KVM). The caller holds a reference + * to the source, so there's no danger of use-after-free. + */ + list_cut_before(&dst->mirror_vms, &src->mirror_vms, &src->mirror_vms); + list_for_each_entry(mirror, &dst->mirror_vms, mirror_entry) { + kvm_get_kvm(dst_kvm); + kvm_put_kvm(src_kvm); + mirror->enc_context_owner = dst_kvm; + } - if (atomic_read(&src->online_vcpus) != atomic_read(&dst->online_vcpus)) - return -EINVAL; + /* + * If this VM is a mirror, remove the old mirror from the owners list + * and add the new mirror to the list. + */ + if (is_mirroring_enc_context(dst_kvm)) { + struct kvm_sev_info *owner_sev_info = + &to_kvm_svm(dst->enc_context_owner)->sev_info; - kvm_for_each_vcpu(i, src_vcpu, src) { - if (!src_vcpu->arch.guest_state_protected) - return -EINVAL; + list_del(&src->mirror_entry); + list_add_tail(&dst->mirror_entry, &owner_sev_info->mirror_vms); } - kvm_for_each_vcpu(i, src_vcpu, src) { - src_svm = to_svm(src_vcpu); - dst_vcpu = kvm_get_vcpu(dst, i); + kvm_for_each_vcpu(i, dst_vcpu, dst_kvm) { dst_svm = to_svm(dst_vcpu); + sev_init_vmcb(dst_svm); + + if (!dst->es_active) + continue; + + /* + * Note, the source is not required to have the same number of + * vCPUs as the destination when migrating a vanilla SEV VM. + */ + src_vcpu = kvm_get_vcpu(dst_kvm, i); + src_svm = to_svm(src_vcpu); + /* * Transfer VMSA and GHCB state to the destination. Nullify and * clear source fields as appropriate, the state now belongs to @@ -1674,13 +1741,28 @@ static int sev_es_migrate_from(struct kvm *dst, struct kvm *src) src_svm->vmcb->control.vmsa_pa = INVALID_PAGE; src_vcpu->arch.guest_state_protected = false; } - to_kvm_svm(src)->sev_info.es_active = false; - to_kvm_svm(dst)->sev_info.es_active = true; +} + +static int sev_check_source_vcpus(struct kvm *dst, struct kvm *src) +{ + struct kvm_vcpu *src_vcpu; + unsigned long i; + + if (!sev_es_guest(src)) + return 0; + + if (atomic_read(&src->online_vcpus) != atomic_read(&dst->online_vcpus)) + return -EINVAL; + + kvm_for_each_vcpu(i, src_vcpu, src) { + if (!src_vcpu->arch.guest_state_protected) + return -EINVAL; + } return 0; } -int svm_vm_migrate_from(struct kvm *kvm, unsigned int source_fd) +int sev_vm_move_enc_context_from(struct kvm *kvm, unsigned int source_fd) { struct kvm_sev_info *dst_sev = &to_kvm_svm(kvm)->sev_info; struct kvm_sev_info *src_sev, *cg_cleanup_sev; @@ -1707,15 +1789,6 @@ int svm_vm_migrate_from(struct kvm *kvm, unsigned int source_fd) src_sev = &to_kvm_svm(source_kvm)->sev_info; - /* - * VMs mirroring src's encryption context rely on it to keep the - * ASID allocated, but below we are clearing src_sev->asid. - */ - if (src_sev->num_mirrored_vms) { - ret = -EBUSY; - goto out_unlock; - } - dst_sev->misc_cg = get_current_misc_cg(); cg_cleanup_sev = dst_sev; if (dst_sev->misc_cg != src_sev->misc_cg) { @@ -1725,19 +1798,18 @@ int svm_vm_migrate_from(struct kvm *kvm, unsigned int source_fd) charged = true; } - ret = sev_lock_vcpus_for_migration(kvm); + ret = sev_lock_vcpus_for_migration(kvm, SEV_MIGRATION_SOURCE); if (ret) goto out_dst_cgroup; - ret = sev_lock_vcpus_for_migration(source_kvm); + ret = sev_lock_vcpus_for_migration(source_kvm, SEV_MIGRATION_TARGET); if (ret) goto out_dst_vcpu; - if (sev_es_guest(source_kvm)) { - ret = sev_es_migrate_from(kvm, source_kvm); - if (ret) - goto out_source_vcpu; - } - sev_migrate_from(dst_sev, src_sev); + ret = sev_check_source_vcpus(kvm, source_kvm); + if (ret) + goto out_source_vcpu; + + sev_migrate_from(kvm, source_kvm); kvm_vm_dead(source_kvm); cg_cleanup_sev = src_sev; ret = 0; @@ -1760,7 +1832,7 @@ out_fput: return ret; } -int svm_mem_enc_op(struct kvm *kvm, void __user *argp) +int sev_mem_enc_ioctl(struct kvm *kvm, void __user *argp) { struct kvm_sev_cmd sev_cmd; int r; @@ -1857,8 +1929,8 @@ out: return r; } -int svm_register_enc_region(struct kvm *kvm, - struct kvm_enc_region *range) +int sev_mem_enc_register_region(struct kvm *kvm, + struct kvm_enc_region *range) { struct kvm_sev_info *sev = &to_kvm_svm(kvm)->sev_info; struct enc_region *region; @@ -1931,8 +2003,8 @@ static void __unregister_enc_region_locked(struct kvm *kvm, kfree(region); } -int svm_unregister_enc_region(struct kvm *kvm, - struct kvm_enc_region *range) +int sev_mem_enc_unregister_region(struct kvm *kvm, + struct kvm_enc_region *range) { struct enc_region *region; int ret; @@ -1971,7 +2043,7 @@ failed: return ret; } -int svm_vm_copy_asid_from(struct kvm *kvm, unsigned int source_fd) +int sev_vm_copy_enc_context_from(struct kvm *kvm, unsigned int source_fd) { struct file *source_kvm_file; struct kvm *source_kvm; @@ -2007,10 +2079,10 @@ int svm_vm_copy_asid_from(struct kvm *kvm, unsigned int source_fd) */ source_sev = &to_kvm_svm(source_kvm)->sev_info; kvm_get_kvm(source_kvm); - source_sev->num_mirrored_vms++; + mirror_sev = &to_kvm_svm(kvm)->sev_info; + list_add_tail(&mirror_sev->mirror_entry, &source_sev->mirror_vms); /* Set enc_context_owner and copy its encryption context over */ - mirror_sev = &to_kvm_svm(kvm)->sev_info; mirror_sev->enc_context_owner = source_kvm; mirror_sev->active = true; mirror_sev->asid = source_sev->asid; @@ -2018,6 +2090,7 @@ int svm_vm_copy_asid_from(struct kvm *kvm, unsigned int source_fd) mirror_sev->es_active = source_sev->es_active; mirror_sev->handle = source_sev->handle; INIT_LIST_HEAD(&mirror_sev->regions_list); + INIT_LIST_HEAD(&mirror_sev->mirror_vms); ret = 0; /* @@ -2040,19 +2113,17 @@ void sev_vm_destroy(struct kvm *kvm) struct list_head *head = &sev->regions_list; struct list_head *pos, *q; - WARN_ON(sev->num_mirrored_vms); - if (!sev_guest(kvm)) return; + WARN_ON(!list_empty(&sev->mirror_vms)); + /* If this is a mirror_kvm release the enc_context_owner and skip sev cleanup */ if (is_mirroring_enc_context(kvm)) { struct kvm *owner_kvm = sev->enc_context_owner; - struct kvm_sev_info *owner_sev = &to_kvm_svm(owner_kvm)->sev_info; mutex_lock(&owner_kvm->lock); - if (!WARN_ON(!owner_sev->num_mirrored_vms)) - owner_sev->num_mirrored_vms--; + list_del(&sev->mirror_entry); mutex_unlock(&owner_kvm->lock); kvm_put_kvm(owner_kvm); return; @@ -2099,8 +2170,13 @@ void __init sev_hardware_setup(void) if (!sev_enabled || !npt_enabled) goto out; - /* Does the CPU support SEV? */ - if (!boot_cpu_has(X86_FEATURE_SEV)) + /* + * SEV must obviously be supported in hardware. Sanity check that the + * CPU supports decode assists, which is mandatory for SEV guests to + * support instruction emulation. + */ + if (!boot_cpu_has(X86_FEATURE_SEV) || + WARN_ON_ONCE(!boot_cpu_has(X86_FEATURE_DECODEASSISTS))) goto out; /* Retrieve SEV CPUID information */ @@ -2146,6 +2222,15 @@ void __init sev_hardware_setup(void) if (!sev_es_enabled) goto out; + /* + * SEV-ES requires MMIO caching as KVM doesn't have access to the guest + * instruction stream, i.e. can't emulate in response to a #NPF and + * instead relies on #NPF(RSVD) being reflected into the guest as #VC + * (the guest can then do a #VMGEXIT to request MMIO emulation). + */ + if (!enable_mmio_caching) + goto out; + /* Does the CPU support SEV-ES? */ if (!boot_cpu_has(X86_FEATURE_SEV_ES)) goto out; @@ -2167,7 +2252,7 @@ out: #endif } -void sev_hardware_teardown(void) +void sev_hardware_unsetup(void) { if (!sev_enabled) return; @@ -2198,51 +2283,47 @@ int sev_cpu_init(struct svm_cpu_data *sd) * Pages used by hardware to hold guest encrypted state must be flushed before * returning them to the system. */ -static void sev_flush_guest_memory(struct vcpu_svm *svm, void *va, - unsigned long len) +static void sev_flush_encrypted_page(struct kvm_vcpu *vcpu, void *va) { + int asid = to_kvm_svm(vcpu->kvm)->sev_info.asid; + /* - * If hardware enforced cache coherency for encrypted mappings of the - * same physical page is supported, nothing to do. + * Note! The address must be a kernel address, as regular page walk + * checks are performed by VM_PAGE_FLUSH, i.e. operating on a user + * address is non-deterministic and unsafe. This function deliberately + * takes a pointer to deter passing in a user address. */ - if (boot_cpu_has(X86_FEATURE_SME_COHERENT)) - return; + unsigned long addr = (unsigned long)va; /* - * If the VM Page Flush MSR is supported, use it to flush the page - * (using the page virtual address and the guest ASID). + * If CPU enforced cache coherency for encrypted mappings of the + * same physical page is supported, use CLFLUSHOPT instead. NOTE: cache + * flush is still needed in order to work properly with DMA devices. */ - if (boot_cpu_has(X86_FEATURE_VM_PAGE_FLUSH)) { - struct kvm_sev_info *sev; - unsigned long va_start; - u64 start, stop; - - /* Align start and stop to page boundaries. */ - va_start = (unsigned long)va; - start = (u64)va_start & PAGE_MASK; - stop = PAGE_ALIGN((u64)va_start + len); - - if (start < stop) { - sev = &to_kvm_svm(svm->vcpu.kvm)->sev_info; + if (boot_cpu_has(X86_FEATURE_SME_COHERENT)) { + clflush_cache_range(va, PAGE_SIZE); + return; + } - while (start < stop) { - wrmsrl(MSR_AMD64_VM_PAGE_FLUSH, - start | sev->asid); + /* + * VM Page Flush takes a host virtual address and a guest ASID. Fall + * back to WBINVD if this faults so as not to make any problems worse + * by leaving stale encrypted data in the cache. + */ + if (WARN_ON_ONCE(wrmsrl_safe(MSR_AMD64_VM_PAGE_FLUSH, addr | asid))) + goto do_wbinvd; - start += PAGE_SIZE; - } + return; - return; - } +do_wbinvd: + wbinvd_on_all_cpus(); +} - WARN(1, "Address overflow, using WBINVD\n"); - } +void sev_guest_memory_reclaimed(struct kvm *kvm) +{ + if (!sev_guest(kvm)) + return; - /* - * Hardware should always have one of the above features, - * but if not, use WBINVD and issue a warning. - */ - WARN_ONCE(1, "Using WBINVD to flush guest memory\n"); wbinvd_on_all_cpus(); } @@ -2256,7 +2337,8 @@ void sev_free_vcpu(struct kvm_vcpu *vcpu) svm = to_svm(vcpu); if (vcpu->arch.guest_state_protected) - sev_flush_guest_memory(svm, svm->sev_es.vmsa, PAGE_SIZE); + sev_flush_encrypted_page(vcpu, svm->sev_es.vmsa); + __free_page(virt_to_page(svm->sev_es.vmsa)); if (svm->sev_es.ghcb_sa_free) @@ -2352,7 +2434,7 @@ static void sev_es_sync_from_ghcb(struct vcpu_svm *svm) memset(ghcb->save.valid_bitmap, 0, sizeof(ghcb->save.valid_bitmap)); } -static bool sev_es_validate_vmgexit(struct vcpu_svm *svm) +static int sev_es_validate_vmgexit(struct vcpu_svm *svm) { struct kvm_vcpu *vcpu; struct ghcb *ghcb; @@ -2457,7 +2539,7 @@ static bool sev_es_validate_vmgexit(struct vcpu_svm *svm) goto vmgexit_err; } - return true; + return 0; vmgexit_err: vcpu = &svm->vcpu; @@ -2480,7 +2562,8 @@ vmgexit_err: ghcb_set_sw_exit_info_1(ghcb, 2); ghcb_set_sw_exit_info_2(ghcb, reason); - return false; + /* Resume the guest to "return" the error code. */ + return 1; } void sev_es_unmap_ghcb(struct vcpu_svm *svm) @@ -2517,7 +2600,7 @@ void sev_es_unmap_ghcb(struct vcpu_svm *svm) void pre_sev_run(struct vcpu_svm *svm, int cpu) { - struct svm_cpu_data *sd = per_cpu(svm_data, cpu); + struct svm_cpu_data *sd = per_cpu_ptr(&svm_data, cpu); int asid = sev_get_asid(svm->vcpu.kvm); /* Assign the asid allocated with this SEV guest */ @@ -2539,7 +2622,7 @@ void pre_sev_run(struct vcpu_svm *svm, int cpu) } #define GHCB_SCRATCH_AREA_LIMIT (16ULL * PAGE_SIZE) -static bool setup_vmgexit_scratch(struct vcpu_svm *svm, bool sync, u64 len) +static int setup_vmgexit_scratch(struct vcpu_svm *svm, bool sync, u64 len) { struct vmcb_control_area *control = &svm->vmcb->control; struct ghcb *ghcb = svm->sev_es.ghcb; @@ -2592,14 +2675,14 @@ static bool setup_vmgexit_scratch(struct vcpu_svm *svm, bool sync, u64 len) } scratch_va = kvzalloc(len, GFP_KERNEL_ACCOUNT); if (!scratch_va) - goto e_scratch; + return -ENOMEM; if (kvm_read_guest(svm->vcpu.kvm, scratch_gpa_beg, scratch_va, len)) { /* Unable to copy scratch area from guest */ pr_err("vmgexit: kvm_read_guest for scratch area failed\n"); kvfree(scratch_va); - goto e_scratch; + return -EFAULT; } /* @@ -2615,13 +2698,13 @@ static bool setup_vmgexit_scratch(struct vcpu_svm *svm, bool sync, u64 len) svm->sev_es.ghcb_sa = scratch_va; svm->sev_es.ghcb_sa_len = len; - return true; + return 0; e_scratch: ghcb_set_sw_exit_info_1(ghcb, 2); ghcb_set_sw_exit_info_2(ghcb, GHCB_ERR_INVALID_SCRATCH_AREA); - return false; + return 1; } static void set_ghcb_msr_bits(struct vcpu_svm *svm, u64 value, u64 mask, @@ -2709,8 +2792,12 @@ static int sev_handle_vmgexit_msr_protocol(struct vcpu_svm *svm) pr_info("SEV-ES guest requested termination: %#llx:%#llx\n", reason_set, reason_code); - ret = -EINVAL; - break; + vcpu->run->exit_reason = KVM_EXIT_SYSTEM_EVENT; + vcpu->run->system_event.type = KVM_SYSTEM_EVENT_SEV_TERM; + vcpu->run->system_event.ndata = 1; + vcpu->run->system_event.data[0] = control->ghcb_gpa; + + return 0; } default: /* Error, keep GHCB MSR value as-is */ @@ -2759,17 +2846,18 @@ int sev_handle_vmgexit(struct kvm_vcpu *vcpu) exit_code = ghcb_get_sw_exit_code(ghcb); - if (!sev_es_validate_vmgexit(svm)) - return 1; + ret = sev_es_validate_vmgexit(svm); + if (ret) + return ret; sev_es_sync_from_ghcb(svm); ghcb_set_sw_exit_info_1(ghcb, 0); ghcb_set_sw_exit_info_2(ghcb, 0); - ret = 1; switch (exit_code) { case SVM_VMGEXIT_MMIO_READ: - if (!setup_vmgexit_scratch(svm, true, control->exit_info_2)) + ret = setup_vmgexit_scratch(svm, true, control->exit_info_2); + if (ret) break; ret = kvm_sev_es_mmio_read(vcpu, @@ -2778,7 +2866,8 @@ int sev_handle_vmgexit(struct kvm_vcpu *vcpu) svm->sev_es.ghcb_sa); break; case SVM_VMGEXIT_MMIO_WRITE: - if (!setup_vmgexit_scratch(svm, false, control->exit_info_2)) + ret = setup_vmgexit_scratch(svm, false, control->exit_info_2); + if (ret) break; ret = kvm_sev_es_mmio_write(vcpu, @@ -2811,6 +2900,7 @@ int sev_handle_vmgexit(struct kvm_vcpu *vcpu) ghcb_set_sw_exit_info_2(ghcb, GHCB_ERR_INVALID_INPUT); } + ret = 1; break; } case SVM_VMGEXIT_UNSUPPORTED_EVENT: @@ -2830,6 +2920,7 @@ int sev_es_string_io(struct vcpu_svm *svm, int size, unsigned int port, int in) { int count; int bytes; + int r; if (svm->vmcb->control.exit_info_2 > INT_MAX) return -EINVAL; @@ -2838,14 +2929,15 @@ int sev_es_string_io(struct vcpu_svm *svm, int size, unsigned int port, int in) if (unlikely(check_mul_overflow(count, size, &bytes))) return -EINVAL; - if (!setup_vmgexit_scratch(svm, in, bytes)) - return 1; + r = setup_vmgexit_scratch(svm, in, bytes); + if (r) + return r; return kvm_sev_es_string_io(&svm->vcpu, size, port, svm->sev_es.ghcb_sa, count, in); } -void sev_es_init_vmcb(struct vcpu_svm *svm) +static void sev_es_init_vmcb(struct vcpu_svm *svm) { struct kvm_vcpu *vcpu = &svm->vcpu; @@ -2888,6 +2980,23 @@ void sev_es_init_vmcb(struct vcpu_svm *svm) set_msr_interception(vcpu, svm->msrpm, MSR_IA32_LASTBRANCHTOIP, 1, 1); set_msr_interception(vcpu, svm->msrpm, MSR_IA32_LASTINTFROMIP, 1, 1); set_msr_interception(vcpu, svm->msrpm, MSR_IA32_LASTINTTOIP, 1, 1); + + if (boot_cpu_has(X86_FEATURE_V_TSC_AUX) && + (guest_cpuid_has(&svm->vcpu, X86_FEATURE_RDTSCP) || + guest_cpuid_has(&svm->vcpu, X86_FEATURE_RDPID))) { + set_msr_interception(vcpu, svm->msrpm, MSR_TSC_AUX, 1, 1); + if (guest_cpuid_has(&svm->vcpu, X86_FEATURE_RDTSCP)) + svm_clr_intercept(svm, INTERCEPT_RDTSCP); + } +} + +void sev_init_vmcb(struct vcpu_svm *svm) +{ + svm->vmcb->control.nested_ctl |= SVM_NESTED_CTL_SEV_ENABLE; + clr_exception_intercept(svm, UD_VECTOR); + + if (sev_es_guest(svm->vcpu.kvm)) + sev_es_init_vmcb(svm); } void sev_es_vcpu_reset(struct vcpu_svm *svm) @@ -2901,20 +3010,16 @@ void sev_es_vcpu_reset(struct vcpu_svm *svm) sev_enc_bit)); } -void sev_es_prepare_guest_switch(struct vcpu_svm *svm, unsigned int cpu) +void sev_es_prepare_switch_to_guest(struct sev_es_save_area *hostsa) { - struct svm_cpu_data *sd = per_cpu(svm_data, cpu); - struct vmcb_save_area *hostsa; - /* * As an SEV-ES guest, hardware will restore the host state on VMEXIT, - * of which one step is to perform a VMLOAD. Since hardware does not - * perform a VMSAVE on VMRUN, the host savearea must be updated. + * of which one step is to perform a VMLOAD. KVM performs the + * corresponding VMSAVE in svm_prepare_guest_switch for both + * traditional and SEV-ES guests. */ - vmsave(__sme_page_pa(sd->save_area)); /* XCR0 is restored on VMEXIT, save the current host value */ - hostsa = (struct vmcb_save_area *)(page_address(sd->save_area) + 0x400); hostsa->xcr0 = xgetbv(XCR_XFEATURE_ENABLED_MASK); /* PKRU is restored on VMEXIT, save the current host value */ diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c index d0f68d11ec70..9f88c8e6766e 100644 --- a/arch/x86/kvm/svm/svm.c +++ b/arch/x86/kvm/svm/svm.c @@ -62,20 +62,6 @@ MODULE_DEVICE_TABLE(x86cpu, svm_cpu_id); #define SEG_TYPE_LDT 2 #define SEG_TYPE_BUSY_TSS16 3 -#define SVM_FEATURE_LBRV (1 << 1) -#define SVM_FEATURE_SVML (1 << 2) -#define SVM_FEATURE_TSC_RATE (1 << 4) -#define SVM_FEATURE_VMCB_CLEAN (1 << 5) -#define SVM_FEATURE_FLUSH_ASID (1 << 6) -#define SVM_FEATURE_DECODE_ASSIST (1 << 7) -#define SVM_FEATURE_PAUSE_FILTER (1 << 10) - -#define DEBUGCTL_RESERVED_BITS (~(0x3fULL)) - -#define TSC_RATIO_RSVD 0xffffff0000000000ULL -#define TSC_RATIO_MIN 0x0000000000000001ULL -#define TSC_RATIO_MAX 0x000000ffffffffffULL - static bool erratum_383_found __read_mostly; u32 msrpm_offsets[MSRPM_OFFSETS] __read_mostly; @@ -87,7 +73,8 @@ u32 msrpm_offsets[MSRPM_OFFSETS] __read_mostly; static uint64_t osvw_len = 4, osvw_status; static DEFINE_PER_CPU(u64, current_tsc_ratio); -#define TSC_RATIO_DEFAULT 0x0100000000ULL + +#define X2APIC_MSR(x) (APIC_BASE_MSR + (x >> 4)) static const struct svm_direct_access_msrs { u32 index; /* Index of the MSR */ @@ -114,6 +101,39 @@ static const struct svm_direct_access_msrs { { .index = MSR_EFER, .always = false }, { .index = MSR_IA32_CR_PAT, .always = false }, { .index = MSR_AMD64_SEV_ES_GHCB, .always = true }, + { .index = MSR_TSC_AUX, .always = false }, + { .index = X2APIC_MSR(APIC_ID), .always = false }, + { .index = X2APIC_MSR(APIC_LVR), .always = false }, + { .index = X2APIC_MSR(APIC_TASKPRI), .always = false }, + { .index = X2APIC_MSR(APIC_ARBPRI), .always = false }, + { .index = X2APIC_MSR(APIC_PROCPRI), .always = false }, + { .index = X2APIC_MSR(APIC_EOI), .always = false }, + { .index = X2APIC_MSR(APIC_RRR), .always = false }, + { .index = X2APIC_MSR(APIC_LDR), .always = false }, + { .index = X2APIC_MSR(APIC_DFR), .always = false }, + { .index = X2APIC_MSR(APIC_SPIV), .always = false }, + { .index = X2APIC_MSR(APIC_ISR), .always = false }, + { .index = X2APIC_MSR(APIC_TMR), .always = false }, + { .index = X2APIC_MSR(APIC_IRR), .always = false }, + { .index = X2APIC_MSR(APIC_ESR), .always = false }, + { .index = X2APIC_MSR(APIC_ICR), .always = false }, + { .index = X2APIC_MSR(APIC_ICR2), .always = false }, + + /* + * Note: + * AMD does not virtualize APIC TSC-deadline timer mode, but it is + * emulated by KVM. When setting APIC LVTT (0x832) register bit 18, + * the AVIC hardware would generate GP fault. Therefore, always + * intercept the MSR 0x832, and do not setup direct_access_msr. + */ + { .index = X2APIC_MSR(APIC_LVTTHMR), .always = false }, + { .index = X2APIC_MSR(APIC_LVTPC), .always = false }, + { .index = X2APIC_MSR(APIC_LVT0), .always = false }, + { .index = X2APIC_MSR(APIC_LVT1), .always = false }, + { .index = X2APIC_MSR(APIC_LVTERR), .always = false }, + { .index = X2APIC_MSR(APIC_TMICT), .always = false }, + { .index = X2APIC_MSR(APIC_TMCCT), .always = false }, + { .index = X2APIC_MSR(APIC_TDCR), .always = false }, { .index = MSR_INVALID, .always = false }, }; @@ -185,7 +205,7 @@ static int vls = true; module_param(vls, int, 0444); /* enable/disable Virtual GIF */ -static int vgif = true; +int vgif = true; module_param(vgif, int, 0444); /* enable/disable LBR virtualization */ @@ -225,7 +245,7 @@ struct kvm_ldttss_desc { u32 zero1; } __attribute__((packed)); -DEFINE_PER_CPU(struct svm_cpu_data *, svm_data); +DEFINE_PER_CPU(struct svm_cpu_data, svm_data); /* * Only MSR_TSC_AUX is switched via the user return hook. EFER is switched via @@ -263,9 +283,9 @@ u32 svm_msrpm_offset(u32 msr) return MSR_INVALID; } -#define MAX_INST_SIZE 15 +static void svm_flush_tlb_current(struct kvm_vcpu *vcpu); -static int get_max_npt_level(void) +static int get_npt_level(void) { #ifdef CONFIG_X86_64 return pgtable_l5_enabled() ? PT64_ROOT_5LEVEL : PT64_ROOT_4LEVEL; @@ -290,7 +310,7 @@ int svm_set_efer(struct kvm_vcpu *vcpu, u64 efer) if ((old_efer & EFER_SVME) != (efer & EFER_SVME)) { if (!(efer & EFER_SVME)) { - svm_leave_nested(svm); + svm_leave_nested(vcpu); svm_set_gif(svm, true); /* #GP intercept is still needed for vmware backdoor */ if (!enable_vmware_backdoor) @@ -312,7 +332,11 @@ int svm_set_efer(struct kvm_vcpu *vcpu, u64 efer) return ret; } - if (svm_gp_erratum_intercept) + /* + * Never intercept #GP for SEV guests, KVM can't + * decrypt guest memory to workaround the erratum. + */ + if (svm_gp_erratum_intercept && !sev_guest(vcpu->kvm)) set_exception_intercept(svm, GP_VECTOR); } } @@ -349,9 +373,11 @@ static void svm_set_interrupt_shadow(struct kvm_vcpu *vcpu, int mask) } -static int skip_emulated_instruction(struct kvm_vcpu *vcpu) +static int __svm_skip_emulated_instruction(struct kvm_vcpu *vcpu, + bool commit_side_effects) { struct vcpu_svm *svm = to_svm(vcpu); + unsigned long old_rflags; /* * SEV-ES does not expose the next RIP. The RIP update is controlled by @@ -366,48 +392,91 @@ static int skip_emulated_instruction(struct kvm_vcpu *vcpu) } if (!svm->next_rip) { + if (unlikely(!commit_side_effects)) + old_rflags = svm->vmcb->save.rflags; + if (!kvm_emulate_instruction(vcpu, EMULTYPE_SKIP)) return 0; + + if (unlikely(!commit_side_effects)) + svm->vmcb->save.rflags = old_rflags; } else { kvm_rip_write(vcpu, svm->next_rip); } done: - svm_set_interrupt_shadow(vcpu, 0); + if (likely(commit_side_effects)) + svm_set_interrupt_shadow(vcpu, 0); return 1; } -static void svm_queue_exception(struct kvm_vcpu *vcpu) +static int svm_skip_emulated_instruction(struct kvm_vcpu *vcpu) { + return __svm_skip_emulated_instruction(vcpu, true); +} + +static int svm_update_soft_interrupt_rip(struct kvm_vcpu *vcpu) +{ + unsigned long rip, old_rip = kvm_rip_read(vcpu); struct vcpu_svm *svm = to_svm(vcpu); - unsigned nr = vcpu->arch.exception.nr; - bool has_error_code = vcpu->arch.exception.has_error_code; - u32 error_code = vcpu->arch.exception.error_code; - kvm_deliver_exception_payload(vcpu); + /* + * Due to architectural shortcomings, the CPU doesn't always provide + * NextRIP, e.g. if KVM intercepted an exception that occurred while + * the CPU was vectoring an INTO/INT3 in the guest. Temporarily skip + * the instruction even if NextRIP is supported to acquire the next + * RIP so that it can be shoved into the NextRIP field, otherwise + * hardware will fail to advance guest RIP during event injection. + * Drop the exception/interrupt if emulation fails and effectively + * retry the instruction, it's the least awful option. If NRIPS is + * in use, the skip must not commit any side effects such as clearing + * the interrupt shadow or RFLAGS.RF. + */ + if (!__svm_skip_emulated_instruction(vcpu, !nrips)) + return -EIO; - if (nr == BP_VECTOR && !nrips) { - unsigned long rip, old_rip = kvm_rip_read(vcpu); + rip = kvm_rip_read(vcpu); - /* - * For guest debugging where we have to reinject #BP if some - * INT3 is guest-owned: - * Emulate nRIP by moving RIP forward. Will fail if injection - * raises a fault that is not intercepted. Still better than - * failing in all cases. - */ - (void)skip_emulated_instruction(vcpu); - rip = kvm_rip_read(vcpu); - svm->int3_rip = rip + svm->vmcb->save.cs.base; - svm->int3_injected = rip - old_rip; - } + /* + * Save the injection information, even when using next_rip, as the + * VMCB's next_rip will be lost (cleared on VM-Exit) if the injection + * doesn't complete due to a VM-Exit occurring while the CPU is + * vectoring the event. Decoding the instruction isn't guaranteed to + * work as there may be no backing instruction, e.g. if the event is + * being injected by L1 for L2, or if the guest is patching INT3 into + * a different instruction. + */ + svm->soft_int_injected = true; + svm->soft_int_csbase = svm->vmcb->save.cs.base; + svm->soft_int_old_rip = old_rip; + svm->soft_int_next_rip = rip; + + if (nrips) + kvm_rip_write(vcpu, old_rip); - svm->vmcb->control.event_inj = nr + if (static_cpu_has(X86_FEATURE_NRIPS)) + svm->vmcb->control.next_rip = rip; + + return 0; +} + +static void svm_inject_exception(struct kvm_vcpu *vcpu) +{ + struct kvm_queued_exception *ex = &vcpu->arch.exception; + struct vcpu_svm *svm = to_svm(vcpu); + + kvm_deliver_exception_payload(vcpu, ex); + + if (kvm_exception_is_soft(ex->vector) && + svm_update_soft_interrupt_rip(vcpu)) + return; + + svm->vmcb->control.event_inj = ex->vector | SVM_EVTINJ_VALID - | (has_error_code ? SVM_EVTINJ_VALID_ERR : 0) + | (ex->has_error_code ? SVM_EVTINJ_VALID_ERR : 0) | SVM_EVTINJ_TYPE_EXEPT; - svm->vmcb->control.event_inj_err = error_code; + svm->vmcb->control.event_inj_err = ex->error_code; } static void svm_init_erratum_383(void) @@ -472,11 +541,24 @@ static int has_svm(void) return 1; } +void __svm_write_tsc_multiplier(u64 multiplier) +{ + preempt_disable(); + + if (multiplier == __this_cpu_read(current_tsc_ratio)) + goto out; + + wrmsrl(MSR_AMD64_TSC_RATIO, multiplier); + __this_cpu_write(current_tsc_ratio, multiplier); +out: + preempt_enable(); +} + static void svm_hardware_disable(void) { /* Make sure we clean up behind us */ if (tsc_scaling) - wrmsrl(MSR_AMD64_TSC_RATIO, TSC_RATIO_DEFAULT); + __svm_write_tsc_multiplier(SVM_TSC_RATIO_DEFAULT); cpu_svm_disable(); @@ -499,12 +581,7 @@ static int svm_hardware_enable(void) pr_err("%s: err EOPNOTSUPP on %d\n", __func__, me); return -EINVAL; } - sd = per_cpu(svm_data, me); - if (!sd) { - pr_err("%s: svm_data is NULL on %d\n", __func__, me); - return -EINVAL; - } - + sd = per_cpu_ptr(&svm_data, me); sd->asid_generation = 1; sd->max_asid = cpuid_ebx(SVM_CPUID_FUNC) - 1; sd->next_asid = sd->max_asid + 1; @@ -515,15 +592,14 @@ static int svm_hardware_enable(void) wrmsrl(MSR_EFER, efer | EFER_SVME); - wrmsrl(MSR_VM_HSAVE_PA, __sme_page_pa(sd->save_area)); + wrmsrl(MSR_VM_HSAVE_PA, sd->save_area_pa); if (static_cpu_has(X86_FEATURE_TSCRATEMSR)) { /* * Set the default value, even if we don't use TSC scaling * to avoid having stale value in the msr */ - wrmsrl(MSR_AMD64_TSC_RATIO, TSC_RATIO_DEFAULT); - __this_cpu_write(current_tsc_ratio, TSC_RATIO_DEFAULT); + __svm_write_tsc_multiplier(SVM_TSC_RATIO_DEFAULT); } @@ -565,44 +641,37 @@ static int svm_hardware_enable(void) static void svm_cpu_uninit(int cpu) { - struct svm_cpu_data *sd = per_cpu(svm_data, cpu); + struct svm_cpu_data *sd = per_cpu_ptr(&svm_data, cpu); - if (!sd) + if (!sd->save_area) return; - per_cpu(svm_data, cpu) = NULL; kfree(sd->sev_vmcbs); __free_page(sd->save_area); - kfree(sd); + sd->save_area_pa = 0; + sd->save_area = NULL; } static int svm_cpu_init(int cpu) { - struct svm_cpu_data *sd; + struct svm_cpu_data *sd = per_cpu_ptr(&svm_data, cpu); int ret = -ENOMEM; - sd = kzalloc(sizeof(struct svm_cpu_data), GFP_KERNEL); - if (!sd) - return ret; - sd->cpu = cpu; - sd->save_area = alloc_page(GFP_KERNEL); + memset(sd, 0, sizeof(struct svm_cpu_data)); + sd->save_area = alloc_page(GFP_KERNEL | __GFP_ZERO); if (!sd->save_area) - goto free_cpu_data; - - clear_page(page_address(sd->save_area)); + return ret; ret = sev_cpu_init(sd); if (ret) goto free_save_area; - per_cpu(svm_data, cpu) = sd; - + sd->save_area_pa = __sme_page_pa(sd->save_area); return 0; free_save_area: __free_page(sd->save_area); -free_cpu_data: - kfree(sd); + sd->save_area = NULL; return ret; } @@ -651,6 +720,15 @@ static bool msr_write_intercepted(struct kvm_vcpu *vcpu, u32 msr) u32 offset; u32 *msrpm; + /* + * For non-nested case: + * If the L01 MSR bitmap does not intercept the MSR, then we need to + * save it. + * + * For nested case: + * If the L02 MSR bitmap does not intercept the MSR, then we need to + * save it. + */ msrpm = is_guest_mode(vcpu) ? to_svm(vcpu)->nested.msrpm: to_svm(vcpu)->msrpm; @@ -666,6 +744,7 @@ static bool msr_write_intercepted(struct kvm_vcpu *vcpu, u32 msr) static void set_msr_interception_bitmap(struct kvm_vcpu *vcpu, u32 *msrpm, u32 msr, int read, int write) { + struct vcpu_svm *svm = to_svm(vcpu); u8 bit_read, bit_write; unsigned long tmp; u32 offset; @@ -696,7 +775,7 @@ static void set_msr_interception_bitmap(struct kvm_vcpu *vcpu, u32 *msrpm, msrpm[offset] = tmp; svm_hv_vmcb_dirty_nested_enlightenments(vcpu); - + svm->nested.force_msr_bitmap_recalc = true; } void set_msr_interception(struct kvm_vcpu *vcpu, u32 *msrpm, u32 msr, @@ -732,6 +811,29 @@ void svm_vcpu_init_msrpm(struct kvm_vcpu *vcpu, u32 *msrpm) } } +void svm_set_x2apic_msr_interception(struct vcpu_svm *svm, bool intercept) +{ + int i; + + if (intercept == svm->x2avic_msrs_intercepted) + return; + + if (avic_mode != AVIC_MODE_X2 || + !apic_x2apic_mode(svm->vcpu.arch.apic)) + return; + + for (i = 0; i < MAX_DIRECT_ACCESS_MSRS; i++) { + int index = direct_access_msrs[i].index; + + if ((index < APIC_BASE_MSR) || + (index > APIC_BASE_MSR + 0xff)) + continue; + set_msr_interception(&svm->vcpu, svm->msrpm, index, + !intercept, !intercept); + } + + svm->x2avic_msrs_intercepted = intercept; +} void svm_vcpu_free_msrpm(u32 *msrpm) { @@ -800,6 +902,17 @@ static void init_msrpm_offsets(void) } } +void svm_copy_lbrs(struct vmcb *to_vmcb, struct vmcb *from_vmcb) +{ + to_vmcb->save.dbgctl = from_vmcb->save.dbgctl; + to_vmcb->save.br_from = from_vmcb->save.br_from; + to_vmcb->save.br_to = from_vmcb->save.br_to; + to_vmcb->save.last_excp_from = from_vmcb->save.last_excp_from; + to_vmcb->save.last_excp_to = from_vmcb->save.last_excp_to; + + vmcb_mark_dirty(to_vmcb, VMCB_LBR); +} + static void svm_enable_lbrv(struct kvm_vcpu *vcpu) { struct vcpu_svm *svm = to_svm(vcpu); @@ -809,6 +922,10 @@ static void svm_enable_lbrv(struct kvm_vcpu *vcpu) set_msr_interception(vcpu, svm->msrpm, MSR_IA32_LASTBRANCHTOIP, 1, 1); set_msr_interception(vcpu, svm->msrpm, MSR_IA32_LASTINTFROMIP, 1, 1); set_msr_interception(vcpu, svm->msrpm, MSR_IA32_LASTINTTOIP, 1, 1); + + /* Move the LBR msrs to the vmcb02 so that the guest can see them. */ + if (is_guest_mode(vcpu)) + svm_copy_lbrs(svm->vmcb, svm->vmcb01.ptr); } static void svm_disable_lbrv(struct kvm_vcpu *vcpu) @@ -820,6 +937,67 @@ static void svm_disable_lbrv(struct kvm_vcpu *vcpu) set_msr_interception(vcpu, svm->msrpm, MSR_IA32_LASTBRANCHTOIP, 0, 0); set_msr_interception(vcpu, svm->msrpm, MSR_IA32_LASTINTFROMIP, 0, 0); set_msr_interception(vcpu, svm->msrpm, MSR_IA32_LASTINTTOIP, 0, 0); + + /* + * Move the LBR msrs back to the vmcb01 to avoid copying them + * on nested guest entries. + */ + if (is_guest_mode(vcpu)) + svm_copy_lbrs(svm->vmcb01.ptr, svm->vmcb); +} + +static int svm_get_lbr_msr(struct vcpu_svm *svm, u32 index) +{ + /* + * If the LBR virtualization is disabled, the LBR msrs are always + * kept in the vmcb01 to avoid copying them on nested guest entries. + * + * If nested, and the LBR virtualization is enabled/disabled, the msrs + * are moved between the vmcb01 and vmcb02 as needed. + */ + struct vmcb *vmcb = + (svm->vmcb->control.virt_ext & LBR_CTL_ENABLE_MASK) ? + svm->vmcb : svm->vmcb01.ptr; + + switch (index) { + case MSR_IA32_DEBUGCTLMSR: + return vmcb->save.dbgctl; + case MSR_IA32_LASTBRANCHFROMIP: + return vmcb->save.br_from; + case MSR_IA32_LASTBRANCHTOIP: + return vmcb->save.br_to; + case MSR_IA32_LASTINTFROMIP: + return vmcb->save.last_excp_from; + case MSR_IA32_LASTINTTOIP: + return vmcb->save.last_excp_to; + default: + KVM_BUG(false, svm->vcpu.kvm, + "%s: Unknown MSR 0x%x", __func__, index); + return 0; + } +} + +void svm_update_lbrv(struct kvm_vcpu *vcpu) +{ + struct vcpu_svm *svm = to_svm(vcpu); + + bool enable_lbrv = svm_get_lbr_msr(svm, MSR_IA32_DEBUGCTLMSR) & + DEBUGCTLMSR_LBR; + + bool current_enable_lbrv = !!(svm->vmcb->control.virt_ext & + LBR_CTL_ENABLE_MASK); + + if (unlikely(is_guest_mode(vcpu) && svm->lbrv_enabled)) + if (unlikely(svm->nested.ctl.virt_ext & LBR_CTL_ENABLE_MASK)) + enable_lbrv = true; + + if (enable_lbrv == current_enable_lbrv) + return; + + if (enable_lbrv) + svm_enable_lbrv(vcpu); + else + svm_disable_lbrv(vcpu); } void disable_nmi_singlestep(struct vcpu_svm *svm) @@ -841,6 +1019,9 @@ static void grow_ple_window(struct kvm_vcpu *vcpu) struct vmcb_control_area *control = &svm->vmcb->control; int old = control->pause_filter_count; + if (kvm_pause_in_guest(vcpu->kvm)) + return; + control->pause_filter_count = __grow_ple_window(old, pause_filter_count, pause_filter_count_grow, @@ -859,6 +1040,9 @@ static void shrink_ple_window(struct kvm_vcpu *vcpu) struct vmcb_control_area *control = &svm->vmcb->control; int old = control->pause_filter_count; + if (kvm_pause_in_guest(vcpu->kvm)) + return; + control->pause_filter_count = __shrink_ple_window(old, pause_filter_count, @@ -871,52 +1055,11 @@ static void shrink_ple_window(struct kvm_vcpu *vcpu) } } -/* - * The default MMIO mask is a single bit (excluding the present bit), - * which could conflict with the memory encryption bit. Check for - * memory encryption support and override the default MMIO mask if - * memory encryption is enabled. - */ -static __init void svm_adjust_mmio_mask(void) -{ - unsigned int enc_bit, mask_bit; - u64 msr, mask; - - /* If there is no memory encryption support, use existing mask */ - if (cpuid_eax(0x80000000) < 0x8000001f) - return; - - /* If memory encryption is not enabled, use existing mask */ - rdmsrl(MSR_AMD64_SYSCFG, msr); - if (!(msr & MSR_AMD64_SYSCFG_MEM_ENCRYPT)) - return; - - enc_bit = cpuid_ebx(0x8000001f) & 0x3f; - mask_bit = boot_cpu_data.x86_phys_bits; - - /* Increment the mask bit if it is the same as the encryption bit */ - if (enc_bit == mask_bit) - mask_bit++; - - /* - * If the mask bit location is below 52, then some bits above the - * physical addressing limit will always be reserved, so use the - * rsvd_bits() function to generate the mask. This mask, along with - * the present bit, will be used to generate a page fault with - * PFER.RSV = 1. - * - * If the mask bit location is 52 (or above), then clear the mask. - */ - mask = (mask_bit < 52) ? rsvd_bits(mask_bit, 51) | PT_PRESENT_MASK : 0; - - kvm_mmu_set_mmio_spte_mask(mask, mask, PT_WRITABLE_MASK | PT_USER_MASK); -} - -static void svm_hardware_teardown(void) +static void svm_hardware_unsetup(void) { int cpu; - sev_hardware_teardown(); + sev_hardware_unsetup(); for_each_possible_cpu(cpu) svm_cpu_uninit(cpu); @@ -926,191 +1069,6 @@ static void svm_hardware_teardown(void) iopm_base = 0; } -static __init void svm_set_cpu_caps(void) -{ - kvm_set_cpu_caps(); - - supported_xss = 0; - - /* CPUID 0x80000001 and 0x8000000A (SVM features) */ - if (nested) { - kvm_cpu_cap_set(X86_FEATURE_SVM); - - if (nrips) - kvm_cpu_cap_set(X86_FEATURE_NRIPS); - - if (npt_enabled) - kvm_cpu_cap_set(X86_FEATURE_NPT); - - if (tsc_scaling) - kvm_cpu_cap_set(X86_FEATURE_TSCRATEMSR); - - /* Nested VM can receive #VMEXIT instead of triggering #GP */ - kvm_cpu_cap_set(X86_FEATURE_SVME_ADDR_CHK); - } - - /* CPUID 0x80000008 */ - if (boot_cpu_has(X86_FEATURE_LS_CFG_SSBD) || - boot_cpu_has(X86_FEATURE_AMD_SSBD)) - kvm_cpu_cap_set(X86_FEATURE_VIRT_SSBD); - - /* CPUID 0x8000001F (SME/SEV features) */ - sev_set_cpu_caps(); -} - -static __init int svm_hardware_setup(void) -{ - int cpu; - struct page *iopm_pages; - void *iopm_va; - int r; - unsigned int order = get_order(IOPM_SIZE); - - /* - * NX is required for shadow paging and for NPT if the NX huge pages - * mitigation is enabled. - */ - if (!boot_cpu_has(X86_FEATURE_NX)) { - pr_err_ratelimited("NX (Execute Disable) not supported\n"); - return -EOPNOTSUPP; - } - kvm_enable_efer_bits(EFER_NX); - - iopm_pages = alloc_pages(GFP_KERNEL, order); - - if (!iopm_pages) - return -ENOMEM; - - iopm_va = page_address(iopm_pages); - memset(iopm_va, 0xff, PAGE_SIZE * (1 << order)); - iopm_base = page_to_pfn(iopm_pages) << PAGE_SHIFT; - - init_msrpm_offsets(); - - supported_xcr0 &= ~(XFEATURE_MASK_BNDREGS | XFEATURE_MASK_BNDCSR); - - if (boot_cpu_has(X86_FEATURE_FXSR_OPT)) - kvm_enable_efer_bits(EFER_FFXSR); - - if (tsc_scaling) { - if (!boot_cpu_has(X86_FEATURE_TSCRATEMSR)) { - tsc_scaling = false; - } else { - pr_info("TSC scaling supported\n"); - kvm_has_tsc_control = true; - kvm_max_tsc_scaling_ratio = TSC_RATIO_MAX; - kvm_tsc_scaling_ratio_frac_bits = 32; - } - } - - tsc_aux_uret_slot = kvm_add_user_return_msr(MSR_TSC_AUX); - - /* Check for pause filtering support */ - if (!boot_cpu_has(X86_FEATURE_PAUSEFILTER)) { - pause_filter_count = 0; - pause_filter_thresh = 0; - } else if (!boot_cpu_has(X86_FEATURE_PFTHRESHOLD)) { - pause_filter_thresh = 0; - } - - if (nested) { - printk(KERN_INFO "kvm: Nested Virtualization enabled\n"); - kvm_enable_efer_bits(EFER_SVME | EFER_LMSLE); - } - - /* - * KVM's MMU doesn't support using 2-level paging for itself, and thus - * NPT isn't supported if the host is using 2-level paging since host - * CR4 is unchanged on VMRUN. - */ - if (!IS_ENABLED(CONFIG_X86_64) && !IS_ENABLED(CONFIG_X86_PAE)) - npt_enabled = false; - - if (!boot_cpu_has(X86_FEATURE_NPT)) - npt_enabled = false; - - /* Force VM NPT level equal to the host's max NPT level */ - kvm_configure_mmu(npt_enabled, get_max_npt_level(), - get_max_npt_level(), PG_LEVEL_1G); - pr_info("kvm: Nested Paging %sabled\n", npt_enabled ? "en" : "dis"); - - /* Note, SEV setup consumes npt_enabled. */ - sev_hardware_setup(); - - svm_hv_hardware_setup(); - - svm_adjust_mmio_mask(); - - for_each_possible_cpu(cpu) { - r = svm_cpu_init(cpu); - if (r) - goto err; - } - - if (nrips) { - if (!boot_cpu_has(X86_FEATURE_NRIPS)) - nrips = false; - } - - enable_apicv = avic = avic && npt_enabled && boot_cpu_has(X86_FEATURE_AVIC); - - if (enable_apicv) { - pr_info("AVIC enabled\n"); - - amd_iommu_register_ga_log_notifier(&avic_ga_log_notifier); - } - - if (vls) { - if (!npt_enabled || - !boot_cpu_has(X86_FEATURE_V_VMSAVE_VMLOAD) || - !IS_ENABLED(CONFIG_X86_64)) { - vls = false; - } else { - pr_info("Virtual VMLOAD VMSAVE supported\n"); - } - } - - if (boot_cpu_has(X86_FEATURE_SVME_ADDR_CHK)) - svm_gp_erratum_intercept = false; - - if (vgif) { - if (!boot_cpu_has(X86_FEATURE_VGIF)) - vgif = false; - else - pr_info("Virtual GIF supported\n"); - } - - if (lbrv) { - if (!boot_cpu_has(X86_FEATURE_LBRV)) - lbrv = false; - else - pr_info("LBR virtualization supported\n"); - } - - svm_set_cpu_caps(); - - /* - * It seems that on AMD processors PTE's accessed bit is - * being set by the CPU hardware before the NPF vmexit. - * This is not expected behaviour and our tests fail because - * of it. - * A workaround here is to disable support for - * GUEST_MAXPHYADDR < HOST_MAXPHYADDR if NPT is enabled. - * In this case userspace can know if there is support using - * KVM_CAP_SMALLER_MAXPHYADDR extension and decide how to handle - * it - * If future AMD CPU models change the behaviour described above, - * this variable can be changed accordingly - */ - allow_smaller_maxphyaddr = !npt_enabled; - - return 0; - -err: - svm_hardware_teardown(); - return r; -} - static void init_seg(struct vmcb_seg *seg) { seg->selector = 0; @@ -1151,11 +1109,12 @@ static void svm_write_tsc_offset(struct kvm_vcpu *vcpu, u64 offset) vmcb_mark_dirty(svm->vmcb, VMCB_INTERCEPTS); } -void svm_write_tsc_multiplier(struct kvm_vcpu *vcpu, u64 multiplier) +static void svm_write_tsc_multiplier(struct kvm_vcpu *vcpu, u64 multiplier) { - wrmsrl(MSR_AMD64_TSC_RATIO, multiplier); + __svm_write_tsc_multiplier(multiplier); } + /* Evaluate instruction intercepts that depend on guest CPUID features. */ static void svm_recalc_instruction_intercepts(struct kvm_vcpu *vcpu, struct vcpu_svm *svm) @@ -1196,6 +1155,8 @@ static inline void init_vmcb_after_set_cpuid(struct kvm_vcpu *vcpu) set_msr_interception(vcpu, svm->msrpm, MSR_IA32_SYSENTER_EIP, 0, 0); set_msr_interception(vcpu, svm->msrpm, MSR_IA32_SYSENTER_ESP, 0, 0); + + svm->v_vmload_vmsave_enabled = false; } else { /* * If hardware supports Virtual VMLOAD VMSAVE then enable it @@ -1215,8 +1176,9 @@ static inline void init_vmcb_after_set_cpuid(struct kvm_vcpu *vcpu) static void init_vmcb(struct kvm_vcpu *vcpu) { struct vcpu_svm *svm = to_svm(vcpu); - struct vmcb_control_area *control = &svm->vmcb->control; - struct vmcb_save_area *save = &svm->vmcb->save; + struct vmcb *vmcb = svm->vmcb01.ptr; + struct vmcb_control_area *control = &vmcb->control; + struct vmcb_save_area *save = &vmcb->save; svm_set_intercept(svm, INTERCEPT_CR0_READ); svm_set_intercept(svm, INTERCEPT_CR3_READ); @@ -1238,9 +1200,10 @@ static void init_vmcb(struct kvm_vcpu *vcpu) * Guest access to VMware backdoor ports could legitimately * trigger #GP because of TSS I/O permission bitmap. * We intercept those #GP and allow access to them anyway - * as VMware does. + * as VMware does. Don't intercept #GP for SEV guests as KVM can't + * decrypt guest memory to decode the faulting instruction. */ - if (enable_vmware_backdoor) + if (enable_vmware_backdoor && !sev_guest(vcpu->kvm)) set_exception_intercept(svm, GP_VECTOR); svm_set_intercept(svm, INTERCEPT_INTR); @@ -1339,7 +1302,7 @@ static void init_vmcb(struct kvm_vcpu *vcpu) set_msr_interception(vcpu, svm->msrpm, MSR_IA32_SPEC_CTRL, 1, 1); if (kvm_vcpu_apicv_active(vcpu)) - avic_init_vmcb(svm); + avic_init_vmcb(svm, vmcb); if (vgif) { svm_clr_intercept(svm, INTERCEPT_STGI); @@ -1347,20 +1310,13 @@ static void init_vmcb(struct kvm_vcpu *vcpu) svm->vmcb->control.int_ctl |= V_GIF_ENABLE_MASK; } - if (sev_guest(vcpu->kvm)) { - svm->vmcb->control.nested_ctl |= SVM_NESTED_CTL_SEV_ENABLE; - clr_exception_intercept(svm, UD_VECTOR); - - if (sev_es_guest(vcpu->kvm)) { - /* Perform SEV-ES specific VMCB updates */ - sev_es_init_vmcb(svm); - } - } + if (sev_guest(vcpu->kvm)) + sev_init_vmcb(svm); - svm_hv_init_vmcb(svm->vmcb); + svm_hv_init_vmcb(vmcb); init_vmcb_after_set_cpuid(vcpu); - vmcb_mark_all_dirty(svm->vmcb); + vmcb_mark_all_dirty(vmcb); enable_gif(svm); } @@ -1373,7 +1329,7 @@ static void __svm_vcpu_reset(struct kvm_vcpu *vcpu) svm_init_osvw(vcpu); vcpu->arch.microcode_version = 0x01000065; - svm->tsc_ratio_msr = kvm_default_tsc_scaling_ratio; + svm->tsc_ratio_msr = kvm_caps.default_tsc_scaling_ratio; if (sev_es_guest(vcpu->kvm)) sev_es_vcpu_reset(svm); @@ -1398,7 +1354,7 @@ void svm_switch_vmcb(struct vcpu_svm *svm, struct kvm_vmcb_info *target_vmcb) svm->vmcb = target_vmcb->ptr; } -static int svm_create_vcpu(struct kvm_vcpu *vcpu) +static int svm_vcpu_create(struct kvm_vcpu *vcpu) { struct vcpu_svm *svm; struct page *vmcb01_page; @@ -1435,18 +1391,14 @@ static int svm_create_vcpu(struct kvm_vcpu *vcpu) if (err) goto error_free_vmsa_page; - /* We initialize this flag to true to make sure that the is_running - * bit would be set the first time the vcpu is loaded. - */ - if (irqchip_in_kernel(vcpu->kvm) && kvm_apicv_activated(vcpu->kvm)) - svm->avic_is_running = true; - svm->msrpm = svm_vcpu_alloc_msrpm(); if (!svm->msrpm) { err = -ENOMEM; goto error_free_vmsa_page; } + svm->x2avic_msrs_intercepted = true; + svm->vmcb01.ptr = page_address(vmcb01_page); svm->vmcb01.pa = __sme_set(page_to_pfn(vmcb01_page) << PAGE_SHIFT); svm_switch_vmcb(svm, &svm->vmcb01); @@ -1472,10 +1424,10 @@ static void svm_clear_current_vmcb(struct vmcb *vmcb) int i; for_each_online_cpu(i) - cmpxchg(&per_cpu(svm_data, i)->current_vmcb, vmcb, NULL); + cmpxchg(per_cpu_ptr(&svm_data.current_vmcb, i), vmcb, NULL); } -static void svm_free_vcpu(struct kvm_vcpu *vcpu) +static void svm_vcpu_free(struct kvm_vcpu *vcpu) { struct vcpu_svm *svm = to_svm(vcpu); @@ -1494,10 +1446,10 @@ static void svm_free_vcpu(struct kvm_vcpu *vcpu) __free_pages(virt_to_page(svm->msrpm), get_order(MSRPM_SIZE)); } -static void svm_prepare_guest_switch(struct kvm_vcpu *vcpu) +static void svm_prepare_switch_to_guest(struct kvm_vcpu *vcpu) { struct vcpu_svm *svm = to_svm(vcpu); - struct svm_cpu_data *sd = per_cpu(svm_data, vcpu->cpu); + struct svm_cpu_data *sd = per_cpu_ptr(&svm_data, vcpu->cpu); if (sev_es_guest(vcpu->kvm)) sev_es_unmap_ghcb(svm); @@ -1509,20 +1461,17 @@ static void svm_prepare_guest_switch(struct kvm_vcpu *vcpu) * Save additional host state that will be restored on VMEXIT (sev-es) * or subsequent vmload of host save area. */ + vmsave(sd->save_area_pa); if (sev_es_guest(vcpu->kvm)) { - sev_es_prepare_guest_switch(svm, vcpu->cpu); - } else { - vmsave(__sme_page_pa(sd->save_area)); - } + struct sev_es_save_area *hostsa; + hostsa = (struct sev_es_save_area *)(page_address(sd->save_area) + 0x400); - if (tsc_scaling) { - u64 tsc_ratio = vcpu->arch.tsc_scaling_ratio; - if (tsc_ratio != __this_cpu_read(current_tsc_ratio)) { - __this_cpu_write(current_tsc_ratio, tsc_ratio); - wrmsrl(MSR_AMD64_TSC_RATIO, tsc_ratio); - } + sev_es_prepare_switch_to_guest(hostsa); } + if (tsc_scaling) + __svm_write_tsc_multiplier(vcpu->arch.tsc_scaling_ratio); + if (likely(tsc_aux_uret_slot >= 0)) kvm_set_user_return_msr(tsc_aux_uret_slot, svm->tsc_aux, -1ull); @@ -1537,7 +1486,7 @@ static void svm_prepare_host_switch(struct kvm_vcpu *vcpu) static void svm_vcpu_load(struct kvm_vcpu *vcpu, int cpu) { struct vcpu_svm *svm = to_svm(vcpu); - struct svm_cpu_data *sd = per_cpu(svm_data, cpu); + struct svm_cpu_data *sd = per_cpu_ptr(&svm_data, cpu); if (sd->current_vmcb != svm->vmcb) { sd->current_vmcb = svm->vmcb; @@ -1585,12 +1534,27 @@ static void svm_set_rflags(struct kvm_vcpu *vcpu, unsigned long rflags) to_svm(vcpu)->vmcb->save.rflags = rflags; } +static bool svm_get_if_flag(struct kvm_vcpu *vcpu) +{ + struct vmcb *vmcb = to_svm(vcpu)->vmcb; + + return sev_es_guest(vcpu->kvm) + ? vmcb->control.int_state & SVM_GUEST_INTERRUPT_MASK + : kvm_get_rflags(vcpu) & X86_EFLAGS_IF; +} + static void svm_cache_reg(struct kvm_vcpu *vcpu, enum kvm_reg reg) { + kvm_register_mark_available(vcpu, reg); + switch (reg) { case VCPU_EXREG_PDPTR: - BUG_ON(!npt_enabled); - load_pdptrs(vcpu, vcpu->arch.walk_mmu, kvm_read_cr3(vcpu)); + /* + * When !npt_enabled, mmu->pdptrs[] is already available since + * it is always updated per SDM when moving to CRs. + */ + if (npt_enabled) + load_pdptrs(vcpu, kvm_read_cr3(vcpu)); break; default: KVM_BUG_ON(1, vcpu->kvm); @@ -1604,7 +1568,7 @@ static void svm_set_vintr(struct vcpu_svm *svm) /* * The following fields are ignored when AVIC is enabled */ - WARN_ON(kvm_apicv_activated(svm->vcpu.kvm)); + WARN_ON(kvm_vcpu_apicv_activated(&svm->vcpu)); svm_set_intercept(svm, INTERCEPT_VINTR); @@ -1743,6 +1707,15 @@ static int svm_get_cpl(struct kvm_vcpu *vcpu) return save->cpl; } +static void svm_get_cs_db_l_bits(struct kvm_vcpu *vcpu, int *db, int *l) +{ + struct kvm_segment cs; + + svm_get_segment(vcpu, &cs, VCPU_SREG_CS); + *db = cs.db; + *l = cs.l; +} + static void svm_get_idt(struct kvm_vcpu *vcpu, struct desc_ptr *dt) { struct vcpu_svm *svm = to_svm(vcpu); @@ -1777,10 +1750,29 @@ static void svm_set_gdt(struct kvm_vcpu *vcpu, struct desc_ptr *dt) vmcb_mark_dirty(svm->vmcb, VMCB_DT); } +static void sev_post_set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3) +{ + struct vcpu_svm *svm = to_svm(vcpu); + + /* + * For guests that don't set guest_state_protected, the cr3 update is + * handled via kvm_mmu_load() while entering the guest. For guests + * that do (SEV-ES/SEV-SNP), the cr3 update needs to be written to + * VMCB save area now, since the save area will become the initial + * contents of the VMSA, and future VMCB save area updates won't be + * seen. + */ + if (sev_es_guest(vcpu->kvm)) { + svm->vmcb->save.cr3 = cr3; + vmcb_mark_dirty(svm->vmcb, VMCB_CR); + } +} + void svm_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0) { struct vcpu_svm *svm = to_svm(vcpu); u64 hcr0 = cr0; + bool old_paging = is_paging(vcpu); #ifdef CONFIG_X86_64 if (vcpu->arch.efer & EFER_LME && !vcpu->arch.guest_state_protected) { @@ -1797,8 +1789,11 @@ void svm_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0) #endif vcpu->arch.cr0 = cr0; - if (!npt_enabled) + if (!npt_enabled) { hcr0 |= X86_CR0_PG | X86_CR0_WP; + if (old_paging != is_paging(vcpu)) + svm_set_cr4(vcpu, kvm_read_cr4(vcpu)); + } /* * re-enable caching here because the QEMU bios @@ -1839,11 +1834,15 @@ void svm_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4) unsigned long old_cr4 = vcpu->arch.cr4; if (npt_enabled && ((old_cr4 ^ cr4) & X86_CR4_PGE)) - svm_flush_tlb(vcpu); + svm_flush_tlb_current(vcpu); vcpu->arch.cr4 = cr4; - if (!npt_enabled) + if (!npt_enabled) { cr4 |= X86_CR4_PAE; + + if (!is_paging(vcpu)) + cr4 &= ~(X86_CR4_SMEP | X86_CR4_SMAP | X86_CR4_PKE); + } cr4 |= host_cr4_mce; to_svm(vcpu)->vmcb->save.cr4 = cr4; vmcb_mark_dirty(to_svm(vcpu)->vmcb, VMCB_CR); @@ -1973,7 +1972,7 @@ static int npf_interception(struct kvm_vcpu *vcpu) u64 fault_address = svm->vmcb->control.exit_info_2; u64 error_code = svm->vmcb->control.exit_info_1; - trace_kvm_page_fault(fault_address, error_code); + trace_kvm_page_fault(vcpu, fault_address, error_code); return kvm_mmu_page_fault(vcpu, fault_address, error_code, static_cpu_has(X86_FEATURE_DECODEASSISTS) ? svm->vmcb->control.insn_bytes : NULL, @@ -2292,10 +2291,6 @@ static int gp_interception(struct kvm_vcpu *vcpu) if (error_code) goto reinject; - /* All SVM instructions expect page aligned RAX */ - if (svm->vmcb->save.rax & ~PAGE_MASK) - goto reinject; - /* Decode the instruction for usage later */ if (x86_decode_emulated_instruction(vcpu, 0, NULL, 0) != EMULATION_OK) goto reinject; @@ -2313,8 +2308,13 @@ static int gp_interception(struct kvm_vcpu *vcpu) if (!is_guest_mode(vcpu)) return kvm_emulate_instruction(vcpu, EMULTYPE_VMWARE_GP | EMULTYPE_NO_DECODE); - } else + } else { + /* All SVM instructions expect page aligned RAX */ + if (svm->vmcb->save.rax & ~PAGE_MASK) + goto reinject; + return emulate_svm_instr(vcpu, opcode); + } reinject: kvm_queue_exception_e(vcpu, GP_VECTOR, error_code); @@ -2330,7 +2330,7 @@ void svm_set_gif(struct vcpu_svm *svm, bool value) * Likewise, clear the VINTR intercept, we will set it * again while processing KVM_REQ_EVENT if needed. */ - if (vgif_enabled(svm)) + if (vgif) svm_clr_intercept(svm, INTERCEPT_STGI); if (svm_is_intercept(svm, INTERCEPT_VINTR)) svm_clear_vintr(svm); @@ -2338,7 +2338,8 @@ void svm_set_gif(struct vcpu_svm *svm, bool value) enable_gif(svm); if (svm->vcpu.arch.smi_pending || svm->vcpu.arch.nmi_pending || - kvm_cpu_has_injectable_intr(&svm->vcpu)) + kvm_cpu_has_injectable_intr(&svm->vcpu) || + kvm_apic_has_pending_init_or_sipi(&svm->vcpu)) kvm_make_request(KVM_REQ_EVENT, &svm->vcpu); } else { disable_gif(svm); @@ -2348,7 +2349,7 @@ void svm_set_gif(struct vcpu_svm *svm, bool value) * in use, we still rely on the VINTR intercept (rather than * STGI) to detect an open interrupt window. */ - if (!vgif_enabled(svm)) + if (!vgif) svm_clear_vintr(svm); } } @@ -2445,6 +2446,7 @@ static int task_switch_interception(struct kvm_vcpu *vcpu) kvm_clear_exception_queue(vcpu); break; case SVM_EXITINTINFO_TYPE_INTR: + case SVM_EXITINTINFO_TYPE_SOFT: kvm_clear_interrupt_queue(vcpu); break; default: @@ -2456,7 +2458,7 @@ static int task_switch_interception(struct kvm_vcpu *vcpu) int_type == SVM_EXITINTINFO_TYPE_SOFT || (int_type == SVM_EXITINTINFO_TYPE_EXEPT && (int_vec == OF_VECTOR || int_vec == BP_VECTOR))) { - if (!skip_emulated_instruction(vcpu)) + if (!svm_skip_emulated_instruction(vcpu)) return 0; } @@ -2508,7 +2510,7 @@ static bool check_selective_cr0_intercepted(struct kvm_vcpu *vcpu, bool ret = false; if (!is_guest_mode(vcpu) || - (!(vmcb_is_intercept(&svm->nested.ctl, INTERCEPT_SELECTIVE_CR0)))) + (!(vmcb12_is_intercept(&svm->nested.ctl, INTERCEPT_SELECTIVE_CR0)))) return false; cr0 &= ~SVM_CR0_SELECTIVE_MASK; @@ -2763,25 +2765,12 @@ static int svm_get_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info) case MSR_TSC_AUX: msr_info->data = svm->tsc_aux; break; - /* - * Nobody will change the following 5 values in the VMCB so we can - * safely return them on rdmsr. They will always be 0 until LBRV is - * implemented. - */ case MSR_IA32_DEBUGCTLMSR: - msr_info->data = svm->vmcb->save.dbgctl; - break; case MSR_IA32_LASTBRANCHFROMIP: - msr_info->data = svm->vmcb->save.br_from; - break; case MSR_IA32_LASTBRANCHTOIP: - msr_info->data = svm->vmcb->save.br_to; - break; case MSR_IA32_LASTINTFROMIP: - msr_info->data = svm->vmcb->save.last_excp_from; - break; case MSR_IA32_LASTINTTOIP: - msr_info->data = svm->vmcb->save.last_excp_to; + msr_info->data = svm_get_lbr_msr(svm, msr_info->index); break; case MSR_VM_HSAVE_PA: msr_info->data = svm->nested.hsave_msr; @@ -2880,10 +2869,25 @@ static int svm_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr) u64 data = msr->data; switch (ecx) { case MSR_AMD64_TSC_RATIO: - if (!msr->host_initiated && !svm->tsc_scaling_enabled) - return 1; - if (data & TSC_RATIO_RSVD) + if (!svm->tsc_scaling_enabled) { + + if (!msr->host_initiated) + return 1; + /* + * In case TSC scaling is not enabled, always + * leave this MSR at the default value. + * + * Due to bug in qemu 6.2.0, it would try to set + * this msr to 0 if tsc scaling is not enabled. + * Ignore this value as well. + */ + if (data != 0 && data != svm->tsc_ratio_msr) + return 1; + break; + } + + if (data & SVM_TSC_RATIO_RSVD) return 1; svm->tsc_ratio_msr = data; @@ -3012,12 +3016,13 @@ static int svm_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr) if (data & DEBUGCTL_RESERVED_BITS) return 1; - svm->vmcb->save.dbgctl = data; - vmcb_mark_dirty(svm->vmcb, VMCB_LBR); - if (data & (1ULL<<0)) - svm_enable_lbrv(vcpu); + if (svm->vmcb->control.virt_ext & LBR_CTL_ENABLE_MASK) + svm->vmcb->save.dbgctl = data; else - svm_disable_lbrv(vcpu); + svm->vmcb01.ptr->save.dbgctl = data; + + svm_update_lbrv(vcpu); + break; case MSR_VM_HSAVE_PA: /* @@ -3074,11 +3079,18 @@ static int interrupt_window_interception(struct kvm_vcpu *vcpu) svm_clear_vintr(to_svm(vcpu)); /* - * For AVIC, the only reason to end up here is ExtINTs. + * If not running nested, for AVIC, the only reason to end up here is ExtINTs. * In this case AVIC was temporarily disabled for * requesting the IRQ window and we have to re-enable it. + * + * If running nested, still remove the VM wide AVIC inhibit to + * support case in which the interrupt window was requested when the + * vCPU was not running nested. + + * All vCPUs which run still run nested, will remain to have their + * AVIC still inhibited due to per-cpu AVIC inhibition. */ - kvm_request_apicv_update(vcpu->kvm, true, APICV_INHIBIT_REASON_IRQWIN); + kvm_clear_apicv_inhibit(vcpu->kvm, APICV_INHIBIT_REASON_IRQWIN); ++vcpu->stat.irq_window_exits; return 1; @@ -3087,7 +3099,6 @@ static int interrupt_window_interception(struct kvm_vcpu *vcpu) static int pause_interception(struct kvm_vcpu *vcpu) { bool in_kernel; - /* * CPL is not made available for an SEV-ES guest, therefore * vcpu->arch.preempted_in_kernel can never be true. Just @@ -3095,8 +3106,7 @@ static int pause_interception(struct kvm_vcpu *vcpu) */ in_kernel = !sev_es_guest(vcpu->kvm) && svm_get_cpl(vcpu) == 0; - if (!kvm_pause_in_guest(vcpu->kvm)) - grow_ple_window(vcpu); + grow_ple_window(vcpu); kvm_vcpu_on_spin(vcpu, in_kernel); return kvm_skip_emulated_instruction(vcpu); @@ -3290,8 +3300,8 @@ static void dump_vmcb(struct kvm_vcpu *vcpu) "tr:", save01->tr.selector, save01->tr.attrib, save01->tr.limit, save01->tr.base); - pr_err("cpl: %d efer: %016llx\n", - save->cpl, save->efer); + pr_err("vmpl: %d cpl: %d efer: %016llx\n", + save->vmpl, save->cpl, save->efer); pr_err("%-15s %016llx %-13s %016llx\n", "cr0:", save->cr0, "cr2:", save->cr2); pr_err("%-15s %016llx %-13s %016llx\n", @@ -3321,7 +3331,7 @@ static void dump_vmcb(struct kvm_vcpu *vcpu) "excp_to:", save->last_excp_to); } -static bool svm_check_exit_valid(struct kvm_vcpu *vcpu, u64 exit_code) +static bool svm_check_exit_valid(u64 exit_code) { return (exit_code < ARRAY_SIZE(svm_exit_handlers) && svm_exit_handlers[exit_code]); @@ -3341,7 +3351,7 @@ static int svm_handle_invalid_exit(struct kvm_vcpu *vcpu, u64 exit_code) int svm_invoke_exit_handler(struct kvm_vcpu *vcpu, u64 exit_code) { - if (!svm_check_exit_valid(vcpu, exit_code)) + if (!svm_check_exit_valid(exit_code)) return svm_handle_invalid_exit(vcpu, exit_code); #ifdef CONFIG_RETPOLINE @@ -3376,7 +3386,7 @@ static void svm_get_exit_info(struct kvm_vcpu *vcpu, u32 *reason, *error_code = 0; } -static int handle_exit(struct kvm_vcpu *vcpu, fastpath_t exit_fastpath) +static int svm_handle_exit(struct kvm_vcpu *vcpu, fastpath_t exit_fastpath) { struct vcpu_svm *svm = to_svm(vcpu); struct kvm_run *kvm_run = vcpu->run; @@ -3432,7 +3442,7 @@ static int handle_exit(struct kvm_vcpu *vcpu, fastpath_t exit_fastpath) static void reload_tss(struct kvm_vcpu *vcpu) { - struct svm_cpu_data *sd = per_cpu(svm_data, vcpu->cpu); + struct svm_cpu_data *sd = per_cpu_ptr(&svm_data, vcpu->cpu); sd->tss_desc->type = 9; /* available 32/64-bit TSS */ load_TR_desc(); @@ -3440,7 +3450,7 @@ static void reload_tss(struct kvm_vcpu *vcpu) static void pre_svm_run(struct kvm_vcpu *vcpu) { - struct svm_cpu_data *sd = per_cpu(svm_data, vcpu->cpu); + struct svm_cpu_data *sd = per_cpu_ptr(&svm_data, vcpu->cpu); struct vcpu_svm *svm = to_svm(vcpu); /* @@ -3467,23 +3477,86 @@ static void svm_inject_nmi(struct kvm_vcpu *vcpu) struct vcpu_svm *svm = to_svm(vcpu); svm->vmcb->control.event_inj = SVM_EVTINJ_VALID | SVM_EVTINJ_TYPE_NMI; + + if (svm->nmi_l1_to_l2) + return; + vcpu->arch.hflags |= HF_NMI_MASK; if (!sev_es_guest(vcpu->kvm)) svm_set_intercept(svm, INTERCEPT_IRET); ++vcpu->stat.nmi_injections; } -static void svm_set_irq(struct kvm_vcpu *vcpu) +static void svm_inject_irq(struct kvm_vcpu *vcpu, bool reinjected) { struct vcpu_svm *svm = to_svm(vcpu); + u32 type; - BUG_ON(!(gif_set(svm))); + if (vcpu->arch.interrupt.soft) { + if (svm_update_soft_interrupt_rip(vcpu)) + return; + + type = SVM_EVTINJ_TYPE_SOFT; + } else { + type = SVM_EVTINJ_TYPE_INTR; + } - trace_kvm_inj_virq(vcpu->arch.interrupt.nr); + trace_kvm_inj_virq(vcpu->arch.interrupt.nr, + vcpu->arch.interrupt.soft, reinjected); ++vcpu->stat.irq_injections; svm->vmcb->control.event_inj = vcpu->arch.interrupt.nr | - SVM_EVTINJ_VALID | SVM_EVTINJ_TYPE_INTR; + SVM_EVTINJ_VALID | type; +} + +void svm_complete_interrupt_delivery(struct kvm_vcpu *vcpu, int delivery_mode, + int trig_mode, int vector) +{ + /* + * apic->apicv_active must be read after vcpu->mode. + * Pairs with smp_store_release in vcpu_enter_guest. + */ + bool in_guest_mode = (smp_load_acquire(&vcpu->mode) == IN_GUEST_MODE); + + /* Note, this is called iff the local APIC is in-kernel. */ + if (!READ_ONCE(vcpu->arch.apic->apicv_active)) { + /* Process the interrupt via kvm_check_and_inject_events(). */ + kvm_make_request(KVM_REQ_EVENT, vcpu); + kvm_vcpu_kick(vcpu); + return; + } + + trace_kvm_apicv_accept_irq(vcpu->vcpu_id, delivery_mode, trig_mode, vector); + if (in_guest_mode) { + /* + * Signal the doorbell to tell hardware to inject the IRQ. If + * the vCPU exits the guest before the doorbell chimes, hardware + * will automatically process AVIC interrupts at the next VMRUN. + */ + avic_ring_doorbell(vcpu); + } else { + /* + * Wake the vCPU if it was blocking. KVM will then detect the + * pending IRQ when checking if the vCPU has a wake event. + */ + kvm_vcpu_wake_up(vcpu); + } +} + +static void svm_deliver_interrupt(struct kvm_lapic *apic, int delivery_mode, + int trig_mode, int vector) +{ + kvm_lapic_set_irr(vector, apic); + + /* + * Pairs with the smp_mb_*() after setting vcpu->guest_mode in + * vcpu_enter_guest() to ensure the write to the vIRR is ordered before + * the read of guest_mode. This guarantees that either VMRUN will see + * and process the new vIRR entry, or that svm_complete_interrupt_delivery + * will signal the doorbell if the CPU has already entered the guest. + */ + smp_mb__after_atomic(); + svm_complete_interrupt_delivery(apic->vcpu, delivery_mode, trig_mode, vector); } static void svm_update_cr8_intercept(struct kvm_vcpu *vcpu, int tpr, int irr) @@ -3533,11 +3606,13 @@ static int svm_nmi_allowed(struct kvm_vcpu *vcpu, bool for_injection) if (svm->nested.nested_run_pending) return -EBUSY; + if (svm_nmi_blocked(vcpu)) + return 0; + /* An NMI must not be injected into L2 if it's supposed to VM-Exit. */ if (for_injection && is_guest_mode(vcpu) && nested_exit_on_nmi(svm)) return -EBUSY; - - return !svm_nmi_blocked(vcpu); + return 1; } static bool svm_get_nmi_mask(struct kvm_vcpu *vcpu) @@ -3568,14 +3643,7 @@ bool svm_interrupt_blocked(struct kvm_vcpu *vcpu) if (!gif_set(svm)) return true; - if (sev_es_guest(vcpu->kvm)) { - /* - * SEV-ES guests to not expose RFLAGS. Use the VMCB interrupt mask - * bit to determine the state of the IF flag. - */ - if (!(vmcb->control.int_state & SVM_GUEST_INTERRUPT_MASK)) - return true; - } else if (is_guest_mode(vcpu)) { + if (is_guest_mode(vcpu)) { /* As long as interrupts are being delivered... */ if ((svm->nested.ctl.int_ctl & V_INTR_MASKING_MASK) ? !(svm->vmcb01.ptr->save.rflags & X86_EFLAGS_IF) @@ -3586,7 +3654,7 @@ bool svm_interrupt_blocked(struct kvm_vcpu *vcpu) if (nested_exit_on_intr(svm)) return false; } else { - if (!(kvm_get_rflags(vcpu) & X86_EFLAGS_IF)) + if (!svm_get_if_flag(vcpu)) return true; } @@ -3596,9 +3664,13 @@ bool svm_interrupt_blocked(struct kvm_vcpu *vcpu) static int svm_interrupt_allowed(struct kvm_vcpu *vcpu, bool for_injection) { struct vcpu_svm *svm = to_svm(vcpu); + if (svm->nested.nested_run_pending) return -EBUSY; + if (svm_interrupt_blocked(vcpu)) + return 0; + /* * An IRQ must not be injected into L2 if it's supposed to VM-Exit, * e.g. if the IRQ arrived asynchronously after checking nested events. @@ -3606,7 +3678,7 @@ static int svm_interrupt_allowed(struct kvm_vcpu *vcpu, bool for_injection) if (for_injection && is_guest_mode(vcpu) && nested_exit_on_intr(svm)) return -EBUSY; - return !svm_interrupt_blocked(vcpu); + return 1; } static void svm_enable_irq_window(struct kvm_vcpu *vcpu) @@ -3621,14 +3693,20 @@ static void svm_enable_irq_window(struct kvm_vcpu *vcpu) * enabled, the STGI interception will not occur. Enable the irq * window under the assumption that the hardware will set the GIF. */ - if (vgif_enabled(svm) || gif_set(svm)) { + if (vgif || gif_set(svm)) { /* * IRQ window is not needed when AVIC is enabled, * unless we have pending ExtINT since it cannot be injected - * via AVIC. In such case, we need to temporarily disable AVIC, + * via AVIC. In such case, KVM needs to temporarily disable AVIC, * and fallback to injecting IRQ via V_IRQ. + * + * If running nested, AVIC is already locally inhibited + * on this vCPU, therefore there is no need to request + * the VM wide AVIC inhibition. */ - kvm_request_apicv_update(vcpu->kvm, false, APICV_INHIBIT_REASON_IRQWIN); + if (!is_guest_mode(vcpu)) + kvm_set_apicv_inhibit(vcpu->kvm, APICV_INHIBIT_REASON_IRQWIN); + svm_set_vintr(svm); } } @@ -3641,7 +3719,7 @@ static void svm_enable_nmi_window(struct kvm_vcpu *vcpu) return; /* IRET will cause a vm exit */ if (!gif_set(svm)) { - if (vgif_enabled(svm)) + if (vgif) svm_set_intercept(svm, INTERCEPT_STGI); return; /* STGI will cause a vm exit */ } @@ -3655,17 +3733,7 @@ static void svm_enable_nmi_window(struct kvm_vcpu *vcpu) svm->vmcb->save.rflags |= (X86_EFLAGS_TF | X86_EFLAGS_RF); } -static int svm_set_tss_addr(struct kvm *kvm, unsigned int addr) -{ - return 0; -} - -static int svm_set_identity_map_addr(struct kvm *kvm, u64 ident_addr) -{ - return 0; -} - -void svm_flush_tlb(struct kvm_vcpu *vcpu) +static void svm_flush_tlb_current(struct kvm_vcpu *vcpu) { struct vcpu_svm *svm = to_svm(vcpu); @@ -3716,15 +3784,49 @@ static inline void sync_lapic_to_cr8(struct kvm_vcpu *vcpu) svm->vmcb->control.int_ctl |= cr8 & V_TPR_MASK; } +static void svm_complete_soft_interrupt(struct kvm_vcpu *vcpu, u8 vector, + int type) +{ + bool is_exception = (type == SVM_EXITINTINFO_TYPE_EXEPT); + bool is_soft = (type == SVM_EXITINTINFO_TYPE_SOFT); + struct vcpu_svm *svm = to_svm(vcpu); + + /* + * If NRIPS is enabled, KVM must snapshot the pre-VMRUN next_rip that's + * associated with the original soft exception/interrupt. next_rip is + * cleared on all exits that can occur while vectoring an event, so KVM + * needs to manually set next_rip for re-injection. Unlike the !nrips + * case below, this needs to be done if and only if KVM is re-injecting + * the same event, i.e. if the event is a soft exception/interrupt, + * otherwise next_rip is unused on VMRUN. + */ + if (nrips && (is_soft || (is_exception && kvm_exception_is_soft(vector))) && + kvm_is_linear_rip(vcpu, svm->soft_int_old_rip + svm->soft_int_csbase)) + svm->vmcb->control.next_rip = svm->soft_int_next_rip; + /* + * If NRIPS isn't enabled, KVM must manually advance RIP prior to + * injecting the soft exception/interrupt. That advancement needs to + * be unwound if vectoring didn't complete. Note, the new event may + * not be the injected event, e.g. if KVM injected an INTn, the INTn + * hit a #NP in the guest, and the #NP encountered a #PF, the #NP will + * be the reported vectored event, but RIP still needs to be unwound. + */ + else if (!nrips && (is_soft || is_exception) && + kvm_is_linear_rip(vcpu, svm->soft_int_next_rip + svm->soft_int_csbase)) + kvm_rip_write(vcpu, svm->soft_int_old_rip); +} + static void svm_complete_interrupts(struct kvm_vcpu *vcpu) { struct vcpu_svm *svm = to_svm(vcpu); u8 vector; int type; u32 exitintinfo = svm->vmcb->control.exit_int_info; - unsigned int3_injected = svm->int3_injected; + bool nmi_l1_to_l2 = svm->nmi_l1_to_l2; + bool soft_int_injected = svm->soft_int_injected; - svm->int3_injected = 0; + svm->nmi_l1_to_l2 = false; + svm->soft_int_injected = false; /* * If we've made progress since setting HF_IRET_MASK, we've @@ -3749,9 +3851,13 @@ static void svm_complete_interrupts(struct kvm_vcpu *vcpu) vector = exitintinfo & SVM_EXITINTINFO_VEC_MASK; type = exitintinfo & SVM_EXITINTINFO_TYPE_MASK; + if (soft_int_injected) + svm_complete_soft_interrupt(vcpu, vector, type); + switch (type) { case SVM_EXITINTINFO_TYPE_NMI: vcpu->arch.nmi_injected = true; + svm->nmi_l1_to_l2 = nmi_l1_to_l2; break; case SVM_EXITINTINFO_TYPE_EXEPT: /* @@ -3760,18 +3866,6 @@ static void svm_complete_interrupts(struct kvm_vcpu *vcpu) if (vector == X86_TRAP_VC) break; - /* - * In case of software exceptions, do not reinject the vector, - * but re-execute the instruction instead. Rewind RIP first - * if we emulated INT3 before. - */ - if (kvm_exception_is_soft(vector)) { - if (vector == BP_VECTOR && int3_injected && - kvm_is_linear_rip(vcpu, svm->int3_rip)) - kvm_rip_write(vcpu, - kvm_rip_read(vcpu) - int3_injected); - break; - } if (exitintinfo & SVM_EXITINTINFO_VALID_ERR) { u32 err = svm->vmcb->control.exit_int_info_err; kvm_requeue_exception_e(vcpu, vector, err); @@ -3782,9 +3876,13 @@ static void svm_complete_interrupts(struct kvm_vcpu *vcpu) case SVM_EXITINTINFO_TYPE_INTR: kvm_queue_interrupt(vcpu, vector, false); break; + case SVM_EXITINTINFO_TYPE_SOFT: + kvm_queue_interrupt(vcpu, vector, true); + break; default: break; } + } static void svm_cancel_injection(struct kvm_vcpu *vcpu) @@ -3798,6 +3896,11 @@ static void svm_cancel_injection(struct kvm_vcpu *vcpu) svm_complete_interrupts(vcpu); } +static int svm_vcpu_pre_run(struct kvm_vcpu *vcpu) +{ + return 1; +} + static fastpath_t svm_exit_handlers_fastpath(struct kvm_vcpu *vcpu) { if (to_svm(vcpu)->vmcb->control.exit_code == SVM_EXIT_MSR && @@ -3807,37 +3910,24 @@ static fastpath_t svm_exit_handlers_fastpath(struct kvm_vcpu *vcpu) return EXIT_FASTPATH_NONE; } -static noinstr void svm_vcpu_enter_exit(struct kvm_vcpu *vcpu) +static noinstr void svm_vcpu_enter_exit(struct kvm_vcpu *vcpu, bool spec_ctrl_intercepted) { struct vcpu_svm *svm = to_svm(vcpu); - unsigned long vmcb_pa = svm->current_vmcb->pa; - - kvm_guest_enter_irqoff(); - - if (sev_es_guest(vcpu->kvm)) { - __svm_sev_es_vcpu_run(vmcb_pa); - } else { - struct svm_cpu_data *sd = per_cpu(svm_data, vcpu->cpu); - /* - * Use a single vmcb (vmcb01 because it's always valid) for - * context switching guest state via VMLOAD/VMSAVE, that way - * the state doesn't need to be copied between vmcb01 and - * vmcb02 when switching vmcbs for nested virtualization. - */ - vmload(svm->vmcb01.pa); - __svm_vcpu_run(vmcb_pa, (unsigned long *)&vcpu->arch.regs); - vmsave(svm->vmcb01.pa); + guest_state_enter_irqoff(); - vmload(__sme_page_pa(sd->save_area)); - } + if (sev_es_guest(vcpu->kvm)) + __svm_sev_es_vcpu_run(svm, spec_ctrl_intercepted); + else + __svm_vcpu_run(svm, spec_ctrl_intercepted); - kvm_guest_exit_irqoff(); + guest_state_exit_irqoff(); } static __no_kcsan fastpath_t svm_vcpu_run(struct kvm_vcpu *vcpu) { struct vcpu_svm *svm = to_svm(vcpu); + bool spec_ctrl_intercepted = msr_write_intercepted(vcpu, MSR_IA32_SPEC_CTRL); trace_kvm_entry(vcpu); @@ -3894,34 +3984,15 @@ static __no_kcsan fastpath_t svm_vcpu_run(struct kvm_vcpu *vcpu) * being speculatively taken. */ if (!static_cpu_has(X86_FEATURE_V_SPEC_CTRL)) - x86_spec_ctrl_set_guest(svm->spec_ctrl, svm->virt_spec_ctrl); + x86_spec_ctrl_set_guest(svm->virt_spec_ctrl); - svm_vcpu_enter_exit(vcpu); - - /* - * We do not use IBRS in the kernel. If this vCPU has used the - * SPEC_CTRL MSR it may have left it on; save the value and - * turn it off. This is much more efficient than blindly adding - * it to the atomic save/restore list. Especially as the former - * (Saving guest MSRs on vmexit) doesn't even exist in KVM. - * - * For non-nested case: - * If the L01 MSR bitmap does not intercept the MSR, then we need to - * save it. - * - * For nested case: - * If the L02 MSR bitmap does not intercept the MSR, then we need to - * save it. - */ - if (!static_cpu_has(X86_FEATURE_V_SPEC_CTRL) && - unlikely(!msr_write_intercepted(vcpu, MSR_IA32_SPEC_CTRL))) - svm->spec_ctrl = native_read_msr(MSR_IA32_SPEC_CTRL); + svm_vcpu_enter_exit(vcpu, spec_ctrl_intercepted); if (!sev_es_guest(vcpu->kvm)) reload_tss(vcpu); if (!static_cpu_has(X86_FEATURE_V_SPEC_CTRL)) - x86_spec_ctrl_restore_host(svm->spec_ctrl, svm->virt_spec_ctrl); + x86_spec_ctrl_restore_host(svm->virt_spec_ctrl); if (!sev_es_guest(vcpu->kvm)) { vcpu->arch.cr2 = svm->vmcb->save.cr2; @@ -3929,9 +4000,10 @@ static __no_kcsan fastpath_t svm_vcpu_run(struct kvm_vcpu *vcpu) vcpu->arch.regs[VCPU_REGS_RSP] = svm->vmcb->save.rsp; vcpu->arch.regs[VCPU_REGS_RIP] = svm->vmcb->save.rip; } + vcpu->arch.regs_dirty = 0; if (unlikely(svm->vmcb->control.exit_code == SVM_EXIT_NMI)) - kvm_before_interrupt(vcpu); + kvm_before_interrupt(vcpu, KVM_HANDLING_NMI); kvm_load_host_xsave_state(vcpu); stgi(); @@ -3963,8 +4035,7 @@ static __no_kcsan fastpath_t svm_vcpu_run(struct kvm_vcpu *vcpu) vcpu->arch.apf.host_apf_flags = kvm_read_and_reset_apf_flags(); - if (npt_enabled) - kvm_register_clear_available(vcpu, VCPU_EXREG_PDPTR); + vcpu->arch.regs_avail &= ~SVM_REGS_LAZY_LOAD_SET; /* * We need to handle MC intercepts here before the vcpu has a chance to @@ -3994,11 +4065,8 @@ static void svm_load_mmu_pgd(struct kvm_vcpu *vcpu, hpa_t root_hpa, hv_track_root_tdp(vcpu, root_hpa); - /* Loading L2's CR3 is handled by enter_svm_guest_mode. */ - if (!test_bit(VCPU_EXREG_CR3, (ulong *)&vcpu->arch.regs_avail)) - return; cr3 = vcpu->arch.cr3; - } else if (vcpu->arch.mmu->shadow_root_level >= PT64_ROOT_4LEVEL) { + } else if (root_level >= PT64_ROOT_4LEVEL) { cr3 = __sme_set(root_hpa) | kvm_get_active_pcid(vcpu); } else { /* PCID in the guest should be impossible with a 32-bit MMU. */ @@ -4037,11 +4105,6 @@ static int __init svm_check_processor_compat(void) return 0; } -static bool svm_cpu_has_accelerated_tpr(void) -{ - return false; -} - /* * The kvm parameter can be NULL (module initialization, or invocation before * VM creation). Be sure to check the kvm parameter before using it. @@ -4064,11 +4127,6 @@ static bool svm_has_emulated_msr(struct kvm *kvm, u32 index) return true; } -static u64 svm_get_mt_mask(struct kvm_vcpu *vcpu, gfn_t gfn, bool is_mmio) -{ - return 0; -} - static void svm_vcpu_after_set_cpuid(struct kvm_vcpu *vcpu) { struct vcpu_svm *svm = to_svm(vcpu); @@ -4083,33 +4141,27 @@ static void svm_vcpu_after_set_cpuid(struct kvm_vcpu *vcpu) guest_cpuid_has(vcpu, X86_FEATURE_NRIPS); svm->tsc_scaling_enabled = tsc_scaling && guest_cpuid_has(vcpu, X86_FEATURE_TSCRATEMSR); + svm->lbrv_enabled = lbrv && guest_cpuid_has(vcpu, X86_FEATURE_LBRV); + + svm->v_vmload_vmsave_enabled = vls && guest_cpuid_has(vcpu, X86_FEATURE_V_VMSAVE_VMLOAD); + + svm->pause_filter_enabled = kvm_cpu_cap_has(X86_FEATURE_PAUSEFILTER) && + guest_cpuid_has(vcpu, X86_FEATURE_PAUSEFILTER); + + svm->pause_threshold_enabled = kvm_cpu_cap_has(X86_FEATURE_PFTHRESHOLD) && + guest_cpuid_has(vcpu, X86_FEATURE_PFTHRESHOLD); + + svm->vgif_enabled = vgif && guest_cpuid_has(vcpu, X86_FEATURE_VGIF); svm_recalc_instruction_intercepts(vcpu, svm); /* For sev guests, the memory encryption bit is not reserved in CR3. */ if (sev_guest(vcpu->kvm)) { - best = kvm_find_cpuid_entry(vcpu, 0x8000001F, 0); + best = kvm_find_cpuid_entry(vcpu, 0x8000001F); if (best) vcpu->arch.reserved_gpa_bits &= ~(1UL << (best->ebx & 0x3f)); } - if (kvm_vcpu_apicv_active(vcpu)) { - /* - * AVIC does not work with an x2APIC mode guest. If the X2APIC feature - * is exposed to the guest, disable AVIC. - */ - if (guest_cpuid_has(vcpu, X86_FEATURE_X2APIC)) - kvm_request_apicv_update(vcpu->kvm, false, - APICV_INHIBIT_REASON_X2APIC); - - /* - * Currently, AVIC does not work with nested virtualization. - * So, we disable AVIC when cpuid for SVM is set in the L1 guest. - */ - if (nested && guest_cpuid_has(vcpu, X86_FEATURE_SVM)) - kvm_request_apicv_update(vcpu->kvm, false, - APICV_INHIBIT_REASON_NESTED); - } init_vmcb_after_set_cpuid(vcpu); } @@ -4215,7 +4267,7 @@ static int svm_check_intercept(struct kvm_vcpu *vcpu, info->intercept == x86_intercept_clts) break; - if (!(vmcb_is_intercept(&svm->nested.ctl, + if (!(vmcb12_is_intercept(&svm->nested.ctl, INTERCEPT_SELECTIVE_CR0))) break; @@ -4304,6 +4356,8 @@ out: static void svm_handle_exit_irqoff(struct kvm_vcpu *vcpu) { + if (to_svm(vcpu)->vmcb->control.exit_code == SVM_EXIT_INTR) + vcpu->arch.at_instruction_boundary = true; } static void svm_sched_in(struct kvm_vcpu *vcpu, int cpu) @@ -4335,11 +4389,14 @@ static int svm_smi_allowed(struct kvm_vcpu *vcpu, bool for_injection) if (svm->nested.nested_run_pending) return -EBUSY; + if (svm_smi_blocked(vcpu)) + return 0; + /* An SMI must not be injected into L2 if it's supposed to VM-Exit. */ if (for_injection && is_guest_mode(vcpu) && nested_exit_on_smi(svm)) return -EBUSY; - return !svm_smi_blocked(vcpu); + return 1; } static int svm_enter_smm(struct kvm_vcpu *vcpu, char *smstate) @@ -4360,7 +4417,7 @@ static int svm_enter_smm(struct kvm_vcpu *vcpu, char *smstate) svm->vmcb->save.rsp = vcpu->arch.regs[VCPU_REGS_RSP]; svm->vmcb->save.rip = vcpu->arch.regs[VCPU_REGS_RIP]; - ret = nested_svm_vmexit(svm); + ret = nested_svm_simple_vmexit(svm, SVM_EXIT_SW); if (ret) return ret; @@ -4373,7 +4430,7 @@ static int svm_enter_smm(struct kvm_vcpu *vcpu, char *smstate) * by 0x400 (matches the offset of 'struct vmcb_save_area' * within 'struct vmcb'). Note: HSAVE area may also be used by * L1 hypervisor to save additional host context (e.g. KVM does - * that, see svm_prepare_guest_switch()) which must be + * that, see svm_prepare_switch_to_guest()) which must be * preserved. */ if (kvm_vcpu_map(vcpu, gpa_to_gfn(svm->nested.hsave_msr), @@ -4433,10 +4490,18 @@ static int svm_leave_smm(struct kvm_vcpu *vcpu, const char *smstate) * Enter the nested guest now */ + vmcb_mark_all_dirty(svm->vmcb01.ptr); + vmcb12 = map.hva; - nested_load_control_from_vmcb12(svm, &vmcb12->control); + nested_copy_vmcb_control_to_cache(svm, &vmcb12->control); + nested_copy_vmcb_save_to_cache(svm, &vmcb12->save); ret = enter_svm_guest_mode(vcpu, vmcb12_gpa, vmcb12, false); + if (ret) + goto unmap_save; + + svm->nested.nested_run_pending = 1; + unmap_save: kvm_vcpu_unmap(vcpu, &map_save, true); unmap_map: @@ -4449,7 +4514,7 @@ static void svm_enable_smi_window(struct kvm_vcpu *vcpu) struct vcpu_svm *svm = to_svm(vcpu); if (!gif_set(svm)) { - if (vgif_enabled(svm)) + if (vgif) svm_set_intercept(svm, INTERCEPT_STGI); /* STGI will cause a vm exit */ } else { @@ -4457,79 +4522,140 @@ static void svm_enable_smi_window(struct kvm_vcpu *vcpu) } } -static bool svm_can_emulate_instruction(struct kvm_vcpu *vcpu, void *insn, int insn_len) +static bool svm_can_emulate_instruction(struct kvm_vcpu *vcpu, int emul_type, + void *insn, int insn_len) { bool smep, smap, is_user; unsigned long cr4; + u64 error_code; + + /* Emulation is always possible when KVM has access to all guest state. */ + if (!sev_guest(vcpu->kvm)) + return true; + + /* #UD and #GP should never be intercepted for SEV guests. */ + WARN_ON_ONCE(emul_type & (EMULTYPE_TRAP_UD | + EMULTYPE_TRAP_UD_FORCED | + EMULTYPE_VMWARE_GP)); /* - * When the guest is an SEV-ES guest, emulation is not possible. + * Emulation is impossible for SEV-ES guests as KVM doesn't have access + * to guest register state. */ if (sev_es_guest(vcpu->kvm)) return false; /* + * Emulation is possible if the instruction is already decoded, e.g. + * when completing I/O after returning from userspace. + */ + if (emul_type & EMULTYPE_NO_DECODE) + return true; + + /* + * Emulation is possible for SEV guests if and only if a prefilled + * buffer containing the bytes of the intercepted instruction is + * available. SEV guest memory is encrypted with a guest specific key + * and cannot be decrypted by KVM, i.e. KVM would read cyphertext and + * decode garbage. + * + * Inject #UD if KVM reached this point without an instruction buffer. + * In practice, this path should never be hit by a well-behaved guest, + * e.g. KVM doesn't intercept #UD or #GP for SEV guests, but this path + * is still theoretically reachable, e.g. via unaccelerated fault-like + * AVIC access, and needs to be handled by KVM to avoid putting the + * guest into an infinite loop. Injecting #UD is somewhat arbitrary, + * but its the least awful option given lack of insight into the guest. + */ + if (unlikely(!insn)) { + kvm_queue_exception(vcpu, UD_VECTOR); + return false; + } + + /* + * Emulate for SEV guests if the insn buffer is not empty. The buffer + * will be empty if the DecodeAssist microcode cannot fetch bytes for + * the faulting instruction because the code fetch itself faulted, e.g. + * the guest attempted to fetch from emulated MMIO or a guest page + * table used to translate CS:RIP resides in emulated MMIO. + */ + if (likely(insn_len)) + return true; + + /* * Detect and workaround Errata 1096 Fam_17h_00_0Fh. * * Errata: - * When CPU raise #NPF on guest data access and vCPU CR4.SMAP=1, it is - * possible that CPU microcode implementing DecodeAssist will fail - * to read bytes of instruction which caused #NPF. In this case, - * GuestIntrBytes field of the VMCB on a VMEXIT will incorrectly - * return 0 instead of the correct guest instruction bytes. - * - * This happens because CPU microcode reading instruction bytes - * uses a special opcode which attempts to read data using CPL=0 - * privileges. The microcode reads CS:RIP and if it hits a SMAP - * fault, it gives up and returns no instruction bytes. + * When CPU raises #NPF on guest data access and vCPU CR4.SMAP=1, it is + * possible that CPU microcode implementing DecodeAssist will fail to + * read guest memory at CS:RIP and vmcb.GuestIntrBytes will incorrectly + * be '0'. This happens because microcode reads CS:RIP using a _data_ + * loap uop with CPL=0 privileges. If the load hits a SMAP #PF, ucode + * gives up and does not fill the instruction bytes buffer. * - * Detection: - * We reach here in case CPU supports DecodeAssist, raised #NPF and - * returned 0 in GuestIntrBytes field of the VMCB. - * First, errata can only be triggered in case vCPU CR4.SMAP=1. - * Second, if vCPU CR4.SMEP=1, errata could only be triggered - * in case vCPU CPL==3 (Because otherwise guest would have triggered - * a SMEP fault instead of #NPF). - * Otherwise, vCPU CR4.SMEP=0, errata could be triggered by any vCPU CPL. - * As most guests enable SMAP if they have also enabled SMEP, use above - * logic in order to attempt minimize false-positive of detecting errata - * while still preserving all cases semantic correctness. + * As above, KVM reaches this point iff the VM is an SEV guest, the CPU + * supports DecodeAssist, a #NPF was raised, KVM's page fault handler + * triggered emulation (e.g. for MMIO), and the CPU returned 0 in the + * GuestIntrBytes field of the VMCB. * - * Workaround: - * To determine what instruction the guest was executing, the hypervisor - * will have to decode the instruction at the instruction pointer. + * This does _not_ mean that the erratum has been encountered, as the + * DecodeAssist will also fail if the load for CS:RIP hits a legitimate + * #PF, e.g. if the guest attempt to execute from emulated MMIO and + * encountered a reserved/not-present #PF. * - * In non SEV guest, hypervisor will be able to read the guest - * memory to decode the instruction pointer when insn_len is zero - * so we return true to indicate that decoding is possible. + * To hit the erratum, the following conditions must be true: + * 1. CR4.SMAP=1 (obviously). + * 2. CR4.SMEP=0 || CPL=3. If SMEP=1 and CPL<3, the erratum cannot + * have been hit as the guest would have encountered a SMEP + * violation #PF, not a #NPF. + * 3. The #NPF is not due to a code fetch, in which case failure to + * retrieve the instruction bytes is legitimate (see abvoe). * - * But in the SEV guest, the guest memory is encrypted with the - * guest specific key and hypervisor will not be able to decode the - * instruction pointer so we will not able to workaround it. Lets - * print the error and request to kill the guest. - */ - if (likely(!insn || insn_len)) - return true; - - /* - * If RIP is invalid, go ahead with emulation which will cause an - * internal error exit. + * In addition, don't apply the erratum workaround if the #NPF occurred + * while translating guest page tables (see below). */ - if (!kvm_vcpu_gfn_to_memslot(vcpu, kvm_rip_read(vcpu) >> PAGE_SHIFT)) - return true; + error_code = to_svm(vcpu)->vmcb->control.exit_info_1; + if (error_code & (PFERR_GUEST_PAGE_MASK | PFERR_FETCH_MASK)) + goto resume_guest; cr4 = kvm_read_cr4(vcpu); smep = cr4 & X86_CR4_SMEP; smap = cr4 & X86_CR4_SMAP; is_user = svm_get_cpl(vcpu) == 3; if (smap && (!smep || is_user)) { - if (!sev_guest(vcpu->kvm)) - return true; - pr_err_ratelimited("KVM: SEV Guest triggered AMD Erratum 1096\n"); - kvm_make_request(KVM_REQ_TRIPLE_FAULT, vcpu); + + /* + * If the fault occurred in userspace, arbitrarily inject #GP + * to avoid killing the guest and to hopefully avoid confusing + * the guest kernel too much, e.g. injecting #PF would not be + * coherent with respect to the guest's page tables. Request + * triple fault if the fault occurred in the kernel as there's + * no fault that KVM can inject without confusing the guest. + * In practice, the triple fault is moot as no sane SEV kernel + * will execute from user memory while also running with SMAP=1. + */ + if (is_user) + kvm_inject_gp(vcpu, 0); + else + kvm_make_request(KVM_REQ_TRIPLE_FAULT, vcpu); } +resume_guest: + /* + * If the erratum was not hit, simply resume the guest and let it fault + * again. While awful, e.g. the vCPU may get stuck in an infinite loop + * if the fault is at CPL=0, it's the lesser of all evils. Exiting to + * userspace will kill the guest, and letting the emulator read garbage + * will yield random behavior and potentially corrupt the guest. + * + * Simply resuming the guest is technically not a violation of the SEV + * architecture. AMD's APM states that all code fetches and page table + * accesses for SEV guest are encrypted, regardless of the C-Bit. The + * APM also states that encrypted accesses to MMIO are "ignored", but + * doesn't explicitly define "ignored", i.e. doing nothing and letting + * the guest spin is technically "ignoring" the access. + */ return false; } @@ -4537,15 +4663,7 @@ static bool svm_apic_init_signal_blocked(struct kvm_vcpu *vcpu) { struct vcpu_svm *svm = to_svm(vcpu); - /* - * TODO: Last condition latch INIT signals on vCPU when - * vCPU is in guest-mode and vmcb12 defines intercept on INIT. - * To properly emulate the INIT intercept, - * svm_check_nested_events() should call nested_svm_vmexit() - * if an INIT signal is pending. - */ - return !gif_set(svm) || - (vmcb_is_intercept(&svm->vmcb->control, INTERCEPT_INIT)); + return !gif_set(svm); } static void svm_vcpu_deliver_sipi_vector(struct kvm_vcpu *vcpu, u8 vector) @@ -4579,25 +4697,24 @@ static int svm_vm_init(struct kvm *kvm) static struct kvm_x86_ops svm_x86_ops __initdata = { .name = "kvm_amd", - .hardware_unsetup = svm_hardware_teardown, + .hardware_unsetup = svm_hardware_unsetup, .hardware_enable = svm_hardware_enable, .hardware_disable = svm_hardware_disable, - .cpu_has_accelerated_tpr = svm_cpu_has_accelerated_tpr, .has_emulated_msr = svm_has_emulated_msr, - .vcpu_create = svm_create_vcpu, - .vcpu_free = svm_free_vcpu, + .vcpu_create = svm_vcpu_create, + .vcpu_free = svm_vcpu_free, .vcpu_reset = svm_vcpu_reset, .vm_size = sizeof(struct kvm_svm), .vm_init = svm_vm_init, .vm_destroy = svm_vm_destroy, - .prepare_guest_switch = svm_prepare_guest_switch, + .prepare_switch_to_guest = svm_prepare_switch_to_guest, .vcpu_load = svm_vcpu_load, .vcpu_put = svm_vcpu_put, - .vcpu_blocking = svm_vcpu_blocking, - .vcpu_unblocking = svm_vcpu_unblocking, + .vcpu_blocking = avic_vcpu_blocking, + .vcpu_unblocking = avic_vcpu_unblocking, .update_exception_bitmap = svm_update_exception_bitmap, .get_msr_feature = svm_get_msr_feature, @@ -4607,8 +4724,9 @@ static struct kvm_x86_ops svm_x86_ops __initdata = { .get_segment = svm_get_segment, .set_segment = svm_set_segment, .get_cpl = svm_get_cpl, - .get_cs_db_l_bits = kvm_get_cs_db_l_bits, + .get_cs_db_l_bits = svm_get_cs_db_l_bits, .set_cr0 = svm_set_cr0, + .post_set_cr3 = sev_post_set_cr3, .is_valid_cr4 = svm_is_valid_cr4, .set_cr4 = svm_set_cr4, .set_efer = svm_set_efer, @@ -4621,22 +4739,24 @@ static struct kvm_x86_ops svm_x86_ops __initdata = { .cache_reg = svm_cache_reg, .get_rflags = svm_get_rflags, .set_rflags = svm_set_rflags, + .get_if_flag = svm_get_if_flag, - .tlb_flush_all = svm_flush_tlb, - .tlb_flush_current = svm_flush_tlb, - .tlb_flush_gva = svm_flush_tlb_gva, - .tlb_flush_guest = svm_flush_tlb, + .flush_tlb_all = svm_flush_tlb_current, + .flush_tlb_current = svm_flush_tlb_current, + .flush_tlb_gva = svm_flush_tlb_gva, + .flush_tlb_guest = svm_flush_tlb_current, - .run = svm_vcpu_run, - .handle_exit = handle_exit, - .skip_emulated_instruction = skip_emulated_instruction, + .vcpu_pre_run = svm_vcpu_pre_run, + .vcpu_run = svm_vcpu_run, + .handle_exit = svm_handle_exit, + .skip_emulated_instruction = svm_skip_emulated_instruction, .update_emulated_instruction = NULL, .set_interrupt_shadow = svm_set_interrupt_shadow, .get_interrupt_shadow = svm_get_interrupt_shadow, .patch_hypercall = svm_patch_hypercall, - .set_irq = svm_set_irq, - .set_nmi = svm_inject_nmi, - .queue_exception = svm_queue_exception, + .inject_irq = svm_inject_irq, + .inject_nmi = svm_inject_nmi, + .inject_exception = svm_inject_exception, .cancel_injection = svm_cancel_injection, .interrupt_allowed = svm_interrupt_allowed, .nmi_allowed = svm_nmi_allowed, @@ -4645,17 +4765,10 @@ static struct kvm_x86_ops svm_x86_ops __initdata = { .enable_nmi_window = svm_enable_nmi_window, .enable_irq_window = svm_enable_irq_window, .update_cr8_intercept = svm_update_cr8_intercept, - .set_virtual_apic_mode = svm_set_virtual_apic_mode, - .refresh_apicv_exec_ctrl = svm_refresh_apicv_exec_ctrl, - .check_apicv_inhibit_reasons = svm_check_apicv_inhibit_reasons, - .load_eoi_exitmap = svm_load_eoi_exitmap, - .hwapic_irr_update = svm_hwapic_irr_update, - .hwapic_isr_update = svm_hwapic_isr_update, - .apicv_post_state_restore = avic_post_state_restore, - - .set_tss_addr = svm_set_tss_addr, - .set_identity_map_addr = svm_set_identity_map_addr, - .get_mt_mask = svm_get_mt_mask, + .set_virtual_apic_mode = avic_set_virtual_apic_mode, + .refresh_apicv_exec_ctrl = avic_refresh_apicv_exec_ctrl, + .check_apicv_inhibit_reasons = avic_check_apicv_inhibit_reasons, + .apicv_post_state_restore = avic_apicv_post_state_restore, .get_exit_info = svm_get_exit_info, @@ -4677,12 +4790,10 @@ static struct kvm_x86_ops svm_x86_ops __initdata = { .sched_in = svm_sched_in, - .pmu_ops = &amd_pmu_ops, .nested_ops = &svm_nested_ops, - .deliver_posted_interrupt = svm_deliver_avic_intr, - .dy_apicv_has_pending_interrupt = svm_dy_apicv_has_pending_interrupt, - .update_pi_irte = svm_update_pi_irte, + .deliver_interrupt = svm_deliver_interrupt, + .pi_update_irte = avic_pi_update_irte, .setup_mce = svm_setup_mce, .smi_allowed = svm_smi_allowed, @@ -4690,12 +4801,13 @@ static struct kvm_x86_ops svm_x86_ops __initdata = { .leave_smm = svm_leave_smm, .enable_smi_window = svm_enable_smi_window, - .mem_enc_op = svm_mem_enc_op, - .mem_enc_reg_region = svm_register_enc_region, - .mem_enc_unreg_region = svm_unregister_enc_region, + .mem_enc_ioctl = sev_mem_enc_ioctl, + .mem_enc_register_region = sev_mem_enc_register_region, + .mem_enc_unregister_region = sev_mem_enc_unregister_region, + .guest_memory_reclaimed = sev_guest_memory_reclaimed, - .vm_copy_enc_context_from = svm_vm_copy_asid_from, - .vm_move_enc_context_from = svm_vm_migrate_from, + .vm_copy_enc_context_from = sev_vm_copy_enc_context_from, + .vm_move_enc_context_from = sev_vm_move_enc_context_from, .can_emulate_instruction = svm_can_emulate_instruction, @@ -4705,8 +4817,265 @@ static struct kvm_x86_ops svm_x86_ops __initdata = { .complete_emulated_msr = svm_complete_emulated_msr, .vcpu_deliver_sipi_vector = svm_vcpu_deliver_sipi_vector, + .vcpu_get_apicv_inhibit_reasons = avic_vcpu_get_apicv_inhibit_reasons, }; +/* + * The default MMIO mask is a single bit (excluding the present bit), + * which could conflict with the memory encryption bit. Check for + * memory encryption support and override the default MMIO mask if + * memory encryption is enabled. + */ +static __init void svm_adjust_mmio_mask(void) +{ + unsigned int enc_bit, mask_bit; + u64 msr, mask; + + /* If there is no memory encryption support, use existing mask */ + if (cpuid_eax(0x80000000) < 0x8000001f) + return; + + /* If memory encryption is not enabled, use existing mask */ + rdmsrl(MSR_AMD64_SYSCFG, msr); + if (!(msr & MSR_AMD64_SYSCFG_MEM_ENCRYPT)) + return; + + enc_bit = cpuid_ebx(0x8000001f) & 0x3f; + mask_bit = boot_cpu_data.x86_phys_bits; + + /* Increment the mask bit if it is the same as the encryption bit */ + if (enc_bit == mask_bit) + mask_bit++; + + /* + * If the mask bit location is below 52, then some bits above the + * physical addressing limit will always be reserved, so use the + * rsvd_bits() function to generate the mask. This mask, along with + * the present bit, will be used to generate a page fault with + * PFER.RSV = 1. + * + * If the mask bit location is 52 (or above), then clear the mask. + */ + mask = (mask_bit < 52) ? rsvd_bits(mask_bit, 51) | PT_PRESENT_MASK : 0; + + kvm_mmu_set_mmio_spte_mask(mask, mask, PT_WRITABLE_MASK | PT_USER_MASK); +} + +static __init void svm_set_cpu_caps(void) +{ + kvm_set_cpu_caps(); + + kvm_caps.supported_xss = 0; + + /* CPUID 0x80000001 and 0x8000000A (SVM features) */ + if (nested) { + kvm_cpu_cap_set(X86_FEATURE_SVM); + kvm_cpu_cap_set(X86_FEATURE_VMCBCLEAN); + + if (nrips) + kvm_cpu_cap_set(X86_FEATURE_NRIPS); + + if (npt_enabled) + kvm_cpu_cap_set(X86_FEATURE_NPT); + + if (tsc_scaling) + kvm_cpu_cap_set(X86_FEATURE_TSCRATEMSR); + + if (vls) + kvm_cpu_cap_set(X86_FEATURE_V_VMSAVE_VMLOAD); + if (lbrv) + kvm_cpu_cap_set(X86_FEATURE_LBRV); + + if (boot_cpu_has(X86_FEATURE_PAUSEFILTER)) + kvm_cpu_cap_set(X86_FEATURE_PAUSEFILTER); + + if (boot_cpu_has(X86_FEATURE_PFTHRESHOLD)) + kvm_cpu_cap_set(X86_FEATURE_PFTHRESHOLD); + + if (vgif) + kvm_cpu_cap_set(X86_FEATURE_VGIF); + + /* Nested VM can receive #VMEXIT instead of triggering #GP */ + kvm_cpu_cap_set(X86_FEATURE_SVME_ADDR_CHK); + } + + /* CPUID 0x80000008 */ + if (boot_cpu_has(X86_FEATURE_LS_CFG_SSBD) || + boot_cpu_has(X86_FEATURE_AMD_SSBD)) + kvm_cpu_cap_set(X86_FEATURE_VIRT_SSBD); + + /* AMD PMU PERFCTR_CORE CPUID */ + if (enable_pmu && boot_cpu_has(X86_FEATURE_PERFCTR_CORE)) + kvm_cpu_cap_set(X86_FEATURE_PERFCTR_CORE); + + /* CPUID 0x8000001F (SME/SEV features) */ + sev_set_cpu_caps(); +} + +static __init int svm_hardware_setup(void) +{ + int cpu; + struct page *iopm_pages; + void *iopm_va; + int r; + unsigned int order = get_order(IOPM_SIZE); + + /* + * NX is required for shadow paging and for NPT if the NX huge pages + * mitigation is enabled. + */ + if (!boot_cpu_has(X86_FEATURE_NX)) { + pr_err_ratelimited("NX (Execute Disable) not supported\n"); + return -EOPNOTSUPP; + } + kvm_enable_efer_bits(EFER_NX); + + iopm_pages = alloc_pages(GFP_KERNEL, order); + + if (!iopm_pages) + return -ENOMEM; + + iopm_va = page_address(iopm_pages); + memset(iopm_va, 0xff, PAGE_SIZE * (1 << order)); + iopm_base = page_to_pfn(iopm_pages) << PAGE_SHIFT; + + init_msrpm_offsets(); + + kvm_caps.supported_xcr0 &= ~(XFEATURE_MASK_BNDREGS | + XFEATURE_MASK_BNDCSR); + + if (boot_cpu_has(X86_FEATURE_FXSR_OPT)) + kvm_enable_efer_bits(EFER_FFXSR); + + if (tsc_scaling) { + if (!boot_cpu_has(X86_FEATURE_TSCRATEMSR)) { + tsc_scaling = false; + } else { + pr_info("TSC scaling supported\n"); + kvm_caps.has_tsc_control = true; + } + } + kvm_caps.max_tsc_scaling_ratio = SVM_TSC_RATIO_MAX; + kvm_caps.tsc_scaling_ratio_frac_bits = 32; + + tsc_aux_uret_slot = kvm_add_user_return_msr(MSR_TSC_AUX); + + /* Check for pause filtering support */ + if (!boot_cpu_has(X86_FEATURE_PAUSEFILTER)) { + pause_filter_count = 0; + pause_filter_thresh = 0; + } else if (!boot_cpu_has(X86_FEATURE_PFTHRESHOLD)) { + pause_filter_thresh = 0; + } + + if (nested) { + printk(KERN_INFO "kvm: Nested Virtualization enabled\n"); + kvm_enable_efer_bits(EFER_SVME | EFER_LMSLE); + } + + /* + * KVM's MMU doesn't support using 2-level paging for itself, and thus + * NPT isn't supported if the host is using 2-level paging since host + * CR4 is unchanged on VMRUN. + */ + if (!IS_ENABLED(CONFIG_X86_64) && !IS_ENABLED(CONFIG_X86_PAE)) + npt_enabled = false; + + if (!boot_cpu_has(X86_FEATURE_NPT)) + npt_enabled = false; + + /* Force VM NPT level equal to the host's paging level */ + kvm_configure_mmu(npt_enabled, get_npt_level(), + get_npt_level(), PG_LEVEL_1G); + pr_info("kvm: Nested Paging %sabled\n", npt_enabled ? "en" : "dis"); + + /* Setup shadow_me_value and shadow_me_mask */ + kvm_mmu_set_me_spte_mask(sme_me_mask, sme_me_mask); + + svm_adjust_mmio_mask(); + + /* + * Note, SEV setup consumes npt_enabled and enable_mmio_caching (which + * may be modified by svm_adjust_mmio_mask()). + */ + sev_hardware_setup(); + + svm_hv_hardware_setup(); + + for_each_possible_cpu(cpu) { + r = svm_cpu_init(cpu); + if (r) + goto err; + } + + if (nrips) { + if (!boot_cpu_has(X86_FEATURE_NRIPS)) + nrips = false; + } + + enable_apicv = avic = avic && avic_hardware_setup(&svm_x86_ops); + + if (!enable_apicv) { + svm_x86_ops.vcpu_blocking = NULL; + svm_x86_ops.vcpu_unblocking = NULL; + svm_x86_ops.vcpu_get_apicv_inhibit_reasons = NULL; + } + + if (vls) { + if (!npt_enabled || + !boot_cpu_has(X86_FEATURE_V_VMSAVE_VMLOAD) || + !IS_ENABLED(CONFIG_X86_64)) { + vls = false; + } else { + pr_info("Virtual VMLOAD VMSAVE supported\n"); + } + } + + if (boot_cpu_has(X86_FEATURE_SVME_ADDR_CHK)) + svm_gp_erratum_intercept = false; + + if (vgif) { + if (!boot_cpu_has(X86_FEATURE_VGIF)) + vgif = false; + else + pr_info("Virtual GIF supported\n"); + } + + if (lbrv) { + if (!boot_cpu_has(X86_FEATURE_LBRV)) + lbrv = false; + else + pr_info("LBR virtualization supported\n"); + } + + if (!enable_pmu) + pr_info("PMU virtualization is disabled\n"); + + svm_set_cpu_caps(); + + /* + * It seems that on AMD processors PTE's accessed bit is + * being set by the CPU hardware before the NPF vmexit. + * This is not expected behaviour and our tests fail because + * of it. + * A workaround here is to disable support for + * GUEST_MAXPHYADDR < HOST_MAXPHYADDR if NPT is enabled. + * In this case userspace can know if there is support using + * KVM_CAP_SMALLER_MAXPHYADDR extension and decide how to handle + * it + * If future AMD CPU models change the behaviour described above, + * this variable can be changed accordingly + */ + allow_smaller_maxphyaddr = !npt_enabled; + + return 0; + +err: + svm_hardware_unsetup(); + return r; +} + + static struct kvm_x86_init_ops svm_init_ops __initdata = { .cpu_has_kvm_support = has_svm, .disabled_by_bios = is_disabled, @@ -4714,6 +5083,7 @@ static struct kvm_x86_init_ops svm_init_ops __initdata = { .check_processor_compatibility = svm_check_processor_compat, .runtime_ops = &svm_x86_ops, + .pmu_ops = &amd_pmu_ops, }; static int __init svm_init(void) diff --git a/arch/x86/kvm/svm/svm.h b/arch/x86/kvm/svm/svm.h index 1c7306c370fa..199a2ecef1ce 100644 --- a/arch/x86/kvm/svm/svm.h +++ b/arch/x86/kvm/svm/svm.h @@ -22,17 +22,28 @@ #include <asm/svm.h> #include <asm/sev-common.h> +#include "kvm_cache_regs.h" + #define __sme_page_pa(x) __sme_set(page_to_pfn(x) << PAGE_SHIFT) #define IOPM_SIZE PAGE_SIZE * 3 #define MSRPM_SIZE PAGE_SIZE * 2 -#define MAX_DIRECT_ACCESS_MSRS 20 -#define MSRPM_OFFSETS 16 +#define MAX_DIRECT_ACCESS_MSRS 46 +#define MSRPM_OFFSETS 32 extern u32 msrpm_offsets[MSRPM_OFFSETS] __read_mostly; extern bool npt_enabled; +extern int vgif; extern bool intercept_smi; +enum avic_modes { + AVIC_MODE_NONE = 0, + AVIC_MODE_X1, + AVIC_MODE_X2, +}; + +extern enum avic_modes avic_mode; + /* * Clean bits in VMCB. * VMCB_ALL_CLEAN_MASK might also need to @@ -79,7 +90,8 @@ struct kvm_sev_info { struct list_head regions_list; /* List of registered regions */ u64 ap_jump_table; /* SEV-ES AP Jump Table address */ struct kvm *enc_context_owner; /* Owner of copied encryption context */ - unsigned long num_mirrored_vms; /* Number of VMs sharing this ASID */ + struct list_head mirror_vms; /* List of VMs mirroring */ + struct list_head mirror_entry; /* Use as a list entry of mirrors */ struct misc_cg *misc_cg; /* For misc cgroup accounting */ atomic_t migration_in_progress; }; @@ -105,6 +117,43 @@ struct kvm_vmcb_info { uint64_t asid_generation; }; +struct vmcb_save_area_cached { + u64 efer; + u64 cr4; + u64 cr3; + u64 cr0; + u64 dr7; + u64 dr6; +}; + +struct vmcb_ctrl_area_cached { + u32 intercepts[MAX_INTERCEPT]; + u16 pause_filter_thresh; + u16 pause_filter_count; + u64 iopm_base_pa; + u64 msrpm_base_pa; + u64 tsc_offset; + u32 asid; + u8 tlb_ctl; + u32 int_ctl; + u32 int_vector; + u32 int_state; + u32 exit_code; + u32 exit_code_hi; + u64 exit_info_1; + u64 exit_info_2; + u32 exit_int_info; + u32 exit_int_info_err; + u64 nested_ctl; + u32 event_inj; + u32 event_inj_err; + u64 next_rip; + u64 nested_cr3; + u64 virt_ext; + u32 clean; + u8 reserved_sw[32]; +}; + struct svm_nested_state { struct kvm_vmcb_info vmcb02; u64 hsave_msr; @@ -120,14 +169,29 @@ struct svm_nested_state { bool nested_run_pending; /* cache for control fields of the guest */ - struct vmcb_control_area ctl; + struct vmcb_ctrl_area_cached ctl; + + /* + * Note: this struct is not kept up-to-date while L2 runs; it is only + * valid within nested_svm_vmrun. + */ + struct vmcb_save_area_cached save; bool initialized; + + /* + * Indicates whether MSR bitmap for L2 needs to be rebuilt due to + * changes in MSR bitmap for L1 or switching to a different L2. Note, + * this flag can only be used reliably in conjunction with a paravirt L1 + * which informs L0 whether any changes to MSR bitmap for L2 were done + * on its side. + */ + bool force_msr_bitmap_recalc; }; struct vcpu_sev_es_state { /* SEV-ES support */ - struct vmcb_save_area *vmsa; + struct sev_es_save_area *vmsa; struct ghcb *ghcb; struct kvm_host_map ghcb_map; bool received_first_sipi; @@ -145,7 +209,6 @@ struct vcpu_svm { struct vmcb *vmcb; struct kvm_vmcb_info vmcb01; struct kvm_vmcb_info *current_vmcb; - struct svm_cpu_data *svm_data; u32 asid; u32 sysenter_esp_hi; u32 sysenter_eip_hi; @@ -173,19 +236,26 @@ struct vcpu_svm { bool nmi_singlestep; u64 nmi_singlestep_guest_rflags; + bool nmi_l1_to_l2; - unsigned int3_injected; - unsigned long int3_rip; + unsigned long soft_int_csbase; + unsigned long soft_int_old_rip; + unsigned long soft_int_next_rip; + bool soft_int_injected; - /* cached guest cpuid flags for faster access */ + /* optional nested SVM features that are enabled for this guest */ bool nrips_enabled : 1; bool tsc_scaling_enabled : 1; + bool v_vmload_vmsave_enabled : 1; + bool lbrv_enabled : 1; + bool pause_filter_enabled : 1; + bool pause_threshold_enabled : 1; + bool vgif_enabled : 1; u32 ldr_reg; u32 dfr_reg; struct page *avic_backing_page; u64 *avic_physical_id_cache; - bool avic_is_running; /* * Per-vcpu list of struct amd_svm_iommu_ir: @@ -205,11 +275,11 @@ struct vcpu_svm { struct vcpu_sev_es_state sev_es; bool guest_state_loaded; + + bool x2avic_msrs_intercepted; }; struct svm_cpu_data { - int cpu; - u64 asid_generation; u32 max_asid; u32 next_asid; @@ -217,13 +287,15 @@ struct svm_cpu_data { struct kvm_ldttss_desc *tss_desc; struct page *save_area; + unsigned long save_area_pa; + struct vmcb *current_vmcb; /* index = sev_asid, value = vmcb pointer */ struct vmcb **sev_vmcbs; }; -DECLARE_PER_CPU(struct svm_cpu_data *, svm_data); +DECLARE_PER_CPU(struct svm_cpu_data, svm_data); void recalc_intercepts(struct vcpu_svm *svm); @@ -265,11 +337,6 @@ static inline void vmcb_mark_all_clean(struct vmcb *vmcb) & ~VMCB_ALWAYS_DIRTY_MASK; } -static inline bool vmcb_is_clean(struct vmcb *vmcb, int bit) -{ - return (vmcb->control.clean & (1 << bit)); -} - static inline void vmcb_mark_dirty(struct vmcb *vmcb, int bit) { vmcb->control.clean &= ~(1 << bit); @@ -285,6 +352,16 @@ static __always_inline struct vcpu_svm *to_svm(struct kvm_vcpu *vcpu) return container_of(vcpu, struct vcpu_svm, vcpu); } +/* + * Only the PDPTRs are loaded on demand into the shadow MMU. All other + * fields are synchronized on VM-Exit, because accessing the VMCB is cheap. + * + * CR3 might be out of date in the VMCB but it is not marked dirty; instead, + * KVM_REQ_LOAD_MMU_PGD is always requested when the cached vcpu->arch.cr3 + * is changed. svm_load_mmu_pgd() then syncs the new CR3 value into the VMCB. + */ +#define SVM_REGS_LAZY_LOAD_SET (1 << VCPU_EXREG_PDPTR) + static inline void vmcb_set_intercept(struct vmcb_control_area *control, u32 bit) { WARN_ON_ONCE(bit >= 32 * MAX_INTERCEPT); @@ -303,6 +380,12 @@ static inline bool vmcb_is_intercept(struct vmcb_control_area *control, u32 bit) return test_bit(bit, (unsigned long *)&control->intercepts); } +static inline bool vmcb12_is_intercept(struct vmcb_ctrl_area_cached *control, u32 bit) +{ + WARN_ON_ONCE(bit >= 32 * MAX_INTERCEPT); + return test_bit(bit, (unsigned long *)&control->intercepts); +} + static inline void set_dr_intercepts(struct vcpu_svm *svm) { struct vmcb *vmcb = svm->vmcb01.ptr; @@ -388,49 +471,83 @@ static inline bool svm_is_intercept(struct vcpu_svm *svm, int bit) return vmcb_is_intercept(&svm->vmcb->control, bit); } -static inline bool vgif_enabled(struct vcpu_svm *svm) +static inline bool nested_vgif_enabled(struct vcpu_svm *svm) +{ + return svm->vgif_enabled && (svm->nested.ctl.int_ctl & V_GIF_ENABLE_MASK); +} + +static inline struct vmcb *get_vgif_vmcb(struct vcpu_svm *svm) { - return !!(svm->vmcb->control.int_ctl & V_GIF_ENABLE_MASK); + if (!vgif) + return NULL; + + if (is_guest_mode(&svm->vcpu) && !nested_vgif_enabled(svm)) + return svm->nested.vmcb02.ptr; + else + return svm->vmcb01.ptr; } static inline void enable_gif(struct vcpu_svm *svm) { - if (vgif_enabled(svm)) - svm->vmcb->control.int_ctl |= V_GIF_MASK; + struct vmcb *vmcb = get_vgif_vmcb(svm); + + if (vmcb) + vmcb->control.int_ctl |= V_GIF_MASK; else svm->vcpu.arch.hflags |= HF_GIF_MASK; } static inline void disable_gif(struct vcpu_svm *svm) { - if (vgif_enabled(svm)) - svm->vmcb->control.int_ctl &= ~V_GIF_MASK; + struct vmcb *vmcb = get_vgif_vmcb(svm); + + if (vmcb) + vmcb->control.int_ctl &= ~V_GIF_MASK; else svm->vcpu.arch.hflags &= ~HF_GIF_MASK; } static inline bool gif_set(struct vcpu_svm *svm) { - if (vgif_enabled(svm)) - return !!(svm->vmcb->control.int_ctl & V_GIF_MASK); + struct vmcb *vmcb = get_vgif_vmcb(svm); + + if (vmcb) + return !!(vmcb->control.int_ctl & V_GIF_MASK); else return !!(svm->vcpu.arch.hflags & HF_GIF_MASK); } +static inline bool nested_npt_enabled(struct vcpu_svm *svm) +{ + return svm->nested.ctl.nested_ctl & SVM_NESTED_CTL_NP_ENABLE; +} + +static inline bool is_x2apic_msrpm_offset(u32 offset) +{ + /* 4 msrs per u8, and 4 u8 in u32 */ + u32 msr = offset * 16; + + return (msr >= APIC_BASE_MSR) && + (msr < (APIC_BASE_MSR + 0x100)); +} + /* svm.c */ #define MSR_INVALID 0xffffffffU +#define DEBUGCTL_RESERVED_BITS (~(0x3fULL)) + extern bool dump_invalid_vmcb; u32 svm_msrpm_offset(u32 msr); u32 *svm_vcpu_alloc_msrpm(void); void svm_vcpu_init_msrpm(struct kvm_vcpu *vcpu, u32 *msrpm); void svm_vcpu_free_msrpm(u32 *msrpm); +void svm_copy_lbrs(struct vmcb *to_vmcb, struct vmcb *from_vmcb); +void svm_update_lbrv(struct kvm_vcpu *vcpu); int svm_set_efer(struct kvm_vcpu *vcpu, u64 efer); void svm_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0); void svm_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4); -void svm_flush_tlb(struct kvm_vcpu *vcpu); void disable_nmi_singlestep(struct vcpu_svm *svm); bool svm_smi_blocked(struct kvm_vcpu *vcpu); bool svm_nmi_blocked(struct kvm_vcpu *vcpu); @@ -439,6 +556,9 @@ void svm_set_gif(struct vcpu_svm *svm, bool value); int svm_invoke_exit_handler(struct kvm_vcpu *vcpu, u64 exit_code); void set_msr_interception(struct kvm_vcpu *vcpu, u32 *msrpm, u32 msr, int read, int write); +void svm_set_x2apic_msr_interception(struct vcpu_svm *svm, bool disable); +void svm_complete_interrupt_delivery(struct kvm_vcpu *vcpu, int delivery_mode, + int trig_mode, int vec); /* nested.c */ @@ -455,22 +575,22 @@ static inline bool nested_svm_virtualize_tpr(struct kvm_vcpu *vcpu) static inline bool nested_exit_on_smi(struct vcpu_svm *svm) { - return vmcb_is_intercept(&svm->nested.ctl, INTERCEPT_SMI); + return vmcb12_is_intercept(&svm->nested.ctl, INTERCEPT_SMI); } static inline bool nested_exit_on_intr(struct vcpu_svm *svm) { - return vmcb_is_intercept(&svm->nested.ctl, INTERCEPT_INTR); + return vmcb12_is_intercept(&svm->nested.ctl, INTERCEPT_INTR); } static inline bool nested_exit_on_nmi(struct vcpu_svm *svm) { - return vmcb_is_intercept(&svm->nested.ctl, INTERCEPT_NMI); + return vmcb12_is_intercept(&svm->nested.ctl, INTERCEPT_NMI); } int enter_svm_guest_mode(struct kvm_vcpu *vcpu, u64 vmcb_gpa, struct vmcb *vmcb12, bool from_vmrun); -void svm_leave_nested(struct vcpu_svm *svm); +void svm_leave_nested(struct kvm_vcpu *vcpu); void svm_free_nested(struct vcpu_svm *svm); int svm_allocate_nested(struct vcpu_svm *svm); int nested_svm_vmrun(struct kvm_vcpu *vcpu); @@ -493,9 +613,11 @@ int nested_svm_check_exception(struct vcpu_svm *svm, unsigned nr, bool has_error_code, u32 error_code); int nested_svm_exit_special(struct vcpu_svm *svm); void nested_svm_update_tsc_ratio_msr(struct kvm_vcpu *vcpu); -void svm_write_tsc_multiplier(struct kvm_vcpu *vcpu, u64 multiplier); -void nested_load_control_from_vmcb12(struct vcpu_svm *svm, - struct vmcb_control_area *control); +void __svm_write_tsc_multiplier(u64 multiplier); +void nested_copy_vmcb_control_to_cache(struct vcpu_svm *svm, + struct vmcb_control_area *control); +void nested_copy_vmcb_save_to_cache(struct vcpu_svm *svm, + struct vmcb_save_area *save); void nested_sync_control_from_vmcb02(struct vcpu_svm *svm); void nested_vmcb02_compute_g_pat(struct vcpu_svm *svm); void svm_switch_vmcb(struct vcpu_svm *svm, struct kvm_vmcb_info *target_vmcb); @@ -504,50 +626,27 @@ extern struct kvm_x86_nested_ops svm_nested_ops; /* avic.c */ -#define AVIC_LOGICAL_ID_ENTRY_GUEST_PHYSICAL_ID_MASK (0xFF) -#define AVIC_LOGICAL_ID_ENTRY_VALID_BIT 31 -#define AVIC_LOGICAL_ID_ENTRY_VALID_MASK (1 << 31) - -#define AVIC_PHYSICAL_ID_ENTRY_HOST_PHYSICAL_ID_MASK (0xFFULL) -#define AVIC_PHYSICAL_ID_ENTRY_BACKING_PAGE_MASK (0xFFFFFFFFFFULL << 12) -#define AVIC_PHYSICAL_ID_ENTRY_IS_RUNNING_MASK (1ULL << 62) -#define AVIC_PHYSICAL_ID_ENTRY_VALID_MASK (1ULL << 63) - -#define VMCB_AVIC_APIC_BAR_MASK 0xFFFFFFFFFF000ULL - -static inline bool avic_vcpu_is_running(struct kvm_vcpu *vcpu) -{ - struct vcpu_svm *svm = to_svm(vcpu); - u64 *entry = svm->avic_physical_id_cache; - - if (!entry) - return false; - - return (READ_ONCE(*entry) & AVIC_PHYSICAL_ID_ENTRY_IS_RUNNING_MASK); -} - +bool avic_hardware_setup(struct kvm_x86_ops *ops); int avic_ga_log_notifier(u32 ga_tag); void avic_vm_destroy(struct kvm *kvm); int avic_vm_init(struct kvm *kvm); -void avic_init_vmcb(struct vcpu_svm *svm); +void avic_init_vmcb(struct vcpu_svm *svm, struct vmcb *vmcb); int avic_incomplete_ipi_interception(struct kvm_vcpu *vcpu); int avic_unaccelerated_access_interception(struct kvm_vcpu *vcpu); int avic_init_vcpu(struct vcpu_svm *svm); void avic_vcpu_load(struct kvm_vcpu *vcpu, int cpu); void avic_vcpu_put(struct kvm_vcpu *vcpu); -void avic_post_state_restore(struct kvm_vcpu *vcpu); -void svm_set_virtual_apic_mode(struct kvm_vcpu *vcpu); -void svm_refresh_apicv_exec_ctrl(struct kvm_vcpu *vcpu); -bool svm_check_apicv_inhibit_reasons(ulong bit); -void svm_load_eoi_exitmap(struct kvm_vcpu *vcpu, u64 *eoi_exit_bitmap); -void svm_hwapic_irr_update(struct kvm_vcpu *vcpu, int max_irr); -void svm_hwapic_isr_update(struct kvm_vcpu *vcpu, int max_isr); -int svm_deliver_avic_intr(struct kvm_vcpu *vcpu, int vec); -bool svm_dy_apicv_has_pending_interrupt(struct kvm_vcpu *vcpu); -int svm_update_pi_irte(struct kvm *kvm, unsigned int host_irq, - uint32_t guest_irq, bool set); -void svm_vcpu_blocking(struct kvm_vcpu *vcpu); -void svm_vcpu_unblocking(struct kvm_vcpu *vcpu); +void avic_apicv_post_state_restore(struct kvm_vcpu *vcpu); +void avic_refresh_apicv_exec_ctrl(struct kvm_vcpu *vcpu); +bool avic_check_apicv_inhibit_reasons(enum kvm_apicv_inhibit reason); +int avic_pi_update_irte(struct kvm *kvm, unsigned int host_irq, + uint32_t guest_irq, bool set); +void avic_vcpu_blocking(struct kvm_vcpu *vcpu); +void avic_vcpu_unblocking(struct kvm_vcpu *vcpu); +void avic_ring_doorbell(struct kvm_vcpu *vcpu); +unsigned long avic_vcpu_get_apicv_inhibit_reasons(struct kvm_vcpu *vcpu); +void avic_set_virtual_apic_mode(struct kvm_vcpu *vcpu); + /* sev.c */ @@ -558,30 +657,32 @@ void svm_vcpu_unblocking(struct kvm_vcpu *vcpu); extern unsigned int max_sev_asid; void sev_vm_destroy(struct kvm *kvm); -int svm_mem_enc_op(struct kvm *kvm, void __user *argp); -int svm_register_enc_region(struct kvm *kvm, - struct kvm_enc_region *range); -int svm_unregister_enc_region(struct kvm *kvm, - struct kvm_enc_region *range); -int svm_vm_copy_asid_from(struct kvm *kvm, unsigned int source_fd); -int svm_vm_migrate_from(struct kvm *kvm, unsigned int source_fd); +int sev_mem_enc_ioctl(struct kvm *kvm, void __user *argp); +int sev_mem_enc_register_region(struct kvm *kvm, + struct kvm_enc_region *range); +int sev_mem_enc_unregister_region(struct kvm *kvm, + struct kvm_enc_region *range); +int sev_vm_copy_enc_context_from(struct kvm *kvm, unsigned int source_fd); +int sev_vm_move_enc_context_from(struct kvm *kvm, unsigned int source_fd); +void sev_guest_memory_reclaimed(struct kvm *kvm); + void pre_sev_run(struct vcpu_svm *svm, int cpu); void __init sev_set_cpu_caps(void); void __init sev_hardware_setup(void); -void sev_hardware_teardown(void); +void sev_hardware_unsetup(void); int sev_cpu_init(struct svm_cpu_data *sd); +void sev_init_vmcb(struct vcpu_svm *svm); void sev_free_vcpu(struct kvm_vcpu *vcpu); int sev_handle_vmgexit(struct kvm_vcpu *vcpu); int sev_es_string_io(struct vcpu_svm *svm, int size, unsigned int port, int in); -void sev_es_init_vmcb(struct vcpu_svm *svm); void sev_es_vcpu_reset(struct vcpu_svm *svm); void sev_vcpu_deliver_sipi_vector(struct kvm_vcpu *vcpu, u8 vector); -void sev_es_prepare_guest_switch(struct vcpu_svm *svm, unsigned int cpu); +void sev_es_prepare_switch_to_guest(struct sev_es_save_area *hostsa); void sev_es_unmap_ghcb(struct vcpu_svm *svm); /* vmenter.S */ -void __svm_sev_es_vcpu_run(unsigned long vmcb_pa); -void __svm_vcpu_run(unsigned long vmcb_pa, unsigned long *regs); +void __svm_sev_es_vcpu_run(struct vcpu_svm *svm, bool spec_ctrl_intercepted); +void __svm_vcpu_run(struct vcpu_svm *svm, bool spec_ctrl_intercepted); #endif diff --git a/arch/x86/kvm/svm/svm_onhyperv.c b/arch/x86/kvm/svm/svm_onhyperv.c index 98aa981c04ec..8cdc62c74a96 100644 --- a/arch/x86/kvm/svm/svm_onhyperv.c +++ b/arch/x86/kvm/svm/svm_onhyperv.c @@ -4,7 +4,6 @@ */ #include <linux/kvm_host.h> -#include "kvm_cache_regs.h" #include <asm/mshyperv.h> diff --git a/arch/x86/kvm/svm/svm_onhyperv.h b/arch/x86/kvm/svm/svm_onhyperv.h index c53b8bf8d013..e2fc59380465 100644 --- a/arch/x86/kvm/svm/svm_onhyperv.h +++ b/arch/x86/kvm/svm/svm_onhyperv.h @@ -7,35 +7,12 @@ #define __ARCH_X86_KVM_SVM_ONHYPERV_H__ #if IS_ENABLED(CONFIG_HYPERV) -#include <asm/mshyperv.h> -#include "hyperv.h" #include "kvm_onhyperv.h" +#include "svm/hyperv.h" static struct kvm_x86_ops svm_x86_ops; -/* - * Hyper-V uses the software reserved 32 bytes in VMCB - * control area to expose SVM enlightenments to guests. - */ -struct hv_enlightenments { - struct __packed hv_enlightenments_control { - u32 nested_flush_hypercall:1; - u32 msr_bitmap:1; - u32 enlightened_npt_tlb: 1; - u32 reserved:29; - } __packed hv_enlightenments_control; - u32 hv_vp_id; - u64 hv_vm_id; - u64 partition_assist_page; - u64 reserved; -} __packed; - -/* - * Hyper-V uses the software reserved clean bit in VMCB - */ -#define VMCB_HV_NESTED_ENLIGHTENMENTS VMCB_SW - int svm_hv_enable_direct_tlbflush(struct kvm_vcpu *vcpu); static inline void svm_hv_init_vmcb(struct vmcb *vmcb) @@ -46,6 +23,9 @@ static inline void svm_hv_init_vmcb(struct vmcb *vmcb) if (npt_enabled && ms_hyperv.nested_features & HV_X64_NESTED_ENLIGHTENED_TLB) hve->hv_enlightenments_control.enlightened_npt_tlb = 1; + + if (ms_hyperv.nested_features & HV_X64_NESTED_MSR_BITMAP) + hve->hv_enlightenments_control.msr_bitmap = 1; } static inline void svm_hv_hardware_setup(void) @@ -83,14 +63,7 @@ static inline void svm_hv_vmcb_dirty_nested_enlightenments( struct hv_enlightenments *hve = (struct hv_enlightenments *)vmcb->control.reserved_sw; - /* - * vmcb can be NULL if called during early vcpu init. - * And its okay not to mark vmcb dirty during vcpu init - * as we mark it dirty unconditionally towards end of vcpu - * init phase. - */ - if (vmcb_is_clean(vmcb, VMCB_HV_NESTED_ENLIGHTENMENTS) && - hve->hv_enlightenments_control.msr_bitmap) + if (hve->hv_enlightenments_control.msr_bitmap) vmcb_mark_dirty(vmcb, VMCB_HV_NESTED_ENLIGHTENMENTS); } diff --git a/arch/x86/kvm/svm/svm_ops.h b/arch/x86/kvm/svm/svm_ops.h index 9430d6437c9f..36c8af87a707 100644 --- a/arch/x86/kvm/svm/svm_ops.h +++ b/arch/x86/kvm/svm/svm_ops.h @@ -61,9 +61,4 @@ static __always_inline void vmsave(unsigned long pa) svm_asm1(vmsave, "a" (pa), "memory"); } -static __always_inline void vmload(unsigned long pa) -{ - svm_asm1(vmload, "a" (pa), "memory"); -} - #endif /* __KVM_X86_SVM_OPS_H */ diff --git a/arch/x86/kvm/svm/vmenter.S b/arch/x86/kvm/svm/vmenter.S index 4fa17df123cd..34367dc203f2 100644 --- a/arch/x86/kvm/svm/vmenter.S +++ b/arch/x86/kvm/svm/vmenter.S @@ -4,35 +4,97 @@ #include <asm/bitsperlong.h> #include <asm/kvm_vcpu_regs.h> #include <asm/nospec-branch.h> +#include "kvm-asm-offsets.h" #define WORD_SIZE (BITS_PER_LONG / 8) /* Intentionally omit RAX as it's context switched by hardware */ -#define VCPU_RCX __VCPU_REGS_RCX * WORD_SIZE -#define VCPU_RDX __VCPU_REGS_RDX * WORD_SIZE -#define VCPU_RBX __VCPU_REGS_RBX * WORD_SIZE +#define VCPU_RCX (SVM_vcpu_arch_regs + __VCPU_REGS_RCX * WORD_SIZE) +#define VCPU_RDX (SVM_vcpu_arch_regs + __VCPU_REGS_RDX * WORD_SIZE) +#define VCPU_RBX (SVM_vcpu_arch_regs + __VCPU_REGS_RBX * WORD_SIZE) /* Intentionally omit RSP as it's context switched by hardware */ -#define VCPU_RBP __VCPU_REGS_RBP * WORD_SIZE -#define VCPU_RSI __VCPU_REGS_RSI * WORD_SIZE -#define VCPU_RDI __VCPU_REGS_RDI * WORD_SIZE +#define VCPU_RBP (SVM_vcpu_arch_regs + __VCPU_REGS_RBP * WORD_SIZE) +#define VCPU_RSI (SVM_vcpu_arch_regs + __VCPU_REGS_RSI * WORD_SIZE) +#define VCPU_RDI (SVM_vcpu_arch_regs + __VCPU_REGS_RDI * WORD_SIZE) #ifdef CONFIG_X86_64 -#define VCPU_R8 __VCPU_REGS_R8 * WORD_SIZE -#define VCPU_R9 __VCPU_REGS_R9 * WORD_SIZE -#define VCPU_R10 __VCPU_REGS_R10 * WORD_SIZE -#define VCPU_R11 __VCPU_REGS_R11 * WORD_SIZE -#define VCPU_R12 __VCPU_REGS_R12 * WORD_SIZE -#define VCPU_R13 __VCPU_REGS_R13 * WORD_SIZE -#define VCPU_R14 __VCPU_REGS_R14 * WORD_SIZE -#define VCPU_R15 __VCPU_REGS_R15 * WORD_SIZE +#define VCPU_R8 (SVM_vcpu_arch_regs + __VCPU_REGS_R8 * WORD_SIZE) +#define VCPU_R9 (SVM_vcpu_arch_regs + __VCPU_REGS_R9 * WORD_SIZE) +#define VCPU_R10 (SVM_vcpu_arch_regs + __VCPU_REGS_R10 * WORD_SIZE) +#define VCPU_R11 (SVM_vcpu_arch_regs + __VCPU_REGS_R11 * WORD_SIZE) +#define VCPU_R12 (SVM_vcpu_arch_regs + __VCPU_REGS_R12 * WORD_SIZE) +#define VCPU_R13 (SVM_vcpu_arch_regs + __VCPU_REGS_R13 * WORD_SIZE) +#define VCPU_R14 (SVM_vcpu_arch_regs + __VCPU_REGS_R14 * WORD_SIZE) +#define VCPU_R15 (SVM_vcpu_arch_regs + __VCPU_REGS_R15 * WORD_SIZE) #endif +#define SVM_vmcb01_pa (SVM_vmcb01 + KVM_VMCB_pa) + .section .noinstr.text, "ax" +.macro RESTORE_GUEST_SPEC_CTRL + /* No need to do anything if SPEC_CTRL is unset or V_SPEC_CTRL is set */ + ALTERNATIVE_2 "", \ + "jmp 800f", X86_FEATURE_MSR_SPEC_CTRL, \ + "", X86_FEATURE_V_SPEC_CTRL +801: +.endm +.macro RESTORE_GUEST_SPEC_CTRL_BODY +800: + /* + * SPEC_CTRL handling: if the guest's SPEC_CTRL value differs from the + * host's, write the MSR. This is kept out-of-line so that the common + * case does not have to jump. + * + * IMPORTANT: To avoid RSB underflow attacks and any other nastiness, + * there must not be any returns or indirect branches between this code + * and vmentry. + */ + movl SVM_spec_ctrl(%_ASM_DI), %eax + cmp PER_CPU_VAR(x86_spec_ctrl_current), %eax + je 801b + mov $MSR_IA32_SPEC_CTRL, %ecx + xor %edx, %edx + wrmsr + jmp 801b +.endm + +.macro RESTORE_HOST_SPEC_CTRL + /* No need to do anything if SPEC_CTRL is unset or V_SPEC_CTRL is set */ + ALTERNATIVE_2 "", \ + "jmp 900f", X86_FEATURE_MSR_SPEC_CTRL, \ + "", X86_FEATURE_V_SPEC_CTRL +901: +.endm +.macro RESTORE_HOST_SPEC_CTRL_BODY +900: + /* Same for after vmexit. */ + mov $MSR_IA32_SPEC_CTRL, %ecx + + /* + * Load the value that the guest had written into MSR_IA32_SPEC_CTRL, + * if it was not intercepted during guest execution. + */ + cmpb $0, (%_ASM_SP) + jnz 998f + rdmsr + movl %eax, SVM_spec_ctrl(%_ASM_DI) +998: + + /* Now restore the host value of the MSR if different from the guest's. */ + movl PER_CPU_VAR(x86_spec_ctrl_current), %eax + cmp SVM_spec_ctrl(%_ASM_DI), %eax + je 901b + xor %edx, %edx + wrmsr + jmp 901b +.endm + + /** * __svm_vcpu_run - Run a vCPU via a transition to SVM guest mode - * @vmcb_pa: unsigned long - * @regs: unsigned long * (to guest registers) + * @svm: struct vcpu_svm * + * @spec_ctrl_intercepted: bool */ SYM_FUNC_START(__svm_vcpu_run) push %_ASM_BP @@ -47,49 +109,71 @@ SYM_FUNC_START(__svm_vcpu_run) #endif push %_ASM_BX - /* Save @regs. */ + /* + * Save variables needed after vmexit on the stack, in inverse + * order compared to when they are needed. + */ + + /* Accessed directly from the stack in RESTORE_HOST_SPEC_CTRL. */ push %_ASM_ARG2 - /* Save @vmcb. */ + /* Needed to restore access to percpu variables. */ + __ASM_SIZE(push) PER_CPU_VAR(svm_data + SD_save_area_pa) + + /* Finally save @svm. */ push %_ASM_ARG1 - /* Move @regs to RAX. */ - mov %_ASM_ARG2, %_ASM_AX +.ifnc _ASM_ARG1, _ASM_DI + /* + * Stash @svm in RDI early. On 32-bit, arguments are in RAX, RCX + * and RDX which are clobbered by RESTORE_GUEST_SPEC_CTRL. + */ + mov %_ASM_ARG1, %_ASM_DI +.endif + + /* Clobbers RAX, RCX, RDX. */ + RESTORE_GUEST_SPEC_CTRL + + /* + * Use a single vmcb (vmcb01 because it's always valid) for + * context switching guest state via VMLOAD/VMSAVE, that way + * the state doesn't need to be copied between vmcb01 and + * vmcb02 when switching vmcbs for nested virtualization. + */ + mov SVM_vmcb01_pa(%_ASM_DI), %_ASM_AX +1: vmload %_ASM_AX +2: + + /* Get svm->current_vmcb->pa into RAX. */ + mov SVM_current_vmcb(%_ASM_DI), %_ASM_AX + mov KVM_VMCB_pa(%_ASM_AX), %_ASM_AX /* Load guest registers. */ - mov VCPU_RCX(%_ASM_AX), %_ASM_CX - mov VCPU_RDX(%_ASM_AX), %_ASM_DX - mov VCPU_RBX(%_ASM_AX), %_ASM_BX - mov VCPU_RBP(%_ASM_AX), %_ASM_BP - mov VCPU_RSI(%_ASM_AX), %_ASM_SI - mov VCPU_RDI(%_ASM_AX), %_ASM_DI + mov VCPU_RCX(%_ASM_DI), %_ASM_CX + mov VCPU_RDX(%_ASM_DI), %_ASM_DX + mov VCPU_RBX(%_ASM_DI), %_ASM_BX + mov VCPU_RBP(%_ASM_DI), %_ASM_BP + mov VCPU_RSI(%_ASM_DI), %_ASM_SI #ifdef CONFIG_X86_64 - mov VCPU_R8 (%_ASM_AX), %r8 - mov VCPU_R9 (%_ASM_AX), %r9 - mov VCPU_R10(%_ASM_AX), %r10 - mov VCPU_R11(%_ASM_AX), %r11 - mov VCPU_R12(%_ASM_AX), %r12 - mov VCPU_R13(%_ASM_AX), %r13 - mov VCPU_R14(%_ASM_AX), %r14 - mov VCPU_R15(%_ASM_AX), %r15 + mov VCPU_R8 (%_ASM_DI), %r8 + mov VCPU_R9 (%_ASM_DI), %r9 + mov VCPU_R10(%_ASM_DI), %r10 + mov VCPU_R11(%_ASM_DI), %r11 + mov VCPU_R12(%_ASM_DI), %r12 + mov VCPU_R13(%_ASM_DI), %r13 + mov VCPU_R14(%_ASM_DI), %r14 + mov VCPU_R15(%_ASM_DI), %r15 #endif - - /* "POP" @vmcb to RAX. */ - pop %_ASM_AX + mov VCPU_RDI(%_ASM_DI), %_ASM_DI /* Enter guest mode */ sti -1: vmrun %_ASM_AX - -2: cli - -#ifdef CONFIG_RETPOLINE - /* IMPORTANT: Stuff the RSB immediately after VM-Exit, before RET! */ - FILL_RETURN_BUFFER %_ASM_AX, RSB_CLEAR_LOOPS, X86_FEATURE_RETPOLINE -#endif +3: vmrun %_ASM_AX +4: + cli - /* "POP" @regs to RAX. */ + /* Pop @svm to RAX while it's the only available register. */ pop %_ASM_AX /* Save all guest registers. */ @@ -110,6 +194,35 @@ SYM_FUNC_START(__svm_vcpu_run) mov %r15, VCPU_R15(%_ASM_AX) #endif + /* @svm can stay in RDI from now on. */ + mov %_ASM_AX, %_ASM_DI + + mov SVM_vmcb01_pa(%_ASM_DI), %_ASM_AX +5: vmsave %_ASM_AX +6: + + /* Restores GSBASE among other things, allowing access to percpu data. */ + pop %_ASM_AX +7: vmload %_ASM_AX +8: + +#ifdef CONFIG_RETPOLINE + /* IMPORTANT: Stuff the RSB immediately after VM-Exit, before RET! */ + FILL_RETURN_BUFFER %_ASM_AX, RSB_CLEAR_LOOPS, X86_FEATURE_RETPOLINE +#endif + + /* Clobbers RAX, RCX, RDX. */ + RESTORE_HOST_SPEC_CTRL + + /* + * Mitigate RETBleed for AMD/Hygon Zen uarch. RET should be + * untrained as soon as we exit the VM and are back to the + * kernel. This should be done before re-enabling interrupts + * because interrupt handlers won't sanitize 'ret' if the return is + * from the kernel. + */ + UNTRAIN_RET + /* * Clear all general purpose registers except RSP and RAX to prevent * speculative use of the guest's values, even those that are reloaded @@ -136,6 +249,9 @@ SYM_FUNC_START(__svm_vcpu_run) xor %r15d, %r15d #endif + /* "Pop" @spec_ctrl_intercepted. */ + pop %_ASM_BX + pop %_ASM_BX #ifdef CONFIG_X86_64 @@ -148,19 +264,35 @@ SYM_FUNC_START(__svm_vcpu_run) pop %edi #endif pop %_ASM_BP - ret + RET -3: cmpb $0, kvm_rebooting + RESTORE_GUEST_SPEC_CTRL_BODY + RESTORE_HOST_SPEC_CTRL_BODY + +10: cmpb $0, kvm_rebooting jne 2b ud2 +30: cmpb $0, kvm_rebooting + jne 4b + ud2 +50: cmpb $0, kvm_rebooting + jne 6b + ud2 +70: cmpb $0, kvm_rebooting + jne 8b + ud2 - _ASM_EXTABLE(1b, 3b) + _ASM_EXTABLE(1b, 10b) + _ASM_EXTABLE(3b, 30b) + _ASM_EXTABLE(5b, 50b) + _ASM_EXTABLE(7b, 70b) SYM_FUNC_END(__svm_vcpu_run) /** * __svm_sev_es_vcpu_run - Run a SEV-ES vCPU via a transition to SVM guest mode - * @vmcb_pa: unsigned long + * @svm: struct vcpu_svm * + * @spec_ctrl_intercepted: bool */ SYM_FUNC_START(__svm_sev_es_vcpu_run) push %_ASM_BP @@ -175,8 +307,31 @@ SYM_FUNC_START(__svm_sev_es_vcpu_run) #endif push %_ASM_BX - /* Move @vmcb to RAX. */ - mov %_ASM_ARG1, %_ASM_AX + /* + * Save variables needed after vmexit on the stack, in inverse + * order compared to when they are needed. + */ + + /* Accessed directly from the stack in RESTORE_HOST_SPEC_CTRL. */ + push %_ASM_ARG2 + + /* Save @svm. */ + push %_ASM_ARG1 + +.ifnc _ASM_ARG1, _ASM_DI + /* + * Stash @svm in RDI early. On 32-bit, arguments are in RAX, RCX + * and RDX which are clobbered by RESTORE_GUEST_SPEC_CTRL. + */ + mov %_ASM_ARG1, %_ASM_DI +.endif + + /* Clobbers RAX, RCX, RDX. */ + RESTORE_GUEST_SPEC_CTRL + + /* Get svm->current_vmcb->pa into RAX. */ + mov SVM_current_vmcb(%_ASM_DI), %_ASM_AX + mov KVM_VMCB_pa(%_ASM_AX), %_ASM_AX /* Enter guest mode */ sti @@ -185,11 +340,29 @@ SYM_FUNC_START(__svm_sev_es_vcpu_run) 2: cli + /* Pop @svm to RDI, guest registers have been saved already. */ + pop %_ASM_DI + #ifdef CONFIG_RETPOLINE /* IMPORTANT: Stuff the RSB immediately after VM-Exit, before RET! */ FILL_RETURN_BUFFER %_ASM_AX, RSB_CLEAR_LOOPS, X86_FEATURE_RETPOLINE #endif + /* Clobbers RAX, RCX, RDX. */ + RESTORE_HOST_SPEC_CTRL + + /* + * Mitigate RETBleed for AMD/Hygon Zen uarch. RET should be + * untrained as soon as we exit the VM and are back to the + * kernel. This should be done before re-enabling interrupts + * because interrupt handlers won't sanitize RET if the return is + * from the kernel. + */ + UNTRAIN_RET + + /* "Pop" @spec_ctrl_intercepted. */ + pop %_ASM_BX + pop %_ASM_BX #ifdef CONFIG_X86_64 @@ -202,7 +375,10 @@ SYM_FUNC_START(__svm_sev_es_vcpu_run) pop %edi #endif pop %_ASM_BP - ret + RET + + RESTORE_GUEST_SPEC_CTRL_BODY + RESTORE_HOST_SPEC_CTRL_BODY 3: cmpb $0, kvm_rebooting jne 2b |