diff options
Diffstat (limited to 'arch/x86/kvm/svm')
-rw-r--r-- | arch/x86/kvm/svm/avic.c | 77 | ||||
-rw-r--r-- | arch/x86/kvm/svm/nested.c | 86 | ||||
-rw-r--r-- | arch/x86/kvm/svm/pmu.c | 19 | ||||
-rw-r--r-- | arch/x86/kvm/svm/sev.c | 2460 | ||||
-rw-r--r-- | arch/x86/kvm/svm/svm.c | 743 | ||||
-rw-r--r-- | arch/x86/kvm/svm/svm.h | 209 | ||||
-rw-r--r-- | arch/x86/kvm/svm/vmenter.S | 113 |
7 files changed, 2992 insertions, 715 deletions
diff --git a/arch/x86/kvm/svm/avic.c b/arch/x86/kvm/svm/avic.c index 4b74ea91f4e6..067f8e3f5a0d 100644 --- a/arch/x86/kvm/svm/avic.c +++ b/arch/x86/kvm/svm/avic.c @@ -20,6 +20,7 @@ #include <linux/kvm_host.h> #include <asm/irq_remapping.h> +#include <asm/msr.h> #include "trace.h" #include "lapic.h" @@ -330,7 +331,7 @@ void avic_ring_doorbell(struct kvm_vcpu *vcpu) int cpu = READ_ONCE(vcpu->cpu); if (cpu != get_cpu()) { - wrmsrl(MSR_AMD64_SVM_AVIC_DOORBELL, kvm_cpu_get_apicid(cpu)); + wrmsrq(MSR_AMD64_SVM_AVIC_DOORBELL, kvm_cpu_get_apicid(cpu)); trace_kvm_avic_doorbell(vcpu->vcpu_id, kvm_cpu_get_apicid(cpu)); } put_cpu(); @@ -796,12 +797,15 @@ static int svm_ir_list_add(struct vcpu_svm *svm, struct amd_iommu_pi_data *pi) struct amd_svm_iommu_ir *ir; u64 entry; + if (WARN_ON_ONCE(!pi->ir_data)) + return -EINVAL; + /** * In some cases, the existing irte is updated and re-set, * so we need to check here if it's already been * added * to the ir_list. */ - if (pi->ir_data && (pi->prev_ga_tag != 0)) { + if (pi->prev_ga_tag) { struct kvm *kvm = svm->vcpu.kvm; u32 vcpu_id = AVIC_GATAG_TO_VCPUID(pi->prev_ga_tag); struct kvm_vcpu *prev_vcpu = kvm_get_vcpu_by_id(kvm, vcpu_id); @@ -820,7 +824,7 @@ static int svm_ir_list_add(struct vcpu_svm *svm, struct amd_iommu_pi_data *pi) * Allocating new amd_iommu_pi_data, which will get * add to the per-vcpu ir_list. */ - ir = kzalloc(sizeof(struct amd_svm_iommu_ir), GFP_KERNEL_ACCOUNT); + ir = kzalloc(sizeof(struct amd_svm_iommu_ir), GFP_ATOMIC | __GFP_ACCOUNT); if (!ir) { ret = -ENOMEM; goto out; @@ -896,10 +900,10 @@ int avic_pi_update_irte(struct kvm *kvm, unsigned int host_irq, { struct kvm_kernel_irq_routing_entry *e; struct kvm_irq_routing_table *irq_rt; + bool enable_remapped_mode = true; int idx, ret = 0; - if (!kvm_arch_has_assigned_device(kvm) || - !irq_remapping_cap(IRQ_POSTING_CAP)) + if (!kvm_arch_has_assigned_device(kvm) || !kvm_arch_has_irq_bypass()) return 0; pr_debug("SVM: %s: host_irq=%#x, guest_irq=%#x, set=%#x\n", @@ -933,6 +937,8 @@ int avic_pi_update_irte(struct kvm *kvm, unsigned int host_irq, kvm_vcpu_apicv_active(&svm->vcpu)) { struct amd_iommu_pi_data pi; + enable_remapped_mode = false; + /* Try to enable guest_mode in IRTE */ pi.base = __sme_set(page_to_phys(svm->avic_backing_page) & AVIC_HPA_MASK); @@ -951,33 +957,6 @@ int avic_pi_update_irte(struct kvm *kvm, unsigned int host_irq, */ if (!ret && pi.is_guest_mode) svm_ir_list_add(svm, &pi); - } else { - /* Use legacy mode in IRTE */ - struct amd_iommu_pi_data pi; - - /** - * Here, pi is used to: - * - Tell IOMMU to use legacy mode for this interrupt. - * - Retrieve ga_tag of prior interrupt remapping data. - */ - pi.prev_ga_tag = 0; - pi.is_guest_mode = false; - ret = irq_set_vcpu_affinity(host_irq, &pi); - - /** - * Check if the posted interrupt was previously - * setup with the guest_mode by checking if the ga_tag - * was cached. If so, we need to clean up the per-vcpu - * ir_list. - */ - if (!ret && pi.prev_ga_tag) { - int id = AVIC_GATAG_TO_VCPUID(pi.prev_ga_tag); - struct kvm_vcpu *vcpu; - - vcpu = kvm_get_vcpu_by_id(kvm, id); - if (vcpu) - svm_ir_list_del(to_svm(vcpu), &pi); - } } if (!ret && svm) { @@ -993,6 +972,34 @@ int avic_pi_update_irte(struct kvm *kvm, unsigned int host_irq, } ret = 0; + if (enable_remapped_mode) { + /* Use legacy mode in IRTE */ + struct amd_iommu_pi_data pi; + + /** + * Here, pi is used to: + * - Tell IOMMU to use legacy mode for this interrupt. + * - Retrieve ga_tag of prior interrupt remapping data. + */ + pi.prev_ga_tag = 0; + pi.is_guest_mode = false; + ret = irq_set_vcpu_affinity(host_irq, &pi); + + /** + * Check if the posted interrupt was previously + * setup with the guest_mode by checking if the ga_tag + * was cached. If so, we need to clean up the per-vcpu + * ir_list. + */ + if (!ret && pi.prev_ga_tag) { + int id = AVIC_GATAG_TO_VCPUID(pi.prev_ga_tag); + struct kvm_vcpu *vcpu; + + vcpu = kvm_get_vcpu_by_id(kvm, id); + if (vcpu) + svm_ir_list_del(to_svm(vcpu), &pi); + } + } out: srcu_read_unlock(&kvm->irq_srcu, idx); return ret; @@ -1199,6 +1206,12 @@ bool avic_hardware_setup(void) return false; } + if (cc_platform_has(CC_ATTR_HOST_SEV_SNP) && + !boot_cpu_has(X86_FEATURE_HV_INUSE_WR_ALLOWED)) { + pr_warn("AVIC disabled: missing HvInUseWrAllowed on SNP-enabled system\n"); + return false; + } + if (boot_cpu_has(X86_FEATURE_AVIC)) { pr_info("AVIC enabled\n"); } else if (force_avic) { diff --git a/arch/x86/kvm/svm/nested.c b/arch/x86/kvm/svm/nested.c index 55b9a6d96bcf..8427a48b8b7a 100644 --- a/arch/x86/kvm/svm/nested.c +++ b/arch/x86/kvm/svm/nested.c @@ -63,8 +63,12 @@ static u64 nested_svm_get_tdp_pdptr(struct kvm_vcpu *vcpu, int index) u64 pdpte; int ret; + /* + * Note, nCR3 is "assumed" to be 32-byte aligned, i.e. the CPU ignores + * nCR3[4:0] when loading PDPTEs from memory. + */ ret = kvm_vcpu_read_guest_page(vcpu, gpa_to_gfn(cr3), &pdpte, - offset_in_page(cr3) + index * 8, 8); + (cr3 & GENMASK(11, 5)) + index * 8, 8); if (ret) return 0; return pdpte; @@ -107,7 +111,7 @@ static void nested_svm_uninit_mmu_context(struct kvm_vcpu *vcpu) static bool nested_vmcb_needs_vls_intercept(struct vcpu_svm *svm) { - if (!guest_can_use(&svm->vcpu, X86_FEATURE_V_VMSAVE_VMLOAD)) + if (!guest_cpu_cap_has(&svm->vcpu, X86_FEATURE_V_VMSAVE_VMLOAD)) return true; if (!nested_npt_enabled(svm)) @@ -590,7 +594,7 @@ static void nested_vmcb02_prepare_save(struct vcpu_svm *svm, struct vmcb *vmcb12 vmcb_mark_dirty(vmcb02, VMCB_DR); } - if (unlikely(guest_can_use(vcpu, X86_FEATURE_LBRV) && + if (unlikely(guest_cpu_cap_has(vcpu, X86_FEATURE_LBRV) && (svm->nested.ctl.virt_ext & LBR_CTL_ENABLE_MASK))) { /* * Reserved bits of DEBUGCTL are ignored. Be consistent with @@ -642,12 +646,17 @@ static void nested_vmcb02_prepare_control(struct vcpu_svm *svm, u32 pause_count12; u32 pause_thresh12; + nested_svm_transition_tlb_flush(vcpu); + + /* Enter Guest-Mode */ + enter_guest_mode(vcpu); + /* * Filled at exit: exit_code, exit_code_hi, exit_info_1, exit_info_2, * exit_int_info, exit_int_info_err, next_rip, insn_len, insn_bytes. */ - if (guest_can_use(vcpu, X86_FEATURE_VGIF) && + if (guest_cpu_cap_has(vcpu, X86_FEATURE_VGIF) && (svm->nested.ctl.int_ctl & V_GIF_ENABLE_MASK)) int_ctl_vmcb12_bits |= (V_GIF_MASK | V_GIF_ENABLE_MASK); else @@ -669,6 +678,33 @@ static void nested_vmcb02_prepare_control(struct vcpu_svm *svm, vmcb02->control.iopm_base_pa = vmcb01->control.iopm_base_pa; vmcb02->control.msrpm_base_pa = vmcb01->control.msrpm_base_pa; + /* + * Stash vmcb02's counter if the guest hasn't moved past the guilty + * instruction; otherwise, reset the counter to '0'. + * + * In order to detect if L2 has made forward progress or not, track the + * RIP at which a bus lock has occurred on a per-vmcb12 basis. If RIP + * is changed, guest has clearly made forward progress, bus_lock_counter + * still remained '1', so reset bus_lock_counter to '0'. Eg. In the + * scenario, where a buslock happened in L1 before VMRUN, the bus lock + * firmly happened on an instruction in the past. Even if vmcb01's + * counter is still '1', (because the guilty instruction got patched), + * the vCPU has clearly made forward progress and so KVM should reset + * vmcb02's counter to '0'. + * + * If the RIP hasn't changed, stash the bus lock counter at nested VMRUN + * to prevent the same guilty instruction from triggering a VM-Exit. Eg. + * if userspace rate-limits the vCPU, then it's entirely possible that + * L1's tick interrupt is pending by the time userspace re-runs the + * vCPU. If KVM unconditionally clears the counter on VMRUN, then when + * L1 re-enters L2, the same instruction will trigger a VM-Exit and the + * entire cycle start over. + */ + if (vmcb02->save.rip && (svm->nested.ctl.bus_lock_rip == vmcb02->save.rip)) + vmcb02->control.bus_lock_counter = 1; + else + vmcb02->control.bus_lock_counter = 0; + /* Done at vmrun: asid. */ /* Also overwritten later if necessary. */ @@ -685,7 +721,7 @@ static void nested_vmcb02_prepare_control(struct vcpu_svm *svm, vmcb02->control.tsc_offset = vcpu->arch.tsc_offset; - if (guest_can_use(vcpu, X86_FEATURE_TSCRATEMSR) && + if (guest_cpu_cap_has(vcpu, X86_FEATURE_TSCRATEMSR) && svm->tsc_ratio_msr != kvm_caps.default_tsc_scaling_ratio) nested_svm_update_tsc_ratio_msr(vcpu); @@ -706,7 +742,7 @@ static void nested_vmcb02_prepare_control(struct vcpu_svm *svm, * what a nrips=0 CPU would do (L1 is responsible for advancing RIP * prior to injecting the event). */ - if (guest_can_use(vcpu, X86_FEATURE_NRIPS)) + if (guest_cpu_cap_has(vcpu, X86_FEATURE_NRIPS)) vmcb02->control.next_rip = svm->nested.ctl.next_rip; else if (boot_cpu_has(X86_FEATURE_NRIPS)) vmcb02->control.next_rip = vmcb12_rip; @@ -716,7 +752,7 @@ static void nested_vmcb02_prepare_control(struct vcpu_svm *svm, svm->soft_int_injected = true; svm->soft_int_csbase = vmcb12_csbase; svm->soft_int_old_rip = vmcb12_rip; - if (guest_can_use(vcpu, X86_FEATURE_NRIPS)) + if (guest_cpu_cap_has(vcpu, X86_FEATURE_NRIPS)) svm->soft_int_next_rip = svm->nested.ctl.next_rip; else svm->soft_int_next_rip = vmcb12_rip; @@ -724,18 +760,18 @@ static void nested_vmcb02_prepare_control(struct vcpu_svm *svm, vmcb02->control.virt_ext = vmcb01->control.virt_ext & LBR_CTL_ENABLE_MASK; - if (guest_can_use(vcpu, X86_FEATURE_LBRV)) + if (guest_cpu_cap_has(vcpu, X86_FEATURE_LBRV)) vmcb02->control.virt_ext |= (svm->nested.ctl.virt_ext & LBR_CTL_ENABLE_MASK); if (!nested_vmcb_needs_vls_intercept(svm)) vmcb02->control.virt_ext |= VIRTUAL_VMLOAD_VMSAVE_ENABLE_MASK; - if (guest_can_use(vcpu, X86_FEATURE_PAUSEFILTER)) + if (guest_cpu_cap_has(vcpu, X86_FEATURE_PAUSEFILTER)) pause_count12 = svm->nested.ctl.pause_filter_count; else pause_count12 = 0; - if (guest_can_use(vcpu, X86_FEATURE_PFTHRESHOLD)) + if (guest_cpu_cap_has(vcpu, X86_FEATURE_PFTHRESHOLD)) pause_thresh12 = svm->nested.ctl.pause_filter_thresh; else pause_thresh12 = 0; @@ -758,11 +794,6 @@ static void nested_vmcb02_prepare_control(struct vcpu_svm *svm, } } - nested_svm_transition_tlb_flush(vcpu); - - /* Enter Guest-Mode */ - enter_guest_mode(vcpu); - /* * Merge guest and host intercepts - must be called with vcpu in * guest-mode to take effect. @@ -922,7 +953,7 @@ out_exit_err: nested_svm_vmexit(svm); out: - kvm_vcpu_unmap(vcpu, &map, true); + kvm_vcpu_unmap(vcpu, &map); return ret; } @@ -990,7 +1021,7 @@ int nested_svm_vmexit(struct vcpu_svm *svm) kvm_clear_request(KVM_REQ_GET_NESTED_STATE_PAGES, vcpu); /* in case we halted in L2 */ - svm->vcpu.arch.mp_state = KVM_MP_STATE_RUNNABLE; + kvm_set_mp_state(vcpu, KVM_MP_STATE_RUNNABLE); /* Give the current vmcb to the guest */ @@ -1022,7 +1053,7 @@ int nested_svm_vmexit(struct vcpu_svm *svm) if (vmcb12->control.exit_code != SVM_EXIT_ERR) nested_save_pending_event_to_vmcb12(svm, vmcb12); - if (guest_can_use(vcpu, X86_FEATURE_NRIPS)) + if (guest_cpu_cap_has(vcpu, X86_FEATURE_NRIPS)) vmcb12->control.next_rip = vmcb02->control.next_rip; vmcb12->control.int_ctl = svm->nested.ctl.int_ctl; @@ -1035,8 +1066,17 @@ int nested_svm_vmexit(struct vcpu_svm *svm) } + /* + * Invalidate bus_lock_rip unless KVM is still waiting for the guest + * to make forward progress before re-enabling bus lock detection. + */ + if (!vmcb02->control.bus_lock_counter) + svm->nested.ctl.bus_lock_rip = INVALID_GPA; + nested_svm_copy_common_state(svm->nested.vmcb02.ptr, svm->vmcb01.ptr); + kvm_nested_vmexit_handle_ibrs(vcpu); + svm_switch_vmcb(svm, &svm->vmcb01); /* @@ -1061,7 +1101,7 @@ int nested_svm_vmexit(struct vcpu_svm *svm) if (!nested_exit_on_intr(svm)) kvm_make_request(KVM_REQ_EVENT, &svm->vcpu); - if (unlikely(guest_can_use(vcpu, X86_FEATURE_LBRV) && + if (unlikely(guest_cpu_cap_has(vcpu, X86_FEATURE_LBRV) && (svm->nested.ctl.virt_ext & LBR_CTL_ENABLE_MASK))) { svm_copy_lbrs(vmcb12, vmcb02); svm_update_lbrv(vcpu); @@ -1126,7 +1166,7 @@ int nested_svm_vmexit(struct vcpu_svm *svm) vmcb12->control.exit_int_info_err, KVM_ISA_SVM); - kvm_vcpu_unmap(vcpu, &map, true); + kvm_vcpu_unmap(vcpu, &map); nested_svm_transition_tlb_flush(vcpu); @@ -1181,7 +1221,7 @@ int svm_allocate_nested(struct vcpu_svm *svm) if (svm->nested.initialized) return 0; - vmcb02_page = snp_safe_alloc_page(&svm->vcpu); + vmcb02_page = snp_safe_alloc_page(); if (!vmcb02_page) return -ENOMEM; svm->nested.vmcb02.ptr = page_address(vmcb02_page); @@ -1693,8 +1733,8 @@ static int svm_set_nested_state(struct kvm_vcpu *vcpu, return -EINVAL; ret = -ENOMEM; - ctl = kzalloc(sizeof(*ctl), GFP_KERNEL_ACCOUNT); - save = kzalloc(sizeof(*save), GFP_KERNEL_ACCOUNT); + ctl = kzalloc(sizeof(*ctl), GFP_KERNEL); + save = kzalloc(sizeof(*save), GFP_KERNEL); if (!ctl || !save) goto out_free; diff --git a/arch/x86/kvm/svm/pmu.c b/arch/x86/kvm/svm/pmu.c index dfcc38bd97d3..288f7f2a46f2 100644 --- a/arch/x86/kvm/svm/pmu.c +++ b/arch/x86/kvm/svm/pmu.c @@ -46,7 +46,7 @@ static inline struct kvm_pmc *get_gp_pmc_amd(struct kvm_pmu *pmu, u32 msr, switch (msr) { case MSR_F15H_PERF_CTL0 ... MSR_F15H_PERF_CTR5: - if (!guest_cpuid_has(vcpu, X86_FEATURE_PERFCTR_CORE)) + if (!guest_cpu_cap_has(vcpu, X86_FEATURE_PERFCTR_CORE)) return NULL; /* * Each PMU counter has a pair of CTL and CTR MSRs. CTLn @@ -109,7 +109,7 @@ static bool amd_is_valid_msr(struct kvm_vcpu *vcpu, u32 msr) case MSR_K7_EVNTSEL0 ... MSR_K7_PERFCTR3: return pmu->version > 0; case MSR_F15H_PERF_CTL0 ... MSR_F15H_PERF_CTR5: - return guest_cpuid_has(vcpu, X86_FEATURE_PERFCTR_CORE); + return guest_cpu_cap_has(vcpu, X86_FEATURE_PERFCTR_CORE); case MSR_AMD64_PERF_CNTR_GLOBAL_STATUS: case MSR_AMD64_PERF_CNTR_GLOBAL_CTL: case MSR_AMD64_PERF_CNTR_GLOBAL_STATUS_CLR: @@ -179,7 +179,7 @@ static void amd_pmu_refresh(struct kvm_vcpu *vcpu) union cpuid_0x80000022_ebx ebx; pmu->version = 1; - if (guest_cpuid_has(vcpu, X86_FEATURE_PERFMON_V2)) { + if (guest_cpu_cap_has(vcpu, X86_FEATURE_PERFMON_V2)) { pmu->version = 2; /* * Note, PERFMON_V2 is also in 0x80000022.0x0, i.e. the guest @@ -189,7 +189,7 @@ static void amd_pmu_refresh(struct kvm_vcpu *vcpu) x86_feature_cpuid(X86_FEATURE_PERFMON_V2).index); ebx.full = kvm_find_cpuid_entry_index(vcpu, 0x80000022, 0)->ebx; pmu->nr_arch_gp_counters = ebx.split.num_core_pmc; - } else if (guest_cpuid_has(vcpu, X86_FEATURE_PERFCTR_CORE)) { + } else if (guest_cpu_cap_has(vcpu, X86_FEATURE_PERFCTR_CORE)) { pmu->nr_arch_gp_counters = AMD64_NUM_COUNTERS_CORE; } else { pmu->nr_arch_gp_counters = AMD64_NUM_COUNTERS; @@ -199,8 +199,8 @@ static void amd_pmu_refresh(struct kvm_vcpu *vcpu) kvm_pmu_cap.num_counters_gp); if (pmu->version > 1) { - pmu->global_ctrl_mask = ~((1ull << pmu->nr_arch_gp_counters) - 1); - pmu->global_status_mask = pmu->global_ctrl_mask; + pmu->global_ctrl_rsvd = ~((1ull << pmu->nr_arch_gp_counters) - 1); + pmu->global_status_rsvd = pmu->global_ctrl_rsvd; } pmu->counter_bitmask[KVM_PMC_GP] = ((u64)1 << 48) - 1; @@ -217,10 +217,9 @@ static void amd_pmu_init(struct kvm_vcpu *vcpu) struct kvm_pmu *pmu = vcpu_to_pmu(vcpu); int i; - BUILD_BUG_ON(KVM_AMD_PMC_MAX_GENERIC > AMD64_NUM_COUNTERS_CORE); - BUILD_BUG_ON(KVM_AMD_PMC_MAX_GENERIC > INTEL_PMC_MAX_GENERIC); + BUILD_BUG_ON(KVM_MAX_NR_AMD_GP_COUNTERS > AMD64_NUM_COUNTERS_CORE); - for (i = 0; i < KVM_AMD_PMC_MAX_GENERIC ; i++) { + for (i = 0; i < KVM_MAX_NR_AMD_GP_COUNTERS; i++) { pmu->gp_counters[i].type = KVM_PMC_GP; pmu->gp_counters[i].vcpu = vcpu; pmu->gp_counters[i].idx = i; @@ -238,6 +237,6 @@ struct kvm_pmu_ops amd_pmu_ops __initdata = { .refresh = amd_pmu_refresh, .init = amd_pmu_init, .EVENTSEL_EVENT = AMD64_EVENTSEL_EVENT, - .MAX_NR_GP_COUNTERS = KVM_AMD_PMC_MAX_GENERIC, + .MAX_NR_GP_COUNTERS = KVM_MAX_NR_AMD_GP_COUNTERS, .MIN_NR_GP_COUNTERS = AMD64_NUM_COUNTERS, }; diff --git a/arch/x86/kvm/svm/sev.c b/arch/x86/kvm/svm/sev.c index ae0ac12382b9..459c3b791fd4 100644 --- a/arch/x86/kvm/svm/sev.c +++ b/arch/x86/kvm/svm/sev.c @@ -19,11 +19,15 @@ #include <linux/misc_cgroup.h> #include <linux/processor.h> #include <linux/trace_events.h> +#include <uapi/linux/sev-guest.h> #include <asm/pkru.h> #include <asm/trapnr.h> #include <asm/fpu/xcr.h> +#include <asm/fpu/xstate.h> #include <asm/debugreg.h> +#include <asm/msr.h> +#include <asm/sev.h> #include "mmu.h" #include "x86.h" @@ -32,22 +36,12 @@ #include "cpuid.h" #include "trace.h" -#ifndef CONFIG_KVM_AMD_SEV -/* - * When this config is not defined, SEV feature is not supported and APIs in - * this file are not used but this file still gets compiled into the KVM AMD - * module. - * - * We will not have MISC_CG_RES_SEV and MISC_CG_RES_SEV_ES entries in the enum - * misc_res_type {} defined in linux/misc_cgroup.h. - * - * Below macros allow compilation to succeed. - */ -#define MISC_CG_RES_SEV MISC_CG_RES_TYPES -#define MISC_CG_RES_SEV_ES MISC_CG_RES_TYPES -#endif +#define GHCB_VERSION_MAX 2ULL +#define GHCB_VERSION_DEFAULT 2ULL +#define GHCB_VERSION_MIN 1ULL + +#define GHCB_HV_FT_SUPPORTED (GHCB_HV_FT_SNP | GHCB_HV_FT_SNP_AP_CREATION) -#ifdef CONFIG_KVM_AMD_SEV /* enable/disable SEV support */ static bool sev_enabled = true; module_param_named(sev, sev_enabled, bool, 0444); @@ -56,14 +50,35 @@ module_param_named(sev, sev_enabled, bool, 0444); static bool sev_es_enabled = true; module_param_named(sev_es, sev_es_enabled, bool, 0444); +/* enable/disable SEV-SNP support */ +static bool sev_snp_enabled = true; +module_param_named(sev_snp, sev_snp_enabled, bool, 0444); + /* enable/disable SEV-ES DebugSwap support */ -static bool sev_es_debug_swap_enabled = false; +static bool sev_es_debug_swap_enabled = true; module_param_named(debug_swap, sev_es_debug_swap_enabled, bool, 0444); -#else -#define sev_enabled false -#define sev_es_enabled false -#define sev_es_debug_swap_enabled false -#endif /* CONFIG_KVM_AMD_SEV */ +static u64 sev_supported_vmsa_features; + +#define AP_RESET_HOLD_NONE 0 +#define AP_RESET_HOLD_NAE_EVENT 1 +#define AP_RESET_HOLD_MSR_PROTO 2 + +/* As defined by SEV-SNP Firmware ABI, under "Guest Policy". */ +#define SNP_POLICY_MASK_API_MINOR GENMASK_ULL(7, 0) +#define SNP_POLICY_MASK_API_MAJOR GENMASK_ULL(15, 8) +#define SNP_POLICY_MASK_SMT BIT_ULL(16) +#define SNP_POLICY_MASK_RSVD_MBO BIT_ULL(17) +#define SNP_POLICY_MASK_DEBUG BIT_ULL(19) +#define SNP_POLICY_MASK_SINGLE_SOCKET BIT_ULL(20) + +#define SNP_POLICY_MASK_VALID (SNP_POLICY_MASK_API_MINOR | \ + SNP_POLICY_MASK_API_MAJOR | \ + SNP_POLICY_MASK_SMT | \ + SNP_POLICY_MASK_RSVD_MBO | \ + SNP_POLICY_MASK_DEBUG | \ + SNP_POLICY_MASK_SINGLE_SOCKET) + +#define INITIAL_VMSA_GPA 0xFFFFFFFFF000 static u8 sev_enc_bit; static DECLARE_RWSEM(sev_deactivate_lock); @@ -75,6 +90,8 @@ static unsigned int nr_asids; static unsigned long *sev_asid_bitmap; static unsigned long *sev_reclaim_asid_bitmap; +static int snp_decommission_context(struct kvm *kvm); + struct enc_region { struct list_head list; unsigned long npages; @@ -84,9 +101,10 @@ struct enc_region { }; /* Called with the sev_bitmap_lock held, or on shutdown */ -static int sev_flush_asids(int min_asid, int max_asid) +static int sev_flush_asids(unsigned int min_asid, unsigned int max_asid) { - int ret, asid, error = 0; + int ret, error = 0; + unsigned int asid; /* Check if there are any ASIDs to reclaim before performing a flush */ asid = find_next_bit(sev_reclaim_asid_bitmap, nr_asids, min_asid); @@ -100,23 +118,36 @@ static int sev_flush_asids(int min_asid, int max_asid) down_write(&sev_deactivate_lock); wbinvd_on_all_cpus(); - ret = sev_guest_df_flush(&error); + + if (sev_snp_enabled) + ret = sev_do_cmd(SEV_CMD_SNP_DF_FLUSH, NULL, &error); + else + ret = sev_guest_df_flush(&error); up_write(&sev_deactivate_lock); if (ret) - pr_err("SEV: DF_FLUSH failed, ret=%d, error=%#x\n", ret, error); + pr_err("SEV%s: DF_FLUSH failed, ret=%d, error=%#x\n", + sev_snp_enabled ? "-SNP" : "", ret, error); return ret; } static inline bool is_mirroring_enc_context(struct kvm *kvm) { - return !!to_kvm_svm(kvm)->sev_info.enc_context_owner; + return !!to_kvm_sev_info(kvm)->enc_context_owner; +} + +static bool sev_vcpu_has_debug_swap(struct vcpu_svm *svm) +{ + struct kvm_vcpu *vcpu = &svm->vcpu; + struct kvm_sev_info *sev = to_kvm_sev_info(vcpu->kvm); + + return sev->vmsa_features & SVM_SEV_FEAT_DEBUG_SWAP; } /* Must be called with the sev_bitmap_lock held */ -static bool __sev_recycle_asids(int min_asid, int max_asid) +static bool __sev_recycle_asids(unsigned int min_asid, unsigned int max_asid) { if (sev_flush_asids(min_asid, max_asid)) return false; @@ -143,8 +174,20 @@ static void sev_misc_cg_uncharge(struct kvm_sev_info *sev) static int sev_asid_new(struct kvm_sev_info *sev) { - int asid, min_asid, max_asid, ret; + /* + * SEV-enabled guests must use asid from min_sev_asid to max_sev_asid. + * SEV-ES-enabled guest can use from 1 to min_sev_asid - 1. + * Note: min ASID can end up larger than the max if basic SEV support is + * effectively disabled by disallowing use of ASIDs for SEV guests. + */ + unsigned int min_asid = sev->es_active ? 1 : min_sev_asid; + unsigned int max_asid = sev->es_active ? min_sev_asid - 1 : max_sev_asid; + unsigned int asid; bool retry = true; + int ret; + + if (min_asid > max_asid) + return -ENOTTY; WARN_ON(sev->misc_cg); sev->misc_cg = get_current_misc_cg(); @@ -157,12 +200,6 @@ static int sev_asid_new(struct kvm_sev_info *sev) mutex_lock(&sev_bitmap_lock); - /* - * SEV-enabled guests must use asid from min_sev_asid to max_sev_asid. - * SEV-ES-enabled guest can use from 1 to min_sev_asid - 1. - */ - min_asid = sev->es_active ? 1 : min_sev_asid; - max_asid = sev->es_active ? min_sev_asid - 1 : max_sev_asid; again: asid = find_next_zero_bit(sev_asid_bitmap, max_asid + 1, min_asid); if (asid > max_asid) { @@ -179,7 +216,8 @@ again: mutex_unlock(&sev_bitmap_lock); - return asid; + sev->asid = asid; + return 0; e_uncharge: sev_misc_cg_uncharge(sev); put_misc_cg(sev->misc_cg); @@ -187,11 +225,9 @@ e_uncharge: return ret; } -static int sev_get_asid(struct kvm *kvm) +static unsigned int sev_get_asid(struct kvm *kvm) { - struct kvm_sev_info *sev = &to_kvm_svm(kvm)->sev_info; - - return sev->asid; + return to_kvm_sev_info(kvm)->asid; } static void sev_asid_free(struct kvm_sev_info *sev) @@ -226,6 +262,53 @@ static void sev_decommission(unsigned int handle) sev_guest_decommission(&decommission, NULL); } +/* + * Transition a page to hypervisor-owned/shared state in the RMP table. This + * should not fail under normal conditions, but leak the page should that + * happen since it will no longer be usable by the host due to RMP protections. + */ +static int kvm_rmp_make_shared(struct kvm *kvm, u64 pfn, enum pg_level level) +{ + if (KVM_BUG_ON(rmp_make_shared(pfn, level), kvm)) { + snp_leak_pages(pfn, page_level_size(level) >> PAGE_SHIFT); + return -EIO; + } + + return 0; +} + +/* + * Certain page-states, such as Pre-Guest and Firmware pages (as documented + * in Chapter 5 of the SEV-SNP Firmware ABI under "Page States") cannot be + * directly transitioned back to normal/hypervisor-owned state via RMPUPDATE + * unless they are reclaimed first. + * + * Until they are reclaimed and subsequently transitioned via RMPUPDATE, they + * might not be usable by the host due to being set as immutable or still + * being associated with a guest ASID. + * + * Bug the VM and leak the page if reclaim fails, or if the RMP entry can't be + * converted back to shared, as the page is no longer usable due to RMP + * protections, and it's infeasible for the guest to continue on. + */ +static int snp_page_reclaim(struct kvm *kvm, u64 pfn) +{ + struct sev_data_snp_page_reclaim data = {0}; + int fw_err, rc; + + data.paddr = __sme_set(pfn << PAGE_SHIFT); + rc = sev_do_cmd(SEV_CMD_SNP_PAGE_RECLAIM, &data, &fw_err); + if (KVM_BUG(rc, kvm, "Failed to reclaim PFN %llx, rc %d fw_err %d", pfn, rc, fw_err)) { + snp_leak_pages(pfn, 1); + return -EIO; + } + + if (kvm_rmp_make_shared(kvm, pfn, PG_LEVEL_4K)) + return -EIO; + + return rc; +} + static void sev_unbind_asid(struct kvm *kvm, unsigned int handle) { struct sev_data_deactivate deactivate; @@ -243,33 +326,138 @@ static void sev_unbind_asid(struct kvm *kvm, unsigned int handle) sev_decommission(handle); } -static int sev_guest_init(struct kvm *kvm, struct kvm_sev_cmd *argp) +/* + * This sets up bounce buffers/firmware pages to handle SNP Guest Request + * messages (e.g. attestation requests). See "SNP Guest Request" in the GHCB + * 2.0 specification for more details. + * + * Technically, when an SNP Guest Request is issued, the guest will provide its + * own request/response pages, which could in theory be passed along directly + * to firmware rather than using bounce pages. However, these pages would need + * special care: + * + * - Both pages are from shared guest memory, so they need to be protected + * from migration/etc. occurring while firmware reads/writes to them. At a + * minimum, this requires elevating the ref counts and potentially needing + * an explicit pinning of the memory. This places additional restrictions + * on what type of memory backends userspace can use for shared guest + * memory since there is some reliance on using refcounted pages. + * + * - The response page needs to be switched to Firmware-owned[1] state + * before the firmware can write to it, which can lead to potential + * host RMP #PFs if the guest is misbehaved and hands the host a + * guest page that KVM might write to for other reasons (e.g. virtio + * buffers/etc.). + * + * Both of these issues can be avoided completely by using separately-allocated + * bounce pages for both the request/response pages and passing those to + * firmware instead. So that's what is being set up here. + * + * Guest requests rely on message sequence numbers to ensure requests are + * issued to firmware in the order the guest issues them, so concurrent guest + * requests generally shouldn't happen. But a misbehaved guest could issue + * concurrent guest requests in theory, so a mutex is used to serialize + * access to the bounce buffers. + * + * [1] See the "Page States" section of the SEV-SNP Firmware ABI for more + * details on Firmware-owned pages, along with "RMP and VMPL Access Checks" + * in the APM for details on the related RMP restrictions. + */ +static int snp_guest_req_init(struct kvm *kvm) +{ + struct kvm_sev_info *sev = to_kvm_sev_info(kvm); + struct page *req_page; + + req_page = alloc_page(GFP_KERNEL_ACCOUNT | __GFP_ZERO); + if (!req_page) + return -ENOMEM; + + sev->guest_resp_buf = snp_alloc_firmware_page(GFP_KERNEL_ACCOUNT | __GFP_ZERO); + if (!sev->guest_resp_buf) { + __free_page(req_page); + return -EIO; + } + + sev->guest_req_buf = page_address(req_page); + mutex_init(&sev->guest_req_mutex); + + return 0; +} + +static void snp_guest_req_cleanup(struct kvm *kvm) { - struct kvm_sev_info *sev = &to_kvm_svm(kvm)->sev_info; + struct kvm_sev_info *sev = to_kvm_sev_info(kvm); + + if (sev->guest_resp_buf) + snp_free_firmware_page(sev->guest_resp_buf); + + if (sev->guest_req_buf) + __free_page(virt_to_page(sev->guest_req_buf)); + + sev->guest_req_buf = NULL; + sev->guest_resp_buf = NULL; +} + +static int __sev_guest_init(struct kvm *kvm, struct kvm_sev_cmd *argp, + struct kvm_sev_init *data, + unsigned long vm_type) +{ + struct kvm_sev_info *sev = to_kvm_sev_info(kvm); struct sev_platform_init_args init_args = {0}; - int asid, ret; + bool es_active = vm_type != KVM_X86_SEV_VM; + u64 valid_vmsa_features = es_active ? sev_supported_vmsa_features : 0; + int ret; if (kvm->created_vcpus) return -EINVAL; - ret = -EBUSY; + if (data->flags) + return -EINVAL; + + if (data->vmsa_features & ~valid_vmsa_features) + return -EINVAL; + + if (data->ghcb_version > GHCB_VERSION_MAX || (!es_active && data->ghcb_version)) + return -EINVAL; + if (unlikely(sev->active)) - return ret; + return -EINVAL; sev->active = true; - sev->es_active = argp->id == KVM_SEV_ES_INIT; - asid = sev_asid_new(sev); - if (asid < 0) + sev->es_active = es_active; + sev->vmsa_features = data->vmsa_features; + sev->ghcb_version = data->ghcb_version; + + /* + * Currently KVM supports the full range of mandatory features defined + * by version 2 of the GHCB protocol, so default to that for SEV-ES + * guests created via KVM_SEV_INIT2. + */ + if (sev->es_active && !sev->ghcb_version) + sev->ghcb_version = GHCB_VERSION_DEFAULT; + + if (vm_type == KVM_X86_SNP_VM) + sev->vmsa_features |= SVM_SEV_FEAT_SNP_ACTIVE; + + ret = sev_asid_new(sev); + if (ret) goto e_no_asid; - sev->asid = asid; init_args.probe = false; ret = sev_platform_init(&init_args); if (ret) goto e_free; + /* This needs to happen after SEV/SNP firmware initialization. */ + if (vm_type == KVM_X86_SNP_VM) { + ret = snp_guest_req_init(kvm); + if (ret) + goto e_free; + } + INIT_LIST_HEAD(&sev->regions_list); INIT_LIST_HEAD(&sev->mirror_vms); + sev->need_init = false; kvm_set_apicv_inhibit(kvm, APICV_INHIBIT_REASON_SEV); @@ -280,15 +468,57 @@ e_free: sev_asid_free(sev); sev->asid = 0; e_no_asid: + sev->vmsa_features = 0; sev->es_active = false; sev->active = false; return ret; } +static int sev_guest_init(struct kvm *kvm, struct kvm_sev_cmd *argp) +{ + struct kvm_sev_init data = { + .vmsa_features = 0, + .ghcb_version = 0, + }; + unsigned long vm_type; + + if (kvm->arch.vm_type != KVM_X86_DEFAULT_VM) + return -EINVAL; + + vm_type = (argp->id == KVM_SEV_INIT ? KVM_X86_SEV_VM : KVM_X86_SEV_ES_VM); + + /* + * KVM_SEV_ES_INIT has been deprecated by KVM_SEV_INIT2, so it will + * continue to only ever support the minimal GHCB protocol version. + */ + if (vm_type == KVM_X86_SEV_ES_VM) + data.ghcb_version = GHCB_VERSION_MIN; + + return __sev_guest_init(kvm, argp, &data, vm_type); +} + +static int sev_guest_init2(struct kvm *kvm, struct kvm_sev_cmd *argp) +{ + struct kvm_sev_init data; + + if (!to_kvm_sev_info(kvm)->need_init) + return -EINVAL; + + if (kvm->arch.vm_type != KVM_X86_SEV_VM && + kvm->arch.vm_type != KVM_X86_SEV_ES_VM && + kvm->arch.vm_type != KVM_X86_SNP_VM) + return -EINVAL; + + if (copy_from_user(&data, u64_to_user_ptr(argp->data), sizeof(data))) + return -EFAULT; + + return __sev_guest_init(kvm, argp, &data, kvm->arch.vm_type); +} + static int sev_bind_asid(struct kvm *kvm, unsigned int handle, int *error) { + unsigned int asid = sev_get_asid(kvm); struct sev_data_activate activate; - int asid = sev_get_asid(kvm); int ret; /* activate ASID on the given handle */ @@ -301,29 +531,24 @@ static int sev_bind_asid(struct kvm *kvm, unsigned int handle, int *error) static int __sev_issue_cmd(int fd, int id, void *data, int *error) { - struct fd f; - int ret; + CLASS(fd, f)(fd); - f = fdget(fd); - if (!f.file) + if (fd_empty(f)) return -EBADF; - ret = sev_issue_cmd_external_user(f.file, id, data, error); - - fdput(f); - return ret; + return sev_issue_cmd_external_user(fd_file(f), id, data, error); } static int sev_issue_cmd(struct kvm *kvm, int id, void *data, int *error) { - struct kvm_sev_info *sev = &to_kvm_svm(kvm)->sev_info; + struct kvm_sev_info *sev = to_kvm_sev_info(kvm); return __sev_issue_cmd(sev->fd, id, data, error); } static int sev_launch_start(struct kvm *kvm, struct kvm_sev_cmd *argp) { - struct kvm_sev_info *sev = &to_kvm_svm(kvm)->sev_info; + struct kvm_sev_info *sev = to_kvm_sev_info(kvm); struct sev_data_launch_start start; struct kvm_sev_launch_start params; void *dh_blob, *session_blob; @@ -333,9 +558,11 @@ static int sev_launch_start(struct kvm *kvm, struct kvm_sev_cmd *argp) if (!sev_guest(kvm)) return -ENOTTY; - if (copy_from_user(¶ms, (void __user *)(uintptr_t)argp->data, sizeof(params))) + if (copy_from_user(¶ms, u64_to_user_ptr(argp->data), sizeof(params))) return -EFAULT; + sev->policy = params.policy; + memset(&start, 0, sizeof(start)); dh_blob = NULL; @@ -377,7 +604,7 @@ static int sev_launch_start(struct kvm *kvm, struct kvm_sev_cmd *argp) /* return handle to userspace */ params.handle = start.handle; - if (copy_to_user((void __user *)(uintptr_t)argp->data, ¶ms, sizeof(params))) { + if (copy_to_user(u64_to_user_ptr(argp->data), ¶ms, sizeof(params))) { sev_unbind_asid(kvm, start.handle); ret = -EFAULT; goto e_free_session; @@ -395,9 +622,9 @@ e_free_dh: static struct page **sev_pin_memory(struct kvm *kvm, unsigned long uaddr, unsigned long ulen, unsigned long *n, - int write) + unsigned int flags) { - struct kvm_sev_info *sev = &to_kvm_svm(kvm)->sev_info; + struct kvm_sev_info *sev = to_kvm_sev_info(kvm); unsigned long npages, size; int npinned; unsigned long locked, lock_limit; @@ -428,7 +655,7 @@ static struct page **sev_pin_memory(struct kvm *kvm, unsigned long uaddr, /* Avoid using vmalloc for smaller buffers. */ size = npages * sizeof(struct page *); if (size > PAGE_SIZE) - pages = __vmalloc(size, GFP_KERNEL_ACCOUNT | __GFP_ZERO); + pages = __vmalloc(size, GFP_KERNEL_ACCOUNT); else pages = kmalloc(size, GFP_KERNEL_ACCOUNT); @@ -436,7 +663,7 @@ static struct page **sev_pin_memory(struct kvm *kvm, unsigned long uaddr, return ERR_PTR(-ENOMEM); /* Pin the user virtual address. */ - npinned = pin_user_pages_fast(uaddr, npages, write ? FOLL_WRITE : 0, pages); + npinned = pin_user_pages_fast(uaddr, npages, flags, pages); if (npinned != npages) { pr_err("SEV: Failure locking %lu pages.\n", npages); ret = -ENOMEM; @@ -459,11 +686,9 @@ err: static void sev_unpin_memory(struct kvm *kvm, struct page **pages, unsigned long npages) { - struct kvm_sev_info *sev = &to_kvm_svm(kvm)->sev_info; - unpin_user_pages(pages, npages); kvfree(pages); - sev->pages_locked -= npages; + to_kvm_sev_info(kvm)->pages_locked -= npages; } static void sev_clflush_pages(struct page *pages[], unsigned long npages) @@ -507,7 +732,6 @@ static unsigned long get_num_contig_pages(unsigned long idx, static int sev_launch_update_data(struct kvm *kvm, struct kvm_sev_cmd *argp) { unsigned long vaddr, vaddr_end, next_vaddr, npages, pages, size, i; - struct kvm_sev_info *sev = &to_kvm_svm(kvm)->sev_info; struct kvm_sev_launch_update_data params; struct sev_data_launch_update_data data; struct page **inpages; @@ -516,7 +740,7 @@ static int sev_launch_update_data(struct kvm *kvm, struct kvm_sev_cmd *argp) if (!sev_guest(kvm)) return -ENOTTY; - if (copy_from_user(¶ms, (void __user *)(uintptr_t)argp->data, sizeof(params))) + if (copy_from_user(¶ms, u64_to_user_ptr(argp->data), sizeof(params))) return -EFAULT; vaddr = params.uaddr; @@ -524,7 +748,7 @@ static int sev_launch_update_data(struct kvm *kvm, struct kvm_sev_cmd *argp) vaddr_end = vaddr + size; /* Lock the user memory. */ - inpages = sev_pin_memory(kvm, vaddr, size, &npages, 1); + inpages = sev_pin_memory(kvm, vaddr, size, &npages, FOLL_WRITE); if (IS_ERR(inpages)) return PTR_ERR(inpages); @@ -535,7 +759,7 @@ static int sev_launch_update_data(struct kvm *kvm, struct kvm_sev_cmd *argp) sev_clflush_pages(inpages, npages); data.reserved = 0; - data.handle = sev->handle; + data.handle = to_kvm_sev_info(kvm)->handle; for (i = 0; vaddr < vaddr_end; vaddr = next_vaddr, i += pages) { int offset, len; @@ -574,7 +798,13 @@ e_unpin: static int sev_es_sync_vmsa(struct vcpu_svm *svm) { + struct kvm_vcpu *vcpu = &svm->vcpu; + struct kvm_sev_info *sev = to_kvm_sev_info(vcpu->kvm); struct sev_es_save_area *save = svm->sev_es.vmsa; + struct xregs_state *xsave; + const u8 *s; + u8 *d; + int i; /* Check some debug related fields before encrypting the VMSA */ if (svm->vcpu.guest_debug || (svm->vmcb->save.dr7 & ~DR7_FIXED_1)) @@ -615,10 +845,44 @@ static int sev_es_sync_vmsa(struct vcpu_svm *svm) save->xss = svm->vcpu.arch.ia32_xss; save->dr6 = svm->vcpu.arch.dr6; - if (sev_es_debug_swap_enabled) { - save->sev_features |= SVM_SEV_FEAT_DEBUG_SWAP; - pr_warn_once("Enabling DebugSwap with KVM_SEV_ES_INIT. " - "This will not work starting with Linux 6.10\n"); + save->sev_features = sev->vmsa_features; + + /* + * Skip FPU and AVX setup with KVM_SEV_ES_INIT to avoid + * breaking older measurements. + */ + if (vcpu->kvm->arch.vm_type != KVM_X86_DEFAULT_VM) { + xsave = &vcpu->arch.guest_fpu.fpstate->regs.xsave; + save->x87_dp = xsave->i387.rdp; + save->mxcsr = xsave->i387.mxcsr; + save->x87_ftw = xsave->i387.twd; + save->x87_fsw = xsave->i387.swd; + save->x87_fcw = xsave->i387.cwd; + save->x87_fop = xsave->i387.fop; + save->x87_ds = 0; + save->x87_cs = 0; + save->x87_rip = xsave->i387.rip; + + for (i = 0; i < 8; i++) { + /* + * The format of the x87 save area is undocumented and + * definitely not what you would expect. It consists of + * an 8*8 bytes area with bytes 0-7, and an 8*2 bytes + * area with bytes 8-9 of each register. + */ + d = save->fpreg_x87 + i * 8; + s = ((u8 *)xsave->i387.st_space) + i * 16; + memcpy(d, s, 8); + save->fpreg_x87[64 + i * 2] = s[8]; + save->fpreg_x87[64 + i * 2 + 1] = s[9]; + } + memcpy(save->fpreg_xmm, xsave->i387.xmm_space, 256); + + s = get_xsave_addr(xsave, XFEATURE_YMM); + if (s) + memcpy(save->fpreg_ymm, s, 256); + else + memset(save->fpreg_ymm, 0, 256); } pr_debug("Virtual Machine Save Area (VMSA):\n"); @@ -652,14 +916,29 @@ static int __sev_launch_update_vmsa(struct kvm *kvm, struct kvm_vcpu *vcpu, clflush_cache_range(svm->sev_es.vmsa, PAGE_SIZE); vmsa.reserved = 0; - vmsa.handle = to_kvm_svm(kvm)->sev_info.handle; + vmsa.handle = to_kvm_sev_info(kvm)->handle; vmsa.address = __sme_pa(svm->sev_es.vmsa); vmsa.len = PAGE_SIZE; ret = sev_issue_cmd(kvm, SEV_CMD_LAUNCH_UPDATE_VMSA, &vmsa, error); if (ret) return ret; + /* + * SEV-ES guests maintain an encrypted version of their FPU + * state which is restored and saved on VMRUN and VMEXIT. + * Mark vcpu->arch.guest_fpu->fpstate as scratch so it won't + * do xsave/xrstor on it. + */ + fpstate_set_confidential(&vcpu->arch.guest_fpu); vcpu->arch.guest_state_protected = true; + + /* + * SEV-ES guest mandates LBR Virtualization to be _always_ ON. Enable it + * only after setting guest_state_protected because KVM_SET_MSRS allows + * dynamic toggling of LBRV (for performance reason) on write access to + * MSR_IA32_DEBUGCTLMSR when guest_state_protected is not set. + */ + svm_enable_lbrv(vcpu); return 0; } @@ -689,8 +968,7 @@ static int sev_launch_update_vmsa(struct kvm *kvm, struct kvm_sev_cmd *argp) static int sev_launch_measure(struct kvm *kvm, struct kvm_sev_cmd *argp) { - void __user *measure = (void __user *)(uintptr_t)argp->data; - struct kvm_sev_info *sev = &to_kvm_svm(kvm)->sev_info; + void __user *measure = u64_to_user_ptr(argp->data); struct sev_data_launch_measure data; struct kvm_sev_launch_measure params; void __user *p = NULL; @@ -709,7 +987,7 @@ static int sev_launch_measure(struct kvm *kvm, struct kvm_sev_cmd *argp) if (!params.len) goto cmd; - p = (void __user *)(uintptr_t)params.uaddr; + p = u64_to_user_ptr(params.uaddr); if (p) { if (params.len > SEV_FW_BLOB_MAX_SIZE) return -EINVAL; @@ -723,7 +1001,7 @@ static int sev_launch_measure(struct kvm *kvm, struct kvm_sev_cmd *argp) } cmd: - data.handle = sev->handle; + data.handle = to_kvm_sev_info(kvm)->handle; ret = sev_issue_cmd(kvm, SEV_CMD_LAUNCH_MEASURE, &data, &argp->error); /* @@ -751,19 +1029,17 @@ e_free_blob: static int sev_launch_finish(struct kvm *kvm, struct kvm_sev_cmd *argp) { - struct kvm_sev_info *sev = &to_kvm_svm(kvm)->sev_info; struct sev_data_launch_finish data; if (!sev_guest(kvm)) return -ENOTTY; - data.handle = sev->handle; + data.handle = to_kvm_sev_info(kvm)->handle; return sev_issue_cmd(kvm, SEV_CMD_LAUNCH_FINISH, &data, &argp->error); } static int sev_guest_status(struct kvm *kvm, struct kvm_sev_cmd *argp) { - struct kvm_sev_info *sev = &to_kvm_svm(kvm)->sev_info; struct kvm_sev_guest_status params; struct sev_data_guest_status data; int ret; @@ -773,7 +1049,7 @@ static int sev_guest_status(struct kvm *kvm, struct kvm_sev_cmd *argp) memset(&data, 0, sizeof(data)); - data.handle = sev->handle; + data.handle = to_kvm_sev_info(kvm)->handle; ret = sev_issue_cmd(kvm, SEV_CMD_GUEST_STATUS, &data, &argp->error); if (ret) return ret; @@ -782,7 +1058,7 @@ static int sev_guest_status(struct kvm *kvm, struct kvm_sev_cmd *argp) params.state = data.state; params.handle = data.handle; - if (copy_to_user((void __user *)(uintptr_t)argp->data, ¶ms, sizeof(params))) + if (copy_to_user(u64_to_user_ptr(argp->data), ¶ms, sizeof(params))) ret = -EFAULT; return ret; @@ -792,11 +1068,10 @@ static int __sev_issue_dbg_cmd(struct kvm *kvm, unsigned long src, unsigned long dst, int size, int *error, bool enc) { - struct kvm_sev_info *sev = &to_kvm_svm(kvm)->sev_info; struct sev_data_dbg data; data.reserved = 0; - data.handle = sev->handle; + data.handle = to_kvm_sev_info(kvm)->handle; data.dst_addr = dst; data.src_addr = src; data.len = size; @@ -947,7 +1222,7 @@ static int sev_dbg_crypt(struct kvm *kvm, struct kvm_sev_cmd *argp, bool dec) if (!sev_guest(kvm)) return -ENOTTY; - if (copy_from_user(&debug, (void __user *)(uintptr_t)argp->data, sizeof(debug))) + if (copy_from_user(&debug, u64_to_user_ptr(argp->data), sizeof(debug))) return -EFAULT; if (!debug.len || debug.src_uaddr + debug.len < debug.src_uaddr) @@ -968,7 +1243,7 @@ static int sev_dbg_crypt(struct kvm *kvm, struct kvm_sev_cmd *argp, bool dec) if (IS_ERR(src_p)) return PTR_ERR(src_p); - dst_p = sev_pin_memory(kvm, dst_vaddr & PAGE_MASK, PAGE_SIZE, &n, 1); + dst_p = sev_pin_memory(kvm, dst_vaddr & PAGE_MASK, PAGE_SIZE, &n, FOLL_WRITE); if (IS_ERR(dst_p)) { sev_unpin_memory(kvm, src_p, n); return PTR_ERR(dst_p); @@ -1020,7 +1295,6 @@ err: static int sev_launch_secret(struct kvm *kvm, struct kvm_sev_cmd *argp) { - struct kvm_sev_info *sev = &to_kvm_svm(kvm)->sev_info; struct sev_data_launch_secret data; struct kvm_sev_launch_secret params; struct page **pages; @@ -1031,10 +1305,10 @@ static int sev_launch_secret(struct kvm *kvm, struct kvm_sev_cmd *argp) if (!sev_guest(kvm)) return -ENOTTY; - if (copy_from_user(¶ms, (void __user *)(uintptr_t)argp->data, sizeof(params))) + if (copy_from_user(¶ms, u64_to_user_ptr(argp->data), sizeof(params))) return -EFAULT; - pages = sev_pin_memory(kvm, params.guest_uaddr, params.guest_len, &n, 1); + pages = sev_pin_memory(kvm, params.guest_uaddr, params.guest_len, &n, FOLL_WRITE); if (IS_ERR(pages)) return PTR_ERR(pages); @@ -1076,7 +1350,7 @@ static int sev_launch_secret(struct kvm *kvm, struct kvm_sev_cmd *argp) data.hdr_address = __psp_pa(hdr); data.hdr_len = params.hdr_len; - data.handle = sev->handle; + data.handle = to_kvm_sev_info(kvm)->handle; ret = sev_issue_cmd(kvm, SEV_CMD_LAUNCH_UPDATE_SECRET, &data, &argp->error); kfree(hdr); @@ -1095,8 +1369,7 @@ e_unpin_memory: static int sev_get_attestation_report(struct kvm *kvm, struct kvm_sev_cmd *argp) { - void __user *report = (void __user *)(uintptr_t)argp->data; - struct kvm_sev_info *sev = &to_kvm_svm(kvm)->sev_info; + void __user *report = u64_to_user_ptr(argp->data); struct sev_data_attestation_report data; struct kvm_sev_attestation_report params; void __user *p; @@ -1106,7 +1379,7 @@ static int sev_get_attestation_report(struct kvm *kvm, struct kvm_sev_cmd *argp) if (!sev_guest(kvm)) return -ENOTTY; - if (copy_from_user(¶ms, (void __user *)(uintptr_t)argp->data, sizeof(params))) + if (copy_from_user(¶ms, u64_to_user_ptr(argp->data), sizeof(params))) return -EFAULT; memset(&data, 0, sizeof(data)); @@ -1115,7 +1388,7 @@ static int sev_get_attestation_report(struct kvm *kvm, struct kvm_sev_cmd *argp) if (!params.len) goto cmd; - p = (void __user *)(uintptr_t)params.uaddr; + p = u64_to_user_ptr(params.uaddr); if (p) { if (params.len > SEV_FW_BLOB_MAX_SIZE) return -EINVAL; @@ -1129,7 +1402,7 @@ static int sev_get_attestation_report(struct kvm *kvm, struct kvm_sev_cmd *argp) memcpy(data.mnonce, params.mnonce, sizeof(params.mnonce)); } cmd: - data.handle = sev->handle; + data.handle = to_kvm_sev_info(kvm)->handle; ret = sev_issue_cmd(kvm, SEV_CMD_ATTESTATION_REPORT, &data, &argp->error); /* * If we query the session length, FW responded with expected data. @@ -1159,16 +1432,15 @@ static int __sev_send_start_query_session_length(struct kvm *kvm, struct kvm_sev_cmd *argp, struct kvm_sev_send_start *params) { - struct kvm_sev_info *sev = &to_kvm_svm(kvm)->sev_info; struct sev_data_send_start data; int ret; memset(&data, 0, sizeof(data)); - data.handle = sev->handle; + data.handle = to_kvm_sev_info(kvm)->handle; ret = sev_issue_cmd(kvm, SEV_CMD_SEND_START, &data, &argp->error); params->session_len = data.session_len; - if (copy_to_user((void __user *)(uintptr_t)argp->data, params, + if (copy_to_user(u64_to_user_ptr(argp->data), params, sizeof(struct kvm_sev_send_start))) ret = -EFAULT; @@ -1177,7 +1449,6 @@ __sev_send_start_query_session_length(struct kvm *kvm, struct kvm_sev_cmd *argp, static int sev_send_start(struct kvm *kvm, struct kvm_sev_cmd *argp) { - struct kvm_sev_info *sev = &to_kvm_svm(kvm)->sev_info; struct sev_data_send_start data; struct kvm_sev_send_start params; void *amd_certs, *session_data; @@ -1187,7 +1458,7 @@ static int sev_send_start(struct kvm *kvm, struct kvm_sev_cmd *argp) if (!sev_guest(kvm)) return -ENOTTY; - if (copy_from_user(¶ms, (void __user *)(uintptr_t)argp->data, + if (copy_from_user(¶ms, u64_to_user_ptr(argp->data), sizeof(struct kvm_sev_send_start))) return -EFAULT; @@ -1238,11 +1509,11 @@ static int sev_send_start(struct kvm *kvm, struct kvm_sev_cmd *argp) data.amd_certs_len = params.amd_certs_len; data.session_address = __psp_pa(session_data); data.session_len = params.session_len; - data.handle = sev->handle; + data.handle = to_kvm_sev_info(kvm)->handle; ret = sev_issue_cmd(kvm, SEV_CMD_SEND_START, &data, &argp->error); - if (!ret && copy_to_user((void __user *)(uintptr_t)params.session_uaddr, + if (!ret && copy_to_user(u64_to_user_ptr(params.session_uaddr), session_data, params.session_len)) { ret = -EFAULT; goto e_free_amd_cert; @@ -1250,7 +1521,7 @@ static int sev_send_start(struct kvm *kvm, struct kvm_sev_cmd *argp) params.policy = data.policy; params.session_len = data.session_len; - if (copy_to_user((void __user *)(uintptr_t)argp->data, ¶ms, + if (copy_to_user(u64_to_user_ptr(argp->data), ¶ms, sizeof(struct kvm_sev_send_start))) ret = -EFAULT; @@ -1270,18 +1541,17 @@ static int __sev_send_update_data_query_lengths(struct kvm *kvm, struct kvm_sev_cmd *argp, struct kvm_sev_send_update_data *params) { - struct kvm_sev_info *sev = &to_kvm_svm(kvm)->sev_info; struct sev_data_send_update_data data; int ret; memset(&data, 0, sizeof(data)); - data.handle = sev->handle; + data.handle = to_kvm_sev_info(kvm)->handle; ret = sev_issue_cmd(kvm, SEV_CMD_SEND_UPDATE_DATA, &data, &argp->error); params->hdr_len = data.hdr_len; params->trans_len = data.trans_len; - if (copy_to_user((void __user *)(uintptr_t)argp->data, params, + if (copy_to_user(u64_to_user_ptr(argp->data), params, sizeof(struct kvm_sev_send_update_data))) ret = -EFAULT; @@ -1290,7 +1560,6 @@ __sev_send_update_data_query_lengths(struct kvm *kvm, struct kvm_sev_cmd *argp, static int sev_send_update_data(struct kvm *kvm, struct kvm_sev_cmd *argp) { - struct kvm_sev_info *sev = &to_kvm_svm(kvm)->sev_info; struct sev_data_send_update_data data; struct kvm_sev_send_update_data params; void *hdr, *trans_data; @@ -1301,7 +1570,7 @@ static int sev_send_update_data(struct kvm *kvm, struct kvm_sev_cmd *argp) if (!sev_guest(kvm)) return -ENOTTY; - if (copy_from_user(¶ms, (void __user *)(uintptr_t)argp->data, + if (copy_from_user(¶ms, u64_to_user_ptr(argp->data), sizeof(struct kvm_sev_send_update_data))) return -EFAULT; @@ -1326,11 +1595,11 @@ static int sev_send_update_data(struct kvm *kvm, struct kvm_sev_cmd *argp) /* allocate memory for header and transport buffer */ ret = -ENOMEM; - hdr = kzalloc(params.hdr_len, GFP_KERNEL_ACCOUNT); + hdr = kzalloc(params.hdr_len, GFP_KERNEL); if (!hdr) goto e_unpin; - trans_data = kzalloc(params.trans_len, GFP_KERNEL_ACCOUNT); + trans_data = kzalloc(params.trans_len, GFP_KERNEL); if (!trans_data) goto e_free_hdr; @@ -1344,7 +1613,7 @@ static int sev_send_update_data(struct kvm *kvm, struct kvm_sev_cmd *argp) data.guest_address = (page_to_pfn(guest_page[0]) << PAGE_SHIFT) + offset; data.guest_address |= sev_me_mask; data.guest_len = params.guest_len; - data.handle = sev->handle; + data.handle = to_kvm_sev_info(kvm)->handle; ret = sev_issue_cmd(kvm, SEV_CMD_SEND_UPDATE_DATA, &data, &argp->error); @@ -1352,14 +1621,14 @@ static int sev_send_update_data(struct kvm *kvm, struct kvm_sev_cmd *argp) goto e_free_trans_data; /* copy transport buffer to user space */ - if (copy_to_user((void __user *)(uintptr_t)params.trans_uaddr, + if (copy_to_user(u64_to_user_ptr(params.trans_uaddr), trans_data, params.trans_len)) { ret = -EFAULT; goto e_free_trans_data; } /* Copy packet header to userspace. */ - if (copy_to_user((void __user *)(uintptr_t)params.hdr_uaddr, hdr, + if (copy_to_user(u64_to_user_ptr(params.hdr_uaddr), hdr, params.hdr_len)) ret = -EFAULT; @@ -1375,31 +1644,29 @@ e_unpin: static int sev_send_finish(struct kvm *kvm, struct kvm_sev_cmd *argp) { - struct kvm_sev_info *sev = &to_kvm_svm(kvm)->sev_info; struct sev_data_send_finish data; if (!sev_guest(kvm)) return -ENOTTY; - data.handle = sev->handle; + data.handle = to_kvm_sev_info(kvm)->handle; return sev_issue_cmd(kvm, SEV_CMD_SEND_FINISH, &data, &argp->error); } static int sev_send_cancel(struct kvm *kvm, struct kvm_sev_cmd *argp) { - struct kvm_sev_info *sev = &to_kvm_svm(kvm)->sev_info; struct sev_data_send_cancel data; if (!sev_guest(kvm)) return -ENOTTY; - data.handle = sev->handle; + data.handle = to_kvm_sev_info(kvm)->handle; return sev_issue_cmd(kvm, SEV_CMD_SEND_CANCEL, &data, &argp->error); } static int sev_receive_start(struct kvm *kvm, struct kvm_sev_cmd *argp) { - struct kvm_sev_info *sev = &to_kvm_svm(kvm)->sev_info; + struct kvm_sev_info *sev = to_kvm_sev_info(kvm); struct sev_data_receive_start start; struct kvm_sev_receive_start params; int *error = &argp->error; @@ -1411,7 +1678,7 @@ static int sev_receive_start(struct kvm *kvm, struct kvm_sev_cmd *argp) return -ENOTTY; /* Get parameter from the userspace */ - if (copy_from_user(¶ms, (void __user *)(uintptr_t)argp->data, + if (copy_from_user(¶ms, u64_to_user_ptr(argp->data), sizeof(struct kvm_sev_receive_start))) return -EFAULT; @@ -1453,7 +1720,7 @@ static int sev_receive_start(struct kvm *kvm, struct kvm_sev_cmd *argp) } params.handle = start.handle; - if (copy_to_user((void __user *)(uintptr_t)argp->data, + if (copy_to_user(u64_to_user_ptr(argp->data), ¶ms, sizeof(struct kvm_sev_receive_start))) { ret = -EFAULT; sev_unbind_asid(kvm, start.handle); @@ -1473,7 +1740,6 @@ e_free_pdh: static int sev_receive_update_data(struct kvm *kvm, struct kvm_sev_cmd *argp) { - struct kvm_sev_info *sev = &to_kvm_svm(kvm)->sev_info; struct kvm_sev_receive_update_data params; struct sev_data_receive_update_data data; void *hdr = NULL, *trans = NULL; @@ -1484,7 +1750,7 @@ static int sev_receive_update_data(struct kvm *kvm, struct kvm_sev_cmd *argp) if (!sev_guest(kvm)) return -EINVAL; - if (copy_from_user(¶ms, (void __user *)(uintptr_t)argp->data, + if (copy_from_user(¶ms, u64_to_user_ptr(argp->data), sizeof(struct kvm_sev_receive_update_data))) return -EFAULT; @@ -1516,7 +1782,7 @@ static int sev_receive_update_data(struct kvm *kvm, struct kvm_sev_cmd *argp) /* Pin guest memory */ guest_page = sev_pin_memory(kvm, params.guest_uaddr & PAGE_MASK, - PAGE_SIZE, &n, 1); + PAGE_SIZE, &n, FOLL_WRITE); if (IS_ERR(guest_page)) { ret = PTR_ERR(guest_page); goto e_free_trans; @@ -1533,7 +1799,7 @@ static int sev_receive_update_data(struct kvm *kvm, struct kvm_sev_cmd *argp) data.guest_address = (page_to_pfn(guest_page[0]) << PAGE_SHIFT) + offset; data.guest_address |= sev_me_mask; data.guest_len = params.guest_len; - data.handle = sev->handle; + data.handle = to_kvm_sev_info(kvm)->handle; ret = sev_issue_cmd(kvm, SEV_CMD_RECEIVE_UPDATE_DATA, &data, &argp->error); @@ -1550,13 +1816,12 @@ e_free_hdr: static int sev_receive_finish(struct kvm *kvm, struct kvm_sev_cmd *argp) { - struct kvm_sev_info *sev = &to_kvm_svm(kvm)->sev_info; struct sev_data_receive_finish data; if (!sev_guest(kvm)) return -ENOTTY; - data.handle = sev->handle; + data.handle = to_kvm_sev_info(kvm)->handle; return sev_issue_cmd(kvm, SEV_CMD_RECEIVE_FINISH, &data, &argp->error); } @@ -1576,8 +1841,8 @@ static bool is_cmd_allowed_from_mirror(u32 cmd_id) static int sev_lock_two_vms(struct kvm *dst_kvm, struct kvm *src_kvm) { - struct kvm_sev_info *dst_sev = &to_kvm_svm(dst_kvm)->sev_info; - struct kvm_sev_info *src_sev = &to_kvm_svm(src_kvm)->sev_info; + struct kvm_sev_info *dst_sev = to_kvm_sev_info(dst_kvm); + struct kvm_sev_info *src_sev = to_kvm_sev_info(src_kvm); int r = -EBUSY; if (dst_kvm == src_kvm) @@ -1611,8 +1876,8 @@ release_dst: static void sev_unlock_two_vms(struct kvm *dst_kvm, struct kvm *src_kvm) { - struct kvm_sev_info *dst_sev = &to_kvm_svm(dst_kvm)->sev_info; - struct kvm_sev_info *src_sev = &to_kvm_svm(src_kvm)->sev_info; + struct kvm_sev_info *dst_sev = to_kvm_sev_info(dst_kvm); + struct kvm_sev_info *src_sev = to_kvm_sev_info(src_kvm); mutex_unlock(&dst_kvm->lock); mutex_unlock(&src_kvm->lock); @@ -1620,74 +1885,10 @@ static void sev_unlock_two_vms(struct kvm *dst_kvm, struct kvm *src_kvm) atomic_set_release(&src_sev->migration_in_progress, 0); } -/* vCPU mutex subclasses. */ -enum sev_migration_role { - SEV_MIGRATION_SOURCE = 0, - SEV_MIGRATION_TARGET, - SEV_NR_MIGRATION_ROLES, -}; - -static int sev_lock_vcpus_for_migration(struct kvm *kvm, - enum sev_migration_role role) -{ - struct kvm_vcpu *vcpu; - unsigned long i, j; - - kvm_for_each_vcpu(i, vcpu, kvm) { - if (mutex_lock_killable_nested(&vcpu->mutex, role)) - goto out_unlock; - -#ifdef CONFIG_PROVE_LOCKING - if (!i) - /* - * Reset the role to one that avoids colliding with - * the role used for the first vcpu mutex. - */ - role = SEV_NR_MIGRATION_ROLES; - else - mutex_release(&vcpu->mutex.dep_map, _THIS_IP_); -#endif - } - - return 0; - -out_unlock: - - kvm_for_each_vcpu(j, vcpu, kvm) { - if (i == j) - break; - -#ifdef CONFIG_PROVE_LOCKING - if (j) - mutex_acquire(&vcpu->mutex.dep_map, role, 0, _THIS_IP_); -#endif - - mutex_unlock(&vcpu->mutex); - } - return -EINTR; -} - -static void sev_unlock_vcpus_for_migration(struct kvm *kvm) -{ - struct kvm_vcpu *vcpu; - unsigned long i; - bool first = true; - - kvm_for_each_vcpu(i, vcpu, kvm) { - if (first) - first = false; - else - mutex_acquire(&vcpu->mutex.dep_map, - SEV_NR_MIGRATION_ROLES, 0, _THIS_IP_); - - mutex_unlock(&vcpu->mutex); - } -} - static void sev_migrate_from(struct kvm *dst_kvm, struct kvm *src_kvm) { - struct kvm_sev_info *dst = &to_kvm_svm(dst_kvm)->sev_info; - struct kvm_sev_info *src = &to_kvm_svm(src_kvm)->sev_info; + struct kvm_sev_info *dst = to_kvm_sev_info(dst_kvm); + struct kvm_sev_info *src = to_kvm_sev_info(src_kvm); struct kvm_vcpu *dst_vcpu, *src_vcpu; struct vcpu_svm *dst_svm, *src_svm; struct kvm_sev_info *mirror; @@ -1699,6 +1900,7 @@ static void sev_migrate_from(struct kvm *dst_kvm, struct kvm *src_kvm) dst->pages_locked = src->pages_locked; dst->enc_context_owner = src->enc_context_owner; dst->es_active = src->es_active; + dst->vmsa_features = src->vmsa_features; src->asid = 0; src->active = false; @@ -1726,8 +1928,7 @@ static void sev_migrate_from(struct kvm *dst_kvm, struct kvm *src_kvm) * and add the new mirror to the list. */ if (is_mirroring_enc_context(dst_kvm)) { - struct kvm_sev_info *owner_sev_info = - &to_kvm_svm(dst->enc_context_owner)->sev_info; + struct kvm_sev_info *owner_sev_info = to_kvm_sev_info(dst->enc_context_owner); list_del(&src->mirror_entry); list_add_tail(&dst->mirror_entry, &owner_sev_info->mirror_vms); @@ -1786,32 +1987,31 @@ static int sev_check_source_vcpus(struct kvm *dst, struct kvm *src) int sev_vm_move_enc_context_from(struct kvm *kvm, unsigned int source_fd) { - struct kvm_sev_info *dst_sev = &to_kvm_svm(kvm)->sev_info; + struct kvm_sev_info *dst_sev = to_kvm_sev_info(kvm); struct kvm_sev_info *src_sev, *cg_cleanup_sev; - struct fd f = fdget(source_fd); + CLASS(fd, f)(source_fd); struct kvm *source_kvm; bool charged = false; int ret; - if (!f.file) + if (fd_empty(f)) return -EBADF; - if (!file_is_kvm(f.file)) { - ret = -EBADF; - goto out_fput; - } + if (!file_is_kvm(fd_file(f))) + return -EBADF; - source_kvm = f.file->private_data; + source_kvm = fd_file(f)->private_data; ret = sev_lock_two_vms(kvm, source_kvm); if (ret) - goto out_fput; + return ret; - if (sev_guest(kvm) || !sev_guest(source_kvm)) { + if (kvm->arch.vm_type != source_kvm->arch.vm_type || + sev_guest(kvm) || !sev_guest(source_kvm)) { ret = -EINVAL; goto out_unlock; } - src_sev = &to_kvm_svm(source_kvm)->sev_info; + src_sev = to_kvm_sev_info(source_kvm); dst_sev->misc_cg = get_current_misc_cg(); cg_cleanup_sev = dst_sev; @@ -1822,10 +2022,10 @@ int sev_vm_move_enc_context_from(struct kvm *kvm, unsigned int source_fd) charged = true; } - ret = sev_lock_vcpus_for_migration(kvm, SEV_MIGRATION_SOURCE); + ret = kvm_lock_all_vcpus(kvm); if (ret) goto out_dst_cgroup; - ret = sev_lock_vcpus_for_migration(source_kvm, SEV_MIGRATION_TARGET); + ret = kvm_lock_all_vcpus(source_kvm); if (ret) goto out_dst_vcpu; @@ -1839,9 +2039,9 @@ int sev_vm_move_enc_context_from(struct kvm *kvm, unsigned int source_fd) ret = 0; out_source_vcpu: - sev_unlock_vcpus_for_migration(source_kvm); + kvm_unlock_all_vcpus(source_kvm); out_dst_vcpu: - sev_unlock_vcpus_for_migration(kvm); + kvm_unlock_all_vcpus(kvm); out_dst_cgroup: /* Operates on the source on success, on the destination on failure. */ if (charged) @@ -1850,8 +2050,429 @@ out_dst_cgroup: cg_cleanup_sev->misc_cg = NULL; out_unlock: sev_unlock_two_vms(kvm, source_kvm); -out_fput: - fdput(f); + return ret; +} + +int sev_dev_get_attr(u32 group, u64 attr, u64 *val) +{ + if (group != KVM_X86_GRP_SEV) + return -ENXIO; + + switch (attr) { + case KVM_X86_SEV_VMSA_FEATURES: + *val = sev_supported_vmsa_features; + return 0; + + default: + return -ENXIO; + } +} + +/* + * The guest context contains all the information, keys and metadata + * associated with the guest that the firmware tracks to implement SEV + * and SNP features. The firmware stores the guest context in hypervisor + * provide page via the SNP_GCTX_CREATE command. + */ +static void *snp_context_create(struct kvm *kvm, struct kvm_sev_cmd *argp) +{ + struct sev_data_snp_addr data = {}; + void *context; + int rc; + + /* Allocate memory for context page */ + context = snp_alloc_firmware_page(GFP_KERNEL_ACCOUNT); + if (!context) + return NULL; + + data.address = __psp_pa(context); + rc = __sev_issue_cmd(argp->sev_fd, SEV_CMD_SNP_GCTX_CREATE, &data, &argp->error); + if (rc) { + pr_warn("Failed to create SEV-SNP context, rc %d fw_error %d", + rc, argp->error); + snp_free_firmware_page(context); + return NULL; + } + + return context; +} + +static int snp_bind_asid(struct kvm *kvm, int *error) +{ + struct kvm_sev_info *sev = to_kvm_sev_info(kvm); + struct sev_data_snp_activate data = {0}; + + data.gctx_paddr = __psp_pa(sev->snp_context); + data.asid = sev_get_asid(kvm); + return sev_issue_cmd(kvm, SEV_CMD_SNP_ACTIVATE, &data, error); +} + +static int snp_launch_start(struct kvm *kvm, struct kvm_sev_cmd *argp) +{ + struct kvm_sev_info *sev = to_kvm_sev_info(kvm); + struct sev_data_snp_launch_start start = {0}; + struct kvm_sev_snp_launch_start params; + int rc; + + if (!sev_snp_guest(kvm)) + return -ENOTTY; + + if (copy_from_user(¶ms, u64_to_user_ptr(argp->data), sizeof(params))) + return -EFAULT; + + /* Don't allow userspace to allocate memory for more than 1 SNP context. */ + if (sev->snp_context) + return -EINVAL; + + if (params.flags) + return -EINVAL; + + if (params.policy & ~SNP_POLICY_MASK_VALID) + return -EINVAL; + + /* Check for policy bits that must be set */ + if (!(params.policy & SNP_POLICY_MASK_RSVD_MBO) || + !(params.policy & SNP_POLICY_MASK_SMT)) + return -EINVAL; + + if (params.policy & SNP_POLICY_MASK_SINGLE_SOCKET) + return -EINVAL; + + sev->policy = params.policy; + + sev->snp_context = snp_context_create(kvm, argp); + if (!sev->snp_context) + return -ENOTTY; + + start.gctx_paddr = __psp_pa(sev->snp_context); + start.policy = params.policy; + memcpy(start.gosvw, params.gosvw, sizeof(params.gosvw)); + rc = __sev_issue_cmd(argp->sev_fd, SEV_CMD_SNP_LAUNCH_START, &start, &argp->error); + if (rc) { + pr_debug("%s: SEV_CMD_SNP_LAUNCH_START firmware command failed, rc %d\n", + __func__, rc); + goto e_free_context; + } + + sev->fd = argp->sev_fd; + rc = snp_bind_asid(kvm, &argp->error); + if (rc) { + pr_debug("%s: Failed to bind ASID to SEV-SNP context, rc %d\n", + __func__, rc); + goto e_free_context; + } + + return 0; + +e_free_context: + snp_decommission_context(kvm); + + return rc; +} + +struct sev_gmem_populate_args { + __u8 type; + int sev_fd; + int fw_error; +}; + +static int sev_gmem_post_populate(struct kvm *kvm, gfn_t gfn_start, kvm_pfn_t pfn, + void __user *src, int order, void *opaque) +{ + struct sev_gmem_populate_args *sev_populate_args = opaque; + struct kvm_sev_info *sev = to_kvm_sev_info(kvm); + int n_private = 0, ret, i; + int npages = (1 << order); + gfn_t gfn; + + if (WARN_ON_ONCE(sev_populate_args->type != KVM_SEV_SNP_PAGE_TYPE_ZERO && !src)) + return -EINVAL; + + for (gfn = gfn_start, i = 0; gfn < gfn_start + npages; gfn++, i++) { + struct sev_data_snp_launch_update fw_args = {0}; + bool assigned = false; + int level; + + ret = snp_lookup_rmpentry((u64)pfn + i, &assigned, &level); + if (ret || assigned) { + pr_debug("%s: Failed to ensure GFN 0x%llx RMP entry is initial shared state, ret: %d assigned: %d\n", + __func__, gfn, ret, assigned); + ret = ret ? -EINVAL : -EEXIST; + goto err; + } + + if (src) { + void *vaddr = kmap_local_pfn(pfn + i); + + if (copy_from_user(vaddr, src + i * PAGE_SIZE, PAGE_SIZE)) { + ret = -EFAULT; + goto err; + } + kunmap_local(vaddr); + } + + ret = rmp_make_private(pfn + i, gfn << PAGE_SHIFT, PG_LEVEL_4K, + sev_get_asid(kvm), true); + if (ret) + goto err; + + n_private++; + + fw_args.gctx_paddr = __psp_pa(sev->snp_context); + fw_args.address = __sme_set(pfn_to_hpa(pfn + i)); + fw_args.page_size = PG_LEVEL_TO_RMP(PG_LEVEL_4K); + fw_args.page_type = sev_populate_args->type; + + ret = __sev_issue_cmd(sev_populate_args->sev_fd, SEV_CMD_SNP_LAUNCH_UPDATE, + &fw_args, &sev_populate_args->fw_error); + if (ret) + goto fw_err; + } + + return 0; + +fw_err: + /* + * If the firmware command failed handle the reclaim and cleanup of that + * PFN specially vs. prior pages which can be cleaned up below without + * needing to reclaim in advance. + * + * Additionally, when invalid CPUID function entries are detected, + * firmware writes the expected values into the page and leaves it + * unencrypted so it can be used for debugging and error-reporting. + * + * Copy this page back into the source buffer so userspace can use this + * information to provide information on which CPUID leaves/fields + * failed CPUID validation. + */ + if (!snp_page_reclaim(kvm, pfn + i) && + sev_populate_args->type == KVM_SEV_SNP_PAGE_TYPE_CPUID && + sev_populate_args->fw_error == SEV_RET_INVALID_PARAM) { + void *vaddr = kmap_local_pfn(pfn + i); + + if (copy_to_user(src + i * PAGE_SIZE, vaddr, PAGE_SIZE)) + pr_debug("Failed to write CPUID page back to userspace\n"); + + kunmap_local(vaddr); + } + + /* pfn + i is hypervisor-owned now, so skip below cleanup for it. */ + n_private--; + +err: + pr_debug("%s: exiting with error ret %d (fw_error %d), restoring %d gmem PFNs to shared.\n", + __func__, ret, sev_populate_args->fw_error, n_private); + for (i = 0; i < n_private; i++) + kvm_rmp_make_shared(kvm, pfn + i, PG_LEVEL_4K); + + return ret; +} + +static int snp_launch_update(struct kvm *kvm, struct kvm_sev_cmd *argp) +{ + struct kvm_sev_info *sev = to_kvm_sev_info(kvm); + struct sev_gmem_populate_args sev_populate_args = {0}; + struct kvm_sev_snp_launch_update params; + struct kvm_memory_slot *memslot; + long npages, count; + void __user *src; + int ret = 0; + + if (!sev_snp_guest(kvm) || !sev->snp_context) + return -EINVAL; + + if (copy_from_user(¶ms, u64_to_user_ptr(argp->data), sizeof(params))) + return -EFAULT; + + pr_debug("%s: GFN start 0x%llx length 0x%llx type %d flags %d\n", __func__, + params.gfn_start, params.len, params.type, params.flags); + + if (!PAGE_ALIGNED(params.len) || params.flags || + (params.type != KVM_SEV_SNP_PAGE_TYPE_NORMAL && + params.type != KVM_SEV_SNP_PAGE_TYPE_ZERO && + params.type != KVM_SEV_SNP_PAGE_TYPE_UNMEASURED && + params.type != KVM_SEV_SNP_PAGE_TYPE_SECRETS && + params.type != KVM_SEV_SNP_PAGE_TYPE_CPUID)) + return -EINVAL; + + npages = params.len / PAGE_SIZE; + + /* + * For each GFN that's being prepared as part of the initial guest + * state, the following pre-conditions are verified: + * + * 1) The backing memslot is a valid private memslot. + * 2) The GFN has been set to private via KVM_SET_MEMORY_ATTRIBUTES + * beforehand. + * 3) The PFN of the guest_memfd has not already been set to private + * in the RMP table. + * + * The KVM MMU relies on kvm->mmu_invalidate_seq to retry nested page + * faults if there's a race between a fault and an attribute update via + * KVM_SET_MEMORY_ATTRIBUTES, and a similar approach could be utilized + * here. However, kvm->slots_lock guards against both this as well as + * concurrent memslot updates occurring while these checks are being + * performed, so use that here to make it easier to reason about the + * initial expected state and better guard against unexpected + * situations. + */ + mutex_lock(&kvm->slots_lock); + + memslot = gfn_to_memslot(kvm, params.gfn_start); + if (!kvm_slot_can_be_private(memslot)) { + ret = -EINVAL; + goto out; + } + + sev_populate_args.sev_fd = argp->sev_fd; + sev_populate_args.type = params.type; + src = params.type == KVM_SEV_SNP_PAGE_TYPE_ZERO ? NULL : u64_to_user_ptr(params.uaddr); + + count = kvm_gmem_populate(kvm, params.gfn_start, src, npages, + sev_gmem_post_populate, &sev_populate_args); + if (count < 0) { + argp->error = sev_populate_args.fw_error; + pr_debug("%s: kvm_gmem_populate failed, ret %ld (fw_error %d)\n", + __func__, count, argp->error); + ret = -EIO; + } else { + params.gfn_start += count; + params.len -= count * PAGE_SIZE; + if (params.type != KVM_SEV_SNP_PAGE_TYPE_ZERO) + params.uaddr += count * PAGE_SIZE; + + ret = 0; + if (copy_to_user(u64_to_user_ptr(argp->data), ¶ms, sizeof(params))) + ret = -EFAULT; + } + +out: + mutex_unlock(&kvm->slots_lock); + + return ret; +} + +static int snp_launch_update_vmsa(struct kvm *kvm, struct kvm_sev_cmd *argp) +{ + struct kvm_sev_info *sev = to_kvm_sev_info(kvm); + struct sev_data_snp_launch_update data = {}; + struct kvm_vcpu *vcpu; + unsigned long i; + int ret; + + data.gctx_paddr = __psp_pa(sev->snp_context); + data.page_type = SNP_PAGE_TYPE_VMSA; + + kvm_for_each_vcpu(i, vcpu, kvm) { + struct vcpu_svm *svm = to_svm(vcpu); + u64 pfn = __pa(svm->sev_es.vmsa) >> PAGE_SHIFT; + + ret = sev_es_sync_vmsa(svm); + if (ret) + return ret; + + /* Transition the VMSA page to a firmware state. */ + ret = rmp_make_private(pfn, INITIAL_VMSA_GPA, PG_LEVEL_4K, sev->asid, true); + if (ret) + return ret; + + /* Issue the SNP command to encrypt the VMSA */ + data.address = __sme_pa(svm->sev_es.vmsa); + ret = __sev_issue_cmd(argp->sev_fd, SEV_CMD_SNP_LAUNCH_UPDATE, + &data, &argp->error); + if (ret) { + snp_page_reclaim(kvm, pfn); + + return ret; + } + + svm->vcpu.arch.guest_state_protected = true; + /* + * SEV-ES (and thus SNP) guest mandates LBR Virtualization to + * be _always_ ON. Enable it only after setting + * guest_state_protected because KVM_SET_MSRS allows dynamic + * toggling of LBRV (for performance reason) on write access to + * MSR_IA32_DEBUGCTLMSR when guest_state_protected is not set. + */ + svm_enable_lbrv(vcpu); + } + + return 0; +} + +static int snp_launch_finish(struct kvm *kvm, struct kvm_sev_cmd *argp) +{ + struct kvm_sev_info *sev = to_kvm_sev_info(kvm); + struct kvm_sev_snp_launch_finish params; + struct sev_data_snp_launch_finish *data; + void *id_block = NULL, *id_auth = NULL; + int ret; + + if (!sev_snp_guest(kvm)) + return -ENOTTY; + + if (!sev->snp_context) + return -EINVAL; + + if (copy_from_user(¶ms, u64_to_user_ptr(argp->data), sizeof(params))) + return -EFAULT; + + if (params.flags) + return -EINVAL; + + /* Measure all vCPUs using LAUNCH_UPDATE before finalizing the launch flow. */ + ret = snp_launch_update_vmsa(kvm, argp); + if (ret) + return ret; + + data = kzalloc(sizeof(*data), GFP_KERNEL_ACCOUNT); + if (!data) + return -ENOMEM; + + if (params.id_block_en) { + id_block = psp_copy_user_blob(params.id_block_uaddr, KVM_SEV_SNP_ID_BLOCK_SIZE); + if (IS_ERR(id_block)) { + ret = PTR_ERR(id_block); + goto e_free; + } + + data->id_block_en = 1; + data->id_block_paddr = __sme_pa(id_block); + + id_auth = psp_copy_user_blob(params.id_auth_uaddr, KVM_SEV_SNP_ID_AUTH_SIZE); + if (IS_ERR(id_auth)) { + ret = PTR_ERR(id_auth); + goto e_free_id_block; + } + + data->id_auth_paddr = __sme_pa(id_auth); + + if (params.auth_key_en) + data->auth_key_en = 1; + } + + data->vcek_disabled = params.vcek_disabled; + + memcpy(data->host_data, params.host_data, KVM_SEV_SNP_FINISH_DATA_SIZE); + data->gctx_paddr = __psp_pa(sev->snp_context); + ret = sev_issue_cmd(kvm, SEV_CMD_SNP_LAUNCH_FINISH, data, &argp->error); + + /* + * Now that there will be no more SNP_LAUNCH_UPDATE ioctls, private pages + * can be given to the guest simply by marking the RMP entry as private. + * This can happen on first access and also with KVM_PRE_FAULT_MEMORY. + */ + if (!ret) + kvm->arch.pre_fault_allowed = true; + + kfree(id_auth); + +e_free_id_block: + kfree(id_block); + +e_free: + kfree(data); + return ret; } @@ -1878,6 +2499,15 @@ int sev_mem_enc_ioctl(struct kvm *kvm, void __user *argp) goto out; } + /* + * Once KVM_SEV_INIT2 initializes a KVM instance as an SNP guest, only + * allow the use of SNP-specific commands. + */ + if (sev_snp_guest(kvm) && sev_cmd.id < KVM_SEV_SNP_LAUNCH_START) { + r = -EPERM; + goto out; + } + switch (sev_cmd.id) { case KVM_SEV_ES_INIT: if (!sev_es_enabled) { @@ -1888,6 +2518,9 @@ int sev_mem_enc_ioctl(struct kvm *kvm, void __user *argp) case KVM_SEV_INIT: r = sev_guest_init(kvm, &sev_cmd); break; + case KVM_SEV_INIT2: + r = sev_guest_init2(kvm, &sev_cmd); + break; case KVM_SEV_LAUNCH_START: r = sev_launch_start(kvm, &sev_cmd); break; @@ -1939,6 +2572,15 @@ int sev_mem_enc_ioctl(struct kvm *kvm, void __user *argp) case KVM_SEV_RECEIVE_FINISH: r = sev_receive_finish(kvm, &sev_cmd); break; + case KVM_SEV_SNP_LAUNCH_START: + r = snp_launch_start(kvm, &sev_cmd); + break; + case KVM_SEV_SNP_LAUNCH_UPDATE: + r = snp_launch_update(kvm, &sev_cmd); + break; + case KVM_SEV_SNP_LAUNCH_FINISH: + r = snp_launch_finish(kvm, &sev_cmd); + break; default: r = -EINVAL; goto out; @@ -1955,7 +2597,7 @@ out: int sev_mem_enc_register_region(struct kvm *kvm, struct kvm_enc_region *range) { - struct kvm_sev_info *sev = &to_kvm_svm(kvm)->sev_info; + struct kvm_sev_info *sev = to_kvm_sev_info(kvm); struct enc_region *region; int ret = 0; @@ -1974,7 +2616,8 @@ int sev_mem_enc_register_region(struct kvm *kvm, return -ENOMEM; mutex_lock(&kvm->lock); - region->pages = sev_pin_memory(kvm, range->addr, range->size, ®ion->npages, 1); + region->pages = sev_pin_memory(kvm, range->addr, range->size, ®ion->npages, + FOLL_WRITE | FOLL_LONGTERM); if (IS_ERR(region->pages)) { ret = PTR_ERR(region->pages); mutex_unlock(&kvm->lock); @@ -2007,7 +2650,7 @@ e_free: static struct enc_region * find_enc_region(struct kvm *kvm, struct kvm_enc_region *range) { - struct kvm_sev_info *sev = &to_kvm_svm(kvm)->sev_info; + struct kvm_sev_info *sev = to_kvm_sev_info(kvm); struct list_head *head = &sev->regions_list; struct enc_region *i; @@ -2070,23 +2713,21 @@ failed: int sev_vm_copy_enc_context_from(struct kvm *kvm, unsigned int source_fd) { - struct fd f = fdget(source_fd); + CLASS(fd, f)(source_fd); struct kvm *source_kvm; struct kvm_sev_info *source_sev, *mirror_sev; int ret; - if (!f.file) + if (fd_empty(f)) return -EBADF; - if (!file_is_kvm(f.file)) { - ret = -EBADF; - goto e_source_fput; - } + if (!file_is_kvm(fd_file(f))) + return -EBADF; - source_kvm = f.file->private_data; + source_kvm = fd_file(f)->private_data; ret = sev_lock_two_vms(kvm, source_kvm); if (ret) - goto e_source_fput; + return ret; /* * Mirrors of mirrors should work, but let's not get silly. Also @@ -2104,9 +2745,9 @@ int sev_vm_copy_enc_context_from(struct kvm *kvm, unsigned int source_fd) * The mirror kvm holds an enc_context_owner ref so its asid can't * disappear until we're done with it */ - source_sev = &to_kvm_svm(source_kvm)->sev_info; + source_sev = to_kvm_sev_info(source_kvm); kvm_get_kvm(source_kvm); - mirror_sev = &to_kvm_svm(kvm)->sev_info; + mirror_sev = to_kvm_sev_info(kvm); list_add_tail(&mirror_sev->mirror_entry, &source_sev->mirror_vms); /* Set enc_context_owner and copy its encryption context over */ @@ -2115,6 +2756,7 @@ int sev_vm_copy_enc_context_from(struct kvm *kvm, unsigned int source_fd) mirror_sev->asid = source_sev->asid; mirror_sev->fd = source_sev->fd; mirror_sev->es_active = source_sev->es_active; + mirror_sev->need_init = false; mirror_sev->handle = source_sev->handle; INIT_LIST_HEAD(&mirror_sev->regions_list); INIT_LIST_HEAD(&mirror_sev->mirror_vms); @@ -2128,14 +2770,37 @@ int sev_vm_copy_enc_context_from(struct kvm *kvm, unsigned int source_fd) e_unlock: sev_unlock_two_vms(kvm, source_kvm); -e_source_fput: - fdput(f); return ret; } +static int snp_decommission_context(struct kvm *kvm) +{ + struct kvm_sev_info *sev = to_kvm_sev_info(kvm); + struct sev_data_snp_addr data = {}; + int ret; + + /* If context is not created then do nothing */ + if (!sev->snp_context) + return 0; + + /* Do the decommision, which will unbind the ASID from the SNP context */ + data.address = __sme_pa(sev->snp_context); + down_write(&sev_deactivate_lock); + ret = sev_do_cmd(SEV_CMD_SNP_DECOMMISSION, &data, NULL); + up_write(&sev_deactivate_lock); + + if (WARN_ONCE(ret, "Failed to release guest context, ret %d", ret)) + return ret; + + snp_free_firmware_page(sev->snp_context); + sev->snp_context = NULL; + + return 0; +} + void sev_vm_destroy(struct kvm *kvm) { - struct kvm_sev_info *sev = &to_kvm_svm(kvm)->sev_info; + struct kvm_sev_info *sev = to_kvm_sev_info(kvm); struct list_head *head = &sev->regions_list; struct list_head *pos, *q; @@ -2174,22 +2839,70 @@ void sev_vm_destroy(struct kvm *kvm) } } - sev_unbind_asid(kvm, sev->handle); + if (sev_snp_guest(kvm)) { + snp_guest_req_cleanup(kvm); + + /* + * Decomission handles unbinding of the ASID. If it fails for + * some unexpected reason, just leak the ASID. + */ + if (snp_decommission_context(kvm)) + return; + } else { + sev_unbind_asid(kvm, sev->handle); + } + sev_asid_free(sev); } void __init sev_set_cpu_caps(void) { - if (!sev_enabled) - kvm_cpu_cap_clear(X86_FEATURE_SEV); - if (!sev_es_enabled) - kvm_cpu_cap_clear(X86_FEATURE_SEV_ES); + if (sev_enabled) { + kvm_cpu_cap_set(X86_FEATURE_SEV); + kvm_caps.supported_vm_types |= BIT(KVM_X86_SEV_VM); + } + if (sev_es_enabled) { + kvm_cpu_cap_set(X86_FEATURE_SEV_ES); + kvm_caps.supported_vm_types |= BIT(KVM_X86_SEV_ES_VM); + } + if (sev_snp_enabled) { + kvm_cpu_cap_set(X86_FEATURE_SEV_SNP); + kvm_caps.supported_vm_types |= BIT(KVM_X86_SNP_VM); + } +} + +static bool is_sev_snp_initialized(void) +{ + struct sev_user_data_snp_status *status; + struct sev_data_snp_addr buf; + bool initialized = false; + int ret, error = 0; + + status = snp_alloc_firmware_page(GFP_KERNEL | __GFP_ZERO); + if (!status) + return false; + + buf.address = __psp_pa(status); + ret = sev_do_cmd(SEV_CMD_SNP_PLATFORM_STATUS, &buf, &error); + if (ret) { + pr_err("SEV: SNP_PLATFORM_STATUS failed ret=%d, fw_error=%d (%#x)\n", + ret, error, error); + goto out; + } + + initialized = !!status->state; + +out: + snp_free_firmware_page(status); + + return initialized; } void __init sev_hardware_setup(void) { -#ifdef CONFIG_KVM_AMD_SEV unsigned int eax, ebx, ecx, edx, sev_asid_count, sev_es_asid_count; + struct sev_platform_init_args init_args = {0}; + bool sev_snp_supported = false; bool sev_es_supported = false; bool sev_supported = false; @@ -2208,6 +2921,16 @@ void __init sev_hardware_setup(void) WARN_ON_ONCE(!boot_cpu_has(X86_FEATURE_FLUSHBYASID))) goto out; + /* + * The kernel's initcall infrastructure lacks the ability to express + * dependencies between initcalls, whereas the modules infrastructure + * automatically handles dependencies via symbol loading. Ensure the + * PSP SEV driver is initialized before proceeding if KVM is built-in, + * as the dependency isn't handled by the initcall infrastructure. + */ + if (IS_BUILTIN(CONFIG_KVM_AMD) && sev_module_init()) + goto out; + /* Retrieve SEV CPUID information */ cpuid(0x8000001f, &eax, &ebx, &ecx, &edx); @@ -2240,8 +2963,10 @@ void __init sev_hardware_setup(void) goto out; } - sev_asid_count = max_sev_asid - min_sev_asid + 1; - WARN_ON_ONCE(misc_cg_set_capacity(MISC_CG_RES_SEV, sev_asid_count)); + if (min_sev_asid <= max_sev_asid) { + sev_asid_count = max_sev_asid - min_sev_asid + 1; + WARN_ON_ONCE(misc_cg_set_capacity(MISC_CG_RES_SEV, sev_asid_count)); + } sev_supported = true; /* SEV-ES support requested? */ @@ -2261,6 +2986,12 @@ void __init sev_hardware_setup(void) if (!boot_cpu_has(X86_FEATURE_SEV_ES)) goto out; + if (!lbrv) { + WARN_ONCE(!boot_cpu_has(X86_FEATURE_LBRV), + "LBRV must be present for SEV-ES support"); + goto out; + } + /* Has the system been allocated ASIDs for SEV-ES? */ if (min_sev_asid == 1) goto out; @@ -2268,23 +2999,43 @@ void __init sev_hardware_setup(void) sev_es_asid_count = min_sev_asid - 1; WARN_ON_ONCE(misc_cg_set_capacity(MISC_CG_RES_SEV_ES, sev_es_asid_count)); sev_es_supported = true; + sev_snp_supported = sev_snp_enabled && cc_platform_has(CC_ATTR_HOST_SEV_SNP); out: + if (sev_enabled) { + init_args.probe = true; + if (sev_platform_init(&init_args)) + sev_supported = sev_es_supported = sev_snp_supported = false; + else if (sev_snp_supported) + sev_snp_supported = is_sev_snp_initialized(); + } + if (boot_cpu_has(X86_FEATURE_SEV)) pr_info("SEV %s (ASIDs %u - %u)\n", - sev_supported ? "enabled" : "disabled", + sev_supported ? min_sev_asid <= max_sev_asid ? "enabled" : + "unusable" : + "disabled", min_sev_asid, max_sev_asid); if (boot_cpu_has(X86_FEATURE_SEV_ES)) pr_info("SEV-ES %s (ASIDs %u - %u)\n", - sev_es_supported ? "enabled" : "disabled", + str_enabled_disabled(sev_es_supported), + min_sev_asid > 1 ? 1 : 0, min_sev_asid - 1); + if (boot_cpu_has(X86_FEATURE_SEV_SNP)) + pr_info("SEV-SNP %s (ASIDs %u - %u)\n", + str_enabled_disabled(sev_snp_supported), min_sev_asid > 1 ? 1 : 0, min_sev_asid - 1); sev_enabled = sev_supported; sev_es_enabled = sev_es_supported; + sev_snp_enabled = sev_snp_supported; + if (!sev_es_enabled || !cpu_feature_enabled(X86_FEATURE_DEBUG_SWAP) || !cpu_feature_enabled(X86_FEATURE_NO_NESTED_DATA_BP)) sev_es_debug_swap_enabled = false; -#endif + + sev_supported_vmsa_features = 0; + if (sev_es_debug_swap_enabled) + sev_supported_vmsa_features |= SVM_SEV_FEAT_DEBUG_SWAP; } void sev_hardware_unsetup(void) @@ -2300,6 +3051,8 @@ void sev_hardware_unsetup(void) misc_cg_set_capacity(MISC_CG_RES_SEV, 0); misc_cg_set_capacity(MISC_CG_RES_SEV_ES, 0); + + sev_platform_shutdown(); } int sev_cpu_init(struct svm_cpu_data *sd) @@ -2320,7 +3073,7 @@ int sev_cpu_init(struct svm_cpu_data *sd) */ static void sev_flush_encrypted_page(struct kvm_vcpu *vcpu, void *va) { - int asid = to_kvm_svm(vcpu->kvm)->sev_info.asid; + unsigned int asid = sev_get_asid(vcpu->kvm); /* * Note! The address must be a kernel address, as regular page walk @@ -2345,7 +3098,7 @@ static void sev_flush_encrypted_page(struct kvm_vcpu *vcpu, void *va) * back to WBINVD if this faults so as not to make any problems worse * by leaving stale encrypted data in the cache. */ - if (WARN_ON_ONCE(wrmsrl_safe(MSR_AMD64_VM_PAGE_FLUSH, addr | asid))) + if (WARN_ON_ONCE(wrmsrq_safe(MSR_AMD64_VM_PAGE_FLUSH, addr | asid))) goto do_wbinvd; return; @@ -2356,7 +3109,13 @@ do_wbinvd: void sev_guest_memory_reclaimed(struct kvm *kvm) { - if (!sev_guest(kvm)) + /* + * With SNP+gmem, private/encrypted memory is unreachable via the + * hva-based mmu notifiers, so these events are only actually + * pertaining to shared pages where there is no need to perform + * the WBINVD to flush associated caches. + */ + if (!sev_guest(kvm) || sev_snp_guest(kvm)) return; wbinvd_on_all_cpus(); @@ -2371,18 +3130,36 @@ void sev_free_vcpu(struct kvm_vcpu *vcpu) svm = to_svm(vcpu); + /* + * If it's an SNP guest, then the VMSA was marked in the RMP table as + * a guest-owned page. Transition the page to hypervisor state before + * releasing it back to the system. + */ + if (sev_snp_guest(vcpu->kvm)) { + u64 pfn = __pa(svm->sev_es.vmsa) >> PAGE_SHIFT; + + if (kvm_rmp_make_shared(vcpu->kvm, pfn, PG_LEVEL_4K)) + goto skip_vmsa_free; + } + if (vcpu->arch.guest_state_protected) sev_flush_encrypted_page(vcpu, svm->sev_es.vmsa); __free_page(virt_to_page(svm->sev_es.vmsa)); +skip_vmsa_free: if (svm->sev_es.ghcb_sa_free) kvfree(svm->sev_es.ghcb_sa); } +static u64 kvm_ghcb_get_sw_exit_code(struct vmcb_control_area *control) +{ + return (((u64)control->exit_code_hi) << 32) | control->exit_code; +} + static void dump_ghcb(struct vcpu_svm *svm) { - struct ghcb *ghcb = svm->sev_es.ghcb; + struct vmcb_control_area *control = &svm->vmcb->control; unsigned int nbits; /* Re-use the dump_invalid_vmcb module parameter */ @@ -2391,18 +3168,24 @@ static void dump_ghcb(struct vcpu_svm *svm) return; } - nbits = sizeof(ghcb->save.valid_bitmap) * 8; + nbits = sizeof(svm->sev_es.valid_bitmap) * 8; - pr_err("GHCB (GPA=%016llx):\n", svm->vmcb->control.ghcb_gpa); + /* + * Print KVM's snapshot of the GHCB values that were (unsuccessfully) + * used to handle the exit. If the guest has since modified the GHCB + * itself, dumping the raw GHCB won't help debug why KVM was unable to + * handle the VMGEXIT that KVM observed. + */ + pr_err("GHCB (GPA=%016llx) snapshot:\n", svm->vmcb->control.ghcb_gpa); pr_err("%-20s%016llx is_valid: %u\n", "sw_exit_code", - ghcb->save.sw_exit_code, ghcb_sw_exit_code_is_valid(ghcb)); + kvm_ghcb_get_sw_exit_code(control), kvm_ghcb_sw_exit_code_is_valid(svm)); pr_err("%-20s%016llx is_valid: %u\n", "sw_exit_info_1", - ghcb->save.sw_exit_info_1, ghcb_sw_exit_info_1_is_valid(ghcb)); + control->exit_info_1, kvm_ghcb_sw_exit_info_1_is_valid(svm)); pr_err("%-20s%016llx is_valid: %u\n", "sw_exit_info_2", - ghcb->save.sw_exit_info_2, ghcb_sw_exit_info_2_is_valid(ghcb)); + control->exit_info_2, kvm_ghcb_sw_exit_info_2_is_valid(svm)); pr_err("%-20s%016llx is_valid: %u\n", "sw_scratch", - ghcb->save.sw_scratch, ghcb_sw_scratch_is_valid(ghcb)); - pr_err("%-20s%*pb\n", "valid_bitmap", nbits, ghcb->save.valid_bitmap); + svm->sev_es.sw_scratch, kvm_ghcb_sw_scratch_is_valid(svm)); + pr_err("%-20s%*pb\n", "valid_bitmap", nbits, svm->sev_es.valid_bitmap); } static void sev_es_sync_to_ghcb(struct vcpu_svm *svm) @@ -2458,7 +3241,7 @@ static void sev_es_sync_from_ghcb(struct vcpu_svm *svm) if (kvm_ghcb_xcr0_is_valid(svm)) { vcpu->arch.xcr0 = ghcb_get_xcr0(ghcb); - kvm_update_cpuid_runtime(vcpu); + vcpu->arch.cpuid_dynamic_bits_dirty = true; } /* Copy the GHCB exit information into the VMCB fields */ @@ -2473,11 +3256,6 @@ static void sev_es_sync_from_ghcb(struct vcpu_svm *svm) memset(ghcb->save.valid_bitmap, 0, sizeof(ghcb->save.valid_bitmap)); } -static u64 kvm_ghcb_get_sw_exit_code(struct vmcb_control_area *control) -{ - return (((u64)control->exit_code_hi) << 32) | control->exit_code; -} - static int sev_es_validate_vmgexit(struct vcpu_svm *svm) { struct vmcb_control_area *control = &svm->vmcb->control; @@ -2571,10 +3349,31 @@ static int sev_es_validate_vmgexit(struct vcpu_svm *svm) if (!kvm_ghcb_sw_scratch_is_valid(svm)) goto vmgexit_err; break; + case SVM_VMGEXIT_AP_CREATION: + if (!sev_snp_guest(vcpu->kvm)) + goto vmgexit_err; + if (lower_32_bits(control->exit_info_1) != SVM_VMGEXIT_AP_DESTROY) + if (!kvm_ghcb_rax_is_valid(svm)) + goto vmgexit_err; + break; case SVM_VMGEXIT_NMI_COMPLETE: case SVM_VMGEXIT_AP_HLT_LOOP: case SVM_VMGEXIT_AP_JUMP_TABLE: case SVM_VMGEXIT_UNSUPPORTED_EVENT: + case SVM_VMGEXIT_HV_FEATURES: + case SVM_VMGEXIT_TERM_REQUEST: + break; + case SVM_VMGEXIT_PSC: + if (!sev_snp_guest(vcpu->kvm) || !kvm_ghcb_sw_scratch_is_valid(svm)) + goto vmgexit_err; + break; + case SVM_VMGEXIT_GUEST_REQUEST: + case SVM_VMGEXIT_EXT_GUEST_REQUEST: + if (!sev_snp_guest(vcpu->kvm) || + !PAGE_ALIGNED(control->exit_info_1) || + !PAGE_ALIGNED(control->exit_info_2) || + control->exit_info_1 == control->exit_info_2) + goto vmgexit_err; break; default: reason = GHCB_ERR_INVALID_EVENT; @@ -2596,8 +3395,7 @@ vmgexit_err: dump_ghcb(svm); } - ghcb_set_sw_exit_info_1(svm->sev_es.ghcb, 2); - ghcb_set_sw_exit_info_2(svm->sev_es.ghcb, reason); + svm_vmgexit_bad_input(svm, reason); /* Resume the guest to "return" the error code. */ return 1; @@ -2605,6 +3403,9 @@ vmgexit_err: void sev_es_unmap_ghcb(struct vcpu_svm *svm) { + /* Clear any indication that the vCPU is in a type of AP Reset Hold */ + svm->sev_es.ap_reset_hold_type = AP_RESET_HOLD_NONE; + if (!svm->sev_es.ghcb) return; @@ -2631,14 +3432,23 @@ void sev_es_unmap_ghcb(struct vcpu_svm *svm) sev_es_sync_to_ghcb(svm); - kvm_vcpu_unmap(&svm->vcpu, &svm->sev_es.ghcb_map, true); + kvm_vcpu_unmap(&svm->vcpu, &svm->sev_es.ghcb_map); svm->sev_es.ghcb = NULL; } -void pre_sev_run(struct vcpu_svm *svm, int cpu) +int pre_sev_run(struct vcpu_svm *svm, int cpu) { struct svm_cpu_data *sd = per_cpu_ptr(&svm_data, cpu); - int asid = sev_get_asid(svm->vcpu.kvm); + struct kvm *kvm = svm->vcpu.kvm; + unsigned int asid = sev_get_asid(kvm); + + /* + * Reject KVM_RUN if userspace attempts to run the vCPU with an invalid + * VMSA, e.g. if userspace forces the vCPU to be RUNNABLE after an SNP + * AP Destroy event. + */ + if (sev_es_guest(kvm) && !VALID_PAGE(svm->vmcb->control.vmsa_pa)) + return -EINVAL; /* Assign the asid allocated with this SEV guest */ svm->asid = asid; @@ -2651,11 +3461,12 @@ void pre_sev_run(struct vcpu_svm *svm, int cpu) */ if (sd->sev_vmcbs[asid] == svm->vmcb && svm->vcpu.arch.last_vmentry_cpu == cpu) - return; + return 0; sd->sev_vmcbs[asid] = svm->vmcb; svm->vmcb->control.tlb_ctl = TLB_CONTROL_FLUSH_ASID; vmcb_mark_dirty(svm->vmcb, VMCB_ASID); + return 0; } #define GHCB_SCRATCH_AREA_LIMIT (16ULL * PAGE_SIZE) @@ -2737,8 +3548,7 @@ static int setup_vmgexit_scratch(struct vcpu_svm *svm, bool sync, u64 len) return 0; e_scratch: - ghcb_set_sw_exit_info_1(svm->sev_es.ghcb, 2); - ghcb_set_sw_exit_info_2(svm->sev_es.ghcb, GHCB_ERR_INVALID_SCRATCH_AREA); + svm_vmgexit_bad_input(svm, GHCB_ERR_INVALID_SCRATCH_AREA); return 1; } @@ -2760,10 +3570,517 @@ static void set_ghcb_msr(struct vcpu_svm *svm, u64 value) svm->vmcb->control.ghcb_gpa = value; } +static int snp_rmptable_psmash(kvm_pfn_t pfn) +{ + int ret; + + pfn = pfn & ~(KVM_PAGES_PER_HPAGE(PG_LEVEL_2M) - 1); + + /* + * PSMASH_FAIL_INUSE indicates another processor is modifying the + * entry, so retry until that's no longer the case. + */ + do { + ret = psmash(pfn); + } while (ret == PSMASH_FAIL_INUSE); + + return ret; +} + +static int snp_complete_psc_msr(struct kvm_vcpu *vcpu) +{ + struct vcpu_svm *svm = to_svm(vcpu); + + if (vcpu->run->hypercall.ret) + set_ghcb_msr(svm, GHCB_MSR_PSC_RESP_ERROR); + else + set_ghcb_msr(svm, GHCB_MSR_PSC_RESP); + + return 1; /* resume guest */ +} + +static int snp_begin_psc_msr(struct vcpu_svm *svm, u64 ghcb_msr) +{ + u64 gpa = gfn_to_gpa(GHCB_MSR_PSC_REQ_TO_GFN(ghcb_msr)); + u8 op = GHCB_MSR_PSC_REQ_TO_OP(ghcb_msr); + struct kvm_vcpu *vcpu = &svm->vcpu; + + if (op != SNP_PAGE_STATE_PRIVATE && op != SNP_PAGE_STATE_SHARED) { + set_ghcb_msr(svm, GHCB_MSR_PSC_RESP_ERROR); + return 1; /* resume guest */ + } + + if (!user_exit_on_hypercall(vcpu->kvm, KVM_HC_MAP_GPA_RANGE)) { + set_ghcb_msr(svm, GHCB_MSR_PSC_RESP_ERROR); + return 1; /* resume guest */ + } + + vcpu->run->exit_reason = KVM_EXIT_HYPERCALL; + vcpu->run->hypercall.nr = KVM_HC_MAP_GPA_RANGE; + /* + * In principle this should have been -KVM_ENOSYS, but userspace (QEMU <=9.2) + * assumed that vcpu->run->hypercall.ret is never changed by KVM and thus that + * it was always zero on KVM_EXIT_HYPERCALL. Since KVM is now overwriting + * vcpu->run->hypercall.ret, ensuring that it is zero to not break QEMU. + */ + vcpu->run->hypercall.ret = 0; + vcpu->run->hypercall.args[0] = gpa; + vcpu->run->hypercall.args[1] = 1; + vcpu->run->hypercall.args[2] = (op == SNP_PAGE_STATE_PRIVATE) + ? KVM_MAP_GPA_RANGE_ENCRYPTED + : KVM_MAP_GPA_RANGE_DECRYPTED; + vcpu->run->hypercall.args[2] |= KVM_MAP_GPA_RANGE_PAGE_SZ_4K; + + vcpu->arch.complete_userspace_io = snp_complete_psc_msr; + + return 0; /* forward request to userspace */ +} + +struct psc_buffer { + struct psc_hdr hdr; + struct psc_entry entries[]; +} __packed; + +static int snp_begin_psc(struct vcpu_svm *svm, struct psc_buffer *psc); + +static void snp_complete_psc(struct vcpu_svm *svm, u64 psc_ret) +{ + svm->sev_es.psc_inflight = 0; + svm->sev_es.psc_idx = 0; + svm->sev_es.psc_2m = false; + + /* + * PSC requests always get a "no action" response in SW_EXITINFO1, with + * a PSC-specific return code in SW_EXITINFO2 that provides the "real" + * return code. E.g. if the PSC request was interrupted, the need to + * retry is communicated via SW_EXITINFO2, not SW_EXITINFO1. + */ + svm_vmgexit_no_action(svm, psc_ret); +} + +static void __snp_complete_one_psc(struct vcpu_svm *svm) +{ + struct psc_buffer *psc = svm->sev_es.ghcb_sa; + struct psc_entry *entries = psc->entries; + struct psc_hdr *hdr = &psc->hdr; + __u16 idx; + + /* + * Everything in-flight has been processed successfully. Update the + * corresponding entries in the guest's PSC buffer and zero out the + * count of in-flight PSC entries. + */ + for (idx = svm->sev_es.psc_idx; svm->sev_es.psc_inflight; + svm->sev_es.psc_inflight--, idx++) { + struct psc_entry *entry = &entries[idx]; + + entry->cur_page = entry->pagesize ? 512 : 1; + } + + hdr->cur_entry = idx; +} + +static int snp_complete_one_psc(struct kvm_vcpu *vcpu) +{ + struct vcpu_svm *svm = to_svm(vcpu); + struct psc_buffer *psc = svm->sev_es.ghcb_sa; + + if (vcpu->run->hypercall.ret) { + snp_complete_psc(svm, VMGEXIT_PSC_ERROR_GENERIC); + return 1; /* resume guest */ + } + + __snp_complete_one_psc(svm); + + /* Handle the next range (if any). */ + return snp_begin_psc(svm, psc); +} + +static int snp_begin_psc(struct vcpu_svm *svm, struct psc_buffer *psc) +{ + struct psc_entry *entries = psc->entries; + struct kvm_vcpu *vcpu = &svm->vcpu; + struct psc_hdr *hdr = &psc->hdr; + struct psc_entry entry_start; + u16 idx, idx_start, idx_end; + int npages; + bool huge; + u64 gfn; + + if (!user_exit_on_hypercall(vcpu->kvm, KVM_HC_MAP_GPA_RANGE)) { + snp_complete_psc(svm, VMGEXIT_PSC_ERROR_GENERIC); + return 1; + } + +next_range: + /* There should be no other PSCs in-flight at this point. */ + if (WARN_ON_ONCE(svm->sev_es.psc_inflight)) { + snp_complete_psc(svm, VMGEXIT_PSC_ERROR_GENERIC); + return 1; + } + + /* + * The PSC descriptor buffer can be modified by a misbehaved guest after + * validation, so take care to only use validated copies of values used + * for things like array indexing. + */ + idx_start = hdr->cur_entry; + idx_end = hdr->end_entry; + + if (idx_end >= VMGEXIT_PSC_MAX_COUNT) { + snp_complete_psc(svm, VMGEXIT_PSC_ERROR_INVALID_HDR); + return 1; + } + + /* Find the start of the next range which needs processing. */ + for (idx = idx_start; idx <= idx_end; idx++, hdr->cur_entry++) { + entry_start = entries[idx]; + + gfn = entry_start.gfn; + huge = entry_start.pagesize; + npages = huge ? 512 : 1; + + if (entry_start.cur_page > npages || !IS_ALIGNED(gfn, npages)) { + snp_complete_psc(svm, VMGEXIT_PSC_ERROR_INVALID_ENTRY); + return 1; + } + + if (entry_start.cur_page) { + /* + * If this is a partially-completed 2M range, force 4K handling + * for the remaining pages since they're effectively split at + * this point. Subsequent code should ensure this doesn't get + * combined with adjacent PSC entries where 2M handling is still + * possible. + */ + npages -= entry_start.cur_page; + gfn += entry_start.cur_page; + huge = false; + } + + if (npages) + break; + } + + if (idx > idx_end) { + /* Nothing more to process. */ + snp_complete_psc(svm, 0); + return 1; + } + + svm->sev_es.psc_2m = huge; + svm->sev_es.psc_idx = idx; + svm->sev_es.psc_inflight = 1; + + /* + * Find all subsequent PSC entries that contain adjacent GPA + * ranges/operations and can be combined into a single + * KVM_HC_MAP_GPA_RANGE exit. + */ + while (++idx <= idx_end) { + struct psc_entry entry = entries[idx]; + + if (entry.operation != entry_start.operation || + entry.gfn != entry_start.gfn + npages || + entry.cur_page || !!entry.pagesize != huge) + break; + + svm->sev_es.psc_inflight++; + npages += huge ? 512 : 1; + } + + switch (entry_start.operation) { + case VMGEXIT_PSC_OP_PRIVATE: + case VMGEXIT_PSC_OP_SHARED: + vcpu->run->exit_reason = KVM_EXIT_HYPERCALL; + vcpu->run->hypercall.nr = KVM_HC_MAP_GPA_RANGE; + /* + * In principle this should have been -KVM_ENOSYS, but userspace (QEMU <=9.2) + * assumed that vcpu->run->hypercall.ret is never changed by KVM and thus that + * it was always zero on KVM_EXIT_HYPERCALL. Since KVM is now overwriting + * vcpu->run->hypercall.ret, ensuring that it is zero to not break QEMU. + */ + vcpu->run->hypercall.ret = 0; + vcpu->run->hypercall.args[0] = gfn_to_gpa(gfn); + vcpu->run->hypercall.args[1] = npages; + vcpu->run->hypercall.args[2] = entry_start.operation == VMGEXIT_PSC_OP_PRIVATE + ? KVM_MAP_GPA_RANGE_ENCRYPTED + : KVM_MAP_GPA_RANGE_DECRYPTED; + vcpu->run->hypercall.args[2] |= entry_start.pagesize + ? KVM_MAP_GPA_RANGE_PAGE_SZ_2M + : KVM_MAP_GPA_RANGE_PAGE_SZ_4K; + vcpu->arch.complete_userspace_io = snp_complete_one_psc; + return 0; /* forward request to userspace */ + default: + /* + * Only shared/private PSC operations are currently supported, so if the + * entire range consists of unsupported operations (e.g. SMASH/UNSMASH), + * then consider the entire range completed and avoid exiting to + * userspace. In theory snp_complete_psc() can always be called directly + * at this point to complete the current range and start the next one, + * but that could lead to unexpected levels of recursion. + */ + __snp_complete_one_psc(svm); + goto next_range; + } + + BUG(); +} + +/* + * Invoked as part of svm_vcpu_reset() processing of an init event. + */ +void sev_snp_init_protected_guest_state(struct kvm_vcpu *vcpu) +{ + struct vcpu_svm *svm = to_svm(vcpu); + struct kvm_memory_slot *slot; + struct page *page; + kvm_pfn_t pfn; + gfn_t gfn; + + if (!sev_snp_guest(vcpu->kvm)) + return; + + guard(mutex)(&svm->sev_es.snp_vmsa_mutex); + + if (!svm->sev_es.snp_ap_waiting_for_reset) + return; + + svm->sev_es.snp_ap_waiting_for_reset = false; + + /* Mark the vCPU as offline and not runnable */ + vcpu->arch.pv.pv_unhalted = false; + kvm_set_mp_state(vcpu, KVM_MP_STATE_HALTED); + + /* Clear use of the VMSA */ + svm->vmcb->control.vmsa_pa = INVALID_PAGE; + + /* + * When replacing the VMSA during SEV-SNP AP creation, + * mark the VMCB dirty so that full state is always reloaded. + */ + vmcb_mark_all_dirty(svm->vmcb); + + if (!VALID_PAGE(svm->sev_es.snp_vmsa_gpa)) + return; + + gfn = gpa_to_gfn(svm->sev_es.snp_vmsa_gpa); + svm->sev_es.snp_vmsa_gpa = INVALID_PAGE; + + slot = gfn_to_memslot(vcpu->kvm, gfn); + if (!slot) + return; + + /* + * The new VMSA will be private memory guest memory, so retrieve the + * PFN from the gmem backend. + */ + if (kvm_gmem_get_pfn(vcpu->kvm, slot, gfn, &pfn, &page, NULL)) + return; + + /* + * From this point forward, the VMSA will always be a guest-mapped page + * rather than the initial one allocated by KVM in svm->sev_es.vmsa. In + * theory, svm->sev_es.vmsa could be free'd and cleaned up here, but + * that involves cleanups like wbinvd_on_all_cpus() which would ideally + * be handled during teardown rather than guest boot. Deferring that + * also allows the existing logic for SEV-ES VMSAs to be re-used with + * minimal SNP-specific changes. + */ + svm->sev_es.snp_has_guest_vmsa = true; + + /* Use the new VMSA */ + svm->vmcb->control.vmsa_pa = pfn_to_hpa(pfn); + + /* Mark the vCPU as runnable */ + kvm_set_mp_state(vcpu, KVM_MP_STATE_RUNNABLE); + + /* + * gmem pages aren't currently migratable, but if this ever changes + * then care should be taken to ensure svm->sev_es.vmsa is pinned + * through some other means. + */ + kvm_release_page_clean(page); +} + +static int sev_snp_ap_creation(struct vcpu_svm *svm) +{ + struct kvm_sev_info *sev = to_kvm_sev_info(svm->vcpu.kvm); + struct kvm_vcpu *vcpu = &svm->vcpu; + struct kvm_vcpu *target_vcpu; + struct vcpu_svm *target_svm; + unsigned int request; + unsigned int apic_id; + + request = lower_32_bits(svm->vmcb->control.exit_info_1); + apic_id = upper_32_bits(svm->vmcb->control.exit_info_1); + + /* Validate the APIC ID */ + target_vcpu = kvm_get_vcpu_by_id(vcpu->kvm, apic_id); + if (!target_vcpu) { + vcpu_unimpl(vcpu, "vmgexit: invalid AP APIC ID [%#x] from guest\n", + apic_id); + return -EINVAL; + } + + target_svm = to_svm(target_vcpu); + + guard(mutex)(&target_svm->sev_es.snp_vmsa_mutex); + + switch (request) { + case SVM_VMGEXIT_AP_CREATE_ON_INIT: + case SVM_VMGEXIT_AP_CREATE: + if (vcpu->arch.regs[VCPU_REGS_RAX] != sev->vmsa_features) { + vcpu_unimpl(vcpu, "vmgexit: mismatched AP sev_features [%#lx] != [%#llx] from guest\n", + vcpu->arch.regs[VCPU_REGS_RAX], sev->vmsa_features); + return -EINVAL; + } + + if (!page_address_valid(vcpu, svm->vmcb->control.exit_info_2)) { + vcpu_unimpl(vcpu, "vmgexit: invalid AP VMSA address [%#llx] from guest\n", + svm->vmcb->control.exit_info_2); + return -EINVAL; + } + + /* + * Malicious guest can RMPADJUST a large page into VMSA which + * will hit the SNP erratum where the CPU will incorrectly signal + * an RMP violation #PF if a hugepage collides with the RMP entry + * of VMSA page, reject the AP CREATE request if VMSA address from + * guest is 2M aligned. + */ + if (IS_ALIGNED(svm->vmcb->control.exit_info_2, PMD_SIZE)) { + vcpu_unimpl(vcpu, + "vmgexit: AP VMSA address [%llx] from guest is unsafe as it is 2M aligned\n", + svm->vmcb->control.exit_info_2); + return -EINVAL; + } + + target_svm->sev_es.snp_vmsa_gpa = svm->vmcb->control.exit_info_2; + break; + case SVM_VMGEXIT_AP_DESTROY: + target_svm->sev_es.snp_vmsa_gpa = INVALID_PAGE; + break; + default: + vcpu_unimpl(vcpu, "vmgexit: invalid AP creation request [%#x] from guest\n", + request); + return -EINVAL; + } + + target_svm->sev_es.snp_ap_waiting_for_reset = true; + + /* + * Unless Creation is deferred until INIT, signal the vCPU to update + * its state. + */ + if (request != SVM_VMGEXIT_AP_CREATE_ON_INIT) + kvm_make_request_and_kick(KVM_REQ_UPDATE_PROTECTED_GUEST_STATE, target_vcpu); + + return 0; +} + +static int snp_handle_guest_req(struct vcpu_svm *svm, gpa_t req_gpa, gpa_t resp_gpa) +{ + struct sev_data_snp_guest_request data = {0}; + struct kvm *kvm = svm->vcpu.kvm; + struct kvm_sev_info *sev = to_kvm_sev_info(kvm); + sev_ret_code fw_err = 0; + int ret; + + if (!sev_snp_guest(kvm)) + return -EINVAL; + + mutex_lock(&sev->guest_req_mutex); + + if (kvm_read_guest(kvm, req_gpa, sev->guest_req_buf, PAGE_SIZE)) { + ret = -EIO; + goto out_unlock; + } + + data.gctx_paddr = __psp_pa(sev->snp_context); + data.req_paddr = __psp_pa(sev->guest_req_buf); + data.res_paddr = __psp_pa(sev->guest_resp_buf); + + /* + * Firmware failures are propagated on to guest, but any other failure + * condition along the way should be reported to userspace. E.g. if + * the PSP is dead and commands are timing out. + */ + ret = sev_issue_cmd(kvm, SEV_CMD_SNP_GUEST_REQUEST, &data, &fw_err); + if (ret && !fw_err) + goto out_unlock; + + if (kvm_write_guest(kvm, resp_gpa, sev->guest_resp_buf, PAGE_SIZE)) { + ret = -EIO; + goto out_unlock; + } + + /* No action is requested *from KVM* if there was a firmware error. */ + svm_vmgexit_no_action(svm, SNP_GUEST_ERR(0, fw_err)); + + ret = 1; /* resume guest */ + +out_unlock: + mutex_unlock(&sev->guest_req_mutex); + return ret; +} + +static int snp_handle_ext_guest_req(struct vcpu_svm *svm, gpa_t req_gpa, gpa_t resp_gpa) +{ + struct kvm *kvm = svm->vcpu.kvm; + u8 msg_type; + + if (!sev_snp_guest(kvm)) + return -EINVAL; + + if (kvm_read_guest(kvm, req_gpa + offsetof(struct snp_guest_msg_hdr, msg_type), + &msg_type, 1)) + return -EIO; + + /* + * As per GHCB spec, requests of type MSG_REPORT_REQ also allow for + * additional certificate data to be provided alongside the attestation + * report via the guest-provided data pages indicated by RAX/RBX. The + * certificate data is optional and requires additional KVM enablement + * to provide an interface for userspace to provide it, but KVM still + * needs to be able to handle extended guest requests either way. So + * provide a stub implementation that will always return an empty + * certificate table in the guest-provided data pages. + */ + if (msg_type == SNP_MSG_REPORT_REQ) { + struct kvm_vcpu *vcpu = &svm->vcpu; + u64 data_npages; + gpa_t data_gpa; + + if (!kvm_ghcb_rax_is_valid(svm) || !kvm_ghcb_rbx_is_valid(svm)) + goto request_invalid; + + data_gpa = vcpu->arch.regs[VCPU_REGS_RAX]; + data_npages = vcpu->arch.regs[VCPU_REGS_RBX]; + + if (!PAGE_ALIGNED(data_gpa)) + goto request_invalid; + + /* + * As per GHCB spec (see "SNP Extended Guest Request"), the + * certificate table is terminated by 24-bytes of zeroes. + */ + if (data_npages && kvm_clear_guest(kvm, data_gpa, 24)) + return -EIO; + } + + return snp_handle_guest_req(svm, req_gpa, resp_gpa); + +request_invalid: + svm_vmgexit_bad_input(svm, GHCB_ERR_INVALID_INPUT); + return 1; /* resume guest */ +} + static int sev_handle_vmgexit_msr_protocol(struct vcpu_svm *svm) { struct vmcb_control_area *control = &svm->vmcb->control; struct kvm_vcpu *vcpu = &svm->vcpu; + struct kvm_sev_info *sev = to_kvm_sev_info(vcpu->kvm); u64 ghcb_info; int ret = 1; @@ -2774,7 +4091,7 @@ static int sev_handle_vmgexit_msr_protocol(struct vcpu_svm *svm) switch (ghcb_info) { case GHCB_MSR_SEV_INFO_REQ: - set_ghcb_msr(svm, GHCB_MSR_SEV_INFO(GHCB_VERSION_MAX, + set_ghcb_msr(svm, GHCB_MSR_SEV_INFO((__u64)sev->ghcb_version, GHCB_VERSION_MIN, sev_enc_bit)); break; @@ -2816,6 +4133,60 @@ static int sev_handle_vmgexit_msr_protocol(struct vcpu_svm *svm) GHCB_MSR_INFO_POS); break; } + case GHCB_MSR_AP_RESET_HOLD_REQ: + svm->sev_es.ap_reset_hold_type = AP_RESET_HOLD_MSR_PROTO; + ret = kvm_emulate_ap_reset_hold(&svm->vcpu); + + /* + * Preset the result to a non-SIPI return and then only set + * the result to non-zero when delivering a SIPI. + */ + set_ghcb_msr_bits(svm, 0, + GHCB_MSR_AP_RESET_HOLD_RESULT_MASK, + GHCB_MSR_AP_RESET_HOLD_RESULT_POS); + + set_ghcb_msr_bits(svm, GHCB_MSR_AP_RESET_HOLD_RESP, + GHCB_MSR_INFO_MASK, + GHCB_MSR_INFO_POS); + break; + case GHCB_MSR_HV_FT_REQ: + set_ghcb_msr_bits(svm, GHCB_HV_FT_SUPPORTED, + GHCB_MSR_HV_FT_MASK, GHCB_MSR_HV_FT_POS); + set_ghcb_msr_bits(svm, GHCB_MSR_HV_FT_RESP, + GHCB_MSR_INFO_MASK, GHCB_MSR_INFO_POS); + break; + case GHCB_MSR_PREF_GPA_REQ: + if (!sev_snp_guest(vcpu->kvm)) + goto out_terminate; + + set_ghcb_msr_bits(svm, GHCB_MSR_PREF_GPA_NONE, GHCB_MSR_GPA_VALUE_MASK, + GHCB_MSR_GPA_VALUE_POS); + set_ghcb_msr_bits(svm, GHCB_MSR_PREF_GPA_RESP, GHCB_MSR_INFO_MASK, + GHCB_MSR_INFO_POS); + break; + case GHCB_MSR_REG_GPA_REQ: { + u64 gfn; + + if (!sev_snp_guest(vcpu->kvm)) + goto out_terminate; + + gfn = get_ghcb_msr_bits(svm, GHCB_MSR_GPA_VALUE_MASK, + GHCB_MSR_GPA_VALUE_POS); + + svm->sev_es.ghcb_registered_gpa = gfn_to_gpa(gfn); + + set_ghcb_msr_bits(svm, gfn, GHCB_MSR_GPA_VALUE_MASK, + GHCB_MSR_GPA_VALUE_POS); + set_ghcb_msr_bits(svm, GHCB_MSR_REG_GPA_RESP, GHCB_MSR_INFO_MASK, + GHCB_MSR_INFO_POS); + break; + } + case GHCB_MSR_PSC_REQ: + if (!sev_snp_guest(vcpu->kvm)) + goto out_terminate; + + ret = snp_begin_psc_msr(svm, control->ghcb_gpa); + break; case GHCB_MSR_TERM_REQ: { u64 reason_set, reason_code; @@ -2828,12 +4199,7 @@ static int sev_handle_vmgexit_msr_protocol(struct vcpu_svm *svm) pr_info("SEV-ES guest requested termination: %#llx:%#llx\n", reason_set, reason_code); - vcpu->run->exit_reason = KVM_EXIT_SYSTEM_EVENT; - vcpu->run->system_event.type = KVM_SYSTEM_EVENT_SEV_TERM; - vcpu->run->system_event.ndata = 1; - vcpu->run->system_event.data[0] = control->ghcb_gpa; - - return 0; + goto out_terminate; } default: /* Error, keep GHCB MSR value as-is */ @@ -2844,6 +4210,14 @@ static int sev_handle_vmgexit_msr_protocol(struct vcpu_svm *svm) control->ghcb_gpa, ret); return ret; + +out_terminate: + vcpu->run->exit_reason = KVM_EXIT_SYSTEM_EVENT; + vcpu->run->system_event.type = KVM_SYSTEM_EVENT_SEV_TERM; + vcpu->run->system_event.ndata = 1; + vcpu->run->system_event.data[0] = control->ghcb_gpa; + + return 0; } int sev_handle_vmgexit(struct kvm_vcpu *vcpu) @@ -2879,12 +4253,18 @@ int sev_handle_vmgexit(struct kvm_vcpu *vcpu) trace_kvm_vmgexit_enter(vcpu->vcpu_id, svm->sev_es.ghcb); sev_es_sync_from_ghcb(svm); + + /* SEV-SNP guest requires that the GHCB GPA must be registered */ + if (sev_snp_guest(svm->vcpu.kvm) && !ghcb_gpa_is_registered(svm, ghcb_gpa)) { + vcpu_unimpl(&svm->vcpu, "vmgexit: GHCB GPA [%#llx] is not registered.\n", ghcb_gpa); + return -EINVAL; + } + ret = sev_es_validate_vmgexit(svm); if (ret) return ret; - ghcb_set_sw_exit_info_1(svm->sev_es.ghcb, 0); - ghcb_set_sw_exit_info_2(svm->sev_es.ghcb, 0); + svm_vmgexit_success(svm, 0); exit_code = kvm_ghcb_get_sw_exit_code(control); switch (exit_code) { @@ -2915,10 +4295,11 @@ int sev_handle_vmgexit(struct kvm_vcpu *vcpu) ret = 1; break; case SVM_VMGEXIT_AP_HLT_LOOP: + svm->sev_es.ap_reset_hold_type = AP_RESET_HOLD_NAE_EVENT; ret = kvm_emulate_ap_reset_hold(vcpu); break; case SVM_VMGEXIT_AP_JUMP_TABLE: { - struct kvm_sev_info *sev = &to_kvm_svm(vcpu->kvm)->sev_info; + struct kvm_sev_info *sev = to_kvm_sev_info(vcpu->kvm); switch (control->exit_info_1) { case 0: @@ -2927,18 +4308,50 @@ int sev_handle_vmgexit(struct kvm_vcpu *vcpu) break; case 1: /* Get AP jump table address */ - ghcb_set_sw_exit_info_2(svm->sev_es.ghcb, sev->ap_jump_table); + svm_vmgexit_success(svm, sev->ap_jump_table); break; default: pr_err("svm: vmgexit: unsupported AP jump table request - exit_info_1=%#llx\n", control->exit_info_1); - ghcb_set_sw_exit_info_1(svm->sev_es.ghcb, 2); - ghcb_set_sw_exit_info_2(svm->sev_es.ghcb, GHCB_ERR_INVALID_INPUT); + svm_vmgexit_bad_input(svm, GHCB_ERR_INVALID_INPUT); } ret = 1; break; } + case SVM_VMGEXIT_HV_FEATURES: + svm_vmgexit_success(svm, GHCB_HV_FT_SUPPORTED); + ret = 1; + break; + case SVM_VMGEXIT_TERM_REQUEST: + pr_info("SEV-ES guest requested termination: reason %#llx info %#llx\n", + control->exit_info_1, control->exit_info_2); + vcpu->run->exit_reason = KVM_EXIT_SYSTEM_EVENT; + vcpu->run->system_event.type = KVM_SYSTEM_EVENT_SEV_TERM; + vcpu->run->system_event.ndata = 1; + vcpu->run->system_event.data[0] = control->ghcb_gpa; + break; + case SVM_VMGEXIT_PSC: + ret = setup_vmgexit_scratch(svm, true, control->exit_info_2); + if (ret) + break; + + ret = snp_begin_psc(svm, svm->sev_es.ghcb_sa); + break; + case SVM_VMGEXIT_AP_CREATION: + ret = sev_snp_ap_creation(svm); + if (ret) { + svm_vmgexit_bad_input(svm, GHCB_ERR_INVALID_INPUT); + } + + ret = 1; + break; + case SVM_VMGEXIT_GUEST_REQUEST: + ret = snp_handle_guest_req(svm, control->exit_info_1, control->exit_info_2); + break; + case SVM_VMGEXIT_EXT_GUEST_REQUEST: + ret = snp_handle_ext_guest_req(svm, control->exit_info_1, control->exit_info_2); + break; case SVM_VMGEXIT_UNSUPPORTED_EVENT: vcpu_unimpl(vcpu, "vmgexit: unsupported event - exit_info_1=%#llx, exit_info_2=%#llx\n", @@ -2978,8 +4391,8 @@ static void sev_es_vcpu_after_set_cpuid(struct vcpu_svm *svm) struct kvm_vcpu *vcpu = &svm->vcpu; if (boot_cpu_has(X86_FEATURE_V_TSC_AUX)) { - bool v_tsc_aux = guest_cpuid_has(vcpu, X86_FEATURE_RDTSCP) || - guest_cpuid_has(vcpu, X86_FEATURE_RDPID); + bool v_tsc_aux = guest_cpu_cap_has(vcpu, X86_FEATURE_RDTSCP) || + guest_cpu_cap_has(vcpu, X86_FEATURE_RDPID); set_msr_interception(vcpu, svm->msrpm, MSR_TSC_AUX, v_tsc_aux, v_tsc_aux); } @@ -2988,16 +4401,15 @@ static void sev_es_vcpu_after_set_cpuid(struct vcpu_svm *svm) * For SEV-ES, accesses to MSR_IA32_XSS should not be intercepted if * the host/guest supports its use. * - * guest_can_use() checks a number of requirements on the host/guest to - * ensure that MSR_IA32_XSS is available, but it might report true even - * if X86_FEATURE_XSAVES isn't configured in the guest to ensure host - * MSR_IA32_XSS is always properly restored. For SEV-ES, it is better - * to further check that the guest CPUID actually supports - * X86_FEATURE_XSAVES so that accesses to MSR_IA32_XSS by misbehaved - * guests will still get intercepted and caught in the normal - * kvm_emulate_rdmsr()/kvm_emulated_wrmsr() paths. - */ - if (guest_can_use(vcpu, X86_FEATURE_XSAVES) && + * KVM treats the guest as being capable of using XSAVES even if XSAVES + * isn't enabled in guest CPUID as there is no intercept for XSAVES, + * i.e. the guest can use XSAVES/XRSTOR to read/write XSS if XSAVE is + * exposed to the guest and XSAVES is supported in hardware. Condition + * full XSS passthrough on the guest being able to use XSAVES *and* + * XSAVES being exposed to the guest so that KVM can at least honor + * guest CPUID for RDMSR and WRMSR. + */ + if (guest_cpu_cap_has(vcpu, X86_FEATURE_XSAVES) && guest_cpuid_has(vcpu, X86_FEATURE_XSAVES)) set_msr_interception(vcpu, svm->msrpm, MSR_IA32_XSS, 1, 1); else @@ -3020,11 +4432,11 @@ void sev_vcpu_after_set_cpuid(struct vcpu_svm *svm) static void sev_es_init_vmcb(struct vcpu_svm *svm) { + struct kvm_sev_info *sev = to_kvm_sev_info(svm->vcpu.kvm); struct vmcb *vmcb = svm->vmcb01.ptr; struct kvm_vcpu *vcpu = &svm->vcpu; svm->vmcb->control.nested_ctl |= SVM_NESTED_CTL_SEV_ES_ENABLE; - svm->vmcb->control.virt_ext |= LBR_CTL_ENABLE_MASK; /* * An SEV-ES guest requires a VMSA area that is a separate from the @@ -3033,9 +4445,13 @@ static void sev_es_init_vmcb(struct vcpu_svm *svm) * the VMSA will be NULL if this vCPU is the destination for intrahost * migration, and will be copied later. */ - if (svm->sev_es.vmsa) + if (svm->sev_es.vmsa && !svm->sev_es.snp_has_guest_vmsa) svm->vmcb->control.vmsa_pa = __pa(svm->sev_es.vmsa); + if (cpu_feature_enabled(X86_FEATURE_ALLOWED_SEV_FEATURES)) + svm->vmcb->control.allowed_sev_features = sev->vmsa_features | + VMCB_ALLOWED_SEV_FEATURES_VALID; + /* Can't intercept CR register access, HV can't modify CR registers */ svm_clr_intercept(svm, INTERCEPT_CR0_READ); svm_clr_intercept(svm, INTERCEPT_CR4_READ); @@ -3053,7 +4469,7 @@ static void sev_es_init_vmcb(struct vcpu_svm *svm) svm_set_intercept(svm, TRAP_CR8_WRITE); vmcb->control.intercepts[INTERCEPT_DR] = 0; - if (!sev_es_debug_swap_enabled) { + if (!sev_vcpu_has_debug_swap(svm)) { vmcb_set_intercept(&vmcb->control, INTERCEPT_DR7_READ); vmcb_set_intercept(&vmcb->control, INTERCEPT_DR7_WRITE); recalc_intercepts(svm); @@ -3076,10 +4492,6 @@ static void sev_es_init_vmcb(struct vcpu_svm *svm) /* Clear intercepts on selected MSRs */ set_msr_interception(vcpu, svm->msrpm, MSR_EFER, 1, 1); set_msr_interception(vcpu, svm->msrpm, MSR_IA32_CR_PAT, 1, 1); - set_msr_interception(vcpu, svm->msrpm, MSR_IA32_LASTBRANCHFROMIP, 1, 1); - set_msr_interception(vcpu, svm->msrpm, MSR_IA32_LASTBRANCHTOIP, 1, 1); - set_msr_interception(vcpu, svm->msrpm, MSR_IA32_LASTINTFROMIP, 1, 1); - set_msr_interception(vcpu, svm->msrpm, MSR_IA32_LASTINTTOIP, 1, 1); } void sev_init_vmcb(struct vcpu_svm *svm) @@ -3099,17 +4511,24 @@ void sev_init_vmcb(struct vcpu_svm *svm) void sev_es_vcpu_reset(struct vcpu_svm *svm) { + struct kvm_vcpu *vcpu = &svm->vcpu; + struct kvm_sev_info *sev = to_kvm_sev_info(vcpu->kvm); + /* * Set the GHCB MSR value as per the GHCB specification when emulating * vCPU RESET for an SEV-ES guest. */ - set_ghcb_msr(svm, GHCB_MSR_SEV_INFO(GHCB_VERSION_MAX, + set_ghcb_msr(svm, GHCB_MSR_SEV_INFO((__u64)sev->ghcb_version, GHCB_VERSION_MIN, sev_enc_bit)); + + mutex_init(&svm->sev_es.snp_vmsa_mutex); } -void sev_es_prepare_switch_to_guest(struct sev_es_save_area *hostsa) +void sev_es_prepare_switch_to_guest(struct vcpu_svm *svm, struct sev_es_save_area *hostsa) { + struct kvm *kvm = svm->vcpu.kvm; + /* * All host state for SEV-ES guests is categorized into three swap types * based on how it is handled by hardware during a world switch: @@ -3127,20 +4546,28 @@ void sev_es_prepare_switch_to_guest(struct sev_es_save_area *hostsa) * isn't saved by VMRUN, that isn't already saved by VMSAVE (performed * by common SVM code). */ - hostsa->xcr0 = xgetbv(XCR_XFEATURE_ENABLED_MASK); + hostsa->xcr0 = kvm_host.xcr0; hostsa->pkru = read_pkru(); - hostsa->xss = host_xss; + hostsa->xss = kvm_host.xss; /* * If DebugSwap is enabled, debug registers are loaded but NOT saved by - * the CPU (Type-B). If DebugSwap is disabled/unsupported, the CPU both - * saves and loads debug registers (Type-A). - */ - if (sev_es_debug_swap_enabled) { - hostsa->dr0 = native_get_debugreg(0); - hostsa->dr1 = native_get_debugreg(1); - hostsa->dr2 = native_get_debugreg(2); - hostsa->dr3 = native_get_debugreg(3); + * the CPU (Type-B). If DebugSwap is disabled/unsupported, the CPU does + * not save or load debug registers. Sadly, KVM can't prevent SNP + * guests from lying about DebugSwap on secondary vCPUs, i.e. the + * SEV_FEATURES provided at "AP Create" isn't guaranteed to match what + * the guest has actually enabled (or not!) in the VMSA. + * + * If DebugSwap is *possible*, save the masks so that they're restored + * if the guest enables DebugSwap. But for the DRs themselves, do NOT + * rely on the CPU to restore the host values; KVM will restore them as + * needed in common code, via hw_breakpoint_restore(). Note, KVM does + * NOT support virtualizing Breakpoint Extensions, i.e. the mask MSRs + * don't need to be restored per se, KVM just needs to ensure they are + * loaded with the correct values *if* the CPU writes the MSRs. + */ + if (sev_vcpu_has_debug_swap(svm) || + (sev_snp_guest(kvm) && cpu_feature_enabled(X86_FEATURE_DEBUG_SWAP))) { hostsa->dr0_addr_mask = amd_get_dr_addr_mask(0); hostsa->dr1_addr_mask = amd_get_dr_addr_mask(1); hostsa->dr2_addr_mask = amd_get_dr_addr_mask(2); @@ -3158,24 +4585,40 @@ void sev_vcpu_deliver_sipi_vector(struct kvm_vcpu *vcpu, u8 vector) return; } - /* - * Subsequent SIPI: Return from an AP Reset Hold VMGEXIT, where - * the guest will set the CS and RIP. Set SW_EXIT_INFO_2 to a - * non-zero value. - */ - if (!svm->sev_es.ghcb) - return; + /* Subsequent SIPI */ + switch (svm->sev_es.ap_reset_hold_type) { + case AP_RESET_HOLD_NAE_EVENT: + /* + * Return from an AP Reset Hold VMGEXIT, where the guest will + * set the CS and RIP. Set SW_EXIT_INFO_2 to a non-zero value. + */ + svm_vmgexit_success(svm, 1); + break; + case AP_RESET_HOLD_MSR_PROTO: + /* + * Return from an AP Reset Hold VMGEXIT, where the guest will + * set the CS and RIP. Set GHCB data field to a non-zero value. + */ + set_ghcb_msr_bits(svm, 1, + GHCB_MSR_AP_RESET_HOLD_RESULT_MASK, + GHCB_MSR_AP_RESET_HOLD_RESULT_POS); - ghcb_set_sw_exit_info_2(svm->sev_es.ghcb, 1); + set_ghcb_msr_bits(svm, GHCB_MSR_AP_RESET_HOLD_RESP, + GHCB_MSR_INFO_MASK, + GHCB_MSR_INFO_POS); + break; + default: + break; + } } -struct page *snp_safe_alloc_page(struct kvm_vcpu *vcpu) +struct page *snp_safe_alloc_page_node(int node, gfp_t gfp) { unsigned long pfn; struct page *p; - if (!cpu_feature_enabled(X86_FEATURE_SEV_SNP)) - return alloc_page(GFP_KERNEL_ACCOUNT | __GFP_ZERO); + if (!cc_platform_has(CC_ATTR_HOST_SEV_SNP)) + return alloc_pages_node(node, gfp | __GFP_ZERO, 0); /* * Allocate an SNP-safe page to workaround the SNP erratum where @@ -3186,7 +4629,7 @@ struct page *snp_safe_alloc_page(struct kvm_vcpu *vcpu) * Allocate one extra page, choose a page which is not * 2MB-aligned, and free the other. */ - p = alloc_pages(GFP_KERNEL_ACCOUNT | __GFP_ZERO, 1); + p = alloc_pages_node(node, gfp | __GFP_ZERO, 1); if (!p) return NULL; @@ -3200,3 +4643,366 @@ struct page *snp_safe_alloc_page(struct kvm_vcpu *vcpu) return p; } + +void sev_handle_rmp_fault(struct kvm_vcpu *vcpu, gpa_t gpa, u64 error_code) +{ + struct kvm_memory_slot *slot; + struct kvm *kvm = vcpu->kvm; + int order, rmp_level, ret; + struct page *page; + bool assigned; + kvm_pfn_t pfn; + gfn_t gfn; + + gfn = gpa >> PAGE_SHIFT; + + /* + * The only time RMP faults occur for shared pages is when the guest is + * triggering an RMP fault for an implicit page-state change from + * shared->private. Implicit page-state changes are forwarded to + * userspace via KVM_EXIT_MEMORY_FAULT events, however, so RMP faults + * for shared pages should not end up here. + */ + if (!kvm_mem_is_private(kvm, gfn)) { + pr_warn_ratelimited("SEV: Unexpected RMP fault for non-private GPA 0x%llx\n", + gpa); + return; + } + + slot = gfn_to_memslot(kvm, gfn); + if (!kvm_slot_can_be_private(slot)) { + pr_warn_ratelimited("SEV: Unexpected RMP fault, non-private slot for GPA 0x%llx\n", + gpa); + return; + } + + ret = kvm_gmem_get_pfn(kvm, slot, gfn, &pfn, &page, &order); + if (ret) { + pr_warn_ratelimited("SEV: Unexpected RMP fault, no backing page for private GPA 0x%llx\n", + gpa); + return; + } + + ret = snp_lookup_rmpentry(pfn, &assigned, &rmp_level); + if (ret || !assigned) { + pr_warn_ratelimited("SEV: Unexpected RMP fault, no assigned RMP entry found for GPA 0x%llx PFN 0x%llx error %d\n", + gpa, pfn, ret); + goto out_no_trace; + } + + /* + * There are 2 cases where a PSMASH may be needed to resolve an #NPF + * with PFERR_GUEST_RMP_BIT set: + * + * 1) RMPADJUST/PVALIDATE can trigger an #NPF with PFERR_GUEST_SIZEM + * bit set if the guest issues them with a smaller granularity than + * what is indicated by the page-size bit in the 2MB RMP entry for + * the PFN that backs the GPA. + * + * 2) Guest access via NPT can trigger an #NPF if the NPT mapping is + * smaller than what is indicated by the 2MB RMP entry for the PFN + * that backs the GPA. + * + * In both these cases, the corresponding 2M RMP entry needs to + * be PSMASH'd to 512 4K RMP entries. If the RMP entry is already + * split into 4K RMP entries, then this is likely a spurious case which + * can occur when there are concurrent accesses by the guest to a 2MB + * GPA range that is backed by a 2MB-aligned PFN who's RMP entry is in + * the process of being PMASH'd into 4K entries. These cases should + * resolve automatically on subsequent accesses, so just ignore them + * here. + */ + if (rmp_level == PG_LEVEL_4K) + goto out; + + ret = snp_rmptable_psmash(pfn); + if (ret) { + /* + * Look it up again. If it's 4K now then the PSMASH may have + * raced with another process and the issue has already resolved + * itself. + */ + if (!snp_lookup_rmpentry(pfn, &assigned, &rmp_level) && + assigned && rmp_level == PG_LEVEL_4K) + goto out; + + pr_warn_ratelimited("SEV: Unable to split RMP entry for GPA 0x%llx PFN 0x%llx ret %d\n", + gpa, pfn, ret); + } + + kvm_zap_gfn_range(kvm, gfn, gfn + PTRS_PER_PMD); +out: + trace_kvm_rmp_fault(vcpu, gpa, pfn, error_code, rmp_level, ret); +out_no_trace: + kvm_release_page_unused(page); +} + +static bool is_pfn_range_shared(kvm_pfn_t start, kvm_pfn_t end) +{ + kvm_pfn_t pfn = start; + + while (pfn < end) { + int ret, rmp_level; + bool assigned; + + ret = snp_lookup_rmpentry(pfn, &assigned, &rmp_level); + if (ret) { + pr_warn_ratelimited("SEV: Failed to retrieve RMP entry: PFN 0x%llx GFN start 0x%llx GFN end 0x%llx RMP level %d error %d\n", + pfn, start, end, rmp_level, ret); + return false; + } + + if (assigned) { + pr_debug("%s: overlap detected, PFN 0x%llx start 0x%llx end 0x%llx RMP level %d\n", + __func__, pfn, start, end, rmp_level); + return false; + } + + pfn++; + } + + return true; +} + +static u8 max_level_for_order(int order) +{ + if (order >= KVM_HPAGE_GFN_SHIFT(PG_LEVEL_2M)) + return PG_LEVEL_2M; + + return PG_LEVEL_4K; +} + +static bool is_large_rmp_possible(struct kvm *kvm, kvm_pfn_t pfn, int order) +{ + kvm_pfn_t pfn_aligned = ALIGN_DOWN(pfn, PTRS_PER_PMD); + + /* + * If this is a large folio, and the entire 2M range containing the + * PFN is currently shared, then the entire 2M-aligned range can be + * set to private via a single 2M RMP entry. + */ + if (max_level_for_order(order) > PG_LEVEL_4K && + is_pfn_range_shared(pfn_aligned, pfn_aligned + PTRS_PER_PMD)) + return true; + + return false; +} + +int sev_gmem_prepare(struct kvm *kvm, kvm_pfn_t pfn, gfn_t gfn, int max_order) +{ + struct kvm_sev_info *sev = to_kvm_sev_info(kvm); + kvm_pfn_t pfn_aligned; + gfn_t gfn_aligned; + int level, rc; + bool assigned; + + if (!sev_snp_guest(kvm)) + return 0; + + rc = snp_lookup_rmpentry(pfn, &assigned, &level); + if (rc) { + pr_err_ratelimited("SEV: Failed to look up RMP entry: GFN %llx PFN %llx error %d\n", + gfn, pfn, rc); + return -ENOENT; + } + + if (assigned) { + pr_debug("%s: already assigned: gfn %llx pfn %llx max_order %d level %d\n", + __func__, gfn, pfn, max_order, level); + return 0; + } + + if (is_large_rmp_possible(kvm, pfn, max_order)) { + level = PG_LEVEL_2M; + pfn_aligned = ALIGN_DOWN(pfn, PTRS_PER_PMD); + gfn_aligned = ALIGN_DOWN(gfn, PTRS_PER_PMD); + } else { + level = PG_LEVEL_4K; + pfn_aligned = pfn; + gfn_aligned = gfn; + } + + rc = rmp_make_private(pfn_aligned, gfn_to_gpa(gfn_aligned), level, sev->asid, false); + if (rc) { + pr_err_ratelimited("SEV: Failed to update RMP entry: GFN %llx PFN %llx level %d error %d\n", + gfn, pfn, level, rc); + return -EINVAL; + } + + pr_debug("%s: updated: gfn %llx pfn %llx pfn_aligned %llx max_order %d level %d\n", + __func__, gfn, pfn, pfn_aligned, max_order, level); + + return 0; +} + +void sev_gmem_invalidate(kvm_pfn_t start, kvm_pfn_t end) +{ + kvm_pfn_t pfn; + + if (!cc_platform_has(CC_ATTR_HOST_SEV_SNP)) + return; + + pr_debug("%s: PFN start 0x%llx PFN end 0x%llx\n", __func__, start, end); + + for (pfn = start; pfn < end;) { + bool use_2m_update = false; + int rc, rmp_level; + bool assigned; + + rc = snp_lookup_rmpentry(pfn, &assigned, &rmp_level); + if (rc || !assigned) + goto next_pfn; + + use_2m_update = IS_ALIGNED(pfn, PTRS_PER_PMD) && + end >= (pfn + PTRS_PER_PMD) && + rmp_level > PG_LEVEL_4K; + + /* + * If an unaligned PFN corresponds to a 2M region assigned as a + * large page in the RMP table, PSMASH the region into individual + * 4K RMP entries before attempting to convert a 4K sub-page. + */ + if (!use_2m_update && rmp_level > PG_LEVEL_4K) { + /* + * This shouldn't fail, but if it does, report it, but + * still try to update RMP entry to shared and pray this + * was a spurious error that can be addressed later. + */ + rc = snp_rmptable_psmash(pfn); + WARN_ONCE(rc, "SEV: Failed to PSMASH RMP entry for PFN 0x%llx error %d\n", + pfn, rc); + } + + rc = rmp_make_shared(pfn, use_2m_update ? PG_LEVEL_2M : PG_LEVEL_4K); + if (WARN_ONCE(rc, "SEV: Failed to update RMP entry for PFN 0x%llx error %d\n", + pfn, rc)) + goto next_pfn; + + /* + * SEV-ES avoids host/guest cache coherency issues through + * WBINVD hooks issued via MMU notifiers during run-time, and + * KVM's VM destroy path at shutdown. Those MMU notifier events + * don't cover gmem since there is no requirement to map pages + * to a HVA in order to use them for a running guest. While the + * shutdown path would still likely cover things for SNP guests, + * userspace may also free gmem pages during run-time via + * hole-punching operations on the guest_memfd, so flush the + * cache entries for these pages before free'ing them back to + * the host. + */ + clflush_cache_range(__va(pfn_to_hpa(pfn)), + use_2m_update ? PMD_SIZE : PAGE_SIZE); +next_pfn: + pfn += use_2m_update ? PTRS_PER_PMD : 1; + cond_resched(); + } +} + +int sev_private_max_mapping_level(struct kvm *kvm, kvm_pfn_t pfn) +{ + int level, rc; + bool assigned; + + if (!sev_snp_guest(kvm)) + return 0; + + rc = snp_lookup_rmpentry(pfn, &assigned, &level); + if (rc || !assigned) + return PG_LEVEL_4K; + + return level; +} + +struct vmcb_save_area *sev_decrypt_vmsa(struct kvm_vcpu *vcpu) +{ + struct vcpu_svm *svm = to_svm(vcpu); + struct vmcb_save_area *vmsa; + struct kvm_sev_info *sev; + int error = 0; + int ret; + + if (!sev_es_guest(vcpu->kvm)) + return NULL; + + /* + * If the VMSA has not yet been encrypted, return a pointer to the + * current un-encrypted VMSA. + */ + if (!vcpu->arch.guest_state_protected) + return (struct vmcb_save_area *)svm->sev_es.vmsa; + + sev = to_kvm_sev_info(vcpu->kvm); + + /* Check if the SEV policy allows debugging */ + if (sev_snp_guest(vcpu->kvm)) { + if (!(sev->policy & SNP_POLICY_DEBUG)) + return NULL; + } else { + if (sev->policy & SEV_POLICY_NODBG) + return NULL; + } + + if (sev_snp_guest(vcpu->kvm)) { + struct sev_data_snp_dbg dbg = {0}; + + vmsa = snp_alloc_firmware_page(__GFP_ZERO); + if (!vmsa) + return NULL; + + dbg.gctx_paddr = __psp_pa(sev->snp_context); + dbg.src_addr = svm->vmcb->control.vmsa_pa; + dbg.dst_addr = __psp_pa(vmsa); + + ret = sev_do_cmd(SEV_CMD_SNP_DBG_DECRYPT, &dbg, &error); + + /* + * Return the target page to a hypervisor page no matter what. + * If this fails, the page can't be used, so leak it and don't + * try to use it. + */ + if (snp_page_reclaim(vcpu->kvm, PHYS_PFN(__pa(vmsa)))) + return NULL; + + if (ret) { + pr_err("SEV: SNP_DBG_DECRYPT failed ret=%d, fw_error=%d (%#x)\n", + ret, error, error); + free_page((unsigned long)vmsa); + + return NULL; + } + } else { + struct sev_data_dbg dbg = {0}; + struct page *vmsa_page; + + vmsa_page = alloc_page(GFP_KERNEL); + if (!vmsa_page) + return NULL; + + vmsa = page_address(vmsa_page); + + dbg.handle = sev->handle; + dbg.src_addr = svm->vmcb->control.vmsa_pa; + dbg.dst_addr = __psp_pa(vmsa); + dbg.len = PAGE_SIZE; + + ret = sev_do_cmd(SEV_CMD_DBG_DECRYPT, &dbg, &error); + if (ret) { + pr_err("SEV: SEV_CMD_DBG_DECRYPT failed ret=%d, fw_error=%d (0x%x)\n", + ret, error, error); + __free_page(vmsa_page); + + return NULL; + } + } + + return vmsa; +} + +void sev_free_decrypted_vmsa(struct kvm_vcpu *vcpu, struct vmcb_save_area *vmsa) +{ + /* If the VMSA has not yet been encrypted, nothing was allocated */ + if (!vcpu->arch.guest_state_protected || !vmsa) + return; + + free_page((unsigned long)vmsa); +} diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c index d1a9f9951635..ab9b947dbf4f 100644 --- a/arch/x86/kvm/svm/svm.c +++ b/arch/x86/kvm/svm/svm.c @@ -28,8 +28,11 @@ #include <linux/rwsem.h> #include <linux/cc_platform.h> #include <linux/smp.h> +#include <linux/string_choices.h> +#include <linux/mutex.h> #include <asm/apic.h> +#include <asm/msr.h> #include <asm/perf_event.h> #include <asm/tlbflush.h> #include <asm/desc.h> @@ -53,6 +56,7 @@ #include "svm_onhyperv.h" MODULE_AUTHOR("Qumranet"); +MODULE_DESCRIPTION("KVM support for SVM (AMD-V) extensions"); MODULE_LICENSE("GPL"); #ifdef MODULE @@ -99,6 +103,7 @@ static const struct svm_direct_access_msrs { { .index = MSR_IA32_SPEC_CTRL, .always = false }, { .index = MSR_IA32_PRED_CMD, .always = false }, { .index = MSR_IA32_FLUSH_CMD, .always = false }, + { .index = MSR_IA32_DEBUGCTLMSR, .always = false }, { .index = MSR_IA32_LASTBRANCHFROMIP, .always = false }, { .index = MSR_IA32_LASTBRANCHTOIP, .always = false }, { .index = MSR_IA32_LASTINTFROMIP, .always = false }, @@ -215,7 +220,7 @@ int vgif = true; module_param(vgif, int, 0444); /* enable/disable LBR virtualization */ -static int lbrv = true; +int lbrv = true; module_param(lbrv, int, 0444); static int tsc_scaling = true; @@ -228,6 +233,8 @@ module_param(tsc_scaling, int, 0444); static bool avic; module_param(avic, bool, 0444); +module_param(enable_device_posted_irqs, bool, 0444); + bool __read_mostly dump_invalid_vmcb; module_param(dump_invalid_vmcb, bool, 0644); @@ -246,6 +253,8 @@ static unsigned long iopm_base; DEFINE_PER_CPU(struct svm_cpu_data, svm_data); +static DEFINE_MUTEX(vmcb_dump_mutex); + /* * Only MSR_TSC_AUX is switched via the user return hook. EFER is switched via * the VMCB, and the SYSCALL/SYSENTER MSRs are handled by VMLOAD/VMSAVE. @@ -282,8 +291,6 @@ u32 svm_msrpm_offset(u32 msr) return MSR_INVALID; } -static void svm_flush_tlb_current(struct kvm_vcpu *vcpu); - static int get_npt_level(void) { #ifdef CONFIG_X86_64 @@ -474,24 +481,18 @@ static void svm_inject_exception(struct kvm_vcpu *vcpu) static void svm_init_erratum_383(void) { - u32 low, high; - int err; u64 val; if (!static_cpu_has_bug(X86_BUG_AMD_TLB_MMATCH)) return; /* Use _safe variants to not break nested virtualization */ - val = native_read_msr_safe(MSR_AMD64_DC_CFG, &err); - if (err) + if (native_read_msr_safe(MSR_AMD64_DC_CFG, &val)) return; val |= (1ULL << 47); - low = lower_32_bits(val); - high = upper_32_bits(val); - - native_write_msr_safe(MSR_AMD64_DC_CFG, low, high); + native_write_msr_safe(MSR_AMD64_DC_CFG, val); erratum_383_found = true; } @@ -565,34 +566,39 @@ static void __svm_write_tsc_multiplier(u64 multiplier) if (multiplier == __this_cpu_read(current_tsc_ratio)) return; - wrmsrl(MSR_AMD64_TSC_RATIO, multiplier); + wrmsrq(MSR_AMD64_TSC_RATIO, multiplier); __this_cpu_write(current_tsc_ratio, multiplier); } +static __always_inline struct sev_es_save_area *sev_es_host_save_area(struct svm_cpu_data *sd) +{ + return &sd->save_area->host_sev_es_save; +} + static inline void kvm_cpu_svm_disable(void) { uint64_t efer; - wrmsrl(MSR_VM_HSAVE_PA, 0); - rdmsrl(MSR_EFER, efer); + wrmsrq(MSR_VM_HSAVE_PA, 0); + rdmsrq(MSR_EFER, efer); if (efer & EFER_SVME) { /* * Force GIF=1 prior to disabling SVM, e.g. to ensure INIT and * NMI aren't blocked. */ stgi(); - wrmsrl(MSR_EFER, efer & ~EFER_SVME); + wrmsrq(MSR_EFER, efer & ~EFER_SVME); } } -static void svm_emergency_disable(void) +static void svm_emergency_disable_virtualization_cpu(void) { kvm_rebooting = true; kvm_cpu_svm_disable(); } -static void svm_hardware_disable(void) +static void svm_disable_virtualization_cpu(void) { /* Make sure we clean up behind us */ if (tsc_scaling) @@ -603,14 +609,14 @@ static void svm_hardware_disable(void) amd_pmu_disable_virt(); } -static int svm_hardware_enable(void) +static int svm_enable_virtualization_cpu(void) { struct svm_cpu_data *sd; uint64_t efer; int me = raw_smp_processor_id(); - rdmsrl(MSR_EFER, efer); + rdmsrq(MSR_EFER, efer); if (efer & EFER_SVME) return -EBUSY; @@ -620,9 +626,9 @@ static int svm_hardware_enable(void) sd->next_asid = sd->max_asid + 1; sd->min_asid = max_sev_asid + 1; - wrmsrl(MSR_EFER, efer | EFER_SVME); + wrmsrq(MSR_EFER, efer | EFER_SVME); - wrmsrl(MSR_VM_HSAVE_PA, sd->save_area_pa); + wrmsrq(MSR_VM_HSAVE_PA, sd->save_area_pa); if (static_cpu_has(X86_FEATURE_TSCRATEMSR)) { /* @@ -643,13 +649,12 @@ static int svm_hardware_enable(void) * erratum is present everywhere). */ if (cpu_has(&boot_cpu_data, X86_FEATURE_OSVW)) { - uint64_t len, status = 0; + u64 len, status = 0; int err; - len = native_read_msr_safe(MSR_AMD64_OSVW_ID_LENGTH, &err); + err = native_read_msr_safe(MSR_AMD64_OSVW_ID_LENGTH, &len); if (!err) - status = native_read_msr_safe(MSR_AMD64_OSVW_STATUS, - &err); + err = native_read_msr_safe(MSR_AMD64_OSVW_STATUS, &status); if (err) osvw_status = osvw_len = 0; @@ -673,12 +678,9 @@ static int svm_hardware_enable(void) * TSC_AUX field now to avoid a RDMSR on every vCPU run. */ if (boot_cpu_has(X86_FEATURE_V_TSC_AUX)) { - struct sev_es_save_area *hostsa; u32 __maybe_unused msr_hi; - hostsa = (struct sev_es_save_area *)(page_address(sd->save_area) + 0x400); - - rdmsr(MSR_TSC_AUX, hostsa->tsc_aux, msr_hi); + rdmsr(MSR_TSC_AUX, sev_es_host_save_area(sd)->tsc_aux, msr_hi); } return 0; @@ -692,7 +694,7 @@ static void svm_cpu_uninit(int cpu) return; kfree(sd->sev_vmcbs); - __free_page(sd->save_area); + __free_page(__sme_pa_to_page(sd->save_area_pa)); sd->save_area_pa = 0; sd->save_area = NULL; } @@ -700,23 +702,24 @@ static void svm_cpu_uninit(int cpu) static int svm_cpu_init(int cpu) { struct svm_cpu_data *sd = per_cpu_ptr(&svm_data, cpu); + struct page *save_area_page; int ret = -ENOMEM; memset(sd, 0, sizeof(struct svm_cpu_data)); - sd->save_area = snp_safe_alloc_page(NULL); - if (!sd->save_area) + save_area_page = snp_safe_alloc_page_node(cpu_to_node(cpu), GFP_KERNEL); + if (!save_area_page) return ret; ret = sev_cpu_init(sd); if (ret) goto free_save_area; - sd->save_area_pa = __sme_page_pa(sd->save_area); + sd->save_area = page_address(save_area_page); + sd->save_area_pa = __sme_page_pa(save_area_page); return 0; free_save_area: - __free_page(sd->save_area); - sd->save_area = NULL; + __free_page(save_area_page); return ret; } @@ -990,7 +993,7 @@ void svm_copy_lbrs(struct vmcb *to_vmcb, struct vmcb *from_vmcb) vmcb_mark_dirty(to_vmcb, VMCB_LBR); } -static void svm_enable_lbrv(struct kvm_vcpu *vcpu) +void svm_enable_lbrv(struct kvm_vcpu *vcpu) { struct vcpu_svm *svm = to_svm(vcpu); @@ -1000,6 +1003,9 @@ static void svm_enable_lbrv(struct kvm_vcpu *vcpu) set_msr_interception(vcpu, svm->msrpm, MSR_IA32_LASTINTFROMIP, 1, 1); set_msr_interception(vcpu, svm->msrpm, MSR_IA32_LASTINTTOIP, 1, 1); + if (sev_es_guest(vcpu->kvm)) + set_msr_interception(vcpu, svm->msrpm, MSR_IA32_DEBUGCTLMSR, 1, 1); + /* Move the LBR msrs to the vmcb02 so that the guest can see them. */ if (is_guest_mode(vcpu)) svm_copy_lbrs(svm->vmcb, svm->vmcb01.ptr); @@ -1009,6 +1015,8 @@ static void svm_disable_lbrv(struct kvm_vcpu *vcpu) { struct vcpu_svm *svm = to_svm(vcpu); + KVM_BUG_ON(sev_es_guest(vcpu->kvm), vcpu->kvm); + svm->vmcb->control.virt_ext &= ~LBR_CTL_ENABLE_MASK; set_msr_interception(vcpu, svm->msrpm, MSR_IA32_LASTBRANCHFROMIP, 0, 0); set_msr_interception(vcpu, svm->msrpm, MSR_IA32_LASTBRANCHTOIP, 0, 0); @@ -1039,7 +1047,7 @@ void svm_update_lbrv(struct kvm_vcpu *vcpu) struct vcpu_svm *svm = to_svm(vcpu); bool current_enable_lbrv = svm->vmcb->control.virt_ext & LBR_CTL_ENABLE_MASK; bool enable_lbrv = (svm_get_lbr_vmcb(svm)->save.dbgctl & DEBUGCTLMSR_LBR) || - (is_guest_mode(vcpu) && guest_can_use(vcpu, X86_FEATURE_LBRV) && + (is_guest_mode(vcpu) && guest_cpu_cap_has(vcpu, X86_FEATURE_LBRV) && (svm->nested.ctl.virt_ext & LBR_CTL_ENABLE_MASK)); if (enable_lbrv == current_enable_lbrv) @@ -1115,8 +1123,7 @@ static void svm_hardware_unsetup(void) for_each_possible_cpu(cpu) svm_cpu_uninit(cpu); - __free_pages(pfn_to_page(iopm_base >> PAGE_SHIFT), - get_order(IOPM_SIZE)); + __free_pages(__sme_pa_to_page(iopm_base), get_order(IOPM_SIZE)); iopm_base = 0; } @@ -1178,14 +1185,14 @@ static void svm_recalc_instruction_intercepts(struct kvm_vcpu *vcpu, */ if (kvm_cpu_cap_has(X86_FEATURE_INVPCID)) { if (!npt_enabled || - !guest_cpuid_has(&svm->vcpu, X86_FEATURE_INVPCID)) + !guest_cpu_cap_has(&svm->vcpu, X86_FEATURE_INVPCID)) svm_set_intercept(svm, INTERCEPT_INVPCID); else svm_clr_intercept(svm, INTERCEPT_INVPCID); } if (kvm_cpu_cap_has(X86_FEATURE_RDTSCP)) { - if (guest_cpuid_has(vcpu, X86_FEATURE_RDTSCP)) + if (guest_cpu_cap_has(vcpu, X86_FEATURE_RDTSCP)) svm_clr_intercept(svm, INTERCEPT_RDTSCP); else svm_set_intercept(svm, INTERCEPT_RDTSCP); @@ -1196,7 +1203,7 @@ static inline void init_vmcb_after_set_cpuid(struct kvm_vcpu *vcpu) { struct vcpu_svm *svm = to_svm(vcpu); - if (guest_cpuid_is_intel(vcpu)) { + if (guest_cpuid_is_intel_compatible(vcpu)) { /* * We must intercept SYSENTER_EIP and SYSENTER_ESP * accesses because the processor only stores 32 bits. @@ -1289,10 +1296,14 @@ static void init_vmcb(struct kvm_vcpu *vcpu) svm_set_intercept(svm, INTERCEPT_MWAIT); } - if (!kvm_hlt_in_guest(vcpu->kvm)) - svm_set_intercept(svm, INTERCEPT_HLT); + if (!kvm_hlt_in_guest(vcpu->kvm)) { + if (cpu_feature_enabled(X86_FEATURE_IDLE_HLT)) + svm_set_intercept(svm, INTERCEPT_IDLE_HLT); + else + svm_set_intercept(svm, INTERCEPT_HLT); + } - control->iopm_base_pa = __sme_set(iopm_base); + control->iopm_base_pa = iopm_base; control->msrpm_base_pa = __sme_set(__pa(svm->msrpm)); control->int_ctl = V_INTR_MASKING_MASK; @@ -1363,6 +1374,9 @@ static void init_vmcb(struct kvm_vcpu *vcpu) svm->vmcb->control.int_ctl |= V_GIF_ENABLE_MASK; } + if (vcpu->kvm->arch.bus_lock_detection_enabled) + svm_set_intercept(svm, INTERCEPT_BUSLOCK); + if (sev_guest(vcpu->kvm)) sev_init_vmcb(svm); @@ -1381,7 +1395,9 @@ static void __svm_vcpu_reset(struct kvm_vcpu *vcpu) svm_vcpu_init_msrpm(vcpu, svm->msrpm); svm_init_osvw(vcpu); - vcpu->arch.microcode_version = 0x01000065; + + if (kvm_check_has_quirk(vcpu->kvm, KVM_X86_QUIRK_STUFF_FEATURE_MSRS)) + vcpu->arch.microcode_version = 0x01000065; svm->tsc_ratio_msr = kvm_caps.default_tsc_scaling_ratio; svm->nmi_masked = false; @@ -1398,6 +1414,9 @@ static void svm_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event) svm->spec_ctrl = 0; svm->virt_spec_ctrl = 0; + if (init_event) + sev_snp_init_protected_guest_state(vcpu); + init_vmcb(vcpu); if (!init_event) @@ -1421,7 +1440,7 @@ static int svm_vcpu_create(struct kvm_vcpu *vcpu) svm = to_svm(vcpu); err = -ENOMEM; - vmcb01_page = snp_safe_alloc_page(vcpu); + vmcb01_page = snp_safe_alloc_page(); if (!vmcb01_page) goto out; @@ -1430,17 +1449,9 @@ static int svm_vcpu_create(struct kvm_vcpu *vcpu) * SEV-ES guests require a separate VMSA page used to contain * the encrypted register state of the guest. */ - vmsa_page = snp_safe_alloc_page(vcpu); + vmsa_page = snp_safe_alloc_page(); if (!vmsa_page) goto error_free_vmcb_page; - - /* - * SEV-ES guests maintain an encrypted version of their FPU - * state which is restored and saved on VMRUN and VMEXIT. - * Mark vcpu->arch.guest_fpu->fpstate as scratch so it won't - * do xsave/xrstor on it. - */ - fpstate_set_confidential(&vcpu->arch.guest_fpu); } err = avic_init_vcpu(svm); @@ -1475,33 +1486,75 @@ out: return err; } -static void svm_clear_current_vmcb(struct vmcb *vmcb) +static void svm_vcpu_free(struct kvm_vcpu *vcpu) { - int i; + struct vcpu_svm *svm = to_svm(vcpu); + + svm_leave_nested(vcpu); + svm_free_nested(svm); + + sev_free_vcpu(vcpu); - for_each_online_cpu(i) - cmpxchg(per_cpu_ptr(&svm_data.current_vmcb, i), vmcb, NULL); + __free_page(__sme_pa_to_page(svm->vmcb01.pa)); + __free_pages(virt_to_page(svm->msrpm), get_order(MSRPM_SIZE)); } -static void svm_vcpu_free(struct kvm_vcpu *vcpu) +#ifdef CONFIG_CPU_MITIGATIONS +static DEFINE_SPINLOCK(srso_lock); +static atomic_t srso_nr_vms; + +static void svm_srso_clear_bp_spec_reduce(void *ign) { - struct vcpu_svm *svm = to_svm(vcpu); + struct svm_cpu_data *sd = this_cpu_ptr(&svm_data); + + if (!sd->bp_spec_reduce_set) + return; + + msr_clear_bit(MSR_ZEN4_BP_CFG, MSR_ZEN4_BP_CFG_BP_SPEC_REDUCE_BIT); + sd->bp_spec_reduce_set = false; +} + +static void svm_srso_vm_destroy(void) +{ + if (!cpu_feature_enabled(X86_FEATURE_SRSO_BP_SPEC_REDUCE)) + return; + + if (atomic_dec_return(&srso_nr_vms)) + return; + + guard(spinlock)(&srso_lock); /* - * The vmcb page can be recycled, causing a false negative in - * svm_vcpu_load(). So, ensure that no logical CPU has this - * vmcb page recorded as its current vmcb. + * Verify a new VM didn't come along, acquire the lock, and increment + * the count before this task acquired the lock. */ - svm_clear_current_vmcb(svm->vmcb); + if (atomic_read(&srso_nr_vms)) + return; - svm_leave_nested(vcpu); - svm_free_nested(svm); + on_each_cpu(svm_srso_clear_bp_spec_reduce, NULL, 1); +} - sev_free_vcpu(vcpu); +static void svm_srso_vm_init(void) +{ + if (!cpu_feature_enabled(X86_FEATURE_SRSO_BP_SPEC_REDUCE)) + return; - __free_page(pfn_to_page(__sme_clr(svm->vmcb01.pa) >> PAGE_SHIFT)); - __free_pages(virt_to_page(svm->msrpm), get_order(MSRPM_SIZE)); + /* + * Acquire the lock on 0 => 1 transitions to ensure a potential 1 => 0 + * transition, i.e. destroying the last VM, is fully complete, e.g. so + * that a delayed IPI doesn't clear BP_SPEC_REDUCE after a vCPU runs. + */ + if (atomic_inc_not_zero(&srso_nr_vms)) + return; + + guard(spinlock)(&srso_lock); + + atomic_inc(&srso_nr_vms); } +#else +static void svm_srso_vm_init(void) { } +static void svm_srso_vm_destroy(void) { } +#endif static void svm_prepare_switch_to_guest(struct kvm_vcpu *vcpu) { @@ -1519,12 +1572,8 @@ static void svm_prepare_switch_to_guest(struct kvm_vcpu *vcpu) * or subsequent vmload of host save area. */ vmsave(sd->save_area_pa); - if (sev_es_guest(vcpu->kvm)) { - struct sev_es_save_area *hostsa; - hostsa = (struct sev_es_save_area *)(page_address(sd->save_area) + 0x400); - - sev_es_prepare_switch_to_guest(hostsa); - } + if (sev_es_guest(vcpu->kvm)) + sev_es_prepare_switch_to_guest(svm, sev_es_host_save_area(sd)); if (tsc_scaling) __svm_write_tsc_multiplier(vcpu->arch.tsc_scaling_ratio); @@ -1533,12 +1582,17 @@ static void svm_prepare_switch_to_guest(struct kvm_vcpu *vcpu) * TSC_AUX is always virtualized for SEV-ES guests when the feature is * available. The user return MSR support is not required in this case * because TSC_AUX is restored on #VMEXIT from the host save area - * (which has been initialized in svm_hardware_enable()). + * (which has been initialized in svm_enable_virtualization_cpu()). */ if (likely(tsc_aux_uret_slot >= 0) && (!boot_cpu_has(X86_FEATURE_V_TSC_AUX) || !sev_es_guest(vcpu->kvm))) kvm_set_user_return_msr(tsc_aux_uret_slot, svm->tsc_aux, -1ull); + if (cpu_feature_enabled(X86_FEATURE_SRSO_BP_SPEC_REDUCE) && + !sd->bp_spec_reduce_set) { + sd->bp_spec_reduce_set = true; + msr_set_bit(MSR_ZEN4_BP_CFG, MSR_ZEN4_BP_CFG_BP_SPEC_REDUCE_BIT); + } svm->guest_state_loaded = true; } @@ -1549,15 +1603,9 @@ static void svm_prepare_host_switch(struct kvm_vcpu *vcpu) static void svm_vcpu_load(struct kvm_vcpu *vcpu, int cpu) { - struct vcpu_svm *svm = to_svm(vcpu); - struct svm_cpu_data *sd = per_cpu_ptr(&svm_data, cpu); - - if (sd->current_vmcb != svm->vmcb) { - sd->current_vmcb = svm->vmcb; + if (vcpu->scheduled_out && !kvm_pause_in_guest(vcpu->kvm)) + shrink_ple_window(vcpu); - if (!cpu_feature_enabled(X86_FEATURE_IBPB_ON_VMEXIT)) - indirect_branch_prediction_barrier(); - } if (kvm_vcpu_apicv_active(vcpu)) avic_vcpu_load(vcpu, cpu); } @@ -1916,9 +1964,6 @@ void svm_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4) unsigned long host_cr4_mce = cr4_read_shadow() & X86_CR4_MCE; unsigned long old_cr4 = vcpu->arch.cr4; - if (npt_enabled && ((old_cr4 ^ cr4) & X86_CR4_PGE)) - svm_flush_tlb_current(vcpu); - vcpu->arch.cr4 = cr4; if (!npt_enabled) { cr4 |= X86_CR4_PAE; @@ -1931,7 +1976,7 @@ void svm_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4) vmcb_mark_dirty(to_svm(vcpu)->vmcb, VMCB_CR); if ((cr4 ^ old_cr4) & (X86_CR4_OSXSAVE | X86_CR4_PKE)) - kvm_update_cpuid_runtime(vcpu); + vcpu->arch.cpuid_dynamic_bits_dirty = true; } static void svm_set_segment(struct kvm_vcpu *vcpu, @@ -1990,11 +2035,11 @@ static void new_asid(struct vcpu_svm *svm, struct svm_cpu_data *sd) svm->asid = sd->next_asid++; } -static void svm_set_dr6(struct vcpu_svm *svm, unsigned long value) +static void svm_set_dr6(struct kvm_vcpu *vcpu, unsigned long value) { - struct vmcb *vmcb = svm->vmcb; + struct vmcb *vmcb = to_svm(vcpu)->vmcb; - if (svm->vcpu.arch.guest_state_protected) + if (vcpu->arch.guest_state_protected) return; if (unlikely(value != vmcb->save.dr6)) { @@ -2051,15 +2096,33 @@ static int pf_interception(struct kvm_vcpu *vcpu) static int npf_interception(struct kvm_vcpu *vcpu) { struct vcpu_svm *svm = to_svm(vcpu); + int rc; u64 fault_address = svm->vmcb->control.exit_info_2; u64 error_code = svm->vmcb->control.exit_info_1; + /* + * WARN if hardware generates a fault with an error code that collides + * with KVM-defined sythentic flags. Clear the flags and continue on, + * i.e. don't terminate the VM, as KVM can't possibly be relying on a + * flag that KVM doesn't know about. + */ + if (WARN_ON_ONCE(error_code & PFERR_SYNTHETIC_MASK)) + error_code &= ~PFERR_SYNTHETIC_MASK; + + if (sev_snp_guest(vcpu->kvm) && (error_code & PFERR_GUEST_ENC_MASK)) + error_code |= PFERR_PRIVATE_ACCESS; + trace_kvm_page_fault(vcpu, fault_address, error_code); - return kvm_mmu_page_fault(vcpu, fault_address, error_code, - static_cpu_has(X86_FEATURE_DECODEASSISTS) ? - svm->vmcb->control.insn_bytes : NULL, - svm->vmcb->control.insn_len); + rc = kvm_mmu_page_fault(vcpu, fault_address, error_code, + static_cpu_has(X86_FEATURE_DECODEASSISTS) ? + svm->vmcb->control.insn_bytes : NULL, + svm->vmcb->control.insn_len); + + if (rc > 0 && error_code & PFERR_GUEST_RMP_MASK) + sev_handle_rmp_fault(vcpu, fault_address, error_code); + + return rc; } static int db_interception(struct kvm_vcpu *vcpu) @@ -2119,14 +2182,13 @@ static int ac_interception(struct kvm_vcpu *vcpu) static bool is_erratum_383(void) { - int err, i; + int i; u64 value; if (!erratum_383_found) return false; - value = native_read_msr_safe(MSR_IA32_MC0_STATUS, &err); - if (err) + if (native_read_msr_safe(MSR_IA32_MC0_STATUS, &value)) return false; /* Bit 62 may or may not be set for this mce */ @@ -2137,17 +2199,11 @@ static bool is_erratum_383(void) /* Clear MCi_STATUS registers */ for (i = 0; i < 6; ++i) - native_write_msr_safe(MSR_IA32_MCx_STATUS(i), 0, 0); - - value = native_read_msr_safe(MSR_IA32_MCG_STATUS, &err); - if (!err) { - u32 low, high; + native_write_msr_safe(MSR_IA32_MCx_STATUS(i), 0); + if (!native_read_msr_safe(MSR_IA32_MCG_STATUS, &value)) { value &= ~(1ULL << 2); - low = lower_32_bits(value); - high = upper_32_bits(value); - - native_write_msr_safe(MSR_IA32_MCG_STATUS, low, high); + native_write_msr_safe(MSR_IA32_MCG_STATUS, value); } /* Flush tlb to evict multi-match entries */ @@ -2201,6 +2257,10 @@ static int shutdown_interception(struct kvm_vcpu *vcpu) */ if (!sev_es_guest(vcpu->kvm)) { clear_page(svm->vmcb); +#ifdef CONFIG_KVM_SMM + if (is_smm(vcpu)) + kvm_smm_changed(vcpu, false); +#endif kvm_vcpu_reset(vcpu, true); } @@ -2278,7 +2338,7 @@ static int vmload_vmsave_interception(struct kvm_vcpu *vcpu, bool vmload) svm_copy_vmloadsave_state(vmcb12, svm->vmcb); } - kvm_vcpu_unmap(vcpu, &map, true); + kvm_vcpu_unmap(vcpu, &map); return ret; } @@ -2804,30 +2864,44 @@ static int efer_trap(struct kvm_vcpu *vcpu) return kvm_complete_insn_gp(vcpu, ret); } -static int svm_get_msr_feature(struct kvm_msr_entry *msr) +static int svm_get_feature_msr(u32 msr, u64 *data) { - msr->data = 0; + *data = 0; - switch (msr->index) { + switch (msr) { case MSR_AMD64_DE_CFG: if (cpu_feature_enabled(X86_FEATURE_LFENCE_RDTSC)) - msr->data |= MSR_AMD64_DE_CFG_LFENCE_SERIALIZE; + *data |= MSR_AMD64_DE_CFG_LFENCE_SERIALIZE; break; default: - return KVM_MSR_RET_INVALID; + return KVM_MSR_RET_UNSUPPORTED; } return 0; } +static bool +sev_es_prevent_msr_access(struct kvm_vcpu *vcpu, struct msr_data *msr_info) +{ + return sev_es_guest(vcpu->kvm) && + vcpu->arch.guest_state_protected && + svm_msrpm_offset(msr_info->index) != MSR_INVALID && + !msr_write_intercepted(vcpu, msr_info->index); +} + static int svm_get_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info) { struct vcpu_svm *svm = to_svm(vcpu); + if (sev_es_prevent_msr_access(vcpu, msr_info)) { + msr_info->data = 0; + return vcpu->kvm->arch.has_protected_state ? -EINVAL : 0; + } + switch (msr_info->index) { case MSR_AMD64_TSC_RATIO: if (!msr_info->host_initiated && - !guest_can_use(vcpu, X86_FEATURE_TSCRATEMSR)) + !guest_cpu_cap_has(vcpu, X86_FEATURE_TSCRATEMSR)) return 1; msr_info->data = svm->tsc_ratio_msr; break; @@ -2841,6 +2915,12 @@ static int svm_get_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info) case MSR_CSTAR: msr_info->data = svm->vmcb01.ptr->save.cstar; break; + case MSR_GS_BASE: + msr_info->data = svm->vmcb01.ptr->save.gs.base; + break; + case MSR_FS_BASE: + msr_info->data = svm->vmcb01.ptr->save.fs.base; + break; case MSR_KERNEL_GS_BASE: msr_info->data = svm->vmcb01.ptr->save.kernel_gs_base; break; @@ -2853,12 +2933,12 @@ static int svm_get_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info) break; case MSR_IA32_SYSENTER_EIP: msr_info->data = (u32)svm->vmcb01.ptr->save.sysenter_eip; - if (guest_cpuid_is_intel(vcpu)) + if (guest_cpuid_is_intel_compatible(vcpu)) msr_info->data |= (u64)svm->sysenter_eip_hi << 32; break; case MSR_IA32_SYSENTER_ESP: msr_info->data = svm->vmcb01.ptr->save.sysenter_esp; - if (guest_cpuid_is_intel(vcpu)) + if (guest_cpuid_is_intel_compatible(vcpu)) msr_info->data |= (u64)svm->sysenter_esp_hi << 32; break; case MSR_TSC_AUX: @@ -2897,7 +2977,7 @@ static int svm_get_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info) break; case MSR_AMD64_VIRT_SPEC_CTRL: if (!msr_info->host_initiated && - !guest_cpuid_has(vcpu, X86_FEATURE_VIRT_SSBD)) + !guest_cpu_cap_has(vcpu, X86_FEATURE_VIRT_SSBD)) return 1; msr_info->data = svm->virt_spec_ctrl; @@ -2934,11 +3014,7 @@ static int svm_complete_emulated_msr(struct kvm_vcpu *vcpu, int err) if (!err || !sev_es_guest(vcpu->kvm) || WARN_ON_ONCE(!svm->sev_es.ghcb)) return kvm_complete_insn_gp(vcpu, err); - ghcb_set_sw_exit_info_1(svm->sev_es.ghcb, 1); - ghcb_set_sw_exit_info_2(svm->sev_es.ghcb, - X86_TRAP_GP | - SVM_EVTINJ_TYPE_EXEPT | - SVM_EVTINJ_VALID); + svm_vmgexit_inject_exception(svm, X86_TRAP_GP); return 1; } @@ -2974,10 +3050,14 @@ static int svm_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr) u32 ecx = msr->index; u64 data = msr->data; + + if (sev_es_prevent_msr_access(vcpu, msr)) + return vcpu->kvm->arch.has_protected_state ? -EINVAL : 0; + switch (ecx) { case MSR_AMD64_TSC_RATIO: - if (!guest_can_use(vcpu, X86_FEATURE_TSCRATEMSR)) { + if (!guest_cpu_cap_has(vcpu, X86_FEATURE_TSCRATEMSR)) { if (!msr->host_initiated) return 1; @@ -2999,7 +3079,7 @@ static int svm_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr) svm->tsc_ratio_msr = data; - if (guest_can_use(vcpu, X86_FEATURE_TSCRATEMSR) && + if (guest_cpu_cap_has(vcpu, X86_FEATURE_TSCRATEMSR) && is_guest_mode(vcpu)) nested_svm_update_tsc_ratio_msr(vcpu); @@ -3044,7 +3124,7 @@ static int svm_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr) break; case MSR_AMD64_VIRT_SPEC_CTRL: if (!msr->host_initiated && - !guest_cpuid_has(vcpu, X86_FEATURE_VIRT_SSBD)) + !guest_cpu_cap_has(vcpu, X86_FEATURE_VIRT_SSBD)) return 1; if (data & ~SPEC_CTRL_SSBD) @@ -3062,6 +3142,12 @@ static int svm_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr) case MSR_CSTAR: svm->vmcb01.ptr->save.cstar = data; break; + case MSR_GS_BASE: + svm->vmcb01.ptr->save.gs.base = data; + break; + case MSR_FS_BASE: + svm->vmcb01.ptr->save.fs.base = data; + break; case MSR_KERNEL_GS_BASE: svm->vmcb01.ptr->save.kernel_gs_base = data; break; @@ -3081,11 +3167,11 @@ static int svm_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr) * 32 bit part of these msrs to support Intel's * implementation of SYSENTER/SYSEXIT. */ - svm->sysenter_eip_hi = guest_cpuid_is_intel(vcpu) ? (data >> 32) : 0; + svm->sysenter_eip_hi = guest_cpuid_is_intel_compatible(vcpu) ? (data >> 32) : 0; break; case MSR_IA32_SYSENTER_ESP: svm->vmcb01.ptr->save.sysenter_esp = (u32)data; - svm->sysenter_esp_hi = guest_cpuid_is_intel(vcpu) ? (data >> 32) : 0; + svm->sysenter_esp_hi = guest_cpuid_is_intel_compatible(vcpu) ? (data >> 32) : 0; break; case MSR_TSC_AUX: /* @@ -3093,7 +3179,7 @@ static int svm_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr) * feature is available. The user return MSR support is not * required in this case because TSC_AUX is restored on #VMEXIT * from the host save area (which has been initialized in - * svm_hardware_enable()). + * svm_enable_virtualization_cpu()). */ if (boot_cpu_has(X86_FEATURE_V_TSC_AUX) && sev_es_guest(vcpu->kvm)) break; @@ -3116,6 +3202,16 @@ static int svm_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr) kvm_pr_unimpl_wrmsr(vcpu, ecx, data); break; } + + /* + * Suppress BTF as KVM doesn't virtualize BTF, but there's no + * way to communicate lack of support to the guest. + */ + if (data & DEBUGCTLMSR_BTF) { + kvm_pr_unimpl_wrmsr(vcpu, MSR_IA32_DEBUGCTLMSR, data); + data &= ~DEBUGCTLMSR_BTF; + } + if (data & DEBUGCTL_RESERVED_BITS) return 1; @@ -3140,18 +3236,12 @@ static int svm_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr) kvm_pr_unimpl_wrmsr(vcpu, ecx, data); break; case MSR_AMD64_DE_CFG: { - struct kvm_msr_entry msr_entry; - - msr_entry.index = msr->index; - if (svm_get_msr_feature(&msr_entry)) - return 1; + u64 supported_de_cfg; - /* Check the supported bits */ - if (data & ~msr_entry.data) + if (svm_get_feature_msr(ecx, &supported_de_cfg)) return 1; - /* Don't allow the guest to change a bit, #GP */ - if (!msr->host_initiated && (data ^ msr_entry.data)) + if (data & ~supported_de_cfg) return 1; svm->msr_decfg = data; @@ -3216,7 +3306,7 @@ static int invpcid_interception(struct kvm_vcpu *vcpu) unsigned long type; gva_t gva; - if (!guest_cpuid_has(vcpu, X86_FEATURE_INVPCID)) { + if (!guest_cpu_cap_has(vcpu, X86_FEATURE_INVPCID)) { kvm_queue_exception(vcpu, UD_VECTOR); return 1; } @@ -3229,9 +3319,51 @@ static int invpcid_interception(struct kvm_vcpu *vcpu) type = svm->vmcb->control.exit_info_2; gva = svm->vmcb->control.exit_info_1; + /* + * FIXME: Perform segment checks for 32-bit mode, and inject #SS if the + * stack segment is used. The intercept takes priority over all + * #GP checks except CPL>0, but somehow still generates a linear + * address? The APM is sorely lacking. + */ + if (is_noncanonical_address(gva, vcpu, 0)) { + kvm_queue_exception_e(vcpu, GP_VECTOR, 0); + return 1; + } + return kvm_handle_invpcid(vcpu, type, gva); } +static inline int complete_userspace_buslock(struct kvm_vcpu *vcpu) +{ + struct vcpu_svm *svm = to_svm(vcpu); + + /* + * If userspace has NOT changed RIP, then KVM's ABI is to let the guest + * execute the bus-locking instruction. Set the bus lock counter to '1' + * to effectively step past the bus lock. + */ + if (kvm_is_linear_rip(vcpu, vcpu->arch.cui_linear_rip)) + svm->vmcb->control.bus_lock_counter = 1; + + return 1; +} + +static int bus_lock_exit(struct kvm_vcpu *vcpu) +{ + struct vcpu_svm *svm = to_svm(vcpu); + + vcpu->run->exit_reason = KVM_EXIT_X86_BUS_LOCK; + vcpu->run->flags |= KVM_RUN_X86_BUS_LOCK; + + vcpu->arch.cui_linear_rip = kvm_get_linear_rip(vcpu); + vcpu->arch.complete_userspace_io = complete_userspace_buslock; + + if (is_guest_mode(vcpu)) + svm->nested.ctl.bus_lock_rip = vcpu->arch.cui_linear_rip; + + return 0; +} + static int (*const svm_exit_handlers[])(struct kvm_vcpu *vcpu) = { [SVM_EXIT_READ_CR0] = cr_interception, [SVM_EXIT_READ_CR3] = cr_interception, @@ -3299,11 +3431,15 @@ static int (*const svm_exit_handlers[])(struct kvm_vcpu *vcpu) = { [SVM_EXIT_CR4_WRITE_TRAP] = cr_trap, [SVM_EXIT_CR8_WRITE_TRAP] = cr_trap, [SVM_EXIT_INVPCID] = invpcid_interception, + [SVM_EXIT_IDLE_HLT] = kvm_emulate_halt, [SVM_EXIT_NPF] = npf_interception, + [SVM_EXIT_BUS_LOCK] = bus_lock_exit, [SVM_EXIT_RSM] = rsm_interception, [SVM_EXIT_AVIC_INCOMPLETE_IPI] = avic_incomplete_ipi_interception, [SVM_EXIT_AVIC_UNACCELERATED_ACCESS] = avic_unaccelerated_access_interception, +#ifdef CONFIG_KVM_AMD_SEV [SVM_EXIT_VMGEXIT] = sev_handle_vmgexit, +#endif }; static void dump_vmcb(struct kvm_vcpu *vcpu) @@ -3312,14 +3448,21 @@ static void dump_vmcb(struct kvm_vcpu *vcpu) struct vmcb_control_area *control = &svm->vmcb->control; struct vmcb_save_area *save = &svm->vmcb->save; struct vmcb_save_area *save01 = &svm->vmcb01.ptr->save; + char *vm_type; if (!dump_invalid_vmcb) { pr_warn_ratelimited("set kvm_amd.dump_invalid_vmcb=1 to dump internal KVM state.\n"); return; } - pr_err("VMCB %p, last attempted VMRUN on CPU %d\n", - svm->current_vmcb->ptr, vcpu->arch.last_vmentry_cpu); + guard(mutex)(&vmcb_dump_mutex); + + vm_type = sev_snp_guest(vcpu->kvm) ? "SEV-SNP" : + sev_es_guest(vcpu->kvm) ? "SEV-ES" : + sev_guest(vcpu->kvm) ? "SEV" : "SVM"; + + pr_err("%s vCPU%u VMCB %p, last attempted VMRUN on CPU %d\n", + vm_type, vcpu->vcpu_id, svm->current_vmcb->ptr, vcpu->arch.last_vmentry_cpu); pr_err("VMCB Control Area:\n"); pr_err("%-20s%04x\n", "cr_read:", control->intercepts[INTERCEPT_CR] & 0xffff); pr_err("%-20s%04x\n", "cr_write:", control->intercepts[INTERCEPT_CR] >> 16); @@ -3357,6 +3500,17 @@ static void dump_vmcb(struct kvm_vcpu *vcpu) pr_err("%-20s%016llx\n", "avic_logical_id:", control->avic_logical_id); pr_err("%-20s%016llx\n", "avic_physical_id:", control->avic_physical_id); pr_err("%-20s%016llx\n", "vmsa_pa:", control->vmsa_pa); + pr_err("%-20s%016llx\n", "allowed_sev_features:", control->allowed_sev_features); + pr_err("%-20s%016llx\n", "guest_sev_features:", control->guest_sev_features); + + if (sev_es_guest(vcpu->kvm)) { + save = sev_decrypt_vmsa(vcpu); + if (!save) + goto no_vmsa; + + save01 = save; + } + pr_err("VMCB State Save Area:\n"); pr_err("%-5s s: %04x a: %04x l: %08x b: %016llx\n", "es:", @@ -3427,6 +3581,63 @@ static void dump_vmcb(struct kvm_vcpu *vcpu) pr_err("%-15s %016llx %-13s %016llx\n", "excp_from:", save->last_excp_from, "excp_to:", save->last_excp_to); + + if (sev_es_guest(vcpu->kvm)) { + struct sev_es_save_area *vmsa = (struct sev_es_save_area *)save; + + pr_err("%-15s %016llx\n", + "sev_features", vmsa->sev_features); + + pr_err("%-15s %016llx %-13s %016llx\n", + "rax:", vmsa->rax, "rbx:", vmsa->rbx); + pr_err("%-15s %016llx %-13s %016llx\n", + "rcx:", vmsa->rcx, "rdx:", vmsa->rdx); + pr_err("%-15s %016llx %-13s %016llx\n", + "rsi:", vmsa->rsi, "rdi:", vmsa->rdi); + pr_err("%-15s %016llx %-13s %016llx\n", + "rbp:", vmsa->rbp, "rsp:", vmsa->rsp); + pr_err("%-15s %016llx %-13s %016llx\n", + "r8:", vmsa->r8, "r9:", vmsa->r9); + pr_err("%-15s %016llx %-13s %016llx\n", + "r10:", vmsa->r10, "r11:", vmsa->r11); + pr_err("%-15s %016llx %-13s %016llx\n", + "r12:", vmsa->r12, "r13:", vmsa->r13); + pr_err("%-15s %016llx %-13s %016llx\n", + "r14:", vmsa->r14, "r15:", vmsa->r15); + pr_err("%-15s %016llx %-13s %016llx\n", + "xcr0:", vmsa->xcr0, "xss:", vmsa->xss); + } else { + pr_err("%-15s %016llx %-13s %016lx\n", + "rax:", save->rax, "rbx:", + vcpu->arch.regs[VCPU_REGS_RBX]); + pr_err("%-15s %016lx %-13s %016lx\n", + "rcx:", vcpu->arch.regs[VCPU_REGS_RCX], + "rdx:", vcpu->arch.regs[VCPU_REGS_RDX]); + pr_err("%-15s %016lx %-13s %016lx\n", + "rsi:", vcpu->arch.regs[VCPU_REGS_RSI], + "rdi:", vcpu->arch.regs[VCPU_REGS_RDI]); + pr_err("%-15s %016lx %-13s %016llx\n", + "rbp:", vcpu->arch.regs[VCPU_REGS_RBP], + "rsp:", save->rsp); +#ifdef CONFIG_X86_64 + pr_err("%-15s %016lx %-13s %016lx\n", + "r8:", vcpu->arch.regs[VCPU_REGS_R8], + "r9:", vcpu->arch.regs[VCPU_REGS_R9]); + pr_err("%-15s %016lx %-13s %016lx\n", + "r10:", vcpu->arch.regs[VCPU_REGS_R10], + "r11:", vcpu->arch.regs[VCPU_REGS_R11]); + pr_err("%-15s %016lx %-13s %016lx\n", + "r12:", vcpu->arch.regs[VCPU_REGS_R12], + "r13:", vcpu->arch.regs[VCPU_REGS_R13]); + pr_err("%-15s %016lx %-13s %016lx\n", + "r14:", vcpu->arch.regs[VCPU_REGS_R14], + "r15:", vcpu->arch.regs[VCPU_REGS_R15]); +#endif + } + +no_vmsa: + if (sev_es_guest(vcpu->kvm)) + sev_free_decrypted_vmsa(vcpu, save); } static bool svm_check_exit_valid(u64 exit_code) @@ -3459,10 +3670,14 @@ int svm_invoke_exit_handler(struct kvm_vcpu *vcpu, u64 exit_code) return interrupt_window_interception(vcpu); else if (exit_code == SVM_EXIT_INTR) return intr_interception(vcpu); - else if (exit_code == SVM_EXIT_HLT) + else if (exit_code == SVM_EXIT_HLT || exit_code == SVM_EXIT_IDLE_HLT) return kvm_emulate_halt(vcpu); else if (exit_code == SVM_EXIT_NPF) return npf_interception(vcpu); +#ifdef CONFIG_KVM_AMD_SEV + else if (exit_code == SVM_EXIT_VMGEXIT) + return sev_handle_vmgexit(vcpu); +#endif #endif return svm_exit_handlers[exit_code](vcpu); } @@ -3484,6 +3699,21 @@ static void svm_get_exit_info(struct kvm_vcpu *vcpu, u32 *reason, *error_code = 0; } +static void svm_get_entry_info(struct kvm_vcpu *vcpu, u32 *intr_info, + u32 *error_code) +{ + struct vmcb_control_area *control = &to_svm(vcpu)->vmcb->control; + + *intr_info = control->event_inj; + + if ((*intr_info & SVM_EXITINTINFO_VALID) && + (*intr_info & SVM_EXITINTINFO_VALID_ERR)) + *error_code = control->event_inj_err; + else + *error_code = 0; + +} + static int svm_handle_exit(struct kvm_vcpu *vcpu, fastpath_t exit_fastpath) { struct vcpu_svm *svm = to_svm(vcpu); @@ -3527,7 +3757,7 @@ static int svm_handle_exit(struct kvm_vcpu *vcpu, fastpath_t exit_fastpath) return svm_invoke_exit_handler(vcpu, exit_code); } -static void pre_svm_run(struct kvm_vcpu *vcpu) +static int pre_svm_run(struct kvm_vcpu *vcpu) { struct svm_cpu_data *sd = per_cpu_ptr(&svm_data, vcpu->cpu); struct vcpu_svm *svm = to_svm(vcpu); @@ -3549,6 +3779,8 @@ static void pre_svm_run(struct kvm_vcpu *vcpu) /* FIXME: handle wraparound of asid_generation */ if (svm->current_vmcb->asid_generation != sd->asid_generation) new_asid(svm, sd); + + return 0; } static void svm_inject_nmi(struct kvm_vcpu *vcpu) @@ -3842,16 +4074,27 @@ static void svm_enable_nmi_window(struct kvm_vcpu *vcpu) struct vcpu_svm *svm = to_svm(vcpu); /* - * KVM should never request an NMI window when vNMI is enabled, as KVM - * allows at most one to-be-injected NMI and one pending NMI, i.e. if - * two NMIs arrive simultaneously, KVM will inject one and set - * V_NMI_PENDING for the other. WARN, but continue with the standard - * single-step approach to try and salvage the pending NMI. + * If NMIs are outright masked, i.e. the vCPU is already handling an + * NMI, and KVM has not yet intercepted an IRET, then there is nothing + * more to do at this time as KVM has already enabled IRET intercepts. + * If KVM has already intercepted IRET, then single-step over the IRET, + * as NMIs aren't architecturally unmasked until the IRET completes. + * + * If vNMI is enabled, KVM should never request an NMI window if NMIs + * are masked, as KVM allows at most one to-be-injected NMI and one + * pending NMI. If two NMIs arrive simultaneously, KVM will inject one + * NMI and set V_NMI_PENDING for the other, but if and only if NMIs are + * unmasked. KVM _will_ request an NMI window in some situations, e.g. + * if the vCPU is in an STI shadow or if GIF=0, KVM can't immediately + * inject the NMI. In those situations, KVM needs to single-step over + * the STI shadow or intercept STGI. */ - WARN_ON_ONCE(is_vnmi_enabled(svm)); + if (svm_get_nmi_mask(vcpu)) { + WARN_ON_ONCE(is_vnmi_enabled(svm)); - if (svm_get_nmi_mask(vcpu) && !svm->awaiting_iret_completion) - return; /* IRET will cause a vm exit */ + if (!svm->awaiting_iret_completion) + return; /* IRET will cause a vm exit */ + } /* * SEV-ES guests are responsible for signaling when a vCPU is ready to @@ -4045,20 +4288,23 @@ static void svm_complete_interrupts(struct kvm_vcpu *vcpu) vcpu->arch.nmi_injected = true; svm->nmi_l1_to_l2 = nmi_l1_to_l2; break; - case SVM_EXITINTINFO_TYPE_EXEPT: + case SVM_EXITINTINFO_TYPE_EXEPT: { + u32 error_code = 0; + /* * Never re-inject a #VC exception. */ if (vector == X86_TRAP_VC) break; - if (exitintinfo & SVM_EXITINTINFO_VALID_ERR) { - u32 err = svm->vmcb->control.exit_int_info_err; - kvm_requeue_exception_e(vcpu, vector, err); + if (exitintinfo & SVM_EXITINTINFO_VALID_ERR) + error_code = svm->vmcb->control.exit_int_info_err; - } else - kvm_requeue_exception(vcpu, vector); + kvm_requeue_exception(vcpu, vector, + exitintinfo & SVM_EXITINTINFO_VALID_ERR, + error_code); break; + } case SVM_EXITINTINFO_TYPE_INTR: kvm_queue_interrupt(vcpu, vector, false); break; @@ -4084,34 +4330,62 @@ static void svm_cancel_injection(struct kvm_vcpu *vcpu) static int svm_vcpu_pre_run(struct kvm_vcpu *vcpu) { + if (to_kvm_sev_info(vcpu->kvm)->need_init) + return -EINVAL; + return 1; } static fastpath_t svm_exit_handlers_fastpath(struct kvm_vcpu *vcpu) { + struct vcpu_svm *svm = to_svm(vcpu); + if (is_guest_mode(vcpu)) return EXIT_FASTPATH_NONE; - if (to_svm(vcpu)->vmcb->control.exit_code == SVM_EXIT_MSR && - to_svm(vcpu)->vmcb->control.exit_info_1) + switch (svm->vmcb->control.exit_code) { + case SVM_EXIT_MSR: + if (!svm->vmcb->control.exit_info_1) + break; return handle_fastpath_set_msr_irqoff(vcpu); + case SVM_EXIT_HLT: + return handle_fastpath_hlt(vcpu); + default: + break; + } return EXIT_FASTPATH_NONE; } static noinstr void svm_vcpu_enter_exit(struct kvm_vcpu *vcpu, bool spec_ctrl_intercepted) { + struct svm_cpu_data *sd = per_cpu_ptr(&svm_data, vcpu->cpu); struct vcpu_svm *svm = to_svm(vcpu); guest_state_enter_irqoff(); + /* + * Set RFLAGS.IF prior to VMRUN, as the host's RFLAGS.IF at the time of + * VMRUN controls whether or not physical IRQs are masked (KVM always + * runs with V_INTR_MASKING_MASK). Toggle RFLAGS.IF here to avoid the + * temptation to do STI+VMRUN+CLI, as AMD CPUs bleed the STI shadow + * into guest state if delivery of an event during VMRUN triggers a + * #VMEXIT, and the guest_state transitions already tell lockdep that + * IRQs are being enabled/disabled. Note! GIF=0 for the entirety of + * this path, so IRQs aren't actually unmasked while running host code. + */ + raw_local_irq_enable(); + amd_clear_divider(); if (sev_es_guest(vcpu->kvm)) - __svm_sev_es_vcpu_run(svm, spec_ctrl_intercepted); + __svm_sev_es_vcpu_run(svm, spec_ctrl_intercepted, + sev_es_host_save_area(sd)); else __svm_vcpu_run(svm, spec_ctrl_intercepted); + raw_local_irq_disable(); + guest_state_exit_irqoff(); } @@ -4146,7 +4420,12 @@ static __no_kcsan fastpath_t svm_vcpu_run(struct kvm_vcpu *vcpu, if (force_immediate_exit) smp_send_reschedule(vcpu->cpu); - pre_svm_run(vcpu); + if (pre_svm_run(vcpu)) { + vcpu->run->exit_reason = KVM_EXIT_FAIL_ENTRY; + vcpu->run->fail_entry.hardware_entry_failure_reason = SVM_EXIT_ERR; + vcpu->run->fail_entry.cpu = vcpu->cpu; + return EXIT_FASTPATH_EXIT_USERSPACE; + } sync_lapic_to_cr8(vcpu); @@ -4162,14 +4441,22 @@ static __no_kcsan fastpath_t svm_vcpu_run(struct kvm_vcpu *vcpu, * Run with all-zero DR6 unless needed, so that we can get the exact cause * of a #DB. */ - if (unlikely(vcpu->arch.switch_db_regs & KVM_DEBUGREG_WONT_EXIT)) - svm_set_dr6(svm, vcpu->arch.dr6); - else - svm_set_dr6(svm, DR6_ACTIVE_LOW); + if (likely(!(vcpu->arch.switch_db_regs & KVM_DEBUGREG_WONT_EXIT))) + svm_set_dr6(vcpu, DR6_ACTIVE_LOW); clgi(); kvm_load_guest_xsave_state(vcpu); + /* + * Hardware only context switches DEBUGCTL if LBR virtualization is + * enabled. Manually load DEBUGCTL if necessary (and restore it after + * VM-Exit), as running with the host's DEBUGCTL can negatively affect + * guest state and can even be fatal, e.g. due to Bus Lock Detect. + */ + if (!(svm->vmcb->control.virt_ext & LBR_CTL_ENABLE_MASK) && + vcpu->arch.host_debugctl != svm->vmcb->save.dbgctl) + update_debugctlmsr(svm->vmcb->save.dbgctl); + kvm_wait_lapic_expire(vcpu); /* @@ -4197,6 +4484,10 @@ static __no_kcsan fastpath_t svm_vcpu_run(struct kvm_vcpu *vcpu, if (unlikely(svm->vmcb->control.exit_code == SVM_EXIT_NMI)) kvm_before_interrupt(vcpu, KVM_HANDLING_NMI); + if (!(svm->vmcb->control.virt_ext & LBR_CTL_ENABLE_MASK) && + vcpu->arch.host_debugctl != svm->vmcb->save.dbgctl) + update_debugctlmsr(vcpu->arch.host_debugctl); + kvm_load_host_xsave_state(vcpu); stgi(); @@ -4318,27 +4609,17 @@ static void svm_vcpu_after_set_cpuid(struct kvm_vcpu *vcpu) * XSS on VM-Enter/VM-Exit. Failure to do so would effectively give * the guest read/write access to the host's XSS. */ - if (boot_cpu_has(X86_FEATURE_XSAVE) && - boot_cpu_has(X86_FEATURE_XSAVES) && - guest_cpuid_has(vcpu, X86_FEATURE_XSAVE)) - kvm_governed_feature_set(vcpu, X86_FEATURE_XSAVES); - - kvm_governed_feature_check_and_set(vcpu, X86_FEATURE_NRIPS); - kvm_governed_feature_check_and_set(vcpu, X86_FEATURE_TSCRATEMSR); - kvm_governed_feature_check_and_set(vcpu, X86_FEATURE_LBRV); + guest_cpu_cap_change(vcpu, X86_FEATURE_XSAVES, + boot_cpu_has(X86_FEATURE_XSAVES) && + guest_cpu_cap_has(vcpu, X86_FEATURE_XSAVE)); /* - * Intercept VMLOAD if the vCPU mode is Intel in order to emulate that + * Intercept VMLOAD if the vCPU model is Intel in order to emulate that * VMLOAD drops bits 63:32 of SYSENTER (ignoring the fact that exposing * SVM on Intel is bonkers and extremely unlikely to work). */ - if (!guest_cpuid_is_intel(vcpu)) - kvm_governed_feature_check_and_set(vcpu, X86_FEATURE_V_VMSAVE_VMLOAD); - - kvm_governed_feature_check_and_set(vcpu, X86_FEATURE_PAUSEFILTER); - kvm_governed_feature_check_and_set(vcpu, X86_FEATURE_PFTHRESHOLD); - kvm_governed_feature_check_and_set(vcpu, X86_FEATURE_VGIF); - kvm_governed_feature_check_and_set(vcpu, X86_FEATURE_VNMI); + if (guest_cpuid_is_intel_compatible(vcpu)) + guest_cpu_cap_clear(vcpu, X86_FEATURE_V_VMSAVE_VMLOAD); svm_recalc_instruction_intercepts(vcpu, svm); @@ -4348,7 +4629,7 @@ static void svm_vcpu_after_set_cpuid(struct kvm_vcpu *vcpu) if (boot_cpu_has(X86_FEATURE_FLUSH_L1D)) set_msr_interception(vcpu, svm->msrpm, MSR_IA32_FLUSH_CMD, 0, - !!guest_cpuid_has(vcpu, X86_FEATURE_FLUSH_L1D)); + !!guest_cpu_cap_has(vcpu, X86_FEATURE_FLUSH_L1D)); if (sev_guest(vcpu->kvm)) sev_vcpu_after_set_cpuid(svm); @@ -4551,12 +4832,6 @@ static void svm_handle_exit_irqoff(struct kvm_vcpu *vcpu) vcpu->arch.at_instruction_boundary = true; } -static void svm_sched_in(struct kvm_vcpu *vcpu, int cpu) -{ - if (!kvm_pause_in_guest(vcpu->kvm)) - shrink_ple_window(vcpu); -} - static void svm_setup_mce(struct kvm_vcpu *vcpu) { /* [63:9] are reserved. */ @@ -4605,7 +4880,7 @@ static int svm_enter_smm(struct kvm_vcpu *vcpu, union kvm_smram *smram) * responsible for ensuring nested SVM and SMIs are mutually exclusive. */ - if (!guest_cpuid_has(vcpu, X86_FEATURE_LM)) + if (!guest_cpu_cap_has(vcpu, X86_FEATURE_LM)) return 1; smram->smram64.svm_guest_flag = 1; @@ -4639,7 +4914,7 @@ static int svm_enter_smm(struct kvm_vcpu *vcpu, union kvm_smram *smram) svm_copy_vmrun_state(map_save.hva + 0x400, &svm->vmcb01.ptr->save); - kvm_vcpu_unmap(vcpu, &map_save, true); + kvm_vcpu_unmap(vcpu, &map_save); return 0; } @@ -4652,14 +4927,14 @@ static int svm_leave_smm(struct kvm_vcpu *vcpu, const union kvm_smram *smram) const struct kvm_smram_state_64 *smram64 = &smram->smram64; - if (!guest_cpuid_has(vcpu, X86_FEATURE_LM)) + if (!guest_cpu_cap_has(vcpu, X86_FEATURE_LM)) return 0; /* Non-zero if SMI arrived while vCPU was in guest mode. */ if (!smram64->svm_guest_flag) return 0; - if (!guest_cpuid_has(vcpu, X86_FEATURE_SVM)) + if (!guest_cpu_cap_has(vcpu, X86_FEATURE_SVM)) return 1; if (!(smram64->efer & EFER_SVME)) @@ -4699,9 +4974,9 @@ static int svm_leave_smm(struct kvm_vcpu *vcpu, const union kvm_smram *smram) svm->nested.nested_run_pending = 1; unmap_save: - kvm_vcpu_unmap(vcpu, &map_save, true); + kvm_vcpu_unmap(vcpu, &map_save); unmap_map: - kvm_vcpu_unmap(vcpu, &map, true); + kvm_vcpu_unmap(vcpu, &map); return ret; } @@ -4722,9 +4997,15 @@ static void svm_enable_smi_window(struct kvm_vcpu *vcpu) static int svm_check_emulate_instruction(struct kvm_vcpu *vcpu, int emul_type, void *insn, int insn_len) { + struct vcpu_svm *svm = to_svm(vcpu); bool smep, smap, is_user; u64 error_code; + /* Check that emulation is possible during event vectoring */ + if ((svm->vmcb->control.exit_int_info & SVM_EXITINTINFO_TYPE_MASK) && + !kvm_can_emulate_event_vectoring(emul_type)) + return X86EMUL_UNHANDLEABLE_VECTORING; + /* Emulation is always possible when KVM has access to all guest state. */ if (!sev_guest(vcpu->kvm)) return X86EMUL_CONTINUE; @@ -4821,7 +5102,7 @@ static int svm_check_emulate_instruction(struct kvm_vcpu *vcpu, int emul_type, * In addition, don't apply the erratum workaround if the #NPF occurred * while translating guest page tables (see below). */ - error_code = to_svm(vcpu)->vmcb->control.exit_info_1; + error_code = svm->vmcb->control.exit_info_1; if (error_code & (PFERR_GUEST_PAGE_MASK | PFERR_FETCH_MASK)) goto resume_guest; @@ -4885,10 +5166,24 @@ static void svm_vm_destroy(struct kvm *kvm) { avic_vm_destroy(kvm); sev_vm_destroy(kvm); + + svm_srso_vm_destroy(); } static int svm_vm_init(struct kvm *kvm) { + int type = kvm->arch.vm_type; + + if (type != KVM_X86_DEFAULT_VM && + type != KVM_X86_SW_PROTECTED_VM) { + kvm->arch.has_protected_state = + (type == KVM_X86_SEV_ES_VM || type == KVM_X86_SNP_VM); + to_kvm_sev_info(kvm)->need_init = true; + + kvm->arch.has_private_mem = (type == KVM_X86_SNP_VM); + kvm->arch.pre_fault_allowed = !kvm->arch.has_private_mem; + } + if (!pause_filter_count || !pause_filter_thresh) kvm->arch.pause_in_guest = true; @@ -4898,12 +5193,13 @@ static int svm_vm_init(struct kvm *kvm) return ret; } + svm_srso_vm_init(); return 0; } static void *svm_alloc_apic_backing_page(struct kvm_vcpu *vcpu) { - struct page *page = snp_safe_alloc_page(vcpu); + struct page *page = snp_safe_alloc_page(); if (!page) return NULL; @@ -4917,8 +5213,9 @@ static struct kvm_x86_ops svm_x86_ops __initdata = { .check_processor_compatibility = svm_check_processor_compat, .hardware_unsetup = svm_hardware_unsetup, - .hardware_enable = svm_hardware_enable, - .hardware_disable = svm_hardware_disable, + .enable_virtualization_cpu = svm_enable_virtualization_cpu, + .disable_virtualization_cpu = svm_disable_virtualization_cpu, + .emergency_disable_virtualization_cpu = svm_emergency_disable_virtualization_cpu, .has_emulated_msr = svm_has_emulated_msr, .vcpu_create = svm_vcpu_create, @@ -4936,13 +5233,14 @@ static struct kvm_x86_ops svm_x86_ops __initdata = { .vcpu_unblocking = avic_vcpu_unblocking, .update_exception_bitmap = svm_update_exception_bitmap, - .get_msr_feature = svm_get_msr_feature, + .get_feature_msr = svm_get_feature_msr, .get_msr = svm_get_msr, .set_msr = svm_set_msr, .get_segment_base = svm_get_segment_base, .get_segment = svm_get_segment, .set_segment = svm_set_segment, .get_cpl = svm_get_cpl, + .get_cpl_no_cache = svm_get_cpl, .get_cs_db_l_bits = svm_get_cs_db_l_bits, .is_valid_cr0 = svm_is_valid_cr0, .set_cr0 = svm_set_cr0, @@ -4954,6 +5252,7 @@ static struct kvm_x86_ops svm_x86_ops __initdata = { .set_idt = svm_set_idt, .get_gdt = svm_get_gdt, .set_gdt = svm_set_gdt, + .set_dr6 = svm_set_dr6, .set_dr7 = svm_set_dr7, .sync_dirty_debug_regs = svm_sync_dirty_debug_regs, .cache_reg = svm_cache_reg, @@ -4987,12 +5286,15 @@ static struct kvm_x86_ops svm_x86_ops __initdata = { .enable_nmi_window = svm_enable_nmi_window, .enable_irq_window = svm_enable_irq_window, .update_cr8_intercept = svm_update_cr8_intercept, + + .x2apic_icr_is_split = true, .set_virtual_apic_mode = avic_refresh_virtual_apic_mode, .refresh_apicv_exec_ctrl = avic_refresh_apicv_exec_ctrl, .apicv_post_state_restore = avic_apicv_post_state_restore, .required_apicv_inhibits = AVIC_REQUIRED_APICV_INHIBITS, .get_exit_info = svm_get_exit_info, + .get_entry_info = svm_get_entry_info, .vcpu_after_set_cpuid = svm_vcpu_after_set_cpuid, @@ -5008,8 +5310,6 @@ static struct kvm_x86_ops svm_x86_ops __initdata = { .check_intercept = svm_check_intercept, .handle_exit_irqoff = svm_handle_exit_irqoff, - .sched_in = svm_sched_in, - .nested_ops = &svm_nested_ops, .deliver_interrupt = svm_deliver_interrupt, @@ -5023,6 +5323,8 @@ static struct kvm_x86_ops svm_x86_ops __initdata = { .enable_smi_window = svm_enable_smi_window, #endif +#ifdef CONFIG_KVM_AMD_SEV + .dev_get_attr = sev_dev_get_attr, .mem_enc_ioctl = sev_mem_enc_ioctl, .mem_enc_register_region = sev_mem_enc_register_region, .mem_enc_unregister_region = sev_mem_enc_unregister_region, @@ -5030,7 +5332,7 @@ static struct kvm_x86_ops svm_x86_ops __initdata = { .vm_copy_enc_context_from = sev_vm_copy_enc_context_from, .vm_move_enc_context_from = sev_vm_move_enc_context_from, - +#endif .check_emulate_instruction = svm_check_emulate_instruction, .apic_init_signal_blocked = svm_apic_init_signal_blocked, @@ -5041,6 +5343,10 @@ static struct kvm_x86_ops svm_x86_ops __initdata = { .vcpu_deliver_sipi_vector = svm_vcpu_deliver_sipi_vector, .vcpu_get_apicv_inhibit_reasons = avic_vcpu_get_apicv_inhibit_reasons, .alloc_apic_backing_page = svm_alloc_apic_backing_page, + + .gmem_prepare = sev_gmem_prepare, + .gmem_invalidate = sev_gmem_invalidate, + .private_max_mapping_level = sev_private_max_mapping_level, }; /* @@ -5059,7 +5365,7 @@ static __init void svm_adjust_mmio_mask(void) return; /* If memory encryption is not enabled, use existing mask */ - rdmsrl(MSR_AMD64_SYSCFG, msr); + rdmsrq(MSR_AMD64_SYSCFG, msr); if (!(msr & MSR_AMD64_SYSCFG_MEM_ENCRYPT)) return; @@ -5133,6 +5439,9 @@ static __init void svm_set_cpu_caps(void) kvm_cpu_cap_set(X86_FEATURE_SVME_ADDR_CHK); } + if (cpu_feature_enabled(X86_FEATURE_BUS_LOCK_THRESHOLD)) + kvm_caps.has_bus_lock_exit = true; + /* CPUID 0x80000008 */ if (boot_cpu_has(X86_FEATURE_LS_CFG_SSBD) || boot_cpu_has(X86_FEATURE_AMD_SSBD)) @@ -5157,6 +5466,9 @@ static __init void svm_set_cpu_caps(void) /* CPUID 0x8000001F (SME/SEV features) */ sev_set_cpu_caps(); + + /* Don't advertise Bus Lock Detect to guest if SVM support is absent */ + kvm_cpu_cap_clear(X86_FEATURE_BUS_LOCK_DETECT); } static __init int svm_hardware_setup(void) @@ -5184,7 +5496,7 @@ static __init int svm_hardware_setup(void) iopm_va = page_address(iopm_pages); memset(iopm_va, 0xff, PAGE_SIZE * (1 << order)); - iopm_base = page_to_pfn(iopm_pages) << PAGE_SHIFT; + iopm_base = __sme_page_pa(iopm_pages); init_msrpm_offsets(); @@ -5237,7 +5549,7 @@ static __init int svm_hardware_setup(void) /* Force VM NPT level equal to the host's paging level */ kvm_configure_mmu(npt_enabled, get_npt_level(), get_npt_level(), PG_LEVEL_1G); - pr_info("Nested Paging %sabled\n", npt_enabled ? "en" : "dis"); + pr_info("Nested Paging %s\n", str_enabled_disabled(npt_enabled)); /* Setup shadow_me_value and shadow_me_mask */ kvm_mmu_set_me_spte_mask(sme_me_mask, sme_me_mask); @@ -5246,6 +5558,12 @@ static __init int svm_hardware_setup(void) nrips = nrips && boot_cpu_has(X86_FEATURE_NRIPS); + if (lbrv) { + if (!boot_cpu_has(X86_FEATURE_LBRV)) + lbrv = false; + else + pr_info("LBR virtualization supported\n"); + } /* * Note, SEV setup consumes npt_enabled and enable_mmio_caching (which * may be modified by svm_adjust_mmio_mask()), as well as nrips. @@ -5299,14 +5617,6 @@ static __init int svm_hardware_setup(void) svm_x86_ops.set_vnmi_pending = NULL; } - - if (lbrv) { - if (!boot_cpu_has(X86_FEATURE_LBRV)) - lbrv = false; - else - pr_info("LBR virtualization supported\n"); - } - if (!enable_pmu) pr_info("PMU virtualization is disabled\n"); @@ -5327,6 +5637,7 @@ static __init int svm_hardware_setup(void) */ allow_smaller_maxphyaddr = !npt_enabled; + kvm_caps.inapplicable_quirks &= ~KVM_X86_QUIRK_CD_NW_CLEARED; return 0; err: @@ -5345,8 +5656,6 @@ static struct kvm_x86_init_ops svm_init_ops __initdata = { static void __svm_exit(void) { kvm_x86_vendor_exit(); - - cpu_emergency_unregister_virt_callback(svm_emergency_disable); } static int __init svm_init(void) @@ -5362,8 +5671,6 @@ static int __init svm_init(void) if (r) return r; - cpu_emergency_register_virt_callback(svm_emergency_disable); - /* * Common KVM initialization _must_ come last, after this, /dev/kvm is * exposed to userspace! diff --git a/arch/x86/kvm/svm/svm.h b/arch/x86/kvm/svm/svm.h index 7f1fbd874c45..e6f3c6a153a0 100644 --- a/arch/x86/kvm/svm/svm.h +++ b/arch/x86/kvm/svm/svm.h @@ -25,12 +25,26 @@ #include "cpuid.h" #include "kvm_cache_regs.h" -#define __sme_page_pa(x) __sme_set(page_to_pfn(x) << PAGE_SHIFT) +/* + * Helpers to convert to/from physical addresses for pages whose address is + * consumed directly by hardware. Even though it's a physical address, SVM + * often restricts the address to the natural width, hence 'unsigned long' + * instead of 'hpa_t'. + */ +static inline unsigned long __sme_page_pa(struct page *page) +{ + return __sme_set(page_to_pfn(page) << PAGE_SHIFT); +} + +static inline struct page *__sme_pa_to_page(unsigned long pa) +{ + return pfn_to_page(__sme_clr(pa) >> PAGE_SHIFT); +} #define IOPM_SIZE PAGE_SIZE * 3 #define MSRPM_SIZE PAGE_SIZE * 2 -#define MAX_DIRECT_ACCESS_MSRS 47 +#define MAX_DIRECT_ACCESS_MSRS 48 #define MSRPM_OFFSETS 32 extern u32 msrpm_offsets[MSRPM_OFFSETS] __read_mostly; extern bool npt_enabled; @@ -39,6 +53,7 @@ extern int vgif; extern bool intercept_smi; extern bool x2avic_enabled; extern bool vnmi; +extern int lbrv; /* * Clean bits in VMCB. @@ -79,19 +94,30 @@ enum { struct kvm_sev_info { bool active; /* SEV enabled guest */ bool es_active; /* SEV-ES enabled guest */ + bool need_init; /* waiting for SEV_INIT2 */ unsigned int asid; /* ASID used for this guest */ unsigned int handle; /* SEV firmware handle */ int fd; /* SEV device fd */ + unsigned long policy; unsigned long pages_locked; /* Number of pages locked */ struct list_head regions_list; /* List of registered regions */ u64 ap_jump_table; /* SEV-ES AP Jump Table address */ + u64 vmsa_features; + u16 ghcb_version; /* Highest guest GHCB protocol version allowed */ struct kvm *enc_context_owner; /* Owner of copied encryption context */ struct list_head mirror_vms; /* List of VMs mirroring */ struct list_head mirror_entry; /* Use as a list entry of mirrors */ struct misc_cg *misc_cg; /* For misc cgroup accounting */ atomic_t migration_in_progress; + void *snp_context; /* SNP guest context page */ + void *guest_req_buf; /* Bounce buffer for SNP Guest Request input */ + void *guest_resp_buf; /* Bounce buffer for SNP Guest Request output */ + struct mutex guest_req_mutex; /* Must acquire before using bounce buffers */ }; +#define SEV_POLICY_NODBG BIT_ULL(0) +#define SNP_POLICY_DEBUG BIT_ULL(19) + struct kvm_svm { struct kvm kvm; @@ -147,6 +173,7 @@ struct vmcb_ctrl_area_cached { u64 nested_cr3; u64 virt_ext; u32 clean; + u64 bus_lock_rip; union { #if IS_ENABLED(CONFIG_HYPERV) || IS_ENABLED(CONFIG_KVM_HYPERV) struct hv_vmcb_enlightenments hv_enlightenments; @@ -197,6 +224,7 @@ struct vcpu_sev_es_state { u8 valid_bitmap[16]; struct kvm_host_map ghcb_map; bool received_first_sipi; + unsigned int ap_reset_hold_type; /* SEV-ES scratch area support */ u64 sw_scratch; @@ -204,6 +232,18 @@ struct vcpu_sev_es_state { u32 ghcb_sa_len; bool ghcb_sa_sync; bool ghcb_sa_free; + + /* SNP Page-State-Change buffer entries currently being processed */ + u16 psc_idx; + u16 psc_inflight; + bool psc_2m; + + u64 ghcb_registered_gpa; + + struct mutex snp_vmsa_mutex; /* Used to handle concurrent updates of VMSA. */ + gpa_t snp_vmsa_gpa; + bool snp_ap_waiting_for_reset; + bool snp_has_guest_vmsa; }; struct vcpu_svm { @@ -300,10 +340,10 @@ struct svm_cpu_data { u32 next_asid; u32 min_asid; - struct page *save_area; - unsigned long save_area_pa; + bool bp_spec_reduce_set; - struct vmcb *current_vmcb; + struct vmcb *save_area; + unsigned long save_area_pa; /* index = sev_asid, value = vmcb pointer */ struct vmcb **sev_vmcbs; @@ -318,26 +358,39 @@ static __always_inline struct kvm_svm *to_kvm_svm(struct kvm *kvm) return container_of(kvm, struct kvm_svm, kvm); } -static __always_inline bool sev_guest(struct kvm *kvm) +static __always_inline struct kvm_sev_info *to_kvm_sev_info(struct kvm *kvm) { -#ifdef CONFIG_KVM_AMD_SEV - struct kvm_sev_info *sev = &to_kvm_svm(kvm)->sev_info; - - return sev->active; -#else - return false; -#endif + return &to_kvm_svm(kvm)->sev_info; } +#ifdef CONFIG_KVM_AMD_SEV +static __always_inline bool sev_guest(struct kvm *kvm) +{ + return to_kvm_sev_info(kvm)->active; +} static __always_inline bool sev_es_guest(struct kvm *kvm) { -#ifdef CONFIG_KVM_AMD_SEV - struct kvm_sev_info *sev = &to_kvm_svm(kvm)->sev_info; + struct kvm_sev_info *sev = to_kvm_sev_info(kvm); return sev->es_active && !WARN_ON_ONCE(!sev->active); +} + +static __always_inline bool sev_snp_guest(struct kvm *kvm) +{ + struct kvm_sev_info *sev = to_kvm_sev_info(kvm); + + return (sev->vmsa_features & SVM_SEV_FEAT_SNP_ACTIVE) && + !WARN_ON_ONCE(!sev_es_guest(kvm)); +} #else - return false; +#define sev_guest(kvm) false +#define sev_es_guest(kvm) false +#define sev_snp_guest(kvm) false #endif + +static inline bool ghcb_gpa_is_registered(struct vcpu_svm *svm, u64 val) +{ + return svm->sev_es.ghcb_registered_gpa == val; } static inline void vmcb_mark_all_dirty(struct vmcb *vmcb) @@ -445,7 +498,7 @@ static inline bool svm_is_intercept(struct vcpu_svm *svm, int bit) static inline bool nested_vgif_enabled(struct vcpu_svm *svm) { - return guest_can_use(&svm->vcpu, X86_FEATURE_VGIF) && + return guest_cpu_cap_has(&svm->vcpu, X86_FEATURE_VGIF) && (svm->nested.ctl.int_ctl & V_GIF_ENABLE_MASK); } @@ -497,7 +550,7 @@ static inline bool nested_npt_enabled(struct vcpu_svm *svm) static inline bool nested_vnmi_enabled(struct vcpu_svm *svm) { - return guest_can_use(&svm->vcpu, X86_FEATURE_VNMI) && + return guest_cpu_cap_has(&svm->vcpu, X86_FEATURE_VNMI) && (svm->nested.ctl.int_ctl & V_NMI_ENABLE_MASK); } @@ -531,10 +584,39 @@ static inline bool is_vnmi_enabled(struct vcpu_svm *svm) return false; } +static inline void svm_vmgexit_set_return_code(struct vcpu_svm *svm, + u64 response, u64 data) +{ + ghcb_set_sw_exit_info_1(svm->sev_es.ghcb, response); + ghcb_set_sw_exit_info_2(svm->sev_es.ghcb, data); +} + +static inline void svm_vmgexit_inject_exception(struct vcpu_svm *svm, u8 vector) +{ + u64 data = SVM_EVTINJ_VALID | SVM_EVTINJ_TYPE_EXEPT | vector; + + svm_vmgexit_set_return_code(svm, GHCB_HV_RESP_ISSUE_EXCEPTION, data); +} + +static inline void svm_vmgexit_bad_input(struct vcpu_svm *svm, u64 suberror) +{ + svm_vmgexit_set_return_code(svm, GHCB_HV_RESP_MALFORMED_INPUT, suberror); +} + +static inline void svm_vmgexit_success(struct vcpu_svm *svm, u64 data) +{ + svm_vmgexit_set_return_code(svm, GHCB_HV_RESP_NO_ACTION, data); +} + +static inline void svm_vmgexit_no_action(struct vcpu_svm *svm, u64 data) +{ + svm_vmgexit_set_return_code(svm, GHCB_HV_RESP_NO_ACTION, data); +} + /* svm.c */ #define MSR_INVALID 0xffffffffU -#define DEBUGCTL_RESERVED_BITS (~(0x3fULL)) +#define DEBUGCTL_RESERVED_BITS (~DEBUGCTLMSR_LBR) extern bool dump_invalid_vmcb; @@ -543,6 +625,7 @@ u32 *svm_vcpu_alloc_msrpm(void); void svm_vcpu_init_msrpm(struct kvm_vcpu *vcpu, u32 *msrpm); void svm_vcpu_free_msrpm(u32 *msrpm); void svm_copy_lbrs(struct vmcb *to_vmcb, struct vmcb *from_vmcb); +void svm_enable_lbrv(struct kvm_vcpu *vcpu); void svm_update_lbrv(struct kvm_vcpu *vcpu); int svm_set_efer(struct kvm_vcpu *vcpu, u64 efer); @@ -627,7 +710,7 @@ extern struct kvm_x86_nested_ops svm_nested_ops; /* avic.c */ #define AVIC_REQUIRED_APICV_INHIBITS \ ( \ - BIT(APICV_INHIBIT_REASON_DISABLE) | \ + BIT(APICV_INHIBIT_REASON_DISABLED) | \ BIT(APICV_INHIBIT_REASON_ABSENT) | \ BIT(APICV_INHIBIT_REASON_HYPERV) | \ BIT(APICV_INHIBIT_REASON_NESTED) | \ @@ -664,13 +747,16 @@ void avic_refresh_virtual_apic_mode(struct kvm_vcpu *vcpu); /* sev.c */ -#define GHCB_VERSION_MAX 1ULL -#define GHCB_VERSION_MIN 1ULL - - -extern unsigned int max_sev_asid; +int pre_sev_run(struct vcpu_svm *svm, int cpu); +void sev_init_vmcb(struct vcpu_svm *svm); +void sev_vcpu_after_set_cpuid(struct vcpu_svm *svm); +int sev_es_string_io(struct vcpu_svm *svm, int size, unsigned int port, int in); +void sev_es_vcpu_reset(struct vcpu_svm *svm); +void sev_vcpu_deliver_sipi_vector(struct kvm_vcpu *vcpu, u8 vector); +void sev_es_prepare_switch_to_guest(struct vcpu_svm *svm, struct sev_es_save_area *hostsa); +void sev_es_unmap_ghcb(struct vcpu_svm *svm); -void sev_vm_destroy(struct kvm *kvm); +#ifdef CONFIG_KVM_AMD_SEV int sev_mem_enc_ioctl(struct kvm *kvm, void __user *argp); int sev_mem_enc_register_region(struct kvm *kvm, struct kvm_enc_region *range); @@ -679,26 +765,73 @@ int sev_mem_enc_unregister_region(struct kvm *kvm, int sev_vm_copy_enc_context_from(struct kvm *kvm, unsigned int source_fd); int sev_vm_move_enc_context_from(struct kvm *kvm, unsigned int source_fd); void sev_guest_memory_reclaimed(struct kvm *kvm); +int sev_handle_vmgexit(struct kvm_vcpu *vcpu); -void pre_sev_run(struct vcpu_svm *svm, int cpu); +/* These symbols are used in common code and are stubbed below. */ + +struct page *snp_safe_alloc_page_node(int node, gfp_t gfp); +static inline struct page *snp_safe_alloc_page(void) +{ + return snp_safe_alloc_page_node(numa_node_id(), GFP_KERNEL_ACCOUNT); +} + +void sev_free_vcpu(struct kvm_vcpu *vcpu); +void sev_vm_destroy(struct kvm *kvm); void __init sev_set_cpu_caps(void); void __init sev_hardware_setup(void); void sev_hardware_unsetup(void); int sev_cpu_init(struct svm_cpu_data *sd); -void sev_init_vmcb(struct vcpu_svm *svm); -void sev_vcpu_after_set_cpuid(struct vcpu_svm *svm); -void sev_free_vcpu(struct kvm_vcpu *vcpu); -int sev_handle_vmgexit(struct kvm_vcpu *vcpu); -int sev_es_string_io(struct vcpu_svm *svm, int size, unsigned int port, int in); -void sev_es_vcpu_reset(struct vcpu_svm *svm); -void sev_vcpu_deliver_sipi_vector(struct kvm_vcpu *vcpu, u8 vector); -void sev_es_prepare_switch_to_guest(struct sev_es_save_area *hostsa); -void sev_es_unmap_ghcb(struct vcpu_svm *svm); -struct page *snp_safe_alloc_page(struct kvm_vcpu *vcpu); +int sev_dev_get_attr(u32 group, u64 attr, u64 *val); +extern unsigned int max_sev_asid; +void sev_handle_rmp_fault(struct kvm_vcpu *vcpu, gpa_t gpa, u64 error_code); +void sev_snp_init_protected_guest_state(struct kvm_vcpu *vcpu); +int sev_gmem_prepare(struct kvm *kvm, kvm_pfn_t pfn, gfn_t gfn, int max_order); +void sev_gmem_invalidate(kvm_pfn_t start, kvm_pfn_t end); +int sev_private_max_mapping_level(struct kvm *kvm, kvm_pfn_t pfn); +struct vmcb_save_area *sev_decrypt_vmsa(struct kvm_vcpu *vcpu); +void sev_free_decrypted_vmsa(struct kvm_vcpu *vcpu, struct vmcb_save_area *vmsa); +#else +static inline struct page *snp_safe_alloc_page_node(int node, gfp_t gfp) +{ + return alloc_pages_node(node, gfp | __GFP_ZERO, 0); +} + +static inline struct page *snp_safe_alloc_page(void) +{ + return snp_safe_alloc_page_node(numa_node_id(), GFP_KERNEL_ACCOUNT); +} + +static inline void sev_free_vcpu(struct kvm_vcpu *vcpu) {} +static inline void sev_vm_destroy(struct kvm *kvm) {} +static inline void __init sev_set_cpu_caps(void) {} +static inline void __init sev_hardware_setup(void) {} +static inline void sev_hardware_unsetup(void) {} +static inline int sev_cpu_init(struct svm_cpu_data *sd) { return 0; } +static inline int sev_dev_get_attr(u32 group, u64 attr, u64 *val) { return -ENXIO; } +#define max_sev_asid 0 +static inline void sev_handle_rmp_fault(struct kvm_vcpu *vcpu, gpa_t gpa, u64 error_code) {} +static inline void sev_snp_init_protected_guest_state(struct kvm_vcpu *vcpu) {} +static inline int sev_gmem_prepare(struct kvm *kvm, kvm_pfn_t pfn, gfn_t gfn, int max_order) +{ + return 0; +} +static inline void sev_gmem_invalidate(kvm_pfn_t start, kvm_pfn_t end) {} +static inline int sev_private_max_mapping_level(struct kvm *kvm, kvm_pfn_t pfn) +{ + return 0; +} + +static inline struct vmcb_save_area *sev_decrypt_vmsa(struct kvm_vcpu *vcpu) +{ + return NULL; +} +static inline void sev_free_decrypted_vmsa(struct kvm_vcpu *vcpu, struct vmcb_save_area *vmsa) {} +#endif /* vmenter.S */ -void __svm_sev_es_vcpu_run(struct vcpu_svm *svm, bool spec_ctrl_intercepted); +void __svm_sev_es_vcpu_run(struct vcpu_svm *svm, bool spec_ctrl_intercepted, + struct sev_es_save_area *hostsa); void __svm_vcpu_run(struct vcpu_svm *svm, bool spec_ctrl_intercepted); #define DEFINE_KVM_GHCB_ACCESSORS(field) \ diff --git a/arch/x86/kvm/svm/vmenter.S b/arch/x86/kvm/svm/vmenter.S index 187018c424bf..0c61153b275f 100644 --- a/arch/x86/kvm/svm/vmenter.S +++ b/arch/x86/kvm/svm/vmenter.S @@ -3,6 +3,7 @@ #include <asm/asm.h> #include <asm/asm-offsets.h> #include <asm/bitsperlong.h> +#include <asm/frame.h> #include <asm/kvm_vcpu_regs.h> #include <asm/nospec-branch.h> #include "kvm-asm-offsets.h" @@ -67,7 +68,7 @@ "", X86_FEATURE_V_SPEC_CTRL 901: .endm -.macro RESTORE_HOST_SPEC_CTRL_BODY +.macro RESTORE_HOST_SPEC_CTRL_BODY spec_ctrl_intercepted:req 900: /* Same for after vmexit. */ mov $MSR_IA32_SPEC_CTRL, %ecx @@ -76,7 +77,7 @@ * Load the value that the guest had written into MSR_IA32_SPEC_CTRL, * if it was not intercepted during guest execution. */ - cmpb $0, (%_ASM_SP) + cmpb $0, \spec_ctrl_intercepted jnz 998f rdmsr movl %eax, SVM_spec_ctrl(%_ASM_DI) @@ -99,6 +100,7 @@ */ SYM_FUNC_START(__svm_vcpu_run) push %_ASM_BP + mov %_ASM_SP, %_ASM_BP #ifdef CONFIG_X86_64 push %r15 push %r14 @@ -168,12 +170,8 @@ SYM_FUNC_START(__svm_vcpu_run) mov VCPU_RDI(%_ASM_DI), %_ASM_DI /* Enter guest mode */ - sti - 3: vmrun %_ASM_AX 4: - cli - /* Pop @svm to RAX while it's the only available register. */ pop %_ASM_AX @@ -207,10 +205,8 @@ SYM_FUNC_START(__svm_vcpu_run) 7: vmload %_ASM_AX 8: -#ifdef CONFIG_MITIGATION_RETPOLINE /* IMPORTANT: Stuff the RSB immediately after VM-Exit, before RET! */ - FILL_RETURN_BUFFER %_ASM_AX, RSB_CLEAR_LOOPS, X86_FEATURE_RETPOLINE -#endif + FILL_RETURN_BUFFER %_ASM_AX, RSB_CLEAR_LOOPS, X86_FEATURE_RSB_VMEXIT /* Clobbers RAX, RCX, RDX. */ RESTORE_HOST_SPEC_CTRL @@ -268,7 +264,7 @@ SYM_FUNC_START(__svm_vcpu_run) RET RESTORE_GUEST_SPEC_CTRL_BODY - RESTORE_HOST_SPEC_CTRL_BODY + RESTORE_HOST_SPEC_CTRL_BODY (%_ASM_SP) 10: cmpb $0, _ASM_RIP(kvm_rebooting) jne 2b @@ -290,66 +286,62 @@ SYM_FUNC_START(__svm_vcpu_run) SYM_FUNC_END(__svm_vcpu_run) +#ifdef CONFIG_KVM_AMD_SEV + + +#ifdef CONFIG_X86_64 +#define SEV_ES_GPRS_BASE 0x300 +#define SEV_ES_RBX (SEV_ES_GPRS_BASE + __VCPU_REGS_RBX * WORD_SIZE) +#define SEV_ES_RBP (SEV_ES_GPRS_BASE + __VCPU_REGS_RBP * WORD_SIZE) +#define SEV_ES_RSI (SEV_ES_GPRS_BASE + __VCPU_REGS_RSI * WORD_SIZE) +#define SEV_ES_RDI (SEV_ES_GPRS_BASE + __VCPU_REGS_RDI * WORD_SIZE) +#define SEV_ES_R12 (SEV_ES_GPRS_BASE + __VCPU_REGS_R12 * WORD_SIZE) +#define SEV_ES_R13 (SEV_ES_GPRS_BASE + __VCPU_REGS_R13 * WORD_SIZE) +#define SEV_ES_R14 (SEV_ES_GPRS_BASE + __VCPU_REGS_R14 * WORD_SIZE) +#define SEV_ES_R15 (SEV_ES_GPRS_BASE + __VCPU_REGS_R15 * WORD_SIZE) +#endif + /** * __svm_sev_es_vcpu_run - Run a SEV-ES vCPU via a transition to SVM guest mode * @svm: struct vcpu_svm * * @spec_ctrl_intercepted: bool */ SYM_FUNC_START(__svm_sev_es_vcpu_run) - push %_ASM_BP -#ifdef CONFIG_X86_64 - push %r15 - push %r14 - push %r13 - push %r12 -#else - push %edi - push %esi -#endif - push %_ASM_BX + FRAME_BEGIN /* - * Save variables needed after vmexit on the stack, in inverse - * order compared to when they are needed. + * Save non-volatile (callee-saved) registers to the host save area. + * Except for RAX and RSP, all GPRs are restored on #VMEXIT, but not + * saved on VMRUN. */ + mov %rbp, SEV_ES_RBP (%rdx) + mov %r15, SEV_ES_R15 (%rdx) + mov %r14, SEV_ES_R14 (%rdx) + mov %r13, SEV_ES_R13 (%rdx) + mov %r12, SEV_ES_R12 (%rdx) + mov %rbx, SEV_ES_RBX (%rdx) - /* Accessed directly from the stack in RESTORE_HOST_SPEC_CTRL. */ - push %_ASM_ARG2 - - /* Save @svm. */ - push %_ASM_ARG1 - -.ifnc _ASM_ARG1, _ASM_DI /* - * Stash @svm in RDI early. On 32-bit, arguments are in RAX, RCX - * and RDX which are clobbered by RESTORE_GUEST_SPEC_CTRL. + * Save volatile registers that hold arguments that are needed after + * #VMEXIT (RDI=@svm and RSI=@spec_ctrl_intercepted). */ - mov %_ASM_ARG1, %_ASM_DI -.endif + mov %rdi, SEV_ES_RDI (%rdx) + mov %rsi, SEV_ES_RSI (%rdx) - /* Clobbers RAX, RCX, RDX. */ + /* Clobbers RAX, RCX, RDX (@hostsa). */ RESTORE_GUEST_SPEC_CTRL /* Get svm->current_vmcb->pa into RAX. */ - mov SVM_current_vmcb(%_ASM_DI), %_ASM_AX - mov KVM_VMCB_pa(%_ASM_AX), %_ASM_AX + mov SVM_current_vmcb(%rdi), %rax + mov KVM_VMCB_pa(%rax), %rax /* Enter guest mode */ - sti - -1: vmrun %_ASM_AX - -2: cli - - /* Pop @svm to RDI, guest registers have been saved already. */ - pop %_ASM_DI - -#ifdef CONFIG_MITIGATION_RETPOLINE +1: vmrun %rax +2: /* IMPORTANT: Stuff the RSB immediately after VM-Exit, before RET! */ - FILL_RETURN_BUFFER %_ASM_AX, RSB_CLEAR_LOOPS, X86_FEATURE_RETPOLINE -#endif + FILL_RETURN_BUFFER %rax, RSB_CLEAR_LOOPS, X86_FEATURE_RSB_VMEXIT - /* Clobbers RAX, RCX, RDX. */ + /* Clobbers RAX, RCX, RDX, consumes RDI (@svm) and RSI (@spec_ctrl_intercepted). */ RESTORE_HOST_SPEC_CTRL /* @@ -361,30 +353,17 @@ SYM_FUNC_START(__svm_sev_es_vcpu_run) */ UNTRAIN_RET_VM - /* "Pop" @spec_ctrl_intercepted. */ - pop %_ASM_BX - - pop %_ASM_BX - -#ifdef CONFIG_X86_64 - pop %r12 - pop %r13 - pop %r14 - pop %r15 -#else - pop %esi - pop %edi -#endif - pop %_ASM_BP + FRAME_END RET RESTORE_GUEST_SPEC_CTRL_BODY - RESTORE_HOST_SPEC_CTRL_BODY + RESTORE_HOST_SPEC_CTRL_BODY %sil -3: cmpb $0, _ASM_RIP(kvm_rebooting) +3: cmpb $0, kvm_rebooting(%rip) jne 2b ud2 _ASM_EXTABLE(1b, 3b) SYM_FUNC_END(__svm_sev_es_vcpu_run) +#endif /* CONFIG_KVM_AMD_SEV */ |