diff options
Diffstat (limited to 'arch/x86/kvm/svm/sev.c')
-rw-r--r-- | arch/x86/kvm/svm/sev.c | 2751 |
1 files changed, 2359 insertions, 392 deletions
diff --git a/arch/x86/kvm/svm/sev.c b/arch/x86/kvm/svm/sev.c index 0c240ed04f96..459c3b791fd4 100644 --- a/arch/x86/kvm/svm/sev.c +++ b/arch/x86/kvm/svm/sev.c @@ -6,44 +6,42 @@ * * Copyright 2010 Red Hat, Inc. and/or its affiliates. */ +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt #include <linux/kvm_types.h> #include <linux/kvm_host.h> #include <linux/kernel.h> #include <linux/highmem.h> +#include <linux/psp.h> #include <linux/psp-sev.h> #include <linux/pagemap.h> #include <linux/swap.h> #include <linux/misc_cgroup.h> #include <linux/processor.h> #include <linux/trace_events.h> +#include <uapi/linux/sev-guest.h> #include <asm/pkru.h> #include <asm/trapnr.h> #include <asm/fpu/xcr.h> +#include <asm/fpu/xstate.h> +#include <asm/debugreg.h> +#include <asm/msr.h> +#include <asm/sev.h> +#include "mmu.h" #include "x86.h" #include "svm.h" #include "svm_ops.h" #include "cpuid.h" #include "trace.h" -#ifndef CONFIG_KVM_AMD_SEV -/* - * When this config is not defined, SEV feature is not supported and APIs in - * this file are not used but this file still gets compiled into the KVM AMD - * module. - * - * We will not have MISC_CG_RES_SEV and MISC_CG_RES_SEV_ES entries in the enum - * misc_res_type {} defined in linux/misc_cgroup.h. - * - * Below macros allow compilation to succeed. - */ -#define MISC_CG_RES_SEV MISC_CG_RES_TYPES -#define MISC_CG_RES_SEV_ES MISC_CG_RES_TYPES -#endif +#define GHCB_VERSION_MAX 2ULL +#define GHCB_VERSION_DEFAULT 2ULL +#define GHCB_VERSION_MIN 1ULL + +#define GHCB_HV_FT_SUPPORTED (GHCB_HV_FT_SNP | GHCB_HV_FT_SNP_AP_CREATION) -#ifdef CONFIG_KVM_AMD_SEV /* enable/disable SEV support */ static bool sev_enabled = true; module_param_named(sev, sev_enabled, bool, 0444); @@ -51,10 +49,36 @@ module_param_named(sev, sev_enabled, bool, 0444); /* enable/disable SEV-ES support */ static bool sev_es_enabled = true; module_param_named(sev_es, sev_es_enabled, bool, 0444); -#else -#define sev_enabled false -#define sev_es_enabled false -#endif /* CONFIG_KVM_AMD_SEV */ + +/* enable/disable SEV-SNP support */ +static bool sev_snp_enabled = true; +module_param_named(sev_snp, sev_snp_enabled, bool, 0444); + +/* enable/disable SEV-ES DebugSwap support */ +static bool sev_es_debug_swap_enabled = true; +module_param_named(debug_swap, sev_es_debug_swap_enabled, bool, 0444); +static u64 sev_supported_vmsa_features; + +#define AP_RESET_HOLD_NONE 0 +#define AP_RESET_HOLD_NAE_EVENT 1 +#define AP_RESET_HOLD_MSR_PROTO 2 + +/* As defined by SEV-SNP Firmware ABI, under "Guest Policy". */ +#define SNP_POLICY_MASK_API_MINOR GENMASK_ULL(7, 0) +#define SNP_POLICY_MASK_API_MAJOR GENMASK_ULL(15, 8) +#define SNP_POLICY_MASK_SMT BIT_ULL(16) +#define SNP_POLICY_MASK_RSVD_MBO BIT_ULL(17) +#define SNP_POLICY_MASK_DEBUG BIT_ULL(19) +#define SNP_POLICY_MASK_SINGLE_SOCKET BIT_ULL(20) + +#define SNP_POLICY_MASK_VALID (SNP_POLICY_MASK_API_MINOR | \ + SNP_POLICY_MASK_API_MAJOR | \ + SNP_POLICY_MASK_SMT | \ + SNP_POLICY_MASK_RSVD_MBO | \ + SNP_POLICY_MASK_DEBUG | \ + SNP_POLICY_MASK_SINGLE_SOCKET) + +#define INITIAL_VMSA_GPA 0xFFFFFFFFF000 static u8 sev_enc_bit; static DECLARE_RWSEM(sev_deactivate_lock); @@ -66,6 +90,8 @@ static unsigned int nr_asids; static unsigned long *sev_asid_bitmap; static unsigned long *sev_reclaim_asid_bitmap; +static int snp_decommission_context(struct kvm *kvm); + struct enc_region { struct list_head list; unsigned long npages; @@ -75,9 +101,10 @@ struct enc_region { }; /* Called with the sev_bitmap_lock held, or on shutdown */ -static int sev_flush_asids(int min_asid, int max_asid) +static int sev_flush_asids(unsigned int min_asid, unsigned int max_asid) { - int ret, asid, error = 0; + int ret, error = 0; + unsigned int asid; /* Check if there are any ASIDs to reclaim before performing a flush */ asid = find_next_bit(sev_reclaim_asid_bitmap, nr_asids, min_asid); @@ -91,23 +118,36 @@ static int sev_flush_asids(int min_asid, int max_asid) down_write(&sev_deactivate_lock); wbinvd_on_all_cpus(); - ret = sev_guest_df_flush(&error); + + if (sev_snp_enabled) + ret = sev_do_cmd(SEV_CMD_SNP_DF_FLUSH, NULL, &error); + else + ret = sev_guest_df_flush(&error); up_write(&sev_deactivate_lock); if (ret) - pr_err("SEV: DF_FLUSH failed, ret=%d, error=%#x\n", ret, error); + pr_err("SEV%s: DF_FLUSH failed, ret=%d, error=%#x\n", + sev_snp_enabled ? "-SNP" : "", ret, error); return ret; } static inline bool is_mirroring_enc_context(struct kvm *kvm) { - return !!to_kvm_svm(kvm)->sev_info.enc_context_owner; + return !!to_kvm_sev_info(kvm)->enc_context_owner; +} + +static bool sev_vcpu_has_debug_swap(struct vcpu_svm *svm) +{ + struct kvm_vcpu *vcpu = &svm->vcpu; + struct kvm_sev_info *sev = to_kvm_sev_info(vcpu->kvm); + + return sev->vmsa_features & SVM_SEV_FEAT_DEBUG_SWAP; } /* Must be called with the sev_bitmap_lock held */ -static bool __sev_recycle_asids(int min_asid, int max_asid) +static bool __sev_recycle_asids(unsigned int min_asid, unsigned int max_asid) { if (sev_flush_asids(min_asid, max_asid)) return false; @@ -134,8 +174,20 @@ static void sev_misc_cg_uncharge(struct kvm_sev_info *sev) static int sev_asid_new(struct kvm_sev_info *sev) { - int asid, min_asid, max_asid, ret; + /* + * SEV-enabled guests must use asid from min_sev_asid to max_sev_asid. + * SEV-ES-enabled guest can use from 1 to min_sev_asid - 1. + * Note: min ASID can end up larger than the max if basic SEV support is + * effectively disabled by disallowing use of ASIDs for SEV guests. + */ + unsigned int min_asid = sev->es_active ? 1 : min_sev_asid; + unsigned int max_asid = sev->es_active ? min_sev_asid - 1 : max_sev_asid; + unsigned int asid; bool retry = true; + int ret; + + if (min_asid > max_asid) + return -ENOTTY; WARN_ON(sev->misc_cg); sev->misc_cg = get_current_misc_cg(); @@ -148,12 +200,6 @@ static int sev_asid_new(struct kvm_sev_info *sev) mutex_lock(&sev_bitmap_lock); - /* - * SEV-enabled guests must use asid from min_sev_asid to max_sev_asid. - * SEV-ES-enabled guest can use from 1 to min_sev_asid - 1. - */ - min_asid = sev->es_active ? 1 : min_sev_asid; - max_asid = sev->es_active ? min_sev_asid - 1 : max_sev_asid; again: asid = find_next_zero_bit(sev_asid_bitmap, max_asid + 1, min_asid); if (asid > max_asid) { @@ -170,7 +216,8 @@ again: mutex_unlock(&sev_bitmap_lock); - return asid; + sev->asid = asid; + return 0; e_uncharge: sev_misc_cg_uncharge(sev); put_misc_cg(sev->misc_cg); @@ -178,11 +225,9 @@ e_uncharge: return ret; } -static int sev_get_asid(struct kvm *kvm) +static unsigned int sev_get_asid(struct kvm *kvm) { - struct kvm_sev_info *sev = &to_kvm_svm(kvm)->sev_info; - - return sev->asid; + return to_kvm_sev_info(kvm)->asid; } static void sev_asid_free(struct kvm_sev_info *sev) @@ -195,7 +240,7 @@ static void sev_asid_free(struct kvm_sev_info *sev) __set_bit(sev->asid, sev_reclaim_asid_bitmap); for_each_possible_cpu(cpu) { - sd = per_cpu(svm_data, cpu); + sd = per_cpu_ptr(&svm_data, cpu); sd->sev_vmcbs[sev->asid] = NULL; } @@ -217,6 +262,53 @@ static void sev_decommission(unsigned int handle) sev_guest_decommission(&decommission, NULL); } +/* + * Transition a page to hypervisor-owned/shared state in the RMP table. This + * should not fail under normal conditions, but leak the page should that + * happen since it will no longer be usable by the host due to RMP protections. + */ +static int kvm_rmp_make_shared(struct kvm *kvm, u64 pfn, enum pg_level level) +{ + if (KVM_BUG_ON(rmp_make_shared(pfn, level), kvm)) { + snp_leak_pages(pfn, page_level_size(level) >> PAGE_SHIFT); + return -EIO; + } + + return 0; +} + +/* + * Certain page-states, such as Pre-Guest and Firmware pages (as documented + * in Chapter 5 of the SEV-SNP Firmware ABI under "Page States") cannot be + * directly transitioned back to normal/hypervisor-owned state via RMPUPDATE + * unless they are reclaimed first. + * + * Until they are reclaimed and subsequently transitioned via RMPUPDATE, they + * might not be usable by the host due to being set as immutable or still + * being associated with a guest ASID. + * + * Bug the VM and leak the page if reclaim fails, or if the RMP entry can't be + * converted back to shared, as the page is no longer usable due to RMP + * protections, and it's infeasible for the guest to continue on. + */ +static int snp_page_reclaim(struct kvm *kvm, u64 pfn) +{ + struct sev_data_snp_page_reclaim data = {0}; + int fw_err, rc; + + data.paddr = __sme_set(pfn << PAGE_SHIFT); + rc = sev_do_cmd(SEV_CMD_SNP_PAGE_RECLAIM, &data, &fw_err); + if (KVM_BUG(rc, kvm, "Failed to reclaim PFN %llx, rc %d fw_err %d", pfn, rc, fw_err)) { + snp_leak_pages(pfn, 1); + return -EIO; + } + + if (kvm_rmp_make_shared(kvm, pfn, PG_LEVEL_4K)) + return -EIO; + + return rc; +} + static void sev_unbind_asid(struct kvm *kvm, unsigned int handle) { struct sev_data_deactivate deactivate; @@ -234,49 +326,199 @@ static void sev_unbind_asid(struct kvm *kvm, unsigned int handle) sev_decommission(handle); } -static int sev_guest_init(struct kvm *kvm, struct kvm_sev_cmd *argp) +/* + * This sets up bounce buffers/firmware pages to handle SNP Guest Request + * messages (e.g. attestation requests). See "SNP Guest Request" in the GHCB + * 2.0 specification for more details. + * + * Technically, when an SNP Guest Request is issued, the guest will provide its + * own request/response pages, which could in theory be passed along directly + * to firmware rather than using bounce pages. However, these pages would need + * special care: + * + * - Both pages are from shared guest memory, so they need to be protected + * from migration/etc. occurring while firmware reads/writes to them. At a + * minimum, this requires elevating the ref counts and potentially needing + * an explicit pinning of the memory. This places additional restrictions + * on what type of memory backends userspace can use for shared guest + * memory since there is some reliance on using refcounted pages. + * + * - The response page needs to be switched to Firmware-owned[1] state + * before the firmware can write to it, which can lead to potential + * host RMP #PFs if the guest is misbehaved and hands the host a + * guest page that KVM might write to for other reasons (e.g. virtio + * buffers/etc.). + * + * Both of these issues can be avoided completely by using separately-allocated + * bounce pages for both the request/response pages and passing those to + * firmware instead. So that's what is being set up here. + * + * Guest requests rely on message sequence numbers to ensure requests are + * issued to firmware in the order the guest issues them, so concurrent guest + * requests generally shouldn't happen. But a misbehaved guest could issue + * concurrent guest requests in theory, so a mutex is used to serialize + * access to the bounce buffers. + * + * [1] See the "Page States" section of the SEV-SNP Firmware ABI for more + * details on Firmware-owned pages, along with "RMP and VMPL Access Checks" + * in the APM for details on the related RMP restrictions. + */ +static int snp_guest_req_init(struct kvm *kvm) { - struct kvm_sev_info *sev = &to_kvm_svm(kvm)->sev_info; - int asid, ret; + struct kvm_sev_info *sev = to_kvm_sev_info(kvm); + struct page *req_page; + + req_page = alloc_page(GFP_KERNEL_ACCOUNT | __GFP_ZERO); + if (!req_page) + return -ENOMEM; + + sev->guest_resp_buf = snp_alloc_firmware_page(GFP_KERNEL_ACCOUNT | __GFP_ZERO); + if (!sev->guest_resp_buf) { + __free_page(req_page); + return -EIO; + } + + sev->guest_req_buf = page_address(req_page); + mutex_init(&sev->guest_req_mutex); + + return 0; +} + +static void snp_guest_req_cleanup(struct kvm *kvm) +{ + struct kvm_sev_info *sev = to_kvm_sev_info(kvm); + + if (sev->guest_resp_buf) + snp_free_firmware_page(sev->guest_resp_buf); + + if (sev->guest_req_buf) + __free_page(virt_to_page(sev->guest_req_buf)); + + sev->guest_req_buf = NULL; + sev->guest_resp_buf = NULL; +} + +static int __sev_guest_init(struct kvm *kvm, struct kvm_sev_cmd *argp, + struct kvm_sev_init *data, + unsigned long vm_type) +{ + struct kvm_sev_info *sev = to_kvm_sev_info(kvm); + struct sev_platform_init_args init_args = {0}; + bool es_active = vm_type != KVM_X86_SEV_VM; + u64 valid_vmsa_features = es_active ? sev_supported_vmsa_features : 0; + int ret; if (kvm->created_vcpus) return -EINVAL; - ret = -EBUSY; + if (data->flags) + return -EINVAL; + + if (data->vmsa_features & ~valid_vmsa_features) + return -EINVAL; + + if (data->ghcb_version > GHCB_VERSION_MAX || (!es_active && data->ghcb_version)) + return -EINVAL; + if (unlikely(sev->active)) - return ret; + return -EINVAL; sev->active = true; - sev->es_active = argp->id == KVM_SEV_ES_INIT; - asid = sev_asid_new(sev); - if (asid < 0) + sev->es_active = es_active; + sev->vmsa_features = data->vmsa_features; + sev->ghcb_version = data->ghcb_version; + + /* + * Currently KVM supports the full range of mandatory features defined + * by version 2 of the GHCB protocol, so default to that for SEV-ES + * guests created via KVM_SEV_INIT2. + */ + if (sev->es_active && !sev->ghcb_version) + sev->ghcb_version = GHCB_VERSION_DEFAULT; + + if (vm_type == KVM_X86_SNP_VM) + sev->vmsa_features |= SVM_SEV_FEAT_SNP_ACTIVE; + + ret = sev_asid_new(sev); + if (ret) goto e_no_asid; - sev->asid = asid; - ret = sev_platform_init(&argp->error); + init_args.probe = false; + ret = sev_platform_init(&init_args); if (ret) goto e_free; + /* This needs to happen after SEV/SNP firmware initialization. */ + if (vm_type == KVM_X86_SNP_VM) { + ret = snp_guest_req_init(kvm); + if (ret) + goto e_free; + } + INIT_LIST_HEAD(&sev->regions_list); INIT_LIST_HEAD(&sev->mirror_vms); + sev->need_init = false; kvm_set_apicv_inhibit(kvm, APICV_INHIBIT_REASON_SEV); return 0; e_free: + argp->error = init_args.error; sev_asid_free(sev); sev->asid = 0; e_no_asid: + sev->vmsa_features = 0; sev->es_active = false; sev->active = false; return ret; } +static int sev_guest_init(struct kvm *kvm, struct kvm_sev_cmd *argp) +{ + struct kvm_sev_init data = { + .vmsa_features = 0, + .ghcb_version = 0, + }; + unsigned long vm_type; + + if (kvm->arch.vm_type != KVM_X86_DEFAULT_VM) + return -EINVAL; + + vm_type = (argp->id == KVM_SEV_INIT ? KVM_X86_SEV_VM : KVM_X86_SEV_ES_VM); + + /* + * KVM_SEV_ES_INIT has been deprecated by KVM_SEV_INIT2, so it will + * continue to only ever support the minimal GHCB protocol version. + */ + if (vm_type == KVM_X86_SEV_ES_VM) + data.ghcb_version = GHCB_VERSION_MIN; + + return __sev_guest_init(kvm, argp, &data, vm_type); +} + +static int sev_guest_init2(struct kvm *kvm, struct kvm_sev_cmd *argp) +{ + struct kvm_sev_init data; + + if (!to_kvm_sev_info(kvm)->need_init) + return -EINVAL; + + if (kvm->arch.vm_type != KVM_X86_SEV_VM && + kvm->arch.vm_type != KVM_X86_SEV_ES_VM && + kvm->arch.vm_type != KVM_X86_SNP_VM) + return -EINVAL; + + if (copy_from_user(&data, u64_to_user_ptr(argp->data), sizeof(data))) + return -EFAULT; + + return __sev_guest_init(kvm, argp, &data, kvm->arch.vm_type); +} + static int sev_bind_asid(struct kvm *kvm, unsigned int handle, int *error) { + unsigned int asid = sev_get_asid(kvm); struct sev_data_activate activate; - int asid = sev_get_asid(kvm); int ret; /* activate ASID on the given handle */ @@ -289,29 +531,24 @@ static int sev_bind_asid(struct kvm *kvm, unsigned int handle, int *error) static int __sev_issue_cmd(int fd, int id, void *data, int *error) { - struct fd f; - int ret; + CLASS(fd, f)(fd); - f = fdget(fd); - if (!f.file) + if (fd_empty(f)) return -EBADF; - ret = sev_issue_cmd_external_user(f.file, id, data, error); - - fdput(f); - return ret; + return sev_issue_cmd_external_user(fd_file(f), id, data, error); } static int sev_issue_cmd(struct kvm *kvm, int id, void *data, int *error) { - struct kvm_sev_info *sev = &to_kvm_svm(kvm)->sev_info; + struct kvm_sev_info *sev = to_kvm_sev_info(kvm); return __sev_issue_cmd(sev->fd, id, data, error); } static int sev_launch_start(struct kvm *kvm, struct kvm_sev_cmd *argp) { - struct kvm_sev_info *sev = &to_kvm_svm(kvm)->sev_info; + struct kvm_sev_info *sev = to_kvm_sev_info(kvm); struct sev_data_launch_start start; struct kvm_sev_launch_start params; void *dh_blob, *session_blob; @@ -321,9 +558,11 @@ static int sev_launch_start(struct kvm *kvm, struct kvm_sev_cmd *argp) if (!sev_guest(kvm)) return -ENOTTY; - if (copy_from_user(¶ms, (void __user *)(uintptr_t)argp->data, sizeof(params))) + if (copy_from_user(¶ms, u64_to_user_ptr(argp->data), sizeof(params))) return -EFAULT; + sev->policy = params.policy; + memset(&start, 0, sizeof(start)); dh_blob = NULL; @@ -365,7 +604,7 @@ static int sev_launch_start(struct kvm *kvm, struct kvm_sev_cmd *argp) /* return handle to userspace */ params.handle = start.handle; - if (copy_to_user((void __user *)(uintptr_t)argp->data, ¶ms, sizeof(params))) { + if (copy_to_user(u64_to_user_ptr(argp->data), ¶ms, sizeof(params))) { sev_unbind_asid(kvm, start.handle); ret = -EFAULT; goto e_free_session; @@ -383,9 +622,9 @@ e_free_dh: static struct page **sev_pin_memory(struct kvm *kvm, unsigned long uaddr, unsigned long ulen, unsigned long *n, - int write) + unsigned int flags) { - struct kvm_sev_info *sev = &to_kvm_svm(kvm)->sev_info; + struct kvm_sev_info *sev = to_kvm_sev_info(kvm); unsigned long npages, size; int npinned; unsigned long locked, lock_limit; @@ -416,7 +655,7 @@ static struct page **sev_pin_memory(struct kvm *kvm, unsigned long uaddr, /* Avoid using vmalloc for smaller buffers. */ size = npages * sizeof(struct page *); if (size > PAGE_SIZE) - pages = __vmalloc(size, GFP_KERNEL_ACCOUNT | __GFP_ZERO); + pages = __vmalloc(size, GFP_KERNEL_ACCOUNT); else pages = kmalloc(size, GFP_KERNEL_ACCOUNT); @@ -424,7 +663,7 @@ static struct page **sev_pin_memory(struct kvm *kvm, unsigned long uaddr, return ERR_PTR(-ENOMEM); /* Pin the user virtual address. */ - npinned = pin_user_pages_fast(uaddr, npages, write ? FOLL_WRITE : 0, pages); + npinned = pin_user_pages_fast(uaddr, npages, flags, pages); if (npinned != npages) { pr_err("SEV: Failure locking %lu pages.\n", npages); ret = -ENOMEM; @@ -447,11 +686,9 @@ err: static void sev_unpin_memory(struct kvm *kvm, struct page **pages, unsigned long npages) { - struct kvm_sev_info *sev = &to_kvm_svm(kvm)->sev_info; - unpin_user_pages(pages, npages); kvfree(pages); - sev->pages_locked -= npages; + to_kvm_sev_info(kvm)->pages_locked -= npages; } static void sev_clflush_pages(struct page *pages[], unsigned long npages) @@ -464,9 +701,9 @@ static void sev_clflush_pages(struct page *pages[], unsigned long npages) return; for (i = 0; i < npages; i++) { - page_virtual = kmap_atomic(pages[i]); + page_virtual = kmap_local_page(pages[i]); clflush_cache_range(page_virtual, PAGE_SIZE); - kunmap_atomic(page_virtual); + kunmap_local(page_virtual); cond_resched(); } } @@ -495,7 +732,6 @@ static unsigned long get_num_contig_pages(unsigned long idx, static int sev_launch_update_data(struct kvm *kvm, struct kvm_sev_cmd *argp) { unsigned long vaddr, vaddr_end, next_vaddr, npages, pages, size, i; - struct kvm_sev_info *sev = &to_kvm_svm(kvm)->sev_info; struct kvm_sev_launch_update_data params; struct sev_data_launch_update_data data; struct page **inpages; @@ -504,7 +740,7 @@ static int sev_launch_update_data(struct kvm *kvm, struct kvm_sev_cmd *argp) if (!sev_guest(kvm)) return -ENOTTY; - if (copy_from_user(¶ms, (void __user *)(uintptr_t)argp->data, sizeof(params))) + if (copy_from_user(¶ms, u64_to_user_ptr(argp->data), sizeof(params))) return -EFAULT; vaddr = params.uaddr; @@ -512,7 +748,7 @@ static int sev_launch_update_data(struct kvm *kvm, struct kvm_sev_cmd *argp) vaddr_end = vaddr + size; /* Lock the user memory. */ - inpages = sev_pin_memory(kvm, vaddr, size, &npages, 1); + inpages = sev_pin_memory(kvm, vaddr, size, &npages, FOLL_WRITE); if (IS_ERR(inpages)) return PTR_ERR(inpages); @@ -523,7 +759,7 @@ static int sev_launch_update_data(struct kvm *kvm, struct kvm_sev_cmd *argp) sev_clflush_pages(inpages, npages); data.reserved = 0; - data.handle = sev->handle; + data.handle = to_kvm_sev_info(kvm)->handle; for (i = 0; vaddr < vaddr_end; vaddr = next_vaddr, i += pages) { int offset, len; @@ -562,7 +798,13 @@ e_unpin: static int sev_es_sync_vmsa(struct vcpu_svm *svm) { + struct kvm_vcpu *vcpu = &svm->vcpu; + struct kvm_sev_info *sev = to_kvm_sev_info(vcpu->kvm); struct sev_es_save_area *save = svm->sev_es.vmsa; + struct xregs_state *xsave; + const u8 *s; + u8 *d; + int i; /* Check some debug related fields before encrypting the VMSA */ if (svm->vcpu.guest_debug || (svm->vmcb->save.dr7 & ~DR7_FIXED_1)) @@ -603,6 +845,49 @@ static int sev_es_sync_vmsa(struct vcpu_svm *svm) save->xss = svm->vcpu.arch.ia32_xss; save->dr6 = svm->vcpu.arch.dr6; + save->sev_features = sev->vmsa_features; + + /* + * Skip FPU and AVX setup with KVM_SEV_ES_INIT to avoid + * breaking older measurements. + */ + if (vcpu->kvm->arch.vm_type != KVM_X86_DEFAULT_VM) { + xsave = &vcpu->arch.guest_fpu.fpstate->regs.xsave; + save->x87_dp = xsave->i387.rdp; + save->mxcsr = xsave->i387.mxcsr; + save->x87_ftw = xsave->i387.twd; + save->x87_fsw = xsave->i387.swd; + save->x87_fcw = xsave->i387.cwd; + save->x87_fop = xsave->i387.fop; + save->x87_ds = 0; + save->x87_cs = 0; + save->x87_rip = xsave->i387.rip; + + for (i = 0; i < 8; i++) { + /* + * The format of the x87 save area is undocumented and + * definitely not what you would expect. It consists of + * an 8*8 bytes area with bytes 0-7, and an 8*2 bytes + * area with bytes 8-9 of each register. + */ + d = save->fpreg_x87 + i * 8; + s = ((u8 *)xsave->i387.st_space) + i * 16; + memcpy(d, s, 8); + save->fpreg_x87[64 + i * 2] = s[8]; + save->fpreg_x87[64 + i * 2 + 1] = s[9]; + } + memcpy(save->fpreg_xmm, xsave->i387.xmm_space, 256); + + s = get_xsave_addr(xsave, XFEATURE_YMM); + if (s) + memcpy(save->fpreg_ymm, s, 256); + else + memset(save->fpreg_ymm, 0, 256); + } + + pr_debug("Virtual Machine Save Area (VMSA):\n"); + print_hex_dump_debug("", DUMP_PREFIX_NONE, 16, 1, save, sizeof(*save), false); + return 0; } @@ -613,6 +898,11 @@ static int __sev_launch_update_vmsa(struct kvm *kvm, struct kvm_vcpu *vcpu, struct vcpu_svm *svm = to_svm(vcpu); int ret; + if (vcpu->guest_debug) { + pr_warn_once("KVM_SET_GUEST_DEBUG for SEV-ES guest is not supported"); + return -EINVAL; + } + /* Perform some pre-encryption checks against the VMSA */ ret = sev_es_sync_vmsa(svm); if (ret) @@ -626,14 +916,29 @@ static int __sev_launch_update_vmsa(struct kvm *kvm, struct kvm_vcpu *vcpu, clflush_cache_range(svm->sev_es.vmsa, PAGE_SIZE); vmsa.reserved = 0; - vmsa.handle = to_kvm_svm(kvm)->sev_info.handle; + vmsa.handle = to_kvm_sev_info(kvm)->handle; vmsa.address = __sme_pa(svm->sev_es.vmsa); vmsa.len = PAGE_SIZE; ret = sev_issue_cmd(kvm, SEV_CMD_LAUNCH_UPDATE_VMSA, &vmsa, error); if (ret) return ret; + /* + * SEV-ES guests maintain an encrypted version of their FPU + * state which is restored and saved on VMRUN and VMEXIT. + * Mark vcpu->arch.guest_fpu->fpstate as scratch so it won't + * do xsave/xrstor on it. + */ + fpstate_set_confidential(&vcpu->arch.guest_fpu); vcpu->arch.guest_state_protected = true; + + /* + * SEV-ES guest mandates LBR Virtualization to be _always_ ON. Enable it + * only after setting guest_state_protected because KVM_SET_MSRS allows + * dynamic toggling of LBRV (for performance reason) on write access to + * MSR_IA32_DEBUGCTLMSR when guest_state_protected is not set. + */ + svm_enable_lbrv(vcpu); return 0; } @@ -663,8 +968,7 @@ static int sev_launch_update_vmsa(struct kvm *kvm, struct kvm_sev_cmd *argp) static int sev_launch_measure(struct kvm *kvm, struct kvm_sev_cmd *argp) { - void __user *measure = (void __user *)(uintptr_t)argp->data; - struct kvm_sev_info *sev = &to_kvm_svm(kvm)->sev_info; + void __user *measure = u64_to_user_ptr(argp->data); struct sev_data_launch_measure data; struct kvm_sev_launch_measure params; void __user *p = NULL; @@ -683,7 +987,7 @@ static int sev_launch_measure(struct kvm *kvm, struct kvm_sev_cmd *argp) if (!params.len) goto cmd; - p = (void __user *)(uintptr_t)params.uaddr; + p = u64_to_user_ptr(params.uaddr); if (p) { if (params.len > SEV_FW_BLOB_MAX_SIZE) return -EINVAL; @@ -697,7 +1001,7 @@ static int sev_launch_measure(struct kvm *kvm, struct kvm_sev_cmd *argp) } cmd: - data.handle = sev->handle; + data.handle = to_kvm_sev_info(kvm)->handle; ret = sev_issue_cmd(kvm, SEV_CMD_LAUNCH_MEASURE, &data, &argp->error); /* @@ -725,19 +1029,17 @@ e_free_blob: static int sev_launch_finish(struct kvm *kvm, struct kvm_sev_cmd *argp) { - struct kvm_sev_info *sev = &to_kvm_svm(kvm)->sev_info; struct sev_data_launch_finish data; if (!sev_guest(kvm)) return -ENOTTY; - data.handle = sev->handle; + data.handle = to_kvm_sev_info(kvm)->handle; return sev_issue_cmd(kvm, SEV_CMD_LAUNCH_FINISH, &data, &argp->error); } static int sev_guest_status(struct kvm *kvm, struct kvm_sev_cmd *argp) { - struct kvm_sev_info *sev = &to_kvm_svm(kvm)->sev_info; struct kvm_sev_guest_status params; struct sev_data_guest_status data; int ret; @@ -747,7 +1049,7 @@ static int sev_guest_status(struct kvm *kvm, struct kvm_sev_cmd *argp) memset(&data, 0, sizeof(data)); - data.handle = sev->handle; + data.handle = to_kvm_sev_info(kvm)->handle; ret = sev_issue_cmd(kvm, SEV_CMD_GUEST_STATUS, &data, &argp->error); if (ret) return ret; @@ -756,7 +1058,7 @@ static int sev_guest_status(struct kvm *kvm, struct kvm_sev_cmd *argp) params.state = data.state; params.handle = data.handle; - if (copy_to_user((void __user *)(uintptr_t)argp->data, ¶ms, sizeof(params))) + if (copy_to_user(u64_to_user_ptr(argp->data), ¶ms, sizeof(params))) ret = -EFAULT; return ret; @@ -766,11 +1068,10 @@ static int __sev_issue_dbg_cmd(struct kvm *kvm, unsigned long src, unsigned long dst, int size, int *error, bool enc) { - struct kvm_sev_info *sev = &to_kvm_svm(kvm)->sev_info; struct sev_data_dbg data; data.reserved = 0; - data.handle = sev->handle; + data.handle = to_kvm_sev_info(kvm)->handle; data.dst_addr = dst; data.src_addr = src; data.len = size; @@ -808,7 +1109,7 @@ static int __sev_dbg_decrypt_user(struct kvm *kvm, unsigned long paddr, if (!IS_ALIGNED(dst_paddr, 16) || !IS_ALIGNED(paddr, 16) || !IS_ALIGNED(size, 16)) { - tpage = (void *)alloc_page(GFP_KERNEL | __GFP_ZERO); + tpage = (void *)alloc_page(GFP_KERNEL_ACCOUNT | __GFP_ZERO); if (!tpage) return -ENOMEM; @@ -921,7 +1222,7 @@ static int sev_dbg_crypt(struct kvm *kvm, struct kvm_sev_cmd *argp, bool dec) if (!sev_guest(kvm)) return -ENOTTY; - if (copy_from_user(&debug, (void __user *)(uintptr_t)argp->data, sizeof(debug))) + if (copy_from_user(&debug, u64_to_user_ptr(argp->data), sizeof(debug))) return -EFAULT; if (!debug.len || debug.src_uaddr + debug.len < debug.src_uaddr) @@ -942,7 +1243,7 @@ static int sev_dbg_crypt(struct kvm *kvm, struct kvm_sev_cmd *argp, bool dec) if (IS_ERR(src_p)) return PTR_ERR(src_p); - dst_p = sev_pin_memory(kvm, dst_vaddr & PAGE_MASK, PAGE_SIZE, &n, 1); + dst_p = sev_pin_memory(kvm, dst_vaddr & PAGE_MASK, PAGE_SIZE, &n, FOLL_WRITE); if (IS_ERR(dst_p)) { sev_unpin_memory(kvm, src_p, n); return PTR_ERR(dst_p); @@ -994,7 +1295,6 @@ err: static int sev_launch_secret(struct kvm *kvm, struct kvm_sev_cmd *argp) { - struct kvm_sev_info *sev = &to_kvm_svm(kvm)->sev_info; struct sev_data_launch_secret data; struct kvm_sev_launch_secret params; struct page **pages; @@ -1005,10 +1305,10 @@ static int sev_launch_secret(struct kvm *kvm, struct kvm_sev_cmd *argp) if (!sev_guest(kvm)) return -ENOTTY; - if (copy_from_user(¶ms, (void __user *)(uintptr_t)argp->data, sizeof(params))) + if (copy_from_user(¶ms, u64_to_user_ptr(argp->data), sizeof(params))) return -EFAULT; - pages = sev_pin_memory(kvm, params.guest_uaddr, params.guest_len, &n, 1); + pages = sev_pin_memory(kvm, params.guest_uaddr, params.guest_len, &n, FOLL_WRITE); if (IS_ERR(pages)) return PTR_ERR(pages); @@ -1050,7 +1350,7 @@ static int sev_launch_secret(struct kvm *kvm, struct kvm_sev_cmd *argp) data.hdr_address = __psp_pa(hdr); data.hdr_len = params.hdr_len; - data.handle = sev->handle; + data.handle = to_kvm_sev_info(kvm)->handle; ret = sev_issue_cmd(kvm, SEV_CMD_LAUNCH_UPDATE_SECRET, &data, &argp->error); kfree(hdr); @@ -1069,8 +1369,7 @@ e_unpin_memory: static int sev_get_attestation_report(struct kvm *kvm, struct kvm_sev_cmd *argp) { - void __user *report = (void __user *)(uintptr_t)argp->data; - struct kvm_sev_info *sev = &to_kvm_svm(kvm)->sev_info; + void __user *report = u64_to_user_ptr(argp->data); struct sev_data_attestation_report data; struct kvm_sev_attestation_report params; void __user *p; @@ -1080,7 +1379,7 @@ static int sev_get_attestation_report(struct kvm *kvm, struct kvm_sev_cmd *argp) if (!sev_guest(kvm)) return -ENOTTY; - if (copy_from_user(¶ms, (void __user *)(uintptr_t)argp->data, sizeof(params))) + if (copy_from_user(¶ms, u64_to_user_ptr(argp->data), sizeof(params))) return -EFAULT; memset(&data, 0, sizeof(data)); @@ -1089,7 +1388,7 @@ static int sev_get_attestation_report(struct kvm *kvm, struct kvm_sev_cmd *argp) if (!params.len) goto cmd; - p = (void __user *)(uintptr_t)params.uaddr; + p = u64_to_user_ptr(params.uaddr); if (p) { if (params.len > SEV_FW_BLOB_MAX_SIZE) return -EINVAL; @@ -1103,7 +1402,7 @@ static int sev_get_attestation_report(struct kvm *kvm, struct kvm_sev_cmd *argp) memcpy(data.mnonce, params.mnonce, sizeof(params.mnonce)); } cmd: - data.handle = sev->handle; + data.handle = to_kvm_sev_info(kvm)->handle; ret = sev_issue_cmd(kvm, SEV_CMD_ATTESTATION_REPORT, &data, &argp->error); /* * If we query the session length, FW responded with expected data. @@ -1133,16 +1432,15 @@ static int __sev_send_start_query_session_length(struct kvm *kvm, struct kvm_sev_cmd *argp, struct kvm_sev_send_start *params) { - struct kvm_sev_info *sev = &to_kvm_svm(kvm)->sev_info; struct sev_data_send_start data; int ret; memset(&data, 0, sizeof(data)); - data.handle = sev->handle; + data.handle = to_kvm_sev_info(kvm)->handle; ret = sev_issue_cmd(kvm, SEV_CMD_SEND_START, &data, &argp->error); params->session_len = data.session_len; - if (copy_to_user((void __user *)(uintptr_t)argp->data, params, + if (copy_to_user(u64_to_user_ptr(argp->data), params, sizeof(struct kvm_sev_send_start))) ret = -EFAULT; @@ -1151,7 +1449,6 @@ __sev_send_start_query_session_length(struct kvm *kvm, struct kvm_sev_cmd *argp, static int sev_send_start(struct kvm *kvm, struct kvm_sev_cmd *argp) { - struct kvm_sev_info *sev = &to_kvm_svm(kvm)->sev_info; struct sev_data_send_start data; struct kvm_sev_send_start params; void *amd_certs, *session_data; @@ -1161,7 +1458,7 @@ static int sev_send_start(struct kvm *kvm, struct kvm_sev_cmd *argp) if (!sev_guest(kvm)) return -ENOTTY; - if (copy_from_user(¶ms, (void __user *)(uintptr_t)argp->data, + if (copy_from_user(¶ms, u64_to_user_ptr(argp->data), sizeof(struct kvm_sev_send_start))) return -EFAULT; @@ -1212,11 +1509,11 @@ static int sev_send_start(struct kvm *kvm, struct kvm_sev_cmd *argp) data.amd_certs_len = params.amd_certs_len; data.session_address = __psp_pa(session_data); data.session_len = params.session_len; - data.handle = sev->handle; + data.handle = to_kvm_sev_info(kvm)->handle; ret = sev_issue_cmd(kvm, SEV_CMD_SEND_START, &data, &argp->error); - if (!ret && copy_to_user((void __user *)(uintptr_t)params.session_uaddr, + if (!ret && copy_to_user(u64_to_user_ptr(params.session_uaddr), session_data, params.session_len)) { ret = -EFAULT; goto e_free_amd_cert; @@ -1224,7 +1521,7 @@ static int sev_send_start(struct kvm *kvm, struct kvm_sev_cmd *argp) params.policy = data.policy; params.session_len = data.session_len; - if (copy_to_user((void __user *)(uintptr_t)argp->data, ¶ms, + if (copy_to_user(u64_to_user_ptr(argp->data), ¶ms, sizeof(struct kvm_sev_send_start))) ret = -EFAULT; @@ -1244,18 +1541,17 @@ static int __sev_send_update_data_query_lengths(struct kvm *kvm, struct kvm_sev_cmd *argp, struct kvm_sev_send_update_data *params) { - struct kvm_sev_info *sev = &to_kvm_svm(kvm)->sev_info; struct sev_data_send_update_data data; int ret; memset(&data, 0, sizeof(data)); - data.handle = sev->handle; + data.handle = to_kvm_sev_info(kvm)->handle; ret = sev_issue_cmd(kvm, SEV_CMD_SEND_UPDATE_DATA, &data, &argp->error); params->hdr_len = data.hdr_len; params->trans_len = data.trans_len; - if (copy_to_user((void __user *)(uintptr_t)argp->data, params, + if (copy_to_user(u64_to_user_ptr(argp->data), params, sizeof(struct kvm_sev_send_update_data))) ret = -EFAULT; @@ -1264,7 +1560,6 @@ __sev_send_update_data_query_lengths(struct kvm *kvm, struct kvm_sev_cmd *argp, static int sev_send_update_data(struct kvm *kvm, struct kvm_sev_cmd *argp) { - struct kvm_sev_info *sev = &to_kvm_svm(kvm)->sev_info; struct sev_data_send_update_data data; struct kvm_sev_send_update_data params; void *hdr, *trans_data; @@ -1275,7 +1570,7 @@ static int sev_send_update_data(struct kvm *kvm, struct kvm_sev_cmd *argp) if (!sev_guest(kvm)) return -ENOTTY; - if (copy_from_user(¶ms, (void __user *)(uintptr_t)argp->data, + if (copy_from_user(¶ms, u64_to_user_ptr(argp->data), sizeof(struct kvm_sev_send_update_data))) return -EFAULT; @@ -1289,7 +1584,7 @@ static int sev_send_update_data(struct kvm *kvm, struct kvm_sev_cmd *argp) /* Check if we are crossing the page boundary */ offset = params.guest_uaddr & (PAGE_SIZE - 1); - if ((params.guest_len + offset > PAGE_SIZE)) + if (params.guest_len > PAGE_SIZE || (params.guest_len + offset) > PAGE_SIZE) return -EINVAL; /* Pin guest memory */ @@ -1300,11 +1595,11 @@ static int sev_send_update_data(struct kvm *kvm, struct kvm_sev_cmd *argp) /* allocate memory for header and transport buffer */ ret = -ENOMEM; - hdr = kzalloc(params.hdr_len, GFP_KERNEL_ACCOUNT); + hdr = kzalloc(params.hdr_len, GFP_KERNEL); if (!hdr) goto e_unpin; - trans_data = kzalloc(params.trans_len, GFP_KERNEL_ACCOUNT); + trans_data = kzalloc(params.trans_len, GFP_KERNEL); if (!trans_data) goto e_free_hdr; @@ -1318,7 +1613,7 @@ static int sev_send_update_data(struct kvm *kvm, struct kvm_sev_cmd *argp) data.guest_address = (page_to_pfn(guest_page[0]) << PAGE_SHIFT) + offset; data.guest_address |= sev_me_mask; data.guest_len = params.guest_len; - data.handle = sev->handle; + data.handle = to_kvm_sev_info(kvm)->handle; ret = sev_issue_cmd(kvm, SEV_CMD_SEND_UPDATE_DATA, &data, &argp->error); @@ -1326,14 +1621,14 @@ static int sev_send_update_data(struct kvm *kvm, struct kvm_sev_cmd *argp) goto e_free_trans_data; /* copy transport buffer to user space */ - if (copy_to_user((void __user *)(uintptr_t)params.trans_uaddr, + if (copy_to_user(u64_to_user_ptr(params.trans_uaddr), trans_data, params.trans_len)) { ret = -EFAULT; goto e_free_trans_data; } /* Copy packet header to userspace. */ - if (copy_to_user((void __user *)(uintptr_t)params.hdr_uaddr, hdr, + if (copy_to_user(u64_to_user_ptr(params.hdr_uaddr), hdr, params.hdr_len)) ret = -EFAULT; @@ -1349,31 +1644,29 @@ e_unpin: static int sev_send_finish(struct kvm *kvm, struct kvm_sev_cmd *argp) { - struct kvm_sev_info *sev = &to_kvm_svm(kvm)->sev_info; struct sev_data_send_finish data; if (!sev_guest(kvm)) return -ENOTTY; - data.handle = sev->handle; + data.handle = to_kvm_sev_info(kvm)->handle; return sev_issue_cmd(kvm, SEV_CMD_SEND_FINISH, &data, &argp->error); } static int sev_send_cancel(struct kvm *kvm, struct kvm_sev_cmd *argp) { - struct kvm_sev_info *sev = &to_kvm_svm(kvm)->sev_info; struct sev_data_send_cancel data; if (!sev_guest(kvm)) return -ENOTTY; - data.handle = sev->handle; + data.handle = to_kvm_sev_info(kvm)->handle; return sev_issue_cmd(kvm, SEV_CMD_SEND_CANCEL, &data, &argp->error); } static int sev_receive_start(struct kvm *kvm, struct kvm_sev_cmd *argp) { - struct kvm_sev_info *sev = &to_kvm_svm(kvm)->sev_info; + struct kvm_sev_info *sev = to_kvm_sev_info(kvm); struct sev_data_receive_start start; struct kvm_sev_receive_start params; int *error = &argp->error; @@ -1385,7 +1678,7 @@ static int sev_receive_start(struct kvm *kvm, struct kvm_sev_cmd *argp) return -ENOTTY; /* Get parameter from the userspace */ - if (copy_from_user(¶ms, (void __user *)(uintptr_t)argp->data, + if (copy_from_user(¶ms, u64_to_user_ptr(argp->data), sizeof(struct kvm_sev_receive_start))) return -EFAULT; @@ -1427,7 +1720,7 @@ static int sev_receive_start(struct kvm *kvm, struct kvm_sev_cmd *argp) } params.handle = start.handle; - if (copy_to_user((void __user *)(uintptr_t)argp->data, + if (copy_to_user(u64_to_user_ptr(argp->data), ¶ms, sizeof(struct kvm_sev_receive_start))) { ret = -EFAULT; sev_unbind_asid(kvm, start.handle); @@ -1447,7 +1740,6 @@ e_free_pdh: static int sev_receive_update_data(struct kvm *kvm, struct kvm_sev_cmd *argp) { - struct kvm_sev_info *sev = &to_kvm_svm(kvm)->sev_info; struct kvm_sev_receive_update_data params; struct sev_data_receive_update_data data; void *hdr = NULL, *trans = NULL; @@ -1458,7 +1750,7 @@ static int sev_receive_update_data(struct kvm *kvm, struct kvm_sev_cmd *argp) if (!sev_guest(kvm)) return -EINVAL; - if (copy_from_user(¶ms, (void __user *)(uintptr_t)argp->data, + if (copy_from_user(¶ms, u64_to_user_ptr(argp->data), sizeof(struct kvm_sev_receive_update_data))) return -EFAULT; @@ -1469,7 +1761,7 @@ static int sev_receive_update_data(struct kvm *kvm, struct kvm_sev_cmd *argp) /* Check if we are crossing the page boundary */ offset = params.guest_uaddr & (PAGE_SIZE - 1); - if ((params.guest_len + offset > PAGE_SIZE)) + if (params.guest_len > PAGE_SIZE || (params.guest_len + offset) > PAGE_SIZE) return -EINVAL; hdr = psp_copy_user_blob(params.hdr_uaddr, params.hdr_len); @@ -1490,7 +1782,7 @@ static int sev_receive_update_data(struct kvm *kvm, struct kvm_sev_cmd *argp) /* Pin guest memory */ guest_page = sev_pin_memory(kvm, params.guest_uaddr & PAGE_MASK, - PAGE_SIZE, &n, 1); + PAGE_SIZE, &n, FOLL_WRITE); if (IS_ERR(guest_page)) { ret = PTR_ERR(guest_page); goto e_free_trans; @@ -1507,7 +1799,7 @@ static int sev_receive_update_data(struct kvm *kvm, struct kvm_sev_cmd *argp) data.guest_address = (page_to_pfn(guest_page[0]) << PAGE_SHIFT) + offset; data.guest_address |= sev_me_mask; data.guest_len = params.guest_len; - data.handle = sev->handle; + data.handle = to_kvm_sev_info(kvm)->handle; ret = sev_issue_cmd(kvm, SEV_CMD_RECEIVE_UPDATE_DATA, &data, &argp->error); @@ -1524,13 +1816,12 @@ e_free_hdr: static int sev_receive_finish(struct kvm *kvm, struct kvm_sev_cmd *argp) { - struct kvm_sev_info *sev = &to_kvm_svm(kvm)->sev_info; struct sev_data_receive_finish data; if (!sev_guest(kvm)) return -ENOTTY; - data.handle = sev->handle; + data.handle = to_kvm_sev_info(kvm)->handle; return sev_issue_cmd(kvm, SEV_CMD_RECEIVE_FINISH, &data, &argp->error); } @@ -1550,8 +1841,8 @@ static bool is_cmd_allowed_from_mirror(u32 cmd_id) static int sev_lock_two_vms(struct kvm *dst_kvm, struct kvm *src_kvm) { - struct kvm_sev_info *dst_sev = &to_kvm_svm(dst_kvm)->sev_info; - struct kvm_sev_info *src_sev = &to_kvm_svm(src_kvm)->sev_info; + struct kvm_sev_info *dst_sev = to_kvm_sev_info(dst_kvm); + struct kvm_sev_info *src_sev = to_kvm_sev_info(src_kvm); int r = -EBUSY; if (dst_kvm == src_kvm) @@ -1585,8 +1876,8 @@ release_dst: static void sev_unlock_two_vms(struct kvm *dst_kvm, struct kvm *src_kvm) { - struct kvm_sev_info *dst_sev = &to_kvm_svm(dst_kvm)->sev_info; - struct kvm_sev_info *src_sev = &to_kvm_svm(src_kvm)->sev_info; + struct kvm_sev_info *dst_sev = to_kvm_sev_info(dst_kvm); + struct kvm_sev_info *src_sev = to_kvm_sev_info(src_kvm); mutex_unlock(&dst_kvm->lock); mutex_unlock(&src_kvm->lock); @@ -1594,77 +1885,10 @@ static void sev_unlock_two_vms(struct kvm *dst_kvm, struct kvm *src_kvm) atomic_set_release(&src_sev->migration_in_progress, 0); } -/* vCPU mutex subclasses. */ -enum sev_migration_role { - SEV_MIGRATION_SOURCE = 0, - SEV_MIGRATION_TARGET, - SEV_NR_MIGRATION_ROLES, -}; - -static int sev_lock_vcpus_for_migration(struct kvm *kvm, - enum sev_migration_role role) -{ - struct kvm_vcpu *vcpu; - unsigned long i, j; - bool first = true; - - kvm_for_each_vcpu(i, vcpu, kvm) { - if (mutex_lock_killable_nested(&vcpu->mutex, role)) - goto out_unlock; - - if (first) { - /* - * Reset the role to one that avoids colliding with - * the role used for the first vcpu mutex. - */ - role = SEV_NR_MIGRATION_ROLES; - first = false; - } else { - mutex_release(&vcpu->mutex.dep_map, _THIS_IP_); - } - } - - return 0; - -out_unlock: - - first = true; - kvm_for_each_vcpu(j, vcpu, kvm) { - if (i == j) - break; - - if (first) - first = false; - else - mutex_acquire(&vcpu->mutex.dep_map, role, 0, _THIS_IP_); - - - mutex_unlock(&vcpu->mutex); - } - return -EINTR; -} - -static void sev_unlock_vcpus_for_migration(struct kvm *kvm) -{ - struct kvm_vcpu *vcpu; - unsigned long i; - bool first = true; - - kvm_for_each_vcpu(i, vcpu, kvm) { - if (first) - first = false; - else - mutex_acquire(&vcpu->mutex.dep_map, - SEV_NR_MIGRATION_ROLES, 0, _THIS_IP_); - - mutex_unlock(&vcpu->mutex); - } -} - static void sev_migrate_from(struct kvm *dst_kvm, struct kvm *src_kvm) { - struct kvm_sev_info *dst = &to_kvm_svm(dst_kvm)->sev_info; - struct kvm_sev_info *src = &to_kvm_svm(src_kvm)->sev_info; + struct kvm_sev_info *dst = to_kvm_sev_info(dst_kvm); + struct kvm_sev_info *src = to_kvm_sev_info(src_kvm); struct kvm_vcpu *dst_vcpu, *src_vcpu; struct vcpu_svm *dst_svm, *src_svm; struct kvm_sev_info *mirror; @@ -1676,6 +1900,7 @@ static void sev_migrate_from(struct kvm *dst_kvm, struct kvm *src_kvm) dst->pages_locked = src->pages_locked; dst->enc_context_owner = src->enc_context_owner; dst->es_active = src->es_active; + dst->vmsa_features = src->vmsa_features; src->asid = 0; src->active = false; @@ -1703,8 +1928,7 @@ static void sev_migrate_from(struct kvm *dst_kvm, struct kvm *src_kvm) * and add the new mirror to the list. */ if (is_mirroring_enc_context(dst_kvm)) { - struct kvm_sev_info *owner_sev_info = - &to_kvm_svm(dst->enc_context_owner)->sev_info; + struct kvm_sev_info *owner_sev_info = to_kvm_sev_info(dst->enc_context_owner); list_del(&src->mirror_entry); list_add_tail(&dst->mirror_entry, &owner_sev_info->mirror_vms); @@ -1722,7 +1946,7 @@ static void sev_migrate_from(struct kvm *dst_kvm, struct kvm *src_kvm) * Note, the source is not required to have the same number of * vCPUs as the destination when migrating a vanilla SEV VM. */ - src_vcpu = kvm_get_vcpu(dst_kvm, i); + src_vcpu = kvm_get_vcpu(src_kvm, i); src_svm = to_svm(src_vcpu); /* @@ -1763,30 +1987,31 @@ static int sev_check_source_vcpus(struct kvm *dst, struct kvm *src) int sev_vm_move_enc_context_from(struct kvm *kvm, unsigned int source_fd) { - struct kvm_sev_info *dst_sev = &to_kvm_svm(kvm)->sev_info; + struct kvm_sev_info *dst_sev = to_kvm_sev_info(kvm); struct kvm_sev_info *src_sev, *cg_cleanup_sev; - struct file *source_kvm_file; + CLASS(fd, f)(source_fd); struct kvm *source_kvm; bool charged = false; int ret; - source_kvm_file = fget(source_fd); - if (!file_is_kvm(source_kvm_file)) { - ret = -EBADF; - goto out_fput; - } + if (fd_empty(f)) + return -EBADF; + + if (!file_is_kvm(fd_file(f))) + return -EBADF; - source_kvm = source_kvm_file->private_data; + source_kvm = fd_file(f)->private_data; ret = sev_lock_two_vms(kvm, source_kvm); if (ret) - goto out_fput; + return ret; - if (sev_guest(kvm) || !sev_guest(source_kvm)) { + if (kvm->arch.vm_type != source_kvm->arch.vm_type || + sev_guest(kvm) || !sev_guest(source_kvm)) { ret = -EINVAL; goto out_unlock; } - src_sev = &to_kvm_svm(source_kvm)->sev_info; + src_sev = to_kvm_sev_info(source_kvm); dst_sev->misc_cg = get_current_misc_cg(); cg_cleanup_sev = dst_sev; @@ -1797,10 +2022,10 @@ int sev_vm_move_enc_context_from(struct kvm *kvm, unsigned int source_fd) charged = true; } - ret = sev_lock_vcpus_for_migration(kvm, SEV_MIGRATION_SOURCE); + ret = kvm_lock_all_vcpus(kvm); if (ret) goto out_dst_cgroup; - ret = sev_lock_vcpus_for_migration(source_kvm, SEV_MIGRATION_TARGET); + ret = kvm_lock_all_vcpus(source_kvm); if (ret) goto out_dst_vcpu; @@ -1814,9 +2039,9 @@ int sev_vm_move_enc_context_from(struct kvm *kvm, unsigned int source_fd) ret = 0; out_source_vcpu: - sev_unlock_vcpus_for_migration(source_kvm); + kvm_unlock_all_vcpus(source_kvm); out_dst_vcpu: - sev_unlock_vcpus_for_migration(kvm); + kvm_unlock_all_vcpus(kvm); out_dst_cgroup: /* Operates on the source on success, on the destination on failure. */ if (charged) @@ -1825,9 +2050,429 @@ out_dst_cgroup: cg_cleanup_sev->misc_cg = NULL; out_unlock: sev_unlock_two_vms(kvm, source_kvm); -out_fput: - if (source_kvm_file) - fput(source_kvm_file); + return ret; +} + +int sev_dev_get_attr(u32 group, u64 attr, u64 *val) +{ + if (group != KVM_X86_GRP_SEV) + return -ENXIO; + + switch (attr) { + case KVM_X86_SEV_VMSA_FEATURES: + *val = sev_supported_vmsa_features; + return 0; + + default: + return -ENXIO; + } +} + +/* + * The guest context contains all the information, keys and metadata + * associated with the guest that the firmware tracks to implement SEV + * and SNP features. The firmware stores the guest context in hypervisor + * provide page via the SNP_GCTX_CREATE command. + */ +static void *snp_context_create(struct kvm *kvm, struct kvm_sev_cmd *argp) +{ + struct sev_data_snp_addr data = {}; + void *context; + int rc; + + /* Allocate memory for context page */ + context = snp_alloc_firmware_page(GFP_KERNEL_ACCOUNT); + if (!context) + return NULL; + + data.address = __psp_pa(context); + rc = __sev_issue_cmd(argp->sev_fd, SEV_CMD_SNP_GCTX_CREATE, &data, &argp->error); + if (rc) { + pr_warn("Failed to create SEV-SNP context, rc %d fw_error %d", + rc, argp->error); + snp_free_firmware_page(context); + return NULL; + } + + return context; +} + +static int snp_bind_asid(struct kvm *kvm, int *error) +{ + struct kvm_sev_info *sev = to_kvm_sev_info(kvm); + struct sev_data_snp_activate data = {0}; + + data.gctx_paddr = __psp_pa(sev->snp_context); + data.asid = sev_get_asid(kvm); + return sev_issue_cmd(kvm, SEV_CMD_SNP_ACTIVATE, &data, error); +} + +static int snp_launch_start(struct kvm *kvm, struct kvm_sev_cmd *argp) +{ + struct kvm_sev_info *sev = to_kvm_sev_info(kvm); + struct sev_data_snp_launch_start start = {0}; + struct kvm_sev_snp_launch_start params; + int rc; + + if (!sev_snp_guest(kvm)) + return -ENOTTY; + + if (copy_from_user(¶ms, u64_to_user_ptr(argp->data), sizeof(params))) + return -EFAULT; + + /* Don't allow userspace to allocate memory for more than 1 SNP context. */ + if (sev->snp_context) + return -EINVAL; + + if (params.flags) + return -EINVAL; + + if (params.policy & ~SNP_POLICY_MASK_VALID) + return -EINVAL; + + /* Check for policy bits that must be set */ + if (!(params.policy & SNP_POLICY_MASK_RSVD_MBO) || + !(params.policy & SNP_POLICY_MASK_SMT)) + return -EINVAL; + + if (params.policy & SNP_POLICY_MASK_SINGLE_SOCKET) + return -EINVAL; + + sev->policy = params.policy; + + sev->snp_context = snp_context_create(kvm, argp); + if (!sev->snp_context) + return -ENOTTY; + + start.gctx_paddr = __psp_pa(sev->snp_context); + start.policy = params.policy; + memcpy(start.gosvw, params.gosvw, sizeof(params.gosvw)); + rc = __sev_issue_cmd(argp->sev_fd, SEV_CMD_SNP_LAUNCH_START, &start, &argp->error); + if (rc) { + pr_debug("%s: SEV_CMD_SNP_LAUNCH_START firmware command failed, rc %d\n", + __func__, rc); + goto e_free_context; + } + + sev->fd = argp->sev_fd; + rc = snp_bind_asid(kvm, &argp->error); + if (rc) { + pr_debug("%s: Failed to bind ASID to SEV-SNP context, rc %d\n", + __func__, rc); + goto e_free_context; + } + + return 0; + +e_free_context: + snp_decommission_context(kvm); + + return rc; +} + +struct sev_gmem_populate_args { + __u8 type; + int sev_fd; + int fw_error; +}; + +static int sev_gmem_post_populate(struct kvm *kvm, gfn_t gfn_start, kvm_pfn_t pfn, + void __user *src, int order, void *opaque) +{ + struct sev_gmem_populate_args *sev_populate_args = opaque; + struct kvm_sev_info *sev = to_kvm_sev_info(kvm); + int n_private = 0, ret, i; + int npages = (1 << order); + gfn_t gfn; + + if (WARN_ON_ONCE(sev_populate_args->type != KVM_SEV_SNP_PAGE_TYPE_ZERO && !src)) + return -EINVAL; + + for (gfn = gfn_start, i = 0; gfn < gfn_start + npages; gfn++, i++) { + struct sev_data_snp_launch_update fw_args = {0}; + bool assigned = false; + int level; + + ret = snp_lookup_rmpentry((u64)pfn + i, &assigned, &level); + if (ret || assigned) { + pr_debug("%s: Failed to ensure GFN 0x%llx RMP entry is initial shared state, ret: %d assigned: %d\n", + __func__, gfn, ret, assigned); + ret = ret ? -EINVAL : -EEXIST; + goto err; + } + + if (src) { + void *vaddr = kmap_local_pfn(pfn + i); + + if (copy_from_user(vaddr, src + i * PAGE_SIZE, PAGE_SIZE)) { + ret = -EFAULT; + goto err; + } + kunmap_local(vaddr); + } + + ret = rmp_make_private(pfn + i, gfn << PAGE_SHIFT, PG_LEVEL_4K, + sev_get_asid(kvm), true); + if (ret) + goto err; + + n_private++; + + fw_args.gctx_paddr = __psp_pa(sev->snp_context); + fw_args.address = __sme_set(pfn_to_hpa(pfn + i)); + fw_args.page_size = PG_LEVEL_TO_RMP(PG_LEVEL_4K); + fw_args.page_type = sev_populate_args->type; + + ret = __sev_issue_cmd(sev_populate_args->sev_fd, SEV_CMD_SNP_LAUNCH_UPDATE, + &fw_args, &sev_populate_args->fw_error); + if (ret) + goto fw_err; + } + + return 0; + +fw_err: + /* + * If the firmware command failed handle the reclaim and cleanup of that + * PFN specially vs. prior pages which can be cleaned up below without + * needing to reclaim in advance. + * + * Additionally, when invalid CPUID function entries are detected, + * firmware writes the expected values into the page and leaves it + * unencrypted so it can be used for debugging and error-reporting. + * + * Copy this page back into the source buffer so userspace can use this + * information to provide information on which CPUID leaves/fields + * failed CPUID validation. + */ + if (!snp_page_reclaim(kvm, pfn + i) && + sev_populate_args->type == KVM_SEV_SNP_PAGE_TYPE_CPUID && + sev_populate_args->fw_error == SEV_RET_INVALID_PARAM) { + void *vaddr = kmap_local_pfn(pfn + i); + + if (copy_to_user(src + i * PAGE_SIZE, vaddr, PAGE_SIZE)) + pr_debug("Failed to write CPUID page back to userspace\n"); + + kunmap_local(vaddr); + } + + /* pfn + i is hypervisor-owned now, so skip below cleanup for it. */ + n_private--; + +err: + pr_debug("%s: exiting with error ret %d (fw_error %d), restoring %d gmem PFNs to shared.\n", + __func__, ret, sev_populate_args->fw_error, n_private); + for (i = 0; i < n_private; i++) + kvm_rmp_make_shared(kvm, pfn + i, PG_LEVEL_4K); + + return ret; +} + +static int snp_launch_update(struct kvm *kvm, struct kvm_sev_cmd *argp) +{ + struct kvm_sev_info *sev = to_kvm_sev_info(kvm); + struct sev_gmem_populate_args sev_populate_args = {0}; + struct kvm_sev_snp_launch_update params; + struct kvm_memory_slot *memslot; + long npages, count; + void __user *src; + int ret = 0; + + if (!sev_snp_guest(kvm) || !sev->snp_context) + return -EINVAL; + + if (copy_from_user(¶ms, u64_to_user_ptr(argp->data), sizeof(params))) + return -EFAULT; + + pr_debug("%s: GFN start 0x%llx length 0x%llx type %d flags %d\n", __func__, + params.gfn_start, params.len, params.type, params.flags); + + if (!PAGE_ALIGNED(params.len) || params.flags || + (params.type != KVM_SEV_SNP_PAGE_TYPE_NORMAL && + params.type != KVM_SEV_SNP_PAGE_TYPE_ZERO && + params.type != KVM_SEV_SNP_PAGE_TYPE_UNMEASURED && + params.type != KVM_SEV_SNP_PAGE_TYPE_SECRETS && + params.type != KVM_SEV_SNP_PAGE_TYPE_CPUID)) + return -EINVAL; + + npages = params.len / PAGE_SIZE; + + /* + * For each GFN that's being prepared as part of the initial guest + * state, the following pre-conditions are verified: + * + * 1) The backing memslot is a valid private memslot. + * 2) The GFN has been set to private via KVM_SET_MEMORY_ATTRIBUTES + * beforehand. + * 3) The PFN of the guest_memfd has not already been set to private + * in the RMP table. + * + * The KVM MMU relies on kvm->mmu_invalidate_seq to retry nested page + * faults if there's a race between a fault and an attribute update via + * KVM_SET_MEMORY_ATTRIBUTES, and a similar approach could be utilized + * here. However, kvm->slots_lock guards against both this as well as + * concurrent memslot updates occurring while these checks are being + * performed, so use that here to make it easier to reason about the + * initial expected state and better guard against unexpected + * situations. + */ + mutex_lock(&kvm->slots_lock); + + memslot = gfn_to_memslot(kvm, params.gfn_start); + if (!kvm_slot_can_be_private(memslot)) { + ret = -EINVAL; + goto out; + } + + sev_populate_args.sev_fd = argp->sev_fd; + sev_populate_args.type = params.type; + src = params.type == KVM_SEV_SNP_PAGE_TYPE_ZERO ? NULL : u64_to_user_ptr(params.uaddr); + + count = kvm_gmem_populate(kvm, params.gfn_start, src, npages, + sev_gmem_post_populate, &sev_populate_args); + if (count < 0) { + argp->error = sev_populate_args.fw_error; + pr_debug("%s: kvm_gmem_populate failed, ret %ld (fw_error %d)\n", + __func__, count, argp->error); + ret = -EIO; + } else { + params.gfn_start += count; + params.len -= count * PAGE_SIZE; + if (params.type != KVM_SEV_SNP_PAGE_TYPE_ZERO) + params.uaddr += count * PAGE_SIZE; + + ret = 0; + if (copy_to_user(u64_to_user_ptr(argp->data), ¶ms, sizeof(params))) + ret = -EFAULT; + } + +out: + mutex_unlock(&kvm->slots_lock); + + return ret; +} + +static int snp_launch_update_vmsa(struct kvm *kvm, struct kvm_sev_cmd *argp) +{ + struct kvm_sev_info *sev = to_kvm_sev_info(kvm); + struct sev_data_snp_launch_update data = {}; + struct kvm_vcpu *vcpu; + unsigned long i; + int ret; + + data.gctx_paddr = __psp_pa(sev->snp_context); + data.page_type = SNP_PAGE_TYPE_VMSA; + + kvm_for_each_vcpu(i, vcpu, kvm) { + struct vcpu_svm *svm = to_svm(vcpu); + u64 pfn = __pa(svm->sev_es.vmsa) >> PAGE_SHIFT; + + ret = sev_es_sync_vmsa(svm); + if (ret) + return ret; + + /* Transition the VMSA page to a firmware state. */ + ret = rmp_make_private(pfn, INITIAL_VMSA_GPA, PG_LEVEL_4K, sev->asid, true); + if (ret) + return ret; + + /* Issue the SNP command to encrypt the VMSA */ + data.address = __sme_pa(svm->sev_es.vmsa); + ret = __sev_issue_cmd(argp->sev_fd, SEV_CMD_SNP_LAUNCH_UPDATE, + &data, &argp->error); + if (ret) { + snp_page_reclaim(kvm, pfn); + + return ret; + } + + svm->vcpu.arch.guest_state_protected = true; + /* + * SEV-ES (and thus SNP) guest mandates LBR Virtualization to + * be _always_ ON. Enable it only after setting + * guest_state_protected because KVM_SET_MSRS allows dynamic + * toggling of LBRV (for performance reason) on write access to + * MSR_IA32_DEBUGCTLMSR when guest_state_protected is not set. + */ + svm_enable_lbrv(vcpu); + } + + return 0; +} + +static int snp_launch_finish(struct kvm *kvm, struct kvm_sev_cmd *argp) +{ + struct kvm_sev_info *sev = to_kvm_sev_info(kvm); + struct kvm_sev_snp_launch_finish params; + struct sev_data_snp_launch_finish *data; + void *id_block = NULL, *id_auth = NULL; + int ret; + + if (!sev_snp_guest(kvm)) + return -ENOTTY; + + if (!sev->snp_context) + return -EINVAL; + + if (copy_from_user(¶ms, u64_to_user_ptr(argp->data), sizeof(params))) + return -EFAULT; + + if (params.flags) + return -EINVAL; + + /* Measure all vCPUs using LAUNCH_UPDATE before finalizing the launch flow. */ + ret = snp_launch_update_vmsa(kvm, argp); + if (ret) + return ret; + + data = kzalloc(sizeof(*data), GFP_KERNEL_ACCOUNT); + if (!data) + return -ENOMEM; + + if (params.id_block_en) { + id_block = psp_copy_user_blob(params.id_block_uaddr, KVM_SEV_SNP_ID_BLOCK_SIZE); + if (IS_ERR(id_block)) { + ret = PTR_ERR(id_block); + goto e_free; + } + + data->id_block_en = 1; + data->id_block_paddr = __sme_pa(id_block); + + id_auth = psp_copy_user_blob(params.id_auth_uaddr, KVM_SEV_SNP_ID_AUTH_SIZE); + if (IS_ERR(id_auth)) { + ret = PTR_ERR(id_auth); + goto e_free_id_block; + } + + data->id_auth_paddr = __sme_pa(id_auth); + + if (params.auth_key_en) + data->auth_key_en = 1; + } + + data->vcek_disabled = params.vcek_disabled; + + memcpy(data->host_data, params.host_data, KVM_SEV_SNP_FINISH_DATA_SIZE); + data->gctx_paddr = __psp_pa(sev->snp_context); + ret = sev_issue_cmd(kvm, SEV_CMD_SNP_LAUNCH_FINISH, data, &argp->error); + + /* + * Now that there will be no more SNP_LAUNCH_UPDATE ioctls, private pages + * can be given to the guest simply by marking the RMP entry as private. + * This can happen on first access and also with KVM_PRE_FAULT_MEMORY. + */ + if (!ret) + kvm->arch.pre_fault_allowed = true; + + kfree(id_auth); + +e_free_id_block: + kfree(id_block); + +e_free: + kfree(data); + return ret; } @@ -1854,6 +2499,15 @@ int sev_mem_enc_ioctl(struct kvm *kvm, void __user *argp) goto out; } + /* + * Once KVM_SEV_INIT2 initializes a KVM instance as an SNP guest, only + * allow the use of SNP-specific commands. + */ + if (sev_snp_guest(kvm) && sev_cmd.id < KVM_SEV_SNP_LAUNCH_START) { + r = -EPERM; + goto out; + } + switch (sev_cmd.id) { case KVM_SEV_ES_INIT: if (!sev_es_enabled) { @@ -1864,6 +2518,9 @@ int sev_mem_enc_ioctl(struct kvm *kvm, void __user *argp) case KVM_SEV_INIT: r = sev_guest_init(kvm, &sev_cmd); break; + case KVM_SEV_INIT2: + r = sev_guest_init2(kvm, &sev_cmd); + break; case KVM_SEV_LAUNCH_START: r = sev_launch_start(kvm, &sev_cmd); break; @@ -1915,6 +2572,15 @@ int sev_mem_enc_ioctl(struct kvm *kvm, void __user *argp) case KVM_SEV_RECEIVE_FINISH: r = sev_receive_finish(kvm, &sev_cmd); break; + case KVM_SEV_SNP_LAUNCH_START: + r = snp_launch_start(kvm, &sev_cmd); + break; + case KVM_SEV_SNP_LAUNCH_UPDATE: + r = snp_launch_update(kvm, &sev_cmd); + break; + case KVM_SEV_SNP_LAUNCH_FINISH: + r = snp_launch_finish(kvm, &sev_cmd); + break; default: r = -EINVAL; goto out; @@ -1931,7 +2597,7 @@ out: int sev_mem_enc_register_region(struct kvm *kvm, struct kvm_enc_region *range) { - struct kvm_sev_info *sev = &to_kvm_svm(kvm)->sev_info; + struct kvm_sev_info *sev = to_kvm_sev_info(kvm); struct enc_region *region; int ret = 0; @@ -1950,27 +2616,30 @@ int sev_mem_enc_register_region(struct kvm *kvm, return -ENOMEM; mutex_lock(&kvm->lock); - region->pages = sev_pin_memory(kvm, range->addr, range->size, ®ion->npages, 1); + region->pages = sev_pin_memory(kvm, range->addr, range->size, ®ion->npages, + FOLL_WRITE | FOLL_LONGTERM); if (IS_ERR(region->pages)) { ret = PTR_ERR(region->pages); mutex_unlock(&kvm->lock); goto e_free; } - region->uaddr = range->addr; - region->size = range->size; - - list_add_tail(®ion->list, &sev->regions_list); - mutex_unlock(&kvm->lock); - /* * The guest may change the memory encryption attribute from C=0 -> C=1 * or vice versa for this memory range. Lets make sure caches are * flushed to ensure that guest data gets written into memory with - * correct C-bit. + * correct C-bit. Note, this must be done before dropping kvm->lock, + * as region and its array of pages can be freed by a different task + * once kvm->lock is released. */ sev_clflush_pages(region->pages, region->npages); + region->uaddr = range->addr; + region->size = range->size; + + list_add_tail(®ion->list, &sev->regions_list); + mutex_unlock(&kvm->lock); + return ret; e_free: @@ -1981,7 +2650,7 @@ e_free: static struct enc_region * find_enc_region(struct kvm *kvm, struct kvm_enc_region *range) { - struct kvm_sev_info *sev = &to_kvm_svm(kvm)->sev_info; + struct kvm_sev_info *sev = to_kvm_sev_info(kvm); struct list_head *head = &sev->regions_list; struct enc_region *i; @@ -2044,21 +2713,21 @@ failed: int sev_vm_copy_enc_context_from(struct kvm *kvm, unsigned int source_fd) { - struct file *source_kvm_file; + CLASS(fd, f)(source_fd); struct kvm *source_kvm; struct kvm_sev_info *source_sev, *mirror_sev; int ret; - source_kvm_file = fget(source_fd); - if (!file_is_kvm(source_kvm_file)) { - ret = -EBADF; - goto e_source_fput; - } + if (fd_empty(f)) + return -EBADF; - source_kvm = source_kvm_file->private_data; + if (!file_is_kvm(fd_file(f))) + return -EBADF; + + source_kvm = fd_file(f)->private_data; ret = sev_lock_two_vms(kvm, source_kvm); if (ret) - goto e_source_fput; + return ret; /* * Mirrors of mirrors should work, but let's not get silly. Also @@ -2076,9 +2745,9 @@ int sev_vm_copy_enc_context_from(struct kvm *kvm, unsigned int source_fd) * The mirror kvm holds an enc_context_owner ref so its asid can't * disappear until we're done with it */ - source_sev = &to_kvm_svm(source_kvm)->sev_info; + source_sev = to_kvm_sev_info(source_kvm); kvm_get_kvm(source_kvm); - mirror_sev = &to_kvm_svm(kvm)->sev_info; + mirror_sev = to_kvm_sev_info(kvm); list_add_tail(&mirror_sev->mirror_entry, &source_sev->mirror_vms); /* Set enc_context_owner and copy its encryption context over */ @@ -2087,6 +2756,7 @@ int sev_vm_copy_enc_context_from(struct kvm *kvm, unsigned int source_fd) mirror_sev->asid = source_sev->asid; mirror_sev->fd = source_sev->fd; mirror_sev->es_active = source_sev->es_active; + mirror_sev->need_init = false; mirror_sev->handle = source_sev->handle; INIT_LIST_HEAD(&mirror_sev->regions_list); INIT_LIST_HEAD(&mirror_sev->mirror_vms); @@ -2100,15 +2770,37 @@ int sev_vm_copy_enc_context_from(struct kvm *kvm, unsigned int source_fd) e_unlock: sev_unlock_two_vms(kvm, source_kvm); -e_source_fput: - if (source_kvm_file) - fput(source_kvm_file); return ret; } +static int snp_decommission_context(struct kvm *kvm) +{ + struct kvm_sev_info *sev = to_kvm_sev_info(kvm); + struct sev_data_snp_addr data = {}; + int ret; + + /* If context is not created then do nothing */ + if (!sev->snp_context) + return 0; + + /* Do the decommision, which will unbind the ASID from the SNP context */ + data.address = __sme_pa(sev->snp_context); + down_write(&sev_deactivate_lock); + ret = sev_do_cmd(SEV_CMD_SNP_DECOMMISSION, &data, NULL); + up_write(&sev_deactivate_lock); + + if (WARN_ONCE(ret, "Failed to release guest context, ret %d", ret)) + return ret; + + snp_free_firmware_page(sev->snp_context); + sev->snp_context = NULL; + + return 0; +} + void sev_vm_destroy(struct kvm *kvm) { - struct kvm_sev_info *sev = &to_kvm_svm(kvm)->sev_info; + struct kvm_sev_info *sev = to_kvm_sev_info(kvm); struct list_head *head = &sev->regions_list; struct list_head *pos, *q; @@ -2147,35 +2839,96 @@ void sev_vm_destroy(struct kvm *kvm) } } - sev_unbind_asid(kvm, sev->handle); + if (sev_snp_guest(kvm)) { + snp_guest_req_cleanup(kvm); + + /* + * Decomission handles unbinding of the ASID. If it fails for + * some unexpected reason, just leak the ASID. + */ + if (snp_decommission_context(kvm)) + return; + } else { + sev_unbind_asid(kvm, sev->handle); + } + sev_asid_free(sev); } void __init sev_set_cpu_caps(void) { - if (!sev_enabled) - kvm_cpu_cap_clear(X86_FEATURE_SEV); - if (!sev_es_enabled) - kvm_cpu_cap_clear(X86_FEATURE_SEV_ES); + if (sev_enabled) { + kvm_cpu_cap_set(X86_FEATURE_SEV); + kvm_caps.supported_vm_types |= BIT(KVM_X86_SEV_VM); + } + if (sev_es_enabled) { + kvm_cpu_cap_set(X86_FEATURE_SEV_ES); + kvm_caps.supported_vm_types |= BIT(KVM_X86_SEV_ES_VM); + } + if (sev_snp_enabled) { + kvm_cpu_cap_set(X86_FEATURE_SEV_SNP); + kvm_caps.supported_vm_types |= BIT(KVM_X86_SNP_VM); + } +} + +static bool is_sev_snp_initialized(void) +{ + struct sev_user_data_snp_status *status; + struct sev_data_snp_addr buf; + bool initialized = false; + int ret, error = 0; + + status = snp_alloc_firmware_page(GFP_KERNEL | __GFP_ZERO); + if (!status) + return false; + + buf.address = __psp_pa(status); + ret = sev_do_cmd(SEV_CMD_SNP_PLATFORM_STATUS, &buf, &error); + if (ret) { + pr_err("SEV: SNP_PLATFORM_STATUS failed ret=%d, fw_error=%d (%#x)\n", + ret, error, error); + goto out; + } + + initialized = !!status->state; + +out: + snp_free_firmware_page(status); + + return initialized; } void __init sev_hardware_setup(void) { -#ifdef CONFIG_KVM_AMD_SEV unsigned int eax, ebx, ecx, edx, sev_asid_count, sev_es_asid_count; + struct sev_platform_init_args init_args = {0}; + bool sev_snp_supported = false; bool sev_es_supported = false; bool sev_supported = false; - if (!sev_enabled || !npt_enabled) + if (!sev_enabled || !npt_enabled || !nrips) goto out; /* * SEV must obviously be supported in hardware. Sanity check that the * CPU supports decode assists, which is mandatory for SEV guests to - * support instruction emulation. + * support instruction emulation. Ditto for flushing by ASID, as SEV + * guests are bound to a single ASID, i.e. KVM can't rotate to a new + * ASID to effect a TLB flush. */ if (!boot_cpu_has(X86_FEATURE_SEV) || - WARN_ON_ONCE(!boot_cpu_has(X86_FEATURE_DECODEASSISTS))) + WARN_ON_ONCE(!boot_cpu_has(X86_FEATURE_DECODEASSISTS)) || + WARN_ON_ONCE(!boot_cpu_has(X86_FEATURE_FLUSHBYASID))) + goto out; + + /* + * The kernel's initcall infrastructure lacks the ability to express + * dependencies between initcalls, whereas the modules infrastructure + * automatically handles dependencies via symbol loading. Ensure the + * PSP SEV driver is initialized before proceeding if KVM is built-in, + * as the dependency isn't handled by the initcall infrastructure. + */ + if (IS_BUILTIN(CONFIG_KVM_AMD) && sev_module_init()) goto out; /* Retrieve SEV CPUID information */ @@ -2210,36 +2963,79 @@ void __init sev_hardware_setup(void) goto out; } - sev_asid_count = max_sev_asid - min_sev_asid + 1; - if (misc_cg_set_capacity(MISC_CG_RES_SEV, sev_asid_count)) - goto out; - - pr_info("SEV supported: %u ASIDs\n", sev_asid_count); + if (min_sev_asid <= max_sev_asid) { + sev_asid_count = max_sev_asid - min_sev_asid + 1; + WARN_ON_ONCE(misc_cg_set_capacity(MISC_CG_RES_SEV, sev_asid_count)); + } sev_supported = true; /* SEV-ES support requested? */ if (!sev_es_enabled) goto out; + /* + * SEV-ES requires MMIO caching as KVM doesn't have access to the guest + * instruction stream, i.e. can't emulate in response to a #NPF and + * instead relies on #NPF(RSVD) being reflected into the guest as #VC + * (the guest can then do a #VMGEXIT to request MMIO emulation). + */ + if (!enable_mmio_caching) + goto out; + /* Does the CPU support SEV-ES? */ if (!boot_cpu_has(X86_FEATURE_SEV_ES)) goto out; + if (!lbrv) { + WARN_ONCE(!boot_cpu_has(X86_FEATURE_LBRV), + "LBRV must be present for SEV-ES support"); + goto out; + } + /* Has the system been allocated ASIDs for SEV-ES? */ if (min_sev_asid == 1) goto out; sev_es_asid_count = min_sev_asid - 1; - if (misc_cg_set_capacity(MISC_CG_RES_SEV_ES, sev_es_asid_count)) - goto out; - - pr_info("SEV-ES supported: %u ASIDs\n", sev_es_asid_count); + WARN_ON_ONCE(misc_cg_set_capacity(MISC_CG_RES_SEV_ES, sev_es_asid_count)); sev_es_supported = true; + sev_snp_supported = sev_snp_enabled && cc_platform_has(CC_ATTR_HOST_SEV_SNP); out: + if (sev_enabled) { + init_args.probe = true; + if (sev_platform_init(&init_args)) + sev_supported = sev_es_supported = sev_snp_supported = false; + else if (sev_snp_supported) + sev_snp_supported = is_sev_snp_initialized(); + } + + if (boot_cpu_has(X86_FEATURE_SEV)) + pr_info("SEV %s (ASIDs %u - %u)\n", + sev_supported ? min_sev_asid <= max_sev_asid ? "enabled" : + "unusable" : + "disabled", + min_sev_asid, max_sev_asid); + if (boot_cpu_has(X86_FEATURE_SEV_ES)) + pr_info("SEV-ES %s (ASIDs %u - %u)\n", + str_enabled_disabled(sev_es_supported), + min_sev_asid > 1 ? 1 : 0, min_sev_asid - 1); + if (boot_cpu_has(X86_FEATURE_SEV_SNP)) + pr_info("SEV-SNP %s (ASIDs %u - %u)\n", + str_enabled_disabled(sev_snp_supported), + min_sev_asid > 1 ? 1 : 0, min_sev_asid - 1); + sev_enabled = sev_supported; sev_es_enabled = sev_es_supported; -#endif + sev_snp_enabled = sev_snp_supported; + + if (!sev_es_enabled || !cpu_feature_enabled(X86_FEATURE_DEBUG_SWAP) || + !cpu_feature_enabled(X86_FEATURE_NO_NESTED_DATA_BP)) + sev_es_debug_swap_enabled = false; + + sev_supported_vmsa_features = 0; + if (sev_es_debug_swap_enabled) + sev_supported_vmsa_features |= SVM_SEV_FEAT_DEBUG_SWAP; } void sev_hardware_unsetup(void) @@ -2255,6 +3051,8 @@ void sev_hardware_unsetup(void) misc_cg_set_capacity(MISC_CG_RES_SEV, 0); misc_cg_set_capacity(MISC_CG_RES_SEV_ES, 0); + + sev_platform_shutdown(); } int sev_cpu_init(struct svm_cpu_data *sd) @@ -2275,7 +3073,7 @@ int sev_cpu_init(struct svm_cpu_data *sd) */ static void sev_flush_encrypted_page(struct kvm_vcpu *vcpu, void *va) { - int asid = to_kvm_svm(vcpu->kvm)->sev_info.asid; + unsigned int asid = sev_get_asid(vcpu->kvm); /* * Note! The address must be a kernel address, as regular page walk @@ -2300,7 +3098,7 @@ static void sev_flush_encrypted_page(struct kvm_vcpu *vcpu, void *va) * back to WBINVD if this faults so as not to make any problems worse * by leaving stale encrypted data in the cache. */ - if (WARN_ON_ONCE(wrmsrl_safe(MSR_AMD64_VM_PAGE_FLUSH, addr | asid))) + if (WARN_ON_ONCE(wrmsrq_safe(MSR_AMD64_VM_PAGE_FLUSH, addr | asid))) goto do_wbinvd; return; @@ -2311,7 +3109,13 @@ do_wbinvd: void sev_guest_memory_reclaimed(struct kvm *kvm) { - if (!sev_guest(kvm)) + /* + * With SNP+gmem, private/encrypted memory is unreachable via the + * hva-based mmu notifiers, so these events are only actually + * pertaining to shared pages where there is no need to perform + * the WBINVD to flush associated caches. + */ + if (!sev_guest(kvm) || sev_snp_guest(kvm)) return; wbinvd_on_all_cpus(); @@ -2326,18 +3130,36 @@ void sev_free_vcpu(struct kvm_vcpu *vcpu) svm = to_svm(vcpu); + /* + * If it's an SNP guest, then the VMSA was marked in the RMP table as + * a guest-owned page. Transition the page to hypervisor state before + * releasing it back to the system. + */ + if (sev_snp_guest(vcpu->kvm)) { + u64 pfn = __pa(svm->sev_es.vmsa) >> PAGE_SHIFT; + + if (kvm_rmp_make_shared(vcpu->kvm, pfn, PG_LEVEL_4K)) + goto skip_vmsa_free; + } + if (vcpu->arch.guest_state_protected) sev_flush_encrypted_page(vcpu, svm->sev_es.vmsa); __free_page(virt_to_page(svm->sev_es.vmsa)); +skip_vmsa_free: if (svm->sev_es.ghcb_sa_free) kvfree(svm->sev_es.ghcb_sa); } +static u64 kvm_ghcb_get_sw_exit_code(struct vmcb_control_area *control) +{ + return (((u64)control->exit_code_hi) << 32) | control->exit_code; +} + static void dump_ghcb(struct vcpu_svm *svm) { - struct ghcb *ghcb = svm->sev_es.ghcb; + struct vmcb_control_area *control = &svm->vmcb->control; unsigned int nbits; /* Re-use the dump_invalid_vmcb module parameter */ @@ -2346,18 +3168,24 @@ static void dump_ghcb(struct vcpu_svm *svm) return; } - nbits = sizeof(ghcb->save.valid_bitmap) * 8; + nbits = sizeof(svm->sev_es.valid_bitmap) * 8; - pr_err("GHCB (GPA=%016llx):\n", svm->vmcb->control.ghcb_gpa); + /* + * Print KVM's snapshot of the GHCB values that were (unsuccessfully) + * used to handle the exit. If the guest has since modified the GHCB + * itself, dumping the raw GHCB won't help debug why KVM was unable to + * handle the VMGEXIT that KVM observed. + */ + pr_err("GHCB (GPA=%016llx) snapshot:\n", svm->vmcb->control.ghcb_gpa); pr_err("%-20s%016llx is_valid: %u\n", "sw_exit_code", - ghcb->save.sw_exit_code, ghcb_sw_exit_code_is_valid(ghcb)); + kvm_ghcb_get_sw_exit_code(control), kvm_ghcb_sw_exit_code_is_valid(svm)); pr_err("%-20s%016llx is_valid: %u\n", "sw_exit_info_1", - ghcb->save.sw_exit_info_1, ghcb_sw_exit_info_1_is_valid(ghcb)); + control->exit_info_1, kvm_ghcb_sw_exit_info_1_is_valid(svm)); pr_err("%-20s%016llx is_valid: %u\n", "sw_exit_info_2", - ghcb->save.sw_exit_info_2, ghcb_sw_exit_info_2_is_valid(ghcb)); + control->exit_info_2, kvm_ghcb_sw_exit_info_2_is_valid(svm)); pr_err("%-20s%016llx is_valid: %u\n", "sw_scratch", - ghcb->save.sw_scratch, ghcb_sw_scratch_is_valid(ghcb)); - pr_err("%-20s%*pb\n", "valid_bitmap", nbits, ghcb->save.valid_bitmap); + svm->sev_es.sw_scratch, kvm_ghcb_sw_scratch_is_valid(svm)); + pr_err("%-20s%*pb\n", "valid_bitmap", nbits, svm->sev_es.valid_bitmap); } static void sev_es_sync_to_ghcb(struct vcpu_svm *svm) @@ -2400,17 +3228,20 @@ static void sev_es_sync_from_ghcb(struct vcpu_svm *svm) */ memset(vcpu->arch.regs, 0, sizeof(vcpu->arch.regs)); - vcpu->arch.regs[VCPU_REGS_RAX] = ghcb_get_rax_if_valid(ghcb); - vcpu->arch.regs[VCPU_REGS_RBX] = ghcb_get_rbx_if_valid(ghcb); - vcpu->arch.regs[VCPU_REGS_RCX] = ghcb_get_rcx_if_valid(ghcb); - vcpu->arch.regs[VCPU_REGS_RDX] = ghcb_get_rdx_if_valid(ghcb); - vcpu->arch.regs[VCPU_REGS_RSI] = ghcb_get_rsi_if_valid(ghcb); + BUILD_BUG_ON(sizeof(svm->sev_es.valid_bitmap) != sizeof(ghcb->save.valid_bitmap)); + memcpy(&svm->sev_es.valid_bitmap, &ghcb->save.valid_bitmap, sizeof(ghcb->save.valid_bitmap)); - svm->vmcb->save.cpl = ghcb_get_cpl_if_valid(ghcb); + vcpu->arch.regs[VCPU_REGS_RAX] = kvm_ghcb_get_rax_if_valid(svm, ghcb); + vcpu->arch.regs[VCPU_REGS_RBX] = kvm_ghcb_get_rbx_if_valid(svm, ghcb); + vcpu->arch.regs[VCPU_REGS_RCX] = kvm_ghcb_get_rcx_if_valid(svm, ghcb); + vcpu->arch.regs[VCPU_REGS_RDX] = kvm_ghcb_get_rdx_if_valid(svm, ghcb); + vcpu->arch.regs[VCPU_REGS_RSI] = kvm_ghcb_get_rsi_if_valid(svm, ghcb); - if (ghcb_xcr0_is_valid(ghcb)) { + svm->vmcb->save.cpl = kvm_ghcb_get_cpl_if_valid(svm, ghcb); + + if (kvm_ghcb_xcr0_is_valid(svm)) { vcpu->arch.xcr0 = ghcb_get_xcr0(ghcb); - kvm_update_cpuid_runtime(vcpu); + vcpu->arch.cpuid_dynamic_bits_dirty = true; } /* Copy the GHCB exit information into the VMCB fields */ @@ -2419,6 +3250,7 @@ static void sev_es_sync_from_ghcb(struct vcpu_svm *svm) control->exit_code_hi = upper_32_bits(exit_code); control->exit_info_1 = ghcb_get_sw_exit_info_1(ghcb); control->exit_info_2 = ghcb_get_sw_exit_info_2(ghcb); + svm->sev_es.sw_scratch = kvm_ghcb_get_sw_scratch_if_valid(svm, ghcb); /* Clear the valid entries fields */ memset(ghcb->save.valid_bitmap, 0, sizeof(ghcb->save.valid_bitmap)); @@ -2426,77 +3258,75 @@ static void sev_es_sync_from_ghcb(struct vcpu_svm *svm) static int sev_es_validate_vmgexit(struct vcpu_svm *svm) { - struct kvm_vcpu *vcpu; - struct ghcb *ghcb; + struct vmcb_control_area *control = &svm->vmcb->control; + struct kvm_vcpu *vcpu = &svm->vcpu; u64 exit_code; u64 reason; - ghcb = svm->sev_es.ghcb; - /* * Retrieve the exit code now even though it may not be marked valid * as it could help with debugging. */ - exit_code = ghcb_get_sw_exit_code(ghcb); + exit_code = kvm_ghcb_get_sw_exit_code(control); /* Only GHCB Usage code 0 is supported */ - if (ghcb->ghcb_usage) { + if (svm->sev_es.ghcb->ghcb_usage) { reason = GHCB_ERR_INVALID_USAGE; goto vmgexit_err; } reason = GHCB_ERR_MISSING_INPUT; - if (!ghcb_sw_exit_code_is_valid(ghcb) || - !ghcb_sw_exit_info_1_is_valid(ghcb) || - !ghcb_sw_exit_info_2_is_valid(ghcb)) + if (!kvm_ghcb_sw_exit_code_is_valid(svm) || + !kvm_ghcb_sw_exit_info_1_is_valid(svm) || + !kvm_ghcb_sw_exit_info_2_is_valid(svm)) goto vmgexit_err; - switch (ghcb_get_sw_exit_code(ghcb)) { + switch (exit_code) { case SVM_EXIT_READ_DR7: break; case SVM_EXIT_WRITE_DR7: - if (!ghcb_rax_is_valid(ghcb)) + if (!kvm_ghcb_rax_is_valid(svm)) goto vmgexit_err; break; case SVM_EXIT_RDTSC: break; case SVM_EXIT_RDPMC: - if (!ghcb_rcx_is_valid(ghcb)) + if (!kvm_ghcb_rcx_is_valid(svm)) goto vmgexit_err; break; case SVM_EXIT_CPUID: - if (!ghcb_rax_is_valid(ghcb) || - !ghcb_rcx_is_valid(ghcb)) + if (!kvm_ghcb_rax_is_valid(svm) || + !kvm_ghcb_rcx_is_valid(svm)) goto vmgexit_err; - if (ghcb_get_rax(ghcb) == 0xd) - if (!ghcb_xcr0_is_valid(ghcb)) + if (vcpu->arch.regs[VCPU_REGS_RAX] == 0xd) + if (!kvm_ghcb_xcr0_is_valid(svm)) goto vmgexit_err; break; case SVM_EXIT_INVD: break; case SVM_EXIT_IOIO: - if (ghcb_get_sw_exit_info_1(ghcb) & SVM_IOIO_STR_MASK) { - if (!ghcb_sw_scratch_is_valid(ghcb)) + if (control->exit_info_1 & SVM_IOIO_STR_MASK) { + if (!kvm_ghcb_sw_scratch_is_valid(svm)) goto vmgexit_err; } else { - if (!(ghcb_get_sw_exit_info_1(ghcb) & SVM_IOIO_TYPE_MASK)) - if (!ghcb_rax_is_valid(ghcb)) + if (!(control->exit_info_1 & SVM_IOIO_TYPE_MASK)) + if (!kvm_ghcb_rax_is_valid(svm)) goto vmgexit_err; } break; case SVM_EXIT_MSR: - if (!ghcb_rcx_is_valid(ghcb)) + if (!kvm_ghcb_rcx_is_valid(svm)) goto vmgexit_err; - if (ghcb_get_sw_exit_info_1(ghcb)) { - if (!ghcb_rax_is_valid(ghcb) || - !ghcb_rdx_is_valid(ghcb)) + if (control->exit_info_1) { + if (!kvm_ghcb_rax_is_valid(svm) || + !kvm_ghcb_rdx_is_valid(svm)) goto vmgexit_err; } break; case SVM_EXIT_VMMCALL: - if (!ghcb_rax_is_valid(ghcb) || - !ghcb_cpl_is_valid(ghcb)) + if (!kvm_ghcb_rax_is_valid(svm) || + !kvm_ghcb_cpl_is_valid(svm)) goto vmgexit_err; break; case SVM_EXIT_RDTSCP: @@ -2504,25 +3334,46 @@ static int sev_es_validate_vmgexit(struct vcpu_svm *svm) case SVM_EXIT_WBINVD: break; case SVM_EXIT_MONITOR: - if (!ghcb_rax_is_valid(ghcb) || - !ghcb_rcx_is_valid(ghcb) || - !ghcb_rdx_is_valid(ghcb)) + if (!kvm_ghcb_rax_is_valid(svm) || + !kvm_ghcb_rcx_is_valid(svm) || + !kvm_ghcb_rdx_is_valid(svm)) goto vmgexit_err; break; case SVM_EXIT_MWAIT: - if (!ghcb_rax_is_valid(ghcb) || - !ghcb_rcx_is_valid(ghcb)) + if (!kvm_ghcb_rax_is_valid(svm) || + !kvm_ghcb_rcx_is_valid(svm)) goto vmgexit_err; break; case SVM_VMGEXIT_MMIO_READ: case SVM_VMGEXIT_MMIO_WRITE: - if (!ghcb_sw_scratch_is_valid(ghcb)) + if (!kvm_ghcb_sw_scratch_is_valid(svm)) goto vmgexit_err; break; + case SVM_VMGEXIT_AP_CREATION: + if (!sev_snp_guest(vcpu->kvm)) + goto vmgexit_err; + if (lower_32_bits(control->exit_info_1) != SVM_VMGEXIT_AP_DESTROY) + if (!kvm_ghcb_rax_is_valid(svm)) + goto vmgexit_err; + break; case SVM_VMGEXIT_NMI_COMPLETE: case SVM_VMGEXIT_AP_HLT_LOOP: case SVM_VMGEXIT_AP_JUMP_TABLE: case SVM_VMGEXIT_UNSUPPORTED_EVENT: + case SVM_VMGEXIT_HV_FEATURES: + case SVM_VMGEXIT_TERM_REQUEST: + break; + case SVM_VMGEXIT_PSC: + if (!sev_snp_guest(vcpu->kvm) || !kvm_ghcb_sw_scratch_is_valid(svm)) + goto vmgexit_err; + break; + case SVM_VMGEXIT_GUEST_REQUEST: + case SVM_VMGEXIT_EXT_GUEST_REQUEST: + if (!sev_snp_guest(vcpu->kvm) || + !PAGE_ALIGNED(control->exit_info_1) || + !PAGE_ALIGNED(control->exit_info_2) || + control->exit_info_1 == control->exit_info_2) + goto vmgexit_err; break; default: reason = GHCB_ERR_INVALID_EVENT; @@ -2532,11 +3383,9 @@ static int sev_es_validate_vmgexit(struct vcpu_svm *svm) return 0; vmgexit_err: - vcpu = &svm->vcpu; - if (reason == GHCB_ERR_INVALID_USAGE) { vcpu_unimpl(vcpu, "vmgexit: ghcb usage %#x is not valid\n", - ghcb->ghcb_usage); + svm->sev_es.ghcb->ghcb_usage); } else if (reason == GHCB_ERR_INVALID_EVENT) { vcpu_unimpl(vcpu, "vmgexit: exit code %#llx is not valid\n", exit_code); @@ -2546,11 +3395,7 @@ vmgexit_err: dump_ghcb(svm); } - /* Clear the valid entries fields */ - memset(ghcb->save.valid_bitmap, 0, sizeof(ghcb->save.valid_bitmap)); - - ghcb_set_sw_exit_info_1(ghcb, 2); - ghcb_set_sw_exit_info_2(ghcb, reason); + svm_vmgexit_bad_input(svm, reason); /* Resume the guest to "return" the error code. */ return 1; @@ -2558,6 +3403,9 @@ vmgexit_err: void sev_es_unmap_ghcb(struct vcpu_svm *svm) { + /* Clear any indication that the vCPU is in a type of AP Reset Hold */ + svm->sev_es.ap_reset_hold_type = AP_RESET_HOLD_NONE; + if (!svm->sev_es.ghcb) return; @@ -2569,7 +3417,7 @@ void sev_es_unmap_ghcb(struct vcpu_svm *svm) */ if (svm->sev_es.ghcb_sa_sync) { kvm_write_guest(svm->vcpu.kvm, - ghcb_get_sw_scratch(svm->sev_es.ghcb), + svm->sev_es.sw_scratch, svm->sev_es.ghcb_sa, svm->sev_es.ghcb_sa_len); svm->sev_es.ghcb_sa_sync = false; @@ -2584,14 +3432,23 @@ void sev_es_unmap_ghcb(struct vcpu_svm *svm) sev_es_sync_to_ghcb(svm); - kvm_vcpu_unmap(&svm->vcpu, &svm->sev_es.ghcb_map, true); + kvm_vcpu_unmap(&svm->vcpu, &svm->sev_es.ghcb_map); svm->sev_es.ghcb = NULL; } -void pre_sev_run(struct vcpu_svm *svm, int cpu) +int pre_sev_run(struct vcpu_svm *svm, int cpu) { - struct svm_cpu_data *sd = per_cpu(svm_data, cpu); - int asid = sev_get_asid(svm->vcpu.kvm); + struct svm_cpu_data *sd = per_cpu_ptr(&svm_data, cpu); + struct kvm *kvm = svm->vcpu.kvm; + unsigned int asid = sev_get_asid(kvm); + + /* + * Reject KVM_RUN if userspace attempts to run the vCPU with an invalid + * VMSA, e.g. if userspace forces the vCPU to be RUNNABLE after an SNP + * AP Destroy event. + */ + if (sev_es_guest(kvm) && !VALID_PAGE(svm->vmcb->control.vmsa_pa)) + return -EINVAL; /* Assign the asid allocated with this SEV guest */ svm->asid = asid; @@ -2604,23 +3461,23 @@ void pre_sev_run(struct vcpu_svm *svm, int cpu) */ if (sd->sev_vmcbs[asid] == svm->vmcb && svm->vcpu.arch.last_vmentry_cpu == cpu) - return; + return 0; sd->sev_vmcbs[asid] = svm->vmcb; svm->vmcb->control.tlb_ctl = TLB_CONTROL_FLUSH_ASID; vmcb_mark_dirty(svm->vmcb, VMCB_ASID); + return 0; } #define GHCB_SCRATCH_AREA_LIMIT (16ULL * PAGE_SIZE) static int setup_vmgexit_scratch(struct vcpu_svm *svm, bool sync, u64 len) { struct vmcb_control_area *control = &svm->vmcb->control; - struct ghcb *ghcb = svm->sev_es.ghcb; u64 ghcb_scratch_beg, ghcb_scratch_end; u64 scratch_gpa_beg, scratch_gpa_end; void *scratch_va; - scratch_gpa_beg = ghcb_get_sw_scratch(ghcb); + scratch_gpa_beg = svm->sev_es.sw_scratch; if (!scratch_gpa_beg) { pr_err("vmgexit: scratch gpa not provided\n"); goto e_scratch; @@ -2638,7 +3495,7 @@ static int setup_vmgexit_scratch(struct vcpu_svm *svm, bool sync, u64 len) ghcb_scratch_beg = control->ghcb_gpa + offsetof(struct ghcb, shared_buffer); ghcb_scratch_end = control->ghcb_gpa + - offsetof(struct ghcb, reserved_1); + offsetof(struct ghcb, reserved_0xff0); /* * If the scratch area begins within the GHCB, it must be @@ -2691,8 +3548,7 @@ static int setup_vmgexit_scratch(struct vcpu_svm *svm, bool sync, u64 len) return 0; e_scratch: - ghcb_set_sw_exit_info_1(ghcb, 2); - ghcb_set_sw_exit_info_2(ghcb, GHCB_ERR_INVALID_SCRATCH_AREA); + svm_vmgexit_bad_input(svm, GHCB_ERR_INVALID_SCRATCH_AREA); return 1; } @@ -2714,10 +3570,517 @@ static void set_ghcb_msr(struct vcpu_svm *svm, u64 value) svm->vmcb->control.ghcb_gpa = value; } +static int snp_rmptable_psmash(kvm_pfn_t pfn) +{ + int ret; + + pfn = pfn & ~(KVM_PAGES_PER_HPAGE(PG_LEVEL_2M) - 1); + + /* + * PSMASH_FAIL_INUSE indicates another processor is modifying the + * entry, so retry until that's no longer the case. + */ + do { + ret = psmash(pfn); + } while (ret == PSMASH_FAIL_INUSE); + + return ret; +} + +static int snp_complete_psc_msr(struct kvm_vcpu *vcpu) +{ + struct vcpu_svm *svm = to_svm(vcpu); + + if (vcpu->run->hypercall.ret) + set_ghcb_msr(svm, GHCB_MSR_PSC_RESP_ERROR); + else + set_ghcb_msr(svm, GHCB_MSR_PSC_RESP); + + return 1; /* resume guest */ +} + +static int snp_begin_psc_msr(struct vcpu_svm *svm, u64 ghcb_msr) +{ + u64 gpa = gfn_to_gpa(GHCB_MSR_PSC_REQ_TO_GFN(ghcb_msr)); + u8 op = GHCB_MSR_PSC_REQ_TO_OP(ghcb_msr); + struct kvm_vcpu *vcpu = &svm->vcpu; + + if (op != SNP_PAGE_STATE_PRIVATE && op != SNP_PAGE_STATE_SHARED) { + set_ghcb_msr(svm, GHCB_MSR_PSC_RESP_ERROR); + return 1; /* resume guest */ + } + + if (!user_exit_on_hypercall(vcpu->kvm, KVM_HC_MAP_GPA_RANGE)) { + set_ghcb_msr(svm, GHCB_MSR_PSC_RESP_ERROR); + return 1; /* resume guest */ + } + + vcpu->run->exit_reason = KVM_EXIT_HYPERCALL; + vcpu->run->hypercall.nr = KVM_HC_MAP_GPA_RANGE; + /* + * In principle this should have been -KVM_ENOSYS, but userspace (QEMU <=9.2) + * assumed that vcpu->run->hypercall.ret is never changed by KVM and thus that + * it was always zero on KVM_EXIT_HYPERCALL. Since KVM is now overwriting + * vcpu->run->hypercall.ret, ensuring that it is zero to not break QEMU. + */ + vcpu->run->hypercall.ret = 0; + vcpu->run->hypercall.args[0] = gpa; + vcpu->run->hypercall.args[1] = 1; + vcpu->run->hypercall.args[2] = (op == SNP_PAGE_STATE_PRIVATE) + ? KVM_MAP_GPA_RANGE_ENCRYPTED + : KVM_MAP_GPA_RANGE_DECRYPTED; + vcpu->run->hypercall.args[2] |= KVM_MAP_GPA_RANGE_PAGE_SZ_4K; + + vcpu->arch.complete_userspace_io = snp_complete_psc_msr; + + return 0; /* forward request to userspace */ +} + +struct psc_buffer { + struct psc_hdr hdr; + struct psc_entry entries[]; +} __packed; + +static int snp_begin_psc(struct vcpu_svm *svm, struct psc_buffer *psc); + +static void snp_complete_psc(struct vcpu_svm *svm, u64 psc_ret) +{ + svm->sev_es.psc_inflight = 0; + svm->sev_es.psc_idx = 0; + svm->sev_es.psc_2m = false; + + /* + * PSC requests always get a "no action" response in SW_EXITINFO1, with + * a PSC-specific return code in SW_EXITINFO2 that provides the "real" + * return code. E.g. if the PSC request was interrupted, the need to + * retry is communicated via SW_EXITINFO2, not SW_EXITINFO1. + */ + svm_vmgexit_no_action(svm, psc_ret); +} + +static void __snp_complete_one_psc(struct vcpu_svm *svm) +{ + struct psc_buffer *psc = svm->sev_es.ghcb_sa; + struct psc_entry *entries = psc->entries; + struct psc_hdr *hdr = &psc->hdr; + __u16 idx; + + /* + * Everything in-flight has been processed successfully. Update the + * corresponding entries in the guest's PSC buffer and zero out the + * count of in-flight PSC entries. + */ + for (idx = svm->sev_es.psc_idx; svm->sev_es.psc_inflight; + svm->sev_es.psc_inflight--, idx++) { + struct psc_entry *entry = &entries[idx]; + + entry->cur_page = entry->pagesize ? 512 : 1; + } + + hdr->cur_entry = idx; +} + +static int snp_complete_one_psc(struct kvm_vcpu *vcpu) +{ + struct vcpu_svm *svm = to_svm(vcpu); + struct psc_buffer *psc = svm->sev_es.ghcb_sa; + + if (vcpu->run->hypercall.ret) { + snp_complete_psc(svm, VMGEXIT_PSC_ERROR_GENERIC); + return 1; /* resume guest */ + } + + __snp_complete_one_psc(svm); + + /* Handle the next range (if any). */ + return snp_begin_psc(svm, psc); +} + +static int snp_begin_psc(struct vcpu_svm *svm, struct psc_buffer *psc) +{ + struct psc_entry *entries = psc->entries; + struct kvm_vcpu *vcpu = &svm->vcpu; + struct psc_hdr *hdr = &psc->hdr; + struct psc_entry entry_start; + u16 idx, idx_start, idx_end; + int npages; + bool huge; + u64 gfn; + + if (!user_exit_on_hypercall(vcpu->kvm, KVM_HC_MAP_GPA_RANGE)) { + snp_complete_psc(svm, VMGEXIT_PSC_ERROR_GENERIC); + return 1; + } + +next_range: + /* There should be no other PSCs in-flight at this point. */ + if (WARN_ON_ONCE(svm->sev_es.psc_inflight)) { + snp_complete_psc(svm, VMGEXIT_PSC_ERROR_GENERIC); + return 1; + } + + /* + * The PSC descriptor buffer can be modified by a misbehaved guest after + * validation, so take care to only use validated copies of values used + * for things like array indexing. + */ + idx_start = hdr->cur_entry; + idx_end = hdr->end_entry; + + if (idx_end >= VMGEXIT_PSC_MAX_COUNT) { + snp_complete_psc(svm, VMGEXIT_PSC_ERROR_INVALID_HDR); + return 1; + } + + /* Find the start of the next range which needs processing. */ + for (idx = idx_start; idx <= idx_end; idx++, hdr->cur_entry++) { + entry_start = entries[idx]; + + gfn = entry_start.gfn; + huge = entry_start.pagesize; + npages = huge ? 512 : 1; + + if (entry_start.cur_page > npages || !IS_ALIGNED(gfn, npages)) { + snp_complete_psc(svm, VMGEXIT_PSC_ERROR_INVALID_ENTRY); + return 1; + } + + if (entry_start.cur_page) { + /* + * If this is a partially-completed 2M range, force 4K handling + * for the remaining pages since they're effectively split at + * this point. Subsequent code should ensure this doesn't get + * combined with adjacent PSC entries where 2M handling is still + * possible. + */ + npages -= entry_start.cur_page; + gfn += entry_start.cur_page; + huge = false; + } + + if (npages) + break; + } + + if (idx > idx_end) { + /* Nothing more to process. */ + snp_complete_psc(svm, 0); + return 1; + } + + svm->sev_es.psc_2m = huge; + svm->sev_es.psc_idx = idx; + svm->sev_es.psc_inflight = 1; + + /* + * Find all subsequent PSC entries that contain adjacent GPA + * ranges/operations and can be combined into a single + * KVM_HC_MAP_GPA_RANGE exit. + */ + while (++idx <= idx_end) { + struct psc_entry entry = entries[idx]; + + if (entry.operation != entry_start.operation || + entry.gfn != entry_start.gfn + npages || + entry.cur_page || !!entry.pagesize != huge) + break; + + svm->sev_es.psc_inflight++; + npages += huge ? 512 : 1; + } + + switch (entry_start.operation) { + case VMGEXIT_PSC_OP_PRIVATE: + case VMGEXIT_PSC_OP_SHARED: + vcpu->run->exit_reason = KVM_EXIT_HYPERCALL; + vcpu->run->hypercall.nr = KVM_HC_MAP_GPA_RANGE; + /* + * In principle this should have been -KVM_ENOSYS, but userspace (QEMU <=9.2) + * assumed that vcpu->run->hypercall.ret is never changed by KVM and thus that + * it was always zero on KVM_EXIT_HYPERCALL. Since KVM is now overwriting + * vcpu->run->hypercall.ret, ensuring that it is zero to not break QEMU. + */ + vcpu->run->hypercall.ret = 0; + vcpu->run->hypercall.args[0] = gfn_to_gpa(gfn); + vcpu->run->hypercall.args[1] = npages; + vcpu->run->hypercall.args[2] = entry_start.operation == VMGEXIT_PSC_OP_PRIVATE + ? KVM_MAP_GPA_RANGE_ENCRYPTED + : KVM_MAP_GPA_RANGE_DECRYPTED; + vcpu->run->hypercall.args[2] |= entry_start.pagesize + ? KVM_MAP_GPA_RANGE_PAGE_SZ_2M + : KVM_MAP_GPA_RANGE_PAGE_SZ_4K; + vcpu->arch.complete_userspace_io = snp_complete_one_psc; + return 0; /* forward request to userspace */ + default: + /* + * Only shared/private PSC operations are currently supported, so if the + * entire range consists of unsupported operations (e.g. SMASH/UNSMASH), + * then consider the entire range completed and avoid exiting to + * userspace. In theory snp_complete_psc() can always be called directly + * at this point to complete the current range and start the next one, + * but that could lead to unexpected levels of recursion. + */ + __snp_complete_one_psc(svm); + goto next_range; + } + + BUG(); +} + +/* + * Invoked as part of svm_vcpu_reset() processing of an init event. + */ +void sev_snp_init_protected_guest_state(struct kvm_vcpu *vcpu) +{ + struct vcpu_svm *svm = to_svm(vcpu); + struct kvm_memory_slot *slot; + struct page *page; + kvm_pfn_t pfn; + gfn_t gfn; + + if (!sev_snp_guest(vcpu->kvm)) + return; + + guard(mutex)(&svm->sev_es.snp_vmsa_mutex); + + if (!svm->sev_es.snp_ap_waiting_for_reset) + return; + + svm->sev_es.snp_ap_waiting_for_reset = false; + + /* Mark the vCPU as offline and not runnable */ + vcpu->arch.pv.pv_unhalted = false; + kvm_set_mp_state(vcpu, KVM_MP_STATE_HALTED); + + /* Clear use of the VMSA */ + svm->vmcb->control.vmsa_pa = INVALID_PAGE; + + /* + * When replacing the VMSA during SEV-SNP AP creation, + * mark the VMCB dirty so that full state is always reloaded. + */ + vmcb_mark_all_dirty(svm->vmcb); + + if (!VALID_PAGE(svm->sev_es.snp_vmsa_gpa)) + return; + + gfn = gpa_to_gfn(svm->sev_es.snp_vmsa_gpa); + svm->sev_es.snp_vmsa_gpa = INVALID_PAGE; + + slot = gfn_to_memslot(vcpu->kvm, gfn); + if (!slot) + return; + + /* + * The new VMSA will be private memory guest memory, so retrieve the + * PFN from the gmem backend. + */ + if (kvm_gmem_get_pfn(vcpu->kvm, slot, gfn, &pfn, &page, NULL)) + return; + + /* + * From this point forward, the VMSA will always be a guest-mapped page + * rather than the initial one allocated by KVM in svm->sev_es.vmsa. In + * theory, svm->sev_es.vmsa could be free'd and cleaned up here, but + * that involves cleanups like wbinvd_on_all_cpus() which would ideally + * be handled during teardown rather than guest boot. Deferring that + * also allows the existing logic for SEV-ES VMSAs to be re-used with + * minimal SNP-specific changes. + */ + svm->sev_es.snp_has_guest_vmsa = true; + + /* Use the new VMSA */ + svm->vmcb->control.vmsa_pa = pfn_to_hpa(pfn); + + /* Mark the vCPU as runnable */ + kvm_set_mp_state(vcpu, KVM_MP_STATE_RUNNABLE); + + /* + * gmem pages aren't currently migratable, but if this ever changes + * then care should be taken to ensure svm->sev_es.vmsa is pinned + * through some other means. + */ + kvm_release_page_clean(page); +} + +static int sev_snp_ap_creation(struct vcpu_svm *svm) +{ + struct kvm_sev_info *sev = to_kvm_sev_info(svm->vcpu.kvm); + struct kvm_vcpu *vcpu = &svm->vcpu; + struct kvm_vcpu *target_vcpu; + struct vcpu_svm *target_svm; + unsigned int request; + unsigned int apic_id; + + request = lower_32_bits(svm->vmcb->control.exit_info_1); + apic_id = upper_32_bits(svm->vmcb->control.exit_info_1); + + /* Validate the APIC ID */ + target_vcpu = kvm_get_vcpu_by_id(vcpu->kvm, apic_id); + if (!target_vcpu) { + vcpu_unimpl(vcpu, "vmgexit: invalid AP APIC ID [%#x] from guest\n", + apic_id); + return -EINVAL; + } + + target_svm = to_svm(target_vcpu); + + guard(mutex)(&target_svm->sev_es.snp_vmsa_mutex); + + switch (request) { + case SVM_VMGEXIT_AP_CREATE_ON_INIT: + case SVM_VMGEXIT_AP_CREATE: + if (vcpu->arch.regs[VCPU_REGS_RAX] != sev->vmsa_features) { + vcpu_unimpl(vcpu, "vmgexit: mismatched AP sev_features [%#lx] != [%#llx] from guest\n", + vcpu->arch.regs[VCPU_REGS_RAX], sev->vmsa_features); + return -EINVAL; + } + + if (!page_address_valid(vcpu, svm->vmcb->control.exit_info_2)) { + vcpu_unimpl(vcpu, "vmgexit: invalid AP VMSA address [%#llx] from guest\n", + svm->vmcb->control.exit_info_2); + return -EINVAL; + } + + /* + * Malicious guest can RMPADJUST a large page into VMSA which + * will hit the SNP erratum where the CPU will incorrectly signal + * an RMP violation #PF if a hugepage collides with the RMP entry + * of VMSA page, reject the AP CREATE request if VMSA address from + * guest is 2M aligned. + */ + if (IS_ALIGNED(svm->vmcb->control.exit_info_2, PMD_SIZE)) { + vcpu_unimpl(vcpu, + "vmgexit: AP VMSA address [%llx] from guest is unsafe as it is 2M aligned\n", + svm->vmcb->control.exit_info_2); + return -EINVAL; + } + + target_svm->sev_es.snp_vmsa_gpa = svm->vmcb->control.exit_info_2; + break; + case SVM_VMGEXIT_AP_DESTROY: + target_svm->sev_es.snp_vmsa_gpa = INVALID_PAGE; + break; + default: + vcpu_unimpl(vcpu, "vmgexit: invalid AP creation request [%#x] from guest\n", + request); + return -EINVAL; + } + + target_svm->sev_es.snp_ap_waiting_for_reset = true; + + /* + * Unless Creation is deferred until INIT, signal the vCPU to update + * its state. + */ + if (request != SVM_VMGEXIT_AP_CREATE_ON_INIT) + kvm_make_request_and_kick(KVM_REQ_UPDATE_PROTECTED_GUEST_STATE, target_vcpu); + + return 0; +} + +static int snp_handle_guest_req(struct vcpu_svm *svm, gpa_t req_gpa, gpa_t resp_gpa) +{ + struct sev_data_snp_guest_request data = {0}; + struct kvm *kvm = svm->vcpu.kvm; + struct kvm_sev_info *sev = to_kvm_sev_info(kvm); + sev_ret_code fw_err = 0; + int ret; + + if (!sev_snp_guest(kvm)) + return -EINVAL; + + mutex_lock(&sev->guest_req_mutex); + + if (kvm_read_guest(kvm, req_gpa, sev->guest_req_buf, PAGE_SIZE)) { + ret = -EIO; + goto out_unlock; + } + + data.gctx_paddr = __psp_pa(sev->snp_context); + data.req_paddr = __psp_pa(sev->guest_req_buf); + data.res_paddr = __psp_pa(sev->guest_resp_buf); + + /* + * Firmware failures are propagated on to guest, but any other failure + * condition along the way should be reported to userspace. E.g. if + * the PSP is dead and commands are timing out. + */ + ret = sev_issue_cmd(kvm, SEV_CMD_SNP_GUEST_REQUEST, &data, &fw_err); + if (ret && !fw_err) + goto out_unlock; + + if (kvm_write_guest(kvm, resp_gpa, sev->guest_resp_buf, PAGE_SIZE)) { + ret = -EIO; + goto out_unlock; + } + + /* No action is requested *from KVM* if there was a firmware error. */ + svm_vmgexit_no_action(svm, SNP_GUEST_ERR(0, fw_err)); + + ret = 1; /* resume guest */ + +out_unlock: + mutex_unlock(&sev->guest_req_mutex); + return ret; +} + +static int snp_handle_ext_guest_req(struct vcpu_svm *svm, gpa_t req_gpa, gpa_t resp_gpa) +{ + struct kvm *kvm = svm->vcpu.kvm; + u8 msg_type; + + if (!sev_snp_guest(kvm)) + return -EINVAL; + + if (kvm_read_guest(kvm, req_gpa + offsetof(struct snp_guest_msg_hdr, msg_type), + &msg_type, 1)) + return -EIO; + + /* + * As per GHCB spec, requests of type MSG_REPORT_REQ also allow for + * additional certificate data to be provided alongside the attestation + * report via the guest-provided data pages indicated by RAX/RBX. The + * certificate data is optional and requires additional KVM enablement + * to provide an interface for userspace to provide it, but KVM still + * needs to be able to handle extended guest requests either way. So + * provide a stub implementation that will always return an empty + * certificate table in the guest-provided data pages. + */ + if (msg_type == SNP_MSG_REPORT_REQ) { + struct kvm_vcpu *vcpu = &svm->vcpu; + u64 data_npages; + gpa_t data_gpa; + + if (!kvm_ghcb_rax_is_valid(svm) || !kvm_ghcb_rbx_is_valid(svm)) + goto request_invalid; + + data_gpa = vcpu->arch.regs[VCPU_REGS_RAX]; + data_npages = vcpu->arch.regs[VCPU_REGS_RBX]; + + if (!PAGE_ALIGNED(data_gpa)) + goto request_invalid; + + /* + * As per GHCB spec (see "SNP Extended Guest Request"), the + * certificate table is terminated by 24-bytes of zeroes. + */ + if (data_npages && kvm_clear_guest(kvm, data_gpa, 24)) + return -EIO; + } + + return snp_handle_guest_req(svm, req_gpa, resp_gpa); + +request_invalid: + svm_vmgexit_bad_input(svm, GHCB_ERR_INVALID_INPUT); + return 1; /* resume guest */ +} + static int sev_handle_vmgexit_msr_protocol(struct vcpu_svm *svm) { struct vmcb_control_area *control = &svm->vmcb->control; struct kvm_vcpu *vcpu = &svm->vcpu; + struct kvm_sev_info *sev = to_kvm_sev_info(vcpu->kvm); u64 ghcb_info; int ret = 1; @@ -2728,7 +4091,7 @@ static int sev_handle_vmgexit_msr_protocol(struct vcpu_svm *svm) switch (ghcb_info) { case GHCB_MSR_SEV_INFO_REQ: - set_ghcb_msr(svm, GHCB_MSR_SEV_INFO(GHCB_VERSION_MAX, + set_ghcb_msr(svm, GHCB_MSR_SEV_INFO((__u64)sev->ghcb_version, GHCB_VERSION_MIN, sev_enc_bit)); break; @@ -2770,6 +4133,60 @@ static int sev_handle_vmgexit_msr_protocol(struct vcpu_svm *svm) GHCB_MSR_INFO_POS); break; } + case GHCB_MSR_AP_RESET_HOLD_REQ: + svm->sev_es.ap_reset_hold_type = AP_RESET_HOLD_MSR_PROTO; + ret = kvm_emulate_ap_reset_hold(&svm->vcpu); + + /* + * Preset the result to a non-SIPI return and then only set + * the result to non-zero when delivering a SIPI. + */ + set_ghcb_msr_bits(svm, 0, + GHCB_MSR_AP_RESET_HOLD_RESULT_MASK, + GHCB_MSR_AP_RESET_HOLD_RESULT_POS); + + set_ghcb_msr_bits(svm, GHCB_MSR_AP_RESET_HOLD_RESP, + GHCB_MSR_INFO_MASK, + GHCB_MSR_INFO_POS); + break; + case GHCB_MSR_HV_FT_REQ: + set_ghcb_msr_bits(svm, GHCB_HV_FT_SUPPORTED, + GHCB_MSR_HV_FT_MASK, GHCB_MSR_HV_FT_POS); + set_ghcb_msr_bits(svm, GHCB_MSR_HV_FT_RESP, + GHCB_MSR_INFO_MASK, GHCB_MSR_INFO_POS); + break; + case GHCB_MSR_PREF_GPA_REQ: + if (!sev_snp_guest(vcpu->kvm)) + goto out_terminate; + + set_ghcb_msr_bits(svm, GHCB_MSR_PREF_GPA_NONE, GHCB_MSR_GPA_VALUE_MASK, + GHCB_MSR_GPA_VALUE_POS); + set_ghcb_msr_bits(svm, GHCB_MSR_PREF_GPA_RESP, GHCB_MSR_INFO_MASK, + GHCB_MSR_INFO_POS); + break; + case GHCB_MSR_REG_GPA_REQ: { + u64 gfn; + + if (!sev_snp_guest(vcpu->kvm)) + goto out_terminate; + + gfn = get_ghcb_msr_bits(svm, GHCB_MSR_GPA_VALUE_MASK, + GHCB_MSR_GPA_VALUE_POS); + + svm->sev_es.ghcb_registered_gpa = gfn_to_gpa(gfn); + + set_ghcb_msr_bits(svm, gfn, GHCB_MSR_GPA_VALUE_MASK, + GHCB_MSR_GPA_VALUE_POS); + set_ghcb_msr_bits(svm, GHCB_MSR_REG_GPA_RESP, GHCB_MSR_INFO_MASK, + GHCB_MSR_INFO_POS); + break; + } + case GHCB_MSR_PSC_REQ: + if (!sev_snp_guest(vcpu->kvm)) + goto out_terminate; + + ret = snp_begin_psc_msr(svm, control->ghcb_gpa); + break; case GHCB_MSR_TERM_REQ: { u64 reason_set, reason_code; @@ -2782,12 +4199,7 @@ static int sev_handle_vmgexit_msr_protocol(struct vcpu_svm *svm) pr_info("SEV-ES guest requested termination: %#llx:%#llx\n", reason_set, reason_code); - vcpu->run->exit_reason = KVM_EXIT_SYSTEM_EVENT; - vcpu->run->system_event.type = KVM_SYSTEM_EVENT_SEV_TERM; - vcpu->run->system_event.ndata = 1; - vcpu->run->system_event.data[0] = control->ghcb_gpa; - - return 0; + goto out_terminate; } default: /* Error, keep GHCB MSR value as-is */ @@ -2798,6 +4210,14 @@ static int sev_handle_vmgexit_msr_protocol(struct vcpu_svm *svm) control->ghcb_gpa, ret); return ret; + +out_terminate: + vcpu->run->exit_reason = KVM_EXIT_SYSTEM_EVENT; + vcpu->run->system_event.type = KVM_SYSTEM_EVENT_SEV_TERM; + vcpu->run->system_event.ndata = 1; + vcpu->run->system_event.data[0] = control->ghcb_gpa; + + return 0; } int sev_handle_vmgexit(struct kvm_vcpu *vcpu) @@ -2805,7 +4225,6 @@ int sev_handle_vmgexit(struct kvm_vcpu *vcpu) struct vcpu_svm *svm = to_svm(vcpu); struct vmcb_control_area *control = &svm->vmcb->control; u64 ghcb_gpa, exit_code; - struct ghcb *ghcb; int ret; /* Validate the GHCB */ @@ -2830,20 +4249,24 @@ int sev_handle_vmgexit(struct kvm_vcpu *vcpu) } svm->sev_es.ghcb = svm->sev_es.ghcb_map.hva; - ghcb = svm->sev_es.ghcb_map.hva; - trace_kvm_vmgexit_enter(vcpu->vcpu_id, ghcb); + trace_kvm_vmgexit_enter(vcpu->vcpu_id, svm->sev_es.ghcb); - exit_code = ghcb_get_sw_exit_code(ghcb); + sev_es_sync_from_ghcb(svm); + + /* SEV-SNP guest requires that the GHCB GPA must be registered */ + if (sev_snp_guest(svm->vcpu.kvm) && !ghcb_gpa_is_registered(svm, ghcb_gpa)) { + vcpu_unimpl(&svm->vcpu, "vmgexit: GHCB GPA [%#llx] is not registered.\n", ghcb_gpa); + return -EINVAL; + } ret = sev_es_validate_vmgexit(svm); if (ret) return ret; - sev_es_sync_from_ghcb(svm); - ghcb_set_sw_exit_info_1(ghcb, 0); - ghcb_set_sw_exit_info_2(ghcb, 0); + svm_vmgexit_success(svm, 0); + exit_code = kvm_ghcb_get_sw_exit_code(control); switch (exit_code) { case SVM_VMGEXIT_MMIO_READ: ret = setup_vmgexit_scratch(svm, true, control->exit_info_2); @@ -2866,13 +4289,17 @@ int sev_handle_vmgexit(struct kvm_vcpu *vcpu) svm->sev_es.ghcb_sa); break; case SVM_VMGEXIT_NMI_COMPLETE: - ret = svm_invoke_exit_handler(vcpu, SVM_EXIT_IRET); + ++vcpu->stat.nmi_window_exits; + svm->nmi_masked = false; + kvm_make_request(KVM_REQ_EVENT, vcpu); + ret = 1; break; case SVM_VMGEXIT_AP_HLT_LOOP: + svm->sev_es.ap_reset_hold_type = AP_RESET_HOLD_NAE_EVENT; ret = kvm_emulate_ap_reset_hold(vcpu); break; case SVM_VMGEXIT_AP_JUMP_TABLE: { - struct kvm_sev_info *sev = &to_kvm_svm(vcpu->kvm)->sev_info; + struct kvm_sev_info *sev = to_kvm_sev_info(vcpu->kvm); switch (control->exit_info_1) { case 0: @@ -2881,18 +4308,50 @@ int sev_handle_vmgexit(struct kvm_vcpu *vcpu) break; case 1: /* Get AP jump table address */ - ghcb_set_sw_exit_info_2(ghcb, sev->ap_jump_table); + svm_vmgexit_success(svm, sev->ap_jump_table); break; default: pr_err("svm: vmgexit: unsupported AP jump table request - exit_info_1=%#llx\n", control->exit_info_1); - ghcb_set_sw_exit_info_1(ghcb, 2); - ghcb_set_sw_exit_info_2(ghcb, GHCB_ERR_INVALID_INPUT); + svm_vmgexit_bad_input(svm, GHCB_ERR_INVALID_INPUT); } ret = 1; break; } + case SVM_VMGEXIT_HV_FEATURES: + svm_vmgexit_success(svm, GHCB_HV_FT_SUPPORTED); + ret = 1; + break; + case SVM_VMGEXIT_TERM_REQUEST: + pr_info("SEV-ES guest requested termination: reason %#llx info %#llx\n", + control->exit_info_1, control->exit_info_2); + vcpu->run->exit_reason = KVM_EXIT_SYSTEM_EVENT; + vcpu->run->system_event.type = KVM_SYSTEM_EVENT_SEV_TERM; + vcpu->run->system_event.ndata = 1; + vcpu->run->system_event.data[0] = control->ghcb_gpa; + break; + case SVM_VMGEXIT_PSC: + ret = setup_vmgexit_scratch(svm, true, control->exit_info_2); + if (ret) + break; + + ret = snp_begin_psc(svm, svm->sev_es.ghcb_sa); + break; + case SVM_VMGEXIT_AP_CREATION: + ret = sev_snp_ap_creation(svm); + if (ret) { + svm_vmgexit_bad_input(svm, GHCB_ERR_INVALID_INPUT); + } + + ret = 1; + break; + case SVM_VMGEXIT_GUEST_REQUEST: + ret = snp_handle_guest_req(svm, control->exit_info_1, control->exit_info_2); + break; + case SVM_VMGEXIT_EXT_GUEST_REQUEST: + ret = snp_handle_ext_guest_req(svm, control->exit_info_1, control->exit_info_2); + break; case SVM_VMGEXIT_UNSUPPORTED_EVENT: vcpu_unimpl(vcpu, "vmgexit: unsupported event - exit_info_1=%#llx, exit_info_2=%#llx\n", @@ -2927,19 +4386,71 @@ int sev_es_string_io(struct vcpu_svm *svm, int size, unsigned int port, int in) count, in); } +static void sev_es_vcpu_after_set_cpuid(struct vcpu_svm *svm) +{ + struct kvm_vcpu *vcpu = &svm->vcpu; + + if (boot_cpu_has(X86_FEATURE_V_TSC_AUX)) { + bool v_tsc_aux = guest_cpu_cap_has(vcpu, X86_FEATURE_RDTSCP) || + guest_cpu_cap_has(vcpu, X86_FEATURE_RDPID); + + set_msr_interception(vcpu, svm->msrpm, MSR_TSC_AUX, v_tsc_aux, v_tsc_aux); + } + + /* + * For SEV-ES, accesses to MSR_IA32_XSS should not be intercepted if + * the host/guest supports its use. + * + * KVM treats the guest as being capable of using XSAVES even if XSAVES + * isn't enabled in guest CPUID as there is no intercept for XSAVES, + * i.e. the guest can use XSAVES/XRSTOR to read/write XSS if XSAVE is + * exposed to the guest and XSAVES is supported in hardware. Condition + * full XSS passthrough on the guest being able to use XSAVES *and* + * XSAVES being exposed to the guest so that KVM can at least honor + * guest CPUID for RDMSR and WRMSR. + */ + if (guest_cpu_cap_has(vcpu, X86_FEATURE_XSAVES) && + guest_cpuid_has(vcpu, X86_FEATURE_XSAVES)) + set_msr_interception(vcpu, svm->msrpm, MSR_IA32_XSS, 1, 1); + else + set_msr_interception(vcpu, svm->msrpm, MSR_IA32_XSS, 0, 0); +} + +void sev_vcpu_after_set_cpuid(struct vcpu_svm *svm) +{ + struct kvm_vcpu *vcpu = &svm->vcpu; + struct kvm_cpuid_entry2 *best; + + /* For sev guests, the memory encryption bit is not reserved in CR3. */ + best = kvm_find_cpuid_entry(vcpu, 0x8000001F); + if (best) + vcpu->arch.reserved_gpa_bits &= ~(1UL << (best->ebx & 0x3f)); + + if (sev_es_guest(svm->vcpu.kvm)) + sev_es_vcpu_after_set_cpuid(svm); +} + static void sev_es_init_vmcb(struct vcpu_svm *svm) { + struct kvm_sev_info *sev = to_kvm_sev_info(svm->vcpu.kvm); + struct vmcb *vmcb = svm->vmcb01.ptr; struct kvm_vcpu *vcpu = &svm->vcpu; svm->vmcb->control.nested_ctl |= SVM_NESTED_CTL_SEV_ES_ENABLE; - svm->vmcb->control.virt_ext |= LBR_CTL_ENABLE_MASK; /* * An SEV-ES guest requires a VMSA area that is a separate from the * VMCB page. Do not include the encryption mask on the VMSA physical - * address since hardware will access it using the guest key. + * address since hardware will access it using the guest key. Note, + * the VMSA will be NULL if this vCPU is the destination for intrahost + * migration, and will be copied later. */ - svm->vmcb->control.vmsa_pa = __pa(svm->sev_es.vmsa); + if (svm->sev_es.vmsa && !svm->sev_es.snp_has_guest_vmsa) + svm->vmcb->control.vmsa_pa = __pa(svm->sev_es.vmsa); + + if (cpu_feature_enabled(X86_FEATURE_ALLOWED_SEV_FEATURES)) + svm->vmcb->control.allowed_sev_features = sev->vmsa_features | + VMCB_ALLOWED_SEV_FEATURES_VALID; /* Can't intercept CR register access, HV can't modify CR registers */ svm_clr_intercept(svm, INTERCEPT_CR0_READ); @@ -2957,8 +4468,23 @@ static void sev_es_init_vmcb(struct vcpu_svm *svm) svm_set_intercept(svm, TRAP_CR4_WRITE); svm_set_intercept(svm, TRAP_CR8_WRITE); - /* No support for enable_vmware_backdoor */ - clr_exception_intercept(svm, GP_VECTOR); + vmcb->control.intercepts[INTERCEPT_DR] = 0; + if (!sev_vcpu_has_debug_swap(svm)) { + vmcb_set_intercept(&vmcb->control, INTERCEPT_DR7_READ); + vmcb_set_intercept(&vmcb->control, INTERCEPT_DR7_WRITE); + recalc_intercepts(svm); + } else { + /* + * Disable #DB intercept iff DebugSwap is enabled. KVM doesn't + * allow debugging SEV-ES guests, and enables DebugSwap iff + * NO_NESTED_DATA_BP is supported, so there's no reason to + * intercept #DB when DebugSwap is enabled. For simplicity + * with respect to guest debug, intercept #DB for other VMs + * even if NO_NESTED_DATA_BP is supported, i.e. even if the + * guest can't DoS the CPU with infinite #DB vectoring. + */ + clr_exception_intercept(svm, DB_VECTOR); + } /* Can't intercept XSETBV, HV can't modify XCR0 directly */ svm_clr_intercept(svm, INTERCEPT_XSETBV); @@ -2966,18 +4492,6 @@ static void sev_es_init_vmcb(struct vcpu_svm *svm) /* Clear intercepts on selected MSRs */ set_msr_interception(vcpu, svm->msrpm, MSR_EFER, 1, 1); set_msr_interception(vcpu, svm->msrpm, MSR_IA32_CR_PAT, 1, 1); - set_msr_interception(vcpu, svm->msrpm, MSR_IA32_LASTBRANCHFROMIP, 1, 1); - set_msr_interception(vcpu, svm->msrpm, MSR_IA32_LASTBRANCHTOIP, 1, 1); - set_msr_interception(vcpu, svm->msrpm, MSR_IA32_LASTINTFROMIP, 1, 1); - set_msr_interception(vcpu, svm->msrpm, MSR_IA32_LASTINTTOIP, 1, 1); - - if (boot_cpu_has(X86_FEATURE_V_TSC_AUX) && - (guest_cpuid_has(&svm->vcpu, X86_FEATURE_RDTSCP) || - guest_cpuid_has(&svm->vcpu, X86_FEATURE_RDPID))) { - set_msr_interception(vcpu, svm->msrpm, MSR_TSC_AUX, 1, 1); - if (guest_cpuid_has(&svm->vcpu, X86_FEATURE_RDTSCP)) - svm_clr_intercept(svm, INTERCEPT_RDTSCP); - } } void sev_init_vmcb(struct vcpu_svm *svm) @@ -2985,38 +4499,80 @@ void sev_init_vmcb(struct vcpu_svm *svm) svm->vmcb->control.nested_ctl |= SVM_NESTED_CTL_SEV_ENABLE; clr_exception_intercept(svm, UD_VECTOR); + /* + * Don't intercept #GP for SEV guests, e.g. for the VMware backdoor, as + * KVM can't decrypt guest memory to decode the faulting instruction. + */ + clr_exception_intercept(svm, GP_VECTOR); + if (sev_es_guest(svm->vcpu.kvm)) sev_es_init_vmcb(svm); } void sev_es_vcpu_reset(struct vcpu_svm *svm) { + struct kvm_vcpu *vcpu = &svm->vcpu; + struct kvm_sev_info *sev = to_kvm_sev_info(vcpu->kvm); + /* * Set the GHCB MSR value as per the GHCB specification when emulating * vCPU RESET for an SEV-ES guest. */ - set_ghcb_msr(svm, GHCB_MSR_SEV_INFO(GHCB_VERSION_MAX, + set_ghcb_msr(svm, GHCB_MSR_SEV_INFO((__u64)sev->ghcb_version, GHCB_VERSION_MIN, sev_enc_bit)); + + mutex_init(&svm->sev_es.snp_vmsa_mutex); } -void sev_es_prepare_switch_to_guest(struct sev_es_save_area *hostsa) +void sev_es_prepare_switch_to_guest(struct vcpu_svm *svm, struct sev_es_save_area *hostsa) { + struct kvm *kvm = svm->vcpu.kvm; + /* - * As an SEV-ES guest, hardware will restore the host state on VMEXIT, - * of which one step is to perform a VMLOAD. KVM performs the - * corresponding VMSAVE in svm_prepare_guest_switch for both - * traditional and SEV-ES guests. + * All host state for SEV-ES guests is categorized into three swap types + * based on how it is handled by hardware during a world switch: + * + * A: VMRUN: Host state saved in host save area + * VMEXIT: Host state loaded from host save area + * + * B: VMRUN: Host state _NOT_ saved in host save area + * VMEXIT: Host state loaded from host save area + * + * C: VMRUN: Host state _NOT_ saved in host save area + * VMEXIT: Host state initialized to default(reset) values + * + * Manually save type-B state, i.e. state that is loaded by VMEXIT but + * isn't saved by VMRUN, that isn't already saved by VMSAVE (performed + * by common SVM code). */ - - /* XCR0 is restored on VMEXIT, save the current host value */ - hostsa->xcr0 = xgetbv(XCR_XFEATURE_ENABLED_MASK); - - /* PKRU is restored on VMEXIT, save the current host value */ + hostsa->xcr0 = kvm_host.xcr0; hostsa->pkru = read_pkru(); + hostsa->xss = kvm_host.xss; - /* MSR_IA32_XSS is restored on VMEXIT, save the currnet host value */ - hostsa->xss = host_xss; + /* + * If DebugSwap is enabled, debug registers are loaded but NOT saved by + * the CPU (Type-B). If DebugSwap is disabled/unsupported, the CPU does + * not save or load debug registers. Sadly, KVM can't prevent SNP + * guests from lying about DebugSwap on secondary vCPUs, i.e. the + * SEV_FEATURES provided at "AP Create" isn't guaranteed to match what + * the guest has actually enabled (or not!) in the VMSA. + * + * If DebugSwap is *possible*, save the masks so that they're restored + * if the guest enables DebugSwap. But for the DRs themselves, do NOT + * rely on the CPU to restore the host values; KVM will restore them as + * needed in common code, via hw_breakpoint_restore(). Note, KVM does + * NOT support virtualizing Breakpoint Extensions, i.e. the mask MSRs + * don't need to be restored per se, KVM just needs to ensure they are + * loaded with the correct values *if* the CPU writes the MSRs. + */ + if (sev_vcpu_has_debug_swap(svm) || + (sev_snp_guest(kvm) && cpu_feature_enabled(X86_FEATURE_DEBUG_SWAP))) { + hostsa->dr0_addr_mask = amd_get_dr_addr_mask(0); + hostsa->dr1_addr_mask = amd_get_dr_addr_mask(1); + hostsa->dr2_addr_mask = amd_get_dr_addr_mask(2); + hostsa->dr3_addr_mask = amd_get_dr_addr_mask(3); + } } void sev_vcpu_deliver_sipi_vector(struct kvm_vcpu *vcpu, u8 vector) @@ -3029,13 +4585,424 @@ void sev_vcpu_deliver_sipi_vector(struct kvm_vcpu *vcpu, u8 vector) return; } + /* Subsequent SIPI */ + switch (svm->sev_es.ap_reset_hold_type) { + case AP_RESET_HOLD_NAE_EVENT: + /* + * Return from an AP Reset Hold VMGEXIT, where the guest will + * set the CS and RIP. Set SW_EXIT_INFO_2 to a non-zero value. + */ + svm_vmgexit_success(svm, 1); + break; + case AP_RESET_HOLD_MSR_PROTO: + /* + * Return from an AP Reset Hold VMGEXIT, where the guest will + * set the CS and RIP. Set GHCB data field to a non-zero value. + */ + set_ghcb_msr_bits(svm, 1, + GHCB_MSR_AP_RESET_HOLD_RESULT_MASK, + GHCB_MSR_AP_RESET_HOLD_RESULT_POS); + + set_ghcb_msr_bits(svm, GHCB_MSR_AP_RESET_HOLD_RESP, + GHCB_MSR_INFO_MASK, + GHCB_MSR_INFO_POS); + break; + default: + break; + } +} + +struct page *snp_safe_alloc_page_node(int node, gfp_t gfp) +{ + unsigned long pfn; + struct page *p; + + if (!cc_platform_has(CC_ATTR_HOST_SEV_SNP)) + return alloc_pages_node(node, gfp | __GFP_ZERO, 0); + /* - * Subsequent SIPI: Return from an AP Reset Hold VMGEXIT, where - * the guest will set the CS and RIP. Set SW_EXIT_INFO_2 to a - * non-zero value. + * Allocate an SNP-safe page to workaround the SNP erratum where + * the CPU will incorrectly signal an RMP violation #PF if a + * hugepage (2MB or 1GB) collides with the RMP entry of a + * 2MB-aligned VMCB, VMSA, or AVIC backing page. + * + * Allocate one extra page, choose a page which is not + * 2MB-aligned, and free the other. */ - if (!svm->sev_es.ghcb) + p = alloc_pages_node(node, gfp | __GFP_ZERO, 1); + if (!p) + return NULL; + + split_page(p, 1); + + pfn = page_to_pfn(p); + if (IS_ALIGNED(pfn, PTRS_PER_PMD)) + __free_page(p++); + else + __free_page(p + 1); + + return p; +} + +void sev_handle_rmp_fault(struct kvm_vcpu *vcpu, gpa_t gpa, u64 error_code) +{ + struct kvm_memory_slot *slot; + struct kvm *kvm = vcpu->kvm; + int order, rmp_level, ret; + struct page *page; + bool assigned; + kvm_pfn_t pfn; + gfn_t gfn; + + gfn = gpa >> PAGE_SHIFT; + + /* + * The only time RMP faults occur for shared pages is when the guest is + * triggering an RMP fault for an implicit page-state change from + * shared->private. Implicit page-state changes are forwarded to + * userspace via KVM_EXIT_MEMORY_FAULT events, however, so RMP faults + * for shared pages should not end up here. + */ + if (!kvm_mem_is_private(kvm, gfn)) { + pr_warn_ratelimited("SEV: Unexpected RMP fault for non-private GPA 0x%llx\n", + gpa); + return; + } + + slot = gfn_to_memslot(kvm, gfn); + if (!kvm_slot_can_be_private(slot)) { + pr_warn_ratelimited("SEV: Unexpected RMP fault, non-private slot for GPA 0x%llx\n", + gpa); + return; + } + + ret = kvm_gmem_get_pfn(kvm, slot, gfn, &pfn, &page, &order); + if (ret) { + pr_warn_ratelimited("SEV: Unexpected RMP fault, no backing page for private GPA 0x%llx\n", + gpa); + return; + } + + ret = snp_lookup_rmpentry(pfn, &assigned, &rmp_level); + if (ret || !assigned) { + pr_warn_ratelimited("SEV: Unexpected RMP fault, no assigned RMP entry found for GPA 0x%llx PFN 0x%llx error %d\n", + gpa, pfn, ret); + goto out_no_trace; + } + + /* + * There are 2 cases where a PSMASH may be needed to resolve an #NPF + * with PFERR_GUEST_RMP_BIT set: + * + * 1) RMPADJUST/PVALIDATE can trigger an #NPF with PFERR_GUEST_SIZEM + * bit set if the guest issues them with a smaller granularity than + * what is indicated by the page-size bit in the 2MB RMP entry for + * the PFN that backs the GPA. + * + * 2) Guest access via NPT can trigger an #NPF if the NPT mapping is + * smaller than what is indicated by the 2MB RMP entry for the PFN + * that backs the GPA. + * + * In both these cases, the corresponding 2M RMP entry needs to + * be PSMASH'd to 512 4K RMP entries. If the RMP entry is already + * split into 4K RMP entries, then this is likely a spurious case which + * can occur when there are concurrent accesses by the guest to a 2MB + * GPA range that is backed by a 2MB-aligned PFN who's RMP entry is in + * the process of being PMASH'd into 4K entries. These cases should + * resolve automatically on subsequent accesses, so just ignore them + * here. + */ + if (rmp_level == PG_LEVEL_4K) + goto out; + + ret = snp_rmptable_psmash(pfn); + if (ret) { + /* + * Look it up again. If it's 4K now then the PSMASH may have + * raced with another process and the issue has already resolved + * itself. + */ + if (!snp_lookup_rmpentry(pfn, &assigned, &rmp_level) && + assigned && rmp_level == PG_LEVEL_4K) + goto out; + + pr_warn_ratelimited("SEV: Unable to split RMP entry for GPA 0x%llx PFN 0x%llx ret %d\n", + gpa, pfn, ret); + } + + kvm_zap_gfn_range(kvm, gfn, gfn + PTRS_PER_PMD); +out: + trace_kvm_rmp_fault(vcpu, gpa, pfn, error_code, rmp_level, ret); +out_no_trace: + kvm_release_page_unused(page); +} + +static bool is_pfn_range_shared(kvm_pfn_t start, kvm_pfn_t end) +{ + kvm_pfn_t pfn = start; + + while (pfn < end) { + int ret, rmp_level; + bool assigned; + + ret = snp_lookup_rmpentry(pfn, &assigned, &rmp_level); + if (ret) { + pr_warn_ratelimited("SEV: Failed to retrieve RMP entry: PFN 0x%llx GFN start 0x%llx GFN end 0x%llx RMP level %d error %d\n", + pfn, start, end, rmp_level, ret); + return false; + } + + if (assigned) { + pr_debug("%s: overlap detected, PFN 0x%llx start 0x%llx end 0x%llx RMP level %d\n", + __func__, pfn, start, end, rmp_level); + return false; + } + + pfn++; + } + + return true; +} + +static u8 max_level_for_order(int order) +{ + if (order >= KVM_HPAGE_GFN_SHIFT(PG_LEVEL_2M)) + return PG_LEVEL_2M; + + return PG_LEVEL_4K; +} + +static bool is_large_rmp_possible(struct kvm *kvm, kvm_pfn_t pfn, int order) +{ + kvm_pfn_t pfn_aligned = ALIGN_DOWN(pfn, PTRS_PER_PMD); + + /* + * If this is a large folio, and the entire 2M range containing the + * PFN is currently shared, then the entire 2M-aligned range can be + * set to private via a single 2M RMP entry. + */ + if (max_level_for_order(order) > PG_LEVEL_4K && + is_pfn_range_shared(pfn_aligned, pfn_aligned + PTRS_PER_PMD)) + return true; + + return false; +} + +int sev_gmem_prepare(struct kvm *kvm, kvm_pfn_t pfn, gfn_t gfn, int max_order) +{ + struct kvm_sev_info *sev = to_kvm_sev_info(kvm); + kvm_pfn_t pfn_aligned; + gfn_t gfn_aligned; + int level, rc; + bool assigned; + + if (!sev_snp_guest(kvm)) + return 0; + + rc = snp_lookup_rmpentry(pfn, &assigned, &level); + if (rc) { + pr_err_ratelimited("SEV: Failed to look up RMP entry: GFN %llx PFN %llx error %d\n", + gfn, pfn, rc); + return -ENOENT; + } + + if (assigned) { + pr_debug("%s: already assigned: gfn %llx pfn %llx max_order %d level %d\n", + __func__, gfn, pfn, max_order, level); + return 0; + } + + if (is_large_rmp_possible(kvm, pfn, max_order)) { + level = PG_LEVEL_2M; + pfn_aligned = ALIGN_DOWN(pfn, PTRS_PER_PMD); + gfn_aligned = ALIGN_DOWN(gfn, PTRS_PER_PMD); + } else { + level = PG_LEVEL_4K; + pfn_aligned = pfn; + gfn_aligned = gfn; + } + + rc = rmp_make_private(pfn_aligned, gfn_to_gpa(gfn_aligned), level, sev->asid, false); + if (rc) { + pr_err_ratelimited("SEV: Failed to update RMP entry: GFN %llx PFN %llx level %d error %d\n", + gfn, pfn, level, rc); + return -EINVAL; + } + + pr_debug("%s: updated: gfn %llx pfn %llx pfn_aligned %llx max_order %d level %d\n", + __func__, gfn, pfn, pfn_aligned, max_order, level); + + return 0; +} + +void sev_gmem_invalidate(kvm_pfn_t start, kvm_pfn_t end) +{ + kvm_pfn_t pfn; + + if (!cc_platform_has(CC_ATTR_HOST_SEV_SNP)) + return; + + pr_debug("%s: PFN start 0x%llx PFN end 0x%llx\n", __func__, start, end); + + for (pfn = start; pfn < end;) { + bool use_2m_update = false; + int rc, rmp_level; + bool assigned; + + rc = snp_lookup_rmpentry(pfn, &assigned, &rmp_level); + if (rc || !assigned) + goto next_pfn; + + use_2m_update = IS_ALIGNED(pfn, PTRS_PER_PMD) && + end >= (pfn + PTRS_PER_PMD) && + rmp_level > PG_LEVEL_4K; + + /* + * If an unaligned PFN corresponds to a 2M region assigned as a + * large page in the RMP table, PSMASH the region into individual + * 4K RMP entries before attempting to convert a 4K sub-page. + */ + if (!use_2m_update && rmp_level > PG_LEVEL_4K) { + /* + * This shouldn't fail, but if it does, report it, but + * still try to update RMP entry to shared and pray this + * was a spurious error that can be addressed later. + */ + rc = snp_rmptable_psmash(pfn); + WARN_ONCE(rc, "SEV: Failed to PSMASH RMP entry for PFN 0x%llx error %d\n", + pfn, rc); + } + + rc = rmp_make_shared(pfn, use_2m_update ? PG_LEVEL_2M : PG_LEVEL_4K); + if (WARN_ONCE(rc, "SEV: Failed to update RMP entry for PFN 0x%llx error %d\n", + pfn, rc)) + goto next_pfn; + + /* + * SEV-ES avoids host/guest cache coherency issues through + * WBINVD hooks issued via MMU notifiers during run-time, and + * KVM's VM destroy path at shutdown. Those MMU notifier events + * don't cover gmem since there is no requirement to map pages + * to a HVA in order to use them for a running guest. While the + * shutdown path would still likely cover things for SNP guests, + * userspace may also free gmem pages during run-time via + * hole-punching operations on the guest_memfd, so flush the + * cache entries for these pages before free'ing them back to + * the host. + */ + clflush_cache_range(__va(pfn_to_hpa(pfn)), + use_2m_update ? PMD_SIZE : PAGE_SIZE); +next_pfn: + pfn += use_2m_update ? PTRS_PER_PMD : 1; + cond_resched(); + } +} + +int sev_private_max_mapping_level(struct kvm *kvm, kvm_pfn_t pfn) +{ + int level, rc; + bool assigned; + + if (!sev_snp_guest(kvm)) + return 0; + + rc = snp_lookup_rmpentry(pfn, &assigned, &level); + if (rc || !assigned) + return PG_LEVEL_4K; + + return level; +} + +struct vmcb_save_area *sev_decrypt_vmsa(struct kvm_vcpu *vcpu) +{ + struct vcpu_svm *svm = to_svm(vcpu); + struct vmcb_save_area *vmsa; + struct kvm_sev_info *sev; + int error = 0; + int ret; + + if (!sev_es_guest(vcpu->kvm)) + return NULL; + + /* + * If the VMSA has not yet been encrypted, return a pointer to the + * current un-encrypted VMSA. + */ + if (!vcpu->arch.guest_state_protected) + return (struct vmcb_save_area *)svm->sev_es.vmsa; + + sev = to_kvm_sev_info(vcpu->kvm); + + /* Check if the SEV policy allows debugging */ + if (sev_snp_guest(vcpu->kvm)) { + if (!(sev->policy & SNP_POLICY_DEBUG)) + return NULL; + } else { + if (sev->policy & SEV_POLICY_NODBG) + return NULL; + } + + if (sev_snp_guest(vcpu->kvm)) { + struct sev_data_snp_dbg dbg = {0}; + + vmsa = snp_alloc_firmware_page(__GFP_ZERO); + if (!vmsa) + return NULL; + + dbg.gctx_paddr = __psp_pa(sev->snp_context); + dbg.src_addr = svm->vmcb->control.vmsa_pa; + dbg.dst_addr = __psp_pa(vmsa); + + ret = sev_do_cmd(SEV_CMD_SNP_DBG_DECRYPT, &dbg, &error); + + /* + * Return the target page to a hypervisor page no matter what. + * If this fails, the page can't be used, so leak it and don't + * try to use it. + */ + if (snp_page_reclaim(vcpu->kvm, PHYS_PFN(__pa(vmsa)))) + return NULL; + + if (ret) { + pr_err("SEV: SNP_DBG_DECRYPT failed ret=%d, fw_error=%d (%#x)\n", + ret, error, error); + free_page((unsigned long)vmsa); + + return NULL; + } + } else { + struct sev_data_dbg dbg = {0}; + struct page *vmsa_page; + + vmsa_page = alloc_page(GFP_KERNEL); + if (!vmsa_page) + return NULL; + + vmsa = page_address(vmsa_page); + + dbg.handle = sev->handle; + dbg.src_addr = svm->vmcb->control.vmsa_pa; + dbg.dst_addr = __psp_pa(vmsa); + dbg.len = PAGE_SIZE; + + ret = sev_do_cmd(SEV_CMD_DBG_DECRYPT, &dbg, &error); + if (ret) { + pr_err("SEV: SEV_CMD_DBG_DECRYPT failed ret=%d, fw_error=%d (0x%x)\n", + ret, error, error); + __free_page(vmsa_page); + + return NULL; + } + } + + return vmsa; +} + +void sev_free_decrypted_vmsa(struct kvm_vcpu *vcpu, struct vmcb_save_area *vmsa) +{ + /* If the VMSA has not yet been encrypted, nothing was allocated */ + if (!vcpu->arch.guest_state_protected || !vmsa) return; - ghcb_set_sw_exit_info_2(svm->sev_es.ghcb, 1); + free_page((unsigned long)vmsa); } |