diff options
-rw-r--r-- | arch/x86/kvm/vmx/main.c | 14 | ||||
-rw-r--r-- | arch/x86/kvm/vmx/tdx.c | 213 | ||||
-rw-r--r-- | arch/x86/kvm/vmx/tdx_arch.h | 23 | ||||
-rw-r--r-- | arch/x86/kvm/vmx/x86_ops.h | 37 |
4 files changed, 284 insertions, 3 deletions
diff --git a/arch/x86/kvm/vmx/main.c b/arch/x86/kvm/vmx/main.c index 0ea4bec0626e..0c94810b1f48 100644 --- a/arch/x86/kvm/vmx/main.c +++ b/arch/x86/kvm/vmx/main.c @@ -36,9 +36,21 @@ static __init int vt_hardware_setup(void) * is KVM may allocate couple of more bytes than needed for * each VM. */ - if (enable_tdx) + if (enable_tdx) { vt_x86_ops.vm_size = max_t(unsigned int, vt_x86_ops.vm_size, sizeof(struct kvm_tdx)); + /* + * Note, TDX may fail to initialize in a later time in + * vt_init(), in which case it is not necessary to setup + * those callbacks. But making them valid here even + * when TDX fails to init later is fine because those + * callbacks won't be called if the VM isn't TDX guest. + */ + vt_x86_ops.link_external_spt = tdx_sept_link_private_spt; + vt_x86_ops.set_external_spte = tdx_sept_set_private_spte; + vt_x86_ops.free_external_spt = tdx_sept_free_private_spt; + vt_x86_ops.remove_external_spte = tdx_sept_remove_private_spte; + } return 0; } diff --git a/arch/x86/kvm/vmx/tdx.c b/arch/x86/kvm/vmx/tdx.c index 18f09661f9b1..70473442f00c 100644 --- a/arch/x86/kvm/vmx/tdx.c +++ b/arch/x86/kvm/vmx/tdx.c @@ -154,6 +154,12 @@ static DEFINE_MUTEX(tdx_lock); static atomic_t nr_configured_hkid; +static bool tdx_operand_busy(u64 err) +{ + return (err & TDX_SEAMCALL_STATUS_MASK) == TDX_OPERAND_BUSY; +} + + static inline void tdx_hkid_free(struct kvm_tdx *kvm_tdx) { tdx_guest_keyid_free(kvm_tdx->hkid); @@ -525,6 +531,160 @@ void tdx_load_mmu_pgd(struct kvm_vcpu *vcpu, hpa_t root_hpa, int pgd_level) td_vmcs_write64(to_tdx(vcpu), SHARED_EPT_POINTER, root_hpa); } +static void tdx_unpin(struct kvm *kvm, struct page *page) +{ + put_page(page); +} + +static int tdx_mem_page_aug(struct kvm *kvm, gfn_t gfn, + enum pg_level level, struct page *page) +{ + int tdx_level = pg_level_to_tdx_sept_level(level); + struct kvm_tdx *kvm_tdx = to_kvm_tdx(kvm); + gpa_t gpa = gfn_to_gpa(gfn); + u64 entry, level_state; + u64 err; + + err = tdh_mem_page_aug(&kvm_tdx->td, gpa, tdx_level, page, &entry, &level_state); + if (unlikely(tdx_operand_busy(err))) { + tdx_unpin(kvm, page); + return -EBUSY; + } + + if (KVM_BUG_ON(err, kvm)) { + pr_tdx_error_2(TDH_MEM_PAGE_AUG, err, entry, level_state); + tdx_unpin(kvm, page); + return -EIO; + } + + return 0; +} + +int tdx_sept_set_private_spte(struct kvm *kvm, gfn_t gfn, + enum pg_level level, kvm_pfn_t pfn) +{ + struct kvm_tdx *kvm_tdx = to_kvm_tdx(kvm); + struct page *page = pfn_to_page(pfn); + + /* TODO: handle large pages. */ + if (KVM_BUG_ON(level != PG_LEVEL_4K, kvm)) + return -EINVAL; + + /* + * Because guest_memfd doesn't support page migration with + * a_ops->migrate_folio (yet), no callback is triggered for KVM on page + * migration. Until guest_memfd supports page migration, prevent page + * migration. + * TODO: Once guest_memfd introduces callback on page migration, + * implement it and remove get_page/put_page(). + */ + get_page(page); + + if (likely(kvm_tdx->state == TD_STATE_RUNNABLE)) + return tdx_mem_page_aug(kvm, gfn, level, page); + + /* + * TODO: KVM_TDX_INIT_MEM_REGION support to populate before finalize + * comes here for the initial memory. + */ + return -EOPNOTSUPP; +} + +static int tdx_sept_drop_private_spte(struct kvm *kvm, gfn_t gfn, + enum pg_level level, struct page *page) +{ + int tdx_level = pg_level_to_tdx_sept_level(level); + struct kvm_tdx *kvm_tdx = to_kvm_tdx(kvm); + gpa_t gpa = gfn_to_gpa(gfn); + u64 err, entry, level_state; + + /* TODO: handle large pages. */ + if (KVM_BUG_ON(level != PG_LEVEL_4K, kvm)) + return -EINVAL; + + if (KVM_BUG_ON(!is_hkid_assigned(kvm_tdx), kvm)) + return -EINVAL; + + do { + /* + * When zapping private page, write lock is held. So no race + * condition with other vcpu sept operation. Race only with + * TDH.VP.ENTER. + */ + err = tdh_mem_page_remove(&kvm_tdx->td, gpa, tdx_level, &entry, + &level_state); + } while (unlikely(tdx_operand_busy(err))); + + if (unlikely(kvm_tdx->state != TD_STATE_RUNNABLE && + err == (TDX_EPT_WALK_FAILED | TDX_OPERAND_ID_RCX))) { + /* + * This page was mapped with KVM_MAP_MEMORY, but + * KVM_TDX_INIT_MEM_REGION is not issued yet. + */ + if (!is_last_spte(entry, level) || !(entry & VMX_EPT_RWX_MASK)) { + tdx_unpin(kvm, page); + return 0; + } + } + + if (KVM_BUG_ON(err, kvm)) { + pr_tdx_error_2(TDH_MEM_PAGE_REMOVE, err, entry, level_state); + return -EIO; + } + + err = tdh_phymem_page_wbinvd_hkid((u16)kvm_tdx->hkid, page); + + if (KVM_BUG_ON(err, kvm)) { + pr_tdx_error(TDH_PHYMEM_PAGE_WBINVD, err); + return -EIO; + } + tdx_clear_page(page); + tdx_unpin(kvm, page); + return 0; +} + +int tdx_sept_link_private_spt(struct kvm *kvm, gfn_t gfn, + enum pg_level level, void *private_spt) +{ + int tdx_level = pg_level_to_tdx_sept_level(level); + gpa_t gpa = gfn_to_gpa(gfn); + struct page *page = virt_to_page(private_spt); + u64 err, entry, level_state; + + err = tdh_mem_sept_add(&to_kvm_tdx(kvm)->td, gpa, tdx_level, page, &entry, + &level_state); + if (unlikely(tdx_operand_busy(err))) + return -EBUSY; + + if (KVM_BUG_ON(err, kvm)) { + pr_tdx_error_2(TDH_MEM_SEPT_ADD, err, entry, level_state); + return -EIO; + } + + return 0; +} + +static int tdx_sept_zap_private_spte(struct kvm *kvm, gfn_t gfn, + enum pg_level level) +{ + int tdx_level = pg_level_to_tdx_sept_level(level); + struct kvm_tdx *kvm_tdx = to_kvm_tdx(kvm); + gpa_t gpa = gfn_to_gpa(gfn) & KVM_HPAGE_MASK(level); + u64 err, entry, level_state; + + /* For now large page isn't supported yet. */ + WARN_ON_ONCE(level != PG_LEVEL_4K); + + err = tdh_mem_range_block(&kvm_tdx->td, gpa, tdx_level, &entry, &level_state); + if (unlikely(tdx_operand_busy(err))) + return -EBUSY; + if (KVM_BUG_ON(err, kvm)) { + pr_tdx_error_2(TDH_MEM_RANGE_BLOCK, err, entry, level_state); + return -EIO; + } + return 0; +} + /* * Ensure shared and private EPTs to be flushed on all vCPUs. * tdh_mem_track() is the only caller that increases TD epoch. An increase in @@ -549,7 +709,7 @@ void tdx_load_mmu_pgd(struct kvm_vcpu *vcpu, hpa_t root_hpa, int pgd_level) * occurs certainly after TD epoch increment and before the next * tdh_mem_track(). */ -static void __always_unused tdx_track(struct kvm *kvm) +static void tdx_track(struct kvm *kvm) { struct kvm_tdx *kvm_tdx = to_kvm_tdx(kvm); u64 err; @@ -562,7 +722,7 @@ static void __always_unused tdx_track(struct kvm *kvm) do { err = tdh_mem_track(&kvm_tdx->td); - } while (unlikely((err & TDX_SEAMCALL_STATUS_MASK) == TDX_OPERAND_BUSY)); + } while (unlikely(tdx_operand_busy(err))); if (KVM_BUG_ON(err, kvm)) pr_tdx_error(TDH_MEM_TRACK, err); @@ -570,6 +730,55 @@ static void __always_unused tdx_track(struct kvm *kvm) kvm_make_all_cpus_request(kvm, KVM_REQ_OUTSIDE_GUEST_MODE); } +int tdx_sept_free_private_spt(struct kvm *kvm, gfn_t gfn, + enum pg_level level, void *private_spt) +{ + struct kvm_tdx *kvm_tdx = to_kvm_tdx(kvm); + + /* + * free_external_spt() is only called after hkid is freed when TD is + * tearing down. + * KVM doesn't (yet) zap page table pages in mirror page table while + * TD is active, though guest pages mapped in mirror page table could be + * zapped during TD is active, e.g. for shared <-> private conversion + * and slot move/deletion. + */ + if (KVM_BUG_ON(is_hkid_assigned(kvm_tdx), kvm)) + return -EINVAL; + + /* + * The HKID assigned to this TD was already freed and cache was + * already flushed. We don't have to flush again. + */ + return tdx_reclaim_page(virt_to_page(private_spt)); +} + +int tdx_sept_remove_private_spte(struct kvm *kvm, gfn_t gfn, + enum pg_level level, kvm_pfn_t pfn) +{ + int ret; + + /* + * HKID is released after all private pages have been removed, and set + * before any might be populated. Warn if zapping is attempted when + * there can't be anything populated in the private EPT. + */ + if (KVM_BUG_ON(!is_hkid_assigned(to_kvm_tdx(kvm)), kvm)) + return -EINVAL; + + ret = tdx_sept_zap_private_spte(kvm, gfn, level); + if (ret) + return ret; + + /* + * TDX requires TLB tracking before dropping private page. Do + * it here, although it is also done later. + */ + tdx_track(kvm); + + return tdx_sept_drop_private_spte(kvm, gfn, level, pfn_to_page(pfn)); +} + static int tdx_get_capabilities(struct kvm_tdx_cmd *cmd) { const struct tdx_sys_info_td_conf *td_conf = &tdx_sysinfo->td_conf; diff --git a/arch/x86/kvm/vmx/tdx_arch.h b/arch/x86/kvm/vmx/tdx_arch.h index 55a740f90e67..58bda8a5ce9a 100644 --- a/arch/x86/kvm/vmx/tdx_arch.h +++ b/arch/x86/kvm/vmx/tdx_arch.h @@ -121,6 +121,29 @@ struct td_params { #define TDX_MIN_TSC_FREQUENCY_KHZ (100 * 1000) #define TDX_MAX_TSC_FREQUENCY_KHZ (10 * 1000 * 1000) +/* Additional Secure EPT entry information */ +#define TDX_SEPT_LEVEL_MASK GENMASK_ULL(2, 0) +#define TDX_SEPT_STATE_MASK GENMASK_ULL(15, 8) +#define TDX_SEPT_STATE_SHIFT 8 + +enum tdx_sept_entry_state { + TDX_SEPT_FREE = 0, + TDX_SEPT_BLOCKED = 1, + TDX_SEPT_PENDING = 2, + TDX_SEPT_PENDING_BLOCKED = 3, + TDX_SEPT_PRESENT = 4, +}; + +static inline u8 tdx_get_sept_level(u64 sept_entry_info) +{ + return sept_entry_info & TDX_SEPT_LEVEL_MASK; +} + +static inline u8 tdx_get_sept_state(u64 sept_entry_info) +{ + return (sept_entry_info & TDX_SEPT_STATE_MASK) >> TDX_SEPT_STATE_SHIFT; +} + #define MD_FIELD_ID_FEATURES0_TOPOLOGY_ENUM BIT_ULL(20) /* diff --git a/arch/x86/kvm/vmx/x86_ops.h b/arch/x86/kvm/vmx/x86_ops.h index 5b6fc2252f98..444cdca96ce0 100644 --- a/arch/x86/kvm/vmx/x86_ops.h +++ b/arch/x86/kvm/vmx/x86_ops.h @@ -132,6 +132,15 @@ void tdx_vcpu_free(struct kvm_vcpu *vcpu); int tdx_vcpu_ioctl(struct kvm_vcpu *vcpu, void __user *argp); +int tdx_sept_link_private_spt(struct kvm *kvm, gfn_t gfn, + enum pg_level level, void *private_spt); +int tdx_sept_free_private_spt(struct kvm *kvm, gfn_t gfn, + enum pg_level level, void *private_spt); +int tdx_sept_set_private_spte(struct kvm *kvm, gfn_t gfn, + enum pg_level level, kvm_pfn_t pfn); +int tdx_sept_remove_private_spte(struct kvm *kvm, gfn_t gfn, + enum pg_level level, kvm_pfn_t pfn); + void tdx_flush_tlb_current(struct kvm_vcpu *vcpu); void tdx_flush_tlb_all(struct kvm_vcpu *vcpu); void tdx_load_mmu_pgd(struct kvm_vcpu *vcpu, hpa_t root_hpa, int root_level); @@ -146,6 +155,34 @@ static inline void tdx_vcpu_free(struct kvm_vcpu *vcpu) {} static inline int tdx_vcpu_ioctl(struct kvm_vcpu *vcpu, void __user *argp) { return -EOPNOTSUPP; } +static inline int tdx_sept_link_private_spt(struct kvm *kvm, gfn_t gfn, + enum pg_level level, + void *private_spt) +{ + return -EOPNOTSUPP; +} + +static inline int tdx_sept_free_private_spt(struct kvm *kvm, gfn_t gfn, + enum pg_level level, + void *private_spt) +{ + return -EOPNOTSUPP; +} + +static inline int tdx_sept_set_private_spte(struct kvm *kvm, gfn_t gfn, + enum pg_level level, + kvm_pfn_t pfn) +{ + return -EOPNOTSUPP; +} + +static inline int tdx_sept_remove_private_spte(struct kvm *kvm, gfn_t gfn, + enum pg_level level, + kvm_pfn_t pfn) +{ + return -EOPNOTSUPP; +} + static inline void tdx_flush_tlb_current(struct kvm_vcpu *vcpu) {} static inline void tdx_flush_tlb_all(struct kvm_vcpu *vcpu) {} static inline void tdx_load_mmu_pgd(struct kvm_vcpu *vcpu, hpa_t root_hpa, int root_level) {} |