diff options
-rw-r--r-- | arch/x86/include/asm/kvm_host.h | 12 | ||||
-rw-r--r-- | arch/x86/include/asm/tdx.h | 15 | ||||
-rw-r--r-- | arch/x86/include/asm/vmx.h | 1 | ||||
-rw-r--r-- | arch/x86/include/uapi/asm/kvm.h | 10 | ||||
-rw-r--r-- | arch/x86/kvm/mmu.h | 4 | ||||
-rw-r--r-- | arch/x86/kvm/mmu/mmu.c | 17 | ||||
-rw-r--r-- | arch/x86/kvm/mmu/mmu_internal.h | 5 | ||||
-rw-r--r-- | arch/x86/kvm/mmu/page_track.c | 3 | ||||
-rw-r--r-- | arch/x86/kvm/mmu/spte.c | 10 | ||||
-rw-r--r-- | arch/x86/kvm/mmu/tdp_mmu.c | 49 | ||||
-rw-r--r-- | arch/x86/kvm/vmx/common.h | 43 | ||||
-rw-r--r-- | arch/x86/kvm/vmx/main.c | 119 | ||||
-rw-r--r-- | arch/x86/kvm/vmx/tdx.c | 745 | ||||
-rw-r--r-- | arch/x86/kvm/vmx/tdx.h | 93 | ||||
-rw-r--r-- | arch/x86/kvm/vmx/tdx_arch.h | 23 | ||||
-rw-r--r-- | arch/x86/kvm/vmx/vmx.c | 31 | ||||
-rw-r--r-- | arch/x86/kvm/vmx/x86_ops.h | 51 | ||||
-rw-r--r-- | arch/x86/kvm/x86.c | 6 | ||||
-rw-r--r-- | arch/x86/virt/vmx/tdx/tdx.c | 139 | ||||
-rw-r--r-- | arch/x86/virt/vmx/tdx/tdx.h | 8 | ||||
-rw-r--r-- | include/linux/kvm_dirty_ring.h | 11 | ||||
-rw-r--r-- | virt/kvm/dirty_ring.c | 11 | ||||
-rw-r--r-- | virt/kvm/kvm_main.c | 4 |
23 files changed, 1325 insertions, 85 deletions
diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index 6800d3956ab1..ddb08581c64d 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -1562,6 +1562,13 @@ struct kvm_arch { struct kvm_mmu_memory_cache split_desc_cache; gfn_t gfn_direct_bits; + + /* + * Size of the CPU's dirty log buffer, i.e. VMX's PML buffer. A Zero + * value indicates CPU dirty logging is unsupported or disabled in + * current VM. + */ + int cpu_dirty_log_size; }; struct kvm_vm_stat { @@ -1815,11 +1822,6 @@ struct kvm_x86_ops { struct x86_exception *exception); void (*handle_exit_irqoff)(struct kvm_vcpu *vcpu); - /* - * Size of the CPU's dirty log buffer, i.e. VMX's PML buffer. A zero - * value indicates CPU dirty logging is unsupported or disabled. - */ - int cpu_dirty_log_size; void (*update_cpu_dirty_logging)(struct kvm_vcpu *vcpu); const struct kvm_x86_nested_ops *nested_ops; diff --git a/arch/x86/include/asm/tdx.h b/arch/x86/include/asm/tdx.h index 2879fc518a32..e38e7558c02f 100644 --- a/arch/x86/include/asm/tdx.h +++ b/arch/x86/include/asm/tdx.h @@ -148,7 +148,6 @@ struct tdx_vp { struct page **tdcx_pages; }; - static inline u64 mk_keyed_paddr(u16 hkid, struct page *page) { u64 ret; @@ -158,15 +157,26 @@ static inline u64 mk_keyed_paddr(u16 hkid, struct page *page) ret |= (u64)hkid << boot_cpu_data.x86_phys_bits; return ret; +} +static inline int pg_level_to_tdx_sept_level(enum pg_level level) +{ + WARN_ON_ONCE(level == PG_LEVEL_NONE); + return level - 1; } u64 tdh_mng_addcx(struct tdx_td *td, struct page *tdcs_page); +u64 tdh_mem_page_add(struct tdx_td *td, u64 gpa, struct page *page, struct page *source, u64 *ext_err1, u64 *ext_err2); +u64 tdh_mem_sept_add(struct tdx_td *td, u64 gpa, int level, struct page *page, u64 *ext_err1, u64 *ext_err2); u64 tdh_vp_addcx(struct tdx_vp *vp, struct page *tdcx_page); +u64 tdh_mem_page_aug(struct tdx_td *td, u64 gpa, int level, struct page *page, u64 *ext_err1, u64 *ext_err2); +u64 tdh_mem_range_block(struct tdx_td *td, u64 gpa, int level, u64 *ext_err1, u64 *ext_err2); u64 tdh_mng_key_config(struct tdx_td *td); u64 tdh_mng_create(struct tdx_td *td, u16 hkid); u64 tdh_vp_create(struct tdx_td *td, struct tdx_vp *vp); u64 tdh_mng_rd(struct tdx_td *td, u64 field, u64 *data); +u64 tdh_mr_extend(struct tdx_td *td, u64 gpa, u64 *ext_err1, u64 *ext_err2); +u64 tdh_mr_finalize(struct tdx_td *td); u64 tdh_vp_flush(struct tdx_vp *vp); u64 tdh_mng_vpflushdone(struct tdx_td *td); u64 tdh_mng_key_freeid(struct tdx_td *td); @@ -175,8 +185,11 @@ u64 tdh_vp_init(struct tdx_vp *vp, u64 initial_rcx, u32 x2apicid); u64 tdh_vp_rd(struct tdx_vp *vp, u64 field, u64 *data); u64 tdh_vp_wr(struct tdx_vp *vp, u64 field, u64 data, u64 mask); u64 tdh_phymem_page_reclaim(struct page *page, u64 *tdx_pt, u64 *tdx_owner, u64 *tdx_size); +u64 tdh_mem_track(struct tdx_td *tdr); +u64 tdh_mem_page_remove(struct tdx_td *td, u64 gpa, u64 level, u64 *ext_err1, u64 *ext_err2); u64 tdh_phymem_cache_wb(bool resume); u64 tdh_phymem_page_wbinvd_tdr(struct tdx_td *td); +u64 tdh_phymem_page_wbinvd_hkid(u64 hkid, struct page *page); #else static inline void tdx_init(void) { } static inline int tdx_cpu_enable(void) { return -ENODEV; } diff --git a/arch/x86/include/asm/vmx.h b/arch/x86/include/asm/vmx.h index f7fd4369b821..9298fb9d4bb3 100644 --- a/arch/x86/include/asm/vmx.h +++ b/arch/x86/include/asm/vmx.h @@ -256,6 +256,7 @@ enum vmcs_field { TSC_MULTIPLIER_HIGH = 0x00002033, TERTIARY_VM_EXEC_CONTROL = 0x00002034, TERTIARY_VM_EXEC_CONTROL_HIGH = 0x00002035, + SHARED_EPT_POINTER = 0x0000203C, PID_POINTER_TABLE = 0x00002042, PID_POINTER_TABLE_HIGH = 0x00002043, GUEST_PHYSICAL_ADDRESS = 0x00002400, diff --git a/arch/x86/include/uapi/asm/kvm.h b/arch/x86/include/uapi/asm/kvm.h index cd55484e3f0c..89cc7a18ef45 100644 --- a/arch/x86/include/uapi/asm/kvm.h +++ b/arch/x86/include/uapi/asm/kvm.h @@ -932,6 +932,8 @@ enum kvm_tdx_cmd_id { KVM_TDX_CAPABILITIES = 0, KVM_TDX_INIT_VM, KVM_TDX_INIT_VCPU, + KVM_TDX_INIT_MEM_REGION, + KVM_TDX_FINALIZE_VM, KVM_TDX_GET_CPUID, KVM_TDX_CMD_NR_MAX, @@ -987,4 +989,12 @@ struct kvm_tdx_init_vm { struct kvm_cpuid2 cpuid; }; +#define KVM_TDX_MEASURE_MEMORY_REGION _BITULL(0) + +struct kvm_tdx_init_mem_region { + __u64 source_addr; + __u64 gpa; + __u64 nr_pages; +}; + #endif /* _ASM_X86_KVM_H */ diff --git a/arch/x86/kvm/mmu.h b/arch/x86/kvm/mmu.h index 050a0e229a4d..47e64a3c4ce3 100644 --- a/arch/x86/kvm/mmu.h +++ b/arch/x86/kvm/mmu.h @@ -79,6 +79,7 @@ static inline gfn_t kvm_mmu_max_gfn(void) u8 kvm_mmu_get_max_tdp_level(void); void kvm_mmu_set_mmio_spte_mask(u64 mmio_value, u64 mmio_mask, u64 access_mask); +void kvm_mmu_set_mmio_spte_value(struct kvm *kvm, u64 mmio_value); void kvm_mmu_set_me_spte_mask(u64 me_value, u64 me_mask); void kvm_mmu_set_ept_masks(bool has_ad_bits, bool has_exec_only); @@ -253,6 +254,9 @@ extern bool tdp_mmu_enabled; #define tdp_mmu_enabled false #endif +bool kvm_tdp_mmu_gpa_is_mapped(struct kvm_vcpu *vcpu, u64 gpa); +int kvm_tdp_map_page(struct kvm_vcpu *vcpu, gpa_t gpa, u64 error_code, u8 *level); + static inline bool kvm_memslots_have_rmaps(struct kvm *kvm) { return !tdp_mmu_enabled || kvm_shadow_root_allocated(kvm); diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c index a79bc15f31c6..0c6a4568af5b 100644 --- a/arch/x86/kvm/mmu/mmu.c +++ b/arch/x86/kvm/mmu/mmu.c @@ -110,6 +110,7 @@ static bool __ro_after_init tdp_mmu_allowed; #ifdef CONFIG_X86_64 bool __read_mostly tdp_mmu_enabled = true; module_param_named(tdp_mmu, tdp_mmu_enabled, bool, 0444); +EXPORT_SYMBOL_GPL(tdp_mmu_enabled); #endif static int max_huge_page_level __read_mostly; @@ -1304,15 +1305,15 @@ void kvm_arch_mmu_enable_log_dirty_pt_masked(struct kvm *kvm, * enabled but it chooses between clearing the Dirty bit and Writeable * bit based on the context. */ - if (kvm_x86_ops.cpu_dirty_log_size) + if (kvm->arch.cpu_dirty_log_size) kvm_mmu_clear_dirty_pt_masked(kvm, slot, gfn_offset, mask); else kvm_mmu_write_protect_pt_masked(kvm, slot, gfn_offset, mask); } -int kvm_cpu_dirty_log_size(void) +int kvm_cpu_dirty_log_size(struct kvm *kvm) { - return kvm_x86_ops.cpu_dirty_log_size; + return kvm->arch.cpu_dirty_log_size; } bool kvm_mmu_slot_gfn_write_protect(struct kvm *kvm, @@ -4685,8 +4686,7 @@ int kvm_tdp_page_fault(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault) return direct_page_fault(vcpu, fault); } -static int kvm_tdp_map_page(struct kvm_vcpu *vcpu, gpa_t gpa, u64 error_code, - u8 *level) +int kvm_tdp_map_page(struct kvm_vcpu *vcpu, gpa_t gpa, u64 error_code, u8 *level) { int r; @@ -4700,6 +4700,10 @@ static int kvm_tdp_map_page(struct kvm_vcpu *vcpu, gpa_t gpa, u64 error_code, do { if (signal_pending(current)) return -EINTR; + + if (kvm_check_request(KVM_REQ_VM_DEAD, vcpu)) + return -EIO; + cond_resched(); r = kvm_mmu_do_page_fault(vcpu, gpa, error_code, true, NULL, level); } while (r == RET_PF_RETRY); @@ -4724,6 +4728,7 @@ static int kvm_tdp_map_page(struct kvm_vcpu *vcpu, gpa_t gpa, u64 error_code, return -EIO; } } +EXPORT_SYMBOL_GPL(kvm_tdp_map_page); long kvm_arch_vcpu_pre_fault_memory(struct kvm_vcpu *vcpu, struct kvm_pre_fault_memory *range) @@ -5747,6 +5752,7 @@ int kvm_mmu_load(struct kvm_vcpu *vcpu) out: return r; } +EXPORT_SYMBOL_GPL(kvm_mmu_load); void kvm_mmu_unload(struct kvm_vcpu *vcpu) { @@ -7072,6 +7078,7 @@ static void kvm_mmu_zap_memslot(struct kvm *kvm, .start = slot->base_gfn, .end = slot->base_gfn + slot->npages, .may_block = true, + .attr_filter = KVM_FILTER_PRIVATE | KVM_FILTER_SHARED, }; bool flush; diff --git a/arch/x86/kvm/mmu/mmu_internal.h b/arch/x86/kvm/mmu/mmu_internal.h index 75f00598289d..db8f33e4de62 100644 --- a/arch/x86/kvm/mmu/mmu_internal.h +++ b/arch/x86/kvm/mmu/mmu_internal.h @@ -187,7 +187,8 @@ static inline gfn_t kvm_gfn_root_bits(const struct kvm *kvm, const struct kvm_mm return kvm_gfn_direct_bits(kvm); } -static inline bool kvm_mmu_page_ad_need_write_protect(struct kvm_mmu_page *sp) +static inline bool kvm_mmu_page_ad_need_write_protect(struct kvm *kvm, + struct kvm_mmu_page *sp) { /* * When using the EPT page-modification log, the GPAs in the CPU dirty @@ -197,7 +198,7 @@ static inline bool kvm_mmu_page_ad_need_write_protect(struct kvm_mmu_page *sp) * being enabled is mandatory as the bits used to denote WP-only SPTEs * are reserved for PAE paging (32-bit KVM). */ - return kvm_x86_ops.cpu_dirty_log_size && sp->role.guest_mode; + return kvm->arch.cpu_dirty_log_size && sp->role.guest_mode; } static inline gfn_t gfn_round_for_level(gfn_t gfn, int level) diff --git a/arch/x86/kvm/mmu/page_track.c b/arch/x86/kvm/mmu/page_track.c index 561c331fd6ec..1b17b12393a8 100644 --- a/arch/x86/kvm/mmu/page_track.c +++ b/arch/x86/kvm/mmu/page_track.c @@ -172,6 +172,9 @@ static int kvm_enable_external_write_tracking(struct kvm *kvm) struct kvm_memory_slot *slot; int r = 0, i, bkt; + if (kvm->arch.vm_type == KVM_X86_TDX_VM) + return -EOPNOTSUPP; + mutex_lock(&kvm->slots_arch_lock); /* diff --git a/arch/x86/kvm/mmu/spte.c b/arch/x86/kvm/mmu/spte.c index 22551e2f1d00..a609d5b58b69 100644 --- a/arch/x86/kvm/mmu/spte.c +++ b/arch/x86/kvm/mmu/spte.c @@ -96,8 +96,6 @@ u64 make_mmio_spte(struct kvm_vcpu *vcpu, u64 gfn, unsigned int access) u64 spte = generation_mmio_spte_mask(gen); u64 gpa = gfn << PAGE_SHIFT; - WARN_ON_ONCE(!vcpu->kvm->arch.shadow_mmio_value); - access &= shadow_mmio_access_mask; spte |= vcpu->kvm->arch.shadow_mmio_value | access; spte |= gpa | shadow_nonpresent_or_rsvd_mask; @@ -170,7 +168,7 @@ bool make_spte(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp, if (sp->role.ad_disabled) spte |= SPTE_TDP_AD_DISABLED; - else if (kvm_mmu_page_ad_need_write_protect(sp)) + else if (kvm_mmu_page_ad_need_write_protect(vcpu->kvm, sp)) spte |= SPTE_TDP_AD_WRPROT_ONLY; spte |= shadow_present_mask; @@ -433,6 +431,12 @@ void kvm_mmu_set_mmio_spte_mask(u64 mmio_value, u64 mmio_mask, u64 access_mask) } EXPORT_SYMBOL_GPL(kvm_mmu_set_mmio_spte_mask); +void kvm_mmu_set_mmio_spte_value(struct kvm *kvm, u64 mmio_value) +{ + kvm->arch.shadow_mmio_value = mmio_value; +} +EXPORT_SYMBOL_GPL(kvm_mmu_set_mmio_spte_value); + void kvm_mmu_set_me_spte_mask(u64 me_value, u64 me_mask) { /* shadow_me_value must be a subset of shadow_me_mask */ diff --git a/arch/x86/kvm/mmu/tdp_mmu.c b/arch/x86/kvm/mmu/tdp_mmu.c index 046b6ba31197..fd0a7792386b 100644 --- a/arch/x86/kvm/mmu/tdp_mmu.c +++ b/arch/x86/kvm/mmu/tdp_mmu.c @@ -1613,21 +1613,21 @@ void kvm_tdp_mmu_try_split_huge_pages(struct kvm *kvm, } } -static bool tdp_mmu_need_write_protect(struct kvm_mmu_page *sp) +static bool tdp_mmu_need_write_protect(struct kvm *kvm, struct kvm_mmu_page *sp) { /* * All TDP MMU shadow pages share the same role as their root, aside * from level, so it is valid to key off any shadow page to determine if * write protection is needed for an entire tree. */ - return kvm_mmu_page_ad_need_write_protect(sp) || !kvm_ad_enabled; + return kvm_mmu_page_ad_need_write_protect(kvm, sp) || !kvm_ad_enabled; } static void clear_dirty_gfn_range(struct kvm *kvm, struct kvm_mmu_page *root, gfn_t start, gfn_t end) { - const u64 dbit = tdp_mmu_need_write_protect(root) ? PT_WRITABLE_MASK : - shadow_dirty_mask; + const u64 dbit = tdp_mmu_need_write_protect(kvm, root) ? + PT_WRITABLE_MASK : shadow_dirty_mask; struct tdp_iter iter; rcu_read_lock(); @@ -1672,8 +1672,8 @@ void kvm_tdp_mmu_clear_dirty_slot(struct kvm *kvm, static void clear_dirty_pt_masked(struct kvm *kvm, struct kvm_mmu_page *root, gfn_t gfn, unsigned long mask, bool wrprot) { - const u64 dbit = (wrprot || tdp_mmu_need_write_protect(root)) ? PT_WRITABLE_MASK : - shadow_dirty_mask; + const u64 dbit = (wrprot || tdp_mmu_need_write_protect(kvm, root)) ? + PT_WRITABLE_MASK : shadow_dirty_mask; struct tdp_iter iter; lockdep_assert_held_write(&kvm->mmu_lock); @@ -1894,16 +1894,13 @@ bool kvm_tdp_mmu_write_protect_gfn(struct kvm *kvm, * * Must be called between kvm_tdp_mmu_walk_lockless_{begin,end}. */ -int kvm_tdp_mmu_get_walk(struct kvm_vcpu *vcpu, u64 addr, u64 *sptes, - int *root_level) +static int __kvm_tdp_mmu_get_walk(struct kvm_vcpu *vcpu, u64 addr, u64 *sptes, + struct kvm_mmu_page *root) { - struct kvm_mmu_page *root = root_to_sp(vcpu->arch.mmu->root.hpa); struct tdp_iter iter; gfn_t gfn = addr >> PAGE_SHIFT; int leaf = -1; - *root_level = vcpu->arch.mmu->root_role.level; - tdp_mmu_for_each_pte(iter, vcpu->kvm, root, gfn, gfn + 1) { leaf = iter.level; sptes[leaf] = iter.old_spte; @@ -1912,6 +1909,36 @@ int kvm_tdp_mmu_get_walk(struct kvm_vcpu *vcpu, u64 addr, u64 *sptes, return leaf; } +int kvm_tdp_mmu_get_walk(struct kvm_vcpu *vcpu, u64 addr, u64 *sptes, + int *root_level) +{ + struct kvm_mmu_page *root = root_to_sp(vcpu->arch.mmu->root.hpa); + *root_level = vcpu->arch.mmu->root_role.level; + + return __kvm_tdp_mmu_get_walk(vcpu, addr, sptes, root); +} + +bool kvm_tdp_mmu_gpa_is_mapped(struct kvm_vcpu *vcpu, u64 gpa) +{ + struct kvm *kvm = vcpu->kvm; + bool is_direct = kvm_is_addr_direct(kvm, gpa); + hpa_t root = is_direct ? vcpu->arch.mmu->root.hpa : + vcpu->arch.mmu->mirror_root_hpa; + u64 sptes[PT64_ROOT_MAX_LEVEL + 1], spte; + int leaf; + + lockdep_assert_held(&kvm->mmu_lock); + rcu_read_lock(); + leaf = __kvm_tdp_mmu_get_walk(vcpu, gpa, sptes, root_to_sp(root)); + rcu_read_unlock(); + if (leaf < 0) + return false; + + spte = sptes[leaf]; + return is_shadow_present_pte(spte) && is_last_spte(spte, leaf); +} +EXPORT_SYMBOL_GPL(kvm_tdp_mmu_gpa_is_mapped); + /* * Returns the last level spte pointer of the shadow page walk for the given * gpa, and sets *spte to the spte value. This spte may be non-preset. If no diff --git a/arch/x86/kvm/vmx/common.h b/arch/x86/kvm/vmx/common.h new file mode 100644 index 000000000000..7a592467a044 --- /dev/null +++ b/arch/x86/kvm/vmx/common.h @@ -0,0 +1,43 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +#ifndef __KVM_X86_VMX_COMMON_H +#define __KVM_X86_VMX_COMMON_H + +#include <linux/kvm_host.h> + +#include "mmu.h" + +static inline bool vt_is_tdx_private_gpa(struct kvm *kvm, gpa_t gpa) +{ + /* For TDX the direct mask is the shared mask. */ + return !kvm_is_addr_direct(kvm, gpa); +} + +static inline int __vmx_handle_ept_violation(struct kvm_vcpu *vcpu, gpa_t gpa, + unsigned long exit_qualification) +{ + u64 error_code; + + /* Is it a read fault? */ + error_code = (exit_qualification & EPT_VIOLATION_ACC_READ) + ? PFERR_USER_MASK : 0; + /* Is it a write fault? */ + error_code |= (exit_qualification & EPT_VIOLATION_ACC_WRITE) + ? PFERR_WRITE_MASK : 0; + /* Is it a fetch fault? */ + error_code |= (exit_qualification & EPT_VIOLATION_ACC_INSTR) + ? PFERR_FETCH_MASK : 0; + /* ept page table entry is present? */ + error_code |= (exit_qualification & EPT_VIOLATION_RWX_MASK) + ? PFERR_PRESENT_MASK : 0; + + if (error_code & EPT_VIOLATION_GVA_IS_VALID) + error_code |= (exit_qualification & EPT_VIOLATION_GVA_TRANSLATED) ? + PFERR_GUEST_FINAL_MASK : PFERR_GUEST_PAGE_MASK; + + if (vt_is_tdx_private_gpa(vcpu->kvm, gpa)) + error_code |= PFERR_PRIVATE_ACCESS; + + return kvm_mmu_page_fault(vcpu, gpa, error_code, NULL, 0); +} + +#endif /* __KVM_X86_VMX_COMMON_H */ diff --git a/arch/x86/kvm/vmx/main.c b/arch/x86/kvm/vmx/main.c index e7d402b3a90d..ec8223ee9d28 100644 --- a/arch/x86/kvm/vmx/main.c +++ b/arch/x86/kvm/vmx/main.c @@ -3,12 +3,21 @@ #include "x86_ops.h" #include "vmx.h" +#include "mmu.h" #include "nested.h" #include "pmu.h" #include "posted_intr.h" #include "tdx.h" #include "tdx_arch.h" +static void vt_disable_virtualization_cpu(void) +{ + /* Note, TDX *and* VMX need to be disabled if TDX is enabled. */ + if (enable_tdx) + tdx_disable_virtualization_cpu(); + vmx_disable_virtualization_cpu(); +} + static __init int vt_hardware_setup(void) { int ret; @@ -35,9 +44,21 @@ static __init int vt_hardware_setup(void) * is KVM may allocate couple of more bytes than needed for * each VM. */ - if (enable_tdx) + if (enable_tdx) { vt_x86_ops.vm_size = max_t(unsigned int, vt_x86_ops.vm_size, sizeof(struct kvm_tdx)); + /* + * Note, TDX may fail to initialize in a later time in + * vt_init(), in which case it is not necessary to setup + * those callbacks. But making them valid here even + * when TDX fails to init later is fine because those + * callbacks won't be called if the VM isn't TDX guest. + */ + vt_x86_ops.link_external_spt = tdx_sept_link_private_spt; + vt_x86_ops.set_external_spte = tdx_sept_set_private_spte; + vt_x86_ops.free_external_spt = tdx_sept_free_private_spt; + vt_x86_ops.remove_external_spte = tdx_sept_remove_private_spte; + } return 0; } @@ -98,6 +119,75 @@ static void vt_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event) vmx_vcpu_reset(vcpu, init_event); } +static void vt_vcpu_load(struct kvm_vcpu *vcpu, int cpu) +{ + if (is_td_vcpu(vcpu)) { + tdx_vcpu_load(vcpu, cpu); + return; + } + + vmx_vcpu_load(vcpu, cpu); +} + +static void vt_update_cpu_dirty_logging(struct kvm_vcpu *vcpu) +{ + /* + * Basic TDX does not support feature PML. KVM does not enable PML in + * TD's VMCS, nor does it allocate or flush PML buffer for TDX. + */ + if (WARN_ON_ONCE(is_td_vcpu(vcpu))) + return; + + vmx_update_cpu_dirty_logging(vcpu); +} + +static void vt_flush_tlb_all(struct kvm_vcpu *vcpu) +{ + if (is_td_vcpu(vcpu)) { + tdx_flush_tlb_all(vcpu); + return; + } + + vmx_flush_tlb_all(vcpu); +} + +static void vt_flush_tlb_current(struct kvm_vcpu *vcpu) +{ + if (is_td_vcpu(vcpu)) { + tdx_flush_tlb_current(vcpu); + return; + } + + vmx_flush_tlb_current(vcpu); +} + +static void vt_flush_tlb_gva(struct kvm_vcpu *vcpu, gva_t addr) +{ + if (is_td_vcpu(vcpu)) + return; + + vmx_flush_tlb_gva(vcpu, addr); +} + +static void vt_flush_tlb_guest(struct kvm_vcpu *vcpu) +{ + if (is_td_vcpu(vcpu)) + return; + + vmx_flush_tlb_guest(vcpu); +} + +static void vt_load_mmu_pgd(struct kvm_vcpu *vcpu, hpa_t root_hpa, + int pgd_level) +{ + if (is_td_vcpu(vcpu)) { + tdx_load_mmu_pgd(vcpu, root_hpa, pgd_level); + return; + } + + vmx_load_mmu_pgd(vcpu, root_hpa, pgd_level); +} + static int vt_mem_enc_ioctl(struct kvm *kvm, void __user *argp) { if (!is_td(kvm)) @@ -114,6 +204,14 @@ static int vt_vcpu_mem_enc_ioctl(struct kvm_vcpu *vcpu, void __user *argp) return tdx_vcpu_ioctl(vcpu, argp); } +static int vt_gmem_private_max_mapping_level(struct kvm *kvm, kvm_pfn_t pfn) +{ + if (is_td(kvm)) + return tdx_gmem_private_max_mapping_level(kvm, pfn); + + return 0; +} + #define VMX_REQUIRED_APICV_INHIBITS \ (BIT(APICV_INHIBIT_REASON_DISABLED) | \ BIT(APICV_INHIBIT_REASON_ABSENT) | \ @@ -131,7 +229,7 @@ struct kvm_x86_ops vt_x86_ops __initdata = { .hardware_unsetup = vmx_hardware_unsetup, .enable_virtualization_cpu = vmx_enable_virtualization_cpu, - .disable_virtualization_cpu = vmx_disable_virtualization_cpu, + .disable_virtualization_cpu = vt_disable_virtualization_cpu, .emergency_disable_virtualization_cpu = vmx_emergency_disable_virtualization_cpu, .has_emulated_msr = vmx_has_emulated_msr, @@ -148,7 +246,7 @@ struct kvm_x86_ops vt_x86_ops __initdata = { .vcpu_reset = vt_vcpu_reset, .prepare_switch_to_guest = vmx_prepare_switch_to_guest, - .vcpu_load = vmx_vcpu_load, + .vcpu_load = vt_vcpu_load, .vcpu_put = vmx_vcpu_put, .update_exception_bitmap = vmx_update_exception_bitmap, @@ -178,10 +276,10 @@ struct kvm_x86_ops vt_x86_ops __initdata = { .set_rflags = vmx_set_rflags, .get_if_flag = vmx_get_if_flag, - .flush_tlb_all = vmx_flush_tlb_all, - .flush_tlb_current = vmx_flush_tlb_current, - .flush_tlb_gva = vmx_flush_tlb_gva, - .flush_tlb_guest = vmx_flush_tlb_guest, + .flush_tlb_all = vt_flush_tlb_all, + .flush_tlb_current = vt_flush_tlb_current, + .flush_tlb_gva = vt_flush_tlb_gva, + .flush_tlb_guest = vt_flush_tlb_guest, .vcpu_pre_run = vmx_vcpu_pre_run, .vcpu_run = vmx_vcpu_run, @@ -231,13 +329,12 @@ struct kvm_x86_ops vt_x86_ops __initdata = { .write_tsc_offset = vmx_write_tsc_offset, .write_tsc_multiplier = vmx_write_tsc_multiplier, - .load_mmu_pgd = vmx_load_mmu_pgd, + .load_mmu_pgd = vt_load_mmu_pgd, .check_intercept = vmx_check_intercept, .handle_exit_irqoff = vmx_handle_exit_irqoff, - .cpu_dirty_log_size = PML_LOG_NR_ENTRIES, - .update_cpu_dirty_logging = vmx_update_cpu_dirty_logging, + .update_cpu_dirty_logging = vt_update_cpu_dirty_logging, .nested_ops = &vmx_nested_ops, @@ -271,6 +368,8 @@ struct kvm_x86_ops vt_x86_ops __initdata = { .mem_enc_ioctl = vt_mem_enc_ioctl, .vcpu_mem_enc_ioctl = vt_vcpu_mem_enc_ioctl, + + .private_max_mapping_level = vt_gmem_private_max_mapping_level }; struct kvm_x86_init_ops vt_init_ops __initdata = { diff --git a/arch/x86/kvm/vmx/tdx.c b/arch/x86/kvm/vmx/tdx.c index 01166cb8f2e6..277f5decde9c 100644 --- a/arch/x86/kvm/vmx/tdx.c +++ b/arch/x86/kvm/vmx/tdx.c @@ -1,4 +1,5 @@ // SPDX-License-Identifier: GPL-2.0 +#include <linux/cleanup.h> #include <linux/cpu.h> #include <asm/cpufeature.h> #include <linux/misc_cgroup.h> @@ -8,6 +9,9 @@ #include "x86_ops.h" #include "lapic.h" #include "tdx.h" +#include "vmx.h" +#include "mmu/spte.h" +#include "common.h" #pragma GCC poison to_vmx @@ -32,10 +36,26 @@ bool enable_tdx __ro_after_init; module_param_named(tdx, enable_tdx, bool, 0444); +#define TDX_SHARED_BIT_PWL_5 gpa_to_gfn(BIT_ULL(51)) +#define TDX_SHARED_BIT_PWL_4 gpa_to_gfn(BIT_ULL(47)) + static enum cpuhp_state tdx_cpuhp_state; static const struct tdx_sys_info *tdx_sysinfo; +void tdh_vp_rd_failed(struct vcpu_tdx *tdx, char *uclass, u32 field, u64 err) +{ + KVM_BUG_ON(1, tdx->vcpu.kvm); + pr_err("TDH_VP_RD[%s.0x%x] failed 0x%llx\n", uclass, field, err); +} + +void tdh_vp_wr_failed(struct vcpu_tdx *tdx, char *uclass, char *op, u32 field, + u64 val, u64 err) +{ + KVM_BUG_ON(1, tdx->vcpu.kvm); + pr_err("TDH_VP_WR[%s.0x%x]%s0x%llx failed: 0x%llx\n", uclass, field, op, val, err); +} + #define KVM_SUPPORTED_TD_ATTRS (TDX_TD_ATTR_SEPT_VE_DISABLE) static __always_inline struct kvm_tdx *to_kvm_tdx(struct kvm *kvm) @@ -136,6 +156,27 @@ static DEFINE_MUTEX(tdx_lock); static atomic_t nr_configured_hkid; +static bool tdx_operand_busy(u64 err) +{ + return (err & TDX_SEAMCALL_STATUS_MASK) == TDX_OPERAND_BUSY; +} + + +/* + * A per-CPU list of TD vCPUs associated with a given CPU. + * Protected by interrupt mask. Only manipulated by the CPU owning this per-CPU + * list. + * - When a vCPU is loaded onto a CPU, it is removed from the per-CPU list of + * the old CPU during the IPI callback running on the old CPU, and then added + * to the per-CPU list of the new CPU. + * - When a TD is tearing down, all vCPUs are disassociated from their current + * running CPUs and removed from the per-CPU list during the IPI callback + * running on those CPUs. + * - When a CPU is brought down, traverse the per-CPU list to disassociate all + * associated TD vCPUs and remove them from the per-CPU list. + */ +static DEFINE_PER_CPU(struct list_head, associated_tdvcpus); + static inline void tdx_hkid_free(struct kvm_tdx *kvm_tdx) { tdx_guest_keyid_free(kvm_tdx->hkid); @@ -151,6 +192,22 @@ static inline bool is_hkid_assigned(struct kvm_tdx *kvm_tdx) return kvm_tdx->hkid > 0; } +static inline void tdx_disassociate_vp(struct kvm_vcpu *vcpu) +{ + lockdep_assert_irqs_disabled(); + + list_del(&to_tdx(vcpu)->cpu_list); + + /* + * Ensure tdx->cpu_list is updated before setting vcpu->cpu to -1, + * otherwise, a different CPU can see vcpu->cpu = -1 and add the vCPU + * to its list before it's deleted from this CPU's list. + */ + smp_wmb(); + + vcpu->cpu = -1; +} + static void tdx_clear_page(struct page *page) { const void *zero_page = (const void *) page_to_virt(ZERO_PAGE(0)); @@ -217,6 +274,83 @@ static void tdx_reclaim_control_page(struct page *ctrl_page) __free_page(ctrl_page); } +struct tdx_flush_vp_arg { + struct kvm_vcpu *vcpu; + u64 err; +}; + +static void tdx_flush_vp(void *_arg) +{ + struct tdx_flush_vp_arg *arg = _arg; + struct kvm_vcpu *vcpu = arg->vcpu; + u64 err; + + arg->err = 0; + lockdep_assert_irqs_disabled(); + + /* Task migration can race with CPU offlining. */ + if (unlikely(vcpu->cpu != raw_smp_processor_id())) + return; + + /* + * No need to do TDH_VP_FLUSH if the vCPU hasn't been initialized. The + * list tracking still needs to be updated so that it's correct if/when + * the vCPU does get initialized. + */ + if (to_tdx(vcpu)->state != VCPU_TD_STATE_UNINITIALIZED) { + /* + * No need to retry. TDX Resources needed for TDH.VP.FLUSH are: + * TDVPR as exclusive, TDR as shared, and TDCS as shared. This + * vp flush function is called when destructing vCPU/TD or vCPU + * migration. No other thread uses TDVPR in those cases. + */ + err = tdh_vp_flush(&to_tdx(vcpu)->vp); + if (unlikely(err && err != TDX_VCPU_NOT_ASSOCIATED)) { + /* + * This function is called in IPI context. Do not use + * printk to avoid console semaphore. + * The caller prints out the error message, instead. + */ + if (err) + arg->err = err; + } + } + + tdx_disassociate_vp(vcpu); +} + +static void tdx_flush_vp_on_cpu(struct kvm_vcpu *vcpu) +{ + struct tdx_flush_vp_arg arg = { + .vcpu = vcpu, + }; + int cpu = vcpu->cpu; + + if (unlikely(cpu == -1)) + return; + + smp_call_function_single(cpu, tdx_flush_vp, &arg, 1); + if (KVM_BUG_ON(arg.err, vcpu->kvm)) + pr_tdx_error(TDH_VP_FLUSH, arg.err); +} + +void tdx_disable_virtualization_cpu(void) +{ + int cpu = raw_smp_processor_id(); + struct list_head *tdvcpus = &per_cpu(associated_tdvcpus, cpu); + struct tdx_flush_vp_arg arg; + struct vcpu_tdx *tdx, *tmp; + unsigned long flags; + + local_irq_save(flags); + /* Safe variant needed as tdx_disassociate_vp() deletes the entry. */ + list_for_each_entry_safe(tdx, tmp, tdvcpus, cpu_list) { + arg.vcpu = &tdx->vcpu; + tdx_flush_vp(&arg); + } + local_irq_restore(flags); +} + #define TDX_SEAMCALL_RETRIES 10000 static void smp_func_do_phymem_cache_wb(void *unused) @@ -255,22 +389,21 @@ void tdx_mmu_release_hkid(struct kvm *kvm) bool packages_allocated, targets_allocated; struct kvm_tdx *kvm_tdx = to_kvm_tdx(kvm); cpumask_var_t packages, targets; - u64 err; + struct kvm_vcpu *vcpu; + unsigned long j; int i; + u64 err; if (!is_hkid_assigned(kvm_tdx)) return; - /* KeyID has been allocated but guest is not yet configured */ - if (!kvm_tdx->td.tdr_page) { - tdx_hkid_free(kvm_tdx); - return; - } - packages_allocated = zalloc_cpumask_var(&packages, GFP_KERNEL); targets_allocated = zalloc_cpumask_var(&targets, GFP_KERNEL); cpus_read_lock(); + kvm_for_each_vcpu(j, vcpu, kvm) + tdx_flush_vp_on_cpu(vcpu); + /* * TDH.PHYMEM.CACHE.WB tries to acquire the TDX module global lock * and can fail with TDX_OPERAND_BUSY when it fails to get the lock. @@ -284,6 +417,16 @@ void tdx_mmu_release_hkid(struct kvm *kvm) * After the above flushing vps, there should be no more vCPU * associations, as all vCPU fds have been released at this stage. */ + err = tdh_mng_vpflushdone(&kvm_tdx->td); + if (err == TDX_FLUSHVP_NOT_DONE) + goto out; + if (KVM_BUG_ON(err, kvm)) { + pr_tdx_error(TDH_MNG_VPFLUSHDONE, err); + pr_err("tdh_mng_vpflushdone() failed. HKID %d is leaked.\n", + kvm_tdx->hkid); + goto out; + } + for_each_online_cpu(i) { if (packages_allocated && cpumask_test_and_set_cpu(topology_physical_package_id(i), @@ -309,6 +452,7 @@ void tdx_mmu_release_hkid(struct kvm *kvm) tdx_hkid_free(kvm_tdx); } +out: mutex_unlock(&tdx_lock); cpus_read_unlock(); free_cpumask_var(targets); @@ -395,6 +539,19 @@ int tdx_vm_init(struct kvm *kvm) kvm->arch.has_private_mem = true; /* + * Because guest TD is protected, VMM can't parse the instruction in TD. + * Instead, guest uses MMIO hypercall. For unmodified device driver, + * #VE needs to be injected for MMIO and #VE handler in TD converts MMIO + * instruction into MMIO hypercall. + * + * SPTE value for MMIO needs to be setup so that #VE is injected into + * TD instead of triggering EPT MISCONFIG. + * - RWX=0 so that EPT violation is triggered. + * - suppress #VE bit is cleared to inject #VE. + */ + kvm_mmu_set_mmio_spte_value(kvm, 0); + + /* * TDX has its own limit of maximum vCPUs it can support for all * TDX guests in addition to KVM_MAX_VCPUS. TDX module reports * such limit via the MAX_VCPU_PER_TD global metadata. In @@ -449,6 +606,27 @@ int tdx_vcpu_create(struct kvm_vcpu *vcpu) return 0; } +void tdx_vcpu_load(struct kvm_vcpu *vcpu, int cpu) +{ + struct vcpu_tdx *tdx = to_tdx(vcpu); + + if (vcpu->cpu == cpu || !is_hkid_assigned(to_kvm_tdx(vcpu->kvm))) + return; + + tdx_flush_vp_on_cpu(vcpu); + + KVM_BUG_ON(cpu != raw_smp_processor_id(), vcpu->kvm); + local_irq_disable(); + /* + * Pairs with the smp_wmb() in tdx_disassociate_vp() to ensure + * vcpu->cpu is read before tdx->cpu_list. + */ + smp_rmb(); + + list_add(&tdx->cpu_list, &per_cpu(associated_tdvcpus, cpu)); + local_irq_enable(); +} + void tdx_vcpu_free(struct kvm_vcpu *vcpu) { struct kvm_tdx *kvm_tdx = to_kvm_tdx(vcpu->kvm); @@ -482,6 +660,320 @@ void tdx_vcpu_free(struct kvm_vcpu *vcpu) tdx->state = VCPU_TD_STATE_UNINITIALIZED; } + +void tdx_load_mmu_pgd(struct kvm_vcpu *vcpu, hpa_t root_hpa, int pgd_level) +{ + u64 shared_bit = (pgd_level == 5) ? TDX_SHARED_BIT_PWL_5 : + TDX_SHARED_BIT_PWL_4; + + if (KVM_BUG_ON(shared_bit != kvm_gfn_direct_bits(vcpu->kvm), vcpu->kvm)) + return; + + td_vmcs_write64(to_tdx(vcpu), SHARED_EPT_POINTER, root_hpa); +} + +static void tdx_unpin(struct kvm *kvm, struct page *page) +{ + put_page(page); +} + +static int tdx_mem_page_aug(struct kvm *kvm, gfn_t gfn, + enum pg_level level, struct page *page) +{ + int tdx_level = pg_level_to_tdx_sept_level(level); + struct kvm_tdx *kvm_tdx = to_kvm_tdx(kvm); + gpa_t gpa = gfn_to_gpa(gfn); + u64 entry, level_state; + u64 err; + + err = tdh_mem_page_aug(&kvm_tdx->td, gpa, tdx_level, page, &entry, &level_state); + if (unlikely(tdx_operand_busy(err))) { + tdx_unpin(kvm, page); + return -EBUSY; + } + + if (KVM_BUG_ON(err, kvm)) { + pr_tdx_error_2(TDH_MEM_PAGE_AUG, err, entry, level_state); + tdx_unpin(kvm, page); + return -EIO; + } + + return 0; +} + +/* + * KVM_TDX_INIT_MEM_REGION calls kvm_gmem_populate() to map guest pages; the + * callback tdx_gmem_post_populate() then maps pages into private memory. + * through the a seamcall TDH.MEM.PAGE.ADD(). The SEAMCALL also requires the + * private EPT structures for the page to have been built before, which is + * done via kvm_tdp_map_page(). nr_premapped counts the number of pages that + * were added to the EPT structures but not added with TDH.MEM.PAGE.ADD(). + * The counter has to be zero on KVM_TDX_FINALIZE_VM, to ensure that there + * are no half-initialized shared EPT pages. + */ +static int tdx_mem_page_record_premap_cnt(struct kvm *kvm, gfn_t gfn, + enum pg_level level, kvm_pfn_t pfn) +{ + struct kvm_tdx *kvm_tdx = to_kvm_tdx(kvm); + + if (KVM_BUG_ON(kvm->arch.pre_fault_allowed, kvm)) + return -EINVAL; + + /* nr_premapped will be decreased when tdh_mem_page_add() is called. */ + atomic64_inc(&kvm_tdx->nr_premapped); + return 0; +} + +int tdx_sept_set_private_spte(struct kvm *kvm, gfn_t gfn, + enum pg_level level, kvm_pfn_t pfn) +{ + struct kvm_tdx *kvm_tdx = to_kvm_tdx(kvm); + struct page *page = pfn_to_page(pfn); + + /* TODO: handle large pages. */ + if (KVM_BUG_ON(level != PG_LEVEL_4K, kvm)) + return -EINVAL; + + /* + * Because guest_memfd doesn't support page migration with + * a_ops->migrate_folio (yet), no callback is triggered for KVM on page + * migration. Until guest_memfd supports page migration, prevent page + * migration. + * TODO: Once guest_memfd introduces callback on page migration, + * implement it and remove get_page/put_page(). + */ + get_page(page); + + /* + * Read 'pre_fault_allowed' before 'kvm_tdx->state'; see matching + * barrier in tdx_td_finalize(). + */ + smp_rmb(); + if (likely(kvm_tdx->state == TD_STATE_RUNNABLE)) + return tdx_mem_page_aug(kvm, gfn, level, page); + + return tdx_mem_page_record_premap_cnt(kvm, gfn, level, pfn); +} + +static int tdx_sept_drop_private_spte(struct kvm *kvm, gfn_t gfn, + enum pg_level level, struct page *page) +{ + int tdx_level = pg_level_to_tdx_sept_level(level); + struct kvm_tdx *kvm_tdx = to_kvm_tdx(kvm); + gpa_t gpa = gfn_to_gpa(gfn); + u64 err, entry, level_state; + + /* TODO: handle large pages. */ + if (KVM_BUG_ON(level != PG_LEVEL_4K, kvm)) + return -EINVAL; + + if (KVM_BUG_ON(!is_hkid_assigned(kvm_tdx), kvm)) + return -EINVAL; + + do { + /* + * When zapping private page, write lock is held. So no race + * condition with other vcpu sept operation. Race only with + * TDH.VP.ENTER. + */ + err = tdh_mem_page_remove(&kvm_tdx->td, gpa, tdx_level, &entry, + &level_state); + } while (unlikely(tdx_operand_busy(err))); + + if (KVM_BUG_ON(err, kvm)) { + pr_tdx_error_2(TDH_MEM_PAGE_REMOVE, err, entry, level_state); + return -EIO; + } + + err = tdh_phymem_page_wbinvd_hkid((u16)kvm_tdx->hkid, page); + + if (KVM_BUG_ON(err, kvm)) { + pr_tdx_error(TDH_PHYMEM_PAGE_WBINVD, err); + return -EIO; + } + tdx_clear_page(page); + tdx_unpin(kvm, page); + return 0; +} + +int tdx_sept_link_private_spt(struct kvm *kvm, gfn_t gfn, + enum pg_level level, void *private_spt) +{ + int tdx_level = pg_level_to_tdx_sept_level(level); + gpa_t gpa = gfn_to_gpa(gfn); + struct page *page = virt_to_page(private_spt); + u64 err, entry, level_state; + + err = tdh_mem_sept_add(&to_kvm_tdx(kvm)->td, gpa, tdx_level, page, &entry, + &level_state); + if (unlikely(tdx_operand_busy(err))) + return -EBUSY; + + if (KVM_BUG_ON(err, kvm)) { + pr_tdx_error_2(TDH_MEM_SEPT_ADD, err, entry, level_state); + return -EIO; + } + + return 0; +} + +/* + * Check if the error returned from a SEPT zap SEAMCALL is due to that a page is + * mapped by KVM_TDX_INIT_MEM_REGION without tdh_mem_page_add() being called + * successfully. + * + * Since tdh_mem_sept_add() must have been invoked successfully before a + * non-leaf entry present in the mirrored page table, the SEPT ZAP related + * SEAMCALLs should not encounter err TDX_EPT_WALK_FAILED. They should instead + * find TDX_EPT_ENTRY_STATE_INCORRECT due to an empty leaf entry found in the + * SEPT. + * + * Further check if the returned entry from SEPT walking is with RWX permissions + * to filter out anything unexpected. + * + * Note: @level is pg_level, not the tdx_level. The tdx_level extracted from + * level_state returned from a SEAMCALL error is the same as that passed into + * the SEAMCALL. + */ +static int tdx_is_sept_zap_err_due_to_premap(struct kvm_tdx *kvm_tdx, u64 err, + u64 entry, int level) +{ + if (!err || kvm_tdx->state == TD_STATE_RUNNABLE) + return false; + + if (err != (TDX_EPT_ENTRY_STATE_INCORRECT | TDX_OPERAND_ID_RCX)) + return false; + + if ((is_last_spte(entry, level) && (entry & VMX_EPT_RWX_MASK))) + return false; + + return true; +} + +static int tdx_sept_zap_private_spte(struct kvm *kvm, gfn_t gfn, + enum pg_level level, struct page *page) +{ + int tdx_level = pg_level_to_tdx_sept_level(level); + struct kvm_tdx *kvm_tdx = to_kvm_tdx(kvm); + gpa_t gpa = gfn_to_gpa(gfn) & KVM_HPAGE_MASK(level); + u64 err, entry, level_state; + + /* For now large page isn't supported yet. */ + WARN_ON_ONCE(level != PG_LEVEL_4K); + + err = tdh_mem_range_block(&kvm_tdx->td, gpa, tdx_level, &entry, &level_state); + if (unlikely(tdx_operand_busy(err))) + return -EBUSY; + + if (tdx_is_sept_zap_err_due_to_premap(kvm_tdx, err, entry, level) && + !KVM_BUG_ON(!atomic64_read(&kvm_tdx->nr_premapped), kvm)) { + atomic64_dec(&kvm_tdx->nr_premapped); + tdx_unpin(kvm, page); + return 0; + } + + if (KVM_BUG_ON(err, kvm)) { + pr_tdx_error_2(TDH_MEM_RANGE_BLOCK, err, entry, level_state); + return -EIO; + } + return 1; +} + +/* + * Ensure shared and private EPTs to be flushed on all vCPUs. + * tdh_mem_track() is the only caller that increases TD epoch. An increase in + * the TD epoch (e.g., to value "N + 1") is successful only if no vCPUs are + * running in guest mode with the value "N - 1". + * + * A successful execution of tdh_mem_track() ensures that vCPUs can only run in + * guest mode with TD epoch value "N" if no TD exit occurs after the TD epoch + * being increased to "N + 1". + * + * Kicking off all vCPUs after that further results in no vCPUs can run in guest + * mode with TD epoch value "N", which unblocks the next tdh_mem_track() (e.g. + * to increase TD epoch to "N + 2"). + * + * TDX module will flush EPT on the next TD enter and make vCPUs to run in + * guest mode with TD epoch value "N + 1". + * + * kvm_make_all_cpus_request() guarantees all vCPUs are out of guest mode by + * waiting empty IPI handler ack_kick(). + * + * No action is required to the vCPUs being kicked off since the kicking off + * occurs certainly after TD epoch increment and before the next + * tdh_mem_track(). + */ +static void tdx_track(struct kvm *kvm) +{ + struct kvm_tdx *kvm_tdx = to_kvm_tdx(kvm); + u64 err; + + /* If TD isn't finalized, it's before any vcpu running. */ + if (unlikely(kvm_tdx->state != TD_STATE_RUNNABLE)) + return; + + lockdep_assert_held_write(&kvm->mmu_lock); + + do { + err = tdh_mem_track(&kvm_tdx->td); + } while (unlikely(tdx_operand_busy(err))); + + if (KVM_BUG_ON(err, kvm)) + pr_tdx_error(TDH_MEM_TRACK, err); + + kvm_make_all_cpus_request(kvm, KVM_REQ_OUTSIDE_GUEST_MODE); +} + +int tdx_sept_free_private_spt(struct kvm *kvm, gfn_t gfn, + enum pg_level level, void *private_spt) +{ + struct kvm_tdx *kvm_tdx = to_kvm_tdx(kvm); + + /* + * free_external_spt() is only called after hkid is freed when TD is + * tearing down. + * KVM doesn't (yet) zap page table pages in mirror page table while + * TD is active, though guest pages mapped in mirror page table could be + * zapped during TD is active, e.g. for shared <-> private conversion + * and slot move/deletion. + */ + if (KVM_BUG_ON(is_hkid_assigned(kvm_tdx), kvm)) + return -EINVAL; + + /* + * The HKID assigned to this TD was already freed and cache was + * already flushed. We don't have to flush again. + */ + return tdx_reclaim_page(virt_to_page(private_spt)); +} + +int tdx_sept_remove_private_spte(struct kvm *kvm, gfn_t gfn, + enum pg_level level, kvm_pfn_t pfn) +{ + struct page *page = pfn_to_page(pfn); + int ret; + + /* + * HKID is released after all private pages have been removed, and set + * before any might be populated. Warn if zapping is attempted when + * there can't be anything populated in the private EPT. + */ + if (KVM_BUG_ON(!is_hkid_assigned(to_kvm_tdx(kvm)), kvm)) + return -EINVAL; + + ret = tdx_sept_zap_private_spte(kvm, gfn, level, page); + if (ret <= 0) + return ret; + + /* + * TDX requires TLB tracking before dropping private page. Do + * it here, although it is also done later. + */ + tdx_track(kvm); + + return tdx_sept_drop_private_spte(kvm, gfn, level, page); +} + static int tdx_get_capabilities(struct kvm_tdx_cmd *cmd) { const struct tdx_sys_info_td_conf *td_conf = &tdx_sysinfo->td_conf; @@ -1023,6 +1515,11 @@ static int tdx_td_init(struct kvm *kvm, struct kvm_tdx_cmd *cmd) kvm_tdx->attributes = td_params->attributes; kvm_tdx->xfam = td_params->xfam; + if (td_params->config_flags & TDX_CONFIG_FLAGS_MAX_GPAW) + kvm->arch.gfn_direct_bits = TDX_SHARED_BIT_PWL_5; + else + kvm->arch.gfn_direct_bits = TDX_SHARED_BIT_PWL_4; + kvm_tdx->state = TD_STATE_INITIALIZED; out: /* kfree() accepts NULL. */ @@ -1032,6 +1529,71 @@ out: return ret; } +void tdx_flush_tlb_current(struct kvm_vcpu *vcpu) +{ + /* + * flush_tlb_current() is invoked when the first time for the vcpu to + * run or when root of shared EPT is invalidated. + * KVM only needs to flush shared EPT because the TDX module handles TLB + * invalidation for private EPT in tdh_vp_enter(); + * + * A single context invalidation for shared EPT can be performed here. + * However, this single context invalidation requires the private EPTP + * rather than the shared EPTP to flush shared EPT, as shared EPT uses + * private EPTP as its ASID for TLB invalidation. + * + * To avoid reading back private EPTP, perform a global invalidation for + * shared EPT instead to keep this function simple. + */ + ept_sync_global(); +} + +void tdx_flush_tlb_all(struct kvm_vcpu *vcpu) +{ + /* + * TDX has called tdx_track() in tdx_sept_remove_private_spte() to + * ensure that private EPT will be flushed on the next TD enter. No need + * to call tdx_track() here again even when this callback is a result of + * zapping private EPT. + * + * Due to the lack of the context to determine which EPT has been + * affected by zapping, invoke invept() directly here for both shared + * EPT and private EPT for simplicity, though it's not necessary for + * private EPT. + */ + ept_sync_global(); +} + +static int tdx_td_finalize(struct kvm *kvm, struct kvm_tdx_cmd *cmd) +{ + struct kvm_tdx *kvm_tdx = to_kvm_tdx(kvm); + + guard(mutex)(&kvm->slots_lock); + + if (!is_hkid_assigned(kvm_tdx) || kvm_tdx->state == TD_STATE_RUNNABLE) + return -EINVAL; + /* + * Pages are pending for KVM_TDX_INIT_MEM_REGION to issue + * TDH.MEM.PAGE.ADD(). + */ + if (atomic64_read(&kvm_tdx->nr_premapped)) + return -EINVAL; + + cmd->hw_error = tdh_mr_finalize(&kvm_tdx->td); + if (tdx_operand_busy(cmd->hw_error)) + return -EBUSY; + if (KVM_BUG_ON(cmd->hw_error, kvm)) { + pr_tdx_error(TDH_MR_FINALIZE, cmd->hw_error); + return -EIO; + } + + kvm_tdx->state = TD_STATE_RUNNABLE; + /* TD_STATE_RUNNABLE must be set before 'pre_fault_allowed' */ + smp_wmb(); + kvm->arch.pre_fault_allowed = true; + return 0; +} + int tdx_vm_ioctl(struct kvm *kvm, void __user *argp) { struct kvm_tdx_cmd tdx_cmd; @@ -1056,6 +1618,9 @@ int tdx_vm_ioctl(struct kvm *kvm, void __user *argp) case KVM_TDX_INIT_VM: r = tdx_td_init(kvm, &tdx_cmd); break; + case KVM_TDX_FINALIZE_VM: + r = tdx_td_finalize(kvm, &tdx_cmd); + break; default: r = -EINVAL; goto out; @@ -1269,6 +1834,148 @@ static int tdx_vcpu_init(struct kvm_vcpu *vcpu, struct kvm_tdx_cmd *cmd) return 0; } +struct tdx_gmem_post_populate_arg { + struct kvm_vcpu *vcpu; + __u32 flags; +}; + +static int tdx_gmem_post_populate(struct kvm *kvm, gfn_t gfn, kvm_pfn_t pfn, + void __user *src, int order, void *_arg) +{ + u64 error_code = PFERR_GUEST_FINAL_MASK | PFERR_PRIVATE_ACCESS; + struct kvm_tdx *kvm_tdx = to_kvm_tdx(kvm); + struct tdx_gmem_post_populate_arg *arg = _arg; + struct kvm_vcpu *vcpu = arg->vcpu; + gpa_t gpa = gfn_to_gpa(gfn); + u8 level = PG_LEVEL_4K; + struct page *src_page; + int ret, i; + u64 err, entry, level_state; + + /* + * Get the source page if it has been faulted in. Return failure if the + * source page has been swapped out or unmapped in primary memory. + */ + ret = get_user_pages_fast((unsigned long)src, 1, 0, &src_page); + if (ret < 0) + return ret; + if (ret != 1) + return -ENOMEM; + + ret = kvm_tdp_map_page(vcpu, gpa, error_code, &level); + if (ret < 0) + goto out; + + /* + * The private mem cannot be zapped after kvm_tdp_map_page() + * because all paths are covered by slots_lock and the + * filemap invalidate lock. Check that they are indeed enough. + */ + if (IS_ENABLED(CONFIG_KVM_PROVE_MMU)) { + scoped_guard(read_lock, &kvm->mmu_lock) { + if (KVM_BUG_ON(!kvm_tdp_mmu_gpa_is_mapped(vcpu, gpa), kvm)) { + ret = -EIO; + goto out; + } + } + } + + ret = 0; + err = tdh_mem_page_add(&kvm_tdx->td, gpa, pfn_to_page(pfn), + src_page, &entry, &level_state); + if (err) { + ret = unlikely(tdx_operand_busy(err)) ? -EBUSY : -EIO; + goto out; + } + + if (!KVM_BUG_ON(!atomic64_read(&kvm_tdx->nr_premapped), kvm)) + atomic64_dec(&kvm_tdx->nr_premapped); + + if (arg->flags & KVM_TDX_MEASURE_MEMORY_REGION) { + for (i = 0; i < PAGE_SIZE; i += TDX_EXTENDMR_CHUNKSIZE) { + err = tdh_mr_extend(&kvm_tdx->td, gpa + i, &entry, + &level_state); + if (err) { + ret = -EIO; + break; + } + } + } + +out: + put_page(src_page); + return ret; +} + +static int tdx_vcpu_init_mem_region(struct kvm_vcpu *vcpu, struct kvm_tdx_cmd *cmd) +{ + struct vcpu_tdx *tdx = to_tdx(vcpu); + struct kvm *kvm = vcpu->kvm; + struct kvm_tdx *kvm_tdx = to_kvm_tdx(kvm); + struct kvm_tdx_init_mem_region region; + struct tdx_gmem_post_populate_arg arg; + long gmem_ret; + int ret; + + if (tdx->state != VCPU_TD_STATE_INITIALIZED) + return -EINVAL; + + guard(mutex)(&kvm->slots_lock); + + /* Once TD is finalized, the initial guest memory is fixed. */ + if (kvm_tdx->state == TD_STATE_RUNNABLE) + return -EINVAL; + + if (cmd->flags & ~KVM_TDX_MEASURE_MEMORY_REGION) + return -EINVAL; + + if (copy_from_user(®ion, u64_to_user_ptr(cmd->data), sizeof(region))) + return -EFAULT; + + if (!PAGE_ALIGNED(region.source_addr) || !PAGE_ALIGNED(region.gpa) || + !region.nr_pages || + region.gpa + (region.nr_pages << PAGE_SHIFT) <= region.gpa || + !vt_is_tdx_private_gpa(kvm, region.gpa) || + !vt_is_tdx_private_gpa(kvm, region.gpa + (region.nr_pages << PAGE_SHIFT) - 1)) + return -EINVAL; + + kvm_mmu_reload(vcpu); + ret = 0; + while (region.nr_pages) { + if (signal_pending(current)) { + ret = -EINTR; + break; + } + + arg = (struct tdx_gmem_post_populate_arg) { + .vcpu = vcpu, + .flags = cmd->flags, + }; + gmem_ret = kvm_gmem_populate(kvm, gpa_to_gfn(region.gpa), + u64_to_user_ptr(region.source_addr), + 1, tdx_gmem_post_populate, &arg); + if (gmem_ret < 0) { + ret = gmem_ret; + break; + } + + if (gmem_ret != 1) { + ret = -EIO; + break; + } + + region.source_addr += PAGE_SIZE; + region.gpa += PAGE_SIZE; + region.nr_pages--; + + cond_resched(); + } + + if (copy_to_user(u64_to_user_ptr(cmd->data), ®ion, sizeof(region))) + ret = -EFAULT; + return ret; +} + int tdx_vcpu_ioctl(struct kvm_vcpu *vcpu, void __user *argp) { struct kvm_tdx *kvm_tdx = to_kvm_tdx(vcpu->kvm); @@ -1288,6 +1995,9 @@ int tdx_vcpu_ioctl(struct kvm_vcpu *vcpu, void __user *argp) case KVM_TDX_INIT_VCPU: ret = tdx_vcpu_init(vcpu, &cmd); break; + case KVM_TDX_INIT_MEM_REGION: + ret = tdx_vcpu_init_mem_region(vcpu, &cmd); + break; case KVM_TDX_GET_CPUID: ret = tdx_vcpu_get_cpuid(vcpu, &cmd); break; @@ -1299,6 +2009,11 @@ int tdx_vcpu_ioctl(struct kvm_vcpu *vcpu, void __user *argp) return ret; } +int tdx_gmem_private_max_mapping_level(struct kvm *kvm, kvm_pfn_t pfn) +{ + return PG_LEVEL_4K; +} + static int tdx_online_cpu(unsigned int cpu) { unsigned long flags; @@ -1496,11 +2211,25 @@ void tdx_cleanup(void) int __init tdx_bringup(void) { - int r; + int r, i; + + /* tdx_disable_virtualization_cpu() uses associated_tdvcpus. */ + for_each_possible_cpu(i) + INIT_LIST_HEAD(&per_cpu(associated_tdvcpus, i)); if (!enable_tdx) return 0; + if (!enable_ept) { + pr_err("EPT is required for TDX\n"); + goto success_disable_tdx; + } + + if (!tdp_mmu_enabled || !enable_mmio_caching || !enable_ept_ad_bits) { + pr_err("TDP MMU and MMIO caching and EPT A/D bit is required for TDX\n"); + goto success_disable_tdx; + } + if (!cpu_feature_enabled(X86_FEATURE_MOVDIR64B)) { pr_err("tdx: MOVDIR64B is required for TDX\n"); goto success_disable_tdx; diff --git a/arch/x86/kvm/vmx/tdx.h b/arch/x86/kvm/vmx/tdx.h index 0559126c8f9d..5f34b79d16dd 100644 --- a/arch/x86/kvm/vmx/tdx.h +++ b/arch/x86/kvm/vmx/tdx.h @@ -32,6 +32,9 @@ struct kvm_tdx { u64 tsc_multiplier; struct tdx_td td; + + /* For KVM_TDX_INIT_MEM_REGION. */ + atomic64_t nr_premapped; }; /* TDX module vCPU states */ @@ -45,9 +48,15 @@ struct vcpu_tdx { struct tdx_vp vp; + struct list_head cpu_list; + enum vcpu_tdx_state state; }; +void tdh_vp_rd_failed(struct vcpu_tdx *tdx, char *uclass, u32 field, u64 err); +void tdh_vp_wr_failed(struct vcpu_tdx *tdx, char *uclass, char *op, u32 field, + u64 val, u64 err); + static inline bool is_td(struct kvm *kvm) { return kvm->arch.vm_type == KVM_X86_TDX_VM; @@ -69,6 +78,90 @@ static __always_inline u64 td_tdcs_exec_read64(struct kvm_tdx *kvm_tdx, u32 fiel } return data; } + +static __always_inline void tdvps_vmcs_check(u32 field, u8 bits) +{ +#define VMCS_ENC_ACCESS_TYPE_MASK 0x1UL +#define VMCS_ENC_ACCESS_TYPE_FULL 0x0UL +#define VMCS_ENC_ACCESS_TYPE_HIGH 0x1UL +#define VMCS_ENC_ACCESS_TYPE(field) ((field) & VMCS_ENC_ACCESS_TYPE_MASK) + + /* TDX is 64bit only. HIGH field isn't supported. */ + BUILD_BUG_ON_MSG(__builtin_constant_p(field) && + VMCS_ENC_ACCESS_TYPE(field) == VMCS_ENC_ACCESS_TYPE_HIGH, + "Read/Write to TD VMCS *_HIGH fields not supported"); + + BUILD_BUG_ON(bits != 16 && bits != 32 && bits != 64); + +#define VMCS_ENC_WIDTH_MASK GENMASK(14, 13) +#define VMCS_ENC_WIDTH_16BIT (0UL << 13) +#define VMCS_ENC_WIDTH_64BIT (1UL << 13) +#define VMCS_ENC_WIDTH_32BIT (2UL << 13) +#define VMCS_ENC_WIDTH_NATURAL (3UL << 13) +#define VMCS_ENC_WIDTH(field) ((field) & VMCS_ENC_WIDTH_MASK) + + /* TDX is 64bit only. i.e. natural width = 64bit. */ + BUILD_BUG_ON_MSG(bits != 64 && __builtin_constant_p(field) && + (VMCS_ENC_WIDTH(field) == VMCS_ENC_WIDTH_64BIT || + VMCS_ENC_WIDTH(field) == VMCS_ENC_WIDTH_NATURAL), + "Invalid TD VMCS access for 64-bit field"); + BUILD_BUG_ON_MSG(bits != 32 && __builtin_constant_p(field) && + VMCS_ENC_WIDTH(field) == VMCS_ENC_WIDTH_32BIT, + "Invalid TD VMCS access for 32-bit field"); + BUILD_BUG_ON_MSG(bits != 16 && __builtin_constant_p(field) && + VMCS_ENC_WIDTH(field) == VMCS_ENC_WIDTH_16BIT, + "Invalid TD VMCS access for 16-bit field"); +} + +#define TDX_BUILD_TDVPS_ACCESSORS(bits, uclass, lclass) \ +static __always_inline u##bits td_##lclass##_read##bits(struct vcpu_tdx *tdx, \ + u32 field) \ +{ \ + u64 err, data; \ + \ + tdvps_##lclass##_check(field, bits); \ + err = tdh_vp_rd(&tdx->vp, TDVPS_##uclass(field), &data); \ + if (unlikely(err)) { \ + tdh_vp_rd_failed(tdx, #uclass, field, err); \ + return 0; \ + } \ + return (u##bits)data; \ +} \ +static __always_inline void td_##lclass##_write##bits(struct vcpu_tdx *tdx, \ + u32 field, u##bits val) \ +{ \ + u64 err; \ + \ + tdvps_##lclass##_check(field, bits); \ + err = tdh_vp_wr(&tdx->vp, TDVPS_##uclass(field), val, \ + GENMASK_ULL(bits - 1, 0)); \ + if (unlikely(err)) \ + tdh_vp_wr_failed(tdx, #uclass, " = ", field, (u64)val, err); \ +} \ +static __always_inline void td_##lclass##_setbit##bits(struct vcpu_tdx *tdx, \ + u32 field, u64 bit) \ +{ \ + u64 err; \ + \ + tdvps_##lclass##_check(field, bits); \ + err = tdh_vp_wr(&tdx->vp, TDVPS_##uclass(field), bit, bit); \ + if (unlikely(err)) \ + tdh_vp_wr_failed(tdx, #uclass, " |= ", field, bit, err); \ +} \ +static __always_inline void td_##lclass##_clearbit##bits(struct vcpu_tdx *tdx, \ + u32 field, u64 bit) \ +{ \ + u64 err; \ + \ + tdvps_##lclass##_check(field, bits); \ + err = tdh_vp_wr(&tdx->vp, TDVPS_##uclass(field), 0, bit); \ + if (unlikely(err)) \ + tdh_vp_wr_failed(tdx, #uclass, " &= ~", field, bit, err);\ +} + +TDX_BUILD_TDVPS_ACCESSORS(16, VMCS, vmcs); +TDX_BUILD_TDVPS_ACCESSORS(32, VMCS, vmcs); +TDX_BUILD_TDVPS_ACCESSORS(64, VMCS, vmcs); #else static inline int tdx_bringup(void) { return 0; } static inline void tdx_cleanup(void) {} diff --git a/arch/x86/kvm/vmx/tdx_arch.h b/arch/x86/kvm/vmx/tdx_arch.h index 55a740f90e67..58bda8a5ce9a 100644 --- a/arch/x86/kvm/vmx/tdx_arch.h +++ b/arch/x86/kvm/vmx/tdx_arch.h @@ -121,6 +121,29 @@ struct td_params { #define TDX_MIN_TSC_FREQUENCY_KHZ (100 * 1000) #define TDX_MAX_TSC_FREQUENCY_KHZ (10 * 1000 * 1000) +/* Additional Secure EPT entry information */ +#define TDX_SEPT_LEVEL_MASK GENMASK_ULL(2, 0) +#define TDX_SEPT_STATE_MASK GENMASK_ULL(15, 8) +#define TDX_SEPT_STATE_SHIFT 8 + +enum tdx_sept_entry_state { + TDX_SEPT_FREE = 0, + TDX_SEPT_BLOCKED = 1, + TDX_SEPT_PENDING = 2, + TDX_SEPT_PENDING_BLOCKED = 3, + TDX_SEPT_PRESENT = 4, +}; + +static inline u8 tdx_get_sept_level(u64 sept_entry_info) +{ + return sept_entry_info & TDX_SEPT_LEVEL_MASK; +} + +static inline u8 tdx_get_sept_state(u64 sept_entry_info) +{ + return (sept_entry_info & TDX_SEPT_STATE_MASK) >> TDX_SEPT_STATE_SHIFT; +} + #define MD_FIELD_ID_FEATURES0_TOPOLOGY_ENUM BIT_ULL(20) /* diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c index 03d9e5069791..39d55d638899 100644 --- a/arch/x86/kvm/vmx/vmx.c +++ b/arch/x86/kvm/vmx/vmx.c @@ -53,6 +53,7 @@ #include <trace/events/ipi.h> #include "capabilities.h" +#include "common.h" #include "cpuid.h" #include "hyperv.h" #include "kvm_onhyperv.h" @@ -5787,11 +5788,8 @@ static int handle_task_switch(struct kvm_vcpu *vcpu) static int handle_ept_violation(struct kvm_vcpu *vcpu) { - unsigned long exit_qualification; + unsigned long exit_qualification = vmx_get_exit_qual(vcpu); gpa_t gpa; - u64 error_code; - - exit_qualification = vmx_get_exit_qual(vcpu); /* * EPT violation happened while executing iret from NMI, @@ -5807,23 +5805,6 @@ static int handle_ept_violation(struct kvm_vcpu *vcpu) gpa = vmcs_read64(GUEST_PHYSICAL_ADDRESS); trace_kvm_page_fault(vcpu, gpa, exit_qualification); - /* Is it a read fault? */ - error_code = (exit_qualification & EPT_VIOLATION_ACC_READ) - ? PFERR_USER_MASK : 0; - /* Is it a write fault? */ - error_code |= (exit_qualification & EPT_VIOLATION_ACC_WRITE) - ? PFERR_WRITE_MASK : 0; - /* Is it a fetch fault? */ - error_code |= (exit_qualification & EPT_VIOLATION_ACC_INSTR) - ? PFERR_FETCH_MASK : 0; - /* ept page table entry is present? */ - error_code |= (exit_qualification & EPT_VIOLATION_RWX_MASK) - ? PFERR_PRESENT_MASK : 0; - - if (error_code & EPT_VIOLATION_GVA_IS_VALID) - error_code |= (exit_qualification & EPT_VIOLATION_GVA_TRANSLATED) ? - PFERR_GUEST_FINAL_MASK : PFERR_GUEST_PAGE_MASK; - /* * Check that the GPA doesn't exceed physical memory limits, as that is * a guest page fault. We have to emulate the instruction here, because @@ -5835,7 +5816,7 @@ static int handle_ept_violation(struct kvm_vcpu *vcpu) if (unlikely(allow_smaller_maxphyaddr && !kvm_vcpu_is_legal_gpa(vcpu, gpa))) return kvm_emulate_instruction(vcpu, 0); - return kvm_mmu_page_fault(vcpu, gpa, error_code, NULL, 0); + return __vmx_handle_ept_violation(vcpu, gpa, exit_qualification); } static int handle_ept_misconfig(struct kvm_vcpu *vcpu) @@ -7668,6 +7649,9 @@ int vmx_vm_init(struct kvm *kvm) break; } } + + if (enable_pml) + kvm->arch.cpu_dirty_log_size = PML_LOG_NR_ENTRIES; return 0; } @@ -8521,9 +8505,6 @@ __init int vmx_hardware_setup(void) if (!enable_ept || !enable_ept_ad_bits || !cpu_has_vmx_pml()) enable_pml = 0; - if (!enable_pml) - vt_x86_ops.cpu_dirty_log_size = 0; - if (!cpu_has_vmx_preemption_timer()) enable_preemption_timer = false; diff --git a/arch/x86/kvm/vmx/x86_ops.h b/arch/x86/kvm/vmx/x86_ops.h index 89bb7785bd09..f47d739051cf 100644 --- a/arch/x86/kvm/vmx/x86_ops.h +++ b/arch/x86/kvm/vmx/x86_ops.h @@ -122,6 +122,7 @@ void vmx_cancel_hv_timer(struct kvm_vcpu *vcpu); void vmx_setup_mce(struct kvm_vcpu *vcpu); #ifdef CONFIG_KVM_INTEL_TDX +void tdx_disable_virtualization_cpu(void); int tdx_vm_init(struct kvm *kvm); void tdx_mmu_release_hkid(struct kvm *kvm); void tdx_vm_destroy(struct kvm *kvm); @@ -129,9 +130,25 @@ int tdx_vm_ioctl(struct kvm *kvm, void __user *argp); int tdx_vcpu_create(struct kvm_vcpu *vcpu); void tdx_vcpu_free(struct kvm_vcpu *vcpu); +void tdx_vcpu_load(struct kvm_vcpu *vcpu, int cpu); int tdx_vcpu_ioctl(struct kvm_vcpu *vcpu, void __user *argp); + +int tdx_sept_link_private_spt(struct kvm *kvm, gfn_t gfn, + enum pg_level level, void *private_spt); +int tdx_sept_free_private_spt(struct kvm *kvm, gfn_t gfn, + enum pg_level level, void *private_spt); +int tdx_sept_set_private_spte(struct kvm *kvm, gfn_t gfn, + enum pg_level level, kvm_pfn_t pfn); +int tdx_sept_remove_private_spte(struct kvm *kvm, gfn_t gfn, + enum pg_level level, kvm_pfn_t pfn); + +void tdx_flush_tlb_current(struct kvm_vcpu *vcpu); +void tdx_flush_tlb_all(struct kvm_vcpu *vcpu); +void tdx_load_mmu_pgd(struct kvm_vcpu *vcpu, hpa_t root_hpa, int root_level); +int tdx_gmem_private_max_mapping_level(struct kvm *kvm, kvm_pfn_t pfn); #else +static inline void tdx_disable_virtualization_cpu(void) {} static inline int tdx_vm_init(struct kvm *kvm) { return -EOPNOTSUPP; } static inline void tdx_mmu_release_hkid(struct kvm *kvm) {} static inline void tdx_vm_destroy(struct kvm *kvm) {} @@ -139,8 +156,42 @@ static inline int tdx_vm_ioctl(struct kvm *kvm, void __user *argp) { return -EOP static inline int tdx_vcpu_create(struct kvm_vcpu *vcpu) { return -EOPNOTSUPP; } static inline void tdx_vcpu_free(struct kvm_vcpu *vcpu) {} +static inline void tdx_vcpu_load(struct kvm_vcpu *vcpu, int cpu) {} static inline int tdx_vcpu_ioctl(struct kvm_vcpu *vcpu, void __user *argp) { return -EOPNOTSUPP; } + +static inline int tdx_sept_link_private_spt(struct kvm *kvm, gfn_t gfn, + enum pg_level level, + void *private_spt) +{ + return -EOPNOTSUPP; +} + +static inline int tdx_sept_free_private_spt(struct kvm *kvm, gfn_t gfn, + enum pg_level level, + void *private_spt) +{ + return -EOPNOTSUPP; +} + +static inline int tdx_sept_set_private_spte(struct kvm *kvm, gfn_t gfn, + enum pg_level level, + kvm_pfn_t pfn) +{ + return -EOPNOTSUPP; +} + +static inline int tdx_sept_remove_private_spte(struct kvm *kvm, gfn_t gfn, + enum pg_level level, + kvm_pfn_t pfn) +{ + return -EOPNOTSUPP; +} + +static inline void tdx_flush_tlb_current(struct kvm_vcpu *vcpu) {} +static inline void tdx_flush_tlb_all(struct kvm_vcpu *vcpu) {} +static inline void tdx_load_mmu_pgd(struct kvm_vcpu *vcpu, hpa_t root_hpa, int root_level) {} +static inline int tdx_gmem_private_max_mapping_level(struct kvm *kvm, kvm_pfn_t pfn) { return 0; } #endif #endif /* __KVM_X86_VMX_X86_OPS_H */ diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 9f92170226e5..8f3292ccdca4 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -6480,7 +6480,7 @@ void kvm_arch_sync_dirty_log(struct kvm *kvm, struct kvm_memory_slot *memslot) struct kvm_vcpu *vcpu; unsigned long i; - if (!kvm_x86_ops.cpu_dirty_log_size) + if (!kvm->arch.cpu_dirty_log_size) return; kvm_for_each_vcpu(i, vcpu, kvm) @@ -13078,7 +13078,7 @@ static void kvm_mmu_update_cpu_dirty_logging(struct kvm *kvm, bool enable) { int nr_slots; - if (!kvm_x86_ops.cpu_dirty_log_size) + if (!kvm->arch.cpu_dirty_log_size) return; nr_slots = atomic_read(&kvm->nr_memslots_dirty_logging); @@ -13150,7 +13150,7 @@ static void kvm_mmu_slot_apply_flags(struct kvm *kvm, if (READ_ONCE(eager_page_split)) kvm_mmu_slot_try_split_huge_pages(kvm, new, PG_LEVEL_4K); - if (kvm_x86_ops.cpu_dirty_log_size) { + if (kvm->arch.cpu_dirty_log_size) { kvm_mmu_slot_leaf_clear_dirty(kvm, new); kvm_mmu_slot_remove_write_access(kvm, new, PG_LEVEL_2M); } else { diff --git a/arch/x86/virt/vmx/tdx/tdx.c b/arch/x86/virt/vmx/tdx/tdx.c index 3a272e9ff2ca..8eaaf31a4187 100644 --- a/arch/x86/virt/vmx/tdx/tdx.c +++ b/arch/x86/virt/vmx/tdx/tdx.c @@ -1529,6 +1529,45 @@ u64 tdh_mng_addcx(struct tdx_td *td, struct page *tdcs_page) } EXPORT_SYMBOL_GPL(tdh_mng_addcx); +u64 tdh_mem_page_add(struct tdx_td *td, u64 gpa, struct page *page, struct page *source, u64 *ext_err1, u64 *ext_err2) +{ + struct tdx_module_args args = { + .rcx = gpa, + .rdx = tdx_tdr_pa(td), + .r8 = page_to_phys(page), + .r9 = page_to_phys(source), + }; + u64 ret; + + tdx_clflush_page(page); + ret = seamcall_ret(TDH_MEM_PAGE_ADD, &args); + + *ext_err1 = args.rcx; + *ext_err2 = args.rdx; + + return ret; +} +EXPORT_SYMBOL_GPL(tdh_mem_page_add); + +u64 tdh_mem_sept_add(struct tdx_td *td, u64 gpa, int level, struct page *page, u64 *ext_err1, u64 *ext_err2) +{ + struct tdx_module_args args = { + .rcx = gpa | level, + .rdx = tdx_tdr_pa(td), + .r8 = page_to_phys(page), + }; + u64 ret; + + tdx_clflush_page(page); + ret = seamcall_ret(TDH_MEM_SEPT_ADD, &args); + + *ext_err1 = args.rcx; + *ext_err2 = args.rdx; + + return ret; +} +EXPORT_SYMBOL_GPL(tdh_mem_sept_add); + u64 tdh_vp_addcx(struct tdx_vp *vp, struct page *tdcx_page) { struct tdx_module_args args = { @@ -1541,6 +1580,42 @@ u64 tdh_vp_addcx(struct tdx_vp *vp, struct page *tdcx_page) } EXPORT_SYMBOL_GPL(tdh_vp_addcx); +u64 tdh_mem_page_aug(struct tdx_td *td, u64 gpa, int level, struct page *page, u64 *ext_err1, u64 *ext_err2) +{ + struct tdx_module_args args = { + .rcx = gpa | level, + .rdx = tdx_tdr_pa(td), + .r8 = page_to_phys(page), + }; + u64 ret; + + tdx_clflush_page(page); + ret = seamcall_ret(TDH_MEM_PAGE_AUG, &args); + + *ext_err1 = args.rcx; + *ext_err2 = args.rdx; + + return ret; +} +EXPORT_SYMBOL_GPL(tdh_mem_page_aug); + +u64 tdh_mem_range_block(struct tdx_td *td, u64 gpa, int level, u64 *ext_err1, u64 *ext_err2) +{ + struct tdx_module_args args = { + .rcx = gpa | level, + .rdx = tdx_tdr_pa(td), + }; + u64 ret; + + ret = seamcall_ret(TDH_MEM_RANGE_BLOCK, &args); + + *ext_err1 = args.rcx; + *ext_err2 = args.rdx; + + return ret; +} +EXPORT_SYMBOL_GPL(tdh_mem_range_block); + u64 tdh_mng_key_config(struct tdx_td *td) { struct tdx_module_args args = { @@ -1592,6 +1667,33 @@ u64 tdh_mng_rd(struct tdx_td *td, u64 field, u64 *data) } EXPORT_SYMBOL_GPL(tdh_mng_rd); +u64 tdh_mr_extend(struct tdx_td *td, u64 gpa, u64 *ext_err1, u64 *ext_err2) +{ + struct tdx_module_args args = { + .rcx = gpa, + .rdx = tdx_tdr_pa(td), + }; + u64 ret; + + ret = seamcall_ret(TDH_MR_EXTEND, &args); + + *ext_err1 = args.rcx; + *ext_err2 = args.rdx; + + return ret; +} +EXPORT_SYMBOL_GPL(tdh_mr_extend); + +u64 tdh_mr_finalize(struct tdx_td *td) +{ + struct tdx_module_args args = { + .rcx = tdx_tdr_pa(td), + }; + + return seamcall(TDH_MR_FINALIZE, &args); +} +EXPORT_SYMBOL_GPL(tdh_mr_finalize); + u64 tdh_vp_flush(struct tdx_vp *vp) { struct tdx_module_args args = { @@ -1703,6 +1805,33 @@ u64 tdh_phymem_page_reclaim(struct page *page, u64 *tdx_pt, u64 *tdx_owner, u64 } EXPORT_SYMBOL_GPL(tdh_phymem_page_reclaim); +u64 tdh_mem_track(struct tdx_td *td) +{ + struct tdx_module_args args = { + .rcx = tdx_tdr_pa(td), + }; + + return seamcall(TDH_MEM_TRACK, &args); +} +EXPORT_SYMBOL_GPL(tdh_mem_track); + +u64 tdh_mem_page_remove(struct tdx_td *td, u64 gpa, u64 level, u64 *ext_err1, u64 *ext_err2) +{ + struct tdx_module_args args = { + .rcx = gpa | level, + .rdx = tdx_tdr_pa(td), + }; + u64 ret; + + ret = seamcall_ret(TDH_MEM_PAGE_REMOVE, &args); + + *ext_err1 = args.rcx; + *ext_err2 = args.rdx; + + return ret; +} +EXPORT_SYMBOL_GPL(tdh_mem_page_remove); + u64 tdh_phymem_cache_wb(bool resume) { struct tdx_module_args args = { @@ -1722,3 +1851,13 @@ u64 tdh_phymem_page_wbinvd_tdr(struct tdx_td *td) return seamcall(TDH_PHYMEM_PAGE_WBINVD, &args); } EXPORT_SYMBOL_GPL(tdh_phymem_page_wbinvd_tdr); + +u64 tdh_phymem_page_wbinvd_hkid(u64 hkid, struct page *page) +{ + struct tdx_module_args args = {}; + + args.rcx = mk_keyed_paddr(hkid, page); + + return seamcall(TDH_PHYMEM_PAGE_WBINVD, &args); +} +EXPORT_SYMBOL_GPL(tdh_phymem_page_wbinvd_hkid); diff --git a/arch/x86/virt/vmx/tdx/tdx.h b/arch/x86/virt/vmx/tdx/tdx.h index da384387d4eb..ed7152f81a6d 100644 --- a/arch/x86/virt/vmx/tdx/tdx.h +++ b/arch/x86/virt/vmx/tdx/tdx.h @@ -15,10 +15,16 @@ * TDX module SEAMCALL leaf functions */ #define TDH_MNG_ADDCX 1 +#define TDH_MEM_PAGE_ADD 2 +#define TDH_MEM_SEPT_ADD 3 #define TDH_VP_ADDCX 4 +#define TDH_MEM_PAGE_AUG 6 +#define TDH_MEM_RANGE_BLOCK 7 #define TDH_MNG_KEY_CONFIG 8 #define TDH_MNG_CREATE 9 #define TDH_MNG_RD 11 +#define TDH_MR_EXTEND 16 +#define TDH_MR_FINALIZE 17 #define TDH_VP_FLUSH 18 #define TDH_MNG_VPFLUSHDONE 19 #define TDH_VP_CREATE 10 @@ -28,11 +34,13 @@ #define TDH_PHYMEM_PAGE_RDMD 24 #define TDH_VP_RD 26 #define TDH_PHYMEM_PAGE_RECLAIM 28 +#define TDH_MEM_PAGE_REMOVE 29 #define TDH_SYS_KEY_CONFIG 31 #define TDH_SYS_INIT 33 #define TDH_SYS_RD 34 #define TDH_SYS_LP_INIT 35 #define TDH_SYS_TDMR_INIT 36 +#define TDH_MEM_TRACK 38 #define TDH_PHYMEM_CACHE_WB 40 #define TDH_PHYMEM_PAGE_WBINVD 41 #define TDH_VP_WR 43 diff --git a/include/linux/kvm_dirty_ring.h b/include/linux/kvm_dirty_ring.h index 4862c98d80d3..da4d9b5f58f1 100644 --- a/include/linux/kvm_dirty_ring.h +++ b/include/linux/kvm_dirty_ring.h @@ -32,7 +32,7 @@ struct kvm_dirty_ring { * If CONFIG_HAVE_HVM_DIRTY_RING not defined, kvm_dirty_ring.o should * not be included as well, so define these nop functions for the arch. */ -static inline u32 kvm_dirty_ring_get_rsvd_entries(void) +static inline u32 kvm_dirty_ring_get_rsvd_entries(struct kvm *kvm) { return 0; } @@ -42,7 +42,7 @@ static inline bool kvm_use_dirty_bitmap(struct kvm *kvm) return true; } -static inline int kvm_dirty_ring_alloc(struct kvm_dirty_ring *ring, +static inline int kvm_dirty_ring_alloc(struct kvm *kvm, struct kvm_dirty_ring *ring, int index, u32 size) { return 0; @@ -71,11 +71,12 @@ static inline void kvm_dirty_ring_free(struct kvm_dirty_ring *ring) #else /* CONFIG_HAVE_KVM_DIRTY_RING */ -int kvm_cpu_dirty_log_size(void); +int kvm_cpu_dirty_log_size(struct kvm *kvm); bool kvm_use_dirty_bitmap(struct kvm *kvm); bool kvm_arch_allow_write_without_running_vcpu(struct kvm *kvm); -u32 kvm_dirty_ring_get_rsvd_entries(void); -int kvm_dirty_ring_alloc(struct kvm_dirty_ring *ring, int index, u32 size); +u32 kvm_dirty_ring_get_rsvd_entries(struct kvm *kvm); +int kvm_dirty_ring_alloc(struct kvm *kvm, struct kvm_dirty_ring *ring, + int index, u32 size); /* * called with kvm->slots_lock held, returns the number of diff --git a/virt/kvm/dirty_ring.c b/virt/kvm/dirty_ring.c index 7bc74969a819..d14ffc7513ee 100644 --- a/virt/kvm/dirty_ring.c +++ b/virt/kvm/dirty_ring.c @@ -11,14 +11,14 @@ #include <trace/events/kvm.h> #include "kvm_mm.h" -int __weak kvm_cpu_dirty_log_size(void) +int __weak kvm_cpu_dirty_log_size(struct kvm *kvm) { return 0; } -u32 kvm_dirty_ring_get_rsvd_entries(void) +u32 kvm_dirty_ring_get_rsvd_entries(struct kvm *kvm) { - return KVM_DIRTY_RING_RSVD_ENTRIES + kvm_cpu_dirty_log_size(); + return KVM_DIRTY_RING_RSVD_ENTRIES + kvm_cpu_dirty_log_size(kvm); } bool kvm_use_dirty_bitmap(struct kvm *kvm) @@ -74,14 +74,15 @@ static void kvm_reset_dirty_gfn(struct kvm *kvm, u32 slot, u64 offset, u64 mask) KVM_MMU_UNLOCK(kvm); } -int kvm_dirty_ring_alloc(struct kvm_dirty_ring *ring, int index, u32 size) +int kvm_dirty_ring_alloc(struct kvm *kvm, struct kvm_dirty_ring *ring, + int index, u32 size) { ring->dirty_gfns = vzalloc(size); if (!ring->dirty_gfns) return -ENOMEM; ring->size = size / sizeof(struct kvm_dirty_gfn); - ring->soft_limit = ring->size - kvm_dirty_ring_get_rsvd_entries(); + ring->soft_limit = ring->size - kvm_dirty_ring_get_rsvd_entries(kvm); ring->dirty_index = 0; ring->reset_index = 0; ring->index = index; diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c index 622b5a99078a..549537da3062 100644 --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c @@ -4108,7 +4108,7 @@ static int kvm_vm_ioctl_create_vcpu(struct kvm *kvm, unsigned long id) goto vcpu_free_run_page; if (kvm->dirty_ring_size) { - r = kvm_dirty_ring_alloc(&vcpu->dirty_ring, + r = kvm_dirty_ring_alloc(kvm, &vcpu->dirty_ring, id, kvm->dirty_ring_size); if (r) goto arch_vcpu_destroy; @@ -4847,7 +4847,7 @@ static int kvm_vm_ioctl_enable_dirty_log_ring(struct kvm *kvm, u32 size) return -EINVAL; /* Should be bigger to keep the reserved entries, or a page */ - if (size < kvm_dirty_ring_get_rsvd_entries() * + if (size < kvm_dirty_ring_get_rsvd_entries(kvm) * sizeof(struct kvm_dirty_gfn) || size < PAGE_SIZE) return -EINVAL; |