diff options
Diffstat (limited to 'arch/x86/kvm/mmu')
-rw-r--r-- | arch/x86/kvm/mmu/mmu.c | 4288 | ||||
-rw-r--r-- | arch/x86/kvm/mmu/mmu_internal.h | 278 | ||||
-rw-r--r-- | arch/x86/kvm/mmu/mmutrace.h | 3 | ||||
-rw-r--r-- | arch/x86/kvm/mmu/page_track.c | 324 | ||||
-rw-r--r-- | arch/x86/kvm/mmu/page_track.h | 58 | ||||
-rw-r--r-- | arch/x86/kvm/mmu/paging.h | 14 | ||||
-rw-r--r-- | arch/x86/kvm/mmu/paging_tmpl.h | 519 | ||||
-rw-r--r-- | arch/x86/kvm/mmu/spte.c | 295 | ||||
-rw-r--r-- | arch/x86/kvm/mmu/spte.h | 263 | ||||
-rw-r--r-- | arch/x86/kvm/mmu/tdp_iter.c | 44 | ||||
-rw-r--r-- | arch/x86/kvm/mmu/tdp_iter.h | 68 | ||||
-rw-r--r-- | arch/x86/kvm/mmu/tdp_mmu.c | 1604 | ||||
-rw-r--r-- | arch/x86/kvm/mmu/tdp_mmu.h | 93 |
13 files changed, 4817 insertions, 3034 deletions
diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c index 17252f39bd7c..4e06e2e89a8f 100644 --- a/arch/x86/kvm/mmu/mmu.c +++ b/arch/x86/kvm/mmu/mmu.c @@ -14,6 +14,7 @@ * Yaniv Kamay <yaniv@qumranet.com> * Avi Kivity <avi@qumranet.com> */ +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt #include "irq.h" #include "ioapic.h" @@ -22,7 +23,9 @@ #include "tdp_mmu.h" #include "x86.h" #include "kvm_cache_regs.h" +#include "smm.h" #include "kvm_emulate.h" +#include "page_track.h" #include "cpuid.h" #include "spte.h" @@ -42,20 +45,21 @@ #include <linux/uaccess.h> #include <linux/hash.h> #include <linux/kern_levels.h> +#include <linux/kstrtox.h> #include <linux/kthread.h> +#include <linux/wordpart.h> #include <asm/page.h> #include <asm/memtype.h> #include <asm/cmpxchg.h> #include <asm/io.h> #include <asm/set_memory.h> +#include <asm/spec-ctrl.h> #include <asm/vmx.h> -#include <asm/kvm_page_track.h> -#include "trace.h" -#include "paging.h" +#include "trace.h" -extern bool itlb_multihit_kvm_mitigation; +static bool nx_hugepage_mitigation_hard_disabled; int __read_mostly nx_huge_pages = -1; static uint __read_mostly nx_huge_pages_recovery_period_ms; @@ -66,12 +70,13 @@ static uint __read_mostly nx_huge_pages_recovery_ratio = 0; static uint __read_mostly nx_huge_pages_recovery_ratio = 60; #endif +static int get_nx_huge_pages(char *buffer, const struct kernel_param *kp); static int set_nx_huge_pages(const char *val, const struct kernel_param *kp); static int set_nx_huge_pages_recovery_param(const char *val, const struct kernel_param *kp); static const struct kernel_param_ops nx_huge_pages_ops = { .set = set_nx_huge_pages, - .get = param_get_bool, + .get = get_nx_huge_pages, }; static const struct kernel_param_ops nx_huge_pages_recovery_param_ops = { @@ -100,54 +105,51 @@ module_param_named(flush_on_reuse, force_flush_and_sync_on_reuse, bool, 0644); */ bool tdp_enabled = false; +static bool __ro_after_init tdp_mmu_allowed; + +#ifdef CONFIG_X86_64 +bool __read_mostly tdp_mmu_enabled = true; +module_param_named(tdp_mmu, tdp_mmu_enabled, bool, 0444); +EXPORT_SYMBOL_GPL(tdp_mmu_enabled); +#endif + static int max_huge_page_level __read_mostly; static int tdp_root_level __read_mostly; static int max_tdp_level __read_mostly; -#ifdef MMU_DEBUG -bool dbg = 0; -module_param(dbg, bool, 0644); -#endif - #define PTE_PREFETCH_NUM 8 -#define PT32_LEVEL_BITS 10 - -#define PT32_LEVEL_SHIFT(level) \ - (PAGE_SHIFT + (level - 1) * PT32_LEVEL_BITS) - -#define PT32_LVL_OFFSET_MASK(level) \ - (PT32_BASE_ADDR_MASK & ((1ULL << (PAGE_SHIFT + (((level) - 1) \ - * PT32_LEVEL_BITS))) - 1)) - -#define PT32_INDEX(address, level)\ - (((address) >> PT32_LEVEL_SHIFT(level)) & ((1 << PT32_LEVEL_BITS) - 1)) - - -#define PT32_BASE_ADDR_MASK PAGE_MASK -#define PT32_DIR_BASE_ADDR_MASK \ - (PAGE_MASK & ~((1ULL << (PAGE_SHIFT + PT32_LEVEL_BITS)) - 1)) -#define PT32_LVL_ADDR_MASK(level) \ - (PAGE_MASK & ~((1ULL << (PAGE_SHIFT + (((level) - 1) \ - * PT32_LEVEL_BITS))) - 1)) - #include <trace/events/kvm.h> /* make pte_list_desc fit well in cache lines */ #define PTE_LIST_EXT 14 /* - * Slight optimization of cacheline layout, by putting `more' and `spte_count' - * at the start; then accessing it will only use one single cacheline for - * either full (entries==PTE_LIST_EXT) case or entries<=6. + * struct pte_list_desc is the core data structure used to implement a custom + * list for tracking a set of related SPTEs, e.g. all the SPTEs that map a + * given GFN when used in the context of rmaps. Using a custom list allows KVM + * to optimize for the common case where many GFNs will have at most a handful + * of SPTEs pointing at them, i.e. allows packing multiple SPTEs into a small + * memory footprint, which in turn improves runtime performance by exploiting + * cache locality. + * + * A list is comprised of one or more pte_list_desc objects (descriptors). + * Each individual descriptor stores up to PTE_LIST_EXT SPTEs. If a descriptor + * is full and a new SPTEs needs to be added, a new descriptor is allocated and + * becomes the head of the list. This means that by definitions, all tail + * descriptors are full. + * + * Note, the meta data fields are deliberately placed at the start of the + * structure to optimize the cacheline layout; accessing the descriptor will + * touch only a single cacheline so long as @spte_count<=6 (or if only the + * descriptors metadata is accessed). */ struct pte_list_desc { struct pte_list_desc *more; - /* - * Stores number of entries stored in the pte_list_desc. No need to be - * u64 but just for easier alignment. When PTE_LIST_EXT, means full. - */ - u64 spte_count; + /* The number of PTEs stored in _this_ descriptor. */ + u32 spte_count; + /* The number of PTEs stored in all tails of this descriptor. */ + u32 tail_count; u64 *sptes[PTE_LIST_EXT]; }; @@ -178,7 +180,6 @@ struct kvm_shadow_walk_iterator { static struct kmem_cache *pte_list_desc_cache; struct kmem_cache *mmu_page_header_cache; -static struct percpu_counter kvm_total_used_mmu_pages; static void mmu_spte_set(u64 *sptep, u64 spte); @@ -254,32 +255,38 @@ static struct kvm_mmu_role_regs vcpu_to_role_regs(struct kvm_vcpu *vcpu) return regs; } -static inline bool kvm_available_flush_tlb_with_range(void) +static unsigned long get_guest_cr3(struct kvm_vcpu *vcpu) { - return kvm_x86_ops.tlb_remote_flush_with_range; + return kvm_read_cr3(vcpu); } -static void kvm_flush_remote_tlbs_with_range(struct kvm *kvm, - struct kvm_tlb_range *range) +static inline unsigned long kvm_mmu_get_guest_pgd(struct kvm_vcpu *vcpu, + struct kvm_mmu *mmu) { - int ret = -ENOTSUPP; - - if (range && kvm_x86_ops.tlb_remote_flush_with_range) - ret = static_call(kvm_x86_tlb_remote_flush_with_range)(kvm, range); + if (IS_ENABLED(CONFIG_MITIGATION_RETPOLINE) && mmu->get_guest_pgd == get_guest_cr3) + return kvm_read_cr3(vcpu); - if (ret) - kvm_flush_remote_tlbs(kvm); + return mmu->get_guest_pgd(vcpu); } -void kvm_flush_remote_tlbs_with_address(struct kvm *kvm, - u64 start_gfn, u64 pages) +static inline bool kvm_available_flush_remote_tlbs_range(void) { - struct kvm_tlb_range range; +#if IS_ENABLED(CONFIG_HYPERV) + return kvm_x86_ops.flush_remote_tlbs_range; +#else + return false; +#endif +} - range.start_gfn = start_gfn; - range.pages = pages; +static gfn_t kvm_mmu_page_get_gfn(struct kvm_mmu_page *sp, int index); + +/* Flush the range of guest memory mapped by the given SPTE. */ +static void kvm_flush_remote_tlbs_sptep(struct kvm *kvm, u64 *sptep) +{ + struct kvm_mmu_page *sp = sptep_to_sp(sptep); + gfn_t gfn = kvm_mmu_page_get_gfn(sp, spte_index(sptep)); - kvm_flush_remote_tlbs_with_range(kvm, &range); + kvm_flush_remote_tlbs_gfn(kvm, gfn, sp->role.level); } static void mark_mmio_spte(struct kvm_vcpu *vcpu, u64 *sptep, u64 gfn, @@ -326,26 +333,22 @@ static int is_cpuid_PSE36(void) return 1; } -static gfn_t pse36_gfn_delta(u32 gpte) -{ - int shift = 32 - PT32_DIR_PSE36_SHIFT - PAGE_SHIFT; - - return (gpte & PT32_DIR_PSE36_MASK) << shift; -} - #ifdef CONFIG_X86_64 static void __set_spte(u64 *sptep, u64 spte) { + KVM_MMU_WARN_ON(is_ept_ve_possible(spte)); WRITE_ONCE(*sptep, spte); } static void __update_clear_spte_fast(u64 *sptep, u64 spte) { + KVM_MMU_WARN_ON(is_ept_ve_possible(spte)); WRITE_ONCE(*sptep, spte); } static u64 __update_clear_spte_slow(u64 *sptep, u64 spte) { + KVM_MMU_WARN_ON(is_ept_ve_possible(spte)); return xchg(sptep, spte); } @@ -432,8 +435,8 @@ static u64 __update_clear_spte_slow(u64 *sptep, u64 spte) * The idea using the light way get the spte on x86_32 guest is from * gup_get_pte (mm/gup.c). * - * An spte tlb flush may be pending, because kvm_set_pte_rmapp - * coalesces them and we are running out of the MMU lock. Therefore + * An spte tlb flush may be pending, because they are coalesced and + * we are running out of the MMU lock. Therefore * we need to protect against in-progress updates of the spte. * * Reading the spte while an update is in progress may get the old value @@ -478,78 +481,36 @@ retry: */ static void mmu_spte_set(u64 *sptep, u64 new_spte) { - WARN_ON(is_shadow_present_pte(*sptep)); + WARN_ON_ONCE(is_shadow_present_pte(*sptep)); __set_spte(sptep, new_spte); } -/* - * Update the SPTE (excluding the PFN), but do not track changes in its - * accessed/dirty status. +/* Rules for using mmu_spte_update: + * Update the state bits, it means the mapped pfn is not changed. + * + * Returns true if the TLB needs to be flushed */ -static u64 mmu_spte_update_no_track(u64 *sptep, u64 new_spte) +static bool mmu_spte_update(u64 *sptep, u64 new_spte) { u64 old_spte = *sptep; - WARN_ON(!is_shadow_present_pte(new_spte)); + WARN_ON_ONCE(!is_shadow_present_pte(new_spte)); check_spte_writable_invariants(new_spte); if (!is_shadow_present_pte(old_spte)) { mmu_spte_set(sptep, new_spte); - return old_spte; + return false; } - if (!spte_has_volatile_bits(old_spte)) + if (!spte_needs_atomic_update(old_spte)) __update_clear_spte_fast(sptep, new_spte); else old_spte = __update_clear_spte_slow(sptep, new_spte); - WARN_ON(spte_to_pfn(old_spte) != spte_to_pfn(new_spte)); - - return old_spte; -} - -/* Rules for using mmu_spte_update: - * Update the state bits, it means the mapped pfn is not changed. - * - * Whenever an MMU-writable SPTE is overwritten with a read-only SPTE, remote - * TLBs must be flushed. Otherwise rmap_write_protect will find a read-only - * spte, even though the writable spte might be cached on a CPU's TLB. - * - * Returns true if the TLB needs to be flushed - */ -static bool mmu_spte_update(u64 *sptep, u64 new_spte) -{ - bool flush = false; - u64 old_spte = mmu_spte_update_no_track(sptep, new_spte); - - if (!is_shadow_present_pte(old_spte)) - return false; + WARN_ON_ONCE(!is_shadow_present_pte(old_spte) || + spte_to_pfn(old_spte) != spte_to_pfn(new_spte)); - /* - * For the spte updated out of mmu-lock is safe, since - * we always atomically update it, see the comments in - * spte_has_volatile_bits(). - */ - if (is_mmu_writable_spte(old_spte) && - !is_writable_pte(new_spte)) - flush = true; - - /* - * Flush TLB when accessed/dirty states are changed in the page tables, - * to guarantee consistency between TLB and page tables. - */ - - if (is_accessed_spte(old_spte) && !is_accessed_spte(new_spte)) { - flush = true; - kvm_set_pfn_accessed(spte_to_pfn(old_spte)); - } - - if (is_dirty_spte(old_spte) && !is_dirty_spte(new_spte)) { - flush = true; - kvm_set_pfn_dirty(spte_to_pfn(old_spte)); - } - - return flush; + return leaf_spte_change_needs_tlb_flush(old_spte, new_spte); } /* @@ -558,38 +519,21 @@ static bool mmu_spte_update(u64 *sptep, u64 new_spte) * state bits, it is used to clear the last level sptep. * Returns the old PTE. */ -static int mmu_spte_clear_track_bits(struct kvm *kvm, u64 *sptep) +static u64 mmu_spte_clear_track_bits(struct kvm *kvm, u64 *sptep) { - kvm_pfn_t pfn; u64 old_spte = *sptep; int level = sptep_to_sp(sptep)->role.level; if (!is_shadow_present_pte(old_spte) || - !spte_has_volatile_bits(old_spte)) - __update_clear_spte_fast(sptep, 0ull); + !spte_needs_atomic_update(old_spte)) + __update_clear_spte_fast(sptep, SHADOW_NONPRESENT_VALUE); else - old_spte = __update_clear_spte_slow(sptep, 0ull); + old_spte = __update_clear_spte_slow(sptep, SHADOW_NONPRESENT_VALUE); if (!is_shadow_present_pte(old_spte)) return old_spte; kvm_update_page_stats(kvm, level, -1); - - pfn = spte_to_pfn(old_spte); - - /* - * KVM does not hold the refcount of the page used by - * kvm mmu, before reclaiming the page, we should - * unmap it from mmu first. - */ - WARN_ON(!kvm_is_reserved_pfn(pfn) && !page_count(pfn_to_page(pfn))); - - if (is_accessed_spte(old_spte)) - kvm_set_pfn_accessed(pfn); - - if (is_dirty_spte(old_spte)) - kvm_set_pfn_dirty(pfn); - return old_spte; } @@ -600,7 +544,7 @@ static int mmu_spte_clear_track_bits(struct kvm *kvm, u64 *sptep) */ static void mmu_spte_clear_no_track(u64 *sptep) { - __update_clear_spte_fast(sptep, 0ull); + __update_clear_spte_fast(sptep, SHADOW_NONPRESENT_VALUE); } static u64 mmu_spte_get_lockless(u64 *sptep) @@ -608,35 +552,14 @@ static u64 mmu_spte_get_lockless(u64 *sptep) return __get_spte_lockless(sptep); } -/* Returns the Accessed status of the PTE and resets it at the same time. */ -static bool mmu_spte_age(u64 *sptep) +static inline bool is_tdp_mmu_active(struct kvm_vcpu *vcpu) { - u64 spte = mmu_spte_get_lockless(sptep); - - if (!is_accessed_spte(spte)) - return false; - - if (spte_ad_enabled(spte)) { - clear_bit((ffs(shadow_accessed_mask) - 1), - (unsigned long *)sptep); - } else { - /* - * Capture the dirty status of the page, so that it doesn't get - * lost when the SPTE is marked for access tracking. - */ - if (is_writable_pte(spte)) - kvm_set_pfn_dirty(spte_to_pfn(spte)); - - spte = mark_spte_for_access_track(spte); - mmu_spte_update_no_track(sptep, spte); - } - - return true; + return tdp_mmu_enabled && vcpu->arch.mmu->root_role.direct; } static void walk_shadow_page_lockless_begin(struct kvm_vcpu *vcpu) { - if (is_tdp_mmu(vcpu->arch.mmu)) { + if (is_tdp_mmu_active(vcpu)) { kvm_tdp_mmu_walk_lockless_begin(); } else { /* @@ -655,7 +578,7 @@ static void walk_shadow_page_lockless_begin(struct kvm_vcpu *vcpu) static void walk_shadow_page_lockless_end(struct kvm_vcpu *vcpu) { - if (is_tdp_mmu(vcpu->arch.mmu)) { + if (is_tdp_mmu_active(vcpu)) { kvm_tdp_mmu_walk_lockless_end(); } else { /* @@ -677,12 +600,18 @@ static int mmu_topup_memory_caches(struct kvm_vcpu *vcpu, bool maybe_indirect) 1 + PT64_ROOT_MAX_LEVEL + PTE_PREFETCH_NUM); if (r) return r; + if (kvm_has_mirrored_tdp(vcpu->kvm)) { + r = kvm_mmu_topup_memory_cache(&vcpu->arch.mmu_external_spt_cache, + PT64_ROOT_MAX_LEVEL); + if (r) + return r; + } r = kvm_mmu_topup_memory_cache(&vcpu->arch.mmu_shadow_page_cache, PT64_ROOT_MAX_LEVEL); if (r) return r; if (maybe_indirect) { - r = kvm_mmu_topup_memory_cache(&vcpu->arch.mmu_gfn_array_cache, + r = kvm_mmu_topup_memory_cache(&vcpu->arch.mmu_shadowed_info_cache, PT64_ROOT_MAX_LEVEL); if (r) return r; @@ -695,48 +624,80 @@ static void mmu_free_memory_caches(struct kvm_vcpu *vcpu) { kvm_mmu_free_memory_cache(&vcpu->arch.mmu_pte_list_desc_cache); kvm_mmu_free_memory_cache(&vcpu->arch.mmu_shadow_page_cache); - kvm_mmu_free_memory_cache(&vcpu->arch.mmu_gfn_array_cache); + kvm_mmu_free_memory_cache(&vcpu->arch.mmu_shadowed_info_cache); + kvm_mmu_free_memory_cache(&vcpu->arch.mmu_external_spt_cache); kvm_mmu_free_memory_cache(&vcpu->arch.mmu_page_header_cache); } -static struct pte_list_desc *mmu_alloc_pte_list_desc(struct kvm_vcpu *vcpu) -{ - return kvm_mmu_memory_cache_alloc(&vcpu->arch.mmu_pte_list_desc_cache); -} - static void mmu_free_pte_list_desc(struct pte_list_desc *pte_list_desc) { kmem_cache_free(pte_list_desc_cache, pte_list_desc); } +static bool sp_has_gptes(struct kvm_mmu_page *sp); + static gfn_t kvm_mmu_page_get_gfn(struct kvm_mmu_page *sp, int index) { if (sp->role.passthrough) return sp->gfn; - if (!sp->role.direct) - return sp->gfns[index]; + if (sp->shadowed_translation) + return sp->shadowed_translation[index] >> PAGE_SHIFT; - return sp->gfn + (index << ((sp->role.level - 1) * PT64_LEVEL_BITS)); + return sp->gfn + (index << ((sp->role.level - 1) * SPTE_LEVEL_BITS)); } -static void kvm_mmu_page_set_gfn(struct kvm_mmu_page *sp, int index, gfn_t gfn) +/* + * For leaf SPTEs, fetch the *guest* access permissions being shadowed. Note + * that the SPTE itself may have a more constrained access permissions that + * what the guest enforces. For example, a guest may create an executable + * huge PTE but KVM may disallow execution to mitigate iTLB multihit. + */ +static u32 kvm_mmu_page_get_access(struct kvm_mmu_page *sp, int index) { - if (sp->role.passthrough) { - WARN_ON_ONCE(gfn != sp->gfn); - return; - } + if (sp->shadowed_translation) + return sp->shadowed_translation[index] & ACC_ALL; - if (!sp->role.direct) { - sp->gfns[index] = gfn; + /* + * For direct MMUs (e.g. TDP or non-paging guests) or passthrough SPs, + * KVM is not shadowing any guest page tables, so the "guest access + * permissions" are just ACC_ALL. + * + * For direct SPs in indirect MMUs (shadow paging), i.e. when KVM + * is shadowing a guest huge page with small pages, the guest access + * permissions being shadowed are the access permissions of the huge + * page. + * + * In both cases, sp->role.access contains the correct access bits. + */ + return sp->role.access; +} + +static void kvm_mmu_page_set_translation(struct kvm_mmu_page *sp, int index, + gfn_t gfn, unsigned int access) +{ + if (sp->shadowed_translation) { + sp->shadowed_translation[index] = (gfn << PAGE_SHIFT) | access; return; } - if (WARN_ON(gfn != kvm_mmu_page_get_gfn(sp, index))) - pr_err_ratelimited("gfn mismatch under direct page %llx " - "(expected %llx, got %llx)\n", - sp->gfn, - kvm_mmu_page_get_gfn(sp, index), gfn); + WARN_ONCE(access != kvm_mmu_page_get_access(sp, index), + "access mismatch under %s page %llx (expected %u, got %u)\n", + sp->role.passthrough ? "passthrough" : "direct", + sp->gfn, kvm_mmu_page_get_access(sp, index), access); + + WARN_ONCE(gfn != kvm_mmu_page_get_gfn(sp, index), + "gfn mismatch under %s page %llx (expected %llx, got %llx)\n", + sp->role.passthrough ? "passthrough" : "direct", + sp->gfn, kvm_mmu_page_get_gfn(sp, index), gfn); +} + +static void kvm_mmu_page_set_access(struct kvm_mmu_page *sp, int index, + unsigned int access) +{ + gfn_t gfn = kvm_mmu_page_get_gfn(sp, index); + + kvm_mmu_page_set_translation(sp, index, gfn, access); } /* @@ -752,16 +713,26 @@ static struct kvm_lpage_info *lpage_info_slot(gfn_t gfn, return &slot->arch.lpage_info[level - 2][idx]; } +/* + * The most significant bit in disallow_lpage tracks whether or not memory + * attributes are mixed, i.e. not identical for all gfns at the current level. + * The lower order bits are used to refcount other cases where a hugepage is + * disallowed, e.g. if KVM has shadow a page table at the gfn. + */ +#define KVM_LPAGE_MIXED_FLAG BIT(31) + static void update_gfn_disallow_lpage_count(const struct kvm_memory_slot *slot, gfn_t gfn, int count) { struct kvm_lpage_info *linfo; - int i; + int old, i; for (i = PG_LEVEL_2M; i <= KVM_MAX_HUGEPAGE_LEVEL; ++i) { linfo = lpage_info_slot(gfn, slot, i); + + old = linfo->disallow_lpage; linfo->disallow_lpage += count; - WARN_ON(linfo->disallow_lpage < 0); + WARN_ON_ONCE((old ^ linfo->disallow_lpage) & KVM_LPAGE_MIXED_FLAG); } } @@ -782,27 +753,54 @@ static void account_shadowed(struct kvm *kvm, struct kvm_mmu_page *sp) gfn_t gfn; kvm->arch.indirect_shadow_pages++; + /* + * Ensure indirect_shadow_pages is elevated prior to re-reading guest + * child PTEs in FNAME(gpte_changed), i.e. guarantee either in-flight + * emulated writes are visible before re-reading guest PTEs, or that + * an emulated write will see the elevated count and acquire mmu_lock + * to update SPTEs. Pairs with the smp_mb() in kvm_mmu_track_write(). + */ + smp_mb(); + gfn = sp->gfn; slots = kvm_memslots_for_spte_role(kvm, sp->role); slot = __gfn_to_memslot(slots, gfn); /* the non-leaf shadow pages are keeping readonly. */ if (sp->role.level > PG_LEVEL_4K) - return kvm_slot_page_track_add_page(kvm, slot, gfn, - KVM_PAGE_TRACK_WRITE); + return __kvm_write_track_add_gfn(kvm, slot, gfn); kvm_mmu_gfn_disallow_lpage(slot, gfn); + + if (kvm_mmu_slot_gfn_write_protect(kvm, slot, gfn, PG_LEVEL_4K)) + kvm_flush_remote_tlbs_gfn(kvm, gfn, PG_LEVEL_4K); } -void account_huge_nx_page(struct kvm *kvm, struct kvm_mmu_page *sp) +void track_possible_nx_huge_page(struct kvm *kvm, struct kvm_mmu_page *sp) { - if (sp->lpage_disallowed) + /* + * If it's possible to replace the shadow page with an NX huge page, + * i.e. if the shadow page is the only thing currently preventing KVM + * from using a huge page, add the shadow page to the list of "to be + * zapped for NX recovery" pages. Note, the shadow page can already be + * on the list if KVM is reusing an existing shadow page, i.e. if KVM + * links a shadow page at multiple points. + */ + if (!list_empty(&sp->possible_nx_huge_page_link)) return; ++kvm->stat.nx_lpage_splits; - list_add_tail(&sp->lpage_disallowed_link, - &kvm->arch.lpage_disallowed_mmu_pages); - sp->lpage_disallowed = true; + list_add_tail(&sp->possible_nx_huge_page_link, + &kvm->arch.possible_nx_huge_pages); +} + +static void account_nx_huge_page(struct kvm *kvm, struct kvm_mmu_page *sp, + bool nx_huge_page_possible) +{ + sp->nx_huge_page_disallowed = true; + + if (nx_huge_page_possible) + track_possible_nx_huge_page(kvm, sp); } static void unaccount_shadowed(struct kvm *kvm, struct kvm_mmu_page *sp) @@ -816,22 +814,30 @@ static void unaccount_shadowed(struct kvm *kvm, struct kvm_mmu_page *sp) slots = kvm_memslots_for_spte_role(kvm, sp->role); slot = __gfn_to_memslot(slots, gfn); if (sp->role.level > PG_LEVEL_4K) - return kvm_slot_page_track_remove_page(kvm, slot, gfn, - KVM_PAGE_TRACK_WRITE); + return __kvm_write_track_remove_gfn(kvm, slot, gfn); kvm_mmu_gfn_allow_lpage(slot, gfn); } -void unaccount_huge_nx_page(struct kvm *kvm, struct kvm_mmu_page *sp) +void untrack_possible_nx_huge_page(struct kvm *kvm, struct kvm_mmu_page *sp) { + if (list_empty(&sp->possible_nx_huge_page_link)) + return; + --kvm->stat.nx_lpage_splits; - sp->lpage_disallowed = false; - list_del(&sp->lpage_disallowed_link); + list_del_init(&sp->possible_nx_huge_page_link); +} + +static void unaccount_nx_huge_page(struct kvm *kvm, struct kvm_mmu_page *sp) +{ + sp->nx_huge_page_disallowed = false; + + untrack_possible_nx_huge_page(kvm, sp); } -static struct kvm_memory_slot * -gfn_to_memslot_dirty_bitmap(struct kvm_vcpu *vcpu, gfn_t gfn, - bool no_dirty_log) +static struct kvm_memory_slot *gfn_to_memslot_dirty_bitmap(struct kvm_vcpu *vcpu, + gfn_t gfn, + bool no_dirty_log) { struct kvm_memory_slot *slot; @@ -848,129 +854,293 @@ gfn_to_memslot_dirty_bitmap(struct kvm_vcpu *vcpu, gfn_t gfn, * About rmap_head encoding: * * If the bit zero of rmap_head->val is clear, then it points to the only spte - * in this rmap chain. Otherwise, (rmap_head->val & ~1) points to a struct + * in this rmap chain. Otherwise, (rmap_head->val & ~3) points to a struct * pte_list_desc containing more mappings. */ +#define KVM_RMAP_MANY BIT(0) + +/* + * rmaps and PTE lists are mostly protected by mmu_lock (the shadow MMU always + * operates with mmu_lock held for write), but rmaps can be walked without + * holding mmu_lock so long as the caller can tolerate SPTEs in the rmap chain + * being zapped/dropped _while the rmap is locked_. + * + * Other than the KVM_RMAP_LOCKED flag, modifications to rmap entries must be + * done while holding mmu_lock for write. This allows a task walking rmaps + * without holding mmu_lock to concurrently walk the same entries as a task + * that is holding mmu_lock but _not_ the rmap lock. Neither task will modify + * the rmaps, thus the walks are stable. + * + * As alluded to above, SPTEs in rmaps are _not_ protected by KVM_RMAP_LOCKED, + * only the rmap chains themselves are protected. E.g. holding an rmap's lock + * ensures all "struct pte_list_desc" fields are stable. + */ +#define KVM_RMAP_LOCKED BIT(1) + +static unsigned long __kvm_rmap_lock(struct kvm_rmap_head *rmap_head) +{ + unsigned long old_val, new_val; + + lockdep_assert_preemption_disabled(); + + /* + * Elide the lock if the rmap is empty, as lockless walkers (read-only + * mode) don't need to (and can't) walk an empty rmap, nor can they add + * entries to the rmap. I.e. the only paths that process empty rmaps + * do so while holding mmu_lock for write, and are mutually exclusive. + */ + old_val = atomic_long_read(&rmap_head->val); + if (!old_val) + return 0; + + do { + /* + * If the rmap is locked, wait for it to be unlocked before + * trying acquire the lock, e.g. to avoid bouncing the cache + * line. + */ + while (old_val & KVM_RMAP_LOCKED) { + cpu_relax(); + old_val = atomic_long_read(&rmap_head->val); + } + + /* + * Recheck for an empty rmap, it may have been purged by the + * task that held the lock. + */ + if (!old_val) + return 0; + + new_val = old_val | KVM_RMAP_LOCKED; + /* + * Use try_cmpxchg_acquire() to prevent reads and writes to the rmap + * from being reordered outside of the critical section created by + * __kvm_rmap_lock(). + * + * Pairs with the atomic_long_set_release() in kvm_rmap_unlock(). + * + * For the !old_val case, no ordering is needed, as there is no rmap + * to walk. + */ + } while (!atomic_long_try_cmpxchg_acquire(&rmap_head->val, &old_val, new_val)); + + /* + * Return the old value, i.e. _without_ the LOCKED bit set. It's + * impossible for the return value to be 0 (see above), i.e. the read- + * only unlock flow can't get a false positive and fail to unlock. + */ + return old_val; +} + +static unsigned long kvm_rmap_lock(struct kvm *kvm, + struct kvm_rmap_head *rmap_head) +{ + lockdep_assert_held_write(&kvm->mmu_lock); + + return __kvm_rmap_lock(rmap_head); +} + +static void __kvm_rmap_unlock(struct kvm_rmap_head *rmap_head, + unsigned long val) +{ + KVM_MMU_WARN_ON(val & KVM_RMAP_LOCKED); + /* + * Ensure that all accesses to the rmap have completed before unlocking + * the rmap. + * + * Pairs with the atomic_long_try_cmpxchg_acquire() in __kvm_rmap_lock(). + */ + atomic_long_set_release(&rmap_head->val, val); +} + +static void kvm_rmap_unlock(struct kvm *kvm, + struct kvm_rmap_head *rmap_head, + unsigned long new_val) +{ + lockdep_assert_held_write(&kvm->mmu_lock); + + __kvm_rmap_unlock(rmap_head, new_val); +} + +static unsigned long kvm_rmap_get(struct kvm_rmap_head *rmap_head) +{ + return atomic_long_read(&rmap_head->val) & ~KVM_RMAP_LOCKED; +} + +/* + * If mmu_lock isn't held, rmaps can only be locked in read-only mode. The + * actual locking is the same, but the caller is disallowed from modifying the + * rmap, and so the unlock flow is a nop if the rmap is/was empty. + */ +static unsigned long kvm_rmap_lock_readonly(struct kvm_rmap_head *rmap_head) +{ + unsigned long rmap_val; + + preempt_disable(); + rmap_val = __kvm_rmap_lock(rmap_head); + + if (!rmap_val) + preempt_enable(); + + return rmap_val; +} + +static void kvm_rmap_unlock_readonly(struct kvm_rmap_head *rmap_head, + unsigned long old_val) +{ + if (!old_val) + return; + + KVM_MMU_WARN_ON(old_val != kvm_rmap_get(rmap_head)); + + __kvm_rmap_unlock(rmap_head, old_val); + preempt_enable(); +} /* * Returns the number of pointers in the rmap chain, not counting the new one. */ -static int pte_list_add(struct kvm_vcpu *vcpu, u64 *spte, - struct kvm_rmap_head *rmap_head) +static int pte_list_add(struct kvm *kvm, struct kvm_mmu_memory_cache *cache, + u64 *spte, struct kvm_rmap_head *rmap_head) { + unsigned long old_val, new_val; struct pte_list_desc *desc; int count = 0; - if (!rmap_head->val) { - rmap_printk("%p %llx 0->1\n", spte, *spte); - rmap_head->val = (unsigned long)spte; - } else if (!(rmap_head->val & 1)) { - rmap_printk("%p %llx 1->many\n", spte, *spte); - desc = mmu_alloc_pte_list_desc(vcpu); - desc->sptes[0] = (u64 *)rmap_head->val; + old_val = kvm_rmap_lock(kvm, rmap_head); + + if (!old_val) { + new_val = (unsigned long)spte; + } else if (!(old_val & KVM_RMAP_MANY)) { + desc = kvm_mmu_memory_cache_alloc(cache); + desc->sptes[0] = (u64 *)old_val; desc->sptes[1] = spte; desc->spte_count = 2; - rmap_head->val = (unsigned long)desc | 1; + desc->tail_count = 0; + new_val = (unsigned long)desc | KVM_RMAP_MANY; ++count; } else { - rmap_printk("%p %llx many->many\n", spte, *spte); - desc = (struct pte_list_desc *)(rmap_head->val & ~1ul); - while (desc->spte_count == PTE_LIST_EXT) { - count += PTE_LIST_EXT; - if (!desc->more) { - desc->more = mmu_alloc_pte_list_desc(vcpu); - desc = desc->more; - desc->spte_count = 0; - break; - } - desc = desc->more; + desc = (struct pte_list_desc *)(old_val & ~KVM_RMAP_MANY); + count = desc->tail_count + desc->spte_count; + + /* + * If the previous head is full, allocate a new head descriptor + * as tail descriptors are always kept full. + */ + if (desc->spte_count == PTE_LIST_EXT) { + desc = kvm_mmu_memory_cache_alloc(cache); + desc->more = (struct pte_list_desc *)(old_val & ~KVM_RMAP_MANY); + desc->spte_count = 0; + desc->tail_count = count; + new_val = (unsigned long)desc | KVM_RMAP_MANY; + } else { + new_val = old_val; } - count += desc->spte_count; desc->sptes[desc->spte_count++] = spte; } + + kvm_rmap_unlock(kvm, rmap_head, new_val); + return count; } -static void -pte_list_desc_remove_entry(struct kvm_rmap_head *rmap_head, - struct pte_list_desc *desc, int i, - struct pte_list_desc *prev_desc) +static void pte_list_desc_remove_entry(struct kvm *kvm, unsigned long *rmap_val, + struct pte_list_desc *desc, int i) { - int j = desc->spte_count - 1; + struct pte_list_desc *head_desc = (struct pte_list_desc *)(*rmap_val & ~KVM_RMAP_MANY); + int j = head_desc->spte_count - 1; - desc->sptes[i] = desc->sptes[j]; - desc->sptes[j] = NULL; - desc->spte_count--; - if (desc->spte_count) + /* + * The head descriptor should never be empty. A new head is added only + * when adding an entry and the previous head is full, and heads are + * removed (this flow) when they become empty. + */ + KVM_BUG_ON_DATA_CORRUPTION(j < 0, kvm); + + /* + * Replace the to-be-freed SPTE with the last valid entry from the head + * descriptor to ensure that tail descriptors are full at all times. + * Note, this also means that tail_count is stable for each descriptor. + */ + desc->sptes[i] = head_desc->sptes[j]; + head_desc->sptes[j] = NULL; + head_desc->spte_count--; + if (head_desc->spte_count) return; - if (!prev_desc && !desc->more) - rmap_head->val = 0; + + /* + * The head descriptor is empty. If there are no tail descriptors, + * nullify the rmap head to mark the list as empty, else point the rmap + * head at the next descriptor, i.e. the new head. + */ + if (!head_desc->more) + *rmap_val = 0; else - if (prev_desc) - prev_desc->more = desc->more; - else - rmap_head->val = (unsigned long)desc->more | 1; - mmu_free_pte_list_desc(desc); + *rmap_val = (unsigned long)head_desc->more | KVM_RMAP_MANY; + mmu_free_pte_list_desc(head_desc); } -static void __pte_list_remove(u64 *spte, struct kvm_rmap_head *rmap_head) +static void pte_list_remove(struct kvm *kvm, u64 *spte, + struct kvm_rmap_head *rmap_head) { struct pte_list_desc *desc; - struct pte_list_desc *prev_desc; + unsigned long rmap_val; int i; - if (!rmap_head->val) { - pr_err("%s: %p 0->BUG\n", __func__, spte); - BUG(); - } else if (!(rmap_head->val & 1)) { - rmap_printk("%p 1->0\n", spte); - if ((u64 *)rmap_head->val != spte) { - pr_err("%s: %p 1->BUG\n", __func__, spte); - BUG(); - } - rmap_head->val = 0; + rmap_val = kvm_rmap_lock(kvm, rmap_head); + if (KVM_BUG_ON_DATA_CORRUPTION(!rmap_val, kvm)) + goto out; + + if (!(rmap_val & KVM_RMAP_MANY)) { + if (KVM_BUG_ON_DATA_CORRUPTION((u64 *)rmap_val != spte, kvm)) + goto out; + + rmap_val = 0; } else { - rmap_printk("%p many->many\n", spte); - desc = (struct pte_list_desc *)(rmap_head->val & ~1ul); - prev_desc = NULL; + desc = (struct pte_list_desc *)(rmap_val & ~KVM_RMAP_MANY); while (desc) { for (i = 0; i < desc->spte_count; ++i) { if (desc->sptes[i] == spte) { - pte_list_desc_remove_entry(rmap_head, - desc, i, prev_desc); - return; + pte_list_desc_remove_entry(kvm, &rmap_val, + desc, i); + goto out; } } - prev_desc = desc; desc = desc->more; } - pr_err("%s: %p many->many\n", __func__, spte); - BUG(); + + KVM_BUG_ON_DATA_CORRUPTION(true, kvm); } + +out: + kvm_rmap_unlock(kvm, rmap_head, rmap_val); } -static void pte_list_remove(struct kvm *kvm, struct kvm_rmap_head *rmap_head, - u64 *sptep) +static void kvm_zap_one_rmap_spte(struct kvm *kvm, + struct kvm_rmap_head *rmap_head, u64 *sptep) { mmu_spte_clear_track_bits(kvm, sptep); - __pte_list_remove(sptep, rmap_head); + pte_list_remove(kvm, sptep, rmap_head); } -/* Return true if rmap existed, false otherwise */ -static bool pte_list_destroy(struct kvm *kvm, struct kvm_rmap_head *rmap_head) +/* Return true if at least one SPTE was zapped, false otherwise */ +static bool kvm_zap_all_rmap_sptes(struct kvm *kvm, + struct kvm_rmap_head *rmap_head) { struct pte_list_desc *desc, *next; + unsigned long rmap_val; int i; - if (!rmap_head->val) + rmap_val = kvm_rmap_lock(kvm, rmap_head); + if (!rmap_val) return false; - if (!(rmap_head->val & 1)) { - mmu_spte_clear_track_bits(kvm, (u64 *)rmap_head->val); + if (!(rmap_val & KVM_RMAP_MANY)) { + mmu_spte_clear_track_bits(kvm, (u64 *)rmap_val); goto out; } - desc = (struct pte_list_desc *)(rmap_head->val & ~1ul); + desc = (struct pte_list_desc *)(rmap_val & ~KVM_RMAP_MANY); for (; desc; desc = next) { for (i = 0; i < desc->spte_count; i++) @@ -980,28 +1150,22 @@ static bool pte_list_destroy(struct kvm *kvm, struct kvm_rmap_head *rmap_head) } out: /* rmap_head is meaningless now, remember to reset it */ - rmap_head->val = 0; + kvm_rmap_unlock(kvm, rmap_head, 0); return true; } unsigned int pte_list_count(struct kvm_rmap_head *rmap_head) { + unsigned long rmap_val = kvm_rmap_get(rmap_head); struct pte_list_desc *desc; - unsigned int count = 0; - if (!rmap_head->val) + if (!rmap_val) return 0; - else if (!(rmap_head->val & 1)) + else if (!(rmap_val & KVM_RMAP_MANY)) return 1; - desc = (struct pte_list_desc *)(rmap_head->val & ~1ul); - - while (desc) { - count += desc->spte_count; - desc = desc->more; - } - - return count; + desc = (struct pte_list_desc *)(rmap_val & ~KVM_RMAP_MANY); + return desc->tail_count + desc->spte_count; } static struct kvm_rmap_head *gfn_to_rmap(gfn_t gfn, int level, @@ -1013,14 +1177,6 @@ static struct kvm_rmap_head *gfn_to_rmap(gfn_t gfn, int level, return &slot->arch.rmap[level - PG_LEVEL_4K][idx]; } -static bool rmap_can_add(struct kvm_vcpu *vcpu) -{ - struct kvm_mmu_memory_cache *mc; - - mc = &vcpu->arch.mmu_pte_list_desc_cache; - return kvm_mmu_memory_cache_nr_free_objects(mc); -} - static void rmap_remove(struct kvm *kvm, u64 *spte) { struct kvm_memslots *slots; @@ -1030,7 +1186,7 @@ static void rmap_remove(struct kvm *kvm, u64 *spte) struct kvm_rmap_head *rmap_head; sp = sptep_to_sp(spte); - gfn = kvm_mmu_page_get_gfn(sp, spte - sp->spt); + gfn = kvm_mmu_page_get_gfn(sp, spte_index(spte)); /* * Unlike rmap_add, rmap_remove does not run in the context of a vCPU @@ -1042,7 +1198,7 @@ static void rmap_remove(struct kvm *kvm, u64 *spte) slot = __gfn_to_memslot(slots, gfn); rmap_head = gfn_to_rmap(gfn, sp->role.level, slot); - __pte_list_remove(spte, rmap_head); + pte_list_remove(kvm, spte, rmap_head); } /* @@ -1051,6 +1207,7 @@ static void rmap_remove(struct kvm *kvm, u64 *spte) */ struct rmap_iterator { /* private fields */ + struct rmap_head *head; struct pte_list_desc *desc; /* holds the sptep if not NULL */ int pos; /* index of the sptep */ }; @@ -1065,23 +1222,19 @@ struct rmap_iterator { static u64 *rmap_get_first(struct kvm_rmap_head *rmap_head, struct rmap_iterator *iter) { - u64 *sptep; + unsigned long rmap_val = kvm_rmap_get(rmap_head); - if (!rmap_head->val) + if (!rmap_val) return NULL; - if (!(rmap_head->val & 1)) { + if (!(rmap_val & KVM_RMAP_MANY)) { iter->desc = NULL; - sptep = (u64 *)rmap_head->val; - goto out; + return (u64 *)rmap_val; } - iter->desc = (struct pte_list_desc *)(rmap_head->val & ~1ul); + iter->desc = (struct pte_list_desc *)(rmap_val & ~KVM_RMAP_MANY); iter->pos = 0; - sptep = iter->desc->sptes[iter->pos]; -out: - BUG_ON(!is_shadow_present_pte(*sptep)); - return sptep; + return iter->desc->sptes[iter->pos]; } /* @@ -1091,14 +1244,11 @@ out: */ static u64 *rmap_get_next(struct rmap_iterator *iter) { - u64 *sptep; - if (iter->desc) { if (iter->pos < PTE_LIST_EXT - 1) { ++iter->pos; - sptep = iter->desc->sptes[iter->pos]; - if (sptep) - goto out; + if (iter->desc->sptes[iter->pos]) + return iter->desc->sptes[iter->pos]; } iter->desc = iter->desc->more; @@ -1106,20 +1256,24 @@ static u64 *rmap_get_next(struct rmap_iterator *iter) if (iter->desc) { iter->pos = 0; /* desc->sptes[0] cannot be NULL */ - sptep = iter->desc->sptes[iter->pos]; - goto out; + return iter->desc->sptes[iter->pos]; } } return NULL; -out: - BUG_ON(!is_shadow_present_pte(*sptep)); - return sptep; } -#define for_each_rmap_spte(_rmap_head_, _iter_, _spte_) \ - for (_spte_ = rmap_get_first(_rmap_head_, _iter_); \ - _spte_; _spte_ = rmap_get_next(_iter_)) +#define __for_each_rmap_spte(_rmap_head_, _iter_, _sptep_) \ + for (_sptep_ = rmap_get_first(_rmap_head_, _iter_); \ + _sptep_; _sptep_ = rmap_get_next(_iter_)) + +#define for_each_rmap_spte(_rmap_head_, _iter_, _sptep_) \ + __for_each_rmap_spte(_rmap_head_, _iter_, _sptep_) \ + if (!WARN_ON_ONCE(!is_shadow_present_pte(*(_sptep_)))) \ + +#define for_each_rmap_spte_lockless(_rmap_head_, _iter_, _sptep_, _spte_) \ + __for_each_rmap_spte(_rmap_head_, _iter_, _sptep_) \ + if (is_shadow_present_pte(_spte_ = mmu_spte_get_lockless(sptep))) static void drop_spte(struct kvm *kvm, u64 *sptep) { @@ -1129,26 +1283,17 @@ static void drop_spte(struct kvm *kvm, u64 *sptep) rmap_remove(kvm, sptep); } - -static bool __drop_large_spte(struct kvm *kvm, u64 *sptep) +static void drop_large_spte(struct kvm *kvm, u64 *sptep, bool flush) { - if (is_large_pte(*sptep)) { - WARN_ON(sptep_to_sp(sptep)->role.level == PG_LEVEL_4K); - drop_spte(kvm, sptep); - return true; - } + struct kvm_mmu_page *sp; - return false; -} + sp = sptep_to_sp(sptep); + WARN_ON_ONCE(sp->role.level == PG_LEVEL_4K); -static void drop_large_spte(struct kvm_vcpu *vcpu, u64 *sptep) -{ - if (__drop_large_spte(vcpu->kvm, sptep)) { - struct kvm_mmu_page *sp = sptep_to_sp(sptep); + drop_spte(kvm, sptep); - kvm_flush_remote_tlbs_with_address(vcpu->kvm, sp->gfn, - KVM_PAGES_PER_HPAGE(sp->role.level)); - } + if (flush) + kvm_flush_remote_tlbs_sptep(kvm, sptep); } /* @@ -1172,8 +1317,6 @@ static bool spte_write_protect(u64 *sptep, bool pt_protect) !(pt_protect && is_mmu_writable_spte(spte))) return false; - rmap_printk("spte %p %llx\n", sptep, *sptep); - if (pt_protect) spte &= ~shadow_mmu_writable_mask; spte = spte & ~PT_WRITABLE_MASK; @@ -1198,23 +1341,11 @@ static bool spte_clear_dirty(u64 *sptep) { u64 spte = *sptep; - rmap_printk("spte %p %llx\n", sptep, *sptep); - - MMU_WARN_ON(!spte_ad_enabled(spte)); + KVM_MMU_WARN_ON(!spte_ad_enabled(spte)); spte &= ~shadow_dirty_mask; return mmu_spte_update(sptep, spte); } -static bool spte_wrprot_for_clear_dirty(u64 *sptep) -{ - bool was_writable = test_and_clear_bit(PT_WRITABLE_SHIFT, - (unsigned long *)sptep); - if (was_writable && !spte_ad_enabled(*sptep)) - kvm_set_pfn_dirty(spte_to_pfn(*sptep)); - - return was_writable; -} - /* * Gets the GFN ready for another round of dirty logging by clearing the * - D bit on ad-enabled SPTEs, and @@ -1228,31 +1359,24 @@ static bool __rmap_clear_dirty(struct kvm *kvm, struct kvm_rmap_head *rmap_head, struct rmap_iterator iter; bool flush = false; - for_each_rmap_spte(rmap_head, &iter, sptep) + for_each_rmap_spte(rmap_head, &iter, sptep) { if (spte_ad_need_write_protect(*sptep)) - flush |= spte_wrprot_for_clear_dirty(sptep); + flush |= test_and_clear_bit(PT_WRITABLE_SHIFT, + (unsigned long *)sptep); else flush |= spte_clear_dirty(sptep); + } return flush; } -/** - * kvm_mmu_write_protect_pt_masked - write protect selected PT level pages - * @kvm: kvm instance - * @slot: slot to protect - * @gfn_offset: start of the BITS_PER_LONG pages we care about - * @mask: indicates which pages we should protect - * - * Used when we do not need to care about huge page mappings. - */ static void kvm_mmu_write_protect_pt_masked(struct kvm *kvm, struct kvm_memory_slot *slot, gfn_t gfn_offset, unsigned long mask) { struct kvm_rmap_head *rmap_head; - if (is_tdp_mmu_enabled(kvm)) + if (tdp_mmu_enabled) kvm_tdp_mmu_clear_dirty_pt_masked(kvm, slot, slot->base_gfn + gfn_offset, mask, true); @@ -1269,23 +1393,13 @@ static void kvm_mmu_write_protect_pt_masked(struct kvm *kvm, } } -/** - * kvm_mmu_clear_dirty_pt_masked - clear MMU D-bit for PT level pages, or write - * protect the page if the D-bit isn't supported. - * @kvm: kvm instance - * @slot: slot to clear D-bit - * @gfn_offset: start of the BITS_PER_LONG pages we care about - * @mask: indicates which pages we should clear D-bit - * - * Used for PML to re-log the dirty GPAs after userspace querying dirty_bitmap. - */ static void kvm_mmu_clear_dirty_pt_masked(struct kvm *kvm, struct kvm_memory_slot *slot, gfn_t gfn_offset, unsigned long mask) { struct kvm_rmap_head *rmap_head; - if (is_tdp_mmu_enabled(kvm)) + if (tdp_mmu_enabled) kvm_tdp_mmu_clear_dirty_pt_masked(kvm, slot, slot->base_gfn + gfn_offset, mask, false); @@ -1302,24 +1416,16 @@ static void kvm_mmu_clear_dirty_pt_masked(struct kvm *kvm, } } -/** - * kvm_arch_mmu_enable_log_dirty_pt_masked - enable dirty logging for selected - * PT level pages. - * - * It calls kvm_mmu_write_protect_pt_masked to write protect selected pages to - * enable dirty logging for them. - * - * We need to care about huge page mappings: e.g. during dirty logging we may - * have such mappings. - */ void kvm_arch_mmu_enable_log_dirty_pt_masked(struct kvm *kvm, struct kvm_memory_slot *slot, gfn_t gfn_offset, unsigned long mask) { /* - * Huge pages are NOT write protected when we start dirty logging in - * initially-all-set mode; must write protect them here so that they - * are split to 4K on the first write. + * If the slot was assumed to be "initially all dirty", write-protect + * huge pages to ensure they are split to 4KiB on the first write (KVM + * dirty logs at 4KiB granularity). If eager page splitting is enabled, + * immediately try to split huge pages, e.g. so that vCPUs don't get + * saddled with the cost of splitting. * * The gfn_offset is guaranteed to be aligned to 64, but the base_gfn * of memslot has no such restriction, so the range can cross two large @@ -1330,7 +1436,7 @@ void kvm_arch_mmu_enable_log_dirty_pt_masked(struct kvm *kvm, gfn_t end = slot->base_gfn + gfn_offset + __fls(mask); if (READ_ONCE(eager_page_split)) - kvm_mmu_try_split_huge_pages(kvm, slot, start, end, PG_LEVEL_4K); + kvm_mmu_try_split_huge_pages(kvm, slot, start, end + 1, PG_LEVEL_4K); kvm_mmu_slot_gfn_write_protect(kvm, slot, start, PG_LEVEL_2M); @@ -1341,16 +1447,25 @@ void kvm_arch_mmu_enable_log_dirty_pt_masked(struct kvm *kvm, PG_LEVEL_2M); } - /* Now handle 4K PTEs. */ - if (kvm_x86_ops.cpu_dirty_log_size) + /* + * (Re)Enable dirty logging for all 4KiB SPTEs that map the GFNs in + * mask. If PML is enabled and the GFN doesn't need to be write- + * protected for other reasons, e.g. shadow paging, clear the Dirty bit. + * Otherwise clear the Writable bit. + * + * Note that kvm_mmu_clear_dirty_pt_masked() is called whenever PML is + * enabled but it chooses between clearing the Dirty bit and Writeable + * bit based on the context. + */ + if (kvm->arch.cpu_dirty_log_size) kvm_mmu_clear_dirty_pt_masked(kvm, slot, gfn_offset, mask); else kvm_mmu_write_protect_pt_masked(kvm, slot, gfn_offset, mask); } -int kvm_cpu_dirty_log_size(void) +int kvm_cpu_dirty_log_size(struct kvm *kvm) { - return kvm_x86_ops.cpu_dirty_log_size; + return kvm->arch.cpu_dirty_log_size; } bool kvm_mmu_slot_gfn_write_protect(struct kvm *kvm, @@ -1368,7 +1483,7 @@ bool kvm_mmu_slot_gfn_write_protect(struct kvm *kvm, } } - if (is_tdp_mmu_enabled(kvm)) + if (tdp_mmu_enabled) write_protected |= kvm_tdp_mmu_write_protect_gfn(kvm, slot, gfn, min_level); @@ -1383,57 +1498,10 @@ static bool kvm_vcpu_write_protect_gfn(struct kvm_vcpu *vcpu, u64 gfn) return kvm_mmu_slot_gfn_write_protect(vcpu->kvm, slot, gfn, PG_LEVEL_4K); } -static bool kvm_zap_rmapp(struct kvm *kvm, struct kvm_rmap_head *rmap_head, - const struct kvm_memory_slot *slot) -{ - return pte_list_destroy(kvm, rmap_head); -} - -static bool kvm_unmap_rmapp(struct kvm *kvm, struct kvm_rmap_head *rmap_head, - struct kvm_memory_slot *slot, gfn_t gfn, int level, - pte_t unused) +static bool kvm_zap_rmap(struct kvm *kvm, struct kvm_rmap_head *rmap_head, + const struct kvm_memory_slot *slot) { - return kvm_zap_rmapp(kvm, rmap_head, slot); -} - -static bool kvm_set_pte_rmapp(struct kvm *kvm, struct kvm_rmap_head *rmap_head, - struct kvm_memory_slot *slot, gfn_t gfn, int level, - pte_t pte) -{ - u64 *sptep; - struct rmap_iterator iter; - bool need_flush = false; - u64 new_spte; - kvm_pfn_t new_pfn; - - WARN_ON(pte_huge(pte)); - new_pfn = pte_pfn(pte); - -restart: - for_each_rmap_spte(rmap_head, &iter, sptep) { - rmap_printk("spte %p %llx gfn %llx (%d)\n", - sptep, *sptep, gfn, level); - - need_flush = true; - - if (pte_write(pte)) { - pte_list_remove(kvm, rmap_head, sptep); - goto restart; - } else { - new_spte = kvm_mmu_changed_pte_notifier_make_spte( - *sptep, new_pfn); - - mmu_spte_clear_track_bits(kvm, sptep); - mmu_spte_set(sptep, new_spte); - } - } - - if (need_flush && kvm_available_flush_tlb_with_range()) { - kvm_flush_remote_tlbs_with_address(kvm, gfn, 1); - return false; - } - - return need_flush; + return kvm_zap_all_rmap_sptes(kvm, rmap_head); } struct slot_rmap_walk_iterator { @@ -1453,8 +1521,8 @@ struct slot_rmap_walk_iterator { struct kvm_rmap_head *end_rmap; }; -static void -rmap_walk_init_level(struct slot_rmap_walk_iterator *iterator, int level) +static void rmap_walk_init_level(struct slot_rmap_walk_iterator *iterator, + int level) { iterator->level = level; iterator->gfn = iterator->start_gfn; @@ -1462,10 +1530,10 @@ rmap_walk_init_level(struct slot_rmap_walk_iterator *iterator, int level) iterator->end_rmap = gfn_to_rmap(iterator->end_gfn, level, iterator->slot); } -static void -slot_rmap_walk_init(struct slot_rmap_walk_iterator *iterator, - const struct kvm_memory_slot *slot, int start_level, - int end_level, gfn_t start_gfn, gfn_t end_gfn) +static void slot_rmap_walk_init(struct slot_rmap_walk_iterator *iterator, + const struct kvm_memory_slot *slot, + int start_level, int end_level, + gfn_t start_gfn, gfn_t end_gfn) { iterator->slot = slot; iterator->start_level = start_level; @@ -1484,9 +1552,9 @@ static bool slot_rmap_walk_okay(struct slot_rmap_walk_iterator *iterator) static void slot_rmap_walk_next(struct slot_rmap_walk_iterator *iterator) { while (++iterator->rmap <= iterator->end_rmap) { - iterator->gfn += (1UL << KVM_HPAGE_GFN_SHIFT(iterator->level)); + iterator->gfn += KVM_PAGES_PER_HPAGE(iterator->level); - if (iterator->rmap->val) + if (atomic_long_read(&iterator->rmap->val)) return; } @@ -1505,108 +1573,199 @@ static void slot_rmap_walk_next(struct slot_rmap_walk_iterator *iterator) slot_rmap_walk_okay(_iter_); \ slot_rmap_walk_next(_iter_)) -typedef bool (*rmap_handler_t)(struct kvm *kvm, struct kvm_rmap_head *rmap_head, - struct kvm_memory_slot *slot, gfn_t gfn, - int level, pte_t pte); +/* The return value indicates if tlb flush on all vcpus is needed. */ +typedef bool (*slot_rmaps_handler) (struct kvm *kvm, + struct kvm_rmap_head *rmap_head, + const struct kvm_memory_slot *slot); -static __always_inline bool kvm_handle_gfn_range(struct kvm *kvm, - struct kvm_gfn_range *range, - rmap_handler_t handler) +static __always_inline bool __walk_slot_rmaps(struct kvm *kvm, + const struct kvm_memory_slot *slot, + slot_rmaps_handler fn, + int start_level, int end_level, + gfn_t start_gfn, gfn_t end_gfn, + bool can_yield, bool flush_on_yield, + bool flush) { struct slot_rmap_walk_iterator iterator; - bool ret = false; - for_each_slot_rmap_range(range->slot, PG_LEVEL_4K, KVM_MAX_HUGEPAGE_LEVEL, - range->start, range->end - 1, &iterator) - ret |= handler(kvm, iterator.rmap, range->slot, iterator.gfn, - iterator.level, range->pte); + lockdep_assert_held_write(&kvm->mmu_lock); - return ret; + for_each_slot_rmap_range(slot, start_level, end_level, start_gfn, + end_gfn, &iterator) { + if (iterator.rmap) + flush |= fn(kvm, iterator.rmap, slot); + + if (!can_yield) + continue; + + if (need_resched() || rwlock_needbreak(&kvm->mmu_lock)) { + if (flush && flush_on_yield) { + kvm_flush_remote_tlbs_range(kvm, start_gfn, + iterator.gfn - start_gfn + 1); + flush = false; + } + cond_resched_rwlock_write(&kvm->mmu_lock); + } + } + + return flush; +} + +static __always_inline bool walk_slot_rmaps(struct kvm *kvm, + const struct kvm_memory_slot *slot, + slot_rmaps_handler fn, + int start_level, int end_level, + bool flush_on_yield) +{ + return __walk_slot_rmaps(kvm, slot, fn, start_level, end_level, + slot->base_gfn, slot->base_gfn + slot->npages - 1, + true, flush_on_yield, false); +} + +static __always_inline bool walk_slot_rmaps_4k(struct kvm *kvm, + const struct kvm_memory_slot *slot, + slot_rmaps_handler fn, + bool flush_on_yield) +{ + return walk_slot_rmaps(kvm, slot, fn, PG_LEVEL_4K, PG_LEVEL_4K, flush_on_yield); +} + +static bool __kvm_rmap_zap_gfn_range(struct kvm *kvm, + const struct kvm_memory_slot *slot, + gfn_t start, gfn_t end, bool can_yield, + bool flush) +{ + return __walk_slot_rmaps(kvm, slot, kvm_zap_rmap, + PG_LEVEL_4K, KVM_MAX_HUGEPAGE_LEVEL, + start, end - 1, can_yield, true, flush); } bool kvm_unmap_gfn_range(struct kvm *kvm, struct kvm_gfn_range *range) { bool flush = false; + /* + * To prevent races with vCPUs faulting in a gfn using stale data, + * zapping a gfn range must be protected by mmu_invalidate_in_progress + * (and mmu_invalidate_seq). The only exception is memslot deletion; + * in that case, SRCU synchronization ensures that SPTEs are zapped + * after all vCPUs have unlocked SRCU, guaranteeing that vCPUs see the + * invalid slot. + */ + lockdep_assert_once(kvm->mmu_invalidate_in_progress || + lockdep_is_held(&kvm->slots_lock)); + if (kvm_memslots_have_rmaps(kvm)) - flush = kvm_handle_gfn_range(kvm, range, kvm_unmap_rmapp); + flush = __kvm_rmap_zap_gfn_range(kvm, range->slot, + range->start, range->end, + range->may_block, flush); - if (is_tdp_mmu_enabled(kvm)) + if (tdp_mmu_enabled) flush = kvm_tdp_mmu_unmap_gfn_range(kvm, range, flush); + if (kvm_x86_ops.set_apic_access_page_addr && + range->slot->id == APIC_ACCESS_PAGE_PRIVATE_MEMSLOT) + kvm_make_all_cpus_request(kvm, KVM_REQ_APIC_PAGE_RELOAD); + return flush; } -bool kvm_set_spte_gfn(struct kvm *kvm, struct kvm_gfn_range *range) +#define RMAP_RECYCLE_THRESHOLD 1000 + +static void __rmap_add(struct kvm *kvm, + struct kvm_mmu_memory_cache *cache, + const struct kvm_memory_slot *slot, + u64 *spte, gfn_t gfn, unsigned int access) { - bool flush = false; + struct kvm_mmu_page *sp; + struct kvm_rmap_head *rmap_head; + int rmap_count; - if (kvm_memslots_have_rmaps(kvm)) - flush = kvm_handle_gfn_range(kvm, range, kvm_set_pte_rmapp); + sp = sptep_to_sp(spte); + kvm_mmu_page_set_translation(sp, spte_index(spte), gfn, access); + kvm_update_page_stats(kvm, sp->role.level, 1); - if (is_tdp_mmu_enabled(kvm)) - flush |= kvm_tdp_mmu_set_spte_gfn(kvm, range); + rmap_head = gfn_to_rmap(gfn, sp->role.level, slot); + rmap_count = pte_list_add(kvm, cache, spte, rmap_head); - return flush; + if (rmap_count > kvm->stat.max_mmu_rmap_size) + kvm->stat.max_mmu_rmap_size = rmap_count; + if (rmap_count > RMAP_RECYCLE_THRESHOLD) { + kvm_zap_all_rmap_sptes(kvm, rmap_head); + kvm_flush_remote_tlbs_gfn(kvm, gfn, sp->role.level); + } } -static bool kvm_age_rmapp(struct kvm *kvm, struct kvm_rmap_head *rmap_head, - struct kvm_memory_slot *slot, gfn_t gfn, int level, - pte_t unused) +static void rmap_add(struct kvm_vcpu *vcpu, const struct kvm_memory_slot *slot, + u64 *spte, gfn_t gfn, unsigned int access) { - u64 *sptep; - struct rmap_iterator iter; - int young = 0; - - for_each_rmap_spte(rmap_head, &iter, sptep) - young |= mmu_spte_age(sptep); + struct kvm_mmu_memory_cache *cache = &vcpu->arch.mmu_pte_list_desc_cache; - return young; + __rmap_add(vcpu->kvm, cache, slot, spte, gfn, access); } -static bool kvm_test_age_rmapp(struct kvm *kvm, struct kvm_rmap_head *rmap_head, - struct kvm_memory_slot *slot, gfn_t gfn, - int level, pte_t unused) +static bool kvm_rmap_age_gfn_range(struct kvm *kvm, + struct kvm_gfn_range *range, + bool test_only) { - u64 *sptep; + struct kvm_rmap_head *rmap_head; struct rmap_iterator iter; + unsigned long rmap_val; + bool young = false; + u64 *sptep; + gfn_t gfn; + int level; + u64 spte; - for_each_rmap_spte(rmap_head, &iter, sptep) - if (is_accessed_spte(*sptep)) - return true; - return false; -} + for (level = PG_LEVEL_4K; level <= KVM_MAX_HUGEPAGE_LEVEL; level++) { + for (gfn = range->start; gfn < range->end; + gfn += KVM_PAGES_PER_HPAGE(level)) { + rmap_head = gfn_to_rmap(gfn, level, range->slot); + rmap_val = kvm_rmap_lock_readonly(rmap_head); -#define RMAP_RECYCLE_THRESHOLD 1000 + for_each_rmap_spte_lockless(rmap_head, &iter, sptep, spte) { + if (!is_accessed_spte(spte)) + continue; -static void rmap_add(struct kvm_vcpu *vcpu, struct kvm_memory_slot *slot, - u64 *spte, gfn_t gfn) -{ - struct kvm_mmu_page *sp; - struct kvm_rmap_head *rmap_head; - int rmap_count; + if (test_only) { + kvm_rmap_unlock_readonly(rmap_head, rmap_val); + return true; + } - sp = sptep_to_sp(spte); - kvm_mmu_page_set_gfn(sp, spte - sp->spt, gfn); - rmap_head = gfn_to_rmap(gfn, sp->role.level, slot); - rmap_count = pte_list_add(vcpu, spte, rmap_head); + if (spte_ad_enabled(spte)) + clear_bit((ffs(shadow_accessed_mask) - 1), + (unsigned long *)sptep); + else + /* + * If the following cmpxchg fails, the + * spte is being concurrently modified + * and should most likely stay young. + */ + cmpxchg64(sptep, spte, + mark_spte_for_access_track(spte)); + young = true; + } - if (rmap_count > RMAP_RECYCLE_THRESHOLD) { - kvm_unmap_rmapp(vcpu->kvm, rmap_head, NULL, gfn, sp->role.level, __pte(0)); - kvm_flush_remote_tlbs_with_address( - vcpu->kvm, sp->gfn, KVM_PAGES_PER_HPAGE(sp->role.level)); + kvm_rmap_unlock_readonly(rmap_head, rmap_val); + } } + return young; +} + +static bool kvm_may_have_shadow_mmu_sptes(struct kvm *kvm) +{ + return !tdp_mmu_enabled || READ_ONCE(kvm->arch.indirect_shadow_pages); } bool kvm_age_gfn(struct kvm *kvm, struct kvm_gfn_range *range) { bool young = false; - if (kvm_memslots_have_rmaps(kvm)) - young = kvm_handle_gfn_range(kvm, range, kvm_age_rmapp); + if (tdp_mmu_enabled) + young = kvm_tdp_mmu_age_gfn_range(kvm, range); - if (is_tdp_mmu_enabled(kvm)) - young |= kvm_tdp_mmu_age_gfn_range(kvm, range); + if (kvm_may_have_shadow_mmu_sptes(kvm)) + young |= kvm_rmap_age_gfn_range(kvm, range, false); return young; } @@ -1615,51 +1774,52 @@ bool kvm_test_age_gfn(struct kvm *kvm, struct kvm_gfn_range *range) { bool young = false; - if (kvm_memslots_have_rmaps(kvm)) - young = kvm_handle_gfn_range(kvm, range, kvm_test_age_rmapp); + if (tdp_mmu_enabled) + young = kvm_tdp_mmu_test_age_gfn(kvm, range); - if (is_tdp_mmu_enabled(kvm)) - young |= kvm_tdp_mmu_test_age_gfn(kvm, range); + if (young) + return young; + + if (kvm_may_have_shadow_mmu_sptes(kvm)) + young |= kvm_rmap_age_gfn_range(kvm, range, true); return young; } -#ifdef MMU_DEBUG -static int is_empty_shadow_page(u64 *spt) +static void kvm_mmu_check_sptes_at_free(struct kvm_mmu_page *sp) { - u64 *pos; - u64 *end; +#ifdef CONFIG_KVM_PROVE_MMU + int i; - for (pos = spt, end = pos + PAGE_SIZE / sizeof(u64); pos != end; pos++) - if (is_shadow_present_pte(*pos)) { - printk(KERN_ERR "%s: %p %llx\n", __func__, - pos, *pos); - return 0; - } - return 1; -} + for (i = 0; i < SPTE_ENT_PER_PAGE; i++) { + if (KVM_MMU_WARN_ON(is_shadow_present_pte(sp->spt[i]))) + pr_err_ratelimited("SPTE %llx (@ %p) for gfn %llx shadow-present at free", + sp->spt[i], &sp->spt[i], + kvm_mmu_page_get_gfn(sp, i)); + } #endif +} -/* - * This value is the sum of all of the kvm instances's - * kvm->arch.n_used_mmu_pages values. We need a global, - * aggregate version in order to make the slab shrinker - * faster - */ -static inline void kvm_mod_used_mmu_pages(struct kvm *kvm, long nr) +static void kvm_account_mmu_page(struct kvm *kvm, struct kvm_mmu_page *sp) { - kvm->arch.n_used_mmu_pages += nr; - percpu_counter_add(&kvm_total_used_mmu_pages, nr); + kvm->arch.n_used_mmu_pages++; + kvm_account_pgtable_pages((void *)sp->spt, +1); } -static void kvm_mmu_free_page(struct kvm_mmu_page *sp) +static void kvm_unaccount_mmu_page(struct kvm *kvm, struct kvm_mmu_page *sp) { - MMU_WARN_ON(!is_empty_shadow_page(sp->spt)); + kvm->arch.n_used_mmu_pages--; + kvm_account_pgtable_pages((void *)sp->spt, -1); +} + +static void kvm_mmu_free_shadow_page(struct kvm_mmu_page *sp) +{ + kvm_mmu_check_sptes_at_free(sp); + hlist_del(&sp->hash_link); list_del(&sp->link); free_page((unsigned long)sp->spt); - if (!sp->role.direct) - free_page((unsigned long)sp->gfns); + free_page((unsigned long)sp->shadowed_translation); kmem_cache_free(mmu_page_header_cache, sp); } @@ -1668,49 +1828,29 @@ static unsigned kvm_page_table_hashfn(gfn_t gfn) return hash_64(gfn, KVM_MMU_HASH_SHIFT); } -static void mmu_page_add_parent_pte(struct kvm_vcpu *vcpu, +static void mmu_page_add_parent_pte(struct kvm *kvm, + struct kvm_mmu_memory_cache *cache, struct kvm_mmu_page *sp, u64 *parent_pte) { if (!parent_pte) return; - pte_list_add(vcpu, parent_pte, &sp->parent_ptes); + pte_list_add(kvm, cache, parent_pte, &sp->parent_ptes); } -static void mmu_page_remove_parent_pte(struct kvm_mmu_page *sp, +static void mmu_page_remove_parent_pte(struct kvm *kvm, struct kvm_mmu_page *sp, u64 *parent_pte) { - __pte_list_remove(parent_pte, &sp->parent_ptes); + pte_list_remove(kvm, parent_pte, &sp->parent_ptes); } -static void drop_parent_pte(struct kvm_mmu_page *sp, +static void drop_parent_pte(struct kvm *kvm, struct kvm_mmu_page *sp, u64 *parent_pte) { - mmu_page_remove_parent_pte(sp, parent_pte); + mmu_page_remove_parent_pte(kvm, sp, parent_pte); mmu_spte_clear_no_track(parent_pte); } -static struct kvm_mmu_page *kvm_mmu_alloc_page(struct kvm_vcpu *vcpu, int direct) -{ - struct kvm_mmu_page *sp; - - sp = kvm_mmu_memory_cache_alloc(&vcpu->arch.mmu_page_header_cache); - sp->spt = kvm_mmu_memory_cache_alloc(&vcpu->arch.mmu_shadow_page_cache); - if (!direct) - sp->gfns = kvm_mmu_memory_cache_alloc(&vcpu->arch.mmu_gfn_array_cache); - set_page_private(virt_to_page(sp->spt), (unsigned long)sp); - - /* - * active_mmu_pages must be a FIFO list, as kvm_zap_obsolete_pages() - * depends on valid pages being added to the head of the list. See - * comments in kvm_zap_obsolete_pages(). - */ - sp->mmu_valid_gen = vcpu->kvm->arch.mmu_valid_gen; - list_add(&sp->link, &vcpu->kvm->arch.active_mmu_pages); - kvm_mod_used_mmu_pages(vcpu->kvm, +1); - return sp; -} - static void mark_unsync(u64 *spte); static void kvm_mmu_mark_parents_unsync(struct kvm_mmu_page *sp) { @@ -1725,23 +1865,15 @@ static void kvm_mmu_mark_parents_unsync(struct kvm_mmu_page *sp) static void mark_unsync(u64 *spte) { struct kvm_mmu_page *sp; - unsigned int index; sp = sptep_to_sp(spte); - index = spte - sp->spt; - if (__test_and_set_bit(index, sp->unsync_child_bitmap)) + if (__test_and_set_bit(spte_index(spte), sp->unsync_child_bitmap)) return; if (sp->unsync_children++) return; kvm_mmu_mark_parents_unsync(sp); } -static int nonpaging_sync_page(struct kvm_vcpu *vcpu, - struct kvm_mmu_page *sp) -{ - return -1; -} - #define KVM_PAGE_ARRAY_NR 16 struct kvm_mmu_pages { @@ -1771,7 +1903,7 @@ static int mmu_pages_add(struct kvm_mmu_pages *pvec, struct kvm_mmu_page *sp, static inline void clear_unsync_child_bit(struct kvm_mmu_page *sp, int idx) { --sp->unsync_children; - WARN_ON((int)sp->unsync_children < 0); + WARN_ON_ONCE((int)sp->unsync_children < 0); __clear_bit(idx, sp->unsync_child_bitmap); } @@ -1789,7 +1921,7 @@ static int __mmu_unsync_walk(struct kvm_mmu_page *sp, continue; } - child = to_shadow_page(ent & PT64_BASE_ADDR_MASK); + child = spte_to_child_sp(ent); if (child->unsync_children) { if (mmu_pages_add(pvec, child, i)) @@ -1829,7 +1961,7 @@ static int mmu_unsync_walk(struct kvm_mmu_page *sp, static void kvm_unlink_unsync_page(struct kvm *kvm, struct kvm_mmu_page *sp) { - WARN_ON(!sp->unsync); + WARN_ON_ONCE(!sp->unsync); trace_kvm_mmu_sync_page(sp); sp->unsync = 0; --kvm->stat.mmu_unsync; @@ -1861,10 +1993,80 @@ static bool sp_has_gptes(struct kvm_mmu_page *sp) &(_kvm)->arch.mmu_page_hash[kvm_page_table_hashfn(_gfn)]) \ if ((_sp)->gfn != (_gfn) || !sp_has_gptes(_sp)) {} else +static bool kvm_sync_page_check(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp) +{ + union kvm_mmu_page_role root_role = vcpu->arch.mmu->root_role; + + /* + * Ignore various flags when verifying that it's safe to sync a shadow + * page using the current MMU context. + * + * - level: not part of the overall MMU role and will never match as the MMU's + * level tracks the root level + * - access: updated based on the new guest PTE + * - quadrant: not part of the overall MMU role (similar to level) + */ + const union kvm_mmu_page_role sync_role_ign = { + .level = 0xf, + .access = 0x7, + .quadrant = 0x3, + .passthrough = 0x1, + }; + + /* + * Direct pages can never be unsync, and KVM should never attempt to + * sync a shadow page for a different MMU context, e.g. if the role + * differs then the memslot lookup (SMM vs. non-SMM) will be bogus, the + * reserved bits checks will be wrong, etc... + */ + if (WARN_ON_ONCE(sp->role.direct || !vcpu->arch.mmu->sync_spte || + (sp->role.word ^ root_role.word) & ~sync_role_ign.word)) + return false; + + return true; +} + +static int kvm_sync_spte(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp, int i) +{ + /* sp->spt[i] has initial value of shadow page table allocation */ + if (sp->spt[i] == SHADOW_NONPRESENT_VALUE) + return 0; + + return vcpu->arch.mmu->sync_spte(vcpu, sp, i); +} + +static int __kvm_sync_page(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp) +{ + int flush = 0; + int i; + + if (!kvm_sync_page_check(vcpu, sp)) + return -1; + + for (i = 0; i < SPTE_ENT_PER_PAGE; i++) { + int ret = kvm_sync_spte(vcpu, sp, i); + + if (ret < -1) + return -1; + flush |= ret; + } + + /* + * Note, any flush is purely for KVM's correctness, e.g. when dropping + * an existing SPTE or clearing W/A/D bits to ensure an mmu_notifier + * unmap or dirty logging event doesn't fail to flush. The guest is + * responsible for flushing the TLB to ensure any changes in protection + * bits are recognized, i.e. until the guest flushes or page faults on + * a relevant address, KVM is architecturally allowed to let vCPUs use + * cached translations with the old protection bits. + */ + return flush; +} + static int kvm_sync_page(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp, struct list_head *invalid_list) { - int ret = vcpu->arch.mmu->sync_page(vcpu, sp); + int ret = __kvm_sync_page(vcpu, sp); if (ret < 0) kvm_mmu_prepare_zap_page(vcpu->kvm, sp, invalid_list); @@ -1890,8 +2092,8 @@ static bool is_obsolete_sp(struct kvm *kvm, struct kvm_mmu_page *sp) if (sp->role.invalid) return true; - /* TDP MMU pages due not use the MMU generation. */ - return !sp->tdp_mmu_page && + /* TDP MMU pages do not use the MMU generation. */ + return !is_tdp_mmu_page(sp) && unlikely(sp->mmu_valid_gen != kvm->arch.mmu_valid_gen); } @@ -1935,11 +2137,11 @@ static int mmu_pages_first(struct kvm_mmu_pages *pvec, if (pvec->nr == 0) return 0; - WARN_ON(pvec->page[0].idx != INVALID_INDEX); + WARN_ON_ONCE(pvec->page[0].idx != INVALID_INDEX); sp = pvec->page[0].sp; level = sp->role.level; - WARN_ON(level == PG_LEVEL_4K); + WARN_ON_ONCE(level == PG_LEVEL_4K); parents->parent[level-2] = sp; @@ -1961,7 +2163,7 @@ static void mmu_pages_clear_parents(struct mmu_page_path *parents) if (!sp) return; - WARN_ON(idx == INVALID_INDEX); + WARN_ON_ONCE(idx == INVALID_INDEX); clear_unsync_child_bit(sp, idx); level++; } while (!sp->unsync_children); @@ -2019,36 +2221,24 @@ static void clear_sp_write_flooding_count(u64 *spte) __clear_sp_write_flooding_count(sptep_to_sp(spte)); } -static struct kvm_mmu_page *kvm_mmu_get_page(struct kvm_vcpu *vcpu, - gfn_t gfn, - gva_t gaddr, - unsigned level, - int direct, - unsigned int access) +/* + * The vCPU is required when finding indirect shadow pages; the shadow + * page may already exist and syncing it needs the vCPU pointer in + * order to read guest page tables. Direct shadow pages are never + * unsync, thus @vcpu can be NULL if @role.direct is true. + */ +static struct kvm_mmu_page *kvm_mmu_find_shadow_page(struct kvm *kvm, + struct kvm_vcpu *vcpu, + gfn_t gfn, + struct hlist_head *sp_list, + union kvm_mmu_page_role role) { - bool direct_mmu = vcpu->arch.mmu->root_role.direct; - union kvm_mmu_page_role role; - struct hlist_head *sp_list; - unsigned quadrant; struct kvm_mmu_page *sp; int ret; int collisions = 0; LIST_HEAD(invalid_list); - role = vcpu->arch.mmu->root_role; - role.level = level; - role.direct = direct; - role.access = access; - if (role.has_4_byte_gpte) { - quadrant = gaddr >> (PAGE_SHIFT + (PT64_PT_BITS * level)); - quadrant &= (1 << ((PT32_PT_BITS - PT64_PT_BITS) * level)) - 1; - role.quadrant = quadrant; - } - if (level <= vcpu->arch.mmu->cpu_role.base.level) - role.passthrough = 0; - - sp_list = &vcpu->kvm->arch.mmu_page_hash[kvm_page_table_hashfn(gfn)]; - for_each_valid_sp(vcpu->kvm, sp, sp_list) { + for_each_valid_sp(kvm, sp, sp_list) { if (sp->gfn != gfn) { collisions++; continue; @@ -2064,16 +2254,20 @@ static struct kvm_mmu_page *kvm_mmu_get_page(struct kvm_vcpu *vcpu, * Unsync pages must not be left as is, because the new * upper-level page will be write-protected. */ - if (level > PG_LEVEL_4K && sp->unsync) - kvm_mmu_prepare_zap_page(vcpu->kvm, sp, + if (role.level > PG_LEVEL_4K && sp->unsync) + kvm_mmu_prepare_zap_page(kvm, sp, &invalid_list); continue; } - if (direct_mmu) - goto trace_get_page; + /* unsync and write-flooding only apply to indirect SPs. */ + if (sp->role.direct) + goto out; if (sp->unsync) { + if (KVM_BUG_ON(!vcpu, kvm)) + break; + /* * The page is good, but is stale. kvm_sync_page does * get the latest guest state, but (unlike mmu_unsync_children) @@ -2090,39 +2284,164 @@ static struct kvm_mmu_page *kvm_mmu_get_page(struct kvm_vcpu *vcpu, if (ret < 0) break; - WARN_ON(!list_empty(&invalid_list)); + WARN_ON_ONCE(!list_empty(&invalid_list)); if (ret > 0) - kvm_flush_remote_tlbs(vcpu->kvm); + kvm_flush_remote_tlbs(kvm); } __clear_sp_write_flooding_count(sp); -trace_get_page: - trace_kvm_mmu_get_page(sp, false); goto out; } - ++vcpu->kvm->stat.mmu_cache_miss; + sp = NULL; + ++kvm->stat.mmu_cache_miss; - sp = kvm_mmu_alloc_page(vcpu, direct); +out: + kvm_mmu_commit_zap_page(kvm, &invalid_list); + + if (collisions > kvm->stat.max_mmu_page_hash_collisions) + kvm->stat.max_mmu_page_hash_collisions = collisions; + return sp; +} + +/* Caches used when allocating a new shadow page. */ +struct shadow_page_caches { + struct kvm_mmu_memory_cache *page_header_cache; + struct kvm_mmu_memory_cache *shadow_page_cache; + struct kvm_mmu_memory_cache *shadowed_info_cache; +}; + +static struct kvm_mmu_page *kvm_mmu_alloc_shadow_page(struct kvm *kvm, + struct shadow_page_caches *caches, + gfn_t gfn, + struct hlist_head *sp_list, + union kvm_mmu_page_role role) +{ + struct kvm_mmu_page *sp; + + sp = kvm_mmu_memory_cache_alloc(caches->page_header_cache); + sp->spt = kvm_mmu_memory_cache_alloc(caches->shadow_page_cache); + if (!role.direct && role.level <= KVM_MAX_HUGEPAGE_LEVEL) + sp->shadowed_translation = kvm_mmu_memory_cache_alloc(caches->shadowed_info_cache); + + set_page_private(virt_to_page(sp->spt), (unsigned long)sp); + + INIT_LIST_HEAD(&sp->possible_nx_huge_page_link); + + /* + * active_mmu_pages must be a FIFO list, as kvm_zap_obsolete_pages() + * depends on valid pages being added to the head of the list. See + * comments in kvm_zap_obsolete_pages(). + */ + sp->mmu_valid_gen = kvm->arch.mmu_valid_gen; + list_add(&sp->link, &kvm->arch.active_mmu_pages); + kvm_account_mmu_page(kvm, sp); sp->gfn = gfn; sp->role = role; hlist_add_head(&sp->hash_link, sp_list); - if (sp_has_gptes(sp)) { - account_shadowed(vcpu->kvm, sp); - if (level == PG_LEVEL_4K && kvm_vcpu_write_protect_gfn(vcpu, gfn)) - kvm_flush_remote_tlbs_with_address(vcpu->kvm, gfn, 1); + if (sp_has_gptes(sp)) + account_shadowed(kvm, sp); + + return sp; +} + +/* Note, @vcpu may be NULL if @role.direct is true; see kvm_mmu_find_shadow_page. */ +static struct kvm_mmu_page *__kvm_mmu_get_shadow_page(struct kvm *kvm, + struct kvm_vcpu *vcpu, + struct shadow_page_caches *caches, + gfn_t gfn, + union kvm_mmu_page_role role) +{ + struct hlist_head *sp_list; + struct kvm_mmu_page *sp; + bool created = false; + + sp_list = &kvm->arch.mmu_page_hash[kvm_page_table_hashfn(gfn)]; + + sp = kvm_mmu_find_shadow_page(kvm, vcpu, gfn, sp_list, role); + if (!sp) { + created = true; + sp = kvm_mmu_alloc_shadow_page(kvm, caches, gfn, sp_list, role); } - trace_kvm_mmu_get_page(sp, true); -out: - kvm_mmu_commit_zap_page(vcpu->kvm, &invalid_list); - if (collisions > vcpu->kvm->stat.max_mmu_page_hash_collisions) - vcpu->kvm->stat.max_mmu_page_hash_collisions = collisions; + trace_kvm_mmu_get_page(sp, created); return sp; } +static struct kvm_mmu_page *kvm_mmu_get_shadow_page(struct kvm_vcpu *vcpu, + gfn_t gfn, + union kvm_mmu_page_role role) +{ + struct shadow_page_caches caches = { + .page_header_cache = &vcpu->arch.mmu_page_header_cache, + .shadow_page_cache = &vcpu->arch.mmu_shadow_page_cache, + .shadowed_info_cache = &vcpu->arch.mmu_shadowed_info_cache, + }; + + return __kvm_mmu_get_shadow_page(vcpu->kvm, vcpu, &caches, gfn, role); +} + +static union kvm_mmu_page_role kvm_mmu_child_role(u64 *sptep, bool direct, + unsigned int access) +{ + struct kvm_mmu_page *parent_sp = sptep_to_sp(sptep); + union kvm_mmu_page_role role; + + role = parent_sp->role; + role.level--; + role.access = access; + role.direct = direct; + role.passthrough = 0; + + /* + * If the guest has 4-byte PTEs then that means it's using 32-bit, + * 2-level, non-PAE paging. KVM shadows such guests with PAE paging + * (i.e. 8-byte PTEs). The difference in PTE size means that KVM must + * shadow each guest page table with multiple shadow page tables, which + * requires extra bookkeeping in the role. + * + * Specifically, to shadow the guest's page directory (which covers a + * 4GiB address space), KVM uses 4 PAE page directories, each mapping + * 1GiB of the address space. @role.quadrant encodes which quarter of + * the address space each maps. + * + * To shadow the guest's page tables (which each map a 4MiB region), KVM + * uses 2 PAE page tables, each mapping a 2MiB region. For these, + * @role.quadrant encodes which half of the region they map. + * + * Concretely, a 4-byte PDE consumes bits 31:22, while an 8-byte PDE + * consumes bits 29:21. To consume bits 31:30, KVM's uses 4 shadow + * PDPTEs; those 4 PAE page directories are pre-allocated and their + * quadrant is assigned in mmu_alloc_root(). A 4-byte PTE consumes + * bits 21:12, while an 8-byte PTE consumes bits 20:12. To consume + * bit 21 in the PTE (the child here), KVM propagates that bit to the + * quadrant, i.e. sets quadrant to '0' or '1'. The parent 8-byte PDE + * covers bit 21 (see above), thus the quadrant is calculated from the + * _least_ significant bit of the PDE index. + */ + if (role.has_4_byte_gpte) { + WARN_ON_ONCE(role.level != PG_LEVEL_4K); + role.quadrant = spte_index(sptep) & 1; + } + + return role; +} + +static struct kvm_mmu_page *kvm_mmu_get_child_sp(struct kvm_vcpu *vcpu, + u64 *sptep, gfn_t gfn, + bool direct, unsigned int access) +{ + union kvm_mmu_page_role role; + + if (is_shadow_present_pte(*sptep) && !is_large_pte(*sptep)) + return ERR_PTR(-EEXIST); + + role = kvm_mmu_child_role(sptep, direct, access); + return kvm_mmu_get_shadow_page(vcpu, gfn, role); +} + static void shadow_walk_init_using_root(struct kvm_shadow_walk_iterator *iterator, struct kvm_vcpu *vcpu, hpa_t root, u64 addr) @@ -2145,7 +2464,7 @@ static void shadow_walk_init_using_root(struct kvm_shadow_walk_iterator *iterato iterator->shadow_addr = vcpu->arch.mmu->pae_root[(addr >> 30) & 3]; - iterator->shadow_addr &= PT64_BASE_ADDR_MASK; + iterator->shadow_addr &= SPTE_BASE_ADDR_MASK; --iterator->level; if (!iterator->shadow_addr) iterator->level = 0; @@ -2164,7 +2483,7 @@ static bool shadow_walk_okay(struct kvm_shadow_walk_iterator *iterator) if (iterator->level < PG_LEVEL_4K) return false; - iterator->index = SHADOW_PT_INDEX(iterator->addr, iterator->level); + iterator->index = SPTE_INDEX(iterator->addr, iterator->level); iterator->sptep = ((u64 *)__va(iterator->shadow_addr)) + iterator->index; return true; } @@ -2177,7 +2496,7 @@ static void __shadow_walk_next(struct kvm_shadow_walk_iterator *iterator, return; } - iterator->shadow_addr = spte & PT64_BASE_ADDR_MASK; + iterator->shadow_addr = spte & SPTE_BASE_ADDR_MASK; --iterator->level; } @@ -2186,23 +2505,47 @@ static void shadow_walk_next(struct kvm_shadow_walk_iterator *iterator) __shadow_walk_next(iterator, *iterator->sptep); } -static void link_shadow_page(struct kvm_vcpu *vcpu, u64 *sptep, - struct kvm_mmu_page *sp) +static void __link_shadow_page(struct kvm *kvm, + struct kvm_mmu_memory_cache *cache, u64 *sptep, + struct kvm_mmu_page *sp, bool flush) { u64 spte; BUILD_BUG_ON(VMX_EPT_WRITABLE_MASK != PT_WRITABLE_MASK); + /* + * If an SPTE is present already, it must be a leaf and therefore + * a large one. Drop it, and flush the TLB if needed, before + * installing sp. + */ + if (is_shadow_present_pte(*sptep)) + drop_large_spte(kvm, sptep, flush); + spte = make_nonleaf_spte(sp->spt, sp_ad_disabled(sp)); mmu_spte_set(sptep, spte); - mmu_page_add_parent_pte(vcpu, sp, sptep); + mmu_page_add_parent_pte(kvm, cache, sp, sptep); - if (sp->unsync_children || sp->unsync) + /* + * The non-direct sub-pagetable must be updated before linking. For + * L1 sp, the pagetable is updated via kvm_sync_page() in + * kvm_mmu_find_shadow_page() without write-protecting the gfn, + * so sp->unsync can be true or false. For higher level non-direct + * sp, the pagetable is updated/synced via mmu_sync_children() in + * FNAME(fetch)(), so sp->unsync_children can only be false. + * WARN_ON_ONCE() if anything happens unexpectedly. + */ + if (WARN_ON_ONCE(sp->unsync_children) || sp->unsync) mark_unsync(sptep); } +static void link_shadow_page(struct kvm_vcpu *vcpu, u64 *sptep, + struct kvm_mmu_page *sp) +{ + __link_shadow_page(vcpu->kvm, &vcpu->arch.mmu_pte_list_desc_cache, sptep, sp, true); +} + static void validate_direct_spte(struct kvm_vcpu *vcpu, u64 *sptep, unsigned direct_access) { @@ -2216,12 +2559,12 @@ static void validate_direct_spte(struct kvm_vcpu *vcpu, u64 *sptep, * so we should update the spte at this point to get * a new sp with the correct access. */ - child = to_shadow_page(*sptep & PT64_BASE_ADDR_MASK); + child = spte_to_child_sp(*sptep); if (child->role.access == direct_access) return; - drop_parent_pte(child, sptep); - kvm_flush_remote_tlbs_with_address(vcpu->kvm, child->gfn, 1); + drop_parent_pte(vcpu->kvm, child, sptep); + kvm_flush_remote_tlbs_sptep(vcpu->kvm, sptep); } } @@ -2237,8 +2580,8 @@ static int mmu_page_zap_pte(struct kvm *kvm, struct kvm_mmu_page *sp, if (is_last_spte(pte, sp->role.level)) { drop_spte(kvm, spte); } else { - child = to_shadow_page(pte & PT64_BASE_ADDR_MASK); - drop_parent_pte(child, spte); + child = spte_to_child_sp(pte); + drop_parent_pte(kvm, child, spte); /* * Recursively zap nested TDP SPs, parentless SPs are @@ -2246,11 +2589,12 @@ static int mmu_page_zap_pte(struct kvm *kvm, struct kvm_mmu_page *sp, * avoids retaining a large number of stale nested SPs. */ if (tdp_enabled && invalid_list && - child->role.guest_mode && !child->parent_ptes.val) + child->role.guest_mode && + !atomic_long_read(&child->parent_ptes.val)) return kvm_mmu_prepare_zap_page(kvm, child, invalid_list); } - } else if (is_mmio_spte(pte)) { + } else if (is_mmio_spte(kvm, pte)) { mmu_spte_clear_no_track(spte); } return 0; @@ -2263,19 +2607,19 @@ static int kvm_mmu_page_unlink_children(struct kvm *kvm, int zapped = 0; unsigned i; - for (i = 0; i < PT64_ENT_PER_PAGE; ++i) + for (i = 0; i < SPTE_ENT_PER_PAGE; ++i) zapped += mmu_page_zap_pte(kvm, sp, sp->spt + i, invalid_list); return zapped; } -static void kvm_mmu_unlink_parents(struct kvm_mmu_page *sp) +static void kvm_mmu_unlink_parents(struct kvm *kvm, struct kvm_mmu_page *sp) { u64 *sptep; struct rmap_iterator iter; while ((sptep = rmap_get_first(&sp->parent_ptes, &iter))) - drop_parent_pte(sp, sptep); + drop_parent_pte(kvm, sp, sptep); } static int mmu_zap_unsync_children(struct kvm *kvm, @@ -2309,11 +2653,12 @@ static bool __kvm_mmu_prepare_zap_page(struct kvm *kvm, { bool list_unstable, zapped_root = false; + lockdep_assert_held_write(&kvm->mmu_lock); trace_kvm_mmu_prepare_zap_page(sp); ++kvm->stat.mmu_shadow_zapped; *nr_zapped = mmu_zap_unsync_children(kvm, sp, invalid_list); *nr_zapped += kvm_mmu_page_unlink_children(kvm, sp, invalid_list); - kvm_mmu_unlink_parents(sp); + kvm_mmu_unlink_parents(kvm, sp); /* Zapping children means active_mmu_pages has become unstable. */ list_unstable = *nr_zapped; @@ -2336,7 +2681,7 @@ static bool __kvm_mmu_prepare_zap_page(struct kvm *kvm, list_add(&sp->link, invalid_list); else list_move(&sp->link, invalid_list); - kvm_mod_used_mmu_pages(kvm, -1); + kvm_unaccount_mmu_page(kvm, sp); } else { /* * Remove the active root from the active page list, the root @@ -2352,8 +2697,8 @@ static bool __kvm_mmu_prepare_zap_page(struct kvm *kvm, zapped_root = !is_obsolete_sp(kvm, sp); } - if (sp->lpage_disallowed) - unaccount_huge_nx_page(kvm, sp); + if (sp->nx_huge_page_disallowed) + unaccount_nx_huge_page(kvm, sp); sp->role.invalid = 1; @@ -2395,8 +2740,8 @@ static void kvm_mmu_commit_zap_page(struct kvm *kvm, kvm_flush_remote_tlbs(kvm); list_for_each_entry_safe(sp, nsp, invalid_list, link) { - WARN_ON(!sp->role.invalid || sp->root_count); - kvm_mmu_free_page(sp); + WARN_ON_ONCE(!sp->role.invalid || sp->root_count); + kvm_mmu_free_shadow_page(sp); } } @@ -2489,39 +2834,49 @@ void kvm_mmu_change_mmu_pages(struct kvm *kvm, unsigned long goal_nr_mmu_pages) write_unlock(&kvm->mmu_lock); } -int kvm_mmu_unprotect_page(struct kvm *kvm, gfn_t gfn) +bool __kvm_mmu_unprotect_gfn_and_retry(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa, + bool always_retry) { - struct kvm_mmu_page *sp; + struct kvm *kvm = vcpu->kvm; LIST_HEAD(invalid_list); - int r; + struct kvm_mmu_page *sp; + gpa_t gpa = cr2_or_gpa; + bool r = false; + + /* + * Bail early if there aren't any write-protected shadow pages to avoid + * unnecessarily taking mmu_lock lock, e.g. if the gfn is write-tracked + * by a third party. Reading indirect_shadow_pages without holding + * mmu_lock is safe, as this is purely an optimization, i.e. a false + * positive is benign, and a false negative will simply result in KVM + * skipping the unprotect+retry path, which is also an optimization. + */ + if (!READ_ONCE(kvm->arch.indirect_shadow_pages)) + goto out; + + if (!vcpu->arch.mmu->root_role.direct) { + gpa = kvm_mmu_gva_to_gpa_write(vcpu, cr2_or_gpa, NULL); + if (gpa == INVALID_GPA) + goto out; + } - pgprintk("%s: looking for gfn %llx\n", __func__, gfn); - r = 0; write_lock(&kvm->mmu_lock); - for_each_gfn_valid_sp_with_gptes(kvm, sp, gfn) { - pgprintk("%s: gfn %llx role %x\n", __func__, gfn, - sp->role.word); - r = 1; + for_each_gfn_valid_sp_with_gptes(kvm, sp, gpa_to_gfn(gpa)) kvm_mmu_prepare_zap_page(kvm, sp, &invalid_list); - } + + /* + * Snapshot the result before zapping, as zapping will remove all list + * entries, i.e. checking the list later would yield a false negative. + */ + r = !list_empty(&invalid_list); kvm_mmu_commit_zap_page(kvm, &invalid_list); write_unlock(&kvm->mmu_lock); - return r; -} - -static int kvm_mmu_unprotect_page_virt(struct kvm_vcpu *vcpu, gva_t gva) -{ - gpa_t gpa; - int r; - - if (vcpu->arch.mmu->root_role.direct) - return 0; - - gpa = kvm_mmu_gva_to_gpa_read(vcpu, gva, NULL); - - r = kvm_mmu_unprotect_page(vcpu->kvm, gpa >> PAGE_SHIFT); - +out: + if (r || always_retry) { + vcpu->arch.last_retry_eip = kvm_rip_read(vcpu); + vcpu->arch.last_retry_addr = cr2_or_gpa; + } return r; } @@ -2541,7 +2896,7 @@ static void kvm_unsync_page(struct kvm *kvm, struct kvm_mmu_page *sp) * be write-protected. */ int mmu_try_to_unsync_pages(struct kvm *kvm, const struct kvm_memory_slot *slot, - gfn_t gfn, bool can_unsync, bool prefetch) + gfn_t gfn, bool synchronizing, bool prefetch) { struct kvm_mmu_page *sp; bool locked = false; @@ -2551,17 +2906,17 @@ int mmu_try_to_unsync_pages(struct kvm *kvm, const struct kvm_memory_slot *slot, * track machinery is used to write-protect upper-level shadow pages, * i.e. this guards the role.level == 4K assertion below! */ - if (kvm_slot_page_track_is_active(kvm, slot, gfn, KVM_PAGE_TRACK_WRITE)) + if (kvm_gfn_is_write_tracked(kvm, slot, gfn)) return -EPERM; /* * The page is not write-tracked, mark existing shadow pages unsync - * unless KVM is synchronizing an unsync SP (can_unsync = false). In - * that case, KVM must complete emulation of the guest TLB flush before - * allowing shadow pages to become unsync (writable by the guest). + * unless KVM is synchronizing an unsync SP. In that case, KVM must + * complete emulation of the guest TLB flush before allowing shadow + * pages to become unsync (writable by the guest). */ for_each_gfn_valid_sp_with_gptes(kvm, sp, gfn) { - if (!can_unsync) + if (synchronizing) return -EPERM; if (sp->unsync) @@ -2584,16 +2939,16 @@ int mmu_try_to_unsync_pages(struct kvm *kvm, const struct kvm_memory_slot *slot, /* * Recheck after taking the spinlock, a different vCPU * may have since marked the page unsync. A false - * positive on the unprotected check above is not + * negative on the unprotected check above is not * possible as clearing sp->unsync _must_ hold mmu_lock - * for write, i.e. unsync cannot transition from 0->1 + * for write, i.e. unsync cannot transition from 1->0 * while this CPU holds mmu_lock for read (or write). */ if (READ_ONCE(sp->unsync)) continue; } - WARN_ON(sp->role.level != PG_LEVEL_4K); + WARN_ON_ONCE(sp->role.level != PG_LEVEL_4K); kvm_unsync_page(kvm, sp); } if (locked) @@ -2658,9 +3013,6 @@ static int mmu_set_spte(struct kvm_vcpu *vcpu, struct kvm_memory_slot *slot, bool prefetch = !fault || fault->prefetch; bool write_fault = fault && fault->write; - pgprintk("%s: spte %llx write_fault %d gfn %llx\n", __func__, - *sptep, write_fault, gfn); - if (unlikely(is_noslot_pfn(pfn))) { vcpu->stat.pf_mmio_spte_created++; mark_mmio_spte(vcpu, sptep, gfn, pte_access); @@ -2668,6 +3020,10 @@ static int mmu_set_spte(struct kvm_vcpu *vcpu, struct kvm_memory_slot *slot, } if (is_shadow_present_pte(*sptep)) { + if (prefetch && is_last_spte(*sptep, level) && + pfn == spte_to_pfn(*sptep)) + return RET_PF_SPURIOUS; + /* * If we overwrite a PTE page pointer with a 2MB PMD, unlink * the parent of the now unreachable PTE. @@ -2676,12 +3032,10 @@ static int mmu_set_spte(struct kvm_vcpu *vcpu, struct kvm_memory_slot *slot, struct kvm_mmu_page *child; u64 pte = *sptep; - child = to_shadow_page(pte & PT64_BASE_ADDR_MASK); - drop_parent_pte(child, sptep); + child = spte_to_child_sp(pte); + drop_parent_pte(vcpu->kvm, child, sptep); flush = true; - } else if (pfn != spte_to_pfn(*sptep)) { - pgprintk("hfn old %llx new %llx\n", - spte_to_pfn(*sptep), pfn); + } else if (WARN_ON_ONCE(pfn != spte_to_pfn(*sptep))) { drop_spte(vcpu->kvm, sptep); flush = true; } else @@ -2689,7 +3043,7 @@ static int mmu_set_spte(struct kvm_vcpu *vcpu, struct kvm_memory_slot *slot, } wrprot = make_spte(vcpu, sp, slot, pte_access, gfn, pfn, *sptep, prefetch, - true, host_writable, &spte); + false, host_writable, &spte); if (*sptep == spte) { ret = RET_PF_SPURIOUS; @@ -2698,52 +3052,68 @@ static int mmu_set_spte(struct kvm_vcpu *vcpu, struct kvm_memory_slot *slot, trace_kvm_mmu_set_spte(level, gfn, sptep); } - if (wrprot) { - if (write_fault) - ret = RET_PF_EMULATE; - } + if (wrprot && write_fault) + ret = RET_PF_WRITE_PROTECTED; if (flush) - kvm_flush_remote_tlbs_with_address(vcpu->kvm, gfn, - KVM_PAGES_PER_HPAGE(level)); - - pgprintk("%s: setting spte %llx\n", __func__, *sptep); + kvm_flush_remote_tlbs_gfn(vcpu->kvm, gfn, level); if (!was_rmapped) { WARN_ON_ONCE(ret == RET_PF_SPURIOUS); - kvm_update_page_stats(vcpu->kvm, level, 1); - rmap_add(vcpu, slot, sptep, gfn); + rmap_add(vcpu, slot, sptep, gfn, pte_access); + } else { + /* Already rmapped but the pte_access bits may have changed. */ + kvm_mmu_page_set_access(sp, spte_index(sptep), pte_access); } return ret; } -static int direct_pte_prefetch_many(struct kvm_vcpu *vcpu, - struct kvm_mmu_page *sp, - u64 *start, u64 *end) +static bool kvm_mmu_prefetch_sptes(struct kvm_vcpu *vcpu, gfn_t gfn, u64 *sptep, + int nr_pages, unsigned int access) { struct page *pages[PTE_PREFETCH_NUM]; struct kvm_memory_slot *slot; - unsigned int access = sp->role.access; - int i, ret; - gfn_t gfn; + int i; + + if (WARN_ON_ONCE(nr_pages > PTE_PREFETCH_NUM)) + return false; - gfn = kvm_mmu_page_get_gfn(sp, start - sp->spt); slot = gfn_to_memslot_dirty_bitmap(vcpu, gfn, access & ACC_WRITE_MASK); if (!slot) - return -1; + return false; - ret = gfn_to_page_many_atomic(slot, gfn, pages, end - start); - if (ret <= 0) - return -1; + nr_pages = kvm_prefetch_pages(slot, gfn, pages, nr_pages); + if (nr_pages <= 0) + return false; - for (i = 0; i < ret; i++, gfn++, start++) { - mmu_set_spte(vcpu, slot, start, access, gfn, + for (i = 0; i < nr_pages; i++, gfn++, sptep++) { + mmu_set_spte(vcpu, slot, sptep, access, gfn, page_to_pfn(pages[i]), NULL); - put_page(pages[i]); + + /* + * KVM always prefetches writable pages from the primary MMU, + * and KVM can make its SPTE writable in the fast page handler, + * without notifying the primary MMU. Mark pages/folios dirty + * now to ensure file data is written back if it ends up being + * written by the guest. Because KVM's prefetching GUPs + * writable PTEs, the probability of unnecessary writeback is + * extremely low. + */ + kvm_release_page_dirty(pages[i]); } - return 0; + return true; +} + +static bool direct_pte_prefetch_many(struct kvm_vcpu *vcpu, + struct kvm_mmu_page *sp, + u64 *start, u64 *end) +{ + gfn_t gfn = kvm_mmu_page_get_gfn(sp, spte_index(start)); + unsigned int access = sp->role.access; + + return kvm_mmu_prefetch_sptes(vcpu, gfn, start, end - start, access); } static void __direct_pte_prefetch(struct kvm_vcpu *vcpu, @@ -2752,17 +3122,18 @@ static void __direct_pte_prefetch(struct kvm_vcpu *vcpu, u64 *spte, *start = NULL; int i; - WARN_ON(!sp->role.direct); + WARN_ON_ONCE(!sp->role.direct); - i = (sptep - sp->spt) & ~(PTE_PREFETCH_NUM - 1); + i = spte_index(sptep) & ~(PTE_PREFETCH_NUM - 1); spte = sp->spt + i; for (i = 0; i < PTE_PREFETCH_NUM; i++, spte++) { if (is_shadow_present_pte(*spte) || spte == sptep) { if (!start) continue; - if (direct_pte_prefetch_many(vcpu, sp, start, spte) < 0) + if (!direct_pte_prefetch_many(vcpu, sp, start, spte)) return; + start = NULL; } else if (!start) start = spte; @@ -2792,26 +3163,48 @@ static void direct_pte_prefetch(struct kvm_vcpu *vcpu, u64 *sptep) * If addresses are being invalidated, skip prefetching to avoid * accidentally prefetching those addresses. */ - if (unlikely(vcpu->kvm->mmu_notifier_count)) + if (unlikely(vcpu->kvm->mmu_invalidate_in_progress)) return; __direct_pte_prefetch(vcpu, sp, sptep); } -static int host_pfn_mapping_level(struct kvm *kvm, gfn_t gfn, kvm_pfn_t pfn, +/* + * Lookup the mapping level for @gfn in the current mm. + * + * WARNING! Use of host_pfn_mapping_level() requires the caller and the end + * consumer to be tied into KVM's handlers for MMU notifier events! + * + * There are several ways to safely use this helper: + * + * - Check mmu_invalidate_retry_gfn() after grabbing the mapping level, before + * consuming it. In this case, mmu_lock doesn't need to be held during the + * lookup, but it does need to be held while checking the MMU notifier. + * + * - Hold mmu_lock AND ensure there is no in-progress MMU notifier invalidation + * event for the hva. This can be done by explicit checking the MMU notifier + * or by ensuring that KVM already has a valid mapping that covers the hva. + * + * - Do not use the result to install new mappings, e.g. use the host mapping + * level only to decide whether or not to zap an entry. In this case, it's + * not required to hold mmu_lock (though it's highly likely the caller will + * want to hold mmu_lock anyways, e.g. to modify SPTEs). + * + * Note! The lookup can still race with modifications to host page tables, but + * the above "rules" ensure KVM will not _consume_ the result of the walk if a + * race with the primary MMU occurs. + */ +static int host_pfn_mapping_level(struct kvm *kvm, gfn_t gfn, const struct kvm_memory_slot *slot) { + int level = PG_LEVEL_4K; unsigned long hva; unsigned long flags; - int level = PG_LEVEL_4K; pgd_t pgd; p4d_t p4d; pud_t pud; pmd_t pmd; - if (!PageCompound(pfn_to_page(pfn)) && !kvm_is_zone_device_pfn(pfn)) - return PG_LEVEL_4K; - /* * Note, using the already-retrieved memslot and __gfn_to_hva_memslot() * is not solely for performance, it's also necessary to avoid the @@ -2823,16 +3216,19 @@ static int host_pfn_mapping_level(struct kvm *kvm, gfn_t gfn, kvm_pfn_t pfn, hva = __gfn_to_hva_memslot(slot, gfn); /* - * Lookup the mapping level in the current mm. The information - * may become stale soon, but it is safe to use as long as - * 1) mmu_notifier_retry was checked after taking mmu_lock, and - * 2) mmu_lock is taken now. - * - * We still need to disable IRQs to prevent concurrent tear down - * of page tables. + * Disable IRQs to prevent concurrent tear down of host page tables, + * e.g. if the primary MMU promotes a P*D to a huge page and then frees + * the original page table. */ local_irq_save(flags); + /* + * Read each entry once. As above, a non-leaf entry can be promoted to + * a huge page _during_ this walk. Re-reading the entry could send the + * walk into the weeks, e.g. p*d_leaf() returns false (sees the old + * value) and then p*d_offset() walks into the target huge page instead + * of the old page table (sees the new value). + */ pgd = READ_ONCE(*pgd_offset(kvm->mm, hva)); if (pgd_none(pgd)) goto out; @@ -2845,7 +3241,7 @@ static int host_pfn_mapping_level(struct kvm *kvm, gfn_t gfn, kvm_pfn_t pfn, if (pud_none(pud) || !pud_present(pud)) goto out; - if (pud_large(pud)) { + if (pud_leaf(pud)) { level = PG_LEVEL_1G; goto out; } @@ -2854,7 +3250,7 @@ static int host_pfn_mapping_level(struct kvm *kvm, gfn_t gfn, kvm_pfn_t pfn, if (pmd_none(pmd) || !pmd_present(pmd)) goto out; - if (pmd_large(pmd)) + if (pmd_leaf(pmd)) level = PG_LEVEL_2M; out: @@ -2862,9 +3258,9 @@ out: return level; } -int kvm_mmu_max_mapping_level(struct kvm *kvm, - const struct kvm_memory_slot *slot, gfn_t gfn, - kvm_pfn_t pfn, int max_level) +static int __kvm_mmu_max_mapping_level(struct kvm *kvm, + const struct kvm_memory_slot *slot, + gfn_t gfn, int max_level, bool is_private) { struct kvm_lpage_info *linfo; int host_level; @@ -2876,13 +3272,25 @@ int kvm_mmu_max_mapping_level(struct kvm *kvm, break; } + if (is_private) + return max_level; + if (max_level == PG_LEVEL_4K) return PG_LEVEL_4K; - host_level = host_pfn_mapping_level(kvm, gfn, pfn, slot); + host_level = host_pfn_mapping_level(kvm, gfn, slot); return min(host_level, max_level); } +int kvm_mmu_max_mapping_level(struct kvm *kvm, + const struct kvm_memory_slot *slot, gfn_t gfn) +{ + bool is_private = kvm_slot_can_be_private(slot) && + kvm_mem_is_private(kvm, gfn); + + return __kvm_mmu_max_mapping_level(kvm, slot, gfn, PG_LEVEL_NUM, is_private); +} + void kvm_mmu_hugepage_adjust(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault) { struct kvm_memory_slot *slot = fault->slot; @@ -2893,7 +3301,7 @@ void kvm_mmu_hugepage_adjust(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault if (unlikely(fault->max_level == PG_LEVEL_4K)) return; - if (is_error_noslot_pfn(fault->pfn) || kvm_is_reserved_pfn(fault->pfn)) + if (is_error_noslot_pfn(fault->pfn)) return; if (kvm_slot_dirty_track_enabled(slot)) @@ -2903,14 +3311,14 @@ void kvm_mmu_hugepage_adjust(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault * Enforce the iTLB multihit workaround after capturing the requested * level, which will be used to do precise, accurate accounting. */ - fault->req_level = kvm_mmu_max_mapping_level(vcpu->kvm, slot, - fault->gfn, fault->pfn, - fault->max_level); + fault->req_level = __kvm_mmu_max_mapping_level(vcpu->kvm, slot, + fault->gfn, fault->max_level, + fault->is_private); if (fault->req_level == PG_LEVEL_4K || fault->huge_page_disallowed) return; /* - * mmu_notifier_retry() was successful and mmu_lock is held, so + * mmu_invalidate_retry() was successful and mmu_lock is held, so * the pmd can't be split from under us. */ fault->goal_level = fault->req_level; @@ -2924,13 +3332,14 @@ void disallowed_hugepage_adjust(struct kvm_page_fault *fault, u64 spte, int cur_ if (cur_level > PG_LEVEL_4K && cur_level == fault->goal_level && is_shadow_present_pte(spte) && - !is_large_pte(spte)) { + !is_large_pte(spte) && + spte_to_child_sp(spte)->nx_huge_page_disallowed) { /* - * A small SPTE exists for this pfn, but FNAME(fetch) - * and __direct_map would like to create a large PTE - * instead: just force them to go down another level, - * patching back for them into pfn the next 9 bits of - * the address. + * A small SPTE exists for this pfn, but FNAME(fetch), + * direct_map(), or kvm_tdp_mmu_map() would like to create a + * large PTE instead: just force them to go down another level, + * patching back for them into pfn the next 9 bits of the + * address. */ u64 page_mask = KVM_PAGES_PER_HPAGE(cur_level) - KVM_PAGES_PER_HPAGE(cur_level - 1); @@ -2939,7 +3348,7 @@ void disallowed_hugepage_adjust(struct kvm_page_fault *fault, u64 spte, int cur_ } } -static int __direct_map(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault) +static int direct_map(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault) { struct kvm_shadow_walk_iterator it; struct kvm_mmu_page *sp; @@ -2957,21 +3366,18 @@ static int __direct_map(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault) if (fault->nx_huge_page_workaround_enabled) disallowed_hugepage_adjust(fault, *it.sptep, it.level); - base_gfn = fault->gfn & ~(KVM_PAGES_PER_HPAGE(it.level) - 1); + base_gfn = gfn_round_for_level(fault->gfn, it.level); if (it.level == fault->goal_level) break; - drop_large_spte(vcpu, it.sptep); - if (is_shadow_present_pte(*it.sptep)) + sp = kvm_mmu_get_child_sp(vcpu, it.sptep, base_gfn, true, ACC_ALL); + if (sp == ERR_PTR(-EEXIST)) continue; - sp = kvm_mmu_get_page(vcpu, base_gfn, it.addr, - it.level - 1, true, ACC_ALL); - link_shadow_page(vcpu, it.sptep, sp); - if (fault->is_tdp && fault->huge_page_disallowed && - fault->req_level >= it.level) - account_huge_nx_page(vcpu->kvm, sp); + if (fault->huge_page_disallowed) + account_nx_huge_page(vcpu->kvm, sp, + fault->req_level >= it.level); } if (WARN_ON_ONCE(it.level != fault->goal_level)) @@ -2986,60 +3392,76 @@ static int __direct_map(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault) return ret; } -static void kvm_send_hwpoison_signal(unsigned long address, struct task_struct *tsk) +static void kvm_send_hwpoison_signal(struct kvm_memory_slot *slot, gfn_t gfn) { - send_sig_mceerr(BUS_MCEERR_AR, (void __user *)address, PAGE_SHIFT, tsk); + unsigned long hva = gfn_to_hva_memslot(slot, gfn); + + send_sig_mceerr(BUS_MCEERR_AR, (void __user *)hva, PAGE_SHIFT, current); } -static int kvm_handle_bad_page(struct kvm_vcpu *vcpu, gfn_t gfn, kvm_pfn_t pfn) +static int kvm_handle_error_pfn(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault) { + if (is_sigpending_pfn(fault->pfn)) { + kvm_handle_signal_exit(vcpu); + return -EINTR; + } + /* * Do not cache the mmio info caused by writing the readonly gfn * into the spte otherwise read access on readonly gfn also can * caused mmio page fault and treat it as mmio access. */ - if (pfn == KVM_PFN_ERR_RO_FAULT) + if (fault->pfn == KVM_PFN_ERR_RO_FAULT) return RET_PF_EMULATE; - if (pfn == KVM_PFN_ERR_HWPOISON) { - kvm_send_hwpoison_signal(kvm_vcpu_gfn_to_hva(vcpu, gfn), current); + if (fault->pfn == KVM_PFN_ERR_HWPOISON) { + kvm_send_hwpoison_signal(fault->slot, fault->gfn); return RET_PF_RETRY; } return -EFAULT; } -static int handle_abnormal_pfn(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault, - unsigned int access) +static int kvm_handle_noslot_fault(struct kvm_vcpu *vcpu, + struct kvm_page_fault *fault, + unsigned int access) { - /* The pfn is invalid, report the error! */ - if (unlikely(is_error_pfn(fault->pfn))) - return kvm_handle_bad_page(vcpu, fault->gfn, fault->pfn); + gva_t gva = fault->is_tdp ? 0 : fault->addr; - if (unlikely(!fault->slot)) { - gva_t gva = fault->is_tdp ? 0 : fault->addr; - - vcpu_cache_mmio_info(vcpu, gva, fault->gfn, - access & shadow_mmio_access_mask); - /* - * If MMIO caching is disabled, emulate immediately without - * touching the shadow page tables as attempting to install an - * MMIO SPTE will just be an expensive nop. Do not cache MMIO - * whose gfn is greater than host.MAXPHYADDR, any guest that - * generates such gfns is running nested and is being tricked - * by L0 userspace (you can observe gfn > L1.MAXPHYADDR if - * and only if L1's MAXPHYADDR is inaccurate with respect to - * the hardware's). - */ - if (unlikely(!enable_mmio_caching) || - unlikely(fault->gfn > kvm_mmu_max_gfn())) - return RET_PF_EMULATE; + if (fault->is_private) { + kvm_mmu_prepare_memory_fault_exit(vcpu, fault); + return -EFAULT; } + vcpu_cache_mmio_info(vcpu, gva, fault->gfn, + access & shadow_mmio_access_mask); + + fault->slot = NULL; + fault->pfn = KVM_PFN_NOSLOT; + fault->map_writable = false; + + /* + * If MMIO caching is disabled, emulate immediately without + * touching the shadow page tables as attempting to install an + * MMIO SPTE will just be an expensive nop. + */ + if (unlikely(!enable_mmio_caching)) + return RET_PF_EMULATE; + + /* + * Do not create an MMIO SPTE for a gfn greater than host.MAXPHYADDR, + * any guest that generates such gfns is running nested and is being + * tricked by L0 userspace (you can observe gfn > L1.MAXPHYADDR if and + * only if L1's MAXPHYADDR is inaccurate with respect to the + * hardware's). + */ + if (unlikely(fault->gfn > kvm_mmu_max_gfn())) + return RET_PF_EMULATE; + return RET_PF_CONTINUE; } -static bool page_fault_can_be_fast(struct kvm_page_fault *fault) +static bool page_fault_can_be_fast(struct kvm *kvm, struct kvm_page_fault *fault) { /* * Page faults with reserved bits set, i.e. faults on MMIO SPTEs, only @@ -3051,6 +3473,26 @@ static bool page_fault_can_be_fast(struct kvm_page_fault *fault) return false; /* + * For hardware-protected VMs, certain conditions like attempting to + * perform a write to a page which is not in the state that the guest + * expects it to be in can result in a nested/extended #PF. In this + * case, the below code might misconstrue this situation as being the + * result of a write-protected access, and treat it as a spurious case + * rather than taking any action to satisfy the real source of the #PF + * such as generating a KVM_EXIT_MEMORY_FAULT. This can lead to the + * guest spinning on a #PF indefinitely, so don't attempt the fast path + * in this case. + * + * Note that the kvm_mem_is_private() check might race with an + * attribute update, but this will either result in the guest spinning + * on RET_PF_SPURIOUS until the update completes, or an actual spurious + * case might go down the slow path. Either case will resolve itself. + */ + if (kvm->arch.has_private_mem && + fault->is_private != kvm_mem_is_private(kvm, fault->gfn)) + return false; + + /* * #PF can be fast if: * * 1. The shadow page table entry is not present and A/D bits are @@ -3066,7 +3508,7 @@ static bool page_fault_can_be_fast(struct kvm_page_fault *fault) * by setting the Writable bit, which can be done out of mmu_lock. */ if (!fault->present) - return !kvm_ad_enabled(); + return !kvm_ad_enabled; /* * Note, instruction fetches and writes are mutually exclusive, ignore @@ -3079,9 +3521,9 @@ static bool page_fault_can_be_fast(struct kvm_page_fault *fault) * Returns true if the SPTE was fixed successfully. Otherwise, * someone else modified the SPTE from its original value. */ -static bool -fast_pf_fix_direct_spte(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault, - u64 *sptep, u64 old_spte, u64 new_spte) +static bool fast_pf_fix_direct_spte(struct kvm_vcpu *vcpu, + struct kvm_page_fault *fault, + u64 *sptep, u64 old_spte, u64 new_spte) { /* * Theoretically we could also set dirty bit (and flush TLB) here in @@ -3093,9 +3535,9 @@ fast_pf_fix_direct_spte(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault, * harm. This also avoids the TLB flush needed after setting dirty bit * so non-PML cases won't be impacted. * - * Compare with set_spte where instead shadow_dirty_mask is set. + * Compare with make_spte() where instead shadow_dirty_mask is set. */ - if (cmpxchg64(sptep, old_spte, new_spte) != old_spte) + if (!try_cmpxchg64(sptep, &old_spte, new_spte)) return false; if (is_writable_pte(new_spte) && !is_writable_pte(old_spte)) @@ -3104,18 +3546,6 @@ fast_pf_fix_direct_spte(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault, return true; } -static bool is_access_allowed(struct kvm_page_fault *fault, u64 spte) -{ - if (fault->exec) - return is_executable_pte(spte); - - if (fault->write) - return is_writable_pte(spte); - - /* Fault was on Read access */ - return spte & PT_PRESENT_MASK; -} - /* * Returns the last level spte pointer of the shadow page walk for the given * gpa, and sets *spte to the spte value. This spte may be non-preset. If no @@ -3146,11 +3576,11 @@ static int fast_page_fault(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault) { struct kvm_mmu_page *sp; int ret = RET_PF_INVALID; - u64 spte = 0ull; - u64 *sptep = NULL; + u64 spte; + u64 *sptep; uint retry_count = 0; - if (!page_fault_can_be_fast(fault)) + if (!page_fault_can_be_fast(vcpu->kvm, fault)) return ret; walk_shadow_page_lockless_begin(vcpu); @@ -3158,11 +3588,19 @@ static int fast_page_fault(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault) do { u64 new_spte; - if (is_tdp_mmu(vcpu->arch.mmu)) - sptep = kvm_tdp_mmu_fast_pf_get_last_sptep(vcpu, fault->addr, &spte); + if (tdp_mmu_enabled) + sptep = kvm_tdp_mmu_fast_pf_get_last_sptep(vcpu, fault->gfn, &spte); else sptep = fast_pf_get_last_sptep(vcpu, fault->addr, &spte); + /* + * It's entirely possible for the mapping to have been zapped + * by a different task, but the root page should always be + * available as the vCPU holds a reference to its root(s). + */ + if (WARN_ON_ONCE(!sptep)) + spte = FROZEN_SPTE; + if (!is_shadow_present_pte(spte)) break; @@ -3193,8 +3631,9 @@ static int fast_page_fault(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault) * uses A/D bits for non-nested MMUs. Thus, if A/D bits are * enabled, the SPTE can't be an access-tracked SPTE. */ - if (unlikely(!kvm_ad_enabled()) && is_access_track_spte(spte)) - new_spte = restore_acc_track_spte(new_spte); + if (unlikely(!kvm_ad_enabled) && is_access_track_spte(spte)) + new_spte = restore_acc_track_spte(new_spte) | + shadow_accessed_mask; /* * To keep things simple, only SPTEs that are MMU-writable can @@ -3241,8 +3680,7 @@ static int fast_page_fault(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault) } if (++retry_count > 4) { - printk_once(KERN_WARNING - "kvm: Fast #PF retrying more than 4 times.\n"); + pr_warn_once("Fast #PF retrying more than 4 times.\n"); break; } @@ -3265,14 +3703,18 @@ static void mmu_free_root_page(struct kvm *kvm, hpa_t *root_hpa, if (!VALID_PAGE(*root_hpa)) return; - sp = to_shadow_page(*root_hpa & PT64_BASE_ADDR_MASK); - if (WARN_ON(!sp)) + sp = root_to_sp(*root_hpa); + if (WARN_ON_ONCE(!sp)) return; - if (is_tdp_mmu_page(sp)) - kvm_tdp_mmu_put_root(kvm, sp, false); - else if (!--sp->root_count && sp->role.invalid) - kvm_mmu_prepare_zap_page(kvm, sp, invalid_list); + if (is_tdp_mmu_page(sp)) { + lockdep_assert_held_read(&kvm->mmu_lock); + kvm_tdp_mmu_put_root(kvm, sp); + } else { + lockdep_assert_held_write(&kvm->mmu_lock); + if (!--sp->root_count && sp->role.invalid) + kvm_mmu_prepare_zap_page(kvm, sp, invalid_list); + } *root_hpa = INVALID_PAGE; } @@ -3281,10 +3723,13 @@ static void mmu_free_root_page(struct kvm *kvm, hpa_t *root_hpa, void kvm_mmu_free_roots(struct kvm *kvm, struct kvm_mmu *mmu, ulong roots_to_free) { + bool is_tdp_mmu = tdp_mmu_enabled && mmu->root_role.direct; int i; LIST_HEAD(invalid_list); bool free_active_root; + WARN_ON_ONCE(roots_to_free & ~KVM_MMU_ROOTS_ALL); + BUILD_BUG_ON(KVM_MMU_NUM_PREV_ROOTS >= BITS_PER_LONG); /* Before acquiring the MMU lock, see if we need to do any real work. */ @@ -3301,7 +3746,10 @@ void kvm_mmu_free_roots(struct kvm *kvm, struct kvm_mmu *mmu, return; } - write_lock(&kvm->mmu_lock); + if (is_tdp_mmu) + read_lock(&kvm->mmu_lock); + else + write_lock(&kvm->mmu_lock); for (i = 0; i < KVM_MMU_NUM_PREV_ROOTS; i++) if (roots_to_free & KVM_MMU_ROOT_PREVIOUS(i)) @@ -3309,7 +3757,9 @@ void kvm_mmu_free_roots(struct kvm *kvm, struct kvm_mmu *mmu, &invalid_list); if (free_active_root) { - if (to_shadow_page(mmu->root.hpa)) { + if (kvm_mmu_is_dummy_root(mmu->root.hpa)) { + /* Nothing to cleanup for dummy roots. */ + } else if (root_to_sp(mmu->root.hpa)) { mmu_free_root_page(kvm, &mmu->root.hpa, &invalid_list); } else if (mmu->pae_root) { for (i = 0; i < 4; ++i) { @@ -3325,14 +3775,20 @@ void kvm_mmu_free_roots(struct kvm *kvm, struct kvm_mmu *mmu, mmu->root.pgd = 0; } - kvm_mmu_commit_zap_page(kvm, &invalid_list); - write_unlock(&kvm->mmu_lock); + if (is_tdp_mmu) { + read_unlock(&kvm->mmu_lock); + WARN_ON_ONCE(!list_empty(&invalid_list)); + } else { + kvm_mmu_commit_zap_page(kvm, &invalid_list); + write_unlock(&kvm->mmu_lock); + } } EXPORT_SYMBOL_GPL(kvm_mmu_free_roots); void kvm_mmu_free_guest_mode_roots(struct kvm *kvm, struct kvm_mmu *mmu) { unsigned long roots_to_free = 0; + struct kvm_mmu_page *sp; hpa_t root_hpa; int i; @@ -3347,8 +3803,8 @@ void kvm_mmu_free_guest_mode_roots(struct kvm *kvm, struct kvm_mmu *mmu) if (!VALID_PAGE(root_hpa)) continue; - if (!to_shadow_page(root_hpa) || - to_shadow_page(root_hpa)->role.guest_mode) + sp = root_to_sp(root_hpa); + if (!sp || sp->role.guest_mode) roots_to_free |= KVM_MMU_ROOT_PREVIOUS(i); } @@ -3356,25 +3812,19 @@ void kvm_mmu_free_guest_mode_roots(struct kvm *kvm, struct kvm_mmu *mmu) } EXPORT_SYMBOL_GPL(kvm_mmu_free_guest_mode_roots); - -static int mmu_check_root(struct kvm_vcpu *vcpu, gfn_t root_gfn) +static hpa_t mmu_alloc_root(struct kvm_vcpu *vcpu, gfn_t gfn, int quadrant, + u8 level) { - int ret = 0; - - if (!kvm_vcpu_is_visible_gfn(vcpu, root_gfn)) { - kvm_make_request(KVM_REQ_TRIPLE_FAULT, vcpu); - ret = 1; - } + union kvm_mmu_page_role role = vcpu->arch.mmu->root_role; + struct kvm_mmu_page *sp; - return ret; -} + role.level = level; + role.quadrant = quadrant; -static hpa_t mmu_alloc_root(struct kvm_vcpu *vcpu, gfn_t gfn, gva_t gva, - u8 level, bool direct) -{ - struct kvm_mmu_page *sp; + WARN_ON_ONCE(quadrant && !role.has_4_byte_gpte); + WARN_ON_ONCE(role.direct && role.has_4_byte_gpte); - sp = kvm_mmu_get_page(vcpu, gfn, gva, level, direct, ACC_ALL); + sp = kvm_mmu_get_shadow_page(vcpu, gfn, role); ++sp->root_count; return __pa(sp->spt); @@ -3388,16 +3838,21 @@ static int mmu_alloc_direct_roots(struct kvm_vcpu *vcpu) unsigned i; int r; + if (tdp_mmu_enabled) { + if (kvm_has_mirrored_tdp(vcpu->kvm) && + !VALID_PAGE(mmu->mirror_root_hpa)) + kvm_tdp_mmu_alloc_root(vcpu, true); + kvm_tdp_mmu_alloc_root(vcpu, false); + return 0; + } + write_lock(&vcpu->kvm->mmu_lock); r = make_mmu_pages_available(vcpu); if (r < 0) goto out_unlock; - if (is_tdp_mmu_enabled(vcpu->kvm)) { - root = kvm_tdp_mmu_get_vcpu_root_hpa(vcpu); - mmu->root.hpa = root; - } else if (shadow_root_level >= PT64_ROOT_4LEVEL) { - root = mmu_alloc_root(vcpu, 0, 0, shadow_root_level, true); + if (shadow_root_level >= PT64_ROOT_4LEVEL) { + root = mmu_alloc_root(vcpu, 0, 0, shadow_root_level); mmu->root.hpa = root; } else if (shadow_root_level == PT32E_ROOT_LEVEL) { if (WARN_ON_ONCE(!mmu->pae_root)) { @@ -3408,8 +3863,8 @@ static int mmu_alloc_direct_roots(struct kvm_vcpu *vcpu) for (i = 0; i < 4; ++i) { WARN_ON_ONCE(IS_VALID_PAE_ROOT(mmu->pae_root[i])); - root = mmu_alloc_root(vcpu, i << (30 - PAGE_SHIFT), - i << 30, PT32_ROOT_LEVEL, true); + root = mmu_alloc_root(vcpu, i << (30 - PAGE_SHIFT), 0, + PT32_ROOT_LEVEL); mmu->pae_root[i] = root | PT_PRESENT_MASK | shadow_me_value; } @@ -3454,7 +3909,7 @@ static int mmu_first_shadow_root_alloc(struct kvm *kvm) kvm_page_track_write_tracking_enabled(kvm)) goto out_success; - for (i = 0; i < KVM_ADDRESS_SPACE_NUM; i++) { + for (i = 0; i < kvm_arch_nr_memslot_as_ids(kvm); i++) { slots = __kvm_memslots(kvm, i); kvm_for_each_memslot(slot, bkt, slots) { /* @@ -3493,15 +3948,16 @@ static int mmu_alloc_shadow_roots(struct kvm_vcpu *vcpu) struct kvm_mmu *mmu = vcpu->arch.mmu; u64 pdptrs[4], pm_mask; gfn_t root_gfn, root_pgd; + int quadrant, i, r; hpa_t root; - unsigned i; - int r; - root_pgd = mmu->get_guest_pgd(vcpu); - root_gfn = root_pgd >> PAGE_SHIFT; + root_pgd = kvm_mmu_get_guest_pgd(vcpu, mmu); + root_gfn = (root_pgd & __PT_BASE_ADDR_MASK) >> PAGE_SHIFT; - if (mmu_check_root(vcpu, root_gfn)) - return 1; + if (!kvm_vcpu_is_visible_gfn(vcpu, root_gfn)) { + mmu->root.hpa = kvm_mmu_get_dummy_root(); + return 0; + } /* * On SVM, reading PDPTRs might access guest memory, which might fault @@ -3513,8 +3969,8 @@ static int mmu_alloc_shadow_roots(struct kvm_vcpu *vcpu) if (!(pdptrs[i] & PT_PRESENT_MASK)) continue; - if (mmu_check_root(vcpu, pdptrs[i] >> PAGE_SHIFT)) - return 1; + if (!kvm_vcpu_is_visible_gfn(vcpu, pdptrs[i] >> PAGE_SHIFT)) + pdptrs[i] = 0; } } @@ -3533,7 +3989,7 @@ static int mmu_alloc_shadow_roots(struct kvm_vcpu *vcpu) */ if (mmu->cpu_role.base.level >= PT64_ROOT_4LEVEL) { root = mmu_alloc_root(vcpu, root_gfn, 0, - mmu->root_role.level, false); + mmu->root_role.level); mmu->root.hpa = root; goto set_root_pgd; } @@ -3578,8 +4034,15 @@ static int mmu_alloc_shadow_roots(struct kvm_vcpu *vcpu) root_gfn = pdptrs[i] >> PAGE_SHIFT; } - root = mmu_alloc_root(vcpu, root_gfn, i << 30, - PT32_ROOT_LEVEL, false); + /* + * If shadowing 32-bit non-PAE page tables, each PAE page + * directory maps one quarter of the guest's non-PAE page + * directory. Othwerise each PAE page direct shadows one guest + * PAE page directory so that quadrant should be 0. + */ + quadrant = (mmu->cpu_role.base.level == PT32_ROOT_LEVEL) ? i : 0; + + root = mmu_alloc_root(vcpu, root_gfn, quadrant, PT32_ROOT_LEVEL); mmu->pae_root[i] = root | pm_mask; } @@ -3674,7 +4137,7 @@ static bool is_unsync_root(hpa_t root) { struct kvm_mmu_page *sp; - if (!VALID_PAGE(root)) + if (!VALID_PAGE(root) || kvm_mmu_is_dummy_root(root)) return false; /* @@ -3690,7 +4153,7 @@ static bool is_unsync_root(hpa_t root) * requirement isn't satisfied. */ smp_rmb(); - sp = to_shadow_page(root); + sp = root_to_sp(root); /* * PAE roots (somewhat arbitrarily) aren't backed by shadow pages, the @@ -3720,11 +4183,12 @@ void kvm_mmu_sync_roots(struct kvm_vcpu *vcpu) if (vcpu->arch.mmu->cpu_role.base.level >= PT64_ROOT_4LEVEL) { hpa_t root = vcpu->arch.mmu->root.hpa; - sp = to_shadow_page(root); if (!is_unsync_root(root)) return; + sp = root_to_sp(root); + write_lock(&vcpu->kvm->mmu_lock); mmu_sync_children(vcpu, sp, true); write_unlock(&vcpu->kvm->mmu_lock); @@ -3737,8 +4201,7 @@ void kvm_mmu_sync_roots(struct kvm_vcpu *vcpu) hpa_t root = vcpu->arch.mmu->pae_root[i]; if (IS_VALID_PAE_ROOT(root)) { - root &= PT64_BASE_ADDR_MASK; - sp = to_shadow_page(root); + sp = spte_to_child_sp(root); mmu_sync_children(vcpu, sp, true); } } @@ -3808,23 +4271,31 @@ static int get_walk(struct kvm_vcpu *vcpu, u64 addr, u64 *sptes, int *root_level return leaf; } -/* return true if reserved bit(s) are detected on a valid, non-MMIO SPTE. */ -static bool get_mmio_spte(struct kvm_vcpu *vcpu, u64 addr, u64 *sptep) +static int get_sptes_lockless(struct kvm_vcpu *vcpu, u64 addr, u64 *sptes, + int *root_level) { - u64 sptes[PT64_ROOT_MAX_LEVEL + 1]; - struct rsvd_bits_validate *rsvd_check; - int root, leaf, level; - bool reserved = false; + int leaf; walk_shadow_page_lockless_begin(vcpu); - if (is_tdp_mmu(vcpu->arch.mmu)) - leaf = kvm_tdp_mmu_get_walk(vcpu, addr, sptes, &root); + if (is_tdp_mmu_active(vcpu)) + leaf = kvm_tdp_mmu_get_walk(vcpu, addr, sptes, root_level); else - leaf = get_walk(vcpu, addr, sptes, &root); + leaf = get_walk(vcpu, addr, sptes, root_level); walk_shadow_page_lockless_end(vcpu); + return leaf; +} + +/* return true if reserved bit(s) are detected on a valid, non-MMIO SPTE. */ +static bool get_mmio_spte(struct kvm_vcpu *vcpu, u64 addr, u64 *sptep) +{ + u64 sptes[PT64_ROOT_MAX_LEVEL + 1]; + struct rsvd_bits_validate *rsvd_check; + int root, leaf, level; + bool reserved = false; + leaf = get_sptes_lockless(vcpu, addr, sptes, &root); if (unlikely(leaf < 0)) { *sptep = 0ull; return reserved; @@ -3867,10 +4338,10 @@ static int handle_mmio_page_fault(struct kvm_vcpu *vcpu, u64 addr, bool direct) return RET_PF_EMULATE; reserved = get_mmio_spte(vcpu, addr, &spte); - if (WARN_ON(reserved)) + if (WARN_ON_ONCE(reserved)) return -EINVAL; - if (is_mmio_spte(spte)) { + if (is_mmio_spte(vcpu->kvm, spte)) { gfn_t gfn = get_mmio_spte_gfn(spte); unsigned int access = get_mmio_spte_access(spte); @@ -3905,7 +4376,7 @@ static bool page_fault_handle_page_track(struct kvm_vcpu *vcpu, * guest is writing the page which is write tracked which can * not be fixed by page fault handler. */ - if (kvm_slot_page_track_is_active(vcpu->kvm, fault->slot, fault->gfn, KVM_PAGE_TRACK_WRITE)) + if (kvm_gfn_is_write_tracked(vcpu->kvm, fault->slot, fault->gfn)) return true; return false; @@ -3933,24 +4404,28 @@ static u32 alloc_apf_token(struct kvm_vcpu *vcpu) return (vcpu->arch.apf.id++ << 12) | vcpu->vcpu_id; } -static bool kvm_arch_setup_async_pf(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa, - gfn_t gfn) +static bool kvm_arch_setup_async_pf(struct kvm_vcpu *vcpu, + struct kvm_page_fault *fault) { struct kvm_arch_async_pf arch; arch.token = alloc_apf_token(vcpu); - arch.gfn = gfn; + arch.gfn = fault->gfn; + arch.error_code = fault->error_code; arch.direct_map = vcpu->arch.mmu->root_role.direct; - arch.cr3 = vcpu->arch.mmu->get_guest_pgd(vcpu); + arch.cr3 = kvm_mmu_get_guest_pgd(vcpu, vcpu->arch.mmu); - return kvm_setup_async_pf(vcpu, cr2_or_gpa, - kvm_vcpu_gfn_to_hva(vcpu, gfn), &arch); + return kvm_setup_async_pf(vcpu, fault->addr, + kvm_vcpu_gfn_to_hva(vcpu, fault->gfn), &arch); } void kvm_arch_async_page_ready(struct kvm_vcpu *vcpu, struct kvm_async_pf *work) { int r; + if (WARN_ON_ONCE(work->arch.error_code & PFERR_PRIVATE_ACCESS)) + return; + if ((vcpu->arch.mmu->root_role.direct != work->arch.direct_map) || work->wakeup_all) return; @@ -3960,65 +4435,243 @@ void kvm_arch_async_page_ready(struct kvm_vcpu *vcpu, struct kvm_async_pf *work) return; if (!vcpu->arch.mmu->root_role.direct && - work->arch.cr3 != vcpu->arch.mmu->get_guest_pgd(vcpu)) + work->arch.cr3 != kvm_mmu_get_guest_pgd(vcpu, vcpu->arch.mmu)) return; - kvm_mmu_do_page_fault(vcpu, work->cr2_or_gpa, 0, true); + r = kvm_mmu_do_page_fault(vcpu, work->cr2_or_gpa, work->arch.error_code, + true, NULL, NULL); + + /* + * Account fixed page faults, otherwise they'll never be counted, but + * ignore stats for all other return times. Page-ready "faults" aren't + * truly spurious and never trigger emulation + */ + if (r == RET_PF_FIXED) + vcpu->stat.pf_fixed++; +} + +static inline u8 kvm_max_level_for_order(int order) +{ + BUILD_BUG_ON(KVM_MAX_HUGEPAGE_LEVEL > PG_LEVEL_1G); + + KVM_MMU_WARN_ON(order != KVM_HPAGE_GFN_SHIFT(PG_LEVEL_1G) && + order != KVM_HPAGE_GFN_SHIFT(PG_LEVEL_2M) && + order != KVM_HPAGE_GFN_SHIFT(PG_LEVEL_4K)); + + if (order >= KVM_HPAGE_GFN_SHIFT(PG_LEVEL_1G)) + return PG_LEVEL_1G; + + if (order >= KVM_HPAGE_GFN_SHIFT(PG_LEVEL_2M)) + return PG_LEVEL_2M; + + return PG_LEVEL_4K; +} + +static u8 kvm_max_private_mapping_level(struct kvm *kvm, kvm_pfn_t pfn, + u8 max_level, int gmem_order) +{ + u8 req_max_level; + + if (max_level == PG_LEVEL_4K) + return PG_LEVEL_4K; + + max_level = min(kvm_max_level_for_order(gmem_order), max_level); + if (max_level == PG_LEVEL_4K) + return PG_LEVEL_4K; + + req_max_level = kvm_x86_call(private_max_mapping_level)(kvm, pfn); + if (req_max_level) + max_level = min(max_level, req_max_level); + + return max_level; +} + +static void kvm_mmu_finish_page_fault(struct kvm_vcpu *vcpu, + struct kvm_page_fault *fault, int r) +{ + kvm_release_faultin_page(vcpu->kvm, fault->refcounted_page, + r == RET_PF_RETRY, fault->map_writable); +} + +static int kvm_mmu_faultin_pfn_private(struct kvm_vcpu *vcpu, + struct kvm_page_fault *fault) +{ + int max_order, r; + + if (!kvm_slot_can_be_private(fault->slot)) { + kvm_mmu_prepare_memory_fault_exit(vcpu, fault); + return -EFAULT; + } + + r = kvm_gmem_get_pfn(vcpu->kvm, fault->slot, fault->gfn, &fault->pfn, + &fault->refcounted_page, &max_order); + if (r) { + kvm_mmu_prepare_memory_fault_exit(vcpu, fault); + return r; + } + + fault->map_writable = !(fault->slot->flags & KVM_MEM_READONLY); + fault->max_level = kvm_max_private_mapping_level(vcpu->kvm, fault->pfn, + fault->max_level, max_order); + + return RET_PF_CONTINUE; } -static int kvm_faultin_pfn(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault) +static int __kvm_mmu_faultin_pfn(struct kvm_vcpu *vcpu, + struct kvm_page_fault *fault) +{ + unsigned int foll = fault->write ? FOLL_WRITE : 0; + + if (fault->is_private) + return kvm_mmu_faultin_pfn_private(vcpu, fault); + + foll |= FOLL_NOWAIT; + fault->pfn = __kvm_faultin_pfn(fault->slot, fault->gfn, foll, + &fault->map_writable, &fault->refcounted_page); + + /* + * If resolving the page failed because I/O is needed to fault-in the + * page, then either set up an asynchronous #PF to do the I/O, or if + * doing an async #PF isn't possible, retry with I/O allowed. All + * other failures are terminal, i.e. retrying won't help. + */ + if (fault->pfn != KVM_PFN_ERR_NEEDS_IO) + return RET_PF_CONTINUE; + + if (!fault->prefetch && kvm_can_do_async_pf(vcpu)) { + trace_kvm_try_async_get_page(fault->addr, fault->gfn); + if (kvm_find_async_pf_gfn(vcpu, fault->gfn)) { + trace_kvm_async_pf_repeated_fault(fault->addr, fault->gfn); + kvm_make_request(KVM_REQ_APF_HALT, vcpu); + return RET_PF_RETRY; + } else if (kvm_arch_setup_async_pf(vcpu, fault)) { + return RET_PF_RETRY; + } + } + + /* + * Allow gup to bail on pending non-fatal signals when it's also allowed + * to wait for IO. Note, gup always bails if it is unable to quickly + * get a page and a fatal signal, i.e. SIGKILL, is pending. + */ + foll |= FOLL_INTERRUPTIBLE; + foll &= ~FOLL_NOWAIT; + fault->pfn = __kvm_faultin_pfn(fault->slot, fault->gfn, foll, + &fault->map_writable, &fault->refcounted_page); + + return RET_PF_CONTINUE; +} + +static int kvm_mmu_faultin_pfn(struct kvm_vcpu *vcpu, + struct kvm_page_fault *fault, unsigned int access) { struct kvm_memory_slot *slot = fault->slot; - bool async; + struct kvm *kvm = vcpu->kvm; + int ret; + + if (KVM_BUG_ON(kvm_is_gfn_alias(kvm, fault->gfn), kvm)) + return -EFAULT; + + /* + * Note that the mmu_invalidate_seq also serves to detect a concurrent + * change in attributes. is_page_fault_stale() will detect an + * invalidation relate to fault->fn and resume the guest without + * installing a mapping in the page tables. + */ + fault->mmu_seq = vcpu->kvm->mmu_invalidate_seq; + smp_rmb(); + + /* + * Now that we have a snapshot of mmu_invalidate_seq we can check for a + * private vs. shared mismatch. + */ + if (fault->is_private != kvm_mem_is_private(kvm, fault->gfn)) { + kvm_mmu_prepare_memory_fault_exit(vcpu, fault); + return -EFAULT; + } + + if (unlikely(!slot)) + return kvm_handle_noslot_fault(vcpu, fault, access); /* * Retry the page fault if the gfn hit a memslot that is being deleted * or moved. This ensures any existing SPTEs for the old memslot will * be zapped before KVM inserts a new MMIO SPTE for the gfn. */ - if (slot && (slot->flags & KVM_MEMSLOT_INVALID)) + if (slot->flags & KVM_MEMSLOT_INVALID) return RET_PF_RETRY; - if (!kvm_is_visible_memslot(slot)) { - /* Don't expose private memslots to L2. */ - if (is_guest_mode(vcpu)) { - fault->slot = NULL; - fault->pfn = KVM_PFN_NOSLOT; - fault->map_writable = false; - return RET_PF_CONTINUE; - } + if (slot->id == APIC_ACCESS_PAGE_PRIVATE_MEMSLOT) { + /* + * Don't map L1's APIC access page into L2, KVM doesn't support + * using APICv/AVIC to accelerate L2 accesses to L1's APIC, + * i.e. the access needs to be emulated. Emulating access to + * L1's APIC is also correct if L1 is accelerating L2's own + * virtual APIC, but for some reason L1 also maps _L1's_ APIC + * into L2. Note, vcpu_is_mmio_gpa() always treats access to + * the APIC as MMIO. Allow an MMIO SPTE to be created, as KVM + * uses different roots for L1 vs. L2, i.e. there is no danger + * of breaking APICv/AVIC for L1. + */ + if (is_guest_mode(vcpu)) + return kvm_handle_noslot_fault(vcpu, fault, access); + /* * If the APIC access page exists but is disabled, go directly * to emulation without caching the MMIO access or creating a * MMIO SPTE. That way the cache doesn't need to be purged * when the AVIC is re-enabled. */ - if (slot && slot->id == APIC_ACCESS_PAGE_PRIVATE_MEMSLOT && - !kvm_apicv_activated(vcpu->kvm)) + if (!kvm_apicv_activated(vcpu->kvm)) return RET_PF_EMULATE; } - async = false; - fault->pfn = __gfn_to_pfn_memslot(slot, fault->gfn, false, &async, - fault->write, &fault->map_writable, - &fault->hva); - if (!async) - return RET_PF_CONTINUE; /* *pfn has correct page already */ + /* + * Check for a relevant mmu_notifier invalidation event before getting + * the pfn from the primary MMU, and before acquiring mmu_lock. + * + * For mmu_lock, if there is an in-progress invalidation and the kernel + * allows preemption, the invalidation task may drop mmu_lock and yield + * in response to mmu_lock being contended, which is *very* counter- + * productive as this vCPU can't actually make forward progress until + * the invalidation completes. + * + * Retrying now can also avoid unnessary lock contention in the primary + * MMU, as the primary MMU doesn't necessarily hold a single lock for + * the duration of the invalidation, i.e. faulting in a conflicting pfn + * can cause the invalidation to take longer by holding locks that are + * needed to complete the invalidation. + * + * Do the pre-check even for non-preemtible kernels, i.e. even if KVM + * will never yield mmu_lock in response to contention, as this vCPU is + * *guaranteed* to need to retry, i.e. waiting until mmu_lock is held + * to detect retry guarantees the worst case latency for the vCPU. + */ + if (mmu_invalidate_retry_gfn_unsafe(kvm, fault->mmu_seq, fault->gfn)) + return RET_PF_RETRY; - if (!fault->prefetch && kvm_can_do_async_pf(vcpu)) { - trace_kvm_try_async_get_page(fault->addr, fault->gfn); - if (kvm_find_async_pf_gfn(vcpu, fault->gfn)) { - trace_kvm_async_pf_doublefault(fault->addr, fault->gfn); - kvm_make_request(KVM_REQ_APF_HALT, vcpu); - return RET_PF_RETRY; - } else if (kvm_arch_setup_async_pf(vcpu, fault->addr, fault->gfn)) { - return RET_PF_RETRY; - } + ret = __kvm_mmu_faultin_pfn(vcpu, fault); + if (ret != RET_PF_CONTINUE) + return ret; + + if (unlikely(is_error_pfn(fault->pfn))) + return kvm_handle_error_pfn(vcpu, fault); + + if (WARN_ON_ONCE(!fault->slot || is_noslot_pfn(fault->pfn))) + return kvm_handle_noslot_fault(vcpu, fault, access); + + /* + * Check again for a relevant mmu_notifier invalidation event purely to + * avoid contending mmu_lock. Most invalidations will be detected by + * the previous check, but checking is extremely cheap relative to the + * overall cost of failing to detect the invalidation until after + * mmu_lock is acquired. + */ + if (mmu_invalidate_retry_gfn_unsafe(kvm, fault->mmu_seq, fault->gfn)) { + kvm_mmu_finish_page_fault(vcpu, fault, RET_PF_RETRY); + return RET_PF_RETRY; } - fault->pfn = __gfn_to_pfn_memslot(slot, fault->gfn, false, NULL, - fault->write, &fault->map_writable, - &fault->hva); return RET_PF_CONTINUE; } @@ -4027,9 +4680,9 @@ static int kvm_faultin_pfn(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault) * root was invalidated by a memslot update or a relevant mmu_notifier fired. */ static bool is_page_fault_stale(struct kvm_vcpu *vcpu, - struct kvm_page_fault *fault, int mmu_seq) + struct kvm_page_fault *fault) { - struct kvm_mmu_page *sp = to_shadow_page(vcpu->arch.mmu->root.hpa); + struct kvm_mmu_page *sp = root_to_sp(vcpu->arch.mmu->root.hpa); /* Special roots, e.g. pae_root, are not backed by shadow pages. */ if (sp && is_obsolete_sp(vcpu->kvm, sp)) @@ -4046,22 +4699,25 @@ static bool is_page_fault_stale(struct kvm_vcpu *vcpu, if (!sp && kvm_test_request(KVM_REQ_MMU_FREE_OBSOLETE_ROOTS, vcpu)) return true; + /* + * Check for a relevant mmu_notifier invalidation event one last time + * now that mmu_lock is held, as the "unsafe" checks performed without + * holding mmu_lock can get false negatives. + */ return fault->slot && - mmu_notifier_retry_hva(vcpu->kvm, mmu_seq, fault->hva); + mmu_invalidate_retry_gfn(vcpu->kvm, fault->mmu_seq, fault->gfn); } static int direct_page_fault(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault) { - bool is_tdp_mmu_fault = is_tdp_mmu(vcpu->arch.mmu); - - unsigned long mmu_seq; int r; - fault->gfn = fault->addr >> PAGE_SHIFT; - fault->slot = kvm_vcpu_gfn_to_memslot(vcpu, fault->gfn); + /* Dummy roots are used only for shadowing bad guest roots. */ + if (WARN_ON_ONCE(kvm_mmu_is_dummy_root(vcpu->arch.mmu->root.hpa))) + return RET_PF_RETRY; if (page_fault_handle_page_track(vcpu, fault)) - return RET_PF_EMULATE; + return RET_PF_WRITE_PROTECTED; r = fast_page_fault(vcpu, fault); if (r != RET_PF_INVALID) @@ -4071,50 +4727,31 @@ static int direct_page_fault(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault if (r) return r; - mmu_seq = vcpu->kvm->mmu_notifier_seq; - smp_rmb(); - - r = kvm_faultin_pfn(vcpu, fault); - if (r != RET_PF_CONTINUE) - return r; - - r = handle_abnormal_pfn(vcpu, fault, ACC_ALL); + r = kvm_mmu_faultin_pfn(vcpu, fault, ACC_ALL); if (r != RET_PF_CONTINUE) return r; r = RET_PF_RETRY; + write_lock(&vcpu->kvm->mmu_lock); - if (is_tdp_mmu_fault) - read_lock(&vcpu->kvm->mmu_lock); - else - write_lock(&vcpu->kvm->mmu_lock); - - if (is_page_fault_stale(vcpu, fault, mmu_seq)) + if (is_page_fault_stale(vcpu, fault)) goto out_unlock; r = make_mmu_pages_available(vcpu); if (r) goto out_unlock; - if (is_tdp_mmu_fault) - r = kvm_tdp_mmu_map(vcpu, fault); - else - r = __direct_map(vcpu, fault); + r = direct_map(vcpu, fault); out_unlock: - if (is_tdp_mmu_fault) - read_unlock(&vcpu->kvm->mmu_lock); - else - write_unlock(&vcpu->kvm->mmu_lock); - kvm_release_pfn_clean(fault->pfn); + kvm_mmu_finish_page_fault(vcpu, fault, r); + write_unlock(&vcpu->kvm->mmu_lock); return r; } static int nonpaging_page_fault(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault) { - pgprintk("%s: gva %lx error %x\n", __func__, fault->addr, fault->error_code); - /* This path builds a PAE pagetable, we can map 2mb pages at maximum. */ fault->max_level = PG_LEVEL_2M; return direct_page_fault(vcpu, fault); @@ -4131,13 +4768,24 @@ int kvm_handle_page_fault(struct kvm_vcpu *vcpu, u64 error_code, if (WARN_ON_ONCE(fault_address >> 32)) return -EFAULT; #endif + /* + * Legacy #PF exception only have a 32-bit error code. Simply drop the + * upper bits as KVM doesn't use them for #PF (because they are never + * set), and to ensure there are no collisions with KVM-defined bits. + */ + if (WARN_ON_ONCE(error_code >> 32)) + error_code = lower_32_bits(error_code); + + /* + * Restrict KVM-defined flags to bits 63:32 so that it's impossible for + * them to conflict with #PF error codes, which are limited to 32 bits. + */ + BUILD_BUG_ON(lower_32_bits(PFERR_SYNTHETIC_MASK)); vcpu->arch.l1tf_flush_l1d = true; if (!flags) { - trace_kvm_page_fault(fault_address, error_code); + trace_kvm_page_fault(vcpu, fault_address, error_code); - if (kvm_event_needs_reinjection(vcpu)) - kvm_mmu_unprotect_page_virt(vcpu, fault_address); r = kvm_mmu_page_fault(vcpu, fault_address, error_code, insn, insn_len); } else if (flags & KVM_PV_REASON_PAGE_NOT_PRESENT) { @@ -4153,35 +4801,165 @@ int kvm_handle_page_fault(struct kvm_vcpu *vcpu, u64 error_code, } EXPORT_SYMBOL_GPL(kvm_handle_page_fault); +#ifdef CONFIG_X86_64 +static int kvm_tdp_mmu_page_fault(struct kvm_vcpu *vcpu, + struct kvm_page_fault *fault) +{ + int r; + + if (page_fault_handle_page_track(vcpu, fault)) + return RET_PF_WRITE_PROTECTED; + + r = fast_page_fault(vcpu, fault); + if (r != RET_PF_INVALID) + return r; + + r = mmu_topup_memory_caches(vcpu, false); + if (r) + return r; + + r = kvm_mmu_faultin_pfn(vcpu, fault, ACC_ALL); + if (r != RET_PF_CONTINUE) + return r; + + r = RET_PF_RETRY; + read_lock(&vcpu->kvm->mmu_lock); + + if (is_page_fault_stale(vcpu, fault)) + goto out_unlock; + + r = kvm_tdp_mmu_map(vcpu, fault); + +out_unlock: + kvm_mmu_finish_page_fault(vcpu, fault, r); + read_unlock(&vcpu->kvm->mmu_lock); + return r; +} +#endif + int kvm_tdp_page_fault(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault) { - while (fault->max_level > PG_LEVEL_4K) { - int page_num = KVM_PAGES_PER_HPAGE(fault->max_level); - gfn_t base = (fault->addr >> PAGE_SHIFT) & ~(page_num - 1); +#ifdef CONFIG_X86_64 + if (tdp_mmu_enabled) + return kvm_tdp_mmu_page_fault(vcpu, fault); +#endif - if (kvm_mtrr_check_gfn_range_consistency(vcpu, base, page_num)) - break; + return direct_page_fault(vcpu, fault); +} + +int kvm_tdp_map_page(struct kvm_vcpu *vcpu, gpa_t gpa, u64 error_code, u8 *level) +{ + int r; - --fault->max_level; + /* + * Restrict to TDP page fault, since that's the only case where the MMU + * is indexed by GPA. + */ + if (vcpu->arch.mmu->page_fault != kvm_tdp_page_fault) + return -EOPNOTSUPP; + + do { + if (signal_pending(current)) + return -EINTR; + + if (kvm_check_request(KVM_REQ_VM_DEAD, vcpu)) + return -EIO; + + cond_resched(); + r = kvm_mmu_do_page_fault(vcpu, gpa, error_code, true, NULL, level); + } while (r == RET_PF_RETRY); + + if (r < 0) + return r; + + switch (r) { + case RET_PF_FIXED: + case RET_PF_SPURIOUS: + case RET_PF_WRITE_PROTECTED: + return 0; + + case RET_PF_EMULATE: + return -ENOENT; + + case RET_PF_RETRY: + case RET_PF_CONTINUE: + case RET_PF_INVALID: + default: + WARN_ONCE(1, "could not fix page fault during prefault"); + return -EIO; } +} +EXPORT_SYMBOL_GPL(kvm_tdp_map_page); - return direct_page_fault(vcpu, fault); +long kvm_arch_vcpu_pre_fault_memory(struct kvm_vcpu *vcpu, + struct kvm_pre_fault_memory *range) +{ + u64 error_code = PFERR_GUEST_FINAL_MASK; + u8 level = PG_LEVEL_4K; + u64 direct_bits; + u64 end; + int r; + + if (!vcpu->kvm->arch.pre_fault_allowed) + return -EOPNOTSUPP; + + if (kvm_is_gfn_alias(vcpu->kvm, gpa_to_gfn(range->gpa))) + return -EINVAL; + + /* + * reload is efficient when called repeatedly, so we can do it on + * every iteration. + */ + r = kvm_mmu_reload(vcpu); + if (r) + return r; + + direct_bits = 0; + if (kvm_arch_has_private_mem(vcpu->kvm) && + kvm_mem_is_private(vcpu->kvm, gpa_to_gfn(range->gpa))) + error_code |= PFERR_PRIVATE_ACCESS; + else + direct_bits = gfn_to_gpa(kvm_gfn_direct_bits(vcpu->kvm)); + + /* + * Shadow paging uses GVA for kvm page fault, so restrict to + * two-dimensional paging. + */ + r = kvm_tdp_map_page(vcpu, range->gpa | direct_bits, error_code, &level); + if (r < 0) + return r; + + /* + * If the mapping that covers range->gpa can use a huge page, it + * may start below it or end after range->gpa + range->size. + */ + end = (range->gpa & KVM_HPAGE_MASK(level)) + KVM_HPAGE_SIZE(level); + return min(range->size, end - range->gpa); } static void nonpaging_init_context(struct kvm_mmu *context) { context->page_fault = nonpaging_page_fault; context->gva_to_gpa = nonpaging_gva_to_gpa; - context->sync_page = nonpaging_sync_page; - context->invlpg = NULL; + context->sync_spte = NULL; } static inline bool is_root_usable(struct kvm_mmu_root_info *root, gpa_t pgd, union kvm_mmu_page_role role) { - return (role.direct || pgd == root->pgd) && - VALID_PAGE(root->hpa) && - role.word == to_shadow_page(root->hpa)->role.word; + struct kvm_mmu_page *sp; + + if (!VALID_PAGE(root->hpa)) + return false; + + if (!role.direct && pgd != root->pgd) + return false; + + sp = root_to_sp(root->hpa); + if (WARN_ON_ONCE(!sp)) + return false; + + return role.word == sp->role.word; } /* @@ -4251,11 +5029,10 @@ static bool fast_pgd_switch(struct kvm *kvm, struct kvm_mmu *mmu, gpa_t new_pgd, union kvm_mmu_page_role new_role) { /* - * For now, limit the caching to 64-bit hosts+VMs in order to avoid - * having to deal with PDPTEs. We may add support for 32-bit hosts/VMs - * later if necessary. + * Limit reuse to 64-bit hosts+VMs without "special" roots in order to + * avoid having to deal with PDPTEs and other complexities. */ - if (VALID_PAGE(mmu->root.hpa) && !to_shadow_page(mmu->root.hpa)) + if (VALID_PAGE(mmu->root.hpa) && !root_to_sp(mmu->root.hpa)) kvm_mmu_free_roots(kvm, mmu, KVM_MMU_ROOT_CURRENT); if (VALID_PAGE(mmu->root.hpa)) @@ -4269,10 +5046,12 @@ void kvm_mmu_new_pgd(struct kvm_vcpu *vcpu, gpa_t new_pgd) struct kvm_mmu *mmu = vcpu->arch.mmu; union kvm_mmu_page_role new_role = mmu->root_role; - if (!fast_pgd_switch(vcpu->kvm, mmu, new_pgd, new_role)) { - /* kvm_mmu_ensure_valid_pgd will set up a new root. */ + /* + * Return immediately if no usable root was found, kvm_mmu_reload() + * will establish a valid root prior to the next VM-Enter. + */ + if (!fast_pgd_switch(vcpu->kvm, mmu, new_pgd, new_role)) return; - } /* * It's possible that the cached previous root page is obsolete because @@ -4299,21 +5078,19 @@ void kvm_mmu_new_pgd(struct kvm_vcpu *vcpu, gpa_t new_pgd) * If this is a direct root page, it doesn't have a write flooding * count. Otherwise, clear the write flooding count. */ - if (!new_role.direct) - __clear_sp_write_flooding_count( - to_shadow_page(vcpu->arch.mmu->root.hpa)); -} -EXPORT_SYMBOL_GPL(kvm_mmu_new_pgd); + if (!new_role.direct) { + struct kvm_mmu_page *sp = root_to_sp(vcpu->arch.mmu->root.hpa); -static unsigned long get_cr3(struct kvm_vcpu *vcpu) -{ - return kvm_read_cr3(vcpu); + if (!WARN_ON_ONCE(!sp)) + __clear_sp_write_flooding_count(sp); + } } +EXPORT_SYMBOL_GPL(kvm_mmu_new_pgd); static bool sync_mmio_spte(struct kvm_vcpu *vcpu, u64 *sptep, gfn_t gfn, unsigned int access) { - if (unlikely(is_mmio_spte(*sptep))) { + if (unlikely(is_mmio_spte(vcpu->kvm, *sptep))) { if (gfn != get_mmio_spte_gfn(*sptep)) { mmu_spte_clear_no_track(sptep); return true; @@ -4339,10 +5116,9 @@ static bool sync_mmio_spte(struct kvm_vcpu *vcpu, u64 *sptep, gfn_t gfn, #include "paging_tmpl.h" #undef PTTYPE -static void -__reset_rsvds_bits_mask(struct rsvd_bits_validate *rsvd_check, - u64 pa_bits_rsvd, int level, bool nx, bool gbpages, - bool pse, bool amd) +static void __reset_rsvds_bits_mask(struct rsvd_bits_validate *rsvd_check, + u64 pa_bits_rsvd, int level, bool nx, + bool gbpages, bool pse, bool amd) { u64 gbpages_bit_rsvd = 0; u64 nonleaf_bit8_rsvd = 0; @@ -4429,35 +5205,20 @@ __reset_rsvds_bits_mask(struct rsvd_bits_validate *rsvd_check, } } -static bool guest_can_use_gbpages(struct kvm_vcpu *vcpu) -{ - /* - * If TDP is enabled, let the guest use GBPAGES if they're supported in - * hardware. The hardware page walker doesn't let KVM disable GBPAGES, - * i.e. won't treat them as reserved, and KVM doesn't redo the GVA->GPA - * walk for performance and complexity reasons. Not to mention KVM - * _can't_ solve the problem because GVA->GPA walks aren't visible to - * KVM once a TDP translation is installed. Mimic hardware behavior so - * that KVM's is at least consistent, i.e. doesn't randomly inject #PF. - */ - return tdp_enabled ? boot_cpu_has(X86_FEATURE_GBPAGES) : - guest_cpuid_has(vcpu, X86_FEATURE_GBPAGES); -} - static void reset_guest_rsvds_bits_mask(struct kvm_vcpu *vcpu, struct kvm_mmu *context) { __reset_rsvds_bits_mask(&context->guest_rsvd_check, vcpu->arch.reserved_gpa_bits, context->cpu_role.base.level, is_efer_nx(context), - guest_can_use_gbpages(vcpu), + guest_cpu_cap_has(vcpu, X86_FEATURE_GBPAGES), is_cr4_pse(context), - guest_cpuid_is_amd_or_hygon(vcpu)); + guest_cpuid_is_amd_compatible(vcpu)); } -static void -__reset_rsvds_bits_mask_ept(struct rsvd_bits_validate *rsvd_check, - u64 pa_bits_rsvd, bool execonly, int huge_page_level) +static void __reset_rsvds_bits_mask_ept(struct rsvd_bits_validate *rsvd_check, + u64 pa_bits_rsvd, bool execonly, + int huge_page_level) { u64 high_bits_rsvd = pa_bits_rsvd & rsvd_bits(0, 51); u64 large_1g_rsvd = 0, large_2m_rsvd = 0; @@ -4503,7 +5264,7 @@ static void reset_rsvds_bits_mask_ept(struct kvm_vcpu *vcpu, static inline u64 reserved_hpa_bits(void) { - return rsvd_bits(shadow_phys_bits, 63); + return rsvd_bits(kvm_host.maxphyaddr, 63); } /* @@ -4527,7 +5288,8 @@ static void reset_shadow_zero_bits_mask(struct kvm_vcpu *vcpu, __reset_rsvds_bits_mask(shadow_zero_check, reserved_hpa_bits(), context->root_role.level, context->root_role.efer_nx, - guest_can_use_gbpages(vcpu), is_pse, is_amd); + guest_cpu_cap_has(vcpu, X86_FEATURE_GBPAGES), + is_pse, is_amd); if (!shadow_me_mask) return; @@ -4557,8 +5319,7 @@ static inline bool boot_cpu_is_amd(void) * the direct page table on host, use as much mmu features as * possible, however, kvm currently does not do execution-protection. */ -static void -reset_tdp_shadow_zero_bits_mask(struct kvm_mmu *context) +static void reset_tdp_shadow_zero_bits_mask(struct kvm_mmu *context) { struct rsvd_bits_validate *shadow_zero_check; int i; @@ -4567,7 +5328,7 @@ reset_tdp_shadow_zero_bits_mask(struct kvm_mmu *context) if (boot_cpu_is_amd()) __reset_rsvds_bits_mask(shadow_zero_check, reserved_hpa_bits(), - context->root_role.level, false, + context->root_role.level, true, boot_cpu_has(X86_FEATURE_GBPAGES), false, true); else @@ -4761,20 +5522,18 @@ static void paging64_init_context(struct kvm_mmu *context) { context->page_fault = paging64_page_fault; context->gva_to_gpa = paging64_gva_to_gpa; - context->sync_page = paging64_sync_page; - context->invlpg = paging64_invlpg; + context->sync_spte = paging64_sync_spte; } static void paging32_init_context(struct kvm_mmu *context) { context->page_fault = paging32_page_fault; context->gva_to_gpa = paging32_gva_to_gpa; - context->sync_page = paging32_sync_page; - context->invlpg = paging32_invlpg; + context->sync_spte = paging32_sync_spte; } -static union kvm_cpu_role -kvm_calc_cpu_role(struct kvm_vcpu *vcpu, const struct kvm_mmu_role_regs *regs) +static union kvm_cpu_role kvm_calc_cpu_role(struct kvm_vcpu *vcpu, + const struct kvm_mmu_role_regs *regs) { union kvm_cpu_role role = {0}; @@ -4813,19 +5572,46 @@ kvm_calc_cpu_role(struct kvm_vcpu *vcpu, const struct kvm_mmu_role_regs *regs) return role; } +void __kvm_mmu_refresh_passthrough_bits(struct kvm_vcpu *vcpu, + struct kvm_mmu *mmu) +{ + const bool cr0_wp = kvm_is_cr0_bit_set(vcpu, X86_CR0_WP); + + BUILD_BUG_ON((KVM_MMU_CR0_ROLE_BITS & KVM_POSSIBLE_CR0_GUEST_BITS) != X86_CR0_WP); + BUILD_BUG_ON((KVM_MMU_CR4_ROLE_BITS & KVM_POSSIBLE_CR4_GUEST_BITS)); + + if (is_cr0_wp(mmu) == cr0_wp) + return; + + mmu->cpu_role.base.cr0_wp = cr0_wp; + reset_guest_paging_metadata(vcpu, mmu); +} + static inline int kvm_mmu_get_tdp_level(struct kvm_vcpu *vcpu) { + int maxpa; + + if (vcpu->kvm->arch.vm_type == KVM_X86_TDX_VM) + maxpa = cpuid_query_maxguestphyaddr(vcpu); + else + maxpa = cpuid_maxphyaddr(vcpu); + /* tdp_root_level is architecture forced level, use it if nonzero */ if (tdp_root_level) return tdp_root_level; /* Use 5-level TDP if and only if it's useful/necessary. */ - if (max_tdp_level == 5 && cpuid_maxphyaddr(vcpu) <= 48) + if (max_tdp_level == 5 && maxpa <= 48) return 4; return max_tdp_level; } +u8 kvm_mmu_get_max_tdp_level(void) +{ + return tdp_root_level ? tdp_root_level : max_tdp_level; +} + static union kvm_mmu_page_role kvm_calc_tdp_mmu_root_page_role(struct kvm_vcpu *vcpu, union kvm_cpu_role cpu_role) @@ -4837,7 +5623,7 @@ kvm_calc_tdp_mmu_root_page_role(struct kvm_vcpu *vcpu, role.efer_nx = true; role.smm = cpu_role.base.smm; role.guest_mode = cpu_role.base.guest_mode; - role.ad_disabled = !kvm_ad_enabled(); + role.ad_disabled = !kvm_ad_enabled; role.level = kvm_mmu_get_tdp_level(vcpu); role.direct = true; role.has_4_byte_gpte = false; @@ -4858,9 +5644,8 @@ static void init_kvm_tdp_mmu(struct kvm_vcpu *vcpu, context->cpu_role.as_u64 = cpu_role.as_u64; context->root_role.word = root_role.word; context->page_fault = kvm_tdp_page_fault; - context->sync_page = nonpaging_sync_page; - context->invlpg = NULL; - context->get_guest_pgd = get_cr3; + context->sync_spte = NULL; + context->get_guest_pgd = get_guest_cr3; context->get_pdptr = kvm_pdptr_read; context->inject_page_fault = kvm_inject_page_fault; @@ -4935,7 +5720,7 @@ void kvm_init_shadow_npt_mmu(struct kvm_vcpu *vcpu, unsigned long cr0, union kvm_mmu_page_role root_role; /* NPT requires CR0.PG=1. */ - WARN_ON_ONCE(cpu_role.base.direct); + WARN_ON_ONCE(cpu_role.base.direct || !cpu_role.base.guest_mode); root_role = cpu_role.base; root_role.level = kvm_mmu_get_tdp_level(vcpu); @@ -4990,8 +5775,7 @@ void kvm_init_shadow_ept_mmu(struct kvm_vcpu *vcpu, bool execonly, context->page_fault = ept_page_fault; context->gva_to_gpa = ept_gva_to_gpa; - context->sync_page = ept_sync_page; - context->invlpg = ept_invlpg; + context->sync_spte = ept_sync_spte; update_permission_bitmask(context, true); context->pkru_mask = 0; @@ -5010,7 +5794,7 @@ static void init_kvm_softmmu(struct kvm_vcpu *vcpu, kvm_init_shadow_mmu(vcpu, cpu_role); - context->get_guest_pgd = get_cr3; + context->get_guest_pgd = get_guest_cr3; context->get_pdptr = kvm_pdptr_read; context->inject_page_fault = kvm_inject_page_fault; } @@ -5024,7 +5808,7 @@ static void init_kvm_nested_mmu(struct kvm_vcpu *vcpu, return; g_context->cpu_role.as_u64 = new_mode.as_u64; - g_context->get_guest_pgd = get_cr3; + g_context->get_guest_pgd = get_guest_cr3; g_context->get_pdptr = kvm_pdptr_read; g_context->inject_page_fault = kvm_inject_page_fault; @@ -5032,7 +5816,7 @@ static void init_kvm_nested_mmu(struct kvm_vcpu *vcpu, * L2 page tables are never shadowed, so there is no need to sync * SPTEs. */ - g_context->invlpg = NULL; + g_context->sync_spte = NULL; /* * Note that arch.mmu->gva_to_gpa translates l2_gpa to l1_gpa using @@ -5078,13 +5862,13 @@ void kvm_mmu_after_set_cpuid(struct kvm_vcpu *vcpu) * physical address properties) in a single VM would require tracking * all relevant CPUID information in kvm_mmu_page_role. That is very * undesirable as it would increase the memory requirements for - * gfn_track (see struct kvm_mmu_page_role comments). For now that - * problem is swept under the rug; KVM's CPUID API is horrific and + * gfn_write_track (see struct kvm_mmu_page_role comments). For now + * that problem is swept under the rug; KVM's CPUID API is horrific and * it's all but impossible to solve it without introducing a new API. */ - vcpu->arch.root_mmu.root_role.word = 0; - vcpu->arch.guest_mmu.root_role.word = 0; - vcpu->arch.nested_mmu.root_role.word = 0; + vcpu->arch.root_mmu.root_role.invalid = 1; + vcpu->arch.guest_mmu.root_role.invalid = 1; + vcpu->arch.nested_mmu.root_role.invalid = 1; vcpu->arch.root_mmu.cpu_role.ext.valid = 0; vcpu->arch.guest_mmu.cpu_role.ext.valid = 0; vcpu->arch.nested_mmu.cpu_role.ext.valid = 0; @@ -5094,7 +5878,7 @@ void kvm_mmu_after_set_cpuid(struct kvm_vcpu *vcpu) * Changing guest CPUID after KVM_RUN is forbidden, see the comment in * kvm_arch_vcpu_ioctl(). */ - KVM_BUG_ON(vcpu->arch.last_vmentry_cpu != -1, vcpu->kvm); + KVM_BUG_ON(kvm_vcpu_has_run(vcpu), vcpu->kvm); } void kvm_mmu_reset_context(struct kvm_vcpu *vcpu) @@ -5132,19 +5916,20 @@ int kvm_mmu_load(struct kvm_vcpu *vcpu) * stale entries. Flushing on alloc also allows KVM to skip the TLB * flush when freeing a root (see kvm_tdp_mmu_put_root()). */ - static_call(kvm_x86_flush_tlb_current)(vcpu); + kvm_x86_call(flush_tlb_current)(vcpu); out: return r; } +EXPORT_SYMBOL_GPL(kvm_mmu_load); void kvm_mmu_unload(struct kvm_vcpu *vcpu) { struct kvm *kvm = vcpu->kvm; kvm_mmu_free_roots(kvm, &vcpu->arch.root_mmu, KVM_MMU_ROOTS_ALL); - WARN_ON(VALID_PAGE(vcpu->arch.root_mmu.root.hpa)); + WARN_ON_ONCE(VALID_PAGE(vcpu->arch.root_mmu.root.hpa)); kvm_mmu_free_roots(kvm, &vcpu->arch.guest_mmu, KVM_MMU_ROOTS_ALL); - WARN_ON(VALID_PAGE(vcpu->arch.guest_mmu.root.hpa)); + WARN_ON_ONCE(VALID_PAGE(vcpu->arch.guest_mmu.root.hpa)); vcpu_clear_mmio_info(vcpu, MMIO_GVA_ANY); } @@ -5157,16 +5942,21 @@ static bool is_obsolete_root(struct kvm *kvm, hpa_t root_hpa) /* * When freeing obsolete roots, treat roots as obsolete if they don't - * have an associated shadow page. This does mean KVM will get false + * have an associated shadow page, as it's impossible to determine if + * such roots are fresh or stale. This does mean KVM will get false * positives and free roots that don't strictly need to be freed, but * such false positives are relatively rare: * - * (a) only PAE paging and nested NPT has roots without shadow pages + * (a) only PAE paging and nested NPT have roots without shadow pages + * (or any shadow paging flavor with a dummy root, see note below) * (b) remote reloads due to a memslot update obsoletes _all_ roots * (c) KVM doesn't track previous roots for PAE paging, and the guest * is unlikely to zap an in-use PGD. + * + * Note! Dummy roots are unique in that they are obsoleted by memslot + * _creation_! See also FNAME(fetch). */ - sp = to_shadow_page(root_hpa); + sp = root_to_sp(root_hpa); return !sp || is_obsolete_sp(kvm, sp); } @@ -5192,19 +5982,7 @@ void kvm_mmu_free_obsolete_roots(struct kvm_vcpu *vcpu) __kvm_mmu_free_obsolete_roots(vcpu->kvm, &vcpu->arch.root_mmu); __kvm_mmu_free_obsolete_roots(vcpu->kvm, &vcpu->arch.guest_mmu); } - -static bool need_remote_flush(u64 old, u64 new) -{ - if (!is_shadow_present_pte(old)) - return false; - if (!is_shadow_present_pte(new)) - return true; - if ((old ^ new) & PT64_BASE_ADDR_MASK) - return true; - old ^= shadow_nx_mask; - new ^= shadow_nx_mask; - return (old & ~new & PT64_PERM_MASK) != 0; -} +EXPORT_SYMBOL_GPL(kvm_mmu_free_obsolete_roots); static u64 mmu_pte_write_fetch_gpte(struct kvm_vcpu *vcpu, gpa_t *gpa, int *bytes) @@ -5258,9 +6036,6 @@ static bool detect_write_misaligned(struct kvm_mmu_page *sp, gpa_t gpa, { unsigned offset, pte_size, misaligned; - pgprintk("misaligned: gpa %llx bytes %d role %x\n", - gpa, bytes, sp->role.word); - offset = offset_in_page(gpa); pte_size = sp->role.has_4_byte_gpte ? 4 : 8; @@ -5308,9 +6083,8 @@ static u64 *get_written_sptes(struct kvm_mmu_page *sp, gpa_t gpa, int *nspte) return spte; } -static void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa, - const u8 *new, int bytes, - struct kvm_page_track_notifier_node *node) +void kvm_mmu_track_write(struct kvm_vcpu *vcpu, gpa_t gpa, const u8 *new, + int bytes) { gfn_t gfn = gpa >> PAGE_SHIFT; struct kvm_mmu_page *sp; @@ -5320,21 +6094,17 @@ static void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa, bool flush = false; /* - * If we don't have indirect shadow pages, it means no page is - * write-protected, so we can exit simply. + * When emulating guest writes, ensure the written value is visible to + * any task that is handling page faults before checking whether or not + * KVM is shadowing a guest PTE. This ensures either KVM will create + * the correct SPTE in the page fault handler, or this task will see + * a non-zero indirect_shadow_pages. Pairs with the smp_mb() in + * account_shadowed(). */ - if (!READ_ONCE(vcpu->kvm->arch.indirect_shadow_pages)) + smp_mb(); + if (!vcpu->kvm->arch.indirect_shadow_pages) return; - pgprintk("%s: gpa %llx bytes %d\n", __func__, gpa, bytes); - - /* - * No need to care whether allocation memory is successful - * or not since pte prefetch is skipped if it does not have - * enough objects in the cache. - */ - mmu_topup_memory_caches(vcpu, true); - write_lock(&vcpu->kvm->mmu_lock); gentry = mmu_pte_write_fetch_gpte(vcpu, &gpa, &bytes); @@ -5358,7 +6128,7 @@ static void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa, mmu_page_zap_pte(vcpu->kvm, sp, spte, NULL); if (gentry && sp->role.level != PG_LEVEL_4K) ++vcpu->kvm->stat.mmu_pde_zapped; - if (need_remote_flush(entry, *spte)) + if (is_shadow_present_pte(entry)) flush = true; ++spte; } @@ -5367,108 +6137,274 @@ static void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa, write_unlock(&vcpu->kvm->mmu_lock); } +static bool is_write_to_guest_page_table(u64 error_code) +{ + const u64 mask = PFERR_GUEST_PAGE_MASK | PFERR_WRITE_MASK | PFERR_PRESENT_MASK; + + return (error_code & mask) == mask; +} + +static int kvm_mmu_write_protect_fault(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa, + u64 error_code, int *emulation_type) +{ + bool direct = vcpu->arch.mmu->root_role.direct; + + /* + * Do not try to unprotect and retry if the vCPU re-faulted on the same + * RIP with the same address that was previously unprotected, as doing + * so will likely put the vCPU into an infinite. E.g. if the vCPU uses + * a non-page-table modifying instruction on the PDE that points to the + * instruction, then unprotecting the gfn will unmap the instruction's + * code, i.e. make it impossible for the instruction to ever complete. + */ + if (vcpu->arch.last_retry_eip == kvm_rip_read(vcpu) && + vcpu->arch.last_retry_addr == cr2_or_gpa) + return RET_PF_EMULATE; + + /* + * Reset the unprotect+retry values that guard against infinite loops. + * The values will be refreshed if KVM explicitly unprotects a gfn and + * retries, in all other cases it's safe to retry in the future even if + * the next page fault happens on the same RIP+address. + */ + vcpu->arch.last_retry_eip = 0; + vcpu->arch.last_retry_addr = 0; + + /* + * It should be impossible to reach this point with an MMIO cache hit, + * as RET_PF_WRITE_PROTECTED is returned if and only if there's a valid, + * writable memslot, and creating a memslot should invalidate the MMIO + * cache by way of changing the memslot generation. WARN and disallow + * retry if MMIO is detected, as retrying MMIO emulation is pointless + * and could put the vCPU into an infinite loop because the processor + * will keep faulting on the non-existent MMIO address. + */ + if (WARN_ON_ONCE(mmio_info_in_cache(vcpu, cr2_or_gpa, direct))) + return RET_PF_EMULATE; + + /* + * Before emulating the instruction, check to see if the access was due + * to a read-only violation while the CPU was walking non-nested NPT + * page tables, i.e. for a direct MMU, for _guest_ page tables in L1. + * If L1 is sharing (a subset of) its page tables with L2, e.g. by + * having nCR3 share lower level page tables with hCR3, then when KVM + * (L0) write-protects the nested NPTs, i.e. npt12 entries, KVM is also + * unknowingly write-protecting L1's guest page tables, which KVM isn't + * shadowing. + * + * Because the CPU (by default) walks NPT page tables using a write + * access (to ensure the CPU can do A/D updates), page walks in L1 can + * trigger write faults for the above case even when L1 isn't modifying + * PTEs. As a result, KVM will unnecessarily emulate (or at least, try + * to emulate) an excessive number of L1 instructions; because L1's MMU + * isn't shadowed by KVM, there is no need to write-protect L1's gPTEs + * and thus no need to emulate in order to guarantee forward progress. + * + * Try to unprotect the gfn, i.e. zap any shadow pages, so that L1 can + * proceed without triggering emulation. If one or more shadow pages + * was zapped, skip emulation and resume L1 to let it natively execute + * the instruction. If no shadow pages were zapped, then the write- + * fault is due to something else entirely, i.e. KVM needs to emulate, + * as resuming the guest will put it into an infinite loop. + * + * Note, this code also applies to Intel CPUs, even though it is *very* + * unlikely that an L1 will share its page tables (IA32/PAE/paging64 + * format) with L2's page tables (EPT format). + * + * For indirect MMUs, i.e. if KVM is shadowing the current MMU, try to + * unprotect the gfn and retry if an event is awaiting reinjection. If + * KVM emulates multiple instructions before completing event injection, + * the event could be delayed beyond what is architecturally allowed, + * e.g. KVM could inject an IRQ after the TPR has been raised. + */ + if (((direct && is_write_to_guest_page_table(error_code)) || + (!direct && kvm_event_needs_reinjection(vcpu))) && + kvm_mmu_unprotect_gfn_and_retry(vcpu, cr2_or_gpa)) + return RET_PF_RETRY; + + /* + * The gfn is write-protected, but if KVM detects its emulating an + * instruction that is unlikely to be used to modify page tables, or if + * emulation fails, KVM can try to unprotect the gfn and let the CPU + * re-execute the instruction that caused the page fault. Do not allow + * retrying an instruction from a nested guest as KVM is only explicitly + * shadowing L1's page tables, i.e. unprotecting something for L1 isn't + * going to magically fix whatever issue caused L2 to fail. + */ + if (!is_guest_mode(vcpu)) + *emulation_type |= EMULTYPE_ALLOW_RETRY_PF; + + return RET_PF_EMULATE; +} + int noinline kvm_mmu_page_fault(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa, u64 error_code, void *insn, int insn_len) { int r, emulation_type = EMULTYPE_PF; bool direct = vcpu->arch.mmu->root_role.direct; - if (WARN_ON(!VALID_PAGE(vcpu->arch.mmu->root.hpa))) + if (WARN_ON_ONCE(!VALID_PAGE(vcpu->arch.mmu->root.hpa))) return RET_PF_RETRY; + /* + * Except for reserved faults (emulated MMIO is shared-only), set the + * PFERR_PRIVATE_ACCESS flag for software-protected VMs based on the gfn's + * current attributes, which are the source of truth for such VMs. Note, + * this wrong for nested MMUs as the GPA is an L2 GPA, but KVM doesn't + * currently supported nested virtualization (among many other things) + * for software-protected VMs. + */ + if (IS_ENABLED(CONFIG_KVM_SW_PROTECTED_VM) && + !(error_code & PFERR_RSVD_MASK) && + vcpu->kvm->arch.vm_type == KVM_X86_SW_PROTECTED_VM && + kvm_mem_is_private(vcpu->kvm, gpa_to_gfn(cr2_or_gpa))) + error_code |= PFERR_PRIVATE_ACCESS; + r = RET_PF_INVALID; if (unlikely(error_code & PFERR_RSVD_MASK)) { + if (WARN_ON_ONCE(error_code & PFERR_PRIVATE_ACCESS)) + return -EFAULT; + r = handle_mmio_page_fault(vcpu, cr2_or_gpa, direct); if (r == RET_PF_EMULATE) goto emulate; } if (r == RET_PF_INVALID) { - r = kvm_mmu_do_page_fault(vcpu, cr2_or_gpa, - lower_32_bits(error_code), false); + vcpu->stat.pf_taken++; + + r = kvm_mmu_do_page_fault(vcpu, cr2_or_gpa, error_code, false, + &emulation_type, NULL); if (KVM_BUG_ON(r == RET_PF_INVALID, vcpu->kvm)) return -EIO; } if (r < 0) return r; - if (r != RET_PF_EMULATE) - return 1; + + if (r == RET_PF_WRITE_PROTECTED) + r = kvm_mmu_write_protect_fault(vcpu, cr2_or_gpa, error_code, + &emulation_type); + + if (r == RET_PF_FIXED) + vcpu->stat.pf_fixed++; + else if (r == RET_PF_EMULATE) + vcpu->stat.pf_emulate++; + else if (r == RET_PF_SPURIOUS) + vcpu->stat.pf_spurious++; /* - * Before emulating the instruction, check if the error code - * was due to a RO violation while translating the guest page. - * This can occur when using nested virtualization with nested - * paging in both guests. If true, we simply unprotect the page - * and resume the guest. + * None of handle_mmio_page_fault(), kvm_mmu_do_page_fault(), or + * kvm_mmu_write_protect_fault() return RET_PF_CONTINUE. + * kvm_mmu_do_page_fault() only uses RET_PF_CONTINUE internally to + * indicate continuing the page fault handling until to the final + * page table mapping phase. */ - if (vcpu->arch.mmu->root_role.direct && - (error_code & PFERR_NESTED_GUEST_PAGE) == PFERR_NESTED_GUEST_PAGE) { - kvm_mmu_unprotect_page(vcpu->kvm, gpa_to_gfn(cr2_or_gpa)); - return 1; - } + WARN_ON_ONCE(r == RET_PF_CONTINUE); + if (r != RET_PF_EMULATE) + return r; - /* - * vcpu->arch.mmu.page_fault returned RET_PF_EMULATE, but we can still - * optimistically try to just unprotect the page and let the processor - * re-execute the instruction that caused the page fault. Do not allow - * retrying MMIO emulation, as it's not only pointless but could also - * cause us to enter an infinite loop because the processor will keep - * faulting on the non-existent MMIO address. Retrying an instruction - * from a nested guest is also pointless and dangerous as we are only - * explicitly shadowing L1's page tables, i.e. unprotecting something - * for L1 isn't going to magically fix whatever issue cause L2 to fail. - */ - if (!mmio_info_in_cache(vcpu, cr2_or_gpa, direct) && !is_guest_mode(vcpu)) - emulation_type |= EMULTYPE_ALLOW_RETRY_PF; emulate: return x86_emulate_instruction(vcpu, cr2_or_gpa, emulation_type, insn, insn_len); } EXPORT_SYMBOL_GPL(kvm_mmu_page_fault); -void kvm_mmu_invalidate_gva(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu, - gva_t gva, hpa_t root_hpa) +void kvm_mmu_print_sptes(struct kvm_vcpu *vcpu, gpa_t gpa, const char *msg) +{ + u64 sptes[PT64_ROOT_MAX_LEVEL + 1]; + int root_level, leaf, level; + + leaf = get_sptes_lockless(vcpu, gpa, sptes, &root_level); + if (unlikely(leaf < 0)) + return; + + pr_err("%s %llx", msg, gpa); + for (level = root_level; level >= leaf; level--) + pr_cont(", spte[%d] = 0x%llx", level, sptes[level]); + pr_cont("\n"); +} +EXPORT_SYMBOL_GPL(kvm_mmu_print_sptes); + +static void __kvm_mmu_invalidate_addr(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu, + u64 addr, hpa_t root_hpa) +{ + struct kvm_shadow_walk_iterator iterator; + + vcpu_clear_mmio_info(vcpu, addr); + + /* + * Walking and synchronizing SPTEs both assume they are operating in + * the context of the current MMU, and would need to be reworked if + * this is ever used to sync the guest_mmu, e.g. to emulate INVEPT. + */ + if (WARN_ON_ONCE(mmu != vcpu->arch.mmu)) + return; + + if (!VALID_PAGE(root_hpa)) + return; + + write_lock(&vcpu->kvm->mmu_lock); + for_each_shadow_entry_using_root(vcpu, root_hpa, addr, iterator) { + struct kvm_mmu_page *sp = sptep_to_sp(iterator.sptep); + + if (sp->unsync) { + int ret = kvm_sync_spte(vcpu, sp, iterator.index); + + if (ret < 0) + mmu_page_zap_pte(vcpu->kvm, sp, iterator.sptep, NULL); + if (ret) + kvm_flush_remote_tlbs_sptep(vcpu->kvm, iterator.sptep); + } + + if (!sp->unsync_children) + break; + } + write_unlock(&vcpu->kvm->mmu_lock); +} + +void kvm_mmu_invalidate_addr(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu, + u64 addr, unsigned long roots) { int i; + WARN_ON_ONCE(roots & ~KVM_MMU_ROOTS_ALL); + /* It's actually a GPA for vcpu->arch.guest_mmu. */ if (mmu != &vcpu->arch.guest_mmu) { /* INVLPG on a non-canonical address is a NOP according to the SDM. */ - if (is_noncanonical_address(gva, vcpu)) + if (is_noncanonical_invlpg_address(addr, vcpu)) return; - static_call(kvm_x86_flush_tlb_gva)(vcpu, gva); + kvm_x86_call(flush_tlb_gva)(vcpu, addr); } - if (!mmu->invlpg) + if (!mmu->sync_spte) return; - if (root_hpa == INVALID_PAGE) { - mmu->invlpg(vcpu, gva, mmu->root.hpa); + if (roots & KVM_MMU_ROOT_CURRENT) + __kvm_mmu_invalidate_addr(vcpu, mmu, addr, mmu->root.hpa); - /* - * INVLPG is required to invalidate any global mappings for the VA, - * irrespective of PCID. Since it would take us roughly similar amount - * of work to determine whether any of the prev_root mappings of the VA - * is marked global, or to just sync it blindly, so we might as well - * just always sync it. - * - * Mappings not reachable via the current cr3 or the prev_roots will be - * synced when switching to that cr3, so nothing needs to be done here - * for them. - */ - for (i = 0; i < KVM_MMU_NUM_PREV_ROOTS; i++) - if (VALID_PAGE(mmu->prev_roots[i].hpa)) - mmu->invlpg(vcpu, gva, mmu->prev_roots[i].hpa); - } else { - mmu->invlpg(vcpu, gva, root_hpa); + for (i = 0; i < KVM_MMU_NUM_PREV_ROOTS; i++) { + if (roots & KVM_MMU_ROOT_PREVIOUS(i)) + __kvm_mmu_invalidate_addr(vcpu, mmu, addr, mmu->prev_roots[i].hpa); } } +EXPORT_SYMBOL_GPL(kvm_mmu_invalidate_addr); void kvm_mmu_invlpg(struct kvm_vcpu *vcpu, gva_t gva) { - kvm_mmu_invalidate_gva(vcpu, vcpu->arch.walk_mmu, gva, INVALID_PAGE); + /* + * INVLPG is required to invalidate any global mappings for the VA, + * irrespective of PCID. Blindly sync all roots as it would take + * roughly the same amount of work/time to determine whether any of the + * previous roots have a global mapping. + * + * Mappings not reachable via the current or previous cached roots will + * be synced when switching to that new cr3, so nothing needs to be + * done here for them. + */ + kvm_mmu_invalidate_addr(vcpu, vcpu->arch.walk_mmu, gva, KVM_MMU_ROOTS_ALL); ++vcpu->stat.invlpg; } EXPORT_SYMBOL_GPL(kvm_mmu_invlpg); @@ -5477,27 +6413,20 @@ EXPORT_SYMBOL_GPL(kvm_mmu_invlpg); void kvm_mmu_invpcid_gva(struct kvm_vcpu *vcpu, gva_t gva, unsigned long pcid) { struct kvm_mmu *mmu = vcpu->arch.mmu; - bool tlb_flush = false; + unsigned long roots = 0; uint i; - if (pcid == kvm_get_active_pcid(vcpu)) { - if (mmu->invlpg) - mmu->invlpg(vcpu, gva, mmu->root.hpa); - tlb_flush = true; - } + if (pcid == kvm_get_active_pcid(vcpu)) + roots |= KVM_MMU_ROOT_CURRENT; for (i = 0; i < KVM_MMU_NUM_PREV_ROOTS; i++) { if (VALID_PAGE(mmu->prev_roots[i].hpa) && - pcid == kvm_get_pcid(vcpu, mmu->prev_roots[i].pgd)) { - if (mmu->invlpg) - mmu->invlpg(vcpu, gva, mmu->prev_roots[i].hpa); - tlb_flush = true; - } + pcid == kvm_get_pcid(vcpu, mmu->prev_roots[i].pgd)) + roots |= KVM_MMU_ROOT_PREVIOUS(i); } - if (tlb_flush) - static_call(kvm_x86_flush_tlb_gva)(vcpu, gva); - + if (roots) + kvm_mmu_invalidate_addr(vcpu, mmu, gva, roots); ++vcpu->stat.invlpg; /* @@ -5514,6 +6443,9 @@ void kvm_configure_mmu(bool enable_tdp, int tdp_forced_root_level, tdp_root_level = tdp_forced_root_level; max_tdp_level = tdp_max_root_level; +#ifdef CONFIG_X86_64 + tdp_mmu_enabled = tdp_mmu_allowed && tdp_enabled; +#endif /* * max_huge_page_level reflects KVM's MMU capabilities irrespective * of kernel support, e.g. KVM may be capable of using 1GB pages when @@ -5530,58 +6462,6 @@ void kvm_configure_mmu(bool enable_tdp, int tdp_forced_root_level, } EXPORT_SYMBOL_GPL(kvm_configure_mmu); -/* The return value indicates if tlb flush on all vcpus is needed. */ -typedef bool (*slot_level_handler) (struct kvm *kvm, - struct kvm_rmap_head *rmap_head, - const struct kvm_memory_slot *slot); - -/* The caller should hold mmu-lock before calling this function. */ -static __always_inline bool -slot_handle_level_range(struct kvm *kvm, const struct kvm_memory_slot *memslot, - slot_level_handler fn, int start_level, int end_level, - gfn_t start_gfn, gfn_t end_gfn, bool flush_on_yield, - bool flush) -{ - struct slot_rmap_walk_iterator iterator; - - for_each_slot_rmap_range(memslot, start_level, end_level, start_gfn, - end_gfn, &iterator) { - if (iterator.rmap) - flush |= fn(kvm, iterator.rmap, memslot); - - if (need_resched() || rwlock_needbreak(&kvm->mmu_lock)) { - if (flush && flush_on_yield) { - kvm_flush_remote_tlbs_with_address(kvm, - start_gfn, - iterator.gfn - start_gfn + 1); - flush = false; - } - cond_resched_rwlock_write(&kvm->mmu_lock); - } - } - - return flush; -} - -static __always_inline bool -slot_handle_level(struct kvm *kvm, const struct kvm_memory_slot *memslot, - slot_level_handler fn, int start_level, int end_level, - bool flush_on_yield) -{ - return slot_handle_level_range(kvm, memslot, fn, start_level, - end_level, memslot->base_gfn, - memslot->base_gfn + memslot->npages - 1, - flush_on_yield, false); -} - -static __always_inline bool -slot_handle_level_4k(struct kvm *kvm, const struct kvm_memory_slot *memslot, - slot_level_handler fn, bool flush_on_yield) -{ - return slot_handle_level(kvm, memslot, fn, PG_LEVEL_4K, - PG_LEVEL_4K, flush_on_yield); -} - static void free_mmu_pages(struct kvm_mmu *mmu) { if (!tdp_enabled && mmu->pae_root) @@ -5598,6 +6478,7 @@ static int __kvm_mmu_create(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu) mmu->root.hpa = INVALID_PAGE; mmu->root.pgd = 0; + mmu->mirror_root_hpa = INVALID_PAGE; for (i = 0; i < KVM_MMU_NUM_PREV_ROOTS; i++) mmu->prev_roots[i] = KVM_MMU_ROOT_INFO_INVALID; @@ -5654,7 +6535,10 @@ int kvm_mmu_create(struct kvm_vcpu *vcpu) vcpu->arch.mmu_page_header_cache.kmem_cache = mmu_page_header_cache; vcpu->arch.mmu_page_header_cache.gfp_zero = __GFP_ZERO; - vcpu->arch.mmu_shadow_page_cache.gfp_zero = __GFP_ZERO; + vcpu->arch.mmu_shadow_page_cache.init_value = + SHADOW_NONPRESENT_VALUE; + if (!vcpu->arch.mmu_shadow_page_cache.init_value) + vcpu->arch.mmu_shadow_page_cache.gfp_zero = __GFP_ZERO; vcpu->arch.mmu = &vcpu->arch.root_mmu; vcpu->arch.walk_mmu = &vcpu->arch.root_mmu; @@ -5678,8 +6562,11 @@ static void kvm_zap_obsolete_pages(struct kvm *kvm) { struct kvm_mmu_page *sp, *node; int nr_zapped, batch = 0; + LIST_HEAD(invalid_list); bool unstable; + lockdep_assert_held(&kvm->slots_lock); + restart: list_for_each_entry_safe_reverse(sp, node, &kvm->arch.active_mmu_pages, link) { @@ -5695,7 +6582,7 @@ restart: * pages. Skip the bogus page, otherwise we'll get stuck in an * infinite loop if the page gets put back on the list (again). */ - if (WARN_ON(sp->role.invalid)) + if (WARN_ON_ONCE(sp->role.invalid)) continue; /* @@ -5711,7 +6598,7 @@ restart: } unstable = __kvm_mmu_prepare_zap_page(kvm, sp, - &kvm->arch.zapped_obsolete_pages, &nr_zapped); + &invalid_list, &nr_zapped); batch += nr_zapped; if (unstable) @@ -5727,7 +6614,7 @@ restart: * kvm_mmu_load()), and the reload in the caller ensure no vCPUs are * running with an obsolete MMU. */ - kvm_mmu_commit_zap_page(kvm, &kvm->arch.zapped_obsolete_pages); + kvm_mmu_commit_zap_page(kvm, &invalid_list); } /* @@ -5761,8 +6648,13 @@ static void kvm_mmu_zap_all_fast(struct kvm *kvm) * write and in the same critical section as making the reload request, * e.g. before kvm_zap_obsolete_pages() could drop mmu_lock and yield. */ - if (is_tdp_mmu_enabled(kvm)) - kvm_tdp_mmu_invalidate_all_roots(kvm); + if (tdp_mmu_enabled) { + /* + * External page tables don't support fast zapping, therefore + * their mirrors must be invalidated separately by the caller. + */ + kvm_tdp_mmu_invalidate_roots(kvm, KVM_DIRECT_ROOTS); + } /* * Notify all vcpus to reload its shadow page table and flush TLB. @@ -5786,52 +6678,45 @@ static void kvm_mmu_zap_all_fast(struct kvm *kvm) * Deferring the zap until the final reference to the root is put would * lead to use-after-free. */ - if (is_tdp_mmu_enabled(kvm)) - kvm_tdp_mmu_zap_invalidated_roots(kvm); + if (tdp_mmu_enabled) + kvm_tdp_mmu_zap_invalidated_roots(kvm, true); } -static bool kvm_has_zapped_obsolete_pages(struct kvm *kvm) +void kvm_mmu_init_vm(struct kvm *kvm) { - return unlikely(!list_empty_careful(&kvm->arch.zapped_obsolete_pages)); -} + kvm->arch.shadow_mmio_value = shadow_mmio_value; + INIT_LIST_HEAD(&kvm->arch.active_mmu_pages); + INIT_LIST_HEAD(&kvm->arch.possible_nx_huge_pages); + spin_lock_init(&kvm->arch.mmu_unsync_pages_lock); -static void kvm_mmu_invalidate_zap_pages_in_memslot(struct kvm *kvm, - struct kvm_memory_slot *slot, - struct kvm_page_track_notifier_node *node) -{ - kvm_mmu_zap_all_fast(kvm); -} + if (tdp_mmu_enabled) + kvm_mmu_init_tdp_mmu(kvm); -int kvm_mmu_init_vm(struct kvm *kvm) -{ - struct kvm_page_track_notifier_node *node = &kvm->arch.mmu_sp_tracker; - int r; + kvm->arch.split_page_header_cache.kmem_cache = mmu_page_header_cache; + kvm->arch.split_page_header_cache.gfp_zero = __GFP_ZERO; - INIT_LIST_HEAD(&kvm->arch.active_mmu_pages); - INIT_LIST_HEAD(&kvm->arch.zapped_obsolete_pages); - INIT_LIST_HEAD(&kvm->arch.lpage_disallowed_mmu_pages); - spin_lock_init(&kvm->arch.mmu_unsync_pages_lock); + kvm->arch.split_shadow_page_cache.gfp_zero = __GFP_ZERO; - r = kvm_mmu_init_tdp_mmu(kvm); - if (r < 0) - return r; + kvm->arch.split_desc_cache.kmem_cache = pte_list_desc_cache; + kvm->arch.split_desc_cache.gfp_zero = __GFP_ZERO; +} - node->track_write = kvm_mmu_pte_write; - node->track_flush_slot = kvm_mmu_invalidate_zap_pages_in_memslot; - kvm_page_track_register_notifier(kvm, node); - return 0; +static void mmu_free_vm_memory_caches(struct kvm *kvm) +{ + kvm_mmu_free_memory_cache(&kvm->arch.split_desc_cache); + kvm_mmu_free_memory_cache(&kvm->arch.split_page_header_cache); + kvm_mmu_free_memory_cache(&kvm->arch.split_shadow_page_cache); } void kvm_mmu_uninit_vm(struct kvm *kvm) { - struct kvm_page_track_notifier_node *node = &kvm->arch.mmu_sp_tracker; + if (tdp_mmu_enabled) + kvm_mmu_uninit_tdp_mmu(kvm); - kvm_page_track_unregister_notifier(kvm, node); - - kvm_mmu_uninit_tdp_mmu(kvm); + mmu_free_vm_memory_caches(kvm); } -static bool __kvm_zap_rmaps(struct kvm *kvm, gfn_t gfn_start, gfn_t gfn_end) +static bool kvm_rmap_zap_gfn_range(struct kvm *kvm, gfn_t gfn_start, gfn_t gfn_end) { const struct kvm_memory_slot *memslot; struct kvm_memslots *slots; @@ -5843,7 +6728,7 @@ static bool __kvm_zap_rmaps(struct kvm *kvm, gfn_t gfn_start, gfn_t gfn_end) if (!kvm_memslots_have_rmaps(kvm)) return flush; - for (i = 0; i < KVM_ADDRESS_SPACE_NUM; i++) { + for (i = 0; i < kvm_arch_nr_memslot_as_ids(kvm); i++) { slots = __kvm_memslots(kvm, i); kvm_for_each_memslot_in_gfn_range(&iter, slots, gfn_start, gfn_end) { @@ -5853,10 +6738,8 @@ static bool __kvm_zap_rmaps(struct kvm *kvm, gfn_t gfn_start, gfn_t gfn_end) if (WARN_ON_ONCE(start >= end)) continue; - flush = slot_handle_level_range(kvm, memslot, kvm_zap_rmapp, - - PG_LEVEL_4K, KVM_MAX_HUGEPAGE_LEVEL, - start, end - 1, true, flush); + flush = __kvm_rmap_zap_gfn_range(kvm, memslot, start, + end, true, flush); } } @@ -5870,28 +6753,25 @@ static bool __kvm_zap_rmaps(struct kvm *kvm, gfn_t gfn_start, gfn_t gfn_end) void kvm_zap_gfn_range(struct kvm *kvm, gfn_t gfn_start, gfn_t gfn_end) { bool flush; - int i; if (WARN_ON_ONCE(gfn_end <= gfn_start)) return; write_lock(&kvm->mmu_lock); - kvm_inc_notifier_count(kvm, gfn_start, gfn_end); + kvm_mmu_invalidate_begin(kvm); - flush = __kvm_zap_rmaps(kvm, gfn_start, gfn_end); + kvm_mmu_invalidate_range_add(kvm, gfn_start, gfn_end); - if (is_tdp_mmu_enabled(kvm)) { - for (i = 0; i < KVM_ADDRESS_SPACE_NUM; i++) - flush = kvm_tdp_mmu_zap_leafs(kvm, i, gfn_start, - gfn_end, true, flush); - } + flush = kvm_rmap_zap_gfn_range(kvm, gfn_start, gfn_end); + + if (tdp_mmu_enabled) + flush = kvm_tdp_mmu_zap_leafs(kvm, gfn_start, gfn_end, flush); if (flush) - kvm_flush_remote_tlbs_with_address(kvm, gfn_start, - gfn_end - gfn_start); + kvm_flush_remote_tlbs_range(kvm, gfn_start, gfn_end - gfn_start); - kvm_dec_notifier_count(kvm, gfn_start, gfn_end); + kvm_mmu_invalidate_end(kvm); write_unlock(&kvm->mmu_lock); } @@ -5907,47 +6787,247 @@ void kvm_mmu_slot_remove_write_access(struct kvm *kvm, const struct kvm_memory_slot *memslot, int start_level) { - bool flush = false; - if (kvm_memslots_have_rmaps(kvm)) { write_lock(&kvm->mmu_lock); - flush = slot_handle_level(kvm, memslot, slot_rmap_write_protect, - start_level, KVM_MAX_HUGEPAGE_LEVEL, - false); + walk_slot_rmaps(kvm, memslot, slot_rmap_write_protect, + start_level, KVM_MAX_HUGEPAGE_LEVEL, false); write_unlock(&kvm->mmu_lock); } - if (is_tdp_mmu_enabled(kvm)) { + if (tdp_mmu_enabled) { read_lock(&kvm->mmu_lock); - flush |= kvm_tdp_mmu_wrprot_slot(kvm, memslot, start_level); + kvm_tdp_mmu_wrprot_slot(kvm, memslot, start_level); read_unlock(&kvm->mmu_lock); } +} + +static inline bool need_topup(struct kvm_mmu_memory_cache *cache, int min) +{ + return kvm_mmu_memory_cache_nr_free_objects(cache) < min; +} + +static bool need_topup_split_caches_or_resched(struct kvm *kvm) +{ + if (need_resched() || rwlock_needbreak(&kvm->mmu_lock)) + return true; /* - * Flush TLBs if any SPTEs had to be write-protected to ensure that - * guest writes are reflected in the dirty bitmap before the memslot - * update completes, i.e. before enabling dirty logging is visible to - * userspace. - * - * Perform the TLB flush outside the mmu_lock to reduce the amount of - * time the lock is held. However, this does mean that another CPU can - * now grab mmu_lock and encounter a write-protected SPTE while CPUs - * still have a writable mapping for the associated GFN in their TLB. - * - * This is safe but requires KVM to be careful when making decisions - * based on the write-protection status of an SPTE. Specifically, KVM - * also write-protects SPTEs to monitor changes to guest page tables - * during shadow paging, and must guarantee no CPUs can write to those - * page before the lock is dropped. As mentioned in the previous - * paragraph, a write-protected SPTE is no guarantee that CPU cannot - * perform writes. So to determine if a TLB flush is truly required, KVM - * will clear a separate software-only bit (MMU-writable) and skip the - * flush if-and-only-if this bit was already clear. - * - * See is_writable_pte() for more details. + * In the worst case, SPLIT_DESC_CACHE_MIN_NR_OBJECTS descriptors are needed + * to split a single huge page. Calculating how many are actually needed + * is possible but not worth the complexity. */ - if (flush) - kvm_arch_flush_remote_tlbs_memslot(kvm, memslot); + return need_topup(&kvm->arch.split_desc_cache, SPLIT_DESC_CACHE_MIN_NR_OBJECTS) || + need_topup(&kvm->arch.split_page_header_cache, 1) || + need_topup(&kvm->arch.split_shadow_page_cache, 1); +} + +static int topup_split_caches(struct kvm *kvm) +{ + /* + * Allocating rmap list entries when splitting huge pages for nested + * MMUs is uncommon as KVM needs to use a list if and only if there is + * more than one rmap entry for a gfn, i.e. requires an L1 gfn to be + * aliased by multiple L2 gfns and/or from multiple nested roots with + * different roles. Aliasing gfns when using TDP is atypical for VMMs; + * a few gfns are often aliased during boot, e.g. when remapping BIOS, + * but aliasing rarely occurs post-boot or for many gfns. If there is + * only one rmap entry, rmap->val points directly at that one entry and + * doesn't need to allocate a list. Buffer the cache by the default + * capacity so that KVM doesn't have to drop mmu_lock to topup if KVM + * encounters an aliased gfn or two. + */ + const int capacity = SPLIT_DESC_CACHE_MIN_NR_OBJECTS + + KVM_ARCH_NR_OBJS_PER_MEMORY_CACHE; + int r; + + lockdep_assert_held(&kvm->slots_lock); + + r = __kvm_mmu_topup_memory_cache(&kvm->arch.split_desc_cache, capacity, + SPLIT_DESC_CACHE_MIN_NR_OBJECTS); + if (r) + return r; + + r = kvm_mmu_topup_memory_cache(&kvm->arch.split_page_header_cache, 1); + if (r) + return r; + + return kvm_mmu_topup_memory_cache(&kvm->arch.split_shadow_page_cache, 1); +} + +static struct kvm_mmu_page *shadow_mmu_get_sp_for_split(struct kvm *kvm, u64 *huge_sptep) +{ + struct kvm_mmu_page *huge_sp = sptep_to_sp(huge_sptep); + struct shadow_page_caches caches = {}; + union kvm_mmu_page_role role; + unsigned int access; + gfn_t gfn; + + gfn = kvm_mmu_page_get_gfn(huge_sp, spte_index(huge_sptep)); + access = kvm_mmu_page_get_access(huge_sp, spte_index(huge_sptep)); + + /* + * Note, huge page splitting always uses direct shadow pages, regardless + * of whether the huge page itself is mapped by a direct or indirect + * shadow page, since the huge page region itself is being directly + * mapped with smaller pages. + */ + role = kvm_mmu_child_role(huge_sptep, /*direct=*/true, access); + + /* Direct SPs do not require a shadowed_info_cache. */ + caches.page_header_cache = &kvm->arch.split_page_header_cache; + caches.shadow_page_cache = &kvm->arch.split_shadow_page_cache; + + /* Safe to pass NULL for vCPU since requesting a direct SP. */ + return __kvm_mmu_get_shadow_page(kvm, NULL, &caches, gfn, role); +} + +static void shadow_mmu_split_huge_page(struct kvm *kvm, + const struct kvm_memory_slot *slot, + u64 *huge_sptep) + +{ + struct kvm_mmu_memory_cache *cache = &kvm->arch.split_desc_cache; + u64 huge_spte = READ_ONCE(*huge_sptep); + struct kvm_mmu_page *sp; + bool flush = false; + u64 *sptep, spte; + gfn_t gfn; + int index; + + sp = shadow_mmu_get_sp_for_split(kvm, huge_sptep); + + for (index = 0; index < SPTE_ENT_PER_PAGE; index++) { + sptep = &sp->spt[index]; + gfn = kvm_mmu_page_get_gfn(sp, index); + + /* + * The SP may already have populated SPTEs, e.g. if this huge + * page is aliased by multiple sptes with the same access + * permissions. These entries are guaranteed to map the same + * gfn-to-pfn translation since the SP is direct, so no need to + * modify them. + * + * However, if a given SPTE points to a lower level page table, + * that lower level page table may only be partially populated. + * Installing such SPTEs would effectively unmap a potion of the + * huge page. Unmapping guest memory always requires a TLB flush + * since a subsequent operation on the unmapped regions would + * fail to detect the need to flush. + */ + if (is_shadow_present_pte(*sptep)) { + flush |= !is_last_spte(*sptep, sp->role.level); + continue; + } + + spte = make_small_spte(kvm, huge_spte, sp->role, index); + mmu_spte_set(sptep, spte); + __rmap_add(kvm, cache, slot, sptep, gfn, sp->role.access); + } + + __link_shadow_page(kvm, cache, huge_sptep, sp, flush); +} + +static int shadow_mmu_try_split_huge_page(struct kvm *kvm, + const struct kvm_memory_slot *slot, + u64 *huge_sptep) +{ + struct kvm_mmu_page *huge_sp = sptep_to_sp(huge_sptep); + int level, r = 0; + gfn_t gfn; + u64 spte; + + /* Grab information for the tracepoint before dropping the MMU lock. */ + gfn = kvm_mmu_page_get_gfn(huge_sp, spte_index(huge_sptep)); + level = huge_sp->role.level; + spte = *huge_sptep; + + if (kvm_mmu_available_pages(kvm) <= KVM_MIN_FREE_MMU_PAGES) { + r = -ENOSPC; + goto out; + } + + if (need_topup_split_caches_or_resched(kvm)) { + write_unlock(&kvm->mmu_lock); + cond_resched(); + /* + * If the topup succeeds, return -EAGAIN to indicate that the + * rmap iterator should be restarted because the MMU lock was + * dropped. + */ + r = topup_split_caches(kvm) ?: -EAGAIN; + write_lock(&kvm->mmu_lock); + goto out; + } + + shadow_mmu_split_huge_page(kvm, slot, huge_sptep); + +out: + trace_kvm_mmu_split_huge_page(gfn, spte, level, r); + return r; +} + +static bool shadow_mmu_try_split_huge_pages(struct kvm *kvm, + struct kvm_rmap_head *rmap_head, + const struct kvm_memory_slot *slot) +{ + struct rmap_iterator iter; + struct kvm_mmu_page *sp; + u64 *huge_sptep; + int r; + +restart: + for_each_rmap_spte(rmap_head, &iter, huge_sptep) { + sp = sptep_to_sp(huge_sptep); + + /* TDP MMU is enabled, so rmap only contains nested MMU SPs. */ + if (WARN_ON_ONCE(!sp->role.guest_mode)) + continue; + + /* The rmaps should never contain non-leaf SPTEs. */ + if (WARN_ON_ONCE(!is_large_pte(*huge_sptep))) + continue; + + /* SPs with level >PG_LEVEL_4K should never by unsync. */ + if (WARN_ON_ONCE(sp->unsync)) + continue; + + /* Don't bother splitting huge pages on invalid SPs. */ + if (sp->role.invalid) + continue; + + r = shadow_mmu_try_split_huge_page(kvm, slot, huge_sptep); + + /* + * The split succeeded or needs to be retried because the MMU + * lock was dropped. Either way, restart the iterator to get it + * back into a consistent state. + */ + if (!r || r == -EAGAIN) + goto restart; + + /* The split failed and shouldn't be retried (e.g. -ENOMEM). */ + break; + } + + return false; +} + +static void kvm_shadow_mmu_try_split_huge_pages(struct kvm *kvm, + const struct kvm_memory_slot *slot, + gfn_t start, gfn_t end, + int target_level) +{ + int level; + + /* + * Split huge pages starting with KVM_MAX_HUGEPAGE_LEVEL and working + * down to the target level. This ensures pages are recursively split + * all the way to the target level. There's no need to split pages + * already at the target level. + */ + for (level = KVM_MAX_HUGEPAGE_LEVEL; level > target_level; level--) + __walk_slot_rmaps(kvm, slot, shadow_mmu_try_split_huge_pages, + level, level, start, end - 1, true, true, false); } /* Must be called with the mmu_lock held in write-mode. */ @@ -5956,12 +7036,16 @@ void kvm_mmu_try_split_huge_pages(struct kvm *kvm, u64 start, u64 end, int target_level) { - if (is_tdp_mmu_enabled(kvm)) - kvm_tdp_mmu_try_split_huge_pages(kvm, memslot, start, end, - target_level, false); + if (!tdp_mmu_enabled) + return; + + if (kvm_memslots_have_rmaps(kvm)) + kvm_shadow_mmu_try_split_huge_pages(kvm, memslot, start, end, target_level); + + kvm_tdp_mmu_try_split_huge_pages(kvm, memslot, start, end, target_level, false); /* - * A TLB flush is unnecessary at this point for the same resons as in + * A TLB flush is unnecessary at this point for the same reasons as in * kvm_mmu_slot_try_split_huge_pages(). */ } @@ -5973,12 +7057,19 @@ void kvm_mmu_slot_try_split_huge_pages(struct kvm *kvm, u64 start = memslot->base_gfn; u64 end = start + memslot->npages; - if (is_tdp_mmu_enabled(kvm)) { - read_lock(&kvm->mmu_lock); - kvm_tdp_mmu_try_split_huge_pages(kvm, memslot, start, end, target_level, true); - read_unlock(&kvm->mmu_lock); + if (!tdp_mmu_enabled) + return; + + if (kvm_memslots_have_rmaps(kvm)) { + write_lock(&kvm->mmu_lock); + kvm_shadow_mmu_try_split_huge_pages(kvm, memslot, start, end, target_level); + write_unlock(&kvm->mmu_lock); } + read_lock(&kvm->mmu_lock); + kvm_tdp_mmu_try_split_huge_pages(kvm, memslot, start, end, target_level, true); + read_unlock(&kvm->mmu_lock); + /* * No TLB flush is necessary here. KVM will flush TLBs after * write-protecting and/or clearing dirty on the newly split SPTEs to @@ -5997,13 +7088,11 @@ static bool kvm_mmu_zap_collapsible_spte(struct kvm *kvm, u64 *sptep; struct rmap_iterator iter; int need_tlb_flush = 0; - kvm_pfn_t pfn; struct kvm_mmu_page *sp; restart: for_each_rmap_spte(rmap_head, &iter, sptep) { sp = sptep_to_sp(sptep); - pfn = spte_to_pfn(*sptep); /* * We cannot do huge page mapping for indirect shadow pages, @@ -6012,14 +7101,12 @@ restart: * the guest, and the guest page table is using 4K page size * mapping if the indirect sp has level = 1. */ - if (sp->role.direct && !kvm_is_reserved_pfn(pfn) && - sp->role.level < kvm_mmu_max_mapping_level(kvm, slot, sp->gfn, - pfn, PG_LEVEL_NUM)) { - pte_list_remove(kvm, rmap_head, sptep); - - if (kvm_available_flush_tlb_with_range()) - kvm_flush_remote_tlbs_with_address(kvm, sp->gfn, - KVM_PAGES_PER_HPAGE(sp->role.level)); + if (sp->role.direct && + sp->role.level < kvm_mmu_max_mapping_level(kvm, slot, sp->gfn)) { + kvm_zap_one_rmap_spte(kvm, rmap_head, sptep); + + if (kvm_available_flush_remote_tlbs_range()) + kvm_flush_remote_tlbs_sptep(kvm, sptep); else need_tlb_flush = 1; @@ -6029,76 +7116,66 @@ restart: return need_tlb_flush; } +EXPORT_SYMBOL_GPL(kvm_zap_gfn_range); -void kvm_mmu_zap_collapsible_sptes(struct kvm *kvm, - const struct kvm_memory_slot *slot) +static void kvm_rmap_zap_collapsible_sptes(struct kvm *kvm, + const struct kvm_memory_slot *slot) +{ + /* + * Note, use KVM_MAX_HUGEPAGE_LEVEL - 1 since there's no need to zap + * pages that are already mapped at the maximum hugepage level. + */ + if (walk_slot_rmaps(kvm, slot, kvm_mmu_zap_collapsible_spte, + PG_LEVEL_4K, KVM_MAX_HUGEPAGE_LEVEL - 1, true)) + kvm_flush_remote_tlbs_memslot(kvm, slot); +} + +void kvm_mmu_recover_huge_pages(struct kvm *kvm, + const struct kvm_memory_slot *slot) { if (kvm_memslots_have_rmaps(kvm)) { write_lock(&kvm->mmu_lock); - /* - * Zap only 4k SPTEs since the legacy MMU only supports dirty - * logging at a 4k granularity and never creates collapsible - * 2m SPTEs during dirty logging. - */ - if (slot_handle_level_4k(kvm, slot, kvm_mmu_zap_collapsible_spte, true)) - kvm_arch_flush_remote_tlbs_memslot(kvm, slot); + kvm_rmap_zap_collapsible_sptes(kvm, slot); write_unlock(&kvm->mmu_lock); } - if (is_tdp_mmu_enabled(kvm)) { + if (tdp_mmu_enabled) { read_lock(&kvm->mmu_lock); - kvm_tdp_mmu_zap_collapsible_sptes(kvm, slot); + kvm_tdp_mmu_recover_huge_pages(kvm, slot); read_unlock(&kvm->mmu_lock); } } -void kvm_arch_flush_remote_tlbs_memslot(struct kvm *kvm, - const struct kvm_memory_slot *memslot) -{ - /* - * All current use cases for flushing the TLBs for a specific memslot - * related to dirty logging, and many do the TLB flush out of mmu_lock. - * The interaction between the various operations on memslot must be - * serialized by slots_locks to ensure the TLB flush from one operation - * is observed by any other operation on the same memslot. - */ - lockdep_assert_held(&kvm->slots_lock); - kvm_flush_remote_tlbs_with_address(kvm, memslot->base_gfn, - memslot->npages); -} - void kvm_mmu_slot_leaf_clear_dirty(struct kvm *kvm, const struct kvm_memory_slot *memslot) { - bool flush = false; - if (kvm_memslots_have_rmaps(kvm)) { write_lock(&kvm->mmu_lock); /* * Clear dirty bits only on 4k SPTEs since the legacy MMU only * support dirty logging at a 4k granularity. */ - flush = slot_handle_level_4k(kvm, memslot, __rmap_clear_dirty, false); + walk_slot_rmaps_4k(kvm, memslot, __rmap_clear_dirty, false); write_unlock(&kvm->mmu_lock); } - if (is_tdp_mmu_enabled(kvm)) { + if (tdp_mmu_enabled) { read_lock(&kvm->mmu_lock); - flush |= kvm_tdp_mmu_clear_dirty_slot(kvm, memslot); + kvm_tdp_mmu_clear_dirty_slot(kvm, memslot); read_unlock(&kvm->mmu_lock); } /* + * The caller will flush the TLBs after this function returns. + * * It's also safe to flush TLBs out of mmu lock here as currently this * function is only used for dirty logging, in which case flushing TLB * out of mmu lock also guarantees no dirty pages will be lost in * dirty_bitmap. */ - if (flush) - kvm_arch_flush_remote_tlbs_memslot(kvm, memslot); } -void kvm_mmu_zap_all(struct kvm *kvm) +static void kvm_mmu_zap_all(struct kvm *kvm) { struct kvm_mmu_page *sp, *node; LIST_HEAD(invalid_list); @@ -6107,7 +7184,7 @@ void kvm_mmu_zap_all(struct kvm *kvm) write_lock(&kvm->mmu_lock); restart: list_for_each_entry_safe(sp, node, &kvm->arch.active_mmu_pages, link) { - if (WARN_ON(sp->role.invalid)) + if (WARN_ON_ONCE(sp->role.invalid)) continue; if (__kvm_mmu_prepare_zap_page(kvm, sp, &invalid_list, &ign)) goto restart; @@ -6117,15 +7194,87 @@ restart: kvm_mmu_commit_zap_page(kvm, &invalid_list); - if (is_tdp_mmu_enabled(kvm)) + if (tdp_mmu_enabled) kvm_tdp_mmu_zap_all(kvm); write_unlock(&kvm->mmu_lock); } +void kvm_arch_flush_shadow_all(struct kvm *kvm) +{ + kvm_mmu_zap_all(kvm); +} + +static void kvm_mmu_zap_memslot_pages_and_flush(struct kvm *kvm, + struct kvm_memory_slot *slot, + bool flush) +{ + LIST_HEAD(invalid_list); + unsigned long i; + + if (list_empty(&kvm->arch.active_mmu_pages)) + goto out_flush; + + /* + * Since accounting information is stored in struct kvm_arch_memory_slot, + * all MMU pages that are shadowing guest PTEs must be zapped before the + * memslot is deleted, as freeing such pages after the memslot is freed + * will result in use-after-free, e.g. in unaccount_shadowed(). + */ + for (i = 0; i < slot->npages; i++) { + struct kvm_mmu_page *sp; + gfn_t gfn = slot->base_gfn + i; + + for_each_gfn_valid_sp_with_gptes(kvm, sp, gfn) + kvm_mmu_prepare_zap_page(kvm, sp, &invalid_list); + + if (need_resched() || rwlock_needbreak(&kvm->mmu_lock)) { + kvm_mmu_remote_flush_or_zap(kvm, &invalid_list, flush); + flush = false; + cond_resched_rwlock_write(&kvm->mmu_lock); + } + } + +out_flush: + kvm_mmu_remote_flush_or_zap(kvm, &invalid_list, flush); +} + +static void kvm_mmu_zap_memslot(struct kvm *kvm, + struct kvm_memory_slot *slot) +{ + struct kvm_gfn_range range = { + .slot = slot, + .start = slot->base_gfn, + .end = slot->base_gfn + slot->npages, + .may_block = true, + .attr_filter = KVM_FILTER_PRIVATE | KVM_FILTER_SHARED, + }; + bool flush; + + write_lock(&kvm->mmu_lock); + flush = kvm_unmap_gfn_range(kvm, &range); + kvm_mmu_zap_memslot_pages_and_flush(kvm, slot, flush); + write_unlock(&kvm->mmu_lock); +} + +static inline bool kvm_memslot_flush_zap_all(struct kvm *kvm) +{ + return kvm->arch.vm_type == KVM_X86_DEFAULT_VM && + kvm_check_has_quirk(kvm, KVM_X86_QUIRK_SLOT_ZAP_ALL); +} + +void kvm_arch_flush_shadow_memslot(struct kvm *kvm, + struct kvm_memory_slot *slot) +{ + if (kvm_memslot_flush_zap_all(kvm)) + kvm_mmu_zap_all_fast(kvm); + else + kvm_mmu_zap_memslot(kvm, slot); +} + void kvm_mmu_invalidate_mmio_sptes(struct kvm *kvm, u64 gen) { - WARN_ON(gen & KVM_MEMSLOT_GEN_UPDATE_IN_PROGRESS); + WARN_ON_ONCE(gen & KVM_MEMSLOT_GEN_UPDATE_IN_PROGRESS); gen &= MMIO_SPTE_GEN_MASK; @@ -6136,93 +7285,43 @@ void kvm_mmu_invalidate_mmio_sptes(struct kvm *kvm, u64 gen) * modifier prior to checking for a wrap of the MMIO generation so * that a wrap in any address space is detected. */ - gen &= ~((u64)KVM_ADDRESS_SPACE_NUM - 1); + gen &= ~((u64)kvm_arch_nr_memslot_as_ids(kvm) - 1); /* * The very rare case: if the MMIO generation number has wrapped, * zap all shadow pages. */ if (unlikely(gen == 0)) { - kvm_debug_ratelimited("kvm: zapping shadow pages for mmio generation wraparound\n"); + kvm_debug_ratelimited("zapping shadow pages for mmio generation wraparound\n"); kvm_mmu_zap_all_fast(kvm); } } -static unsigned long -mmu_shrink_scan(struct shrinker *shrink, struct shrink_control *sc) +static void mmu_destroy_caches(void) { - struct kvm *kvm; - int nr_to_scan = sc->nr_to_scan; - unsigned long freed = 0; - - mutex_lock(&kvm_lock); - - list_for_each_entry(kvm, &vm_list, vm_list) { - int idx; - LIST_HEAD(invalid_list); - - /* - * Never scan more than sc->nr_to_scan VM instances. - * Will not hit this condition practically since we do not try - * to shrink more than one VM and it is very unlikely to see - * !n_used_mmu_pages so many times. - */ - if (!nr_to_scan--) - break; - /* - * n_used_mmu_pages is accessed without holding kvm->mmu_lock - * here. We may skip a VM instance errorneosly, but we do not - * want to shrink a VM that only started to populate its MMU - * anyway. - */ - if (!kvm->arch.n_used_mmu_pages && - !kvm_has_zapped_obsolete_pages(kvm)) - continue; - - idx = srcu_read_lock(&kvm->srcu); - write_lock(&kvm->mmu_lock); - - if (kvm_has_zapped_obsolete_pages(kvm)) { - kvm_mmu_commit_zap_page(kvm, - &kvm->arch.zapped_obsolete_pages); - goto unlock; - } - - freed = kvm_mmu_zap_oldest_mmu_pages(kvm, sc->nr_to_scan); - -unlock: - write_unlock(&kvm->mmu_lock); - srcu_read_unlock(&kvm->srcu, idx); - - /* - * unfair on small ones - * per-vm shrinkers cry out - * sadness comes quickly - */ - list_move_tail(&kvm->vm_list, &vm_list); - break; - } - - mutex_unlock(&kvm_lock); - return freed; + kmem_cache_destroy(pte_list_desc_cache); + kmem_cache_destroy(mmu_page_header_cache); } -static unsigned long -mmu_shrink_count(struct shrinker *shrink, struct shrink_control *sc) +static void kvm_wake_nx_recovery_thread(struct kvm *kvm) { - return percpu_counter_read_positive(&kvm_total_used_mmu_pages); -} + /* + * The NX recovery thread is spawned on-demand at the first KVM_RUN and + * may not be valid even though the VM is globally visible. Do nothing, + * as such a VM can't have any possible NX huge pages. + */ + struct vhost_task *nx_thread = READ_ONCE(kvm->arch.nx_huge_page_recovery_thread); -static struct shrinker mmu_shrinker = { - .count_objects = mmu_shrink_count, - .scan_objects = mmu_shrink_scan, - .seeks = DEFAULT_SEEKS * 10, -}; + if (nx_thread) + vhost_task_wake(nx_thread); +} -static void mmu_destroy_caches(void) +static int get_nx_huge_pages(char *buffer, const struct kernel_param *kp) { - kmem_cache_destroy(pte_list_desc_cache); - kmem_cache_destroy(mmu_page_header_cache); + if (nx_hugepage_mitigation_hard_disabled) + return sysfs_emit(buffer, "never\n"); + + return param_get_bool(buffer, kp); } static bool get_nx_auto_mode(void) @@ -6241,15 +7340,29 @@ static int set_nx_huge_pages(const char *val, const struct kernel_param *kp) bool old_val = nx_huge_pages; bool new_val; + if (nx_hugepage_mitigation_hard_disabled) + return -EPERM; + /* In "auto" mode deploy workaround only if CPU has the bug. */ - if (sysfs_streq(val, "off")) + if (sysfs_streq(val, "off")) { new_val = 0; - else if (sysfs_streq(val, "force")) + } else if (sysfs_streq(val, "force")) { new_val = 1; - else if (sysfs_streq(val, "auto")) + } else if (sysfs_streq(val, "auto")) { new_val = get_nx_auto_mode(); - else if (strtobool(val, &new_val) < 0) + } else if (sysfs_streq(val, "never")) { + new_val = 0; + + mutex_lock(&kvm_lock); + if (!list_empty(&vm_list)) { + mutex_unlock(&kvm_lock); + return -EBUSY; + } + nx_hugepage_mitigation_hard_disabled = true; + mutex_unlock(&kvm_lock); + } else if (kstrtobool(val, &new_val) < 0) { return -EINVAL; + } __set_nx_huge_pages(new_val); @@ -6263,7 +7376,7 @@ static int set_nx_huge_pages(const char *val, const struct kernel_param *kp) kvm_mmu_zap_all_fast(kvm); mutex_unlock(&kvm->slots_lock); - wake_up_process(kvm->arch.nx_lpage_recovery_thread); + kvm_wake_nx_recovery_thread(kvm); } mutex_unlock(&kvm_lock); } @@ -6274,11 +7387,22 @@ static int set_nx_huge_pages(const char *val, const struct kernel_param *kp) /* * nx_huge_pages needs to be resolved to true/false when kvm.ko is loaded, as * its default value of -1 is technically undefined behavior for a boolean. + * Forward the module init call to SPTE code so that it too can handle module + * params that need to be resolved/snapshot. */ -void kvm_mmu_x86_module_init(void) +void __init kvm_mmu_x86_module_init(void) { if (nx_huge_pages == -1) __set_nx_huge_pages(get_nx_auto_mode()); + + /* + * Snapshot userspace's desire to enable the TDP MMU. Whether or not the + * TDP MMU is actually enabled is determined in kvm_configure_mmu() + * when the vendor module is loaded. + */ + tdp_mmu_allowed = tdp_mmu_enabled; + + kvm_mmu_spte_module_init(); } /* @@ -6302,9 +7426,7 @@ int kvm_mmu_vendor_module_init(void) kvm_mmu_reset_all_pte_masks(); - pte_list_desc_cache = kmem_cache_create("pte_list_desc", - sizeof(struct pte_list_desc), - 0, SLAB_ACCOUNT, NULL); + pte_list_desc_cache = KMEM_CACHE(pte_list_desc, SLAB_ACCOUNT); if (!pte_list_desc_cache) goto out; @@ -6314,13 +7436,6 @@ int kvm_mmu_vendor_module_init(void) if (!mmu_page_header_cache) goto out; - if (percpu_counter_init(&kvm_total_used_mmu_pages, 0, GFP_KERNEL)) - goto out; - - ret = register_shrinker(&mmu_shrinker); - if (ret) - goto out; - return 0; out: @@ -6331,6 +7446,12 @@ out: void kvm_mmu_destroy(struct kvm_vcpu *vcpu) { kvm_mmu_unload(vcpu); + if (tdp_mmu_enabled) { + read_lock(&vcpu->kvm->mmu_lock); + mmu_free_root_page(vcpu->kvm, &vcpu->arch.mmu->mirror_root_hpa, + NULL); + read_unlock(&vcpu->kvm->mmu_lock); + } free_mmu_pages(&vcpu->arch.root_mmu); free_mmu_pages(&vcpu->arch.guest_mmu); mmu_free_memory_caches(vcpu); @@ -6339,8 +7460,6 @@ void kvm_mmu_destroy(struct kvm_vcpu *vcpu) void kvm_mmu_vendor_module_exit(void) { mmu_destroy_caches(); - percpu_counter_destroy(&kvm_total_used_mmu_pages); - unregister_shrinker(&mmu_shrinker); } /* @@ -6374,6 +7493,9 @@ static int set_nx_huge_pages_recovery_param(const char *val, const struct kernel uint old_period, new_period; int err; + if (nx_hugepage_mitigation_hard_disabled) + return -EPERM; + was_recovery_enabled = calc_nx_huge_pages_recovery_period(&old_period); err = param_set_uint(val, kp); @@ -6389,7 +7511,7 @@ static int set_nx_huge_pages_recovery_param(const char *val, const struct kernel mutex_lock(&kvm_lock); list_for_each_entry(kvm, &vm_list, vm_list) - wake_up_process(kvm->arch.nx_lpage_recovery_thread); + kvm_wake_nx_recovery_thread(kvm); mutex_unlock(&kvm_lock); } @@ -6397,9 +7519,10 @@ static int set_nx_huge_pages_recovery_param(const char *val, const struct kernel return err; } -static void kvm_recover_nx_lpages(struct kvm *kvm) +static void kvm_recover_nx_huge_pages(struct kvm *kvm) { unsigned long nx_lpage_splits = kvm->stat.nx_lpage_splits; + struct kvm_memory_slot *slot; int rcu_idx; struct kvm_mmu_page *sp; unsigned int ratio; @@ -6420,24 +7543,58 @@ static void kvm_recover_nx_lpages(struct kvm *kvm) ratio = READ_ONCE(nx_huge_pages_recovery_ratio); to_zap = ratio ? DIV_ROUND_UP(nx_lpage_splits, ratio) : 0; for ( ; to_zap; --to_zap) { - if (list_empty(&kvm->arch.lpage_disallowed_mmu_pages)) + if (list_empty(&kvm->arch.possible_nx_huge_pages)) break; /* * We use a separate list instead of just using active_mmu_pages - * because the number of lpage_disallowed pages is expected to - * be relatively small compared to the total. + * because the number of shadow pages that be replaced with an + * NX huge page is expected to be relatively small compared to + * the total number of shadow pages. And because the TDP MMU + * doesn't use active_mmu_pages. */ - sp = list_first_entry(&kvm->arch.lpage_disallowed_mmu_pages, + sp = list_first_entry(&kvm->arch.possible_nx_huge_pages, struct kvm_mmu_page, - lpage_disallowed_link); - WARN_ON_ONCE(!sp->lpage_disallowed); - if (is_tdp_mmu_page(sp)) { + possible_nx_huge_page_link); + WARN_ON_ONCE(!sp->nx_huge_page_disallowed); + WARN_ON_ONCE(!sp->role.direct); + + /* + * Unaccount and do not attempt to recover any NX Huge Pages + * that are being dirty tracked, as they would just be faulted + * back in as 4KiB pages. The NX Huge Pages in this slot will be + * recovered, along with all the other huge pages in the slot, + * when dirty logging is disabled. + * + * Since gfn_to_memslot() is relatively expensive, it helps to + * skip it if it the test cannot possibly return true. On the + * other hand, if any memslot has logging enabled, chances are + * good that all of them do, in which case unaccount_nx_huge_page() + * is much cheaper than zapping the page. + * + * If a memslot update is in progress, reading an incorrect value + * of kvm->nr_memslots_dirty_logging is not a problem: if it is + * becoming zero, gfn_to_memslot() will be done unnecessarily; if + * it is becoming nonzero, the page will be zapped unnecessarily. + * Either way, this only affects efficiency in racy situations, + * and not correctness. + */ + slot = NULL; + if (atomic_read(&kvm->nr_memslots_dirty_logging)) { + struct kvm_memslots *slots; + + slots = kvm_memslots_for_spte_role(kvm, sp->role); + slot = __gfn_to_memslot(slots, sp->gfn); + WARN_ON_ONCE(!slot); + } + + if (slot && kvm_slot_dirty_track_enabled(slot)) + unaccount_nx_huge_page(kvm, sp); + else if (is_tdp_mmu_page(sp)) flush |= kvm_tdp_mmu_zap_sp(kvm, sp); - } else { + else kvm_mmu_prepare_zap_page(kvm, sp, &invalid_list); - WARN_ON_ONCE(sp->lpage_disallowed); - } + WARN_ON_ONCE(sp->nx_huge_page_disallowed); if (need_resched() || rwlock_needbreak(&kvm->mmu_lock)) { kvm_mmu_remote_flush_or_zap(kvm, &invalid_list, flush); @@ -6457,57 +7614,270 @@ static void kvm_recover_nx_lpages(struct kvm *kvm) srcu_read_unlock(&kvm->srcu, rcu_idx); } -static long get_nx_lpage_recovery_timeout(u64 start_time) +static void kvm_nx_huge_page_recovery_worker_kill(void *data) { +} + +static bool kvm_nx_huge_page_recovery_worker(void *data) +{ + struct kvm *kvm = data; bool enabled; uint period; + long remaining_time; enabled = calc_nx_huge_pages_recovery_period(&period); + if (!enabled) + return false; - return enabled ? start_time + msecs_to_jiffies(period) - get_jiffies_64() - : MAX_SCHEDULE_TIMEOUT; + remaining_time = kvm->arch.nx_huge_page_last + msecs_to_jiffies(period) + - get_jiffies_64(); + if (remaining_time > 0) { + schedule_timeout(remaining_time); + /* check for signals and come back */ + return true; + } + + __set_current_state(TASK_RUNNING); + kvm_recover_nx_huge_pages(kvm); + kvm->arch.nx_huge_page_last = get_jiffies_64(); + return true; } -static int kvm_nx_lpage_recovery_worker(struct kvm *kvm, uintptr_t data) +static int kvm_mmu_start_lpage_recovery(struct once *once) { - u64 start_time; - long remaining_time; + struct kvm_arch *ka = container_of(once, struct kvm_arch, nx_once); + struct kvm *kvm = container_of(ka, struct kvm, arch); + struct vhost_task *nx_thread; - while (true) { - start_time = get_jiffies_64(); - remaining_time = get_nx_lpage_recovery_timeout(start_time); + kvm->arch.nx_huge_page_last = get_jiffies_64(); + nx_thread = vhost_task_create(kvm_nx_huge_page_recovery_worker, + kvm_nx_huge_page_recovery_worker_kill, + kvm, "kvm-nx-lpage-recovery"); - set_current_state(TASK_INTERRUPTIBLE); - while (!kthread_should_stop() && remaining_time > 0) { - schedule_timeout(remaining_time); - remaining_time = get_nx_lpage_recovery_timeout(start_time); - set_current_state(TASK_INTERRUPTIBLE); - } + if (IS_ERR(nx_thread)) + return PTR_ERR(nx_thread); - set_current_state(TASK_RUNNING); + vhost_task_start(nx_thread); - if (kthread_should_stop()) - return 0; + /* Make the task visible only once it is fully started. */ + WRITE_ONCE(kvm->arch.nx_huge_page_recovery_thread, nx_thread); + return 0; +} + +int kvm_mmu_post_init_vm(struct kvm *kvm) +{ + if (nx_hugepage_mitigation_hard_disabled) + return 0; + + return call_once(&kvm->arch.nx_once, kvm_mmu_start_lpage_recovery); +} - kvm_recover_nx_lpages(kvm); +void kvm_mmu_pre_destroy_vm(struct kvm *kvm) +{ + if (kvm->arch.nx_huge_page_recovery_thread) + vhost_task_stop(kvm->arch.nx_huge_page_recovery_thread); +} + +#ifdef CONFIG_KVM_GENERIC_MEMORY_ATTRIBUTES +static bool hugepage_test_mixed(struct kvm_memory_slot *slot, gfn_t gfn, + int level) +{ + return lpage_info_slot(gfn, slot, level)->disallow_lpage & KVM_LPAGE_MIXED_FLAG; +} + +static void hugepage_clear_mixed(struct kvm_memory_slot *slot, gfn_t gfn, + int level) +{ + lpage_info_slot(gfn, slot, level)->disallow_lpage &= ~KVM_LPAGE_MIXED_FLAG; +} + +static void hugepage_set_mixed(struct kvm_memory_slot *slot, gfn_t gfn, + int level) +{ + lpage_info_slot(gfn, slot, level)->disallow_lpage |= KVM_LPAGE_MIXED_FLAG; +} + +bool kvm_arch_pre_set_memory_attributes(struct kvm *kvm, + struct kvm_gfn_range *range) +{ + struct kvm_memory_slot *slot = range->slot; + int level; + + /* + * Zap SPTEs even if the slot can't be mapped PRIVATE. KVM x86 only + * supports KVM_MEMORY_ATTRIBUTE_PRIVATE, and so it *seems* like KVM + * can simply ignore such slots. But if userspace is making memory + * PRIVATE, then KVM must prevent the guest from accessing the memory + * as shared. And if userspace is making memory SHARED and this point + * is reached, then at least one page within the range was previously + * PRIVATE, i.e. the slot's possible hugepage ranges are changing. + * Zapping SPTEs in this case ensures KVM will reassess whether or not + * a hugepage can be used for affected ranges. + */ + if (WARN_ON_ONCE(!kvm_arch_has_private_mem(kvm))) + return false; + + if (WARN_ON_ONCE(range->end <= range->start)) + return false; + + /* + * If the head and tail pages of the range currently allow a hugepage, + * i.e. reside fully in the slot and don't have mixed attributes, then + * add each corresponding hugepage range to the ongoing invalidation, + * e.g. to prevent KVM from creating a hugepage in response to a fault + * for a gfn whose attributes aren't changing. Note, only the range + * of gfns whose attributes are being modified needs to be explicitly + * unmapped, as that will unmap any existing hugepages. + */ + for (level = PG_LEVEL_2M; level <= KVM_MAX_HUGEPAGE_LEVEL; level++) { + gfn_t start = gfn_round_for_level(range->start, level); + gfn_t end = gfn_round_for_level(range->end - 1, level); + gfn_t nr_pages = KVM_PAGES_PER_HPAGE(level); + + if ((start != range->start || start + nr_pages > range->end) && + start >= slot->base_gfn && + start + nr_pages <= slot->base_gfn + slot->npages && + !hugepage_test_mixed(slot, start, level)) + kvm_mmu_invalidate_range_add(kvm, start, start + nr_pages); + + if (end == start) + continue; + + if ((end + nr_pages) > range->end && + (end + nr_pages) <= (slot->base_gfn + slot->npages) && + !hugepage_test_mixed(slot, end, level)) + kvm_mmu_invalidate_range_add(kvm, end, end + nr_pages); } + + /* Unmap the old attribute page. */ + if (range->arg.attributes & KVM_MEMORY_ATTRIBUTE_PRIVATE) + range->attr_filter = KVM_FILTER_SHARED; + else + range->attr_filter = KVM_FILTER_PRIVATE; + + return kvm_unmap_gfn_range(kvm, range); } -int kvm_mmu_post_init_vm(struct kvm *kvm) + + +static bool hugepage_has_attrs(struct kvm *kvm, struct kvm_memory_slot *slot, + gfn_t gfn, int level, unsigned long attrs) { - int err; + const unsigned long start = gfn; + const unsigned long end = start + KVM_PAGES_PER_HPAGE(level); - err = kvm_vm_create_worker_thread(kvm, kvm_nx_lpage_recovery_worker, 0, - "kvm-nx-lpage-recovery", - &kvm->arch.nx_lpage_recovery_thread); - if (!err) - kthread_unpark(kvm->arch.nx_lpage_recovery_thread); + if (level == PG_LEVEL_2M) + return kvm_range_has_memory_attributes(kvm, start, end, ~0, attrs); - return err; + for (gfn = start; gfn < end; gfn += KVM_PAGES_PER_HPAGE(level - 1)) { + if (hugepage_test_mixed(slot, gfn, level - 1) || + attrs != kvm_get_memory_attributes(kvm, gfn)) + return false; + } + return true; } -void kvm_mmu_pre_destroy_vm(struct kvm *kvm) +bool kvm_arch_post_set_memory_attributes(struct kvm *kvm, + struct kvm_gfn_range *range) +{ + unsigned long attrs = range->arg.attributes; + struct kvm_memory_slot *slot = range->slot; + int level; + + lockdep_assert_held_write(&kvm->mmu_lock); + lockdep_assert_held(&kvm->slots_lock); + + /* + * Calculate which ranges can be mapped with hugepages even if the slot + * can't map memory PRIVATE. KVM mustn't create a SHARED hugepage over + * a range that has PRIVATE GFNs, and conversely converting a range to + * SHARED may now allow hugepages. + */ + if (WARN_ON_ONCE(!kvm_arch_has_private_mem(kvm))) + return false; + + /* + * The sequence matters here: upper levels consume the result of lower + * level's scanning. + */ + for (level = PG_LEVEL_2M; level <= KVM_MAX_HUGEPAGE_LEVEL; level++) { + gfn_t nr_pages = KVM_PAGES_PER_HPAGE(level); + gfn_t gfn = gfn_round_for_level(range->start, level); + + /* Process the head page if it straddles the range. */ + if (gfn != range->start || gfn + nr_pages > range->end) { + /* + * Skip mixed tracking if the aligned gfn isn't covered + * by the memslot, KVM can't use a hugepage due to the + * misaligned address regardless of memory attributes. + */ + if (gfn >= slot->base_gfn && + gfn + nr_pages <= slot->base_gfn + slot->npages) { + if (hugepage_has_attrs(kvm, slot, gfn, level, attrs)) + hugepage_clear_mixed(slot, gfn, level); + else + hugepage_set_mixed(slot, gfn, level); + } + gfn += nr_pages; + } + + /* + * Pages entirely covered by the range are guaranteed to have + * only the attributes which were just set. + */ + for ( ; gfn + nr_pages <= range->end; gfn += nr_pages) + hugepage_clear_mixed(slot, gfn, level); + + /* + * Process the last tail page if it straddles the range and is + * contained by the memslot. Like the head page, KVM can't + * create a hugepage if the slot size is misaligned. + */ + if (gfn < range->end && + (gfn + nr_pages) <= (slot->base_gfn + slot->npages)) { + if (hugepage_has_attrs(kvm, slot, gfn, level, attrs)) + hugepage_clear_mixed(slot, gfn, level); + else + hugepage_set_mixed(slot, gfn, level); + } + } + return false; +} + +void kvm_mmu_init_memslot_memory_attributes(struct kvm *kvm, + struct kvm_memory_slot *slot) { - if (kvm->arch.nx_lpage_recovery_thread) - kthread_stop(kvm->arch.nx_lpage_recovery_thread); + int level; + + if (!kvm_arch_has_private_mem(kvm)) + return; + + for (level = PG_LEVEL_2M; level <= KVM_MAX_HUGEPAGE_LEVEL; level++) { + /* + * Don't bother tracking mixed attributes for pages that can't + * be huge due to alignment, i.e. process only pages that are + * entirely contained by the memslot. + */ + gfn_t end = gfn_round_for_level(slot->base_gfn + slot->npages, level); + gfn_t start = gfn_round_for_level(slot->base_gfn, level); + gfn_t nr_pages = KVM_PAGES_PER_HPAGE(level); + gfn_t gfn; + + if (start < slot->base_gfn) + start += nr_pages; + + /* + * Unlike setting attributes, every potential hugepage needs to + * be manually checked as the attributes may already be mixed. + */ + for (gfn = start; gfn < end; gfn += nr_pages) { + unsigned long attrs = kvm_get_memory_attributes(kvm, gfn); + + if (hugepage_has_attrs(kvm, slot, gfn, level, attrs)) + hugepage_clear_mixed(slot, gfn, level); + else + hugepage_set_mixed(slot, gfn, level); + } + } } +#endif diff --git a/arch/x86/kvm/mmu/mmu_internal.h b/arch/x86/kvm/mmu/mmu_internal.h index bd2a26897b97..db8f33e4de62 100644 --- a/arch/x86/kvm/mmu/mmu_internal.h +++ b/arch/x86/kvm/mmu/mmu_internal.h @@ -6,20 +6,29 @@ #include <linux/kvm_host.h> #include <asm/kvm_host.h> -#undef MMU_DEBUG +#include "mmu.h" -#ifdef MMU_DEBUG -extern bool dbg; - -#define pgprintk(x...) do { if (dbg) printk(x); } while (0) -#define rmap_printk(fmt, args...) do { if (dbg) printk("%s: " fmt, __func__, ## args); } while (0) -#define MMU_WARN_ON(x) WARN_ON(x) +#ifdef CONFIG_KVM_PROVE_MMU +#define KVM_MMU_WARN_ON(x) WARN_ON_ONCE(x) #else -#define pgprintk(x...) do { } while (0) -#define rmap_printk(x...) do { } while (0) -#define MMU_WARN_ON(x) do { } while (0) +#define KVM_MMU_WARN_ON(x) BUILD_BUG_ON_INVALID(x) #endif +/* Page table builder macros common to shadow (host) PTEs and guest PTEs. */ +#define __PT_BASE_ADDR_MASK GENMASK_ULL(51, 12) +#define __PT_LEVEL_SHIFT(level, bits_per_level) \ + (PAGE_SHIFT + ((level) - 1) * (bits_per_level)) +#define __PT_INDEX(address, level, bits_per_level) \ + (((address) >> __PT_LEVEL_SHIFT(level, bits_per_level)) & ((1 << (bits_per_level)) - 1)) + +#define __PT_LVL_ADDR_MASK(base_addr_mask, level, bits_per_level) \ + ((base_addr_mask) & ~((1ULL << (PAGE_SHIFT + (((level) - 1) * (bits_per_level)))) - 1)) + +#define __PT_LVL_OFFSET_MASK(base_addr_mask, level, bits_per_level) \ + ((base_addr_mask) & ((1ULL << (PAGE_SHIFT + (((level) - 1) * (bits_per_level)))) - 1)) + +#define __PT_ENT_PER_PAGE(bits_per_level) (1 << (bits_per_level)) + /* * Unlike regular MMU roots, PAE "roots", a.k.a. PDPTEs/PDPTRs, have a PRESENT * bit, and thus are guaranteed to be non-zero when valid. And, when a guest @@ -30,6 +39,16 @@ extern bool dbg; #define INVALID_PAE_ROOT 0 #define IS_VALID_PAE_ROOT(x) (!!(x)) +static inline hpa_t kvm_mmu_get_dummy_root(void) +{ + return my_zero_pfn(0) << PAGE_SHIFT; +} + +static inline bool kvm_mmu_is_dummy_root(hpa_t shadow_page) +{ + return is_zero_pfn(shadow_page >> PAGE_SHIFT); +} + typedef u64 __rcu *tdp_ptep_t; struct kvm_mmu_page { @@ -42,8 +61,19 @@ struct kvm_mmu_page { bool tdp_mmu_page; bool unsync; - u8 mmu_valid_gen; - bool lpage_disallowed; /* Can't be replaced by an equiv large page */ + union { + u8 mmu_valid_gen; + + /* Only accessed under slots_lock. */ + bool tdp_mmu_scheduled_root_to_zap; + }; + + /* + * The shadow page can't be replaced by an equivalent huge page + * because it is being used to map an executable page in the guest + * and the NX huge page mitigation is enabled. + */ + bool nx_huge_page_disallowed; /* * The following two entries are used to key the shadow page in the @@ -53,27 +83,56 @@ struct kvm_mmu_page { gfn_t gfn; u64 *spt; - /* hold the gfn of each spte inside spt */ - gfn_t *gfns; + + /* + * Stores the result of the guest translation being shadowed by each + * SPTE. KVM shadows two types of guest translations: nGPA -> GPA + * (shadow EPT/NPT) and GVA -> GPA (traditional shadow paging). In both + * cases the result of the translation is a GPA and a set of access + * constraints. + * + * The GFN is stored in the upper bits (PAGE_SHIFT) and the shadowed + * access permissions are stored in the lower bits. Note, for + * convenience and uniformity across guests, the access permissions are + * stored in KVM format (e.g. ACC_EXEC_MASK) not the raw guest format. + */ + u64 *shadowed_translation; + /* Currently serving as active root */ union { int root_count; refcount_t tdp_mmu_root_count; }; - unsigned int unsync_children; union { - struct kvm_rmap_head parent_ptes; /* rmap pointers to parent sptes */ - tdp_ptep_t ptep; - }; - union { - DECLARE_BITMAP(unsync_child_bitmap, 512); + /* These two members aren't used for TDP MMU */ struct { - struct work_struct tdp_mmu_async_work; - void *tdp_mmu_async_data; + unsigned int unsync_children; + /* + * Number of writes since the last time traversal + * visited this page. + */ + atomic_t write_flooding_count; }; + /* + * Page table page of external PT. + * Passed to TDX module, not accessed by KVM. + */ + void *external_spt; }; + union { + struct kvm_rmap_head parent_ptes; /* rmap pointers to parent sptes */ + tdp_ptep_t ptep; + }; + DECLARE_BITMAP(unsync_child_bitmap, 512); - struct list_head lpage_disallowed_link; + /* + * Tracks shadow pages that, if zapped, would allow KVM to create an NX + * huge page. A shadow page will have nx_huge_page_disallowed set but + * not be on the list if a huge page is disallowed for other reasons, + * e.g. because KVM is shadowing a PTE at the same gfn, the memslot + * isn't properly aligned, etc... + */ + struct list_head possible_nx_huge_page_link; #ifdef CONFIG_X86_32 /* * Used out of the mmu-lock to avoid reading spte values while an @@ -82,9 +141,6 @@ struct kvm_mmu_page { int clear_spte_count; #endif - /* Number of writes since the last time traversal visited this page. */ - atomic_t write_flooding_count; - #ifdef CONFIG_X86_64 /* Used for freeing the page asynchronously if it is a TDP MMU page. */ struct rcu_head rcu_head; @@ -93,29 +149,46 @@ struct kvm_mmu_page { extern struct kmem_cache *mmu_page_header_cache; -static inline struct kvm_mmu_page *to_shadow_page(hpa_t shadow_page) +static inline int kvm_mmu_role_as_id(union kvm_mmu_page_role role) { - struct page *page = pfn_to_page(shadow_page >> PAGE_SHIFT); + return role.smm ? 1 : 0; +} - return (struct kvm_mmu_page *)page_private(page); +static inline int kvm_mmu_page_as_id(struct kvm_mmu_page *sp) +{ + return kvm_mmu_role_as_id(sp->role); } -static inline struct kvm_mmu_page *sptep_to_sp(u64 *sptep) +static inline bool is_mirror_sp(const struct kvm_mmu_page *sp) { - return to_shadow_page(__pa(sptep)); + return sp->role.is_mirror; } -static inline int kvm_mmu_role_as_id(union kvm_mmu_page_role role) +static inline void kvm_mmu_alloc_external_spt(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp) { - return role.smm ? 1 : 0; + /* + * external_spt is allocated for TDX module to hold private EPT mappings, + * TDX module will initialize the page by itself. + * Therefore, KVM does not need to initialize or access external_spt. + * KVM only interacts with sp->spt for private EPT operations. + */ + sp->external_spt = kvm_mmu_memory_cache_alloc(&vcpu->arch.mmu_external_spt_cache); } -static inline int kvm_mmu_page_as_id(struct kvm_mmu_page *sp) +static inline gfn_t kvm_gfn_root_bits(const struct kvm *kvm, const struct kvm_mmu_page *root) { - return kvm_mmu_role_as_id(sp->role); + /* + * Since mirror SPs are used only for TDX, which maps private memory + * at its "natural" GFN, no mask needs to be applied to them - and, dually, + * we expect that the bits is only used for the shared PT. + */ + if (is_mirror_sp(root)) + return 0; + return kvm_gfn_direct_bits(kvm); } -static inline bool kvm_mmu_page_ad_need_write_protect(struct kvm_mmu_page *sp) +static inline bool kvm_mmu_page_ad_need_write_protect(struct kvm *kvm, + struct kvm_mmu_page *sp) { /* * When using the EPT page-modification log, the GPAs in the CPU dirty @@ -125,31 +198,42 @@ static inline bool kvm_mmu_page_ad_need_write_protect(struct kvm_mmu_page *sp) * being enabled is mandatory as the bits used to denote WP-only SPTEs * are reserved for PAE paging (32-bit KVM). */ - return kvm_x86_ops.cpu_dirty_log_size && sp->role.guest_mode; + return kvm->arch.cpu_dirty_log_size && sp->role.guest_mode; +} + +static inline gfn_t gfn_round_for_level(gfn_t gfn, int level) +{ + return gfn & -KVM_PAGES_PER_HPAGE(level); } int mmu_try_to_unsync_pages(struct kvm *kvm, const struct kvm_memory_slot *slot, - gfn_t gfn, bool can_unsync, bool prefetch); + gfn_t gfn, bool synchronizing, bool prefetch); void kvm_mmu_gfn_disallow_lpage(const struct kvm_memory_slot *slot, gfn_t gfn); void kvm_mmu_gfn_allow_lpage(const struct kvm_memory_slot *slot, gfn_t gfn); bool kvm_mmu_slot_gfn_write_protect(struct kvm *kvm, struct kvm_memory_slot *slot, u64 gfn, int min_level); -void kvm_flush_remote_tlbs_with_address(struct kvm *kvm, - u64 start_gfn, u64 pages); + +/* Flush the given page (huge or not) of guest memory. */ +static inline void kvm_flush_remote_tlbs_gfn(struct kvm *kvm, gfn_t gfn, int level) +{ + kvm_flush_remote_tlbs_range(kvm, gfn_round_for_level(gfn, level), + KVM_PAGES_PER_HPAGE(level)); +} + unsigned int pte_list_count(struct kvm_rmap_head *rmap_head); extern int nx_huge_pages; -static inline bool is_nx_huge_page_enabled(void) +static inline bool is_nx_huge_page_enabled(struct kvm *kvm) { - return READ_ONCE(nx_huge_pages); + return READ_ONCE(nx_huge_pages) && !kvm->arch.disable_nx_huge_pages; } struct kvm_page_fault { /* arguments to kvm_mmu_do_page_fault. */ const gpa_t addr; - const u32 error_code; + const u64 error_code; const bool prefetch; /* Derived from error_code. */ @@ -161,6 +245,7 @@ struct kvm_page_fault { /* Derived from mmu and global state. */ const bool is_tdp; + const bool is_private; const bool nx_huge_page_workaround_enabled; /* @@ -171,7 +256,7 @@ struct kvm_page_fault { /* * Maximum page size that can be created for this fault; input to - * FNAME(fetch), __direct_map and kvm_tdp_mmu_map. + * FNAME(fetch), direct_map() and kvm_tdp_mmu_map(). */ u8 max_level; @@ -187,16 +272,29 @@ struct kvm_page_fault { */ u8 goal_level; - /* Shifted addr, or result of guest page table walk if addr is a gva. */ + /* + * Shifted addr, or result of guest page table walk if addr is a gva. In + * the case of VM where memslot's can be mapped at multiple GPA aliases + * (i.e. TDX), the gfn field does not contain the bit that selects between + * the aliases (i.e. the shared bit for TDX). + */ gfn_t gfn; /* The memslot containing gfn. May be NULL. */ struct kvm_memory_slot *slot; - /* Outputs of kvm_faultin_pfn. */ + /* Outputs of kvm_mmu_faultin_pfn(). */ + unsigned long mmu_seq; kvm_pfn_t pfn; - hva_t hva; + struct page *refcounted_page; bool map_writable; + + /* + * Indicates the guest is trying to write a gfn that contains one or + * more of the PTEs used to translate the write itself, i.e. the access + * is changing its own translation in the guest page tables. + */ + bool write_fault_to_shadow_pgtable; }; int kvm_tdp_page_fault(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault); @@ -208,6 +306,8 @@ int kvm_tdp_page_fault(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault); * RET_PF_CONTINUE: So far, so good, keep handling the page fault. * RET_PF_RETRY: let CPU fault again on the address. * RET_PF_EMULATE: mmio page fault, emulate the instruction directly. + * RET_PF_WRITE_PROTECTED: the gfn is write-protected, either unprotected the + * gfn and retry, or emulate the instruction directly. * RET_PF_INVALID: the spte is invalid, let the real page fault path update it. * RET_PF_FIXED: The faulting entry has been fixed. * RET_PF_SPURIOUS: The faulting entry was already fixed, e.g. by another vCPU. @@ -216,21 +316,37 @@ int kvm_tdp_page_fault(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault); * tracepoints via TRACE_DEFINE_ENUM() in mmutrace.h * * Note, all values must be greater than or equal to zero so as not to encroach - * on -errno return values. Somewhat arbitrarily use '0' for CONTINUE, which - * will allow for efficient machine code when checking for CONTINUE, e.g. - * "TEST %rax, %rax, JNZ", as all "stop!" values are non-zero. + * on -errno return values. */ enum { RET_PF_CONTINUE = 0, RET_PF_RETRY, RET_PF_EMULATE, + RET_PF_WRITE_PROTECTED, RET_PF_INVALID, RET_PF_FIXED, RET_PF_SPURIOUS, }; +/* + * Define RET_PF_CONTINUE as 0 to allow for + * - efficient machine code when checking for CONTINUE, e.g. + * "TEST %rax, %rax, JNZ", as all "stop!" values are non-zero, + * - kvm_mmu_do_page_fault() to return other RET_PF_* as a positive value. + */ +static_assert(RET_PF_CONTINUE == 0); + +static inline void kvm_mmu_prepare_memory_fault_exit(struct kvm_vcpu *vcpu, + struct kvm_page_fault *fault) +{ + kvm_prepare_memory_fault_exit(vcpu, fault->gfn << PAGE_SHIFT, + PAGE_SIZE, fault->write, fault->exec, + fault->is_private); +} + static inline int kvm_mmu_do_page_fault(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa, - u32 err, bool prefetch) + u64 err, bool prefetch, + int *emulation_type, u8 *level) { struct kvm_page_fault fault = { .addr = cr2_or_gpa, @@ -242,52 +358,62 @@ static inline int kvm_mmu_do_page_fault(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa, .user = err & PFERR_USER_MASK, .prefetch = prefetch, .is_tdp = likely(vcpu->arch.mmu->page_fault == kvm_tdp_page_fault), - .nx_huge_page_workaround_enabled = is_nx_huge_page_enabled(), + .nx_huge_page_workaround_enabled = + is_nx_huge_page_enabled(vcpu->kvm), .max_level = KVM_MAX_HUGEPAGE_LEVEL, .req_level = PG_LEVEL_4K, .goal_level = PG_LEVEL_4K, + .is_private = err & PFERR_PRIVATE_ACCESS, + + .pfn = KVM_PFN_ERR_FAULT, }; int r; + if (vcpu->arch.mmu->root_role.direct) { + /* + * Things like memslots don't understand the concept of a shared + * bit. Strip it so that the GFN can be used like normal, and the + * fault.addr can be used when the shared bit is needed. + */ + fault.gfn = gpa_to_gfn(fault.addr) & ~kvm_gfn_direct_bits(vcpu->kvm); + fault.slot = kvm_vcpu_gfn_to_memslot(vcpu, fault.gfn); + } + /* - * Async #PF "faults", a.k.a. prefetch faults, are not faults from the - * guest perspective and have already been counted at the time of the - * original fault. + * With retpoline being active an indirect call is rather expensive, + * so do a direct call in the most common case. */ - if (!prefetch) - vcpu->stat.pf_taken++; - - if (IS_ENABLED(CONFIG_RETPOLINE) && fault.is_tdp) + if (IS_ENABLED(CONFIG_MITIGATION_RETPOLINE) && fault.is_tdp) r = kvm_tdp_page_fault(vcpu, &fault); else r = vcpu->arch.mmu->page_fault(vcpu, &fault); /* - * Similar to above, prefetch faults aren't truly spurious, and the - * async #PF path doesn't do emulation. Do count faults that are fixed - * by the async #PF handler though, otherwise they'll never be counted. + * Not sure what's happening, but punt to userspace and hope that + * they can fix it by changing memory to shared, or they can + * provide a better error. */ - if (r == RET_PF_FIXED) - vcpu->stat.pf_fixed++; - else if (prefetch) - ; - else if (r == RET_PF_EMULATE) - vcpu->stat.pf_emulate++; - else if (r == RET_PF_SPURIOUS) - vcpu->stat.pf_spurious++; + if (r == RET_PF_EMULATE && fault.is_private) { + pr_warn_ratelimited("kvm: unexpected emulation request on private memory\n"); + kvm_mmu_prepare_memory_fault_exit(vcpu, &fault); + return -EFAULT; + } + + if (fault.write_fault_to_shadow_pgtable && emulation_type) + *emulation_type |= EMULTYPE_WRITE_PF_TO_SP; + if (level) + *level = fault.goal_level; + return r; } int kvm_mmu_max_mapping_level(struct kvm *kvm, - const struct kvm_memory_slot *slot, gfn_t gfn, - kvm_pfn_t pfn, int max_level); + const struct kvm_memory_slot *slot, gfn_t gfn); void kvm_mmu_hugepage_adjust(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault); void disallowed_hugepage_adjust(struct kvm_page_fault *fault, u64 spte, int cur_level); -void *mmu_memory_cache_alloc(struct kvm_mmu_memory_cache *mc); - -void account_huge_nx_page(struct kvm *kvm, struct kvm_mmu_page *sp); -void unaccount_huge_nx_page(struct kvm *kvm, struct kvm_mmu_page *sp); +void track_possible_nx_huge_page(struct kvm *kvm, struct kvm_mmu_page *sp); +void untrack_possible_nx_huge_page(struct kvm *kvm, struct kvm_mmu_page *sp); #endif /* __KVM_X86_MMU_INTERNAL_H */ diff --git a/arch/x86/kvm/mmu/mmutrace.h b/arch/x86/kvm/mmu/mmutrace.h index ae86820cef69..f35a830ce469 100644 --- a/arch/x86/kvm/mmu/mmutrace.h +++ b/arch/x86/kvm/mmu/mmutrace.h @@ -57,6 +57,7 @@ TRACE_DEFINE_ENUM(RET_PF_CONTINUE); TRACE_DEFINE_ENUM(RET_PF_RETRY); TRACE_DEFINE_ENUM(RET_PF_EMULATE); +TRACE_DEFINE_ENUM(RET_PF_WRITE_PROTECTED); TRACE_DEFINE_ENUM(RET_PF_INVALID); TRACE_DEFINE_ENUM(RET_PF_FIXED); TRACE_DEFINE_ENUM(RET_PF_SPURIOUS); @@ -260,7 +261,7 @@ TRACE_EVENT( TP_STRUCT__entry( __field(int, vcpu_id) __field(gpa_t, cr2_or_gpa) - __field(u32, error_code) + __field(u64, error_code) __field(u64 *, sptep) __field(u64, old_spte) __field(u64, new_spte) diff --git a/arch/x86/kvm/mmu/page_track.c b/arch/x86/kvm/mmu/page_track.c index 2e09d1b6249f..1b17b12393a8 100644 --- a/arch/x86/kvm/mmu/page_track.c +++ b/arch/x86/kvm/mmu/page_track.c @@ -10,120 +10,95 @@ * Author: * Xiao Guangrong <guangrong.xiao@linux.intel.com> */ +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt +#include <linux/lockdep.h> #include <linux/kvm_host.h> #include <linux/rculist.h> -#include <asm/kvm_page_track.h> - #include "mmu.h" #include "mmu_internal.h" +#include "page_track.h" + +static bool kvm_external_write_tracking_enabled(struct kvm *kvm) +{ +#ifdef CONFIG_KVM_EXTERNAL_WRITE_TRACKING + /* + * Read external_write_tracking_enabled before related pointers. Pairs + * with the smp_store_release in kvm_page_track_write_tracking_enable(). + */ + return smp_load_acquire(&kvm->arch.external_write_tracking_enabled); +#else + return false; +#endif +} bool kvm_page_track_write_tracking_enabled(struct kvm *kvm) { - return IS_ENABLED(CONFIG_KVM_EXTERNAL_WRITE_TRACKING) || - !tdp_enabled || kvm_shadow_root_allocated(kvm); + return kvm_external_write_tracking_enabled(kvm) || + kvm_shadow_root_allocated(kvm) || !tdp_enabled; } void kvm_page_track_free_memslot(struct kvm_memory_slot *slot) { - int i; - - for (i = 0; i < KVM_PAGE_TRACK_MAX; i++) { - kvfree(slot->arch.gfn_track[i]); - slot->arch.gfn_track[i] = NULL; - } + vfree(slot->arch.gfn_write_track); + slot->arch.gfn_write_track = NULL; } -int kvm_page_track_create_memslot(struct kvm *kvm, - struct kvm_memory_slot *slot, - unsigned long npages) +static int __kvm_page_track_write_tracking_alloc(struct kvm_memory_slot *slot, + unsigned long npages) { - int i; - - for (i = 0; i < KVM_PAGE_TRACK_MAX; i++) { - if (i == KVM_PAGE_TRACK_WRITE && - !kvm_page_track_write_tracking_enabled(kvm)) - continue; - - slot->arch.gfn_track[i] = - __vcalloc(npages, sizeof(*slot->arch.gfn_track[i]), - GFP_KERNEL_ACCOUNT); - if (!slot->arch.gfn_track[i]) - goto track_free; - } + const size_t size = sizeof(*slot->arch.gfn_write_track); - return 0; + if (!slot->arch.gfn_write_track) + slot->arch.gfn_write_track = __vcalloc(npages, size, + GFP_KERNEL_ACCOUNT); -track_free: - kvm_page_track_free_memslot(slot); - return -ENOMEM; + return slot->arch.gfn_write_track ? 0 : -ENOMEM; } -static inline bool page_track_mode_is_valid(enum kvm_page_track_mode mode) +int kvm_page_track_create_memslot(struct kvm *kvm, + struct kvm_memory_slot *slot, + unsigned long npages) { - if (mode < 0 || mode >= KVM_PAGE_TRACK_MAX) - return false; + if (!kvm_page_track_write_tracking_enabled(kvm)) + return 0; - return true; + return __kvm_page_track_write_tracking_alloc(slot, npages); } int kvm_page_track_write_tracking_alloc(struct kvm_memory_slot *slot) { - unsigned short *gfn_track; - - if (slot->arch.gfn_track[KVM_PAGE_TRACK_WRITE]) - return 0; - - gfn_track = __vcalloc(slot->npages, sizeof(*gfn_track), - GFP_KERNEL_ACCOUNT); - if (gfn_track == NULL) - return -ENOMEM; - - slot->arch.gfn_track[KVM_PAGE_TRACK_WRITE] = gfn_track; - return 0; + return __kvm_page_track_write_tracking_alloc(slot, slot->npages); } -static void update_gfn_track(struct kvm_memory_slot *slot, gfn_t gfn, - enum kvm_page_track_mode mode, short count) +static void update_gfn_write_track(struct kvm_memory_slot *slot, gfn_t gfn, + short count) { int index, val; index = gfn_to_index(gfn, slot->base_gfn, PG_LEVEL_4K); - val = slot->arch.gfn_track[mode][index]; + val = slot->arch.gfn_write_track[index]; - if (WARN_ON(val + count < 0 || val + count > USHRT_MAX)) + if (WARN_ON_ONCE(val + count < 0 || val + count > USHRT_MAX)) return; - slot->arch.gfn_track[mode][index] += count; + slot->arch.gfn_write_track[index] += count; } -/* - * add guest page to the tracking pool so that corresponding access on that - * page will be intercepted. - * - * It should be called under the protection both of mmu-lock and kvm->srcu - * or kvm->slots_lock. - * - * @kvm: the guest instance we are interested in. - * @slot: the @gfn belongs to. - * @gfn: the guest page. - * @mode: tracking mode, currently only write track is supported. - */ -void kvm_slot_page_track_add_page(struct kvm *kvm, - struct kvm_memory_slot *slot, gfn_t gfn, - enum kvm_page_track_mode mode) +void __kvm_write_track_add_gfn(struct kvm *kvm, struct kvm_memory_slot *slot, + gfn_t gfn) { + lockdep_assert_held_write(&kvm->mmu_lock); - if (WARN_ON(!page_track_mode_is_valid(mode))) - return; + lockdep_assert_once(lockdep_is_held(&kvm->slots_lock) || + srcu_read_lock_held(&kvm->srcu)); - if (WARN_ON(mode == KVM_PAGE_TRACK_WRITE && - !kvm_page_track_write_tracking_enabled(kvm))) + if (KVM_BUG_ON(!kvm_page_track_write_tracking_enabled(kvm), kvm)) return; - update_gfn_track(slot, gfn, mode, 1); + update_gfn_write_track(slot, gfn, 1); /* * new track stops large page mapping for the @@ -131,37 +106,22 @@ void kvm_slot_page_track_add_page(struct kvm *kvm, */ kvm_mmu_gfn_disallow_lpage(slot, gfn); - if (mode == KVM_PAGE_TRACK_WRITE) - if (kvm_mmu_slot_gfn_write_protect(kvm, slot, gfn, PG_LEVEL_4K)) - kvm_flush_remote_tlbs(kvm); + if (kvm_mmu_slot_gfn_write_protect(kvm, slot, gfn, PG_LEVEL_4K)) + kvm_flush_remote_tlbs(kvm); } -EXPORT_SYMBOL_GPL(kvm_slot_page_track_add_page); -/* - * remove the guest page from the tracking pool which stops the interception - * of corresponding access on that page. It is the opposed operation of - * kvm_slot_page_track_add_page(). - * - * It should be called under the protection both of mmu-lock and kvm->srcu - * or kvm->slots_lock. - * - * @kvm: the guest instance we are interested in. - * @slot: the @gfn belongs to. - * @gfn: the guest page. - * @mode: tracking mode, currently only write track is supported. - */ -void kvm_slot_page_track_remove_page(struct kvm *kvm, - struct kvm_memory_slot *slot, gfn_t gfn, - enum kvm_page_track_mode mode) +void __kvm_write_track_remove_gfn(struct kvm *kvm, + struct kvm_memory_slot *slot, gfn_t gfn) { - if (WARN_ON(!page_track_mode_is_valid(mode))) - return; + lockdep_assert_held_write(&kvm->mmu_lock); + + lockdep_assert_once(lockdep_is_held(&kvm->slots_lock) || + srcu_read_lock_held(&kvm->srcu)); - if (WARN_ON(mode == KVM_PAGE_TRACK_WRITE && - !kvm_page_track_write_tracking_enabled(kvm))) + if (KVM_BUG_ON(!kvm_page_track_write_tracking_enabled(kvm), kvm)) return; - update_gfn_track(slot, gfn, mode, -1); + update_gfn_write_track(slot, gfn, -1); /* * allow large page mapping for the tracked page @@ -169,31 +129,26 @@ void kvm_slot_page_track_remove_page(struct kvm *kvm, */ kvm_mmu_gfn_allow_lpage(slot, gfn); } -EXPORT_SYMBOL_GPL(kvm_slot_page_track_remove_page); /* * check if the corresponding access on the specified guest page is tracked. */ -bool kvm_slot_page_track_is_active(struct kvm *kvm, - const struct kvm_memory_slot *slot, - gfn_t gfn, enum kvm_page_track_mode mode) +bool kvm_gfn_is_write_tracked(struct kvm *kvm, + const struct kvm_memory_slot *slot, gfn_t gfn) { int index; - if (WARN_ON(!page_track_mode_is_valid(mode))) - return false; - if (!slot) return false; - if (mode == KVM_PAGE_TRACK_WRITE && - !kvm_page_track_write_tracking_enabled(kvm)) + if (!kvm_page_track_write_tracking_enabled(kvm)) return false; index = gfn_to_index(gfn, slot->base_gfn, PG_LEVEL_4K); - return !!READ_ONCE(slot->arch.gfn_track[mode][index]); + return !!READ_ONCE(slot->arch.gfn_write_track[index]); } +#ifdef CONFIG_KVM_EXTERNAL_WRITE_TRACKING void kvm_page_track_cleanup(struct kvm *kvm) { struct kvm_page_track_notifier_head *head; @@ -211,21 +166,80 @@ int kvm_page_track_init(struct kvm *kvm) return init_srcu_struct(&head->track_srcu); } +static int kvm_enable_external_write_tracking(struct kvm *kvm) +{ + struct kvm_memslots *slots; + struct kvm_memory_slot *slot; + int r = 0, i, bkt; + + if (kvm->arch.vm_type == KVM_X86_TDX_VM) + return -EOPNOTSUPP; + + mutex_lock(&kvm->slots_arch_lock); + + /* + * Check for *any* write tracking user (not just external users) under + * lock. This avoids unnecessary work, e.g. if KVM itself is using + * write tracking, or if two external users raced when registering. + */ + if (kvm_page_track_write_tracking_enabled(kvm)) + goto out_success; + + for (i = 0; i < kvm_arch_nr_memslot_as_ids(kvm); i++) { + slots = __kvm_memslots(kvm, i); + kvm_for_each_memslot(slot, bkt, slots) { + /* + * Intentionally do NOT free allocations on failure to + * avoid having to track which allocations were made + * now versus when the memslot was created. The + * metadata is guaranteed to be freed when the slot is + * freed, and will be kept/used if userspace retries + * the failed ioctl() instead of killing the VM. + */ + r = kvm_page_track_write_tracking_alloc(slot); + if (r) + goto out_unlock; + } + } + +out_success: + /* + * Ensure that external_write_tracking_enabled becomes true strictly + * after all the related pointers are set. + */ + smp_store_release(&kvm->arch.external_write_tracking_enabled, true); +out_unlock: + mutex_unlock(&kvm->slots_arch_lock); + return r; +} + /* * register the notifier so that event interception for the tracked guest * pages can be received. */ -void -kvm_page_track_register_notifier(struct kvm *kvm, - struct kvm_page_track_notifier_node *n) +int kvm_page_track_register_notifier(struct kvm *kvm, + struct kvm_page_track_notifier_node *n) { struct kvm_page_track_notifier_head *head; + int r; + + if (!kvm || kvm->mm != current->mm) + return -ESRCH; + + if (!kvm_external_write_tracking_enabled(kvm)) { + r = kvm_enable_external_write_tracking(kvm); + if (r) + return r; + } + + kvm_get_kvm(kvm); head = &kvm->arch.track_notifier_head; write_lock(&kvm->mmu_lock); hlist_add_head_rcu(&n->node, &head->track_notifier_list); write_unlock(&kvm->mmu_lock); + return 0; } EXPORT_SYMBOL_GPL(kvm_page_track_register_notifier); @@ -233,9 +247,8 @@ EXPORT_SYMBOL_GPL(kvm_page_track_register_notifier); * stop receiving the event interception. It is the opposed operation of * kvm_page_track_register_notifier(). */ -void -kvm_page_track_unregister_notifier(struct kvm *kvm, - struct kvm_page_track_notifier_node *n) +void kvm_page_track_unregister_notifier(struct kvm *kvm, + struct kvm_page_track_notifier_node *n) { struct kvm_page_track_notifier_head *head; @@ -245,6 +258,8 @@ kvm_page_track_unregister_notifier(struct kvm *kvm, hlist_del_rcu(&n->node); write_unlock(&kvm->mmu_lock); synchronize_srcu(&head->track_srcu); + + kvm_put_kvm(kvm); } EXPORT_SYMBOL_GPL(kvm_page_track_unregister_notifier); @@ -255,34 +270,30 @@ EXPORT_SYMBOL_GPL(kvm_page_track_unregister_notifier); * The node should figure out if the written page is the one that node is * interested in by itself. */ -void kvm_page_track_write(struct kvm_vcpu *vcpu, gpa_t gpa, const u8 *new, - int bytes) +void __kvm_page_track_write(struct kvm *kvm, gpa_t gpa, const u8 *new, int bytes) { struct kvm_page_track_notifier_head *head; struct kvm_page_track_notifier_node *n; int idx; - head = &vcpu->kvm->arch.track_notifier_head; + head = &kvm->arch.track_notifier_head; if (hlist_empty(&head->track_notifier_list)) return; idx = srcu_read_lock(&head->track_srcu); hlist_for_each_entry_srcu(n, &head->track_notifier_list, node, - srcu_read_lock_held(&head->track_srcu)) + srcu_read_lock_held(&head->track_srcu)) if (n->track_write) - n->track_write(vcpu, gpa, new, bytes, n); + n->track_write(gpa, new, bytes, n); srcu_read_unlock(&head->track_srcu, idx); } /* - * Notify the node that memory slot is being removed or moved so that it can - * drop write-protection for the pages in the memory slot. - * - * The node should figure out it has any write-protected pages in this slot - * by itself. + * Notify external page track nodes that a memory region is being removed from + * the VM, e.g. so that users can free any associated metadata. */ -void kvm_page_track_flush_slot(struct kvm *kvm, struct kvm_memory_slot *slot) +void kvm_page_track_delete_slot(struct kvm *kvm, struct kvm_memory_slot *slot) { struct kvm_page_track_notifier_head *head; struct kvm_page_track_notifier_node *n; @@ -295,8 +306,69 @@ void kvm_page_track_flush_slot(struct kvm *kvm, struct kvm_memory_slot *slot) idx = srcu_read_lock(&head->track_srcu); hlist_for_each_entry_srcu(n, &head->track_notifier_list, node, - srcu_read_lock_held(&head->track_srcu)) - if (n->track_flush_slot) - n->track_flush_slot(kvm, slot, n); + srcu_read_lock_held(&head->track_srcu)) + if (n->track_remove_region) + n->track_remove_region(slot->base_gfn, slot->npages, n); srcu_read_unlock(&head->track_srcu, idx); } + +/* + * add guest page to the tracking pool so that corresponding access on that + * page will be intercepted. + * + * @kvm: the guest instance we are interested in. + * @gfn: the guest page. + */ +int kvm_write_track_add_gfn(struct kvm *kvm, gfn_t gfn) +{ + struct kvm_memory_slot *slot; + int idx; + + idx = srcu_read_lock(&kvm->srcu); + + slot = gfn_to_memslot(kvm, gfn); + if (!slot) { + srcu_read_unlock(&kvm->srcu, idx); + return -EINVAL; + } + + write_lock(&kvm->mmu_lock); + __kvm_write_track_add_gfn(kvm, slot, gfn); + write_unlock(&kvm->mmu_lock); + + srcu_read_unlock(&kvm->srcu, idx); + + return 0; +} +EXPORT_SYMBOL_GPL(kvm_write_track_add_gfn); + +/* + * remove the guest page from the tracking pool which stops the interception + * of corresponding access on that page. + * + * @kvm: the guest instance we are interested in. + * @gfn: the guest page. + */ +int kvm_write_track_remove_gfn(struct kvm *kvm, gfn_t gfn) +{ + struct kvm_memory_slot *slot; + int idx; + + idx = srcu_read_lock(&kvm->srcu); + + slot = gfn_to_memslot(kvm, gfn); + if (!slot) { + srcu_read_unlock(&kvm->srcu, idx); + return -EINVAL; + } + + write_lock(&kvm->mmu_lock); + __kvm_write_track_remove_gfn(kvm, slot, gfn); + write_unlock(&kvm->mmu_lock); + + srcu_read_unlock(&kvm->srcu, idx); + + return 0; +} +EXPORT_SYMBOL_GPL(kvm_write_track_remove_gfn); +#endif diff --git a/arch/x86/kvm/mmu/page_track.h b/arch/x86/kvm/mmu/page_track.h new file mode 100644 index 000000000000..d4d72ed999b1 --- /dev/null +++ b/arch/x86/kvm/mmu/page_track.h @@ -0,0 +1,58 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef __KVM_X86_PAGE_TRACK_H +#define __KVM_X86_PAGE_TRACK_H + +#include <linux/kvm_host.h> + +#include <asm/kvm_page_track.h> + + +bool kvm_page_track_write_tracking_enabled(struct kvm *kvm); +int kvm_page_track_write_tracking_alloc(struct kvm_memory_slot *slot); + +void kvm_page_track_free_memslot(struct kvm_memory_slot *slot); +int kvm_page_track_create_memslot(struct kvm *kvm, + struct kvm_memory_slot *slot, + unsigned long npages); + +void __kvm_write_track_add_gfn(struct kvm *kvm, struct kvm_memory_slot *slot, + gfn_t gfn); +void __kvm_write_track_remove_gfn(struct kvm *kvm, + struct kvm_memory_slot *slot, gfn_t gfn); + +bool kvm_gfn_is_write_tracked(struct kvm *kvm, + const struct kvm_memory_slot *slot, gfn_t gfn); + +#ifdef CONFIG_KVM_EXTERNAL_WRITE_TRACKING +int kvm_page_track_init(struct kvm *kvm); +void kvm_page_track_cleanup(struct kvm *kvm); + +void __kvm_page_track_write(struct kvm *kvm, gpa_t gpa, const u8 *new, int bytes); +void kvm_page_track_delete_slot(struct kvm *kvm, struct kvm_memory_slot *slot); + +static inline bool kvm_page_track_has_external_user(struct kvm *kvm) +{ + return !hlist_empty(&kvm->arch.track_notifier_head.track_notifier_list); +} +#else +static inline int kvm_page_track_init(struct kvm *kvm) { return 0; } +static inline void kvm_page_track_cleanup(struct kvm *kvm) { } + +static inline void __kvm_page_track_write(struct kvm *kvm, gpa_t gpa, + const u8 *new, int bytes) { } +static inline void kvm_page_track_delete_slot(struct kvm *kvm, + struct kvm_memory_slot *slot) { } + +static inline bool kvm_page_track_has_external_user(struct kvm *kvm) { return false; } + +#endif /* CONFIG_KVM_EXTERNAL_WRITE_TRACKING */ + +static inline void kvm_page_track_write(struct kvm_vcpu *vcpu, gpa_t gpa, + const u8 *new, int bytes) +{ + __kvm_page_track_write(vcpu->kvm, gpa, new, bytes); + + kvm_mmu_track_write(vcpu, gpa, new, bytes); +} + +#endif /* __KVM_X86_PAGE_TRACK_H */ diff --git a/arch/x86/kvm/mmu/paging.h b/arch/x86/kvm/mmu/paging.h deleted file mode 100644 index de8ab323bb70..000000000000 --- a/arch/x86/kvm/mmu/paging.h +++ /dev/null @@ -1,14 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0-only */ -/* Shadow paging constants/helpers that don't need to be #undef'd. */ -#ifndef __KVM_X86_PAGING_H -#define __KVM_X86_PAGING_H - -#define GUEST_PT64_BASE_ADDR_MASK (((1ULL << 52) - 1) & ~(u64)(PAGE_SIZE-1)) -#define PT64_LVL_ADDR_MASK(level) \ - (GUEST_PT64_BASE_ADDR_MASK & ~((1ULL << (PAGE_SHIFT + (((level) - 1) \ - * PT64_LEVEL_BITS))) - 1)) -#define PT64_LVL_OFFSET_MASK(level) \ - (GUEST_PT64_BASE_ADDR_MASK & ((1ULL << (PAGE_SHIFT + (((level) - 1) \ - * PT64_LEVEL_BITS))) - 1)) -#endif /* __KVM_X86_PAGING_H */ - diff --git a/arch/x86/kvm/mmu/paging_tmpl.h b/arch/x86/kvm/mmu/paging_tmpl.h index db80f7ccaa4e..68e323568e95 100644 --- a/arch/x86/kvm/mmu/paging_tmpl.h +++ b/arch/x86/kvm/mmu/paging_tmpl.h @@ -16,25 +16,21 @@ */ /* - * We need the mmu code to access both 32-bit and 64-bit guest ptes, - * so the code in this file is compiled twice, once per pte size. + * The MMU needs to be able to access/walk 32-bit and 64-bit guest page tables, + * as well as guest EPT tables, so the code in this file is compiled thrice, + * once per guest PTE type. The per-type defines are #undef'd at the end. */ #if PTTYPE == 64 #define pt_element_t u64 #define guest_walker guest_walker64 #define FNAME(name) paging##64_##name - #define PT_BASE_ADDR_MASK GUEST_PT64_BASE_ADDR_MASK - #define PT_LVL_ADDR_MASK(lvl) PT64_LVL_ADDR_MASK(lvl) - #define PT_LVL_OFFSET_MASK(lvl) PT64_LVL_OFFSET_MASK(lvl) - #define PT_INDEX(addr, level) PT64_INDEX(addr, level) - #define PT_LEVEL_BITS PT64_LEVEL_BITS + #define PT_LEVEL_BITS 9 #define PT_GUEST_DIRTY_SHIFT PT_DIRTY_SHIFT #define PT_GUEST_ACCESSED_SHIFT PT_ACCESSED_SHIFT #define PT_HAVE_ACCESSED_DIRTY(mmu) true #ifdef CONFIG_X86_64 #define PT_MAX_FULL_LEVELS PT64_ROOT_MAX_LEVEL - #define CMPXCHG "cmpxchgq" #else #define PT_MAX_FULL_LEVELS 2 #endif @@ -42,36 +38,35 @@ #define pt_element_t u32 #define guest_walker guest_walker32 #define FNAME(name) paging##32_##name - #define PT_BASE_ADDR_MASK PT32_BASE_ADDR_MASK - #define PT_LVL_ADDR_MASK(lvl) PT32_LVL_ADDR_MASK(lvl) - #define PT_LVL_OFFSET_MASK(lvl) PT32_LVL_OFFSET_MASK(lvl) - #define PT_INDEX(addr, level) PT32_INDEX(addr, level) - #define PT_LEVEL_BITS PT32_LEVEL_BITS + #define PT_LEVEL_BITS 10 #define PT_MAX_FULL_LEVELS 2 #define PT_GUEST_DIRTY_SHIFT PT_DIRTY_SHIFT #define PT_GUEST_ACCESSED_SHIFT PT_ACCESSED_SHIFT #define PT_HAVE_ACCESSED_DIRTY(mmu) true - #define CMPXCHG "cmpxchgl" + + #define PT32_DIR_PSE36_SIZE 4 + #define PT32_DIR_PSE36_SHIFT 13 + #define PT32_DIR_PSE36_MASK \ + (((1ULL << PT32_DIR_PSE36_SIZE) - 1) << PT32_DIR_PSE36_SHIFT) #elif PTTYPE == PTTYPE_EPT #define pt_element_t u64 #define guest_walker guest_walkerEPT #define FNAME(name) ept_##name - #define PT_BASE_ADDR_MASK GUEST_PT64_BASE_ADDR_MASK - #define PT_LVL_ADDR_MASK(lvl) PT64_LVL_ADDR_MASK(lvl) - #define PT_LVL_OFFSET_MASK(lvl) PT64_LVL_OFFSET_MASK(lvl) - #define PT_INDEX(addr, level) PT64_INDEX(addr, level) - #define PT_LEVEL_BITS PT64_LEVEL_BITS + #define PT_LEVEL_BITS 9 #define PT_GUEST_DIRTY_SHIFT 9 #define PT_GUEST_ACCESSED_SHIFT 8 #define PT_HAVE_ACCESSED_DIRTY(mmu) (!(mmu)->cpu_role.base.ad_disabled) - #ifdef CONFIG_X86_64 - #define CMPXCHG "cmpxchgq" - #endif #define PT_MAX_FULL_LEVELS PT64_ROOT_MAX_LEVEL #else #error Invalid PTTYPE value #endif +/* Common logic, but per-type values. These also need to be undefined. */ +#define PT_BASE_ADDR_MASK ((pt_element_t)__PT_BASE_ADDR_MASK) +#define PT_LVL_ADDR_MASK(lvl) __PT_LVL_ADDR_MASK(PT_BASE_ADDR_MASK, lvl, PT_LEVEL_BITS) +#define PT_LVL_OFFSET_MASK(lvl) __PT_LVL_OFFSET_MASK(PT_BASE_ADDR_MASK, lvl, PT_LEVEL_BITS) +#define PT_INDEX(addr, lvl) __PT_INDEX(addr, lvl, PT_LEVEL_BITS) + #define PT_GUEST_DIRTY_MASK (1 << PT_GUEST_DIRTY_SHIFT) #define PT_GUEST_ACCESSED_MASK (1 << PT_GUEST_ACCESSED_SHIFT) @@ -97,6 +92,15 @@ struct guest_walker { struct x86_exception fault; }; +#if PTTYPE == 32 +static inline gfn_t pse36_gfn_delta(u32 gpte) +{ + int shift = 32 - PT32_DIR_PSE36_SHIFT - PAGE_SHIFT; + + return (gpte & PT32_DIR_PSE36_MASK) << shift; +} +#endif + static gfn_t gpte_to_gfn_lvl(pt_element_t gpte, int lvl) { return (gpte & PT_LVL_ADDR_MASK(lvl)) >> PAGE_SHIFT; @@ -320,7 +324,7 @@ static int FNAME(walk_addr_generic)(struct guest_walker *walker, trace_kvm_mmu_pagetable_walk(addr, access); retry_walk: walker->level = mmu->cpu_role.base.level; - pte = mmu->get_guest_pgd(vcpu); + pte = kvm_mmu_get_guest_pgd(vcpu, mmu); have_ad = PT_HAVE_ACCESSED_DIRTY(mmu); #if PTTYPE == 64 @@ -334,7 +338,6 @@ retry_walk: } #endif walker->max_level = walker->level; - ASSERT(!(is_long_mode(vcpu) && !is_pae(vcpu))); /* * FIXME: on Intel processors, loads of the PDPTE registers for PAE paging @@ -344,9 +347,21 @@ retry_walk: nested_access = (have_ad ? PFERR_WRITE_MASK : 0) | PFERR_USER_MASK; pte_access = ~0; + + /* + * Queue a page fault for injection if this assertion fails, as callers + * assume that walker.fault contains sane info on a walk failure. I.e. + * avoid making the situation worse by inducing even worse badness + * between when the assertion fails and when KVM kicks the vCPU out to + * userspace (because the VM is bugged). + */ + if (KVM_BUG_ON(is_long_mode(vcpu) && !is_pae(vcpu), vcpu->kvm)) + goto error; + ++walker->level; do { + struct kvm_memory_slot *slot; unsigned long host_addr; pt_access = pte_access; @@ -374,10 +389,14 @@ retry_walk: * information to fix the exit_qualification or exit_info_1 * fields. */ - if (unlikely(real_gpa == UNMAPPED_GVA)) + if (unlikely(real_gpa == INVALID_GPA)) return 0; - host_addr = kvm_vcpu_gfn_to_hva_prot(vcpu, gpa_to_gfn(real_gpa), + slot = kvm_vcpu_gfn_to_memslot(vcpu, gpa_to_gfn(real_gpa)); + if (!kvm_is_visible_memslot(slot)) + goto error; + + host_addr = gfn_to_hva_memslot_prot(slot, gpa_to_gfn(real_gpa), &walker->pte_writable[walker->level - 1]); if (unlikely(kvm_is_error_hva(host_addr))) goto error; @@ -421,11 +440,13 @@ retry_walk: gfn = gpte_to_gfn_lvl(pte, walker->level); gfn += (addr & PT_LVL_OFFSET_MASK(walker->level)) >> PAGE_SHIFT; - if (PTTYPE == 32 && walker->level > PG_LEVEL_4K && is_cpuid_PSE36()) +#if PTTYPE == 32 + if (walker->level > PG_LEVEL_4K && is_cpuid_PSE36()) gfn += pse36_gfn_delta(pte); +#endif real_gpa = kvm_translate_gpa(vcpu, mmu, gfn_to_gpa(gfn), access, &walker->fault); - if (real_gpa == UNMAPPED_GVA) + if (real_gpa == INVALID_GPA) return 0; walker->gfn = real_gpa >> PAGE_SHIFT; @@ -450,9 +471,6 @@ retry_walk: goto retry_walk; } - pgprintk("%s: pte %llx pte_access %x pt_access %x\n", - __func__, (u64)pte, walker->pte_access, - walker->pt_access[walker->level - 1]); return 1; error: @@ -466,7 +484,7 @@ error: #if PTTYPE == PTTYPE_EPT /* - * Use PFERR_RSVD_MASK in error_code to to tell if EPT + * Use PFERR_RSVD_MASK in error_code to tell if EPT * misconfiguration requires to be injected. The detection is * done by is_rsvd_bits_set() above. * @@ -479,21 +497,20 @@ error: * The other bits are set to 0. */ if (!(errcode & PFERR_RSVD_MASK)) { - vcpu->arch.exit_qualification &= (EPT_VIOLATION_GVA_IS_VALID | - EPT_VIOLATION_GVA_TRANSLATED); + walker->fault.exit_qualification = 0; + if (write_fault) - vcpu->arch.exit_qualification |= EPT_VIOLATION_ACC_WRITE; + walker->fault.exit_qualification |= EPT_VIOLATION_ACC_WRITE; if (user_fault) - vcpu->arch.exit_qualification |= EPT_VIOLATION_ACC_READ; + walker->fault.exit_qualification |= EPT_VIOLATION_ACC_READ; if (fetch_fault) - vcpu->arch.exit_qualification |= EPT_VIOLATION_ACC_INSTR; + walker->fault.exit_qualification |= EPT_VIOLATION_ACC_INSTR; /* * Note, pte_access holds the raw RWX bits from the EPTE, not * ACC_*_MASK flags! */ - vcpu->arch.exit_qualification |= (pte_access & VMX_EPT_RWX_MASK) << - EPT_VIOLATION_RWX_SHIFT; + walker->fault.exit_qualification |= EPT_VIOLATION_RWX_TO_PROT(pte_access); } #endif walker->fault.address = addr; @@ -513,34 +530,19 @@ static int FNAME(walk_addr)(struct guest_walker *walker, static bool FNAME(prefetch_gpte)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp, - u64 *spte, pt_element_t gpte, bool no_dirty_log) + u64 *spte, pt_element_t gpte) { - struct kvm_memory_slot *slot; unsigned pte_access; gfn_t gfn; - kvm_pfn_t pfn; if (FNAME(prefetch_invalid_gpte)(vcpu, sp, spte, gpte)) return false; - pgprintk("%s: gpte %llx spte %p\n", __func__, (u64)gpte, spte); - gfn = gpte_to_gfn(gpte); pte_access = sp->role.access & FNAME(gpte_access)(gpte); FNAME(protect_clean_gpte)(vcpu->arch.mmu, &pte_access, gpte); - slot = gfn_to_memslot_dirty_bitmap(vcpu, gfn, - no_dirty_log && (pte_access & ACC_WRITE_MASK)); - if (!slot) - return false; - - pfn = gfn_to_pfn_memslot_atomic(slot, gfn); - if (is_error_pfn(pfn)) - return false; - - mmu_set_spte(vcpu, slot, spte, pte_access, gfn, pfn, NULL); - kvm_release_pfn_clean(pfn); - return true; + return kvm_mmu_prefetch_sptes(vcpu, gfn, spte, 1, pte_access); } static bool FNAME(gpte_changed)(struct kvm_vcpu *vcpu, @@ -583,13 +585,13 @@ static void FNAME(pte_prefetch)(struct kvm_vcpu *vcpu, struct guest_walker *gw, * If addresses are being invalidated, skip prefetching to avoid * accidentally prefetching those addresses. */ - if (unlikely(vcpu->kvm->mmu_notifier_count)) + if (unlikely(vcpu->kvm->mmu_invalidate_in_progress)) return; if (sp->role.direct) return __direct_pte_prefetch(vcpu, sp, sptep); - i = (sptep - sp->spt) & ~(PTE_PREFETCH_NUM - 1); + i = spte_index(sptep) & ~(PTE_PREFETCH_NUM - 1); spte = sp->spt + i; for (i = 0; i < PTE_PREFETCH_NUM; i++, spte++) { @@ -599,7 +601,7 @@ static void FNAME(pte_prefetch)(struct kvm_vcpu *vcpu, struct guest_walker *gw, if (is_shadow_present_pte(*spte)) continue; - if (!FNAME(prefetch_gpte)(vcpu, sp, spte, gptep[i], true)) + if (!FNAME(prefetch_gpte)(vcpu, sp, spte, gptep[i])) break; } } @@ -631,64 +633,85 @@ static int FNAME(fetch)(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault, * really care if it changes underneath us after this point). */ if (FNAME(gpte_changed)(vcpu, gw, top_level)) - goto out_gpte_changed; + return RET_PF_RETRY; + + if (WARN_ON_ONCE(!VALID_PAGE(vcpu->arch.mmu->root.hpa))) + return RET_PF_RETRY; - if (WARN_ON(!VALID_PAGE(vcpu->arch.mmu->root.hpa))) - goto out_gpte_changed; + /* + * Load a new root and retry the faulting instruction in the extremely + * unlikely scenario that the guest root gfn became visible between + * loading a dummy root and handling the resulting page fault, e.g. if + * userspace create a memslot in the interim. + */ + if (unlikely(kvm_mmu_is_dummy_root(vcpu->arch.mmu->root.hpa))) { + kvm_make_request(KVM_REQ_MMU_FREE_OBSOLETE_ROOTS, vcpu); + return RET_PF_RETRY; + } - for (shadow_walk_init(&it, vcpu, fault->addr); - shadow_walk_okay(&it) && it.level > gw->level; - shadow_walk_next(&it)) { + for_each_shadow_entry(vcpu, fault->addr, it) { gfn_t table_gfn; clear_sp_write_flooding_count(it.sptep); - drop_large_spte(vcpu, it.sptep); - - sp = NULL; - if (!is_shadow_present_pte(*it.sptep)) { - table_gfn = gw->table_gfn[it.level - 2]; - access = gw->pt_access[it.level - 2]; - sp = kvm_mmu_get_page(vcpu, table_gfn, fault->addr, - it.level-1, false, access); - /* - * We must synchronize the pagetable before linking it - * because the guest doesn't need to flush tlb when - * the gpte is changed from non-present to present. - * Otherwise, the guest may use the wrong mapping. - * - * For PG_LEVEL_4K, kvm_mmu_get_page() has already - * synchronized it transiently via kvm_sync_page(). - * - * For higher level pagetable, we synchronize it via - * the slower mmu_sync_children(). If it needs to - * break, some progress has been made; return - * RET_PF_RETRY and retry on the next #PF. - * KVM_REQ_MMU_SYNC is not necessary but it - * expedites the process. - */ - if (sp->unsync_children && - mmu_sync_children(vcpu, sp, false)) - return RET_PF_RETRY; - } + if (it.level == gw->level) + break; + + table_gfn = gw->table_gfn[it.level - 2]; + access = gw->pt_access[it.level - 2]; + sp = kvm_mmu_get_child_sp(vcpu, it.sptep, table_gfn, + false, access); /* - * Verify that the gpte in the page we've just write - * protected is still there. + * Synchronize the new page before linking it, as the CPU (KVM) + * is architecturally disallowed from inserting non-present + * entries into the TLB, i.e. the guest isn't required to flush + * the TLB when changing the gPTE from non-present to present. + * + * For PG_LEVEL_4K, kvm_mmu_find_shadow_page() has already + * synchronized the page via kvm_sync_page(). + * + * For higher level pages, which cannot be unsync themselves + * but can have unsync children, synchronize via the slower + * mmu_sync_children(). If KVM needs to drop mmu_lock due to + * contention or to reschedule, instruct the caller to retry + * the #PF (mmu_sync_children() ensures forward progress will + * be made). + */ + if (sp != ERR_PTR(-EEXIST) && sp->unsync_children && + mmu_sync_children(vcpu, sp, false)) + return RET_PF_RETRY; + + /* + * Verify that the gpte in the page, which is now either + * write-protected or unsync, wasn't modified between the fault + * and acquiring mmu_lock. This needs to be done even when + * reusing an existing shadow page to ensure the information + * gathered by the walker matches the information stored in the + * shadow page (which could have been modified by a different + * vCPU even if the page was already linked). Holding mmu_lock + * prevents the shadow page from changing after this point. */ if (FNAME(gpte_changed)(vcpu, gw, it.level - 1)) - goto out_gpte_changed; + return RET_PF_RETRY; - if (sp) + if (sp != ERR_PTR(-EEXIST)) link_shadow_page(vcpu, it.sptep, sp); + + if (fault->write && table_gfn == fault->gfn) + fault->write_fault_to_shadow_pgtable = true; } + /* + * Adjust the hugepage size _after_ resolving indirect shadow pages. + * KVM doesn't support mapping hugepages into the guest for gfns that + * are being shadowed by KVM, i.e. allocating a new shadow page may + * affect the allowed hugepage size. + */ kvm_mmu_hugepage_adjust(vcpu, fault); trace_kvm_mmu_spte_requested(fault); for (; shadow_walk_okay(&it); shadow_walk_next(&it)) { - clear_sp_write_flooding_count(it.sptep); - /* * We cannot overwrite existing page tables with an NX * large page, as the leaf could be executable. @@ -696,22 +719,21 @@ static int FNAME(fetch)(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault, if (fault->nx_huge_page_workaround_enabled) disallowed_hugepage_adjust(fault, *it.sptep, it.level); - base_gfn = fault->gfn & ~(KVM_PAGES_PER_HPAGE(it.level) - 1); + base_gfn = gfn_round_for_level(fault->gfn, it.level); if (it.level == fault->goal_level) break; validate_direct_spte(vcpu, it.sptep, direct_access); - drop_large_spte(vcpu, it.sptep); + sp = kvm_mmu_get_child_sp(vcpu, it.sptep, base_gfn, + true, direct_access); + if (sp == ERR_PTR(-EEXIST)) + continue; - if (!is_shadow_present_pte(*it.sptep)) { - sp = kvm_mmu_get_page(vcpu, base_gfn, fault->addr, - it.level - 1, true, direct_access); - link_shadow_page(vcpu, it.sptep, sp); - if (fault->huge_page_disallowed && - fault->req_level >= it.level) - account_huge_nx_page(vcpu->kvm, sp); - } + link_shadow_page(vcpu, it.sptep, sp); + if (fault->huge_page_disallowed) + account_nx_huge_page(vcpu->kvm, sp, + fault->req_level >= it.level); } if (WARN_ON_ONCE(it.level != fault->goal_level)) @@ -724,49 +746,6 @@ static int FNAME(fetch)(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault, FNAME(pte_prefetch)(vcpu, gw, it.sptep); return ret; - -out_gpte_changed: - return RET_PF_RETRY; -} - - /* - * To see whether the mapped gfn can write its page table in the current - * mapping. - * - * It is the helper function of FNAME(page_fault). When guest uses large page - * size to map the writable gfn which is used as current page table, we should - * force kvm to use small page size to map it because new shadow page will be - * created when kvm establishes shadow page table that stop kvm using large - * page size. Do it early can avoid unnecessary #PF and emulation. - * - * @write_fault_to_shadow_pgtable will return true if the fault gfn is - * currently used as its page table. - * - * Note: the PDPT page table is not checked for PAE-32 bit guest. It is ok - * since the PDPT is always shadowed, that means, we can not use large page - * size to map the gfn which is used as PDPT. - */ -static bool -FNAME(is_self_change_mapping)(struct kvm_vcpu *vcpu, - struct guest_walker *walker, bool user_fault, - bool *write_fault_to_shadow_pgtable) -{ - int level; - gfn_t mask = ~(KVM_PAGES_PER_HPAGE(walker->level) - 1); - bool self_changed = false; - - if (!(walker->pte_access & ACC_WRITE_MASK || - (!is_cr0_wp(vcpu->arch.mmu) && !user_fault))) - return false; - - for (level = walker->level; level <= walker->max_level; level++) { - gfn_t gfn = walker->gfn ^ walker->table_gfn[level - 1]; - - self_changed |= !(gfn & mask); - *write_fault_to_shadow_pgtable |= !gfn; - } - - return self_changed; } /* @@ -787,10 +766,7 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault { struct guest_walker walker; int r; - unsigned long mmu_seq; - bool is_self_change_mapping; - pgprintk("%s: addr %lx err %x\n", __func__, fault->addr, fault->error_code); WARN_ON_ONCE(fault->is_tdp); /* @@ -805,7 +781,6 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault * The page is not mapped by the guest. Let the guest handle it. */ if (!r) { - pgprintk("%s: guest page fault\n", __func__); if (!fault->prefetch) kvm_inject_emulated_page_fault(vcpu, &walker.fault); @@ -813,35 +788,19 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault } fault->gfn = walker.gfn; + fault->max_level = walker.level; fault->slot = kvm_vcpu_gfn_to_memslot(vcpu, fault->gfn); if (page_fault_handle_page_track(vcpu, fault)) { shadow_page_table_clear_flood(vcpu, fault->addr); - return RET_PF_EMULATE; + return RET_PF_WRITE_PROTECTED; } r = mmu_topup_memory_caches(vcpu, true); if (r) return r; - vcpu->arch.write_fault_to_shadow_pgtable = false; - - is_self_change_mapping = FNAME(is_self_change_mapping)(vcpu, - &walker, fault->user, &vcpu->arch.write_fault_to_shadow_pgtable); - - if (is_self_change_mapping) - fault->max_level = PG_LEVEL_4K; - else - fault->max_level = walker.level; - - mmu_seq = vcpu->kvm->mmu_notifier_seq; - smp_rmb(); - - r = kvm_faultin_pfn(vcpu, fault); - if (r != RET_PF_CONTINUE) - return r; - - r = handle_abnormal_pfn(vcpu, fault, walker.pte_access); + r = kvm_mmu_faultin_pfn(vcpu, fault, walker.pte_access); if (r != RET_PF_CONTINUE) return r; @@ -867,7 +826,7 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault r = RET_PF_RETRY; write_lock(&vcpu->kvm->mmu_lock); - if (is_page_fault_stale(vcpu, fault, mmu_seq)) + if (is_page_fault_stale(vcpu, fault)) goto out_unlock; r = make_mmu_pages_available(vcpu); @@ -876,8 +835,8 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault r = FNAME(fetch)(vcpu, fault, &walker); out_unlock: + kvm_mmu_finish_page_fault(vcpu, fault, r); write_unlock(&vcpu->kvm->mmu_lock); - kvm_release_pfn_clean(fault->pfn); return r; } @@ -885,80 +844,21 @@ static gpa_t FNAME(get_level1_sp_gpa)(struct kvm_mmu_page *sp) { int offset = 0; - WARN_ON(sp->role.level != PG_LEVEL_4K); + WARN_ON_ONCE(sp->role.level != PG_LEVEL_4K); if (PTTYPE == 32) - offset = sp->role.quadrant << PT64_LEVEL_BITS; + offset = sp->role.quadrant << SPTE_LEVEL_BITS; return gfn_to_gpa(sp->gfn) + offset * sizeof(pt_element_t); } -static void FNAME(invlpg)(struct kvm_vcpu *vcpu, gva_t gva, hpa_t root_hpa) -{ - struct kvm_shadow_walk_iterator iterator; - struct kvm_mmu_page *sp; - u64 old_spte; - int level; - u64 *sptep; - - vcpu_clear_mmio_info(vcpu, gva); - - /* - * No need to check return value here, rmap_can_add() can - * help us to skip pte prefetch later. - */ - mmu_topup_memory_caches(vcpu, true); - - if (!VALID_PAGE(root_hpa)) { - WARN_ON(1); - return; - } - - write_lock(&vcpu->kvm->mmu_lock); - for_each_shadow_entry_using_root(vcpu, root_hpa, gva, iterator) { - level = iterator.level; - sptep = iterator.sptep; - - sp = sptep_to_sp(sptep); - old_spte = *sptep; - if (is_last_spte(old_spte, level)) { - pt_element_t gpte; - gpa_t pte_gpa; - - if (!sp->unsync) - break; - - pte_gpa = FNAME(get_level1_sp_gpa)(sp); - pte_gpa += (sptep - sp->spt) * sizeof(pt_element_t); - - mmu_page_zap_pte(vcpu->kvm, sp, sptep, NULL); - if (is_shadow_present_pte(old_spte)) - kvm_flush_remote_tlbs_with_address(vcpu->kvm, - sp->gfn, KVM_PAGES_PER_HPAGE(sp->role.level)); - - if (!rmap_can_add(vcpu)) - break; - - if (kvm_vcpu_read_guest_atomic(vcpu, pte_gpa, &gpte, - sizeof(pt_element_t))) - break; - - FNAME(prefetch_gpte)(vcpu, sp, sptep, gpte, false); - } - - if (!sp->unsync_children) - break; - } - write_unlock(&vcpu->kvm->mmu_lock); -} - /* Note, @addr is a GPA when gva_to_gpa() translates an L2 GPA to an L1 GPA. */ static gpa_t FNAME(gva_to_gpa)(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu, gpa_t addr, u64 access, struct x86_exception *exception) { struct guest_walker walker; - gpa_t gpa = UNMAPPED_GVA; + gpa_t gpa = INVALID_GPA; int r; #ifndef CONFIG_X86_64 @@ -978,99 +878,87 @@ static gpa_t FNAME(gva_to_gpa)(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu, } /* - * Using the cached information from sp->gfns is safe because: - * - The spte has a reference to the struct page, so the pfn for a given gfn - * can't change unless all sptes pointing to it are nuked first. + * Using the information in sp->shadowed_translation (kvm_mmu_page_get_gfn()) is + * safe because SPTEs are protected by mmu_notifiers and memslot generations, so + * the pfn for a given gfn can't change unless all SPTEs pointing to the gfn are + * nuked first. * * Returns - * < 0: the sp should be zapped - * 0: the sp is synced and no tlb flushing is required - * > 0: the sp is synced and tlb flushing is required + * < 0: failed to sync spte + * 0: the spte is synced and no tlb flushing is required + * > 0: the spte is synced and tlb flushing is required */ -static int FNAME(sync_page)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp) +static int FNAME(sync_spte)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp, int i) { - union kvm_mmu_page_role root_role = vcpu->arch.mmu->root_role; - int i; bool host_writable; gpa_t first_pte_gpa; - bool flush = false; - - /* - * Ignore various flags when verifying that it's safe to sync a shadow - * page using the current MMU context. - * - * - level: not part of the overall MMU role and will never match as the MMU's - * level tracks the root level - * - access: updated based on the new guest PTE - * - quadrant: not part of the overall MMU role (similar to level) - */ - const union kvm_mmu_page_role sync_role_ign = { - .level = 0xf, - .access = 0x7, - .quadrant = 0x3, - .passthrough = 0x1, - }; + u64 *sptep, spte; + struct kvm_memory_slot *slot; + unsigned pte_access; + pt_element_t gpte; + gpa_t pte_gpa; + gfn_t gfn; - /* - * Direct pages can never be unsync, and KVM should never attempt to - * sync a shadow page for a different MMU context, e.g. if the role - * differs then the memslot lookup (SMM vs. non-SMM) will be bogus, the - * reserved bits checks will be wrong, etc... - */ - if (WARN_ON_ONCE(sp->role.direct || - (sp->role.word ^ root_role.word) & ~sync_role_ign.word)) - return -1; + if (WARN_ON_ONCE(sp->spt[i] == SHADOW_NONPRESENT_VALUE || + !sp->shadowed_translation)) + return 0; first_pte_gpa = FNAME(get_level1_sp_gpa)(sp); + pte_gpa = first_pte_gpa + i * sizeof(pt_element_t); - for (i = 0; i < PT64_ENT_PER_PAGE; i++) { - u64 *sptep, spte; - struct kvm_memory_slot *slot; - unsigned pte_access; - pt_element_t gpte; - gpa_t pte_gpa; - gfn_t gfn; - - if (!sp->spt[i]) - continue; - - pte_gpa = first_pte_gpa + i * sizeof(pt_element_t); - - if (kvm_vcpu_read_guest_atomic(vcpu, pte_gpa, &gpte, - sizeof(pt_element_t))) - return -1; + if (kvm_vcpu_read_guest_atomic(vcpu, pte_gpa, &gpte, + sizeof(pt_element_t))) + return -1; - if (FNAME(prefetch_invalid_gpte)(vcpu, sp, &sp->spt[i], gpte)) { - flush = true; - continue; - } + if (FNAME(prefetch_invalid_gpte)(vcpu, sp, &sp->spt[i], gpte)) + return 1; - gfn = gpte_to_gfn(gpte); - pte_access = sp->role.access; - pte_access &= FNAME(gpte_access)(gpte); - FNAME(protect_clean_gpte)(vcpu->arch.mmu, &pte_access, gpte); + gfn = gpte_to_gfn(gpte); + pte_access = sp->role.access; + pte_access &= FNAME(gpte_access)(gpte); + FNAME(protect_clean_gpte)(vcpu->arch.mmu, &pte_access, gpte); - if (sync_mmio_spte(vcpu, &sp->spt[i], gfn, pte_access)) - continue; + if (sync_mmio_spte(vcpu, &sp->spt[i], gfn, pte_access)) + return 0; - if (gfn != sp->gfns[i]) { - drop_spte(vcpu->kvm, &sp->spt[i]); - flush = true; - continue; - } + /* + * Drop the SPTE if the new protections result in no effective + * "present" bit or if the gfn is changing. The former case + * only affects EPT with execute-only support with pte_access==0; + * all other paging modes will create a read-only SPTE if + * pte_access is zero. + */ + if ((pte_access | shadow_present_mask) == SHADOW_NONPRESENT_VALUE || + gfn != kvm_mmu_page_get_gfn(sp, i)) { + drop_spte(vcpu->kvm, &sp->spt[i]); + return 1; + } + /* + * Do nothing if the permissions are unchanged. The existing SPTE is + * still, and prefetch_invalid_gpte() has verified that the A/D bits + * are set in the "new" gPTE, i.e. there is no danger of missing an A/D + * update due to A/D bits being set in the SPTE but not the gPTE. + */ + if (kvm_mmu_page_get_access(sp, i) == pte_access) + return 0; - sptep = &sp->spt[i]; - spte = *sptep; - host_writable = spte & shadow_host_writable_mask; - slot = kvm_vcpu_gfn_to_memslot(vcpu, gfn); - make_spte(vcpu, sp, slot, pte_access, gfn, - spte_to_pfn(spte), spte, true, false, - host_writable, &spte); + /* Update the shadowed access bits in case they changed. */ + kvm_mmu_page_set_access(sp, i, pte_access); - flush |= mmu_spte_update(sptep, spte); - } + sptep = &sp->spt[i]; + spte = *sptep; + host_writable = spte & shadow_host_writable_mask; + slot = kvm_vcpu_gfn_to_memslot(vcpu, gfn); + make_spte(vcpu, sp, slot, pte_access, gfn, + spte_to_pfn(spte), spte, true, true, + host_writable, &spte); - return flush; + /* + * There is no need to mark the pfn dirty, as the new protections must + * be a subset of the old protections, i.e. synchronizing a SPTE cannot + * change the SPTE from read-only to writable. + */ + return mmu_spte_update(sptep, spte); } #undef pt_element_t @@ -1084,7 +972,6 @@ static int FNAME(sync_page)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp) #undef PT_MAX_FULL_LEVELS #undef gpte_to_gfn #undef gpte_to_gfn_lvl -#undef CMPXCHG #undef PT_GUEST_ACCESSED_MASK #undef PT_GUEST_DIRTY_MASK #undef PT_GUEST_DIRTY_SHIFT diff --git a/arch/x86/kvm/mmu/spte.c b/arch/x86/kvm/mmu/spte.c index b5960bbde7f7..cfce03d8f123 100644 --- a/arch/x86/kvm/mmu/spte.c +++ b/arch/x86/kvm/mmu/spte.c @@ -7,7 +7,7 @@ * Copyright (C) 2006 Qumranet, Inc. * Copyright 2020 Red Hat, Inc. and/or its affiliates. */ - +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt #include <linux/kvm_host.h> #include "mmu.h" @@ -20,7 +20,11 @@ #include <asm/vmx.h> bool __read_mostly enable_mmio_caching = true; +static bool __ro_after_init allow_mmio_caching; module_param_named(mmio_caching, enable_mmio_caching, bool, 0444); +EXPORT_SYMBOL_GPL(enable_mmio_caching); + +bool __read_mostly kvm_ad_enabled; u64 __read_mostly shadow_host_writable_mask; u64 __read_mostly shadow_mmu_writable_mask; @@ -40,13 +44,45 @@ u64 __read_mostly shadow_acc_track_mask; u64 __read_mostly shadow_nonpresent_or_rsvd_mask; u64 __read_mostly shadow_nonpresent_or_rsvd_lower_gfn_mask; -u8 __read_mostly shadow_phys_bits; +static u8 __init kvm_get_host_maxphyaddr(void) +{ + /* + * boot_cpu_data.x86_phys_bits is reduced when MKTME or SME are detected + * in CPU detection code, but the processor treats those reduced bits as + * 'keyID' thus they are not reserved bits. Therefore KVM needs to look at + * the physical address bits reported by CPUID, i.e. the raw MAXPHYADDR, + * when reasoning about CPU behavior with respect to MAXPHYADDR. + */ + if (likely(boot_cpu_data.extended_cpuid_level >= 0x80000008)) + return cpuid_eax(0x80000008) & 0xff; + + /* + * Quite weird to have VMX or SVM but not MAXPHYADDR; probably a VM with + * custom CPUID. Proceed with whatever the kernel found since these features + * aren't virtualizable (SME/SEV also require CPUIDs higher than 0x80000008). + */ + return boot_cpu_data.x86_phys_bits; +} + +void __init kvm_mmu_spte_module_init(void) +{ + /* + * Snapshot userspace's desire to allow MMIO caching. Whether or not + * KVM can actually enable MMIO caching depends on vendor-specific + * hardware capabilities and other module params that can't be resolved + * until the vendor module is loaded, i.e. enable_mmio_caching can and + * will change when the vendor module is (re)loaded. + */ + allow_mmio_caching = enable_mmio_caching; + + kvm_host.maxphyaddr = kvm_get_host_maxphyaddr(); +} static u64 generation_mmio_spte_mask(u64 gen) { u64 mask; - WARN_ON(gen & ~MMIO_SPTE_GEN_MASK); + WARN_ON_ONCE(gen & ~MMIO_SPTE_GEN_MASK); mask = (gen << MMIO_SPTE_GEN_LOW_SHIFT) & MMIO_SPTE_GEN_LOW_MASK; mask |= (gen << MMIO_SPTE_GEN_HIGH_SHIFT) & MMIO_SPTE_GEN_HIGH_MASK; @@ -59,10 +95,8 @@ u64 make_mmio_spte(struct kvm_vcpu *vcpu, u64 gfn, unsigned int access) u64 spte = generation_mmio_spte_mask(gen); u64 gpa = gfn << PAGE_SHIFT; - WARN_ON_ONCE(!shadow_mmio_value); - access &= shadow_mmio_access_mask; - spte |= shadow_mmio_value | access; + spte |= vcpu->kvm->arch.shadow_mmio_value | access; spte |= gpa | shadow_nonpresent_or_rsvd_mask; spte |= (gpa & shadow_nonpresent_or_rsvd_mask) << SHADOW_NONPRESENT_OR_RSVD_MASK_LEN; @@ -92,60 +126,75 @@ static bool kvm_is_mmio_pfn(kvm_pfn_t pfn) } /* - * Returns true if the SPTE has bits that may be set without holding mmu_lock. - * The caller is responsible for checking if the SPTE is shadow-present, and - * for determining whether or not the caller cares about non-leaf SPTEs. + * Returns true if the SPTE needs to be updated atomically due to having bits + * that may be changed without holding mmu_lock, and for which KVM must not + * lose information. E.g. KVM must not drop Dirty bit information. The caller + * is responsible for checking if the SPTE is shadow-present, and for + * determining whether or not the caller cares about non-leaf SPTEs. */ -bool spte_has_volatile_bits(u64 spte) +bool spte_needs_atomic_update(u64 spte) { - /* - * Always atomically update spte if it can be updated - * out of mmu-lock, it can ensure dirty bit is not lost, - * also, it can help us to get a stable is_writable_pte() - * to ensure tlb flush is not missed. - */ + /* SPTEs can be made Writable bit by KVM's fast page fault handler. */ if (!is_writable_pte(spte) && is_mmu_writable_spte(spte)) return true; - if (is_access_track_spte(spte)) + /* + * A/D-disabled SPTEs can be access-tracked by aging, and access-tracked + * SPTEs can be restored by KVM's fast page fault handler. + */ + if (!spte_ad_enabled(spte)) return true; - if (spte_ad_enabled(spte)) { - if (!(spte & shadow_accessed_mask) || - (is_writable_pte(spte) && !(spte & shadow_dirty_mask))) - return true; - } - - return false; + /* + * Dirty and Accessed bits can be set by the CPU. Ignore the Accessed + * bit, as KVM tolerates false negatives/positives, e.g. KVM doesn't + * invalidate TLBs when aging SPTEs, and so it's safe to clobber the + * Accessed bit (and rare in practice). + */ + return is_writable_pte(spte) && !(spte & shadow_dirty_mask); } bool make_spte(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp, const struct kvm_memory_slot *slot, unsigned int pte_access, gfn_t gfn, kvm_pfn_t pfn, - u64 old_spte, bool prefetch, bool can_unsync, + u64 old_spte, bool prefetch, bool synchronizing, bool host_writable, u64 *new_spte) { int level = sp->role.level; u64 spte = SPTE_MMU_PRESENT_MASK; bool wrprot = false; - if (sp->role.ad_disabled) - spte |= SPTE_TDP_AD_DISABLED_MASK; - else if (kvm_mmu_page_ad_need_write_protect(sp)) - spte |= SPTE_TDP_AD_WRPROT_ONLY_MASK; - /* - * For the EPT case, shadow_present_mask is 0 if hardware - * supports exec-only page table entries. In that case, + * For the EPT case, shadow_present_mask has no RWX bits set if + * exec-only page table entries are supported. In that case, * ACC_USER_MASK and shadow_user_mask are used to represent * read access. See FNAME(gpte_access) in paging_tmpl.h. */ + WARN_ON_ONCE((pte_access | shadow_present_mask) == SHADOW_NONPRESENT_VALUE); + + if (sp->role.ad_disabled) + spte |= SPTE_TDP_AD_DISABLED; + else if (kvm_mmu_page_ad_need_write_protect(vcpu->kvm, sp)) + spte |= SPTE_TDP_AD_WRPROT_ONLY; + spte |= shadow_present_mask; - if (!prefetch) - spte |= spte_shadow_accessed_mask(spte); + if (!prefetch || synchronizing) + spte |= shadow_accessed_mask; + /* + * For simplicity, enforce the NX huge page mitigation even if not + * strictly necessary. KVM could ignore the mitigation if paging is + * disabled in the guest, as the guest doesn't have any page tables to + * abuse. But to safely ignore the mitigation, KVM would have to + * ensure a new MMU is loaded (or all shadow pages zapped) when CR0.PG + * is toggled on, and that's a net negative for performance when TDP is + * enabled. When TDP is disabled, KVM will always switch to a new MMU + * when CR0.PG is toggled, but leveraging that to ignore the mitigation + * would tie make_spte() further to vCPU/MMU state, and add complexity + * just to optimize a mode that is anything but performance critical. + */ if (level > PG_LEVEL_4K && (pte_access & ACC_EXEC_MASK) && - is_nx_huge_page_enabled()) { + is_nx_huge_page_enabled(vcpu->kvm)) { pte_access &= ~ACC_EXEC_MASK; } @@ -159,10 +208,8 @@ bool make_spte(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp, if (level > PG_LEVEL_4K) spte |= PT_PAGE_SIZE_MASK; - if (tdp_enabled) - spte |= static_call(kvm_x86_get_mt_mask)(vcpu, gfn, - kvm_is_mmio_pfn(pfn)); + spte |= kvm_x86_call(get_mt_mask)(vcpu, gfn, kvm_is_mmio_pfn(pfn)); if (host_writable) spte |= shadow_host_writable_mask; else @@ -174,46 +221,42 @@ bool make_spte(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp, spte |= (u64)pfn << PAGE_SHIFT; if (pte_access & ACC_WRITE_MASK) { - spte |= PT_WRITABLE_MASK | shadow_mmu_writable_mask; - - /* - * Optimization: for pte sync, if spte was writable the hash - * lookup is unnecessary (and expensive). Write protection - * is responsibility of kvm_mmu_get_page / kvm_mmu_sync_roots. - * Same reasoning can be applied to dirty page accounting. - */ - if (is_writable_pte(old_spte)) - goto out; - /* * Unsync shadow pages that are reachable by the new, writable * SPTE. Write-protect the SPTE if the page can't be unsync'd, * e.g. it's write-tracked (upper-level SPs) or has one or more * shadow pages and unsync'ing pages is not allowed. + * + * When overwriting an existing leaf SPTE, and the old SPTE was + * writable, skip trying to unsync shadow pages as any relevant + * shadow pages must already be unsync, i.e. the hash lookup is + * unnecessary (and expensive). Note, this relies on KVM not + * changing PFNs without first zapping the old SPTE, which is + * guaranteed by both the shadow MMU and the TDP MMU. */ - if (mmu_try_to_unsync_pages(vcpu->kvm, slot, gfn, can_unsync, prefetch)) { - pgprintk("%s: found shadow page for %llx, marking ro\n", - __func__, gfn); + if ((!is_last_spte(old_spte, level) || !is_writable_pte(old_spte)) && + mmu_try_to_unsync_pages(vcpu->kvm, slot, gfn, synchronizing, prefetch)) wrprot = true; - pte_access &= ~ACC_WRITE_MASK; - spte &= ~(PT_WRITABLE_MASK | shadow_mmu_writable_mask); - } + else + spte |= PT_WRITABLE_MASK | shadow_mmu_writable_mask | + shadow_dirty_mask; } - if (pte_access & ACC_WRITE_MASK) - spte |= spte_shadow_dirty_mask(spte); - -out: - if (prefetch) + if (prefetch && !synchronizing) spte = mark_spte_for_access_track(spte); WARN_ONCE(is_rsvd_spte(&vcpu->arch.mmu->shadow_zero_check, spte, level), "spte = 0x%llx, level = %d, rsvd bits = 0x%llx", spte, level, get_rsvd_bits(&vcpu->arch.mmu->shadow_zero_check, spte, level)); + /* + * Mark the memslot dirty *after* modifying it for access tracking. + * Unlike folios, memslots can be safely marked dirty out of mmu_lock, + * i.e. in the fast page fault handler. + */ if ((spte & PT_WRITABLE_MASK) && kvm_slot_dirty_track_enabled(slot)) { /* Enforced by kvm_mmu_hugepage_adjust. */ - WARN_ON(level > PG_LEVEL_4K); + WARN_ON_ONCE(level > PG_LEVEL_4K); mark_page_dirty_in_slot(vcpu->kvm, slot, gfn); } @@ -221,15 +264,15 @@ out: return wrprot; } -static u64 make_spte_executable(u64 spte) +static u64 modify_spte_protections(u64 spte, u64 set, u64 clear) { bool is_access_track = is_access_track_spte(spte); if (is_access_track) spte = restore_acc_track_spte(spte); - spte &= ~shadow_nx_mask; - spte |= shadow_x_mask; + KVM_MMU_WARN_ON(set & clear); + spte = (spte | set) & ~clear; if (is_access_track) spte = mark_spte_for_access_track(spte); @@ -237,6 +280,16 @@ static u64 make_spte_executable(u64 spte) return spte; } +static u64 make_spte_executable(u64 spte) +{ + return modify_spte_protections(spte, shadow_x_mask, shadow_nx_mask); +} + +static u64 make_spte_nonexecutable(u64 spte) +{ + return modify_spte_protections(spte, shadow_nx_mask, shadow_x_mask); +} + /* * Construct an SPTE that maps a sub-page of the given huge page SPTE where * `index` identifies which sub-page. @@ -244,41 +297,55 @@ static u64 make_spte_executable(u64 spte) * This is used during huge page splitting to build the SPTEs that make up the * new page table. */ -u64 make_huge_page_split_spte(u64 huge_spte, int huge_level, int index) +u64 make_small_spte(struct kvm *kvm, u64 huge_spte, + union kvm_mmu_page_role role, int index) { - u64 child_spte; - int child_level; - - if (WARN_ON_ONCE(!is_shadow_present_pte(huge_spte))) - return 0; + u64 child_spte = huge_spte; - if (WARN_ON_ONCE(!is_large_pte(huge_spte))) - return 0; - - child_spte = huge_spte; - child_level = huge_level - 1; + KVM_BUG_ON(!is_shadow_present_pte(huge_spte) || !is_large_pte(huge_spte), kvm); /* * The child_spte already has the base address of the huge page being * split. So we just have to OR in the offset to the page at the next * lower level for the given index. */ - child_spte |= (index * KVM_PAGES_PER_HPAGE(child_level)) << PAGE_SHIFT; + child_spte |= (index * KVM_PAGES_PER_HPAGE(role.level)) << PAGE_SHIFT; - if (child_level == PG_LEVEL_4K) { + if (role.level == PG_LEVEL_4K) { child_spte &= ~PT_PAGE_SIZE_MASK; /* - * When splitting to a 4K page, mark the page executable as the - * NX hugepage mitigation no longer applies. + * When splitting to a 4K page where execution is allowed, mark + * the page executable as the NX hugepage mitigation no longer + * applies. */ - if (is_nx_huge_page_enabled()) + if ((role.access & ACC_EXEC_MASK) && is_nx_huge_page_enabled(kvm)) child_spte = make_spte_executable(child_spte); } return child_spte; } +u64 make_huge_spte(struct kvm *kvm, u64 small_spte, int level) +{ + u64 huge_spte; + + KVM_BUG_ON(!is_shadow_present_pte(small_spte) || level == PG_LEVEL_4K, kvm); + + huge_spte = small_spte | PT_PAGE_SIZE_MASK; + + /* + * huge_spte already has the address of the sub-page being collapsed + * from small_spte, so just clear the lower address bits to create the + * huge page address. + */ + huge_spte &= KVM_HPAGE_MASK(level) | ~PAGE_MASK; + + if (is_nx_huge_page_enabled(kvm)) + huge_spte = make_spte_nonexecutable(huge_spte); + + return huge_spte; +} u64 make_nonleaf_spte(u64 *child_pt, bool ad_disabled) { @@ -288,29 +355,13 @@ u64 make_nonleaf_spte(u64 *child_pt, bool ad_disabled) shadow_user_mask | shadow_x_mask | shadow_me_value; if (ad_disabled) - spte |= SPTE_TDP_AD_DISABLED_MASK; + spte |= SPTE_TDP_AD_DISABLED; else spte |= shadow_accessed_mask; return spte; } -u64 kvm_mmu_changed_pte_notifier_make_spte(u64 old_spte, kvm_pfn_t new_pfn) -{ - u64 new_spte; - - new_spte = old_spte & ~PT64_BASE_ADDR_MASK; - new_spte |= (u64)new_pfn << PAGE_SHIFT; - - new_spte &= ~PT_WRITABLE_MASK; - new_spte &= ~shadow_host_writable_mask; - new_spte &= ~shadow_mmu_writable_mask; - - new_spte = mark_spte_for_access_track(new_spte); - - return new_spte; -} - u64 mark_spte_for_access_track(u64 spte) { if (spte_ad_enabled(spte)) @@ -323,11 +374,11 @@ u64 mark_spte_for_access_track(u64 spte) WARN_ONCE(spte & (SHADOW_ACC_TRACK_SAVED_BITS_MASK << SHADOW_ACC_TRACK_SAVED_BITS_SHIFT), - "kvm: Access Tracking saved bit locations are not zero\n"); + "Access Tracking saved bit locations are not zero\n"); spte |= (spte & SHADOW_ACC_TRACK_SAVED_BITS_MASK) << SHADOW_ACC_TRACK_SAVED_BITS_SHIFT; - spte &= ~shadow_acc_track_mask; + spte &= ~(shadow_acc_track_mask | shadow_accessed_mask); return spte; } @@ -337,10 +388,24 @@ void kvm_mmu_set_mmio_spte_mask(u64 mmio_value, u64 mmio_mask, u64 access_mask) BUG_ON((u64)(unsigned)access_mask != access_mask); WARN_ON(mmio_value & shadow_nonpresent_or_rsvd_lower_gfn_mask); + /* + * Reset to the original module param value to honor userspace's desire + * to (dis)allow MMIO caching. Update the param itself so that + * userspace can see whether or not KVM is actually using MMIO caching. + */ + enable_mmio_caching = allow_mmio_caching; if (!enable_mmio_caching) mmio_value = 0; /* + * The mask must contain only bits that are carved out specifically for + * the MMIO SPTE mask, e.g. to ensure there's no overlap with the MMIO + * generation. + */ + if (WARN_ON(mmio_mask & ~SPTE_MMIO_ALLOWED_MASK)) + mmio_value = 0; + + /* * Disable MMIO caching if the MMIO value collides with the bits that * are used to hold the relocated GFN when the L1TF mitigation is * enabled. This should never fire as there is no known hardware that @@ -352,13 +417,13 @@ void kvm_mmu_set_mmio_spte_mask(u64 mmio_value, u64 mmio_mask, u64 access_mask) mmio_value = 0; /* - * The masked MMIO value must obviously match itself and a removed SPTE - * must not get a false positive. Removed SPTEs and MMIO SPTEs should - * never collide as MMIO must set some RWX bits, and removed SPTEs must + * The masked MMIO value must obviously match itself and a frozen SPTE + * must not get a false positive. Frozen SPTEs and MMIO SPTEs should + * never collide as MMIO must set some RWX bits, and frozen SPTEs must * not set any RWX bits. */ if (WARN_ON((mmio_value & mmio_mask) != mmio_value) || - WARN_ON(mmio_value && (REMOVED_SPTE & mmio_mask) == mmio_value)) + WARN_ON(mmio_value && (FROZEN_SPTE & mmio_mask) == mmio_value)) mmio_value = 0; if (!mmio_value) @@ -370,6 +435,12 @@ void kvm_mmu_set_mmio_spte_mask(u64 mmio_value, u64 mmio_mask, u64 access_mask) } EXPORT_SYMBOL_GPL(kvm_mmu_set_mmio_spte_mask); +void kvm_mmu_set_mmio_spte_value(struct kvm *kvm, u64 mmio_value) +{ + kvm->arch.shadow_mmio_value = mmio_value; +} +EXPORT_SYMBOL_GPL(kvm_mmu_set_mmio_spte_value); + void kvm_mmu_set_me_spte_mask(u64 me_value, u64 me_mask) { /* shadow_me_value must be a subset of shadow_me_mask */ @@ -383,12 +454,17 @@ EXPORT_SYMBOL_GPL(kvm_mmu_set_me_spte_mask); void kvm_mmu_set_ept_masks(bool has_ad_bits, bool has_exec_only) { + kvm_ad_enabled = has_ad_bits; + shadow_user_mask = VMX_EPT_READABLE_MASK; - shadow_accessed_mask = has_ad_bits ? VMX_EPT_ACCESS_BIT : 0ull; - shadow_dirty_mask = has_ad_bits ? VMX_EPT_DIRTY_BIT : 0ull; + shadow_accessed_mask = VMX_EPT_ACCESS_BIT; + shadow_dirty_mask = VMX_EPT_DIRTY_BIT; shadow_nx_mask = 0ull; shadow_x_mask = VMX_EPT_EXECUTABLE_MASK; - shadow_present_mask = has_exec_only ? 0ull : VMX_EPT_READABLE_MASK; + /* VMX_EPT_SUPPRESS_VE_BIT is needed for W or X violation. */ + shadow_present_mask = + (has_exec_only ? 0ull : VMX_EPT_READABLE_MASK) | VMX_EPT_SUPPRESS_VE_BIT; + shadow_acc_track_mask = VMX_EPT_RWX_MASK; shadow_host_writable_mask = EPT_SPTE_HOST_WRITABLE; shadow_mmu_writable_mask = EPT_SPTE_MMU_WRITABLE; @@ -398,7 +474,7 @@ void kvm_mmu_set_ept_masks(bool has_ad_bits, bool has_exec_only) * of an EPT paging-structure entry is 110b (write/execute). */ kvm_mmu_set_mmio_spte_mask(VMX_EPT_MISCONFIG_WX_VALUE, - VMX_EPT_RWX_MASK, 0); + VMX_EPT_RWX_MASK | VMX_EPT_SUPPRESS_VE_BIT, 0); } EXPORT_SYMBOL_GPL(kvm_mmu_set_ept_masks); @@ -407,7 +483,7 @@ void kvm_mmu_reset_all_pte_masks(void) u8 low_phys_bits; u64 mask; - shadow_phys_bits = kvm_get_shadow_phys_bits(); + kvm_ad_enabled = true; /* * If the CPU has 46 or less physical address bits, then set an @@ -439,6 +515,7 @@ void kvm_mmu_reset_all_pte_masks(void) shadow_nx_mask = PT64_NX_MASK; shadow_x_mask = 0; shadow_present_mask = PT_PRESENT_MASK; + shadow_acc_track_mask = 0; shadow_me_mask = 0; shadow_me_value = 0; @@ -453,7 +530,7 @@ void kvm_mmu_reset_all_pte_masks(void) * 52-bit physical addresses then there are no reserved PA bits in the * PTEs and so the reserved PA approach must be disabled. */ - if (shadow_phys_bits < 52) + if (kvm_host.maxphyaddr < 52) mask = BIT_ULL(51) | PT_PRESENT_MASK; else mask = 0; diff --git a/arch/x86/kvm/mmu/spte.h b/arch/x86/kvm/mmu/spte.h index 0127bb6e3c7d..1e94f081bdaf 100644 --- a/arch/x86/kvm/mmu/spte.h +++ b/arch/x86/kvm/mmu/spte.h @@ -3,9 +3,10 @@ #ifndef KVM_X86_MMU_SPTE_H #define KVM_X86_MMU_SPTE_H -#include "mmu_internal.h" +#include <asm/vmx.h> -extern bool __read_mostly enable_mmio_caching; +#include "mmu.h" +#include "mmu_internal.h" /* * A MMU present SPTE is backed by actual memory and may or may not be present @@ -30,18 +31,18 @@ extern bool __read_mostly enable_mmio_caching; */ #define SPTE_TDP_AD_SHIFT 52 #define SPTE_TDP_AD_MASK (3ULL << SPTE_TDP_AD_SHIFT) -#define SPTE_TDP_AD_ENABLED_MASK (0ULL << SPTE_TDP_AD_SHIFT) -#define SPTE_TDP_AD_DISABLED_MASK (1ULL << SPTE_TDP_AD_SHIFT) -#define SPTE_TDP_AD_WRPROT_ONLY_MASK (2ULL << SPTE_TDP_AD_SHIFT) -static_assert(SPTE_TDP_AD_ENABLED_MASK == 0); +#define SPTE_TDP_AD_ENABLED (0ULL << SPTE_TDP_AD_SHIFT) +#define SPTE_TDP_AD_DISABLED (1ULL << SPTE_TDP_AD_SHIFT) +#define SPTE_TDP_AD_WRPROT_ONLY (2ULL << SPTE_TDP_AD_SHIFT) +static_assert(SPTE_TDP_AD_ENABLED == 0); #ifdef CONFIG_DYNAMIC_PHYSICAL_MASK -#define PT64_BASE_ADDR_MASK (physical_mask & ~(u64)(PAGE_SIZE-1)) +#define SPTE_BASE_ADDR_MASK (physical_mask & ~(u64)(PAGE_SIZE-1)) #else -#define PT64_BASE_ADDR_MASK (((1ULL << 52) - 1) & ~(u64)(PAGE_SIZE-1)) +#define SPTE_BASE_ADDR_MASK (((1ULL << 52) - 1) & ~(u64)(PAGE_SIZE-1)) #endif -#define PT64_PERM_MASK (PT_PRESENT_MASK | PT_WRITABLE_MASK | shadow_user_mask \ +#define SPTE_PERM_MASK (PT_PRESENT_MASK | PT_WRITABLE_MASK | shadow_user_mask \ | shadow_x_mask | shadow_nx_mask | shadow_me_mask) #define ACC_EXEC_MASK 1 @@ -50,17 +51,13 @@ static_assert(SPTE_TDP_AD_ENABLED_MASK == 0); #define ACC_ALL (ACC_EXEC_MASK | ACC_WRITE_MASK | ACC_USER_MASK) /* The mask for the R/X bits in EPT PTEs */ -#define PT64_EPT_READABLE_MASK 0x1ull -#define PT64_EPT_EXECUTABLE_MASK 0x4ull - -#define PT64_LEVEL_BITS 9 +#define SPTE_EPT_READABLE_MASK 0x1ull +#define SPTE_EPT_EXECUTABLE_MASK 0x4ull -#define PT64_LEVEL_SHIFT(level) \ - (PAGE_SHIFT + (level - 1) * PT64_LEVEL_BITS) - -#define PT64_INDEX(address, level)\ - (((address) >> PT64_LEVEL_SHIFT(level)) & ((1 << PT64_LEVEL_BITS) - 1)) -#define SHADOW_PT_INDEX(addr, level) PT64_INDEX(addr, level) +#define SPTE_LEVEL_BITS 9 +#define SPTE_LEVEL_SHIFT(level) __PT_LEVEL_SHIFT(level, SPTE_LEVEL_BITS) +#define SPTE_INDEX(address, level) __PT_INDEX(address, level, SPTE_LEVEL_BITS) +#define SPTE_ENT_PER_PAGE __PT_ENT_PER_PAGE(SPTE_LEVEL_BITS) /* * The mask/shift to use for saving the original R/X bits when marking the PTE @@ -69,8 +66,8 @@ static_assert(SPTE_TDP_AD_ENABLED_MASK == 0); * restored only when a write is attempted to the page. This mask obviously * must not overlap the A/D type mask. */ -#define SHADOW_ACC_TRACK_SAVED_BITS_MASK (PT64_EPT_READABLE_MASK | \ - PT64_EPT_EXECUTABLE_MASK) +#define SHADOW_ACC_TRACK_SAVED_BITS_MASK (SPTE_EPT_READABLE_MASK | \ + SPTE_EPT_EXECUTABLE_MASK) #define SHADOW_ACC_TRACK_SAVED_BITS_SHIFT 54 #define SHADOW_ACC_TRACK_SAVED_MASK (SHADOW_ACC_TRACK_SAVED_BITS_MASK << \ SHADOW_ACC_TRACK_SAVED_BITS_SHIFT) @@ -129,6 +126,20 @@ static_assert(!(EPT_SPTE_MMU_WRITABLE & SHADOW_ACC_TRACK_SAVED_MASK)); static_assert(!(SPTE_MMU_PRESENT_MASK & (MMIO_SPTE_GEN_LOW_MASK | MMIO_SPTE_GEN_HIGH_MASK))); +/* + * The SPTE MMIO mask must NOT overlap the MMIO generation bits or the + * MMU-present bit. The generation obviously co-exists with the magic MMIO + * mask/value, and MMIO SPTEs are considered !MMU-present. + * + * The SPTE MMIO mask is allowed to use hardware "present" bits (i.e. all EPT + * RWX bits), all physical address bits (legal PA bits are used for "fast" MMIO + * and so they're off-limits for generation; additional checks ensure the mask + * doesn't overlap legal PA bits), and bit 63 (carved out for future usage). + */ +#define SPTE_MMIO_ALLOWED_MASK (BIT_ULL(63) | GENMASK_ULL(51, 12) | GENMASK_ULL(2, 0)) +static_assert(!(SPTE_MMIO_ALLOWED_MASK & + (SPTE_MMU_PRESENT_MASK | MMIO_SPTE_GEN_LOW_MASK | MMIO_SPTE_GEN_HIGH_MASK))); + #define MMIO_SPTE_GEN_LOW_BITS (MMIO_SPTE_GEN_LOW_END - MMIO_SPTE_GEN_LOW_START + 1) #define MMIO_SPTE_GEN_HIGH_BITS (MMIO_SPTE_GEN_HIGH_END - MMIO_SPTE_GEN_HIGH_START + 1) @@ -140,6 +151,31 @@ static_assert(MMIO_SPTE_GEN_LOW_BITS == 8 && MMIO_SPTE_GEN_HIGH_BITS == 11); #define MMIO_SPTE_GEN_MASK GENMASK_ULL(MMIO_SPTE_GEN_LOW_BITS + MMIO_SPTE_GEN_HIGH_BITS - 1, 0) +/* + * Non-present SPTE value needs to set bit 63 for TDX, in order to suppress + * #VE and get EPT violations on non-present PTEs. We can use the + * same value also without TDX for both VMX and SVM: + * + * For SVM NPT, for non-present spte (bit 0 = 0), other bits are ignored. + * For VMX EPT, bit 63 is ignored if #VE is disabled. (EPT_VIOLATION_VE=0) + * bit 63 is #VE suppress if #VE is enabled. (EPT_VIOLATION_VE=1) + */ +#ifdef CONFIG_X86_64 +#define SHADOW_NONPRESENT_VALUE BIT_ULL(63) +static_assert(!(SHADOW_NONPRESENT_VALUE & SPTE_MMU_PRESENT_MASK)); +#else +#define SHADOW_NONPRESENT_VALUE 0ULL +#endif + + +/* + * True if A/D bits are supported in hardware and are enabled by KVM. When + * enabled, KVM uses A/D bits for all non-nested MMUs. Because L1 can disable + * A/D bits in EPTP12, SP and SPTE variants are needed to handle the scenario + * where KVM is using A/D bits for L1, but not L2. + */ +extern bool __read_mostly kvm_ad_enabled; + extern u64 __read_mostly shadow_host_writable_mask; extern u64 __read_mostly shadow_mmu_writable_mask; extern u64 __read_mostly shadow_nx_mask; @@ -155,7 +191,7 @@ extern u64 __read_mostly shadow_me_value; extern u64 __read_mostly shadow_me_mask; /* - * SPTEs in MMUs without A/D bits are marked with SPTE_TDP_AD_DISABLED_MASK; + * SPTEs in MMUs without A/D bits are marked with SPTE_TDP_AD_DISABLED; * shadow_acc_track_mask is the set of bits to be cleared in non-accessed * pages. */ @@ -174,24 +210,30 @@ extern u64 __read_mostly shadow_nonpresent_or_rsvd_mask; /* * If a thread running without exclusive control of the MMU lock must perform a - * multi-part operation on an SPTE, it can set the SPTE to REMOVED_SPTE as a + * multi-part operation on an SPTE, it can set the SPTE to FROZEN_SPTE as a * non-present intermediate value. Other threads which encounter this value * should not modify the SPTE. * * Use a semi-arbitrary value that doesn't set RWX bits, i.e. is not-present on - * bot AMD and Intel CPUs, and doesn't set PFN bits, i.e. doesn't create a L1TF - * vulnerability. Use only low bits to avoid 64-bit immediates. + * both AMD and Intel CPUs, and doesn't set PFN bits, i.e. doesn't create a L1TF + * vulnerability. * * Only used by the TDP MMU. */ -#define REMOVED_SPTE 0x5a0ULL +#define FROZEN_SPTE (SHADOW_NONPRESENT_VALUE | 0x5a0ULL) -/* Removed SPTEs must not be misconstrued as shadow present PTEs. */ -static_assert(!(REMOVED_SPTE & SPTE_MMU_PRESENT_MASK)); +/* Frozen SPTEs must not be misconstrued as shadow present PTEs. */ +static_assert(!(FROZEN_SPTE & SPTE_MMU_PRESENT_MASK)); -static inline bool is_removed_spte(u64 spte) +static inline bool is_frozen_spte(u64 spte) { - return spte == REMOVED_SPTE; + return spte == FROZEN_SPTE; +} + +/* Get an SPTE's index into its parent's page table (and the spt array). */ +static inline int spte_index(u64 *sptep) +{ + return ((unsigned long)sptep / sizeof(*sptep)) & (SPTE_ENT_PER_PAGE - 1); } /* @@ -204,9 +246,43 @@ static inline bool is_removed_spte(u64 spte) */ extern u64 __read_mostly shadow_nonpresent_or_rsvd_lower_gfn_mask; -static inline bool is_mmio_spte(u64 spte) +static inline struct kvm_mmu_page *to_shadow_page(hpa_t shadow_page) +{ + struct page *page = pfn_to_page((shadow_page) >> PAGE_SHIFT); + + return (struct kvm_mmu_page *)page_private(page); +} + +static inline struct kvm_mmu_page *spte_to_child_sp(u64 spte) +{ + return to_shadow_page(spte & SPTE_BASE_ADDR_MASK); +} + +static inline struct kvm_mmu_page *sptep_to_sp(u64 *sptep) +{ + return to_shadow_page(__pa(sptep)); +} + +static inline struct kvm_mmu_page *root_to_sp(hpa_t root) +{ + if (kvm_mmu_is_dummy_root(root)) + return NULL; + + /* + * The "root" may be a special root, e.g. a PAE entry, treat it as a + * SPTE to ensure any non-PA bits are dropped. + */ + return spte_to_child_sp(root); +} + +static inline bool is_mirror_sptep(tdp_ptep_t sptep) +{ + return is_mirror_sp(sptep_to_sp(rcu_dereference(sptep))); +} + +static inline bool is_mmio_spte(struct kvm *kvm, u64 spte) { - return (spte & shadow_mmio_mask) == shadow_mmio_value && + return (spte & shadow_mmio_mask) == kvm->arch.shadow_mmio_value && likely(enable_mmio_caching); } @@ -215,15 +291,11 @@ static inline bool is_shadow_present_pte(u64 pte) return !!(pte & SPTE_MMU_PRESENT_MASK); } -/* - * Returns true if A/D bits are supported in hardware and are enabled by KVM. - * When enabled, KVM uses A/D bits for all non-nested MMUs. Because L1 can - * disable A/D bits in EPTP12, SP and SPTE variants are needed to handle the - * scenario where KVM is using A/D bits for L1, but not L2. - */ -static inline bool kvm_ad_enabled(void) +static inline bool is_ept_ve_possible(u64 spte) { - return !!shadow_accessed_mask; + return (shadow_present_mask & VMX_EPT_SUPPRESS_VE_BIT) && + !(spte & VMX_EPT_SUPPRESS_VE_BIT) && + (spte & VMX_EPT_RWX_MASK) != VMX_EPT_MISCONFIG_WX_VALUE; } static inline bool sp_ad_disabled(struct kvm_mmu_page *sp) @@ -233,31 +305,19 @@ static inline bool sp_ad_disabled(struct kvm_mmu_page *sp) static inline bool spte_ad_enabled(u64 spte) { - MMU_WARN_ON(!is_shadow_present_pte(spte)); - return (spte & SPTE_TDP_AD_MASK) != SPTE_TDP_AD_DISABLED_MASK; + KVM_MMU_WARN_ON(!is_shadow_present_pte(spte)); + return (spte & SPTE_TDP_AD_MASK) != SPTE_TDP_AD_DISABLED; } static inline bool spte_ad_need_write_protect(u64 spte) { - MMU_WARN_ON(!is_shadow_present_pte(spte)); + KVM_MMU_WARN_ON(!is_shadow_present_pte(spte)); /* - * This is benign for non-TDP SPTEs as SPTE_TDP_AD_ENABLED_MASK is '0', + * This is benign for non-TDP SPTEs as SPTE_TDP_AD_ENABLED is '0', * and non-TDP SPTEs will never set these bits. Optimize for 64-bit * TDP and do the A/D type check unconditionally. */ - return (spte & SPTE_TDP_AD_MASK) != SPTE_TDP_AD_ENABLED_MASK; -} - -static inline u64 spte_shadow_accessed_mask(u64 spte) -{ - MMU_WARN_ON(!is_shadow_present_pte(spte)); - return spte_ad_enabled(spte) ? shadow_accessed_mask : 0; -} - -static inline u64 spte_shadow_dirty_mask(u64 spte) -{ - MMU_WARN_ON(!is_shadow_present_pte(spte)); - return spte_ad_enabled(spte) ? shadow_dirty_mask : 0; + return (spte & SPTE_TDP_AD_MASK) != SPTE_TDP_AD_ENABLED; } static inline bool is_access_track_spte(u64 spte) @@ -282,22 +342,12 @@ static inline bool is_executable_pte(u64 spte) static inline kvm_pfn_t spte_to_pfn(u64 pte) { - return (pte & PT64_BASE_ADDR_MASK) >> PAGE_SHIFT; + return (pte & SPTE_BASE_ADDR_MASK) >> PAGE_SHIFT; } static inline bool is_accessed_spte(u64 spte) { - u64 accessed_mask = spte_shadow_accessed_mask(spte); - - return accessed_mask ? spte & accessed_mask - : !is_access_track_spte(spte); -} - -static inline bool is_dirty_spte(u64 spte) -{ - u64 dirty_mask = spte_shadow_dirty_mask(spte); - - return dirty_mask ? spte & dirty_mask : spte & PT_WRITABLE_MASK; + return spte & shadow_accessed_mask; } static inline u64 get_rsvd_bits(struct rsvd_bits_validate *rsvd_check, u64 pte, @@ -328,10 +378,10 @@ static __always_inline bool is_rsvd_spte(struct rsvd_bits_validate *rsvd_check, } /* - * An shadow-present leaf SPTE may be non-writable for 3 possible reasons: + * A shadow-present leaf SPTE may be non-writable for 4 possible reasons: * * 1. To intercept writes for dirty logging. KVM write-protects huge pages - * so that they can be split be split down into the dirty logging + * so that they can be split down into the dirty logging * granularity (4KiB) whenever the guest writes to them. KVM also * write-protects 4KiB pages so that writes can be recorded in the dirty log * (e.g. if not using PML). SPTEs are write-protected for dirty logging @@ -346,8 +396,13 @@ static __always_inline bool is_rsvd_spte(struct rsvd_bits_validate *rsvd_check, * read-only memslot or guest memory backed by a read-only VMA. Writes to * such pages are disallowed entirely. * - * To keep track of why a given SPTE is write-protected, KVM uses 2 - * software-only bits in the SPTE: + * 4. To emulate the Accessed bit for SPTEs without A/D bits. Note, in this + * case, the SPTE is access-protected, not just write-protected! + * + * For cases #1 and #4, KVM can safely make such SPTEs writable without taking + * mmu_lock as capturing the Accessed/Dirty state doesn't require taking it. + * To differentiate #1 and #4 from #2 and #3, KVM uses two software-only bits + * in the SPTE: * * shadow_mmu_writable_mask, aka MMU-writable - * Cleared on SPTEs that KVM is currently write-protecting for shadow paging @@ -376,7 +431,8 @@ static __always_inline bool is_rsvd_spte(struct rsvd_bits_validate *rsvd_check, * shadow page tables between vCPUs. Write-protecting an SPTE for dirty logging * (which does not clear the MMU-writable bit), does not flush TLBs before * dropping the lock, as it only needs to synchronize guest writes with the - * dirty bitmap. + * dirty bitmap. Similarly, making the SPTE inaccessible (and non-writable) for + * access-tracking via the clear_young() MMU notifier also does not flush TLBs. * * So, there is the problem: clearing the MMU-writable bit can encounter a * write-protected SPTE while CPUs still have writable mappings for that SPTE @@ -397,11 +453,11 @@ static inline void check_spte_writable_invariants(u64 spte) { if (spte & shadow_mmu_writable_mask) WARN_ONCE(!(spte & shadow_host_writable_mask), - "kvm: MMU-writable SPTE is not Host-writable: %llx", + KBUILD_MODNAME ": MMU-writable SPTE is not Host-writable: %llx", spte); else WARN_ONCE(is_writable_pte(spte), - "kvm: Writable SPTE is not MMU-writable: %llx", spte); + KBUILD_MODNAME ": Writable SPTE is not MMU-writable: %llx", spte); } static inline bool is_mmu_writable_spte(u64 spte) @@ -409,6 +465,50 @@ static inline bool is_mmu_writable_spte(u64 spte) return spte & shadow_mmu_writable_mask; } +/* + * Returns true if the access indicated by @fault is allowed by the existing + * SPTE protections. Note, the caller is responsible for checking that the + * SPTE is a shadow-present, leaf SPTE (either before or after). + */ +static inline bool is_access_allowed(struct kvm_page_fault *fault, u64 spte) +{ + if (fault->exec) + return is_executable_pte(spte); + + if (fault->write) + return is_writable_pte(spte); + + /* Fault was on Read access */ + return spte & PT_PRESENT_MASK; +} + +/* + * If the MMU-writable flag is cleared, i.e. the SPTE is write-protected for + * write-tracking, remote TLBs must be flushed, even if the SPTE was read-only, + * as KVM allows stale Writable TLB entries to exist. When dirty logging, KVM + * flushes TLBs based on whether or not dirty bitmap/ring entries were reaped, + * not whether or not SPTEs were modified, i.e. only the write-tracking case + * needs to flush at the time the SPTEs is modified, before dropping mmu_lock. + * + * Don't flush if the Accessed bit is cleared, as access tracking tolerates + * false negatives, e.g. KVM x86 omits TLB flushes even when aging SPTEs for a + * mmu_notifier.clear_flush_young() event. + * + * Lastly, don't flush if the Dirty bit is cleared, as KVM unconditionally + * flushes when enabling dirty logging (see kvm_mmu_slot_apply_flags()), and + * when clearing dirty logs, KVM flushes based on whether or not dirty entries + * were reaped from the bitmap/ring, not whether or not dirty SPTEs were found. + * + * Note, this logic only applies to shadow-present leaf SPTEs. The caller is + * responsible for checking that the old SPTE is shadow-present, and is also + * responsible for determining whether or not a TLB flush is required when + * modifying a shadow-present non-leaf SPTE. + */ +static inline bool leaf_spte_change_needs_tlb_flush(u64 old_spte, u64 new_spte) +{ + return is_mmu_writable_spte(old_spte) && !is_mmu_writable_spte(new_spte); +} + static inline u64 get_mmio_spte_generation(u64 spte) { u64 gen; @@ -418,14 +518,16 @@ static inline u64 get_mmio_spte_generation(u64 spte) return gen; } -bool spte_has_volatile_bits(u64 spte); +bool spte_needs_atomic_update(u64 spte); bool make_spte(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp, const struct kvm_memory_slot *slot, unsigned int pte_access, gfn_t gfn, kvm_pfn_t pfn, - u64 old_spte, bool prefetch, bool can_unsync, + u64 old_spte, bool prefetch, bool synchronizing, bool host_writable, u64 *new_spte); -u64 make_huge_page_split_spte(u64 huge_spte, int huge_level, int index); +u64 make_small_spte(struct kvm *kvm, u64 huge_spte, + union kvm_mmu_page_role role, int index); +u64 make_huge_spte(struct kvm *kvm, u64 small_spte, int level); u64 make_nonleaf_spte(u64 *child_pt, bool ad_disabled); u64 make_mmio_spte(struct kvm_vcpu *vcpu, u64 gfn, unsigned int access); u64 mark_spte_for_access_track(u64 spte); @@ -444,8 +546,7 @@ static inline u64 restore_acc_track_spte(u64 spte) return spte; } -u64 kvm_mmu_changed_pte_notifier_make_spte(u64 old_spte, kvm_pfn_t new_pfn); - +void __init kvm_mmu_spte_module_init(void); void kvm_mmu_reset_all_pte_masks(void); #endif diff --git a/arch/x86/kvm/mmu/tdp_iter.c b/arch/x86/kvm/mmu/tdp_iter.c index ee4802d7b36c..9e17bfa80901 100644 --- a/arch/x86/kvm/mmu/tdp_iter.c +++ b/arch/x86/kvm/mmu/tdp_iter.c @@ -1,4 +1,5 @@ // SPDX-License-Identifier: GPL-2.0 +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt #include "mmu_internal.h" #include "tdp_iter.h" @@ -11,15 +12,10 @@ static void tdp_iter_refresh_sptep(struct tdp_iter *iter) { iter->sptep = iter->pt_path[iter->level - 1] + - SHADOW_PT_INDEX(iter->gfn << PAGE_SHIFT, iter->level); + SPTE_INDEX((iter->gfn | iter->gfn_bits) << PAGE_SHIFT, iter->level); iter->old_spte = kvm_tdp_mmu_read_spte(iter->sptep); } -static gfn_t round_gfn_for_level(gfn_t gfn, int level) -{ - return gfn & -KVM_PAGES_PER_HPAGE(level); -} - /* * Return the TDP iterator to the root PT and allow it to continue its * traversal over the paging structure from there. @@ -30,7 +26,7 @@ void tdp_iter_restart(struct tdp_iter *iter) iter->yielded_gfn = iter->next_last_level_gfn; iter->level = iter->root_level; - iter->gfn = round_gfn_for_level(iter->next_last_level_gfn, iter->level); + iter->gfn = gfn_round_for_level(iter->next_last_level_gfn, iter->level); tdp_iter_refresh_sptep(iter); iter->valid = true; @@ -41,15 +37,18 @@ void tdp_iter_restart(struct tdp_iter *iter) * rooted at root_pt, starting with the walk to translate next_last_level_gfn. */ void tdp_iter_start(struct tdp_iter *iter, struct kvm_mmu_page *root, - int min_level, gfn_t next_last_level_gfn) + int min_level, gfn_t next_last_level_gfn, gfn_t gfn_bits) { - int root_level = root->role.level; - - WARN_ON(root_level < 1); - WARN_ON(root_level > PT64_ROOT_MAX_LEVEL); + if (WARN_ON_ONCE(!root || (root->role.level < 1) || + (root->role.level > PT64_ROOT_MAX_LEVEL) || + (gfn_bits && next_last_level_gfn >= gfn_bits))) { + iter->valid = false; + return; + } iter->next_last_level_gfn = next_last_level_gfn; - iter->root_level = root_level; + iter->gfn_bits = gfn_bits; + iter->root_level = root->role.level; iter->min_level = min_level; iter->pt_path[iter->root_level - 1] = (tdp_ptep_t)root->spt; iter->as_id = kvm_mmu_page_as_id(root); @@ -97,7 +96,7 @@ static bool try_step_down(struct tdp_iter *iter) iter->level--; iter->pt_path[iter->level - 1] = child_pt; - iter->gfn = round_gfn_for_level(iter->next_last_level_gfn, iter->level); + iter->gfn = gfn_round_for_level(iter->next_last_level_gfn, iter->level); tdp_iter_refresh_sptep(iter); return true; @@ -116,8 +115,8 @@ static bool try_step_side(struct tdp_iter *iter) * Check if the iterator is already at the end of the current page * table. */ - if (SHADOW_PT_INDEX(iter->gfn << PAGE_SHIFT, iter->level) == - (PT64_ENT_PER_PAGE - 1)) + if (SPTE_INDEX((iter->gfn | iter->gfn_bits) << PAGE_SHIFT, iter->level) == + (SPTE_ENT_PER_PAGE - 1)) return false; iter->gfn += KVM_PAGES_PER_HPAGE(iter->level); @@ -139,26 +138,17 @@ static bool try_step_up(struct tdp_iter *iter) return false; iter->level++; - iter->gfn = round_gfn_for_level(iter->gfn, iter->level); + iter->gfn = gfn_round_for_level(iter->gfn, iter->level); tdp_iter_refresh_sptep(iter); return true; } /* - * Step the iterator back up a level in the paging structure. Should only be - * used when the iterator is below the root level. - */ -void tdp_iter_step_up(struct tdp_iter *iter) -{ - WARN_ON(!try_step_up(iter)); -} - -/* * Step to the next SPTE in a pre-order traversal of the paging structure. * To get to the next SPTE, the iterator either steps down towards the goal * GFN, if at a present, non-last-level SPTE, or over to a SPTE mapping a - * highter GFN. + * higher GFN. * * The basic algorithm is as follows: * 1. If the current SPTE is a non-last-level SPTE, step down into the page diff --git a/arch/x86/kvm/mmu/tdp_iter.h b/arch/x86/kvm/mmu/tdp_iter.h index adfca0cf94d3..364c5da6c499 100644 --- a/arch/x86/kvm/mmu/tdp_iter.h +++ b/arch/x86/kvm/mmu/tdp_iter.h @@ -21,37 +21,55 @@ static inline u64 kvm_tdp_mmu_read_spte(tdp_ptep_t sptep) static inline u64 kvm_tdp_mmu_write_spte_atomic(tdp_ptep_t sptep, u64 new_spte) { + KVM_MMU_WARN_ON(is_ept_ve_possible(new_spte)); return xchg(rcu_dereference(sptep), new_spte); } +static inline u64 tdp_mmu_clear_spte_bits_atomic(tdp_ptep_t sptep, u64 mask) +{ + atomic64_t *sptep_atomic = (atomic64_t *)rcu_dereference(sptep); + + return (u64)atomic64_fetch_and(~mask, sptep_atomic); +} + static inline void __kvm_tdp_mmu_write_spte(tdp_ptep_t sptep, u64 new_spte) { + KVM_MMU_WARN_ON(is_ept_ve_possible(new_spte)); WRITE_ONCE(*rcu_dereference(sptep), new_spte); } +/* + * SPTEs must be modified atomically if they are shadow-present, leaf SPTEs, + * and have volatile bits (bits that can be set outside of mmu_lock) that + * must not be clobbered. + */ +static inline bool kvm_tdp_mmu_spte_need_atomic_update(u64 old_spte, int level) +{ + return is_shadow_present_pte(old_spte) && + is_last_spte(old_spte, level) && + spte_needs_atomic_update(old_spte); +} + static inline u64 kvm_tdp_mmu_write_spte(tdp_ptep_t sptep, u64 old_spte, u64 new_spte, int level) { - /* - * Atomically write the SPTE if it is a shadow-present, leaf SPTE with - * volatile bits, i.e. has bits that can be set outside of mmu_lock. - * The Writable bit can be set by KVM's fast page fault handler, and - * Accessed and Dirty bits can be set by the CPU. - * - * Note, non-leaf SPTEs do have Accessed bits and those bits are - * technically volatile, but KVM doesn't consume the Accessed bit of - * non-leaf SPTEs, i.e. KVM doesn't care if it clobbers the bit. This - * logic needs to be reassessed if KVM were to use non-leaf Accessed - * bits, e.g. to skip stepping down into child SPTEs when aging SPTEs. - */ - if (is_shadow_present_pte(old_spte) && is_last_spte(old_spte, level) && - spte_has_volatile_bits(old_spte)) + if (kvm_tdp_mmu_spte_need_atomic_update(old_spte, level)) return kvm_tdp_mmu_write_spte_atomic(sptep, new_spte); __kvm_tdp_mmu_write_spte(sptep, new_spte); return old_spte; } +static inline u64 tdp_mmu_clear_spte_bits(tdp_ptep_t sptep, u64 old_spte, + u64 mask, int level) +{ + if (kvm_tdp_mmu_spte_need_atomic_update(old_spte, level)) + return tdp_mmu_clear_spte_bits_atomic(sptep, mask); + + __kvm_tdp_mmu_write_spte(sptep, old_spte & ~mask); + return old_spte; +} + /* * A TDP iterator performs a pre-order walk over a TDP paging structure. */ @@ -71,8 +89,10 @@ struct tdp_iter { tdp_ptep_t pt_path[PT64_ROOT_MAX_LEVEL]; /* A pointer to the current SPTE */ tdp_ptep_t sptep; - /* The lowest GFN mapped by the current SPTE */ + /* The lowest GFN (mask bits excluded) mapped by the current SPTE */ gfn_t gfn; + /* Mask applied to convert the GFN to the mapping GPA */ + gfn_t gfn_bits; /* The level of the root page given to the iterator */ int root_level; /* The lowest level the iterator should traverse to */ @@ -100,20 +120,24 @@ struct tdp_iter { * Iterates over every SPTE mapping the GFN range [start, end) in a * preorder traversal. */ -#define for_each_tdp_pte_min_level(iter, root, min_level, start, end) \ - for (tdp_iter_start(&iter, root, min_level, start); \ - iter.valid && iter.gfn < end; \ +#define for_each_tdp_pte_min_level(iter, kvm, root, min_level, start, end) \ + for (tdp_iter_start(&iter, root, min_level, start, kvm_gfn_root_bits(kvm, root)); \ + iter.valid && iter.gfn < end; \ tdp_iter_next(&iter)) -#define for_each_tdp_pte(iter, root, start, end) \ - for_each_tdp_pte_min_level(iter, root, PG_LEVEL_4K, start, end) +#define for_each_tdp_pte_min_level_all(iter, root, min_level) \ + for (tdp_iter_start(&iter, root, min_level, 0, 0); \ + iter.valid && iter.gfn < tdp_mmu_max_gfn_exclusive(); \ + tdp_iter_next(&iter)) + +#define for_each_tdp_pte(iter, kvm, root, start, end) \ + for_each_tdp_pte_min_level(iter, kvm, root, PG_LEVEL_4K, start, end) tdp_ptep_t spte_to_child_pt(u64 pte, int level); void tdp_iter_start(struct tdp_iter *iter, struct kvm_mmu_page *root, - int min_level, gfn_t next_last_level_gfn); + int min_level, gfn_t next_last_level_gfn, gfn_t gfn_bits); void tdp_iter_next(struct tdp_iter *iter); void tdp_iter_restart(struct tdp_iter *iter); -void tdp_iter_step_up(struct tdp_iter *iter); #endif /* __KVM_X86_MMU_TDP_ITER_H */ diff --git a/arch/x86/kvm/mmu/tdp_mmu.c b/arch/x86/kvm/mmu/tdp_mmu.c index 7b9265d67131..7f3d7229b2c1 100644 --- a/arch/x86/kvm/mmu/tdp_mmu.c +++ b/arch/x86/kvm/mmu/tdp_mmu.c @@ -1,4 +1,5 @@ // SPDX-License-Identifier: GPL-2.0 +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt #include "mmu.h" #include "mmu_internal.h" @@ -10,28 +11,11 @@ #include <asm/cmpxchg.h> #include <trace/events/kvm.h> -static bool __read_mostly tdp_mmu_enabled = true; -module_param_named(tdp_mmu, tdp_mmu_enabled, bool, 0644); - /* Initializes the TDP MMU for the VM, if enabled. */ -int kvm_mmu_init_tdp_mmu(struct kvm *kvm) +void kvm_mmu_init_tdp_mmu(struct kvm *kvm) { - struct workqueue_struct *wq; - - if (!tdp_enabled || !READ_ONCE(tdp_mmu_enabled)) - return 0; - - wq = alloc_workqueue("kvm", WQ_UNBOUND|WQ_MEM_RECLAIM|WQ_CPU_INTENSIVE, 0); - if (!wq) - return -ENOMEM; - - /* This should not be changed for the lifetime of the VM. */ - kvm->arch.tdp_mmu_enabled = true; INIT_LIST_HEAD(&kvm->arch.tdp_mmu_roots); spin_lock_init(&kvm->arch.tdp_mmu_pages_lock); - INIT_LIST_HEAD(&kvm->arch.tdp_mmu_pages); - kvm->arch.tdp_mmu_zap_wq = wq; - return 1; } /* Arbitrarily returns true so that this may be used in if statements. */ @@ -48,25 +32,30 @@ static __always_inline bool kvm_lockdep_assert_mmu_lock_held(struct kvm *kvm, void kvm_mmu_uninit_tdp_mmu(struct kvm *kvm) { - if (!kvm->arch.tdp_mmu_enabled) - return; - - /* Also waits for any queued work items. */ - destroy_workqueue(kvm->arch.tdp_mmu_zap_wq); + /* + * Invalidate all roots, which besides the obvious, schedules all roots + * for zapping and thus puts the TDP MMU's reference to each root, i.e. + * ultimately frees all roots. + */ + kvm_tdp_mmu_invalidate_roots(kvm, KVM_VALID_ROOTS); + kvm_tdp_mmu_zap_invalidated_roots(kvm, false); - WARN_ON(!list_empty(&kvm->arch.tdp_mmu_pages)); +#ifdef CONFIG_KVM_PROVE_MMU + KVM_MMU_WARN_ON(atomic64_read(&kvm->arch.tdp_mmu_pages)); +#endif WARN_ON(!list_empty(&kvm->arch.tdp_mmu_roots)); /* * Ensure that all the outstanding RCU callbacks to free shadow pages - * can run before the VM is torn down. Work items on tdp_mmu_zap_wq - * can call kvm_tdp_mmu_put_root and create new callbacks. + * can run before the VM is torn down. Putting the last reference to + * zapped roots will create new callbacks. */ rcu_barrier(); } static void tdp_mmu_free_sp(struct kvm_mmu_page *sp) { + free_page((unsigned long)sp->external_spt); free_page((unsigned long)sp->spt); kmem_cache_free(mmu_page_header_cache, sp); } @@ -87,103 +76,17 @@ static void tdp_mmu_free_sp_rcu_callback(struct rcu_head *head) tdp_mmu_free_sp(sp); } -static void tdp_mmu_zap_root(struct kvm *kvm, struct kvm_mmu_page *root, - bool shared); - -static void tdp_mmu_zap_root_work(struct work_struct *work) +void kvm_tdp_mmu_put_root(struct kvm *kvm, struct kvm_mmu_page *root) { - struct kvm_mmu_page *root = container_of(work, struct kvm_mmu_page, - tdp_mmu_async_work); - struct kvm *kvm = root->tdp_mmu_async_data; - - read_lock(&kvm->mmu_lock); - - /* - * A TLB flush is not necessary as KVM performs a local TLB flush when - * allocating a new root (see kvm_mmu_load()), and when migrating vCPU - * to a different pCPU. Note, the local TLB flush on reuse also - * invalidates any paging-structure-cache entries, i.e. TLB entries for - * intermediate paging structures, that may be zapped, as such entries - * are associated with the ASID on both VMX and SVM. - */ - tdp_mmu_zap_root(kvm, root, true); - - /* - * Drop the refcount using kvm_tdp_mmu_put_root() to test its logic for - * avoiding an infinite loop. By design, the root is reachable while - * it's being asynchronously zapped, thus a different task can put its - * last reference, i.e. flowing through kvm_tdp_mmu_put_root() for an - * asynchronously zapped root is unavoidable. - */ - kvm_tdp_mmu_put_root(kvm, root, true); - - read_unlock(&kvm->mmu_lock); -} - -static void tdp_mmu_schedule_zap_root(struct kvm *kvm, struct kvm_mmu_page *root) -{ - root->tdp_mmu_async_data = kvm; - INIT_WORK(&root->tdp_mmu_async_work, tdp_mmu_zap_root_work); - queue_work(kvm->arch.tdp_mmu_zap_wq, &root->tdp_mmu_async_work); -} - -static inline bool kvm_tdp_root_mark_invalid(struct kvm_mmu_page *page) -{ - union kvm_mmu_page_role role = page->role; - role.invalid = true; - - /* No need to use cmpxchg, only the invalid bit can change. */ - role.word = xchg(&page->role.word, role.word); - return role.invalid; -} - -void kvm_tdp_mmu_put_root(struct kvm *kvm, struct kvm_mmu_page *root, - bool shared) -{ - kvm_lockdep_assert_mmu_lock_held(kvm, shared); - if (!refcount_dec_and_test(&root->tdp_mmu_root_count)) return; - WARN_ON(!root->tdp_mmu_page); - /* - * The root now has refcount=0. It is valid, but readers already - * cannot acquire a reference to it because kvm_tdp_mmu_get_root() - * rejects it. This remains true for the rest of the execution - * of this function, because readers visit valid roots only - * (except for tdp_mmu_zap_root_work(), which however - * does not acquire any reference itself). - * - * Even though there are flows that need to visit all roots for - * correctness, they all take mmu_lock for write, so they cannot yet - * run concurrently. The same is true after kvm_tdp_root_mark_invalid, - * since the root still has refcount=0. - * - * However, tdp_mmu_zap_root can yield, and writers do not expect to - * see refcount=0 (see for example kvm_tdp_mmu_invalidate_all_roots()). - * So the root temporarily gets an extra reference, going to refcount=1 - * while staying invalid. Readers still cannot acquire any reference; - * but writers are now allowed to run if tdp_mmu_zap_root yields and - * they might take an extra reference if they themselves yield. - * Therefore, when the reference is given back by the worker, - * there is no guarantee that the refcount is still 1. If not, whoever - * puts the last reference will free the page, but they will not have to - * zap the root because a root cannot go from invalid to valid. + * The TDP MMU itself holds a reference to each root until the root is + * explicitly invalidated, i.e. the final reference should be never be + * put for a valid root. */ - if (!kvm_tdp_root_mark_invalid(root)) { - refcount_set(&root->tdp_mmu_root_count, 1); - - /* - * Zapping the root in a worker is not just "nice to have"; - * it is required because kvm_tdp_mmu_invalidate_all_roots() - * skips already-invalid roots. If kvm_tdp_mmu_put_root() did - * not add the root to the workqueue, kvm_tdp_mmu_zap_all_fast() - * might return with some roots not zapped yet. - */ - tdp_mmu_schedule_zap_root(kvm, root); - return; - } + KVM_BUG_ON(!is_tdp_mmu_page(root) || !root->role.invalid, kvm); spin_lock(&kvm->arch.tdp_mmu_pages_lock); list_del_rcu(&root->link); @@ -191,22 +94,42 @@ void kvm_tdp_mmu_put_root(struct kvm *kvm, struct kvm_mmu_page *root, call_rcu(&root->rcu_head, tdp_mmu_free_sp_rcu_callback); } +static bool tdp_mmu_root_match(struct kvm_mmu_page *root, + enum kvm_tdp_mmu_root_types types) +{ + if (WARN_ON_ONCE(!(types & KVM_VALID_ROOTS))) + return false; + + if (root->role.invalid && !(types & KVM_INVALID_ROOTS)) + return false; + + if (likely(!is_mirror_sp(root))) + return types & KVM_DIRECT_ROOTS; + return types & KVM_MIRROR_ROOTS; +} + /* * Returns the next root after @prev_root (or the first root if @prev_root is - * NULL). A reference to the returned root is acquired, and the reference to - * @prev_root is released (the caller obviously must hold a reference to - * @prev_root if it's non-NULL). + * NULL) that matches with @types. A reference to the returned root is + * acquired, and the reference to @prev_root is released (the caller obviously + * must hold a reference to @prev_root if it's non-NULL). * - * If @only_valid is true, invalid roots are skipped. + * Roots that doesn't match with @types are skipped. * * Returns NULL if the end of tdp_mmu_roots was reached. */ static struct kvm_mmu_page *tdp_mmu_next_root(struct kvm *kvm, struct kvm_mmu_page *prev_root, - bool shared, bool only_valid) + enum kvm_tdp_mmu_root_types types) { struct kvm_mmu_page *next_root; + /* + * While the roots themselves are RCU-protected, fields such as + * role.invalid are protected by mmu_lock. + */ + lockdep_assert_held(&kvm->mmu_lock); + rcu_read_lock(); if (prev_root) @@ -218,7 +141,7 @@ static struct kvm_mmu_page *tdp_mmu_next_root(struct kvm *kvm, typeof(*next_root), link); while (next_root) { - if ((!only_valid || !next_root->role.invalid) && + if (tdp_mmu_root_match(next_root, types) && kvm_tdp_mmu_get_root(next_root)) break; @@ -229,7 +152,7 @@ static struct kvm_mmu_page *tdp_mmu_next_root(struct kvm *kvm, rcu_read_unlock(); if (prev_root) - kvm_tdp_mmu_put_root(kvm, prev_root, shared); + kvm_tdp_mmu_put_root(kvm, prev_root); return next_root; } @@ -241,22 +164,22 @@ static struct kvm_mmu_page *tdp_mmu_next_root(struct kvm *kvm, * recent root. (Unless keeping a live reference is desirable.) * * If shared is set, this function is operating under the MMU lock in read - * mode. In the unlikely event that this thread must free a root, the lock - * will be temporarily dropped and reacquired in write mode. + * mode. */ -#define __for_each_tdp_mmu_root_yield_safe(_kvm, _root, _as_id, _shared, _only_valid)\ - for (_root = tdp_mmu_next_root(_kvm, NULL, _shared, _only_valid); \ - _root; \ - _root = tdp_mmu_next_root(_kvm, _root, _shared, _only_valid)) \ - if (kvm_lockdep_assert_mmu_lock_held(_kvm, _shared) && \ - kvm_mmu_page_as_id(_root) != _as_id) { \ +#define __for_each_tdp_mmu_root_yield_safe(_kvm, _root, _as_id, _types) \ + for (_root = tdp_mmu_next_root(_kvm, NULL, _types); \ + ({ lockdep_assert_held(&(_kvm)->mmu_lock); }), _root; \ + _root = tdp_mmu_next_root(_kvm, _root, _types)) \ + if (_as_id >= 0 && kvm_mmu_page_as_id(_root) != _as_id) { \ } else -#define for_each_valid_tdp_mmu_root_yield_safe(_kvm, _root, _as_id, _shared) \ - __for_each_tdp_mmu_root_yield_safe(_kvm, _root, _as_id, _shared, true) +#define for_each_valid_tdp_mmu_root_yield_safe(_kvm, _root, _as_id) \ + __for_each_tdp_mmu_root_yield_safe(_kvm, _root, _as_id, KVM_VALID_ROOTS) -#define for_each_tdp_mmu_root_yield_safe(_kvm, _root, _as_id) \ - __for_each_tdp_mmu_root_yield_safe(_kvm, _root, _as_id, false, false) +#define for_each_tdp_mmu_root_yield_safe(_kvm, _root) \ + for (_root = tdp_mmu_next_root(_kvm, NULL, KVM_ALL_ROOTS); \ + ({ lockdep_assert_held(&(_kvm)->mmu_lock); }), _root; \ + _root = tdp_mmu_next_root(_kvm, _root, KVM_ALL_ROOTS)) /* * Iterate over all TDP MMU roots. Requires that mmu_lock be held for write, @@ -265,12 +188,29 @@ static struct kvm_mmu_page *tdp_mmu_next_root(struct kvm *kvm, * Holding mmu_lock for write obviates the need for RCU protection as the list * is guaranteed to be stable. */ -#define for_each_tdp_mmu_root(_kvm, _root, _as_id) \ - list_for_each_entry(_root, &_kvm->arch.tdp_mmu_roots, link) \ - if (kvm_lockdep_assert_mmu_lock_held(_kvm, false) && \ - kvm_mmu_page_as_id(_root) != _as_id) { \ +#define __for_each_tdp_mmu_root(_kvm, _root, _as_id, _types) \ + list_for_each_entry(_root, &_kvm->arch.tdp_mmu_roots, link) \ + if (kvm_lockdep_assert_mmu_lock_held(_kvm, false) && \ + ((_as_id >= 0 && kvm_mmu_page_as_id(_root) != _as_id) || \ + !tdp_mmu_root_match((_root), (_types)))) { \ + } else + +/* + * Iterate over all TDP MMU roots in an RCU read-side critical section. + * It is safe to iterate over the SPTEs under the root, but their values will + * be unstable, so all writes must be atomic. As this routine is meant to be + * used without holding the mmu_lock at all, any bits that are flipped must + * be reflected in kvm_tdp_mmu_spte_need_atomic_write(). + */ +#define for_each_tdp_mmu_root_rcu(_kvm, _root, _as_id, _types) \ + list_for_each_entry_rcu(_root, &_kvm->arch.tdp_mmu_roots, link) \ + if ((_as_id >= 0 && kvm_mmu_page_as_id(_root) != _as_id) || \ + !tdp_mmu_root_match((_root), (_types))) { \ } else +#define for_each_valid_tdp_mmu_root(_kvm, _root, _as_id) \ + __for_each_tdp_mmu_root(_kvm, _root, _as_id, KVM_VALID_ROOTS) + static struct kvm_mmu_page *tdp_mmu_alloc_sp(struct kvm_vcpu *vcpu) { struct kvm_mmu_page *sp; @@ -284,6 +224,8 @@ static struct kvm_mmu_page *tdp_mmu_alloc_sp(struct kvm_vcpu *vcpu) static void tdp_mmu_init_sp(struct kvm_mmu_page *sp, tdp_ptep_t sptep, gfn_t gfn, union kvm_mmu_page_role role) { + INIT_LIST_HEAD(&sp->possible_nx_huge_page_link); + set_page_private(virt_to_page(sp->spt), (unsigned long)sp); sp->role = role; @@ -308,68 +250,94 @@ static void tdp_mmu_init_child_sp(struct kvm_mmu_page *child_sp, tdp_mmu_init_sp(child_sp, iter->sptep, iter->gfn, role); } -hpa_t kvm_tdp_mmu_get_vcpu_root_hpa(struct kvm_vcpu *vcpu) +void kvm_tdp_mmu_alloc_root(struct kvm_vcpu *vcpu, bool mirror) { - union kvm_mmu_page_role role = vcpu->arch.mmu->root_role; + struct kvm_mmu *mmu = vcpu->arch.mmu; + union kvm_mmu_page_role role = mmu->root_role; + int as_id = kvm_mmu_role_as_id(role); struct kvm *kvm = vcpu->kvm; struct kvm_mmu_page *root; - lockdep_assert_held_write(&kvm->mmu_lock); + if (mirror) + role.is_mirror = true; /* - * Check for an existing root before allocating a new one. Note, the - * role check prevents consuming an invalid root. + * Check for an existing root before acquiring the pages lock to avoid + * unnecessary serialization if multiple vCPUs are loading a new root. + * E.g. when bringing up secondary vCPUs, KVM will already have created + * a valid root on behalf of the primary vCPU. */ - for_each_tdp_mmu_root(kvm, root, kvm_mmu_role_as_id(role)) { + read_lock(&kvm->mmu_lock); + + for_each_valid_tdp_mmu_root_yield_safe(kvm, root, as_id) { + if (root->role.word == role.word) + goto out_read_unlock; + } + + spin_lock(&kvm->arch.tdp_mmu_pages_lock); + + /* + * Recheck for an existing root after acquiring the pages lock, another + * vCPU may have raced ahead and created a new usable root. Manually + * walk the list of roots as the standard macros assume that the pages + * lock is *not* held. WARN if grabbing a reference to a usable root + * fails, as the last reference to a root can only be put *after* the + * root has been invalidated, which requires holding mmu_lock for write. + */ + list_for_each_entry(root, &kvm->arch.tdp_mmu_roots, link) { if (root->role.word == role.word && - kvm_tdp_mmu_get_root(root)) - goto out; + !WARN_ON_ONCE(!kvm_tdp_mmu_get_root(root))) + goto out_spin_unlock; } root = tdp_mmu_alloc_sp(vcpu); tdp_mmu_init_sp(root, NULL, 0, role); - refcount_set(&root->tdp_mmu_root_count, 1); - - spin_lock(&kvm->arch.tdp_mmu_pages_lock); + /* + * TDP MMU roots are kept until they are explicitly invalidated, either + * by a memslot update or by the destruction of the VM. Initialize the + * refcount to two; one reference for the vCPU, and one reference for + * the TDP MMU itself, which is held until the root is invalidated and + * is ultimately put by kvm_tdp_mmu_zap_invalidated_roots(). + */ + refcount_set(&root->tdp_mmu_root_count, 2); list_add_rcu(&root->link, &kvm->arch.tdp_mmu_roots); - spin_unlock(&kvm->arch.tdp_mmu_pages_lock); -out: - return __pa(root->spt); +out_spin_unlock: + spin_unlock(&kvm->arch.tdp_mmu_pages_lock); +out_read_unlock: + read_unlock(&kvm->mmu_lock); + /* + * Note, KVM_REQ_MMU_FREE_OBSOLETE_ROOTS will prevent entering the guest + * and actually consuming the root if it's invalidated after dropping + * mmu_lock, and the root can't be freed as this vCPU holds a reference. + */ + if (mirror) { + mmu->mirror_root_hpa = __pa(root->spt); + } else { + mmu->root.hpa = __pa(root->spt); + mmu->root.pgd = 0; + } } static void handle_changed_spte(struct kvm *kvm, int as_id, gfn_t gfn, u64 old_spte, u64 new_spte, int level, bool shared); -static void handle_changed_spte_acc_track(u64 old_spte, u64 new_spte, int level) +static void tdp_account_mmu_page(struct kvm *kvm, struct kvm_mmu_page *sp) { - if (!is_shadow_present_pte(old_spte) || !is_last_spte(old_spte, level)) - return; - - if (is_accessed_spte(old_spte) && - (!is_shadow_present_pte(new_spte) || !is_accessed_spte(new_spte) || - spte_to_pfn(old_spte) != spte_to_pfn(new_spte))) - kvm_set_pfn_accessed(spte_to_pfn(old_spte)); + kvm_account_pgtable_pages((void *)sp->spt, +1); +#ifdef CONFIG_KVM_PROVE_MMU + atomic64_inc(&kvm->arch.tdp_mmu_pages); +#endif } -static void handle_changed_spte_dirty_log(struct kvm *kvm, int as_id, gfn_t gfn, - u64 old_spte, u64 new_spte, int level) +static void tdp_unaccount_mmu_page(struct kvm *kvm, struct kvm_mmu_page *sp) { - bool pfn_changed; - struct kvm_memory_slot *slot; - - if (level > PG_LEVEL_4K) - return; - - pfn_changed = spte_to_pfn(old_spte) != spte_to_pfn(new_spte); - - if ((!is_writable_pte(old_spte) || pfn_changed) && - is_writable_pte(new_spte)) { - slot = __gfn_to_memslot(__kvm_memslots(kvm, as_id), gfn); - mark_page_dirty_in_slot(kvm, slot, gfn); - } + kvm_account_pgtable_pages((void *)sp->spt, -1); +#ifdef CONFIG_KVM_PROVE_MMU + atomic64_dec(&kvm->arch.tdp_mmu_pages); +#endif } /** @@ -377,24 +345,41 @@ static void handle_changed_spte_dirty_log(struct kvm *kvm, int as_id, gfn_t gfn, * * @kvm: kvm instance * @sp: the page to be removed - * @shared: This operation may not be running under the exclusive use of - * the MMU lock and the operation must synchronize with other - * threads that might be adding or removing pages. */ -static void tdp_mmu_unlink_sp(struct kvm *kvm, struct kvm_mmu_page *sp, - bool shared) +static void tdp_mmu_unlink_sp(struct kvm *kvm, struct kvm_mmu_page *sp) { - if (shared) - spin_lock(&kvm->arch.tdp_mmu_pages_lock); - else - lockdep_assert_held_write(&kvm->mmu_lock); + tdp_unaccount_mmu_page(kvm, sp); - list_del(&sp->link); - if (sp->lpage_disallowed) - unaccount_huge_nx_page(kvm, sp); + if (!sp->nx_huge_page_disallowed) + return; - if (shared) - spin_unlock(&kvm->arch.tdp_mmu_pages_lock); + spin_lock(&kvm->arch.tdp_mmu_pages_lock); + sp->nx_huge_page_disallowed = false; + untrack_possible_nx_huge_page(kvm, sp); + spin_unlock(&kvm->arch.tdp_mmu_pages_lock); +} + +static void remove_external_spte(struct kvm *kvm, gfn_t gfn, u64 old_spte, + int level) +{ + kvm_pfn_t old_pfn = spte_to_pfn(old_spte); + int ret; + + /* + * External (TDX) SPTEs are limited to PG_LEVEL_4K, and external + * PTs are removed in a special order, involving free_external_spt(). + * But remove_external_spte() will be called on non-leaf PTEs via + * __tdp_mmu_zap_root(), so avoid the error the former would return + * in this case. + */ + if (!is_last_spte(old_spte, level)) + return; + + /* Zapping leaf spte is allowed only when write lock is held. */ + lockdep_assert_held_write(&kvm->mmu_lock); + /* Because write lock is held, operation should success. */ + ret = kvm_x86_call(remove_external_spte)(kvm, gfn, level, old_pfn); + KVM_BUG_ON(ret, kvm); } /** @@ -423,9 +408,9 @@ static void handle_removed_pt(struct kvm *kvm, tdp_ptep_t pt, bool shared) trace_kvm_mmu_prepare_zap_page(sp); - tdp_mmu_unlink_sp(kvm, sp, shared); + tdp_mmu_unlink_sp(kvm, sp); - for (i = 0; i < PT64_ENT_PER_PAGE; i++) { + for (i = 0; i < SPTE_ENT_PER_PAGE; i++) { tdp_ptep_t sptep = pt + i; gfn_t gfn = base_gfn + i * KVM_PAGES_PER_HPAGE(level); u64 old_spte; @@ -434,14 +419,14 @@ static void handle_removed_pt(struct kvm *kvm, tdp_ptep_t pt, bool shared) /* * Set the SPTE to a nonpresent value that other * threads will not overwrite. If the SPTE was - * already marked as removed then another thread + * already marked as frozen then another thread * handling a page fault could overwrite it, so * set the SPTE until it is set from some other - * value to the removed SPTE value. + * value to the frozen SPTE value. */ for (;;) { - old_spte = kvm_tdp_mmu_write_spte_atomic(sptep, REMOVED_SPTE); - if (!is_removed_spte(old_spte)) + old_spte = kvm_tdp_mmu_write_spte_atomic(sptep, FROZEN_SPTE); + if (!is_frozen_spte(old_spte)) break; cpu_relax(); } @@ -472,11 +457,11 @@ static void handle_removed_pt(struct kvm *kvm, tdp_ptep_t pt, bool shared) * No retry is needed in the atomic update path as the * sole concern is dropping a Dirty bit, i.e. no other * task can zap/remove the SPTE as mmu_lock is held for - * write. Marking the SPTE as a removed SPTE is not + * write. Marking the SPTE as a frozen SPTE is not * strictly necessary for the same reason, but using - * the remove SPTE value keeps the shared/exclusive + * the frozen SPTE value keeps the shared/exclusive * paths consistent and allows the handle_changed_spte() - * call below to hardcode the new value to REMOVED_SPTE. + * call below to hardcode the new value to FROZEN_SPTE. * * Note, even though dropping a Dirty bit is the only * scenario where a non-atomic update could result in a @@ -488,17 +473,87 @@ static void handle_removed_pt(struct kvm *kvm, tdp_ptep_t pt, bool shared) * it here. */ old_spte = kvm_tdp_mmu_write_spte(sptep, old_spte, - REMOVED_SPTE, level); + FROZEN_SPTE, level); } handle_changed_spte(kvm, kvm_mmu_page_as_id(sp), gfn, - old_spte, REMOVED_SPTE, level, shared); + old_spte, FROZEN_SPTE, level, shared); + + if (is_mirror_sp(sp)) { + KVM_BUG_ON(shared, kvm); + remove_external_spte(kvm, gfn, old_spte, level); + } + } + + if (is_mirror_sp(sp) && + WARN_ON(kvm_x86_call(free_external_spt)(kvm, base_gfn, sp->role.level, + sp->external_spt))) { + /* + * Failed to free page table page in mirror page table and + * there is nothing to do further. + * Intentionally leak the page to prevent the kernel from + * accessing the encrypted page. + */ + sp->external_spt = NULL; } call_rcu(&sp->rcu_head, tdp_mmu_free_sp_rcu_callback); } +static void *get_external_spt(gfn_t gfn, u64 new_spte, int level) +{ + if (is_shadow_present_pte(new_spte) && !is_last_spte(new_spte, level)) { + struct kvm_mmu_page *sp = spte_to_child_sp(new_spte); + + WARN_ON_ONCE(sp->role.level + 1 != level); + WARN_ON_ONCE(sp->gfn != gfn); + return sp->external_spt; + } + + return NULL; +} + +static int __must_check set_external_spte_present(struct kvm *kvm, tdp_ptep_t sptep, + gfn_t gfn, u64 old_spte, + u64 new_spte, int level) +{ + bool was_present = is_shadow_present_pte(old_spte); + bool is_present = is_shadow_present_pte(new_spte); + bool is_leaf = is_present && is_last_spte(new_spte, level); + kvm_pfn_t new_pfn = spte_to_pfn(new_spte); + int ret = 0; + + KVM_BUG_ON(was_present, kvm); + + lockdep_assert_held(&kvm->mmu_lock); + /* + * We need to lock out other updates to the SPTE until the external + * page table has been modified. Use FROZEN_SPTE similar to + * the zapping case. + */ + if (!try_cmpxchg64(rcu_dereference(sptep), &old_spte, FROZEN_SPTE)) + return -EBUSY; + + /* + * Use different call to either set up middle level + * external page table, or leaf. + */ + if (is_leaf) { + ret = kvm_x86_call(set_external_spte)(kvm, gfn, level, new_pfn); + } else { + void *external_spt = get_external_spt(gfn, new_spte, level); + + KVM_BUG_ON(!external_spt, kvm); + ret = kvm_x86_call(link_external_spt)(kvm, gfn, level, external_spt); + } + if (ret) + __kvm_tdp_mmu_write_spte(sptep, old_spte); + else + __kvm_tdp_mmu_write_spte(sptep, new_spte); + return ret; +} + /** - * __handle_changed_spte - handle bookkeeping associated with an SPTE change + * handle_changed_spte - handle bookkeeping associated with an SPTE change * @kvm: kvm instance * @as_id: the address space of the paging structure the SPTE was a part of * @gfn: the base GFN that was mapped by the SPTE @@ -509,12 +564,13 @@ static void handle_removed_pt(struct kvm *kvm, tdp_ptep_t pt, bool shared) * the MMU lock and the operation must synchronize with other * threads that might be modifying SPTEs. * - * Handle bookkeeping that might result from the modification of a SPTE. - * This function must be called for all TDP SPTE modifications. + * Handle bookkeeping that might result from the modification of a SPTE. Note, + * dirty logging updates are handled in common code, not here (see make_spte() + * and fast_pf_fix_direct_spte()). */ -static void __handle_changed_spte(struct kvm *kvm, int as_id, gfn_t gfn, - u64 old_spte, u64 new_spte, int level, - bool shared) +static void handle_changed_spte(struct kvm *kvm, int as_id, gfn_t gfn, + u64 old_spte, u64 new_spte, int level, + bool shared) { bool was_present = is_shadow_present_pte(old_spte); bool is_present = is_shadow_present_pte(new_spte); @@ -522,9 +578,9 @@ static void __handle_changed_spte(struct kvm *kvm, int as_id, gfn_t gfn, bool is_leaf = is_present && is_last_spte(new_spte, level); bool pfn_changed = spte_to_pfn(old_spte) != spte_to_pfn(new_spte); - WARN_ON(level > PT64_ROOT_MAX_LEVEL); - WARN_ON(level < PG_LEVEL_4K); - WARN_ON(gfn & (KVM_PAGES_PER_HPAGE(level) - 1)); + WARN_ON_ONCE(level > PT64_ROOT_MAX_LEVEL); + WARN_ON_ONCE(level < PG_LEVEL_4K); + WARN_ON_ONCE(gfn & (KVM_PAGES_PER_HPAGE(level) - 1)); /* * If this warning were to trigger it would indicate that there was a @@ -564,19 +620,19 @@ static void __handle_changed_spte(struct kvm *kvm, int as_id, gfn_t gfn, */ if (!was_present && !is_present) { /* - * If this change does not involve a MMIO SPTE or removed SPTE, + * If this change does not involve a MMIO SPTE or frozen SPTE, * it is unexpected. Log the change, though it should not * impact the guest since both the former and current SPTEs * are nonpresent. */ - if (WARN_ON(!is_mmio_spte(old_spte) && - !is_mmio_spte(new_spte) && - !is_removed_spte(new_spte))) + if (WARN_ON_ONCE(!is_mmio_spte(kvm, old_spte) && + !is_mmio_spte(kvm, new_spte) && + !is_frozen_spte(new_spte))) pr_err("Unexpected SPTE change! Nonpresent SPTEs\n" "should not be replaced with another,\n" "different nonpresent SPTE, unless one or both\n" "are MMIO SPTEs, or the new SPTE is\n" - "a temporary removed SPTE.\n" + "a temporary frozen SPTE.\n" "as_id: %d gfn: %llx old_spte: %llx new_spte: %llx level: %d", as_id, gfn, old_spte, new_spte, level); return; @@ -585,10 +641,6 @@ static void __handle_changed_spte(struct kvm *kvm, int as_id, gfn_t gfn, if (is_leaf != was_leaf) kvm_update_page_stats(kvm, level, is_leaf ? 1 : -1); - if (was_leaf && is_dirty_spte(old_spte) && - (!is_present || !is_dirty_spte(new_spte) || pfn_changed)) - kvm_set_pfn_dirty(spte_to_pfn(old_spte)); - /* * Recursively handle child PTs if the change removed a subtree from * the paging structure. Note the WARN on the PFN changing without the @@ -600,15 +652,48 @@ static void __handle_changed_spte(struct kvm *kvm, int as_id, gfn_t gfn, handle_removed_pt(kvm, spte_to_child_pt(old_spte, level), shared); } -static void handle_changed_spte(struct kvm *kvm, int as_id, gfn_t gfn, - u64 old_spte, u64 new_spte, int level, - bool shared) +static inline int __must_check __tdp_mmu_set_spte_atomic(struct kvm *kvm, + struct tdp_iter *iter, + u64 new_spte) { - __handle_changed_spte(kvm, as_id, gfn, old_spte, new_spte, level, - shared); - handle_changed_spte_acc_track(old_spte, new_spte, level); - handle_changed_spte_dirty_log(kvm, as_id, gfn, old_spte, - new_spte, level); + /* + * The caller is responsible for ensuring the old SPTE is not a FROZEN + * SPTE. KVM should never attempt to zap or manipulate a FROZEN SPTE, + * and pre-checking before inserting a new SPTE is advantageous as it + * avoids unnecessary work. + */ + WARN_ON_ONCE(iter->yielded || is_frozen_spte(iter->old_spte)); + + if (is_mirror_sptep(iter->sptep) && !is_frozen_spte(new_spte)) { + int ret; + + /* + * Users of atomic zapping don't operate on mirror roots, + * so don't handle it and bug the VM if it's seen. + */ + if (KVM_BUG_ON(!is_shadow_present_pte(new_spte), kvm)) + return -EBUSY; + + ret = set_external_spte_present(kvm, iter->sptep, iter->gfn, + iter->old_spte, new_spte, iter->level); + if (ret) + return ret; + } else { + u64 *sptep = rcu_dereference(iter->sptep); + + /* + * Note, fast_pf_fix_direct_spte() can also modify TDP MMU SPTEs + * and does not hold the mmu_lock. On failure, i.e. if a + * different logical CPU modified the SPTE, try_cmpxchg64() + * updates iter->old_spte with the current value, so the caller + * operates on fresh data, e.g. if it retries + * tdp_mmu_set_spte_atomic() + */ + if (!try_cmpxchg64(sptep, &iter->old_spte, new_spte)) + return -EBUSY; + } + + return 0; } /* @@ -628,79 +713,26 @@ static void handle_changed_spte(struct kvm *kvm, int as_id, gfn_t gfn, * no side-effects other than setting iter->old_spte to the last * known value of the spte. */ -static inline int tdp_mmu_set_spte_atomic(struct kvm *kvm, - struct tdp_iter *iter, - u64 new_spte) +static inline int __must_check tdp_mmu_set_spte_atomic(struct kvm *kvm, + struct tdp_iter *iter, + u64 new_spte) { - u64 *sptep = rcu_dereference(iter->sptep); - u64 old_spte; - - /* - * The caller is responsible for ensuring the old SPTE is not a REMOVED - * SPTE. KVM should never attempt to zap or manipulate a REMOVED SPTE, - * and pre-checking before inserting a new SPTE is advantageous as it - * avoids unnecessary work. - */ - WARN_ON_ONCE(iter->yielded || is_removed_spte(iter->old_spte)); + int ret; lockdep_assert_held_read(&kvm->mmu_lock); - /* - * Note, fast_pf_fix_direct_spte() can also modify TDP MMU SPTEs and - * does not hold the mmu_lock. - */ - old_spte = cmpxchg64(sptep, iter->old_spte, new_spte); - if (old_spte != iter->old_spte) { - /* - * The page table entry was modified by a different logical - * CPU. Refresh iter->old_spte with the current value so the - * caller operates on fresh data, e.g. if it retries - * tdp_mmu_set_spte_atomic(). - */ - iter->old_spte = old_spte; - return -EBUSY; - } - - __handle_changed_spte(kvm, iter->as_id, iter->gfn, iter->old_spte, - new_spte, iter->level, true); - handle_changed_spte_acc_track(iter->old_spte, new_spte, iter->level); - - return 0; -} - -static inline int tdp_mmu_zap_spte_atomic(struct kvm *kvm, - struct tdp_iter *iter) -{ - int ret; - - /* - * Freeze the SPTE by setting it to a special, - * non-present value. This will stop other threads from - * immediately installing a present entry in its place - * before the TLBs are flushed. - */ - ret = tdp_mmu_set_spte_atomic(kvm, iter, REMOVED_SPTE); + ret = __tdp_mmu_set_spte_atomic(kvm, iter, new_spte); if (ret) return ret; - kvm_flush_remote_tlbs_with_address(kvm, iter->gfn, - KVM_PAGES_PER_HPAGE(iter->level)); - - /* - * No other thread can overwrite the removed SPTE as they must either - * wait on the MMU lock or use tdp_mmu_set_spte_atomic() which will not - * overwrite the special removed SPTE value. No bookkeeping is needed - * here since the SPTE is going from non-present to non-present. Use - * the raw write helper to avoid an unnecessary check on volatile bits. - */ - __kvm_tdp_mmu_write_spte(iter->sptep, 0); + handle_changed_spte(kvm, iter->as_id, iter->gfn, iter->old_spte, + new_spte, iter->level, true); return 0; } - /* - * __tdp_mmu_set_spte - Set a TDP MMU SPTE and handle the associated bookkeeping + * tdp_mmu_set_spte - Set a TDP MMU SPTE and handle the associated bookkeeping * @kvm: KVM instance * @as_id: Address space ID, i.e. regular vs. SMM * @sptep: Pointer to the SPTE @@ -708,91 +740,68 @@ static inline int tdp_mmu_zap_spte_atomic(struct kvm *kvm, * @new_spte: The new value that will be set for the SPTE * @gfn: The base GFN that was (or will be) mapped by the SPTE * @level: The level _containing_ the SPTE (its parent PT's level) - * @record_acc_track: Notify the MM subsystem of changes to the accessed state - * of the page. Should be set unless handling an MMU - * notifier for access tracking. Leaving record_acc_track - * unset in that case prevents page accesses from being - * double counted. - * @record_dirty_log: Record the page as dirty in the dirty bitmap if - * appropriate for the change being made. Should be set - * unless performing certain dirty logging operations. - * Leaving record_dirty_log unset in that case prevents page - * writes from being double counted. * * Returns the old SPTE value, which _may_ be different than @old_spte if the * SPTE had voldatile bits. */ -static u64 __tdp_mmu_set_spte(struct kvm *kvm, int as_id, tdp_ptep_t sptep, - u64 old_spte, u64 new_spte, gfn_t gfn, int level, - bool record_acc_track, bool record_dirty_log) +static u64 tdp_mmu_set_spte(struct kvm *kvm, int as_id, tdp_ptep_t sptep, + u64 old_spte, u64 new_spte, gfn_t gfn, int level) { lockdep_assert_held_write(&kvm->mmu_lock); /* * No thread should be using this function to set SPTEs to or from the - * temporary removed SPTE value. + * temporary frozen SPTE value. * If operating under the MMU lock in read mode, tdp_mmu_set_spte_atomic * should be used. If operating under the MMU lock in write mode, the - * use of the removed SPTE should not be necessary. + * use of the frozen SPTE should not be necessary. */ - WARN_ON(is_removed_spte(old_spte) || is_removed_spte(new_spte)); + WARN_ON_ONCE(is_frozen_spte(old_spte) || is_frozen_spte(new_spte)); old_spte = kvm_tdp_mmu_write_spte(sptep, old_spte, new_spte, level); - __handle_changed_spte(kvm, as_id, gfn, old_spte, new_spte, level, false); + handle_changed_spte(kvm, as_id, gfn, old_spte, new_spte, level, false); + + /* + * Users that do non-atomic setting of PTEs don't operate on mirror + * roots, so don't handle it and bug the VM if it's seen. + */ + if (is_mirror_sptep(sptep)) { + KVM_BUG_ON(is_shadow_present_pte(new_spte), kvm); + remove_external_spte(kvm, gfn, old_spte, level); + } - if (record_acc_track) - handle_changed_spte_acc_track(old_spte, new_spte, level); - if (record_dirty_log) - handle_changed_spte_dirty_log(kvm, as_id, gfn, old_spte, - new_spte, level); return old_spte; } -static inline void _tdp_mmu_set_spte(struct kvm *kvm, struct tdp_iter *iter, - u64 new_spte, bool record_acc_track, - bool record_dirty_log) +static inline void tdp_mmu_iter_set_spte(struct kvm *kvm, struct tdp_iter *iter, + u64 new_spte) { WARN_ON_ONCE(iter->yielded); - - iter->old_spte = __tdp_mmu_set_spte(kvm, iter->as_id, iter->sptep, - iter->old_spte, new_spte, - iter->gfn, iter->level, - record_acc_track, record_dirty_log); -} - -static inline void tdp_mmu_set_spte(struct kvm *kvm, struct tdp_iter *iter, - u64 new_spte) -{ - _tdp_mmu_set_spte(kvm, iter, new_spte, true, true); -} - -static inline void tdp_mmu_set_spte_no_acc_track(struct kvm *kvm, - struct tdp_iter *iter, - u64 new_spte) -{ - _tdp_mmu_set_spte(kvm, iter, new_spte, false, true); -} - -static inline void tdp_mmu_set_spte_no_dirty_log(struct kvm *kvm, - struct tdp_iter *iter, - u64 new_spte) -{ - _tdp_mmu_set_spte(kvm, iter, new_spte, true, false); + iter->old_spte = tdp_mmu_set_spte(kvm, iter->as_id, iter->sptep, + iter->old_spte, new_spte, + iter->gfn, iter->level); } -#define tdp_root_for_each_pte(_iter, _root, _start, _end) \ - for_each_tdp_pte(_iter, _root, _start, _end) +#define tdp_root_for_each_pte(_iter, _kvm, _root, _start, _end) \ + for_each_tdp_pte(_iter, _kvm, _root, _start, _end) -#define tdp_root_for_each_leaf_pte(_iter, _root, _start, _end) \ - tdp_root_for_each_pte(_iter, _root, _start, _end) \ +#define tdp_root_for_each_leaf_pte(_iter, _kvm, _root, _start, _end) \ + tdp_root_for_each_pte(_iter, _kvm, _root, _start, _end) \ if (!is_shadow_present_pte(_iter.old_spte) || \ !is_last_spte(_iter.old_spte, _iter.level)) \ continue; \ else -#define tdp_mmu_for_each_pte(_iter, _mmu, _start, _end) \ - for_each_tdp_pte(_iter, to_shadow_page(_mmu->root.hpa), _start, _end) +static inline bool __must_check tdp_mmu_iter_need_resched(struct kvm *kvm, + struct tdp_iter *iter) +{ + if (!need_resched() && !rwlock_needbreak(&kvm->mmu_lock)) + return false; + + /* Ensure forward progress has been made before yielding. */ + return iter->next_last_level_gfn != iter->yielded_gfn; +} /* * Yield if the MMU lock is contended or this thread needs to return control @@ -812,31 +821,27 @@ static inline bool __must_check tdp_mmu_iter_cond_resched(struct kvm *kvm, struct tdp_iter *iter, bool flush, bool shared) { - WARN_ON(iter->yielded); + KVM_MMU_WARN_ON(iter->yielded); - /* Ensure forward progress has been made before yielding. */ - if (iter->next_last_level_gfn == iter->yielded_gfn) + if (!tdp_mmu_iter_need_resched(kvm, iter)) return false; - if (need_resched() || rwlock_needbreak(&kvm->mmu_lock)) { - if (flush) - kvm_flush_remote_tlbs(kvm); - - rcu_read_unlock(); + if (flush) + kvm_flush_remote_tlbs(kvm); - if (shared) - cond_resched_rwlock_read(&kvm->mmu_lock); - else - cond_resched_rwlock_write(&kvm->mmu_lock); + rcu_read_unlock(); - rcu_read_lock(); + if (shared) + cond_resched_rwlock_read(&kvm->mmu_lock); + else + cond_resched_rwlock_write(&kvm->mmu_lock); - WARN_ON(iter->gfn > iter->next_last_level_gfn); + rcu_read_lock(); - iter->yielded = true; - } + WARN_ON_ONCE(iter->gfn > iter->next_last_level_gfn); - return iter->yielded; + iter->yielded = true; + return true; } static inline gfn_t tdp_mmu_max_gfn_exclusive(void) @@ -855,10 +860,7 @@ static void __tdp_mmu_zap_root(struct kvm *kvm, struct kvm_mmu_page *root, { struct tdp_iter iter; - gfn_t end = tdp_mmu_max_gfn_exclusive(); - gfn_t start = 0; - - for_each_tdp_pte_min_level(iter, root, zap_level, start, end) { + for_each_tdp_pte_min_level_all(iter, root, zap_level) { retry: if (tdp_mmu_iter_cond_resched(kvm, &iter, false, shared)) continue; @@ -870,8 +872,8 @@ retry: continue; if (!shared) - tdp_mmu_set_spte(kvm, &iter, 0); - else if (tdp_mmu_set_spte_atomic(kvm, &iter, 0)) + tdp_mmu_iter_set_spte(kvm, &iter, SHADOW_NONPRESENT_VALUE); + else if (tdp_mmu_set_spte_atomic(kvm, &iter, SHADOW_NONPRESENT_VALUE)) goto retry; } } @@ -897,15 +899,26 @@ static void tdp_mmu_zap_root(struct kvm *kvm, struct kvm_mmu_page *root, rcu_read_lock(); /* - * To avoid RCU stalls due to recursively removing huge swaths of SPs, - * split the zap into two passes. On the first pass, zap at the 1gb - * level, and then zap top-level SPs on the second pass. "1gb" is not - * arbitrary, as KVM must be able to zap a 1gb shadow page without - * inducing a stall to allow in-place replacement with a 1gb hugepage. + * Zap roots in multiple passes of decreasing granularity, i.e. zap at + * 4KiB=>2MiB=>1GiB=>root, in order to better honor need_resched() (all + * preempt models) or mmu_lock contention (full or real-time models). + * Zapping at finer granularity marginally increases the total time of + * the zap, but in most cases the zap itself isn't latency sensitive. * - * Because zapping a SP recurses on its children, stepping down to - * PG_LEVEL_4K in the iterator itself is unnecessary. + * If KVM is configured to prove the MMU, skip the 4KiB and 2MiB zaps + * in order to mimic the page fault path, which can replace a 1GiB page + * table with an equivalent 1GiB hugepage, i.e. can get saddled with + * zapping a 1GiB region that's fully populated with 4KiB SPTEs. This + * allows verifying that KVM can safely zap 1GiB regions, e.g. without + * inducing RCU stalls, without relying on a relatively rare event + * (zapping roots is orders of magnitude more common). Note, because + * zapping a SP recurses on its children, stepping down to PG_LEVEL_4K + * in the iterator itself is unnecessary. */ + if (!IS_ENABLED(CONFIG_KVM_PROVE_MMU)) { + __tdp_mmu_zap_root(kvm, root, shared, PG_LEVEL_4K); + __tdp_mmu_zap_root(kvm, root, shared, PG_LEVEL_2M); + } __tdp_mmu_zap_root(kvm, root, shared, PG_LEVEL_1G); __tdp_mmu_zap_root(kvm, root, shared, root->role.level); @@ -927,16 +940,13 @@ bool kvm_tdp_mmu_zap_sp(struct kvm *kvm, struct kvm_mmu_page *sp) if (WARN_ON_ONCE(!is_shadow_present_pte(old_spte))) return false; - __tdp_mmu_set_spte(kvm, kvm_mmu_page_as_id(sp), sp->ptep, old_spte, 0, - sp->gfn, sp->role.level + 1, true, true); + tdp_mmu_set_spte(kvm, kvm_mmu_page_as_id(sp), sp->ptep, old_spte, + SHADOW_NONPRESENT_VALUE, sp->gfn, sp->role.level + 1); return true; } /* - * Zap leafs SPTEs for the range of gfns, [start, end). Returns true if SPTEs - * have been cleared and a TLB flush is needed before releasing the MMU lock. - * * If can_yield is true, will release the MMU lock and reschedule if the * scheduler needs the CPU or there is contention on the MMU lock. If this * function cannot yield, it will not release the MMU lock or reschedule and @@ -954,7 +964,7 @@ static bool tdp_mmu_zap_leafs(struct kvm *kvm, struct kvm_mmu_page *root, rcu_read_lock(); - for_each_tdp_pte_min_level(iter, root, PG_LEVEL_4K, start, end) { + for_each_tdp_pte_min_level(iter, kvm, root, PG_LEVEL_4K, start, end) { if (can_yield && tdp_mmu_iter_cond_resched(kvm, &iter, flush, false)) { flush = false; @@ -965,8 +975,14 @@ static bool tdp_mmu_zap_leafs(struct kvm *kvm, struct kvm_mmu_page *root, !is_last_spte(iter.old_spte, iter.level)) continue; - tdp_mmu_set_spte(kvm, &iter, 0); - flush = true; + tdp_mmu_iter_set_spte(kvm, &iter, SHADOW_NONPRESENT_VALUE); + + /* + * Zappings SPTEs in invalid roots doesn't require a TLB flush, + * see kvm_tdp_mmu_zap_invalidated_roots() for details. + */ + if (!root->role.invalid) + flush = true; } rcu_read_unlock(); @@ -979,18 +995,17 @@ static bool tdp_mmu_zap_leafs(struct kvm *kvm, struct kvm_mmu_page *root, } /* - * Tears down the mappings for the range of gfns, [start, end), and frees the - * non-root pages mapping GFNs strictly within that range. Returns true if - * SPTEs have been cleared and a TLB flush is needed before releasing the - * MMU lock. + * Zap leaf SPTEs for the range of gfns, [start, end), for all *VALID** roots. + * Returns true if a TLB flush is needed before releasing the MMU lock, i.e. if + * one or more SPTEs were zapped since the MMU lock was last acquired. */ -bool kvm_tdp_mmu_zap_leafs(struct kvm *kvm, int as_id, gfn_t start, gfn_t end, - bool can_yield, bool flush) +bool kvm_tdp_mmu_zap_leafs(struct kvm *kvm, gfn_t start, gfn_t end, bool flush) { struct kvm_mmu_page *root; - for_each_tdp_mmu_root_yield_safe(kvm, root, as_id) - flush = tdp_mmu_zap_leafs(kvm, root, start, end, can_yield, flush); + lockdep_assert_held_write(&kvm->mmu_lock); + for_each_valid_tdp_mmu_root_yield_safe(kvm, root, -1) + flush = tdp_mmu_zap_leafs(kvm, root, start, end, true, flush); return flush; } @@ -998,62 +1013,126 @@ bool kvm_tdp_mmu_zap_leafs(struct kvm *kvm, int as_id, gfn_t start, gfn_t end, void kvm_tdp_mmu_zap_all(struct kvm *kvm) { struct kvm_mmu_page *root; - int i; /* - * Zap all roots, including invalid roots, as all SPTEs must be dropped - * before returning to the caller. Zap directly even if the root is - * also being zapped by a worker. Walking zapped top-level SPTEs isn't - * all that expensive and mmu_lock is already held, which means the - * worker has yielded, i.e. flushing the work instead of zapping here - * isn't guaranteed to be any faster. + * Zap all direct roots, including invalid direct roots, as all direct + * SPTEs must be dropped before returning to the caller. For TDX, mirror + * roots don't need handling in response to the mmu notifier (the caller). + * + * Zap directly even if the root is also being zapped by a concurrent + * "fast zap". Walking zapped top-level SPTEs isn't all that expensive + * and mmu_lock is already held, which means the other thread has yielded. * * A TLB flush is unnecessary, KVM zaps everything if and only the VM * is being destroyed or the userspace VMM has exited. In both cases, * KVM_RUN is unreachable, i.e. no vCPUs will ever service the request. */ - for (i = 0; i < KVM_ADDRESS_SPACE_NUM; i++) { - for_each_tdp_mmu_root_yield_safe(kvm, root, i) - tdp_mmu_zap_root(kvm, root, false); - } + lockdep_assert_held_write(&kvm->mmu_lock); + __for_each_tdp_mmu_root_yield_safe(kvm, root, -1, + KVM_DIRECT_ROOTS | KVM_INVALID_ROOTS) + tdp_mmu_zap_root(kvm, root, false); } /* * Zap all invalidated roots to ensure all SPTEs are dropped before the "fast * zap" completes. */ -void kvm_tdp_mmu_zap_invalidated_roots(struct kvm *kvm) +void kvm_tdp_mmu_zap_invalidated_roots(struct kvm *kvm, bool shared) { - flush_workqueue(kvm->arch.tdp_mmu_zap_wq); + struct kvm_mmu_page *root; + + if (shared) + read_lock(&kvm->mmu_lock); + else + write_lock(&kvm->mmu_lock); + + for_each_tdp_mmu_root_yield_safe(kvm, root) { + if (!root->tdp_mmu_scheduled_root_to_zap) + continue; + + root->tdp_mmu_scheduled_root_to_zap = false; + KVM_BUG_ON(!root->role.invalid, kvm); + + /* + * A TLB flush is not necessary as KVM performs a local TLB + * flush when allocating a new root (see kvm_mmu_load()), and + * when migrating a vCPU to a different pCPU. Note, the local + * TLB flush on reuse also invalidates paging-structure-cache + * entries, i.e. TLB entries for intermediate paging structures, + * that may be zapped, as such entries are associated with the + * ASID on both VMX and SVM. + */ + tdp_mmu_zap_root(kvm, root, shared); + + /* + * The referenced needs to be put *after* zapping the root, as + * the root must be reachable by mmu_notifiers while it's being + * zapped + */ + kvm_tdp_mmu_put_root(kvm, root); + } + + if (shared) + read_unlock(&kvm->mmu_lock); + else + write_unlock(&kvm->mmu_lock); } /* * Mark each TDP MMU root as invalid to prevent vCPUs from reusing a root that * is about to be zapped, e.g. in response to a memslots update. The actual - * zapping is performed asynchronously, so a reference is taken on all roots. - * Using a separate workqueue makes it easy to ensure that the destruction is - * performed before the "fast zap" completes, without keeping a separate list - * of invalidated roots; the list is effectively the list of work items in - * the workqueue. - * - * Get a reference even if the root is already invalid, the asynchronous worker - * assumes it was gifted a reference to the root it processes. Because mmu_lock - * is held for write, it should be impossible to observe a root with zero refcount, - * i.e. the list of roots cannot be stale. + * zapping is done separately so that it happens with mmu_lock with read, + * whereas invalidating roots must be done with mmu_lock held for write (unless + * the VM is being destroyed). * - * This has essentially the same effect for the TDP MMU - * as updating mmu_valid_gen does for the shadow MMU. + * Note, kvm_tdp_mmu_zap_invalidated_roots() is gifted the TDP MMU's reference. + * See kvm_tdp_mmu_alloc_root(). */ -void kvm_tdp_mmu_invalidate_all_roots(struct kvm *kvm) +void kvm_tdp_mmu_invalidate_roots(struct kvm *kvm, + enum kvm_tdp_mmu_root_types root_types) { struct kvm_mmu_page *root; - lockdep_assert_held_write(&kvm->mmu_lock); + /* + * Invalidating invalid roots doesn't make sense, prevent developers from + * having to think about it. + */ + if (WARN_ON_ONCE(root_types & KVM_INVALID_ROOTS)) + root_types &= ~KVM_INVALID_ROOTS; + + /* + * mmu_lock must be held for write to ensure that a root doesn't become + * invalid while there are active readers (invalidating a root while + * there are active readers may or may not be problematic in practice, + * but it's uncharted territory and not supported). + * + * Waive the assertion if there are no users of @kvm, i.e. the VM is + * being destroyed after all references have been put, or if no vCPUs + * have been created (which means there are no roots), i.e. the VM is + * being destroyed in an error path of KVM_CREATE_VM. + */ + if (IS_ENABLED(CONFIG_PROVE_LOCKING) && + refcount_read(&kvm->users_count) && kvm->created_vcpus) + lockdep_assert_held_write(&kvm->mmu_lock); + + /* + * As above, mmu_lock isn't held when destroying the VM! There can't + * be other references to @kvm, i.e. nothing else can invalidate roots + * or get/put references to roots. + */ list_for_each_entry(root, &kvm->arch.tdp_mmu_roots, link) { - if (!root->role.invalid && - !WARN_ON_ONCE(!kvm_tdp_mmu_get_root(root))) { + if (!tdp_mmu_root_match(root, root_types)) + continue; + + /* + * Note, invalid roots can outlive a memslot update! Invalid + * roots must be *zapped* before the memslot update completes, + * but a different task can acquire a reference and keep the + * root alive after its been zapped. + */ + if (!root->role.invalid) { + root->tdp_mmu_scheduled_root_to_zap = true; root->role.invalid = true; - tdp_mmu_schedule_zap_root(kvm, root); } } } @@ -1071,35 +1150,42 @@ static int tdp_mmu_map_handle_target_level(struct kvm_vcpu *vcpu, int ret = RET_PF_FIXED; bool wrprot = false; - WARN_ON(sp->role.level != fault->goal_level); + if (WARN_ON_ONCE(sp->role.level != fault->goal_level)) + return RET_PF_RETRY; + + if (is_shadow_present_pte(iter->old_spte) && + (fault->prefetch || is_access_allowed(fault, iter->old_spte)) && + is_last_spte(iter->old_spte, iter->level)) { + WARN_ON_ONCE(fault->pfn != spte_to_pfn(iter->old_spte)); + return RET_PF_SPURIOUS; + } + if (unlikely(!fault->slot)) new_spte = make_mmio_spte(vcpu, iter->gfn, ACC_ALL); else wrprot = make_spte(vcpu, sp, fault->slot, ACC_ALL, iter->gfn, - fault->pfn, iter->old_spte, fault->prefetch, true, - fault->map_writable, &new_spte); + fault->pfn, iter->old_spte, fault->prefetch, + false, fault->map_writable, &new_spte); if (new_spte == iter->old_spte) ret = RET_PF_SPURIOUS; else if (tdp_mmu_set_spte_atomic(vcpu->kvm, iter, new_spte)) return RET_PF_RETRY; else if (is_shadow_present_pte(iter->old_spte) && - !is_last_spte(iter->old_spte, iter->level)) - kvm_flush_remote_tlbs_with_address(vcpu->kvm, sp->gfn, - KVM_PAGES_PER_HPAGE(iter->level + 1)); + (!is_last_spte(iter->old_spte, iter->level) || + WARN_ON_ONCE(leaf_spte_change_needs_tlb_flush(iter->old_spte, new_spte)))) + kvm_flush_remote_tlbs_gfn(vcpu->kvm, iter->gfn, iter->level); /* * If the page fault was caused by a write but the page is write * protected, emulation is needed. If the emulation was skipped, * the vCPU would have the same fault again. */ - if (wrprot) { - if (fault->write) - ret = RET_PF_EMULATE; - } + if (wrprot && fault->write) + ret = RET_PF_WRITE_PROTECTED; /* If a MMIO SPTE is installed, the MMIO will need to be emulated. */ - if (unlikely(is_mmio_spte(new_spte))) { + if (unlikely(is_mmio_spte(vcpu->kvm, new_spte))) { vcpu->stat.pf_mmio_spte_created++; trace_mark_mmio_spte(rcu_dereference(iter->sptep), iter->gfn, new_spte); @@ -1119,18 +1205,15 @@ static int tdp_mmu_map_handle_target_level(struct kvm_vcpu *vcpu, * @kvm: kvm instance * @iter: a tdp_iter instance currently on the SPTE that should be set * @sp: The new TDP page table to install. - * @account_nx: True if this page table is being installed to split a - * non-executable huge page. * @shared: This operation is running under the MMU lock in read mode. * * Returns: 0 if the new page table was installed. Non-0 if the page table * could not be installed (e.g. the atomic compare-exchange failed). */ static int tdp_mmu_link_sp(struct kvm *kvm, struct tdp_iter *iter, - struct kvm_mmu_page *sp, bool account_nx, - bool shared) + struct kvm_mmu_page *sp, bool shared) { - u64 spte = make_nonleaf_spte(sp->spt, !kvm_ad_enabled()); + u64 spte = make_nonleaf_spte(sp->spt, !kvm_ad_enabled); int ret = 0; if (shared) { @@ -1138,28 +1221,28 @@ static int tdp_mmu_link_sp(struct kvm *kvm, struct tdp_iter *iter, if (ret) return ret; } else { - tdp_mmu_set_spte(kvm, iter, spte); + tdp_mmu_iter_set_spte(kvm, iter, spte); } - spin_lock(&kvm->arch.tdp_mmu_pages_lock); - list_add(&sp->link, &kvm->arch.tdp_mmu_pages); - if (account_nx) - account_huge_nx_page(kvm, sp); - spin_unlock(&kvm->arch.tdp_mmu_pages_lock); + tdp_account_mmu_page(kvm, sp); return 0; } +static int tdp_mmu_split_huge_page(struct kvm *kvm, struct tdp_iter *iter, + struct kvm_mmu_page *sp, bool shared); + /* * Handle a TDP page fault (NPT/EPT violation/misconfiguration) by installing * page tables and SPTEs to translate the faulting guest physical address. */ int kvm_tdp_mmu_map(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault) { - struct kvm_mmu *mmu = vcpu->arch.mmu; + struct kvm_mmu_page *root = tdp_mmu_get_root_for_fault(vcpu, fault); + struct kvm *kvm = vcpu->kvm; struct tdp_iter iter; struct kvm_mmu_page *sp; - int ret; + int ret = RET_PF_RETRY; kvm_mmu_hugepage_adjust(vcpu, fault); @@ -1167,195 +1250,169 @@ int kvm_tdp_mmu_map(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault) rcu_read_lock(); - tdp_mmu_for_each_pte(iter, mmu, fault->gfn, fault->gfn + 1) { + for_each_tdp_pte(iter, kvm, root, fault->gfn, fault->gfn + 1) { + int r; + if (fault->nx_huge_page_workaround_enabled) disallowed_hugepage_adjust(fault, iter.old_spte, iter.level); - if (iter.level == fault->goal_level) - break; - /* - * If there is an SPTE mapping a large page at a higher level - * than the target, that SPTE must be cleared and replaced - * with a non-leaf SPTE. + * If SPTE has been frozen by another thread, just give up and + * retry, avoiding unnecessary page table allocation and free. */ + if (is_frozen_spte(iter.old_spte)) + goto retry; + + if (iter.level == fault->goal_level) + goto map_target_level; + + /* Step down into the lower level page table if it exists. */ if (is_shadow_present_pte(iter.old_spte) && - is_large_pte(iter.old_spte)) { - if (tdp_mmu_zap_spte_atomic(vcpu->kvm, &iter)) - break; + !is_large_pte(iter.old_spte)) + continue; - /* - * The iter must explicitly re-read the spte here - * because the new value informs the !present - * path below. - */ - iter.old_spte = kvm_tdp_mmu_read_spte(iter.sptep); - } + /* + * The SPTE is either non-present or points to a huge page that + * needs to be split. + */ + sp = tdp_mmu_alloc_sp(vcpu); + tdp_mmu_init_child_sp(sp, &iter); + if (is_mirror_sp(sp)) + kvm_mmu_alloc_external_spt(vcpu, sp); - if (!is_shadow_present_pte(iter.old_spte)) { - bool account_nx = fault->huge_page_disallowed && - fault->req_level >= iter.level; + sp->nx_huge_page_disallowed = fault->huge_page_disallowed; - /* - * If SPTE has been frozen by another thread, just - * give up and retry, avoiding unnecessary page table - * allocation and free. - */ - if (is_removed_spte(iter.old_spte)) - break; + if (is_shadow_present_pte(iter.old_spte)) { + /* Don't support large page for mirrored roots (TDX) */ + KVM_BUG_ON(is_mirror_sptep(iter.sptep), vcpu->kvm); + r = tdp_mmu_split_huge_page(kvm, &iter, sp, true); + } else { + r = tdp_mmu_link_sp(kvm, &iter, sp, true); + } - sp = tdp_mmu_alloc_sp(vcpu); - tdp_mmu_init_child_sp(sp, &iter); + /* + * Force the guest to retry if installing an upper level SPTE + * failed, e.g. because a different task modified the SPTE. + */ + if (r) { + tdp_mmu_free_sp(sp); + goto retry; + } - if (tdp_mmu_link_sp(vcpu->kvm, &iter, sp, account_nx, true)) { - tdp_mmu_free_sp(sp); - break; - } + if (fault->huge_page_disallowed && + fault->req_level >= iter.level) { + spin_lock(&kvm->arch.tdp_mmu_pages_lock); + if (sp->nx_huge_page_disallowed) + track_possible_nx_huge_page(kvm, sp); + spin_unlock(&kvm->arch.tdp_mmu_pages_lock); } } /* - * Force the guest to retry the access if the upper level SPTEs aren't - * in place, or if the target leaf SPTE is frozen by another CPU. + * The walk aborted before reaching the target level, e.g. because the + * iterator detected an upper level SPTE was frozen during traversal. */ - if (iter.level != fault->goal_level || is_removed_spte(iter.old_spte)) { - rcu_read_unlock(); - return RET_PF_RETRY; - } + WARN_ON_ONCE(iter.level == fault->goal_level); + goto retry; +map_target_level: ret = tdp_mmu_map_handle_target_level(vcpu, fault, &iter); - rcu_read_unlock(); +retry: + rcu_read_unlock(); return ret; } +/* Used by mmu notifier via kvm_unmap_gfn_range() */ bool kvm_tdp_mmu_unmap_gfn_range(struct kvm *kvm, struct kvm_gfn_range *range, bool flush) { - return kvm_tdp_mmu_zap_leafs(kvm, range->slot->as_id, range->start, - range->end, range->may_block, flush); -} - -typedef bool (*tdp_handler_t)(struct kvm *kvm, struct tdp_iter *iter, - struct kvm_gfn_range *range); - -static __always_inline bool kvm_tdp_mmu_handle_gfn(struct kvm *kvm, - struct kvm_gfn_range *range, - tdp_handler_t handler) -{ + enum kvm_tdp_mmu_root_types types; struct kvm_mmu_page *root; - struct tdp_iter iter; - bool ret = false; - - /* - * Don't support rescheduling, none of the MMU notifiers that funnel - * into this helper allow blocking; it'd be dead, wasteful code. - */ - for_each_tdp_mmu_root(kvm, root, range->slot->as_id) { - rcu_read_lock(); - tdp_root_for_each_leaf_pte(iter, root, range->start, range->end) - ret |= handler(kvm, &iter, range); + types = kvm_gfn_range_filter_to_root_types(kvm, range->attr_filter) | KVM_INVALID_ROOTS; - rcu_read_unlock(); - } + __for_each_tdp_mmu_root_yield_safe(kvm, root, range->slot->as_id, types) + flush = tdp_mmu_zap_leafs(kvm, root, range->start, range->end, + range->may_block, flush); - return ret; + return flush; } /* * Mark the SPTEs range of GFNs [start, end) unaccessed and return non-zero * if any of the GFNs in the range have been accessed. + * + * No need to mark the corresponding PFN as accessed as this call is coming + * from the clear_young() or clear_flush_young() notifier, which uses the + * return value to determine if the page has been accessed. */ -static bool age_gfn_range(struct kvm *kvm, struct tdp_iter *iter, - struct kvm_gfn_range *range) +static void kvm_tdp_mmu_age_spte(struct kvm *kvm, struct tdp_iter *iter) { - u64 new_spte = 0; - - /* If we have a non-accessed entry we don't need to change the pte. */ - if (!is_accessed_spte(iter->old_spte)) - return false; - - new_spte = iter->old_spte; + u64 new_spte; - if (spte_ad_enabled(new_spte)) { - new_spte &= ~shadow_accessed_mask; + if (spte_ad_enabled(iter->old_spte)) { + iter->old_spte = tdp_mmu_clear_spte_bits_atomic(iter->sptep, + shadow_accessed_mask); + new_spte = iter->old_spte & ~shadow_accessed_mask; } else { + new_spte = mark_spte_for_access_track(iter->old_spte); /* - * Capture the dirty status of the page, so that it doesn't get - * lost when the SPTE is marked for access tracking. + * It is safe for the following cmpxchg to fail. Leave the + * Accessed bit set, as the spte is most likely young anyway. */ - if (is_writable_pte(new_spte)) - kvm_set_pfn_dirty(spte_to_pfn(new_spte)); - - new_spte = mark_spte_for_access_track(new_spte); + if (__tdp_mmu_set_spte_atomic(kvm, iter, new_spte)) + return; } - tdp_mmu_set_spte_no_acc_track(kvm, iter, new_spte); - - return true; -} - -bool kvm_tdp_mmu_age_gfn_range(struct kvm *kvm, struct kvm_gfn_range *range) -{ - return kvm_tdp_mmu_handle_gfn(kvm, range, age_gfn_range); -} - -static bool test_age_gfn(struct kvm *kvm, struct tdp_iter *iter, - struct kvm_gfn_range *range) -{ - return is_accessed_spte(iter->old_spte); -} - -bool kvm_tdp_mmu_test_age_gfn(struct kvm *kvm, struct kvm_gfn_range *range) -{ - return kvm_tdp_mmu_handle_gfn(kvm, range, test_age_gfn); + trace_kvm_tdp_mmu_spte_changed(iter->as_id, iter->gfn, iter->level, + iter->old_spte, new_spte); } -static bool set_spte_gfn(struct kvm *kvm, struct tdp_iter *iter, - struct kvm_gfn_range *range) +static bool __kvm_tdp_mmu_age_gfn_range(struct kvm *kvm, + struct kvm_gfn_range *range, + bool test_only) { - u64 new_spte; - - /* Huge pages aren't expected to be modified without first being zapped. */ - WARN_ON(pte_huge(range->pte) || range->start + 1 != range->end); + enum kvm_tdp_mmu_root_types types; + struct kvm_mmu_page *root; + struct tdp_iter iter; + bool ret = false; - if (iter->level != PG_LEVEL_4K || - !is_shadow_present_pte(iter->old_spte)) - return false; + types = kvm_gfn_range_filter_to_root_types(kvm, range->attr_filter); /* - * Note, when changing a read-only SPTE, it's not strictly necessary to - * zero the SPTE before setting the new PFN, but doing so preserves the - * invariant that the PFN of a present * leaf SPTE can never change. - * See __handle_changed_spte(). + * Don't support rescheduling, none of the MMU notifiers that funnel + * into this helper allow blocking; it'd be dead, wasteful code. Note, + * this helper must NOT be used to unmap GFNs, as it processes only + * valid roots! */ - tdp_mmu_set_spte(kvm, iter, 0); + WARN_ON(types & ~KVM_VALID_ROOTS); - if (!pte_write(range->pte)) { - new_spte = kvm_mmu_changed_pte_notifier_make_spte(iter->old_spte, - pte_pfn(range->pte)); + guard(rcu)(); + for_each_tdp_mmu_root_rcu(kvm, root, range->slot->as_id, types) { + tdp_root_for_each_leaf_pte(iter, kvm, root, range->start, range->end) { + if (!is_accessed_spte(iter.old_spte)) + continue; + + if (test_only) + return true; - tdp_mmu_set_spte(kvm, iter, new_spte); + ret = true; + kvm_tdp_mmu_age_spte(kvm, &iter); + } } - return true; + return ret; } -/* - * Handle the changed_pte MMU notifier for the TDP MMU. - * data is a pointer to the new pte_t mapping the HVA specified by the MMU - * notifier. - * Returns non-zero if a flush is needed before releasing the MMU lock. - */ -bool kvm_tdp_mmu_set_spte_gfn(struct kvm *kvm, struct kvm_gfn_range *range) +bool kvm_tdp_mmu_age_gfn_range(struct kvm *kvm, struct kvm_gfn_range *range) { - /* - * No need to handle the remote TLB flush under RCU protection, the - * target SPTE _must_ be a leaf SPTE, i.e. cannot result in freeing a - * shadow page. See the WARN on pfn_changed in __handle_changed_spte(). - */ - return kvm_tdp_mmu_handle_gfn(kvm, range, set_spte_gfn); + return __kvm_tdp_mmu_age_gfn_range(kvm, range, false); +} + +bool kvm_tdp_mmu_test_age_gfn(struct kvm *kvm, struct kvm_gfn_range *range) +{ + return __kvm_tdp_mmu_age_gfn_range(kvm, range, true); } /* @@ -1374,7 +1431,7 @@ static bool wrprot_gfn_range(struct kvm *kvm, struct kvm_mmu_page *root, BUG_ON(min_level > KVM_MAX_HUGEPAGE_LEVEL); - for_each_tdp_pte_min_level(iter, root, min_level, start, end) { + for_each_tdp_pte_min_level(iter, kvm, root, min_level, start, end) { retry: if (tdp_mmu_iter_cond_resched(kvm, &iter, false, true)) continue; @@ -1409,24 +1466,22 @@ bool kvm_tdp_mmu_wrprot_slot(struct kvm *kvm, lockdep_assert_held_read(&kvm->mmu_lock); - for_each_valid_tdp_mmu_root_yield_safe(kvm, root, slot->as_id, true) + for_each_valid_tdp_mmu_root_yield_safe(kvm, root, slot->as_id) spte_set |= wrprot_gfn_range(kvm, root, slot->base_gfn, slot->base_gfn + slot->npages, min_level); return spte_set; } -static struct kvm_mmu_page *__tdp_mmu_alloc_sp_for_split(gfp_t gfp) +static struct kvm_mmu_page *tdp_mmu_alloc_sp_for_split(void) { struct kvm_mmu_page *sp; - gfp |= __GFP_ZERO; - - sp = kmem_cache_alloc(mmu_page_header_cache, gfp); + sp = kmem_cache_zalloc(mmu_page_header_cache, GFP_KERNEL_ACCOUNT); if (!sp) return NULL; - sp->spt = (void *)__get_free_page(gfp); + sp->spt = (void *)get_zeroed_page(GFP_KERNEL_ACCOUNT); if (!sp->spt) { kmem_cache_free(mmu_page_header_cache, sp); return NULL; @@ -1435,45 +1490,7 @@ static struct kvm_mmu_page *__tdp_mmu_alloc_sp_for_split(gfp_t gfp) return sp; } -static struct kvm_mmu_page *tdp_mmu_alloc_sp_for_split(struct kvm *kvm, - struct tdp_iter *iter, - bool shared) -{ - struct kvm_mmu_page *sp; - - /* - * Since we are allocating while under the MMU lock we have to be - * careful about GFP flags. Use GFP_NOWAIT to avoid blocking on direct - * reclaim and to avoid making any filesystem callbacks (which can end - * up invoking KVM MMU notifiers, resulting in a deadlock). - * - * If this allocation fails we drop the lock and retry with reclaim - * allowed. - */ - sp = __tdp_mmu_alloc_sp_for_split(GFP_NOWAIT | __GFP_ACCOUNT); - if (sp) - return sp; - - rcu_read_unlock(); - - if (shared) - read_unlock(&kvm->mmu_lock); - else - write_unlock(&kvm->mmu_lock); - - iter->yielded = true; - sp = __tdp_mmu_alloc_sp_for_split(GFP_KERNEL_ACCOUNT); - - if (shared) - read_lock(&kvm->mmu_lock); - else - write_lock(&kvm->mmu_lock); - - rcu_read_lock(); - - return sp; -} - +/* Note, the caller is responsible for initializing @sp. */ static int tdp_mmu_split_huge_page(struct kvm *kvm, struct tdp_iter *iter, struct kvm_mmu_page *sp, bool shared) { @@ -1481,14 +1498,12 @@ static int tdp_mmu_split_huge_page(struct kvm *kvm, struct tdp_iter *iter, const int level = iter->level; int ret, i; - tdp_mmu_init_child_sp(sp, iter); - /* * No need for atomics when writing to sp->spt since the page table has * not been linked in yet and thus is not reachable from any other CPU. */ - for (i = 0; i < PT64_ENT_PER_PAGE; i++) - sp->spt[i] = make_huge_page_split_spte(huge_spte, level, i); + for (i = 0; i < SPTE_ENT_PER_PAGE; i++) + sp->spt[i] = make_small_spte(kvm, huge_spte, sp->role, i); /* * Replace the huge spte with a pointer to the populated lower level @@ -1498,7 +1513,7 @@ static int tdp_mmu_split_huge_page(struct kvm *kvm, struct tdp_iter *iter, * correctness standpoint since the translation will be the same either * way. */ - ret = tdp_mmu_link_sp(kvm, iter, sp, false, shared); + ret = tdp_mmu_link_sp(kvm, iter, sp, shared); if (ret) goto out; @@ -1507,7 +1522,7 @@ static int tdp_mmu_split_huge_page(struct kvm *kvm, struct tdp_iter *iter, * are overwriting from the page stats. But we have to manually update * the page stats with the new present child pages. */ - kvm_update_page_stats(kvm, level - 1, PT64_ENT_PER_PAGE); + kvm_update_page_stats(kvm, level - 1, SPTE_ENT_PER_PAGE); out: trace_kvm_mmu_split_huge_page(iter->gfn, huge_spte, level, ret); @@ -1521,7 +1536,6 @@ static int tdp_mmu_split_huge_pages_root(struct kvm *kvm, { struct kvm_mmu_page *sp = NULL; struct tdp_iter iter; - int ret = 0; rcu_read_lock(); @@ -1536,7 +1550,7 @@ static int tdp_mmu_split_huge_pages_root(struct kvm *kvm, * level above the target level (e.g. splitting a 1GB to 512 2MB pages, * and then splitting each of those to 512 4KB pages). */ - for_each_tdp_pte_min_level(iter, root, target_level + 1, start, end) { + for_each_tdp_pte_min_level(iter, kvm, root, target_level + 1, start, end) { retry: if (tdp_mmu_iter_cond_resched(kvm, &iter, false, shared)) continue; @@ -1545,19 +1559,35 @@ retry: continue; if (!sp) { - sp = tdp_mmu_alloc_sp_for_split(kvm, &iter, shared); + rcu_read_unlock(); + + if (shared) + read_unlock(&kvm->mmu_lock); + else + write_unlock(&kvm->mmu_lock); + + sp = tdp_mmu_alloc_sp_for_split(); + + if (shared) + read_lock(&kvm->mmu_lock); + else + write_lock(&kvm->mmu_lock); + if (!sp) { - ret = -ENOMEM; trace_kvm_mmu_split_huge_page(iter.gfn, iter.old_spte, - iter.level, ret); - break; + iter.level, -ENOMEM); + return -ENOMEM; } - if (iter.yielded) - continue; + rcu_read_lock(); + + iter.yielded = true; + continue; } + tdp_mmu_init_child_sp(sp, &iter); + if (tdp_mmu_split_huge_page(kvm, &iter, sp, shared)) goto retry; @@ -1574,7 +1604,7 @@ retry: if (sp) tdp_mmu_free_sp(sp); - return ret; + return 0; } @@ -1590,134 +1620,115 @@ void kvm_tdp_mmu_try_split_huge_pages(struct kvm *kvm, int r = 0; kvm_lockdep_assert_mmu_lock_held(kvm, shared); - - for_each_valid_tdp_mmu_root_yield_safe(kvm, root, slot->as_id, shared) { + for_each_valid_tdp_mmu_root_yield_safe(kvm, root, slot->as_id) { r = tdp_mmu_split_huge_pages_root(kvm, root, start, end, target_level, shared); if (r) { - kvm_tdp_mmu_put_root(kvm, root, shared); + kvm_tdp_mmu_put_root(kvm, root); break; } } } -/* - * Clear the dirty status of all the SPTEs mapping GFNs in the memslot. If - * AD bits are enabled, this will involve clearing the dirty bit on each SPTE. - * If AD bits are not enabled, this will require clearing the writable bit on - * each SPTE. Returns true if an SPTE has been changed and the TLBs need to - * be flushed. - */ -static bool clear_dirty_gfn_range(struct kvm *kvm, struct kvm_mmu_page *root, - gfn_t start, gfn_t end) +static bool tdp_mmu_need_write_protect(struct kvm *kvm, struct kvm_mmu_page *sp) +{ + /* + * All TDP MMU shadow pages share the same role as their root, aside + * from level, so it is valid to key off any shadow page to determine if + * write protection is needed for an entire tree. + */ + return kvm_mmu_page_ad_need_write_protect(kvm, sp) || !kvm_ad_enabled; +} + +static void clear_dirty_gfn_range(struct kvm *kvm, struct kvm_mmu_page *root, + gfn_t start, gfn_t end) { + const u64 dbit = tdp_mmu_need_write_protect(kvm, root) ? + PT_WRITABLE_MASK : shadow_dirty_mask; struct tdp_iter iter; - u64 new_spte; - bool spte_set = false; rcu_read_lock(); - tdp_root_for_each_leaf_pte(iter, root, start, end) { + tdp_root_for_each_pte(iter, kvm, root, start, end) { retry: - if (tdp_mmu_iter_cond_resched(kvm, &iter, false, true)) + if (!is_shadow_present_pte(iter.old_spte) || + !is_last_spte(iter.old_spte, iter.level)) continue; - if (!is_shadow_present_pte(iter.old_spte)) + if (tdp_mmu_iter_cond_resched(kvm, &iter, false, true)) continue; - if (spte_ad_need_write_protect(iter.old_spte)) { - if (is_writable_pte(iter.old_spte)) - new_spte = iter.old_spte & ~PT_WRITABLE_MASK; - else - continue; - } else { - if (iter.old_spte & shadow_dirty_mask) - new_spte = iter.old_spte & ~shadow_dirty_mask; - else - continue; - } + KVM_MMU_WARN_ON(dbit == shadow_dirty_mask && + spte_ad_need_write_protect(iter.old_spte)); - if (tdp_mmu_set_spte_atomic(kvm, &iter, new_spte)) - goto retry; + if (!(iter.old_spte & dbit)) + continue; - spte_set = true; + if (tdp_mmu_set_spte_atomic(kvm, &iter, iter.old_spte & ~dbit)) + goto retry; } rcu_read_unlock(); - return spte_set; } /* - * Clear the dirty status of all the SPTEs mapping GFNs in the memslot. If - * AD bits are enabled, this will involve clearing the dirty bit on each SPTE. - * If AD bits are not enabled, this will require clearing the writable bit on - * each SPTE. Returns true if an SPTE has been changed and the TLBs need to - * be flushed. + * Clear the dirty status (D-bit or W-bit) of all the SPTEs mapping GFNs in the + * memslot. */ -bool kvm_tdp_mmu_clear_dirty_slot(struct kvm *kvm, +void kvm_tdp_mmu_clear_dirty_slot(struct kvm *kvm, const struct kvm_memory_slot *slot) { struct kvm_mmu_page *root; - bool spte_set = false; lockdep_assert_held_read(&kvm->mmu_lock); - - for_each_valid_tdp_mmu_root_yield_safe(kvm, root, slot->as_id, true) - spte_set |= clear_dirty_gfn_range(kvm, root, slot->base_gfn, - slot->base_gfn + slot->npages); - - return spte_set; + for_each_valid_tdp_mmu_root_yield_safe(kvm, root, slot->as_id) + clear_dirty_gfn_range(kvm, root, slot->base_gfn, + slot->base_gfn + slot->npages); } -/* - * Clears the dirty status of all the 4k SPTEs mapping GFNs for which a bit is - * set in mask, starting at gfn. The given memslot is expected to contain all - * the GFNs represented by set bits in the mask. If AD bits are enabled, - * clearing the dirty status will involve clearing the dirty bit on each SPTE - * or, if AD bits are not enabled, clearing the writable bit on each SPTE. - */ static void clear_dirty_pt_masked(struct kvm *kvm, struct kvm_mmu_page *root, gfn_t gfn, unsigned long mask, bool wrprot) { + const u64 dbit = (wrprot || tdp_mmu_need_write_protect(kvm, root)) ? + PT_WRITABLE_MASK : shadow_dirty_mask; struct tdp_iter iter; - u64 new_spte; + + lockdep_assert_held_write(&kvm->mmu_lock); rcu_read_lock(); - tdp_root_for_each_leaf_pte(iter, root, gfn + __ffs(mask), + tdp_root_for_each_leaf_pte(iter, kvm, root, gfn + __ffs(mask), gfn + BITS_PER_LONG) { if (!mask) break; + KVM_MMU_WARN_ON(dbit == shadow_dirty_mask && + spte_ad_need_write_protect(iter.old_spte)); + if (iter.level > PG_LEVEL_4K || !(mask & (1UL << (iter.gfn - gfn)))) continue; mask &= ~(1UL << (iter.gfn - gfn)); - if (wrprot || spte_ad_need_write_protect(iter.old_spte)) { - if (is_writable_pte(iter.old_spte)) - new_spte = iter.old_spte & ~PT_WRITABLE_MASK; - else - continue; - } else { - if (iter.old_spte & shadow_dirty_mask) - new_spte = iter.old_spte & ~shadow_dirty_mask; - else - continue; - } + if (!(iter.old_spte & dbit)) + continue; + + iter.old_spte = tdp_mmu_clear_spte_bits(iter.sptep, + iter.old_spte, dbit, + iter.level); - tdp_mmu_set_spte_no_dirty_log(kvm, &iter, new_spte); + trace_kvm_tdp_mmu_spte_changed(iter.as_id, iter.gfn, iter.level, + iter.old_spte, + iter.old_spte & ~dbit); } rcu_read_unlock(); } /* - * Clears the dirty status of all the 4k SPTEs mapping GFNs for which a bit is - * set in mask, starting at gfn. The given memslot is expected to contain all - * the GFNs represented by set bits in the mask. If AD bits are enabled, - * clearing the dirty status will involve clearing the dirty bit on each SPTE - * or, if AD bits are not enabled, clearing the writable bit on each SPTE. + * Clear the dirty status (D-bit or W-bit) of all the 4k SPTEs mapping GFNs for + * which a bit is set in mask, starting at gfn. The given memslot is expected to + * contain all the GFNs represented by set bits in the mask. */ void kvm_tdp_mmu_clear_dirty_pt_masked(struct kvm *kvm, struct kvm_memory_slot *slot, @@ -1726,88 +1737,116 @@ void kvm_tdp_mmu_clear_dirty_pt_masked(struct kvm *kvm, { struct kvm_mmu_page *root; - lockdep_assert_held_write(&kvm->mmu_lock); - for_each_tdp_mmu_root(kvm, root, slot->as_id) + for_each_valid_tdp_mmu_root(kvm, root, slot->as_id) clear_dirty_pt_masked(kvm, root, gfn, mask, wrprot); } -/* - * Clear leaf entries which could be replaced by large mappings, for - * GFNs within the slot. - */ -static void zap_collapsible_spte_range(struct kvm *kvm, - struct kvm_mmu_page *root, - const struct kvm_memory_slot *slot) +static int tdp_mmu_make_huge_spte(struct kvm *kvm, + struct tdp_iter *parent, + u64 *huge_spte) +{ + struct kvm_mmu_page *root = spte_to_child_sp(parent->old_spte); + gfn_t start = parent->gfn; + gfn_t end = start + KVM_PAGES_PER_HPAGE(parent->level); + struct tdp_iter iter; + + tdp_root_for_each_leaf_pte(iter, kvm, root, start, end) { + /* + * Use the parent iterator when checking for forward progress so + * that KVM doesn't get stuck continuously trying to yield (i.e. + * returning -EAGAIN here and then failing the forward progress + * check in the caller ad nauseam). + */ + if (tdp_mmu_iter_need_resched(kvm, parent)) + return -EAGAIN; + + *huge_spte = make_huge_spte(kvm, iter.old_spte, parent->level); + return 0; + } + + return -ENOENT; +} + +static void recover_huge_pages_range(struct kvm *kvm, + struct kvm_mmu_page *root, + const struct kvm_memory_slot *slot) { gfn_t start = slot->base_gfn; gfn_t end = start + slot->npages; struct tdp_iter iter; int max_mapping_level; - kvm_pfn_t pfn; + bool flush = false; + u64 huge_spte; + int r; + + if (WARN_ON_ONCE(kvm_slot_dirty_track_enabled(slot))) + return; rcu_read_lock(); - tdp_root_for_each_pte(iter, root, start, end) { - if (tdp_mmu_iter_cond_resched(kvm, &iter, false, true)) + for_each_tdp_pte_min_level(iter, kvm, root, PG_LEVEL_2M, start, end) { +retry: + if (tdp_mmu_iter_cond_resched(kvm, &iter, flush, true)) { + flush = false; continue; + } - if (!is_shadow_present_pte(iter.old_spte) || - !is_last_spte(iter.old_spte, iter.level)) + if (iter.level > KVM_MAX_HUGEPAGE_LEVEL || + !is_shadow_present_pte(iter.old_spte)) continue; /* - * This is a leaf SPTE. Check if the PFN it maps can - * be mapped at a higher level. + * Don't zap leaf SPTEs, if a leaf SPTE could be replaced with + * a large page size, then its parent would have been zapped + * instead of stepping down. */ - pfn = spte_to_pfn(iter.old_spte); - - if (kvm_is_reserved_pfn(pfn)) + if (is_last_spte(iter.old_spte, iter.level)) continue; - max_mapping_level = kvm_mmu_max_mapping_level(kvm, slot, - iter.gfn, pfn, PG_LEVEL_NUM); - - WARN_ON(max_mapping_level < iter.level); - /* - * If this page is already mapped at the highest - * viable level, there's nothing more to do. + * If iter.gfn resides outside of the slot, i.e. the page for + * the current level overlaps but is not contained by the slot, + * then the SPTE can't be made huge. More importantly, trying + * to query that info from slot->arch.lpage_info will cause an + * out-of-bounds access. */ - if (max_mapping_level == iter.level) + if (iter.gfn < start || iter.gfn >= end) continue; - /* - * The page can be remapped at a higher level, so step - * up to zap the parent SPTE. - */ - while (max_mapping_level > iter.level) - tdp_iter_step_up(&iter); + max_mapping_level = kvm_mmu_max_mapping_level(kvm, slot, iter.gfn); + if (max_mapping_level < iter.level) + continue; - /* Note, a successful atomic zap also does a remote TLB flush. */ - tdp_mmu_zap_spte_atomic(kvm, &iter); + r = tdp_mmu_make_huge_spte(kvm, &iter, &huge_spte); + if (r == -EAGAIN) + goto retry; + else if (r) + continue; - /* - * If the atomic zap fails, the iter will recurse back into - * the same subtree to retry. - */ + if (tdp_mmu_set_spte_atomic(kvm, &iter, huge_spte)) + goto retry; + + flush = true; } + if (flush) + kvm_flush_remote_tlbs_memslot(kvm, slot); + rcu_read_unlock(); } /* - * Clear non-leaf entries (and free associated page tables) which could - * be replaced by large mappings, for GFNs within the slot. + * Recover huge page mappings within the slot by replacing non-leaf SPTEs with + * huge SPTEs where possible. */ -void kvm_tdp_mmu_zap_collapsible_sptes(struct kvm *kvm, - const struct kvm_memory_slot *slot) +void kvm_tdp_mmu_recover_huge_pages(struct kvm *kvm, + const struct kvm_memory_slot *slot) { struct kvm_mmu_page *root; lockdep_assert_held_read(&kvm->mmu_lock); - - for_each_valid_tdp_mmu_root_yield_safe(kvm, root, slot->as_id, true) - zap_collapsible_spte_range(kvm, root, slot); + for_each_valid_tdp_mmu_root_yield_safe(kvm, root, slot->as_id) + recover_huge_pages_range(kvm, root, slot); } /* @@ -1826,7 +1865,7 @@ static bool write_protect_gfn(struct kvm *kvm, struct kvm_mmu_page *root, rcu_read_lock(); - for_each_tdp_pte_min_level(iter, root, min_level, gfn, gfn + 1) { + for_each_tdp_pte_min_level(iter, kvm, root, min_level, gfn, gfn + 1) { if (!is_shadow_present_pte(iter.old_spte) || !is_last_spte(iter.old_spte, iter.level)) continue; @@ -1837,7 +1876,7 @@ static bool write_protect_gfn(struct kvm *kvm, struct kvm_mmu_page *root, if (new_spte == iter.old_spte) break; - tdp_mmu_set_spte(kvm, &iter, new_spte); + tdp_mmu_iter_set_spte(kvm, &iter, new_spte); spte_set = true; } @@ -1859,7 +1898,7 @@ bool kvm_tdp_mmu_write_protect_gfn(struct kvm *kvm, bool spte_set = false; lockdep_assert_held_write(&kvm->mmu_lock); - for_each_tdp_mmu_root(kvm, root, slot->as_id) + for_each_valid_tdp_mmu_root(kvm, root, slot->as_id) spte_set |= write_protect_gfn(kvm, root, gfn, min_level); return spte_set; @@ -1871,17 +1910,14 @@ bool kvm_tdp_mmu_write_protect_gfn(struct kvm *kvm, * * Must be called between kvm_tdp_mmu_walk_lockless_{begin,end}. */ -int kvm_tdp_mmu_get_walk(struct kvm_vcpu *vcpu, u64 addr, u64 *sptes, - int *root_level) +static int __kvm_tdp_mmu_get_walk(struct kvm_vcpu *vcpu, u64 addr, u64 *sptes, + struct kvm_mmu_page *root) { struct tdp_iter iter; - struct kvm_mmu *mmu = vcpu->arch.mmu; gfn_t gfn = addr >> PAGE_SHIFT; int leaf = -1; - *root_level = vcpu->arch.mmu->root_role.level; - - tdp_mmu_for_each_pte(iter, mmu, gfn, gfn + 1) { + for_each_tdp_pte(iter, vcpu->kvm, root, gfn, gfn + 1) { leaf = iter.level; sptes[leaf] = iter.old_spte; } @@ -1889,6 +1925,36 @@ int kvm_tdp_mmu_get_walk(struct kvm_vcpu *vcpu, u64 addr, u64 *sptes, return leaf; } +int kvm_tdp_mmu_get_walk(struct kvm_vcpu *vcpu, u64 addr, u64 *sptes, + int *root_level) +{ + struct kvm_mmu_page *root = root_to_sp(vcpu->arch.mmu->root.hpa); + *root_level = vcpu->arch.mmu->root_role.level; + + return __kvm_tdp_mmu_get_walk(vcpu, addr, sptes, root); +} + +bool kvm_tdp_mmu_gpa_is_mapped(struct kvm_vcpu *vcpu, u64 gpa) +{ + struct kvm *kvm = vcpu->kvm; + bool is_direct = kvm_is_addr_direct(kvm, gpa); + hpa_t root = is_direct ? vcpu->arch.mmu->root.hpa : + vcpu->arch.mmu->mirror_root_hpa; + u64 sptes[PT64_ROOT_MAX_LEVEL + 1], spte; + int leaf; + + lockdep_assert_held(&kvm->mmu_lock); + rcu_read_lock(); + leaf = __kvm_tdp_mmu_get_walk(vcpu, gpa, sptes, root_to_sp(root)); + rcu_read_unlock(); + if (leaf < 0) + return false; + + spte = sptes[leaf]; + return is_shadow_present_pte(spte) && is_last_spte(spte, leaf); +} +EXPORT_SYMBOL_GPL(kvm_tdp_mmu_gpa_is_mapped); + /* * Returns the last level spte pointer of the shadow page walk for the given * gpa, and sets *spte to the spte value. This spte may be non-preset. If no @@ -1900,15 +1966,15 @@ int kvm_tdp_mmu_get_walk(struct kvm_vcpu *vcpu, u64 addr, u64 *sptes, * * WARNING: This function is only intended to be called during fast_page_fault. */ -u64 *kvm_tdp_mmu_fast_pf_get_last_sptep(struct kvm_vcpu *vcpu, u64 addr, +u64 *kvm_tdp_mmu_fast_pf_get_last_sptep(struct kvm_vcpu *vcpu, gfn_t gfn, u64 *spte) { + /* Fast pf is not supported for mirrored roots */ + struct kvm_mmu_page *root = tdp_mmu_get_root(vcpu, KVM_DIRECT_ROOTS); struct tdp_iter iter; - struct kvm_mmu *mmu = vcpu->arch.mmu; - gfn_t gfn = addr >> PAGE_SHIFT; tdp_ptep_t sptep = NULL; - tdp_mmu_for_each_pte(iter, mmu, gfn, gfn + 1) { + for_each_tdp_pte(iter, vcpu->kvm, root, gfn, gfn + 1) { *spte = iter.old_spte; sptep = iter.sptep; } diff --git a/arch/x86/kvm/mmu/tdp_mmu.h b/arch/x86/kvm/mmu/tdp_mmu.h index c163f7cc23ca..52acf99d40a0 100644 --- a/arch/x86/kvm/mmu/tdp_mmu.h +++ b/arch/x86/kvm/mmu/tdp_mmu.h @@ -5,22 +5,70 @@ #include <linux/kvm_host.h> -hpa_t kvm_tdp_mmu_get_vcpu_root_hpa(struct kvm_vcpu *vcpu); +#include "spte.h" + +void kvm_mmu_init_tdp_mmu(struct kvm *kvm); +void kvm_mmu_uninit_tdp_mmu(struct kvm *kvm); + +void kvm_tdp_mmu_alloc_root(struct kvm_vcpu *vcpu, bool private); __must_check static inline bool kvm_tdp_mmu_get_root(struct kvm_mmu_page *root) { return refcount_inc_not_zero(&root->tdp_mmu_root_count); } -void kvm_tdp_mmu_put_root(struct kvm *kvm, struct kvm_mmu_page *root, - bool shared); +void kvm_tdp_mmu_put_root(struct kvm *kvm, struct kvm_mmu_page *root); + +enum kvm_tdp_mmu_root_types { + KVM_INVALID_ROOTS = BIT(0), + KVM_DIRECT_ROOTS = BIT(1), + KVM_MIRROR_ROOTS = BIT(2), + KVM_VALID_ROOTS = KVM_DIRECT_ROOTS | KVM_MIRROR_ROOTS, + KVM_ALL_ROOTS = KVM_VALID_ROOTS | KVM_INVALID_ROOTS, +}; + +static inline enum kvm_tdp_mmu_root_types kvm_gfn_range_filter_to_root_types(struct kvm *kvm, + enum kvm_gfn_range_filter process) +{ + enum kvm_tdp_mmu_root_types ret = 0; + + if (!kvm_has_mirrored_tdp(kvm)) + return KVM_DIRECT_ROOTS; + + if (process & KVM_FILTER_PRIVATE) + ret |= KVM_MIRROR_ROOTS; + if (process & KVM_FILTER_SHARED) + ret |= KVM_DIRECT_ROOTS; + + WARN_ON_ONCE(!ret); -bool kvm_tdp_mmu_zap_leafs(struct kvm *kvm, int as_id, gfn_t start, - gfn_t end, bool can_yield, bool flush); + return ret; +} + +static inline struct kvm_mmu_page *tdp_mmu_get_root_for_fault(struct kvm_vcpu *vcpu, + struct kvm_page_fault *fault) +{ + if (unlikely(!kvm_is_addr_direct(vcpu->kvm, fault->addr))) + return root_to_sp(vcpu->arch.mmu->mirror_root_hpa); + + return root_to_sp(vcpu->arch.mmu->root.hpa); +} + +static inline struct kvm_mmu_page *tdp_mmu_get_root(struct kvm_vcpu *vcpu, + enum kvm_tdp_mmu_root_types type) +{ + if (unlikely(type == KVM_MIRROR_ROOTS)) + return root_to_sp(vcpu->arch.mmu->mirror_root_hpa); + + return root_to_sp(vcpu->arch.mmu->root.hpa); +} + +bool kvm_tdp_mmu_zap_leafs(struct kvm *kvm, gfn_t start, gfn_t end, bool flush); bool kvm_tdp_mmu_zap_sp(struct kvm *kvm, struct kvm_mmu_page *sp); void kvm_tdp_mmu_zap_all(struct kvm *kvm); -void kvm_tdp_mmu_invalidate_all_roots(struct kvm *kvm); -void kvm_tdp_mmu_zap_invalidated_roots(struct kvm *kvm); +void kvm_tdp_mmu_invalidate_roots(struct kvm *kvm, + enum kvm_tdp_mmu_root_types root_types); +void kvm_tdp_mmu_zap_invalidated_roots(struct kvm *kvm, bool shared); int kvm_tdp_mmu_map(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault); @@ -28,18 +76,17 @@ bool kvm_tdp_mmu_unmap_gfn_range(struct kvm *kvm, struct kvm_gfn_range *range, bool flush); bool kvm_tdp_mmu_age_gfn_range(struct kvm *kvm, struct kvm_gfn_range *range); bool kvm_tdp_mmu_test_age_gfn(struct kvm *kvm, struct kvm_gfn_range *range); -bool kvm_tdp_mmu_set_spte_gfn(struct kvm *kvm, struct kvm_gfn_range *range); bool kvm_tdp_mmu_wrprot_slot(struct kvm *kvm, const struct kvm_memory_slot *slot, int min_level); -bool kvm_tdp_mmu_clear_dirty_slot(struct kvm *kvm, +void kvm_tdp_mmu_clear_dirty_slot(struct kvm *kvm, const struct kvm_memory_slot *slot); void kvm_tdp_mmu_clear_dirty_pt_masked(struct kvm *kvm, struct kvm_memory_slot *slot, gfn_t gfn, unsigned long mask, bool wrprot); -void kvm_tdp_mmu_zap_collapsible_sptes(struct kvm *kvm, - const struct kvm_memory_slot *slot); +void kvm_tdp_mmu_recover_huge_pages(struct kvm *kvm, + const struct kvm_memory_slot *slot); bool kvm_tdp_mmu_write_protect_gfn(struct kvm *kvm, struct kvm_memory_slot *slot, gfn_t gfn, @@ -62,35 +109,13 @@ static inline void kvm_tdp_mmu_walk_lockless_end(void) int kvm_tdp_mmu_get_walk(struct kvm_vcpu *vcpu, u64 addr, u64 *sptes, int *root_level); -u64 *kvm_tdp_mmu_fast_pf_get_last_sptep(struct kvm_vcpu *vcpu, u64 addr, +u64 *kvm_tdp_mmu_fast_pf_get_last_sptep(struct kvm_vcpu *vcpu, gfn_t gfn, u64 *spte); #ifdef CONFIG_X86_64 -int kvm_mmu_init_tdp_mmu(struct kvm *kvm); -void kvm_mmu_uninit_tdp_mmu(struct kvm *kvm); static inline bool is_tdp_mmu_page(struct kvm_mmu_page *sp) { return sp->tdp_mmu_page; } - -static inline bool is_tdp_mmu(struct kvm_mmu *mmu) -{ - struct kvm_mmu_page *sp; - hpa_t hpa = mmu->root.hpa; - - if (WARN_ON(!VALID_PAGE(hpa))) - return false; - - /* - * A NULL shadow page is legal when shadowing a non-paging guest with - * PAE paging, as the MMU will be direct with root_hpa pointing at the - * pae_root page, not a shadow page. - */ - sp = to_shadow_page(hpa); - return sp && is_tdp_mmu_page(sp) && sp->root_count; -} #else -static inline int kvm_mmu_init_tdp_mmu(struct kvm *kvm) { return 0; } -static inline void kvm_mmu_uninit_tdp_mmu(struct kvm *kvm) {} static inline bool is_tdp_mmu_page(struct kvm_mmu_page *sp) { return false; } -static inline bool is_tdp_mmu(struct kvm_mmu *mmu) { return false; } #endif #endif /* __KVM_X86_MMU_TDP_MMU_H */ |