aboutsummaryrefslogtreecommitdiffstats
path: root/arch/x86/kvm/mmu/mmu.c
diff options
context:
space:
mode:
Diffstat (limited to 'arch/x86/kvm/mmu/mmu.c')
-rw-r--r--arch/x86/kvm/mmu/mmu.c1003
1 files changed, 316 insertions, 687 deletions
diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c
index 4e03841f053d..5bb1939b65d8 100644
--- a/arch/x86/kvm/mmu/mmu.c
+++ b/arch/x86/kvm/mmu/mmu.c
@@ -19,10 +19,12 @@
#include "ioapic.h"
#include "mmu.h"
#include "mmu_internal.h"
+#include "tdp_mmu.h"
#include "x86.h"
#include "kvm_cache_regs.h"
#include "kvm_emulate.h"
#include "cpuid.h"
+#include "spte.h"
#include <linux/kvm_host.h>
#include <linux/types.h>
@@ -45,7 +47,6 @@
#include <asm/page.h>
#include <asm/memtype.h>
#include <asm/cmpxchg.h>
-#include <asm/e820/api.h>
#include <asm/io.h>
#include <asm/vmx.h>
#include <asm/kvm_page_track.h>
@@ -64,12 +65,12 @@ static uint __read_mostly nx_huge_pages_recovery_ratio = 60;
static int set_nx_huge_pages(const char *val, const struct kernel_param *kp);
static int set_nx_huge_pages_recovery_ratio(const char *val, const struct kernel_param *kp);
-static struct kernel_param_ops nx_huge_pages_ops = {
+static const struct kernel_param_ops nx_huge_pages_ops = {
.set = set_nx_huge_pages,
.get = param_get_bool,
};
-static struct kernel_param_ops nx_huge_pages_recovery_ratio_ops = {
+static const struct kernel_param_ops nx_huge_pages_recovery_ratio_ops = {
.set = set_nx_huge_pages_recovery_ratio,
.get = param_get_uint,
};
@@ -104,45 +105,13 @@ enum {
AUDIT_POST_SYNC
};
-#undef MMU_DEBUG
-
#ifdef MMU_DEBUG
-static bool dbg = 0;
+bool dbg = 0;
module_param(dbg, bool, 0644);
-
-#define pgprintk(x...) do { if (dbg) printk(x); } while (0)
-#define rmap_printk(x...) do { if (dbg) printk(x); } while (0)
-#define MMU_WARN_ON(x) WARN_ON(x)
-#else
-#define pgprintk(x...) do { } while (0)
-#define rmap_printk(x...) do { } while (0)
-#define MMU_WARN_ON(x) do { } while (0)
#endif
#define PTE_PREFETCH_NUM 8
-#define PT_FIRST_AVAIL_BITS_SHIFT 10
-#define PT64_SECOND_AVAIL_BITS_SHIFT 54
-
-/*
- * The mask used to denote special SPTEs, which can be either MMIO SPTEs or
- * Access Tracking SPTEs.
- */
-#define SPTE_SPECIAL_MASK (3ULL << 52)
-#define SPTE_AD_ENABLED_MASK (0ULL << 52)
-#define SPTE_AD_DISABLED_MASK (1ULL << 52)
-#define SPTE_AD_WRPROT_ONLY_MASK (2ULL << 52)
-#define SPTE_MMIO_MASK (3ULL << 52)
-
-#define PT64_LEVEL_BITS 9
-
-#define PT64_LEVEL_SHIFT(level) \
- (PAGE_SHIFT + (level - 1) * PT64_LEVEL_BITS)
-
-#define PT64_INDEX(address, level)\
- (((address) >> PT64_LEVEL_SHIFT(level)) & ((1 << PT64_LEVEL_BITS) - 1))
-
-
#define PT32_LEVEL_BITS 10
#define PT32_LEVEL_SHIFT(level) \
@@ -156,18 +125,6 @@ module_param(dbg, bool, 0644);
(((address) >> PT32_LEVEL_SHIFT(level)) & ((1 << PT32_LEVEL_BITS) - 1))
-#ifdef CONFIG_DYNAMIC_PHYSICAL_MASK
-#define PT64_BASE_ADDR_MASK (physical_mask & ~(u64)(PAGE_SIZE-1))
-#else
-#define PT64_BASE_ADDR_MASK (((1ULL << 52) - 1) & ~(u64)(PAGE_SIZE-1))
-#endif
-#define PT64_LVL_ADDR_MASK(level) \
- (PT64_BASE_ADDR_MASK & ~((1ULL << (PAGE_SHIFT + (((level) - 1) \
- * PT64_LEVEL_BITS))) - 1))
-#define PT64_LVL_OFFSET_MASK(level) \
- (PT64_BASE_ADDR_MASK & ((1ULL << (PAGE_SHIFT + (((level) - 1) \
- * PT64_LEVEL_BITS))) - 1))
-
#define PT32_BASE_ADDR_MASK PAGE_MASK
#define PT32_DIR_BASE_ADDR_MASK \
(PAGE_MASK & ~((1ULL << (PAGE_SHIFT + PT32_LEVEL_BITS)) - 1))
@@ -175,42 +132,11 @@ module_param(dbg, bool, 0644);
(PAGE_MASK & ~((1ULL << (PAGE_SHIFT + (((level) - 1) \
* PT32_LEVEL_BITS))) - 1))
-#define PT64_PERM_MASK (PT_PRESENT_MASK | PT_WRITABLE_MASK | shadow_user_mask \
- | shadow_x_mask | shadow_nx_mask | shadow_me_mask)
-
-#define ACC_EXEC_MASK 1
-#define ACC_WRITE_MASK PT_WRITABLE_MASK
-#define ACC_USER_MASK PT_USER_MASK
-#define ACC_ALL (ACC_EXEC_MASK | ACC_WRITE_MASK | ACC_USER_MASK)
-
-/* The mask for the R/X bits in EPT PTEs */
-#define PT64_EPT_READABLE_MASK 0x1ull
-#define PT64_EPT_EXECUTABLE_MASK 0x4ull
-
#include <trace/events/kvm.h>
-#define SPTE_HOST_WRITEABLE (1ULL << PT_FIRST_AVAIL_BITS_SHIFT)
-#define SPTE_MMU_WRITEABLE (1ULL << (PT_FIRST_AVAIL_BITS_SHIFT + 1))
-
-#define SHADOW_PT_INDEX(addr, level) PT64_INDEX(addr, level)
-
/* make pte_list_desc fit well in cache line */
#define PTE_LIST_EXT 3
-/*
- * Return values of handle_mmio_page_fault and mmu.page_fault:
- * RET_PF_RETRY: let CPU fault again on the address.
- * RET_PF_EMULATE: mmio page fault, emulate the instruction directly.
- *
- * For handle_mmio_page_fault only:
- * RET_PF_INVALID: the spte is invalid, let the real page fault path update it.
- */
-enum {
- RET_PF_RETRY = 0,
- RET_PF_EMULATE = 1,
- RET_PF_INVALID = 2,
-};
-
struct pte_list_desc {
u64 *sptes[PTE_LIST_EXT];
struct pte_list_desc *more;
@@ -242,65 +168,10 @@ struct kvm_shadow_walk_iterator {
__shadow_walk_next(&(_walker), spte))
static struct kmem_cache *pte_list_desc_cache;
-static struct kmem_cache *mmu_page_header_cache;
+struct kmem_cache *mmu_page_header_cache;
static struct percpu_counter kvm_total_used_mmu_pages;
-static u64 __read_mostly shadow_nx_mask;
-static u64 __read_mostly shadow_x_mask; /* mutual exclusive with nx_mask */
-static u64 __read_mostly shadow_user_mask;
-static u64 __read_mostly shadow_accessed_mask;
-static u64 __read_mostly shadow_dirty_mask;
-static u64 __read_mostly shadow_mmio_value;
-static u64 __read_mostly shadow_mmio_access_mask;
-static u64 __read_mostly shadow_present_mask;
-static u64 __read_mostly shadow_me_mask;
-
-/*
- * SPTEs used by MMUs without A/D bits are marked with SPTE_AD_DISABLED_MASK;
- * shadow_acc_track_mask is the set of bits to be cleared in non-accessed
- * pages.
- */
-static u64 __read_mostly shadow_acc_track_mask;
-
-/*
- * The mask/shift to use for saving the original R/X bits when marking the PTE
- * as not-present for access tracking purposes. We do not save the W bit as the
- * PTEs being access tracked also need to be dirty tracked, so the W bit will be
- * restored only when a write is attempted to the page.
- */
-static const u64 shadow_acc_track_saved_bits_mask = PT64_EPT_READABLE_MASK |
- PT64_EPT_EXECUTABLE_MASK;
-static const u64 shadow_acc_track_saved_bits_shift = PT64_SECOND_AVAIL_BITS_SHIFT;
-
-/*
- * This mask must be set on all non-zero Non-Present or Reserved SPTEs in order
- * to guard against L1TF attacks.
- */
-static u64 __read_mostly shadow_nonpresent_or_rsvd_mask;
-
-/*
- * The number of high-order 1 bits to use in the mask above.
- */
-static const u64 shadow_nonpresent_or_rsvd_mask_len = 5;
-
-/*
- * In some cases, we need to preserve the GFN of a non-present or reserved
- * SPTE when we usurp the upper five bits of the physical address space to
- * defend against L1TF, e.g. for MMIO SPTEs. To preserve the GFN, we'll
- * shift bits of the GFN that overlap with shadow_nonpresent_or_rsvd_mask
- * left into the reserved bits, i.e. the GFN in the SPTE will be split into
- * high and low parts. This mask covers the lower bits of the GFN.
- */
-static u64 __read_mostly shadow_nonpresent_or_rsvd_lower_gfn_mask;
-
-/*
- * The number of non-reserved physical address bits irrespective of features
- * that repurpose legal bits, e.g. MKTME.
- */
-static u8 __read_mostly shadow_phys_bits;
-
static void mmu_spte_set(u64 *sptep, u64 spte);
-static bool is_executable_pte(u64 spte);
static union kvm_mmu_page_role
kvm_mmu_calc_root_page_role(struct kvm_vcpu *vcpu);
@@ -325,7 +196,7 @@ static void kvm_flush_remote_tlbs_with_range(struct kvm *kvm,
kvm_flush_remote_tlbs(kvm);
}
-static void kvm_flush_remote_tlbs_with_address(struct kvm *kvm,
+void kvm_flush_remote_tlbs_with_address(struct kvm *kvm,
u64 start_gfn, u64 pages)
{
struct kvm_tlb_range range;
@@ -336,143 +207,17 @@ static void kvm_flush_remote_tlbs_with_address(struct kvm *kvm,
kvm_flush_remote_tlbs_with_range(kvm, &range);
}
-void kvm_mmu_set_mmio_spte_mask(u64 mmio_value, u64 access_mask)
-{
- BUG_ON((u64)(unsigned)access_mask != access_mask);
- WARN_ON(mmio_value & (shadow_nonpresent_or_rsvd_mask << shadow_nonpresent_or_rsvd_mask_len));
- WARN_ON(mmio_value & shadow_nonpresent_or_rsvd_lower_gfn_mask);
- shadow_mmio_value = mmio_value | SPTE_MMIO_MASK;
- shadow_mmio_access_mask = access_mask;
-}
-EXPORT_SYMBOL_GPL(kvm_mmu_set_mmio_spte_mask);
-
-static bool is_mmio_spte(u64 spte)
-{
- return (spte & SPTE_SPECIAL_MASK) == SPTE_MMIO_MASK;
-}
-
-static inline bool sp_ad_disabled(struct kvm_mmu_page *sp)
-{
- return sp->role.ad_disabled;
-}
-
-static inline bool kvm_vcpu_ad_need_write_protect(struct kvm_vcpu *vcpu)
-{
- /*
- * When using the EPT page-modification log, the GPAs in the log
- * would come from L2 rather than L1. Therefore, we need to rely
- * on write protection to record dirty pages. This also bypasses
- * PML, since writes now result in a vmexit.
- */
- return vcpu->arch.mmu == &vcpu->arch.guest_mmu;
-}
-
-static inline bool spte_ad_enabled(u64 spte)
-{
- MMU_WARN_ON(is_mmio_spte(spte));
- return (spte & SPTE_SPECIAL_MASK) != SPTE_AD_DISABLED_MASK;
-}
-
-static inline bool spte_ad_need_write_protect(u64 spte)
-{
- MMU_WARN_ON(is_mmio_spte(spte));
- return (spte & SPTE_SPECIAL_MASK) != SPTE_AD_ENABLED_MASK;
-}
-
-static bool is_nx_huge_page_enabled(void)
+bool is_nx_huge_page_enabled(void)
{
return READ_ONCE(nx_huge_pages);
}
-static inline u64 spte_shadow_accessed_mask(u64 spte)
-{
- MMU_WARN_ON(is_mmio_spte(spte));
- return spte_ad_enabled(spte) ? shadow_accessed_mask : 0;
-}
-
-static inline u64 spte_shadow_dirty_mask(u64 spte)
-{
- MMU_WARN_ON(is_mmio_spte(spte));
- return spte_ad_enabled(spte) ? shadow_dirty_mask : 0;
-}
-
-static inline bool is_access_track_spte(u64 spte)
-{
- return !spte_ad_enabled(spte) && (spte & shadow_acc_track_mask) == 0;
-}
-
-/*
- * Due to limited space in PTEs, the MMIO generation is a 19 bit subset of
- * the memslots generation and is derived as follows:
- *
- * Bits 0-8 of the MMIO generation are propagated to spte bits 3-11
- * Bits 9-18 of the MMIO generation are propagated to spte bits 52-61
- *
- * The KVM_MEMSLOT_GEN_UPDATE_IN_PROGRESS flag is intentionally not included in
- * the MMIO generation number, as doing so would require stealing a bit from
- * the "real" generation number and thus effectively halve the maximum number
- * of MMIO generations that can be handled before encountering a wrap (which
- * requires a full MMU zap). The flag is instead explicitly queried when
- * checking for MMIO spte cache hits.
- */
-#define MMIO_SPTE_GEN_MASK GENMASK_ULL(17, 0)
-
-#define MMIO_SPTE_GEN_LOW_START 3
-#define MMIO_SPTE_GEN_LOW_END 11
-#define MMIO_SPTE_GEN_LOW_MASK GENMASK_ULL(MMIO_SPTE_GEN_LOW_END, \
- MMIO_SPTE_GEN_LOW_START)
-
-#define MMIO_SPTE_GEN_HIGH_START PT64_SECOND_AVAIL_BITS_SHIFT
-#define MMIO_SPTE_GEN_HIGH_END 62
-#define MMIO_SPTE_GEN_HIGH_MASK GENMASK_ULL(MMIO_SPTE_GEN_HIGH_END, \
- MMIO_SPTE_GEN_HIGH_START)
-
-static u64 generation_mmio_spte_mask(u64 gen)
-{
- u64 mask;
-
- WARN_ON(gen & ~MMIO_SPTE_GEN_MASK);
- BUILD_BUG_ON((MMIO_SPTE_GEN_HIGH_MASK | MMIO_SPTE_GEN_LOW_MASK) & SPTE_SPECIAL_MASK);
-
- mask = (gen << MMIO_SPTE_GEN_LOW_START) & MMIO_SPTE_GEN_LOW_MASK;
- mask |= (gen << MMIO_SPTE_GEN_HIGH_START) & MMIO_SPTE_GEN_HIGH_MASK;
- return mask;
-}
-
-static u64 get_mmio_spte_generation(u64 spte)
-{
- u64 gen;
-
- gen = (spte & MMIO_SPTE_GEN_LOW_MASK) >> MMIO_SPTE_GEN_LOW_START;
- gen |= (spte & MMIO_SPTE_GEN_HIGH_MASK) >> MMIO_SPTE_GEN_HIGH_START;
- return gen;
-}
-
-static u64 make_mmio_spte(struct kvm_vcpu *vcpu, u64 gfn, unsigned int access)
-{
-
- u64 gen = kvm_vcpu_memslots(vcpu)->generation & MMIO_SPTE_GEN_MASK;
- u64 mask = generation_mmio_spte_mask(gen);
- u64 gpa = gfn << PAGE_SHIFT;
-
- access &= shadow_mmio_access_mask;
- mask |= shadow_mmio_value | access;
- mask |= gpa | shadow_nonpresent_or_rsvd_mask;
- mask |= (gpa & shadow_nonpresent_or_rsvd_mask)
- << shadow_nonpresent_or_rsvd_mask_len;
-
- return mask;
-}
-
static void mark_mmio_spte(struct kvm_vcpu *vcpu, u64 *sptep, u64 gfn,
unsigned int access)
{
u64 mask = make_mmio_spte(vcpu, gfn, access);
- unsigned int gen = get_mmio_spte_generation(mask);
- access = mask & ACC_ALL;
-
- trace_mark_mmio_spte(sptep, gfn, access, gen);
+ trace_mark_mmio_spte(sptep, gfn, mask);
mmu_spte_set(sptep, mask);
}
@@ -480,7 +225,7 @@ static gfn_t get_mmio_spte_gfn(u64 spte)
{
u64 gpa = spte & shadow_nonpresent_or_rsvd_lower_gfn_mask;
- gpa |= (spte >> shadow_nonpresent_or_rsvd_mask_len)
+ gpa |= (spte >> SHADOW_NONPRESENT_OR_RSVD_MASK_LEN)
& shadow_nonpresent_or_rsvd_mask;
return gpa >> PAGE_SHIFT;
@@ -521,7 +266,7 @@ static gpa_t translate_gpa(struct kvm_vcpu *vcpu, gpa_t gpa, u32 access,
struct x86_exception *exception)
{
/* Check if guest physical address doesn't exceed guest maximum */
- if (kvm_mmu_is_illegal_gpa(vcpu, gpa)) {
+ if (kvm_vcpu_is_illegal_gpa(vcpu, gpa)) {
exception->error_code |= PFERR_RSVD_MASK;
return UNMAPPED_GVA;
}
@@ -529,90 +274,6 @@ static gpa_t translate_gpa(struct kvm_vcpu *vcpu, gpa_t gpa, u32 access,
return gpa;
}
-/*
- * Sets the shadow PTE masks used by the MMU.
- *
- * Assumptions:
- * - Setting either @accessed_mask or @dirty_mask requires setting both
- * - At least one of @accessed_mask or @acc_track_mask must be set
- */
-void kvm_mmu_set_mask_ptes(u64 user_mask, u64 accessed_mask,
- u64 dirty_mask, u64 nx_mask, u64 x_mask, u64 p_mask,
- u64 acc_track_mask, u64 me_mask)
-{
- BUG_ON(!dirty_mask != !accessed_mask);
- BUG_ON(!accessed_mask && !acc_track_mask);
- BUG_ON(acc_track_mask & SPTE_SPECIAL_MASK);
-
- shadow_user_mask = user_mask;
- shadow_accessed_mask = accessed_mask;
- shadow_dirty_mask = dirty_mask;
- shadow_nx_mask = nx_mask;
- shadow_x_mask = x_mask;
- shadow_present_mask = p_mask;
- shadow_acc_track_mask = acc_track_mask;
- shadow_me_mask = me_mask;
-}
-EXPORT_SYMBOL_GPL(kvm_mmu_set_mask_ptes);
-
-static u8 kvm_get_shadow_phys_bits(void)
-{
- /*
- * boot_cpu_data.x86_phys_bits is reduced when MKTME or SME are detected
- * in CPU detection code, but the processor treats those reduced bits as
- * 'keyID' thus they are not reserved bits. Therefore KVM needs to look at
- * the physical address bits reported by CPUID.
- */
- if (likely(boot_cpu_data.extended_cpuid_level >= 0x80000008))
- return cpuid_eax(0x80000008) & 0xff;
-
- /*
- * Quite weird to have VMX or SVM but not MAXPHYADDR; probably a VM with
- * custom CPUID. Proceed with whatever the kernel found since these features
- * aren't virtualizable (SME/SEV also require CPUIDs higher than 0x80000008).
- */
- return boot_cpu_data.x86_phys_bits;
-}
-
-static void kvm_mmu_reset_all_pte_masks(void)
-{
- u8 low_phys_bits;
-
- shadow_user_mask = 0;
- shadow_accessed_mask = 0;
- shadow_dirty_mask = 0;
- shadow_nx_mask = 0;
- shadow_x_mask = 0;
- shadow_present_mask = 0;
- shadow_acc_track_mask = 0;
-
- shadow_phys_bits = kvm_get_shadow_phys_bits();
-
- /*
- * If the CPU has 46 or less physical address bits, then set an
- * appropriate mask to guard against L1TF attacks. Otherwise, it is
- * assumed that the CPU is not vulnerable to L1TF.
- *
- * Some Intel CPUs address the L1 cache using more PA bits than are
- * reported by CPUID. Use the PA width of the L1 cache when possible
- * to achieve more effective mitigation, e.g. if system RAM overlaps
- * the most significant bits of legal physical address space.
- */
- shadow_nonpresent_or_rsvd_mask = 0;
- low_phys_bits = boot_cpu_data.x86_phys_bits;
- if (boot_cpu_has_bug(X86_BUG_L1TF) &&
- !WARN_ON_ONCE(boot_cpu_data.x86_cache_bits >=
- 52 - shadow_nonpresent_or_rsvd_mask_len)) {
- low_phys_bits = boot_cpu_data.x86_cache_bits
- - shadow_nonpresent_or_rsvd_mask_len;
- shadow_nonpresent_or_rsvd_mask =
- rsvd_bits(low_phys_bits, boot_cpu_data.x86_cache_bits - 1);
- }
-
- shadow_nonpresent_or_rsvd_lower_gfn_mask =
- GENMASK_ULL(low_phys_bits - 1, PAGE_SHIFT);
-}
-
static int is_cpuid_PSE36(void)
{
return 1;
@@ -623,35 +284,6 @@ static int is_nx(struct kvm_vcpu *vcpu)
return vcpu->arch.efer & EFER_NX;
}
-static int is_shadow_present_pte(u64 pte)
-{
- return (pte != 0) && !is_mmio_spte(pte);
-}
-
-static int is_large_pte(u64 pte)
-{
- return pte & PT_PAGE_SIZE_MASK;
-}
-
-static int is_last_spte(u64 pte, int level)
-{
- if (level == PG_LEVEL_4K)
- return 1;
- if (is_large_pte(pte))
- return 1;
- return 0;
-}
-
-static bool is_executable_pte(u64 spte)
-{
- return (spte & (shadow_x_mask | shadow_nx_mask)) == shadow_x_mask;
-}
-
-static kvm_pfn_t spte_to_pfn(u64 pte)
-{
- return (pte & PT64_BASE_ADDR_MASK) >> PAGE_SHIFT;
-}
-
static gfn_t pse36_gfn_delta(u32 gpte)
{
int shift = 32 - PT32_DIR_PSE36_SHIFT - PAGE_SHIFT;
@@ -796,12 +428,6 @@ retry:
}
#endif
-static bool spte_can_locklessly_be_made_writable(u64 spte)
-{
- return (spte & (SPTE_HOST_WRITEABLE | SPTE_MMU_WRITEABLE)) ==
- (SPTE_HOST_WRITEABLE | SPTE_MMU_WRITEABLE);
-}
-
static bool spte_has_volatile_bits(u64 spte)
{
if (!is_shadow_present_pte(spte))
@@ -826,21 +452,6 @@ static bool spte_has_volatile_bits(u64 spte)
return false;
}
-static bool is_accessed_spte(u64 spte)
-{
- u64 accessed_mask = spte_shadow_accessed_mask(spte);
-
- return accessed_mask ? spte & accessed_mask
- : !is_access_track_spte(spte);
-}
-
-static bool is_dirty_spte(u64 spte)
-{
- u64 dirty_mask = spte_shadow_dirty_mask(spte);
-
- return dirty_mask ? spte & dirty_mask : spte & PT_WRITABLE_MASK;
-}
-
/* Rules for using mmu_spte_set:
* Set the sptep from nonpresent to present.
* Note: the sptep being assigned *must* be either not present
@@ -976,47 +587,19 @@ static u64 mmu_spte_get_lockless(u64 *sptep)
return __get_spte_lockless(sptep);
}
-static u64 mark_spte_for_access_track(u64 spte)
-{
- if (spte_ad_enabled(spte))
- return spte & ~shadow_accessed_mask;
-
- if (is_access_track_spte(spte))
- return spte;
-
- /*
- * Making an Access Tracking PTE will result in removal of write access
- * from the PTE. So, verify that we will be able to restore the write
- * access in the fast page fault path later on.
- */
- WARN_ONCE((spte & PT_WRITABLE_MASK) &&
- !spte_can_locklessly_be_made_writable(spte),
- "kvm: Writable SPTE is not locklessly dirty-trackable\n");
-
- WARN_ONCE(spte & (shadow_acc_track_saved_bits_mask <<
- shadow_acc_track_saved_bits_shift),
- "kvm: Access Tracking saved bit locations are not zero\n");
-
- spte |= (spte & shadow_acc_track_saved_bits_mask) <<
- shadow_acc_track_saved_bits_shift;
- spte &= ~shadow_acc_track_mask;
-
- return spte;
-}
-
/* Restore an acc-track PTE back to a regular PTE */
static u64 restore_acc_track_spte(u64 spte)
{
u64 new_spte = spte;
- u64 saved_bits = (spte >> shadow_acc_track_saved_bits_shift)
- & shadow_acc_track_saved_bits_mask;
+ u64 saved_bits = (spte >> SHADOW_ACC_TRACK_SAVED_BITS_SHIFT)
+ & SHADOW_ACC_TRACK_SAVED_BITS_MASK;
WARN_ON_ONCE(spte_ad_enabled(spte));
WARN_ON_ONCE(!is_access_track_spte(spte));
new_spte &= ~shadow_acc_track_mask;
- new_spte &= ~(shadow_acc_track_saved_bits_mask <<
- shadow_acc_track_saved_bits_shift);
+ new_spte &= ~(SHADOW_ACC_TRACK_SAVED_BITS_MASK <<
+ SHADOW_ACC_TRACK_SAVED_BITS_SHIFT);
new_spte |= saved_bits;
return new_spte;
@@ -1193,7 +776,7 @@ static void account_shadowed(struct kvm *kvm, struct kvm_mmu_page *sp)
kvm_mmu_gfn_disallow_lpage(slot, gfn);
}
-static void account_huge_nx_page(struct kvm *kvm, struct kvm_mmu_page *sp)
+void account_huge_nx_page(struct kvm *kvm, struct kvm_mmu_page *sp)
{
if (sp->lpage_disallowed)
return;
@@ -1221,7 +804,7 @@ static void unaccount_shadowed(struct kvm *kvm, struct kvm_mmu_page *sp)
kvm_mmu_gfn_allow_lpage(slot, gfn);
}
-static void unaccount_huge_nx_page(struct kvm *kvm, struct kvm_mmu_page *sp)
+void unaccount_huge_nx_page(struct kvm *kvm, struct kvm_mmu_page *sp)
{
--kvm->stat.nx_lpage_splits;
sp->lpage_disallowed = false;
@@ -1273,12 +856,14 @@ static int pte_list_add(struct kvm_vcpu *vcpu, u64 *spte,
} else {
rmap_printk("pte_list_add: %p %llx many->many\n", spte, *spte);
desc = (struct pte_list_desc *)(rmap_head->val & ~1ul);
- while (desc->sptes[PTE_LIST_EXT-1] && desc->more) {
- desc = desc->more;
+ while (desc->sptes[PTE_LIST_EXT-1]) {
count += PTE_LIST_EXT;
- }
- if (desc->sptes[PTE_LIST_EXT-1]) {
- desc->more = mmu_alloc_pte_list_desc(vcpu);
+
+ if (!desc->more) {
+ desc->more = mmu_alloc_pte_list_desc(vcpu);
+ desc = desc->more;
+ break;
+ }
desc = desc->more;
}
for (i = 0; desc->sptes[i]; ++i)
@@ -1640,6 +1225,9 @@ static void kvm_mmu_write_protect_pt_masked(struct kvm *kvm,
{
struct kvm_rmap_head *rmap_head;
+ if (kvm->arch.tdp_mmu_enabled)
+ kvm_tdp_mmu_clear_dirty_pt_masked(kvm, slot,
+ slot->base_gfn + gfn_offset, mask, true);
while (mask) {
rmap_head = __gfn_to_rmap(slot->base_gfn + gfn_offset + __ffs(mask),
PG_LEVEL_4K, slot);
@@ -1666,6 +1254,9 @@ void kvm_mmu_clear_dirty_pt_masked(struct kvm *kvm,
{
struct kvm_rmap_head *rmap_head;
+ if (kvm->arch.tdp_mmu_enabled)
+ kvm_tdp_mmu_clear_dirty_pt_masked(kvm, slot,
+ slot->base_gfn + gfn_offset, mask, false);
while (mask) {
rmap_head = __gfn_to_rmap(slot->base_gfn + gfn_offset + __ffs(mask),
PG_LEVEL_4K, slot);
@@ -1710,6 +1301,10 @@ bool kvm_mmu_slot_gfn_write_protect(struct kvm *kvm,
write_protected |= __rmap_write_protect(kvm, rmap_head, true);
}
+ if (kvm->arch.tdp_mmu_enabled)
+ write_protected |=
+ kvm_tdp_mmu_write_protect_gfn(kvm, slot, gfn);
+
return write_protected;
}
@@ -1769,13 +1364,8 @@ restart:
pte_list_remove(rmap_head, sptep);
goto restart;
} else {
- new_spte = *sptep & ~PT64_BASE_ADDR_MASK;
- new_spte |= (u64)new_pfn << PAGE_SHIFT;
-
- new_spte &= ~PT_WRITABLE_MASK;
- new_spte &= ~SPTE_HOST_WRITEABLE;
-
- new_spte = mark_spte_for_access_track(new_spte);
+ new_spte = kvm_mmu_changed_pte_notifier_make_spte(
+ *sptep, new_pfn);
mmu_spte_clear_track_bits(sptep);
mmu_spte_set(sptep, new_spte);
@@ -1916,14 +1506,29 @@ static int kvm_handle_hva(struct kvm *kvm, unsigned long hva,
return kvm_handle_hva_range(kvm, hva, hva + 1, data, handler);
}
-int kvm_unmap_hva_range(struct kvm *kvm, unsigned long start, unsigned long end)
+int kvm_unmap_hva_range(struct kvm *kvm, unsigned long start, unsigned long end,
+ unsigned flags)
{
- return kvm_handle_hva_range(kvm, start, end, 0, kvm_unmap_rmapp);
+ int r;
+
+ r = kvm_handle_hva_range(kvm, start, end, 0, kvm_unmap_rmapp);
+
+ if (kvm->arch.tdp_mmu_enabled)
+ r |= kvm_tdp_mmu_zap_hva_range(kvm, start, end);
+
+ return r;
}
int kvm_set_spte_hva(struct kvm *kvm, unsigned long hva, pte_t pte)
{
- return kvm_handle_hva(kvm, hva, (unsigned long)&pte, kvm_set_pte_rmapp);
+ int r;
+
+ r = kvm_handle_hva(kvm, hva, (unsigned long)&pte, kvm_set_pte_rmapp);
+
+ if (kvm->arch.tdp_mmu_enabled)
+ r |= kvm_tdp_mmu_set_spte_hva(kvm, hva, &pte);
+
+ return r;
}
static int kvm_age_rmapp(struct kvm *kvm, struct kvm_rmap_head *rmap_head,
@@ -1972,12 +1577,24 @@ static void rmap_recycle(struct kvm_vcpu *vcpu, u64 *spte, gfn_t gfn)
int kvm_age_hva(struct kvm *kvm, unsigned long start, unsigned long end)
{
- return kvm_handle_hva_range(kvm, start, end, 0, kvm_age_rmapp);
+ int young = false;
+
+ young = kvm_handle_hva_range(kvm, start, end, 0, kvm_age_rmapp);
+ if (kvm->arch.tdp_mmu_enabled)
+ young |= kvm_tdp_mmu_age_hva_range(kvm, start, end);
+
+ return young;
}
int kvm_test_age_hva(struct kvm *kvm, unsigned long hva)
{
- return kvm_handle_hva(kvm, hva, 0, kvm_test_age_rmapp);
+ int young = false;
+
+ young = kvm_handle_hva(kvm, hva, 0, kvm_test_age_rmapp);
+ if (kvm->arch.tdp_mmu_enabled)
+ young |= kvm_tdp_mmu_test_age_hva(kvm, hva);
+
+ return young;
}
#ifdef MMU_DEBUG
@@ -2468,7 +2085,7 @@ static struct kvm_mmu_page *kvm_mmu_get_page(struct kvm_vcpu *vcpu,
}
if (sp->unsync_children)
- kvm_make_request(KVM_REQ_TLB_FLUSH_CURRENT, vcpu);
+ kvm_make_request(KVM_REQ_MMU_SYNC, vcpu);
__clear_sp_write_flooding_count(sp);
@@ -2576,13 +2193,7 @@ static void link_shadow_page(struct kvm_vcpu *vcpu, u64 *sptep,
BUILD_BUG_ON(VMX_EPT_WRITABLE_MASK != PT_WRITABLE_MASK);
- spte = __pa(sp->spt) | shadow_present_mask | PT_WRITABLE_MASK |
- shadow_user_mask | shadow_x_mask | shadow_me_mask;
-
- if (sp_ad_disabled(sp))
- spte |= SPTE_AD_DISABLED_MASK;
- else
- spte |= shadow_accessed_mask;
+ spte = make_nonleaf_spte(sp->spt, sp_ad_disabled(sp));
mmu_spte_set(sptep, spte);
@@ -2614,8 +2225,9 @@ static void validate_direct_spte(struct kvm_vcpu *vcpu, u64 *sptep,
}
}
-static bool mmu_page_zap_pte(struct kvm *kvm, struct kvm_mmu_page *sp,
- u64 *spte)
+/* Returns the number of zapped non-leaf child shadow pages. */
+static int mmu_page_zap_pte(struct kvm *kvm, struct kvm_mmu_page *sp,
+ u64 *spte, struct list_head *invalid_list)
{
u64 pte;
struct kvm_mmu_page *child;
@@ -2629,23 +2241,34 @@ static bool mmu_page_zap_pte(struct kvm *kvm, struct kvm_mmu_page *sp,
} else {
child = to_shadow_page(pte & PT64_BASE_ADDR_MASK);
drop_parent_pte(child, spte);
- }
- return true;
- }
- if (is_mmio_spte(pte))
+ /*
+ * Recursively zap nested TDP SPs, parentless SPs are
+ * unlikely to be used again in the near future. This
+ * avoids retaining a large number of stale nested SPs.
+ */
+ if (tdp_enabled && invalid_list &&
+ child->role.guest_mode && !child->parent_ptes.val)
+ return kvm_mmu_prepare_zap_page(kvm, child,
+ invalid_list);
+ }
+ } else if (is_mmio_spte(pte)) {
mmu_spte_clear_no_track(spte);
-
- return false;
+ }
+ return 0;
}
-static void kvm_mmu_page_unlink_children(struct kvm *kvm,
- struct kvm_mmu_page *sp)
+static int kvm_mmu_page_unlink_children(struct kvm *kvm,
+ struct kvm_mmu_page *sp,
+ struct list_head *invalid_list)
{
+ int zapped = 0;
unsigned i;
for (i = 0; i < PT64_ENT_PER_PAGE; ++i)
- mmu_page_zap_pte(kvm, sp, sp->spt + i);
+ zapped += mmu_page_zap_pte(kvm, sp, sp->spt + i, invalid_list);
+
+ return zapped;
}
static void kvm_mmu_unlink_parents(struct kvm *kvm, struct kvm_mmu_page *sp)
@@ -2691,7 +2314,7 @@ static bool __kvm_mmu_prepare_zap_page(struct kvm *kvm,
trace_kvm_mmu_prepare_zap_page(sp);
++kvm->stat.mmu_shadow_zapped;
*nr_zapped = mmu_zap_unsync_children(kvm, sp, invalid_list);
- kvm_mmu_page_unlink_children(kvm, sp);
+ *nr_zapped += kvm_mmu_page_unlink_children(kvm, sp, invalid_list);
kvm_mmu_unlink_parents(kvm, sp);
/* Zapping children means active_mmu_pages has become unstable. */
@@ -2884,8 +2507,8 @@ static void kvm_unsync_page(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp)
kvm_mmu_mark_parents_unsync(sp);
}
-static bool mmu_need_write_protect(struct kvm_vcpu *vcpu, gfn_t gfn,
- bool can_unsync)
+bool mmu_need_write_protect(struct kvm_vcpu *vcpu, gfn_t gfn,
+ bool can_unsync)
{
struct kvm_mmu_page *sp;
@@ -2945,132 +2568,42 @@ static bool mmu_need_write_protect(struct kvm_vcpu *vcpu, gfn_t gfn,
return false;
}
-static bool kvm_is_mmio_pfn(kvm_pfn_t pfn)
-{
- if (pfn_valid(pfn))
- return !is_zero_pfn(pfn) && PageReserved(pfn_to_page(pfn)) &&
- /*
- * Some reserved pages, such as those from NVDIMM
- * DAX devices, are not for MMIO, and can be mapped
- * with cached memory type for better performance.
- * However, the above check misconceives those pages
- * as MMIO, and results in KVM mapping them with UC
- * memory type, which would hurt the performance.
- * Therefore, we check the host memory type in addition
- * and only treat UC/UC-/WC pages as MMIO.
- */
- (!pat_enabled() || pat_pfn_immune_to_uc_mtrr(pfn));
-
- return !e820__mapped_raw_any(pfn_to_hpa(pfn),
- pfn_to_hpa(pfn + 1) - 1,
- E820_TYPE_RAM);
-}
-
-/* Bits which may be returned by set_spte() */
-#define SET_SPTE_WRITE_PROTECTED_PT BIT(0)
-#define SET_SPTE_NEED_REMOTE_TLB_FLUSH BIT(1)
-
static int set_spte(struct kvm_vcpu *vcpu, u64 *sptep,
unsigned int pte_access, int level,
gfn_t gfn, kvm_pfn_t pfn, bool speculative,
bool can_unsync, bool host_writable)
{
- u64 spte = 0;
- int ret = 0;
+ u64 spte;
struct kvm_mmu_page *sp;
+ int ret;
if (set_mmio_spte(vcpu, sptep, gfn, pfn, pte_access))
return 0;
sp = sptep_to_sp(sptep);
- if (sp_ad_disabled(sp))
- spte |= SPTE_AD_DISABLED_MASK;
- else if (kvm_vcpu_ad_need_write_protect(vcpu))
- spte |= SPTE_AD_WRPROT_ONLY_MASK;
-
- /*
- * For the EPT case, shadow_present_mask is 0 if hardware
- * supports exec-only page table entries. In that case,
- * ACC_USER_MASK and shadow_user_mask are used to represent
- * read access. See FNAME(gpte_access) in paging_tmpl.h.
- */
- spte |= shadow_present_mask;
- if (!speculative)
- spte |= spte_shadow_accessed_mask(spte);
-
- if (level > PG_LEVEL_4K && (pte_access & ACC_EXEC_MASK) &&
- is_nx_huge_page_enabled()) {
- pte_access &= ~ACC_EXEC_MASK;
- }
-
- if (pte_access & ACC_EXEC_MASK)
- spte |= shadow_x_mask;
- else
- spte |= shadow_nx_mask;
-
- if (pte_access & ACC_USER_MASK)
- spte |= shadow_user_mask;
- if (level > PG_LEVEL_4K)
- spte |= PT_PAGE_SIZE_MASK;
- if (tdp_enabled)
- spte |= kvm_x86_ops.get_mt_mask(vcpu, gfn,
- kvm_is_mmio_pfn(pfn));
-
- if (host_writable)
- spte |= SPTE_HOST_WRITEABLE;
- else
- pte_access &= ~ACC_WRITE_MASK;
-
- if (!kvm_is_mmio_pfn(pfn))
- spte |= shadow_me_mask;
-
- spte |= (u64)pfn << PAGE_SHIFT;
-
- if (pte_access & ACC_WRITE_MASK) {
- spte |= PT_WRITABLE_MASK | SPTE_MMU_WRITEABLE;
-
- /*
- * Optimization: for pte sync, if spte was writable the hash
- * lookup is unnecessary (and expensive). Write protection
- * is responsibility of mmu_get_page / kvm_sync_page.
- * Same reasoning can be applied to dirty page accounting.
- */
- if (!can_unsync && is_writable_pte(*sptep))
- goto set_pte;
-
- if (mmu_need_write_protect(vcpu, gfn, can_unsync)) {
- pgprintk("%s: found shadow page for %llx, marking ro\n",
- __func__, gfn);
- ret |= SET_SPTE_WRITE_PROTECTED_PT;
- pte_access &= ~ACC_WRITE_MASK;
- spte &= ~(PT_WRITABLE_MASK | SPTE_MMU_WRITEABLE);
- }
- }
+ ret = make_spte(vcpu, pte_access, level, gfn, pfn, *sptep, speculative,
+ can_unsync, host_writable, sp_ad_disabled(sp), &spte);
- if (pte_access & ACC_WRITE_MASK) {
+ if (spte & PT_WRITABLE_MASK)
kvm_vcpu_mark_page_dirty(vcpu, gfn);
- spte |= spte_shadow_dirty_mask(spte);
- }
-
- if (speculative)
- spte = mark_spte_for_access_track(spte);
-set_pte:
- if (mmu_spte_update(sptep, spte))
+ if (*sptep == spte)
+ ret |= SET_SPTE_SPURIOUS;
+ else if (mmu_spte_update(sptep, spte))
ret |= SET_SPTE_NEED_REMOTE_TLB_FLUSH;
return ret;
}
static int mmu_set_spte(struct kvm_vcpu *vcpu, u64 *sptep,
- unsigned int pte_access, int write_fault, int level,
+ unsigned int pte_access, bool write_fault, int level,
gfn_t gfn, kvm_pfn_t pfn, bool speculative,
bool host_writable)
{
int was_rmapped = 0;
int rmap_count;
int set_spte_ret;
- int ret = RET_PF_RETRY;
+ int ret = RET_PF_FIXED;
bool flush = false;
pgprintk("%s: spte %llx write_fault %d gfn %llx\n", __func__,
@@ -3112,6 +2645,15 @@ static int mmu_set_spte(struct kvm_vcpu *vcpu, u64 *sptep,
if (unlikely(is_mmio_spte(*sptep)))
ret = RET_PF_EMULATE;
+ /*
+ * The fault is fully spurious if and only if the new SPTE and old SPTE
+ * are identical, and emulation is not required.
+ */
+ if ((set_spte_ret & SET_SPTE_SPURIOUS) && ret == RET_PF_FIXED) {
+ WARN_ON_ONCE(!was_rmapped);
+ return RET_PF_SPURIOUS;
+ }
+
pgprintk("%s: setting spte %llx\n", __func__, *sptep);
trace_kvm_mmu_set_spte(level, gfn, sptep);
if (!was_rmapped && is_large_pte(*sptep))
@@ -3160,7 +2702,7 @@ static int direct_pte_prefetch_many(struct kvm_vcpu *vcpu,
return -1;
for (i = 0; i < ret; i++, gfn++, start++) {
- mmu_set_spte(vcpu, start, access, 0, sp->role.level, gfn,
+ mmu_set_spte(vcpu, start, access, false, sp->role.level, gfn,
page_to_pfn(pages[i]), true, true);
put_page(pages[i]);
}
@@ -3238,8 +2780,9 @@ static int host_pfn_mapping_level(struct kvm_vcpu *vcpu, gfn_t gfn,
return level;
}
-static int kvm_mmu_hugepage_adjust(struct kvm_vcpu *vcpu, gfn_t gfn,
- int max_level, kvm_pfn_t *pfnp)
+int kvm_mmu_hugepage_adjust(struct kvm_vcpu *vcpu, gfn_t gfn,
+ int max_level, kvm_pfn_t *pfnp,
+ bool huge_page_disallowed, int *req_level)
{
struct kvm_memory_slot *slot;
struct kvm_lpage_info *linfo;
@@ -3247,6 +2790,8 @@ static int kvm_mmu_hugepage_adjust(struct kvm_vcpu *vcpu, gfn_t gfn,
kvm_pfn_t mask;
int level;
+ *req_level = PG_LEVEL_4K;
+
if (unlikely(max_level == PG_LEVEL_4K))
return PG_LEVEL_4K;
@@ -3271,7 +2816,14 @@ static int kvm_mmu_hugepage_adjust(struct kvm_vcpu *vcpu, gfn_t gfn,
if (level == PG_LEVEL_4K)
return level;
- level = min(level, max_level);
+ *req_level = level = min(level, max_level);
+
+ /*
+ * Enforce the iTLB multihit workaround after capturing the requested
+ * level, which will be used to do precise, accurate accounting.
+ */
+ if (huge_page_disallowed)
+ return PG_LEVEL_4K;
/*
* mmu_notifier_retry() was successful and mmu_lock is held, so
@@ -3284,14 +2836,12 @@ static int kvm_mmu_hugepage_adjust(struct kvm_vcpu *vcpu, gfn_t gfn,
return level;
}
-static void disallowed_hugepage_adjust(struct kvm_shadow_walk_iterator it,
- gfn_t gfn, kvm_pfn_t *pfnp, int *levelp)
+void disallowed_hugepage_adjust(u64 spte, gfn_t gfn, int cur_level,
+ kvm_pfn_t *pfnp, int *goal_levelp)
{
- int level = *levelp;
- u64 spte = *it.sptep;
+ int level = *goal_levelp;
- if (it.level == level && level > PG_LEVEL_4K &&
- is_nx_huge_page_enabled() &&
+ if (cur_level == level && level > PG_LEVEL_4K &&
is_shadow_present_pte(spte) &&
!is_large_pte(spte)) {
/*
@@ -3301,26 +2851,32 @@ static void disallowed_hugepage_adjust(struct kvm_shadow_walk_iterator it,
* patching back for them into pfn the next 9 bits of
* the address.
*/
- u64 page_mask = KVM_PAGES_PER_HPAGE(level) - KVM_PAGES_PER_HPAGE(level - 1);
+ u64 page_mask = KVM_PAGES_PER_HPAGE(level) -
+ KVM_PAGES_PER_HPAGE(level - 1);
*pfnp |= gfn & page_mask;
- (*levelp)--;
+ (*goal_levelp)--;
}
}
-static int __direct_map(struct kvm_vcpu *vcpu, gpa_t gpa, int write,
+static int __direct_map(struct kvm_vcpu *vcpu, gpa_t gpa, u32 error_code,
int map_writable, int max_level, kvm_pfn_t pfn,
- bool prefault, bool account_disallowed_nx_lpage)
+ bool prefault, bool is_tdp)
{
+ bool nx_huge_page_workaround_enabled = is_nx_huge_page_enabled();
+ bool write = error_code & PFERR_WRITE_MASK;
+ bool exec = error_code & PFERR_FETCH_MASK;
+ bool huge_page_disallowed = exec && nx_huge_page_workaround_enabled;
struct kvm_shadow_walk_iterator it;
struct kvm_mmu_page *sp;
- int level, ret;
+ int level, req_level, ret;
gfn_t gfn = gpa >> PAGE_SHIFT;
gfn_t base_gfn = gfn;
if (WARN_ON(!VALID_PAGE(vcpu->arch.mmu->root_hpa)))
return RET_PF_RETRY;
- level = kvm_mmu_hugepage_adjust(vcpu, gfn, max_level, &pfn);
+ level = kvm_mmu_hugepage_adjust(vcpu, gfn, max_level, &pfn,
+ huge_page_disallowed, &req_level);
trace_kvm_mmu_spte_requested(gpa, level, pfn);
for_each_shadow_entry(vcpu, gpa, it) {
@@ -3328,7 +2884,9 @@ static int __direct_map(struct kvm_vcpu *vcpu, gpa_t gpa, int write,
* We cannot overwrite existing page tables with an NX
* large page, as the leaf could be executable.
*/
- disallowed_hugepage_adjust(it, gfn, &pfn, &level);
+ if (nx_huge_page_workaround_enabled)
+ disallowed_hugepage_adjust(*it.sptep, gfn, it.level,
+ &pfn, &level);
base_gfn = gfn & ~(KVM_PAGES_PER_HPAGE(it.level) - 1);
if (it.level == level)
@@ -3340,7 +2898,8 @@ static int __direct_map(struct kvm_vcpu *vcpu, gpa_t gpa, int write,
it.level - 1, true, ACC_ALL);
link_shadow_page(vcpu, it.sptep, sp);
- if (account_disallowed_nx_lpage)
+ if (is_tdp && huge_page_disallowed &&
+ req_level >= it.level)
account_huge_nx_page(vcpu->kvm, sp);
}
}
@@ -3348,6 +2907,9 @@ static int __direct_map(struct kvm_vcpu *vcpu, gpa_t gpa, int write,
ret = mmu_set_spte(vcpu, it.sptep, ACC_ALL,
write, level, base_gfn, pfn, prefault,
map_writable);
+ if (ret == RET_PF_SPURIOUS)
+ return ret;
+
direct_pte_prefetch(vcpu, it.sptep);
++vcpu->stat.pf_fixed;
return ret;
@@ -3478,21 +3040,19 @@ static bool is_access_allowed(u32 fault_err_code, u64 spte)
}
/*
- * Return value:
- * - true: let the vcpu to access on the same address again.
- * - false: let the real page fault path to fix it.
+ * Returns one of RET_PF_INVALID, RET_PF_FIXED or RET_PF_SPURIOUS.
*/
-static bool fast_page_fault(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa,
- u32 error_code)
+static int fast_page_fault(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa,
+ u32 error_code)
{
struct kvm_shadow_walk_iterator iterator;
struct kvm_mmu_page *sp;
- bool fault_handled = false;
+ int ret = RET_PF_INVALID;
u64 spte = 0ull;
uint retry_count = 0;
if (!page_fault_can_be_fast(error_code))
- return false;
+ return ret;
walk_shadow_page_lockless_begin(vcpu);
@@ -3518,7 +3078,7 @@ static bool fast_page_fault(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa,
* they are always ACC_ALL.
*/
if (is_access_allowed(error_code, spte)) {
- fault_handled = true;
+ ret = RET_PF_SPURIOUS;
break;
}
@@ -3561,11 +3121,11 @@ static bool fast_page_fault(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa,
* since the gfn is not stable for indirect shadow page. See
* Documentation/virt/kvm/locking.rst to get more detail.
*/
- fault_handled = fast_pf_fix_direct_spte(vcpu, sp,
- iterator.sptep, spte,
- new_spte);
- if (fault_handled)
+ if (fast_pf_fix_direct_spte(vcpu, sp, iterator.sptep, spte,
+ new_spte)) {
+ ret = RET_PF_FIXED;
break;
+ }
if (++retry_count > 4) {
printk_once(KERN_WARNING
@@ -3576,10 +3136,10 @@ static bool fast_page_fault(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa,
} while (true);
trace_fast_page_fault(vcpu, cr2_or_gpa, error_code, iterator.sptep,
- spte, fault_handled);
+ spte, ret);
walk_shadow_page_lockless_end(vcpu);
- return fault_handled;
+ return ret;
}
static void mmu_free_root_page(struct kvm *kvm, hpa_t *root_hpa,
@@ -3591,9 +3151,13 @@ static void mmu_free_root_page(struct kvm *kvm, hpa_t *root_hpa,
return;
sp = to_shadow_page(*root_hpa & PT64_BASE_ADDR_MASK);
- --sp->root_count;
- if (!sp->root_count && sp->role.invalid)
- kvm_mmu_prepare_zap_page(kvm, sp, invalid_list);
+
+ if (kvm_mmu_put_root(kvm, sp)) {
+ if (sp->tdp_mmu_page)
+ kvm_tdp_mmu_free_root(kvm, sp);
+ else if (sp->role.invalid)
+ kvm_mmu_prepare_zap_page(kvm, sp, invalid_list);
+ }
*root_hpa = INVALID_PAGE;
}
@@ -3602,6 +3166,7 @@ static void mmu_free_root_page(struct kvm *kvm, hpa_t *root_hpa,
void kvm_mmu_free_roots(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu,
ulong roots_to_free)
{
+ struct kvm *kvm = vcpu->kvm;
int i;
LIST_HEAD(invalid_list);
bool free_active_root = roots_to_free & KVM_MMU_ROOT_CURRENT;
@@ -3619,22 +3184,21 @@ void kvm_mmu_free_roots(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu,
return;
}
- spin_lock(&vcpu->kvm->mmu_lock);
+ spin_lock(&kvm->mmu_lock);
for (i = 0; i < KVM_MMU_NUM_PREV_ROOTS; i++)
if (roots_to_free & KVM_MMU_ROOT_PREVIOUS(i))
- mmu_free_root_page(vcpu->kvm, &mmu->prev_roots[i].hpa,
+ mmu_free_root_page(kvm, &mmu->prev_roots[i].hpa,
&invalid_list);
if (free_active_root) {
if (mmu->shadow_root_level >= PT64_ROOT_4LEVEL &&
(mmu->root_level >= PT64_ROOT_4LEVEL || mmu->direct_map)) {
- mmu_free_root_page(vcpu->kvm, &mmu->root_hpa,
- &invalid_list);
+ mmu_free_root_page(kvm, &mmu->root_hpa, &invalid_list);
} else {
for (i = 0; i < 4; ++i)
if (mmu->pae_root[i] != 0)
- mmu_free_root_page(vcpu->kvm,
+ mmu_free_root_page(kvm,
&mmu->pae_root[i],
&invalid_list);
mmu->root_hpa = INVALID_PAGE;
@@ -3642,8 +3206,8 @@ void kvm_mmu_free_roots(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu,
mmu->root_pgd = 0;
}
- kvm_mmu_commit_zap_page(vcpu->kvm, &invalid_list);
- spin_unlock(&vcpu->kvm->mmu_lock);
+ kvm_mmu_commit_zap_page(kvm, &invalid_list);
+ spin_unlock(&kvm->mmu_lock);
}
EXPORT_SYMBOL_GPL(kvm_mmu_free_roots);
@@ -3683,8 +3247,16 @@ static int mmu_alloc_direct_roots(struct kvm_vcpu *vcpu)
hpa_t root;
unsigned i;
- if (shadow_root_level >= PT64_ROOT_4LEVEL) {
- root = mmu_alloc_root(vcpu, 0, 0, shadow_root_level, true);
+ if (vcpu->kvm->arch.tdp_mmu_enabled) {
+ root = kvm_tdp_mmu_get_vcpu_root_hpa(vcpu);
+
+ if (!VALID_PAGE(root))
+ return -ENOSPC;
+ vcpu->arch.mmu->root_hpa = root;
+ } else if (shadow_root_level >= PT64_ROOT_4LEVEL) {
+ root = mmu_alloc_root(vcpu, 0, 0, shadow_root_level,
+ true);
+
if (!VALID_PAGE(root))
return -ENOSPC;
vcpu->arch.mmu->root_hpa = root;
@@ -3909,54 +3481,82 @@ static bool mmio_info_in_cache(struct kvm_vcpu *vcpu, u64 addr, bool direct)
return vcpu_match_mmio_gva(vcpu, addr);
}
-/* return true if reserved bit is detected on spte. */
-static bool
-walk_shadow_page_get_mmio_spte(struct kvm_vcpu *vcpu, u64 addr, u64 *sptep)
+/*
+ * Return the level of the lowest level SPTE added to sptes.
+ * That SPTE may be non-present.
+ */
+static int get_walk(struct kvm_vcpu *vcpu, u64 addr, u64 *sptes)
{
struct kvm_shadow_walk_iterator iterator;
- u64 sptes[PT64_ROOT_MAX_LEVEL], spte = 0ull;
- struct rsvd_bits_validate *rsvd_check;
- int root, leaf;
- bool reserved = false;
+ int leaf = vcpu->arch.mmu->root_level;
+ u64 spte;
- rsvd_check = &vcpu->arch.mmu->shadow_zero_check;
walk_shadow_page_lockless_begin(vcpu);
- for (shadow_walk_init(&iterator, vcpu, addr),
- leaf = root = iterator.level;
+ for (shadow_walk_init(&iterator, vcpu, addr);
shadow_walk_okay(&iterator);
__shadow_walk_next(&iterator, spte)) {
+ leaf = iterator.level;
spte = mmu_spte_get_lockless(iterator.sptep);
sptes[leaf - 1] = spte;
- leaf--;
if (!is_shadow_present_pte(spte))
break;
+ }
+
+ walk_shadow_page_lockless_end(vcpu);
+
+ return leaf;
+}
+
+/* return true if reserved bit is detected on spte. */
+static bool get_mmio_spte(struct kvm_vcpu *vcpu, u64 addr, u64 *sptep)
+{
+ u64 sptes[PT64_ROOT_MAX_LEVEL];
+ struct rsvd_bits_validate *rsvd_check;
+ int root = vcpu->arch.mmu->root_level;
+ int leaf;
+ int level;
+ bool reserved = false;
+
+ if (!VALID_PAGE(vcpu->arch.mmu->root_hpa)) {
+ *sptep = 0ull;
+ return reserved;
+ }
+
+ if (is_tdp_mmu_root(vcpu->kvm, vcpu->arch.mmu->root_hpa))
+ leaf = kvm_tdp_mmu_get_walk(vcpu, addr, sptes);
+ else
+ leaf = get_walk(vcpu, addr, sptes);
+
+ rsvd_check = &vcpu->arch.mmu->shadow_zero_check;
+
+ for (level = root; level >= leaf; level--) {
+ if (!is_shadow_present_pte(sptes[level - 1]))
+ break;
/*
* Use a bitwise-OR instead of a logical-OR to aggregate the
* reserved bit and EPT's invalid memtype/XWR checks to avoid
* adding a Jcc in the loop.
*/
- reserved |= __is_bad_mt_xwr(rsvd_check, spte) |
- __is_rsvd_bits_set(rsvd_check, spte, iterator.level);
+ reserved |= __is_bad_mt_xwr(rsvd_check, sptes[level - 1]) |
+ __is_rsvd_bits_set(rsvd_check, sptes[level - 1],
+ level);
}
- walk_shadow_page_lockless_end(vcpu);
-
if (reserved) {
pr_err("%s: detect reserved bits on spte, addr 0x%llx, dump hierarchy:\n",
__func__, addr);
- while (root > leaf) {
+ for (level = root; level >= leaf; level--)
pr_err("------ spte 0x%llx level %d.\n",
- sptes[root - 1], root);
- root--;
- }
+ sptes[level - 1], level);
}
- *sptep = spte;
+ *sptep = sptes[leaf - 1];
+
return reserved;
}
@@ -3968,7 +3568,7 @@ static int handle_mmio_page_fault(struct kvm_vcpu *vcpu, u64 addr, bool direct)
if (mmio_info_in_cache(vcpu, addr, direct))
return RET_PF_EMULATE;
- reserved = walk_shadow_page_get_mmio_spte(vcpu, addr, &spte);
+ reserved = get_mmio_spte(vcpu, addr, &spte);
if (WARN_ON(reserved))
return -EINVAL;
@@ -4079,8 +3679,6 @@ static int direct_page_fault(struct kvm_vcpu *vcpu, gpa_t gpa, u32 error_code,
bool prefault, int max_level, bool is_tdp)
{
bool write = error_code & PFERR_WRITE_MASK;
- bool exec = error_code & PFERR_FETCH_MASK;
- bool lpage_disallowed = exec && is_nx_huge_page_enabled();
bool map_writable;
gfn_t gfn = gpa >> PAGE_SHIFT;
@@ -4091,16 +3689,16 @@ static int direct_page_fault(struct kvm_vcpu *vcpu, gpa_t gpa, u32 error_code,
if (page_fault_handle_page_track(vcpu, error_code, gfn))
return RET_PF_EMULATE;
- if (fast_page_fault(vcpu, gpa, error_code))
- return RET_PF_RETRY;
+ if (!is_tdp_mmu_root(vcpu->kvm, vcpu->arch.mmu->root_hpa)) {
+ r = fast_page_fault(vcpu, gpa, error_code);
+ if (r != RET_PF_INVALID)
+ return r;
+ }
r = mmu_topup_memory_caches(vcpu, false);
if (r)
return r;
- if (lpage_disallowed)
- max_level = PG_LEVEL_4K;
-
mmu_seq = vcpu->kvm->mmu_notifier_seq;
smp_rmb();
@@ -4117,8 +3715,13 @@ static int direct_page_fault(struct kvm_vcpu *vcpu, gpa_t gpa, u32 error_code,
r = make_mmu_pages_available(vcpu);
if (r)
goto out_unlock;
- r = __direct_map(vcpu, gpa, write, map_writable, max_level, pfn,
- prefault, is_tdp && lpage_disallowed);
+
+ if (is_tdp_mmu_root(vcpu->kvm, vcpu->arch.mmu->root_hpa))
+ r = kvm_tdp_mmu_map(vcpu, gpa, error_code, map_writable, max_level,
+ pfn, prefault);
+ else
+ r = __direct_map(vcpu, gpa, error_code, map_writable, max_level, pfn,
+ prefault, is_tdp);
out_unlock:
spin_unlock(&vcpu->kvm->mmu_lock);
@@ -4291,7 +3894,13 @@ static void __kvm_mmu_new_pgd(struct kvm_vcpu *vcpu, gpa_t new_pgd,
*/
vcpu_clear_mmio_info(vcpu, MMIO_GVA_ANY);
- __clear_sp_write_flooding_count(to_shadow_page(vcpu->arch.mmu->root_hpa));
+ /*
+ * If this is a direct root page, it doesn't have a write flooding
+ * count. Otherwise, clear the write flooding count.
+ */
+ if (!new_role.direct)
+ __clear_sp_write_flooding_count(
+ to_shadow_page(vcpu->arch.mmu->root_hpa));
}
void kvm_mmu_new_pgd(struct kvm_vcpu *vcpu, gpa_t new_pgd, bool skip_tlb_flush,
@@ -4421,7 +4030,7 @@ __reset_rsvds_bits_mask(struct kvm_vcpu *vcpu,
rsvd_bits(maxphyaddr, 51);
rsvd_check->rsvd_bits_mask[1][4] =
rsvd_check->rsvd_bits_mask[0][4];
- /* fall through */
+ fallthrough;
case PT64_ROOT_4LEVEL:
rsvd_check->rsvd_bits_mask[0][3] = exb_bit_rsvd |
nonleaf_bit8_rsvd | rsvd_bits(7, 7) |
@@ -5399,7 +5008,7 @@ static void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa,
u32 base_role = vcpu->arch.mmu->mmu_role.base.word;
entry = *spte;
- mmu_page_zap_pte(vcpu->kvm, sp, spte);
+ mmu_page_zap_pte(vcpu->kvm, sp, spte, NULL);
if (gentry &&
!((sp->role.word ^ base_role) & ~role_ign.word) &&
rmap_can_add(vcpu))
@@ -5449,13 +5058,14 @@ int kvm_mmu_page_fault(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa, u64 error_code,
if (r == RET_PF_INVALID) {
r = kvm_mmu_do_page_fault(vcpu, cr2_or_gpa,
lower_32_bits(error_code), false);
- WARN_ON(r == RET_PF_INVALID);
+ if (WARN_ON_ONCE(r == RET_PF_INVALID))
+ return -EIO;
}
- if (r == RET_PF_RETRY)
- return 1;
if (r < 0)
return r;
+ if (r != RET_PF_EMULATE)
+ return 1;
/*
* Before emulating the instruction, check if the error code
@@ -5484,18 +5094,6 @@ int kvm_mmu_page_fault(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa, u64 error_code,
if (!mmio_info_in_cache(vcpu, cr2_or_gpa, direct) && !is_guest_mode(vcpu))
emulation_type |= EMULTYPE_ALLOW_RETRY_PF;
emulate:
- /*
- * On AMD platforms, under certain conditions insn_len may be zero on #NPF.
- * This can happen if a guest gets a page-fault on data access but the HW
- * table walker is not able to read the instruction page (e.g instruction
- * page is not present in memory). In those cases we simply restart the
- * guest, with the exception of AMD Erratum 1096 which is unrecoverable.
- */
- if (unlikely(insn && !insn_len)) {
- if (!kvm_x86_ops.need_emulation_on_page_fault(vcpu))
- return 1;
- }
-
return x86_emulate_instruction(vcpu, cr2_or_gpa, emulation_type, insn,
insn_len);
}
@@ -5681,11 +5279,17 @@ static void free_mmu_pages(struct kvm_mmu *mmu)
free_page((unsigned long)mmu->lm_root);
}
-static int alloc_mmu_pages(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu)
+static int __kvm_mmu_create(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu)
{
struct page *page;
int i;
+ mmu->root_hpa = INVALID_PAGE;
+ mmu->root_pgd = 0;
+ mmu->translate_gpa = translate_gpa;
+ for (i = 0; i < KVM_MMU_NUM_PREV_ROOTS; i++)
+ mmu->prev_roots[i] = KVM_MMU_ROOT_INFO_INVALID;
+
/*
* When using PAE paging, the four PDPTEs are treated as 'root' pages,
* while the PDP table is a per-vCPU construct that's allocated at MMU
@@ -5711,7 +5315,6 @@ static int alloc_mmu_pages(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu)
int kvm_mmu_create(struct kvm_vcpu *vcpu)
{
- uint i;
int ret;
vcpu->arch.mmu_pte_list_desc_cache.kmem_cache = pte_list_desc_cache;
@@ -5725,25 +5328,13 @@ int kvm_mmu_create(struct kvm_vcpu *vcpu)
vcpu->arch.mmu = &vcpu->arch.root_mmu;
vcpu->arch.walk_mmu = &vcpu->arch.root_mmu;
- vcpu->arch.root_mmu.root_hpa = INVALID_PAGE;
- vcpu->arch.root_mmu.root_pgd = 0;
- vcpu->arch.root_mmu.translate_gpa = translate_gpa;
- for (i = 0; i < KVM_MMU_NUM_PREV_ROOTS; i++)
- vcpu->arch.root_mmu.prev_roots[i] = KVM_MMU_ROOT_INFO_INVALID;
-
- vcpu->arch.guest_mmu.root_hpa = INVALID_PAGE;
- vcpu->arch.guest_mmu.root_pgd = 0;
- vcpu->arch.guest_mmu.translate_gpa = translate_gpa;
- for (i = 0; i < KVM_MMU_NUM_PREV_ROOTS; i++)
- vcpu->arch.guest_mmu.prev_roots[i] = KVM_MMU_ROOT_INFO_INVALID;
-
vcpu->arch.nested_mmu.translate_gpa = translate_nested_gpa;
- ret = alloc_mmu_pages(vcpu, &vcpu->arch.guest_mmu);
+ ret = __kvm_mmu_create(vcpu, &vcpu->arch.guest_mmu);
if (ret)
return ret;
- ret = alloc_mmu_pages(vcpu, &vcpu->arch.root_mmu);
+ ret = __kvm_mmu_create(vcpu, &vcpu->arch.root_mmu);
if (ret)
goto fail_allocate_root;
@@ -5840,6 +5431,10 @@ static void kvm_mmu_zap_all_fast(struct kvm *kvm)
kvm_reload_remote_mmus(kvm);
kvm_zap_obsolete_pages(kvm);
+
+ if (kvm->arch.tdp_mmu_enabled)
+ kvm_tdp_mmu_zap_all(kvm);
+
spin_unlock(&kvm->mmu_lock);
}
@@ -5859,6 +5454,8 @@ void kvm_mmu_init_vm(struct kvm *kvm)
{
struct kvm_page_track_notifier_node *node = &kvm->arch.mmu_sp_tracker;
+ kvm_mmu_init_tdp_mmu(kvm);
+
node->track_write = kvm_mmu_pte_write;
node->track_flush_slot = kvm_mmu_invalidate_zap_pages_in_memslot;
kvm_page_track_register_notifier(kvm, node);
@@ -5869,6 +5466,8 @@ void kvm_mmu_uninit_vm(struct kvm *kvm)
struct kvm_page_track_notifier_node *node = &kvm->arch.mmu_sp_tracker;
kvm_page_track_unregister_notifier(kvm, node);
+
+ kvm_mmu_uninit_tdp_mmu(kvm);
}
void kvm_zap_gfn_range(struct kvm *kvm, gfn_t gfn_start, gfn_t gfn_end)
@@ -5876,6 +5475,7 @@ void kvm_zap_gfn_range(struct kvm *kvm, gfn_t gfn_start, gfn_t gfn_end)
struct kvm_memslots *slots;
struct kvm_memory_slot *memslot;
int i;
+ bool flush;
spin_lock(&kvm->mmu_lock);
for (i = 0; i < KVM_ADDRESS_SPACE_NUM; i++) {
@@ -5895,6 +5495,12 @@ void kvm_zap_gfn_range(struct kvm *kvm, gfn_t gfn_start, gfn_t gfn_end)
}
}
+ if (kvm->arch.tdp_mmu_enabled) {
+ flush = kvm_tdp_mmu_zap_gfn_range(kvm, gfn_start, gfn_end);
+ if (flush)
+ kvm_flush_remote_tlbs(kvm);
+ }
+
spin_unlock(&kvm->mmu_lock);
}
@@ -5913,6 +5519,8 @@ void kvm_mmu_slot_remove_write_access(struct kvm *kvm,
spin_lock(&kvm->mmu_lock);
flush = slot_handle_level(kvm, memslot, slot_rmap_write_protect,
start_level, KVM_MAX_HUGEPAGE_LEVEL, false);
+ if (kvm->arch.tdp_mmu_enabled)
+ flush |= kvm_tdp_mmu_wrprot_slot(kvm, memslot, PG_LEVEL_4K);
spin_unlock(&kvm->mmu_lock);
/*
@@ -5976,6 +5584,9 @@ void kvm_mmu_zap_collapsible_sptes(struct kvm *kvm,
spin_lock(&kvm->mmu_lock);
slot_handle_leaf(kvm, (struct kvm_memory_slot *)memslot,
kvm_mmu_zap_collapsible_spte, true);
+
+ if (kvm->arch.tdp_mmu_enabled)
+ kvm_tdp_mmu_zap_collapsible_sptes(kvm, memslot);
spin_unlock(&kvm->mmu_lock);
}
@@ -6001,6 +5612,8 @@ void kvm_mmu_slot_leaf_clear_dirty(struct kvm *kvm,
spin_lock(&kvm->mmu_lock);
flush = slot_handle_leaf(kvm, memslot, __rmap_clear_dirty, false);
+ if (kvm->arch.tdp_mmu_enabled)
+ flush |= kvm_tdp_mmu_clear_dirty_slot(kvm, memslot);
spin_unlock(&kvm->mmu_lock);
/*
@@ -6022,6 +5635,8 @@ void kvm_mmu_slot_largepage_remove_write_access(struct kvm *kvm,
spin_lock(&kvm->mmu_lock);
flush = slot_handle_large_level(kvm, memslot, slot_rmap_write_protect,
false);
+ if (kvm->arch.tdp_mmu_enabled)
+ flush |= kvm_tdp_mmu_wrprot_slot(kvm, memslot, PG_LEVEL_2M);
spin_unlock(&kvm->mmu_lock);
if (flush)
@@ -6036,6 +5651,8 @@ void kvm_mmu_slot_set_dirty(struct kvm *kvm,
spin_lock(&kvm->mmu_lock);
flush = slot_handle_all_level(kvm, memslot, __rmap_set_dirty, false);
+ if (kvm->arch.tdp_mmu_enabled)
+ flush |= kvm_tdp_mmu_slot_set_dirty(kvm, memslot);
spin_unlock(&kvm->mmu_lock);
if (flush)
@@ -6061,6 +5678,10 @@ restart:
}
kvm_mmu_commit_zap_page(kvm, &invalid_list);
+
+ if (kvm->arch.tdp_mmu_enabled)
+ kvm_tdp_mmu_zap_all(kvm);
+
spin_unlock(&kvm->mmu_lock);
}
@@ -6356,7 +5977,10 @@ static void kvm_recover_nx_lpages(struct kvm *kvm)
ratio = READ_ONCE(nx_huge_pages_recovery_ratio);
to_zap = ratio ? DIV_ROUND_UP(kvm->stat.nx_lpage_splits, ratio) : 0;
- while (to_zap && !list_empty(&kvm->arch.lpage_disallowed_mmu_pages)) {
+ for ( ; to_zap; --to_zap) {
+ if (list_empty(&kvm->arch.lpage_disallowed_mmu_pages))
+ break;
+
/*
* We use a separate list instead of just using active_mmu_pages
* because the number of lpage_disallowed pages is expected to
@@ -6366,15 +5990,20 @@ static void kvm_recover_nx_lpages(struct kvm *kvm)
struct kvm_mmu_page,
lpage_disallowed_link);
WARN_ON_ONCE(!sp->lpage_disallowed);
- kvm_mmu_prepare_zap_page(kvm, sp, &invalid_list);
- WARN_ON_ONCE(sp->lpage_disallowed);
+ if (sp->tdp_mmu_page)
+ kvm_tdp_mmu_zap_gfn_range(kvm, sp->gfn,
+ sp->gfn + KVM_PAGES_PER_HPAGE(sp->role.level));
+ else {
+ kvm_mmu_prepare_zap_page(kvm, sp, &invalid_list);
+ WARN_ON_ONCE(sp->lpage_disallowed);
+ }
- if (!--to_zap || need_resched() || spin_needbreak(&kvm->mmu_lock)) {
+ if (need_resched() || spin_needbreak(&kvm->mmu_lock)) {
kvm_mmu_commit_zap_page(kvm, &invalid_list);
- if (to_zap)
- cond_resched_lock(&kvm->mmu_lock);
+ cond_resched_lock(&kvm->mmu_lock);
}
}
+ kvm_mmu_commit_zap_page(kvm, &invalid_list);
spin_unlock(&kvm->mmu_lock);
srcu_read_unlock(&kvm->srcu, rcu_idx);