aboutsummaryrefslogtreecommitdiffstats
path: root/arch/x86/kvm/mmu
diff options
context:
space:
mode:
Diffstat (limited to 'arch/x86/kvm/mmu')
-rw-r--r--arch/x86/kvm/mmu/mmu.c914
-rw-r--r--arch/x86/kvm/mmu/mmu_internal.h15
-rw-r--r--arch/x86/kvm/mmu/mmutrace.h2
-rw-r--r--arch/x86/kvm/mmu/page_track.c2
-rw-r--r--arch/x86/kvm/mmu/paging_tmpl.h82
-rw-r--r--arch/x86/kvm/mmu/spte.c22
-rw-r--r--arch/x86/kvm/mmu/spte.h32
-rw-r--r--arch/x86/kvm/mmu/tdp_mmu.c66
-rw-r--r--arch/x86/kvm/mmu/tdp_mmu.h35
9 files changed, 684 insertions, 486 deletions
diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c
index 4b3ee244ebe0..845d114ae075 100644
--- a/arch/x86/kvm/mmu/mmu.c
+++ b/arch/x86/kvm/mmu/mmu.c
@@ -55,7 +55,7 @@
extern bool itlb_multihit_kvm_mitigation;
-static int __read_mostly nx_huge_pages = -1;
+int __read_mostly nx_huge_pages = -1;
#ifdef CONFIG_PREEMPT_RT
/* Recovery can cause latency spikes, disable it for PREEMPT_RT. */
static uint __read_mostly nx_huge_pages_recovery_ratio = 0;
@@ -176,9 +176,80 @@ static void mmu_spte_set(u64 *sptep, u64 spte);
static union kvm_mmu_page_role
kvm_mmu_calc_root_page_role(struct kvm_vcpu *vcpu);
+struct kvm_mmu_role_regs {
+ const unsigned long cr0;
+ const unsigned long cr4;
+ const u64 efer;
+};
+
#define CREATE_TRACE_POINTS
#include "mmutrace.h"
+/*
+ * Yes, lot's of underscores. They're a hint that you probably shouldn't be
+ * reading from the role_regs. Once the mmu_role is constructed, it becomes
+ * the single source of truth for the MMU's state.
+ */
+#define BUILD_MMU_ROLE_REGS_ACCESSOR(reg, name, flag) \
+static inline bool ____is_##reg##_##name(struct kvm_mmu_role_regs *regs)\
+{ \
+ return !!(regs->reg & flag); \
+}
+BUILD_MMU_ROLE_REGS_ACCESSOR(cr0, pg, X86_CR0_PG);
+BUILD_MMU_ROLE_REGS_ACCESSOR(cr0, wp, X86_CR0_WP);
+BUILD_MMU_ROLE_REGS_ACCESSOR(cr4, pse, X86_CR4_PSE);
+BUILD_MMU_ROLE_REGS_ACCESSOR(cr4, pae, X86_CR4_PAE);
+BUILD_MMU_ROLE_REGS_ACCESSOR(cr4, smep, X86_CR4_SMEP);
+BUILD_MMU_ROLE_REGS_ACCESSOR(cr4, smap, X86_CR4_SMAP);
+BUILD_MMU_ROLE_REGS_ACCESSOR(cr4, pke, X86_CR4_PKE);
+BUILD_MMU_ROLE_REGS_ACCESSOR(cr4, la57, X86_CR4_LA57);
+BUILD_MMU_ROLE_REGS_ACCESSOR(efer, nx, EFER_NX);
+BUILD_MMU_ROLE_REGS_ACCESSOR(efer, lma, EFER_LMA);
+
+/*
+ * The MMU itself (with a valid role) is the single source of truth for the
+ * MMU. Do not use the regs used to build the MMU/role, nor the vCPU. The
+ * regs don't account for dependencies, e.g. clearing CR4 bits if CR0.PG=1,
+ * and the vCPU may be incorrect/irrelevant.
+ */
+#define BUILD_MMU_ROLE_ACCESSOR(base_or_ext, reg, name) \
+static inline bool is_##reg##_##name(struct kvm_mmu *mmu) \
+{ \
+ return !!(mmu->mmu_role. base_or_ext . reg##_##name); \
+}
+BUILD_MMU_ROLE_ACCESSOR(ext, cr0, pg);
+BUILD_MMU_ROLE_ACCESSOR(base, cr0, wp);
+BUILD_MMU_ROLE_ACCESSOR(ext, cr4, pse);
+BUILD_MMU_ROLE_ACCESSOR(ext, cr4, pae);
+BUILD_MMU_ROLE_ACCESSOR(ext, cr4, smep);
+BUILD_MMU_ROLE_ACCESSOR(ext, cr4, smap);
+BUILD_MMU_ROLE_ACCESSOR(ext, cr4, pke);
+BUILD_MMU_ROLE_ACCESSOR(ext, cr4, la57);
+BUILD_MMU_ROLE_ACCESSOR(base, efer, nx);
+
+static struct kvm_mmu_role_regs vcpu_to_role_regs(struct kvm_vcpu *vcpu)
+{
+ struct kvm_mmu_role_regs regs = {
+ .cr0 = kvm_read_cr0_bits(vcpu, KVM_MMU_CR0_ROLE_BITS),
+ .cr4 = kvm_read_cr4_bits(vcpu, KVM_MMU_CR4_ROLE_BITS),
+ .efer = vcpu->arch.efer,
+ };
+
+ return regs;
+}
+
+static int role_regs_to_root_level(struct kvm_mmu_role_regs *regs)
+{
+ if (!____is_cr0_pg(regs))
+ return 0;
+ else if (____is_efer_lma(regs))
+ return ____is_cr4_la57(regs) ? PT64_ROOT_5LEVEL :
+ PT64_ROOT_4LEVEL;
+ else if (____is_cr4_pae(regs))
+ return PT32E_ROOT_LEVEL;
+ else
+ return PT32_ROOT_LEVEL;
+}
static inline bool kvm_available_flush_tlb_with_range(void)
{
@@ -208,11 +279,6 @@ void kvm_flush_remote_tlbs_with_address(struct kvm *kvm,
kvm_flush_remote_tlbs_with_range(kvm, &range);
}
-bool is_nx_huge_page_enabled(void)
-{
- return READ_ONCE(nx_huge_pages);
-}
-
static void mark_mmio_spte(struct kvm_vcpu *vcpu, u64 *sptep, u64 gfn,
unsigned int access)
{
@@ -269,11 +335,6 @@ static int is_cpuid_PSE36(void)
return 1;
}
-static int is_nx(struct kvm_vcpu *vcpu)
-{
- return vcpu->arch.efer & EFER_NX;
-}
-
static gfn_t pse36_gfn_delta(u32 gpte)
{
int shift = 32 - PT32_DIR_PSE36_SHIFT - PAGE_SHIFT;
@@ -1177,8 +1238,7 @@ static bool __rmap_clear_dirty(struct kvm *kvm, struct kvm_rmap_head *rmap_head,
* @gfn_offset: start of the BITS_PER_LONG pages we care about
* @mask: indicates which pages we should protect
*
- * Used when we do not need to care about huge page mappings: e.g. during dirty
- * logging we do not have any such mappings.
+ * Used when we do not need to care about huge page mappings.
*/
static void kvm_mmu_write_protect_pt_masked(struct kvm *kvm,
struct kvm_memory_slot *slot,
@@ -1189,6 +1249,10 @@ static void kvm_mmu_write_protect_pt_masked(struct kvm *kvm,
if (is_tdp_mmu_enabled(kvm))
kvm_tdp_mmu_clear_dirty_pt_masked(kvm, slot,
slot->base_gfn + gfn_offset, mask, true);
+
+ if (!kvm_memslots_have_rmaps(kvm))
+ return;
+
while (mask) {
rmap_head = __gfn_to_rmap(slot->base_gfn + gfn_offset + __ffs(mask),
PG_LEVEL_4K, slot);
@@ -1218,6 +1282,10 @@ static void kvm_mmu_clear_dirty_pt_masked(struct kvm *kvm,
if (is_tdp_mmu_enabled(kvm))
kvm_tdp_mmu_clear_dirty_pt_masked(kvm, slot,
slot->base_gfn + gfn_offset, mask, false);
+
+ if (!kvm_memslots_have_rmaps(kvm))
+ return;
+
while (mask) {
rmap_head = __gfn_to_rmap(slot->base_gfn + gfn_offset + __ffs(mask),
PG_LEVEL_4K, slot);
@@ -1235,13 +1303,36 @@ static void kvm_mmu_clear_dirty_pt_masked(struct kvm *kvm,
* It calls kvm_mmu_write_protect_pt_masked to write protect selected pages to
* enable dirty logging for them.
*
- * Used when we do not need to care about huge page mappings: e.g. during dirty
- * logging we do not have any such mappings.
+ * We need to care about huge page mappings: e.g. during dirty logging we may
+ * have such mappings.
*/
void kvm_arch_mmu_enable_log_dirty_pt_masked(struct kvm *kvm,
struct kvm_memory_slot *slot,
gfn_t gfn_offset, unsigned long mask)
{
+ /*
+ * Huge pages are NOT write protected when we start dirty logging in
+ * initially-all-set mode; must write protect them here so that they
+ * are split to 4K on the first write.
+ *
+ * The gfn_offset is guaranteed to be aligned to 64, but the base_gfn
+ * of memslot has no such restriction, so the range can cross two large
+ * pages.
+ */
+ if (kvm_dirty_log_manual_protect_and_init_set(kvm)) {
+ gfn_t start = slot->base_gfn + gfn_offset + __ffs(mask);
+ gfn_t end = slot->base_gfn + gfn_offset + __fls(mask);
+
+ kvm_mmu_slot_gfn_write_protect(kvm, slot, start, PG_LEVEL_2M);
+
+ /* Cross two large pages? */
+ if (ALIGN(start << PAGE_SHIFT, PMD_SIZE) !=
+ ALIGN(end << PAGE_SHIFT, PMD_SIZE))
+ kvm_mmu_slot_gfn_write_protect(kvm, slot, end,
+ PG_LEVEL_2M);
+ }
+
+ /* Now handle 4K PTEs. */
if (kvm_x86_ops.cpu_dirty_log_size)
kvm_mmu_clear_dirty_pt_masked(kvm, slot, gfn_offset, mask);
else
@@ -1254,20 +1345,23 @@ int kvm_cpu_dirty_log_size(void)
}
bool kvm_mmu_slot_gfn_write_protect(struct kvm *kvm,
- struct kvm_memory_slot *slot, u64 gfn)
+ struct kvm_memory_slot *slot, u64 gfn,
+ int min_level)
{
struct kvm_rmap_head *rmap_head;
int i;
bool write_protected = false;
- for (i = PG_LEVEL_4K; i <= KVM_MAX_HUGEPAGE_LEVEL; ++i) {
- rmap_head = __gfn_to_rmap(gfn, i, slot);
- write_protected |= __rmap_write_protect(kvm, rmap_head, true);
+ if (kvm_memslots_have_rmaps(kvm)) {
+ for (i = min_level; i <= KVM_MAX_HUGEPAGE_LEVEL; ++i) {
+ rmap_head = __gfn_to_rmap(gfn, i, slot);
+ write_protected |= __rmap_write_protect(kvm, rmap_head, true);
+ }
}
if (is_tdp_mmu_enabled(kvm))
write_protected |=
- kvm_tdp_mmu_write_protect_gfn(kvm, slot, gfn);
+ kvm_tdp_mmu_write_protect_gfn(kvm, slot, gfn, min_level);
return write_protected;
}
@@ -1277,7 +1371,7 @@ static bool rmap_write_protect(struct kvm_vcpu *vcpu, u64 gfn)
struct kvm_memory_slot *slot;
slot = kvm_vcpu_gfn_to_memslot(vcpu, gfn);
- return kvm_mmu_slot_gfn_write_protect(vcpu->kvm, slot, gfn);
+ return kvm_mmu_slot_gfn_write_protect(vcpu->kvm, slot, gfn, PG_LEVEL_4K);
}
static bool kvm_zap_rmapp(struct kvm *kvm, struct kvm_rmap_head *rmap_head,
@@ -1433,9 +1527,10 @@ static __always_inline bool kvm_handle_gfn_range(struct kvm *kvm,
bool kvm_unmap_gfn_range(struct kvm *kvm, struct kvm_gfn_range *range)
{
- bool flush;
+ bool flush = false;
- flush = kvm_handle_gfn_range(kvm, range, kvm_unmap_rmapp);
+ if (kvm_memslots_have_rmaps(kvm))
+ flush = kvm_handle_gfn_range(kvm, range, kvm_unmap_rmapp);
if (is_tdp_mmu_enabled(kvm))
flush |= kvm_tdp_mmu_unmap_gfn_range(kvm, range, flush);
@@ -1445,9 +1540,10 @@ bool kvm_unmap_gfn_range(struct kvm *kvm, struct kvm_gfn_range *range)
bool kvm_set_spte_gfn(struct kvm *kvm, struct kvm_gfn_range *range)
{
- bool flush;
+ bool flush = false;
- flush = kvm_handle_gfn_range(kvm, range, kvm_set_pte_rmapp);
+ if (kvm_memslots_have_rmaps(kvm))
+ flush = kvm_handle_gfn_range(kvm, range, kvm_set_pte_rmapp);
if (is_tdp_mmu_enabled(kvm))
flush |= kvm_tdp_mmu_set_spte_gfn(kvm, range);
@@ -1500,9 +1596,10 @@ static void rmap_recycle(struct kvm_vcpu *vcpu, u64 *spte, gfn_t gfn)
bool kvm_age_gfn(struct kvm *kvm, struct kvm_gfn_range *range)
{
- bool young;
+ bool young = false;
- young = kvm_handle_gfn_range(kvm, range, kvm_age_rmapp);
+ if (kvm_memslots_have_rmaps(kvm))
+ young = kvm_handle_gfn_range(kvm, range, kvm_age_rmapp);
if (is_tdp_mmu_enabled(kvm))
young |= kvm_tdp_mmu_age_gfn_range(kvm, range);
@@ -1512,9 +1609,10 @@ bool kvm_age_gfn(struct kvm *kvm, struct kvm_gfn_range *range)
bool kvm_test_age_gfn(struct kvm *kvm, struct kvm_gfn_range *range)
{
- bool young;
+ bool young = false;
- young = kvm_handle_gfn_range(kvm, range, kvm_test_age_rmapp);
+ if (kvm_memslots_have_rmaps(kvm))
+ young = kvm_handle_gfn_range(kvm, range, kvm_test_age_rmapp);
if (is_tdp_mmu_enabled(kvm))
young |= kvm_tdp_mmu_test_age_gfn(kvm, range);
@@ -1748,17 +1846,10 @@ static void kvm_mmu_commit_zap_page(struct kvm *kvm,
&(_kvm)->arch.mmu_page_hash[kvm_page_table_hashfn(_gfn)]) \
if ((_sp)->gfn != (_gfn) || (_sp)->role.direct) {} else
-static inline bool is_ept_sp(struct kvm_mmu_page *sp)
-{
- return sp->role.cr0_wp && sp->role.smap_andnot_wp;
-}
-
-/* @sp->gfn should be write-protected at the call site */
-static bool __kvm_sync_page(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp,
- struct list_head *invalid_list)
+static bool kvm_sync_page(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp,
+ struct list_head *invalid_list)
{
- if ((!is_ept_sp(sp) && sp->role.gpte_is_8_bytes != !!is_pae(vcpu)) ||
- vcpu->arch.mmu->sync_page(vcpu, sp) == 0) {
+ if (vcpu->arch.mmu->sync_page(vcpu, sp) == 0) {
kvm_mmu_prepare_zap_page(vcpu->kvm, sp, invalid_list);
return false;
}
@@ -1804,31 +1895,6 @@ static bool is_obsolete_sp(struct kvm *kvm, struct kvm_mmu_page *sp)
unlikely(sp->mmu_valid_gen != kvm->arch.mmu_valid_gen);
}
-static bool kvm_sync_page(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp,
- struct list_head *invalid_list)
-{
- kvm_unlink_unsync_page(vcpu->kvm, sp);
- return __kvm_sync_page(vcpu, sp, invalid_list);
-}
-
-/* @gfn should be write-protected at the call site */
-static bool kvm_sync_pages(struct kvm_vcpu *vcpu, gfn_t gfn,
- struct list_head *invalid_list)
-{
- struct kvm_mmu_page *s;
- bool ret = false;
-
- for_each_gfn_indirect_valid_sp(vcpu->kvm, s, gfn) {
- if (!s->unsync)
- continue;
-
- WARN_ON(s->role.level != PG_LEVEL_4K);
- ret |= kvm_sync_page(vcpu, s, invalid_list);
- }
-
- return ret;
-}
-
struct mmu_page_path {
struct kvm_mmu_page *parent[PT64_ROOT_MAX_LEVEL];
unsigned int idx[PT64_ROOT_MAX_LEVEL];
@@ -1923,6 +1989,7 @@ static void mmu_sync_children(struct kvm_vcpu *vcpu,
}
for_each_sp(pages, sp, parents, i) {
+ kvm_unlink_unsync_page(vcpu->kvm, sp);
flush |= kvm_sync_page(vcpu, sp, &invalid_list);
mmu_pages_clear_parents(&parents);
}
@@ -1958,8 +2025,6 @@ static struct kvm_mmu_page *kvm_mmu_get_page(struct kvm_vcpu *vcpu,
struct hlist_head *sp_list;
unsigned quadrant;
struct kvm_mmu_page *sp;
- bool need_sync = false;
- bool flush = false;
int collisions = 0;
LIST_HEAD(invalid_list);
@@ -1982,20 +2047,39 @@ static struct kvm_mmu_page *kvm_mmu_get_page(struct kvm_vcpu *vcpu,
continue;
}
- if (!need_sync && sp->unsync)
- need_sync = true;
-
- if (sp->role.word != role.word)
+ if (sp->role.word != role.word) {
+ /*
+ * If the guest is creating an upper-level page, zap
+ * unsync pages for the same gfn. While it's possible
+ * the guest is using recursive page tables, in all
+ * likelihood the guest has stopped using the unsync
+ * page and is installing a completely unrelated page.
+ * Unsync pages must not be left as is, because the new
+ * upper-level page will be write-protected.
+ */
+ if (level > PG_LEVEL_4K && sp->unsync)
+ kvm_mmu_prepare_zap_page(vcpu->kvm, sp,
+ &invalid_list);
continue;
+ }
if (direct_mmu)
goto trace_get_page;
if (sp->unsync) {
- /* The page is good, but __kvm_sync_page might still end
- * up zapping it. If so, break in order to rebuild it.
+ /*
+ * The page is good, but is stale. kvm_sync_page does
+ * get the latest guest state, but (unlike mmu_unsync_children)
+ * it doesn't write-protect the page or mark it synchronized!
+ * This way the validity of the mapping is ensured, but the
+ * overhead of write protection is not incurred until the
+ * guest invalidates the TLB mapping. This allows multiple
+ * SPs for a single gfn to be unsync.
+ *
+ * If the sync fails, the page is zapped. If so, break
+ * in order to rebuild it.
*/
- if (!__kvm_sync_page(vcpu, sp, &invalid_list))
+ if (!kvm_sync_page(vcpu, sp, &invalid_list))
break;
WARN_ON(!list_empty(&invalid_list));
@@ -2020,22 +2104,14 @@ trace_get_page:
sp->role = role;
hlist_add_head(&sp->hash_link, sp_list);
if (!direct) {
- /*
- * we should do write protection before syncing pages
- * otherwise the content of the synced shadow page may
- * be inconsistent with guest page table.
- */
account_shadowed(vcpu->kvm, sp);
if (level == PG_LEVEL_4K && rmap_write_protect(vcpu, gfn))
kvm_flush_remote_tlbs_with_address(vcpu->kvm, gfn, 1);
-
- if (level > PG_LEVEL_4K && need_sync)
- flush |= kvm_sync_pages(vcpu, gfn, &invalid_list);
}
trace_kvm_mmu_get_page(sp, true);
-
- kvm_mmu_flush_or_zap(vcpu, &invalid_list, false, flush);
out:
+ kvm_mmu_commit_zap_page(vcpu->kvm, &invalid_list);
+
if (collisions > vcpu->kvm->stat.max_mmu_page_hash_collisions)
vcpu->kvm->stat.max_mmu_page_hash_collisions = collisions;
return sp;
@@ -2374,7 +2450,7 @@ static int make_mmu_pages_available(struct kvm_vcpu *vcpu)
* page is available, while the caller may end up allocating as many as
* four pages, e.g. for PAE roots or for 5-level paging. Temporarily
* exceeding the (arbitrary by default) limit will not harm the host,
- * being too agressive may unnecessarily kill the guest, and getting an
+ * being too aggressive may unnecessarily kill the guest, and getting an
* exact count is far more trouble than it's worth, especially in the
* page fault paths.
*/
@@ -2448,17 +2524,33 @@ static void kvm_unsync_page(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp)
kvm_mmu_mark_parents_unsync(sp);
}
-bool mmu_need_write_protect(struct kvm_vcpu *vcpu, gfn_t gfn,
- bool can_unsync)
+/*
+ * Attempt to unsync any shadow pages that can be reached by the specified gfn,
+ * KVM is creating a writable mapping for said gfn. Returns 0 if all pages
+ * were marked unsync (or if there is no shadow page), -EPERM if the SPTE must
+ * be write-protected.
+ */
+int mmu_try_to_unsync_pages(struct kvm_vcpu *vcpu, gfn_t gfn, bool can_unsync)
{
struct kvm_mmu_page *sp;
+ /*
+ * Force write-protection if the page is being tracked. Note, the page
+ * track machinery is used to write-protect upper-level shadow pages,
+ * i.e. this guards the role.level == 4K assertion below!
+ */
if (kvm_page_track_is_active(vcpu, gfn, KVM_PAGE_TRACK_WRITE))
- return true;
+ return -EPERM;
+ /*
+ * The page is not write-tracked, mark existing shadow pages unsync
+ * unless KVM is synchronizing an unsync SP (can_unsync = false). In
+ * that case, KVM must complete emulation of the guest TLB flush before
+ * allowing shadow pages to become unsync (writable by the guest).
+ */
for_each_gfn_indirect_valid_sp(vcpu->kvm, sp, gfn) {
if (!can_unsync)
- return true;
+ return -EPERM;
if (sp->unsync)
continue;
@@ -2489,8 +2581,8 @@ bool mmu_need_write_protect(struct kvm_vcpu *vcpu, gfn_t gfn,
* 2.2 Guest issues TLB flush.
* That causes a VM Exit.
*
- * 2.3 kvm_mmu_sync_pages() reads sp->unsync.
- * Since it is false, so it just returns.
+ * 2.3 Walking of unsync pages sees sp->unsync is
+ * false and skips the page.
*
* 2.4 Guest accesses GVA X.
* Since the mapping in the SP was not updated,
@@ -2506,7 +2598,7 @@ bool mmu_need_write_protect(struct kvm_vcpu *vcpu, gfn_t gfn,
*/
smp_wmb();
- return false;
+ return 0;
}
static int set_spte(struct kvm_vcpu *vcpu, u64 *sptep,
@@ -2827,9 +2919,6 @@ static int __direct_map(struct kvm_vcpu *vcpu, gpa_t gpa, u32 error_code,
gfn_t gfn = gpa >> PAGE_SHIFT;
gfn_t base_gfn = gfn;
- if (WARN_ON(!VALID_PAGE(vcpu->arch.mmu->root_hpa)))
- return RET_PF_RETRY;
-
level = kvm_mmu_hugepage_adjust(vcpu, gfn, max_level, &pfn,
huge_page_disallowed, &req_level);
@@ -3180,6 +3269,33 @@ void kvm_mmu_free_roots(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu,
}
EXPORT_SYMBOL_GPL(kvm_mmu_free_roots);
+void kvm_mmu_free_guest_mode_roots(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu)
+{
+ unsigned long roots_to_free = 0;
+ hpa_t root_hpa;
+ int i;
+
+ /*
+ * This should not be called while L2 is active, L2 can't invalidate
+ * _only_ its own roots, e.g. INVVPID unconditionally exits.
+ */
+ WARN_ON_ONCE(mmu->mmu_role.base.guest_mode);
+
+ for (i = 0; i < KVM_MMU_NUM_PREV_ROOTS; i++) {
+ root_hpa = mmu->prev_roots[i].hpa;
+ if (!VALID_PAGE(root_hpa))
+ continue;
+
+ if (!to_shadow_page(root_hpa) ||
+ to_shadow_page(root_hpa)->role.guest_mode)
+ roots_to_free |= KVM_MMU_ROOT_PREVIOUS(i);
+ }
+
+ kvm_mmu_free_roots(vcpu, mmu, roots_to_free);
+}
+EXPORT_SYMBOL_GPL(kvm_mmu_free_guest_mode_roots);
+
+
static int mmu_check_root(struct kvm_vcpu *vcpu, gfn_t root_gfn)
{
int ret = 0;
@@ -3280,6 +3396,10 @@ static int mmu_alloc_shadow_roots(struct kvm_vcpu *vcpu)
}
}
+ r = alloc_all_memslots_rmaps(vcpu->kvm);
+ if (r)
+ return r;
+
write_lock(&vcpu->kvm->mmu_lock);
r = make_mmu_pages_available(vcpu);
if (r < 0)
@@ -3310,12 +3430,12 @@ static int mmu_alloc_shadow_roots(struct kvm_vcpu *vcpu)
if (mmu->shadow_root_level == PT64_ROOT_4LEVEL) {
pm_mask |= PT_ACCESSED_MASK | PT_WRITABLE_MASK | PT_USER_MASK;
- if (WARN_ON_ONCE(!mmu->lm_root)) {
+ if (WARN_ON_ONCE(!mmu->pml4_root)) {
r = -EIO;
goto out_unlock;
}
- mmu->lm_root[0] = __pa(mmu->pae_root) | pm_mask;
+ mmu->pml4_root[0] = __pa(mmu->pae_root) | pm_mask;
}
for (i = 0; i < 4; ++i) {
@@ -3335,7 +3455,7 @@ static int mmu_alloc_shadow_roots(struct kvm_vcpu *vcpu)
}
if (mmu->shadow_root_level == PT64_ROOT_4LEVEL)
- mmu->root_hpa = __pa(mmu->lm_root);
+ mmu->root_hpa = __pa(mmu->pml4_root);
else
mmu->root_hpa = __pa(mmu->pae_root);
@@ -3350,7 +3470,7 @@ out_unlock:
static int mmu_alloc_special_roots(struct kvm_vcpu *vcpu)
{
struct kvm_mmu *mmu = vcpu->arch.mmu;
- u64 *lm_root, *pae_root;
+ u64 *pml4_root, *pae_root;
/*
* When shadowing 32-bit or PAE NPT with 64-bit NPT, the PML4 and PDP
@@ -3369,14 +3489,14 @@ static int mmu_alloc_special_roots(struct kvm_vcpu *vcpu)
if (WARN_ON_ONCE(mmu->shadow_root_level != PT64_ROOT_4LEVEL))
return -EIO;
- if (mmu->pae_root && mmu->lm_root)
+ if (mmu->pae_root && mmu->pml4_root)
return 0;
/*
* The special roots should always be allocated in concert. Yell and
* bail if KVM ends up in a state where only one of the roots is valid.
*/
- if (WARN_ON_ONCE(!tdp_enabled || mmu->pae_root || mmu->lm_root))
+ if (WARN_ON_ONCE(!tdp_enabled || mmu->pae_root || mmu->pml4_root))
return -EIO;
/*
@@ -3387,14 +3507,14 @@ static int mmu_alloc_special_roots(struct kvm_vcpu *vcpu)
if (!pae_root)
return -ENOMEM;
- lm_root = (void *)get_zeroed_page(GFP_KERNEL_ACCOUNT);
- if (!lm_root) {
+ pml4_root = (void *)get_zeroed_page(GFP_KERNEL_ACCOUNT);
+ if (!pml4_root) {
free_page((unsigned long)pae_root);
return -ENOMEM;
}
mmu->pae_root = pae_root;
- mmu->lm_root = lm_root;
+ mmu->pml4_root = pml4_root;
return 0;
}
@@ -3423,8 +3543,8 @@ void kvm_mmu_sync_roots(struct kvm_vcpu *vcpu)
* flush strictly after those changes are made. We only need to
* ensure that the other CPU sets these flags before any actual
* changes to the page tables are made. The comments in
- * mmu_need_write_protect() describe what could go wrong if this
- * requirement isn't satisfied.
+ * mmu_try_to_unsync_pages() describe what could go wrong if
+ * this requirement isn't satisfied.
*/
if (!smp_load_acquire(&sp->unsync) &&
!smp_load_acquire(&sp->unsync_children))
@@ -3474,19 +3594,6 @@ static gpa_t nonpaging_gva_to_gpa_nested(struct kvm_vcpu *vcpu, gpa_t vaddr,
return vcpu->arch.nested_mmu.translate_gpa(vcpu, vaddr, access, exception);
}
-static bool
-__is_rsvd_bits_set(struct rsvd_bits_validate *rsvd_check, u64 pte, int level)
-{
- int bit7 = (pte >> 7) & 1;
-
- return pte & rsvd_check->rsvd_bits_mask[bit7][level-1];
-}
-
-static bool __is_bad_mt_xwr(struct rsvd_bits_validate *rsvd_check, u64 pte)
-{
- return rsvd_check->bad_mt_xwr & BIT_ULL(pte & 0x3f);
-}
-
static bool mmio_info_in_cache(struct kvm_vcpu *vcpu, u64 addr, bool direct)
{
/*
@@ -3540,12 +3647,7 @@ static bool get_mmio_spte(struct kvm_vcpu *vcpu, u64 addr, u64 *sptep)
int root, leaf, level;
bool reserved = false;
- if (!VALID_PAGE(vcpu->arch.mmu->root_hpa)) {
- *sptep = 0ull;
- return reserved;
- }
-
- if (is_tdp_mmu_root(vcpu->kvm, vcpu->arch.mmu->root_hpa))
+ if (is_tdp_mmu(vcpu->arch.mmu))
leaf = kvm_tdp_mmu_get_walk(vcpu, addr, sptes, &root);
else
leaf = get_walk(vcpu, addr, sptes, &root);
@@ -3569,13 +3671,7 @@ static bool get_mmio_spte(struct kvm_vcpu *vcpu, u64 addr, u64 *sptep)
rsvd_check = &vcpu->arch.mmu->shadow_zero_check;
for (level = root; level >= leaf; level--)
- /*
- * Use a bitwise-OR instead of a logical-OR to aggregate the
- * reserved bit and EPT's invalid memtype/XWR checks to avoid
- * adding a Jcc in the loop.
- */
- reserved |= __is_bad_mt_xwr(rsvd_check, sptes[level]) |
- __is_rsvd_bits_set(rsvd_check, sptes[level], level);
+ reserved |= is_rsvd_spte(rsvd_check, sptes[level], level);
if (reserved) {
pr_err("%s: reserved bits set on MMU-present spte, addr 0x%llx, hierarchy:\n",
@@ -3583,7 +3679,7 @@ static bool get_mmio_spte(struct kvm_vcpu *vcpu, u64 addr, u64 *sptep)
for (level = root; level >= leaf; level--)
pr_err("------ spte = 0x%llx level = %d, rsvd bits = 0x%llx",
sptes[level], level,
- rsvd_check->rsvd_bits_mask[(sptes[level] >> 7) & 1][level-1]);
+ get_rsvd_bits(rsvd_check, sptes[level], level));
}
return reserved;
@@ -3717,6 +3813,7 @@ static bool try_async_pf(struct kvm_vcpu *vcpu, bool prefault, gfn_t gfn,
static int direct_page_fault(struct kvm_vcpu *vcpu, gpa_t gpa, u32 error_code,
bool prefault, int max_level, bool is_tdp)
{
+ bool is_tdp_mmu_fault = is_tdp_mmu(vcpu->arch.mmu);
bool write = error_code & PFERR_WRITE_MASK;
bool map_writable;
@@ -3729,7 +3826,7 @@ static int direct_page_fault(struct kvm_vcpu *vcpu, gpa_t gpa, u32 error_code,
if (page_fault_handle_page_track(vcpu, error_code, gfn))
return RET_PF_EMULATE;
- if (!is_tdp_mmu_root(vcpu->kvm, vcpu->arch.mmu->root_hpa)) {
+ if (!is_tdp_mmu_fault) {
r = fast_page_fault(vcpu, gpa, error_code);
if (r != RET_PF_INVALID)
return r;
@@ -3751,7 +3848,7 @@ static int direct_page_fault(struct kvm_vcpu *vcpu, gpa_t gpa, u32 error_code,
r = RET_PF_RETRY;
- if (is_tdp_mmu_root(vcpu->kvm, vcpu->arch.mmu->root_hpa))
+ if (is_tdp_mmu_fault)
read_lock(&vcpu->kvm->mmu_lock);
else
write_lock(&vcpu->kvm->mmu_lock);
@@ -3762,7 +3859,7 @@ static int direct_page_fault(struct kvm_vcpu *vcpu, gpa_t gpa, u32 error_code,
if (r)
goto out_unlock;
- if (is_tdp_mmu_root(vcpu->kvm, vcpu->arch.mmu->root_hpa))
+ if (is_tdp_mmu_fault)
r = kvm_tdp_mmu_map(vcpu, gpa, error_code, map_writable, max_level,
pfn, prefault);
else
@@ -3770,7 +3867,7 @@ static int direct_page_fault(struct kvm_vcpu *vcpu, gpa_t gpa, u32 error_code,
prefault, is_tdp);
out_unlock:
- if (is_tdp_mmu_root(vcpu->kvm, vcpu->arch.mmu->root_hpa))
+ if (is_tdp_mmu_fault)
read_unlock(&vcpu->kvm->mmu_lock);
else
write_unlock(&vcpu->kvm->mmu_lock);
@@ -3840,17 +3937,13 @@ int kvm_tdp_page_fault(struct kvm_vcpu *vcpu, gpa_t gpa, u32 error_code,
max_level, true);
}
-static void nonpaging_init_context(struct kvm_vcpu *vcpu,
- struct kvm_mmu *context)
+static void nonpaging_init_context(struct kvm_mmu *context)
{
context->page_fault = nonpaging_page_fault;
context->gva_to_gpa = nonpaging_gva_to_gpa;
context->sync_page = nonpaging_sync_page;
context->invlpg = NULL;
- context->root_level = 0;
- context->shadow_root_level = PT32E_ROOT_LEVEL;
context->direct_map = true;
- context->nx = false;
}
static inline bool is_root_usable(struct kvm_mmu_root_info *root, gpa_t pgd,
@@ -3913,8 +4006,7 @@ static bool fast_pgd_switch(struct kvm_vcpu *vcpu, gpa_t new_pgd,
}
static void __kvm_mmu_new_pgd(struct kvm_vcpu *vcpu, gpa_t new_pgd,
- union kvm_mmu_page_role new_role,
- bool skip_tlb_flush, bool skip_mmu_sync)
+ union kvm_mmu_page_role new_role)
{
if (!fast_pgd_switch(vcpu, new_pgd, new_role)) {
kvm_mmu_free_roots(vcpu, vcpu->arch.mmu, KVM_MMU_ROOT_CURRENT);
@@ -3929,10 +4021,10 @@ static void __kvm_mmu_new_pgd(struct kvm_vcpu *vcpu, gpa_t new_pgd,
*/
kvm_make_request(KVM_REQ_LOAD_MMU_PGD, vcpu);
- if (!skip_mmu_sync || force_flush_and_sync_on_reuse)
+ if (force_flush_and_sync_on_reuse) {
kvm_make_request(KVM_REQ_MMU_SYNC, vcpu);
- if (!skip_tlb_flush || force_flush_and_sync_on_reuse)
kvm_make_request(KVM_REQ_TLB_FLUSH_CURRENT, vcpu);
+ }
/*
* The last MMIO access's GVA and GPA are cached in the VCPU. When
@@ -3951,11 +4043,9 @@ static void __kvm_mmu_new_pgd(struct kvm_vcpu *vcpu, gpa_t new_pgd,
to_shadow_page(vcpu->arch.mmu->root_hpa));
}
-void kvm_mmu_new_pgd(struct kvm_vcpu *vcpu, gpa_t new_pgd, bool skip_tlb_flush,
- bool skip_mmu_sync)
+void kvm_mmu_new_pgd(struct kvm_vcpu *vcpu, gpa_t new_pgd)
{
- __kvm_mmu_new_pgd(vcpu, new_pgd, kvm_mmu_calc_root_page_role(vcpu),
- skip_tlb_flush, skip_mmu_sync);
+ __kvm_mmu_new_pgd(vcpu, new_pgd, kvm_mmu_calc_root_page_role(vcpu));
}
EXPORT_SYMBOL_GPL(kvm_mmu_new_pgd);
@@ -3981,26 +4071,6 @@ static bool sync_mmio_spte(struct kvm_vcpu *vcpu, u64 *sptep, gfn_t gfn,
return false;
}
-static inline bool is_last_gpte(struct kvm_mmu *mmu,
- unsigned level, unsigned gpte)
-{
- /*
- * The RHS has bit 7 set iff level < mmu->last_nonleaf_level.
- * If it is clear, there are no large pages at this level, so clear
- * PT_PAGE_SIZE_MASK in gpte if that is the case.
- */
- gpte &= level - mmu->last_nonleaf_level;
-
- /*
- * PG_LEVEL_4K always terminates. The RHS has bit 7 set
- * iff level <= PG_LEVEL_4K, which for our purpose means
- * level == PG_LEVEL_4K; set PT_PAGE_SIZE_MASK in gpte then.
- */
- gpte |= level - PG_LEVEL_4K - 1;
-
- return gpte & PT_PAGE_SIZE_MASK;
-}
-
#define PTTYPE_EPT 18 /* arbitrary */
#define PTTYPE PTTYPE_EPT
#include "paging_tmpl.h"
@@ -4015,8 +4085,7 @@ static inline bool is_last_gpte(struct kvm_mmu *mmu,
#undef PTTYPE
static void
-__reset_rsvds_bits_mask(struct kvm_vcpu *vcpu,
- struct rsvd_bits_validate *rsvd_check,
+__reset_rsvds_bits_mask(struct rsvd_bits_validate *rsvd_check,
u64 pa_bits_rsvd, int level, bool nx, bool gbpages,
bool pse, bool amd)
{
@@ -4105,14 +4174,29 @@ __reset_rsvds_bits_mask(struct kvm_vcpu *vcpu,
}
}
+static bool guest_can_use_gbpages(struct kvm_vcpu *vcpu)
+{
+ /*
+ * If TDP is enabled, let the guest use GBPAGES if they're supported in
+ * hardware. The hardware page walker doesn't let KVM disable GBPAGES,
+ * i.e. won't treat them as reserved, and KVM doesn't redo the GVA->GPA
+ * walk for performance and complexity reasons. Not to mention KVM
+ * _can't_ solve the problem because GVA->GPA walks aren't visible to
+ * KVM once a TDP translation is installed. Mimic hardware behavior so
+ * that KVM's is at least consistent, i.e. doesn't randomly inject #PF.
+ */
+ return tdp_enabled ? boot_cpu_has(X86_FEATURE_GBPAGES) :
+ guest_cpuid_has(vcpu, X86_FEATURE_GBPAGES);
+}
+
static void reset_rsvds_bits_mask(struct kvm_vcpu *vcpu,
struct kvm_mmu *context)
{
- __reset_rsvds_bits_mask(vcpu, &context->guest_rsvd_check,
+ __reset_rsvds_bits_mask(&context->guest_rsvd_check,
vcpu->arch.reserved_gpa_bits,
- context->root_level, context->nx,
- guest_cpuid_has(vcpu, X86_FEATURE_GBPAGES),
- is_pse(vcpu),
+ context->root_level, is_efer_nx(context),
+ guest_can_use_gbpages(vcpu),
+ is_cr4_pse(context),
guest_cpuid_is_amd_or_hygon(vcpu));
}
@@ -4165,24 +4249,32 @@ static inline u64 reserved_hpa_bits(void)
* table in guest or amd nested guest, its mmu features completely
* follow the features in guest.
*/
-void
-reset_shadow_zero_bits_mask(struct kvm_vcpu *vcpu, struct kvm_mmu *context)
+static void reset_shadow_zero_bits_mask(struct kvm_vcpu *vcpu,
+ struct kvm_mmu *context)
{
- bool uses_nx = context->nx ||
- context->mmu_role.base.smep_andnot_wp;
+ /*
+ * KVM uses NX when TDP is disabled to handle a variety of scenarios,
+ * notably for huge SPTEs if iTLB multi-hit mitigation is enabled and
+ * to generate correct permissions for CR0.WP=0/CR4.SMEP=1/EFER.NX=0.
+ * The iTLB multi-hit workaround can be toggled at any time, so assume
+ * NX can be used by any non-nested shadow MMU to avoid having to reset
+ * MMU contexts. Note, KVM forces EFER.NX=1 when TDP is disabled.
+ */
+ bool uses_nx = is_efer_nx(context) || !tdp_enabled;
+
+ /* @amd adds a check on bit of SPTEs, which KVM shouldn't use anyways. */
+ bool is_amd = true;
+ /* KVM doesn't use 2-level page tables for the shadow MMU. */
+ bool is_pse = false;
struct rsvd_bits_validate *shadow_zero_check;
int i;
- /*
- * Passing "true" to the last argument is okay; it adds a check
- * on bit 8 of the SPTEs which KVM doesn't use anyway.
- */
+ WARN_ON_ONCE(context->shadow_root_level < PT32E_ROOT_LEVEL);
+
shadow_zero_check = &context->shadow_zero_check;
- __reset_rsvds_bits_mask(vcpu, shadow_zero_check,
- reserved_hpa_bits(),
+ __reset_rsvds_bits_mask(shadow_zero_check, reserved_hpa_bits(),
context->shadow_root_level, uses_nx,
- guest_cpuid_has(vcpu, X86_FEATURE_GBPAGES),
- is_pse(vcpu), true);
+ guest_can_use_gbpages(vcpu), is_pse, is_amd);
if (!shadow_me_mask)
return;
@@ -4193,7 +4285,6 @@ reset_shadow_zero_bits_mask(struct kvm_vcpu *vcpu, struct kvm_mmu *context)
}
}
-EXPORT_SYMBOL_GPL(reset_shadow_zero_bits_mask);
static inline bool boot_cpu_is_amd(void)
{
@@ -4215,11 +4306,10 @@ reset_tdp_shadow_zero_bits_mask(struct kvm_vcpu *vcpu,
shadow_zero_check = &context->shadow_zero_check;
if (boot_cpu_is_amd())
- __reset_rsvds_bits_mask(vcpu, shadow_zero_check,
- reserved_hpa_bits(),
+ __reset_rsvds_bits_mask(shadow_zero_check, reserved_hpa_bits(),
context->shadow_root_level, false,
boot_cpu_has(X86_FEATURE_GBPAGES),
- true, true);
+ false, true);
else
__reset_rsvds_bits_mask_ept(shadow_zero_check,
reserved_hpa_bits(), false);
@@ -4255,8 +4345,7 @@ reset_ept_shadow_zero_bits_mask(struct kvm_vcpu *vcpu,
(7 & (access) ? 128 : 0))
-static void update_permission_bitmask(struct kvm_vcpu *vcpu,
- struct kvm_mmu *mmu, bool ept)
+static void update_permission_bitmask(struct kvm_mmu *mmu, bool ept)
{
unsigned byte;
@@ -4264,9 +4353,10 @@ static void update_permission_bitmask(struct kvm_vcpu *vcpu,
const u8 w = BYTE_MASK(ACC_WRITE_MASK);
const u8 u = BYTE_MASK(ACC_USER_MASK);
- bool cr4_smep = kvm_read_cr4_bits(vcpu, X86_CR4_SMEP) != 0;
- bool cr4_smap = kvm_read_cr4_bits(vcpu, X86_CR4_SMAP) != 0;
- bool cr0_wp = is_write_protection(vcpu);
+ bool cr4_smep = is_cr4_smep(mmu);
+ bool cr4_smap = is_cr4_smap(mmu);
+ bool cr0_wp = is_cr0_wp(mmu);
+ bool efer_nx = is_efer_nx(mmu);
for (byte = 0; byte < ARRAY_SIZE(mmu->permissions); ++byte) {
unsigned pfec = byte << 1;
@@ -4292,7 +4382,7 @@ static void update_permission_bitmask(struct kvm_vcpu *vcpu,
u8 kf = (pfec & PFERR_USER_MASK) ? 0 : u;
/* Not really needed: !nx will cause pte.nx to fault */
- if (!mmu->nx)
+ if (!efer_nx)
ff = 0;
/* Allow supervisor writes if !cr0.wp */
@@ -4351,24 +4441,17 @@ static void update_permission_bitmask(struct kvm_vcpu *vcpu,
* away both AD and WD. For all reads or if the last condition holds, WD
* only will be masked away.
*/
-static void update_pkru_bitmask(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu,
- bool ept)
+static void update_pkru_bitmask(struct kvm_mmu *mmu)
{
unsigned bit;
bool wp;
- if (ept) {
- mmu->pkru_mask = 0;
- return;
- }
-
- /* PKEY is enabled only if CR4.PKE and EFER.LMA are both set. */
- if (!kvm_read_cr4_bits(vcpu, X86_CR4_PKE) || !is_long_mode(vcpu)) {
+ if (!is_cr4_pke(mmu)) {
mmu->pkru_mask = 0;
return;
}
- wp = is_write_protection(vcpu);
+ wp = is_cr0_wp(mmu);
for (bit = 0; bit < ARRAY_SIZE(mmu->permissions); ++bit) {
unsigned pfec, pkey_bits;
@@ -4402,81 +4485,51 @@ static void update_pkru_bitmask(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu,
}
}
-static void update_last_nonleaf_level(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu)
+static void reset_guest_paging_metadata(struct kvm_vcpu *vcpu,
+ struct kvm_mmu *mmu)
{
- unsigned root_level = mmu->root_level;
+ if (!is_cr0_pg(mmu))
+ return;
- mmu->last_nonleaf_level = root_level;
- if (root_level == PT32_ROOT_LEVEL && is_pse(vcpu))
- mmu->last_nonleaf_level++;
+ reset_rsvds_bits_mask(vcpu, mmu);
+ update_permission_bitmask(mmu, false);
+ update_pkru_bitmask(mmu);
}
-static void paging64_init_context_common(struct kvm_vcpu *vcpu,
- struct kvm_mmu *context,
- int level)
+static void paging64_init_context(struct kvm_mmu *context)
{
- context->nx = is_nx(vcpu);
- context->root_level = level;
-
- reset_rsvds_bits_mask(vcpu, context);
- update_permission_bitmask(vcpu, context, false);
- update_pkru_bitmask(vcpu, context, false);
- update_last_nonleaf_level(vcpu, context);
-
- MMU_WARN_ON(!is_pae(vcpu));
context->page_fault = paging64_page_fault;
context->gva_to_gpa = paging64_gva_to_gpa;
context->sync_page = paging64_sync_page;
context->invlpg = paging64_invlpg;
- context->shadow_root_level = level;
context->direct_map = false;
}
-static void paging64_init_context(struct kvm_vcpu *vcpu,
- struct kvm_mmu *context)
+static void paging32_init_context(struct kvm_mmu *context)
{
- int root_level = is_la57_mode(vcpu) ?
- PT64_ROOT_5LEVEL : PT64_ROOT_4LEVEL;
-
- paging64_init_context_common(vcpu, context, root_level);
-}
-
-static void paging32_init_context(struct kvm_vcpu *vcpu,
- struct kvm_mmu *context)
-{
- context->nx = false;
- context->root_level = PT32_ROOT_LEVEL;
-
- reset_rsvds_bits_mask(vcpu, context);
- update_permission_bitmask(vcpu, context, false);
- update_pkru_bitmask(vcpu, context, false);
- update_last_nonleaf_level(vcpu, context);
-
context->page_fault = paging32_page_fault;
context->gva_to_gpa = paging32_gva_to_gpa;
context->sync_page = paging32_sync_page;
context->invlpg = paging32_invlpg;
- context->shadow_root_level = PT32E_ROOT_LEVEL;
context->direct_map = false;
}
-static void paging32E_init_context(struct kvm_vcpu *vcpu,
- struct kvm_mmu *context)
-{
- paging64_init_context_common(vcpu, context, PT32E_ROOT_LEVEL);
-}
-
-static union kvm_mmu_extended_role kvm_calc_mmu_role_ext(struct kvm_vcpu *vcpu)
+static union kvm_mmu_extended_role kvm_calc_mmu_role_ext(struct kvm_vcpu *vcpu,
+ struct kvm_mmu_role_regs *regs)
{
union kvm_mmu_extended_role ext = {0};
- ext.cr0_pg = !!is_paging(vcpu);
- ext.cr4_pae = !!is_pae(vcpu);
- ext.cr4_smep = !!kvm_read_cr4_bits(vcpu, X86_CR4_SMEP);
- ext.cr4_smap = !!kvm_read_cr4_bits(vcpu, X86_CR4_SMAP);
- ext.cr4_pse = !!is_pse(vcpu);
- ext.cr4_pke = !!kvm_read_cr4_bits(vcpu, X86_CR4_PKE);
- ext.maxphyaddr = cpuid_maxphyaddr(vcpu);
+ if (____is_cr0_pg(regs)) {
+ ext.cr0_pg = 1;
+ ext.cr4_pae = ____is_cr4_pae(regs);
+ ext.cr4_smep = ____is_cr4_smep(regs);
+ ext.cr4_smap = ____is_cr4_smap(regs);
+ ext.cr4_pse = ____is_cr4_pse(regs);
+
+ /* PKEY and LA57 are active iff long mode is active. */
+ ext.cr4_pke = ____is_efer_lma(regs) && ____is_cr4_pke(regs);
+ ext.cr4_la57 = ____is_efer_lma(regs) && ____is_cr4_la57(regs);
+ }
ext.valid = 1;
@@ -4484,20 +4537,23 @@ static union kvm_mmu_extended_role kvm_calc_mmu_role_ext(struct kvm_vcpu *vcpu)
}
static union kvm_mmu_role kvm_calc_mmu_role_common(struct kvm_vcpu *vcpu,
+ struct kvm_mmu_role_regs *regs,
bool base_only)
{
union kvm_mmu_role role = {0};
role.base.access = ACC_ALL;
- role.base.nxe = !!is_nx(vcpu);
- role.base.cr0_wp = is_write_protection(vcpu);
+ if (____is_cr0_pg(regs)) {
+ role.base.efer_nx = ____is_efer_nx(regs);
+ role.base.cr0_wp = ____is_cr0_wp(regs);
+ }
role.base.smm = is_smm(vcpu);
role.base.guest_mode = is_guest_mode(vcpu);
if (base_only)
return role;
- role.ext = kvm_calc_mmu_role_ext(vcpu);
+ role.ext = kvm_calc_mmu_role_ext(vcpu, regs);
return role;
}
@@ -4512,9 +4568,10 @@ static inline int kvm_mmu_get_tdp_level(struct kvm_vcpu *vcpu)
}
static union kvm_mmu_role
-kvm_calc_tdp_mmu_root_page_role(struct kvm_vcpu *vcpu, bool base_only)
+kvm_calc_tdp_mmu_root_page_role(struct kvm_vcpu *vcpu,
+ struct kvm_mmu_role_regs *regs, bool base_only)
{
- union kvm_mmu_role role = kvm_calc_mmu_role_common(vcpu, base_only);
+ union kvm_mmu_role role = kvm_calc_mmu_role_common(vcpu, regs, base_only);
role.base.ad_disabled = (shadow_accessed_mask == 0);
role.base.level = kvm_mmu_get_tdp_level(vcpu);
@@ -4527,8 +4584,9 @@ kvm_calc_tdp_mmu_root_page_role(struct kvm_vcpu *vcpu, bool base_only)
static void init_kvm_tdp_mmu(struct kvm_vcpu *vcpu)
{
struct kvm_mmu *context = &vcpu->arch.root_mmu;
+ struct kvm_mmu_role_regs regs = vcpu_to_role_regs(vcpu);
union kvm_mmu_role new_role =
- kvm_calc_tdp_mmu_root_page_role(vcpu, false);
+ kvm_calc_tdp_mmu_root_page_role(vcpu, &regs, false);
if (new_role.as_u64 == context->mmu_role.as_u64)
return;
@@ -4542,60 +4600,44 @@ static void init_kvm_tdp_mmu(struct kvm_vcpu *vcpu)
context->get_guest_pgd = get_cr3;
context->get_pdptr = kvm_pdptr_read;
context->inject_page_fault = kvm_inject_page_fault;
+ context->root_level = role_regs_to_root_level(&regs);
- if (!is_paging(vcpu)) {
- context->nx = false;
+ if (!is_cr0_pg(context))
context->gva_to_gpa = nonpaging_gva_to_gpa;
- context->root_level = 0;
- } else if (is_long_mode(vcpu)) {
- context->nx = is_nx(vcpu);
- context->root_level = is_la57_mode(vcpu) ?
- PT64_ROOT_5LEVEL : PT64_ROOT_4LEVEL;
- reset_rsvds_bits_mask(vcpu, context);
- context->gva_to_gpa = paging64_gva_to_gpa;
- } else if (is_pae(vcpu)) {
- context->nx = is_nx(vcpu);
- context->root_level = PT32E_ROOT_LEVEL;
- reset_rsvds_bits_mask(vcpu, context);
+ else if (is_cr4_pae(context))
context->gva_to_gpa = paging64_gva_to_gpa;
- } else {
- context->nx = false;
- context->root_level = PT32_ROOT_LEVEL;
- reset_rsvds_bits_mask(vcpu, context);
+ else
context->gva_to_gpa = paging32_gva_to_gpa;
- }
- update_permission_bitmask(vcpu, context, false);
- update_pkru_bitmask(vcpu, context, false);
- update_last_nonleaf_level(vcpu, context);
+ reset_guest_paging_metadata(vcpu, context);
reset_tdp_shadow_zero_bits_mask(vcpu, context);
}
static union kvm_mmu_role
-kvm_calc_shadow_root_page_role_common(struct kvm_vcpu *vcpu, bool base_only)
+kvm_calc_shadow_root_page_role_common(struct kvm_vcpu *vcpu,
+ struct kvm_mmu_role_regs *regs, bool base_only)
{
- union kvm_mmu_role role = kvm_calc_mmu_role_common(vcpu, base_only);
+ union kvm_mmu_role role = kvm_calc_mmu_role_common(vcpu, regs, base_only);
- role.base.smep_andnot_wp = role.ext.cr4_smep &&
- !is_write_protection(vcpu);
- role.base.smap_andnot_wp = role.ext.cr4_smap &&
- !is_write_protection(vcpu);
- role.base.gpte_is_8_bytes = !!is_pae(vcpu);
+ role.base.smep_andnot_wp = role.ext.cr4_smep && !____is_cr0_wp(regs);
+ role.base.smap_andnot_wp = role.ext.cr4_smap && !____is_cr0_wp(regs);
+ role.base.gpte_is_8_bytes = ____is_cr0_pg(regs) && ____is_cr4_pae(regs);
return role;
}
static union kvm_mmu_role
-kvm_calc_shadow_mmu_root_page_role(struct kvm_vcpu *vcpu, bool base_only)
+kvm_calc_shadow_mmu_root_page_role(struct kvm_vcpu *vcpu,
+ struct kvm_mmu_role_regs *regs, bool base_only)
{
union kvm_mmu_role role =
- kvm_calc_shadow_root_page_role_common(vcpu, base_only);
+ kvm_calc_shadow_root_page_role_common(vcpu, regs, base_only);
- role.base.direct = !is_paging(vcpu);
+ role.base.direct = !____is_cr0_pg(regs);
- if (!is_long_mode(vcpu))
+ if (!____is_efer_lma(regs))
role.base.level = PT32E_ROOT_LEVEL;
- else if (is_la57_mode(vcpu))
+ else if (____is_cr4_la57(regs))
role.base.level = PT64_ROOT_5LEVEL;
else
role.base.level = PT64_ROOT_4LEVEL;
@@ -4604,37 +4646,44 @@ kvm_calc_shadow_mmu_root_page_role(struct kvm_vcpu *vcpu, bool base_only)
}
static void shadow_mmu_init_context(struct kvm_vcpu *vcpu, struct kvm_mmu *context,
- u32 cr0, u32 cr4, u32 efer,
+ struct kvm_mmu_role_regs *regs,
union kvm_mmu_role new_role)
{
- if (!(cr0 & X86_CR0_PG))
- nonpaging_init_context(vcpu, context);
- else if (efer & EFER_LMA)
- paging64_init_context(vcpu, context);
- else if (cr4 & X86_CR4_PAE)
- paging32E_init_context(vcpu, context);
- else
- paging32_init_context(vcpu, context);
+ if (new_role.as_u64 == context->mmu_role.as_u64)
+ return;
context->mmu_role.as_u64 = new_role.as_u64;
+
+ if (!is_cr0_pg(context))
+ nonpaging_init_context(context);
+ else if (is_cr4_pae(context))
+ paging64_init_context(context);
+ else
+ paging32_init_context(context);
+ context->root_level = role_regs_to_root_level(regs);
+
+ reset_guest_paging_metadata(vcpu, context);
+ context->shadow_root_level = new_role.base.level;
+
reset_shadow_zero_bits_mask(vcpu, context);
}
-static void kvm_init_shadow_mmu(struct kvm_vcpu *vcpu, u32 cr0, u32 cr4, u32 efer)
+static void kvm_init_shadow_mmu(struct kvm_vcpu *vcpu,
+ struct kvm_mmu_role_regs *regs)
{
struct kvm_mmu *context = &vcpu->arch.root_mmu;
union kvm_mmu_role new_role =
- kvm_calc_shadow_mmu_root_page_role(vcpu, false);
+ kvm_calc_shadow_mmu_root_page_role(vcpu, regs, false);
- if (new_role.as_u64 != context->mmu_role.as_u64)
- shadow_mmu_init_context(vcpu, context, cr0, cr4, efer, new_role);
+ shadow_mmu_init_context(vcpu, context, regs, new_role);
}
static union kvm_mmu_role
-kvm_calc_shadow_npt_root_page_role(struct kvm_vcpu *vcpu)
+kvm_calc_shadow_npt_root_page_role(struct kvm_vcpu *vcpu,
+ struct kvm_mmu_role_regs *regs)
{
union kvm_mmu_role role =
- kvm_calc_shadow_root_page_role_common(vcpu, false);
+ kvm_calc_shadow_root_page_role_common(vcpu, regs, false);
role.base.direct = false;
role.base.level = kvm_mmu_get_tdp_level(vcpu);
@@ -4642,23 +4691,22 @@ kvm_calc_shadow_npt_root_page_role(struct kvm_vcpu *vcpu)
return role;
}
-void kvm_init_shadow_npt_mmu(struct kvm_vcpu *vcpu, u32 cr0, u32 cr4, u32 efer,
- gpa_t nested_cr3)
+void kvm_init_shadow_npt_mmu(struct kvm_vcpu *vcpu, unsigned long cr0,
+ unsigned long cr4, u64 efer, gpa_t nested_cr3)
{
struct kvm_mmu *context = &vcpu->arch.guest_mmu;
- union kvm_mmu_role new_role = kvm_calc_shadow_npt_root_page_role(vcpu);
+ struct kvm_mmu_role_regs regs = {
+ .cr0 = cr0,
+ .cr4 = cr4,
+ .efer = efer,
+ };
+ union kvm_mmu_role new_role;
- __kvm_mmu_new_pgd(vcpu, nested_cr3, new_role.base, false, false);
+ new_role = kvm_calc_shadow_npt_root_page_role(vcpu, &regs);
- if (new_role.as_u64 != context->mmu_role.as_u64) {
- shadow_mmu_init_context(vcpu, context, cr0, cr4, efer, new_role);
+ __kvm_mmu_new_pgd(vcpu, nested_cr3, new_role.base);
- /*
- * Override the level set by the common init helper, nested TDP
- * always uses the host's TDP configuration.
- */
- context->shadow_root_level = new_role.base.level;
- }
+ shadow_mmu_init_context(vcpu, context, &regs, new_role);
}
EXPORT_SYMBOL_GPL(kvm_init_shadow_npt_mmu);
@@ -4678,15 +4726,10 @@ kvm_calc_shadow_ept_root_page_role(struct kvm_vcpu *vcpu, bool accessed_dirty,
role.base.guest_mode = true;
role.base.access = ACC_ALL;
- /*
- * WP=1 and NOT_WP=1 is an impossible combination, use WP and the
- * SMAP variation to denote shadow EPT entries.
- */
- role.base.cr0_wp = true;
- role.base.smap_andnot_wp = true;
-
- role.ext = kvm_calc_mmu_role_ext(vcpu);
+ /* EPT, and thus nested EPT, does not consume CR0, CR4, nor EFER. */
+ role.ext.word = 0;
role.ext.execonly = execonly;
+ role.ext.valid = 1;
return role;
}
@@ -4700,14 +4743,15 @@ void kvm_init_shadow_ept_mmu(struct kvm_vcpu *vcpu, bool execonly,
kvm_calc_shadow_ept_root_page_role(vcpu, accessed_dirty,
execonly, level);
- __kvm_mmu_new_pgd(vcpu, new_eptp, new_role.base, true, true);
+ __kvm_mmu_new_pgd(vcpu, new_eptp, new_role.base);
if (new_role.as_u64 == context->mmu_role.as_u64)
return;
+ context->mmu_role.as_u64 = new_role.as_u64;
+
context->shadow_root_level = level;
- context->nx = true;
context->ept_ad = accessed_dirty;
context->page_fault = ept_page_fault;
context->gva_to_gpa = ept_gva_to_gpa;
@@ -4715,11 +4759,9 @@ void kvm_init_shadow_ept_mmu(struct kvm_vcpu *vcpu, bool execonly,
context->invlpg = ept_invlpg;
context->root_level = level;
context->direct_map = false;
- context->mmu_role.as_u64 = new_role.as_u64;
- update_permission_bitmask(vcpu, context, true);
- update_pkru_bitmask(vcpu, context, true);
- update_last_nonleaf_level(vcpu, context);
+ update_permission_bitmask(context, true);
+ update_pkru_bitmask(context);
reset_rsvds_bits_mask_ept(vcpu, context, execonly);
reset_ept_shadow_zero_bits_mask(vcpu, context, execonly);
}
@@ -4728,20 +4770,36 @@ EXPORT_SYMBOL_GPL(kvm_init_shadow_ept_mmu);
static void init_kvm_softmmu(struct kvm_vcpu *vcpu)
{
struct kvm_mmu *context = &vcpu->arch.root_mmu;
+ struct kvm_mmu_role_regs regs = vcpu_to_role_regs(vcpu);
- kvm_init_shadow_mmu(vcpu,
- kvm_read_cr0_bits(vcpu, X86_CR0_PG),
- kvm_read_cr4_bits(vcpu, X86_CR4_PAE),
- vcpu->arch.efer);
+ kvm_init_shadow_mmu(vcpu, &regs);
context->get_guest_pgd = get_cr3;
context->get_pdptr = kvm_pdptr_read;
context->inject_page_fault = kvm_inject_page_fault;
}
+static union kvm_mmu_role
+kvm_calc_nested_mmu_role(struct kvm_vcpu *vcpu, struct kvm_mmu_role_regs *regs)
+{
+ union kvm_mmu_role role;
+
+ role = kvm_calc_shadow_root_page_role_common(vcpu, regs, false);
+
+ /*
+ * Nested MMUs are used only for walking L2's gva->gpa, they never have
+ * shadow pages of their own and so "direct" has no meaning. Set it
+ * to "true" to try to detect bogus usage of the nested MMU.
+ */
+ role.base.direct = true;
+ role.base.level = role_regs_to_root_level(regs);
+ return role;
+}
+
static void init_kvm_nested_mmu(struct kvm_vcpu *vcpu)
{
- union kvm_mmu_role new_role = kvm_calc_mmu_role_common(vcpu, false);
+ struct kvm_mmu_role_regs regs = vcpu_to_role_regs(vcpu);
+ union kvm_mmu_role new_role = kvm_calc_nested_mmu_role(vcpu, &regs);
struct kvm_mmu *g_context = &vcpu->arch.nested_mmu;
if (new_role.as_u64 == g_context->mmu_role.as_u64)
@@ -4751,6 +4809,7 @@ static void init_kvm_nested_mmu(struct kvm_vcpu *vcpu)
g_context->get_guest_pgd = get_cr3;
g_context->get_pdptr = kvm_pdptr_read;
g_context->inject_page_fault = kvm_inject_page_fault;
+ g_context->root_level = new_role.base.level;
/*
* L2 page tables are never shadowed, so there is no need to sync
@@ -4766,44 +4825,20 @@ static void init_kvm_nested_mmu(struct kvm_vcpu *vcpu)
* nested page tables as the second level of translation. Basically
* the gva_to_gpa functions between mmu and nested_mmu are swapped.
*/
- if (!is_paging(vcpu)) {
- g_context->nx = false;
- g_context->root_level = 0;
+ if (!is_paging(vcpu))
g_context->gva_to_gpa = nonpaging_gva_to_gpa_nested;
- } else if (is_long_mode(vcpu)) {
- g_context->nx = is_nx(vcpu);
- g_context->root_level = is_la57_mode(vcpu) ?
- PT64_ROOT_5LEVEL : PT64_ROOT_4LEVEL;
- reset_rsvds_bits_mask(vcpu, g_context);
+ else if (is_long_mode(vcpu))
g_context->gva_to_gpa = paging64_gva_to_gpa_nested;
- } else if (is_pae(vcpu)) {
- g_context->nx = is_nx(vcpu);
- g_context->root_level = PT32E_ROOT_LEVEL;
- reset_rsvds_bits_mask(vcpu, g_context);
+ else if (is_pae(vcpu))
g_context->gva_to_gpa = paging64_gva_to_gpa_nested;
- } else {
- g_context->nx = false;
- g_context->root_level = PT32_ROOT_LEVEL;
- reset_rsvds_bits_mask(vcpu, g_context);
+ else
g_context->gva_to_gpa = paging32_gva_to_gpa_nested;
- }
- update_permission_bitmask(vcpu, g_context, false);
- update_pkru_bitmask(vcpu, g_context, false);
- update_last_nonleaf_level(vcpu, g_context);
+ reset_guest_paging_metadata(vcpu, g_context);
}
-void kvm_init_mmu(struct kvm_vcpu *vcpu, bool reset_roots)
+void kvm_init_mmu(struct kvm_vcpu *vcpu)
{
- if (reset_roots) {
- uint i;
-
- vcpu->arch.mmu->root_hpa = INVALID_PAGE;
-
- for (i = 0; i < KVM_MMU_NUM_PREV_ROOTS; i++)
- vcpu->arch.mmu->prev_roots[i] = KVM_MMU_ROOT_INFO_INVALID;
- }
-
if (mmu_is_nested(vcpu))
init_kvm_nested_mmu(vcpu);
else if (tdp_enabled)
@@ -4816,20 +4851,53 @@ EXPORT_SYMBOL_GPL(kvm_init_mmu);
static union kvm_mmu_page_role
kvm_mmu_calc_root_page_role(struct kvm_vcpu *vcpu)
{
+ struct kvm_mmu_role_regs regs = vcpu_to_role_regs(vcpu);
union kvm_mmu_role role;
if (tdp_enabled)
- role = kvm_calc_tdp_mmu_root_page_role(vcpu, true);
+ role = kvm_calc_tdp_mmu_root_page_role(vcpu, &regs, true);
else
- role = kvm_calc_shadow_mmu_root_page_role(vcpu, true);
+ role = kvm_calc_shadow_mmu_root_page_role(vcpu, &regs, true);
return role.base;
}
+void kvm_mmu_after_set_cpuid(struct kvm_vcpu *vcpu)
+{
+ /*
+ * Invalidate all MMU roles to force them to reinitialize as CPUID
+ * information is factored into reserved bit calculations.
+ */
+ vcpu->arch.root_mmu.mmu_role.ext.valid = 0;
+ vcpu->arch.guest_mmu.mmu_role.ext.valid = 0;
+ vcpu->arch.nested_mmu.mmu_role.ext.valid = 0;
+ kvm_mmu_reset_context(vcpu);
+
+ /*
+ * KVM does not correctly handle changing guest CPUID after KVM_RUN, as
+ * MAXPHYADDR, GBPAGES support, AMD reserved bit behavior, etc.. aren't
+ * tracked in kvm_mmu_page_role. As a result, KVM may miss guest page
+ * faults due to reusing SPs/SPTEs. Alert userspace, but otherwise
+ * sweep the problem under the rug.
+ *
+ * KVM's horrific CPUID ABI makes the problem all but impossible to
+ * solve, as correctly handling multiple vCPU models (with respect to
+ * paging and physical address properties) in a single VM would require
+ * tracking all relevant CPUID information in kvm_mmu_page_role. That
+ * is very undesirable as it would double the memory requirements for
+ * gfn_track (see struct kvm_mmu_page_role comments), and in practice
+ * no sane VMM mucks with the core vCPU model on the fly.
+ */
+ if (vcpu->arch.last_vmentry_cpu != -1) {
+ pr_warn_ratelimited("KVM: KVM_SET_CPUID{,2} after KVM_RUN may cause guest instability\n");
+ pr_warn_ratelimited("KVM: KVM_SET_CPUID{,2} will fail after KVM_RUN starting with Linux 5.16\n");
+ }
+}
+
void kvm_mmu_reset_context(struct kvm_vcpu *vcpu)
{
kvm_mmu_unload(vcpu);
- kvm_init_mmu(vcpu, true);
+ kvm_init_mmu(vcpu);
}
EXPORT_SYMBOL_GPL(kvm_mmu_reset_context);
@@ -5261,7 +5329,7 @@ static void free_mmu_pages(struct kvm_mmu *mmu)
if (!tdp_enabled && mmu->pae_root)
set_memory_encrypted((unsigned long)mmu->pae_root, 1);
free_page((unsigned long)mmu->pae_root);
- free_page((unsigned long)mmu->lm_root);
+ free_page((unsigned long)mmu->pml4_root);
}
static int __kvm_mmu_create(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu)
@@ -5467,7 +5535,13 @@ void kvm_mmu_init_vm(struct kvm *kvm)
{
struct kvm_page_track_notifier_node *node = &kvm->arch.mmu_sp_tracker;
- kvm_mmu_init_tdp_mmu(kvm);
+ if (!kvm_mmu_init_tdp_mmu(kvm))
+ /*
+ * No smp_load/store wrappers needed here as we are in
+ * VM init and there cannot be any memslots / other threads
+ * accessing this struct kvm yet.
+ */
+ kvm->arch.memslots_have_rmaps = true;
node->track_write = kvm_mmu_pte_write;
node->track_flush_slot = kvm_mmu_invalidate_zap_pages_in_memslot;
@@ -5490,29 +5564,29 @@ void kvm_zap_gfn_range(struct kvm *kvm, gfn_t gfn_start, gfn_t gfn_end)
int i;
bool flush = false;
- write_lock(&kvm->mmu_lock);
- for (i = 0; i < KVM_ADDRESS_SPACE_NUM; i++) {
- slots = __kvm_memslots(kvm, i);
- kvm_for_each_memslot(memslot, slots) {
- gfn_t start, end;
-
- start = max(gfn_start, memslot->base_gfn);
- end = min(gfn_end, memslot->base_gfn + memslot->npages);
- if (start >= end)
- continue;
+ if (kvm_memslots_have_rmaps(kvm)) {
+ write_lock(&kvm->mmu_lock);
+ for (i = 0; i < KVM_ADDRESS_SPACE_NUM; i++) {
+ slots = __kvm_memslots(kvm, i);
+ kvm_for_each_memslot(memslot, slots) {
+ gfn_t start, end;
+
+ start = max(gfn_start, memslot->base_gfn);
+ end = min(gfn_end, memslot->base_gfn + memslot->npages);
+ if (start >= end)
+ continue;
- flush = slot_handle_level_range(kvm, memslot, kvm_zap_rmapp,
- PG_LEVEL_4K,
- KVM_MAX_HUGEPAGE_LEVEL,
- start, end - 1, true, flush);
+ flush = slot_handle_level_range(kvm, memslot,
+ kvm_zap_rmapp, PG_LEVEL_4K,
+ KVM_MAX_HUGEPAGE_LEVEL, start,
+ end - 1, true, flush);
+ }
}
+ if (flush)
+ kvm_flush_remote_tlbs_with_address(kvm, gfn_start, gfn_end);
+ write_unlock(&kvm->mmu_lock);
}
- if (flush)
- kvm_flush_remote_tlbs_with_address(kvm, gfn_start, gfn_end);
-
- write_unlock(&kvm->mmu_lock);
-
if (is_tdp_mmu_enabled(kvm)) {
flush = false;
@@ -5539,12 +5613,15 @@ void kvm_mmu_slot_remove_write_access(struct kvm *kvm,
struct kvm_memory_slot *memslot,
int start_level)
{
- bool flush;
+ bool flush = false;
- write_lock(&kvm->mmu_lock);
- flush = slot_handle_level(kvm, memslot, slot_rmap_write_protect,
- start_level, KVM_MAX_HUGEPAGE_LEVEL, false);
- write_unlock(&kvm->mmu_lock);
+ if (kvm_memslots_have_rmaps(kvm)) {
+ write_lock(&kvm->mmu_lock);
+ flush = slot_handle_level(kvm, memslot, slot_rmap_write_protect,
+ start_level, KVM_MAX_HUGEPAGE_LEVEL,
+ false);
+ write_unlock(&kvm->mmu_lock);
+ }
if (is_tdp_mmu_enabled(kvm)) {
read_lock(&kvm->mmu_lock);
@@ -5612,18 +5689,17 @@ void kvm_mmu_zap_collapsible_sptes(struct kvm *kvm,
{
/* FIXME: const-ify all uses of struct kvm_memory_slot. */
struct kvm_memory_slot *slot = (struct kvm_memory_slot *)memslot;
- bool flush;
-
- write_lock(&kvm->mmu_lock);
- flush = slot_handle_leaf(kvm, slot, kvm_mmu_zap_collapsible_spte, true);
+ bool flush = false;
- if (flush)
- kvm_arch_flush_remote_tlbs_memslot(kvm, slot);
- write_unlock(&kvm->mmu_lock);
+ if (kvm_memslots_have_rmaps(kvm)) {
+ write_lock(&kvm->mmu_lock);
+ flush = slot_handle_leaf(kvm, slot, kvm_mmu_zap_collapsible_spte, true);
+ if (flush)
+ kvm_arch_flush_remote_tlbs_memslot(kvm, slot);
+ write_unlock(&kvm->mmu_lock);
+ }
if (is_tdp_mmu_enabled(kvm)) {
- flush = false;
-
read_lock(&kvm->mmu_lock);
flush = kvm_tdp_mmu_zap_collapsible_sptes(kvm, slot, flush);
if (flush)
@@ -5650,11 +5726,14 @@ void kvm_arch_flush_remote_tlbs_memslot(struct kvm *kvm,
void kvm_mmu_slot_leaf_clear_dirty(struct kvm *kvm,
struct kvm_memory_slot *memslot)
{
- bool flush;
+ bool flush = false;
- write_lock(&kvm->mmu_lock);
- flush = slot_handle_leaf(kvm, memslot, __rmap_clear_dirty, false);
- write_unlock(&kvm->mmu_lock);
+ if (kvm_memslots_have_rmaps(kvm)) {
+ write_lock(&kvm->mmu_lock);
+ flush = slot_handle_leaf(kvm, memslot, __rmap_clear_dirty,
+ false);
+ write_unlock(&kvm->mmu_lock);
+ }
if (is_tdp_mmu_enabled(kvm)) {
read_lock(&kvm->mmu_lock);
@@ -5957,6 +6036,7 @@ static int set_nx_huge_pages_recovery_ratio(const char *val, const struct kernel
static void kvm_recover_nx_lpages(struct kvm *kvm)
{
+ unsigned long nx_lpage_splits = kvm->stat.nx_lpage_splits;
int rcu_idx;
struct kvm_mmu_page *sp;
unsigned int ratio;
@@ -5968,7 +6048,7 @@ static void kvm_recover_nx_lpages(struct kvm *kvm)
write_lock(&kvm->mmu_lock);
ratio = READ_ONCE(nx_huge_pages_recovery_ratio);
- to_zap = ratio ? DIV_ROUND_UP(kvm->stat.nx_lpage_splits, ratio) : 0;
+ to_zap = ratio ? DIV_ROUND_UP(nx_lpage_splits, ratio) : 0;
for ( ; to_zap; --to_zap) {
if (list_empty(&kvm->arch.lpage_disallowed_mmu_pages))
break;
diff --git a/arch/x86/kvm/mmu/mmu_internal.h b/arch/x86/kvm/mmu/mmu_internal.h
index d64ccb417c60..35567293c1fd 100644
--- a/arch/x86/kvm/mmu/mmu_internal.h
+++ b/arch/x86/kvm/mmu/mmu_internal.h
@@ -116,14 +116,19 @@ static inline bool kvm_vcpu_ad_need_write_protect(struct kvm_vcpu *vcpu)
kvm_x86_ops.cpu_dirty_log_size;
}
-bool is_nx_huge_page_enabled(void);
-bool mmu_need_write_protect(struct kvm_vcpu *vcpu, gfn_t gfn,
- bool can_unsync);
+extern int nx_huge_pages;
+static inline bool is_nx_huge_page_enabled(void)
+{
+ return READ_ONCE(nx_huge_pages);
+}
+
+int mmu_try_to_unsync_pages(struct kvm_vcpu *vcpu, gfn_t gfn, bool can_unsync);
void kvm_mmu_gfn_disallow_lpage(struct kvm_memory_slot *slot, gfn_t gfn);
void kvm_mmu_gfn_allow_lpage(struct kvm_memory_slot *slot, gfn_t gfn);
bool kvm_mmu_slot_gfn_write_protect(struct kvm *kvm,
- struct kvm_memory_slot *slot, u64 gfn);
+ struct kvm_memory_slot *slot, u64 gfn,
+ int min_level);
void kvm_flush_remote_tlbs_with_address(struct kvm *kvm,
u64 start_gfn, u64 pages);
@@ -158,8 +163,6 @@ int kvm_mmu_hugepage_adjust(struct kvm_vcpu *vcpu, gfn_t gfn,
void disallowed_hugepage_adjust(u64 spte, gfn_t gfn, int cur_level,
kvm_pfn_t *pfnp, int *goal_levelp);
-bool is_nx_huge_page_enabled(void);
-
void *mmu_memory_cache_alloc(struct kvm_mmu_memory_cache *mc);
void account_huge_nx_page(struct kvm *kvm, struct kvm_mmu_page *sp);
diff --git a/arch/x86/kvm/mmu/mmutrace.h b/arch/x86/kvm/mmu/mmutrace.h
index e798489b56b5..efbad33a0645 100644
--- a/arch/x86/kvm/mmu/mmutrace.h
+++ b/arch/x86/kvm/mmu/mmutrace.h
@@ -40,7 +40,7 @@
role.direct ? " direct" : "", \
access_str[role.access], \
role.invalid ? " invalid" : "", \
- role.nxe ? "" : "!", \
+ role.efer_nx ? "" : "!", \
role.ad_disabled ? "!" : "", \
__entry->root_count, \
__entry->unsync ? "unsync" : "sync", 0); \
diff --git a/arch/x86/kvm/mmu/page_track.c b/arch/x86/kvm/mmu/page_track.c
index 34bb0ec69bd8..91a9f7e0fd91 100644
--- a/arch/x86/kvm/mmu/page_track.c
+++ b/arch/x86/kvm/mmu/page_track.c
@@ -100,7 +100,7 @@ void kvm_slot_page_track_add_page(struct kvm *kvm,
kvm_mmu_gfn_disallow_lpage(slot, gfn);
if (mode == KVM_PAGE_TRACK_WRITE)
- if (kvm_mmu_slot_gfn_write_protect(kvm, slot, gfn))
+ if (kvm_mmu_slot_gfn_write_protect(kvm, slot, gfn, PG_LEVEL_4K))
kvm_flush_remote_tlbs(kvm);
}
EXPORT_SYMBOL_GPL(kvm_slot_page_track_add_page);
diff --git a/arch/x86/kvm/mmu/paging_tmpl.h b/arch/x86/kvm/mmu/paging_tmpl.h
index 70b7e44e3035..490a028ddabe 100644
--- a/arch/x86/kvm/mmu/paging_tmpl.h
+++ b/arch/x86/kvm/mmu/paging_tmpl.h
@@ -90,8 +90,8 @@ struct guest_walker {
gpa_t pte_gpa[PT_MAX_FULL_LEVELS];
pt_element_t __user *ptep_user[PT_MAX_FULL_LEVELS];
bool pte_writable[PT_MAX_FULL_LEVELS];
- unsigned pt_access;
- unsigned pte_access;
+ unsigned int pt_access[PT_MAX_FULL_LEVELS];
+ unsigned int pte_access;
gfn_t gfn;
struct x86_exception fault;
};
@@ -305,6 +305,35 @@ static inline unsigned FNAME(gpte_pkeys)(struct kvm_vcpu *vcpu, u64 gpte)
return pkeys;
}
+static inline bool FNAME(is_last_gpte)(struct kvm_mmu *mmu,
+ unsigned int level, unsigned int gpte)
+{
+ /*
+ * For EPT and PAE paging (both variants), bit 7 is either reserved at
+ * all level or indicates a huge page (ignoring CR3/EPTP). In either
+ * case, bit 7 being set terminates the walk.
+ */
+#if PTTYPE == 32
+ /*
+ * 32-bit paging requires special handling because bit 7 is ignored if
+ * CR4.PSE=0, not reserved. Clear bit 7 in the gpte if the level is
+ * greater than the last level for which bit 7 is the PAGE_SIZE bit.
+ *
+ * The RHS has bit 7 set iff level < (2 + PSE). If it is clear, bit 7
+ * is not reserved and does not indicate a large page at this level,
+ * so clear PT_PAGE_SIZE_MASK in gpte if that is the case.
+ */
+ gpte &= level - (PT32_ROOT_LEVEL + mmu->mmu_role.ext.cr4_pse);
+#endif
+ /*
+ * PG_LEVEL_4K always terminates. The RHS has bit 7 set
+ * iff level <= PG_LEVEL_4K, which for our purpose means
+ * level == PG_LEVEL_4K; set PT_PAGE_SIZE_MASK in gpte then.
+ */
+ gpte |= level - PG_LEVEL_4K - 1;
+
+ return gpte & PT_PAGE_SIZE_MASK;
+}
/*
* Fetch a guest pte for a guest virtual address, or for an L2's GPA.
*/
@@ -418,13 +447,15 @@ retry_walk:
}
walker->ptes[walker->level - 1] = pte;
- } while (!is_last_gpte(mmu, walker->level, pte));
+
+ /* Convert to ACC_*_MASK flags for struct guest_walker. */
+ walker->pt_access[walker->level - 1] = FNAME(gpte_access)(pt_access ^ walk_nx_mask);
+ } while (!FNAME(is_last_gpte)(mmu, walker->level, pte));
pte_pkey = FNAME(gpte_pkeys)(vcpu, pte);
accessed_dirty = have_ad ? pte_access & PT_GUEST_ACCESSED_MASK : 0;
/* Convert to ACC_*_MASK flags for struct guest_walker. */
- walker->pt_access = FNAME(gpte_access)(pt_access ^ walk_nx_mask);
walker->pte_access = FNAME(gpte_access)(pte_access ^ walk_nx_mask);
errcode = permission_fault(vcpu, mmu, walker->pte_access, pte_pkey, access);
if (unlikely(errcode))
@@ -463,13 +494,13 @@ retry_walk:
}
pgprintk("%s: pte %llx pte_access %x pt_access %x\n",
- __func__, (u64)pte, walker->pte_access, walker->pt_access);
+ __func__, (u64)pte, walker->pte_access,
+ walker->pt_access[walker->level - 1]);
return 1;
error:
errcode |= write_fault | user_fault;
- if (fetch_fault && (mmu->nx ||
- kvm_read_cr4_bits(vcpu, X86_CR4_SMEP)))
+ if (fetch_fault && (is_efer_nx(mmu) || is_cr4_smep(mmu)))
errcode |= PFERR_FETCH_MASK;
walker->fault.vector = PF_VECTOR;
@@ -643,7 +674,7 @@ static int FNAME(fetch)(struct kvm_vcpu *vcpu, gpa_t addr,
bool huge_page_disallowed = exec && nx_huge_page_workaround_enabled;
struct kvm_mmu_page *sp = NULL;
struct kvm_shadow_walk_iterator it;
- unsigned direct_access, access = gw->pt_access;
+ unsigned int direct_access, access;
int top_level, level, req_level, ret;
gfn_t base_gfn = gw->gfn;
@@ -675,6 +706,7 @@ static int FNAME(fetch)(struct kvm_vcpu *vcpu, gpa_t addr,
sp = NULL;
if (!is_shadow_present_pte(*it.sptep)) {
table_gfn = gw->table_gfn[it.level - 2];
+ access = gw->pt_access[it.level - 2];
sp = kvm_mmu_get_page(vcpu, table_gfn, addr, it.level-1,
false, access);
}
@@ -763,7 +795,7 @@ FNAME(is_self_change_mapping)(struct kvm_vcpu *vcpu,
bool self_changed = false;
if (!(walker->pte_access & ACC_WRITE_MASK ||
- (!is_write_protection(vcpu) && !user_fault)))
+ (!is_cr0_wp(vcpu->arch.mmu) && !user_fault)))
return false;
for (level = walker->level; level <= walker->max_level; level++) {
@@ -861,8 +893,7 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gpa_t addr, u32 error_code,
* we will cache the incorrect access into mmio spte.
*/
if (write_fault && !(walker.pte_access & ACC_WRITE_MASK) &&
- !is_write_protection(vcpu) && !user_fault &&
- !is_noslot_pfn(pfn)) {
+ !is_cr0_wp(vcpu->arch.mmu) && !user_fault && !is_noslot_pfn(pfn)) {
walker.pte_access |= ACC_WRITE_MASK;
walker.pte_access &= ~ACC_USER_MASK;
@@ -872,7 +903,7 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gpa_t addr, u32 error_code,
* then we should prevent the kernel from executing it
* if SMEP is enabled.
*/
- if (kvm_read_cr4_bits(vcpu, X86_CR4_SMEP))
+ if (is_cr4_smep(vcpu->arch.mmu))
walker.pte_access &= ~ACC_EXEC_MASK;
}
@@ -1027,13 +1058,36 @@ static gpa_t FNAME(gva_to_gpa_nested)(struct kvm_vcpu *vcpu, gpa_t vaddr,
*/
static int FNAME(sync_page)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp)
{
+ union kvm_mmu_page_role mmu_role = vcpu->arch.mmu->mmu_role.base;
int i, nr_present = 0;
bool host_writable;
gpa_t first_pte_gpa;
int set_spte_ret = 0;
- /* direct kvm_mmu_page can not be unsync. */
- BUG_ON(sp->role.direct);
+ /*
+ * Ignore various flags when verifying that it's safe to sync a shadow
+ * page using the current MMU context.
+ *
+ * - level: not part of the overall MMU role and will never match as the MMU's
+ * level tracks the root level
+ * - access: updated based on the new guest PTE
+ * - quadrant: not part of the overall MMU role (similar to level)
+ */
+ const union kvm_mmu_page_role sync_role_ign = {
+ .level = 0xf,
+ .access = 0x7,
+ .quadrant = 0x3,
+ };
+
+ /*
+ * Direct pages can never be unsync, and KVM should never attempt to
+ * sync a shadow page for a different MMU context, e.g. if the role
+ * differs then the memslot lookup (SMM vs. non-SMM) will be bogus, the
+ * reserved bits checks will be wrong, etc...
+ */
+ if (WARN_ON_ONCE(sp->role.direct ||
+ (sp->role.word ^ mmu_role.word) & ~sync_role_ign.word))
+ return 0;
first_pte_gpa = FNAME(get_level1_sp_gpa)(sp);
diff --git a/arch/x86/kvm/mmu/spte.c b/arch/x86/kvm/mmu/spte.c
index 66d43cec0c31..3e97cdb13eb7 100644
--- a/arch/x86/kvm/mmu/spte.c
+++ b/arch/x86/kvm/mmu/spte.c
@@ -103,13 +103,6 @@ int make_spte(struct kvm_vcpu *vcpu, unsigned int pte_access, int level,
spte |= SPTE_TDP_AD_WRPROT_ONLY_MASK;
/*
- * Bits 62:52 of PAE SPTEs are reserved. WARN if said bits are set
- * if PAE paging may be employed (shadow paging or any 32-bit KVM).
- */
- WARN_ON_ONCE((!tdp_enabled || !IS_ENABLED(CONFIG_X86_64)) &&
- (spte & SPTE_TDP_AD_MASK));
-
- /*
* For the EPT case, shadow_present_mask is 0 if hardware
* supports exec-only page table entries. In that case,
* ACC_USER_MASK and shadow_user_mask are used to represent
@@ -154,13 +147,19 @@ int make_spte(struct kvm_vcpu *vcpu, unsigned int pte_access, int level,
/*
* Optimization: for pte sync, if spte was writable the hash
* lookup is unnecessary (and expensive). Write protection
- * is responsibility of mmu_get_page / kvm_sync_page.
+ * is responsibility of kvm_mmu_get_page / kvm_mmu_sync_roots.
* Same reasoning can be applied to dirty page accounting.
*/
if (!can_unsync && is_writable_pte(old_spte))
goto out;
- if (mmu_need_write_protect(vcpu, gfn, can_unsync)) {
+ /*
+ * Unsync shadow pages that are reachable by the new, writable
+ * SPTE. Write-protect the SPTE if the page can't be unsync'd,
+ * e.g. it's write-tracked (upper-level SPs) or has one or more
+ * shadow pages and unsync'ing pages is not allowed.
+ */
+ if (mmu_try_to_unsync_pages(vcpu, gfn, can_unsync)) {
pgprintk("%s: found shadow page for %llx, marking ro\n",
__func__, gfn);
ret |= SET_SPTE_WRITE_PROTECTED_PT;
@@ -176,7 +175,10 @@ int make_spte(struct kvm_vcpu *vcpu, unsigned int pte_access, int level,
spte = mark_spte_for_access_track(spte);
out:
- WARN_ON(is_mmio_spte(spte));
+ WARN_ONCE(is_rsvd_spte(&vcpu->arch.mmu->shadow_zero_check, spte, level),
+ "spte = 0x%llx, level = %d, rsvd bits = 0x%llx", spte, level,
+ get_rsvd_bits(&vcpu->arch.mmu->shadow_zero_check, spte, level));
+
*new_spte = spte;
return ret;
}
diff --git a/arch/x86/kvm/mmu/spte.h b/arch/x86/kvm/mmu/spte.h
index bca0ba11cccf..7a5ce9314107 100644
--- a/arch/x86/kvm/mmu/spte.h
+++ b/arch/x86/kvm/mmu/spte.h
@@ -293,6 +293,38 @@ static inline bool is_dirty_spte(u64 spte)
return dirty_mask ? spte & dirty_mask : spte & PT_WRITABLE_MASK;
}
+static inline u64 get_rsvd_bits(struct rsvd_bits_validate *rsvd_check, u64 pte,
+ int level)
+{
+ int bit7 = (pte >> 7) & 1;
+
+ return rsvd_check->rsvd_bits_mask[bit7][level-1];
+}
+
+static inline bool __is_rsvd_bits_set(struct rsvd_bits_validate *rsvd_check,
+ u64 pte, int level)
+{
+ return pte & get_rsvd_bits(rsvd_check, pte, level);
+}
+
+static inline bool __is_bad_mt_xwr(struct rsvd_bits_validate *rsvd_check,
+ u64 pte)
+{
+ return rsvd_check->bad_mt_xwr & BIT_ULL(pte & 0x3f);
+}
+
+static __always_inline bool is_rsvd_spte(struct rsvd_bits_validate *rsvd_check,
+ u64 spte, int level)
+{
+ /*
+ * Use a bitwise-OR instead of a logical-OR to aggregate the reserved
+ * bits and EPT's invalid memtype/XWR checks to avoid an extra Jcc
+ * (this is extremely unlikely to be short-circuited as true).
+ */
+ return __is_bad_mt_xwr(rsvd_check, spte) |
+ __is_rsvd_bits_set(rsvd_check, spte, level);
+}
+
static inline bool spte_can_locklessly_be_made_writable(u64 spte)
{
return (spte & shadow_host_writable_mask) &&
diff --git a/arch/x86/kvm/mmu/tdp_mmu.c b/arch/x86/kvm/mmu/tdp_mmu.c
index 88f69a6cc492..0853370bd811 100644
--- a/arch/x86/kvm/mmu/tdp_mmu.c
+++ b/arch/x86/kvm/mmu/tdp_mmu.c
@@ -14,10 +14,10 @@ static bool __read_mostly tdp_mmu_enabled = false;
module_param_named(tdp_mmu, tdp_mmu_enabled, bool, 0644);
/* Initializes the TDP MMU for the VM, if enabled. */
-void kvm_mmu_init_tdp_mmu(struct kvm *kvm)
+bool kvm_mmu_init_tdp_mmu(struct kvm *kvm)
{
if (!tdp_enabled || !READ_ONCE(tdp_mmu_enabled))
- return;
+ return false;
/* This should not be changed for the lifetime of the VM. */
kvm->arch.tdp_mmu_enabled = true;
@@ -25,6 +25,8 @@ void kvm_mmu_init_tdp_mmu(struct kvm *kvm)
INIT_LIST_HEAD(&kvm->arch.tdp_mmu_roots);
spin_lock_init(&kvm->arch.tdp_mmu_pages_lock);
INIT_LIST_HEAD(&kvm->arch.tdp_mmu_pages);
+
+ return true;
}
static __always_inline void kvm_lockdep_assert_mmu_lock_held(struct kvm *kvm,
@@ -335,7 +337,7 @@ static void handle_removed_tdp_mmu_page(struct kvm *kvm, tdp_ptep_t pt,
for (i = 0; i < PT64_ENT_PER_PAGE; i++) {
sptep = rcu_dereference(pt) + i;
- gfn = base_gfn + (i * KVM_PAGES_PER_HPAGE(level - 1));
+ gfn = base_gfn + i * KVM_PAGES_PER_HPAGE(level);
if (shared) {
/*
@@ -377,18 +379,18 @@ static void handle_removed_tdp_mmu_page(struct kvm *kvm, tdp_ptep_t pt,
WRITE_ONCE(*sptep, REMOVED_SPTE);
}
handle_changed_spte(kvm, kvm_mmu_page_as_id(sp), gfn,
- old_child_spte, REMOVED_SPTE, level - 1,
+ old_child_spte, REMOVED_SPTE, level,
shared);
}
kvm_flush_remote_tlbs_with_address(kvm, gfn,
- KVM_PAGES_PER_HPAGE(level));
+ KVM_PAGES_PER_HPAGE(level + 1));
call_rcu(&sp->rcu_head, tdp_mmu_free_sp_rcu_callback);
}
/**
- * handle_changed_spte - handle bookkeeping associated with an SPTE change
+ * __handle_changed_spte - handle bookkeeping associated with an SPTE change
* @kvm: kvm instance
* @as_id: the address space of the paging structure the SPTE was a part of
* @gfn: the base GFN that was mapped by the SPTE
@@ -444,6 +446,13 @@ static void __handle_changed_spte(struct kvm *kvm, int as_id, gfn_t gfn,
trace_kvm_tdp_mmu_spte_changed(as_id, gfn, level, old_spte, new_spte);
+ if (is_large_pte(old_spte) != is_large_pte(new_spte)) {
+ if (is_large_pte(old_spte))
+ atomic64_sub(1, (atomic64_t*)&kvm->stat.lpages);
+ else
+ atomic64_add(1, (atomic64_t*)&kvm->stat.lpages);
+ }
+
/*
* The only times a SPTE should be changed from a non-present to
* non-present state is when an MMIO entry is installed/modified/
@@ -905,7 +914,7 @@ static int tdp_mmu_map_handle_target_level(struct kvm_vcpu *vcpu, int write,
kvm_pfn_t pfn, bool prefault)
{
u64 new_spte;
- int ret = 0;
+ int ret = RET_PF_FIXED;
int make_spte_ret = 0;
if (unlikely(is_noslot_pfn(pfn)))
@@ -942,7 +951,11 @@ static int tdp_mmu_map_handle_target_level(struct kvm_vcpu *vcpu, int write,
rcu_dereference(iter->sptep));
}
- if (!prefault)
+ /*
+ * Increase pf_fixed in both RET_PF_EMULATE and RET_PF_FIXED to be
+ * consistent with legacy MMU behavior.
+ */
+ if (ret != RET_PF_SPURIOUS)
vcpu->stat.pf_fixed++;
return ret;
@@ -970,11 +983,6 @@ int kvm_tdp_mmu_map(struct kvm_vcpu *vcpu, gpa_t gpa, u32 error_code,
int level;
int req_level;
- if (WARN_ON(!VALID_PAGE(vcpu->arch.mmu->root_hpa)))
- return RET_PF_RETRY;
- if (WARN_ON(!is_tdp_mmu_root(vcpu->kvm, vcpu->arch.mmu->root_hpa)))
- return RET_PF_RETRY;
-
level = kvm_mmu_hugepage_adjust(vcpu, gfn, max_level, &pfn,
huge_page_disallowed, &req_level);
@@ -1009,7 +1017,15 @@ int kvm_tdp_mmu_map(struct kvm_vcpu *vcpu, gpa_t gpa, u32 error_code,
}
if (!is_shadow_present_pte(iter.old_spte)) {
- sp = alloc_tdp_mmu_page(vcpu, iter.gfn, iter.level);
+ /*
+ * If SPTE has been frozen by another thread, just
+ * give up and retry, avoiding unnecessary page table
+ * allocation and free.
+ */
+ if (is_removed_spte(iter.old_spte))
+ break;
+
+ sp = alloc_tdp_mmu_page(vcpu, iter.gfn, iter.level - 1);
child_pt = sp->spt;
new_spte = make_nonleaf_spte(child_pt,
@@ -1177,9 +1193,9 @@ bool kvm_tdp_mmu_set_spte_gfn(struct kvm *kvm, struct kvm_gfn_range *range)
}
/*
- * Remove write access from all the SPTEs mapping GFNs [start, end). If
- * skip_4k is set, SPTEs that map 4k pages, will not be write-protected.
- * Returns true if an SPTE has been changed and the TLBs need to be flushed.
+ * Remove write access from all SPTEs at or above min_level that map GFNs
+ * [start, end). Returns true if an SPTE has been changed and the TLBs need to
+ * be flushed.
*/
static bool wrprot_gfn_range(struct kvm *kvm, struct kvm_mmu_page *root,
gfn_t start, gfn_t end, int min_level)
@@ -1447,15 +1463,22 @@ bool kvm_tdp_mmu_zap_collapsible_sptes(struct kvm *kvm,
* Returns true if an SPTE was set and a TLB flush is needed.
*/
static bool write_protect_gfn(struct kvm *kvm, struct kvm_mmu_page *root,
- gfn_t gfn)
+ gfn_t gfn, int min_level)
{
struct tdp_iter iter;
u64 new_spte;
bool spte_set = false;
+ BUG_ON(min_level > KVM_MAX_HUGEPAGE_LEVEL);
+
rcu_read_lock();
- tdp_root_for_each_leaf_pte(iter, root, gfn, gfn + 1) {
+ for_each_tdp_pte_min_level(iter, root->spt, root->role.level,
+ min_level, gfn, gfn + 1) {
+ if (!is_shadow_present_pte(iter.old_spte) ||
+ !is_last_spte(iter.old_spte, iter.level))
+ continue;
+
if (!is_writable_pte(iter.old_spte))
break;
@@ -1477,14 +1500,15 @@ static bool write_protect_gfn(struct kvm *kvm, struct kvm_mmu_page *root,
* Returns true if an SPTE was set and a TLB flush is needed.
*/
bool kvm_tdp_mmu_write_protect_gfn(struct kvm *kvm,
- struct kvm_memory_slot *slot, gfn_t gfn)
+ struct kvm_memory_slot *slot, gfn_t gfn,
+ int min_level)
{
struct kvm_mmu_page *root;
bool spte_set = false;
lockdep_assert_held_write(&kvm->mmu_lock);
for_each_tdp_mmu_root(kvm, root, slot->as_id)
- spte_set |= write_protect_gfn(kvm, root, gfn);
+ spte_set |= write_protect_gfn(kvm, root, gfn, min_level);
return spte_set;
}
diff --git a/arch/x86/kvm/mmu/tdp_mmu.h b/arch/x86/kvm/mmu/tdp_mmu.h
index 5fdf63090451..1cae4485b3bc 100644
--- a/arch/x86/kvm/mmu/tdp_mmu.h
+++ b/arch/x86/kvm/mmu/tdp_mmu.h
@@ -31,7 +31,7 @@ static inline bool kvm_tdp_mmu_zap_gfn_range(struct kvm *kvm, int as_id,
}
static inline bool kvm_tdp_mmu_zap_sp(struct kvm *kvm, struct kvm_mmu_page *sp)
{
- gfn_t end = sp->gfn + KVM_PAGES_PER_HPAGE(sp->role.level);
+ gfn_t end = sp->gfn + KVM_PAGES_PER_HPAGE(sp->role.level + 1);
/*
* Don't allow yielding, as the caller may have a flush pending. Note,
@@ -74,37 +74,40 @@ bool kvm_tdp_mmu_zap_collapsible_sptes(struct kvm *kvm,
bool flush);
bool kvm_tdp_mmu_write_protect_gfn(struct kvm *kvm,
- struct kvm_memory_slot *slot, gfn_t gfn);
+ struct kvm_memory_slot *slot, gfn_t gfn,
+ int min_level);
int kvm_tdp_mmu_get_walk(struct kvm_vcpu *vcpu, u64 addr, u64 *sptes,
int *root_level);
#ifdef CONFIG_X86_64
-void kvm_mmu_init_tdp_mmu(struct kvm *kvm);
+bool kvm_mmu_init_tdp_mmu(struct kvm *kvm);
void kvm_mmu_uninit_tdp_mmu(struct kvm *kvm);
static inline bool is_tdp_mmu_enabled(struct kvm *kvm) { return kvm->arch.tdp_mmu_enabled; }
static inline bool is_tdp_mmu_page(struct kvm_mmu_page *sp) { return sp->tdp_mmu_page; }
-#else
-static inline void kvm_mmu_init_tdp_mmu(struct kvm *kvm) {}
-static inline void kvm_mmu_uninit_tdp_mmu(struct kvm *kvm) {}
-static inline bool is_tdp_mmu_enabled(struct kvm *kvm) { return false; }
-static inline bool is_tdp_mmu_page(struct kvm_mmu_page *sp) { return false; }
-#endif
-static inline bool is_tdp_mmu_root(struct kvm *kvm, hpa_t hpa)
+static inline bool is_tdp_mmu(struct kvm_mmu *mmu)
{
struct kvm_mmu_page *sp;
+ hpa_t hpa = mmu->root_hpa;
- if (!is_tdp_mmu_enabled(kvm))
- return false;
if (WARN_ON(!VALID_PAGE(hpa)))
return false;
+ /*
+ * A NULL shadow page is legal when shadowing a non-paging guest with
+ * PAE paging, as the MMU will be direct with root_hpa pointing at the
+ * pae_root page, not a shadow page.
+ */
sp = to_shadow_page(hpa);
- if (WARN_ON(!sp))
- return false;
-
- return is_tdp_mmu_page(sp) && sp->root_count;
+ return sp && is_tdp_mmu_page(sp) && sp->root_count;
}
+#else
+static inline bool kvm_mmu_init_tdp_mmu(struct kvm *kvm) { return false; }
+static inline void kvm_mmu_uninit_tdp_mmu(struct kvm *kvm) {}
+static inline bool is_tdp_mmu_enabled(struct kvm *kvm) { return false; }
+static inline bool is_tdp_mmu_page(struct kvm_mmu_page *sp) { return false; }
+static inline bool is_tdp_mmu(struct kvm_mmu *mmu) { return false; }
+#endif
#endif /* __KVM_X86_MMU_TDP_MMU_H */