aboutsummaryrefslogtreecommitdiffstatshomepage
path: root/arch/arm64/kvm/mmu.c
diff options
context:
space:
mode:
Diffstat (limited to 'arch/arm64/kvm/mmu.c')
-rw-r--r--arch/arm64/kvm/mmu.c775
1 files changed, 546 insertions, 229 deletions
diff --git a/arch/arm64/kvm/mmu.c b/arch/arm64/kvm/mmu.c
index f5651a05b6a8..18680771cdb0 100644
--- a/arch/arm64/kvm/mmu.c
+++ b/arch/arm64/kvm/mmu.c
@@ -25,12 +25,26 @@
static struct kvm_pgtable *hyp_pgtable;
static DEFINE_MUTEX(kvm_hyp_pgd_mutex);
-static unsigned long hyp_idmap_start;
-static unsigned long hyp_idmap_end;
-static phys_addr_t hyp_idmap_vector;
+static unsigned long __ro_after_init hyp_idmap_start;
+static unsigned long __ro_after_init hyp_idmap_end;
+static phys_addr_t __ro_after_init hyp_idmap_vector;
-static unsigned long io_map_base;
+static unsigned long __ro_after_init io_map_base;
+static phys_addr_t __stage2_range_addr_end(phys_addr_t addr, phys_addr_t end,
+ phys_addr_t size)
+{
+ phys_addr_t boundary = ALIGN_DOWN(addr + size, size);
+
+ return (boundary - 1 < end - 1) ? boundary : end;
+}
+
+static phys_addr_t stage2_range_addr_end(phys_addr_t addr, phys_addr_t end)
+{
+ phys_addr_t size = kvm_granule_size(KVM_PGTABLE_MIN_BLOCK_LEVEL);
+
+ return __stage2_range_addr_end(addr, end, size);
+}
/*
* Release kvm_mmu_lock periodically if the memory region is large. Otherwise,
@@ -39,20 +53,21 @@ static unsigned long io_map_base;
* long will also starve other vCPUs. We have to also make sure that the page
* tables are not freed while we released the lock.
*/
-static int stage2_apply_range(struct kvm *kvm, phys_addr_t addr,
+static int stage2_apply_range(struct kvm_s2_mmu *mmu, phys_addr_t addr,
phys_addr_t end,
int (*fn)(struct kvm_pgtable *, u64, u64),
bool resched)
{
+ struct kvm *kvm = kvm_s2_mmu_to_kvm(mmu);
int ret;
u64 next;
do {
- struct kvm_pgtable *pgt = kvm->arch.mmu.pgt;
+ struct kvm_pgtable *pgt = mmu->pgt;
if (!pgt)
return -EINVAL;
- next = stage2_pgd_addr_end(kvm, addr, end);
+ next = stage2_range_addr_end(addr, end);
ret = fn(pgt, addr, next - addr);
if (ret)
break;
@@ -64,8 +79,81 @@ static int stage2_apply_range(struct kvm *kvm, phys_addr_t addr,
return ret;
}
-#define stage2_apply_range_resched(kvm, addr, end, fn) \
- stage2_apply_range(kvm, addr, end, fn, true)
+#define stage2_apply_range_resched(mmu, addr, end, fn) \
+ stage2_apply_range(mmu, addr, end, fn, true)
+
+/*
+ * Get the maximum number of page-tables pages needed to split a range
+ * of blocks into PAGE_SIZE PTEs. It assumes the range is already
+ * mapped at level 2, or at level 1 if allowed.
+ */
+static int kvm_mmu_split_nr_page_tables(u64 range)
+{
+ int n = 0;
+
+ if (KVM_PGTABLE_MIN_BLOCK_LEVEL < 2)
+ n += DIV_ROUND_UP(range, PUD_SIZE);
+ n += DIV_ROUND_UP(range, PMD_SIZE);
+ return n;
+}
+
+static bool need_split_memcache_topup_or_resched(struct kvm *kvm)
+{
+ struct kvm_mmu_memory_cache *cache;
+ u64 chunk_size, min;
+
+ if (need_resched() || rwlock_needbreak(&kvm->mmu_lock))
+ return true;
+
+ chunk_size = kvm->arch.mmu.split_page_chunk_size;
+ min = kvm_mmu_split_nr_page_tables(chunk_size);
+ cache = &kvm->arch.mmu.split_page_cache;
+ return kvm_mmu_memory_cache_nr_free_objects(cache) < min;
+}
+
+static int kvm_mmu_split_huge_pages(struct kvm *kvm, phys_addr_t addr,
+ phys_addr_t end)
+{
+ struct kvm_mmu_memory_cache *cache;
+ struct kvm_pgtable *pgt;
+ int ret, cache_capacity;
+ u64 next, chunk_size;
+
+ lockdep_assert_held_write(&kvm->mmu_lock);
+
+ chunk_size = kvm->arch.mmu.split_page_chunk_size;
+ cache_capacity = kvm_mmu_split_nr_page_tables(chunk_size);
+
+ if (chunk_size == 0)
+ return 0;
+
+ cache = &kvm->arch.mmu.split_page_cache;
+
+ do {
+ if (need_split_memcache_topup_or_resched(kvm)) {
+ write_unlock(&kvm->mmu_lock);
+ cond_resched();
+ /* Eager page splitting is best-effort. */
+ ret = __kvm_mmu_topup_memory_cache(cache,
+ cache_capacity,
+ cache_capacity);
+ write_lock(&kvm->mmu_lock);
+ if (ret)
+ break;
+ }
+
+ pgt = kvm->arch.mmu.pgt;
+ if (!pgt)
+ return -EINVAL;
+
+ next = __stage2_range_addr_end(addr, end, chunk_size);
+ ret = kvm_pgtable_stage2_split(pgt, addr, next - addr, cache);
+ if (ret)
+ break;
+ } while (addr = next, addr != end);
+
+ return ret;
+}
static bool memslot_is_logging(struct kvm_memory_slot *memslot)
{
@@ -73,15 +161,23 @@ static bool memslot_is_logging(struct kvm_memory_slot *memslot)
}
/**
- * kvm_flush_remote_tlbs() - flush all VM TLB entries for v7/8
+ * kvm_arch_flush_remote_tlbs() - flush all VM TLB entries for v7/8
* @kvm: pointer to kvm structure.
*
* Interface to HYP function to flush all VM TLB entries
*/
-void kvm_flush_remote_tlbs(struct kvm *kvm)
+int kvm_arch_flush_remote_tlbs(struct kvm *kvm)
{
- ++kvm->stat.generic.remote_tlb_flush_requests;
kvm_call_hyp(__kvm_tlb_flush_vmid, &kvm->arch.mmu);
+ return 0;
+}
+
+int kvm_arch_flush_remote_tlbs_range(struct kvm *kvm,
+ gfn_t gfn, u64 nr_pages)
+{
+ kvm_tlb_flush_vmid_range(&kvm->arch.mmu,
+ gfn << PAGE_SHIFT, nr_pages << PAGE_SHIFT);
+ return 0;
}
static bool kvm_is_device_pfn(unsigned long pfn)
@@ -92,9 +188,13 @@ static bool kvm_is_device_pfn(unsigned long pfn)
static void *stage2_memcache_zalloc_page(void *arg)
{
struct kvm_mmu_memory_cache *mc = arg;
+ void *virt;
/* Allocated with __GFP_ZERO, so no need to zero */
- return kvm_mmu_memory_cache_alloc(mc);
+ virt = kvm_mmu_memory_cache_alloc(mc);
+ if (virt)
+ kvm_account_pgtable_pages(virt, 1);
+ return virt;
}
static void *kvm_host_zalloc_pages_exact(size_t size)
@@ -102,6 +202,40 @@ static void *kvm_host_zalloc_pages_exact(size_t size)
return alloc_pages_exact(size, GFP_KERNEL_ACCOUNT | __GFP_ZERO);
}
+static void *kvm_s2_zalloc_pages_exact(size_t size)
+{
+ void *virt = kvm_host_zalloc_pages_exact(size);
+
+ if (virt)
+ kvm_account_pgtable_pages(virt, (size >> PAGE_SHIFT));
+ return virt;
+}
+
+static void kvm_s2_free_pages_exact(void *virt, size_t size)
+{
+ kvm_account_pgtable_pages(virt, -(size >> PAGE_SHIFT));
+ free_pages_exact(virt, size);
+}
+
+static struct kvm_pgtable_mm_ops kvm_s2_mm_ops;
+
+static void stage2_free_unlinked_table_rcu_cb(struct rcu_head *head)
+{
+ struct page *page = container_of(head, struct page, rcu_head);
+ void *pgtable = page_to_virt(page);
+ s8 level = page_private(page);
+
+ kvm_pgtable_stage2_free_unlinked(&kvm_s2_mm_ops, pgtable, level);
+}
+
+static void stage2_free_unlinked_table(void *addr, s8 level)
+{
+ struct page *page = virt_to_page(addr);
+
+ set_page_private(page, (unsigned long)level);
+ call_rcu(&page->rcu_head, stage2_free_unlinked_table_rcu_cb);
+}
+
static void kvm_host_get_page(void *addr)
{
get_page(virt_to_page(addr));
@@ -112,6 +246,15 @@ static void kvm_host_put_page(void *addr)
put_page(virt_to_page(addr));
}
+static void kvm_s2_put_page(void *addr)
+{
+ struct page *p = virt_to_page(addr);
+ /* Dropping last refcount, the page will be freed */
+ if (page_count(p) == 1)
+ kvm_account_pgtable_pages(addr, -1);
+ put_page(p);
+}
+
static int kvm_host_page_count(void *addr)
{
return page_count(virt_to_page(addr));
@@ -162,7 +305,7 @@ static void invalidate_icache_guest_page(void *va, size_t size)
* does.
*/
/**
- * unmap_stage2_range -- Clear stage2 page table entries to unmap a range
+ * __unmap_stage2_range -- Clear stage2 page table entries to unmap a range
* @mmu: The KVM stage-2 MMU pointer
* @start: The intermediate physical base address of the range to unmap
* @size: The size of the area to unmap
@@ -181,7 +324,7 @@ static void __unmap_stage2_range(struct kvm_s2_mmu *mmu, phys_addr_t start, u64
lockdep_assert_held_write(&kvm->mmu_lock);
WARN_ON(size & ~PAGE_MASK);
- WARN_ON(stage2_apply_range(kvm, start, end, kvm_pgtable_stage2_unmap,
+ WARN_ON(stage2_apply_range(mmu, start, end, kvm_pgtable_stage2_unmap,
may_block));
}
@@ -196,7 +339,7 @@ static void stage2_flush_memslot(struct kvm *kvm,
phys_addr_t addr = memslot->base_gfn << PAGE_SHIFT;
phys_addr_t end = addr + PAGE_SIZE * memslot->npages;
- stage2_apply_range_resched(kvm, addr, end, kvm_pgtable_stage2_flush);
+ stage2_apply_range_resched(&kvm->arch.mmu, addr, end, kvm_pgtable_stage2_flush);
}
/**
@@ -226,7 +369,7 @@ static void stage2_flush_vm(struct kvm *kvm)
/**
* free_hyp_pgds - free Hyp-mode page tables
*/
-void free_hyp_pgds(void)
+void __init free_hyp_pgds(void)
{
mutex_lock(&kvm_hyp_pgd_mutex);
if (hyp_pgtable) {
@@ -457,6 +600,25 @@ int create_hyp_mappings(void *from, void *to, enum kvm_pgtable_prot prot)
return 0;
}
+static int __hyp_alloc_private_va_range(unsigned long base)
+{
+ lockdep_assert_held(&kvm_hyp_pgd_mutex);
+
+ if (!PAGE_ALIGNED(base))
+ return -EINVAL;
+
+ /*
+ * Verify that BIT(VA_BITS - 1) hasn't been flipped by
+ * allocating the new area, as it would indicate we've
+ * overflowed the idmap/IO address range.
+ */
+ if ((base ^ io_map_base) & BIT(VA_BITS - 1))
+ return -ENOMEM;
+
+ io_map_base = base;
+
+ return 0;
+}
/**
* hyp_alloc_private_va_range - Allocates a private VA range.
@@ -477,29 +639,22 @@ int hyp_alloc_private_va_range(size_t size, unsigned long *haddr)
/*
* This assumes that we have enough space below the idmap
- * page to allocate our VAs. If not, the check below will
- * kick. A potential alternative would be to detect that
- * overflow and switch to an allocation above the idmap.
+ * page to allocate our VAs. If not, the check in
+ * __hyp_alloc_private_va_range() will kick. A potential
+ * alternative would be to detect that overflow and switch
+ * to an allocation above the idmap.
*
* The allocated size is always a multiple of PAGE_SIZE.
*/
- base = io_map_base - PAGE_ALIGN(size);
-
- /* Align the allocation based on the order of its size */
- base = ALIGN_DOWN(base, PAGE_SIZE << get_order(size));
-
- /*
- * Verify that BIT(VA_BITS - 1) hasn't been flipped by
- * allocating the new area, as it would indicate we've
- * overflowed the idmap/IO address range.
- */
- if ((base ^ io_map_base) & BIT(VA_BITS - 1))
- ret = -ENOMEM;
- else
- *haddr = io_map_base = base;
+ size = PAGE_ALIGN(size);
+ base = io_map_base - size;
+ ret = __hyp_alloc_private_va_range(base);
mutex_unlock(&kvm_hyp_pgd_mutex);
+ if (!ret)
+ *haddr = base;
+
return ret;
}
@@ -533,6 +688,48 @@ static int __create_hyp_private_mapping(phys_addr_t phys_addr, size_t size,
return ret;
}
+int create_hyp_stack(phys_addr_t phys_addr, unsigned long *haddr)
+{
+ unsigned long base;
+ size_t size;
+ int ret;
+
+ mutex_lock(&kvm_hyp_pgd_mutex);
+ /*
+ * Efficient stack verification using the PAGE_SHIFT bit implies
+ * an alignment of our allocation on the order of the size.
+ */
+ size = PAGE_SIZE * 2;
+ base = ALIGN_DOWN(io_map_base - size, size);
+
+ ret = __hyp_alloc_private_va_range(base);
+
+ mutex_unlock(&kvm_hyp_pgd_mutex);
+
+ if (ret) {
+ kvm_err("Cannot allocate hyp stack guard page\n");
+ return ret;
+ }
+
+ /*
+ * Since the stack grows downwards, map the stack to the page
+ * at the higher address and leave the lower guard page
+ * unbacked.
+ *
+ * Any valid stack address now has the PAGE_SHIFT bit as 1
+ * and addresses corresponding to the guard page have the
+ * PAGE_SHIFT bit as 0 - this is used for overflow detection.
+ */
+ ret = __create_hyp_mappings(base + PAGE_SIZE, PAGE_SIZE, phys_addr,
+ PAGE_HYP);
+ if (ret)
+ kvm_err("Cannot map hyp stack\n");
+
+ *haddr = base + size;
+
+ return ret;
+}
+
/**
* create_hyp_io_mappings - Map IO into both kernel and HYP
* @phys_addr: The physical start address which gets mapped
@@ -605,30 +802,52 @@ static struct kvm_pgtable_mm_ops kvm_user_mm_ops = {
static int get_user_mapping_size(struct kvm *kvm, u64 addr)
{
struct kvm_pgtable pgt = {
- .pgd = (kvm_pte_t *)kvm->mm->pgd,
- .ia_bits = VA_BITS,
- .start_level = (KVM_PGTABLE_MAX_LEVELS -
- CONFIG_PGTABLE_LEVELS),
+ .pgd = (kvm_pteref_t)kvm->mm->pgd,
+ .ia_bits = vabits_actual,
+ .start_level = (KVM_PGTABLE_LAST_LEVEL -
+ ARM64_HW_PGTABLE_LEVELS(pgt.ia_bits) + 1),
.mm_ops = &kvm_user_mm_ops,
};
+ unsigned long flags;
kvm_pte_t pte = 0; /* Keep GCC quiet... */
- u32 level = ~0;
+ s8 level = S8_MAX;
int ret;
+ /*
+ * Disable IRQs so that we hazard against a concurrent
+ * teardown of the userspace page tables (which relies on
+ * IPI-ing threads).
+ */
+ local_irq_save(flags);
ret = kvm_pgtable_get_leaf(&pgt, addr, &pte, &level);
- VM_BUG_ON(ret);
- VM_BUG_ON(level >= KVM_PGTABLE_MAX_LEVELS);
- VM_BUG_ON(!(pte & PTE_VALID));
+ local_irq_restore(flags);
+
+ if (ret)
+ return ret;
+
+ /*
+ * Not seeing an error, but not updating level? Something went
+ * deeply wrong...
+ */
+ if (WARN_ON(level > KVM_PGTABLE_LAST_LEVEL))
+ return -EFAULT;
+ if (WARN_ON(level < KVM_PGTABLE_FIRST_LEVEL))
+ return -EFAULT;
+
+ /* Oops, the userspace PTs are gone... Replay the fault */
+ if (!kvm_pte_valid(pte))
+ return -EAGAIN;
return BIT(ARM64_HW_PGTABLE_LEVEL_SHIFT(level));
}
static struct kvm_pgtable_mm_ops kvm_s2_mm_ops = {
.zalloc_page = stage2_memcache_zalloc_page,
- .zalloc_pages_exact = kvm_host_zalloc_pages_exact,
- .free_pages_exact = free_pages_exact,
+ .zalloc_pages_exact = kvm_s2_zalloc_pages_exact,
+ .free_pages_exact = kvm_s2_free_pages_exact,
+ .free_unlinked_table = stage2_free_unlinked_table,
.get_page = kvm_host_get_page,
- .put_page = kvm_host_put_page,
+ .put_page = kvm_s2_put_page,
.page_count = kvm_host_page_count,
.phys_to_virt = kvm_host_va,
.virt_to_phys = kvm_host_pa,
@@ -640,15 +859,42 @@ static struct kvm_pgtable_mm_ops kvm_s2_mm_ops = {
* kvm_init_stage2_mmu - Initialise a S2 MMU structure
* @kvm: The pointer to the KVM structure
* @mmu: The pointer to the s2 MMU structure
+ * @type: The machine type of the virtual machine
*
* Allocates only the stage-2 HW PGD level table(s).
* Note we don't need locking here as this is only called when the VM is
* created, which can only be done once.
*/
-int kvm_init_stage2_mmu(struct kvm *kvm, struct kvm_s2_mmu *mmu)
+int kvm_init_stage2_mmu(struct kvm *kvm, struct kvm_s2_mmu *mmu, unsigned long type)
{
+ u32 kvm_ipa_limit = get_kvm_ipa_limit();
int cpu, err;
struct kvm_pgtable *pgt;
+ u64 mmfr0, mmfr1;
+ u32 phys_shift;
+
+ if (type & ~KVM_VM_TYPE_ARM_IPA_SIZE_MASK)
+ return -EINVAL;
+
+ phys_shift = KVM_VM_TYPE_ARM_IPA_SIZE(type);
+ if (is_protected_kvm_enabled()) {
+ phys_shift = kvm_ipa_limit;
+ } else if (phys_shift) {
+ if (phys_shift > kvm_ipa_limit ||
+ phys_shift < ARM64_MIN_PARANGE_BITS)
+ return -EINVAL;
+ } else {
+ phys_shift = KVM_PHYS_SHIFT;
+ if (phys_shift > kvm_ipa_limit) {
+ pr_warn_once("%s using unsupported default IPA limit, upgrade your VMM\n",
+ current->comm);
+ return -EINVAL;
+ }
+ }
+
+ mmfr0 = read_sanitised_ftr_reg(SYS_ID_AA64MMFR0_EL1);
+ mmfr1 = read_sanitised_ftr_reg(SYS_ID_AA64MMFR1_EL1);
+ mmu->vtcr = kvm_get_vtcr(mmfr0, mmfr1, phys_shift);
if (mmu->pgt != NULL) {
kvm_err("kvm_arch already initialized?\n");
@@ -673,6 +919,10 @@ int kvm_init_stage2_mmu(struct kvm *kvm, struct kvm_s2_mmu *mmu)
for_each_possible_cpu(cpu)
*per_cpu_ptr(mmu->last_vcpu_ran, cpu) = -1;
+ /* The eager page splitting is disabled by default */
+ mmu->split_page_chunk_size = KVM_ARM_EAGER_SPLIT_CHUNK_SIZE_DEFAULT;
+ mmu->split_page_cache.gfp_zero = __GFP_ZERO;
+
mmu->pgt = pgt;
mmu->pgd_phys = __pa(pgt->pgd);
return 0;
@@ -684,6 +934,12 @@ out_free_pgtable:
return err;
}
+void kvm_uninit_stage2_mmu(struct kvm *kvm)
+{
+ kvm_free_stage2_pgd(&kvm->arch.mmu);
+ kvm_mmu_free_memory_cache(&kvm->arch.mmu.split_page_cache);
+}
+
static void stage2_unmap_memslot(struct kvm *kvm,
struct kvm_memory_slot *memslot)
{
@@ -772,6 +1028,32 @@ void kvm_free_stage2_pgd(struct kvm_s2_mmu *mmu)
}
}
+static void hyp_mc_free_fn(void *addr, void *unused)
+{
+ free_page((unsigned long)addr);
+}
+
+static void *hyp_mc_alloc_fn(void *unused)
+{
+ return (void *)__get_free_page(GFP_KERNEL_ACCOUNT);
+}
+
+void free_hyp_memcache(struct kvm_hyp_memcache *mc)
+{
+ if (is_protected_kvm_enabled())
+ __free_hyp_memcache(mc, hyp_mc_free_fn,
+ kvm_host_va, NULL);
+}
+
+int topup_hyp_memcache(struct kvm_hyp_memcache *mc, unsigned long min_pages)
+{
+ if (!is_protected_kvm_enabled())
+ return 0;
+
+ return __topup_hyp_memcache(mc, min_pages, hyp_mc_alloc_fn,
+ kvm_host_pa, NULL);
+}
+
/**
* kvm_phys_addr_ioremap - map a device range to guest IPA
*
@@ -786,8 +1068,9 @@ int kvm_phys_addr_ioremap(struct kvm *kvm, phys_addr_t guest_ipa,
{
phys_addr_t addr;
int ret = 0;
- struct kvm_mmu_memory_cache cache = { 0, __GFP_ZERO, NULL, };
- struct kvm_pgtable *pgt = kvm->arch.mmu.pgt;
+ struct kvm_mmu_memory_cache cache = { .gfp_zero = __GFP_ZERO };
+ struct kvm_s2_mmu *mmu = &kvm->arch.mmu;
+ struct kvm_pgtable *pgt = mmu->pgt;
enum kvm_pgtable_prot prot = KVM_PGTABLE_PROT_DEVICE |
KVM_PGTABLE_PROT_R |
(writable ? KVM_PGTABLE_PROT_W : 0);
@@ -800,13 +1083,13 @@ int kvm_phys_addr_ioremap(struct kvm *kvm, phys_addr_t guest_ipa,
for (addr = guest_ipa; addr < guest_ipa + size; addr += PAGE_SIZE) {
ret = kvm_mmu_topup_memory_cache(&cache,
- kvm_mmu_cache_min_pages(kvm));
+ kvm_mmu_cache_min_pages(mmu));
if (ret)
break;
write_lock(&kvm->mmu_lock);
ret = kvm_pgtable_stage2_map(pgt, addr, PAGE_SIZE, pa, prot,
- &cache);
+ &cache, 0);
write_unlock(&kvm->mmu_lock);
if (ret)
break;
@@ -826,8 +1109,7 @@ int kvm_phys_addr_ioremap(struct kvm *kvm, phys_addr_t guest_ipa,
*/
static void stage2_wp_range(struct kvm_s2_mmu *mmu, phys_addr_t addr, phys_addr_t end)
{
- struct kvm *kvm = kvm_s2_mmu_to_kvm(mmu);
- stage2_apply_range_resched(kvm, addr, end, kvm_pgtable_stage2_wrprotect);
+ stage2_apply_range_resched(mmu, addr, end, kvm_pgtable_stage2_wrprotect);
}
/**
@@ -858,43 +1140,70 @@ static void kvm_mmu_wp_memory_region(struct kvm *kvm, int slot)
write_lock(&kvm->mmu_lock);
stage2_wp_range(&kvm->arch.mmu, start, end);
write_unlock(&kvm->mmu_lock);
- kvm_flush_remote_tlbs(kvm);
+ kvm_flush_remote_tlbs_memslot(kvm, memslot);
}
/**
- * kvm_mmu_write_protect_pt_masked() - write protect dirty pages
+ * kvm_mmu_split_memory_region() - split the stage 2 blocks into PAGE_SIZE
+ * pages for memory slot
* @kvm: The KVM pointer
- * @slot: The memory slot associated with mask
- * @gfn_offset: The gfn offset in memory slot
- * @mask: The mask of dirty pages at offset 'gfn_offset' in this memory
- * slot to be write protected
+ * @slot: The memory slot to split
*
- * Walks bits set in mask write protects the associated pte's. Caller must
- * acquire kvm_mmu_lock.
+ * Acquires kvm->mmu_lock. Called with kvm->slots_lock mutex acquired,
+ * serializing operations for VM memory regions.
*/
-static void kvm_mmu_write_protect_pt_masked(struct kvm *kvm,
- struct kvm_memory_slot *slot,
- gfn_t gfn_offset, unsigned long mask)
+static void kvm_mmu_split_memory_region(struct kvm *kvm, int slot)
{
- phys_addr_t base_gfn = slot->base_gfn + gfn_offset;
- phys_addr_t start = (base_gfn + __ffs(mask)) << PAGE_SHIFT;
- phys_addr_t end = (base_gfn + __fls(mask) + 1) << PAGE_SHIFT;
+ struct kvm_memslots *slots;
+ struct kvm_memory_slot *memslot;
+ phys_addr_t start, end;
- stage2_wp_range(&kvm->arch.mmu, start, end);
+ lockdep_assert_held(&kvm->slots_lock);
+
+ slots = kvm_memslots(kvm);
+ memslot = id_to_memslot(slots, slot);
+
+ start = memslot->base_gfn << PAGE_SHIFT;
+ end = (memslot->base_gfn + memslot->npages) << PAGE_SHIFT;
+
+ write_lock(&kvm->mmu_lock);
+ kvm_mmu_split_huge_pages(kvm, start, end);
+ write_unlock(&kvm->mmu_lock);
}
/*
- * kvm_arch_mmu_enable_log_dirty_pt_masked - enable dirty logging for selected
- * dirty pages.
+ * kvm_arch_mmu_enable_log_dirty_pt_masked() - enable dirty logging for selected pages.
+ * @kvm: The KVM pointer
+ * @slot: The memory slot associated with mask
+ * @gfn_offset: The gfn offset in memory slot
+ * @mask: The mask of pages at offset 'gfn_offset' in this memory
+ * slot to enable dirty logging on
*
- * It calls kvm_mmu_write_protect_pt_masked to write protect selected pages to
- * enable dirty logging for them.
+ * Writes protect selected pages to enable dirty logging, and then
+ * splits them to PAGE_SIZE. Caller must acquire kvm->mmu_lock.
*/
void kvm_arch_mmu_enable_log_dirty_pt_masked(struct kvm *kvm,
struct kvm_memory_slot *slot,
gfn_t gfn_offset, unsigned long mask)
{
- kvm_mmu_write_protect_pt_masked(kvm, slot, gfn_offset, mask);
+ phys_addr_t base_gfn = slot->base_gfn + gfn_offset;
+ phys_addr_t start = (base_gfn + __ffs(mask)) << PAGE_SHIFT;
+ phys_addr_t end = (base_gfn + __fls(mask) + 1) << PAGE_SHIFT;
+
+ lockdep_assert_held_write(&kvm->mmu_lock);
+
+ stage2_wp_range(&kvm->arch.mmu, start, end);
+
+ /*
+ * Eager-splitting is done when manual-protect is set. We
+ * also check for initially-all-set because we can avoid
+ * eager-splitting if initially-all-set is false.
+ * Initially-all-set equal false implies that huge-pages were
+ * already split when enabling dirty logging: no need to do it
+ * again.
+ */
+ if (kvm_dirty_log_manual_protect_and_init_set(kvm))
+ kvm_mmu_split_huge_pages(kvm, start, end);
}
static void kvm_send_hwpoison_signal(unsigned long address, short lsb)
@@ -971,7 +1280,7 @@ static bool fault_supports_stage2_huge_mapping(struct kvm_memory_slot *memslot,
*
* Returns the size of the mapping.
*/
-static unsigned long
+static long
transparent_hugepage_adjust(struct kvm *kvm, struct kvm_memory_slot *memslot,
unsigned long hva, kvm_pfn_t *pfnp,
phys_addr_t *ipap)
@@ -983,30 +1292,17 @@ transparent_hugepage_adjust(struct kvm *kvm, struct kvm_memory_slot *memslot,
* sure that the HVA and IPA are sufficiently aligned and that the
* block map is contained within the memslot.
*/
- if (fault_supports_stage2_huge_mapping(memslot, hva, PMD_SIZE) &&
- get_user_mapping_size(kvm, hva) >= PMD_SIZE) {
- /*
- * The address we faulted on is backed by a transparent huge
- * page. However, because we map the compound huge page and
- * not the individual tail page, we need to transfer the
- * refcount to the head page. We have to be careful that the
- * THP doesn't start to split while we are adjusting the
- * refcounts.
- *
- * We are sure this doesn't happen, because mmu_notifier_retry
- * was successful and we are holding the mmu_lock, so if this
- * THP is trying to split, it will be blocked in the mmu
- * notifier before touching any of the pages, specifically
- * before being able to call __split_huge_page_refcount().
- *
- * We can therefore safely transfer the refcount from PG_tail
- * to PG_head and switch the pfn from a tail page to the head
- * page accordingly.
- */
+ if (fault_supports_stage2_huge_mapping(memslot, hva, PMD_SIZE)) {
+ int sz = get_user_mapping_size(kvm, hva);
+
+ if (sz < 0)
+ return sz;
+
+ if (sz < PMD_SIZE)
+ return PAGE_SIZE;
+
*ipap &= PMD_MASK;
- kvm_release_pfn_clean(pfn);
pfn &= ~(PTRS_PER_PMD - 1);
- get_page(pfn_to_page(pfn));
*pfnp = pfn;
return PMD_SIZE;
@@ -1056,43 +1352,36 @@ static int get_vma_page_shift(struct vm_area_struct *vma, unsigned long hva)
* - mmap_lock protects between a VM faulting a page in and the VMM performing
* an mprotect() to add VM_MTE
*/
-static int sanitise_mte_tags(struct kvm *kvm, kvm_pfn_t pfn,
- unsigned long size)
+static void sanitise_mte_tags(struct kvm *kvm, kvm_pfn_t pfn,
+ unsigned long size)
{
unsigned long i, nr_pages = size >> PAGE_SHIFT;
- struct page *page;
+ struct page *page = pfn_to_page(pfn);
if (!kvm_has_mte(kvm))
- return 0;
-
- /*
- * pfn_to_online_page() is used to reject ZONE_DEVICE pages
- * that may not support tags.
- */
- page = pfn_to_online_page(pfn);
-
- if (!page)
- return -EFAULT;
+ return;
for (i = 0; i < nr_pages; i++, page++) {
- if (!test_bit(PG_mte_tagged, &page->flags)) {
+ if (try_page_mte_tagging(page)) {
mte_clear_page_tags(page_address(page));
- set_bit(PG_mte_tagged, &page->flags);
+ set_page_mte_tagged(page);
}
}
+}
- return 0;
+static bool kvm_vma_mte_allowed(struct vm_area_struct *vma)
+{
+ return vma->vm_flags & VM_MTE_ALLOWED;
}
static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa,
struct kvm_memory_slot *memslot, unsigned long hva,
- unsigned long fault_status)
+ bool fault_is_perm)
{
int ret = 0;
bool write_fault, writable, force_pte = false;
- bool exec_fault;
- bool device = false;
- bool shared;
+ bool exec_fault, mte_allowed;
+ bool device = false, vfio_allow_any_uc = false;
unsigned long mmu_seq;
struct kvm *kvm = vcpu->kvm;
struct kvm_mmu_memory_cache *memcache = &vcpu->arch.mmu_page_cache;
@@ -1101,23 +1390,35 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa,
gfn_t gfn;
kvm_pfn_t pfn;
bool logging_active = memslot_is_logging(memslot);
- bool use_read_lock = false;
- unsigned long fault_level = kvm_vcpu_trap_get_fault_level(vcpu);
- unsigned long vma_pagesize, fault_granule;
+ long vma_pagesize, fault_granule;
enum kvm_pgtable_prot prot = KVM_PGTABLE_PROT_R;
struct kvm_pgtable *pgt;
- fault_granule = 1UL << ARM64_HW_PGTABLE_LEVEL_SHIFT(fault_level);
+ if (fault_is_perm)
+ fault_granule = kvm_vcpu_trap_get_perm_fault_granule(vcpu);
write_fault = kvm_is_write_fault(vcpu);
exec_fault = kvm_vcpu_trap_is_exec_fault(vcpu);
VM_BUG_ON(write_fault && exec_fault);
- if (fault_status == FSC_PERM && !write_fault && !exec_fault) {
+ if (fault_is_perm && !write_fault && !exec_fault) {
kvm_err("Unexpected L2 read permission error\n");
return -EFAULT;
}
/*
+ * Permission faults just need to update the existing leaf entry,
+ * and so normally don't require allocations from the memcache. The
+ * only exception to this is when dirty logging is enabled at runtime
+ * and a write fault needs to collapse a block entry into a table.
+ */
+ if (!fault_is_perm || (logging_active && write_fault)) {
+ ret = kvm_mmu_topup_memory_cache(memcache,
+ kvm_mmu_cache_min_pages(vcpu->arch.hw_mmu));
+ if (ret)
+ return ret;
+ }
+
+ /*
* Let's check if we will get back a huge page backed by hugetlbfs, or
* get block mapping for device MMIO region.
*/
@@ -1136,14 +1437,10 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa,
if (logging_active) {
force_pte = true;
vma_shift = PAGE_SHIFT;
- use_read_lock = (fault_status == FSC_PERM && write_fault &&
- fault_granule == PAGE_SIZE);
} else {
vma_shift = get_vma_page_shift(vma, hva);
}
- shared = (vma->vm_flags & VM_SHARED);
-
switch (vma_shift) {
#ifndef __PAGETABLE_PMD_FOLDED
case PUD_SHIFT:
@@ -1173,38 +1470,25 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa,
fault_ipa &= ~(vma_pagesize - 1);
gfn = fault_ipa >> PAGE_SHIFT;
- mmap_read_unlock(current->mm);
+ mte_allowed = kvm_vma_mte_allowed(vma);
- /*
- * Permission faults just need to update the existing leaf entry,
- * and so normally don't require allocations from the memcache. The
- * only exception to this is when dirty logging is enabled at runtime
- * and a write fault needs to collapse a block entry into a table.
- */
- if (fault_status != FSC_PERM || (logging_active && write_fault)) {
- ret = kvm_mmu_topup_memory_cache(memcache,
- kvm_mmu_cache_min_pages(kvm));
- if (ret)
- return ret;
- }
+ vfio_allow_any_uc = vma->vm_flags & VM_ALLOW_ANY_UNCACHED;
+
+ /* Don't use the VMA after the unlock -- it may have vanished */
+ vma = NULL;
- mmu_seq = vcpu->kvm->mmu_notifier_seq;
/*
- * Ensure the read of mmu_notifier_seq happens before we call
- * gfn_to_pfn_prot (which calls get_user_pages), so that we don't risk
- * the page we just got a reference to gets unmapped before we have a
- * chance to grab the mmu_lock, which ensure that if the page gets
- * unmapped afterwards, the call to kvm_unmap_gfn will take it away
- * from us again properly. This smp_rmb() interacts with the smp_wmb()
- * in kvm_mmu_notifier_invalidate_<page|range_end>.
+ * Read mmu_invalidate_seq so that KVM can detect if the results of
+ * vma_lookup() or __gfn_to_pfn_memslot() become stale prior to
+ * acquiring kvm->mmu_lock.
*
- * Besides, __gfn_to_pfn_memslot() instead of gfn_to_pfn_prot() is
- * used to avoid unnecessary overhead introduced to locate the memory
- * slot because it's always fixed even @gfn is adjusted for huge pages.
+ * Rely on mmap_read_unlock() for an implicit smp_rmb(), which pairs
+ * with the smp_wmb() in kvm_mmu_invalidate_end().
*/
- smp_rmb();
+ mmu_seq = vcpu->kvm->mmu_invalidate_seq;
+ mmap_read_unlock(current->mm);
- pfn = __gfn_to_pfn_memslot(memslot, gfn, false, NULL,
+ pfn = __gfn_to_pfn_memslot(memslot, gfn, false, false, NULL,
write_fault, &writable, NULL);
if (pfn == KVM_PFN_ERR_HWPOISON) {
kvm_send_hwpoison_signal(hva, vma_shift);
@@ -1236,17 +1520,9 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa,
if (exec_fault && device)
return -ENOEXEC;
- /*
- * To reduce MMU contentions and enhance concurrency during dirty
- * logging dirty logging, only acquire read lock for permission
- * relaxation.
- */
- if (use_read_lock)
- read_lock(&kvm->mmu_lock);
- else
- write_lock(&kvm->mmu_lock);
+ read_lock(&kvm->mmu_lock);
pgt = vcpu->arch.hw_mmu->pgt;
- if (mmu_notifier_retry(kvm, mmu_seq))
+ if (mmu_invalidate_retry(kvm, mmu_seq))
goto out_unlock;
/*
@@ -1254,22 +1530,27 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa,
* backed by a THP and thus use block mapping if possible.
*/
if (vma_pagesize == PAGE_SIZE && !(force_pte || device)) {
- if (fault_status == FSC_PERM && fault_granule > PAGE_SIZE)
+ if (fault_is_perm && fault_granule > PAGE_SIZE)
vma_pagesize = fault_granule;
else
vma_pagesize = transparent_hugepage_adjust(kvm, memslot,
hva, &pfn,
&fault_ipa);
+
+ if (vma_pagesize < 0) {
+ ret = vma_pagesize;
+ goto out_unlock;
+ }
}
- if (fault_status != FSC_PERM && !device && kvm_has_mte(kvm)) {
- /* Check the VMM hasn't introduced a new VM_SHARED VMA */
- if (!shared)
- ret = sanitise_mte_tags(kvm, pfn, vma_pagesize);
- else
+ if (!fault_is_perm && !device && kvm_has_mte(kvm)) {
+ /* Check the VMM hasn't introduced a new disallowed VMA */
+ if (mte_allowed) {
+ sanitise_mte_tags(kvm, pfn, vma_pagesize);
+ } else {
ret = -EFAULT;
- if (ret)
goto out_unlock;
+ }
}
if (writable)
@@ -1278,25 +1559,28 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa,
if (exec_fault)
prot |= KVM_PGTABLE_PROT_X;
- if (device)
- prot |= KVM_PGTABLE_PROT_DEVICE;
- else if (cpus_have_const_cap(ARM64_HAS_CACHE_DIC))
+ if (device) {
+ if (vfio_allow_any_uc)
+ prot |= KVM_PGTABLE_PROT_NORMAL_NC;
+ else
+ prot |= KVM_PGTABLE_PROT_DEVICE;
+ } else if (cpus_have_final_cap(ARM64_HAS_CACHE_DIC)) {
prot |= KVM_PGTABLE_PROT_X;
+ }
/*
* Under the premise of getting a FSC_PERM fault, we just need to relax
* permissions only if vma_pagesize equals fault_granule. Otherwise,
* kvm_pgtable_stage2_map() should be called to change block size.
*/
- if (fault_status == FSC_PERM && vma_pagesize == fault_granule) {
+ if (fault_is_perm && vma_pagesize == fault_granule)
ret = kvm_pgtable_stage2_relax_perms(pgt, fault_ipa, prot);
- } else {
- WARN_ONCE(use_read_lock, "Attempted stage-2 map outside of write lock\n");
-
+ else
ret = kvm_pgtable_stage2_map(pgt, fault_ipa, vma_pagesize,
__pfn_to_phys(pfn), prot,
- memcache);
- }
+ memcache,
+ KVM_PGTABLE_WALK_HANDLE_FAULT |
+ KVM_PGTABLE_WALK_SHARED);
/* Mark the page dirty only if the fault is handled successfully */
if (writable && !ret) {
@@ -1305,11 +1589,7 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa,
}
out_unlock:
- if (use_read_lock)
- read_unlock(&kvm->mmu_lock);
- else
- write_unlock(&kvm->mmu_lock);
- kvm_set_pfn_accessed(pfn);
+ read_unlock(&kvm->mmu_lock);
kvm_release_pfn_clean(pfn);
return ret != -EAGAIN ? ret : 0;
}
@@ -1317,20 +1597,18 @@ out_unlock:
/* Resolve the access fault by making the page young again. */
static void handle_access_fault(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa)
{
- pte_t pte;
- kvm_pte_t kpte;
+ kvm_pte_t pte;
struct kvm_s2_mmu *mmu;
trace_kvm_access_fault(fault_ipa);
- write_lock(&vcpu->kvm->mmu_lock);
+ read_lock(&vcpu->kvm->mmu_lock);
mmu = vcpu->arch.hw_mmu;
- kpte = kvm_pgtable_stage2_mkyoung(mmu->pgt, fault_ipa);
- write_unlock(&vcpu->kvm->mmu_lock);
+ pte = kvm_pgtable_stage2_mkyoung(mmu->pgt, fault_ipa);
+ read_unlock(&vcpu->kvm->mmu_lock);
- pte = __pte(kpte);
- if (pte_valid(pte))
- kvm_set_pfn_accessed(pte_pfn(pte));
+ if (kvm_pte_valid(pte))
+ kvm_set_pfn_accessed(kvm_pte_to_pfn(pte));
}
/**
@@ -1346,7 +1624,7 @@ static void handle_access_fault(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa)
*/
int kvm_handle_guest_abort(struct kvm_vcpu *vcpu)
{
- unsigned long fault_status;
+ unsigned long esr;
phys_addr_t fault_ipa;
struct kvm_memory_slot *memslot;
unsigned long hva;
@@ -1354,12 +1632,12 @@ int kvm_handle_guest_abort(struct kvm_vcpu *vcpu)
gfn_t gfn;
int ret, idx;
- fault_status = kvm_vcpu_trap_get_fault_type(vcpu);
+ esr = kvm_vcpu_get_esr(vcpu);
fault_ipa = kvm_vcpu_get_fault_ipa(vcpu);
is_iabt = kvm_vcpu_trap_is_iabt(vcpu);
- if (fault_status == FSC_FAULT) {
+ if (esr_fsc_is_permission_fault(esr)) {
/* Beyond sanitised PARange (which is the IPA limit) */
if (fault_ipa >= BIT_ULL(get_kvm_ipa_limit())) {
kvm_inject_size_fault(vcpu);
@@ -1394,8 +1672,9 @@ int kvm_handle_guest_abort(struct kvm_vcpu *vcpu)
kvm_vcpu_get_hfar(vcpu), fault_ipa);
/* Check the stage-2 fault is trans. fault or write fault */
- if (fault_status != FSC_FAULT && fault_status != FSC_PERM &&
- fault_status != FSC_ACCESS) {
+ if (!esr_fsc_is_translation_fault(esr) &&
+ !esr_fsc_is_permission_fault(esr) &&
+ !esr_fsc_is_access_flag_fault(esr)) {
kvm_err("Unsupported FSC: EC=%#x xFSC=%#lx ESR_EL2=%#lx\n",
kvm_vcpu_trap_get_class(vcpu),
(unsigned long)kvm_vcpu_trap_get_fault(vcpu),
@@ -1455,15 +1734,16 @@ int kvm_handle_guest_abort(struct kvm_vcpu *vcpu)
}
/* Userspace should not be able to register out-of-bounds IPAs */
- VM_BUG_ON(fault_ipa >= kvm_phys_size(vcpu->kvm));
+ VM_BUG_ON(fault_ipa >= kvm_phys_size(vcpu->arch.hw_mmu));
- if (fault_status == FSC_ACCESS) {
+ if (esr_fsc_is_access_flag_fault(esr)) {
handle_access_fault(vcpu, fault_ipa);
ret = 1;
goto out_unlock;
}
- ret = user_mem_abort(vcpu, fault_ipa, memslot, hva, fault_status);
+ ret = user_mem_abort(vcpu, fault_ipa, memslot, hva,
+ esr_fsc_is_permission_fault(esr));
if (ret == 0)
ret = 1;
out:
@@ -1490,16 +1770,19 @@ bool kvm_unmap_gfn_range(struct kvm *kvm, struct kvm_gfn_range *range)
bool kvm_set_spte_gfn(struct kvm *kvm, struct kvm_gfn_range *range)
{
- kvm_pfn_t pfn = pte_pfn(range->pte);
- int ret;
+ kvm_pfn_t pfn = pte_pfn(range->arg.pte);
if (!kvm->arch.mmu.pgt)
return false;
WARN_ON(range->end - range->start != 1);
- ret = sanitise_mte_tags(kvm, pfn, PAGE_SIZE);
- if (ret)
+ /*
+ * If the page isn't tagged, defer to user_mem_abort() for sanitising
+ * the MTE tags. The S2 pte should have been unmapped by
+ * mmu_notifier_invalidate_range_end().
+ */
+ if (kvm_has_mte(kvm) && !page_mte_tagged(pfn_to_page(pfn)))
return false;
/*
@@ -1514,7 +1797,7 @@ bool kvm_set_spte_gfn(struct kvm *kvm, struct kvm_gfn_range *range)
*/
kvm_pgtable_stage2_map(kvm->arch.mmu.pgt, range->start << PAGE_SHIFT,
PAGE_SIZE, __pfn_to_phys(pfn),
- KVM_PGTABLE_PROT_R, NULL);
+ KVM_PGTABLE_PROT_R, NULL, 0);
return false;
}
@@ -1522,27 +1805,25 @@ bool kvm_set_spte_gfn(struct kvm *kvm, struct kvm_gfn_range *range)
bool kvm_age_gfn(struct kvm *kvm, struct kvm_gfn_range *range)
{
u64 size = (range->end - range->start) << PAGE_SHIFT;
- kvm_pte_t kpte;
- pte_t pte;
if (!kvm->arch.mmu.pgt)
return false;
- WARN_ON(size != PAGE_SIZE && size != PMD_SIZE && size != PUD_SIZE);
-
- kpte = kvm_pgtable_stage2_mkold(kvm->arch.mmu.pgt,
- range->start << PAGE_SHIFT);
- pte = __pte(kpte);
- return pte_valid(pte) && pte_young(pte);
+ return kvm_pgtable_stage2_test_clear_young(kvm->arch.mmu.pgt,
+ range->start << PAGE_SHIFT,
+ size, true);
}
bool kvm_test_age_gfn(struct kvm *kvm, struct kvm_gfn_range *range)
{
+ u64 size = (range->end - range->start) << PAGE_SHIFT;
+
if (!kvm->arch.mmu.pgt)
return false;
- return kvm_pgtable_stage2_is_young(kvm->arch.mmu.pgt,
- range->start << PAGE_SHIFT);
+ return kvm_pgtable_stage2_test_clear_young(kvm->arch.mmu.pgt,
+ range->start << PAGE_SHIFT,
+ size, false);
}
phys_addr_t kvm_mmu_get_httbr(void)
@@ -1580,9 +1861,11 @@ static struct kvm_pgtable_mm_ops kvm_hyp_mm_ops = {
.virt_to_phys = kvm_host_pa,
};
-int kvm_mmu_init(u32 *hyp_va_bits)
+int __init kvm_mmu_init(u32 *hyp_va_bits)
{
int err;
+ u32 idmap_bits;
+ u32 kernel_bits;
hyp_idmap_start = __pa_symbol(__hyp_idmap_text_start);
hyp_idmap_start = ALIGN_DOWN(hyp_idmap_start, PAGE_SIZE);
@@ -1596,7 +1879,24 @@ int kvm_mmu_init(u32 *hyp_va_bits)
*/
BUG_ON((hyp_idmap_start ^ (hyp_idmap_end - 1)) & PAGE_MASK);
- *hyp_va_bits = 64 - ((idmap_t0sz & TCR_T0SZ_MASK) >> TCR_T0SZ_OFFSET);
+ /*
+ * The ID map is always configured for 48 bits of translation, which
+ * may be fewer than the number of VA bits used by the regular kernel
+ * stage 1, when VA_BITS=52.
+ *
+ * At EL2, there is only one TTBR register, and we can't switch between
+ * translation tables *and* update TCR_EL2.T0SZ at the same time. Bottom
+ * line: we need to use the extended range with *both* our translation
+ * tables.
+ *
+ * So use the maximum of the idmap VA bits and the regular kernel stage
+ * 1 VA bits to assure that the hypervisor can both ID map its code page
+ * and map any kernel memory.
+ */
+ idmap_bits = IDMAP_VA_BITS;
+ kernel_bits = vabits_actual;
+ *hyp_va_bits = max(idmap_bits, kernel_bits);
+
kvm_debug("Using %u-bit virtual addresses at EL2\n", *hyp_va_bits);
kvm_debug("IDMAP page: %lx\n", hyp_idmap_start);
kvm_debug("HYP VA range: %lx:%lx\n",
@@ -1647,20 +1947,42 @@ void kvm_arch_commit_memory_region(struct kvm *kvm,
const struct kvm_memory_slot *new,
enum kvm_mr_change change)
{
+ bool log_dirty_pages = new && new->flags & KVM_MEM_LOG_DIRTY_PAGES;
+
/*
* At this point memslot has been committed and there is an
* allocated dirty_bitmap[], dirty pages will be tracked while the
* memory slot is write protected.
*/
- if (change != KVM_MR_DELETE && new->flags & KVM_MEM_LOG_DIRTY_PAGES) {
+ if (log_dirty_pages) {
+
+ if (change == KVM_MR_DELETE)
+ return;
+
/*
- * If we're with initial-all-set, we don't need to write
- * protect any pages because they're all reported as dirty.
- * Huge pages and normal pages will be write protect gradually.
+ * Huge and normal pages are write-protected and split
+ * on either of these two cases:
+ *
+ * 1. with initial-all-set: gradually with CLEAR ioctls,
*/
- if (!kvm_dirty_log_manual_protect_and_init_set(kvm)) {
- kvm_mmu_wp_memory_region(kvm, new->id);
- }
+ if (kvm_dirty_log_manual_protect_and_init_set(kvm))
+ return;
+ /*
+ * or
+ * 2. without initial-all-set: all in one shot when
+ * enabling dirty logging.
+ */
+ kvm_mmu_wp_memory_region(kvm, new->id);
+ kvm_mmu_split_memory_region(kvm, new->id);
+ } else {
+ /*
+ * Free any leftovers from the eager page splitting cache. Do
+ * this when deleting, moving, disabling dirty logging, or
+ * creating the memslot (a nop). Doing it for deletes makes
+ * sure we don't leak memory, and there's no need to keep the
+ * cache around for any of the other cases.
+ */
+ kvm_mmu_free_memory_cache(&kvm->arch.mmu.split_page_cache);
}
}
@@ -1680,7 +2002,7 @@ int kvm_arch_prepare_memory_region(struct kvm *kvm,
* Prevent userspace from creating a memory region outside of the IPA
* space addressable by the KVM guest IPA space.
*/
- if ((new->base_gfn + new->npages) > (kvm_phys_size(kvm) >> PAGE_SHIFT))
+ if ((new->base_gfn + new->npages) > (kvm_phys_size(&kvm->arch.mmu) >> PAGE_SHIFT))
return -EFAULT;
hva = new->userspace_addr;
@@ -1705,12 +2027,7 @@ int kvm_arch_prepare_memory_region(struct kvm *kvm,
if (!vma)
break;
- /*
- * VM_SHARED mappings are not allowed with MTE to avoid races
- * when updating the PG_mte_tagged page flag, see
- * sanitise_mte_tags for more details.
- */
- if (kvm_has_mte(kvm) && vma->vm_flags & VM_SHARED) {
+ if (kvm_has_mte(kvm) && !kvm_vma_mte_allowed(vma)) {
ret = -EINVAL;
break;
}
@@ -1739,7 +2056,7 @@ void kvm_arch_memslots_updated(struct kvm *kvm, u64 gen)
void kvm_arch_flush_shadow_all(struct kvm *kvm)
{
- kvm_free_stage2_pgd(&kvm->arch.mmu);
+ kvm_uninit_stage2_mmu(kvm);
}
void kvm_arch_flush_shadow_memslot(struct kvm *kvm,