aboutsummaryrefslogtreecommitdiffstats
path: root/arch/powerpc/kvm/book3s_64_mmu_radix.c
diff options
context:
space:
mode:
authorLinus Torvalds <torvalds@linux-foundation.org>2018-04-09 11:42:31 -0700
committerLinus Torvalds <torvalds@linux-foundation.org>2018-04-09 11:42:31 -0700
commitd8312a3f61024352f1c7cb967571fd53631b0d6c (patch)
treebe2f2f699e763330b0f0179e9f86009affbc0c7d /arch/powerpc/kvm/book3s_64_mmu_radix.c
parentFix subtle macro variable shadowing in min_not_zero() (diff)
parentkvm: x86: fix a prototype warning (diff)
downloadlinux-dev-d8312a3f61024352f1c7cb967571fd53631b0d6c.tar.xz
linux-dev-d8312a3f61024352f1c7cb967571fd53631b0d6c.zip
Merge tag 'for-linus' of git://git.kernel.org/pub/scm/virt/kvm/kvm
Pull kvm updates from Paolo Bonzini: "ARM: - VHE optimizations - EL2 address space randomization - speculative execution mitigations ("variant 3a", aka execution past invalid privilege register access) - bugfixes and cleanups PPC: - improvements for the radix page fault handler for HV KVM on POWER9 s390: - more kvm stat counters - virtio gpu plumbing - documentation - facilities improvements x86: - support for VMware magic I/O port and pseudo-PMCs - AMD pause loop exiting - support for AMD core performance extensions - support for synchronous register access - expose nVMX capabilities to userspace - support for Hyper-V signaling via eventfd - use Enlightened VMCS when running on Hyper-V - allow userspace to disable MWAIT/HLT/PAUSE vmexits - usual roundup of optimizations and nested virtualization bugfixes Generic: - API selftest infrastructure (though the only tests are for x86 as of now)" * tag 'for-linus' of git://git.kernel.org/pub/scm/virt/kvm/kvm: (174 commits) kvm: x86: fix a prototype warning kvm: selftests: add sync_regs_test kvm: selftests: add API testing infrastructure kvm: x86: fix a compile warning KVM: X86: Add Force Emulation Prefix for "emulate the next instruction" KVM: X86: Introduce handle_ud() KVM: vmx: unify adjacent #ifdefs x86: kvm: hide the unused 'cpu' variable KVM: VMX: remove bogus WARN_ON in handle_ept_misconfig Revert "KVM: X86: Fix SMRAM accessing even if VM is shutdown" kvm: Add emulation for movups/movupd KVM: VMX: raise internal error for exception during invalid protected mode state KVM: nVMX: Optimization: Dont set KVM_REQ_EVENT when VMExit with nested_run_pending KVM: nVMX: Require immediate-exit when event reinjected to L2 and L1 event pending KVM: x86: Fix misleading comments on handling pending exceptions KVM: x86: Rename interrupt.pending to interrupt.injected KVM: VMX: No need to clear pending NMI/interrupt on inject realmode interrupt x86/kvm: use Enlightened VMCS when running on Hyper-V x86/hyper-v: detect nested features x86/hyper-v: define struct hv_enlightened_vmcs and clean field bits ...
Diffstat (limited to 'arch/powerpc/kvm/book3s_64_mmu_radix.c')
-rw-r--r--arch/powerpc/kvm/book3s_64_mmu_radix.c333
1 files changed, 208 insertions, 125 deletions
diff --git a/arch/powerpc/kvm/book3s_64_mmu_radix.c b/arch/powerpc/kvm/book3s_64_mmu_radix.c
index 5d9bafe9a371..a57eafec4dc2 100644
--- a/arch/powerpc/kvm/book3s_64_mmu_radix.c
+++ b/arch/powerpc/kvm/book3s_64_mmu_radix.c
@@ -150,7 +150,9 @@ static void kvmppc_radix_tlbie_page(struct kvm *kvm, unsigned long addr,
{
int psize = MMU_BASE_PSIZE;
- if (pshift >= PMD_SHIFT)
+ if (pshift >= PUD_SHIFT)
+ psize = MMU_PAGE_1G;
+ else if (pshift >= PMD_SHIFT)
psize = MMU_PAGE_2M;
addr &= ~0xfffUL;
addr |= mmu_psize_defs[psize].ap << 5;
@@ -163,6 +165,17 @@ static void kvmppc_radix_tlbie_page(struct kvm *kvm, unsigned long addr,
asm volatile("ptesync": : :"memory");
}
+static void kvmppc_radix_flush_pwc(struct kvm *kvm, unsigned long addr)
+{
+ unsigned long rb = 0x2 << PPC_BITLSHIFT(53); /* IS = 2 */
+
+ asm volatile("ptesync": : :"memory");
+ /* RIC=1 PRS=0 R=1 IS=2 */
+ asm volatile(PPC_TLBIE_5(%0, %1, 1, 0, 1)
+ : : "r" (rb), "r" (kvm->arch.lpid) : "memory");
+ asm volatile("ptesync": : :"memory");
+}
+
unsigned long kvmppc_radix_update_pte(struct kvm *kvm, pte_t *ptep,
unsigned long clr, unsigned long set,
unsigned long addr, unsigned int shift)
@@ -223,9 +236,9 @@ static int kvmppc_create_pte(struct kvm *kvm, pte_t pte, unsigned long gpa,
new_pud = pud_alloc_one(kvm->mm, gpa);
pmd = NULL;
- if (pud && pud_present(*pud))
+ if (pud && pud_present(*pud) && !pud_huge(*pud))
pmd = pmd_offset(pud, gpa);
- else
+ else if (level <= 1)
new_pmd = pmd_alloc_one(kvm->mm, gpa);
if (level == 0 && !(pmd && pmd_present(*pmd) && !pmd_is_leaf(*pmd)))
@@ -246,6 +259,50 @@ static int kvmppc_create_pte(struct kvm *kvm, pte_t pte, unsigned long gpa,
new_pud = NULL;
}
pud = pud_offset(pgd, gpa);
+ if (pud_huge(*pud)) {
+ unsigned long hgpa = gpa & PUD_MASK;
+
+ /*
+ * If we raced with another CPU which has just put
+ * a 1GB pte in after we saw a pmd page, try again.
+ */
+ if (level <= 1 && !new_pmd) {
+ ret = -EAGAIN;
+ goto out_unlock;
+ }
+ /* Check if we raced and someone else has set the same thing */
+ if (level == 2 && pud_raw(*pud) == pte_raw(pte)) {
+ ret = 0;
+ goto out_unlock;
+ }
+ /* Valid 1GB page here already, remove it */
+ old = kvmppc_radix_update_pte(kvm, (pte_t *)pud,
+ ~0UL, 0, hgpa, PUD_SHIFT);
+ kvmppc_radix_tlbie_page(kvm, hgpa, PUD_SHIFT);
+ if (old & _PAGE_DIRTY) {
+ unsigned long gfn = hgpa >> PAGE_SHIFT;
+ struct kvm_memory_slot *memslot;
+ memslot = gfn_to_memslot(kvm, gfn);
+ if (memslot && memslot->dirty_bitmap)
+ kvmppc_update_dirty_map(memslot,
+ gfn, PUD_SIZE);
+ }
+ }
+ if (level == 2) {
+ if (!pud_none(*pud)) {
+ /*
+ * There's a page table page here, but we wanted to
+ * install a large page, so remove and free the page
+ * table page. new_pmd will be NULL since level == 2.
+ */
+ new_pmd = pmd_offset(pud, 0);
+ pud_clear(pud);
+ kvmppc_radix_flush_pwc(kvm, gpa);
+ }
+ kvmppc_radix_set_pte_at(kvm, gpa, (pte_t *)pud, pte);
+ ret = 0;
+ goto out_unlock;
+ }
if (pud_none(*pud)) {
if (!new_pmd)
goto out_unlock;
@@ -264,6 +321,11 @@ static int kvmppc_create_pte(struct kvm *kvm, pte_t pte, unsigned long gpa,
ret = -EAGAIN;
goto out_unlock;
}
+ /* Check if we raced and someone else has set the same thing */
+ if (level == 1 && pmd_raw(*pmd) == pte_raw(pte)) {
+ ret = 0;
+ goto out_unlock;
+ }
/* Valid 2MB page here already, remove it */
old = kvmppc_radix_update_pte(kvm, pmdp_ptep(pmd),
~0UL, 0, lgpa, PMD_SHIFT);
@@ -276,35 +338,43 @@ static int kvmppc_create_pte(struct kvm *kvm, pte_t pte, unsigned long gpa,
kvmppc_update_dirty_map(memslot,
gfn, PMD_SIZE);
}
- } else if (level == 1 && !pmd_none(*pmd)) {
- /*
- * There's a page table page here, but we wanted
- * to install a large page. Tell the caller and let
- * it try installing a normal page if it wants.
- */
- ret = -EBUSY;
- goto out_unlock;
}
- if (level == 0) {
- if (pmd_none(*pmd)) {
- if (!new_ptep)
- goto out_unlock;
- pmd_populate(kvm->mm, pmd, new_ptep);
- new_ptep = NULL;
- }
- ptep = pte_offset_kernel(pmd, gpa);
- if (pte_present(*ptep)) {
- /* PTE was previously valid, so invalidate it */
- old = kvmppc_radix_update_pte(kvm, ptep, _PAGE_PRESENT,
- 0, gpa, 0);
- kvmppc_radix_tlbie_page(kvm, gpa, 0);
- if (old & _PAGE_DIRTY)
- mark_page_dirty(kvm, gpa >> PAGE_SHIFT);
+ if (level == 1) {
+ if (!pmd_none(*pmd)) {
+ /*
+ * There's a page table page here, but we wanted to
+ * install a large page, so remove and free the page
+ * table page. new_ptep will be NULL since level == 1.
+ */
+ new_ptep = pte_offset_kernel(pmd, 0);
+ pmd_clear(pmd);
+ kvmppc_radix_flush_pwc(kvm, gpa);
}
- kvmppc_radix_set_pte_at(kvm, gpa, ptep, pte);
- } else {
kvmppc_radix_set_pte_at(kvm, gpa, pmdp_ptep(pmd), pte);
+ ret = 0;
+ goto out_unlock;
}
+ if (pmd_none(*pmd)) {
+ if (!new_ptep)
+ goto out_unlock;
+ pmd_populate(kvm->mm, pmd, new_ptep);
+ new_ptep = NULL;
+ }
+ ptep = pte_offset_kernel(pmd, gpa);
+ if (pte_present(*ptep)) {
+ /* Check if someone else set the same thing */
+ if (pte_raw(*ptep) == pte_raw(pte)) {
+ ret = 0;
+ goto out_unlock;
+ }
+ /* PTE was previously valid, so invalidate it */
+ old = kvmppc_radix_update_pte(kvm, ptep, _PAGE_PRESENT,
+ 0, gpa, 0);
+ kvmppc_radix_tlbie_page(kvm, gpa, 0);
+ if (old & _PAGE_DIRTY)
+ mark_page_dirty(kvm, gpa >> PAGE_SHIFT);
+ }
+ kvmppc_radix_set_pte_at(kvm, gpa, ptep, pte);
ret = 0;
out_unlock:
@@ -325,11 +395,11 @@ int kvmppc_book3s_radix_page_fault(struct kvm_run *run, struct kvm_vcpu *vcpu,
unsigned long mmu_seq, pte_size;
unsigned long gpa, gfn, hva, pfn;
struct kvm_memory_slot *memslot;
- struct page *page = NULL, *pages[1];
- long ret, npages, ok;
- unsigned int writing;
- struct vm_area_struct *vma;
- unsigned long flags;
+ struct page *page = NULL;
+ long ret;
+ bool writing;
+ bool upgrade_write = false;
+ bool *upgrade_p = &upgrade_write;
pte_t pte, *ptep;
unsigned long pgflags;
unsigned int shift, level;
@@ -369,122 +439,131 @@ int kvmppc_book3s_radix_page_fault(struct kvm_run *run, struct kvm_vcpu *vcpu,
dsisr & DSISR_ISSTORE);
}
- /* used to check for invalidations in progress */
- mmu_seq = kvm->mmu_notifier_seq;
- smp_rmb();
-
writing = (dsisr & DSISR_ISSTORE) != 0;
- hva = gfn_to_hva_memslot(memslot, gfn);
+ if (memslot->flags & KVM_MEM_READONLY) {
+ if (writing) {
+ /* give the guest a DSI */
+ dsisr = DSISR_ISSTORE | DSISR_PROTFAULT;
+ kvmppc_core_queue_data_storage(vcpu, ea, dsisr);
+ return RESUME_GUEST;
+ }
+ upgrade_p = NULL;
+ }
+
if (dsisr & DSISR_SET_RC) {
/*
* Need to set an R or C bit in the 2nd-level tables;
- * if the relevant bits aren't already set in the linux
- * page tables, fall through to do the gup_fast to
- * set them in the linux page tables too.
+ * since we are just helping out the hardware here,
+ * it is sufficient to do what the hardware does.
*/
- ok = 0;
pgflags = _PAGE_ACCESSED;
if (writing)
pgflags |= _PAGE_DIRTY;
- local_irq_save(flags);
- ptep = find_current_mm_pte(current->mm->pgd, hva, NULL, NULL);
- if (ptep) {
- pte = READ_ONCE(*ptep);
- if (pte_present(pte) &&
- (pte_val(pte) & pgflags) == pgflags)
- ok = 1;
- }
- local_irq_restore(flags);
- if (ok) {
- spin_lock(&kvm->mmu_lock);
- if (mmu_notifier_retry(vcpu->kvm, mmu_seq)) {
- spin_unlock(&kvm->mmu_lock);
- return RESUME_GUEST;
- }
- /*
- * We are walking the secondary page table here. We can do this
- * without disabling irq.
- */
- ptep = __find_linux_pte(kvm->arch.pgtable,
- gpa, NULL, &shift);
- if (ptep && pte_present(*ptep)) {
- kvmppc_radix_update_pte(kvm, ptep, 0, pgflags,
- gpa, shift);
- spin_unlock(&kvm->mmu_lock);
- return RESUME_GUEST;
- }
- spin_unlock(&kvm->mmu_lock);
+ /*
+ * We are walking the secondary page table here. We can do this
+ * without disabling irq.
+ */
+ spin_lock(&kvm->mmu_lock);
+ ptep = __find_linux_pte(kvm->arch.pgtable,
+ gpa, NULL, &shift);
+ if (ptep && pte_present(*ptep) &&
+ (!writing || pte_write(*ptep))) {
+ kvmppc_radix_update_pte(kvm, ptep, 0, pgflags,
+ gpa, shift);
+ dsisr &= ~DSISR_SET_RC;
}
+ spin_unlock(&kvm->mmu_lock);
+ if (!(dsisr & (DSISR_BAD_FAULT_64S | DSISR_NOHPTE |
+ DSISR_PROTFAULT | DSISR_SET_RC)))
+ return RESUME_GUEST;
}
- ret = -EFAULT;
- pfn = 0;
- pte_size = PAGE_SIZE;
- pgflags = _PAGE_READ | _PAGE_EXEC;
- level = 0;
- npages = get_user_pages_fast(hva, 1, writing, pages);
- if (npages < 1) {
- /* Check if it's an I/O mapping */
- down_read(&current->mm->mmap_sem);
- vma = find_vma(current->mm, hva);
- if (vma && vma->vm_start <= hva && hva < vma->vm_end &&
- (vma->vm_flags & VM_PFNMAP)) {
- pfn = vma->vm_pgoff +
- ((hva - vma->vm_start) >> PAGE_SHIFT);
- pgflags = pgprot_val(vma->vm_page_prot);
- }
- up_read(&current->mm->mmap_sem);
- if (!pfn)
- return -EFAULT;
- } else {
- page = pages[0];
+ /* used to check for invalidations in progress */
+ mmu_seq = kvm->mmu_notifier_seq;
+ smp_rmb();
+
+ /*
+ * Do a fast check first, since __gfn_to_pfn_memslot doesn't
+ * do it with !atomic && !async, which is how we call it.
+ * We always ask for write permission since the common case
+ * is that the page is writable.
+ */
+ hva = gfn_to_hva_memslot(memslot, gfn);
+ if (upgrade_p && __get_user_pages_fast(hva, 1, 1, &page) == 1) {
pfn = page_to_pfn(page);
- if (PageCompound(page)) {
- pte_size <<= compound_order(compound_head(page));
- /* See if we can insert a 2MB large-page PTE here */
- if (pte_size >= PMD_SIZE &&
- (gpa & (PMD_SIZE - PAGE_SIZE)) ==
- (hva & (PMD_SIZE - PAGE_SIZE))) {
- level = 1;
- pfn &= ~((PMD_SIZE >> PAGE_SHIFT) - 1);
- }
+ upgrade_write = true;
+ } else {
+ /* Call KVM generic code to do the slow-path check */
+ pfn = __gfn_to_pfn_memslot(memslot, gfn, false, NULL,
+ writing, upgrade_p);
+ if (is_error_noslot_pfn(pfn))
+ return -EFAULT;
+ page = NULL;
+ if (pfn_valid(pfn)) {
+ page = pfn_to_page(pfn);
+ if (PageReserved(page))
+ page = NULL;
}
- /* See if we can provide write access */
- if (writing) {
- pgflags |= _PAGE_WRITE;
- } else {
- local_irq_save(flags);
- ptep = find_current_mm_pte(current->mm->pgd,
- hva, NULL, NULL);
- if (ptep && pte_write(*ptep))
- pgflags |= _PAGE_WRITE;
- local_irq_restore(flags);
+ }
+
+ /* See if we can insert a 1GB or 2MB large PTE here */
+ level = 0;
+ if (page && PageCompound(page)) {
+ pte_size = PAGE_SIZE << compound_order(compound_head(page));
+ if (pte_size >= PUD_SIZE &&
+ (gpa & (PUD_SIZE - PAGE_SIZE)) ==
+ (hva & (PUD_SIZE - PAGE_SIZE))) {
+ level = 2;
+ pfn &= ~((PUD_SIZE >> PAGE_SHIFT) - 1);
+ } else if (pte_size >= PMD_SIZE &&
+ (gpa & (PMD_SIZE - PAGE_SIZE)) ==
+ (hva & (PMD_SIZE - PAGE_SIZE))) {
+ level = 1;
+ pfn &= ~((PMD_SIZE >> PAGE_SHIFT) - 1);
}
}
/*
* Compute the PTE value that we need to insert.
*/
- pgflags |= _PAGE_PRESENT | _PAGE_PTE | _PAGE_ACCESSED;
- if (pgflags & _PAGE_WRITE)
- pgflags |= _PAGE_DIRTY;
- pte = pfn_pte(pfn, __pgprot(pgflags));
-
- /* Allocate space in the tree and write the PTE */
- ret = kvmppc_create_pte(kvm, pte, gpa, level, mmu_seq);
- if (ret == -EBUSY) {
+ if (page) {
+ pgflags = _PAGE_READ | _PAGE_EXEC | _PAGE_PRESENT | _PAGE_PTE |
+ _PAGE_ACCESSED;
+ if (writing || upgrade_write)
+ pgflags |= _PAGE_WRITE | _PAGE_DIRTY;
+ pte = pfn_pte(pfn, __pgprot(pgflags));
+ } else {
/*
- * There's already a PMD where wanted to install a large page;
- * for now, fall back to installing a small page.
+ * Read the PTE from the process' radix tree and use that
+ * so we get the attribute bits.
*/
- level = 0;
- pfn |= gfn & ((PMD_SIZE >> PAGE_SHIFT) - 1);
- pte = pfn_pte(pfn, __pgprot(pgflags));
- ret = kvmppc_create_pte(kvm, pte, gpa, level, mmu_seq);
+ local_irq_disable();
+ ptep = __find_linux_pte(vcpu->arch.pgdir, hva, NULL, &shift);
+ pte = *ptep;
+ local_irq_enable();
+ if (shift == PUD_SHIFT &&
+ (gpa & (PUD_SIZE - PAGE_SIZE)) ==
+ (hva & (PUD_SIZE - PAGE_SIZE))) {
+ level = 2;
+ } else if (shift == PMD_SHIFT &&
+ (gpa & (PMD_SIZE - PAGE_SIZE)) ==
+ (hva & (PMD_SIZE - PAGE_SIZE))) {
+ level = 1;
+ } else if (shift && shift != PAGE_SHIFT) {
+ /* Adjust PFN */
+ unsigned long mask = (1ul << shift) - PAGE_SIZE;
+ pte = __pte(pte_val(pte) | (hva & mask));
+ }
+ if (!(writing || upgrade_write))
+ pte = __pte(pte_val(pte) & ~ _PAGE_WRITE);
+ pte = __pte(pte_val(pte) | _PAGE_EXEC);
}
+ /* Allocate space in the tree and write the PTE */
+ ret = kvmppc_create_pte(kvm, pte, gpa, level, mmu_seq);
+
if (page) {
- if (!ret && (pgflags & _PAGE_WRITE))
+ if (!ret && (pte_val(pte) & _PAGE_WRITE))
set_page_dirty_lock(page);
put_page(page);
}
@@ -662,6 +741,10 @@ void kvmppc_free_radix(struct kvm *kvm)
for (iu = 0; iu < PTRS_PER_PUD; ++iu, ++pud) {
if (!pud_present(*pud))
continue;
+ if (pud_huge(*pud)) {
+ pud_clear(pud);
+ continue;
+ }
pmd = pmd_offset(pud, 0);
for (im = 0; im < PTRS_PER_PMD; ++im, ++pmd) {
if (pmd_is_leaf(*pmd)) {