aboutsummaryrefslogtreecommitdiffstats
path: root/arch/arm64/kvm
diff options
context:
space:
mode:
authorLinus Torvalds <torvalds@linux-foundation.org>2020-10-23 11:17:56 -0700
committerLinus Torvalds <torvalds@linux-foundation.org>2020-10-23 11:17:56 -0700
commitf9a705ad1c077ec2872c641f0db9c0d5b4a097bb (patch)
tree7f5d18d74f700be5bcf72ec5f4955f016eac9ab9 /arch/arm64/kvm
parentMerge tag 'for_linus' of git://git.kernel.org/pub/scm/linux/kernel/git/mst/vhost (diff)
parentkvm: x86/mmu: NX largepage recovery for TDP MMU (diff)
downloadlinux-dev-f9a705ad1c077ec2872c641f0db9c0d5b4a097bb.tar.xz
linux-dev-f9a705ad1c077ec2872c641f0db9c0d5b4a097bb.zip
Merge tag 'for-linus' of git://git.kernel.org/pub/scm/virt/kvm/kvm
Pull KVM updates from Paolo Bonzini: "For x86, there is a new alternative and (in the future) more scalable implementation of extended page tables that does not need a reverse map from guest physical addresses to host physical addresses. For now it is disabled by default because it is still lacking a few of the existing MMU's bells and whistles. However it is a very solid piece of work and it is already available for people to hammer on it. Other updates: ARM: - New page table code for both hypervisor and guest stage-2 - Introduction of a new EL2-private host context - Allow EL2 to have its own private per-CPU variables - Support of PMU event filtering - Complete rework of the Spectre mitigation PPC: - Fix for running nested guests with in-kernel IRQ chip - Fix race condition causing occasional host hard lockup - Minor cleanups and bugfixes x86: - allow trapping unknown MSRs to userspace - allow userspace to force #GP on specific MSRs - INVPCID support on AMD - nested AMD cleanup, on demand allocation of nested SVM state - hide PV MSRs and hypercalls for features not enabled in CPUID - new test for MSR_IA32_TSC writes from host and guest - cleanups: MMU, CPUID, shared MSRs - LAPIC latency optimizations ad bugfixes" * tag 'for-linus' of git://git.kernel.org/pub/scm/virt/kvm/kvm: (232 commits) kvm: x86/mmu: NX largepage recovery for TDP MMU kvm: x86/mmu: Don't clear write flooding count for direct roots kvm: x86/mmu: Support MMIO in the TDP MMU kvm: x86/mmu: Support write protection for nesting in tdp MMU kvm: x86/mmu: Support disabling dirty logging for the tdp MMU kvm: x86/mmu: Support dirty logging for the TDP MMU kvm: x86/mmu: Support changed pte notifier in tdp MMU kvm: x86/mmu: Add access tracking for tdp_mmu kvm: x86/mmu: Support invalidate range MMU notifier for TDP MMU kvm: x86/mmu: Allocate struct kvm_mmu_pages for all pages in TDP MMU kvm: x86/mmu: Add TDP MMU PF handler kvm: x86/mmu: Remove disallowed_hugepage_adjust shadow_walk_iterator arg kvm: x86/mmu: Support zapping SPTEs in the TDP MMU KVM: Cache as_id in kvm_memory_slot kvm: x86/mmu: Add functions to handle changed TDP SPTEs kvm: x86/mmu: Allocate and free TDP MMU roots kvm: x86/mmu: Init / Uninit the TDP MMU kvm: x86/mmu: Introduce tdp_iter KVM: mmu: extract spte.h and spte.c KVM: mmu: Separate updating a PTE from kvm_set_pte_rmapp ...
Diffstat (limited to 'arch/arm64/kvm')
-rw-r--r--arch/arm64/kvm/Makefile2
-rw-r--r--arch/arm64/kvm/arm.c70
-rw-r--r--arch/arm64/kvm/hyp.S34
-rw-r--r--arch/arm64/kvm/hyp/Makefile2
-rw-r--r--arch/arm64/kvm/hyp/entry.S95
-rw-r--r--arch/arm64/kvm/hyp/hyp-entry.S76
-rw-r--r--arch/arm64/kvm/hyp/include/hyp/debug-sr.h4
-rw-r--r--arch/arm64/kvm/hyp/include/hyp/switch.h15
-rw-r--r--arch/arm64/kvm/hyp/nvhe/.gitignore2
-rw-r--r--arch/arm64/kvm/hyp/nvhe/Makefile62
-rw-r--r--arch/arm64/kvm/hyp/nvhe/host.S187
-rw-r--r--arch/arm64/kvm/hyp/nvhe/hyp-init.S67
-rw-r--r--arch/arm64/kvm/hyp/nvhe/hyp-main.c117
-rw-r--r--arch/arm64/kvm/hyp/nvhe/hyp.lds.S19
-rw-r--r--arch/arm64/kvm/hyp/nvhe/switch.c48
-rw-r--r--arch/arm64/kvm/hyp/nvhe/tlb.c2
-rw-r--r--arch/arm64/kvm/hyp/pgtable.c892
-rw-r--r--arch/arm64/kvm/hyp/vhe/switch.c31
-rw-r--r--arch/arm64/kvm/hyp/vhe/sysreg-sr.c4
-rw-r--r--arch/arm64/kvm/inject_fault.c1
-rw-r--r--arch/arm64/kvm/mmu.c1611
-rw-r--r--arch/arm64/kvm/pmu-emul.c195
-rw-r--r--arch/arm64/kvm/pmu.c13
-rw-r--r--arch/arm64/kvm/reset.c40
-rw-r--r--arch/arm64/kvm/sys_regs.c5
-rw-r--r--arch/arm64/kvm/vgic/vgic-debug.c24
-rw-r--r--arch/arm64/kvm/vgic/vgic-v3.c4
27 files changed, 1848 insertions, 1774 deletions
diff --git a/arch/arm64/kvm/Makefile b/arch/arm64/kvm/Makefile
index 99977c1972cc..1504c81fbf5d 100644
--- a/arch/arm64/kvm/Makefile
+++ b/arch/arm64/kvm/Makefile
@@ -13,7 +13,7 @@ obj-$(CONFIG_KVM) += hyp/
kvm-y := $(KVM)/kvm_main.o $(KVM)/coalesced_mmio.o $(KVM)/eventfd.o \
$(KVM)/vfio.o $(KVM)/irqchip.o \
arm.o mmu.o mmio.o psci.o perf.o hypercalls.o pvtime.o \
- inject_fault.o regmap.o va_layout.o hyp.o handle_exit.o \
+ inject_fault.o regmap.o va_layout.o handle_exit.o \
guest.o debug.o reset.o sys_regs.o \
vgic-sys-reg-v3.o fpsimd.o pmu.o \
aarch32.o arch_timer.o \
diff --git a/arch/arm64/kvm/arm.c b/arch/arm64/kvm/arm.c
index acf9a993dfb6..f56122eedffc 100644
--- a/arch/arm64/kvm/arm.c
+++ b/arch/arm64/kvm/arm.c
@@ -46,8 +46,10 @@
__asm__(".arch_extension virt");
#endif
-DEFINE_PER_CPU(kvm_host_data_t, kvm_host_data);
+DECLARE_KVM_HYP_PER_CPU(unsigned long, kvm_hyp_vector);
+
static DEFINE_PER_CPU(unsigned long, kvm_arm_hyp_stack_page);
+unsigned long kvm_arm_hyp_percpu_base[NR_CPUS];
/* The VMID used in the VTTBR */
static atomic64_t kvm_vmid_gen = ATOMIC64_INIT(1);
@@ -145,6 +147,8 @@ void kvm_arch_destroy_vm(struct kvm *kvm)
{
int i;
+ bitmap_free(kvm->arch.pmu_filter);
+
kvm_vgic_destroy(kvm);
for (i = 0; i < KVM_MAX_VCPUS; ++i) {
@@ -286,7 +290,7 @@ void kvm_arch_vcpu_destroy(struct kvm_vcpu *vcpu)
if (vcpu->arch.has_run_once && unlikely(!irqchip_in_kernel(vcpu->kvm)))
static_branch_dec(&userspace_irqchip_in_use);
- kvm_mmu_free_memory_caches(vcpu);
+ kvm_mmu_free_memory_cache(&vcpu->arch.mmu_page_cache);
kvm_timer_vcpu_terminate(vcpu);
kvm_pmu_vcpu_destroy(vcpu);
@@ -1259,6 +1263,19 @@ long kvm_arch_vm_ioctl(struct file *filp,
}
}
+static unsigned long nvhe_percpu_size(void)
+{
+ return (unsigned long)CHOOSE_NVHE_SYM(__per_cpu_end) -
+ (unsigned long)CHOOSE_NVHE_SYM(__per_cpu_start);
+}
+
+static unsigned long nvhe_percpu_order(void)
+{
+ unsigned long size = nvhe_percpu_size();
+
+ return size ? get_order(size) : 0;
+}
+
static int kvm_map_vectors(void)
{
/*
@@ -1299,6 +1316,7 @@ static void cpu_init_hyp_mode(void)
unsigned long hyp_stack_ptr;
unsigned long vector_ptr;
unsigned long tpidr_el2;
+ struct arm_smccc_res res;
/* Switch from the HYP stub to our own HYP init vector */
__hyp_set_vectors(kvm_get_idmap_vector());
@@ -1308,12 +1326,13 @@ static void cpu_init_hyp_mode(void)
* kernel's mapping to the linear mapping, and store it in tpidr_el2
* so that we can use adr_l to access per-cpu variables in EL2.
*/
- tpidr_el2 = ((unsigned long)this_cpu_ptr(&kvm_host_data) -
- (unsigned long)kvm_ksym_ref(&kvm_host_data));
+ tpidr_el2 = (unsigned long)this_cpu_ptr_nvhe_sym(__per_cpu_start) -
+ (unsigned long)kvm_ksym_ref(CHOOSE_NVHE_SYM(__per_cpu_start));
pgd_ptr = kvm_mmu_get_httbr();
hyp_stack_ptr = __this_cpu_read(kvm_arm_hyp_stack_page) + PAGE_SIZE;
- vector_ptr = (unsigned long)kvm_get_hyp_vector();
+ hyp_stack_ptr = kern_hyp_va(hyp_stack_ptr);
+ vector_ptr = (unsigned long)kern_hyp_va(kvm_ksym_ref(__kvm_hyp_host_vector));
/*
* Call initialization code, and switch to the full blown HYP code.
@@ -1322,7 +1341,9 @@ static void cpu_init_hyp_mode(void)
* cpus_have_const_cap() wrapper.
*/
BUG_ON(!system_capabilities_finalized());
- __kvm_call_hyp((void *)pgd_ptr, hyp_stack_ptr, vector_ptr, tpidr_el2);
+ arm_smccc_1_1_hvc(KVM_HOST_SMCCC_FUNC(__kvm_hyp_init),
+ pgd_ptr, tpidr_el2, hyp_stack_ptr, vector_ptr, &res);
+ WARN_ON(res.a0 != SMCCC_RET_SUCCESS);
/*
* Disabling SSBD on a non-VHE system requires us to enable SSBS
@@ -1342,10 +1363,12 @@ static void cpu_hyp_reset(void)
static void cpu_hyp_reinit(void)
{
- kvm_init_host_cpu_context(&this_cpu_ptr(&kvm_host_data)->host_ctxt);
+ kvm_init_host_cpu_context(&this_cpu_ptr_hyp_sym(kvm_host_data)->host_ctxt);
cpu_hyp_reset();
+ *this_cpu_ptr_hyp_sym(kvm_hyp_vector) = (unsigned long)kvm_get_hyp_vector();
+
if (is_kernel_in_hyp_mode())
kvm_timer_init_vhe();
else
@@ -1496,8 +1519,10 @@ static void teardown_hyp_mode(void)
int cpu;
free_hyp_pgds();
- for_each_possible_cpu(cpu)
+ for_each_possible_cpu(cpu) {
free_page(per_cpu(kvm_arm_hyp_stack_page, cpu));
+ free_pages(kvm_arm_hyp_percpu_base[cpu], nvhe_percpu_order());
+ }
}
/**
@@ -1531,6 +1556,24 @@ static int init_hyp_mode(void)
}
/*
+ * Allocate and initialize pages for Hypervisor-mode percpu regions.
+ */
+ for_each_possible_cpu(cpu) {
+ struct page *page;
+ void *page_addr;
+
+ page = alloc_pages(GFP_KERNEL, nvhe_percpu_order());
+ if (!page) {
+ err = -ENOMEM;
+ goto out_err;
+ }
+
+ page_addr = page_address(page);
+ memcpy(page_addr, CHOOSE_NVHE_SYM(__per_cpu_start), nvhe_percpu_size());
+ kvm_arm_hyp_percpu_base[cpu] = (unsigned long)page_addr;
+ }
+
+ /*
* Map the Hyp-code called directly from the host
*/
err = create_hyp_mappings(kvm_ksym_ref(__hyp_text_start),
@@ -1574,14 +1617,17 @@ static int init_hyp_mode(void)
}
}
+ /*
+ * Map Hyp percpu pages
+ */
for_each_possible_cpu(cpu) {
- kvm_host_data_t *cpu_data;
+ char *percpu_begin = (char *)kvm_arm_hyp_percpu_base[cpu];
+ char *percpu_end = percpu_begin + nvhe_percpu_size();
- cpu_data = per_cpu_ptr(&kvm_host_data, cpu);
- err = create_hyp_mappings(cpu_data, cpu_data + 1, PAGE_HYP);
+ err = create_hyp_mappings(percpu_begin, percpu_end, PAGE_HYP);
if (err) {
- kvm_err("Cannot map host CPU state: %d\n", err);
+ kvm_err("Cannot map hyp percpu region\n");
goto out_err;
}
}
diff --git a/arch/arm64/kvm/hyp.S b/arch/arm64/kvm/hyp.S
deleted file mode 100644
index 3c79a1124af2..000000000000
--- a/arch/arm64/kvm/hyp.S
+++ /dev/null
@@ -1,34 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0-only */
-/*
- * Copyright (C) 2012,2013 - ARM Ltd
- * Author: Marc Zyngier <marc.zyngier@arm.com>
- */
-
-#include <linux/linkage.h>
-
-#include <asm/alternative.h>
-#include <asm/assembler.h>
-#include <asm/cpufeature.h>
-
-/*
- * u64 __kvm_call_hyp(void *hypfn, ...);
- *
- * This is not really a variadic function in the classic C-way and care must
- * be taken when calling this to ensure parameters are passed in registers
- * only, since the stack will change between the caller and the callee.
- *
- * Call the function with the first argument containing a pointer to the
- * function you wish to call in Hyp mode, and subsequent arguments will be
- * passed as x0, x1, and x2 (a maximum of 3 arguments in addition to the
- * function pointer can be passed). The function being called must be mapped
- * in Hyp mode (see init_hyp_mode in arch/arm/kvm/arm.c). Return values are
- * passed in x0.
- *
- * A function pointer with a value less than 0xfff has a special meaning,
- * and is used to implement hyp stubs in the same way as in
- * arch/arm64/kernel/hyp_stub.S.
- */
-SYM_FUNC_START(__kvm_call_hyp)
- hvc #0
- ret
-SYM_FUNC_END(__kvm_call_hyp)
diff --git a/arch/arm64/kvm/hyp/Makefile b/arch/arm64/kvm/hyp/Makefile
index d898f0da5802..4a81eddabcd8 100644
--- a/arch/arm64/kvm/hyp/Makefile
+++ b/arch/arm64/kvm/hyp/Makefile
@@ -10,4 +10,4 @@ subdir-ccflags-y := -I$(incdir) \
-DDISABLE_BRANCH_PROFILING \
$(DISABLE_STACKLEAK_PLUGIN)
-obj-$(CONFIG_KVM) += vhe/ nvhe/ smccc_wa.o
+obj-$(CONFIG_KVM) += vhe/ nvhe/ pgtable.o smccc_wa.o
diff --git a/arch/arm64/kvm/hyp/entry.S b/arch/arm64/kvm/hyp/entry.S
index 76e7eaf4675e..b0afad7a99c6 100644
--- a/arch/arm64/kvm/hyp/entry.S
+++ b/arch/arm64/kvm/hyp/entry.S
@@ -7,7 +7,6 @@
#include <linux/linkage.h>
#include <asm/alternative.h>
-#include <asm/asm-offsets.h>
#include <asm/assembler.h>
#include <asm/fpsimdmacros.h>
#include <asm/kvm.h>
@@ -16,66 +15,28 @@
#include <asm/kvm_mmu.h>
#include <asm/kvm_ptrauth.h>
-#define CPU_XREG_OFFSET(x) (CPU_USER_PT_REGS + 8*x)
-#define CPU_SP_EL0_OFFSET (CPU_XREG_OFFSET(30) + 8)
-
.text
/*
- * We treat x18 as callee-saved as the host may use it as a platform
- * register (e.g. for shadow call stack).
- */
-.macro save_callee_saved_regs ctxt
- str x18, [\ctxt, #CPU_XREG_OFFSET(18)]
- stp x19, x20, [\ctxt, #CPU_XREG_OFFSET(19)]
- stp x21, x22, [\ctxt, #CPU_XREG_OFFSET(21)]
- stp x23, x24, [\ctxt, #CPU_XREG_OFFSET(23)]
- stp x25, x26, [\ctxt, #CPU_XREG_OFFSET(25)]
- stp x27, x28, [\ctxt, #CPU_XREG_OFFSET(27)]
- stp x29, lr, [\ctxt, #CPU_XREG_OFFSET(29)]
-.endm
-
-.macro restore_callee_saved_regs ctxt
- // We require \ctxt is not x18-x28
- ldr x18, [\ctxt, #CPU_XREG_OFFSET(18)]
- ldp x19, x20, [\ctxt, #CPU_XREG_OFFSET(19)]
- ldp x21, x22, [\ctxt, #CPU_XREG_OFFSET(21)]
- ldp x23, x24, [\ctxt, #CPU_XREG_OFFSET(23)]
- ldp x25, x26, [\ctxt, #CPU_XREG_OFFSET(25)]
- ldp x27, x28, [\ctxt, #CPU_XREG_OFFSET(27)]
- ldp x29, lr, [\ctxt, #CPU_XREG_OFFSET(29)]
-.endm
-
-.macro save_sp_el0 ctxt, tmp
- mrs \tmp, sp_el0
- str \tmp, [\ctxt, #CPU_SP_EL0_OFFSET]
-.endm
-
-.macro restore_sp_el0 ctxt, tmp
- ldr \tmp, [\ctxt, #CPU_SP_EL0_OFFSET]
- msr sp_el0, \tmp
-.endm
-
-/*
- * u64 __guest_enter(struct kvm_vcpu *vcpu,
- * struct kvm_cpu_context *host_ctxt);
+ * u64 __guest_enter(struct kvm_vcpu *vcpu);
*/
SYM_FUNC_START(__guest_enter)
// x0: vcpu
- // x1: host context
- // x2-x17: clobbered by macros
+ // x1-x17: clobbered by macros
// x29: guest context
- // Store the host regs
+ adr_this_cpu x1, kvm_hyp_ctxt, x2
+
+ // Store the hyp regs
save_callee_saved_regs x1
- // Save the host's sp_el0
+ // Save hyp's sp_el0
save_sp_el0 x1, x2
- // Now the host state is stored if we have a pending RAS SError it must
- // affect the host. If any asynchronous exception is pending we defer
- // the guest entry. The DSB isn't necessary before v8.2 as any SError
- // would be fatal.
+ // Now the hyp state is stored if we have a pending RAS SError it must
+ // affect the host or hyp. If any asynchronous exception is pending we
+ // defer the guest entry. The DSB isn't necessary before v8.2 as any
+ // SError would be fatal.
alternative_if ARM64_HAS_RAS_EXTN
dsb nshst
isb
@@ -86,6 +47,8 @@ alternative_else_nop_endif
ret
1:
+ set_loaded_vcpu x0, x1, x2
+
add x29, x0, #VCPU_CONTEXT
// Macro ptrauth_switch_to_guest format:
@@ -116,6 +79,26 @@ alternative_else_nop_endif
eret
sb
+SYM_INNER_LABEL(__guest_exit_panic, SYM_L_GLOBAL)
+ // x2-x29,lr: vcpu regs
+ // vcpu x0-x1 on the stack
+
+ // If the hyp context is loaded, go straight to hyp_panic
+ get_loaded_vcpu x0, x1
+ cbz x0, hyp_panic
+
+ // The hyp context is saved so make sure it is restored to allow
+ // hyp_panic to run at hyp and, subsequently, panic to run in the host.
+ // This makes use of __guest_exit to avoid duplication but sets the
+ // return address to tail call into hyp_panic. As a side effect, the
+ // current state is saved to the guest context but it will only be
+ // accurate if the guest had been completely restored.
+ adr_this_cpu x0, kvm_hyp_ctxt, x1
+ adr x1, hyp_panic
+ str x1, [x0, #CPU_XREG_OFFSET(30)]
+
+ get_vcpu_ptr x1, x0
+
SYM_INNER_LABEL(__guest_exit, SYM_L_GLOBAL)
// x0: return code
// x1: vcpu
@@ -148,21 +131,23 @@ SYM_INNER_LABEL(__guest_exit, SYM_L_GLOBAL)
// Store the guest's sp_el0
save_sp_el0 x1, x2
- get_host_ctxt x2, x3
+ adr_this_cpu x2, kvm_hyp_ctxt, x3
- // Macro ptrauth_switch_to_guest format:
- // ptrauth_switch_to_host(guest cxt, host cxt, tmp1, tmp2, tmp3)
+ // Macro ptrauth_switch_to_hyp format:
+ // ptrauth_switch_to_hyp(guest cxt, host cxt, tmp1, tmp2, tmp3)
// The below macro to save/restore keys is not implemented in C code
// as it may cause Pointer Authentication key signing mismatch errors
// when this feature is enabled for kernel code.
- ptrauth_switch_to_host x1, x2, x3, x4, x5
+ ptrauth_switch_to_hyp x1, x2, x3, x4, x5
- // Restore the hosts's sp_el0
+ // Restore hyp's sp_el0
restore_sp_el0 x2, x3
- // Now restore the host regs
+ // Now restore the hyp regs
restore_callee_saved_regs x2
+ set_loaded_vcpu xzr, x1, x2
+
alternative_if ARM64_HAS_RAS_EXTN
// If we have the RAS extensions we can consume a pending error
// without an unmask-SError and isb. The ESB-instruction consumed any
diff --git a/arch/arm64/kvm/hyp/hyp-entry.S b/arch/arm64/kvm/hyp/hyp-entry.S
index 7ea277b82967..0a5b36eb54b3 100644
--- a/arch/arm64/kvm/hyp/hyp-entry.S
+++ b/arch/arm64/kvm/hyp/hyp-entry.S
@@ -12,7 +12,6 @@
#include <asm/cpufeature.h>
#include <asm/kvm_arm.h>
#include <asm/kvm_asm.h>
-#include <asm/kvm_mmu.h>
#include <asm/mmu.h>
.macro save_caller_saved_regs_vect
@@ -41,20 +40,6 @@
.text
-.macro do_el2_call
- /*
- * Shuffle the parameters before calling the function
- * pointed to in x0. Assumes parameters in x[1,2,3].
- */
- str lr, [sp, #-16]!
- mov lr, x0
- mov x0, x1
- mov x1, x2
- mov x2, x3
- blr lr
- ldr lr, [sp], #16
-.endm
-
el1_sync: // Guest trapped into EL2
mrs x0, esr_el2
@@ -63,44 +48,6 @@ el1_sync: // Guest trapped into EL2
ccmp x0, #ESR_ELx_EC_HVC32, #4, ne
b.ne el1_trap
-#ifdef __KVM_NVHE_HYPERVISOR__
- mrs x1, vttbr_el2 // If vttbr is valid, the guest
- cbnz x1, el1_hvc_guest // called HVC
-
- /* Here, we're pretty sure the host called HVC. */
- ldp x0, x1, [sp], #16
-
- /* Check for a stub HVC call */
- cmp x0, #HVC_STUB_HCALL_NR
- b.hs 1f
-
- /*
- * Compute the idmap address of __kvm_handle_stub_hvc and
- * jump there. Since we use kimage_voffset, do not use the
- * HYP VA for __kvm_handle_stub_hvc, but the kernel VA instead
- * (by loading it from the constant pool).
- *
- * Preserve x0-x4, which may contain stub parameters.
- */
- ldr x5, =__kvm_handle_stub_hvc
- ldr_l x6, kimage_voffset
-
- /* x5 = __pa(x5) */
- sub x5, x5, x6
- br x5
-
-1:
- /*
- * Perform the EL2 call
- */
- kern_hyp_va x0
- do_el2_call
-
- eret
- sb
-#endif /* __KVM_NVHE_HYPERVISOR__ */
-
-el1_hvc_guest:
/*
* Fastest possible path for ARM_SMCCC_ARCH_WORKAROUND_1.
* The workaround has already been applied on the host,
@@ -169,24 +116,7 @@ el2_error:
eret
sb
-#ifdef __KVM_NVHE_HYPERVISOR__
-SYM_FUNC_START(__hyp_do_panic)
- mov lr, #(PSR_F_BIT | PSR_I_BIT | PSR_A_BIT | PSR_D_BIT |\
- PSR_MODE_EL1h)
- msr spsr_el2, lr
- ldr lr, =panic
- msr elr_el2, lr
- eret
- sb
-SYM_FUNC_END(__hyp_do_panic)
-#endif
-
-SYM_CODE_START(__hyp_panic)
- get_host_ctxt x0, x1
- b hyp_panic
-SYM_CODE_END(__hyp_panic)
-
-.macro invalid_vector label, target = __hyp_panic
+.macro invalid_vector label, target = __guest_exit_panic
.align 2
SYM_CODE_START(\label)
b \target
@@ -198,7 +128,6 @@ SYM_CODE_END(\label)
invalid_vector el2t_irq_invalid
invalid_vector el2t_fiq_invalid
invalid_vector el2t_error_invalid
- invalid_vector el2h_sync_invalid
invalid_vector el2h_irq_invalid
invalid_vector el2h_fiq_invalid
invalid_vector el1_fiq_invalid
@@ -228,10 +157,9 @@ check_preamble_length 661b, 662b
.macro invalid_vect target
.align 7
661:
- b \target
nop
+ stp x0, x1, [sp, #-16]!
662:
- ldp x0, x1, [sp], #16
b \target
check_preamble_length 661b, 662b
diff --git a/arch/arm64/kvm/hyp/include/hyp/debug-sr.h b/arch/arm64/kvm/hyp/include/hyp/debug-sr.h
index 5e28ea6aa097..4ebe9f558f3a 100644
--- a/arch/arm64/kvm/hyp/include/hyp/debug-sr.h
+++ b/arch/arm64/kvm/hyp/include/hyp/debug-sr.h
@@ -135,7 +135,7 @@ static inline void __debug_switch_to_guest_common(struct kvm_vcpu *vcpu)
if (!(vcpu->arch.flags & KVM_ARM64_DEBUG_DIRTY))
return;
- host_ctxt = &__hyp_this_cpu_ptr(kvm_host_data)->host_ctxt;
+ host_ctxt = &this_cpu_ptr(&kvm_host_data)->host_ctxt;
guest_ctxt = &vcpu->arch.ctxt;
host_dbg = &vcpu->arch.host_debug_state.regs;
guest_dbg = kern_hyp_va(vcpu->arch.debug_ptr);
@@ -154,7 +154,7 @@ static inline void __debug_switch_to_host_common(struct kvm_vcpu *vcpu)
if (!(vcpu->arch.flags & KVM_ARM64_DEBUG_DIRTY))
return;
- host_ctxt = &__hyp_this_cpu_ptr(kvm_host_data)->host_ctxt;
+ host_ctxt = &this_cpu_ptr(&kvm_host_data)->host_ctxt;
guest_ctxt = &vcpu->arch.ctxt;
host_dbg = &vcpu->arch.host_debug_state.regs;
guest_dbg = kern_hyp_va(vcpu->arch.debug_ptr);
diff --git a/arch/arm64/kvm/hyp/include/hyp/switch.h b/arch/arm64/kvm/hyp/include/hyp/switch.h
index d0f07e8cc3ff..313a8fa3c721 100644
--- a/arch/arm64/kvm/hyp/include/hyp/switch.h
+++ b/arch/arm64/kvm/hyp/include/hyp/switch.h
@@ -126,11 +126,6 @@ static inline void ___deactivate_traps(struct kvm_vcpu *vcpu)
}
}
-static inline void __activate_vm(struct kvm_s2_mmu *mmu)
-{
- __load_guest_stage2(mmu);
-}
-
static inline bool __translate_far_to_hpfar(u64 far, u64 *hpfar)
{
u64 par, tmp;
@@ -377,6 +372,8 @@ static inline bool esr_is_ptrauth_trap(u32 esr)
ctxt_sys_reg(ctxt, key ## KEYHI_EL1) = __val; \
} while(0)
+DECLARE_PER_CPU(struct kvm_cpu_context, kvm_hyp_ctxt);
+
static inline bool __hyp_handle_ptrauth(struct kvm_vcpu *vcpu)
{
struct kvm_cpu_context *ctxt;
@@ -386,7 +383,7 @@ static inline bool __hyp_handle_ptrauth(struct kvm_vcpu *vcpu)
!esr_is_ptrauth_trap(kvm_vcpu_get_esr(vcpu)))
return false;
- ctxt = &__hyp_this_cpu_ptr(kvm_host_data)->host_ctxt;
+ ctxt = this_cpu_ptr(&kvm_hyp_ctxt);
__ptrauth_save_key(ctxt, APIA);
__ptrauth_save_key(ctxt, APIB);
__ptrauth_save_key(ctxt, APDA);
@@ -481,14 +478,13 @@ exit:
static inline void __kvm_unexpected_el2_exception(void)
{
+ extern char __guest_exit_panic[];
unsigned long addr, fixup;
- struct kvm_cpu_context *host_ctxt;
struct exception_table_entry *entry, *end;
unsigned long elr_el2 = read_sysreg(elr_el2);
entry = hyp_symbol_addr(__start___kvm_ex_table);
end = hyp_symbol_addr(__stop___kvm_ex_table);
- host_ctxt = &__hyp_this_cpu_ptr(kvm_host_data)->host_ctxt;
while (entry < end) {
addr = (unsigned long)&entry->insn + entry->insn;
@@ -503,7 +499,8 @@ static inline void __kvm_unexpected_el2_exception(void)
return;
}
- hyp_panic(host_ctxt);
+ /* Trigger a panic after restoring the hyp context. */
+ write_sysreg(__guest_exit_panic, elr_el2);
}
#endif /* __ARM64_KVM_HYP_SWITCH_H__ */
diff --git a/arch/arm64/kvm/hyp/nvhe/.gitignore b/arch/arm64/kvm/hyp/nvhe/.gitignore
new file mode 100644
index 000000000000..695d73d0249e
--- /dev/null
+++ b/arch/arm64/kvm/hyp/nvhe/.gitignore
@@ -0,0 +1,2 @@
+# SPDX-License-Identifier: GPL-2.0-only
+hyp.lds
diff --git a/arch/arm64/kvm/hyp/nvhe/Makefile b/arch/arm64/kvm/hyp/nvhe/Makefile
index aef76487edc2..ddde15fe85f2 100644
--- a/arch/arm64/kvm/hyp/nvhe/Makefile
+++ b/arch/arm64/kvm/hyp/nvhe/Makefile
@@ -6,44 +6,50 @@
asflags-y := -D__KVM_NVHE_HYPERVISOR__
ccflags-y := -D__KVM_NVHE_HYPERVISOR__
-obj-y := timer-sr.o sysreg-sr.o debug-sr.o switch.o tlb.o hyp-init.o
+obj-y := timer-sr.o sysreg-sr.o debug-sr.o switch.o tlb.o hyp-init.o host.o hyp-main.o
obj-y += ../vgic-v3-sr.o ../aarch32.o ../vgic-v2-cpuif-proxy.o ../entry.o \
../fpsimd.o ../hyp-entry.o
-obj-y := $(patsubst %.o,%.hyp.o,$(obj-y))
-extra-y := $(patsubst %.hyp.o,%.hyp.tmp.o,$(obj-y))
+##
+## Build rules for compiling nVHE hyp code
+## Output of this folder is `kvm_nvhe.o`, a partially linked object
+## file containing all nVHE hyp code and data.
+##
-$(obj)/%.hyp.tmp.o: $(src)/%.c FORCE
+hyp-obj := $(patsubst %.o,%.nvhe.o,$(obj-y))
+obj-y := kvm_nvhe.o
+extra-y := $(hyp-obj) kvm_nvhe.tmp.o hyp.lds
+
+# 1) Compile all source files to `.nvhe.o` object files. The file extension
+# avoids file name clashes for files shared with VHE.
+$(obj)/%.nvhe.o: $(src)/%.c FORCE
$(call if_changed_rule,cc_o_c)
-$(obj)/%.hyp.tmp.o: $(src)/%.S FORCE
+$(obj)/%.nvhe.o: $(src)/%.S FORCE
$(call if_changed_rule,as_o_S)
-$(obj)/%.hyp.o: $(obj)/%.hyp.tmp.o FORCE
- $(call if_changed,hypcopy)
-# Disable reordering functions by GCC (enabled at -O2).
-# This pass puts functions into '.text.*' sections to aid the linker
-# in optimizing ELF layout. See HYPCOPY comment below for more info.
-ccflags-y += $(call cc-option,-fno-reorder-functions)
+# 2) Compile linker script.
+$(obj)/hyp.lds: $(src)/hyp.lds.S FORCE
+ $(call if_changed_dep,cpp_lds_S)
+
+# 3) Partially link all '.nvhe.o' files and apply the linker script.
+# Prefixes names of ELF sections with '.hyp', eg. '.hyp.text'.
+# Note: The following rule assumes that the 'ld' rule puts LDFLAGS before
+# the list of dependencies to form '-T $(obj)/hyp.lds'. This is to
+# keep the dependency on the target while avoiding an error from
+# GNU ld if the linker script is passed to it twice.
+LDFLAGS_kvm_nvhe.tmp.o := -r -T
+$(obj)/kvm_nvhe.tmp.o: $(obj)/hyp.lds $(addprefix $(obj)/,$(hyp-obj)) FORCE
+ $(call if_changed,ld)
+
+# 4) Produce the final 'kvm_nvhe.o', ready to be linked into 'vmlinux'.
+# Prefixes names of ELF symbols with '__kvm_nvhe_'.
+$(obj)/kvm_nvhe.o: $(obj)/kvm_nvhe.tmp.o FORCE
+ $(call if_changed,hypcopy)
# The HYPCOPY command uses `objcopy` to prefix all ELF symbol names
-# and relevant ELF section names to avoid clashes with VHE code/data.
-#
-# Hyp code is assumed to be in the '.text' section of the input object
-# files (with the exception of specialized sections such as
-# '.hyp.idmap.text'). This assumption may be broken by a compiler that
-# divides code into sections like '.text.unlikely' so as to optimize
-# ELF layout. HYPCOPY checks that no such sections exist in the input
-# using `objdump`, otherwise they would be linked together with other
-# kernel code and not memory-mapped correctly at runtime.
+# to avoid clashes with VHE code/data.
quiet_cmd_hypcopy = HYPCOPY $@
- cmd_hypcopy = \
- if $(OBJDUMP) -h $< | grep -F '.text.'; then \
- echo "$@: function reordering not supported in nVHE hyp code" >&2; \
- /bin/false; \
- fi; \
- $(OBJCOPY) --prefix-symbols=__kvm_nvhe_ \
- --rename-section=.text=.hyp.text \
- $< $@
+ cmd_hypcopy = $(OBJCOPY) --prefix-symbols=__kvm_nvhe_ $< $@
# Remove ftrace and Shadow Call Stack CFLAGS.
# This is equivalent to the 'notrace' and '__noscs' annotations.
diff --git a/arch/arm64/kvm/hyp/nvhe/host.S b/arch/arm64/kvm/hyp/nvhe/host.S
new file mode 100644
index 000000000000..ff9a0f547b9f
--- /dev/null
+++ b/arch/arm64/kvm/hyp/nvhe/host.S
@@ -0,0 +1,187 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+/*
+ * Copyright (C) 2020 - Google Inc
+ * Author: Andrew Scull <ascull@google.com>
+ */
+
+#include <linux/linkage.h>
+
+#include <asm/assembler.h>
+#include <asm/kvm_asm.h>
+#include <asm/kvm_mmu.h>
+
+ .text
+
+SYM_FUNC_START(__host_exit)
+ stp x0, x1, [sp, #-16]!
+
+ get_host_ctxt x0, x1
+
+ ALTERNATIVE(nop, SET_PSTATE_PAN(1), ARM64_HAS_PAN, CONFIG_ARM64_PAN)
+
+ /* Store the host regs x2 and x3 */
+ stp x2, x3, [x0, #CPU_XREG_OFFSET(2)]
+
+ /* Retrieve the host regs x0-x1 from the stack */
+ ldp x2, x3, [sp], #16 // x0, x1
+
+ /* Store the host regs x0-x1 and x4-x17 */
+ stp x2, x3, [x0, #CPU_XREG_OFFSET(0)]
+ stp x4, x5, [x0, #CPU_XREG_OFFSET(4)]
+ stp x6, x7, [x0, #CPU_XREG_OFFSET(6)]
+ stp x8, x9, [x0, #CPU_XREG_OFFSET(8)]
+ stp x10, x11, [x0, #CPU_XREG_OFFSET(10)]
+ stp x12, x13, [x0, #CPU_XREG_OFFSET(12)]
+ stp x14, x15, [x0, #CPU_XREG_OFFSET(14)]
+ stp x16, x17, [x0, #CPU_XREG_OFFSET(16)]
+
+ /* Store the host regs x18-x29, lr */
+ save_callee_saved_regs x0
+
+ /* Save the host context pointer in x29 across the function call */
+ mov x29, x0
+ bl handle_trap
+
+ /* Restore host regs x0-x17 */
+ ldp x0, x1, [x29, #CPU_XREG_OFFSET(0)]
+ ldp x2, x3, [x29, #CPU_XREG_OFFSET(2)]
+ ldp x4, x5, [x29, #CPU_XREG_OFFSET(4)]
+ ldp x6, x7, [x29, #CPU_XREG_OFFSET(6)]
+
+ /* x0-7 are use for panic arguments */
+__host_enter_for_panic:
+ ldp x8, x9, [x29, #CPU_XREG_OFFSET(8)]
+ ldp x10, x11, [x29, #CPU_XREG_OFFSET(10)]
+ ldp x12, x13, [x29, #CPU_XREG_OFFSET(12)]
+ ldp x14, x15, [x29, #CPU_XREG_OFFSET(14)]
+ ldp x16, x17, [x29, #CPU_XREG_OFFSET(16)]
+
+ /* Restore host regs x18-x29, lr */
+ restore_callee_saved_regs x29
+
+ /* Do not touch any register after this! */
+__host_enter_without_restoring:
+ eret
+ sb
+SYM_FUNC_END(__host_exit)
+
+/*
+ * void __noreturn __hyp_do_panic(bool restore_host, u64 spsr, u64 elr, u64 par);
+ */
+SYM_FUNC_START(__hyp_do_panic)
+ /* Load the format arguments into x1-7 */
+ mov x6, x3
+ get_vcpu_ptr x7, x3
+
+ mrs x3, esr_el2
+ mrs x4, far_el2
+ mrs x5, hpfar_el2
+
+ /* Prepare and exit to the host's panic funciton. */
+ mov lr, #(PSR_F_BIT | PSR_I_BIT | PSR_A_BIT | PSR_D_BIT |\
+ PSR_MODE_EL1h)
+ msr spsr_el2, lr
+ ldr lr, =panic
+ msr elr_el2, lr
+
+ /*
+ * Set the panic format string and enter the host, conditionally
+ * restoring the host context.
+ */
+ cmp x0, xzr
+ ldr x0, =__hyp_panic_string
+ b.eq __host_enter_without_restoring
+ b __host_enter_for_panic
+SYM_FUNC_END(__hyp_do_panic)
+
+.macro host_el1_sync_vect
+ .align 7
+.L__vect_start\@:
+ stp x0, x1, [sp, #-16]!
+ mrs x0, esr_el2
+ lsr x0, x0, #ESR_ELx_EC_SHIFT
+ cmp x0, #ESR_ELx_EC_HVC64
+ ldp x0, x1, [sp], #16
+ b.ne __host_exit
+
+ /* Check for a stub HVC call */
+ cmp x0, #HVC_STUB_HCALL_NR
+ b.hs __host_exit
+
+ /*
+ * Compute the idmap address of __kvm_handle_stub_hvc and
+ * jump there. Since we use kimage_voffset, do not use the
+ * HYP VA for __kvm_handle_stub_hvc, but the kernel VA instead
+ * (by loading it from the constant pool).
+ *
+ * Preserve x0-x4, which may contain stub parameters.
+ */
+ ldr x5, =__kvm_handle_stub_hvc
+ ldr_l x6, kimage_voffset
+
+ /* x5 = __pa(x5) */
+ sub x5, x5, x6
+ br x5
+.L__vect_end\@:
+.if ((.L__vect_end\@ - .L__vect_start\@) > 0x80)
+ .error "host_el1_sync_vect larger than vector entry"
+.endif
+.endm
+
+.macro invalid_host_el2_vect
+ .align 7
+ /* If a guest is loaded, panic out of it. */
+ stp x0, x1, [sp, #-16]!
+ get_loaded_vcpu x0, x1
+ cbnz x0, __guest_exit_panic
+ add sp, sp, #16
+
+ /*
+ * The panic may not be clean if the exception is taken before the host
+ * context has been saved by __host_exit or after the hyp context has
+ * been partially clobbered by __host_enter.
+ */
+ b hyp_panic
+.endm
+
+.macro invalid_host_el1_vect
+ .align 7
+ mov x0, xzr /* restore_host = false */
+ mrs x1, spsr_el2
+ mrs x2, elr_el2
+ mrs x3, par_el1
+ b __hyp_do_panic
+.endm
+
+/*
+ * The host vector does not use an ESB instruction in order to avoid consuming
+ * SErrors that should only be consumed by the host. Guest entry is deferred by
+ * __guest_enter if there are any pending asynchronous exceptions so hyp will
+ * always return to the host without having consumerd host SErrors.
+ *
+ * CONFIG_KVM_INDIRECT_VECTORS is not applied to the host vectors because the
+ * host knows about the EL2 vectors already, and there is no point in hiding
+ * them.
+ */
+ .align 11
+SYM_CODE_START(__kvm_hyp_host_vector)
+ invalid_host_el2_vect // Synchronous EL2t
+ invalid_host_el2_vect // IRQ EL2t
+ invalid_host_el2_vect // FIQ EL2t
+ invalid_host_el2_vect // Error EL2t
+
+ invalid_host_el2_vect // Synchronous EL2h
+ invalid_host_el2_vect // IRQ EL2h
+ invalid_host_el2_vect // FIQ EL2h
+ invalid_host_el2_vect // Error EL2h
+
+ host_el1_sync_vect // Synchronous 64-bit EL1
+ invalid_host_el1_vect // IRQ 64-bit EL1
+ invalid_host_el1_vect // FIQ 64-bit EL1
+ invalid_host_el1_vect // Error 64-bit EL1
+
+ invalid_host_el1_vect // Synchronous 32-bit EL1
+ invalid_host_el1_vect // IRQ 32-bit EL1
+ invalid_host_el1_vect // FIQ 32-bit EL1
+ invalid_host_el1_vect // Error 32-bit EL1
+SYM_CODE_END(__kvm_hyp_host_vector)
diff --git a/arch/arm64/kvm/hyp/nvhe/hyp-init.S b/arch/arm64/kvm/hyp/nvhe/hyp-init.S
index d9434e90c06d..47224dc62c51 100644
--- a/arch/arm64/kvm/hyp/nvhe/hyp-init.S
+++ b/arch/arm64/kvm/hyp/nvhe/hyp-init.S
@@ -4,11 +4,13 @@
* Author: Marc Zyngier <marc.zyngier@arm.com>
*/
+#include <linux/arm-smccc.h>
#include <linux/linkage.h>
#include <asm/alternative.h>
#include <asm/assembler.h>
#include <asm/kvm_arm.h>
+#include <asm/kvm_asm.h>
#include <asm/kvm_mmu.h>
#include <asm/pgtable-hwdef.h>
#include <asm/sysreg.h>
@@ -44,27 +46,37 @@ __invalid:
b .
/*
- * x0: HYP pgd
- * x1: HYP stack
- * x2: HYP vectors
- * x3: per-CPU offset
+ * x0: SMCCC function ID
+ * x1: HYP pgd
+ * x2: per-CPU offset
+ * x3: HYP stack
+ * x4: HYP vectors
*/
__do_hyp_init:
/* Check for a stub HVC call */
cmp x0, #HVC_STUB_HCALL_NR
b.lo __kvm_handle_stub_hvc
- phys_to_ttbr x4, x0
+ /* Set tpidr_el2 for use by HYP to free a register */
+ msr tpidr_el2, x2
+
+ mov x2, #KVM_HOST_SMCCC_FUNC(__kvm_hyp_init)
+ cmp x0, x2
+ b.eq 1f
+ mov x0, #SMCCC_RET_NOT_SUPPORTED
+ eret
+
+1: phys_to_ttbr x0, x1
alternative_if ARM64_HAS_CNP
- orr x4, x4, #TTBR_CNP_BIT
+ orr x0, x0, #TTBR_CNP_BIT
alternative_else_nop_endif
- msr ttbr0_el2, x4
+ msr ttbr0_el2, x0
- mrs x4, tcr_el1
- mov_q x5, TCR_EL2_MASK
- and x4, x4, x5
- mov x5, #TCR_EL2_RES1
- orr x4, x4, x5
+ mrs x0, tcr_el1
+ mov_q x1, TCR_EL2_MASK
+ and x0, x0, x1
+ mov x1, #TCR_EL2_RES1
+ orr x0, x0, x1
/*
* The ID map may be configured to use an extended virtual address
@@ -80,18 +92,18 @@ alternative_else_nop_endif
*
* So use the same T0SZ value we use for the ID map.
*/
- ldr_l x5, idmap_t0sz
- bfi x4, x5, TCR_T0SZ_OFFSET, TCR_TxSZ_WIDTH
+ ldr_l x1, idmap_t0sz
+ bfi x0, x1, TCR_T0SZ_OFFSET, TCR_TxSZ_WIDTH
/*
* Set the PS bits in TCR_EL2.
*/
- tcr_compute_pa_size x4, #TCR_EL2_PS_SHIFT, x5, x6
+ tcr_compute_pa_size x0, #TCR_EL2_PS_SHIFT, x1, x2
- msr tcr_el2, x4
+ msr tcr_el2, x0
- mrs x4, mair_el1
- msr mair_el2, x4
+ mrs x0, mair_el1
+ msr mair_el2, x0
isb
/* Invalidate the stale TLBs from Bootloader */
@@ -103,25 +115,22 @@ alternative_else_nop_endif
* as well as the EE bit on BE. Drop the A flag since the compiler
* is allowed to generate unaligned accesses.
*/
- mov_q x4, (SCTLR_EL2_RES1 | (SCTLR_ELx_FLAGS & ~SCTLR_ELx_A))
-CPU_BE( orr x4, x4, #SCTLR_ELx_EE)
+ mov_q x0, (SCTLR_EL2_RES1 | (SCTLR_ELx_FLAGS & ~SCTLR_ELx_A))
+CPU_BE( orr x0, x0, #SCTLR_ELx_EE)
alternative_if ARM64_HAS_ADDRESS_AUTH
- mov_q x5, (SCTLR_ELx_ENIA | SCTLR_ELx_ENIB | \
+ mov_q x1, (SCTLR_ELx_ENIA | SCTLR_ELx_ENIB | \
SCTLR_ELx_ENDA | SCTLR_ELx_ENDB)
- orr x4, x4, x5
+ orr x0, x0, x1
alternative_else_nop_endif
- msr sctlr_el2, x4
+ msr sctlr_el2, x0
isb
/* Set the stack and new vectors */
- kern_hyp_va x1
- mov sp, x1
- msr vbar_el2, x2
-
- /* Set tpidr_el2 for use by HYP */
- msr tpidr_el2, x3
+ mov sp, x3
+ msr vbar_el2, x4
/* Hello, World! */
+ mov x0, #SMCCC_RET_SUCCESS
eret
SYM_CODE_END(__kvm_hyp_init)
diff --git a/arch/arm64/kvm/hyp/nvhe/hyp-main.c b/arch/arm64/kvm/hyp/nvhe/hyp-main.c
new file mode 100644
index 000000000000..e2eafe2c93af
--- /dev/null
+++ b/arch/arm64/kvm/hyp/nvhe/hyp-main.c
@@ -0,0 +1,117 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Copyright (C) 2020 - Google Inc
+ * Author: Andrew Scull <ascull@google.com>
+ */
+
+#include <hyp/switch.h>
+
+#include <asm/kvm_asm.h>
+#include <asm/kvm_emulate.h>
+#include <asm/kvm_host.h>
+#include <asm/kvm_hyp.h>
+#include <asm/kvm_mmu.h>
+
+#include <kvm/arm_hypercalls.h>
+
+static void handle_host_hcall(unsigned long func_id,
+ struct kvm_cpu_context *host_ctxt)
+{
+ unsigned long ret = 0;
+
+ switch (func_id) {
+ case KVM_HOST_SMCCC_FUNC(__kvm_vcpu_run): {
+ unsigned long r1 = host_ctxt->regs.regs[1];
+ struct kvm_vcpu *vcpu = (struct kvm_vcpu *)r1;
+
+ ret = __kvm_vcpu_run(kern_hyp_va(vcpu));
+ break;
+ }
+ case KVM_HOST_SMCCC_FUNC(__kvm_flush_vm_context):
+ __kvm_flush_vm_context();
+ break;
+ case KVM_HOST_SMCCC_FUNC(__kvm_tlb_flush_vmid_ipa): {
+ unsigned long r1 = host_ctxt->regs.regs[1];
+ struct kvm_s2_mmu *mmu = (struct kvm_s2_mmu *)r1;
+ phys_addr_t ipa = host_ctxt->regs.regs[2];
+ int level = host_ctxt->regs.regs[3];
+
+ __kvm_tlb_flush_vmid_ipa(kern_hyp_va(mmu), ipa, level);
+ break;
+ }
+ case KVM_HOST_SMCCC_FUNC(__kvm_tlb_flush_vmid): {
+ unsigned long r1 = host_ctxt->regs.regs[1];
+ struct kvm_s2_mmu *mmu = (struct kvm_s2_mmu *)r1;
+
+ __kvm_tlb_flush_vmid(kern_hyp_va(mmu));
+ break;
+ }
+ case KVM_HOST_SMCCC_FUNC(__kvm_tlb_flush_local_vmid): {
+ unsigned long r1 = host_ctxt->regs.regs[1];
+ struct kvm_s2_mmu *mmu = (struct kvm_s2_mmu *)r1;
+
+ __kvm_tlb_flush_local_vmid(kern_hyp_va(mmu));
+ break;
+ }
+ case KVM_HOST_SMCCC_FUNC(__kvm_timer_set_cntvoff): {
+ u64 cntvoff = host_ctxt->regs.regs[1];
+
+ __kvm_timer_set_cntvoff(cntvoff);
+ break;
+ }
+ case KVM_HOST_SMCCC_FUNC(__kvm_enable_ssbs):
+ __kvm_enable_ssbs();
+ break;
+ case KVM_HOST_SMCCC_FUNC(__vgic_v3_get_ich_vtr_el2):
+ ret = __vgic_v3_get_ich_vtr_el2();
+ break;
+ case KVM_HOST_SMCCC_FUNC(__vgic_v3_read_vmcr):
+ ret = __vgic_v3_read_vmcr();
+ break;
+ case KVM_HOST_SMCCC_FUNC(__vgic_v3_write_vmcr): {
+ u32 vmcr = host_ctxt->regs.regs[1];
+
+ __vgic_v3_write_vmcr(vmcr);
+ break;
+ }
+ case KVM_HOST_SMCCC_FUNC(__vgic_v3_init_lrs):
+ __vgic_v3_init_lrs();
+ break;
+ case KVM_HOST_SMCCC_FUNC(__kvm_get_mdcr_el2):
+ ret = __kvm_get_mdcr_el2();
+ break;
+ case KVM_HOST_SMCCC_FUNC(__vgic_v3_save_aprs): {
+ unsigned long r1 = host_ctxt->regs.regs[1];
+ struct vgic_v3_cpu_if *cpu_if = (struct vgic_v3_cpu_if *)r1;
+
+ __vgic_v3_save_aprs(kern_hyp_va(cpu_if));
+ break;
+ }
+ case KVM_HOST_SMCCC_FUNC(__vgic_v3_restore_aprs): {
+ unsigned long r1 = host_ctxt->regs.regs[1];
+ struct vgic_v3_cpu_if *cpu_if = (struct vgic_v3_cpu_if *)r1;
+
+ __vgic_v3_restore_aprs(kern_hyp_va(cpu_if));
+ break;
+ }
+ default:
+ /* Invalid host HVC. */
+ host_ctxt->regs.regs[0] = SMCCC_RET_NOT_SUPPORTED;
+ return;
+ }
+
+ host_ctxt->regs.regs[0] = SMCCC_RET_SUCCESS;
+ host_ctxt->regs.regs[1] = ret;
+}
+
+void handle_trap(struct kvm_cpu_context *host_ctxt)
+{
+ u64 esr = read_sysreg_el2(SYS_ESR);
+ unsigned long func_id;
+
+ if (ESR_ELx_EC(esr) != ESR_ELx_EC_HVC64)
+ hyp_panic();
+
+ func_id = host_ctxt->regs.regs[0];
+ handle_host_hcall(func_id, host_ctxt);
+}
diff --git a/arch/arm64/kvm/hyp/nvhe/hyp.lds.S b/arch/arm64/kvm/hyp/nvhe/hyp.lds.S
new file mode 100644
index 000000000000..bb2d986ff696
--- /dev/null
+++ b/arch/arm64/kvm/hyp/nvhe/hyp.lds.S
@@ -0,0 +1,19 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * Copyright (C) 2020 Google LLC.
+ * Written by David Brazdil <dbrazdil@google.com>
+ *
+ * Linker script used for partial linking of nVHE EL2 object files.
+ */
+
+#include <asm/hyp_image.h>
+#include <asm-generic/vmlinux.lds.h>
+#include <asm/cache.h>
+#include <asm/memory.h>
+
+SECTIONS {
+ HYP_SECTION(.text)
+ HYP_SECTION_NAME(.data..percpu) : {
+ PERCPU_INPUT(L1_CACHE_BYTES)
+ }
+}
diff --git a/arch/arm64/kvm/hyp/nvhe/switch.c b/arch/arm64/kvm/hyp/nvhe/switch.c
index 8d3dd4f47924..a457a0306e03 100644
--- a/arch/arm64/kvm/hyp/nvhe/switch.c
+++ b/arch/arm64/kvm/hyp/nvhe/switch.c
@@ -27,6 +27,11 @@
#include <asm/processor.h>
#include <asm/thread_info.h>
+/* Non-VHE specific context */
+DEFINE_PER_CPU(struct kvm_host_data, kvm_host_data);
+DEFINE_PER_CPU(struct kvm_cpu_context, kvm_hyp_ctxt);
+DEFINE_PER_CPU(unsigned long, kvm_hyp_vector);
+
static void __activate_traps(struct kvm_vcpu *vcpu)
{
u64 val;
@@ -42,6 +47,7 @@ static void __activate_traps(struct kvm_vcpu *vcpu)
}
write_sysreg(val, cptr_el2);
+ write_sysreg(__this_cpu_read(kvm_hyp_vector), vbar_el2);
if (cpus_have_final_cap(ARM64_WORKAROUND_SPECULATIVE_AT)) {
struct kvm_cpu_context *ctxt = &vcpu->arch.ctxt;
@@ -60,6 +66,7 @@ static void __activate_traps(struct kvm_vcpu *vcpu)
static void __deactivate_traps(struct kvm_vcpu *vcpu)
{
+ extern char __kvm_hyp_host_vector[];
u64 mdcr_el2;
___deactivate_traps(vcpu);
@@ -91,9 +98,10 @@ static void __deactivate_traps(struct kvm_vcpu *vcpu)
write_sysreg(mdcr_el2, mdcr_el2);
write_sysreg(HCR_HOST_NVHE_FLAGS, hcr_el2);
write_sysreg(CPTR_EL2_DEFAULT, cptr_el2);
+ write_sysreg(__kvm_hyp_host_vector, vbar_el2);
}
-static void __deactivate_vm(struct kvm_vcpu *vcpu)
+static void __load_host_stage2(void)
{
write_sysreg(0, vttbr_el2);
}
@@ -173,9 +181,7 @@ int __kvm_vcpu_run(struct kvm_vcpu *vcpu)
pmr_sync();
}
- vcpu = kern_hyp_va(vcpu);
-
- host_ctxt = &__hyp_this_cpu_ptr(kvm_host_data)->host_ctxt;
+ host_ctxt = &this_cpu_ptr(&kvm_host_data)->host_ctxt;
host_ctxt->__hyp_running_vcpu = vcpu;
guest_ctxt = &vcpu->arch.ctxt;
@@ -194,7 +200,7 @@ int __kvm_vcpu_run(struct kvm_vcpu *vcpu)
__sysreg32_restore_state(vcpu);
__sysreg_restore_state_nvhe(guest_ctxt);
- __activate_vm(kern_hyp_va(vcpu->arch.hw_mmu));
+ __load_guest_stage2(kern_hyp_va(vcpu->arch.hw_mmu));
__activate_traps(vcpu);
__hyp_vgic_restore_state(vcpu);
@@ -204,7 +210,7 @@ int __kvm_vcpu_run(struct kvm_vcpu *vcpu)
do {
/* Jump in the fire! */
- exit_code = __guest_enter(vcpu, host_ctxt);
+ exit_code = __guest_enter(vcpu);
/* And we're baaack! */
} while (fixup_guest_exit(vcpu, &exit_code));
@@ -215,7 +221,7 @@ int __kvm_vcpu_run(struct kvm_vcpu *vcpu)
__hyp_vgic_save_state(vcpu);
__deactivate_traps(vcpu);
- __deactivate_vm(vcpu);
+ __load_host_stage2();
__sysreg_restore_state_nvhe(host_ctxt);
@@ -235,35 +241,31 @@ int __kvm_vcpu_run(struct kvm_vcpu *vcpu)
if (system_uses_irq_prio_masking())
gic_write_pmr(GIC_PRIO_IRQOFF);
+ host_ctxt->__hyp_running_vcpu = NULL;
+
return exit_code;
}
-void __noreturn hyp_panic(struct kvm_cpu_context *host_ctxt)
+void __noreturn hyp_panic(void)
{
u64 spsr = read_sysreg_el2(SYS_SPSR);
u64 elr = read_sysreg_el2(SYS_ELR);
u64 par = read_sysreg(par_el1);
- struct kvm_vcpu *vcpu = host_ctxt->__hyp_running_vcpu;
- unsigned long str_va;
+ bool restore_host = true;
+ struct kvm_cpu_context *host_ctxt;
+ struct kvm_vcpu *vcpu;
- if (read_sysreg(vttbr_el2)) {
+ host_ctxt = &this_cpu_ptr(&kvm_host_data)->host_ctxt;
+ vcpu = host_ctxt->__hyp_running_vcpu;
+
+ if (vcpu) {
__timer_disable_traps(vcpu);
__deactivate_traps(vcpu);
- __deactivate_vm(vcpu);
+ __load_host_stage2();
__sysreg_restore_state_nvhe(host_ctxt);
}
- /*
- * Force the panic string to be loaded from the literal pool,
- * making sure it is a kernel address and not a PC-relative
- * reference.
- */
- asm volatile("ldr %0, =%1" : "=r" (str_va) : "S" (__hyp_panic_string));
-
- __hyp_do_panic(str_va,
- spsr, elr,
- read_sysreg(esr_el2), read_sysreg_el2(SYS_FAR),
- read_sysreg(hpfar_el2), par, vcpu);
+ __hyp_do_panic(restore_host, spsr, elr, par);
unreachable();
}
diff --git a/arch/arm64/kvm/hyp/nvhe/tlb.c b/arch/arm64/kvm/hyp/nvhe/tlb.c
index b15d65a42042..39ca71ab8866 100644
--- a/arch/arm64/kvm/hyp/nvhe/tlb.c
+++ b/arch/arm64/kvm/hyp/nvhe/tlb.c
@@ -61,7 +61,6 @@ void __kvm_tlb_flush_vmid_ipa(struct kvm_s2_mmu *mmu,
dsb(ishst);
/* Switch to requested VMID */
- mmu = kern_hyp_va(mmu);
__tlb_switch_to_guest(mmu, &cxt);
/*
@@ -115,7 +114,6 @@ void __kvm_tlb_flush_vmid(struct kvm_s2_mmu *mmu)
dsb(ishst);
/* Switch to requested VMID */
- mmu = kern_hyp_va(mmu);
__tlb_switch_to_guest(mmu, &cxt);
__tlbi(vmalls12e1is);
diff --git a/arch/arm64/kvm/hyp/pgtable.c b/arch/arm64/kvm/hyp/pgtable.c
new file mode 100644
index 000000000000..0cdf6e461cbd
--- /dev/null
+++ b/arch/arm64/kvm/hyp/pgtable.c
@@ -0,0 +1,892 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Stand-alone page-table allocator for hyp stage-1 and guest stage-2.
+ * No bombay mix was harmed in the writing of this file.
+ *
+ * Copyright (C) 2020 Google LLC
+ * Author: Will Deacon <will@kernel.org>
+ */
+
+#include <linux/bitfield.h>
+#include <asm/kvm_pgtable.h>
+
+#define KVM_PGTABLE_MAX_LEVELS 4U
+
+#define KVM_PTE_VALID BIT(0)
+
+#define KVM_PTE_TYPE BIT(1)
+#define KVM_PTE_TYPE_BLOCK 0
+#define KVM_PTE_TYPE_PAGE 1
+#define KVM_PTE_TYPE_TABLE 1
+
+#define KVM_PTE_ADDR_MASK GENMASK(47, PAGE_SHIFT)
+#define KVM_PTE_ADDR_51_48 GENMASK(15, 12)
+
+#define KVM_PTE_LEAF_ATTR_LO GENMASK(11, 2)
+
+#define KVM_PTE_LEAF_ATTR_LO_S1_ATTRIDX GENMASK(4, 2)
+#define KVM_PTE_LEAF_ATTR_LO_S1_AP GENMASK(7, 6)
+#define KVM_PTE_LEAF_ATTR_LO_S1_AP_RO 3
+#define KVM_PTE_LEAF_ATTR_LO_S1_AP_RW 1
+#define KVM_PTE_LEAF_ATTR_LO_S1_SH GENMASK(9, 8)
+#define KVM_PTE_LEAF_ATTR_LO_S1_SH_IS 3
+#define KVM_PTE_LEAF_ATTR_LO_S1_AF BIT(10)
+
+#define KVM_PTE_LEAF_ATTR_LO_S2_MEMATTR GENMASK(5, 2)
+#define KVM_PTE_LEAF_ATTR_LO_S2_S2AP_R BIT(6)
+#define KVM_PTE_LEAF_ATTR_LO_S2_S2AP_W BIT(7)
+#define KVM_PTE_LEAF_ATTR_LO_S2_SH GENMASK(9, 8)
+#define KVM_PTE_LEAF_ATTR_LO_S2_SH_IS 3
+#define KVM_PTE_LEAF_ATTR_LO_S2_AF BIT(10)
+
+#define KVM_PTE_LEAF_ATTR_HI GENMASK(63, 51)
+
+#define KVM_PTE_LEAF_ATTR_HI_S1_XN BIT(54)
+
+#define KVM_PTE_LEAF_ATTR_HI_S2_XN BIT(54)
+
+struct kvm_pgtable_walk_data {
+ struct kvm_pgtable *pgt;
+ struct kvm_pgtable_walker *walker;
+
+ u64 addr;
+ u64 end;
+};
+
+static u64 kvm_granule_shift(u32 level)
+{
+ /* Assumes KVM_PGTABLE_MAX_LEVELS is 4 */
+ return ARM64_HW_PGTABLE_LEVEL_SHIFT(level);
+}
+
+static u64 kvm_granule_size(u32 level)
+{
+ return BIT(kvm_granule_shift(level));
+}
+
+static bool kvm_block_mapping_supported(u64 addr, u64 end, u64 phys, u32 level)
+{
+ u64 granule = kvm_granule_size(level);
+
+ /*
+ * Reject invalid block mappings and don't bother with 4TB mappings for
+ * 52-bit PAs.
+ */
+ if (level == 0 || (PAGE_SIZE != SZ_4K && level == 1))
+ return false;
+
+ if (granule > (end - addr))
+ return false;
+
+ return IS_ALIGNED(addr, granule) && IS_ALIGNED(phys, granule);
+}
+
+static u32 kvm_pgtable_idx(struct kvm_pgtable_walk_data *data, u32 level)
+{
+ u64 shift = kvm_granule_shift(level);
+ u64 mask = BIT(PAGE_SHIFT - 3) - 1;
+
+ return (data->addr >> shift) & mask;
+}
+
+static u32 __kvm_pgd_page_idx(struct kvm_pgtable *pgt, u64 addr)
+{
+ u64 shift = kvm_granule_shift(pgt->start_level - 1); /* May underflow */
+ u64 mask = BIT(pgt->ia_bits) - 1;
+
+ return (addr & mask) >> shift;
+}
+
+static u32 kvm_pgd_page_idx(struct kvm_pgtable_walk_data *data)
+{
+ return __kvm_pgd_page_idx(data->pgt, data->addr);
+}
+
+static u32 kvm_pgd_pages(u32 ia_bits, u32 start_level)
+{
+ struct kvm_pgtable pgt = {
+ .ia_bits = ia_bits,
+ .start_level = start_level,
+ };
+
+ return __kvm_pgd_page_idx(&pgt, -1ULL) + 1;
+}
+
+static bool kvm_pte_valid(kvm_pte_t pte)
+{
+ return pte & KVM_PTE_VALID;
+}
+
+static bool kvm_pte_table(kvm_pte_t pte, u32 level)
+{
+ if (level == KVM_PGTABLE_MAX_LEVELS - 1)
+ return false;
+
+ if (!kvm_pte_valid(pte))
+ return false;
+
+ return FIELD_GET(KVM_PTE_TYPE, pte) == KVM_PTE_TYPE_TABLE;
+}
+
+static u64 kvm_pte_to_phys(kvm_pte_t pte)
+{
+ u64 pa = pte & KVM_PTE_ADDR_MASK;
+
+ if (PAGE_SHIFT == 16)
+ pa |= FIELD_GET(KVM_PTE_ADDR_51_48, pte) << 48;
+
+ return pa;
+}
+
+static kvm_pte_t kvm_phys_to_pte(u64 pa)
+{
+ kvm_pte_t pte = pa & KVM_PTE_ADDR_MASK;
+
+ if (PAGE_SHIFT == 16)
+ pte |= FIELD_PREP(KVM_PTE_ADDR_51_48, pa >> 48);
+
+ return pte;
+}
+
+static kvm_pte_t *kvm_pte_follow(kvm_pte_t pte)
+{
+ return __va(kvm_pte_to_phys(pte));
+}
+
+static void kvm_set_invalid_pte(kvm_pte_t *ptep)
+{
+ kvm_pte_t pte = *ptep;
+ WRITE_ONCE(*ptep, pte & ~KVM_PTE_VALID);
+}
+
+static void kvm_set_table_pte(kvm_pte_t *ptep, kvm_pte_t *childp)
+{
+ kvm_pte_t old = *ptep, pte = kvm_phys_to_pte(__pa(childp));
+
+ pte |= FIELD_PREP(KVM_PTE_TYPE, KVM_PTE_TYPE_TABLE);
+ pte |= KVM_PTE_VALID;
+
+ WARN_ON(kvm_pte_valid(old));
+ smp_store_release(ptep, pte);
+}
+
+static bool kvm_set_valid_leaf_pte(kvm_pte_t *ptep, u64 pa, kvm_pte_t attr,
+ u32 level)
+{
+ kvm_pte_t old = *ptep, pte = kvm_phys_to_pte(pa);
+ u64 type = (level == KVM_PGTABLE_MAX_LEVELS - 1) ? KVM_PTE_TYPE_PAGE :
+ KVM_PTE_TYPE_BLOCK;
+
+ pte |= attr & (KVM_PTE_LEAF_ATTR_LO | KVM_PTE_LEAF_ATTR_HI);
+ pte |= FIELD_PREP(KVM_PTE_TYPE, type);
+ pte |= KVM_PTE_VALID;
+
+ /* Tolerate KVM recreating the exact same mapping. */
+ if (kvm_pte_valid(old))
+ return old == pte;
+
+ smp_store_release(ptep, pte);
+ return true;
+}
+
+static int kvm_pgtable_visitor_cb(struct kvm_pgtable_walk_data *data, u64 addr,
+ u32 level, kvm_pte_t *ptep,
+ enum kvm_pgtable_walk_flags flag)
+{
+ struct kvm_pgtable_walker *walker = data->walker;
+ return walker->cb(addr, data->end, level, ptep, flag, walker->arg);
+}
+
+static int __kvm_pgtable_walk(struct kvm_pgtable_walk_data *data,
+ kvm_pte_t *pgtable, u32 level);
+
+static inline int __kvm_pgtable_visit(struct kvm_pgtable_walk_data *data,
+ kvm_pte_t *ptep, u32 level)
+{
+ int ret = 0;
+ u64 addr = data->addr;
+ kvm_pte_t *childp, pte = *ptep;
+ bool table = kvm_pte_table(pte, level);
+ enum kvm_pgtable_walk_flags flags = data->walker->flags;
+
+ if (table && (flags & KVM_PGTABLE_WALK_TABLE_PRE)) {
+ ret = kvm_pgtable_visitor_cb(data, addr, level, ptep,
+ KVM_PGTABLE_WALK_TABLE_PRE);
+ }
+
+ if (!table && (flags & KVM_PGTABLE_WALK_LEAF)) {
+ ret = kvm_pgtable_visitor_cb(data, addr, level, ptep,
+ KVM_PGTABLE_WALK_LEAF);
+ pte = *ptep;
+ table = kvm_pte_table(pte, level);
+ }
+
+ if (ret)
+ goto out;
+
+ if (!table) {
+ data->addr += kvm_granule_size(level);
+ goto out;
+ }
+
+ childp = kvm_pte_follow(pte);
+ ret = __kvm_pgtable_walk(data, childp, level + 1);
+ if (ret)
+ goto out;
+
+ if (flags & KVM_PGTABLE_WALK_TABLE_POST) {
+ ret = kvm_pgtable_visitor_cb(data, addr, level, ptep,
+ KVM_PGTABLE_WALK_TABLE_POST);
+ }
+
+out:
+ return ret;
+}
+
+static int __kvm_pgtable_walk(struct kvm_pgtable_walk_data *data,
+ kvm_pte_t *pgtable, u32 level)
+{
+ u32 idx;
+ int ret = 0;
+
+ if (WARN_ON_ONCE(level >= KVM_PGTABLE_MAX_LEVELS))
+ return -EINVAL;
+
+ for (idx = kvm_pgtable_idx(data, level); idx < PTRS_PER_PTE; ++idx) {
+ kvm_pte_t *ptep = &pgtable[idx];
+
+ if (data->addr >= data->end)
+ break;
+
+ ret = __kvm_pgtable_visit(data, ptep, level);
+ if (ret)
+ break;
+ }
+
+ return ret;
+}
+
+static int _kvm_pgtable_walk(struct kvm_pgtable_walk_data *data)
+{
+ u32 idx;
+ int ret = 0;
+ struct kvm_pgtable *pgt = data->pgt;
+ u64 limit = BIT(pgt->ia_bits);
+
+ if (data->addr > limit || data->end > limit)
+ return -ERANGE;
+
+ if (!pgt->pgd)
+ return -EINVAL;
+
+ for (idx = kvm_pgd_page_idx(data); data->addr < data->end; ++idx) {
+ kvm_pte_t *ptep = &pgt->pgd[idx * PTRS_PER_PTE];
+
+ ret = __kvm_pgtable_walk(data, ptep, pgt->start_level);
+ if (ret)
+ break;
+ }
+
+ return ret;
+}
+
+int kvm_pgtable_walk(struct kvm_pgtable *pgt, u64 addr, u64 size,
+ struct kvm_pgtable_walker *walker)
+{
+ struct kvm_pgtable_walk_data walk_data = {
+ .pgt = pgt,
+ .addr = ALIGN_DOWN(addr, PAGE_SIZE),
+ .end = PAGE_ALIGN(walk_data.addr + size),
+ .walker = walker,
+ };
+
+ return _kvm_pgtable_walk(&walk_data);
+}
+
+struct hyp_map_data {
+ u64 phys;
+ kvm_pte_t attr;
+};
+
+static int hyp_map_set_prot_attr(enum kvm_pgtable_prot prot,
+ struct hyp_map_data *data)
+{
+ bool device = prot & KVM_PGTABLE_PROT_DEVICE;
+ u32 mtype = device ? MT_DEVICE_nGnRE : MT_NORMAL;
+ kvm_pte_t attr = FIELD_PREP(KVM_PTE_LEAF_ATTR_LO_S1_ATTRIDX, mtype);
+ u32 sh = KVM_PTE_LEAF_ATTR_LO_S1_SH_IS;
+ u32 ap = (prot & KVM_PGTABLE_PROT_W) ? KVM_PTE_LEAF_ATTR_LO_S1_AP_RW :
+ KVM_PTE_LEAF_ATTR_LO_S1_AP_RO;
+
+ if (!(prot & KVM_PGTABLE_PROT_R))
+ return -EINVAL;
+
+ if (prot & KVM_PGTABLE_PROT_X) {
+ if (prot & KVM_PGTABLE_PROT_W)
+ return -EINVAL;
+
+ if (device)
+ return -EINVAL;
+ } else {
+ attr |= KVM_PTE_LEAF_ATTR_HI_S1_XN;
+ }
+
+ attr |= FIELD_PREP(KVM_PTE_LEAF_ATTR_LO_S1_AP, ap);
+ attr |= FIELD_PREP(KVM_PTE_LEAF_ATTR_LO_S1_SH, sh);
+ attr |= KVM_PTE_LEAF_ATTR_LO_S1_AF;
+ data->attr = attr;
+ return 0;
+}
+
+static bool hyp_map_walker_try_leaf(u64 addr, u64 end, u32 level,
+ kvm_pte_t *ptep, struct hyp_map_data *data)
+{
+ u64 granule = kvm_granule_size(level), phys = data->phys;
+
+ if (!kvm_block_mapping_supported(addr, end, phys, level))
+ return false;
+
+ WARN_ON(!kvm_set_valid_leaf_pte(ptep, phys, data->attr, level));
+ data->phys += granule;
+ return true;
+}
+
+static int hyp_map_walker(u64 addr, u64 end, u32 level, kvm_pte_t *ptep,
+ enum kvm_pgtable_walk_flags flag, void * const arg)
+{
+ kvm_pte_t *childp;
+
+ if (hyp_map_walker_try_leaf(addr, end, level, ptep, arg))
+ return 0;
+
+ if (WARN_ON(level == KVM_PGTABLE_MAX_LEVELS - 1))
+ return -EINVAL;
+
+ childp = (kvm_pte_t *)get_zeroed_page(GFP_KERNEL);
+ if (!childp)
+ return -ENOMEM;
+
+ kvm_set_table_pte(ptep, childp);
+ return 0;
+}
+
+int kvm_pgtable_hyp_map(struct kvm_pgtable *pgt, u64 addr, u64 size, u64 phys,
+ enum kvm_pgtable_prot prot)
+{
+ int ret;
+ struct hyp_map_data map_data = {
+ .phys = ALIGN_DOWN(phys, PAGE_SIZE),
+ };
+ struct kvm_pgtable_walker walker = {
+ .cb = hyp_map_walker,
+ .flags = KVM_PGTABLE_WALK_LEAF,
+ .arg = &map_data,
+ };
+
+ ret = hyp_map_set_prot_attr(prot, &map_data);
+ if (ret)
+ return ret;
+
+ ret = kvm_pgtable_walk(pgt, addr, size, &walker);
+ dsb(ishst);
+ isb();
+ return ret;
+}
+
+int kvm_pgtable_hyp_init(struct kvm_pgtable *pgt, u32 va_bits)
+{
+ u64 levels = ARM64_HW_PGTABLE_LEVELS(va_bits);
+
+ pgt->pgd = (kvm_pte_t *)get_zeroed_page(GFP_KERNEL);
+ if (!pgt->pgd)
+ return -ENOMEM;
+
+ pgt->ia_bits = va_bits;
+ pgt->start_level = KVM_PGTABLE_MAX_LEVELS - levels;
+ pgt->mmu = NULL;
+ return 0;
+}
+
+static int hyp_free_walker(u64 addr, u64 end, u32 level, kvm_pte_t *ptep,
+ enum kvm_pgtable_walk_flags flag, void * const arg)
+{
+ free_page((unsigned long)kvm_pte_follow(*ptep));
+ return 0;
+}
+
+void kvm_pgtable_hyp_destroy(struct kvm_pgtable *pgt)
+{
+ struct kvm_pgtable_walker walker = {
+ .cb = hyp_free_walker,
+ .flags = KVM_PGTABLE_WALK_TABLE_POST,
+ };
+
+ WARN_ON(kvm_pgtable_walk(pgt, 0, BIT(pgt->ia_bits), &walker));
+ free_page((unsigned long)pgt->pgd);
+ pgt->pgd = NULL;
+}
+
+struct stage2_map_data {
+ u64 phys;
+ kvm_pte_t attr;
+
+ kvm_pte_t *anchor;
+
+ struct kvm_s2_mmu *mmu;
+ struct kvm_mmu_memory_cache *memcache;
+};
+
+static int stage2_map_set_prot_attr(enum kvm_pgtable_prot prot,
+ struct stage2_map_data *data)
+{
+ bool device = prot & KVM_PGTABLE_PROT_DEVICE;
+ kvm_pte_t attr = device ? PAGE_S2_MEMATTR(DEVICE_nGnRE) :
+ PAGE_S2_MEMATTR(NORMAL);
+ u32 sh = KVM_PTE_LEAF_ATTR_LO_S2_SH_IS;
+
+ if (!(prot & KVM_PGTABLE_PROT_X))
+ attr |= KVM_PTE_LEAF_ATTR_HI_S2_XN;
+ else if (device)
+ return -EINVAL;
+
+ if (prot & KVM_PGTABLE_PROT_R)
+ attr |= KVM_PTE_LEAF_ATTR_LO_S2_S2AP_R;
+
+ if (prot & KVM_PGTABLE_PROT_W)
+ attr |= KVM_PTE_LEAF_ATTR_LO_S2_S2AP_W;
+
+ attr |= FIELD_PREP(KVM_PTE_LEAF_ATTR_LO_S2_SH, sh);
+ attr |= KVM_PTE_LEAF_ATTR_LO_S2_AF;
+ data->attr = attr;
+ return 0;
+}
+
+static bool stage2_map_walker_try_leaf(u64 addr, u64 end, u32 level,
+ kvm_pte_t *ptep,
+ struct stage2_map_data *data)
+{
+ u64 granule = kvm_granule_size(level), phys = data->phys;
+
+ if (!kvm_block_mapping_supported(addr, end, phys, level))
+ return false;
+
+ if (kvm_set_valid_leaf_pte(ptep, phys, data->attr, level))
+ goto out;
+
+ /* There's an existing valid leaf entry, so perform break-before-make */
+ kvm_set_invalid_pte(ptep);
+ kvm_call_hyp(__kvm_tlb_flush_vmid_ipa, data->mmu, addr, level);
+ kvm_set_valid_leaf_pte(ptep, phys, data->attr, level);
+out:
+ data->phys += granule;
+ return true;
+}
+
+static int stage2_map_walk_table_pre(u64 addr, u64 end, u32 level,
+ kvm_pte_t *ptep,
+ struct stage2_map_data *data)
+{
+ if (data->anchor)
+ return 0;
+
+ if (!kvm_block_mapping_supported(addr, end, data->phys, level))
+ return 0;
+
+ kvm_set_invalid_pte(ptep);
+ kvm_call_hyp(__kvm_tlb_flush_vmid_ipa, data->mmu, addr, 0);
+ data->anchor = ptep;
+ return 0;
+}
+
+static int stage2_map_walk_leaf(u64 addr, u64 end, u32 level, kvm_pte_t *ptep,
+ struct stage2_map_data *data)
+{
+ kvm_pte_t *childp, pte = *ptep;
+ struct page *page = virt_to_page(ptep);
+
+ if (data->anchor) {
+ if (kvm_pte_valid(pte))
+ put_page(page);
+
+ return 0;
+ }
+
+ if (stage2_map_walker_try_leaf(addr, end, level, ptep, data))
+ goto out_get_page;
+
+ if (WARN_ON(level == KVM_PGTABLE_MAX_LEVELS - 1))
+ return -EINVAL;
+
+ if (!data->memcache)
+ return -ENOMEM;
+
+ childp = kvm_mmu_memory_cache_alloc(data->memcache);
+ if (!childp)
+ return -ENOMEM;
+
+ /*
+ * If we've run into an existing block mapping then replace it with
+ * a table. Accesses beyond 'end' that fall within the new table
+ * will be mapped lazily.
+ */
+ if (kvm_pte_valid(pte)) {
+ kvm_set_invalid_pte(ptep);
+ kvm_call_hyp(__kvm_tlb_flush_vmid_ipa, data->mmu, addr, level);
+ put_page(page);
+ }
+
+ kvm_set_table_pte(ptep, childp);
+
+out_get_page:
+ get_page(page);
+ return 0;
+}
+
+static int stage2_map_walk_table_post(u64 addr, u64 end, u32 level,
+ kvm_pte_t *ptep,
+ struct stage2_map_data *data)
+{
+ int ret = 0;
+
+ if (!data->anchor)
+ return 0;
+
+ free_page((unsigned long)kvm_pte_follow(*ptep));
+ put_page(virt_to_page(ptep));
+
+ if (data->anchor == ptep) {
+ data->anchor = NULL;
+ ret = stage2_map_walk_leaf(addr, end, level, ptep, data);
+ }
+
+ return ret;
+}
+
+/*
+ * This is a little fiddly, as we use all three of the walk flags. The idea
+ * is that the TABLE_PRE callback runs for table entries on the way down,
+ * looking for table entries which we could conceivably replace with a
+ * block entry for this mapping. If it finds one, then it sets the 'anchor'
+ * field in 'struct stage2_map_data' to point at the table entry, before
+ * clearing the entry to zero and descending into the now detached table.
+ *
+ * The behaviour of the LEAF callback then depends on whether or not the
+ * anchor has been set. If not, then we're not using a block mapping higher
+ * up the table and we perform the mapping at the existing leaves instead.
+ * If, on the other hand, the anchor _is_ set, then we drop references to
+ * all valid leaves so that the pages beneath the anchor can be freed.
+ *
+ * Finally, the TABLE_POST callback does nothing if the anchor has not
+ * been set, but otherwise frees the page-table pages while walking back up
+ * the page-table, installing the block entry when it revisits the anchor
+ * pointer and clearing the anchor to NULL.
+ */
+static int stage2_map_walker(u64 addr, u64 end, u32 level, kvm_pte_t *ptep,
+ enum kvm_pgtable_walk_flags flag, void * const arg)
+{
+ struct stage2_map_data *data = arg;
+
+ switch (flag) {
+ case KVM_PGTABLE_WALK_TABLE_PRE:
+ return stage2_map_walk_table_pre(addr, end, level, ptep, data);
+ case KVM_PGTABLE_WALK_LEAF:
+ return stage2_map_walk_leaf(addr, end, level, ptep, data);
+ case KVM_PGTABLE_WALK_TABLE_POST:
+ return stage2_map_walk_table_post(addr, end, level, ptep, data);
+ }
+
+ return -EINVAL;
+}
+
+int kvm_pgtable_stage2_map(struct kvm_pgtable *pgt, u64 addr, u64 size,
+ u64 phys, enum kvm_pgtable_prot prot,
+ struct kvm_mmu_memory_cache *mc)
+{
+ int ret;
+ struct stage2_map_data map_data = {
+ .phys = ALIGN_DOWN(phys, PAGE_SIZE),
+ .mmu = pgt->mmu,
+ .memcache = mc,
+ };
+ struct kvm_pgtable_walker walker = {
+ .cb = stage2_map_walker,
+ .flags = KVM_PGTABLE_WALK_TABLE_PRE |
+ KVM_PGTABLE_WALK_LEAF |
+ KVM_PGTABLE_WALK_TABLE_POST,
+ .arg = &map_data,
+ };
+
+ ret = stage2_map_set_prot_attr(prot, &map_data);
+ if (ret)
+ return ret;
+
+ ret = kvm_pgtable_walk(pgt, addr, size, &walker);
+ dsb(ishst);
+ return ret;
+}
+
+static void stage2_flush_dcache(void *addr, u64 size)
+{
+ if (cpus_have_const_cap(ARM64_HAS_STAGE2_FWB))
+ return;
+
+ __flush_dcache_area(addr, size);
+}
+
+static bool stage2_pte_cacheable(kvm_pte_t pte)
+{
+ u64 memattr = FIELD_GET(KVM_PTE_LEAF_ATTR_LO_S2_MEMATTR, pte);
+ return memattr == PAGE_S2_MEMATTR(NORMAL);
+}
+
+static int stage2_unmap_walker(u64 addr, u64 end, u32 level, kvm_pte_t *ptep,
+ enum kvm_pgtable_walk_flags flag,
+ void * const arg)
+{
+ struct kvm_s2_mmu *mmu = arg;
+ kvm_pte_t pte = *ptep, *childp = NULL;
+ bool need_flush = false;
+
+ if (!kvm_pte_valid(pte))
+ return 0;
+
+ if (kvm_pte_table(pte, level)) {
+ childp = kvm_pte_follow(pte);
+
+ if (page_count(virt_to_page(childp)) != 1)
+ return 0;
+ } else if (stage2_pte_cacheable(pte)) {
+ need_flush = true;
+ }
+
+ /*
+ * This is similar to the map() path in that we unmap the entire
+ * block entry and rely on the remaining portions being faulted
+ * back lazily.
+ */
+ kvm_set_invalid_pte(ptep);
+ kvm_call_hyp(__kvm_tlb_flush_vmid_ipa, mmu, addr, level);
+ put_page(virt_to_page(ptep));
+
+ if (need_flush) {
+ stage2_flush_dcache(kvm_pte_follow(pte),
+ kvm_granule_size(level));
+ }
+
+ if (childp)
+ free_page((unsigned long)childp);
+
+ return 0;
+}
+
+int kvm_pgtable_stage2_unmap(struct kvm_pgtable *pgt, u64 addr, u64 size)
+{
+ struct kvm_pgtable_walker walker = {
+ .cb = stage2_unmap_walker,
+ .arg = pgt->mmu,
+ .flags = KVM_PGTABLE_WALK_LEAF | KVM_PGTABLE_WALK_TABLE_POST,
+ };
+
+ return kvm_pgtable_walk(pgt, addr, size, &walker);
+}
+
+struct stage2_attr_data {
+ kvm_pte_t attr_set;
+ kvm_pte_t attr_clr;
+ kvm_pte_t pte;
+ u32 level;
+};
+
+static int stage2_attr_walker(u64 addr, u64 end, u32 level, kvm_pte_t *ptep,
+ enum kvm_pgtable_walk_flags flag,
+ void * const arg)
+{
+ kvm_pte_t pte = *ptep;
+ struct stage2_attr_data *data = arg;
+
+ if (!kvm_pte_valid(pte))
+ return 0;
+
+ data->level = level;
+ data->pte = pte;
+ pte &= ~data->attr_clr;
+ pte |= data->attr_set;
+
+ /*
+ * We may race with the CPU trying to set the access flag here,
+ * but worst-case the access flag update gets lost and will be
+ * set on the next access instead.
+ */
+ if (data->pte != pte)
+ WRITE_ONCE(*ptep, pte);
+
+ return 0;
+}
+
+static int stage2_update_leaf_attrs(struct kvm_pgtable *pgt, u64 addr,
+ u64 size, kvm_pte_t attr_set,
+ kvm_pte_t attr_clr, kvm_pte_t *orig_pte,
+ u32 *level)
+{
+ int ret;
+ kvm_pte_t attr_mask = KVM_PTE_LEAF_ATTR_LO | KVM_PTE_LEAF_ATTR_HI;
+ struct stage2_attr_data data = {
+ .attr_set = attr_set & attr_mask,
+ .attr_clr = attr_clr & attr_mask,
+ };
+ struct kvm_pgtable_walker walker = {
+ .cb = stage2_attr_walker,
+ .arg = &data,
+ .flags = KVM_PGTABLE_WALK_LEAF,
+ };
+
+ ret = kvm_pgtable_walk(pgt, addr, size, &walker);
+ if (ret)
+ return ret;
+
+ if (orig_pte)
+ *orig_pte = data.pte;
+
+ if (level)
+ *level = data.level;
+ return 0;
+}
+
+int kvm_pgtable_stage2_wrprotect(struct kvm_pgtable *pgt, u64 addr, u64 size)
+{
+ return stage2_update_leaf_attrs(pgt, addr, size, 0,
+ KVM_PTE_LEAF_ATTR_LO_S2_S2AP_W,
+ NULL, NULL);
+}
+
+kvm_pte_t kvm_pgtable_stage2_mkyoung(struct kvm_pgtable *pgt, u64 addr)
+{
+ kvm_pte_t pte = 0;
+ stage2_update_leaf_attrs(pgt, addr, 1, KVM_PTE_LEAF_ATTR_LO_S2_AF, 0,
+ &pte, NULL);
+ dsb(ishst);
+ return pte;
+}
+
+kvm_pte_t kvm_pgtable_stage2_mkold(struct kvm_pgtable *pgt, u64 addr)
+{
+ kvm_pte_t pte = 0;
+ stage2_update_leaf_attrs(pgt, addr, 1, 0, KVM_PTE_LEAF_ATTR_LO_S2_AF,
+ &pte, NULL);
+ /*
+ * "But where's the TLBI?!", you scream.
+ * "Over in the core code", I sigh.
+ *
+ * See the '->clear_flush_young()' callback on the KVM mmu notifier.
+ */
+ return pte;
+}
+
+bool kvm_pgtable_stage2_is_young(struct kvm_pgtable *pgt, u64 addr)
+{
+ kvm_pte_t pte = 0;
+ stage2_update_leaf_attrs(pgt, addr, 1, 0, 0, &pte, NULL);
+ return pte & KVM_PTE_LEAF_ATTR_LO_S2_AF;
+}
+
+int kvm_pgtable_stage2_relax_perms(struct kvm_pgtable *pgt, u64 addr,
+ enum kvm_pgtable_prot prot)
+{
+ int ret;
+ u32 level;
+ kvm_pte_t set = 0, clr = 0;
+
+ if (prot & KVM_PGTABLE_PROT_R)
+ set |= KVM_PTE_LEAF_ATTR_LO_S2_S2AP_R;
+
+ if (prot & KVM_PGTABLE_PROT_W)
+ set |= KVM_PTE_LEAF_ATTR_LO_S2_S2AP_W;
+
+ if (prot & KVM_PGTABLE_PROT_X)
+ clr |= KVM_PTE_LEAF_ATTR_HI_S2_XN;
+
+ ret = stage2_update_leaf_attrs(pgt, addr, 1, set, clr, NULL, &level);
+ if (!ret)
+ kvm_call_hyp(__kvm_tlb_flush_vmid_ipa, pgt->mmu, addr, level);
+ return ret;
+}
+
+static int stage2_flush_walker(u64 addr, u64 end, u32 level, kvm_pte_t *ptep,
+ enum kvm_pgtable_walk_flags flag,
+ void * const arg)
+{
+ kvm_pte_t pte = *ptep;
+
+ if (!kvm_pte_valid(pte) || !stage2_pte_cacheable(pte))
+ return 0;
+
+ stage2_flush_dcache(kvm_pte_follow(pte), kvm_granule_size(level));
+ return 0;
+}
+
+int kvm_pgtable_stage2_flush(struct kvm_pgtable *pgt, u64 addr, u64 size)
+{
+ struct kvm_pgtable_walker walker = {
+ .cb = stage2_flush_walker,
+ .flags = KVM_PGTABLE_WALK_LEAF,
+ };
+
+ if (cpus_have_const_cap(ARM64_HAS_STAGE2_FWB))
+ return 0;
+
+ return kvm_pgtable_walk(pgt, addr, size, &walker);
+}
+
+int kvm_pgtable_stage2_init(struct kvm_pgtable *pgt, struct kvm *kvm)
+{
+ size_t pgd_sz;
+ u64 vtcr = kvm->arch.vtcr;
+ u32 ia_bits = VTCR_EL2_IPA(vtcr);
+ u32 sl0 = FIELD_GET(VTCR_EL2_SL0_MASK, vtcr);
+ u32 start_level = VTCR_EL2_TGRAN_SL0_BASE - sl0;
+
+ pgd_sz = kvm_pgd_pages(ia_bits, start_level) * PAGE_SIZE;
+ pgt->pgd = alloc_pages_exact(pgd_sz, GFP_KERNEL | __GFP_ZERO);
+ if (!pgt->pgd)
+ return -ENOMEM;
+
+ pgt->ia_bits = ia_bits;
+ pgt->start_level = start_level;
+ pgt->mmu = &kvm->arch.mmu;
+
+ /* Ensure zeroed PGD pages are visible to the hardware walker */
+ dsb(ishst);
+ return 0;
+}
+
+static int stage2_free_walker(u64 addr, u64 end, u32 level, kvm_pte_t *ptep,
+ enum kvm_pgtable_walk_flags flag,
+ void * const arg)
+{
+ kvm_pte_t pte = *ptep;
+
+ if (!kvm_pte_valid(pte))
+ return 0;
+
+ put_page(virt_to_page(ptep));
+
+ if (kvm_pte_table(pte, level))
+ free_page((unsigned long)kvm_pte_follow(pte));
+
+ return 0;
+}
+
+void kvm_pgtable_stage2_destroy(struct kvm_pgtable *pgt)
+{
+ size_t pgd_sz;
+ struct kvm_pgtable_walker walker = {
+ .cb = stage2_free_walker,
+ .flags = KVM_PGTABLE_WALK_LEAF |
+ KVM_PGTABLE_WALK_TABLE_POST,
+ };
+
+ WARN_ON(kvm_pgtable_walk(pgt, 0, BIT(pgt->ia_bits), &walker));
+ pgd_sz = kvm_pgd_pages(pgt->ia_bits, pgt->start_level) * PAGE_SIZE;
+ free_pages_exact(pgt->pgd, pgd_sz);
+ pgt->pgd = NULL;
+}
diff --git a/arch/arm64/kvm/hyp/vhe/switch.c b/arch/arm64/kvm/hyp/vhe/switch.c
index ecf67e678203..fe69de16dadc 100644
--- a/arch/arm64/kvm/hyp/vhe/switch.c
+++ b/arch/arm64/kvm/hyp/vhe/switch.c
@@ -28,6 +28,11 @@
const char __hyp_panic_string[] = "HYP panic:\nPS:%08llx PC:%016llx ESR:%08llx\nFAR:%016llx HPFAR:%016llx PAR:%016llx\nVCPU:%p\n";
+/* VHE specific context */
+DEFINE_PER_CPU(struct kvm_host_data, kvm_host_data);
+DEFINE_PER_CPU(struct kvm_cpu_context, kvm_hyp_ctxt);
+DEFINE_PER_CPU(unsigned long, kvm_hyp_vector);
+
static void __activate_traps(struct kvm_vcpu *vcpu)
{
u64 val;
@@ -59,7 +64,7 @@ static void __activate_traps(struct kvm_vcpu *vcpu)
write_sysreg(val, cpacr_el1);
- write_sysreg(kvm_get_hyp_vector(), vbar_el1);
+ write_sysreg(__this_cpu_read(kvm_hyp_vector), vbar_el1);
}
NOKPROBE_SYMBOL(__activate_traps);
@@ -108,7 +113,7 @@ static int __kvm_vcpu_run_vhe(struct kvm_vcpu *vcpu)
struct kvm_cpu_context *guest_ctxt;
u64 exit_code;
- host_ctxt = &__hyp_this_cpu_ptr(kvm_host_data)->host_ctxt;
+ host_ctxt = &this_cpu_ptr(&kvm_host_data)->host_ctxt;
host_ctxt->__hyp_running_vcpu = vcpu;
guest_ctxt = &vcpu->arch.ctxt;
@@ -120,12 +125,12 @@ static int __kvm_vcpu_run_vhe(struct kvm_vcpu *vcpu)
* HCR_EL2.TGE.
*
* We have already configured the guest's stage 1 translation in
- * kvm_vcpu_load_sysregs_vhe above. We must now call __activate_vm
- * before __activate_traps, because __activate_vm configures
- * stage 2 translation, and __activate_traps clear HCR_EL2.TGE
- * (among other things).
+ * kvm_vcpu_load_sysregs_vhe above. We must now call
+ * __load_guest_stage2 before __activate_traps, because
+ * __load_guest_stage2 configures stage 2 translation, and
+ * __activate_traps clear HCR_EL2.TGE (among other things).
*/
- __activate_vm(vcpu->arch.hw_mmu);
+ __load_guest_stage2(vcpu->arch.hw_mmu);
__activate_traps(vcpu);
sysreg_restore_guest_state_vhe(guest_ctxt);
@@ -133,7 +138,7 @@ static int __kvm_vcpu_run_vhe(struct kvm_vcpu *vcpu)
do {
/* Jump in the fire! */
- exit_code = __guest_enter(vcpu, host_ctxt);
+ exit_code = __guest_enter(vcpu);
/* And we're baaack! */
} while (fixup_guest_exit(vcpu, &exit_code));
@@ -188,10 +193,12 @@ int __kvm_vcpu_run(struct kvm_vcpu *vcpu)
return ret;
}
-static void __hyp_call_panic(u64 spsr, u64 elr, u64 par,
- struct kvm_cpu_context *host_ctxt)
+static void __hyp_call_panic(u64 spsr, u64 elr, u64 par)
{
+ struct kvm_cpu_context *host_ctxt;
struct kvm_vcpu *vcpu;
+
+ host_ctxt = &this_cpu_ptr(&kvm_host_data)->host_ctxt;
vcpu = host_ctxt->__hyp_running_vcpu;
__deactivate_traps(vcpu);
@@ -204,13 +211,13 @@ static void __hyp_call_panic(u64 spsr, u64 elr, u64 par,
}
NOKPROBE_SYMBOL(__hyp_call_panic);
-void __noreturn hyp_panic(struct kvm_cpu_context *host_ctxt)
+void __noreturn hyp_panic(void)
{
u64 spsr = read_sysreg_el2(SYS_SPSR);
u64 elr = read_sysreg_el2(SYS_ELR);
u64 par = read_sysreg(par_el1);
- __hyp_call_panic(spsr, elr, par, host_ctxt);
+ __hyp_call_panic(spsr, elr, par);
unreachable();
}
diff --git a/arch/arm64/kvm/hyp/vhe/sysreg-sr.c b/arch/arm64/kvm/hyp/vhe/sysreg-sr.c
index 996471e4c138..2a0b8c88d74f 100644
--- a/arch/arm64/kvm/hyp/vhe/sysreg-sr.c
+++ b/arch/arm64/kvm/hyp/vhe/sysreg-sr.c
@@ -66,7 +66,7 @@ void kvm_vcpu_load_sysregs_vhe(struct kvm_vcpu *vcpu)
struct kvm_cpu_context *guest_ctxt = &vcpu->arch.ctxt;
struct kvm_cpu_context *host_ctxt;
- host_ctxt = &__hyp_this_cpu_ptr(kvm_host_data)->host_ctxt;
+ host_ctxt = &this_cpu_ptr(&kvm_host_data)->host_ctxt;
__sysreg_save_user_state(host_ctxt);
/*
@@ -100,7 +100,7 @@ void kvm_vcpu_put_sysregs_vhe(struct kvm_vcpu *vcpu)
struct kvm_cpu_context *guest_ctxt = &vcpu->arch.ctxt;
struct kvm_cpu_context *host_ctxt;
- host_ctxt = &__hyp_this_cpu_ptr(kvm_host_data)->host_ctxt;
+ host_ctxt = &this_cpu_ptr(&kvm_host_data)->host_ctxt;
deactivate_traps_vhe_put();
__sysreg_save_el1_state(guest_ctxt);
diff --git a/arch/arm64/kvm/inject_fault.c b/arch/arm64/kvm/inject_fault.c
index ebfdfc27b2bd..34a96ab244fa 100644
--- a/arch/arm64/kvm/inject_fault.c
+++ b/arch/arm64/kvm/inject_fault.c
@@ -202,6 +202,7 @@ void kvm_inject_pabt(struct kvm_vcpu *vcpu, unsigned long addr)
/**
* kvm_inject_undefined - inject an undefined instruction into the guest
+ * @vcpu: The vCPU in which to inject the exception
*
* It is assumed that this code is called from the VCPU thread and that the
* VCPU therefore is not currently executing guest code.
diff --git a/arch/arm64/kvm/mmu.c b/arch/arm64/kvm/mmu.c
index 3d26b47a1343..19aacc7d64de 100644
--- a/arch/arm64/kvm/mmu.c
+++ b/arch/arm64/kvm/mmu.c
@@ -14,6 +14,7 @@
#include <asm/cacheflush.h>
#include <asm/kvm_arm.h>
#include <asm/kvm_mmu.h>
+#include <asm/kvm_pgtable.h>
#include <asm/kvm_ras.h>
#include <asm/kvm_asm.h>
#include <asm/kvm_emulate.h>
@@ -21,9 +22,7 @@
#include "trace.h"
-static pgd_t *boot_hyp_pgd;
-static pgd_t *hyp_pgd;
-static pgd_t *merged_hyp_pgd;
+static struct kvm_pgtable *hyp_pgtable;
static DEFINE_MUTEX(kvm_hyp_pgd_mutex);
static unsigned long hyp_idmap_start;
@@ -32,16 +31,42 @@ static phys_addr_t hyp_idmap_vector;
static unsigned long io_map_base;
-#define hyp_pgd_order get_order(PTRS_PER_PGD * sizeof(pgd_t))
-#define KVM_S2PTE_FLAG_IS_IOMAP (1UL << 0)
-#define KVM_S2_FLAG_LOGGING_ACTIVE (1UL << 1)
-
-static bool is_iomap(unsigned long flags)
+/*
+ * Release kvm_mmu_lock periodically if the memory region is large. Otherwise,
+ * we may see kernel panics with CONFIG_DETECT_HUNG_TASK,
+ * CONFIG_LOCKUP_DETECTOR, CONFIG_LOCKDEP. Additionally, holding the lock too
+ * long will also starve other vCPUs. We have to also make sure that the page
+ * tables are not freed while we released the lock.
+ */
+static int stage2_apply_range(struct kvm *kvm, phys_addr_t addr,
+ phys_addr_t end,
+ int (*fn)(struct kvm_pgtable *, u64, u64),
+ bool resched)
{
- return flags & KVM_S2PTE_FLAG_IS_IOMAP;
+ int ret;
+ u64 next;
+
+ do {
+ struct kvm_pgtable *pgt = kvm->arch.mmu.pgt;
+ if (!pgt)
+ return -EINVAL;
+
+ next = stage2_pgd_addr_end(kvm, addr, end);
+ ret = fn(pgt, addr, next - addr);
+ if (ret)
+ break;
+
+ if (resched && next != end)
+ cond_resched_lock(&kvm->mmu_lock);
+ } while (addr = next, addr != end);
+
+ return ret;
}
+#define stage2_apply_range_resched(kvm, addr, end, fn) \
+ stage2_apply_range(kvm, addr, end, fn, true)
+
static bool memslot_is_logging(struct kvm_memory_slot *memslot)
{
return memslot->dirty_bitmap && !(memslot->flags & KVM_MEM_READONLY);
@@ -58,154 +83,11 @@ void kvm_flush_remote_tlbs(struct kvm *kvm)
kvm_call_hyp(__kvm_tlb_flush_vmid, &kvm->arch.mmu);
}
-static void kvm_tlb_flush_vmid_ipa(struct kvm_s2_mmu *mmu, phys_addr_t ipa,
- int level)
-{
- kvm_call_hyp(__kvm_tlb_flush_vmid_ipa, mmu, ipa, level);
-}
-
-/*
- * D-Cache management functions. They take the page table entries by
- * value, as they are flushing the cache using the kernel mapping (or
- * kmap on 32bit).
- */
-static void kvm_flush_dcache_pte(pte_t pte)
-{
- __kvm_flush_dcache_pte(pte);
-}
-
-static void kvm_flush_dcache_pmd(pmd_t pmd)
-{
- __kvm_flush_dcache_pmd(pmd);
-}
-
-static void kvm_flush_dcache_pud(pud_t pud)
-{
- __kvm_flush_dcache_pud(pud);
-}
-
static bool kvm_is_device_pfn(unsigned long pfn)
{
return !pfn_valid(pfn);
}
-/**
- * stage2_dissolve_pmd() - clear and flush huge PMD entry
- * @mmu: pointer to mmu structure to operate on
- * @addr: IPA
- * @pmd: pmd pointer for IPA
- *
- * Function clears a PMD entry, flushes addr 1st and 2nd stage TLBs.
- */
-static void stage2_dissolve_pmd(struct kvm_s2_mmu *mmu, phys_addr_t addr, pmd_t *pmd)
-{
- if (!pmd_thp_or_huge(*pmd))
- return;
-
- pmd_clear(pmd);
- kvm_tlb_flush_vmid_ipa(mmu, addr, S2_PMD_LEVEL);
- put_page(virt_to_page(pmd));
-}
-
-/**
- * stage2_dissolve_pud() - clear and flush huge PUD entry
- * @mmu: pointer to mmu structure to operate on
- * @addr: IPA
- * @pud: pud pointer for IPA
- *
- * Function clears a PUD entry, flushes addr 1st and 2nd stage TLBs.
- */
-static void stage2_dissolve_pud(struct kvm_s2_mmu *mmu, phys_addr_t addr, pud_t *pudp)
-{
- struct kvm *kvm = mmu->kvm;
-
- if (!stage2_pud_huge(kvm, *pudp))
- return;
-
- stage2_pud_clear(kvm, pudp);
- kvm_tlb_flush_vmid_ipa(mmu, addr, S2_PUD_LEVEL);
- put_page(virt_to_page(pudp));
-}
-
-static void clear_stage2_pgd_entry(struct kvm_s2_mmu *mmu, pgd_t *pgd, phys_addr_t addr)
-{
- struct kvm *kvm = mmu->kvm;
- p4d_t *p4d_table __maybe_unused = stage2_p4d_offset(kvm, pgd, 0UL);
- stage2_pgd_clear(kvm, pgd);
- kvm_tlb_flush_vmid_ipa(mmu, addr, S2_NO_LEVEL_HINT);
- stage2_p4d_free(kvm, p4d_table);
- put_page(virt_to_page(pgd));
-}
-
-static void clear_stage2_p4d_entry(struct kvm_s2_mmu *mmu, p4d_t *p4d, phys_addr_t addr)
-{
- struct kvm *kvm = mmu->kvm;
- pud_t *pud_table __maybe_unused = stage2_pud_offset(kvm, p4d, 0);
- stage2_p4d_clear(kvm, p4d);
- kvm_tlb_flush_vmid_ipa(mmu, addr, S2_NO_LEVEL_HINT);
- stage2_pud_free(kvm, pud_table);
- put_page(virt_to_page(p4d));
-}
-
-static void clear_stage2_pud_entry(struct kvm_s2_mmu *mmu, pud_t *pud, phys_addr_t addr)
-{
- struct kvm *kvm = mmu->kvm;
- pmd_t *pmd_table __maybe_unused = stage2_pmd_offset(kvm, pud, 0);
-
- VM_BUG_ON(stage2_pud_huge(kvm, *pud));
- stage2_pud_clear(kvm, pud);
- kvm_tlb_flush_vmid_ipa(mmu, addr, S2_NO_LEVEL_HINT);
- stage2_pmd_free(kvm, pmd_table);
- put_page(virt_to_page(pud));
-}
-
-static void clear_stage2_pmd_entry(struct kvm_s2_mmu *mmu, pmd_t *pmd, phys_addr_t addr)
-{
- pte_t *pte_table = pte_offset_kernel(pmd, 0);
- VM_BUG_ON(pmd_thp_or_huge(*pmd));
- pmd_clear(pmd);
- kvm_tlb_flush_vmid_ipa(mmu, addr, S2_NO_LEVEL_HINT);
- free_page((unsigned long)pte_table);
- put_page(virt_to_page(pmd));
-}
-
-static inline void kvm_set_pte(pte_t *ptep, pte_t new_pte)
-{
- WRITE_ONCE(*ptep, new_pte);
- dsb(ishst);
-}
-
-static inline void kvm_set_pmd(pmd_t *pmdp, pmd_t new_pmd)
-{
- WRITE_ONCE(*pmdp, new_pmd);
- dsb(ishst);
-}
-
-static inline void kvm_pmd_populate(pmd_t *pmdp, pte_t *ptep)
-{
- kvm_set_pmd(pmdp, kvm_mk_pmd(ptep));
-}
-
-static inline void kvm_pud_populate(pud_t *pudp, pmd_t *pmdp)
-{
- WRITE_ONCE(*pudp, kvm_mk_pud(pmdp));
- dsb(ishst);
-}
-
-static inline void kvm_p4d_populate(p4d_t *p4dp, pud_t *pudp)
-{
- WRITE_ONCE(*p4dp, kvm_mk_p4d(pudp));
- dsb(ishst);
-}
-
-static inline void kvm_pgd_populate(pgd_t *pgdp, p4d_t *p4dp)
-{
-#ifndef __PAGETABLE_P4D_FOLDED
- WRITE_ONCE(*pgdp, kvm_mk_pgd(p4dp));
- dsb(ishst);
-#endif
-}
-
/*
* Unmapping vs dcache management:
*
@@ -223,120 +105,19 @@ static inline void kvm_pgd_populate(pgd_t *pgdp, p4d_t *p4dp)
* end up writing old data to disk.
*
* This is why right after unmapping a page/section and invalidating
- * the corresponding TLBs, we call kvm_flush_dcache_p*() to make sure
- * the IO subsystem will never hit in the cache.
+ * the corresponding TLBs, we flush to make sure the IO subsystem will
+ * never hit in the cache.
*
* This is all avoided on systems that have ARM64_HAS_STAGE2_FWB, as
* we then fully enforce cacheability of RAM, no matter what the guest
* does.
*/
-static void unmap_stage2_ptes(struct kvm_s2_mmu *mmu, pmd_t *pmd,
- phys_addr_t addr, phys_addr_t end)
-{
- phys_addr_t start_addr = addr;
- pte_t *pte, *start_pte;
-
- start_pte = pte = pte_offset_kernel(pmd, addr);
- do {
- if (!pte_none(*pte)) {
- pte_t old_pte = *pte;
-
- kvm_set_pte(pte, __pte(0));
- kvm_tlb_flush_vmid_ipa(mmu, addr, S2_PTE_LEVEL);
-
- /* No need to invalidate the cache for device mappings */
- if (!kvm_is_device_pfn(pte_pfn(old_pte)))
- kvm_flush_dcache_pte(old_pte);
-
- put_page(virt_to_page(pte));
- }
- } while (pte++, addr += PAGE_SIZE, addr != end);
-
- if (stage2_pte_table_empty(mmu->kvm, start_pte))
- clear_stage2_pmd_entry(mmu, pmd, start_addr);
-}
-
-static void unmap_stage2_pmds(struct kvm_s2_mmu *mmu, pud_t *pud,
- phys_addr_t addr, phys_addr_t end)
-{
- struct kvm *kvm = mmu->kvm;
- phys_addr_t next, start_addr = addr;
- pmd_t *pmd, *start_pmd;
-
- start_pmd = pmd = stage2_pmd_offset(kvm, pud, addr);
- do {
- next = stage2_pmd_addr_end(kvm, addr, end);
- if (!pmd_none(*pmd)) {
- if (pmd_thp_or_huge(*pmd)) {
- pmd_t old_pmd = *pmd;
-
- pmd_clear(pmd);
- kvm_tlb_flush_vmid_ipa(mmu, addr, S2_PMD_LEVEL);
-
- kvm_flush_dcache_pmd(old_pmd);
-
- put_page(virt_to_page(pmd));
- } else {
- unmap_stage2_ptes(mmu, pmd, addr, next);
- }
- }
- } while (pmd++, addr = next, addr != end);
-
- if (stage2_pmd_table_empty(kvm, start_pmd))
- clear_stage2_pud_entry(mmu, pud, start_addr);
-}
-
-static void unmap_stage2_puds(struct kvm_s2_mmu *mmu, p4d_t *p4d,
- phys_addr_t addr, phys_addr_t end)
-{
- struct kvm *kvm = mmu->kvm;
- phys_addr_t next, start_addr = addr;
- pud_t *pud, *start_pud;
-
- start_pud = pud = stage2_pud_offset(kvm, p4d, addr);
- do {
- next = stage2_pud_addr_end(kvm, addr, end);
- if (!stage2_pud_none(kvm, *pud)) {
- if (stage2_pud_huge(kvm, *pud)) {
- pud_t old_pud = *pud;
-
- stage2_pud_clear(kvm, pud);
- kvm_tlb_flush_vmid_ipa(mmu, addr, S2_PUD_LEVEL);
- kvm_flush_dcache_pud(old_pud);
- put_page(virt_to_page(pud));
- } else {
- unmap_stage2_pmds(mmu, pud, addr, next);
- }
- }
- } while (pud++, addr = next, addr != end);
-
- if (stage2_pud_table_empty(kvm, start_pud))
- clear_stage2_p4d_entry(mmu, p4d, start_addr);
-}
-
-static void unmap_stage2_p4ds(struct kvm_s2_mmu *mmu, pgd_t *pgd,
- phys_addr_t addr, phys_addr_t end)
-{
- struct kvm *kvm = mmu->kvm;
- phys_addr_t next, start_addr = addr;
- p4d_t *p4d, *start_p4d;
-
- start_p4d = p4d = stage2_p4d_offset(kvm, pgd, addr);
- do {
- next = stage2_p4d_addr_end(kvm, addr, end);
- if (!stage2_p4d_none(kvm, *p4d))
- unmap_stage2_puds(mmu, p4d, addr, next);
- } while (p4d++, addr = next, addr != end);
-
- if (stage2_p4d_table_empty(kvm, start_p4d))
- clear_stage2_pgd_entry(mmu, pgd, start_addr);
-}
-
/**
* unmap_stage2_range -- Clear stage2 page table entries to unmap a range
- * @kvm: The VM pointer
+ * @mmu: The KVM stage-2 MMU pointer
* @start: The intermediate physical base address of the range to unmap
* @size: The size of the area to unmap
+ * @may_block: Whether or not we are permitted to block
*
* Clear a range of stage-2 mappings, lowering the various ref-counts. Must
* be called while holding mmu_lock (unless for freeing the stage2 pgd before
@@ -347,32 +128,12 @@ static void __unmap_stage2_range(struct kvm_s2_mmu *mmu, phys_addr_t start, u64
bool may_block)
{
struct kvm *kvm = mmu->kvm;
- pgd_t *pgd;
- phys_addr_t addr = start, end = start + size;
- phys_addr_t next;
+ phys_addr_t end = start + size;
assert_spin_locked(&kvm->mmu_lock);
WARN_ON(size & ~PAGE_MASK);
-
- pgd = mmu->pgd + stage2_pgd_index(kvm, addr);
- do {
- /*
- * Make sure the page table is still active, as another thread
- * could have possibly freed the page table, while we released
- * the lock.
- */
- if (!READ_ONCE(mmu->pgd))
- break;
- next = stage2_pgd_addr_end(kvm, addr, end);
- if (!stage2_pgd_none(kvm, *pgd))
- unmap_stage2_p4ds(mmu, pgd, addr, next);
- /*
- * If the range is too large, release the kvm->mmu_lock
- * to prevent starvation and lockup detector warnings.
- */
- if (may_block && next != end)
- cond_resched_lock(&kvm->mmu_lock);
- } while (pgd++, addr = next, addr != end);
+ WARN_ON(stage2_apply_range(kvm, start, end, kvm_pgtable_stage2_unmap,
+ may_block));
}
static void unmap_stage2_range(struct kvm_s2_mmu *mmu, phys_addr_t start, u64 size)
@@ -380,89 +141,13 @@ static void unmap_stage2_range(struct kvm_s2_mmu *mmu, phys_addr_t start, u64 si
__unmap_stage2_range(mmu, start, size, true);
}
-static void stage2_flush_ptes(struct kvm_s2_mmu *mmu, pmd_t *pmd,
- phys_addr_t addr, phys_addr_t end)
-{
- pte_t *pte;
-
- pte = pte_offset_kernel(pmd, addr);
- do {
- if (!pte_none(*pte) && !kvm_is_device_pfn(pte_pfn(*pte)))
- kvm_flush_dcache_pte(*pte);
- } while (pte++, addr += PAGE_SIZE, addr != end);
-}
-
-static void stage2_flush_pmds(struct kvm_s2_mmu *mmu, pud_t *pud,
- phys_addr_t addr, phys_addr_t end)
-{
- struct kvm *kvm = mmu->kvm;
- pmd_t *pmd;
- phys_addr_t next;
-
- pmd = stage2_pmd_offset(kvm, pud, addr);
- do {
- next = stage2_pmd_addr_end(kvm, addr, end);
- if (!pmd_none(*pmd)) {
- if (pmd_thp_or_huge(*pmd))
- kvm_flush_dcache_pmd(*pmd);
- else
- stage2_flush_ptes(mmu, pmd, addr, next);
- }
- } while (pmd++, addr = next, addr != end);
-}
-
-static void stage2_flush_puds(struct kvm_s2_mmu *mmu, p4d_t *p4d,
- phys_addr_t addr, phys_addr_t end)
-{
- struct kvm *kvm = mmu->kvm;
- pud_t *pud;
- phys_addr_t next;
-
- pud = stage2_pud_offset(kvm, p4d, addr);
- do {
- next = stage2_pud_addr_end(kvm, addr, end);
- if (!stage2_pud_none(kvm, *pud)) {
- if (stage2_pud_huge(kvm, *pud))
- kvm_flush_dcache_pud(*pud);
- else
- stage2_flush_pmds(mmu, pud, addr, next);
- }
- } while (pud++, addr = next, addr != end);
-}
-
-static void stage2_flush_p4ds(struct kvm_s2_mmu *mmu, pgd_t *pgd,
- phys_addr_t addr, phys_addr_t end)
-{
- struct kvm *kvm = mmu->kvm;
- p4d_t *p4d;
- phys_addr_t next;
-
- p4d = stage2_p4d_offset(kvm, pgd, addr);
- do {
- next = stage2_p4d_addr_end(kvm, addr, end);
- if (!stage2_p4d_none(kvm, *p4d))
- stage2_flush_puds(mmu, p4d, addr, next);
- } while (p4d++, addr = next, addr != end);
-}
-
static void stage2_flush_memslot(struct kvm *kvm,
struct kvm_memory_slot *memslot)
{
- struct kvm_s2_mmu *mmu = &kvm->arch.mmu;
phys_addr_t addr = memslot->base_gfn << PAGE_SHIFT;
phys_addr_t end = addr + PAGE_SIZE * memslot->npages;
- phys_addr_t next;
- pgd_t *pgd;
- pgd = mmu->pgd + stage2_pgd_index(kvm, addr);
- do {
- next = stage2_pgd_addr_end(kvm, addr, end);
- if (!stage2_pgd_none(kvm, *pgd))
- stage2_flush_p4ds(mmu, pgd, addr, next);
-
- if (next != end)
- cond_resched_lock(&kvm->mmu_lock);
- } while (pgd++, addr = next, addr != end);
+ stage2_apply_range_resched(kvm, addr, end, kvm_pgtable_stage2_flush);
}
/**
@@ -489,338 +174,28 @@ static void stage2_flush_vm(struct kvm *kvm)
srcu_read_unlock(&kvm->srcu, idx);
}
-static void clear_hyp_pgd_entry(pgd_t *pgd)
-{
- p4d_t *p4d_table __maybe_unused = p4d_offset(pgd, 0UL);
- pgd_clear(pgd);
- p4d_free(NULL, p4d_table);
- put_page(virt_to_page(pgd));
-}
-
-static void clear_hyp_p4d_entry(p4d_t *p4d)
-{
- pud_t *pud_table __maybe_unused = pud_offset(p4d, 0UL);
- VM_BUG_ON(p4d_huge(*p4d));
- p4d_clear(p4d);
- pud_free(NULL, pud_table);
- put_page(virt_to_page(p4d));
-}
-
-static void clear_hyp_pud_entry(pud_t *pud)
-{
- pmd_t *pmd_table __maybe_unused = pmd_offset(pud, 0);
- VM_BUG_ON(pud_huge(*pud));
- pud_clear(pud);
- pmd_free(NULL, pmd_table);
- put_page(virt_to_page(pud));
-}
-
-static void clear_hyp_pmd_entry(pmd_t *pmd)
-{
- pte_t *pte_table = pte_offset_kernel(pmd, 0);
- VM_BUG_ON(pmd_thp_or_huge(*pmd));
- pmd_clear(pmd);
- pte_free_kernel(NULL, pte_table);
- put_page(virt_to_page(pmd));
-}
-
-static void unmap_hyp_ptes(pmd_t *pmd, phys_addr_t addr, phys_addr_t end)
-{
- pte_t *pte, *start_pte;
-
- start_pte = pte = pte_offset_kernel(pmd, addr);
- do {
- if (!pte_none(*pte)) {
- kvm_set_pte(pte, __pte(0));
- put_page(virt_to_page(pte));
- }
- } while (pte++, addr += PAGE_SIZE, addr != end);
-
- if (hyp_pte_table_empty(start_pte))
- clear_hyp_pmd_entry(pmd);
-}
-
-static void unmap_hyp_pmds(pud_t *pud, phys_addr_t addr, phys_addr_t end)
-{
- phys_addr_t next;
- pmd_t *pmd, *start_pmd;
-
- start_pmd = pmd = pmd_offset(pud, addr);
- do {
- next = pmd_addr_end(addr, end);
- /* Hyp doesn't use huge pmds */
- if (!pmd_none(*pmd))
- unmap_hyp_ptes(pmd, addr, next);
- } while (pmd++, addr = next, addr != end);
-
- if (hyp_pmd_table_empty(start_pmd))
- clear_hyp_pud_entry(pud);
-}
-
-static void unmap_hyp_puds(p4d_t *p4d, phys_addr_t addr, phys_addr_t end)
-{
- phys_addr_t next;
- pud_t *pud, *start_pud;
-
- start_pud = pud = pud_offset(p4d, addr);
- do {
- next = pud_addr_end(addr, end);
- /* Hyp doesn't use huge puds */
- if (!pud_none(*pud))
- unmap_hyp_pmds(pud, addr, next);
- } while (pud++, addr = next, addr != end);
-
- if (hyp_pud_table_empty(start_pud))
- clear_hyp_p4d_entry(p4d);
-}
-
-static void unmap_hyp_p4ds(pgd_t *pgd, phys_addr_t addr, phys_addr_t end)
-{
- phys_addr_t next;
- p4d_t *p4d, *start_p4d;
-
- start_p4d = p4d = p4d_offset(pgd, addr);
- do {
- next = p4d_addr_end(addr, end);
- /* Hyp doesn't use huge p4ds */
- if (!p4d_none(*p4d))
- unmap_hyp_puds(p4d, addr, next);
- } while (p4d++, addr = next, addr != end);
-
- if (hyp_p4d_table_empty(start_p4d))
- clear_hyp_pgd_entry(pgd);
-}
-
-static unsigned int kvm_pgd_index(unsigned long addr, unsigned int ptrs_per_pgd)
-{
- return (addr >> PGDIR_SHIFT) & (ptrs_per_pgd - 1);
-}
-
-static void __unmap_hyp_range(pgd_t *pgdp, unsigned long ptrs_per_pgd,
- phys_addr_t start, u64 size)
-{
- pgd_t *pgd;
- phys_addr_t addr = start, end = start + size;
- phys_addr_t next;
-
- /*
- * We don't unmap anything from HYP, except at the hyp tear down.
- * Hence, we don't have to invalidate the TLBs here.
- */
- pgd = pgdp + kvm_pgd_index(addr, ptrs_per_pgd);
- do {
- next = pgd_addr_end(addr, end);
- if (!pgd_none(*pgd))
- unmap_hyp_p4ds(pgd, addr, next);
- } while (pgd++, addr = next, addr != end);
-}
-
-static void unmap_hyp_range(pgd_t *pgdp, phys_addr_t start, u64 size)
-{
- __unmap_hyp_range(pgdp, PTRS_PER_PGD, start, size);
-}
-
-static void unmap_hyp_idmap_range(pgd_t *pgdp, phys_addr_t start, u64 size)
-{
- __unmap_hyp_range(pgdp, __kvm_idmap_ptrs_per_pgd(), start, size);
-}
-
/**
* free_hyp_pgds - free Hyp-mode page tables
- *
- * Assumes hyp_pgd is a page table used strictly in Hyp-mode and
- * therefore contains either mappings in the kernel memory area (above
- * PAGE_OFFSET), or device mappings in the idmap range.
- *
- * boot_hyp_pgd should only map the idmap range, and is only used in
- * the extended idmap case.
*/
void free_hyp_pgds(void)
{
- pgd_t *id_pgd;
-
mutex_lock(&kvm_hyp_pgd_mutex);
-
- id_pgd = boot_hyp_pgd ? boot_hyp_pgd : hyp_pgd;
-
- if (id_pgd) {
- /* In case we never called hyp_mmu_init() */
- if (!io_map_base)
- io_map_base = hyp_idmap_start;
- unmap_hyp_idmap_range(id_pgd, io_map_base,
- hyp_idmap_start + PAGE_SIZE - io_map_base);
+ if (hyp_pgtable) {
+ kvm_pgtable_hyp_destroy(hyp_pgtable);
+ kfree(hyp_pgtable);
}
-
- if (boot_hyp_pgd) {
- free_pages((unsigned long)boot_hyp_pgd, hyp_pgd_order);
- boot_hyp_pgd = NULL;
- }
-
- if (hyp_pgd) {
- unmap_hyp_range(hyp_pgd, kern_hyp_va(PAGE_OFFSET),
- (uintptr_t)high_memory - PAGE_OFFSET);
-
- free_pages((unsigned long)hyp_pgd, hyp_pgd_order);
- hyp_pgd = NULL;
- }
- if (merged_hyp_pgd) {
- clear_page(merged_hyp_pgd);
- free_page((unsigned long)merged_hyp_pgd);
- merged_hyp_pgd = NULL;
- }
-
mutex_unlock(&kvm_hyp_pgd_mutex);
}
-static void create_hyp_pte_mappings(pmd_t *pmd, unsigned long start,
- unsigned long end, unsigned long pfn,
- pgprot_t prot)
-{
- pte_t *pte;
- unsigned long addr;
-
- addr = start;
- do {
- pte = pte_offset_kernel(pmd, addr);
- kvm_set_pte(pte, kvm_pfn_pte(pfn, prot));
- get_page(virt_to_page(pte));
- pfn++;
- } while (addr += PAGE_SIZE, addr != end);
-}
-
-static int create_hyp_pmd_mappings(pud_t *pud, unsigned long start,
- unsigned long end, unsigned long pfn,
- pgprot_t prot)
-{
- pmd_t *pmd;
- pte_t *pte;
- unsigned long addr, next;
-
- addr = start;
- do {
- pmd = pmd_offset(pud, addr);
-
- BUG_ON(pmd_sect(*pmd));
-
- if (pmd_none(*pmd)) {
- pte = pte_alloc_one_kernel(NULL);
- if (!pte) {
- kvm_err("Cannot allocate Hyp pte\n");
- return -ENOMEM;
- }
- kvm_pmd_populate(pmd, pte);
- get_page(virt_to_page(pmd));
- }
-
- next = pmd_addr_end(addr, end);
-
- create_hyp_pte_mappings(pmd, addr, next, pfn, prot);
- pfn += (next - addr) >> PAGE_SHIFT;
- } while (addr = next, addr != end);
-
- return 0;
-}
-
-static int create_hyp_pud_mappings(p4d_t *p4d, unsigned long start,
- unsigned long end, unsigned long pfn,
- pgprot_t prot)
-{
- pud_t *pud;
- pmd_t *pmd;
- unsigned long addr, next;
- int ret;
-
- addr = start;
- do {
- pud = pud_offset(p4d, addr);
-
- if (pud_none_or_clear_bad(pud)) {
- pmd = pmd_alloc_one(NULL, addr);
- if (!pmd) {
- kvm_err("Cannot allocate Hyp pmd\n");
- return -ENOMEM;
- }
- kvm_pud_populate(pud, pmd);
- get_page(virt_to_page(pud));
- }
-
- next = pud_addr_end(addr, end);
- ret = create_hyp_pmd_mappings(pud, addr, next, pfn, prot);
- if (ret)
- return ret;
- pfn += (next - addr) >> PAGE_SHIFT;
- } while (addr = next, addr != end);
-
- return 0;
-}
-
-static int create_hyp_p4d_mappings(pgd_t *pgd, unsigned long start,
- unsigned long end, unsigned long pfn,
- pgprot_t prot)
+static int __create_hyp_mappings(unsigned long start, unsigned long size,
+ unsigned long phys, enum kvm_pgtable_prot prot)
{
- p4d_t *p4d;
- pud_t *pud;
- unsigned long addr, next;
- int ret;
-
- addr = start;
- do {
- p4d = p4d_offset(pgd, addr);
-
- if (p4d_none(*p4d)) {
- pud = pud_alloc_one(NULL, addr);
- if (!pud) {
- kvm_err("Cannot allocate Hyp pud\n");
- return -ENOMEM;
- }
- kvm_p4d_populate(p4d, pud);
- get_page(virt_to_page(p4d));
- }
-
- next = p4d_addr_end(addr, end);
- ret = create_hyp_pud_mappings(p4d, addr, next, pfn, prot);
- if (ret)
- return ret;
- pfn += (next - addr) >> PAGE_SHIFT;
- } while (addr = next, addr != end);
-
- return 0;
-}
-
-static int __create_hyp_mappings(pgd_t *pgdp, unsigned long ptrs_per_pgd,
- unsigned long start, unsigned long end,
- unsigned long pfn, pgprot_t prot)
-{
- pgd_t *pgd;
- p4d_t *p4d;
- unsigned long addr, next;
- int err = 0;
+ int err;
mutex_lock(&kvm_hyp_pgd_mutex);
- addr = start & PAGE_MASK;
- end = PAGE_ALIGN(end);
- do {
- pgd = pgdp + kvm_pgd_index(addr, ptrs_per_pgd);
-
- if (pgd_none(*pgd)) {
- p4d = p4d_alloc_one(NULL, addr);
- if (!p4d) {
- kvm_err("Cannot allocate Hyp p4d\n");
- err = -ENOMEM;
- goto out;
- }
- kvm_pgd_populate(pgd, p4d);
- get_page(virt_to_page(pgd));
- }
-
- next = pgd_addr_end(addr, end);
- err = create_hyp_p4d_mappings(pgd, addr, next, pfn, prot);
- if (err)
- goto out;
- pfn += (next - addr) >> PAGE_SHIFT;
- } while (addr = next, addr != end);
-out:
+ err = kvm_pgtable_hyp_map(hyp_pgtable, start, size, phys, prot);
mutex_unlock(&kvm_hyp_pgd_mutex);
+
return err;
}
@@ -845,7 +220,7 @@ static phys_addr_t kvm_kaddr_to_phys(void *kaddr)
* in Hyp-mode mapping (modulo HYP_PAGE_OFFSET) to the same underlying
* physical pages.
*/
-int create_hyp_mappings(void *from, void *to, pgprot_t prot)
+int create_hyp_mappings(void *from, void *to, enum kvm_pgtable_prot prot)
{
phys_addr_t phys_addr;
unsigned long virt_addr;
@@ -862,9 +237,7 @@ int create_hyp_mappings(void *from, void *to, pgprot_t prot)
int err;
phys_addr = kvm_kaddr_to_phys(from + virt_addr - start);
- err = __create_hyp_mappings(hyp_pgd, PTRS_PER_PGD,
- virt_addr, virt_addr + PAGE_SIZE,
- __phys_to_pfn(phys_addr),
+ err = __create_hyp_mappings(virt_addr, PAGE_SIZE, phys_addr,
prot);
if (err)
return err;
@@ -874,9 +247,9 @@ int create_hyp_mappings(void *from, void *to, pgprot_t prot)
}
static int __create_hyp_private_mapping(phys_addr_t phys_addr, size_t size,
- unsigned long *haddr, pgprot_t prot)
+ unsigned long *haddr,
+ enum kvm_pgtable_prot prot)
{
- pgd_t *pgd = hyp_pgd;
unsigned long base;
int ret = 0;
@@ -908,17 +281,11 @@ static int __create_hyp_private_mapping(phys_addr_t phys_addr, size_t size,
if (ret)
goto out;
- if (__kvm_cpu_uses_extended_idmap())
- pgd = boot_hyp_pgd;
-
- ret = __create_hyp_mappings(pgd, __kvm_idmap_ptrs_per_pgd(),
- base, base + size,
- __phys_to_pfn(phys_addr), prot);
+ ret = __create_hyp_mappings(base, size, phys_addr, prot);
if (ret)
goto out;
*haddr = base + offset_in_page(phys_addr);
-
out:
return ret;
}
@@ -989,47 +356,48 @@ int create_hyp_exec_mappings(phys_addr_t phys_addr, size_t size,
* @kvm: The pointer to the KVM structure
* @mmu: The pointer to the s2 MMU structure
*
- * Allocates only the stage-2 HW PGD level table(s) of size defined by
- * stage2_pgd_size(mmu->kvm).
- *
+ * Allocates only the stage-2 HW PGD level table(s).
* Note we don't need locking here as this is only called when the VM is
* created, which can only be done once.
*/
int kvm_init_stage2_mmu(struct kvm *kvm, struct kvm_s2_mmu *mmu)
{
- phys_addr_t pgd_phys;
- pgd_t *pgd;
- int cpu;
+ int cpu, err;
+ struct kvm_pgtable *pgt;
- if (mmu->pgd != NULL) {
+ if (mmu->pgt != NULL) {
kvm_err("kvm_arch already initialized?\n");
return -EINVAL;
}
- /* Allocate the HW PGD, making sure that each page gets its own refcount */
- pgd = alloc_pages_exact(stage2_pgd_size(kvm), GFP_KERNEL | __GFP_ZERO);
- if (!pgd)
+ pgt = kzalloc(sizeof(*pgt), GFP_KERNEL);
+ if (!pgt)
return -ENOMEM;
- pgd_phys = virt_to_phys(pgd);
- if (WARN_ON(pgd_phys & ~kvm_vttbr_baddr_mask(kvm)))
- return -EINVAL;
+ err = kvm_pgtable_stage2_init(pgt, kvm);
+ if (err)
+ goto out_free_pgtable;
mmu->last_vcpu_ran = alloc_percpu(typeof(*mmu->last_vcpu_ran));
if (!mmu->last_vcpu_ran) {
- free_pages_exact(pgd, stage2_pgd_size(kvm));
- return -ENOMEM;
+ err = -ENOMEM;
+ goto out_destroy_pgtable;
}
for_each_possible_cpu(cpu)
*per_cpu_ptr(mmu->last_vcpu_ran, cpu) = -1;
mmu->kvm = kvm;
- mmu->pgd = pgd;
- mmu->pgd_phys = pgd_phys;
+ mmu->pgt = pgt;
+ mmu->pgd_phys = __pa(pgt->pgd);
mmu->vmid.vmid_gen = 0;
-
return 0;
+
+out_destroy_pgtable:
+ kvm_pgtable_stage2_destroy(pgt);
+out_free_pgtable:
+ kfree(pgt);
+ return err;
}
static void stage2_unmap_memslot(struct kvm *kvm,
@@ -1102,363 +470,21 @@ void stage2_unmap_vm(struct kvm *kvm)
void kvm_free_stage2_pgd(struct kvm_s2_mmu *mmu)
{
struct kvm *kvm = mmu->kvm;
- void *pgd = NULL;
+ struct kvm_pgtable *pgt = NULL;
spin_lock(&kvm->mmu_lock);
- if (mmu->pgd) {
- unmap_stage2_range(mmu, 0, kvm_phys_size(kvm));
- pgd = READ_ONCE(mmu->pgd);
- mmu->pgd = NULL;
- }
- spin_unlock(&kvm->mmu_lock);
-
- /* Free the HW pgd, one page at a time */
- if (pgd) {
- free_pages_exact(pgd, stage2_pgd_size(kvm));
+ pgt = mmu->pgt;
+ if (pgt) {
+ mmu->pgd_phys = 0;
+ mmu->pgt = NULL;
free_percpu(mmu->last_vcpu_ran);
}
-}
-
-static p4d_t *stage2_get_p4d(struct kvm_s2_mmu *mmu, struct kvm_mmu_memory_cache *cache,
- phys_addr_t addr)
-{
- struct kvm *kvm = mmu->kvm;
- pgd_t *pgd;
- p4d_t *p4d;
-
- pgd = mmu->pgd + stage2_pgd_index(kvm, addr);
- if (stage2_pgd_none(kvm, *pgd)) {
- if (!cache)
- return NULL;
- p4d = kvm_mmu_memory_cache_alloc(cache);
- stage2_pgd_populate(kvm, pgd, p4d);
- get_page(virt_to_page(pgd));
- }
-
- return stage2_p4d_offset(kvm, pgd, addr);
-}
-
-static pud_t *stage2_get_pud(struct kvm_s2_mmu *mmu, struct kvm_mmu_memory_cache *cache,
- phys_addr_t addr)
-{
- struct kvm *kvm = mmu->kvm;
- p4d_t *p4d;
- pud_t *pud;
-
- p4d = stage2_get_p4d(mmu, cache, addr);
- if (stage2_p4d_none(kvm, *p4d)) {
- if (!cache)
- return NULL;
- pud = kvm_mmu_memory_cache_alloc(cache);
- stage2_p4d_populate(kvm, p4d, pud);
- get_page(virt_to_page(p4d));
- }
-
- return stage2_pud_offset(kvm, p4d, addr);
-}
-
-static pmd_t *stage2_get_pmd(struct kvm_s2_mmu *mmu, struct kvm_mmu_memory_cache *cache,
- phys_addr_t addr)
-{
- struct kvm *kvm = mmu->kvm;
- pud_t *pud;
- pmd_t *pmd;
-
- pud = stage2_get_pud(mmu, cache, addr);
- if (!pud || stage2_pud_huge(kvm, *pud))
- return NULL;
-
- if (stage2_pud_none(kvm, *pud)) {
- if (!cache)
- return NULL;
- pmd = kvm_mmu_memory_cache_alloc(cache);
- stage2_pud_populate(kvm, pud, pmd);
- get_page(virt_to_page(pud));
- }
-
- return stage2_pmd_offset(kvm, pud, addr);
-}
-
-static int stage2_set_pmd_huge(struct kvm_s2_mmu *mmu,
- struct kvm_mmu_memory_cache *cache,
- phys_addr_t addr, const pmd_t *new_pmd)
-{
- pmd_t *pmd, old_pmd;
-
-retry:
- pmd = stage2_get_pmd(mmu, cache, addr);
- VM_BUG_ON(!pmd);
-
- old_pmd = *pmd;
- /*
- * Multiple vcpus faulting on the same PMD entry, can
- * lead to them sequentially updating the PMD with the
- * same value. Following the break-before-make
- * (pmd_clear() followed by tlb_flush()) process can
- * hinder forward progress due to refaults generated
- * on missing translations.
- *
- * Skip updating the page table if the entry is
- * unchanged.
- */
- if (pmd_val(old_pmd) == pmd_val(*new_pmd))
- return 0;
-
- if (pmd_present(old_pmd)) {
- /*
- * If we already have PTE level mapping for this block,
- * we must unmap it to avoid inconsistent TLB state and
- * leaking the table page. We could end up in this situation
- * if the memory slot was marked for dirty logging and was
- * reverted, leaving PTE level mappings for the pages accessed
- * during the period. So, unmap the PTE level mapping for this
- * block and retry, as we could have released the upper level
- * table in the process.
- *
- * Normal THP split/merge follows mmu_notifier callbacks and do
- * get handled accordingly.
- */
- if (!pmd_thp_or_huge(old_pmd)) {
- unmap_stage2_range(mmu, addr & S2_PMD_MASK, S2_PMD_SIZE);
- goto retry;
- }
- /*
- * Mapping in huge pages should only happen through a
- * fault. If a page is merged into a transparent huge
- * page, the individual subpages of that huge page
- * should be unmapped through MMU notifiers before we
- * get here.
- *
- * Merging of CompoundPages is not supported; they
- * should become splitting first, unmapped, merged,
- * and mapped back in on-demand.
- */
- WARN_ON_ONCE(pmd_pfn(old_pmd) != pmd_pfn(*new_pmd));
- pmd_clear(pmd);
- kvm_tlb_flush_vmid_ipa(mmu, addr, S2_PMD_LEVEL);
- } else {
- get_page(virt_to_page(pmd));
- }
-
- kvm_set_pmd(pmd, *new_pmd);
- return 0;
-}
-
-static int stage2_set_pud_huge(struct kvm_s2_mmu *mmu,
- struct kvm_mmu_memory_cache *cache,
- phys_addr_t addr, const pud_t *new_pudp)
-{
- struct kvm *kvm = mmu->kvm;
- pud_t *pudp, old_pud;
-
-retry:
- pudp = stage2_get_pud(mmu, cache, addr);
- VM_BUG_ON(!pudp);
-
- old_pud = *pudp;
-
- /*
- * A large number of vcpus faulting on the same stage 2 entry,
- * can lead to a refault due to the stage2_pud_clear()/tlb_flush().
- * Skip updating the page tables if there is no change.
- */
- if (pud_val(old_pud) == pud_val(*new_pudp))
- return 0;
-
- if (stage2_pud_present(kvm, old_pud)) {
- /*
- * If we already have table level mapping for this block, unmap
- * the range for this block and retry.
- */
- if (!stage2_pud_huge(kvm, old_pud)) {
- unmap_stage2_range(mmu, addr & S2_PUD_MASK, S2_PUD_SIZE);
- goto retry;
- }
-
- WARN_ON_ONCE(kvm_pud_pfn(old_pud) != kvm_pud_pfn(*new_pudp));
- stage2_pud_clear(kvm, pudp);
- kvm_tlb_flush_vmid_ipa(mmu, addr, S2_PUD_LEVEL);
- } else {
- get_page(virt_to_page(pudp));
- }
-
- kvm_set_pud(pudp, *new_pudp);
- return 0;
-}
-
-/*
- * stage2_get_leaf_entry - walk the stage2 VM page tables and return
- * true if a valid and present leaf-entry is found. A pointer to the
- * leaf-entry is returned in the appropriate level variable - pudpp,
- * pmdpp, ptepp.
- */
-static bool stage2_get_leaf_entry(struct kvm_s2_mmu *mmu, phys_addr_t addr,
- pud_t **pudpp, pmd_t **pmdpp, pte_t **ptepp)
-{
- struct kvm *kvm = mmu->kvm;
- pud_t *pudp;
- pmd_t *pmdp;
- pte_t *ptep;
-
- *pudpp = NULL;
- *pmdpp = NULL;
- *ptepp = NULL;
-
- pudp = stage2_get_pud(mmu, NULL, addr);
- if (!pudp || stage2_pud_none(kvm, *pudp) || !stage2_pud_present(kvm, *pudp))
- return false;
-
- if (stage2_pud_huge(kvm, *pudp)) {
- *pudpp = pudp;
- return true;
- }
-
- pmdp = stage2_pmd_offset(kvm, pudp, addr);
- if (!pmdp || pmd_none(*pmdp) || !pmd_present(*pmdp))
- return false;
-
- if (pmd_thp_or_huge(*pmdp)) {
- *pmdpp = pmdp;
- return true;
- }
-
- ptep = pte_offset_kernel(pmdp, addr);
- if (!ptep || pte_none(*ptep) || !pte_present(*ptep))
- return false;
-
- *ptepp = ptep;
- return true;
-}
-
-static bool stage2_is_exec(struct kvm_s2_mmu *mmu, phys_addr_t addr, unsigned long sz)
-{
- pud_t *pudp;
- pmd_t *pmdp;
- pte_t *ptep;
- bool found;
-
- found = stage2_get_leaf_entry(mmu, addr, &pudp, &pmdp, &ptep);
- if (!found)
- return false;
-
- if (pudp)
- return sz <= PUD_SIZE && kvm_s2pud_exec(pudp);
- else if (pmdp)
- return sz <= PMD_SIZE && kvm_s2pmd_exec(pmdp);
- else
- return sz == PAGE_SIZE && kvm_s2pte_exec(ptep);
-}
-
-static int stage2_set_pte(struct kvm_s2_mmu *mmu,
- struct kvm_mmu_memory_cache *cache,
- phys_addr_t addr, const pte_t *new_pte,
- unsigned long flags)
-{
- struct kvm *kvm = mmu->kvm;
- pud_t *pud;
- pmd_t *pmd;
- pte_t *pte, old_pte;
- bool iomap = flags & KVM_S2PTE_FLAG_IS_IOMAP;
- bool logging_active = flags & KVM_S2_FLAG_LOGGING_ACTIVE;
-
- VM_BUG_ON(logging_active && !cache);
-
- /* Create stage-2 page table mapping - Levels 0 and 1 */
- pud = stage2_get_pud(mmu, cache, addr);
- if (!pud) {
- /*
- * Ignore calls from kvm_set_spte_hva for unallocated
- * address ranges.
- */
- return 0;
- }
-
- /*
- * While dirty page logging - dissolve huge PUD, then continue
- * on to allocate page.
- */
- if (logging_active)
- stage2_dissolve_pud(mmu, addr, pud);
-
- if (stage2_pud_none(kvm, *pud)) {
- if (!cache)
- return 0; /* ignore calls from kvm_set_spte_hva */
- pmd = kvm_mmu_memory_cache_alloc(cache);
- stage2_pud_populate(kvm, pud, pmd);
- get_page(virt_to_page(pud));
- }
-
- pmd = stage2_pmd_offset(kvm, pud, addr);
- if (!pmd) {
- /*
- * Ignore calls from kvm_set_spte_hva for unallocated
- * address ranges.
- */
- return 0;
- }
-
- /*
- * While dirty page logging - dissolve huge PMD, then continue on to
- * allocate page.
- */
- if (logging_active)
- stage2_dissolve_pmd(mmu, addr, pmd);
-
- /* Create stage-2 page mappings - Level 2 */
- if (pmd_none(*pmd)) {
- if (!cache)
- return 0; /* ignore calls from kvm_set_spte_hva */
- pte = kvm_mmu_memory_cache_alloc(cache);
- kvm_pmd_populate(pmd, pte);
- get_page(virt_to_page(pmd));
- }
-
- pte = pte_offset_kernel(pmd, addr);
-
- if (iomap && pte_present(*pte))
- return -EFAULT;
-
- /* Create 2nd stage page table mapping - Level 3 */
- old_pte = *pte;
- if (pte_present(old_pte)) {
- /* Skip page table update if there is no change */
- if (pte_val(old_pte) == pte_val(*new_pte))
- return 0;
-
- kvm_set_pte(pte, __pte(0));
- kvm_tlb_flush_vmid_ipa(mmu, addr, S2_PTE_LEVEL);
- } else {
- get_page(virt_to_page(pte));
- }
-
- kvm_set_pte(pte, *new_pte);
- return 0;
-}
+ spin_unlock(&kvm->mmu_lock);
-#ifndef __HAVE_ARCH_PTEP_TEST_AND_CLEAR_YOUNG
-static int stage2_ptep_test_and_clear_young(pte_t *pte)
-{
- if (pte_young(*pte)) {
- *pte = pte_mkold(*pte);
- return 1;
+ if (pgt) {
+ kvm_pgtable_stage2_destroy(pgt);
+ kfree(pgt);
}
- return 0;
-}
-#else
-static int stage2_ptep_test_and_clear_young(pte_t *pte)
-{
- return __ptep_test_and_clear_young(pte);
-}
-#endif
-
-static int stage2_pmdp_test_and_clear_young(pmd_t *pmd)
-{
- return stage2_ptep_test_and_clear_young((pte_t *)pmd);
-}
-
-static int stage2_pudp_test_and_clear_young(pud_t *pud)
-{
- return stage2_ptep_test_and_clear_young((pte_t *)pud);
}
/**
@@ -1468,169 +494,52 @@ static int stage2_pudp_test_and_clear_young(pud_t *pud)
* @guest_ipa: The IPA at which to insert the mapping
* @pa: The physical address of the device
* @size: The size of the mapping
+ * @writable: Whether or not to create a writable mapping
*/
int kvm_phys_addr_ioremap(struct kvm *kvm, phys_addr_t guest_ipa,
phys_addr_t pa, unsigned long size, bool writable)
{
- phys_addr_t addr, end;
+ phys_addr_t addr;
int ret = 0;
- unsigned long pfn;
struct kvm_mmu_memory_cache cache = { 0, __GFP_ZERO, NULL, };
+ struct kvm_pgtable *pgt = kvm->arch.mmu.pgt;
+ enum kvm_pgtable_prot prot = KVM_PGTABLE_PROT_DEVICE |
+ KVM_PGTABLE_PROT_R |
+ (writable ? KVM_PGTABLE_PROT_W : 0);
- end = (guest_ipa + size + PAGE_SIZE - 1) & PAGE_MASK;
- pfn = __phys_to_pfn(pa);
-
- for (addr = guest_ipa; addr < end; addr += PAGE_SIZE) {
- pte_t pte = kvm_pfn_pte(pfn, PAGE_S2_DEVICE);
-
- if (writable)
- pte = kvm_s2pte_mkwrite(pte);
+ size += offset_in_page(guest_ipa);
+ guest_ipa &= PAGE_MASK;
+ for (addr = guest_ipa; addr < guest_ipa + size; addr += PAGE_SIZE) {
ret = kvm_mmu_topup_memory_cache(&cache,
kvm_mmu_cache_min_pages(kvm));
if (ret)
- goto out;
+ break;
+
spin_lock(&kvm->mmu_lock);
- ret = stage2_set_pte(&kvm->arch.mmu, &cache, addr, &pte,
- KVM_S2PTE_FLAG_IS_IOMAP);
+ ret = kvm_pgtable_stage2_map(pgt, addr, PAGE_SIZE, pa, prot,
+ &cache);
spin_unlock(&kvm->mmu_lock);
if (ret)
- goto out;
+ break;
- pfn++;
+ pa += PAGE_SIZE;
}
-out:
kvm_mmu_free_memory_cache(&cache);
return ret;
}
/**
- * stage2_wp_ptes - write protect PMD range
- * @pmd: pointer to pmd entry
- * @addr: range start address
- * @end: range end address
- */
-static void stage2_wp_ptes(pmd_t *pmd, phys_addr_t addr, phys_addr_t end)
-{
- pte_t *pte;
-
- pte = pte_offset_kernel(pmd, addr);
- do {
- if (!pte_none(*pte)) {
- if (!kvm_s2pte_readonly(pte))
- kvm_set_s2pte_readonly(pte);
- }
- } while (pte++, addr += PAGE_SIZE, addr != end);
-}
-
-/**
- * stage2_wp_pmds - write protect PUD range
- * kvm: kvm instance for the VM
- * @pud: pointer to pud entry
- * @addr: range start address
- * @end: range end address
- */
-static void stage2_wp_pmds(struct kvm_s2_mmu *mmu, pud_t *pud,
- phys_addr_t addr, phys_addr_t end)
-{
- struct kvm *kvm = mmu->kvm;
- pmd_t *pmd;
- phys_addr_t next;
-
- pmd = stage2_pmd_offset(kvm, pud, addr);
-
- do {
- next = stage2_pmd_addr_end(kvm, addr, end);
- if (!pmd_none(*pmd)) {
- if (pmd_thp_or_huge(*pmd)) {
- if (!kvm_s2pmd_readonly(pmd))
- kvm_set_s2pmd_readonly(pmd);
- } else {
- stage2_wp_ptes(pmd, addr, next);
- }
- }
- } while (pmd++, addr = next, addr != end);
-}
-
-/**
- * stage2_wp_puds - write protect P4D range
- * @p4d: pointer to p4d entry
- * @addr: range start address
- * @end: range end address
- */
-static void stage2_wp_puds(struct kvm_s2_mmu *mmu, p4d_t *p4d,
- phys_addr_t addr, phys_addr_t end)
-{
- struct kvm *kvm = mmu->kvm;
- pud_t *pud;
- phys_addr_t next;
-
- pud = stage2_pud_offset(kvm, p4d, addr);
- do {
- next = stage2_pud_addr_end(kvm, addr, end);
- if (!stage2_pud_none(kvm, *pud)) {
- if (stage2_pud_huge(kvm, *pud)) {
- if (!kvm_s2pud_readonly(pud))
- kvm_set_s2pud_readonly(pud);
- } else {
- stage2_wp_pmds(mmu, pud, addr, next);
- }
- }
- } while (pud++, addr = next, addr != end);
-}
-
-/**
- * stage2_wp_p4ds - write protect PGD range
- * @pgd: pointer to pgd entry
- * @addr: range start address
- * @end: range end address
- */
-static void stage2_wp_p4ds(struct kvm_s2_mmu *mmu, pgd_t *pgd,
- phys_addr_t addr, phys_addr_t end)
-{
- struct kvm *kvm = mmu->kvm;
- p4d_t *p4d;
- phys_addr_t next;
-
- p4d = stage2_p4d_offset(kvm, pgd, addr);
- do {
- next = stage2_p4d_addr_end(kvm, addr, end);
- if (!stage2_p4d_none(kvm, *p4d))
- stage2_wp_puds(mmu, p4d, addr, next);
- } while (p4d++, addr = next, addr != end);
-}
-
-/**
* stage2_wp_range() - write protect stage2 memory region range
- * @kvm: The KVM pointer
+ * @mmu: The KVM stage-2 MMU pointer
* @addr: Start address of range
* @end: End address of range
*/
static void stage2_wp_range(struct kvm_s2_mmu *mmu, phys_addr_t addr, phys_addr_t end)
{
struct kvm *kvm = mmu->kvm;
- pgd_t *pgd;
- phys_addr_t next;
-
- pgd = mmu->pgd + stage2_pgd_index(kvm, addr);
- do {
- /*
- * Release kvm_mmu_lock periodically if the memory region is
- * large. Otherwise, we may see kernel panics with
- * CONFIG_DETECT_HUNG_TASK, CONFIG_LOCKUP_DETECTOR,
- * CONFIG_LOCKDEP. Additionally, holding the lock too long
- * will also starve other vCPUs. We have to also make sure
- * that the page tables are not freed while we released
- * the lock.
- */
- cond_resched_lock(&kvm->mmu_lock);
- if (!READ_ONCE(mmu->pgd))
- break;
- next = stage2_pgd_addr_end(kvm, addr, end);
- if (stage2_pgd_present(kvm, *pgd))
- stage2_wp_p4ds(mmu, pgd, addr, next);
- } while (pgd++, addr = next, addr != end);
+ stage2_apply_range_resched(kvm, addr, end, kvm_pgtable_stage2_wrprotect);
}
/**
@@ -1833,20 +742,21 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa,
struct kvm_memory_slot *memslot, unsigned long hva,
unsigned long fault_status)
{
- int ret;
+ int ret = 0;
bool write_fault, writable, force_pte = false;
- bool exec_fault, needs_exec;
+ bool exec_fault;
+ bool device = false;
unsigned long mmu_seq;
- gfn_t gfn = fault_ipa >> PAGE_SHIFT;
struct kvm *kvm = vcpu->kvm;
struct kvm_mmu_memory_cache *memcache = &vcpu->arch.mmu_page_cache;
struct vm_area_struct *vma;
short vma_shift;
+ gfn_t gfn;
kvm_pfn_t pfn;
- pgprot_t mem_type = PAGE_S2;
bool logging_active = memslot_is_logging(memslot);
- unsigned long vma_pagesize, flags = 0;
- struct kvm_s2_mmu *mmu = vcpu->arch.hw_mmu;
+ unsigned long vma_pagesize;
+ enum kvm_pgtable_prot prot = KVM_PGTABLE_PROT_R;
+ struct kvm_pgtable *pgt;
write_fault = kvm_is_write_fault(vcpu);
exec_fault = kvm_vcpu_trap_is_exec_fault(vcpu);
@@ -1871,31 +781,41 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa,
else
vma_shift = PAGE_SHIFT;
- vma_pagesize = 1ULL << vma_shift;
if (logging_active ||
- (vma->vm_flags & VM_PFNMAP) ||
- !fault_supports_stage2_huge_mapping(memslot, hva, vma_pagesize)) {
+ (vma->vm_flags & VM_PFNMAP)) {
force_pte = true;
- vma_pagesize = PAGE_SIZE;
vma_shift = PAGE_SHIFT;
}
- /*
- * The stage2 has a minimum of 2 level table (For arm64 see
- * kvm_arm_setup_stage2()). Hence, we are guaranteed that we can
- * use PMD_SIZE huge mappings (even when the PMD is folded into PGD).
- * As for PUD huge maps, we must make sure that we have at least
- * 3 levels, i.e, PMD is not folded.
- */
- if (vma_pagesize == PMD_SIZE ||
- (vma_pagesize == PUD_SIZE && kvm_stage2_has_pmd(kvm)))
- gfn = (fault_ipa & huge_page_mask(hstate_vma(vma))) >> PAGE_SHIFT;
+ if (vma_shift == PUD_SHIFT &&
+ !fault_supports_stage2_huge_mapping(memslot, hva, PUD_SIZE))
+ vma_shift = PMD_SHIFT;
+
+ if (vma_shift == PMD_SHIFT &&
+ !fault_supports_stage2_huge_mapping(memslot, hva, PMD_SIZE)) {
+ force_pte = true;
+ vma_shift = PAGE_SHIFT;
+ }
+
+ vma_pagesize = 1UL << vma_shift;
+ if (vma_pagesize == PMD_SIZE || vma_pagesize == PUD_SIZE)
+ fault_ipa &= ~(vma_pagesize - 1);
+
+ gfn = fault_ipa >> PAGE_SHIFT;
mmap_read_unlock(current->mm);
- /* We need minimum second+third level pages */
- ret = kvm_mmu_topup_memory_cache(memcache, kvm_mmu_cache_min_pages(kvm));
- if (ret)
- return ret;
+ /*
+ * Permission faults just need to update the existing leaf entry,
+ * and so normally don't require allocations from the memcache. The
+ * only exception to this is when dirty logging is enabled at runtime
+ * and a write fault needs to collapse a block entry into a table.
+ */
+ if (fault_status != FSC_PERM || (logging_active && write_fault)) {
+ ret = kvm_mmu_topup_memory_cache(memcache,
+ kvm_mmu_cache_min_pages(kvm));
+ if (ret)
+ return ret;
+ }
mmu_seq = vcpu->kvm->mmu_notifier_seq;
/*
@@ -1918,28 +838,20 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa,
return -EFAULT;
if (kvm_is_device_pfn(pfn)) {
- mem_type = PAGE_S2_DEVICE;
- flags |= KVM_S2PTE_FLAG_IS_IOMAP;
- } else if (logging_active) {
- /*
- * Faults on pages in a memslot with logging enabled
- * should not be mapped with huge pages (it introduces churn
- * and performance degradation), so force a pte mapping.
- */
- flags |= KVM_S2_FLAG_LOGGING_ACTIVE;
-
+ device = true;
+ } else if (logging_active && !write_fault) {
/*
* Only actually map the page as writable if this was a write
* fault.
*/
- if (!write_fault)
- writable = false;
+ writable = false;
}
- if (exec_fault && is_iomap(flags))
+ if (exec_fault && device)
return -ENOEXEC;
spin_lock(&kvm->mmu_lock);
+ pgt = vcpu->arch.hw_mmu->pgt;
if (mmu_notifier_retry(kvm, mmu_seq))
goto out_unlock;
@@ -1950,67 +862,31 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa,
if (vma_pagesize == PAGE_SIZE && !force_pte)
vma_pagesize = transparent_hugepage_adjust(memslot, hva,
&pfn, &fault_ipa);
- if (writable)
+ if (writable) {
+ prot |= KVM_PGTABLE_PROT_W;
kvm_set_pfn_dirty(pfn);
+ mark_page_dirty(kvm, gfn);
+ }
- if (fault_status != FSC_PERM && !is_iomap(flags))
+ if (fault_status != FSC_PERM && !device)
clean_dcache_guest_page(pfn, vma_pagesize);
- if (exec_fault)
+ if (exec_fault) {
+ prot |= KVM_PGTABLE_PROT_X;
invalidate_icache_guest_page(pfn, vma_pagesize);
+ }
- /*
- * If we took an execution fault we have made the
- * icache/dcache coherent above and should now let the s2
- * mapping be executable.
- *
- * Write faults (!exec_fault && FSC_PERM) are orthogonal to
- * execute permissions, and we preserve whatever we have.
- */
- needs_exec = exec_fault ||
- (fault_status == FSC_PERM &&
- stage2_is_exec(mmu, fault_ipa, vma_pagesize));
-
- /*
- * If PUD_SIZE == PMD_SIZE, there is no real PUD level, and
- * all we have is a 2-level page table. Trying to map a PUD in
- * this case would be fatally wrong.
- */
- if (PUD_SIZE != PMD_SIZE && vma_pagesize == PUD_SIZE) {
- pud_t new_pud = kvm_pfn_pud(pfn, mem_type);
-
- new_pud = kvm_pud_mkhuge(new_pud);
- if (writable)
- new_pud = kvm_s2pud_mkwrite(new_pud);
-
- if (needs_exec)
- new_pud = kvm_s2pud_mkexec(new_pud);
-
- ret = stage2_set_pud_huge(mmu, memcache, fault_ipa, &new_pud);
- } else if (vma_pagesize == PMD_SIZE) {
- pmd_t new_pmd = kvm_pfn_pmd(pfn, mem_type);
-
- new_pmd = kvm_pmd_mkhuge(new_pmd);
-
- if (writable)
- new_pmd = kvm_s2pmd_mkwrite(new_pmd);
-
- if (needs_exec)
- new_pmd = kvm_s2pmd_mkexec(new_pmd);
+ if (device)
+ prot |= KVM_PGTABLE_PROT_DEVICE;
+ else if (cpus_have_const_cap(ARM64_HAS_CACHE_DIC))
+ prot |= KVM_PGTABLE_PROT_X;
- ret = stage2_set_pmd_huge(mmu, memcache, fault_ipa, &new_pmd);
+ if (fault_status == FSC_PERM && !(logging_active && writable)) {
+ ret = kvm_pgtable_stage2_relax_perms(pgt, fault_ipa, prot);
} else {
- pte_t new_pte = kvm_pfn_pte(pfn, mem_type);
-
- if (writable) {
- new_pte = kvm_s2pte_mkwrite(new_pte);
- mark_page_dirty(kvm, gfn);
- }
-
- if (needs_exec)
- new_pte = kvm_s2pte_mkexec(new_pte);
-
- ret = stage2_set_pte(mmu, memcache, fault_ipa, &new_pte, flags);
+ ret = kvm_pgtable_stage2_map(pgt, fault_ipa, vma_pagesize,
+ __pfn_to_phys(pfn), prot,
+ memcache);
}
out_unlock:
@@ -2020,46 +896,23 @@ out_unlock:
return ret;
}
-/*
- * Resolve the access fault by making the page young again.
- * Note that because the faulting entry is guaranteed not to be
- * cached in the TLB, we don't need to invalidate anything.
- * Only the HW Access Flag updates are supported for Stage 2 (no DBM),
- * so there is no need for atomic (pte|pmd)_mkyoung operations.
- */
+/* Resolve the access fault by making the page young again. */
static void handle_access_fault(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa)
{
- pud_t *pud;
- pmd_t *pmd;
- pte_t *pte;
- kvm_pfn_t pfn;
- bool pfn_valid = false;
+ pte_t pte;
+ kvm_pte_t kpte;
+ struct kvm_s2_mmu *mmu;
trace_kvm_access_fault(fault_ipa);
spin_lock(&vcpu->kvm->mmu_lock);
-
- if (!stage2_get_leaf_entry(vcpu->arch.hw_mmu, fault_ipa, &pud, &pmd, &pte))
- goto out;
-
- if (pud) { /* HugeTLB */
- *pud = kvm_s2pud_mkyoung(*pud);
- pfn = kvm_pud_pfn(*pud);
- pfn_valid = true;
- } else if (pmd) { /* THP, HugeTLB */
- *pmd = pmd_mkyoung(*pmd);
- pfn = pmd_pfn(*pmd);
- pfn_valid = true;
- } else {
- *pte = pte_mkyoung(*pte); /* Just a page... */
- pfn = pte_pfn(*pte);
- pfn_valid = true;
- }
-
-out:
+ mmu = vcpu->arch.hw_mmu;
+ kpte = kvm_pgtable_stage2_mkyoung(mmu->pgt, fault_ipa);
spin_unlock(&vcpu->kvm->mmu_lock);
- if (pfn_valid)
- kvm_set_pfn_accessed(pfn);
+
+ pte = __pte(kpte);
+ if (pte_valid(pte))
+ kvm_set_pfn_accessed(pte_pfn(pte));
}
/**
@@ -2230,7 +1083,7 @@ static int kvm_unmap_hva_handler(struct kvm *kvm, gpa_t gpa, u64 size, void *dat
int kvm_unmap_hva_range(struct kvm *kvm,
unsigned long start, unsigned long end, unsigned flags)
{
- if (!kvm->arch.mmu.pgd)
+ if (!kvm->arch.mmu.pgt)
return 0;
trace_kvm_unmap_hva_range(start, end);
@@ -2240,28 +1093,27 @@ int kvm_unmap_hva_range(struct kvm *kvm,
static int kvm_set_spte_handler(struct kvm *kvm, gpa_t gpa, u64 size, void *data)
{
- pte_t *pte = (pte_t *)data;
+ kvm_pfn_t *pfn = (kvm_pfn_t *)data;
WARN_ON(size != PAGE_SIZE);
+
/*
- * We can always call stage2_set_pte with KVM_S2PTE_FLAG_LOGGING_ACTIVE
- * flag clear because MMU notifiers will have unmapped a huge PMD before
- * calling ->change_pte() (which in turn calls kvm_set_spte_hva()) and
- * therefore stage2_set_pte() never needs to clear out a huge PMD
- * through this calling path.
+ * The MMU notifiers will have unmapped a huge PMD before calling
+ * ->change_pte() (which in turn calls kvm_set_spte_hva()) and
+ * therefore we never need to clear out a huge PMD through this
+ * calling path and a memcache is not required.
*/
- stage2_set_pte(&kvm->arch.mmu, NULL, gpa, pte, 0);
+ kvm_pgtable_stage2_map(kvm->arch.mmu.pgt, gpa, PAGE_SIZE,
+ __pfn_to_phys(*pfn), KVM_PGTABLE_PROT_R, NULL);
return 0;
}
-
int kvm_set_spte_hva(struct kvm *kvm, unsigned long hva, pte_t pte)
{
unsigned long end = hva + PAGE_SIZE;
kvm_pfn_t pfn = pte_pfn(pte);
- pte_t stage2_pte;
- if (!kvm->arch.mmu.pgd)
+ if (!kvm->arch.mmu.pgt)
return 0;
trace_kvm_set_spte_hva(hva);
@@ -2271,51 +1123,30 @@ int kvm_set_spte_hva(struct kvm *kvm, unsigned long hva, pte_t pte)
* just like a translation fault and clean the cache to the PoC.
*/
clean_dcache_guest_page(pfn, PAGE_SIZE);
- stage2_pte = kvm_pfn_pte(pfn, PAGE_S2);
- handle_hva_to_gpa(kvm, hva, end, &kvm_set_spte_handler, &stage2_pte);
-
+ handle_hva_to_gpa(kvm, hva, end, &kvm_set_spte_handler, &pfn);
return 0;
}
static int kvm_age_hva_handler(struct kvm *kvm, gpa_t gpa, u64 size, void *data)
{
- pud_t *pud;
- pmd_t *pmd;
- pte_t *pte;
+ pte_t pte;
+ kvm_pte_t kpte;
WARN_ON(size != PAGE_SIZE && size != PMD_SIZE && size != PUD_SIZE);
- if (!stage2_get_leaf_entry(&kvm->arch.mmu, gpa, &pud, &pmd, &pte))
- return 0;
-
- if (pud)
- return stage2_pudp_test_and_clear_young(pud);
- else if (pmd)
- return stage2_pmdp_test_and_clear_young(pmd);
- else
- return stage2_ptep_test_and_clear_young(pte);
+ kpte = kvm_pgtable_stage2_mkold(kvm->arch.mmu.pgt, gpa);
+ pte = __pte(kpte);
+ return pte_valid(pte) && pte_young(pte);
}
static int kvm_test_age_hva_handler(struct kvm *kvm, gpa_t gpa, u64 size, void *data)
{
- pud_t *pud;
- pmd_t *pmd;
- pte_t *pte;
-
WARN_ON(size != PAGE_SIZE && size != PMD_SIZE && size != PUD_SIZE);
- if (!stage2_get_leaf_entry(&kvm->arch.mmu, gpa, &pud, &pmd, &pte))
- return 0;
-
- if (pud)
- return kvm_s2pud_young(*pud);
- else if (pmd)
- return pmd_young(*pmd);
- else
- return pte_young(*pte);
+ return kvm_pgtable_stage2_is_young(kvm->arch.mmu.pgt, gpa);
}
int kvm_age_hva(struct kvm *kvm, unsigned long start, unsigned long end)
{
- if (!kvm->arch.mmu.pgd)
+ if (!kvm->arch.mmu.pgt)
return 0;
trace_kvm_age_hva(start, end);
return handle_hva_to_gpa(kvm, start, end, kvm_age_hva_handler, NULL);
@@ -2323,24 +1154,16 @@ int kvm_age_hva(struct kvm *kvm, unsigned long start, unsigned long end)
int kvm_test_age_hva(struct kvm *kvm, unsigned long hva)
{
- if (!kvm->arch.mmu.pgd)
+ if (!kvm->arch.mmu.pgt)
return 0;
trace_kvm_test_age_hva(hva);
return handle_hva_to_gpa(kvm, hva, hva + PAGE_SIZE,
kvm_test_age_hva_handler, NULL);
}
-void kvm_mmu_free_memory_caches(struct kvm_vcpu *vcpu)
-{
- kvm_mmu_free_memory_cache(&vcpu->arch.mmu_page_cache);
-}
-
phys_addr_t kvm_mmu_get_httbr(void)
{
- if (__kvm_cpu_uses_extended_idmap())
- return virt_to_phys(merged_hyp_pgd);
- else
- return virt_to_phys(hyp_pgd);
+ return __pa(hyp_pgtable->pgd);
}
phys_addr_t kvm_get_idmap_vector(void)
@@ -2348,15 +1171,11 @@ phys_addr_t kvm_get_idmap_vector(void)
return hyp_idmap_vector;
}
-static int kvm_map_idmap_text(pgd_t *pgd)
+static int kvm_map_idmap_text(void)
{
- int err;
-
- /* Create the idmap in the boot page tables */
- err = __create_hyp_mappings(pgd, __kvm_idmap_ptrs_per_pgd(),
- hyp_idmap_start, hyp_idmap_end,
- __phys_to_pfn(hyp_idmap_start),
- PAGE_HYP_EXEC);
+ unsigned long size = hyp_idmap_end - hyp_idmap_start;
+ int err = __create_hyp_mappings(hyp_idmap_start, size, hyp_idmap_start,
+ PAGE_HYP_EXEC);
if (err)
kvm_err("Failed to idmap %lx-%lx\n",
hyp_idmap_start, hyp_idmap_end);
@@ -2367,6 +1186,7 @@ static int kvm_map_idmap_text(pgd_t *pgd)
int kvm_mmu_init(void)
{
int err;
+ u32 hyp_va_bits;
hyp_idmap_start = __pa_symbol(__hyp_idmap_text_start);
hyp_idmap_start = ALIGN_DOWN(hyp_idmap_start, PAGE_SIZE);
@@ -2380,6 +1200,8 @@ int kvm_mmu_init(void)
*/
BUG_ON((hyp_idmap_start ^ (hyp_idmap_end - 1)) & PAGE_MASK);
+ hyp_va_bits = 64 - ((idmap_t0sz & TCR_T0SZ_MASK) >> TCR_T0SZ_OFFSET);
+ kvm_debug("Using %u-bit virtual addresses at EL2\n", hyp_va_bits);
kvm_debug("IDMAP page: %lx\n", hyp_idmap_start);
kvm_debug("HYP VA range: %lx:%lx\n",
kern_hyp_va(PAGE_OFFSET),
@@ -2397,43 +1219,30 @@ int kvm_mmu_init(void)
goto out;
}
- hyp_pgd = (pgd_t *)__get_free_pages(GFP_KERNEL | __GFP_ZERO, hyp_pgd_order);
- if (!hyp_pgd) {
- kvm_err("Hyp mode PGD not allocated\n");
+ hyp_pgtable = kzalloc(sizeof(*hyp_pgtable), GFP_KERNEL);
+ if (!hyp_pgtable) {
+ kvm_err("Hyp mode page-table not allocated\n");
err = -ENOMEM;
goto out;
}
- if (__kvm_cpu_uses_extended_idmap()) {
- boot_hyp_pgd = (pgd_t *)__get_free_pages(GFP_KERNEL | __GFP_ZERO,
- hyp_pgd_order);
- if (!boot_hyp_pgd) {
- kvm_err("Hyp boot PGD not allocated\n");
- err = -ENOMEM;
- goto out;
- }
-
- err = kvm_map_idmap_text(boot_hyp_pgd);
- if (err)
- goto out;
+ err = kvm_pgtable_hyp_init(hyp_pgtable, hyp_va_bits);
+ if (err)
+ goto out_free_pgtable;
- merged_hyp_pgd = (pgd_t *)__get_free_page(GFP_KERNEL | __GFP_ZERO);
- if (!merged_hyp_pgd) {
- kvm_err("Failed to allocate extra HYP pgd\n");
- goto out;
- }
- __kvm_extend_hypmap(boot_hyp_pgd, hyp_pgd, merged_hyp_pgd,
- hyp_idmap_start);
- } else {
- err = kvm_map_idmap_text(hyp_pgd);
- if (err)
- goto out;
- }
+ err = kvm_map_idmap_text();
+ if (err)
+ goto out_destroy_pgtable;
io_map_base = hyp_idmap_start;
return 0;
+
+out_destroy_pgtable:
+ kvm_pgtable_hyp_destroy(hyp_pgtable);
+out_free_pgtable:
+ kfree(hyp_pgtable);
+ hyp_pgtable = NULL;
out:
- free_hyp_pgds();
return err;
}
@@ -2537,7 +1346,7 @@ int kvm_arch_prepare_memory_region(struct kvm *kvm,
spin_lock(&kvm->mmu_lock);
if (ret)
unmap_stage2_range(&kvm->arch.mmu, mem->guest_phys_addr, mem->memory_size);
- else
+ else if (!cpus_have_final_cap(ARM64_HAS_STAGE2_FWB))
stage2_flush_memslot(kvm, memslot);
spin_unlock(&kvm->mmu_lock);
out:
diff --git a/arch/arm64/kvm/pmu-emul.c b/arch/arm64/kvm/pmu-emul.c
index 81916e360b1e..2ed5ef8f274b 100644
--- a/arch/arm64/kvm/pmu-emul.c
+++ b/arch/arm64/kvm/pmu-emul.c
@@ -20,6 +20,21 @@ static void kvm_pmu_stop_counter(struct kvm_vcpu *vcpu, struct kvm_pmc *pmc);
#define PERF_ATTR_CFG1_KVM_PMU_CHAINED 0x1
+static u32 kvm_pmu_event_mask(struct kvm *kvm)
+{
+ switch (kvm->arch.pmuver) {
+ case 1: /* ARMv8.0 */
+ return GENMASK(9, 0);
+ case 4: /* ARMv8.1 */
+ case 5: /* ARMv8.4 */
+ case 6: /* ARMv8.5 */
+ return GENMASK(15, 0);
+ default: /* Shouldn't be here, just for sanity */
+ WARN_ONCE(1, "Unknown PMU version %d\n", kvm->arch.pmuver);
+ return 0;
+ }
+}
+
/**
* kvm_pmu_idx_is_64bit - determine if select_idx is a 64bit counter
* @vcpu: The vcpu pointer
@@ -100,7 +115,7 @@ static bool kvm_pmu_idx_has_chain_evtype(struct kvm_vcpu *vcpu, u64 select_idx)
return false;
reg = PMEVTYPER0_EL0 + select_idx;
- eventsel = __vcpu_sys_reg(vcpu, reg) & ARMV8_PMU_EVTYPE_EVENT;
+ eventsel = __vcpu_sys_reg(vcpu, reg) & kvm_pmu_event_mask(vcpu->kvm);
return eventsel == ARMV8_PMUV3_PERFCTR_CHAIN;
}
@@ -516,7 +531,7 @@ void kvm_pmu_software_increment(struct kvm_vcpu *vcpu, u64 val)
/* PMSWINC only applies to ... SW_INC! */
type = __vcpu_sys_reg(vcpu, PMEVTYPER0_EL0 + i);
- type &= ARMV8_PMU_EVTYPE_EVENT;
+ type &= kvm_pmu_event_mask(vcpu->kvm);
if (type != ARMV8_PMUV3_PERFCTR_SW_INCR)
continue;
@@ -599,11 +614,21 @@ static void kvm_pmu_create_perf_event(struct kvm_vcpu *vcpu, u64 select_idx)
data = __vcpu_sys_reg(vcpu, reg);
kvm_pmu_stop_counter(vcpu, pmc);
- eventsel = data & ARMV8_PMU_EVTYPE_EVENT;
+ if (pmc->idx == ARMV8_PMU_CYCLE_IDX)
+ eventsel = ARMV8_PMUV3_PERFCTR_CPU_CYCLES;
+ else
+ eventsel = data & kvm_pmu_event_mask(vcpu->kvm);
+
+ /* Software increment event doesn't need to be backed by a perf event */
+ if (eventsel == ARMV8_PMUV3_PERFCTR_SW_INCR)
+ return;
- /* Software increment event does't need to be backed by a perf event */
- if (eventsel == ARMV8_PMUV3_PERFCTR_SW_INCR &&
- pmc->idx != ARMV8_PMU_CYCLE_IDX)
+ /*
+ * If we have a filter in place and that the event isn't allowed, do
+ * not install a perf event either.
+ */
+ if (vcpu->kvm->arch.pmu_filter &&
+ !test_bit(eventsel, vcpu->kvm->arch.pmu_filter))
return;
memset(&attr, 0, sizeof(struct perf_event_attr));
@@ -615,8 +640,7 @@ static void kvm_pmu_create_perf_event(struct kvm_vcpu *vcpu, u64 select_idx)
attr.exclude_kernel = data & ARMV8_PMU_EXCLUDE_EL1 ? 1 : 0;
attr.exclude_hv = 1; /* Don't count EL2 events */
attr.exclude_host = 1; /* Don't count host events */
- attr.config = (pmc->idx == ARMV8_PMU_CYCLE_IDX) ?
- ARMV8_PMUV3_PERFCTR_CPU_CYCLES : eventsel;
+ attr.config = eventsel;
counter = kvm_pmu_get_pair_counter_value(vcpu, pmc);
@@ -700,17 +724,95 @@ static void kvm_pmu_update_pmc_chained(struct kvm_vcpu *vcpu, u64 select_idx)
void kvm_pmu_set_counter_event_type(struct kvm_vcpu *vcpu, u64 data,
u64 select_idx)
{
- u64 reg, event_type = data & ARMV8_PMU_EVTYPE_MASK;
+ u64 reg, mask;
+
+ mask = ARMV8_PMU_EVTYPE_MASK;
+ mask &= ~ARMV8_PMU_EVTYPE_EVENT;
+ mask |= kvm_pmu_event_mask(vcpu->kvm);
reg = (select_idx == ARMV8_PMU_CYCLE_IDX)
? PMCCFILTR_EL0 : PMEVTYPER0_EL0 + select_idx;
- __vcpu_sys_reg(vcpu, reg) = event_type;
+ __vcpu_sys_reg(vcpu, reg) = data & mask;
kvm_pmu_update_pmc_chained(vcpu, select_idx);
kvm_pmu_create_perf_event(vcpu, select_idx);
}
+static int kvm_pmu_probe_pmuver(void)
+{
+ struct perf_event_attr attr = { };
+ struct perf_event *event;
+ struct arm_pmu *pmu;
+ int pmuver = 0xf;
+
+ /*
+ * Create a dummy event that only counts user cycles. As we'll never
+ * leave this function with the event being live, it will never
+ * count anything. But it allows us to probe some of the PMU
+ * details. Yes, this is terrible.
+ */
+ attr.type = PERF_TYPE_RAW;
+ attr.size = sizeof(attr);
+ attr.pinned = 1;
+ attr.disabled = 0;
+ attr.exclude_user = 0;
+ attr.exclude_kernel = 1;
+ attr.exclude_hv = 1;
+ attr.exclude_host = 1;
+ attr.config = ARMV8_PMUV3_PERFCTR_CPU_CYCLES;
+ attr.sample_period = GENMASK(63, 0);
+
+ event = perf_event_create_kernel_counter(&attr, -1, current,
+ kvm_pmu_perf_overflow, &attr);
+
+ if (IS_ERR(event)) {
+ pr_err_once("kvm: pmu event creation failed %ld\n",
+ PTR_ERR(event));
+ return 0xf;
+ }
+
+ if (event->pmu) {
+ pmu = to_arm_pmu(event->pmu);
+ if (pmu->pmuver)
+ pmuver = pmu->pmuver;
+ }
+
+ perf_event_disable(event);
+ perf_event_release_kernel(event);
+
+ return pmuver;
+}
+
+u64 kvm_pmu_get_pmceid(struct kvm_vcpu *vcpu, bool pmceid1)
+{
+ unsigned long *bmap = vcpu->kvm->arch.pmu_filter;
+ u64 val, mask = 0;
+ int base, i;
+
+ if (!pmceid1) {
+ val = read_sysreg(pmceid0_el0);
+ base = 0;
+ } else {
+ val = read_sysreg(pmceid1_el0);
+ base = 32;
+ }
+
+ if (!bmap)
+ return val;
+
+ for (i = 0; i < 32; i += 8) {
+ u64 byte;
+
+ byte = bitmap_get_value8(bmap, base + i);
+ mask |= byte << i;
+ byte = bitmap_get_value8(bmap, 0x4000 + base + i);
+ mask |= byte << (32 + i);
+ }
+
+ return val & mask;
+}
+
bool kvm_arm_support_pmu_v3(void)
{
/*
@@ -756,15 +858,6 @@ int kvm_arm_pmu_v3_enable(struct kvm_vcpu *vcpu)
static int kvm_arm_pmu_v3_init(struct kvm_vcpu *vcpu)
{
- if (!kvm_arm_support_pmu_v3())
- return -ENODEV;
-
- if (!test_bit(KVM_ARM_VCPU_PMU_V3, vcpu->arch.features))
- return -ENXIO;
-
- if (vcpu->arch.pmu.created)
- return -EBUSY;
-
if (irqchip_in_kernel(vcpu->kvm)) {
int ret;
@@ -820,6 +913,19 @@ static bool pmu_irq_is_valid(struct kvm *kvm, int irq)
int kvm_arm_pmu_v3_set_attr(struct kvm_vcpu *vcpu, struct kvm_device_attr *attr)
{
+ if (!kvm_arm_support_pmu_v3() ||
+ !test_bit(KVM_ARM_VCPU_PMU_V3, vcpu->arch.features))
+ return -ENODEV;
+
+ if (vcpu->arch.pmu.created)
+ return -EBUSY;
+
+ if (!vcpu->kvm->arch.pmuver)
+ vcpu->kvm->arch.pmuver = kvm_pmu_probe_pmuver();
+
+ if (vcpu->kvm->arch.pmuver == 0xf)
+ return -ENODEV;
+
switch (attr->attr) {
case KVM_ARM_VCPU_PMU_V3_IRQ: {
int __user *uaddr = (int __user *)(long)attr->addr;
@@ -828,9 +934,6 @@ int kvm_arm_pmu_v3_set_attr(struct kvm_vcpu *vcpu, struct kvm_device_attr *attr)
if (!irqchip_in_kernel(vcpu->kvm))
return -EINVAL;
- if (!test_bit(KVM_ARM_VCPU_PMU_V3, vcpu->arch.features))
- return -ENODEV;
-
if (get_user(irq, uaddr))
return -EFAULT;
@@ -848,6 +951,53 @@ int kvm_arm_pmu_v3_set_attr(struct kvm_vcpu *vcpu, struct kvm_device_attr *attr)
vcpu->arch.pmu.irq_num = irq;
return 0;
}
+ case KVM_ARM_VCPU_PMU_V3_FILTER: {
+ struct kvm_pmu_event_filter __user *uaddr;
+ struct kvm_pmu_event_filter filter;
+ int nr_events;
+
+ nr_events = kvm_pmu_event_mask(vcpu->kvm) + 1;
+
+ uaddr = (struct kvm_pmu_event_filter __user *)(long)attr->addr;
+
+ if (copy_from_user(&filter, uaddr, sizeof(filter)))
+ return -EFAULT;
+
+ if (((u32)filter.base_event + filter.nevents) > nr_events ||
+ (filter.action != KVM_PMU_EVENT_ALLOW &&
+ filter.action != KVM_PMU_EVENT_DENY))
+ return -EINVAL;
+
+ mutex_lock(&vcpu->kvm->lock);
+
+ if (!vcpu->kvm->arch.pmu_filter) {
+ vcpu->kvm->arch.pmu_filter = bitmap_alloc(nr_events, GFP_KERNEL);
+ if (!vcpu->kvm->arch.pmu_filter) {
+ mutex_unlock(&vcpu->kvm->lock);
+ return -ENOMEM;
+ }
+
+ /*
+ * The default depends on the first applied filter.
+ * If it allows events, the default is to deny.
+ * Conversely, if the first filter denies a set of
+ * events, the default is to allow.
+ */
+ if (filter.action == KVM_PMU_EVENT_ALLOW)
+ bitmap_zero(vcpu->kvm->arch.pmu_filter, nr_events);
+ else
+ bitmap_fill(vcpu->kvm->arch.pmu_filter, nr_events);
+ }
+
+ if (filter.action == KVM_PMU_EVENT_ALLOW)
+ bitmap_set(vcpu->kvm->arch.pmu_filter, filter.base_event, filter.nevents);
+ else
+ bitmap_clear(vcpu->kvm->arch.pmu_filter, filter.base_event, filter.nevents);
+
+ mutex_unlock(&vcpu->kvm->lock);
+
+ return 0;
+ }
case KVM_ARM_VCPU_PMU_V3_INIT:
return kvm_arm_pmu_v3_init(vcpu);
}
@@ -884,6 +1034,7 @@ int kvm_arm_pmu_v3_has_attr(struct kvm_vcpu *vcpu, struct kvm_device_attr *attr)
switch (attr->attr) {
case KVM_ARM_VCPU_PMU_V3_IRQ:
case KVM_ARM_VCPU_PMU_V3_INIT:
+ case KVM_ARM_VCPU_PMU_V3_FILTER:
if (kvm_arm_support_pmu_v3() &&
test_bit(KVM_ARM_VCPU_PMU_V3, vcpu->arch.features))
return 0;
diff --git a/arch/arm64/kvm/pmu.c b/arch/arm64/kvm/pmu.c
index 3c224162b3dd..faf32a44ba04 100644
--- a/arch/arm64/kvm/pmu.c
+++ b/arch/arm64/kvm/pmu.c
@@ -31,9 +31,9 @@ static bool kvm_pmu_switch_needed(struct perf_event_attr *attr)
*/
void kvm_set_pmu_events(u32 set, struct perf_event_attr *attr)
{
- struct kvm_host_data *ctx = this_cpu_ptr(&kvm_host_data);
+ struct kvm_host_data *ctx = this_cpu_ptr_hyp_sym(kvm_host_data);
- if (!kvm_pmu_switch_needed(attr))
+ if (!ctx || !kvm_pmu_switch_needed(attr))
return;
if (!attr->exclude_host)
@@ -47,7 +47,10 @@ void kvm_set_pmu_events(u32 set, struct perf_event_attr *attr)
*/
void kvm_clr_pmu_events(u32 clr)
{
- struct kvm_host_data *ctx = this_cpu_ptr(&kvm_host_data);
+ struct kvm_host_data *ctx = this_cpu_ptr_hyp_sym(kvm_host_data);
+
+ if (!ctx)
+ return;
ctx->pmu_events.events_host &= ~clr;
ctx->pmu_events.events_guest &= ~clr;
@@ -173,7 +176,7 @@ void kvm_vcpu_pmu_restore_guest(struct kvm_vcpu *vcpu)
return;
preempt_disable();
- host = this_cpu_ptr(&kvm_host_data);
+ host = this_cpu_ptr_hyp_sym(kvm_host_data);
events_guest = host->pmu_events.events_guest;
events_host = host->pmu_events.events_host;
@@ -193,7 +196,7 @@ void kvm_vcpu_pmu_restore_host(struct kvm_vcpu *vcpu)
if (!has_vhe())
return;
- host = this_cpu_ptr(&kvm_host_data);
+ host = this_cpu_ptr_hyp_sym(kvm_host_data);
events_guest = host->pmu_events.events_guest;
events_host = host->pmu_events.events_host;
diff --git a/arch/arm64/kvm/reset.c b/arch/arm64/kvm/reset.c
index f6e8b4a75cbb..f32490229a4c 100644
--- a/arch/arm64/kvm/reset.c
+++ b/arch/arm64/kvm/reset.c
@@ -335,7 +335,7 @@ u32 get_kvm_ipa_limit(void)
int kvm_set_ipa_limit(void)
{
- unsigned int ipa_max, pa_max, va_max, parange, tgran_2;
+ unsigned int parange, tgran_2;
u64 mmfr0;
mmfr0 = read_sanitised_ftr_reg(SYS_ID_AA64MMFR0_EL1);
@@ -372,39 +372,11 @@ int kvm_set_ipa_limit(void)
break;
}
- pa_max = id_aa64mmfr0_parange_to_phys_shift(parange);
-
- /* Clamp the IPA limit to the PA size supported by the kernel */
- ipa_max = (pa_max > PHYS_MASK_SHIFT) ? PHYS_MASK_SHIFT : pa_max;
- /*
- * Since our stage2 table is dependent on the stage1 page table code,
- * we must always honor the following condition:
- *
- * Number of levels in Stage1 >= Number of levels in Stage2.
- *
- * So clamp the ipa limit further down to limit the number of levels.
- * Since we can concatenate upto 16 tables at entry level, we could
- * go upto 4bits above the maximum VA addressable with the current
- * number of levels.
- */
- va_max = PGDIR_SHIFT + PAGE_SHIFT - 3;
- va_max += 4;
-
- if (va_max < ipa_max)
- ipa_max = va_max;
-
- /*
- * If the final limit is lower than the real physical address
- * limit of the CPUs, report the reason.
- */
- if (ipa_max < pa_max)
- pr_info("kvm: Limiting the IPA size due to kernel %s Address limit\n",
- (va_max < pa_max) ? "Virtual" : "Physical");
-
- WARN(ipa_max < KVM_PHYS_SHIFT,
- "KVM IPA limit (%d bit) is smaller than default size\n", ipa_max);
- kvm_ipa_limit = ipa_max;
- kvm_info("IPA Size Limit: %dbits\n", kvm_ipa_limit);
+ kvm_ipa_limit = id_aa64mmfr0_parange_to_phys_shift(parange);
+ WARN(kvm_ipa_limit < KVM_PHYS_SHIFT,
+ "KVM IPA Size Limit (%d bits) is smaller than default size\n",
+ kvm_ipa_limit);
+ kvm_info("IPA Size Limit: %d bits\n", kvm_ipa_limit);
return 0;
}
diff --git a/arch/arm64/kvm/sys_regs.c b/arch/arm64/kvm/sys_regs.c
index 9ca270603980..d9117bc56237 100644
--- a/arch/arm64/kvm/sys_regs.c
+++ b/arch/arm64/kvm/sys_regs.c
@@ -769,10 +769,7 @@ static bool access_pmceid(struct kvm_vcpu *vcpu, struct sys_reg_params *p,
if (pmu_access_el0_disabled(vcpu))
return false;
- if (!(p->Op2 & 1))
- pmceid = read_sysreg(pmceid0_el0);
- else
- pmceid = read_sysreg(pmceid1_el0);
+ pmceid = kvm_pmu_get_pmceid(vcpu, (p->Op2 & 1));
p->regval = pmceid;
diff --git a/arch/arm64/kvm/vgic/vgic-debug.c b/arch/arm64/kvm/vgic/vgic-debug.c
index b13a9e3f99dd..f38c40a76251 100644
--- a/arch/arm64/kvm/vgic/vgic-debug.c
+++ b/arch/arm64/kvm/vgic/vgic-debug.c
@@ -260,34 +260,14 @@ static int vgic_debug_show(struct seq_file *s, void *v)
return 0;
}
-static const struct seq_operations vgic_debug_seq_ops = {
+static const struct seq_operations vgic_debug_sops = {
.start = vgic_debug_start,
.next = vgic_debug_next,
.stop = vgic_debug_stop,
.show = vgic_debug_show
};
-static int debug_open(struct inode *inode, struct file *file)
-{
- int ret;
- ret = seq_open(file, &vgic_debug_seq_ops);
- if (!ret) {
- struct seq_file *seq;
- /* seq_open will have modified file->private_data */
- seq = file->private_data;
- seq->private = inode->i_private;
- }
-
- return ret;
-};
-
-static const struct file_operations vgic_debug_fops = {
- .owner = THIS_MODULE,
- .open = debug_open,
- .read = seq_read,
- .llseek = seq_lseek,
- .release = seq_release
-};
+DEFINE_SEQ_ATTRIBUTE(vgic_debug);
void vgic_debug_init(struct kvm *kvm)
{
diff --git a/arch/arm64/kvm/vgic/vgic-v3.c b/arch/arm64/kvm/vgic/vgic-v3.c
index 76e2d85789ed..9cdf39a94a63 100644
--- a/arch/arm64/kvm/vgic/vgic-v3.c
+++ b/arch/arm64/kvm/vgic/vgic-v3.c
@@ -662,7 +662,7 @@ void vgic_v3_load(struct kvm_vcpu *vcpu)
if (likely(cpu_if->vgic_sre))
kvm_call_hyp(__vgic_v3_write_vmcr, cpu_if->vgic_vmcr);
- kvm_call_hyp(__vgic_v3_restore_aprs, kern_hyp_va(cpu_if));
+ kvm_call_hyp(__vgic_v3_restore_aprs, cpu_if);
if (has_vhe())
__vgic_v3_activate_traps(cpu_if);
@@ -686,7 +686,7 @@ void vgic_v3_put(struct kvm_vcpu *vcpu)
vgic_v3_vmcr_sync(vcpu);
- kvm_call_hyp(__vgic_v3_save_aprs, kern_hyp_va(cpu_if));
+ kvm_call_hyp(__vgic_v3_save_aprs, cpu_if);
if (has_vhe())
__vgic_v3_deactivate_traps(cpu_if);