aboutsummaryrefslogtreecommitdiffstats
path: root/arch
diff options
context:
space:
mode:
authorLinus Torvalds <torvalds@linux-foundation.org>2021-06-28 15:40:51 -0700
committerLinus Torvalds <torvalds@linux-foundation.org>2021-06-28 15:40:51 -0700
commit36824f198c621cebeb22966b5e244378fa341295 (patch)
treeee1e358a4ed0cd022ae12b4b7ba1fa3d0e5746d5 /arch
parentMerge tag 'arm64-upstream' of git://git.kernel.org/pub/scm/linux/kernel/git/arm64/linux (diff)
parentMerge tag 'kvmarm-5.14' of git://git.kernel.org/pub/scm/linux/kernel/git/kvmarm/kvmarm into HEAD (diff)
downloadlinux-dev-36824f198c621cebeb22966b5e244378fa341295.tar.xz
linux-dev-36824f198c621cebeb22966b5e244378fa341295.zip
Merge tag 'for-linus' of git://git.kernel.org/pub/scm/virt/kvm/kvm
Pull kvm updates from Paolo Bonzini: "This covers all architectures (except MIPS) so I don't expect any other feature pull requests this merge window. ARM: - Add MTE support in guests, complete with tag save/restore interface - Reduce the impact of CMOs by moving them in the page-table code - Allow device block mappings at stage-2 - Reduce the footprint of the vmemmap in protected mode - Support the vGIC on dumb systems such as the Apple M1 - Add selftest infrastructure to support multiple configuration and apply that to PMU/non-PMU setups - Add selftests for the debug architecture - The usual crop of PMU fixes PPC: - Support for the H_RPT_INVALIDATE hypercall - Conversion of Book3S entry/exit to C - Bug fixes S390: - new HW facilities for guests - make inline assembly more robust with KASAN and co x86: - Allow userspace to handle emulation errors (unknown instructions) - Lazy allocation of the rmap (host physical -> guest physical address) - Support for virtualizing TSC scaling on VMX machines - Optimizations to avoid shattering huge pages at the beginning of live migration - Support for initializing the PDPTRs without loading them from memory - Many TLB flushing cleanups - Refuse to load if two-stage paging is available but NX is not (this has been a requirement in practice for over a year) - A large series that separates the MMU mode (WP/SMAP/SMEP etc.) from CR0/CR4/EFER, using the MMU mode everywhere once it is computed from the CPU registers - Use PM notifier to notify the guest about host suspend or hibernate - Support for passing arguments to Hyper-V hypercalls using XMM registers - Support for Hyper-V TLB flush hypercalls and enlightened MSR bitmap on AMD processors - Hide Hyper-V hypercalls that are not included in the guest CPUID - Fixes for live migration of virtual machines that use the Hyper-V "enlightened VMCS" optimization of nested virtualization - Bugfixes (not many) Generic: - Support for retrieving statistics without debugfs - Cleanups for the KVM selftests API" * tag 'for-linus' of git://git.kernel.org/pub/scm/virt/kvm/kvm: (314 commits) KVM: x86: rename apic_access_page_done to apic_access_memslot_enabled kvm: x86: disable the narrow guest module parameter on unload selftests: kvm: Allows userspace to handle emulation errors. kvm: x86: Allow userspace to handle emulation errors KVM: x86/mmu: Let guest use GBPAGES if supported in hardware and TDP is on KVM: x86/mmu: Get CR4.SMEP from MMU, not vCPU, in shadow page fault KVM: x86/mmu: Get CR0.WP from MMU, not vCPU, in shadow page fault KVM: x86/mmu: Drop redundant rsvd bits reset for nested NPT KVM: x86/mmu: Optimize and clean up so called "last nonleaf level" logic KVM: x86: Enhance comments for MMU roles and nested transition trickiness KVM: x86/mmu: WARN on any reserved SPTE value when making a valid SPTE KVM: x86/mmu: Add helpers to do full reserved SPTE checks w/ generic MMU KVM: x86/mmu: Use MMU's role to determine PTTYPE KVM: x86/mmu: Collapse 32-bit PAE and 64-bit statements for helpers KVM: x86/mmu: Add a helper to calculate root from role_regs KVM: x86/mmu: Add helper to update paging metadata KVM: x86/mmu: Don't update nested guest's paging bitmasks if CR0.PG=0 KVM: x86/mmu: Consolidate reset_rsvds_bits_mask() calls KVM: x86/mmu: Use MMU role_regs to get LA57, and drop vCPU LA57 helper KVM: x86/mmu: Get nested MMU's root level from the MMU's role ...
Diffstat (limited to 'arch')
-rw-r--r--arch/arm64/include/asm/kvm_arm.h3
-rw-r--r--arch/arm64/include/asm/kvm_emulate.h3
-rw-r--r--arch/arm64/include/asm/kvm_host.h23
-rw-r--r--arch/arm64/include/asm/kvm_mmu.h12
-rw-r--r--arch/arm64/include/asm/kvm_mte.h66
-rw-r--r--arch/arm64/include/asm/kvm_pgtable.h42
-rw-r--r--arch/arm64/include/asm/mte-def.h1
-rw-r--r--arch/arm64/include/asm/mte.h4
-rw-r--r--arch/arm64/include/asm/pgtable.h22
-rw-r--r--arch/arm64/include/asm/sysreg.h3
-rw-r--r--arch/arm64/include/uapi/asm/kvm.h11
-rw-r--r--arch/arm64/kernel/asm-offsets.c2
-rw-r--r--arch/arm64/kernel/mte.c18
-rw-r--r--arch/arm64/kvm/Makefile2
-rw-r--r--arch/arm64/kvm/arch_timer.c162
-rw-r--r--arch/arm64/kvm/arm.c20
-rw-r--r--arch/arm64/kvm/guest.c134
-rw-r--r--arch/arm64/kvm/hyp/entry.S7
-rw-r--r--arch/arm64/kvm/hyp/exception.c3
-rw-r--r--arch/arm64/kvm/hyp/hyp-entry.S6
-rw-r--r--arch/arm64/kvm/hyp/include/hyp/sysreg-sr.h21
-rw-r--r--arch/arm64/kvm/hyp/include/nvhe/gfp.h45
-rw-r--r--arch/arm64/kvm/hyp/include/nvhe/mem_protect.h2
-rw-r--r--arch/arm64/kvm/hyp/include/nvhe/memory.h7
-rw-r--r--arch/arm64/kvm/hyp/include/nvhe/mm.h13
-rw-r--r--arch/arm64/kvm/hyp/nvhe/mem_protect.c60
-rw-r--r--arch/arm64/kvm/hyp/nvhe/page_alloc.c112
-rw-r--r--arch/arm64/kvm/hyp/nvhe/setup.c30
-rw-r--r--arch/arm64/kvm/hyp/pgtable.c48
-rw-r--r--arch/arm64/kvm/hyp/reserved_mem.c3
-rw-r--r--arch/arm64/kvm/mmu.c196
-rw-r--r--arch/arm64/kvm/pmu-emul.c4
-rw-r--r--arch/arm64/kvm/reset.c4
-rw-r--r--arch/arm64/kvm/sys_regs.c32
-rw-r--r--arch/arm64/kvm/vgic/vgic-init.c36
-rw-r--r--arch/arm64/kvm/vgic/vgic-v2.c19
-rw-r--r--arch/arm64/kvm/vgic/vgic-v3.c19
-rw-r--r--arch/arm64/kvm/vgic/vgic.c14
-rw-r--r--arch/mips/include/asm/kvm_host.h9
-rw-r--r--arch/mips/kvm/Makefile2
-rw-r--r--arch/mips/kvm/mips.c90
-rw-r--r--arch/powerpc/include/asm/asm-prototypes.h3
-rw-r--r--arch/powerpc/include/asm/book3s/64/mmu.h1
-rw-r--r--arch/powerpc/include/asm/book3s/64/tlbflush-radix.h4
-rw-r--r--arch/powerpc/include/asm/cputhreads.h30
-rw-r--r--arch/powerpc/include/asm/exception-64s.h13
-rw-r--r--arch/powerpc/include/asm/hvcall.h4
-rw-r--r--arch/powerpc/include/asm/kvm_asm.h1
-rw-r--r--arch/powerpc/include/asm/kvm_book3s.h3
-rw-r--r--arch/powerpc/include/asm/kvm_book3s_64.h8
-rw-r--r--arch/powerpc/include/asm/kvm_host.h21
-rw-r--r--arch/powerpc/include/asm/kvm_ppc.h18
-rw-r--r--arch/powerpc/include/asm/mmu_context.h18
-rw-r--r--arch/powerpc/include/asm/time.h12
-rw-r--r--arch/powerpc/kernel/asm-offsets.c1
-rw-r--r--arch/powerpc/kernel/exceptions-64s.S250
-rw-r--r--arch/powerpc/kernel/security.c5
-rw-r--r--arch/powerpc/kernel/time.c10
-rw-r--r--arch/powerpc/kvm/Makefile4
-rw-r--r--arch/powerpc/kvm/book3s.c108
-rw-r--r--arch/powerpc/kvm/book3s_64_entry.S416
-rw-r--r--arch/powerpc/kvm/book3s_64_mmu_radix.c27
-rw-r--r--arch/powerpc/kvm/book3s_64_vio_hv.c12
-rw-r--r--arch/powerpc/kvm/book3s_hv.c810
-rw-r--r--arch/powerpc/kvm/book3s_hv_builtin.c137
-rw-r--r--arch/powerpc/kvm/book3s_hv_interrupts.S9
-rw-r--r--arch/powerpc/kvm/book3s_hv_nested.c122
-rw-r--r--arch/powerpc/kvm/book3s_hv_p9_entry.c508
-rw-r--r--arch/powerpc/kvm/book3s_hv_rm_mmu.c14
-rw-r--r--arch/powerpc/kvm/book3s_hv_rm_xics.c15
-rw-r--r--arch/powerpc/kvm/book3s_hv_rmhandlers.S684
-rw-r--r--arch/powerpc/kvm/book3s_pr.c2
-rw-r--r--arch/powerpc/kvm/book3s_pr_papr.c2
-rw-r--r--arch/powerpc/kvm/book3s_segment.S3
-rw-r--r--arch/powerpc/kvm/book3s_xive.c113
-rw-r--r--arch/powerpc/kvm/book3s_xive.h7
-rw-r--r--arch/powerpc/kvm/book3s_xive_native.c10
-rw-r--r--arch/powerpc/kvm/booke.c76
-rw-r--r--arch/powerpc/kvm/powerpc.c3
-rw-r--r--arch/powerpc/mm/book3s64/radix_pgtable.c32
-rw-r--r--arch/powerpc/mm/book3s64/radix_tlb.c202
-rw-r--r--arch/powerpc/mm/mmu_context.c4
-rw-r--r--arch/powerpc/platforms/powernv/idle.c52
-rw-r--r--arch/s390/include/asm/kvm_host.h9
-rw-r--r--arch/s390/kvm/Makefile3
-rw-r--r--arch/s390/kvm/kvm-s390.c254
-rw-r--r--arch/s390/tools/gen_facilities.c4
-rw-r--r--arch/x86/include/asm/hyperv-tlfs.h19
-rw-r--r--arch/x86/include/asm/kvm-x86-ops.h9
-rw-r--r--arch/x86/include/asm/kvm_host.h169
-rw-r--r--arch/x86/include/asm/svm.h9
-rw-r--r--arch/x86/include/uapi/asm/kvm.h13
-rw-r--r--arch/x86/include/uapi/asm/kvm_para.h13
-rw-r--r--arch/x86/include/uapi/asm/svm.h3
-rw-r--r--arch/x86/kernel/cpu/mshyperv.c10
-rw-r--r--arch/x86/kvm/Kconfig1
-rw-r--r--arch/x86/kvm/Makefile11
-rw-r--r--arch/x86/kvm/cpuid.c6
-rw-r--r--arch/x86/kvm/debugfs.c11
-rw-r--r--arch/x86/kvm/emulate.c166
-rw-r--r--arch/x86/kvm/fpu.h140
-rw-r--r--arch/x86/kvm/hyperv.c440
-rw-r--r--arch/x86/kvm/hyperv.h1
-rw-r--r--arch/x86/kvm/kvm_cache_regs.h15
-rw-r--r--arch/x86/kvm/kvm_emulate.h10
-rw-r--r--arch/x86/kvm/kvm_onhyperv.c93
-rw-r--r--arch/x86/kvm/kvm_onhyperv.h32
-rw-r--r--arch/x86/kvm/lapic.c12
-rw-r--r--arch/x86/kvm/lapic.h2
-rw-r--r--arch/x86/kvm/mmu.h30
-rw-r--r--arch/x86/kvm/mmu/mmu.c894
-rw-r--r--arch/x86/kvm/mmu/mmu_internal.h15
-rw-r--r--arch/x86/kvm/mmu/mmutrace.h2
-rw-r--r--arch/x86/kvm/mmu/page_track.c2
-rw-r--r--arch/x86/kvm/mmu/paging_tmpl.h68
-rw-r--r--arch/x86/kvm/mmu/spte.c22
-rw-r--r--arch/x86/kvm/mmu/spte.h32
-rw-r--r--arch/x86/kvm/mmu/tdp_mmu.c43
-rw-r--r--arch/x86/kvm/mmu/tdp_mmu.h35
-rw-r--r--arch/x86/kvm/svm/avic.c18
-rw-r--r--arch/x86/kvm/svm/nested.c91
-rw-r--r--arch/x86/kvm/svm/svm.c106
-rw-r--r--arch/x86/kvm/svm/svm.h24
-rw-r--r--arch/x86/kvm/svm/svm_onhyperv.c41
-rw-r--r--arch/x86/kvm/svm/svm_onhyperv.h130
-rw-r--r--arch/x86/kvm/trace.h2
-rw-r--r--arch/x86/kvm/vmx/capabilities.h1
-rw-r--r--arch/x86/kvm/vmx/evmcs.c3
-rw-r--r--arch/x86/kvm/vmx/evmcs.h8
-rw-r--r--arch/x86/kvm/vmx/nested.c418
-rw-r--r--arch/x86/kvm/vmx/nested.h11
-rw-r--r--arch/x86/kvm/vmx/vmcs.h13
-rw-r--r--arch/x86/kvm/vmx/vmcs12.c1
-rw-r--r--arch/x86/kvm/vmx/vmcs12.h10
-rw-r--r--arch/x86/kvm/vmx/vmx.c225
-rw-r--r--arch/x86/kvm/vmx/vmx.h21
-rw-r--r--arch/x86/kvm/x86.c909
-rw-r--r--arch/x86/kvm/x86.h10
138 files changed, 6161 insertions, 3563 deletions
diff --git a/arch/arm64/include/asm/kvm_arm.h b/arch/arm64/include/asm/kvm_arm.h
index 692c9049befa..d436831dd706 100644
--- a/arch/arm64/include/asm/kvm_arm.h
+++ b/arch/arm64/include/asm/kvm_arm.h
@@ -12,7 +12,8 @@
#include <asm/types.h>
/* Hyp Configuration Register (HCR) bits */
-#define HCR_ATA (UL(1) << 56)
+#define HCR_ATA_SHIFT 56
+#define HCR_ATA (UL(1) << HCR_ATA_SHIFT)
#define HCR_FWB (UL(1) << 46)
#define HCR_API (UL(1) << 41)
#define HCR_APK (UL(1) << 40)
diff --git a/arch/arm64/include/asm/kvm_emulate.h b/arch/arm64/include/asm/kvm_emulate.h
index 01b9857757f2..fd418955e31e 100644
--- a/arch/arm64/include/asm/kvm_emulate.h
+++ b/arch/arm64/include/asm/kvm_emulate.h
@@ -84,6 +84,9 @@ static inline void vcpu_reset_hcr(struct kvm_vcpu *vcpu)
if (cpus_have_const_cap(ARM64_MISMATCHED_CACHE_TYPE) ||
vcpu_el1_is_32bit(vcpu))
vcpu->arch.hcr_el2 |= HCR_TID2;
+
+ if (kvm_has_mte(vcpu->kvm))
+ vcpu->arch.hcr_el2 |= HCR_ATA;
}
static inline unsigned long *vcpu_hcr(struct kvm_vcpu *vcpu)
diff --git a/arch/arm64/include/asm/kvm_host.h b/arch/arm64/include/asm/kvm_host.h
index 7cd7d5c8c4bc..41911585ae0c 100644
--- a/arch/arm64/include/asm/kvm_host.h
+++ b/arch/arm64/include/asm/kvm_host.h
@@ -46,6 +46,7 @@
#define KVM_REQ_VCPU_RESET KVM_ARCH_REQ(2)
#define KVM_REQ_RECORD_STEAL KVM_ARCH_REQ(3)
#define KVM_REQ_RELOAD_GICv4 KVM_ARCH_REQ(4)
+#define KVM_REQ_RELOAD_PMU KVM_ARCH_REQ(5)
#define KVM_DIRTY_LOG_MANUAL_CAPS (KVM_DIRTY_LOG_MANUAL_PROTECT_ENABLE | \
KVM_DIRTY_LOG_INITIALLY_SET)
@@ -132,6 +133,9 @@ struct kvm_arch {
u8 pfr0_csv2;
u8 pfr0_csv3;
+
+ /* Memory Tagging Extension enabled for the guest */
+ bool mte_enabled;
};
struct kvm_vcpu_fault_info {
@@ -206,6 +210,12 @@ enum vcpu_sysreg {
CNTP_CVAL_EL0,
CNTP_CTL_EL0,
+ /* Memory Tagging Extension registers */
+ RGSR_EL1, /* Random Allocation Tag Seed Register */
+ GCR_EL1, /* Tag Control Register */
+ TFSR_EL1, /* Tag Fault Status Register (EL1) */
+ TFSRE0_EL1, /* Tag Fault Status Register (EL0) */
+
/* 32bit specific registers. Keep them at the end of the range */
DACR32_EL2, /* Domain Access Control Register */
IFSR32_EL2, /* Instruction Fault Status Register */
@@ -556,16 +566,11 @@ static inline bool __vcpu_write_sys_reg_to_cpu(u64 val, int reg)
}
struct kvm_vm_stat {
- ulong remote_tlb_flush;
+ struct kvm_vm_stat_generic generic;
};
struct kvm_vcpu_stat {
- u64 halt_successful_poll;
- u64 halt_attempted_poll;
- u64 halt_poll_success_ns;
- u64 halt_poll_fail_ns;
- u64 halt_poll_invalid;
- u64 halt_wakeup;
+ struct kvm_vcpu_stat_generic generic;
u64 hvc_exit_stat;
u64 wfe_exit_stat;
u64 wfi_exit_stat;
@@ -721,6 +726,9 @@ int kvm_arm_vcpu_arch_get_attr(struct kvm_vcpu *vcpu,
int kvm_arm_vcpu_arch_has_attr(struct kvm_vcpu *vcpu,
struct kvm_device_attr *attr);
+long kvm_vm_ioctl_mte_copy_tags(struct kvm *kvm,
+ struct kvm_arm_copy_mte_tags *copy_tags);
+
/* Guest/host FPSIMD coordination helpers */
int kvm_arch_vcpu_run_map_fp(struct kvm_vcpu *vcpu);
void kvm_arch_vcpu_load_fp(struct kvm_vcpu *vcpu);
@@ -769,6 +777,7 @@ bool kvm_arm_vcpu_is_finalized(struct kvm_vcpu *vcpu);
#define kvm_arm_vcpu_sve_finalized(vcpu) \
((vcpu)->arch.flags & KVM_ARM64_VCPU_SVE_FINALIZED)
+#define kvm_has_mte(kvm) (system_supports_mte() && (kvm)->arch.mte_enabled)
#define kvm_vcpu_has_pmu(vcpu) \
(test_bit(KVM_ARM_VCPU_PMU_V3, (vcpu)->arch.features))
diff --git a/arch/arm64/include/asm/kvm_mmu.h b/arch/arm64/include/asm/kvm_mmu.h
index f4cbfa9025a8..b52c5c4b9a3d 100644
--- a/arch/arm64/include/asm/kvm_mmu.h
+++ b/arch/arm64/include/asm/kvm_mmu.h
@@ -188,10 +188,8 @@ static inline bool vcpu_has_cache_enabled(struct kvm_vcpu *vcpu)
return (vcpu_read_sys_reg(vcpu, SCTLR_EL1) & 0b101) == 0b101;
}
-static inline void __clean_dcache_guest_page(kvm_pfn_t pfn, unsigned long size)
+static inline void __clean_dcache_guest_page(void *va, size_t size)
{
- void *va = page_address(pfn_to_page(pfn));
-
/*
* With FWB, we ensure that the guest always accesses memory using
* cacheable attributes, and we don't have to clean to PoC when
@@ -204,18 +202,14 @@ static inline void __clean_dcache_guest_page(kvm_pfn_t pfn, unsigned long size)
kvm_flush_dcache_to_poc(va, size);
}
-static inline void __invalidate_icache_guest_page(kvm_pfn_t pfn,
- unsigned long size)
+static inline void __invalidate_icache_guest_page(void *va, size_t size)
{
if (icache_is_aliasing()) {
/* any kind of VIPT cache */
icache_inval_all_pou();
} else if (is_kernel_in_hyp_mode() || !icache_is_vpipt()) {
/* PIPT or VPIPT at EL2 (see comment in __kvm_tlb_flush_vmid_ipa) */
- void *va = page_address(pfn_to_page(pfn));
-
- icache_inval_pou((unsigned long)va,
- (unsigned long)va + size);
+ icache_inval_pou((unsigned long)va, (unsigned long)va + size);
}
}
diff --git a/arch/arm64/include/asm/kvm_mte.h b/arch/arm64/include/asm/kvm_mte.h
new file mode 100644
index 000000000000..de002636eb1f
--- /dev/null
+++ b/arch/arm64/include/asm/kvm_mte.h
@@ -0,0 +1,66 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * Copyright (C) 2020-2021 ARM Ltd.
+ */
+#ifndef __ASM_KVM_MTE_H
+#define __ASM_KVM_MTE_H
+
+#ifdef __ASSEMBLY__
+
+#include <asm/sysreg.h>
+
+#ifdef CONFIG_ARM64_MTE
+
+.macro mte_switch_to_guest g_ctxt, h_ctxt, reg1
+alternative_if_not ARM64_MTE
+ b .L__skip_switch\@
+alternative_else_nop_endif
+ mrs \reg1, hcr_el2
+ tbz \reg1, #(HCR_ATA_SHIFT), .L__skip_switch\@
+
+ mrs_s \reg1, SYS_RGSR_EL1
+ str \reg1, [\h_ctxt, #CPU_RGSR_EL1]
+ mrs_s \reg1, SYS_GCR_EL1
+ str \reg1, [\h_ctxt, #CPU_GCR_EL1]
+
+ ldr \reg1, [\g_ctxt, #CPU_RGSR_EL1]
+ msr_s SYS_RGSR_EL1, \reg1
+ ldr \reg1, [\g_ctxt, #CPU_GCR_EL1]
+ msr_s SYS_GCR_EL1, \reg1
+
+.L__skip_switch\@:
+.endm
+
+.macro mte_switch_to_hyp g_ctxt, h_ctxt, reg1
+alternative_if_not ARM64_MTE
+ b .L__skip_switch\@
+alternative_else_nop_endif
+ mrs \reg1, hcr_el2
+ tbz \reg1, #(HCR_ATA_SHIFT), .L__skip_switch\@
+
+ mrs_s \reg1, SYS_RGSR_EL1
+ str \reg1, [\g_ctxt, #CPU_RGSR_EL1]
+ mrs_s \reg1, SYS_GCR_EL1
+ str \reg1, [\g_ctxt, #CPU_GCR_EL1]
+
+ ldr \reg1, [\h_ctxt, #CPU_RGSR_EL1]
+ msr_s SYS_RGSR_EL1, \reg1
+ ldr \reg1, [\h_ctxt, #CPU_GCR_EL1]
+ msr_s SYS_GCR_EL1, \reg1
+
+ isb
+
+.L__skip_switch\@:
+.endm
+
+#else /* !CONFIG_ARM64_MTE */
+
+.macro mte_switch_to_guest g_ctxt, h_ctxt, reg1
+.endm
+
+.macro mte_switch_to_hyp g_ctxt, h_ctxt, reg1
+.endm
+
+#endif /* CONFIG_ARM64_MTE */
+#endif /* __ASSEMBLY__ */
+#endif /* __ASM_KVM_MTE_H */
diff --git a/arch/arm64/include/asm/kvm_pgtable.h b/arch/arm64/include/asm/kvm_pgtable.h
index c3674c47d48c..f004c0115d89 100644
--- a/arch/arm64/include/asm/kvm_pgtable.h
+++ b/arch/arm64/include/asm/kvm_pgtable.h
@@ -27,23 +27,29 @@ typedef u64 kvm_pte_t;
/**
* struct kvm_pgtable_mm_ops - Memory management callbacks.
- * @zalloc_page: Allocate a single zeroed memory page. The @arg parameter
- * can be used by the walker to pass a memcache. The
- * initial refcount of the page is 1.
- * @zalloc_pages_exact: Allocate an exact number of zeroed memory pages. The
- * @size parameter is in bytes, and is rounded-up to the
- * next page boundary. The resulting allocation is
- * physically contiguous.
- * @free_pages_exact: Free an exact number of memory pages previously
- * allocated by zalloc_pages_exact.
- * @get_page: Increment the refcount on a page.
- * @put_page: Decrement the refcount on a page. When the refcount
- * reaches 0 the page is automatically freed.
- * @page_count: Return the refcount of a page.
- * @phys_to_virt: Convert a physical address into a virtual address mapped
- * in the current context.
- * @virt_to_phys: Convert a virtual address mapped in the current context
- * into a physical address.
+ * @zalloc_page: Allocate a single zeroed memory page.
+ * The @arg parameter can be used by the walker
+ * to pass a memcache. The initial refcount of
+ * the page is 1.
+ * @zalloc_pages_exact: Allocate an exact number of zeroed memory pages.
+ * The @size parameter is in bytes, and is rounded
+ * up to the next page boundary. The resulting
+ * allocation is physically contiguous.
+ * @free_pages_exact: Free an exact number of memory pages previously
+ * allocated by zalloc_pages_exact.
+ * @get_page: Increment the refcount on a page.
+ * @put_page: Decrement the refcount on a page. When the
+ * refcount reaches 0 the page is automatically
+ * freed.
+ * @page_count: Return the refcount of a page.
+ * @phys_to_virt: Convert a physical address into a virtual
+ * address mapped in the current context.
+ * @virt_to_phys: Convert a virtual address mapped in the current
+ * context into a physical address.
+ * @dcache_clean_inval_poc: Clean and invalidate the data cache to the PoC
+ * for the specified memory address range.
+ * @icache_inval_pou: Invalidate the instruction cache to the PoU
+ * for the specified memory address range.
*/
struct kvm_pgtable_mm_ops {
void* (*zalloc_page)(void *arg);
@@ -54,6 +60,8 @@ struct kvm_pgtable_mm_ops {
int (*page_count)(void *addr);
void* (*phys_to_virt)(phys_addr_t phys);
phys_addr_t (*virt_to_phys)(void *addr);
+ void (*dcache_clean_inval_poc)(void *addr, size_t size);
+ void (*icache_inval_pou)(void *addr, size_t size);
};
/**
diff --git a/arch/arm64/include/asm/mte-def.h b/arch/arm64/include/asm/mte-def.h
index cf241b0f0a42..626d359b396e 100644
--- a/arch/arm64/include/asm/mte-def.h
+++ b/arch/arm64/include/asm/mte-def.h
@@ -7,6 +7,7 @@
#define MTE_GRANULE_SIZE UL(16)
#define MTE_GRANULE_MASK (~(MTE_GRANULE_SIZE - 1))
+#define MTE_GRANULES_PER_PAGE (PAGE_SIZE / MTE_GRANULE_SIZE)
#define MTE_TAG_SHIFT 56
#define MTE_TAG_SIZE 4
#define MTE_TAG_MASK GENMASK((MTE_TAG_SHIFT + (MTE_TAG_SIZE - 1)), MTE_TAG_SHIFT)
diff --git a/arch/arm64/include/asm/mte.h b/arch/arm64/include/asm/mte.h
index 67bf259ae768..58c7f80f5596 100644
--- a/arch/arm64/include/asm/mte.h
+++ b/arch/arm64/include/asm/mte.h
@@ -38,7 +38,7 @@ void mte_free_tag_storage(char *storage);
#define PG_mte_tagged PG_arch_2
void mte_zero_clear_page_tags(void *addr);
-void mte_sync_tags(pte_t *ptep, pte_t pte);
+void mte_sync_tags(pte_t old_pte, pte_t pte);
void mte_copy_page_tags(void *kto, const void *kfrom);
void mte_thread_init_user(void);
void mte_thread_switch(struct task_struct *next);
@@ -57,7 +57,7 @@ int mte_ptrace_copy_tags(struct task_struct *child, long request,
static inline void mte_zero_clear_page_tags(void *addr)
{
}
-static inline void mte_sync_tags(pte_t *ptep, pte_t pte)
+static inline void mte_sync_tags(pte_t old_pte, pte_t pte)
{
}
static inline void mte_copy_page_tags(void *kto, const void *kfrom)
diff --git a/arch/arm64/include/asm/pgtable.h b/arch/arm64/include/asm/pgtable.h
index 11e60d0cd9b6..c0ba8cdfa10a 100644
--- a/arch/arm64/include/asm/pgtable.h
+++ b/arch/arm64/include/asm/pgtable.h
@@ -314,9 +314,25 @@ static inline void set_pte_at(struct mm_struct *mm, unsigned long addr,
if (pte_present(pte) && pte_user_exec(pte) && !pte_special(pte))
__sync_icache_dcache(pte);
- if (system_supports_mte() &&
- pte_present(pte) && pte_tagged(pte) && !pte_special(pte))
- mte_sync_tags(ptep, pte);
+ /*
+ * If the PTE would provide user space access to the tags associated
+ * with it then ensure that the MTE tags are synchronised. Although
+ * pte_access_permitted() returns false for exec only mappings, they
+ * don't expose tags (instruction fetches don't check tags).
+ */
+ if (system_supports_mte() && pte_access_permitted(pte, false) &&
+ !pte_special(pte)) {
+ pte_t old_pte = READ_ONCE(*ptep);
+ /*
+ * We only need to synchronise if the new PTE has tags enabled
+ * or if swapping in (in which case another mapping may have
+ * set tags in the past even if this PTE isn't tagged).
+ * (!pte_none() && !pte_present()) is an open coded version of
+ * is_swap_pte()
+ */
+ if (pte_tagged(pte) || (!pte_none(old_pte) && !pte_present(old_pte)))
+ mte_sync_tags(old_pte, pte);
+ }
__check_racy_pte_update(mm, ptep, pte);
diff --git a/arch/arm64/include/asm/sysreg.h b/arch/arm64/include/asm/sysreg.h
index 9ea84bcddf85..7b9c3acba684 100644
--- a/arch/arm64/include/asm/sysreg.h
+++ b/arch/arm64/include/asm/sysreg.h
@@ -651,7 +651,8 @@
#define INIT_SCTLR_EL2_MMU_ON \
(SCTLR_ELx_M | SCTLR_ELx_C | SCTLR_ELx_SA | SCTLR_ELx_I | \
- SCTLR_ELx_IESB | SCTLR_ELx_WXN | ENDIAN_SET_EL2 | SCTLR_EL2_RES1)
+ SCTLR_ELx_IESB | SCTLR_ELx_WXN | ENDIAN_SET_EL2 | \
+ SCTLR_ELx_ITFSB | SCTLR_EL2_RES1)
#define INIT_SCTLR_EL2_MMU_OFF \
(SCTLR_EL2_RES1 | ENDIAN_SET_EL2)
diff --git a/arch/arm64/include/uapi/asm/kvm.h b/arch/arm64/include/uapi/asm/kvm.h
index 24223adae150..b3edde68bc3e 100644
--- a/arch/arm64/include/uapi/asm/kvm.h
+++ b/arch/arm64/include/uapi/asm/kvm.h
@@ -184,6 +184,17 @@ struct kvm_vcpu_events {
__u32 reserved[12];
};
+struct kvm_arm_copy_mte_tags {
+ __u64 guest_ipa;
+ __u64 length;
+ void __user *addr;
+ __u64 flags;
+ __u64 reserved[2];
+};
+
+#define KVM_ARM_TAGS_TO_GUEST 0
+#define KVM_ARM_TAGS_FROM_GUEST 1
+
/* If you need to interpret the index values, here is the key: */
#define KVM_REG_ARM_COPROC_MASK 0x000000000FFF0000
#define KVM_REG_ARM_COPROC_SHIFT 16
diff --git a/arch/arm64/kernel/asm-offsets.c b/arch/arm64/kernel/asm-offsets.c
index 0a73b7643d6c..c85670692afa 100644
--- a/arch/arm64/kernel/asm-offsets.c
+++ b/arch/arm64/kernel/asm-offsets.c
@@ -113,6 +113,8 @@ int main(void)
DEFINE(VCPU_WORKAROUND_FLAGS, offsetof(struct kvm_vcpu, arch.workaround_flags));
DEFINE(VCPU_HCR_EL2, offsetof(struct kvm_vcpu, arch.hcr_el2));
DEFINE(CPU_USER_PT_REGS, offsetof(struct kvm_cpu_context, regs));
+ DEFINE(CPU_RGSR_EL1, offsetof(struct kvm_cpu_context, sys_regs[RGSR_EL1]));
+ DEFINE(CPU_GCR_EL1, offsetof(struct kvm_cpu_context, sys_regs[GCR_EL1]));
DEFINE(CPU_APIAKEYLO_EL1, offsetof(struct kvm_cpu_context, sys_regs[APIAKEYLO_EL1]));
DEFINE(CPU_APIBKEYLO_EL1, offsetof(struct kvm_cpu_context, sys_regs[APIBKEYLO_EL1]));
DEFINE(CPU_APDAKEYLO_EL1, offsetof(struct kvm_cpu_context, sys_regs[APDAKEYLO_EL1]));
diff --git a/arch/arm64/kernel/mte.c b/arch/arm64/kernel/mte.c
index 125a10e413e9..69b3fde8759e 100644
--- a/arch/arm64/kernel/mte.c
+++ b/arch/arm64/kernel/mte.c
@@ -32,10 +32,9 @@ DEFINE_STATIC_KEY_FALSE(mte_async_mode);
EXPORT_SYMBOL_GPL(mte_async_mode);
#endif
-static void mte_sync_page_tags(struct page *page, pte_t *ptep, bool check_swap)
+static void mte_sync_page_tags(struct page *page, pte_t old_pte,
+ bool check_swap, bool pte_is_tagged)
{
- pte_t old_pte = READ_ONCE(*ptep);
-
if (check_swap && is_swap_pte(old_pte)) {
swp_entry_t entry = pte_to_swp_entry(old_pte);
@@ -43,6 +42,9 @@ static void mte_sync_page_tags(struct page *page, pte_t *ptep, bool check_swap)
return;
}
+ if (!pte_is_tagged)
+ return;
+
page_kasan_tag_reset(page);
/*
* We need smp_wmb() in between setting the flags and clearing the
@@ -55,16 +57,22 @@ static void mte_sync_page_tags(struct page *page, pte_t *ptep, bool check_swap)
mte_clear_page_tags(page_address(page));
}
-void mte_sync_tags(pte_t *ptep, pte_t pte)
+void mte_sync_tags(pte_t old_pte, pte_t pte)
{
struct page *page = pte_page(pte);
long i, nr_pages = compound_nr(page);
bool check_swap = nr_pages == 1;
+ bool pte_is_tagged = pte_tagged(pte);
+
+ /* Early out if there's nothing to do */
+ if (!check_swap && !pte_is_tagged)
+ return;
/* if PG_mte_tagged is set, tags have already been initialised */
for (i = 0; i < nr_pages; i++, page++) {
if (!test_and_set_bit(PG_mte_tagged, &page->flags))
- mte_sync_page_tags(page, ptep, check_swap);
+ mte_sync_page_tags(page, old_pte, check_swap,
+ pte_is_tagged);
}
}
diff --git a/arch/arm64/kvm/Makefile b/arch/arm64/kvm/Makefile
index 589921392cb1..989bb5dad2c8 100644
--- a/arch/arm64/kvm/Makefile
+++ b/arch/arm64/kvm/Makefile
@@ -11,7 +11,7 @@ obj-$(CONFIG_KVM) += kvm.o
obj-$(CONFIG_KVM) += hyp/
kvm-y := $(KVM)/kvm_main.o $(KVM)/coalesced_mmio.o $(KVM)/eventfd.o \
- $(KVM)/vfio.o $(KVM)/irqchip.o \
+ $(KVM)/vfio.o $(KVM)/irqchip.o $(KVM)/binary_stats.o \
arm.o mmu.o mmio.o psci.o perf.o hypercalls.o pvtime.o \
inject_fault.o va_layout.o handle_exit.o \
guest.o debug.o reset.o sys_regs.o \
diff --git a/arch/arm64/kvm/arch_timer.c b/arch/arm64/kvm/arch_timer.c
index 74e0699661e9..3df67c127489 100644
--- a/arch/arm64/kvm/arch_timer.c
+++ b/arch/arm64/kvm/arch_timer.c
@@ -9,6 +9,7 @@
#include <linux/kvm_host.h>
#include <linux/interrupt.h>
#include <linux/irq.h>
+#include <linux/irqdomain.h>
#include <linux/uaccess.h>
#include <clocksource/arm_arch_timer.h>
@@ -973,36 +974,154 @@ static int kvm_timer_dying_cpu(unsigned int cpu)
return 0;
}
-int kvm_timer_hyp_init(bool has_gic)
+static int timer_irq_set_vcpu_affinity(struct irq_data *d, void *vcpu)
{
- struct arch_timer_kvm_info *info;
- int err;
+ if (vcpu)
+ irqd_set_forwarded_to_vcpu(d);
+ else
+ irqd_clr_forwarded_to_vcpu(d);
- info = arch_timer_get_kvm_info();
- timecounter = &info->timecounter;
+ return 0;
+}
- if (!timecounter->cc) {
- kvm_err("kvm_arch_timer: uninitialized timecounter\n");
- return -ENODEV;
+static int timer_irq_set_irqchip_state(struct irq_data *d,
+ enum irqchip_irq_state which, bool val)
+{
+ if (which != IRQCHIP_STATE_ACTIVE || !irqd_is_forwarded_to_vcpu(d))
+ return irq_chip_set_parent_state(d, which, val);
+
+ if (val)
+ irq_chip_mask_parent(d);
+ else
+ irq_chip_unmask_parent(d);
+
+ return 0;
+}
+
+static void timer_irq_eoi(struct irq_data *d)
+{
+ if (!irqd_is_forwarded_to_vcpu(d))
+ irq_chip_eoi_parent(d);
+}
+
+static void timer_irq_ack(struct irq_data *d)
+{
+ d = d->parent_data;
+ if (d->chip->irq_ack)
+ d->chip->irq_ack(d);
+}
+
+static struct irq_chip timer_chip = {
+ .name = "KVM",
+ .irq_ack = timer_irq_ack,
+ .irq_mask = irq_chip_mask_parent,
+ .irq_unmask = irq_chip_unmask_parent,
+ .irq_eoi = timer_irq_eoi,
+ .irq_set_type = irq_chip_set_type_parent,
+ .irq_set_vcpu_affinity = timer_irq_set_vcpu_affinity,
+ .irq_set_irqchip_state = timer_irq_set_irqchip_state,
+};
+
+static int timer_irq_domain_alloc(struct irq_domain *domain, unsigned int virq,
+ unsigned int nr_irqs, void *arg)
+{
+ irq_hw_number_t hwirq = (uintptr_t)arg;
+
+ return irq_domain_set_hwirq_and_chip(domain, virq, hwirq,
+ &timer_chip, NULL);
+}
+
+static void timer_irq_domain_free(struct irq_domain *domain, unsigned int virq,
+ unsigned int nr_irqs)
+{
+}
+
+static const struct irq_domain_ops timer_domain_ops = {
+ .alloc = timer_irq_domain_alloc,
+ .free = timer_irq_domain_free,
+};
+
+static struct irq_ops arch_timer_irq_ops = {
+ .get_input_level = kvm_arch_timer_get_input_level,
+};
+
+static void kvm_irq_fixup_flags(unsigned int virq, u32 *flags)
+{
+ *flags = irq_get_trigger_type(virq);
+ if (*flags != IRQF_TRIGGER_HIGH && *flags != IRQF_TRIGGER_LOW) {
+ kvm_err("Invalid trigger for timer IRQ%d, assuming level low\n",
+ virq);
+ *flags = IRQF_TRIGGER_LOW;
}
+}
- /* First, do the virtual EL1 timer irq */
+static int kvm_irq_init(struct arch_timer_kvm_info *info)
+{
+ struct irq_domain *domain = NULL;
if (info->virtual_irq <= 0) {
kvm_err("kvm_arch_timer: invalid virtual timer IRQ: %d\n",
info->virtual_irq);
return -ENODEV;
}
+
host_vtimer_irq = info->virtual_irq;
+ kvm_irq_fixup_flags(host_vtimer_irq, &host_vtimer_irq_flags);
+
+ if (kvm_vgic_global_state.no_hw_deactivation) {
+ struct fwnode_handle *fwnode;
+ struct irq_data *data;
+
+ fwnode = irq_domain_alloc_named_fwnode("kvm-timer");
+ if (!fwnode)
+ return -ENOMEM;
+
+ /* Assume both vtimer and ptimer in the same parent */
+ data = irq_get_irq_data(host_vtimer_irq);
+ domain = irq_domain_create_hierarchy(data->domain, 0,
+ NR_KVM_TIMERS, fwnode,
+ &timer_domain_ops, NULL);
+ if (!domain) {
+ irq_domain_free_fwnode(fwnode);
+ return -ENOMEM;
+ }
+
+ arch_timer_irq_ops.flags |= VGIC_IRQ_SW_RESAMPLE;
+ WARN_ON(irq_domain_push_irq(domain, host_vtimer_irq,
+ (void *)TIMER_VTIMER));
+ }
- host_vtimer_irq_flags = irq_get_trigger_type(host_vtimer_irq);
- if (host_vtimer_irq_flags != IRQF_TRIGGER_HIGH &&
- host_vtimer_irq_flags != IRQF_TRIGGER_LOW) {
- kvm_err("Invalid trigger for vtimer IRQ%d, assuming level low\n",
- host_vtimer_irq);
- host_vtimer_irq_flags = IRQF_TRIGGER_LOW;
+ if (info->physical_irq > 0) {
+ host_ptimer_irq = info->physical_irq;
+ kvm_irq_fixup_flags(host_ptimer_irq, &host_ptimer_irq_flags);
+
+ if (domain)
+ WARN_ON(irq_domain_push_irq(domain, host_ptimer_irq,
+ (void *)TIMER_PTIMER));
}
+ return 0;
+}
+
+int kvm_timer_hyp_init(bool has_gic)
+{
+ struct arch_timer_kvm_info *info;
+ int err;
+
+ info = arch_timer_get_kvm_info();
+ timecounter = &info->timecounter;
+
+ if (!timecounter->cc) {
+ kvm_err("kvm_arch_timer: uninitialized timecounter\n");
+ return -ENODEV;
+ }
+
+ err = kvm_irq_init(info);
+ if (err)
+ return err;
+
+ /* First, do the virtual EL1 timer irq */
+
err = request_percpu_irq(host_vtimer_irq, kvm_arch_timer_handler,
"kvm guest vtimer", kvm_get_running_vcpus());
if (err) {
@@ -1027,15 +1146,6 @@ int kvm_timer_hyp_init(bool has_gic)
/* Now let's do the physical EL1 timer irq */
if (info->physical_irq > 0) {
- host_ptimer_irq = info->physical_irq;
- host_ptimer_irq_flags = irq_get_trigger_type(host_ptimer_irq);
- if (host_ptimer_irq_flags != IRQF_TRIGGER_HIGH &&
- host_ptimer_irq_flags != IRQF_TRIGGER_LOW) {
- kvm_err("Invalid trigger for ptimer IRQ%d, assuming level low\n",
- host_ptimer_irq);
- host_ptimer_irq_flags = IRQF_TRIGGER_LOW;
- }
-
err = request_percpu_irq(host_ptimer_irq, kvm_arch_timer_handler,
"kvm guest ptimer", kvm_get_running_vcpus());
if (err) {
@@ -1143,7 +1253,7 @@ int kvm_timer_enable(struct kvm_vcpu *vcpu)
ret = kvm_vgic_map_phys_irq(vcpu,
map.direct_vtimer->host_timer_irq,
map.direct_vtimer->irq.irq,
- kvm_arch_timer_get_input_level);
+ &arch_timer_irq_ops);
if (ret)
return ret;
@@ -1151,7 +1261,7 @@ int kvm_timer_enable(struct kvm_vcpu *vcpu)
ret = kvm_vgic_map_phys_irq(vcpu,
map.direct_ptimer->host_timer_irq,
map.direct_ptimer->irq.irq,
- kvm_arch_timer_get_input_level);
+ &arch_timer_irq_ops);
}
if (ret)
diff --git a/arch/arm64/kvm/arm.c b/arch/arm64/kvm/arm.c
index 06f8e2de05c6..e9a2b8f27792 100644
--- a/arch/arm64/kvm/arm.c
+++ b/arch/arm64/kvm/arm.c
@@ -93,6 +93,12 @@ int kvm_vm_ioctl_enable_cap(struct kvm *kvm,
r = 0;
kvm->arch.return_nisv_io_abort_to_user = true;
break;
+ case KVM_CAP_ARM_MTE:
+ if (!system_supports_mte() || kvm->created_vcpus)
+ return -EINVAL;
+ r = 0;
+ kvm->arch.mte_enabled = true;
+ break;
default:
r = -EINVAL;
break;
@@ -237,6 +243,9 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext)
*/
r = 1;
break;
+ case KVM_CAP_ARM_MTE:
+ r = system_supports_mte();
+ break;
case KVM_CAP_STEAL_TIME:
r = kvm_arm_pvtime_supported();
break;
@@ -689,6 +698,10 @@ static void check_vcpu_requests(struct kvm_vcpu *vcpu)
vgic_v4_load(vcpu);
preempt_enable();
}
+
+ if (kvm_check_request(KVM_REQ_RELOAD_PMU, vcpu))
+ kvm_pmu_handle_pmcr(vcpu,
+ __vcpu_sys_reg(vcpu, PMCR_EL0));
}
}
@@ -1359,6 +1372,13 @@ long kvm_arch_vm_ioctl(struct file *filp,
return 0;
}
+ case KVM_ARM_MTE_COPY_TAGS: {
+ struct kvm_arm_copy_mte_tags copy_tags;
+
+ if (copy_from_user(&copy_tags, argp, sizeof(copy_tags)))
+ return -EFAULT;
+ return kvm_vm_ioctl_mte_copy_tags(kvm, &copy_tags);
+ }
default:
return -EINVAL;
}
diff --git a/arch/arm64/kvm/guest.c b/arch/arm64/kvm/guest.c
index 5cb4a1cd5603..1dfb83578277 100644
--- a/arch/arm64/kvm/guest.c
+++ b/arch/arm64/kvm/guest.c
@@ -28,20 +28,40 @@
#include "trace.h"
-struct kvm_stats_debugfs_item debugfs_entries[] = {
- VCPU_STAT("halt_successful_poll", halt_successful_poll),
- VCPU_STAT("halt_attempted_poll", halt_attempted_poll),
- VCPU_STAT("halt_poll_invalid", halt_poll_invalid),
- VCPU_STAT("halt_wakeup", halt_wakeup),
- VCPU_STAT("hvc_exit_stat", hvc_exit_stat),
- VCPU_STAT("wfe_exit_stat", wfe_exit_stat),
- VCPU_STAT("wfi_exit_stat", wfi_exit_stat),
- VCPU_STAT("mmio_exit_user", mmio_exit_user),
- VCPU_STAT("mmio_exit_kernel", mmio_exit_kernel),
- VCPU_STAT("exits", exits),
- VCPU_STAT("halt_poll_success_ns", halt_poll_success_ns),
- VCPU_STAT("halt_poll_fail_ns", halt_poll_fail_ns),
- { NULL }
+const struct _kvm_stats_desc kvm_vm_stats_desc[] = {
+ KVM_GENERIC_VM_STATS()
+};
+static_assert(ARRAY_SIZE(kvm_vm_stats_desc) ==
+ sizeof(struct kvm_vm_stat) / sizeof(u64));
+
+const struct kvm_stats_header kvm_vm_stats_header = {
+ .name_size = KVM_STATS_NAME_SIZE,
+ .num_desc = ARRAY_SIZE(kvm_vm_stats_desc),
+ .id_offset = sizeof(struct kvm_stats_header),
+ .desc_offset = sizeof(struct kvm_stats_header) + KVM_STATS_NAME_SIZE,
+ .data_offset = sizeof(struct kvm_stats_header) + KVM_STATS_NAME_SIZE +
+ sizeof(kvm_vm_stats_desc),
+};
+
+const struct _kvm_stats_desc kvm_vcpu_stats_desc[] = {
+ KVM_GENERIC_VCPU_STATS(),
+ STATS_DESC_COUNTER(VCPU, hvc_exit_stat),
+ STATS_DESC_COUNTER(VCPU, wfe_exit_stat),
+ STATS_DESC_COUNTER(VCPU, wfi_exit_stat),
+ STATS_DESC_COUNTER(VCPU, mmio_exit_user),
+ STATS_DESC_COUNTER(VCPU, mmio_exit_kernel),
+ STATS_DESC_COUNTER(VCPU, exits)
+};
+static_assert(ARRAY_SIZE(kvm_vcpu_stats_desc) ==
+ sizeof(struct kvm_vcpu_stat) / sizeof(u64));
+
+const struct kvm_stats_header kvm_vcpu_stats_header = {
+ .name_size = KVM_STATS_NAME_SIZE,
+ .num_desc = ARRAY_SIZE(kvm_vcpu_stats_desc),
+ .id_offset = sizeof(struct kvm_stats_header),
+ .desc_offset = sizeof(struct kvm_stats_header) + KVM_STATS_NAME_SIZE,
+ .data_offset = sizeof(struct kvm_stats_header) + KVM_STATS_NAME_SIZE +
+ sizeof(kvm_vcpu_stats_desc),
};
static bool core_reg_offset_is_vreg(u64 off)
@@ -995,3 +1015,89 @@ int kvm_arm_vcpu_arch_has_attr(struct kvm_vcpu *vcpu,
return ret;
}
+
+long kvm_vm_ioctl_mte_copy_tags(struct kvm *kvm,
+ struct kvm_arm_copy_mte_tags *copy_tags)
+{
+ gpa_t guest_ipa = copy_tags->guest_ipa;
+ size_t length = copy_tags->length;
+ void __user *tags = copy_tags->addr;
+ gpa_t gfn;
+ bool write = !(copy_tags->flags & KVM_ARM_TAGS_FROM_GUEST);
+ int ret = 0;
+
+ if (!kvm_has_mte(kvm))
+ return -EINVAL;
+
+ if (copy_tags->reserved[0] || copy_tags->reserved[1])
+ return -EINVAL;
+
+ if (copy_tags->flags & ~KVM_ARM_TAGS_FROM_GUEST)
+ return -EINVAL;
+
+ if (length & ~PAGE_MASK || guest_ipa & ~PAGE_MASK)
+ return -EINVAL;
+
+ gfn = gpa_to_gfn(guest_ipa);
+
+ mutex_lock(&kvm->slots_lock);
+
+ while (length > 0) {
+ kvm_pfn_t pfn = gfn_to_pfn_prot(kvm, gfn, write, NULL);
+ void *maddr;
+ unsigned long num_tags;
+ struct page *page;
+
+ if (is_error_noslot_pfn(pfn)) {
+ ret = -EFAULT;
+ goto out;
+ }
+
+ page = pfn_to_online_page(pfn);
+ if (!page) {
+ /* Reject ZONE_DEVICE memory */
+ ret = -EFAULT;
+ goto out;
+ }
+ maddr = page_address(page);
+
+ if (!write) {
+ if (test_bit(PG_mte_tagged, &page->flags))
+ num_tags = mte_copy_tags_to_user(tags, maddr,
+ MTE_GRANULES_PER_PAGE);
+ else
+ /* No tags in memory, so write zeros */
+ num_tags = MTE_GRANULES_PER_PAGE -
+ clear_user(tags, MTE_GRANULES_PER_PAGE);
+ kvm_release_pfn_clean(pfn);
+ } else {
+ num_tags = mte_copy_tags_from_user(maddr, tags,
+ MTE_GRANULES_PER_PAGE);
+
+ /*
+ * Set the flag after checking the write
+ * completed fully
+ */
+ if (num_tags == MTE_GRANULES_PER_PAGE)
+ set_bit(PG_mte_tagged, &page->flags);
+
+ kvm_release_pfn_dirty(pfn);
+ }
+
+ if (num_tags != MTE_GRANULES_PER_PAGE) {
+ ret = -EFAULT;
+ goto out;
+ }
+
+ gfn++;
+ tags += num_tags;
+ length -= PAGE_SIZE;
+ }
+
+out:
+ mutex_unlock(&kvm->slots_lock);
+ /* If some data has been copied report the number of bytes copied */
+ if (length != copy_tags->length)
+ return copy_tags->length - length;
+ return ret;
+}
diff --git a/arch/arm64/kvm/hyp/entry.S b/arch/arm64/kvm/hyp/entry.S
index e831d3dfd50d..435346ea1504 100644
--- a/arch/arm64/kvm/hyp/entry.S
+++ b/arch/arm64/kvm/hyp/entry.S
@@ -13,6 +13,7 @@
#include <asm/kvm_arm.h>
#include <asm/kvm_asm.h>
#include <asm/kvm_mmu.h>
+#include <asm/kvm_mte.h>
#include <asm/kvm_ptrauth.h>
.text
@@ -51,6 +52,9 @@ alternative_else_nop_endif
add x29, x0, #VCPU_CONTEXT
+ // mte_switch_to_guest(g_ctxt, h_ctxt, tmp1)
+ mte_switch_to_guest x29, x1, x2
+
// Macro ptrauth_switch_to_guest format:
// ptrauth_switch_to_guest(guest cxt, tmp1, tmp2, tmp3)
// The below macro to restore guest keys is not implemented in C code
@@ -142,6 +146,9 @@ SYM_INNER_LABEL(__guest_exit, SYM_L_GLOBAL)
// when this feature is enabled for kernel code.
ptrauth_switch_to_hyp x1, x2, x3, x4, x5
+ // mte_switch_to_hyp(g_ctxt, h_ctxt, reg1)
+ mte_switch_to_hyp x1, x2, x3
+
// Restore hyp's sp_el0
restore_sp_el0 x2, x3
diff --git a/arch/arm64/kvm/hyp/exception.c b/arch/arm64/kvm/hyp/exception.c
index 11541b94b328..0418399e0a20 100644
--- a/arch/arm64/kvm/hyp/exception.c
+++ b/arch/arm64/kvm/hyp/exception.c
@@ -112,7 +112,8 @@ static void enter_exception64(struct kvm_vcpu *vcpu, unsigned long target_mode,
new |= (old & PSR_C_BIT);
new |= (old & PSR_V_BIT);
- // TODO: TCO (if/when ARMv8.5-MemTag is exposed to guests)
+ if (kvm_has_mte(vcpu->kvm))
+ new |= PSR_TCO_BIT;
new |= (old & PSR_DIT_BIT);
diff --git a/arch/arm64/kvm/hyp/hyp-entry.S b/arch/arm64/kvm/hyp/hyp-entry.S
index 5f49df4ffdd8..9aa9b73475c9 100644
--- a/arch/arm64/kvm/hyp/hyp-entry.S
+++ b/arch/arm64/kvm/hyp/hyp-entry.S
@@ -76,6 +76,7 @@ el1_trap:
b __guest_exit
el1_irq:
+el1_fiq:
get_vcpu_ptr x1, x0
mov x0, #ARM_EXCEPTION_IRQ
b __guest_exit
@@ -131,7 +132,6 @@ SYM_CODE_END(\label)
invalid_vector el2t_error_invalid
invalid_vector el2h_irq_invalid
invalid_vector el2h_fiq_invalid
- invalid_vector el1_fiq_invalid
.ltorg
@@ -179,12 +179,12 @@ SYM_CODE_START(__kvm_hyp_vector)
valid_vect el1_sync // Synchronous 64-bit EL1
valid_vect el1_irq // IRQ 64-bit EL1
- invalid_vect el1_fiq_invalid // FIQ 64-bit EL1
+ valid_vect el1_fiq // FIQ 64-bit EL1
valid_vect el1_error // Error 64-bit EL1
valid_vect el1_sync // Synchronous 32-bit EL1
valid_vect el1_irq // IRQ 32-bit EL1
- invalid_vect el1_fiq_invalid // FIQ 32-bit EL1
+ valid_vect el1_fiq // FIQ 32-bit EL1
valid_vect el1_error // Error 32-bit EL1
SYM_CODE_END(__kvm_hyp_vector)
diff --git a/arch/arm64/kvm/hyp/include/hyp/sysreg-sr.h b/arch/arm64/kvm/hyp/include/hyp/sysreg-sr.h
index cce43bfe158f..de7e14c862e6 100644
--- a/arch/arm64/kvm/hyp/include/hyp/sysreg-sr.h
+++ b/arch/arm64/kvm/hyp/include/hyp/sysreg-sr.h
@@ -14,6 +14,7 @@
#include <asm/kvm_asm.h>
#include <asm/kvm_emulate.h>
#include <asm/kvm_hyp.h>
+#include <asm/kvm_mmu.h>
static inline void __sysreg_save_common_state(struct kvm_cpu_context *ctxt)
{
@@ -26,6 +27,16 @@ static inline void __sysreg_save_user_state(struct kvm_cpu_context *ctxt)
ctxt_sys_reg(ctxt, TPIDRRO_EL0) = read_sysreg(tpidrro_el0);
}
+static inline bool ctxt_has_mte(struct kvm_cpu_context *ctxt)
+{
+ struct kvm_vcpu *vcpu = ctxt->__hyp_running_vcpu;
+
+ if (!vcpu)
+ vcpu = container_of(ctxt, struct kvm_vcpu, arch.ctxt);
+
+ return kvm_has_mte(kern_hyp_va(vcpu->kvm));
+}
+
static inline void __sysreg_save_el1_state(struct kvm_cpu_context *ctxt)
{
ctxt_sys_reg(ctxt, CSSELR_EL1) = read_sysreg(csselr_el1);
@@ -46,6 +57,11 @@ static inline void __sysreg_save_el1_state(struct kvm_cpu_context *ctxt)
ctxt_sys_reg(ctxt, PAR_EL1) = read_sysreg_par();
ctxt_sys_reg(ctxt, TPIDR_EL1) = read_sysreg(tpidr_el1);
+ if (ctxt_has_mte(ctxt)) {
+ ctxt_sys_reg(ctxt, TFSR_EL1) = read_sysreg_el1(SYS_TFSR);
+ ctxt_sys_reg(ctxt, TFSRE0_EL1) = read_sysreg_s(SYS_TFSRE0_EL1);
+ }
+
ctxt_sys_reg(ctxt, SP_EL1) = read_sysreg(sp_el1);
ctxt_sys_reg(ctxt, ELR_EL1) = read_sysreg_el1(SYS_ELR);
ctxt_sys_reg(ctxt, SPSR_EL1) = read_sysreg_el1(SYS_SPSR);
@@ -107,6 +123,11 @@ static inline void __sysreg_restore_el1_state(struct kvm_cpu_context *ctxt)
write_sysreg(ctxt_sys_reg(ctxt, PAR_EL1), par_el1);
write_sysreg(ctxt_sys_reg(ctxt, TPIDR_EL1), tpidr_el1);
+ if (ctxt_has_mte(ctxt)) {
+ write_sysreg_el1(ctxt_sys_reg(ctxt, TFSR_EL1), SYS_TFSR);
+ write_sysreg_s(ctxt_sys_reg(ctxt, TFSRE0_EL1), SYS_TFSRE0_EL1);
+ }
+
if (!has_vhe() &&
cpus_have_final_cap(ARM64_WORKAROUND_SPECULATIVE_AT) &&
ctxt->__hyp_running_vcpu) {
diff --git a/arch/arm64/kvm/hyp/include/nvhe/gfp.h b/arch/arm64/kvm/hyp/include/nvhe/gfp.h
index 18a4494337bd..fb0f523d1492 100644
--- a/arch/arm64/kvm/hyp/include/nvhe/gfp.h
+++ b/arch/arm64/kvm/hyp/include/nvhe/gfp.h
@@ -7,7 +7,7 @@
#include <nvhe/memory.h>
#include <nvhe/spinlock.h>
-#define HYP_NO_ORDER UINT_MAX
+#define HYP_NO_ORDER USHRT_MAX
struct hyp_pool {
/*
@@ -19,48 +19,13 @@ struct hyp_pool {
struct list_head free_area[MAX_ORDER];
phys_addr_t range_start;
phys_addr_t range_end;
- unsigned int max_order;
+ unsigned short max_order;
};
-static inline void hyp_page_ref_inc(struct hyp_page *p)
-{
- struct hyp_pool *pool = hyp_page_to_pool(p);
-
- hyp_spin_lock(&pool->lock);
- p->refcount++;
- hyp_spin_unlock(&pool->lock);
-}
-
-static inline int hyp_page_ref_dec_and_test(struct hyp_page *p)
-{
- struct hyp_pool *pool = hyp_page_to_pool(p);
- int ret;
-
- hyp_spin_lock(&pool->lock);
- p->refcount--;
- ret = (p->refcount == 0);
- hyp_spin_unlock(&pool->lock);
-
- return ret;
-}
-
-static inline void hyp_set_page_refcounted(struct hyp_page *p)
-{
- struct hyp_pool *pool = hyp_page_to_pool(p);
-
- hyp_spin_lock(&pool->lock);
- if (p->refcount) {
- hyp_spin_unlock(&pool->lock);
- BUG();
- }
- p->refcount = 1;
- hyp_spin_unlock(&pool->lock);
-}
-
/* Allocation */
-void *hyp_alloc_pages(struct hyp_pool *pool, unsigned int order);
-void hyp_get_page(void *addr);
-void hyp_put_page(void *addr);
+void *hyp_alloc_pages(struct hyp_pool *pool, unsigned short order);
+void hyp_get_page(struct hyp_pool *pool, void *addr);
+void hyp_put_page(struct hyp_pool *pool, void *addr);
/* Used pages cannot be freed */
int hyp_pool_init(struct hyp_pool *pool, u64 pfn, unsigned int nr_pages,
diff --git a/arch/arm64/kvm/hyp/include/nvhe/mem_protect.h b/arch/arm64/kvm/hyp/include/nvhe/mem_protect.h
index 42d81ec739fa..9c227d87c36d 100644
--- a/arch/arm64/kvm/hyp/include/nvhe/mem_protect.h
+++ b/arch/arm64/kvm/hyp/include/nvhe/mem_protect.h
@@ -23,7 +23,7 @@ extern struct host_kvm host_kvm;
int __pkvm_prot_finalize(void);
int __pkvm_mark_hyp(phys_addr_t start, phys_addr_t end);
-int kvm_host_prepare_stage2(void *mem_pgt_pool, void *dev_pgt_pool);
+int kvm_host_prepare_stage2(void *pgt_pool_base);
void handle_host_mem_abort(struct kvm_cpu_context *host_ctxt);
static __always_inline void __load_host_stage2(void)
diff --git a/arch/arm64/kvm/hyp/include/nvhe/memory.h b/arch/arm64/kvm/hyp/include/nvhe/memory.h
index fd78bde939ee..592b7edb3edb 100644
--- a/arch/arm64/kvm/hyp/include/nvhe/memory.h
+++ b/arch/arm64/kvm/hyp/include/nvhe/memory.h
@@ -7,12 +7,9 @@
#include <linux/types.h>
-struct hyp_pool;
struct hyp_page {
- unsigned int refcount;
- unsigned int order;
- struct hyp_pool *pool;
- struct list_head node;
+ unsigned short refcount;
+ unsigned short order;
};
extern u64 __hyp_vmemmap;
diff --git a/arch/arm64/kvm/hyp/include/nvhe/mm.h b/arch/arm64/kvm/hyp/include/nvhe/mm.h
index 0095f6289742..8ec3a5a7744b 100644
--- a/arch/arm64/kvm/hyp/include/nvhe/mm.h
+++ b/arch/arm64/kvm/hyp/include/nvhe/mm.h
@@ -78,19 +78,20 @@ static inline unsigned long hyp_s1_pgtable_pages(void)
return res;
}
-static inline unsigned long host_s2_mem_pgtable_pages(void)
+static inline unsigned long host_s2_pgtable_pages(void)
{
+ unsigned long res;
+
/*
* Include an extra 16 pages to safely upper-bound the worst case of
* concatenated pgds.
*/
- return __hyp_pgtable_total_pages() + 16;
-}
+ res = __hyp_pgtable_total_pages() + 16;
-static inline unsigned long host_s2_dev_pgtable_pages(void)
-{
/* Allow 1 GiB for MMIO mappings */
- return __hyp_pgtable_max_pages(SZ_1G >> PAGE_SHIFT);
+ res += __hyp_pgtable_max_pages(SZ_1G >> PAGE_SHIFT);
+
+ return res;
}
#endif /* __KVM_HYP_MM_H */
diff --git a/arch/arm64/kvm/hyp/nvhe/mem_protect.c b/arch/arm64/kvm/hyp/nvhe/mem_protect.c
index 4b60c0056c04..d938ce95d3bd 100644
--- a/arch/arm64/kvm/hyp/nvhe/mem_protect.c
+++ b/arch/arm64/kvm/hyp/nvhe/mem_protect.c
@@ -23,8 +23,7 @@
extern unsigned long hyp_nr_cpus;
struct host_kvm host_kvm;
-static struct hyp_pool host_s2_mem;
-static struct hyp_pool host_s2_dev;
+static struct hyp_pool host_s2_pool;
/*
* Copies of the host's CPU features registers holding sanitized values.
@@ -36,7 +35,7 @@ static const u8 pkvm_hyp_id = 1;
static void *host_s2_zalloc_pages_exact(size_t size)
{
- return hyp_alloc_pages(&host_s2_mem, get_order(size));
+ return hyp_alloc_pages(&host_s2_pool, get_order(size));
}
static void *host_s2_zalloc_page(void *pool)
@@ -44,20 +43,24 @@ static void *host_s2_zalloc_page(void *pool)
return hyp_alloc_pages(pool, 0);
}
-static int prepare_s2_pools(void *mem_pgt_pool, void *dev_pgt_pool)
+static void host_s2_get_page(void *addr)
+{
+ hyp_get_page(&host_s2_pool, addr);
+}
+
+static void host_s2_put_page(void *addr)
+{
+ hyp_put_page(&host_s2_pool, addr);
+}
+
+static int prepare_s2_pool(void *pgt_pool_base)
{
unsigned long nr_pages, pfn;
int ret;
- pfn = hyp_virt_to_pfn(mem_pgt_pool);
- nr_pages = host_s2_mem_pgtable_pages();
- ret = hyp_pool_init(&host_s2_mem, pfn, nr_pages, 0);
- if (ret)
- return ret;
-
- pfn = hyp_virt_to_pfn(dev_pgt_pool);
- nr_pages = host_s2_dev_pgtable_pages();
- ret = hyp_pool_init(&host_s2_dev, pfn, nr_pages, 0);
+ pfn = hyp_virt_to_pfn(pgt_pool_base);
+ nr_pages = host_s2_pgtable_pages();
+ ret = hyp_pool_init(&host_s2_pool, pfn, nr_pages, 0);
if (ret)
return ret;
@@ -67,8 +70,8 @@ static int prepare_s2_pools(void *mem_pgt_pool, void *dev_pgt_pool)
.phys_to_virt = hyp_phys_to_virt,
.virt_to_phys = hyp_virt_to_phys,
.page_count = hyp_page_count,
- .get_page = hyp_get_page,
- .put_page = hyp_put_page,
+ .get_page = host_s2_get_page,
+ .put_page = host_s2_put_page,
};
return 0;
@@ -86,7 +89,7 @@ static void prepare_host_vtcr(void)
id_aa64mmfr1_el1_sys_val, phys_shift);
}
-int kvm_host_prepare_stage2(void *mem_pgt_pool, void *dev_pgt_pool)
+int kvm_host_prepare_stage2(void *pgt_pool_base)
{
struct kvm_s2_mmu *mmu = &host_kvm.arch.mmu;
int ret;
@@ -94,7 +97,7 @@ int kvm_host_prepare_stage2(void *mem_pgt_pool, void *dev_pgt_pool)
prepare_host_vtcr();
hyp_spin_lock_init(&host_kvm.lock);
- ret = prepare_s2_pools(mem_pgt_pool, dev_pgt_pool);
+ ret = prepare_s2_pool(pgt_pool_base);
if (ret)
return ret;
@@ -199,11 +202,10 @@ static bool range_is_memory(u64 start, u64 end)
}
static inline int __host_stage2_idmap(u64 start, u64 end,
- enum kvm_pgtable_prot prot,
- struct hyp_pool *pool)
+ enum kvm_pgtable_prot prot)
{
return kvm_pgtable_stage2_map(&host_kvm.pgt, start, end - start, start,
- prot, pool);
+ prot, &host_s2_pool);
}
static int host_stage2_idmap(u64 addr)
@@ -211,7 +213,6 @@ static int host_stage2_idmap(u64 addr)
enum kvm_pgtable_prot prot = KVM_PGTABLE_PROT_R | KVM_PGTABLE_PROT_W;
struct kvm_mem_range range;
bool is_memory = find_mem_range(addr, &range);
- struct hyp_pool *pool = is_memory ? &host_s2_mem : &host_s2_dev;
int ret;
if (is_memory)
@@ -222,22 +223,21 @@ static int host_stage2_idmap(u64 addr)
if (ret)
goto unlock;
- ret = __host_stage2_idmap(range.start, range.end, prot, pool);
- if (is_memory || ret != -ENOMEM)
+ ret = __host_stage2_idmap(range.start, range.end, prot);
+ if (ret != -ENOMEM)
goto unlock;
/*
- * host_s2_mem has been provided with enough pages to cover all of
- * memory with page granularity, so we should never hit the ENOMEM case.
- * However, it is difficult to know how much of the MMIO range we will
- * need to cover upfront, so we may need to 'recycle' the pages if we
- * run out.
+ * The pool has been provided with enough pages to cover all of memory
+ * with page granularity, but it is difficult to know how much of the
+ * MMIO range we will need to cover upfront, so we may need to 'recycle'
+ * the pages if we run out.
*/
ret = host_stage2_unmap_dev_all();
if (ret)
goto unlock;
- ret = __host_stage2_idmap(range.start, range.end, prot, pool);
+ ret = __host_stage2_idmap(range.start, range.end, prot);
unlock:
hyp_spin_unlock(&host_kvm.lock);
@@ -258,7 +258,7 @@ int __pkvm_mark_hyp(phys_addr_t start, phys_addr_t end)
hyp_spin_lock(&host_kvm.lock);
ret = kvm_pgtable_stage2_set_owner(&host_kvm.pgt, start, end - start,
- &host_s2_mem, pkvm_hyp_id);
+ &host_s2_pool, pkvm_hyp_id);
hyp_spin_unlock(&host_kvm.lock);
return ret != -EAGAIN ? ret : 0;
diff --git a/arch/arm64/kvm/hyp/nvhe/page_alloc.c b/arch/arm64/kvm/hyp/nvhe/page_alloc.c
index 237e03bf0cb1..41fc25bdfb34 100644
--- a/arch/arm64/kvm/hyp/nvhe/page_alloc.c
+++ b/arch/arm64/kvm/hyp/nvhe/page_alloc.c
@@ -32,7 +32,7 @@ u64 __hyp_vmemmap;
*/
static struct hyp_page *__find_buddy_nocheck(struct hyp_pool *pool,
struct hyp_page *p,
- unsigned int order)
+ unsigned short order)
{
phys_addr_t addr = hyp_page_to_phys(p);
@@ -51,21 +51,49 @@ static struct hyp_page *__find_buddy_nocheck(struct hyp_pool *pool,
/* Find a buddy page currently available for allocation */
static struct hyp_page *__find_buddy_avail(struct hyp_pool *pool,
struct hyp_page *p,
- unsigned int order)
+ unsigned short order)
{
struct hyp_page *buddy = __find_buddy_nocheck(pool, p, order);
- if (!buddy || buddy->order != order || list_empty(&buddy->node))
+ if (!buddy || buddy->order != order || buddy->refcount)
return NULL;
return buddy;
}
+/*
+ * Pages that are available for allocation are tracked in free-lists, so we use
+ * the pages themselves to store the list nodes to avoid wasting space. As the
+ * allocator always returns zeroed pages (which are zeroed on the hyp_put_page()
+ * path to optimize allocation speed), we also need to clean-up the list node in
+ * each page when we take it out of the list.
+ */
+static inline void page_remove_from_list(struct hyp_page *p)
+{
+ struct list_head *node = hyp_page_to_virt(p);
+
+ __list_del_entry(node);
+ memset(node, 0, sizeof(*node));
+}
+
+static inline void page_add_to_list(struct hyp_page *p, struct list_head *head)
+{
+ struct list_head *node = hyp_page_to_virt(p);
+
+ INIT_LIST_HEAD(node);
+ list_add_tail(node, head);
+}
+
+static inline struct hyp_page *node_to_page(struct list_head *node)
+{
+ return hyp_virt_to_page(node);
+}
+
static void __hyp_attach_page(struct hyp_pool *pool,
struct hyp_page *p)
{
- unsigned int order = p->order;
+ unsigned short order = p->order;
struct hyp_page *buddy;
memset(hyp_page_to_virt(p), 0, PAGE_SIZE << p->order);
@@ -83,32 +111,23 @@ static void __hyp_attach_page(struct hyp_pool *pool,
break;
/* Take the buddy out of its list, and coallesce with @p */
- list_del_init(&buddy->node);
+ page_remove_from_list(buddy);
buddy->order = HYP_NO_ORDER;
p = min(p, buddy);
}
/* Mark the new head, and insert it */
p->order = order;
- list_add_tail(&p->node, &pool->free_area[order]);
-}
-
-static void hyp_attach_page(struct hyp_page *p)
-{
- struct hyp_pool *pool = hyp_page_to_pool(p);
-
- hyp_spin_lock(&pool->lock);
- __hyp_attach_page(pool, p);
- hyp_spin_unlock(&pool->lock);
+ page_add_to_list(p, &pool->free_area[order]);
}
static struct hyp_page *__hyp_extract_page(struct hyp_pool *pool,
struct hyp_page *p,
- unsigned int order)
+ unsigned short order)
{
struct hyp_page *buddy;
- list_del_init(&p->node);
+ page_remove_from_list(p);
while (p->order > order) {
/*
* The buddy of order n - 1 currently has HYP_NO_ORDER as it
@@ -119,30 +138,64 @@ static struct hyp_page *__hyp_extract_page(struct hyp_pool *pool,
p->order--;
buddy = __find_buddy_nocheck(pool, p, p->order);
buddy->order = p->order;
- list_add_tail(&buddy->node, &pool->free_area[buddy->order]);
+ page_add_to_list(buddy, &pool->free_area[buddy->order]);
}
return p;
}
-void hyp_put_page(void *addr)
+static inline void hyp_page_ref_inc(struct hyp_page *p)
{
- struct hyp_page *p = hyp_virt_to_page(addr);
+ BUG_ON(p->refcount == USHRT_MAX);
+ p->refcount++;
+}
+static inline int hyp_page_ref_dec_and_test(struct hyp_page *p)
+{
+ p->refcount--;
+ return (p->refcount == 0);
+}
+
+static inline void hyp_set_page_refcounted(struct hyp_page *p)
+{
+ BUG_ON(p->refcount);
+ p->refcount = 1;
+}
+
+static void __hyp_put_page(struct hyp_pool *pool, struct hyp_page *p)
+{
if (hyp_page_ref_dec_and_test(p))
- hyp_attach_page(p);
+ __hyp_attach_page(pool, p);
+}
+
+/*
+ * Changes to the buddy tree and page refcounts must be done with the hyp_pool
+ * lock held. If a refcount change requires an update to the buddy tree (e.g.
+ * hyp_put_page()), both operations must be done within the same critical
+ * section to guarantee transient states (e.g. a page with null refcount but
+ * not yet attached to a free list) can't be observed by well-behaved readers.
+ */
+void hyp_put_page(struct hyp_pool *pool, void *addr)
+{
+ struct hyp_page *p = hyp_virt_to_page(addr);
+
+ hyp_spin_lock(&pool->lock);
+ __hyp_put_page(pool, p);
+ hyp_spin_unlock(&pool->lock);
}
-void hyp_get_page(void *addr)
+void hyp_get_page(struct hyp_pool *pool, void *addr)
{
struct hyp_page *p = hyp_virt_to_page(addr);
+ hyp_spin_lock(&pool->lock);
hyp_page_ref_inc(p);
+ hyp_spin_unlock(&pool->lock);
}
-void *hyp_alloc_pages(struct hyp_pool *pool, unsigned int order)
+void *hyp_alloc_pages(struct hyp_pool *pool, unsigned short order)
{
- unsigned int i = order;
+ unsigned short i = order;
struct hyp_page *p;
hyp_spin_lock(&pool->lock);
@@ -156,11 +209,11 @@ void *hyp_alloc_pages(struct hyp_pool *pool, unsigned int order)
}
/* Extract it from the tree at the right order */
- p = list_first_entry(&pool->free_area[i], struct hyp_page, node);
+ p = node_to_page(pool->free_area[i].next);
p = __hyp_extract_page(pool, p, order);
- hyp_spin_unlock(&pool->lock);
hyp_set_page_refcounted(p);
+ hyp_spin_unlock(&pool->lock);
return hyp_page_to_virt(p);
}
@@ -181,15 +234,14 @@ int hyp_pool_init(struct hyp_pool *pool, u64 pfn, unsigned int nr_pages,
/* Init the vmemmap portion */
p = hyp_phys_to_page(phys);
- memset(p, 0, sizeof(*p) * nr_pages);
for (i = 0; i < nr_pages; i++) {
- p[i].pool = pool;
- INIT_LIST_HEAD(&p[i].node);
+ p[i].order = 0;
+ hyp_set_page_refcounted(&p[i]);
}
/* Attach the unused pages to the buddy tree */
for (i = reserved_pages; i < nr_pages; i++)
- __hyp_attach_page(pool, &p[i]);
+ __hyp_put_page(pool, &p[i]);
return 0;
}
diff --git a/arch/arm64/kvm/hyp/nvhe/setup.c b/arch/arm64/kvm/hyp/nvhe/setup.c
index 545db985cf9a..0b574d106519 100644
--- a/arch/arm64/kvm/hyp/nvhe/setup.c
+++ b/arch/arm64/kvm/hyp/nvhe/setup.c
@@ -24,8 +24,7 @@ unsigned long hyp_nr_cpus;
static void *vmemmap_base;
static void *hyp_pgt_base;
-static void *host_s2_mem_pgt_base;
-static void *host_s2_dev_pgt_base;
+static void *host_s2_pgt_base;
static struct kvm_pgtable_mm_ops pkvm_pgtable_mm_ops;
static int divide_memory_pool(void *virt, unsigned long size)
@@ -45,14 +44,9 @@ static int divide_memory_pool(void *virt, unsigned long size)
if (!hyp_pgt_base)
return -ENOMEM;
- nr_pages = host_s2_mem_pgtable_pages();
- host_s2_mem_pgt_base = hyp_early_alloc_contig(nr_pages);
- if (!host_s2_mem_pgt_base)
- return -ENOMEM;
-
- nr_pages = host_s2_dev_pgtable_pages();
- host_s2_dev_pgt_base = hyp_early_alloc_contig(nr_pages);
- if (!host_s2_dev_pgt_base)
+ nr_pages = host_s2_pgtable_pages();
+ host_s2_pgt_base = hyp_early_alloc_contig(nr_pages);
+ if (!host_s2_pgt_base)
return -ENOMEM;
return 0;
@@ -144,6 +138,16 @@ static void *hyp_zalloc_hyp_page(void *arg)
return hyp_alloc_pages(&hpool, 0);
}
+static void hpool_get_page(void *addr)
+{
+ hyp_get_page(&hpool, addr);
+}
+
+static void hpool_put_page(void *addr)
+{
+ hyp_put_page(&hpool, addr);
+}
+
void __noreturn __pkvm_init_finalise(void)
{
struct kvm_host_data *host_data = this_cpu_ptr(&kvm_host_data);
@@ -159,7 +163,7 @@ void __noreturn __pkvm_init_finalise(void)
if (ret)
goto out;
- ret = kvm_host_prepare_stage2(host_s2_mem_pgt_base, host_s2_dev_pgt_base);
+ ret = kvm_host_prepare_stage2(host_s2_pgt_base);
if (ret)
goto out;
@@ -167,8 +171,8 @@ void __noreturn __pkvm_init_finalise(void)
.zalloc_page = hyp_zalloc_hyp_page,
.phys_to_virt = hyp_phys_to_virt,
.virt_to_phys = hyp_virt_to_phys,
- .get_page = hyp_get_page,
- .put_page = hyp_put_page,
+ .get_page = hpool_get_page,
+ .put_page = hpool_put_page,
};
pkvm_pgtable.mm_ops = &pkvm_pgtable_mm_ops;
diff --git a/arch/arm64/kvm/hyp/pgtable.c b/arch/arm64/kvm/hyp/pgtable.c
index e9ad7fb28ee3..05321f4165e3 100644
--- a/arch/arm64/kvm/hyp/pgtable.c
+++ b/arch/arm64/kvm/hyp/pgtable.c
@@ -577,12 +577,24 @@ static void stage2_put_pte(kvm_pte_t *ptep, struct kvm_s2_mmu *mmu, u64 addr,
mm_ops->put_page(ptep);
}
+static bool stage2_pte_cacheable(struct kvm_pgtable *pgt, kvm_pte_t pte)
+{
+ u64 memattr = pte & KVM_PTE_LEAF_ATTR_LO_S2_MEMATTR;
+ return memattr == KVM_S2_MEMATTR(pgt, NORMAL);
+}
+
+static bool stage2_pte_executable(kvm_pte_t pte)
+{
+ return !(pte & KVM_PTE_LEAF_ATTR_HI_S2_XN);
+}
+
static int stage2_map_walker_try_leaf(u64 addr, u64 end, u32 level,
kvm_pte_t *ptep,
struct stage2_map_data *data)
{
kvm_pte_t new, old = *ptep;
u64 granule = kvm_granule_size(level), phys = data->phys;
+ struct kvm_pgtable *pgt = data->mmu->pgt;
struct kvm_pgtable_mm_ops *mm_ops = data->mm_ops;
if (!kvm_block_mapping_supported(addr, end, phys, level))
@@ -606,6 +618,14 @@ static int stage2_map_walker_try_leaf(u64 addr, u64 end, u32 level,
stage2_put_pte(ptep, data->mmu, addr, level, mm_ops);
}
+ /* Perform CMOs before installation of the guest stage-2 PTE */
+ if (mm_ops->dcache_clean_inval_poc && stage2_pte_cacheable(pgt, new))
+ mm_ops->dcache_clean_inval_poc(kvm_pte_follow(new, mm_ops),
+ granule);
+
+ if (mm_ops->icache_inval_pou && stage2_pte_executable(new))
+ mm_ops->icache_inval_pou(kvm_pte_follow(new, mm_ops), granule);
+
smp_store_release(ptep, new);
if (stage2_pte_is_counted(new))
mm_ops->get_page(ptep);
@@ -798,12 +818,6 @@ int kvm_pgtable_stage2_set_owner(struct kvm_pgtable *pgt, u64 addr, u64 size,
return ret;
}
-static bool stage2_pte_cacheable(struct kvm_pgtable *pgt, kvm_pte_t pte)
-{
- u64 memattr = pte & KVM_PTE_LEAF_ATTR_LO_S2_MEMATTR;
- return memattr == KVM_S2_MEMATTR(pgt, NORMAL);
-}
-
static int stage2_unmap_walker(u64 addr, u64 end, u32 level, kvm_pte_t *ptep,
enum kvm_pgtable_walk_flags flag,
void * const arg)
@@ -864,10 +878,11 @@ int kvm_pgtable_stage2_unmap(struct kvm_pgtable *pgt, u64 addr, u64 size)
}
struct stage2_attr_data {
- kvm_pte_t attr_set;
- kvm_pte_t attr_clr;
- kvm_pte_t pte;
- u32 level;
+ kvm_pte_t attr_set;
+ kvm_pte_t attr_clr;
+ kvm_pte_t pte;
+ u32 level;
+ struct kvm_pgtable_mm_ops *mm_ops;
};
static int stage2_attr_walker(u64 addr, u64 end, u32 level, kvm_pte_t *ptep,
@@ -876,6 +891,7 @@ static int stage2_attr_walker(u64 addr, u64 end, u32 level, kvm_pte_t *ptep,
{
kvm_pte_t pte = *ptep;
struct stage2_attr_data *data = arg;
+ struct kvm_pgtable_mm_ops *mm_ops = data->mm_ops;
if (!kvm_pte_valid(pte))
return 0;
@@ -890,8 +906,17 @@ static int stage2_attr_walker(u64 addr, u64 end, u32 level, kvm_pte_t *ptep,
* but worst-case the access flag update gets lost and will be
* set on the next access instead.
*/
- if (data->pte != pte)
+ if (data->pte != pte) {
+ /*
+ * Invalidate instruction cache before updating the guest
+ * stage-2 PTE if we are going to add executable permission.
+ */
+ if (mm_ops->icache_inval_pou &&
+ stage2_pte_executable(pte) && !stage2_pte_executable(*ptep))
+ mm_ops->icache_inval_pou(kvm_pte_follow(pte, mm_ops),
+ kvm_granule_size(level));
WRITE_ONCE(*ptep, pte);
+ }
return 0;
}
@@ -906,6 +931,7 @@ static int stage2_update_leaf_attrs(struct kvm_pgtable *pgt, u64 addr,
struct stage2_attr_data data = {
.attr_set = attr_set & attr_mask,
.attr_clr = attr_clr & attr_mask,
+ .mm_ops = pgt->mm_ops,
};
struct kvm_pgtable_walker walker = {
.cb = stage2_attr_walker,
diff --git a/arch/arm64/kvm/hyp/reserved_mem.c b/arch/arm64/kvm/hyp/reserved_mem.c
index 83ca23ac259b..d654921dd09b 100644
--- a/arch/arm64/kvm/hyp/reserved_mem.c
+++ b/arch/arm64/kvm/hyp/reserved_mem.c
@@ -71,8 +71,7 @@ void __init kvm_hyp_reserve(void)
}
hyp_mem_pages += hyp_s1_pgtable_pages();
- hyp_mem_pages += host_s2_mem_pgtable_pages();
- hyp_mem_pages += host_s2_dev_pgtable_pages();
+ hyp_mem_pages += host_s2_pgtable_pages();
/*
* The hyp_vmemmap needs to be backed by pages, but these pages
diff --git a/arch/arm64/kvm/mmu.c b/arch/arm64/kvm/mmu.c
index c10207fed2f3..57292dc5ce35 100644
--- a/arch/arm64/kvm/mmu.c
+++ b/arch/arm64/kvm/mmu.c
@@ -126,6 +126,16 @@ static void *kvm_host_va(phys_addr_t phys)
return __va(phys);
}
+static void clean_dcache_guest_page(void *va, size_t size)
+{
+ __clean_dcache_guest_page(va, size);
+}
+
+static void invalidate_icache_guest_page(void *va, size_t size)
+{
+ __invalidate_icache_guest_page(va, size);
+}
+
/*
* Unmapping vs dcache management:
*
@@ -432,6 +442,8 @@ static struct kvm_pgtable_mm_ops kvm_s2_mm_ops = {
.page_count = kvm_host_page_count,
.phys_to_virt = kvm_host_va,
.virt_to_phys = kvm_host_pa,
+ .dcache_clean_inval_poc = clean_dcache_guest_page,
+ .icache_inval_pou = invalidate_icache_guest_page,
};
/**
@@ -693,16 +705,6 @@ void kvm_arch_mmu_enable_log_dirty_pt_masked(struct kvm *kvm,
kvm_mmu_write_protect_pt_masked(kvm, slot, gfn_offset, mask);
}
-static void clean_dcache_guest_page(kvm_pfn_t pfn, unsigned long size)
-{
- __clean_dcache_guest_page(pfn, size);
-}
-
-static void invalidate_icache_guest_page(kvm_pfn_t pfn, unsigned long size)
-{
- __invalidate_icache_guest_page(pfn, size);
-}
-
static void kvm_send_hwpoison_signal(unsigned long address, short lsb)
{
send_sig_mceerr(BUS_MCEERR_AR, (void __user *)address, lsb, current);
@@ -822,6 +824,74 @@ transparent_hugepage_adjust(struct kvm_memory_slot *memslot,
return PAGE_SIZE;
}
+static int get_vma_page_shift(struct vm_area_struct *vma, unsigned long hva)
+{
+ unsigned long pa;
+
+ if (is_vm_hugetlb_page(vma) && !(vma->vm_flags & VM_PFNMAP))
+ return huge_page_shift(hstate_vma(vma));
+
+ if (!(vma->vm_flags & VM_PFNMAP))
+ return PAGE_SHIFT;
+
+ VM_BUG_ON(is_vm_hugetlb_page(vma));
+
+ pa = (vma->vm_pgoff << PAGE_SHIFT) + (hva - vma->vm_start);
+
+#ifndef __PAGETABLE_PMD_FOLDED
+ if ((hva & (PUD_SIZE - 1)) == (pa & (PUD_SIZE - 1)) &&
+ ALIGN_DOWN(hva, PUD_SIZE) >= vma->vm_start &&
+ ALIGN(hva, PUD_SIZE) <= vma->vm_end)
+ return PUD_SHIFT;
+#endif
+
+ if ((hva & (PMD_SIZE - 1)) == (pa & (PMD_SIZE - 1)) &&
+ ALIGN_DOWN(hva, PMD_SIZE) >= vma->vm_start &&
+ ALIGN(hva, PMD_SIZE) <= vma->vm_end)
+ return PMD_SHIFT;
+
+ return PAGE_SHIFT;
+}
+
+/*
+ * The page will be mapped in stage 2 as Normal Cacheable, so the VM will be
+ * able to see the page's tags and therefore they must be initialised first. If
+ * PG_mte_tagged is set, tags have already been initialised.
+ *
+ * The race in the test/set of the PG_mte_tagged flag is handled by:
+ * - preventing VM_SHARED mappings in a memslot with MTE preventing two VMs
+ * racing to santise the same page
+ * - mmap_lock protects between a VM faulting a page in and the VMM performing
+ * an mprotect() to add VM_MTE
+ */
+static int sanitise_mte_tags(struct kvm *kvm, kvm_pfn_t pfn,
+ unsigned long size)
+{
+ unsigned long i, nr_pages = size >> PAGE_SHIFT;
+ struct page *page;
+
+ if (!kvm_has_mte(kvm))
+ return 0;
+
+ /*
+ * pfn_to_online_page() is used to reject ZONE_DEVICE pages
+ * that may not support tags.
+ */
+ page = pfn_to_online_page(pfn);
+
+ if (!page)
+ return -EFAULT;
+
+ for (i = 0; i < nr_pages; i++, page++) {
+ if (!test_bit(PG_mte_tagged, &page->flags)) {
+ mte_clear_page_tags(page_address(page));
+ set_bit(PG_mte_tagged, &page->flags);
+ }
+ }
+
+ return 0;
+}
+
static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa,
struct kvm_memory_slot *memslot, unsigned long hva,
unsigned long fault_status)
@@ -830,6 +900,7 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa,
bool write_fault, writable, force_pte = false;
bool exec_fault;
bool device = false;
+ bool shared;
unsigned long mmu_seq;
struct kvm *kvm = vcpu->kvm;
struct kvm_mmu_memory_cache *memcache = &vcpu->arch.mmu_page_cache;
@@ -853,7 +924,10 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa,
return -EFAULT;
}
- /* Let's check if we will get back a huge page backed by hugetlbfs */
+ /*
+ * Let's check if we will get back a huge page backed by hugetlbfs, or
+ * get block mapping for device MMIO region.
+ */
mmap_read_lock(current->mm);
vma = find_vma_intersection(current->mm, hva, hva + 1);
if (unlikely(!vma)) {
@@ -862,17 +936,19 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa,
return -EFAULT;
}
- if (is_vm_hugetlb_page(vma))
- vma_shift = huge_page_shift(hstate_vma(vma));
- else
- vma_shift = PAGE_SHIFT;
-
- if (logging_active ||
- (vma->vm_flags & VM_PFNMAP)) {
+ /*
+ * logging_active is guaranteed to never be true for VM_PFNMAP
+ * memslots.
+ */
+ if (logging_active) {
force_pte = true;
vma_shift = PAGE_SHIFT;
+ } else {
+ vma_shift = get_vma_page_shift(vma, hva);
}
+ shared = (vma->vm_flags & VM_PFNMAP);
+
switch (vma_shift) {
#ifndef __PAGETABLE_PMD_FOLDED
case PUD_SHIFT:
@@ -943,8 +1019,17 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa,
return -EFAULT;
if (kvm_is_device_pfn(pfn)) {
+ /*
+ * If the page was identified as device early by looking at
+ * the VMA flags, vma_pagesize is already representing the
+ * largest quantity we can map. If instead it was mapped
+ * via gfn_to_pfn_prot(), vma_pagesize is set to PAGE_SIZE
+ * and must not be upgraded.
+ *
+ * In both cases, we don't let transparent_hugepage_adjust()
+ * change things at the last minute.
+ */
device = true;
- force_pte = true;
} else if (logging_active && !write_fault) {
/*
* Only actually map the page as writable if this was a write
@@ -965,19 +1050,25 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa,
* If we are not forced to use page mapping, check if we are
* backed by a THP and thus use block mapping if possible.
*/
- if (vma_pagesize == PAGE_SIZE && !force_pte)
+ if (vma_pagesize == PAGE_SIZE && !(force_pte || device))
vma_pagesize = transparent_hugepage_adjust(memslot, hva,
&pfn, &fault_ipa);
+
+ if (fault_status != FSC_PERM && !device && kvm_has_mte(kvm)) {
+ /* Check the VMM hasn't introduced a new VM_SHARED VMA */
+ if (!shared)
+ ret = sanitise_mte_tags(kvm, pfn, vma_pagesize);
+ else
+ ret = -EFAULT;
+ if (ret)
+ goto out_unlock;
+ }
+
if (writable)
prot |= KVM_PGTABLE_PROT_W;
- if (fault_status != FSC_PERM && !device)
- clean_dcache_guest_page(pfn, vma_pagesize);
-
- if (exec_fault) {
+ if (exec_fault)
prot |= KVM_PGTABLE_PROT_X;
- invalidate_icache_guest_page(pfn, vma_pagesize);
- }
if (device)
prot |= KVM_PGTABLE_PROT_DEVICE;
@@ -1168,19 +1259,22 @@ bool kvm_unmap_gfn_range(struct kvm *kvm, struct kvm_gfn_range *range)
bool kvm_set_spte_gfn(struct kvm *kvm, struct kvm_gfn_range *range)
{
kvm_pfn_t pfn = pte_pfn(range->pte);
+ int ret;
if (!kvm->arch.mmu.pgt)
return false;
WARN_ON(range->end - range->start != 1);
- /*
- * We've moved a page around, probably through CoW, so let's treat it
- * just like a translation fault and clean the cache to the PoC.
- */
- clean_dcache_guest_page(pfn, PAGE_SIZE);
+ ret = sanitise_mte_tags(kvm, pfn, PAGE_SIZE);
+ if (ret)
+ return false;
/*
+ * We've moved a page around, probably through CoW, so let's treat
+ * it just like a translation fault and the map handler will clean
+ * the cache to the PoC.
+ *
* The MMU notifiers will have unmapped a huge PMD before calling
* ->change_pte() (which in turn calls kvm_set_spte_gfn()) and
* therefore we never need to clear out a huge PMD through this
@@ -1346,7 +1440,6 @@ int kvm_arch_prepare_memory_region(struct kvm *kvm,
{
hva_t hva = mem->userspace_addr;
hva_t reg_end = hva + mem->memory_size;
- bool writable = !(mem->flags & KVM_MEM_READONLY);
int ret = 0;
if (change != KVM_MR_CREATE && change != KVM_MR_MOVE &&
@@ -1363,8 +1456,7 @@ int kvm_arch_prepare_memory_region(struct kvm *kvm,
mmap_read_lock(current->mm);
/*
* A memory region could potentially cover multiple VMAs, and any holes
- * between them, so iterate over all of them to find out if we can map
- * any of them right now.
+ * between them, so iterate over all of them.
*
* +--------------------------------------------+
* +---------------+----------------+ +----------------+
@@ -1375,51 +1467,29 @@ int kvm_arch_prepare_memory_region(struct kvm *kvm,
*/
do {
struct vm_area_struct *vma;
- hva_t vm_start, vm_end;
vma = find_vma_intersection(current->mm, hva, reg_end);
if (!vma)
break;
/*
- * Take the intersection of this VMA with the memory region
+ * VM_SHARED mappings are not allowed with MTE to avoid races
+ * when updating the PG_mte_tagged page flag, see
+ * sanitise_mte_tags for more details.
*/
- vm_start = max(hva, vma->vm_start);
- vm_end = min(reg_end, vma->vm_end);
+ if (kvm_has_mte(kvm) && vma->vm_flags & VM_SHARED)
+ return -EINVAL;
if (vma->vm_flags & VM_PFNMAP) {
- gpa_t gpa = mem->guest_phys_addr +
- (vm_start - mem->userspace_addr);
- phys_addr_t pa;
-
- pa = (phys_addr_t)vma->vm_pgoff << PAGE_SHIFT;
- pa += vm_start - vma->vm_start;
-
/* IO region dirty page logging not allowed */
if (memslot->flags & KVM_MEM_LOG_DIRTY_PAGES) {
ret = -EINVAL;
- goto out;
- }
-
- ret = kvm_phys_addr_ioremap(kvm, gpa, pa,
- vm_end - vm_start,
- writable);
- if (ret)
break;
+ }
}
- hva = vm_end;
+ hva = min(reg_end, vma->vm_end);
} while (hva < reg_end);
- if (change == KVM_MR_FLAGS_ONLY)
- goto out;
-
- spin_lock(&kvm->mmu_lock);
- if (ret)
- unmap_stage2_range(&kvm->arch.mmu, mem->guest_phys_addr, mem->memory_size);
- else if (!cpus_have_final_cap(ARM64_HAS_STAGE2_FWB))
- stage2_flush_memslot(kvm, memslot);
- spin_unlock(&kvm->mmu_lock);
-out:
mmap_read_unlock(current->mm);
return ret;
}
diff --git a/arch/arm64/kvm/pmu-emul.c b/arch/arm64/kvm/pmu-emul.c
index fd167d4f4215..f33825c995cb 100644
--- a/arch/arm64/kvm/pmu-emul.c
+++ b/arch/arm64/kvm/pmu-emul.c
@@ -578,6 +578,7 @@ void kvm_pmu_handle_pmcr(struct kvm_vcpu *vcpu, u64 val)
kvm_pmu_set_counter_value(vcpu, ARMV8_PMU_CYCLE_IDX, 0);
if (val & ARMV8_PMU_PMCR_P) {
+ mask &= ~BIT(ARMV8_PMU_CYCLE_IDX);
for_each_set_bit(i, &mask, 32)
kvm_pmu_set_counter_value(vcpu, i, 0);
}
@@ -850,6 +851,9 @@ int kvm_arm_pmu_v3_enable(struct kvm_vcpu *vcpu)
return -EINVAL;
}
+ /* One-off reload of the PMU on first run */
+ kvm_make_request(KVM_REQ_RELOAD_PMU, vcpu);
+
return 0;
}
diff --git a/arch/arm64/kvm/reset.c b/arch/arm64/kvm/reset.c
index d37ebee085cf..cba7872d69a8 100644
--- a/arch/arm64/kvm/reset.c
+++ b/arch/arm64/kvm/reset.c
@@ -176,6 +176,10 @@ static bool vcpu_allowed_register_width(struct kvm_vcpu *vcpu)
if (!cpus_have_const_cap(ARM64_HAS_32BIT_EL1) && is32bit)
return false;
+ /* MTE is incompatible with AArch32 */
+ if (kvm_has_mte(vcpu->kvm) && is32bit)
+ return false;
+
/* Check that the vcpus are either all 32bit or all 64bit */
kvm_for_each_vcpu(i, tmp, vcpu->kvm) {
if (vcpu_has_feature(tmp, KVM_ARM_VCPU_EL1_32BIT) != is32bit)
diff --git a/arch/arm64/kvm/sys_regs.c b/arch/arm64/kvm/sys_regs.c
index 1a7968ad078c..f6f126eb6ac1 100644
--- a/arch/arm64/kvm/sys_regs.c
+++ b/arch/arm64/kvm/sys_regs.c
@@ -1047,6 +1047,13 @@ static u64 read_id_reg(const struct kvm_vcpu *vcpu,
break;
case SYS_ID_AA64PFR1_EL1:
val &= ~FEATURE(ID_AA64PFR1_MTE);
+ if (kvm_has_mte(vcpu->kvm)) {
+ u64 pfr, mte;
+
+ pfr = read_sanitised_ftr_reg(SYS_ID_AA64PFR1_EL1);
+ mte = cpuid_feature_extract_unsigned_field(pfr, ID_AA64PFR1_MTE_SHIFT);
+ val |= FIELD_PREP(FEATURE(ID_AA64PFR1_MTE), mte);
+ }
break;
case SYS_ID_AA64ISAR1_EL1:
if (!vcpu_has_ptrauth(vcpu))
@@ -1302,6 +1309,23 @@ static bool access_ccsidr(struct kvm_vcpu *vcpu, struct sys_reg_params *p,
return true;
}
+static unsigned int mte_visibility(const struct kvm_vcpu *vcpu,
+ const struct sys_reg_desc *rd)
+{
+ if (kvm_has_mte(vcpu->kvm))
+ return 0;
+
+ return REG_HIDDEN;
+}
+
+#define MTE_REG(name) { \
+ SYS_DESC(SYS_##name), \
+ .access = undef_access, \
+ .reset = reset_unknown, \
+ .reg = name, \
+ .visibility = mte_visibility, \
+}
+
/* sys_reg_desc initialiser for known cpufeature ID registers */
#define ID_SANITISED(name) { \
SYS_DESC(SYS_##name), \
@@ -1470,8 +1494,8 @@ static const struct sys_reg_desc sys_reg_descs[] = {
{ SYS_DESC(SYS_ACTLR_EL1), access_actlr, reset_actlr, ACTLR_EL1 },
{ SYS_DESC(SYS_CPACR_EL1), NULL, reset_val, CPACR_EL1, 0 },
- { SYS_DESC(SYS_RGSR_EL1), undef_access },
- { SYS_DESC(SYS_GCR_EL1), undef_access },
+ MTE_REG(RGSR_EL1),
+ MTE_REG(GCR_EL1),
{ SYS_DESC(SYS_ZCR_EL1), NULL, reset_val, ZCR_EL1, 0, .visibility = sve_visibility },
{ SYS_DESC(SYS_TRFCR_EL1), undef_access },
@@ -1498,8 +1522,8 @@ static const struct sys_reg_desc sys_reg_descs[] = {
{ SYS_DESC(SYS_ERXMISC0_EL1), trap_raz_wi },
{ SYS_DESC(SYS_ERXMISC1_EL1), trap_raz_wi },
- { SYS_DESC(SYS_TFSR_EL1), undef_access },
- { SYS_DESC(SYS_TFSRE0_EL1), undef_access },
+ MTE_REG(TFSR_EL1),
+ MTE_REG(TFSRE0_EL1),
{ SYS_DESC(SYS_FAR_EL1), access_vm_reg, reset_unknown, FAR_EL1 },
{ SYS_DESC(SYS_PAR_EL1), NULL, reset_unknown, PAR_EL1 },
diff --git a/arch/arm64/kvm/vgic/vgic-init.c b/arch/arm64/kvm/vgic/vgic-init.c
index 58cbda00e56d..340c51d87677 100644
--- a/arch/arm64/kvm/vgic/vgic-init.c
+++ b/arch/arm64/kvm/vgic/vgic-init.c
@@ -482,6 +482,16 @@ static irqreturn_t vgic_maintenance_handler(int irq, void *data)
return IRQ_HANDLED;
}
+static struct gic_kvm_info *gic_kvm_info;
+
+void __init vgic_set_kvm_info(const struct gic_kvm_info *info)
+{
+ BUG_ON(gic_kvm_info != NULL);
+ gic_kvm_info = kmalloc(sizeof(*info), GFP_KERNEL);
+ if (gic_kvm_info)
+ *gic_kvm_info = *info;
+}
+
/**
* kvm_vgic_init_cpu_hardware - initialize the GIC VE hardware
*
@@ -509,18 +519,29 @@ void kvm_vgic_init_cpu_hardware(void)
*/
int kvm_vgic_hyp_init(void)
{
- const struct gic_kvm_info *gic_kvm_info;
+ bool has_mask;
int ret;
- gic_kvm_info = gic_get_kvm_info();
if (!gic_kvm_info)
return -ENODEV;
- if (!gic_kvm_info->maint_irq) {
+ has_mask = !gic_kvm_info->no_maint_irq_mask;
+
+ if (has_mask && !gic_kvm_info->maint_irq) {
kvm_err("No vgic maintenance irq\n");
return -ENXIO;
}
+ /*
+ * If we get one of these oddball non-GICs, taint the kernel,
+ * as we have no idea of how they *really* behave.
+ */
+ if (gic_kvm_info->no_hw_deactivation) {
+ kvm_info("Non-architectural vgic, tainting kernel\n");
+ add_taint(TAINT_CPU_OUT_OF_SPEC, LOCKDEP_STILL_OK);
+ kvm_vgic_global_state.no_hw_deactivation = true;
+ }
+
switch (gic_kvm_info->type) {
case GIC_V2:
ret = vgic_v2_probe(gic_kvm_info);
@@ -536,10 +557,17 @@ int kvm_vgic_hyp_init(void)
ret = -ENODEV;
}
+ kvm_vgic_global_state.maint_irq = gic_kvm_info->maint_irq;
+
+ kfree(gic_kvm_info);
+ gic_kvm_info = NULL;
+
if (ret)
return ret;
- kvm_vgic_global_state.maint_irq = gic_kvm_info->maint_irq;
+ if (!has_mask)
+ return 0;
+
ret = request_percpu_irq(kvm_vgic_global_state.maint_irq,
vgic_maintenance_handler,
"vgic", kvm_get_running_vcpus());
diff --git a/arch/arm64/kvm/vgic/vgic-v2.c b/arch/arm64/kvm/vgic/vgic-v2.c
index 11934c2af2f4..2c580204f1dc 100644
--- a/arch/arm64/kvm/vgic/vgic-v2.c
+++ b/arch/arm64/kvm/vgic/vgic-v2.c
@@ -108,11 +108,22 @@ void vgic_v2_fold_lr_state(struct kvm_vcpu *vcpu)
* If this causes us to lower the level, we have to also clear
* the physical active state, since we will otherwise never be
* told when the interrupt becomes asserted again.
+ *
+ * Another case is when the interrupt requires a helping hand
+ * on deactivation (no HW deactivation, for example).
*/
- if (vgic_irq_is_mapped_level(irq) && (val & GICH_LR_PENDING_BIT)) {
- irq->line_level = vgic_get_phys_line_level(irq);
+ if (vgic_irq_is_mapped_level(irq)) {
+ bool resample = false;
+
+ if (val & GICH_LR_PENDING_BIT) {
+ irq->line_level = vgic_get_phys_line_level(irq);
+ resample = !irq->line_level;
+ } else if (vgic_irq_needs_resampling(irq) &&
+ !(irq->active || irq->pending_latch)) {
+ resample = true;
+ }
- if (!irq->line_level)
+ if (resample)
vgic_irq_set_phys_active(irq, false);
}
@@ -152,7 +163,7 @@ void vgic_v2_populate_lr(struct kvm_vcpu *vcpu, struct vgic_irq *irq, int lr)
if (irq->group)
val |= GICH_LR_GROUP1;
- if (irq->hw) {
+ if (irq->hw && !vgic_irq_needs_resampling(irq)) {
val |= GICH_LR_HW;
val |= irq->hwintid << GICH_LR_PHYSID_CPUID_SHIFT;
/*
diff --git a/arch/arm64/kvm/vgic/vgic-v3.c b/arch/arm64/kvm/vgic/vgic-v3.c
index 41ecf219c333..66004f61cd83 100644
--- a/arch/arm64/kvm/vgic/vgic-v3.c
+++ b/arch/arm64/kvm/vgic/vgic-v3.c
@@ -101,11 +101,22 @@ void vgic_v3_fold_lr_state(struct kvm_vcpu *vcpu)
* If this causes us to lower the level, we have to also clear
* the physical active state, since we will otherwise never be
* told when the interrupt becomes asserted again.
+ *
+ * Another case is when the interrupt requires a helping hand
+ * on deactivation (no HW deactivation, for example).
*/
- if (vgic_irq_is_mapped_level(irq) && (val & ICH_LR_PENDING_BIT)) {
- irq->line_level = vgic_get_phys_line_level(irq);
+ if (vgic_irq_is_mapped_level(irq)) {
+ bool resample = false;
+
+ if (val & ICH_LR_PENDING_BIT) {
+ irq->line_level = vgic_get_phys_line_level(irq);
+ resample = !irq->line_level;
+ } else if (vgic_irq_needs_resampling(irq) &&
+ !(irq->active || irq->pending_latch)) {
+ resample = true;
+ }
- if (!irq->line_level)
+ if (resample)
vgic_irq_set_phys_active(irq, false);
}
@@ -136,7 +147,7 @@ void vgic_v3_populate_lr(struct kvm_vcpu *vcpu, struct vgic_irq *irq, int lr)
}
}
- if (irq->hw) {
+ if (irq->hw && !vgic_irq_needs_resampling(irq)) {
val |= ICH_LR_HW;
val |= ((u64)irq->hwintid) << ICH_LR_PHYS_ID_SHIFT;
/*
diff --git a/arch/arm64/kvm/vgic/vgic.c b/arch/arm64/kvm/vgic/vgic.c
index 15b666200f0b..111bff47e471 100644
--- a/arch/arm64/kvm/vgic/vgic.c
+++ b/arch/arm64/kvm/vgic/vgic.c
@@ -182,8 +182,8 @@ bool vgic_get_phys_line_level(struct vgic_irq *irq)
BUG_ON(!irq->hw);
- if (irq->get_input_level)
- return irq->get_input_level(irq->intid);
+ if (irq->ops && irq->ops->get_input_level)
+ return irq->ops->get_input_level(irq->intid);
WARN_ON(irq_get_irqchip_state(irq->host_irq,
IRQCHIP_STATE_PENDING,
@@ -480,7 +480,7 @@ int kvm_vgic_inject_irq(struct kvm *kvm, int cpuid, unsigned int intid,
/* @irq->irq_lock must be held */
static int kvm_vgic_map_irq(struct kvm_vcpu *vcpu, struct vgic_irq *irq,
unsigned int host_irq,
- bool (*get_input_level)(int vindid))
+ struct irq_ops *ops)
{
struct irq_desc *desc;
struct irq_data *data;
@@ -500,7 +500,7 @@ static int kvm_vgic_map_irq(struct kvm_vcpu *vcpu, struct vgic_irq *irq,
irq->hw = true;
irq->host_irq = host_irq;
irq->hwintid = data->hwirq;
- irq->get_input_level = get_input_level;
+ irq->ops = ops;
return 0;
}
@@ -509,11 +509,11 @@ static inline void kvm_vgic_unmap_irq(struct vgic_irq *irq)
{
irq->hw = false;
irq->hwintid = 0;
- irq->get_input_level = NULL;
+ irq->ops = NULL;
}
int kvm_vgic_map_phys_irq(struct kvm_vcpu *vcpu, unsigned int host_irq,
- u32 vintid, bool (*get_input_level)(int vindid))
+ u32 vintid, struct irq_ops *ops)
{
struct vgic_irq *irq = vgic_get_irq(vcpu->kvm, vcpu, vintid);
unsigned long flags;
@@ -522,7 +522,7 @@ int kvm_vgic_map_phys_irq(struct kvm_vcpu *vcpu, unsigned int host_irq,
BUG_ON(!irq);
raw_spin_lock_irqsave(&irq->irq_lock, flags);
- ret = kvm_vgic_map_irq(vcpu, irq, host_irq, get_input_level);
+ ret = kvm_vgic_map_irq(vcpu, irq, host_irq, ops);
raw_spin_unlock_irqrestore(&irq->irq_lock, flags);
vgic_put_irq(vcpu->kvm, irq);
diff --git a/arch/mips/include/asm/kvm_host.h b/arch/mips/include/asm/kvm_host.h
index fca4547d580f..696f6b009377 100644
--- a/arch/mips/include/asm/kvm_host.h
+++ b/arch/mips/include/asm/kvm_host.h
@@ -109,10 +109,11 @@ static inline bool kvm_is_error_hva(unsigned long addr)
}
struct kvm_vm_stat {
- ulong remote_tlb_flush;
+ struct kvm_vm_stat_generic generic;
};
struct kvm_vcpu_stat {
+ struct kvm_vcpu_stat_generic generic;
u64 wait_exits;
u64 cache_exits;
u64 signal_exits;
@@ -142,12 +143,6 @@ struct kvm_vcpu_stat {
#ifdef CONFIG_CPU_LOONGSON64
u64 vz_cpucfg_exits;
#endif
- u64 halt_successful_poll;
- u64 halt_attempted_poll;
- u64 halt_poll_success_ns;
- u64 halt_poll_fail_ns;
- u64 halt_poll_invalid;
- u64 halt_wakeup;
};
struct kvm_arch_memory_slot {
diff --git a/arch/mips/kvm/Makefile b/arch/mips/kvm/Makefile
index 30cc060857c7..c67250a956b8 100644
--- a/arch/mips/kvm/Makefile
+++ b/arch/mips/kvm/Makefile
@@ -2,7 +2,7 @@
# Makefile for KVM support for MIPS
#
-common-objs-y = $(addprefix ../../../virt/kvm/, kvm_main.o coalesced_mmio.o eventfd.o)
+common-objs-y = $(addprefix ../../../virt/kvm/, kvm_main.o coalesced_mmio.o eventfd.o binary_stats.o)
EXTRA_CFLAGS += -Ivirt/kvm -Iarch/mips/kvm
diff --git a/arch/mips/kvm/mips.c b/arch/mips/kvm/mips.c
index 4d4af97dcc88..af9dd029a4e1 100644
--- a/arch/mips/kvm/mips.c
+++ b/arch/mips/kvm/mips.c
@@ -38,43 +38,63 @@
#define VECTORSPACING 0x100 /* for EI/VI mode */
#endif
-struct kvm_stats_debugfs_item debugfs_entries[] = {
- VCPU_STAT("wait", wait_exits),
- VCPU_STAT("cache", cache_exits),
- VCPU_STAT("signal", signal_exits),
- VCPU_STAT("interrupt", int_exits),
- VCPU_STAT("cop_unusable", cop_unusable_exits),
- VCPU_STAT("tlbmod", tlbmod_exits),
- VCPU_STAT("tlbmiss_ld", tlbmiss_ld_exits),
- VCPU_STAT("tlbmiss_st", tlbmiss_st_exits),
- VCPU_STAT("addrerr_st", addrerr_st_exits),
- VCPU_STAT("addrerr_ld", addrerr_ld_exits),
- VCPU_STAT("syscall", syscall_exits),
- VCPU_STAT("resvd_inst", resvd_inst_exits),
- VCPU_STAT("break_inst", break_inst_exits),
- VCPU_STAT("trap_inst", trap_inst_exits),
- VCPU_STAT("msa_fpe", msa_fpe_exits),
- VCPU_STAT("fpe", fpe_exits),
- VCPU_STAT("msa_disabled", msa_disabled_exits),
- VCPU_STAT("flush_dcache", flush_dcache_exits),
- VCPU_STAT("vz_gpsi", vz_gpsi_exits),
- VCPU_STAT("vz_gsfc", vz_gsfc_exits),
- VCPU_STAT("vz_hc", vz_hc_exits),
- VCPU_STAT("vz_grr", vz_grr_exits),
- VCPU_STAT("vz_gva", vz_gva_exits),
- VCPU_STAT("vz_ghfc", vz_ghfc_exits),
- VCPU_STAT("vz_gpa", vz_gpa_exits),
- VCPU_STAT("vz_resvd", vz_resvd_exits),
+const struct _kvm_stats_desc kvm_vm_stats_desc[] = {
+ KVM_GENERIC_VM_STATS()
+};
+static_assert(ARRAY_SIZE(kvm_vm_stats_desc) ==
+ sizeof(struct kvm_vm_stat) / sizeof(u64));
+
+const struct kvm_stats_header kvm_vm_stats_header = {
+ .name_size = KVM_STATS_NAME_SIZE,
+ .num_desc = ARRAY_SIZE(kvm_vm_stats_desc),
+ .id_offset = sizeof(struct kvm_stats_header),
+ .desc_offset = sizeof(struct kvm_stats_header) + KVM_STATS_NAME_SIZE,
+ .data_offset = sizeof(struct kvm_stats_header) + KVM_STATS_NAME_SIZE +
+ sizeof(kvm_vm_stats_desc),
+};
+
+const struct _kvm_stats_desc kvm_vcpu_stats_desc[] = {
+ KVM_GENERIC_VCPU_STATS(),
+ STATS_DESC_COUNTER(VCPU, wait_exits),
+ STATS_DESC_COUNTER(VCPU, cache_exits),
+ STATS_DESC_COUNTER(VCPU, signal_exits),
+ STATS_DESC_COUNTER(VCPU, int_exits),
+ STATS_DESC_COUNTER(VCPU, cop_unusable_exits),
+ STATS_DESC_COUNTER(VCPU, tlbmod_exits),
+ STATS_DESC_COUNTER(VCPU, tlbmiss_ld_exits),
+ STATS_DESC_COUNTER(VCPU, tlbmiss_st_exits),
+ STATS_DESC_COUNTER(VCPU, addrerr_st_exits),
+ STATS_DESC_COUNTER(VCPU, addrerr_ld_exits),
+ STATS_DESC_COUNTER(VCPU, syscall_exits),
+ STATS_DESC_COUNTER(VCPU, resvd_inst_exits),
+ STATS_DESC_COUNTER(VCPU, break_inst_exits),
+ STATS_DESC_COUNTER(VCPU, trap_inst_exits),
+ STATS_DESC_COUNTER(VCPU, msa_fpe_exits),
+ STATS_DESC_COUNTER(VCPU, fpe_exits),
+ STATS_DESC_COUNTER(VCPU, msa_disabled_exits),
+ STATS_DESC_COUNTER(VCPU, flush_dcache_exits),
+ STATS_DESC_COUNTER(VCPU, vz_gpsi_exits),
+ STATS_DESC_COUNTER(VCPU, vz_gsfc_exits),
+ STATS_DESC_COUNTER(VCPU, vz_hc_exits),
+ STATS_DESC_COUNTER(VCPU, vz_grr_exits),
+ STATS_DESC_COUNTER(VCPU, vz_gva_exits),
+ STATS_DESC_COUNTER(VCPU, vz_ghfc_exits),
+ STATS_DESC_COUNTER(VCPU, vz_gpa_exits),
+ STATS_DESC_COUNTER(VCPU, vz_resvd_exits),
#ifdef CONFIG_CPU_LOONGSON64
- VCPU_STAT("vz_cpucfg", vz_cpucfg_exits),
+ STATS_DESC_COUNTER(VCPU, vz_cpucfg_exits),
#endif
- VCPU_STAT("halt_successful_poll", halt_successful_poll),
- VCPU_STAT("halt_attempted_poll", halt_attempted_poll),
- VCPU_STAT("halt_poll_invalid", halt_poll_invalid),
- VCPU_STAT("halt_wakeup", halt_wakeup),
- VCPU_STAT("halt_poll_success_ns", halt_poll_success_ns),
- VCPU_STAT("halt_poll_fail_ns", halt_poll_fail_ns),
- {NULL}
+};
+static_assert(ARRAY_SIZE(kvm_vcpu_stats_desc) ==
+ sizeof(struct kvm_vcpu_stat) / sizeof(u64));
+
+const struct kvm_stats_header kvm_vcpu_stats_header = {
+ .name_size = KVM_STATS_NAME_SIZE,
+ .num_desc = ARRAY_SIZE(kvm_vcpu_stats_desc),
+ .id_offset = sizeof(struct kvm_stats_header),
+ .desc_offset = sizeof(struct kvm_stats_header) + KVM_STATS_NAME_SIZE,
+ .data_offset = sizeof(struct kvm_stats_header) + KVM_STATS_NAME_SIZE +
+ sizeof(kvm_vcpu_stats_desc),
};
bool kvm_trace_guest_mode_change;
diff --git a/arch/powerpc/include/asm/asm-prototypes.h b/arch/powerpc/include/asm/asm-prototypes.h
index 1c7b75834e04..02ee6f5ac9fe 100644
--- a/arch/powerpc/include/asm/asm-prototypes.h
+++ b/arch/powerpc/include/asm/asm-prototypes.h
@@ -120,6 +120,7 @@ extern s32 patch__call_flush_branch_caches3;
extern s32 patch__flush_count_cache_return;
extern s32 patch__flush_link_stack_return;
extern s32 patch__call_kvm_flush_link_stack;
+extern s32 patch__call_kvm_flush_link_stack_p9;
extern s32 patch__memset_nocache, patch__memcpy_nocache;
extern long flush_branch_caches;
@@ -140,7 +141,7 @@ void kvmhv_load_host_pmu(void);
void kvmhv_save_guest_pmu(struct kvm_vcpu *vcpu, bool pmu_in_use);
void kvmhv_load_guest_pmu(struct kvm_vcpu *vcpu);
-int __kvmhv_vcpu_entry_p9(struct kvm_vcpu *vcpu);
+void kvmppc_p9_enter_guest(struct kvm_vcpu *vcpu);
long kvmppc_h_set_dabr(struct kvm_vcpu *vcpu, unsigned long dabr);
long kvmppc_h_set_xdabr(struct kvm_vcpu *vcpu, unsigned long dabr,
diff --git a/arch/powerpc/include/asm/book3s/64/mmu.h b/arch/powerpc/include/asm/book3s/64/mmu.h
index eace8c3f7b0a..c02f42d1031e 100644
--- a/arch/powerpc/include/asm/book3s/64/mmu.h
+++ b/arch/powerpc/include/asm/book3s/64/mmu.h
@@ -19,6 +19,7 @@ struct mmu_psize_def {
int penc[MMU_PAGE_COUNT]; /* HPTE encoding */
unsigned int tlbiel; /* tlbiel supported for that page size */
unsigned long avpnm; /* bits to mask out in AVPN in the HPTE */
+ unsigned long h_rpt_pgsize; /* H_RPT_INVALIDATE page size encoding */
union {
unsigned long sllp; /* SLB L||LP (exact mask to use in slbmte) */
unsigned long ap; /* Ap encoding used by PowerISA 3.0 */
diff --git a/arch/powerpc/include/asm/book3s/64/tlbflush-radix.h b/arch/powerpc/include/asm/book3s/64/tlbflush-radix.h
index 8b33601cdb9d..a46fd37ad552 100644
--- a/arch/powerpc/include/asm/book3s/64/tlbflush-radix.h
+++ b/arch/powerpc/include/asm/book3s/64/tlbflush-radix.h
@@ -4,6 +4,10 @@
#include <asm/hvcall.h>
+#define RIC_FLUSH_TLB 0
+#define RIC_FLUSH_PWC 1
+#define RIC_FLUSH_ALL 2
+
struct vm_area_struct;
struct mm_struct;
struct mmu_gather;
diff --git a/arch/powerpc/include/asm/cputhreads.h b/arch/powerpc/include/asm/cputhreads.h
index 98c8bd155bf9..b167186aaee4 100644
--- a/arch/powerpc/include/asm/cputhreads.h
+++ b/arch/powerpc/include/asm/cputhreads.h
@@ -98,6 +98,36 @@ static inline int cpu_last_thread_sibling(int cpu)
return cpu | (threads_per_core - 1);
}
+/*
+ * tlb_thread_siblings are siblings which share a TLB. This is not
+ * architected, is not something a hypervisor could emulate and a future
+ * CPU may change behaviour even in compat mode, so this should only be
+ * used on PowerNV, and only with care.
+ */
+static inline int cpu_first_tlb_thread_sibling(int cpu)
+{
+ if (cpu_has_feature(CPU_FTR_ARCH_300) && (threads_per_core == 8))
+ return cpu & ~0x6; /* Big Core */
+ else
+ return cpu_first_thread_sibling(cpu);
+}
+
+static inline int cpu_last_tlb_thread_sibling(int cpu)
+{
+ if (cpu_has_feature(CPU_FTR_ARCH_300) && (threads_per_core == 8))
+ return cpu | 0x6; /* Big Core */
+ else
+ return cpu_last_thread_sibling(cpu);
+}
+
+static inline int cpu_tlb_thread_sibling_step(void)
+{
+ if (cpu_has_feature(CPU_FTR_ARCH_300) && (threads_per_core == 8))
+ return 2; /* Big Core */
+ else
+ return 1;
+}
+
static inline u32 get_tensr(void)
{
#ifdef CONFIG_BOOKE
diff --git a/arch/powerpc/include/asm/exception-64s.h b/arch/powerpc/include/asm/exception-64s.h
index c1a8aac01cf9..bb6f78fcf981 100644
--- a/arch/powerpc/include/asm/exception-64s.h
+++ b/arch/powerpc/include/asm/exception-64s.h
@@ -35,6 +35,19 @@
/* PACA save area size in u64 units (exgen, exmc, etc) */
#define EX_SIZE 10
+/* PACA save area offsets */
+#define EX_R9 0
+#define EX_R10 8
+#define EX_R11 16
+#define EX_R12 24
+#define EX_R13 32
+#define EX_DAR 40
+#define EX_DSISR 48
+#define EX_CCR 52
+#define EX_CFAR 56
+#define EX_PPR 64
+#define EX_CTR 72
+
/*
* maximum recursive depth of MCE exceptions
*/
diff --git a/arch/powerpc/include/asm/hvcall.h b/arch/powerpc/include/asm/hvcall.h
index e3b29eda8074..7e4b2cef40c2 100644
--- a/arch/powerpc/include/asm/hvcall.h
+++ b/arch/powerpc/include/asm/hvcall.h
@@ -413,9 +413,9 @@
#define H_RPTI_TYPE_NESTED 0x0001 /* Invalidate nested guest partition-scope */
#define H_RPTI_TYPE_TLB 0x0002 /* Invalidate TLB */
#define H_RPTI_TYPE_PWC 0x0004 /* Invalidate Page Walk Cache */
-/* Invalidate Process Table Entries if H_RPTI_TYPE_NESTED is clear */
+/* Invalidate caching of Process Table Entries if H_RPTI_TYPE_NESTED is clear */
#define H_RPTI_TYPE_PRT 0x0008
-/* Invalidate Partition Table Entries if H_RPTI_TYPE_NESTED is set */
+/* Invalidate caching of Partition Table Entries if H_RPTI_TYPE_NESTED is set */
#define H_RPTI_TYPE_PAT 0x0008
#define H_RPTI_TYPE_ALL (H_RPTI_TYPE_TLB | H_RPTI_TYPE_PWC | \
H_RPTI_TYPE_PRT)
diff --git a/arch/powerpc/include/asm/kvm_asm.h b/arch/powerpc/include/asm/kvm_asm.h
index a3633560493b..fbbf3cec92e9 100644
--- a/arch/powerpc/include/asm/kvm_asm.h
+++ b/arch/powerpc/include/asm/kvm_asm.h
@@ -147,6 +147,7 @@
#define KVM_GUEST_MODE_SKIP 2
#define KVM_GUEST_MODE_GUEST_HV 3
#define KVM_GUEST_MODE_HOST_HV 4
+#define KVM_GUEST_MODE_HV_P9 5 /* ISA >= v3.0 path */
#define KVM_INST_FETCH_FAILED -1
diff --git a/arch/powerpc/include/asm/kvm_book3s.h b/arch/powerpc/include/asm/kvm_book3s.h
index e6b53c6e21e3..caaa0f592d8e 100644
--- a/arch/powerpc/include/asm/kvm_book3s.h
+++ b/arch/powerpc/include/asm/kvm_book3s.h
@@ -307,6 +307,9 @@ void kvmhv_set_ptbl_entry(unsigned int lpid, u64 dw0, u64 dw1);
void kvmhv_release_all_nested(struct kvm *kvm);
long kvmhv_enter_nested_guest(struct kvm_vcpu *vcpu);
long kvmhv_do_nested_tlbie(struct kvm_vcpu *vcpu);
+long do_h_rpt_invalidate_pat(struct kvm_vcpu *vcpu, unsigned long lpid,
+ unsigned long type, unsigned long pg_sizes,
+ unsigned long start, unsigned long end);
int kvmhv_run_single_vcpu(struct kvm_vcpu *vcpu,
u64 time_limit, unsigned long lpcr);
void kvmhv_save_hv_regs(struct kvm_vcpu *vcpu, struct hv_guest_state *hr);
diff --git a/arch/powerpc/include/asm/kvm_book3s_64.h b/arch/powerpc/include/asm/kvm_book3s_64.h
index 9bb9bb370b53..eaf3a562bf1e 100644
--- a/arch/powerpc/include/asm/kvm_book3s_64.h
+++ b/arch/powerpc/include/asm/kvm_book3s_64.h
@@ -153,10 +153,18 @@ static inline bool kvmhv_vcpu_is_radix(struct kvm_vcpu *vcpu)
return radix;
}
+int kvmhv_vcpu_entry_p9(struct kvm_vcpu *vcpu, u64 time_limit, unsigned long lpcr);
+
#define KVM_DEFAULT_HPT_ORDER 24 /* 16MB HPT by default */
#endif
/*
+ * Invalid HDSISR value which is used to indicate when HW has not set the reg.
+ * Used to work around an errata.
+ */
+#define HDSISR_CANARY 0x7fff
+
+/*
* We use a lock bit in HPTE dword 0 to synchronize updates and
* accesses to each HPTE, and another bit to indicate non-present
* HPTEs.
diff --git a/arch/powerpc/include/asm/kvm_host.h b/arch/powerpc/include/asm/kvm_host.h
index 7f2e90db2050..9f52f282b1aa 100644
--- a/arch/powerpc/include/asm/kvm_host.h
+++ b/arch/powerpc/include/asm/kvm_host.h
@@ -81,12 +81,13 @@ struct kvmppc_book3s_shadow_vcpu;
struct kvm_nested_guest;
struct kvm_vm_stat {
- ulong remote_tlb_flush;
- ulong num_2M_pages;
- ulong num_1G_pages;
+ struct kvm_vm_stat_generic generic;
+ u64 num_2M_pages;
+ u64 num_1G_pages;
};
struct kvm_vcpu_stat {
+ struct kvm_vcpu_stat_generic generic;
u64 sum_exits;
u64 mmio_exits;
u64 signal_exits;
@@ -102,14 +103,8 @@ struct kvm_vcpu_stat {
u64 emulated_inst_exits;
u64 dec_exits;
u64 ext_intr_exits;
- u64 halt_poll_success_ns;
- u64 halt_poll_fail_ns;
u64 halt_wait_ns;
- u64 halt_successful_poll;
- u64 halt_attempted_poll;
u64 halt_successful_wait;
- u64 halt_poll_invalid;
- u64 halt_wakeup;
u64 dbell_exits;
u64 gdbell_exits;
u64 ld;
@@ -298,7 +293,6 @@ struct kvm_arch {
u8 fwnmi_enabled;
u8 secure_guest;
u8 svm_enabled;
- bool threads_indep;
bool nested_enable;
bool dawr1_enabled;
pgd_t *pgtable;
@@ -684,7 +678,12 @@ struct kvm_vcpu_arch {
ulong fault_dar;
u32 fault_dsisr;
unsigned long intr_msr;
- ulong fault_gpa; /* guest real address of page fault (POWER9) */
+ /*
+ * POWER9 and later: fault_gpa contains the guest real address of page
+ * fault for a radix guest, or segment descriptor (equivalent to result
+ * from slbmfev of SLB entry that translated the EA) for hash guests.
+ */
+ ulong fault_gpa;
#endif
#ifdef CONFIG_BOOKE
diff --git a/arch/powerpc/include/asm/kvm_ppc.h b/arch/powerpc/include/asm/kvm_ppc.h
index 5bf8ae9bb2cc..2d88944f9f34 100644
--- a/arch/powerpc/include/asm/kvm_ppc.h
+++ b/arch/powerpc/include/asm/kvm_ppc.h
@@ -129,6 +129,7 @@ extern void kvmppc_core_vcpu_put(struct kvm_vcpu *vcpu);
extern int kvmppc_core_prepare_to_enter(struct kvm_vcpu *vcpu);
extern int kvmppc_core_pending_dec(struct kvm_vcpu *vcpu);
extern void kvmppc_core_queue_machine_check(struct kvm_vcpu *vcpu, ulong flags);
+extern void kvmppc_core_queue_syscall(struct kvm_vcpu *vcpu);
extern void kvmppc_core_queue_program(struct kvm_vcpu *vcpu, ulong flags);
extern void kvmppc_core_queue_fpunavail(struct kvm_vcpu *vcpu);
extern void kvmppc_core_queue_vec_unavail(struct kvm_vcpu *vcpu);
@@ -606,6 +607,7 @@ extern void kvmppc_free_pimap(struct kvm *kvm);
extern int kvmppc_xics_rm_complete(struct kvm_vcpu *vcpu, u32 hcall);
extern void kvmppc_xics_free_icp(struct kvm_vcpu *vcpu);
extern int kvmppc_xics_hcall(struct kvm_vcpu *vcpu, u32 cmd);
+extern int kvmppc_xive_xics_hcall(struct kvm_vcpu *vcpu, u32 req);
extern u64 kvmppc_xics_get_icp(struct kvm_vcpu *vcpu);
extern int kvmppc_xics_set_icp(struct kvm_vcpu *vcpu, u64 icpval);
extern int kvmppc_xics_connect_vcpu(struct kvm_device *dev,
@@ -638,6 +640,8 @@ static inline int kvmppc_xics_enabled(struct kvm_vcpu *vcpu)
static inline void kvmppc_xics_free_icp(struct kvm_vcpu *vcpu) { }
static inline int kvmppc_xics_hcall(struct kvm_vcpu *vcpu, u32 cmd)
{ return 0; }
+static inline int kvmppc_xive_xics_hcall(struct kvm_vcpu *vcpu, u32 req)
+ { return 0; }
#endif
#ifdef CONFIG_KVM_XIVE
@@ -655,8 +659,6 @@ extern int kvmppc_xive_get_xive(struct kvm *kvm, u32 irq, u32 *server,
u32 *priority);
extern int kvmppc_xive_int_on(struct kvm *kvm, u32 irq);
extern int kvmppc_xive_int_off(struct kvm *kvm, u32 irq);
-extern void kvmppc_xive_init_module(void);
-extern void kvmppc_xive_exit_module(void);
extern int kvmppc_xive_connect_vcpu(struct kvm_device *dev,
struct kvm_vcpu *vcpu, u32 cpu);
@@ -671,6 +673,8 @@ extern int kvmppc_xive_set_icp(struct kvm_vcpu *vcpu, u64 icpval);
extern int kvmppc_xive_set_irq(struct kvm *kvm, int irq_source_id, u32 irq,
int level, bool line_status);
extern void kvmppc_xive_push_vcpu(struct kvm_vcpu *vcpu);
+extern void kvmppc_xive_pull_vcpu(struct kvm_vcpu *vcpu);
+extern void kvmppc_xive_rearm_escalation(struct kvm_vcpu *vcpu);
static inline int kvmppc_xive_enabled(struct kvm_vcpu *vcpu)
{
@@ -680,8 +684,6 @@ static inline int kvmppc_xive_enabled(struct kvm_vcpu *vcpu)
extern int kvmppc_xive_native_connect_vcpu(struct kvm_device *dev,
struct kvm_vcpu *vcpu, u32 cpu);
extern void kvmppc_xive_native_cleanup_vcpu(struct kvm_vcpu *vcpu);
-extern void kvmppc_xive_native_init_module(void);
-extern void kvmppc_xive_native_exit_module(void);
extern int kvmppc_xive_native_get_vp(struct kvm_vcpu *vcpu,
union kvmppc_one_reg *val);
extern int kvmppc_xive_native_set_vp(struct kvm_vcpu *vcpu,
@@ -695,8 +697,6 @@ static inline int kvmppc_xive_get_xive(struct kvm *kvm, u32 irq, u32 *server,
u32 *priority) { return -1; }
static inline int kvmppc_xive_int_on(struct kvm *kvm, u32 irq) { return -1; }
static inline int kvmppc_xive_int_off(struct kvm *kvm, u32 irq) { return -1; }
-static inline void kvmppc_xive_init_module(void) { }
-static inline void kvmppc_xive_exit_module(void) { }
static inline int kvmppc_xive_connect_vcpu(struct kvm_device *dev,
struct kvm_vcpu *vcpu, u32 cpu) { return -EBUSY; }
@@ -711,14 +711,14 @@ static inline int kvmppc_xive_set_icp(struct kvm_vcpu *vcpu, u64 icpval) { retur
static inline int kvmppc_xive_set_irq(struct kvm *kvm, int irq_source_id, u32 irq,
int level, bool line_status) { return -ENODEV; }
static inline void kvmppc_xive_push_vcpu(struct kvm_vcpu *vcpu) { }
+static inline void kvmppc_xive_pull_vcpu(struct kvm_vcpu *vcpu) { }
+static inline void kvmppc_xive_rearm_escalation(struct kvm_vcpu *vcpu) { }
static inline int kvmppc_xive_enabled(struct kvm_vcpu *vcpu)
{ return 0; }
static inline int kvmppc_xive_native_connect_vcpu(struct kvm_device *dev,
struct kvm_vcpu *vcpu, u32 cpu) { return -EBUSY; }
static inline void kvmppc_xive_native_cleanup_vcpu(struct kvm_vcpu *vcpu) { }
-static inline void kvmppc_xive_native_init_module(void) { }
-static inline void kvmppc_xive_native_exit_module(void) { }
static inline int kvmppc_xive_native_get_vp(struct kvm_vcpu *vcpu,
union kvmppc_one_reg *val)
{ return 0; }
@@ -754,7 +754,7 @@ long kvmppc_rm_h_stuff_tce(struct kvm_vcpu *vcpu,
unsigned long tce_value, unsigned long npages);
long int kvmppc_rm_h_confer(struct kvm_vcpu *vcpu, int target,
unsigned int yield_count);
-long kvmppc_h_random(struct kvm_vcpu *vcpu);
+long kvmppc_rm_h_random(struct kvm_vcpu *vcpu);
void kvmhv_commence_exit(int trap);
void kvmppc_realmode_machine_check(struct kvm_vcpu *vcpu);
void kvmppc_subcore_enter_guest(void);
diff --git a/arch/powerpc/include/asm/mmu_context.h b/arch/powerpc/include/asm/mmu_context.h
index 4bc45d3ed8b0..db186c539d37 100644
--- a/arch/powerpc/include/asm/mmu_context.h
+++ b/arch/powerpc/include/asm/mmu_context.h
@@ -122,12 +122,6 @@ static inline bool need_extra_context(struct mm_struct *mm, unsigned long ea)
}
#endif
-#if defined(CONFIG_KVM_BOOK3S_HV_POSSIBLE) && defined(CONFIG_PPC_RADIX_MMU)
-extern void radix_kvm_prefetch_workaround(struct mm_struct *mm);
-#else
-static inline void radix_kvm_prefetch_workaround(struct mm_struct *mm) { }
-#endif
-
extern void switch_cop(struct mm_struct *next);
extern int use_cop(unsigned long acop, struct mm_struct *mm);
extern void drop_cop(unsigned long acop, struct mm_struct *mm);
@@ -222,6 +216,18 @@ static inline void mm_context_add_copro(struct mm_struct *mm) { }
static inline void mm_context_remove_copro(struct mm_struct *mm) { }
#endif
+#if defined(CONFIG_KVM_BOOK3S_HV_POSSIBLE) && defined(CONFIG_PPC_RADIX_MMU)
+void do_h_rpt_invalidate_prt(unsigned long pid, unsigned long lpid,
+ unsigned long type, unsigned long pg_sizes,
+ unsigned long start, unsigned long end);
+#else
+static inline void do_h_rpt_invalidate_prt(unsigned long pid,
+ unsigned long lpid,
+ unsigned long type,
+ unsigned long pg_sizes,
+ unsigned long start,
+ unsigned long end) { }
+#endif
extern void switch_mm_irqs_off(struct mm_struct *prev, struct mm_struct *next,
struct task_struct *tsk);
diff --git a/arch/powerpc/include/asm/time.h b/arch/powerpc/include/asm/time.h
index 8dd3cdb25338..8c2c3dd4ddba 100644
--- a/arch/powerpc/include/asm/time.h
+++ b/arch/powerpc/include/asm/time.h
@@ -97,6 +97,18 @@ extern void div128_by_32(u64 dividend_high, u64 dividend_low,
extern void secondary_cpu_time_init(void);
extern void __init time_init(void);
+#ifdef CONFIG_PPC64
+static inline unsigned long test_irq_work_pending(void)
+{
+ unsigned long x;
+
+ asm volatile("lbz %0,%1(13)"
+ : "=r" (x)
+ : "i" (offsetof(struct paca_struct, irq_work_pending)));
+ return x;
+}
+#endif
+
DECLARE_PER_CPU(u64, decrementers_next_tb);
/* Convert timebase ticks to nanoseconds */
diff --git a/arch/powerpc/kernel/asm-offsets.c b/arch/powerpc/kernel/asm-offsets.c
index 28af4efb4587..aa267d173ded 100644
--- a/arch/powerpc/kernel/asm-offsets.c
+++ b/arch/powerpc/kernel/asm-offsets.c
@@ -534,7 +534,6 @@ int main(void)
OFFSET(VCPU_SLB_NR, kvm_vcpu, arch.slb_nr);
OFFSET(VCPU_FAULT_DSISR, kvm_vcpu, arch.fault_dsisr);
OFFSET(VCPU_FAULT_DAR, kvm_vcpu, arch.fault_dar);
- OFFSET(VCPU_FAULT_GPA, kvm_vcpu, arch.fault_gpa);
OFFSET(VCPU_INTR_MSR, kvm_vcpu, arch.intr_msr);
OFFSET(VCPU_LAST_INST, kvm_vcpu, arch.last_inst);
OFFSET(VCPU_TRAP, kvm_vcpu, arch.trap);
diff --git a/arch/powerpc/kernel/exceptions-64s.S b/arch/powerpc/kernel/exceptions-64s.S
index fa8e52a0239e..f7fc6e078d4e 100644
--- a/arch/powerpc/kernel/exceptions-64s.S
+++ b/arch/powerpc/kernel/exceptions-64s.S
@@ -21,22 +21,6 @@
#include <asm/feature-fixups.h>
#include <asm/kup.h>
-/* PACA save area offsets (exgen, exmc, etc) */
-#define EX_R9 0
-#define EX_R10 8
-#define EX_R11 16
-#define EX_R12 24
-#define EX_R13 32
-#define EX_DAR 40
-#define EX_DSISR 48
-#define EX_CCR 52
-#define EX_CFAR 56
-#define EX_PPR 64
-#define EX_CTR 72
-.if EX_SIZE != 10
- .error "EX_SIZE is wrong"
-.endif
-
/*
* Following are fixed section helper macros.
*
@@ -133,7 +117,6 @@ name:
#define IBRANCH_TO_COMMON .L_IBRANCH_TO_COMMON_\name\() /* ENTRY branch to common */
#define IREALMODE_COMMON .L_IREALMODE_COMMON_\name\() /* Common runs in realmode */
#define IMASK .L_IMASK_\name\() /* IRQ soft-mask bit */
-#define IKVM_SKIP .L_IKVM_SKIP_\name\() /* Generate KVM skip handler */
#define IKVM_REAL .L_IKVM_REAL_\name\() /* Real entry tests KVM */
#define __IKVM_REAL(name) .L_IKVM_REAL_ ## name
#define IKVM_VIRT .L_IKVM_VIRT_\name\() /* Virt entry tests KVM */
@@ -190,9 +173,6 @@ do_define_int n
.ifndef IMASK
IMASK=0
.endif
- .ifndef IKVM_SKIP
- IKVM_SKIP=0
- .endif
.ifndef IKVM_REAL
IKVM_REAL=0
.endif
@@ -207,8 +187,6 @@ do_define_int n
.endif
.endm
-#ifdef CONFIG_KVM_BOOK3S_64_HANDLER
-#ifdef CONFIG_KVM_BOOK3S_HV_POSSIBLE
/*
* All interrupts which set HSRR registers, as well as SRESET and MCE and
* syscall when invoked with "sc 1" switch to MSR[HV]=1 (HVMODE) to be taken,
@@ -238,88 +216,28 @@ do_define_int n
/*
* If an interrupt is taken while a guest is running, it is immediately routed
- * to KVM to handle. If both HV and PR KVM arepossible, KVM interrupts go first
- * to kvmppc_interrupt_hv, which handles the PR guest case.
+ * to KVM to handle.
*/
-#define kvmppc_interrupt kvmppc_interrupt_hv
-#else
-#define kvmppc_interrupt kvmppc_interrupt_pr
-#endif
-.macro KVMTEST name
+.macro KVMTEST name handler
+#ifdef CONFIG_KVM_BOOK3S_64_HANDLER
lbz r10,HSTATE_IN_GUEST(r13)
cmpwi r10,0
- bne \name\()_kvm
-.endm
-
-.macro GEN_KVM name
- .balign IFETCH_ALIGN_BYTES
-\name\()_kvm:
-
- .if IKVM_SKIP
- cmpwi r10,KVM_GUEST_MODE_SKIP
- beq 89f
- .else
-BEGIN_FTR_SECTION
- ld r10,IAREA+EX_CFAR(r13)
- std r10,HSTATE_CFAR(r13)
-END_FTR_SECTION_IFSET(CPU_FTR_CFAR)
- .endif
-
- ld r10,IAREA+EX_CTR(r13)
- mtctr r10
-BEGIN_FTR_SECTION
- ld r10,IAREA+EX_PPR(r13)
- std r10,HSTATE_PPR(r13)
-END_FTR_SECTION_IFSET(CPU_FTR_HAS_PPR)
- ld r11,IAREA+EX_R11(r13)
- ld r12,IAREA+EX_R12(r13)
- std r12,HSTATE_SCRATCH0(r13)
- sldi r12,r9,32
- ld r9,IAREA+EX_R9(r13)
- ld r10,IAREA+EX_R10(r13)
/* HSRR variants have the 0x2 bit added to their trap number */
.if IHSRR_IF_HVMODE
BEGIN_FTR_SECTION
- ori r12,r12,(IVEC + 0x2)
- FTR_SECTION_ELSE
- ori r12,r12,(IVEC)
- ALT_FTR_SECTION_END_IFSET(CPU_FTR_HVMODE | CPU_FTR_ARCH_206)
- .elseif IHSRR
- ori r12,r12,(IVEC+ 0x2)
- .else
- ori r12,r12,(IVEC)
- .endif
- b kvmppc_interrupt
-
- .if IKVM_SKIP
-89: mtocrf 0x80,r9
- ld r10,IAREA+EX_CTR(r13)
- mtctr r10
- ld r9,IAREA+EX_R9(r13)
- ld r10,IAREA+EX_R10(r13)
- ld r11,IAREA+EX_R11(r13)
- ld r12,IAREA+EX_R12(r13)
- .if IHSRR_IF_HVMODE
- BEGIN_FTR_SECTION
- b kvmppc_skip_Hinterrupt
+ li r10,(IVEC + 0x2)
FTR_SECTION_ELSE
- b kvmppc_skip_interrupt
+ li r10,(IVEC)
ALT_FTR_SECTION_END_IFSET(CPU_FTR_HVMODE | CPU_FTR_ARCH_206)
.elseif IHSRR
- b kvmppc_skip_Hinterrupt
+ li r10,(IVEC + 0x2)
.else
- b kvmppc_skip_interrupt
+ li r10,(IVEC)
.endif
- .endif
-.endm
-
-#else
-.macro KVMTEST name
-.endm
-.macro GEN_KVM name
-.endm
+ bne \handler
#endif
+.endm
/*
* This is the BOOK3S interrupt entry code macro.
@@ -461,7 +379,7 @@ END_FTR_SECTION_IFSET(CPU_FTR_CFAR)
DEFINE_FIXED_SYMBOL(\name\()_common_real)
\name\()_common_real:
.if IKVM_REAL
- KVMTEST \name
+ KVMTEST \name kvm_interrupt
.endif
ld r10,PACAKMSR(r13) /* get MSR value for kernel */
@@ -484,7 +402,7 @@ DEFINE_FIXED_SYMBOL(\name\()_common_real)
DEFINE_FIXED_SYMBOL(\name\()_common_virt)
\name\()_common_virt:
.if IKVM_VIRT
- KVMTEST \name
+ KVMTEST \name kvm_interrupt
1:
.endif
.endif /* IVIRT */
@@ -498,7 +416,7 @@ DEFINE_FIXED_SYMBOL(\name\()_common_virt)
DEFINE_FIXED_SYMBOL(\name\()_common_real)
\name\()_common_real:
.if IKVM_REAL
- KVMTEST \name
+ KVMTEST \name kvm_interrupt
.endif
.endm
@@ -1000,8 +918,6 @@ EXC_COMMON_BEGIN(system_reset_common)
EXCEPTION_RESTORE_REGS
RFI_TO_USER_OR_KERNEL
- GEN_KVM system_reset
-
/**
* Interrupt 0x200 - Machine Check Interrupt (MCE).
@@ -1070,7 +986,6 @@ INT_DEFINE_BEGIN(machine_check)
ISET_RI=0
IDAR=1
IDSISR=1
- IKVM_SKIP=1
IKVM_REAL=1
INT_DEFINE_END(machine_check)
@@ -1166,7 +1081,7 @@ END_FTR_SECTION_IFSET(CPU_FTR_HVMODE | CPU_FTR_ARCH_206)
/*
* Check if we are coming from guest. If yes, then run the normal
* exception handler which will take the
- * machine_check_kvm->kvmppc_interrupt branch to deliver the MC event
+ * machine_check_kvm->kvm_interrupt branch to deliver the MC event
* to guest.
*/
lbz r11,HSTATE_IN_GUEST(r13)
@@ -1236,8 +1151,6 @@ EXC_COMMON_BEGIN(machine_check_common)
bl machine_check_exception
b interrupt_return
- GEN_KVM machine_check
-
#ifdef CONFIG_PPC_P7_NAP
/*
@@ -1342,7 +1255,6 @@ INT_DEFINE_BEGIN(data_access)
IVEC=0x300
IDAR=1
IDSISR=1
- IKVM_SKIP=1
IKVM_REAL=1
INT_DEFINE_END(data_access)
@@ -1373,8 +1285,6 @@ ALT_MMU_FTR_SECTION_END_IFCLR(MMU_FTR_TYPE_RADIX)
REST_NVGPRS(r1)
b interrupt_return
- GEN_KVM data_access
-
/**
* Interrupt 0x380 - Data Segment Interrupt (DSLB).
@@ -1396,7 +1306,6 @@ ALT_MMU_FTR_SECTION_END_IFCLR(MMU_FTR_TYPE_RADIX)
INT_DEFINE_BEGIN(data_access_slb)
IVEC=0x380
IDAR=1
- IKVM_SKIP=1
IKVM_REAL=1
INT_DEFINE_END(data_access_slb)
@@ -1425,8 +1334,6 @@ ALT_MMU_FTR_SECTION_END_IFCLR(MMU_FTR_TYPE_RADIX)
bl do_bad_slb_fault
b interrupt_return
- GEN_KVM data_access_slb
-
/**
* Interrupt 0x400 - Instruction Storage Interrupt (ISI).
@@ -1463,8 +1370,6 @@ MMU_FTR_SECTION_ELSE
ALT_MMU_FTR_SECTION_END_IFCLR(MMU_FTR_TYPE_RADIX)
b interrupt_return
- GEN_KVM instruction_access
-
/**
* Interrupt 0x480 - Instruction Segment Interrupt (ISLB).
@@ -1509,8 +1414,6 @@ ALT_MMU_FTR_SECTION_END_IFCLR(MMU_FTR_TYPE_RADIX)
bl do_bad_slb_fault
b interrupt_return
- GEN_KVM instruction_access_slb
-
/**
* Interrupt 0x500 - External Interrupt.
@@ -1555,8 +1458,6 @@ EXC_COMMON_BEGIN(hardware_interrupt_common)
bl do_IRQ
b interrupt_return
- GEN_KVM hardware_interrupt
-
/**
* Interrupt 0x600 - Alignment Interrupt
@@ -1584,8 +1485,6 @@ EXC_COMMON_BEGIN(alignment_common)
REST_NVGPRS(r1) /* instruction emulation may change GPRs */
b interrupt_return
- GEN_KVM alignment
-
/**
* Interrupt 0x700 - Program Interrupt (program check).
@@ -1693,8 +1592,6 @@ EXC_COMMON_BEGIN(program_check_common)
REST_NVGPRS(r1) /* instruction emulation may change GPRs */
b interrupt_return
- GEN_KVM program_check
-
/*
* Interrupt 0x800 - Floating-Point Unavailable Interrupt.
@@ -1744,8 +1641,6 @@ END_FTR_SECTION_IFSET(CPU_FTR_TM)
b interrupt_return
#endif
- GEN_KVM fp_unavailable
-
/**
* Interrupt 0x900 - Decrementer Interrupt.
@@ -1784,8 +1679,6 @@ EXC_COMMON_BEGIN(decrementer_common)
bl timer_interrupt
b interrupt_return
- GEN_KVM decrementer
-
/**
* Interrupt 0x980 - Hypervisor Decrementer Interrupt.
@@ -1831,8 +1724,6 @@ EXC_COMMON_BEGIN(hdecrementer_common)
ld r13,PACA_EXGEN+EX_R13(r13)
HRFI_TO_KERNEL
- GEN_KVM hdecrementer
-
/**
* Interrupt 0xa00 - Directed Privileged Doorbell Interrupt.
@@ -1872,8 +1763,6 @@ EXC_COMMON_BEGIN(doorbell_super_common)
#endif
b interrupt_return
- GEN_KVM doorbell_super
-
EXC_REAL_NONE(0xb00, 0x100)
EXC_VIRT_NONE(0x4b00, 0x100)
@@ -1923,7 +1812,7 @@ INT_DEFINE_END(system_call)
GET_PACA(r13)
std r10,PACA_EXGEN+EX_R10(r13)
INTERRUPT_TO_KERNEL
- KVMTEST system_call /* uses r10, branch to system_call_kvm */
+ KVMTEST system_call kvm_hcall /* uses r10, branch to kvm_hcall */
mfctr r9
#else
mr r9,r13
@@ -1979,14 +1868,16 @@ EXC_VIRT_BEGIN(system_call, 0x4c00, 0x100)
EXC_VIRT_END(system_call, 0x4c00, 0x100)
#ifdef CONFIG_KVM_BOOK3S_64_HANDLER
-TRAMP_REAL_BEGIN(system_call_kvm)
- /*
- * This is a hcall, so register convention is as above, with these
- * differences:
- * r13 = PACA
- * ctr = orig r13
- * orig r10 saved in PACA
- */
+TRAMP_REAL_BEGIN(kvm_hcall)
+ std r9,PACA_EXGEN+EX_R9(r13)
+ std r11,PACA_EXGEN+EX_R11(r13)
+ std r12,PACA_EXGEN+EX_R12(r13)
+ mfcr r9
+ mfctr r10
+ std r10,PACA_EXGEN+EX_R13(r13)
+ li r10,0
+ std r10,PACA_EXGEN+EX_CFAR(r13)
+ std r10,PACA_EXGEN+EX_CTR(r13)
/*
* Save the PPR (on systems that support it) before changing to
* HMT_MEDIUM. That allows the KVM code to save that value into the
@@ -1994,31 +1885,24 @@ TRAMP_REAL_BEGIN(system_call_kvm)
*/
BEGIN_FTR_SECTION
mfspr r10,SPRN_PPR
- std r10,HSTATE_PPR(r13)
+ std r10,PACA_EXGEN+EX_PPR(r13)
END_FTR_SECTION_IFSET(CPU_FTR_HAS_PPR)
+
HMT_MEDIUM
- mfctr r10
- SET_SCRATCH0(r10)
- mfcr r10
- std r12,HSTATE_SCRATCH0(r13)
- sldi r12,r10,32
- ori r12,r12,0xc00
+
#ifdef CONFIG_RELOCATABLE
/*
- * Requires __LOAD_FAR_HANDLER beause kvmppc_interrupt lives
+ * Requires __LOAD_FAR_HANDLER beause kvmppc_hcall lives
* outside the head section.
*/
- __LOAD_FAR_HANDLER(r10, kvmppc_interrupt)
+ __LOAD_FAR_HANDLER(r10, kvmppc_hcall)
mtctr r10
- ld r10,PACA_EXGEN+EX_R10(r13)
bctr
#else
- ld r10,PACA_EXGEN+EX_R10(r13)
- b kvmppc_interrupt
+ b kvmppc_hcall
#endif
#endif
-
/**
* Interrupt 0xd00 - Trace Interrupt.
* This is a synchronous interrupt in response to instruction step or
@@ -2043,8 +1927,6 @@ EXC_COMMON_BEGIN(single_step_common)
bl single_step_exception
b interrupt_return
- GEN_KVM single_step
-
/**
* Interrupt 0xe00 - Hypervisor Data Storage Interrupt (HDSI).
@@ -2063,7 +1945,6 @@ INT_DEFINE_BEGIN(h_data_storage)
IHSRR=1
IDAR=1
IDSISR=1
- IKVM_SKIP=1
IKVM_REAL=1
IKVM_VIRT=1
INT_DEFINE_END(h_data_storage)
@@ -2084,8 +1965,6 @@ MMU_FTR_SECTION_ELSE
ALT_MMU_FTR_SECTION_END_IFSET(MMU_FTR_TYPE_RADIX)
b interrupt_return
- GEN_KVM h_data_storage
-
/**
* Interrupt 0xe20 - Hypervisor Instruction Storage Interrupt (HISI).
@@ -2111,8 +1990,6 @@ EXC_COMMON_BEGIN(h_instr_storage_common)
bl unknown_exception
b interrupt_return
- GEN_KVM h_instr_storage
-
/**
* Interrupt 0xe40 - Hypervisor Emulation Assistance Interrupt.
@@ -2137,8 +2014,6 @@ EXC_COMMON_BEGIN(emulation_assist_common)
REST_NVGPRS(r1) /* instruction emulation may change GPRs */
b interrupt_return
- GEN_KVM emulation_assist
-
/**
* Interrupt 0xe60 - Hypervisor Maintenance Interrupt (HMI).
@@ -2210,16 +2085,12 @@ EXC_COMMON_BEGIN(hmi_exception_early_common)
EXCEPTION_RESTORE_REGS hsrr=1
GEN_INT_ENTRY hmi_exception, virt=0
- GEN_KVM hmi_exception_early
-
EXC_COMMON_BEGIN(hmi_exception_common)
GEN_COMMON hmi_exception
addi r3,r1,STACK_FRAME_OVERHEAD
bl handle_hmi_exception
b interrupt_return
- GEN_KVM hmi_exception
-
/**
* Interrupt 0xe80 - Directed Hypervisor Doorbell Interrupt.
@@ -2250,8 +2121,6 @@ EXC_COMMON_BEGIN(h_doorbell_common)
#endif
b interrupt_return
- GEN_KVM h_doorbell
-
/**
* Interrupt 0xea0 - Hypervisor Virtualization Interrupt.
@@ -2278,8 +2147,6 @@ EXC_COMMON_BEGIN(h_virt_irq_common)
bl do_IRQ
b interrupt_return
- GEN_KVM h_virt_irq
-
EXC_REAL_NONE(0xec0, 0x20)
EXC_VIRT_NONE(0x4ec0, 0x20)
@@ -2323,8 +2190,6 @@ EXC_COMMON_BEGIN(performance_monitor_common)
bl performance_monitor_exception
b interrupt_return
- GEN_KVM performance_monitor
-
/**
* Interrupt 0xf20 - Vector Unavailable Interrupt.
@@ -2374,8 +2239,6 @@ END_FTR_SECTION_IFSET(CPU_FTR_ALTIVEC)
bl altivec_unavailable_exception
b interrupt_return
- GEN_KVM altivec_unavailable
-
/**
* Interrupt 0xf40 - VSX Unavailable Interrupt.
@@ -2424,8 +2287,6 @@ END_FTR_SECTION_IFSET(CPU_FTR_VSX)
bl vsx_unavailable_exception
b interrupt_return
- GEN_KVM vsx_unavailable
-
/**
* Interrupt 0xf60 - Facility Unavailable Interrupt.
@@ -2454,8 +2315,6 @@ EXC_COMMON_BEGIN(facility_unavailable_common)
REST_NVGPRS(r1) /* instruction emulation may change GPRs */
b interrupt_return
- GEN_KVM facility_unavailable
-
/**
* Interrupt 0xf60 - Hypervisor Facility Unavailable Interrupt.
@@ -2484,8 +2343,6 @@ EXC_COMMON_BEGIN(h_facility_unavailable_common)
REST_NVGPRS(r1) /* XXX Shouldn't be necessary in practice */
b interrupt_return
- GEN_KVM h_facility_unavailable
-
EXC_REAL_NONE(0xfa0, 0x20)
EXC_VIRT_NONE(0x4fa0, 0x20)
@@ -2515,8 +2372,6 @@ EXC_COMMON_BEGIN(cbe_system_error_common)
bl cbe_system_error_exception
b interrupt_return
- GEN_KVM cbe_system_error
-
#else /* CONFIG_CBE_RAS */
EXC_REAL_NONE(0x1200, 0x100)
EXC_VIRT_NONE(0x5200, 0x100)
@@ -2548,8 +2403,6 @@ EXC_COMMON_BEGIN(instruction_breakpoint_common)
bl instruction_breakpoint_exception
b interrupt_return
- GEN_KVM instruction_breakpoint
-
EXC_REAL_NONE(0x1400, 0x100)
EXC_VIRT_NONE(0x5400, 0x100)
@@ -2670,8 +2523,6 @@ EXC_COMMON_BEGIN(denorm_exception_common)
bl unknown_exception
b interrupt_return
- GEN_KVM denorm_exception
-
#ifdef CONFIG_CBE_RAS
INT_DEFINE_BEGIN(cbe_maintenance)
@@ -2689,8 +2540,6 @@ EXC_COMMON_BEGIN(cbe_maintenance_common)
bl cbe_maintenance_exception
b interrupt_return
- GEN_KVM cbe_maintenance
-
#else /* CONFIG_CBE_RAS */
EXC_REAL_NONE(0x1600, 0x100)
EXC_VIRT_NONE(0x5600, 0x100)
@@ -2721,8 +2570,6 @@ EXC_COMMON_BEGIN(altivec_assist_common)
#endif
b interrupt_return
- GEN_KVM altivec_assist
-
#ifdef CONFIG_CBE_RAS
INT_DEFINE_BEGIN(cbe_thermal)
@@ -2740,8 +2587,6 @@ EXC_COMMON_BEGIN(cbe_thermal_common)
bl cbe_thermal_exception
b interrupt_return
- GEN_KVM cbe_thermal
-
#else /* CONFIG_CBE_RAS */
EXC_REAL_NONE(0x1800, 0x100)
EXC_VIRT_NONE(0x5800, 0x100)
@@ -2994,6 +2839,15 @@ TRAMP_REAL_BEGIN(rfscv_flush_fallback)
USE_TEXT_SECTION()
+#ifdef CONFIG_KVM_BOOK3S_64_HANDLER
+kvm_interrupt:
+ /*
+ * The conditional branch in KVMTEST can't reach all the way,
+ * make a stub.
+ */
+ b kvmppc_interrupt
+#endif
+
_GLOBAL(do_uaccess_flush)
UACCESS_FLUSH_FIXUP_SECTION
nop
@@ -3009,32 +2863,6 @@ EXPORT_SYMBOL(do_uaccess_flush)
MASKED_INTERRUPT
MASKED_INTERRUPT hsrr=1
-#ifdef CONFIG_KVM_BOOK3S_64_HANDLER
-kvmppc_skip_interrupt:
- /*
- * Here all GPRs are unchanged from when the interrupt happened
- * except for r13, which is saved in SPRG_SCRATCH0.
- */
- mfspr r13, SPRN_SRR0
- addi r13, r13, 4
- mtspr SPRN_SRR0, r13
- GET_SCRATCH0(r13)
- RFI_TO_KERNEL
- b .
-
-kvmppc_skip_Hinterrupt:
- /*
- * Here all GPRs are unchanged from when the interrupt happened
- * except for r13, which is saved in SPRG_SCRATCH0.
- */
- mfspr r13, SPRN_HSRR0
- addi r13, r13, 4
- mtspr SPRN_HSRR0, r13
- GET_SCRATCH0(r13)
- HRFI_TO_KERNEL
- b .
-#endif
-
/*
* Relocation-on interrupts: A subset of the interrupts can be delivered
* with IR=1/DR=1, if AIL==2 and MSR.HV won't be changed by delivering
diff --git a/arch/powerpc/kernel/security.c b/arch/powerpc/kernel/security.c
index 0fdfcdd9d880..c17d1c9362b5 100644
--- a/arch/powerpc/kernel/security.c
+++ b/arch/powerpc/kernel/security.c
@@ -432,16 +432,19 @@ device_initcall(stf_barrier_debugfs_init);
static void update_branch_cache_flush(void)
{
- u32 *site;
+ u32 *site, __maybe_unused *site2;
#ifdef CONFIG_KVM_BOOK3S_HV_POSSIBLE
site = &patch__call_kvm_flush_link_stack;
+ site2 = &patch__call_kvm_flush_link_stack_p9;
// This controls the branch from guest_exit_cont to kvm_flush_link_stack
if (link_stack_flush_type == BRANCH_CACHE_FLUSH_NONE) {
patch_instruction_site(site, ppc_inst(PPC_INST_NOP));
+ patch_instruction_site(site2, ppc_inst(PPC_INST_NOP));
} else {
// Could use HW flush, but that could also flush count cache
patch_branch_site(site, (u64)&kvm_flush_link_stack, BRANCH_SET_LINK);
+ patch_branch_site(site2, (u64)&kvm_flush_link_stack, BRANCH_SET_LINK);
}
#endif
diff --git a/arch/powerpc/kernel/time.c b/arch/powerpc/kernel/time.c
index b67d93a609a2..da995c5fb97d 100644
--- a/arch/powerpc/kernel/time.c
+++ b/arch/powerpc/kernel/time.c
@@ -508,16 +508,6 @@ EXPORT_SYMBOL(profile_pc);
* 64-bit uses a byte in the PACA, 32-bit uses a per-cpu variable...
*/
#ifdef CONFIG_PPC64
-static inline unsigned long test_irq_work_pending(void)
-{
- unsigned long x;
-
- asm volatile("lbz %0,%1(13)"
- : "=r" (x)
- : "i" (offsetof(struct paca_struct, irq_work_pending)));
- return x;
-}
-
static inline void set_irq_work_pending_flag(void)
{
asm volatile("stb %0,%1(13)" : :
diff --git a/arch/powerpc/kvm/Makefile b/arch/powerpc/kvm/Makefile
index 2bfeaa13befb..583c14ef596e 100644
--- a/arch/powerpc/kvm/Makefile
+++ b/arch/powerpc/kvm/Makefile
@@ -6,7 +6,7 @@
ccflags-y := -Ivirt/kvm -Iarch/powerpc/kvm
KVM := ../../../virt/kvm
-common-objs-y = $(KVM)/kvm_main.o $(KVM)/eventfd.o
+common-objs-y = $(KVM)/kvm_main.o $(KVM)/eventfd.o $(KVM)/binary_stats.o
common-objs-$(CONFIG_KVM_VFIO) += $(KVM)/vfio.o
common-objs-$(CONFIG_KVM_MMIO) += $(KVM)/coalesced_mmio.o
@@ -57,6 +57,7 @@ kvm-pr-y := \
book3s_32_mmu.o
kvm-book3s_64-builtin-objs-$(CONFIG_KVM_BOOK3S_64_HANDLER) += \
+ book3s_64_entry.o \
tm.o
ifdef CONFIG_KVM_BOOK3S_PR_POSSIBLE
@@ -86,6 +87,7 @@ kvm-book3s_64-builtin-tm-objs-$(CONFIG_PPC_TRANSACTIONAL_MEM) += \
ifdef CONFIG_KVM_BOOK3S_HV_POSSIBLE
kvm-book3s_64-builtin-objs-$(CONFIG_KVM_BOOK3S_64_HANDLER) += \
book3s_hv_hmi.o \
+ book3s_hv_p9_entry.o \
book3s_hv_rmhandlers.o \
book3s_hv_rm_mmu.o \
book3s_hv_ras.o \
diff --git a/arch/powerpc/kvm/book3s.c b/arch/powerpc/kvm/book3s.c
index 2b691f4d1f26..79833f78d1da 100644
--- a/arch/powerpc/kvm/book3s.c
+++ b/arch/powerpc/kvm/book3s.c
@@ -38,37 +38,66 @@
/* #define EXIT_DEBUG */
-struct kvm_stats_debugfs_item debugfs_entries[] = {
- VCPU_STAT("exits", sum_exits),
- VCPU_STAT("mmio", mmio_exits),
- VCPU_STAT("sig", signal_exits),
- VCPU_STAT("sysc", syscall_exits),
- VCPU_STAT("inst_emu", emulated_inst_exits),
- VCPU_STAT("dec", dec_exits),
- VCPU_STAT("ext_intr", ext_intr_exits),
- VCPU_STAT("queue_intr", queue_intr),
- VCPU_STAT("halt_poll_success_ns", halt_poll_success_ns),
- VCPU_STAT("halt_poll_fail_ns", halt_poll_fail_ns),
- VCPU_STAT("halt_wait_ns", halt_wait_ns),
- VCPU_STAT("halt_successful_poll", halt_successful_poll),
- VCPU_STAT("halt_attempted_poll", halt_attempted_poll),
- VCPU_STAT("halt_successful_wait", halt_successful_wait),
- VCPU_STAT("halt_poll_invalid", halt_poll_invalid),
- VCPU_STAT("halt_wakeup", halt_wakeup),
- VCPU_STAT("pf_storage", pf_storage),
- VCPU_STAT("sp_storage", sp_storage),
- VCPU_STAT("pf_instruc", pf_instruc),
- VCPU_STAT("sp_instruc", sp_instruc),
- VCPU_STAT("ld", ld),
- VCPU_STAT("ld_slow", ld_slow),
- VCPU_STAT("st", st),
- VCPU_STAT("st_slow", st_slow),
- VCPU_STAT("pthru_all", pthru_all),
- VCPU_STAT("pthru_host", pthru_host),
- VCPU_STAT("pthru_bad_aff", pthru_bad_aff),
- VM_STAT("largepages_2M", num_2M_pages, .mode = 0444),
- VM_STAT("largepages_1G", num_1G_pages, .mode = 0444),
- { NULL }
+const struct _kvm_stats_desc kvm_vm_stats_desc[] = {
+ KVM_GENERIC_VM_STATS(),
+ STATS_DESC_ICOUNTER(VM, num_2M_pages),
+ STATS_DESC_ICOUNTER(VM, num_1G_pages)
+};
+static_assert(ARRAY_SIZE(kvm_vm_stats_desc) ==
+ sizeof(struct kvm_vm_stat) / sizeof(u64));
+
+const struct kvm_stats_header kvm_vm_stats_header = {
+ .name_size = KVM_STATS_NAME_SIZE,
+ .num_desc = ARRAY_SIZE(kvm_vm_stats_desc),
+ .id_offset = sizeof(struct kvm_stats_header),
+ .desc_offset = sizeof(struct kvm_stats_header) + KVM_STATS_NAME_SIZE,
+ .data_offset = sizeof(struct kvm_stats_header) + KVM_STATS_NAME_SIZE +
+ sizeof(kvm_vm_stats_desc),
+};
+
+const struct _kvm_stats_desc kvm_vcpu_stats_desc[] = {
+ KVM_GENERIC_VCPU_STATS(),
+ STATS_DESC_COUNTER(VCPU, sum_exits),
+ STATS_DESC_COUNTER(VCPU, mmio_exits),
+ STATS_DESC_COUNTER(VCPU, signal_exits),
+ STATS_DESC_COUNTER(VCPU, light_exits),
+ STATS_DESC_COUNTER(VCPU, itlb_real_miss_exits),
+ STATS_DESC_COUNTER(VCPU, itlb_virt_miss_exits),
+ STATS_DESC_COUNTER(VCPU, dtlb_real_miss_exits),
+ STATS_DESC_COUNTER(VCPU, dtlb_virt_miss_exits),
+ STATS_DESC_COUNTER(VCPU, syscall_exits),
+ STATS_DESC_COUNTER(VCPU, isi_exits),
+ STATS_DESC_COUNTER(VCPU, dsi_exits),
+ STATS_DESC_COUNTER(VCPU, emulated_inst_exits),
+ STATS_DESC_COUNTER(VCPU, dec_exits),
+ STATS_DESC_COUNTER(VCPU, ext_intr_exits),
+ STATS_DESC_TIME_NSEC(VCPU, halt_wait_ns),
+ STATS_DESC_COUNTER(VCPU, halt_successful_wait),
+ STATS_DESC_COUNTER(VCPU, dbell_exits),
+ STATS_DESC_COUNTER(VCPU, gdbell_exits),
+ STATS_DESC_COUNTER(VCPU, ld),
+ STATS_DESC_COUNTER(VCPU, st),
+ STATS_DESC_COUNTER(VCPU, pf_storage),
+ STATS_DESC_COUNTER(VCPU, pf_instruc),
+ STATS_DESC_COUNTER(VCPU, sp_storage),
+ STATS_DESC_COUNTER(VCPU, sp_instruc),
+ STATS_DESC_COUNTER(VCPU, queue_intr),
+ STATS_DESC_COUNTER(VCPU, ld_slow),
+ STATS_DESC_COUNTER(VCPU, st_slow),
+ STATS_DESC_COUNTER(VCPU, pthru_all),
+ STATS_DESC_COUNTER(VCPU, pthru_host),
+ STATS_DESC_COUNTER(VCPU, pthru_bad_aff)
+};
+static_assert(ARRAY_SIZE(kvm_vcpu_stats_desc) ==
+ sizeof(struct kvm_vcpu_stat) / sizeof(u64));
+
+const struct kvm_stats_header kvm_vcpu_stats_header = {
+ .name_size = KVM_STATS_NAME_SIZE,
+ .num_desc = ARRAY_SIZE(kvm_vcpu_stats_desc),
+ .id_offset = sizeof(struct kvm_stats_header),
+ .desc_offset = sizeof(struct kvm_stats_header) + KVM_STATS_NAME_SIZE,
+ .data_offset = sizeof(struct kvm_stats_header) + KVM_STATS_NAME_SIZE +
+ sizeof(kvm_vcpu_stats_desc),
};
static inline void kvmppc_update_int_pending(struct kvm_vcpu *vcpu,
@@ -171,6 +200,12 @@ void kvmppc_core_queue_machine_check(struct kvm_vcpu *vcpu, ulong flags)
}
EXPORT_SYMBOL_GPL(kvmppc_core_queue_machine_check);
+void kvmppc_core_queue_syscall(struct kvm_vcpu *vcpu)
+{
+ kvmppc_inject_interrupt(vcpu, BOOK3S_INTERRUPT_SYSCALL, 0);
+}
+EXPORT_SYMBOL(kvmppc_core_queue_syscall);
+
void kvmppc_core_queue_program(struct kvm_vcpu *vcpu, ulong flags)
{
/* might as well deliver this straight away */
@@ -1044,13 +1079,10 @@ static int kvmppc_book3s_init(void)
#ifdef CONFIG_KVM_XICS
#ifdef CONFIG_KVM_XIVE
if (xics_on_xive()) {
- kvmppc_xive_init_module();
kvm_register_device_ops(&kvm_xive_ops, KVM_DEV_TYPE_XICS);
- if (kvmppc_xive_native_supported()) {
- kvmppc_xive_native_init_module();
+ if (kvmppc_xive_native_supported())
kvm_register_device_ops(&kvm_xive_native_ops,
KVM_DEV_TYPE_XIVE);
- }
} else
#endif
kvm_register_device_ops(&kvm_xics_ops, KVM_DEV_TYPE_XICS);
@@ -1060,12 +1092,6 @@ static int kvmppc_book3s_init(void)
static void kvmppc_book3s_exit(void)
{
-#ifdef CONFIG_KVM_XICS
- if (xics_on_xive()) {
- kvmppc_xive_exit_module();
- kvmppc_xive_native_exit_module();
- }
-#endif
#ifdef CONFIG_KVM_BOOK3S_32_HANDLER
kvmppc_book3s_exit_pr();
#endif
diff --git a/arch/powerpc/kvm/book3s_64_entry.S b/arch/powerpc/kvm/book3s_64_entry.S
new file mode 100644
index 000000000000..983b8c18bc31
--- /dev/null
+++ b/arch/powerpc/kvm/book3s_64_entry.S
@@ -0,0 +1,416 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+#include <asm/asm-offsets.h>
+#include <asm/cache.h>
+#include <asm/code-patching-asm.h>
+#include <asm/exception-64s.h>
+#include <asm/export.h>
+#include <asm/kvm_asm.h>
+#include <asm/kvm_book3s_asm.h>
+#include <asm/mmu.h>
+#include <asm/ppc_asm.h>
+#include <asm/ptrace.h>
+#include <asm/reg.h>
+#include <asm/ultravisor-api.h>
+
+/*
+ * These are branched to from interrupt handlers in exception-64s.S which set
+ * IKVM_REAL or IKVM_VIRT, if HSTATE_IN_GUEST was found to be non-zero.
+ */
+
+/*
+ * This is a hcall, so register convention is as
+ * Documentation/powerpc/papr_hcalls.rst.
+ *
+ * This may also be a syscall from PR-KVM userspace that is to be
+ * reflected to the PR guest kernel, so registers may be set up for
+ * a system call rather than hcall. We don't currently clobber
+ * anything here, but the 0xc00 handler has already clobbered CTR
+ * and CR0, so PR-KVM can not support a guest kernel that preserves
+ * those registers across its system calls.
+ *
+ * The state of registers is as kvmppc_interrupt, except CFAR is not
+ * saved, R13 is not in SCRATCH0, and R10 does not contain the trap.
+ */
+.global kvmppc_hcall
+.balign IFETCH_ALIGN_BYTES
+kvmppc_hcall:
+#ifdef CONFIG_KVM_BOOK3S_HV_POSSIBLE
+ lbz r10,HSTATE_IN_GUEST(r13)
+ cmpwi r10,KVM_GUEST_MODE_HV_P9
+ beq kvmppc_p9_exit_hcall
+#endif
+ ld r10,PACA_EXGEN+EX_R13(r13)
+ SET_SCRATCH0(r10)
+ li r10,0xc00
+ /* Now we look like kvmppc_interrupt */
+ li r11,PACA_EXGEN
+ b .Lgot_save_area
+
+/*
+ * KVM interrupt entry occurs after GEN_INT_ENTRY runs, and follows that
+ * call convention:
+ *
+ * guest R9-R13, CTR, CFAR, PPR saved in PACA EX_xxx save area
+ * guest (H)DAR, (H)DSISR are also in the save area for relevant interrupts
+ * guest R13 also saved in SCRATCH0
+ * R13 = PACA
+ * R11 = (H)SRR0
+ * R12 = (H)SRR1
+ * R9 = guest CR
+ * PPR is set to medium
+ *
+ * With the addition for KVM:
+ * R10 = trap vector
+ */
+.global kvmppc_interrupt
+.balign IFETCH_ALIGN_BYTES
+kvmppc_interrupt:
+#ifdef CONFIG_KVM_BOOK3S_HV_POSSIBLE
+ std r10,HSTATE_SCRATCH0(r13)
+ lbz r10,HSTATE_IN_GUEST(r13)
+ cmpwi r10,KVM_GUEST_MODE_HV_P9
+ beq kvmppc_p9_exit_interrupt
+ ld r10,HSTATE_SCRATCH0(r13)
+#endif
+ li r11,PACA_EXGEN
+ cmpdi r10,0x200
+ bgt+ .Lgot_save_area
+ li r11,PACA_EXMC
+ beq .Lgot_save_area
+ li r11,PACA_EXNMI
+.Lgot_save_area:
+ add r11,r11,r13
+BEGIN_FTR_SECTION
+ ld r12,EX_CFAR(r11)
+ std r12,HSTATE_CFAR(r13)
+END_FTR_SECTION_IFSET(CPU_FTR_CFAR)
+ ld r12,EX_CTR(r11)
+ mtctr r12
+BEGIN_FTR_SECTION
+ ld r12,EX_PPR(r11)
+ std r12,HSTATE_PPR(r13)
+END_FTR_SECTION_IFSET(CPU_FTR_HAS_PPR)
+ ld r12,EX_R12(r11)
+ std r12,HSTATE_SCRATCH0(r13)
+ sldi r12,r9,32
+ or r12,r12,r10
+ ld r9,EX_R9(r11)
+ ld r10,EX_R10(r11)
+ ld r11,EX_R11(r11)
+
+ /*
+ * Hcalls and other interrupts come here after normalising register
+ * contents and save locations:
+ *
+ * R12 = (guest CR << 32) | interrupt vector
+ * R13 = PACA
+ * guest R12 saved in shadow HSTATE_SCRATCH0
+ * guest R13 saved in SPRN_SCRATCH0
+ */
+ std r9,HSTATE_SCRATCH2(r13)
+ lbz r9,HSTATE_IN_GUEST(r13)
+ cmpwi r9,KVM_GUEST_MODE_SKIP
+ beq- .Lmaybe_skip
+.Lno_skip:
+#ifdef CONFIG_KVM_BOOK3S_HV_POSSIBLE
+#ifdef CONFIG_KVM_BOOK3S_PR_POSSIBLE
+ cmpwi r9,KVM_GUEST_MODE_GUEST
+ beq kvmppc_interrupt_pr
+#endif
+ b kvmppc_interrupt_hv
+#else
+ b kvmppc_interrupt_pr
+#endif
+
+/*
+ * "Skip" interrupts are part of a trick KVM uses a with hash guests to load
+ * the faulting instruction in guest memory from the the hypervisor without
+ * walking page tables.
+ *
+ * When the guest takes a fault that requires the hypervisor to load the
+ * instruction (e.g., MMIO emulation), KVM is running in real-mode with HV=1
+ * and the guest MMU context loaded. It sets KVM_GUEST_MODE_SKIP, and sets
+ * MSR[DR]=1 while leaving MSR[IR]=0, so it continues to fetch HV instructions
+ * but loads and stores will access the guest context. This is used to load
+ * the faulting instruction using the faulting guest effective address.
+ *
+ * However the guest context may not be able to translate, or it may cause a
+ * machine check or other issue, which results in a fault in the host
+ * (even with KVM-HV).
+ *
+ * These faults come here because KVM_GUEST_MODE_SKIP was set, so if they
+ * are (or are likely) caused by that load, the instruction is skipped by
+ * just returning with the PC advanced +4, where it is noticed the load did
+ * not execute and it goes to the slow path which walks the page tables to
+ * read guest memory.
+ */
+.Lmaybe_skip:
+ cmpwi r12,BOOK3S_INTERRUPT_MACHINE_CHECK
+ beq 1f
+ cmpwi r12,BOOK3S_INTERRUPT_DATA_STORAGE
+ beq 1f
+ cmpwi r12,BOOK3S_INTERRUPT_DATA_SEGMENT
+ beq 1f
+#ifdef CONFIG_KVM_BOOK3S_HV_POSSIBLE
+ /* HSRR interrupts get 2 added to interrupt number */
+ cmpwi r12,BOOK3S_INTERRUPT_H_DATA_STORAGE | 0x2
+ beq 2f
+#endif
+ b .Lno_skip
+1: mfspr r9,SPRN_SRR0
+ addi r9,r9,4
+ mtspr SPRN_SRR0,r9
+ ld r12,HSTATE_SCRATCH0(r13)
+ ld r9,HSTATE_SCRATCH2(r13)
+ GET_SCRATCH0(r13)
+ RFI_TO_KERNEL
+#ifdef CONFIG_KVM_BOOK3S_HV_POSSIBLE
+2: mfspr r9,SPRN_HSRR0
+ addi r9,r9,4
+ mtspr SPRN_HSRR0,r9
+ ld r12,HSTATE_SCRATCH0(r13)
+ ld r9,HSTATE_SCRATCH2(r13)
+ GET_SCRATCH0(r13)
+ HRFI_TO_KERNEL
+#endif
+
+#ifdef CONFIG_KVM_BOOK3S_HV_POSSIBLE
+
+/* Stack frame offsets for kvmppc_p9_enter_guest */
+#define SFS (144 + STACK_FRAME_MIN_SIZE)
+#define STACK_SLOT_NVGPRS (SFS - 144) /* 18 gprs */
+
+/*
+ * void kvmppc_p9_enter_guest(struct vcpu *vcpu);
+ *
+ * Enter the guest on a ISAv3.0 or later system.
+ */
+.balign IFETCH_ALIGN_BYTES
+_GLOBAL(kvmppc_p9_enter_guest)
+EXPORT_SYMBOL_GPL(kvmppc_p9_enter_guest)
+ mflr r0
+ std r0,PPC_LR_STKOFF(r1)
+ stdu r1,-SFS(r1)
+
+ std r1,HSTATE_HOST_R1(r13)
+
+ mfcr r4
+ stw r4,SFS+8(r1)
+
+ reg = 14
+ .rept 18
+ std reg,STACK_SLOT_NVGPRS + ((reg - 14) * 8)(r1)
+ reg = reg + 1
+ .endr
+
+ ld r4,VCPU_LR(r3)
+ mtlr r4
+ ld r4,VCPU_CTR(r3)
+ mtctr r4
+ ld r4,VCPU_XER(r3)
+ mtspr SPRN_XER,r4
+
+ ld r1,VCPU_CR(r3)
+
+BEGIN_FTR_SECTION
+ ld r4,VCPU_CFAR(r3)
+ mtspr SPRN_CFAR,r4
+END_FTR_SECTION_IFSET(CPU_FTR_CFAR)
+BEGIN_FTR_SECTION
+ ld r4,VCPU_PPR(r3)
+ mtspr SPRN_PPR,r4
+END_FTR_SECTION_IFSET(CPU_FTR_HAS_PPR)
+
+ reg = 4
+ .rept 28
+ ld reg,__VCPU_GPR(reg)(r3)
+ reg = reg + 1
+ .endr
+
+ ld r4,VCPU_KVM(r3)
+ lbz r4,KVM_SECURE_GUEST(r4)
+ cmpdi r4,0
+ ld r4,VCPU_GPR(R4)(r3)
+ bne .Lret_to_ultra
+
+ mtcr r1
+
+ ld r0,VCPU_GPR(R0)(r3)
+ ld r1,VCPU_GPR(R1)(r3)
+ ld r2,VCPU_GPR(R2)(r3)
+ ld r3,VCPU_GPR(R3)(r3)
+
+ HRFI_TO_GUEST
+ b .
+
+ /*
+ * Use UV_RETURN ultracall to return control back to the Ultravisor
+ * after processing an hypercall or interrupt that was forwarded
+ * (a.k.a. reflected) to the Hypervisor.
+ *
+ * All registers have already been reloaded except the ucall requires:
+ * R0 = hcall result
+ * R2 = SRR1, so UV can detect a synthesized interrupt (if any)
+ * R3 = UV_RETURN
+ */
+.Lret_to_ultra:
+ mtcr r1
+ ld r1,VCPU_GPR(R1)(r3)
+
+ ld r0,VCPU_GPR(R3)(r3)
+ mfspr r2,SPRN_SRR1
+ LOAD_REG_IMMEDIATE(r3, UV_RETURN)
+ sc 2
+
+/*
+ * kvmppc_p9_exit_hcall and kvmppc_p9_exit_interrupt are branched to from
+ * above if the interrupt was taken for a guest that was entered via
+ * kvmppc_p9_enter_guest().
+ *
+ * The exit code recovers the host stack and vcpu pointer, saves all guest GPRs
+ * and CR, LR, XER as well as guest MSR and NIA into the VCPU, then re-
+ * establishes the host stack and registers to return from the
+ * kvmppc_p9_enter_guest() function, which saves CTR and other guest registers
+ * (SPRs and FP, VEC, etc).
+ */
+.balign IFETCH_ALIGN_BYTES
+kvmppc_p9_exit_hcall:
+ mfspr r11,SPRN_SRR0
+ mfspr r12,SPRN_SRR1
+ li r10,0xc00
+ std r10,HSTATE_SCRATCH0(r13)
+
+.balign IFETCH_ALIGN_BYTES
+kvmppc_p9_exit_interrupt:
+ /*
+ * If set to KVM_GUEST_MODE_HV_P9 but we're still in the
+ * hypervisor, that means we can't return from the entry stack.
+ */
+ rldicl. r10,r12,64-MSR_HV_LG,63
+ bne- kvmppc_p9_bad_interrupt
+
+ std r1,HSTATE_SCRATCH1(r13)
+ std r3,HSTATE_SCRATCH2(r13)
+ ld r1,HSTATE_HOST_R1(r13)
+ ld r3,HSTATE_KVM_VCPU(r13)
+
+ std r9,VCPU_CR(r3)
+
+1:
+ std r11,VCPU_PC(r3)
+ std r12,VCPU_MSR(r3)
+
+ reg = 14
+ .rept 18
+ std reg,__VCPU_GPR(reg)(r3)
+ reg = reg + 1
+ .endr
+
+ /* r1, r3, r9-r13 are saved to vcpu by C code */
+ std r0,VCPU_GPR(R0)(r3)
+ std r2,VCPU_GPR(R2)(r3)
+ reg = 4
+ .rept 5
+ std reg,__VCPU_GPR(reg)(r3)
+ reg = reg + 1
+ .endr
+
+ ld r2,PACATOC(r13)
+
+ mflr r4
+ std r4,VCPU_LR(r3)
+ mfspr r4,SPRN_XER
+ std r4,VCPU_XER(r3)
+
+ reg = 14
+ .rept 18
+ ld reg,STACK_SLOT_NVGPRS + ((reg - 14) * 8)(r1)
+ reg = reg + 1
+ .endr
+
+ lwz r4,SFS+8(r1)
+ mtcr r4
+
+ /*
+ * Flush the link stack here, before executing the first blr on the
+ * way out of the guest.
+ *
+ * The link stack won't match coming out of the guest anyway so the
+ * only cost is the flush itself. The call clobbers r0.
+ */
+1: nop
+ patch_site 1b patch__call_kvm_flush_link_stack_p9
+
+ addi r1,r1,SFS
+ ld r0,PPC_LR_STKOFF(r1)
+ mtlr r0
+ blr
+
+/*
+ * Took an interrupt somewhere right before HRFID to guest, so registers are
+ * in a bad way. Return things hopefully enough to run host virtual code and
+ * run the Linux interrupt handler (SRESET or MCE) to print something useful.
+ *
+ * We could be really clever and save all host registers in known locations
+ * before setting HSTATE_IN_GUEST, then restoring them all here, and setting
+ * return address to a fixup that sets them up again. But that's a lot of
+ * effort for a small bit of code. Lots of other things to do first.
+ */
+kvmppc_p9_bad_interrupt:
+BEGIN_MMU_FTR_SECTION
+ /*
+ * Hash host doesn't try to recover MMU (requires host SLB reload)
+ */
+ b .
+END_MMU_FTR_SECTION_IFCLR(MMU_FTR_TYPE_RADIX)
+ /*
+ * Clean up guest registers to give host a chance to run.
+ */
+ li r10,0
+ mtspr SPRN_AMR,r10
+ mtspr SPRN_IAMR,r10
+ mtspr SPRN_CIABR,r10
+ mtspr SPRN_DAWRX0,r10
+BEGIN_FTR_SECTION
+ mtspr SPRN_DAWRX1,r10
+END_FTR_SECTION_IFSET(CPU_FTR_DAWR1)
+ mtspr SPRN_PID,r10
+
+ /*
+ * Switch to host MMU mode
+ */
+ ld r10, HSTATE_KVM_VCPU(r13)
+ ld r10, VCPU_KVM(r10)
+ lwz r10, KVM_HOST_LPID(r10)
+ mtspr SPRN_LPID,r10
+
+ ld r10, HSTATE_KVM_VCPU(r13)
+ ld r10, VCPU_KVM(r10)
+ ld r10, KVM_HOST_LPCR(r10)
+ mtspr SPRN_LPCR,r10
+
+ /*
+ * Set GUEST_MODE_NONE so the handler won't branch to KVM, and clear
+ * MSR_RI in r12 ([H]SRR1) so the handler won't try to return.
+ */
+ li r10,KVM_GUEST_MODE_NONE
+ stb r10,HSTATE_IN_GUEST(r13)
+ li r10,MSR_RI
+ andc r12,r12,r10
+
+ /*
+ * Go back to interrupt handler. MCE and SRESET have their specific
+ * PACA save area so they should be used directly. They set up their
+ * own stack. The other handlers all use EXGEN. They will use the
+ * guest r1 if it looks like a kernel stack, so just load the
+ * emergency stack and go to program check for all other interrupts.
+ */
+ ld r10,HSTATE_SCRATCH0(r13)
+ cmpwi r10,BOOK3S_INTERRUPT_MACHINE_CHECK
+ beq machine_check_common
+
+ cmpwi r10,BOOK3S_INTERRUPT_SYSTEM_RESET
+ beq system_reset_common
+
+ b .
+#endif
diff --git a/arch/powerpc/kvm/book3s_64_mmu_radix.c b/arch/powerpc/kvm/book3s_64_mmu_radix.c
index d909c069363e..b5905ae4377c 100644
--- a/arch/powerpc/kvm/book3s_64_mmu_radix.c
+++ b/arch/powerpc/kvm/book3s_64_mmu_radix.c
@@ -21,6 +21,7 @@
#include <asm/pte-walk.h>
#include <asm/ultravisor.h>
#include <asm/kvm_book3s_uvmem.h>
+#include <asm/plpar_wrappers.h>
/*
* Supported radix tree geometry.
@@ -318,9 +319,19 @@ void kvmppc_radix_tlbie_page(struct kvm *kvm, unsigned long addr,
}
psi = shift_to_mmu_psize(pshift);
- rb = addr | (mmu_get_ap(psi) << PPC_BITLSHIFT(58));
- rc = plpar_hcall_norets(H_TLB_INVALIDATE, H_TLBIE_P1_ENC(0, 0, 1),
- lpid, rb);
+
+ if (!firmware_has_feature(FW_FEATURE_RPT_INVALIDATE)) {
+ rb = addr | (mmu_get_ap(psi) << PPC_BITLSHIFT(58));
+ rc = plpar_hcall_norets(H_TLB_INVALIDATE, H_TLBIE_P1_ENC(0, 0, 1),
+ lpid, rb);
+ } else {
+ rc = pseries_rpt_invalidate(lpid, H_RPTI_TARGET_CMMU,
+ H_RPTI_TYPE_NESTED |
+ H_RPTI_TYPE_TLB,
+ psize_to_rpti_pgsize(psi),
+ addr, addr + psize);
+ }
+
if (rc)
pr_err("KVM: TLB page invalidation hcall failed, rc=%ld\n", rc);
}
@@ -334,8 +345,14 @@ static void kvmppc_radix_flush_pwc(struct kvm *kvm, unsigned int lpid)
return;
}
- rc = plpar_hcall_norets(H_TLB_INVALIDATE, H_TLBIE_P1_ENC(1, 0, 1),
- lpid, TLBIEL_INVAL_SET_LPID);
+ if (!firmware_has_feature(FW_FEATURE_RPT_INVALIDATE))
+ rc = plpar_hcall_norets(H_TLB_INVALIDATE, H_TLBIE_P1_ENC(1, 0, 1),
+ lpid, TLBIEL_INVAL_SET_LPID);
+ else
+ rc = pseries_rpt_invalidate(lpid, H_RPTI_TARGET_CMMU,
+ H_RPTI_TYPE_NESTED |
+ H_RPTI_TYPE_PWC, H_RPTI_PAGE_ALL,
+ 0, -1UL);
if (rc)
pr_err("KVM: TLB PWC invalidation hcall failed, rc=%ld\n", rc);
}
diff --git a/arch/powerpc/kvm/book3s_64_vio_hv.c b/arch/powerpc/kvm/book3s_64_vio_hv.c
index 083a4e037718..dc6591548f0c 100644
--- a/arch/powerpc/kvm/book3s_64_vio_hv.c
+++ b/arch/powerpc/kvm/book3s_64_vio_hv.c
@@ -391,10 +391,6 @@ long kvmppc_rm_h_put_tce(struct kvm_vcpu *vcpu, unsigned long liobn,
/* udbg_printf("H_PUT_TCE(): liobn=0x%lx ioba=0x%lx, tce=0x%lx\n", */
/* liobn, ioba, tce); */
- /* For radix, we might be in virtual mode, so punt */
- if (kvm_is_radix(vcpu->kvm))
- return H_TOO_HARD;
-
stt = kvmppc_find_table(vcpu->kvm, liobn);
if (!stt)
return H_TOO_HARD;
@@ -489,10 +485,6 @@ long kvmppc_rm_h_put_tce_indirect(struct kvm_vcpu *vcpu,
bool prereg = false;
struct kvmppc_spapr_tce_iommu_table *stit;
- /* For radix, we might be in virtual mode, so punt */
- if (kvm_is_radix(vcpu->kvm))
- return H_TOO_HARD;
-
/*
* used to check for invalidations in progress
*/
@@ -602,10 +594,6 @@ long kvmppc_rm_h_stuff_tce(struct kvm_vcpu *vcpu,
long i, ret;
struct kvmppc_spapr_tce_iommu_table *stit;
- /* For radix, we might be in virtual mode, so punt */
- if (kvm_is_radix(vcpu->kvm))
- return H_TOO_HARD;
-
stt = kvmppc_find_table(vcpu->kvm, liobn);
if (!stt)
return H_TOO_HARD;
diff --git a/arch/powerpc/kvm/book3s_hv.c b/arch/powerpc/kvm/book3s_hv.c
index bc0813644666..cd544a46183e 100644
--- a/arch/powerpc/kvm/book3s_hv.c
+++ b/arch/powerpc/kvm/book3s_hv.c
@@ -76,6 +76,7 @@
#include <asm/kvm_book3s_uvmem.h>
#include <asm/ultravisor.h>
#include <asm/dtl.h>
+#include <asm/plpar_wrappers.h>
#include "book3s.h"
@@ -103,13 +104,9 @@ static int target_smt_mode;
module_param(target_smt_mode, int, 0644);
MODULE_PARM_DESC(target_smt_mode, "Target threads per core (0 = max)");
-static bool indep_threads_mode = true;
-module_param(indep_threads_mode, bool, S_IRUGO | S_IWUSR);
-MODULE_PARM_DESC(indep_threads_mode, "Independent-threads mode (only on POWER9)");
-
static bool one_vm_per_core;
module_param(one_vm_per_core, bool, S_IRUGO | S_IWUSR);
-MODULE_PARM_DESC(one_vm_per_core, "Only run vCPUs from the same VM on a core (requires indep_threads_mode=N)");
+MODULE_PARM_DESC(one_vm_per_core, "Only run vCPUs from the same VM on a core (requires POWER8 or older)");
#ifdef CONFIG_KVM_XICS
static const struct kernel_param_ops module_param_ops = {
@@ -134,9 +131,6 @@ static inline bool nesting_enabled(struct kvm *kvm)
return kvm->arch.nested_enable && kvm_is_radix(kvm);
}
-/* If set, the threads on each CPU core have to be in the same MMU mode */
-static bool no_mixing_hpt_and_radix __read_mostly;
-
static int kvmppc_hv_setup_htab_rma(struct kvm_vcpu *vcpu);
/*
@@ -236,7 +230,7 @@ static void kvmppc_fast_vcpu_kick_hv(struct kvm_vcpu *vcpu)
waitp = kvm_arch_vcpu_get_wait(vcpu);
if (rcuwait_wake_up(waitp))
- ++vcpu->stat.halt_wakeup;
+ ++vcpu->stat.generic.halt_wakeup;
cpu = READ_ONCE(vcpu->arch.thread_cpu);
if (cpu >= 0 && kvmppc_ipi_thread(cpu))
@@ -807,7 +801,8 @@ static int kvmppc_h_set_mode(struct kvm_vcpu *vcpu, unsigned long mflags,
* KVM does not support mflags=2 (AIL=2) and AIL=1 is reserved.
* Keep this in synch with kvmppc_filter_guest_lpcr_hv.
*/
- if (mflags != 0 && mflags != 3)
+ if (cpu_has_feature(CPU_FTR_P9_RADIX_PREFETCH_BUG) &&
+ kvmhv_vcpu_is_radix(vcpu) && mflags == 3)
return H_UNSUPPORTED_FLAG_START;
return H_TOO_HARD;
default:
@@ -899,6 +894,10 @@ static int kvm_arch_vcpu_yield_to(struct kvm_vcpu *target)
* H_SUCCESS if the source vcore wasn't idle (e.g. if it may
* have useful work to do and should not confer) so we don't
* recheck that here.
+ *
+ * In the case of the P9 single vcpu per vcore case, the real
+ * mode handler is not called but no other threads are in the
+ * source vcore.
*/
spin_lock(&vcore->lock);
@@ -924,8 +923,71 @@ static int kvmppc_get_yield_count(struct kvm_vcpu *vcpu)
return yield_count;
}
+/*
+ * H_RPT_INVALIDATE hcall handler for nested guests.
+ *
+ * Handles only nested process-scoped invalidation requests in L0.
+ */
+static int kvmppc_nested_h_rpt_invalidate(struct kvm_vcpu *vcpu)
+{
+ unsigned long type = kvmppc_get_gpr(vcpu, 6);
+ unsigned long pid, pg_sizes, start, end;
+
+ /*
+ * The partition-scoped invalidations aren't handled here in L0.
+ */
+ if (type & H_RPTI_TYPE_NESTED)
+ return RESUME_HOST;
+
+ pid = kvmppc_get_gpr(vcpu, 4);
+ pg_sizes = kvmppc_get_gpr(vcpu, 7);
+ start = kvmppc_get_gpr(vcpu, 8);
+ end = kvmppc_get_gpr(vcpu, 9);
+
+ do_h_rpt_invalidate_prt(pid, vcpu->arch.nested->shadow_lpid,
+ type, pg_sizes, start, end);
+
+ kvmppc_set_gpr(vcpu, 3, H_SUCCESS);
+ return RESUME_GUEST;
+}
+
+static long kvmppc_h_rpt_invalidate(struct kvm_vcpu *vcpu,
+ unsigned long id, unsigned long target,
+ unsigned long type, unsigned long pg_sizes,
+ unsigned long start, unsigned long end)
+{
+ if (!kvm_is_radix(vcpu->kvm))
+ return H_UNSUPPORTED;
+
+ if (end < start)
+ return H_P5;
+
+ /*
+ * Partition-scoped invalidation for nested guests.
+ */
+ if (type & H_RPTI_TYPE_NESTED) {
+ if (!nesting_enabled(vcpu->kvm))
+ return H_FUNCTION;
+
+ /* Support only cores as target */
+ if (target != H_RPTI_TARGET_CMMU)
+ return H_P2;
+
+ return do_h_rpt_invalidate_pat(vcpu, id, type, pg_sizes,
+ start, end);
+ }
+
+ /*
+ * Process-scoped invalidation for L1 guests.
+ */
+ do_h_rpt_invalidate_prt(id, vcpu->kvm->arch.lpid,
+ type, pg_sizes, start, end);
+ return H_SUCCESS;
+}
+
int kvmppc_pseries_do_hcall(struct kvm_vcpu *vcpu)
{
+ struct kvm *kvm = vcpu->kvm;
unsigned long req = kvmppc_get_gpr(vcpu, 3);
unsigned long target, ret = H_SUCCESS;
int yield_count;
@@ -937,11 +999,57 @@ int kvmppc_pseries_do_hcall(struct kvm_vcpu *vcpu)
return RESUME_HOST;
switch (req) {
+ case H_REMOVE:
+ ret = kvmppc_h_remove(vcpu, kvmppc_get_gpr(vcpu, 4),
+ kvmppc_get_gpr(vcpu, 5),
+ kvmppc_get_gpr(vcpu, 6));
+ if (ret == H_TOO_HARD)
+ return RESUME_HOST;
+ break;
+ case H_ENTER:
+ ret = kvmppc_h_enter(vcpu, kvmppc_get_gpr(vcpu, 4),
+ kvmppc_get_gpr(vcpu, 5),
+ kvmppc_get_gpr(vcpu, 6),
+ kvmppc_get_gpr(vcpu, 7));
+ if (ret == H_TOO_HARD)
+ return RESUME_HOST;
+ break;
+ case H_READ:
+ ret = kvmppc_h_read(vcpu, kvmppc_get_gpr(vcpu, 4),
+ kvmppc_get_gpr(vcpu, 5));
+ if (ret == H_TOO_HARD)
+ return RESUME_HOST;
+ break;
+ case H_CLEAR_MOD:
+ ret = kvmppc_h_clear_mod(vcpu, kvmppc_get_gpr(vcpu, 4),
+ kvmppc_get_gpr(vcpu, 5));
+ if (ret == H_TOO_HARD)
+ return RESUME_HOST;
+ break;
+ case H_CLEAR_REF:
+ ret = kvmppc_h_clear_ref(vcpu, kvmppc_get_gpr(vcpu, 4),
+ kvmppc_get_gpr(vcpu, 5));
+ if (ret == H_TOO_HARD)
+ return RESUME_HOST;
+ break;
+ case H_PROTECT:
+ ret = kvmppc_h_protect(vcpu, kvmppc_get_gpr(vcpu, 4),
+ kvmppc_get_gpr(vcpu, 5),
+ kvmppc_get_gpr(vcpu, 6));
+ if (ret == H_TOO_HARD)
+ return RESUME_HOST;
+ break;
+ case H_BULK_REMOVE:
+ ret = kvmppc_h_bulk_remove(vcpu);
+ if (ret == H_TOO_HARD)
+ return RESUME_HOST;
+ break;
+
case H_CEDE:
break;
case H_PROD:
target = kvmppc_get_gpr(vcpu, 4);
- tvcpu = kvmppc_find_vcpu(vcpu->kvm, target);
+ tvcpu = kvmppc_find_vcpu(kvm, target);
if (!tvcpu) {
ret = H_PARAMETER;
break;
@@ -955,7 +1063,7 @@ int kvmppc_pseries_do_hcall(struct kvm_vcpu *vcpu)
target = kvmppc_get_gpr(vcpu, 4);
if (target == -1)
break;
- tvcpu = kvmppc_find_vcpu(vcpu->kvm, target);
+ tvcpu = kvmppc_find_vcpu(kvm, target);
if (!tvcpu) {
ret = H_PARAMETER;
break;
@@ -971,12 +1079,12 @@ int kvmppc_pseries_do_hcall(struct kvm_vcpu *vcpu)
kvmppc_get_gpr(vcpu, 6));
break;
case H_RTAS:
- if (list_empty(&vcpu->kvm->arch.rtas_tokens))
+ if (list_empty(&kvm->arch.rtas_tokens))
return RESUME_HOST;
- idx = srcu_read_lock(&vcpu->kvm->srcu);
+ idx = srcu_read_lock(&kvm->srcu);
rc = kvmppc_rtas_hcall(vcpu);
- srcu_read_unlock(&vcpu->kvm->srcu, idx);
+ srcu_read_unlock(&kvm->srcu, idx);
if (rc == -ENOENT)
return RESUME_HOST;
@@ -1060,15 +1168,23 @@ int kvmppc_pseries_do_hcall(struct kvm_vcpu *vcpu)
if (!powernv_get_random_long(&vcpu->arch.regs.gpr[4]))
ret = H_HARDWARE;
break;
+ case H_RPT_INVALIDATE:
+ ret = kvmppc_h_rpt_invalidate(vcpu, kvmppc_get_gpr(vcpu, 4),
+ kvmppc_get_gpr(vcpu, 5),
+ kvmppc_get_gpr(vcpu, 6),
+ kvmppc_get_gpr(vcpu, 7),
+ kvmppc_get_gpr(vcpu, 8),
+ kvmppc_get_gpr(vcpu, 9));
+ break;
case H_SET_PARTITION_TABLE:
ret = H_FUNCTION;
- if (nesting_enabled(vcpu->kvm))
+ if (nesting_enabled(kvm))
ret = kvmhv_set_partition_table(vcpu);
break;
case H_ENTER_NESTED:
ret = H_FUNCTION;
- if (!nesting_enabled(vcpu->kvm))
+ if (!nesting_enabled(kvm))
break;
ret = kvmhv_enter_nested_guest(vcpu);
if (ret == H_INTERRUPT) {
@@ -1083,12 +1199,12 @@ int kvmppc_pseries_do_hcall(struct kvm_vcpu *vcpu)
break;
case H_TLB_INVALIDATE:
ret = H_FUNCTION;
- if (nesting_enabled(vcpu->kvm))
+ if (nesting_enabled(kvm))
ret = kvmhv_do_nested_tlbie(vcpu);
break;
case H_COPY_TOFROM_GUEST:
ret = H_FUNCTION;
- if (nesting_enabled(vcpu->kvm))
+ if (nesting_enabled(kvm))
ret = kvmhv_copy_tofrom_guest_nested(vcpu);
break;
case H_PAGE_INIT:
@@ -1099,7 +1215,7 @@ int kvmppc_pseries_do_hcall(struct kvm_vcpu *vcpu)
case H_SVM_PAGE_IN:
ret = H_UNSUPPORTED;
if (kvmppc_get_srr1(vcpu) & MSR_S)
- ret = kvmppc_h_svm_page_in(vcpu->kvm,
+ ret = kvmppc_h_svm_page_in(kvm,
kvmppc_get_gpr(vcpu, 4),
kvmppc_get_gpr(vcpu, 5),
kvmppc_get_gpr(vcpu, 6));
@@ -1107,7 +1223,7 @@ int kvmppc_pseries_do_hcall(struct kvm_vcpu *vcpu)
case H_SVM_PAGE_OUT:
ret = H_UNSUPPORTED;
if (kvmppc_get_srr1(vcpu) & MSR_S)
- ret = kvmppc_h_svm_page_out(vcpu->kvm,
+ ret = kvmppc_h_svm_page_out(kvm,
kvmppc_get_gpr(vcpu, 4),
kvmppc_get_gpr(vcpu, 5),
kvmppc_get_gpr(vcpu, 6));
@@ -1115,12 +1231,12 @@ int kvmppc_pseries_do_hcall(struct kvm_vcpu *vcpu)
case H_SVM_INIT_START:
ret = H_UNSUPPORTED;
if (kvmppc_get_srr1(vcpu) & MSR_S)
- ret = kvmppc_h_svm_init_start(vcpu->kvm);
+ ret = kvmppc_h_svm_init_start(kvm);
break;
case H_SVM_INIT_DONE:
ret = H_UNSUPPORTED;
if (kvmppc_get_srr1(vcpu) & MSR_S)
- ret = kvmppc_h_svm_init_done(vcpu->kvm);
+ ret = kvmppc_h_svm_init_done(kvm);
break;
case H_SVM_INIT_ABORT:
/*
@@ -1130,24 +1246,26 @@ int kvmppc_pseries_do_hcall(struct kvm_vcpu *vcpu)
* Instead the kvm->arch.secure_guest flag is checked inside
* kvmppc_h_svm_init_abort().
*/
- ret = kvmppc_h_svm_init_abort(vcpu->kvm);
+ ret = kvmppc_h_svm_init_abort(kvm);
break;
default:
return RESUME_HOST;
}
+ WARN_ON_ONCE(ret == H_TOO_HARD);
kvmppc_set_gpr(vcpu, 3, ret);
vcpu->arch.hcall_needed = 0;
return RESUME_GUEST;
}
/*
- * Handle H_CEDE in the nested virtualization case where we haven't
- * called the real-mode hcall handlers in book3s_hv_rmhandlers.S.
+ * Handle H_CEDE in the P9 path where we don't call the real-mode hcall
+ * handlers in book3s_hv_rmhandlers.S.
+ *
* This has to be done early, not in kvmppc_pseries_do_hcall(), so
* that the cede logic in kvmppc_run_single_vcpu() works properly.
*/
-static void kvmppc_nested_cede(struct kvm_vcpu *vcpu)
+static void kvmppc_cede(struct kvm_vcpu *vcpu)
{
vcpu->arch.shregs.msr |= MSR_EE;
vcpu->arch.ceded = 1;
@@ -1178,6 +1296,7 @@ static int kvmppc_hcall_impl_hv(unsigned long cmd)
case H_XIRR_X:
#endif
case H_PAGE_INIT:
+ case H_RPT_INVALIDATE:
return 1;
}
@@ -1400,13 +1519,39 @@ static int kvmppc_handle_exit_hv(struct kvm_vcpu *vcpu,
}
case BOOK3S_INTERRUPT_SYSCALL:
{
- /* hcall - punt to userspace */
int i;
- /* hypercall with MSR_PR has already been handled in rmode,
- * and never reaches here.
- */
+ if (unlikely(vcpu->arch.shregs.msr & MSR_PR)) {
+ /*
+ * Guest userspace executed sc 1. This can only be
+ * reached by the P9 path because the old path
+ * handles this case in realmode hcall handlers.
+ */
+ if (!kvmhv_vcpu_is_radix(vcpu)) {
+ /*
+ * A guest could be running PR KVM, so this
+ * may be a PR KVM hcall. It must be reflected
+ * to the guest kernel as a sc interrupt.
+ */
+ kvmppc_core_queue_syscall(vcpu);
+ } else {
+ /*
+ * Radix guests can not run PR KVM or nested HV
+ * hash guests which might run PR KVM, so this
+ * is always a privilege fault. Send a program
+ * check to guest kernel.
+ */
+ kvmppc_core_queue_program(vcpu, SRR1_PROGPRIV);
+ }
+ r = RESUME_GUEST;
+ break;
+ }
+ /*
+ * hcall - gather args and set exit_reason. This will next be
+ * handled by kvmppc_pseries_do_hcall which may be able to deal
+ * with it and resume guest, or may punt to userspace.
+ */
run->papr_hcall.nr = kvmppc_get_gpr(vcpu, 3);
for (i = 0; i < 9; ++i)
run->papr_hcall.args[i] = kvmppc_get_gpr(vcpu, 4 + i);
@@ -1419,20 +1564,102 @@ static int kvmppc_handle_exit_hv(struct kvm_vcpu *vcpu,
* We get these next two if the guest accesses a page which it thinks
* it has mapped but which is not actually present, either because
* it is for an emulated I/O device or because the corresonding
- * host page has been paged out. Any other HDSI/HISI interrupts
- * have been handled already.
+ * host page has been paged out.
+ *
+ * Any other HDSI/HISI interrupts have been handled already for P7/8
+ * guests. For POWER9 hash guests not using rmhandlers, basic hash
+ * fault handling is done here.
*/
- case BOOK3S_INTERRUPT_H_DATA_STORAGE:
- r = RESUME_PAGE_FAULT;
+ case BOOK3S_INTERRUPT_H_DATA_STORAGE: {
+ unsigned long vsid;
+ long err;
+
+ if (vcpu->arch.fault_dsisr == HDSISR_CANARY) {
+ r = RESUME_GUEST; /* Just retry if it's the canary */
+ break;
+ }
+
+ if (kvm_is_radix(vcpu->kvm) || !cpu_has_feature(CPU_FTR_ARCH_300)) {
+ /*
+ * Radix doesn't require anything, and pre-ISAv3.0 hash
+ * already attempted to handle this in rmhandlers. The
+ * hash fault handling below is v3 only (it uses ASDR
+ * via fault_gpa).
+ */
+ r = RESUME_PAGE_FAULT;
+ break;
+ }
+
+ if (!(vcpu->arch.fault_dsisr & (DSISR_NOHPTE | DSISR_PROTFAULT))) {
+ kvmppc_core_queue_data_storage(vcpu,
+ vcpu->arch.fault_dar, vcpu->arch.fault_dsisr);
+ r = RESUME_GUEST;
+ break;
+ }
+
+ if (!(vcpu->arch.shregs.msr & MSR_DR))
+ vsid = vcpu->kvm->arch.vrma_slb_v;
+ else
+ vsid = vcpu->arch.fault_gpa;
+
+ err = kvmppc_hpte_hv_fault(vcpu, vcpu->arch.fault_dar,
+ vsid, vcpu->arch.fault_dsisr, true);
+ if (err == 0) {
+ r = RESUME_GUEST;
+ } else if (err == -1 || err == -2) {
+ r = RESUME_PAGE_FAULT;
+ } else {
+ kvmppc_core_queue_data_storage(vcpu,
+ vcpu->arch.fault_dar, err);
+ r = RESUME_GUEST;
+ }
break;
- case BOOK3S_INTERRUPT_H_INST_STORAGE:
+ }
+ case BOOK3S_INTERRUPT_H_INST_STORAGE: {
+ unsigned long vsid;
+ long err;
+
vcpu->arch.fault_dar = kvmppc_get_pc(vcpu);
vcpu->arch.fault_dsisr = vcpu->arch.shregs.msr &
DSISR_SRR1_MATCH_64S;
- if (vcpu->arch.shregs.msr & HSRR1_HISI_WRITE)
- vcpu->arch.fault_dsisr |= DSISR_ISSTORE;
- r = RESUME_PAGE_FAULT;
+ if (kvm_is_radix(vcpu->kvm) || !cpu_has_feature(CPU_FTR_ARCH_300)) {
+ /*
+ * Radix doesn't require anything, and pre-ISAv3.0 hash
+ * already attempted to handle this in rmhandlers. The
+ * hash fault handling below is v3 only (it uses ASDR
+ * via fault_gpa).
+ */
+ if (vcpu->arch.shregs.msr & HSRR1_HISI_WRITE)
+ vcpu->arch.fault_dsisr |= DSISR_ISSTORE;
+ r = RESUME_PAGE_FAULT;
+ break;
+ }
+
+ if (!(vcpu->arch.fault_dsisr & SRR1_ISI_NOPT)) {
+ kvmppc_core_queue_inst_storage(vcpu,
+ vcpu->arch.fault_dsisr);
+ r = RESUME_GUEST;
+ break;
+ }
+
+ if (!(vcpu->arch.shregs.msr & MSR_IR))
+ vsid = vcpu->kvm->arch.vrma_slb_v;
+ else
+ vsid = vcpu->arch.fault_gpa;
+
+ err = kvmppc_hpte_hv_fault(vcpu, vcpu->arch.fault_dar,
+ vsid, vcpu->arch.fault_dsisr, false);
+ if (err == 0) {
+ r = RESUME_GUEST;
+ } else if (err == -1) {
+ r = RESUME_PAGE_FAULT;
+ } else {
+ kvmppc_core_queue_inst_storage(vcpu, err);
+ r = RESUME_GUEST;
+ }
break;
+ }
+
/*
* This occurs if the guest executes an illegal instruction.
* If the guest debug is disabled, generate a program interrupt
@@ -1593,6 +1820,23 @@ static int kvmppc_handle_nested_exit(struct kvm_vcpu *vcpu)
if (!xics_on_xive())
kvmppc_xics_rm_complete(vcpu, 0);
break;
+ case BOOK3S_INTERRUPT_SYSCALL:
+ {
+ unsigned long req = kvmppc_get_gpr(vcpu, 3);
+
+ /*
+ * The H_RPT_INVALIDATE hcalls issued by nested
+ * guests for process-scoped invalidations when
+ * GTSE=0, are handled here in L0.
+ */
+ if (req == H_RPT_INVALIDATE) {
+ r = kvmppc_nested_h_rpt_invalidate(vcpu);
+ break;
+ }
+
+ r = RESUME_HOST;
+ break;
+ }
default:
r = RESUME_HOST;
break;
@@ -1654,6 +1898,14 @@ unsigned long kvmppc_filter_lpcr_hv(struct kvm *kvm, unsigned long lpcr)
lpcr &= ~LPCR_AIL;
if ((lpcr & LPCR_AIL) != LPCR_AIL_3)
lpcr &= ~LPCR_AIL; /* LPCR[AIL]=1/2 is disallowed */
+ /*
+ * On some POWER9s we force AIL off for radix guests to prevent
+ * executing in MSR[HV]=1 mode with the MMU enabled and PIDR set to
+ * guest, which can result in Q0 translations with LPID=0 PID=PIDR to
+ * be cached, which the host TLB management does not expect.
+ */
+ if (kvm_is_radix(kvm) && cpu_has_feature(CPU_FTR_P9_RADIX_PREFETCH_BUG))
+ lpcr &= ~LPCR_AIL;
/*
* On POWER9, allow userspace to enable large decrementer for the
@@ -2233,7 +2485,7 @@ static int kvmppc_set_one_reg_hv(struct kvm_vcpu *vcpu, u64 id,
*/
static int threads_per_vcore(struct kvm *kvm)
{
- if (kvm->arch.threads_indep)
+ if (cpu_has_feature(CPU_FTR_ARCH_300))
return 1;
return threads_per_subcore;
}
@@ -2657,7 +2909,7 @@ static void radix_flush_cpu(struct kvm *kvm, int cpu, struct kvm_vcpu *vcpu)
cpumask_t *cpu_in_guest;
int i;
- cpu = cpu_first_thread_sibling(cpu);
+ cpu = cpu_first_tlb_thread_sibling(cpu);
if (nested) {
cpumask_set_cpu(cpu, &nested->need_tlb_flush);
cpu_in_guest = &nested->cpu_in_guest;
@@ -2671,9 +2923,10 @@ static void radix_flush_cpu(struct kvm *kvm, int cpu, struct kvm_vcpu *vcpu)
* the other side is the first smp_mb() in kvmppc_run_core().
*/
smp_mb();
- for (i = 0; i < threads_per_core; ++i)
- if (cpumask_test_cpu(cpu + i, cpu_in_guest))
- smp_call_function_single(cpu + i, do_nothing, NULL, 1);
+ for (i = cpu; i <= cpu_last_tlb_thread_sibling(cpu);
+ i += cpu_tlb_thread_sibling_step())
+ if (cpumask_test_cpu(i, cpu_in_guest))
+ smp_call_function_single(i, do_nothing, NULL, 1);
}
static void kvmppc_prepare_radix_vcpu(struct kvm_vcpu *vcpu, int pcpu)
@@ -2704,8 +2957,8 @@ static void kvmppc_prepare_radix_vcpu(struct kvm_vcpu *vcpu, int pcpu)
*/
if (prev_cpu != pcpu) {
if (prev_cpu >= 0 &&
- cpu_first_thread_sibling(prev_cpu) !=
- cpu_first_thread_sibling(pcpu))
+ cpu_first_tlb_thread_sibling(prev_cpu) !=
+ cpu_first_tlb_thread_sibling(pcpu))
radix_flush_cpu(kvm, prev_cpu, vcpu);
if (nested)
nested->prev_cpu[vcpu->arch.nested_vcpu_id] = pcpu;
@@ -2967,9 +3220,6 @@ static void prepare_threads(struct kvmppc_vcore *vc)
for_each_runnable_thread(i, vcpu, vc) {
if (signal_pending(vcpu->arch.run_task))
vcpu->arch.ret = -EINTR;
- else if (no_mixing_hpt_and_radix &&
- kvm_is_radix(vc->kvm) != radix_enabled())
- vcpu->arch.ret = -EINVAL;
else if (vcpu->arch.vpa.update_pending ||
vcpu->arch.slb_shadow.update_pending ||
vcpu->arch.dtl.update_pending)
@@ -3176,6 +3426,9 @@ static noinline void kvmppc_run_core(struct kvmppc_vcore *vc)
int trap;
bool is_power8;
+ if (WARN_ON_ONCE(cpu_has_feature(CPU_FTR_ARCH_300)))
+ return;
+
/*
* Remove from the list any threads that have a signal pending
* or need a VPA update done
@@ -3203,9 +3456,6 @@ static noinline void kvmppc_run_core(struct kvmppc_vcore *vc)
* Make sure we are running on primary threads, and that secondary
* threads are offline. Also check if the number of threads in this
* guest are greater than the current system threads per guest.
- * On POWER9, we need to be not in independent-threads mode if
- * this is a HPT guest on a radix host machine where the
- * CPU threads may not be in different MMU modes.
*/
if ((controlled_threads > 1) &&
((vc->num_threads > threads_per_subcore) || !on_primary_thread())) {
@@ -3230,18 +3480,6 @@ static noinline void kvmppc_run_core(struct kvmppc_vcore *vc)
collect_piggybacks(&core_info, target_threads);
/*
- * On radix, arrange for TLB flushing if necessary.
- * This has to be done before disabling interrupts since
- * it uses smp_call_function().
- */
- pcpu = smp_processor_id();
- if (kvm_is_radix(vc->kvm)) {
- for (sub = 0; sub < core_info.n_subcores; ++sub)
- for_each_runnable_thread(i, vcpu, core_info.vc[sub])
- kvmppc_prepare_radix_vcpu(vcpu, pcpu);
- }
-
- /*
* Hard-disable interrupts, and check resched flag and signals.
* If we need to reschedule or deliver a signal, clean up
* and return without going into the guest(s).
@@ -3273,8 +3511,7 @@ static noinline void kvmppc_run_core(struct kvmppc_vcore *vc)
cmd_bit = stat_bit = 0;
split = core_info.n_subcores;
sip = NULL;
- is_power8 = cpu_has_feature(CPU_FTR_ARCH_207S)
- && !cpu_has_feature(CPU_FTR_ARCH_300);
+ is_power8 = cpu_has_feature(CPU_FTR_ARCH_207S);
if (split > 1) {
sip = &split_info;
@@ -3478,184 +3715,113 @@ static noinline void kvmppc_run_core(struct kvmppc_vcore *vc)
trace_kvmppc_run_core(vc, 1);
}
-/*
- * Load up hypervisor-mode registers on P9.
- */
-static int kvmhv_load_hv_regs_and_go(struct kvm_vcpu *vcpu, u64 time_limit,
- unsigned long lpcr)
+static void load_spr_state(struct kvm_vcpu *vcpu)
{
- struct kvmppc_vcore *vc = vcpu->arch.vcore;
- s64 hdec;
- u64 tb, purr, spurr;
- int trap;
- unsigned long host_hfscr = mfspr(SPRN_HFSCR);
- unsigned long host_ciabr = mfspr(SPRN_CIABR);
- unsigned long host_dawr0 = mfspr(SPRN_DAWR0);
- unsigned long host_dawrx0 = mfspr(SPRN_DAWRX0);
- unsigned long host_psscr = mfspr(SPRN_PSSCR);
- unsigned long host_pidr = mfspr(SPRN_PID);
- unsigned long host_dawr1 = 0;
- unsigned long host_dawrx1 = 0;
-
- if (cpu_has_feature(CPU_FTR_DAWR1)) {
- host_dawr1 = mfspr(SPRN_DAWR1);
- host_dawrx1 = mfspr(SPRN_DAWRX1);
- }
+ mtspr(SPRN_DSCR, vcpu->arch.dscr);
+ mtspr(SPRN_IAMR, vcpu->arch.iamr);
+ mtspr(SPRN_PSPB, vcpu->arch.pspb);
+ mtspr(SPRN_FSCR, vcpu->arch.fscr);
+ mtspr(SPRN_TAR, vcpu->arch.tar);
+ mtspr(SPRN_EBBHR, vcpu->arch.ebbhr);
+ mtspr(SPRN_EBBRR, vcpu->arch.ebbrr);
+ mtspr(SPRN_BESCR, vcpu->arch.bescr);
+ mtspr(SPRN_WORT, vcpu->arch.wort);
+ mtspr(SPRN_TIDR, vcpu->arch.tid);
+ mtspr(SPRN_AMR, vcpu->arch.amr);
+ mtspr(SPRN_UAMOR, vcpu->arch.uamor);
/*
- * P8 and P9 suppress the HDEC exception when LPCR[HDICE] = 0,
- * so set HDICE before writing HDEC.
+ * DAR, DSISR, and for nested HV, SPRGs must be set with MSR[RI]
+ * clear (or hstate set appropriately to catch those registers
+ * being clobbered if we take a MCE or SRESET), so those are done
+ * later.
*/
- mtspr(SPRN_LPCR, vcpu->kvm->arch.host_lpcr | LPCR_HDICE);
- isync();
-
- hdec = time_limit - mftb();
- if (hdec < 0) {
- mtspr(SPRN_LPCR, vcpu->kvm->arch.host_lpcr);
- isync();
- return BOOK3S_INTERRUPT_HV_DECREMENTER;
- }
- mtspr(SPRN_HDEC, hdec);
-
- if (vc->tb_offset) {
- u64 new_tb = mftb() + vc->tb_offset;
- mtspr(SPRN_TBU40, new_tb);
- tb = mftb();
- if ((tb & 0xffffff) < (new_tb & 0xffffff))
- mtspr(SPRN_TBU40, new_tb + 0x1000000);
- vc->tb_offset_applied = vc->tb_offset;
- }
-
- if (vc->pcr)
- mtspr(SPRN_PCR, vc->pcr | PCR_MASK);
- mtspr(SPRN_DPDES, vc->dpdes);
- mtspr(SPRN_VTB, vc->vtb);
-
- local_paca->kvm_hstate.host_purr = mfspr(SPRN_PURR);
- local_paca->kvm_hstate.host_spurr = mfspr(SPRN_SPURR);
- mtspr(SPRN_PURR, vcpu->arch.purr);
- mtspr(SPRN_SPURR, vcpu->arch.spurr);
-
- if (dawr_enabled()) {
- mtspr(SPRN_DAWR0, vcpu->arch.dawr0);
- mtspr(SPRN_DAWRX0, vcpu->arch.dawrx0);
- if (cpu_has_feature(CPU_FTR_DAWR1)) {
- mtspr(SPRN_DAWR1, vcpu->arch.dawr1);
- mtspr(SPRN_DAWRX1, vcpu->arch.dawrx1);
- }
- }
- mtspr(SPRN_CIABR, vcpu->arch.ciabr);
- mtspr(SPRN_IC, vcpu->arch.ic);
- mtspr(SPRN_PID, vcpu->arch.pid);
- mtspr(SPRN_PSSCR, vcpu->arch.psscr | PSSCR_EC |
- (local_paca->kvm_hstate.fake_suspend << PSSCR_FAKE_SUSPEND_LG));
-
- mtspr(SPRN_HFSCR, vcpu->arch.hfscr);
-
- mtspr(SPRN_SPRG0, vcpu->arch.shregs.sprg0);
- mtspr(SPRN_SPRG1, vcpu->arch.shregs.sprg1);
- mtspr(SPRN_SPRG2, vcpu->arch.shregs.sprg2);
- mtspr(SPRN_SPRG3, vcpu->arch.shregs.sprg3);
-
- mtspr(SPRN_AMOR, ~0UL);
-
- mtspr(SPRN_LPCR, lpcr);
- isync();
-
- kvmppc_xive_push_vcpu(vcpu);
-
- mtspr(SPRN_SRR0, vcpu->arch.shregs.srr0);
- mtspr(SPRN_SRR1, vcpu->arch.shregs.srr1);
-
- trap = __kvmhv_vcpu_entry_p9(vcpu);
-
- /* Advance host PURR/SPURR by the amount used by guest */
- purr = mfspr(SPRN_PURR);
- spurr = mfspr(SPRN_SPURR);
- mtspr(SPRN_PURR, local_paca->kvm_hstate.host_purr +
- purr - vcpu->arch.purr);
- mtspr(SPRN_SPURR, local_paca->kvm_hstate.host_spurr +
- spurr - vcpu->arch.spurr);
- vcpu->arch.purr = purr;
- vcpu->arch.spurr = spurr;
+ if (!(vcpu->arch.ctrl & 1))
+ mtspr(SPRN_CTRLT, mfspr(SPRN_CTRLF) & ~1);
+}
- vcpu->arch.ic = mfspr(SPRN_IC);
- vcpu->arch.pid = mfspr(SPRN_PID);
- vcpu->arch.psscr = mfspr(SPRN_PSSCR) & PSSCR_GUEST_VIS;
+static void store_spr_state(struct kvm_vcpu *vcpu)
+{
+ vcpu->arch.ctrl = mfspr(SPRN_CTRLF);
- vcpu->arch.shregs.sprg0 = mfspr(SPRN_SPRG0);
- vcpu->arch.shregs.sprg1 = mfspr(SPRN_SPRG1);
- vcpu->arch.shregs.sprg2 = mfspr(SPRN_SPRG2);
- vcpu->arch.shregs.sprg3 = mfspr(SPRN_SPRG3);
+ vcpu->arch.iamr = mfspr(SPRN_IAMR);
+ vcpu->arch.pspb = mfspr(SPRN_PSPB);
+ vcpu->arch.fscr = mfspr(SPRN_FSCR);
+ vcpu->arch.tar = mfspr(SPRN_TAR);
+ vcpu->arch.ebbhr = mfspr(SPRN_EBBHR);
+ vcpu->arch.ebbrr = mfspr(SPRN_EBBRR);
+ vcpu->arch.bescr = mfspr(SPRN_BESCR);
+ vcpu->arch.wort = mfspr(SPRN_WORT);
+ vcpu->arch.tid = mfspr(SPRN_TIDR);
+ vcpu->arch.amr = mfspr(SPRN_AMR);
+ vcpu->arch.uamor = mfspr(SPRN_UAMOR);
+ vcpu->arch.dscr = mfspr(SPRN_DSCR);
+}
- /* Preserve PSSCR[FAKE_SUSPEND] until we've called kvmppc_save_tm_hv */
- mtspr(SPRN_PSSCR, host_psscr |
- (local_paca->kvm_hstate.fake_suspend << PSSCR_FAKE_SUSPEND_LG));
- mtspr(SPRN_HFSCR, host_hfscr);
- mtspr(SPRN_CIABR, host_ciabr);
- mtspr(SPRN_DAWR0, host_dawr0);
- mtspr(SPRN_DAWRX0, host_dawrx0);
- if (cpu_has_feature(CPU_FTR_DAWR1)) {
- mtspr(SPRN_DAWR1, host_dawr1);
- mtspr(SPRN_DAWRX1, host_dawrx1);
- }
- mtspr(SPRN_PID, host_pidr);
+/*
+ * Privileged (non-hypervisor) host registers to save.
+ */
+struct p9_host_os_sprs {
+ unsigned long dscr;
+ unsigned long tidr;
+ unsigned long iamr;
+ unsigned long amr;
+ unsigned long fscr;
+};
- /*
- * Since this is radix, do a eieio; tlbsync; ptesync sequence in
- * case we interrupted the guest between a tlbie and a ptesync.
- */
- asm volatile("eieio; tlbsync; ptesync");
+static void save_p9_host_os_sprs(struct p9_host_os_sprs *host_os_sprs)
+{
+ host_os_sprs->dscr = mfspr(SPRN_DSCR);
+ host_os_sprs->tidr = mfspr(SPRN_TIDR);
+ host_os_sprs->iamr = mfspr(SPRN_IAMR);
+ host_os_sprs->amr = mfspr(SPRN_AMR);
+ host_os_sprs->fscr = mfspr(SPRN_FSCR);
+}
- /*
- * cp_abort is required if the processor supports local copy-paste
- * to clear the copy buffer that was under control of the guest.
- */
- if (cpu_has_feature(CPU_FTR_ARCH_31))
- asm volatile(PPC_CP_ABORT);
+/* vcpu guest regs must already be saved */
+static void restore_p9_host_os_sprs(struct kvm_vcpu *vcpu,
+ struct p9_host_os_sprs *host_os_sprs)
+{
+ mtspr(SPRN_PSPB, 0);
+ mtspr(SPRN_WORT, 0);
+ mtspr(SPRN_UAMOR, 0);
- mtspr(SPRN_LPID, vcpu->kvm->arch.host_lpid); /* restore host LPID */
- isync();
+ mtspr(SPRN_DSCR, host_os_sprs->dscr);
+ mtspr(SPRN_TIDR, host_os_sprs->tidr);
+ mtspr(SPRN_IAMR, host_os_sprs->iamr);
- vc->dpdes = mfspr(SPRN_DPDES);
- vc->vtb = mfspr(SPRN_VTB);
- mtspr(SPRN_DPDES, 0);
- if (vc->pcr)
- mtspr(SPRN_PCR, PCR_MASK);
+ if (host_os_sprs->amr != vcpu->arch.amr)
+ mtspr(SPRN_AMR, host_os_sprs->amr);
- if (vc->tb_offset_applied) {
- u64 new_tb = mftb() - vc->tb_offset_applied;
- mtspr(SPRN_TBU40, new_tb);
- tb = mftb();
- if ((tb & 0xffffff) < (new_tb & 0xffffff))
- mtspr(SPRN_TBU40, new_tb + 0x1000000);
- vc->tb_offset_applied = 0;
- }
+ if (host_os_sprs->fscr != vcpu->arch.fscr)
+ mtspr(SPRN_FSCR, host_os_sprs->fscr);
- mtspr(SPRN_HDEC, 0x7fffffff);
- mtspr(SPRN_LPCR, vcpu->kvm->arch.host_lpcr);
+ /* Save guest CTRL register, set runlatch to 1 */
+ if (!(vcpu->arch.ctrl & 1))
+ mtspr(SPRN_CTRLT, 1);
+}
- return trap;
+static inline bool hcall_is_xics(unsigned long req)
+{
+ return req == H_EOI || req == H_CPPR || req == H_IPI ||
+ req == H_IPOLL || req == H_XIRR || req == H_XIRR_X;
}
/*
- * Virtual-mode guest entry for POWER9 and later when the host and
- * guest are both using the radix MMU. The LPIDR has already been set.
+ * Guest entry for POWER9 and later CPUs.
*/
static int kvmhv_p9_guest_entry(struct kvm_vcpu *vcpu, u64 time_limit,
unsigned long lpcr)
{
struct kvmppc_vcore *vc = vcpu->arch.vcore;
- unsigned long host_dscr = mfspr(SPRN_DSCR);
- unsigned long host_tidr = mfspr(SPRN_TIDR);
- unsigned long host_iamr = mfspr(SPRN_IAMR);
- unsigned long host_amr = mfspr(SPRN_AMR);
- unsigned long host_fscr = mfspr(SPRN_FSCR);
+ struct p9_host_os_sprs host_os_sprs;
s64 dec;
u64 tb;
int trap, save_pmu;
+ WARN_ON_ONCE(vcpu->arch.ceded);
+
dec = mfspr(SPRN_DEC);
tb = mftb();
if (dec < 0)
@@ -3664,7 +3830,7 @@ static int kvmhv_p9_guest_entry(struct kvm_vcpu *vcpu, u64 time_limit,
if (local_paca->kvm_hstate.dec_expires < time_limit)
time_limit = local_paca->kvm_hstate.dec_expires;
- vcpu->arch.ceded = 0;
+ save_p9_host_os_sprs(&host_os_sprs);
kvmhv_save_host_pmu(); /* saves it to PACA kvm_hstate */
@@ -3693,24 +3859,20 @@ static int kvmhv_p9_guest_entry(struct kvm_vcpu *vcpu, u64 time_limit,
#endif
mtspr(SPRN_VRSAVE, vcpu->arch.vrsave);
- mtspr(SPRN_DSCR, vcpu->arch.dscr);
- mtspr(SPRN_IAMR, vcpu->arch.iamr);
- mtspr(SPRN_PSPB, vcpu->arch.pspb);
- mtspr(SPRN_FSCR, vcpu->arch.fscr);
- mtspr(SPRN_TAR, vcpu->arch.tar);
- mtspr(SPRN_EBBHR, vcpu->arch.ebbhr);
- mtspr(SPRN_EBBRR, vcpu->arch.ebbrr);
- mtspr(SPRN_BESCR, vcpu->arch.bescr);
- mtspr(SPRN_WORT, vcpu->arch.wort);
- mtspr(SPRN_TIDR, vcpu->arch.tid);
- mtspr(SPRN_DAR, vcpu->arch.shregs.dar);
- mtspr(SPRN_DSISR, vcpu->arch.shregs.dsisr);
- mtspr(SPRN_AMR, vcpu->arch.amr);
- mtspr(SPRN_UAMOR, vcpu->arch.uamor);
-
- if (!(vcpu->arch.ctrl & 1))
- mtspr(SPRN_CTRLT, mfspr(SPRN_CTRLF) & ~1);
+ load_spr_state(vcpu);
+ /*
+ * When setting DEC, we must always deal with irq_work_raise via NMI vs
+ * setting DEC. The problem occurs right as we switch into guest mode
+ * if a NMI hits and sets pending work and sets DEC, then that will
+ * apply to the guest and not bring us back to the host.
+ *
+ * irq_work_raise could check a flag (or possibly LPCR[HDICE] for
+ * example) and set HDEC to 1? That wouldn't solve the nested hv
+ * case which needs to abort the hcall or zero the time limit.
+ *
+ * XXX: Another day's problem.
+ */
mtspr(SPRN_DEC, vcpu->arch.dec_expires - mftb());
if (kvmhv_on_pseries()) {
@@ -3718,7 +3880,7 @@ static int kvmhv_p9_guest_entry(struct kvm_vcpu *vcpu, u64 time_limit,
* We need to save and restore the guest visible part of the
* psscr (i.e. using SPRN_PSSCR_PR) since the hypervisor
* doesn't do this for us. Note only required if pseries since
- * this is done in kvmhv_load_hv_regs_and_go() below otherwise.
+ * this is done in kvmhv_vcpu_entry_p9() below otherwise.
*/
unsigned long host_psscr;
/* call our hypervisor to load up HV regs and go */
@@ -3738,6 +3900,8 @@ static int kvmhv_p9_guest_entry(struct kvm_vcpu *vcpu, u64 time_limit,
hvregs.vcpu_token = vcpu->vcpu_id;
}
hvregs.hdec_expiry = time_limit;
+ mtspr(SPRN_DAR, vcpu->arch.shregs.dar);
+ mtspr(SPRN_DSISR, vcpu->arch.shregs.dsisr);
trap = plpar_hcall_norets(H_ENTER_NESTED, __pa(&hvregs),
__pa(&vcpu->arch.regs));
kvmhv_restore_hv_return_state(vcpu, &hvregs);
@@ -3750,15 +3914,41 @@ static int kvmhv_p9_guest_entry(struct kvm_vcpu *vcpu, u64 time_limit,
/* H_CEDE has to be handled now, not later */
if (trap == BOOK3S_INTERRUPT_SYSCALL && !vcpu->arch.nested &&
kvmppc_get_gpr(vcpu, 3) == H_CEDE) {
- kvmppc_nested_cede(vcpu);
+ kvmppc_cede(vcpu);
kvmppc_set_gpr(vcpu, 3, 0);
trap = 0;
}
} else {
- trap = kvmhv_load_hv_regs_and_go(vcpu, time_limit, lpcr);
+ kvmppc_xive_push_vcpu(vcpu);
+ trap = kvmhv_vcpu_entry_p9(vcpu, time_limit, lpcr);
+ if (trap == BOOK3S_INTERRUPT_SYSCALL && !vcpu->arch.nested &&
+ !(vcpu->arch.shregs.msr & MSR_PR)) {
+ unsigned long req = kvmppc_get_gpr(vcpu, 3);
+
+ /* H_CEDE has to be handled now, not later */
+ if (req == H_CEDE) {
+ kvmppc_cede(vcpu);
+ kvmppc_xive_rearm_escalation(vcpu); /* may un-cede */
+ kvmppc_set_gpr(vcpu, 3, 0);
+ trap = 0;
+
+ /* XICS hcalls must be handled before xive is pulled */
+ } else if (hcall_is_xics(req)) {
+ int ret;
+
+ ret = kvmppc_xive_xics_hcall(vcpu, req);
+ if (ret != H_TOO_HARD) {
+ kvmppc_set_gpr(vcpu, 3, ret);
+ trap = 0;
+ }
+ }
+ }
+ kvmppc_xive_pull_vcpu(vcpu);
+
+ if (kvm_is_radix(vcpu->kvm))
+ vcpu->arch.slb_max = 0;
}
- vcpu->arch.slb_max = 0;
dec = mfspr(SPRN_DEC);
if (!(lpcr & LPCR_LD)) /* Sign extend if not using large decrementer */
dec = (s32) dec;
@@ -3766,36 +3956,10 @@ static int kvmhv_p9_guest_entry(struct kvm_vcpu *vcpu, u64 time_limit,
vcpu->arch.dec_expires = dec + tb;
vcpu->cpu = -1;
vcpu->arch.thread_cpu = -1;
- /* Save guest CTRL register, set runlatch to 1 */
- vcpu->arch.ctrl = mfspr(SPRN_CTRLF);
- if (!(vcpu->arch.ctrl & 1))
- mtspr(SPRN_CTRLT, vcpu->arch.ctrl | 1);
-
- vcpu->arch.iamr = mfspr(SPRN_IAMR);
- vcpu->arch.pspb = mfspr(SPRN_PSPB);
- vcpu->arch.fscr = mfspr(SPRN_FSCR);
- vcpu->arch.tar = mfspr(SPRN_TAR);
- vcpu->arch.ebbhr = mfspr(SPRN_EBBHR);
- vcpu->arch.ebbrr = mfspr(SPRN_EBBRR);
- vcpu->arch.bescr = mfspr(SPRN_BESCR);
- vcpu->arch.wort = mfspr(SPRN_WORT);
- vcpu->arch.tid = mfspr(SPRN_TIDR);
- vcpu->arch.amr = mfspr(SPRN_AMR);
- vcpu->arch.uamor = mfspr(SPRN_UAMOR);
- vcpu->arch.dscr = mfspr(SPRN_DSCR);
- mtspr(SPRN_PSPB, 0);
- mtspr(SPRN_WORT, 0);
- mtspr(SPRN_UAMOR, 0);
- mtspr(SPRN_DSCR, host_dscr);
- mtspr(SPRN_TIDR, host_tidr);
- mtspr(SPRN_IAMR, host_iamr);
+ store_spr_state(vcpu);
- if (host_amr != vcpu->arch.amr)
- mtspr(SPRN_AMR, host_amr);
-
- if (host_fscr != vcpu->arch.fscr)
- mtspr(SPRN_FSCR, host_fscr);
+ restore_p9_host_os_sprs(vcpu, &host_os_sprs);
msr_check_and_set(MSR_FP | MSR_VEC | MSR_VSX);
store_fp_state(&vcpu->arch.fp);
@@ -3825,6 +3989,9 @@ static int kvmhv_p9_guest_entry(struct kvm_vcpu *vcpu, u64 time_limit,
vc->in_guest = 0;
mtspr(SPRN_DEC, local_paca->kvm_hstate.dec_expires - mftb());
+ /* We may have raced with new irq work */
+ if (test_irq_work_pending())
+ set_dec(1);
mtspr(SPRN_SPRG_VDSO_WRITE, local_paca->sprg_vdso);
kvmhv_load_host_pmu();
@@ -3925,7 +4092,7 @@ static void kvmppc_vcore_blocked(struct kvmppc_vcore *vc)
cur = start_poll = ktime_get();
if (vc->halt_poll_ns) {
ktime_t stop = ktime_add_ns(start_poll, vc->halt_poll_ns);
- ++vc->runner->stat.halt_attempted_poll;
+ ++vc->runner->stat.generic.halt_attempted_poll;
vc->vcore_state = VCORE_POLLING;
spin_unlock(&vc->lock);
@@ -3942,7 +4109,7 @@ static void kvmppc_vcore_blocked(struct kvmppc_vcore *vc)
vc->vcore_state = VCORE_INACTIVE;
if (!do_sleep) {
- ++vc->runner->stat.halt_successful_poll;
+ ++vc->runner->stat.generic.halt_successful_poll;
goto out;
}
}
@@ -3954,7 +4121,7 @@ static void kvmppc_vcore_blocked(struct kvmppc_vcore *vc)
do_sleep = 0;
/* If we polled, count this as a successful poll */
if (vc->halt_poll_ns)
- ++vc->runner->stat.halt_successful_poll;
+ ++vc->runner->stat.generic.halt_successful_poll;
goto out;
}
@@ -3981,13 +4148,13 @@ out:
ktime_to_ns(cur) - ktime_to_ns(start_wait);
/* Attribute failed poll time */
if (vc->halt_poll_ns)
- vc->runner->stat.halt_poll_fail_ns +=
+ vc->runner->stat.generic.halt_poll_fail_ns +=
ktime_to_ns(start_wait) -
ktime_to_ns(start_poll);
} else {
/* Attribute successful poll time */
if (vc->halt_poll_ns)
- vc->runner->stat.halt_poll_success_ns +=
+ vc->runner->stat.generic.halt_poll_success_ns +=
ktime_to_ns(cur) -
ktime_to_ns(start_poll);
}
@@ -4014,7 +4181,6 @@ out:
/*
* This never fails for a radix guest, as none of the operations it does
* for a radix guest can fail or have a way to report failure.
- * kvmhv_run_single_vcpu() relies on this fact.
*/
static int kvmhv_setup_mmu(struct kvm_vcpu *vcpu)
{
@@ -4170,7 +4336,7 @@ int kvmhv_run_single_vcpu(struct kvm_vcpu *vcpu, u64 time_limit,
{
struct kvm_run *run = vcpu->run;
int trap, r, pcpu;
- int srcu_idx, lpid;
+ int srcu_idx;
struct kvmppc_vcore *vc;
struct kvm *kvm = vcpu->kvm;
struct kvm_nested_guest *nested = vcpu->arch.nested;
@@ -4193,8 +4359,15 @@ int kvmhv_run_single_vcpu(struct kvm_vcpu *vcpu, u64 time_limit,
vc->runner = vcpu;
/* See if the MMU is ready to go */
- if (!kvm->arch.mmu_ready)
- kvmhv_setup_mmu(vcpu);
+ if (!kvm->arch.mmu_ready) {
+ r = kvmhv_setup_mmu(vcpu);
+ if (r) {
+ run->exit_reason = KVM_EXIT_FAIL_ENTRY;
+ run->fail_entry.hardware_entry_failure_reason = 0;
+ vcpu->arch.ret = r;
+ return r;
+ }
+ }
if (need_resched())
cond_resched();
@@ -4207,7 +4380,8 @@ int kvmhv_run_single_vcpu(struct kvm_vcpu *vcpu, u64 time_limit,
preempt_disable();
pcpu = smp_processor_id();
vc->pcpu = pcpu;
- kvmppc_prepare_radix_vcpu(vcpu, pcpu);
+ if (kvm_is_radix(kvm))
+ kvmppc_prepare_radix_vcpu(vcpu, pcpu);
local_irq_disable();
hard_irq_disable();
@@ -4244,13 +4418,6 @@ int kvmhv_run_single_vcpu(struct kvm_vcpu *vcpu, u64 time_limit,
vc->vcore_state = VCORE_RUNNING;
trace_kvmppc_run_core(vc, 0);
- if (cpu_has_feature(CPU_FTR_HVMODE)) {
- lpid = nested ? nested->shadow_lpid : kvm->arch.lpid;
- mtspr(SPRN_LPID, lpid);
- isync();
- kvmppc_check_need_tlb_flush(kvm, pcpu, nested);
- }
-
guest_enter_irqoff();
srcu_idx = srcu_read_lock(&kvm->srcu);
@@ -4269,11 +4436,6 @@ int kvmhv_run_single_vcpu(struct kvm_vcpu *vcpu, u64 time_limit,
srcu_read_unlock(&kvm->srcu, srcu_idx);
- if (cpu_has_feature(CPU_FTR_HVMODE)) {
- mtspr(SPRN_LPID, kvm->arch.host_lpid);
- isync();
- }
-
set_irq_happened(trap);
kvmppc_set_host_core(pcpu);
@@ -4419,19 +4581,23 @@ static int kvmppc_vcpu_run_hv(struct kvm_vcpu *vcpu)
vcpu->arch.state = KVMPPC_VCPU_BUSY_IN_HOST;
do {
- /*
- * The TLB prefetch bug fixup is only in the kvmppc_run_vcpu
- * path, which also handles hash and dependent threads mode.
- */
- if (kvm->arch.threads_indep && kvm_is_radix(kvm) &&
- !cpu_has_feature(CPU_FTR_P9_RADIX_PREFETCH_BUG))
+ if (cpu_has_feature(CPU_FTR_ARCH_300))
r = kvmhv_run_single_vcpu(vcpu, ~(u64)0,
vcpu->arch.vcore->lpcr);
else
r = kvmppc_run_vcpu(vcpu);
- if (run->exit_reason == KVM_EXIT_PAPR_HCALL &&
- !(vcpu->arch.shregs.msr & MSR_PR)) {
+ if (run->exit_reason == KVM_EXIT_PAPR_HCALL) {
+ if (WARN_ON_ONCE(vcpu->arch.shregs.msr & MSR_PR)) {
+ /*
+ * These should have been caught reflected
+ * into the guest by now. Final sanity check:
+ * don't allow userspace to execute hcalls in
+ * the hypervisor.
+ */
+ r = RESUME_GUEST;
+ continue;
+ }
trace_kvm_hcall_enter(vcpu);
r = kvmppc_pseries_do_hcall(vcpu);
trace_kvm_hcall_exit(vcpu, r);
@@ -5038,18 +5204,8 @@ static int kvmppc_core_init_vm_hv(struct kvm *kvm)
/*
* Track that we now have a HV mode VM active. This blocks secondary
* CPU threads from coming online.
- * On POWER9, we only need to do this if the "indep_threads_mode"
- * module parameter has been set to N.
*/
- if (cpu_has_feature(CPU_FTR_ARCH_300)) {
- if (!indep_threads_mode && !cpu_has_feature(CPU_FTR_HVMODE)) {
- pr_warn("KVM: Ignoring indep_threads_mode=N in nested hypervisor\n");
- kvm->arch.threads_indep = true;
- } else {
- kvm->arch.threads_indep = indep_threads_mode;
- }
- }
- if (!kvm->arch.threads_indep)
+ if (!cpu_has_feature(CPU_FTR_ARCH_300))
kvm_hv_vm_activated();
/*
@@ -5090,7 +5246,7 @@ static void kvmppc_core_destroy_vm_hv(struct kvm *kvm)
{
debugfs_remove_recursive(kvm->arch.debugfs_dir);
- if (!kvm->arch.threads_indep)
+ if (!cpu_has_feature(CPU_FTR_ARCH_300))
kvm_hv_vm_deactivated();
kvmppc_free_vcores(kvm);
@@ -5511,7 +5667,9 @@ static int kvmhv_enable_nested(struct kvm *kvm)
{
if (!nested)
return -EPERM;
- if (!cpu_has_feature(CPU_FTR_ARCH_300) || no_mixing_hpt_and_radix)
+ if (!cpu_has_feature(CPU_FTR_ARCH_300))
+ return -ENODEV;
+ if (!radix_enabled())
return -ENODEV;
/* kvm == NULL means the caller is testing if the capability exists */
@@ -5674,11 +5832,25 @@ static int kvmhv_enable_dawr1(struct kvm *kvm)
static bool kvmppc_hash_v3_possible(void)
{
- if (radix_enabled() && no_mixing_hpt_and_radix)
+ if (!cpu_has_feature(CPU_FTR_ARCH_300))
+ return false;
+
+ if (!cpu_has_feature(CPU_FTR_HVMODE))
return false;
- return cpu_has_feature(CPU_FTR_ARCH_300) &&
- cpu_has_feature(CPU_FTR_HVMODE);
+ /*
+ * POWER9 chips before version 2.02 can't have some threads in
+ * HPT mode and some in radix mode on the same core.
+ */
+ if (radix_enabled()) {
+ unsigned int pvr = mfspr(SPRN_PVR);
+ if ((pvr >> 16) == PVR_POWER9 &&
+ (((pvr & 0xe000) == 0 && (pvr & 0xfff) < 0x202) ||
+ ((pvr & 0xe000) == 0x2000 && (pvr & 0xfff) < 0x101)))
+ return false;
+ }
+
+ return true;
}
static struct kvmppc_ops kvm_ops_hv = {
@@ -5822,18 +5994,6 @@ static int kvmppc_book3s_init_hv(void)
if (kvmppc_radix_possible())
r = kvmppc_radix_init();
- /*
- * POWER9 chips before version 2.02 can't have some threads in
- * HPT mode and some in radix mode on the same core.
- */
- if (cpu_has_feature(CPU_FTR_ARCH_300)) {
- unsigned int pvr = mfspr(SPRN_PVR);
- if ((pvr >> 16) == PVR_POWER9 &&
- (((pvr & 0xe000) == 0 && (pvr & 0xfff) < 0x202) ||
- ((pvr & 0xe000) == 0x2000 && (pvr & 0xfff) < 0x101)))
- no_mixing_hpt_and_radix = true;
- }
-
r = kvmppc_uvmem_init();
if (r < 0)
pr_err("KVM-HV: kvmppc_uvmem_init failed %d\n", r);
diff --git a/arch/powerpc/kvm/book3s_hv_builtin.c b/arch/powerpc/kvm/book3s_hv_builtin.c
index 7a0e33a9c980..be8ef1c5b1bf 100644
--- a/arch/powerpc/kvm/book3s_hv_builtin.c
+++ b/arch/powerpc/kvm/book3s_hv_builtin.c
@@ -35,21 +35,6 @@
#include "book3s_xive.h"
/*
- * The XIVE module will populate these when it loads
- */
-unsigned long (*__xive_vm_h_xirr)(struct kvm_vcpu *vcpu);
-unsigned long (*__xive_vm_h_ipoll)(struct kvm_vcpu *vcpu, unsigned long server);
-int (*__xive_vm_h_ipi)(struct kvm_vcpu *vcpu, unsigned long server,
- unsigned long mfrr);
-int (*__xive_vm_h_cppr)(struct kvm_vcpu *vcpu, unsigned long cppr);
-int (*__xive_vm_h_eoi)(struct kvm_vcpu *vcpu, unsigned long xirr);
-EXPORT_SYMBOL_GPL(__xive_vm_h_xirr);
-EXPORT_SYMBOL_GPL(__xive_vm_h_ipoll);
-EXPORT_SYMBOL_GPL(__xive_vm_h_ipi);
-EXPORT_SYMBOL_GPL(__xive_vm_h_cppr);
-EXPORT_SYMBOL_GPL(__xive_vm_h_eoi);
-
-/*
* Hash page table alignment on newer cpus(CPU_FTR_ARCH_206)
* should be power of 2.
*/
@@ -196,16 +181,9 @@ int kvmppc_hwrng_present(void)
}
EXPORT_SYMBOL_GPL(kvmppc_hwrng_present);
-long kvmppc_h_random(struct kvm_vcpu *vcpu)
+long kvmppc_rm_h_random(struct kvm_vcpu *vcpu)
{
- int r;
-
- /* Only need to do the expensive mfmsr() on radix */
- if (kvm_is_radix(vcpu->kvm) && (mfmsr() & MSR_IR))
- r = powernv_get_random_long(&vcpu->arch.regs.gpr[4]);
- else
- r = powernv_get_random_real_mode(&vcpu->arch.regs.gpr[4]);
- if (r)
+ if (powernv_get_random_real_mode(&vcpu->arch.regs.gpr[4]))
return H_SUCCESS;
return H_HARDWARE;
@@ -221,15 +199,6 @@ void kvmhv_rm_send_ipi(int cpu)
void __iomem *xics_phys;
unsigned long msg = PPC_DBELL_TYPE(PPC_DBELL_SERVER);
- /* For a nested hypervisor, use the XICS via hcall */
- if (kvmhv_on_pseries()) {
- unsigned long retbuf[PLPAR_HCALL_BUFSIZE];
-
- plpar_hcall_raw(H_IPI, retbuf, get_hard_smp_processor_id(cpu),
- IPI_PRIORITY);
- return;
- }
-
/* On POWER9 we can use msgsnd for any destination cpu. */
if (cpu_has_feature(CPU_FTR_ARCH_300)) {
msg |= get_hard_smp_processor_id(cpu);
@@ -442,19 +411,12 @@ static long kvmppc_read_one_intr(bool *again)
return 1;
/* Now read the interrupt from the ICP */
- if (kvmhv_on_pseries()) {
- unsigned long retbuf[PLPAR_HCALL_BUFSIZE];
-
- rc = plpar_hcall_raw(H_XIRR, retbuf, 0xFF);
- xirr = cpu_to_be32(retbuf[0]);
- } else {
- xics_phys = local_paca->kvm_hstate.xics_phys;
- rc = 0;
- if (!xics_phys)
- rc = opal_int_get_xirr(&xirr, false);
- else
- xirr = __raw_rm_readl(xics_phys + XICS_XIRR);
- }
+ xics_phys = local_paca->kvm_hstate.xics_phys;
+ rc = 0;
+ if (!xics_phys)
+ rc = opal_int_get_xirr(&xirr, false);
+ else
+ xirr = __raw_rm_readl(xics_phys + XICS_XIRR);
if (rc < 0)
return 1;
@@ -483,13 +445,7 @@ static long kvmppc_read_one_intr(bool *again)
*/
if (xisr == XICS_IPI) {
rc = 0;
- if (kvmhv_on_pseries()) {
- unsigned long retbuf[PLPAR_HCALL_BUFSIZE];
-
- plpar_hcall_raw(H_IPI, retbuf,
- hard_smp_processor_id(), 0xff);
- plpar_hcall_raw(H_EOI, retbuf, h_xirr);
- } else if (xics_phys) {
+ if (xics_phys) {
__raw_rm_writeb(0xff, xics_phys + XICS_MFRR);
__raw_rm_writel(xirr, xics_phys + XICS_XIRR);
} else {
@@ -515,13 +471,7 @@ static long kvmppc_read_one_intr(bool *again)
/* We raced with the host,
* we need to resend that IPI, bummer
*/
- if (kvmhv_on_pseries()) {
- unsigned long retbuf[PLPAR_HCALL_BUFSIZE];
-
- plpar_hcall_raw(H_IPI, retbuf,
- hard_smp_processor_id(),
- IPI_PRIORITY);
- } else if (xics_phys)
+ if (xics_phys)
__raw_rm_writeb(IPI_PRIORITY,
xics_phys + XICS_MFRR);
else
@@ -541,22 +491,13 @@ static long kvmppc_read_one_intr(bool *again)
}
#ifdef CONFIG_KVM_XICS
-static inline bool is_rm(void)
-{
- return !(mfmsr() & MSR_DR);
-}
-
unsigned long kvmppc_rm_h_xirr(struct kvm_vcpu *vcpu)
{
if (!kvmppc_xics_enabled(vcpu))
return H_TOO_HARD;
- if (xics_on_xive()) {
- if (is_rm())
- return xive_rm_h_xirr(vcpu);
- if (unlikely(!__xive_vm_h_xirr))
- return H_NOT_AVAILABLE;
- return __xive_vm_h_xirr(vcpu);
- } else
+ if (xics_on_xive())
+ return xive_rm_h_xirr(vcpu);
+ else
return xics_rm_h_xirr(vcpu);
}
@@ -565,13 +506,9 @@ unsigned long kvmppc_rm_h_xirr_x(struct kvm_vcpu *vcpu)
if (!kvmppc_xics_enabled(vcpu))
return H_TOO_HARD;
vcpu->arch.regs.gpr[5] = get_tb();
- if (xics_on_xive()) {
- if (is_rm())
- return xive_rm_h_xirr(vcpu);
- if (unlikely(!__xive_vm_h_xirr))
- return H_NOT_AVAILABLE;
- return __xive_vm_h_xirr(vcpu);
- } else
+ if (xics_on_xive())
+ return xive_rm_h_xirr(vcpu);
+ else
return xics_rm_h_xirr(vcpu);
}
@@ -579,13 +516,9 @@ unsigned long kvmppc_rm_h_ipoll(struct kvm_vcpu *vcpu, unsigned long server)
{
if (!kvmppc_xics_enabled(vcpu))
return H_TOO_HARD;
- if (xics_on_xive()) {
- if (is_rm())
- return xive_rm_h_ipoll(vcpu, server);
- if (unlikely(!__xive_vm_h_ipoll))
- return H_NOT_AVAILABLE;
- return __xive_vm_h_ipoll(vcpu, server);
- } else
+ if (xics_on_xive())
+ return xive_rm_h_ipoll(vcpu, server);
+ else
return H_TOO_HARD;
}
@@ -594,13 +527,9 @@ int kvmppc_rm_h_ipi(struct kvm_vcpu *vcpu, unsigned long server,
{
if (!kvmppc_xics_enabled(vcpu))
return H_TOO_HARD;
- if (xics_on_xive()) {
- if (is_rm())
- return xive_rm_h_ipi(vcpu, server, mfrr);
- if (unlikely(!__xive_vm_h_ipi))
- return H_NOT_AVAILABLE;
- return __xive_vm_h_ipi(vcpu, server, mfrr);
- } else
+ if (xics_on_xive())
+ return xive_rm_h_ipi(vcpu, server, mfrr);
+ else
return xics_rm_h_ipi(vcpu, server, mfrr);
}
@@ -608,13 +537,9 @@ int kvmppc_rm_h_cppr(struct kvm_vcpu *vcpu, unsigned long cppr)
{
if (!kvmppc_xics_enabled(vcpu))
return H_TOO_HARD;
- if (xics_on_xive()) {
- if (is_rm())
- return xive_rm_h_cppr(vcpu, cppr);
- if (unlikely(!__xive_vm_h_cppr))
- return H_NOT_AVAILABLE;
- return __xive_vm_h_cppr(vcpu, cppr);
- } else
+ if (xics_on_xive())
+ return xive_rm_h_cppr(vcpu, cppr);
+ else
return xics_rm_h_cppr(vcpu, cppr);
}
@@ -622,13 +547,9 @@ int kvmppc_rm_h_eoi(struct kvm_vcpu *vcpu, unsigned long xirr)
{
if (!kvmppc_xics_enabled(vcpu))
return H_TOO_HARD;
- if (xics_on_xive()) {
- if (is_rm())
- return xive_rm_h_eoi(vcpu, xirr);
- if (unlikely(!__xive_vm_h_eoi))
- return H_NOT_AVAILABLE;
- return __xive_vm_h_eoi(vcpu, xirr);
- } else
+ if (xics_on_xive())
+ return xive_rm_h_eoi(vcpu, xirr);
+ else
return xics_rm_h_eoi(vcpu, xirr);
}
#endif /* CONFIG_KVM_XICS */
@@ -800,7 +721,7 @@ void kvmppc_check_need_tlb_flush(struct kvm *kvm, int pcpu,
* Thus we make all 4 threads use the same bit.
*/
if (cpu_has_feature(CPU_FTR_ARCH_300))
- pcpu = cpu_first_thread_sibling(pcpu);
+ pcpu = cpu_first_tlb_thread_sibling(pcpu);
if (nested)
need_tlb_flush = &nested->need_tlb_flush;
diff --git a/arch/powerpc/kvm/book3s_hv_interrupts.S b/arch/powerpc/kvm/book3s_hv_interrupts.S
index 327417d79eac..4444f83cb133 100644
--- a/arch/powerpc/kvm/book3s_hv_interrupts.S
+++ b/arch/powerpc/kvm/book3s_hv_interrupts.S
@@ -58,7 +58,7 @@ END_FTR_SECTION_IFCLR(CPU_FTR_ARCH_207S)
/*
* Put whatever is in the decrementer into the
* hypervisor decrementer.
- * Because of a hardware deviation in P8 and P9,
+ * Because of a hardware deviation in P8,
* we need to set LPCR[HDICE] before writing HDEC.
*/
ld r5, HSTATE_KVM_VCORE(r13)
@@ -67,15 +67,10 @@ END_FTR_SECTION_IFCLR(CPU_FTR_ARCH_207S)
ori r8, r9, LPCR_HDICE
mtspr SPRN_LPCR, r8
isync
- andis. r0, r9, LPCR_LD@h
mfspr r8,SPRN_DEC
mftb r7
-BEGIN_FTR_SECTION
- /* On POWER9, don't sign-extend if host LPCR[LD] bit is set */
- bne 32f
-END_FTR_SECTION_IFSET(CPU_FTR_ARCH_300)
extsw r8,r8
-32: mtspr SPRN_HDEC,r8
+ mtspr SPRN_HDEC,r8
add r8,r8,r7
std r8,HSTATE_DECEXP(r13)
diff --git a/arch/powerpc/kvm/book3s_hv_nested.c b/arch/powerpc/kvm/book3s_hv_nested.c
index 60724f674421..8543ad538b0c 100644
--- a/arch/powerpc/kvm/book3s_hv_nested.c
+++ b/arch/powerpc/kvm/book3s_hv_nested.c
@@ -19,6 +19,7 @@
#include <asm/pgalloc.h>
#include <asm/pte-walk.h>
#include <asm/reg.h>
+#include <asm/plpar_wrappers.h>
static struct patb_entry *pseries_partition_tb;
@@ -53,7 +54,8 @@ void kvmhv_save_hv_regs(struct kvm_vcpu *vcpu, struct hv_guest_state *hr)
hr->dawrx1 = vcpu->arch.dawrx1;
}
-static void byteswap_pt_regs(struct pt_regs *regs)
+/* Use noinline_for_stack due to https://bugs.llvm.org/show_bug.cgi?id=49610 */
+static noinline_for_stack void byteswap_pt_regs(struct pt_regs *regs)
{
unsigned long *addr = (unsigned long *) regs;
@@ -467,8 +469,15 @@ static void kvmhv_flush_lpid(unsigned int lpid)
return;
}
- rc = plpar_hcall_norets(H_TLB_INVALIDATE, H_TLBIE_P1_ENC(2, 0, 1),
- lpid, TLBIEL_INVAL_SET_LPID);
+ if (!firmware_has_feature(FW_FEATURE_RPT_INVALIDATE))
+ rc = plpar_hcall_norets(H_TLB_INVALIDATE, H_TLBIE_P1_ENC(2, 0, 1),
+ lpid, TLBIEL_INVAL_SET_LPID);
+ else
+ rc = pseries_rpt_invalidate(lpid, H_RPTI_TARGET_CMMU,
+ H_RPTI_TYPE_NESTED |
+ H_RPTI_TYPE_TLB | H_RPTI_TYPE_PWC |
+ H_RPTI_TYPE_PAT,
+ H_RPTI_PAGE_ALL, 0, -1UL);
if (rc)
pr_err("KVM: TLB LPID invalidation hcall failed, rc=%ld\n", rc);
}
@@ -1214,6 +1223,113 @@ long kvmhv_do_nested_tlbie(struct kvm_vcpu *vcpu)
return H_SUCCESS;
}
+static long do_tlb_invalidate_nested_all(struct kvm_vcpu *vcpu,
+ unsigned long lpid, unsigned long ric)
+{
+ struct kvm *kvm = vcpu->kvm;
+ struct kvm_nested_guest *gp;
+
+ gp = kvmhv_get_nested(kvm, lpid, false);
+ if (gp) {
+ kvmhv_emulate_tlbie_lpid(vcpu, gp, ric);
+ kvmhv_put_nested(gp);
+ }
+ return H_SUCCESS;
+}
+
+/*
+ * Number of pages above which we invalidate the entire LPID rather than
+ * flush individual pages.
+ */
+static unsigned long tlb_range_flush_page_ceiling __read_mostly = 33;
+
+static long do_tlb_invalidate_nested_tlb(struct kvm_vcpu *vcpu,
+ unsigned long lpid,
+ unsigned long pg_sizes,
+ unsigned long start,
+ unsigned long end)
+{
+ int ret = H_P4;
+ unsigned long addr, nr_pages;
+ struct mmu_psize_def *def;
+ unsigned long psize, ap, page_size;
+ bool flush_lpid;
+
+ for (psize = 0; psize < MMU_PAGE_COUNT; psize++) {
+ def = &mmu_psize_defs[psize];
+ if (!(pg_sizes & def->h_rpt_pgsize))
+ continue;
+
+ nr_pages = (end - start) >> def->shift;
+ flush_lpid = nr_pages > tlb_range_flush_page_ceiling;
+ if (flush_lpid)
+ return do_tlb_invalidate_nested_all(vcpu, lpid,
+ RIC_FLUSH_TLB);
+ addr = start;
+ ap = mmu_get_ap(psize);
+ page_size = 1UL << def->shift;
+ do {
+ ret = kvmhv_emulate_tlbie_tlb_addr(vcpu, lpid, ap,
+ get_epn(addr));
+ if (ret)
+ return H_P4;
+ addr += page_size;
+ } while (addr < end);
+ }
+ return ret;
+}
+
+/*
+ * Performs partition-scoped invalidations for nested guests
+ * as part of H_RPT_INVALIDATE hcall.
+ */
+long do_h_rpt_invalidate_pat(struct kvm_vcpu *vcpu, unsigned long lpid,
+ unsigned long type, unsigned long pg_sizes,
+ unsigned long start, unsigned long end)
+{
+ /*
+ * If L2 lpid isn't valid, we need to return H_PARAMETER.
+ *
+ * However, nested KVM issues a L2 lpid flush call when creating
+ * partition table entries for L2. This happens even before the
+ * corresponding shadow lpid is created in HV which happens in
+ * H_ENTER_NESTED call. Since we can't differentiate this case from
+ * the invalid case, we ignore such flush requests and return success.
+ */
+ if (!kvmhv_find_nested(vcpu->kvm, lpid))
+ return H_SUCCESS;
+
+ /*
+ * A flush all request can be handled by a full lpid flush only.
+ */
+ if ((type & H_RPTI_TYPE_NESTED_ALL) == H_RPTI_TYPE_NESTED_ALL)
+ return do_tlb_invalidate_nested_all(vcpu, lpid, RIC_FLUSH_ALL);
+
+ /*
+ * We don't need to handle a PWC flush like process table here,
+ * because intermediate partition scoped table in nested guest doesn't
+ * really have PWC. Only level we have PWC is in L0 and for nested
+ * invalidate at L0 we always do kvm_flush_lpid() which does
+ * radix__flush_all_lpid(). For range invalidate at any level, we
+ * are not removing the higher level page tables and hence there is
+ * no PWC invalidate needed.
+ *
+ * if (type & H_RPTI_TYPE_PWC) {
+ * ret = do_tlb_invalidate_nested_all(vcpu, lpid, RIC_FLUSH_PWC);
+ * if (ret)
+ * return H_P4;
+ * }
+ */
+
+ if (start == 0 && end == -1)
+ return do_tlb_invalidate_nested_all(vcpu, lpid, RIC_FLUSH_TLB);
+
+ if (type & H_RPTI_TYPE_TLB)
+ return do_tlb_invalidate_nested_tlb(vcpu, lpid, pg_sizes,
+ start, end);
+ return H_SUCCESS;
+}
+
/* Used to convert a nested guest real address to a L1 guest real address */
static int kvmhv_translate_addr_nested(struct kvm_vcpu *vcpu,
struct kvm_nested_guest *gp,
diff --git a/arch/powerpc/kvm/book3s_hv_p9_entry.c b/arch/powerpc/kvm/book3s_hv_p9_entry.c
new file mode 100644
index 000000000000..83f592eadcd2
--- /dev/null
+++ b/arch/powerpc/kvm/book3s_hv_p9_entry.c
@@ -0,0 +1,508 @@
+// SPDX-License-Identifier: GPL-2.0-only
+#include <linux/kernel.h>
+#include <linux/kvm_host.h>
+#include <asm/asm-prototypes.h>
+#include <asm/dbell.h>
+#include <asm/kvm_ppc.h>
+#include <asm/ppc-opcode.h>
+
+#ifdef CONFIG_KVM_BOOK3S_HV_EXIT_TIMING
+static void __start_timing(struct kvm_vcpu *vcpu, struct kvmhv_tb_accumulator *next)
+{
+ struct kvmppc_vcore *vc = vcpu->arch.vcore;
+ u64 tb = mftb() - vc->tb_offset_applied;
+
+ vcpu->arch.cur_activity = next;
+ vcpu->arch.cur_tb_start = tb;
+}
+
+static void __accumulate_time(struct kvm_vcpu *vcpu, struct kvmhv_tb_accumulator *next)
+{
+ struct kvmppc_vcore *vc = vcpu->arch.vcore;
+ struct kvmhv_tb_accumulator *curr;
+ u64 tb = mftb() - vc->tb_offset_applied;
+ u64 prev_tb;
+ u64 delta;
+ u64 seq;
+
+ curr = vcpu->arch.cur_activity;
+ vcpu->arch.cur_activity = next;
+ prev_tb = vcpu->arch.cur_tb_start;
+ vcpu->arch.cur_tb_start = tb;
+
+ if (!curr)
+ return;
+
+ delta = tb - prev_tb;
+
+ seq = curr->seqcount;
+ curr->seqcount = seq + 1;
+ smp_wmb();
+ curr->tb_total += delta;
+ if (seq == 0 || delta < curr->tb_min)
+ curr->tb_min = delta;
+ if (delta > curr->tb_max)
+ curr->tb_max = delta;
+ smp_wmb();
+ curr->seqcount = seq + 2;
+}
+
+#define start_timing(vcpu, next) __start_timing(vcpu, next)
+#define end_timing(vcpu) __start_timing(vcpu, NULL)
+#define accumulate_time(vcpu, next) __accumulate_time(vcpu, next)
+#else
+#define start_timing(vcpu, next) do {} while (0)
+#define end_timing(vcpu) do {} while (0)
+#define accumulate_time(vcpu, next) do {} while (0)
+#endif
+
+static inline void mfslb(unsigned int idx, u64 *slbee, u64 *slbev)
+{
+ asm volatile("slbmfev %0,%1" : "=r" (*slbev) : "r" (idx));
+ asm volatile("slbmfee %0,%1" : "=r" (*slbee) : "r" (idx));
+}
+
+static inline void mtslb(u64 slbee, u64 slbev)
+{
+ asm volatile("slbmte %0,%1" :: "r" (slbev), "r" (slbee));
+}
+
+static inline void clear_slb_entry(unsigned int idx)
+{
+ mtslb(idx, 0);
+}
+
+static inline void slb_clear_invalidate_partition(void)
+{
+ clear_slb_entry(0);
+ asm volatile(PPC_SLBIA(6));
+}
+
+/*
+ * Malicious or buggy radix guests may have inserted SLB entries
+ * (only 0..3 because radix always runs with UPRT=1), so these must
+ * be cleared here to avoid side-channels. slbmte is used rather
+ * than slbia, as it won't clear cached translations.
+ */
+static void radix_clear_slb(void)
+{
+ int i;
+
+ for (i = 0; i < 4; i++)
+ clear_slb_entry(i);
+}
+
+static void switch_mmu_to_guest_radix(struct kvm *kvm, struct kvm_vcpu *vcpu, u64 lpcr)
+{
+ struct kvm_nested_guest *nested = vcpu->arch.nested;
+ u32 lpid;
+
+ lpid = nested ? nested->shadow_lpid : kvm->arch.lpid;
+
+ /*
+ * All the isync()s are overkill but trivially follow the ISA
+ * requirements. Some can likely be replaced with justification
+ * comment for why they are not needed.
+ */
+ isync();
+ mtspr(SPRN_LPID, lpid);
+ isync();
+ mtspr(SPRN_LPCR, lpcr);
+ isync();
+ mtspr(SPRN_PID, vcpu->arch.pid);
+ isync();
+}
+
+static void switch_mmu_to_guest_hpt(struct kvm *kvm, struct kvm_vcpu *vcpu, u64 lpcr)
+{
+ u32 lpid;
+ int i;
+
+ lpid = kvm->arch.lpid;
+
+ mtspr(SPRN_LPID, lpid);
+ mtspr(SPRN_LPCR, lpcr);
+ mtspr(SPRN_PID, vcpu->arch.pid);
+
+ for (i = 0; i < vcpu->arch.slb_max; i++)
+ mtslb(vcpu->arch.slb[i].orige, vcpu->arch.slb[i].origv);
+
+ isync();
+}
+
+static void switch_mmu_to_host(struct kvm *kvm, u32 pid)
+{
+ isync();
+ mtspr(SPRN_PID, pid);
+ isync();
+ mtspr(SPRN_LPID, kvm->arch.host_lpid);
+ isync();
+ mtspr(SPRN_LPCR, kvm->arch.host_lpcr);
+ isync();
+
+ if (!radix_enabled())
+ slb_restore_bolted_realmode();
+}
+
+static void save_clear_host_mmu(struct kvm *kvm)
+{
+ if (!radix_enabled()) {
+ /*
+ * Hash host could save and restore host SLB entries to
+ * reduce SLB fault overheads of VM exits, but for now the
+ * existing code clears all entries and restores just the
+ * bolted ones when switching back to host.
+ */
+ slb_clear_invalidate_partition();
+ }
+}
+
+static void save_clear_guest_mmu(struct kvm *kvm, struct kvm_vcpu *vcpu)
+{
+ if (kvm_is_radix(kvm)) {
+ radix_clear_slb();
+ } else {
+ int i;
+ int nr = 0;
+
+ /*
+ * This must run before switching to host (radix host can't
+ * access all SLBs).
+ */
+ for (i = 0; i < vcpu->arch.slb_nr; i++) {
+ u64 slbee, slbev;
+ mfslb(i, &slbee, &slbev);
+ if (slbee & SLB_ESID_V) {
+ vcpu->arch.slb[nr].orige = slbee | i;
+ vcpu->arch.slb[nr].origv = slbev;
+ nr++;
+ }
+ }
+ vcpu->arch.slb_max = nr;
+ slb_clear_invalidate_partition();
+ }
+}
+
+int kvmhv_vcpu_entry_p9(struct kvm_vcpu *vcpu, u64 time_limit, unsigned long lpcr)
+{
+ struct kvm *kvm = vcpu->kvm;
+ struct kvm_nested_guest *nested = vcpu->arch.nested;
+ struct kvmppc_vcore *vc = vcpu->arch.vcore;
+ s64 hdec;
+ u64 tb, purr, spurr;
+ u64 *exsave;
+ bool ri_set;
+ int trap;
+ unsigned long msr;
+ unsigned long host_hfscr;
+ unsigned long host_ciabr;
+ unsigned long host_dawr0;
+ unsigned long host_dawrx0;
+ unsigned long host_psscr;
+ unsigned long host_pidr;
+ unsigned long host_dawr1;
+ unsigned long host_dawrx1;
+
+ hdec = time_limit - mftb();
+ if (hdec < 0)
+ return BOOK3S_INTERRUPT_HV_DECREMENTER;
+
+ WARN_ON_ONCE(vcpu->arch.shregs.msr & MSR_HV);
+ WARN_ON_ONCE(!(vcpu->arch.shregs.msr & MSR_ME));
+
+ start_timing(vcpu, &vcpu->arch.rm_entry);
+
+ vcpu->arch.ceded = 0;
+
+ if (vc->tb_offset) {
+ u64 new_tb = mftb() + vc->tb_offset;
+ mtspr(SPRN_TBU40, new_tb);
+ tb = mftb();
+ if ((tb & 0xffffff) < (new_tb & 0xffffff))
+ mtspr(SPRN_TBU40, new_tb + 0x1000000);
+ vc->tb_offset_applied = vc->tb_offset;
+ }
+
+ msr = mfmsr();
+
+ host_hfscr = mfspr(SPRN_HFSCR);
+ host_ciabr = mfspr(SPRN_CIABR);
+ host_dawr0 = mfspr(SPRN_DAWR0);
+ host_dawrx0 = mfspr(SPRN_DAWRX0);
+ host_psscr = mfspr(SPRN_PSSCR);
+ host_pidr = mfspr(SPRN_PID);
+ if (cpu_has_feature(CPU_FTR_DAWR1)) {
+ host_dawr1 = mfspr(SPRN_DAWR1);
+ host_dawrx1 = mfspr(SPRN_DAWRX1);
+ }
+
+ if (vc->pcr)
+ mtspr(SPRN_PCR, vc->pcr | PCR_MASK);
+ mtspr(SPRN_DPDES, vc->dpdes);
+ mtspr(SPRN_VTB, vc->vtb);
+
+ local_paca->kvm_hstate.host_purr = mfspr(SPRN_PURR);
+ local_paca->kvm_hstate.host_spurr = mfspr(SPRN_SPURR);
+ mtspr(SPRN_PURR, vcpu->arch.purr);
+ mtspr(SPRN_SPURR, vcpu->arch.spurr);
+
+ if (dawr_enabled()) {
+ mtspr(SPRN_DAWR0, vcpu->arch.dawr0);
+ mtspr(SPRN_DAWRX0, vcpu->arch.dawrx0);
+ if (cpu_has_feature(CPU_FTR_DAWR1)) {
+ mtspr(SPRN_DAWR1, vcpu->arch.dawr1);
+ mtspr(SPRN_DAWRX1, vcpu->arch.dawrx1);
+ }
+ }
+ mtspr(SPRN_CIABR, vcpu->arch.ciabr);
+ mtspr(SPRN_IC, vcpu->arch.ic);
+
+ mtspr(SPRN_PSSCR, vcpu->arch.psscr | PSSCR_EC |
+ (local_paca->kvm_hstate.fake_suspend << PSSCR_FAKE_SUSPEND_LG));
+
+ mtspr(SPRN_HFSCR, vcpu->arch.hfscr);
+
+ mtspr(SPRN_HSRR0, vcpu->arch.regs.nip);
+ mtspr(SPRN_HSRR1, (vcpu->arch.shregs.msr & ~MSR_HV) | MSR_ME);
+
+ /*
+ * On POWER9 DD2.1 and below, sometimes on a Hypervisor Data Storage
+ * Interrupt (HDSI) the HDSISR is not be updated at all.
+ *
+ * To work around this we put a canary value into the HDSISR before
+ * returning to a guest and then check for this canary when we take a
+ * HDSI. If we find the canary on a HDSI, we know the hardware didn't
+ * update the HDSISR. In this case we return to the guest to retake the
+ * HDSI which should correctly update the HDSISR the second time HDSI
+ * entry.
+ *
+ * Just do this on all p9 processors for now.
+ */
+ mtspr(SPRN_HDSISR, HDSISR_CANARY);
+
+ mtspr(SPRN_SPRG0, vcpu->arch.shregs.sprg0);
+ mtspr(SPRN_SPRG1, vcpu->arch.shregs.sprg1);
+ mtspr(SPRN_SPRG2, vcpu->arch.shregs.sprg2);
+ mtspr(SPRN_SPRG3, vcpu->arch.shregs.sprg3);
+
+ mtspr(SPRN_AMOR, ~0UL);
+
+ local_paca->kvm_hstate.in_guest = KVM_GUEST_MODE_HV_P9;
+
+ /*
+ * Hash host, hash guest, or radix guest with prefetch bug, all have
+ * to disable the MMU before switching to guest MMU state.
+ */
+ if (!radix_enabled() || !kvm_is_radix(kvm) ||
+ cpu_has_feature(CPU_FTR_P9_RADIX_PREFETCH_BUG))
+ __mtmsrd(msr & ~(MSR_IR|MSR_DR|MSR_RI), 0);
+
+ save_clear_host_mmu(kvm);
+
+ if (kvm_is_radix(kvm)) {
+ switch_mmu_to_guest_radix(kvm, vcpu, lpcr);
+ if (!cpu_has_feature(CPU_FTR_P9_RADIX_PREFETCH_BUG))
+ __mtmsrd(0, 1); /* clear RI */
+
+ } else {
+ switch_mmu_to_guest_hpt(kvm, vcpu, lpcr);
+ }
+
+ /* TLBIEL uses LPID=LPIDR, so run this after setting guest LPID */
+ kvmppc_check_need_tlb_flush(kvm, vc->pcpu, nested);
+
+ /*
+ * P9 suppresses the HDEC exception when LPCR[HDICE] = 0,
+ * so set guest LPCR (with HDICE) before writing HDEC.
+ */
+ mtspr(SPRN_HDEC, hdec);
+
+ mtspr(SPRN_DAR, vcpu->arch.shregs.dar);
+ mtspr(SPRN_DSISR, vcpu->arch.shregs.dsisr);
+ mtspr(SPRN_SRR0, vcpu->arch.shregs.srr0);
+ mtspr(SPRN_SRR1, vcpu->arch.shregs.srr1);
+
+ accumulate_time(vcpu, &vcpu->arch.guest_time);
+
+ kvmppc_p9_enter_guest(vcpu);
+
+ accumulate_time(vcpu, &vcpu->arch.rm_intr);
+
+ /* XXX: Could get these from r11/12 and paca exsave instead */
+ vcpu->arch.shregs.srr0 = mfspr(SPRN_SRR0);
+ vcpu->arch.shregs.srr1 = mfspr(SPRN_SRR1);
+ vcpu->arch.shregs.dar = mfspr(SPRN_DAR);
+ vcpu->arch.shregs.dsisr = mfspr(SPRN_DSISR);
+
+ /* 0x2 bit for HSRR is only used by PR and P7/8 HV paths, clear it */
+ trap = local_paca->kvm_hstate.scratch0 & ~0x2;
+
+ /* HSRR interrupts leave MSR[RI] unchanged, SRR interrupts clear it. */
+ ri_set = false;
+ if (likely(trap > BOOK3S_INTERRUPT_MACHINE_CHECK)) {
+ if (trap != BOOK3S_INTERRUPT_SYSCALL &&
+ (vcpu->arch.shregs.msr & MSR_RI))
+ ri_set = true;
+ exsave = local_paca->exgen;
+ } else if (trap == BOOK3S_INTERRUPT_SYSTEM_RESET) {
+ exsave = local_paca->exnmi;
+ } else { /* trap == 0x200 */
+ exsave = local_paca->exmc;
+ }
+
+ vcpu->arch.regs.gpr[1] = local_paca->kvm_hstate.scratch1;
+ vcpu->arch.regs.gpr[3] = local_paca->kvm_hstate.scratch2;
+
+ /*
+ * Only set RI after reading machine check regs (DAR, DSISR, SRR0/1)
+ * and hstate scratch (which we need to move into exsave to make
+ * re-entrant vs SRESET/MCE)
+ */
+ if (ri_set) {
+ if (unlikely(!(mfmsr() & MSR_RI))) {
+ __mtmsrd(MSR_RI, 1);
+ WARN_ON_ONCE(1);
+ }
+ } else {
+ WARN_ON_ONCE(mfmsr() & MSR_RI);
+ __mtmsrd(MSR_RI, 1);
+ }
+
+ vcpu->arch.regs.gpr[9] = exsave[EX_R9/sizeof(u64)];
+ vcpu->arch.regs.gpr[10] = exsave[EX_R10/sizeof(u64)];
+ vcpu->arch.regs.gpr[11] = exsave[EX_R11/sizeof(u64)];
+ vcpu->arch.regs.gpr[12] = exsave[EX_R12/sizeof(u64)];
+ vcpu->arch.regs.gpr[13] = exsave[EX_R13/sizeof(u64)];
+ vcpu->arch.ppr = exsave[EX_PPR/sizeof(u64)];
+ vcpu->arch.cfar = exsave[EX_CFAR/sizeof(u64)];
+ vcpu->arch.regs.ctr = exsave[EX_CTR/sizeof(u64)];
+
+ vcpu->arch.last_inst = KVM_INST_FETCH_FAILED;
+
+ if (unlikely(trap == BOOK3S_INTERRUPT_MACHINE_CHECK)) {
+ vcpu->arch.fault_dar = exsave[EX_DAR/sizeof(u64)];
+ vcpu->arch.fault_dsisr = exsave[EX_DSISR/sizeof(u64)];
+ kvmppc_realmode_machine_check(vcpu);
+
+ } else if (unlikely(trap == BOOK3S_INTERRUPT_HMI)) {
+ kvmppc_realmode_hmi_handler();
+
+ } else if (trap == BOOK3S_INTERRUPT_H_EMUL_ASSIST) {
+ vcpu->arch.emul_inst = mfspr(SPRN_HEIR);
+
+ } else if (trap == BOOK3S_INTERRUPT_H_DATA_STORAGE) {
+ vcpu->arch.fault_dar = exsave[EX_DAR/sizeof(u64)];
+ vcpu->arch.fault_dsisr = exsave[EX_DSISR/sizeof(u64)];
+ vcpu->arch.fault_gpa = mfspr(SPRN_ASDR);
+
+ } else if (trap == BOOK3S_INTERRUPT_H_INST_STORAGE) {
+ vcpu->arch.fault_gpa = mfspr(SPRN_ASDR);
+
+ } else if (trap == BOOK3S_INTERRUPT_H_FAC_UNAVAIL) {
+ vcpu->arch.hfscr = mfspr(SPRN_HFSCR);
+
+#ifdef CONFIG_PPC_TRANSACTIONAL_MEM
+ /*
+ * Softpatch interrupt for transactional memory emulation cases
+ * on POWER9 DD2.2. This is early in the guest exit path - we
+ * haven't saved registers or done a treclaim yet.
+ */
+ } else if (trap == BOOK3S_INTERRUPT_HV_SOFTPATCH) {
+ vcpu->arch.emul_inst = mfspr(SPRN_HEIR);
+
+ /*
+ * The cases we want to handle here are those where the guest
+ * is in real suspend mode and is trying to transition to
+ * transactional mode.
+ */
+ if (local_paca->kvm_hstate.fake_suspend &&
+ (vcpu->arch.shregs.msr & MSR_TS_S)) {
+ if (kvmhv_p9_tm_emulation_early(vcpu)) {
+ /* Prevent it being handled again. */
+ trap = 0;
+ }
+ }
+#endif
+ }
+
+ accumulate_time(vcpu, &vcpu->arch.rm_exit);
+
+ /* Advance host PURR/SPURR by the amount used by guest */
+ purr = mfspr(SPRN_PURR);
+ spurr = mfspr(SPRN_SPURR);
+ mtspr(SPRN_PURR, local_paca->kvm_hstate.host_purr +
+ purr - vcpu->arch.purr);
+ mtspr(SPRN_SPURR, local_paca->kvm_hstate.host_spurr +
+ spurr - vcpu->arch.spurr);
+ vcpu->arch.purr = purr;
+ vcpu->arch.spurr = spurr;
+
+ vcpu->arch.ic = mfspr(SPRN_IC);
+ vcpu->arch.pid = mfspr(SPRN_PID);
+ vcpu->arch.psscr = mfspr(SPRN_PSSCR) & PSSCR_GUEST_VIS;
+
+ vcpu->arch.shregs.sprg0 = mfspr(SPRN_SPRG0);
+ vcpu->arch.shregs.sprg1 = mfspr(SPRN_SPRG1);
+ vcpu->arch.shregs.sprg2 = mfspr(SPRN_SPRG2);
+ vcpu->arch.shregs.sprg3 = mfspr(SPRN_SPRG3);
+
+ /* Preserve PSSCR[FAKE_SUSPEND] until we've called kvmppc_save_tm_hv */
+ mtspr(SPRN_PSSCR, host_psscr |
+ (local_paca->kvm_hstate.fake_suspend << PSSCR_FAKE_SUSPEND_LG));
+ mtspr(SPRN_HFSCR, host_hfscr);
+ mtspr(SPRN_CIABR, host_ciabr);
+ mtspr(SPRN_DAWR0, host_dawr0);
+ mtspr(SPRN_DAWRX0, host_dawrx0);
+ if (cpu_has_feature(CPU_FTR_DAWR1)) {
+ mtspr(SPRN_DAWR1, host_dawr1);
+ mtspr(SPRN_DAWRX1, host_dawrx1);
+ }
+
+ if (kvm_is_radix(kvm)) {
+ /*
+ * Since this is radix, do a eieio; tlbsync; ptesync sequence
+ * in case we interrupted the guest between a tlbie and a
+ * ptesync.
+ */
+ asm volatile("eieio; tlbsync; ptesync");
+ }
+
+ /*
+ * cp_abort is required if the processor supports local copy-paste
+ * to clear the copy buffer that was under control of the guest.
+ */
+ if (cpu_has_feature(CPU_FTR_ARCH_31))
+ asm volatile(PPC_CP_ABORT);
+
+ vc->dpdes = mfspr(SPRN_DPDES);
+ vc->vtb = mfspr(SPRN_VTB);
+ mtspr(SPRN_DPDES, 0);
+ if (vc->pcr)
+ mtspr(SPRN_PCR, PCR_MASK);
+
+ if (vc->tb_offset_applied) {
+ u64 new_tb = mftb() - vc->tb_offset_applied;
+ mtspr(SPRN_TBU40, new_tb);
+ tb = mftb();
+ if ((tb & 0xffffff) < (new_tb & 0xffffff))
+ mtspr(SPRN_TBU40, new_tb + 0x1000000);
+ vc->tb_offset_applied = 0;
+ }
+
+ mtspr(SPRN_HDEC, 0x7fffffff);
+
+ save_clear_guest_mmu(kvm, vcpu);
+ switch_mmu_to_host(kvm, host_pidr);
+ local_paca->kvm_hstate.in_guest = KVM_GUEST_MODE_NONE;
+
+ /*
+ * If we are in real mode, only switch MMU on after the MMU is
+ * switched to host, to avoid the P9_RADIX_PREFETCH_BUG.
+ */
+ __mtmsrd(msr, 0);
+
+ end_timing(vcpu);
+
+ return trap;
+}
+EXPORT_SYMBOL_GPL(kvmhv_vcpu_entry_p9);
diff --git a/arch/powerpc/kvm/book3s_hv_rm_mmu.c b/arch/powerpc/kvm/book3s_hv_rm_mmu.c
index 7a0f12404e0e..632b2545072b 100644
--- a/arch/powerpc/kvm/book3s_hv_rm_mmu.c
+++ b/arch/powerpc/kvm/book3s_hv_rm_mmu.c
@@ -46,6 +46,10 @@ static int global_invalidates(struct kvm *kvm)
else
global = 1;
+ /* LPID has been switched to host if in virt mode so can't do local */
+ if (!global && (mfmsr() & (MSR_IR|MSR_DR)))
+ global = 1;
+
if (!global) {
/* any other core might now have stale TLB entries... */
smp_wmb();
@@ -56,7 +60,7 @@ static int global_invalidates(struct kvm *kvm)
* so use the bit for the first thread to represent the core.
*/
if (cpu_has_feature(CPU_FTR_ARCH_300))
- cpu = cpu_first_thread_sibling(cpu);
+ cpu = cpu_first_tlb_thread_sibling(cpu);
cpumask_clear_cpu(cpu, &kvm->arch.need_tlb_flush);
}
@@ -398,6 +402,7 @@ long kvmppc_h_enter(struct kvm_vcpu *vcpu, unsigned long flags,
vcpu->arch.pgdir, true,
&vcpu->arch.regs.gpr[4]);
}
+EXPORT_SYMBOL_GPL(kvmppc_h_enter);
#ifdef __BIG_ENDIAN__
#define LOCK_TOKEN (*(u32 *)(&get_paca()->lock_token))
@@ -542,6 +547,7 @@ long kvmppc_h_remove(struct kvm_vcpu *vcpu, unsigned long flags,
return kvmppc_do_h_remove(vcpu->kvm, flags, pte_index, avpn,
&vcpu->arch.regs.gpr[4]);
}
+EXPORT_SYMBOL_GPL(kvmppc_h_remove);
long kvmppc_h_bulk_remove(struct kvm_vcpu *vcpu)
{
@@ -660,6 +666,7 @@ long kvmppc_h_bulk_remove(struct kvm_vcpu *vcpu)
return ret;
}
+EXPORT_SYMBOL_GPL(kvmppc_h_bulk_remove);
long kvmppc_h_protect(struct kvm_vcpu *vcpu, unsigned long flags,
unsigned long pte_index, unsigned long avpn)
@@ -730,6 +737,7 @@ long kvmppc_h_protect(struct kvm_vcpu *vcpu, unsigned long flags,
return H_SUCCESS;
}
+EXPORT_SYMBOL_GPL(kvmppc_h_protect);
long kvmppc_h_read(struct kvm_vcpu *vcpu, unsigned long flags,
unsigned long pte_index)
@@ -770,6 +778,7 @@ long kvmppc_h_read(struct kvm_vcpu *vcpu, unsigned long flags,
}
return H_SUCCESS;
}
+EXPORT_SYMBOL_GPL(kvmppc_h_read);
long kvmppc_h_clear_ref(struct kvm_vcpu *vcpu, unsigned long flags,
unsigned long pte_index)
@@ -818,6 +827,7 @@ long kvmppc_h_clear_ref(struct kvm_vcpu *vcpu, unsigned long flags,
unlock_hpte(hpte, v & ~HPTE_V_HVLOCK);
return ret;
}
+EXPORT_SYMBOL_GPL(kvmppc_h_clear_ref);
long kvmppc_h_clear_mod(struct kvm_vcpu *vcpu, unsigned long flags,
unsigned long pte_index)
@@ -865,6 +875,7 @@ long kvmppc_h_clear_mod(struct kvm_vcpu *vcpu, unsigned long flags,
unlock_hpte(hpte, v & ~HPTE_V_HVLOCK);
return ret;
}
+EXPORT_SYMBOL_GPL(kvmppc_h_clear_mod);
static int kvmppc_get_hpa(struct kvm_vcpu *vcpu, unsigned long mmu_seq,
unsigned long gpa, int writing, unsigned long *hpa,
@@ -1283,3 +1294,4 @@ long kvmppc_hpte_hv_fault(struct kvm_vcpu *vcpu, unsigned long addr,
return -1; /* send fault up to host kernel mode */
}
+EXPORT_SYMBOL_GPL(kvmppc_hpte_hv_fault);
diff --git a/arch/powerpc/kvm/book3s_hv_rm_xics.c b/arch/powerpc/kvm/book3s_hv_rm_xics.c
index c2c9c733f359..0a11ec88a0ae 100644
--- a/arch/powerpc/kvm/book3s_hv_rm_xics.c
+++ b/arch/powerpc/kvm/book3s_hv_rm_xics.c
@@ -141,13 +141,6 @@ static void icp_rm_set_vcpu_irq(struct kvm_vcpu *vcpu,
return;
}
- if (xive_enabled() && kvmhv_on_pseries()) {
- /* No XICS access or hypercalls available, too hard */
- this_icp->rm_action |= XICS_RM_KICK_VCPU;
- this_icp->rm_kick_target = vcpu;
- return;
- }
-
/*
* Check if the core is loaded,
* if not, find an available host core to post to wake the VCPU,
@@ -771,14 +764,6 @@ static void icp_eoi(struct irq_chip *c, u32 hwirq, __be32 xirr, bool *again)
void __iomem *xics_phys;
int64_t rc;
- if (kvmhv_on_pseries()) {
- unsigned long retbuf[PLPAR_HCALL_BUFSIZE];
-
- iosync();
- plpar_hcall_raw(H_EOI, retbuf, hwirq);
- return;
- }
-
rc = pnv_opal_pci_msi_eoi(c, hwirq);
if (rc)
diff --git a/arch/powerpc/kvm/book3s_hv_rmhandlers.S b/arch/powerpc/kvm/book3s_hv_rmhandlers.S
index 004f0d4e665f..8dd437d7a2c6 100644
--- a/arch/powerpc/kvm/book3s_hv_rmhandlers.S
+++ b/arch/powerpc/kvm/book3s_hv_rmhandlers.S
@@ -25,18 +25,10 @@
#include <asm/export.h>
#include <asm/tm.h>
#include <asm/opal.h>
-#include <asm/xive-regs.h>
#include <asm/thread_info.h>
#include <asm/asm-compat.h>
#include <asm/feature-fixups.h>
#include <asm/cpuidle.h>
-#include <asm/ultravisor-api.h>
-
-/* Sign-extend HDEC if not on POWER9 */
-#define EXTEND_HDEC(reg) \
-BEGIN_FTR_SECTION; \
- extsw reg, reg; \
-END_FTR_SECTION_IFCLR(CPU_FTR_ARCH_300)
/* Values in HSTATE_NAPPING(r13) */
#define NAPPING_CEDE 1
@@ -44,9 +36,8 @@ END_FTR_SECTION_IFCLR(CPU_FTR_ARCH_300)
#define NAPPING_UNSPLIT 3
/* Stack frame offsets for kvmppc_hv_entry */
-#define SFS 208
+#define SFS 160
#define STACK_SLOT_TRAP (SFS-4)
-#define STACK_SLOT_SHORT_PATH (SFS-8)
#define STACK_SLOT_TID (SFS-16)
#define STACK_SLOT_PSSCR (SFS-24)
#define STACK_SLOT_PID (SFS-32)
@@ -57,11 +48,7 @@ END_FTR_SECTION_IFCLR(CPU_FTR_ARCH_300)
#define STACK_SLOT_HFSCR (SFS-72)
#define STACK_SLOT_AMR (SFS-80)
#define STACK_SLOT_UAMOR (SFS-88)
-#define STACK_SLOT_DAWR1 (SFS-96)
-#define STACK_SLOT_DAWRX1 (SFS-104)
-#define STACK_SLOT_FSCR (SFS-112)
-/* the following is used by the P9 short path */
-#define STACK_SLOT_NVGPRS (SFS-152) /* 18 gprs */
+#define STACK_SLOT_FSCR (SFS-96)
/*
* Call kvmppc_hv_entry in real mode.
@@ -137,15 +124,6 @@ END_FTR_SECTION_IFCLR(CPU_FTR_ARCH_207S)
/* Return the trap number on this thread as the return value */
mr r3, r12
- /*
- * If we came back from the guest via a relocation-on interrupt,
- * we will be in virtual mode at this point, which makes it a
- * little easier to get back to the caller.
- */
- mfmsr r0
- andi. r0, r0, MSR_IR /* in real mode? */
- bne .Lvirt_return
-
/* RFI into the highmem handler */
mfmsr r6
li r0, MSR_RI
@@ -155,11 +133,6 @@ END_FTR_SECTION_IFCLR(CPU_FTR_ARCH_207S)
mtsrr1 r7
RFI_TO_KERNEL
- /* Virtual-mode return */
-.Lvirt_return:
- mtlr r8
- blr
-
kvmppc_primary_no_guest:
/* We handle this much like a ceded vcpu */
/* put the HDEC into the DEC, since HDEC interrupts don't wake us */
@@ -246,7 +219,7 @@ kvm_novcpu_wakeup:
/* See if our timeslice has expired (HDEC is negative) */
mfspr r0, SPRN_HDEC
- EXTEND_HDEC(r0)
+ extsw r0, r0
li r12, BOOK3S_INTERRUPT_HV_DECREMENTER
cmpdi r0, 0
blt kvm_novcpu_exit
@@ -348,10 +321,8 @@ kvm_secondary_got_guest:
lbz r4, HSTATE_PTID(r13)
cmpwi r4, 0
bne 63f
- LOAD_REG_ADDR(r6, decrementer_max)
- ld r6, 0(r6)
+ lis r6,0x7fff /* MAX_INT@h */
mtspr SPRN_HDEC, r6
-BEGIN_FTR_SECTION
/* and set per-LPAR registers, if doing dynamic micro-threading */
ld r6, HSTATE_SPLIT_MODE(r13)
cmpdi r6, 0
@@ -363,7 +334,6 @@ BEGIN_FTR_SECTION
ld r0, KVM_SPLIT_LDBAR(r6)
mtspr SPRN_LDBAR, r0
isync
-END_FTR_SECTION_IFCLR(CPU_FTR_ARCH_300)
63:
/* Order load of vcpu after load of vcore */
lwsync
@@ -434,7 +404,6 @@ kvm_no_guest:
blr
53:
-BEGIN_FTR_SECTION
HMT_LOW
ld r5, HSTATE_KVM_VCORE(r13)
cmpdi r5, 0
@@ -449,14 +418,6 @@ BEGIN_FTR_SECTION
b kvm_unsplit_nap
60: HMT_MEDIUM
b kvm_secondary_got_guest
-FTR_SECTION_ELSE
- HMT_LOW
- ld r5, HSTATE_KVM_VCORE(r13)
- cmpdi r5, 0
- beq kvm_no_guest
- HMT_MEDIUM
- b kvm_secondary_got_guest
-ALT_FTR_SECTION_END_IFCLR(CPU_FTR_ARCH_300)
54: li r0, KVM_HWTHREAD_IN_KVM
stb r0, HSTATE_HWTHREAD_STATE(r13)
@@ -582,13 +543,11 @@ kvmppc_hv_entry:
bne 10f
lwz r7,KVM_LPID(r9)
-BEGIN_FTR_SECTION
ld r6,KVM_SDR1(r9)
li r0,LPID_RSVD /* switch to reserved LPID */
mtspr SPRN_LPID,r0
ptesync
mtspr SPRN_SDR1,r6 /* switch to partition page table */
-END_FTR_SECTION_IFCLR(CPU_FTR_ARCH_300)
mtspr SPRN_LPID,r7
isync
@@ -669,16 +628,6 @@ kvmppc_got_guest:
/* Save host values of some registers */
BEGIN_FTR_SECTION
- mfspr r5, SPRN_TIDR
- mfspr r6, SPRN_PSSCR
- mfspr r7, SPRN_PID
- std r5, STACK_SLOT_TID(r1)
- std r6, STACK_SLOT_PSSCR(r1)
- std r7, STACK_SLOT_PID(r1)
- mfspr r5, SPRN_HFSCR
- std r5, STACK_SLOT_HFSCR(r1)
-END_FTR_SECTION_IFSET(CPU_FTR_ARCH_300)
-BEGIN_FTR_SECTION
mfspr r5, SPRN_CIABR
mfspr r6, SPRN_DAWR0
mfspr r7, SPRN_DAWRX0
@@ -690,12 +639,6 @@ BEGIN_FTR_SECTION
mfspr r5, SPRN_FSCR
std r5, STACK_SLOT_FSCR(r1)
END_FTR_SECTION_IFSET(CPU_FTR_ARCH_207S)
-BEGIN_FTR_SECTION
- mfspr r6, SPRN_DAWR1
- mfspr r7, SPRN_DAWRX1
- std r6, STACK_SLOT_DAWR1(r1)
- std r7, STACK_SLOT_DAWRX1(r1)
-END_FTR_SECTION_IFSET(CPU_FTR_ARCH_207S | CPU_FTR_DAWR1)
mfspr r5, SPRN_AMR
std r5, STACK_SLOT_AMR(r1)
@@ -713,13 +656,9 @@ BEGIN_FTR_SECTION
END_FTR_SECTION_IFCLR(CPU_FTR_ARCH_207S)
#ifdef CONFIG_PPC_TRANSACTIONAL_MEM
-/*
- * Branch around the call if both CPU_FTR_TM and
- * CPU_FTR_P9_TM_HV_ASSIST are off.
- */
BEGIN_FTR_SECTION
b 91f
-END_FTR_SECTION(CPU_FTR_TM | CPU_FTR_P9_TM_HV_ASSIST, 0)
+END_FTR_SECTION_IFCLR(CPU_FTR_TM)
/*
* NOTE THAT THIS TRASHES ALL NON-VOLATILE REGISTERS (but not CR)
*/
@@ -786,12 +725,6 @@ END_FTR_SECTION_IFCLR(CPU_FTR_ARCH_207S)
ld r6, VCPU_DAWRX0(r4)
mtspr SPRN_DAWR0, r5
mtspr SPRN_DAWRX0, r6
-BEGIN_FTR_SECTION
- ld r5, VCPU_DAWR1(r4)
- ld r6, VCPU_DAWRX1(r4)
- mtspr SPRN_DAWR1, r5
- mtspr SPRN_DAWRX1, r6
-END_FTR_SECTION_IFSET(CPU_FTR_DAWR1)
1:
ld r7, VCPU_CIABR(r4)
ld r8, VCPU_TAR(r4)
@@ -809,7 +742,6 @@ END_FTR_SECTION_IFSET(CPU_FTR_DAWR1)
mtspr SPRN_BESCR, r6
mtspr SPRN_PID, r7
mtspr SPRN_WORT, r8
-BEGIN_FTR_SECTION
/* POWER8-only registers */
ld r5, VCPU_TCSCR(r4)
ld r6, VCPU_ACOP(r4)
@@ -820,18 +752,6 @@ BEGIN_FTR_SECTION
mtspr SPRN_CSIGR, r7
mtspr SPRN_TACR, r8
nop
-FTR_SECTION_ELSE
- /* POWER9-only registers */
- ld r5, VCPU_TID(r4)
- ld r6, VCPU_PSSCR(r4)
- lbz r8, HSTATE_FAKE_SUSPEND(r13)
- oris r6, r6, PSSCR_EC@h /* This makes stop trap to HV */
- rldimi r6, r8, PSSCR_FAKE_SUSPEND_LG, 63 - PSSCR_FAKE_SUSPEND_LG
- ld r7, VCPU_HFSCR(r4)
- mtspr SPRN_TIDR, r5
- mtspr SPRN_PSSCR, r6
- mtspr SPRN_HFSCR, r7
-ALT_FTR_SECTION_END_IFCLR(CPU_FTR_ARCH_300)
8:
ld r5, VCPU_SPRG0(r4)
@@ -901,23 +821,15 @@ ALT_FTR_SECTION_END_IFCLR(CPU_FTR_ARCH_300)
/* Check if HDEC expires soon */
mfspr r3, SPRN_HDEC
- EXTEND_HDEC(r3)
+ extsw r3, r3
cmpdi r3, 512 /* 1 microsecond */
blt hdec_soon
- ld r6, VCPU_KVM(r4)
- lbz r0, KVM_RADIX(r6)
- cmpwi r0, 0
- bne 9f
-
- /* For hash guest, clear out and reload the SLB */
-BEGIN_MMU_FTR_SECTION
- /* Radix host won't have populated the SLB, so no need to clear */
+ /* Clear out and reload the SLB */
li r6, 0
slbmte r6, r6
PPC_SLBIA(6)
ptesync
-END_MMU_FTR_SECTION_IFCLR(MMU_FTR_TYPE_RADIX)
/* Load up guest SLB entries (N.B. slb_max will be 0 for radix) */
lwz r5,VCPU_SLB_MAX(r4)
@@ -932,96 +844,9 @@ END_MMU_FTR_SECTION_IFCLR(MMU_FTR_TYPE_RADIX)
bdnz 1b
9:
-#ifdef CONFIG_KVM_XICS
- /* We are entering the guest on that thread, push VCPU to XIVE */
- ld r11, VCPU_XIVE_SAVED_STATE(r4)
- li r9, TM_QW1_OS
- lwz r8, VCPU_XIVE_CAM_WORD(r4)
- cmpwi r8, 0
- beq no_xive
- li r7, TM_QW1_OS + TM_WORD2
- mfmsr r0
- andi. r0, r0, MSR_DR /* in real mode? */
- beq 2f
- ld r10, HSTATE_XIVE_TIMA_VIRT(r13)
- cmpldi cr1, r10, 0
- beq cr1, no_xive
- eieio
- stdx r11,r9,r10
- stwx r8,r7,r10
- b 3f
-2: ld r10, HSTATE_XIVE_TIMA_PHYS(r13)
- cmpldi cr1, r10, 0
- beq cr1, no_xive
- eieio
- stdcix r11,r9,r10
- stwcix r8,r7,r10
-3: li r9, 1
- stb r9, VCPU_XIVE_PUSHED(r4)
- eieio
-
- /*
- * We clear the irq_pending flag. There is a small chance of a
- * race vs. the escalation interrupt happening on another
- * processor setting it again, but the only consequence is to
- * cause a spurrious wakeup on the next H_CEDE which is not an
- * issue.
- */
- li r0,0
- stb r0, VCPU_IRQ_PENDING(r4)
-
- /*
- * In single escalation mode, if the escalation interrupt is
- * on, we mask it.
- */
- lbz r0, VCPU_XIVE_ESC_ON(r4)
- cmpwi cr1, r0,0
- beq cr1, 1f
- li r9, XIVE_ESB_SET_PQ_01
- beq 4f /* in real mode? */
- ld r10, VCPU_XIVE_ESC_VADDR(r4)
- ldx r0, r10, r9
- b 5f
-4: ld r10, VCPU_XIVE_ESC_RADDR(r4)
- ldcix r0, r10, r9
-5: sync
-
- /* We have a possible subtle race here: The escalation interrupt might
- * have fired and be on its way to the host queue while we mask it,
- * and if we unmask it early enough (re-cede right away), there is
- * a theorical possibility that it fires again, thus landing in the
- * target queue more than once which is a big no-no.
- *
- * Fortunately, solving this is rather easy. If the above load setting
- * PQ to 01 returns a previous value where P is set, then we know the
- * escalation interrupt is somewhere on its way to the host. In that
- * case we simply don't clear the xive_esc_on flag below. It will be
- * eventually cleared by the handler for the escalation interrupt.
- *
- * Then, when doing a cede, we check that flag again before re-enabling
- * the escalation interrupt, and if set, we abort the cede.
- */
- andi. r0, r0, XIVE_ESB_VAL_P
- bne- 1f
-
- /* Now P is 0, we can clear the flag */
- li r0, 0
- stb r0, VCPU_XIVE_ESC_ON(r4)
-1:
-no_xive:
-#endif /* CONFIG_KVM_XICS */
-
- li r0, 0
- stw r0, STACK_SLOT_SHORT_PATH(r1)
-
deliver_guest_interrupt: /* r4 = vcpu, r13 = paca */
/* Check if we can deliver an external or decrementer interrupt now */
ld r0, VCPU_PENDING_EXC(r4)
-BEGIN_FTR_SECTION
- /* On POWER9, also check for emulated doorbell interrupt */
- lbz r3, VCPU_DBELL_REQ(r4)
- or r0, r0, r3
-END_FTR_SECTION_IFSET(CPU_FTR_ARCH_300)
cmpdi r0, 0
beq 71f
mr r3, r4
@@ -1033,7 +858,6 @@ END_FTR_SECTION_IFSET(CPU_FTR_ARCH_300)
mtspr SPRN_SRR0, r6
mtspr SPRN_SRR1, r7
-fast_guest_entry_c:
ld r10, VCPU_PC(r4)
ld r11, VCPU_MSR(r4)
/* r11 = vcpu->arch.msr & ~MSR_HV */
@@ -1095,18 +919,8 @@ BEGIN_FTR_SECTION
mtspr SPRN_PPR, r0
END_FTR_SECTION_IFSET(CPU_FTR_HAS_PPR)
-/* Move canary into DSISR to check for later */
-BEGIN_FTR_SECTION
- li r0, 0x7fff
- mtspr SPRN_HDSISR, r0
-END_FTR_SECTION_IFSET(CPU_FTR_ARCH_300)
-
- ld r6, VCPU_KVM(r4)
- lbz r7, KVM_SECURE_GUEST(r6)
- cmpdi r7, 0
ld r6, VCPU_GPR(R6)(r4)
ld r7, VCPU_GPR(R7)(r4)
- bne ret_to_ultra
ld r0, VCPU_CR(r4)
mtcr r0
@@ -1117,117 +931,6 @@ END_FTR_SECTION_IFSET(CPU_FTR_ARCH_300)
ld r4, VCPU_GPR(R4)(r4)
HRFI_TO_GUEST
b .
-/*
- * Use UV_RETURN ultracall to return control back to the Ultravisor after
- * processing an hypercall or interrupt that was forwarded (a.k.a. reflected)
- * to the Hypervisor.
- *
- * All registers have already been loaded, except:
- * R0 = hcall result
- * R2 = SRR1, so UV can detect a synthesized interrupt (if any)
- * R3 = UV_RETURN
- */
-ret_to_ultra:
- ld r0, VCPU_CR(r4)
- mtcr r0
-
- ld r0, VCPU_GPR(R3)(r4)
- mfspr r2, SPRN_SRR1
- li r3, 0
- ori r3, r3, UV_RETURN
- ld r4, VCPU_GPR(R4)(r4)
- sc 2
-
-/*
- * Enter the guest on a P9 or later system where we have exactly
- * one vcpu per vcore and we don't need to go to real mode
- * (which implies that host and guest are both using radix MMU mode).
- * r3 = vcpu pointer
- * Most SPRs and all the VSRs have been loaded already.
- */
-_GLOBAL(__kvmhv_vcpu_entry_p9)
-EXPORT_SYMBOL_GPL(__kvmhv_vcpu_entry_p9)
- mflr r0
- std r0, PPC_LR_STKOFF(r1)
- stdu r1, -SFS(r1)
-
- li r0, 1
- stw r0, STACK_SLOT_SHORT_PATH(r1)
-
- std r3, HSTATE_KVM_VCPU(r13)
- mfcr r4
- stw r4, SFS+8(r1)
-
- std r1, HSTATE_HOST_R1(r13)
-
- reg = 14
- .rept 18
- std reg, STACK_SLOT_NVGPRS + ((reg - 14) * 8)(r1)
- reg = reg + 1
- .endr
-
- reg = 14
- .rept 18
- ld reg, __VCPU_GPR(reg)(r3)
- reg = reg + 1
- .endr
-
- mfmsr r10
- std r10, HSTATE_HOST_MSR(r13)
-
- mr r4, r3
- b fast_guest_entry_c
-guest_exit_short_path:
- /*
- * Malicious or buggy radix guests may have inserted SLB entries
- * (only 0..3 because radix always runs with UPRT=1), so these must
- * be cleared here to avoid side-channels. slbmte is used rather
- * than slbia, as it won't clear cached translations.
- */
- li r0,0
- slbmte r0,r0
- li r4,1
- slbmte r0,r4
- li r4,2
- slbmte r0,r4
- li r4,3
- slbmte r0,r4
-
- li r0, KVM_GUEST_MODE_NONE
- stb r0, HSTATE_IN_GUEST(r13)
-
- reg = 14
- .rept 18
- std reg, __VCPU_GPR(reg)(r9)
- reg = reg + 1
- .endr
-
- reg = 14
- .rept 18
- ld reg, STACK_SLOT_NVGPRS + ((reg - 14) * 8)(r1)
- reg = reg + 1
- .endr
-
- lwz r4, SFS+8(r1)
- mtcr r4
-
- mr r3, r12 /* trap number */
-
- addi r1, r1, SFS
- ld r0, PPC_LR_STKOFF(r1)
- mtlr r0
-
- /* If we are in real mode, do a rfid to get back to the caller */
- mfmsr r4
- andi. r5, r4, MSR_IR
- bnelr
- rldicl r5, r4, 64 - MSR_TS_S_LG, 62 /* extract TS field */
- mtspr SPRN_SRR0, r0
- ld r10, HSTATE_HOST_MSR(r13)
- rldimi r10, r5, MSR_TS_S_LG, 63 - MSR_TS_T_LG
- mtspr SPRN_SRR1, r10
- RFI_TO_KERNEL
- b .
secondary_too_late:
li r12, 0
@@ -1268,21 +971,16 @@ hdec_soon:
kvmppc_interrupt_hv:
/*
* Register contents:
+ * R9 = HSTATE_IN_GUEST
* R12 = (guest CR << 32) | interrupt vector
* R13 = PACA
* guest R12 saved in shadow VCPU SCRATCH0
* guest R13 saved in SPRN_SCRATCH0
+ * guest R9 saved in HSTATE_SCRATCH2
*/
- std r9, HSTATE_SCRATCH2(r13)
- lbz r9, HSTATE_IN_GUEST(r13)
- cmpwi r9, KVM_GUEST_MODE_HOST_HV
- beq kvmppc_bad_host_intr
-#ifdef CONFIG_KVM_BOOK3S_PR_POSSIBLE
- cmpwi r9, KVM_GUEST_MODE_GUEST
- ld r9, HSTATE_SCRATCH2(r13)
- beq kvmppc_interrupt_pr
-#endif
/* We're now back in the host but in guest MMU context */
+ cmpwi r9,KVM_GUEST_MODE_HOST_HV
+ beq kvmppc_bad_host_intr
li r9, KVM_GUEST_MODE_HOST_HV
stb r9, HSTATE_IN_GUEST(r13)
@@ -1400,7 +1098,7 @@ END_FTR_SECTION_IFSET(CPU_FTR_HAS_PPR)
cmpwi r12,BOOK3S_INTERRUPT_HV_DECREMENTER
bne 2f
mfspr r3,SPRN_HDEC
- EXTEND_HDEC(r3)
+ extsw r3, r3
cmpdi r3,0
mr r4,r9
bge fast_guest_return
@@ -1412,14 +1110,6 @@ END_FTR_SECTION_IFSET(CPU_FTR_HAS_PPR)
/* Hypervisor doorbell - exit only if host IPI flag set */
cmpwi r12, BOOK3S_INTERRUPT_H_DOORBELL
bne 3f
-BEGIN_FTR_SECTION
- PPC_MSGSYNC
- lwsync
- /* always exit if we're running a nested guest */
- ld r0, VCPU_NESTED(r9)
- cmpdi r0, 0
- bne guest_exit_cont
-END_FTR_SECTION_IFSET(CPU_FTR_ARCH_300)
lbz r0, HSTATE_HOST_IPI(r13)
cmpwi r0, 0
beq maybe_reenter_guest
@@ -1449,62 +1139,16 @@ guest_exit_cont: /* r9 = vcpu, r12 = trap, r13 = paca */
mr r4, r9
bl kvmhv_accumulate_time
#endif
-#ifdef CONFIG_KVM_XICS
- /* We are exiting, pull the VP from the XIVE */
- lbz r0, VCPU_XIVE_PUSHED(r9)
- cmpwi cr0, r0, 0
- beq 1f
- li r7, TM_SPC_PULL_OS_CTX
- li r6, TM_QW1_OS
- mfmsr r0
- andi. r0, r0, MSR_DR /* in real mode? */
- beq 2f
- ld r10, HSTATE_XIVE_TIMA_VIRT(r13)
- cmpldi cr0, r10, 0
- beq 1f
- /* First load to pull the context, we ignore the value */
- eieio
- lwzx r11, r7, r10
- /* Second load to recover the context state (Words 0 and 1) */
- ldx r11, r6, r10
- b 3f
-2: ld r10, HSTATE_XIVE_TIMA_PHYS(r13)
- cmpldi cr0, r10, 0
- beq 1f
- /* First load to pull the context, we ignore the value */
- eieio
- lwzcix r11, r7, r10
- /* Second load to recover the context state (Words 0 and 1) */
- ldcix r11, r6, r10
-3: std r11, VCPU_XIVE_SAVED_STATE(r9)
- /* Fixup some of the state for the next load */
- li r10, 0
- li r0, 0xff
- stb r10, VCPU_XIVE_PUSHED(r9)
- stb r10, (VCPU_XIVE_SAVED_STATE+3)(r9)
- stb r0, (VCPU_XIVE_SAVED_STATE+4)(r9)
- eieio
-1:
-#endif /* CONFIG_KVM_XICS */
/*
* Possibly flush the link stack here, before we do a blr in
- * guest_exit_short_path.
+ * kvmhv_switch_to_host.
*/
1: nop
patch_site 1b patch__call_kvm_flush_link_stack
- /* If we came in through the P9 short path, go back out to C now */
- lwz r0, STACK_SLOT_SHORT_PATH(r1)
- cmpwi r0, 0
- bne guest_exit_short_path
-
/* For hash guest, read the guest SLB and save it away */
- ld r5, VCPU_KVM(r9)
- lbz r0, KVM_RADIX(r5)
li r5, 0
- cmpwi r0, 0
- bne 0f /* for radix, save 0 entries */
lwz r0,VCPU_SLB_NR(r9) /* number of entries in SLB */
mtctr r0
li r6,0
@@ -1528,9 +1172,6 @@ guest_exit_cont: /* r9 = vcpu, r12 = trap, r13 = paca */
stw r5,VCPU_SLB_MAX(r9)
/* load host SLB entries */
-BEGIN_MMU_FTR_SECTION
- b guest_bypass
-END_MMU_FTR_SECTION_IFSET(MMU_FTR_TYPE_RADIX)
ld r8,PACA_SLBSHADOWPTR(r13)
.rept SLB_NUM_BOLTED
@@ -1543,21 +1184,6 @@ END_MMU_FTR_SECTION_IFSET(MMU_FTR_TYPE_RADIX)
slbmte r6,r5
1: addi r8,r8,16
.endr
- b guest_bypass
-
-0: /*
- * Sanitise radix guest SLB, see guest_exit_short_path comment.
- * We clear vcpu->arch.slb_max to match earlier behaviour.
- */
- li r0,0
- stw r0,VCPU_SLB_MAX(r9)
- slbmte r0,r0
- li r4,1
- slbmte r0,r4
- li r4,2
- slbmte r0,r4
- li r4,3
- slbmte r0,r4
guest_bypass:
stw r12, STACK_SLOT_TRAP(r1)
@@ -1567,12 +1193,6 @@ guest_bypass:
ld r3, HSTATE_KVM_VCORE(r13)
mfspr r5,SPRN_DEC
mftb r6
- /* On P9, if the guest has large decr enabled, don't sign extend */
-BEGIN_FTR_SECTION
- ld r4, VCORE_LPCR(r3)
- andis. r4, r4, LPCR_LD@h
- bne 16f
-END_FTR_SECTION_IFSET(CPU_FTR_ARCH_300)
extsw r5,r5
16: add r5,r5,r6
/* r5 is a guest timebase value here, convert to host TB */
@@ -1646,7 +1266,6 @@ END_FTR_SECTION_IFCLR(CPU_FTR_ARCH_207S)
std r6, VCPU_BESCR(r9)
stw r7, VCPU_GUEST_PID(r9)
std r8, VCPU_WORT(r9)
-BEGIN_FTR_SECTION
mfspr r5, SPRN_TCSCR
mfspr r6, SPRN_ACOP
mfspr r7, SPRN_CSIGR
@@ -1655,17 +1274,6 @@ BEGIN_FTR_SECTION
std r6, VCPU_ACOP(r9)
std r7, VCPU_CSIGR(r9)
std r8, VCPU_TACR(r9)
-FTR_SECTION_ELSE
- mfspr r5, SPRN_TIDR
- mfspr r6, SPRN_PSSCR
- std r5, VCPU_TID(r9)
- rldicl r6, r6, 4, 50 /* r6 &= PSSCR_GUEST_VIS */
- rotldi r6, r6, 60
- std r6, VCPU_PSSCR(r9)
- /* Restore host HFSCR value */
- ld r7, STACK_SLOT_HFSCR(r1)
- mtspr SPRN_HFSCR, r7
-ALT_FTR_SECTION_END_IFCLR(CPU_FTR_ARCH_300)
BEGIN_FTR_SECTION
ld r5, STACK_SLOT_FSCR(r1)
mtspr SPRN_FSCR, r5
@@ -1677,13 +1285,11 @@ END_FTR_SECTION_IFSET(CPU_FTR_ARCH_207S)
li r0, 0
mtspr SPRN_PSPB, r0
mtspr SPRN_WORT, r0
-BEGIN_FTR_SECTION
mtspr SPRN_TCSCR, r0
/* Set MMCRS to 1<<31 to freeze and disable the SPMC counters */
li r0, 1
sldi r0, r0, 31
mtspr SPRN_MMCRS, r0
-END_FTR_SECTION_IFCLR(CPU_FTR_ARCH_300)
/* Save and restore AMR, IAMR and UAMOR before turning on the MMU */
ld r8, STACK_SLOT_IAMR(r1)
@@ -1740,13 +1346,9 @@ END_FTR_SECTION_IFCLR(CPU_FTR_ARCH_300)
bl kvmppc_save_fp
#ifdef CONFIG_PPC_TRANSACTIONAL_MEM
-/*
- * Branch around the call if both CPU_FTR_TM and
- * CPU_FTR_P9_TM_HV_ASSIST are off.
- */
BEGIN_FTR_SECTION
b 91f
-END_FTR_SECTION(CPU_FTR_TM | CPU_FTR_P9_TM_HV_ASSIST, 0)
+END_FTR_SECTION_IFCLR(CPU_FTR_TM)
/*
* NOTE THAT THIS TRASHES ALL NON-VOLATILE REGISTERS (but not CR)
*/
@@ -1792,80 +1394,6 @@ BEGIN_FTR_SECTION
mtspr SPRN_DAWR0, r6
mtspr SPRN_DAWRX0, r7
END_FTR_SECTION_IFSET(CPU_FTR_ARCH_207S)
-BEGIN_FTR_SECTION
- ld r6, STACK_SLOT_DAWR1(r1)
- ld r7, STACK_SLOT_DAWRX1(r1)
- mtspr SPRN_DAWR1, r6
- mtspr SPRN_DAWRX1, r7
-END_FTR_SECTION_IFSET(CPU_FTR_ARCH_207S | CPU_FTR_DAWR1)
-BEGIN_FTR_SECTION
- ld r5, STACK_SLOT_TID(r1)
- ld r6, STACK_SLOT_PSSCR(r1)
- ld r7, STACK_SLOT_PID(r1)
- mtspr SPRN_TIDR, r5
- mtspr SPRN_PSSCR, r6
- mtspr SPRN_PID, r7
-END_FTR_SECTION_IFSET(CPU_FTR_ARCH_300)
-
-#ifdef CONFIG_PPC_RADIX_MMU
- /*
- * Are we running hash or radix ?
- */
- ld r5, VCPU_KVM(r9)
- lbz r0, KVM_RADIX(r5)
- cmpwi cr2, r0, 0
- beq cr2, 2f
-
- /*
- * Radix: do eieio; tlbsync; ptesync sequence in case we
- * interrupted the guest between a tlbie and a ptesync.
- */
- eieio
- tlbsync
- ptesync
-
-BEGIN_FTR_SECTION
- /* Radix: Handle the case where the guest used an illegal PID */
- LOAD_REG_ADDR(r4, mmu_base_pid)
- lwz r3, VCPU_GUEST_PID(r9)
- lwz r5, 0(r4)
- cmpw cr0,r3,r5
- blt 2f
-
- /*
- * Illegal PID, the HW might have prefetched and cached in the TLB
- * some translations for the LPID 0 / guest PID combination which
- * Linux doesn't know about, so we need to flush that PID out of
- * the TLB. First we need to set LPIDR to 0 so tlbiel applies to
- * the right context.
- */
- li r0,0
- mtspr SPRN_LPID,r0
- isync
-
- /* Then do a congruence class local flush */
- ld r6,VCPU_KVM(r9)
- lwz r0,KVM_TLB_SETS(r6)
- mtctr r0
- li r7,0x400 /* IS field = 0b01 */
- ptesync
- sldi r0,r3,32 /* RS has PID */
-1: PPC_TLBIEL(7,0,2,1,1) /* RIC=2, PRS=1, R=1 */
- addi r7,r7,0x1000
- bdnz 1b
- ptesync
-END_FTR_SECTION_IFSET(CPU_FTR_P9_RADIX_PREFETCH_BUG)
-
-2:
-#endif /* CONFIG_PPC_RADIX_MMU */
-
- /*
- * cp_abort is required if the processor supports local copy-paste
- * to clear the copy buffer that was under control of the guest.
- */
-BEGIN_FTR_SECTION
- PPC_CP_ABORT
-END_FTR_SECTION_IFSET(CPU_FTR_ARCH_31)
/*
* POWER7/POWER8 guest -> host partition switch code.
@@ -1902,13 +1430,11 @@ kvmhv_switch_to_host:
/* Primary thread switches back to host partition */
lwz r7,KVM_HOST_LPID(r4)
-BEGIN_FTR_SECTION
ld r6,KVM_HOST_SDR1(r4)
li r8,LPID_RSVD /* switch to reserved LPID */
mtspr SPRN_LPID,r8
ptesync
mtspr SPRN_SDR1,r6 /* switch to host page table */
-END_FTR_SECTION_IFCLR(CPU_FTR_ARCH_300)
mtspr SPRN_LPID,r7
isync
@@ -2117,26 +1643,13 @@ kvmppc_tm_emul:
* reflect the HDSI to the guest as a DSI.
*/
kvmppc_hdsi:
- ld r3, VCPU_KVM(r9)
- lbz r0, KVM_RADIX(r3)
mfspr r4, SPRN_HDAR
mfspr r6, SPRN_HDSISR
-BEGIN_FTR_SECTION
- /* Look for DSISR canary. If we find it, retry instruction */
- cmpdi r6, 0x7fff
- beq 6f
-END_FTR_SECTION_IFSET(CPU_FTR_ARCH_300)
- cmpwi r0, 0
- bne .Lradix_hdsi /* on radix, just save DAR/DSISR/ASDR */
/* HPTE not found fault or protection fault? */
andis. r0, r6, (DSISR_NOHPTE | DSISR_PROTFAULT)@h
beq 1f /* if not, send it to the guest */
andi. r0, r11, MSR_DR /* data relocation enabled? */
beq 3f
-BEGIN_FTR_SECTION
- mfspr r5, SPRN_ASDR /* on POWER9, use ASDR to get VSID */
- b 4f
-END_FTR_SECTION_IFSET(CPU_FTR_ARCH_300)
clrrdi r0, r4, 28
PPC_SLBFEE_DOT(R5, R0) /* if so, look up SLB */
li r0, BOOK3S_INTERRUPT_DATA_SEGMENT
@@ -2204,31 +1717,15 @@ fast_interrupt_c_return:
stb r0, HSTATE_IN_GUEST(r13)
b guest_exit_cont
-.Lradix_hdsi:
- std r4, VCPU_FAULT_DAR(r9)
- stw r6, VCPU_FAULT_DSISR(r9)
-.Lradix_hisi:
- mfspr r5, SPRN_ASDR
- std r5, VCPU_FAULT_GPA(r9)
- b guest_exit_cont
-
/*
* Similarly for an HISI, reflect it to the guest as an ISI unless
* it is an HPTE not found fault for a page that we have paged out.
*/
kvmppc_hisi:
- ld r3, VCPU_KVM(r9)
- lbz r0, KVM_RADIX(r3)
- cmpwi r0, 0
- bne .Lradix_hisi /* for radix, just save ASDR */
andis. r0, r11, SRR1_ISI_NOPT@h
beq 1f
andi. r0, r11, MSR_IR /* instruction relocation enabled? */
beq 3f
-BEGIN_FTR_SECTION
- mfspr r5, SPRN_ASDR /* on POWER9, use ASDR to get VSID */
- b 4f
-END_FTR_SECTION_IFSET(CPU_FTR_ARCH_300)
clrrdi r0, r10, 28
PPC_SLBFEE_DOT(R5, R0) /* if so, look up SLB */
li r0, BOOK3S_INTERRUPT_INST_SEGMENT
@@ -2276,10 +1773,6 @@ hcall_try_real_mode:
andi. r0,r11,MSR_PR
/* sc 1 from userspace - reflect to guest syscall */
bne sc_1_fast_return
- /* sc 1 from nested guest - give it to L1 to handle */
- ld r0, VCPU_NESTED(r9)
- cmpdi r0, 0
- bne guest_exit_cont
clrrdi r3,r3,2
cmpldi r3,hcall_real_table_end - hcall_real_table
bge guest_exit_cont
@@ -2544,7 +2037,7 @@ hcall_real_table:
#else
.long 0 /* 0x2fc - H_XIRR_X*/
#endif
- .long DOTSYM(kvmppc_h_random) - hcall_real_table
+ .long DOTSYM(kvmppc_rm_h_random) - hcall_real_table
.globl hcall_real_table_end
hcall_real_table_end:
@@ -2675,13 +2168,9 @@ _GLOBAL(kvmppc_h_cede) /* r3 = vcpu pointer, r11 = msr, r13 = paca */
bl kvmppc_save_fp
#ifdef CONFIG_PPC_TRANSACTIONAL_MEM
-/*
- * Branch around the call if both CPU_FTR_TM and
- * CPU_FTR_P9_TM_HV_ASSIST are off.
- */
BEGIN_FTR_SECTION
b 91f
-END_FTR_SECTION(CPU_FTR_TM | CPU_FTR_P9_TM_HV_ASSIST, 0)
+END_FTR_SECTION_IFCLR(CPU_FTR_TM)
/*
* NOTE THAT THIS TRASHES ALL NON-VOLATILE REGISTERS (but not CR)
*/
@@ -2701,15 +2190,8 @@ END_FTR_SECTION(CPU_FTR_TM | CPU_FTR_P9_TM_HV_ASSIST, 0)
mfspr r3, SPRN_DEC
mfspr r4, SPRN_HDEC
mftb r5
-BEGIN_FTR_SECTION
- /* On P9 check whether the guest has large decrementer mode enabled */
- ld r6, HSTATE_KVM_VCORE(r13)
- ld r6, VCORE_LPCR(r6)
- andis. r6, r6, LPCR_LD@h
- bne 68f
-END_FTR_SECTION_IFSET(CPU_FTR_ARCH_300)
extsw r3, r3
-68: EXTEND_HDEC(r4)
+ extsw r4, r4
cmpd r3, r4
ble 67f
mtspr SPRN_DEC, r4
@@ -2754,28 +2236,11 @@ BEGIN_FTR_SECTION
END_FTR_SECTION_IFSET(CPU_FTR_ARCH_207S)
kvm_nap_sequence: /* desired LPCR value in r5 */
-BEGIN_FTR_SECTION
- /*
- * PSSCR bits: exit criterion = 1 (wakeup based on LPCR at sreset)
- * enable state loss = 1 (allow SMT mode switch)
- * requested level = 0 (just stop dispatching)
- */
- lis r3, (PSSCR_EC | PSSCR_ESL)@h
- /* Set LPCR_PECE_HVEE bit to enable wakeup by HV interrupts */
- li r4, LPCR_PECE_HVEE@higher
- sldi r4, r4, 32
- or r5, r5, r4
-FTR_SECTION_ELSE
li r3, PNV_THREAD_NAP
-ALT_FTR_SECTION_END_IFSET(CPU_FTR_ARCH_300)
mtspr SPRN_LPCR,r5
isync
-BEGIN_FTR_SECTION
- bl isa300_idle_stop_mayloss
-FTR_SECTION_ELSE
bl isa206_idle_insn_mayloss
-ALT_FTR_SECTION_END_IFSET(CPU_FTR_ARCH_300)
mfspr r0, SPRN_CTRLF
ori r0, r0, 1
@@ -2794,10 +2259,8 @@ ALT_FTR_SECTION_END_IFSET(CPU_FTR_ARCH_300)
beq kvm_end_cede
cmpwi r0, NAPPING_NOVCPU
beq kvm_novcpu_wakeup
-BEGIN_FTR_SECTION
cmpwi r0, NAPPING_UNSPLIT
beq kvm_unsplit_wakeup
-END_FTR_SECTION_IFCLR(CPU_FTR_ARCH_300)
twi 31,0,0 /* Nap state must not be zero */
33: mr r4, r3
@@ -2817,13 +2280,9 @@ kvm_end_cede:
#endif
#ifdef CONFIG_PPC_TRANSACTIONAL_MEM
-/*
- * Branch around the call if both CPU_FTR_TM and
- * CPU_FTR_P9_TM_HV_ASSIST are off.
- */
BEGIN_FTR_SECTION
b 91f
-END_FTR_SECTION(CPU_FTR_TM | CPU_FTR_P9_TM_HV_ASSIST, 0)
+END_FTR_SECTION_IFCLR(CPU_FTR_TM)
/*
* NOTE THAT THIS TRASHES ALL NON-VOLATILE REGISTERS (but not CR)
*/
@@ -2913,47 +2372,7 @@ kvm_cede_prodded:
/* we've ceded but we want to give control to the host */
kvm_cede_exit:
ld r9, HSTATE_KVM_VCPU(r13)
-#ifdef CONFIG_KVM_XICS
- /* are we using XIVE with single escalation? */
- ld r10, VCPU_XIVE_ESC_VADDR(r9)
- cmpdi r10, 0
- beq 3f
- li r6, XIVE_ESB_SET_PQ_00
- /*
- * If we still have a pending escalation, abort the cede,
- * and we must set PQ to 10 rather than 00 so that we don't
- * potentially end up with two entries for the escalation
- * interrupt in the XIVE interrupt queue. In that case
- * we also don't want to set xive_esc_on to 1 here in
- * case we race with xive_esc_irq().
- */
- lbz r5, VCPU_XIVE_ESC_ON(r9)
- cmpwi r5, 0
- beq 4f
- li r0, 0
- stb r0, VCPU_CEDED(r9)
- /*
- * The escalation interrupts are special as we don't EOI them.
- * There is no need to use the load-after-store ordering offset
- * to set PQ to 10 as we won't use StoreEOI.
- */
- li r6, XIVE_ESB_SET_PQ_10
- b 5f
-4: li r0, 1
- stb r0, VCPU_XIVE_ESC_ON(r9)
- /* make sure store to xive_esc_on is seen before xive_esc_irq runs */
- sync
-5: /* Enable XIVE escalation */
- mfmsr r0
- andi. r0, r0, MSR_DR /* in real mode? */
- beq 1f
- ldx r0, r10, r6
- b 2f
-1: ld r10, VCPU_XIVE_ESC_RADDR(r9)
- ldcix r0, r10, r6
-2: sync
-#endif /* CONFIG_KVM_XICS */
-3: b guest_exit_cont
+ b guest_exit_cont
/* Try to do machine check recovery in real mode */
machine_check_realmode:
@@ -3030,10 +2449,6 @@ END_FTR_SECTION_IFSET(CPU_FTR_ARCH_207S)
PPC_MSGCLR(6)
/* see if it's a host IPI */
li r3, 1
-BEGIN_FTR_SECTION
- PPC_MSGSYNC
- lwsync
-END_FTR_SECTION_IFSET(CPU_FTR_ARCH_300)
lbz r0, HSTATE_HOST_IPI(r13)
cmpwi r0, 0
bnelr
@@ -3342,73 +2757,12 @@ kvmppc_bad_host_intr:
std r3, STACK_FRAME_OVERHEAD-16(r1)
/*
- * On POWER9 do a minimal restore of the MMU and call C code,
- * which will print a message and panic.
* XXX On POWER7 and POWER8, we just spin here since we don't
* know what the other threads are doing (and we don't want to
* coordinate with them) - but at least we now have register state
* in memory that we might be able to look at from another CPU.
*/
-BEGIN_FTR_SECTION
b .
-END_FTR_SECTION_IFCLR(CPU_FTR_ARCH_300)
- ld r9, HSTATE_KVM_VCPU(r13)
- ld r10, VCPU_KVM(r9)
-
- li r0, 0
- mtspr SPRN_AMR, r0
- mtspr SPRN_IAMR, r0
- mtspr SPRN_CIABR, r0
- mtspr SPRN_DAWRX0, r0
-BEGIN_FTR_SECTION
- mtspr SPRN_DAWRX1, r0
-END_FTR_SECTION_IFSET(CPU_FTR_DAWR1)
-
- /* Clear hash and radix guest SLB, see guest_exit_short_path comment. */
- slbmte r0, r0
- PPC_SLBIA(6)
-
-BEGIN_MMU_FTR_SECTION
- b 4f
-END_MMU_FTR_SECTION_IFSET(MMU_FTR_TYPE_RADIX)
-
- ptesync
- ld r8, PACA_SLBSHADOWPTR(r13)
- .rept SLB_NUM_BOLTED
- li r3, SLBSHADOW_SAVEAREA
- LDX_BE r5, r8, r3
- addi r3, r3, 8
- LDX_BE r6, r8, r3
- andis. r7, r5, SLB_ESID_V@h
- beq 3f
- slbmte r6, r5
-3: addi r8, r8, 16
- .endr
-
-4: lwz r7, KVM_HOST_LPID(r10)
- mtspr SPRN_LPID, r7
- mtspr SPRN_PID, r0
- ld r8, KVM_HOST_LPCR(r10)
- mtspr SPRN_LPCR, r8
- isync
- li r0, KVM_GUEST_MODE_NONE
- stb r0, HSTATE_IN_GUEST(r13)
-
- /*
- * Turn on the MMU and jump to C code
- */
- bcl 20, 31, .+4
-5: mflr r3
- addi r3, r3, 9f - 5b
- li r4, -1
- rldimi r3, r4, 62, 0 /* ensure 0xc000000000000000 bits are set */
- ld r4, PACAKMSR(r13)
- mtspr SPRN_SRR0, r3
- mtspr SPRN_SRR1, r4
- RFI_TO_KERNEL
-9: addi r3, r1, STACK_FRAME_OVERHEAD
- bl kvmppc_bad_interrupt
- b 9b
/*
* This mimics the MSR transition on IRQ delivery. The new guest MSR is taken
diff --git a/arch/powerpc/kvm/book3s_pr.c b/arch/powerpc/kvm/book3s_pr.c
index d7733b07f489..71bcb0140461 100644
--- a/arch/powerpc/kvm/book3s_pr.c
+++ b/arch/powerpc/kvm/book3s_pr.c
@@ -493,7 +493,7 @@ static void kvmppc_set_msr_pr(struct kvm_vcpu *vcpu, u64 msr)
if (!vcpu->arch.pending_exceptions) {
kvm_vcpu_block(vcpu);
kvm_clear_request(KVM_REQ_UNHALT, vcpu);
- vcpu->stat.halt_wakeup++;
+ vcpu->stat.generic.halt_wakeup++;
/* Unset POW bit after we woke up */
msr &= ~MSR_POW;
diff --git a/arch/powerpc/kvm/book3s_pr_papr.c b/arch/powerpc/kvm/book3s_pr_papr.c
index 031c8015864a..ac14239f3424 100644
--- a/arch/powerpc/kvm/book3s_pr_papr.c
+++ b/arch/powerpc/kvm/book3s_pr_papr.c
@@ -378,7 +378,7 @@ int kvmppc_h_pr(struct kvm_vcpu *vcpu, unsigned long cmd)
kvmppc_set_msr_fast(vcpu, kvmppc_get_msr(vcpu) | MSR_EE);
kvm_vcpu_block(vcpu);
kvm_clear_request(KVM_REQ_UNHALT, vcpu);
- vcpu->stat.halt_wakeup++;
+ vcpu->stat.generic.halt_wakeup++;
return EMULATE_DONE;
case H_LOGICAL_CI_LOAD:
return kvmppc_h_pr_logical_ci_load(vcpu);
diff --git a/arch/powerpc/kvm/book3s_segment.S b/arch/powerpc/kvm/book3s_segment.S
index 1f492aa4c8d6..202046a83fc1 100644
--- a/arch/powerpc/kvm/book3s_segment.S
+++ b/arch/powerpc/kvm/book3s_segment.S
@@ -164,12 +164,15 @@ kvmppc_interrupt_pr:
/* 64-bit entry. Register usage at this point:
*
* SPRG_SCRATCH0 = guest R13
+ * R9 = HSTATE_IN_GUEST
* R12 = (guest CR << 32) | exit handler id
* R13 = PACA
* HSTATE.SCRATCH0 = guest R12
+ * HSTATE.SCRATCH2 = guest R9
*/
#ifdef CONFIG_PPC64
/* Match 32-bit entry */
+ ld r9,HSTATE_SCRATCH2(r13)
rotldi r12, r12, 32 /* Flip R12 halves for stw */
stw r12, HSTATE_SCRATCH1(r13) /* CR is now in the low half */
srdi r12, r12, 32 /* shift trap into low half */
diff --git a/arch/powerpc/kvm/book3s_xive.c b/arch/powerpc/kvm/book3s_xive.c
index e7219b6f5f9a..9268d386b128 100644
--- a/arch/powerpc/kvm/book3s_xive.c
+++ b/arch/powerpc/kvm/book3s_xive.c
@@ -128,6 +128,71 @@ void kvmppc_xive_push_vcpu(struct kvm_vcpu *vcpu)
EXPORT_SYMBOL_GPL(kvmppc_xive_push_vcpu);
/*
+ * Pull a vcpu's context from the XIVE on guest exit.
+ * This assumes we are in virtual mode (MMU on)
+ */
+void kvmppc_xive_pull_vcpu(struct kvm_vcpu *vcpu)
+{
+ void __iomem *tima = local_paca->kvm_hstate.xive_tima_virt;
+
+ if (!vcpu->arch.xive_pushed)
+ return;
+
+ /*
+ * Should not have been pushed if there is no tima
+ */
+ if (WARN_ON(!tima))
+ return;
+
+ eieio();
+ /* First load to pull the context, we ignore the value */
+ __raw_readl(tima + TM_SPC_PULL_OS_CTX);
+ /* Second load to recover the context state (Words 0 and 1) */
+ vcpu->arch.xive_saved_state.w01 = __raw_readq(tima + TM_QW1_OS);
+
+ /* Fixup some of the state for the next load */
+ vcpu->arch.xive_saved_state.lsmfb = 0;
+ vcpu->arch.xive_saved_state.ack = 0xff;
+ vcpu->arch.xive_pushed = 0;
+ eieio();
+}
+EXPORT_SYMBOL_GPL(kvmppc_xive_pull_vcpu);
+
+void kvmppc_xive_rearm_escalation(struct kvm_vcpu *vcpu)
+{
+ void __iomem *esc_vaddr = (void __iomem *)vcpu->arch.xive_esc_vaddr;
+
+ if (!esc_vaddr)
+ return;
+
+ /* we are using XIVE with single escalation */
+
+ if (vcpu->arch.xive_esc_on) {
+ /*
+ * If we still have a pending escalation, abort the cede,
+ * and we must set PQ to 10 rather than 00 so that we don't
+ * potentially end up with two entries for the escalation
+ * interrupt in the XIVE interrupt queue. In that case
+ * we also don't want to set xive_esc_on to 1 here in
+ * case we race with xive_esc_irq().
+ */
+ vcpu->arch.ceded = 0;
+ /*
+ * The escalation interrupts are special as we don't EOI them.
+ * There is no need to use the load-after-store ordering offset
+ * to set PQ to 10 as we won't use StoreEOI.
+ */
+ __raw_readq(esc_vaddr + XIVE_ESB_SET_PQ_10);
+ } else {
+ vcpu->arch.xive_esc_on = true;
+ mb();
+ __raw_readq(esc_vaddr + XIVE_ESB_SET_PQ_00);
+ }
+ mb();
+}
+EXPORT_SYMBOL_GPL(kvmppc_xive_rearm_escalation);
+
+/*
* This is a simple trigger for a generic XIVE IRQ. This must
* only be called for interrupts that support a trigger page
*/
@@ -2075,6 +2140,36 @@ static int kvmppc_xive_create(struct kvm_device *dev, u32 type)
return 0;
}
+int kvmppc_xive_xics_hcall(struct kvm_vcpu *vcpu, u32 req)
+{
+ struct kvmppc_vcore *vc = vcpu->arch.vcore;
+
+ /* The VM should have configured XICS mode before doing XICS hcalls. */
+ if (!kvmppc_xics_enabled(vcpu))
+ return H_TOO_HARD;
+
+ switch (req) {
+ case H_XIRR:
+ return xive_vm_h_xirr(vcpu);
+ case H_CPPR:
+ return xive_vm_h_cppr(vcpu, kvmppc_get_gpr(vcpu, 4));
+ case H_EOI:
+ return xive_vm_h_eoi(vcpu, kvmppc_get_gpr(vcpu, 4));
+ case H_IPI:
+ return xive_vm_h_ipi(vcpu, kvmppc_get_gpr(vcpu, 4),
+ kvmppc_get_gpr(vcpu, 5));
+ case H_IPOLL:
+ return xive_vm_h_ipoll(vcpu, kvmppc_get_gpr(vcpu, 4));
+ case H_XIRR_X:
+ xive_vm_h_xirr(vcpu);
+ kvmppc_set_gpr(vcpu, 5, get_tb() + vc->tb_offset);
+ return H_SUCCESS;
+ }
+
+ return H_UNSUPPORTED;
+}
+EXPORT_SYMBOL_GPL(kvmppc_xive_xics_hcall);
+
int kvmppc_xive_debug_show_queues(struct seq_file *m, struct kvm_vcpu *vcpu)
{
struct kvmppc_xive_vcpu *xc = vcpu->arch.xive_vcpu;
@@ -2257,21 +2352,3 @@ struct kvm_device_ops kvm_xive_ops = {
.get_attr = xive_get_attr,
.has_attr = xive_has_attr,
};
-
-void kvmppc_xive_init_module(void)
-{
- __xive_vm_h_xirr = xive_vm_h_xirr;
- __xive_vm_h_ipoll = xive_vm_h_ipoll;
- __xive_vm_h_ipi = xive_vm_h_ipi;
- __xive_vm_h_cppr = xive_vm_h_cppr;
- __xive_vm_h_eoi = xive_vm_h_eoi;
-}
-
-void kvmppc_xive_exit_module(void)
-{
- __xive_vm_h_xirr = NULL;
- __xive_vm_h_ipoll = NULL;
- __xive_vm_h_ipi = NULL;
- __xive_vm_h_cppr = NULL;
- __xive_vm_h_eoi = NULL;
-}
diff --git a/arch/powerpc/kvm/book3s_xive.h b/arch/powerpc/kvm/book3s_xive.h
index 86c24a4ad809..afe9eeac6d56 100644
--- a/arch/powerpc/kvm/book3s_xive.h
+++ b/arch/powerpc/kvm/book3s_xive.h
@@ -289,13 +289,6 @@ extern int xive_rm_h_ipi(struct kvm_vcpu *vcpu, unsigned long server,
extern int xive_rm_h_cppr(struct kvm_vcpu *vcpu, unsigned long cppr);
extern int xive_rm_h_eoi(struct kvm_vcpu *vcpu, unsigned long xirr);
-extern unsigned long (*__xive_vm_h_xirr)(struct kvm_vcpu *vcpu);
-extern unsigned long (*__xive_vm_h_ipoll)(struct kvm_vcpu *vcpu, unsigned long server);
-extern int (*__xive_vm_h_ipi)(struct kvm_vcpu *vcpu, unsigned long server,
- unsigned long mfrr);
-extern int (*__xive_vm_h_cppr)(struct kvm_vcpu *vcpu, unsigned long cppr);
-extern int (*__xive_vm_h_eoi)(struct kvm_vcpu *vcpu, unsigned long xirr);
-
/*
* Common Xive routines for XICS-over-XIVE and XIVE native
*/
diff --git a/arch/powerpc/kvm/book3s_xive_native.c b/arch/powerpc/kvm/book3s_xive_native.c
index 76800c84f2a3..1253666dd4d8 100644
--- a/arch/powerpc/kvm/book3s_xive_native.c
+++ b/arch/powerpc/kvm/book3s_xive_native.c
@@ -1281,13 +1281,3 @@ struct kvm_device_ops kvm_xive_native_ops = {
.has_attr = kvmppc_xive_native_has_attr,
.mmap = kvmppc_xive_native_mmap,
};
-
-void kvmppc_xive_native_init_module(void)
-{
- ;
-}
-
-void kvmppc_xive_native_exit_module(void)
-{
- ;
-}
diff --git a/arch/powerpc/kvm/booke.c b/arch/powerpc/kvm/booke.c
index 7d5fe43f85c4..551b30d84aee 100644
--- a/arch/powerpc/kvm/booke.c
+++ b/arch/powerpc/kvm/booke.c
@@ -36,29 +36,59 @@
unsigned long kvmppc_booke_handlers;
-struct kvm_stats_debugfs_item debugfs_entries[] = {
- VCPU_STAT("mmio", mmio_exits),
- VCPU_STAT("sig", signal_exits),
- VCPU_STAT("itlb_r", itlb_real_miss_exits),
- VCPU_STAT("itlb_v", itlb_virt_miss_exits),
- VCPU_STAT("dtlb_r", dtlb_real_miss_exits),
- VCPU_STAT("dtlb_v", dtlb_virt_miss_exits),
- VCPU_STAT("sysc", syscall_exits),
- VCPU_STAT("isi", isi_exits),
- VCPU_STAT("dsi", dsi_exits),
- VCPU_STAT("inst_emu", emulated_inst_exits),
- VCPU_STAT("dec", dec_exits),
- VCPU_STAT("ext_intr", ext_intr_exits),
- VCPU_STAT("halt_successful_poll", halt_successful_poll),
- VCPU_STAT("halt_attempted_poll", halt_attempted_poll),
- VCPU_STAT("halt_poll_invalid", halt_poll_invalid),
- VCPU_STAT("halt_wakeup", halt_wakeup),
- VCPU_STAT("doorbell", dbell_exits),
- VCPU_STAT("guest doorbell", gdbell_exits),
- VCPU_STAT("halt_poll_success_ns", halt_poll_success_ns),
- VCPU_STAT("halt_poll_fail_ns", halt_poll_fail_ns),
- VM_STAT("remote_tlb_flush", remote_tlb_flush),
- { NULL }
+const struct _kvm_stats_desc kvm_vm_stats_desc[] = {
+ KVM_GENERIC_VM_STATS(),
+ STATS_DESC_ICOUNTER(VM, num_2M_pages),
+ STATS_DESC_ICOUNTER(VM, num_1G_pages)
+};
+static_assert(ARRAY_SIZE(kvm_vm_stats_desc) ==
+ sizeof(struct kvm_vm_stat) / sizeof(u64));
+
+const struct kvm_stats_header kvm_vm_stats_header = {
+ .name_size = KVM_STATS_NAME_SIZE,
+ .num_desc = ARRAY_SIZE(kvm_vm_stats_desc),
+ .id_offset = sizeof(struct kvm_stats_header),
+ .desc_offset = sizeof(struct kvm_stats_header) + KVM_STATS_NAME_SIZE,
+ .data_offset = sizeof(struct kvm_stats_header) + KVM_STATS_NAME_SIZE +
+ sizeof(kvm_vm_stats_desc),
+};
+
+const struct _kvm_stats_desc kvm_vcpu_stats_desc[] = {
+ KVM_GENERIC_VCPU_STATS(),
+ STATS_DESC_COUNTER(VCPU, sum_exits),
+ STATS_DESC_COUNTER(VCPU, mmio_exits),
+ STATS_DESC_COUNTER(VCPU, signal_exits),
+ STATS_DESC_COUNTER(VCPU, light_exits),
+ STATS_DESC_COUNTER(VCPU, itlb_real_miss_exits),
+ STATS_DESC_COUNTER(VCPU, itlb_virt_miss_exits),
+ STATS_DESC_COUNTER(VCPU, dtlb_real_miss_exits),
+ STATS_DESC_COUNTER(VCPU, dtlb_virt_miss_exits),
+ STATS_DESC_COUNTER(VCPU, syscall_exits),
+ STATS_DESC_COUNTER(VCPU, isi_exits),
+ STATS_DESC_COUNTER(VCPU, dsi_exits),
+ STATS_DESC_COUNTER(VCPU, emulated_inst_exits),
+ STATS_DESC_COUNTER(VCPU, dec_exits),
+ STATS_DESC_COUNTER(VCPU, ext_intr_exits),
+ STATS_DESC_TIME_NSEC(VCPU, halt_wait_ns),
+ STATS_DESC_COUNTER(VCPU, halt_successful_wait),
+ STATS_DESC_COUNTER(VCPU, dbell_exits),
+ STATS_DESC_COUNTER(VCPU, gdbell_exits),
+ STATS_DESC_COUNTER(VCPU, ld),
+ STATS_DESC_COUNTER(VCPU, st),
+ STATS_DESC_COUNTER(VCPU, pthru_all),
+ STATS_DESC_COUNTER(VCPU, pthru_host),
+ STATS_DESC_COUNTER(VCPU, pthru_bad_aff)
+};
+static_assert(ARRAY_SIZE(kvm_vcpu_stats_desc) ==
+ sizeof(struct kvm_vcpu_stat) / sizeof(u64));
+
+const struct kvm_stats_header kvm_vcpu_stats_header = {
+ .name_size = KVM_STATS_NAME_SIZE,
+ .num_desc = ARRAY_SIZE(kvm_vcpu_stats_desc),
+ .id_offset = sizeof(struct kvm_stats_header),
+ .desc_offset = sizeof(struct kvm_stats_header) + KVM_STATS_NAME_SIZE,
+ .data_offset = sizeof(struct kvm_stats_header) + KVM_STATS_NAME_SIZE +
+ sizeof(kvm_vcpu_stats_desc),
};
/* TODO: use vcpu_printf() */
diff --git a/arch/powerpc/kvm/powerpc.c b/arch/powerpc/kvm/powerpc.c
index a2a68a958fa0..be33b5321a76 100644
--- a/arch/powerpc/kvm/powerpc.c
+++ b/arch/powerpc/kvm/powerpc.c
@@ -682,6 +682,9 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext)
r = !!(hv_enabled && kvmppc_hv_ops->enable_dawr1 &&
!kvmppc_hv_ops->enable_dawr1(NULL));
break;
+ case KVM_CAP_PPC_RPT_INVALIDATE:
+ r = 1;
+ break;
#endif
default:
r = 0;
diff --git a/arch/powerpc/mm/book3s64/radix_pgtable.c b/arch/powerpc/mm/book3s64/radix_pgtable.c
index 5fef8db3b463..6e3495221ab7 100644
--- a/arch/powerpc/mm/book3s64/radix_pgtable.c
+++ b/arch/powerpc/mm/book3s64/radix_pgtable.c
@@ -357,30 +357,19 @@ static void __init radix_init_pgtable(void)
}
/* Find out how many PID bits are supported */
- if (!cpu_has_feature(CPU_FTR_P9_RADIX_PREFETCH_BUG)) {
- if (!mmu_pid_bits)
- mmu_pid_bits = 20;
- mmu_base_pid = 1;
- } else if (cpu_has_feature(CPU_FTR_HVMODE)) {
- if (!mmu_pid_bits)
- mmu_pid_bits = 20;
-#ifdef CONFIG_KVM_BOOK3S_HV_POSSIBLE
+ if (!cpu_has_feature(CPU_FTR_HVMODE) &&
+ cpu_has_feature(CPU_FTR_P9_RADIX_PREFETCH_BUG)) {
/*
- * When KVM is possible, we only use the top half of the
- * PID space to avoid collisions between host and guest PIDs
- * which can cause problems due to prefetch when exiting the
- * guest with AIL=3
+ * Older versions of KVM on these machines perfer if the
+ * guest only uses the low 19 PID bits.
*/
- mmu_base_pid = 1 << (mmu_pid_bits - 1);
-#else
- mmu_base_pid = 1;
-#endif
- } else {
- /* The guest uses the bottom half of the PID space */
if (!mmu_pid_bits)
mmu_pid_bits = 19;
- mmu_base_pid = 1;
+ } else {
+ if (!mmu_pid_bits)
+ mmu_pid_bits = 20;
}
+ mmu_base_pid = 1;
/*
* Allocate Partition table and process table for the
@@ -486,6 +475,7 @@ static int __init radix_dt_scan_page_sizes(unsigned long node,
def = &mmu_psize_defs[idx];
def->shift = shift;
def->ap = ap;
+ def->h_rpt_pgsize = psize_to_rpti_pgsize(idx);
}
/* needed ? */
@@ -560,9 +550,13 @@ void __init radix__early_init_devtree(void)
*/
mmu_psize_defs[MMU_PAGE_4K].shift = 12;
mmu_psize_defs[MMU_PAGE_4K].ap = 0x0;
+ mmu_psize_defs[MMU_PAGE_4K].h_rpt_pgsize =
+ psize_to_rpti_pgsize(MMU_PAGE_4K);
mmu_psize_defs[MMU_PAGE_64K].shift = 16;
mmu_psize_defs[MMU_PAGE_64K].ap = 0x5;
+ mmu_psize_defs[MMU_PAGE_64K].h_rpt_pgsize =
+ psize_to_rpti_pgsize(MMU_PAGE_64K);
}
/*
diff --git a/arch/powerpc/mm/book3s64/radix_tlb.c b/arch/powerpc/mm/book3s64/radix_tlb.c
index 409e61210789..318ec4f33661 100644
--- a/arch/powerpc/mm/book3s64/radix_tlb.c
+++ b/arch/powerpc/mm/book3s64/radix_tlb.c
@@ -20,10 +20,6 @@
#include "internal.h"
-#define RIC_FLUSH_TLB 0
-#define RIC_FLUSH_PWC 1
-#define RIC_FLUSH_ALL 2
-
/*
* tlbiel instruction for radix, set invalidation
* i.e., r=1 and is=01 or is=10 or is=11
@@ -130,6 +126,21 @@ static __always_inline void __tlbie_pid(unsigned long pid, unsigned long ric)
trace_tlbie(0, 0, rb, rs, ric, prs, r);
}
+static __always_inline void __tlbie_pid_lpid(unsigned long pid,
+ unsigned long lpid,
+ unsigned long ric)
+{
+ unsigned long rb, rs, prs, r;
+
+ rb = PPC_BIT(53); /* IS = 1 */
+ rs = (pid << PPC_BITLSHIFT(31)) | (lpid & ~(PPC_BITMASK(0, 31)));
+ prs = 1; /* process scoped */
+ r = 1; /* radix format */
+
+ asm volatile(PPC_TLBIE_5(%0, %4, %3, %2, %1)
+ : : "r"(rb), "i"(r), "i"(prs), "i"(ric), "r"(rs) : "memory");
+ trace_tlbie(0, 0, rb, rs, ric, prs, r);
+}
static __always_inline void __tlbie_lpid(unsigned long lpid, unsigned long ric)
{
unsigned long rb,rs,prs,r;
@@ -190,6 +201,23 @@ static __always_inline void __tlbie_va(unsigned long va, unsigned long pid,
trace_tlbie(0, 0, rb, rs, ric, prs, r);
}
+static __always_inline void __tlbie_va_lpid(unsigned long va, unsigned long pid,
+ unsigned long lpid,
+ unsigned long ap, unsigned long ric)
+{
+ unsigned long rb, rs, prs, r;
+
+ rb = va & ~(PPC_BITMASK(52, 63));
+ rb |= ap << PPC_BITLSHIFT(58);
+ rs = (pid << PPC_BITLSHIFT(31)) | (lpid & ~(PPC_BITMASK(0, 31)));
+ prs = 1; /* process scoped */
+ r = 1; /* radix format */
+
+ asm volatile(PPC_TLBIE_5(%0, %4, %3, %2, %1)
+ : : "r"(rb), "i"(r), "i"(prs), "i"(ric), "r"(rs) : "memory");
+ trace_tlbie(0, 0, rb, rs, ric, prs, r);
+}
+
static __always_inline void __tlbie_lpid_va(unsigned long va, unsigned long lpid,
unsigned long ap, unsigned long ric)
{
@@ -235,6 +263,22 @@ static inline void fixup_tlbie_va_range(unsigned long va, unsigned long pid,
}
}
+static inline void fixup_tlbie_va_range_lpid(unsigned long va,
+ unsigned long pid,
+ unsigned long lpid,
+ unsigned long ap)
+{
+ if (cpu_has_feature(CPU_FTR_P9_TLBIE_ERAT_BUG)) {
+ asm volatile("ptesync" : : : "memory");
+ __tlbie_pid_lpid(0, lpid, RIC_FLUSH_TLB);
+ }
+
+ if (cpu_has_feature(CPU_FTR_P9_TLBIE_STQ_BUG)) {
+ asm volatile("ptesync" : : : "memory");
+ __tlbie_va_lpid(va, pid, lpid, ap, RIC_FLUSH_TLB);
+ }
+}
+
static inline void fixup_tlbie_pid(unsigned long pid)
{
/*
@@ -254,6 +298,25 @@ static inline void fixup_tlbie_pid(unsigned long pid)
}
}
+static inline void fixup_tlbie_pid_lpid(unsigned long pid, unsigned long lpid)
+{
+ /*
+ * We can use any address for the invalidation, pick one which is
+ * probably unused as an optimisation.
+ */
+ unsigned long va = ((1UL << 52) - 1);
+
+ if (cpu_has_feature(CPU_FTR_P9_TLBIE_ERAT_BUG)) {
+ asm volatile("ptesync" : : : "memory");
+ __tlbie_pid_lpid(0, lpid, RIC_FLUSH_TLB);
+ }
+
+ if (cpu_has_feature(CPU_FTR_P9_TLBIE_STQ_BUG)) {
+ asm volatile("ptesync" : : : "memory");
+ __tlbie_va_lpid(va, pid, lpid, mmu_get_ap(MMU_PAGE_64K),
+ RIC_FLUSH_TLB);
+ }
+}
static inline void fixup_tlbie_lpid_va(unsigned long va, unsigned long lpid,
unsigned long ap)
@@ -344,6 +407,31 @@ static inline void _tlbie_pid(unsigned long pid, unsigned long ric)
asm volatile("eieio; tlbsync; ptesync": : :"memory");
}
+static inline void _tlbie_pid_lpid(unsigned long pid, unsigned long lpid,
+ unsigned long ric)
+{
+ asm volatile("ptesync" : : : "memory");
+
+ /*
+ * Workaround the fact that the "ric" argument to __tlbie_pid
+ * must be a compile-time contraint to match the "i" constraint
+ * in the asm statement.
+ */
+ switch (ric) {
+ case RIC_FLUSH_TLB:
+ __tlbie_pid_lpid(pid, lpid, RIC_FLUSH_TLB);
+ fixup_tlbie_pid_lpid(pid, lpid);
+ break;
+ case RIC_FLUSH_PWC:
+ __tlbie_pid_lpid(pid, lpid, RIC_FLUSH_PWC);
+ break;
+ case RIC_FLUSH_ALL:
+ default:
+ __tlbie_pid_lpid(pid, lpid, RIC_FLUSH_ALL);
+ fixup_tlbie_pid_lpid(pid, lpid);
+ }
+ asm volatile("eieio; tlbsync; ptesync" : : : "memory");
+}
struct tlbiel_pid {
unsigned long pid;
unsigned long ric;
@@ -469,6 +557,20 @@ static inline void __tlbie_va_range(unsigned long start, unsigned long end,
fixup_tlbie_va_range(addr - page_size, pid, ap);
}
+static inline void __tlbie_va_range_lpid(unsigned long start, unsigned long end,
+ unsigned long pid, unsigned long lpid,
+ unsigned long page_size,
+ unsigned long psize)
+{
+ unsigned long addr;
+ unsigned long ap = mmu_get_ap(psize);
+
+ for (addr = start; addr < end; addr += page_size)
+ __tlbie_va_lpid(addr, pid, lpid, ap, RIC_FLUSH_TLB);
+
+ fixup_tlbie_va_range_lpid(addr - page_size, pid, lpid, ap);
+}
+
static __always_inline void _tlbie_va(unsigned long va, unsigned long pid,
unsigned long psize, unsigned long ric)
{
@@ -549,6 +651,18 @@ static inline void _tlbie_va_range(unsigned long start, unsigned long end,
asm volatile("eieio; tlbsync; ptesync": : :"memory");
}
+static inline void _tlbie_va_range_lpid(unsigned long start, unsigned long end,
+ unsigned long pid, unsigned long lpid,
+ unsigned long page_size,
+ unsigned long psize, bool also_pwc)
+{
+ asm volatile("ptesync" : : : "memory");
+ if (also_pwc)
+ __tlbie_pid_lpid(pid, lpid, RIC_FLUSH_PWC);
+ __tlbie_va_range_lpid(start, end, pid, lpid, page_size, psize);
+ asm volatile("eieio; tlbsync; ptesync" : : : "memory");
+}
+
static inline void _tlbiel_va_range_multicast(struct mm_struct *mm,
unsigned long start, unsigned long end,
unsigned long pid, unsigned long page_size,
@@ -1338,47 +1452,57 @@ void radix__flush_tlb_all(void)
}
#ifdef CONFIG_KVM_BOOK3S_HV_POSSIBLE
-extern void radix_kvm_prefetch_workaround(struct mm_struct *mm)
+/*
+ * Performs process-scoped invalidations for a given LPID
+ * as part of H_RPT_INVALIDATE hcall.
+ */
+void do_h_rpt_invalidate_prt(unsigned long pid, unsigned long lpid,
+ unsigned long type, unsigned long pg_sizes,
+ unsigned long start, unsigned long end)
{
- unsigned long pid = mm->context.id;
+ unsigned long psize, nr_pages;
+ struct mmu_psize_def *def;
+ bool flush_pid;
- if (unlikely(pid == MMU_NO_CONTEXT))
+ /*
+ * A H_RPTI_TYPE_ALL request implies RIC=3, hence
+ * do a single IS=1 based flush.
+ */
+ if ((type & H_RPTI_TYPE_ALL) == H_RPTI_TYPE_ALL) {
+ _tlbie_pid_lpid(pid, lpid, RIC_FLUSH_ALL);
return;
+ }
- if (!cpu_has_feature(CPU_FTR_P9_RADIX_PREFETCH_BUG))
- return;
+ if (type & H_RPTI_TYPE_PWC)
+ _tlbie_pid_lpid(pid, lpid, RIC_FLUSH_PWC);
- /*
- * If this context hasn't run on that CPU before and KVM is
- * around, there's a slim chance that the guest on another
- * CPU just brought in obsolete translation into the TLB of
- * this CPU due to a bad prefetch using the guest PID on
- * the way into the hypervisor.
- *
- * We work around this here. If KVM is possible, we check if
- * any sibling thread is in KVM. If it is, the window may exist
- * and thus we flush that PID from the core.
- *
- * A potential future improvement would be to mark which PIDs
- * have never been used on the system and avoid it if the PID
- * is new and the process has no other cpumask bit set.
- */
- if (cpu_has_feature(CPU_FTR_HVMODE) && radix_enabled()) {
- int cpu = smp_processor_id();
- int sib = cpu_first_thread_sibling(cpu);
- bool flush = false;
-
- for (; sib <= cpu_last_thread_sibling(cpu) && !flush; sib++) {
- if (sib == cpu)
- continue;
- if (!cpu_possible(sib))
- continue;
- if (paca_ptrs[sib]->kvm_hstate.kvm_vcpu)
- flush = true;
+ /* Full PID flush */
+ if (start == 0 && end == -1)
+ return _tlbie_pid_lpid(pid, lpid, RIC_FLUSH_TLB);
+
+ /* Do range invalidation for all the valid page sizes */
+ for (psize = 0; psize < MMU_PAGE_COUNT; psize++) {
+ def = &mmu_psize_defs[psize];
+ if (!(pg_sizes & def->h_rpt_pgsize))
+ continue;
+
+ nr_pages = (end - start) >> def->shift;
+ flush_pid = nr_pages > tlb_single_page_flush_ceiling;
+
+ /*
+ * If the number of pages spanning the range is above
+ * the ceiling, convert the request into a full PID flush.
+ * And since PID flush takes out all the page sizes, there
+ * is no need to consider remaining page sizes.
+ */
+ if (flush_pid) {
+ _tlbie_pid_lpid(pid, lpid, RIC_FLUSH_TLB);
+ return;
}
- if (flush)
- _tlbiel_pid(pid, RIC_FLUSH_ALL);
+ _tlbie_va_range_lpid(start, end, pid, lpid,
+ (1UL << def->shift), psize, false);
}
}
-EXPORT_SYMBOL_GPL(radix_kvm_prefetch_workaround);
+EXPORT_SYMBOL_GPL(do_h_rpt_invalidate_prt);
+
#endif /* CONFIG_KVM_BOOK3S_HV_POSSIBLE */
diff --git a/arch/powerpc/mm/mmu_context.c b/arch/powerpc/mm/mmu_context.c
index a857af401738..74246536b832 100644
--- a/arch/powerpc/mm/mmu_context.c
+++ b/arch/powerpc/mm/mmu_context.c
@@ -83,9 +83,7 @@ void switch_mm_irqs_off(struct mm_struct *prev, struct mm_struct *next,
if (cpu_has_feature(CPU_FTR_ALTIVEC))
asm volatile ("dssall");
- if (new_on_cpu)
- radix_kvm_prefetch_workaround(next);
- else
+ if (!new_on_cpu)
membarrier_arch_switch_mm(prev, next, tsk);
/*
diff --git a/arch/powerpc/platforms/powernv/idle.c b/arch/powerpc/platforms/powernv/idle.c
index 999997d9e9a9..528a7e0cf83a 100644
--- a/arch/powerpc/platforms/powernv/idle.c
+++ b/arch/powerpc/platforms/powernv/idle.c
@@ -604,7 +604,7 @@ struct p9_sprs {
u64 uamor;
};
-static unsigned long power9_idle_stop(unsigned long psscr, bool mmu_on)
+static unsigned long power9_idle_stop(unsigned long psscr)
{
int cpu = raw_smp_processor_id();
int first = cpu_first_thread_sibling(cpu);
@@ -620,8 +620,6 @@ static unsigned long power9_idle_stop(unsigned long psscr, bool mmu_on)
if (!(psscr & (PSSCR_EC|PSSCR_ESL))) {
/* EC=ESL=0 case */
- BUG_ON(!mmu_on);
-
/*
* Wake synchronously. SRESET via xscom may still cause
* a 0x100 powersave wakeup with SRR1 reason!
@@ -803,8 +801,7 @@ core_woken:
__slb_restore_bolted_realmode();
out:
- if (mmu_on)
- mtmsr(MSR_KERNEL);
+ mtmsr(MSR_KERNEL);
return srr1;
}
@@ -895,7 +892,7 @@ struct p10_sprs {
*/
};
-static unsigned long power10_idle_stop(unsigned long psscr, bool mmu_on)
+static unsigned long power10_idle_stop(unsigned long psscr)
{
int cpu = raw_smp_processor_id();
int first = cpu_first_thread_sibling(cpu);
@@ -909,8 +906,6 @@ static unsigned long power10_idle_stop(unsigned long psscr, bool mmu_on)
if (!(psscr & (PSSCR_EC|PSSCR_ESL))) {
/* EC=ESL=0 case */
- BUG_ON(!mmu_on);
-
/*
* Wake synchronously. SRESET via xscom may still cause
* a 0x100 powersave wakeup with SRR1 reason!
@@ -991,8 +986,7 @@ core_woken:
__slb_restore_bolted_realmode();
out:
- if (mmu_on)
- mtmsr(MSR_KERNEL);
+ mtmsr(MSR_KERNEL);
return srr1;
}
@@ -1002,40 +996,10 @@ static unsigned long arch300_offline_stop(unsigned long psscr)
{
unsigned long srr1;
-#ifndef CONFIG_KVM_BOOK3S_HV_POSSIBLE
- __ppc64_runlatch_off();
if (cpu_has_feature(CPU_FTR_ARCH_31))
- srr1 = power10_idle_stop(psscr, true);
+ srr1 = power10_idle_stop(psscr);
else
- srr1 = power9_idle_stop(psscr, true);
- __ppc64_runlatch_on();
-#else
- /*
- * Tell KVM we're entering idle.
- * This does not have to be done in real mode because the P9 MMU
- * is independent per-thread. Some steppings share radix/hash mode
- * between threads, but in that case KVM has a barrier sync in real
- * mode before and after switching between radix and hash.
- *
- * kvm_start_guest must still be called in real mode though, hence
- * the false argument.
- */
- local_paca->kvm_hstate.hwthread_state = KVM_HWTHREAD_IN_IDLE;
-
- __ppc64_runlatch_off();
- if (cpu_has_feature(CPU_FTR_ARCH_31))
- srr1 = power10_idle_stop(psscr, false);
- else
- srr1 = power9_idle_stop(psscr, false);
- __ppc64_runlatch_on();
-
- local_paca->kvm_hstate.hwthread_state = KVM_HWTHREAD_IN_KERNEL;
- /* Order setting hwthread_state vs. testing hwthread_req */
- smp_mb();
- if (local_paca->kvm_hstate.hwthread_req)
- srr1 = idle_kvm_start_guest(srr1);
- mtmsr(MSR_KERNEL);
-#endif
+ srr1 = power9_idle_stop(psscr);
return srr1;
}
@@ -1055,9 +1019,9 @@ void arch300_idle_type(unsigned long stop_psscr_val,
__ppc64_runlatch_off();
if (cpu_has_feature(CPU_FTR_ARCH_31))
- srr1 = power10_idle_stop(psscr, true);
+ srr1 = power10_idle_stop(psscr);
else
- srr1 = power9_idle_stop(psscr, true);
+ srr1 = power9_idle_stop(psscr);
__ppc64_runlatch_on();
fini_irq_for_idle_irqsoff();
diff --git a/arch/s390/include/asm/kvm_host.h b/arch/s390/include/asm/kvm_host.h
index 8925f3969478..9b4473f76e56 100644
--- a/arch/s390/include/asm/kvm_host.h
+++ b/arch/s390/include/asm/kvm_host.h
@@ -361,6 +361,7 @@ struct sie_page {
};
struct kvm_vcpu_stat {
+ struct kvm_vcpu_stat_generic generic;
u64 exit_userspace;
u64 exit_null;
u64 exit_external_request;
@@ -370,13 +371,7 @@ struct kvm_vcpu_stat {
u64 exit_validity;
u64 exit_instruction;
u64 exit_pei;
- u64 halt_successful_poll;
- u64 halt_attempted_poll;
- u64 halt_poll_invalid;
u64 halt_no_poll_steal;
- u64 halt_wakeup;
- u64 halt_poll_success_ns;
- u64 halt_poll_fail_ns;
u64 instruction_lctl;
u64 instruction_lctlg;
u64 instruction_stctl;
@@ -755,12 +750,12 @@ struct kvm_vcpu_arch {
};
struct kvm_vm_stat {
+ struct kvm_vm_stat_generic generic;
u64 inject_io;
u64 inject_float_mchk;
u64 inject_pfault_done;
u64 inject_service_signal;
u64 inject_virtio;
- u64 remote_tlb_flush;
};
struct kvm_arch_memory_slot {
diff --git a/arch/s390/kvm/Makefile b/arch/s390/kvm/Makefile
index 12decca22e7c..b3aaadc60ead 100644
--- a/arch/s390/kvm/Makefile
+++ b/arch/s390/kvm/Makefile
@@ -4,7 +4,8 @@
# Copyright IBM Corp. 2008
KVM := ../../../virt/kvm
-common-objs = $(KVM)/kvm_main.o $(KVM)/eventfd.o $(KVM)/async_pf.o $(KVM)/irqchip.o $(KVM)/vfio.o
+common-objs = $(KVM)/kvm_main.o $(KVM)/eventfd.o $(KVM)/async_pf.o \
+ $(KVM)/irqchip.o $(KVM)/vfio.o $(KVM)/binary_stats.o
ccflags-y := -Ivirt/kvm -Iarch/s390/kvm
diff --git a/arch/s390/kvm/kvm-s390.c b/arch/s390/kvm/kvm-s390.c
index 1296fc10f80c..f9fb1e1d960d 100644
--- a/arch/s390/kvm/kvm-s390.c
+++ b/arch/s390/kvm/kvm-s390.c
@@ -58,112 +58,132 @@
#define VCPU_IRQS_MAX_BUF (sizeof(struct kvm_s390_irq) * \
(KVM_MAX_VCPUS + LOCAL_IRQS))
-struct kvm_stats_debugfs_item debugfs_entries[] = {
- VCPU_STAT("userspace_handled", exit_userspace),
- VCPU_STAT("exit_null", exit_null),
- VCPU_STAT("pfault_sync", pfault_sync),
- VCPU_STAT("exit_validity", exit_validity),
- VCPU_STAT("exit_stop_request", exit_stop_request),
- VCPU_STAT("exit_external_request", exit_external_request),
- VCPU_STAT("exit_io_request", exit_io_request),
- VCPU_STAT("exit_external_interrupt", exit_external_interrupt),
- VCPU_STAT("exit_instruction", exit_instruction),
- VCPU_STAT("exit_pei", exit_pei),
- VCPU_STAT("exit_program_interruption", exit_program_interruption),
- VCPU_STAT("exit_instr_and_program_int", exit_instr_and_program),
- VCPU_STAT("exit_operation_exception", exit_operation_exception),
- VCPU_STAT("halt_successful_poll", halt_successful_poll),
- VCPU_STAT("halt_attempted_poll", halt_attempted_poll),
- VCPU_STAT("halt_poll_invalid", halt_poll_invalid),
- VCPU_STAT("halt_no_poll_steal", halt_no_poll_steal),
- VCPU_STAT("halt_wakeup", halt_wakeup),
- VCPU_STAT("halt_poll_success_ns", halt_poll_success_ns),
- VCPU_STAT("halt_poll_fail_ns", halt_poll_fail_ns),
- VCPU_STAT("instruction_lctlg", instruction_lctlg),
- VCPU_STAT("instruction_lctl", instruction_lctl),
- VCPU_STAT("instruction_stctl", instruction_stctl),
- VCPU_STAT("instruction_stctg", instruction_stctg),
- VCPU_STAT("deliver_ckc", deliver_ckc),
- VCPU_STAT("deliver_cputm", deliver_cputm),
- VCPU_STAT("deliver_emergency_signal", deliver_emergency_signal),
- VCPU_STAT("deliver_external_call", deliver_external_call),
- VCPU_STAT("deliver_service_signal", deliver_service_signal),
- VCPU_STAT("deliver_virtio", deliver_virtio),
- VCPU_STAT("deliver_stop_signal", deliver_stop_signal),
- VCPU_STAT("deliver_prefix_signal", deliver_prefix_signal),
- VCPU_STAT("deliver_restart_signal", deliver_restart_signal),
- VCPU_STAT("deliver_program", deliver_program),
- VCPU_STAT("deliver_io", deliver_io),
- VCPU_STAT("deliver_machine_check", deliver_machine_check),
- VCPU_STAT("exit_wait_state", exit_wait_state),
- VCPU_STAT("inject_ckc", inject_ckc),
- VCPU_STAT("inject_cputm", inject_cputm),
- VCPU_STAT("inject_external_call", inject_external_call),
- VM_STAT("inject_float_mchk", inject_float_mchk),
- VCPU_STAT("inject_emergency_signal", inject_emergency_signal),
- VM_STAT("inject_io", inject_io),
- VCPU_STAT("inject_mchk", inject_mchk),
- VM_STAT("inject_pfault_done", inject_pfault_done),
- VCPU_STAT("inject_program", inject_program),
- VCPU_STAT("inject_restart", inject_restart),
- VM_STAT("inject_service_signal", inject_service_signal),
- VCPU_STAT("inject_set_prefix", inject_set_prefix),
- VCPU_STAT("inject_stop_signal", inject_stop_signal),
- VCPU_STAT("inject_pfault_init", inject_pfault_init),
- VM_STAT("inject_virtio", inject_virtio),
- VCPU_STAT("instruction_epsw", instruction_epsw),
- VCPU_STAT("instruction_gs", instruction_gs),
- VCPU_STAT("instruction_io_other", instruction_io_other),
- VCPU_STAT("instruction_lpsw", instruction_lpsw),
- VCPU_STAT("instruction_lpswe", instruction_lpswe),
- VCPU_STAT("instruction_pfmf", instruction_pfmf),
- VCPU_STAT("instruction_ptff", instruction_ptff),
- VCPU_STAT("instruction_stidp", instruction_stidp),
- VCPU_STAT("instruction_sck", instruction_sck),
- VCPU_STAT("instruction_sckpf", instruction_sckpf),
- VCPU_STAT("instruction_spx", instruction_spx),
- VCPU_STAT("instruction_stpx", instruction_stpx),
- VCPU_STAT("instruction_stap", instruction_stap),
- VCPU_STAT("instruction_iske", instruction_iske),
- VCPU_STAT("instruction_ri", instruction_ri),
- VCPU_STAT("instruction_rrbe", instruction_rrbe),
- VCPU_STAT("instruction_sske", instruction_sske),
- VCPU_STAT("instruction_ipte_interlock", instruction_ipte_interlock),
- VCPU_STAT("instruction_essa", instruction_essa),
- VCPU_STAT("instruction_stsi", instruction_stsi),
- VCPU_STAT("instruction_stfl", instruction_stfl),
- VCPU_STAT("instruction_tb", instruction_tb),
- VCPU_STAT("instruction_tpi", instruction_tpi),
- VCPU_STAT("instruction_tprot", instruction_tprot),
- VCPU_STAT("instruction_tsch", instruction_tsch),
- VCPU_STAT("instruction_sthyi", instruction_sthyi),
- VCPU_STAT("instruction_sie", instruction_sie),
- VCPU_STAT("instruction_sigp_sense", instruction_sigp_sense),
- VCPU_STAT("instruction_sigp_sense_running", instruction_sigp_sense_running),
- VCPU_STAT("instruction_sigp_external_call", instruction_sigp_external_call),
- VCPU_STAT("instruction_sigp_emergency", instruction_sigp_emergency),
- VCPU_STAT("instruction_sigp_cond_emergency", instruction_sigp_cond_emergency),
- VCPU_STAT("instruction_sigp_start", instruction_sigp_start),
- VCPU_STAT("instruction_sigp_stop", instruction_sigp_stop),
- VCPU_STAT("instruction_sigp_stop_store_status", instruction_sigp_stop_store_status),
- VCPU_STAT("instruction_sigp_store_status", instruction_sigp_store_status),
- VCPU_STAT("instruction_sigp_store_adtl_status", instruction_sigp_store_adtl_status),
- VCPU_STAT("instruction_sigp_set_arch", instruction_sigp_arch),
- VCPU_STAT("instruction_sigp_set_prefix", instruction_sigp_prefix),
- VCPU_STAT("instruction_sigp_restart", instruction_sigp_restart),
- VCPU_STAT("instruction_sigp_cpu_reset", instruction_sigp_cpu_reset),
- VCPU_STAT("instruction_sigp_init_cpu_reset", instruction_sigp_init_cpu_reset),
- VCPU_STAT("instruction_sigp_unknown", instruction_sigp_unknown),
- VCPU_STAT("instruction_diag_10", diagnose_10),
- VCPU_STAT("instruction_diag_44", diagnose_44),
- VCPU_STAT("instruction_diag_9c", diagnose_9c),
- VCPU_STAT("diag_9c_ignored", diagnose_9c_ignored),
- VCPU_STAT("diag_9c_forward", diagnose_9c_forward),
- VCPU_STAT("instruction_diag_258", diagnose_258),
- VCPU_STAT("instruction_diag_308", diagnose_308),
- VCPU_STAT("instruction_diag_500", diagnose_500),
- VCPU_STAT("instruction_diag_other", diagnose_other),
- { NULL }
+const struct _kvm_stats_desc kvm_vm_stats_desc[] = {
+ KVM_GENERIC_VM_STATS(),
+ STATS_DESC_COUNTER(VM, inject_io),
+ STATS_DESC_COUNTER(VM, inject_float_mchk),
+ STATS_DESC_COUNTER(VM, inject_pfault_done),
+ STATS_DESC_COUNTER(VM, inject_service_signal),
+ STATS_DESC_COUNTER(VM, inject_virtio)
+};
+static_assert(ARRAY_SIZE(kvm_vm_stats_desc) ==
+ sizeof(struct kvm_vm_stat) / sizeof(u64));
+
+const struct kvm_stats_header kvm_vm_stats_header = {
+ .name_size = KVM_STATS_NAME_SIZE,
+ .num_desc = ARRAY_SIZE(kvm_vm_stats_desc),
+ .id_offset = sizeof(struct kvm_stats_header),
+ .desc_offset = sizeof(struct kvm_stats_header) + KVM_STATS_NAME_SIZE,
+ .data_offset = sizeof(struct kvm_stats_header) + KVM_STATS_NAME_SIZE +
+ sizeof(kvm_vm_stats_desc),
+};
+
+const struct _kvm_stats_desc kvm_vcpu_stats_desc[] = {
+ KVM_GENERIC_VCPU_STATS(),
+ STATS_DESC_COUNTER(VCPU, exit_userspace),
+ STATS_DESC_COUNTER(VCPU, exit_null),
+ STATS_DESC_COUNTER(VCPU, exit_external_request),
+ STATS_DESC_COUNTER(VCPU, exit_io_request),
+ STATS_DESC_COUNTER(VCPU, exit_external_interrupt),
+ STATS_DESC_COUNTER(VCPU, exit_stop_request),
+ STATS_DESC_COUNTER(VCPU, exit_validity),
+ STATS_DESC_COUNTER(VCPU, exit_instruction),
+ STATS_DESC_COUNTER(VCPU, exit_pei),
+ STATS_DESC_COUNTER(VCPU, halt_no_poll_steal),
+ STATS_DESC_COUNTER(VCPU, instruction_lctl),
+ STATS_DESC_COUNTER(VCPU, instruction_lctlg),
+ STATS_DESC_COUNTER(VCPU, instruction_stctl),
+ STATS_DESC_COUNTER(VCPU, instruction_stctg),
+ STATS_DESC_COUNTER(VCPU, exit_program_interruption),
+ STATS_DESC_COUNTER(VCPU, exit_instr_and_program),
+ STATS_DESC_COUNTER(VCPU, exit_operation_exception),
+ STATS_DESC_COUNTER(VCPU, deliver_ckc),
+ STATS_DESC_COUNTER(VCPU, deliver_cputm),
+ STATS_DESC_COUNTER(VCPU, deliver_external_call),
+ STATS_DESC_COUNTER(VCPU, deliver_emergency_signal),
+ STATS_DESC_COUNTER(VCPU, deliver_service_signal),
+ STATS_DESC_COUNTER(VCPU, deliver_virtio),
+ STATS_DESC_COUNTER(VCPU, deliver_stop_signal),
+ STATS_DESC_COUNTER(VCPU, deliver_prefix_signal),
+ STATS_DESC_COUNTER(VCPU, deliver_restart_signal),
+ STATS_DESC_COUNTER(VCPU, deliver_program),
+ STATS_DESC_COUNTER(VCPU, deliver_io),
+ STATS_DESC_COUNTER(VCPU, deliver_machine_check),
+ STATS_DESC_COUNTER(VCPU, exit_wait_state),
+ STATS_DESC_COUNTER(VCPU, inject_ckc),
+ STATS_DESC_COUNTER(VCPU, inject_cputm),
+ STATS_DESC_COUNTER(VCPU, inject_external_call),
+ STATS_DESC_COUNTER(VCPU, inject_emergency_signal),
+ STATS_DESC_COUNTER(VCPU, inject_mchk),
+ STATS_DESC_COUNTER(VCPU, inject_pfault_init),
+ STATS_DESC_COUNTER(VCPU, inject_program),
+ STATS_DESC_COUNTER(VCPU, inject_restart),
+ STATS_DESC_COUNTER(VCPU, inject_set_prefix),
+ STATS_DESC_COUNTER(VCPU, inject_stop_signal),
+ STATS_DESC_COUNTER(VCPU, instruction_epsw),
+ STATS_DESC_COUNTER(VCPU, instruction_gs),
+ STATS_DESC_COUNTER(VCPU, instruction_io_other),
+ STATS_DESC_COUNTER(VCPU, instruction_lpsw),
+ STATS_DESC_COUNTER(VCPU, instruction_lpswe),
+ STATS_DESC_COUNTER(VCPU, instruction_pfmf),
+ STATS_DESC_COUNTER(VCPU, instruction_ptff),
+ STATS_DESC_COUNTER(VCPU, instruction_sck),
+ STATS_DESC_COUNTER(VCPU, instruction_sckpf),
+ STATS_DESC_COUNTER(VCPU, instruction_stidp),
+ STATS_DESC_COUNTER(VCPU, instruction_spx),
+ STATS_DESC_COUNTER(VCPU, instruction_stpx),
+ STATS_DESC_COUNTER(VCPU, instruction_stap),
+ STATS_DESC_COUNTER(VCPU, instruction_iske),
+ STATS_DESC_COUNTER(VCPU, instruction_ri),
+ STATS_DESC_COUNTER(VCPU, instruction_rrbe),
+ STATS_DESC_COUNTER(VCPU, instruction_sske),
+ STATS_DESC_COUNTER(VCPU, instruction_ipte_interlock),
+ STATS_DESC_COUNTER(VCPU, instruction_stsi),
+ STATS_DESC_COUNTER(VCPU, instruction_stfl),
+ STATS_DESC_COUNTER(VCPU, instruction_tb),
+ STATS_DESC_COUNTER(VCPU, instruction_tpi),
+ STATS_DESC_COUNTER(VCPU, instruction_tprot),
+ STATS_DESC_COUNTER(VCPU, instruction_tsch),
+ STATS_DESC_COUNTER(VCPU, instruction_sie),
+ STATS_DESC_COUNTER(VCPU, instruction_essa),
+ STATS_DESC_COUNTER(VCPU, instruction_sthyi),
+ STATS_DESC_COUNTER(VCPU, instruction_sigp_sense),
+ STATS_DESC_COUNTER(VCPU, instruction_sigp_sense_running),
+ STATS_DESC_COUNTER(VCPU, instruction_sigp_external_call),
+ STATS_DESC_COUNTER(VCPU, instruction_sigp_emergency),
+ STATS_DESC_COUNTER(VCPU, instruction_sigp_cond_emergency),
+ STATS_DESC_COUNTER(VCPU, instruction_sigp_start),
+ STATS_DESC_COUNTER(VCPU, instruction_sigp_stop),
+ STATS_DESC_COUNTER(VCPU, instruction_sigp_stop_store_status),
+ STATS_DESC_COUNTER(VCPU, instruction_sigp_store_status),
+ STATS_DESC_COUNTER(VCPU, instruction_sigp_store_adtl_status),
+ STATS_DESC_COUNTER(VCPU, instruction_sigp_arch),
+ STATS_DESC_COUNTER(VCPU, instruction_sigp_prefix),
+ STATS_DESC_COUNTER(VCPU, instruction_sigp_restart),
+ STATS_DESC_COUNTER(VCPU, instruction_sigp_init_cpu_reset),
+ STATS_DESC_COUNTER(VCPU, instruction_sigp_cpu_reset),
+ STATS_DESC_COUNTER(VCPU, instruction_sigp_unknown),
+ STATS_DESC_COUNTER(VCPU, diagnose_10),
+ STATS_DESC_COUNTER(VCPU, diagnose_44),
+ STATS_DESC_COUNTER(VCPU, diagnose_9c),
+ STATS_DESC_COUNTER(VCPU, diagnose_9c_ignored),
+ STATS_DESC_COUNTER(VCPU, diagnose_9c_forward),
+ STATS_DESC_COUNTER(VCPU, diagnose_258),
+ STATS_DESC_COUNTER(VCPU, diagnose_308),
+ STATS_DESC_COUNTER(VCPU, diagnose_500),
+ STATS_DESC_COUNTER(VCPU, diagnose_other),
+ STATS_DESC_COUNTER(VCPU, pfault_sync)
+};
+static_assert(ARRAY_SIZE(kvm_vcpu_stats_desc) ==
+ sizeof(struct kvm_vcpu_stat) / sizeof(u64));
+
+const struct kvm_stats_header kvm_vcpu_stats_header = {
+ .name_size = KVM_STATS_NAME_SIZE,
+ .num_desc = ARRAY_SIZE(kvm_vcpu_stats_desc),
+ .id_offset = sizeof(struct kvm_stats_header),
+ .desc_offset = sizeof(struct kvm_stats_header) + KVM_STATS_NAME_SIZE,
+ .data_offset = sizeof(struct kvm_stats_header) + KVM_STATS_NAME_SIZE +
+ sizeof(kvm_vcpu_stats_desc),
};
/* allow nested virtualization in KVM (if enabled by user space) */
@@ -329,31 +349,31 @@ static void allow_cpu_feat(unsigned long nr)
static inline int plo_test_bit(unsigned char nr)
{
- register unsigned long r0 asm("0") = (unsigned long) nr | 0x100;
+ unsigned long function = (unsigned long)nr | 0x100;
int cc;
asm volatile(
+ " lgr 0,%[function]\n"
/* Parameter registers are ignored for "test bit" */
" plo 0,0,0,0(0)\n"
" ipm %0\n"
" srl %0,28\n"
: "=d" (cc)
- : "d" (r0)
- : "cc");
+ : [function] "d" (function)
+ : "cc", "0");
return cc == 0;
}
static __always_inline void __insn32_query(unsigned int opcode, u8 *query)
{
- register unsigned long r0 asm("0") = 0; /* query function */
- register unsigned long r1 asm("1") = (unsigned long) query;
-
asm volatile(
- /* Parameter regs are ignored */
+ " lghi 0,0\n"
+ " lgr 1,%[query]\n"
+ /* Parameter registers are ignored */
" .insn rrf,%[opc] << 16,2,4,6,0\n"
:
- : "d" (r0), "a" (r1), [opc] "i" (opcode)
- : "cc", "memory");
+ : [query] "d" ((unsigned long)query), [opc] "i" (opcode)
+ : "cc", "memory", "0", "1");
}
#define INSN_SORTL 0xb938
@@ -713,6 +733,10 @@ int kvm_vm_ioctl_enable_cap(struct kvm *kvm, struct kvm_enable_cap *cap)
set_kvm_facility(kvm->arch.model.fac_mask, 152);
set_kvm_facility(kvm->arch.model.fac_list, 152);
}
+ if (test_facility(192)) {
+ set_kvm_facility(kvm->arch.model.fac_mask, 192);
+ set_kvm_facility(kvm->arch.model.fac_list, 192);
+ }
r = 0;
} else
r = -EINVAL;
diff --git a/arch/s390/tools/gen_facilities.c b/arch/s390/tools/gen_facilities.c
index 61ce5b59b828..606324e56e4e 100644
--- a/arch/s390/tools/gen_facilities.c
+++ b/arch/s390/tools/gen_facilities.c
@@ -115,6 +115,10 @@ static struct facility_def facility_defs[] = {
12, /* AP Query Configuration Information */
15, /* AP Facilities Test */
156, /* etoken facility */
+ 165, /* nnpa facility */
+ 193, /* bear enhancement facility */
+ 194, /* rdp enhancement facility */
+ 196, /* processor activity instrumentation facility */
-1 /* END */
}
},
diff --git a/arch/x86/include/asm/hyperv-tlfs.h b/arch/x86/include/asm/hyperv-tlfs.h
index 606f5cc579b2..f1366ce609e3 100644
--- a/arch/x86/include/asm/hyperv-tlfs.h
+++ b/arch/x86/include/asm/hyperv-tlfs.h
@@ -52,7 +52,7 @@
* Support for passing hypercall input parameter block via XMM
* registers is available
*/
-#define HV_X64_HYPERCALL_PARAMS_XMM_AVAILABLE BIT(4)
+#define HV_X64_HYPERCALL_XMM_INPUT_AVAILABLE BIT(4)
/* Support for a virtual guest idle state is available */
#define HV_X64_GUEST_IDLE_STATE_AVAILABLE BIT(5)
/* Frequency MSRs available */
@@ -61,6 +61,11 @@
#define HV_FEATURE_GUEST_CRASH_MSR_AVAILABLE BIT(10)
/* Support for debug MSRs available */
#define HV_FEATURE_DEBUG_MSRS_AVAILABLE BIT(11)
+/*
+ * Support for returning hypercall output block via XMM
+ * registers is available
+ */
+#define HV_X64_HYPERCALL_XMM_OUTPUT_AVAILABLE BIT(15)
/* stimer Direct Mode is available */
#define HV_STIMER_DIRECT_MODE_AVAILABLE BIT(19)
@@ -133,6 +138,15 @@
#define HV_X64_NESTED_GUEST_MAPPING_FLUSH BIT(18)
#define HV_X64_NESTED_MSR_BITMAP BIT(19)
+/*
+ * This is specific to AMD and specifies that enlightened TLB flush is
+ * supported. If guest opts in to this feature, ASID invalidations only
+ * flushes gva -> hpa mapping entries. To flush the TLB entries derived
+ * from NPT, hypercalls should be used (HvFlushGuestPhysicalAddressSpace
+ * or HvFlushGuestPhysicalAddressList).
+ */
+#define HV_X64_NESTED_ENLIGHTENED_TLB BIT(22)
+
/* HYPERV_CPUID_ISOLATION_CONFIG.EAX bits. */
#define HV_PARAVISOR_PRESENT BIT(0)
@@ -314,6 +328,9 @@ struct hv_tsc_emulation_status {
#define HV_X64_MSR_TSC_REFERENCE_ENABLE 0x00000001
#define HV_X64_MSR_TSC_REFERENCE_ADDRESS_SHIFT 12
+/* Number of XMM registers used in hypercall input/output */
+#define HV_HYPERCALL_MAX_XMM_REGISTERS 6
+
struct hv_nested_enlightenments_control {
struct {
__u32 directhypercall:1;
diff --git a/arch/x86/include/asm/kvm-x86-ops.h b/arch/x86/include/asm/kvm-x86-ops.h
index e7bef91cee04..a12a4987154e 100644
--- a/arch/x86/include/asm/kvm-x86-ops.h
+++ b/arch/x86/include/asm/kvm-x86-ops.h
@@ -87,7 +87,10 @@ KVM_X86_OP(set_identity_map_addr)
KVM_X86_OP(get_mt_mask)
KVM_X86_OP(load_mmu_pgd)
KVM_X86_OP_NULL(has_wbinvd_exit)
-KVM_X86_OP(write_l1_tsc_offset)
+KVM_X86_OP(get_l2_tsc_offset)
+KVM_X86_OP(get_l2_tsc_multiplier)
+KVM_X86_OP(write_tsc_offset)
+KVM_X86_OP(write_tsc_multiplier)
KVM_X86_OP(get_exit_info)
KVM_X86_OP(check_intercept)
KVM_X86_OP(handle_exit_irqoff)
@@ -106,8 +109,8 @@ KVM_X86_OP_NULL(set_hv_timer)
KVM_X86_OP_NULL(cancel_hv_timer)
KVM_X86_OP(setup_mce)
KVM_X86_OP(smi_allowed)
-KVM_X86_OP(pre_enter_smm)
-KVM_X86_OP(pre_leave_smm)
+KVM_X86_OP(enter_smm)
+KVM_X86_OP(leave_smm)
KVM_X86_OP(enable_smi_window)
KVM_X86_OP_NULL(mem_enc_op)
KVM_X86_OP_NULL(mem_enc_reg_region)
diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index 9c7ced0e3171..974cbfb1eefe 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -85,7 +85,7 @@
#define KVM_REQ_APICV_UPDATE \
KVM_ARCH_REQ_FLAGS(25, KVM_REQUEST_WAIT | KVM_REQUEST_NO_WAKEUP)
#define KVM_REQ_TLB_FLUSH_CURRENT KVM_ARCH_REQ(26)
-#define KVM_REQ_HV_TLB_FLUSH \
+#define KVM_REQ_TLB_FLUSH_GUEST \
KVM_ARCH_REQ_FLAGS(27, KVM_REQUEST_NO_WAKEUP)
#define KVM_REQ_APF_READY KVM_ARCH_REQ(28)
#define KVM_REQ_MSR_FILTER_CHANGED KVM_ARCH_REQ(29)
@@ -269,12 +269,36 @@ enum x86_intercept_stage;
struct kvm_kernel_irq_routing_entry;
/*
- * the pages used as guest page table on soft mmu are tracked by
- * kvm_memory_slot.arch.gfn_track which is 16 bits, so the role bits used
- * by indirect shadow page can not be more than 15 bits.
+ * kvm_mmu_page_role tracks the properties of a shadow page (where shadow page
+ * also includes TDP pages) to determine whether or not a page can be used in
+ * the given MMU context. This is a subset of the overall kvm_mmu_role to
+ * minimize the size of kvm_memory_slot.arch.gfn_track, i.e. allows allocating
+ * 2 bytes per gfn instead of 4 bytes per gfn.
*
- * Currently, we used 14 bits that are @level, @gpte_is_8_bytes, @quadrant, @access,
- * @nxe, @cr0_wp, @smep_andnot_wp and @smap_andnot_wp.
+ * Indirect upper-level shadow pages are tracked for write-protection via
+ * gfn_track. As above, gfn_track is a 16 bit counter, so KVM must not create
+ * more than 2^16-1 upper-level shadow pages at a single gfn, otherwise
+ * gfn_track will overflow and explosions will ensure.
+ *
+ * A unique shadow page (SP) for a gfn is created if and only if an existing SP
+ * cannot be reused. The ability to reuse a SP is tracked by its role, which
+ * incorporates various mode bits and properties of the SP. Roughly speaking,
+ * the number of unique SPs that can theoretically be created is 2^n, where n
+ * is the number of bits that are used to compute the role.
+ *
+ * But, even though there are 18 bits in the mask below, not all combinations
+ * of modes and flags are possible. The maximum number of possible upper-level
+ * shadow pages for a single gfn is in the neighborhood of 2^13.
+ *
+ * - invalid shadow pages are not accounted.
+ * - level is effectively limited to four combinations, not 16 as the number
+ * bits would imply, as 4k SPs are not tracked (allowed to go unsync).
+ * - level is effectively unused for non-PAE paging because there is exactly
+ * one upper level (see 4k SP exception above).
+ * - quadrant is used only for non-PAE paging and is exclusive with
+ * gpte_is_8_bytes.
+ * - execonly and ad_disabled are used only for nested EPT, which makes it
+ * exclusive with quadrant.
*/
union kvm_mmu_page_role {
u32 word;
@@ -285,7 +309,7 @@ union kvm_mmu_page_role {
unsigned direct:1;
unsigned access:3;
unsigned invalid:1;
- unsigned nxe:1;
+ unsigned efer_nx:1;
unsigned cr0_wp:1;
unsigned smep_andnot_wp:1;
unsigned smap_andnot_wp:1;
@@ -303,13 +327,26 @@ union kvm_mmu_page_role {
};
};
-union kvm_mmu_extended_role {
/*
- * This structure complements kvm_mmu_page_role caching everything needed for
- * MMU configuration. If nothing in both these structures changed, MMU
- * re-configuration can be skipped. @valid bit is set on first usage so we don't
- * treat all-zero structure as valid data.
+ * kvm_mmu_extended_role complements kvm_mmu_page_role, tracking properties
+ * relevant to the current MMU configuration. When loading CR0, CR4, or EFER,
+ * including on nested transitions, if nothing in the full role changes then
+ * MMU re-configuration can be skipped. @valid bit is set on first usage so we
+ * don't treat all-zero structure as valid data.
+ *
+ * The properties that are tracked in the extended role but not the page role
+ * are for things that either (a) do not affect the validity of the shadow page
+ * or (b) are indirectly reflected in the shadow page's role. For example,
+ * CR4.PKE only affects permission checks for software walks of the guest page
+ * tables (because KVM doesn't support Protection Keys with shadow paging), and
+ * CR0.PG, CR4.PAE, and CR4.PSE are indirectly reflected in role.level.
+ *
+ * Note, SMEP and SMAP are not redundant with sm*p_andnot_wp in the page role.
+ * If CR0.WP=1, KVM can reuse shadow pages for the guest regardless of SMEP and
+ * SMAP, but the MMU's permission checks for software walks need to be SMEP and
+ * SMAP aware regardless of CR0.WP.
*/
+union kvm_mmu_extended_role {
u32 word;
struct {
unsigned int valid:1;
@@ -320,7 +357,7 @@ union kvm_mmu_extended_role {
unsigned int cr4_pke:1;
unsigned int cr4_smap:1;
unsigned int cr4_smep:1;
- unsigned int maxphyaddr:6;
+ unsigned int cr4_la57:1;
};
};
@@ -420,11 +457,6 @@ struct kvm_mmu {
struct rsvd_bits_validate guest_rsvd_check;
- /* Can have large pages at levels 2..last_nonleaf_level-1. */
- u8 last_nonleaf_level;
-
- bool nx;
-
u64 pdptrs[4]; /* pae */
};
@@ -543,6 +575,15 @@ struct kvm_vcpu_hv {
struct kvm_vcpu_hv_stimer stimer[HV_SYNIC_STIMER_COUNT];
DECLARE_BITMAP(stimer_pending_bitmap, HV_SYNIC_STIMER_COUNT);
cpumask_t tlb_flush;
+ bool enforce_cpuid;
+ struct {
+ u32 features_eax; /* HYPERV_CPUID_FEATURES.EAX */
+ u32 features_ebx; /* HYPERV_CPUID_FEATURES.EBX */
+ u32 features_edx; /* HYPERV_CPUID_FEATURES.EDX */
+ u32 enlightenments_eax; /* HYPERV_CPUID_ENLIGHTMENT_INFO.EAX */
+ u32 enlightenments_ebx; /* HYPERV_CPUID_ENLIGHTMENT_INFO.EBX */
+ u32 syndbg_cap_eax; /* HYPERV_CPUID_SYNDBG_PLATFORM_CAPABILITIES.EAX */
+ } cpuid_cache;
};
/* Xen HVM per vcpu emulation context */
@@ -707,7 +748,7 @@ struct kvm_vcpu_arch {
} st;
u64 l1_tsc_offset;
- u64 tsc_offset;
+ u64 tsc_offset; /* current tsc offset */
u64 last_guest_tsc;
u64 last_host_tsc;
u64 tsc_offset_adjustment;
@@ -721,7 +762,8 @@ struct kvm_vcpu_arch {
u32 virtual_tsc_khz;
s64 ia32_tsc_adjust_msr;
u64 msr_ia32_power_ctl;
- u64 tsc_scaling_ratio;
+ u64 l1_tsc_scaling_ratio;
+ u64 tsc_scaling_ratio; /* current scaling ratio */
atomic_t nmi_queued; /* unprocessed asynchronous NMIs */
unsigned nmi_pending; /* NMI queued after currently running handler */
@@ -829,7 +871,7 @@ struct kvm_vcpu_arch {
bool l1tf_flush_l1d;
/* Host CPU on which VM-entry was most recently attempted */
- unsigned int last_vmentry_cpu;
+ int last_vmentry_cpu;
/* AMD MSRC001_0015 Hardware Configuration */
u64 msr_hwcr;
@@ -851,6 +893,16 @@ struct kvm_vcpu_arch {
/* Protected Guests */
bool guest_state_protected;
+
+ /*
+ * Set when PDPTS were loaded directly by the userspace without
+ * reading the guest memory
+ */
+ bool pdptrs_from_userspace;
+
+#if IS_ENABLED(CONFIG_HYPERV)
+ hpa_t hv_root_tdp;
+#endif
};
struct kvm_lpage_info {
@@ -1002,7 +1054,7 @@ struct kvm_arch {
struct kvm_apic_map __rcu *apic_map;
atomic_t apic_map_dirty;
- bool apic_access_page_done;
+ bool apic_access_memslot_enabled;
unsigned long apicv_inhibit_reasons;
gpa_t wall_clock;
@@ -1062,11 +1114,19 @@ struct kvm_arch {
bool exception_payload_enabled;
bool bus_lock_detection_enabled;
+ /*
+ * If exit_on_emulation_error is set, and the in-kernel instruction
+ * emulator fails to emulate an instruction, allow userspace
+ * the opportunity to look at it.
+ */
+ bool exit_on_emulation_error;
/* Deflect RDMSR and WRMSR to user space when they trigger a #GP */
u32 user_space_msr_mask;
struct kvm_x86_msr_filter __rcu *msr_filter;
+ u32 hypercall_exit_enabled;
+
/* Guest can access the SGX PROVISIONKEY. */
bool sgx_provisioning_allowed;
@@ -1124,23 +1184,35 @@ struct kvm_arch {
*/
spinlock_t tdp_mmu_pages_lock;
#endif /* CONFIG_X86_64 */
+
+ /*
+ * If set, rmaps have been allocated for all memslots and should be
+ * allocated for any newly created or modified memslots.
+ */
+ bool memslots_have_rmaps;
+
+#if IS_ENABLED(CONFIG_HYPERV)
+ hpa_t hv_root_tdp;
+ spinlock_t hv_root_tdp_lock;
+#endif
};
struct kvm_vm_stat {
- ulong mmu_shadow_zapped;
- ulong mmu_pte_write;
- ulong mmu_pde_zapped;
- ulong mmu_flooded;
- ulong mmu_recycled;
- ulong mmu_cache_miss;
- ulong mmu_unsync;
- ulong remote_tlb_flush;
- ulong lpages;
- ulong nx_lpage_splits;
- ulong max_mmu_page_hash_collisions;
+ struct kvm_vm_stat_generic generic;
+ u64 mmu_shadow_zapped;
+ u64 mmu_pte_write;
+ u64 mmu_pde_zapped;
+ u64 mmu_flooded;
+ u64 mmu_recycled;
+ u64 mmu_cache_miss;
+ u64 mmu_unsync;
+ u64 lpages;
+ u64 nx_lpage_splits;
+ u64 max_mmu_page_hash_collisions;
};
struct kvm_vcpu_stat {
+ struct kvm_vcpu_stat_generic generic;
u64 pf_fixed;
u64 pf_guest;
u64 tlb_flush;
@@ -1154,10 +1226,6 @@ struct kvm_vcpu_stat {
u64 nmi_window_exits;
u64 l1d_flush;
u64 halt_exits;
- u64 halt_successful_poll;
- u64 halt_attempted_poll;
- u64 halt_poll_invalid;
- u64 halt_wakeup;
u64 request_irq_exits;
u64 irq_exits;
u64 host_state_reload;
@@ -1168,11 +1236,10 @@ struct kvm_vcpu_stat {
u64 irq_injections;
u64 nmi_injections;
u64 req_event;
- u64 halt_poll_success_ns;
- u64 halt_poll_fail_ns;
u64 nested_run;
u64 directed_yield_attempted;
u64 directed_yield_successful;
+ u64 guest_mode;
};
struct x86_instruction_info;
@@ -1304,8 +1371,10 @@ struct kvm_x86_ops {
bool (*has_wbinvd_exit)(void);
- /* Returns actual tsc_offset set in active VMCS */
- u64 (*write_l1_tsc_offset)(struct kvm_vcpu *vcpu, u64 offset);
+ u64 (*get_l2_tsc_offset)(struct kvm_vcpu *vcpu);
+ u64 (*get_l2_tsc_multiplier)(struct kvm_vcpu *vcpu);
+ void (*write_tsc_offset)(struct kvm_vcpu *vcpu, u64 offset);
+ void (*write_tsc_multiplier)(struct kvm_vcpu *vcpu, u64 multiplier);
/*
* Retrieve somewhat arbitrary exit information. Intended to be used
@@ -1363,8 +1432,8 @@ struct kvm_x86_ops {
void (*setup_mce)(struct kvm_vcpu *vcpu);
int (*smi_allowed)(struct kvm_vcpu *vcpu, bool for_injection);
- int (*pre_enter_smm)(struct kvm_vcpu *vcpu, char *smstate);
- int (*pre_leave_smm)(struct kvm_vcpu *vcpu, const char *smstate);
+ int (*enter_smm)(struct kvm_vcpu *vcpu, char *smstate);
+ int (*leave_smm)(struct kvm_vcpu *vcpu, const char *smstate);
void (*enable_smi_window)(struct kvm_vcpu *vcpu);
int (*mem_enc_op)(struct kvm *kvm, void __user *argp);
@@ -1423,6 +1492,7 @@ struct kvm_arch_async_pf {
extern u32 __read_mostly kvm_nr_uret_msrs;
extern u64 __read_mostly host_efer;
extern bool __read_mostly allow_smaller_maxphyaddr;
+extern bool __read_mostly enable_apicv;
extern struct kvm_x86_ops kvm_x86_ops;
#define KVM_X86_OP(func) \
@@ -1463,6 +1533,7 @@ int kvm_mmu_create(struct kvm_vcpu *vcpu);
void kvm_mmu_init_vm(struct kvm *kvm);
void kvm_mmu_uninit_vm(struct kvm *kvm);
+void kvm_mmu_after_set_cpuid(struct kvm_vcpu *vcpu);
void kvm_mmu_reset_context(struct kvm_vcpu *vcpu);
void kvm_mmu_slot_remove_write_access(struct kvm *kvm,
struct kvm_memory_slot *memslot,
@@ -1477,7 +1548,6 @@ unsigned long kvm_mmu_calculate_default_mmu_pages(struct kvm *kvm);
void kvm_mmu_change_mmu_pages(struct kvm *kvm, unsigned long kvm_nr_mmu_pages);
int load_pdptrs(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu, unsigned long cr3);
-bool pdptrs_changed(struct kvm_vcpu *vcpu);
int emulator_write_phys(struct kvm_vcpu *vcpu, gpa_t gpa,
const void *val, int bytes);
@@ -1650,6 +1720,7 @@ int kvm_mmu_unprotect_page(struct kvm *kvm, gfn_t gfn);
void __kvm_mmu_free_some_pages(struct kvm_vcpu *vcpu);
void kvm_mmu_free_roots(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu,
ulong roots_to_free);
+void kvm_mmu_free_guest_mode_roots(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu);
gpa_t translate_nested_gpa(struct kvm_vcpu *vcpu, gpa_t gpa, u32 access,
struct x86_exception *exception);
gpa_t kvm_mmu_gva_to_gpa_read(struct kvm_vcpu *vcpu, gva_t gva,
@@ -1662,7 +1733,6 @@ gpa_t kvm_mmu_gva_to_gpa_system(struct kvm_vcpu *vcpu, gva_t gva,
struct x86_exception *exception);
bool kvm_apicv_activated(struct kvm *kvm);
-void kvm_apicv_init(struct kvm *kvm, bool enable);
void kvm_vcpu_update_apicv(struct kvm_vcpu *vcpu);
void kvm_request_apicv_update(struct kvm *kvm, bool activate,
unsigned long bit);
@@ -1675,8 +1745,7 @@ void kvm_mmu_invlpg(struct kvm_vcpu *vcpu, gva_t gva);
void kvm_mmu_invalidate_gva(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu,
gva_t gva, hpa_t root_hpa);
void kvm_mmu_invpcid_gva(struct kvm_vcpu *vcpu, gva_t gva, unsigned long pcid);
-void kvm_mmu_new_pgd(struct kvm_vcpu *vcpu, gpa_t new_pgd, bool skip_tlb_flush,
- bool skip_mmu_sync);
+void kvm_mmu_new_pgd(struct kvm_vcpu *vcpu, gpa_t new_pgd);
void kvm_configure_mmu(bool enable_tdp, int tdp_max_root_level,
int tdp_huge_page_level);
@@ -1788,8 +1857,10 @@ static inline bool kvm_is_supported_user_return_msr(u32 msr)
return kvm_find_user_return_msr(msr) >= 0;
}
-u64 kvm_scale_tsc(struct kvm_vcpu *vcpu, u64 tsc);
+u64 kvm_scale_tsc(struct kvm_vcpu *vcpu, u64 tsc, u64 ratio);
u64 kvm_read_l1_tsc(struct kvm_vcpu *vcpu, u64 host_tsc);
+u64 kvm_calc_nested_tsc_offset(u64 l1_offset, u64 l2_offset, u64 l2_multiplier);
+u64 kvm_calc_nested_tsc_multiplier(u64 l1_multiplier, u64 l2_multiplier);
unsigned long kvm_get_linear_rip(struct kvm_vcpu *vcpu);
bool kvm_is_linear_rip(struct kvm_vcpu *vcpu, unsigned long linear_rip);
@@ -1863,4 +1934,6 @@ static inline int kvm_cpu_get_apicid(int mps_cpu)
int kvm_cpu_dirty_log_size(void);
+int alloc_all_memslots_rmaps(struct kvm *kvm);
+
#endif /* _ASM_X86_KVM_HOST_H */
diff --git a/arch/x86/include/asm/svm.h b/arch/x86/include/asm/svm.h
index 772e60efe243..e322676039f4 100644
--- a/arch/x86/include/asm/svm.h
+++ b/arch/x86/include/asm/svm.h
@@ -156,6 +156,12 @@ struct __attribute__ ((__packed__)) vmcb_control_area {
u64 avic_physical_id; /* Offset 0xf8 */
u8 reserved_7[8];
u64 vmsa_pa; /* Used for an SEV-ES guest */
+ u8 reserved_8[720];
+ /*
+ * Offset 0x3e0, 32 bytes reserved
+ * for use by hypervisor/software.
+ */
+ u8 reserved_sw[32];
};
@@ -314,7 +320,7 @@ struct ghcb {
#define EXPECTED_VMCB_SAVE_AREA_SIZE 1032
-#define EXPECTED_VMCB_CONTROL_AREA_SIZE 272
+#define EXPECTED_VMCB_CONTROL_AREA_SIZE 1024
#define EXPECTED_GHCB_SIZE PAGE_SIZE
static inline void __unused_size_checks(void)
@@ -326,7 +332,6 @@ static inline void __unused_size_checks(void)
struct vmcb {
struct vmcb_control_area control;
- u8 reserved_control[1024 - sizeof(struct vmcb_control_area)];
struct vmcb_save_area save;
} __packed;
diff --git a/arch/x86/include/uapi/asm/kvm.h b/arch/x86/include/uapi/asm/kvm.h
index 0662f644aad9..a6c327f8ad9e 100644
--- a/arch/x86/include/uapi/asm/kvm.h
+++ b/arch/x86/include/uapi/asm/kvm.h
@@ -159,6 +159,19 @@ struct kvm_sregs {
__u64 interrupt_bitmap[(KVM_NR_INTERRUPTS + 63) / 64];
};
+struct kvm_sregs2 {
+ /* out (KVM_GET_SREGS2) / in (KVM_SET_SREGS2) */
+ struct kvm_segment cs, ds, es, fs, gs, ss;
+ struct kvm_segment tr, ldt;
+ struct kvm_dtable gdt, idt;
+ __u64 cr0, cr2, cr3, cr4, cr8;
+ __u64 efer;
+ __u64 apic_base;
+ __u64 flags;
+ __u64 pdptrs[4];
+};
+#define KVM_SREGS2_FLAGS_PDPTRS_VALID 1
+
/* for KVM_GET_FPU and KVM_SET_FPU */
struct kvm_fpu {
__u8 fpr[8][16];
diff --git a/arch/x86/include/uapi/asm/kvm_para.h b/arch/x86/include/uapi/asm/kvm_para.h
index 950afebfba88..5146bbab84d4 100644
--- a/arch/x86/include/uapi/asm/kvm_para.h
+++ b/arch/x86/include/uapi/asm/kvm_para.h
@@ -33,6 +33,8 @@
#define KVM_FEATURE_PV_SCHED_YIELD 13
#define KVM_FEATURE_ASYNC_PF_INT 14
#define KVM_FEATURE_MSI_EXT_DEST_ID 15
+#define KVM_FEATURE_HC_MAP_GPA_RANGE 16
+#define KVM_FEATURE_MIGRATION_CONTROL 17
#define KVM_HINTS_REALTIME 0
@@ -54,6 +56,7 @@
#define MSR_KVM_POLL_CONTROL 0x4b564d05
#define MSR_KVM_ASYNC_PF_INT 0x4b564d06
#define MSR_KVM_ASYNC_PF_ACK 0x4b564d07
+#define MSR_KVM_MIGRATION_CONTROL 0x4b564d08
struct kvm_steal_time {
__u64 steal;
@@ -90,6 +93,16 @@ struct kvm_clock_pairing {
/* MSR_KVM_ASYNC_PF_INT */
#define KVM_ASYNC_PF_VEC_MASK GENMASK(7, 0)
+/* MSR_KVM_MIGRATION_CONTROL */
+#define KVM_MIGRATION_READY (1 << 0)
+
+/* KVM_HC_MAP_GPA_RANGE */
+#define KVM_MAP_GPA_RANGE_PAGE_SZ_4K 0
+#define KVM_MAP_GPA_RANGE_PAGE_SZ_2M (1 << 0)
+#define KVM_MAP_GPA_RANGE_PAGE_SZ_1G (1 << 1)
+#define KVM_MAP_GPA_RANGE_ENC_STAT(n) (n << 4)
+#define KVM_MAP_GPA_RANGE_ENCRYPTED KVM_MAP_GPA_RANGE_ENC_STAT(1)
+#define KVM_MAP_GPA_RANGE_DECRYPTED KVM_MAP_GPA_RANGE_ENC_STAT(0)
/* Operations for KVM_HC_MMU_OP */
#define KVM_MMU_OP_WRITE_PTE 1
diff --git a/arch/x86/include/uapi/asm/svm.h b/arch/x86/include/uapi/asm/svm.h
index 554f75fe013c..efa969325ede 100644
--- a/arch/x86/include/uapi/asm/svm.h
+++ b/arch/x86/include/uapi/asm/svm.h
@@ -110,6 +110,9 @@
#define SVM_VMGEXIT_GET_AP_JUMP_TABLE 1
#define SVM_VMGEXIT_UNSUPPORTED_EVENT 0x8000ffff
+/* Exit code reserved for hypervisor/software use */
+#define SVM_EXIT_SW 0xf0000000
+
#define SVM_EXIT_ERR -1
#define SVM_EXIT_REASONS \
diff --git a/arch/x86/kernel/cpu/mshyperv.c b/arch/x86/kernel/cpu/mshyperv.c
index 22f13343b5da..c268c2730048 100644
--- a/arch/x86/kernel/cpu/mshyperv.c
+++ b/arch/x86/kernel/cpu/mshyperv.c
@@ -252,6 +252,7 @@ static void __init hv_smp_prepare_cpus(unsigned int max_cpus)
static void __init ms_hyperv_init_platform(void)
{
+ int hv_max_functions_eax;
int hv_host_info_eax;
int hv_host_info_ebx;
int hv_host_info_ecx;
@@ -269,6 +270,8 @@ static void __init ms_hyperv_init_platform(void)
ms_hyperv.misc_features = cpuid_edx(HYPERV_CPUID_FEATURES);
ms_hyperv.hints = cpuid_eax(HYPERV_CPUID_ENLIGHTMENT_INFO);
+ hv_max_functions_eax = cpuid_eax(HYPERV_CPUID_VENDOR_AND_MAX_FUNCTIONS);
+
pr_info("Hyper-V: privilege flags low 0x%x, high 0x%x, hints 0x%x, misc 0x%x\n",
ms_hyperv.features, ms_hyperv.priv_high, ms_hyperv.hints,
ms_hyperv.misc_features);
@@ -298,8 +301,7 @@ static void __init ms_hyperv_init_platform(void)
/*
* Extract host information.
*/
- if (cpuid_eax(HYPERV_CPUID_VENDOR_AND_MAX_FUNCTIONS) >=
- HYPERV_CPUID_VERSION) {
+ if (hv_max_functions_eax >= HYPERV_CPUID_VERSION) {
hv_host_info_eax = cpuid_eax(HYPERV_CPUID_VERSION);
hv_host_info_ebx = cpuid_ebx(HYPERV_CPUID_VERSION);
hv_host_info_ecx = cpuid_ecx(HYPERV_CPUID_VERSION);
@@ -325,9 +327,11 @@ static void __init ms_hyperv_init_platform(void)
ms_hyperv.isolation_config_a, ms_hyperv.isolation_config_b);
}
- if (ms_hyperv.hints & HV_X64_ENLIGHTENED_VMCS_RECOMMENDED) {
+ if (hv_max_functions_eax >= HYPERV_CPUID_NESTED_FEATURES) {
ms_hyperv.nested_features =
cpuid_eax(HYPERV_CPUID_NESTED_FEATURES);
+ pr_info("Hyper-V: Nested features: 0x%x\n",
+ ms_hyperv.nested_features);
}
/*
diff --git a/arch/x86/kvm/Kconfig b/arch/x86/kvm/Kconfig
index fb8efb387aff..ac69894eab88 100644
--- a/arch/x86/kvm/Kconfig
+++ b/arch/x86/kvm/Kconfig
@@ -43,6 +43,7 @@ config KVM
select KVM_GENERIC_DIRTYLOG_READ_PROTECT
select KVM_VFIO
select SRCU
+ select HAVE_KVM_PM_NOTIFIER if PM
help
Support hosting fully virtualized guest machines using hardware
virtualization extensions. You will need a fairly recent
diff --git a/arch/x86/kvm/Makefile b/arch/x86/kvm/Makefile
index c589db5d91b3..75dfd27b6e8a 100644
--- a/arch/x86/kvm/Makefile
+++ b/arch/x86/kvm/Makefile
@@ -11,13 +11,18 @@ KVM := ../../../virt/kvm
kvm-y += $(KVM)/kvm_main.o $(KVM)/coalesced_mmio.o \
$(KVM)/eventfd.o $(KVM)/irqchip.o $(KVM)/vfio.o \
- $(KVM)/dirty_ring.o
+ $(KVM)/dirty_ring.o $(KVM)/binary_stats.o
kvm-$(CONFIG_KVM_ASYNC_PF) += $(KVM)/async_pf.o
kvm-y += x86.o emulate.o i8259.o irq.o lapic.o \
i8254.o ioapic.o irq_comm.o cpuid.o pmu.o mtrr.o \
hyperv.o debugfs.o mmu/mmu.o mmu/page_track.o \
mmu/spte.o
+
+ifdef CONFIG_HYPERV
+kvm-y += kvm_onhyperv.o
+endif
+
kvm-$(CONFIG_X86_64) += mmu/tdp_iter.o mmu/tdp_mmu.o
kvm-$(CONFIG_KVM_XEN) += xen.o
@@ -27,6 +32,10 @@ kvm-intel-$(CONFIG_X86_SGX_KVM) += vmx/sgx.o
kvm-amd-y += svm/svm.o svm/vmenter.o svm/pmu.o svm/nested.o svm/avic.o svm/sev.o
+ifdef CONFIG_HYPERV
+kvm-amd-y += svm/svm_onhyperv.o
+endif
+
obj-$(CONFIG_KVM) += kvm.o
obj-$(CONFIG_KVM_INTEL) += kvm-intel.o
obj-$(CONFIG_KVM_AMD) += kvm-amd.o
diff --git a/arch/x86/kvm/cpuid.c b/arch/x86/kvm/cpuid.c
index b4da665bb892..c42613cfb5ba 100644
--- a/arch/x86/kvm/cpuid.c
+++ b/arch/x86/kvm/cpuid.c
@@ -202,10 +202,10 @@ static void kvm_vcpu_after_set_cpuid(struct kvm_vcpu *vcpu)
static_call(kvm_x86_vcpu_after_set_cpuid)(vcpu);
/*
- * Except for the MMU, which needs to be reset after any vendor
- * specific adjustments to the reserved GPA bits.
+ * Except for the MMU, which needs to do its thing any vendor specific
+ * adjustments to the reserved GPA bits.
*/
- kvm_mmu_reset_context(vcpu);
+ kvm_mmu_after_set_cpuid(vcpu);
}
static int is_efer_nx(void)
diff --git a/arch/x86/kvm/debugfs.c b/arch/x86/kvm/debugfs.c
index 7e818d64bb4d..95a98413dc32 100644
--- a/arch/x86/kvm/debugfs.c
+++ b/arch/x86/kvm/debugfs.c
@@ -17,6 +17,15 @@ static int vcpu_get_timer_advance_ns(void *data, u64 *val)
DEFINE_SIMPLE_ATTRIBUTE(vcpu_timer_advance_ns_fops, vcpu_get_timer_advance_ns, NULL, "%llu\n");
+static int vcpu_get_guest_mode(void *data, u64 *val)
+{
+ struct kvm_vcpu *vcpu = (struct kvm_vcpu *) data;
+ *val = vcpu->stat.guest_mode;
+ return 0;
+}
+
+DEFINE_SIMPLE_ATTRIBUTE(vcpu_guest_mode_fops, vcpu_get_guest_mode, NULL, "%lld\n");
+
static int vcpu_get_tsc_offset(void *data, u64 *val)
{
struct kvm_vcpu *vcpu = (struct kvm_vcpu *) data;
@@ -45,6 +54,8 @@ DEFINE_SIMPLE_ATTRIBUTE(vcpu_tsc_scaling_frac_fops, vcpu_get_tsc_scaling_frac_bi
void kvm_arch_create_vcpu_debugfs(struct kvm_vcpu *vcpu, struct dentry *debugfs_dentry)
{
+ debugfs_create_file("guest_mode", 0444, debugfs_dentry, vcpu,
+ &vcpu_guest_mode_fops);
debugfs_create_file("tsc-offset", 0444, debugfs_dentry, vcpu,
&vcpu_tsc_offset_fops);
diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c
index 5e5de05a8fbf..2837110e66ed 100644
--- a/arch/x86/kvm/emulate.c
+++ b/arch/x86/kvm/emulate.c
@@ -22,7 +22,6 @@
#include "kvm_cache_regs.h"
#include "kvm_emulate.h"
#include <linux/stringify.h>
-#include <asm/fpu/api.h>
#include <asm/debugreg.h>
#include <asm/nospec-branch.h>
@@ -1081,116 +1080,14 @@ static void fetch_register_operand(struct operand *op)
}
}
-static void emulator_get_fpu(void)
-{
- fpregs_lock();
-
- fpregs_assert_state_consistent();
- if (test_thread_flag(TIF_NEED_FPU_LOAD))
- switch_fpu_return();
-}
-
-static void emulator_put_fpu(void)
-{
- fpregs_unlock();
-}
-
-static void read_sse_reg(sse128_t *data, int reg)
-{
- emulator_get_fpu();
- switch (reg) {
- case 0: asm("movdqa %%xmm0, %0" : "=m"(*data)); break;
- case 1: asm("movdqa %%xmm1, %0" : "=m"(*data)); break;
- case 2: asm("movdqa %%xmm2, %0" : "=m"(*data)); break;
- case 3: asm("movdqa %%xmm3, %0" : "=m"(*data)); break;
- case 4: asm("movdqa %%xmm4, %0" : "=m"(*data)); break;
- case 5: asm("movdqa %%xmm5, %0" : "=m"(*data)); break;
- case 6: asm("movdqa %%xmm6, %0" : "=m"(*data)); break;
- case 7: asm("movdqa %%xmm7, %0" : "=m"(*data)); break;
-#ifdef CONFIG_X86_64
- case 8: asm("movdqa %%xmm8, %0" : "=m"(*data)); break;
- case 9: asm("movdqa %%xmm9, %0" : "=m"(*data)); break;
- case 10: asm("movdqa %%xmm10, %0" : "=m"(*data)); break;
- case 11: asm("movdqa %%xmm11, %0" : "=m"(*data)); break;
- case 12: asm("movdqa %%xmm12, %0" : "=m"(*data)); break;
- case 13: asm("movdqa %%xmm13, %0" : "=m"(*data)); break;
- case 14: asm("movdqa %%xmm14, %0" : "=m"(*data)); break;
- case 15: asm("movdqa %%xmm15, %0" : "=m"(*data)); break;
-#endif
- default: BUG();
- }
- emulator_put_fpu();
-}
-
-static void write_sse_reg(sse128_t *data, int reg)
-{
- emulator_get_fpu();
- switch (reg) {
- case 0: asm("movdqa %0, %%xmm0" : : "m"(*data)); break;
- case 1: asm("movdqa %0, %%xmm1" : : "m"(*data)); break;
- case 2: asm("movdqa %0, %%xmm2" : : "m"(*data)); break;
- case 3: asm("movdqa %0, %%xmm3" : : "m"(*data)); break;
- case 4: asm("movdqa %0, %%xmm4" : : "m"(*data)); break;
- case 5: asm("movdqa %0, %%xmm5" : : "m"(*data)); break;
- case 6: asm("movdqa %0, %%xmm6" : : "m"(*data)); break;
- case 7: asm("movdqa %0, %%xmm7" : : "m"(*data)); break;
-#ifdef CONFIG_X86_64
- case 8: asm("movdqa %0, %%xmm8" : : "m"(*data)); break;
- case 9: asm("movdqa %0, %%xmm9" : : "m"(*data)); break;
- case 10: asm("movdqa %0, %%xmm10" : : "m"(*data)); break;
- case 11: asm("movdqa %0, %%xmm11" : : "m"(*data)); break;
- case 12: asm("movdqa %0, %%xmm12" : : "m"(*data)); break;
- case 13: asm("movdqa %0, %%xmm13" : : "m"(*data)); break;
- case 14: asm("movdqa %0, %%xmm14" : : "m"(*data)); break;
- case 15: asm("movdqa %0, %%xmm15" : : "m"(*data)); break;
-#endif
- default: BUG();
- }
- emulator_put_fpu();
-}
-
-static void read_mmx_reg(u64 *data, int reg)
-{
- emulator_get_fpu();
- switch (reg) {
- case 0: asm("movq %%mm0, %0" : "=m"(*data)); break;
- case 1: asm("movq %%mm1, %0" : "=m"(*data)); break;
- case 2: asm("movq %%mm2, %0" : "=m"(*data)); break;
- case 3: asm("movq %%mm3, %0" : "=m"(*data)); break;
- case 4: asm("movq %%mm4, %0" : "=m"(*data)); break;
- case 5: asm("movq %%mm5, %0" : "=m"(*data)); break;
- case 6: asm("movq %%mm6, %0" : "=m"(*data)); break;
- case 7: asm("movq %%mm7, %0" : "=m"(*data)); break;
- default: BUG();
- }
- emulator_put_fpu();
-}
-
-static void write_mmx_reg(u64 *data, int reg)
-{
- emulator_get_fpu();
- switch (reg) {
- case 0: asm("movq %0, %%mm0" : : "m"(*data)); break;
- case 1: asm("movq %0, %%mm1" : : "m"(*data)); break;
- case 2: asm("movq %0, %%mm2" : : "m"(*data)); break;
- case 3: asm("movq %0, %%mm3" : : "m"(*data)); break;
- case 4: asm("movq %0, %%mm4" : : "m"(*data)); break;
- case 5: asm("movq %0, %%mm5" : : "m"(*data)); break;
- case 6: asm("movq %0, %%mm6" : : "m"(*data)); break;
- case 7: asm("movq %0, %%mm7" : : "m"(*data)); break;
- default: BUG();
- }
- emulator_put_fpu();
-}
-
static int em_fninit(struct x86_emulate_ctxt *ctxt)
{
if (ctxt->ops->get_cr(ctxt, 0) & (X86_CR0_TS | X86_CR0_EM))
return emulate_nm(ctxt);
- emulator_get_fpu();
+ kvm_fpu_get();
asm volatile("fninit");
- emulator_put_fpu();
+ kvm_fpu_put();
return X86EMUL_CONTINUE;
}
@@ -1201,9 +1098,9 @@ static int em_fnstcw(struct x86_emulate_ctxt *ctxt)
if (ctxt->ops->get_cr(ctxt, 0) & (X86_CR0_TS | X86_CR0_EM))
return emulate_nm(ctxt);
- emulator_get_fpu();
+ kvm_fpu_get();
asm volatile("fnstcw %0": "+m"(fcw));
- emulator_put_fpu();
+ kvm_fpu_put();
ctxt->dst.val = fcw;
@@ -1217,9 +1114,9 @@ static int em_fnstsw(struct x86_emulate_ctxt *ctxt)
if (ctxt->ops->get_cr(ctxt, 0) & (X86_CR0_TS | X86_CR0_EM))
return emulate_nm(ctxt);
- emulator_get_fpu();
+ kvm_fpu_get();
asm volatile("fnstsw %0": "+m"(fsw));
- emulator_put_fpu();
+ kvm_fpu_put();
ctxt->dst.val = fsw;
@@ -1238,7 +1135,7 @@ static void decode_register_operand(struct x86_emulate_ctxt *ctxt,
op->type = OP_XMM;
op->bytes = 16;
op->addr.xmm = reg;
- read_sse_reg(&op->vec_val, reg);
+ kvm_read_sse_reg(reg, &op->vec_val);
return;
}
if (ctxt->d & Mmx) {
@@ -1289,7 +1186,7 @@ static int decode_modrm(struct x86_emulate_ctxt *ctxt,
op->type = OP_XMM;
op->bytes = 16;
op->addr.xmm = ctxt->modrm_rm;
- read_sse_reg(&op->vec_val, ctxt->modrm_rm);
+ kvm_read_sse_reg(ctxt->modrm_rm, &op->vec_val);
return rc;
}
if (ctxt->d & Mmx) {
@@ -1866,10 +1763,10 @@ static int writeback(struct x86_emulate_ctxt *ctxt, struct operand *op)
op->bytes * op->count);
break;
case OP_XMM:
- write_sse_reg(&op->vec_val, op->addr.xmm);
+ kvm_write_sse_reg(op->addr.xmm, &op->vec_val);
break;
case OP_MM:
- write_mmx_reg(&op->mm_val, op->addr.mm);
+ kvm_write_mmx_reg(op->addr.mm, &op->mm_val);
break;
case OP_NONE:
/* no writeback */
@@ -2638,8 +2535,7 @@ static int em_rsm(struct x86_emulate_ctxt *ctxt)
if ((ctxt->ops->get_hflags(ctxt) & X86EMUL_SMM_INSIDE_NMI_MASK) == 0)
ctxt->ops->set_nmi_mask(ctxt, false);
- ctxt->ops->set_hflags(ctxt, ctxt->ops->get_hflags(ctxt) &
- ~(X86EMUL_SMM_INSIDE_NMI_MASK | X86EMUL_SMM_MASK));
+ ctxt->ops->exiting_smm(ctxt);
/*
* Get back to real mode, to prepare a safe state in which to load
@@ -2678,12 +2574,12 @@ static int em_rsm(struct x86_emulate_ctxt *ctxt)
}
/*
- * Give pre_leave_smm() a chance to make ISA-specific changes to the
- * vCPU state (e.g. enter guest mode) before loading state from the SMM
+ * Give leave_smm() a chance to make ISA-specific changes to the vCPU
+ * state (e.g. enter guest mode) before loading state from the SMM
* state-save area.
*/
- if (ctxt->ops->pre_leave_smm(ctxt, buf))
- return X86EMUL_UNHANDLEABLE;
+ if (ctxt->ops->leave_smm(ctxt, buf))
+ goto emulate_shutdown;
#ifdef CONFIG_X86_64
if (emulator_has_longmode(ctxt))
@@ -2692,13 +2588,21 @@ static int em_rsm(struct x86_emulate_ctxt *ctxt)
#endif
ret = rsm_load_state_32(ctxt, buf);
- if (ret != X86EMUL_CONTINUE) {
- /* FIXME: should triple fault */
- return X86EMUL_UNHANDLEABLE;
- }
+ if (ret != X86EMUL_CONTINUE)
+ goto emulate_shutdown;
- ctxt->ops->post_leave_smm(ctxt);
+ /*
+ * Note, the ctxt->ops callbacks are responsible for handling side
+ * effects when writing MSRs and CRs, e.g. MMU context resets, CPUID
+ * runtime updates, etc... If that changes, e.g. this flow is moved
+ * out of the emulator to make it look more like enter_smm(), then
+ * those side effects need to be explicitly handled for both success
+ * and shutdown.
+ */
+ return X86EMUL_CONTINUE;
+emulate_shutdown:
+ ctxt->ops->triple_fault(ctxt);
return X86EMUL_CONTINUE;
}
@@ -4124,11 +4028,11 @@ static int em_fxsave(struct x86_emulate_ctxt *ctxt)
if (rc != X86EMUL_CONTINUE)
return rc;
- emulator_get_fpu();
+ kvm_fpu_get();
rc = asm_safe("fxsave %[fx]", , [fx] "+m"(fx_state));
- emulator_put_fpu();
+ kvm_fpu_put();
if (rc != X86EMUL_CONTINUE)
return rc;
@@ -4172,7 +4076,7 @@ static int em_fxrstor(struct x86_emulate_ctxt *ctxt)
if (rc != X86EMUL_CONTINUE)
return rc;
- emulator_get_fpu();
+ kvm_fpu_get();
if (size < __fxstate_size(16)) {
rc = fxregs_fixup(&fx_state, size);
@@ -4189,7 +4093,7 @@ static int em_fxrstor(struct x86_emulate_ctxt *ctxt)
rc = asm_safe("fxrstor %[fx]", : [fx] "m"(fx_state));
out:
- emulator_put_fpu();
+ kvm_fpu_put();
return rc;
}
@@ -5437,9 +5341,9 @@ static int flush_pending_x87_faults(struct x86_emulate_ctxt *ctxt)
{
int rc;
- emulator_get_fpu();
+ kvm_fpu_get();
rc = asm_safe("fwait");
- emulator_put_fpu();
+ kvm_fpu_put();
if (unlikely(rc != X86EMUL_CONTINUE))
return emulate_exception(ctxt, MF_VECTOR, 0, false);
@@ -5450,7 +5354,7 @@ static int flush_pending_x87_faults(struct x86_emulate_ctxt *ctxt)
static void fetch_possible_mmx_operand(struct operand *op)
{
if (op->type == OP_MM)
- read_mmx_reg(&op->mm_val, op->addr.mm);
+ kvm_read_mmx_reg(op->addr.mm, &op->mm_val);
}
static int fastop(struct x86_emulate_ctxt *ctxt, fastop_t fop)
diff --git a/arch/x86/kvm/fpu.h b/arch/x86/kvm/fpu.h
new file mode 100644
index 000000000000..3ba12888bf66
--- /dev/null
+++ b/arch/x86/kvm/fpu.h
@@ -0,0 +1,140 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+
+#ifndef __KVM_FPU_H_
+#define __KVM_FPU_H_
+
+#include <asm/fpu/api.h>
+
+typedef u32 __attribute__((vector_size(16))) sse128_t;
+#define __sse128_u union { sse128_t vec; u64 as_u64[2]; u32 as_u32[4]; }
+#define sse128_lo(x) ({ __sse128_u t; t.vec = x; t.as_u64[0]; })
+#define sse128_hi(x) ({ __sse128_u t; t.vec = x; t.as_u64[1]; })
+#define sse128_l0(x) ({ __sse128_u t; t.vec = x; t.as_u32[0]; })
+#define sse128_l1(x) ({ __sse128_u t; t.vec = x; t.as_u32[1]; })
+#define sse128_l2(x) ({ __sse128_u t; t.vec = x; t.as_u32[2]; })
+#define sse128_l3(x) ({ __sse128_u t; t.vec = x; t.as_u32[3]; })
+#define sse128(lo, hi) ({ __sse128_u t; t.as_u64[0] = lo; t.as_u64[1] = hi; t.vec; })
+
+static inline void _kvm_read_sse_reg(int reg, sse128_t *data)
+{
+ switch (reg) {
+ case 0: asm("movdqa %%xmm0, %0" : "=m"(*data)); break;
+ case 1: asm("movdqa %%xmm1, %0" : "=m"(*data)); break;
+ case 2: asm("movdqa %%xmm2, %0" : "=m"(*data)); break;
+ case 3: asm("movdqa %%xmm3, %0" : "=m"(*data)); break;
+ case 4: asm("movdqa %%xmm4, %0" : "=m"(*data)); break;
+ case 5: asm("movdqa %%xmm5, %0" : "=m"(*data)); break;
+ case 6: asm("movdqa %%xmm6, %0" : "=m"(*data)); break;
+ case 7: asm("movdqa %%xmm7, %0" : "=m"(*data)); break;
+#ifdef CONFIG_X86_64
+ case 8: asm("movdqa %%xmm8, %0" : "=m"(*data)); break;
+ case 9: asm("movdqa %%xmm9, %0" : "=m"(*data)); break;
+ case 10: asm("movdqa %%xmm10, %0" : "=m"(*data)); break;
+ case 11: asm("movdqa %%xmm11, %0" : "=m"(*data)); break;
+ case 12: asm("movdqa %%xmm12, %0" : "=m"(*data)); break;
+ case 13: asm("movdqa %%xmm13, %0" : "=m"(*data)); break;
+ case 14: asm("movdqa %%xmm14, %0" : "=m"(*data)); break;
+ case 15: asm("movdqa %%xmm15, %0" : "=m"(*data)); break;
+#endif
+ default: BUG();
+ }
+}
+
+static inline void _kvm_write_sse_reg(int reg, const sse128_t *data)
+{
+ switch (reg) {
+ case 0: asm("movdqa %0, %%xmm0" : : "m"(*data)); break;
+ case 1: asm("movdqa %0, %%xmm1" : : "m"(*data)); break;
+ case 2: asm("movdqa %0, %%xmm2" : : "m"(*data)); break;
+ case 3: asm("movdqa %0, %%xmm3" : : "m"(*data)); break;
+ case 4: asm("movdqa %0, %%xmm4" : : "m"(*data)); break;
+ case 5: asm("movdqa %0, %%xmm5" : : "m"(*data)); break;
+ case 6: asm("movdqa %0, %%xmm6" : : "m"(*data)); break;
+ case 7: asm("movdqa %0, %%xmm7" : : "m"(*data)); break;
+#ifdef CONFIG_X86_64
+ case 8: asm("movdqa %0, %%xmm8" : : "m"(*data)); break;
+ case 9: asm("movdqa %0, %%xmm9" : : "m"(*data)); break;
+ case 10: asm("movdqa %0, %%xmm10" : : "m"(*data)); break;
+ case 11: asm("movdqa %0, %%xmm11" : : "m"(*data)); break;
+ case 12: asm("movdqa %0, %%xmm12" : : "m"(*data)); break;
+ case 13: asm("movdqa %0, %%xmm13" : : "m"(*data)); break;
+ case 14: asm("movdqa %0, %%xmm14" : : "m"(*data)); break;
+ case 15: asm("movdqa %0, %%xmm15" : : "m"(*data)); break;
+#endif
+ default: BUG();
+ }
+}
+
+static inline void _kvm_read_mmx_reg(int reg, u64 *data)
+{
+ switch (reg) {
+ case 0: asm("movq %%mm0, %0" : "=m"(*data)); break;
+ case 1: asm("movq %%mm1, %0" : "=m"(*data)); break;
+ case 2: asm("movq %%mm2, %0" : "=m"(*data)); break;
+ case 3: asm("movq %%mm3, %0" : "=m"(*data)); break;
+ case 4: asm("movq %%mm4, %0" : "=m"(*data)); break;
+ case 5: asm("movq %%mm5, %0" : "=m"(*data)); break;
+ case 6: asm("movq %%mm6, %0" : "=m"(*data)); break;
+ case 7: asm("movq %%mm7, %0" : "=m"(*data)); break;
+ default: BUG();
+ }
+}
+
+static inline void _kvm_write_mmx_reg(int reg, const u64 *data)
+{
+ switch (reg) {
+ case 0: asm("movq %0, %%mm0" : : "m"(*data)); break;
+ case 1: asm("movq %0, %%mm1" : : "m"(*data)); break;
+ case 2: asm("movq %0, %%mm2" : : "m"(*data)); break;
+ case 3: asm("movq %0, %%mm3" : : "m"(*data)); break;
+ case 4: asm("movq %0, %%mm4" : : "m"(*data)); break;
+ case 5: asm("movq %0, %%mm5" : : "m"(*data)); break;
+ case 6: asm("movq %0, %%mm6" : : "m"(*data)); break;
+ case 7: asm("movq %0, %%mm7" : : "m"(*data)); break;
+ default: BUG();
+ }
+}
+
+static inline void kvm_fpu_get(void)
+{
+ fpregs_lock();
+
+ fpregs_assert_state_consistent();
+ if (test_thread_flag(TIF_NEED_FPU_LOAD))
+ switch_fpu_return();
+}
+
+static inline void kvm_fpu_put(void)
+{
+ fpregs_unlock();
+}
+
+static inline void kvm_read_sse_reg(int reg, sse128_t *data)
+{
+ kvm_fpu_get();
+ _kvm_read_sse_reg(reg, data);
+ kvm_fpu_put();
+}
+
+static inline void kvm_write_sse_reg(int reg, const sse128_t *data)
+{
+ kvm_fpu_get();
+ _kvm_write_sse_reg(reg, data);
+ kvm_fpu_put();
+}
+
+static inline void kvm_read_mmx_reg(int reg, u64 *data)
+{
+ kvm_fpu_get();
+ _kvm_read_mmx_reg(reg, data);
+ kvm_fpu_put();
+}
+
+static inline void kvm_write_mmx_reg(int reg, const u64 *data)
+{
+ kvm_fpu_get();
+ _kvm_write_mmx_reg(reg, data);
+ kvm_fpu_put();
+}
+
+#endif
diff --git a/arch/x86/kvm/hyperv.c b/arch/x86/kvm/hyperv.c
index f00830e5202f..b07592ca92f0 100644
--- a/arch/x86/kvm/hyperv.c
+++ b/arch/x86/kvm/hyperv.c
@@ -36,6 +36,7 @@
#include "trace.h"
#include "irq.h"
+#include "fpu.h"
/* "Hv#1" signature */
#define HYPERV_CPUID_SIGNATURE_EAX 0x31237648
@@ -273,15 +274,10 @@ static int synic_set_msr(struct kvm_vcpu_hv_synic *synic,
static bool kvm_hv_is_syndbg_enabled(struct kvm_vcpu *vcpu)
{
- struct kvm_cpuid_entry2 *entry;
-
- entry = kvm_find_cpuid_entry(vcpu,
- HYPERV_CPUID_SYNDBG_PLATFORM_CAPABILITIES,
- 0);
- if (!entry)
- return false;
+ struct kvm_vcpu_hv *hv_vcpu = to_hv_vcpu(vcpu);
- return entry->eax & HV_X64_SYNDBG_CAP_ALLOW_KERNEL_DEBUGGING;
+ return hv_vcpu->cpuid_cache.syndbg_cap_eax &
+ HV_X64_SYNDBG_CAP_ALLOW_KERNEL_DEBUGGING;
}
static int kvm_hv_syndbg_complete_userspace(struct kvm_vcpu *vcpu)
@@ -635,11 +631,17 @@ static int stimer_set_config(struct kvm_vcpu_hv_stimer *stimer, u64 config,
union hv_stimer_config new_config = {.as_uint64 = config},
old_config = {.as_uint64 = stimer->config.as_uint64};
struct kvm_vcpu *vcpu = hv_stimer_to_vcpu(stimer);
+ struct kvm_vcpu_hv *hv_vcpu = to_hv_vcpu(vcpu);
struct kvm_vcpu_hv_synic *synic = to_hv_synic(vcpu);
if (!synic->active && !host)
return 1;
+ if (unlikely(!host && hv_vcpu->enforce_cpuid && new_config.direct_mode &&
+ !(hv_vcpu->cpuid_cache.features_edx &
+ HV_STIMER_DIRECT_MODE_AVAILABLE)))
+ return 1;
+
trace_kvm_hv_stimer_set_config(hv_stimer_to_vcpu(stimer)->vcpu_id,
stimer->index, config, host);
@@ -1206,12 +1208,90 @@ out_unlock:
mutex_unlock(&hv->hv_lock);
}
+
+static bool hv_check_msr_access(struct kvm_vcpu_hv *hv_vcpu, u32 msr)
+{
+ if (!hv_vcpu->enforce_cpuid)
+ return true;
+
+ switch (msr) {
+ case HV_X64_MSR_GUEST_OS_ID:
+ case HV_X64_MSR_HYPERCALL:
+ return hv_vcpu->cpuid_cache.features_eax &
+ HV_MSR_HYPERCALL_AVAILABLE;
+ case HV_X64_MSR_VP_RUNTIME:
+ return hv_vcpu->cpuid_cache.features_eax &
+ HV_MSR_VP_RUNTIME_AVAILABLE;
+ case HV_X64_MSR_TIME_REF_COUNT:
+ return hv_vcpu->cpuid_cache.features_eax &
+ HV_MSR_TIME_REF_COUNT_AVAILABLE;
+ case HV_X64_MSR_VP_INDEX:
+ return hv_vcpu->cpuid_cache.features_eax &
+ HV_MSR_VP_INDEX_AVAILABLE;
+ case HV_X64_MSR_RESET:
+ return hv_vcpu->cpuid_cache.features_eax &
+ HV_MSR_RESET_AVAILABLE;
+ case HV_X64_MSR_REFERENCE_TSC:
+ return hv_vcpu->cpuid_cache.features_eax &
+ HV_MSR_REFERENCE_TSC_AVAILABLE;
+ case HV_X64_MSR_SCONTROL:
+ case HV_X64_MSR_SVERSION:
+ case HV_X64_MSR_SIEFP:
+ case HV_X64_MSR_SIMP:
+ case HV_X64_MSR_EOM:
+ case HV_X64_MSR_SINT0 ... HV_X64_MSR_SINT15:
+ return hv_vcpu->cpuid_cache.features_eax &
+ HV_MSR_SYNIC_AVAILABLE;
+ case HV_X64_MSR_STIMER0_CONFIG:
+ case HV_X64_MSR_STIMER1_CONFIG:
+ case HV_X64_MSR_STIMER2_CONFIG:
+ case HV_X64_MSR_STIMER3_CONFIG:
+ case HV_X64_MSR_STIMER0_COUNT:
+ case HV_X64_MSR_STIMER1_COUNT:
+ case HV_X64_MSR_STIMER2_COUNT:
+ case HV_X64_MSR_STIMER3_COUNT:
+ return hv_vcpu->cpuid_cache.features_eax &
+ HV_MSR_SYNTIMER_AVAILABLE;
+ case HV_X64_MSR_EOI:
+ case HV_X64_MSR_ICR:
+ case HV_X64_MSR_TPR:
+ case HV_X64_MSR_VP_ASSIST_PAGE:
+ return hv_vcpu->cpuid_cache.features_eax &
+ HV_MSR_APIC_ACCESS_AVAILABLE;
+ break;
+ case HV_X64_MSR_TSC_FREQUENCY:
+ case HV_X64_MSR_APIC_FREQUENCY:
+ return hv_vcpu->cpuid_cache.features_eax &
+ HV_ACCESS_FREQUENCY_MSRS;
+ case HV_X64_MSR_REENLIGHTENMENT_CONTROL:
+ case HV_X64_MSR_TSC_EMULATION_CONTROL:
+ case HV_X64_MSR_TSC_EMULATION_STATUS:
+ return hv_vcpu->cpuid_cache.features_eax &
+ HV_ACCESS_REENLIGHTENMENT;
+ case HV_X64_MSR_CRASH_P0 ... HV_X64_MSR_CRASH_P4:
+ case HV_X64_MSR_CRASH_CTL:
+ return hv_vcpu->cpuid_cache.features_edx &
+ HV_FEATURE_GUEST_CRASH_MSR_AVAILABLE;
+ case HV_X64_MSR_SYNDBG_OPTIONS:
+ case HV_X64_MSR_SYNDBG_CONTROL ... HV_X64_MSR_SYNDBG_PENDING_BUFFER:
+ return hv_vcpu->cpuid_cache.features_edx &
+ HV_FEATURE_DEBUG_MSRS_AVAILABLE;
+ default:
+ break;
+ }
+
+ return false;
+}
+
static int kvm_hv_set_msr_pw(struct kvm_vcpu *vcpu, u32 msr, u64 data,
bool host)
{
struct kvm *kvm = vcpu->kvm;
struct kvm_hv *hv = to_kvm_hv(kvm);
+ if (unlikely(!host && !hv_check_msr_access(to_hv_vcpu(vcpu), msr)))
+ return 1;
+
switch (msr) {
case HV_X64_MSR_GUEST_OS_ID:
hv->hv_guest_os_id = data;
@@ -1340,6 +1420,9 @@ static int kvm_hv_set_msr(struct kvm_vcpu *vcpu, u32 msr, u64 data, bool host)
{
struct kvm_vcpu_hv *hv_vcpu = to_hv_vcpu(vcpu);
+ if (unlikely(!host && !hv_check_msr_access(hv_vcpu, msr)))
+ return 1;
+
switch (msr) {
case HV_X64_MSR_VP_INDEX: {
struct kvm_hv *hv = to_kvm_hv(vcpu->kvm);
@@ -1454,6 +1537,9 @@ static int kvm_hv_get_msr_pw(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata,
struct kvm *kvm = vcpu->kvm;
struct kvm_hv *hv = to_kvm_hv(kvm);
+ if (unlikely(!host && !hv_check_msr_access(to_hv_vcpu(vcpu), msr)))
+ return 1;
+
switch (msr) {
case HV_X64_MSR_GUEST_OS_ID:
data = hv->hv_guest_os_id;
@@ -1503,6 +1589,9 @@ static int kvm_hv_get_msr(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata,
u64 data = 0;
struct kvm_vcpu_hv *hv_vcpu = to_hv_vcpu(vcpu);
+ if (unlikely(!host && !hv_check_msr_access(hv_vcpu, msr)))
+ return 1;
+
switch (msr) {
case HV_X64_MSR_VP_INDEX:
data = hv_vcpu->vp_index;
@@ -1631,8 +1720,22 @@ static __always_inline unsigned long *sparse_set_to_vcpu_mask(
return vcpu_bitmap;
}
-static u64 kvm_hv_flush_tlb(struct kvm_vcpu *vcpu, u64 ingpa, u16 rep_cnt, bool ex)
+struct kvm_hv_hcall {
+ u64 param;
+ u64 ingpa;
+ u64 outgpa;
+ u16 code;
+ u16 rep_cnt;
+ u16 rep_idx;
+ bool fast;
+ bool rep;
+ sse128_t xmm[HV_HYPERCALL_MAX_XMM_REGISTERS];
+};
+
+static u64 kvm_hv_flush_tlb(struct kvm_vcpu *vcpu, struct kvm_hv_hcall *hc, bool ex)
{
+ int i;
+ gpa_t gpa;
struct kvm *kvm = vcpu->kvm;
struct kvm_vcpu_hv *hv_vcpu = to_hv_vcpu(vcpu);
struct hv_tlb_flush_ex flush_ex;
@@ -1646,8 +1749,15 @@ static u64 kvm_hv_flush_tlb(struct kvm_vcpu *vcpu, u64 ingpa, u16 rep_cnt, bool
bool all_cpus;
if (!ex) {
- if (unlikely(kvm_read_guest(kvm, ingpa, &flush, sizeof(flush))))
- return HV_STATUS_INVALID_HYPERCALL_INPUT;
+ if (hc->fast) {
+ flush.address_space = hc->ingpa;
+ flush.flags = hc->outgpa;
+ flush.processor_mask = sse128_lo(hc->xmm[0]);
+ } else {
+ if (unlikely(kvm_read_guest(kvm, hc->ingpa,
+ &flush, sizeof(flush))))
+ return HV_STATUS_INVALID_HYPERCALL_INPUT;
+ }
trace_kvm_hv_flush_tlb(flush.processor_mask,
flush.address_space, flush.flags);
@@ -1665,9 +1775,16 @@ static u64 kvm_hv_flush_tlb(struct kvm_vcpu *vcpu, u64 ingpa, u16 rep_cnt, bool
all_cpus = (flush.flags & HV_FLUSH_ALL_PROCESSORS) ||
flush.processor_mask == 0;
} else {
- if (unlikely(kvm_read_guest(kvm, ingpa, &flush_ex,
- sizeof(flush_ex))))
- return HV_STATUS_INVALID_HYPERCALL_INPUT;
+ if (hc->fast) {
+ flush_ex.address_space = hc->ingpa;
+ flush_ex.flags = hc->outgpa;
+ memcpy(&flush_ex.hv_vp_set,
+ &hc->xmm[0], sizeof(hc->xmm[0]));
+ } else {
+ if (unlikely(kvm_read_guest(kvm, hc->ingpa, &flush_ex,
+ sizeof(flush_ex))))
+ return HV_STATUS_INVALID_HYPERCALL_INPUT;
+ }
trace_kvm_hv_flush_tlb_ex(flush_ex.hv_vp_set.valid_bank_mask,
flush_ex.hv_vp_set.format,
@@ -1678,20 +1795,28 @@ static u64 kvm_hv_flush_tlb(struct kvm_vcpu *vcpu, u64 ingpa, u16 rep_cnt, bool
all_cpus = flush_ex.hv_vp_set.format !=
HV_GENERIC_SET_SPARSE_4K;
- sparse_banks_len =
- bitmap_weight((unsigned long *)&valid_bank_mask, 64) *
- sizeof(sparse_banks[0]);
+ sparse_banks_len = bitmap_weight((unsigned long *)&valid_bank_mask, 64);
if (!sparse_banks_len && !all_cpus)
goto ret_success;
- if (!all_cpus &&
- kvm_read_guest(kvm,
- ingpa + offsetof(struct hv_tlb_flush_ex,
- hv_vp_set.bank_contents),
- sparse_banks,
- sparse_banks_len))
- return HV_STATUS_INVALID_HYPERCALL_INPUT;
+ if (!all_cpus) {
+ if (hc->fast) {
+ if (sparse_banks_len > HV_HYPERCALL_MAX_XMM_REGISTERS - 1)
+ return HV_STATUS_INVALID_HYPERCALL_INPUT;
+ for (i = 0; i < sparse_banks_len; i += 2) {
+ sparse_banks[i] = sse128_lo(hc->xmm[i / 2 + 1]);
+ sparse_banks[i + 1] = sse128_hi(hc->xmm[i / 2 + 1]);
+ }
+ } else {
+ gpa = hc->ingpa + offsetof(struct hv_tlb_flush_ex,
+ hv_vp_set.bank_contents);
+ if (unlikely(kvm_read_guest(kvm, gpa, sparse_banks,
+ sparse_banks_len *
+ sizeof(sparse_banks[0]))))
+ return HV_STATUS_INVALID_HYPERCALL_INPUT;
+ }
+ }
}
cpumask_clear(&hv_vcpu->tlb_flush);
@@ -1704,13 +1829,13 @@ static u64 kvm_hv_flush_tlb(struct kvm_vcpu *vcpu, u64 ingpa, u16 rep_cnt, bool
* vcpu->arch.cr3 may not be up-to-date for running vCPUs so we can't
* analyze it here, flush TLB regardless of the specified address space.
*/
- kvm_make_vcpus_request_mask(kvm, KVM_REQ_HV_TLB_FLUSH,
+ kvm_make_vcpus_request_mask(kvm, KVM_REQ_TLB_FLUSH_GUEST,
NULL, vcpu_mask, &hv_vcpu->tlb_flush);
ret_success:
- /* We always do full TLB flush, set rep_done = rep_cnt. */
+ /* We always do full TLB flush, set 'Reps completed' = 'Rep Count' */
return (u64)HV_STATUS_SUCCESS |
- ((u64)rep_cnt << HV_HYPERCALL_REP_COMP_OFFSET);
+ ((u64)hc->rep_cnt << HV_HYPERCALL_REP_COMP_OFFSET);
}
static void kvm_send_ipi_to_many(struct kvm *kvm, u32 vector,
@@ -1732,8 +1857,7 @@ static void kvm_send_ipi_to_many(struct kvm *kvm, u32 vector,
}
}
-static u64 kvm_hv_send_ipi(struct kvm_vcpu *vcpu, u64 ingpa, u64 outgpa,
- bool ex, bool fast)
+static u64 kvm_hv_send_ipi(struct kvm_vcpu *vcpu, struct kvm_hv_hcall *hc, bool ex)
{
struct kvm *kvm = vcpu->kvm;
struct hv_send_ipi_ex send_ipi_ex;
@@ -1748,25 +1872,25 @@ static u64 kvm_hv_send_ipi(struct kvm_vcpu *vcpu, u64 ingpa, u64 outgpa,
bool all_cpus;
if (!ex) {
- if (!fast) {
- if (unlikely(kvm_read_guest(kvm, ingpa, &send_ipi,
+ if (!hc->fast) {
+ if (unlikely(kvm_read_guest(kvm, hc->ingpa, &send_ipi,
sizeof(send_ipi))))
return HV_STATUS_INVALID_HYPERCALL_INPUT;
sparse_banks[0] = send_ipi.cpu_mask;
vector = send_ipi.vector;
} else {
/* 'reserved' part of hv_send_ipi should be 0 */
- if (unlikely(ingpa >> 32 != 0))
+ if (unlikely(hc->ingpa >> 32 != 0))
return HV_STATUS_INVALID_HYPERCALL_INPUT;
- sparse_banks[0] = outgpa;
- vector = (u32)ingpa;
+ sparse_banks[0] = hc->outgpa;
+ vector = (u32)hc->ingpa;
}
all_cpus = false;
valid_bank_mask = BIT_ULL(0);
trace_kvm_hv_send_ipi(vector, sparse_banks[0]);
} else {
- if (unlikely(kvm_read_guest(kvm, ingpa, &send_ipi_ex,
+ if (unlikely(kvm_read_guest(kvm, hc->ingpa, &send_ipi_ex,
sizeof(send_ipi_ex))))
return HV_STATUS_INVALID_HYPERCALL_INPUT;
@@ -1786,8 +1910,8 @@ static u64 kvm_hv_send_ipi(struct kvm_vcpu *vcpu, u64 ingpa, u64 outgpa,
if (!all_cpus &&
kvm_read_guest(kvm,
- ingpa + offsetof(struct hv_send_ipi_ex,
- vp_set.bank_contents),
+ hc->ingpa + offsetof(struct hv_send_ipi_ex,
+ vp_set.bank_contents),
sparse_banks,
sparse_banks_len))
return HV_STATUS_INVALID_HYPERCALL_INPUT;
@@ -1809,12 +1933,67 @@ ret_success:
void kvm_hv_set_cpuid(struct kvm_vcpu *vcpu)
{
struct kvm_cpuid_entry2 *entry;
+ struct kvm_vcpu_hv *hv_vcpu = to_hv_vcpu(vcpu);
entry = kvm_find_cpuid_entry(vcpu, HYPERV_CPUID_INTERFACE, 0);
- if (entry && entry->eax == HYPERV_CPUID_SIGNATURE_EAX)
+ if (entry && entry->eax == HYPERV_CPUID_SIGNATURE_EAX) {
vcpu->arch.hyperv_enabled = true;
- else
+ } else {
vcpu->arch.hyperv_enabled = false;
+ return;
+ }
+
+ if (!to_hv_vcpu(vcpu) && kvm_hv_vcpu_init(vcpu))
+ return;
+
+ hv_vcpu = to_hv_vcpu(vcpu);
+
+ entry = kvm_find_cpuid_entry(vcpu, HYPERV_CPUID_FEATURES, 0);
+ if (entry) {
+ hv_vcpu->cpuid_cache.features_eax = entry->eax;
+ hv_vcpu->cpuid_cache.features_ebx = entry->ebx;
+ hv_vcpu->cpuid_cache.features_edx = entry->edx;
+ } else {
+ hv_vcpu->cpuid_cache.features_eax = 0;
+ hv_vcpu->cpuid_cache.features_ebx = 0;
+ hv_vcpu->cpuid_cache.features_edx = 0;
+ }
+
+ entry = kvm_find_cpuid_entry(vcpu, HYPERV_CPUID_ENLIGHTMENT_INFO, 0);
+ if (entry) {
+ hv_vcpu->cpuid_cache.enlightenments_eax = entry->eax;
+ hv_vcpu->cpuid_cache.enlightenments_ebx = entry->ebx;
+ } else {
+ hv_vcpu->cpuid_cache.enlightenments_eax = 0;
+ hv_vcpu->cpuid_cache.enlightenments_ebx = 0;
+ }
+
+ entry = kvm_find_cpuid_entry(vcpu, HYPERV_CPUID_SYNDBG_PLATFORM_CAPABILITIES, 0);
+ if (entry)
+ hv_vcpu->cpuid_cache.syndbg_cap_eax = entry->eax;
+ else
+ hv_vcpu->cpuid_cache.syndbg_cap_eax = 0;
+}
+
+int kvm_hv_set_enforce_cpuid(struct kvm_vcpu *vcpu, bool enforce)
+{
+ struct kvm_vcpu_hv *hv_vcpu;
+ int ret = 0;
+
+ if (!to_hv_vcpu(vcpu)) {
+ if (enforce) {
+ ret = kvm_hv_vcpu_init(vcpu);
+ if (ret)
+ return ret;
+ } else {
+ return 0;
+ }
+ }
+
+ hv_vcpu = to_hv_vcpu(vcpu);
+ hv_vcpu->enforce_cpuid = enforce;
+
+ return ret;
}
bool kvm_hv_hypercall_enabled(struct kvm_vcpu *vcpu)
@@ -1847,20 +2026,21 @@ static int kvm_hv_hypercall_complete_userspace(struct kvm_vcpu *vcpu)
return kvm_hv_hypercall_complete(vcpu, vcpu->run->hyperv.u.hcall.result);
}
-static u16 kvm_hvcall_signal_event(struct kvm_vcpu *vcpu, bool fast, u64 param)
+static u16 kvm_hvcall_signal_event(struct kvm_vcpu *vcpu, struct kvm_hv_hcall *hc)
{
struct kvm_hv *hv = to_kvm_hv(vcpu->kvm);
struct eventfd_ctx *eventfd;
- if (unlikely(!fast)) {
+ if (unlikely(!hc->fast)) {
int ret;
- gpa_t gpa = param;
+ gpa_t gpa = hc->ingpa;
- if ((gpa & (__alignof__(param) - 1)) ||
- offset_in_page(gpa) + sizeof(param) > PAGE_SIZE)
+ if ((gpa & (__alignof__(hc->ingpa) - 1)) ||
+ offset_in_page(gpa) + sizeof(hc->ingpa) > PAGE_SIZE)
return HV_STATUS_INVALID_ALIGNMENT;
- ret = kvm_vcpu_read_guest(vcpu, gpa, &param, sizeof(param));
+ ret = kvm_vcpu_read_guest(vcpu, gpa,
+ &hc->ingpa, sizeof(hc->ingpa));
if (ret < 0)
return HV_STATUS_INVALID_ALIGNMENT;
}
@@ -1870,15 +2050,15 @@ static u16 kvm_hvcall_signal_event(struct kvm_vcpu *vcpu, bool fast, u64 param)
* have no use for it, and in all known usecases it is zero, so just
* report lookup failure if it isn't.
*/
- if (param & 0xffff00000000ULL)
+ if (hc->ingpa & 0xffff00000000ULL)
return HV_STATUS_INVALID_PORT_ID;
/* remaining bits are reserved-zero */
- if (param & ~KVM_HYPERV_CONN_ID_MASK)
+ if (hc->ingpa & ~KVM_HYPERV_CONN_ID_MASK)
return HV_STATUS_INVALID_HYPERCALL_INPUT;
/* the eventfd is protected by vcpu->kvm->srcu, but conn_to_evt isn't */
rcu_read_lock();
- eventfd = idr_find(&hv->conn_to_evt, param);
+ eventfd = idr_find(&hv->conn_to_evt, hc->ingpa);
rcu_read_unlock();
if (!eventfd)
return HV_STATUS_INVALID_PORT_ID;
@@ -1887,11 +2067,80 @@ static u16 kvm_hvcall_signal_event(struct kvm_vcpu *vcpu, bool fast, u64 param)
return HV_STATUS_SUCCESS;
}
+static bool is_xmm_fast_hypercall(struct kvm_hv_hcall *hc)
+{
+ switch (hc->code) {
+ case HVCALL_FLUSH_VIRTUAL_ADDRESS_LIST:
+ case HVCALL_FLUSH_VIRTUAL_ADDRESS_SPACE:
+ case HVCALL_FLUSH_VIRTUAL_ADDRESS_LIST_EX:
+ case HVCALL_FLUSH_VIRTUAL_ADDRESS_SPACE_EX:
+ return true;
+ }
+
+ return false;
+}
+
+static void kvm_hv_hypercall_read_xmm(struct kvm_hv_hcall *hc)
+{
+ int reg;
+
+ kvm_fpu_get();
+ for (reg = 0; reg < HV_HYPERCALL_MAX_XMM_REGISTERS; reg++)
+ _kvm_read_sse_reg(reg, &hc->xmm[reg]);
+ kvm_fpu_put();
+}
+
+static bool hv_check_hypercall_access(struct kvm_vcpu_hv *hv_vcpu, u16 code)
+{
+ if (!hv_vcpu->enforce_cpuid)
+ return true;
+
+ switch (code) {
+ case HVCALL_NOTIFY_LONG_SPIN_WAIT:
+ return hv_vcpu->cpuid_cache.enlightenments_ebx &&
+ hv_vcpu->cpuid_cache.enlightenments_ebx != U32_MAX;
+ case HVCALL_POST_MESSAGE:
+ return hv_vcpu->cpuid_cache.features_ebx & HV_POST_MESSAGES;
+ case HVCALL_SIGNAL_EVENT:
+ return hv_vcpu->cpuid_cache.features_ebx & HV_SIGNAL_EVENTS;
+ case HVCALL_POST_DEBUG_DATA:
+ case HVCALL_RETRIEVE_DEBUG_DATA:
+ case HVCALL_RESET_DEBUG_SESSION:
+ /*
+ * Return 'true' when SynDBG is disabled so the resulting code
+ * will be HV_STATUS_INVALID_HYPERCALL_CODE.
+ */
+ return !kvm_hv_is_syndbg_enabled(hv_vcpu->vcpu) ||
+ hv_vcpu->cpuid_cache.features_ebx & HV_DEBUGGING;
+ case HVCALL_FLUSH_VIRTUAL_ADDRESS_LIST_EX:
+ case HVCALL_FLUSH_VIRTUAL_ADDRESS_SPACE_EX:
+ if (!(hv_vcpu->cpuid_cache.enlightenments_eax &
+ HV_X64_EX_PROCESSOR_MASKS_RECOMMENDED))
+ return false;
+ fallthrough;
+ case HVCALL_FLUSH_VIRTUAL_ADDRESS_LIST:
+ case HVCALL_FLUSH_VIRTUAL_ADDRESS_SPACE:
+ return hv_vcpu->cpuid_cache.enlightenments_eax &
+ HV_X64_REMOTE_TLB_FLUSH_RECOMMENDED;
+ case HVCALL_SEND_IPI_EX:
+ if (!(hv_vcpu->cpuid_cache.enlightenments_eax &
+ HV_X64_EX_PROCESSOR_MASKS_RECOMMENDED))
+ return false;
+ fallthrough;
+ case HVCALL_SEND_IPI:
+ return hv_vcpu->cpuid_cache.enlightenments_eax &
+ HV_X64_CLUSTER_IPI_RECOMMENDED;
+ default:
+ break;
+ }
+
+ return true;
+}
+
int kvm_hv_hypercall(struct kvm_vcpu *vcpu)
{
- u64 param, ingpa, outgpa, ret = HV_STATUS_SUCCESS;
- uint16_t code, rep_idx, rep_cnt;
- bool fast, rep;
+ struct kvm_hv_hcall hc;
+ u64 ret = HV_STATUS_SUCCESS;
/*
* hypercall generates UD from non zero cpl and real mode
@@ -1904,104 +2153,113 @@ int kvm_hv_hypercall(struct kvm_vcpu *vcpu)
#ifdef CONFIG_X86_64
if (is_64_bit_mode(vcpu)) {
- param = kvm_rcx_read(vcpu);
- ingpa = kvm_rdx_read(vcpu);
- outgpa = kvm_r8_read(vcpu);
+ hc.param = kvm_rcx_read(vcpu);
+ hc.ingpa = kvm_rdx_read(vcpu);
+ hc.outgpa = kvm_r8_read(vcpu);
} else
#endif
{
- param = ((u64)kvm_rdx_read(vcpu) << 32) |
- (kvm_rax_read(vcpu) & 0xffffffff);
- ingpa = ((u64)kvm_rbx_read(vcpu) << 32) |
- (kvm_rcx_read(vcpu) & 0xffffffff);
- outgpa = ((u64)kvm_rdi_read(vcpu) << 32) |
- (kvm_rsi_read(vcpu) & 0xffffffff);
+ hc.param = ((u64)kvm_rdx_read(vcpu) << 32) |
+ (kvm_rax_read(vcpu) & 0xffffffff);
+ hc.ingpa = ((u64)kvm_rbx_read(vcpu) << 32) |
+ (kvm_rcx_read(vcpu) & 0xffffffff);
+ hc.outgpa = ((u64)kvm_rdi_read(vcpu) << 32) |
+ (kvm_rsi_read(vcpu) & 0xffffffff);
}
- code = param & 0xffff;
- fast = !!(param & HV_HYPERCALL_FAST_BIT);
- rep_cnt = (param >> HV_HYPERCALL_REP_COMP_OFFSET) & 0xfff;
- rep_idx = (param >> HV_HYPERCALL_REP_START_OFFSET) & 0xfff;
- rep = !!(rep_cnt || rep_idx);
+ hc.code = hc.param & 0xffff;
+ hc.fast = !!(hc.param & HV_HYPERCALL_FAST_BIT);
+ hc.rep_cnt = (hc.param >> HV_HYPERCALL_REP_COMP_OFFSET) & 0xfff;
+ hc.rep_idx = (hc.param >> HV_HYPERCALL_REP_START_OFFSET) & 0xfff;
+ hc.rep = !!(hc.rep_cnt || hc.rep_idx);
- trace_kvm_hv_hypercall(code, fast, rep_cnt, rep_idx, ingpa, outgpa);
+ if (hc.fast && is_xmm_fast_hypercall(&hc))
+ kvm_hv_hypercall_read_xmm(&hc);
- switch (code) {
+ trace_kvm_hv_hypercall(hc.code, hc.fast, hc.rep_cnt, hc.rep_idx,
+ hc.ingpa, hc.outgpa);
+
+ if (unlikely(!hv_check_hypercall_access(to_hv_vcpu(vcpu), hc.code))) {
+ ret = HV_STATUS_ACCESS_DENIED;
+ goto hypercall_complete;
+ }
+
+ switch (hc.code) {
case HVCALL_NOTIFY_LONG_SPIN_WAIT:
- if (unlikely(rep)) {
+ if (unlikely(hc.rep)) {
ret = HV_STATUS_INVALID_HYPERCALL_INPUT;
break;
}
kvm_vcpu_on_spin(vcpu, true);
break;
case HVCALL_SIGNAL_EVENT:
- if (unlikely(rep)) {
+ if (unlikely(hc.rep)) {
ret = HV_STATUS_INVALID_HYPERCALL_INPUT;
break;
}
- ret = kvm_hvcall_signal_event(vcpu, fast, ingpa);
+ ret = kvm_hvcall_signal_event(vcpu, &hc);
if (ret != HV_STATUS_INVALID_PORT_ID)
break;
fallthrough; /* maybe userspace knows this conn_id */
case HVCALL_POST_MESSAGE:
/* don't bother userspace if it has no way to handle it */
- if (unlikely(rep || !to_hv_synic(vcpu)->active)) {
+ if (unlikely(hc.rep || !to_hv_synic(vcpu)->active)) {
ret = HV_STATUS_INVALID_HYPERCALL_INPUT;
break;
}
vcpu->run->exit_reason = KVM_EXIT_HYPERV;
vcpu->run->hyperv.type = KVM_EXIT_HYPERV_HCALL;
- vcpu->run->hyperv.u.hcall.input = param;
- vcpu->run->hyperv.u.hcall.params[0] = ingpa;
- vcpu->run->hyperv.u.hcall.params[1] = outgpa;
+ vcpu->run->hyperv.u.hcall.input = hc.param;
+ vcpu->run->hyperv.u.hcall.params[0] = hc.ingpa;
+ vcpu->run->hyperv.u.hcall.params[1] = hc.outgpa;
vcpu->arch.complete_userspace_io =
kvm_hv_hypercall_complete_userspace;
return 0;
case HVCALL_FLUSH_VIRTUAL_ADDRESS_LIST:
- if (unlikely(fast || !rep_cnt || rep_idx)) {
+ if (unlikely(!hc.rep_cnt || hc.rep_idx)) {
ret = HV_STATUS_INVALID_HYPERCALL_INPUT;
break;
}
- ret = kvm_hv_flush_tlb(vcpu, ingpa, rep_cnt, false);
+ ret = kvm_hv_flush_tlb(vcpu, &hc, false);
break;
case HVCALL_FLUSH_VIRTUAL_ADDRESS_SPACE:
- if (unlikely(fast || rep)) {
+ if (unlikely(hc.rep)) {
ret = HV_STATUS_INVALID_HYPERCALL_INPUT;
break;
}
- ret = kvm_hv_flush_tlb(vcpu, ingpa, rep_cnt, false);
+ ret = kvm_hv_flush_tlb(vcpu, &hc, false);
break;
case HVCALL_FLUSH_VIRTUAL_ADDRESS_LIST_EX:
- if (unlikely(fast || !rep_cnt || rep_idx)) {
+ if (unlikely(!hc.rep_cnt || hc.rep_idx)) {
ret = HV_STATUS_INVALID_HYPERCALL_INPUT;
break;
}
- ret = kvm_hv_flush_tlb(vcpu, ingpa, rep_cnt, true);
+ ret = kvm_hv_flush_tlb(vcpu, &hc, true);
break;
case HVCALL_FLUSH_VIRTUAL_ADDRESS_SPACE_EX:
- if (unlikely(fast || rep)) {
+ if (unlikely(hc.rep)) {
ret = HV_STATUS_INVALID_HYPERCALL_INPUT;
break;
}
- ret = kvm_hv_flush_tlb(vcpu, ingpa, rep_cnt, true);
+ ret = kvm_hv_flush_tlb(vcpu, &hc, true);
break;
case HVCALL_SEND_IPI:
- if (unlikely(rep)) {
+ if (unlikely(hc.rep)) {
ret = HV_STATUS_INVALID_HYPERCALL_INPUT;
break;
}
- ret = kvm_hv_send_ipi(vcpu, ingpa, outgpa, false, fast);
+ ret = kvm_hv_send_ipi(vcpu, &hc, false);
break;
case HVCALL_SEND_IPI_EX:
- if (unlikely(fast || rep)) {
+ if (unlikely(hc.fast || hc.rep)) {
ret = HV_STATUS_INVALID_HYPERCALL_INPUT;
break;
}
- ret = kvm_hv_send_ipi(vcpu, ingpa, outgpa, true, false);
+ ret = kvm_hv_send_ipi(vcpu, &hc, true);
break;
case HVCALL_POST_DEBUG_DATA:
case HVCALL_RETRIEVE_DEBUG_DATA:
- if (unlikely(fast)) {
+ if (unlikely(hc.fast)) {
ret = HV_STATUS_INVALID_PARAMETER;
break;
}
@@ -2020,9 +2278,9 @@ int kvm_hv_hypercall(struct kvm_vcpu *vcpu)
}
vcpu->run->exit_reason = KVM_EXIT_HYPERV;
vcpu->run->hyperv.type = KVM_EXIT_HYPERV_HCALL;
- vcpu->run->hyperv.u.hcall.input = param;
- vcpu->run->hyperv.u.hcall.params[0] = ingpa;
- vcpu->run->hyperv.u.hcall.params[1] = outgpa;
+ vcpu->run->hyperv.u.hcall.input = hc.param;
+ vcpu->run->hyperv.u.hcall.params[0] = hc.ingpa;
+ vcpu->run->hyperv.u.hcall.params[1] = hc.outgpa;
vcpu->arch.complete_userspace_io =
kvm_hv_hypercall_complete_userspace;
return 0;
@@ -2032,6 +2290,7 @@ int kvm_hv_hypercall(struct kvm_vcpu *vcpu)
break;
}
+hypercall_complete:
return kvm_hv_hypercall_complete(vcpu, ret);
}
@@ -2180,6 +2439,7 @@ int kvm_get_hv_cpuid(struct kvm_vcpu *vcpu, struct kvm_cpuid2 *cpuid,
ent->ebx |= HV_POST_MESSAGES;
ent->ebx |= HV_SIGNAL_EVENTS;
+ ent->edx |= HV_X64_HYPERCALL_XMM_INPUT_AVAILABLE;
ent->edx |= HV_FEATURE_FREQUENCY_MSRS_AVAILABLE;
ent->edx |= HV_FEATURE_GUEST_CRASH_MSR_AVAILABLE;
diff --git a/arch/x86/kvm/hyperv.h b/arch/x86/kvm/hyperv.h
index 60547d5cb6d7..730da8537d05 100644
--- a/arch/x86/kvm/hyperv.h
+++ b/arch/x86/kvm/hyperv.h
@@ -138,6 +138,7 @@ void kvm_hv_invalidate_tsc_page(struct kvm *kvm);
void kvm_hv_init_vm(struct kvm *kvm);
void kvm_hv_destroy_vm(struct kvm *kvm);
void kvm_hv_set_cpuid(struct kvm_vcpu *vcpu);
+int kvm_hv_set_enforce_cpuid(struct kvm_vcpu *vcpu, bool enforce);
int kvm_vm_ioctl_hv_eventfd(struct kvm *kvm, struct kvm_hyperv_eventfd *args);
int kvm_get_hv_cpuid(struct kvm_vcpu *vcpu, struct kvm_cpuid2 *cpuid,
struct kvm_cpuid_entry2 __user *entries);
diff --git a/arch/x86/kvm/kvm_cache_regs.h b/arch/x86/kvm/kvm_cache_regs.h
index 3db5c42c9ecd..90e1ffdc05b7 100644
--- a/arch/x86/kvm/kvm_cache_regs.h
+++ b/arch/x86/kvm/kvm_cache_regs.h
@@ -55,6 +55,13 @@ static inline void kvm_register_mark_available(struct kvm_vcpu *vcpu,
__set_bit(reg, (unsigned long *)&vcpu->arch.regs_avail);
}
+static inline void kvm_register_clear_available(struct kvm_vcpu *vcpu,
+ enum kvm_reg reg)
+{
+ __clear_bit(reg, (unsigned long *)&vcpu->arch.regs_avail);
+ __clear_bit(reg, (unsigned long *)&vcpu->arch.regs_dirty);
+}
+
static inline void kvm_register_mark_dirty(struct kvm_vcpu *vcpu,
enum kvm_reg reg)
{
@@ -118,6 +125,11 @@ static inline u64 kvm_pdptr_read(struct kvm_vcpu *vcpu, int index)
return vcpu->arch.walk_mmu->pdptrs[index];
}
+static inline void kvm_pdptr_write(struct kvm_vcpu *vcpu, int index, u64 value)
+{
+ vcpu->arch.walk_mmu->pdptrs[index] = value;
+}
+
static inline ulong kvm_read_cr0_bits(struct kvm_vcpu *vcpu, ulong mask)
{
ulong tmask = mask & KVM_POSSIBLE_CR0_GUEST_BITS;
@@ -162,6 +174,7 @@ static inline u64 kvm_read_edx_eax(struct kvm_vcpu *vcpu)
static inline void enter_guest_mode(struct kvm_vcpu *vcpu)
{
vcpu->arch.hflags |= HF_GUEST_MASK;
+ vcpu->stat.guest_mode = 1;
}
static inline void leave_guest_mode(struct kvm_vcpu *vcpu)
@@ -172,6 +185,8 @@ static inline void leave_guest_mode(struct kvm_vcpu *vcpu)
vcpu->arch.load_eoi_exitmap_pending = false;
kvm_make_request(KVM_REQ_LOAD_EOI_EXITMAP, vcpu);
}
+
+ vcpu->stat.guest_mode = 0;
}
static inline bool is_guest_mode(struct kvm_vcpu *vcpu)
diff --git a/arch/x86/kvm/kvm_emulate.h b/arch/x86/kvm/kvm_emulate.h
index 3e870bf9ca4d..68b420289d7e 100644
--- a/arch/x86/kvm/kvm_emulate.h
+++ b/arch/x86/kvm/kvm_emulate.h
@@ -13,6 +13,7 @@
#define _ASM_X86_KVM_X86_EMULATE_H
#include <asm/desc_defs.h>
+#include "fpu.h"
struct x86_emulate_ctxt;
enum x86_intercept;
@@ -229,15 +230,12 @@ struct x86_emulate_ops {
void (*set_nmi_mask)(struct x86_emulate_ctxt *ctxt, bool masked);
unsigned (*get_hflags)(struct x86_emulate_ctxt *ctxt);
- void (*set_hflags)(struct x86_emulate_ctxt *ctxt, unsigned hflags);
- int (*pre_leave_smm)(struct x86_emulate_ctxt *ctxt,
- const char *smstate);
- void (*post_leave_smm)(struct x86_emulate_ctxt *ctxt);
+ void (*exiting_smm)(struct x86_emulate_ctxt *ctxt);
+ int (*leave_smm)(struct x86_emulate_ctxt *ctxt, const char *smstate);
+ void (*triple_fault)(struct x86_emulate_ctxt *ctxt);
int (*set_xcr)(struct x86_emulate_ctxt *ctxt, u32 index, u64 xcr);
};
-typedef u32 __attribute__((vector_size(16))) sse128_t;
-
/* Type, address-of, and value of an instruction's operand. */
struct operand {
enum { OP_REG, OP_MEM, OP_MEM_STR, OP_IMM, OP_XMM, OP_MM, OP_NONE } type;
diff --git a/arch/x86/kvm/kvm_onhyperv.c b/arch/x86/kvm/kvm_onhyperv.c
new file mode 100644
index 000000000000..c7db2df50a7a
--- /dev/null
+++ b/arch/x86/kvm/kvm_onhyperv.c
@@ -0,0 +1,93 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * KVM L1 hypervisor optimizations on Hyper-V.
+ */
+
+#include <linux/kvm_host.h>
+#include <asm/mshyperv.h>
+
+#include "hyperv.h"
+#include "kvm_onhyperv.h"
+
+static int kvm_fill_hv_flush_list_func(struct hv_guest_mapping_flush_list *flush,
+ void *data)
+{
+ struct kvm_tlb_range *range = data;
+
+ return hyperv_fill_flush_guest_mapping_list(flush, range->start_gfn,
+ range->pages);
+}
+
+static inline int hv_remote_flush_root_tdp(hpa_t root_tdp,
+ struct kvm_tlb_range *range)
+{
+ if (range)
+ return hyperv_flush_guest_mapping_range(root_tdp,
+ kvm_fill_hv_flush_list_func, (void *)range);
+ else
+ return hyperv_flush_guest_mapping(root_tdp);
+}
+
+int hv_remote_flush_tlb_with_range(struct kvm *kvm,
+ struct kvm_tlb_range *range)
+{
+ struct kvm_arch *kvm_arch = &kvm->arch;
+ struct kvm_vcpu *vcpu;
+ int ret = 0, i, nr_unique_valid_roots;
+ hpa_t root;
+
+ spin_lock(&kvm_arch->hv_root_tdp_lock);
+
+ if (!VALID_PAGE(kvm_arch->hv_root_tdp)) {
+ nr_unique_valid_roots = 0;
+
+ /*
+ * Flush all valid roots, and see if all vCPUs have converged
+ * on a common root, in which case future flushes can skip the
+ * loop and flush the common root.
+ */
+ kvm_for_each_vcpu(i, vcpu, kvm) {
+ root = vcpu->arch.hv_root_tdp;
+ if (!VALID_PAGE(root) || root == kvm_arch->hv_root_tdp)
+ continue;
+
+ /*
+ * Set the tracked root to the first valid root. Keep
+ * this root for the entirety of the loop even if more
+ * roots are encountered as a low effort optimization
+ * to avoid flushing the same (first) root again.
+ */
+ if (++nr_unique_valid_roots == 1)
+ kvm_arch->hv_root_tdp = root;
+
+ if (!ret)
+ ret = hv_remote_flush_root_tdp(root, range);
+
+ /*
+ * Stop processing roots if a failure occurred and
+ * multiple valid roots have already been detected.
+ */
+ if (ret && nr_unique_valid_roots > 1)
+ break;
+ }
+
+ /*
+ * The optimized flush of a single root can't be used if there
+ * are multiple valid roots (obviously).
+ */
+ if (nr_unique_valid_roots > 1)
+ kvm_arch->hv_root_tdp = INVALID_PAGE;
+ } else {
+ ret = hv_remote_flush_root_tdp(kvm_arch->hv_root_tdp, range);
+ }
+
+ spin_unlock(&kvm_arch->hv_root_tdp_lock);
+ return ret;
+}
+EXPORT_SYMBOL_GPL(hv_remote_flush_tlb_with_range);
+
+int hv_remote_flush_tlb(struct kvm *kvm)
+{
+ return hv_remote_flush_tlb_with_range(kvm, NULL);
+}
+EXPORT_SYMBOL_GPL(hv_remote_flush_tlb);
diff --git a/arch/x86/kvm/kvm_onhyperv.h b/arch/x86/kvm/kvm_onhyperv.h
new file mode 100644
index 000000000000..1c67abf2eba9
--- /dev/null
+++ b/arch/x86/kvm/kvm_onhyperv.h
@@ -0,0 +1,32 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+/*
+ * KVM L1 hypervisor optimizations on Hyper-V.
+ */
+
+#ifndef __ARCH_X86_KVM_KVM_ONHYPERV_H__
+#define __ARCH_X86_KVM_KVM_ONHYPERV_H__
+
+#if IS_ENABLED(CONFIG_HYPERV)
+int hv_remote_flush_tlb_with_range(struct kvm *kvm,
+ struct kvm_tlb_range *range);
+int hv_remote_flush_tlb(struct kvm *kvm);
+
+static inline void hv_track_root_tdp(struct kvm_vcpu *vcpu, hpa_t root_tdp)
+{
+ struct kvm_arch *kvm_arch = &vcpu->kvm->arch;
+
+ if (kvm_x86_ops.tlb_remote_flush == hv_remote_flush_tlb) {
+ spin_lock(&kvm_arch->hv_root_tdp_lock);
+ vcpu->arch.hv_root_tdp = root_tdp;
+ if (root_tdp != kvm_arch->hv_root_tdp)
+ kvm_arch->hv_root_tdp = INVALID_PAGE;
+ spin_unlock(&kvm_arch->hv_root_tdp_lock);
+ }
+}
+#else /* !CONFIG_HYPERV */
+static inline void hv_track_root_tdp(struct kvm_vcpu *vcpu, hpa_t root_tdp)
+{
+}
+#endif /* !CONFIG_HYPERV */
+
+#endif
diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c
index 17fa4ab1b834..ba5a27879f1d 100644
--- a/arch/x86/kvm/lapic.c
+++ b/arch/x86/kvm/lapic.c
@@ -2631,6 +2631,7 @@ int kvm_apic_set_state(struct kvm_vcpu *vcpu, struct kvm_lapic_state *s)
apic_manage_nmi_watchdog(apic, kvm_lapic_get_reg(apic, APIC_LVT0));
update_divide_count(apic);
__start_apic_timer(apic, APIC_TMCCT);
+ kvm_lapic_set_reg(apic, APIC_TMCCT, 0);
kvm_apic_update_apicv(vcpu);
apic->highest_isr_cache = -1;
if (vcpu->arch.apicv_active) {
@@ -2872,7 +2873,7 @@ int kvm_lapic_enable_pv_eoi(struct kvm_vcpu *vcpu, u64 data, unsigned long len)
return kvm_gfn_to_hva_cache_init(vcpu->kvm, ghc, addr, new_len);
}
-void kvm_apic_accept_events(struct kvm_vcpu *vcpu)
+int kvm_apic_accept_events(struct kvm_vcpu *vcpu)
{
struct kvm_lapic *apic = vcpu->arch.apic;
u8 sipi_vector;
@@ -2880,7 +2881,7 @@ void kvm_apic_accept_events(struct kvm_vcpu *vcpu)
unsigned long pe;
if (!lapic_in_kernel(vcpu))
- return;
+ return 0;
/*
* Read pending events before calling the check_events
@@ -2888,12 +2889,12 @@ void kvm_apic_accept_events(struct kvm_vcpu *vcpu)
*/
pe = smp_load_acquire(&apic->pending_events);
if (!pe)
- return;
+ return 0;
if (is_guest_mode(vcpu)) {
r = kvm_check_nested_events(vcpu);
if (r < 0)
- return;
+ return r == -EBUSY ? 0 : r;
/*
* If an event has happened and caused a vmexit,
* we know INITs are latched and therefore
@@ -2914,7 +2915,7 @@ void kvm_apic_accept_events(struct kvm_vcpu *vcpu)
WARN_ON_ONCE(vcpu->arch.mp_state == KVM_MP_STATE_INIT_RECEIVED);
if (test_bit(KVM_APIC_SIPI, &pe))
clear_bit(KVM_APIC_SIPI, &apic->pending_events);
- return;
+ return 0;
}
if (test_bit(KVM_APIC_INIT, &pe)) {
@@ -2935,6 +2936,7 @@ void kvm_apic_accept_events(struct kvm_vcpu *vcpu)
vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE;
}
}
+ return 0;
}
void kvm_lapic_exit(void)
diff --git a/arch/x86/kvm/lapic.h b/arch/x86/kvm/lapic.h
index 997c45a5963a..d7c25d0c1354 100644
--- a/arch/x86/kvm/lapic.h
+++ b/arch/x86/kvm/lapic.h
@@ -76,7 +76,7 @@ void kvm_free_lapic(struct kvm_vcpu *vcpu);
int kvm_apic_has_interrupt(struct kvm_vcpu *vcpu);
int kvm_apic_accept_pic_intr(struct kvm_vcpu *vcpu);
int kvm_get_apic_interrupt(struct kvm_vcpu *vcpu);
-void kvm_apic_accept_events(struct kvm_vcpu *vcpu);
+int kvm_apic_accept_events(struct kvm_vcpu *vcpu);
void kvm_lapic_reset(struct kvm_vcpu *vcpu, bool init_event);
u64 kvm_lapic_get_cr8(struct kvm_vcpu *vcpu);
void kvm_lapic_set_tpr(struct kvm_vcpu *vcpu, unsigned long cr8);
diff --git a/arch/x86/kvm/mmu.h b/arch/x86/kvm/mmu.h
index 88d0ed5225a4..83e6c6965f1e 100644
--- a/arch/x86/kvm/mmu.h
+++ b/arch/x86/kvm/mmu.h
@@ -44,6 +44,12 @@
#define PT32_ROOT_LEVEL 2
#define PT32E_ROOT_LEVEL 3
+#define KVM_MMU_CR4_ROLE_BITS (X86_CR4_PGE | X86_CR4_PSE | X86_CR4_PAE | \
+ X86_CR4_SMEP | X86_CR4_SMAP | X86_CR4_PKE | \
+ X86_CR4_LA57)
+
+#define KVM_MMU_CR0_ROLE_BITS (X86_CR0_PG | X86_CR0_WP)
+
static __always_inline u64 rsvd_bits(int s, int e)
{
BUILD_BUG_ON(__builtin_constant_p(e) && __builtin_constant_p(s) && e < s);
@@ -62,12 +68,9 @@ static __always_inline u64 rsvd_bits(int s, int e)
void kvm_mmu_set_mmio_spte_mask(u64 mmio_value, u64 mmio_mask, u64 access_mask);
void kvm_mmu_set_ept_masks(bool has_ad_bits, bool has_exec_only);
-void
-reset_shadow_zero_bits_mask(struct kvm_vcpu *vcpu, struct kvm_mmu *context);
-
-void kvm_init_mmu(struct kvm_vcpu *vcpu, bool reset_roots);
-void kvm_init_shadow_npt_mmu(struct kvm_vcpu *vcpu, u32 cr0, u32 cr4, u32 efer,
- gpa_t nested_cr3);
+void kvm_init_mmu(struct kvm_vcpu *vcpu);
+void kvm_init_shadow_npt_mmu(struct kvm_vcpu *vcpu, unsigned long cr0,
+ unsigned long cr4, u64 efer, gpa_t nested_cr3);
void kvm_init_shadow_ept_mmu(struct kvm_vcpu *vcpu, bool execonly,
bool accessed_dirty, gpa_t new_eptp);
bool kvm_can_do_async_pf(struct kvm_vcpu *vcpu);
@@ -162,11 +165,6 @@ static inline bool is_writable_pte(unsigned long pte)
return pte & PT_WRITABLE_MASK;
}
-static inline bool is_write_protection(struct kvm_vcpu *vcpu)
-{
- return kvm_read_cr0_bits(vcpu, X86_CR0_WP);
-}
-
/*
* Check if a given access (described through the I/D, W/R and U/S bits of a
* page fault error code pfec) causes a permission fault with the given PTE
@@ -232,4 +230,14 @@ int kvm_arch_write_log_dirty(struct kvm_vcpu *vcpu);
int kvm_mmu_post_init_vm(struct kvm *kvm);
void kvm_mmu_pre_destroy_vm(struct kvm *kvm);
+static inline bool kvm_memslots_have_rmaps(struct kvm *kvm)
+{
+ /*
+ * Read memslot_have_rmaps before rmap pointers. Hence, threads reading
+ * memslots_have_rmaps in any lock context are guaranteed to see the
+ * pointers. Pairs with smp_store_release in alloc_all_memslots_rmaps.
+ */
+ return smp_load_acquire(&kvm->arch.memslots_have_rmaps);
+}
+
#endif
diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c
index 70979358a0ad..845d114ae075 100644
--- a/arch/x86/kvm/mmu/mmu.c
+++ b/arch/x86/kvm/mmu/mmu.c
@@ -55,7 +55,7 @@
extern bool itlb_multihit_kvm_mitigation;
-static int __read_mostly nx_huge_pages = -1;
+int __read_mostly nx_huge_pages = -1;
#ifdef CONFIG_PREEMPT_RT
/* Recovery can cause latency spikes, disable it for PREEMPT_RT. */
static uint __read_mostly nx_huge_pages_recovery_ratio = 0;
@@ -176,9 +176,80 @@ static void mmu_spte_set(u64 *sptep, u64 spte);
static union kvm_mmu_page_role
kvm_mmu_calc_root_page_role(struct kvm_vcpu *vcpu);
+struct kvm_mmu_role_regs {
+ const unsigned long cr0;
+ const unsigned long cr4;
+ const u64 efer;
+};
+
#define CREATE_TRACE_POINTS
#include "mmutrace.h"
+/*
+ * Yes, lot's of underscores. They're a hint that you probably shouldn't be
+ * reading from the role_regs. Once the mmu_role is constructed, it becomes
+ * the single source of truth for the MMU's state.
+ */
+#define BUILD_MMU_ROLE_REGS_ACCESSOR(reg, name, flag) \
+static inline bool ____is_##reg##_##name(struct kvm_mmu_role_regs *regs)\
+{ \
+ return !!(regs->reg & flag); \
+}
+BUILD_MMU_ROLE_REGS_ACCESSOR(cr0, pg, X86_CR0_PG);
+BUILD_MMU_ROLE_REGS_ACCESSOR(cr0, wp, X86_CR0_WP);
+BUILD_MMU_ROLE_REGS_ACCESSOR(cr4, pse, X86_CR4_PSE);
+BUILD_MMU_ROLE_REGS_ACCESSOR(cr4, pae, X86_CR4_PAE);
+BUILD_MMU_ROLE_REGS_ACCESSOR(cr4, smep, X86_CR4_SMEP);
+BUILD_MMU_ROLE_REGS_ACCESSOR(cr4, smap, X86_CR4_SMAP);
+BUILD_MMU_ROLE_REGS_ACCESSOR(cr4, pke, X86_CR4_PKE);
+BUILD_MMU_ROLE_REGS_ACCESSOR(cr4, la57, X86_CR4_LA57);
+BUILD_MMU_ROLE_REGS_ACCESSOR(efer, nx, EFER_NX);
+BUILD_MMU_ROLE_REGS_ACCESSOR(efer, lma, EFER_LMA);
+
+/*
+ * The MMU itself (with a valid role) is the single source of truth for the
+ * MMU. Do not use the regs used to build the MMU/role, nor the vCPU. The
+ * regs don't account for dependencies, e.g. clearing CR4 bits if CR0.PG=1,
+ * and the vCPU may be incorrect/irrelevant.
+ */
+#define BUILD_MMU_ROLE_ACCESSOR(base_or_ext, reg, name) \
+static inline bool is_##reg##_##name(struct kvm_mmu *mmu) \
+{ \
+ return !!(mmu->mmu_role. base_or_ext . reg##_##name); \
+}
+BUILD_MMU_ROLE_ACCESSOR(ext, cr0, pg);
+BUILD_MMU_ROLE_ACCESSOR(base, cr0, wp);
+BUILD_MMU_ROLE_ACCESSOR(ext, cr4, pse);
+BUILD_MMU_ROLE_ACCESSOR(ext, cr4, pae);
+BUILD_MMU_ROLE_ACCESSOR(ext, cr4, smep);
+BUILD_MMU_ROLE_ACCESSOR(ext, cr4, smap);
+BUILD_MMU_ROLE_ACCESSOR(ext, cr4, pke);
+BUILD_MMU_ROLE_ACCESSOR(ext, cr4, la57);
+BUILD_MMU_ROLE_ACCESSOR(base, efer, nx);
+
+static struct kvm_mmu_role_regs vcpu_to_role_regs(struct kvm_vcpu *vcpu)
+{
+ struct kvm_mmu_role_regs regs = {
+ .cr0 = kvm_read_cr0_bits(vcpu, KVM_MMU_CR0_ROLE_BITS),
+ .cr4 = kvm_read_cr4_bits(vcpu, KVM_MMU_CR4_ROLE_BITS),
+ .efer = vcpu->arch.efer,
+ };
+
+ return regs;
+}
+
+static int role_regs_to_root_level(struct kvm_mmu_role_regs *regs)
+{
+ if (!____is_cr0_pg(regs))
+ return 0;
+ else if (____is_efer_lma(regs))
+ return ____is_cr4_la57(regs) ? PT64_ROOT_5LEVEL :
+ PT64_ROOT_4LEVEL;
+ else if (____is_cr4_pae(regs))
+ return PT32E_ROOT_LEVEL;
+ else
+ return PT32_ROOT_LEVEL;
+}
static inline bool kvm_available_flush_tlb_with_range(void)
{
@@ -208,11 +279,6 @@ void kvm_flush_remote_tlbs_with_address(struct kvm *kvm,
kvm_flush_remote_tlbs_with_range(kvm, &range);
}
-bool is_nx_huge_page_enabled(void)
-{
- return READ_ONCE(nx_huge_pages);
-}
-
static void mark_mmio_spte(struct kvm_vcpu *vcpu, u64 *sptep, u64 gfn,
unsigned int access)
{
@@ -269,11 +335,6 @@ static int is_cpuid_PSE36(void)
return 1;
}
-static int is_nx(struct kvm_vcpu *vcpu)
-{
- return vcpu->arch.efer & EFER_NX;
-}
-
static gfn_t pse36_gfn_delta(u32 gpte)
{
int shift = 32 - PT32_DIR_PSE36_SHIFT - PAGE_SHIFT;
@@ -1177,8 +1238,7 @@ static bool __rmap_clear_dirty(struct kvm *kvm, struct kvm_rmap_head *rmap_head,
* @gfn_offset: start of the BITS_PER_LONG pages we care about
* @mask: indicates which pages we should protect
*
- * Used when we do not need to care about huge page mappings: e.g. during dirty
- * logging we do not have any such mappings.
+ * Used when we do not need to care about huge page mappings.
*/
static void kvm_mmu_write_protect_pt_masked(struct kvm *kvm,
struct kvm_memory_slot *slot,
@@ -1189,6 +1249,10 @@ static void kvm_mmu_write_protect_pt_masked(struct kvm *kvm,
if (is_tdp_mmu_enabled(kvm))
kvm_tdp_mmu_clear_dirty_pt_masked(kvm, slot,
slot->base_gfn + gfn_offset, mask, true);
+
+ if (!kvm_memslots_have_rmaps(kvm))
+ return;
+
while (mask) {
rmap_head = __gfn_to_rmap(slot->base_gfn + gfn_offset + __ffs(mask),
PG_LEVEL_4K, slot);
@@ -1218,6 +1282,10 @@ static void kvm_mmu_clear_dirty_pt_masked(struct kvm *kvm,
if (is_tdp_mmu_enabled(kvm))
kvm_tdp_mmu_clear_dirty_pt_masked(kvm, slot,
slot->base_gfn + gfn_offset, mask, false);
+
+ if (!kvm_memslots_have_rmaps(kvm))
+ return;
+
while (mask) {
rmap_head = __gfn_to_rmap(slot->base_gfn + gfn_offset + __ffs(mask),
PG_LEVEL_4K, slot);
@@ -1235,13 +1303,36 @@ static void kvm_mmu_clear_dirty_pt_masked(struct kvm *kvm,
* It calls kvm_mmu_write_protect_pt_masked to write protect selected pages to
* enable dirty logging for them.
*
- * Used when we do not need to care about huge page mappings: e.g. during dirty
- * logging we do not have any such mappings.
+ * We need to care about huge page mappings: e.g. during dirty logging we may
+ * have such mappings.
*/
void kvm_arch_mmu_enable_log_dirty_pt_masked(struct kvm *kvm,
struct kvm_memory_slot *slot,
gfn_t gfn_offset, unsigned long mask)
{
+ /*
+ * Huge pages are NOT write protected when we start dirty logging in
+ * initially-all-set mode; must write protect them here so that they
+ * are split to 4K on the first write.
+ *
+ * The gfn_offset is guaranteed to be aligned to 64, but the base_gfn
+ * of memslot has no such restriction, so the range can cross two large
+ * pages.
+ */
+ if (kvm_dirty_log_manual_protect_and_init_set(kvm)) {
+ gfn_t start = slot->base_gfn + gfn_offset + __ffs(mask);
+ gfn_t end = slot->base_gfn + gfn_offset + __fls(mask);
+
+ kvm_mmu_slot_gfn_write_protect(kvm, slot, start, PG_LEVEL_2M);
+
+ /* Cross two large pages? */
+ if (ALIGN(start << PAGE_SHIFT, PMD_SIZE) !=
+ ALIGN(end << PAGE_SHIFT, PMD_SIZE))
+ kvm_mmu_slot_gfn_write_protect(kvm, slot, end,
+ PG_LEVEL_2M);
+ }
+
+ /* Now handle 4K PTEs. */
if (kvm_x86_ops.cpu_dirty_log_size)
kvm_mmu_clear_dirty_pt_masked(kvm, slot, gfn_offset, mask);
else
@@ -1254,20 +1345,23 @@ int kvm_cpu_dirty_log_size(void)
}
bool kvm_mmu_slot_gfn_write_protect(struct kvm *kvm,
- struct kvm_memory_slot *slot, u64 gfn)
+ struct kvm_memory_slot *slot, u64 gfn,
+ int min_level)
{
struct kvm_rmap_head *rmap_head;
int i;
bool write_protected = false;
- for (i = PG_LEVEL_4K; i <= KVM_MAX_HUGEPAGE_LEVEL; ++i) {
- rmap_head = __gfn_to_rmap(gfn, i, slot);
- write_protected |= __rmap_write_protect(kvm, rmap_head, true);
+ if (kvm_memslots_have_rmaps(kvm)) {
+ for (i = min_level; i <= KVM_MAX_HUGEPAGE_LEVEL; ++i) {
+ rmap_head = __gfn_to_rmap(gfn, i, slot);
+ write_protected |= __rmap_write_protect(kvm, rmap_head, true);
+ }
}
if (is_tdp_mmu_enabled(kvm))
write_protected |=
- kvm_tdp_mmu_write_protect_gfn(kvm, slot, gfn);
+ kvm_tdp_mmu_write_protect_gfn(kvm, slot, gfn, min_level);
return write_protected;
}
@@ -1277,7 +1371,7 @@ static bool rmap_write_protect(struct kvm_vcpu *vcpu, u64 gfn)
struct kvm_memory_slot *slot;
slot = kvm_vcpu_gfn_to_memslot(vcpu, gfn);
- return kvm_mmu_slot_gfn_write_protect(vcpu->kvm, slot, gfn);
+ return kvm_mmu_slot_gfn_write_protect(vcpu->kvm, slot, gfn, PG_LEVEL_4K);
}
static bool kvm_zap_rmapp(struct kvm *kvm, struct kvm_rmap_head *rmap_head,
@@ -1433,9 +1527,10 @@ static __always_inline bool kvm_handle_gfn_range(struct kvm *kvm,
bool kvm_unmap_gfn_range(struct kvm *kvm, struct kvm_gfn_range *range)
{
- bool flush;
+ bool flush = false;
- flush = kvm_handle_gfn_range(kvm, range, kvm_unmap_rmapp);
+ if (kvm_memslots_have_rmaps(kvm))
+ flush = kvm_handle_gfn_range(kvm, range, kvm_unmap_rmapp);
if (is_tdp_mmu_enabled(kvm))
flush |= kvm_tdp_mmu_unmap_gfn_range(kvm, range, flush);
@@ -1445,9 +1540,10 @@ bool kvm_unmap_gfn_range(struct kvm *kvm, struct kvm_gfn_range *range)
bool kvm_set_spte_gfn(struct kvm *kvm, struct kvm_gfn_range *range)
{
- bool flush;
+ bool flush = false;
- flush = kvm_handle_gfn_range(kvm, range, kvm_set_pte_rmapp);
+ if (kvm_memslots_have_rmaps(kvm))
+ flush = kvm_handle_gfn_range(kvm, range, kvm_set_pte_rmapp);
if (is_tdp_mmu_enabled(kvm))
flush |= kvm_tdp_mmu_set_spte_gfn(kvm, range);
@@ -1500,9 +1596,10 @@ static void rmap_recycle(struct kvm_vcpu *vcpu, u64 *spte, gfn_t gfn)
bool kvm_age_gfn(struct kvm *kvm, struct kvm_gfn_range *range)
{
- bool young;
+ bool young = false;
- young = kvm_handle_gfn_range(kvm, range, kvm_age_rmapp);
+ if (kvm_memslots_have_rmaps(kvm))
+ young = kvm_handle_gfn_range(kvm, range, kvm_age_rmapp);
if (is_tdp_mmu_enabled(kvm))
young |= kvm_tdp_mmu_age_gfn_range(kvm, range);
@@ -1512,9 +1609,10 @@ bool kvm_age_gfn(struct kvm *kvm, struct kvm_gfn_range *range)
bool kvm_test_age_gfn(struct kvm *kvm, struct kvm_gfn_range *range)
{
- bool young;
+ bool young = false;
- young = kvm_handle_gfn_range(kvm, range, kvm_test_age_rmapp);
+ if (kvm_memslots_have_rmaps(kvm))
+ young = kvm_handle_gfn_range(kvm, range, kvm_test_age_rmapp);
if (is_tdp_mmu_enabled(kvm))
young |= kvm_tdp_mmu_test_age_gfn(kvm, range);
@@ -1748,17 +1846,10 @@ static void kvm_mmu_commit_zap_page(struct kvm *kvm,
&(_kvm)->arch.mmu_page_hash[kvm_page_table_hashfn(_gfn)]) \
if ((_sp)->gfn != (_gfn) || (_sp)->role.direct) {} else
-static inline bool is_ept_sp(struct kvm_mmu_page *sp)
-{
- return sp->role.cr0_wp && sp->role.smap_andnot_wp;
-}
-
-/* @sp->gfn should be write-protected at the call site */
-static bool __kvm_sync_page(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp,
- struct list_head *invalid_list)
+static bool kvm_sync_page(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp,
+ struct list_head *invalid_list)
{
- if ((!is_ept_sp(sp) && sp->role.gpte_is_8_bytes != !!is_pae(vcpu)) ||
- vcpu->arch.mmu->sync_page(vcpu, sp) == 0) {
+ if (vcpu->arch.mmu->sync_page(vcpu, sp) == 0) {
kvm_mmu_prepare_zap_page(vcpu->kvm, sp, invalid_list);
return false;
}
@@ -1804,31 +1895,6 @@ static bool is_obsolete_sp(struct kvm *kvm, struct kvm_mmu_page *sp)
unlikely(sp->mmu_valid_gen != kvm->arch.mmu_valid_gen);
}
-static bool kvm_sync_page(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp,
- struct list_head *invalid_list)
-{
- kvm_unlink_unsync_page(vcpu->kvm, sp);
- return __kvm_sync_page(vcpu, sp, invalid_list);
-}
-
-/* @gfn should be write-protected at the call site */
-static bool kvm_sync_pages(struct kvm_vcpu *vcpu, gfn_t gfn,
- struct list_head *invalid_list)
-{
- struct kvm_mmu_page *s;
- bool ret = false;
-
- for_each_gfn_indirect_valid_sp(vcpu->kvm, s, gfn) {
- if (!s->unsync)
- continue;
-
- WARN_ON(s->role.level != PG_LEVEL_4K);
- ret |= kvm_sync_page(vcpu, s, invalid_list);
- }
-
- return ret;
-}
-
struct mmu_page_path {
struct kvm_mmu_page *parent[PT64_ROOT_MAX_LEVEL];
unsigned int idx[PT64_ROOT_MAX_LEVEL];
@@ -1923,6 +1989,7 @@ static void mmu_sync_children(struct kvm_vcpu *vcpu,
}
for_each_sp(pages, sp, parents, i) {
+ kvm_unlink_unsync_page(vcpu->kvm, sp);
flush |= kvm_sync_page(vcpu, sp, &invalid_list);
mmu_pages_clear_parents(&parents);
}
@@ -1958,8 +2025,6 @@ static struct kvm_mmu_page *kvm_mmu_get_page(struct kvm_vcpu *vcpu,
struct hlist_head *sp_list;
unsigned quadrant;
struct kvm_mmu_page *sp;
- bool need_sync = false;
- bool flush = false;
int collisions = 0;
LIST_HEAD(invalid_list);
@@ -1982,20 +2047,39 @@ static struct kvm_mmu_page *kvm_mmu_get_page(struct kvm_vcpu *vcpu,
continue;
}
- if (!need_sync && sp->unsync)
- need_sync = true;
-
- if (sp->role.word != role.word)
+ if (sp->role.word != role.word) {
+ /*
+ * If the guest is creating an upper-level page, zap
+ * unsync pages for the same gfn. While it's possible
+ * the guest is using recursive page tables, in all
+ * likelihood the guest has stopped using the unsync
+ * page and is installing a completely unrelated page.
+ * Unsync pages must not be left as is, because the new
+ * upper-level page will be write-protected.
+ */
+ if (level > PG_LEVEL_4K && sp->unsync)
+ kvm_mmu_prepare_zap_page(vcpu->kvm, sp,
+ &invalid_list);
continue;
+ }
if (direct_mmu)
goto trace_get_page;
if (sp->unsync) {
- /* The page is good, but __kvm_sync_page might still end
- * up zapping it. If so, break in order to rebuild it.
+ /*
+ * The page is good, but is stale. kvm_sync_page does
+ * get the latest guest state, but (unlike mmu_unsync_children)
+ * it doesn't write-protect the page or mark it synchronized!
+ * This way the validity of the mapping is ensured, but the
+ * overhead of write protection is not incurred until the
+ * guest invalidates the TLB mapping. This allows multiple
+ * SPs for a single gfn to be unsync.
+ *
+ * If the sync fails, the page is zapped. If so, break
+ * in order to rebuild it.
*/
- if (!__kvm_sync_page(vcpu, sp, &invalid_list))
+ if (!kvm_sync_page(vcpu, sp, &invalid_list))
break;
WARN_ON(!list_empty(&invalid_list));
@@ -2020,22 +2104,14 @@ trace_get_page:
sp->role = role;
hlist_add_head(&sp->hash_link, sp_list);
if (!direct) {
- /*
- * we should do write protection before syncing pages
- * otherwise the content of the synced shadow page may
- * be inconsistent with guest page table.
- */
account_shadowed(vcpu->kvm, sp);
if (level == PG_LEVEL_4K && rmap_write_protect(vcpu, gfn))
kvm_flush_remote_tlbs_with_address(vcpu->kvm, gfn, 1);
-
- if (level > PG_LEVEL_4K && need_sync)
- flush |= kvm_sync_pages(vcpu, gfn, &invalid_list);
}
trace_kvm_mmu_get_page(sp, true);
-
- kvm_mmu_flush_or_zap(vcpu, &invalid_list, false, flush);
out:
+ kvm_mmu_commit_zap_page(vcpu->kvm, &invalid_list);
+
if (collisions > vcpu->kvm->stat.max_mmu_page_hash_collisions)
vcpu->kvm->stat.max_mmu_page_hash_collisions = collisions;
return sp;
@@ -2448,17 +2524,33 @@ static void kvm_unsync_page(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp)
kvm_mmu_mark_parents_unsync(sp);
}
-bool mmu_need_write_protect(struct kvm_vcpu *vcpu, gfn_t gfn,
- bool can_unsync)
+/*
+ * Attempt to unsync any shadow pages that can be reached by the specified gfn,
+ * KVM is creating a writable mapping for said gfn. Returns 0 if all pages
+ * were marked unsync (or if there is no shadow page), -EPERM if the SPTE must
+ * be write-protected.
+ */
+int mmu_try_to_unsync_pages(struct kvm_vcpu *vcpu, gfn_t gfn, bool can_unsync)
{
struct kvm_mmu_page *sp;
+ /*
+ * Force write-protection if the page is being tracked. Note, the page
+ * track machinery is used to write-protect upper-level shadow pages,
+ * i.e. this guards the role.level == 4K assertion below!
+ */
if (kvm_page_track_is_active(vcpu, gfn, KVM_PAGE_TRACK_WRITE))
- return true;
+ return -EPERM;
+ /*
+ * The page is not write-tracked, mark existing shadow pages unsync
+ * unless KVM is synchronizing an unsync SP (can_unsync = false). In
+ * that case, KVM must complete emulation of the guest TLB flush before
+ * allowing shadow pages to become unsync (writable by the guest).
+ */
for_each_gfn_indirect_valid_sp(vcpu->kvm, sp, gfn) {
if (!can_unsync)
- return true;
+ return -EPERM;
if (sp->unsync)
continue;
@@ -2489,8 +2581,8 @@ bool mmu_need_write_protect(struct kvm_vcpu *vcpu, gfn_t gfn,
* 2.2 Guest issues TLB flush.
* That causes a VM Exit.
*
- * 2.3 kvm_mmu_sync_pages() reads sp->unsync.
- * Since it is false, so it just returns.
+ * 2.3 Walking of unsync pages sees sp->unsync is
+ * false and skips the page.
*
* 2.4 Guest accesses GVA X.
* Since the mapping in the SP was not updated,
@@ -2506,7 +2598,7 @@ bool mmu_need_write_protect(struct kvm_vcpu *vcpu, gfn_t gfn,
*/
smp_wmb();
- return false;
+ return 0;
}
static int set_spte(struct kvm_vcpu *vcpu, u64 *sptep,
@@ -2827,9 +2919,6 @@ static int __direct_map(struct kvm_vcpu *vcpu, gpa_t gpa, u32 error_code,
gfn_t gfn = gpa >> PAGE_SHIFT;
gfn_t base_gfn = gfn;
- if (WARN_ON(!VALID_PAGE(vcpu->arch.mmu->root_hpa)))
- return RET_PF_RETRY;
-
level = kvm_mmu_hugepage_adjust(vcpu, gfn, max_level, &pfn,
huge_page_disallowed, &req_level);
@@ -3180,6 +3269,33 @@ void kvm_mmu_free_roots(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu,
}
EXPORT_SYMBOL_GPL(kvm_mmu_free_roots);
+void kvm_mmu_free_guest_mode_roots(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu)
+{
+ unsigned long roots_to_free = 0;
+ hpa_t root_hpa;
+ int i;
+
+ /*
+ * This should not be called while L2 is active, L2 can't invalidate
+ * _only_ its own roots, e.g. INVVPID unconditionally exits.
+ */
+ WARN_ON_ONCE(mmu->mmu_role.base.guest_mode);
+
+ for (i = 0; i < KVM_MMU_NUM_PREV_ROOTS; i++) {
+ root_hpa = mmu->prev_roots[i].hpa;
+ if (!VALID_PAGE(root_hpa))
+ continue;
+
+ if (!to_shadow_page(root_hpa) ||
+ to_shadow_page(root_hpa)->role.guest_mode)
+ roots_to_free |= KVM_MMU_ROOT_PREVIOUS(i);
+ }
+
+ kvm_mmu_free_roots(vcpu, mmu, roots_to_free);
+}
+EXPORT_SYMBOL_GPL(kvm_mmu_free_guest_mode_roots);
+
+
static int mmu_check_root(struct kvm_vcpu *vcpu, gfn_t root_gfn)
{
int ret = 0;
@@ -3280,6 +3396,10 @@ static int mmu_alloc_shadow_roots(struct kvm_vcpu *vcpu)
}
}
+ r = alloc_all_memslots_rmaps(vcpu->kvm);
+ if (r)
+ return r;
+
write_lock(&vcpu->kvm->mmu_lock);
r = make_mmu_pages_available(vcpu);
if (r < 0)
@@ -3423,8 +3543,8 @@ void kvm_mmu_sync_roots(struct kvm_vcpu *vcpu)
* flush strictly after those changes are made. We only need to
* ensure that the other CPU sets these flags before any actual
* changes to the page tables are made. The comments in
- * mmu_need_write_protect() describe what could go wrong if this
- * requirement isn't satisfied.
+ * mmu_try_to_unsync_pages() describe what could go wrong if
+ * this requirement isn't satisfied.
*/
if (!smp_load_acquire(&sp->unsync) &&
!smp_load_acquire(&sp->unsync_children))
@@ -3474,19 +3594,6 @@ static gpa_t nonpaging_gva_to_gpa_nested(struct kvm_vcpu *vcpu, gpa_t vaddr,
return vcpu->arch.nested_mmu.translate_gpa(vcpu, vaddr, access, exception);
}
-static bool
-__is_rsvd_bits_set(struct rsvd_bits_validate *rsvd_check, u64 pte, int level)
-{
- int bit7 = (pte >> 7) & 1;
-
- return pte & rsvd_check->rsvd_bits_mask[bit7][level-1];
-}
-
-static bool __is_bad_mt_xwr(struct rsvd_bits_validate *rsvd_check, u64 pte)
-{
- return rsvd_check->bad_mt_xwr & BIT_ULL(pte & 0x3f);
-}
-
static bool mmio_info_in_cache(struct kvm_vcpu *vcpu, u64 addr, bool direct)
{
/*
@@ -3540,12 +3647,7 @@ static bool get_mmio_spte(struct kvm_vcpu *vcpu, u64 addr, u64 *sptep)
int root, leaf, level;
bool reserved = false;
- if (!VALID_PAGE(vcpu->arch.mmu->root_hpa)) {
- *sptep = 0ull;
- return reserved;
- }
-
- if (is_tdp_mmu_root(vcpu->kvm, vcpu->arch.mmu->root_hpa))
+ if (is_tdp_mmu(vcpu->arch.mmu))
leaf = kvm_tdp_mmu_get_walk(vcpu, addr, sptes, &root);
else
leaf = get_walk(vcpu, addr, sptes, &root);
@@ -3569,13 +3671,7 @@ static bool get_mmio_spte(struct kvm_vcpu *vcpu, u64 addr, u64 *sptep)
rsvd_check = &vcpu->arch.mmu->shadow_zero_check;
for (level = root; level >= leaf; level--)
- /*
- * Use a bitwise-OR instead of a logical-OR to aggregate the
- * reserved bit and EPT's invalid memtype/XWR checks to avoid
- * adding a Jcc in the loop.
- */
- reserved |= __is_bad_mt_xwr(rsvd_check, sptes[level]) |
- __is_rsvd_bits_set(rsvd_check, sptes[level], level);
+ reserved |= is_rsvd_spte(rsvd_check, sptes[level], level);
if (reserved) {
pr_err("%s: reserved bits set on MMU-present spte, addr 0x%llx, hierarchy:\n",
@@ -3583,7 +3679,7 @@ static bool get_mmio_spte(struct kvm_vcpu *vcpu, u64 addr, u64 *sptep)
for (level = root; level >= leaf; level--)
pr_err("------ spte = 0x%llx level = %d, rsvd bits = 0x%llx",
sptes[level], level,
- rsvd_check->rsvd_bits_mask[(sptes[level] >> 7) & 1][level-1]);
+ get_rsvd_bits(rsvd_check, sptes[level], level));
}
return reserved;
@@ -3717,6 +3813,7 @@ static bool try_async_pf(struct kvm_vcpu *vcpu, bool prefault, gfn_t gfn,
static int direct_page_fault(struct kvm_vcpu *vcpu, gpa_t gpa, u32 error_code,
bool prefault, int max_level, bool is_tdp)
{
+ bool is_tdp_mmu_fault = is_tdp_mmu(vcpu->arch.mmu);
bool write = error_code & PFERR_WRITE_MASK;
bool map_writable;
@@ -3729,7 +3826,7 @@ static int direct_page_fault(struct kvm_vcpu *vcpu, gpa_t gpa, u32 error_code,
if (page_fault_handle_page_track(vcpu, error_code, gfn))
return RET_PF_EMULATE;
- if (!is_tdp_mmu_root(vcpu->kvm, vcpu->arch.mmu->root_hpa)) {
+ if (!is_tdp_mmu_fault) {
r = fast_page_fault(vcpu, gpa, error_code);
if (r != RET_PF_INVALID)
return r;
@@ -3751,7 +3848,7 @@ static int direct_page_fault(struct kvm_vcpu *vcpu, gpa_t gpa, u32 error_code,
r = RET_PF_RETRY;
- if (is_tdp_mmu_root(vcpu->kvm, vcpu->arch.mmu->root_hpa))
+ if (is_tdp_mmu_fault)
read_lock(&vcpu->kvm->mmu_lock);
else
write_lock(&vcpu->kvm->mmu_lock);
@@ -3762,7 +3859,7 @@ static int direct_page_fault(struct kvm_vcpu *vcpu, gpa_t gpa, u32 error_code,
if (r)
goto out_unlock;
- if (is_tdp_mmu_root(vcpu->kvm, vcpu->arch.mmu->root_hpa))
+ if (is_tdp_mmu_fault)
r = kvm_tdp_mmu_map(vcpu, gpa, error_code, map_writable, max_level,
pfn, prefault);
else
@@ -3770,7 +3867,7 @@ static int direct_page_fault(struct kvm_vcpu *vcpu, gpa_t gpa, u32 error_code,
prefault, is_tdp);
out_unlock:
- if (is_tdp_mmu_root(vcpu->kvm, vcpu->arch.mmu->root_hpa))
+ if (is_tdp_mmu_fault)
read_unlock(&vcpu->kvm->mmu_lock);
else
write_unlock(&vcpu->kvm->mmu_lock);
@@ -3840,17 +3937,13 @@ int kvm_tdp_page_fault(struct kvm_vcpu *vcpu, gpa_t gpa, u32 error_code,
max_level, true);
}
-static void nonpaging_init_context(struct kvm_vcpu *vcpu,
- struct kvm_mmu *context)
+static void nonpaging_init_context(struct kvm_mmu *context)
{
context->page_fault = nonpaging_page_fault;
context->gva_to_gpa = nonpaging_gva_to_gpa;
context->sync_page = nonpaging_sync_page;
context->invlpg = NULL;
- context->root_level = 0;
- context->shadow_root_level = PT32E_ROOT_LEVEL;
context->direct_map = true;
- context->nx = false;
}
static inline bool is_root_usable(struct kvm_mmu_root_info *root, gpa_t pgd,
@@ -3913,8 +4006,7 @@ static bool fast_pgd_switch(struct kvm_vcpu *vcpu, gpa_t new_pgd,
}
static void __kvm_mmu_new_pgd(struct kvm_vcpu *vcpu, gpa_t new_pgd,
- union kvm_mmu_page_role new_role,
- bool skip_tlb_flush, bool skip_mmu_sync)
+ union kvm_mmu_page_role new_role)
{
if (!fast_pgd_switch(vcpu, new_pgd, new_role)) {
kvm_mmu_free_roots(vcpu, vcpu->arch.mmu, KVM_MMU_ROOT_CURRENT);
@@ -3929,10 +4021,10 @@ static void __kvm_mmu_new_pgd(struct kvm_vcpu *vcpu, gpa_t new_pgd,
*/
kvm_make_request(KVM_REQ_LOAD_MMU_PGD, vcpu);
- if (!skip_mmu_sync || force_flush_and_sync_on_reuse)
+ if (force_flush_and_sync_on_reuse) {
kvm_make_request(KVM_REQ_MMU_SYNC, vcpu);
- if (!skip_tlb_flush || force_flush_and_sync_on_reuse)
kvm_make_request(KVM_REQ_TLB_FLUSH_CURRENT, vcpu);
+ }
/*
* The last MMIO access's GVA and GPA are cached in the VCPU. When
@@ -3951,11 +4043,9 @@ static void __kvm_mmu_new_pgd(struct kvm_vcpu *vcpu, gpa_t new_pgd,
to_shadow_page(vcpu->arch.mmu->root_hpa));
}
-void kvm_mmu_new_pgd(struct kvm_vcpu *vcpu, gpa_t new_pgd, bool skip_tlb_flush,
- bool skip_mmu_sync)
+void kvm_mmu_new_pgd(struct kvm_vcpu *vcpu, gpa_t new_pgd)
{
- __kvm_mmu_new_pgd(vcpu, new_pgd, kvm_mmu_calc_root_page_role(vcpu),
- skip_tlb_flush, skip_mmu_sync);
+ __kvm_mmu_new_pgd(vcpu, new_pgd, kvm_mmu_calc_root_page_role(vcpu));
}
EXPORT_SYMBOL_GPL(kvm_mmu_new_pgd);
@@ -3981,26 +4071,6 @@ static bool sync_mmio_spte(struct kvm_vcpu *vcpu, u64 *sptep, gfn_t gfn,
return false;
}
-static inline bool is_last_gpte(struct kvm_mmu *mmu,
- unsigned level, unsigned gpte)
-{
- /*
- * The RHS has bit 7 set iff level < mmu->last_nonleaf_level.
- * If it is clear, there are no large pages at this level, so clear
- * PT_PAGE_SIZE_MASK in gpte if that is the case.
- */
- gpte &= level - mmu->last_nonleaf_level;
-
- /*
- * PG_LEVEL_4K always terminates. The RHS has bit 7 set
- * iff level <= PG_LEVEL_4K, which for our purpose means
- * level == PG_LEVEL_4K; set PT_PAGE_SIZE_MASK in gpte then.
- */
- gpte |= level - PG_LEVEL_4K - 1;
-
- return gpte & PT_PAGE_SIZE_MASK;
-}
-
#define PTTYPE_EPT 18 /* arbitrary */
#define PTTYPE PTTYPE_EPT
#include "paging_tmpl.h"
@@ -4015,8 +4085,7 @@ static inline bool is_last_gpte(struct kvm_mmu *mmu,
#undef PTTYPE
static void
-__reset_rsvds_bits_mask(struct kvm_vcpu *vcpu,
- struct rsvd_bits_validate *rsvd_check,
+__reset_rsvds_bits_mask(struct rsvd_bits_validate *rsvd_check,
u64 pa_bits_rsvd, int level, bool nx, bool gbpages,
bool pse, bool amd)
{
@@ -4105,14 +4174,29 @@ __reset_rsvds_bits_mask(struct kvm_vcpu *vcpu,
}
}
+static bool guest_can_use_gbpages(struct kvm_vcpu *vcpu)
+{
+ /*
+ * If TDP is enabled, let the guest use GBPAGES if they're supported in
+ * hardware. The hardware page walker doesn't let KVM disable GBPAGES,
+ * i.e. won't treat them as reserved, and KVM doesn't redo the GVA->GPA
+ * walk for performance and complexity reasons. Not to mention KVM
+ * _can't_ solve the problem because GVA->GPA walks aren't visible to
+ * KVM once a TDP translation is installed. Mimic hardware behavior so
+ * that KVM's is at least consistent, i.e. doesn't randomly inject #PF.
+ */
+ return tdp_enabled ? boot_cpu_has(X86_FEATURE_GBPAGES) :
+ guest_cpuid_has(vcpu, X86_FEATURE_GBPAGES);
+}
+
static void reset_rsvds_bits_mask(struct kvm_vcpu *vcpu,
struct kvm_mmu *context)
{
- __reset_rsvds_bits_mask(vcpu, &context->guest_rsvd_check,
+ __reset_rsvds_bits_mask(&context->guest_rsvd_check,
vcpu->arch.reserved_gpa_bits,
- context->root_level, context->nx,
- guest_cpuid_has(vcpu, X86_FEATURE_GBPAGES),
- is_pse(vcpu),
+ context->root_level, is_efer_nx(context),
+ guest_can_use_gbpages(vcpu),
+ is_cr4_pse(context),
guest_cpuid_is_amd_or_hygon(vcpu));
}
@@ -4165,24 +4249,32 @@ static inline u64 reserved_hpa_bits(void)
* table in guest or amd nested guest, its mmu features completely
* follow the features in guest.
*/
-void
-reset_shadow_zero_bits_mask(struct kvm_vcpu *vcpu, struct kvm_mmu *context)
+static void reset_shadow_zero_bits_mask(struct kvm_vcpu *vcpu,
+ struct kvm_mmu *context)
{
- bool uses_nx = context->nx ||
- context->mmu_role.base.smep_andnot_wp;
+ /*
+ * KVM uses NX when TDP is disabled to handle a variety of scenarios,
+ * notably for huge SPTEs if iTLB multi-hit mitigation is enabled and
+ * to generate correct permissions for CR0.WP=0/CR4.SMEP=1/EFER.NX=0.
+ * The iTLB multi-hit workaround can be toggled at any time, so assume
+ * NX can be used by any non-nested shadow MMU to avoid having to reset
+ * MMU contexts. Note, KVM forces EFER.NX=1 when TDP is disabled.
+ */
+ bool uses_nx = is_efer_nx(context) || !tdp_enabled;
+
+ /* @amd adds a check on bit of SPTEs, which KVM shouldn't use anyways. */
+ bool is_amd = true;
+ /* KVM doesn't use 2-level page tables for the shadow MMU. */
+ bool is_pse = false;
struct rsvd_bits_validate *shadow_zero_check;
int i;
- /*
- * Passing "true" to the last argument is okay; it adds a check
- * on bit 8 of the SPTEs which KVM doesn't use anyway.
- */
+ WARN_ON_ONCE(context->shadow_root_level < PT32E_ROOT_LEVEL);
+
shadow_zero_check = &context->shadow_zero_check;
- __reset_rsvds_bits_mask(vcpu, shadow_zero_check,
- reserved_hpa_bits(),
+ __reset_rsvds_bits_mask(shadow_zero_check, reserved_hpa_bits(),
context->shadow_root_level, uses_nx,
- guest_cpuid_has(vcpu, X86_FEATURE_GBPAGES),
- is_pse(vcpu), true);
+ guest_can_use_gbpages(vcpu), is_pse, is_amd);
if (!shadow_me_mask)
return;
@@ -4193,7 +4285,6 @@ reset_shadow_zero_bits_mask(struct kvm_vcpu *vcpu, struct kvm_mmu *context)
}
}
-EXPORT_SYMBOL_GPL(reset_shadow_zero_bits_mask);
static inline bool boot_cpu_is_amd(void)
{
@@ -4215,11 +4306,10 @@ reset_tdp_shadow_zero_bits_mask(struct kvm_vcpu *vcpu,
shadow_zero_check = &context->shadow_zero_check;
if (boot_cpu_is_amd())
- __reset_rsvds_bits_mask(vcpu, shadow_zero_check,
- reserved_hpa_bits(),
+ __reset_rsvds_bits_mask(shadow_zero_check, reserved_hpa_bits(),
context->shadow_root_level, false,
boot_cpu_has(X86_FEATURE_GBPAGES),
- true, true);
+ false, true);
else
__reset_rsvds_bits_mask_ept(shadow_zero_check,
reserved_hpa_bits(), false);
@@ -4255,8 +4345,7 @@ reset_ept_shadow_zero_bits_mask(struct kvm_vcpu *vcpu,
(7 & (access) ? 128 : 0))
-static void update_permission_bitmask(struct kvm_vcpu *vcpu,
- struct kvm_mmu *mmu, bool ept)
+static void update_permission_bitmask(struct kvm_mmu *mmu, bool ept)
{
unsigned byte;
@@ -4264,9 +4353,10 @@ static void update_permission_bitmask(struct kvm_vcpu *vcpu,
const u8 w = BYTE_MASK(ACC_WRITE_MASK);
const u8 u = BYTE_MASK(ACC_USER_MASK);
- bool cr4_smep = kvm_read_cr4_bits(vcpu, X86_CR4_SMEP) != 0;
- bool cr4_smap = kvm_read_cr4_bits(vcpu, X86_CR4_SMAP) != 0;
- bool cr0_wp = is_write_protection(vcpu);
+ bool cr4_smep = is_cr4_smep(mmu);
+ bool cr4_smap = is_cr4_smap(mmu);
+ bool cr0_wp = is_cr0_wp(mmu);
+ bool efer_nx = is_efer_nx(mmu);
for (byte = 0; byte < ARRAY_SIZE(mmu->permissions); ++byte) {
unsigned pfec = byte << 1;
@@ -4292,7 +4382,7 @@ static void update_permission_bitmask(struct kvm_vcpu *vcpu,
u8 kf = (pfec & PFERR_USER_MASK) ? 0 : u;
/* Not really needed: !nx will cause pte.nx to fault */
- if (!mmu->nx)
+ if (!efer_nx)
ff = 0;
/* Allow supervisor writes if !cr0.wp */
@@ -4351,24 +4441,17 @@ static void update_permission_bitmask(struct kvm_vcpu *vcpu,
* away both AD and WD. For all reads or if the last condition holds, WD
* only will be masked away.
*/
-static void update_pkru_bitmask(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu,
- bool ept)
+static void update_pkru_bitmask(struct kvm_mmu *mmu)
{
unsigned bit;
bool wp;
- if (ept) {
- mmu->pkru_mask = 0;
- return;
- }
-
- /* PKEY is enabled only if CR4.PKE and EFER.LMA are both set. */
- if (!kvm_read_cr4_bits(vcpu, X86_CR4_PKE) || !is_long_mode(vcpu)) {
+ if (!is_cr4_pke(mmu)) {
mmu->pkru_mask = 0;
return;
}
- wp = is_write_protection(vcpu);
+ wp = is_cr0_wp(mmu);
for (bit = 0; bit < ARRAY_SIZE(mmu->permissions); ++bit) {
unsigned pfec, pkey_bits;
@@ -4402,81 +4485,51 @@ static void update_pkru_bitmask(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu,
}
}
-static void update_last_nonleaf_level(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu)
+static void reset_guest_paging_metadata(struct kvm_vcpu *vcpu,
+ struct kvm_mmu *mmu)
{
- unsigned root_level = mmu->root_level;
+ if (!is_cr0_pg(mmu))
+ return;
- mmu->last_nonleaf_level = root_level;
- if (root_level == PT32_ROOT_LEVEL && is_pse(vcpu))
- mmu->last_nonleaf_level++;
+ reset_rsvds_bits_mask(vcpu, mmu);
+ update_permission_bitmask(mmu, false);
+ update_pkru_bitmask(mmu);
}
-static void paging64_init_context_common(struct kvm_vcpu *vcpu,
- struct kvm_mmu *context,
- int level)
+static void paging64_init_context(struct kvm_mmu *context)
{
- context->nx = is_nx(vcpu);
- context->root_level = level;
-
- reset_rsvds_bits_mask(vcpu, context);
- update_permission_bitmask(vcpu, context, false);
- update_pkru_bitmask(vcpu, context, false);
- update_last_nonleaf_level(vcpu, context);
-
- MMU_WARN_ON(!is_pae(vcpu));
context->page_fault = paging64_page_fault;
context->gva_to_gpa = paging64_gva_to_gpa;
context->sync_page = paging64_sync_page;
context->invlpg = paging64_invlpg;
- context->shadow_root_level = level;
context->direct_map = false;
}
-static void paging64_init_context(struct kvm_vcpu *vcpu,
- struct kvm_mmu *context)
-{
- int root_level = is_la57_mode(vcpu) ?
- PT64_ROOT_5LEVEL : PT64_ROOT_4LEVEL;
-
- paging64_init_context_common(vcpu, context, root_level);
-}
-
-static void paging32_init_context(struct kvm_vcpu *vcpu,
- struct kvm_mmu *context)
+static void paging32_init_context(struct kvm_mmu *context)
{
- context->nx = false;
- context->root_level = PT32_ROOT_LEVEL;
-
- reset_rsvds_bits_mask(vcpu, context);
- update_permission_bitmask(vcpu, context, false);
- update_pkru_bitmask(vcpu, context, false);
- update_last_nonleaf_level(vcpu, context);
-
context->page_fault = paging32_page_fault;
context->gva_to_gpa = paging32_gva_to_gpa;
context->sync_page = paging32_sync_page;
context->invlpg = paging32_invlpg;
- context->shadow_root_level = PT32E_ROOT_LEVEL;
context->direct_map = false;
}
-static void paging32E_init_context(struct kvm_vcpu *vcpu,
- struct kvm_mmu *context)
-{
- paging64_init_context_common(vcpu, context, PT32E_ROOT_LEVEL);
-}
-
-static union kvm_mmu_extended_role kvm_calc_mmu_role_ext(struct kvm_vcpu *vcpu)
+static union kvm_mmu_extended_role kvm_calc_mmu_role_ext(struct kvm_vcpu *vcpu,
+ struct kvm_mmu_role_regs *regs)
{
union kvm_mmu_extended_role ext = {0};
- ext.cr0_pg = !!is_paging(vcpu);
- ext.cr4_pae = !!is_pae(vcpu);
- ext.cr4_smep = !!kvm_read_cr4_bits(vcpu, X86_CR4_SMEP);
- ext.cr4_smap = !!kvm_read_cr4_bits(vcpu, X86_CR4_SMAP);
- ext.cr4_pse = !!is_pse(vcpu);
- ext.cr4_pke = !!kvm_read_cr4_bits(vcpu, X86_CR4_PKE);
- ext.maxphyaddr = cpuid_maxphyaddr(vcpu);
+ if (____is_cr0_pg(regs)) {
+ ext.cr0_pg = 1;
+ ext.cr4_pae = ____is_cr4_pae(regs);
+ ext.cr4_smep = ____is_cr4_smep(regs);
+ ext.cr4_smap = ____is_cr4_smap(regs);
+ ext.cr4_pse = ____is_cr4_pse(regs);
+
+ /* PKEY and LA57 are active iff long mode is active. */
+ ext.cr4_pke = ____is_efer_lma(regs) && ____is_cr4_pke(regs);
+ ext.cr4_la57 = ____is_efer_lma(regs) && ____is_cr4_la57(regs);
+ }
ext.valid = 1;
@@ -4484,20 +4537,23 @@ static union kvm_mmu_extended_role kvm_calc_mmu_role_ext(struct kvm_vcpu *vcpu)
}
static union kvm_mmu_role kvm_calc_mmu_role_common(struct kvm_vcpu *vcpu,
+ struct kvm_mmu_role_regs *regs,
bool base_only)
{
union kvm_mmu_role role = {0};
role.base.access = ACC_ALL;
- role.base.nxe = !!is_nx(vcpu);
- role.base.cr0_wp = is_write_protection(vcpu);
+ if (____is_cr0_pg(regs)) {
+ role.base.efer_nx = ____is_efer_nx(regs);
+ role.base.cr0_wp = ____is_cr0_wp(regs);
+ }
role.base.smm = is_smm(vcpu);
role.base.guest_mode = is_guest_mode(vcpu);
if (base_only)
return role;
- role.ext = kvm_calc_mmu_role_ext(vcpu);
+ role.ext = kvm_calc_mmu_role_ext(vcpu, regs);
return role;
}
@@ -4512,9 +4568,10 @@ static inline int kvm_mmu_get_tdp_level(struct kvm_vcpu *vcpu)
}
static union kvm_mmu_role
-kvm_calc_tdp_mmu_root_page_role(struct kvm_vcpu *vcpu, bool base_only)
+kvm_calc_tdp_mmu_root_page_role(struct kvm_vcpu *vcpu,
+ struct kvm_mmu_role_regs *regs, bool base_only)
{
- union kvm_mmu_role role = kvm_calc_mmu_role_common(vcpu, base_only);
+ union kvm_mmu_role role = kvm_calc_mmu_role_common(vcpu, regs, base_only);
role.base.ad_disabled = (shadow_accessed_mask == 0);
role.base.level = kvm_mmu_get_tdp_level(vcpu);
@@ -4527,8 +4584,9 @@ kvm_calc_tdp_mmu_root_page_role(struct kvm_vcpu *vcpu, bool base_only)
static void init_kvm_tdp_mmu(struct kvm_vcpu *vcpu)
{
struct kvm_mmu *context = &vcpu->arch.root_mmu;
+ struct kvm_mmu_role_regs regs = vcpu_to_role_regs(vcpu);
union kvm_mmu_role new_role =
- kvm_calc_tdp_mmu_root_page_role(vcpu, false);
+ kvm_calc_tdp_mmu_root_page_role(vcpu, &regs, false);
if (new_role.as_u64 == context->mmu_role.as_u64)
return;
@@ -4542,60 +4600,44 @@ static void init_kvm_tdp_mmu(struct kvm_vcpu *vcpu)
context->get_guest_pgd = get_cr3;
context->get_pdptr = kvm_pdptr_read;
context->inject_page_fault = kvm_inject_page_fault;
+ context->root_level = role_regs_to_root_level(&regs);
- if (!is_paging(vcpu)) {
- context->nx = false;
+ if (!is_cr0_pg(context))
context->gva_to_gpa = nonpaging_gva_to_gpa;
- context->root_level = 0;
- } else if (is_long_mode(vcpu)) {
- context->nx = is_nx(vcpu);
- context->root_level = is_la57_mode(vcpu) ?
- PT64_ROOT_5LEVEL : PT64_ROOT_4LEVEL;
- reset_rsvds_bits_mask(vcpu, context);
- context->gva_to_gpa = paging64_gva_to_gpa;
- } else if (is_pae(vcpu)) {
- context->nx = is_nx(vcpu);
- context->root_level = PT32E_ROOT_LEVEL;
- reset_rsvds_bits_mask(vcpu, context);
+ else if (is_cr4_pae(context))
context->gva_to_gpa = paging64_gva_to_gpa;
- } else {
- context->nx = false;
- context->root_level = PT32_ROOT_LEVEL;
- reset_rsvds_bits_mask(vcpu, context);
+ else
context->gva_to_gpa = paging32_gva_to_gpa;
- }
- update_permission_bitmask(vcpu, context, false);
- update_pkru_bitmask(vcpu, context, false);
- update_last_nonleaf_level(vcpu, context);
+ reset_guest_paging_metadata(vcpu, context);
reset_tdp_shadow_zero_bits_mask(vcpu, context);
}
static union kvm_mmu_role
-kvm_calc_shadow_root_page_role_common(struct kvm_vcpu *vcpu, bool base_only)
+kvm_calc_shadow_root_page_role_common(struct kvm_vcpu *vcpu,
+ struct kvm_mmu_role_regs *regs, bool base_only)
{
- union kvm_mmu_role role = kvm_calc_mmu_role_common(vcpu, base_only);
+ union kvm_mmu_role role = kvm_calc_mmu_role_common(vcpu, regs, base_only);
- role.base.smep_andnot_wp = role.ext.cr4_smep &&
- !is_write_protection(vcpu);
- role.base.smap_andnot_wp = role.ext.cr4_smap &&
- !is_write_protection(vcpu);
- role.base.gpte_is_8_bytes = !!is_pae(vcpu);
+ role.base.smep_andnot_wp = role.ext.cr4_smep && !____is_cr0_wp(regs);
+ role.base.smap_andnot_wp = role.ext.cr4_smap && !____is_cr0_wp(regs);
+ role.base.gpte_is_8_bytes = ____is_cr0_pg(regs) && ____is_cr4_pae(regs);
return role;
}
static union kvm_mmu_role
-kvm_calc_shadow_mmu_root_page_role(struct kvm_vcpu *vcpu, bool base_only)
+kvm_calc_shadow_mmu_root_page_role(struct kvm_vcpu *vcpu,
+ struct kvm_mmu_role_regs *regs, bool base_only)
{
union kvm_mmu_role role =
- kvm_calc_shadow_root_page_role_common(vcpu, base_only);
+ kvm_calc_shadow_root_page_role_common(vcpu, regs, base_only);
- role.base.direct = !is_paging(vcpu);
+ role.base.direct = !____is_cr0_pg(regs);
- if (!is_long_mode(vcpu))
+ if (!____is_efer_lma(regs))
role.base.level = PT32E_ROOT_LEVEL;
- else if (is_la57_mode(vcpu))
+ else if (____is_cr4_la57(regs))
role.base.level = PT64_ROOT_5LEVEL;
else
role.base.level = PT64_ROOT_4LEVEL;
@@ -4604,37 +4646,44 @@ kvm_calc_shadow_mmu_root_page_role(struct kvm_vcpu *vcpu, bool base_only)
}
static void shadow_mmu_init_context(struct kvm_vcpu *vcpu, struct kvm_mmu *context,
- u32 cr0, u32 cr4, u32 efer,
+ struct kvm_mmu_role_regs *regs,
union kvm_mmu_role new_role)
{
- if (!(cr0 & X86_CR0_PG))
- nonpaging_init_context(vcpu, context);
- else if (efer & EFER_LMA)
- paging64_init_context(vcpu, context);
- else if (cr4 & X86_CR4_PAE)
- paging32E_init_context(vcpu, context);
- else
- paging32_init_context(vcpu, context);
+ if (new_role.as_u64 == context->mmu_role.as_u64)
+ return;
context->mmu_role.as_u64 = new_role.as_u64;
+
+ if (!is_cr0_pg(context))
+ nonpaging_init_context(context);
+ else if (is_cr4_pae(context))
+ paging64_init_context(context);
+ else
+ paging32_init_context(context);
+ context->root_level = role_regs_to_root_level(regs);
+
+ reset_guest_paging_metadata(vcpu, context);
+ context->shadow_root_level = new_role.base.level;
+
reset_shadow_zero_bits_mask(vcpu, context);
}
-static void kvm_init_shadow_mmu(struct kvm_vcpu *vcpu, u32 cr0, u32 cr4, u32 efer)
+static void kvm_init_shadow_mmu(struct kvm_vcpu *vcpu,
+ struct kvm_mmu_role_regs *regs)
{
struct kvm_mmu *context = &vcpu->arch.root_mmu;
union kvm_mmu_role new_role =
- kvm_calc_shadow_mmu_root_page_role(vcpu, false);
+ kvm_calc_shadow_mmu_root_page_role(vcpu, regs, false);
- if (new_role.as_u64 != context->mmu_role.as_u64)
- shadow_mmu_init_context(vcpu, context, cr0, cr4, efer, new_role);
+ shadow_mmu_init_context(vcpu, context, regs, new_role);
}
static union kvm_mmu_role
-kvm_calc_shadow_npt_root_page_role(struct kvm_vcpu *vcpu)
+kvm_calc_shadow_npt_root_page_role(struct kvm_vcpu *vcpu,
+ struct kvm_mmu_role_regs *regs)
{
union kvm_mmu_role role =
- kvm_calc_shadow_root_page_role_common(vcpu, false);
+ kvm_calc_shadow_root_page_role_common(vcpu, regs, false);
role.base.direct = false;
role.base.level = kvm_mmu_get_tdp_level(vcpu);
@@ -4642,23 +4691,22 @@ kvm_calc_shadow_npt_root_page_role(struct kvm_vcpu *vcpu)
return role;
}
-void kvm_init_shadow_npt_mmu(struct kvm_vcpu *vcpu, u32 cr0, u32 cr4, u32 efer,
- gpa_t nested_cr3)
+void kvm_init_shadow_npt_mmu(struct kvm_vcpu *vcpu, unsigned long cr0,
+ unsigned long cr4, u64 efer, gpa_t nested_cr3)
{
struct kvm_mmu *context = &vcpu->arch.guest_mmu;
- union kvm_mmu_role new_role = kvm_calc_shadow_npt_root_page_role(vcpu);
+ struct kvm_mmu_role_regs regs = {
+ .cr0 = cr0,
+ .cr4 = cr4,
+ .efer = efer,
+ };
+ union kvm_mmu_role new_role;
- __kvm_mmu_new_pgd(vcpu, nested_cr3, new_role.base, false, false);
+ new_role = kvm_calc_shadow_npt_root_page_role(vcpu, &regs);
- if (new_role.as_u64 != context->mmu_role.as_u64) {
- shadow_mmu_init_context(vcpu, context, cr0, cr4, efer, new_role);
+ __kvm_mmu_new_pgd(vcpu, nested_cr3, new_role.base);
- /*
- * Override the level set by the common init helper, nested TDP
- * always uses the host's TDP configuration.
- */
- context->shadow_root_level = new_role.base.level;
- }
+ shadow_mmu_init_context(vcpu, context, &regs, new_role);
}
EXPORT_SYMBOL_GPL(kvm_init_shadow_npt_mmu);
@@ -4678,15 +4726,10 @@ kvm_calc_shadow_ept_root_page_role(struct kvm_vcpu *vcpu, bool accessed_dirty,
role.base.guest_mode = true;
role.base.access = ACC_ALL;
- /*
- * WP=1 and NOT_WP=1 is an impossible combination, use WP and the
- * SMAP variation to denote shadow EPT entries.
- */
- role.base.cr0_wp = true;
- role.base.smap_andnot_wp = true;
-
- role.ext = kvm_calc_mmu_role_ext(vcpu);
+ /* EPT, and thus nested EPT, does not consume CR0, CR4, nor EFER. */
+ role.ext.word = 0;
role.ext.execonly = execonly;
+ role.ext.valid = 1;
return role;
}
@@ -4700,14 +4743,15 @@ void kvm_init_shadow_ept_mmu(struct kvm_vcpu *vcpu, bool execonly,
kvm_calc_shadow_ept_root_page_role(vcpu, accessed_dirty,
execonly, level);
- __kvm_mmu_new_pgd(vcpu, new_eptp, new_role.base, true, true);
+ __kvm_mmu_new_pgd(vcpu, new_eptp, new_role.base);
if (new_role.as_u64 == context->mmu_role.as_u64)
return;
+ context->mmu_role.as_u64 = new_role.as_u64;
+
context->shadow_root_level = level;
- context->nx = true;
context->ept_ad = accessed_dirty;
context->page_fault = ept_page_fault;
context->gva_to_gpa = ept_gva_to_gpa;
@@ -4715,11 +4759,9 @@ void kvm_init_shadow_ept_mmu(struct kvm_vcpu *vcpu, bool execonly,
context->invlpg = ept_invlpg;
context->root_level = level;
context->direct_map = false;
- context->mmu_role.as_u64 = new_role.as_u64;
- update_permission_bitmask(vcpu, context, true);
- update_pkru_bitmask(vcpu, context, true);
- update_last_nonleaf_level(vcpu, context);
+ update_permission_bitmask(context, true);
+ update_pkru_bitmask(context);
reset_rsvds_bits_mask_ept(vcpu, context, execonly);
reset_ept_shadow_zero_bits_mask(vcpu, context, execonly);
}
@@ -4728,20 +4770,21 @@ EXPORT_SYMBOL_GPL(kvm_init_shadow_ept_mmu);
static void init_kvm_softmmu(struct kvm_vcpu *vcpu)
{
struct kvm_mmu *context = &vcpu->arch.root_mmu;
+ struct kvm_mmu_role_regs regs = vcpu_to_role_regs(vcpu);
- kvm_init_shadow_mmu(vcpu,
- kvm_read_cr0_bits(vcpu, X86_CR0_PG),
- kvm_read_cr4_bits(vcpu, X86_CR4_PAE),
- vcpu->arch.efer);
+ kvm_init_shadow_mmu(vcpu, &regs);
context->get_guest_pgd = get_cr3;
context->get_pdptr = kvm_pdptr_read;
context->inject_page_fault = kvm_inject_page_fault;
}
-static union kvm_mmu_role kvm_calc_nested_mmu_role(struct kvm_vcpu *vcpu)
+static union kvm_mmu_role
+kvm_calc_nested_mmu_role(struct kvm_vcpu *vcpu, struct kvm_mmu_role_regs *regs)
{
- union kvm_mmu_role role = kvm_calc_shadow_root_page_role_common(vcpu, false);
+ union kvm_mmu_role role;
+
+ role = kvm_calc_shadow_root_page_role_common(vcpu, regs, false);
/*
* Nested MMUs are used only for walking L2's gva->gpa, they never have
@@ -4749,23 +4792,14 @@ static union kvm_mmu_role kvm_calc_nested_mmu_role(struct kvm_vcpu *vcpu)
* to "true" to try to detect bogus usage of the nested MMU.
*/
role.base.direct = true;
-
- if (!is_paging(vcpu))
- role.base.level = 0;
- else if (is_long_mode(vcpu))
- role.base.level = is_la57_mode(vcpu) ? PT64_ROOT_5LEVEL :
- PT64_ROOT_4LEVEL;
- else if (is_pae(vcpu))
- role.base.level = PT32E_ROOT_LEVEL;
- else
- role.base.level = PT32_ROOT_LEVEL;
-
+ role.base.level = role_regs_to_root_level(regs);
return role;
}
static void init_kvm_nested_mmu(struct kvm_vcpu *vcpu)
{
- union kvm_mmu_role new_role = kvm_calc_nested_mmu_role(vcpu);
+ struct kvm_mmu_role_regs regs = vcpu_to_role_regs(vcpu);
+ union kvm_mmu_role new_role = kvm_calc_nested_mmu_role(vcpu, &regs);
struct kvm_mmu *g_context = &vcpu->arch.nested_mmu;
if (new_role.as_u64 == g_context->mmu_role.as_u64)
@@ -4775,6 +4809,7 @@ static void init_kvm_nested_mmu(struct kvm_vcpu *vcpu)
g_context->get_guest_pgd = get_cr3;
g_context->get_pdptr = kvm_pdptr_read;
g_context->inject_page_fault = kvm_inject_page_fault;
+ g_context->root_level = new_role.base.level;
/*
* L2 page tables are never shadowed, so there is no need to sync
@@ -4790,44 +4825,20 @@ static void init_kvm_nested_mmu(struct kvm_vcpu *vcpu)
* nested page tables as the second level of translation. Basically
* the gva_to_gpa functions between mmu and nested_mmu are swapped.
*/
- if (!is_paging(vcpu)) {
- g_context->nx = false;
- g_context->root_level = 0;
+ if (!is_paging(vcpu))
g_context->gva_to_gpa = nonpaging_gva_to_gpa_nested;
- } else if (is_long_mode(vcpu)) {
- g_context->nx = is_nx(vcpu);
- g_context->root_level = is_la57_mode(vcpu) ?
- PT64_ROOT_5LEVEL : PT64_ROOT_4LEVEL;
- reset_rsvds_bits_mask(vcpu, g_context);
+ else if (is_long_mode(vcpu))
g_context->gva_to_gpa = paging64_gva_to_gpa_nested;
- } else if (is_pae(vcpu)) {
- g_context->nx = is_nx(vcpu);
- g_context->root_level = PT32E_ROOT_LEVEL;
- reset_rsvds_bits_mask(vcpu, g_context);
+ else if (is_pae(vcpu))
g_context->gva_to_gpa = paging64_gva_to_gpa_nested;
- } else {
- g_context->nx = false;
- g_context->root_level = PT32_ROOT_LEVEL;
- reset_rsvds_bits_mask(vcpu, g_context);
+ else
g_context->gva_to_gpa = paging32_gva_to_gpa_nested;
- }
- update_permission_bitmask(vcpu, g_context, false);
- update_pkru_bitmask(vcpu, g_context, false);
- update_last_nonleaf_level(vcpu, g_context);
+ reset_guest_paging_metadata(vcpu, g_context);
}
-void kvm_init_mmu(struct kvm_vcpu *vcpu, bool reset_roots)
+void kvm_init_mmu(struct kvm_vcpu *vcpu)
{
- if (reset_roots) {
- uint i;
-
- vcpu->arch.mmu->root_hpa = INVALID_PAGE;
-
- for (i = 0; i < KVM_MMU_NUM_PREV_ROOTS; i++)
- vcpu->arch.mmu->prev_roots[i] = KVM_MMU_ROOT_INFO_INVALID;
- }
-
if (mmu_is_nested(vcpu))
init_kvm_nested_mmu(vcpu);
else if (tdp_enabled)
@@ -4840,20 +4851,53 @@ EXPORT_SYMBOL_GPL(kvm_init_mmu);
static union kvm_mmu_page_role
kvm_mmu_calc_root_page_role(struct kvm_vcpu *vcpu)
{
+ struct kvm_mmu_role_regs regs = vcpu_to_role_regs(vcpu);
union kvm_mmu_role role;
if (tdp_enabled)
- role = kvm_calc_tdp_mmu_root_page_role(vcpu, true);
+ role = kvm_calc_tdp_mmu_root_page_role(vcpu, &regs, true);
else
- role = kvm_calc_shadow_mmu_root_page_role(vcpu, true);
+ role = kvm_calc_shadow_mmu_root_page_role(vcpu, &regs, true);
return role.base;
}
+void kvm_mmu_after_set_cpuid(struct kvm_vcpu *vcpu)
+{
+ /*
+ * Invalidate all MMU roles to force them to reinitialize as CPUID
+ * information is factored into reserved bit calculations.
+ */
+ vcpu->arch.root_mmu.mmu_role.ext.valid = 0;
+ vcpu->arch.guest_mmu.mmu_role.ext.valid = 0;
+ vcpu->arch.nested_mmu.mmu_role.ext.valid = 0;
+ kvm_mmu_reset_context(vcpu);
+
+ /*
+ * KVM does not correctly handle changing guest CPUID after KVM_RUN, as
+ * MAXPHYADDR, GBPAGES support, AMD reserved bit behavior, etc.. aren't
+ * tracked in kvm_mmu_page_role. As a result, KVM may miss guest page
+ * faults due to reusing SPs/SPTEs. Alert userspace, but otherwise
+ * sweep the problem under the rug.
+ *
+ * KVM's horrific CPUID ABI makes the problem all but impossible to
+ * solve, as correctly handling multiple vCPU models (with respect to
+ * paging and physical address properties) in a single VM would require
+ * tracking all relevant CPUID information in kvm_mmu_page_role. That
+ * is very undesirable as it would double the memory requirements for
+ * gfn_track (see struct kvm_mmu_page_role comments), and in practice
+ * no sane VMM mucks with the core vCPU model on the fly.
+ */
+ if (vcpu->arch.last_vmentry_cpu != -1) {
+ pr_warn_ratelimited("KVM: KVM_SET_CPUID{,2} after KVM_RUN may cause guest instability\n");
+ pr_warn_ratelimited("KVM: KVM_SET_CPUID{,2} will fail after KVM_RUN starting with Linux 5.16\n");
+ }
+}
+
void kvm_mmu_reset_context(struct kvm_vcpu *vcpu)
{
kvm_mmu_unload(vcpu);
- kvm_init_mmu(vcpu, true);
+ kvm_init_mmu(vcpu);
}
EXPORT_SYMBOL_GPL(kvm_mmu_reset_context);
@@ -5491,7 +5535,13 @@ void kvm_mmu_init_vm(struct kvm *kvm)
{
struct kvm_page_track_notifier_node *node = &kvm->arch.mmu_sp_tracker;
- kvm_mmu_init_tdp_mmu(kvm);
+ if (!kvm_mmu_init_tdp_mmu(kvm))
+ /*
+ * No smp_load/store wrappers needed here as we are in
+ * VM init and there cannot be any memslots / other threads
+ * accessing this struct kvm yet.
+ */
+ kvm->arch.memslots_have_rmaps = true;
node->track_write = kvm_mmu_pte_write;
node->track_flush_slot = kvm_mmu_invalidate_zap_pages_in_memslot;
@@ -5514,29 +5564,29 @@ void kvm_zap_gfn_range(struct kvm *kvm, gfn_t gfn_start, gfn_t gfn_end)
int i;
bool flush = false;
- write_lock(&kvm->mmu_lock);
- for (i = 0; i < KVM_ADDRESS_SPACE_NUM; i++) {
- slots = __kvm_memslots(kvm, i);
- kvm_for_each_memslot(memslot, slots) {
- gfn_t start, end;
-
- start = max(gfn_start, memslot->base_gfn);
- end = min(gfn_end, memslot->base_gfn + memslot->npages);
- if (start >= end)
- continue;
+ if (kvm_memslots_have_rmaps(kvm)) {
+ write_lock(&kvm->mmu_lock);
+ for (i = 0; i < KVM_ADDRESS_SPACE_NUM; i++) {
+ slots = __kvm_memslots(kvm, i);
+ kvm_for_each_memslot(memslot, slots) {
+ gfn_t start, end;
+
+ start = max(gfn_start, memslot->base_gfn);
+ end = min(gfn_end, memslot->base_gfn + memslot->npages);
+ if (start >= end)
+ continue;
- flush = slot_handle_level_range(kvm, memslot, kvm_zap_rmapp,
- PG_LEVEL_4K,
- KVM_MAX_HUGEPAGE_LEVEL,
- start, end - 1, true, flush);
+ flush = slot_handle_level_range(kvm, memslot,
+ kvm_zap_rmapp, PG_LEVEL_4K,
+ KVM_MAX_HUGEPAGE_LEVEL, start,
+ end - 1, true, flush);
+ }
}
+ if (flush)
+ kvm_flush_remote_tlbs_with_address(kvm, gfn_start, gfn_end);
+ write_unlock(&kvm->mmu_lock);
}
- if (flush)
- kvm_flush_remote_tlbs_with_address(kvm, gfn_start, gfn_end);
-
- write_unlock(&kvm->mmu_lock);
-
if (is_tdp_mmu_enabled(kvm)) {
flush = false;
@@ -5563,12 +5613,15 @@ void kvm_mmu_slot_remove_write_access(struct kvm *kvm,
struct kvm_memory_slot *memslot,
int start_level)
{
- bool flush;
+ bool flush = false;
- write_lock(&kvm->mmu_lock);
- flush = slot_handle_level(kvm, memslot, slot_rmap_write_protect,
- start_level, KVM_MAX_HUGEPAGE_LEVEL, false);
- write_unlock(&kvm->mmu_lock);
+ if (kvm_memslots_have_rmaps(kvm)) {
+ write_lock(&kvm->mmu_lock);
+ flush = slot_handle_level(kvm, memslot, slot_rmap_write_protect,
+ start_level, KVM_MAX_HUGEPAGE_LEVEL,
+ false);
+ write_unlock(&kvm->mmu_lock);
+ }
if (is_tdp_mmu_enabled(kvm)) {
read_lock(&kvm->mmu_lock);
@@ -5636,18 +5689,17 @@ void kvm_mmu_zap_collapsible_sptes(struct kvm *kvm,
{
/* FIXME: const-ify all uses of struct kvm_memory_slot. */
struct kvm_memory_slot *slot = (struct kvm_memory_slot *)memslot;
- bool flush;
-
- write_lock(&kvm->mmu_lock);
- flush = slot_handle_leaf(kvm, slot, kvm_mmu_zap_collapsible_spte, true);
+ bool flush = false;
- if (flush)
- kvm_arch_flush_remote_tlbs_memslot(kvm, slot);
- write_unlock(&kvm->mmu_lock);
+ if (kvm_memslots_have_rmaps(kvm)) {
+ write_lock(&kvm->mmu_lock);
+ flush = slot_handle_leaf(kvm, slot, kvm_mmu_zap_collapsible_spte, true);
+ if (flush)
+ kvm_arch_flush_remote_tlbs_memslot(kvm, slot);
+ write_unlock(&kvm->mmu_lock);
+ }
if (is_tdp_mmu_enabled(kvm)) {
- flush = false;
-
read_lock(&kvm->mmu_lock);
flush = kvm_tdp_mmu_zap_collapsible_sptes(kvm, slot, flush);
if (flush)
@@ -5674,11 +5726,14 @@ void kvm_arch_flush_remote_tlbs_memslot(struct kvm *kvm,
void kvm_mmu_slot_leaf_clear_dirty(struct kvm *kvm,
struct kvm_memory_slot *memslot)
{
- bool flush;
+ bool flush = false;
- write_lock(&kvm->mmu_lock);
- flush = slot_handle_leaf(kvm, memslot, __rmap_clear_dirty, false);
- write_unlock(&kvm->mmu_lock);
+ if (kvm_memslots_have_rmaps(kvm)) {
+ write_lock(&kvm->mmu_lock);
+ flush = slot_handle_leaf(kvm, memslot, __rmap_clear_dirty,
+ false);
+ write_unlock(&kvm->mmu_lock);
+ }
if (is_tdp_mmu_enabled(kvm)) {
read_lock(&kvm->mmu_lock);
@@ -5981,6 +6036,7 @@ static int set_nx_huge_pages_recovery_ratio(const char *val, const struct kernel
static void kvm_recover_nx_lpages(struct kvm *kvm)
{
+ unsigned long nx_lpage_splits = kvm->stat.nx_lpage_splits;
int rcu_idx;
struct kvm_mmu_page *sp;
unsigned int ratio;
@@ -5992,7 +6048,7 @@ static void kvm_recover_nx_lpages(struct kvm *kvm)
write_lock(&kvm->mmu_lock);
ratio = READ_ONCE(nx_huge_pages_recovery_ratio);
- to_zap = ratio ? DIV_ROUND_UP(kvm->stat.nx_lpage_splits, ratio) : 0;
+ to_zap = ratio ? DIV_ROUND_UP(nx_lpage_splits, ratio) : 0;
for ( ; to_zap; --to_zap) {
if (list_empty(&kvm->arch.lpage_disallowed_mmu_pages))
break;
diff --git a/arch/x86/kvm/mmu/mmu_internal.h b/arch/x86/kvm/mmu/mmu_internal.h
index d64ccb417c60..35567293c1fd 100644
--- a/arch/x86/kvm/mmu/mmu_internal.h
+++ b/arch/x86/kvm/mmu/mmu_internal.h
@@ -116,14 +116,19 @@ static inline bool kvm_vcpu_ad_need_write_protect(struct kvm_vcpu *vcpu)
kvm_x86_ops.cpu_dirty_log_size;
}
-bool is_nx_huge_page_enabled(void);
-bool mmu_need_write_protect(struct kvm_vcpu *vcpu, gfn_t gfn,
- bool can_unsync);
+extern int nx_huge_pages;
+static inline bool is_nx_huge_page_enabled(void)
+{
+ return READ_ONCE(nx_huge_pages);
+}
+
+int mmu_try_to_unsync_pages(struct kvm_vcpu *vcpu, gfn_t gfn, bool can_unsync);
void kvm_mmu_gfn_disallow_lpage(struct kvm_memory_slot *slot, gfn_t gfn);
void kvm_mmu_gfn_allow_lpage(struct kvm_memory_slot *slot, gfn_t gfn);
bool kvm_mmu_slot_gfn_write_protect(struct kvm *kvm,
- struct kvm_memory_slot *slot, u64 gfn);
+ struct kvm_memory_slot *slot, u64 gfn,
+ int min_level);
void kvm_flush_remote_tlbs_with_address(struct kvm *kvm,
u64 start_gfn, u64 pages);
@@ -158,8 +163,6 @@ int kvm_mmu_hugepage_adjust(struct kvm_vcpu *vcpu, gfn_t gfn,
void disallowed_hugepage_adjust(u64 spte, gfn_t gfn, int cur_level,
kvm_pfn_t *pfnp, int *goal_levelp);
-bool is_nx_huge_page_enabled(void);
-
void *mmu_memory_cache_alloc(struct kvm_mmu_memory_cache *mc);
void account_huge_nx_page(struct kvm *kvm, struct kvm_mmu_page *sp);
diff --git a/arch/x86/kvm/mmu/mmutrace.h b/arch/x86/kvm/mmu/mmutrace.h
index e798489b56b5..efbad33a0645 100644
--- a/arch/x86/kvm/mmu/mmutrace.h
+++ b/arch/x86/kvm/mmu/mmutrace.h
@@ -40,7 +40,7 @@
role.direct ? " direct" : "", \
access_str[role.access], \
role.invalid ? " invalid" : "", \
- role.nxe ? "" : "!", \
+ role.efer_nx ? "" : "!", \
role.ad_disabled ? "!" : "", \
__entry->root_count, \
__entry->unsync ? "unsync" : "sync", 0); \
diff --git a/arch/x86/kvm/mmu/page_track.c b/arch/x86/kvm/mmu/page_track.c
index 34bb0ec69bd8..91a9f7e0fd91 100644
--- a/arch/x86/kvm/mmu/page_track.c
+++ b/arch/x86/kvm/mmu/page_track.c
@@ -100,7 +100,7 @@ void kvm_slot_page_track_add_page(struct kvm *kvm,
kvm_mmu_gfn_disallow_lpage(slot, gfn);
if (mode == KVM_PAGE_TRACK_WRITE)
- if (kvm_mmu_slot_gfn_write_protect(kvm, slot, gfn))
+ if (kvm_mmu_slot_gfn_write_protect(kvm, slot, gfn, PG_LEVEL_4K))
kvm_flush_remote_tlbs(kvm);
}
EXPORT_SYMBOL_GPL(kvm_slot_page_track_add_page);
diff --git a/arch/x86/kvm/mmu/paging_tmpl.h b/arch/x86/kvm/mmu/paging_tmpl.h
index 823a5919f9fa..490a028ddabe 100644
--- a/arch/x86/kvm/mmu/paging_tmpl.h
+++ b/arch/x86/kvm/mmu/paging_tmpl.h
@@ -305,6 +305,35 @@ static inline unsigned FNAME(gpte_pkeys)(struct kvm_vcpu *vcpu, u64 gpte)
return pkeys;
}
+static inline bool FNAME(is_last_gpte)(struct kvm_mmu *mmu,
+ unsigned int level, unsigned int gpte)
+{
+ /*
+ * For EPT and PAE paging (both variants), bit 7 is either reserved at
+ * all level or indicates a huge page (ignoring CR3/EPTP). In either
+ * case, bit 7 being set terminates the walk.
+ */
+#if PTTYPE == 32
+ /*
+ * 32-bit paging requires special handling because bit 7 is ignored if
+ * CR4.PSE=0, not reserved. Clear bit 7 in the gpte if the level is
+ * greater than the last level for which bit 7 is the PAGE_SIZE bit.
+ *
+ * The RHS has bit 7 set iff level < (2 + PSE). If it is clear, bit 7
+ * is not reserved and does not indicate a large page at this level,
+ * so clear PT_PAGE_SIZE_MASK in gpte if that is the case.
+ */
+ gpte &= level - (PT32_ROOT_LEVEL + mmu->mmu_role.ext.cr4_pse);
+#endif
+ /*
+ * PG_LEVEL_4K always terminates. The RHS has bit 7 set
+ * iff level <= PG_LEVEL_4K, which for our purpose means
+ * level == PG_LEVEL_4K; set PT_PAGE_SIZE_MASK in gpte then.
+ */
+ gpte |= level - PG_LEVEL_4K - 1;
+
+ return gpte & PT_PAGE_SIZE_MASK;
+}
/*
* Fetch a guest pte for a guest virtual address, or for an L2's GPA.
*/
@@ -421,7 +450,7 @@ retry_walk:
/* Convert to ACC_*_MASK flags for struct guest_walker. */
walker->pt_access[walker->level - 1] = FNAME(gpte_access)(pt_access ^ walk_nx_mask);
- } while (!is_last_gpte(mmu, walker->level, pte));
+ } while (!FNAME(is_last_gpte)(mmu, walker->level, pte));
pte_pkey = FNAME(gpte_pkeys)(vcpu, pte);
accessed_dirty = have_ad ? pte_access & PT_GUEST_ACCESSED_MASK : 0;
@@ -471,8 +500,7 @@ retry_walk:
error:
errcode |= write_fault | user_fault;
- if (fetch_fault && (mmu->nx ||
- kvm_read_cr4_bits(vcpu, X86_CR4_SMEP)))
+ if (fetch_fault && (is_efer_nx(mmu) || is_cr4_smep(mmu)))
errcode |= PFERR_FETCH_MASK;
walker->fault.vector = PF_VECTOR;
@@ -767,7 +795,7 @@ FNAME(is_self_change_mapping)(struct kvm_vcpu *vcpu,
bool self_changed = false;
if (!(walker->pte_access & ACC_WRITE_MASK ||
- (!is_write_protection(vcpu) && !user_fault)))
+ (!is_cr0_wp(vcpu->arch.mmu) && !user_fault)))
return false;
for (level = walker->level; level <= walker->max_level; level++) {
@@ -865,8 +893,7 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gpa_t addr, u32 error_code,
* we will cache the incorrect access into mmio spte.
*/
if (write_fault && !(walker.pte_access & ACC_WRITE_MASK) &&
- !is_write_protection(vcpu) && !user_fault &&
- !is_noslot_pfn(pfn)) {
+ !is_cr0_wp(vcpu->arch.mmu) && !user_fault && !is_noslot_pfn(pfn)) {
walker.pte_access |= ACC_WRITE_MASK;
walker.pte_access &= ~ACC_USER_MASK;
@@ -876,7 +903,7 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gpa_t addr, u32 error_code,
* then we should prevent the kernel from executing it
* if SMEP is enabled.
*/
- if (kvm_read_cr4_bits(vcpu, X86_CR4_SMEP))
+ if (is_cr4_smep(vcpu->arch.mmu))
walker.pte_access &= ~ACC_EXEC_MASK;
}
@@ -1031,13 +1058,36 @@ static gpa_t FNAME(gva_to_gpa_nested)(struct kvm_vcpu *vcpu, gpa_t vaddr,
*/
static int FNAME(sync_page)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp)
{
+ union kvm_mmu_page_role mmu_role = vcpu->arch.mmu->mmu_role.base;
int i, nr_present = 0;
bool host_writable;
gpa_t first_pte_gpa;
int set_spte_ret = 0;
- /* direct kvm_mmu_page can not be unsync. */
- BUG_ON(sp->role.direct);
+ /*
+ * Ignore various flags when verifying that it's safe to sync a shadow
+ * page using the current MMU context.
+ *
+ * - level: not part of the overall MMU role and will never match as the MMU's
+ * level tracks the root level
+ * - access: updated based on the new guest PTE
+ * - quadrant: not part of the overall MMU role (similar to level)
+ */
+ const union kvm_mmu_page_role sync_role_ign = {
+ .level = 0xf,
+ .access = 0x7,
+ .quadrant = 0x3,
+ };
+
+ /*
+ * Direct pages can never be unsync, and KVM should never attempt to
+ * sync a shadow page for a different MMU context, e.g. if the role
+ * differs then the memslot lookup (SMM vs. non-SMM) will be bogus, the
+ * reserved bits checks will be wrong, etc...
+ */
+ if (WARN_ON_ONCE(sp->role.direct ||
+ (sp->role.word ^ mmu_role.word) & ~sync_role_ign.word))
+ return 0;
first_pte_gpa = FNAME(get_level1_sp_gpa)(sp);
diff --git a/arch/x86/kvm/mmu/spte.c b/arch/x86/kvm/mmu/spte.c
index 66d43cec0c31..3e97cdb13eb7 100644
--- a/arch/x86/kvm/mmu/spte.c
+++ b/arch/x86/kvm/mmu/spte.c
@@ -103,13 +103,6 @@ int make_spte(struct kvm_vcpu *vcpu, unsigned int pte_access, int level,
spte |= SPTE_TDP_AD_WRPROT_ONLY_MASK;
/*
- * Bits 62:52 of PAE SPTEs are reserved. WARN if said bits are set
- * if PAE paging may be employed (shadow paging or any 32-bit KVM).
- */
- WARN_ON_ONCE((!tdp_enabled || !IS_ENABLED(CONFIG_X86_64)) &&
- (spte & SPTE_TDP_AD_MASK));
-
- /*
* For the EPT case, shadow_present_mask is 0 if hardware
* supports exec-only page table entries. In that case,
* ACC_USER_MASK and shadow_user_mask are used to represent
@@ -154,13 +147,19 @@ int make_spte(struct kvm_vcpu *vcpu, unsigned int pte_access, int level,
/*
* Optimization: for pte sync, if spte was writable the hash
* lookup is unnecessary (and expensive). Write protection
- * is responsibility of mmu_get_page / kvm_sync_page.
+ * is responsibility of kvm_mmu_get_page / kvm_mmu_sync_roots.
* Same reasoning can be applied to dirty page accounting.
*/
if (!can_unsync && is_writable_pte(old_spte))
goto out;
- if (mmu_need_write_protect(vcpu, gfn, can_unsync)) {
+ /*
+ * Unsync shadow pages that are reachable by the new, writable
+ * SPTE. Write-protect the SPTE if the page can't be unsync'd,
+ * e.g. it's write-tracked (upper-level SPs) or has one or more
+ * shadow pages and unsync'ing pages is not allowed.
+ */
+ if (mmu_try_to_unsync_pages(vcpu, gfn, can_unsync)) {
pgprintk("%s: found shadow page for %llx, marking ro\n",
__func__, gfn);
ret |= SET_SPTE_WRITE_PROTECTED_PT;
@@ -176,7 +175,10 @@ int make_spte(struct kvm_vcpu *vcpu, unsigned int pte_access, int level,
spte = mark_spte_for_access_track(spte);
out:
- WARN_ON(is_mmio_spte(spte));
+ WARN_ONCE(is_rsvd_spte(&vcpu->arch.mmu->shadow_zero_check, spte, level),
+ "spte = 0x%llx, level = %d, rsvd bits = 0x%llx", spte, level,
+ get_rsvd_bits(&vcpu->arch.mmu->shadow_zero_check, spte, level));
+
*new_spte = spte;
return ret;
}
diff --git a/arch/x86/kvm/mmu/spte.h b/arch/x86/kvm/mmu/spte.h
index bca0ba11cccf..7a5ce9314107 100644
--- a/arch/x86/kvm/mmu/spte.h
+++ b/arch/x86/kvm/mmu/spte.h
@@ -293,6 +293,38 @@ static inline bool is_dirty_spte(u64 spte)
return dirty_mask ? spte & dirty_mask : spte & PT_WRITABLE_MASK;
}
+static inline u64 get_rsvd_bits(struct rsvd_bits_validate *rsvd_check, u64 pte,
+ int level)
+{
+ int bit7 = (pte >> 7) & 1;
+
+ return rsvd_check->rsvd_bits_mask[bit7][level-1];
+}
+
+static inline bool __is_rsvd_bits_set(struct rsvd_bits_validate *rsvd_check,
+ u64 pte, int level)
+{
+ return pte & get_rsvd_bits(rsvd_check, pte, level);
+}
+
+static inline bool __is_bad_mt_xwr(struct rsvd_bits_validate *rsvd_check,
+ u64 pte)
+{
+ return rsvd_check->bad_mt_xwr & BIT_ULL(pte & 0x3f);
+}
+
+static __always_inline bool is_rsvd_spte(struct rsvd_bits_validate *rsvd_check,
+ u64 spte, int level)
+{
+ /*
+ * Use a bitwise-OR instead of a logical-OR to aggregate the reserved
+ * bits and EPT's invalid memtype/XWR checks to avoid an extra Jcc
+ * (this is extremely unlikely to be short-circuited as true).
+ */
+ return __is_bad_mt_xwr(rsvd_check, spte) |
+ __is_rsvd_bits_set(rsvd_check, spte, level);
+}
+
static inline bool spte_can_locklessly_be_made_writable(u64 spte)
{
return (spte & shadow_host_writable_mask) &&
diff --git a/arch/x86/kvm/mmu/tdp_mmu.c b/arch/x86/kvm/mmu/tdp_mmu.c
index d90eb364d73a..0853370bd811 100644
--- a/arch/x86/kvm/mmu/tdp_mmu.c
+++ b/arch/x86/kvm/mmu/tdp_mmu.c
@@ -14,10 +14,10 @@ static bool __read_mostly tdp_mmu_enabled = false;
module_param_named(tdp_mmu, tdp_mmu_enabled, bool, 0644);
/* Initializes the TDP MMU for the VM, if enabled. */
-void kvm_mmu_init_tdp_mmu(struct kvm *kvm)
+bool kvm_mmu_init_tdp_mmu(struct kvm *kvm)
{
if (!tdp_enabled || !READ_ONCE(tdp_mmu_enabled))
- return;
+ return false;
/* This should not be changed for the lifetime of the VM. */
kvm->arch.tdp_mmu_enabled = true;
@@ -25,6 +25,8 @@ void kvm_mmu_init_tdp_mmu(struct kvm *kvm)
INIT_LIST_HEAD(&kvm->arch.tdp_mmu_roots);
spin_lock_init(&kvm->arch.tdp_mmu_pages_lock);
INIT_LIST_HEAD(&kvm->arch.tdp_mmu_pages);
+
+ return true;
}
static __always_inline void kvm_lockdep_assert_mmu_lock_held(struct kvm *kvm,
@@ -335,7 +337,7 @@ static void handle_removed_tdp_mmu_page(struct kvm *kvm, tdp_ptep_t pt,
for (i = 0; i < PT64_ENT_PER_PAGE; i++) {
sptep = rcu_dereference(pt) + i;
- gfn = base_gfn + (i * KVM_PAGES_PER_HPAGE(level - 1));
+ gfn = base_gfn + i * KVM_PAGES_PER_HPAGE(level);
if (shared) {
/*
@@ -377,12 +379,12 @@ static void handle_removed_tdp_mmu_page(struct kvm *kvm, tdp_ptep_t pt,
WRITE_ONCE(*sptep, REMOVED_SPTE);
}
handle_changed_spte(kvm, kvm_mmu_page_as_id(sp), gfn,
- old_child_spte, REMOVED_SPTE, level - 1,
+ old_child_spte, REMOVED_SPTE, level,
shared);
}
kvm_flush_remote_tlbs_with_address(kvm, gfn,
- KVM_PAGES_PER_HPAGE(level));
+ KVM_PAGES_PER_HPAGE(level + 1));
call_rcu(&sp->rcu_head, tdp_mmu_free_sp_rcu_callback);
}
@@ -912,7 +914,7 @@ static int tdp_mmu_map_handle_target_level(struct kvm_vcpu *vcpu, int write,
kvm_pfn_t pfn, bool prefault)
{
u64 new_spte;
- int ret = 0;
+ int ret = RET_PF_FIXED;
int make_spte_ret = 0;
if (unlikely(is_noslot_pfn(pfn)))
@@ -949,7 +951,11 @@ static int tdp_mmu_map_handle_target_level(struct kvm_vcpu *vcpu, int write,
rcu_dereference(iter->sptep));
}
- if (!prefault)
+ /*
+ * Increase pf_fixed in both RET_PF_EMULATE and RET_PF_FIXED to be
+ * consistent with legacy MMU behavior.
+ */
+ if (ret != RET_PF_SPURIOUS)
vcpu->stat.pf_fixed++;
return ret;
@@ -977,11 +983,6 @@ int kvm_tdp_mmu_map(struct kvm_vcpu *vcpu, gpa_t gpa, u32 error_code,
int level;
int req_level;
- if (WARN_ON(!VALID_PAGE(vcpu->arch.mmu->root_hpa)))
- return RET_PF_RETRY;
- if (WARN_ON(!is_tdp_mmu_root(vcpu->kvm, vcpu->arch.mmu->root_hpa)))
- return RET_PF_RETRY;
-
level = kvm_mmu_hugepage_adjust(vcpu, gfn, max_level, &pfn,
huge_page_disallowed, &req_level);
@@ -1024,7 +1025,7 @@ int kvm_tdp_mmu_map(struct kvm_vcpu *vcpu, gpa_t gpa, u32 error_code,
if (is_removed_spte(iter.old_spte))
break;
- sp = alloc_tdp_mmu_page(vcpu, iter.gfn, iter.level);
+ sp = alloc_tdp_mmu_page(vcpu, iter.gfn, iter.level - 1);
child_pt = sp->spt;
new_spte = make_nonleaf_spte(child_pt,
@@ -1462,15 +1463,22 @@ bool kvm_tdp_mmu_zap_collapsible_sptes(struct kvm *kvm,
* Returns true if an SPTE was set and a TLB flush is needed.
*/
static bool write_protect_gfn(struct kvm *kvm, struct kvm_mmu_page *root,
- gfn_t gfn)
+ gfn_t gfn, int min_level)
{
struct tdp_iter iter;
u64 new_spte;
bool spte_set = false;
+ BUG_ON(min_level > KVM_MAX_HUGEPAGE_LEVEL);
+
rcu_read_lock();
- tdp_root_for_each_leaf_pte(iter, root, gfn, gfn + 1) {
+ for_each_tdp_pte_min_level(iter, root->spt, root->role.level,
+ min_level, gfn, gfn + 1) {
+ if (!is_shadow_present_pte(iter.old_spte) ||
+ !is_last_spte(iter.old_spte, iter.level))
+ continue;
+
if (!is_writable_pte(iter.old_spte))
break;
@@ -1492,14 +1500,15 @@ static bool write_protect_gfn(struct kvm *kvm, struct kvm_mmu_page *root,
* Returns true if an SPTE was set and a TLB flush is needed.
*/
bool kvm_tdp_mmu_write_protect_gfn(struct kvm *kvm,
- struct kvm_memory_slot *slot, gfn_t gfn)
+ struct kvm_memory_slot *slot, gfn_t gfn,
+ int min_level)
{
struct kvm_mmu_page *root;
bool spte_set = false;
lockdep_assert_held_write(&kvm->mmu_lock);
for_each_tdp_mmu_root(kvm, root, slot->as_id)
- spte_set |= write_protect_gfn(kvm, root, gfn);
+ spte_set |= write_protect_gfn(kvm, root, gfn, min_level);
return spte_set;
}
diff --git a/arch/x86/kvm/mmu/tdp_mmu.h b/arch/x86/kvm/mmu/tdp_mmu.h
index 5fdf63090451..1cae4485b3bc 100644
--- a/arch/x86/kvm/mmu/tdp_mmu.h
+++ b/arch/x86/kvm/mmu/tdp_mmu.h
@@ -31,7 +31,7 @@ static inline bool kvm_tdp_mmu_zap_gfn_range(struct kvm *kvm, int as_id,
}
static inline bool kvm_tdp_mmu_zap_sp(struct kvm *kvm, struct kvm_mmu_page *sp)
{
- gfn_t end = sp->gfn + KVM_PAGES_PER_HPAGE(sp->role.level);
+ gfn_t end = sp->gfn + KVM_PAGES_PER_HPAGE(sp->role.level + 1);
/*
* Don't allow yielding, as the caller may have a flush pending. Note,
@@ -74,37 +74,40 @@ bool kvm_tdp_mmu_zap_collapsible_sptes(struct kvm *kvm,
bool flush);
bool kvm_tdp_mmu_write_protect_gfn(struct kvm *kvm,
- struct kvm_memory_slot *slot, gfn_t gfn);
+ struct kvm_memory_slot *slot, gfn_t gfn,
+ int min_level);
int kvm_tdp_mmu_get_walk(struct kvm_vcpu *vcpu, u64 addr, u64 *sptes,
int *root_level);
#ifdef CONFIG_X86_64
-void kvm_mmu_init_tdp_mmu(struct kvm *kvm);
+bool kvm_mmu_init_tdp_mmu(struct kvm *kvm);
void kvm_mmu_uninit_tdp_mmu(struct kvm *kvm);
static inline bool is_tdp_mmu_enabled(struct kvm *kvm) { return kvm->arch.tdp_mmu_enabled; }
static inline bool is_tdp_mmu_page(struct kvm_mmu_page *sp) { return sp->tdp_mmu_page; }
-#else
-static inline void kvm_mmu_init_tdp_mmu(struct kvm *kvm) {}
-static inline void kvm_mmu_uninit_tdp_mmu(struct kvm *kvm) {}
-static inline bool is_tdp_mmu_enabled(struct kvm *kvm) { return false; }
-static inline bool is_tdp_mmu_page(struct kvm_mmu_page *sp) { return false; }
-#endif
-static inline bool is_tdp_mmu_root(struct kvm *kvm, hpa_t hpa)
+static inline bool is_tdp_mmu(struct kvm_mmu *mmu)
{
struct kvm_mmu_page *sp;
+ hpa_t hpa = mmu->root_hpa;
- if (!is_tdp_mmu_enabled(kvm))
- return false;
if (WARN_ON(!VALID_PAGE(hpa)))
return false;
+ /*
+ * A NULL shadow page is legal when shadowing a non-paging guest with
+ * PAE paging, as the MMU will be direct with root_hpa pointing at the
+ * pae_root page, not a shadow page.
+ */
sp = to_shadow_page(hpa);
- if (WARN_ON(!sp))
- return false;
-
- return is_tdp_mmu_page(sp) && sp->root_count;
+ return sp && is_tdp_mmu_page(sp) && sp->root_count;
}
+#else
+static inline bool kvm_mmu_init_tdp_mmu(struct kvm *kvm) { return false; }
+static inline void kvm_mmu_uninit_tdp_mmu(struct kvm *kvm) {}
+static inline bool is_tdp_mmu_enabled(struct kvm *kvm) { return false; }
+static inline bool is_tdp_mmu_page(struct kvm_mmu_page *sp) { return false; }
+static inline bool is_tdp_mmu(struct kvm_mmu *mmu) { return false; }
+#endif
#endif /* __KVM_X86_MMU_TDP_MMU_H */
diff --git a/arch/x86/kvm/svm/avic.c b/arch/x86/kvm/svm/avic.c
index 5e7e920113f3..1d01da64c333 100644
--- a/arch/x86/kvm/svm/avic.c
+++ b/arch/x86/kvm/svm/avic.c
@@ -27,10 +27,6 @@
#include "irq.h"
#include "svm.h"
-/* enable / disable AVIC */
-bool avic;
-module_param(avic, bool, S_IRUGO);
-
#define SVM_AVIC_DOORBELL 0xc001011b
#define AVIC_HPA_MASK ~((0xFFFULL << 52) | 0xFFF)
@@ -124,7 +120,7 @@ void avic_vm_destroy(struct kvm *kvm)
unsigned long flags;
struct kvm_svm *kvm_svm = to_kvm_svm(kvm);
- if (!avic)
+ if (!enable_apicv)
return;
if (kvm_svm->avic_logical_id_table_page)
@@ -147,7 +143,7 @@ int avic_vm_init(struct kvm *kvm)
struct page *l_page;
u32 vm_id;
- if (!avic)
+ if (!enable_apicv)
return 0;
/* Allocating physical APIC ID table (4KB) */
@@ -240,7 +236,7 @@ static int avic_update_access_page(struct kvm *kvm, bool activate)
* APICv mode change, which update APIC_ACCESS_PAGE_PRIVATE_MEMSLOT
* memory region. So, we need to ensure that kvm->mm == current->mm.
*/
- if ((kvm->arch.apic_access_page_done == activate) ||
+ if ((kvm->arch.apic_access_memslot_enabled == activate) ||
(kvm->mm != current->mm))
goto out;
@@ -253,7 +249,7 @@ static int avic_update_access_page(struct kvm *kvm, bool activate)
goto out;
}
- kvm->arch.apic_access_page_done = activate;
+ kvm->arch.apic_access_memslot_enabled = activate;
out:
mutex_unlock(&kvm->slots_lock);
return r;
@@ -569,7 +565,7 @@ int avic_init_vcpu(struct vcpu_svm *svm)
int ret;
struct kvm_vcpu *vcpu = &svm->vcpu;
- if (!avic || !irqchip_in_kernel(vcpu->kvm))
+ if (!enable_apicv || !irqchip_in_kernel(vcpu->kvm))
return 0;
ret = avic_init_backing_page(vcpu);
@@ -593,7 +589,7 @@ void avic_post_state_restore(struct kvm_vcpu *vcpu)
void svm_toggle_avic_for_irq_window(struct kvm_vcpu *vcpu, bool activate)
{
- if (!avic || !lapic_in_kernel(vcpu))
+ if (!enable_apicv || !lapic_in_kernel(vcpu))
return;
srcu_read_unlock(&vcpu->kvm->srcu, vcpu->srcu_idx);
@@ -653,7 +649,7 @@ void svm_refresh_apicv_exec_ctrl(struct kvm_vcpu *vcpu)
struct vmcb *vmcb = svm->vmcb;
bool activated = kvm_vcpu_apicv_active(vcpu);
- if (!avic)
+ if (!enable_apicv)
return;
if (activated) {
diff --git a/arch/x86/kvm/svm/nested.c b/arch/x86/kvm/svm/nested.c
index 5e8d8443154e..21d03e3a5dfd 100644
--- a/arch/x86/kvm/svm/nested.c
+++ b/arch/x86/kvm/svm/nested.c
@@ -98,13 +98,18 @@ static void nested_svm_init_mmu_context(struct kvm_vcpu *vcpu)
WARN_ON(mmu_is_nested(vcpu));
vcpu->arch.mmu = &vcpu->arch.guest_mmu;
+
+ /*
+ * The NPT format depends on L1's CR4 and EFER, which is in vmcb01. Note,
+ * when called via KVM_SET_NESTED_STATE, that state may _not_ match current
+ * vCPU state. CR0.WP is explicitly ignored, while CR0.PG is required.
+ */
kvm_init_shadow_npt_mmu(vcpu, X86_CR0_PG, svm->vmcb01.ptr->save.cr4,
svm->vmcb01.ptr->save.efer,
svm->nested.ctl.nested_cr3);
vcpu->arch.mmu->get_guest_pgd = nested_svm_get_tdp_cr3;
vcpu->arch.mmu->get_pdptr = nested_svm_get_tdp_pdptr;
vcpu->arch.mmu->inject_page_fault = nested_svm_inject_npf_exit;
- reset_shadow_zero_bits_mask(vcpu, vcpu->arch.mmu);
vcpu->arch.walk_mmu = &vcpu->arch.nested_mmu;
}
@@ -380,33 +385,47 @@ static inline bool nested_npt_enabled(struct vcpu_svm *svm)
return svm->nested.ctl.nested_ctl & SVM_NESTED_CTL_NP_ENABLE;
}
+static void nested_svm_transition_tlb_flush(struct kvm_vcpu *vcpu)
+{
+ /*
+ * TODO: optimize unconditional TLB flush/MMU sync. A partial list of
+ * things to fix before this can be conditional:
+ *
+ * - Flush TLBs for both L1 and L2 remote TLB flush
+ * - Honor L1's request to flush an ASID on nested VMRUN
+ * - Sync nested NPT MMU on VMRUN that flushes L2's ASID[*]
+ * - Don't crush a pending TLB flush in vmcb02 on nested VMRUN
+ * - Flush L1's ASID on KVM_REQ_TLB_FLUSH_GUEST
+ *
+ * [*] Unlike nested EPT, SVM's ASID management can invalidate nested
+ * NPT guest-physical mappings on VMRUN.
+ */
+ kvm_make_request(KVM_REQ_MMU_SYNC, vcpu);
+ kvm_make_request(KVM_REQ_TLB_FLUSH_CURRENT, vcpu);
+}
+
/*
* Load guest's/host's cr3 on nested vmentry or vmexit. @nested_npt is true
* if we are emulating VM-Entry into a guest with NPT enabled.
*/
static int nested_svm_load_cr3(struct kvm_vcpu *vcpu, unsigned long cr3,
- bool nested_npt)
+ bool nested_npt, bool reload_pdptrs)
{
if (CC(kvm_vcpu_is_illegal_gpa(vcpu, cr3)))
return -EINVAL;
- if (!nested_npt && is_pae_paging(vcpu) &&
- (cr3 != kvm_read_cr3(vcpu) || pdptrs_changed(vcpu))) {
- if (CC(!load_pdptrs(vcpu, vcpu->arch.walk_mmu, cr3)))
- return -EINVAL;
- }
+ if (reload_pdptrs && !nested_npt && is_pae_paging(vcpu) &&
+ CC(!load_pdptrs(vcpu, vcpu->arch.walk_mmu, cr3)))
+ return -EINVAL;
- /*
- * TODO: optimize unconditional TLB flush/MMU sync here and in
- * kvm_init_shadow_npt_mmu().
- */
if (!nested_npt)
- kvm_mmu_new_pgd(vcpu, cr3, false, false);
+ kvm_mmu_new_pgd(vcpu, cr3);
vcpu->arch.cr3 = cr3;
kvm_register_mark_available(vcpu, VCPU_EXREG_CR3);
- kvm_init_mmu(vcpu, false);
+ /* Re-initialize the MMU, e.g. to pick up CR4 MMU role changes. */
+ kvm_init_mmu(vcpu);
return 0;
}
@@ -481,6 +500,7 @@ static void nested_vmcb02_prepare_save(struct vcpu_svm *svm, struct vmcb *vmcb12
static void nested_vmcb02_prepare_control(struct vcpu_svm *svm)
{
const u32 mask = V_INTR_MASKING_MASK | V_GIF_ENABLE_MASK | V_GIF_MASK;
+ struct kvm_vcpu *vcpu = &svm->vcpu;
/*
* Filled at exit: exit_code, exit_code_hi, exit_info_1, exit_info_2,
@@ -505,10 +525,10 @@ static void nested_vmcb02_prepare_control(struct vcpu_svm *svm)
/* nested_cr3. */
if (nested_npt_enabled(svm))
- nested_svm_init_mmu_context(&svm->vcpu);
+ nested_svm_init_mmu_context(vcpu);
- svm->vmcb->control.tsc_offset = svm->vcpu.arch.tsc_offset =
- svm->vcpu.arch.l1_tsc_offset + svm->nested.ctl.tsc_offset;
+ svm->vmcb->control.tsc_offset = vcpu->arch.tsc_offset =
+ vcpu->arch.l1_tsc_offset + svm->nested.ctl.tsc_offset;
svm->vmcb->control.int_ctl =
(svm->nested.ctl.int_ctl & ~mask) |
@@ -523,8 +543,10 @@ static void nested_vmcb02_prepare_control(struct vcpu_svm *svm)
svm->vmcb->control.pause_filter_count = svm->nested.ctl.pause_filter_count;
svm->vmcb->control.pause_filter_thresh = svm->nested.ctl.pause_filter_thresh;
+ nested_svm_transition_tlb_flush(vcpu);
+
/* Enter Guest-Mode */
- enter_guest_mode(&svm->vcpu);
+ enter_guest_mode(vcpu);
/*
* Merge guest and host intercepts - must be called with vcpu in
@@ -576,7 +598,7 @@ int enter_svm_guest_mode(struct kvm_vcpu *vcpu, u64 vmcb12_gpa,
nested_vmcb02_prepare_save(svm, vmcb12);
ret = nested_svm_load_cr3(&svm->vcpu, vmcb12->save.cr3,
- nested_npt_enabled(svm));
+ nested_npt_enabled(svm), true);
if (ret)
return ret;
@@ -596,8 +618,6 @@ int nested_svm_vmrun(struct kvm_vcpu *vcpu)
struct kvm_host_map map;
u64 vmcb12_gpa;
- ++vcpu->stat.nested_run;
-
if (is_smm(vcpu)) {
kvm_queue_exception(vcpu, UD_VECTOR);
return 1;
@@ -803,9 +823,11 @@ int nested_svm_vmexit(struct vcpu_svm *svm)
kvm_vcpu_unmap(vcpu, &map, true);
+ nested_svm_transition_tlb_flush(vcpu);
+
nested_svm_uninit_mmu_context(vcpu);
- rc = nested_svm_load_cr3(vcpu, svm->vmcb->save.cr3, false);
+ rc = nested_svm_load_cr3(vcpu, svm->vmcb->save.cr3, false, true);
if (rc)
return 1;
@@ -1228,8 +1250,8 @@ static int svm_set_nested_state(struct kvm_vcpu *vcpu,
&user_kvm_nested_state->data.svm[0];
struct vmcb_control_area *ctl;
struct vmcb_save_area *save;
+ unsigned long cr0;
int ret;
- u32 cr0;
BUILD_BUG_ON(sizeof(struct vmcb_control_area) + sizeof(struct vmcb_save_area) >
KVM_STATE_NESTED_SVM_VMCB_SIZE);
@@ -1302,6 +1324,19 @@ static int svm_set_nested_state(struct kvm_vcpu *vcpu,
goto out_free;
/*
+ * While the nested guest CR3 is already checked and set by
+ * KVM_SET_SREGS, it was set when nested state was yet loaded,
+ * thus MMU might not be initialized correctly.
+ * Set it again to fix this.
+ */
+
+ ret = nested_svm_load_cr3(&svm->vcpu, vcpu->arch.cr3,
+ nested_npt_enabled(svm), false);
+ if (WARN_ON_ONCE(ret))
+ goto out_free;
+
+
+ /*
* All checks done, we can enter guest mode. Userspace provides
* vmcb12.control, which will be combined with L1 and stored into
* vmcb02, and the L1 save state which we store in vmcb01.
@@ -1358,9 +1393,15 @@ static bool svm_get_nested_state_pages(struct kvm_vcpu *vcpu)
if (WARN_ON(!is_guest_mode(vcpu)))
return true;
- if (nested_svm_load_cr3(&svm->vcpu, vcpu->arch.cr3,
- nested_npt_enabled(svm)))
- return false;
+ if (!vcpu->arch.pdptrs_from_userspace &&
+ !nested_npt_enabled(svm) && is_pae_paging(vcpu))
+ /*
+ * Reload the guest's PDPTRs since after a migration
+ * the guest CR3 might be restored prior to setting the nested
+ * state which can lead to a load of wrong PDPTRs.
+ */
+ if (CC(!load_pdptrs(vcpu, vcpu->arch.walk_mmu, vcpu->arch.cr3)))
+ return false;
if (!nested_svm_vmrun_msrpm(svm)) {
vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR;
diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c
index e088086f3de6..8834822c00cd 100644
--- a/arch/x86/kvm/svm/svm.c
+++ b/arch/x86/kvm/svm/svm.c
@@ -43,6 +43,9 @@
#include "svm.h"
#include "svm_ops.h"
+#include "kvm_onhyperv.h"
+#include "svm_onhyperv.h"
+
#define __ex(x) __kvm_handle_fault_on_reboot(x)
MODULE_AUTHOR("Qumranet");
@@ -185,6 +188,13 @@ module_param(vls, int, 0444);
static int vgif = true;
module_param(vgif, int, 0444);
+/*
+ * enable / disable AVIC. Because the defaults differ for APICv
+ * support between VMX and SVM we cannot use module_param_named.
+ */
+static bool avic;
+module_param(avic, bool, 0444);
+
bool __read_mostly dump_invalid_vmcb;
module_param(dump_invalid_vmcb, bool, 0644);
@@ -673,6 +683,9 @@ static void set_msr_interception_bitmap(struct kvm_vcpu *vcpu, u32 *msrpm,
write ? clear_bit(bit_write, &tmp) : set_bit(bit_write, &tmp);
msrpm[offset] = tmp;
+
+ svm_hv_vmcb_dirty_nested_enlightenments(vcpu);
+
}
void set_msr_interception(struct kvm_vcpu *vcpu, u32 *msrpm, u32 msr,
@@ -939,6 +952,16 @@ static __init int svm_hardware_setup(void)
int r;
unsigned int order = get_order(IOPM_SIZE);
+ /*
+ * NX is required for shadow paging and for NPT if the NX huge pages
+ * mitigation is enabled.
+ */
+ if (!boot_cpu_has(X86_FEATURE_NX)) {
+ pr_err_ratelimited("NX (Execute Disable) not supported\n");
+ return -EOPNOTSUPP;
+ }
+ kvm_enable_efer_bits(EFER_NX);
+
iopm_pages = alloc_pages(GFP_KERNEL, order);
if (!iopm_pages)
@@ -952,9 +975,6 @@ static __init int svm_hardware_setup(void)
supported_xcr0 &= ~(XFEATURE_MASK_BNDREGS | XFEATURE_MASK_BNDCSR);
- if (boot_cpu_has(X86_FEATURE_NX))
- kvm_enable_efer_bits(EFER_NX);
-
if (boot_cpu_has(X86_FEATURE_FXSR_OPT))
kvm_enable_efer_bits(EFER_FFXSR);
@@ -996,6 +1016,8 @@ static __init int svm_hardware_setup(void)
/* Note, SEV setup consumes npt_enabled. */
sev_hardware_setup();
+ svm_hv_hardware_setup();
+
svm_adjust_mmio_mask();
for_each_possible_cpu(cpu) {
@@ -1009,14 +1031,12 @@ static __init int svm_hardware_setup(void)
nrips = false;
}
- if (avic) {
- if (!npt_enabled || !boot_cpu_has(X86_FEATURE_AVIC)) {
- avic = false;
- } else {
- pr_info("AVIC enabled\n");
+ enable_apicv = avic = avic && npt_enabled && boot_cpu_has(X86_FEATURE_AVIC);
- amd_iommu_register_ga_log_notifier(&avic_ga_log_notifier);
- }
+ if (enable_apicv) {
+ pr_info("AVIC enabled\n");
+
+ amd_iommu_register_ga_log_notifier(&avic_ga_log_notifier);
}
if (vls) {
@@ -1080,26 +1100,30 @@ static void init_sys_seg(struct vmcb_seg *seg, uint32_t type)
seg->base = 0;
}
-static u64 svm_write_l1_tsc_offset(struct kvm_vcpu *vcpu, u64 offset)
+static u64 svm_get_l2_tsc_offset(struct kvm_vcpu *vcpu)
{
struct vcpu_svm *svm = to_svm(vcpu);
- u64 g_tsc_offset = 0;
- if (is_guest_mode(vcpu)) {
- /* Write L1's TSC offset. */
- g_tsc_offset = svm->vmcb->control.tsc_offset -
- svm->vmcb01.ptr->control.tsc_offset;
- svm->vmcb01.ptr->control.tsc_offset = offset;
- }
+ return svm->nested.ctl.tsc_offset;
+}
- trace_kvm_write_tsc_offset(vcpu->vcpu_id,
- svm->vmcb->control.tsc_offset - g_tsc_offset,
- offset);
+static u64 svm_get_l2_tsc_multiplier(struct kvm_vcpu *vcpu)
+{
+ return kvm_default_tsc_scaling_ratio;
+}
- svm->vmcb->control.tsc_offset = offset + g_tsc_offset;
+static void svm_write_tsc_offset(struct kvm_vcpu *vcpu, u64 offset)
+{
+ struct vcpu_svm *svm = to_svm(vcpu);
+ svm->vmcb01.ptr->control.tsc_offset = vcpu->arch.l1_tsc_offset;
+ svm->vmcb->control.tsc_offset = offset;
vmcb_mark_dirty(svm->vmcb, VMCB_INTERCEPTS);
- return svm->vmcb->control.tsc_offset;
+}
+
+static void svm_write_tsc_multiplier(struct kvm_vcpu *vcpu, u64 multiplier)
+{
+ wrmsrl(MSR_AMD64_TSC_RATIO, multiplier);
}
/* Evaluate instruction intercepts that depend on guest CPUID features. */
@@ -1287,6 +1311,8 @@ static void init_vmcb(struct kvm_vcpu *vcpu)
}
}
+ svm_hv_init_vmcb(svm->vmcb);
+
vmcb_mark_all_dirty(svm->vmcb);
enable_gif(svm);
@@ -3106,6 +3132,8 @@ static void dump_vmcb(struct kvm_vcpu *vcpu)
return;
}
+ pr_err("VMCB %p, last attempted VMRUN on CPU %d\n",
+ svm->current_vmcb->ptr, vcpu->arch.last_vmentry_cpu);
pr_err("VMCB Control Area:\n");
pr_err("%-20s%04x\n", "cr_read:", control->intercepts[INTERCEPT_CR] & 0xffff);
pr_err("%-20s%04x\n", "cr_write:", control->intercepts[INTERCEPT_CR] >> 16);
@@ -3762,6 +3790,8 @@ static __no_kcsan fastpath_t svm_vcpu_run(struct kvm_vcpu *vcpu)
}
svm->vmcb->save.cr2 = vcpu->arch.cr2;
+ svm_hv_update_vp_id(svm->vmcb, vcpu);
+
/*
* Run with all-zero DR6 unless needed, so that we can get the exact cause
* of a #DB.
@@ -3835,6 +3865,12 @@ static __no_kcsan fastpath_t svm_vcpu_run(struct kvm_vcpu *vcpu)
svm->next_rip = 0;
if (is_guest_mode(vcpu)) {
nested_sync_control_from_vmcb02(svm);
+
+ /* Track VMRUNs that have made past consistency checking */
+ if (svm->nested.nested_run_pending &&
+ svm->vmcb->control.exit_code != SVM_EXIT_ERR)
+ ++vcpu->stat.nested_run;
+
svm->nested.nested_run_pending = 0;
}
@@ -3846,10 +3882,8 @@ static __no_kcsan fastpath_t svm_vcpu_run(struct kvm_vcpu *vcpu)
vcpu->arch.apf.host_apf_flags =
kvm_read_and_reset_apf_flags();
- if (npt_enabled) {
- vcpu->arch.regs_avail &= ~(1 << VCPU_EXREG_PDPTR);
- vcpu->arch.regs_dirty &= ~(1 << VCPU_EXREG_PDPTR);
- }
+ if (npt_enabled)
+ kvm_register_clear_available(vcpu, VCPU_EXREG_PDPTR);
/*
* We need to handle MC intercepts here before the vcpu has a chance to
@@ -3877,6 +3911,8 @@ static void svm_load_mmu_pgd(struct kvm_vcpu *vcpu, hpa_t root_hpa,
svm->vmcb->control.nested_cr3 = __sme_set(root_hpa);
vmcb_mark_dirty(svm->vmcb, VMCB_NPT);
+ hv_track_root_tdp(vcpu, root_hpa);
+
/* Loading L2's CR3 is handled by enter_svm_guest_mode. */
if (!test_bit(VCPU_EXREG_CR3, (ulong *)&vcpu->arch.regs_avail))
return;
@@ -4249,7 +4285,7 @@ static int svm_smi_allowed(struct kvm_vcpu *vcpu, bool for_injection)
return !svm_smi_blocked(vcpu);
}
-static int svm_pre_enter_smm(struct kvm_vcpu *vcpu, char *smstate)
+static int svm_enter_smm(struct kvm_vcpu *vcpu, char *smstate)
{
struct vcpu_svm *svm = to_svm(vcpu);
int ret;
@@ -4271,7 +4307,7 @@ static int svm_pre_enter_smm(struct kvm_vcpu *vcpu, char *smstate)
return 0;
}
-static int svm_pre_leave_smm(struct kvm_vcpu *vcpu, const char *smstate)
+static int svm_leave_smm(struct kvm_vcpu *vcpu, const char *smstate)
{
struct vcpu_svm *svm = to_svm(vcpu);
struct kvm_host_map map;
@@ -4427,13 +4463,12 @@ static int svm_vm_init(struct kvm *kvm)
if (!pause_filter_count || !pause_filter_thresh)
kvm->arch.pause_in_guest = true;
- if (avic) {
+ if (enable_apicv) {
int ret = avic_vm_init(kvm);
if (ret)
return ret;
}
- kvm_apicv_init(kvm, avic);
return 0;
}
@@ -4524,7 +4559,10 @@ static struct kvm_x86_ops svm_x86_ops __initdata = {
.has_wbinvd_exit = svm_has_wbinvd_exit,
- .write_l1_tsc_offset = svm_write_l1_tsc_offset,
+ .get_l2_tsc_offset = svm_get_l2_tsc_offset,
+ .get_l2_tsc_multiplier = svm_get_l2_tsc_multiplier,
+ .write_tsc_offset = svm_write_tsc_offset,
+ .write_tsc_multiplier = svm_write_tsc_multiplier,
.load_mmu_pgd = svm_load_mmu_pgd,
@@ -4544,8 +4582,8 @@ static struct kvm_x86_ops svm_x86_ops __initdata = {
.setup_mce = svm_setup_mce,
.smi_allowed = svm_smi_allowed,
- .pre_enter_smm = svm_pre_enter_smm,
- .pre_leave_smm = svm_pre_leave_smm,
+ .enter_smm = svm_enter_smm,
+ .leave_smm = svm_leave_smm,
.enable_smi_window = svm_enable_smi_window,
.mem_enc_op = svm_mem_enc_op,
diff --git a/arch/x86/kvm/svm/svm.h b/arch/x86/kvm/svm/svm.h
index 2908c6ab5bb4..f89b623bb591 100644
--- a/arch/x86/kvm/svm/svm.h
+++ b/arch/x86/kvm/svm/svm.h
@@ -32,6 +32,11 @@
extern u32 msrpm_offsets[MSRPM_OFFSETS] __read_mostly;
extern bool npt_enabled;
+/*
+ * Clean bits in VMCB.
+ * VMCB_ALL_CLEAN_MASK might also need to
+ * be updated if this enum is modified.
+ */
enum {
VMCB_INTERCEPTS, /* Intercept vectors, TSC offset,
pause filter count */
@@ -49,9 +54,17 @@ enum {
* AVIC PHYSICAL_TABLE pointer,
* AVIC LOGICAL_TABLE pointer
*/
- VMCB_DIRTY_MAX,
+ VMCB_SW = 31, /* Reserved for hypervisor/software use */
};
+#define VMCB_ALL_CLEAN_MASK ( \
+ (1U << VMCB_INTERCEPTS) | (1U << VMCB_PERM_MAP) | \
+ (1U << VMCB_ASID) | (1U << VMCB_INTR) | \
+ (1U << VMCB_NPT) | (1U << VMCB_CR) | (1U << VMCB_DR) | \
+ (1U << VMCB_DT) | (1U << VMCB_SEG) | (1U << VMCB_CR2) | \
+ (1U << VMCB_LBR) | (1U << VMCB_AVIC) | \
+ (1U << VMCB_SW))
+
/* TPR and CR2 are always written before VMRUN */
#define VMCB_ALWAYS_DIRTY_MASK ((1U << VMCB_INTR) | (1U << VMCB_CR2))
@@ -238,10 +251,15 @@ static inline void vmcb_mark_all_dirty(struct vmcb *vmcb)
static inline void vmcb_mark_all_clean(struct vmcb *vmcb)
{
- vmcb->control.clean = ((1 << VMCB_DIRTY_MAX) - 1)
+ vmcb->control.clean = VMCB_ALL_CLEAN_MASK
& ~VMCB_ALWAYS_DIRTY_MASK;
}
+static inline bool vmcb_is_clean(struct vmcb *vmcb, int bit)
+{
+ return (vmcb->control.clean & (1 << bit));
+}
+
static inline void vmcb_mark_dirty(struct vmcb *vmcb, int bit)
{
vmcb->control.clean &= ~(1 << bit);
@@ -480,8 +498,6 @@ extern struct kvm_x86_nested_ops svm_nested_ops;
#define VMCB_AVIC_APIC_BAR_MASK 0xFFFFFFFFFF000ULL
-extern bool avic;
-
static inline void avic_update_vapic_bar(struct vcpu_svm *svm, u64 data)
{
svm->vmcb->control.avic_vapic_bar = data & VMCB_AVIC_APIC_BAR_MASK;
diff --git a/arch/x86/kvm/svm/svm_onhyperv.c b/arch/x86/kvm/svm/svm_onhyperv.c
new file mode 100644
index 000000000000..98aa981c04ec
--- /dev/null
+++ b/arch/x86/kvm/svm/svm_onhyperv.c
@@ -0,0 +1,41 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * KVM L1 hypervisor optimizations on Hyper-V for SVM.
+ */
+
+#include <linux/kvm_host.h>
+#include "kvm_cache_regs.h"
+
+#include <asm/mshyperv.h>
+
+#include "svm.h"
+#include "svm_ops.h"
+
+#include "hyperv.h"
+#include "kvm_onhyperv.h"
+#include "svm_onhyperv.h"
+
+int svm_hv_enable_direct_tlbflush(struct kvm_vcpu *vcpu)
+{
+ struct hv_enlightenments *hve;
+ struct hv_partition_assist_pg **p_hv_pa_pg =
+ &to_kvm_hv(vcpu->kvm)->hv_pa_pg;
+
+ if (!*p_hv_pa_pg)
+ *p_hv_pa_pg = kzalloc(PAGE_SIZE, GFP_KERNEL);
+
+ if (!*p_hv_pa_pg)
+ return -ENOMEM;
+
+ hve = (struct hv_enlightenments *)to_svm(vcpu)->vmcb->control.reserved_sw;
+
+ hve->partition_assist_page = __pa(*p_hv_pa_pg);
+ hve->hv_vm_id = (unsigned long)vcpu->kvm;
+ if (!hve->hv_enlightenments_control.nested_flush_hypercall) {
+ hve->hv_enlightenments_control.nested_flush_hypercall = 1;
+ vmcb_mark_dirty(to_svm(vcpu)->vmcb, VMCB_HV_NESTED_ENLIGHTENMENTS);
+ }
+
+ return 0;
+}
+
diff --git a/arch/x86/kvm/svm/svm_onhyperv.h b/arch/x86/kvm/svm/svm_onhyperv.h
new file mode 100644
index 000000000000..9b9a55abc29f
--- /dev/null
+++ b/arch/x86/kvm/svm/svm_onhyperv.h
@@ -0,0 +1,130 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+/*
+ * KVM L1 hypervisor optimizations on Hyper-V for SVM.
+ */
+
+#ifndef __ARCH_X86_KVM_SVM_ONHYPERV_H__
+#define __ARCH_X86_KVM_SVM_ONHYPERV_H__
+
+#if IS_ENABLED(CONFIG_HYPERV)
+#include <asm/mshyperv.h>
+
+#include "hyperv.h"
+#include "kvm_onhyperv.h"
+
+static struct kvm_x86_ops svm_x86_ops;
+
+/*
+ * Hyper-V uses the software reserved 32 bytes in VMCB
+ * control area to expose SVM enlightenments to guests.
+ */
+struct hv_enlightenments {
+ struct __packed hv_enlightenments_control {
+ u32 nested_flush_hypercall:1;
+ u32 msr_bitmap:1;
+ u32 enlightened_npt_tlb: 1;
+ u32 reserved:29;
+ } __packed hv_enlightenments_control;
+ u32 hv_vp_id;
+ u64 hv_vm_id;
+ u64 partition_assist_page;
+ u64 reserved;
+} __packed;
+
+/*
+ * Hyper-V uses the software reserved clean bit in VMCB
+ */
+#define VMCB_HV_NESTED_ENLIGHTENMENTS VMCB_SW
+
+int svm_hv_enable_direct_tlbflush(struct kvm_vcpu *vcpu);
+
+static inline void svm_hv_init_vmcb(struct vmcb *vmcb)
+{
+ struct hv_enlightenments *hve =
+ (struct hv_enlightenments *)vmcb->control.reserved_sw;
+
+ if (npt_enabled &&
+ ms_hyperv.nested_features & HV_X64_NESTED_ENLIGHTENED_TLB)
+ hve->hv_enlightenments_control.enlightened_npt_tlb = 1;
+}
+
+static inline void svm_hv_hardware_setup(void)
+{
+ if (npt_enabled &&
+ ms_hyperv.nested_features & HV_X64_NESTED_ENLIGHTENED_TLB) {
+ pr_info("kvm: Hyper-V enlightened NPT TLB flush enabled\n");
+ svm_x86_ops.tlb_remote_flush = hv_remote_flush_tlb;
+ svm_x86_ops.tlb_remote_flush_with_range =
+ hv_remote_flush_tlb_with_range;
+ }
+
+ if (ms_hyperv.nested_features & HV_X64_NESTED_DIRECT_FLUSH) {
+ int cpu;
+
+ pr_info("kvm: Hyper-V Direct TLB Flush enabled\n");
+ for_each_online_cpu(cpu) {
+ struct hv_vp_assist_page *vp_ap =
+ hv_get_vp_assist_page(cpu);
+
+ if (!vp_ap)
+ continue;
+
+ vp_ap->nested_control.features.directhypercall = 1;
+ }
+ svm_x86_ops.enable_direct_tlbflush =
+ svm_hv_enable_direct_tlbflush;
+ }
+}
+
+static inline void svm_hv_vmcb_dirty_nested_enlightenments(
+ struct kvm_vcpu *vcpu)
+{
+ struct vmcb *vmcb = to_svm(vcpu)->vmcb;
+ struct hv_enlightenments *hve =
+ (struct hv_enlightenments *)vmcb->control.reserved_sw;
+
+ /*
+ * vmcb can be NULL if called during early vcpu init.
+ * And its okay not to mark vmcb dirty during vcpu init
+ * as we mark it dirty unconditionally towards end of vcpu
+ * init phase.
+ */
+ if (vmcb && vmcb_is_clean(vmcb, VMCB_HV_NESTED_ENLIGHTENMENTS) &&
+ hve->hv_enlightenments_control.msr_bitmap)
+ vmcb_mark_dirty(vmcb, VMCB_HV_NESTED_ENLIGHTENMENTS);
+}
+
+static inline void svm_hv_update_vp_id(struct vmcb *vmcb,
+ struct kvm_vcpu *vcpu)
+{
+ struct hv_enlightenments *hve =
+ (struct hv_enlightenments *)vmcb->control.reserved_sw;
+ u32 vp_index = kvm_hv_get_vpindex(vcpu);
+
+ if (hve->hv_vp_id != vp_index) {
+ hve->hv_vp_id = vp_index;
+ vmcb_mark_dirty(vmcb, VMCB_HV_NESTED_ENLIGHTENMENTS);
+ }
+}
+#else
+
+static inline void svm_hv_init_vmcb(struct vmcb *vmcb)
+{
+}
+
+static inline void svm_hv_hardware_setup(void)
+{
+}
+
+static inline void svm_hv_vmcb_dirty_nested_enlightenments(
+ struct kvm_vcpu *vcpu)
+{
+}
+
+static inline void svm_hv_update_vp_id(struct vmcb *vmcb,
+ struct kvm_vcpu *vcpu)
+{
+}
+#endif /* CONFIG_HYPERV */
+
+#endif /* __ARCH_X86_KVM_SVM_ONHYPERV_H__ */
diff --git a/arch/x86/kvm/trace.h b/arch/x86/kvm/trace.h
index 4f839148948b..b484141ea15b 100644
--- a/arch/x86/kvm/trace.h
+++ b/arch/x86/kvm/trace.h
@@ -997,7 +997,7 @@ TRACE_EVENT(kvm_wait_lapic_expire,
__entry->delta < 0 ? "early" : "late")
);
-TRACE_EVENT(kvm_enter_smm,
+TRACE_EVENT(kvm_smm_transition,
TP_PROTO(unsigned int vcpu_id, u64 smbase, bool entering),
TP_ARGS(vcpu_id, smbase, entering),
diff --git a/arch/x86/kvm/vmx/capabilities.h b/arch/x86/kvm/vmx/capabilities.h
index aa0e7872fcc9..4705ad55abb5 100644
--- a/arch/x86/kvm/vmx/capabilities.h
+++ b/arch/x86/kvm/vmx/capabilities.h
@@ -12,7 +12,6 @@ extern bool __read_mostly enable_ept;
extern bool __read_mostly enable_unrestricted_guest;
extern bool __read_mostly enable_ept_ad_bits;
extern bool __read_mostly enable_pml;
-extern bool __read_mostly enable_apicv;
extern int __read_mostly pt_mode;
#define PT_MODE_SYSTEM 0
diff --git a/arch/x86/kvm/vmx/evmcs.c b/arch/x86/kvm/vmx/evmcs.c
index 41f24661af04..896b2a50b4aa 100644
--- a/arch/x86/kvm/vmx/evmcs.c
+++ b/arch/x86/kvm/vmx/evmcs.c
@@ -319,6 +319,9 @@ bool nested_enlightened_vmentry(struct kvm_vcpu *vcpu, u64 *evmcs_gpa)
if (unlikely(!assist_page.enlighten_vmentry))
return false;
+ if (unlikely(!evmptr_is_valid(assist_page.current_nested_vmcs)))
+ return false;
+
*evmcs_gpa = assist_page.current_nested_vmcs;
return true;
diff --git a/arch/x86/kvm/vmx/evmcs.h b/arch/x86/kvm/vmx/evmcs.h
index bd41d9462355..2ec9b46f0d0c 100644
--- a/arch/x86/kvm/vmx/evmcs.h
+++ b/arch/x86/kvm/vmx/evmcs.h
@@ -197,6 +197,14 @@ static inline void evmcs_load(u64 phys_addr) {}
static inline void evmcs_touch_msr_bitmap(void) {}
#endif /* IS_ENABLED(CONFIG_HYPERV) */
+#define EVMPTR_INVALID (-1ULL)
+#define EVMPTR_MAP_PENDING (-2ULL)
+
+static inline bool evmptr_is_valid(u64 evmptr)
+{
+ return evmptr != EVMPTR_INVALID && evmptr != EVMPTR_MAP_PENDING;
+}
+
enum nested_evmptrld_status {
EVMPTRLD_DISABLED,
EVMPTRLD_SUCCEEDED,
diff --git a/arch/x86/kvm/vmx/nested.c b/arch/x86/kvm/vmx/nested.c
index 6058a65a6ede..1a52134b0c42 100644
--- a/arch/x86/kvm/vmx/nested.c
+++ b/arch/x86/kvm/vmx/nested.c
@@ -173,9 +173,13 @@ static int nested_vmx_failValid(struct kvm_vcpu *vcpu,
| X86_EFLAGS_ZF);
get_vmcs12(vcpu)->vm_instruction_error = vm_instruction_error;
/*
- * We don't need to force a shadow sync because
- * VM_INSTRUCTION_ERROR is not shadowed
+ * We don't need to force sync to shadow VMCS because
+ * VM_INSTRUCTION_ERROR is not shadowed. Enlightened VMCS 'shadows' all
+ * fields and thus must be synced.
*/
+ if (to_vmx(vcpu)->nested.hv_evmcs_vmptr != EVMPTR_INVALID)
+ to_vmx(vcpu)->nested.need_vmcs12_to_shadow_sync = true;
+
return kvm_skip_emulated_instruction(vcpu);
}
@@ -187,7 +191,8 @@ static int nested_vmx_fail(struct kvm_vcpu *vcpu, u32 vm_instruction_error)
* failValid writes the error number to the current VMCS, which
* can't be done if there isn't a current VMCS.
*/
- if (vmx->nested.current_vmptr == -1ull && !vmx->nested.hv_evmcs)
+ if (vmx->nested.current_vmptr == -1ull &&
+ !evmptr_is_valid(vmx->nested.hv_evmcs_vmptr))
return nested_vmx_failInvalid(vcpu);
return nested_vmx_failValid(vcpu, vm_instruction_error);
@@ -221,12 +226,12 @@ static inline void nested_release_evmcs(struct kvm_vcpu *vcpu)
{
struct vcpu_vmx *vmx = to_vmx(vcpu);
- if (!vmx->nested.hv_evmcs)
- return;
+ if (evmptr_is_valid(vmx->nested.hv_evmcs_vmptr)) {
+ kvm_vcpu_unmap(vcpu, &vmx->nested.hv_evmcs_map, true);
+ vmx->nested.hv_evmcs = NULL;
+ }
- kvm_vcpu_unmap(vcpu, &vmx->nested.hv_evmcs_map, true);
- vmx->nested.hv_evmcs_vmptr = 0;
- vmx->nested.hv_evmcs = NULL;
+ vmx->nested.hv_evmcs_vmptr = EVMPTR_INVALID;
}
static void vmx_sync_vmcs_host_state(struct vcpu_vmx *vmx,
@@ -346,16 +351,21 @@ static void nested_ept_inject_page_fault(struct kvm_vcpu *vcpu,
vmcs12->guest_physical_address = fault->address;
}
+static void nested_ept_new_eptp(struct kvm_vcpu *vcpu)
+{
+ kvm_init_shadow_ept_mmu(vcpu,
+ to_vmx(vcpu)->nested.msrs.ept_caps &
+ VMX_EPT_EXECUTE_ONLY_BIT,
+ nested_ept_ad_enabled(vcpu),
+ nested_ept_get_eptp(vcpu));
+}
+
static void nested_ept_init_mmu_context(struct kvm_vcpu *vcpu)
{
WARN_ON(mmu_is_nested(vcpu));
vcpu->arch.mmu = &vcpu->arch.guest_mmu;
- kvm_init_shadow_ept_mmu(vcpu,
- to_vmx(vcpu)->nested.msrs.ept_caps &
- VMX_EPT_EXECUTE_ONLY_BIT,
- nested_ept_ad_enabled(vcpu),
- nested_ept_get_eptp(vcpu));
+ nested_ept_new_eptp(vcpu);
vcpu->arch.mmu->get_guest_pgd = nested_ept_get_eptp;
vcpu->arch.mmu->inject_page_fault = nested_ept_inject_page_fault;
vcpu->arch.mmu->get_pdptr = kvm_pdptr_read;
@@ -1058,54 +1068,13 @@ static void prepare_vmx_msr_autostore_list(struct kvm_vcpu *vcpu,
}
/*
- * Returns true if the MMU needs to be sync'd on nested VM-Enter/VM-Exit.
- * tl;dr: the MMU needs a sync if L0 is using shadow paging and L1 didn't
- * enable VPID for L2 (implying it expects a TLB flush on VMX transitions).
- * Here's why.
- *
- * If EPT is enabled by L0 a sync is never needed:
- * - if it is disabled by L1, then L0 is not shadowing L1 or L2 PTEs, there
- * cannot be unsync'd SPTEs for either L1 or L2.
- *
- * - if it is also enabled by L1, then L0 doesn't need to sync on VM-Enter
- * VM-Enter as VM-Enter isn't required to invalidate guest-physical mappings
- * (irrespective of VPID), i.e. L1 can't rely on the (virtual) CPU to flush
- * stale guest-physical mappings for L2 from the TLB. And as above, L0 isn't
- * shadowing L1 PTEs so there are no unsync'd SPTEs to sync on VM-Exit.
- *
- * If EPT is disabled by L0:
- * - if VPID is enabled by L1 (for L2), the situation is similar to when L1
- * enables EPT: L0 doesn't need to sync as VM-Enter and VM-Exit aren't
- * required to invalidate linear mappings (EPT is disabled so there are
- * no combined or guest-physical mappings), i.e. L1 can't rely on the
- * (virtual) CPU to flush stale linear mappings for either L2 or itself (L1).
- *
- * - however if VPID is disabled by L1, then a sync is needed as L1 expects all
- * linear mappings (EPT is disabled so there are no combined or guest-physical
- * mappings) to be invalidated on both VM-Enter and VM-Exit.
- *
- * Note, this logic is subtly different than nested_has_guest_tlb_tag(), which
- * additionally checks that L2 has been assigned a VPID (when EPT is disabled).
- * Whether or not L2 has been assigned a VPID by L0 is irrelevant with respect
- * to L1's expectations, e.g. L0 needs to invalidate hardware TLB entries if L2
- * doesn't have a unique VPID to prevent reusing L1's entries (assuming L1 has
- * been assigned a VPID), but L0 doesn't need to do a MMU sync because L1
- * doesn't expect stale (virtual) TLB entries to be flushed, i.e. L1 doesn't
- * know that L0 will flush the TLB and so L1 will do INVVPID as needed to flush
- * stale TLB entries, at which point L0 will sync L2's MMU.
- */
-static bool nested_vmx_transition_mmu_sync(struct kvm_vcpu *vcpu)
-{
- return !enable_ept && !nested_cpu_has_vpid(get_vmcs12(vcpu));
-}
-
-/*
* Load guest's/host's cr3 at nested entry/exit. @nested_ept is true if we are
* emulating VM-Entry into a guest with EPT enabled. On failure, the expected
* Exit Qualification (for a VM-Entry consistency check VM-Exit) is assigned to
* @entry_failure_code.
*/
-static int nested_vmx_load_cr3(struct kvm_vcpu *vcpu, unsigned long cr3, bool nested_ept,
+static int nested_vmx_load_cr3(struct kvm_vcpu *vcpu, unsigned long cr3,
+ bool nested_ept, bool reload_pdptrs,
enum vm_entry_failure_code *entry_failure_code)
{
if (CC(kvm_vcpu_is_illegal_gpa(vcpu, cr3))) {
@@ -1117,27 +1086,20 @@ static int nested_vmx_load_cr3(struct kvm_vcpu *vcpu, unsigned long cr3, bool ne
* If PAE paging and EPT are both on, CR3 is not used by the CPU and
* must not be dereferenced.
*/
- if (!nested_ept && is_pae_paging(vcpu) &&
- (cr3 != kvm_read_cr3(vcpu) || pdptrs_changed(vcpu))) {
- if (CC(!load_pdptrs(vcpu, vcpu->arch.walk_mmu, cr3))) {
- *entry_failure_code = ENTRY_FAIL_PDPTE;
- return -EINVAL;
- }
+ if (reload_pdptrs && !nested_ept && is_pae_paging(vcpu) &&
+ CC(!load_pdptrs(vcpu, vcpu->arch.walk_mmu, cr3))) {
+ *entry_failure_code = ENTRY_FAIL_PDPTE;
+ return -EINVAL;
}
- /*
- * Unconditionally skip the TLB flush on fast CR3 switch, all TLB
- * flushes are handled by nested_vmx_transition_tlb_flush(). See
- * nested_vmx_transition_mmu_sync for details on skipping the MMU sync.
- */
if (!nested_ept)
- kvm_mmu_new_pgd(vcpu, cr3, true,
- !nested_vmx_transition_mmu_sync(vcpu));
+ kvm_mmu_new_pgd(vcpu, cr3);
vcpu->arch.cr3 = cr3;
kvm_register_mark_available(vcpu, VCPU_EXREG_CR3);
- kvm_init_mmu(vcpu, false);
+ /* Re-initialize the MMU, e.g. to pick up CR4 MMU role changes. */
+ kvm_init_mmu(vcpu);
return 0;
}
@@ -1170,17 +1132,28 @@ static void nested_vmx_transition_tlb_flush(struct kvm_vcpu *vcpu,
struct vcpu_vmx *vmx = to_vmx(vcpu);
/*
- * If VPID is disabled, linear and combined mappings are flushed on
- * VM-Enter/VM-Exit, and guest-physical mappings are valid only for
- * their associated EPTP.
+ * If vmcs12 doesn't use VPID, L1 expects linear and combined mappings
+ * for *all* contexts to be flushed on VM-Enter/VM-Exit, i.e. it's a
+ * full TLB flush from the guest's perspective. This is required even
+ * if VPID is disabled in the host as KVM may need to synchronize the
+ * MMU in response to the guest TLB flush.
+ *
+ * Note, using TLB_FLUSH_GUEST is correct even if nested EPT is in use.
+ * EPT is a special snowflake, as guest-physical mappings aren't
+ * flushed on VPID invalidations, including VM-Enter or VM-Exit with
+ * VPID disabled. As a result, KVM _never_ needs to sync nEPT
+ * entries on VM-Enter because L1 can't rely on VM-Enter to flush
+ * those mappings.
*/
- if (!enable_vpid)
+ if (!nested_cpu_has_vpid(vmcs12)) {
+ kvm_make_request(KVM_REQ_TLB_FLUSH_GUEST, vcpu);
return;
+ }
+
+ /* L2 should never have a VPID if VPID is disabled. */
+ WARN_ON(!enable_vpid);
/*
- * If vmcs12 doesn't use VPID, L1 expects linear and combined mappings
- * for *all* contexts to be flushed on VM-Enter/VM-Exit.
- *
* If VPID is enabled and used by vmc12, but L2 does not have a unique
* TLB tag (ASID), i.e. EPT is disabled and KVM was unable to allocate
* a VPID for L2, flush the current context as the effective ASID is
@@ -1192,13 +1165,12 @@ static void nested_vmx_transition_tlb_flush(struct kvm_vcpu *vcpu,
*
* If a TLB flush isn't required due to any of the above, and vpid12 is
* changing then the new "virtual" VPID (vpid12) will reuse the same
- * "real" VPID (vpid02), and so needs to be sync'd. There is no direct
+ * "real" VPID (vpid02), and so needs to be flushed. There's no direct
* mapping between vpid02 and vpid12, vpid02 is per-vCPU and reused for
- * all nested vCPUs.
+ * all nested vCPUs. Remember, a flush on VM-Enter does not invalidate
+ * guest-physical mappings, so there is no need to sync the nEPT MMU.
*/
- if (!nested_cpu_has_vpid(vmcs12)) {
- kvm_make_request(KVM_REQ_TLB_FLUSH, vcpu);
- } else if (!nested_has_guest_tlb_tag(vcpu)) {
+ if (!nested_has_guest_tlb_tag(vcpu)) {
kvm_make_request(KVM_REQ_TLB_FLUSH_CURRENT, vcpu);
} else if (is_vmenter &&
vmcs12->virtual_processor_id != vmx->nested.last_vpid) {
@@ -1586,7 +1558,7 @@ static void copy_vmcs12_to_shadow(struct vcpu_vmx *vmx)
vmcs_load(vmx->loaded_vmcs->vmcs);
}
-static int copy_enlightened_to_vmcs12(struct vcpu_vmx *vmx)
+static void copy_enlightened_to_vmcs12(struct vcpu_vmx *vmx, u32 hv_clean_fields)
{
struct vmcs12 *vmcs12 = vmx->nested.cached_vmcs12;
struct hv_enlightened_vmcs *evmcs = vmx->nested.hv_evmcs;
@@ -1595,7 +1567,7 @@ static int copy_enlightened_to_vmcs12(struct vcpu_vmx *vmx)
vmcs12->tpr_threshold = evmcs->tpr_threshold;
vmcs12->guest_rip = evmcs->guest_rip;
- if (unlikely(!(evmcs->hv_clean_fields &
+ if (unlikely(!(hv_clean_fields &
HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_BASIC))) {
vmcs12->guest_rsp = evmcs->guest_rsp;
vmcs12->guest_rflags = evmcs->guest_rflags;
@@ -1603,23 +1575,23 @@ static int copy_enlightened_to_vmcs12(struct vcpu_vmx *vmx)
evmcs->guest_interruptibility_info;
}
- if (unlikely(!(evmcs->hv_clean_fields &
+ if (unlikely(!(hv_clean_fields &
HV_VMX_ENLIGHTENED_CLEAN_FIELD_CONTROL_PROC))) {
vmcs12->cpu_based_vm_exec_control =
evmcs->cpu_based_vm_exec_control;
}
- if (unlikely(!(evmcs->hv_clean_fields &
+ if (unlikely(!(hv_clean_fields &
HV_VMX_ENLIGHTENED_CLEAN_FIELD_CONTROL_EXCPN))) {
vmcs12->exception_bitmap = evmcs->exception_bitmap;
}
- if (unlikely(!(evmcs->hv_clean_fields &
+ if (unlikely(!(hv_clean_fields &
HV_VMX_ENLIGHTENED_CLEAN_FIELD_CONTROL_ENTRY))) {
vmcs12->vm_entry_controls = evmcs->vm_entry_controls;
}
- if (unlikely(!(evmcs->hv_clean_fields &
+ if (unlikely(!(hv_clean_fields &
HV_VMX_ENLIGHTENED_CLEAN_FIELD_CONTROL_EVENT))) {
vmcs12->vm_entry_intr_info_field =
evmcs->vm_entry_intr_info_field;
@@ -1629,7 +1601,7 @@ static int copy_enlightened_to_vmcs12(struct vcpu_vmx *vmx)
evmcs->vm_entry_instruction_len;
}
- if (unlikely(!(evmcs->hv_clean_fields &
+ if (unlikely(!(hv_clean_fields &
HV_VMX_ENLIGHTENED_CLEAN_FIELD_HOST_GRP1))) {
vmcs12->host_ia32_pat = evmcs->host_ia32_pat;
vmcs12->host_ia32_efer = evmcs->host_ia32_efer;
@@ -1649,7 +1621,7 @@ static int copy_enlightened_to_vmcs12(struct vcpu_vmx *vmx)
vmcs12->host_tr_selector = evmcs->host_tr_selector;
}
- if (unlikely(!(evmcs->hv_clean_fields &
+ if (unlikely(!(hv_clean_fields &
HV_VMX_ENLIGHTENED_CLEAN_FIELD_CONTROL_GRP1))) {
vmcs12->pin_based_vm_exec_control =
evmcs->pin_based_vm_exec_control;
@@ -1658,18 +1630,18 @@ static int copy_enlightened_to_vmcs12(struct vcpu_vmx *vmx)
evmcs->secondary_vm_exec_control;
}
- if (unlikely(!(evmcs->hv_clean_fields &
+ if (unlikely(!(hv_clean_fields &
HV_VMX_ENLIGHTENED_CLEAN_FIELD_IO_BITMAP))) {
vmcs12->io_bitmap_a = evmcs->io_bitmap_a;
vmcs12->io_bitmap_b = evmcs->io_bitmap_b;
}
- if (unlikely(!(evmcs->hv_clean_fields &
+ if (unlikely(!(hv_clean_fields &
HV_VMX_ENLIGHTENED_CLEAN_FIELD_MSR_BITMAP))) {
vmcs12->msr_bitmap = evmcs->msr_bitmap;
}
- if (unlikely(!(evmcs->hv_clean_fields &
+ if (unlikely(!(hv_clean_fields &
HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2))) {
vmcs12->guest_es_base = evmcs->guest_es_base;
vmcs12->guest_cs_base = evmcs->guest_cs_base;
@@ -1709,14 +1681,14 @@ static int copy_enlightened_to_vmcs12(struct vcpu_vmx *vmx)
vmcs12->guest_tr_selector = evmcs->guest_tr_selector;
}
- if (unlikely(!(evmcs->hv_clean_fields &
+ if (unlikely(!(hv_clean_fields &
HV_VMX_ENLIGHTENED_CLEAN_FIELD_CONTROL_GRP2))) {
vmcs12->tsc_offset = evmcs->tsc_offset;
vmcs12->virtual_apic_page_addr = evmcs->virtual_apic_page_addr;
vmcs12->xss_exit_bitmap = evmcs->xss_exit_bitmap;
}
- if (unlikely(!(evmcs->hv_clean_fields &
+ if (unlikely(!(hv_clean_fields &
HV_VMX_ENLIGHTENED_CLEAN_FIELD_CRDR))) {
vmcs12->cr0_guest_host_mask = evmcs->cr0_guest_host_mask;
vmcs12->cr4_guest_host_mask = evmcs->cr4_guest_host_mask;
@@ -1728,7 +1700,7 @@ static int copy_enlightened_to_vmcs12(struct vcpu_vmx *vmx)
vmcs12->guest_dr7 = evmcs->guest_dr7;
}
- if (unlikely(!(evmcs->hv_clean_fields &
+ if (unlikely(!(hv_clean_fields &
HV_VMX_ENLIGHTENED_CLEAN_FIELD_HOST_POINTER))) {
vmcs12->host_fs_base = evmcs->host_fs_base;
vmcs12->host_gs_base = evmcs->host_gs_base;
@@ -1738,13 +1710,13 @@ static int copy_enlightened_to_vmcs12(struct vcpu_vmx *vmx)
vmcs12->host_rsp = evmcs->host_rsp;
}
- if (unlikely(!(evmcs->hv_clean_fields &
+ if (unlikely(!(hv_clean_fields &
HV_VMX_ENLIGHTENED_CLEAN_FIELD_CONTROL_XLAT))) {
vmcs12->ept_pointer = evmcs->ept_pointer;
vmcs12->virtual_processor_id = evmcs->virtual_processor_id;
}
- if (unlikely(!(evmcs->hv_clean_fields &
+ if (unlikely(!(hv_clean_fields &
HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP1))) {
vmcs12->vmcs_link_pointer = evmcs->vmcs_link_pointer;
vmcs12->guest_ia32_debugctl = evmcs->guest_ia32_debugctl;
@@ -1799,10 +1771,10 @@ static int copy_enlightened_to_vmcs12(struct vcpu_vmx *vmx)
* vmcs12->exit_io_instruction_eip = evmcs->exit_io_instruction_eip;
*/
- return 0;
+ return;
}
-static int copy_vmcs12_to_enlightened(struct vcpu_vmx *vmx)
+static void copy_vmcs12_to_enlightened(struct vcpu_vmx *vmx)
{
struct vmcs12 *vmcs12 = vmx->nested.cached_vmcs12;
struct hv_enlightened_vmcs *evmcs = vmx->nested.hv_evmcs;
@@ -1962,7 +1934,7 @@ static int copy_vmcs12_to_enlightened(struct vcpu_vmx *vmx)
evmcs->guest_bndcfgs = vmcs12->guest_bndcfgs;
- return 0;
+ return;
}
/*
@@ -1979,13 +1951,13 @@ static enum nested_evmptrld_status nested_vmx_handle_enlightened_vmptrld(
if (likely(!vmx->nested.enlightened_vmcs_enabled))
return EVMPTRLD_DISABLED;
- if (!nested_enlightened_vmentry(vcpu, &evmcs_gpa))
+ if (!nested_enlightened_vmentry(vcpu, &evmcs_gpa)) {
+ nested_release_evmcs(vcpu);
return EVMPTRLD_DISABLED;
+ }
- if (unlikely(!vmx->nested.hv_evmcs ||
- evmcs_gpa != vmx->nested.hv_evmcs_vmptr)) {
- if (!vmx->nested.hv_evmcs)
- vmx->nested.current_vmptr = -1ull;
+ if (unlikely(evmcs_gpa != vmx->nested.hv_evmcs_vmptr)) {
+ vmx->nested.current_vmptr = -1ull;
nested_release_evmcs(vcpu);
@@ -2023,7 +1995,6 @@ static enum nested_evmptrld_status nested_vmx_handle_enlightened_vmptrld(
return EVMPTRLD_VMFAIL;
}
- vmx->nested.dirty_vmcs12 = true;
vmx->nested.hv_evmcs_vmptr = evmcs_gpa;
evmcs_gpa_changed = true;
@@ -2056,14 +2027,10 @@ void nested_sync_vmcs12_to_shadow(struct kvm_vcpu *vcpu)
{
struct vcpu_vmx *vmx = to_vmx(vcpu);
- if (vmx->nested.hv_evmcs) {
+ if (evmptr_is_valid(vmx->nested.hv_evmcs_vmptr))
copy_vmcs12_to_enlightened(vmx);
- /* All fields are clean */
- vmx->nested.hv_evmcs->hv_clean_fields |=
- HV_VMX_ENLIGHTENED_CLEAN_FIELD_ALL;
- } else {
+ else
copy_vmcs12_to_shadow(vmx);
- }
vmx->nested.need_vmcs12_to_shadow_sync = false;
}
@@ -2208,7 +2175,7 @@ static void prepare_vmcs02_early(struct vcpu_vmx *vmx, struct vmcs12 *vmcs12)
u32 exec_control;
u64 guest_efer = nested_vmx_calc_efer(vmx, vmcs12);
- if (vmx->nested.dirty_vmcs12 || vmx->nested.hv_evmcs)
+ if (vmx->nested.dirty_vmcs12 || evmptr_is_valid(vmx->nested.hv_evmcs_vmptr))
prepare_vmcs02_early_rare(vmx, vmcs12);
/*
@@ -2277,7 +2244,8 @@ static void prepare_vmcs02_early(struct vcpu_vmx *vmx, struct vmcs12 *vmcs12)
SECONDARY_EXEC_ENABLE_USR_WAIT_PAUSE |
SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY |
SECONDARY_EXEC_APIC_REGISTER_VIRT |
- SECONDARY_EXEC_ENABLE_VMFUNC);
+ SECONDARY_EXEC_ENABLE_VMFUNC |
+ SECONDARY_EXEC_TSC_SCALING);
if (nested_cpu_has(vmcs12,
CPU_BASED_ACTIVATE_SECONDARY_CONTROLS))
exec_control |= vmcs12->secondary_vm_exec_control;
@@ -2488,18 +2456,18 @@ static void prepare_vmcs02_rare(struct vcpu_vmx *vmx, struct vmcs12 *vmcs12)
* is assigned to entry_failure_code on failure.
*/
static int prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12,
+ bool from_vmentry,
enum vm_entry_failure_code *entry_failure_code)
{
struct vcpu_vmx *vmx = to_vmx(vcpu);
- struct hv_enlightened_vmcs *hv_evmcs = vmx->nested.hv_evmcs;
bool load_guest_pdptrs_vmcs12 = false;
- if (vmx->nested.dirty_vmcs12 || hv_evmcs) {
+ if (vmx->nested.dirty_vmcs12 || evmptr_is_valid(vmx->nested.hv_evmcs_vmptr)) {
prepare_vmcs02_rare(vmx, vmcs12);
vmx->nested.dirty_vmcs12 = false;
- load_guest_pdptrs_vmcs12 = !hv_evmcs ||
- !(hv_evmcs->hv_clean_fields &
+ load_guest_pdptrs_vmcs12 = !evmptr_is_valid(vmx->nested.hv_evmcs_vmptr) ||
+ !(vmx->nested.hv_evmcs->hv_clean_fields &
HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP1);
}
@@ -2532,10 +2500,18 @@ static int prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12,
vmcs_write64(GUEST_IA32_PAT, vmx->vcpu.arch.pat);
}
- vmcs_write64(TSC_OFFSET, vcpu->arch.tsc_offset);
+ vcpu->arch.tsc_offset = kvm_calc_nested_tsc_offset(
+ vcpu->arch.l1_tsc_offset,
+ vmx_get_l2_tsc_offset(vcpu),
+ vmx_get_l2_tsc_multiplier(vcpu));
+ vcpu->arch.tsc_scaling_ratio = kvm_calc_nested_tsc_multiplier(
+ vcpu->arch.l1_tsc_scaling_ratio,
+ vmx_get_l2_tsc_multiplier(vcpu));
+
+ vmcs_write64(TSC_OFFSET, vcpu->arch.tsc_offset);
if (kvm_has_tsc_control)
- decache_tsc_multiplier(vmx);
+ vmcs_write64(TSC_MULTIPLIER, vcpu->arch.tsc_scaling_ratio);
nested_vmx_transition_tlb_flush(vcpu, vmcs12, true);
@@ -2572,7 +2548,7 @@ static int prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12,
/* Shadow page tables on either EPT or shadow page tables. */
if (nested_vmx_load_cr3(vcpu, vmcs12->guest_cr3, nested_cpu_has_ept(vmcs12),
- entry_failure_code))
+ from_vmentry, entry_failure_code))
return -EINVAL;
/*
@@ -2604,6 +2580,17 @@ static int prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12,
kvm_rsp_write(vcpu, vmcs12->guest_rsp);
kvm_rip_write(vcpu, vmcs12->guest_rip);
+
+ /*
+ * It was observed that genuine Hyper-V running in L1 doesn't reset
+ * 'hv_clean_fields' by itself, it only sets the corresponding dirty
+ * bits when it changes a field in eVMCS. Mark all fields as clean
+ * here.
+ */
+ if (evmptr_is_valid(vmx->nested.hv_evmcs_vmptr))
+ vmx->nested.hv_evmcs->hv_clean_fields |=
+ HV_VMX_ENLIGHTENED_CLEAN_FIELD_ALL;
+
return 0;
}
@@ -3093,13 +3080,20 @@ static bool nested_get_evmcs_page(struct kvm_vcpu *vcpu)
* L2 was running), map it here to make sure vmcs12 changes are
* properly reflected.
*/
- if (vmx->nested.enlightened_vmcs_enabled && !vmx->nested.hv_evmcs) {
+ if (vmx->nested.enlightened_vmcs_enabled &&
+ vmx->nested.hv_evmcs_vmptr == EVMPTR_MAP_PENDING) {
enum nested_evmptrld_status evmptrld_status =
nested_vmx_handle_enlightened_vmptrld(vcpu, false);
if (evmptrld_status == EVMPTRLD_VMFAIL ||
evmptrld_status == EVMPTRLD_ERROR)
return false;
+
+ /*
+ * Post migration VMCS12 always provides the most actual
+ * information, copy it to eVMCS upon entry.
+ */
+ vmx->nested.need_vmcs12_to_shadow_sync = true;
}
return true;
@@ -3113,6 +3107,18 @@ static bool nested_get_vmcs12_pages(struct kvm_vcpu *vcpu)
struct page *page;
u64 hpa;
+ if (!vcpu->arch.pdptrs_from_userspace &&
+ !nested_cpu_has_ept(vmcs12) && is_pae_paging(vcpu)) {
+ /*
+ * Reload the guest's PDPTRs since after a migration
+ * the guest CR3 might be restored prior to setting the nested
+ * state which can lead to a load of wrong PDPTRs.
+ */
+ if (CC(!load_pdptrs(vcpu, vcpu->arch.walk_mmu, vcpu->arch.cr3)))
+ return false;
+ }
+
+
if (nested_cpu_has2(vmcs12, SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES)) {
/*
* Translate L1 physical address to host physical
@@ -3175,6 +3181,15 @@ static bool nested_get_vmcs12_pages(struct kvm_vcpu *vcpu)
offset_in_page(vmcs12->posted_intr_desc_addr));
vmcs_write64(POSTED_INTR_DESC_ADDR,
pfn_to_hpa(map->pfn) + offset_in_page(vmcs12->posted_intr_desc_addr));
+ } else {
+ /*
+ * Defer the KVM_INTERNAL_EXIT until KVM tries to
+ * access the contents of the VMCS12 posted interrupt
+ * descriptor. (Note that KVM may do this when it
+ * should not, per the architectural specification.)
+ */
+ vmx->nested.pi_desc = NULL;
+ pin_controls_clearbit(vmx, PIN_BASED_POSTED_INTR);
}
}
if (nested_vmx_prepare_msr_bitmap(vcpu, vmcs12))
@@ -3354,10 +3369,8 @@ enum nvmx_vmentry_status nested_vmx_enter_non_root_mode(struct kvm_vcpu *vcpu,
}
enter_guest_mode(vcpu);
- if (vmcs12->cpu_based_vm_exec_control & CPU_BASED_USE_TSC_OFFSETTING)
- vcpu->arch.tsc_offset += vmcs12->tsc_offset;
- if (prepare_vmcs02(vcpu, vmcs12, &entry_failure_code)) {
+ if (prepare_vmcs02(vcpu, vmcs12, from_vmentry, &entry_failure_code)) {
exit_reason.basic = EXIT_REASON_INVALID_STATE;
vmcs12->exit_qualification = entry_failure_code;
goto vmentry_fail_vmexit_guest_mode;
@@ -3437,7 +3450,7 @@ vmentry_fail_vmexit:
load_vmcs12_host_state(vcpu, vmcs12);
vmcs12->vm_exit_reason = exit_reason.full;
- if (enable_shadow_vmcs || vmx->nested.hv_evmcs)
+ if (enable_shadow_vmcs || evmptr_is_valid(vmx->nested.hv_evmcs_vmptr))
vmx->nested.need_vmcs12_to_shadow_sync = true;
return NVMX_VMENTRY_VMEXIT;
}
@@ -3454,8 +3467,6 @@ static int nested_vmx_run(struct kvm_vcpu *vcpu, bool launch)
u32 interrupt_shadow = vmx_get_interrupt_shadow(vcpu);
enum nested_evmptrld_status evmptrld_status;
- ++vcpu->stat.nested_run;
-
if (!nested_vmx_check_permission(vcpu))
return 1;
@@ -3467,7 +3478,8 @@ static int nested_vmx_run(struct kvm_vcpu *vcpu, bool launch)
return nested_vmx_failInvalid(vcpu);
}
- if (CC(!vmx->nested.hv_evmcs && vmx->nested.current_vmptr == -1ull))
+ if (CC(!evmptr_is_valid(vmx->nested.hv_evmcs_vmptr) &&
+ vmx->nested.current_vmptr == -1ull))
return nested_vmx_failInvalid(vcpu);
vmcs12 = get_vmcs12(vcpu);
@@ -3481,8 +3493,8 @@ static int nested_vmx_run(struct kvm_vcpu *vcpu, bool launch)
if (CC(vmcs12->hdr.shadow_vmcs))
return nested_vmx_failInvalid(vcpu);
- if (vmx->nested.hv_evmcs) {
- copy_enlightened_to_vmcs12(vmx);
+ if (evmptr_is_valid(vmx->nested.hv_evmcs_vmptr)) {
+ copy_enlightened_to_vmcs12(vmx, vmx->nested.hv_evmcs->hv_clean_fields);
/* Enlightened VMCS doesn't have launch state */
vmcs12->launch_state = !launch;
} else if (enable_shadow_vmcs) {
@@ -3682,25 +3694,29 @@ void nested_mark_vmcs12_pages_dirty(struct kvm_vcpu *vcpu)
}
}
-static void vmx_complete_nested_posted_interrupt(struct kvm_vcpu *vcpu)
+static int vmx_complete_nested_posted_interrupt(struct kvm_vcpu *vcpu)
{
struct vcpu_vmx *vmx = to_vmx(vcpu);
int max_irr;
void *vapic_page;
u16 status;
- if (!vmx->nested.pi_desc || !vmx->nested.pi_pending)
- return;
+ if (!vmx->nested.pi_pending)
+ return 0;
+
+ if (!vmx->nested.pi_desc)
+ goto mmio_needed;
vmx->nested.pi_pending = false;
+
if (!pi_test_and_clear_on(vmx->nested.pi_desc))
- return;
+ return 0;
max_irr = find_last_bit((unsigned long *)vmx->nested.pi_desc->pir, 256);
if (max_irr != 256) {
vapic_page = vmx->nested.virtual_apic_map.hva;
if (!vapic_page)
- return;
+ goto mmio_needed;
__kvm_apic_update_irr(vmx->nested.pi_desc->pir,
vapic_page, &max_irr);
@@ -3713,6 +3729,11 @@ static void vmx_complete_nested_posted_interrupt(struct kvm_vcpu *vcpu)
}
nested_mark_vmcs12_pages_dirty(vcpu);
+ return 0;
+
+mmio_needed:
+ kvm_handle_memory_failure(vcpu, X86EMUL_IO_NEEDED, NULL);
+ return -ENXIO;
}
static void nested_vmx_inject_exception_vmexit(struct kvm_vcpu *vcpu,
@@ -3887,8 +3908,7 @@ static int vmx_check_nested_events(struct kvm_vcpu *vcpu)
}
no_vmexit:
- vmx_complete_nested_posted_interrupt(vcpu);
- return 0;
+ return vmx_complete_nested_posted_interrupt(vcpu);
}
static u32 vmx_get_preemption_timer_value(struct kvm_vcpu *vcpu)
@@ -4032,10 +4052,11 @@ static void sync_vmcs02_to_vmcs12(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12)
{
struct vcpu_vmx *vmx = to_vmx(vcpu);
- if (vmx->nested.hv_evmcs)
+ if (evmptr_is_valid(vmx->nested.hv_evmcs_vmptr))
sync_vmcs02_to_vmcs12_rare(vcpu, vmcs12);
- vmx->nested.need_sync_vmcs02_to_vmcs12_rare = !vmx->nested.hv_evmcs;
+ vmx->nested.need_sync_vmcs02_to_vmcs12_rare =
+ !evmptr_is_valid(vmx->nested.hv_evmcs_vmptr);
vmcs12->guest_cr0 = vmcs12_guest_cr0(vcpu, vmcs12);
vmcs12->guest_cr4 = vmcs12_guest_cr4(vcpu, vmcs12);
@@ -4206,7 +4227,7 @@ static void load_vmcs12_host_state(struct kvm_vcpu *vcpu,
* Only PDPTE load can fail as the value of cr3 was checked on entry and
* couldn't have changed.
*/
- if (nested_vmx_load_cr3(vcpu, vmcs12->host_cr3, false, &ignored))
+ if (nested_vmx_load_cr3(vcpu, vmcs12->host_cr3, false, true, &ignored))
nested_vmx_abort(vcpu, VMX_ABORT_LOAD_HOST_PDPTE_FAIL);
nested_vmx_transition_tlb_flush(vcpu, vmcs12, false);
@@ -4463,8 +4484,11 @@ void nested_vmx_vmexit(struct kvm_vcpu *vcpu, u32 vm_exit_reason,
if (nested_cpu_has_preemption_timer(vmcs12))
hrtimer_cancel(&to_vmx(vcpu)->nested.preemption_timer);
- if (vmcs12->cpu_based_vm_exec_control & CPU_BASED_USE_TSC_OFFSETTING)
- vcpu->arch.tsc_offset -= vmcs12->tsc_offset;
+ if (nested_cpu_has(vmcs12, CPU_BASED_USE_TSC_OFFSETTING)) {
+ vcpu->arch.tsc_offset = vcpu->arch.l1_tsc_offset;
+ if (nested_cpu_has2(vmcs12, SECONDARY_EXEC_TSC_SCALING))
+ vcpu->arch.tsc_scaling_ratio = vcpu->arch.l1_tsc_scaling_ratio;
+ }
if (likely(!vmx->fail)) {
sync_vmcs02_to_vmcs12(vcpu, vmcs12);
@@ -4501,12 +4525,12 @@ void nested_vmx_vmexit(struct kvm_vcpu *vcpu, u32 vm_exit_reason,
vmcs_write32(VM_EXIT_MSR_LOAD_COUNT, vmx->msr_autoload.host.nr);
vmcs_write32(VM_ENTRY_MSR_LOAD_COUNT, vmx->msr_autoload.guest.nr);
vmcs_write64(TSC_OFFSET, vcpu->arch.tsc_offset);
+ if (kvm_has_tsc_control)
+ vmcs_write64(TSC_MULTIPLIER, vcpu->arch.tsc_scaling_ratio);
+
if (vmx->nested.l1_tpr_threshold != -1)
vmcs_write32(TPR_THRESHOLD, vmx->nested.l1_tpr_threshold);
- if (kvm_has_tsc_control)
- decache_tsc_multiplier(vmx);
-
if (vmx->nested.change_vmcs01_virtual_apic_mode) {
vmx->nested.change_vmcs01_virtual_apic_mode = false;
vmx_set_virtual_apic_mode(vcpu);
@@ -4532,7 +4556,7 @@ void nested_vmx_vmexit(struct kvm_vcpu *vcpu, u32 vm_exit_reason,
}
if ((vm_exit_reason != -1) &&
- (enable_shadow_vmcs || vmx->nested.hv_evmcs))
+ (enable_shadow_vmcs || evmptr_is_valid(vmx->nested.hv_evmcs_vmptr)))
vmx->nested.need_vmcs12_to_shadow_sync = true;
/* in case we halted in L2 */
@@ -4987,6 +5011,8 @@ static int handle_vmclear(struct kvm_vcpu *vcpu)
vmptr + offsetof(struct vmcs12,
launch_state),
&zero, sizeof(zero));
+ } else if (vmx->nested.hv_evmcs && vmptr == vmx->nested.hv_evmcs_vmptr) {
+ nested_release_evmcs(vcpu);
}
return nested_vmx_succeed(vcpu);
@@ -5228,7 +5254,7 @@ static int handle_vmptrld(struct kvm_vcpu *vcpu)
return nested_vmx_fail(vcpu, VMXERR_VMPTRLD_VMXON_POINTER);
/* Forbid normal VMPTRLD if Enlightened version was used */
- if (vmx->nested.hv_evmcs)
+ if (evmptr_is_valid(vmx->nested.hv_evmcs_vmptr))
return 1;
if (vmx->nested.current_vmptr != vmptr) {
@@ -5284,7 +5310,7 @@ static int handle_vmptrst(struct kvm_vcpu *vcpu)
if (!nested_vmx_check_permission(vcpu))
return 1;
- if (unlikely(to_vmx(vcpu)->nested.hv_evmcs))
+ if (unlikely(evmptr_is_valid(to_vmx(vcpu)->nested.hv_evmcs_vmptr)))
return 1;
if (get_vmx_mem_address(vcpu, exit_qual, instr_info,
@@ -5461,8 +5487,8 @@ static int handle_invvpid(struct kvm_vcpu *vcpu)
/*
* Sync the shadow page tables if EPT is disabled, L1 is invalidating
- * linear mappings for L2 (tagged with L2's VPID). Free all roots as
- * VPIDs are not tracked in the MMU role.
+ * linear mappings for L2 (tagged with L2's VPID). Free all guest
+ * roots as VPIDs are not tracked in the MMU role.
*
* Note, this operates on root_mmu, not guest_mmu, as L1 and L2 share
* an MMU when EPT is disabled.
@@ -5470,8 +5496,7 @@ static int handle_invvpid(struct kvm_vcpu *vcpu)
* TODO: sync only the affected SPTEs for INVDIVIDUAL_ADDR.
*/
if (!enable_ept)
- kvm_mmu_free_roots(vcpu, &vcpu->arch.root_mmu,
- KVM_MMU_ROOTS_ALL);
+ kvm_mmu_free_guest_mode_roots(vcpu, &vcpu->arch.root_mmu);
return nested_vmx_succeed(vcpu);
}
@@ -5481,23 +5506,16 @@ static int nested_vmx_eptp_switching(struct kvm_vcpu *vcpu,
{
u32 index = kvm_rcx_read(vcpu);
u64 new_eptp;
- bool accessed_dirty;
- struct kvm_mmu *mmu = vcpu->arch.walk_mmu;
- if (!nested_cpu_has_eptp_switching(vmcs12) ||
- !nested_cpu_has_ept(vmcs12))
+ if (WARN_ON_ONCE(!nested_cpu_has_ept(vmcs12)))
return 1;
-
if (index >= VMFUNC_EPTP_ENTRIES)
return 1;
-
if (kvm_vcpu_read_guest_page(vcpu, vmcs12->eptp_list_address >> PAGE_SHIFT,
&new_eptp, index * 8, 8))
return 1;
- accessed_dirty = !!(new_eptp & VMX_EPTP_AD_ENABLE_BIT);
-
/*
* If the (L2) guest does a vmfunc to the currently
* active ept pointer, we don't have to do anything else
@@ -5506,11 +5524,11 @@ static int nested_vmx_eptp_switching(struct kvm_vcpu *vcpu,
if (!nested_vmx_check_eptp(vcpu, new_eptp))
return 1;
- mmu->ept_ad = accessed_dirty;
- mmu->mmu_role.base.ad_disabled = !accessed_dirty;
vmcs12->ept_pointer = new_eptp;
+ nested_ept_new_eptp(vcpu);
- kvm_make_request(KVM_REQ_MMU_RELOAD, vcpu);
+ if (!nested_cpu_has_vpid(vmcs12))
+ kvm_make_request(KVM_REQ_TLB_FLUSH_GUEST, vcpu);
}
return 0;
@@ -5533,7 +5551,17 @@ static int handle_vmfunc(struct kvm_vcpu *vcpu)
}
vmcs12 = get_vmcs12(vcpu);
- if ((vmcs12->vm_function_control & (1 << function)) == 0)
+
+ /*
+ * #UD on out-of-bounds function has priority over VM-Exit, and VMFUNC
+ * is enabled in vmcs02 if and only if it's enabled in vmcs12.
+ */
+ if (WARN_ON_ONCE((function > 63) || !nested_cpu_has_vmfunc(vmcs12))) {
+ kvm_queue_exception(vcpu, UD_VECTOR);
+ return 1;
+ }
+
+ if (!(vmcs12->vm_function_control & BIT_ULL(function)))
goto fail;
switch (function) {
@@ -5806,6 +5834,9 @@ static bool nested_vmx_l0_wants_exit(struct kvm_vcpu *vcpu,
else if (is_breakpoint(intr_info) &&
vcpu->guest_debug & KVM_GUESTDBG_USE_SW_BP)
return true;
+ else if (is_alignment_check(intr_info) &&
+ !vmx_guest_inject_ac(vcpu))
+ return true;
return false;
case EXIT_REASON_EXTERNAL_INTERRUPT:
return true;
@@ -6056,7 +6087,8 @@ static int vmx_get_nested_state(struct kvm_vcpu *vcpu,
if (vmx_has_valid_vmcs12(vcpu)) {
kvm_state.size += sizeof(user_vmx_nested_state->vmcs12);
- if (vmx->nested.hv_evmcs)
+ /* 'hv_evmcs_vmptr' can also be EVMPTR_MAP_PENDING here */
+ if (vmx->nested.hv_evmcs_vmptr != EVMPTR_INVALID)
kvm_state.flags |= KVM_STATE_NESTED_EVMCS;
if (is_guest_mode(vcpu) &&
@@ -6112,8 +6144,15 @@ static int vmx_get_nested_state(struct kvm_vcpu *vcpu,
} else {
copy_vmcs02_to_vmcs12_rare(vcpu, get_vmcs12(vcpu));
if (!vmx->nested.need_vmcs12_to_shadow_sync) {
- if (vmx->nested.hv_evmcs)
- copy_enlightened_to_vmcs12(vmx);
+ if (evmptr_is_valid(vmx->nested.hv_evmcs_vmptr))
+ /*
+ * L1 hypervisor is not obliged to keep eVMCS
+ * clean fields data always up-to-date while
+ * not in guest mode, 'hv_clean_fields' is only
+ * supposed to be actual upon vmentry so we need
+ * to ignore it here and do full copy.
+ */
+ copy_enlightened_to_vmcs12(vmx, 0);
else if (enable_shadow_vmcs)
copy_shadow_to_vmcs12(vmx);
}
@@ -6255,6 +6294,7 @@ static int vmx_set_nested_state(struct kvm_vcpu *vcpu,
* restored yet. EVMCS will be mapped from
* nested_get_vmcs12_pages().
*/
+ vmx->nested.hv_evmcs_vmptr = EVMPTR_MAP_PENDING;
kvm_make_request(KVM_REQ_GET_NESTED_STATE_PAGES, vcpu);
} else {
return -EINVAL;
@@ -6339,6 +6379,40 @@ void nested_vmx_set_vmcs_shadowing_bitmap(void)
}
/*
+ * Indexing into the vmcs12 uses the VMCS encoding rotated left by 6. Undo
+ * that madness to get the encoding for comparison.
+ */
+#define VMCS12_IDX_TO_ENC(idx) ((u16)(((u16)(idx) >> 6) | ((u16)(idx) << 10)))
+
+static u64 nested_vmx_calc_vmcs_enum_msr(void)
+{
+ /*
+ * Note these are the so called "index" of the VMCS field encoding, not
+ * the index into vmcs12.
+ */
+ unsigned int max_idx, idx;
+ int i;
+
+ /*
+ * For better or worse, KVM allows VMREAD/VMWRITE to all fields in
+ * vmcs12, regardless of whether or not the associated feature is
+ * exposed to L1. Simply find the field with the highest index.
+ */
+ max_idx = 0;
+ for (i = 0; i < nr_vmcs12_fields; i++) {
+ /* The vmcs12 table is very, very sparsely populated. */
+ if (!vmcs_field_to_offset_table[i])
+ continue;
+
+ idx = vmcs_field_index(VMCS12_IDX_TO_ENC(i));
+ if (idx > max_idx)
+ max_idx = idx;
+ }
+
+ return (u64)max_idx << VMCS_FIELD_INDEX_SHIFT;
+}
+
+/*
* nested_vmx_setup_ctls_msrs() sets up variables containing the values to be
* returned for the various VMX controls MSRs when nested VMX is enabled.
* The same values should also be used to verify that vmcs12 control fields are
@@ -6474,7 +6548,8 @@ void nested_vmx_setup_ctls_msrs(struct nested_vmx_msrs *msrs, u32 ept_caps)
SECONDARY_EXEC_RDRAND_EXITING |
SECONDARY_EXEC_ENABLE_INVPCID |
SECONDARY_EXEC_RDSEED_EXITING |
- SECONDARY_EXEC_XSAVES;
+ SECONDARY_EXEC_XSAVES |
+ SECONDARY_EXEC_TSC_SCALING;
/*
* We can emulate "VMCS shadowing," even if the hardware
@@ -6582,8 +6657,7 @@ void nested_vmx_setup_ctls_msrs(struct nested_vmx_msrs *msrs, u32 ept_caps)
rdmsrl(MSR_IA32_VMX_CR0_FIXED1, msrs->cr0_fixed1);
rdmsrl(MSR_IA32_VMX_CR4_FIXED1, msrs->cr4_fixed1);
- /* highest index: VMX_PREEMPTION_TIMER_VALUE */
- msrs->vmcs_enum = VMCS12_MAX_FIELD_INDEX << 1;
+ msrs->vmcs_enum = nested_vmx_calc_vmcs_enum_msr();
}
void nested_vmx_hardware_unsetup(void)
diff --git a/arch/x86/kvm/vmx/nested.h b/arch/x86/kvm/vmx/nested.h
index 184418baeb3c..b69a80f43b37 100644
--- a/arch/x86/kvm/vmx/nested.h
+++ b/arch/x86/kvm/vmx/nested.h
@@ -56,14 +56,9 @@ static inline int vmx_has_valid_vmcs12(struct kvm_vcpu *vcpu)
{
struct vcpu_vmx *vmx = to_vmx(vcpu);
- /*
- * In case we do two consecutive get/set_nested_state()s while L2 was
- * running hv_evmcs may end up not being mapped (we map it from
- * nested_vmx_run()/vmx_vcpu_run()). Check is_guest_mode() as we always
- * have vmcs12 if it is true.
- */
- return is_guest_mode(vcpu) || vmx->nested.current_vmptr != -1ull ||
- vmx->nested.hv_evmcs;
+ /* 'hv_evmcs_vmptr' can also be EVMPTR_MAP_PENDING here */
+ return vmx->nested.current_vmptr != -1ull ||
+ vmx->nested.hv_evmcs_vmptr != EVMPTR_INVALID;
}
static inline u16 nested_get_vpid02(struct kvm_vcpu *vcpu)
diff --git a/arch/x86/kvm/vmx/vmcs.h b/arch/x86/kvm/vmx/vmcs.h
index 1472c6c376f7..4b9957e2bf5b 100644
--- a/arch/x86/kvm/vmx/vmcs.h
+++ b/arch/x86/kvm/vmx/vmcs.h
@@ -117,6 +117,11 @@ static inline bool is_gp_fault(u32 intr_info)
return is_exception_n(intr_info, GP_VECTOR);
}
+static inline bool is_alignment_check(u32 intr_info)
+{
+ return is_exception_n(intr_info, AC_VECTOR);
+}
+
static inline bool is_machine_check(u32 intr_info)
{
return is_exception_n(intr_info, MC_VECTOR);
@@ -164,4 +169,12 @@ static inline int vmcs_field_readonly(unsigned long field)
return (((field >> 10) & 0x3) == 1);
}
+#define VMCS_FIELD_INDEX_SHIFT (1)
+#define VMCS_FIELD_INDEX_MASK GENMASK(9, 1)
+
+static inline unsigned int vmcs_field_index(unsigned long field)
+{
+ return (field & VMCS_FIELD_INDEX_MASK) >> VMCS_FIELD_INDEX_SHIFT;
+}
+
#endif /* __KVM_X86_VMX_VMCS_H */
diff --git a/arch/x86/kvm/vmx/vmcs12.c b/arch/x86/kvm/vmx/vmcs12.c
index 034adb6404dc..d9f5d7c56ae3 100644
--- a/arch/x86/kvm/vmx/vmcs12.c
+++ b/arch/x86/kvm/vmx/vmcs12.c
@@ -37,6 +37,7 @@ const unsigned short vmcs_field_to_offset_table[] = {
FIELD64(VM_ENTRY_MSR_LOAD_ADDR, vm_entry_msr_load_addr),
FIELD64(PML_ADDRESS, pml_address),
FIELD64(TSC_OFFSET, tsc_offset),
+ FIELD64(TSC_MULTIPLIER, tsc_multiplier),
FIELD64(VIRTUAL_APIC_PAGE_ADDR, virtual_apic_page_addr),
FIELD64(APIC_ACCESS_ADDR, apic_access_addr),
FIELD64(POSTED_INTR_DESC_ADDR, posted_intr_desc_addr),
diff --git a/arch/x86/kvm/vmx/vmcs12.h b/arch/x86/kvm/vmx/vmcs12.h
index 13494956d0e9..5e0e1b39f495 100644
--- a/arch/x86/kvm/vmx/vmcs12.h
+++ b/arch/x86/kvm/vmx/vmcs12.h
@@ -70,7 +70,8 @@ struct __packed vmcs12 {
u64 eptp_list_address;
u64 pml_address;
u64 encls_exiting_bitmap;
- u64 padding64[2]; /* room for future expansion */
+ u64 tsc_multiplier;
+ u64 padding64[1]; /* room for future expansion */
/*
* To allow migration of L1 (complete with its L2 guests) between
* machines of different natural widths (32 or 64 bit), we cannot have
@@ -205,12 +206,6 @@ struct __packed vmcs12 {
#define VMCS12_SIZE KVM_STATE_NESTED_VMX_VMCS_SIZE
/*
- * VMCS12_MAX_FIELD_INDEX is the highest index value used in any
- * supported VMCS12 field encoding.
- */
-#define VMCS12_MAX_FIELD_INDEX 0x17
-
-/*
* For save/restore compatibility, the vmcs12 field offsets must not change.
*/
#define CHECK_OFFSET(field, loc) \
@@ -258,6 +253,7 @@ static inline void vmx_check_vmcs12_offsets(void)
CHECK_OFFSET(eptp_list_address, 304);
CHECK_OFFSET(pml_address, 312);
CHECK_OFFSET(encls_exiting_bitmap, 320);
+ CHECK_OFFSET(tsc_multiplier, 328);
CHECK_OFFSET(cr0_guest_host_mask, 344);
CHECK_OFFSET(cr4_guest_host_mask, 352);
CHECK_OFFSET(cr0_read_shadow, 360);
diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c
index c2a779b688e6..927a552393b9 100644
--- a/arch/x86/kvm/vmx/vmx.c
+++ b/arch/x86/kvm/vmx/vmx.c
@@ -52,6 +52,7 @@
#include "cpuid.h"
#include "evmcs.h"
#include "hyperv.h"
+#include "kvm_onhyperv.h"
#include "irq.h"
#include "kvm_cache_regs.h"
#include "lapic.h"
@@ -101,7 +102,6 @@ module_param(emulate_invalid_guest_state, bool, S_IRUGO);
static bool __read_mostly fasteoi = 1;
module_param(fasteoi, bool, S_IRUGO);
-bool __read_mostly enable_apicv = 1;
module_param(enable_apicv, bool, S_IRUGO);
/*
@@ -459,86 +459,6 @@ static unsigned long host_idt_base;
static bool __read_mostly enlightened_vmcs = true;
module_param(enlightened_vmcs, bool, 0444);
-static int kvm_fill_hv_flush_list_func(struct hv_guest_mapping_flush_list *flush,
- void *data)
-{
- struct kvm_tlb_range *range = data;
-
- return hyperv_fill_flush_guest_mapping_list(flush, range->start_gfn,
- range->pages);
-}
-
-static inline int hv_remote_flush_root_ept(hpa_t root_ept,
- struct kvm_tlb_range *range)
-{
- if (range)
- return hyperv_flush_guest_mapping_range(root_ept,
- kvm_fill_hv_flush_list_func, (void *)range);
- else
- return hyperv_flush_guest_mapping(root_ept);
-}
-
-static int hv_remote_flush_tlb_with_range(struct kvm *kvm,
- struct kvm_tlb_range *range)
-{
- struct kvm_vmx *kvm_vmx = to_kvm_vmx(kvm);
- struct kvm_vcpu *vcpu;
- int ret = 0, i, nr_unique_valid_roots;
- hpa_t root;
-
- spin_lock(&kvm_vmx->hv_root_ept_lock);
-
- if (!VALID_PAGE(kvm_vmx->hv_root_ept)) {
- nr_unique_valid_roots = 0;
-
- /*
- * Flush all valid roots, and see if all vCPUs have converged
- * on a common root, in which case future flushes can skip the
- * loop and flush the common root.
- */
- kvm_for_each_vcpu(i, vcpu, kvm) {
- root = to_vmx(vcpu)->hv_root_ept;
- if (!VALID_PAGE(root) || root == kvm_vmx->hv_root_ept)
- continue;
-
- /*
- * Set the tracked root to the first valid root. Keep
- * this root for the entirety of the loop even if more
- * roots are encountered as a low effort optimization
- * to avoid flushing the same (first) root again.
- */
- if (++nr_unique_valid_roots == 1)
- kvm_vmx->hv_root_ept = root;
-
- if (!ret)
- ret = hv_remote_flush_root_ept(root, range);
-
- /*
- * Stop processing roots if a failure occurred and
- * multiple valid roots have already been detected.
- */
- if (ret && nr_unique_valid_roots > 1)
- break;
- }
-
- /*
- * The optimized flush of a single root can't be used if there
- * are multiple valid roots (obviously).
- */
- if (nr_unique_valid_roots > 1)
- kvm_vmx->hv_root_ept = INVALID_PAGE;
- } else {
- ret = hv_remote_flush_root_ept(kvm_vmx->hv_root_ept, range);
- }
-
- spin_unlock(&kvm_vmx->hv_root_ept_lock);
- return ret;
-}
-static int hv_remote_flush_tlb(struct kvm *kvm)
-{
- return hv_remote_flush_tlb_with_range(kvm, NULL);
-}
-
static int hv_enable_direct_tlbflush(struct kvm_vcpu *vcpu)
{
struct hv_enlightened_vmcs *evmcs;
@@ -566,21 +486,6 @@ static int hv_enable_direct_tlbflush(struct kvm_vcpu *vcpu)
#endif /* IS_ENABLED(CONFIG_HYPERV) */
-static void hv_track_root_ept(struct kvm_vcpu *vcpu, hpa_t root_ept)
-{
-#if IS_ENABLED(CONFIG_HYPERV)
- struct kvm_vmx *kvm_vmx = to_kvm_vmx(vcpu->kvm);
-
- if (kvm_x86_ops.tlb_remote_flush == hv_remote_flush_tlb) {
- spin_lock(&kvm_vmx->hv_root_ept_lock);
- to_vmx(vcpu)->hv_root_ept = root_ept;
- if (root_ept != kvm_vmx->hv_root_ept)
- kvm_vmx->hv_root_ept = INVALID_PAGE;
- spin_unlock(&kvm_vmx->hv_root_ept_lock);
- }
-#endif
-}
-
/*
* Comment's format: document - errata name - stepping - processor name.
* Refer from
@@ -842,16 +747,21 @@ void vmx_update_exception_bitmap(struct kvm_vcpu *vcpu)
if (is_guest_mode(vcpu))
eb |= get_vmcs12(vcpu)->exception_bitmap;
else {
- /*
- * If EPT is enabled, #PF is only trapped if MAXPHYADDR is mismatched
- * between guest and host. In that case we only care about present
- * faults. For vmcs02, however, PFEC_MASK and PFEC_MATCH are set in
- * prepare_vmcs02_rare.
- */
- bool selective_pf_trap = enable_ept && (eb & (1u << PF_VECTOR));
- int mask = selective_pf_trap ? PFERR_PRESENT_MASK : 0;
+ int mask = 0, match = 0;
+
+ if (enable_ept && (eb & (1u << PF_VECTOR))) {
+ /*
+ * If EPT is enabled, #PF is currently only intercepted
+ * if MAXPHYADDR is smaller on the guest than on the
+ * host. In that case we only care about present,
+ * non-reserved faults. For vmcs02, however, PFEC_MASK
+ * and PFEC_MATCH are set in prepare_vmcs02_rare.
+ */
+ mask = PFERR_PRESENT_MASK | PFERR_RSVD_MASK;
+ match = PFERR_PRESENT_MASK;
+ }
vmcs_write32(PAGE_FAULT_ERROR_CODE_MASK, mask);
- vmcs_write32(PAGE_FAULT_ERROR_CODE_MATCH, mask);
+ vmcs_write32(PAGE_FAULT_ERROR_CODE_MATCH, match);
}
vmcs_write32(EXCEPTION_BITMAP, eb);
@@ -1390,11 +1300,6 @@ void vmx_vcpu_load_vmcs(struct kvm_vcpu *vcpu, int cpu,
vmx->loaded_vmcs->cpu = cpu;
}
-
- /* Setup TSC multiplier */
- if (kvm_has_tsc_control &&
- vmx->current_tsc_ratio != vcpu->arch.tsc_scaling_ratio)
- decache_tsc_multiplier(vmx);
}
/*
@@ -1787,26 +1692,35 @@ static void setup_msrs(struct vcpu_vmx *vmx)
vmx->guest_uret_msrs_loaded = false;
}
-static u64 vmx_write_l1_tsc_offset(struct kvm_vcpu *vcpu, u64 offset)
+u64 vmx_get_l2_tsc_offset(struct kvm_vcpu *vcpu)
{
struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
- u64 g_tsc_offset = 0;
- /*
- * We're here if L1 chose not to trap WRMSR to TSC. According
- * to the spec, this should set L1's TSC; The offset that L1
- * set for L2 remains unchanged, and still needs to be added
- * to the newly set TSC to get L2's TSC.
- */
- if (is_guest_mode(vcpu) &&
- (vmcs12->cpu_based_vm_exec_control & CPU_BASED_USE_TSC_OFFSETTING))
- g_tsc_offset = vmcs12->tsc_offset;
+ if (nested_cpu_has(vmcs12, CPU_BASED_USE_TSC_OFFSETTING))
+ return vmcs12->tsc_offset;
+
+ return 0;
+}
+
+u64 vmx_get_l2_tsc_multiplier(struct kvm_vcpu *vcpu)
+{
+ struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
+
+ if (nested_cpu_has(vmcs12, CPU_BASED_USE_TSC_OFFSETTING) &&
+ nested_cpu_has2(vmcs12, SECONDARY_EXEC_TSC_SCALING))
+ return vmcs12->tsc_multiplier;
- trace_kvm_write_tsc_offset(vcpu->vcpu_id,
- vcpu->arch.tsc_offset - g_tsc_offset,
- offset);
- vmcs_write64(TSC_OFFSET, offset + g_tsc_offset);
- return offset + g_tsc_offset;
+ return kvm_default_tsc_scaling_ratio;
+}
+
+static void vmx_write_tsc_offset(struct kvm_vcpu *vcpu, u64 offset)
+{
+ vmcs_write64(TSC_OFFSET, offset);
+}
+
+static void vmx_write_tsc_multiplier(struct kvm_vcpu *vcpu, u64 multiplier)
+{
+ vmcs_write64(TSC_MULTIPLIER, multiplier);
}
/*
@@ -3181,7 +3095,7 @@ static void vmx_load_mmu_pgd(struct kvm_vcpu *vcpu, hpa_t root_hpa,
eptp = construct_eptp(vcpu, root_hpa, root_level);
vmcs_write64(EPT_POINTER, eptp);
- hv_track_root_ept(vcpu, root_hpa);
+ hv_track_root_tdp(vcpu, root_hpa);
if (!enable_unrestricted_guest && !is_paging(vcpu))
guest_cr3 = to_kvm_vmx(kvm)->ept_identity_map_addr;
@@ -3707,7 +3621,7 @@ static int alloc_apic_access_page(struct kvm *kvm)
int ret = 0;
mutex_lock(&kvm->slots_lock);
- if (kvm->arch.apic_access_page_done)
+ if (kvm->arch.apic_access_memslot_enabled)
goto out;
hva = __x86_set_memory_region(kvm, APIC_ACCESS_PAGE_PRIVATE_MEMSLOT,
APIC_DEFAULT_PHYS_BASE, PAGE_SIZE);
@@ -3727,7 +3641,7 @@ static int alloc_apic_access_page(struct kvm *kvm)
* is able to migrate it.
*/
put_page(page);
- kvm->arch.apic_access_page_done = true;
+ kvm->arch.apic_access_memslot_enabled = true;
out:
mutex_unlock(&kvm->slots_lock);
return ret;
@@ -4829,7 +4743,7 @@ static int handle_machine_check(struct kvm_vcpu *vcpu)
* - Guest has #AC detection enabled in CR0
* - Guest EFLAGS has AC bit set
*/
-static inline bool guest_inject_ac(struct kvm_vcpu *vcpu)
+bool vmx_guest_inject_ac(struct kvm_vcpu *vcpu)
{
if (!boot_cpu_has(X86_FEATURE_SPLIT_LOCK_DETECT))
return true;
@@ -4937,7 +4851,7 @@ static int handle_exception_nmi(struct kvm_vcpu *vcpu)
kvm_run->debug.arch.exception = ex_no;
break;
case AC_VECTOR:
- if (guest_inject_ac(vcpu)) {
+ if (vmx_guest_inject_ac(vcpu)) {
kvm_queue_exception_e(vcpu, AC_VECTOR, error_code);
return 1;
}
@@ -5810,6 +5724,8 @@ void dump_vmcs(struct kvm_vcpu *vcpu)
if (cpu_has_secondary_exec_ctrls())
secondary_exec_control = vmcs_read32(SECONDARY_VM_EXEC_CONTROL);
+ pr_err("VMCS %p, last attempted VM-entry on CPU %d\n",
+ vmx->loaded_vmcs->vmcs, vcpu->arch.last_vmentry_cpu);
pr_err("*** Guest State ***\n");
pr_err("CR0: actual=0x%016lx, shadow=0x%016lx, gh_mask=%016lx\n",
vmcs_readl(GUEST_CR0), vmcs_readl(CR0_READ_SHADOW),
@@ -6806,7 +6722,18 @@ static fastpath_t vmx_vcpu_run(struct kvm_vcpu *vcpu)
kvm_load_host_xsave_state(vcpu);
- vmx->nested.nested_run_pending = 0;
+ if (is_guest_mode(vcpu)) {
+ /*
+ * Track VMLAUNCH/VMRESUME that have made past guest state
+ * checking.
+ */
+ if (vmx->nested.nested_run_pending &&
+ !vmx->exit_reason.failed_vmentry)
+ ++vcpu->stat.nested_run;
+
+ vmx->nested.nested_run_pending = 0;
+ }
+
vmx->idt_vectoring_info = 0;
if (unlikely(vmx->fail)) {
@@ -6941,6 +6868,7 @@ static int vmx_create_vcpu(struct kvm_vcpu *vcpu)
vmx->nested.posted_intr_nv = -1;
vmx->nested.current_vmptr = -1ull;
+ vmx->nested.hv_evmcs_vmptr = EVMPTR_INVALID;
vcpu->arch.microcode_version = 0x100000000ULL;
vmx->msr_ia32_feature_control_valid_bits = FEAT_CTL_LOCKED;
@@ -6952,9 +6880,6 @@ static int vmx_create_vcpu(struct kvm_vcpu *vcpu)
vmx->pi_desc.nv = POSTED_INTR_VECTOR;
vmx->pi_desc.sn = 1;
-#if IS_ENABLED(CONFIG_HYPERV)
- vmx->hv_root_ept = INVALID_PAGE;
-#endif
return 0;
free_vmcs:
@@ -6971,10 +6896,6 @@ free_vpid:
static int vmx_vm_init(struct kvm *kvm)
{
-#if IS_ENABLED(CONFIG_HYPERV)
- spin_lock_init(&to_kvm_vmx(kvm)->hv_root_ept_lock);
-#endif
-
if (!ple_gap)
kvm->arch.pause_in_guest = true;
@@ -7001,7 +6922,6 @@ static int vmx_vm_init(struct kvm *kvm)
break;
}
}
- kvm_apicv_init(kvm, enable_apicv);
return 0;
}
@@ -7453,10 +7373,10 @@ static int vmx_set_hv_timer(struct kvm_vcpu *vcpu, u64 guest_deadline_tsc,
delta_tsc = 0;
/* Convert to host delta tsc if tsc scaling is enabled */
- if (vcpu->arch.tsc_scaling_ratio != kvm_default_tsc_scaling_ratio &&
+ if (vcpu->arch.l1_tsc_scaling_ratio != kvm_default_tsc_scaling_ratio &&
delta_tsc && u64_shl_div_u64(delta_tsc,
kvm_tsc_scaling_ratio_frac_bits,
- vcpu->arch.tsc_scaling_ratio, &delta_tsc))
+ vcpu->arch.l1_tsc_scaling_ratio, &delta_tsc))
return -ERANGE;
/*
@@ -7542,7 +7462,7 @@ static int vmx_smi_allowed(struct kvm_vcpu *vcpu, bool for_injection)
return !is_smm(vcpu);
}
-static int vmx_pre_enter_smm(struct kvm_vcpu *vcpu, char *smstate)
+static int vmx_enter_smm(struct kvm_vcpu *vcpu, char *smstate)
{
struct vcpu_vmx *vmx = to_vmx(vcpu);
@@ -7556,7 +7476,7 @@ static int vmx_pre_enter_smm(struct kvm_vcpu *vcpu, char *smstate)
return 0;
}
-static int vmx_pre_leave_smm(struct kvm_vcpu *vcpu, const char *smstate)
+static int vmx_leave_smm(struct kvm_vcpu *vcpu, const char *smstate)
{
struct vcpu_vmx *vmx = to_vmx(vcpu);
int ret;
@@ -7700,7 +7620,10 @@ static struct kvm_x86_ops vmx_x86_ops __initdata = {
.has_wbinvd_exit = cpu_has_vmx_wbinvd_exit,
- .write_l1_tsc_offset = vmx_write_l1_tsc_offset,
+ .get_l2_tsc_offset = vmx_get_l2_tsc_offset,
+ .get_l2_tsc_multiplier = vmx_get_l2_tsc_multiplier,
+ .write_tsc_offset = vmx_write_tsc_offset,
+ .write_tsc_multiplier = vmx_write_tsc_multiplier,
.load_mmu_pgd = vmx_load_mmu_pgd,
@@ -7731,8 +7654,8 @@ static struct kvm_x86_ops vmx_x86_ops __initdata = {
.setup_mce = vmx_setup_mce,
.smi_allowed = vmx_smi_allowed,
- .pre_enter_smm = vmx_pre_enter_smm,
- .pre_leave_smm = vmx_pre_leave_smm,
+ .enter_smm = vmx_enter_smm,
+ .leave_smm = vmx_leave_smm,
.enable_smi_window = vmx_enable_smi_window,
.can_emulate_instruction = vmx_can_emulate_instruction,
@@ -7807,6 +7730,12 @@ static __init int hardware_setup(void)
!cpu_has_vmx_invept_global())
enable_ept = 0;
+ /* NX support is required for shadow paging. */
+ if (!enable_ept && !boot_cpu_has(X86_FEATURE_NX)) {
+ pr_err_ratelimited("kvm: NX (Execute Disable) not supported\n");
+ return -EOPNOTSUPP;
+ }
+
if (!cpu_has_vmx_ept_ad_bits() || !enable_ept)
enable_ept_ad_bits = 0;
@@ -7996,6 +7925,8 @@ static void vmx_exit(void)
}
#endif
vmx_cleanup_l1d_flush();
+
+ allow_smaller_maxphyaddr = false;
}
module_exit(vmx_exit);
diff --git a/arch/x86/kvm/vmx/vmx.h b/arch/x86/kvm/vmx/vmx.h
index 16e4e457ba23..3979a947933a 100644
--- a/arch/x86/kvm/vmx/vmx.h
+++ b/arch/x86/kvm/vmx/vmx.h
@@ -322,8 +322,6 @@ struct vcpu_vmx {
/* apic deadline value in host tsc */
u64 hv_deadline_tsc;
- u64 current_tsc_ratio;
-
unsigned long host_debugctlmsr;
/*
@@ -336,10 +334,6 @@ struct vcpu_vmx {
/* SGX Launch Control public key hash */
u64 msr_ia32_sgxlepubkeyhash[4];
-#if IS_ENABLED(CONFIG_HYPERV)
- u64 hv_root_ept;
-#endif
-
struct pt_desc pt_desc;
struct lbr_desc lbr_desc;
@@ -357,11 +351,6 @@ struct kvm_vmx {
unsigned int tss_addr;
bool ept_identity_pagetable_done;
gpa_t ept_identity_map_addr;
-
-#if IS_ENABLED(CONFIG_HYPERV)
- hpa_t hv_root_ept;
- spinlock_t hv_root_ept_lock;
-#endif
};
bool nested_vmx_allowed(struct kvm_vcpu *vcpu);
@@ -387,6 +376,7 @@ void vmx_get_segment(struct kvm_vcpu *vcpu, struct kvm_segment *var, int seg);
void vmx_set_segment(struct kvm_vcpu *vcpu, struct kvm_segment *var, int seg);
u64 construct_eptp(struct kvm_vcpu *vcpu, hpa_t root_hpa, int root_level);
+bool vmx_guest_inject_ac(struct kvm_vcpu *vcpu);
void vmx_update_exception_bitmap(struct kvm_vcpu *vcpu);
void vmx_update_msr_bitmap(struct kvm_vcpu *vcpu);
bool vmx_nmi_blocked(struct kvm_vcpu *vcpu);
@@ -404,6 +394,9 @@ void vmx_ept_load_pdptrs(struct kvm_vcpu *vcpu);
void vmx_disable_intercept_for_msr(struct kvm_vcpu *vcpu, u32 msr, int type);
void vmx_enable_intercept_for_msr(struct kvm_vcpu *vcpu, u32 msr, int type);
+u64 vmx_get_l2_tsc_offset(struct kvm_vcpu *vcpu);
+u64 vmx_get_l2_tsc_multiplier(struct kvm_vcpu *vcpu);
+
static inline void vmx_set_intercept_for_msr(struct kvm_vcpu *vcpu, u32 msr,
int type, bool value)
{
@@ -529,12 +522,6 @@ static inline struct vmcs *alloc_vmcs(bool shadow)
GFP_KERNEL_ACCOUNT);
}
-static inline void decache_tsc_multiplier(struct vcpu_vmx *vmx)
-{
- vmx->current_tsc_ratio = vmx->vcpu.arch.tsc_scaling_ratio;
- vmcs_write64(TSC_MULTIPLIER, vmx->current_tsc_ratio);
-}
-
static inline bool vmx_has_waitpkg(struct vcpu_vmx *vmx)
{
return vmx->secondary_exec_control &
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index e0f4a46649d7..17468d983fbd 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -58,6 +58,7 @@
#include <linux/sched/isolation.h>
#include <linux/mem_encrypt.h>
#include <linux/entry-kvm.h>
+#include <linux/suspend.h>
#include <trace/events/kvm.h>
@@ -102,6 +103,8 @@ static u64 __read_mostly efer_reserved_bits = ~((u64)EFER_SCE);
static u64 __read_mostly cr4_reserved_bits = CR4_RESERVED_BITS;
+#define KVM_EXIT_HYPERCALL_VALID_MASK (1 << KVM_HC_MAP_GPA_RANGE)
+
#define KVM_X2APIC_API_VALID_FLAGS (KVM_X2APIC_API_USE_32BIT_IDS | \
KVM_X2APIC_API_DISABLE_BROADCAST_QUIRK)
@@ -113,6 +116,9 @@ static void __kvm_set_rflags(struct kvm_vcpu *vcpu, unsigned long rflags);
static void store_regs(struct kvm_vcpu *vcpu);
static int sync_regs(struct kvm_vcpu *vcpu);
+static int __set_sregs2(struct kvm_vcpu *vcpu, struct kvm_sregs2 *sregs2);
+static void __get_sregs2(struct kvm_vcpu *vcpu, struct kvm_sregs2 *sregs2);
+
struct kvm_x86_ops kvm_x86_ops __read_mostly;
EXPORT_SYMBOL_GPL(kvm_x86_ops);
@@ -209,55 +215,78 @@ EXPORT_SYMBOL_GPL(host_efer);
bool __read_mostly allow_smaller_maxphyaddr = 0;
EXPORT_SYMBOL_GPL(allow_smaller_maxphyaddr);
+bool __read_mostly enable_apicv = true;
+EXPORT_SYMBOL_GPL(enable_apicv);
+
u64 __read_mostly host_xss;
EXPORT_SYMBOL_GPL(host_xss);
u64 __read_mostly supported_xss;
EXPORT_SYMBOL_GPL(supported_xss);
-struct kvm_stats_debugfs_item debugfs_entries[] = {
- VCPU_STAT("pf_fixed", pf_fixed),
- VCPU_STAT("pf_guest", pf_guest),
- VCPU_STAT("tlb_flush", tlb_flush),
- VCPU_STAT("invlpg", invlpg),
- VCPU_STAT("exits", exits),
- VCPU_STAT("io_exits", io_exits),
- VCPU_STAT("mmio_exits", mmio_exits),
- VCPU_STAT("signal_exits", signal_exits),
- VCPU_STAT("irq_window", irq_window_exits),
- VCPU_STAT("nmi_window", nmi_window_exits),
- VCPU_STAT("halt_exits", halt_exits),
- VCPU_STAT("halt_successful_poll", halt_successful_poll),
- VCPU_STAT("halt_attempted_poll", halt_attempted_poll),
- VCPU_STAT("halt_poll_invalid", halt_poll_invalid),
- VCPU_STAT("halt_wakeup", halt_wakeup),
- VCPU_STAT("hypercalls", hypercalls),
- VCPU_STAT("request_irq", request_irq_exits),
- VCPU_STAT("irq_exits", irq_exits),
- VCPU_STAT("host_state_reload", host_state_reload),
- VCPU_STAT("fpu_reload", fpu_reload),
- VCPU_STAT("insn_emulation", insn_emulation),
- VCPU_STAT("insn_emulation_fail", insn_emulation_fail),
- VCPU_STAT("irq_injections", irq_injections),
- VCPU_STAT("nmi_injections", nmi_injections),
- VCPU_STAT("req_event", req_event),
- VCPU_STAT("l1d_flush", l1d_flush),
- VCPU_STAT("halt_poll_success_ns", halt_poll_success_ns),
- VCPU_STAT("halt_poll_fail_ns", halt_poll_fail_ns),
- VCPU_STAT("nested_run", nested_run),
- VCPU_STAT("directed_yield_attempted", directed_yield_attempted),
- VCPU_STAT("directed_yield_successful", directed_yield_successful),
- VM_STAT("mmu_shadow_zapped", mmu_shadow_zapped),
- VM_STAT("mmu_pte_write", mmu_pte_write),
- VM_STAT("mmu_pde_zapped", mmu_pde_zapped),
- VM_STAT("mmu_flooded", mmu_flooded),
- VM_STAT("mmu_recycled", mmu_recycled),
- VM_STAT("mmu_cache_miss", mmu_cache_miss),
- VM_STAT("mmu_unsync", mmu_unsync),
- VM_STAT("remote_tlb_flush", remote_tlb_flush),
- VM_STAT("largepages", lpages, .mode = 0444),
- VM_STAT("nx_largepages_splitted", nx_lpage_splits, .mode = 0444),
- VM_STAT("max_mmu_page_hash_collisions", max_mmu_page_hash_collisions),
- { NULL }
+const struct _kvm_stats_desc kvm_vm_stats_desc[] = {
+ KVM_GENERIC_VM_STATS(),
+ STATS_DESC_COUNTER(VM, mmu_shadow_zapped),
+ STATS_DESC_COUNTER(VM, mmu_pte_write),
+ STATS_DESC_COUNTER(VM, mmu_pde_zapped),
+ STATS_DESC_COUNTER(VM, mmu_flooded),
+ STATS_DESC_COUNTER(VM, mmu_recycled),
+ STATS_DESC_COUNTER(VM, mmu_cache_miss),
+ STATS_DESC_ICOUNTER(VM, mmu_unsync),
+ STATS_DESC_ICOUNTER(VM, lpages),
+ STATS_DESC_ICOUNTER(VM, nx_lpage_splits),
+ STATS_DESC_PCOUNTER(VM, max_mmu_page_hash_collisions)
+};
+static_assert(ARRAY_SIZE(kvm_vm_stats_desc) ==
+ sizeof(struct kvm_vm_stat) / sizeof(u64));
+
+const struct kvm_stats_header kvm_vm_stats_header = {
+ .name_size = KVM_STATS_NAME_SIZE,
+ .num_desc = ARRAY_SIZE(kvm_vm_stats_desc),
+ .id_offset = sizeof(struct kvm_stats_header),
+ .desc_offset = sizeof(struct kvm_stats_header) + KVM_STATS_NAME_SIZE,
+ .data_offset = sizeof(struct kvm_stats_header) + KVM_STATS_NAME_SIZE +
+ sizeof(kvm_vm_stats_desc),
+};
+
+const struct _kvm_stats_desc kvm_vcpu_stats_desc[] = {
+ KVM_GENERIC_VCPU_STATS(),
+ STATS_DESC_COUNTER(VCPU, pf_fixed),
+ STATS_DESC_COUNTER(VCPU, pf_guest),
+ STATS_DESC_COUNTER(VCPU, tlb_flush),
+ STATS_DESC_COUNTER(VCPU, invlpg),
+ STATS_DESC_COUNTER(VCPU, exits),
+ STATS_DESC_COUNTER(VCPU, io_exits),
+ STATS_DESC_COUNTER(VCPU, mmio_exits),
+ STATS_DESC_COUNTER(VCPU, signal_exits),
+ STATS_DESC_COUNTER(VCPU, irq_window_exits),
+ STATS_DESC_COUNTER(VCPU, nmi_window_exits),
+ STATS_DESC_COUNTER(VCPU, l1d_flush),
+ STATS_DESC_COUNTER(VCPU, halt_exits),
+ STATS_DESC_COUNTER(VCPU, request_irq_exits),
+ STATS_DESC_COUNTER(VCPU, irq_exits),
+ STATS_DESC_COUNTER(VCPU, host_state_reload),
+ STATS_DESC_COUNTER(VCPU, fpu_reload),
+ STATS_DESC_COUNTER(VCPU, insn_emulation),
+ STATS_DESC_COUNTER(VCPU, insn_emulation_fail),
+ STATS_DESC_COUNTER(VCPU, hypercalls),
+ STATS_DESC_COUNTER(VCPU, irq_injections),
+ STATS_DESC_COUNTER(VCPU, nmi_injections),
+ STATS_DESC_COUNTER(VCPU, req_event),
+ STATS_DESC_COUNTER(VCPU, nested_run),
+ STATS_DESC_COUNTER(VCPU, directed_yield_attempted),
+ STATS_DESC_COUNTER(VCPU, directed_yield_successful),
+ STATS_DESC_ICOUNTER(VCPU, guest_mode)
+};
+static_assert(ARRAY_SIZE(kvm_vcpu_stats_desc) ==
+ sizeof(struct kvm_vcpu_stat) / sizeof(u64));
+
+const struct kvm_stats_header kvm_vcpu_stats_header = {
+ .name_size = KVM_STATS_NAME_SIZE,
+ .num_desc = ARRAY_SIZE(kvm_vcpu_stats_desc),
+ .id_offset = sizeof(struct kvm_stats_header),
+ .desc_offset = sizeof(struct kvm_stats_header) + KVM_STATS_NAME_SIZE,
+ .data_offset = sizeof(struct kvm_stats_header) + KVM_STATS_NAME_SIZE +
+ sizeof(kvm_vcpu_stats_desc),
};
u64 __read_mostly host_xcr0;
@@ -778,13 +807,6 @@ int kvm_read_guest_page_mmu(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu,
}
EXPORT_SYMBOL_GPL(kvm_read_guest_page_mmu);
-static int kvm_read_nested_guest_page(struct kvm_vcpu *vcpu, gfn_t gfn,
- void *data, int offset, int len, u32 access)
-{
- return kvm_read_guest_page_mmu(vcpu, vcpu->arch.walk_mmu, gfn,
- data, offset, len, access);
-}
-
static inline u64 pdptr_rsvd_bits(struct kvm_vcpu *vcpu)
{
return vcpu->arch.reserved_gpa_bits | rsvd_bits(5, 8) | rsvd_bits(1, 2);
@@ -819,6 +841,7 @@ int load_pdptrs(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu, unsigned long cr3)
memcpy(mmu->pdptrs, pdpte, sizeof(mmu->pdptrs));
kvm_register_mark_dirty(vcpu, VCPU_EXREG_PDPTR);
+ vcpu->arch.pdptrs_from_userspace = false;
out:
@@ -826,40 +849,14 @@ out:
}
EXPORT_SYMBOL_GPL(load_pdptrs);
-bool pdptrs_changed(struct kvm_vcpu *vcpu)
-{
- u64 pdpte[ARRAY_SIZE(vcpu->arch.walk_mmu->pdptrs)];
- int offset;
- gfn_t gfn;
- int r;
-
- if (!is_pae_paging(vcpu))
- return false;
-
- if (!kvm_register_is_available(vcpu, VCPU_EXREG_PDPTR))
- return true;
-
- gfn = (kvm_read_cr3(vcpu) & 0xffffffe0ul) >> PAGE_SHIFT;
- offset = (kvm_read_cr3(vcpu) & 0xffffffe0ul) & (PAGE_SIZE - 1);
- r = kvm_read_nested_guest_page(vcpu, gfn, pdpte, offset, sizeof(pdpte),
- PFERR_USER_MASK | PFERR_WRITE_MASK);
- if (r < 0)
- return true;
-
- return memcmp(pdpte, vcpu->arch.walk_mmu->pdptrs, sizeof(pdpte)) != 0;
-}
-EXPORT_SYMBOL_GPL(pdptrs_changed);
-
void kvm_post_set_cr0(struct kvm_vcpu *vcpu, unsigned long old_cr0, unsigned long cr0)
{
- unsigned long update_bits = X86_CR0_PG | X86_CR0_WP;
-
if ((cr0 ^ old_cr0) & X86_CR0_PG) {
kvm_clear_async_pf_completion_queue(vcpu);
kvm_async_pf_hash_reset(vcpu);
}
- if ((cr0 ^ old_cr0) & update_bits)
+ if ((cr0 ^ old_cr0) & KVM_MMU_CR0_ROLE_BITS)
kvm_mmu_reset_context(vcpu);
if (((cr0 ^ old_cr0) & X86_CR0_CD) &&
@@ -1038,10 +1035,7 @@ EXPORT_SYMBOL_GPL(kvm_is_valid_cr4);
void kvm_post_set_cr4(struct kvm_vcpu *vcpu, unsigned long old_cr4, unsigned long cr4)
{
- unsigned long mmu_role_bits = X86_CR4_PGE | X86_CR4_PSE | X86_CR4_PAE |
- X86_CR4_SMEP | X86_CR4_SMAP | X86_CR4_PKE;
-
- if (((cr4 ^ old_cr4) & mmu_role_bits) ||
+ if (((cr4 ^ old_cr4) & KVM_MMU_CR4_ROLE_BITS) ||
(!(cr4 & X86_CR4_PCIDE) && (old_cr4 & X86_CR4_PCIDE)))
kvm_mmu_reset_context(vcpu);
}
@@ -1084,25 +1078,46 @@ int kvm_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4)
}
EXPORT_SYMBOL_GPL(kvm_set_cr4);
+static void kvm_invalidate_pcid(struct kvm_vcpu *vcpu, unsigned long pcid)
+{
+ struct kvm_mmu *mmu = vcpu->arch.mmu;
+ unsigned long roots_to_free = 0;
+ int i;
+
+ /*
+ * If neither the current CR3 nor any of the prev_roots use the given
+ * PCID, then nothing needs to be done here because a resync will
+ * happen anyway before switching to any other CR3.
+ */
+ if (kvm_get_active_pcid(vcpu) == pcid) {
+ kvm_make_request(KVM_REQ_MMU_SYNC, vcpu);
+ kvm_make_request(KVM_REQ_TLB_FLUSH_CURRENT, vcpu);
+ }
+
+ for (i = 0; i < KVM_MMU_NUM_PREV_ROOTS; i++)
+ if (kvm_get_pcid(vcpu, mmu->prev_roots[i].pgd) == pcid)
+ roots_to_free |= KVM_MMU_ROOT_PREVIOUS(i);
+
+ kvm_mmu_free_roots(vcpu, mmu, roots_to_free);
+}
+
int kvm_set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3)
{
bool skip_tlb_flush = false;
+ unsigned long pcid = 0;
#ifdef CONFIG_X86_64
bool pcid_enabled = kvm_read_cr4_bits(vcpu, X86_CR4_PCIDE);
if (pcid_enabled) {
skip_tlb_flush = cr3 & X86_CR3_PCID_NOFLUSH;
cr3 &= ~X86_CR3_PCID_NOFLUSH;
+ pcid = cr3 & X86_CR3_PCID_MASK;
}
#endif
- if (cr3 == kvm_read_cr3(vcpu) && !pdptrs_changed(vcpu)) {
- if (!skip_tlb_flush) {
- kvm_mmu_sync_roots(vcpu);
- kvm_make_request(KVM_REQ_TLB_FLUSH_CURRENT, vcpu);
- }
- return 0;
- }
+ /* PDPTRs are always reloaded for PAE paging. */
+ if (cr3 == kvm_read_cr3(vcpu) && !is_pae_paging(vcpu))
+ goto handle_tlb_flush;
/*
* Do not condition the GPA check on long mode, this helper is used to
@@ -1115,10 +1130,23 @@ int kvm_set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3)
if (is_pae_paging(vcpu) && !load_pdptrs(vcpu, vcpu->arch.walk_mmu, cr3))
return 1;
- kvm_mmu_new_pgd(vcpu, cr3, skip_tlb_flush, skip_tlb_flush);
+ if (cr3 != kvm_read_cr3(vcpu))
+ kvm_mmu_new_pgd(vcpu, cr3);
+
vcpu->arch.cr3 = cr3;
kvm_register_mark_available(vcpu, VCPU_EXREG_CR3);
+handle_tlb_flush:
+ /*
+ * A load of CR3 that flushes the TLB flushes only the current PCID,
+ * even if PCID is disabled, in which case PCID=0 is flushed. It's a
+ * moot point in the end because _disabling_ PCID will flush all PCIDs,
+ * and it's impossible to use a non-zero PCID when PCID is disabled,
+ * i.e. only PCID=0 can be relevant.
+ */
+ if (!skip_tlb_flush)
+ kvm_invalidate_pcid(vcpu, pcid);
+
return 0;
}
EXPORT_SYMBOL_GPL(kvm_set_cr3);
@@ -2179,13 +2207,15 @@ static u32 adjust_tsc_khz(u32 khz, s32 ppm)
return v;
}
+static void kvm_vcpu_write_tsc_multiplier(struct kvm_vcpu *vcpu, u64 l1_multiplier);
+
static int set_tsc_khz(struct kvm_vcpu *vcpu, u32 user_tsc_khz, bool scale)
{
u64 ratio;
/* Guest TSC same frequency as host TSC? */
if (!scale) {
- vcpu->arch.tsc_scaling_ratio = kvm_default_tsc_scaling_ratio;
+ kvm_vcpu_write_tsc_multiplier(vcpu, kvm_default_tsc_scaling_ratio);
return 0;
}
@@ -2211,7 +2241,7 @@ static int set_tsc_khz(struct kvm_vcpu *vcpu, u32 user_tsc_khz, bool scale)
return -1;
}
- vcpu->arch.tsc_scaling_ratio = ratio;
+ kvm_vcpu_write_tsc_multiplier(vcpu, ratio);
return 0;
}
@@ -2223,7 +2253,7 @@ static int kvm_set_tsc_khz(struct kvm_vcpu *vcpu, u32 user_tsc_khz)
/* tsc_khz can be zero if TSC calibration fails */
if (user_tsc_khz == 0) {
/* set tsc_scaling_ratio to a safe value */
- vcpu->arch.tsc_scaling_ratio = kvm_default_tsc_scaling_ratio;
+ kvm_vcpu_write_tsc_multiplier(vcpu, kvm_default_tsc_scaling_ratio);
return -1;
}
@@ -2305,10 +2335,9 @@ static inline u64 __scale_tsc(u64 ratio, u64 tsc)
return mul_u64_u64_shr(tsc, ratio, kvm_tsc_scaling_ratio_frac_bits);
}
-u64 kvm_scale_tsc(struct kvm_vcpu *vcpu, u64 tsc)
+u64 kvm_scale_tsc(struct kvm_vcpu *vcpu, u64 tsc, u64 ratio)
{
u64 _tsc = tsc;
- u64 ratio = vcpu->arch.tsc_scaling_ratio;
if (ratio != kvm_default_tsc_scaling_ratio)
_tsc = __scale_tsc(ratio, tsc);
@@ -2317,25 +2346,86 @@ u64 kvm_scale_tsc(struct kvm_vcpu *vcpu, u64 tsc)
}
EXPORT_SYMBOL_GPL(kvm_scale_tsc);
-static u64 kvm_compute_tsc_offset(struct kvm_vcpu *vcpu, u64 target_tsc)
+static u64 kvm_compute_l1_tsc_offset(struct kvm_vcpu *vcpu, u64 target_tsc)
{
u64 tsc;
- tsc = kvm_scale_tsc(vcpu, rdtsc());
+ tsc = kvm_scale_tsc(vcpu, rdtsc(), vcpu->arch.l1_tsc_scaling_ratio);
return target_tsc - tsc;
}
u64 kvm_read_l1_tsc(struct kvm_vcpu *vcpu, u64 host_tsc)
{
- return vcpu->arch.l1_tsc_offset + kvm_scale_tsc(vcpu, host_tsc);
+ return vcpu->arch.l1_tsc_offset +
+ kvm_scale_tsc(vcpu, host_tsc, vcpu->arch.l1_tsc_scaling_ratio);
}
EXPORT_SYMBOL_GPL(kvm_read_l1_tsc);
-static void kvm_vcpu_write_tsc_offset(struct kvm_vcpu *vcpu, u64 offset)
+u64 kvm_calc_nested_tsc_offset(u64 l1_offset, u64 l2_offset, u64 l2_multiplier)
{
- vcpu->arch.l1_tsc_offset = offset;
- vcpu->arch.tsc_offset = static_call(kvm_x86_write_l1_tsc_offset)(vcpu, offset);
+ u64 nested_offset;
+
+ if (l2_multiplier == kvm_default_tsc_scaling_ratio)
+ nested_offset = l1_offset;
+ else
+ nested_offset = mul_s64_u64_shr((s64) l1_offset, l2_multiplier,
+ kvm_tsc_scaling_ratio_frac_bits);
+
+ nested_offset += l2_offset;
+ return nested_offset;
+}
+EXPORT_SYMBOL_GPL(kvm_calc_nested_tsc_offset);
+
+u64 kvm_calc_nested_tsc_multiplier(u64 l1_multiplier, u64 l2_multiplier)
+{
+ if (l2_multiplier != kvm_default_tsc_scaling_ratio)
+ return mul_u64_u64_shr(l1_multiplier, l2_multiplier,
+ kvm_tsc_scaling_ratio_frac_bits);
+
+ return l1_multiplier;
+}
+EXPORT_SYMBOL_GPL(kvm_calc_nested_tsc_multiplier);
+
+static void kvm_vcpu_write_tsc_offset(struct kvm_vcpu *vcpu, u64 l1_offset)
+{
+ trace_kvm_write_tsc_offset(vcpu->vcpu_id,
+ vcpu->arch.l1_tsc_offset,
+ l1_offset);
+
+ vcpu->arch.l1_tsc_offset = l1_offset;
+
+ /*
+ * If we are here because L1 chose not to trap WRMSR to TSC then
+ * according to the spec this should set L1's TSC (as opposed to
+ * setting L1's offset for L2).
+ */
+ if (is_guest_mode(vcpu))
+ vcpu->arch.tsc_offset = kvm_calc_nested_tsc_offset(
+ l1_offset,
+ static_call(kvm_x86_get_l2_tsc_offset)(vcpu),
+ static_call(kvm_x86_get_l2_tsc_multiplier)(vcpu));
+ else
+ vcpu->arch.tsc_offset = l1_offset;
+
+ static_call(kvm_x86_write_tsc_offset)(vcpu, vcpu->arch.tsc_offset);
+}
+
+static void kvm_vcpu_write_tsc_multiplier(struct kvm_vcpu *vcpu, u64 l1_multiplier)
+{
+ vcpu->arch.l1_tsc_scaling_ratio = l1_multiplier;
+
+ /* Userspace is changing the multiplier while L2 is active */
+ if (is_guest_mode(vcpu))
+ vcpu->arch.tsc_scaling_ratio = kvm_calc_nested_tsc_multiplier(
+ l1_multiplier,
+ static_call(kvm_x86_get_l2_tsc_multiplier)(vcpu));
+ else
+ vcpu->arch.tsc_scaling_ratio = l1_multiplier;
+
+ if (kvm_has_tsc_control)
+ static_call(kvm_x86_write_tsc_multiplier)(
+ vcpu, vcpu->arch.tsc_scaling_ratio);
}
static inline bool kvm_check_tsc_unstable(void)
@@ -2361,7 +2451,7 @@ static void kvm_synchronize_tsc(struct kvm_vcpu *vcpu, u64 data)
bool synchronizing = false;
raw_spin_lock_irqsave(&kvm->arch.tsc_write_lock, flags);
- offset = kvm_compute_tsc_offset(vcpu, data);
+ offset = kvm_compute_l1_tsc_offset(vcpu, data);
ns = get_kvmclock_base_ns();
elapsed = ns - kvm->arch.last_tsc_nsec;
@@ -2400,7 +2490,7 @@ static void kvm_synchronize_tsc(struct kvm_vcpu *vcpu, u64 data)
} else {
u64 delta = nsec_to_cycles(vcpu, elapsed);
data += delta;
- offset = kvm_compute_tsc_offset(vcpu, data);
+ offset = kvm_compute_l1_tsc_offset(vcpu, data);
}
matched = true;
already_matched = (vcpu->arch.this_tsc_generation == kvm->arch.cur_tsc_generation);
@@ -2459,9 +2549,10 @@ static inline void adjust_tsc_offset_guest(struct kvm_vcpu *vcpu,
static inline void adjust_tsc_offset_host(struct kvm_vcpu *vcpu, s64 adjustment)
{
- if (vcpu->arch.tsc_scaling_ratio != kvm_default_tsc_scaling_ratio)
+ if (vcpu->arch.l1_tsc_scaling_ratio != kvm_default_tsc_scaling_ratio)
WARN_ON(adjustment < 0);
- adjustment = kvm_scale_tsc(vcpu, (u64) adjustment);
+ adjustment = kvm_scale_tsc(vcpu, (u64) adjustment,
+ vcpu->arch.l1_tsc_scaling_ratio);
adjust_tsc_offset_guest(vcpu, adjustment);
}
@@ -2844,7 +2935,8 @@ static int kvm_guest_time_update(struct kvm_vcpu *v)
/* With all the info we got, fill in the values */
if (kvm_has_tsc_control)
- tgt_tsc_khz = kvm_scale_tsc(v, tgt_tsc_khz);
+ tgt_tsc_khz = kvm_scale_tsc(v, tgt_tsc_khz,
+ v->arch.l1_tsc_scaling_ratio);
if (unlikely(vcpu->hw_tsc_khz != tgt_tsc_khz)) {
kvm_get_time_scale(NSEC_PER_SEC, tgt_tsc_khz * 1000LL,
@@ -3250,7 +3342,7 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
if (msr_info->host_initiated) {
kvm_synchronize_tsc(vcpu, data);
} else {
- u64 adj = kvm_compute_tsc_offset(vcpu, data) - vcpu->arch.l1_tsc_offset;
+ u64 adj = kvm_compute_l1_tsc_offset(vcpu, data) - vcpu->arch.l1_tsc_offset;
adjust_tsc_offset_guest(vcpu, adj);
vcpu->arch.ia32_tsc_adjust_msr += adj;
}
@@ -3552,10 +3644,17 @@ int kvm_get_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
* return L1's TSC value to ensure backwards-compatible
* behavior for migration.
*/
- u64 tsc_offset = msr_info->host_initiated ? vcpu->arch.l1_tsc_offset :
- vcpu->arch.tsc_offset;
+ u64 offset, ratio;
- msr_info->data = kvm_scale_tsc(vcpu, rdtsc()) + tsc_offset;
+ if (msr_info->host_initiated) {
+ offset = vcpu->arch.l1_tsc_offset;
+ ratio = vcpu->arch.l1_tsc_scaling_ratio;
+ } else {
+ offset = vcpu->arch.tsc_offset;
+ ratio = vcpu->arch.tsc_scaling_ratio;
+ }
+
+ msr_info->data = kvm_scale_tsc(vcpu, rdtsc(), ratio) + offset;
break;
}
case MSR_MTRRcap:
@@ -3879,6 +3978,7 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext)
case KVM_CAP_HYPERV_TLBFLUSH:
case KVM_CAP_HYPERV_SEND_IPI:
case KVM_CAP_HYPERV_CPUID:
+ case KVM_CAP_HYPERV_ENFORCE_CPUID:
case KVM_CAP_SYS_HYPERV_CPUID:
case KVM_CAP_PCI_SEGMENT:
case KVM_CAP_DEBUGREGS:
@@ -3909,8 +4009,13 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext)
case KVM_CAP_SGX_ATTRIBUTE:
#endif
case KVM_CAP_VM_COPY_ENC_CONTEXT_FROM:
+ case KVM_CAP_SREGS2:
+ case KVM_CAP_EXIT_ON_EMULATION_FAILURE:
r = 1;
break;
+ case KVM_CAP_EXIT_HYPERCALL:
+ r = KVM_EXIT_HYPERCALL_VALID_MASK;
+ break;
case KVM_CAP_SET_GUEST_DEBUG2:
return KVM_GUESTDBG_VALID_MASK;
#ifdef CONFIG_KVM_XEN
@@ -4138,7 +4243,7 @@ void kvm_arch_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
mark_tsc_unstable("KVM discovered backwards TSC");
if (kvm_check_tsc_unstable()) {
- u64 offset = kvm_compute_tsc_offset(vcpu,
+ u64 offset = kvm_compute_l1_tsc_offset(vcpu,
vcpu->arch.last_guest_tsc);
kvm_vcpu_write_tsc_offset(vcpu, offset);
vcpu->arch.tsc_catchup = 1;
@@ -4457,7 +4562,7 @@ static void kvm_vcpu_ioctl_x86_get_vcpu_events(struct kvm_vcpu *vcpu,
memset(&events->reserved, 0, sizeof(events->reserved));
}
-static void kvm_smm_changed(struct kvm_vcpu *vcpu);
+static void kvm_smm_changed(struct kvm_vcpu *vcpu, bool entering_smm);
static int kvm_vcpu_ioctl_x86_set_vcpu_events(struct kvm_vcpu *vcpu,
struct kvm_vcpu_events *events)
@@ -4517,13 +4622,8 @@ static int kvm_vcpu_ioctl_x86_set_vcpu_events(struct kvm_vcpu *vcpu,
vcpu->arch.apic->sipi_vector = events->sipi_vector;
if (events->flags & KVM_VCPUEVENT_VALID_SMM) {
- if (!!(vcpu->arch.hflags & HF_SMM_MASK) != events->smi.smm) {
- if (events->smi.smm)
- vcpu->arch.hflags |= HF_SMM_MASK;
- else
- vcpu->arch.hflags &= ~HF_SMM_MASK;
- kvm_smm_changed(vcpu);
- }
+ if (!!(vcpu->arch.hflags & HF_SMM_MASK) != events->smi.smm)
+ kvm_smm_changed(vcpu, events->smi.smm);
vcpu->arch.smi_pending = events->smi.pending;
@@ -4807,6 +4907,9 @@ static int kvm_vcpu_ioctl_enable_cap(struct kvm_vcpu *vcpu,
return static_call(kvm_x86_enable_direct_tlbflush)(vcpu);
+ case KVM_CAP_HYPERV_ENFORCE_CPUID:
+ return kvm_hv_set_enforce_cpuid(vcpu, cap->args[0]);
+
case KVM_CAP_ENFORCE_PV_FEATURE_CPUID:
vcpu->arch.pv_cpuid.enforce = cap->args[0];
if (vcpu->arch.pv_cpuid.enforce)
@@ -4825,6 +4928,7 @@ long kvm_arch_vcpu_ioctl(struct file *filp,
void __user *argp = (void __user *)arg;
int r;
union {
+ struct kvm_sregs2 *sregs2;
struct kvm_lapic_state *lapic;
struct kvm_xsave *xsave;
struct kvm_xcrs *xcrs;
@@ -5197,6 +5301,28 @@ long kvm_arch_vcpu_ioctl(struct file *filp,
break;
}
#endif
+ case KVM_GET_SREGS2: {
+ u.sregs2 = kzalloc(sizeof(struct kvm_sregs2), GFP_KERNEL);
+ r = -ENOMEM;
+ if (!u.sregs2)
+ goto out;
+ __get_sregs2(vcpu, u.sregs2);
+ r = -EFAULT;
+ if (copy_to_user(argp, u.sregs2, sizeof(struct kvm_sregs2)))
+ goto out;
+ r = 0;
+ break;
+ }
+ case KVM_SET_SREGS2: {
+ u.sregs2 = memdup_user(argp, sizeof(struct kvm_sregs2));
+ if (IS_ERR(u.sregs2)) {
+ r = PTR_ERR(u.sregs2);
+ u.sregs2 = NULL;
+ goto out;
+ }
+ r = __set_sregs2(vcpu, u.sregs2);
+ break;
+ }
default:
r = -EINVAL;
}
@@ -5516,6 +5642,21 @@ split_irqchip_unlock:
if (kvm_x86_ops.vm_copy_enc_context_from)
r = kvm_x86_ops.vm_copy_enc_context_from(kvm, cap->args[0]);
return r;
+ case KVM_CAP_EXIT_HYPERCALL:
+ if (cap->args[0] & ~KVM_EXIT_HYPERCALL_VALID_MASK) {
+ r = -EINVAL;
+ break;
+ }
+ kvm->arch.hypercall_exit_enabled = cap->args[0];
+ r = 0;
+ break;
+ case KVM_CAP_EXIT_ON_EMULATION_FAILURE:
+ r = -EINVAL;
+ if (cap->args[0] & ~1)
+ break;
+ kvm->arch.exit_on_emulation_error = cap->args[0];
+ r = 0;
+ break;
default:
r = -EINVAL;
break;
@@ -5630,6 +5771,41 @@ static int kvm_vm_ioctl_set_msr_filter(struct kvm *kvm, void __user *argp)
return 0;
}
+#ifdef CONFIG_HAVE_KVM_PM_NOTIFIER
+static int kvm_arch_suspend_notifier(struct kvm *kvm)
+{
+ struct kvm_vcpu *vcpu;
+ int i, ret = 0;
+
+ mutex_lock(&kvm->lock);
+ kvm_for_each_vcpu(i, vcpu, kvm) {
+ if (!vcpu->arch.pv_time_enabled)
+ continue;
+
+ ret = kvm_set_guest_paused(vcpu);
+ if (ret) {
+ kvm_err("Failed to pause guest VCPU%d: %d\n",
+ vcpu->vcpu_id, ret);
+ break;
+ }
+ }
+ mutex_unlock(&kvm->lock);
+
+ return ret ? NOTIFY_BAD : NOTIFY_DONE;
+}
+
+int kvm_arch_pm_notifier(struct kvm *kvm, unsigned long state)
+{
+ switch (state) {
+ case PM_HIBERNATION_PREPARE:
+ case PM_SUSPEND_PREPARE:
+ return kvm_arch_suspend_notifier(kvm);
+ }
+
+ return NOTIFY_DONE;
+}
+#endif /* CONFIG_HAVE_KVM_PM_NOTIFIER */
+
long kvm_arch_vm_ioctl(struct file *filp,
unsigned int ioctl, unsigned long arg)
{
@@ -7104,23 +7280,22 @@ static unsigned emulator_get_hflags(struct x86_emulate_ctxt *ctxt)
return emul_to_vcpu(ctxt)->arch.hflags;
}
-static void emulator_set_hflags(struct x86_emulate_ctxt *ctxt, unsigned emul_flags)
+static void emulator_exiting_smm(struct x86_emulate_ctxt *ctxt)
{
struct kvm_vcpu *vcpu = emul_to_vcpu(ctxt);
- vcpu->arch.hflags = emul_flags;
- kvm_mmu_reset_context(vcpu);
+ kvm_smm_changed(vcpu, false);
}
-static int emulator_pre_leave_smm(struct x86_emulate_ctxt *ctxt,
+static int emulator_leave_smm(struct x86_emulate_ctxt *ctxt,
const char *smstate)
{
- return static_call(kvm_x86_pre_leave_smm)(emul_to_vcpu(ctxt), smstate);
+ return static_call(kvm_x86_leave_smm)(emul_to_vcpu(ctxt), smstate);
}
-static void emulator_post_leave_smm(struct x86_emulate_ctxt *ctxt)
+static void emulator_triple_fault(struct x86_emulate_ctxt *ctxt)
{
- kvm_smm_changed(emul_to_vcpu(ctxt));
+ kvm_make_request(KVM_REQ_TRIPLE_FAULT, emul_to_vcpu(ctxt));
}
static int emulator_set_xcr(struct x86_emulate_ctxt *ctxt, u32 index, u64 xcr)
@@ -7169,9 +7344,9 @@ static const struct x86_emulate_ops emulate_ops = {
.guest_has_fxsr = emulator_guest_has_fxsr,
.set_nmi_mask = emulator_set_nmi_mask,
.get_hflags = emulator_get_hflags,
- .set_hflags = emulator_set_hflags,
- .pre_leave_smm = emulator_pre_leave_smm,
- .post_leave_smm = emulator_post_leave_smm,
+ .exiting_smm = emulator_exiting_smm,
+ .leave_smm = emulator_leave_smm,
+ .triple_fault = emulator_triple_fault,
.set_xcr = emulator_set_xcr,
};
@@ -7277,8 +7452,33 @@ void kvm_inject_realmode_interrupt(struct kvm_vcpu *vcpu, int irq, int inc_eip)
}
EXPORT_SYMBOL_GPL(kvm_inject_realmode_interrupt);
+static void prepare_emulation_failure_exit(struct kvm_vcpu *vcpu)
+{
+ struct x86_emulate_ctxt *ctxt = vcpu->arch.emulate_ctxt;
+ u32 insn_size = ctxt->fetch.end - ctxt->fetch.data;
+ struct kvm_run *run = vcpu->run;
+
+ run->exit_reason = KVM_EXIT_INTERNAL_ERROR;
+ run->emulation_failure.suberror = KVM_INTERNAL_ERROR_EMULATION;
+ run->emulation_failure.ndata = 0;
+ run->emulation_failure.flags = 0;
+
+ if (insn_size) {
+ run->emulation_failure.ndata = 3;
+ run->emulation_failure.flags |=
+ KVM_INTERNAL_ERROR_EMULATION_FLAG_INSTRUCTION_BYTES;
+ run->emulation_failure.insn_size = insn_size;
+ memset(run->emulation_failure.insn_bytes, 0x90,
+ sizeof(run->emulation_failure.insn_bytes));
+ memcpy(run->emulation_failure.insn_bytes,
+ ctxt->fetch.data, insn_size);
+ }
+}
+
static int handle_emulation_failure(struct kvm_vcpu *vcpu, int emulation_type)
{
+ struct kvm *kvm = vcpu->kvm;
+
++vcpu->stat.insn_emulation_fail;
trace_kvm_emulate_insn_failed(vcpu);
@@ -7287,10 +7487,9 @@ static int handle_emulation_failure(struct kvm_vcpu *vcpu, int emulation_type)
return 1;
}
- if (emulation_type & EMULTYPE_SKIP) {
- vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR;
- vcpu->run->internal.suberror = KVM_INTERNAL_ERROR_EMULATION;
- vcpu->run->internal.ndata = 0;
+ if (kvm->arch.exit_on_emulation_error ||
+ (emulation_type & EMULTYPE_SKIP)) {
+ prepare_emulation_failure_exit(vcpu);
return 0;
}
@@ -7432,11 +7631,14 @@ static bool retry_instruction(struct x86_emulate_ctxt *ctxt,
static int complete_emulated_mmio(struct kvm_vcpu *vcpu);
static int complete_emulated_pio(struct kvm_vcpu *vcpu);
-static void kvm_smm_changed(struct kvm_vcpu *vcpu)
+static void kvm_smm_changed(struct kvm_vcpu *vcpu, bool entering_smm)
{
- if (!(vcpu->arch.hflags & HF_SMM_MASK)) {
- /* This is a good place to trace that we are exiting SMM. */
- trace_kvm_enter_smm(vcpu->vcpu_id, vcpu->arch.smbase, false);
+ trace_kvm_smm_transition(vcpu->vcpu_id, vcpu->arch.smbase, entering_smm);
+
+ if (entering_smm) {
+ vcpu->arch.hflags |= HF_SMM_MASK;
+ } else {
+ vcpu->arch.hflags &= ~(HF_SMM_MASK | HF_SMM_INSIDE_NMI_MASK);
/* Process a latched INIT or SMI, if any. */
kvm_make_request(KVM_REQ_EVENT, vcpu);
@@ -8361,16 +8563,15 @@ bool kvm_apicv_activated(struct kvm *kvm)
}
EXPORT_SYMBOL_GPL(kvm_apicv_activated);
-void kvm_apicv_init(struct kvm *kvm, bool enable)
+static void kvm_apicv_init(struct kvm *kvm)
{
- if (enable)
+ if (enable_apicv)
clear_bit(APICV_INHIBIT_REASON_DISABLE,
&kvm->arch.apicv_inhibit_reasons);
else
set_bit(APICV_INHIBIT_REASON_DISABLE,
&kvm->arch.apicv_inhibit_reasons);
}
-EXPORT_SYMBOL_GPL(kvm_apicv_init);
static void kvm_sched_yield(struct kvm_vcpu *vcpu, unsigned long dest_id)
{
@@ -8406,6 +8607,17 @@ no_yield:
return;
}
+static int complete_hypercall_exit(struct kvm_vcpu *vcpu)
+{
+ u64 ret = vcpu->run->hypercall.ret;
+
+ if (!is_64_bit_mode(vcpu))
+ ret = (u32)ret;
+ kvm_rax_write(vcpu, ret);
+ ++vcpu->stat.hypercalls;
+ return kvm_skip_emulated_instruction(vcpu);
+}
+
int kvm_emulate_hypercall(struct kvm_vcpu *vcpu)
{
unsigned long nr, a0, a1, a2, a3, ret;
@@ -8471,6 +8683,28 @@ int kvm_emulate_hypercall(struct kvm_vcpu *vcpu)
kvm_sched_yield(vcpu, a0);
ret = 0;
break;
+ case KVM_HC_MAP_GPA_RANGE: {
+ u64 gpa = a0, npages = a1, attrs = a2;
+
+ ret = -KVM_ENOSYS;
+ if (!(vcpu->kvm->arch.hypercall_exit_enabled & (1 << KVM_HC_MAP_GPA_RANGE)))
+ break;
+
+ if (!PAGE_ALIGNED(gpa) || !npages ||
+ gpa_to_gfn(gpa) + npages <= gpa_to_gfn(gpa)) {
+ ret = -KVM_EINVAL;
+ break;
+ }
+
+ vcpu->run->exit_reason = KVM_EXIT_HYPERCALL;
+ vcpu->run->hypercall.nr = KVM_HC_MAP_GPA_RANGE;
+ vcpu->run->hypercall.args[0] = gpa;
+ vcpu->run->hypercall.args[1] = npages;
+ vcpu->run->hypercall.args[2] = attrs;
+ vcpu->run->hypercall.longmode = op_64_bit;
+ vcpu->arch.complete_userspace_io = complete_hypercall_exit;
+ return 0;
+ }
default:
ret = -KVM_ENOSYS;
break;
@@ -8554,9 +8788,6 @@ static void update_cr8_intercept(struct kvm_vcpu *vcpu)
int kvm_check_nested_events(struct kvm_vcpu *vcpu)
{
- if (WARN_ON_ONCE(!is_guest_mode(vcpu)))
- return -EIO;
-
if (kvm_check_request(KVM_REQ_TRIPLE_FAULT, vcpu)) {
kvm_x86_ops.nested_ops->triple_fault(vcpu);
return 1;
@@ -8572,7 +8803,7 @@ static void kvm_inject_exception(struct kvm_vcpu *vcpu)
static_call(kvm_x86_queue_exception)(vcpu);
}
-static void inject_pending_event(struct kvm_vcpu *vcpu, bool *req_immediate_exit)
+static int inject_pending_event(struct kvm_vcpu *vcpu, bool *req_immediate_exit)
{
int r;
bool can_inject = true;
@@ -8619,7 +8850,7 @@ static void inject_pending_event(struct kvm_vcpu *vcpu, bool *req_immediate_exit
if (is_guest_mode(vcpu)) {
r = kvm_check_nested_events(vcpu);
if (r < 0)
- goto busy;
+ goto out;
}
/* try to inject new event if pending */
@@ -8661,7 +8892,7 @@ static void inject_pending_event(struct kvm_vcpu *vcpu, bool *req_immediate_exit
if (vcpu->arch.smi_pending) {
r = can_inject ? static_call(kvm_x86_smi_allowed)(vcpu, true) : -EBUSY;
if (r < 0)
- goto busy;
+ goto out;
if (r) {
vcpu->arch.smi_pending = false;
++vcpu->arch.smi_count;
@@ -8674,7 +8905,7 @@ static void inject_pending_event(struct kvm_vcpu *vcpu, bool *req_immediate_exit
if (vcpu->arch.nmi_pending) {
r = can_inject ? static_call(kvm_x86_nmi_allowed)(vcpu, true) : -EBUSY;
if (r < 0)
- goto busy;
+ goto out;
if (r) {
--vcpu->arch.nmi_pending;
vcpu->arch.nmi_injected = true;
@@ -8689,7 +8920,7 @@ static void inject_pending_event(struct kvm_vcpu *vcpu, bool *req_immediate_exit
if (kvm_cpu_has_injectable_intr(vcpu)) {
r = can_inject ? static_call(kvm_x86_interrupt_allowed)(vcpu, true) : -EBUSY;
if (r < 0)
- goto busy;
+ goto out;
if (r) {
kvm_queue_interrupt(vcpu, kvm_cpu_get_interrupt(vcpu), false);
static_call(kvm_x86_set_irq)(vcpu);
@@ -8705,11 +8936,14 @@ static void inject_pending_event(struct kvm_vcpu *vcpu, bool *req_immediate_exit
*req_immediate_exit = true;
WARN_ON(vcpu->arch.exception.pending);
- return;
+ return 0;
-busy:
- *req_immediate_exit = true;
- return;
+out:
+ if (r == -EBUSY) {
+ *req_immediate_exit = true;
+ r = 0;
+ }
+ return r;
}
static void process_nmi(struct kvm_vcpu *vcpu)
@@ -8888,10 +9122,9 @@ static void enter_smm(struct kvm_vcpu *vcpu)
{
struct kvm_segment cs, ds;
struct desc_ptr dt;
+ unsigned long cr0;
char buf[512];
- u32 cr0;
- trace_kvm_enter_smm(vcpu->vcpu_id, vcpu->arch.smbase, true);
memset(buf, 0, 512);
#ifdef CONFIG_X86_64
if (guest_cpuid_has(vcpu, X86_FEATURE_LM))
@@ -8901,13 +9134,13 @@ static void enter_smm(struct kvm_vcpu *vcpu)
enter_smm_save_state_32(vcpu, buf);
/*
- * Give pre_enter_smm() a chance to make ISA-specific changes to the
- * vCPU state (e.g. leave guest mode) after we've saved the state into
- * the SMM state-save area.
+ * Give enter_smm() a chance to make ISA-specific changes to the vCPU
+ * state (e.g. leave guest mode) after we've saved the state into the
+ * SMM state-save area.
*/
- static_call(kvm_x86_pre_enter_smm)(vcpu, buf);
+ static_call(kvm_x86_enter_smm)(vcpu, buf);
- vcpu->arch.hflags |= HF_SMM_MASK;
+ kvm_smm_changed(vcpu, true);
kvm_vcpu_write_guest(vcpu, vcpu->arch.smbase + 0xfe00, buf, sizeof(buf));
if (static_call(kvm_x86_get_nmi_mask)(vcpu))
@@ -8996,6 +9229,15 @@ void kvm_vcpu_update_apicv(struct kvm_vcpu *vcpu)
vcpu->arch.apicv_active = kvm_apicv_activated(vcpu->kvm);
kvm_apic_update_apicv(vcpu);
static_call(kvm_x86_refresh_apicv_exec_ctrl)(vcpu);
+
+ /*
+ * When APICv gets disabled, we may still have injected interrupts
+ * pending. At the same time, KVM_REQ_EVENT may not be set as APICv was
+ * still active when the interrupt got accepted. Make sure
+ * inject_pending_event() is called to check for that.
+ */
+ if (!vcpu->arch.apicv_active)
+ kvm_make_request(KVM_REQ_EVENT, vcpu);
}
EXPORT_SYMBOL_GPL(kvm_vcpu_update_apicv);
@@ -9171,7 +9413,7 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
}
if (kvm_check_request(KVM_REQ_TLB_FLUSH_CURRENT, vcpu))
kvm_vcpu_flush_tlb_current(vcpu);
- if (kvm_check_request(KVM_REQ_HV_TLB_FLUSH, vcpu))
+ if (kvm_check_request(KVM_REQ_TLB_FLUSH_GUEST, vcpu))
kvm_vcpu_flush_tlb_guest(vcpu);
if (kvm_check_request(KVM_REQ_REPORT_TPR_ACCESS, vcpu)) {
@@ -9264,13 +9506,21 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
if (kvm_check_request(KVM_REQ_EVENT, vcpu) || req_int_win ||
kvm_xen_has_interrupt(vcpu)) {
++vcpu->stat.req_event;
- kvm_apic_accept_events(vcpu);
+ r = kvm_apic_accept_events(vcpu);
+ if (r < 0) {
+ r = 0;
+ goto out;
+ }
if (vcpu->arch.mp_state == KVM_MP_STATE_INIT_RECEIVED) {
r = 1;
goto out;
}
- inject_pending_event(vcpu, &req_immediate_exit);
+ r = inject_pending_event(vcpu, &req_immediate_exit);
+ if (r < 0) {
+ r = 0;
+ goto out;
+ }
if (req_int_win)
static_call(kvm_x86_enable_irq_window)(vcpu);
@@ -9472,7 +9722,8 @@ static inline int vcpu_block(struct kvm *kvm, struct kvm_vcpu *vcpu)
return 1;
}
- kvm_apic_accept_events(vcpu);
+ if (kvm_apic_accept_events(vcpu) < 0)
+ return 0;
switch(vcpu->arch.mp_state) {
case KVM_MP_STATE_HALTED:
case KVM_MP_STATE_AP_RESET_HOLD:
@@ -9696,7 +9947,10 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu)
goto out;
}
kvm_vcpu_block(vcpu);
- kvm_apic_accept_events(vcpu);
+ if (kvm_apic_accept_events(vcpu) < 0) {
+ r = 0;
+ goto out;
+ }
kvm_clear_request(KVM_REQ_UNHALT, vcpu);
r = -EAGAIN;
if (signal_pending(current)) {
@@ -9845,7 +10099,7 @@ void kvm_get_cs_db_l_bits(struct kvm_vcpu *vcpu, int *db, int *l)
}
EXPORT_SYMBOL_GPL(kvm_get_cs_db_l_bits);
-static void __get_sregs(struct kvm_vcpu *vcpu, struct kvm_sregs *sregs)
+static void __get_sregs_common(struct kvm_vcpu *vcpu, struct kvm_sregs *sregs)
{
struct desc_ptr dt;
@@ -9878,14 +10132,36 @@ skip_protected_regs:
sregs->cr8 = kvm_get_cr8(vcpu);
sregs->efer = vcpu->arch.efer;
sregs->apic_base = kvm_get_apic_base(vcpu);
+}
+
+static void __get_sregs(struct kvm_vcpu *vcpu, struct kvm_sregs *sregs)
+{
+ __get_sregs_common(vcpu, sregs);
- memset(sregs->interrupt_bitmap, 0, sizeof(sregs->interrupt_bitmap));
+ if (vcpu->arch.guest_state_protected)
+ return;
if (vcpu->arch.interrupt.injected && !vcpu->arch.interrupt.soft)
set_bit(vcpu->arch.interrupt.nr,
(unsigned long *)sregs->interrupt_bitmap);
}
+static void __get_sregs2(struct kvm_vcpu *vcpu, struct kvm_sregs2 *sregs2)
+{
+ int i;
+
+ __get_sregs_common(vcpu, (struct kvm_sregs *)sregs2);
+
+ if (vcpu->arch.guest_state_protected)
+ return;
+
+ if (is_pae_paging(vcpu)) {
+ for (i = 0 ; i < 4 ; i++)
+ sregs2->pdptrs[i] = kvm_pdptr_read(vcpu, i);
+ sregs2->flags |= KVM_SREGS2_FLAGS_PDPTRS_VALID;
+ }
+}
+
int kvm_arch_vcpu_ioctl_get_sregs(struct kvm_vcpu *vcpu,
struct kvm_sregs *sregs)
{
@@ -9898,11 +10174,17 @@ int kvm_arch_vcpu_ioctl_get_sregs(struct kvm_vcpu *vcpu,
int kvm_arch_vcpu_ioctl_get_mpstate(struct kvm_vcpu *vcpu,
struct kvm_mp_state *mp_state)
{
+ int r;
+
vcpu_load(vcpu);
if (kvm_mpx_supported())
kvm_load_guest_fpu(vcpu);
- kvm_apic_accept_events(vcpu);
+ r = kvm_apic_accept_events(vcpu);
+ if (r < 0)
+ goto out;
+ r = 0;
+
if ((vcpu->arch.mp_state == KVM_MP_STATE_HALTED ||
vcpu->arch.mp_state == KVM_MP_STATE_AP_RESET_HOLD) &&
vcpu->arch.pv.pv_unhalted)
@@ -9910,10 +10192,11 @@ int kvm_arch_vcpu_ioctl_get_mpstate(struct kvm_vcpu *vcpu,
else
mp_state->mp_state = vcpu->arch.mp_state;
+out:
if (kvm_mpx_supported())
kvm_put_guest_fpu(vcpu);
vcpu_put(vcpu);
- return 0;
+ return r;
}
int kvm_arch_vcpu_ioctl_set_mpstate(struct kvm_vcpu *vcpu,
@@ -9997,24 +10280,23 @@ static bool kvm_is_valid_sregs(struct kvm_vcpu *vcpu, struct kvm_sregs *sregs)
return kvm_is_valid_cr4(vcpu, sregs->cr4);
}
-static int __set_sregs(struct kvm_vcpu *vcpu, struct kvm_sregs *sregs)
+static int __set_sregs_common(struct kvm_vcpu *vcpu, struct kvm_sregs *sregs,
+ int *mmu_reset_needed, bool update_pdptrs)
{
struct msr_data apic_base_msr;
- int mmu_reset_needed = 0;
- int pending_vec, max_bits, idx;
+ int idx;
struct desc_ptr dt;
- int ret = -EINVAL;
if (!kvm_is_valid_sregs(vcpu, sregs))
- goto out;
+ return -EINVAL;
apic_base_msr.data = sregs->apic_base;
apic_base_msr.host_initiated = true;
if (kvm_set_apic_base(vcpu, &apic_base_msr))
- goto out;
+ return -EINVAL;
if (vcpu->arch.guest_state_protected)
- goto skip_protected_regs;
+ return 0;
dt.size = sregs->idt.limit;
dt.address = sregs->idt.base;
@@ -10024,31 +10306,30 @@ static int __set_sregs(struct kvm_vcpu *vcpu, struct kvm_sregs *sregs)
static_call(kvm_x86_set_gdt)(vcpu, &dt);
vcpu->arch.cr2 = sregs->cr2;
- mmu_reset_needed |= kvm_read_cr3(vcpu) != sregs->cr3;
+ *mmu_reset_needed |= kvm_read_cr3(vcpu) != sregs->cr3;
vcpu->arch.cr3 = sregs->cr3;
kvm_register_mark_available(vcpu, VCPU_EXREG_CR3);
kvm_set_cr8(vcpu, sregs->cr8);
- mmu_reset_needed |= vcpu->arch.efer != sregs->efer;
+ *mmu_reset_needed |= vcpu->arch.efer != sregs->efer;
static_call(kvm_x86_set_efer)(vcpu, sregs->efer);
- mmu_reset_needed |= kvm_read_cr0(vcpu) != sregs->cr0;
+ *mmu_reset_needed |= kvm_read_cr0(vcpu) != sregs->cr0;
static_call(kvm_x86_set_cr0)(vcpu, sregs->cr0);
vcpu->arch.cr0 = sregs->cr0;
- mmu_reset_needed |= kvm_read_cr4(vcpu) != sregs->cr4;
+ *mmu_reset_needed |= kvm_read_cr4(vcpu) != sregs->cr4;
static_call(kvm_x86_set_cr4)(vcpu, sregs->cr4);
- idx = srcu_read_lock(&vcpu->kvm->srcu);
- if (is_pae_paging(vcpu)) {
- load_pdptrs(vcpu, vcpu->arch.walk_mmu, kvm_read_cr3(vcpu));
- mmu_reset_needed = 1;
+ if (update_pdptrs) {
+ idx = srcu_read_lock(&vcpu->kvm->srcu);
+ if (is_pae_paging(vcpu)) {
+ load_pdptrs(vcpu, vcpu->arch.walk_mmu, kvm_read_cr3(vcpu));
+ *mmu_reset_needed = 1;
+ }
+ srcu_read_unlock(&vcpu->kvm->srcu, idx);
}
- srcu_read_unlock(&vcpu->kvm->srcu, idx);
-
- if (mmu_reset_needed)
- kvm_mmu_reset_context(vcpu);
kvm_set_segment(vcpu, &sregs->cs, VCPU_SREG_CS);
kvm_set_segment(vcpu, &sregs->ds, VCPU_SREG_DS);
@@ -10068,20 +10349,63 @@ static int __set_sregs(struct kvm_vcpu *vcpu, struct kvm_sregs *sregs)
!is_protmode(vcpu))
vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE;
-skip_protected_regs:
+ return 0;
+}
+
+static int __set_sregs(struct kvm_vcpu *vcpu, struct kvm_sregs *sregs)
+{
+ int pending_vec, max_bits;
+ int mmu_reset_needed = 0;
+ int ret = __set_sregs_common(vcpu, sregs, &mmu_reset_needed, true);
+
+ if (ret)
+ return ret;
+
+ if (mmu_reset_needed)
+ kvm_mmu_reset_context(vcpu);
+
max_bits = KVM_NR_INTERRUPTS;
pending_vec = find_first_bit(
(const unsigned long *)sregs->interrupt_bitmap, max_bits);
+
if (pending_vec < max_bits) {
kvm_queue_interrupt(vcpu, pending_vec, false);
pr_debug("Set back pending irq %d\n", pending_vec);
+ kvm_make_request(KVM_REQ_EVENT, vcpu);
}
+ return 0;
+}
- kvm_make_request(KVM_REQ_EVENT, vcpu);
+static int __set_sregs2(struct kvm_vcpu *vcpu, struct kvm_sregs2 *sregs2)
+{
+ int mmu_reset_needed = 0;
+ bool valid_pdptrs = sregs2->flags & KVM_SREGS2_FLAGS_PDPTRS_VALID;
+ bool pae = (sregs2->cr0 & X86_CR0_PG) && (sregs2->cr4 & X86_CR4_PAE) &&
+ !(sregs2->efer & EFER_LMA);
+ int i, ret;
- ret = 0;
-out:
- return ret;
+ if (sregs2->flags & ~KVM_SREGS2_FLAGS_PDPTRS_VALID)
+ return -EINVAL;
+
+ if (valid_pdptrs && (!pae || vcpu->arch.guest_state_protected))
+ return -EINVAL;
+
+ ret = __set_sregs_common(vcpu, (struct kvm_sregs *)sregs2,
+ &mmu_reset_needed, !valid_pdptrs);
+ if (ret)
+ return ret;
+
+ if (valid_pdptrs) {
+ for (i = 0; i < 4 ; i++)
+ kvm_pdptr_write(vcpu, i, sregs2->pdptrs[i]);
+
+ kvm_register_mark_dirty(vcpu, VCPU_EXREG_PDPTR);
+ mmu_reset_needed = 1;
+ vcpu->arch.pdptrs_from_userspace = true;
+ }
+ if (mmu_reset_needed)
+ kvm_mmu_reset_context(vcpu);
+ return 0;
}
int kvm_arch_vcpu_ioctl_set_sregs(struct kvm_vcpu *vcpu,
@@ -10305,13 +10629,13 @@ int kvm_arch_vcpu_create(struct kvm_vcpu *vcpu)
struct page *page;
int r;
+ vcpu->arch.last_vmentry_cpu = -1;
+
if (!irqchip_in_kernel(vcpu->kvm) || kvm_vcpu_is_reset_bsp(vcpu))
vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE;
else
vcpu->arch.mp_state = KVM_MP_STATE_UNINITIALIZED;
- kvm_set_tsc_khz(vcpu, max_tsc_khz);
-
r = kvm_mmu_create(vcpu);
if (r < 0)
return r;
@@ -10371,6 +10695,10 @@ int kvm_arch_vcpu_create(struct kvm_vcpu *vcpu)
vcpu->arch.pending_external_vector = -1;
vcpu->arch.preempted_in_kernel = false;
+#if IS_ENABLED(CONFIG_HYPERV)
+ vcpu->arch.hv_root_tdp = INVALID_PAGE;
+#endif
+
r = static_call(kvm_x86_vcpu_create)(vcpu);
if (r)
goto free_guest_fpu;
@@ -10379,8 +10707,9 @@ int kvm_arch_vcpu_create(struct kvm_vcpu *vcpu)
vcpu->arch.msr_platform_info = MSR_PLATFORM_INFO_CPUID_FAULT;
kvm_vcpu_mtrr_init(vcpu);
vcpu_load(vcpu);
+ kvm_set_tsc_khz(vcpu, max_tsc_khz);
kvm_vcpu_reset(vcpu, false);
- kvm_init_mmu(vcpu, false);
+ kvm_init_mmu(vcpu);
vcpu_put(vcpu);
return 0;
@@ -10454,6 +10783,8 @@ void kvm_arch_vcpu_destroy(struct kvm_vcpu *vcpu)
void kvm_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event)
{
+ unsigned long old_cr0 = kvm_read_cr0(vcpu);
+
kvm_lapic_reset(vcpu, init_event);
vcpu->arch.hflags = 0;
@@ -10522,6 +10853,17 @@ void kvm_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event)
vcpu->arch.ia32_xss = 0;
static_call(kvm_x86_vcpu_reset)(vcpu, init_event);
+
+ /*
+ * Reset the MMU context if paging was enabled prior to INIT (which is
+ * implied if CR0.PG=1 as CR0 will be '0' prior to RESET). Unlike the
+ * standard CR0/CR4/EFER modification paths, only CR0.PG needs to be
+ * checked because it is unconditionally cleared on INIT and all other
+ * paging related bits are ignored if paging is disabled, i.e. CR0.WP,
+ * CR4, and EFER changes are all irrelevant if CR0.PG was '0'.
+ */
+ if (old_cr0 & X86_CR0_PG)
+ kvm_mmu_reset_context(vcpu);
}
void kvm_vcpu_deliver_sipi_vector(struct kvm_vcpu *vcpu, u8 vector)
@@ -10639,6 +10981,9 @@ int kvm_arch_hardware_setup(void *opaque)
int r;
rdmsrl_safe(MSR_EFER, &host_efer);
+ if (WARN_ON_ONCE(boot_cpu_has(X86_FEATURE_NX) &&
+ !(host_efer & EFER_NX)))
+ return -EIO;
if (boot_cpu_has(X86_FEATURE_XSAVES))
rdmsrl(MSR_IA32_XSS, host_xss);
@@ -10754,9 +11099,15 @@ int kvm_arch_init_vm(struct kvm *kvm, unsigned long type)
kvm->arch.guest_can_read_msr_platform_info = true;
+#if IS_ENABLED(CONFIG_HYPERV)
+ spin_lock_init(&kvm->arch.hv_root_tdp_lock);
+ kvm->arch.hv_root_tdp = INVALID_PAGE;
+#endif
+
INIT_DELAYED_WORK(&kvm->arch.kvmclock_update_work, kvmclock_update_fn);
INIT_DELAYED_WORK(&kvm->arch.kvmclock_sync_work, kvmclock_sync_fn);
+ kvm_apicv_init(kvm);
kvm_hv_init_vm(kvm);
kvm_page_track_init(kvm);
kvm_mmu_init_vm(kvm);
@@ -10917,17 +11268,23 @@ void kvm_arch_destroy_vm(struct kvm *kvm)
kvm_hv_destroy_vm(kvm);
}
-void kvm_arch_free_memslot(struct kvm *kvm, struct kvm_memory_slot *slot)
+static void memslot_rmap_free(struct kvm_memory_slot *slot)
{
int i;
for (i = 0; i < KVM_NR_PAGE_SIZES; ++i) {
kvfree(slot->arch.rmap[i]);
slot->arch.rmap[i] = NULL;
+ }
+}
- if (i == 0)
- continue;
+void kvm_arch_free_memslot(struct kvm *kvm, struct kvm_memory_slot *slot)
+{
+ int i;
+ memslot_rmap_free(slot);
+
+ for (i = 1; i < KVM_NR_PAGE_SIZES; ++i) {
kvfree(slot->arch.lpage_info[i - 1]);
slot->arch.lpage_info[i - 1] = NULL;
}
@@ -10935,11 +11292,79 @@ void kvm_arch_free_memslot(struct kvm *kvm, struct kvm_memory_slot *slot)
kvm_page_track_free_memslot(slot);
}
-static int kvm_alloc_memslot_metadata(struct kvm_memory_slot *slot,
- unsigned long npages)
+static int memslot_rmap_alloc(struct kvm_memory_slot *slot,
+ unsigned long npages)
{
+ const int sz = sizeof(*slot->arch.rmap[0]);
int i;
+ for (i = 0; i < KVM_NR_PAGE_SIZES; ++i) {
+ int level = i + 1;
+ int lpages = gfn_to_index(slot->base_gfn + npages - 1,
+ slot->base_gfn, level) + 1;
+
+ WARN_ON(slot->arch.rmap[i]);
+
+ slot->arch.rmap[i] = kvcalloc(lpages, sz, GFP_KERNEL_ACCOUNT);
+ if (!slot->arch.rmap[i]) {
+ memslot_rmap_free(slot);
+ return -ENOMEM;
+ }
+ }
+
+ return 0;
+}
+
+int alloc_all_memslots_rmaps(struct kvm *kvm)
+{
+ struct kvm_memslots *slots;
+ struct kvm_memory_slot *slot;
+ int r, i;
+
+ /*
+ * Check if memslots alreday have rmaps early before acquiring
+ * the slots_arch_lock below.
+ */
+ if (kvm_memslots_have_rmaps(kvm))
+ return 0;
+
+ mutex_lock(&kvm->slots_arch_lock);
+
+ /*
+ * Read memslots_have_rmaps again, under the slots arch lock,
+ * before allocating the rmaps
+ */
+ if (kvm_memslots_have_rmaps(kvm)) {
+ mutex_unlock(&kvm->slots_arch_lock);
+ return 0;
+ }
+
+ for (i = 0; i < KVM_ADDRESS_SPACE_NUM; i++) {
+ slots = __kvm_memslots(kvm, i);
+ kvm_for_each_memslot(slot, slots) {
+ r = memslot_rmap_alloc(slot, slot->npages);
+ if (r) {
+ mutex_unlock(&kvm->slots_arch_lock);
+ return r;
+ }
+ }
+ }
+
+ /*
+ * Ensure that memslots_have_rmaps becomes true strictly after
+ * all the rmap pointers are set.
+ */
+ smp_store_release(&kvm->arch.memslots_have_rmaps, true);
+ mutex_unlock(&kvm->slots_arch_lock);
+ return 0;
+}
+
+static int kvm_alloc_memslot_metadata(struct kvm *kvm,
+ struct kvm_memory_slot *slot,
+ unsigned long npages)
+{
+ int i, r;
+
/*
* Clear out the previous array pointers for the KVM_MR_MOVE case. The
* old arrays will be freed by __kvm_set_memory_region() if installing
@@ -10947,7 +11372,13 @@ static int kvm_alloc_memslot_metadata(struct kvm_memory_slot *slot,
*/
memset(&slot->arch, 0, sizeof(slot->arch));
- for (i = 0; i < KVM_NR_PAGE_SIZES; ++i) {
+ if (kvm_memslots_have_rmaps(kvm)) {
+ r = memslot_rmap_alloc(slot, npages);
+ if (r)
+ return r;
+ }
+
+ for (i = 1; i < KVM_NR_PAGE_SIZES; ++i) {
struct kvm_lpage_info *linfo;
unsigned long ugfn;
int lpages;
@@ -10956,14 +11387,6 @@ static int kvm_alloc_memslot_metadata(struct kvm_memory_slot *slot,
lpages = gfn_to_index(slot->base_gfn + npages - 1,
slot->base_gfn, level) + 1;
- slot->arch.rmap[i] =
- kvcalloc(lpages, sizeof(*slot->arch.rmap[i]),
- GFP_KERNEL_ACCOUNT);
- if (!slot->arch.rmap[i])
- goto out_free;
- if (i == 0)
- continue;
-
linfo = kvcalloc(lpages, sizeof(*linfo), GFP_KERNEL_ACCOUNT);
if (!linfo)
goto out_free;
@@ -10993,12 +11416,9 @@ static int kvm_alloc_memslot_metadata(struct kvm_memory_slot *slot,
return 0;
out_free:
- for (i = 0; i < KVM_NR_PAGE_SIZES; ++i) {
- kvfree(slot->arch.rmap[i]);
- slot->arch.rmap[i] = NULL;
- if (i == 0)
- continue;
+ memslot_rmap_free(slot);
+ for (i = 1; i < KVM_NR_PAGE_SIZES; ++i) {
kvfree(slot->arch.lpage_info[i - 1]);
slot->arch.lpage_info[i - 1] = NULL;
}
@@ -11027,7 +11447,7 @@ int kvm_arch_prepare_memory_region(struct kvm *kvm,
enum kvm_mr_change change)
{
if (change == KVM_MR_CREATE || change == KVM_MR_MOVE)
- return kvm_alloc_memslot_metadata(memslot,
+ return kvm_alloc_memslot_metadata(kvm, memslot,
mem->memory_size >> PAGE_SHIFT);
return 0;
}
@@ -11103,36 +11523,19 @@ static void kvm_mmu_slot_apply_flags(struct kvm *kvm,
*/
kvm_mmu_zap_collapsible_sptes(kvm, new);
} else {
- /* By default, write-protect everything to log writes. */
- int level = PG_LEVEL_4K;
+ /*
+ * Initially-all-set does not require write protecting any page,
+ * because they're all assumed to be dirty.
+ */
+ if (kvm_dirty_log_manual_protect_and_init_set(kvm))
+ return;
if (kvm_x86_ops.cpu_dirty_log_size) {
- /*
- * Clear all dirty bits, unless pages are treated as
- * dirty from the get-go.
- */
- if (!kvm_dirty_log_manual_protect_and_init_set(kvm))
- kvm_mmu_slot_leaf_clear_dirty(kvm, new);
-
- /*
- * Write-protect large pages on write so that dirty
- * logging happens at 4k granularity. No need to
- * write-protect small SPTEs since write accesses are
- * logged by the CPU via dirty bits.
- */
- level = PG_LEVEL_2M;
- } else if (kvm_dirty_log_manual_protect_and_init_set(kvm)) {
- /*
- * If we're with initial-all-set, we don't need
- * to write protect any small page because
- * they're reported as dirty already. However
- * we still need to write-protect huge pages
- * so that the page split can happen lazily on
- * the first write to the huge page.
- */
- level = PG_LEVEL_2M;
+ kvm_mmu_slot_leaf_clear_dirty(kvm, new);
+ kvm_mmu_slot_remove_write_access(kvm, new, PG_LEVEL_2M);
+ } else {
+ kvm_mmu_slot_remove_write_access(kvm, new, PG_LEVEL_4K);
}
- kvm_mmu_slot_remove_write_access(kvm, new, level);
}
}
@@ -11701,8 +12104,6 @@ int kvm_handle_invpcid(struct kvm_vcpu *vcpu, unsigned long type, gva_t gva)
{
bool pcid_enabled;
struct x86_exception e;
- unsigned i;
- unsigned long roots_to_free = 0;
struct {
u64 pcid;
u64 gla;
@@ -11736,23 +12137,7 @@ int kvm_handle_invpcid(struct kvm_vcpu *vcpu, unsigned long type, gva_t gva)
return 1;
}
- if (kvm_get_active_pcid(vcpu) == operand.pcid) {
- kvm_mmu_sync_roots(vcpu);
- kvm_make_request(KVM_REQ_TLB_FLUSH_CURRENT, vcpu);
- }
-
- for (i = 0; i < KVM_MMU_NUM_PREV_ROOTS; i++)
- if (kvm_get_pcid(vcpu, vcpu->arch.mmu->prev_roots[i].pgd)
- == operand.pcid)
- roots_to_free |= KVM_MMU_ROOT_PREVIOUS(i);
-
- kvm_mmu_free_roots(vcpu, vcpu->arch.mmu, roots_to_free);
- /*
- * If neither the current cr3 nor any of the prev_roots use the
- * given PCID, then nothing needs to be done here because a
- * resync will happen anyway before switching to any other CR3.
- */
-
+ kvm_invalidate_pcid(vcpu, operand.pcid);
return kvm_skip_emulated_instruction(vcpu);
case INVPCID_TYPE_ALL_NON_GLOBAL:
@@ -11765,7 +12150,7 @@ int kvm_handle_invpcid(struct kvm_vcpu *vcpu, unsigned long type, gva_t gva)
fallthrough;
case INVPCID_TYPE_ALL_INCL_GLOBAL:
- kvm_make_request(KVM_REQ_MMU_RELOAD, vcpu);
+ kvm_make_request(KVM_REQ_TLB_FLUSH_GUEST, vcpu);
return kvm_skip_emulated_instruction(vcpu);
default:
diff --git a/arch/x86/kvm/x86.h b/arch/x86/kvm/x86.h
index 521f74e5bbf2..44ae10312740 100644
--- a/arch/x86/kvm/x86.h
+++ b/arch/x86/kvm/x86.h
@@ -157,16 +157,6 @@ static inline bool is_64_bit_mode(struct kvm_vcpu *vcpu)
return cs_l;
}
-static inline bool is_la57_mode(struct kvm_vcpu *vcpu)
-{
-#ifdef CONFIG_X86_64
- return (vcpu->arch.efer & EFER_LMA) &&
- kvm_read_cr4_bits(vcpu, X86_CR4_LA57);
-#else
- return 0;
-#endif
-}
-
static inline bool x86_exception_has_error_code(unsigned int vector)
{
static u32 exception_has_error_code = BIT(DF_VECTOR) | BIT(TS_VECTOR) |