diff options
59 files changed, 2116 insertions, 595 deletions
diff --git a/Documentation/virt/kvm/api.rst b/Documentation/virt/kvm/api.rst index 6fb1870f0999..1bd2d42e6424 100644 --- a/Documentation/virt/kvm/api.rst +++ b/Documentation/virt/kvm/api.rst @@ -8001,6 +8001,11 @@ apply some other policy-based mitigation. When exiting to userspace, KVM sets KVM_RUN_X86_BUS_LOCK in vcpu-run->flags, and conditionally sets the exit_reason to KVM_EXIT_X86_BUS_LOCK. +Due to differences in the underlying hardware implementation, the vCPU's RIP at +the time of exit diverges between Intel and AMD. On Intel hosts, RIP points at +the next instruction, i.e. the exit is trap-like. On AMD hosts, RIP points at +the offending instruction, i.e. the exit is fault-like. + Note! Detected bus locks may be coincident with other exits to userspace, i.e. KVM_RUN_X86_BUS_LOCK should be checked regardless of the primary exit reason if userspace wants to take action on all detected bus locks. diff --git a/arch/x86/include/asm/cpufeatures.h b/arch/x86/include/asm/cpufeatures.h index 30144ef9ef02..77a265e0672e 100644 --- a/arch/x86/include/asm/cpufeatures.h +++ b/arch/x86/include/asm/cpufeatures.h @@ -336,6 +336,7 @@ #define X86_FEATURE_AMD_IBRS (13*32+14) /* Indirect Branch Restricted Speculation */ #define X86_FEATURE_AMD_STIBP (13*32+15) /* Single Thread Indirect Branch Predictors */ #define X86_FEATURE_AMD_STIBP_ALWAYS_ON (13*32+17) /* Single Thread Indirect Branch Predictors always-on preferred */ +#define X86_FEATURE_AMD_IBRS_SAME_MODE (13*32+19) /* Indirect Branch Restricted Speculation same mode protection*/ #define X86_FEATURE_AMD_PPIN (13*32+23) /* "amd_ppin" Protected Processor Inventory Number */ #define X86_FEATURE_AMD_SSBD (13*32+24) /* Speculative Store Bypass Disable */ #define X86_FEATURE_VIRT_SSBD (13*32+25) /* "virt_ssbd" Virtualized Speculative Store Bypass Disable */ @@ -378,6 +379,7 @@ #define X86_FEATURE_V_SPEC_CTRL (15*32+20) /* "v_spec_ctrl" Virtual SPEC_CTRL */ #define X86_FEATURE_VNMI (15*32+25) /* "vnmi" Virtual NMI */ #define X86_FEATURE_SVME_ADDR_CHK (15*32+28) /* SVME addr check */ +#define X86_FEATURE_BUS_LOCK_THRESHOLD (15*32+29) /* Bus lock threshold */ #define X86_FEATURE_IDLE_HLT (15*32+30) /* IDLE HLT intercept */ /* Intel-defined CPU features, CPUID level 0x00000007:0 (ECX), word 16 */ @@ -446,6 +448,7 @@ #define X86_FEATURE_DEBUG_SWAP (19*32+14) /* "debug_swap" SEV-ES full debug state swap support */ #define X86_FEATURE_RMPREAD (19*32+21) /* RMPREAD instruction */ #define X86_FEATURE_SEGMENTED_RMP (19*32+23) /* Segmented RMP support */ +#define X86_FEATURE_ALLOWED_SEV_FEATURES (19*32+27) /* Allowed SEV Features */ #define X86_FEATURE_SVSM (19*32+28) /* "svsm" SVSM present */ #define X86_FEATURE_HV_INUSE_WR_ALLOWED (19*32+30) /* Allow Write to in-use hypervisor-owned pages */ @@ -457,6 +460,7 @@ #define X86_FEATURE_AUTOIBRS (20*32+ 8) /* Automatic IBRS */ #define X86_FEATURE_NO_SMM_CTL_MSR (20*32+ 9) /* SMM_CTL MSR is not present */ +#define X86_FEATURE_PREFETCHI (20*32+20) /* Prefetch Data/Instruction to Cache Level */ #define X86_FEATURE_SBPB (20*32+27) /* Selective Branch Prediction Barrier */ #define X86_FEATURE_IBPB_BRTYPE (20*32+28) /* MSR_PRED_CMD[IBPB] flushes all branch type predictions */ #define X86_FEATURE_SRSO_NO (20*32+29) /* CPU is not affected by SRSO */ diff --git a/arch/x86/include/asm/kvm-x86-ops.h b/arch/x86/include/asm/kvm-x86-ops.h index 79406bf07a1c..8d50e3e0a19b 100644 --- a/arch/x86/include/asm/kvm-x86-ops.h +++ b/arch/x86/include/asm/kvm-x86-ops.h @@ -127,7 +127,7 @@ KVM_X86_OP(leave_smm) KVM_X86_OP(enable_smi_window) #endif KVM_X86_OP_OPTIONAL(dev_get_attr) -KVM_X86_OP(mem_enc_ioctl) +KVM_X86_OP_OPTIONAL(mem_enc_ioctl) KVM_X86_OP_OPTIONAL(vcpu_mem_enc_ioctl) KVM_X86_OP_OPTIONAL(mem_enc_register_region) KVM_X86_OP_OPTIONAL(mem_enc_unregister_region) diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index 6c06f3d6e081..330cdcbed1a6 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -125,7 +125,8 @@ KVM_ARCH_REQ_FLAGS(31, KVM_REQUEST_WAIT | KVM_REQUEST_NO_WAKEUP) #define KVM_REQ_HV_TLB_FLUSH \ KVM_ARCH_REQ_FLAGS(32, KVM_REQUEST_WAIT | KVM_REQUEST_NO_WAKEUP) -#define KVM_REQ_UPDATE_PROTECTED_GUEST_STATE KVM_ARCH_REQ(34) +#define KVM_REQ_UPDATE_PROTECTED_GUEST_STATE \ + KVM_ARCH_REQ_FLAGS(34, KVM_REQUEST_WAIT) #define CR0_RESERVED_BITS \ (~(unsigned long)(X86_CR0_PE | X86_CR0_MP | X86_CR0_EM | X86_CR0_TS \ @@ -411,7 +412,6 @@ struct kvm_rmap_head { }; struct kvm_pio_request { - unsigned long linear_rip; unsigned long count; int in; int port; @@ -917,6 +917,7 @@ struct kvm_vcpu_arch { bool emulate_regs_need_sync_to_vcpu; bool emulate_regs_need_sync_from_vcpu; int (*complete_userspace_io)(struct kvm_vcpu *vcpu); + unsigned long cui_linear_rip; gpa_t time; s8 pvclock_tsc_shift; @@ -1034,6 +1035,7 @@ struct kvm_vcpu_arch { int pending_ioapic_eoi; int pending_external_vector; + int highest_stale_pending_ioapic_eoi; /* be preempted when it's in kernel-mode(cpl=0) */ bool preempted_in_kernel; @@ -1941,6 +1943,7 @@ struct kvm_arch_async_pf { extern u32 __read_mostly kvm_nr_uret_msrs; extern bool __read_mostly allow_smaller_maxphyaddr; extern bool __read_mostly enable_apicv; +extern bool __read_mostly enable_device_posted_irqs; extern struct kvm_x86_ops kvm_x86_ops; #define kvm_x86_call(func) static_call(kvm_x86_##func) @@ -2444,7 +2447,7 @@ int memslot_rmap_alloc(struct kvm_memory_slot *slot, unsigned long npages); static inline bool kvm_arch_has_irq_bypass(void) { - return enable_apicv && irq_remapping_cap(IRQ_POSTING_CAP); + return enable_device_posted_irqs; } #endif /* _ASM_X86_KVM_HOST_H */ diff --git a/arch/x86/include/asm/msr.h b/arch/x86/include/asm/msr.h index 9397a319d165..7b16501912ad 100644 --- a/arch/x86/include/asm/msr.h +++ b/arch/x86/include/asm/msr.h @@ -300,7 +300,7 @@ do { \ #endif /* !CONFIG_PARAVIRT_XXL */ /* Instruction opcode for WRMSRNS supported in binutils >= 2.40 */ -#define WRMSRNS _ASM_BYTES(0x0f,0x01,0xc6) +#define ASM_WRMSRNS _ASM_BYTES(0x0f,0x01,0xc6) /* Non-serializing WRMSR, when available. Falls back to a serializing WRMSR. */ static __always_inline void wrmsrns(u32 msr, u64 val) @@ -309,7 +309,7 @@ static __always_inline void wrmsrns(u32 msr, u64 val) * WRMSR is 2 bytes. WRMSRNS is 3 bytes. Pad WRMSR with a redundant * DS prefix to avoid a trailing NOP. */ - asm volatile("1: " ALTERNATIVE("ds wrmsr", WRMSRNS, X86_FEATURE_WRMSRNS) + asm volatile("1: " ALTERNATIVE("ds wrmsr", ASM_WRMSRNS, X86_FEATURE_WRMSRNS) "2: " _ASM_EXTABLE_TYPE(1b, 2b, EX_TYPE_WRMSR) : : "c" (msr), "a" ((u32)val), "d" ((u32)(val >> 32))); } diff --git a/arch/x86/include/asm/posted_intr.h b/arch/x86/include/asm/posted_intr.h index bb107ebbe713..a5f761fbf45b 100644 --- a/arch/x86/include/asm/posted_intr.h +++ b/arch/x86/include/asm/posted_intr.h @@ -1,19 +1,24 @@ /* SPDX-License-Identifier: GPL-2.0 */ #ifndef _X86_POSTED_INTR_H #define _X86_POSTED_INTR_H + +#include <asm/cmpxchg.h> +#include <asm/rwonce.h> #include <asm/irq_vectors.h> +#include <linux/bitmap.h> + #define POSTED_INTR_ON 0 #define POSTED_INTR_SN 1 #define PID_TABLE_ENTRY_VALID 1 +#define NR_PIR_VECTORS 256 +#define NR_PIR_WORDS (NR_PIR_VECTORS / BITS_PER_LONG) + /* Posted-Interrupt Descriptor */ struct pi_desc { - union { - u32 pir[8]; /* Posted interrupt requested */ - u64 pir64[4]; - }; + unsigned long pir[NR_PIR_WORDS]; /* Posted interrupt requested */ union { struct { u16 notifications; /* Suppress and outstanding bits */ @@ -26,6 +31,65 @@ struct pi_desc { u32 rsvd[6]; } __aligned(64); +/* + * De-multiplexing posted interrupts is on the performance path, the code + * below is written to optimize the cache performance based on the following + * considerations: + * 1.Posted interrupt descriptor (PID) fits in a cache line that is frequently + * accessed by both CPU and IOMMU. + * 2.During software processing of posted interrupts, the CPU needs to do + * natural width read and xchg for checking and clearing posted interrupt + * request (PIR), a 256 bit field within the PID. + * 3.On the other side, the IOMMU does atomic swaps of the entire PID cache + * line when posting interrupts and setting control bits. + * 4.The CPU can access the cache line a magnitude faster than the IOMMU. + * 5.Each time the IOMMU does interrupt posting to the PIR will evict the PID + * cache line. The cache line states after each operation are as follows, + * assuming a 64-bit kernel: + * CPU IOMMU PID Cache line state + * --------------------------------------------------------------- + *...read64 exclusive + *...lock xchg64 modified + *... post/atomic swap invalid + *...------------------------------------------------------------- + * + * To reduce L1 data cache miss, it is important to avoid contention with + * IOMMU's interrupt posting/atomic swap. Therefore, a copy of PIR is used + * when processing posted interrupts in software, e.g. to dispatch interrupt + * handlers for posted MSIs, or to move interrupts from the PIR to the vIRR + * in KVM. + * + * In addition, the code is trying to keep the cache line state consistent + * as much as possible. e.g. when making a copy and clearing the PIR + * (assuming non-zero PIR bits are present in the entire PIR), it does: + * read, read, read, read, xchg, xchg, xchg, xchg + * instead of: + * read, xchg, read, xchg, read, xchg, read, xchg + */ +static __always_inline bool pi_harvest_pir(unsigned long *pir, + unsigned long *pir_vals) +{ + unsigned long pending = 0; + int i; + + for (i = 0; i < NR_PIR_WORDS; i++) { + pir_vals[i] = READ_ONCE(pir[i]); + pending |= pir_vals[i]; + } + + if (!pending) + return false; + + for (i = 0; i < NR_PIR_WORDS; i++) { + if (!pir_vals[i]) + continue; + + pir_vals[i] = arch_xchg(&pir[i], 0); + } + + return true; +} + static inline bool pi_test_and_set_on(struct pi_desc *pi_desc) { return test_and_set_bit(POSTED_INTR_ON, (unsigned long *)&pi_desc->control); @@ -43,12 +107,12 @@ static inline bool pi_test_and_clear_sn(struct pi_desc *pi_desc) static inline bool pi_test_and_set_pir(int vector, struct pi_desc *pi_desc) { - return test_and_set_bit(vector, (unsigned long *)pi_desc->pir); + return test_and_set_bit(vector, pi_desc->pir); } static inline bool pi_is_pir_empty(struct pi_desc *pi_desc) { - return bitmap_empty((unsigned long *)pi_desc->pir, NR_VECTORS); + return bitmap_empty(pi_desc->pir, NR_VECTORS); } static inline void pi_set_sn(struct pi_desc *pi_desc) @@ -110,7 +174,7 @@ static inline bool pi_pending_this_cpu(unsigned int vector) if (WARN_ON_ONCE(vector > NR_VECTORS || vector < FIRST_EXTERNAL_VECTOR)) return false; - return test_bit(vector, (unsigned long *)pid->pir); + return test_bit(vector, pid->pir); } extern void intel_posted_msi_init(void); diff --git a/arch/x86/include/asm/svm.h b/arch/x86/include/asm/svm.h index 9b7fa99ae951..ad954a1a6656 100644 --- a/arch/x86/include/asm/svm.h +++ b/arch/x86/include/asm/svm.h @@ -116,6 +116,7 @@ enum { INTERCEPT_INVPCID, INTERCEPT_MCOMMIT, INTERCEPT_TLBSYNC, + INTERCEPT_BUSLOCK, INTERCEPT_IDLE_HLT = 166, }; @@ -159,7 +160,12 @@ struct __attribute__ ((__packed__)) vmcb_control_area { u64 avic_physical_id; /* Offset 0xf8 */ u8 reserved_7[8]; u64 vmsa_pa; /* Used for an SEV-ES guest */ - u8 reserved_8[720]; + u8 reserved_8[16]; + u16 bus_lock_counter; /* Offset 0x120 */ + u8 reserved_9[22]; + u64 allowed_sev_features; /* Offset 0x138 */ + u64 guest_sev_features; /* Offset 0x140 */ + u8 reserved_10[664]; /* * Offset 0x3e0, 32 bytes reserved * for use by hypervisor/software. @@ -291,6 +297,8 @@ static_assert((X2AVIC_MAX_PHYSICAL_ID & AVIC_PHYSICAL_MAX_INDEX_MASK) == X2AVIC_ #define SVM_SEV_FEAT_ALTERNATE_INJECTION BIT(4) #define SVM_SEV_FEAT_DEBUG_SWAP BIT(5) +#define VMCB_ALLOWED_SEV_FEATURES_VALID BIT_ULL(63) + struct vmcb_seg { u16 selector; u16 attrib; diff --git a/arch/x86/include/uapi/asm/kvm.h b/arch/x86/include/uapi/asm/kvm.h index 225a12e0d5d6..6f3499507c5e 100644 --- a/arch/x86/include/uapi/asm/kvm.h +++ b/arch/x86/include/uapi/asm/kvm.h @@ -845,6 +845,7 @@ struct kvm_sev_snp_launch_start { }; /* Kept in sync with firmware values for simplicity. */ +#define KVM_SEV_PAGE_TYPE_INVALID 0x0 #define KVM_SEV_SNP_PAGE_TYPE_NORMAL 0x1 #define KVM_SEV_SNP_PAGE_TYPE_ZERO 0x3 #define KVM_SEV_SNP_PAGE_TYPE_UNMEASURED 0x4 diff --git a/arch/x86/include/uapi/asm/svm.h b/arch/x86/include/uapi/asm/svm.h index ec1321248dac..9c640a521a67 100644 --- a/arch/x86/include/uapi/asm/svm.h +++ b/arch/x86/include/uapi/asm/svm.h @@ -95,6 +95,7 @@ #define SVM_EXIT_CR14_WRITE_TRAP 0x09e #define SVM_EXIT_CR15_WRITE_TRAP 0x09f #define SVM_EXIT_INVPCID 0x0a2 +#define SVM_EXIT_BUS_LOCK 0x0a5 #define SVM_EXIT_IDLE_HLT 0x0a6 #define SVM_EXIT_NPF 0x400 #define SVM_EXIT_AVIC_INCOMPLETE_IPI 0x401 @@ -225,6 +226,7 @@ { SVM_EXIT_CR4_WRITE_TRAP, "write_cr4_trap" }, \ { SVM_EXIT_CR8_WRITE_TRAP, "write_cr8_trap" }, \ { SVM_EXIT_INVPCID, "invpcid" }, \ + { SVM_EXIT_BUS_LOCK, "buslock" }, \ { SVM_EXIT_IDLE_HLT, "idle-halt" }, \ { SVM_EXIT_NPF, "npf" }, \ { SVM_EXIT_AVIC_INCOMPLETE_IPI, "avic_incomplete_ipi" }, \ diff --git a/arch/x86/kernel/irq.c b/arch/x86/kernel/irq.c index 81f9b78e0f7b..9ed29ff10e59 100644 --- a/arch/x86/kernel/irq.c +++ b/arch/x86/kernel/irq.c @@ -380,61 +380,18 @@ void intel_posted_msi_init(void) this_cpu_write(posted_msi_pi_desc.ndst, destination); } -/* - * De-multiplexing posted interrupts is on the performance path, the code - * below is written to optimize the cache performance based on the following - * considerations: - * 1.Posted interrupt descriptor (PID) fits in a cache line that is frequently - * accessed by both CPU and IOMMU. - * 2.During posted MSI processing, the CPU needs to do 64-bit read and xchg - * for checking and clearing posted interrupt request (PIR), a 256 bit field - * within the PID. - * 3.On the other side, the IOMMU does atomic swaps of the entire PID cache - * line when posting interrupts and setting control bits. - * 4.The CPU can access the cache line a magnitude faster than the IOMMU. - * 5.Each time the IOMMU does interrupt posting to the PIR will evict the PID - * cache line. The cache line states after each operation are as follows: - * CPU IOMMU PID Cache line state - * --------------------------------------------------------------- - *...read64 exclusive - *...lock xchg64 modified - *... post/atomic swap invalid - *...------------------------------------------------------------- - * - * To reduce L1 data cache miss, it is important to avoid contention with - * IOMMU's interrupt posting/atomic swap. Therefore, a copy of PIR is used - * to dispatch interrupt handlers. - * - * In addition, the code is trying to keep the cache line state consistent - * as much as possible. e.g. when making a copy and clearing the PIR - * (assuming non-zero PIR bits are present in the entire PIR), it does: - * read, read, read, read, xchg, xchg, xchg, xchg - * instead of: - * read, xchg, read, xchg, read, xchg, read, xchg - */ -static __always_inline bool handle_pending_pir(u64 *pir, struct pt_regs *regs) +static __always_inline bool handle_pending_pir(unsigned long *pir, struct pt_regs *regs) { - int i, vec = FIRST_EXTERNAL_VECTOR; - unsigned long pir_copy[4]; - bool handled = false; + unsigned long pir_copy[NR_PIR_WORDS]; + int vec = FIRST_EXTERNAL_VECTOR; - for (i = 0; i < 4; i++) - pir_copy[i] = pir[i]; - - for (i = 0; i < 4; i++) { - if (!pir_copy[i]) - continue; + if (!pi_harvest_pir(pir, pir_copy)) + return false; - pir_copy[i] = arch_xchg(&pir[i], 0); - handled = true; - } - - if (handled) { - for_each_set_bit_from(vec, pir_copy, FIRST_SYSTEM_VECTOR) - call_irq_handler(vec, regs); - } + for_each_set_bit_from(vec, pir_copy, FIRST_SYSTEM_VECTOR) + call_irq_handler(vec, regs); - return handled; + return true; } /* @@ -464,7 +421,7 @@ DEFINE_IDTENTRY_SYSVEC(sysvec_posted_msi_notification) * MAX_POSTED_MSI_COALESCING_LOOP - 1 loops are executed here. */ while (++i < MAX_POSTED_MSI_COALESCING_LOOP) { - if (!handle_pending_pir(pid->pir64, regs)) + if (!handle_pending_pir(pid->pir, regs)) break; } @@ -479,7 +436,7 @@ DEFINE_IDTENTRY_SYSVEC(sysvec_posted_msi_notification) * process PIR bits one last time such that handling the new interrupts * are not delayed until the next IRQ. */ - handle_pending_pir(pid->pir64, regs); + handle_pending_pir(pid->pir, regs); apic_eoi(); irq_exit(); diff --git a/arch/x86/kvm/cpuid.c b/arch/x86/kvm/cpuid.c index e92263fbbad6..41df8729c167 100644 --- a/arch/x86/kvm/cpuid.c +++ b/arch/x86/kvm/cpuid.c @@ -978,6 +978,7 @@ void kvm_set_cpu_caps(void) F(FZRM), F(FSRS), F(FSRC), + F(WRMSRNS), F(AMX_FP16), F(AVX_IFMA), F(LAM), @@ -1093,6 +1094,7 @@ void kvm_set_cpu_caps(void) F(AMD_SSB_NO), F(AMD_STIBP), F(AMD_STIBP_ALWAYS_ON), + F(AMD_IBRS_SAME_MODE), F(AMD_PSFD), F(AMD_IBPB_RET), ); @@ -1150,6 +1152,7 @@ void kvm_set_cpu_caps(void) kvm_cpu_cap_init(CPUID_8000_0021_EAX, F(NO_NESTED_DATA_BP), + F(WRMSR_XX_BASE_NS), /* * Synthesize "LFENCE is serializing" into the AMD-defined entry * in KVM's supported CPUID, i.e. if the feature is reported as @@ -1163,10 +1166,13 @@ void kvm_set_cpu_caps(void) SYNTHESIZED_F(LFENCE_RDTSC), /* SmmPgCfgLock */ F(NULL_SEL_CLR_BASE), + /* UpperAddressIgnore */ F(AUTOIBRS), + F(PREFETCHI), EMULATED_F(NO_SMM_CTL_MSR), /* PrefetchCtlMsr */ - F(WRMSR_XX_BASE_NS), + /* GpOnUserCpuid */ + /* EPSF */ SYNTHESIZED_F(SBPB), SYNTHESIZED_F(IBPB_BRTYPE), SYNTHESIZED_F(SRSO_NO), diff --git a/arch/x86/kvm/ioapic.c b/arch/x86/kvm/ioapic.c index 995eb5054360..45dae2d5d2f1 100644 --- a/arch/x86/kvm/ioapic.c +++ b/arch/x86/kvm/ioapic.c @@ -296,11 +296,8 @@ void kvm_ioapic_scan_entry(struct kvm_vcpu *vcpu, ulong *ioapic_handled_vectors) index == RTC_GSI) { u16 dm = kvm_lapic_irq_dest_mode(!!e->fields.dest_mode); - if (kvm_apic_match_dest(vcpu, NULL, APIC_DEST_NOSHORT, - e->fields.dest_id, dm) || - kvm_apic_pending_eoi(vcpu, e->fields.vector)) - __set_bit(e->fields.vector, - ioapic_handled_vectors); + kvm_scan_ioapic_irq(vcpu, e->fields.dest_id, dm, + e->fields.vector, ioapic_handled_vectors); } } spin_unlock(&ioapic->lock); diff --git a/arch/x86/kvm/ioapic.h b/arch/x86/kvm/ioapic.h index 539333ac4b38..aa8cb4ac0479 100644 --- a/arch/x86/kvm/ioapic.h +++ b/arch/x86/kvm/ioapic.h @@ -120,4 +120,6 @@ void kvm_ioapic_scan_entry(struct kvm_vcpu *vcpu, ulong *ioapic_handled_vectors); void kvm_scan_ioapic_routes(struct kvm_vcpu *vcpu, ulong *ioapic_handled_vectors); +void kvm_scan_ioapic_irq(struct kvm_vcpu *vcpu, u32 dest_id, u16 dest_mode, + u8 vector, unsigned long *ioapic_handled_vectors); #endif diff --git a/arch/x86/kvm/irq_comm.c b/arch/x86/kvm/irq_comm.c index 8136695f7b96..d6d792b5d1bd 100644 --- a/arch/x86/kvm/irq_comm.c +++ b/arch/x86/kvm/irq_comm.c @@ -402,6 +402,33 @@ void kvm_arch_post_irq_routing_update(struct kvm *kvm) kvm_make_scan_ioapic_request(kvm); } +void kvm_scan_ioapic_irq(struct kvm_vcpu *vcpu, u32 dest_id, u16 dest_mode, + u8 vector, unsigned long *ioapic_handled_vectors) +{ + /* + * Intercept EOI if the vCPU is the target of the new IRQ routing, or + * the vCPU has a pending IRQ from the old routing, i.e. if the vCPU + * may receive a level-triggered IRQ in the future, or already received + * level-triggered IRQ. The EOI needs to be intercepted and forwarded + * to I/O APIC emulation so that the IRQ can be de-asserted. + */ + if (kvm_apic_match_dest(vcpu, NULL, APIC_DEST_NOSHORT, dest_id, dest_mode)) { + __set_bit(vector, ioapic_handled_vectors); + } else if (kvm_apic_pending_eoi(vcpu, vector)) { + __set_bit(vector, ioapic_handled_vectors); + + /* + * Track the highest pending EOI for which the vCPU is NOT the + * target in the new routing. Only the EOI for the IRQ that is + * in-flight (for the old routing) needs to be intercepted, any + * future IRQs that arrive on this vCPU will be coincidental to + * the level-triggered routing and don't need to be intercepted. + */ + if ((int)vector > vcpu->arch.highest_stale_pending_ioapic_eoi) + vcpu->arch.highest_stale_pending_ioapic_eoi = vector; + } +} + void kvm_scan_ioapic_routes(struct kvm_vcpu *vcpu, ulong *ioapic_handled_vectors) { @@ -424,11 +451,11 @@ void kvm_scan_ioapic_routes(struct kvm_vcpu *vcpu, kvm_set_msi_irq(vcpu->kvm, entry, &irq); - if (irq.trig_mode && - (kvm_apic_match_dest(vcpu, NULL, APIC_DEST_NOSHORT, - irq.dest_id, irq.dest_mode) || - kvm_apic_pending_eoi(vcpu, irq.vector))) - __set_bit(irq.vector, ioapic_handled_vectors); + if (!irq.trig_mode) + continue; + + kvm_scan_ioapic_irq(vcpu, irq.dest_id, irq.dest_mode, + irq.vector, ioapic_handled_vectors); } } srcu_read_unlock(&kvm->irq_srcu, idx); diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c index c9de81cc27e1..73418dc0ebb2 100644 --- a/arch/x86/kvm/lapic.c +++ b/arch/x86/kvm/lapic.c @@ -655,27 +655,29 @@ static u8 count_vectors(void *bitmap) return count; } -bool __kvm_apic_update_irr(u32 *pir, void *regs, int *max_irr) +bool __kvm_apic_update_irr(unsigned long *pir, void *regs, int *max_irr) { + unsigned long pir_vals[NR_PIR_WORDS]; + u32 *__pir = (void *)pir_vals; u32 i, vec; - u32 pir_val, irr_val, prev_irr_val; + u32 irr_val, prev_irr_val; int max_updated_irr; max_updated_irr = -1; *max_irr = -1; + if (!pi_harvest_pir(pir, pir_vals)) + return false; + for (i = vec = 0; i <= 7; i++, vec += 32) { u32 *p_irr = (u32 *)(regs + APIC_IRR + i * 0x10); - irr_val = *p_irr; - pir_val = READ_ONCE(pir[i]); - - if (pir_val) { - pir_val = xchg(&pir[i], 0); + irr_val = READ_ONCE(*p_irr); + if (__pir[i]) { prev_irr_val = irr_val; do { - irr_val = prev_irr_val | pir_val; + irr_val = prev_irr_val | __pir[i]; } while (prev_irr_val != irr_val && !try_cmpxchg(p_irr, &prev_irr_val, irr_val)); @@ -691,7 +693,7 @@ bool __kvm_apic_update_irr(u32 *pir, void *regs, int *max_irr) } EXPORT_SYMBOL_GPL(__kvm_apic_update_irr); -bool kvm_apic_update_irr(struct kvm_vcpu *vcpu, u32 *pir, int *max_irr) +bool kvm_apic_update_irr(struct kvm_vcpu *vcpu, unsigned long *pir, int *max_irr) { struct kvm_lapic *apic = vcpu->arch.apic; bool irr_updated = __kvm_apic_update_irr(pir, apic->regs, max_irr); @@ -1459,6 +1461,14 @@ static void kvm_ioapic_send_eoi(struct kvm_lapic *apic, int vector) if (!kvm_ioapic_handles_vector(apic, vector)) return; + /* + * If the intercepted EOI is for an IRQ that was pending from previous + * routing, then re-scan the I/O APIC routes as EOIs for the IRQ likely + * no longer need to be intercepted. + */ + if (apic->vcpu->arch.highest_stale_pending_ioapic_eoi == vector) + kvm_make_request(KVM_REQ_SCAN_IOAPIC, apic->vcpu); + /* Request a KVM exit to inform the userspace IOAPIC. */ if (irqchip_split(apic->vcpu->kvm)) { apic->vcpu->arch.pending_ioapic_eoi = vector; diff --git a/arch/x86/kvm/lapic.h b/arch/x86/kvm/lapic.h index e33c969439f7..4ce30db65828 100644 --- a/arch/x86/kvm/lapic.h +++ b/arch/x86/kvm/lapic.h @@ -103,8 +103,8 @@ bool kvm_apic_match_dest(struct kvm_vcpu *vcpu, struct kvm_lapic *source, int shorthand, unsigned int dest, int dest_mode); int kvm_apic_compare_prio(struct kvm_vcpu *vcpu1, struct kvm_vcpu *vcpu2); void kvm_apic_clear_irr(struct kvm_vcpu *vcpu, int vec); -bool __kvm_apic_update_irr(u32 *pir, void *regs, int *max_irr); -bool kvm_apic_update_irr(struct kvm_vcpu *vcpu, u32 *pir, int *max_irr); +bool __kvm_apic_update_irr(unsigned long *pir, void *regs, int *max_irr); +bool kvm_apic_update_irr(struct kvm_vcpu *vcpu, unsigned long *pir, int *max_irr); void kvm_apic_update_ppr(struct kvm_vcpu *vcpu); int kvm_apic_set_irq(struct kvm_vcpu *vcpu, struct kvm_lapic_irq *irq, struct dest_map *dest_map); diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c index 7b3f1783ab3c..cbc84c6abc2e 100644 --- a/arch/x86/kvm/mmu/mmu.c +++ b/arch/x86/kvm/mmu/mmu.c @@ -3020,7 +3020,8 @@ static int mmu_set_spte(struct kvm_vcpu *vcpu, struct kvm_memory_slot *slot, } if (is_shadow_present_pte(*sptep)) { - if (prefetch) + if (prefetch && is_last_spte(*sptep, level) && + pfn == spte_to_pfn(*sptep)) return RET_PF_SPURIOUS; /* @@ -3034,7 +3035,7 @@ static int mmu_set_spte(struct kvm_vcpu *vcpu, struct kvm_memory_slot *slot, child = spte_to_child_sp(pte); drop_parent_pte(vcpu->kvm, child, sptep); flush = true; - } else if (pfn != spte_to_pfn(*sptep)) { + } else if (WARN_ON_ONCE(pfn != spte_to_pfn(*sptep))) { drop_spte(vcpu->kvm, sptep); flush = true; } else diff --git a/arch/x86/kvm/mmu/tdp_mmu.c b/arch/x86/kvm/mmu/tdp_mmu.c index 405874f4d088..7f3d7229b2c1 100644 --- a/arch/x86/kvm/mmu/tdp_mmu.c +++ b/arch/x86/kvm/mmu/tdp_mmu.c @@ -378,7 +378,7 @@ static void remove_external_spte(struct kvm *kvm, gfn_t gfn, u64 old_spte, /* Zapping leaf spte is allowed only when write lock is held. */ lockdep_assert_held_write(&kvm->mmu_lock); /* Because write lock is held, operation should success. */ - ret = static_call(kvm_x86_remove_external_spte)(kvm, gfn, level, old_pfn); + ret = kvm_x86_call(remove_external_spte)(kvm, gfn, level, old_pfn); KVM_BUG_ON(ret, kvm); } @@ -485,8 +485,8 @@ static void handle_removed_pt(struct kvm *kvm, tdp_ptep_t pt, bool shared) } if (is_mirror_sp(sp) && - WARN_ON(static_call(kvm_x86_free_external_spt)(kvm, base_gfn, sp->role.level, - sp->external_spt))) { + WARN_ON(kvm_x86_call(free_external_spt)(kvm, base_gfn, sp->role.level, + sp->external_spt))) { /* * Failed to free page table page in mirror page table and * there is nothing to do further. @@ -538,12 +538,12 @@ static int __must_check set_external_spte_present(struct kvm *kvm, tdp_ptep_t sp * external page table, or leaf. */ if (is_leaf) { - ret = static_call(kvm_x86_set_external_spte)(kvm, gfn, level, new_pfn); + ret = kvm_x86_call(set_external_spte)(kvm, gfn, level, new_pfn); } else { void *external_spt = get_external_spt(gfn, new_spte, level); KVM_BUG_ON(!external_spt, kvm); - ret = static_call(kvm_x86_link_external_spt)(kvm, gfn, level, external_spt); + ret = kvm_x86_call(link_external_spt)(kvm, gfn, level, external_spt); } if (ret) __kvm_tdp_mmu_write_spte(sptep, old_spte); @@ -1153,13 +1153,12 @@ static int tdp_mmu_map_handle_target_level(struct kvm_vcpu *vcpu, if (WARN_ON_ONCE(sp->role.level != fault->goal_level)) return RET_PF_RETRY; - if (fault->prefetch && is_shadow_present_pte(iter->old_spte)) - return RET_PF_SPURIOUS; - if (is_shadow_present_pte(iter->old_spte) && - is_access_allowed(fault, iter->old_spte) && - is_last_spte(iter->old_spte, iter->level)) + (fault->prefetch || is_access_allowed(fault, iter->old_spte)) && + is_last_spte(iter->old_spte, iter->level)) { + WARN_ON_ONCE(fault->pfn != spte_to_pfn(iter->old_spte)); return RET_PF_SPURIOUS; + } if (unlikely(!fault->slot)) new_spte = make_mmio_spte(vcpu, iter->gfn, ACC_ALL); diff --git a/arch/x86/kvm/svm/nested.c b/arch/x86/kvm/svm/nested.c index 834b67672d50..8427a48b8b7a 100644 --- a/arch/x86/kvm/svm/nested.c +++ b/arch/x86/kvm/svm/nested.c @@ -678,6 +678,33 @@ static void nested_vmcb02_prepare_control(struct vcpu_svm *svm, vmcb02->control.iopm_base_pa = vmcb01->control.iopm_base_pa; vmcb02->control.msrpm_base_pa = vmcb01->control.msrpm_base_pa; + /* + * Stash vmcb02's counter if the guest hasn't moved past the guilty + * instruction; otherwise, reset the counter to '0'. + * + * In order to detect if L2 has made forward progress or not, track the + * RIP at which a bus lock has occurred on a per-vmcb12 basis. If RIP + * is changed, guest has clearly made forward progress, bus_lock_counter + * still remained '1', so reset bus_lock_counter to '0'. Eg. In the + * scenario, where a buslock happened in L1 before VMRUN, the bus lock + * firmly happened on an instruction in the past. Even if vmcb01's + * counter is still '1', (because the guilty instruction got patched), + * the vCPU has clearly made forward progress and so KVM should reset + * vmcb02's counter to '0'. + * + * If the RIP hasn't changed, stash the bus lock counter at nested VMRUN + * to prevent the same guilty instruction from triggering a VM-Exit. Eg. + * if userspace rate-limits the vCPU, then it's entirely possible that + * L1's tick interrupt is pending by the time userspace re-runs the + * vCPU. If KVM unconditionally clears the counter on VMRUN, then when + * L1 re-enters L2, the same instruction will trigger a VM-Exit and the + * entire cycle start over. + */ + if (vmcb02->save.rip && (svm->nested.ctl.bus_lock_rip == vmcb02->save.rip)) + vmcb02->control.bus_lock_counter = 1; + else + vmcb02->control.bus_lock_counter = 0; + /* Done at vmrun: asid. */ /* Also overwritten later if necessary. */ @@ -1039,8 +1066,17 @@ int nested_svm_vmexit(struct vcpu_svm *svm) } + /* + * Invalidate bus_lock_rip unless KVM is still waiting for the guest + * to make forward progress before re-enabling bus lock detection. + */ + if (!vmcb02->control.bus_lock_counter) + svm->nested.ctl.bus_lock_rip = INVALID_GPA; + nested_svm_copy_common_state(svm->nested.vmcb02.ptr, svm->vmcb01.ptr); + kvm_nested_vmexit_handle_ibrs(vcpu); + svm_switch_vmcb(svm, &svm->vmcb01); /* diff --git a/arch/x86/kvm/svm/sev.c b/arch/x86/kvm/svm/sev.c index 710ca9810a17..6c2f840a0171 100644 --- a/arch/x86/kvm/svm/sev.c +++ b/arch/x86/kvm/svm/sev.c @@ -560,6 +560,8 @@ static int sev_launch_start(struct kvm *kvm, struct kvm_sev_cmd *argp) if (copy_from_user(¶ms, u64_to_user_ptr(argp->data), sizeof(params))) return -EFAULT; + sev->policy = params.policy; + memset(&start, 0, sizeof(start)); dh_blob = NULL; @@ -1592,11 +1594,11 @@ static int sev_send_update_data(struct kvm *kvm, struct kvm_sev_cmd *argp) /* allocate memory for header and transport buffer */ ret = -ENOMEM; - hdr = kzalloc(params.hdr_len, GFP_KERNEL_ACCOUNT); + hdr = kzalloc(params.hdr_len, GFP_KERNEL); if (!hdr) goto e_unpin; - trans_data = kzalloc(params.trans_len, GFP_KERNEL_ACCOUNT); + trans_data = kzalloc(params.trans_len, GFP_KERNEL); if (!trans_data) goto e_free_hdr; @@ -2135,6 +2137,8 @@ static int snp_launch_start(struct kvm *kvm, struct kvm_sev_cmd *argp) if (params.policy & SNP_POLICY_MASK_SINGLE_SOCKET) return -EINVAL; + sev->policy = params.policy; + sev->snp_context = snp_context_create(kvm, argp); if (!sev->snp_context) return -ENOTTY; @@ -3930,10 +3934,8 @@ static int sev_snp_ap_creation(struct vcpu_svm *svm) * Unless Creation is deferred until INIT, signal the vCPU to update * its state. */ - if (request != SVM_VMGEXIT_AP_CREATE_ON_INIT) { - kvm_make_request(KVM_REQ_UPDATE_PROTECTED_GUEST_STATE, target_vcpu); - kvm_vcpu_kick(target_vcpu); - } + if (request != SVM_VMGEXIT_AP_CREATE_ON_INIT) + kvm_make_request_and_kick(KVM_REQ_UPDATE_PROTECTED_GUEST_STATE, target_vcpu); return 0; } @@ -4391,6 +4393,7 @@ void sev_vcpu_after_set_cpuid(struct vcpu_svm *svm) static void sev_es_init_vmcb(struct vcpu_svm *svm) { + struct kvm_sev_info *sev = to_kvm_sev_info(svm->vcpu.kvm); struct vmcb *vmcb = svm->vmcb01.ptr; struct kvm_vcpu *vcpu = &svm->vcpu; @@ -4406,6 +4409,10 @@ static void sev_es_init_vmcb(struct vcpu_svm *svm) if (svm->sev_es.vmsa && !svm->sev_es.snp_has_guest_vmsa) svm->vmcb->control.vmsa_pa = __pa(svm->sev_es.vmsa); + if (cpu_feature_enabled(X86_FEATURE_ALLOWED_SEV_FEATURES)) + svm->vmcb->control.allowed_sev_features = sev->vmsa_features | + VMCB_ALLOWED_SEV_FEATURES_VALID; + /* Can't intercept CR register access, HV can't modify CR registers */ svm_clr_intercept(svm, INTERCEPT_CR0_READ); svm_clr_intercept(svm, INTERCEPT_CR4_READ); @@ -4866,3 +4873,97 @@ int sev_private_max_mapping_level(struct kvm *kvm, kvm_pfn_t pfn) return level; } + +struct vmcb_save_area *sev_decrypt_vmsa(struct kvm_vcpu *vcpu) +{ + struct vcpu_svm *svm = to_svm(vcpu); + struct vmcb_save_area *vmsa; + struct kvm_sev_info *sev; + int error = 0; + int ret; + + if (!sev_es_guest(vcpu->kvm)) + return NULL; + + /* + * If the VMSA has not yet been encrypted, return a pointer to the + * current un-encrypted VMSA. + */ + if (!vcpu->arch.guest_state_protected) + return (struct vmcb_save_area *)svm->sev_es.vmsa; + + sev = to_kvm_sev_info(vcpu->kvm); + + /* Check if the SEV policy allows debugging */ + if (sev_snp_guest(vcpu->kvm)) { + if (!(sev->policy & SNP_POLICY_DEBUG)) + return NULL; + } else { + if (sev->policy & SEV_POLICY_NODBG) + return NULL; + } + + if (sev_snp_guest(vcpu->kvm)) { + struct sev_data_snp_dbg dbg = {0}; + + vmsa = snp_alloc_firmware_page(__GFP_ZERO); + if (!vmsa) + return NULL; + + dbg.gctx_paddr = __psp_pa(sev->snp_context); + dbg.src_addr = svm->vmcb->control.vmsa_pa; + dbg.dst_addr = __psp_pa(vmsa); + + ret = sev_do_cmd(SEV_CMD_SNP_DBG_DECRYPT, &dbg, &error); + + /* + * Return the target page to a hypervisor page no matter what. + * If this fails, the page can't be used, so leak it and don't + * try to use it. + */ + if (snp_page_reclaim(vcpu->kvm, PHYS_PFN(__pa(vmsa)))) + return NULL; + + if (ret) { + pr_err("SEV: SNP_DBG_DECRYPT failed ret=%d, fw_error=%d (%#x)\n", + ret, error, error); + free_page((unsigned long)vmsa); + + return NULL; + } + } else { + struct sev_data_dbg dbg = {0}; + struct page *vmsa_page; + + vmsa_page = alloc_page(GFP_KERNEL); + if (!vmsa_page) + return NULL; + + vmsa = page_address(vmsa_page); + + dbg.handle = sev->handle; + dbg.src_addr = svm->vmcb->control.vmsa_pa; + dbg.dst_addr = __psp_pa(vmsa); + dbg.len = PAGE_SIZE; + + ret = sev_do_cmd(SEV_CMD_DBG_DECRYPT, &dbg, &error); + if (ret) { + pr_err("SEV: SEV_CMD_DBG_DECRYPT failed ret=%d, fw_error=%d (0x%x)\n", + ret, error, error); + __free_page(vmsa_page); + + return NULL; + } + } + + return vmsa; +} + +void sev_free_decrypted_vmsa(struct kvm_vcpu *vcpu, struct vmcb_save_area *vmsa) +{ + /* If the VMSA has not yet been encrypted, nothing was allocated */ + if (!vcpu->arch.guest_state_protected || !vmsa) + return; + + free_page((unsigned long)vmsa); +} diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c index 9366a93ae2c6..0ad1a6d4fb6d 100644 --- a/arch/x86/kvm/svm/svm.c +++ b/arch/x86/kvm/svm/svm.c @@ -29,6 +29,7 @@ #include <linux/cc_platform.h> #include <linux/smp.h> #include <linux/string_choices.h> +#include <linux/mutex.h> #include <asm/apic.h> #include <asm/perf_event.h> @@ -231,6 +232,8 @@ module_param(tsc_scaling, int, 0444); static bool avic; module_param(avic, bool, 0444); +module_param(enable_device_posted_irqs, bool, 0444); + bool __read_mostly dump_invalid_vmcb; module_param(dump_invalid_vmcb, bool, 0644); @@ -249,6 +252,8 @@ static unsigned long iopm_base; DEFINE_PER_CPU(struct svm_cpu_data, svm_data); +static DEFINE_MUTEX(vmcb_dump_mutex); + /* * Only MSR_TSC_AUX is switched via the user return hook. EFER is switched via * the VMCB, and the SYSCALL/SYSENTER MSRs are handled by VMLOAD/VMSAVE. @@ -1375,6 +1380,9 @@ static void init_vmcb(struct kvm_vcpu *vcpu) svm->vmcb->control.int_ctl |= V_GIF_ENABLE_MASK; } + if (vcpu->kvm->arch.bus_lock_detection_enabled) + svm_set_intercept(svm, INTERCEPT_BUSLOCK); + if (sev_guest(vcpu->kvm)) sev_init_vmcb(svm); @@ -1484,25 +1492,10 @@ out: return err; } -static void svm_clear_current_vmcb(struct vmcb *vmcb) -{ - int i; - - for_each_online_cpu(i) - cmpxchg(per_cpu_ptr(&svm_data.current_vmcb, i), vmcb, NULL); -} - static void svm_vcpu_free(struct kvm_vcpu *vcpu) { struct vcpu_svm *svm = to_svm(vcpu); - /* - * The vmcb page can be recycled, causing a false negative in - * svm_vcpu_load(). So, ensure that no logical CPU has this - * vmcb page recorded as its current vmcb. - */ - svm_clear_current_vmcb(svm->vmcb); - svm_leave_nested(vcpu); svm_free_nested(svm); @@ -1616,19 +1609,9 @@ static void svm_prepare_host_switch(struct kvm_vcpu *vcpu) static void svm_vcpu_load(struct kvm_vcpu *vcpu, int cpu) { - struct vcpu_svm *svm = to_svm(vcpu); - struct svm_cpu_data *sd = per_cpu_ptr(&svm_data, cpu); - if (vcpu->scheduled_out && !kvm_pause_in_guest(vcpu->kvm)) shrink_ple_window(vcpu); - if (sd->current_vmcb != svm->vmcb) { - sd->current_vmcb = svm->vmcb; - - if (!cpu_feature_enabled(X86_FEATURE_IBPB_ON_VMEXIT) && - static_branch_likely(&switch_vcpu_ibpb)) - indirect_branch_prediction_barrier(); - } if (kvm_vcpu_apicv_active(vcpu)) avic_vcpu_load(vcpu, cpu); } @@ -3234,17 +3217,6 @@ static int svm_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr) } /* - * AMD changed the architectural behavior of bits 5:2. On CPUs - * without BusLockTrap, bits 5:2 control "external pins", but - * on CPUs that support BusLockDetect, bit 2 enables BusLockTrap - * and bits 5:3 are reserved-to-zero. Sadly, old KVM allowed - * the guest to set bits 5:2 despite not actually virtualizing - * Performance-Monitoring/Breakpoint external pins. Drop bits - * 5:2 for backwards compatibility. - */ - data &= ~GENMASK(5, 2); - - /* * Suppress BTF as KVM doesn't virtualize BTF, but there's no * way to communicate lack of support to the guest. */ @@ -3374,6 +3346,37 @@ static int invpcid_interception(struct kvm_vcpu *vcpu) return kvm_handle_invpcid(vcpu, type, gva); } +static inline int complete_userspace_buslock(struct kvm_vcpu *vcpu) +{ + struct vcpu_svm *svm = to_svm(vcpu); + + /* + * If userspace has NOT changed RIP, then KVM's ABI is to let the guest + * execute the bus-locking instruction. Set the bus lock counter to '1' + * to effectively step past the bus lock. + */ + if (kvm_is_linear_rip(vcpu, vcpu->arch.cui_linear_rip)) + svm->vmcb->control.bus_lock_counter = 1; + + return 1; +} + +static int bus_lock_exit(struct kvm_vcpu *vcpu) +{ + struct vcpu_svm *svm = to_svm(vcpu); + + vcpu->run->exit_reason = KVM_EXIT_X86_BUS_LOCK; + vcpu->run->flags |= KVM_RUN_X86_BUS_LOCK; + + vcpu->arch.cui_linear_rip = kvm_get_linear_rip(vcpu); + vcpu->arch.complete_userspace_io = complete_userspace_buslock; + + if (is_guest_mode(vcpu)) + svm->nested.ctl.bus_lock_rip = vcpu->arch.cui_linear_rip; + + return 0; +} + static int (*const svm_exit_handlers[])(struct kvm_vcpu *vcpu) = { [SVM_EXIT_READ_CR0] = cr_interception, [SVM_EXIT_READ_CR3] = cr_interception, @@ -3443,6 +3446,7 @@ static int (*const svm_exit_handlers[])(struct kvm_vcpu *vcpu) = { [SVM_EXIT_INVPCID] = invpcid_interception, [SVM_EXIT_IDLE_HLT] = kvm_emulate_halt, [SVM_EXIT_NPF] = npf_interception, + [SVM_EXIT_BUS_LOCK] = bus_lock_exit, [SVM_EXIT_RSM] = rsm_interception, [SVM_EXIT_AVIC_INCOMPLETE_IPI] = avic_incomplete_ipi_interception, [SVM_EXIT_AVIC_UNACCELERATED_ACCESS] = avic_unaccelerated_access_interception, @@ -3457,14 +3461,21 @@ static void dump_vmcb(struct kvm_vcpu *vcpu) struct vmcb_control_area *control = &svm->vmcb->control; struct vmcb_save_area *save = &svm->vmcb->save; struct vmcb_save_area *save01 = &svm->vmcb01.ptr->save; + char *vm_type; if (!dump_invalid_vmcb) { pr_warn_ratelimited("set kvm_amd.dump_invalid_vmcb=1 to dump internal KVM state.\n"); return; } - pr_err("VMCB %p, last attempted VMRUN on CPU %d\n", - svm->current_vmcb->ptr, vcpu->arch.last_vmentry_cpu); + guard(mutex)(&vmcb_dump_mutex); + + vm_type = sev_snp_guest(vcpu->kvm) ? "SEV-SNP" : + sev_es_guest(vcpu->kvm) ? "SEV-ES" : + sev_guest(vcpu->kvm) ? "SEV" : "SVM"; + + pr_err("%s vCPU%u VMCB %p, last attempted VMRUN on CPU %d\n", + vm_type, vcpu->vcpu_id, svm->current_vmcb->ptr, vcpu->arch.last_vmentry_cpu); pr_err("VMCB Control Area:\n"); pr_err("%-20s%04x\n", "cr_read:", control->intercepts[INTERCEPT_CR] & 0xffff); pr_err("%-20s%04x\n", "cr_write:", control->intercepts[INTERCEPT_CR] >> 16); @@ -3502,6 +3513,17 @@ static void dump_vmcb(struct kvm_vcpu *vcpu) pr_err("%-20s%016llx\n", "avic_logical_id:", control->avic_logical_id); pr_err("%-20s%016llx\n", "avic_physical_id:", control->avic_physical_id); pr_err("%-20s%016llx\n", "vmsa_pa:", control->vmsa_pa); + pr_err("%-20s%016llx\n", "allowed_sev_features:", control->allowed_sev_features); + pr_err("%-20s%016llx\n", "guest_sev_features:", control->guest_sev_features); + + if (sev_es_guest(vcpu->kvm)) { + save = sev_decrypt_vmsa(vcpu); + if (!save) + goto no_vmsa; + + save01 = save; + } + pr_err("VMCB State Save Area:\n"); pr_err("%-5s s: %04x a: %04x l: %08x b: %016llx\n", "es:", @@ -3572,6 +3594,63 @@ static void dump_vmcb(struct kvm_vcpu *vcpu) pr_err("%-15s %016llx %-13s %016llx\n", "excp_from:", save->last_excp_from, "excp_to:", save->last_excp_to); + + if (sev_es_guest(vcpu->kvm)) { + struct sev_es_save_area *vmsa = (struct sev_es_save_area *)save; + + pr_err("%-15s %016llx\n", + "sev_features", vmsa->sev_features); + + pr_err("%-15s %016llx %-13s %016llx\n", + "rax:", vmsa->rax, "rbx:", vmsa->rbx); + pr_err("%-15s %016llx %-13s %016llx\n", + "rcx:", vmsa->rcx, "rdx:", vmsa->rdx); + pr_err("%-15s %016llx %-13s %016llx\n", + "rsi:", vmsa->rsi, "rdi:", vmsa->rdi); + pr_err("%-15s %016llx %-13s %016llx\n", + "rbp:", vmsa->rbp, "rsp:", vmsa->rsp); + pr_err("%-15s %016llx %-13s %016llx\n", + "r8:", vmsa->r8, "r9:", vmsa->r9); + pr_err("%-15s %016llx %-13s %016llx\n", + "r10:", vmsa->r10, "r11:", vmsa->r11); + pr_err("%-15s %016llx %-13s %016llx\n", + "r12:", vmsa->r12, "r13:", vmsa->r13); + pr_err("%-15s %016llx %-13s %016llx\n", + "r14:", vmsa->r14, "r15:", vmsa->r15); + pr_err("%-15s %016llx %-13s %016llx\n", + "xcr0:", vmsa->xcr0, "xss:", vmsa->xss); + } else { + pr_err("%-15s %016llx %-13s %016lx\n", + "rax:", save->rax, "rbx:", + vcpu->arch.regs[VCPU_REGS_RBX]); + pr_err("%-15s %016lx %-13s %016lx\n", + "rcx:", vcpu->arch.regs[VCPU_REGS_RCX], + "rdx:", vcpu->arch.regs[VCPU_REGS_RDX]); + pr_err("%-15s %016lx %-13s %016lx\n", + "rsi:", vcpu->arch.regs[VCPU_REGS_RSI], + "rdi:", vcpu->arch.regs[VCPU_REGS_RDI]); + pr_err("%-15s %016lx %-13s %016llx\n", + "rbp:", vcpu->arch.regs[VCPU_REGS_RBP], + "rsp:", save->rsp); +#ifdef CONFIG_X86_64 + pr_err("%-15s %016lx %-13s %016lx\n", + "r8:", vcpu->arch.regs[VCPU_REGS_R8], + "r9:", vcpu->arch.regs[VCPU_REGS_R9]); + pr_err("%-15s %016lx %-13s %016lx\n", + "r10:", vcpu->arch.regs[VCPU_REGS_R10], + "r11:", vcpu->arch.regs[VCPU_REGS_R11]); + pr_err("%-15s %016lx %-13s %016lx\n", + "r12:", vcpu->arch.regs[VCPU_REGS_R12], + "r13:", vcpu->arch.regs[VCPU_REGS_R13]); + pr_err("%-15s %016lx %-13s %016lx\n", + "r14:", vcpu->arch.regs[VCPU_REGS_R14], + "r15:", vcpu->arch.regs[VCPU_REGS_R15]); +#endif + } + +no_vmsa: + if (sev_es_guest(vcpu->kvm)) + sev_free_decrypted_vmsa(vcpu, save); } static bool svm_check_exit_valid(u64 exit_code) @@ -3608,6 +3687,10 @@ int svm_invoke_exit_handler(struct kvm_vcpu *vcpu, u64 exit_code) return kvm_emulate_halt(vcpu); else if (exit_code == SVM_EXIT_NPF) return npf_interception(vcpu); +#ifdef CONFIG_KVM_AMD_SEV + else if (exit_code == SVM_EXIT_VMGEXIT) + return sev_handle_vmgexit(vcpu); +#endif #endif return svm_exit_handlers[exit_code](vcpu); } @@ -5369,6 +5452,9 @@ static __init void svm_set_cpu_caps(void) kvm_cpu_cap_set(X86_FEATURE_SVME_ADDR_CHK); } + if (cpu_feature_enabled(X86_FEATURE_BUS_LOCK_THRESHOLD)) + kvm_caps.has_bus_lock_exit = true; + /* CPUID 0x80000008 */ if (boot_cpu_has(X86_FEATURE_LS_CFG_SSBD) || boot_cpu_has(X86_FEATURE_AMD_SSBD)) diff --git a/arch/x86/kvm/svm/svm.h b/arch/x86/kvm/svm/svm.h index f16b068c4228..e6f3c6a153a0 100644 --- a/arch/x86/kvm/svm/svm.h +++ b/arch/x86/kvm/svm/svm.h @@ -98,6 +98,7 @@ struct kvm_sev_info { unsigned int asid; /* ASID used for this guest */ unsigned int handle; /* SEV firmware handle */ int fd; /* SEV device fd */ + unsigned long policy; unsigned long pages_locked; /* Number of pages locked */ struct list_head regions_list; /* List of registered regions */ u64 ap_jump_table; /* SEV-ES AP Jump Table address */ @@ -114,6 +115,9 @@ struct kvm_sev_info { struct mutex guest_req_mutex; /* Must acquire before using bounce buffers */ }; +#define SEV_POLICY_NODBG BIT_ULL(0) +#define SNP_POLICY_DEBUG BIT_ULL(19) + struct kvm_svm { struct kvm kvm; @@ -169,6 +173,7 @@ struct vmcb_ctrl_area_cached { u64 nested_cr3; u64 virt_ext; u32 clean; + u64 bus_lock_rip; union { #if IS_ENABLED(CONFIG_HYPERV) || IS_ENABLED(CONFIG_KVM_HYPERV) struct hv_vmcb_enlightenments hv_enlightenments; @@ -340,8 +345,6 @@ struct svm_cpu_data { struct vmcb *save_area; unsigned long save_area_pa; - struct vmcb *current_vmcb; - /* index = sev_asid, value = vmcb pointer */ struct vmcb **sev_vmcbs; }; @@ -785,6 +788,8 @@ void sev_snp_init_protected_guest_state(struct kvm_vcpu *vcpu); int sev_gmem_prepare(struct kvm *kvm, kvm_pfn_t pfn, gfn_t gfn, int max_order); void sev_gmem_invalidate(kvm_pfn_t start, kvm_pfn_t end); int sev_private_max_mapping_level(struct kvm *kvm, kvm_pfn_t pfn); +struct vmcb_save_area *sev_decrypt_vmsa(struct kvm_vcpu *vcpu); +void sev_free_decrypted_vmsa(struct kvm_vcpu *vcpu, struct vmcb_save_area *vmsa); #else static inline struct page *snp_safe_alloc_page_node(int node, gfp_t gfp) { @@ -816,6 +821,11 @@ static inline int sev_private_max_mapping_level(struct kvm *kvm, kvm_pfn_t pfn) return 0; } +static inline struct vmcb_save_area *sev_decrypt_vmsa(struct kvm_vcpu *vcpu) +{ + return NULL; +} +static inline void sev_free_decrypted_vmsa(struct kvm_vcpu *vcpu, struct vmcb_save_area *vmsa) {} #endif /* vmenter.S */ diff --git a/arch/x86/kvm/vmx/common.h b/arch/x86/kvm/vmx/common.h index 8f46a06e2c44..a0c5e8781c33 100644 --- a/arch/x86/kvm/vmx/common.h +++ b/arch/x86/kvm/vmx/common.h @@ -71,8 +71,8 @@ static __always_inline bool is_td_vcpu(struct kvm_vcpu *vcpu) #else -static inline bool is_td(struct kvm *kvm) { return false; } -static inline bool is_td_vcpu(struct kvm_vcpu *vcpu) { return false; } +static __always_inline bool is_td(struct kvm *kvm) { return false; } +static __always_inline bool is_td_vcpu(struct kvm_vcpu *vcpu) { return false; } #endif diff --git a/arch/x86/kvm/vmx/main.c b/arch/x86/kvm/vmx/main.c index 94d5d907d37b..d1e02e567b57 100644 --- a/arch/x86/kvm/vmx/main.c +++ b/arch/x86/kvm/vmx/main.c @@ -12,7 +12,6 @@ #ifdef CONFIG_KVM_INTEL_TDX static_assert(offsetof(struct vcpu_vmx, vt) == offsetof(struct vcpu_tdx, vt)); -#endif static void vt_disable_virtualization_cpu(void) { @@ -240,7 +239,7 @@ static int vt_complete_emulated_msr(struct kvm_vcpu *vcpu, int err) if (is_td_vcpu(vcpu)) return tdx_complete_emulated_msr(vcpu, err); - return kvm_complete_insn_gp(vcpu, err); + return vmx_complete_emulated_msr(vcpu, err); } #ifdef CONFIG_KVM_SMM @@ -315,14 +314,6 @@ static void vt_set_virtual_apic_mode(struct kvm_vcpu *vcpu) return vmx_set_virtual_apic_mode(vcpu); } -static void vt_apicv_pre_state_restore(struct kvm_vcpu *vcpu) -{ - struct pi_desc *pi = vcpu_to_pi_desc(vcpu); - - pi_clear_on(pi); - memset(pi->pir, 0, sizeof(pi->pir)); -} - static void vt_hwapic_isr_update(struct kvm_vcpu *vcpu, int max_isr) { if (is_td_vcpu(vcpu)) @@ -888,6 +879,13 @@ static int vt_gmem_private_max_mapping_level(struct kvm *kvm, kvm_pfn_t pfn) return 0; } +#define vt_op(name) vt_##name +#define vt_op_tdx_only(name) vt_##name +#else /* CONFIG_KVM_INTEL_TDX */ +#define vt_op(name) vmx_##name +#define vt_op_tdx_only(name) NULL +#endif /* CONFIG_KVM_INTEL_TDX */ + #define VMX_REQUIRED_APICV_INHIBITS \ (BIT(APICV_INHIBIT_REASON_DISABLED) | \ BIT(APICV_INHIBIT_REASON_ABSENT) | \ @@ -905,113 +903,113 @@ struct kvm_x86_ops vt_x86_ops __initdata = { .hardware_unsetup = vmx_hardware_unsetup, .enable_virtualization_cpu = vmx_enable_virtualization_cpu, - .disable_virtualization_cpu = vt_disable_virtualization_cpu, + .disable_virtualization_cpu = vt_op(disable_virtualization_cpu), .emergency_disable_virtualization_cpu = vmx_emergency_disable_virtualization_cpu, - .has_emulated_msr = vt_has_emulated_msr, + .has_emulated_msr = vt_op(has_emulated_msr), .vm_size = sizeof(struct kvm_vmx), - .vm_init = vt_vm_init, - .vm_pre_destroy = vt_vm_pre_destroy, - .vm_destroy = vt_vm_destroy, + .vm_init = vt_op(vm_init), + .vm_destroy = vt_op(vm_destroy), + .vm_pre_destroy = vt_op_tdx_only(vm_pre_destroy), - .vcpu_precreate = vt_vcpu_precreate, - .vcpu_create = vt_vcpu_create, - .vcpu_free = vt_vcpu_free, - .vcpu_reset = vt_vcpu_reset, + .vcpu_precreate = vt_op(vcpu_precreate), + .vcpu_create = vt_op(vcpu_create), + .vcpu_free = vt_op(vcpu_free), + .vcpu_reset = vt_op(vcpu_reset), - .prepare_switch_to_guest = vt_prepare_switch_to_guest, - .vcpu_load = vt_vcpu_load, - .vcpu_put = vt_vcpu_put, + .prepare_switch_to_guest = vt_op(prepare_switch_to_guest), + .vcpu_load = vt_op(vcpu_load), + .vcpu_put = vt_op(vcpu_put), - .update_exception_bitmap = vt_update_exception_bitmap, + .update_exception_bitmap = vt_op(update_exception_bitmap), .get_feature_msr = vmx_get_feature_msr, - .get_msr = vt_get_msr, - .set_msr = vt_set_msr, - - .get_segment_base = vt_get_segment_base, - .get_segment = vt_get_segment, - .set_segment = vt_set_segment, - .get_cpl = vt_get_cpl, - .get_cpl_no_cache = vt_get_cpl_no_cache, - .get_cs_db_l_bits = vt_get_cs_db_l_bits, - .is_valid_cr0 = vt_is_valid_cr0, - .set_cr0 = vt_set_cr0, - .is_valid_cr4 = vt_is_valid_cr4, - .set_cr4 = vt_set_cr4, - .set_efer = vt_set_efer, - .get_idt = vt_get_idt, - .set_idt = vt_set_idt, - .get_gdt = vt_get_gdt, - .set_gdt = vt_set_gdt, - .set_dr6 = vt_set_dr6, - .set_dr7 = vt_set_dr7, - .sync_dirty_debug_regs = vt_sync_dirty_debug_regs, - .cache_reg = vt_cache_reg, - .get_rflags = vt_get_rflags, - .set_rflags = vt_set_rflags, - .get_if_flag = vt_get_if_flag, - - .flush_tlb_all = vt_flush_tlb_all, - .flush_tlb_current = vt_flush_tlb_current, - .flush_tlb_gva = vt_flush_tlb_gva, - .flush_tlb_guest = vt_flush_tlb_guest, - - .vcpu_pre_run = vt_vcpu_pre_run, - .vcpu_run = vt_vcpu_run, - .handle_exit = vt_handle_exit, + .get_msr = vt_op(get_msr), + .set_msr = vt_op(set_msr), + + .get_segment_base = vt_op(get_segment_base), + .get_segment = vt_op(get_segment), + .set_segment = vt_op(set_segment), + .get_cpl = vt_op(get_cpl), + .get_cpl_no_cache = vt_op(get_cpl_no_cache), + .get_cs_db_l_bits = vt_op(get_cs_db_l_bits), + .is_valid_cr0 = vt_op(is_valid_cr0), + .set_cr0 = vt_op(set_cr0), + .is_valid_cr4 = vt_op(is_valid_cr4), + .set_cr4 = vt_op(set_cr4), + .set_efer = vt_op(set_efer), + .get_idt = vt_op(get_idt), + .set_idt = vt_op(set_idt), + .get_gdt = vt_op(get_gdt), + .set_gdt = vt_op(set_gdt), + .set_dr6 = vt_op(set_dr6), + .set_dr7 = vt_op(set_dr7), + .sync_dirty_debug_regs = vt_op(sync_dirty_debug_regs), + .cache_reg = vt_op(cache_reg), + .get_rflags = vt_op(get_rflags), + .set_rflags = vt_op(set_rflags), + .get_if_flag = vt_op(get_if_flag), + + .flush_tlb_all = vt_op(flush_tlb_all), + .flush_tlb_current = vt_op(flush_tlb_current), + .flush_tlb_gva = vt_op(flush_tlb_gva), + .flush_tlb_guest = vt_op(flush_tlb_guest), + + .vcpu_pre_run = vt_op(vcpu_pre_run), + .vcpu_run = vt_op(vcpu_run), + .handle_exit = vt_op(handle_exit), .skip_emulated_instruction = vmx_skip_emulated_instruction, .update_emulated_instruction = vmx_update_emulated_instruction, - .set_interrupt_shadow = vt_set_interrupt_shadow, - .get_interrupt_shadow = vt_get_interrupt_shadow, - .patch_hypercall = vt_patch_hypercall, - .inject_irq = vt_inject_irq, - .inject_nmi = vt_inject_nmi, - .inject_exception = vt_inject_exception, - .cancel_injection = vt_cancel_injection, - .interrupt_allowed = vt_interrupt_allowed, - .nmi_allowed = vt_nmi_allowed, - .get_nmi_mask = vt_get_nmi_mask, - .set_nmi_mask = vt_set_nmi_mask, - .enable_nmi_window = vt_enable_nmi_window, - .enable_irq_window = vt_enable_irq_window, - .update_cr8_intercept = vt_update_cr8_intercept, + .set_interrupt_shadow = vt_op(set_interrupt_shadow), + .get_interrupt_shadow = vt_op(get_interrupt_shadow), + .patch_hypercall = vt_op(patch_hypercall), + .inject_irq = vt_op(inject_irq), + .inject_nmi = vt_op(inject_nmi), + .inject_exception = vt_op(inject_exception), + .cancel_injection = vt_op(cancel_injection), + .interrupt_allowed = vt_op(interrupt_allowed), + .nmi_allowed = vt_op(nmi_allowed), + .get_nmi_mask = vt_op(get_nmi_mask), + .set_nmi_mask = vt_op(set_nmi_mask), + .enable_nmi_window = vt_op(enable_nmi_window), + .enable_irq_window = vt_op(enable_irq_window), + .update_cr8_intercept = vt_op(update_cr8_intercept), .x2apic_icr_is_split = false, - .set_virtual_apic_mode = vt_set_virtual_apic_mode, - .set_apic_access_page_addr = vt_set_apic_access_page_addr, - .refresh_apicv_exec_ctrl = vt_refresh_apicv_exec_ctrl, - .load_eoi_exitmap = vt_load_eoi_exitmap, - .apicv_pre_state_restore = vt_apicv_pre_state_restore, + .set_virtual_apic_mode = vt_op(set_virtual_apic_mode), + .set_apic_access_page_addr = vt_op(set_apic_access_page_addr), + .refresh_apicv_exec_ctrl = vt_op(refresh_apicv_exec_ctrl), + .load_eoi_exitmap = vt_op(load_eoi_exitmap), + .apicv_pre_state_restore = pi_apicv_pre_state_restore, .required_apicv_inhibits = VMX_REQUIRED_APICV_INHIBITS, - .hwapic_isr_update = vt_hwapic_isr_update, - .sync_pir_to_irr = vt_sync_pir_to_irr, - .deliver_interrupt = vt_deliver_interrupt, + .hwapic_isr_update = vt_op(hwapic_isr_update), + .sync_pir_to_irr = vt_op(sync_pir_to_irr), + .deliver_interrupt = vt_op(deliver_interrupt), .dy_apicv_has_pending_interrupt = pi_has_pending_interrupt, - .set_tss_addr = vt_set_tss_addr, - .set_identity_map_addr = vt_set_identity_map_addr, + .set_tss_addr = vt_op(set_tss_addr), + .set_identity_map_addr = vt_op(set_identity_map_addr), .get_mt_mask = vmx_get_mt_mask, - .get_exit_info = vt_get_exit_info, - .get_entry_info = vt_get_entry_info, + .get_exit_info = vt_op(get_exit_info), + .get_entry_info = vt_op(get_entry_info), - .vcpu_after_set_cpuid = vt_vcpu_after_set_cpuid, + .vcpu_after_set_cpuid = vt_op(vcpu_after_set_cpuid), .has_wbinvd_exit = cpu_has_vmx_wbinvd_exit, - .get_l2_tsc_offset = vt_get_l2_tsc_offset, - .get_l2_tsc_multiplier = vt_get_l2_tsc_multiplier, - .write_tsc_offset = vt_write_tsc_offset, - .write_tsc_multiplier = vt_write_tsc_multiplier, + .get_l2_tsc_offset = vt_op(get_l2_tsc_offset), + .get_l2_tsc_multiplier = vt_op(get_l2_tsc_multiplier), + .write_tsc_offset = vt_op(write_tsc_offset), + .write_tsc_multiplier = vt_op(write_tsc_multiplier), - .load_mmu_pgd = vt_load_mmu_pgd, + .load_mmu_pgd = vt_op(load_mmu_pgd), .check_intercept = vmx_check_intercept, .handle_exit_irqoff = vmx_handle_exit_irqoff, - .update_cpu_dirty_logging = vt_update_cpu_dirty_logging, + .update_cpu_dirty_logging = vt_op(update_cpu_dirty_logging), .nested_ops = &vmx_nested_ops, @@ -1019,38 +1017,38 @@ struct kvm_x86_ops vt_x86_ops __initdata = { .pi_start_assignment = vmx_pi_start_assignment, #ifdef CONFIG_X86_64 - .set_hv_timer = vt_set_hv_timer, - .cancel_hv_timer = vt_cancel_hv_timer, + .set_hv_timer = vt_op(set_hv_timer), + .cancel_hv_timer = vt_op(cancel_hv_timer), #endif - .setup_mce = vt_setup_mce, + .setup_mce = vt_op(setup_mce), #ifdef CONFIG_KVM_SMM - .smi_allowed = vt_smi_allowed, - .enter_smm = vt_enter_smm, - .leave_smm = vt_leave_smm, - .enable_smi_window = vt_enable_smi_window, + .smi_allowed = vt_op(smi_allowed), + .enter_smm = vt_op(enter_smm), + .leave_smm = vt_op(leave_smm), + .enable_smi_window = vt_op(enable_smi_window), #endif - .check_emulate_instruction = vt_check_emulate_instruction, - .apic_init_signal_blocked = vt_apic_init_signal_blocked, + .check_emulate_instruction = vt_op(check_emulate_instruction), + .apic_init_signal_blocked = vt_op(apic_init_signal_blocked), .migrate_timers = vmx_migrate_timers, - .msr_filter_changed = vt_msr_filter_changed, - .complete_emulated_msr = vt_complete_emulated_msr, + .msr_filter_changed = vt_op(msr_filter_changed), + .complete_emulated_msr = vt_op(complete_emulated_msr), .vcpu_deliver_sipi_vector = kvm_vcpu_deliver_sipi_vector, .get_untagged_addr = vmx_get_untagged_addr, - .mem_enc_ioctl = vt_mem_enc_ioctl, - .vcpu_mem_enc_ioctl = vt_vcpu_mem_enc_ioctl, + .mem_enc_ioctl = vt_op_tdx_only(mem_enc_ioctl), + .vcpu_mem_enc_ioctl = vt_op_tdx_only(vcpu_mem_enc_ioctl), - .private_max_mapping_level = vt_gmem_private_max_mapping_level + .private_max_mapping_level = vt_op_tdx_only(gmem_private_max_mapping_level) }; struct kvm_x86_init_ops vt_init_ops __initdata = { - .hardware_setup = vt_hardware_setup, + .hardware_setup = vt_op(hardware_setup), .handle_intel_pt_intr = NULL, .runtime_ops = &vt_x86_ops, diff --git a/arch/x86/kvm/vmx/nested.c b/arch/x86/kvm/vmx/nested.c index e073e3008b16..b50f9c894ab8 100644 --- a/arch/x86/kvm/vmx/nested.c +++ b/arch/x86/kvm/vmx/nested.c @@ -301,7 +301,7 @@ static void vmx_switch_vmcs(struct kvm_vcpu *vcpu, struct loaded_vmcs *vmcs) cpu = get_cpu(); prev = vmx->loaded_vmcs; vmx->loaded_vmcs = vmcs; - vmx_vcpu_load_vmcs(vcpu, cpu, prev); + vmx_vcpu_load_vmcs(vcpu, cpu); vmx_sync_vmcs_host_state(vmx, prev); put_cpu(); @@ -824,12 +824,30 @@ static int nested_vmx_check_apicv_controls(struct kvm_vcpu *vcpu, return 0; } +static u32 nested_vmx_max_atomic_switch_msrs(struct kvm_vcpu *vcpu) +{ + struct vcpu_vmx *vmx = to_vmx(vcpu); + u64 vmx_misc = vmx_control_msr(vmx->nested.msrs.misc_low, + vmx->nested.msrs.misc_high); + + return (vmx_misc_max_msr(vmx_misc) + 1) * VMX_MISC_MSR_LIST_MULTIPLIER; +} + static int nested_vmx_check_msr_switch(struct kvm_vcpu *vcpu, u32 count, u64 addr) { if (count == 0) return 0; + /* + * Exceeding the limit results in architecturally _undefined_ behavior, + * i.e. KVM is allowed to do literally anything in response to a bad + * limit. Immediately generate a consistency check so that code that + * consumes the count doesn't need to worry about extreme edge cases. + */ + if (count > nested_vmx_max_atomic_switch_msrs(vcpu)) + return -EINVAL; + if (!kvm_vcpu_is_legal_aligned_gpa(vcpu, addr, 16) || !kvm_vcpu_is_legal_gpa(vcpu, (addr + count * sizeof(struct vmx_msr_entry) - 1))) return -EINVAL; @@ -940,15 +958,6 @@ static int nested_vmx_store_msr_check(struct kvm_vcpu *vcpu, return 0; } -static u32 nested_vmx_max_atomic_switch_msrs(struct kvm_vcpu *vcpu) -{ - struct vcpu_vmx *vmx = to_vmx(vcpu); - u64 vmx_misc = vmx_control_msr(vmx->nested.msrs.misc_low, - vmx->nested.msrs.misc_high); - - return (vmx_misc_max_msr(vmx_misc) + 1) * VMX_MISC_MSR_LIST_MULTIPLIER; -} - /* * Load guest's/host's msr at nested entry/exit. * return 0 for success, entry index for failure. @@ -965,7 +974,7 @@ static u32 nested_vmx_load_msr(struct kvm_vcpu *vcpu, u64 gpa, u32 count) u32 max_msr_list_size = nested_vmx_max_atomic_switch_msrs(vcpu); for (i = 0; i < count; i++) { - if (unlikely(i >= max_msr_list_size)) + if (WARN_ON_ONCE(i >= max_msr_list_size)) goto fail; if (kvm_vcpu_read_guest(vcpu, gpa + i * sizeof(e), @@ -1053,7 +1062,7 @@ static int nested_vmx_store_msr(struct kvm_vcpu *vcpu, u64 gpa, u32 count) u32 max_msr_list_size = nested_vmx_max_atomic_switch_msrs(vcpu); for (i = 0; i < count; i++) { - if (unlikely(i >= max_msr_list_size)) + if (WARN_ON_ONCE(i >= max_msr_list_size)) return -EINVAL; if (!read_and_check_msr_entry(vcpu, gpa, i, &e)) @@ -4520,12 +4529,12 @@ static void copy_vmcs02_to_vmcs12_rare(struct kvm_vcpu *vcpu, cpu = get_cpu(); vmx->loaded_vmcs = &vmx->nested.vmcs02; - vmx_vcpu_load_vmcs(vcpu, cpu, &vmx->vmcs01); + vmx_vcpu_load_vmcs(vcpu, cpu); sync_vmcs02_to_vmcs12_rare(vcpu, vmcs12); vmx->loaded_vmcs = &vmx->vmcs01; - vmx_vcpu_load_vmcs(vcpu, cpu, &vmx->nested.vmcs02); + vmx_vcpu_load_vmcs(vcpu, cpu); put_cpu(); } @@ -5020,16 +5029,7 @@ void __nested_vmx_vmexit(struct kvm_vcpu *vcpu, u32 vm_exit_reason, vmx_switch_vmcs(vcpu, &vmx->vmcs01); - /* - * If IBRS is advertised to the vCPU, KVM must flush the indirect - * branch predictors when transitioning from L2 to L1, as L1 expects - * hardware (KVM in this case) to provide separate predictor modes. - * Bare metal isolates VMX root (host) from VMX non-root (guest), but - * doesn't isolate different VMCSs, i.e. in this case, doesn't provide - * separate modes for L2 vs L1. - */ - if (guest_cpu_cap_has(vcpu, X86_FEATURE_SPEC_CTRL)) - indirect_branch_prediction_barrier(); + kvm_nested_vmexit_handle_ibrs(vcpu); /* Update any VMCS fields that might have changed while L2 ran */ vmcs_write32(VM_EXIT_MSR_LOAD_COUNT, vmx->msr_autoload.host.nr); diff --git a/arch/x86/kvm/vmx/posted_intr.c b/arch/x86/kvm/vmx/posted_intr.c index 99d1d599ff8c..5c615e5845bf 100644 --- a/arch/x86/kvm/vmx/posted_intr.c +++ b/arch/x86/kvm/vmx/posted_intr.c @@ -34,7 +34,7 @@ static DEFINE_PER_CPU(raw_spinlock_t, wakeup_vcpus_on_cpu_lock); #define PI_LOCK_SCHED_OUT SINGLE_DEPTH_NESTING -struct pi_desc *vcpu_to_pi_desc(struct kvm_vcpu *vcpu) +static struct pi_desc *vcpu_to_pi_desc(struct kvm_vcpu *vcpu) { return &(to_vt(vcpu)->pi_desc); } @@ -148,9 +148,8 @@ after_clear_sn: static bool vmx_can_use_vtd_pi(struct kvm *kvm) { - return irqchip_in_kernel(kvm) && enable_apicv && - kvm_arch_has_assigned_device(kvm) && - irq_remapping_cap(IRQ_POSTING_CAP); + return irqchip_in_kernel(kvm) && kvm_arch_has_irq_bypass() && + kvm_arch_has_assigned_device(kvm); } /* @@ -264,6 +263,14 @@ void __init pi_init_cpu(int cpu) raw_spin_lock_init(&per_cpu(wakeup_vcpus_on_cpu_lock, cpu)); } +void pi_apicv_pre_state_restore(struct kvm_vcpu *vcpu) +{ + struct pi_desc *pi = vcpu_to_pi_desc(vcpu); + + pi_clear_on(pi); + memset(pi->pir, 0, sizeof(pi->pir)); +} + bool pi_has_pending_interrupt(struct kvm_vcpu *vcpu) { struct pi_desc *pi_desc = vcpu_to_pi_desc(vcpu); @@ -281,7 +288,7 @@ bool pi_has_pending_interrupt(struct kvm_vcpu *vcpu) */ void vmx_pi_start_assignment(struct kvm *kvm) { - if (!irq_remapping_cap(IRQ_POSTING_CAP)) + if (!kvm_arch_has_irq_bypass()) return; kvm_make_all_cpus_request(kvm, KVM_REQ_UNBLOCK); diff --git a/arch/x86/kvm/vmx/posted_intr.h b/arch/x86/kvm/vmx/posted_intr.h index 68605ca7ef68..80499ea0e674 100644 --- a/arch/x86/kvm/vmx/posted_intr.h +++ b/arch/x86/kvm/vmx/posted_intr.h @@ -5,12 +5,11 @@ #include <linux/bitmap.h> #include <asm/posted_intr.h> -struct pi_desc *vcpu_to_pi_desc(struct kvm_vcpu *vcpu); - void vmx_vcpu_pi_load(struct kvm_vcpu *vcpu, int cpu); void vmx_vcpu_pi_put(struct kvm_vcpu *vcpu); void pi_wakeup_handler(void); void __init pi_init_cpu(int cpu); +void pi_apicv_pre_state_restore(struct kvm_vcpu *vcpu); bool pi_has_pending_interrupt(struct kvm_vcpu *vcpu); int vmx_pi_update_irte(struct kvm *kvm, unsigned int host_irq, uint32_t guest_irq, bool set); @@ -20,7 +19,7 @@ static inline int pi_find_highest_vector(struct pi_desc *pi_desc) { int vec; - vec = find_last_bit((unsigned long *)pi_desc->pir, 256); + vec = find_last_bit(pi_desc->pir, 256); return vec < 256 ? vec : -1; } diff --git a/arch/x86/kvm/vmx/vmenter.S b/arch/x86/kvm/vmx/vmenter.S index f6986dee6f8c..0a6cf5bff2aa 100644 --- a/arch/x86/kvm/vmx/vmenter.S +++ b/arch/x86/kvm/vmx/vmenter.S @@ -59,8 +59,7 @@ * without the explicit restore, thinks the stack is getting walloped. * Using an unwind hint is problematic due to x86-64's dynamic alignment. */ - mov %_ASM_BP, %_ASM_SP - pop %_ASM_BP + leave RET .endm diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c index ef2d7208dd20..9ff00ae9f05a 100644 --- a/arch/x86/kvm/vmx/vmx.c +++ b/arch/x86/kvm/vmx/vmx.c @@ -116,6 +116,8 @@ module_param(enable_apicv, bool, 0444); bool __read_mostly enable_ipiv = true; module_param(enable_ipiv, bool, 0444); +module_param(enable_device_posted_irqs, bool, 0444); + /* * If nested=1, nested virtualization is supported, i.e., guests may use * VMX and be a hypervisor for its own guests. If nested=0, guests may not @@ -770,8 +772,11 @@ void vmx_emergency_disable_virtualization_cpu(void) return; list_for_each_entry(v, &per_cpu(loaded_vmcss_on_cpu, cpu), - loaded_vmcss_on_cpu_link) + loaded_vmcss_on_cpu_link) { vmcs_clear(v->vmcs); + if (v->shadow_vmcs) + vmcs_clear(v->shadow_vmcs); + } kvm_cpu_vmxoff(); } @@ -1443,8 +1448,7 @@ static void shrink_ple_window(struct kvm_vcpu *vcpu) } } -void vmx_vcpu_load_vmcs(struct kvm_vcpu *vcpu, int cpu, - struct loaded_vmcs *buddy) +void vmx_vcpu_load_vmcs(struct kvm_vcpu *vcpu, int cpu) { struct vcpu_vmx *vmx = to_vmx(vcpu); bool already_loaded = vmx->loaded_vmcs->cpu == cpu; @@ -1471,17 +1475,6 @@ void vmx_vcpu_load_vmcs(struct kvm_vcpu *vcpu, int cpu, if (prev != vmx->loaded_vmcs->vmcs) { per_cpu(current_vmcs, cpu) = vmx->loaded_vmcs->vmcs; vmcs_load(vmx->loaded_vmcs->vmcs); - - /* - * No indirect branch prediction barrier needed when switching - * the active VMCS within a vCPU, unless IBRS is advertised to - * the vCPU. To minimize the number of IBPBs executed, KVM - * performs IBPB on nested VM-Exit (a single nested transition - * may switch the active VMCS multiple times). - */ - if (static_branch_likely(&switch_vcpu_ibpb) && - (!buddy || WARN_ON_ONCE(buddy->vmcs != prev))) - indirect_branch_prediction_barrier(); } if (!already_loaded) { @@ -1520,7 +1513,7 @@ void vmx_vcpu_load(struct kvm_vcpu *vcpu, int cpu) if (vcpu->scheduled_out && !kvm_pause_in_guest(vcpu->kvm)) shrink_ple_window(vcpu); - vmx_vcpu_load_vmcs(vcpu, cpu, NULL); + vmx_vcpu_load_vmcs(vcpu, cpu); vmx_vcpu_pi_load(vcpu, cpu); } diff --git a/arch/x86/kvm/vmx/vmx.h b/arch/x86/kvm/vmx/vmx.h index 6d1e40ecc024..b5758c33c60f 100644 --- a/arch/x86/kvm/vmx/vmx.h +++ b/arch/x86/kvm/vmx/vmx.h @@ -354,8 +354,7 @@ static __always_inline u32 vmx_get_intr_info(struct kvm_vcpu *vcpu) return vt->exit_intr_info; } -void vmx_vcpu_load_vmcs(struct kvm_vcpu *vcpu, int cpu, - struct loaded_vmcs *buddy); +void vmx_vcpu_load_vmcs(struct kvm_vcpu *vcpu, int cpu); int allocate_vpid(void); void free_vpid(int vpid); void vmx_set_constant_host_state(struct vcpu_vmx *vmx); diff --git a/arch/x86/kvm/vmx/x86_ops.h b/arch/x86/kvm/vmx/x86_ops.h index 6bf8be570b2e..b4596f651232 100644 --- a/arch/x86/kvm/vmx/x86_ops.h +++ b/arch/x86/kvm/vmx/x86_ops.h @@ -57,6 +57,7 @@ void vmx_prepare_switch_to_guest(struct kvm_vcpu *vcpu); void vmx_update_exception_bitmap(struct kvm_vcpu *vcpu); int vmx_get_feature_msr(u32 msr, u64 *data); int vmx_get_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info); +#define vmx_complete_emulated_msr kvm_complete_insn_gp u64 vmx_get_segment_base(struct kvm_vcpu *vcpu, int seg); void vmx_get_segment(struct kvm_vcpu *vcpu, struct kvm_segment *var, int seg); void vmx_set_segment(struct kvm_vcpu *vcpu, struct kvm_segment *var, int seg); @@ -163,71 +164,6 @@ void tdx_flush_tlb_current(struct kvm_vcpu *vcpu); void tdx_flush_tlb_all(struct kvm_vcpu *vcpu); void tdx_load_mmu_pgd(struct kvm_vcpu *vcpu, hpa_t root_hpa, int root_level); int tdx_gmem_private_max_mapping_level(struct kvm *kvm, kvm_pfn_t pfn); -#else -static inline void tdx_disable_virtualization_cpu(void) {} -static inline int tdx_vm_init(struct kvm *kvm) { return -EOPNOTSUPP; } -static inline void tdx_mmu_release_hkid(struct kvm *kvm) {} -static inline void tdx_vm_destroy(struct kvm *kvm) {} -static inline int tdx_vm_ioctl(struct kvm *kvm, void __user *argp) { return -EOPNOTSUPP; } - -static inline int tdx_vcpu_create(struct kvm_vcpu *vcpu) { return -EOPNOTSUPP; } -static inline void tdx_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event) {} -static inline void tdx_vcpu_free(struct kvm_vcpu *vcpu) {} -static inline void tdx_vcpu_load(struct kvm_vcpu *vcpu, int cpu) {} -static inline int tdx_vcpu_pre_run(struct kvm_vcpu *vcpu) { return -EOPNOTSUPP; } -static inline fastpath_t tdx_vcpu_run(struct kvm_vcpu *vcpu, bool force_immediate_exit) -{ - return EXIT_FASTPATH_NONE; -} -static inline void tdx_prepare_switch_to_guest(struct kvm_vcpu *vcpu) {} -static inline void tdx_vcpu_put(struct kvm_vcpu *vcpu) {} -static inline bool tdx_protected_apic_has_interrupt(struct kvm_vcpu *vcpu) { return false; } -static inline int tdx_handle_exit(struct kvm_vcpu *vcpu, - enum exit_fastpath_completion fastpath) { return 0; } - -static inline void tdx_deliver_interrupt(struct kvm_lapic *apic, int delivery_mode, - int trig_mode, int vector) {} -static inline void tdx_inject_nmi(struct kvm_vcpu *vcpu) {} -static inline void tdx_get_exit_info(struct kvm_vcpu *vcpu, u32 *reason, u64 *info1, - u64 *info2, u32 *intr_info, u32 *error_code) {} -static inline bool tdx_has_emulated_msr(u32 index) { return false; } -static inline int tdx_get_msr(struct kvm_vcpu *vcpu, struct msr_data *msr) { return 1; } -static inline int tdx_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr) { return 1; } - -static inline int tdx_vcpu_ioctl(struct kvm_vcpu *vcpu, void __user *argp) { return -EOPNOTSUPP; } - -static inline int tdx_sept_link_private_spt(struct kvm *kvm, gfn_t gfn, - enum pg_level level, - void *private_spt) -{ - return -EOPNOTSUPP; -} - -static inline int tdx_sept_free_private_spt(struct kvm *kvm, gfn_t gfn, - enum pg_level level, - void *private_spt) -{ - return -EOPNOTSUPP; -} - -static inline int tdx_sept_set_private_spte(struct kvm *kvm, gfn_t gfn, - enum pg_level level, - kvm_pfn_t pfn) -{ - return -EOPNOTSUPP; -} - -static inline int tdx_sept_remove_private_spte(struct kvm *kvm, gfn_t gfn, - enum pg_level level, - kvm_pfn_t pfn) -{ - return -EOPNOTSUPP; -} - -static inline void tdx_flush_tlb_current(struct kvm_vcpu *vcpu) {} -static inline void tdx_flush_tlb_all(struct kvm_vcpu *vcpu) {} -static inline void tdx_load_mmu_pgd(struct kvm_vcpu *vcpu, hpa_t root_hpa, int root_level) {} -static inline int tdx_gmem_private_max_mapping_level(struct kvm *kvm, kvm_pfn_t pfn) { return 0; } #endif #endif /* __KVM_X86_VMX_X86_OPS_H */ diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index cf73d58150ab..dd34a2ec854c 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -226,6 +226,9 @@ EXPORT_SYMBOL_GPL(allow_smaller_maxphyaddr); bool __read_mostly enable_apicv = true; EXPORT_SYMBOL_GPL(enable_apicv); +bool __read_mostly enable_device_posted_irqs = true; +EXPORT_SYMBOL_GPL(enable_device_posted_irqs); + const struct _kvm_stats_desc kvm_vm_stats_desc[] = { KVM_GENERIC_VM_STATS(), STATS_DESC_COUNTER(VM, mmu_shadow_zapped), @@ -4990,6 +4993,8 @@ static bool need_emulate_wbinvd(struct kvm_vcpu *vcpu) return kvm_arch_has_noncoherent_dma(vcpu->kvm); } +static DEFINE_PER_CPU(struct kvm_vcpu *, last_vcpu); + void kvm_arch_vcpu_load(struct kvm_vcpu *vcpu, int cpu) { struct kvm_pmu *pmu = vcpu_to_pmu(vcpu); @@ -5012,6 +5017,19 @@ void kvm_arch_vcpu_load(struct kvm_vcpu *vcpu, int cpu) kvm_x86_call(vcpu_load)(vcpu, cpu); + if (vcpu != per_cpu(last_vcpu, cpu)) { + /* + * Flush the branch predictor when switching vCPUs on the same + * physical CPU, as each vCPU needs its own branch prediction + * domain. No IBPB is needed when switching between L1 and L2 + * on the same vCPU unless IBRS is advertised to the vCPU; that + * is handled on the nested VM-Exit path. + */ + if (static_branch_likely(&switch_vcpu_ibpb)) + indirect_branch_prediction_barrier(); + per_cpu(last_vcpu, cpu) = vcpu; + } + /* Save host pkru register if supported */ vcpu->arch.host_pkru = read_pkru(); @@ -7326,10 +7344,13 @@ set_pit2_out: r = READ_ONCE(kvm->arch.default_tsc_khz); goto out; } - case KVM_MEMORY_ENCRYPT_OP: { + case KVM_MEMORY_ENCRYPT_OP: + r = -ENOTTY; + if (!kvm_x86_ops.mem_enc_ioctl) + goto out; + r = kvm_x86_call(mem_enc_ioctl)(kvm, argp); break; - } case KVM_MEMORY_ENCRYPT_REG_REGION: { struct kvm_enc_region region; @@ -8023,7 +8044,7 @@ static int emulator_read_write(struct x86_emulate_ctxt *ctxt, return rc; if (!vcpu->mmio_nr_fragments) - return rc; + return X86EMUL_CONTINUE; gpa = vcpu->mmio_fragments[0].gpa; @@ -9361,7 +9382,7 @@ static int complete_fast_pio_out(struct kvm_vcpu *vcpu) { vcpu->arch.pio.count = 0; - if (unlikely(!kvm_is_linear_rip(vcpu, vcpu->arch.pio.linear_rip))) + if (unlikely(!kvm_is_linear_rip(vcpu, vcpu->arch.cui_linear_rip))) return 1; return kvm_skip_emulated_instruction(vcpu); @@ -9386,7 +9407,7 @@ static int kvm_fast_pio_out(struct kvm_vcpu *vcpu, int size, complete_fast_pio_out_port_0x7e; kvm_skip_emulated_instruction(vcpu); } else { - vcpu->arch.pio.linear_rip = kvm_get_linear_rip(vcpu); + vcpu->arch.cui_linear_rip = kvm_get_linear_rip(vcpu); vcpu->arch.complete_userspace_io = complete_fast_pio_out; } return 0; @@ -9399,7 +9420,7 @@ static int complete_fast_pio_in(struct kvm_vcpu *vcpu) /* We should only ever be called with arch.pio.count equal to 1 */ BUG_ON(vcpu->arch.pio.count != 1); - if (unlikely(!kvm_is_linear_rip(vcpu, vcpu->arch.pio.linear_rip))) { + if (unlikely(!kvm_is_linear_rip(vcpu, vcpu->arch.cui_linear_rip))) { vcpu->arch.pio.count = 0; return 1; } @@ -9428,7 +9449,7 @@ static int kvm_fast_pio_in(struct kvm_vcpu *vcpu, int size, return ret; } - vcpu->arch.pio.linear_rip = kvm_get_linear_rip(vcpu); + vcpu->arch.cui_linear_rip = kvm_get_linear_rip(vcpu); vcpu->arch.complete_userspace_io = complete_fast_pio_in; return 0; @@ -9811,6 +9832,9 @@ int kvm_x86_vendor_init(struct kvm_x86_init_ops *ops) if (r != 0) goto out_mmu_exit; + enable_device_posted_irqs &= enable_apicv && + irq_remapping_cap(IRQ_POSTING_CAP); + kvm_ops_update(ops); for_each_online_cpu(cpu) { @@ -10694,6 +10718,7 @@ static void vcpu_scan_ioapic(struct kvm_vcpu *vcpu) return; bitmap_zero(vcpu->arch.ioapic_handled_vectors, 256); + vcpu->arch.highest_stale_pending_ioapic_eoi = -1; kvm_x86_call(sync_pir_to_irr)(vcpu); @@ -12419,13 +12444,16 @@ void kvm_arch_vcpu_postcreate(struct kvm_vcpu *vcpu) void kvm_arch_vcpu_destroy(struct kvm_vcpu *vcpu) { - int idx; + int idx, cpu; kvm_clear_async_pf_completion_queue(vcpu); kvm_mmu_unload(vcpu); kvmclock_reset(vcpu); + for_each_possible_cpu(cpu) + cmpxchg(per_cpu_ptr(&last_vcpu, cpu), vcpu, NULL); + kvm_x86_call(vcpu_free)(vcpu); kmem_cache_free(x86_emulator_cache, vcpu->arch.emulate_ctxt); diff --git a/arch/x86/kvm/x86.h b/arch/x86/kvm/x86.h index 88a9475899c8..832f0faf4779 100644 --- a/arch/x86/kvm/x86.h +++ b/arch/x86/kvm/x86.h @@ -121,6 +121,24 @@ static inline void kvm_leave_nested(struct kvm_vcpu *vcpu) kvm_x86_ops.nested_ops->leave_nested(vcpu); } +/* + * If IBRS is advertised to the vCPU, KVM must flush the indirect branch + * predictors when transitioning from L2 to L1, as L1 expects hardware (KVM in + * this case) to provide separate predictor modes. Bare metal isolates the host + * from the guest, but doesn't isolate different guests from one another (in + * this case L1 and L2). The exception is if bare metal supports same mode IBRS, + * which offers protection within the same mode, and hence protects L1 from L2. + */ +static inline void kvm_nested_vmexit_handle_ibrs(struct kvm_vcpu *vcpu) +{ + if (cpu_feature_enabled(X86_FEATURE_AMD_IBRS_SAME_MODE)) + return; + + if (guest_cpu_cap_has(vcpu, X86_FEATURE_SPEC_CTRL) || + guest_cpu_cap_has(vcpu, X86_FEATURE_AMD_IBRS)) + indirect_branch_prediction_barrier(); +} + static inline bool kvm_vcpu_has_run(struct kvm_vcpu *vcpu) { return vcpu->arch.last_vmentry_cpu != -1; diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h index a6140415c693..3bde4fb5c6aa 100644 --- a/include/linux/kvm_host.h +++ b/include/linux/kvm_host.h @@ -1509,7 +1509,16 @@ bool kvm_vcpu_block(struct kvm_vcpu *vcpu); void kvm_arch_vcpu_blocking(struct kvm_vcpu *vcpu); void kvm_arch_vcpu_unblocking(struct kvm_vcpu *vcpu); bool kvm_vcpu_wake_up(struct kvm_vcpu *vcpu); -void kvm_vcpu_kick(struct kvm_vcpu *vcpu); + +#ifndef CONFIG_S390 +void __kvm_vcpu_kick(struct kvm_vcpu *vcpu, bool wait); + +static inline void kvm_vcpu_kick(struct kvm_vcpu *vcpu) +{ + __kvm_vcpu_kick(vcpu, false); +} +#endif + int kvm_vcpu_yield_to(struct kvm_vcpu *target); void kvm_vcpu_on_spin(struct kvm_vcpu *vcpu, bool yield_to_kernel_mode); @@ -2257,6 +2266,14 @@ static __always_inline void kvm_make_request(int req, struct kvm_vcpu *vcpu) __kvm_make_request(req, vcpu); } +#ifndef CONFIG_S390 +static inline void kvm_make_request_and_kick(int req, struct kvm_vcpu *vcpu) +{ + kvm_make_request(req, vcpu); + __kvm_vcpu_kick(vcpu, req & KVM_REQUEST_WAIT); +} +#endif + static inline bool kvm_request_pending(struct kvm_vcpu *vcpu) { return READ_ONCE(vcpu->requests); diff --git a/tools/arch/x86/include/asm/cpufeatures.h b/tools/arch/x86/include/asm/cpufeatures.h index 6c2c152d8a67..51b383bea007 100644 --- a/tools/arch/x86/include/asm/cpufeatures.h +++ b/tools/arch/x86/include/asm/cpufeatures.h @@ -336,6 +336,7 @@ #define X86_FEATURE_AMD_IBRS (13*32+14) /* Indirect Branch Restricted Speculation */ #define X86_FEATURE_AMD_STIBP (13*32+15) /* Single Thread Indirect Branch Predictors */ #define X86_FEATURE_AMD_STIBP_ALWAYS_ON (13*32+17) /* Single Thread Indirect Branch Predictors always-on preferred */ +#define X86_FEATURE_AMD_IBRS_SAME_MODE (13*32+19) /* Indirect Branch Restricted Speculation same mode protection*/ #define X86_FEATURE_AMD_PPIN (13*32+23) /* "amd_ppin" Protected Processor Inventory Number */ #define X86_FEATURE_AMD_SSBD (13*32+24) /* Speculative Store Bypass Disable */ #define X86_FEATURE_VIRT_SSBD (13*32+25) /* "virt_ssbd" Virtualized Speculative Store Bypass Disable */ diff --git a/tools/arch/x86/include/uapi/asm/kvm.h b/tools/arch/x86/include/uapi/asm/kvm.h index 460306b35a4b..b663d916f162 100644 --- a/tools/arch/x86/include/uapi/asm/kvm.h +++ b/tools/arch/x86/include/uapi/asm/kvm.h @@ -844,6 +844,7 @@ struct kvm_sev_snp_launch_start { }; /* Kept in sync with firmware values for simplicity. */ +#define KVM_SEV_PAGE_TYPE_INVALID 0x0 #define KVM_SEV_SNP_PAGE_TYPE_NORMAL 0x1 #define KVM_SEV_SNP_PAGE_TYPE_ZERO 0x3 #define KVM_SEV_SNP_PAGE_TYPE_UNMEASURED 0x4 diff --git a/tools/testing/selftests/cgroup/Makefile b/tools/testing/selftests/cgroup/Makefile index 1b897152bab6..e01584c2189a 100644 --- a/tools/testing/selftests/cgroup/Makefile +++ b/tools/testing/selftests/cgroup/Makefile @@ -21,14 +21,15 @@ TEST_GEN_PROGS += test_zswap LOCAL_HDRS += $(selfdir)/clone3/clone3_selftests.h $(selfdir)/pidfd/pidfd.h include ../lib.mk +include lib/libcgroup.mk -$(OUTPUT)/test_core: cgroup_util.c -$(OUTPUT)/test_cpu: cgroup_util.c -$(OUTPUT)/test_cpuset: cgroup_util.c -$(OUTPUT)/test_freezer: cgroup_util.c -$(OUTPUT)/test_hugetlb_memcg: cgroup_util.c -$(OUTPUT)/test_kill: cgroup_util.c -$(OUTPUT)/test_kmem: cgroup_util.c -$(OUTPUT)/test_memcontrol: cgroup_util.c -$(OUTPUT)/test_pids: cgroup_util.c -$(OUTPUT)/test_zswap: cgroup_util.c +$(OUTPUT)/test_core: $(LIBCGROUP_O) +$(OUTPUT)/test_cpu: $(LIBCGROUP_O) +$(OUTPUT)/test_cpuset: $(LIBCGROUP_O) +$(OUTPUT)/test_freezer: $(LIBCGROUP_O) +$(OUTPUT)/test_hugetlb_memcg: $(LIBCGROUP_O) +$(OUTPUT)/test_kill: $(LIBCGROUP_O) +$(OUTPUT)/test_kmem: $(LIBCGROUP_O) +$(OUTPUT)/test_memcontrol: $(LIBCGROUP_O) +$(OUTPUT)/test_pids: $(LIBCGROUP_O) +$(OUTPUT)/test_zswap: $(LIBCGROUP_O) diff --git a/tools/testing/selftests/cgroup/cgroup_util.c b/tools/testing/selftests/cgroup/lib/cgroup_util.c index 1e2d46636a0c..8832f3d1cb61 100644 --- a/tools/testing/selftests/cgroup/cgroup_util.c +++ b/tools/testing/selftests/cgroup/lib/cgroup_util.c @@ -17,10 +17,10 @@ #include <unistd.h> #include "cgroup_util.h" -#include "../clone3/clone3_selftests.h" +#include "../../clone3/clone3_selftests.h" /* Returns read len on success, or -errno on failure. */ -static ssize_t read_text(const char *path, char *buf, size_t max_len) +ssize_t read_text(const char *path, char *buf, size_t max_len) { ssize_t len; int fd; @@ -39,7 +39,7 @@ static ssize_t read_text(const char *path, char *buf, size_t max_len) } /* Returns written len on success, or -errno on failure. */ -static ssize_t write_text(const char *path, char *buf, ssize_t len) +ssize_t write_text(const char *path, char *buf, ssize_t len) { int fd; @@ -217,7 +217,8 @@ int cg_write_numeric(const char *cgroup, const char *control, long value) return cg_write(cgroup, control, buf); } -int cg_find_unified_root(char *root, size_t len, bool *nsdelegate) +static int cg_find_root(char *root, size_t len, const char *controller, + bool *nsdelegate) { char buf[10 * PAGE_SIZE]; char *fs, *mount, *type, *options; @@ -236,18 +237,37 @@ int cg_find_unified_root(char *root, size_t len, bool *nsdelegate) options = strtok(NULL, delim); strtok(NULL, delim); strtok(NULL, delim); - - if (strcmp(type, "cgroup2") == 0) { - strncpy(root, mount, len); - if (nsdelegate) - *nsdelegate = !!strstr(options, "nsdelegate"); - return 0; + if (strcmp(type, "cgroup") == 0) { + if (!controller || !strstr(options, controller)) + continue; + } else if (strcmp(type, "cgroup2") == 0) { + if (controller && + cg_read_strstr(mount, "cgroup.controllers", controller)) + continue; + } else { + continue; } + strncpy(root, mount, len); + + if (nsdelegate) + *nsdelegate = !!strstr(options, "nsdelegate"); + return 0; + } return -1; } +int cg_find_controller_root(char *root, size_t len, const char *controller) +{ + return cg_find_root(root, len, controller, NULL); +} + +int cg_find_unified_root(char *root, size_t len, bool *nsdelegate) +{ + return cg_find_root(root, len, NULL, nsdelegate); +} + int cg_create(const char *cgroup) { return mkdir(cgroup, 0755); @@ -488,84 +508,6 @@ int cg_run_nowait(const char *cgroup, return pid; } -int get_temp_fd(void) -{ - return open(".", O_TMPFILE | O_RDWR | O_EXCL); -} - -int alloc_pagecache(int fd, size_t size) -{ - char buf[PAGE_SIZE]; - struct stat st; - int i; - - if (fstat(fd, &st)) - goto cleanup; - - size += st.st_size; - - if (ftruncate(fd, size)) - goto cleanup; - - for (i = 0; i < size; i += sizeof(buf)) - read(fd, buf, sizeof(buf)); - - return 0; - -cleanup: - return -1; -} - -int alloc_anon(const char *cgroup, void *arg) -{ - size_t size = (unsigned long)arg; - char *buf, *ptr; - - buf = malloc(size); - for (ptr = buf; ptr < buf + size; ptr += PAGE_SIZE) - *ptr = 0; - - free(buf); - return 0; -} - -int is_swap_enabled(void) -{ - char buf[PAGE_SIZE]; - const char delim[] = "\n"; - int cnt = 0; - char *line; - - if (read_text("/proc/swaps", buf, sizeof(buf)) <= 0) - return -1; - - for (line = strtok(buf, delim); line; line = strtok(NULL, delim)) - cnt++; - - return cnt > 1; -} - -int set_oom_adj_score(int pid, int score) -{ - char path[PATH_MAX]; - int fd, len; - - sprintf(path, "/proc/%d/oom_score_adj", pid); - - fd = open(path, O_WRONLY | O_APPEND); - if (fd < 0) - return fd; - - len = dprintf(fd, "%d", score); - if (len < 0) { - close(fd); - return len; - } - - close(fd); - return 0; -} - int proc_mount_contains(const char *option) { char buf[4 * PAGE_SIZE]; diff --git a/tools/testing/selftests/cgroup/cgroup_util.h b/tools/testing/selftests/cgroup/lib/include/cgroup_util.h index 19b131ee7707..adb2bc193183 100644 --- a/tools/testing/selftests/cgroup/cgroup_util.h +++ b/tools/testing/selftests/cgroup/lib/include/cgroup_util.h @@ -2,9 +2,9 @@ #include <stdbool.h> #include <stdlib.h> -#include "../kselftest.h" - +#ifndef PAGE_SIZE #define PAGE_SIZE 4096 +#endif #define MB(x) (x << 20) @@ -21,6 +21,10 @@ static inline int values_close(long a, long b, int err) return labs(a - b) <= (a + b) / 100 * err; } +extern ssize_t read_text(const char *path, char *buf, size_t max_len); +extern ssize_t write_text(const char *path, char *buf, ssize_t len); + +extern int cg_find_controller_root(char *root, size_t len, const char *controller); extern int cg_find_unified_root(char *root, size_t len, bool *nsdelegate); extern char *cg_name(const char *root, const char *name); extern char *cg_name_indexed(const char *root, const char *name, int index); @@ -49,11 +53,6 @@ extern int cg_enter_current_thread(const char *cgroup); extern int cg_run_nowait(const char *cgroup, int (*fn)(const char *cgroup, void *arg), void *arg); -extern int get_temp_fd(void); -extern int alloc_pagecache(int fd, size_t size); -extern int alloc_anon(const char *cgroup, void *arg); -extern int is_swap_enabled(void); -extern int set_oom_adj_score(int pid, int score); extern int cg_wait_for_proc_count(const char *cgroup, int count); extern int cg_killall(const char *cgroup); int proc_mount_contains(const char *option); diff --git a/tools/testing/selftests/cgroup/lib/libcgroup.mk b/tools/testing/selftests/cgroup/lib/libcgroup.mk new file mode 100644 index 000000000000..7a73007204c3 --- /dev/null +++ b/tools/testing/selftests/cgroup/lib/libcgroup.mk @@ -0,0 +1,19 @@ +CGROUP_DIR := $(selfdir)/cgroup + +LIBCGROUP_C := lib/cgroup_util.c + +LIBCGROUP_O := $(patsubst %.c, $(OUTPUT)/%.o, $(LIBCGROUP_C)) + +LIBCGROUP_O_DIRS := $(shell dirname $(LIBCGROUP_O) | uniq) + +CFLAGS += -I$(CGROUP_DIR)/lib/include + +EXTRA_HDRS := $(selfdir)/clone3/clone3_selftests.h + +$(LIBCGROUP_O_DIRS): + mkdir -p $@ + +$(LIBCGROUP_O): $(OUTPUT)/%.o : $(CGROUP_DIR)/%.c $(EXTRA_HDRS) $(LIBCGROUP_O_DIRS) + $(CC) $(CFLAGS) $(CPPFLAGS) $(TARGET_ARCH) -c $< -o $@ + +EXTRA_CLEAN += $(LIBCGROUP_O) diff --git a/tools/testing/selftests/cgroup/test_memcontrol.c b/tools/testing/selftests/cgroup/test_memcontrol.c index 16f5d74ae762..2908f4e0629d 100644 --- a/tools/testing/selftests/cgroup/test_memcontrol.c +++ b/tools/testing/selftests/cgroup/test_memcontrol.c @@ -24,6 +24,84 @@ static bool has_localevents; static bool has_recursiveprot; +int get_temp_fd(void) +{ + return open(".", O_TMPFILE | O_RDWR | O_EXCL); +} + +int alloc_pagecache(int fd, size_t size) +{ + char buf[PAGE_SIZE]; + struct stat st; + int i; + + if (fstat(fd, &st)) + goto cleanup; + + size += st.st_size; + + if (ftruncate(fd, size)) + goto cleanup; + + for (i = 0; i < size; i += sizeof(buf)) + read(fd, buf, sizeof(buf)); + + return 0; + +cleanup: + return -1; +} + +int alloc_anon(const char *cgroup, void *arg) +{ + size_t size = (unsigned long)arg; + char *buf, *ptr; + + buf = malloc(size); + for (ptr = buf; ptr < buf + size; ptr += PAGE_SIZE) + *ptr = 0; + + free(buf); + return 0; +} + +int is_swap_enabled(void) +{ + char buf[PAGE_SIZE]; + const char delim[] = "\n"; + int cnt = 0; + char *line; + + if (read_text("/proc/swaps", buf, sizeof(buf)) <= 0) + return -1; + + for (line = strtok(buf, delim); line; line = strtok(NULL, delim)) + cnt++; + + return cnt > 1; +} + +int set_oom_adj_score(int pid, int score) +{ + char path[PATH_MAX]; + int fd, len; + + sprintf(path, "/proc/%d/oom_score_adj", pid); + + fd = open(path, O_WRONLY | O_APPEND); + if (fd < 0) + return fd; + + len = dprintf(fd, "%d", score); + if (len < 0) { + close(fd); + return len; + } + + close(fd); + return 0; +} + /* * This test creates two nested cgroups with and without enabling * the memory controller. diff --git a/tools/testing/selftests/kvm/Makefile.kvm b/tools/testing/selftests/kvm/Makefile.kvm index 3e786080473d..38b95998e1e6 100644 --- a/tools/testing/selftests/kvm/Makefile.kvm +++ b/tools/testing/selftests/kvm/Makefile.kvm @@ -8,6 +8,7 @@ LIBKVM += lib/elf.c LIBKVM += lib/guest_modes.c LIBKVM += lib/io.c LIBKVM += lib/kvm_util.c +LIBKVM += lib/lru_gen_util.c LIBKVM += lib/memstress.c LIBKVM += lib/guest_sprintf.c LIBKVM += lib/rbtree.c @@ -70,6 +71,7 @@ TEST_GEN_PROGS_x86 += x86/cr4_cpuid_sync_test TEST_GEN_PROGS_x86 += x86/dirty_log_page_splitting_test TEST_GEN_PROGS_x86 += x86/feature_msrs_test TEST_GEN_PROGS_x86 += x86/exit_on_emulation_failure_test +TEST_GEN_PROGS_x86 += x86/fastops_test TEST_GEN_PROGS_x86 += x86/fix_hypercall_test TEST_GEN_PROGS_x86 += x86/hwcr_msr_test TEST_GEN_PROGS_x86 += x86/hyperv_clock @@ -82,6 +84,7 @@ TEST_GEN_PROGS_x86 += x86/hyperv_svm_test TEST_GEN_PROGS_x86 += x86/hyperv_tlb_flush TEST_GEN_PROGS_x86 += x86/kvm_clock_test TEST_GEN_PROGS_x86 += x86/kvm_pv_test +TEST_GEN_PROGS_x86 += x86/kvm_buslock_test TEST_GEN_PROGS_x86 += x86/monitor_mwait_test TEST_GEN_PROGS_x86 += x86/nested_emulation_test TEST_GEN_PROGS_x86 += x86/nested_exceptions_test @@ -222,6 +225,7 @@ OVERRIDE_TARGETS = 1 # importantly defines, i.e. overwrites, $(CC) (unless `make -e` or `make CC=`, # which causes the environment variable to override the makefile). include ../lib.mk +include ../cgroup/lib/libcgroup.mk INSTALL_HDR_PATH = $(top_srcdir)/usr LINUX_HDR_PATH = $(INSTALL_HDR_PATH)/include/ @@ -275,7 +279,7 @@ LIBKVM_S := $(filter %.S,$(LIBKVM)) LIBKVM_C_OBJ := $(patsubst %.c, $(OUTPUT)/%.o, $(LIBKVM_C)) LIBKVM_S_OBJ := $(patsubst %.S, $(OUTPUT)/%.o, $(LIBKVM_S)) LIBKVM_STRING_OBJ := $(patsubst %.c, $(OUTPUT)/%.o, $(LIBKVM_STRING)) -LIBKVM_OBJS = $(LIBKVM_C_OBJ) $(LIBKVM_S_OBJ) $(LIBKVM_STRING_OBJ) +LIBKVM_OBJS = $(LIBKVM_C_OBJ) $(LIBKVM_S_OBJ) $(LIBKVM_STRING_OBJ) $(LIBCGROUP_O) SPLIT_TEST_GEN_PROGS := $(patsubst %, $(OUTPUT)/%, $(SPLIT_TESTS)) SPLIT_TEST_GEN_OBJ := $(patsubst %, $(OUTPUT)/$(ARCH)/%.o, $(SPLIT_TESTS)) diff --git a/tools/testing/selftests/kvm/access_tracking_perf_test.c b/tools/testing/selftests/kvm/access_tracking_perf_test.c index 447e619cf856..da7196fd1b23 100644 --- a/tools/testing/selftests/kvm/access_tracking_perf_test.c +++ b/tools/testing/selftests/kvm/access_tracking_perf_test.c @@ -7,9 +7,11 @@ * This test measures the performance effects of KVM's access tracking. * Access tracking is driven by the MMU notifiers test_young, clear_young, and * clear_flush_young. These notifiers do not have a direct userspace API, - * however the clear_young notifier can be triggered by marking a pages as idle - * in /sys/kernel/mm/page_idle/bitmap. This test leverages that mechanism to - * enable access tracking on guest memory. + * however the clear_young notifier can be triggered either by + * 1. marking a pages as idle in /sys/kernel/mm/page_idle/bitmap OR + * 2. adding a new MGLRU generation using the lru_gen debugfs file. + * This test leverages page_idle to enable access tracking on guest memory + * unless MGLRU is enabled, in which case MGLRU is used. * * To measure performance this test runs a VM with a configurable number of * vCPUs that each touch every page in disjoint regions of memory. Performance @@ -17,10 +19,11 @@ * predefined region. * * Note that a deterministic correctness test of access tracking is not possible - * by using page_idle as it exists today. This is for a few reasons: + * by using page_idle or MGLRU aging as it exists today. This is for a few + * reasons: * - * 1. page_idle only issues clear_young notifiers, which lack a TLB flush. This - * means subsequent guest accesses are not guaranteed to see page table + * 1. page_idle and MGLRU only issue clear_young notifiers, which lack a TLB flush. + * This means subsequent guest accesses are not guaranteed to see page table * updates made by KVM until some time in the future. * * 2. page_idle only operates on LRU pages. Newly allocated pages are not @@ -48,9 +51,17 @@ #include "guest_modes.h" #include "processor.h" +#include "cgroup_util.h" +#include "lru_gen_util.h" + +static const char *TEST_MEMCG_NAME = "access_tracking_perf_test"; + /* Global variable used to synchronize all of the vCPU threads. */ static int iteration; +/* The cgroup memory controller root. Needed for lru_gen-based aging. */ +char cgroup_root[PATH_MAX]; + /* Defines what vCPU threads should do during a given iteration. */ static enum { /* Run the vCPU to access all its memory. */ @@ -65,6 +76,25 @@ static int vcpu_last_completed_iteration[KVM_MAX_VCPUS]; /* Whether to overlap the regions of memory vCPUs access. */ static bool overlap_memory_access; +/* + * If the test should only warn if there are too many idle pages (i.e., it is + * expected). + * -1: Not yet set. + * 0: We do not expect too many idle pages, so FAIL if too many idle pages. + * 1: Having too many idle pages is expected, so merely print a warning if + * too many idle pages are found. + */ +static int idle_pages_warn_only = -1; + +/* Whether or not to use MGLRU instead of page_idle for access tracking */ +static bool use_lru_gen; + +/* Total number of pages to expect in the memcg after touching everything */ +static long test_pages; + +/* Last generation we found the pages in */ +static int lru_gen_last_gen = -1; + struct test_params { /* The backing source for the region of memory. */ enum vm_mem_backing_src_type backing_src; @@ -123,8 +153,24 @@ static void mark_page_idle(int page_idle_fd, uint64_t pfn) "Set page_idle bits for PFN 0x%" PRIx64, pfn); } -static void mark_vcpu_memory_idle(struct kvm_vm *vm, - struct memstress_vcpu_args *vcpu_args) +static void too_many_idle_pages(long idle_pages, long total_pages, int vcpu_idx) +{ + char prefix[18] = {}; + + if (vcpu_idx >= 0) + snprintf(prefix, 18, "vCPU%d: ", vcpu_idx); + + TEST_ASSERT(idle_pages_warn_only, + "%sToo many pages still idle (%lu out of %lu)", + prefix, idle_pages, total_pages); + + printf("WARNING: %sToo many pages still idle (%lu out of %lu), " + "this will affect performance results.\n", + prefix, idle_pages, total_pages); +} + +static void pageidle_mark_vcpu_memory_idle(struct kvm_vm *vm, + struct memstress_vcpu_args *vcpu_args) { int vcpu_idx = vcpu_args->vcpu_idx; uint64_t base_gva = vcpu_args->gva; @@ -177,27 +223,79 @@ static void mark_vcpu_memory_idle(struct kvm_vm *vm, * arbitrary; high enough that we ensure most memory access went through * access tracking but low enough as to not make the test too brittle * over time and across architectures. - * - * When running the guest as a nested VM, "warn" instead of asserting - * as the TLB size is effectively unlimited and the KVM doesn't - * explicitly flush the TLB when aging SPTEs. As a result, more pages - * are cached and the guest won't see the "idle" bit cleared. */ - if (still_idle >= pages / 10) { -#ifdef __x86_64__ - TEST_ASSERT(this_cpu_has(X86_FEATURE_HYPERVISOR), - "vCPU%d: Too many pages still idle (%lu out of %lu)", - vcpu_idx, still_idle, pages); -#endif - printf("WARNING: vCPU%d: Too many pages still idle (%lu out of %lu), " - "this will affect performance results.\n", - vcpu_idx, still_idle, pages); - } + if (still_idle >= pages / 10) + too_many_idle_pages(still_idle, pages, + overlap_memory_access ? -1 : vcpu_idx); close(page_idle_fd); close(pagemap_fd); } +int find_generation(struct memcg_stats *stats, long total_pages) +{ + /* + * For finding the generation that contains our pages, use the same + * 90% threshold that page_idle uses. + */ + int gen = lru_gen_find_generation(stats, total_pages * 9 / 10); + + if (gen >= 0) + return gen; + + if (!idle_pages_warn_only) { + TEST_FAIL("Could not find a generation with 90%% of guest memory (%ld pages).", + total_pages * 9 / 10); + return gen; + } + + /* + * We couldn't find a generation with 90% of guest memory, which can + * happen if access tracking is unreliable. Simply look for a majority + * of pages. + */ + puts("WARNING: Couldn't find a generation with 90% of guest memory. " + "Performance results may not be accurate."); + gen = lru_gen_find_generation(stats, total_pages / 2); + TEST_ASSERT(gen >= 0, + "Could not find a generation with 50%% of guest memory (%ld pages).", + total_pages / 2); + return gen; +} + +static void lru_gen_mark_memory_idle(struct kvm_vm *vm) +{ + struct timespec ts_start; + struct timespec ts_elapsed; + struct memcg_stats stats; + int new_gen; + + /* Make a new generation */ + clock_gettime(CLOCK_MONOTONIC, &ts_start); + lru_gen_do_aging(&stats, TEST_MEMCG_NAME); + ts_elapsed = timespec_elapsed(ts_start); + + /* Check the generation again */ + new_gen = find_generation(&stats, test_pages); + + /* + * This function should only be invoked with newly-accessed pages, + * so pages should always move to a newer generation. + */ + if (new_gen <= lru_gen_last_gen) { + /* We did not move to a newer generation. */ + long idle_pages = lru_gen_sum_memcg_stats_for_gen(lru_gen_last_gen, + &stats); + + too_many_idle_pages(min_t(long, idle_pages, test_pages), + test_pages, -1); + } + pr_info("%-30s: %ld.%09lds\n", + "Mark memory idle (lru_gen)", ts_elapsed.tv_sec, + ts_elapsed.tv_nsec); + lru_gen_last_gen = new_gen; +} + static void assert_ucall(struct kvm_vcpu *vcpu, uint64_t expected_ucall) { struct ucall uc; @@ -237,7 +335,7 @@ static void vcpu_thread_main(struct memstress_vcpu_args *vcpu_args) assert_ucall(vcpu, UCALL_SYNC); break; case ITERATION_MARK_IDLE: - mark_vcpu_memory_idle(vm, vcpu_args); + pageidle_mark_vcpu_memory_idle(vm, vcpu_args); break; } @@ -289,15 +387,18 @@ static void access_memory(struct kvm_vm *vm, int nr_vcpus, static void mark_memory_idle(struct kvm_vm *vm, int nr_vcpus) { + if (use_lru_gen) + return lru_gen_mark_memory_idle(vm); + /* * Even though this parallelizes the work across vCPUs, this is still a * very slow operation because page_idle forces the test to mark one pfn - * at a time and the clear_young notifier serializes on the KVM MMU + * at a time and the clear_young notifier may serialize on the KVM MMU * lock. */ pr_debug("Marking VM memory idle (slow)...\n"); iteration_work = ITERATION_MARK_IDLE; - run_iteration(vm, nr_vcpus, "Mark memory idle"); + run_iteration(vm, nr_vcpus, "Mark memory idle (page_idle)"); } static void run_test(enum vm_guest_mode mode, void *arg) @@ -309,11 +410,38 @@ static void run_test(enum vm_guest_mode mode, void *arg) vm = memstress_create_vm(mode, nr_vcpus, params->vcpu_memory_bytes, 1, params->backing_src, !overlap_memory_access); + /* + * If guest_page_size is larger than the host's page size, the + * guest (memstress) will only fault in a subset of the host's pages. + */ + test_pages = params->nr_vcpus * params->vcpu_memory_bytes / + max(memstress_args.guest_page_size, + (uint64_t)getpagesize()); + memstress_start_vcpu_threads(nr_vcpus, vcpu_thread_main); pr_info("\n"); access_memory(vm, nr_vcpus, ACCESS_WRITE, "Populating memory"); + if (use_lru_gen) { + struct memcg_stats stats; + + /* + * Do a page table scan now. Following initial population, aging + * may not cause the pages to move to a newer generation. Do + * an aging pass now so that future aging passes always move + * pages to a newer generation. + */ + printf("Initial aging pass (lru_gen)\n"); + lru_gen_do_aging(&stats, TEST_MEMCG_NAME); + TEST_ASSERT(lru_gen_sum_memcg_stats(&stats) >= test_pages, + "Not all pages accounted for (looking for %ld). " + "Was the memcg set up correctly?", test_pages); + access_memory(vm, nr_vcpus, ACCESS_WRITE, "Re-populating memory"); + lru_gen_read_memcg_stats(&stats, TEST_MEMCG_NAME); + lru_gen_last_gen = find_generation(&stats, test_pages); + } + /* As a control, read and write to the populated memory first. */ access_memory(vm, nr_vcpus, ACCESS_WRITE, "Writing to populated memory"); access_memory(vm, nr_vcpus, ACCESS_READ, "Reading from populated memory"); @@ -328,6 +456,37 @@ static void run_test(enum vm_guest_mode mode, void *arg) memstress_destroy_vm(vm); } +static int access_tracking_unreliable(void) +{ +#ifdef __x86_64__ + /* + * When running nested, the TLB size may be effectively unlimited (for + * example, this is the case when running on KVM L0), and KVM doesn't + * explicitly flush the TLB when aging SPTEs. As a result, more pages + * are cached and the guest won't see the "idle" bit cleared. + */ + if (this_cpu_has(X86_FEATURE_HYPERVISOR)) { + puts("Skipping idle page count sanity check, because the test is run nested"); + return 1; + } +#endif + /* + * When NUMA balancing is enabled, guest memory will be unmapped to get + * NUMA faults, dropping the Accessed bits. + */ + if (is_numa_balancing_enabled()) { + puts("Skipping idle page count sanity check, because NUMA balancing is enabled"); + return 1; + } + return 0; +} + +static int run_test_for_each_guest_mode(const char *cgroup, void *arg) +{ + for_each_guest_mode(run_test, arg); + return 0; +} + static void help(char *name) { puts(""); @@ -342,11 +501,22 @@ static void help(char *name) printf(" -v: specify the number of vCPUs to run.\n"); printf(" -o: Overlap guest memory accesses instead of partitioning\n" " them into a separate region of memory for each vCPU.\n"); + printf(" -w: Control whether the test warns or fails if more than 10%%\n" + " of pages are still seen as idle/old after accessing guest\n" + " memory. >0 == warn only, 0 == fail, <0 == auto. For auto\n" + " mode, the test fails by default, but switches to warn only\n" + " if NUMA balancing is enabled or the test detects it's running\n" + " in a VM.\n"); backing_src_help("-s"); puts(""); exit(0); } +void destroy_cgroup(char *cg) +{ + printf("Destroying cgroup: %s\n", cg); +} + int main(int argc, char *argv[]) { struct test_params params = { @@ -354,12 +524,13 @@ int main(int argc, char *argv[]) .vcpu_memory_bytes = DEFAULT_PER_VCPU_MEM_SIZE, .nr_vcpus = 1, }; + char *new_cg = NULL; int page_idle_fd; int opt; guest_modes_append_default(); - while ((opt = getopt(argc, argv, "hm:b:v:os:")) != -1) { + while ((opt = getopt(argc, argv, "hm:b:v:os:w:")) != -1) { switch (opt) { case 'm': guest_modes_cmdline(optarg); @@ -376,6 +547,11 @@ int main(int argc, char *argv[]) case 's': params.backing_src = parse_backing_src_type(optarg); break; + case 'w': + idle_pages_warn_only = + atoi_non_negative("Idle pages warning", + optarg); + break; case 'h': default: help(argv[0]); @@ -383,12 +559,53 @@ int main(int argc, char *argv[]) } } - page_idle_fd = open("/sys/kernel/mm/page_idle/bitmap", O_RDWR); - __TEST_REQUIRE(page_idle_fd >= 0, - "CONFIG_IDLE_PAGE_TRACKING is not enabled"); - close(page_idle_fd); + if (idle_pages_warn_only == -1) + idle_pages_warn_only = access_tracking_unreliable(); + + if (lru_gen_usable()) { + bool cg_created = true; + int ret; - for_each_guest_mode(run_test, ¶ms); + puts("Using lru_gen for aging"); + use_lru_gen = true; + + if (cg_find_controller_root(cgroup_root, sizeof(cgroup_root), "memory")) + ksft_exit_skip("Cannot find memory cgroup controller\n"); + + new_cg = cg_name(cgroup_root, TEST_MEMCG_NAME); + printf("Creating cgroup: %s\n", new_cg); + if (cg_create(new_cg)) { + if (errno == EEXIST) { + printf("Found existing cgroup"); + cg_created = false; + } else { + ksft_exit_skip("could not create new cgroup: %s\n", new_cg); + } + } + + /* + * This will fork off a new process to run the test within + * a new memcg, so we need to properly propagate the return + * value up. + */ + ret = cg_run(new_cg, &run_test_for_each_guest_mode, ¶ms); + if (cg_created) + cg_destroy(new_cg); + if (ret < 0) + TEST_FAIL("child did not spawn or was abnormally killed"); + if (ret) + return ret; + } else { + page_idle_fd = open("/sys/kernel/mm/page_idle/bitmap", O_RDWR); + __TEST_REQUIRE(page_idle_fd >= 0, + "Couldn't open /sys/kernel/mm/page_idle/bitmap. " + "Is CONFIG_IDLE_PAGE_TRACKING enabled?"); + + close(page_idle_fd); + + puts("Using page_idle for aging"); + run_test_for_each_guest_mode(NULL, ¶ms); + } return 0; } diff --git a/tools/testing/selftests/kvm/include/kvm_util.h b/tools/testing/selftests/kvm/include/kvm_util.h index 93013564428b..bee65ca08721 100644 --- a/tools/testing/selftests/kvm/include/kvm_util.h +++ b/tools/testing/selftests/kvm/include/kvm_util.h @@ -555,6 +555,41 @@ void kvm_get_stat(struct kvm_binary_stats *stats, const char *name, #define vm_get_stat(vm, stat) __get_stat(&(vm)->stats, stat) #define vcpu_get_stat(vcpu, stat) __get_stat(&(vcpu)->stats, stat) +static inline bool read_smt_control(char *buf, size_t buf_size) +{ + FILE *f = fopen("/sys/devices/system/cpu/smt/control", "r"); + bool ret; + + if (!f) + return false; + + ret = fread(buf, sizeof(*buf), buf_size, f) > 0; + fclose(f); + + return ret; +} + +static inline bool is_smt_possible(void) +{ + char buf[16]; + + if (read_smt_control(buf, sizeof(buf)) && + (!strncmp(buf, "forceoff", 8) || !strncmp(buf, "notsupported", 12))) + return false; + + return true; +} + +static inline bool is_smt_on(void) +{ + char buf[16]; + + if (read_smt_control(buf, sizeof(buf)) && !strncmp(buf, "on", 2)) + return true; + + return false; +} + void vm_create_irqchip(struct kvm_vm *vm); static inline int __vm_create_guest_memfd(struct kvm_vm *vm, uint64_t size, diff --git a/tools/testing/selftests/kvm/include/lru_gen_util.h b/tools/testing/selftests/kvm/include/lru_gen_util.h new file mode 100644 index 000000000000..d32ff5d8ffd0 --- /dev/null +++ b/tools/testing/selftests/kvm/include/lru_gen_util.h @@ -0,0 +1,51 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +/* + * Tools for integrating with lru_gen, like parsing the lru_gen debugfs output. + * + * Copyright (C) 2025, Google LLC. + */ +#ifndef SELFTEST_KVM_LRU_GEN_UTIL_H +#define SELFTEST_KVM_LRU_GEN_UTIL_H + +#include <inttypes.h> +#include <limits.h> +#include <stdlib.h> + +#include "test_util.h" + +#define MAX_NR_GENS 16 /* MAX_NR_GENS in include/linux/mmzone.h */ +#define MAX_NR_NODES 4 /* Maximum number of nodes supported by the test */ + +#define LRU_GEN_DEBUGFS "/sys/kernel/debug/lru_gen" +#define LRU_GEN_ENABLED_PATH "/sys/kernel/mm/lru_gen/enabled" +#define LRU_GEN_ENABLED 1 +#define LRU_GEN_MM_WALK 2 + +struct generation_stats { + int gen; + long age_ms; + long nr_anon; + long nr_file; +}; + +struct node_stats { + int node; + int nr_gens; /* Number of populated gens entries. */ + struct generation_stats gens[MAX_NR_GENS]; +}; + +struct memcg_stats { + unsigned long memcg_id; + int nr_nodes; /* Number of populated nodes entries. */ + struct node_stats nodes[MAX_NR_NODES]; +}; + +void lru_gen_read_memcg_stats(struct memcg_stats *stats, const char *memcg); +long lru_gen_sum_memcg_stats(const struct memcg_stats *stats); +long lru_gen_sum_memcg_stats_for_gen(int gen, const struct memcg_stats *stats); +void lru_gen_do_aging(struct memcg_stats *stats, const char *memcg); +int lru_gen_find_generation(const struct memcg_stats *stats, + unsigned long total_pages); +bool lru_gen_usable(void); + +#endif /* SELFTEST_KVM_LRU_GEN_UTIL_H */ diff --git a/tools/testing/selftests/kvm/include/test_util.h b/tools/testing/selftests/kvm/include/test_util.h index 77d13d7920cb..c6ef895fbd9a 100644 --- a/tools/testing/selftests/kvm/include/test_util.h +++ b/tools/testing/selftests/kvm/include/test_util.h @@ -153,6 +153,7 @@ bool is_backing_src_hugetlb(uint32_t i); void backing_src_help(const char *flag); enum vm_mem_backing_src_type parse_backing_src_type(const char *type_name); long get_run_delay(void); +bool is_numa_balancing_enabled(void); /* * Whether or not the given source type is shared memory (as opposed to diff --git a/tools/testing/selftests/kvm/include/x86/processor.h b/tools/testing/selftests/kvm/include/x86/processor.h index 32ab6ca7ec32..b11b5a53ebd5 100644 --- a/tools/testing/selftests/kvm/include/x86/processor.h +++ b/tools/testing/selftests/kvm/include/x86/processor.h @@ -203,6 +203,7 @@ struct kvm_x86_cpu_feature { #define X86_FEATURE_IDLE_HLT KVM_X86_CPU_FEATURE(0x8000000A, 0, EDX, 30) #define X86_FEATURE_SEV KVM_X86_CPU_FEATURE(0x8000001F, 0, EAX, 1) #define X86_FEATURE_SEV_ES KVM_X86_CPU_FEATURE(0x8000001F, 0, EAX, 3) +#define X86_FEATURE_SEV_SNP KVM_X86_CPU_FEATURE(0x8000001F, 0, EAX, 4) #define X86_FEATURE_PERFMON_V2 KVM_X86_CPU_FEATURE(0x80000022, 0, EAX, 0) #define X86_FEATURE_LBR_PMC_FREEZE KVM_X86_CPU_FEATURE(0x80000022, 0, EAX, 2) diff --git a/tools/testing/selftests/kvm/include/x86/sev.h b/tools/testing/selftests/kvm/include/x86/sev.h index 82c11c81a956..008b4169f5e2 100644 --- a/tools/testing/selftests/kvm/include/x86/sev.h +++ b/tools/testing/selftests/kvm/include/x86/sev.h @@ -25,19 +25,51 @@ enum sev_guest_state { #define SEV_POLICY_NO_DBG (1UL << 0) #define SEV_POLICY_ES (1UL << 2) +#define SNP_POLICY_SMT (1ULL << 16) +#define SNP_POLICY_RSVD_MBO (1ULL << 17) +#define SNP_POLICY_DBG (1ULL << 19) + #define GHCB_MSR_TERM_REQ 0x100 +static inline bool is_sev_snp_vm(struct kvm_vm *vm) +{ + return vm->type == KVM_X86_SNP_VM; +} + +static inline bool is_sev_es_vm(struct kvm_vm *vm) +{ + return is_sev_snp_vm(vm) || vm->type == KVM_X86_SEV_ES_VM; +} + +static inline bool is_sev_vm(struct kvm_vm *vm) +{ + return is_sev_es_vm(vm) || vm->type == KVM_X86_SEV_VM; +} + void sev_vm_launch(struct kvm_vm *vm, uint32_t policy); void sev_vm_launch_measure(struct kvm_vm *vm, uint8_t *measurement); void sev_vm_launch_finish(struct kvm_vm *vm); +void snp_vm_launch_start(struct kvm_vm *vm, uint64_t policy); +void snp_vm_launch_update(struct kvm_vm *vm); +void snp_vm_launch_finish(struct kvm_vm *vm); struct kvm_vm *vm_sev_create_with_one_vcpu(uint32_t type, void *guest_code, struct kvm_vcpu **cpu); -void vm_sev_launch(struct kvm_vm *vm, uint32_t policy, uint8_t *measurement); +void vm_sev_launch(struct kvm_vm *vm, uint64_t policy, uint8_t *measurement); kvm_static_assert(SEV_RET_SUCCESS == 0); /* + * A SEV-SNP VM requires the policy reserved bit to always be set. + * The SMT policy bit is also required to be set based on SMT being + * available and active on the system. + */ +static inline u64 snp_default_policy(void) +{ + return SNP_POLICY_RSVD_MBO | (is_smt_on() ? SNP_POLICY_SMT : 0); +} + +/* * The KVM_MEMORY_ENCRYPT_OP uAPI is utter garbage and takes an "unsigned long" * instead of a proper struct. The size of the parameter is embedded in the * ioctl number, i.e. is ABI and thus immutable. Hack around the mess by @@ -70,6 +102,12 @@ kvm_static_assert(SEV_RET_SUCCESS == 0); void sev_vm_init(struct kvm_vm *vm); void sev_es_vm_init(struct kvm_vm *vm); +void snp_vm_init(struct kvm_vm *vm); + +static inline void vmgexit(void) +{ + __asm__ __volatile__("rep; vmmcall"); +} static inline void sev_register_encrypted_memory(struct kvm_vm *vm, struct userspace_mem_region *region) @@ -93,4 +131,17 @@ static inline void sev_launch_update_data(struct kvm_vm *vm, vm_paddr_t gpa, vm_sev_ioctl(vm, KVM_SEV_LAUNCH_UPDATE_DATA, &update_data); } +static inline void snp_launch_update_data(struct kvm_vm *vm, vm_paddr_t gpa, + uint64_t hva, uint64_t size, uint8_t type) +{ + struct kvm_sev_snp_launch_update update_data = { + .uaddr = hva, + .gfn_start = gpa >> PAGE_SHIFT, + .len = size, + .type = type, + }; + + vm_sev_ioctl(vm, KVM_SEV_SNP_LAUNCH_UPDATE, &update_data); +} + #endif /* SELFTEST_KVM_SEV_H */ diff --git a/tools/testing/selftests/kvm/lib/kvm_util.c b/tools/testing/selftests/kvm/lib/kvm_util.c index 5649cf2f40e8..a055343a7bf7 100644 --- a/tools/testing/selftests/kvm/lib/kvm_util.c +++ b/tools/testing/selftests/kvm/lib/kvm_util.c @@ -447,6 +447,15 @@ void kvm_set_files_rlimit(uint32_t nr_vcpus) } +static bool is_guest_memfd_required(struct vm_shape shape) +{ +#ifdef __x86_64__ + return shape.type == KVM_X86_SNP_VM; +#else + return false; +#endif +} + struct kvm_vm *__vm_create(struct vm_shape shape, uint32_t nr_runnable_vcpus, uint64_t nr_extra_pages) { @@ -454,7 +463,7 @@ struct kvm_vm *__vm_create(struct vm_shape shape, uint32_t nr_runnable_vcpus, nr_extra_pages); struct userspace_mem_region *slot0; struct kvm_vm *vm; - int i; + int i, flags; kvm_set_files_rlimit(nr_runnable_vcpus); @@ -463,7 +472,15 @@ struct kvm_vm *__vm_create(struct vm_shape shape, uint32_t nr_runnable_vcpus, vm = ____vm_create(shape); - vm_userspace_mem_region_add(vm, VM_MEM_SRC_ANONYMOUS, 0, 0, nr_pages, 0); + /* + * Force GUEST_MEMFD for the primary memory region if necessary, e.g. + * for CoCo VMs that require GUEST_MEMFD backed private memory. + */ + flags = 0; + if (is_guest_memfd_required(shape)) + flags |= KVM_MEM_GUEST_MEMFD; + + vm_userspace_mem_region_add(vm, VM_MEM_SRC_ANONYMOUS, 0, 0, nr_pages, flags); for (i = 0; i < NR_MEM_REGIONS; i++) vm->memslots[i] = 0; diff --git a/tools/testing/selftests/kvm/lib/lru_gen_util.c b/tools/testing/selftests/kvm/lib/lru_gen_util.c new file mode 100644 index 000000000000..46a14fd63d9e --- /dev/null +++ b/tools/testing/selftests/kvm/lib/lru_gen_util.c @@ -0,0 +1,387 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Copyright (C) 2025, Google LLC. + */ + +#include <time.h> + +#include "lru_gen_util.h" + +/* + * Tracks state while we parse memcg lru_gen stats. The file we're parsing is + * structured like this (some extra whitespace elided): + * + * memcg (id) (path) + * node (id) + * (gen_nr) (age_in_ms) (nr_anon_pages) (nr_file_pages) + */ +struct memcg_stats_parse_context { + bool consumed; /* Whether or not this line was consumed */ + /* Next parse handler to invoke */ + void (*next_handler)(struct memcg_stats *stats, + struct memcg_stats_parse_context *ctx, + char *line); + int current_node_idx; /* Current index in nodes array */ + const char *name; /* The name of the memcg we're looking for */ +}; + +static void memcg_stats_handle_searching(struct memcg_stats *stats, + struct memcg_stats_parse_context *ctx, + char *line); +static void memcg_stats_handle_in_memcg(struct memcg_stats *stats, + struct memcg_stats_parse_context *ctx, + char *line); +static void memcg_stats_handle_in_node(struct memcg_stats *stats, + struct memcg_stats_parse_context *ctx, + char *line); + +struct split_iterator { + char *str; + char *save; +}; + +static char *split_next(struct split_iterator *it) +{ + char *ret = strtok_r(it->str, " \t\n\r", &it->save); + + it->str = NULL; + return ret; +} + +static void memcg_stats_handle_searching(struct memcg_stats *stats, + struct memcg_stats_parse_context *ctx, + char *line) +{ + struct split_iterator it = { .str = line }; + char *prefix = split_next(&it); + char *memcg_id = split_next(&it); + char *memcg_name = split_next(&it); + char *end; + + ctx->consumed = true; + + if (!prefix || strcmp("memcg", prefix)) + return; /* Not a memcg line (maybe empty), skip */ + + TEST_ASSERT(memcg_id && memcg_name, + "malformed memcg line; no memcg id or memcg_name"); + + if (strcmp(memcg_name + 1, ctx->name)) + return; /* Wrong memcg, skip */ + + /* Found it! */ + + stats->memcg_id = strtoul(memcg_id, &end, 10); + TEST_ASSERT(*end == '\0', "malformed memcg id '%s'", memcg_id); + if (!stats->memcg_id) + return; /* Removed memcg? */ + + ctx->next_handler = memcg_stats_handle_in_memcg; +} + +static void memcg_stats_handle_in_memcg(struct memcg_stats *stats, + struct memcg_stats_parse_context *ctx, + char *line) +{ + struct split_iterator it = { .str = line }; + char *prefix = split_next(&it); + char *id = split_next(&it); + long found_node_id; + char *end; + + ctx->consumed = true; + ctx->current_node_idx = -1; + + if (!prefix) + return; /* Skip empty lines */ + + if (!strcmp("memcg", prefix)) { + /* Memcg done, found next one; stop. */ + ctx->next_handler = NULL; + return; + } else if (strcmp("node", prefix)) + TEST_ASSERT(false, "found malformed line after 'memcg ...'," + "token: '%s'", prefix); + + /* At this point we know we have a node line. Parse the ID. */ + + TEST_ASSERT(id, "malformed node line; no node id"); + + found_node_id = strtol(id, &end, 10); + TEST_ASSERT(*end == '\0', "malformed node id '%s'", id); + + ctx->current_node_idx = stats->nr_nodes++; + TEST_ASSERT(ctx->current_node_idx < MAX_NR_NODES, + "memcg has stats for too many nodes, max is %d", + MAX_NR_NODES); + stats->nodes[ctx->current_node_idx].node = found_node_id; + + ctx->next_handler = memcg_stats_handle_in_node; +} + +static void memcg_stats_handle_in_node(struct memcg_stats *stats, + struct memcg_stats_parse_context *ctx, + char *line) +{ + char *my_line = strdup(line); + struct split_iterator it = { .str = my_line }; + char *gen, *age, *nr_anon, *nr_file; + struct node_stats *node_stats; + struct generation_stats *gen_stats; + char *end; + + TEST_ASSERT(it.str, "failed to copy input line"); + + gen = split_next(&it); + + if (!gen) + goto out_consume; /* Skip empty lines */ + + if (!strcmp("memcg", gen) || !strcmp("node", gen)) { + /* + * Reached next memcg or node section. Don't consume, let the + * other handler deal with this. + */ + ctx->next_handler = memcg_stats_handle_in_memcg; + goto out; + } + + node_stats = &stats->nodes[ctx->current_node_idx]; + TEST_ASSERT(node_stats->nr_gens < MAX_NR_GENS, + "found too many generation lines; max is %d", + MAX_NR_GENS); + gen_stats = &node_stats->gens[node_stats->nr_gens++]; + + age = split_next(&it); + nr_anon = split_next(&it); + nr_file = split_next(&it); + + TEST_ASSERT(age && nr_anon && nr_file, + "malformed generation line; not enough tokens"); + + gen_stats->gen = (int)strtol(gen, &end, 10); + TEST_ASSERT(*end == '\0', "malformed generation number '%s'", gen); + + gen_stats->age_ms = strtol(age, &end, 10); + TEST_ASSERT(*end == '\0', "malformed generation age '%s'", age); + + gen_stats->nr_anon = strtol(nr_anon, &end, 10); + TEST_ASSERT(*end == '\0', "malformed anonymous page count '%s'", + nr_anon); + + gen_stats->nr_file = strtol(nr_file, &end, 10); + TEST_ASSERT(*end == '\0', "malformed file page count '%s'", nr_file); + +out_consume: + ctx->consumed = true; +out: + free(my_line); +} + +static void print_memcg_stats(const struct memcg_stats *stats, const char *name) +{ + int node, gen; + + pr_debug("stats for memcg %s (id %lu):\n", name, stats->memcg_id); + for (node = 0; node < stats->nr_nodes; ++node) { + pr_debug("\tnode %d\n", stats->nodes[node].node); + for (gen = 0; gen < stats->nodes[node].nr_gens; ++gen) { + const struct generation_stats *gstats = + &stats->nodes[node].gens[gen]; + + pr_debug("\t\tgen %d\tage_ms %ld" + "\tnr_anon %ld\tnr_file %ld\n", + gstats->gen, gstats->age_ms, gstats->nr_anon, + gstats->nr_file); + } + } +} + +/* Re-read lru_gen debugfs information for @memcg into @stats. */ +void lru_gen_read_memcg_stats(struct memcg_stats *stats, const char *memcg) +{ + FILE *f; + ssize_t read = 0; + char *line = NULL; + size_t bufsz; + struct memcg_stats_parse_context ctx = { + .next_handler = memcg_stats_handle_searching, + .name = memcg, + }; + + memset(stats, 0, sizeof(struct memcg_stats)); + + f = fopen(LRU_GEN_DEBUGFS, "r"); + TEST_ASSERT(f, "fopen(%s) failed", LRU_GEN_DEBUGFS); + + while (ctx.next_handler && (read = getline(&line, &bufsz, f)) > 0) { + ctx.consumed = false; + + do { + ctx.next_handler(stats, &ctx, line); + if (!ctx.next_handler) + break; + } while (!ctx.consumed); + } + + if (read < 0 && !feof(f)) + TEST_ASSERT(false, "getline(%s) failed", LRU_GEN_DEBUGFS); + + TEST_ASSERT(stats->memcg_id > 0, "Couldn't find memcg: %s\n" + "Did the memcg get created in the proper mount?", + memcg); + if (line) + free(line); + TEST_ASSERT(!fclose(f), "fclose(%s) failed", LRU_GEN_DEBUGFS); + + print_memcg_stats(stats, memcg); +} + +/* + * Find all pages tracked by lru_gen for this memcg in generation @target_gen. + * + * If @target_gen is negative, look for all generations. + */ +long lru_gen_sum_memcg_stats_for_gen(int target_gen, + const struct memcg_stats *stats) +{ + int node, gen; + long total_nr = 0; + + for (node = 0; node < stats->nr_nodes; ++node) { + const struct node_stats *node_stats = &stats->nodes[node]; + + for (gen = 0; gen < node_stats->nr_gens; ++gen) { + const struct generation_stats *gen_stats = + &node_stats->gens[gen]; + + if (target_gen >= 0 && gen_stats->gen != target_gen) + continue; + + total_nr += gen_stats->nr_anon + gen_stats->nr_file; + } + } + + return total_nr; +} + +/* Find all pages tracked by lru_gen for this memcg. */ +long lru_gen_sum_memcg_stats(const struct memcg_stats *stats) +{ + return lru_gen_sum_memcg_stats_for_gen(-1, stats); +} + +/* + * If lru_gen aging should force page table scanning. + * + * If you want to set this to false, you will need to do eviction + * before doing extra aging passes. + */ +static const bool force_scan = true; + +static void run_aging_impl(unsigned long memcg_id, int node_id, int max_gen) +{ + FILE *f = fopen(LRU_GEN_DEBUGFS, "w"); + char *command; + size_t sz; + + TEST_ASSERT(f, "fopen(%s) failed", LRU_GEN_DEBUGFS); + sz = asprintf(&command, "+ %lu %d %d 1 %d\n", + memcg_id, node_id, max_gen, force_scan); + TEST_ASSERT(sz > 0, "creating aging command failed"); + + pr_debug("Running aging command: %s", command); + if (fwrite(command, sizeof(char), sz, f) < sz) { + TEST_ASSERT(false, "writing aging command %s to %s failed", + command, LRU_GEN_DEBUGFS); + } + + TEST_ASSERT(!fclose(f), "fclose(%s) failed", LRU_GEN_DEBUGFS); +} + +void lru_gen_do_aging(struct memcg_stats *stats, const char *memcg) +{ + int node, gen; + + pr_debug("lru_gen: invoking aging...\n"); + + /* Must read memcg stats to construct the proper aging command. */ + lru_gen_read_memcg_stats(stats, memcg); + + for (node = 0; node < stats->nr_nodes; ++node) { + int max_gen = 0; + + for (gen = 0; gen < stats->nodes[node].nr_gens; ++gen) { + int this_gen = stats->nodes[node].gens[gen].gen; + + max_gen = max_gen > this_gen ? max_gen : this_gen; + } + + run_aging_impl(stats->memcg_id, stats->nodes[node].node, + max_gen); + } + + /* Re-read so callers get updated information */ + lru_gen_read_memcg_stats(stats, memcg); +} + +/* + * Find which generation contains at least @pages pages, assuming that + * such a generation exists. + */ +int lru_gen_find_generation(const struct memcg_stats *stats, + unsigned long pages) +{ + int node, gen, gen_idx, min_gen = INT_MAX, max_gen = -1; + + for (node = 0; node < stats->nr_nodes; ++node) + for (gen_idx = 0; gen_idx < stats->nodes[node].nr_gens; + ++gen_idx) { + gen = stats->nodes[node].gens[gen_idx].gen; + max_gen = gen > max_gen ? gen : max_gen; + min_gen = gen < min_gen ? gen : min_gen; + } + + for (gen = min_gen; gen <= max_gen; ++gen) + /* See if this generation has enough pages. */ + if (lru_gen_sum_memcg_stats_for_gen(gen, stats) > pages) + return gen; + + return -1; +} + +bool lru_gen_usable(void) +{ + long required_features = LRU_GEN_ENABLED | LRU_GEN_MM_WALK; + int lru_gen_fd, lru_gen_debug_fd; + char mglru_feature_str[8] = {}; + long mglru_features; + + lru_gen_fd = open(LRU_GEN_ENABLED_PATH, O_RDONLY); + if (lru_gen_fd < 0) { + puts("lru_gen: Could not open " LRU_GEN_ENABLED_PATH); + return false; + } + if (read(lru_gen_fd, &mglru_feature_str, 7) < 7) { + puts("lru_gen: Could not read from " LRU_GEN_ENABLED_PATH); + close(lru_gen_fd); + return false; + } + close(lru_gen_fd); + + mglru_features = strtol(mglru_feature_str, NULL, 16); + if ((mglru_features & required_features) != required_features) { + printf("lru_gen: missing features, got: 0x%lx, expected: 0x%lx\n", + mglru_features, required_features); + printf("lru_gen: Try 'echo 0x%lx > /sys/kernel/mm/lru_gen/enabled'\n", + required_features); + return false; + } + + lru_gen_debug_fd = open(LRU_GEN_DEBUGFS, O_RDWR); + __TEST_REQUIRE(lru_gen_debug_fd >= 0, + "lru_gen: Could not open " LRU_GEN_DEBUGFS ", " + "but lru_gen is enabled, so cannot use page_idle."); + close(lru_gen_debug_fd); + return true; +} diff --git a/tools/testing/selftests/kvm/lib/test_util.c b/tools/testing/selftests/kvm/lib/test_util.c index 8ed0b74ae837..03eb99af9b8d 100644 --- a/tools/testing/selftests/kvm/lib/test_util.c +++ b/tools/testing/selftests/kvm/lib/test_util.c @@ -132,37 +132,57 @@ void print_skip(const char *fmt, ...) puts(", skipping test"); } -bool thp_configured(void) +static bool test_sysfs_path(const char *path) { - int ret; struct stat statbuf; + int ret; - ret = stat("/sys/kernel/mm/transparent_hugepage", &statbuf); + ret = stat(path, &statbuf); TEST_ASSERT(ret == 0 || (ret == -1 && errno == ENOENT), - "Error in stating /sys/kernel/mm/transparent_hugepage"); + "Error in stat()ing '%s'", path); return ret == 0; } -size_t get_trans_hugepagesz(void) +bool thp_configured(void) +{ + return test_sysfs_path("/sys/kernel/mm/transparent_hugepage"); +} + +static size_t get_sysfs_val(const char *path) { size_t size; FILE *f; int ret; - TEST_ASSERT(thp_configured(), "THP is not configured in host kernel"); - - f = fopen("/sys/kernel/mm/transparent_hugepage/hpage_pmd_size", "r"); - TEST_ASSERT(f != NULL, "Error in opening transparent_hugepage/hpage_pmd_size"); + f = fopen(path, "r"); + TEST_ASSERT(f, "Error opening '%s'", path); ret = fscanf(f, "%ld", &size); + TEST_ASSERT(ret > 0, "Error reading '%s'", path); + + /* Re-scan the input stream to verify the entire file was read. */ ret = fscanf(f, "%ld", &size); - TEST_ASSERT(ret < 1, "Error reading transparent_hugepage/hpage_pmd_size"); - fclose(f); + TEST_ASSERT(ret < 1, "Error reading '%s'", path); + fclose(f); return size; } +size_t get_trans_hugepagesz(void) +{ + TEST_ASSERT(thp_configured(), "THP is not configured in host kernel"); + + return get_sysfs_val("/sys/kernel/mm/transparent_hugepage/hpage_pmd_size"); +} + +bool is_numa_balancing_enabled(void) +{ + if (!test_sysfs_path("/proc/sys/kernel/numa_balancing")) + return false; + return get_sysfs_val("/proc/sys/kernel/numa_balancing") == 1; +} + size_t get_def_hugetlb_pagesz(void) { char buf[64]; diff --git a/tools/testing/selftests/kvm/lib/x86/processor.c b/tools/testing/selftests/kvm/lib/x86/processor.c index bd5a802fa7a5..a92dc1dad085 100644 --- a/tools/testing/selftests/kvm/lib/x86/processor.c +++ b/tools/testing/selftests/kvm/lib/x86/processor.c @@ -639,7 +639,7 @@ void kvm_arch_vm_post_create(struct kvm_vm *vm) sync_global_to_guest(vm, host_cpu_is_amd); sync_global_to_guest(vm, is_forced_emulation_enabled); - if (vm->type == KVM_X86_SEV_VM || vm->type == KVM_X86_SEV_ES_VM) { + if (is_sev_vm(vm)) { struct kvm_sev_init init = { 0 }; vm_sev_ioctl(vm, KVM_SEV_INIT2, &init); @@ -1156,7 +1156,7 @@ void kvm_get_cpu_address_width(unsigned int *pa_bits, unsigned int *va_bits) void kvm_init_vm_address_properties(struct kvm_vm *vm) { - if (vm->type == KVM_X86_SEV_VM || vm->type == KVM_X86_SEV_ES_VM) { + if (is_sev_vm(vm)) { vm->arch.sev_fd = open_sev_dev_path_or_exit(); vm->arch.c_bit = BIT_ULL(this_cpu_property(X86_PROPERTY_SEV_C_BIT)); vm->gpa_tag_mask = vm->arch.c_bit; diff --git a/tools/testing/selftests/kvm/lib/x86/sev.c b/tools/testing/selftests/kvm/lib/x86/sev.c index e9535ee20b7f..c3a9838f4806 100644 --- a/tools/testing/selftests/kvm/lib/x86/sev.c +++ b/tools/testing/selftests/kvm/lib/x86/sev.c @@ -14,7 +14,8 @@ * and find the first range, but that's correct because the condition * expression would cause us to quit the loop. */ -static void encrypt_region(struct kvm_vm *vm, struct userspace_mem_region *region) +static void encrypt_region(struct kvm_vm *vm, struct userspace_mem_region *region, + uint8_t page_type, bool private) { const struct sparsebit *protected_phy_pages = region->protected_phy_pages; const vm_paddr_t gpa_base = region->region.guest_phys_addr; @@ -24,25 +25,35 @@ static void encrypt_region(struct kvm_vm *vm, struct userspace_mem_region *regio if (!sparsebit_any_set(protected_phy_pages)) return; - sev_register_encrypted_memory(vm, region); + if (!is_sev_snp_vm(vm)) + sev_register_encrypted_memory(vm, region); sparsebit_for_each_set_range(protected_phy_pages, i, j) { const uint64_t size = (j - i + 1) * vm->page_size; const uint64_t offset = (i - lowest_page_in_region) * vm->page_size; - sev_launch_update_data(vm, gpa_base + offset, size); + if (private) + vm_mem_set_private(vm, gpa_base + offset, size); + + if (is_sev_snp_vm(vm)) + snp_launch_update_data(vm, gpa_base + offset, + (uint64_t)addr_gpa2hva(vm, gpa_base + offset), + size, page_type); + else + sev_launch_update_data(vm, gpa_base + offset, size); + } } void sev_vm_init(struct kvm_vm *vm) { if (vm->type == KVM_X86_DEFAULT_VM) { - assert(vm->arch.sev_fd == -1); + TEST_ASSERT_EQ(vm->arch.sev_fd, -1); vm->arch.sev_fd = open_sev_dev_path_or_exit(); vm_sev_ioctl(vm, KVM_SEV_INIT, NULL); } else { struct kvm_sev_init init = { 0 }; - assert(vm->type == KVM_X86_SEV_VM); + TEST_ASSERT_EQ(vm->type, KVM_X86_SEV_VM); vm_sev_ioctl(vm, KVM_SEV_INIT2, &init); } } @@ -50,16 +61,24 @@ void sev_vm_init(struct kvm_vm *vm) void sev_es_vm_init(struct kvm_vm *vm) { if (vm->type == KVM_X86_DEFAULT_VM) { - assert(vm->arch.sev_fd == -1); + TEST_ASSERT_EQ(vm->arch.sev_fd, -1); vm->arch.sev_fd = open_sev_dev_path_or_exit(); vm_sev_ioctl(vm, KVM_SEV_ES_INIT, NULL); } else { struct kvm_sev_init init = { 0 }; - assert(vm->type == KVM_X86_SEV_ES_VM); + TEST_ASSERT_EQ(vm->type, KVM_X86_SEV_ES_VM); vm_sev_ioctl(vm, KVM_SEV_INIT2, &init); } } +void snp_vm_init(struct kvm_vm *vm) +{ + struct kvm_sev_init init = { 0 }; + + TEST_ASSERT_EQ(vm->type, KVM_X86_SNP_VM); + vm_sev_ioctl(vm, KVM_SEV_INIT2, &init); +} + void sev_vm_launch(struct kvm_vm *vm, uint32_t policy) { struct kvm_sev_launch_start launch_start = { @@ -76,7 +95,7 @@ void sev_vm_launch(struct kvm_vm *vm, uint32_t policy) TEST_ASSERT_EQ(status.state, SEV_GUEST_STATE_LAUNCH_UPDATE); hash_for_each(vm->regions.slot_hash, ctr, region, slot_node) - encrypt_region(vm, region); + encrypt_region(vm, region, KVM_SEV_PAGE_TYPE_INVALID, false); if (policy & SEV_POLICY_ES) vm_sev_ioctl(vm, KVM_SEV_LAUNCH_UPDATE_VMSA, NULL); @@ -112,6 +131,33 @@ void sev_vm_launch_finish(struct kvm_vm *vm) TEST_ASSERT_EQ(status.state, SEV_GUEST_STATE_RUNNING); } +void snp_vm_launch_start(struct kvm_vm *vm, uint64_t policy) +{ + struct kvm_sev_snp_launch_start launch_start = { + .policy = policy, + }; + + vm_sev_ioctl(vm, KVM_SEV_SNP_LAUNCH_START, &launch_start); +} + +void snp_vm_launch_update(struct kvm_vm *vm) +{ + struct userspace_mem_region *region; + int ctr; + + hash_for_each(vm->regions.slot_hash, ctr, region, slot_node) + encrypt_region(vm, region, KVM_SEV_SNP_PAGE_TYPE_NORMAL, true); + + vm->arch.is_pt_protected = true; +} + +void snp_vm_launch_finish(struct kvm_vm *vm) +{ + struct kvm_sev_snp_launch_finish launch_finish = { 0 }; + + vm_sev_ioctl(vm, KVM_SEV_SNP_LAUNCH_FINISH, &launch_finish); +} + struct kvm_vm *vm_sev_create_with_one_vcpu(uint32_t type, void *guest_code, struct kvm_vcpu **cpu) { @@ -128,8 +174,20 @@ struct kvm_vm *vm_sev_create_with_one_vcpu(uint32_t type, void *guest_code, return vm; } -void vm_sev_launch(struct kvm_vm *vm, uint32_t policy, uint8_t *measurement) +void vm_sev_launch(struct kvm_vm *vm, uint64_t policy, uint8_t *measurement) { + if (is_sev_snp_vm(vm)) { + vm_enable_cap(vm, KVM_CAP_EXIT_HYPERCALL, BIT(KVM_HC_MAP_GPA_RANGE)); + + snp_vm_launch_start(vm, policy); + + snp_vm_launch_update(vm); + + snp_vm_launch_finish(vm); + + return; + } + sev_vm_launch(vm, policy); if (!measurement) diff --git a/tools/testing/selftests/kvm/x86/fastops_test.c b/tools/testing/selftests/kvm/x86/fastops_test.c new file mode 100644 index 000000000000..2ac89d6c1e46 --- /dev/null +++ b/tools/testing/selftests/kvm/x86/fastops_test.c @@ -0,0 +1,165 @@ +// SPDX-License-Identifier: GPL-2.0-only +#include "test_util.h" +#include "kvm_util.h" +#include "processor.h" + +/* + * Execute a fastop() instruction, with or without forced emulation. BT bit 0 + * to set RFLAGS.CF based on whether or not the input is even or odd, so that + * instructions like ADC and SBB are deterministic. + */ +#define guest_execute_fastop_1(FEP, insn, __val, __flags) \ +({ \ + __asm__ __volatile__("bt $0, %[val]\n\t" \ + FEP insn " %[val]\n\t" \ + "pushfq\n\t" \ + "pop %[flags]\n\t" \ + : [val]"+r"(__val), [flags]"=r"(__flags) \ + : : "cc", "memory"); \ +}) + +#define guest_test_fastop_1(insn, type_t, __val) \ +({ \ + type_t val = __val, ex_val = __val, input = __val; \ + uint64_t flags, ex_flags; \ + \ + guest_execute_fastop_1("", insn, ex_val, ex_flags); \ + guest_execute_fastop_1(KVM_FEP, insn, val, flags); \ + \ + __GUEST_ASSERT(val == ex_val, \ + "Wanted 0x%lx for '%s 0x%lx', got 0x%lx", \ + (uint64_t)ex_val, insn, (uint64_t)input, (uint64_t)val); \ + __GUEST_ASSERT(flags == ex_flags, \ + "Wanted flags 0x%lx for '%s 0x%lx', got 0x%lx", \ + ex_flags, insn, (uint64_t)input, flags); \ +}) + +#define guest_execute_fastop_2(FEP, insn, __input, __output, __flags) \ +({ \ + __asm__ __volatile__("bt $0, %[output]\n\t" \ + FEP insn " %[input], %[output]\n\t" \ + "pushfq\n\t" \ + "pop %[flags]\n\t" \ + : [output]"+r"(__output), [flags]"=r"(__flags) \ + : [input]"r"(__input) : "cc", "memory"); \ +}) + +#define guest_test_fastop_2(insn, type_t, __val1, __val2) \ +({ \ + type_t input = __val1, input2 = __val2, output = __val2, ex_output = __val2; \ + uint64_t flags, ex_flags; \ + \ + guest_execute_fastop_2("", insn, input, ex_output, ex_flags); \ + guest_execute_fastop_2(KVM_FEP, insn, input, output, flags); \ + \ + __GUEST_ASSERT(output == ex_output, \ + "Wanted 0x%lx for '%s 0x%lx 0x%lx', got 0x%lx", \ + (uint64_t)ex_output, insn, (uint64_t)input, \ + (uint64_t)input2, (uint64_t)output); \ + __GUEST_ASSERT(flags == ex_flags, \ + "Wanted flags 0x%lx for '%s 0x%lx, 0x%lx', got 0x%lx", \ + ex_flags, insn, (uint64_t)input, (uint64_t)input2, flags); \ +}) + +#define guest_execute_fastop_cl(FEP, insn, __shift, __output, __flags) \ +({ \ + __asm__ __volatile__("bt $0, %[output]\n\t" \ + FEP insn " %%cl, %[output]\n\t" \ + "pushfq\n\t" \ + "pop %[flags]\n\t" \ + : [output]"+r"(__output), [flags]"=r"(__flags) \ + : "c"(__shift) : "cc", "memory"); \ +}) + +#define guest_test_fastop_cl(insn, type_t, __val1, __val2) \ +({ \ + type_t output = __val2, ex_output = __val2, input = __val2; \ + uint8_t shift = __val1; \ + uint64_t flags, ex_flags; \ + \ + guest_execute_fastop_cl("", insn, shift, ex_output, ex_flags); \ + guest_execute_fastop_cl(KVM_FEP, insn, shift, output, flags); \ + \ + __GUEST_ASSERT(output == ex_output, \ + "Wanted 0x%lx for '%s 0x%x, 0x%lx', got 0x%lx", \ + (uint64_t)ex_output, insn, shift, (uint64_t)input, \ + (uint64_t)output); \ + __GUEST_ASSERT(flags == ex_flags, \ + "Wanted flags 0x%lx for '%s 0x%x, 0x%lx', got 0x%lx", \ + ex_flags, insn, shift, (uint64_t)input, flags); \ +}) + +static const uint64_t vals[] = { + 0, + 1, + 2, + 4, + 7, + 0x5555555555555555, + 0xaaaaaaaaaaaaaaaa, + 0xfefefefefefefefe, + 0xffffffffffffffff, +}; + +#define guest_test_fastops(type_t, suffix) \ +do { \ + int i, j; \ + \ + for (i = 0; i < ARRAY_SIZE(vals); i++) { \ + guest_test_fastop_1("dec" suffix, type_t, vals[i]); \ + guest_test_fastop_1("inc" suffix, type_t, vals[i]); \ + guest_test_fastop_1("neg" suffix, type_t, vals[i]); \ + guest_test_fastop_1("not" suffix, type_t, vals[i]); \ + \ + for (j = 0; j < ARRAY_SIZE(vals); j++) { \ + guest_test_fastop_2("add" suffix, type_t, vals[i], vals[j]); \ + guest_test_fastop_2("adc" suffix, type_t, vals[i], vals[j]); \ + guest_test_fastop_2("and" suffix, type_t, vals[i], vals[j]); \ + guest_test_fastop_2("bsf" suffix, type_t, vals[i], vals[j]); \ + guest_test_fastop_2("bsr" suffix, type_t, vals[i], vals[j]); \ + guest_test_fastop_2("bt" suffix, type_t, vals[i], vals[j]); \ + guest_test_fastop_2("btc" suffix, type_t, vals[i], vals[j]); \ + guest_test_fastop_2("btr" suffix, type_t, vals[i], vals[j]); \ + guest_test_fastop_2("bts" suffix, type_t, vals[i], vals[j]); \ + guest_test_fastop_2("cmp" suffix, type_t, vals[i], vals[j]); \ + guest_test_fastop_2("imul" suffix, type_t, vals[i], vals[j]); \ + guest_test_fastop_2("or" suffix, type_t, vals[i], vals[j]); \ + guest_test_fastop_2("sbb" suffix, type_t, vals[i], vals[j]); \ + guest_test_fastop_2("sub" suffix, type_t, vals[i], vals[j]); \ + guest_test_fastop_2("test" suffix, type_t, vals[i], vals[j]); \ + guest_test_fastop_2("xor" suffix, type_t, vals[i], vals[j]); \ + \ + guest_test_fastop_cl("rol" suffix, type_t, vals[i], vals[j]); \ + guest_test_fastop_cl("ror" suffix, type_t, vals[i], vals[j]); \ + guest_test_fastop_cl("rcl" suffix, type_t, vals[i], vals[j]); \ + guest_test_fastop_cl("rcr" suffix, type_t, vals[i], vals[j]); \ + guest_test_fastop_cl("sar" suffix, type_t, vals[i], vals[j]); \ + guest_test_fastop_cl("shl" suffix, type_t, vals[i], vals[j]); \ + guest_test_fastop_cl("shr" suffix, type_t, vals[i], vals[j]); \ + } \ + } \ +} while (0) + +static void guest_code(void) +{ + guest_test_fastops(uint16_t, "w"); + guest_test_fastops(uint32_t, "l"); + guest_test_fastops(uint64_t, "q"); + + GUEST_DONE(); +} + +int main(int argc, char *argv[]) +{ + struct kvm_vcpu *vcpu; + struct kvm_vm *vm; + + TEST_REQUIRE(is_forced_emulation_enabled); + + vm = vm_create_with_one_vcpu(&vcpu, guest_code); + + vcpu_run(vcpu); + TEST_ASSERT_EQ(get_ucall(vcpu, NULL), UCALL_DONE); + + kvm_vm_free(vm); +} diff --git a/tools/testing/selftests/kvm/x86/hyperv_cpuid.c b/tools/testing/selftests/kvm/x86/hyperv_cpuid.c index 4e920705681a..c863a689aa98 100644 --- a/tools/testing/selftests/kvm/x86/hyperv_cpuid.c +++ b/tools/testing/selftests/kvm/x86/hyperv_cpuid.c @@ -22,25 +22,6 @@ static void guest_code(void) { } -static bool smt_possible(void) -{ - char buf[16]; - FILE *f; - bool res = true; - - f = fopen("/sys/devices/system/cpu/smt/control", "r"); - if (f) { - if (fread(buf, sizeof(*buf), sizeof(buf), f) > 0) { - if (!strncmp(buf, "forceoff", 8) || - !strncmp(buf, "notsupported", 12)) - res = false; - } - fclose(f); - } - - return res; -} - static void test_hv_cpuid(struct kvm_vcpu *vcpu, bool evmcs_expected) { const bool has_irqchip = !vcpu || vcpu->vm->has_irqchip; @@ -93,7 +74,7 @@ static void test_hv_cpuid(struct kvm_vcpu *vcpu, bool evmcs_expected) case 0x40000004: test_val = entry->eax & (1UL << 18); - TEST_ASSERT(!!test_val == !smt_possible(), + TEST_ASSERT(!!test_val == !is_smt_possible(), "NoNonArchitecturalCoreSharing bit" " doesn't reflect SMT setting"); diff --git a/tools/testing/selftests/kvm/x86/kvm_buslock_test.c b/tools/testing/selftests/kvm/x86/kvm_buslock_test.c new file mode 100644 index 000000000000..d88500c118eb --- /dev/null +++ b/tools/testing/selftests/kvm/x86/kvm_buslock_test.c @@ -0,0 +1,135 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Copyright (C) 2024 Advanced Micro Devices, Inc. + */ +#include <linux/atomic.h> + +#include "kvm_util.h" +#include "processor.h" +#include "svm_util.h" +#include "vmx.h" +#include "test_util.h" + +#define NR_BUS_LOCKS_PER_LEVEL 100 +#define CACHE_LINE_SIZE 64 + +/* + * To generate a bus lock, carve out a buffer that precisely occupies two cache + * lines and perform an atomic access that splits the two lines. + */ +static u8 buffer[CACHE_LINE_SIZE * 2] __aligned(CACHE_LINE_SIZE); +static atomic_t *val = (void *)&buffer[CACHE_LINE_SIZE - (sizeof(*val) / 2)]; + +static void guest_generate_buslocks(void) +{ + for (int i = 0; i < NR_BUS_LOCKS_PER_LEVEL; i++) + atomic_inc(val); +} + +#define L2_GUEST_STACK_SIZE 64 + +static void l2_guest_code(void) +{ + guest_generate_buslocks(); + GUEST_DONE(); +} + +static void l1_svm_code(struct svm_test_data *svm) +{ + unsigned long l2_guest_stack[L2_GUEST_STACK_SIZE]; + struct vmcb *vmcb = svm->vmcb; + + generic_svm_setup(svm, l2_guest_code, &l2_guest_stack[L2_GUEST_STACK_SIZE]); + run_guest(vmcb, svm->vmcb_gpa); +} + +static void l1_vmx_code(struct vmx_pages *vmx) +{ + unsigned long l2_guest_stack[L2_GUEST_STACK_SIZE]; + + GUEST_ASSERT_EQ(prepare_for_vmx_operation(vmx), true); + GUEST_ASSERT_EQ(load_vmcs(vmx), true); + + prepare_vmcs(vmx, NULL, &l2_guest_stack[L2_GUEST_STACK_SIZE]); + + GUEST_ASSERT(!vmwrite(GUEST_RIP, (u64)l2_guest_code)); + GUEST_ASSERT(!vmlaunch()); +} + +static void guest_code(void *test_data) +{ + guest_generate_buslocks(); + + if (this_cpu_has(X86_FEATURE_SVM)) + l1_svm_code(test_data); + else if (this_cpu_has(X86_FEATURE_VMX)) + l1_vmx_code(test_data); + else + GUEST_DONE(); + + TEST_FAIL("L2 should have signaled 'done'"); +} + +int main(int argc, char *argv[]) +{ + const bool has_nested = kvm_cpu_has(X86_FEATURE_SVM) || kvm_cpu_has(X86_FEATURE_VMX); + vm_vaddr_t nested_test_data_gva; + struct kvm_vcpu *vcpu; + struct kvm_run *run; + struct kvm_vm *vm; + int i, bus_locks = 0; + + TEST_REQUIRE(kvm_has_cap(KVM_CAP_X86_BUS_LOCK_EXIT)); + + vm = vm_create(1); + vm_enable_cap(vm, KVM_CAP_X86_BUS_LOCK_EXIT, KVM_BUS_LOCK_DETECTION_EXIT); + vcpu = vm_vcpu_add(vm, 0, guest_code); + + if (kvm_cpu_has(X86_FEATURE_SVM)) + vcpu_alloc_svm(vm, &nested_test_data_gva); + else + vcpu_alloc_vmx(vm, &nested_test_data_gva); + + vcpu_args_set(vcpu, 1, nested_test_data_gva); + + run = vcpu->run; + + for (i = 0; i <= NR_BUS_LOCKS_PER_LEVEL * (1 + has_nested); i++) { + struct ucall uc; + + vcpu_run(vcpu); + + if (run->exit_reason == KVM_EXIT_IO) { + switch (get_ucall(vcpu, &uc)) { + case UCALL_ABORT: + REPORT_GUEST_ASSERT(uc); + goto done; + case UCALL_SYNC: + continue; + case UCALL_DONE: + goto done; + default: + TEST_FAIL("Unknown ucall 0x%lx.", uc.cmd); + } + } + + TEST_ASSERT_KVM_EXIT_REASON(vcpu, KVM_EXIT_X86_BUS_LOCK); + + /* + * Verify the counter is actually getting incremented, e.g. that + * KVM isn't skipping the instruction. On Intel, the exit is + * trap-like, i.e. the counter should already have been + * incremented. On AMD, it's fault-like, i.e. the counter will + * be incremented when the guest re-executes the instruction. + */ + sync_global_from_guest(vm, *val); + TEST_ASSERT_EQ(atomic_read(val), bus_locks + host_cpu_is_intel); + + bus_locks++; + } + TEST_FAIL("Didn't receive UCALL_DONE, took %u bus lock exits\n", bus_locks); +done: + TEST_ASSERT_EQ(i, bus_locks); + kvm_vm_free(vm); + return 0; +} diff --git a/tools/testing/selftests/kvm/x86/sev_init2_tests.c b/tools/testing/selftests/kvm/x86/sev_init2_tests.c index 3fb967f40c6a..b238615196ad 100644 --- a/tools/testing/selftests/kvm/x86/sev_init2_tests.c +++ b/tools/testing/selftests/kvm/x86/sev_init2_tests.c @@ -28,6 +28,7 @@ int kvm_fd; u64 supported_vmsa_features; bool have_sev_es; +bool have_snp; static int __sev_ioctl(int vm_fd, int cmd_id, void *data) { @@ -83,6 +84,9 @@ void test_vm_types(void) if (have_sev_es) test_init2(KVM_X86_SEV_ES_VM, &(struct kvm_sev_init){}); + if (have_snp) + test_init2(KVM_X86_SNP_VM, &(struct kvm_sev_init){}); + test_init2_invalid(0, &(struct kvm_sev_init){}, "VM type is KVM_X86_DEFAULT_VM"); if (kvm_check_cap(KVM_CAP_VM_TYPES) & BIT(KVM_X86_SW_PROTECTED_VM)) @@ -138,15 +142,24 @@ int main(int argc, char *argv[]) "sev-es: KVM_CAP_VM_TYPES (%x) does not match cpuid (checking %x)", kvm_check_cap(KVM_CAP_VM_TYPES), 1 << KVM_X86_SEV_ES_VM); + have_snp = kvm_cpu_has(X86_FEATURE_SEV_SNP); + TEST_ASSERT(have_snp == !!(kvm_check_cap(KVM_CAP_VM_TYPES) & BIT(KVM_X86_SNP_VM)), + "sev-snp: KVM_CAP_VM_TYPES (%x) indicates SNP support (bit %d), but CPUID does not", + kvm_check_cap(KVM_CAP_VM_TYPES), KVM_X86_SNP_VM); + test_vm_types(); test_flags(KVM_X86_SEV_VM); if (have_sev_es) test_flags(KVM_X86_SEV_ES_VM); + if (have_snp) + test_flags(KVM_X86_SNP_VM); test_features(KVM_X86_SEV_VM, 0); if (have_sev_es) test_features(KVM_X86_SEV_ES_VM, supported_vmsa_features); + if (have_snp) + test_features(KVM_X86_SNP_VM, supported_vmsa_features); return 0; } diff --git a/tools/testing/selftests/kvm/x86/sev_smoke_test.c b/tools/testing/selftests/kvm/x86/sev_smoke_test.c index d97816dc476a..77256c89bb8d 100644 --- a/tools/testing/selftests/kvm/x86/sev_smoke_test.c +++ b/tools/testing/selftests/kvm/x86/sev_smoke_test.c @@ -16,6 +16,18 @@ #define XFEATURE_MASK_X87_AVX (XFEATURE_MASK_FP | XFEATURE_MASK_SSE | XFEATURE_MASK_YMM) +static void guest_snp_code(void) +{ + uint64_t sev_msr = rdmsr(MSR_AMD64_SEV); + + GUEST_ASSERT(sev_msr & MSR_AMD64_SEV_ENABLED); + GUEST_ASSERT(sev_msr & MSR_AMD64_SEV_ES_ENABLED); + GUEST_ASSERT(sev_msr & MSR_AMD64_SEV_SNP_ENABLED); + + wrmsr(MSR_AMD64_SEV_ES_GHCB, GHCB_MSR_TERM_REQ); + vmgexit(); +} + static void guest_sev_es_code(void) { /* TODO: Check CPUID after GHCB-based hypercall support is added. */ @@ -27,7 +39,7 @@ static void guest_sev_es_code(void) * force "termination" to signal "done" via the GHCB MSR protocol. */ wrmsr(MSR_AMD64_SEV_ES_GHCB, GHCB_MSR_TERM_REQ); - __asm__ __volatile__("rep; vmmcall"); + vmgexit(); } static void guest_sev_code(void) @@ -62,7 +74,7 @@ static void compare_xsave(u8 *from_host, u8 *from_guest) abort(); } -static void test_sync_vmsa(uint32_t policy) +static void test_sync_vmsa(uint32_t type, uint64_t policy) { struct kvm_vcpu *vcpu; struct kvm_vm *vm; @@ -72,7 +84,7 @@ static void test_sync_vmsa(uint32_t policy) double x87val = M_PI; struct kvm_xsave __attribute__((aligned(64))) xsave = { 0 }; - vm = vm_sev_create_with_one_vcpu(KVM_X86_SEV_ES_VM, guest_code_xsave, &vcpu); + vm = vm_sev_create_with_one_vcpu(type, guest_code_xsave, &vcpu); gva = vm_vaddr_alloc_shared(vm, PAGE_SIZE, KVM_UTIL_MIN_VADDR, MEM_REGION_TEST_DATA); hva = addr_gva2hva(vm, gva); @@ -89,7 +101,7 @@ static void test_sync_vmsa(uint32_t policy) : "ymm4", "st", "st(1)", "st(2)", "st(3)", "st(4)", "st(5)", "st(6)", "st(7)"); vcpu_xsave_set(vcpu, &xsave); - vm_sev_launch(vm, SEV_POLICY_ES | policy, NULL); + vm_sev_launch(vm, policy, NULL); /* This page is shared, so make it decrypted. */ memset(hva, 0, 4096); @@ -108,14 +120,12 @@ static void test_sync_vmsa(uint32_t policy) kvm_vm_free(vm); } -static void test_sev(void *guest_code, uint64_t policy) +static void test_sev(void *guest_code, uint32_t type, uint64_t policy) { struct kvm_vcpu *vcpu; struct kvm_vm *vm; struct ucall uc; - uint32_t type = policy & SEV_POLICY_ES ? KVM_X86_SEV_ES_VM : KVM_X86_SEV_VM; - vm = vm_sev_create_with_one_vcpu(type, guest_code, &vcpu); /* TODO: Validate the measurement is as expected. */ @@ -124,7 +134,7 @@ static void test_sev(void *guest_code, uint64_t policy) for (;;) { vcpu_run(vcpu); - if (policy & SEV_POLICY_ES) { + if (is_sev_es_vm(vm)) { TEST_ASSERT(vcpu->run->exit_reason == KVM_EXIT_SYSTEM_EVENT, "Wanted SYSTEM_EVENT, got %s", exit_reason_str(vcpu->run->exit_reason)); @@ -161,16 +171,14 @@ static void guest_shutdown_code(void) __asm__ __volatile__("ud2"); } -static void test_sev_es_shutdown(void) +static void test_sev_shutdown(uint32_t type, uint64_t policy) { struct kvm_vcpu *vcpu; struct kvm_vm *vm; - uint32_t type = KVM_X86_SEV_ES_VM; - vm = vm_sev_create_with_one_vcpu(type, guest_shutdown_code, &vcpu); - vm_sev_launch(vm, SEV_POLICY_ES, NULL); + vm_sev_launch(vm, policy, NULL); vcpu_run(vcpu); TEST_ASSERT(vcpu->run->exit_reason == KVM_EXIT_SHUTDOWN, @@ -180,27 +188,42 @@ static void test_sev_es_shutdown(void) kvm_vm_free(vm); } -int main(int argc, char *argv[]) +static void test_sev_smoke(void *guest, uint32_t type, uint64_t policy) { const u64 xf_mask = XFEATURE_MASK_X87_AVX; - TEST_REQUIRE(kvm_cpu_has(X86_FEATURE_SEV)); - - test_sev(guest_sev_code, SEV_POLICY_NO_DBG); - test_sev(guest_sev_code, 0); + if (type == KVM_X86_SNP_VM) + test_sev(guest, type, policy | SNP_POLICY_DBG); + else + test_sev(guest, type, policy | SEV_POLICY_NO_DBG); + test_sev(guest, type, policy); - if (kvm_cpu_has(X86_FEATURE_SEV_ES)) { - test_sev(guest_sev_es_code, SEV_POLICY_ES | SEV_POLICY_NO_DBG); - test_sev(guest_sev_es_code, SEV_POLICY_ES); + if (type == KVM_X86_SEV_VM) + return; - test_sev_es_shutdown(); + test_sev_shutdown(type, policy); - if (kvm_has_cap(KVM_CAP_XCRS) && - (xgetbv(0) & kvm_cpu_supported_xcr0() & xf_mask) == xf_mask) { - test_sync_vmsa(0); - test_sync_vmsa(SEV_POLICY_NO_DBG); - } + if (kvm_has_cap(KVM_CAP_XCRS) && + (xgetbv(0) & kvm_cpu_supported_xcr0() & xf_mask) == xf_mask) { + test_sync_vmsa(type, policy); + if (type == KVM_X86_SNP_VM) + test_sync_vmsa(type, policy | SNP_POLICY_DBG); + else + test_sync_vmsa(type, policy | SEV_POLICY_NO_DBG); } +} + +int main(int argc, char *argv[]) +{ + TEST_REQUIRE(kvm_cpu_has(X86_FEATURE_SEV)); + + test_sev_smoke(guest_sev_code, KVM_X86_SEV_VM, 0); + + if (kvm_cpu_has(X86_FEATURE_SEV_ES)) + test_sev_smoke(guest_sev_es_code, KVM_X86_SEV_ES_VM, SEV_POLICY_ES); + + if (kvm_cpu_has(X86_FEATURE_SEV_SNP)) + test_sev_smoke(guest_snp_code, KVM_X86_SNP_VM, snp_default_policy()); return 0; } diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c index d660a7da3baa..eec82775c5bf 100644 --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c @@ -3798,7 +3798,7 @@ EXPORT_SYMBOL_GPL(kvm_vcpu_wake_up); /* * Kick a sleeping VCPU, or a guest VCPU in guest mode, into host kernel mode. */ -void kvm_vcpu_kick(struct kvm_vcpu *vcpu) +void __kvm_vcpu_kick(struct kvm_vcpu *vcpu, bool wait) { int me, cpu; @@ -3827,13 +3827,24 @@ void kvm_vcpu_kick(struct kvm_vcpu *vcpu) */ if (kvm_arch_vcpu_should_kick(vcpu)) { cpu = READ_ONCE(vcpu->cpu); - if (cpu != me && (unsigned)cpu < nr_cpu_ids && cpu_online(cpu)) - smp_send_reschedule(cpu); + if (cpu != me && (unsigned int)cpu < nr_cpu_ids && cpu_online(cpu)) { + /* + * Use a reschedule IPI to kick the vCPU if the caller + * doesn't need to wait for a response, as KVM allows + * kicking vCPUs while IRQs are disabled, but using the + * SMP function call framework with IRQs disabled can + * deadlock due to taking cross-CPU locks. + */ + if (wait) + smp_call_function_single(cpu, ack_kick, NULL, wait); + else + smp_send_reschedule(cpu); + } } out: put_cpu(); } -EXPORT_SYMBOL_GPL(kvm_vcpu_kick); +EXPORT_SYMBOL_GPL(__kvm_vcpu_kick); #endif /* !CONFIG_S390 */ int kvm_vcpu_yield_to(struct kvm_vcpu *target) @@ -5824,7 +5835,6 @@ static int __kvm_io_bus_write(struct kvm_vcpu *vcpu, struct kvm_io_bus *bus, return -EOPNOTSUPP; } -/* kvm_io_bus_write - called under kvm->slots_lock */ int kvm_io_bus_write(struct kvm_vcpu *vcpu, enum kvm_bus bus_idx, gpa_t addr, int len, const void *val) { @@ -5845,7 +5855,6 @@ int kvm_io_bus_write(struct kvm_vcpu *vcpu, enum kvm_bus bus_idx, gpa_t addr, } EXPORT_SYMBOL_GPL(kvm_io_bus_write); -/* kvm_io_bus_write_cookie - called under kvm->slots_lock */ int kvm_io_bus_write_cookie(struct kvm_vcpu *vcpu, enum kvm_bus bus_idx, gpa_t addr, int len, const void *val, long cookie) { @@ -5895,7 +5904,6 @@ static int __kvm_io_bus_read(struct kvm_vcpu *vcpu, struct kvm_io_bus *bus, return -EOPNOTSUPP; } -/* kvm_io_bus_read - called under kvm->slots_lock */ int kvm_io_bus_read(struct kvm_vcpu *vcpu, enum kvm_bus bus_idx, gpa_t addr, int len, void *val) { |