Diffstat (limited to 'tools/testing/selftests/kvm/x86_64')
-rw-r--r--  tools/testing/selftests/kvm/x86_64/amx_test.c  334
-rw-r--r--  tools/testing/selftests/kvm/x86_64/cpuid_test.c  208
-rw-r--r--  tools/testing/selftests/kvm/x86_64/cr4_cpuid_sync_test.c  56
-rw-r--r--  tools/testing/selftests/kvm/x86_64/debug_regs.c  110
-rw-r--r--  tools/testing/selftests/kvm/x86_64/dirty_log_page_splitting_test.c  262
-rw-r--r--  tools/testing/selftests/kvm/x86_64/evmcs_test.c  166
-rw-r--r--  tools/testing/selftests/kvm/x86_64/exit_on_emulation_failure_test.c  42
-rw-r--r--  tools/testing/selftests/kvm/x86_64/fix_hypercall_test.c  144
-rw-r--r--  tools/testing/selftests/kvm/x86_64/flds_emulation.h  52
-rw-r--r--  tools/testing/selftests/kvm/x86_64/get_msr_index_features.c  35
-rw-r--r--  tools/testing/selftests/kvm/x86_64/hwcr_msr_test.c  47
-rw-r--r--  tools/testing/selftests/kvm/x86_64/hyperv_clock.c  263
-rw-r--r--  tools/testing/selftests/kvm/x86_64/hyperv_cpuid.c  134
-rw-r--r--  tools/testing/selftests/kvm/x86_64/hyperv_evmcs.c  310
-rw-r--r--  tools/testing/selftests/kvm/x86_64/hyperv_extended_hypercalls.c  98
-rw-r--r--  tools/testing/selftests/kvm/x86_64/hyperv_features.c  701
-rw-r--r--  tools/testing/selftests/kvm/x86_64/hyperv_ipi.c  313
-rw-r--r--  tools/testing/selftests/kvm/x86_64/hyperv_svm_test.c  200
-rw-r--r--  tools/testing/selftests/kvm/x86_64/hyperv_tlb_flush.c  682
-rw-r--r--  tools/testing/selftests/kvm/x86_64/kvm_clock_test.c  156
-rw-r--r--  tools/testing/selftests/kvm/x86_64/kvm_pv_test.c  154
-rw-r--r--  tools/testing/selftests/kvm/x86_64/max_vcpuid_cap_test.c  44
-rw-r--r--  tools/testing/selftests/kvm/x86_64/mmio_warning_test.c  126
-rw-r--r--  tools/testing/selftests/kvm/x86_64/monitor_mwait_test.c  131
-rw-r--r--  tools/testing/selftests/kvm/x86_64/nested_exceptions_test.c  290
-rw-r--r--  tools/testing/selftests/kvm/x86_64/nx_huge_pages_test.c  269
-rwxr-xr-x  tools/testing/selftests/kvm/x86_64/nx_huge_pages_test.sh  58
-rw-r--r--  tools/testing/selftests/kvm/x86_64/platform_info_test.c  66
-rw-r--r--  tools/testing/selftests/kvm/x86_64/pmu_counters_test.c  620
-rw-r--r--  tools/testing/selftests/kvm/x86_64/pmu_event_filter_test.c  910
-rw-r--r--  tools/testing/selftests/kvm/x86_64/private_mem_conversions_test.c  484
-rw-r--r--  tools/testing/selftests/kvm/x86_64/private_mem_kvm_exits_test.c  120
-rw-r--r--  tools/testing/selftests/kvm/x86_64/recalc_apic_map_test.c  74
-rw-r--r--  tools/testing/selftests/kvm/x86_64/set_boot_cpu_id.c  131
-rw-r--r--  tools/testing/selftests/kvm/x86_64/set_sregs_test.c  108
-rw-r--r--  tools/testing/selftests/kvm/x86_64/sev_migrate_tests.c  397
-rw-r--r--  tools/testing/selftests/kvm/x86_64/sev_smoke_test.c  88
-rw-r--r--  tools/testing/selftests/kvm/x86_64/smaller_maxphyaddr_emulation_test.c  111
-rw-r--r--  tools/testing/selftests/kvm/x86_64/smm_test.c  118
-rw-r--r--  tools/testing/selftests/kvm/x86_64/state_test.c  154
-rw-r--r--  tools/testing/selftests/kvm/x86_64/svm_int_ctl_test.c  121
-rw-r--r--  tools/testing/selftests/kvm/x86_64/svm_nested_shutdown_test.c  62
-rw-r--r--  tools/testing/selftests/kvm/x86_64/svm_nested_soft_inject_test.c  213
-rw-r--r--  tools/testing/selftests/kvm/x86_64/svm_vmcall_test.c  25
-rw-r--r--  tools/testing/selftests/kvm/x86_64/sync_regs_test.c  312
-rw-r--r--  tools/testing/selftests/kvm/x86_64/triple_fault_event_test.c  124
-rw-r--r--  tools/testing/selftests/kvm/x86_64/tsc_msrs_test.c  161
-rw-r--r--  tools/testing/selftests/kvm/x86_64/tsc_scaling_sync.c  110
-rw-r--r--  tools/testing/selftests/kvm/x86_64/ucna_injection_test.c  302
-rw-r--r--  tools/testing/selftests/kvm/x86_64/userspace_io_test.c  103
-rw-r--r--  tools/testing/selftests/kvm/x86_64/userspace_msr_exit_test.c  780
-rw-r--r--  tools/testing/selftests/kvm/x86_64/vmx_apic_access_test.c  124
-rw-r--r--  tools/testing/selftests/kvm/x86_64/vmx_close_while_nested_test.c  31
-rw-r--r--  tools/testing/selftests/kvm/x86_64/vmx_dirty_log_test.c  51
-rw-r--r--  tools/testing/selftests/kvm/x86_64/vmx_exception_with_invalid_guest_state.c  145
-rw-r--r--  tools/testing/selftests/kvm/x86_64/vmx_invalid_nested_guest_state.c  103
-rw-r--r--  tools/testing/selftests/kvm/x86_64/vmx_msrs_test.c  131
-rw-r--r--  tools/testing/selftests/kvm/x86_64/vmx_nested_tsc_scaling_test.c  206
-rw-r--r--  tools/testing/selftests/kvm/x86_64/vmx_pmu_caps_test.c  228
-rw-r--r--  tools/testing/selftests/kvm/x86_64/vmx_preemption_timer_test.c  53
-rw-r--r--  tools/testing/selftests/kvm/x86_64/vmx_set_nested_state_test.c  138
-rw-r--r--  tools/testing/selftests/kvm/x86_64/vmx_tsc_adjust_test.c  30
-rw-r--r--  tools/testing/selftests/kvm/x86_64/xapic_ipi_test.c  491
-rw-r--r--  tools/testing/selftests/kvm/x86_64/xapic_state_test.c  215
-rw-r--r--  tools/testing/selftests/kvm/x86_64/xcr0_cpuid_test.c  137
-rw-r--r--  tools/testing/selftests/kvm/x86_64/xen_shinfo_test.c  1156
-rw-r--r--  tools/testing/selftests/kvm/x86_64/xen_vmcall_test.c  142
-rw-r--r--  tools/testing/selftests/kvm/x86_64/xss_msr_test.c  58
68 files changed, 13656 insertions, 862 deletions
diff --git a/tools/testing/selftests/kvm/x86_64/amx_test.c b/tools/testing/selftests/kvm/x86_64/amx_test.c
new file mode 100644
index 000000000000..eae521f050e0
--- /dev/null
+++ b/tools/testing/selftests/kvm/x86_64/amx_test.c
@@ -0,0 +1,334 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * amx tests
+ *
+ * Copyright (C) 2021, Intel, Inc.
+ *
+ * Tests for amx #NM exception and save/restore.
+ */
+
+#define _GNU_SOURCE /* for program_invocation_short_name */
+#include <fcntl.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <sys/ioctl.h>
+#include <sys/syscall.h>
+
+#include "test_util.h"
+
+#include "kvm_util.h"
+#include "processor.h"
+#include "vmx.h"
+
+#ifndef __x86_64__
+# error This test is 64-bit only
+#endif
+
+#define NUM_TILES 8
+#define TILE_SIZE 1024
+#define XSAVE_SIZE ((NUM_TILES * TILE_SIZE) + PAGE_SIZE)
+
+/* Constants associated with the tile configuration: */
+#define PALETTE_TABLE_INDEX 1
+#define MAX_TILES 16
+#define RESERVED_BYTES 14
+
+#define XSAVE_HDR_OFFSET 512
+
+struct tile_config {
+ u8 palette_id;
+ u8 start_row;
+ u8 reserved[RESERVED_BYTES];
+ u16 colsb[MAX_TILES];
+ u8 rows[MAX_TILES];
+};
+
+struct tile_data {
+ u8 data[NUM_TILES * TILE_SIZE];
+};
+
+struct xtile_info {
+ u16 bytes_per_tile;
+ u16 bytes_per_row;
+ u16 max_names;
+ u16 max_rows;
+ u32 xsave_offset;
+ u32 xsave_size;
+};
+
+static struct xtile_info xtile;
+
+static inline void __ldtilecfg(void *cfg)
+{
+ asm volatile(".byte 0xc4,0xe2,0x78,0x49,0x00"
+ : : "a"(cfg));
+}
+
+static inline void __tileloadd(void *tile)
+{
+ asm volatile(".byte 0xc4,0xe2,0x7b,0x4b,0x04,0x10"
+ : : "a"(tile), "d"(0));
+}
+
+static inline void __tilerelease(void)
+{
+ asm volatile(".byte 0xc4, 0xe2, 0x78, 0x49, 0xc0" ::);
+}
+
+static inline void __xsavec(struct xstate *xstate, uint64_t rfbm)
+{
+ uint32_t rfbm_lo = rfbm;
+ uint32_t rfbm_hi = rfbm >> 32;
+
+ asm volatile("xsavec (%%rdi)"
+ : : "D" (xstate), "a" (rfbm_lo), "d" (rfbm_hi)
+ : "memory");
+}
+
+static void check_xtile_info(void)
+{
+ GUEST_ASSERT(this_cpu_has_p(X86_PROPERTY_XSTATE_MAX_SIZE_XCR0));
+ GUEST_ASSERT(this_cpu_property(X86_PROPERTY_XSTATE_MAX_SIZE_XCR0) <= XSAVE_SIZE);
+
+ xtile.xsave_offset = this_cpu_property(X86_PROPERTY_XSTATE_TILE_OFFSET);
+ GUEST_ASSERT(xtile.xsave_offset == 2816);
+ xtile.xsave_size = this_cpu_property(X86_PROPERTY_XSTATE_TILE_SIZE);
+ GUEST_ASSERT(xtile.xsave_size == 8192);
+ GUEST_ASSERT(sizeof(struct tile_data) >= xtile.xsave_size);
+
+ GUEST_ASSERT(this_cpu_has_p(X86_PROPERTY_AMX_MAX_PALETTE_TABLES));
+ GUEST_ASSERT(this_cpu_property(X86_PROPERTY_AMX_MAX_PALETTE_TABLES) >=
+ PALETTE_TABLE_INDEX);
+
+ GUEST_ASSERT(this_cpu_has_p(X86_PROPERTY_AMX_NR_TILE_REGS));
+ xtile.max_names = this_cpu_property(X86_PROPERTY_AMX_NR_TILE_REGS);
+ GUEST_ASSERT(xtile.max_names == 8);
+ xtile.bytes_per_tile = this_cpu_property(X86_PROPERTY_AMX_BYTES_PER_TILE);
+ GUEST_ASSERT(xtile.bytes_per_tile == 1024);
+ xtile.bytes_per_row = this_cpu_property(X86_PROPERTY_AMX_BYTES_PER_ROW);
+ GUEST_ASSERT(xtile.bytes_per_row == 64);
+ xtile.max_rows = this_cpu_property(X86_PROPERTY_AMX_MAX_ROWS);
+ GUEST_ASSERT(xtile.max_rows == 16);
+}
+
+static void set_tilecfg(struct tile_config *cfg)
+{
+ int i;
+
+ /* Only palette id 1 */
+ cfg->palette_id = 1;
+ for (i = 0; i < xtile.max_names; i++) {
+ cfg->colsb[i] = xtile.bytes_per_row;
+ cfg->rows[i] = xtile.max_rows;
+ }
+}
+
+static void init_regs(void)
+{
+ uint64_t cr4, xcr0;
+
+ GUEST_ASSERT(this_cpu_has(X86_FEATURE_XSAVE));
+
+ /* turn on CR4.OSXSAVE */
+ cr4 = get_cr4();
+ cr4 |= X86_CR4_OSXSAVE;
+ set_cr4(cr4);
+ GUEST_ASSERT(this_cpu_has(X86_FEATURE_OSXSAVE));
+
+ xcr0 = xgetbv(0);
+ xcr0 |= XFEATURE_MASK_XTILE;
+ xsetbv(0x0, xcr0);
+ GUEST_ASSERT((xgetbv(0) & XFEATURE_MASK_XTILE) == XFEATURE_MASK_XTILE);
+}
+
+static void __attribute__((__flatten__)) guest_code(struct tile_config *amx_cfg,
+ struct tile_data *tiledata,
+ struct xstate *xstate)
+{
+ init_regs();
+ check_xtile_info();
+ GUEST_SYNC(1);
+
+ /* xfd=0, enable amx */
+ wrmsr(MSR_IA32_XFD, 0);
+ GUEST_SYNC(2);
+ GUEST_ASSERT(rdmsr(MSR_IA32_XFD) == 0);
+ set_tilecfg(amx_cfg);
+ __ldtilecfg(amx_cfg);
+ GUEST_SYNC(3);
+ /* Check save/restore when trap to userspace */
+ __tileloadd(tiledata);
+ GUEST_SYNC(4);
+ __tilerelease();
+ GUEST_SYNC(5);
+ /*
+ * After XSAVEC, XTILEDATA is cleared in the xstate_bv but is set in
+ * the xcomp_bv.
+ */
+ xstate->header.xstate_bv = XFEATURE_MASK_XTILE_DATA;
+ __xsavec(xstate, XFEATURE_MASK_XTILE_DATA);
+ GUEST_ASSERT(!(xstate->header.xstate_bv & XFEATURE_MASK_XTILE_DATA));
+ GUEST_ASSERT(xstate->header.xcomp_bv & XFEATURE_MASK_XTILE_DATA);
+
+ /* xfd=0x40000, disable amx tiledata */
+ wrmsr(MSR_IA32_XFD, XFEATURE_MASK_XTILE_DATA);
+
+ /*
+ * XTILEDATA is cleared in xstate_bv but set in xcomp_bv, this property
+ * remains the same even when amx tiledata is disabled by IA32_XFD.
+ */
+ xstate->header.xstate_bv = XFEATURE_MASK_XTILE_DATA;
+ __xsavec(xstate, XFEATURE_MASK_XTILE_DATA);
+ GUEST_ASSERT(!(xstate->header.xstate_bv & XFEATURE_MASK_XTILE_DATA));
+ GUEST_ASSERT((xstate->header.xcomp_bv & XFEATURE_MASK_XTILE_DATA));
+
+ GUEST_SYNC(6);
+ GUEST_ASSERT(rdmsr(MSR_IA32_XFD) == XFEATURE_MASK_XTILE_DATA);
+ set_tilecfg(amx_cfg);
+ __ldtilecfg(amx_cfg);
+ /* Trigger #NM exception */
+ __tileloadd(tiledata);
+ GUEST_SYNC(10);
+
+ GUEST_DONE();
+}
+
+void guest_nm_handler(struct ex_regs *regs)
+{
+ /* Check if #NM is triggered by XFEATURE_MASK_XTILE_DATA */
+ GUEST_SYNC(7);
+ GUEST_ASSERT(!(get_cr0() & X86_CR0_TS));
+ GUEST_ASSERT(rdmsr(MSR_IA32_XFD_ERR) == XFEATURE_MASK_XTILE_DATA);
+ GUEST_ASSERT(rdmsr(MSR_IA32_XFD) == XFEATURE_MASK_XTILE_DATA);
+ GUEST_SYNC(8);
+ GUEST_ASSERT(rdmsr(MSR_IA32_XFD_ERR) == XFEATURE_MASK_XTILE_DATA);
+ GUEST_ASSERT(rdmsr(MSR_IA32_XFD) == XFEATURE_MASK_XTILE_DATA);
+ /* Clear xfd_err */
+ wrmsr(MSR_IA32_XFD_ERR, 0);
+ /* xfd=0, enable amx */
+ wrmsr(MSR_IA32_XFD, 0);
+ GUEST_SYNC(9);
+}
+
+int main(int argc, char *argv[])
+{
+ struct kvm_regs regs1, regs2;
+ struct kvm_vcpu *vcpu;
+ struct kvm_vm *vm;
+ struct kvm_x86_state *state;
+ int xsave_restore_size;
+ vm_vaddr_t amx_cfg, tiledata, xstate;
+ struct ucall uc;
+ u32 amx_offset;
+ int ret;
+
+ /*
+ * Note, all off-by-default features must be enabled before anything
+ * caches KVM_GET_SUPPORTED_CPUID, e.g. before using kvm_cpu_has().
+ */
+ vm_xsave_require_permission(XFEATURE_MASK_XTILE_DATA);
+
+ TEST_REQUIRE(kvm_cpu_has(X86_FEATURE_XFD));
+ TEST_REQUIRE(kvm_cpu_has(X86_FEATURE_XSAVE));
+ TEST_REQUIRE(kvm_cpu_has(X86_FEATURE_AMX_TILE));
+ TEST_REQUIRE(kvm_cpu_has(X86_FEATURE_XTILECFG));
+ TEST_REQUIRE(kvm_cpu_has(X86_FEATURE_XTILEDATA));
+ TEST_REQUIRE(kvm_cpu_has(X86_FEATURE_XTILEDATA_XFD));
+
+ /* Create VM */
+ vm = vm_create_with_one_vcpu(&vcpu, guest_code);
+
+ TEST_ASSERT(kvm_cpu_has_p(X86_PROPERTY_XSTATE_MAX_SIZE),
+ "KVM should enumerate max XSAVE size when XSAVE is supported");
+ xsave_restore_size = kvm_cpu_property(X86_PROPERTY_XSTATE_MAX_SIZE);
+
+ vcpu_regs_get(vcpu, &regs1);
+
+ /* Register #NM handler */
+ vm_init_descriptor_tables(vm);
+ vcpu_init_descriptor_tables(vcpu);
+ vm_install_exception_handler(vm, NM_VECTOR, guest_nm_handler);
+
+ /* amx cfg for guest_code */
+ amx_cfg = vm_vaddr_alloc_page(vm);
+ memset(addr_gva2hva(vm, amx_cfg), 0x0, getpagesize());
+
+ /* amx tiledata for guest_code */
+ tiledata = vm_vaddr_alloc_pages(vm, 2);
+ memset(addr_gva2hva(vm, tiledata), rand() | 1, 2 * getpagesize());
+
+ /* XSAVE state for guest_code */
+ xstate = vm_vaddr_alloc_pages(vm, DIV_ROUND_UP(XSAVE_SIZE, PAGE_SIZE));
+ memset(addr_gva2hva(vm, xstate), 0, PAGE_SIZE * DIV_ROUND_UP(XSAVE_SIZE, PAGE_SIZE));
+ vcpu_args_set(vcpu, 3, amx_cfg, tiledata, xstate);
+
+ for (;;) {
+ vcpu_run(vcpu);
+ TEST_ASSERT_KVM_EXIT_REASON(vcpu, KVM_EXIT_IO);
+
+ switch (get_ucall(vcpu, &uc)) {
+ case UCALL_ABORT:
+ REPORT_GUEST_ASSERT(uc);
+ /* NOT REACHED */
+ case UCALL_SYNC:
+ switch (uc.args[1]) {
+ case 1:
+ case 2:
+ case 3:
+ case 5:
+ case 6:
+ case 7:
+ case 8:
+ fprintf(stderr, "GUEST_SYNC(%ld)\n", uc.args[1]);
+ break;
+ case 4:
+ case 10:
+ fprintf(stderr,
+ "GUEST_SYNC(%ld), check save/restore status\n", uc.args[1]);
+
+ /*
+  * Compacted format: the AMX tile data is the last XSAVE
+  * component, so its offset is the XSAVE area size minus
+  * the 8K of tile data.
+  */
+ amx_offset = xsave_restore_size - NUM_TILES*TILE_SIZE;
+ state = vcpu_save_state(vcpu);
+ void *amx_start = (void *)state->xsave + amx_offset;
+ void *tiles_data = (void *)addr_gva2hva(vm, tiledata);
+ /* Only check TMM0 register, 1 tile */
+ ret = memcmp(amx_start, tiles_data, TILE_SIZE);
+ TEST_ASSERT(ret == 0, "memcmp failed, ret=%d", ret);
+ kvm_x86_state_cleanup(state);
+ break;
+ case 9:
+ fprintf(stderr,
+ "GUEST_SYNC(%ld), #NM exception and enable amx\n", uc.args[1]);
+ break;
+ }
+ break;
+ case UCALL_DONE:
+ fprintf(stderr, "UCALL_DONE\n");
+ goto done;
+ default:
+ TEST_FAIL("Unknown ucall %lu", uc.cmd);
+ }
+
+ state = vcpu_save_state(vcpu);
+ memset(&regs1, 0, sizeof(regs1));
+ vcpu_regs_get(vcpu, &regs1);
+
+ kvm_vm_release(vm);
+
+ /* Restore state in a new VM. */
+ vcpu = vm_recreate_with_one_vcpu(vm);
+ vcpu_load_state(vcpu, state);
+ kvm_x86_state_cleanup(state);
+
+ memset(&regs2, 0, sizeof(regs2));
+ vcpu_regs_get(vcpu, &regs2);
+ TEST_ASSERT(!memcmp(&regs1, &regs2, sizeof(regs2)),
+ "Unexpected register values after vcpu_load_state; rdi: %lx rsi: %lx",
+ (ulong) regs2.rdi, (ulong) regs2.rsi);
+ }
+done:
+ kvm_vm_free(vm);
+}
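
The save/restore check above depends on XTILEDATA being the final component of the compacted XSAVE image, so its offset is simply the enumerated XSAVE size minus the 8 KiB of tile data. A minimal standalone sketch of that arithmetic, where the total image size is an assumed example value rather than something read from CPUID:

#include <stdint.h>
#include <stdio.h>

#define NUM_TILES 8
#define TILE_SIZE 1024

int main(void)
{
        uint32_t xsave_restore_size = 11008;    /* assumed example value */
        /* XTILEDATA sits at the end of the compacted image. */
        uint32_t amx_offset = xsave_restore_size - NUM_TILES * TILE_SIZE;

        printf("XTILEDATA expected at offset %u of the compacted XSAVE image\n",
               amx_offset);
        return 0;
}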
diff --git a/tools/testing/selftests/kvm/x86_64/cpuid_test.c b/tools/testing/selftests/kvm/x86_64/cpuid_test.c
new file mode 100644
index 000000000000..8c579ce714e9
--- /dev/null
+++ b/tools/testing/selftests/kvm/x86_64/cpuid_test.c
@@ -0,0 +1,208 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Copyright (C) 2021, Red Hat Inc.
+ *
+ * Generic tests for KVM CPUID set/get ioctls
+ */
+#include <asm/kvm_para.h>
+#include <linux/kvm_para.h>
+#include <stdint.h>
+
+#include "test_util.h"
+#include "kvm_util.h"
+#include "processor.h"
+
+/* CPUIDs known to differ */
+struct {
+ u32 function;
+ u32 index;
+} mangled_cpuids[] = {
+ /*
+ * These entries depend on the vCPU's XCR0 register and IA32_XSS MSR,
+ * which are not controlled for by this test.
+ */
+ {.function = 0xd, .index = 0},
+ {.function = 0xd, .index = 1},
+};
+
+static void test_guest_cpuids(struct kvm_cpuid2 *guest_cpuid)
+{
+ int i;
+ u32 eax, ebx, ecx, edx;
+
+ for (i = 0; i < guest_cpuid->nent; i++) {
+ __cpuid(guest_cpuid->entries[i].function,
+ guest_cpuid->entries[i].index,
+ &eax, &ebx, &ecx, &edx);
+
+ GUEST_ASSERT_EQ(eax, guest_cpuid->entries[i].eax);
+ GUEST_ASSERT_EQ(ebx, guest_cpuid->entries[i].ebx);
+ GUEST_ASSERT_EQ(ecx, guest_cpuid->entries[i].ecx);
+ GUEST_ASSERT_EQ(edx, guest_cpuid->entries[i].edx);
+ }
+
+}
+
+static void guest_main(struct kvm_cpuid2 *guest_cpuid)
+{
+ GUEST_SYNC(1);
+
+ test_guest_cpuids(guest_cpuid);
+
+ GUEST_SYNC(2);
+
+ GUEST_ASSERT_EQ(this_cpu_property(X86_PROPERTY_MAX_KVM_LEAF), 0x40000001);
+
+ GUEST_DONE();
+}
+
+static bool is_cpuid_mangled(const struct kvm_cpuid_entry2 *entrie)
+{
+ int i;
+
+ for (i = 0; i < ARRAY_SIZE(mangled_cpuids); i++) {
+ if (mangled_cpuids[i].function == entrie->function &&
+ mangled_cpuids[i].index == entrie->index)
+ return true;
+ }
+
+ return false;
+}
+
+static void compare_cpuids(const struct kvm_cpuid2 *cpuid1,
+ const struct kvm_cpuid2 *cpuid2)
+{
+ const struct kvm_cpuid_entry2 *e1, *e2;
+ int i;
+
+ TEST_ASSERT(cpuid1->nent == cpuid2->nent,
+ "CPUID nent mismatch: %d vs. %d", cpuid1->nent, cpuid2->nent);
+
+ for (i = 0; i < cpuid1->nent; i++) {
+ e1 = &cpuid1->entries[i];
+ e2 = &cpuid2->entries[i];
+
+ TEST_ASSERT(e1->function == e2->function &&
+ e1->index == e2->index && e1->flags == e2->flags,
+ "CPUID entries[%d] mismtach: 0x%x.%d.%x vs. 0x%x.%d.%x",
+ i, e1->function, e1->index, e1->flags,
+ e2->function, e2->index, e2->flags);
+
+ if (is_cpuid_mangled(e1))
+ continue;
+
+ TEST_ASSERT(e1->eax == e2->eax && e1->ebx == e2->ebx &&
+ e1->ecx == e2->ecx && e1->edx == e2->edx,
+ "CPUID 0x%x.%x differ: 0x%x:0x%x:0x%x:0x%x vs 0x%x:0x%x:0x%x:0x%x",
+ e1->function, e1->index,
+ e1->eax, e1->ebx, e1->ecx, e1->edx,
+ e2->eax, e2->ebx, e2->ecx, e2->edx);
+ }
+}
+
+static void run_vcpu(struct kvm_vcpu *vcpu, int stage)
+{
+ struct ucall uc;
+
+ vcpu_run(vcpu);
+
+ switch (get_ucall(vcpu, &uc)) {
+ case UCALL_SYNC:
+ TEST_ASSERT(!strcmp((const char *)uc.args[0], "hello") &&
+ uc.args[1] == stage + 1,
+ "Stage %d: Unexpected register values vmexit, got %lx",
+ stage + 1, (ulong)uc.args[1]);
+ return;
+ case UCALL_DONE:
+ return;
+ case UCALL_ABORT:
+ REPORT_GUEST_ASSERT(uc);
+ default:
+ TEST_ASSERT(false, "Unexpected exit: %s",
+ exit_reason_str(vcpu->run->exit_reason));
+ }
+}
+
+struct kvm_cpuid2 *vcpu_alloc_cpuid(struct kvm_vm *vm, vm_vaddr_t *p_gva, struct kvm_cpuid2 *cpuid)
+{
+ int size = sizeof(*cpuid) + cpuid->nent * sizeof(cpuid->entries[0]);
+ vm_vaddr_t gva = vm_vaddr_alloc(vm, size, KVM_UTIL_MIN_VADDR);
+ struct kvm_cpuid2 *guest_cpuids = addr_gva2hva(vm, gva);
+
+ memcpy(guest_cpuids, cpuid, size);
+
+ *p_gva = gva;
+ return guest_cpuids;
+}
+
+static void set_cpuid_after_run(struct kvm_vcpu *vcpu)
+{
+ struct kvm_cpuid_entry2 *ent;
+ int rc;
+ u32 eax, ebx, x;
+
+ /* Setting unmodified CPUID is allowed */
+ rc = __vcpu_set_cpuid(vcpu);
+ TEST_ASSERT(!rc, "Setting unmodified CPUID after KVM_RUN failed: %d", rc);
+
+ /* Changing CPU features is forbidden */
+ ent = vcpu_get_cpuid_entry(vcpu, 0x7);
+ ebx = ent->ebx;
+ ent->ebx--;
+ rc = __vcpu_set_cpuid(vcpu);
+ TEST_ASSERT(rc, "Changing CPU features should fail");
+ ent->ebx = ebx;
+
+ /* Changing MAXPHYADDR is forbidden */
+ ent = vcpu_get_cpuid_entry(vcpu, 0x80000008);
+ eax = ent->eax;
+ x = eax & 0xff;
+ ent->eax = (eax & ~0xffu) | (x - 1);
+ rc = __vcpu_set_cpuid(vcpu);
+ TEST_ASSERT(rc, "Changing MAXPHYADDR should fail");
+ ent->eax = eax;
+}
+
+static void test_get_cpuid2(struct kvm_vcpu *vcpu)
+{
+ struct kvm_cpuid2 *cpuid = allocate_kvm_cpuid2(vcpu->cpuid->nent + 1);
+ int i, r;
+
+ vcpu_ioctl(vcpu, KVM_GET_CPUID2, cpuid);
+ TEST_ASSERT(cpuid->nent == vcpu->cpuid->nent,
+ "KVM didn't update nent on success, wanted %u, got %u",
+ vcpu->cpuid->nent, cpuid->nent);
+
+ for (i = 0; i < vcpu->cpuid->nent; i++) {
+ cpuid->nent = i;
+ r = __vcpu_ioctl(vcpu, KVM_GET_CPUID2, cpuid);
+ TEST_ASSERT(r && errno == E2BIG, KVM_IOCTL_ERROR(KVM_GET_CPUID2, r));
+ TEST_ASSERT(cpuid->nent == i, "KVM modified nent on failure");
+ }
+ free(cpuid);
+}
+
+int main(void)
+{
+ struct kvm_vcpu *vcpu;
+ vm_vaddr_t cpuid_gva;
+ struct kvm_vm *vm;
+ int stage;
+
+ vm = vm_create_with_one_vcpu(&vcpu, guest_main);
+
+ compare_cpuids(kvm_get_supported_cpuid(), vcpu->cpuid);
+
+ vcpu_alloc_cpuid(vm, &cpuid_gva, vcpu->cpuid);
+
+ vcpu_args_set(vcpu, 1, cpuid_gva);
+
+ for (stage = 0; stage < 3; stage++)
+ run_vcpu(vcpu, stage);
+
+ set_cpuid_after_run(vcpu);
+
+ test_get_cpuid2(vcpu);
+
+ kvm_vm_free(vm);
+}
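
set_cpuid_after_run() relies on MAXPHYADDR living in bits 7:0 of CPUID.0x80000008.EAX. A small standalone sketch of that encoding, using an assumed example EAX value instead of a real CPUID read:

#include <stdint.h>
#include <stdio.h>

int main(void)
{
        uint32_t eax = 0x00003028;      /* assumed example: MAXPHYADDR = 40 bits */
        uint32_t maxphyaddr = eax & 0xff;
        /* Drop one address bit, mirroring what set_cpuid_after_run() attempts. */
        uint32_t patched = (eax & ~0xffu) | (maxphyaddr - 1);

        printf("MAXPHYADDR %u -> %u (EAX 0x%x -> 0x%x)\n",
               maxphyaddr, patched & 0xff, eax, patched);
        return 0;
}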
diff --git a/tools/testing/selftests/kvm/x86_64/cr4_cpuid_sync_test.c b/tools/testing/selftests/kvm/x86_64/cr4_cpuid_sync_test.c
index 140e91901582..624dc725e14d 100644
--- a/tools/testing/selftests/kvm/x86_64/cr4_cpuid_sync_test.c
+++ b/tools/testing/selftests/kvm/x86_64/cr4_cpuid_sync_test.c
@@ -19,25 +19,11 @@
#include "kvm_util.h"
#include "processor.h"
-#define X86_FEATURE_XSAVE (1<<26)
-#define X86_FEATURE_OSXSAVE (1<<27)
-#define VCPU_ID 1
-
static inline bool cr4_cpuid_is_sync(void)
{
- int func, subfunc;
- uint32_t eax, ebx, ecx, edx;
- uint64_t cr4;
-
- func = 0x1;
- subfunc = 0x0;
- __asm__ __volatile__("cpuid"
- : "=a"(eax), "=b"(ebx), "=c"(ecx), "=d"(edx)
- : "a"(func), "c"(subfunc));
+ uint64_t cr4 = get_cr4();
- cr4 = get_cr4();
-
- return (!!(ecx & X86_FEATURE_OSXSAVE)) == (!!(cr4 & X86_CR4_OSXSAVE));
+ return (this_cpu_has(X86_FEATURE_OSXSAVE) == !!(cr4 & X86_CR4_OSXSAVE));
}
static void guest_code(void)
@@ -63,45 +49,28 @@ static void guest_code(void)
int main(int argc, char *argv[])
{
- struct kvm_run *run;
+ struct kvm_vcpu *vcpu;
struct kvm_vm *vm;
struct kvm_sregs sregs;
- struct kvm_cpuid_entry2 *entry;
struct ucall uc;
- int rc;
- entry = kvm_get_supported_cpuid_entry(1);
- if (!(entry->ecx & X86_FEATURE_XSAVE)) {
- print_skip("XSAVE feature not supported");
- return 0;
- }
+ TEST_REQUIRE(kvm_cpu_has(X86_FEATURE_XSAVE));
- /* Tell stdout not to buffer its content */
- setbuf(stdout, NULL);
-
- /* Create VM */
- vm = vm_create_default(VCPU_ID, 0, guest_code);
- vcpu_set_cpuid(vm, VCPU_ID, kvm_get_supported_cpuid());
- run = vcpu_state(vm, VCPU_ID);
+ vm = vm_create_with_one_vcpu(&vcpu, guest_code);
while (1) {
- rc = _vcpu_run(vm, VCPU_ID);
-
- TEST_ASSERT(rc == 0, "vcpu_run failed: %d\n", rc);
- TEST_ASSERT(run->exit_reason == KVM_EXIT_IO,
- "Unexpected exit reason: %u (%s),\n",
- run->exit_reason,
- exit_reason_str(run->exit_reason));
+ vcpu_run(vcpu);
+ TEST_ASSERT_KVM_EXIT_REASON(vcpu, KVM_EXIT_IO);
- switch (get_ucall(vm, VCPU_ID, &uc)) {
+ switch (get_ucall(vcpu, &uc)) {
case UCALL_SYNC:
/* emulate hypervisor clearing CR4.OSXSAVE */
- vcpu_sregs_get(vm, VCPU_ID, &sregs);
+ vcpu_sregs_get(vcpu, &sregs);
sregs.cr4 &= ~X86_CR4_OSXSAVE;
- vcpu_sregs_set(vm, VCPU_ID, &sregs);
+ vcpu_sregs_set(vcpu, &sregs);
break;
case UCALL_ABORT:
- TEST_FAIL("Guest CR4 bit (OSXSAVE) unsynchronized with CPUID bit.");
+ REPORT_GUEST_ASSERT(uc);
break;
case UCALL_DONE:
goto done;
@@ -110,8 +79,7 @@ int main(int argc, char *argv[])
}
}
- kvm_vm_free(vm);
-
done:
+ kvm_vm_free(vm);
return 0;
}
diff --git a/tools/testing/selftests/kvm/x86_64/debug_regs.c b/tools/testing/selftests/kvm/x86_64/debug_regs.c
index 8162c58a1234..f6b295e0b2d2 100644
--- a/tools/testing/selftests/kvm/x86_64/debug_regs.c
+++ b/tools/testing/selftests/kvm/x86_64/debug_regs.c
@@ -8,12 +8,13 @@
#include <string.h>
#include "kvm_util.h"
#include "processor.h"
-
-#define VCPU_ID 0
+#include "apic.h"
#define DR6_BD (1 << 13)
#define DR7_GD (1 << 13)
+#define IRQ_VECTOR 0xAA
+
/* For testing data access debug BP */
uint32_t guest_value;
@@ -21,6 +22,11 @@ extern unsigned char sw_bp, hw_bp, write_data, ss_start, bd_start;
static void guest_code(void)
{
+ /* Create a pending interrupt on current vCPU */
+ x2apic_enable();
+ x2apic_write_reg(APIC_ICR, APIC_DEST_SELF | APIC_INT_ASSERT |
+ APIC_DM_FIXED | IRQ_VECTOR);
+
/*
* Software BP tests.
*
@@ -38,77 +44,83 @@ static void guest_code(void)
"mov %%rax,%0;\n\t write_data:"
: "=m" (guest_value) : : "rax");
- /* Single step test, covers 2 basic instructions and 2 emulated */
+ /*
+ * Single step test, covers 2 basic instructions and 2 emulated
+ *
+ * Enable interrupts during the single stepping to verify that the
+ * pending interrupt we raised is not delivered while KVM_GUESTDBG_BLOCKIRQ is set
+ */
asm volatile("ss_start: "
- "xor %%rax,%%rax\n\t"
+ "sti\n\t"
+ "xor %%eax,%%eax\n\t"
"cpuid\n\t"
"movl $0x1a0,%%ecx\n\t"
"rdmsr\n\t"
- : : : "rax", "ecx");
+ "cli\n\t"
+ : : : "eax", "ebx", "ecx", "edx");
/* DR6.BD test */
asm volatile("bd_start: mov %%dr0, %%rax" : : : "rax");
GUEST_DONE();
}
-#define CLEAR_DEBUG() memset(&debug, 0, sizeof(debug))
-#define APPLY_DEBUG() vcpu_set_guest_debug(vm, VCPU_ID, &debug)
#define CAST_TO_RIP(v) ((unsigned long long)&(v))
-#define SET_RIP(v) do { \
- vcpu_regs_get(vm, VCPU_ID, &regs); \
- regs.rip = (v); \
- vcpu_regs_set(vm, VCPU_ID, &regs); \
- } while (0)
-#define MOVE_RIP(v) SET_RIP(regs.rip + (v));
+
+static void vcpu_skip_insn(struct kvm_vcpu *vcpu, int insn_len)
+{
+ struct kvm_regs regs;
+
+ vcpu_regs_get(vcpu, &regs);
+ regs.rip += insn_len;
+ vcpu_regs_set(vcpu, &regs);
+}
int main(void)
{
struct kvm_guest_debug debug;
unsigned long long target_dr6, target_rip;
- struct kvm_regs regs;
+ struct kvm_vcpu *vcpu;
struct kvm_run *run;
struct kvm_vm *vm;
struct ucall uc;
uint64_t cmd;
int i;
/* Instruction lengths starting at ss_start */
- int ss_size[4] = {
- 3, /* xor */
+ int ss_size[6] = {
+ 1, /* sti */
+ 2, /* xor */
2, /* cpuid */
5, /* mov */
2, /* rdmsr */
+ 1, /* cli */
};
- if (!kvm_check_cap(KVM_CAP_SET_GUEST_DEBUG)) {
- print_skip("KVM_CAP_SET_GUEST_DEBUG not supported");
- return 0;
- }
+ TEST_REQUIRE(kvm_has_cap(KVM_CAP_SET_GUEST_DEBUG));
- vm = vm_create_default(VCPU_ID, 0, guest_code);
- vcpu_set_cpuid(vm, VCPU_ID, kvm_get_supported_cpuid());
- run = vcpu_state(vm, VCPU_ID);
+ vm = vm_create_with_one_vcpu(&vcpu, guest_code);
+ run = vcpu->run;
/* Test software BPs - int3 */
- CLEAR_DEBUG();
+ memset(&debug, 0, sizeof(debug));
debug.control = KVM_GUESTDBG_ENABLE | KVM_GUESTDBG_USE_SW_BP;
- APPLY_DEBUG();
- vcpu_run(vm, VCPU_ID);
+ vcpu_guest_debug_set(vcpu, &debug);
+ vcpu_run(vcpu);
TEST_ASSERT(run->exit_reason == KVM_EXIT_DEBUG &&
run->debug.arch.exception == BP_VECTOR &&
run->debug.arch.pc == CAST_TO_RIP(sw_bp),
"INT3: exit %d exception %d rip 0x%llx (should be 0x%llx)",
run->exit_reason, run->debug.arch.exception,
run->debug.arch.pc, CAST_TO_RIP(sw_bp));
- MOVE_RIP(1);
+ vcpu_skip_insn(vcpu, 1);
/* Test instruction HW BP over DR[0-3] */
for (i = 0; i < 4; i++) {
- CLEAR_DEBUG();
+ memset(&debug, 0, sizeof(debug));
debug.control = KVM_GUESTDBG_ENABLE | KVM_GUESTDBG_USE_HW_BP;
debug.arch.debugreg[i] = CAST_TO_RIP(hw_bp);
debug.arch.debugreg[7] = 0x400 | (1UL << (2*i+1));
- APPLY_DEBUG();
- vcpu_run(vm, VCPU_ID);
+ vcpu_guest_debug_set(vcpu, &debug);
+ vcpu_run(vcpu);
target_dr6 = 0xffff0ff0 | (1UL << i);
TEST_ASSERT(run->exit_reason == KVM_EXIT_DEBUG &&
run->debug.arch.exception == DB_VECTOR &&
@@ -121,17 +133,17 @@ int main(void)
run->debug.arch.dr6, target_dr6);
}
/* Skip "nop" */
- MOVE_RIP(1);
+ vcpu_skip_insn(vcpu, 1);
/* Test data access HW BP over DR[0-3] */
for (i = 0; i < 4; i++) {
- CLEAR_DEBUG();
+ memset(&debug, 0, sizeof(debug));
debug.control = KVM_GUESTDBG_ENABLE | KVM_GUESTDBG_USE_HW_BP;
debug.arch.debugreg[i] = CAST_TO_RIP(guest_value);
debug.arch.debugreg[7] = 0x00000400 | (1UL << (2*i+1)) |
(0x000d0000UL << (4*i));
- APPLY_DEBUG();
- vcpu_run(vm, VCPU_ID);
+ vcpu_guest_debug_set(vcpu, &debug);
+ vcpu_run(vcpu);
target_dr6 = 0xffff0ff0 | (1UL << i);
TEST_ASSERT(run->exit_reason == KVM_EXIT_DEBUG &&
run->debug.arch.exception == DB_VECTOR &&
@@ -143,22 +155,22 @@ int main(void)
run->debug.arch.pc, CAST_TO_RIP(write_data),
run->debug.arch.dr6, target_dr6);
/* Rollback the 4-bytes "mov" */
- MOVE_RIP(-7);
+ vcpu_skip_insn(vcpu, -7);
}
/* Skip the 4-bytes "mov" */
- MOVE_RIP(7);
+ vcpu_skip_insn(vcpu, 7);
/* Test single step */
target_rip = CAST_TO_RIP(ss_start);
target_dr6 = 0xffff4ff0ULL;
- vcpu_regs_get(vm, VCPU_ID, &regs);
for (i = 0; i < (sizeof(ss_size) / sizeof(ss_size[0])); i++) {
target_rip += ss_size[i];
- CLEAR_DEBUG();
- debug.control = KVM_GUESTDBG_ENABLE | KVM_GUESTDBG_SINGLESTEP;
+ memset(&debug, 0, sizeof(debug));
+ debug.control = KVM_GUESTDBG_ENABLE | KVM_GUESTDBG_SINGLESTEP |
+ KVM_GUESTDBG_BLOCKIRQ;
debug.arch.debugreg[7] = 0x00000400;
- APPLY_DEBUG();
- vcpu_run(vm, VCPU_ID);
+ vcpu_guest_debug_set(vcpu, &debug);
+ vcpu_run(vcpu);
TEST_ASSERT(run->exit_reason == KVM_EXIT_DEBUG &&
run->debug.arch.exception == DB_VECTOR &&
run->debug.arch.pc == target_rip &&
@@ -171,11 +183,11 @@ int main(void)
}
/* Finally test global disable */
- CLEAR_DEBUG();
+ memset(&debug, 0, sizeof(debug));
debug.control = KVM_GUESTDBG_ENABLE | KVM_GUESTDBG_USE_HW_BP;
debug.arch.debugreg[7] = 0x400 | DR7_GD;
- APPLY_DEBUG();
- vcpu_run(vm, VCPU_ID);
+ vcpu_guest_debug_set(vcpu, &debug);
+ vcpu_run(vcpu);
target_dr6 = 0xffff0ff0 | DR6_BD;
TEST_ASSERT(run->exit_reason == KVM_EXIT_DEBUG &&
run->debug.arch.exception == DB_VECTOR &&
@@ -188,12 +200,12 @@ int main(void)
target_dr6);
/* Disable all debug controls, run to the end */
- CLEAR_DEBUG();
- APPLY_DEBUG();
+ memset(&debug, 0, sizeof(debug));
+ vcpu_guest_debug_set(vcpu, &debug);
- vcpu_run(vm, VCPU_ID);
- TEST_ASSERT(run->exit_reason == KVM_EXIT_IO, "KVM_EXIT_IO");
- cmd = get_ucall(vm, VCPU_ID, &uc);
+ vcpu_run(vcpu);
+ TEST_ASSERT_KVM_EXIT_REASON(vcpu, KVM_EXIT_IO);
+ cmd = get_ucall(vcpu, &uc);
TEST_ASSERT(cmd == UCALL_DONE, "UCALL_DONE");
kvm_vm_free(vm);
diff --git a/tools/testing/selftests/kvm/x86_64/dirty_log_page_splitting_test.c b/tools/testing/selftests/kvm/x86_64/dirty_log_page_splitting_test.c
new file mode 100644
index 000000000000..ee3b384b991c
--- /dev/null
+++ b/tools/testing/selftests/kvm/x86_64/dirty_log_page_splitting_test.c
@@ -0,0 +1,262 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * KVM dirty logging page splitting test
+ *
+ * Based on dirty_log_perf.c
+ *
+ * Copyright (C) 2018, Red Hat, Inc.
+ * Copyright (C) 2023, Google, Inc.
+ */
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <pthread.h>
+#include <linux/bitmap.h>
+
+#include "kvm_util.h"
+#include "test_util.h"
+#include "memstress.h"
+#include "guest_modes.h"
+
+#define VCPUS 2
+#define SLOTS 2
+#define ITERATIONS 2
+
+static uint64_t guest_percpu_mem_size = DEFAULT_PER_VCPU_MEM_SIZE;
+
+static enum vm_mem_backing_src_type backing_src = VM_MEM_SRC_ANONYMOUS_HUGETLB;
+
+static u64 dirty_log_manual_caps;
+static bool host_quit;
+static int iteration;
+static int vcpu_last_completed_iteration[KVM_MAX_VCPUS];
+
+struct kvm_page_stats {
+ uint64_t pages_4k;
+ uint64_t pages_2m;
+ uint64_t pages_1g;
+ uint64_t hugepages;
+};
+
+static void get_page_stats(struct kvm_vm *vm, struct kvm_page_stats *stats, const char *stage)
+{
+ stats->pages_4k = vm_get_stat(vm, "pages_4k");
+ stats->pages_2m = vm_get_stat(vm, "pages_2m");
+ stats->pages_1g = vm_get_stat(vm, "pages_1g");
+ stats->hugepages = stats->pages_2m + stats->pages_1g;
+
+ pr_debug("\nPage stats after %s: 4K: %ld 2M: %ld 1G: %ld huge: %ld\n",
+ stage, stats->pages_4k, stats->pages_2m, stats->pages_1g,
+ stats->hugepages);
+}
+
+static void run_vcpu_iteration(struct kvm_vm *vm)
+{
+ int i;
+
+ iteration++;
+ for (i = 0; i < VCPUS; i++) {
+ while (READ_ONCE(vcpu_last_completed_iteration[i]) !=
+ iteration)
+ ;
+ }
+}
+
+static void vcpu_worker(struct memstress_vcpu_args *vcpu_args)
+{
+ struct kvm_vcpu *vcpu = vcpu_args->vcpu;
+ int vcpu_idx = vcpu_args->vcpu_idx;
+
+ while (!READ_ONCE(host_quit)) {
+ int current_iteration = READ_ONCE(iteration);
+
+ vcpu_run(vcpu);
+
+ TEST_ASSERT_EQ(get_ucall(vcpu, NULL), UCALL_SYNC);
+
+ vcpu_last_completed_iteration[vcpu_idx] = current_iteration;
+
+ /* Wait for the start of the next iteration to be signaled. */
+ while (current_iteration == READ_ONCE(iteration) &&
+ READ_ONCE(iteration) >= 0 &&
+ !READ_ONCE(host_quit))
+ ;
+ }
+}
+
+static void run_test(enum vm_guest_mode mode, void *unused)
+{
+ struct kvm_vm *vm;
+ unsigned long **bitmaps;
+ uint64_t guest_num_pages;
+ uint64_t host_num_pages;
+ uint64_t pages_per_slot;
+ int i;
+ struct kvm_page_stats stats_populated;
+ struct kvm_page_stats stats_dirty_logging_enabled;
+ struct kvm_page_stats stats_dirty_pass[ITERATIONS];
+ struct kvm_page_stats stats_clear_pass[ITERATIONS];
+ struct kvm_page_stats stats_dirty_logging_disabled;
+ struct kvm_page_stats stats_repopulated;
+
+ vm = memstress_create_vm(mode, VCPUS, guest_percpu_mem_size,
+ SLOTS, backing_src, false);
+
+ guest_num_pages = (VCPUS * guest_percpu_mem_size) >> vm->page_shift;
+ guest_num_pages = vm_adjust_num_guest_pages(mode, guest_num_pages);
+ host_num_pages = vm_num_host_pages(mode, guest_num_pages);
+ pages_per_slot = host_num_pages / SLOTS;
+ TEST_ASSERT_EQ(host_num_pages, pages_per_slot * SLOTS);
+ TEST_ASSERT(!(host_num_pages % 512),
+ "Number of pages, '%lu' not a multiple of 2MiB", host_num_pages);
+
+ bitmaps = memstress_alloc_bitmaps(SLOTS, pages_per_slot);
+
+ if (dirty_log_manual_caps)
+ vm_enable_cap(vm, KVM_CAP_MANUAL_DIRTY_LOG_PROTECT2,
+ dirty_log_manual_caps);
+
+ /* Start the iterations */
+ iteration = -1;
+ host_quit = false;
+
+ for (i = 0; i < VCPUS; i++)
+ vcpu_last_completed_iteration[i] = -1;
+
+ memstress_start_vcpu_threads(VCPUS, vcpu_worker);
+
+ run_vcpu_iteration(vm);
+ get_page_stats(vm, &stats_populated, "populating memory");
+
+ /* Enable dirty logging */
+ memstress_enable_dirty_logging(vm, SLOTS);
+
+ get_page_stats(vm, &stats_dirty_logging_enabled, "enabling dirty logging");
+
+ while (iteration < ITERATIONS) {
+ run_vcpu_iteration(vm);
+ get_page_stats(vm, &stats_dirty_pass[iteration - 1],
+ "dirtying memory");
+
+ memstress_get_dirty_log(vm, bitmaps, SLOTS);
+
+ if (dirty_log_manual_caps) {
+ memstress_clear_dirty_log(vm, bitmaps, SLOTS, pages_per_slot);
+
+ get_page_stats(vm, &stats_clear_pass[iteration - 1], "clearing dirty log");
+ }
+ }
+
+ /* Disable dirty logging */
+ memstress_disable_dirty_logging(vm, SLOTS);
+
+ get_page_stats(vm, &stats_dirty_logging_disabled, "disabling dirty logging");
+
+ /* Run vCPUs again to fault pages back in. */
+ run_vcpu_iteration(vm);
+ get_page_stats(vm, &stats_repopulated, "repopulating memory");
+
+ /*
+ * Tell the vCPU threads to quit. No need to manually check that vCPUs
+ * have stopped running after disabling dirty logging, the join will
+ * wait for them to exit.
+ */
+ host_quit = true;
+ memstress_join_vcpu_threads(VCPUS);
+
+ memstress_free_bitmaps(bitmaps, SLOTS);
+ memstress_destroy_vm(vm);
+
+ TEST_ASSERT_EQ((stats_populated.pages_2m * 512 +
+ stats_populated.pages_1g * 512 * 512), host_num_pages);
+
+ /*
+ * Check that all huge pages were split. Since large pages can only
+ * exist in the data slot, and the vCPUs should have dirtied all pages
+ * in the data slot, there should be no huge pages left after splitting.
+ * Splitting happens at dirty log enable time without
+ * KVM_CAP_MANUAL_DIRTY_LOG_PROTECT2 and after the first clear pass
+ * with that capability.
+ */
+ if (dirty_log_manual_caps) {
+ TEST_ASSERT_EQ(stats_clear_pass[0].hugepages, 0);
+ TEST_ASSERT(stats_clear_pass[0].pages_4k >= host_num_pages,
+ "Expected at least '%lu' 4KiB pages, found only '%lu'",
+ host_num_pages, stats_clear_pass[0].pages_4k);
+ TEST_ASSERT_EQ(stats_dirty_logging_enabled.hugepages, stats_populated.hugepages);
+ } else {
+ TEST_ASSERT_EQ(stats_dirty_logging_enabled.hugepages, 0);
+ TEST_ASSERT(stats_dirty_logging_enabled.pages_4k >= host_num_pages,
+ "Expected at least '%lu' 4KiB pages, found only '%lu'",
+ host_num_pages, stats_dirty_logging_enabled.pages_4k);
+ }
+
+ /*
+ * Once dirty logging is disabled and the vCPUs have touched all their
+ * memory again, the hugepage counts should be the same as they were
+ * right after initial population of memory.
+ */
+ TEST_ASSERT_EQ(stats_populated.pages_2m, stats_repopulated.pages_2m);
+ TEST_ASSERT_EQ(stats_populated.pages_1g, stats_repopulated.pages_1g);
+}
+
+static void help(char *name)
+{
+ puts("");
+ printf("usage: %s [-h] [-b vcpu bytes] [-s mem type]\n",
+ name);
+ puts("");
+ printf(" -b: specify the size of the memory region which should be\n"
+ " dirtied by each vCPU. e.g. 10M or 3G.\n"
+ " (default: 1G)\n");
+ backing_src_help("-s");
+ puts("");
+}
+
+int main(int argc, char *argv[])
+{
+ int opt;
+
+ TEST_REQUIRE(get_kvm_param_bool("eager_page_split"));
+ TEST_REQUIRE(get_kvm_param_bool("tdp_mmu"));
+
+ while ((opt = getopt(argc, argv, "b:hs:")) != -1) {
+ switch (opt) {
+ case 'b':
+ guest_percpu_mem_size = parse_size(optarg);
+ break;
+ case 'h':
+ help(argv[0]);
+ exit(0);
+ case 's':
+ backing_src = parse_backing_src_type(optarg);
+ break;
+ default:
+ help(argv[0]);
+ exit(1);
+ }
+ }
+
+ if (!is_backing_src_hugetlb(backing_src)) {
+ pr_info("This test will only work reliably with HugeTLB memory. "
+ "It can work with THP, but that is best effort.\n");
+ }
+
+ guest_modes_append_default();
+
+ dirty_log_manual_caps = 0;
+ for_each_guest_mode(run_test, NULL);
+
+ dirty_log_manual_caps =
+ kvm_check_cap(KVM_CAP_MANUAL_DIRTY_LOG_PROTECT2);
+
+ if (dirty_log_manual_caps) {
+ dirty_log_manual_caps &= (KVM_DIRTY_LOG_MANUAL_PROTECT_ENABLE |
+ KVM_DIRTY_LOG_INITIALLY_SET);
+ for_each_guest_mode(run_test, NULL);
+ } else {
+ pr_info("Skipping testing with MANUAL_PROTECT as it is not supported");
+ }
+
+ return 0;
+}
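
The final assertions above hinge on simple hugepage accounting: a 2M page covers 512 4K pages and a 1G page covers 512 * 512 of them, so after population the hugepage-backed total must equal the whole region. A standalone sketch of that check with assumed example stat values:

#include <stdint.h>
#include <stdio.h>

int main(void)
{
        /* Assumed example stats: 2 GiB of guest memory backed by 2M pages. */
        uint64_t pages_2m = 1024;
        uint64_t pages_1g = 0;
        uint64_t host_num_pages = 2ull * 1024 * 1024 * 1024 / 4096;

        uint64_t covered = pages_2m * 512 + pages_1g * 512 * 512;

        printf("%s: %llu of %llu 4K pages are hugepage-backed\n",
               covered == host_num_pages ? "OK" : "MISMATCH",
               (unsigned long long)covered,
               (unsigned long long)host_num_pages);
        return 0;
}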
diff --git a/tools/testing/selftests/kvm/x86_64/evmcs_test.c b/tools/testing/selftests/kvm/x86_64/evmcs_test.c
deleted file mode 100644
index 757928199f19..000000000000
--- a/tools/testing/selftests/kvm/x86_64/evmcs_test.c
+++ /dev/null
@@ -1,166 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-/*
- * Copyright (C) 2018, Red Hat, Inc.
- *
- * Tests for Enlightened VMCS, including nested guest state.
- */
-#define _GNU_SOURCE /* for program_invocation_short_name */
-#include <fcntl.h>
-#include <stdio.h>
-#include <stdlib.h>
-#include <string.h>
-#include <sys/ioctl.h>
-
-#include "test_util.h"
-
-#include "kvm_util.h"
-
-#include "vmx.h"
-
-#define VCPU_ID 5
-
-void l2_guest_code(void)
-{
- GUEST_SYNC(7);
-
- GUEST_SYNC(8);
-
- /* Done, exit to L1 and never come back. */
- vmcall();
-}
-
-void l1_guest_code(struct vmx_pages *vmx_pages)
-{
-#define L2_GUEST_STACK_SIZE 64
- unsigned long l2_guest_stack[L2_GUEST_STACK_SIZE];
-
- enable_vp_assist(vmx_pages->vp_assist_gpa, vmx_pages->vp_assist);
-
- GUEST_ASSERT(vmx_pages->vmcs_gpa);
- GUEST_ASSERT(prepare_for_vmx_operation(vmx_pages));
- GUEST_SYNC(3);
- GUEST_ASSERT(load_vmcs(vmx_pages));
- GUEST_ASSERT(vmptrstz() == vmx_pages->enlightened_vmcs_gpa);
-
- GUEST_SYNC(4);
- GUEST_ASSERT(vmptrstz() == vmx_pages->enlightened_vmcs_gpa);
-
- prepare_vmcs(vmx_pages, l2_guest_code,
- &l2_guest_stack[L2_GUEST_STACK_SIZE]);
-
- GUEST_SYNC(5);
- GUEST_ASSERT(vmptrstz() == vmx_pages->enlightened_vmcs_gpa);
- current_evmcs->revision_id = -1u;
- GUEST_ASSERT(vmlaunch());
- current_evmcs->revision_id = EVMCS_VERSION;
- GUEST_SYNC(6);
-
- GUEST_ASSERT(!vmlaunch());
- GUEST_ASSERT(vmptrstz() == vmx_pages->enlightened_vmcs_gpa);
- GUEST_SYNC(9);
- GUEST_ASSERT(!vmresume());
- GUEST_ASSERT(vmreadz(VM_EXIT_REASON) == EXIT_REASON_VMCALL);
- GUEST_SYNC(10);
-}
-
-void guest_code(struct vmx_pages *vmx_pages)
-{
- GUEST_SYNC(1);
- GUEST_SYNC(2);
-
- if (vmx_pages)
- l1_guest_code(vmx_pages);
-
- GUEST_DONE();
-
- /* Try enlightened vmptrld with an incorrect GPA */
- evmcs_vmptrld(0xdeadbeef, vmx_pages->enlightened_vmcs);
- GUEST_ASSERT(vmlaunch());
-}
-
-int main(int argc, char *argv[])
-{
- vm_vaddr_t vmx_pages_gva = 0;
-
- struct kvm_regs regs1, regs2;
- struct kvm_vm *vm;
- struct kvm_run *run;
- struct kvm_x86_state *state;
- struct ucall uc;
- int stage;
-
- /* Create VM */
- vm = vm_create_default(VCPU_ID, 0, guest_code);
-
- vcpu_set_cpuid(vm, VCPU_ID, kvm_get_supported_cpuid());
-
- if (!nested_vmx_supported() ||
- !kvm_check_cap(KVM_CAP_NESTED_STATE) ||
- !kvm_check_cap(KVM_CAP_HYPERV_ENLIGHTENED_VMCS)) {
- print_skip("Enlightened VMCS is unsupported");
- exit(KSFT_SKIP);
- }
-
- vcpu_enable_evmcs(vm, VCPU_ID);
-
- run = vcpu_state(vm, VCPU_ID);
-
- vcpu_regs_get(vm, VCPU_ID, &regs1);
-
- vcpu_alloc_vmx(vm, &vmx_pages_gva);
- vcpu_args_set(vm, VCPU_ID, 1, vmx_pages_gva);
-
- for (stage = 1;; stage++) {
- _vcpu_run(vm, VCPU_ID);
- TEST_ASSERT(run->exit_reason == KVM_EXIT_IO,
- "Stage %d: unexpected exit reason: %u (%s),\n",
- stage, run->exit_reason,
- exit_reason_str(run->exit_reason));
-
- switch (get_ucall(vm, VCPU_ID, &uc)) {
- case UCALL_ABORT:
- TEST_FAIL("%s at %s:%ld", (const char *)uc.args[0],
- __FILE__, uc.args[1]);
- /* NOT REACHED */
- case UCALL_SYNC:
- break;
- case UCALL_DONE:
- goto part1_done;
- default:
- TEST_FAIL("Unknown ucall %lu", uc.cmd);
- }
-
- /* UCALL_SYNC is handled here. */
- TEST_ASSERT(!strcmp((const char *)uc.args[0], "hello") &&
- uc.args[1] == stage, "Stage %d: Unexpected register values vmexit, got %lx",
- stage, (ulong)uc.args[1]);
-
- state = vcpu_save_state(vm, VCPU_ID);
- memset(&regs1, 0, sizeof(regs1));
- vcpu_regs_get(vm, VCPU_ID, &regs1);
-
- kvm_vm_release(vm);
-
- /* Restore state in a new VM. */
- kvm_vm_restart(vm, O_RDWR);
- vm_vcpu_add(vm, VCPU_ID);
- vcpu_set_cpuid(vm, VCPU_ID, kvm_get_supported_cpuid());
- vcpu_enable_evmcs(vm, VCPU_ID);
- vcpu_load_state(vm, VCPU_ID, state);
- run = vcpu_state(vm, VCPU_ID);
- free(state);
-
- memset(&regs2, 0, sizeof(regs2));
- vcpu_regs_get(vm, VCPU_ID, &regs2);
- TEST_ASSERT(!memcmp(&regs1, &regs2, sizeof(regs2)),
- "Unexpected register values after vcpu_load_state; rdi: %lx rsi: %lx",
- (ulong) regs2.rdi, (ulong) regs2.rsi);
- }
-
-part1_done:
- _vcpu_run(vm, VCPU_ID);
- TEST_ASSERT(run->exit_reason == KVM_EXIT_SHUTDOWN,
- "Unexpected successful VMEnter with invalid eVMCS pointer!");
-
- kvm_vm_free(vm);
-}
diff --git a/tools/testing/selftests/kvm/x86_64/exit_on_emulation_failure_test.c b/tools/testing/selftests/kvm/x86_64/exit_on_emulation_failure_test.c
new file mode 100644
index 000000000000..6c2e5e0ceb1f
--- /dev/null
+++ b/tools/testing/selftests/kvm/x86_64/exit_on_emulation_failure_test.c
@@ -0,0 +1,42 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Copyright (C) 2022, Google LLC.
+ *
+ * Test for KVM_CAP_EXIT_ON_EMULATION_FAILURE.
+ */
+
+#define _GNU_SOURCE /* for program_invocation_short_name */
+
+#include "flds_emulation.h"
+
+#include "test_util.h"
+
+#define MMIO_GPA 0x700000000
+#define MMIO_GVA MMIO_GPA
+
+static void guest_code(void)
+{
+ /* Execute flds with an MMIO address to force KVM to emulate it. */
+ flds(MMIO_GVA);
+ GUEST_DONE();
+}
+
+int main(int argc, char *argv[])
+{
+ struct kvm_vcpu *vcpu;
+ struct kvm_vm *vm;
+
+ TEST_REQUIRE(kvm_has_cap(KVM_CAP_EXIT_ON_EMULATION_FAILURE));
+
+ vm = vm_create_with_one_vcpu(&vcpu, guest_code);
+ vm_enable_cap(vm, KVM_CAP_EXIT_ON_EMULATION_FAILURE, 1);
+ virt_map(vm, MMIO_GVA, MMIO_GPA, 1);
+
+ vcpu_run(vcpu);
+ handle_flds_emulation_failure_exit(vcpu);
+ vcpu_run(vcpu);
+ TEST_ASSERT_EQ(get_ucall(vcpu, NULL), UCALL_DONE);
+
+ kvm_vm_free(vm);
+ return 0;
+}
diff --git a/tools/testing/selftests/kvm/x86_64/fix_hypercall_test.c b/tools/testing/selftests/kvm/x86_64/fix_hypercall_test.c
new file mode 100644
index 000000000000..f3c2239228b1
--- /dev/null
+++ b/tools/testing/selftests/kvm/x86_64/fix_hypercall_test.c
@@ -0,0 +1,144 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Copyright (C) 2020, Google LLC.
+ *
+ * Tests for KVM paravirtual feature disablement
+ */
+#include <asm/kvm_para.h>
+#include <linux/kvm_para.h>
+#include <linux/stringify.h>
+#include <stdint.h>
+
+#include "kvm_test_harness.h"
+#include "apic.h"
+#include "test_util.h"
+#include "kvm_util.h"
+#include "processor.h"
+
+/* VMCALL and VMMCALL are both 3-byte opcodes. */
+#define HYPERCALL_INSN_SIZE 3
+
+static bool quirk_disabled;
+
+static void guest_ud_handler(struct ex_regs *regs)
+{
+ regs->rax = -EFAULT;
+ regs->rip += HYPERCALL_INSN_SIZE;
+}
+
+static const uint8_t vmx_vmcall[HYPERCALL_INSN_SIZE] = { 0x0f, 0x01, 0xc1 };
+static const uint8_t svm_vmmcall[HYPERCALL_INSN_SIZE] = { 0x0f, 0x01, 0xd9 };
+
+extern uint8_t hypercall_insn[HYPERCALL_INSN_SIZE];
+static uint64_t do_sched_yield(uint8_t apic_id)
+{
+ uint64_t ret;
+
+ asm volatile("hypercall_insn:\n\t"
+ ".byte 0xcc,0xcc,0xcc\n\t"
+ : "=a"(ret)
+ : "a"((uint64_t)KVM_HC_SCHED_YIELD), "b"((uint64_t)apic_id)
+ : "memory");
+
+ return ret;
+}
+
+static void guest_main(void)
+{
+ const uint8_t *native_hypercall_insn;
+ const uint8_t *other_hypercall_insn;
+ uint64_t ret;
+
+ if (host_cpu_is_intel) {
+ native_hypercall_insn = vmx_vmcall;
+ other_hypercall_insn = svm_vmmcall;
+ } else if (host_cpu_is_amd) {
+ native_hypercall_insn = svm_vmmcall;
+ other_hypercall_insn = vmx_vmcall;
+ } else {
+ GUEST_ASSERT(0);
+ /* unreachable */
+ return;
+ }
+
+ memcpy(hypercall_insn, other_hypercall_insn, HYPERCALL_INSN_SIZE);
+
+ ret = do_sched_yield(GET_APIC_ID_FIELD(xapic_read_reg(APIC_ID)));
+
+ /*
+ * If the quirk is disabled, verify that guest_ud_handler() "returned"
+ * -EFAULT and that KVM did NOT patch the hypercall. If the quirk is
+ * enabled, verify that the hypercall succeeded and that KVM patched in
+ * the "right" hypercall.
+ */
+ if (quirk_disabled) {
+ GUEST_ASSERT(ret == (uint64_t)-EFAULT);
+ GUEST_ASSERT(!memcmp(other_hypercall_insn, hypercall_insn,
+ HYPERCALL_INSN_SIZE));
+ } else {
+ GUEST_ASSERT(!ret);
+ GUEST_ASSERT(!memcmp(native_hypercall_insn, hypercall_insn,
+ HYPERCALL_INSN_SIZE));
+ }
+
+ GUEST_DONE();
+}
+
+KVM_ONE_VCPU_TEST_SUITE(fix_hypercall);
+
+static void enter_guest(struct kvm_vcpu *vcpu)
+{
+ struct kvm_run *run = vcpu->run;
+ struct ucall uc;
+
+ vcpu_run(vcpu);
+ switch (get_ucall(vcpu, &uc)) {
+ case UCALL_SYNC:
+ pr_info("%s: %016lx\n", (const char *)uc.args[2], uc.args[3]);
+ break;
+ case UCALL_DONE:
+ return;
+ case UCALL_ABORT:
+ REPORT_GUEST_ASSERT(uc);
+ default:
+ TEST_FAIL("Unhandled ucall: %ld\nexit_reason: %u (%s)",
+ uc.cmd, run->exit_reason, exit_reason_str(run->exit_reason));
+ }
+}
+
+static void test_fix_hypercall(struct kvm_vcpu *vcpu, bool disable_quirk)
+{
+ struct kvm_vm *vm = vcpu->vm;
+
+ vm_init_descriptor_tables(vm);
+ vcpu_init_descriptor_tables(vcpu);
+ vm_install_exception_handler(vcpu->vm, UD_VECTOR, guest_ud_handler);
+
+ if (disable_quirk)
+ vm_enable_cap(vm, KVM_CAP_DISABLE_QUIRKS2,
+ KVM_X86_QUIRK_FIX_HYPERCALL_INSN);
+
+ quirk_disabled = disable_quirk;
+ sync_global_to_guest(vm, quirk_disabled);
+
+ virt_pg_map(vm, APIC_DEFAULT_GPA, APIC_DEFAULT_GPA);
+
+ enter_guest(vcpu);
+}
+
+KVM_ONE_VCPU_TEST(fix_hypercall, enable_quirk, guest_main)
+{
+ test_fix_hypercall(vcpu, false);
+}
+
+KVM_ONE_VCPU_TEST(fix_hypercall, disable_quirk, guest_main)
+{
+ test_fix_hypercall(vcpu, true);
+}
+
+int main(int argc, char *argv[])
+{
+ TEST_REQUIRE(kvm_check_cap(KVM_CAP_DISABLE_QUIRKS2) & KVM_X86_QUIRK_FIX_HYPERCALL_INSN);
+
+ return test_harness_run(argc, argv);
+}
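
The patching check above works because VMCALL and VMMCALL are both 3-byte opcodes that differ only in the last byte. A standalone sketch that mimics the before/after state of the instruction buffer; no KVM is involved, the second memcpy simply stands in for what the quirk does:

#include <stdint.h>
#include <stdio.h>
#include <string.h>

static const uint8_t vmx_vmcall[3]  = { 0x0f, 0x01, 0xc1 };     /* VMCALL */
static const uint8_t svm_vmmcall[3] = { 0x0f, 0x01, 0xd9 };     /* VMMCALL */

int main(void)
{
        uint8_t insn[3];

        /* The guest deliberately writes the "other" vendor's instruction... */
        memcpy(insn, svm_vmmcall, sizeof(insn));

        /* ...and, with the quirk enabled, KVM rewrites it to the native one. */
        memcpy(insn, vmx_vmcall, sizeof(insn));

        printf("buffer now holds %s\n",
               !memcmp(insn, vmx_vmcall, 3) ? "VMCALL" : "VMMCALL");
        return 0;
}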
diff --git a/tools/testing/selftests/kvm/x86_64/flds_emulation.h b/tools/testing/selftests/kvm/x86_64/flds_emulation.h
new file mode 100644
index 000000000000..37b1a9f52864
--- /dev/null
+++ b/tools/testing/selftests/kvm/x86_64/flds_emulation.h
@@ -0,0 +1,52 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+#ifndef SELFTEST_KVM_FLDS_EMULATION_H
+#define SELFTEST_KVM_FLDS_EMULATION_H
+
+#include "kvm_util.h"
+
+#define FLDS_MEM_EAX ".byte 0xd9, 0x00"
+
+/*
+ * flds is an instruction that the KVM instruction emulator is known not to
+ * support. This can be used in guest code along with a mechanism to force
+ * KVM to emulate the instruction (e.g. by providing an MMIO address) to
+ * exercise emulation failures.
+ */
+static inline void flds(uint64_t address)
+{
+ __asm__ __volatile__(FLDS_MEM_EAX :: "a"(address));
+}
+
+static inline void handle_flds_emulation_failure_exit(struct kvm_vcpu *vcpu)
+{
+ struct kvm_run *run = vcpu->run;
+ struct kvm_regs regs;
+ uint8_t *insn_bytes;
+ uint64_t flags;
+
+ TEST_ASSERT_KVM_EXIT_REASON(vcpu, KVM_EXIT_INTERNAL_ERROR);
+
+ TEST_ASSERT(run->emulation_failure.suberror == KVM_INTERNAL_ERROR_EMULATION,
+ "Unexpected suberror: %u",
+ run->emulation_failure.suberror);
+
+ flags = run->emulation_failure.flags;
+ TEST_ASSERT(run->emulation_failure.ndata >= 3 &&
+ flags & KVM_INTERNAL_ERROR_EMULATION_FLAG_INSTRUCTION_BYTES,
+ "run->emulation_failure is missing instruction bytes");
+
+ TEST_ASSERT(run->emulation_failure.insn_size >= 2,
+ "Expected a 2-byte opcode for 'flds', got %d bytes",
+ run->emulation_failure.insn_size);
+
+ insn_bytes = run->emulation_failure.insn_bytes;
+ TEST_ASSERT(insn_bytes[0] == 0xd9 && insn_bytes[1] == 0,
+ "Expected 'flds [eax]', opcode '0xd9 0x00', got opcode 0x%02x 0x%02x",
+ insn_bytes[0], insn_bytes[1]);
+
+ vcpu_regs_get(vcpu, &regs);
+ regs.rip += 2;
+ vcpu_regs_set(vcpu, &regs);
+}
+
+#endif /* !SELFTEST_KVM_FLDS_EMULATION_H */
diff --git a/tools/testing/selftests/kvm/x86_64/get_msr_index_features.c b/tools/testing/selftests/kvm/x86_64/get_msr_index_features.c
new file mode 100644
index 000000000000..d09b3cbcadc6
--- /dev/null
+++ b/tools/testing/selftests/kvm/x86_64/get_msr_index_features.c
@@ -0,0 +1,35 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Test that KVM_GET_MSR_INDEX_LIST and
+ * KVM_GET_MSR_FEATURE_INDEX_LIST work as intended
+ *
+ * Copyright (C) 2020, Red Hat, Inc.
+ */
+#include <fcntl.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <sys/ioctl.h>
+
+#include "test_util.h"
+#include "kvm_util.h"
+#include "processor.h"
+
+int main(int argc, char *argv[])
+{
+ const struct kvm_msr_list *feature_list;
+ int i;
+
+ /*
+ * Skip the entire test if MSR_FEATURES isn't supported; other tests
+ * will cover the "regular" list of MSRs, and the coverage here is
+ * purely opportunistic and not interesting on its own.
+ */
+ TEST_REQUIRE(kvm_has_cap(KVM_CAP_GET_MSR_FEATURES));
+
+ (void)kvm_get_msr_index_list();
+
+ feature_list = kvm_get_feature_msr_index_list();
+ for (i = 0; i < feature_list->nmsrs; i++)
+ kvm_get_feature_msr(feature_list->indices[i]);
+}
diff --git a/tools/testing/selftests/kvm/x86_64/hwcr_msr_test.c b/tools/testing/selftests/kvm/x86_64/hwcr_msr_test.c
new file mode 100644
index 000000000000..df351ae17029
--- /dev/null
+++ b/tools/testing/selftests/kvm/x86_64/hwcr_msr_test.c
@@ -0,0 +1,47 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Copyright (C) 2023, Google LLC.
+ */
+
+#define _GNU_SOURCE /* for program_invocation_short_name */
+#include <sys/ioctl.h>
+
+#include "test_util.h"
+#include "kvm_util.h"
+#include "vmx.h"
+
+void test_hwcr_bit(struct kvm_vcpu *vcpu, unsigned int bit)
+{
+ const uint64_t ignored = BIT_ULL(3) | BIT_ULL(6) | BIT_ULL(8);
+ const uint64_t valid = BIT_ULL(18) | BIT_ULL(24);
+ const uint64_t legal = ignored | valid;
+ uint64_t val = BIT_ULL(bit);
+ uint64_t actual;
+ int r;
+
+ r = _vcpu_set_msr(vcpu, MSR_K7_HWCR, val);
+ TEST_ASSERT(val & ~legal ? !r : r == 1,
+ "Expected KVM_SET_MSRS(MSR_K7_HWCR) = 0x%lx to %s",
+ val, val & ~legal ? "fail" : "succeed");
+
+ actual = vcpu_get_msr(vcpu, MSR_K7_HWCR);
+ TEST_ASSERT(actual == (val & valid),
+ "Bit %u: unexpected HWCR 0x%lx; expected 0x%lx",
+ bit, actual, (val & valid));
+
+ vcpu_set_msr(vcpu, MSR_K7_HWCR, 0);
+}
+
+int main(int argc, char *argv[])
+{
+ struct kvm_vm *vm;
+ struct kvm_vcpu *vcpu;
+ unsigned int bit;
+
+ vm = vm_create_with_one_vcpu(&vcpu, NULL);
+
+ for (bit = 0; bit < BITS_PER_LONG; bit++)
+ test_hwcr_bit(vcpu, bit);
+
+ kvm_vm_free(vm);
+}
diff --git a/tools/testing/selftests/kvm/x86_64/hyperv_clock.c b/tools/testing/selftests/kvm/x86_64/hyperv_clock.c
new file mode 100644
index 000000000000..e058bc676cd6
--- /dev/null
+++ b/tools/testing/selftests/kvm/x86_64/hyperv_clock.c
@@ -0,0 +1,263 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Copyright (C) 2021, Red Hat, Inc.
+ *
+ * Tests for Hyper-V clocksources
+ */
+#include "test_util.h"
+#include "kvm_util.h"
+#include "processor.h"
+#include "hyperv.h"
+
+struct ms_hyperv_tsc_page {
+ volatile u32 tsc_sequence;
+ u32 reserved1;
+ volatile u64 tsc_scale;
+ volatile s64 tsc_offset;
+} __packed;
+
+/* Simplified mul_u64_u64_shr() */
+static inline u64 mul_u64_u64_shr64(u64 a, u64 b)
+{
+ union {
+ u64 ll;
+ struct {
+ u32 low, high;
+ } l;
+ } rm, rn, rh, a0, b0;
+ u64 c;
+
+ a0.ll = a;
+ b0.ll = b;
+
+ rm.ll = (u64)a0.l.low * b0.l.high;
+ rn.ll = (u64)a0.l.high * b0.l.low;
+ rh.ll = (u64)a0.l.high * b0.l.high;
+
+ rh.l.low = c = rm.l.high + rn.l.high + rh.l.low;
+ rh.l.high = (c >> 32) + rh.l.high;
+
+ return rh.ll;
+}
+
+static inline void nop_loop(void)
+{
+ int i;
+
+ for (i = 0; i < 100000000; i++)
+ asm volatile("nop");
+}
+
+static inline void check_tsc_msr_rdtsc(void)
+{
+ u64 tsc_freq, r1, r2, t1, t2;
+ s64 delta_ns;
+
+ tsc_freq = rdmsr(HV_X64_MSR_TSC_FREQUENCY);
+ GUEST_ASSERT(tsc_freq > 0);
+
+ /* For increased accuracy, take the mean of rdtsc() before and after rdmsr() */
+ r1 = rdtsc();
+ t1 = rdmsr(HV_X64_MSR_TIME_REF_COUNT);
+ r1 = (r1 + rdtsc()) / 2;
+ nop_loop();
+ r2 = rdtsc();
+ t2 = rdmsr(HV_X64_MSR_TIME_REF_COUNT);
+ r2 = (r2 + rdtsc()) / 2;
+
+ GUEST_ASSERT(r2 > r1 && t2 > t1);
+
+ /* HV_X64_MSR_TIME_REF_COUNT is in 100ns */
+ delta_ns = ((t2 - t1) * 100) - ((r2 - r1) * 1000000000 / tsc_freq);
+ if (delta_ns < 0)
+ delta_ns = -delta_ns;
+
+ /* 1% tolerance */
+ GUEST_ASSERT(delta_ns * 100 < (t2 - t1) * 100);
+}
+
+static inline u64 get_tscpage_ts(struct ms_hyperv_tsc_page *tsc_page)
+{
+ return mul_u64_u64_shr64(rdtsc(), tsc_page->tsc_scale) + tsc_page->tsc_offset;
+}
+
+static inline void check_tsc_msr_tsc_page(struct ms_hyperv_tsc_page *tsc_page)
+{
+ u64 r1, r2, t1, t2;
+
+ /* Compare TSC page clocksource with HV_X64_MSR_TIME_REF_COUNT */
+ t1 = get_tscpage_ts(tsc_page);
+ r1 = rdmsr(HV_X64_MSR_TIME_REF_COUNT);
+
+ /* 10 ms tolerance */
+ GUEST_ASSERT(r1 >= t1 && r1 - t1 < 100000);
+ nop_loop();
+
+ t2 = get_tscpage_ts(tsc_page);
+ r2 = rdmsr(HV_X64_MSR_TIME_REF_COUNT);
+ GUEST_ASSERT(r2 >= t1 && r2 - t2 < 100000);
+}
+
+static void guest_main(struct ms_hyperv_tsc_page *tsc_page, vm_paddr_t tsc_page_gpa)
+{
+ u64 tsc_scale, tsc_offset;
+
+ /* Set Guest OS id to enable Hyper-V emulation */
+ GUEST_SYNC(1);
+ wrmsr(HV_X64_MSR_GUEST_OS_ID, HYPERV_LINUX_OS_ID);
+ GUEST_SYNC(2);
+
+ check_tsc_msr_rdtsc();
+
+ GUEST_SYNC(3);
+
+ /* Set up the TSC page in disabled state, check that it's clean */
+ wrmsr(HV_X64_MSR_REFERENCE_TSC, tsc_page_gpa);
+ GUEST_ASSERT(tsc_page->tsc_sequence == 0);
+ GUEST_ASSERT(tsc_page->tsc_scale == 0);
+ GUEST_ASSERT(tsc_page->tsc_offset == 0);
+
+ GUEST_SYNC(4);
+
+ /* Set up the TSC page in enabled state */
+ wrmsr(HV_X64_MSR_REFERENCE_TSC, tsc_page_gpa | 0x1);
+ GUEST_ASSERT(tsc_page->tsc_sequence != 0);
+
+ GUEST_SYNC(5);
+
+ check_tsc_msr_tsc_page(tsc_page);
+
+ GUEST_SYNC(6);
+
+ tsc_offset = tsc_page->tsc_offset;
+ /* Call KVM_SET_CLOCK from userspace, check that TSC page was updated */
+
+ GUEST_SYNC(7);
+ /* Sanity check TSC page timestamp, it should be close to 0 */
+ GUEST_ASSERT(get_tscpage_ts(tsc_page) < 100000);
+
+ GUEST_ASSERT(tsc_page->tsc_offset != tsc_offset);
+
+ nop_loop();
+
+ /*
+ * Enable Re-enlightenment and check that TSC page stays constant across
+ * KVM_SET_CLOCK.
+ */
+ wrmsr(HV_X64_MSR_REENLIGHTENMENT_CONTROL, 0x1 << 16 | 0xff);
+ wrmsr(HV_X64_MSR_TSC_EMULATION_CONTROL, 0x1);
+ tsc_offset = tsc_page->tsc_offset;
+ tsc_scale = tsc_page->tsc_scale;
+ GUEST_SYNC(8);
+ GUEST_ASSERT(tsc_page->tsc_offset == tsc_offset);
+ GUEST_ASSERT(tsc_page->tsc_scale == tsc_scale);
+
+ GUEST_SYNC(9);
+
+ check_tsc_msr_tsc_page(tsc_page);
+
+ /*
+ * Disable re-enlightenment and TSC page, check that KVM doesn't update
+ * it anymore.
+ */
+ wrmsr(HV_X64_MSR_REENLIGHTENMENT_CONTROL, 0);
+ wrmsr(HV_X64_MSR_TSC_EMULATION_CONTROL, 0);
+ wrmsr(HV_X64_MSR_REFERENCE_TSC, 0);
+ memset(tsc_page, 0, sizeof(*tsc_page));
+
+ GUEST_SYNC(10);
+ GUEST_ASSERT(tsc_page->tsc_sequence == 0);
+ GUEST_ASSERT(tsc_page->tsc_offset == 0);
+ GUEST_ASSERT(tsc_page->tsc_scale == 0);
+
+ GUEST_DONE();
+}
+
+static void host_check_tsc_msr_rdtsc(struct kvm_vcpu *vcpu)
+{
+ u64 tsc_freq, r1, r2, t1, t2;
+ s64 delta_ns;
+
+ tsc_freq = vcpu_get_msr(vcpu, HV_X64_MSR_TSC_FREQUENCY);
+ TEST_ASSERT(tsc_freq > 0, "TSC frequency must be nonzero");
+
+ /* For increased accuracy, take the mean of rdtsc() before and after the ioctl */
+ r1 = rdtsc();
+ t1 = vcpu_get_msr(vcpu, HV_X64_MSR_TIME_REF_COUNT);
+ r1 = (r1 + rdtsc()) / 2;
+ nop_loop();
+ r2 = rdtsc();
+ t2 = vcpu_get_msr(vcpu, HV_X64_MSR_TIME_REF_COUNT);
+ r2 = (r2 + rdtsc()) / 2;
+
+ TEST_ASSERT(t2 > t1, "Time reference MSR is not monotonic (%ld <= %ld)", t1, t2);
+
+ /* HV_X64_MSR_TIME_REF_COUNT is in 100ns */
+ delta_ns = ((t2 - t1) * 100) - ((r2 - r1) * 1000000000 / tsc_freq);
+ if (delta_ns < 0)
+ delta_ns = -delta_ns;
+
+ /* 1% tolerance */
+ TEST_ASSERT(delta_ns * 100 < (t2 - t1) * 100,
+ "Elapsed time does not match (MSR=%ld, TSC=%ld)",
+ (t2 - t1) * 100, (r2 - r1) * 1000000000 / tsc_freq);
+}
+
+int main(void)
+{
+ struct kvm_vcpu *vcpu;
+ struct kvm_vm *vm;
+ struct ucall uc;
+ vm_vaddr_t tsc_page_gva;
+ int stage;
+
+ TEST_REQUIRE(kvm_has_cap(KVM_CAP_HYPERV_TIME));
+ TEST_REQUIRE(sys_clocksource_is_based_on_tsc());
+
+ vm = vm_create_with_one_vcpu(&vcpu, guest_main);
+
+ vcpu_set_hv_cpuid(vcpu);
+
+ tsc_page_gva = vm_vaddr_alloc_page(vm);
+ memset(addr_gva2hva(vm, tsc_page_gva), 0x0, getpagesize());
+ TEST_ASSERT((addr_gva2gpa(vm, tsc_page_gva) & (getpagesize() - 1)) == 0,
+ "TSC page has to be page aligned");
+ vcpu_args_set(vcpu, 2, tsc_page_gva, addr_gva2gpa(vm, tsc_page_gva));
+
+ host_check_tsc_msr_rdtsc(vcpu);
+
+ for (stage = 1;; stage++) {
+ vcpu_run(vcpu);
+ TEST_ASSERT_KVM_EXIT_REASON(vcpu, KVM_EXIT_IO);
+
+ switch (get_ucall(vcpu, &uc)) {
+ case UCALL_ABORT:
+ REPORT_GUEST_ASSERT(uc);
+ /* NOT REACHED */
+ case UCALL_SYNC:
+ break;
+ case UCALL_DONE:
+ /* Keep in sync with guest_main() */
+ TEST_ASSERT(stage == 11, "Testing ended prematurely, stage %d",
+ stage);
+ goto out;
+ default:
+ TEST_FAIL("Unknown ucall %lu", uc.cmd);
+ }
+
+ TEST_ASSERT(!strcmp((const char *)uc.args[0], "hello") &&
+ uc.args[1] == stage,
+ "Stage %d: Unexpected register values vmexit, got %lx",
+ stage, (ulong)uc.args[1]);
+
+ /* Reset kvmclock triggering TSC page update */
+ if (stage == 7 || stage == 8 || stage == 10) {
+ struct kvm_clock_data clock = {0};
+
+ vm_ioctl(vm, KVM_SET_CLOCK, &clock);
+ }
+ }
+
+out:
+ kvm_vm_free(vm);
+}
diff --git a/tools/testing/selftests/kvm/x86_64/hyperv_cpuid.c b/tools/testing/selftests/kvm/x86_64/hyperv_cpuid.c
index 745b708c2d3b..5c27efbf405e 100644
--- a/tools/testing/selftests/kvm/x86_64/hyperv_cpuid.c
+++ b/tools/testing/selftests/kvm/x86_64/hyperv_cpuid.c
@@ -20,8 +20,6 @@
#include "processor.h"
#include "vmx.h"
-#define VCPU_ID 0
-
static void guest_code(void)
{
}
@@ -45,32 +43,26 @@ static bool smt_possible(void)
return res;
}
-static void test_hv_cpuid(struct kvm_cpuid2 *hv_cpuid_entries,
- bool evmcs_enabled)
+static void test_hv_cpuid(const struct kvm_cpuid2 *hv_cpuid_entries,
+ bool evmcs_expected)
{
int i;
- int nent = 9;
+ int nent_expected = 10;
u32 test_val;
- if (evmcs_enabled)
- nent += 1; /* 0x4000000A */
-
- TEST_ASSERT(hv_cpuid_entries->nent == nent,
+ TEST_ASSERT(hv_cpuid_entries->nent == nent_expected,
"KVM_GET_SUPPORTED_HV_CPUID should return %d entries"
- " with evmcs=%d (returned %d)",
- nent, evmcs_enabled, hv_cpuid_entries->nent);
+ " (returned %d)",
+ nent_expected, hv_cpuid_entries->nent);
for (i = 0; i < hv_cpuid_entries->nent; i++) {
- struct kvm_cpuid_entry2 *entry = &hv_cpuid_entries->entries[i];
+ const struct kvm_cpuid_entry2 *entry = &hv_cpuid_entries->entries[i];
TEST_ASSERT((entry->function >= 0x40000000) &&
(entry->function <= 0x40000082),
"function %x is our of supported range",
entry->function);
- TEST_ASSERT(evmcs_enabled || (entry->function != 0x4000000A),
- "0x4000000A leaf should not be reported");
-
TEST_ASSERT(entry->index == 0,
".index field should be zero");
@@ -87,7 +79,7 @@ static void test_hv_cpuid(struct kvm_cpuid2 *hv_cpuid_entries,
TEST_ASSERT(entry->eax == test_val,
"Wrong max leaf report in 0x40000000.EAX: %x"
" (evmcs=%d)",
- entry->eax, evmcs_enabled
+ entry->eax, evmcs_expected
);
break;
case 0x40000004:
@@ -97,8 +89,20 @@ static void test_hv_cpuid(struct kvm_cpuid2 *hv_cpuid_entries,
"NoNonArchitecturalCoreSharing bit"
" doesn't reflect SMT setting");
break;
- }
+ case 0x4000000A:
+ TEST_ASSERT(entry->eax & (1UL << 19),
+ "Enlightened MSR-Bitmap should always be supported"
+			       " 0x4000000A.EAX: %x", entry->eax);
+ if (evmcs_expected)
+ TEST_ASSERT((entry->eax & 0xffff) == 0x101,
+ "Supported Enlightened VMCS version range is supposed to be 1:1"
+				    " 0x4000000A.EAX: %x", entry->eax);
+
+ break;
+ default:
+ break;
+ }
/*
* If needed for debug:
* fprintf(stdout,
@@ -107,84 +111,64 @@ static void test_hv_cpuid(struct kvm_cpuid2 *hv_cpuid_entries,
* entry->edx);
*/
}
-
}
-void test_hv_cpuid_e2big(struct kvm_vm *vm)
+void test_hv_cpuid_e2big(struct kvm_vm *vm, struct kvm_vcpu *vcpu)
{
static struct kvm_cpuid2 cpuid = {.nent = 0};
int ret;
- ret = _vcpu_ioctl(vm, VCPU_ID, KVM_GET_SUPPORTED_HV_CPUID, &cpuid);
+ if (vcpu)
+ ret = __vcpu_ioctl(vcpu, KVM_GET_SUPPORTED_HV_CPUID, &cpuid);
+ else
+ ret = __kvm_ioctl(vm->kvm_fd, KVM_GET_SUPPORTED_HV_CPUID, &cpuid);
TEST_ASSERT(ret == -1 && errno == E2BIG,
- "KVM_GET_SUPPORTED_HV_CPUID didn't fail with -E2BIG when"
- " it should have: %d %d", ret, errno);
+ "%s KVM_GET_SUPPORTED_HV_CPUID didn't fail with -E2BIG when"
+ " it should have: %d %d", !vcpu ? "KVM" : "vCPU", ret, errno);
}
-
-struct kvm_cpuid2 *kvm_get_supported_hv_cpuid(struct kvm_vm *vm)
+int main(int argc, char *argv[])
{
- int nent = 20; /* should be enough */
- static struct kvm_cpuid2 *cpuid;
-
- cpuid = malloc(sizeof(*cpuid) + nent * sizeof(struct kvm_cpuid_entry2));
-
- if (!cpuid) {
- perror("malloc");
- abort();
- }
-
- cpuid->nent = nent;
+ struct kvm_vm *vm;
+ const struct kvm_cpuid2 *hv_cpuid_entries;
+ struct kvm_vcpu *vcpu;
- vcpu_ioctl(vm, VCPU_ID, KVM_GET_SUPPORTED_HV_CPUID, cpuid);
+ TEST_REQUIRE(kvm_has_cap(KVM_CAP_HYPERV_CPUID));
- return cpuid;
-}
+ vm = vm_create_with_one_vcpu(&vcpu, guest_code);
+ /* Test vCPU ioctl version */
+ test_hv_cpuid_e2big(vm, vcpu);
-int main(int argc, char *argv[])
-{
- struct kvm_vm *vm;
- int rv, stage;
- struct kvm_cpuid2 *hv_cpuid_entries;
- bool evmcs_enabled;
+ hv_cpuid_entries = vcpu_get_supported_hv_cpuid(vcpu);
+ test_hv_cpuid(hv_cpuid_entries, false);
+ free((void *)hv_cpuid_entries);
- /* Tell stdout not to buffer its content */
- setbuf(stdout, NULL);
-
- rv = kvm_check_cap(KVM_CAP_HYPERV_CPUID);
- if (!rv) {
- print_skip("KVM_CAP_HYPERV_CPUID not supported");
- exit(KSFT_SKIP);
+ if (!kvm_cpu_has(X86_FEATURE_VMX) ||
+ !kvm_has_cap(KVM_CAP_HYPERV_ENLIGHTENED_VMCS)) {
+ print_skip("Enlightened VMCS is unsupported");
+ goto do_sys;
+ }
+ vcpu_enable_evmcs(vcpu);
+ hv_cpuid_entries = vcpu_get_supported_hv_cpuid(vcpu);
+ test_hv_cpuid(hv_cpuid_entries, true);
+ free((void *)hv_cpuid_entries);
+
+do_sys:
+ /* Test system ioctl version */
+ if (!kvm_has_cap(KVM_CAP_SYS_HYPERV_CPUID)) {
+ print_skip("KVM_CAP_SYS_HYPERV_CPUID not supported");
+ goto out;
}
- for (stage = 0; stage < 3; stage++) {
- evmcs_enabled = false;
+ test_hv_cpuid_e2big(vm, NULL);
- vm = vm_create_default(VCPU_ID, 0, guest_code);
- switch (stage) {
- case 0:
- test_hv_cpuid_e2big(vm);
- continue;
- case 1:
- break;
- case 2:
- if (!nested_vmx_supported() ||
- !kvm_check_cap(KVM_CAP_HYPERV_ENLIGHTENED_VMCS)) {
- print_skip("Enlightened VMCS is unsupported");
- continue;
- }
- vcpu_enable_evmcs(vm, VCPU_ID);
- evmcs_enabled = true;
- break;
- }
+ hv_cpuid_entries = kvm_get_supported_hv_cpuid();
+ test_hv_cpuid(hv_cpuid_entries, kvm_cpu_has(X86_FEATURE_VMX));
- hv_cpuid_entries = kvm_get_supported_hv_cpuid(vm);
- test_hv_cpuid(hv_cpuid_entries, evmcs_enabled);
- free(hv_cpuid_entries);
- kvm_vm_free(vm);
- }
+out:
+ kvm_vm_free(vm);
return 0;
}
diff --git a/tools/testing/selftests/kvm/x86_64/hyperv_evmcs.c b/tools/testing/selftests/kvm/x86_64/hyperv_evmcs.c
new file mode 100644
index 000000000000..4c7257ecd2a6
--- /dev/null
+++ b/tools/testing/selftests/kvm/x86_64/hyperv_evmcs.c
@@ -0,0 +1,310 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Copyright (C) 2018, Red Hat, Inc.
+ *
+ * Tests for Enlightened VMCS, including nested guest state.
+ */
+#define _GNU_SOURCE /* for program_invocation_short_name */
+#include <fcntl.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <sys/ioctl.h>
+#include <linux/bitmap.h>
+
+#include "test_util.h"
+
+#include "kvm_util.h"
+
+#include "hyperv.h"
+#include "vmx.h"
+
+static int ud_count;
+
+static void guest_ud_handler(struct ex_regs *regs)
+{
+ ud_count++;
+ regs->rip += 3; /* VMLAUNCH */
+}
+
+static void guest_nmi_handler(struct ex_regs *regs)
+{
+}
+
+static inline void rdmsr_from_l2(uint32_t msr)
+{
+ /* Currently, L1 doesn't preserve GPRs during vmexits. */
+ __asm__ __volatile__ ("rdmsr" : : "c"(msr) :
+ "rax", "rbx", "rdx", "rsi", "rdi", "r8", "r9",
+ "r10", "r11", "r12", "r13", "r14", "r15");
+}
+
+/* Exit to L1 from L2 with RDMSR instruction */
+void l2_guest_code(void)
+{
+ u64 unused;
+
+ GUEST_SYNC(7);
+
+ GUEST_SYNC(8);
+
+ /* Forced exit to L1 upon restore */
+ GUEST_SYNC(9);
+
+ vmcall();
+
+ /* MSR-Bitmap tests */
+ rdmsr_from_l2(MSR_FS_BASE); /* intercepted */
+ rdmsr_from_l2(MSR_FS_BASE); /* intercepted */
+ rdmsr_from_l2(MSR_GS_BASE); /* not intercepted */
+ vmcall();
+ rdmsr_from_l2(MSR_GS_BASE); /* intercepted */
+
+ /* L2 TLB flush tests */
+ hyperv_hypercall(HVCALL_FLUSH_VIRTUAL_ADDRESS_SPACE | HV_HYPERCALL_FAST_BIT, 0x0,
+ HV_FLUSH_ALL_VIRTUAL_ADDRESS_SPACES | HV_FLUSH_ALL_PROCESSORS);
+ rdmsr_from_l2(MSR_FS_BASE);
+ /*
+ * Note: hypercall status (RAX) is not preserved correctly by L1 after
+ * synthetic vmexit, use unchecked version.
+ */
+ __hyperv_hypercall(HVCALL_FLUSH_VIRTUAL_ADDRESS_SPACE | HV_HYPERCALL_FAST_BIT, 0x0,
+ HV_FLUSH_ALL_VIRTUAL_ADDRESS_SPACES | HV_FLUSH_ALL_PROCESSORS,
+ &unused);
+
+ /* Done, exit to L1 and never come back. */
+ vmcall();
+}
+
+void guest_code(struct vmx_pages *vmx_pages, struct hyperv_test_pages *hv_pages,
+ vm_vaddr_t hv_hcall_page_gpa)
+{
+#define L2_GUEST_STACK_SIZE 64
+ unsigned long l2_guest_stack[L2_GUEST_STACK_SIZE];
+
+ wrmsr(HV_X64_MSR_GUEST_OS_ID, HYPERV_LINUX_OS_ID);
+ wrmsr(HV_X64_MSR_HYPERCALL, hv_hcall_page_gpa);
+
+ x2apic_enable();
+
+ GUEST_SYNC(1);
+ GUEST_SYNC(2);
+
+ enable_vp_assist(hv_pages->vp_assist_gpa, hv_pages->vp_assist);
+ evmcs_enable();
+
+ GUEST_ASSERT(prepare_for_vmx_operation(vmx_pages));
+ GUEST_SYNC(3);
+ GUEST_ASSERT(load_evmcs(hv_pages));
+ GUEST_ASSERT(vmptrstz() == hv_pages->enlightened_vmcs_gpa);
+
+ GUEST_SYNC(4);
+ GUEST_ASSERT(vmptrstz() == hv_pages->enlightened_vmcs_gpa);
+
+ prepare_vmcs(vmx_pages, l2_guest_code,
+ &l2_guest_stack[L2_GUEST_STACK_SIZE]);
+
+ GUEST_SYNC(5);
+ GUEST_ASSERT(vmptrstz() == hv_pages->enlightened_vmcs_gpa);
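+	/* An invalid eVMCS revision id must make the subsequent VMLAUNCH fail */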
+ current_evmcs->revision_id = -1u;
+ GUEST_ASSERT(vmlaunch());
+ current_evmcs->revision_id = EVMCS_VERSION;
+ GUEST_SYNC(6);
+
+ vmwrite(PIN_BASED_VM_EXEC_CONTROL, vmreadz(PIN_BASED_VM_EXEC_CONTROL) |
+ PIN_BASED_NMI_EXITING);
+
+ /* L2 TLB flush setup */
+ current_evmcs->partition_assist_page = hv_pages->partition_assist_gpa;
+ current_evmcs->hv_enlightenments_control.nested_flush_hypercall = 1;
+ current_evmcs->hv_vm_id = 1;
+ current_evmcs->hv_vp_id = 1;
+ current_vp_assist->nested_control.features.directhypercall = 1;
+ *(u32 *)(hv_pages->partition_assist) = 0;
+
+ GUEST_ASSERT(!vmlaunch());
+ GUEST_ASSERT_EQ(vmreadz(VM_EXIT_REASON), EXIT_REASON_EXCEPTION_NMI);
+ GUEST_ASSERT_EQ((vmreadz(VM_EXIT_INTR_INFO) & 0xff), NMI_VECTOR);
+ GUEST_ASSERT(vmptrstz() == hv_pages->enlightened_vmcs_gpa);
+
+ /*
+	 * NMI forces an L2->L1 exit; resume L2 and hope that the eVMCS is
+	 * up-to-date (RIP points where it should, not at the beginning
+	 * of l2_guest_code()). GUEST_SYNC(9) checks that.
+ */
+ GUEST_ASSERT(!vmresume());
+
+ GUEST_SYNC(10);
+
+ GUEST_ASSERT(vmreadz(VM_EXIT_REASON) == EXIT_REASON_VMCALL);
+ current_evmcs->guest_rip += 3; /* vmcall */
+
+ /* Intercept RDMSR 0xc0000100 */
+ vmwrite(CPU_BASED_VM_EXEC_CONTROL, vmreadz(CPU_BASED_VM_EXEC_CONTROL) |
+ CPU_BASED_USE_MSR_BITMAPS);
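+	/* Offset 0x400 in the VMX MSR bitmap is the read bitmap for high MSRs (0xc0000000-0xc0001fff) */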
+ __set_bit(MSR_FS_BASE & 0x1fff, vmx_pages->msr + 0x400);
+ GUEST_ASSERT(!vmresume());
+ GUEST_ASSERT(vmreadz(VM_EXIT_REASON) == EXIT_REASON_MSR_READ);
+ current_evmcs->guest_rip += 2; /* rdmsr */
+
+ /* Enable enlightened MSR bitmap */
+ current_evmcs->hv_enlightenments_control.msr_bitmap = 1;
+ GUEST_ASSERT(!vmresume());
+ GUEST_ASSERT(vmreadz(VM_EXIT_REASON) == EXIT_REASON_MSR_READ);
+ current_evmcs->guest_rip += 2; /* rdmsr */
+
+ /* Intercept RDMSR 0xc0000101 without telling KVM about it */
+ __set_bit(MSR_GS_BASE & 0x1fff, vmx_pages->msr + 0x400);
+ /* Make sure HV_VMX_ENLIGHTENED_CLEAN_FIELD_MSR_BITMAP is set */
+ current_evmcs->hv_clean_fields |= HV_VMX_ENLIGHTENED_CLEAN_FIELD_MSR_BITMAP;
+ GUEST_ASSERT(!vmresume());
+ /* Make sure we don't see EXIT_REASON_MSR_READ here so eMSR bitmap works */
+ GUEST_ASSERT(vmreadz(VM_EXIT_REASON) == EXIT_REASON_VMCALL);
+ current_evmcs->guest_rip += 3; /* vmcall */
+
+ /* Now tell KVM we've changed MSR-Bitmap */
+ current_evmcs->hv_clean_fields &= ~HV_VMX_ENLIGHTENED_CLEAN_FIELD_MSR_BITMAP;
+ GUEST_ASSERT(!vmresume());
+ GUEST_ASSERT(vmreadz(VM_EXIT_REASON) == EXIT_REASON_MSR_READ);
+ current_evmcs->guest_rip += 2; /* rdmsr */
+
+ /*
+ * L2 TLB flush test. First VMCALL should be handled directly by L0,
+ * no VMCALL exit expected.
+ */
+ GUEST_ASSERT(!vmresume());
+ GUEST_ASSERT(vmreadz(VM_EXIT_REASON) == EXIT_REASON_MSR_READ);
+ current_evmcs->guest_rip += 2; /* rdmsr */
+ /* Enable synthetic vmexit */
+ *(u32 *)(hv_pages->partition_assist) = 1;
+ GUEST_ASSERT(!vmresume());
+ GUEST_ASSERT(vmreadz(VM_EXIT_REASON) == HV_VMX_SYNTHETIC_EXIT_REASON_TRAP_AFTER_FLUSH);
+
+ GUEST_ASSERT(!vmresume());
+ GUEST_ASSERT(vmreadz(VM_EXIT_REASON) == EXIT_REASON_VMCALL);
+ GUEST_SYNC(11);
+
+ /* Try enlightened vmptrld with an incorrect GPA */
+ evmcs_vmptrld(0xdeadbeef, hv_pages->enlightened_vmcs);
+ GUEST_ASSERT(vmlaunch());
+ GUEST_ASSERT(ud_count == 1);
+ GUEST_DONE();
+}
+
+void inject_nmi(struct kvm_vcpu *vcpu)
+{
+ struct kvm_vcpu_events events;
+
+ vcpu_events_get(vcpu, &events);
+
+ events.nmi.pending = 1;
+ events.flags |= KVM_VCPUEVENT_VALID_NMI_PENDING;
+
+ vcpu_events_set(vcpu, &events);
+}
+
+static struct kvm_vcpu *save_restore_vm(struct kvm_vm *vm,
+ struct kvm_vcpu *vcpu)
+{
+ struct kvm_regs regs1, regs2;
+ struct kvm_x86_state *state;
+
+ state = vcpu_save_state(vcpu);
+ memset(&regs1, 0, sizeof(regs1));
+ vcpu_regs_get(vcpu, &regs1);
+
+ kvm_vm_release(vm);
+
+ /* Restore state in a new VM. */
+ vcpu = vm_recreate_with_one_vcpu(vm);
+ vcpu_set_hv_cpuid(vcpu);
+ vcpu_enable_evmcs(vcpu);
+ vcpu_load_state(vcpu, state);
+ kvm_x86_state_cleanup(state);
+
+ memset(&regs2, 0, sizeof(regs2));
+ vcpu_regs_get(vcpu, &regs2);
+ TEST_ASSERT(!memcmp(&regs1, &regs2, sizeof(regs2)),
+ "Unexpected register values after vcpu_load_state; rdi: %lx rsi: %lx",
+ (ulong) regs2.rdi, (ulong) regs2.rsi);
+ return vcpu;
+}
+
+int main(int argc, char *argv[])
+{
+ vm_vaddr_t vmx_pages_gva = 0, hv_pages_gva = 0;
+ vm_vaddr_t hcall_page;
+
+ struct kvm_vcpu *vcpu;
+ struct kvm_vm *vm;
+ struct ucall uc;
+ int stage;
+
+ TEST_REQUIRE(kvm_cpu_has(X86_FEATURE_VMX));
+ TEST_REQUIRE(kvm_has_cap(KVM_CAP_NESTED_STATE));
+ TEST_REQUIRE(kvm_has_cap(KVM_CAP_HYPERV_ENLIGHTENED_VMCS));
+ TEST_REQUIRE(kvm_has_cap(KVM_CAP_HYPERV_DIRECT_TLBFLUSH));
+
+ vm = vm_create_with_one_vcpu(&vcpu, guest_code);
+
+ hcall_page = vm_vaddr_alloc_pages(vm, 1);
+ memset(addr_gva2hva(vm, hcall_page), 0x0, getpagesize());
+
+ vcpu_set_hv_cpuid(vcpu);
+ vcpu_enable_evmcs(vcpu);
+
+ vcpu_alloc_vmx(vm, &vmx_pages_gva);
+ vcpu_alloc_hyperv_test_pages(vm, &hv_pages_gva);
+ vcpu_args_set(vcpu, 3, vmx_pages_gva, hv_pages_gva, addr_gva2gpa(vm, hcall_page));
+ vcpu_set_msr(vcpu, HV_X64_MSR_VP_INDEX, vcpu->id);
+
+ vm_init_descriptor_tables(vm);
+ vcpu_init_descriptor_tables(vcpu);
+ vm_install_exception_handler(vm, UD_VECTOR, guest_ud_handler);
+ vm_install_exception_handler(vm, NMI_VECTOR, guest_nmi_handler);
+
+ pr_info("Running L1 which uses EVMCS to run L2\n");
+
+ for (stage = 1;; stage++) {
+ vcpu_run(vcpu);
+ TEST_ASSERT_KVM_EXIT_REASON(vcpu, KVM_EXIT_IO);
+
+ switch (get_ucall(vcpu, &uc)) {
+ case UCALL_ABORT:
+ REPORT_GUEST_ASSERT(uc);
+ /* NOT REACHED */
+ case UCALL_SYNC:
+ break;
+ case UCALL_DONE:
+ goto done;
+ default:
+ TEST_FAIL("Unknown ucall %lu", uc.cmd);
+ }
+
+ /* UCALL_SYNC is handled here. */
+ TEST_ASSERT(!strcmp((const char *)uc.args[0], "hello") &&
+ uc.args[1] == stage, "Stage %d: Unexpected register values vmexit, got %lx",
+ stage, (ulong)uc.args[1]);
+
+ vcpu = save_restore_vm(vm, vcpu);
+
+ /* Force immediate L2->L1 exit before resuming */
+ if (stage == 8) {
+ pr_info("Injecting NMI into L1 before L2 had a chance to run after restore\n");
+ inject_nmi(vcpu);
+ }
+
+ /*
+ * Do KVM_GET_NESTED_STATE/KVM_SET_NESTED_STATE for a freshly
+ * restored VM (before the first KVM_RUN) to check that
+ * KVM_STATE_NESTED_EVMCS is not lost.
+ */
+ if (stage == 9) {
+ pr_info("Trying extra KVM_GET_NESTED_STATE/KVM_SET_NESTED_STATE cycle\n");
+ vcpu = save_restore_vm(vm, vcpu);
+ }
+ }
+
+done:
+ kvm_vm_free(vm);
+}
diff --git a/tools/testing/selftests/kvm/x86_64/hyperv_extended_hypercalls.c b/tools/testing/selftests/kvm/x86_64/hyperv_extended_hypercalls.c
new file mode 100644
index 000000000000..949e08e98f31
--- /dev/null
+++ b/tools/testing/selftests/kvm/x86_64/hyperv_extended_hypercalls.c
@@ -0,0 +1,98 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Test Hyper-V extended hypercall, HV_EXT_CALL_QUERY_CAPABILITIES (0x8001),
+ * exit to userspace and receive result in guest.
+ *
+ * Negative tests are present in hyperv_features.c
+ *
+ * Copyright 2022 Google LLC
+ * Author: Vipin Sharma <vipinsh@google.com>
+ */
+#include "kvm_util.h"
+#include "processor.h"
+#include "hyperv.h"
+
+/* Any value is fine */
+#define EXT_CAPABILITIES 0xbull
+
+static void guest_code(vm_paddr_t in_pg_gpa, vm_paddr_t out_pg_gpa,
+ vm_vaddr_t out_pg_gva)
+{
+ uint64_t *output_gva;
+
+ wrmsr(HV_X64_MSR_GUEST_OS_ID, HYPERV_LINUX_OS_ID);
+ wrmsr(HV_X64_MSR_HYPERCALL, in_pg_gpa);
+
+ output_gva = (uint64_t *)out_pg_gva;
+
+ hyperv_hypercall(HV_EXT_CALL_QUERY_CAPABILITIES, in_pg_gpa, out_pg_gpa);
+
+ /* TLFS states output will be a uint64_t value */
+ GUEST_ASSERT_EQ(*output_gva, EXT_CAPABILITIES);
+
+ GUEST_DONE();
+}
+
+int main(void)
+{
+ vm_vaddr_t hcall_out_page;
+ vm_vaddr_t hcall_in_page;
+ struct kvm_vcpu *vcpu;
+ struct kvm_run *run;
+ struct kvm_vm *vm;
+ uint64_t *outval;
+ struct ucall uc;
+
+ TEST_REQUIRE(kvm_has_cap(KVM_CAP_HYPERV_CPUID));
+
+ /* Verify if extended hypercalls are supported */
+ if (!kvm_cpuid_has(kvm_get_supported_hv_cpuid(),
+ HV_ENABLE_EXTENDED_HYPERCALLS)) {
+ print_skip("Extended calls not supported by the kernel");
+ exit(KSFT_SKIP);
+ }
+
+ vm = vm_create_with_one_vcpu(&vcpu, guest_code);
+ run = vcpu->run;
+ vcpu_set_hv_cpuid(vcpu);
+
+ /* Hypercall input */
+ hcall_in_page = vm_vaddr_alloc_pages(vm, 1);
+ memset(addr_gva2hva(vm, hcall_in_page), 0x0, vm->page_size);
+
+ /* Hypercall output */
+ hcall_out_page = vm_vaddr_alloc_pages(vm, 1);
+ memset(addr_gva2hva(vm, hcall_out_page), 0x0, vm->page_size);
+
+ vcpu_args_set(vcpu, 3, addr_gva2gpa(vm, hcall_in_page),
+ addr_gva2gpa(vm, hcall_out_page), hcall_out_page);
+
+ vcpu_run(vcpu);
+
+ TEST_ASSERT(run->exit_reason == KVM_EXIT_HYPERV,
+ "Unexpected exit reason: %u (%s)",
+ run->exit_reason, exit_reason_str(run->exit_reason));
+
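+	/* params[1] is the hypercall output page GPA; fill in the capabilities and report success */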
+ outval = addr_gpa2hva(vm, run->hyperv.u.hcall.params[1]);
+ *outval = EXT_CAPABILITIES;
+ run->hyperv.u.hcall.result = HV_STATUS_SUCCESS;
+
+ vcpu_run(vcpu);
+
+ TEST_ASSERT(run->exit_reason == KVM_EXIT_IO,
+ "Unexpected exit reason: %u (%s)",
+ run->exit_reason, exit_reason_str(run->exit_reason));
+
+ switch (get_ucall(vcpu, &uc)) {
+ case UCALL_ABORT:
+ REPORT_GUEST_ASSERT(uc);
+ break;
+ case UCALL_DONE:
+ break;
+ default:
+ TEST_FAIL("Unhandled ucall: %ld", uc.cmd);
+ }
+
+ kvm_vm_free(vm);
+ return 0;
+}
diff --git a/tools/testing/selftests/kvm/x86_64/hyperv_features.c b/tools/testing/selftests/kvm/x86_64/hyperv_features.c
new file mode 100644
index 000000000000..b923a285e96f
--- /dev/null
+++ b/tools/testing/selftests/kvm/x86_64/hyperv_features.c
@@ -0,0 +1,701 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Copyright (C) 2021, Red Hat, Inc.
+ *
+ * Tests for Hyper-V features enablement
+ */
+#include <asm/kvm_para.h>
+#include <linux/kvm_para.h>
+#include <stdint.h>
+
+#include "test_util.h"
+#include "kvm_util.h"
+#include "processor.h"
+#include "hyperv.h"
+
+/*
+ * HYPERV_CPUID_ENLIGHTMENT_INFO.EBX is not a 'feature' CPUID leaf
+ * but to activate the feature it is sufficient to set it to a non-zero
+ * value. Use BIT(0) for that.
+ */
+#define HV_PV_SPINLOCKS_TEST \
+ KVM_X86_CPU_FEATURE(HYPERV_CPUID_ENLIGHTMENT_INFO, 0, EBX, 0)
+
+struct msr_data {
+ uint32_t idx;
+ bool fault_expected;
+ bool write;
+ u64 write_val;
+};
+
+struct hcall_data {
+ uint64_t control;
+ uint64_t expect;
+ bool ud_expected;
+};
+
+static bool is_write_only_msr(uint32_t msr)
+{
+ return msr == HV_X64_MSR_EOI;
+}
+
+static void guest_msr(struct msr_data *msr)
+{
+ uint8_t vector = 0;
+ uint64_t msr_val = 0;
+
+ GUEST_ASSERT(msr->idx);
+
+ if (msr->write)
+ vector = wrmsr_safe(msr->idx, msr->write_val);
+
+ if (!vector && (!msr->write || !is_write_only_msr(msr->idx)))
+ vector = rdmsr_safe(msr->idx, &msr_val);
+
+ if (msr->fault_expected)
+ __GUEST_ASSERT(vector == GP_VECTOR,
+ "Expected #GP on %sMSR(0x%x), got vector '0x%x'",
+ msr->write ? "WR" : "RD", msr->idx, vector);
+ else
+ __GUEST_ASSERT(!vector,
+ "Expected success on %sMSR(0x%x), got vector '0x%x'",
+ msr->write ? "WR" : "RD", msr->idx, vector);
+
+ if (vector || is_write_only_msr(msr->idx))
+ goto done;
+
+ if (msr->write)
+ __GUEST_ASSERT(!vector,
+ "WRMSR(0x%x) to '0x%lx', RDMSR read '0x%lx'",
+ msr->idx, msr->write_val, msr_val);
+
+ /* Invariant TSC bit appears when TSC invariant control MSR is written to */
+ if (msr->idx == HV_X64_MSR_TSC_INVARIANT_CONTROL) {
+ if (!this_cpu_has(HV_ACCESS_TSC_INVARIANT))
+ GUEST_ASSERT(this_cpu_has(X86_FEATURE_INVTSC));
+ else
+ GUEST_ASSERT(this_cpu_has(X86_FEATURE_INVTSC) ==
+ !!(msr_val & HV_INVARIANT_TSC_EXPOSED));
+ }
+
+done:
+ GUEST_DONE();
+}
+
+static void guest_hcall(vm_vaddr_t pgs_gpa, struct hcall_data *hcall)
+{
+ u64 res, input, output;
+ uint8_t vector;
+
+ GUEST_ASSERT_NE(hcall->control, 0);
+
+ wrmsr(HV_X64_MSR_GUEST_OS_ID, HYPERV_LINUX_OS_ID);
+ wrmsr(HV_X64_MSR_HYPERCALL, pgs_gpa);
+
+ if (!(hcall->control & HV_HYPERCALL_FAST_BIT)) {
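+		/* Non-fast hypercalls pass input/output by GPA; the output page follows the input page */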
+ input = pgs_gpa;
+ output = pgs_gpa + 4096;
+ } else {
+ input = output = 0;
+ }
+
+ vector = __hyperv_hypercall(hcall->control, input, output, &res);
+ if (hcall->ud_expected) {
+ __GUEST_ASSERT(vector == UD_VECTOR,
+ "Expected #UD for control '%lu', got vector '0x%x'",
+ hcall->control, vector);
+ } else {
+ __GUEST_ASSERT(!vector,
+ "Expected no exception for control '%lu', got vector '0x%x'",
+ hcall->control, vector);
+ GUEST_ASSERT_EQ(res, hcall->expect);
+ }
+
+ GUEST_DONE();
+}
+
+static void vcpu_reset_hv_cpuid(struct kvm_vcpu *vcpu)
+{
+ /*
+ * Enable all supported Hyper-V features, then clear the leafs holding
+ * the features that will be tested one by one.
+ */
+ vcpu_set_hv_cpuid(vcpu);
+
+ vcpu_clear_cpuid_entry(vcpu, HYPERV_CPUID_FEATURES);
+ vcpu_clear_cpuid_entry(vcpu, HYPERV_CPUID_ENLIGHTMENT_INFO);
+ vcpu_clear_cpuid_entry(vcpu, HYPERV_CPUID_SYNDBG_PLATFORM_CAPABILITIES);
+}
+
+static void guest_test_msrs_access(void)
+{
+ struct kvm_cpuid2 *prev_cpuid = NULL;
+ struct kvm_vcpu *vcpu;
+ struct kvm_vm *vm;
+ struct ucall uc;
+ int stage = 0;
+ vm_vaddr_t msr_gva;
+ struct msr_data *msr;
+ bool has_invtsc = kvm_cpu_has(X86_FEATURE_INVTSC);
+
+ while (true) {
+ vm = vm_create_with_one_vcpu(&vcpu, guest_msr);
+
+ msr_gva = vm_vaddr_alloc_page(vm);
+ memset(addr_gva2hva(vm, msr_gva), 0x0, getpagesize());
+ msr = addr_gva2hva(vm, msr_gva);
+
+ vcpu_args_set(vcpu, 1, msr_gva);
+ vcpu_enable_cap(vcpu, KVM_CAP_HYPERV_ENFORCE_CPUID, 1);
+
+ if (!prev_cpuid) {
+ vcpu_reset_hv_cpuid(vcpu);
+
+ prev_cpuid = allocate_kvm_cpuid2(vcpu->cpuid->nent);
+ } else {
+ vcpu_init_cpuid(vcpu, prev_cpuid);
+ }
+
+ vm_init_descriptor_tables(vm);
+ vcpu_init_descriptor_tables(vcpu);
+
+ /* TODO: Make this entire test easier to maintain. */
+ if (stage >= 21)
+ vcpu_enable_cap(vcpu, KVM_CAP_HYPERV_SYNIC2, 0);
+
+ switch (stage) {
+ case 0:
+ /*
+ * Only available when Hyper-V identification is set
+ */
+ msr->idx = HV_X64_MSR_GUEST_OS_ID;
+ msr->write = false;
+ msr->fault_expected = true;
+ break;
+ case 1:
+ msr->idx = HV_X64_MSR_HYPERCALL;
+ msr->write = false;
+ msr->fault_expected = true;
+ break;
+ case 2:
+ vcpu_set_cpuid_feature(vcpu, HV_MSR_HYPERCALL_AVAILABLE);
+ /*
+ * HV_X64_MSR_GUEST_OS_ID has to be written first to make
+ * HV_X64_MSR_HYPERCALL available.
+ */
+ msr->idx = HV_X64_MSR_GUEST_OS_ID;
+ msr->write = true;
+ msr->write_val = HYPERV_LINUX_OS_ID;
+ msr->fault_expected = false;
+ break;
+ case 3:
+ msr->idx = HV_X64_MSR_GUEST_OS_ID;
+ msr->write = false;
+ msr->fault_expected = false;
+ break;
+ case 4:
+ msr->idx = HV_X64_MSR_HYPERCALL;
+ msr->write = false;
+ msr->fault_expected = false;
+ break;
+
+ case 5:
+ msr->idx = HV_X64_MSR_VP_RUNTIME;
+ msr->write = false;
+ msr->fault_expected = true;
+ break;
+ case 6:
+ vcpu_set_cpuid_feature(vcpu, HV_MSR_VP_RUNTIME_AVAILABLE);
+ msr->idx = HV_X64_MSR_VP_RUNTIME;
+ msr->write = false;
+ msr->fault_expected = false;
+ break;
+ case 7:
+ /* Read only */
+ msr->idx = HV_X64_MSR_VP_RUNTIME;
+ msr->write = true;
+ msr->write_val = 1;
+ msr->fault_expected = true;
+ break;
+
+ case 8:
+ msr->idx = HV_X64_MSR_TIME_REF_COUNT;
+ msr->write = false;
+ msr->fault_expected = true;
+ break;
+ case 9:
+ vcpu_set_cpuid_feature(vcpu, HV_MSR_TIME_REF_COUNT_AVAILABLE);
+ msr->idx = HV_X64_MSR_TIME_REF_COUNT;
+ msr->write = false;
+ msr->fault_expected = false;
+ break;
+ case 10:
+ /* Read only */
+ msr->idx = HV_X64_MSR_TIME_REF_COUNT;
+ msr->write = true;
+ msr->write_val = 1;
+ msr->fault_expected = true;
+ break;
+
+ case 11:
+ msr->idx = HV_X64_MSR_VP_INDEX;
+ msr->write = false;
+ msr->fault_expected = true;
+ break;
+ case 12:
+ vcpu_set_cpuid_feature(vcpu, HV_MSR_VP_INDEX_AVAILABLE);
+ msr->idx = HV_X64_MSR_VP_INDEX;
+ msr->write = false;
+ msr->fault_expected = false;
+ break;
+ case 13:
+ /* Read only */
+ msr->idx = HV_X64_MSR_VP_INDEX;
+ msr->write = true;
+ msr->write_val = 1;
+ msr->fault_expected = true;
+ break;
+
+ case 14:
+ msr->idx = HV_X64_MSR_RESET;
+ msr->write = false;
+ msr->fault_expected = true;
+ break;
+ case 15:
+ vcpu_set_cpuid_feature(vcpu, HV_MSR_RESET_AVAILABLE);
+ msr->idx = HV_X64_MSR_RESET;
+ msr->write = false;
+ msr->fault_expected = false;
+ break;
+ case 16:
+ msr->idx = HV_X64_MSR_RESET;
+ msr->write = true;
+ /*
+ * TODO: the test only writes '0' to HV_X64_MSR_RESET
+ * at the moment, writing some other value there will
+ * trigger real vCPU reset and the code is not prepared
+ * to handle it yet.
+ */
+ msr->write_val = 0;
+ msr->fault_expected = false;
+ break;
+
+ case 17:
+ msr->idx = HV_X64_MSR_REFERENCE_TSC;
+ msr->write = false;
+ msr->fault_expected = true;
+ break;
+ case 18:
+ vcpu_set_cpuid_feature(vcpu, HV_MSR_REFERENCE_TSC_AVAILABLE);
+ msr->idx = HV_X64_MSR_REFERENCE_TSC;
+ msr->write = false;
+ msr->fault_expected = false;
+ break;
+ case 19:
+ msr->idx = HV_X64_MSR_REFERENCE_TSC;
+ msr->write = true;
+ msr->write_val = 0;
+ msr->fault_expected = false;
+ break;
+
+ case 20:
+ msr->idx = HV_X64_MSR_EOM;
+ msr->write = false;
+ msr->fault_expected = true;
+ break;
+ case 21:
+ /*
+			 * Remains unavailable even with the KVM_CAP_HYPERV_SYNIC2
+			 * capability enabled, as the guest visible CPUID bit is unset.
+ */
+ msr->idx = HV_X64_MSR_EOM;
+ msr->write = false;
+ msr->fault_expected = true;
+ break;
+ case 22:
+ vcpu_set_cpuid_feature(vcpu, HV_MSR_SYNIC_AVAILABLE);
+ msr->idx = HV_X64_MSR_EOM;
+ msr->write = false;
+ msr->fault_expected = false;
+ break;
+ case 23:
+ msr->idx = HV_X64_MSR_EOM;
+ msr->write = true;
+ msr->write_val = 0;
+ msr->fault_expected = false;
+ break;
+
+ case 24:
+ msr->idx = HV_X64_MSR_STIMER0_CONFIG;
+ msr->write = false;
+ msr->fault_expected = true;
+ break;
+ case 25:
+ vcpu_set_cpuid_feature(vcpu, HV_MSR_SYNTIMER_AVAILABLE);
+ msr->idx = HV_X64_MSR_STIMER0_CONFIG;
+ msr->write = false;
+ msr->fault_expected = false;
+ break;
+ case 26:
+ msr->idx = HV_X64_MSR_STIMER0_CONFIG;
+ msr->write = true;
+ msr->write_val = 0;
+ msr->fault_expected = false;
+ break;
+ case 27:
+ /* Direct mode test */
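+			/* Bit 12 of the synthetic timer config MSR is DirectMode */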
+ msr->idx = HV_X64_MSR_STIMER0_CONFIG;
+ msr->write = true;
+ msr->write_val = 1 << 12;
+ msr->fault_expected = true;
+ break;
+ case 28:
+ vcpu_set_cpuid_feature(vcpu, HV_STIMER_DIRECT_MODE_AVAILABLE);
+ msr->idx = HV_X64_MSR_STIMER0_CONFIG;
+ msr->write = true;
+ msr->write_val = 1 << 12;
+ msr->fault_expected = false;
+ break;
+
+ case 29:
+ msr->idx = HV_X64_MSR_EOI;
+ msr->write = false;
+ msr->fault_expected = true;
+ break;
+ case 30:
+ vcpu_set_cpuid_feature(vcpu, HV_MSR_APIC_ACCESS_AVAILABLE);
+ msr->idx = HV_X64_MSR_EOI;
+ msr->write = true;
+ msr->write_val = 1;
+ msr->fault_expected = false;
+ break;
+
+ case 31:
+ msr->idx = HV_X64_MSR_TSC_FREQUENCY;
+ msr->write = false;
+ msr->fault_expected = true;
+ break;
+ case 32:
+ vcpu_set_cpuid_feature(vcpu, HV_ACCESS_FREQUENCY_MSRS);
+ msr->idx = HV_X64_MSR_TSC_FREQUENCY;
+ msr->write = false;
+ msr->fault_expected = false;
+ break;
+ case 33:
+ /* Read only */
+ msr->idx = HV_X64_MSR_TSC_FREQUENCY;
+ msr->write = true;
+ msr->write_val = 1;
+ msr->fault_expected = true;
+ break;
+
+ case 34:
+ msr->idx = HV_X64_MSR_REENLIGHTENMENT_CONTROL;
+ msr->write = false;
+ msr->fault_expected = true;
+ break;
+ case 35:
+ vcpu_set_cpuid_feature(vcpu, HV_ACCESS_REENLIGHTENMENT);
+ msr->idx = HV_X64_MSR_REENLIGHTENMENT_CONTROL;
+ msr->write = false;
+ msr->fault_expected = false;
+ break;
+ case 36:
+ msr->idx = HV_X64_MSR_REENLIGHTENMENT_CONTROL;
+ msr->write = true;
+ msr->write_val = 1;
+ msr->fault_expected = false;
+ break;
+ case 37:
+ /* Can only write '0' */
+ msr->idx = HV_X64_MSR_TSC_EMULATION_STATUS;
+ msr->write = true;
+ msr->write_val = 1;
+ msr->fault_expected = true;
+ break;
+
+ case 38:
+ msr->idx = HV_X64_MSR_CRASH_P0;
+ msr->write = false;
+ msr->fault_expected = true;
+ break;
+ case 39:
+ vcpu_set_cpuid_feature(vcpu, HV_FEATURE_GUEST_CRASH_MSR_AVAILABLE);
+ msr->idx = HV_X64_MSR_CRASH_P0;
+ msr->write = false;
+ msr->fault_expected = false;
+ break;
+ case 40:
+ msr->idx = HV_X64_MSR_CRASH_P0;
+ msr->write = true;
+ msr->write_val = 1;
+ msr->fault_expected = false;
+ break;
+
+ case 41:
+ msr->idx = HV_X64_MSR_SYNDBG_STATUS;
+ msr->write = false;
+ msr->fault_expected = true;
+ break;
+ case 42:
+ vcpu_set_cpuid_feature(vcpu, HV_FEATURE_DEBUG_MSRS_AVAILABLE);
+ vcpu_set_cpuid_feature(vcpu, HV_X64_SYNDBG_CAP_ALLOW_KERNEL_DEBUGGING);
+ msr->idx = HV_X64_MSR_SYNDBG_STATUS;
+ msr->write = false;
+ msr->fault_expected = false;
+ break;
+ case 43:
+ msr->idx = HV_X64_MSR_SYNDBG_STATUS;
+ msr->write = true;
+ msr->write_val = 0;
+ msr->fault_expected = false;
+ break;
+
+ case 44:
+ /* MSR is not available when CPUID feature bit is unset */
+ if (!has_invtsc)
+ goto next_stage;
+ msr->idx = HV_X64_MSR_TSC_INVARIANT_CONTROL;
+ msr->write = false;
+ msr->fault_expected = true;
+ break;
+ case 45:
+			/* MSR is available when the CPUID feature bit is set */
+ if (!has_invtsc)
+ goto next_stage;
+ vcpu_set_cpuid_feature(vcpu, HV_ACCESS_TSC_INVARIANT);
+ msr->idx = HV_X64_MSR_TSC_INVARIANT_CONTROL;
+ msr->write = false;
+ msr->fault_expected = false;
+ break;
+ case 46:
+			/* Writing bits other than bit 0 is forbidden */
+ if (!has_invtsc)
+ goto next_stage;
+ msr->idx = HV_X64_MSR_TSC_INVARIANT_CONTROL;
+ msr->write = true;
+ msr->write_val = 0xdeadbeef;
+ msr->fault_expected = true;
+ break;
+ case 47:
+ /* Setting bit 0 enables the feature */
+ if (!has_invtsc)
+ goto next_stage;
+ msr->idx = HV_X64_MSR_TSC_INVARIANT_CONTROL;
+ msr->write = true;
+ msr->write_val = 1;
+ msr->fault_expected = false;
+ break;
+
+ default:
+ kvm_vm_free(vm);
+ return;
+ }
+
+ vcpu_set_cpuid(vcpu);
+
+ memcpy(prev_cpuid, vcpu->cpuid, kvm_cpuid2_size(vcpu->cpuid->nent));
+
+ pr_debug("Stage %d: testing msr: 0x%x for %s\n", stage,
+ msr->idx, msr->write ? "write" : "read");
+
+ vcpu_run(vcpu);
+ TEST_ASSERT_KVM_EXIT_REASON(vcpu, KVM_EXIT_IO);
+
+ switch (get_ucall(vcpu, &uc)) {
+ case UCALL_ABORT:
+ REPORT_GUEST_ASSERT(uc);
+ return;
+ case UCALL_DONE:
+ break;
+ default:
+ TEST_FAIL("Unhandled ucall: %ld", uc.cmd);
+ return;
+ }
+
+next_stage:
+ stage++;
+ kvm_vm_free(vm);
+ }
+}
+
+static void guest_test_hcalls_access(void)
+{
+ struct kvm_cpuid2 *prev_cpuid = NULL;
+ struct kvm_vcpu *vcpu;
+ struct kvm_vm *vm;
+ struct ucall uc;
+ int stage = 0;
+ vm_vaddr_t hcall_page, hcall_params;
+ struct hcall_data *hcall;
+
+ while (true) {
+ vm = vm_create_with_one_vcpu(&vcpu, guest_hcall);
+
+ vm_init_descriptor_tables(vm);
+ vcpu_init_descriptor_tables(vcpu);
+
+ /* Hypercall input/output */
+ hcall_page = vm_vaddr_alloc_pages(vm, 2);
+ memset(addr_gva2hva(vm, hcall_page), 0x0, 2 * getpagesize());
+
+ hcall_params = vm_vaddr_alloc_page(vm);
+ memset(addr_gva2hva(vm, hcall_params), 0x0, getpagesize());
+ hcall = addr_gva2hva(vm, hcall_params);
+
+ vcpu_args_set(vcpu, 2, addr_gva2gpa(vm, hcall_page), hcall_params);
+ vcpu_enable_cap(vcpu, KVM_CAP_HYPERV_ENFORCE_CPUID, 1);
+
+ if (!prev_cpuid) {
+ vcpu_reset_hv_cpuid(vcpu);
+
+ prev_cpuid = allocate_kvm_cpuid2(vcpu->cpuid->nent);
+ } else {
+ vcpu_init_cpuid(vcpu, prev_cpuid);
+ }
+
+ switch (stage) {
+ case 0:
+ vcpu_set_cpuid_feature(vcpu, HV_MSR_HYPERCALL_AVAILABLE);
+ hcall->control = 0xbeef;
+ hcall->expect = HV_STATUS_INVALID_HYPERCALL_CODE;
+ break;
+
+ case 1:
+ hcall->control = HVCALL_POST_MESSAGE;
+ hcall->expect = HV_STATUS_ACCESS_DENIED;
+ break;
+ case 2:
+ vcpu_set_cpuid_feature(vcpu, HV_POST_MESSAGES);
+ hcall->control = HVCALL_POST_MESSAGE;
+ hcall->expect = HV_STATUS_INVALID_HYPERCALL_INPUT;
+ break;
+
+ case 3:
+ hcall->control = HVCALL_SIGNAL_EVENT;
+ hcall->expect = HV_STATUS_ACCESS_DENIED;
+ break;
+ case 4:
+ vcpu_set_cpuid_feature(vcpu, HV_SIGNAL_EVENTS);
+ hcall->control = HVCALL_SIGNAL_EVENT;
+ hcall->expect = HV_STATUS_INVALID_HYPERCALL_INPUT;
+ break;
+
+ case 5:
+ hcall->control = HVCALL_RESET_DEBUG_SESSION;
+ hcall->expect = HV_STATUS_INVALID_HYPERCALL_CODE;
+ break;
+ case 6:
+ vcpu_set_cpuid_feature(vcpu, HV_X64_SYNDBG_CAP_ALLOW_KERNEL_DEBUGGING);
+ hcall->control = HVCALL_RESET_DEBUG_SESSION;
+ hcall->expect = HV_STATUS_ACCESS_DENIED;
+ break;
+ case 7:
+ vcpu_set_cpuid_feature(vcpu, HV_DEBUGGING);
+ hcall->control = HVCALL_RESET_DEBUG_SESSION;
+ hcall->expect = HV_STATUS_OPERATION_DENIED;
+ break;
+
+ case 8:
+ hcall->control = HVCALL_FLUSH_VIRTUAL_ADDRESS_SPACE;
+ hcall->expect = HV_STATUS_ACCESS_DENIED;
+ break;
+ case 9:
+ vcpu_set_cpuid_feature(vcpu, HV_X64_REMOTE_TLB_FLUSH_RECOMMENDED);
+ hcall->control = HVCALL_FLUSH_VIRTUAL_ADDRESS_SPACE;
+ hcall->expect = HV_STATUS_SUCCESS;
+ break;
+ case 10:
+ hcall->control = HVCALL_FLUSH_VIRTUAL_ADDRESS_SPACE_EX;
+ hcall->expect = HV_STATUS_ACCESS_DENIED;
+ break;
+ case 11:
+ vcpu_set_cpuid_feature(vcpu, HV_X64_EX_PROCESSOR_MASKS_RECOMMENDED);
+ hcall->control = HVCALL_FLUSH_VIRTUAL_ADDRESS_SPACE_EX;
+ hcall->expect = HV_STATUS_SUCCESS;
+ break;
+
+ case 12:
+ hcall->control = HVCALL_SEND_IPI;
+ hcall->expect = HV_STATUS_ACCESS_DENIED;
+ break;
+ case 13:
+ vcpu_set_cpuid_feature(vcpu, HV_X64_CLUSTER_IPI_RECOMMENDED);
+ hcall->control = HVCALL_SEND_IPI;
+ hcall->expect = HV_STATUS_INVALID_HYPERCALL_INPUT;
+ break;
+ case 14:
+ /* Nothing in 'sparse banks' -> success */
+ hcall->control = HVCALL_SEND_IPI_EX;
+ hcall->expect = HV_STATUS_SUCCESS;
+ break;
+
+ case 15:
+ hcall->control = HVCALL_NOTIFY_LONG_SPIN_WAIT;
+ hcall->expect = HV_STATUS_ACCESS_DENIED;
+ break;
+ case 16:
+ vcpu_set_cpuid_feature(vcpu, HV_PV_SPINLOCKS_TEST);
+ hcall->control = HVCALL_NOTIFY_LONG_SPIN_WAIT;
+ hcall->expect = HV_STATUS_SUCCESS;
+ break;
+ case 17:
+ /* XMM fast hypercall */
+ hcall->control = HVCALL_FLUSH_VIRTUAL_ADDRESS_SPACE | HV_HYPERCALL_FAST_BIT;
+ hcall->ud_expected = true;
+ break;
+ case 18:
+ vcpu_set_cpuid_feature(vcpu, HV_X64_HYPERCALL_XMM_INPUT_AVAILABLE);
+ hcall->control = HVCALL_FLUSH_VIRTUAL_ADDRESS_SPACE | HV_HYPERCALL_FAST_BIT;
+ hcall->ud_expected = false;
+ hcall->expect = HV_STATUS_SUCCESS;
+ break;
+ case 19:
+ hcall->control = HV_EXT_CALL_QUERY_CAPABILITIES;
+ hcall->expect = HV_STATUS_ACCESS_DENIED;
+ break;
+ case 20:
+ vcpu_set_cpuid_feature(vcpu, HV_ENABLE_EXTENDED_HYPERCALLS);
+ hcall->control = HV_EXT_CALL_QUERY_CAPABILITIES | HV_HYPERCALL_FAST_BIT;
+ hcall->expect = HV_STATUS_INVALID_PARAMETER;
+ break;
+ case 21:
+ kvm_vm_free(vm);
+ return;
+ }
+
+ vcpu_set_cpuid(vcpu);
+
+ memcpy(prev_cpuid, vcpu->cpuid, kvm_cpuid2_size(vcpu->cpuid->nent));
+
+ pr_debug("Stage %d: testing hcall: 0x%lx\n", stage, hcall->control);
+
+ vcpu_run(vcpu);
+ TEST_ASSERT_KVM_EXIT_REASON(vcpu, KVM_EXIT_IO);
+
+ switch (get_ucall(vcpu, &uc)) {
+ case UCALL_ABORT:
+ REPORT_GUEST_ASSERT(uc);
+ return;
+ case UCALL_DONE:
+ break;
+ default:
+ TEST_FAIL("Unhandled ucall: %ld", uc.cmd);
+ return;
+ }
+
+ stage++;
+ kvm_vm_free(vm);
+ }
+}
+
+int main(void)
+{
+ TEST_REQUIRE(kvm_has_cap(KVM_CAP_HYPERV_ENFORCE_CPUID));
+
+ pr_info("Testing access to Hyper-V specific MSRs\n");
+ guest_test_msrs_access();
+
+ pr_info("Testing access to Hyper-V hypercalls\n");
+ guest_test_hcalls_access();
+}
diff --git a/tools/testing/selftests/kvm/x86_64/hyperv_ipi.c b/tools/testing/selftests/kvm/x86_64/hyperv_ipi.c
new file mode 100644
index 000000000000..f1617762c22f
--- /dev/null
+++ b/tools/testing/selftests/kvm/x86_64/hyperv_ipi.c
@@ -0,0 +1,313 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Hyper-V HvCallSendSyntheticClusterIpi{,Ex} tests
+ *
+ * Copyright (C) 2022, Red Hat, Inc.
+ *
+ */
+
+#define _GNU_SOURCE /* for program_invocation_short_name */
+#include <pthread.h>
+#include <inttypes.h>
+
+#include "kvm_util.h"
+#include "hyperv.h"
+#include "test_util.h"
+#include "vmx.h"
+
+#define RECEIVER_VCPU_ID_1 2
+#define RECEIVER_VCPU_ID_2 65
+
+#define IPI_VECTOR 0xfe
+
+static volatile uint64_t ipis_rcvd[RECEIVER_VCPU_ID_2 + 1];
+
+struct hv_vpset {
+ u64 format;
+ u64 valid_bank_mask;
+ u64 bank_contents[2];
+};
+
+enum HV_GENERIC_SET_FORMAT {
+ HV_GENERIC_SET_SPARSE_4K,
+ HV_GENERIC_SET_ALL,
+};
+
+/* HvCallSendSyntheticClusterIpi hypercall */
+struct hv_send_ipi {
+ u32 vector;
+ u32 reserved;
+ u64 cpu_mask;
+};
+
+/* HvCallSendSyntheticClusterIpiEx hypercall */
+struct hv_send_ipi_ex {
+ u32 vector;
+ u32 reserved;
+ struct hv_vpset vp_set;
+};
+
+static inline void hv_init(vm_vaddr_t pgs_gpa)
+{
+ wrmsr(HV_X64_MSR_GUEST_OS_ID, HYPERV_LINUX_OS_ID);
+ wrmsr(HV_X64_MSR_HYPERCALL, pgs_gpa);
+}
+
+static void receiver_code(void *hcall_page, vm_vaddr_t pgs_gpa)
+{
+ u32 vcpu_id;
+
+ x2apic_enable();
+ hv_init(pgs_gpa);
+
+ vcpu_id = rdmsr(HV_X64_MSR_VP_INDEX);
+
+ /* Signal sender vCPU we're ready */
+ ipis_rcvd[vcpu_id] = (u64)-1;
+
+ for (;;)
+ asm volatile("sti; hlt; cli");
+}
+
+static void guest_ipi_handler(struct ex_regs *regs)
+{
+ u32 vcpu_id = rdmsr(HV_X64_MSR_VP_INDEX);
+
+ ipis_rcvd[vcpu_id]++;
+ wrmsr(HV_X64_MSR_EOI, 1);
+}
+
+static inline void nop_loop(void)
+{
+ int i;
+
+ for (i = 0; i < 100000000; i++)
+ asm volatile("nop");
+}
+
+static void sender_guest_code(void *hcall_page, vm_vaddr_t pgs_gpa)
+{
+ struct hv_send_ipi *ipi = (struct hv_send_ipi *)hcall_page;
+ struct hv_send_ipi_ex *ipi_ex = (struct hv_send_ipi_ex *)hcall_page;
+ int stage = 1, ipis_expected[2] = {0};
+
+ hv_init(pgs_gpa);
+ GUEST_SYNC(stage++);
+
+ /* Wait for receiver vCPUs to come up */
+ while (!ipis_rcvd[RECEIVER_VCPU_ID_1] || !ipis_rcvd[RECEIVER_VCPU_ID_2])
+ nop_loop();
+ ipis_rcvd[RECEIVER_VCPU_ID_1] = ipis_rcvd[RECEIVER_VCPU_ID_2] = 0;
+
+ /* 'Slow' HvCallSendSyntheticClusterIpi to RECEIVER_VCPU_ID_1 */
+ ipi->vector = IPI_VECTOR;
+ ipi->cpu_mask = 1 << RECEIVER_VCPU_ID_1;
+ hyperv_hypercall(HVCALL_SEND_IPI, pgs_gpa, pgs_gpa + 4096);
+ nop_loop();
+ GUEST_ASSERT(ipis_rcvd[RECEIVER_VCPU_ID_1] == ++ipis_expected[0]);
+ GUEST_ASSERT(ipis_rcvd[RECEIVER_VCPU_ID_2] == ipis_expected[1]);
+ GUEST_SYNC(stage++);
+ /* 'Fast' HvCallSendSyntheticClusterIpi to RECEIVER_VCPU_ID_1 */
+ hyperv_hypercall(HVCALL_SEND_IPI | HV_HYPERCALL_FAST_BIT,
+ IPI_VECTOR, 1 << RECEIVER_VCPU_ID_1);
+ nop_loop();
+ GUEST_ASSERT(ipis_rcvd[RECEIVER_VCPU_ID_1] == ++ipis_expected[0]);
+ GUEST_ASSERT(ipis_rcvd[RECEIVER_VCPU_ID_2] == ipis_expected[1]);
+ GUEST_SYNC(stage++);
+
+ /* 'Slow' HvCallSendSyntheticClusterIpiEx to RECEIVER_VCPU_ID_1 */
+ memset(hcall_page, 0, 4096);
+ ipi_ex->vector = IPI_VECTOR;
+ ipi_ex->vp_set.format = HV_GENERIC_SET_SPARSE_4K;
+ ipi_ex->vp_set.valid_bank_mask = 1 << 0;
+ ipi_ex->vp_set.bank_contents[0] = BIT(RECEIVER_VCPU_ID_1);
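+	/* The sparse bank list is a variable-size header, 1 qword here, encoded via HV_HYPERCALL_VARHEAD_OFFSET */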
+ hyperv_hypercall(HVCALL_SEND_IPI_EX | (1 << HV_HYPERCALL_VARHEAD_OFFSET),
+ pgs_gpa, pgs_gpa + 4096);
+ nop_loop();
+ GUEST_ASSERT(ipis_rcvd[RECEIVER_VCPU_ID_1] == ++ipis_expected[0]);
+ GUEST_ASSERT(ipis_rcvd[RECEIVER_VCPU_ID_2] == ipis_expected[1]);
+ GUEST_SYNC(stage++);
+ /* 'XMM Fast' HvCallSendSyntheticClusterIpiEx to RECEIVER_VCPU_ID_1 */
+ hyperv_write_xmm_input(&ipi_ex->vp_set.valid_bank_mask, 1);
+ hyperv_hypercall(HVCALL_SEND_IPI_EX | HV_HYPERCALL_FAST_BIT |
+ (1 << HV_HYPERCALL_VARHEAD_OFFSET),
+ IPI_VECTOR, HV_GENERIC_SET_SPARSE_4K);
+ nop_loop();
+ GUEST_ASSERT(ipis_rcvd[RECEIVER_VCPU_ID_1] == ++ipis_expected[0]);
+ GUEST_ASSERT(ipis_rcvd[RECEIVER_VCPU_ID_2] == ipis_expected[1]);
+ GUEST_SYNC(stage++);
+
+ /* 'Slow' HvCallSendSyntheticClusterIpiEx to RECEIVER_VCPU_ID_2 */
+ memset(hcall_page, 0, 4096);
+ ipi_ex->vector = IPI_VECTOR;
+ ipi_ex->vp_set.format = HV_GENERIC_SET_SPARSE_4K;
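+	/* RECEIVER_VCPU_ID_2 (65) is in bank 1, which covers VPs 64-127 */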
+ ipi_ex->vp_set.valid_bank_mask = 1 << 1;
+ ipi_ex->vp_set.bank_contents[0] = BIT(RECEIVER_VCPU_ID_2 - 64);
+ hyperv_hypercall(HVCALL_SEND_IPI_EX | (1 << HV_HYPERCALL_VARHEAD_OFFSET),
+ pgs_gpa, pgs_gpa + 4096);
+ nop_loop();
+ GUEST_ASSERT(ipis_rcvd[RECEIVER_VCPU_ID_1] == ipis_expected[0]);
+ GUEST_ASSERT(ipis_rcvd[RECEIVER_VCPU_ID_2] == ++ipis_expected[1]);
+ GUEST_SYNC(stage++);
+ /* 'XMM Fast' HvCallSendSyntheticClusterIpiEx to RECEIVER_VCPU_ID_2 */
+ hyperv_write_xmm_input(&ipi_ex->vp_set.valid_bank_mask, 1);
+ hyperv_hypercall(HVCALL_SEND_IPI_EX | HV_HYPERCALL_FAST_BIT |
+ (1 << HV_HYPERCALL_VARHEAD_OFFSET),
+ IPI_VECTOR, HV_GENERIC_SET_SPARSE_4K);
+ nop_loop();
+ GUEST_ASSERT(ipis_rcvd[RECEIVER_VCPU_ID_1] == ipis_expected[0]);
+ GUEST_ASSERT(ipis_rcvd[RECEIVER_VCPU_ID_2] == ++ipis_expected[1]);
+ GUEST_SYNC(stage++);
+
+ /* 'Slow' HvCallSendSyntheticClusterIpiEx to both RECEIVER_VCPU_ID_{1,2} */
+ memset(hcall_page, 0, 4096);
+ ipi_ex->vector = IPI_VECTOR;
+ ipi_ex->vp_set.format = HV_GENERIC_SET_SPARSE_4K;
+ ipi_ex->vp_set.valid_bank_mask = 1 << 1 | 1;
+ ipi_ex->vp_set.bank_contents[0] = BIT(RECEIVER_VCPU_ID_1);
+ ipi_ex->vp_set.bank_contents[1] = BIT(RECEIVER_VCPU_ID_2 - 64);
+ hyperv_hypercall(HVCALL_SEND_IPI_EX | (2 << HV_HYPERCALL_VARHEAD_OFFSET),
+ pgs_gpa, pgs_gpa + 4096);
+ nop_loop();
+ GUEST_ASSERT(ipis_rcvd[RECEIVER_VCPU_ID_1] == ++ipis_expected[0]);
+ GUEST_ASSERT(ipis_rcvd[RECEIVER_VCPU_ID_2] == ++ipis_expected[1]);
+ GUEST_SYNC(stage++);
+ /* 'XMM Fast' HvCallSendSyntheticClusterIpiEx to both RECEIVER_VCPU_ID_{1, 2} */
+ hyperv_write_xmm_input(&ipi_ex->vp_set.valid_bank_mask, 2);
+ hyperv_hypercall(HVCALL_SEND_IPI_EX | HV_HYPERCALL_FAST_BIT |
+ (2 << HV_HYPERCALL_VARHEAD_OFFSET),
+ IPI_VECTOR, HV_GENERIC_SET_SPARSE_4K);
+ nop_loop();
+ GUEST_ASSERT(ipis_rcvd[RECEIVER_VCPU_ID_1] == ++ipis_expected[0]);
+ GUEST_ASSERT(ipis_rcvd[RECEIVER_VCPU_ID_2] == ++ipis_expected[1]);
+ GUEST_SYNC(stage++);
+
+ /* 'Slow' HvCallSendSyntheticClusterIpiEx to HV_GENERIC_SET_ALL */
+ memset(hcall_page, 0, 4096);
+ ipi_ex->vector = IPI_VECTOR;
+ ipi_ex->vp_set.format = HV_GENERIC_SET_ALL;
+ hyperv_hypercall(HVCALL_SEND_IPI_EX, pgs_gpa, pgs_gpa + 4096);
+ nop_loop();
+ GUEST_ASSERT(ipis_rcvd[RECEIVER_VCPU_ID_1] == ++ipis_expected[0]);
+ GUEST_ASSERT(ipis_rcvd[RECEIVER_VCPU_ID_2] == ++ipis_expected[1]);
+ GUEST_SYNC(stage++);
+ /*
+ * 'XMM Fast' HvCallSendSyntheticClusterIpiEx to HV_GENERIC_SET_ALL.
+ */
+ ipi_ex->vp_set.valid_bank_mask = 0;
+ hyperv_write_xmm_input(&ipi_ex->vp_set.valid_bank_mask, 2);
+ hyperv_hypercall(HVCALL_SEND_IPI_EX | HV_HYPERCALL_FAST_BIT,
+ IPI_VECTOR, HV_GENERIC_SET_ALL);
+ nop_loop();
+ GUEST_ASSERT(ipis_rcvd[RECEIVER_VCPU_ID_1] == ++ipis_expected[0]);
+ GUEST_ASSERT(ipis_rcvd[RECEIVER_VCPU_ID_2] == ++ipis_expected[1]);
+ GUEST_SYNC(stage++);
+
+ GUEST_DONE();
+}
+
+static void *vcpu_thread(void *arg)
+{
+ struct kvm_vcpu *vcpu = (struct kvm_vcpu *)arg;
+ int old, r;
+
+ r = pthread_setcanceltype(PTHREAD_CANCEL_ASYNCHRONOUS, &old);
+ TEST_ASSERT(!r, "pthread_setcanceltype failed on vcpu_id=%u with errno=%d",
+ vcpu->id, r);
+
+ vcpu_run(vcpu);
+
+ TEST_FAIL("vCPU %u exited unexpectedly", vcpu->id);
+
+ return NULL;
+}
+
+static void cancel_join_vcpu_thread(pthread_t thread, struct kvm_vcpu *vcpu)
+{
+ void *retval;
+ int r;
+
+ r = pthread_cancel(thread);
+ TEST_ASSERT(!r, "pthread_cancel on vcpu_id=%d failed with errno=%d",
+ vcpu->id, r);
+
+ r = pthread_join(thread, &retval);
+ TEST_ASSERT(!r, "pthread_join on vcpu_id=%d failed with errno=%d",
+ vcpu->id, r);
+ TEST_ASSERT(retval == PTHREAD_CANCELED,
+ "expected retval=%p, got %p", PTHREAD_CANCELED,
+ retval);
+}
+
+int main(int argc, char *argv[])
+{
+ struct kvm_vm *vm;
+ struct kvm_vcpu *vcpu[3];
+ vm_vaddr_t hcall_page;
+ pthread_t threads[2];
+ int stage = 1, r;
+ struct ucall uc;
+
+ TEST_REQUIRE(kvm_has_cap(KVM_CAP_HYPERV_SEND_IPI));
+
+ vm = vm_create_with_one_vcpu(&vcpu[0], sender_guest_code);
+
+ /* Hypercall input/output */
+ hcall_page = vm_vaddr_alloc_pages(vm, 2);
+ memset(addr_gva2hva(vm, hcall_page), 0x0, 2 * getpagesize());
+
+ vm_init_descriptor_tables(vm);
+
+ vcpu[1] = vm_vcpu_add(vm, RECEIVER_VCPU_ID_1, receiver_code);
+ vcpu_init_descriptor_tables(vcpu[1]);
+ vcpu_args_set(vcpu[1], 2, hcall_page, addr_gva2gpa(vm, hcall_page));
+ vcpu_set_msr(vcpu[1], HV_X64_MSR_VP_INDEX, RECEIVER_VCPU_ID_1);
+ vcpu_set_hv_cpuid(vcpu[1]);
+
+ vcpu[2] = vm_vcpu_add(vm, RECEIVER_VCPU_ID_2, receiver_code);
+ vcpu_init_descriptor_tables(vcpu[2]);
+ vcpu_args_set(vcpu[2], 2, hcall_page, addr_gva2gpa(vm, hcall_page));
+ vcpu_set_msr(vcpu[2], HV_X64_MSR_VP_INDEX, RECEIVER_VCPU_ID_2);
+ vcpu_set_hv_cpuid(vcpu[2]);
+
+ vm_install_exception_handler(vm, IPI_VECTOR, guest_ipi_handler);
+
+ vcpu_args_set(vcpu[0], 2, hcall_page, addr_gva2gpa(vm, hcall_page));
+ vcpu_set_hv_cpuid(vcpu[0]);
+
+ r = pthread_create(&threads[0], NULL, vcpu_thread, vcpu[1]);
+	TEST_ASSERT(!r, "pthread_create failed with error %d", r);
+
+ r = pthread_create(&threads[1], NULL, vcpu_thread, vcpu[2]);
+	TEST_ASSERT(!r, "pthread_create failed with error %d", r);
+
+ while (true) {
+ vcpu_run(vcpu[0]);
+
+ TEST_ASSERT_KVM_EXIT_REASON(vcpu[0], KVM_EXIT_IO);
+
+ switch (get_ucall(vcpu[0], &uc)) {
+ case UCALL_SYNC:
+ TEST_ASSERT(uc.args[1] == stage,
+ "Unexpected stage: %ld (%d expected)",
+ uc.args[1], stage);
+ break;
+ case UCALL_DONE:
+ goto done;
+ case UCALL_ABORT:
+ REPORT_GUEST_ASSERT(uc);
+ /* NOT REACHED */
+ default:
+ TEST_FAIL("Unknown ucall %lu", uc.cmd);
+ }
+
+ stage++;
+ }
+
+done:
+ cancel_join_vcpu_thread(threads[0], vcpu[1]);
+ cancel_join_vcpu_thread(threads[1], vcpu[2]);
+ kvm_vm_free(vm);
+
+ return r;
+}
diff --git a/tools/testing/selftests/kvm/x86_64/hyperv_svm_test.c b/tools/testing/selftests/kvm/x86_64/hyperv_svm_test.c
new file mode 100644
index 000000000000..c9b18707edc0
--- /dev/null
+++ b/tools/testing/selftests/kvm/x86_64/hyperv_svm_test.c
@@ -0,0 +1,200 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Copyright (C) 2022, Red Hat, Inc.
+ *
+ * Tests for Hyper-V extensions to SVM.
+ */
+#define _GNU_SOURCE /* for program_invocation_short_name */
+#include <fcntl.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <sys/ioctl.h>
+#include <linux/bitmap.h>
+
+#include "test_util.h"
+
+#include "kvm_util.h"
+#include "processor.h"
+#include "svm_util.h"
+#include "hyperv.h"
+
+#define L2_GUEST_STACK_SIZE 256
+
+/* Exit to L1 from L2 with RDMSR instruction */
+static inline void rdmsr_from_l2(uint32_t msr)
+{
+ /* Currently, L1 doesn't preserve GPRs during vmexits. */
+ __asm__ __volatile__ ("rdmsr" : : "c"(msr) :
+ "rax", "rbx", "rdx", "rsi", "rdi", "r8", "r9",
+ "r10", "r11", "r12", "r13", "r14", "r15");
+}
+
+void l2_guest_code(void)
+{
+ u64 unused;
+
+ GUEST_SYNC(3);
+ /* Exit to L1 */
+ vmmcall();
+
+ /* MSR-Bitmap tests */
+ rdmsr_from_l2(MSR_FS_BASE); /* intercepted */
+ rdmsr_from_l2(MSR_FS_BASE); /* intercepted */
+ rdmsr_from_l2(MSR_GS_BASE); /* not intercepted */
+ vmmcall();
+ rdmsr_from_l2(MSR_GS_BASE); /* intercepted */
+
+ GUEST_SYNC(5);
+
+ /* L2 TLB flush tests */
+ hyperv_hypercall(HVCALL_FLUSH_VIRTUAL_ADDRESS_SPACE |
+ HV_HYPERCALL_FAST_BIT, 0x0,
+ HV_FLUSH_ALL_VIRTUAL_ADDRESS_SPACES |
+ HV_FLUSH_ALL_PROCESSORS);
+ rdmsr_from_l2(MSR_FS_BASE);
+ /*
+ * Note: hypercall status (RAX) is not preserved correctly by L1 after
+ * synthetic vmexit, use unchecked version.
+ */
+ __hyperv_hypercall(HVCALL_FLUSH_VIRTUAL_ADDRESS_SPACE |
+ HV_HYPERCALL_FAST_BIT, 0x0,
+ HV_FLUSH_ALL_VIRTUAL_ADDRESS_SPACES |
+ HV_FLUSH_ALL_PROCESSORS, &unused);
+
+ /* Done, exit to L1 and never come back. */
+ vmmcall();
+}
+
+static void __attribute__((__flatten__)) guest_code(struct svm_test_data *svm,
+ struct hyperv_test_pages *hv_pages,
+ vm_vaddr_t pgs_gpa)
+{
+ unsigned long l2_guest_stack[L2_GUEST_STACK_SIZE];
+ struct vmcb *vmcb = svm->vmcb;
+ struct hv_vmcb_enlightenments *hve = &vmcb->control.hv_enlightenments;
+
+ GUEST_SYNC(1);
+
+ wrmsr(HV_X64_MSR_GUEST_OS_ID, HYPERV_LINUX_OS_ID);
+ wrmsr(HV_X64_MSR_HYPERCALL, pgs_gpa);
+ enable_vp_assist(hv_pages->vp_assist_gpa, hv_pages->vp_assist);
+
+ GUEST_ASSERT(svm->vmcb_gpa);
+ /* Prepare for L2 execution. */
+ generic_svm_setup(svm, l2_guest_code,
+ &l2_guest_stack[L2_GUEST_STACK_SIZE]);
+
+ /* L2 TLB flush setup */
+ hve->partition_assist_page = hv_pages->partition_assist_gpa;
+ hve->hv_enlightenments_control.nested_flush_hypercall = 1;
+ hve->hv_vm_id = 1;
+ hve->hv_vp_id = 1;
+ current_vp_assist->nested_control.features.directhypercall = 1;
+ *(u32 *)(hv_pages->partition_assist) = 0;
+
+ GUEST_SYNC(2);
+ run_guest(vmcb, svm->vmcb_gpa);
+ GUEST_ASSERT(vmcb->control.exit_code == SVM_EXIT_VMMCALL);
+ GUEST_SYNC(4);
+ vmcb->save.rip += 3;
+
+ /* Intercept RDMSR 0xc0000100 */
+ vmcb->control.intercept |= 1ULL << INTERCEPT_MSR_PROT;
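+	/* SVM MSRPM: MSRs 0xc0000000+ start at byte offset 0x800, two bits (read/write) per MSR */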
+ __set_bit(2 * (MSR_FS_BASE & 0x1fff), svm->msr + 0x800);
+ run_guest(vmcb, svm->vmcb_gpa);
+ GUEST_ASSERT(vmcb->control.exit_code == SVM_EXIT_MSR);
+ vmcb->save.rip += 2; /* rdmsr */
+
+ /* Enable enlightened MSR bitmap */
+ hve->hv_enlightenments_control.msr_bitmap = 1;
+ run_guest(vmcb, svm->vmcb_gpa);
+ GUEST_ASSERT(vmcb->control.exit_code == SVM_EXIT_MSR);
+ vmcb->save.rip += 2; /* rdmsr */
+
+ /* Intercept RDMSR 0xc0000101 without telling KVM about it */
+ __set_bit(2 * (MSR_GS_BASE & 0x1fff), svm->msr + 0x800);
+ /* Make sure HV_VMX_ENLIGHTENED_CLEAN_FIELD_MSR_BITMAP is set */
+ vmcb->control.clean |= HV_VMCB_NESTED_ENLIGHTENMENTS;
+ run_guest(vmcb, svm->vmcb_gpa);
+ /* Make sure we don't see SVM_EXIT_MSR here so eMSR bitmap works */
+ GUEST_ASSERT(vmcb->control.exit_code == SVM_EXIT_VMMCALL);
+ vmcb->save.rip += 3; /* vmcall */
+
+ /* Now tell KVM we've changed MSR-Bitmap */
+ vmcb->control.clean &= ~HV_VMCB_NESTED_ENLIGHTENMENTS;
+ run_guest(vmcb, svm->vmcb_gpa);
+ GUEST_ASSERT(vmcb->control.exit_code == SVM_EXIT_MSR);
+ vmcb->save.rip += 2; /* rdmsr */
+
+
+ /*
+ * L2 TLB flush test. First VMCALL should be handled directly by L0,
+ * no VMCALL exit expected.
+ */
+ run_guest(vmcb, svm->vmcb_gpa);
+ GUEST_ASSERT(vmcb->control.exit_code == SVM_EXIT_MSR);
+ vmcb->save.rip += 2; /* rdmsr */
+ /* Enable synthetic vmexit */
+ *(u32 *)(hv_pages->partition_assist) = 1;
+ run_guest(vmcb, svm->vmcb_gpa);
+ GUEST_ASSERT(vmcb->control.exit_code == HV_SVM_EXITCODE_ENL);
+ GUEST_ASSERT(vmcb->control.exit_info_1 == HV_SVM_ENL_EXITCODE_TRAP_AFTER_FLUSH);
+
+ run_guest(vmcb, svm->vmcb_gpa);
+ GUEST_ASSERT(vmcb->control.exit_code == SVM_EXIT_VMMCALL);
+ GUEST_SYNC(6);
+
+ GUEST_DONE();
+}
+
+int main(int argc, char *argv[])
+{
+ vm_vaddr_t nested_gva = 0, hv_pages_gva = 0;
+ vm_vaddr_t hcall_page;
+ struct kvm_vcpu *vcpu;
+ struct kvm_vm *vm;
+ struct ucall uc;
+ int stage;
+
+ TEST_REQUIRE(kvm_cpu_has(X86_FEATURE_SVM));
+ TEST_REQUIRE(kvm_has_cap(KVM_CAP_HYPERV_DIRECT_TLBFLUSH));
+
+ /* Create VM */
+ vm = vm_create_with_one_vcpu(&vcpu, guest_code);
+ vcpu_set_hv_cpuid(vcpu);
+ vcpu_alloc_svm(vm, &nested_gva);
+ vcpu_alloc_hyperv_test_pages(vm, &hv_pages_gva);
+
+ hcall_page = vm_vaddr_alloc_pages(vm, 1);
+ memset(addr_gva2hva(vm, hcall_page), 0x0, getpagesize());
+
+ vcpu_args_set(vcpu, 3, nested_gva, hv_pages_gva, addr_gva2gpa(vm, hcall_page));
+ vcpu_set_msr(vcpu, HV_X64_MSR_VP_INDEX, vcpu->id);
+
+ for (stage = 1;; stage++) {
+ vcpu_run(vcpu);
+ TEST_ASSERT_KVM_EXIT_REASON(vcpu, KVM_EXIT_IO);
+
+ switch (get_ucall(vcpu, &uc)) {
+ case UCALL_ABORT:
+ REPORT_GUEST_ASSERT(uc);
+ /* NOT REACHED */
+ case UCALL_SYNC:
+ break;
+ case UCALL_DONE:
+ goto done;
+ default:
+ TEST_FAIL("Unknown ucall %lu", uc.cmd);
+ }
+
+ /* UCALL_SYNC is handled here. */
+ TEST_ASSERT(!strcmp((const char *)uc.args[0], "hello") &&
+ uc.args[1] == stage, "Stage %d: Unexpected register values vmexit, got %lx",
+ stage, (ulong)uc.args[1]);
+
+ }
+
+done:
+ kvm_vm_free(vm);
+}
diff --git a/tools/testing/selftests/kvm/x86_64/hyperv_tlb_flush.c b/tools/testing/selftests/kvm/x86_64/hyperv_tlb_flush.c
new file mode 100644
index 000000000000..05b56095cf76
--- /dev/null
+++ b/tools/testing/selftests/kvm/x86_64/hyperv_tlb_flush.c
@@ -0,0 +1,682 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Hyper-V HvFlushVirtualAddress{List,Space}{,Ex} tests
+ *
+ * Copyright (C) 2022, Red Hat, Inc.
+ *
+ */
+
+#define _GNU_SOURCE /* for program_invocation_short_name */
+#include <asm/barrier.h>
+#include <pthread.h>
+#include <inttypes.h>
+
+#include "kvm_util.h"
+#include "processor.h"
+#include "hyperv.h"
+#include "test_util.h"
+#include "vmx.h"
+
+#define WORKER_VCPU_ID_1 2
+#define WORKER_VCPU_ID_2 65
+
+#define NTRY 100
+#define NTEST_PAGES 2
+
+struct hv_vpset {
+ u64 format;
+ u64 valid_bank_mask;
+ u64 bank_contents[];
+};
+
+enum HV_GENERIC_SET_FORMAT {
+ HV_GENERIC_SET_SPARSE_4K,
+ HV_GENERIC_SET_ALL,
+};
+
+#define HV_FLUSH_ALL_PROCESSORS BIT(0)
+#define HV_FLUSH_ALL_VIRTUAL_ADDRESS_SPACES BIT(1)
+#define HV_FLUSH_NON_GLOBAL_MAPPINGS_ONLY BIT(2)
+#define HV_FLUSH_USE_EXTENDED_RANGE_FORMAT BIT(3)
+
+/* HvFlushVirtualAddressSpace, HvFlushVirtualAddressList hypercalls */
+struct hv_tlb_flush {
+ u64 address_space;
+ u64 flags;
+ u64 processor_mask;
+ u64 gva_list[];
+} __packed;
+
+/* HvFlushVirtualAddressSpaceEx, HvFlushVirtualAddressListEx hypercalls */
+struct hv_tlb_flush_ex {
+ u64 address_space;
+ u64 flags;
+ struct hv_vpset hv_vp_set;
+ u64 gva_list[];
+} __packed;
+
+/*
+ * Pass the following info to 'workers' and 'sender'
+ * - Hypercall page's GVA
+ * - Hypercall page's GPA
+ * - Test pages GVA
+ * - GVAs of the test pages' PTEs
+ */
+struct test_data {
+ vm_vaddr_t hcall_gva;
+ vm_paddr_t hcall_gpa;
+ vm_vaddr_t test_pages;
+ vm_vaddr_t test_pages_pte[NTEST_PAGES];
+};
+
+/* 'Worker' vCPU code checking the contents of the test page */
+static void worker_guest_code(vm_vaddr_t test_data)
+{
+ struct test_data *data = (struct test_data *)test_data;
+ u32 vcpu_id = rdmsr(HV_X64_MSR_VP_INDEX);
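+	/* Per-vCPU expected values live in the page right after the NTEST_PAGES test pages */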
+ void *exp_page = (void *)data->test_pages + PAGE_SIZE * NTEST_PAGES;
+ u64 *this_cpu = (u64 *)(exp_page + vcpu_id * sizeof(u64));
+ u64 expected, val;
+
+ x2apic_enable();
+ wrmsr(HV_X64_MSR_GUEST_OS_ID, HYPERV_LINUX_OS_ID);
+
+ for (;;) {
+ cpu_relax();
+
+ expected = READ_ONCE(*this_cpu);
+
+ /*
+ * Make sure the value in the test page is read after reading
+ * the expectation for the first time. Pairs with wmb() in
+ * prepare_to_test().
+ */
+ rmb();
+
+ val = READ_ONCE(*(u64 *)data->test_pages);
+
+ /*
+		 * Make sure the value in the test page is read before reading
+		 * the expectation for the second time. Pairs with wmb() in
+		 * post_test().
+ */
+ rmb();
+
+ /*
+ * '0' indicates the sender is between iterations, wait until
+ * the sender is ready for this vCPU to start checking again.
+ */
+ if (!expected)
+ continue;
+
+ /*
+		 * Re-read the per-vCPU value to ensure the sender didn't move
+ * onto a new iteration.
+ */
+ if (expected != READ_ONCE(*this_cpu))
+ continue;
+
+ GUEST_ASSERT(val == expected);
+ }
+}
+
+/*
+ * Write per-CPU info indicating what each 'worker' CPU is supposed to see in
+ * test page. '0' means don't check.
+ */
+static void set_expected_val(void *addr, u64 val, int vcpu_id)
+{
+ void *exp_page = addr + PAGE_SIZE * NTEST_PAGES;
+
+ *(u64 *)(exp_page + vcpu_id * sizeof(u64)) = val;
+}
+
+/*
+ * Update PTEs swapping two test pages.
+ * TODO: use swap()/xchg() when these are provided.
+ */
+static void swap_two_test_pages(vm_paddr_t pte_gva1, vm_paddr_t pte_gva2)
+{
+ uint64_t tmp = *(uint64_t *)pte_gva1;
+
+ *(uint64_t *)pte_gva1 = *(uint64_t *)pte_gva2;
+ *(uint64_t *)pte_gva2 = tmp;
+}
+
+/*
+ * TODO: replace the silly NOP loop with a proper udelay() implementation.
+ */
+static inline void do_delay(void)
+{
+ int i;
+
+ for (i = 0; i < 1000000; i++)
+ asm volatile("nop");
+}
+
+/*
+ * Prepare to test: 'disable' workers by setting the expectation to '0',
+ * clear hypercall input page and then swap two test pages.
+ */
+static inline void prepare_to_test(struct test_data *data)
+{
+ /* Clear hypercall input page */
+ memset((void *)data->hcall_gva, 0, PAGE_SIZE);
+
+ /* 'Disable' workers */
+ set_expected_val((void *)data->test_pages, 0x0, WORKER_VCPU_ID_1);
+ set_expected_val((void *)data->test_pages, 0x0, WORKER_VCPU_ID_2);
+
+ /* Make sure workers are 'disabled' before we swap PTEs. */
+ wmb();
+
+ /* Make sure workers have enough time to notice */
+ do_delay();
+
+ /* Swap test page mappings */
+ swap_two_test_pages(data->test_pages_pte[0], data->test_pages_pte[1]);
+}
+
+/*
+ * Finalize the test: to check the hypercall's effect, set the expected val
+ * for 'worker' CPUs and give them some time to test.
+ */
+static inline void post_test(struct test_data *data, u64 exp1, u64 exp2)
+{
+ /* Make sure we change the expectation after swapping PTEs */
+ wmb();
+
+ /* Set the expectation for workers, '0' means don't test */
+ set_expected_val((void *)data->test_pages, exp1, WORKER_VCPU_ID_1);
+ set_expected_val((void *)data->test_pages, exp2, WORKER_VCPU_ID_2);
+
+ /* Make sure workers have enough time to test */
+ do_delay();
+}
+
+#define TESTVAL1 0x0101010101010101
+#define TESTVAL2 0x0202020202020202
+
+/* Main vCPU doing the test */
+static void sender_guest_code(vm_vaddr_t test_data)
+{
+ struct test_data *data = (struct test_data *)test_data;
+ struct hv_tlb_flush *flush = (struct hv_tlb_flush *)data->hcall_gva;
+ struct hv_tlb_flush_ex *flush_ex = (struct hv_tlb_flush_ex *)data->hcall_gva;
+ vm_paddr_t hcall_gpa = data->hcall_gpa;
+ int i, stage = 1;
+
+ wrmsr(HV_X64_MSR_GUEST_OS_ID, HYPERV_LINUX_OS_ID);
+ wrmsr(HV_X64_MSR_HYPERCALL, data->hcall_gpa);
+
+ /* "Slow" hypercalls */
+
+ GUEST_SYNC(stage++);
+
+ /* HVCALL_FLUSH_VIRTUAL_ADDRESS_SPACE for WORKER_VCPU_ID_1 */
+ for (i = 0; i < NTRY; i++) {
+ prepare_to_test(data);
+ flush->flags = HV_FLUSH_ALL_VIRTUAL_ADDRESS_SPACES;
+ flush->processor_mask = BIT(WORKER_VCPU_ID_1);
+ hyperv_hypercall(HVCALL_FLUSH_VIRTUAL_ADDRESS_SPACE, hcall_gpa,
+ hcall_gpa + PAGE_SIZE);
+ post_test(data, i % 2 ? TESTVAL1 : TESTVAL2, 0x0);
+ }
+
+ GUEST_SYNC(stage++);
+
+ /* HVCALL_FLUSH_VIRTUAL_ADDRESS_LIST for WORKER_VCPU_ID_1 */
+ for (i = 0; i < NTRY; i++) {
+ prepare_to_test(data);
+ flush->flags = HV_FLUSH_ALL_VIRTUAL_ADDRESS_SPACES;
+ flush->processor_mask = BIT(WORKER_VCPU_ID_1);
+ flush->gva_list[0] = (u64)data->test_pages;
+ hyperv_hypercall(HVCALL_FLUSH_VIRTUAL_ADDRESS_LIST |
+ (1UL << HV_HYPERCALL_REP_COMP_OFFSET),
+ hcall_gpa, hcall_gpa + PAGE_SIZE);
+ post_test(data, i % 2 ? TESTVAL1 : TESTVAL2, 0x0);
+ }
+
+ GUEST_SYNC(stage++);
+
+ /* HVCALL_FLUSH_VIRTUAL_ADDRESS_SPACE for HV_FLUSH_ALL_PROCESSORS */
+ for (i = 0; i < NTRY; i++) {
+ prepare_to_test(data);
+ flush->flags = HV_FLUSH_ALL_VIRTUAL_ADDRESS_SPACES |
+ HV_FLUSH_ALL_PROCESSORS;
+ flush->processor_mask = 0;
+ hyperv_hypercall(HVCALL_FLUSH_VIRTUAL_ADDRESS_SPACE, hcall_gpa,
+ hcall_gpa + PAGE_SIZE);
+ post_test(data, i % 2 ? TESTVAL1 : TESTVAL2, i % 2 ? TESTVAL1 : TESTVAL2);
+ }
+
+ GUEST_SYNC(stage++);
+
+ /* HVCALL_FLUSH_VIRTUAL_ADDRESS_LIST for HV_FLUSH_ALL_PROCESSORS */
+ for (i = 0; i < NTRY; i++) {
+ prepare_to_test(data);
+ flush->flags = HV_FLUSH_ALL_VIRTUAL_ADDRESS_SPACES |
+ HV_FLUSH_ALL_PROCESSORS;
+ flush->gva_list[0] = (u64)data->test_pages;
+ hyperv_hypercall(HVCALL_FLUSH_VIRTUAL_ADDRESS_LIST |
+ (1UL << HV_HYPERCALL_REP_COMP_OFFSET),
+ hcall_gpa, hcall_gpa + PAGE_SIZE);
+ post_test(data, i % 2 ? TESTVAL1 : TESTVAL2,
+ i % 2 ? TESTVAL1 : TESTVAL2);
+ }
+
+ GUEST_SYNC(stage++);
+
+ /* HVCALL_FLUSH_VIRTUAL_ADDRESS_SPACE_EX for WORKER_VCPU_ID_2 */
+ for (i = 0; i < NTRY; i++) {
+ prepare_to_test(data);
+ flush_ex->flags = HV_FLUSH_ALL_VIRTUAL_ADDRESS_SPACES;
+ flush_ex->hv_vp_set.format = HV_GENERIC_SET_SPARSE_4K;
+ flush_ex->hv_vp_set.valid_bank_mask = BIT_ULL(WORKER_VCPU_ID_2 / 64);
+ flush_ex->hv_vp_set.bank_contents[0] = BIT_ULL(WORKER_VCPU_ID_2 % 64);
+ hyperv_hypercall(HVCALL_FLUSH_VIRTUAL_ADDRESS_SPACE_EX |
+ (1 << HV_HYPERCALL_VARHEAD_OFFSET),
+ hcall_gpa, hcall_gpa + PAGE_SIZE);
+ post_test(data, 0x0, i % 2 ? TESTVAL1 : TESTVAL2);
+ }
+
+ GUEST_SYNC(stage++);
+
+ /* HVCALL_FLUSH_VIRTUAL_ADDRESS_LIST_EX for WORKER_VCPU_ID_2 */
+ for (i = 0; i < NTRY; i++) {
+ prepare_to_test(data);
+ flush_ex->flags = HV_FLUSH_ALL_VIRTUAL_ADDRESS_SPACES;
+ flush_ex->hv_vp_set.format = HV_GENERIC_SET_SPARSE_4K;
+ flush_ex->hv_vp_set.valid_bank_mask = BIT_ULL(WORKER_VCPU_ID_2 / 64);
+ flush_ex->hv_vp_set.bank_contents[0] = BIT_ULL(WORKER_VCPU_ID_2 % 64);
+ /* bank_contents and gva_list occupy the same space, thus [1] */
+ flush_ex->gva_list[1] = (u64)data->test_pages;
+ hyperv_hypercall(HVCALL_FLUSH_VIRTUAL_ADDRESS_LIST_EX |
+ (1 << HV_HYPERCALL_VARHEAD_OFFSET) |
+ (1UL << HV_HYPERCALL_REP_COMP_OFFSET),
+ hcall_gpa, hcall_gpa + PAGE_SIZE);
+ post_test(data, 0x0, i % 2 ? TESTVAL1 : TESTVAL2);
+ }
+
+ GUEST_SYNC(stage++);
+
+ /* HVCALL_FLUSH_VIRTUAL_ADDRESS_SPACE_EX for both vCPUs */
+ for (i = 0; i < NTRY; i++) {
+ prepare_to_test(data);
+ flush_ex->flags = HV_FLUSH_ALL_VIRTUAL_ADDRESS_SPACES;
+ flush_ex->hv_vp_set.format = HV_GENERIC_SET_SPARSE_4K;
+ flush_ex->hv_vp_set.valid_bank_mask = BIT_ULL(WORKER_VCPU_ID_2 / 64) |
+ BIT_ULL(WORKER_VCPU_ID_1 / 64);
+ flush_ex->hv_vp_set.bank_contents[0] = BIT_ULL(WORKER_VCPU_ID_1 % 64);
+ flush_ex->hv_vp_set.bank_contents[1] = BIT_ULL(WORKER_VCPU_ID_2 % 64);
+ hyperv_hypercall(HVCALL_FLUSH_VIRTUAL_ADDRESS_SPACE_EX |
+ (2 << HV_HYPERCALL_VARHEAD_OFFSET),
+ hcall_gpa, hcall_gpa + PAGE_SIZE);
+ post_test(data, i % 2 ? TESTVAL1 : TESTVAL2,
+ i % 2 ? TESTVAL1 : TESTVAL2);
+ }
+
+ GUEST_SYNC(stage++);
+
+ /* HVCALL_FLUSH_VIRTUAL_ADDRESS_LIST_EX for both vCPUs */
+ for (i = 0; i < NTRY; i++) {
+ prepare_to_test(data);
+ flush_ex->flags = HV_FLUSH_ALL_VIRTUAL_ADDRESS_SPACES;
+ flush_ex->hv_vp_set.format = HV_GENERIC_SET_SPARSE_4K;
+ flush_ex->hv_vp_set.valid_bank_mask = BIT_ULL(WORKER_VCPU_ID_1 / 64) |
+ BIT_ULL(WORKER_VCPU_ID_2 / 64);
+ flush_ex->hv_vp_set.bank_contents[0] = BIT_ULL(WORKER_VCPU_ID_1 % 64);
+ flush_ex->hv_vp_set.bank_contents[1] = BIT_ULL(WORKER_VCPU_ID_2 % 64);
+ /* bank_contents and gva_list occupy the same space, thus [2] */
+ flush_ex->gva_list[2] = (u64)data->test_pages;
+ hyperv_hypercall(HVCALL_FLUSH_VIRTUAL_ADDRESS_LIST_EX |
+ (2 << HV_HYPERCALL_VARHEAD_OFFSET) |
+ (1UL << HV_HYPERCALL_REP_COMP_OFFSET),
+ hcall_gpa, hcall_gpa + PAGE_SIZE);
+ post_test(data, i % 2 ? TESTVAL1 : TESTVAL2,
+ i % 2 ? TESTVAL1 : TESTVAL2);
+ }
+
+ GUEST_SYNC(stage++);
+
+ /* HVCALL_FLUSH_VIRTUAL_ADDRESS_SPACE_EX for HV_GENERIC_SET_ALL */
+ for (i = 0; i < NTRY; i++) {
+ prepare_to_test(data);
+ flush_ex->flags = HV_FLUSH_ALL_VIRTUAL_ADDRESS_SPACES;
+ flush_ex->hv_vp_set.format = HV_GENERIC_SET_ALL;
+ hyperv_hypercall(HVCALL_FLUSH_VIRTUAL_ADDRESS_SPACE_EX,
+ hcall_gpa, hcall_gpa + PAGE_SIZE);
+ post_test(data, i % 2 ? TESTVAL1 : TESTVAL2,
+ i % 2 ? TESTVAL1 : TESTVAL2);
+ }
+
+ GUEST_SYNC(stage++);
+
+ /* HVCALL_FLUSH_VIRTUAL_ADDRESS_LIST_EX for HV_GENERIC_SET_ALL */
+ for (i = 0; i < NTRY; i++) {
+ prepare_to_test(data);
+ flush_ex->flags = HV_FLUSH_ALL_VIRTUAL_ADDRESS_SPACES;
+ flush_ex->hv_vp_set.format = HV_GENERIC_SET_ALL;
+ flush_ex->gva_list[0] = (u64)data->test_pages;
+ hyperv_hypercall(HVCALL_FLUSH_VIRTUAL_ADDRESS_LIST_EX |
+ (1UL << HV_HYPERCALL_REP_COMP_OFFSET),
+ hcall_gpa, hcall_gpa + PAGE_SIZE);
+ post_test(data, i % 2 ? TESTVAL1 : TESTVAL2,
+ i % 2 ? TESTVAL1 : TESTVAL2);
+ }
+
+ /* "Fast" hypercalls */
+
+ GUEST_SYNC(stage++);
+
+ /* HVCALL_FLUSH_VIRTUAL_ADDRESS_SPACE for WORKER_VCPU_ID_1 */
+ for (i = 0; i < NTRY; i++) {
+ prepare_to_test(data);
+ flush->processor_mask = BIT(WORKER_VCPU_ID_1);
+ hyperv_write_xmm_input(&flush->processor_mask, 1);
+ hyperv_hypercall(HVCALL_FLUSH_VIRTUAL_ADDRESS_SPACE |
+ HV_HYPERCALL_FAST_BIT, 0x0,
+ HV_FLUSH_ALL_VIRTUAL_ADDRESS_SPACES);
+ post_test(data, i % 2 ? TESTVAL1 : TESTVAL2, 0x0);
+ }
+
+ GUEST_SYNC(stage++);
+
+ /* HVCALL_FLUSH_VIRTUAL_ADDRESS_LIST for WORKER_VCPU_ID_1 */
+ for (i = 0; i < NTRY; i++) {
+ prepare_to_test(data);
+ flush->processor_mask = BIT(WORKER_VCPU_ID_1);
+ flush->gva_list[0] = (u64)data->test_pages;
+ hyperv_write_xmm_input(&flush->processor_mask, 1);
+ hyperv_hypercall(HVCALL_FLUSH_VIRTUAL_ADDRESS_LIST |
+ HV_HYPERCALL_FAST_BIT |
+ (1UL << HV_HYPERCALL_REP_COMP_OFFSET),
+ 0x0, HV_FLUSH_ALL_VIRTUAL_ADDRESS_SPACES);
+ post_test(data, i % 2 ? TESTVAL1 : TESTVAL2, 0x0);
+ }
+
+ GUEST_SYNC(stage++);
+
+ /* HVCALL_FLUSH_VIRTUAL_ADDRESS_SPACE for HV_FLUSH_ALL_PROCESSORS */
+ for (i = 0; i < NTRY; i++) {
+ prepare_to_test(data);
+ hyperv_write_xmm_input(&flush->processor_mask, 1);
+ hyperv_hypercall(HVCALL_FLUSH_VIRTUAL_ADDRESS_SPACE |
+ HV_HYPERCALL_FAST_BIT, 0x0,
+ HV_FLUSH_ALL_VIRTUAL_ADDRESS_SPACES |
+ HV_FLUSH_ALL_PROCESSORS);
+ post_test(data, i % 2 ? TESTVAL1 : TESTVAL2,
+ i % 2 ? TESTVAL1 : TESTVAL2);
+ }
+
+ GUEST_SYNC(stage++);
+
+ /* HVCALL_FLUSH_VIRTUAL_ADDRESS_LIST for HV_FLUSH_ALL_PROCESSORS */
+ for (i = 0; i < NTRY; i++) {
+ prepare_to_test(data);
+ flush->gva_list[0] = (u64)data->test_pages;
+ hyperv_write_xmm_input(&flush->processor_mask, 1);
+ hyperv_hypercall(HVCALL_FLUSH_VIRTUAL_ADDRESS_LIST |
+ HV_HYPERCALL_FAST_BIT |
+ (1UL << HV_HYPERCALL_REP_COMP_OFFSET), 0x0,
+ HV_FLUSH_ALL_VIRTUAL_ADDRESS_SPACES |
+ HV_FLUSH_ALL_PROCESSORS);
+ post_test(data, i % 2 ? TESTVAL1 : TESTVAL2,
+ i % 2 ? TESTVAL1 : TESTVAL2);
+ }
+
+ GUEST_SYNC(stage++);
+
+ /* HVCALL_FLUSH_VIRTUAL_ADDRESS_SPACE_EX for WORKER_VCPU_ID_2 */
+ for (i = 0; i < NTRY; i++) {
+ prepare_to_test(data);
+ flush_ex->hv_vp_set.format = HV_GENERIC_SET_SPARSE_4K;
+ flush_ex->hv_vp_set.valid_bank_mask = BIT_ULL(WORKER_VCPU_ID_2 / 64);
+ flush_ex->hv_vp_set.bank_contents[0] = BIT_ULL(WORKER_VCPU_ID_2 % 64);
+ hyperv_write_xmm_input(&flush_ex->hv_vp_set, 2);
+ hyperv_hypercall(HVCALL_FLUSH_VIRTUAL_ADDRESS_SPACE_EX |
+ HV_HYPERCALL_FAST_BIT |
+ (1 << HV_HYPERCALL_VARHEAD_OFFSET),
+ 0x0, HV_FLUSH_ALL_VIRTUAL_ADDRESS_SPACES);
+ post_test(data, 0x0, i % 2 ? TESTVAL1 : TESTVAL2);
+ }
+
+ GUEST_SYNC(stage++);
+
+ /* HVCALL_FLUSH_VIRTUAL_ADDRESS_LIST_EX for WORKER_VCPU_ID_2 */
+ for (i = 0; i < NTRY; i++) {
+ prepare_to_test(data);
+ flush_ex->hv_vp_set.format = HV_GENERIC_SET_SPARSE_4K;
+ flush_ex->hv_vp_set.valid_bank_mask = BIT_ULL(WORKER_VCPU_ID_2 / 64);
+ flush_ex->hv_vp_set.bank_contents[0] = BIT_ULL(WORKER_VCPU_ID_2 % 64);
+ /* bank_contents and gva_list occupy the same space, thus [1] */
+ flush_ex->gva_list[1] = (u64)data->test_pages;
+ hyperv_write_xmm_input(&flush_ex->hv_vp_set, 2);
+ hyperv_hypercall(HVCALL_FLUSH_VIRTUAL_ADDRESS_LIST_EX |
+ HV_HYPERCALL_FAST_BIT |
+ (1 << HV_HYPERCALL_VARHEAD_OFFSET) |
+ (1UL << HV_HYPERCALL_REP_COMP_OFFSET),
+ 0x0, HV_FLUSH_ALL_VIRTUAL_ADDRESS_SPACES);
+ post_test(data, 0x0, i % 2 ? TESTVAL1 : TESTVAL2);
+ }
+
+ GUEST_SYNC(stage++);
+
+ /* HVCALL_FLUSH_VIRTUAL_ADDRESS_SPACE_EX for both vCPUs */
+ for (i = 0; i < NTRY; i++) {
+ prepare_to_test(data);
+ flush_ex->hv_vp_set.format = HV_GENERIC_SET_SPARSE_4K;
+ flush_ex->hv_vp_set.valid_bank_mask = BIT_ULL(WORKER_VCPU_ID_2 / 64) |
+ BIT_ULL(WORKER_VCPU_ID_1 / 64);
+ flush_ex->hv_vp_set.bank_contents[0] = BIT_ULL(WORKER_VCPU_ID_1 % 64);
+ flush_ex->hv_vp_set.bank_contents[1] = BIT_ULL(WORKER_VCPU_ID_2 % 64);
+ hyperv_write_xmm_input(&flush_ex->hv_vp_set, 2);
+ hyperv_hypercall(HVCALL_FLUSH_VIRTUAL_ADDRESS_SPACE_EX |
+ HV_HYPERCALL_FAST_BIT |
+ (2 << HV_HYPERCALL_VARHEAD_OFFSET),
+ 0x0, HV_FLUSH_ALL_VIRTUAL_ADDRESS_SPACES);
+		post_test(data, i % 2 ? TESTVAL1 : TESTVAL2,
+			  i % 2 ? TESTVAL1 : TESTVAL2);
+ }
+
+ GUEST_SYNC(stage++);
+
+ /* HVCALL_FLUSH_VIRTUAL_ADDRESS_LIST_EX for both vCPUs */
+ for (i = 0; i < NTRY; i++) {
+ prepare_to_test(data);
+ flush_ex->hv_vp_set.format = HV_GENERIC_SET_SPARSE_4K;
+ flush_ex->hv_vp_set.valid_bank_mask = BIT_ULL(WORKER_VCPU_ID_1 / 64) |
+ BIT_ULL(WORKER_VCPU_ID_2 / 64);
+ flush_ex->hv_vp_set.bank_contents[0] = BIT_ULL(WORKER_VCPU_ID_1 % 64);
+ flush_ex->hv_vp_set.bank_contents[1] = BIT_ULL(WORKER_VCPU_ID_2 % 64);
+ /* bank_contents and gva_list occupy the same space, thus [2] */
+ flush_ex->gva_list[2] = (u64)data->test_pages;
+ hyperv_write_xmm_input(&flush_ex->hv_vp_set, 3);
+ hyperv_hypercall(HVCALL_FLUSH_VIRTUAL_ADDRESS_LIST_EX |
+ HV_HYPERCALL_FAST_BIT |
+ (2 << HV_HYPERCALL_VARHEAD_OFFSET) |
+ (1UL << HV_HYPERCALL_REP_COMP_OFFSET),
+ 0x0, HV_FLUSH_ALL_VIRTUAL_ADDRESS_SPACES);
+ post_test(data, i % 2 ? TESTVAL1 : TESTVAL2,
+ i % 2 ? TESTVAL1 : TESTVAL2);
+ }
+
+ GUEST_SYNC(stage++);
+
+ /* HVCALL_FLUSH_VIRTUAL_ADDRESS_SPACE_EX for HV_GENERIC_SET_ALL */
+ for (i = 0; i < NTRY; i++) {
+ prepare_to_test(data);
+ flush_ex->flags = HV_FLUSH_ALL_VIRTUAL_ADDRESS_SPACES;
+ flush_ex->hv_vp_set.format = HV_GENERIC_SET_ALL;
+ hyperv_write_xmm_input(&flush_ex->hv_vp_set, 2);
+ hyperv_hypercall(HVCALL_FLUSH_VIRTUAL_ADDRESS_SPACE_EX |
+ HV_HYPERCALL_FAST_BIT,
+ 0x0, HV_FLUSH_ALL_VIRTUAL_ADDRESS_SPACES);
+ post_test(data, i % 2 ? TESTVAL1 : TESTVAL2,
+ i % 2 ? TESTVAL1 : TESTVAL2);
+ }
+
+ GUEST_SYNC(stage++);
+
+ /* HVCALL_FLUSH_VIRTUAL_ADDRESS_LIST_EX for HV_GENERIC_SET_ALL */
+ for (i = 0; i < NTRY; i++) {
+ prepare_to_test(data);
+ flush_ex->flags = HV_FLUSH_ALL_VIRTUAL_ADDRESS_SPACES;
+ flush_ex->hv_vp_set.format = HV_GENERIC_SET_ALL;
+ flush_ex->gva_list[0] = (u64)data->test_pages;
+ hyperv_write_xmm_input(&flush_ex->hv_vp_set, 2);
+ hyperv_hypercall(HVCALL_FLUSH_VIRTUAL_ADDRESS_LIST_EX |
+ HV_HYPERCALL_FAST_BIT |
+ (1UL << HV_HYPERCALL_REP_COMP_OFFSET),
+ 0x0, HV_FLUSH_ALL_VIRTUAL_ADDRESS_SPACES);
+ post_test(data, i % 2 ? TESTVAL1 : TESTVAL2,
+ i % 2 ? TESTVAL1 : TESTVAL2);
+ }
+
+ GUEST_DONE();
+}
+
+static void *vcpu_thread(void *arg)
+{
+ struct kvm_vcpu *vcpu = (struct kvm_vcpu *)arg;
+ struct ucall uc;
+ int old;
+ int r;
+
+ r = pthread_setcanceltype(PTHREAD_CANCEL_ASYNCHRONOUS, &old);
+ TEST_ASSERT(!r, "pthread_setcanceltype failed on vcpu_id=%u with errno=%d",
+ vcpu->id, r);
+
+ vcpu_run(vcpu);
+ TEST_ASSERT_KVM_EXIT_REASON(vcpu, KVM_EXIT_IO);
+
+ switch (get_ucall(vcpu, &uc)) {
+ case UCALL_ABORT:
+ REPORT_GUEST_ASSERT(uc);
+ /* NOT REACHED */
+ default:
+ TEST_FAIL("Unexpected ucall %lu, vCPU %d", uc.cmd, vcpu->id);
+ }
+
+ return NULL;
+}
+
+static void cancel_join_vcpu_thread(pthread_t thread, struct kvm_vcpu *vcpu)
+{
+ void *retval;
+ int r;
+
+ r = pthread_cancel(thread);
+ TEST_ASSERT(!r, "pthread_cancel on vcpu_id=%d failed with errno=%d",
+ vcpu->id, r);
+
+ r = pthread_join(thread, &retval);
+ TEST_ASSERT(!r, "pthread_join on vcpu_id=%d failed with errno=%d",
+ vcpu->id, r);
+ TEST_ASSERT(retval == PTHREAD_CANCELED,
+ "expected retval=%p, got %p", PTHREAD_CANCELED,
+ retval);
+}
+
+int main(int argc, char *argv[])
+{
+ struct kvm_vm *vm;
+ struct kvm_vcpu *vcpu[3];
+ pthread_t threads[2];
+ vm_vaddr_t test_data_page, gva;
+ vm_paddr_t gpa;
+ uint64_t *pte;
+ struct test_data *data;
+ struct ucall uc;
+ int stage = 1, r, i;
+
+ TEST_REQUIRE(kvm_has_cap(KVM_CAP_HYPERV_TLBFLUSH));
+
+ vm = vm_create_with_one_vcpu(&vcpu[0], sender_guest_code);
+
+ /* Test data page */
+ test_data_page = vm_vaddr_alloc_page(vm);
+ data = (struct test_data *)addr_gva2hva(vm, test_data_page);
+
+ /* Hypercall input/output */
+ data->hcall_gva = vm_vaddr_alloc_pages(vm, 2);
+ data->hcall_gpa = addr_gva2gpa(vm, data->hcall_gva);
+ memset(addr_gva2hva(vm, data->hcall_gva), 0x0, 2 * PAGE_SIZE);
+
+ /*
+	 * Test pages: the first one is filled with '0x01's, the second with
+	 * '0x02's, and the test will swap their mappings. The third page holds
+	 * the per-vCPU expectations, i.e. what each worker should currently see.
+ */
+ data->test_pages = vm_vaddr_alloc_pages(vm, NTEST_PAGES + 1);
+ for (i = 0; i < NTEST_PAGES; i++)
+ memset(addr_gva2hva(vm, data->test_pages + PAGE_SIZE * i),
+ (u8)(i + 1), PAGE_SIZE);
+ set_expected_val(addr_gva2hva(vm, data->test_pages), 0x0, WORKER_VCPU_ID_1);
+ set_expected_val(addr_gva2hva(vm, data->test_pages), 0x0, WORKER_VCPU_ID_2);
+
+ /*
+ * Get PTE pointers for test pages and map them inside the guest.
+	 * Use a separate page for each PTE for simplicity.
+ */
+ gva = vm_vaddr_unused_gap(vm, NTEST_PAGES * PAGE_SIZE, KVM_UTIL_MIN_VADDR);
+ for (i = 0; i < NTEST_PAGES; i++) {
+ pte = vm_get_page_table_entry(vm, data->test_pages + i * PAGE_SIZE);
+ gpa = addr_hva2gpa(vm, pte);
+ __virt_pg_map(vm, gva + PAGE_SIZE * i, gpa & PAGE_MASK, PG_LEVEL_4K);
+ data->test_pages_pte[i] = gva + (gpa & ~PAGE_MASK);
+ }
+
+ /*
+ * Sender vCPU which performs the test: swaps test pages, sets expectation
+ * for 'workers' and issues TLB flush hypercalls.
+ */
+ vcpu_args_set(vcpu[0], 1, test_data_page);
+ vcpu_set_hv_cpuid(vcpu[0]);
+
+ /* Create worker vCPUs which check the contents of the test pages */
+ vcpu[1] = vm_vcpu_add(vm, WORKER_VCPU_ID_1, worker_guest_code);
+ vcpu_args_set(vcpu[1], 1, test_data_page);
+ vcpu_set_msr(vcpu[1], HV_X64_MSR_VP_INDEX, WORKER_VCPU_ID_1);
+ vcpu_set_hv_cpuid(vcpu[1]);
+
+ vcpu[2] = vm_vcpu_add(vm, WORKER_VCPU_ID_2, worker_guest_code);
+ vcpu_args_set(vcpu[2], 1, test_data_page);
+ vcpu_set_msr(vcpu[2], HV_X64_MSR_VP_INDEX, WORKER_VCPU_ID_2);
+ vcpu_set_hv_cpuid(vcpu[2]);
+
+ r = pthread_create(&threads[0], NULL, vcpu_thread, vcpu[1]);
+ TEST_ASSERT(!r, "pthread_create() failed");
+
+ r = pthread_create(&threads[1], NULL, vcpu_thread, vcpu[2]);
+ TEST_ASSERT(!r, "pthread_create() failed");
+
+ while (true) {
+ vcpu_run(vcpu[0]);
+ TEST_ASSERT_KVM_EXIT_REASON(vcpu[0], KVM_EXIT_IO);
+
+ switch (get_ucall(vcpu[0], &uc)) {
+ case UCALL_SYNC:
+ TEST_ASSERT(uc.args[1] == stage,
+ "Unexpected stage: %ld (%d expected)",
+ uc.args[1], stage);
+ break;
+ case UCALL_ABORT:
+ REPORT_GUEST_ASSERT(uc);
+ /* NOT REACHED */
+ case UCALL_DONE:
+ goto done;
+ default:
+ TEST_FAIL("Unknown ucall %lu", uc.cmd);
+ }
+
+ stage++;
+ }
+
+done:
+ cancel_join_vcpu_thread(threads[0], vcpu[1]);
+ cancel_join_vcpu_thread(threads[1], vcpu[2]);
+ kvm_vm_free(vm);
+
+ return 0;
+}
diff --git a/tools/testing/selftests/kvm/x86_64/kvm_clock_test.c b/tools/testing/selftests/kvm/x86_64/kvm_clock_test.c
new file mode 100644
index 000000000000..5bc12222d87a
--- /dev/null
+++ b/tools/testing/selftests/kvm/x86_64/kvm_clock_test.c
@@ -0,0 +1,156 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Copyright (C) 2021, Google LLC.
+ *
+ * Tests for adjusting the KVM clock from userspace
+ */
+#include <asm/kvm_para.h>
+#include <asm/pvclock.h>
+#include <asm/pvclock-abi.h>
+#include <stdint.h>
+#include <string.h>
+#include <sys/stat.h>
+#include <time.h>
+
+#include "test_util.h"
+#include "kvm_util.h"
+#include "processor.h"
+
+struct test_case {
+ uint64_t kvmclock_base;
+ int64_t realtime_offset;
+};
+
+static struct test_case test_cases[] = {
+ { .kvmclock_base = 0 },
+ { .kvmclock_base = 180 * NSEC_PER_SEC },
+ { .kvmclock_base = 0, .realtime_offset = -180 * NSEC_PER_SEC },
+ { .kvmclock_base = 0, .realtime_offset = 180 * NSEC_PER_SEC },
+};
+
+#define GUEST_SYNC_CLOCK(__stage, __val) \
+ GUEST_SYNC_ARGS(__stage, __val, 0, 0, 0)
+
+static void guest_main(vm_paddr_t pvti_pa, struct pvclock_vcpu_time_info *pvti)
+{
+ int i;
+
+ wrmsr(MSR_KVM_SYSTEM_TIME_NEW, pvti_pa | KVM_MSR_ENABLED);
+ for (i = 0; i < ARRAY_SIZE(test_cases); i++)
+ GUEST_SYNC_CLOCK(i, __pvclock_read_cycles(pvti, rdtsc()));
+}
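For context on what the guest is reporting: __pvclock_read_cycles() converts a raw TSC value to nanoseconds using the scale and base that KVM publishes in the pvclock page. A simplified sketch of the computation (it omits the version/seqlock retry and flags handling done by the real helper):

	static uint64_t pvclock_cycles_sketch(const struct pvclock_vcpu_time_info *pvti,
					      uint64_t tsc)
	{
		uint64_t delta = tsc - pvti->tsc_timestamp;

		/* Scale the TSC delta to nanoseconds (shift, then 64x32->96 multiply). */
		if (pvti->tsc_shift >= 0)
			delta <<= pvti->tsc_shift;
		else
			delta >>= -pvti->tsc_shift;

		return pvti->system_time +
		       (uint64_t)(((unsigned __int128)delta * pvti->tsc_to_system_mul) >> 32);
	}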
+
+#define EXPECTED_FLAGS (KVM_CLOCK_REALTIME | KVM_CLOCK_HOST_TSC)
+
+static inline void assert_flags(struct kvm_clock_data *data)
+{
+ TEST_ASSERT((data->flags & EXPECTED_FLAGS) == EXPECTED_FLAGS,
+ "unexpected clock data flags: %x (want set: %x)",
+ data->flags, EXPECTED_FLAGS);
+}
+
+static void handle_sync(struct ucall *uc, struct kvm_clock_data *start,
+ struct kvm_clock_data *end)
+{
+ uint64_t obs, exp_lo, exp_hi;
+
+ obs = uc->args[2];
+ exp_lo = start->clock;
+ exp_hi = end->clock;
+
+ assert_flags(start);
+ assert_flags(end);
+
+ TEST_ASSERT(exp_lo <= obs && obs <= exp_hi,
+ "unexpected kvm-clock value: %"PRIu64" expected range: [%"PRIu64", %"PRIu64"]",
+ obs, exp_lo, exp_hi);
+
+ pr_info("kvm-clock value: %"PRIu64" expected range [%"PRIu64", %"PRIu64"]\n",
+ obs, exp_lo, exp_hi);
+}
+
+static void handle_abort(struct ucall *uc)
+{
+ REPORT_GUEST_ASSERT(*uc);
+}
+
+static void setup_clock(struct kvm_vm *vm, struct test_case *test_case)
+{
+ struct kvm_clock_data data;
+
+ memset(&data, 0, sizeof(data));
+
+ data.clock = test_case->kvmclock_base;
+ if (test_case->realtime_offset) {
+ struct timespec ts;
+ int r;
+
+ data.flags |= KVM_CLOCK_REALTIME;
+ do {
+ r = clock_gettime(CLOCK_REALTIME, &ts);
+ if (!r)
+ break;
+ } while (errno == EINTR);
+
+ TEST_ASSERT(!r, "clock_gettime() failed: %d", r);
+
+ data.realtime = ts.tv_sec * NSEC_PER_SEC;
+ data.realtime += ts.tv_nsec;
+ data.realtime += test_case->realtime_offset;
+ }
+
+ vm_ioctl(vm, KVM_SET_CLOCK, &data);
+}
+
+static void enter_guest(struct kvm_vcpu *vcpu)
+{
+ struct kvm_clock_data start, end;
+ struct kvm_vm *vm = vcpu->vm;
+ struct ucall uc;
+ int i;
+
+ for (i = 0; i < ARRAY_SIZE(test_cases); i++) {
+ setup_clock(vm, &test_cases[i]);
+
+ vm_ioctl(vm, KVM_GET_CLOCK, &start);
+
+ vcpu_run(vcpu);
+ vm_ioctl(vm, KVM_GET_CLOCK, &end);
+
+ TEST_ASSERT_KVM_EXIT_REASON(vcpu, KVM_EXIT_IO);
+
+ switch (get_ucall(vcpu, &uc)) {
+ case UCALL_SYNC:
+ handle_sync(&uc, &start, &end);
+ break;
+ case UCALL_ABORT:
+ handle_abort(&uc);
+ return;
+ default:
+ TEST_ASSERT(0, "unhandled ucall: %ld", uc.cmd);
+ }
+ }
+}
+
+int main(void)
+{
+ struct kvm_vcpu *vcpu;
+ vm_vaddr_t pvti_gva;
+ vm_paddr_t pvti_gpa;
+ struct kvm_vm *vm;
+ int flags;
+
+ flags = kvm_check_cap(KVM_CAP_ADJUST_CLOCK);
+ TEST_REQUIRE(flags & KVM_CLOCK_REALTIME);
+
+ TEST_REQUIRE(sys_clocksource_is_based_on_tsc());
+
+ vm = vm_create_with_one_vcpu(&vcpu, guest_main);
+
+ pvti_gva = vm_vaddr_alloc(vm, getpagesize(), 0x10000);
+ pvti_gpa = addr_gva2gpa(vm, pvti_gva);
+ vcpu_args_set(vcpu, 2, pvti_gpa, pvti_gva);
+
+ enter_guest(vcpu);
+ kvm_vm_free(vm);
+}
diff --git a/tools/testing/selftests/kvm/x86_64/kvm_pv_test.c b/tools/testing/selftests/kvm/x86_64/kvm_pv_test.c
new file mode 100644
index 000000000000..9e2879af7c20
--- /dev/null
+++ b/tools/testing/selftests/kvm/x86_64/kvm_pv_test.c
@@ -0,0 +1,154 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Copyright (C) 2020, Google LLC.
+ *
+ * Tests for KVM paravirtual feature disablement
+ */
+#include <asm/kvm_para.h>
+#include <linux/kvm_para.h>
+#include <stdint.h>
+
+#include "test_util.h"
+#include "kvm_util.h"
+#include "processor.h"
+
+struct msr_data {
+ uint32_t idx;
+ const char *name;
+};
+
+#define TEST_MSR(msr) { .idx = msr, .name = #msr }
+#define UCALL_PR_MSR 0xdeadbeef
+#define PR_MSR(msr) ucall(UCALL_PR_MSR, 1, msr)
+
+/*
+ * KVM paravirtual msrs to test. Expect a #GP if any of these msrs are read or
+ * written, as the KVM_CPUID_FEATURES leaf is cleared.
+ */
+static struct msr_data msrs_to_test[] = {
+ TEST_MSR(MSR_KVM_SYSTEM_TIME),
+ TEST_MSR(MSR_KVM_SYSTEM_TIME_NEW),
+ TEST_MSR(MSR_KVM_WALL_CLOCK),
+ TEST_MSR(MSR_KVM_WALL_CLOCK_NEW),
+ TEST_MSR(MSR_KVM_ASYNC_PF_EN),
+ TEST_MSR(MSR_KVM_STEAL_TIME),
+ TEST_MSR(MSR_KVM_PV_EOI_EN),
+ TEST_MSR(MSR_KVM_POLL_CONTROL),
+ TEST_MSR(MSR_KVM_ASYNC_PF_INT),
+ TEST_MSR(MSR_KVM_ASYNC_PF_ACK),
+};
+
+static void test_msr(struct msr_data *msr)
+{
+ uint64_t ignored;
+ uint8_t vector;
+
+ PR_MSR(msr);
+
+ vector = rdmsr_safe(msr->idx, &ignored);
+ GUEST_ASSERT_EQ(vector, GP_VECTOR);
+
+ vector = wrmsr_safe(msr->idx, 0);
+ GUEST_ASSERT_EQ(vector, GP_VECTOR);
+}
+
+struct hcall_data {
+ uint64_t nr;
+ const char *name;
+};
+
+#define TEST_HCALL(hc) { .nr = hc, .name = #hc }
+#define UCALL_PR_HCALL 0xdeadc0de
+#define PR_HCALL(hc) ucall(UCALL_PR_HCALL, 1, hc)
+
+/*
+ * KVM hypercalls to test. Expect -KVM_ENOSYS when called, as the corresponding
+ * features have been cleared in KVM_CPUID_FEATURES.
+ */
+static struct hcall_data hcalls_to_test[] = {
+ TEST_HCALL(KVM_HC_KICK_CPU),
+ TEST_HCALL(KVM_HC_SEND_IPI),
+ TEST_HCALL(KVM_HC_SCHED_YIELD),
+};
+
+static void test_hcall(struct hcall_data *hc)
+{
+ uint64_t r;
+
+ PR_HCALL(hc);
+ r = kvm_hypercall(hc->nr, 0, 0, 0, 0);
+ GUEST_ASSERT_EQ(r, -KVM_ENOSYS);
+}
+
+static void guest_main(void)
+{
+ int i;
+
+ for (i = 0; i < ARRAY_SIZE(msrs_to_test); i++) {
+ test_msr(&msrs_to_test[i]);
+ }
+
+ for (i = 0; i < ARRAY_SIZE(hcalls_to_test); i++) {
+ test_hcall(&hcalls_to_test[i]);
+ }
+
+ GUEST_DONE();
+}
+
+static void pr_msr(struct ucall *uc)
+{
+ struct msr_data *msr = (struct msr_data *)uc->args[0];
+
+ pr_info("testing msr: %s (%#x)\n", msr->name, msr->idx);
+}
+
+static void pr_hcall(struct ucall *uc)
+{
+ struct hcall_data *hc = (struct hcall_data *)uc->args[0];
+
+ pr_info("testing hcall: %s (%lu)\n", hc->name, hc->nr);
+}
+
+static void enter_guest(struct kvm_vcpu *vcpu)
+{
+ struct ucall uc;
+
+ while (true) {
+ vcpu_run(vcpu);
+ TEST_ASSERT_KVM_EXIT_REASON(vcpu, KVM_EXIT_IO);
+
+ switch (get_ucall(vcpu, &uc)) {
+ case UCALL_PR_MSR:
+ pr_msr(&uc);
+ break;
+ case UCALL_PR_HCALL:
+ pr_hcall(&uc);
+ break;
+ case UCALL_ABORT:
+ REPORT_GUEST_ASSERT(uc);
+ return;
+ case UCALL_DONE:
+ return;
+ }
+ }
+}
+
+int main(void)
+{
+ struct kvm_vcpu *vcpu;
+ struct kvm_vm *vm;
+
+ TEST_REQUIRE(kvm_has_cap(KVM_CAP_ENFORCE_PV_FEATURE_CPUID));
+
+ vm = vm_create_with_one_vcpu(&vcpu, guest_main);
+
+ vcpu_enable_cap(vcpu, KVM_CAP_ENFORCE_PV_FEATURE_CPUID, 1);
+
+ vcpu_clear_cpuid_entry(vcpu, KVM_CPUID_FEATURES);
+
+ vm_init_descriptor_tables(vm);
+ vcpu_init_descriptor_tables(vcpu);
+
+ enter_guest(vcpu);
+ kvm_vm_free(vm);
+}
diff --git a/tools/testing/selftests/kvm/x86_64/max_vcpuid_cap_test.c b/tools/testing/selftests/kvm/x86_64/max_vcpuid_cap_test.c
new file mode 100644
index 000000000000..3cc4b86832fe
--- /dev/null
+++ b/tools/testing/selftests/kvm/x86_64/max_vcpuid_cap_test.c
@@ -0,0 +1,44 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * maximum APIC ID capability tests
+ *
+ * Copyright (C) 2022, Intel, Inc.
+ *
+ * Tests for getting/setting maximum APIC ID capability
+ */
+
+#include "kvm_util.h"
+
+#define MAX_VCPU_ID 2
+
+int main(int argc, char *argv[])
+{
+ struct kvm_vm *vm;
+ int ret;
+
+ vm = vm_create_barebones();
+
+ /* Get KVM_CAP_MAX_VCPU_ID cap supported in KVM */
+ ret = vm_check_cap(vm, KVM_CAP_MAX_VCPU_ID);
+
+ /* Try to set KVM_CAP_MAX_VCPU_ID beyond KVM cap */
+ ret = __vm_enable_cap(vm, KVM_CAP_MAX_VCPU_ID, ret + 1);
+ TEST_ASSERT(ret < 0,
+ "Setting KVM_CAP_MAX_VCPU_ID beyond KVM cap should fail");
+
+ /* Set KVM_CAP_MAX_VCPU_ID */
+ vm_enable_cap(vm, KVM_CAP_MAX_VCPU_ID, MAX_VCPU_ID);
+
+
+ /* Try to set KVM_CAP_MAX_VCPU_ID again */
+ ret = __vm_enable_cap(vm, KVM_CAP_MAX_VCPU_ID, MAX_VCPU_ID + 1);
+ TEST_ASSERT(ret < 0,
+ "Setting KVM_CAP_MAX_VCPU_ID multiple times should fail");
+
+	/* Create a vCPU with an ID beyond the KVM_CAP_MAX_VCPU_ID cap */
+ ret = __vm_ioctl(vm, KVM_CREATE_VCPU, (void *)MAX_VCPU_ID);
+ TEST_ASSERT(ret < 0, "Creating vCPU with ID > MAX_VCPU_ID should fail");
+
+ kvm_vm_free(vm);
+ return 0;
+}
diff --git a/tools/testing/selftests/kvm/x86_64/mmio_warning_test.c b/tools/testing/selftests/kvm/x86_64/mmio_warning_test.c
deleted file mode 100644
index e6480fd5c4bd..000000000000
--- a/tools/testing/selftests/kvm/x86_64/mmio_warning_test.c
+++ /dev/null
@@ -1,126 +0,0 @@
-/*
- * mmio_warning_test
- *
- * Copyright (C) 2019, Google LLC.
- *
- * This work is licensed under the terms of the GNU GPL, version 2.
- *
- * Test that we don't get a kernel warning when we call KVM_RUN after a
- * triple fault occurs. To get the triple fault to occur we call KVM_RUN
- * on a VCPU that hasn't been properly setup.
- *
- */
-
-#define _GNU_SOURCE
-#include <fcntl.h>
-#include <kvm_util.h>
-#include <linux/kvm.h>
-#include <processor.h>
-#include <pthread.h>
-#include <stdio.h>
-#include <stdlib.h>
-#include <string.h>
-#include <sys/ioctl.h>
-#include <sys/mman.h>
-#include <sys/stat.h>
-#include <sys/types.h>
-#include <sys/wait.h>
-#include <test_util.h>
-#include <unistd.h>
-
-#define NTHREAD 4
-#define NPROCESS 5
-
-struct thread_context {
- int kvmcpu;
- struct kvm_run *run;
-};
-
-void *thr(void *arg)
-{
- struct thread_context *tc = (struct thread_context *)arg;
- int res;
- int kvmcpu = tc->kvmcpu;
- struct kvm_run *run = tc->run;
-
- res = ioctl(kvmcpu, KVM_RUN, 0);
- pr_info("ret1=%d exit_reason=%d suberror=%d\n",
- res, run->exit_reason, run->internal.suberror);
-
- return 0;
-}
-
-void test(void)
-{
- int i, kvm, kvmvm, kvmcpu;
- pthread_t th[NTHREAD];
- struct kvm_run *run;
- struct thread_context tc;
-
- kvm = open("/dev/kvm", O_RDWR);
- TEST_ASSERT(kvm != -1, "failed to open /dev/kvm");
- kvmvm = ioctl(kvm, KVM_CREATE_VM, 0);
- TEST_ASSERT(kvmvm != -1, "KVM_CREATE_VM failed");
- kvmcpu = ioctl(kvmvm, KVM_CREATE_VCPU, 0);
- TEST_ASSERT(kvmcpu != -1, "KVM_CREATE_VCPU failed");
- run = (struct kvm_run *)mmap(0, 4096, PROT_READ|PROT_WRITE, MAP_SHARED,
- kvmcpu, 0);
- tc.kvmcpu = kvmcpu;
- tc.run = run;
- srand(getpid());
- for (i = 0; i < NTHREAD; i++) {
- pthread_create(&th[i], NULL, thr, (void *)(uintptr_t)&tc);
- usleep(rand() % 10000);
- }
- for (i = 0; i < NTHREAD; i++)
- pthread_join(th[i], NULL);
-}
-
-int get_warnings_count(void)
-{
- int warnings;
- FILE *f;
-
- f = popen("dmesg | grep \"WARNING:\" | wc -l", "r");
- fscanf(f, "%d", &warnings);
- fclose(f);
-
- return warnings;
-}
-
-int main(void)
-{
- int warnings_before, warnings_after;
-
- if (!is_intel_cpu()) {
- print_skip("Must be run on an Intel CPU");
- exit(KSFT_SKIP);
- }
-
- if (vm_is_unrestricted_guest(NULL)) {
- print_skip("Unrestricted guest must be disabled");
- exit(KSFT_SKIP);
- }
-
- warnings_before = get_warnings_count();
-
- for (int i = 0; i < NPROCESS; ++i) {
- int status;
- int pid = fork();
-
- if (pid < 0)
- exit(1);
- if (pid == 0) {
- test();
- exit(0);
- }
- while (waitpid(pid, &status, __WALL) != pid)
- ;
- }
-
- warnings_after = get_warnings_count();
- TEST_ASSERT(warnings_before == warnings_after,
- "Warnings found in kernel. Run 'dmesg' to inspect them.");
-
- return 0;
-}
diff --git a/tools/testing/selftests/kvm/x86_64/monitor_mwait_test.c b/tools/testing/selftests/kvm/x86_64/monitor_mwait_test.c
new file mode 100644
index 000000000000..853802641e1e
--- /dev/null
+++ b/tools/testing/selftests/kvm/x86_64/monitor_mwait_test.c
@@ -0,0 +1,131 @@
+// SPDX-License-Identifier: GPL-2.0
+#include <fcntl.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <sys/ioctl.h>
+
+#include "kvm_util.h"
+#include "processor.h"
+
+#define CPUID_MWAIT (1u << 3)
+
+enum monitor_mwait_testcases {
+ MWAIT_QUIRK_DISABLED = BIT(0),
+ MISC_ENABLES_QUIRK_DISABLED = BIT(1),
+ MWAIT_DISABLED = BIT(2),
+};
+
+/*
+ * If both MWAIT and its quirk are disabled, MONITOR/MWAIT should #UD; in all
+ * other scenarios KVM should emulate them as nops.
+ */
+#define GUEST_ASSERT_MONITOR_MWAIT(insn, testcase, vector) \
+do { \
+ bool fault_wanted = ((testcase) & MWAIT_QUIRK_DISABLED) && \
+ ((testcase) & MWAIT_DISABLED); \
+ \
+ if (fault_wanted) \
+ __GUEST_ASSERT((vector) == UD_VECTOR, \
+ "Expected #UD on " insn " for testcase '0x%x', got '0x%x'", \
+ testcase, vector); \
+ else \
+ __GUEST_ASSERT(!(vector), \
+ "Expected success on " insn " for testcase '0x%x', got '0x%x'", \
+ testcase, vector); \
+} while (0)
+
+static void guest_monitor_wait(int testcase)
+{
+ u8 vector;
+
+ GUEST_SYNC(testcase);
+
+ /*
+	 * Arbitrarily MONITOR this function; SVM performs fault checks before
+ * intercept checks, so the inputs for MONITOR and MWAIT must be valid.
+ */
+ vector = kvm_asm_safe("monitor", "a"(guest_monitor_wait), "c"(0), "d"(0));
+ GUEST_ASSERT_MONITOR_MWAIT("MONITOR", testcase, vector);
+
+ vector = kvm_asm_safe("mwait", "a"(guest_monitor_wait), "c"(0), "d"(0));
+ GUEST_ASSERT_MONITOR_MWAIT("MWAIT", testcase, vector);
+}
+
+static void guest_code(void)
+{
+ guest_monitor_wait(MWAIT_DISABLED);
+
+ guest_monitor_wait(MWAIT_QUIRK_DISABLED | MWAIT_DISABLED);
+
+ guest_monitor_wait(MISC_ENABLES_QUIRK_DISABLED | MWAIT_DISABLED);
+ guest_monitor_wait(MISC_ENABLES_QUIRK_DISABLED);
+
+ guest_monitor_wait(MISC_ENABLES_QUIRK_DISABLED | MWAIT_QUIRK_DISABLED | MWAIT_DISABLED);
+ guest_monitor_wait(MISC_ENABLES_QUIRK_DISABLED | MWAIT_QUIRK_DISABLED);
+
+ GUEST_DONE();
+}
+
+int main(int argc, char *argv[])
+{
+ uint64_t disabled_quirks;
+ struct kvm_vcpu *vcpu;
+ struct kvm_vm *vm;
+ struct ucall uc;
+ int testcase;
+
+ TEST_REQUIRE(kvm_has_cap(KVM_CAP_DISABLE_QUIRKS2));
+
+ vm = vm_create_with_one_vcpu(&vcpu, guest_code);
+ vcpu_clear_cpuid_feature(vcpu, X86_FEATURE_MWAIT);
+
+ vm_init_descriptor_tables(vm);
+ vcpu_init_descriptor_tables(vcpu);
+
+ while (1) {
+ vcpu_run(vcpu);
+ TEST_ASSERT_KVM_EXIT_REASON(vcpu, KVM_EXIT_IO);
+
+ switch (get_ucall(vcpu, &uc)) {
+ case UCALL_SYNC:
+ testcase = uc.args[1];
+ break;
+ case UCALL_ABORT:
+ REPORT_GUEST_ASSERT(uc);
+ goto done;
+ case UCALL_DONE:
+ goto done;
+ default:
+ TEST_FAIL("Unknown ucall %lu", uc.cmd);
+ goto done;
+ }
+
+ disabled_quirks = 0;
+ if (testcase & MWAIT_QUIRK_DISABLED)
+ disabled_quirks |= KVM_X86_QUIRK_MWAIT_NEVER_UD_FAULTS;
+ if (testcase & MISC_ENABLES_QUIRK_DISABLED)
+ disabled_quirks |= KVM_X86_QUIRK_MISC_ENABLE_NO_MWAIT;
+ vm_enable_cap(vm, KVM_CAP_DISABLE_QUIRKS2, disabled_quirks);
+
+ /*
+ * If the MISC_ENABLES quirk (KVM neglects to update CPUID to
+ * enable/disable MWAIT) is disabled, toggle the ENABLE_MWAIT
+ * bit in MISC_ENABLES accordingly. If the quirk is enabled,
+ * the only valid configuration is MWAIT disabled, as CPUID
+ * can't be manually changed after running the vCPU.
+ */
+ if (!(testcase & MISC_ENABLES_QUIRK_DISABLED)) {
+ TEST_ASSERT(testcase & MWAIT_DISABLED,
+ "Can't toggle CPUID features after running vCPU");
+ continue;
+ }
+
+ vcpu_set_msr(vcpu, MSR_IA32_MISC_ENABLE,
+ (testcase & MWAIT_DISABLED) ? 0 : MSR_IA32_MISC_ENABLE_MWAIT);
+ }
+
+done:
+ kvm_vm_free(vm);
+ return 0;
+}
diff --git a/tools/testing/selftests/kvm/x86_64/nested_exceptions_test.c b/tools/testing/selftests/kvm/x86_64/nested_exceptions_test.c
new file mode 100644
index 000000000000..3670331adf21
--- /dev/null
+++ b/tools/testing/selftests/kvm/x86_64/nested_exceptions_test.c
@@ -0,0 +1,290 @@
+// SPDX-License-Identifier: GPL-2.0-only
+#define _GNU_SOURCE /* for program_invocation_short_name */
+
+#include "test_util.h"
+#include "kvm_util.h"
+#include "processor.h"
+#include "vmx.h"
+#include "svm_util.h"
+
+#define L2_GUEST_STACK_SIZE 256
+
+/*
+ * Arbitrary, never shoved into KVM/hardware, just need to avoid conflict with
+ * the "real" exceptions used, #SS/#GP/#DF (12/13/8).
+ */
+#define FAKE_TRIPLE_FAULT_VECTOR 0xaa
+
+/* Arbitrary 32-bit error code injected by this test. */
+#define SS_ERROR_CODE 0xdeadbeef
+
+/*
+ * Bit '0' is set on Intel if the exception occurs while delivering a previous
+ * event/exception. AMD's wording is ambiguous, but presumably the bit is set
+ * if the exception occurs while delivering an external event, e.g. NMI or INTR,
+ * but not for exceptions that occur when delivering other exceptions or
+ * software interrupts.
+ *
+ * Note, Intel's name for it, "External event", is misleading and much more
+ * aligned with AMD's behavior, but the SDM is quite clear on its behavior.
+ */
+#define ERROR_CODE_EXT_FLAG BIT(0)
+
+/*
+ * Bit '1' is set if the fault occurred when looking up a descriptor in the
+ * IDT, which is the case here as the IDT is empty/NULL.
+ */
+#define ERROR_CODE_IDT_FLAG BIT(1)
+
+/*
+ * The #GP that occurs when vectoring #SS should show the index into the IDT
+ * for #SS, plus have the "IDT flag" set.
+ */
+#define GP_ERROR_CODE_AMD ((SS_VECTOR * 8) | ERROR_CODE_IDT_FLAG)
+#define GP_ERROR_CODE_INTEL ((SS_VECTOR * 8) | ERROR_CODE_IDT_FLAG | ERROR_CODE_EXT_FLAG)
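Worked out numerically: #SS is vector 12, so the AMD error code is (12 * 8) | IDT = 0x62, and the Intel variant adds the EXT flag for 0x63. Purely for illustration (not part of the patch), the arithmetic can be pinned down at compile time:

	_Static_assert(GP_ERROR_CODE_AMD == 0x62, "#GP error code when vectoring #SS (AMD)");
	_Static_assert(GP_ERROR_CODE_INTEL == 0x63, "#GP error code when vectoring #SS (Intel)");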
+
+/*
+ * Intel and AMD both shove '0' into the error code on #DF, regardless of what
+ * led to the double fault.
+ */
+#define DF_ERROR_CODE 0
+
+#define INTERCEPT_SS (BIT_ULL(SS_VECTOR))
+#define INTERCEPT_SS_DF (INTERCEPT_SS | BIT_ULL(DF_VECTOR))
+#define INTERCEPT_SS_GP_DF (INTERCEPT_SS_DF | BIT_ULL(GP_VECTOR))
+
+static void l2_ss_pending_test(void)
+{
+ GUEST_SYNC(SS_VECTOR);
+}
+
+static void l2_ss_injected_gp_test(void)
+{
+ GUEST_SYNC(GP_VECTOR);
+}
+
+static void l2_ss_injected_df_test(void)
+{
+ GUEST_SYNC(DF_VECTOR);
+}
+
+static void l2_ss_injected_tf_test(void)
+{
+ GUEST_SYNC(FAKE_TRIPLE_FAULT_VECTOR);
+}
+
+static void svm_run_l2(struct svm_test_data *svm, void *l2_code, int vector,
+ uint32_t error_code)
+{
+ struct vmcb *vmcb = svm->vmcb;
+ struct vmcb_control_area *ctrl = &vmcb->control;
+
+ vmcb->save.rip = (u64)l2_code;
+ run_guest(vmcb, svm->vmcb_gpa);
+
+ if (vector == FAKE_TRIPLE_FAULT_VECTOR)
+ return;
+
+ GUEST_ASSERT_EQ(ctrl->exit_code, (SVM_EXIT_EXCP_BASE + vector));
+ GUEST_ASSERT_EQ(ctrl->exit_info_1, error_code);
+}
+
+static void l1_svm_code(struct svm_test_data *svm)
+{
+ struct vmcb_control_area *ctrl = &svm->vmcb->control;
+ unsigned long l2_guest_stack[L2_GUEST_STACK_SIZE];
+
+ generic_svm_setup(svm, NULL, &l2_guest_stack[L2_GUEST_STACK_SIZE]);
+ svm->vmcb->save.idtr.limit = 0;
+ ctrl->intercept |= BIT_ULL(INTERCEPT_SHUTDOWN);
+
+ ctrl->intercept_exceptions = INTERCEPT_SS_GP_DF;
+ svm_run_l2(svm, l2_ss_pending_test, SS_VECTOR, SS_ERROR_CODE);
+ svm_run_l2(svm, l2_ss_injected_gp_test, GP_VECTOR, GP_ERROR_CODE_AMD);
+
+ ctrl->intercept_exceptions = INTERCEPT_SS_DF;
+ svm_run_l2(svm, l2_ss_injected_df_test, DF_VECTOR, DF_ERROR_CODE);
+
+ ctrl->intercept_exceptions = INTERCEPT_SS;
+ svm_run_l2(svm, l2_ss_injected_tf_test, FAKE_TRIPLE_FAULT_VECTOR, 0);
+ GUEST_ASSERT_EQ(ctrl->exit_code, SVM_EXIT_SHUTDOWN);
+
+ GUEST_DONE();
+}
+
+static void vmx_run_l2(void *l2_code, int vector, uint32_t error_code)
+{
+ GUEST_ASSERT(!vmwrite(GUEST_RIP, (u64)l2_code));
+
+ GUEST_ASSERT_EQ(vector == SS_VECTOR ? vmlaunch() : vmresume(), 0);
+
+ if (vector == FAKE_TRIPLE_FAULT_VECTOR)
+ return;
+
+ GUEST_ASSERT_EQ(vmreadz(VM_EXIT_REASON), EXIT_REASON_EXCEPTION_NMI);
+ GUEST_ASSERT_EQ((vmreadz(VM_EXIT_INTR_INFO) & 0xff), vector);
+ GUEST_ASSERT_EQ(vmreadz(VM_EXIT_INTR_ERROR_CODE), error_code);
+}
+
+static void l1_vmx_code(struct vmx_pages *vmx)
+{
+ unsigned long l2_guest_stack[L2_GUEST_STACK_SIZE];
+
+ GUEST_ASSERT_EQ(prepare_for_vmx_operation(vmx), true);
+
+ GUEST_ASSERT_EQ(load_vmcs(vmx), true);
+
+ prepare_vmcs(vmx, NULL, &l2_guest_stack[L2_GUEST_STACK_SIZE]);
+ GUEST_ASSERT_EQ(vmwrite(GUEST_IDTR_LIMIT, 0), 0);
+
+ /*
+ * VMX disallows injecting an exception with error_code[31:16] != 0,
+ * and hardware will never generate a VM-Exit with bits 31:16 set.
+ * KVM should likewise truncate the "bad" userspace value.
+ */
+ GUEST_ASSERT_EQ(vmwrite(EXCEPTION_BITMAP, INTERCEPT_SS_GP_DF), 0);
+ vmx_run_l2(l2_ss_pending_test, SS_VECTOR, (u16)SS_ERROR_CODE);
+ vmx_run_l2(l2_ss_injected_gp_test, GP_VECTOR, GP_ERROR_CODE_INTEL);
+
+ GUEST_ASSERT_EQ(vmwrite(EXCEPTION_BITMAP, INTERCEPT_SS_DF), 0);
+ vmx_run_l2(l2_ss_injected_df_test, DF_VECTOR, DF_ERROR_CODE);
+
+ GUEST_ASSERT_EQ(vmwrite(EXCEPTION_BITMAP, INTERCEPT_SS), 0);
+ vmx_run_l2(l2_ss_injected_tf_test, FAKE_TRIPLE_FAULT_VECTOR, 0);
+ GUEST_ASSERT_EQ(vmreadz(VM_EXIT_REASON), EXIT_REASON_TRIPLE_FAULT);
+
+ GUEST_DONE();
+}
+
+static void __attribute__((__flatten__)) l1_guest_code(void *test_data)
+{
+ if (this_cpu_has(X86_FEATURE_SVM))
+ l1_svm_code(test_data);
+ else
+ l1_vmx_code(test_data);
+}
+
+static void assert_ucall_vector(struct kvm_vcpu *vcpu, int vector)
+{
+ struct ucall uc;
+
+ TEST_ASSERT_KVM_EXIT_REASON(vcpu, KVM_EXIT_IO);
+
+ switch (get_ucall(vcpu, &uc)) {
+ case UCALL_SYNC:
+ TEST_ASSERT(vector == uc.args[1],
+ "Expected L2 to ask for %d, got %ld", vector, uc.args[1]);
+ break;
+ case UCALL_DONE:
+ TEST_ASSERT(vector == -1,
+ "Expected L2 to ask for %d, L2 says it's done", vector);
+ break;
+ case UCALL_ABORT:
+ REPORT_GUEST_ASSERT(uc);
+ break;
+ default:
+ TEST_FAIL("Expected L2 to ask for %d, got unexpected ucall %lu", vector, uc.cmd);
+ }
+}
+
+static void queue_ss_exception(struct kvm_vcpu *vcpu, bool inject)
+{
+ struct kvm_vcpu_events events;
+
+ vcpu_events_get(vcpu, &events);
+
+ TEST_ASSERT(!events.exception.pending,
+		    "Vector %d unexpectedly pending", events.exception.nr);
+ TEST_ASSERT(!events.exception.injected,
+ "Vector %d unexpectedly injected", events.exception.nr);
+
+ events.flags = KVM_VCPUEVENT_VALID_PAYLOAD;
+ events.exception.pending = !inject;
+ events.exception.injected = inject;
+ events.exception.nr = SS_VECTOR;
+ events.exception.has_error_code = true;
+ events.exception.error_code = SS_ERROR_CODE;
+ vcpu_events_set(vcpu, &events);
+}
+
+/*
+ * Verify KVM_{G,S}ET_VCPU_EVENTS play nice with pending vs. injected exceptions
+ * when an exception is being queued for L2. Specifically, verify that KVM
+ * honors L1 exception intercept controls when a #SS is pending/injected,
+ * triggers a #GP on vectoring the #SS, morphs to #DF if #GP isn't intercepted
+ * by L1, and finally causes (nested) SHUTDOWN if #DF isn't intercepted by L1.
+ */
+int main(int argc, char *argv[])
+{
+ vm_vaddr_t nested_test_data_gva;
+ struct kvm_vcpu_events events;
+ struct kvm_vcpu *vcpu;
+ struct kvm_vm *vm;
+
+ TEST_REQUIRE(kvm_has_cap(KVM_CAP_EXCEPTION_PAYLOAD));
+ TEST_REQUIRE(kvm_cpu_has(X86_FEATURE_SVM) || kvm_cpu_has(X86_FEATURE_VMX));
+
+ vm = vm_create_with_one_vcpu(&vcpu, l1_guest_code);
+ vm_enable_cap(vm, KVM_CAP_EXCEPTION_PAYLOAD, -2ul);
+
+ if (kvm_cpu_has(X86_FEATURE_SVM))
+ vcpu_alloc_svm(vm, &nested_test_data_gva);
+ else
+ vcpu_alloc_vmx(vm, &nested_test_data_gva);
+
+ vcpu_args_set(vcpu, 1, nested_test_data_gva);
+
+ /* Run L1 => L2. L2 should sync and request #SS. */
+ vcpu_run(vcpu);
+ assert_ucall_vector(vcpu, SS_VECTOR);
+
+ /* Pend #SS and request immediate exit. #SS should still be pending. */
+ queue_ss_exception(vcpu, false);
+ vcpu->run->immediate_exit = true;
+ vcpu_run_complete_io(vcpu);
+
+	/* Verify the pending event comes back out the same as it went in. */
+ vcpu_events_get(vcpu, &events);
+ TEST_ASSERT_EQ(events.flags & KVM_VCPUEVENT_VALID_PAYLOAD,
+ KVM_VCPUEVENT_VALID_PAYLOAD);
+ TEST_ASSERT_EQ(events.exception.pending, true);
+ TEST_ASSERT_EQ(events.exception.nr, SS_VECTOR);
+ TEST_ASSERT_EQ(events.exception.has_error_code, true);
+ TEST_ASSERT_EQ(events.exception.error_code, SS_ERROR_CODE);
+
+ /*
+ * Run for real with the pending #SS, L1 should get a VM-Exit due to
+ * #SS interception and re-enter L2 to request #GP (via injected #SS).
+ */
+ vcpu->run->immediate_exit = false;
+ vcpu_run(vcpu);
+ assert_ucall_vector(vcpu, GP_VECTOR);
+
+ /*
+ * Inject #SS, the #SS should bypass interception and cause #GP, which
+ * L1 should intercept before KVM morphs it to #DF. L1 should then
+ * disable #GP interception and run L2 to request #DF (via #SS => #GP).
+ */
+ queue_ss_exception(vcpu, true);
+ vcpu_run(vcpu);
+ assert_ucall_vector(vcpu, DF_VECTOR);
+
+ /*
+	 * Inject #SS, the #SS should bypass interception and cause #GP, which
+	 * L1 is no longer intercepting, and so L1 should see a #DF VM-Exit.
+	 * L1 should then signal that it is done.
+ */
+ queue_ss_exception(vcpu, true);
+ vcpu_run(vcpu);
+ assert_ucall_vector(vcpu, FAKE_TRIPLE_FAULT_VECTOR);
+
+ /*
+ * Inject #SS yet again. L1 is not intercepting #GP or #DF, and so
+ * should see nested TRIPLE_FAULT / SHUTDOWN.
+ */
+ queue_ss_exception(vcpu, true);
+ vcpu_run(vcpu);
+ assert_ucall_vector(vcpu, -1);
+
+ kvm_vm_free(vm);
+}
diff --git a/tools/testing/selftests/kvm/x86_64/nx_huge_pages_test.c b/tools/testing/selftests/kvm/x86_64/nx_huge_pages_test.c
new file mode 100644
index 000000000000..17bbb96fc4df
--- /dev/null
+++ b/tools/testing/selftests/kvm/x86_64/nx_huge_pages_test.c
@@ -0,0 +1,269 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Usage: to be run via nx_huge_pages_test.sh, which does the necessary
+ * environment setup and teardown
+ *
+ * Copyright (C) 2022, Google LLC.
+ */
+
+#define _GNU_SOURCE
+
+#include <fcntl.h>
+#include <stdint.h>
+#include <time.h>
+
+#include <test_util.h>
+#include "kvm_util.h"
+#include "processor.h"
+
+#define HPAGE_SLOT 10
+#define HPAGE_GPA (4UL << 30) /* 4G prevents collision w/ slot 0 */
+#define HPAGE_GVA HPAGE_GPA /* GVA is arbitrary, so use GPA. */
+#define PAGES_PER_2MB_HUGE_PAGE 512
+#define HPAGE_SLOT_NPAGES (3 * PAGES_PER_2MB_HUGE_PAGE)
+
+/*
+ * Passed by nx_huge_pages_test.sh to provide an easy warning if this test is
+ * being run without it.
+ */
+#define MAGIC_TOKEN 887563923
+
+/*
+ * x86 opcode for the return instruction. Used to call into, and then
+ * immediately return from, memory backed with hugepages.
+ */
+#define RETURN_OPCODE 0xC3
+
+/* Call the specified memory address. */
+static void guest_do_CALL(uint64_t target)
+{
+ ((void (*)(void)) target)();
+}
+
+/*
+ * Exit the VM after each memory access so that the userspace component of the
+ * test can make assertions about the pages backing the VM.
+ *
+ * See the below for an explanation of how each access should affect the
+ * backing mappings.
+ */
+void guest_code(void)
+{
+ uint64_t hpage_1 = HPAGE_GVA;
+ uint64_t hpage_2 = hpage_1 + (PAGE_SIZE * 512);
+ uint64_t hpage_3 = hpage_2 + (PAGE_SIZE * 512);
+
+ READ_ONCE(*(uint64_t *)hpage_1);
+ GUEST_SYNC(1);
+
+ READ_ONCE(*(uint64_t *)hpage_2);
+ GUEST_SYNC(2);
+
+ guest_do_CALL(hpage_1);
+ GUEST_SYNC(3);
+
+ guest_do_CALL(hpage_3);
+ GUEST_SYNC(4);
+
+ READ_ONCE(*(uint64_t *)hpage_1);
+ GUEST_SYNC(5);
+
+ READ_ONCE(*(uint64_t *)hpage_3);
+ GUEST_SYNC(6);
+}
+
+static void check_2m_page_count(struct kvm_vm *vm, int expected_pages_2m)
+{
+ int actual_pages_2m;
+
+ actual_pages_2m = vm_get_stat(vm, "pages_2m");
+
+ TEST_ASSERT(actual_pages_2m == expected_pages_2m,
+ "Unexpected 2m page count. Expected %d, got %d",
+ expected_pages_2m, actual_pages_2m);
+}
+
+static void check_split_count(struct kvm_vm *vm, int expected_splits)
+{
+ int actual_splits;
+
+ actual_splits = vm_get_stat(vm, "nx_lpage_splits");
+
+ TEST_ASSERT(actual_splits == expected_splits,
+ "Unexpected NX huge page split count. Expected %d, got %d",
+ expected_splits, actual_splits);
+}
+
+static void wait_for_reclaim(int reclaim_period_ms)
+{
+ long reclaim_wait_ms;
+ struct timespec ts;
+
+ reclaim_wait_ms = reclaim_period_ms * 5;
+ ts.tv_sec = reclaim_wait_ms / 1000;
+ ts.tv_nsec = (reclaim_wait_ms - (ts.tv_sec * 1000)) * 1000000;
+ nanosleep(&ts, NULL);
+}
+
+void run_test(int reclaim_period_ms, bool disable_nx_huge_pages,
+ bool reboot_permissions)
+{
+ struct kvm_vcpu *vcpu;
+ struct kvm_vm *vm;
+ uint64_t nr_bytes;
+ void *hva;
+ int r;
+
+ vm = vm_create(1);
+
+ if (disable_nx_huge_pages) {
+ r = __vm_disable_nx_huge_pages(vm);
+ if (reboot_permissions) {
+ TEST_ASSERT(!r, "Disabling NX huge pages should succeed if process has reboot permissions");
+ } else {
+ TEST_ASSERT(r == -1 && errno == EPERM,
+ "This process should not have permission to disable NX huge pages");
+ return;
+ }
+ }
+
+ vcpu = vm_vcpu_add(vm, 0, guest_code);
+
+ vm_userspace_mem_region_add(vm, VM_MEM_SRC_ANONYMOUS_HUGETLB,
+ HPAGE_GPA, HPAGE_SLOT,
+ HPAGE_SLOT_NPAGES, 0);
+
+ nr_bytes = HPAGE_SLOT_NPAGES * vm->page_size;
+
+ /*
+ * Ensure that KVM can map HPAGE_SLOT with huge pages by mapping the
+ * region into the guest with 2MiB pages whenever TDP is disabled (i.e.
+ * whenever KVM is shadowing the guest page tables).
+ *
+ * When TDP is enabled, KVM should be able to map HPAGE_SLOT with huge
+ * pages irrespective of the guest page size, so map with 4KiB pages
+ * to test that that is the case.
+ */
+ if (kvm_is_tdp_enabled())
+ virt_map_level(vm, HPAGE_GVA, HPAGE_GPA, nr_bytes, PG_LEVEL_4K);
+ else
+ virt_map_level(vm, HPAGE_GVA, HPAGE_GPA, nr_bytes, PG_LEVEL_2M);
+
+ hva = addr_gpa2hva(vm, HPAGE_GPA);
+ memset(hva, RETURN_OPCODE, nr_bytes);
+
+ check_2m_page_count(vm, 0);
+ check_split_count(vm, 0);
+
+ /*
+ * The guest code will first read from the first hugepage, resulting
+ * in a huge page mapping being created.
+ */
+ vcpu_run(vcpu);
+ check_2m_page_count(vm, 1);
+ check_split_count(vm, 0);
+
+ /*
+ * Then the guest code will read from the second hugepage, resulting
+ * in another huge page mapping being created.
+ */
+ vcpu_run(vcpu);
+ check_2m_page_count(vm, 2);
+ check_split_count(vm, 0);
+
+ /*
+ * Next, the guest will execute from the first huge page, causing it
+ * to be remapped at 4k.
+ *
+ * If NX huge pages are disabled, this should have no effect.
+ */
+ vcpu_run(vcpu);
+ check_2m_page_count(vm, disable_nx_huge_pages ? 2 : 1);
+ check_split_count(vm, disable_nx_huge_pages ? 0 : 1);
+
+ /*
+ * Executing from the third huge page (previously unaccessed) will
+	 * cause part of it to be mapped at 4k.
+ *
+ * If NX huge pages are disabled, it should be mapped at 2M.
+ */
+ vcpu_run(vcpu);
+ check_2m_page_count(vm, disable_nx_huge_pages ? 3 : 1);
+ check_split_count(vm, disable_nx_huge_pages ? 0 : 2);
+
+ /* Reading from the first huge page again should have no effect. */
+ vcpu_run(vcpu);
+ check_2m_page_count(vm, disable_nx_huge_pages ? 3 : 1);
+ check_split_count(vm, disable_nx_huge_pages ? 0 : 2);
+
+ /* Give recovery thread time to run. */
+ wait_for_reclaim(reclaim_period_ms);
+
+ /*
+ * Now that the reclaimer has run, all the split pages should be gone.
+ *
+	 * If NX huge pages are disabled, the reclaimer will not run, so
+ * nothing should change from here on.
+ */
+ check_2m_page_count(vm, disable_nx_huge_pages ? 3 : 1);
+ check_split_count(vm, 0);
+
+ /*
+ * The 4k mapping on hpage 3 should have been removed, so check that
+ * reading from it causes a huge page mapping to be installed.
+ */
+ vcpu_run(vcpu);
+ check_2m_page_count(vm, disable_nx_huge_pages ? 3 : 2);
+ check_split_count(vm, 0);
+
+ kvm_vm_free(vm);
+}
+
+static void help(char *name)
+{
+ puts("");
+	printf("usage: %s [-h] [-p period_ms] [-t token] [-r]\n", name);
+ puts("");
+ printf(" -p: The NX reclaim period in milliseconds.\n");
+ printf(" -t: The magic token to indicate environment setup is done.\n");
+ printf(" -r: The test has reboot permissions and can disable NX huge pages.\n");
+ puts("");
+ exit(0);
+}
+
+int main(int argc, char **argv)
+{
+ int reclaim_period_ms = 0, token = 0, opt;
+ bool reboot_permissions = false;
+
+ while ((opt = getopt(argc, argv, "hp:t:r")) != -1) {
+ switch (opt) {
+ case 'p':
+ reclaim_period_ms = atoi_positive("Reclaim period", optarg);
+ break;
+ case 't':
+ token = atoi_paranoid(optarg);
+ break;
+ case 'r':
+ reboot_permissions = true;
+ break;
+ case 'h':
+ default:
+ help(argv[0]);
+ break;
+ }
+ }
+
+ TEST_REQUIRE(kvm_has_cap(KVM_CAP_VM_DISABLE_NX_HUGE_PAGES));
+
+ __TEST_REQUIRE(token == MAGIC_TOKEN,
+ "This test must be run with the magic token via '-t %d'.\n"
+ "Running via nx_huge_pages_test.sh, which also handles "
+ "environment setup, is strongly recommended.", MAGIC_TOKEN);
+
+ run_test(reclaim_period_ms, false, reboot_permissions);
+ run_test(reclaim_period_ms, true, reboot_permissions);
+
+ return 0;
+}
+
diff --git a/tools/testing/selftests/kvm/x86_64/nx_huge_pages_test.sh b/tools/testing/selftests/kvm/x86_64/nx_huge_pages_test.sh
new file mode 100755
index 000000000000..7cbb409801ee
--- /dev/null
+++ b/tools/testing/selftests/kvm/x86_64/nx_huge_pages_test.sh
@@ -0,0 +1,58 @@
+#!/bin/bash
+# SPDX-License-Identifier: GPL-2.0-only
+#
+# Wrapper script which performs setup and cleanup for nx_huge_pages_test.
+# Makes use of root privileges to set up huge pages and KVM module parameters.
+#
+# Copyright (C) 2022, Google LLC.
+
+set -e
+
+NX_HUGE_PAGES=$(cat /sys/module/kvm/parameters/nx_huge_pages)
+NX_HUGE_PAGES_RECOVERY_RATIO=$(cat /sys/module/kvm/parameters/nx_huge_pages_recovery_ratio)
+NX_HUGE_PAGES_RECOVERY_PERIOD=$(cat /sys/module/kvm/parameters/nx_huge_pages_recovery_period_ms)
+HUGE_PAGES=$(cat /sys/kernel/mm/hugepages/hugepages-2048kB/nr_hugepages)
+
+set +e
+
+function sudo_echo () {
+ echo "$1" | sudo tee -a "$2" > /dev/null
+}
+
+NXECUTABLE="$(dirname $0)/nx_huge_pages_test"
+
+sudo_echo test /dev/null || exit 4 # KSFT_SKIP=4
+
+(
+ set -e
+
+ sudo_echo 1 /sys/module/kvm/parameters/nx_huge_pages
+ sudo_echo 1 /sys/module/kvm/parameters/nx_huge_pages_recovery_ratio
+ sudo_echo 100 /sys/module/kvm/parameters/nx_huge_pages_recovery_period_ms
+ sudo_echo "$(( $HUGE_PAGES + 3 ))" /sys/kernel/mm/hugepages/hugepages-2048kB/nr_hugepages
+
+ # Test with reboot permissions
+ if [ $(whoami) == "root" ] || sudo setcap cap_sys_boot+ep $NXECUTABLE 2> /dev/null; then
+ echo Running test with CAP_SYS_BOOT enabled
+ $NXECUTABLE -t 887563923 -p 100 -r
+ test $(whoami) == "root" || sudo setcap cap_sys_boot-ep $NXECUTABLE
+ else
+ echo setcap failed, skipping nx_huge_pages_test with CAP_SYS_BOOT enabled
+ fi
+
+ # Test without reboot permissions
+ if [ $(whoami) != "root" ] ; then
+ echo Running test with CAP_SYS_BOOT disabled
+ $NXECUTABLE -t 887563923 -p 100
+ else
+ echo Running as root, skipping nx_huge_pages_test with CAP_SYS_BOOT disabled
+ fi
+)
+RET=$?
+
+sudo_echo "$NX_HUGE_PAGES" /sys/module/kvm/parameters/nx_huge_pages
+sudo_echo "$NX_HUGE_PAGES_RECOVERY_RATIO" /sys/module/kvm/parameters/nx_huge_pages_recovery_ratio
+sudo_echo "$NX_HUGE_PAGES_RECOVERY_PERIOD" /sys/module/kvm/parameters/nx_huge_pages_recovery_period_ms
+sudo_echo "$HUGE_PAGES" /sys/kernel/mm/hugepages/hugepages-2048kB/nr_hugepages
+
+exit $RET
diff --git a/tools/testing/selftests/kvm/x86_64/platform_info_test.c b/tools/testing/selftests/kvm/x86_64/platform_info_test.c
index 1e89688cbbbf..87011965dc41 100644
--- a/tools/testing/selftests/kvm/x86_64/platform_info_test.c
+++ b/tools/testing/selftests/kvm/x86_64/platform_info_test.c
@@ -21,7 +21,6 @@
#include "kvm_util.h"
#include "processor.h"
-#define VCPU_ID 0
#define MSR_PLATFORM_INFO_MAX_TURBO_RATIO 0xff00
static void guest_code(void)
@@ -35,71 +34,46 @@ static void guest_code(void)
}
}
-static void set_msr_platform_info_enabled(struct kvm_vm *vm, bool enable)
+static void test_msr_platform_info_enabled(struct kvm_vcpu *vcpu)
{
- struct kvm_enable_cap cap = {};
-
- cap.cap = KVM_CAP_MSR_PLATFORM_INFO;
- cap.flags = 0;
- cap.args[0] = (int)enable;
- vm_enable_cap(vm, &cap);
-}
-
-static void test_msr_platform_info_enabled(struct kvm_vm *vm)
-{
- struct kvm_run *run = vcpu_state(vm, VCPU_ID);
struct ucall uc;
- set_msr_platform_info_enabled(vm, true);
- vcpu_run(vm, VCPU_ID);
- TEST_ASSERT(run->exit_reason == KVM_EXIT_IO,
- "Exit_reason other than KVM_EXIT_IO: %u (%s),\n",
- run->exit_reason,
- exit_reason_str(run->exit_reason));
- get_ucall(vm, VCPU_ID, &uc);
+ vm_enable_cap(vcpu->vm, KVM_CAP_MSR_PLATFORM_INFO, true);
+ vcpu_run(vcpu);
+ TEST_ASSERT_KVM_EXIT_REASON(vcpu, KVM_EXIT_IO);
+
+ get_ucall(vcpu, &uc);
TEST_ASSERT(uc.cmd == UCALL_SYNC,
- "Received ucall other than UCALL_SYNC: %lu\n", uc.cmd);
+ "Received ucall other than UCALL_SYNC: %lu", uc.cmd);
TEST_ASSERT((uc.args[1] & MSR_PLATFORM_INFO_MAX_TURBO_RATIO) ==
MSR_PLATFORM_INFO_MAX_TURBO_RATIO,
"Expected MSR_PLATFORM_INFO to have max turbo ratio mask: %i.",
MSR_PLATFORM_INFO_MAX_TURBO_RATIO);
}
-static void test_msr_platform_info_disabled(struct kvm_vm *vm)
+static void test_msr_platform_info_disabled(struct kvm_vcpu *vcpu)
{
- struct kvm_run *run = vcpu_state(vm, VCPU_ID);
-
- set_msr_platform_info_enabled(vm, false);
- vcpu_run(vm, VCPU_ID);
- TEST_ASSERT(run->exit_reason == KVM_EXIT_SHUTDOWN,
- "Exit_reason other than KVM_EXIT_SHUTDOWN: %u (%s)\n",
- run->exit_reason,
- exit_reason_str(run->exit_reason));
+ vm_enable_cap(vcpu->vm, KVM_CAP_MSR_PLATFORM_INFO, false);
+ vcpu_run(vcpu);
+ TEST_ASSERT_KVM_EXIT_REASON(vcpu, KVM_EXIT_SHUTDOWN);
}
int main(int argc, char *argv[])
{
+ struct kvm_vcpu *vcpu;
struct kvm_vm *vm;
- int rv;
uint64_t msr_platform_info;
- /* Tell stdout not to buffer its content */
- setbuf(stdout, NULL);
-
- rv = kvm_check_cap(KVM_CAP_MSR_PLATFORM_INFO);
- if (!rv) {
- print_skip("KVM_CAP_MSR_PLATFORM_INFO not supported");
- exit(KSFT_SKIP);
- }
+ TEST_REQUIRE(kvm_has_cap(KVM_CAP_MSR_PLATFORM_INFO));
- vm = vm_create_default(VCPU_ID, 0, guest_code);
+ vm = vm_create_with_one_vcpu(&vcpu, guest_code);
- msr_platform_info = vcpu_get_msr(vm, VCPU_ID, MSR_PLATFORM_INFO);
- vcpu_set_msr(vm, VCPU_ID, MSR_PLATFORM_INFO,
- msr_platform_info | MSR_PLATFORM_INFO_MAX_TURBO_RATIO);
- test_msr_platform_info_enabled(vm);
- test_msr_platform_info_disabled(vm);
- vcpu_set_msr(vm, VCPU_ID, MSR_PLATFORM_INFO, msr_platform_info);
+ msr_platform_info = vcpu_get_msr(vcpu, MSR_PLATFORM_INFO);
+ vcpu_set_msr(vcpu, MSR_PLATFORM_INFO,
+ msr_platform_info | MSR_PLATFORM_INFO_MAX_TURBO_RATIO);
+ test_msr_platform_info_enabled(vcpu);
+ test_msr_platform_info_disabled(vcpu);
+ vcpu_set_msr(vcpu, MSR_PLATFORM_INFO, msr_platform_info);
kvm_vm_free(vm);
diff --git a/tools/testing/selftests/kvm/x86_64/pmu_counters_test.c b/tools/testing/selftests/kvm/x86_64/pmu_counters_test.c
new file mode 100644
index 000000000000..29609b52f8fa
--- /dev/null
+++ b/tools/testing/selftests/kvm/x86_64/pmu_counters_test.c
@@ -0,0 +1,620 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Copyright (C) 2023, Tencent, Inc.
+ */
+
+#define _GNU_SOURCE /* for program_invocation_short_name */
+#include <x86intrin.h>
+
+#include "pmu.h"
+#include "processor.h"
+
+/* Number of LOOP instructions for the guest measurement payload. */
+#define NUM_BRANCHES 10
+/*
+ * Number of "extra" instructions that will be counted, i.e. the number of
+ * instructions that are needed to set up the loop and then disabled the
+ * counter. 1 CLFLUSH/CLFLUSHOPT/NOP, 1 MFENCE, 2 MOV, 2 XOR, 1 WRMSR.
+ */
+#define NUM_EXTRA_INSNS 7
+#define NUM_INSNS_RETIRED (NUM_BRANCHES + NUM_EXTRA_INSNS)
+
+static uint8_t kvm_pmu_version;
+static bool kvm_has_perf_caps;
+static bool is_forced_emulation_enabled;
+
+static struct kvm_vm *pmu_vm_create_with_one_vcpu(struct kvm_vcpu **vcpu,
+ void *guest_code,
+ uint8_t pmu_version,
+ uint64_t perf_capabilities)
+{
+ struct kvm_vm *vm;
+
+ vm = vm_create_with_one_vcpu(vcpu, guest_code);
+ vm_init_descriptor_tables(vm);
+ vcpu_init_descriptor_tables(*vcpu);
+
+ sync_global_to_guest(vm, kvm_pmu_version);
+ sync_global_to_guest(vm, is_forced_emulation_enabled);
+
+ /*
+ * Set PERF_CAPABILITIES before PMU version as KVM disallows enabling
+ * features via PERF_CAPABILITIES if the guest doesn't have a vPMU.
+ */
+ if (kvm_has_perf_caps)
+ vcpu_set_msr(*vcpu, MSR_IA32_PERF_CAPABILITIES, perf_capabilities);
+
+ vcpu_set_cpuid_property(*vcpu, X86_PROPERTY_PMU_VERSION, pmu_version);
+ return vm;
+}
+
+static void run_vcpu(struct kvm_vcpu *vcpu)
+{
+ struct ucall uc;
+
+ do {
+ vcpu_run(vcpu);
+ switch (get_ucall(vcpu, &uc)) {
+ case UCALL_SYNC:
+ break;
+ case UCALL_ABORT:
+ REPORT_GUEST_ASSERT(uc);
+ break;
+ case UCALL_PRINTF:
+ pr_info("%s", uc.buffer);
+ break;
+ case UCALL_DONE:
+ break;
+ default:
+ TEST_FAIL("Unexpected ucall: %lu", uc.cmd);
+ }
+ } while (uc.cmd != UCALL_DONE);
+}
+
+static uint8_t guest_get_pmu_version(void)
+{
+ /*
+ * Return the effective PMU version, i.e. the minimum between what KVM
+ * supports and what is enumerated to the guest. The host deliberately
+ * advertises a PMU version to the guest beyond what is actually
+ * supported by KVM to verify KVM doesn't freak out and do something
+ * bizarre with an architecturally valid, but unsupported, version.
+ */
+ return min_t(uint8_t, kvm_pmu_version, this_cpu_property(X86_PROPERTY_PMU_VERSION));
+}
+
+/*
+ * If an architectural event is supported and guaranteed to generate at least
+ * one "hit, assert that its count is non-zero. If an event isn't supported or
+ * the test can't guarantee the associated action will occur, then all bets are
+ * off regarding the count, i.e. no checks can be done.
+ *
+ * Sanity check that in all cases, the event doesn't count when it's disabled,
+ * and that KVM correctly emulates the write of an arbitrary value.
+ */
+static void guest_assert_event_count(uint8_t idx,
+ struct kvm_x86_pmu_feature event,
+ uint32_t pmc, uint32_t pmc_msr)
+{
+ uint64_t count;
+
+ count = _rdpmc(pmc);
+ if (!this_pmu_has(event))
+ goto sanity_checks;
+
+ switch (idx) {
+ case INTEL_ARCH_INSTRUCTIONS_RETIRED_INDEX:
+ GUEST_ASSERT_EQ(count, NUM_INSNS_RETIRED);
+ break;
+ case INTEL_ARCH_BRANCHES_RETIRED_INDEX:
+ GUEST_ASSERT_EQ(count, NUM_BRANCHES);
+ break;
+ case INTEL_ARCH_LLC_REFERENCES_INDEX:
+ case INTEL_ARCH_LLC_MISSES_INDEX:
+ if (!this_cpu_has(X86_FEATURE_CLFLUSHOPT) &&
+ !this_cpu_has(X86_FEATURE_CLFLUSH))
+ break;
+ fallthrough;
+ case INTEL_ARCH_CPU_CYCLES_INDEX:
+ case INTEL_ARCH_REFERENCE_CYCLES_INDEX:
+ GUEST_ASSERT_NE(count, 0);
+ break;
+ case INTEL_ARCH_TOPDOWN_SLOTS_INDEX:
+ GUEST_ASSERT(count >= NUM_INSNS_RETIRED);
+ break;
+ default:
+ break;
+ }
+
+sanity_checks:
+ __asm__ __volatile__("loop ." : "+c"((int){NUM_BRANCHES}));
+ GUEST_ASSERT_EQ(_rdpmc(pmc), count);
+
+ wrmsr(pmc_msr, 0xdead);
+ GUEST_ASSERT_EQ(_rdpmc(pmc), 0xdead);
+}
+
+/*
+ * Enable and disable the PMC in a monolithic asm blob to ensure that the
+ * compiler can't insert _any_ code into the measured sequence. Note, ECX
+ * doesn't need to be clobbered as the input value, @_msr, is restored
+ * before the end of the sequence.
+ *
+ * If CLFLUSH{,OPT} is supported, flush the cacheline containing (at least) the
+ * start of the loop to force LLC references and misses, i.e. to allow testing
+ * that those events actually count.
+ *
+ * If forced emulation is enabled (and specified), force emulation on a subset
+ * of the measured code to verify that KVM correctly emulates instructions and
+ * branches retired events in conjunction with hardware also counting said
+ * events.
+ */
+#define GUEST_MEASURE_EVENT(_msr, _value, clflush, FEP) \
+do { \
+ __asm__ __volatile__("wrmsr\n\t" \
+ clflush "\n\t" \
+ "mfence\n\t" \
+ "1: mov $" __stringify(NUM_BRANCHES) ", %%ecx\n\t" \
+ FEP "loop .\n\t" \
+ FEP "mov %%edi, %%ecx\n\t" \
+ FEP "xor %%eax, %%eax\n\t" \
+ FEP "xor %%edx, %%edx\n\t" \
+ "wrmsr\n\t" \
+ :: "a"((uint32_t)_value), "d"(_value >> 32), \
+ "c"(_msr), "D"(_msr) \
+ ); \
+} while (0)
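+
+/*
+ * Reader's note (derived from the constraints above, not from the changelog):
+ * the first WRMSR consumes ECX=_msr and EDX:EAX=_value to start the event,
+ * LOOP then burns ECX down from NUM_BRANCHES, and the trailing MOV/XOR/WRMSR
+ * restore ECX from EDI (which also holds _msr) and write '0' to stop the
+ * event.  Together with the CLFLUSH{,OPT}/NOP, MFENCE, and the ECX setup MOV,
+ * those instructions make up the NUM_EXTRA_INSNS counted on top of the LOOP's
+ * branches.
+ */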
+
+#define GUEST_TEST_EVENT(_idx, _event, _pmc, _pmc_msr, _ctrl_msr, _value, FEP) \
+do { \
+ wrmsr(_pmc_msr, 0); \
+ \
+ if (this_cpu_has(X86_FEATURE_CLFLUSHOPT)) \
+ GUEST_MEASURE_EVENT(_ctrl_msr, _value, "clflushopt 1f", FEP); \
+ else if (this_cpu_has(X86_FEATURE_CLFLUSH)) \
+ GUEST_MEASURE_EVENT(_ctrl_msr, _value, "clflush 1f", FEP); \
+ else \
+ GUEST_MEASURE_EVENT(_ctrl_msr, _value, "nop", FEP); \
+ \
+ guest_assert_event_count(_idx, _event, _pmc, _pmc_msr); \
+} while (0)
+
+static void __guest_test_arch_event(uint8_t idx, struct kvm_x86_pmu_feature event,
+ uint32_t pmc, uint32_t pmc_msr,
+ uint32_t ctrl_msr, uint64_t ctrl_msr_value)
+{
+ GUEST_TEST_EVENT(idx, event, pmc, pmc_msr, ctrl_msr, ctrl_msr_value, "");
+
+ if (is_forced_emulation_enabled)
+ GUEST_TEST_EVENT(idx, event, pmc, pmc_msr, ctrl_msr, ctrl_msr_value, KVM_FEP);
+}
+
+#define X86_PMU_FEATURE_NULL \
+({ \
+ struct kvm_x86_pmu_feature feature = {}; \
+ \
+ feature; \
+})
+
+static bool pmu_is_null_feature(struct kvm_x86_pmu_feature event)
+{
+ return !(*(u64 *)&event);
+}
+
+static void guest_test_arch_event(uint8_t idx)
+{
+ const struct {
+ struct kvm_x86_pmu_feature gp_event;
+ struct kvm_x86_pmu_feature fixed_event;
+ } intel_event_to_feature[] = {
+ [INTEL_ARCH_CPU_CYCLES_INDEX] = { X86_PMU_FEATURE_CPU_CYCLES, X86_PMU_FEATURE_CPU_CYCLES_FIXED },
+ [INTEL_ARCH_INSTRUCTIONS_RETIRED_INDEX] = { X86_PMU_FEATURE_INSNS_RETIRED, X86_PMU_FEATURE_INSNS_RETIRED_FIXED },
+ /*
+ * Note, the fixed counter for reference cycles is NOT the same
+ * as the general purpose architectural event. The fixed counter
+ * explicitly counts at the same frequency as the TSC, whereas
+ * the GP event counts at a fixed, but uarch specific, frequency.
+ * Bundle them here for simplicity.
+ */
+ [INTEL_ARCH_REFERENCE_CYCLES_INDEX] = { X86_PMU_FEATURE_REFERENCE_CYCLES, X86_PMU_FEATURE_REFERENCE_TSC_CYCLES_FIXED },
+ [INTEL_ARCH_LLC_REFERENCES_INDEX] = { X86_PMU_FEATURE_LLC_REFERENCES, X86_PMU_FEATURE_NULL },
+ [INTEL_ARCH_LLC_MISSES_INDEX] = { X86_PMU_FEATURE_LLC_MISSES, X86_PMU_FEATURE_NULL },
+ [INTEL_ARCH_BRANCHES_RETIRED_INDEX] = { X86_PMU_FEATURE_BRANCH_INSNS_RETIRED, X86_PMU_FEATURE_NULL },
+ [INTEL_ARCH_BRANCHES_MISPREDICTED_INDEX] = { X86_PMU_FEATURE_BRANCHES_MISPREDICTED, X86_PMU_FEATURE_NULL },
+ [INTEL_ARCH_TOPDOWN_SLOTS_INDEX] = { X86_PMU_FEATURE_TOPDOWN_SLOTS, X86_PMU_FEATURE_TOPDOWN_SLOTS_FIXED },
+ };
+
+ uint32_t nr_gp_counters = this_cpu_property(X86_PROPERTY_PMU_NR_GP_COUNTERS);
+ uint32_t pmu_version = guest_get_pmu_version();
+ /* PERF_GLOBAL_CTRL exists only for Architectural PMU Version 2+. */
+ bool guest_has_perf_global_ctrl = pmu_version >= 2;
+ struct kvm_x86_pmu_feature gp_event, fixed_event;
+ uint32_t base_pmc_msr;
+ unsigned int i;
+
+ /* The host side shouldn't invoke this without a guest PMU. */
+ GUEST_ASSERT(pmu_version);
+
+ if (this_cpu_has(X86_FEATURE_PDCM) &&
+ rdmsr(MSR_IA32_PERF_CAPABILITIES) & PMU_CAP_FW_WRITES)
+ base_pmc_msr = MSR_IA32_PMC0;
+ else
+ base_pmc_msr = MSR_IA32_PERFCTR0;
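+ /*
+ * Reader's note: PDCM + PMU_CAP_FW_WRITES is taken to mean full-width
+ * writable counters, i.e. that the MSR_IA32_PMC0 aliases exist; without
+ * it, only the legacy MSR_IA32_PERFCTR0 range is architecturally defined.
+ */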
+
+ gp_event = intel_event_to_feature[idx].gp_event;
+ GUEST_ASSERT_EQ(idx, gp_event.f.bit);
+
+ GUEST_ASSERT(nr_gp_counters);
+
+ for (i = 0; i < nr_gp_counters; i++) {
+ uint64_t eventsel = ARCH_PERFMON_EVENTSEL_OS |
+ ARCH_PERFMON_EVENTSEL_ENABLE |
+ intel_pmu_arch_events[idx];
+
+ wrmsr(MSR_P6_EVNTSEL0 + i, 0);
+ if (guest_has_perf_global_ctrl)
+ wrmsr(MSR_CORE_PERF_GLOBAL_CTRL, BIT_ULL(i));
+
+ __guest_test_arch_event(idx, gp_event, i, base_pmc_msr + i,
+ MSR_P6_EVNTSEL0 + i, eventsel);
+ }
+
+ if (!guest_has_perf_global_ctrl)
+ return;
+
+ fixed_event = intel_event_to_feature[idx].fixed_event;
+ if (pmu_is_null_feature(fixed_event) || !this_pmu_has(fixed_event))
+ return;
+
+ i = fixed_event.f.bit;
+
+ wrmsr(MSR_CORE_PERF_FIXED_CTR_CTRL, FIXED_PMC_CTRL(i, FIXED_PMC_KERNEL));
+
+ __guest_test_arch_event(idx, fixed_event, i | INTEL_RDPMC_FIXED,
+ MSR_CORE_PERF_FIXED_CTR0 + i,
+ MSR_CORE_PERF_GLOBAL_CTRL,
+ FIXED_PMC_GLOBAL_CTRL_ENABLE(i));
+}
+
+static void guest_test_arch_events(void)
+{
+ uint8_t i;
+
+ for (i = 0; i < NR_INTEL_ARCH_EVENTS; i++)
+ guest_test_arch_event(i);
+
+ GUEST_DONE();
+}
+
+static void test_arch_events(uint8_t pmu_version, uint64_t perf_capabilities,
+ uint8_t length, uint8_t unavailable_mask)
+{
+ struct kvm_vcpu *vcpu;
+ struct kvm_vm *vm;
+
+ /* Testing arch events requires a vPMU (there are no negative tests). */
+ if (!pmu_version)
+ return;
+
+ vm = pmu_vm_create_with_one_vcpu(&vcpu, guest_test_arch_events,
+ pmu_version, perf_capabilities);
+
+ vcpu_set_cpuid_property(vcpu, X86_PROPERTY_PMU_EBX_BIT_VECTOR_LENGTH,
+ length);
+ vcpu_set_cpuid_property(vcpu, X86_PROPERTY_PMU_EVENTS_MASK,
+ unavailable_mask);
+
+ run_vcpu(vcpu);
+
+ kvm_vm_free(vm);
+}
+
+/*
+ * Limit testing to MSRs that are actually defined by Intel (in the SDM). MSRs
+ * that aren't defined as counter MSRs *probably* don't exist, but there's no
+ * guarantee that currently undefined MSR indices won't be used for something
+ * other than PMCs in the future.
+ */
+#define MAX_NR_GP_COUNTERS 8
+#define MAX_NR_FIXED_COUNTERS 3
+
+#define GUEST_ASSERT_PMC_MSR_ACCESS(insn, msr, expect_gp, vector) \
+__GUEST_ASSERT(expect_gp ? vector == GP_VECTOR : !vector, \
+ "Expected %s on " #insn "(0x%x), got vector %u", \
+ expect_gp ? "#GP" : "no fault", msr, vector) \
+
+#define GUEST_ASSERT_PMC_VALUE(insn, msr, val, expected) \
+ __GUEST_ASSERT(val == expected, \
+ "Expected " #insn "(0x%x) to yield 0x%lx, got 0x%lx", \
+ msr, expected, val);
+
+static void guest_test_rdpmc(uint32_t rdpmc_idx, bool expect_success,
+ uint64_t expected_val)
+{
+ uint8_t vector;
+ uint64_t val;
+
+ vector = rdpmc_safe(rdpmc_idx, &val);
+ GUEST_ASSERT_PMC_MSR_ACCESS(RDPMC, rdpmc_idx, !expect_success, vector);
+ if (expect_success)
+ GUEST_ASSERT_PMC_VALUE(RDPMC, rdpmc_idx, val, expected_val);
+
+ if (!is_forced_emulation_enabled)
+ return;
+
+ vector = rdpmc_safe_fep(rdpmc_idx, &val);
+ GUEST_ASSERT_PMC_MSR_ACCESS(RDPMC, rdpmc_idx, !expect_success, vector);
+ if (expect_success)
+ GUEST_ASSERT_PMC_VALUE(RDPMC, rdpmc_idx, val, expected_val);
+}
+
+static void guest_rd_wr_counters(uint32_t base_msr, uint8_t nr_possible_counters,
+ uint8_t nr_counters, uint32_t or_mask)
+{
+ const bool pmu_has_fast_mode = !guest_get_pmu_version();
+ uint8_t i;
+
+ for (i = 0; i < nr_possible_counters; i++) {
+ /*
+ * TODO: Test a value that validates full-width writes and the
+ * width of the counters.
+ */
+ const uint64_t test_val = 0xffff;
+ const uint32_t msr = base_msr + i;
+
+ /*
+ * Fixed counters are supported if the counter is less than the
+ * number of enumerated contiguous counters *or* the counter is
+ * explicitly enumerated in the supported counters mask.
+ */
+ const bool expect_success = i < nr_counters || (or_mask & BIT(i));
+
+ /*
+ * KVM drops writes to MSR_P6_PERFCTR[0|1] if the counters are
+ * unsupported, i.e. doesn't #GP and reads back '0'.
+ */
+ const uint64_t expected_val = expect_success ? test_val : 0;
+ const bool expect_gp = !expect_success && msr != MSR_P6_PERFCTR0 &&
+ msr != MSR_P6_PERFCTR1;
+ uint32_t rdpmc_idx;
+ uint8_t vector;
+ uint64_t val;
+
+ vector = wrmsr_safe(msr, test_val);
+ GUEST_ASSERT_PMC_MSR_ACCESS(WRMSR, msr, expect_gp, vector);
+
+ vector = rdmsr_safe(msr, &val);
+ GUEST_ASSERT_PMC_MSR_ACCESS(RDMSR, msr, expect_gp, vector);
+
+ /* On #GP, the result of RDMSR is undefined. */
+ if (!expect_gp)
+ GUEST_ASSERT_PMC_VALUE(RDMSR, msr, val, expected_val);
+
+ /*
+ * Redo the read tests with RDPMC, which has different indexing
+ * semantics and additional capabilities.
+ */
+ rdpmc_idx = i;
+ if (base_msr == MSR_CORE_PERF_FIXED_CTR0)
+ rdpmc_idx |= INTEL_RDPMC_FIXED;
+
+ guest_test_rdpmc(rdpmc_idx, expect_success, expected_val);
+
+ /*
+ * KVM doesn't support non-architectural PMUs, i.e. it should be
+ * impossible to have fast mode RDPMC. Verify that attempting
+ * to use fast RDPMC always #GPs.
+ */
+ GUEST_ASSERT(!expect_success || !pmu_has_fast_mode);
+ rdpmc_idx |= INTEL_RDPMC_FAST;
+ guest_test_rdpmc(rdpmc_idx, false, -1ull);
+
+ vector = wrmsr_safe(msr, 0);
+ GUEST_ASSERT_PMC_MSR_ACCESS(WRMSR, msr, expect_gp, vector);
+ }
+}
+
+static void guest_test_gp_counters(void)
+{
+ uint8_t nr_gp_counters = 0;
+ uint32_t base_msr;
+
+ if (guest_get_pmu_version())
+ nr_gp_counters = this_cpu_property(X86_PROPERTY_PMU_NR_GP_COUNTERS);
+
+ if (this_cpu_has(X86_FEATURE_PDCM) &&
+ rdmsr(MSR_IA32_PERF_CAPABILITIES) & PMU_CAP_FW_WRITES)
+ base_msr = MSR_IA32_PMC0;
+ else
+ base_msr = MSR_IA32_PERFCTR0;
+
+ guest_rd_wr_counters(base_msr, MAX_NR_GP_COUNTERS, nr_gp_counters, 0);
+ GUEST_DONE();
+}
+
+static void test_gp_counters(uint8_t pmu_version, uint64_t perf_capabilities,
+ uint8_t nr_gp_counters)
+{
+ struct kvm_vcpu *vcpu;
+ struct kvm_vm *vm;
+
+ vm = pmu_vm_create_with_one_vcpu(&vcpu, guest_test_gp_counters,
+ pmu_version, perf_capabilities);
+
+ vcpu_set_cpuid_property(vcpu, X86_PROPERTY_PMU_NR_GP_COUNTERS,
+ nr_gp_counters);
+
+ run_vcpu(vcpu);
+
+ kvm_vm_free(vm);
+}
+
+static void guest_test_fixed_counters(void)
+{
+ uint64_t supported_bitmask = 0;
+ uint8_t nr_fixed_counters = 0;
+ uint8_t i;
+
+ /* Fixed counters require Architectural vPMU Version 2+. */
+ if (guest_get_pmu_version() >= 2)
+ nr_fixed_counters = this_cpu_property(X86_PROPERTY_PMU_NR_FIXED_COUNTERS);
+
+ /*
+ * The supported bitmask for fixed counters was introduced in PMU
+ * version 5.
+ */
+ if (guest_get_pmu_version() >= 5)
+ supported_bitmask = this_cpu_property(X86_PROPERTY_PMU_FIXED_COUNTERS_BITMASK);
+
+ guest_rd_wr_counters(MSR_CORE_PERF_FIXED_CTR0, MAX_NR_FIXED_COUNTERS,
+ nr_fixed_counters, supported_bitmask);
+
+ for (i = 0; i < MAX_NR_FIXED_COUNTERS; i++) {
+ uint8_t vector;
+ uint64_t val;
+
+ if (i >= nr_fixed_counters && !(supported_bitmask & BIT_ULL(i))) {
+ vector = wrmsr_safe(MSR_CORE_PERF_FIXED_CTR_CTRL,
+ FIXED_PMC_CTRL(i, FIXED_PMC_KERNEL));
+ __GUEST_ASSERT(vector == GP_VECTOR,
+ "Expected #GP for counter %u in FIXED_CTR_CTRL", i);
+
+ vector = wrmsr_safe(MSR_CORE_PERF_GLOBAL_CTRL,
+ FIXED_PMC_GLOBAL_CTRL_ENABLE(i));
+ __GUEST_ASSERT(vector == GP_VECTOR,
+ "Expected #GP for counter %u in PERF_GLOBAL_CTRL", i);
+ continue;
+ }
+
+ wrmsr(MSR_CORE_PERF_FIXED_CTR0 + i, 0);
+ wrmsr(MSR_CORE_PERF_FIXED_CTR_CTRL, FIXED_PMC_CTRL(i, FIXED_PMC_KERNEL));
+ wrmsr(MSR_CORE_PERF_GLOBAL_CTRL, FIXED_PMC_GLOBAL_CTRL_ENABLE(i));
+ __asm__ __volatile__("loop ." : "+c"((int){NUM_BRANCHES}));
+ wrmsr(MSR_CORE_PERF_GLOBAL_CTRL, 0);
+ val = rdmsr(MSR_CORE_PERF_FIXED_CTR0 + i);
+
+ GUEST_ASSERT_NE(val, 0);
+ }
+ GUEST_DONE();
+}
+
+static void test_fixed_counters(uint8_t pmu_version, uint64_t perf_capabilities,
+ uint8_t nr_fixed_counters,
+ uint32_t supported_bitmask)
+{
+ struct kvm_vcpu *vcpu;
+ struct kvm_vm *vm;
+
+ vm = pmu_vm_create_with_one_vcpu(&vcpu, guest_test_fixed_counters,
+ pmu_version, perf_capabilities);
+
+ vcpu_set_cpuid_property(vcpu, X86_PROPERTY_PMU_FIXED_COUNTERS_BITMASK,
+ supported_bitmask);
+ vcpu_set_cpuid_property(vcpu, X86_PROPERTY_PMU_NR_FIXED_COUNTERS,
+ nr_fixed_counters);
+
+ run_vcpu(vcpu);
+
+ kvm_vm_free(vm);
+}
+
+static void test_intel_counters(void)
+{
+ uint8_t nr_arch_events = kvm_cpu_property(X86_PROPERTY_PMU_EBX_BIT_VECTOR_LENGTH);
+ uint8_t nr_fixed_counters = kvm_cpu_property(X86_PROPERTY_PMU_NR_FIXED_COUNTERS);
+ uint8_t nr_gp_counters = kvm_cpu_property(X86_PROPERTY_PMU_NR_GP_COUNTERS);
+ uint8_t pmu_version = kvm_cpu_property(X86_PROPERTY_PMU_VERSION);
+ unsigned int i;
+ uint8_t v, j;
+ uint32_t k;
+
+ const uint64_t perf_caps[] = {
+ 0,
+ PMU_CAP_FW_WRITES,
+ };
+
+ /*
+ * Test up to PMU v5, which is the current maximum version defined by
+ * Intel, i.e. the last version that is guaranteed to be backwards
+ * compatible with KVM's existing behavior.
+ */
+ uint8_t max_pmu_version = max_t(typeof(pmu_version), pmu_version, 5);
+
+ /*
+ * Detect the existence of events that aren't supported by selftests.
+ * This will (obviously) fail any time the kernel adds support for a
+ * new event, but it's worth paying that price to keep the test fresh.
+ */
+ TEST_ASSERT(nr_arch_events <= NR_INTEL_ARCH_EVENTS,
+ "New architectural event(s) detected; please update this test (length = %u, mask = %x)",
+ nr_arch_events, kvm_cpu_property(X86_PROPERTY_PMU_EVENTS_MASK));
+
+ /*
+ * Force iterating over known arch events regardless of whether or not
+ * KVM/hardware supports a given event.
+ */
+ nr_arch_events = max_t(typeof(nr_arch_events), nr_arch_events, NR_INTEL_ARCH_EVENTS);
+
+ for (v = 0; v <= max_pmu_version; v++) {
+ for (i = 0; i < ARRAY_SIZE(perf_caps); i++) {
+ if (!kvm_has_perf_caps && perf_caps[i])
+ continue;
+
+ pr_info("Testing arch events, PMU version %u, perf_caps = %lx\n",
+ v, perf_caps[i]);
+ /*
+ * To keep the total runtime reasonable, test every
+ * possible non-zero, non-reserved bitmap combination
+ * only with the native PMU version and the full bit
+ * vector length.
+ */
+ if (v == pmu_version) {
+ for (k = 1; k < (BIT(nr_arch_events) - 1); k++)
+ test_arch_events(v, perf_caps[i], nr_arch_events, k);
+ }
+ /*
+ * Test single bits for all PMU versions and lengths up to
+ * the number of events + 1 (to verify KVM doesn't do
+ * weird things if the guest length is greater than the
+ * host length). Explicitly test a mask of '0' and all
+ * ones, i.e. all events being available and unavailable.
+ */
+ for (j = 0; j <= nr_arch_events + 1; j++) {
+ test_arch_events(v, perf_caps[i], j, 0);
+ test_arch_events(v, perf_caps[i], j, 0xff);
+
+ for (k = 0; k < nr_arch_events; k++)
+ test_arch_events(v, perf_caps[i], j, BIT(k));
+ }
+
+ pr_info("Testing GP counters, PMU version %u, perf_caps = %lx\n",
+ v, perf_caps[i]);
+ for (j = 0; j <= nr_gp_counters; j++)
+ test_gp_counters(v, perf_caps[i], j);
+
+ pr_info("Testing fixed counters, PMU version %u, perf_caps = %lx\n",
+ v, perf_caps[i]);
+ for (j = 0; j <= nr_fixed_counters; j++) {
+ for (k = 0; k <= (BIT(nr_fixed_counters) - 1); k++)
+ test_fixed_counters(v, perf_caps[i], j, k);
+ }
+ }
+ }
+}
+
+int main(int argc, char *argv[])
+{
+ TEST_REQUIRE(kvm_is_pmu_enabled());
+
+ TEST_REQUIRE(host_cpu_is_intel);
+ TEST_REQUIRE(kvm_cpu_has_p(X86_PROPERTY_PMU_VERSION));
+ TEST_REQUIRE(kvm_cpu_property(X86_PROPERTY_PMU_VERSION) > 0);
+
+ kvm_pmu_version = kvm_cpu_property(X86_PROPERTY_PMU_VERSION);
+ kvm_has_perf_caps = kvm_cpu_has(X86_FEATURE_PDCM);
+ is_forced_emulation_enabled = kvm_is_forced_emulation_enabled();
+
+ test_intel_counters();
+
+ return 0;
+}
diff --git a/tools/testing/selftests/kvm/x86_64/pmu_event_filter_test.c b/tools/testing/selftests/kvm/x86_64/pmu_event_filter_test.c
new file mode 100644
index 000000000000..3c85d1ae9893
--- /dev/null
+++ b/tools/testing/selftests/kvm/x86_64/pmu_event_filter_test.c
@@ -0,0 +1,910 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Test for x86 KVM_SET_PMU_EVENT_FILTER.
+ *
+ * Copyright (C) 2022, Google LLC.
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2.
+ *
+ * Verifies the expected behavior of allow lists and deny lists for
+ * virtual PMU events.
+ */
+
+#define _GNU_SOURCE /* for program_invocation_short_name */
+
+#include "kvm_util.h"
+#include "pmu.h"
+#include "processor.h"
+#include "test_util.h"
+
+#define NUM_BRANCHES 42
+#define MAX_TEST_EVENTS 10
+
+#define PMU_EVENT_FILTER_INVALID_ACTION (KVM_PMU_EVENT_DENY + 1)
+#define PMU_EVENT_FILTER_INVALID_FLAGS (KVM_PMU_EVENT_FLAGS_VALID_MASK << 1)
+#define PMU_EVENT_FILTER_INVALID_NEVENTS (KVM_PMU_EVENT_FILTER_MAX_EVENTS + 1)
+
+struct __kvm_pmu_event_filter {
+ __u32 action;
+ __u32 nevents;
+ __u32 fixed_counter_bitmap;
+ __u32 flags;
+ __u32 pad[4];
+ __u64 events[KVM_PMU_EVENT_FILTER_MAX_EVENTS];
+};
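+
+/*
+ * Reader's note: this mirrors the UAPI struct kvm_pmu_event_filter, but with
+ * a fixed-size events[] array so that filters can be built on the stack and
+ * tweaked per testcase; it is cast to the UAPI type when actually passed to
+ * KVM_SET_PMU_EVENT_FILTER (see test_with_filter()).
+ */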
+
+/*
+ * This event list comprises Intel's known architectural events, plus AMD's
+ * "retired branch instructions" for Zen1-Zen3 (and* possibly other AMD CPUs).
+ * Note, AMD and Intel use the same encoding for instructions retired.
+ */
+kvm_static_assert(INTEL_ARCH_INSTRUCTIONS_RETIRED == AMD_ZEN_INSTRUCTIONS_RETIRED);
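+/*
+ * (The shared encoding is event select 0xc0, unit mask 0x0: Intel's
+ * architectural "Instructions Retired" and AMD's PMCx0C0 "Retired
+ * Instructions", hence the assert above.)
+ */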
+
+static const struct __kvm_pmu_event_filter base_event_filter = {
+ .nevents = ARRAY_SIZE(base_event_filter.events),
+ .events = {
+ INTEL_ARCH_CPU_CYCLES,
+ INTEL_ARCH_INSTRUCTIONS_RETIRED,
+ INTEL_ARCH_REFERENCE_CYCLES,
+ INTEL_ARCH_LLC_REFERENCES,
+ INTEL_ARCH_LLC_MISSES,
+ INTEL_ARCH_BRANCHES_RETIRED,
+ INTEL_ARCH_BRANCHES_MISPREDICTED,
+ INTEL_ARCH_TOPDOWN_SLOTS,
+ AMD_ZEN_BRANCHES_RETIRED,
+ },
+};
+
+struct {
+ uint64_t loads;
+ uint64_t stores;
+ uint64_t loads_stores;
+ uint64_t branches_retired;
+ uint64_t instructions_retired;
+} pmc_results;
+
+/*
+ * If we encounter a #GP during the guest PMU sanity check, then the guest
+ * PMU is not functional. Inform the hypervisor via GUEST_SYNC(-EFAULT).
+ */
+static void guest_gp_handler(struct ex_regs *regs)
+{
+ GUEST_SYNC(-EFAULT);
+}
+
+/*
+ * Check that we can write a new value to the given MSR and read it back.
+ * The caller should provide a non-empty set of bits that are safe to flip.
+ *
+ * Return on success. GUEST_SYNC(-EIO) on error.
+ */
+static void check_msr(uint32_t msr, uint64_t bits_to_flip)
+{
+ uint64_t v = rdmsr(msr) ^ bits_to_flip;
+
+ wrmsr(msr, v);
+ if (rdmsr(msr) != v)
+ GUEST_SYNC(-EIO);
+
+ v ^= bits_to_flip;
+ wrmsr(msr, v);
+ if (rdmsr(msr) != v)
+ GUEST_SYNC(-EIO);
+}
+
+static void run_and_measure_loop(uint32_t msr_base)
+{
+ const uint64_t branches_retired = rdmsr(msr_base + 0);
+ const uint64_t insn_retired = rdmsr(msr_base + 1);
+
+ __asm__ __volatile__("loop ." : "+c"((int){NUM_BRANCHES}));
+
+ pmc_results.branches_retired = rdmsr(msr_base + 0) - branches_retired;
+ pmc_results.instructions_retired = rdmsr(msr_base + 1) - insn_retired;
+}
+
+static void intel_guest_code(void)
+{
+ check_msr(MSR_CORE_PERF_GLOBAL_CTRL, 1);
+ check_msr(MSR_P6_EVNTSEL0, 0xffff);
+ check_msr(MSR_IA32_PMC0, 0xffff);
+ GUEST_SYNC(0);
+
+ for (;;) {
+ wrmsr(MSR_CORE_PERF_GLOBAL_CTRL, 0);
+ wrmsr(MSR_P6_EVNTSEL0, ARCH_PERFMON_EVENTSEL_ENABLE |
+ ARCH_PERFMON_EVENTSEL_OS | INTEL_ARCH_BRANCHES_RETIRED);
+ wrmsr(MSR_P6_EVNTSEL1, ARCH_PERFMON_EVENTSEL_ENABLE |
+ ARCH_PERFMON_EVENTSEL_OS | INTEL_ARCH_INSTRUCTIONS_RETIRED);
+ wrmsr(MSR_CORE_PERF_GLOBAL_CTRL, 0x3);
+
+ run_and_measure_loop(MSR_IA32_PMC0);
+ GUEST_SYNC(0);
+ }
+}
+
+/*
+ * To avoid needing a check for CPUID.80000001:ECX.PerfCtrExtCore[bit 23],
+ * this code uses the always-available, legacy K7 PMU MSRs, which alias to
+ * the first four of the six extended core PMU MSRs.
+ */
+static void amd_guest_code(void)
+{
+ check_msr(MSR_K7_EVNTSEL0, 0xffff);
+ check_msr(MSR_K7_PERFCTR0, 0xffff);
+ GUEST_SYNC(0);
+
+ for (;;) {
+ wrmsr(MSR_K7_EVNTSEL0, 0);
+ wrmsr(MSR_K7_EVNTSEL0, ARCH_PERFMON_EVENTSEL_ENABLE |
+ ARCH_PERFMON_EVENTSEL_OS | AMD_ZEN_BRANCHES_RETIRED);
+ wrmsr(MSR_K7_EVNTSEL1, ARCH_PERFMON_EVENTSEL_ENABLE |
+ ARCH_PERFMON_EVENTSEL_OS | AMD_ZEN_INSTRUCTIONS_RETIRED);
+
+ run_and_measure_loop(MSR_K7_PERFCTR0);
+ GUEST_SYNC(0);
+ }
+}
+
+/*
+ * Run the VM to the next GUEST_SYNC(value), and return the value passed
+ * to the sync. Any other exit from the guest is fatal.
+ */
+static uint64_t run_vcpu_to_sync(struct kvm_vcpu *vcpu)
+{
+ struct ucall uc;
+
+ vcpu_run(vcpu);
+ TEST_ASSERT_KVM_EXIT_REASON(vcpu, KVM_EXIT_IO);
+ get_ucall(vcpu, &uc);
+ TEST_ASSERT(uc.cmd == UCALL_SYNC,
+ "Received ucall other than UCALL_SYNC: %lu", uc.cmd);
+ return uc.args[1];
+}
+
+static void run_vcpu_and_sync_pmc_results(struct kvm_vcpu *vcpu)
+{
+ uint64_t r;
+
+ memset(&pmc_results, 0, sizeof(pmc_results));
+ sync_global_to_guest(vcpu->vm, pmc_results);
+
+ r = run_vcpu_to_sync(vcpu);
+ TEST_ASSERT(!r, "Unexpected sync value: 0x%lx", r);
+
+ sync_global_from_guest(vcpu->vm, pmc_results);
+}
+
+/*
+ * In a nested environment or if the vPMU is disabled, the guest PMU
+ * might not work as architected (accessing the PMU MSRs may raise
+ * #GP, or writes could simply be discarded). In those situations,
+ * there is no point in running these tests. The guest code will perform
+ * a sanity check and then GUEST_SYNC(success). In the case of failure,
+ * the behavior of the guest on resumption is undefined.
+ */
+static bool sanity_check_pmu(struct kvm_vcpu *vcpu)
+{
+ uint64_t r;
+
+ vm_install_exception_handler(vcpu->vm, GP_VECTOR, guest_gp_handler);
+ r = run_vcpu_to_sync(vcpu);
+ vm_install_exception_handler(vcpu->vm, GP_VECTOR, NULL);
+
+ return !r;
+}
+
+/*
+ * Remove the first occurrence of 'event' (if any) from the filter's
+ * event list.
+ */
+static void remove_event(struct __kvm_pmu_event_filter *f, uint64_t event)
+{
+ bool found = false;
+ int i;
+
+ for (i = 0; i < f->nevents; i++) {
+ if (found)
+ f->events[i - 1] = f->events[i];
+ else
+ found = f->events[i] == event;
+ }
+ if (found)
+ f->nevents--;
+}
+
+#define ASSERT_PMC_COUNTING_INSTRUCTIONS() \
+do { \
+ uint64_t br = pmc_results.branches_retired; \
+ uint64_t ir = pmc_results.instructions_retired; \
+ \
+ if (br && br != NUM_BRANCHES) \
+ pr_info("%s: Branch instructions retired = %lu (expected %u)\n", \
+ __func__, br, NUM_BRANCHES); \
+ TEST_ASSERT(br, "%s: Branch instructions retired = %lu (expected > 0)", \
+ __func__, br); \
+ TEST_ASSERT(ir, "%s: Instructions retired = %lu (expected > 0)", \
+ __func__, ir); \
+} while (0)
+
+#define ASSERT_PMC_NOT_COUNTING_INSTRUCTIONS() \
+do { \
+ uint64_t br = pmc_results.branches_retired; \
+ uint64_t ir = pmc_results.instructions_retired; \
+ \
+ TEST_ASSERT(!br, "%s: Branch instructions retired = %lu (expected 0)", \
+ __func__, br); \
+ TEST_ASSERT(!ir, "%s: Instructions retired = %lu (expected 0)", \
+ __func__, ir); \
+} while (0)
+
+static void test_without_filter(struct kvm_vcpu *vcpu)
+{
+ run_vcpu_and_sync_pmc_results(vcpu);
+
+ ASSERT_PMC_COUNTING_INSTRUCTIONS();
+}
+
+static void test_with_filter(struct kvm_vcpu *vcpu,
+ struct __kvm_pmu_event_filter *__f)
+{
+ struct kvm_pmu_event_filter *f = (void *)__f;
+
+ vm_ioctl(vcpu->vm, KVM_SET_PMU_EVENT_FILTER, f);
+ run_vcpu_and_sync_pmc_results(vcpu);
+}
+
+static void test_amd_deny_list(struct kvm_vcpu *vcpu)
+{
+ struct __kvm_pmu_event_filter f = {
+ .action = KVM_PMU_EVENT_DENY,
+ .nevents = 1,
+ .events = {
+ RAW_EVENT(0x1C2, 0),
+ },
+ };
+
+ test_with_filter(vcpu, &f);
+
+ ASSERT_PMC_COUNTING_INSTRUCTIONS();
+}
+
+static void test_member_deny_list(struct kvm_vcpu *vcpu)
+{
+ struct __kvm_pmu_event_filter f = base_event_filter;
+
+ f.action = KVM_PMU_EVENT_DENY;
+ test_with_filter(vcpu, &f);
+
+ ASSERT_PMC_NOT_COUNTING_INSTRUCTIONS();
+}
+
+static void test_member_allow_list(struct kvm_vcpu *vcpu)
+{
+ struct __kvm_pmu_event_filter f = base_event_filter;
+
+ f.action = KVM_PMU_EVENT_ALLOW;
+ test_with_filter(vcpu, &f);
+
+ ASSERT_PMC_COUNTING_INSTRUCTIONS();
+}
+
+static void test_not_member_deny_list(struct kvm_vcpu *vcpu)
+{
+ struct __kvm_pmu_event_filter f = base_event_filter;
+
+ f.action = KVM_PMU_EVENT_DENY;
+
+ remove_event(&f, INTEL_ARCH_INSTRUCTIONS_RETIRED);
+ remove_event(&f, INTEL_ARCH_BRANCHES_RETIRED);
+ remove_event(&f, AMD_ZEN_BRANCHES_RETIRED);
+ test_with_filter(vcpu, &f);
+
+ ASSERT_PMC_COUNTING_INSTRUCTIONS();
+}
+
+static void test_not_member_allow_list(struct kvm_vcpu *vcpu)
+{
+ struct __kvm_pmu_event_filter f = base_event_filter;
+
+ f.action = KVM_PMU_EVENT_ALLOW;
+
+ remove_event(&f, INTEL_ARCH_INSTRUCTIONS_RETIRED);
+ remove_event(&f, INTEL_ARCH_BRANCHES_RETIRED);
+ remove_event(&f, AMD_ZEN_BRANCHES_RETIRED);
+ test_with_filter(vcpu, &f);
+
+ ASSERT_PMC_NOT_COUNTING_INSTRUCTIONS();
+}
+
+/*
+ * Verify that setting KVM_PMU_CAP_DISABLE prevents the use of the PMU.
+ *
+ * Note that KVM_CAP_PMU_CAPABILITY must be enabled prior to creating VCPUs.
+ */
+static void test_pmu_config_disable(void (*guest_code)(void))
+{
+ struct kvm_vcpu *vcpu;
+ int r;
+ struct kvm_vm *vm;
+
+ r = kvm_check_cap(KVM_CAP_PMU_CAPABILITY);
+ if (!(r & KVM_PMU_CAP_DISABLE))
+ return;
+
+ vm = vm_create(1);
+
+ vm_enable_cap(vm, KVM_CAP_PMU_CAPABILITY, KVM_PMU_CAP_DISABLE);
+
+ vcpu = vm_vcpu_add(vm, 0, guest_code);
+ vm_init_descriptor_tables(vm);
+ vcpu_init_descriptor_tables(vcpu);
+
+ TEST_ASSERT(!sanity_check_pmu(vcpu),
+ "Guest should not be able to use disabled PMU.");
+
+ kvm_vm_free(vm);
+}
+
+/*
+ * On Intel, check for a non-zero PMU version, at least one general-purpose
+ * counter per logical processor, and support for counting the number of branch
+ * instructions retired.
+ */
+static bool use_intel_pmu(void)
+{
+ return host_cpu_is_intel &&
+ kvm_cpu_property(X86_PROPERTY_PMU_VERSION) &&
+ kvm_cpu_property(X86_PROPERTY_PMU_NR_GP_COUNTERS) &&
+ kvm_pmu_has(X86_PMU_FEATURE_BRANCH_INSNS_RETIRED);
+}
+
+static bool is_zen1(uint32_t family, uint32_t model)
+{
+ return family == 0x17 && model <= 0x0f;
+}
+
+static bool is_zen2(uint32_t family, uint32_t model)
+{
+ return family == 0x17 && model >= 0x30 && model <= 0x3f;
+}
+
+static bool is_zen3(uint32_t family, uint32_t model)
+{
+ return family == 0x19 && model <= 0x0f;
+}
+
+/*
+ * Determining AMD support for a PMU event requires consulting the AMD
+ * PPR for the CPU or reference material derived therefrom. The AMD
+ * test code herein has been verified to work on Zen1, Zen2, and Zen3.
+ *
+ * Feel free to add more AMD CPUs that are documented to support event
+ * select 0xc2 umask 0 as "retired branch instructions."
+ */
+static bool use_amd_pmu(void)
+{
+ uint32_t family = kvm_cpu_family();
+ uint32_t model = kvm_cpu_model();
+
+ return host_cpu_is_amd &&
+ (is_zen1(family, model) ||
+ is_zen2(family, model) ||
+ is_zen3(family, model));
+}
+
+/*
+ * "MEM_INST_RETIRED.ALL_LOADS", "MEM_INST_RETIRED.ALL_STORES", and
+ * "MEM_INST_RETIRED.ANY" from https://perfmon-events.intel.com/
+ * supported on Intel Xeon processors:
+ * - Sapphire Rapids, Ice Lake, Cascade Lake, Skylake.
+ */
+#define MEM_INST_RETIRED 0xD0
+#define MEM_INST_RETIRED_LOAD RAW_EVENT(MEM_INST_RETIRED, 0x81)
+#define MEM_INST_RETIRED_STORE RAW_EVENT(MEM_INST_RETIRED, 0x82)
+#define MEM_INST_RETIRED_LOAD_STORE RAW_EVENT(MEM_INST_RETIRED, 0x83)
+
+static bool supports_event_mem_inst_retired(void)
+{
+ uint32_t eax, ebx, ecx, edx;
+
+ cpuid(1, &eax, &ebx, &ecx, &edx);
+ if (x86_family(eax) == 0x6) {
+ switch (x86_model(eax)) {
+ /* Sapphire Rapids */
+ case 0x8F:
+ /* Ice Lake */
+ case 0x6A:
+ /* Skylake */
+ /* Cascade Lake */
+ case 0x55:
+ return true;
+ }
+ }
+
+ return false;
+}
+
+/*
+ * "LS Dispatch", from Processor Programming Reference
+ * (PPR) for AMD Family 17h Model 01h, Revision B1 Processors,
+ * Preliminary Processor Programming Reference (PPR) for AMD Family
+ * 17h Model 31h, Revision B0 Processors, and Preliminary Processor
+ * Programming Reference (PPR) for AMD Family 19h Model 01h, Revision
+ * B1 Processors Volume 1 of 2.
+ */
+#define LS_DISPATCH 0x29
+#define LS_DISPATCH_LOAD RAW_EVENT(LS_DISPATCH, BIT(0))
+#define LS_DISPATCH_STORE RAW_EVENT(LS_DISPATCH, BIT(1))
+#define LS_DISPATCH_LOAD_STORE RAW_EVENT(LS_DISPATCH, BIT(2))
+
+#define INCLUDE_MASKED_ENTRY(event_select, mask, match) \
+ KVM_PMU_ENCODE_MASKED_ENTRY(event_select, mask, match, false)
+#define EXCLUDE_MASKED_ENTRY(event_select, mask, match) \
+ KVM_PMU_ENCODE_MASKED_ENTRY(event_select, mask, match, true)
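+
+/*
+ * Reader's note on masked-event semantics (as documented for
+ * KVM_SET_PMU_EVENT_FILTER): a programmed event matches an entry when its
+ * event select matches and (unit_mask & mask) == match; "exclude" entries
+ * veto events that would otherwise match an include entry.  E.g. for the
+ * test cases below, INCLUDE_MASKED_ENTRY(MEM_INST_RETIRED, 0xFF, 0x81)
+ * matches only unit mask 0x81 (loads), whereas a mask of 0x7C with match 0
+ * matches 0x81, 0x82 and 0x83, i.e. loads, stores, and loads + stores.
+ */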
+
+static void masked_events_guest_test(uint32_t msr_base)
+{
+ /*
+ * The actual values of the counters don't determine the outcome of
+ * the test, only whether they are zero or non-zero.
+ */
+ const uint64_t loads = rdmsr(msr_base + 0);
+ const uint64_t stores = rdmsr(msr_base + 1);
+ const uint64_t loads_stores = rdmsr(msr_base + 2);
+ int val;
+
+ __asm__ __volatile__("movl $0, %[v];"
+ "movl %[v], %%eax;"
+ "incl %[v];"
+ : [v]"+m"(val) :: "eax");
+
+ pmc_results.loads = rdmsr(msr_base + 0) - loads;
+ pmc_results.stores = rdmsr(msr_base + 1) - stores;
+ pmc_results.loads_stores = rdmsr(msr_base + 2) - loads_stores;
+}
+
+static void intel_masked_events_guest_code(void)
+{
+ for (;;) {
+ wrmsr(MSR_CORE_PERF_GLOBAL_CTRL, 0);
+
+ wrmsr(MSR_P6_EVNTSEL0 + 0, ARCH_PERFMON_EVENTSEL_ENABLE |
+ ARCH_PERFMON_EVENTSEL_OS | MEM_INST_RETIRED_LOAD);
+ wrmsr(MSR_P6_EVNTSEL0 + 1, ARCH_PERFMON_EVENTSEL_ENABLE |
+ ARCH_PERFMON_EVENTSEL_OS | MEM_INST_RETIRED_STORE);
+ wrmsr(MSR_P6_EVNTSEL0 + 2, ARCH_PERFMON_EVENTSEL_ENABLE |
+ ARCH_PERFMON_EVENTSEL_OS | MEM_INST_RETIRED_LOAD_STORE);
+
+ wrmsr(MSR_CORE_PERF_GLOBAL_CTRL, 0x7);
+
+ masked_events_guest_test(MSR_IA32_PMC0);
+ GUEST_SYNC(0);
+ }
+}
+
+static void amd_masked_events_guest_code(void)
+{
+ for (;;) {
+ wrmsr(MSR_K7_EVNTSEL0, 0);
+ wrmsr(MSR_K7_EVNTSEL1, 0);
+ wrmsr(MSR_K7_EVNTSEL2, 0);
+
+ wrmsr(MSR_K7_EVNTSEL0, ARCH_PERFMON_EVENTSEL_ENABLE |
+ ARCH_PERFMON_EVENTSEL_OS | LS_DISPATCH_LOAD);
+ wrmsr(MSR_K7_EVNTSEL1, ARCH_PERFMON_EVENTSEL_ENABLE |
+ ARCH_PERFMON_EVENTSEL_OS | LS_DISPATCH_STORE);
+ wrmsr(MSR_K7_EVNTSEL2, ARCH_PERFMON_EVENTSEL_ENABLE |
+ ARCH_PERFMON_EVENTSEL_OS | LS_DISPATCH_LOAD_STORE);
+
+ masked_events_guest_test(MSR_K7_PERFCTR0);
+ GUEST_SYNC(0);
+ }
+}
+
+static void run_masked_events_test(struct kvm_vcpu *vcpu,
+ const uint64_t masked_events[],
+ const int nmasked_events)
+{
+ struct __kvm_pmu_event_filter f = {
+ .nevents = nmasked_events,
+ .action = KVM_PMU_EVENT_ALLOW,
+ .flags = KVM_PMU_EVENT_FLAG_MASKED_EVENTS,
+ };
+
+ memcpy(f.events, masked_events, sizeof(uint64_t) * nmasked_events);
+ test_with_filter(vcpu, &f);
+}
+
+#define ALLOW_LOADS BIT(0)
+#define ALLOW_STORES BIT(1)
+#define ALLOW_LOADS_STORES BIT(2)
+
+struct masked_events_test {
+ uint64_t intel_events[MAX_TEST_EVENTS];
+ uint64_t intel_event_end;
+ uint64_t amd_events[MAX_TEST_EVENTS];
+ uint64_t amd_event_end;
+ const char *msg;
+ uint32_t flags;
+};
+
+/*
+ * These are the test cases for the masked events tests.
+ *
+ * For each test, the guest enables 3 PMU counters (loads, stores,
+ * loads + stores). The filter is then set in KVM with the masked events
+ * provided. The test then verifies that the counters agree with which
+ * ones should be counting and which ones should be filtered.
+ */
+const struct masked_events_test test_cases[] = {
+ {
+ .intel_events = {
+ INCLUDE_MASKED_ENTRY(MEM_INST_RETIRED, 0xFF, 0x81),
+ },
+ .amd_events = {
+ INCLUDE_MASKED_ENTRY(LS_DISPATCH, 0xFF, BIT(0)),
+ },
+ .msg = "Only allow loads.",
+ .flags = ALLOW_LOADS,
+ }, {
+ .intel_events = {
+ INCLUDE_MASKED_ENTRY(MEM_INST_RETIRED, 0xFF, 0x82),
+ },
+ .amd_events = {
+ INCLUDE_MASKED_ENTRY(LS_DISPATCH, 0xFF, BIT(1)),
+ },
+ .msg = "Only allow stores.",
+ .flags = ALLOW_STORES,
+ }, {
+ .intel_events = {
+ INCLUDE_MASKED_ENTRY(MEM_INST_RETIRED, 0xFF, 0x83),
+ },
+ .amd_events = {
+ INCLUDE_MASKED_ENTRY(LS_DISPATCH, 0xFF, BIT(2)),
+ },
+ .msg = "Only allow loads + stores.",
+ .flags = ALLOW_LOADS_STORES,
+ }, {
+ .intel_events = {
+ INCLUDE_MASKED_ENTRY(MEM_INST_RETIRED, 0x7C, 0),
+ EXCLUDE_MASKED_ENTRY(MEM_INST_RETIRED, 0xFF, 0x83),
+ },
+ .amd_events = {
+ INCLUDE_MASKED_ENTRY(LS_DISPATCH, ~(BIT(0) | BIT(1)), 0),
+ },
+ .msg = "Only allow loads and stores.",
+ .flags = ALLOW_LOADS | ALLOW_STORES,
+ }, {
+ .intel_events = {
+ INCLUDE_MASKED_ENTRY(MEM_INST_RETIRED, 0x7C, 0),
+ EXCLUDE_MASKED_ENTRY(MEM_INST_RETIRED, 0xFF, 0x82),
+ },
+ .amd_events = {
+ INCLUDE_MASKED_ENTRY(LS_DISPATCH, 0xF8, 0),
+ EXCLUDE_MASKED_ENTRY(LS_DISPATCH, 0xFF, BIT(1)),
+ },
+ .msg = "Only allow loads and loads + stores.",
+ .flags = ALLOW_LOADS | ALLOW_LOADS_STORES
+ }, {
+ .intel_events = {
+ INCLUDE_MASKED_ENTRY(MEM_INST_RETIRED, 0xFE, 0x82),
+ },
+ .amd_events = {
+ INCLUDE_MASKED_ENTRY(LS_DISPATCH, 0xF8, 0),
+ EXCLUDE_MASKED_ENTRY(LS_DISPATCH, 0xFF, BIT(0)),
+ },
+ .msg = "Only allow stores and loads + stores.",
+ .flags = ALLOW_STORES | ALLOW_LOADS_STORES
+ }, {
+ .intel_events = {
+ INCLUDE_MASKED_ENTRY(MEM_INST_RETIRED, 0x7C, 0),
+ },
+ .amd_events = {
+ INCLUDE_MASKED_ENTRY(LS_DISPATCH, 0xF8, 0),
+ },
+ .msg = "Only allow loads, stores, and loads + stores.",
+ .flags = ALLOW_LOADS | ALLOW_STORES | ALLOW_LOADS_STORES
+ },
+};
+
+static int append_test_events(const struct masked_events_test *test,
+ uint64_t *events, int nevents)
+{
+ const uint64_t *evts;
+ int i;
+
+ evts = use_intel_pmu() ? test->intel_events : test->amd_events;
+ for (i = 0; i < MAX_TEST_EVENTS; i++) {
+ if (evts[i] == 0)
+ break;
+
+ events[nevents + i] = evts[i];
+ }
+
+ return nevents + i;
+}
+
+static bool bool_eq(bool a, bool b)
+{
+ return a == b;
+}
+
+static void run_masked_events_tests(struct kvm_vcpu *vcpu, uint64_t *events,
+ int nevents)
+{
+ int ntests = ARRAY_SIZE(test_cases);
+ int i, n;
+
+ for (i = 0; i < ntests; i++) {
+ const struct masked_events_test *test = &test_cases[i];
+
+ /* Do any test case events overflow MAX_TEST_EVENTS? */
+ assert(test->intel_event_end == 0);
+ assert(test->amd_event_end == 0);
+
+ n = append_test_events(test, events, nevents);
+
+ run_masked_events_test(vcpu, events, n);
+
+ TEST_ASSERT(bool_eq(pmc_results.loads, test->flags & ALLOW_LOADS) &&
+ bool_eq(pmc_results.stores, test->flags & ALLOW_STORES) &&
+ bool_eq(pmc_results.loads_stores,
+ test->flags & ALLOW_LOADS_STORES),
+ "%s loads: %lu, stores: %lu, loads + stores: %lu",
+ test->msg, pmc_results.loads, pmc_results.stores,
+ pmc_results.loads_stores);
+ }
+}
+
+static void add_dummy_events(uint64_t *events, int nevents)
+{
+ int i;
+
+ for (i = 0; i < nevents; i++) {
+ int event_select = i % 0xFF;
+ bool exclude = ((i % 4) == 0);
+
+ if (event_select == MEM_INST_RETIRED ||
+ event_select == LS_DISPATCH)
+ event_select++;
+
+ events[i] = KVM_PMU_ENCODE_MASKED_ENTRY(event_select, 0,
+ 0, exclude);
+ }
+}
+
+static void test_masked_events(struct kvm_vcpu *vcpu)
+{
+ int nevents = KVM_PMU_EVENT_FILTER_MAX_EVENTS - MAX_TEST_EVENTS;
+ uint64_t events[KVM_PMU_EVENT_FILTER_MAX_EVENTS];
+
+ /* Run the test cases against a sparse PMU event filter. */
+ run_masked_events_tests(vcpu, events, 0);
+
+ /* Run the test cases against a dense PMU event filter. */
+ add_dummy_events(events, KVM_PMU_EVENT_FILTER_MAX_EVENTS);
+ run_masked_events_tests(vcpu, events, nevents);
+}
+
+static int set_pmu_event_filter(struct kvm_vcpu *vcpu,
+ struct __kvm_pmu_event_filter *__f)
+{
+ struct kvm_pmu_event_filter *f = (void *)__f;
+
+ return __vm_ioctl(vcpu->vm, KVM_SET_PMU_EVENT_FILTER, f);
+}
+
+static int set_pmu_single_event_filter(struct kvm_vcpu *vcpu, uint64_t event,
+ uint32_t flags, uint32_t action)
+{
+ struct __kvm_pmu_event_filter f = {
+ .nevents = 1,
+ .flags = flags,
+ .action = action,
+ .events = {
+ event,
+ },
+ };
+
+ return set_pmu_event_filter(vcpu, &f);
+}
+
+static void test_filter_ioctl(struct kvm_vcpu *vcpu)
+{
+ uint8_t nr_fixed_counters = kvm_cpu_property(X86_PROPERTY_PMU_NR_FIXED_COUNTERS);
+ struct __kvm_pmu_event_filter f;
+ uint64_t e = ~0ul;
+ int r;
+
+ /*
+ * Unfortunately, a filter event with invalid bits set in its data
+ * (bits other than eventsel+umask) is expected to be accepted when
+ * flags == 0.
+ */
+ r = set_pmu_single_event_filter(vcpu, e, 0, KVM_PMU_EVENT_ALLOW);
+ TEST_ASSERT(r == 0, "Valid PMU Event Filter is failing");
+
+ r = set_pmu_single_event_filter(vcpu, e,
+ KVM_PMU_EVENT_FLAG_MASKED_EVENTS,
+ KVM_PMU_EVENT_ALLOW);
+ TEST_ASSERT(r != 0, "Invalid PMU Event Filter is expected to fail");
+
+ e = KVM_PMU_ENCODE_MASKED_ENTRY(0xff, 0xff, 0xff, 0xf);
+ r = set_pmu_single_event_filter(vcpu, e,
+ KVM_PMU_EVENT_FLAG_MASKED_EVENTS,
+ KVM_PMU_EVENT_ALLOW);
+ TEST_ASSERT(r == 0, "Valid PMU Event Filter is failing");
+
+ f = base_event_filter;
+ f.action = PMU_EVENT_FILTER_INVALID_ACTION;
+ r = set_pmu_event_filter(vcpu, &f);
+ TEST_ASSERT(r, "Set invalid action is expected to fail");
+
+ f = base_event_filter;
+ f.flags = PMU_EVENT_FILTER_INVALID_FLAGS;
+ r = set_pmu_event_filter(vcpu, &f);
+ TEST_ASSERT(r, "Set invalid flags is expected to fail");
+
+ f = base_event_filter;
+ f.nevents = PMU_EVENT_FILTER_INVALID_NEVENTS;
+ r = set_pmu_event_filter(vcpu, &f);
+ TEST_ASSERT(r, "Exceeding the max number of filter events should fail");
+
+ f = base_event_filter;
+ f.fixed_counter_bitmap = ~GENMASK_ULL(nr_fixed_counters, 0);
+ r = set_pmu_event_filter(vcpu, &f);
+ TEST_ASSERT(!r, "Masking non-existent fixed counters should be allowed");
+}
+
+static void intel_run_fixed_counter_guest_code(uint8_t idx)
+{
+ for (;;) {
+ wrmsr(MSR_CORE_PERF_GLOBAL_CTRL, 0);
+ wrmsr(MSR_CORE_PERF_FIXED_CTR0 + idx, 0);
+
+ /* Only OS_EN bit is enabled for fixed counter[idx]. */
+ wrmsr(MSR_CORE_PERF_FIXED_CTR_CTRL, FIXED_PMC_CTRL(idx, FIXED_PMC_KERNEL));
+ wrmsr(MSR_CORE_PERF_GLOBAL_CTRL, FIXED_PMC_GLOBAL_CTRL_ENABLE(idx));
+ __asm__ __volatile__("loop ." : "+c"((int){NUM_BRANCHES}));
+ wrmsr(MSR_CORE_PERF_GLOBAL_CTRL, 0);
+
+ GUEST_SYNC(rdmsr(MSR_CORE_PERF_FIXED_CTR0 + idx));
+ }
+}
+
+static uint64_t test_with_fixed_counter_filter(struct kvm_vcpu *vcpu,
+ uint32_t action, uint32_t bitmap)
+{
+ struct __kvm_pmu_event_filter f = {
+ .action = action,
+ .fixed_counter_bitmap = bitmap,
+ };
+ set_pmu_event_filter(vcpu, &f);
+
+ return run_vcpu_to_sync(vcpu);
+}
+
+static uint64_t test_set_gp_and_fixed_event_filter(struct kvm_vcpu *vcpu,
+ uint32_t action,
+ uint32_t bitmap)
+{
+ struct __kvm_pmu_event_filter f = base_event_filter;
+
+ f.action = action;
+ f.fixed_counter_bitmap = bitmap;
+ set_pmu_event_filter(vcpu, &f);
+
+ return run_vcpu_to_sync(vcpu);
+}
+
+static void __test_fixed_counter_bitmap(struct kvm_vcpu *vcpu, uint8_t idx,
+ uint8_t nr_fixed_counters)
+{
+ unsigned int i;
+ uint32_t bitmap;
+ uint64_t count;
+
+ TEST_ASSERT(nr_fixed_counters < sizeof(bitmap) * 8,
+ "Invalid nr_fixed_counters");
+
+ /*
+ * Check that the fixed performance counter can count normally when
+ * KVM userspace doesn't set any PMU event filter.
+ */
+ count = run_vcpu_to_sync(vcpu);
+ TEST_ASSERT(count, "Unexpected count value: %ld", count);
+
+ for (i = 0; i < BIT(nr_fixed_counters); i++) {
+ bitmap = BIT(i);
+ count = test_with_fixed_counter_filter(vcpu, KVM_PMU_EVENT_ALLOW,
+ bitmap);
+ TEST_ASSERT_EQ(!!count, !!(bitmap & BIT(idx)));
+
+ count = test_with_fixed_counter_filter(vcpu, KVM_PMU_EVENT_DENY,
+ bitmap);
+ TEST_ASSERT_EQ(!!count, !(bitmap & BIT(idx)));
+
+ /*
+ * Check that fixed_counter_bitmap has higher priority than
+ * events[] when both are set.
+ */
+ count = test_set_gp_and_fixed_event_filter(vcpu,
+ KVM_PMU_EVENT_ALLOW,
+ bitmap);
+ TEST_ASSERT_EQ(!!count, !!(bitmap & BIT(idx)));
+
+ count = test_set_gp_and_fixed_event_filter(vcpu,
+ KVM_PMU_EVENT_DENY,
+ bitmap);
+ TEST_ASSERT_EQ(!!count, !(bitmap & BIT(idx)));
+ }
+}
+
+static void test_fixed_counter_bitmap(void)
+{
+ uint8_t nr_fixed_counters = kvm_cpu_property(X86_PROPERTY_PMU_NR_FIXED_COUNTERS);
+ struct kvm_vm *vm;
+ struct kvm_vcpu *vcpu;
+ uint8_t idx;
+
+ /*
+ * Check that pmu_event_filter works as expected when it's applied to
+ * fixed performance counters.
+ */
+ for (idx = 0; idx < nr_fixed_counters; idx++) {
+ vm = vm_create_with_one_vcpu(&vcpu,
+ intel_run_fixed_counter_guest_code);
+ vcpu_args_set(vcpu, 1, idx);
+ __test_fixed_counter_bitmap(vcpu, idx, nr_fixed_counters);
+ kvm_vm_free(vm);
+ }
+}
+
+int main(int argc, char *argv[])
+{
+ void (*guest_code)(void);
+ struct kvm_vcpu *vcpu, *vcpu2 = NULL;
+ struct kvm_vm *vm;
+
+ TEST_REQUIRE(kvm_is_pmu_enabled());
+ TEST_REQUIRE(kvm_has_cap(KVM_CAP_PMU_EVENT_FILTER));
+ TEST_REQUIRE(kvm_has_cap(KVM_CAP_PMU_EVENT_MASKED_EVENTS));
+
+ TEST_REQUIRE(use_intel_pmu() || use_amd_pmu());
+ guest_code = use_intel_pmu() ? intel_guest_code : amd_guest_code;
+
+ vm = vm_create_with_one_vcpu(&vcpu, guest_code);
+
+ vm_init_descriptor_tables(vm);
+ vcpu_init_descriptor_tables(vcpu);
+
+ TEST_REQUIRE(sanity_check_pmu(vcpu));
+
+ if (use_amd_pmu())
+ test_amd_deny_list(vcpu);
+
+ test_without_filter(vcpu);
+ test_member_deny_list(vcpu);
+ test_member_allow_list(vcpu);
+ test_not_member_deny_list(vcpu);
+ test_not_member_allow_list(vcpu);
+
+ if (use_intel_pmu() &&
+ supports_event_mem_inst_retired() &&
+ kvm_cpu_property(X86_PROPERTY_PMU_NR_GP_COUNTERS) >= 3)
+ vcpu2 = vm_vcpu_add(vm, 2, intel_masked_events_guest_code);
+ else if (use_amd_pmu())
+ vcpu2 = vm_vcpu_add(vm, 2, amd_masked_events_guest_code);
+
+ if (vcpu2)
+ test_masked_events(vcpu2);
+ test_filter_ioctl(vcpu);
+
+ kvm_vm_free(vm);
+
+ test_pmu_config_disable(guest_code);
+ test_fixed_counter_bitmap();
+
+ return 0;
+}
diff --git a/tools/testing/selftests/kvm/x86_64/private_mem_conversions_test.c b/tools/testing/selftests/kvm/x86_64/private_mem_conversions_test.c
new file mode 100644
index 000000000000..e0f642d2a3c4
--- /dev/null
+++ b/tools/testing/selftests/kvm/x86_64/private_mem_conversions_test.c
@@ -0,0 +1,484 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Copyright (C) 2022, Google LLC.
+ */
+#define _GNU_SOURCE /* for program_invocation_short_name */
+#include <fcntl.h>
+#include <limits.h>
+#include <pthread.h>
+#include <sched.h>
+#include <signal.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <sys/ioctl.h>
+
+#include <linux/compiler.h>
+#include <linux/kernel.h>
+#include <linux/kvm_para.h>
+#include <linux/memfd.h>
+#include <linux/sizes.h>
+
+#include <test_util.h>
+#include <kvm_util.h>
+#include <processor.h>
+
+#define BASE_DATA_SLOT 10
+#define BASE_DATA_GPA ((uint64_t)(1ull << 32))
+#define PER_CPU_DATA_SIZE ((uint64_t)(SZ_2M + PAGE_SIZE))
+
+/* Horrific macro so that the line info is captured accurately :-( */
+#define memcmp_g(gpa, pattern, size) \
+do { \
+ uint8_t *mem = (uint8_t *)gpa; \
+ size_t i; \
+ \
+ for (i = 0; i < size; i++) \
+ __GUEST_ASSERT(mem[i] == pattern, \
+ "Guest expected 0x%x at offset %lu (gpa 0x%lx), got 0x%x", \
+ pattern, i, gpa + i, mem[i]); \
+} while (0)
+
+static void memcmp_h(uint8_t *mem, uint64_t gpa, uint8_t pattern, size_t size)
+{
+ size_t i;
+
+ for (i = 0; i < size; i++)
+ TEST_ASSERT(mem[i] == pattern,
+ "Host expected 0x%x at gpa 0x%lx, got 0x%x",
+ pattern, gpa + i, mem[i]);
+}
+
+/*
+ * Run memory conversion tests with explicit conversion:
+ * Execute the KVM hypercall to map/unmap a gpa range, which causes an exit to
+ * userspace so that it can back/unback the private memory. Subsequent guest
+ * accesses to the gpa range will not exit to userspace.
+ *
+ * Test memory conversion scenarios with the following steps:
+ * 1) Access private memory using private access and verify that memory contents
+ * are not visible to userspace.
+ * 2) Convert memory to shared using explicit conversions and ensure that
+ * userspace is able to access the shared regions.
+ * 3) Convert memory back to private using explicit conversions and ensure that
+ * userspace is again not able to access converted private regions.
+ */
+
+#define GUEST_STAGE(o, s) { .offset = o, .size = s }
+
+enum ucall_syncs {
+ SYNC_SHARED,
+ SYNC_PRIVATE,
+};
+
+static void guest_sync_shared(uint64_t gpa, uint64_t size,
+ uint8_t current_pattern, uint8_t new_pattern)
+{
+ GUEST_SYNC5(SYNC_SHARED, gpa, size, current_pattern, new_pattern);
+}
+
+static void guest_sync_private(uint64_t gpa, uint64_t size, uint8_t pattern)
+{
+ GUEST_SYNC4(SYNC_PRIVATE, gpa, size, pattern);
+}
+
+/* Arbitrary values, KVM doesn't care about the attribute flags. */
+#define MAP_GPA_SET_ATTRIBUTES BIT(0)
+#define MAP_GPA_SHARED BIT(1)
+#define MAP_GPA_DO_FALLOCATE BIT(2)
+
+static void guest_map_mem(uint64_t gpa, uint64_t size, bool map_shared,
+ bool do_fallocate)
+{
+ uint64_t flags = MAP_GPA_SET_ATTRIBUTES;
+
+ if (map_shared)
+ flags |= MAP_GPA_SHARED;
+ if (do_fallocate)
+ flags |= MAP_GPA_DO_FALLOCATE;
+ kvm_hypercall_map_gpa_range(gpa, size, flags);
+}
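+
+/*
+ * Reader's note: the MAP_GPA_* flags above are private to this test; KVM
+ * simply forwards the hypercall to userspace (via KVM_CAP_EXIT_HYPERCALL),
+ * and it's handle_exit_hypercall() on the host side that translates them
+ * into fallocate() on the guest_memfd and/or memory attribute updates.
+ */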
+
+static void guest_map_shared(uint64_t gpa, uint64_t size, bool do_fallocate)
+{
+ guest_map_mem(gpa, size, true, do_fallocate);
+}
+
+static void guest_map_private(uint64_t gpa, uint64_t size, bool do_fallocate)
+{
+ guest_map_mem(gpa, size, false, do_fallocate);
+}
+
+struct {
+ uint64_t offset;
+ uint64_t size;
+} static const test_ranges[] = {
+ GUEST_STAGE(0, PAGE_SIZE),
+ GUEST_STAGE(0, SZ_2M),
+ GUEST_STAGE(PAGE_SIZE, PAGE_SIZE),
+ GUEST_STAGE(PAGE_SIZE, SZ_2M),
+ GUEST_STAGE(SZ_2M, PAGE_SIZE),
+};
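+
+/*
+ * Reader's note: the ranges mix 4KiB and 2MiB sizes at offsets 0, PAGE_SIZE
+ * and SZ_2M within the SZ_2M + PAGE_SIZE per-vCPU region, i.e. they cover
+ * both partial and whole hugepage-sized chunks at aligned and unaligned
+ * offsets.
+ */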
+
+static void guest_test_explicit_conversion(uint64_t base_gpa, bool do_fallocate)
+{
+ const uint8_t def_p = 0xaa;
+ const uint8_t init_p = 0xcc;
+ uint64_t j;
+ int i;
+
+ /* Memory should be shared by default. */
+ memset((void *)base_gpa, def_p, PER_CPU_DATA_SIZE);
+ memcmp_g(base_gpa, def_p, PER_CPU_DATA_SIZE);
+ guest_sync_shared(base_gpa, PER_CPU_DATA_SIZE, def_p, init_p);
+
+ memcmp_g(base_gpa, init_p, PER_CPU_DATA_SIZE);
+
+ for (i = 0; i < ARRAY_SIZE(test_ranges); i++) {
+ uint64_t gpa = base_gpa + test_ranges[i].offset;
+ uint64_t size = test_ranges[i].size;
+ uint8_t p1 = 0x11;
+ uint8_t p2 = 0x22;
+ uint8_t p3 = 0x33;
+ uint8_t p4 = 0x44;
+
+ /*
+ * Set the test region to pattern one to differentiate it from
+ * the data range as a whole (which holds the initial pattern).
+ */
+ memset((void *)gpa, p1, size);
+
+ /*
+ * Convert to private, set and verify the private data, and
+ * then verify that the rest of the data (still mapped shared)
+ * holds the initial pattern, and that the host always sees the
+ * shared memory (initial pattern). Unlike shared memory,
+ * punching a hole in private memory is destructive, i.e.
+ * previous values aren't guaranteed to be preserved.
+ */
+ guest_map_private(gpa, size, do_fallocate);
+
+ if (size > PAGE_SIZE) {
+ memset((void *)gpa, p2, PAGE_SIZE);
+ goto skip;
+ }
+
+ memset((void *)gpa, p2, size);
+ guest_sync_private(gpa, size, p1);
+
+ /*
+ * Verify that the private memory was set to pattern two, and
+ * that shared memory still holds the initial pattern.
+ */
+ memcmp_g(gpa, p2, size);
+ if (gpa > base_gpa)
+ memcmp_g(base_gpa, init_p, gpa - base_gpa);
+ if (gpa + size < base_gpa + PER_CPU_DATA_SIZE)
+ memcmp_g(gpa + size, init_p,
+ (base_gpa + PER_CPU_DATA_SIZE) - (gpa + size));
+
+ /*
+ * Convert odd-number page frames back to shared to verify KVM
+ * also correctly handles holes in private ranges.
+ */
+ for (j = 0; j < size; j += PAGE_SIZE) {
+ if ((j >> PAGE_SHIFT) & 1) {
+ guest_map_shared(gpa + j, PAGE_SIZE, do_fallocate);
+ guest_sync_shared(gpa + j, PAGE_SIZE, p1, p3);
+
+ memcmp_g(gpa + j, p3, PAGE_SIZE);
+ } else {
+ guest_sync_private(gpa + j, PAGE_SIZE, p1);
+ }
+ }
+
+skip:
+ /*
+ * Convert the entire region back to shared, explicitly write
+ * pattern three to fill in the even-number frames before
+ * asking the host to verify (and write pattern four).
+ */
+ guest_map_shared(gpa, size, do_fallocate);
+ memset((void *)gpa, p3, size);
+ guest_sync_shared(gpa, size, p3, p4);
+ memcmp_g(gpa, p4, size);
+
+ /* Reset the shared memory back to the initial pattern. */
+ memset((void *)gpa, init_p, size);
+
+ /*
+ * Free (via PUNCH_HOLE) *all* private memory so that the next
+ * iteration starts from a clean slate, e.g. with respect to
+ * whether or not there are pages/folios in guest_mem.
+ */
+ guest_map_shared(base_gpa, PER_CPU_DATA_SIZE, true);
+ }
+}
+
+static void guest_punch_hole(uint64_t gpa, uint64_t size)
+{
+ /* "Mapping" memory shared via fallocate() is done via PUNCH_HOLE. */
+ uint64_t flags = MAP_GPA_SHARED | MAP_GPA_DO_FALLOCATE;
+
+ kvm_hypercall_map_gpa_range(gpa, size, flags);
+}
+
+/*
+ * Test that PUNCH_HOLE actually frees memory by punching holes without doing a
+ * proper conversion. Freeing (PUNCH_HOLE) should zap SPTEs, and reallocating
+ * (subsequent fault) should zero memory.
+ */
+static void guest_test_punch_hole(uint64_t base_gpa, bool precise)
+{
+ const uint8_t init_p = 0xcc;
+ int i;
+
+ /*
+ * Convert the entire range to private, this testcase is all about
+ * punching holes in guest_memfd, i.e. shared mappings aren't needed.
+ */
+ guest_map_private(base_gpa, PER_CPU_DATA_SIZE, false);
+
+ for (i = 0; i < ARRAY_SIZE(test_ranges); i++) {
+ uint64_t gpa = base_gpa + test_ranges[i].offset;
+ uint64_t size = test_ranges[i].size;
+
+ /*
+ * Free all memory before each iteration, even for the !precise
+ * case where the memory will be faulted back in. Freeing and
+ * reallocating should obviously work, and freeing all memory
+ * minimizes the probability of cross-testcase influence.
+ */
+ guest_punch_hole(base_gpa, PER_CPU_DATA_SIZE);
+
+ /* Fault-in and initialize memory, and verify the pattern. */
+ if (precise) {
+ memset((void *)gpa, init_p, size);
+ memcmp_g(gpa, init_p, size);
+ } else {
+ memset((void *)base_gpa, init_p, PER_CPU_DATA_SIZE);
+ memcmp_g(base_gpa, init_p, PER_CPU_DATA_SIZE);
+ }
+
+ /*
+ * Punch a hole at the target range and verify that reads from
+ * the guest succeed and return zeroes.
+ */
+ guest_punch_hole(gpa, size);
+ memcmp_g(gpa, 0, size);
+ }
+}
+
+static void guest_code(uint64_t base_gpa)
+{
+ /*
+ * Run the conversion test twice, with and without doing fallocate() on
+ * the guest_memfd backing when converting between shared and private.
+ */
+ guest_test_explicit_conversion(base_gpa, false);
+ guest_test_explicit_conversion(base_gpa, true);
+
+ /*
+ * Run the PUNCH_HOLE test twice too, once with the entire guest_memfd
+ * faulted in, once with only the target range faulted in.
+ */
+ guest_test_punch_hole(base_gpa, false);
+ guest_test_punch_hole(base_gpa, true);
+ GUEST_DONE();
+}
+
+static void handle_exit_hypercall(struct kvm_vcpu *vcpu)
+{
+ struct kvm_run *run = vcpu->run;
+ uint64_t gpa = run->hypercall.args[0];
+ uint64_t size = run->hypercall.args[1] * PAGE_SIZE;
+ bool set_attributes = run->hypercall.args[2] & MAP_GPA_SET_ATTRIBUTES;
+ bool map_shared = run->hypercall.args[2] & MAP_GPA_SHARED;
+ bool do_fallocate = run->hypercall.args[2] & MAP_GPA_DO_FALLOCATE;
+ struct kvm_vm *vm = vcpu->vm;
+
+ TEST_ASSERT(run->hypercall.nr == KVM_HC_MAP_GPA_RANGE,
+ "Wanted MAP_GPA_RANGE (%u), got '%llu'",
+ KVM_HC_MAP_GPA_RANGE, run->hypercall.nr);
+
+ if (do_fallocate)
+ vm_guest_mem_fallocate(vm, gpa, size, map_shared);
+
+ if (set_attributes)
+ vm_set_memory_attributes(vm, gpa, size,
+ map_shared ? 0 : KVM_MEMORY_ATTRIBUTE_PRIVATE);
+ run->hypercall.ret = 0;
+}
+
+static bool run_vcpus;
+
+static void *__test_mem_conversions(void *__vcpu)
+{
+ struct kvm_vcpu *vcpu = __vcpu;
+ struct kvm_run *run = vcpu->run;
+ struct kvm_vm *vm = vcpu->vm;
+ struct ucall uc;
+
+ while (!READ_ONCE(run_vcpus))
+ ;
+
+ for ( ;; ) {
+ vcpu_run(vcpu);
+
+ if (run->exit_reason == KVM_EXIT_HYPERCALL) {
+ handle_exit_hypercall(vcpu);
+ continue;
+ }
+
+ TEST_ASSERT(run->exit_reason == KVM_EXIT_IO,
+ "Wanted KVM_EXIT_IO, got exit reason: %u (%s)",
+ run->exit_reason, exit_reason_str(run->exit_reason));
+
+ switch (get_ucall(vcpu, &uc)) {
+ case UCALL_ABORT:
+ REPORT_GUEST_ASSERT(uc);
+ case UCALL_SYNC: {
+ uint64_t gpa = uc.args[1];
+ size_t size = uc.args[2];
+ size_t i;
+
+ TEST_ASSERT(uc.args[0] == SYNC_SHARED ||
+ uc.args[0] == SYNC_PRIVATE,
+ "Unknown sync command '%ld'", uc.args[0]);
+
+ for (i = 0; i < size; i += vm->page_size) {
+ size_t nr_bytes = min_t(size_t, vm->page_size, size - i);
+ uint8_t *hva = addr_gpa2hva(vm, gpa + i);
+
+ /* In all cases, the host should observe the shared data. */
+ memcmp_h(hva, gpa + i, uc.args[3], nr_bytes);
+
+ /* For shared, write the new pattern to guest memory. */
+ if (uc.args[0] == SYNC_SHARED)
+ memset(hva, uc.args[4], nr_bytes);
+ }
+ break;
+ }
+ case UCALL_DONE:
+ return NULL;
+ default:
+ TEST_FAIL("Unknown ucall 0x%lx.", uc.cmd);
+ }
+ }
+}
+
+static void test_mem_conversions(enum vm_mem_backing_src_type src_type, uint32_t nr_vcpus,
+ uint32_t nr_memslots)
+{
+ /*
+ * Allocate enough memory so that each vCPU's chunk of memory can be
+ * naturally aligned with respect to the size of the backing store.
+ */
+ const size_t alignment = max_t(size_t, SZ_2M, get_backing_src_pagesz(src_type));
+ const size_t per_cpu_size = align_up(PER_CPU_DATA_SIZE, alignment);
+ const size_t memfd_size = per_cpu_size * nr_vcpus;
+ const size_t slot_size = memfd_size / nr_memslots;
+ struct kvm_vcpu *vcpus[KVM_MAX_VCPUS];
+ pthread_t threads[KVM_MAX_VCPUS];
+ struct kvm_vm *vm;
+ int memfd, i, r;
+
+ const struct vm_shape shape = {
+ .mode = VM_MODE_DEFAULT,
+ .type = KVM_X86_SW_PROTECTED_VM,
+ };
+
+ TEST_ASSERT(slot_size * nr_memslots == memfd_size,
+ "The memfd size (0x%lx) needs to be cleanly divisible by the number of memslots (%u)",
+ memfd_size, nr_memslots);
+ vm = __vm_create_with_vcpus(shape, nr_vcpus, 0, guest_code, vcpus);
+
+ vm_enable_cap(vm, KVM_CAP_EXIT_HYPERCALL, (1 << KVM_HC_MAP_GPA_RANGE));
+
+ memfd = vm_create_guest_memfd(vm, memfd_size, 0);
+
+ for (i = 0; i < nr_memslots; i++)
+ vm_mem_add(vm, src_type, BASE_DATA_GPA + slot_size * i,
+ BASE_DATA_SLOT + i, slot_size / vm->page_size,
+ KVM_MEM_GUEST_MEMFD, memfd, slot_size * i);
+
+ for (i = 0; i < nr_vcpus; i++) {
+ uint64_t gpa = BASE_DATA_GPA + i * per_cpu_size;
+
+ vcpu_args_set(vcpus[i], 1, gpa);
+
+ /*
+ * Map only what is needed so that an out-of-bounds access
+ * results in a #PF => SHUTDOWN instead of data corruption.
+ */
+ virt_map(vm, gpa, gpa, PER_CPU_DATA_SIZE / vm->page_size);
+
+ pthread_create(&threads[i], NULL, __test_mem_conversions, vcpus[i]);
+ }
+
+ WRITE_ONCE(run_vcpus, true);
+
+ for (i = 0; i < nr_vcpus; i++)
+ pthread_join(threads[i], NULL);
+
+ kvm_vm_free(vm);
+
+ /*
+ * Allocate and free memory from the guest_memfd after closing the VM
+ * fd. The guest_memfd is gifted a reference to its owning VM, i.e.
+ * should prevent the VM from being fully destroyed until the last
+ * reference to the guest_memfd is also put.
+ */
+ r = fallocate(memfd, FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE, 0, memfd_size);
+ TEST_ASSERT(!r, __KVM_SYSCALL_ERROR("fallocate()", r));
+
+ r = fallocate(memfd, FALLOC_FL_KEEP_SIZE, 0, memfd_size);
+ TEST_ASSERT(!r, __KVM_SYSCALL_ERROR("fallocate()", r));
+
+ close(memfd);
+}
+
+static void usage(const char *cmd)
+{
+ puts("");
+ printf("usage: %s [-h] [-m nr_memslots] [-s mem_type] [-n nr_vcpus]\n", cmd);
+ puts("");
+ backing_src_help("-s");
+ puts("");
+ puts(" -n: specify the number of vcpus (default: 1)");
+ puts("");
+ puts(" -m: specify the number of memslots (default: 1)");
+ puts("");
+}
+
+int main(int argc, char *argv[])
+{
+ enum vm_mem_backing_src_type src_type = DEFAULT_VM_MEM_SRC;
+ uint32_t nr_memslots = 1;
+ uint32_t nr_vcpus = 1;
+ int opt;
+
+ TEST_REQUIRE(kvm_check_cap(KVM_CAP_VM_TYPES) & BIT(KVM_X86_SW_PROTECTED_VM));
+
+ while ((opt = getopt(argc, argv, "hm:s:n:")) != -1) {
+ switch (opt) {
+ case 's':
+ src_type = parse_backing_src_type(optarg);
+ break;
+ case 'n':
+ nr_vcpus = atoi_positive("nr_vcpus", optarg);
+ break;
+ case 'm':
+ nr_memslots = atoi_positive("nr_memslots", optarg);
+ break;
+ case 'h':
+ default:
+ usage(argv[0]);
+ exit(0);
+ }
+ }
+
+ test_mem_conversions(src_type, nr_vcpus, nr_memslots);
+
+ return 0;
+}
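
For reference, the MAP_GPA_RANGE plumbing exercised above reduces to two plain ioctls on the VM fd once the selftest helpers are stripped away. A minimal sketch against the raw UAPI (requires kernel headers with guest_memfd/memory-attributes support; the vm_fd parameter and helper names are illustrative, not part of the test):

    #include <stdint.h>
    #include <sys/ioctl.h>
    #include <linux/kvm.h>
    #include <linux/kvm_para.h>	/* KVM_HC_MAP_GPA_RANGE */

    /* Forward MAP_GPA_RANGE hypercalls to userspace as KVM_EXIT_HYPERCALL. */
    static int enable_map_gpa_exits(int vm_fd)
    {
            struct kvm_enable_cap cap = {
                    .cap = KVM_CAP_EXIT_HYPERCALL,
                    .args[0] = 1ULL << KVM_HC_MAP_GPA_RANGE,
            };

            return ioctl(vm_fd, KVM_ENABLE_CAP, &cap);
    }

    /* Flip a GPA range between shared and private, as vm_set_memory_attributes() does. */
    static int set_range_private(int vm_fd, uint64_t gpa, uint64_t size, int private)
    {
            struct kvm_memory_attributes attr = {
                    .address = gpa,
                    .size = size,
                    .attributes = private ? KVM_MEMORY_ATTRIBUTE_PRIVATE : 0,
            };

            return ioctl(vm_fd, KVM_SET_MEMORY_ATTRIBUTES, &attr);
    }

Both ioctls return 0 on success, which is all the selftest's wrappers check for.
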
diff --git a/tools/testing/selftests/kvm/x86_64/private_mem_kvm_exits_test.c b/tools/testing/selftests/kvm/x86_64/private_mem_kvm_exits_test.c
new file mode 100644
index 000000000000..13e72fcec8dd
--- /dev/null
+++ b/tools/testing/selftests/kvm/x86_64/private_mem_kvm_exits_test.c
@@ -0,0 +1,120 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Copyright (C) 2023, Google LLC.
+ */
+#include <linux/kvm.h>
+#include <pthread.h>
+#include <stdint.h>
+
+#include "kvm_util.h"
+#include "processor.h"
+#include "test_util.h"
+
+/* Arbitrarily selected to avoid overlaps with anything else */
+#define EXITS_TEST_GVA 0xc0000000
+#define EXITS_TEST_GPA EXITS_TEST_GVA
+#define EXITS_TEST_NPAGES 1
+#define EXITS_TEST_SIZE (EXITS_TEST_NPAGES * PAGE_SIZE)
+#define EXITS_TEST_SLOT 10
+
+static uint64_t guest_repeatedly_read(void)
+{
+ volatile uint64_t value;
+
+ while (true)
+ value = *((uint64_t *) EXITS_TEST_GVA);
+
+ return value;
+}
+
+static uint32_t run_vcpu_get_exit_reason(struct kvm_vcpu *vcpu)
+{
+ int r;
+
+ r = _vcpu_run(vcpu);
+ if (r) {
+ TEST_ASSERT(errno == EFAULT, KVM_IOCTL_ERROR(KVM_RUN, r));
+ TEST_ASSERT_EQ(vcpu->run->exit_reason, KVM_EXIT_MEMORY_FAULT);
+ }
+ return vcpu->run->exit_reason;
+}
+
+const struct vm_shape protected_vm_shape = {
+ .mode = VM_MODE_DEFAULT,
+ .type = KVM_X86_SW_PROTECTED_VM,
+};
+
+static void test_private_access_memslot_deleted(void)
+{
+ struct kvm_vm *vm;
+ struct kvm_vcpu *vcpu;
+ pthread_t vm_thread;
+ void *thread_return;
+ uint32_t exit_reason;
+
+ vm = vm_create_shape_with_one_vcpu(protected_vm_shape, &vcpu,
+ guest_repeatedly_read);
+
+ vm_userspace_mem_region_add(vm, VM_MEM_SRC_ANONYMOUS,
+ EXITS_TEST_GPA, EXITS_TEST_SLOT,
+ EXITS_TEST_NPAGES,
+ KVM_MEM_GUEST_MEMFD);
+
+ virt_map(vm, EXITS_TEST_GVA, EXITS_TEST_GPA, EXITS_TEST_NPAGES);
+
+ /* Request to access page privately */
+ vm_mem_set_private(vm, EXITS_TEST_GPA, EXITS_TEST_SIZE);
+
+ pthread_create(&vm_thread, NULL,
+ (void *(*)(void *))run_vcpu_get_exit_reason,
+ (void *)vcpu);
+
+ vm_mem_region_delete(vm, EXITS_TEST_SLOT);
+
+ pthread_join(vm_thread, &thread_return);
+ exit_reason = (uint32_t)(uint64_t)thread_return;
+
+ TEST_ASSERT_EQ(exit_reason, KVM_EXIT_MEMORY_FAULT);
+ TEST_ASSERT_EQ(vcpu->run->memory_fault.flags, KVM_MEMORY_EXIT_FLAG_PRIVATE);
+ TEST_ASSERT_EQ(vcpu->run->memory_fault.gpa, EXITS_TEST_GPA);
+ TEST_ASSERT_EQ(vcpu->run->memory_fault.size, EXITS_TEST_SIZE);
+
+ kvm_vm_free(vm);
+}
+
+static void test_private_access_memslot_not_private(void)
+{
+ struct kvm_vm *vm;
+ struct kvm_vcpu *vcpu;
+ uint32_t exit_reason;
+
+ vm = vm_create_shape_with_one_vcpu(protected_vm_shape, &vcpu,
+ guest_repeatedly_read);
+
+ /* Add a non-private memslot (flags = 0) */
+ vm_userspace_mem_region_add(vm, VM_MEM_SRC_ANONYMOUS,
+ EXITS_TEST_GPA, EXITS_TEST_SLOT,
+ EXITS_TEST_NPAGES, 0);
+
+ virt_map(vm, EXITS_TEST_GVA, EXITS_TEST_GPA, EXITS_TEST_NPAGES);
+
+ /* Request to access page privately */
+ vm_mem_set_private(vm, EXITS_TEST_GPA, EXITS_TEST_SIZE);
+
+ exit_reason = run_vcpu_get_exit_reason(vcpu);
+
+ TEST_ASSERT_EQ(exit_reason, KVM_EXIT_MEMORY_FAULT);
+ TEST_ASSERT_EQ(vcpu->run->memory_fault.flags, KVM_MEMORY_EXIT_FLAG_PRIVATE);
+ TEST_ASSERT_EQ(vcpu->run->memory_fault.gpa, EXITS_TEST_GPA);
+ TEST_ASSERT_EQ(vcpu->run->memory_fault.size, EXITS_TEST_SIZE);
+
+ kvm_vm_free(vm);
+}
+
+int main(int argc, char *argv[])
+{
+ TEST_REQUIRE(kvm_check_cap(KVM_CAP_VM_TYPES) & BIT(KVM_X86_SW_PROTECTED_VM));
+
+ test_private_access_memslot_deleted();
+ test_private_access_memslot_not_private();
+}
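
The flow run_vcpu_get_exit_reason() relies on is worth spelling out: for a private-memory fault, KVM_RUN returns -1 with errno set to EFAULT, yet the exit information in the shared kvm_run page is still valid. A hedged sketch of decoding it with the raw UAPI (vcpu_fd and the mmap'ed run pointer are assumed to already exist; headers with KVM_EXIT_MEMORY_FAULT support are required):

    #include <errno.h>
    #include <stdio.h>
    #include <sys/ioctl.h>
    #include <linux/kvm.h>

    /* Run once and report a private-memory fault, mirroring run_vcpu_get_exit_reason(). */
    static void run_and_dump_memory_fault(int vcpu_fd, struct kvm_run *run)
    {
            int r = ioctl(vcpu_fd, KVM_RUN, 0);

            /* KVM_RUN fails with EFAULT but still fills in run->memory_fault. */
            if (r && errno == EFAULT && run->exit_reason == KVM_EXIT_MEMORY_FAULT)
                    printf("memory fault: gpa=0x%llx size=0x%llx %s\n",
                           run->memory_fault.gpa, run->memory_fault.size,
                           (run->memory_fault.flags & KVM_MEMORY_EXIT_FLAG_PRIVATE) ?
                           "private" : "shared");
    }
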
diff --git a/tools/testing/selftests/kvm/x86_64/recalc_apic_map_test.c b/tools/testing/selftests/kvm/x86_64/recalc_apic_map_test.c
new file mode 100644
index 000000000000..cbc92a862ea9
--- /dev/null
+++ b/tools/testing/selftests/kvm/x86_64/recalc_apic_map_test.c
@@ -0,0 +1,74 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Test edge cases and race conditions in kvm_recalculate_apic_map().
+ */
+
+#include <sys/ioctl.h>
+#include <pthread.h>
+#include <time.h>
+
+#include "processor.h"
+#include "test_util.h"
+#include "kvm_util.h"
+#include "apic.h"
+
+#define TIMEOUT 5 /* seconds */
+
+#define LAPIC_DISABLED 0
+#define LAPIC_X2APIC (MSR_IA32_APICBASE_ENABLE | X2APIC_ENABLE)
+#define MAX_XAPIC_ID 0xff
+
+static void *race(void *arg)
+{
+ struct kvm_lapic_state lapic = {};
+ struct kvm_vcpu *vcpu = arg;
+
+ while (1) {
+ /* Trigger kvm_recalculate_apic_map(). */
+ vcpu_ioctl(vcpu, KVM_SET_LAPIC, &lapic);
+ pthread_testcancel();
+ }
+
+ return NULL;
+}
+
+int main(void)
+{
+ struct kvm_vcpu *vcpus[KVM_MAX_VCPUS];
+ struct kvm_vcpu *vcpuN;
+ struct kvm_vm *vm;
+ pthread_t thread;
+ time_t t;
+ int i;
+
+ kvm_static_assert(KVM_MAX_VCPUS > MAX_XAPIC_ID);
+
+ /*
+ * Create the max number of vCPUs supported by selftests so that KVM
+	 * has a decent amount of work to do when recalculating the map, i.e. to
+ * make the problematic window large enough to hit.
+ */
+ vm = vm_create_with_vcpus(KVM_MAX_VCPUS, NULL, vcpus);
+
+ /*
+ * Enable x2APIC on all vCPUs so that KVM doesn't bail from the recalc
+ * due to vCPUs having aliased xAPIC IDs (truncated to 8 bits).
+ */
+ for (i = 0; i < KVM_MAX_VCPUS; i++)
+ vcpu_set_msr(vcpus[i], MSR_IA32_APICBASE, LAPIC_X2APIC);
+
+ TEST_ASSERT_EQ(pthread_create(&thread, NULL, race, vcpus[0]), 0);
+
+ vcpuN = vcpus[KVM_MAX_VCPUS - 1];
+ for (t = time(NULL) + TIMEOUT; time(NULL) < t;) {
+ vcpu_set_msr(vcpuN, MSR_IA32_APICBASE, LAPIC_X2APIC);
+ vcpu_set_msr(vcpuN, MSR_IA32_APICBASE, LAPIC_DISABLED);
+ }
+
+ TEST_ASSERT_EQ(pthread_cancel(thread), 0);
+ TEST_ASSERT_EQ(pthread_join(thread, NULL), 0);
+
+ kvm_vm_free(vm);
+
+ return 0;
+}
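
The race boils down to two interfaces: KVM_SET_LAPIC on one thread (each call forces a recalculation of the APIC map) and the APIC base MSR toggle on the other. A sketch of the MSR half using the raw KVM_SET_MSRS ioctl, roughly what vcpu_set_msr() wraps (vcpu_fd is an assumed, already-created vCPU descriptor; the MSR index and bit definitions are spelled out because linux/kvm.h does not provide them):

    #include <stdint.h>
    #include <string.h>
    #include <sys/ioctl.h>
    #include <linux/kvm.h>

    #define MSR_IA32_APICBASE       0x0000001b
    #define APICBASE_ENABLE         (1ULL << 11)    /* xAPIC global enable */
    #define APICBASE_X2APIC         (1ULL << 10)    /* x2APIC mode */

    /* Toggle the local APIC between x2APIC and disabled, as the main loop does. */
    static int set_apic_base(int vcpu_fd, uint64_t val)
    {
            struct {
                    struct kvm_msrs header;
                    struct kvm_msr_entry entry;
            } msrs;

            memset(&msrs, 0, sizeof(msrs));
            msrs.header.nmsrs = 1;
            msrs.entry.index = MSR_IA32_APICBASE;
            msrs.entry.data = val;

            /* KVM_SET_MSRS returns the number of MSRs processed, i.e. 1 on success. */
            return ioctl(vcpu_fd, KVM_SET_MSRS, &msrs);
    }
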
diff --git a/tools/testing/selftests/kvm/x86_64/set_boot_cpu_id.c b/tools/testing/selftests/kvm/x86_64/set_boot_cpu_id.c
new file mode 100644
index 000000000000..366cf18600bc
--- /dev/null
+++ b/tools/testing/selftests/kvm/x86_64/set_boot_cpu_id.c
@@ -0,0 +1,131 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Test that KVM_SET_BOOT_CPU_ID works as intended
+ *
+ * Copyright (C) 2020, Red Hat, Inc.
+ */
+#define _GNU_SOURCE /* for program_invocation_name */
+#include <fcntl.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <sys/ioctl.h>
+
+#include "test_util.h"
+#include "kvm_util.h"
+#include "processor.h"
+#include "apic.h"
+
+static void guest_bsp_vcpu(void *arg)
+{
+ GUEST_SYNC(1);
+
+ GUEST_ASSERT_NE(get_bsp_flag(), 0);
+
+ GUEST_DONE();
+}
+
+static void guest_not_bsp_vcpu(void *arg)
+{
+ GUEST_SYNC(1);
+
+ GUEST_ASSERT_EQ(get_bsp_flag(), 0);
+
+ GUEST_DONE();
+}
+
+static void test_set_bsp_busy(struct kvm_vcpu *vcpu, const char *msg)
+{
+ int r = __vm_ioctl(vcpu->vm, KVM_SET_BOOT_CPU_ID,
+ (void *)(unsigned long)vcpu->id);
+
+ TEST_ASSERT(r == -1 && errno == EBUSY, "KVM_SET_BOOT_CPU_ID set %s", msg);
+}
+
+static void run_vcpu(struct kvm_vcpu *vcpu)
+{
+ struct ucall uc;
+ int stage;
+
+ for (stage = 0; stage < 2; stage++) {
+
+ vcpu_run(vcpu);
+
+ switch (get_ucall(vcpu, &uc)) {
+ case UCALL_SYNC:
+ TEST_ASSERT(!strcmp((const char *)uc.args[0], "hello") &&
+ uc.args[1] == stage + 1,
+ "Stage %d: Unexpected register values vmexit, got %lx",
+ stage + 1, (ulong)uc.args[1]);
+ test_set_bsp_busy(vcpu, "while running vm");
+ break;
+ case UCALL_DONE:
+ TEST_ASSERT(stage == 1,
+ "Expected GUEST_DONE in stage 2, got stage %d",
+ stage);
+ break;
+ case UCALL_ABORT:
+ REPORT_GUEST_ASSERT(uc);
+ default:
+ TEST_ASSERT(false, "Unexpected exit: %s",
+ exit_reason_str(vcpu->run->exit_reason));
+ }
+ }
+}
+
+static struct kvm_vm *create_vm(uint32_t nr_vcpus, uint32_t bsp_vcpu_id,
+ struct kvm_vcpu *vcpus[])
+{
+ struct kvm_vm *vm;
+ uint32_t i;
+
+ vm = vm_create(nr_vcpus);
+
+ vm_ioctl(vm, KVM_SET_BOOT_CPU_ID, (void *)(unsigned long)bsp_vcpu_id);
+
+ for (i = 0; i < nr_vcpus; i++)
+ vcpus[i] = vm_vcpu_add(vm, i, i == bsp_vcpu_id ? guest_bsp_vcpu :
+ guest_not_bsp_vcpu);
+ return vm;
+}
+
+static void run_vm_bsp(uint32_t bsp_vcpu_id)
+{
+ struct kvm_vcpu *vcpus[2];
+ struct kvm_vm *vm;
+
+ vm = create_vm(ARRAY_SIZE(vcpus), bsp_vcpu_id, vcpus);
+
+ run_vcpu(vcpus[0]);
+ run_vcpu(vcpus[1]);
+
+ kvm_vm_free(vm);
+}
+
+static void check_set_bsp_busy(void)
+{
+ struct kvm_vcpu *vcpus[2];
+ struct kvm_vm *vm;
+
+ vm = create_vm(ARRAY_SIZE(vcpus), 0, vcpus);
+
+ test_set_bsp_busy(vcpus[1], "after adding vcpu");
+
+ run_vcpu(vcpus[0]);
+ run_vcpu(vcpus[1]);
+
+ test_set_bsp_busy(vcpus[1], "to a terminated vcpu");
+
+ kvm_vm_free(vm);
+}
+
+int main(int argc, char *argv[])
+{
+ TEST_REQUIRE(kvm_has_cap(KVM_CAP_SET_BOOT_CPU_ID));
+
+ run_vm_bsp(0);
+ run_vm_bsp(1);
+ run_vm_bsp(0);
+
+ check_set_bsp_busy();
+}
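
Note the ordering contract being tested: KVM_SET_BOOT_CPU_ID is a VM ioctl that takes the vCPU id by value and must be issued before any vCPU is created; afterwards it fails with EBUSY, which is exactly what test_set_bsp_busy() checks. A bare-bones sketch of the happy path (error handling mostly omitted; kvm_fd is an assumed open /dev/kvm descriptor):

    #include <sys/ioctl.h>
    #include <linux/kvm.h>

    /* Pick vCPU 1 as the BSP; must happen before any KVM_CREATE_VCPU. */
    static int create_vm_with_bsp1(int kvm_fd)
    {
            int vm_fd = ioctl(kvm_fd, KVM_CREATE_VM, 0);

            if (vm_fd < 0)
                    return vm_fd;

            ioctl(vm_fd, KVM_SET_BOOT_CPU_ID, (unsigned long)1);

            ioctl(vm_fd, KVM_CREATE_VCPU, 0);       /* AP */
            ioctl(vm_fd, KVM_CREATE_VCPU, 1);       /* BSP */

            return vm_fd;
    }
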
diff --git a/tools/testing/selftests/kvm/x86_64/set_sregs_test.c b/tools/testing/selftests/kvm/x86_64/set_sregs_test.c
index 9f7656184f31..3610981d9162 100644
--- a/tools/testing/selftests/kvm/x86_64/set_sregs_test.c
+++ b/tools/testing/selftests/kvm/x86_64/set_sregs_test.c
@@ -22,27 +22,117 @@
#include "kvm_util.h"
#include "processor.h"
-#define VCPU_ID 5
+#define TEST_INVALID_CR_BIT(vcpu, cr, orig, bit) \
+do { \
+ struct kvm_sregs new; \
+ int rc; \
+ \
+	/* Skip the sub-test if the feature/bit is supported. */		\
+ if (orig.cr & bit) \
+ break; \
+ \
+ memcpy(&new, &orig, sizeof(sregs)); \
+ new.cr |= bit; \
+ \
+ rc = _vcpu_sregs_set(vcpu, &new); \
+ TEST_ASSERT(rc, "KVM allowed invalid " #cr " bit (0x%lx)", bit); \
+ \
+ /* Sanity check that KVM didn't change anything. */ \
+ vcpu_sregs_get(vcpu, &new); \
+ TEST_ASSERT(!memcmp(&new, &orig, sizeof(new)), "KVM modified sregs"); \
+} while (0)
+
+static uint64_t calc_supported_cr4_feature_bits(void)
+{
+ uint64_t cr4;
+
+ cr4 = X86_CR4_VME | X86_CR4_PVI | X86_CR4_TSD | X86_CR4_DE |
+ X86_CR4_PSE | X86_CR4_PAE | X86_CR4_MCE | X86_CR4_PGE |
+ X86_CR4_PCE | X86_CR4_OSFXSR | X86_CR4_OSXMMEXCPT;
+ if (kvm_cpu_has(X86_FEATURE_UMIP))
+ cr4 |= X86_CR4_UMIP;
+ if (kvm_cpu_has(X86_FEATURE_LA57))
+ cr4 |= X86_CR4_LA57;
+ if (kvm_cpu_has(X86_FEATURE_VMX))
+ cr4 |= X86_CR4_VMXE;
+ if (kvm_cpu_has(X86_FEATURE_SMX))
+ cr4 |= X86_CR4_SMXE;
+ if (kvm_cpu_has(X86_FEATURE_FSGSBASE))
+ cr4 |= X86_CR4_FSGSBASE;
+ if (kvm_cpu_has(X86_FEATURE_PCID))
+ cr4 |= X86_CR4_PCIDE;
+ if (kvm_cpu_has(X86_FEATURE_XSAVE))
+ cr4 |= X86_CR4_OSXSAVE;
+ if (kvm_cpu_has(X86_FEATURE_SMEP))
+ cr4 |= X86_CR4_SMEP;
+ if (kvm_cpu_has(X86_FEATURE_SMAP))
+ cr4 |= X86_CR4_SMAP;
+ if (kvm_cpu_has(X86_FEATURE_PKU))
+ cr4 |= X86_CR4_PKE;
+
+ return cr4;
+}
int main(int argc, char *argv[])
{
struct kvm_sregs sregs;
+ struct kvm_vcpu *vcpu;
struct kvm_vm *vm;
- int rc;
+ uint64_t cr4;
+ int rc, i;
+
+ /*
+	 * Create a dummy VM and use it to verify that all supported CR4 bits
+	 * can be set prior to defining the vCPU model, i.e. without doing
+	 * KVM_SET_CPUID2.
+ */
+ vm = vm_create_barebones();
+ vcpu = __vm_vcpu_add(vm, 0);
+
+ vcpu_sregs_get(vcpu, &sregs);
- /* Tell stdout not to buffer its content */
- setbuf(stdout, NULL);
+ sregs.cr0 = 0;
+ sregs.cr4 |= calc_supported_cr4_feature_bits();
+ cr4 = sregs.cr4;
+
+ rc = _vcpu_sregs_set(vcpu, &sregs);
+ TEST_ASSERT(!rc, "Failed to set supported CR4 bits (0x%lx)", cr4);
+
+ vcpu_sregs_get(vcpu, &sregs);
+ TEST_ASSERT(sregs.cr4 == cr4, "sregs.CR4 (0x%llx) != CR4 (0x%lx)",
+ sregs.cr4, cr4);
+
+ /* Verify all unsupported features are rejected by KVM. */
+ TEST_INVALID_CR_BIT(vcpu, cr4, sregs, X86_CR4_UMIP);
+ TEST_INVALID_CR_BIT(vcpu, cr4, sregs, X86_CR4_LA57);
+ TEST_INVALID_CR_BIT(vcpu, cr4, sregs, X86_CR4_VMXE);
+ TEST_INVALID_CR_BIT(vcpu, cr4, sregs, X86_CR4_SMXE);
+ TEST_INVALID_CR_BIT(vcpu, cr4, sregs, X86_CR4_FSGSBASE);
+ TEST_INVALID_CR_BIT(vcpu, cr4, sregs, X86_CR4_PCIDE);
+ TEST_INVALID_CR_BIT(vcpu, cr4, sregs, X86_CR4_OSXSAVE);
+ TEST_INVALID_CR_BIT(vcpu, cr4, sregs, X86_CR4_SMEP);
+ TEST_INVALID_CR_BIT(vcpu, cr4, sregs, X86_CR4_SMAP);
+ TEST_INVALID_CR_BIT(vcpu, cr4, sregs, X86_CR4_PKE);
+
+ for (i = 32; i < 64; i++)
+ TEST_INVALID_CR_BIT(vcpu, cr0, sregs, BIT(i));
+
+ /* NW without CD is illegal, as is PG without PE. */
+ TEST_INVALID_CR_BIT(vcpu, cr0, sregs, X86_CR0_NW);
+ TEST_INVALID_CR_BIT(vcpu, cr0, sregs, X86_CR0_PG);
+
+ kvm_vm_free(vm);
- /* Create VM */
- vm = vm_create_default(VCPU_ID, 0, NULL);
+ /* Create a "real" VM and verify APIC_BASE can be set. */
+ vm = vm_create_with_one_vcpu(&vcpu, NULL);
- vcpu_sregs_get(vm, VCPU_ID, &sregs);
+ vcpu_sregs_get(vcpu, &sregs);
sregs.apic_base = 1 << 10;
- rc = _vcpu_sregs_set(vm, VCPU_ID, &sregs);
+ rc = _vcpu_sregs_set(vcpu, &sregs);
TEST_ASSERT(rc, "Set IA32_APIC_BASE to %llx (invalid)",
sregs.apic_base);
sregs.apic_base = 1 << 11;
- rc = _vcpu_sregs_set(vm, VCPU_ID, &sregs);
+ rc = _vcpu_sregs_set(vcpu, &sregs);
TEST_ASSERT(!rc, "Couldn't set IA32_APIC_BASE to %llx (valid)",
sregs.apic_base);
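
The TEST_INVALID_CR_BIT() macro above is, at bottom, a get/modify/set/verify cycle over KVM_GET_SREGS and KVM_SET_SREGS. A standalone sketch of one such probe for CR4.LA57, assuming the bit is not exposed to the vCPU (the CR4 define is spelled out since the KVM UAPI headers do not carry it; vcpu_fd is an assumed descriptor):

    #include <string.h>
    #include <sys/ioctl.h>
    #include <linux/kvm.h>

    #define X86_CR4_LA57    (1ULL << 12)

    /* Returns 0 if KVM rejected the unsupported CR4 bit and left state untouched. */
    static int check_cr4_bit_rejected(int vcpu_fd, unsigned long long bit)
    {
            struct kvm_sregs orig, new;

            ioctl(vcpu_fd, KVM_GET_SREGS, &orig);

            new = orig;
            new.cr4 |= bit;
            if (!ioctl(vcpu_fd, KVM_SET_SREGS, &new))
                    return -1;      /* KVM accepted an invalid bit */

            /* Sanity check that the failed set did not modify anything. */
            ioctl(vcpu_fd, KVM_GET_SREGS, &new);
            return memcmp(&new, &orig, sizeof(new)) ? -1 : 0;
    }
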
diff --git a/tools/testing/selftests/kvm/x86_64/sev_migrate_tests.c b/tools/testing/selftests/kvm/x86_64/sev_migrate_tests.c
new file mode 100644
index 000000000000..0a6dfba3905b
--- /dev/null
+++ b/tools/testing/selftests/kvm/x86_64/sev_migrate_tests.c
@@ -0,0 +1,397 @@
+// SPDX-License-Identifier: GPL-2.0-only
+#include <linux/kvm.h>
+#include <linux/psp-sev.h>
+#include <stdio.h>
+#include <sys/ioctl.h>
+#include <stdlib.h>
+#include <errno.h>
+#include <pthread.h>
+
+#include "test_util.h"
+#include "kvm_util.h"
+#include "processor.h"
+#include "sev.h"
+#include "kselftest.h"
+
+#define NR_MIGRATE_TEST_VCPUS 4
+#define NR_MIGRATE_TEST_VMS 3
+#define NR_LOCK_TESTING_THREADS 3
+#define NR_LOCK_TESTING_ITERATIONS 10000
+
+bool have_sev_es;
+
+static struct kvm_vm *sev_vm_create(bool es)
+{
+ struct kvm_vm *vm;
+ int i;
+
+ vm = vm_create_barebones();
+ if (!es)
+ sev_vm_init(vm);
+ else
+ sev_es_vm_init(vm);
+
+ for (i = 0; i < NR_MIGRATE_TEST_VCPUS; ++i)
+ __vm_vcpu_add(vm, i);
+
+ sev_vm_launch(vm, es ? SEV_POLICY_ES : 0);
+
+ if (es)
+ vm_sev_ioctl(vm, KVM_SEV_LAUNCH_UPDATE_VMSA, NULL);
+ return vm;
+}
+
+static struct kvm_vm *aux_vm_create(bool with_vcpus)
+{
+ struct kvm_vm *vm;
+ int i;
+
+ vm = vm_create_barebones();
+ if (!with_vcpus)
+ return vm;
+
+ for (i = 0; i < NR_MIGRATE_TEST_VCPUS; ++i)
+ __vm_vcpu_add(vm, i);
+
+ return vm;
+}
+
+static int __sev_migrate_from(struct kvm_vm *dst, struct kvm_vm *src)
+{
+ return __vm_enable_cap(dst, KVM_CAP_VM_MOVE_ENC_CONTEXT_FROM, src->fd);
+}
+
+
+static void sev_migrate_from(struct kvm_vm *dst, struct kvm_vm *src)
+{
+ int ret;
+
+ ret = __sev_migrate_from(dst, src);
+ TEST_ASSERT(!ret, "Migration failed, ret: %d, errno: %d", ret, errno);
+}
+
+static void test_sev_migrate_from(bool es)
+{
+ struct kvm_vm *src_vm;
+ struct kvm_vm *dst_vms[NR_MIGRATE_TEST_VMS];
+ int i, ret;
+
+ src_vm = sev_vm_create(es);
+ for (i = 0; i < NR_MIGRATE_TEST_VMS; ++i)
+ dst_vms[i] = aux_vm_create(true);
+
+ /* Initial migration from the src to the first dst. */
+ sev_migrate_from(dst_vms[0], src_vm);
+
+ for (i = 1; i < NR_MIGRATE_TEST_VMS; i++)
+ sev_migrate_from(dst_vms[i], dst_vms[i - 1]);
+
+ /* Migrate the guest back to the original VM. */
+ ret = __sev_migrate_from(src_vm, dst_vms[NR_MIGRATE_TEST_VMS - 1]);
+ TEST_ASSERT(ret == -1 && errno == EIO,
+ "VM that was migrated from should be dead. ret %d, errno: %d", ret,
+ errno);
+
+ kvm_vm_free(src_vm);
+ for (i = 0; i < NR_MIGRATE_TEST_VMS; ++i)
+ kvm_vm_free(dst_vms[i]);
+}
+
+struct locking_thread_input {
+ struct kvm_vm *vm;
+ struct kvm_vm *source_vms[NR_LOCK_TESTING_THREADS];
+};
+
+static void *locking_test_thread(void *arg)
+{
+ int i, j;
+ struct locking_thread_input *input = (struct locking_thread_input *)arg;
+
+ for (i = 0; i < NR_LOCK_TESTING_ITERATIONS; ++i) {
+ j = i % NR_LOCK_TESTING_THREADS;
+ __sev_migrate_from(input->vm, input->source_vms[j]);
+ }
+
+ return NULL;
+}
+
+static void test_sev_migrate_locking(void)
+{
+ struct locking_thread_input input[NR_LOCK_TESTING_THREADS];
+ pthread_t pt[NR_LOCK_TESTING_THREADS];
+ int i;
+
+ for (i = 0; i < NR_LOCK_TESTING_THREADS; ++i) {
+ input[i].vm = sev_vm_create(/* es= */ false);
+ input[0].source_vms[i] = input[i].vm;
+ }
+ for (i = 1; i < NR_LOCK_TESTING_THREADS; ++i)
+ memcpy(input[i].source_vms, input[0].source_vms,
+ sizeof(input[i].source_vms));
+
+ for (i = 0; i < NR_LOCK_TESTING_THREADS; ++i)
+ pthread_create(&pt[i], NULL, locking_test_thread, &input[i]);
+
+ for (i = 0; i < NR_LOCK_TESTING_THREADS; ++i)
+ pthread_join(pt[i], NULL);
+ for (i = 0; i < NR_LOCK_TESTING_THREADS; ++i)
+ kvm_vm_free(input[i].vm);
+}
+
+static void test_sev_migrate_parameters(void)
+{
+ struct kvm_vm *sev_vm, *sev_es_vm, *vm_no_vcpu, *vm_no_sev,
+ *sev_es_vm_no_vmsa;
+ int ret;
+
+ vm_no_vcpu = vm_create_barebones();
+ vm_no_sev = aux_vm_create(true);
+ ret = __sev_migrate_from(vm_no_vcpu, vm_no_sev);
+ TEST_ASSERT(ret == -1 && errno == EINVAL,
+ "Migrations require SEV enabled. ret %d, errno: %d", ret,
+ errno);
+
+ if (!have_sev_es)
+ goto out;
+
+ sev_vm = sev_vm_create(/* es= */ false);
+ sev_es_vm = sev_vm_create(/* es= */ true);
+ sev_es_vm_no_vmsa = vm_create_barebones();
+ sev_es_vm_init(sev_es_vm_no_vmsa);
+ __vm_vcpu_add(sev_es_vm_no_vmsa, 1);
+
+ ret = __sev_migrate_from(sev_vm, sev_es_vm);
+ TEST_ASSERT(
+ ret == -1 && errno == EINVAL,
+		"Should not be able to migrate to SEV enabled VM. ret: %d, errno: %d",
+ ret, errno);
+
+ ret = __sev_migrate_from(sev_es_vm, sev_vm);
+ TEST_ASSERT(
+ ret == -1 && errno == EINVAL,
+		"Should not be able to migrate to SEV-ES enabled VM. ret: %d, errno: %d",
+ ret, errno);
+
+ ret = __sev_migrate_from(vm_no_vcpu, sev_es_vm);
+ TEST_ASSERT(
+ ret == -1 && errno == EINVAL,
+		"SEV-ES migrations require the same number of vCPUs. ret: %d, errno: %d",
+ ret, errno);
+
+ ret = __sev_migrate_from(vm_no_vcpu, sev_es_vm_no_vmsa);
+ TEST_ASSERT(
+ ret == -1 && errno == EINVAL,
+ "SEV-ES migrations require UPDATE_VMSA. ret %d, errno: %d",
+ ret, errno);
+
+ kvm_vm_free(sev_vm);
+ kvm_vm_free(sev_es_vm);
+ kvm_vm_free(sev_es_vm_no_vmsa);
+out:
+ kvm_vm_free(vm_no_vcpu);
+ kvm_vm_free(vm_no_sev);
+}
+
+static int __sev_mirror_create(struct kvm_vm *dst, struct kvm_vm *src)
+{
+ return __vm_enable_cap(dst, KVM_CAP_VM_COPY_ENC_CONTEXT_FROM, src->fd);
+}
+
+
+static void sev_mirror_create(struct kvm_vm *dst, struct kvm_vm *src)
+{
+ int ret;
+
+ ret = __sev_mirror_create(dst, src);
+ TEST_ASSERT(!ret, "Copying context failed, ret: %d, errno: %d", ret, errno);
+}
+
+static void verify_mirror_allowed_cmds(struct kvm_vm *vm)
+{
+ struct kvm_sev_guest_status status;
+ int cmd_id;
+
+ for (cmd_id = KVM_SEV_INIT; cmd_id < KVM_SEV_NR_MAX; ++cmd_id) {
+ int ret;
+
+ /*
+		 * These commands are allowed for mirror VMs; all others are not.
+ */
+ switch (cmd_id) {
+ case KVM_SEV_LAUNCH_UPDATE_VMSA:
+ case KVM_SEV_GUEST_STATUS:
+ case KVM_SEV_DBG_DECRYPT:
+ case KVM_SEV_DBG_ENCRYPT:
+ continue;
+ default:
+ break;
+ }
+
+ /*
+ * These commands should be disallowed before the data
+ * parameter is examined so NULL is OK here.
+ */
+ ret = __vm_sev_ioctl(vm, cmd_id, NULL);
+ TEST_ASSERT(
+ ret == -1 && errno == EINVAL,
+			"Should not be able to call command: %d. ret: %d, errno: %d",
+ cmd_id, ret, errno);
+ }
+
+ vm_sev_ioctl(vm, KVM_SEV_GUEST_STATUS, &status);
+}
+
+static void test_sev_mirror(bool es)
+{
+ struct kvm_vm *src_vm, *dst_vm;
+ int i;
+
+ src_vm = sev_vm_create(es);
+ dst_vm = aux_vm_create(false);
+
+ sev_mirror_create(dst_vm, src_vm);
+
+ /* Check that we can complete creation of the mirror VM. */
+ for (i = 0; i < NR_MIGRATE_TEST_VCPUS; ++i)
+ __vm_vcpu_add(dst_vm, i);
+
+ if (es)
+ vm_sev_ioctl(dst_vm, KVM_SEV_LAUNCH_UPDATE_VMSA, NULL);
+
+ verify_mirror_allowed_cmds(dst_vm);
+
+ kvm_vm_free(src_vm);
+ kvm_vm_free(dst_vm);
+}
+
+static void test_sev_mirror_parameters(void)
+{
+ struct kvm_vm *sev_vm, *sev_es_vm, *vm_no_vcpu, *vm_with_vcpu;
+ int ret;
+
+ sev_vm = sev_vm_create(/* es= */ false);
+ vm_with_vcpu = aux_vm_create(true);
+ vm_no_vcpu = aux_vm_create(false);
+
+ ret = __sev_mirror_create(sev_vm, sev_vm);
+ TEST_ASSERT(
+ ret == -1 && errno == EINVAL,
+		"Should not be able to copy context to self. ret: %d, errno: %d",
+ ret, errno);
+
+ ret = __sev_mirror_create(vm_no_vcpu, vm_with_vcpu);
+ TEST_ASSERT(ret == -1 && errno == EINVAL,
+ "Copy context requires SEV enabled. ret %d, errno: %d", ret,
+ errno);
+
+ ret = __sev_mirror_create(vm_with_vcpu, sev_vm);
+ TEST_ASSERT(
+ ret == -1 && errno == EINVAL,
+		"SEV copy context requires no vCPUs on the destination. ret: %d, errno: %d",
+ ret, errno);
+
+ if (!have_sev_es)
+ goto out;
+
+ sev_es_vm = sev_vm_create(/* es= */ true);
+ ret = __sev_mirror_create(sev_vm, sev_es_vm);
+ TEST_ASSERT(
+ ret == -1 && errno == EINVAL,
+		"Should not be able to copy context to SEV enabled VM. ret: %d, errno: %d",
+ ret, errno);
+
+ ret = __sev_mirror_create(sev_es_vm, sev_vm);
+ TEST_ASSERT(
+ ret == -1 && errno == EINVAL,
+		"Should not be able to copy context to SEV-ES enabled VM. ret: %d, errno: %d",
+ ret, errno);
+
+ kvm_vm_free(sev_es_vm);
+
+out:
+ kvm_vm_free(sev_vm);
+ kvm_vm_free(vm_with_vcpu);
+ kvm_vm_free(vm_no_vcpu);
+}
+
+static void test_sev_move_copy(void)
+{
+ struct kvm_vm *dst_vm, *dst2_vm, *dst3_vm, *sev_vm, *mirror_vm,
+ *dst_mirror_vm, *dst2_mirror_vm, *dst3_mirror_vm;
+
+ sev_vm = sev_vm_create(/* es= */ false);
+ dst_vm = aux_vm_create(true);
+ dst2_vm = aux_vm_create(true);
+ dst3_vm = aux_vm_create(true);
+ mirror_vm = aux_vm_create(false);
+ dst_mirror_vm = aux_vm_create(false);
+ dst2_mirror_vm = aux_vm_create(false);
+ dst3_mirror_vm = aux_vm_create(false);
+
+ sev_mirror_create(mirror_vm, sev_vm);
+
+ sev_migrate_from(dst_mirror_vm, mirror_vm);
+ sev_migrate_from(dst_vm, sev_vm);
+
+ sev_migrate_from(dst2_vm, dst_vm);
+ sev_migrate_from(dst2_mirror_vm, dst_mirror_vm);
+
+ sev_migrate_from(dst3_mirror_vm, dst2_mirror_vm);
+ sev_migrate_from(dst3_vm, dst2_vm);
+
+ kvm_vm_free(dst_vm);
+ kvm_vm_free(sev_vm);
+ kvm_vm_free(dst2_vm);
+ kvm_vm_free(dst3_vm);
+ kvm_vm_free(mirror_vm);
+ kvm_vm_free(dst_mirror_vm);
+ kvm_vm_free(dst2_mirror_vm);
+ kvm_vm_free(dst3_mirror_vm);
+
+ /*
+	 * Run a similar test but destroy the mirrors before the mirrored VMs to
+	 * ensure destruction is done safely.
+ * destruction is done safely.
+ */
+ sev_vm = sev_vm_create(/* es= */ false);
+ dst_vm = aux_vm_create(true);
+ mirror_vm = aux_vm_create(false);
+ dst_mirror_vm = aux_vm_create(false);
+
+ sev_mirror_create(mirror_vm, sev_vm);
+
+ sev_migrate_from(dst_mirror_vm, mirror_vm);
+ sev_migrate_from(dst_vm, sev_vm);
+
+ kvm_vm_free(mirror_vm);
+ kvm_vm_free(dst_mirror_vm);
+ kvm_vm_free(dst_vm);
+ kvm_vm_free(sev_vm);
+}
+
+int main(int argc, char *argv[])
+{
+ TEST_REQUIRE(kvm_has_cap(KVM_CAP_VM_MOVE_ENC_CONTEXT_FROM));
+ TEST_REQUIRE(kvm_has_cap(KVM_CAP_VM_COPY_ENC_CONTEXT_FROM));
+
+ TEST_REQUIRE(kvm_cpu_has(X86_FEATURE_SEV));
+
+ have_sev_es = kvm_cpu_has(X86_FEATURE_SEV_ES);
+
+ if (kvm_has_cap(KVM_CAP_VM_MOVE_ENC_CONTEXT_FROM)) {
+ test_sev_migrate_from(/* es= */ false);
+ if (have_sev_es)
+ test_sev_migrate_from(/* es= */ true);
+ test_sev_migrate_locking();
+ test_sev_migrate_parameters();
+ if (kvm_has_cap(KVM_CAP_VM_COPY_ENC_CONTEXT_FROM))
+ test_sev_move_copy();
+ }
+ if (kvm_has_cap(KVM_CAP_VM_COPY_ENC_CONTEXT_FROM)) {
+ test_sev_mirror(/* es= */ false);
+ if (have_sev_es)
+ test_sev_mirror(/* es= */ true);
+ test_sev_mirror_parameters();
+ }
+ return 0;
+}
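
Both intra-host migration and mirroring used above are a single capability enable on the destination VM, with the source VM's file descriptor passed in args[0]; __sev_migrate_from() and __sev_mirror_create() differ only in the capability number. A raw-ioctl sketch under that assumption (the fd parameters and helper name are illustrative):

    #include <sys/ioctl.h>
    #include <linux/kvm.h>

    /* Move the SEV context from src_vm_fd into dst_vm_fd; use
     * KVM_CAP_VM_COPY_ENC_CONTEXT_FROM instead to create a mirror. */
    static int sev_move_context(int dst_vm_fd, int src_vm_fd)
    {
            struct kvm_enable_cap cap = {
                    .cap = KVM_CAP_VM_MOVE_ENC_CONTEXT_FROM,
                    .args[0] = src_vm_fd,
            };

            return ioctl(dst_vm_fd, KVM_ENABLE_CAP, &cap);
    }
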
diff --git a/tools/testing/selftests/kvm/x86_64/sev_smoke_test.c b/tools/testing/selftests/kvm/x86_64/sev_smoke_test.c
new file mode 100644
index 000000000000..026779f3ed06
--- /dev/null
+++ b/tools/testing/selftests/kvm/x86_64/sev_smoke_test.c
@@ -0,0 +1,88 @@
+// SPDX-License-Identifier: GPL-2.0-only
+#include <fcntl.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <sys/ioctl.h>
+
+#include "test_util.h"
+#include "kvm_util.h"
+#include "processor.h"
+#include "svm_util.h"
+#include "linux/psp-sev.h"
+#include "sev.h"
+
+
+static void guest_sev_es_code(void)
+{
+ /* TODO: Check CPUID after GHCB-based hypercall support is added. */
+ GUEST_ASSERT(rdmsr(MSR_AMD64_SEV) & MSR_AMD64_SEV_ENABLED);
+ GUEST_ASSERT(rdmsr(MSR_AMD64_SEV) & MSR_AMD64_SEV_ES_ENABLED);
+
+ /*
+ * TODO: Add GHCB and ucall support for SEV-ES guests. For now, simply
+ * force "termination" to signal "done" via the GHCB MSR protocol.
+ */
+ wrmsr(MSR_AMD64_SEV_ES_GHCB, GHCB_MSR_TERM_REQ);
+ __asm__ __volatile__("rep; vmmcall");
+}
+
+static void guest_sev_code(void)
+{
+ GUEST_ASSERT(this_cpu_has(X86_FEATURE_SEV));
+ GUEST_ASSERT(rdmsr(MSR_AMD64_SEV) & MSR_AMD64_SEV_ENABLED);
+
+ GUEST_DONE();
+}
+
+static void test_sev(void *guest_code, uint64_t policy)
+{
+ struct kvm_vcpu *vcpu;
+ struct kvm_vm *vm;
+ struct ucall uc;
+
+ vm = vm_sev_create_with_one_vcpu(policy, guest_code, &vcpu);
+
+ for (;;) {
+ vcpu_run(vcpu);
+
+ if (policy & SEV_POLICY_ES) {
+ TEST_ASSERT(vcpu->run->exit_reason == KVM_EXIT_SYSTEM_EVENT,
+ "Wanted SYSTEM_EVENT, got %s",
+ exit_reason_str(vcpu->run->exit_reason));
+ TEST_ASSERT_EQ(vcpu->run->system_event.type, KVM_SYSTEM_EVENT_SEV_TERM);
+ TEST_ASSERT_EQ(vcpu->run->system_event.ndata, 1);
+ TEST_ASSERT_EQ(vcpu->run->system_event.data[0], GHCB_MSR_TERM_REQ);
+ break;
+ }
+
+ switch (get_ucall(vcpu, &uc)) {
+ case UCALL_SYNC:
+ continue;
+ case UCALL_DONE:
+ return;
+ case UCALL_ABORT:
+ REPORT_GUEST_ASSERT(uc);
+ default:
+ TEST_FAIL("Unexpected exit: %s",
+ exit_reason_str(vcpu->run->exit_reason));
+ }
+ }
+
+ kvm_vm_free(vm);
+}
+
+int main(int argc, char *argv[])
+{
+ TEST_REQUIRE(kvm_cpu_has(X86_FEATURE_SEV));
+
+ test_sev(guest_sev_code, SEV_POLICY_NO_DBG);
+ test_sev(guest_sev_code, 0);
+
+ if (kvm_cpu_has(X86_FEATURE_SEV_ES)) {
+ test_sev(guest_sev_es_code, SEV_POLICY_ES | SEV_POLICY_NO_DBG);
+ test_sev(guest_sev_es_code, SEV_POLICY_ES);
+ }
+
+ return 0;
+}
diff --git a/tools/testing/selftests/kvm/x86_64/smaller_maxphyaddr_emulation_test.c b/tools/testing/selftests/kvm/x86_64/smaller_maxphyaddr_emulation_test.c
new file mode 100644
index 000000000000..416207c38a17
--- /dev/null
+++ b/tools/testing/selftests/kvm/x86_64/smaller_maxphyaddr_emulation_test.c
@@ -0,0 +1,111 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Copyright (C) 2020, Google LLC.
+ *
+ * Test that KVM emulates instructions in response to EPT violations when
+ * allow_smaller_maxphyaddr is enabled and guest.MAXPHYADDR < host.MAXPHYADDR.
+ */
+
+#define _GNU_SOURCE /* for program_invocation_short_name */
+
+#include "flds_emulation.h"
+
+#include "test_util.h"
+#include "kvm_util.h"
+#include "vmx.h"
+
+#define MAXPHYADDR 36
+
+#define MEM_REGION_GVA 0x0000123456789000
+#define MEM_REGION_GPA 0x0000000700000000
+#define MEM_REGION_SLOT 10
+#define MEM_REGION_SIZE PAGE_SIZE
+
+static void guest_code(bool tdp_enabled)
+{
+ uint64_t error_code;
+ uint64_t vector;
+
+ vector = kvm_asm_safe_ec(FLDS_MEM_EAX, error_code, "a"(MEM_REGION_GVA));
+
+ /*
+ * When TDP is enabled, flds will trigger an emulation failure, exit to
+ * userspace, and then the selftest host "VMM" skips the instruction.
+ *
+ * When TDP is disabled, no instruction emulation is required so flds
+ * should generate #PF(RSVD).
+ */
+ if (tdp_enabled) {
+ GUEST_ASSERT(!vector);
+ } else {
+ GUEST_ASSERT_EQ(vector, PF_VECTOR);
+ GUEST_ASSERT(error_code & PFERR_RSVD_MASK);
+ }
+
+ GUEST_DONE();
+}
+
+int main(int argc, char *argv[])
+{
+ struct kvm_vcpu *vcpu;
+ struct kvm_vm *vm;
+ struct ucall uc;
+ uint64_t *pte;
+ uint64_t *hva;
+ uint64_t gpa;
+ int rc;
+
+ TEST_REQUIRE(kvm_has_cap(KVM_CAP_SMALLER_MAXPHYADDR));
+
+ vm = vm_create_with_one_vcpu(&vcpu, guest_code);
+ vcpu_args_set(vcpu, 1, kvm_is_tdp_enabled());
+
+ vm_init_descriptor_tables(vm);
+ vcpu_init_descriptor_tables(vcpu);
+
+ vcpu_set_cpuid_property(vcpu, X86_PROPERTY_MAX_PHY_ADDR, MAXPHYADDR);
+
+ rc = kvm_check_cap(KVM_CAP_EXIT_ON_EMULATION_FAILURE);
+ TEST_ASSERT(rc, "KVM_CAP_EXIT_ON_EMULATION_FAILURE is unavailable");
+ vm_enable_cap(vm, KVM_CAP_EXIT_ON_EMULATION_FAILURE, 1);
+
+ vm_userspace_mem_region_add(vm, VM_MEM_SRC_ANONYMOUS,
+ MEM_REGION_GPA, MEM_REGION_SLOT,
+ MEM_REGION_SIZE / PAGE_SIZE, 0);
+ gpa = vm_phy_pages_alloc(vm, MEM_REGION_SIZE / PAGE_SIZE,
+ MEM_REGION_GPA, MEM_REGION_SLOT);
+ TEST_ASSERT(gpa == MEM_REGION_GPA, "Failed vm_phy_pages_alloc");
+ virt_map(vm, MEM_REGION_GVA, MEM_REGION_GPA, 1);
+ hva = addr_gpa2hva(vm, MEM_REGION_GPA);
+ memset(hva, 0, PAGE_SIZE);
+
+ pte = vm_get_page_table_entry(vm, MEM_REGION_GVA);
+ *pte |= BIT_ULL(MAXPHYADDR);
+
+ vcpu_run(vcpu);
+
+ /*
+	 * When TDP is enabled, KVM must emulate in response to an access to a
+	 * guest physical address that is illegal from the guest's perspective
+	 * but legal from hardware's perspective. This should result in an emulation
+ * failure exit to userspace since KVM doesn't support emulating flds.
+ */
+ if (kvm_is_tdp_enabled()) {
+ handle_flds_emulation_failure_exit(vcpu);
+ vcpu_run(vcpu);
+ }
+
+ switch (get_ucall(vcpu, &uc)) {
+ case UCALL_ABORT:
+ REPORT_GUEST_ASSERT(uc);
+ break;
+ case UCALL_DONE:
+ break;
+ default:
+ TEST_FAIL("Unrecognized ucall: %lu", uc.cmd);
+ }
+
+ kvm_vm_free(vm);
+
+ return 0;
+}
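
The trick the test relies on: with guest.MAXPHYADDR forced to 36, bit 36 of a PTE is a reserved physical-address bit from the guest's point of view while remaining a legal address bit for the host, so the access either raises #PF(RSVD) (shadow paging) or forces KVM to emulate (TDP). A small, purely illustrative helper for the reserved-bit mask involved (not part of the test):

    #include <stdint.h>

    /* Guest-reserved physical-address bits in a PTE: bits [51, guest_maxphyaddr]. */
    static uint64_t guest_rsvd_pa_mask(unsigned int guest_maxphyaddr)
    {
            return ((1ULL << (52 - guest_maxphyaddr)) - 1) << guest_maxphyaddr;
    }

    /* With guest_maxphyaddr = 36, BIT_ULL(36), the bit set by the test, falls
     * inside this mask, so the guest sees it as reserved. */
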
diff --git a/tools/testing/selftests/kvm/x86_64/smm_test.c b/tools/testing/selftests/kvm/x86_64/smm_test.c
index ae39a220609f..e18b86666e1f 100644
--- a/tools/testing/selftests/kvm/x86_64/smm_test.c
+++ b/tools/testing/selftests/kvm/x86_64/smm_test.c
@@ -19,10 +19,6 @@
#include "vmx.h"
#include "svm_util.h"
-#define VCPU_ID 1
-
-#define PAGE_SIZE 4096
-
#define SMRAM_SIZE 65536
#define SMRAM_MEMSLOT ((1 << 16) | 1)
#define SMRAM_PAGES (SMRAM_SIZE / PAGE_SIZE)
@@ -53,15 +49,28 @@ static inline void sync_with_host(uint64_t phase)
: "+a" (phase));
}
-void self_smi(void)
+static void self_smi(void)
+{
+ x2apic_write_reg(APIC_ICR,
+ APIC_DEST_SELF | APIC_INT_ASSERT | APIC_DM_SMI);
+}
+
+static void l2_guest_code(void)
{
- wrmsr(APIC_BASE_MSR + (APIC_ICR >> 4),
- APIC_DEST_SELF | APIC_INT_ASSERT | APIC_DM_SMI);
+ sync_with_host(8);
+
+ sync_with_host(10);
+
+ vmcall();
}
-void guest_code(void *arg)
+static void guest_code(void *arg)
{
+ #define L2_GUEST_STACK_SIZE 64
+ unsigned long l2_guest_stack[L2_GUEST_STACK_SIZE];
uint64_t apicbase = rdmsr(MSR_IA32_APICBASE);
+ struct svm_test_data *svm = arg;
+ struct vmx_pages *vmx_pages = arg;
sync_with_host(1);
@@ -74,37 +83,63 @@ void guest_code(void *arg)
sync_with_host(4);
if (arg) {
- if (cpu_has_svm())
- generic_svm_setup(arg, NULL, NULL);
- else
- GUEST_ASSERT(prepare_for_vmx_operation(arg));
+ if (this_cpu_has(X86_FEATURE_SVM)) {
+ generic_svm_setup(svm, l2_guest_code,
+ &l2_guest_stack[L2_GUEST_STACK_SIZE]);
+ } else {
+ GUEST_ASSERT(prepare_for_vmx_operation(vmx_pages));
+ GUEST_ASSERT(load_vmcs(vmx_pages));
+ prepare_vmcs(vmx_pages, l2_guest_code,
+ &l2_guest_stack[L2_GUEST_STACK_SIZE]);
+ }
sync_with_host(5);
self_smi();
sync_with_host(7);
+
+ if (this_cpu_has(X86_FEATURE_SVM)) {
+ run_guest(svm->vmcb, svm->vmcb_gpa);
+ run_guest(svm->vmcb, svm->vmcb_gpa);
+ } else {
+ vmlaunch();
+ vmresume();
+ }
+
+ /* Stages 8-11 are eaten by SMM (SMRAM_STAGE reported instead) */
+ sync_with_host(12);
}
sync_with_host(DONE);
}
+void inject_smi(struct kvm_vcpu *vcpu)
+{
+ struct kvm_vcpu_events events;
+
+ vcpu_events_get(vcpu, &events);
+
+ events.smi.pending = 1;
+ events.flags |= KVM_VCPUEVENT_VALID_SMM;
+
+ vcpu_events_set(vcpu, &events);
+}
+
int main(int argc, char *argv[])
{
vm_vaddr_t nested_gva = 0;
+ struct kvm_vcpu *vcpu;
struct kvm_regs regs;
struct kvm_vm *vm;
- struct kvm_run *run;
struct kvm_x86_state *state;
int stage, stage_reported;
- /* Create VM */
- vm = vm_create_default(VCPU_ID, 0, guest_code);
-
- vcpu_set_cpuid(vm, VCPU_ID, kvm_get_supported_cpuid());
+ TEST_REQUIRE(kvm_has_cap(KVM_CAP_X86_SMM));
- run = vcpu_state(vm, VCPU_ID);
+ /* Create VM */
+ vm = vm_create_with_one_vcpu(&vcpu, guest_code);
vm_userspace_mem_region_add(vm, VM_MEM_SRC_ANONYMOUS, SMRAM_GPA,
SMRAM_MEMSLOT, SMRAM_PAGES, 0);
@@ -115,29 +150,26 @@ int main(int argc, char *argv[])
memcpy(addr_gpa2hva(vm, SMRAM_GPA) + 0x8000, smi_handler,
sizeof(smi_handler));
- vcpu_set_msr(vm, VCPU_ID, MSR_IA32_SMBASE, SMRAM_GPA);
+ vcpu_set_msr(vcpu, MSR_IA32_SMBASE, SMRAM_GPA);
- if (kvm_check_cap(KVM_CAP_NESTED_STATE)) {
- if (nested_svm_supported())
+ if (kvm_has_cap(KVM_CAP_NESTED_STATE)) {
+ if (kvm_cpu_has(X86_FEATURE_SVM))
vcpu_alloc_svm(vm, &nested_gva);
- else if (nested_vmx_supported())
+ else if (kvm_cpu_has(X86_FEATURE_VMX))
vcpu_alloc_vmx(vm, &nested_gva);
}
if (!nested_gva)
pr_info("will skip SMM test with VMX enabled\n");
- vcpu_args_set(vm, VCPU_ID, 1, nested_gva);
+ vcpu_args_set(vcpu, 1, nested_gva);
for (stage = 1;; stage++) {
- _vcpu_run(vm, VCPU_ID);
- TEST_ASSERT(run->exit_reason == KVM_EXIT_IO,
- "Stage %d: unexpected exit reason: %u (%s),\n",
- stage, run->exit_reason,
- exit_reason_str(run->exit_reason));
+ vcpu_run(vcpu);
+ TEST_ASSERT_KVM_EXIT_REASON(vcpu, KVM_EXIT_IO);
memset(&regs, 0, sizeof(regs));
- vcpu_regs_get(vm, VCPU_ID, &regs);
+ vcpu_regs_get(vcpu, &regs);
stage_reported = regs.rax & 0xff;
@@ -149,14 +181,28 @@ int main(int argc, char *argv[])
"Unexpected stage: #%x, got %x",
stage, stage_reported);
- state = vcpu_save_state(vm, VCPU_ID);
+ /*
+ * Enter SMM during L2 execution and check that we correctly
+ * return from it. Do not perform save/restore while in SMM yet.
+ */
+ if (stage == 8) {
+ inject_smi(vcpu);
+ continue;
+ }
+
+ /*
+ * Perform save/restore while the guest is in SMM triggered
+ * during L2 execution.
+ */
+ if (stage == 10)
+ inject_smi(vcpu);
+
+ state = vcpu_save_state(vcpu);
kvm_vm_release(vm);
- kvm_vm_restart(vm, O_RDWR);
- vm_vcpu_add(vm, VCPU_ID);
- vcpu_set_cpuid(vm, VCPU_ID, kvm_get_supported_cpuid());
- vcpu_load_state(vm, VCPU_ID, state);
- run = vcpu_state(vm, VCPU_ID);
- free(state);
+
+ vcpu = vm_recreate_with_one_vcpu(vm);
+ vcpu_load_state(vcpu, state);
+ kvm_x86_state_cleanup(state);
}
done:
diff --git a/tools/testing/selftests/kvm/x86_64/state_test.c b/tools/testing/selftests/kvm/x86_64/state_test.c
index f6c8b9042f8a..88b58aab7207 100644
--- a/tools/testing/selftests/kvm/x86_64/state_test.c
+++ b/tools/testing/selftests/kvm/x86_64/state_test.c
@@ -20,7 +20,6 @@
#include "vmx.h"
#include "svm_util.h"
-#define VCPU_ID 5
#define L2_GUEST_STACK_SIZE 256
void svm_l2_guest_code(void)
@@ -140,10 +139,87 @@ static void vmx_l1_guest_code(struct vmx_pages *vmx_pages)
static void __attribute__((__flatten__)) guest_code(void *arg)
{
GUEST_SYNC(1);
+
+ if (this_cpu_has(X86_FEATURE_XSAVE)) {
+ uint64_t supported_xcr0 = this_cpu_supported_xcr0();
+ uint8_t buffer[4096];
+
+ memset(buffer, 0xcc, sizeof(buffer));
+
+ set_cr4(get_cr4() | X86_CR4_OSXSAVE);
+ GUEST_ASSERT(this_cpu_has(X86_FEATURE_OSXSAVE));
+
+ xsetbv(0, xgetbv(0) | supported_xcr0);
+
+ /*
+ * Modify state for all supported xfeatures to take them out of
+ * their "init" state, i.e. to make them show up in XSTATE_BV.
+ *
+ * Note off-by-default features, e.g. AMX, are out of scope for
+ * this particular testcase as they have a different ABI.
+ */
+ GUEST_ASSERT(supported_xcr0 & XFEATURE_MASK_FP);
+ asm volatile ("fincstp");
+
+ GUEST_ASSERT(supported_xcr0 & XFEATURE_MASK_SSE);
+ asm volatile ("vmovdqu %0, %%xmm0" :: "m" (buffer));
+
+ if (supported_xcr0 & XFEATURE_MASK_YMM)
+ asm volatile ("vmovdqu %0, %%ymm0" :: "m" (buffer));
+
+ if (supported_xcr0 & XFEATURE_MASK_AVX512) {
+ asm volatile ("kmovq %0, %%k1" :: "r" (-1ull));
+ asm volatile ("vmovupd %0, %%zmm0" :: "m" (buffer));
+ asm volatile ("vmovupd %0, %%zmm16" :: "m" (buffer));
+ }
+
+ if (this_cpu_has(X86_FEATURE_MPX)) {
+ uint64_t bounds[2] = { 10, 0xffffffffull };
+ uint64_t output[2] = { };
+
+ GUEST_ASSERT(supported_xcr0 & XFEATURE_MASK_BNDREGS);
+ GUEST_ASSERT(supported_xcr0 & XFEATURE_MASK_BNDCSR);
+
+ /*
+ * Don't bother trying to get BNDCSR into the INUSE
+ * state. MSR_IA32_BNDCFGS doesn't count as it isn't
+ * managed via XSAVE/XRSTOR, and BNDCFGU can only be
+ * modified by XRSTOR. Stuffing XSTATE_BV in the host
+ * is simpler than doing XRSTOR here in the guest.
+ *
+ * However, temporarily enable MPX in BNDCFGS so that
+ * BNDMOV actually loads BND1. If MPX isn't *fully*
+ * enabled, all MPX instructions are treated as NOPs.
+ *
+ * Hand encode "bndmov (%rax),%bnd1" as support for MPX
+ * mnemonics/registers has been removed from gcc and
+ * clang (and was never fully supported by clang).
+ */
+ wrmsr(MSR_IA32_BNDCFGS, BIT_ULL(0));
+ asm volatile (".byte 0x66,0x0f,0x1a,0x08" :: "a" (bounds));
+ /*
+ * Hand encode "bndmov %bnd1, (%rax)" to sanity check
+ * that BND1 actually got loaded.
+ */
+ asm volatile (".byte 0x66,0x0f,0x1b,0x08" :: "a" (output));
+ wrmsr(MSR_IA32_BNDCFGS, 0);
+
+ GUEST_ASSERT_EQ(bounds[0], output[0]);
+ GUEST_ASSERT_EQ(bounds[1], output[1]);
+ }
+ if (this_cpu_has(X86_FEATURE_PKU)) {
+ GUEST_ASSERT(supported_xcr0 & XFEATURE_MASK_PKRU);
+ set_cr4(get_cr4() | X86_CR4_PKE);
+ GUEST_ASSERT(this_cpu_has(X86_FEATURE_OSPKE));
+
+ wrpkru(-1u);
+ }
+ }
+
GUEST_SYNC(2);
if (arg) {
- if (cpu_has_svm())
+ if (this_cpu_has(X86_FEATURE_SVM))
svm_l1_guest_code(arg);
else
vmx_l1_guest_code(arg);
@@ -154,45 +230,40 @@ static void __attribute__((__flatten__)) guest_code(void *arg)
int main(int argc, char *argv[])
{
+ uint64_t *xstate_bv, saved_xstate_bv;
vm_vaddr_t nested_gva = 0;
-
+ struct kvm_cpuid2 empty_cpuid = {};
struct kvm_regs regs1, regs2;
+ struct kvm_vcpu *vcpu, *vcpuN;
struct kvm_vm *vm;
- struct kvm_run *run;
struct kvm_x86_state *state;
struct ucall uc;
int stage;
/* Create VM */
- vm = vm_create_default(VCPU_ID, 0, guest_code);
- vcpu_set_cpuid(vm, VCPU_ID, kvm_get_supported_cpuid());
- run = vcpu_state(vm, VCPU_ID);
+ vm = vm_create_with_one_vcpu(&vcpu, guest_code);
- vcpu_regs_get(vm, VCPU_ID, &regs1);
+ vcpu_regs_get(vcpu, &regs1);
- if (kvm_check_cap(KVM_CAP_NESTED_STATE)) {
- if (nested_svm_supported())
+ if (kvm_has_cap(KVM_CAP_NESTED_STATE)) {
+ if (kvm_cpu_has(X86_FEATURE_SVM))
vcpu_alloc_svm(vm, &nested_gva);
- else if (nested_vmx_supported())
+ else if (kvm_cpu_has(X86_FEATURE_VMX))
vcpu_alloc_vmx(vm, &nested_gva);
}
if (!nested_gva)
pr_info("will skip nested state checks\n");
- vcpu_args_set(vm, VCPU_ID, 1, nested_gva);
+ vcpu_args_set(vcpu, 1, nested_gva);
for (stage = 1;; stage++) {
- _vcpu_run(vm, VCPU_ID);
- TEST_ASSERT(run->exit_reason == KVM_EXIT_IO,
- "Stage %d: unexpected exit reason: %u (%s),\n",
- stage, run->exit_reason,
- exit_reason_str(run->exit_reason));
+ vcpu_run(vcpu);
+ TEST_ASSERT_KVM_EXIT_REASON(vcpu, KVM_EXIT_IO);
- switch (get_ucall(vm, VCPU_ID, &uc)) {
+ switch (get_ucall(vcpu, &uc)) {
case UCALL_ABORT:
- TEST_FAIL("%s at %s:%ld", (const char *)uc.args[0],
- __FILE__, uc.args[1]);
+ REPORT_GUEST_ASSERT(uc);
/* NOT REACHED */
case UCALL_SYNC:
break;
@@ -207,22 +278,47 @@ int main(int argc, char *argv[])
uc.args[1] == stage, "Stage %d: Unexpected register values vmexit, got %lx",
stage, (ulong)uc.args[1]);
- state = vcpu_save_state(vm, VCPU_ID);
+ state = vcpu_save_state(vcpu);
memset(&regs1, 0, sizeof(regs1));
- vcpu_regs_get(vm, VCPU_ID, &regs1);
+ vcpu_regs_get(vcpu, &regs1);
kvm_vm_release(vm);
/* Restore state in a new VM. */
- kvm_vm_restart(vm, O_RDWR);
- vm_vcpu_add(vm, VCPU_ID);
- vcpu_set_cpuid(vm, VCPU_ID, kvm_get_supported_cpuid());
- vcpu_load_state(vm, VCPU_ID, state);
- run = vcpu_state(vm, VCPU_ID);
- free(state);
+ vcpu = vm_recreate_with_one_vcpu(vm);
+ vcpu_load_state(vcpu, state);
+
+ /*
+ * Restore XSAVE state in a dummy vCPU, first without doing
+ * KVM_SET_CPUID2, and then with an empty guest CPUID. Except
+ * for off-by-default xfeatures, e.g. AMX, KVM is supposed to
+ * allow KVM_SET_XSAVE regardless of guest CPUID. Manually
+ * load only XSAVE state, MSRs in particular have a much more
+ * convoluted ABI.
+ *
+ * Load two versions of XSAVE state: one with the actual guest
+ * XSAVE state, and one with all supported features forced "on"
+ * in xstate_bv, e.g. to ensure that KVM allows loading all
+ * supported features, even if something goes awry in saving
+ * the original snapshot.
+ */
+ xstate_bv = (void *)&((uint8_t *)state->xsave->region)[512];
+ saved_xstate_bv = *xstate_bv;
+
+ vcpuN = __vm_vcpu_add(vm, vcpu->id + 1);
+ vcpu_xsave_set(vcpuN, state->xsave);
+ *xstate_bv = kvm_cpu_supported_xcr0();
+ vcpu_xsave_set(vcpuN, state->xsave);
+
+ vcpu_init_cpuid(vcpuN, &empty_cpuid);
+ vcpu_xsave_set(vcpuN, state->xsave);
+ *xstate_bv = saved_xstate_bv;
+ vcpu_xsave_set(vcpuN, state->xsave);
+
+ kvm_x86_state_cleanup(state);
memset(&regs2, 0, sizeof(regs2));
- vcpu_regs_get(vm, VCPU_ID, &regs2);
+ vcpu_regs_get(vcpu, &regs2);
TEST_ASSERT(!memcmp(&regs1, &regs2, sizeof(regs2)),
"Unexpected register values after vcpu_load_state; rdi: %lx rsi: %lx",
(ulong) regs2.rdi, (ulong) regs2.rsi);
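
The pointer arithmetic on state->xsave->region above depends on the architectural XSAVE layout: the 512-byte legacy FXSAVE area is followed by the 64-byte XSAVE header, whose first 8 bytes are XSTATE_BV. A hedged helper that extracts the same field from a struct kvm_xsave as returned by KVM_GET_XSAVE (the helper name is illustrative):

    #include <stdint.h>
    #include <string.h>
    #include <linux/kvm.h>

    /* XSTATE_BV lives in the first 8 bytes of the XSAVE header at byte offset 512. */
    static uint64_t xsave_get_xstate_bv(const struct kvm_xsave *xsave)
    {
            uint64_t xstate_bv;

            memcpy(&xstate_bv, (const uint8_t *)xsave->region + 512, sizeof(xstate_bv));
            return xstate_bv;
    }
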
diff --git a/tools/testing/selftests/kvm/x86_64/svm_int_ctl_test.c b/tools/testing/selftests/kvm/x86_64/svm_int_ctl_test.c
new file mode 100644
index 000000000000..32bef39bec21
--- /dev/null
+++ b/tools/testing/selftests/kvm/x86_64/svm_int_ctl_test.c
@@ -0,0 +1,121 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * svm_int_ctl_test
+ *
+ * Copyright (C) 2021, Red Hat, Inc.
+ *
+ * Nested SVM testing: test simultaneous use of V_IRQ from L1 and L0.
+ */
+
+#include "test_util.h"
+#include "kvm_util.h"
+#include "processor.h"
+#include "svm_util.h"
+#include "apic.h"
+
+bool vintr_irq_called;
+bool intr_irq_called;
+
+#define VINTR_IRQ_NUMBER 0x20
+#define INTR_IRQ_NUMBER 0x30
+
+static void vintr_irq_handler(struct ex_regs *regs)
+{
+ vintr_irq_called = true;
+}
+
+static void intr_irq_handler(struct ex_regs *regs)
+{
+ x2apic_write_reg(APIC_EOI, 0x00);
+ intr_irq_called = true;
+}
+
+static void l2_guest_code(struct svm_test_data *svm)
+{
+ /* This code raises interrupt INTR_IRQ_NUMBER in the L1's LAPIC,
+ * and since L1 didn't enable virtual interrupt masking,
+ * L2 should receive it and not L1.
+ *
+ * L2 also has virtual interrupt 'VINTR_IRQ_NUMBER' pending in V_IRQ
+ * so it should also receive it after the following 'sti'.
+ */
+ x2apic_write_reg(APIC_ICR,
+ APIC_DEST_SELF | APIC_INT_ASSERT | INTR_IRQ_NUMBER);
+
+ __asm__ __volatile__(
+ "sti\n"
+ "nop\n"
+ );
+
+ GUEST_ASSERT(vintr_irq_called);
+ GUEST_ASSERT(intr_irq_called);
+
+ __asm__ __volatile__(
+ "vmcall\n"
+ );
+}
+
+static void l1_guest_code(struct svm_test_data *svm)
+{
+ #define L2_GUEST_STACK_SIZE 64
+ unsigned long l2_guest_stack[L2_GUEST_STACK_SIZE];
+ struct vmcb *vmcb = svm->vmcb;
+
+ x2apic_enable();
+
+ /* Prepare for L2 execution. */
+ generic_svm_setup(svm, l2_guest_code,
+ &l2_guest_stack[L2_GUEST_STACK_SIZE]);
+
+ /* No virtual interrupt masking */
+ vmcb->control.int_ctl &= ~V_INTR_MASKING_MASK;
+
+ /* No intercepts for real and virtual interrupts */
+ vmcb->control.intercept &= ~(BIT(INTERCEPT_INTR) | BIT(INTERCEPT_VINTR));
+
+ /* Make a virtual interrupt VINTR_IRQ_NUMBER pending */
+ vmcb->control.int_ctl |= V_IRQ_MASK | (0x1 << V_INTR_PRIO_SHIFT);
+ vmcb->control.int_vector = VINTR_IRQ_NUMBER;
+
+ run_guest(vmcb, svm->vmcb_gpa);
+ GUEST_ASSERT(vmcb->control.exit_code == SVM_EXIT_VMMCALL);
+ GUEST_DONE();
+}
+
+int main(int argc, char *argv[])
+{
+ struct kvm_vcpu *vcpu;
+ vm_vaddr_t svm_gva;
+ struct kvm_vm *vm;
+ struct ucall uc;
+
+ TEST_REQUIRE(kvm_cpu_has(X86_FEATURE_SVM));
+
+ vm = vm_create_with_one_vcpu(&vcpu, l1_guest_code);
+
+ vm_init_descriptor_tables(vm);
+ vcpu_init_descriptor_tables(vcpu);
+
+ vm_install_exception_handler(vm, VINTR_IRQ_NUMBER, vintr_irq_handler);
+ vm_install_exception_handler(vm, INTR_IRQ_NUMBER, intr_irq_handler);
+
+ vcpu_alloc_svm(vm, &svm_gva);
+ vcpu_args_set(vcpu, 1, svm_gva);
+
+ vcpu_run(vcpu);
+ TEST_ASSERT_KVM_EXIT_REASON(vcpu, KVM_EXIT_IO);
+
+ switch (get_ucall(vcpu, &uc)) {
+ case UCALL_ABORT:
+ REPORT_GUEST_ASSERT(uc);
+ break;
+ /* NOT REACHED */
+ case UCALL_DONE:
+ goto done;
+ default:
+ TEST_FAIL("Unknown ucall 0x%lx.", uc.cmd);
+ }
+done:
+ kvm_vm_free(vm);
+ return 0;
+}
diff --git a/tools/testing/selftests/kvm/x86_64/svm_nested_shutdown_test.c b/tools/testing/selftests/kvm/x86_64/svm_nested_shutdown_test.c
new file mode 100644
index 000000000000..d6fcdcc3af31
--- /dev/null
+++ b/tools/testing/selftests/kvm/x86_64/svm_nested_shutdown_test.c
@@ -0,0 +1,62 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * svm_nested_shutdown_test
+ *
+ * Copyright (C) 2022, Red Hat, Inc.
+ *
+ * Nested SVM testing: test that unintercepted shutdown in L2 doesn't crash the host
+ */
+
+#include "test_util.h"
+#include "kvm_util.h"
+#include "processor.h"
+#include "svm_util.h"
+
+static void l2_guest_code(struct svm_test_data *svm)
+{
+ __asm__ __volatile__("ud2");
+}
+
+static void l1_guest_code(struct svm_test_data *svm, struct idt_entry *idt)
+{
+ #define L2_GUEST_STACK_SIZE 64
+ unsigned long l2_guest_stack[L2_GUEST_STACK_SIZE];
+ struct vmcb *vmcb = svm->vmcb;
+
+ generic_svm_setup(svm, l2_guest_code,
+ &l2_guest_stack[L2_GUEST_STACK_SIZE]);
+
+ vmcb->control.intercept &= ~(BIT(INTERCEPT_SHUTDOWN));
+
+ idt[6].p = 0; // #UD is intercepted but its injection will cause #NP
+ idt[11].p = 0; // #NP is not intercepted and will cause another
+ // #NP that will be converted to #DF
+ idt[8].p = 0; // #DF will cause #NP which will cause SHUTDOWN
+
+ run_guest(vmcb, svm->vmcb_gpa);
+
+ /* should not reach here */
+ GUEST_ASSERT(0);
+}
+
+int main(int argc, char *argv[])
+{
+ struct kvm_vcpu *vcpu;
+ vm_vaddr_t svm_gva;
+ struct kvm_vm *vm;
+
+ TEST_REQUIRE(kvm_cpu_has(X86_FEATURE_SVM));
+
+ vm = vm_create_with_one_vcpu(&vcpu, l1_guest_code);
+ vm_init_descriptor_tables(vm);
+ vcpu_init_descriptor_tables(vcpu);
+
+ vcpu_alloc_svm(vm, &svm_gva);
+
+ vcpu_args_set(vcpu, 2, svm_gva, vm->idt);
+
+ vcpu_run(vcpu);
+ TEST_ASSERT_KVM_EXIT_REASON(vcpu, KVM_EXIT_SHUTDOWN);
+
+ kvm_vm_free(vm);
+}
diff --git a/tools/testing/selftests/kvm/x86_64/svm_nested_soft_inject_test.c b/tools/testing/selftests/kvm/x86_64/svm_nested_soft_inject_test.c
new file mode 100644
index 000000000000..0c7ce3d4e83a
--- /dev/null
+++ b/tools/testing/selftests/kvm/x86_64/svm_nested_soft_inject_test.c
@@ -0,0 +1,213 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Copyright (C) 2022 Oracle and/or its affiliates.
+ *
+ * Based on:
+ * svm_int_ctl_test
+ *
+ * Copyright (C) 2021, Red Hat, Inc.
+ *
+ */
+#include <stdatomic.h>
+#include <stdio.h>
+#include <unistd.h>
+#include "apic.h"
+#include "kvm_util.h"
+#include "processor.h"
+#include "svm_util.h"
+#include "test_util.h"
+
+#define INT_NR 0x20
+
+static_assert(ATOMIC_INT_LOCK_FREE == 2, "atomic int is not lockless");
+
+static unsigned int bp_fired;
+static void guest_bp_handler(struct ex_regs *regs)
+{
+ bp_fired++;
+}
+
+static unsigned int int_fired;
+static void l2_guest_code_int(void);
+
+static void guest_int_handler(struct ex_regs *regs)
+{
+ int_fired++;
+ GUEST_ASSERT_EQ(regs->rip, (unsigned long)l2_guest_code_int);
+}
+
+static void l2_guest_code_int(void)
+{
+ GUEST_ASSERT_EQ(int_fired, 1);
+
+ /*
+ * Same as the vmmcall() function, but with a ud2 sneaked after the
+ * vmmcall. The caller injects an exception with the return address
+ * increased by 2, so the "pop rbp" must be after the ud2 and we cannot
+ * use vmmcall() directly.
+ */
+ __asm__ __volatile__("push %%rbp; vmmcall; ud2; pop %%rbp"
+ : : "a"(0xdeadbeef), "c"(0xbeefdead)
+ : "rbx", "rdx", "rsi", "rdi", "r8", "r9",
+ "r10", "r11", "r12", "r13", "r14", "r15");
+
+ GUEST_ASSERT_EQ(bp_fired, 1);
+ hlt();
+}
+
+static atomic_int nmi_stage;
+#define nmi_stage_get() atomic_load_explicit(&nmi_stage, memory_order_acquire)
+#define nmi_stage_inc() atomic_fetch_add_explicit(&nmi_stage, 1, memory_order_acq_rel)
+static void guest_nmi_handler(struct ex_regs *regs)
+{
+ nmi_stage_inc();
+
+ if (nmi_stage_get() == 1) {
+ vmmcall();
+ GUEST_FAIL("Unexpected resume after VMMCALL");
+ } else {
+ GUEST_ASSERT_EQ(nmi_stage_get(), 3);
+ GUEST_DONE();
+ }
+}
+
+static void l2_guest_code_nmi(void)
+{
+ ud2();
+}
+
+static void l1_guest_code(struct svm_test_data *svm, uint64_t is_nmi, uint64_t idt_alt)
+{
+ #define L2_GUEST_STACK_SIZE 64
+ unsigned long l2_guest_stack[L2_GUEST_STACK_SIZE];
+ struct vmcb *vmcb = svm->vmcb;
+
+ if (is_nmi)
+ x2apic_enable();
+
+ /* Prepare for L2 execution. */
+ generic_svm_setup(svm,
+ is_nmi ? l2_guest_code_nmi : l2_guest_code_int,
+ &l2_guest_stack[L2_GUEST_STACK_SIZE]);
+
+ vmcb->control.intercept_exceptions |= BIT(PF_VECTOR) | BIT(UD_VECTOR);
+ vmcb->control.intercept |= BIT(INTERCEPT_NMI) | BIT(INTERCEPT_HLT);
+
+ if (is_nmi) {
+ vmcb->control.event_inj = SVM_EVTINJ_VALID | SVM_EVTINJ_TYPE_NMI;
+ } else {
+ vmcb->control.event_inj = INT_NR | SVM_EVTINJ_VALID | SVM_EVTINJ_TYPE_SOFT;
+ /* The return address pushed on stack */
+ vmcb->control.next_rip = vmcb->save.rip;
+ }
+
+ run_guest(vmcb, svm->vmcb_gpa);
+ __GUEST_ASSERT(vmcb->control.exit_code == SVM_EXIT_VMMCALL,
+		       "Expected VMMCALL #VMEXIT, got '0x%x', info1 = '0x%lx', info2 = '0x%lx'",
+ vmcb->control.exit_code,
+ vmcb->control.exit_info_1, vmcb->control.exit_info_2);
+
+ if (is_nmi) {
+ clgi();
+ x2apic_write_reg(APIC_ICR, APIC_DEST_SELF | APIC_INT_ASSERT | APIC_DM_NMI);
+
+ GUEST_ASSERT_EQ(nmi_stage_get(), 1);
+ nmi_stage_inc();
+
+ stgi();
+ /* self-NMI happens here */
+ while (true)
+ cpu_relax();
+ }
+
+ /* Skip over VMMCALL */
+ vmcb->save.rip += 3;
+
+ /* Switch to alternate IDT to cause intervening NPF again */
+ vmcb->save.idtr.base = idt_alt;
+ vmcb->control.clean = 0; /* &= ~BIT(VMCB_DT) would be enough */
+
+ vmcb->control.event_inj = BP_VECTOR | SVM_EVTINJ_VALID | SVM_EVTINJ_TYPE_EXEPT;
+ /* The return address pushed on stack, skip over UD2 */
+ vmcb->control.next_rip = vmcb->save.rip + 2;
+
+ run_guest(vmcb, svm->vmcb_gpa);
+ __GUEST_ASSERT(vmcb->control.exit_code == SVM_EXIT_HLT,
+		       "Expected HLT #VMEXIT, got '0x%x', info1 = '0x%lx', info2 = '0x%lx'",
+ vmcb->control.exit_code,
+ vmcb->control.exit_info_1, vmcb->control.exit_info_2);
+
+ GUEST_DONE();
+}
+
+static void run_test(bool is_nmi)
+{
+ struct kvm_vcpu *vcpu;
+ struct kvm_vm *vm;
+ vm_vaddr_t svm_gva;
+ vm_vaddr_t idt_alt_vm;
+ struct kvm_guest_debug debug;
+
+ pr_info("Running %s test\n", is_nmi ? "NMI" : "soft int");
+
+ vm = vm_create_with_one_vcpu(&vcpu, l1_guest_code);
+
+ vm_init_descriptor_tables(vm);
+ vcpu_init_descriptor_tables(vcpu);
+
+ vm_install_exception_handler(vm, NMI_VECTOR, guest_nmi_handler);
+ vm_install_exception_handler(vm, BP_VECTOR, guest_bp_handler);
+ vm_install_exception_handler(vm, INT_NR, guest_int_handler);
+
+ vcpu_alloc_svm(vm, &svm_gva);
+
+ if (!is_nmi) {
+ void *idt, *idt_alt;
+
+ idt_alt_vm = vm_vaddr_alloc_page(vm);
+ idt_alt = addr_gva2hva(vm, idt_alt_vm);
+ idt = addr_gva2hva(vm, vm->idt);
+ memcpy(idt_alt, idt, getpagesize());
+ } else {
+ idt_alt_vm = 0;
+ }
+ vcpu_args_set(vcpu, 3, svm_gva, (uint64_t)is_nmi, (uint64_t)idt_alt_vm);
+
+ memset(&debug, 0, sizeof(debug));
+ vcpu_guest_debug_set(vcpu, &debug);
+
+ struct ucall uc;
+
+ alarm(2);
+ vcpu_run(vcpu);
+ alarm(0);
+ TEST_ASSERT_KVM_EXIT_REASON(vcpu, KVM_EXIT_IO);
+
+ switch (get_ucall(vcpu, &uc)) {
+ case UCALL_ABORT:
+ REPORT_GUEST_ASSERT(uc);
+ break;
+ /* NOT REACHED */
+ case UCALL_DONE:
+ goto done;
+ default:
+ TEST_FAIL("Unknown ucall 0x%lx.", uc.cmd);
+ }
+done:
+ kvm_vm_free(vm);
+}
+
+int main(int argc, char *argv[])
+{
+ TEST_REQUIRE(kvm_cpu_has(X86_FEATURE_SVM));
+
+ TEST_ASSERT(kvm_cpu_has(X86_FEATURE_NRIPS),
+ "KVM with nSVM is supposed to unconditionally advertise nRIP Save");
+
+ atomic_init(&nmi_stage, 0);
+
+ run_test(false);
+ run_test(true);
+
+ return 0;
+}
diff --git a/tools/testing/selftests/kvm/x86_64/svm_vmcall_test.c b/tools/testing/selftests/kvm/x86_64/svm_vmcall_test.c
index 0e1adb4e3199..8a62cca28cfb 100644
--- a/tools/testing/selftests/kvm/x86_64/svm_vmcall_test.c
+++ b/tools/testing/selftests/kvm/x86_64/svm_vmcall_test.c
@@ -12,10 +12,6 @@
#include "processor.h"
#include "svm_util.h"
-#define VCPU_ID 5
-
-static struct kvm_vm *vm;
-
static void l2_guest_code(struct svm_test_data *svm)
{
__asm__ __volatile__("vmcall");
@@ -39,29 +35,26 @@ static void l1_guest_code(struct svm_test_data *svm)
int main(int argc, char *argv[])
{
+ struct kvm_vcpu *vcpu;
vm_vaddr_t svm_gva;
+ struct kvm_vm *vm;
- nested_svm_check_supported();
+ TEST_REQUIRE(kvm_cpu_has(X86_FEATURE_SVM));
- vm = vm_create_default(VCPU_ID, 0, (void *) l1_guest_code);
- vcpu_set_cpuid(vm, VCPU_ID, kvm_get_supported_cpuid());
+ vm = vm_create_with_one_vcpu(&vcpu, l1_guest_code);
vcpu_alloc_svm(vm, &svm_gva);
- vcpu_args_set(vm, VCPU_ID, 1, svm_gva);
+ vcpu_args_set(vcpu, 1, svm_gva);
for (;;) {
- volatile struct kvm_run *run = vcpu_state(vm, VCPU_ID);
struct ucall uc;
- vcpu_run(vm, VCPU_ID);
- TEST_ASSERT(run->exit_reason == KVM_EXIT_IO,
- "Got exit_reason other than KVM_EXIT_IO: %u (%s)\n",
- run->exit_reason,
- exit_reason_str(run->exit_reason));
+ vcpu_run(vcpu);
+ TEST_ASSERT_KVM_EXIT_REASON(vcpu, KVM_EXIT_IO);
- switch (get_ucall(vm, VCPU_ID, &uc)) {
+ switch (get_ucall(vcpu, &uc)) {
case UCALL_ABORT:
- TEST_FAIL("%s", (const char *)uc.args[0]);
+ REPORT_GUEST_ASSERT(uc);
/* NOT REACHED */
case UCALL_SYNC:
break;
diff --git a/tools/testing/selftests/kvm/x86_64/sync_regs_test.c b/tools/testing/selftests/kvm/x86_64/sync_regs_test.c
index d672f0a473f8..adb5593daf48 100644
--- a/tools/testing/selftests/kvm/x86_64/sync_regs_test.c
+++ b/tools/testing/selftests/kvm/x86_64/sync_regs_test.c
@@ -15,15 +15,19 @@
#include <stdlib.h>
#include <string.h>
#include <sys/ioctl.h>
+#include <pthread.h>
+#include "kvm_test_harness.h"
#include "test_util.h"
#include "kvm_util.h"
#include "processor.h"
-#define VCPU_ID 5
-
#define UCALL_PIO_PORT ((uint16_t)0x1000)
+struct ucall uc_none = {
+ .cmd = UCALL_NONE,
+};
+
/*
* ucall is embedded here to protect against compiler reshuffling registers
* before calling a function. In this test we only need to get KVM_EXIT_IO
@@ -34,15 +38,18 @@ void guest_code(void)
asm volatile("1: in %[port], %%al\n"
"add $0x1, %%rbx\n"
"jmp 1b"
- : : [port] "d" (UCALL_PIO_PORT) : "rax", "rbx");
+ : : [port] "d" (UCALL_PIO_PORT), "D" (&uc_none)
+ : "rax", "rbx");
}
+KVM_ONE_VCPU_TEST_SUITE(sync_regs_test);
+
static void compare_regs(struct kvm_regs *left, struct kvm_regs *right)
{
#define REG_COMPARE(reg) \
TEST_ASSERT(left->reg == right->reg, \
"Register " #reg \
- " values did not match: 0x%llx, 0x%llx\n", \
+ " values did not match: 0x%llx, 0x%llx", \
left->reg, right->reg)
REG_COMPARE(rax);
REG_COMPARE(rbx);
@@ -77,80 +84,205 @@ static void compare_vcpu_events(struct kvm_vcpu_events *left,
#define TEST_SYNC_FIELDS (KVM_SYNC_X86_REGS|KVM_SYNC_X86_SREGS|KVM_SYNC_X86_EVENTS)
#define INVALID_SYNC_FIELD 0x80000000
-int main(int argc, char *argv[])
+/*
+ * Set an exception as pending *and* injected while KVM is processing events.
+ * KVM is supposed to ignore/drop pending exceptions if userspace is also
+ * requesting that an exception be injected.
+ */
+static void *race_events_inj_pen(void *arg)
{
- struct kvm_vm *vm;
- struct kvm_run *run;
- struct kvm_regs regs;
- struct kvm_sregs sregs;
- struct kvm_vcpu_events events;
- int rv, cap;
+ struct kvm_run *run = (struct kvm_run *)arg;
+ struct kvm_vcpu_events *events = &run->s.regs.events;
- /* Tell stdout not to buffer its content */
- setbuf(stdout, NULL);
+ WRITE_ONCE(events->exception.nr, UD_VECTOR);
- cap = kvm_check_cap(KVM_CAP_SYNC_REGS);
- if ((cap & TEST_SYNC_FIELDS) != TEST_SYNC_FIELDS) {
- print_skip("KVM_CAP_SYNC_REGS not supported");
- exit(KSFT_SKIP);
+ for (;;) {
+ WRITE_ONCE(run->kvm_dirty_regs, KVM_SYNC_X86_EVENTS);
+ WRITE_ONCE(events->flags, 0);
+ WRITE_ONCE(events->exception.injected, 1);
+ WRITE_ONCE(events->exception.pending, 1);
+
+ pthread_testcancel();
+ }
+
+ return NULL;
+}
+
+/*
+ * Set an invalid exception vector while KVM is processing events. KVM is
+ * supposed to reject any vector >= 32, as well as NMIs (vector 2).
+ */
+static void *race_events_exc(void *arg)
+{
+ struct kvm_run *run = (struct kvm_run *)arg;
+ struct kvm_vcpu_events *events = &run->s.regs.events;
+
+ for (;;) {
+ WRITE_ONCE(run->kvm_dirty_regs, KVM_SYNC_X86_EVENTS);
+ WRITE_ONCE(events->flags, 0);
+ WRITE_ONCE(events->exception.nr, UD_VECTOR);
+ WRITE_ONCE(events->exception.pending, 1);
+ WRITE_ONCE(events->exception.nr, 255);
+
+ pthread_testcancel();
+ }
+
+ return NULL;
+}
+
+/*
+ * Toggle CR4.PAE while KVM is processing SREGS; EFER.LME=1 with CR4.PAE=0 is
+ * illegal, and KVM's MMU relies heavily on vCPU state being valid.
+ */
+static noinline void *race_sregs_cr4(void *arg)
+{
+ struct kvm_run *run = (struct kvm_run *)arg;
+ __u64 *cr4 = &run->s.regs.sregs.cr4;
+ __u64 pae_enabled = *cr4;
+ __u64 pae_disabled = *cr4 & ~X86_CR4_PAE;
+
+ for (;;) {
+ WRITE_ONCE(run->kvm_dirty_regs, KVM_SYNC_X86_SREGS);
+ WRITE_ONCE(*cr4, pae_enabled);
+ asm volatile(".rept 512\n\t"
+ "nop\n\t"
+ ".endr");
+ WRITE_ONCE(*cr4, pae_disabled);
+
+ pthread_testcancel();
}
- if ((cap & INVALID_SYNC_FIELD) != 0) {
- print_skip("The \"invalid\" field is not invalid");
- exit(KSFT_SKIP);
+
+ return NULL;
+}
+
+static void race_sync_regs(struct kvm_vcpu *vcpu, void *racer)
+{
+ const time_t TIMEOUT = 2; /* seconds, roughly */
+ struct kvm_x86_state *state;
+ struct kvm_translation tr;
+ struct kvm_run *run;
+ pthread_t thread;
+ time_t t;
+
+ run = vcpu->run;
+
+ run->kvm_valid_regs = KVM_SYNC_X86_SREGS;
+ vcpu_run(vcpu);
+ run->kvm_valid_regs = 0;
+
+ /* Save state *before* spawning the thread that mucks with vCPU state. */
+ state = vcpu_save_state(vcpu);
+
+ /*
+ * Selftests run 64-bit guests by default, so both EFER.LME and CR4.PAE
+ * should already be set in guest state.
+ */
+ TEST_ASSERT((run->s.regs.sregs.cr4 & X86_CR4_PAE) &&
+ (run->s.regs.sregs.efer & EFER_LME),
+ "vCPU should be in long mode, CR4.PAE=%d, EFER.LME=%d",
+ !!(run->s.regs.sregs.cr4 & X86_CR4_PAE),
+ !!(run->s.regs.sregs.efer & EFER_LME));
+
+ TEST_ASSERT_EQ(pthread_create(&thread, NULL, racer, (void *)run), 0);
+
+ for (t = time(NULL) + TIMEOUT; time(NULL) < t;) {
+ /*
+ * Reload known good state if the vCPU triple faults, e.g. due
+ * to the unhandled #GPs being injected. VMX preserves state
+ * on shutdown, but SVM synthesizes an INIT as the VMCB state
+ * is architecturally undefined on triple fault.
+ */
+ if (!__vcpu_run(vcpu) && run->exit_reason == KVM_EXIT_SHUTDOWN)
+ vcpu_load_state(vcpu, state);
+
+ if (racer == race_sregs_cr4) {
+ tr = (struct kvm_translation) { .linear_address = 0 };
+ __vcpu_ioctl(vcpu, KVM_TRANSLATE, &tr);
+ }
}
- /* Create VM */
- vm = vm_create_default(VCPU_ID, 0, guest_code);
+ TEST_ASSERT_EQ(pthread_cancel(thread), 0);
+ TEST_ASSERT_EQ(pthread_join(thread, NULL), 0);
- run = vcpu_state(vm, VCPU_ID);
+ kvm_x86_state_cleanup(state);
+}
+
+KVM_ONE_VCPU_TEST(sync_regs_test, read_invalid, guest_code)
+{
+ struct kvm_run *run = vcpu->run;
+ int rv;
/* Request reading invalid register set from VCPU. */
run->kvm_valid_regs = INVALID_SYNC_FIELD;
- rv = _vcpu_run(vm, VCPU_ID);
+ rv = _vcpu_run(vcpu);
TEST_ASSERT(rv < 0 && errno == EINVAL,
- "Invalid kvm_valid_regs did not cause expected KVM_RUN error: %d\n",
+ "Invalid kvm_valid_regs did not cause expected KVM_RUN error: %d",
rv);
- vcpu_state(vm, VCPU_ID)->kvm_valid_regs = 0;
+ run->kvm_valid_regs = 0;
run->kvm_valid_regs = INVALID_SYNC_FIELD | TEST_SYNC_FIELDS;
- rv = _vcpu_run(vm, VCPU_ID);
+ rv = _vcpu_run(vcpu);
TEST_ASSERT(rv < 0 && errno == EINVAL,
- "Invalid kvm_valid_regs did not cause expected KVM_RUN error: %d\n",
+ "Invalid kvm_valid_regs did not cause expected KVM_RUN error: %d",
rv);
- vcpu_state(vm, VCPU_ID)->kvm_valid_regs = 0;
+ run->kvm_valid_regs = 0;
+}
+
+KVM_ONE_VCPU_TEST(sync_regs_test, set_invalid, guest_code)
+{
+ struct kvm_run *run = vcpu->run;
+ int rv;
/* Request setting invalid register set into VCPU. */
run->kvm_dirty_regs = INVALID_SYNC_FIELD;
- rv = _vcpu_run(vm, VCPU_ID);
+ rv = _vcpu_run(vcpu);
TEST_ASSERT(rv < 0 && errno == EINVAL,
- "Invalid kvm_dirty_regs did not cause expected KVM_RUN error: %d\n",
+ "Invalid kvm_dirty_regs did not cause expected KVM_RUN error: %d",
rv);
- vcpu_state(vm, VCPU_ID)->kvm_dirty_regs = 0;
+ run->kvm_dirty_regs = 0;
run->kvm_dirty_regs = INVALID_SYNC_FIELD | TEST_SYNC_FIELDS;
- rv = _vcpu_run(vm, VCPU_ID);
+ rv = _vcpu_run(vcpu);
TEST_ASSERT(rv < 0 && errno == EINVAL,
- "Invalid kvm_dirty_regs did not cause expected KVM_RUN error: %d\n",
+ "Invalid kvm_dirty_regs did not cause expected KVM_RUN error: %d",
rv);
- vcpu_state(vm, VCPU_ID)->kvm_dirty_regs = 0;
+ run->kvm_dirty_regs = 0;
+}
+
+KVM_ONE_VCPU_TEST(sync_regs_test, req_and_verify_all_valid, guest_code)
+{
+ struct kvm_run *run = vcpu->run;
+ struct kvm_vcpu_events events;
+ struct kvm_sregs sregs;
+ struct kvm_regs regs;
/* Request and verify all valid register sets. */
/* TODO: BUILD TIME CHECK: TEST_ASSERT(KVM_SYNC_X86_NUM_FIELDS != 3); */
run->kvm_valid_regs = TEST_SYNC_FIELDS;
- rv = _vcpu_run(vm, VCPU_ID);
- TEST_ASSERT(run->exit_reason == KVM_EXIT_IO,
- "Unexpected exit reason: %u (%s),\n",
- run->exit_reason,
- exit_reason_str(run->exit_reason));
+ vcpu_run(vcpu);
+ TEST_ASSERT_KVM_EXIT_REASON(vcpu, KVM_EXIT_IO);
- vcpu_regs_get(vm, VCPU_ID, &regs);
+ vcpu_regs_get(vcpu, &regs);
compare_regs(&regs, &run->s.regs.regs);
- vcpu_sregs_get(vm, VCPU_ID, &sregs);
+ vcpu_sregs_get(vcpu, &sregs);
compare_sregs(&sregs, &run->s.regs.sregs);
- vcpu_events_get(vm, VCPU_ID, &events);
+ vcpu_events_get(vcpu, &events);
compare_vcpu_events(&events, &run->s.regs.events);
+}
+
+KVM_ONE_VCPU_TEST(sync_regs_test, set_and_verify_various, guest_code)
+{
+ struct kvm_run *run = vcpu->run;
+ struct kvm_vcpu_events events;
+ struct kvm_sregs sregs;
+ struct kvm_regs regs;
+
+ /* Run once to get register set */
+ run->kvm_valid_regs = TEST_SYNC_FIELDS;
+ vcpu_run(vcpu);
+ TEST_ASSERT_KVM_EXIT_REASON(vcpu, KVM_EXIT_IO);
/* Set and verify various register values. */
run->s.regs.regs.rbx = 0xBAD1DEA;
@@ -159,11 +291,8 @@ int main(int argc, char *argv[])
run->kvm_valid_regs = TEST_SYNC_FIELDS;
run->kvm_dirty_regs = KVM_SYNC_X86_REGS | KVM_SYNC_X86_SREGS;
- rv = _vcpu_run(vm, VCPU_ID);
- TEST_ASSERT(run->exit_reason == KVM_EXIT_IO,
- "Unexpected exit reason: %u (%s),\n",
- run->exit_reason,
- exit_reason_str(run->exit_reason));
+ vcpu_run(vcpu);
+ TEST_ASSERT_KVM_EXIT_REASON(vcpu, KVM_EXIT_IO);
TEST_ASSERT(run->s.regs.regs.rbx == 0xBAD1DEA + 1,
"rbx sync regs value incorrect 0x%llx.",
run->s.regs.regs.rbx);
@@ -171,14 +300,19 @@ int main(int argc, char *argv[])
"apic_base sync regs value incorrect 0x%llx.",
run->s.regs.sregs.apic_base);
- vcpu_regs_get(vm, VCPU_ID, &regs);
+ vcpu_regs_get(vcpu, &regs);
compare_regs(&regs, &run->s.regs.regs);
- vcpu_sregs_get(vm, VCPU_ID, &sregs);
+ vcpu_sregs_get(vcpu, &sregs);
compare_sregs(&sregs, &run->s.regs.sregs);
- vcpu_events_get(vm, VCPU_ID, &events);
+ vcpu_events_get(vcpu, &events);
compare_vcpu_events(&events, &run->s.regs.events);
+}
+
+KVM_ONE_VCPU_TEST(sync_regs_test, clear_kvm_dirty_regs_bits, guest_code)
+{
+ struct kvm_run *run = vcpu->run;
/* Clear kvm_dirty_regs bits, verify new s.regs values are
* overwritten with existing guest values.
@@ -186,14 +320,22 @@ int main(int argc, char *argv[])
run->kvm_valid_regs = TEST_SYNC_FIELDS;
run->kvm_dirty_regs = 0;
run->s.regs.regs.rbx = 0xDEADBEEF;
- rv = _vcpu_run(vm, VCPU_ID);
- TEST_ASSERT(run->exit_reason == KVM_EXIT_IO,
- "Unexpected exit reason: %u (%s),\n",
- run->exit_reason,
- exit_reason_str(run->exit_reason));
+ vcpu_run(vcpu);
+ TEST_ASSERT_KVM_EXIT_REASON(vcpu, KVM_EXIT_IO);
TEST_ASSERT(run->s.regs.regs.rbx != 0xDEADBEEF,
"rbx sync regs value incorrect 0x%llx.",
run->s.regs.regs.rbx);
+}
+
+KVM_ONE_VCPU_TEST(sync_regs_test, clear_kvm_valid_and_dirty_regs, guest_code)
+{
+ struct kvm_run *run = vcpu->run;
+ struct kvm_regs regs;
+
+ /* Run once to get register set */
+ run->kvm_valid_regs = TEST_SYNC_FIELDS;
+ vcpu_run(vcpu);
+ TEST_ASSERT_KVM_EXIT_REASON(vcpu, KVM_EXIT_IO);
/* Clear kvm_valid_regs bits and kvm_dirty_bits.
* Verify s.regs values are not overwritten with existing guest values
@@ -202,20 +344,29 @@ int main(int argc, char *argv[])
run->kvm_valid_regs = 0;
run->kvm_dirty_regs = 0;
run->s.regs.regs.rbx = 0xAAAA;
+ vcpu_regs_get(vcpu, &regs);
regs.rbx = 0xBAC0;
- vcpu_regs_set(vm, VCPU_ID, &regs);
- rv = _vcpu_run(vm, VCPU_ID);
- TEST_ASSERT(run->exit_reason == KVM_EXIT_IO,
- "Unexpected exit reason: %u (%s),\n",
- run->exit_reason,
- exit_reason_str(run->exit_reason));
+ vcpu_regs_set(vcpu, &regs);
+ vcpu_run(vcpu);
+ TEST_ASSERT_KVM_EXIT_REASON(vcpu, KVM_EXIT_IO);
TEST_ASSERT(run->s.regs.regs.rbx == 0xAAAA,
"rbx sync regs value incorrect 0x%llx.",
run->s.regs.regs.rbx);
- vcpu_regs_get(vm, VCPU_ID, &regs);
+ vcpu_regs_get(vcpu, &regs);
TEST_ASSERT(regs.rbx == 0xBAC0 + 1,
"rbx guest value incorrect 0x%llx.",
regs.rbx);
+}
+
+KVM_ONE_VCPU_TEST(sync_regs_test, clear_kvm_valid_regs_bits, guest_code)
+{
+ struct kvm_run *run = vcpu->run;
+ struct kvm_regs regs;
+
+ /* Run once to get register set */
+ run->kvm_valid_regs = TEST_SYNC_FIELDS;
+ vcpu_run(vcpu);
+ TEST_ASSERT_KVM_EXIT_REASON(vcpu, KVM_EXIT_IO);
/* Clear kvm_valid_regs bits. Verify s.regs values are not overwritten
* with existing guest values but that guest values are overwritten
@@ -224,20 +375,39 @@ int main(int argc, char *argv[])
run->kvm_valid_regs = 0;
run->kvm_dirty_regs = TEST_SYNC_FIELDS;
run->s.regs.regs.rbx = 0xBBBB;
- rv = _vcpu_run(vm, VCPU_ID);
- TEST_ASSERT(run->exit_reason == KVM_EXIT_IO,
- "Unexpected exit reason: %u (%s),\n",
- run->exit_reason,
- exit_reason_str(run->exit_reason));
+ vcpu_run(vcpu);
+ TEST_ASSERT_KVM_EXIT_REASON(vcpu, KVM_EXIT_IO);
TEST_ASSERT(run->s.regs.regs.rbx == 0xBBBB,
"rbx sync regs value incorrect 0x%llx.",
run->s.regs.regs.rbx);
- vcpu_regs_get(vm, VCPU_ID, &regs);
+ vcpu_regs_get(vcpu, &regs);
TEST_ASSERT(regs.rbx == 0xBBBB + 1,
"rbx guest value incorrect 0x%llx.",
regs.rbx);
+}
- kvm_vm_free(vm);
+KVM_ONE_VCPU_TEST(sync_regs_test, race_cr4, guest_code)
+{
+ race_sync_regs(vcpu, race_sregs_cr4);
+}
+
+KVM_ONE_VCPU_TEST(sync_regs_test, race_exc, guest_code)
+{
+ race_sync_regs(vcpu, race_events_exc);
+}
+
+KVM_ONE_VCPU_TEST(sync_regs_test, race_inj_pen, guest_code)
+{
+ race_sync_regs(vcpu, race_events_inj_pen);
+}
+
+int main(int argc, char *argv[])
+{
+ int cap;
+
+ cap = kvm_check_cap(KVM_CAP_SYNC_REGS);
+ TEST_REQUIRE((cap & TEST_SYNC_FIELDS) == TEST_SYNC_FIELDS);
+ TEST_REQUIRE(!(cap & INVALID_SYNC_FIELD));
- return 0;
+ return test_harness_run(argc, argv);
}
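
The converted test above drives KVM_CAP_SYNC_REGS entirely through the shared kvm_run page. As a minimal sketch (not part of the patch, and assuming the selftest helpers shown above, e.g. vm_create_with_one_vcpu() and vcpu_run(), are available), the core round-trip looks like this:

static void sync_regs_round_trip(void)
{
	struct kvm_vcpu *vcpu;
	struct kvm_vm *vm;
	struct kvm_run *run;

	vm = vm_create_with_one_vcpu(&vcpu, guest_code);
	run = vcpu->run;

	/* Ask KVM to mirror the GPRs into run->s.regs on the next exit. */
	run->kvm_valid_regs = KVM_SYNC_X86_REGS;
	vcpu_run(vcpu);

	/* Tweak a register in place and push it back in on the next KVM_RUN. */
	run->s.regs.regs.rbx += 1;
	run->kvm_dirty_regs = KVM_SYNC_X86_REGS;
	vcpu_run(vcpu);

	kvm_vm_free(vm);
}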
diff --git a/tools/testing/selftests/kvm/x86_64/triple_fault_event_test.c b/tools/testing/selftests/kvm/x86_64/triple_fault_event_test.c
new file mode 100644
index 000000000000..56306a19144a
--- /dev/null
+++ b/tools/testing/selftests/kvm/x86_64/triple_fault_event_test.c
@@ -0,0 +1,124 @@
+// SPDX-License-Identifier: GPL-2.0-only
+#include "test_util.h"
+#include "kvm_util.h"
+#include "processor.h"
+#include "vmx.h"
+#include "svm_util.h"
+
+#include <string.h>
+#include <sys/ioctl.h>
+
+#include "kselftest.h"
+
+#define ARBITRARY_IO_PORT 0x2000
+
+/* The virtual machine object. */
+static struct kvm_vm *vm;
+
+static void l2_guest_code(void)
+{
+ asm volatile("inb %%dx, %%al"
+ : : [port] "d" (ARBITRARY_IO_PORT) : "rax");
+}
+
+#define L2_GUEST_STACK_SIZE 64
+unsigned long l2_guest_stack[L2_GUEST_STACK_SIZE];
+
+void l1_guest_code_vmx(struct vmx_pages *vmx)
+{
+
+ GUEST_ASSERT(vmx->vmcs_gpa);
+ GUEST_ASSERT(prepare_for_vmx_operation(vmx));
+ GUEST_ASSERT(load_vmcs(vmx));
+
+ prepare_vmcs(vmx, l2_guest_code,
+ &l2_guest_stack[L2_GUEST_STACK_SIZE]);
+
+ GUEST_ASSERT(!vmlaunch());
+ /* L2 should triple fault after a triple fault event is injected. */
+ GUEST_ASSERT(vmreadz(VM_EXIT_REASON) == EXIT_REASON_TRIPLE_FAULT);
+ GUEST_DONE();
+}
+
+void l1_guest_code_svm(struct svm_test_data *svm)
+{
+ struct vmcb *vmcb = svm->vmcb;
+
+ generic_svm_setup(svm, l2_guest_code,
+ &l2_guest_stack[L2_GUEST_STACK_SIZE]);
+
+ /* Don't intercept shutdown, to test the case where SVM allows L1 not to. */
+ vmcb->control.intercept &= ~(BIT(INTERCEPT_SHUTDOWN));
+
+ run_guest(vmcb, svm->vmcb_gpa);
+
+ /* should not reach here, L1 should crash */
+ GUEST_ASSERT(0);
+}
+
+int main(void)
+{
+ struct kvm_vcpu *vcpu;
+ struct kvm_run *run;
+ struct kvm_vcpu_events events;
+ struct ucall uc;
+
+ bool has_vmx = kvm_cpu_has(X86_FEATURE_VMX);
+ bool has_svm = kvm_cpu_has(X86_FEATURE_SVM);
+
+ TEST_REQUIRE(has_vmx || has_svm);
+
+ TEST_REQUIRE(kvm_has_cap(KVM_CAP_X86_TRIPLE_FAULT_EVENT));
+
+
+ if (has_vmx) {
+ vm_vaddr_t vmx_pages_gva;
+
+ vm = vm_create_with_one_vcpu(&vcpu, l1_guest_code_vmx);
+ vcpu_alloc_vmx(vm, &vmx_pages_gva);
+ vcpu_args_set(vcpu, 1, vmx_pages_gva);
+ } else {
+ vm_vaddr_t svm_gva;
+
+ vm = vm_create_with_one_vcpu(&vcpu, l1_guest_code_svm);
+ vcpu_alloc_svm(vm, &svm_gva);
+ vcpu_args_set(vcpu, 1, svm_gva);
+ }
+
+ vm_enable_cap(vm, KVM_CAP_X86_TRIPLE_FAULT_EVENT, 1);
+ run = vcpu->run;
+ vcpu_run(vcpu);
+
+ TEST_ASSERT_KVM_EXIT_REASON(vcpu, KVM_EXIT_IO);
+ TEST_ASSERT(run->io.port == ARBITRARY_IO_PORT,
+ "Expected IN from port %d from L2, got port %d",
+ ARBITRARY_IO_PORT, run->io.port);
+ vcpu_events_get(vcpu, &events);
+ events.flags |= KVM_VCPUEVENT_VALID_TRIPLE_FAULT;
+ events.triple_fault.pending = true;
+ vcpu_events_set(vcpu, &events);
+ run->immediate_exit = true;
+ vcpu_run_complete_io(vcpu);
+
+ vcpu_events_get(vcpu, &events);
+ TEST_ASSERT(events.flags & KVM_VCPUEVENT_VALID_TRIPLE_FAULT,
+ "Triple fault event invalid");
+ TEST_ASSERT(events.triple_fault.pending,
+ "No triple fault pending");
+ vcpu_run(vcpu);
+
+
+ if (has_svm) {
+ TEST_ASSERT_KVM_EXIT_REASON(vcpu, KVM_EXIT_SHUTDOWN);
+ } else {
+ switch (get_ucall(vcpu, &uc)) {
+ case UCALL_DONE:
+ break;
+ case UCALL_ABORT:
+ REPORT_GUEST_ASSERT(uc);
+ default:
+ TEST_FAIL("Unexpected ucall: %lu", uc.cmd);
+ }
+ }
+ return 0;
+}
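
For reference, the userspace-side triple-fault injection that main() performs between the two KVM_RUNs boils down to the following sketch (queue_triple_fault() is hypothetical; it assumes the vcpu_events_get()/vcpu_events_set() wrappers used above):

static void queue_triple_fault(struct kvm_vcpu *vcpu)
{
	struct kvm_vcpu_events events;

	/* Requires KVM_CAP_X86_TRIPLE_FAULT_EVENT to be enabled on the VM. */
	vcpu_events_get(vcpu, &events);
	events.flags |= KVM_VCPUEVENT_VALID_TRIPLE_FAULT;
	events.triple_fault.pending = true;
	vcpu_events_set(vcpu, &events);
}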
diff --git a/tools/testing/selftests/kvm/x86_64/tsc_msrs_test.c b/tools/testing/selftests/kvm/x86_64/tsc_msrs_test.c
new file mode 100644
index 000000000000..12b0964f4f13
--- /dev/null
+++ b/tools/testing/selftests/kvm/x86_64/tsc_msrs_test.c
@@ -0,0 +1,161 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Tests for MSR_IA32_TSC and MSR_IA32_TSC_ADJUST.
+ *
+ * Copyright (C) 2020, Red Hat, Inc.
+ */
+#include <stdio.h>
+#include <string.h>
+#include "kvm_util.h"
+#include "processor.h"
+
+#define UNITY (1ull << 30)
+#define HOST_ADJUST (UNITY * 64)
+#define GUEST_STEP (UNITY * 4)
+#define ROUND(x) ((x + UNITY / 2) & -UNITY)
+#define rounded_rdmsr(x) ROUND(rdmsr(x))
+#define rounded_host_rdmsr(x) ROUND(vcpu_get_msr(vcpu, x))
+
+static void guest_code(void)
+{
+ u64 val = 0;
+
+ GUEST_ASSERT_EQ(rounded_rdmsr(MSR_IA32_TSC), val);
+ GUEST_ASSERT_EQ(rounded_rdmsr(MSR_IA32_TSC_ADJUST), val);
+
+ /* Guest: writes to MSR_IA32_TSC affect both MSRs. */
+ val = 1ull * GUEST_STEP;
+ wrmsr(MSR_IA32_TSC, val);
+ GUEST_ASSERT_EQ(rounded_rdmsr(MSR_IA32_TSC), val);
+ GUEST_ASSERT_EQ(rounded_rdmsr(MSR_IA32_TSC_ADJUST), val);
+
+ /* Guest: writes to MSR_IA32_TSC_ADJUST affect both MSRs. */
+ GUEST_SYNC(2);
+ val = 2ull * GUEST_STEP;
+ wrmsr(MSR_IA32_TSC_ADJUST, val);
+ GUEST_ASSERT_EQ(rounded_rdmsr(MSR_IA32_TSC), val);
+ GUEST_ASSERT_EQ(rounded_rdmsr(MSR_IA32_TSC_ADJUST), val);
+
+ /* Host: setting the TSC offset. */
+ GUEST_SYNC(3);
+ GUEST_ASSERT_EQ(rounded_rdmsr(MSR_IA32_TSC), HOST_ADJUST + val);
+ GUEST_ASSERT_EQ(rounded_rdmsr(MSR_IA32_TSC_ADJUST), val);
+
+ /*
+ * Guest: writes to MSR_IA32_TSC_ADJUST do not destroy the
+ * host-side offset and affect both MSRs.
+ */
+ GUEST_SYNC(4);
+ val = 3ull * GUEST_STEP;
+ wrmsr(MSR_IA32_TSC_ADJUST, val);
+ GUEST_ASSERT_EQ(rounded_rdmsr(MSR_IA32_TSC), HOST_ADJUST + val);
+ GUEST_ASSERT_EQ(rounded_rdmsr(MSR_IA32_TSC_ADJUST), val);
+
+ /*
+ * Guest: writes to MSR_IA32_TSC affect both MSRs, so the host-side
+ * offset is now visible in MSR_IA32_TSC_ADJUST.
+ */
+ GUEST_SYNC(5);
+ val = 4ull * GUEST_STEP;
+ wrmsr(MSR_IA32_TSC, val);
+ GUEST_ASSERT_EQ(rounded_rdmsr(MSR_IA32_TSC), val);
+ GUEST_ASSERT_EQ(rounded_rdmsr(MSR_IA32_TSC_ADJUST), val - HOST_ADJUST);
+
+ GUEST_DONE();
+}
+
+static void run_vcpu(struct kvm_vcpu *vcpu, int stage)
+{
+ struct ucall uc;
+
+ vcpu_run(vcpu);
+
+ switch (get_ucall(vcpu, &uc)) {
+ case UCALL_SYNC:
+ if (!strcmp((const char *)uc.args[0], "hello") &&
+ uc.args[1] == stage + 1)
+ ksft_test_result_pass("stage %d passed\n", stage + 1);
+ else
+ ksft_test_result_fail(
+ "stage %d: Unexpected register values vmexit, got %lx",
+ stage + 1, (ulong)uc.args[1]);
+ return;
+ case UCALL_DONE:
+ ksft_test_result_pass("stage %d passed\n", stage + 1);
+ return;
+ case UCALL_ABORT:
+ REPORT_GUEST_ASSERT(uc);
+ default:
+ TEST_ASSERT(false, "Unexpected exit: %s",
+ exit_reason_str(vcpu->run->exit_reason));
+ }
+}
+
+int main(void)
+{
+ struct kvm_vcpu *vcpu;
+ struct kvm_vm *vm;
+ uint64_t val;
+
+ ksft_print_header();
+ ksft_set_plan(5);
+
+ vm = vm_create_with_one_vcpu(&vcpu, guest_code);
+
+ val = 0;
+ TEST_ASSERT_EQ(rounded_host_rdmsr(MSR_IA32_TSC), val);
+ TEST_ASSERT_EQ(rounded_host_rdmsr(MSR_IA32_TSC_ADJUST), val);
+
+ /* Guest: writes to MSR_IA32_TSC affect both MSRs. */
+ run_vcpu(vcpu, 1);
+ val = 1ull * GUEST_STEP;
+ TEST_ASSERT_EQ(rounded_host_rdmsr(MSR_IA32_TSC), val);
+ TEST_ASSERT_EQ(rounded_host_rdmsr(MSR_IA32_TSC_ADJUST), val);
+
+ /* Guest: writes to MSR_IA32_TSC_ADJUST affect both MSRs. */
+ run_vcpu(vcpu, 2);
+ val = 2ull * GUEST_STEP;
+ TEST_ASSERT_EQ(rounded_host_rdmsr(MSR_IA32_TSC), val);
+ TEST_ASSERT_EQ(rounded_host_rdmsr(MSR_IA32_TSC_ADJUST), val);
+
+ /*
+ * Host: writes to MSR_IA32_TSC set the host-side offset
+ * and therefore do not change MSR_IA32_TSC_ADJUST.
+ */
+ vcpu_set_msr(vcpu, MSR_IA32_TSC, HOST_ADJUST + val);
+ TEST_ASSERT_EQ(rounded_host_rdmsr(MSR_IA32_TSC), HOST_ADJUST + val);
+ TEST_ASSERT_EQ(rounded_host_rdmsr(MSR_IA32_TSC_ADJUST), val);
+ run_vcpu(vcpu, 3);
+
+ /* Host: writes to MSR_IA32_TSC_ADJUST do not modify the TSC. */
+ vcpu_set_msr(vcpu, MSR_IA32_TSC_ADJUST, UNITY * 123456);
+ TEST_ASSERT_EQ(rounded_host_rdmsr(MSR_IA32_TSC), HOST_ADJUST + val);
+ TEST_ASSERT_EQ(vcpu_get_msr(vcpu, MSR_IA32_TSC_ADJUST), UNITY * 123456);
+
+ /* Restore previous value. */
+ vcpu_set_msr(vcpu, MSR_IA32_TSC_ADJUST, val);
+ TEST_ASSERT_EQ(rounded_host_rdmsr(MSR_IA32_TSC), HOST_ADJUST + val);
+ TEST_ASSERT_EQ(rounded_host_rdmsr(MSR_IA32_TSC_ADJUST), val);
+
+ /*
+ * Guest: writes to MSR_IA32_TSC_ADJUST do not destroy the
+ * host-side offset and affect both MSRs.
+ */
+ run_vcpu(vcpu, 4);
+ val = 3ull * GUEST_STEP;
+ TEST_ASSERT_EQ(rounded_host_rdmsr(MSR_IA32_TSC), HOST_ADJUST + val);
+ TEST_ASSERT_EQ(rounded_host_rdmsr(MSR_IA32_TSC_ADJUST), val);
+
+ /*
+ * Guest: writes to MSR_IA32_TSC affect both MSRs, so the host-side
+ * offset is now visible in MSR_IA32_TSC_ADJUST.
+ */
+ run_vcpu(vcpu, 5);
+ val = 4ull * GUEST_STEP;
+ TEST_ASSERT_EQ(rounded_host_rdmsr(MSR_IA32_TSC), val);
+ TEST_ASSERT_EQ(rounded_host_rdmsr(MSR_IA32_TSC_ADJUST), val - HOST_ADJUST);
+
+ kvm_vm_free(vm);
+
+ ksft_finished(); /* Print results and exit() accordingly */
+}
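
The ROUND() macro above masks out small guest/host TSC drift by rounding to the nearest multiple of UNITY (2^30 cycles), which is what makes the equality assertions stable. A standalone sketch of the arithmetic (plain C, independent of the test harness):

#include <stdio.h>
#include <stdint.h>

#define UNITY (1ull << 30)
#define ROUND(x) (((x) + UNITY / 2) & -UNITY)

int main(void)
{
	/* Values within half a UNITY of a multiple round to that multiple. */
	uint64_t just_below = 4 * UNITY - 123456;
	uint64_t just_above = 4 * UNITY + 123456;

	printf("%llu -> %llu\n", (unsigned long long)just_below,
	       (unsigned long long)ROUND(just_below));	/* 4 * UNITY */
	printf("%llu -> %llu\n", (unsigned long long)just_above,
	       (unsigned long long)ROUND(just_above));	/* 4 * UNITY */
	return 0;
}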
diff --git a/tools/testing/selftests/kvm/x86_64/tsc_scaling_sync.c b/tools/testing/selftests/kvm/x86_64/tsc_scaling_sync.c
new file mode 100644
index 000000000000..59c7304f805e
--- /dev/null
+++ b/tools/testing/selftests/kvm/x86_64/tsc_scaling_sync.c
@@ -0,0 +1,110 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Copyright © 2021 Amazon.com, Inc. or its affiliates.
+ */
+
+#include "test_util.h"
+#include "kvm_util.h"
+#include "processor.h"
+
+#include <stdint.h>
+#include <time.h>
+#include <sched.h>
+#include <signal.h>
+#include <pthread.h>
+
+#define NR_TEST_VCPUS 20
+
+static struct kvm_vm *vm;
+pthread_spinlock_t create_lock;
+
+#define TEST_TSC_KHZ 2345678UL
+#define TEST_TSC_OFFSET 200000000
+
+uint64_t tsc_sync;
+static void guest_code(void)
+{
+ uint64_t start_tsc, local_tsc, tmp;
+
+ start_tsc = rdtsc();
+ do {
+ tmp = READ_ONCE(tsc_sync);
+ local_tsc = rdtsc();
+ WRITE_ONCE(tsc_sync, local_tsc);
+ if (unlikely(local_tsc < tmp))
+ GUEST_SYNC_ARGS(0, local_tsc, tmp, 0, 0);
+
+ } while (local_tsc - start_tsc < 5000 * TEST_TSC_KHZ);
+
+ GUEST_DONE();
+}
+
+
+static void *run_vcpu(void *_cpu_nr)
+{
+ unsigned long vcpu_id = (unsigned long)_cpu_nr;
+ unsigned long failures = 0;
+ static bool first_cpu_done;
+ struct kvm_vcpu *vcpu;
+
+ /* The kernel is fine, but vm_vcpu_add() needs locking */
+ pthread_spin_lock(&create_lock);
+
+ vcpu = vm_vcpu_add(vm, vcpu_id, guest_code);
+
+ if (!first_cpu_done) {
+ first_cpu_done = true;
+ vcpu_set_msr(vcpu, MSR_IA32_TSC, TEST_TSC_OFFSET);
+ }
+
+ pthread_spin_unlock(&create_lock);
+
+ for (;;) {
+ struct ucall uc;
+
+ vcpu_run(vcpu);
+ TEST_ASSERT_KVM_EXIT_REASON(vcpu, KVM_EXIT_IO);
+
+ switch (get_ucall(vcpu, &uc)) {
+ case UCALL_DONE:
+ goto out;
+
+ case UCALL_SYNC:
+ printf("Guest %d sync %lx %lx %ld\n", vcpu->id,
+ uc.args[2], uc.args[3], uc.args[2] - uc.args[3]);
+ failures++;
+ break;
+
+ default:
+ TEST_FAIL("Unknown ucall %lu", uc.cmd);
+ }
+ }
+ out:
+ return (void *)failures;
+}
+
+int main(int argc, char *argv[])
+{
+ TEST_REQUIRE(kvm_has_cap(KVM_CAP_VM_TSC_CONTROL));
+
+ vm = vm_create(NR_TEST_VCPUS);
+ vm_ioctl(vm, KVM_SET_TSC_KHZ, (void *) TEST_TSC_KHZ);
+
+ pthread_spin_init(&create_lock, PTHREAD_PROCESS_PRIVATE);
+ pthread_t cpu_threads[NR_TEST_VCPUS];
+ unsigned long cpu;
+ for (cpu = 0; cpu < NR_TEST_VCPUS; cpu++)
+ pthread_create(&cpu_threads[cpu], NULL, run_vcpu, (void *)cpu);
+
+ unsigned long failures = 0;
+ for (cpu = 0; cpu < NR_TEST_VCPUS; cpu++) {
+ void *this_cpu_failures;
+ pthread_join(cpu_threads[cpu], &this_cpu_failures);
+ failures += (unsigned long)this_cpu_failures;
+ }
+
+ TEST_ASSERT(!failures, "TSC sync failed");
+ pthread_spin_destroy(&create_lock);
+ kvm_vm_free(vm);
+ return 0;
+}
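
Each vCPU thread above reports its failure count by returning it through pthread_join()'s void * result, which main() then accumulates. A standalone illustration of that aggregation pattern (not KVM-specific; build with -pthread):

#include <pthread.h>
#include <stdio.h>

#define NR_THREADS 4

/* Each worker returns its "failure" count cast through void *. */
static void *worker(void *arg)
{
	unsigned long id = (unsigned long)arg;

	/* Pretend odd-numbered workers each saw one failure. */
	return (void *)(id & 1);
}

int main(void)
{
	pthread_t threads[NR_THREADS];
	unsigned long failures = 0;
	unsigned long i;

	for (i = 0; i < NR_THREADS; i++)
		pthread_create(&threads[i], NULL, worker, (void *)i);

	for (i = 0; i < NR_THREADS; i++) {
		void *ret;

		pthread_join(threads[i], &ret);
		failures += (unsigned long)ret;
	}

	printf("total failures: %lu\n", failures);
	return 0;
}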
diff --git a/tools/testing/selftests/kvm/x86_64/ucna_injection_test.c b/tools/testing/selftests/kvm/x86_64/ucna_injection_test.c
new file mode 100644
index 000000000000..dcbb3c29fb8e
--- /dev/null
+++ b/tools/testing/selftests/kvm/x86_64/ucna_injection_test.c
@@ -0,0 +1,302 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * ucna_injection_test
+ *
+ * Copyright (C) 2022, Google LLC.
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2.
+ *
+ * Test that user space can inject UnCorrectable No Action required (UCNA)
+ * memory errors into the guest.
+ *
+ * The test starts one vCPU with MCG_CMCI_P enabled. It verifies that
+ * proper UCNA errors can be injected into a vCPU with MCG_CMCI_P and the
+ * corresponding per-bank control register (MCI_CTL2) bit enabled.
+ * The test also checks that the UCNA errors get recorded in the
+ * Machine Check bank registers regardless of whether the error signal
+ * interrupts are delivered to the guest.
+ *
+ */
+
+#define _GNU_SOURCE /* for program_invocation_short_name */
+#include <pthread.h>
+#include <inttypes.h>
+#include <string.h>
+#include <time.h>
+
+#include "kvm_util_base.h"
+#include "kvm_util.h"
+#include "mce.h"
+#include "processor.h"
+#include "test_util.h"
+#include "apic.h"
+
+#define SYNC_FIRST_UCNA 9
+#define SYNC_SECOND_UCNA 10
+#define SYNC_GP 11
+#define FIRST_UCNA_ADDR 0xdeadbeef
+#define SECOND_UCNA_ADDR 0xcafeb0ba
+
+/*
+ * Vector for the CMCI interrupt.
+ * Value is arbitrary. Any value in 0x20-0xFF should work:
+ * https://wiki.osdev.org/Interrupt_Vector_Table
+ */
+#define CMCI_VECTOR 0xa9
+
+#define UCNA_BANK 0x7 // IMC0 bank
+
+#define MCI_CTL2_RESERVED_BIT BIT_ULL(29)
+
+static uint64_t supported_mcg_caps;
+
+/*
+ * Record state about the injected UCNA.
+ * Variables with the 'i_' prefix are recorded in the interrupt handler;
+ * variables without it are recorded in the guest's main execution
+ * thread.
+ */
+static volatile uint64_t i_ucna_rcvd;
+static volatile uint64_t i_ucna_addr;
+static volatile uint64_t ucna_addr;
+static volatile uint64_t ucna_addr2;
+
+struct thread_params {
+ struct kvm_vcpu *vcpu;
+ uint64_t *p_i_ucna_rcvd;
+ uint64_t *p_i_ucna_addr;
+ uint64_t *p_ucna_addr;
+ uint64_t *p_ucna_addr2;
+};
+
+static void verify_apic_base_addr(void)
+{
+ uint64_t msr = rdmsr(MSR_IA32_APICBASE);
+ uint64_t base = GET_APIC_BASE(msr);
+
+ GUEST_ASSERT(base == APIC_DEFAULT_GPA);
+}
+
+static void ucna_injection_guest_code(void)
+{
+ uint64_t ctl2;
+ verify_apic_base_addr();
+ xapic_enable();
+
+ /* Sets up the interrupt vector and enables per-bank CMCI signaling. */
+ xapic_write_reg(APIC_LVTCMCI, CMCI_VECTOR | APIC_DM_FIXED);
+ ctl2 = rdmsr(MSR_IA32_MCx_CTL2(UCNA_BANK));
+ wrmsr(MSR_IA32_MCx_CTL2(UCNA_BANK), ctl2 | MCI_CTL2_CMCI_EN);
+
+ /* Enables interrupt in guest. */
+ asm volatile("sti");
+
+ /* Let user space inject the first UCNA */
+ GUEST_SYNC(SYNC_FIRST_UCNA);
+
+ ucna_addr = rdmsr(MSR_IA32_MCx_ADDR(UCNA_BANK));
+
+ /* Disables the per-bank CMCI signaling. */
+ ctl2 = rdmsr(MSR_IA32_MCx_CTL2(UCNA_BANK));
+ wrmsr(MSR_IA32_MCx_CTL2(UCNA_BANK), ctl2 & ~MCI_CTL2_CMCI_EN);
+
+ /* Let the user space inject the second UCNA */
+ GUEST_SYNC(SYNC_SECOND_UCNA);
+
+ ucna_addr2 = rdmsr(MSR_IA32_MCx_ADDR(UCNA_BANK));
+ GUEST_DONE();
+}
+
+static void cmci_disabled_guest_code(void)
+{
+ uint64_t ctl2 = rdmsr(MSR_IA32_MCx_CTL2(UCNA_BANK));
+ wrmsr(MSR_IA32_MCx_CTL2(UCNA_BANK), ctl2 | MCI_CTL2_CMCI_EN);
+
+ GUEST_DONE();
+}
+
+static void cmci_enabled_guest_code(void)
+{
+ uint64_t ctl2 = rdmsr(MSR_IA32_MCx_CTL2(UCNA_BANK));
+ wrmsr(MSR_IA32_MCx_CTL2(UCNA_BANK), ctl2 | MCI_CTL2_RESERVED_BIT);
+
+ GUEST_DONE();
+}
+
+static void guest_cmci_handler(struct ex_regs *regs)
+{
+ i_ucna_rcvd++;
+ i_ucna_addr = rdmsr(MSR_IA32_MCx_ADDR(UCNA_BANK));
+ xapic_write_reg(APIC_EOI, 0);
+}
+
+static void guest_gp_handler(struct ex_regs *regs)
+{
+ GUEST_SYNC(SYNC_GP);
+}
+
+static void run_vcpu_expect_gp(struct kvm_vcpu *vcpu)
+{
+ struct ucall uc;
+
+ vcpu_run(vcpu);
+
+ TEST_ASSERT_KVM_EXIT_REASON(vcpu, KVM_EXIT_IO);
+ TEST_ASSERT(get_ucall(vcpu, &uc) == UCALL_SYNC,
+ "Expect UCALL_SYNC");
+ TEST_ASSERT(uc.args[1] == SYNC_GP, "#GP is expected.");
+ printf("vCPU received GP in guest.\n");
+}
+
+static void inject_ucna(struct kvm_vcpu *vcpu, uint64_t addr) {
+ /*
+ * A UCNA error is indicated with VAL=1, UC=1, PCC=0, S=0 and AR=0 in
+ * the IA32_MCi_STATUS register.
+ * MSCOD=1 (BIT[16] - MscodDataRdErr).
+ * MCACOD=0x0090 (Memory controller error format, channel 0)
+ */
+ uint64_t status = MCI_STATUS_VAL | MCI_STATUS_UC | MCI_STATUS_EN |
+ MCI_STATUS_MISCV | MCI_STATUS_ADDRV | 0x10090;
+ struct kvm_x86_mce mce = {};
+ mce.status = status;
+ mce.mcg_status = 0;
+ /*
+ * MCM_ADDR_PHYS indicates the reported address is a physical address.
+ * The lowest 6 bits are the recoverable address LSB, i.e., the injected
+ * MCE is at 4KB granularity.
+ */
+ mce.misc = (MCM_ADDR_PHYS << 6) | 0xc;
+ mce.addr = addr;
+ mce.bank = UCNA_BANK;
+
+ vcpu_ioctl(vcpu, KVM_X86_SET_MCE, &mce);
+}
+
+static void *run_ucna_injection(void *arg)
+{
+ struct thread_params *params = (struct thread_params *)arg;
+ struct ucall uc;
+ int old;
+ int r;
+
+ r = pthread_setcanceltype(PTHREAD_CANCEL_ASYNCHRONOUS, &old);
+ TEST_ASSERT(r == 0,
+ "pthread_setcanceltype failed with errno=%d",
+ r);
+
+ vcpu_run(params->vcpu);
+
+ TEST_ASSERT_KVM_EXIT_REASON(params->vcpu, KVM_EXIT_IO);
+ TEST_ASSERT(get_ucall(params->vcpu, &uc) == UCALL_SYNC,
+ "Expect UCALL_SYNC");
+ TEST_ASSERT(uc.args[1] == SYNC_FIRST_UCNA, "Injecting first UCNA.");
+
+ printf("Injecting first UCNA at %#x.\n", FIRST_UCNA_ADDR);
+
+ inject_ucna(params->vcpu, FIRST_UCNA_ADDR);
+ vcpu_run(params->vcpu);
+
+ TEST_ASSERT_KVM_EXIT_REASON(params->vcpu, KVM_EXIT_IO);
+ TEST_ASSERT(get_ucall(params->vcpu, &uc) == UCALL_SYNC,
+ "Expect UCALL_SYNC");
+ TEST_ASSERT(uc.args[1] == SYNC_SECOND_UCNA, "Injecting second UCNA.");
+
+ printf("Injecting second UCNA at %#x.\n", SECOND_UCNA_ADDR);
+
+ inject_ucna(params->vcpu, SECOND_UCNA_ADDR);
+ vcpu_run(params->vcpu);
+
+ TEST_ASSERT_KVM_EXIT_REASON(params->vcpu, KVM_EXIT_IO);
+ if (get_ucall(params->vcpu, &uc) == UCALL_ABORT) {
+ TEST_ASSERT(false, "vCPU assertion failure: %s.",
+ (const char *)uc.args[0]);
+ }
+
+ return NULL;
+}
+
+static void test_ucna_injection(struct kvm_vcpu *vcpu, struct thread_params *params)
+{
+ struct kvm_vm *vm = vcpu->vm;
+ params->vcpu = vcpu;
+ params->p_i_ucna_rcvd = (uint64_t *)addr_gva2hva(vm, (uint64_t)&i_ucna_rcvd);
+ params->p_i_ucna_addr = (uint64_t *)addr_gva2hva(vm, (uint64_t)&i_ucna_addr);
+ params->p_ucna_addr = (uint64_t *)addr_gva2hva(vm, (uint64_t)&ucna_addr);
+ params->p_ucna_addr2 = (uint64_t *)addr_gva2hva(vm, (uint64_t)&ucna_addr2);
+
+ run_ucna_injection(params);
+
+ TEST_ASSERT(*params->p_i_ucna_rcvd == 1, "Only the first UCNA should be signaled.");
+ TEST_ASSERT(*params->p_i_ucna_addr == FIRST_UCNA_ADDR,
+ "Only the first UCNA's address should be recorded via the interrupt.");
+ TEST_ASSERT(*params->p_ucna_addr == FIRST_UCNA_ADDR,
+ "The first injected UCNA should be exposed via the registers.");
+ TEST_ASSERT(*params->p_ucna_addr2 == SECOND_UCNA_ADDR,
+ "The second injected UCNA should be exposed via the registers.");
+
+ printf("Test successful.\n"
+ "UCNA CMCI interrupts received: %ld\n"
+ "Last UCNA address received via CMCI: %lx\n"
+ "First UCNA address in vCPU thread: %lx\n"
+ "Second UCNA address in vCPU thread: %lx\n",
+ *params->p_i_ucna_rcvd, *params->p_i_ucna_addr,
+ *params->p_ucna_addr, *params->p_ucna_addr2);
+}
+
+static void setup_mce_cap(struct kvm_vcpu *vcpu, bool enable_cmci_p)
+{
+ uint64_t mcg_caps = MCG_CTL_P | MCG_SER_P | MCG_LMCE_P | KVM_MAX_MCE_BANKS;
+ if (enable_cmci_p)
+ mcg_caps |= MCG_CMCI_P;
+
+ mcg_caps &= supported_mcg_caps | MCG_CAP_BANKS_MASK;
+ vcpu_ioctl(vcpu, KVM_X86_SETUP_MCE, &mcg_caps);
+}
+
+static struct kvm_vcpu *create_vcpu_with_mce_cap(struct kvm_vm *vm, uint32_t vcpuid,
+ bool enable_cmci_p, void *guest_code)
+{
+ struct kvm_vcpu *vcpu = vm_vcpu_add(vm, vcpuid, guest_code);
+ setup_mce_cap(vcpu, enable_cmci_p);
+ return vcpu;
+}
+
+int main(int argc, char *argv[])
+{
+ struct thread_params params;
+ struct kvm_vm *vm;
+ struct kvm_vcpu *ucna_vcpu;
+ struct kvm_vcpu *cmcidis_vcpu;
+ struct kvm_vcpu *cmci_vcpu;
+
+ kvm_check_cap(KVM_CAP_MCE);
+
+ vm = __vm_create(VM_SHAPE_DEFAULT, 3, 0);
+
+ kvm_ioctl(vm->kvm_fd, KVM_X86_GET_MCE_CAP_SUPPORTED,
+ &supported_mcg_caps);
+
+ if (!(supported_mcg_caps & MCG_CMCI_P)) {
+ print_skip("MCG_CMCI_P is not supported");
+ exit(KSFT_SKIP);
+ }
+
+ ucna_vcpu = create_vcpu_with_mce_cap(vm, 0, true, ucna_injection_guest_code);
+ cmcidis_vcpu = create_vcpu_with_mce_cap(vm, 1, false, cmci_disabled_guest_code);
+ cmci_vcpu = create_vcpu_with_mce_cap(vm, 2, true, cmci_enabled_guest_code);
+
+ vm_init_descriptor_tables(vm);
+ vcpu_init_descriptor_tables(ucna_vcpu);
+ vcpu_init_descriptor_tables(cmcidis_vcpu);
+ vcpu_init_descriptor_tables(cmci_vcpu);
+ vm_install_exception_handler(vm, CMCI_VECTOR, guest_cmci_handler);
+ vm_install_exception_handler(vm, GP_VECTOR, guest_gp_handler);
+
+ virt_pg_map(vm, APIC_DEFAULT_GPA, APIC_DEFAULT_GPA);
+
+ test_ucna_injection(ucna_vcpu, &params);
+ run_vcpu_expect_gp(cmcidis_vcpu);
+ run_vcpu_expect_gp(cmci_vcpu);
+
+ kvm_vm_free(vm);
+}
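
inject_ucna() above encodes the error entirely through IA32_MCi_STATUS bits: a UCNA is "uncorrected but no action required", i.e. VAL and UC set while PCC, S and AR stay clear. A standalone sketch of that composition; the mask names match those used in the patch, but the bit positions below are stated per the x86 MCA architecture and are an assumption here, not taken from the patch:

#include <stdio.h>
#include <stdint.h>

/* IA32_MCi_STATUS bit positions per the x86 MCA architecture. */
#define MCI_STATUS_VAL   (1ULL << 63)	/* register contains valid data */
#define MCI_STATUS_UC    (1ULL << 61)	/* uncorrected error */
#define MCI_STATUS_EN    (1ULL << 60)	/* error reporting enabled */
#define MCI_STATUS_MISCV (1ULL << 59)	/* IA32_MCi_MISC is valid */
#define MCI_STATUS_ADDRV (1ULL << 58)	/* IA32_MCi_ADDR is valid */
#define MCI_STATUS_PCC   (1ULL << 57)	/* processor context corrupt */
#define MCI_STATUS_S     (1ULL << 56)	/* signaled via MCE */
#define MCI_STATUS_AR    (1ULL << 55)	/* action required */

int main(void)
{
	/*
	 * UCNA: VAL=1, UC=1, PCC=0, S=0, AR=0, with MSCOD=1 (bits 31:16)
	 * and MCACOD=0x0090 (bits 15:0), i.e. the 0x10090 used by the test.
	 */
	uint64_t status = MCI_STATUS_VAL | MCI_STATUS_UC | MCI_STATUS_EN |
			  MCI_STATUS_MISCV | MCI_STATUS_ADDRV | 0x10090;

	printf("IA32_MCi_STATUS = %#llx\n", (unsigned long long)status);
	printf("PCC/S/AR clear: %d\n",
	       !(status & (MCI_STATUS_PCC | MCI_STATUS_S | MCI_STATUS_AR)));
	return 0;
}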
diff --git a/tools/testing/selftests/kvm/x86_64/userspace_io_test.c b/tools/testing/selftests/kvm/x86_64/userspace_io_test.c
new file mode 100644
index 000000000000..9481cbcf284f
--- /dev/null
+++ b/tools/testing/selftests/kvm/x86_64/userspace_io_test.c
@@ -0,0 +1,103 @@
+// SPDX-License-Identifier: GPL-2.0
+#include <fcntl.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <sys/ioctl.h>
+
+#include "test_util.h"
+
+#include "kvm_util.h"
+#include "processor.h"
+
+static void guest_ins_port80(uint8_t *buffer, unsigned int count)
+{
+ unsigned long end;
+
+ if (count == 2)
+ end = (unsigned long)buffer + 1;
+ else
+ end = (unsigned long)buffer + 8192;
+
+ asm volatile("cld; rep; insb" : "+D"(buffer), "+c"(count) : "d"(0x80) : "memory");
+ GUEST_ASSERT_EQ(count, 0);
+ GUEST_ASSERT_EQ((unsigned long)buffer, end);
+}
+
+static void guest_code(void)
+{
+ uint8_t buffer[8192];
+ int i;
+
+ /*
+ * Special case tests. main() will adjust RCX 2 => 1 and 3 => 8192 to
+ * test that KVM doesn't explode when userspace modifies the "count" on
+ * a userspace I/O exit. KVM isn't required to play nice with the I/O
+ * itself as KVM doesn't support manipulating the count, it just needs
+ * to not explode or overflow a buffer.
+ */
+ guest_ins_port80(buffer, 2);
+ guest_ins_port80(buffer, 3);
+
+ /* Verify KVM fills the buffer correctly when not stuffing RCX. */
+ memset(buffer, 0, sizeof(buffer));
+ guest_ins_port80(buffer, 8192);
+ for (i = 0; i < 8192; i++)
+ __GUEST_ASSERT(buffer[i] == 0xaa,
+ "Expected '0xaa', got '0x%x' at buffer[%u]",
+ buffer[i], i);
+
+ GUEST_DONE();
+}
+
+int main(int argc, char *argv[])
+{
+ struct kvm_vcpu *vcpu;
+ struct kvm_regs regs;
+ struct kvm_run *run;
+ struct kvm_vm *vm;
+ struct ucall uc;
+
+ vm = vm_create_with_one_vcpu(&vcpu, guest_code);
+ run = vcpu->run;
+
+ memset(&regs, 0, sizeof(regs));
+
+ while (1) {
+ vcpu_run(vcpu);
+ TEST_ASSERT_KVM_EXIT_REASON(vcpu, KVM_EXIT_IO);
+
+ if (get_ucall(vcpu, &uc))
+ break;
+
+ TEST_ASSERT(run->io.port == 0x80,
+ "Expected I/O at port 0x80, got port 0x%x", run->io.port);
+
+ /*
+ * Modify the rep string count in RCX: 2 => 1 and 3 => 8192.
+ * Note, this abuses KVM's batching of rep string I/O to avoid
+ * getting stuck in an infinite loop. That behavior isn't in
+ * scope from a testing perspective as it's not ABI in any way,
+ * i.e. it really is abusing internal KVM knowledge.
+ */
+ vcpu_regs_get(vcpu, &regs);
+ if (regs.rcx == 2)
+ regs.rcx = 1;
+ if (regs.rcx == 3)
+ regs.rcx = 8192;
+ memset((void *)run + run->io.data_offset, 0xaa, 4096);
+ vcpu_regs_set(vcpu, &regs);
+ }
+
+ switch (uc.cmd) {
+ case UCALL_DONE:
+ break;
+ case UCALL_ABORT:
+ REPORT_GUEST_ASSERT(uc);
+ default:
+ TEST_FAIL("Unknown ucall %lu", uc.cmd);
+ }
+
+ kvm_vm_free(vm);
+ return 0;
+}
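
When KVM exits with KVM_EXIT_IO for a rep INS, the data for the current batch lives inside the shared kvm_run page at run->io.data_offset, which is exactly where main() above memsets the 0xaa pattern. A hedged sketch of that host-side fill, assuming the selftest helpers used above (fill_pio_in_data() itself is hypothetical):

static void fill_pio_in_data(struct kvm_vcpu *vcpu, uint8_t pattern)
{
	struct kvm_run *run = vcpu->run;

	TEST_ASSERT_KVM_EXIT_REASON(vcpu, KVM_EXIT_IO);
	TEST_ASSERT(run->io.direction == KVM_EXIT_IO_IN, "Expected an IN exit");

	/* One batch is io.size * io.count bytes starting at data_offset. */
	memset((void *)run + run->io.data_offset, pattern,
	       run->io.size * run->io.count);
}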
diff --git a/tools/testing/selftests/kvm/x86_64/userspace_msr_exit_test.c b/tools/testing/selftests/kvm/x86_64/userspace_msr_exit_test.c
new file mode 100644
index 000000000000..f4f61a2d2464
--- /dev/null
+++ b/tools/testing/selftests/kvm/x86_64/userspace_msr_exit_test.c
@@ -0,0 +1,780 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Copyright (C) 2020, Google LLC.
+ *
+ * Tests for exiting into userspace on registered MSRs
+ */
+
+#define _GNU_SOURCE /* for program_invocation_short_name */
+#include <sys/ioctl.h>
+
+#include "kvm_test_harness.h"
+#include "test_util.h"
+#include "kvm_util.h"
+#include "vmx.h"
+
+static bool fep_available;
+
+#define MSR_NON_EXISTENT 0x474f4f00
+
+static u64 deny_bits = 0;
+struct kvm_msr_filter filter_allow = {
+ .flags = KVM_MSR_FILTER_DEFAULT_ALLOW,
+ .ranges = {
+ {
+ .flags = KVM_MSR_FILTER_READ |
+ KVM_MSR_FILTER_WRITE,
+ .nmsrs = 1,
+ /* Test an MSR the kernel knows about. */
+ .base = MSR_IA32_XSS,
+ .bitmap = (uint8_t*)&deny_bits,
+ }, {
+ .flags = KVM_MSR_FILTER_READ |
+ KVM_MSR_FILTER_WRITE,
+ .nmsrs = 1,
+ /* Test an MSR the kernel doesn't know about. */
+ .base = MSR_IA32_FLUSH_CMD,
+ .bitmap = (uint8_t*)&deny_bits,
+ }, {
+ .flags = KVM_MSR_FILTER_READ |
+ KVM_MSR_FILTER_WRITE,
+ .nmsrs = 1,
+ /* Test a fabricated MSR that no one knows about. */
+ .base = MSR_NON_EXISTENT,
+ .bitmap = (uint8_t*)&deny_bits,
+ },
+ },
+};
+
+struct kvm_msr_filter filter_fs = {
+ .flags = KVM_MSR_FILTER_DEFAULT_ALLOW,
+ .ranges = {
+ {
+ .flags = KVM_MSR_FILTER_READ,
+ .nmsrs = 1,
+ .base = MSR_FS_BASE,
+ .bitmap = (uint8_t*)&deny_bits,
+ },
+ },
+};
+
+struct kvm_msr_filter filter_gs = {
+ .flags = KVM_MSR_FILTER_DEFAULT_ALLOW,
+ .ranges = {
+ {
+ .flags = KVM_MSR_FILTER_READ,
+ .nmsrs = 1,
+ .base = MSR_GS_BASE,
+ .bitmap = (uint8_t*)&deny_bits,
+ },
+ },
+};
+
+static uint64_t msr_non_existent_data;
+static int guest_exception_count;
+static u32 msr_reads, msr_writes;
+
+static u8 bitmap_00000000[KVM_MSR_FILTER_MAX_BITMAP_SIZE];
+static u8 bitmap_00000000_write[KVM_MSR_FILTER_MAX_BITMAP_SIZE];
+static u8 bitmap_40000000[KVM_MSR_FILTER_MAX_BITMAP_SIZE];
+static u8 bitmap_c0000000[KVM_MSR_FILTER_MAX_BITMAP_SIZE];
+static u8 bitmap_c0000000_read[KVM_MSR_FILTER_MAX_BITMAP_SIZE];
+static u8 bitmap_deadbeef[1] = { 0x1 };
+
+static void deny_msr(uint8_t *bitmap, u32 msr)
+{
+ u32 idx = msr & (KVM_MSR_FILTER_MAX_BITMAP_SIZE - 1);
+
+ bitmap[idx / 8] &= ~(1 << (idx % 8));
+}
+
+static void prepare_bitmaps(void)
+{
+ memset(bitmap_00000000, 0xff, sizeof(bitmap_00000000));
+ memset(bitmap_00000000_write, 0xff, sizeof(bitmap_00000000_write));
+ memset(bitmap_40000000, 0xff, sizeof(bitmap_40000000));
+ memset(bitmap_c0000000, 0xff, sizeof(bitmap_c0000000));
+ memset(bitmap_c0000000_read, 0xff, sizeof(bitmap_c0000000_read));
+
+ deny_msr(bitmap_00000000_write, MSR_IA32_POWER_CTL);
+ deny_msr(bitmap_c0000000_read, MSR_SYSCALL_MASK);
+ deny_msr(bitmap_c0000000_read, MSR_GS_BASE);
+}
+
+struct kvm_msr_filter filter_deny = {
+ .flags = KVM_MSR_FILTER_DEFAULT_DENY,
+ .ranges = {
+ {
+ .flags = KVM_MSR_FILTER_READ,
+ .base = 0x00000000,
+ .nmsrs = KVM_MSR_FILTER_MAX_BITMAP_SIZE * BITS_PER_BYTE,
+ .bitmap = bitmap_00000000,
+ }, {
+ .flags = KVM_MSR_FILTER_WRITE,
+ .base = 0x00000000,
+ .nmsrs = KVM_MSR_FILTER_MAX_BITMAP_SIZE * BITS_PER_BYTE,
+ .bitmap = bitmap_00000000_write,
+ }, {
+ .flags = KVM_MSR_FILTER_READ | KVM_MSR_FILTER_WRITE,
+ .base = 0x40000000,
+ .nmsrs = KVM_MSR_FILTER_MAX_BITMAP_SIZE * BITS_PER_BYTE,
+ .bitmap = bitmap_40000000,
+ }, {
+ .flags = KVM_MSR_FILTER_READ,
+ .base = 0xc0000000,
+ .nmsrs = KVM_MSR_FILTER_MAX_BITMAP_SIZE * BITS_PER_BYTE,
+ .bitmap = bitmap_c0000000_read,
+ }, {
+ .flags = KVM_MSR_FILTER_WRITE,
+ .base = 0xc0000000,
+ .nmsrs = KVM_MSR_FILTER_MAX_BITMAP_SIZE * BITS_PER_BYTE,
+ .bitmap = bitmap_c0000000,
+ }, {
+ .flags = KVM_MSR_FILTER_WRITE | KVM_MSR_FILTER_READ,
+ .base = 0xdeadbeef,
+ .nmsrs = 1,
+ .bitmap = bitmap_deadbeef,
+ },
+ },
+};
+
+struct kvm_msr_filter no_filter_deny = {
+ .flags = KVM_MSR_FILTER_DEFAULT_ALLOW,
+};
+
+/*
+ * Note: Force test_rdmsr() to not be inlined to prevent the labels,
+ * rdmsr_start and rdmsr_end, from being defined multiple times.
+ */
+static noinline uint64_t test_rdmsr(uint32_t msr)
+{
+ uint32_t a, d;
+
+ guest_exception_count = 0;
+
+ __asm__ __volatile__("rdmsr_start: rdmsr; rdmsr_end:" :
+ "=a"(a), "=d"(d) : "c"(msr) : "memory");
+
+ return a | ((uint64_t) d << 32);
+}
+
+/*
+ * Note: Force test_wrmsr() to not be inlined to prevent the labels,
+ * wrmsr_start and wrmsr_end, from being defined multiple times.
+ */
+static noinline void test_wrmsr(uint32_t msr, uint64_t value)
+{
+ uint32_t a = value;
+ uint32_t d = value >> 32;
+
+ guest_exception_count = 0;
+
+ __asm__ __volatile__("wrmsr_start: wrmsr; wrmsr_end:" ::
+ "a"(a), "d"(d), "c"(msr) : "memory");
+}
+
+extern char rdmsr_start, rdmsr_end;
+extern char wrmsr_start, wrmsr_end;
+
+/*
+ * Note: Force test_em_rdmsr() to not be inlined to prevent the labels,
+ * em_rdmsr_start and em_rdmsr_end, from being defined multiple times.
+ */
+static noinline uint64_t test_em_rdmsr(uint32_t msr)
+{
+ uint32_t a, d;
+
+ guest_exception_count = 0;
+
+ __asm__ __volatile__(KVM_FEP "em_rdmsr_start: rdmsr; em_rdmsr_end:" :
+ "=a"(a), "=d"(d) : "c"(msr) : "memory");
+
+ return a | ((uint64_t) d << 32);
+}
+
+/*
+ * Note: Force test_em_wrmsr() to not be inlined to prevent the labels,
+ * em_wrmsr_start and em_wrmsr_end, from being defined multiple times.
+ */
+static noinline void test_em_wrmsr(uint32_t msr, uint64_t value)
+{
+ uint32_t a = value;
+ uint32_t d = value >> 32;
+
+ guest_exception_count = 0;
+
+ __asm__ __volatile__(KVM_FEP "em_wrmsr_start: wrmsr; em_wrmsr_end:" ::
+ "a"(a), "d"(d), "c"(msr) : "memory");
+}
+
+extern char em_rdmsr_start, em_rdmsr_end;
+extern char em_wrmsr_start, em_wrmsr_end;
+
+static void guest_code_filter_allow(void)
+{
+ uint64_t data;
+
+ /*
+ * Test userspace intercepting rdmsr / wrmsr for MSR_IA32_XSS.
+ *
+ * A GP is thrown if anything other than 0 is written to
+ * MSR_IA32_XSS.
+ */
+ data = test_rdmsr(MSR_IA32_XSS);
+ GUEST_ASSERT(data == 0);
+ GUEST_ASSERT(guest_exception_count == 0);
+
+ test_wrmsr(MSR_IA32_XSS, 0);
+ GUEST_ASSERT(guest_exception_count == 0);
+
+ test_wrmsr(MSR_IA32_XSS, 1);
+ GUEST_ASSERT(guest_exception_count == 1);
+
+ /*
+ * Test userspace intercepting rdmsr / wrmsr for MSR_IA32_FLUSH_CMD.
+ *
+ * A GP is thrown if MSR_IA32_FLUSH_CMD is read
+ * from or if a value other than 1 is written to it.
+ */
+ test_rdmsr(MSR_IA32_FLUSH_CMD);
+ GUEST_ASSERT(guest_exception_count == 1);
+
+ test_wrmsr(MSR_IA32_FLUSH_CMD, 0);
+ GUEST_ASSERT(guest_exception_count == 1);
+
+ test_wrmsr(MSR_IA32_FLUSH_CMD, 1);
+ GUEST_ASSERT(guest_exception_count == 0);
+
+ /*
+ * Test userspace intercepting rdmsr / wrmsr for MSR_NON_EXISTENT.
+ *
+ * Test that a fabricated MSR can pass through the kernel
+ * and be handled in userspace.
+ */
+ test_wrmsr(MSR_NON_EXISTENT, 2);
+ GUEST_ASSERT(guest_exception_count == 0);
+
+ data = test_rdmsr(MSR_NON_EXISTENT);
+ GUEST_ASSERT(data == 2);
+ GUEST_ASSERT(guest_exception_count == 0);
+
+ if (fep_available) {
+ /* Let userspace know we aren't done. */
+ GUEST_SYNC(0);
+
+ /*
+ * Now run the same tests with the instruction emulator.
+ */
+ data = test_em_rdmsr(MSR_IA32_XSS);
+ GUEST_ASSERT(data == 0);
+ GUEST_ASSERT(guest_exception_count == 0);
+ test_em_wrmsr(MSR_IA32_XSS, 0);
+ GUEST_ASSERT(guest_exception_count == 0);
+ test_em_wrmsr(MSR_IA32_XSS, 1);
+ GUEST_ASSERT(guest_exception_count == 1);
+
+ test_em_rdmsr(MSR_IA32_FLUSH_CMD);
+ GUEST_ASSERT(guest_exception_count == 1);
+ test_em_wrmsr(MSR_IA32_FLUSH_CMD, 0);
+ GUEST_ASSERT(guest_exception_count == 1);
+ test_em_wrmsr(MSR_IA32_FLUSH_CMD, 1);
+ GUEST_ASSERT(guest_exception_count == 0);
+
+ test_em_wrmsr(MSR_NON_EXISTENT, 2);
+ GUEST_ASSERT(guest_exception_count == 0);
+ data = test_em_rdmsr(MSR_NON_EXISTENT);
+ GUEST_ASSERT(data == 2);
+ GUEST_ASSERT(guest_exception_count == 0);
+ }
+
+ GUEST_DONE();
+}
+
+static void guest_msr_calls(bool trapped)
+{
+ /* This goes into the in-kernel emulation */
+ wrmsr(MSR_SYSCALL_MASK, 0);
+
+ if (trapped) {
+ /* This goes into user space emulation */
+ GUEST_ASSERT(rdmsr(MSR_SYSCALL_MASK) == MSR_SYSCALL_MASK);
+ GUEST_ASSERT(rdmsr(MSR_GS_BASE) == MSR_GS_BASE);
+ } else {
+ GUEST_ASSERT(rdmsr(MSR_SYSCALL_MASK) != MSR_SYSCALL_MASK);
+ GUEST_ASSERT(rdmsr(MSR_GS_BASE) != MSR_GS_BASE);
+ }
+
+ /* If trapped == true, this goes into user space emulation */
+ wrmsr(MSR_IA32_POWER_CTL, 0x1234);
+
+ /* This goes into the in-kernel emulation */
+ rdmsr(MSR_IA32_POWER_CTL);
+
+ /* Invalid MSR, should always be handled by user space exit */
+ GUEST_ASSERT(rdmsr(0xdeadbeef) == 0xdeadbeef);
+ wrmsr(0xdeadbeef, 0x1234);
+}
+
+static void guest_code_filter_deny(void)
+{
+ guest_msr_calls(true);
+
+ /*
+ * Disable msr filtering, so that the kernel
+ * handles everything in the next round
+ */
+ GUEST_SYNC(0);
+
+ guest_msr_calls(false);
+
+ GUEST_DONE();
+}
+
+static void guest_code_permission_bitmap(void)
+{
+ uint64_t data;
+
+ data = test_rdmsr(MSR_FS_BASE);
+ GUEST_ASSERT(data == MSR_FS_BASE);
+ data = test_rdmsr(MSR_GS_BASE);
+ GUEST_ASSERT(data != MSR_GS_BASE);
+
+ /* Let userspace know to switch the filter */
+ GUEST_SYNC(0);
+
+ data = test_rdmsr(MSR_FS_BASE);
+ GUEST_ASSERT(data != MSR_FS_BASE);
+ data = test_rdmsr(MSR_GS_BASE);
+ GUEST_ASSERT(data == MSR_GS_BASE);
+
+ GUEST_DONE();
+}
+
+static void __guest_gp_handler(struct ex_regs *regs,
+ char *r_start, char *r_end,
+ char *w_start, char *w_end)
+{
+ if (regs->rip == (uintptr_t)r_start) {
+ regs->rip = (uintptr_t)r_end;
+ regs->rax = 0;
+ regs->rdx = 0;
+ } else if (regs->rip == (uintptr_t)w_start) {
+ regs->rip = (uintptr_t)w_end;
+ } else {
+ GUEST_ASSERT(!"RIP is at an unknown location!");
+ }
+
+ ++guest_exception_count;
+}
+
+static void guest_gp_handler(struct ex_regs *regs)
+{
+ __guest_gp_handler(regs, &rdmsr_start, &rdmsr_end,
+ &wrmsr_start, &wrmsr_end);
+}
+
+static void guest_fep_gp_handler(struct ex_regs *regs)
+{
+ __guest_gp_handler(regs, &em_rdmsr_start, &em_rdmsr_end,
+ &em_wrmsr_start, &em_wrmsr_end);
+}
+
+static void check_for_guest_assert(struct kvm_vcpu *vcpu)
+{
+ struct ucall uc;
+
+ if (vcpu->run->exit_reason == KVM_EXIT_IO &&
+ get_ucall(vcpu, &uc) == UCALL_ABORT) {
+ REPORT_GUEST_ASSERT(uc);
+ }
+}
+
+static void process_rdmsr(struct kvm_vcpu *vcpu, uint32_t msr_index)
+{
+ struct kvm_run *run = vcpu->run;
+
+ check_for_guest_assert(vcpu);
+
+ TEST_ASSERT_KVM_EXIT_REASON(vcpu, KVM_EXIT_X86_RDMSR);
+ TEST_ASSERT(run->msr.index == msr_index,
+ "Unexpected msr (0x%04x), expected 0x%04x",
+ run->msr.index, msr_index);
+
+ switch (run->msr.index) {
+ case MSR_IA32_XSS:
+ run->msr.data = 0;
+ break;
+ case MSR_IA32_FLUSH_CMD:
+ run->msr.error = 1;
+ break;
+ case MSR_NON_EXISTENT:
+ run->msr.data = msr_non_existent_data;
+ break;
+ case MSR_FS_BASE:
+ run->msr.data = MSR_FS_BASE;
+ break;
+ case MSR_GS_BASE:
+ run->msr.data = MSR_GS_BASE;
+ break;
+ default:
+ TEST_ASSERT(false, "Unexpected MSR: 0x%04x", run->msr.index);
+ }
+}
+
+static void process_wrmsr(struct kvm_vcpu *vcpu, uint32_t msr_index)
+{
+ struct kvm_run *run = vcpu->run;
+
+ check_for_guest_assert(vcpu);
+
+ TEST_ASSERT_KVM_EXIT_REASON(vcpu, KVM_EXIT_X86_WRMSR);
+ TEST_ASSERT(run->msr.index == msr_index,
+ "Unexpected msr (0x%04x), expected 0x%04x",
+ run->msr.index, msr_index);
+
+ switch (run->msr.index) {
+ case MSR_IA32_XSS:
+ if (run->msr.data != 0)
+ run->msr.error = 1;
+ break;
+ case MSR_IA32_FLUSH_CMD:
+ if (run->msr.data != 1)
+ run->msr.error = 1;
+ break;
+ case MSR_NON_EXISTENT:
+ msr_non_existent_data = run->msr.data;
+ break;
+ default:
+ TEST_ASSERT(false, "Unexpected MSR: 0x%04x", run->msr.index);
+ }
+}
+
+static void process_ucall_done(struct kvm_vcpu *vcpu)
+{
+ struct ucall uc;
+
+ check_for_guest_assert(vcpu);
+
+ TEST_ASSERT_KVM_EXIT_REASON(vcpu, KVM_EXIT_IO);
+
+ TEST_ASSERT(get_ucall(vcpu, &uc) == UCALL_DONE,
+ "Unexpected ucall command: %lu, expected UCALL_DONE (%d)",
+ uc.cmd, UCALL_DONE);
+}
+
+static uint64_t process_ucall(struct kvm_vcpu *vcpu)
+{
+ struct ucall uc = {};
+
+ check_for_guest_assert(vcpu);
+
+ TEST_ASSERT_KVM_EXIT_REASON(vcpu, KVM_EXIT_IO);
+
+ switch (get_ucall(vcpu, &uc)) {
+ case UCALL_SYNC:
+ break;
+ case UCALL_ABORT:
+ check_for_guest_assert(vcpu);
+ break;
+ case UCALL_DONE:
+ process_ucall_done(vcpu);
+ break;
+ default:
+ TEST_ASSERT(false, "Unexpected ucall");
+ }
+
+ return uc.cmd;
+}
+
+static void run_guest_then_process_rdmsr(struct kvm_vcpu *vcpu,
+ uint32_t msr_index)
+{
+ vcpu_run(vcpu);
+ process_rdmsr(vcpu, msr_index);
+}
+
+static void run_guest_then_process_wrmsr(struct kvm_vcpu *vcpu,
+ uint32_t msr_index)
+{
+ vcpu_run(vcpu);
+ process_wrmsr(vcpu, msr_index);
+}
+
+static uint64_t run_guest_then_process_ucall(struct kvm_vcpu *vcpu)
+{
+ vcpu_run(vcpu);
+ return process_ucall(vcpu);
+}
+
+static void run_guest_then_process_ucall_done(struct kvm_vcpu *vcpu)
+{
+ vcpu_run(vcpu);
+ process_ucall_done(vcpu);
+}
+
+KVM_ONE_VCPU_TEST_SUITE(user_msr);
+
+KVM_ONE_VCPU_TEST(user_msr, msr_filter_allow, guest_code_filter_allow)
+{
+ struct kvm_vm *vm = vcpu->vm;
+ uint64_t cmd;
+ int rc;
+
+ sync_global_to_guest(vm, fep_available);
+
+ rc = kvm_check_cap(KVM_CAP_X86_USER_SPACE_MSR);
+ TEST_ASSERT(rc, "KVM_CAP_X86_USER_SPACE_MSR is available");
+ vm_enable_cap(vm, KVM_CAP_X86_USER_SPACE_MSR, KVM_MSR_EXIT_REASON_FILTER);
+
+ rc = kvm_check_cap(KVM_CAP_X86_MSR_FILTER);
+ TEST_ASSERT(rc, "KVM_CAP_X86_MSR_FILTER is available");
+
+ vm_ioctl(vm, KVM_X86_SET_MSR_FILTER, &filter_allow);
+
+ vm_init_descriptor_tables(vm);
+ vcpu_init_descriptor_tables(vcpu);
+
+ vm_install_exception_handler(vm, GP_VECTOR, guest_gp_handler);
+
+ /* Process guest code userspace exits. */
+ run_guest_then_process_rdmsr(vcpu, MSR_IA32_XSS);
+ run_guest_then_process_wrmsr(vcpu, MSR_IA32_XSS);
+ run_guest_then_process_wrmsr(vcpu, MSR_IA32_XSS);
+
+ run_guest_then_process_rdmsr(vcpu, MSR_IA32_FLUSH_CMD);
+ run_guest_then_process_wrmsr(vcpu, MSR_IA32_FLUSH_CMD);
+ run_guest_then_process_wrmsr(vcpu, MSR_IA32_FLUSH_CMD);
+
+ run_guest_then_process_wrmsr(vcpu, MSR_NON_EXISTENT);
+ run_guest_then_process_rdmsr(vcpu, MSR_NON_EXISTENT);
+
+ vcpu_run(vcpu);
+ cmd = process_ucall(vcpu);
+
+ if (fep_available) {
+ TEST_ASSERT_EQ(cmd, UCALL_SYNC);
+ vm_install_exception_handler(vm, GP_VECTOR, guest_fep_gp_handler);
+
+ /* Process emulated rdmsr and wrmsr instructions. */
+ run_guest_then_process_rdmsr(vcpu, MSR_IA32_XSS);
+ run_guest_then_process_wrmsr(vcpu, MSR_IA32_XSS);
+ run_guest_then_process_wrmsr(vcpu, MSR_IA32_XSS);
+
+ run_guest_then_process_rdmsr(vcpu, MSR_IA32_FLUSH_CMD);
+ run_guest_then_process_wrmsr(vcpu, MSR_IA32_FLUSH_CMD);
+ run_guest_then_process_wrmsr(vcpu, MSR_IA32_FLUSH_CMD);
+
+ run_guest_then_process_wrmsr(vcpu, MSR_NON_EXISTENT);
+ run_guest_then_process_rdmsr(vcpu, MSR_NON_EXISTENT);
+
+ /* Confirm the guest completed without issues. */
+ run_guest_then_process_ucall_done(vcpu);
+ } else {
+ TEST_ASSERT_EQ(cmd, UCALL_DONE);
+ printf("To run the instruction emulation tests, set the module parameter 'kvm.force_emulation_prefix=1'\n");
+ }
+}
+
+static int handle_ucall(struct kvm_vcpu *vcpu)
+{
+ struct ucall uc;
+
+ switch (get_ucall(vcpu, &uc)) {
+ case UCALL_ABORT:
+ REPORT_GUEST_ASSERT(uc);
+ break;
+ case UCALL_SYNC:
+ vm_ioctl(vcpu->vm, KVM_X86_SET_MSR_FILTER, &no_filter_deny);
+ break;
+ case UCALL_DONE:
+ return 1;
+ default:
+ TEST_FAIL("Unknown ucall %lu", uc.cmd);
+ }
+
+ return 0;
+}
+
+static void handle_rdmsr(struct kvm_run *run)
+{
+ run->msr.data = run->msr.index;
+ msr_reads++;
+
+ if (run->msr.index == MSR_SYSCALL_MASK ||
+ run->msr.index == MSR_GS_BASE) {
+ TEST_ASSERT(run->msr.reason == KVM_MSR_EXIT_REASON_FILTER,
+ "MSR read trap w/o access fault");
+ }
+
+ if (run->msr.index == 0xdeadbeef) {
+ TEST_ASSERT(run->msr.reason == KVM_MSR_EXIT_REASON_UNKNOWN,
+ "MSR deadbeef read trap w/o inval fault");
+ }
+}
+
+static void handle_wrmsr(struct kvm_run *run)
+{
+ /* ignore */
+ msr_writes++;
+
+ if (run->msr.index == MSR_IA32_POWER_CTL) {
+ TEST_ASSERT(run->msr.data == 0x1234,
+ "MSR data for MSR_IA32_POWER_CTL incorrect");
+ TEST_ASSERT(run->msr.reason == KVM_MSR_EXIT_REASON_FILTER,
+ "MSR_IA32_POWER_CTL trap w/o access fault");
+ }
+
+ if (run->msr.index == 0xdeadbeef) {
+ TEST_ASSERT(run->msr.data == 0x1234,
+ "MSR data for deadbeef incorrect");
+ TEST_ASSERT(run->msr.reason == KVM_MSR_EXIT_REASON_UNKNOWN,
+ "deadbeef trap w/o inval fault");
+ }
+}
+
+KVM_ONE_VCPU_TEST(user_msr, msr_filter_deny, guest_code_filter_deny)
+{
+ struct kvm_vm *vm = vcpu->vm;
+ struct kvm_run *run = vcpu->run;
+ int rc;
+
+ rc = kvm_check_cap(KVM_CAP_X86_USER_SPACE_MSR);
+ TEST_ASSERT(rc, "KVM_CAP_X86_USER_SPACE_MSR is available");
+ vm_enable_cap(vm, KVM_CAP_X86_USER_SPACE_MSR, KVM_MSR_EXIT_REASON_INVAL |
+ KVM_MSR_EXIT_REASON_UNKNOWN |
+ KVM_MSR_EXIT_REASON_FILTER);
+
+ rc = kvm_check_cap(KVM_CAP_X86_MSR_FILTER);
+ TEST_ASSERT(rc, "KVM_CAP_X86_MSR_FILTER is available");
+
+ prepare_bitmaps();
+ vm_ioctl(vm, KVM_X86_SET_MSR_FILTER, &filter_deny);
+
+ while (1) {
+ vcpu_run(vcpu);
+
+ switch (run->exit_reason) {
+ case KVM_EXIT_X86_RDMSR:
+ handle_rdmsr(run);
+ break;
+ case KVM_EXIT_X86_WRMSR:
+ handle_wrmsr(run);
+ break;
+ case KVM_EXIT_IO:
+ if (handle_ucall(vcpu))
+ goto done;
+ break;
+ }
+
+ }
+
+done:
+ TEST_ASSERT(msr_reads == 4, "Handled 4 rdmsr in user space");
+ TEST_ASSERT(msr_writes == 3, "Handled 3 wrmsr in user space");
+}
+
+KVM_ONE_VCPU_TEST(user_msr, msr_permission_bitmap, guest_code_permission_bitmap)
+{
+ struct kvm_vm *vm = vcpu->vm;
+ int rc;
+
+ rc = kvm_check_cap(KVM_CAP_X86_USER_SPACE_MSR);
+ TEST_ASSERT(rc, "KVM_CAP_X86_USER_SPACE_MSR is available");
+ vm_enable_cap(vm, KVM_CAP_X86_USER_SPACE_MSR, KVM_MSR_EXIT_REASON_FILTER);
+
+ rc = kvm_check_cap(KVM_CAP_X86_MSR_FILTER);
+ TEST_ASSERT(rc, "KVM_CAP_X86_MSR_FILTER is available");
+
+ vm_ioctl(vm, KVM_X86_SET_MSR_FILTER, &filter_fs);
+ run_guest_then_process_rdmsr(vcpu, MSR_FS_BASE);
+ TEST_ASSERT(run_guest_then_process_ucall(vcpu) == UCALL_SYNC,
+ "Expected ucall state to be UCALL_SYNC.");
+ vm_ioctl(vm, KVM_X86_SET_MSR_FILTER, &filter_gs);
+ run_guest_then_process_rdmsr(vcpu, MSR_GS_BASE);
+ run_guest_then_process_ucall_done(vcpu);
+}
+
+#define test_user_exit_msr_ioctl(vm, cmd, arg, flag, valid_mask) \
+({ \
+ int r = __vm_ioctl(vm, cmd, arg); \
+ \
+ if (flag & valid_mask) \
+ TEST_ASSERT(!r, __KVM_IOCTL_ERROR(#cmd, r)); \
+ else \
+ TEST_ASSERT(r == -1 && errno == EINVAL, \
+ "Wanted EINVAL for %s with flag = 0x%llx, got rc: %i errno: %i (%s)", \
+ #cmd, flag, r, errno, strerror(errno)); \
+})
+
+static void run_user_space_msr_flag_test(struct kvm_vm *vm)
+{
+ struct kvm_enable_cap cap = { .cap = KVM_CAP_X86_USER_SPACE_MSR };
+ int nflags = sizeof(cap.args[0]) * BITS_PER_BYTE;
+ int rc;
+ int i;
+
+ rc = kvm_check_cap(KVM_CAP_X86_USER_SPACE_MSR);
+ TEST_ASSERT(rc, "KVM_CAP_X86_USER_SPACE_MSR is available");
+
+ for (i = 0; i < nflags; i++) {
+ cap.args[0] = BIT_ULL(i);
+ test_user_exit_msr_ioctl(vm, KVM_ENABLE_CAP, &cap,
+ BIT_ULL(i), KVM_MSR_EXIT_REASON_VALID_MASK);
+ }
+}
+
+static void run_msr_filter_flag_test(struct kvm_vm *vm)
+{
+ u64 deny_bits = 0;
+ struct kvm_msr_filter filter = {
+ .flags = KVM_MSR_FILTER_DEFAULT_ALLOW,
+ .ranges = {
+ {
+ .flags = KVM_MSR_FILTER_READ,
+ .nmsrs = 1,
+ .base = 0,
+ .bitmap = (uint8_t *)&deny_bits,
+ },
+ },
+ };
+ int nflags;
+ int rc;
+ int i;
+
+ rc = kvm_check_cap(KVM_CAP_X86_MSR_FILTER);
+ TEST_ASSERT(rc, "KVM_CAP_X86_MSR_FILTER is available");
+
+ nflags = sizeof(filter.flags) * BITS_PER_BYTE;
+ for (i = 0; i < nflags; i++) {
+ filter.flags = BIT_ULL(i);
+ test_user_exit_msr_ioctl(vm, KVM_X86_SET_MSR_FILTER, &filter,
+ BIT_ULL(i), KVM_MSR_FILTER_VALID_MASK);
+ }
+
+ filter.flags = KVM_MSR_FILTER_DEFAULT_ALLOW;
+ nflags = sizeof(filter.ranges[0].flags) * BITS_PER_BYTE;
+ for (i = 0; i < nflags; i++) {
+ filter.ranges[0].flags = BIT_ULL(i);
+ test_user_exit_msr_ioctl(vm, KVM_X86_SET_MSR_FILTER, &filter,
+ BIT_ULL(i), KVM_MSR_FILTER_RANGE_VALID_MASK);
+ }
+}
+
+/* Test that attempts to write to the unused bits in a flag fail. */
+KVM_ONE_VCPU_TEST(user_msr, user_exit_msr_flags, NULL)
+{
+ struct kvm_vm *vm = vcpu->vm;
+
+ /* Test flags for KVM_CAP_X86_USER_SPACE_MSR. */
+ run_user_space_msr_flag_test(vm);
+
+ /* Test flags and range flags for KVM_X86_SET_MSR_FILTER. */
+ run_msr_filter_flag_test(vm);
+}
+
+int main(int argc, char *argv[])
+{
+ fep_available = kvm_is_forced_emulation_enabled();
+
+ return test_harness_run(argc, argv);
+}
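
The deny-filter setup above relies on the per-range bitmap convention: bit N of a range's bitmap covers MSR (base + N), a set bit lets KVM handle the access in-kernel, and a cleared bit denies it, which bounces the access to userspace once KVM_CAP_X86_USER_SPACE_MSR is enabled with the filter exit reason. A standalone sketch of the bit manipulation (the test's own deny_msr() indexes via a mask trick; this version uses the explicit base offset):

#include <stdio.h>
#include <stdint.h>
#include <string.h>

#define BITMAP_BYTES 0x600	/* matches KVM_MSR_FILTER_MAX_BITMAP_SIZE */

/* Clear the per-MSR bit: within a range, bit N covers MSR (base + N). */
static void deny_msr(uint8_t *bitmap, uint32_t base, uint32_t msr)
{
	uint32_t idx = msr - base;	/* bit index within this range */

	bitmap[idx / 8] &= ~(1u << (idx % 8));
}

int main(void)
{
	uint8_t bitmap[BITMAP_BYTES];
	uint32_t base = 0xc0000000;
	uint32_t msr = 0xc0000101;	/* MSR_GS_BASE */
	uint32_t idx = msr - base;

	memset(bitmap, 0xff, sizeof(bitmap));	/* allow all MSRs in range */
	deny_msr(bitmap, base, msr);		/* ...but trap this one */

	printf("bit %u is now %u\n", idx,
	       (unsigned)((bitmap[idx / 8] >> (idx % 8)) & 1));
	return 0;
}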
diff --git a/tools/testing/selftests/kvm/x86_64/vmx_apic_access_test.c b/tools/testing/selftests/kvm/x86_64/vmx_apic_access_test.c
new file mode 100644
index 000000000000..a81a24761aac
--- /dev/null
+++ b/tools/testing/selftests/kvm/x86_64/vmx_apic_access_test.c
@@ -0,0 +1,124 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * vmx_apic_access_test
+ *
+ * Copyright (C) 2020, Google LLC.
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2.
+ *
+ * The first subtest simply checks to see that an L2 guest can be
+ * launched with a valid APIC-access address that is backed by a
+ * page of L1 physical memory.
+ *
+ * The second subtest sets the APIC-access address to a (valid) L1
+ * physical address that is not backed by memory. KVM can't handle
+ * this situation, so resuming L2 should result in a KVM exit for
+ * internal error (emulation). This is not an architectural
+ * requirement. It is just a shortcoming of KVM. The internal error
+ * is unfortunate, but it's better than what used to happen!
+ */
+
+#include "test_util.h"
+#include "kvm_util.h"
+#include "processor.h"
+#include "vmx.h"
+
+#include <string.h>
+#include <sys/ioctl.h>
+
+#include "kselftest.h"
+
+static void l2_guest_code(void)
+{
+ /* Exit to L1 */
+ __asm__ __volatile__("vmcall");
+}
+
+static void l1_guest_code(struct vmx_pages *vmx_pages, unsigned long high_gpa)
+{
+#define L2_GUEST_STACK_SIZE 64
+ unsigned long l2_guest_stack[L2_GUEST_STACK_SIZE];
+ uint32_t control;
+
+ GUEST_ASSERT(prepare_for_vmx_operation(vmx_pages));
+ GUEST_ASSERT(load_vmcs(vmx_pages));
+
+ /* Prepare the VMCS for L2 execution. */
+ prepare_vmcs(vmx_pages, l2_guest_code,
+ &l2_guest_stack[L2_GUEST_STACK_SIZE]);
+ control = vmreadz(CPU_BASED_VM_EXEC_CONTROL);
+ control |= CPU_BASED_ACTIVATE_SECONDARY_CONTROLS;
+ vmwrite(CPU_BASED_VM_EXEC_CONTROL, control);
+ control = vmreadz(SECONDARY_VM_EXEC_CONTROL);
+ control |= SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES;
+ vmwrite(SECONDARY_VM_EXEC_CONTROL, control);
+ vmwrite(APIC_ACCESS_ADDR, vmx_pages->apic_access_gpa);
+
+ /* Try to launch L2 with the memory-backed APIC-access address. */
+ GUEST_SYNC(vmreadz(APIC_ACCESS_ADDR));
+ GUEST_ASSERT(!vmlaunch());
+ GUEST_ASSERT(vmreadz(VM_EXIT_REASON) == EXIT_REASON_VMCALL);
+
+ vmwrite(APIC_ACCESS_ADDR, high_gpa);
+
+ /* Try to resume L2 with the unbacked APIC-access address. */
+ GUEST_SYNC(vmreadz(APIC_ACCESS_ADDR));
+ GUEST_ASSERT(!vmresume());
+ GUEST_ASSERT(vmreadz(VM_EXIT_REASON) == EXIT_REASON_VMCALL);
+
+ GUEST_DONE();
+}
+
+int main(int argc, char *argv[])
+{
+ unsigned long apic_access_addr = ~0ul;
+ vm_vaddr_t vmx_pages_gva;
+ unsigned long high_gpa;
+ struct vmx_pages *vmx;
+ bool done = false;
+
+ struct kvm_vcpu *vcpu;
+ struct kvm_vm *vm;
+
+ TEST_REQUIRE(kvm_cpu_has(X86_FEATURE_VMX));
+
+ vm = vm_create_with_one_vcpu(&vcpu, l1_guest_code);
+
+ high_gpa = (vm->max_gfn - 1) << vm->page_shift;
+
+ vmx = vcpu_alloc_vmx(vm, &vmx_pages_gva);
+ prepare_virtualize_apic_accesses(vmx, vm);
+ vcpu_args_set(vcpu, 2, vmx_pages_gva, high_gpa);
+
+ while (!done) {
+ volatile struct kvm_run *run = vcpu->run;
+ struct ucall uc;
+
+ vcpu_run(vcpu);
+ if (apic_access_addr == high_gpa) {
+ TEST_ASSERT_KVM_EXIT_REASON(vcpu, KVM_EXIT_INTERNAL_ERROR);
+ TEST_ASSERT(run->internal.suberror ==
+ KVM_INTERNAL_ERROR_EMULATION,
+ "Got internal suberror other than KVM_INTERNAL_ERROR_EMULATION: %u",
+ run->internal.suberror);
+ break;
+ }
+ TEST_ASSERT_KVM_EXIT_REASON(vcpu, KVM_EXIT_IO);
+
+ switch (get_ucall(vcpu, &uc)) {
+ case UCALL_ABORT:
+ REPORT_GUEST_ASSERT(uc);
+ /* NOT REACHED */
+ case UCALL_SYNC:
+ apic_access_addr = uc.args[1];
+ break;
+ case UCALL_DONE:
+ done = true;
+ break;
+ default:
+ TEST_ASSERT(false, "Unknown ucall %lu", uc.cmd);
+ }
+ }
+ kvm_vm_free(vm);
+ return 0;
+}
diff --git a/tools/testing/selftests/kvm/x86_64/vmx_close_while_nested_test.c b/tools/testing/selftests/kvm/x86_64/vmx_close_while_nested_test.c
index fe40ade06a49..dad988351493 100644
--- a/tools/testing/selftests/kvm/x86_64/vmx_close_while_nested_test.c
+++ b/tools/testing/selftests/kvm/x86_64/vmx_close_while_nested_test.c
@@ -18,20 +18,15 @@
#include "kselftest.h"
-#define VCPU_ID 5
-
enum {
PORT_L0_EXIT = 0x2000,
};
-/* The virtual machine object. */
-static struct kvm_vm *vm;
-
static void l2_guest_code(void)
{
/* Exit to L0 */
- asm volatile("inb %%dx, %%al"
- : : [port] "d" (PORT_L0_EXIT) : "rax");
+ asm volatile("inb %%dx, %%al"
+ : : [port] "d" (PORT_L0_EXIT) : "rax");
}
static void l1_guest_code(struct vmx_pages *vmx_pages)
@@ -53,32 +48,30 @@ static void l1_guest_code(struct vmx_pages *vmx_pages)
int main(int argc, char *argv[])
{
vm_vaddr_t vmx_pages_gva;
+ struct kvm_vcpu *vcpu;
+ struct kvm_vm *vm;
- nested_vmx_check_supported();
+ TEST_REQUIRE(kvm_cpu_has(X86_FEATURE_VMX));
- vm = vm_create_default(VCPU_ID, 0, (void *) l1_guest_code);
- vcpu_set_cpuid(vm, VCPU_ID, kvm_get_supported_cpuid());
+ vm = vm_create_with_one_vcpu(&vcpu, l1_guest_code);
/* Allocate VMX pages and shared descriptors (vmx_pages). */
vcpu_alloc_vmx(vm, &vmx_pages_gva);
- vcpu_args_set(vm, VCPU_ID, 1, vmx_pages_gva);
+ vcpu_args_set(vcpu, 1, vmx_pages_gva);
for (;;) {
- volatile struct kvm_run *run = vcpu_state(vm, VCPU_ID);
+ volatile struct kvm_run *run = vcpu->run;
struct ucall uc;
- vcpu_run(vm, VCPU_ID);
- TEST_ASSERT(run->exit_reason == KVM_EXIT_IO,
- "Got exit_reason other than KVM_EXIT_IO: %u (%s)\n",
- run->exit_reason,
- exit_reason_str(run->exit_reason));
+ vcpu_run(vcpu);
+ TEST_ASSERT_KVM_EXIT_REASON(vcpu, KVM_EXIT_IO);
if (run->io.port == PORT_L0_EXIT)
break;
- switch (get_ucall(vm, VCPU_ID, &uc)) {
+ switch (get_ucall(vcpu, &uc)) {
case UCALL_ABORT:
- TEST_FAIL("%s", (const char *)uc.args[0]);
+ REPORT_GUEST_ASSERT(uc);
/* NOT REACHED */
default:
TEST_FAIL("Unknown ucall %lu", uc.cmd);
diff --git a/tools/testing/selftests/kvm/x86_64/vmx_dirty_log_test.c b/tools/testing/selftests/kvm/x86_64/vmx_dirty_log_test.c
index e894a638a155..7f6f5f23fb9b 100644
--- a/tools/testing/selftests/kvm/x86_64/vmx_dirty_log_test.c
+++ b/tools/testing/selftests/kvm/x86_64/vmx_dirty_log_test.c
@@ -17,8 +17,6 @@
#include "processor.h"
#include "vmx.h"
-#define VCPU_ID 1
-
/* The memory slot index to track dirty pages */
#define TEST_MEM_SLOT_INDEX 1
#define TEST_MEM_PAGES 3
@@ -73,19 +71,18 @@ int main(int argc, char *argv[])
unsigned long *bmap;
uint64_t *host_test_mem;
+ struct kvm_vcpu *vcpu;
struct kvm_vm *vm;
- struct kvm_run *run;
struct ucall uc;
bool done = false;
- nested_vmx_check_supported();
+ TEST_REQUIRE(kvm_cpu_has(X86_FEATURE_VMX));
+ TEST_REQUIRE(kvm_cpu_has_ept());
/* Create VM */
- vm = vm_create_default(VCPU_ID, 0, l1_guest_code);
- vcpu_set_cpuid(vm, VCPU_ID, kvm_get_supported_cpuid());
+ vm = vm_create_with_one_vcpu(&vcpu, l1_guest_code);
vmx = vcpu_alloc_vmx(vm, &vmx_pages_gva);
- vcpu_args_set(vm, VCPU_ID, 1, vmx_pages_gva);
- run = vcpu_state(vm, VCPU_ID);
+ vcpu_args_set(vcpu, 1, vmx_pages_gva);
/* Add an extra memory slot for testing dirty logging */
vm_userspace_mem_region_add(vm, VM_MEM_SRC_ANONYMOUS,
@@ -98,7 +95,7 @@ int main(int argc, char *argv[])
* Add an identity map for GVA range [0xc0000000, 0xc0002000). This
* affects both L1 and L2. However...
*/
- virt_map(vm, GUEST_TEST_MEM, GUEST_TEST_MEM, TEST_MEM_PAGES, 0);
+ virt_map(vm, GUEST_TEST_MEM, GUEST_TEST_MEM, TEST_MEM_PAGES);
/*
* ... pages in the L2 GPA range [0xc0001000, 0xc0003000) will map to
@@ -108,25 +105,21 @@ int main(int argc, char *argv[])
* meaning after the last call to virt_map.
*/
prepare_eptp(vmx, vm, 0);
- nested_map_memslot(vmx, vm, 0, 0);
- nested_map(vmx, vm, NESTED_TEST_MEM1, GUEST_TEST_MEM, 4096, 0);
- nested_map(vmx, vm, NESTED_TEST_MEM2, GUEST_TEST_MEM, 4096, 0);
+ nested_map_memslot(vmx, vm, 0);
+ nested_map(vmx, vm, NESTED_TEST_MEM1, GUEST_TEST_MEM, 4096);
+ nested_map(vmx, vm, NESTED_TEST_MEM2, GUEST_TEST_MEM, 4096);
- bmap = bitmap_alloc(TEST_MEM_PAGES);
+ bmap = bitmap_zalloc(TEST_MEM_PAGES);
host_test_mem = addr_gpa2hva(vm, GUEST_TEST_MEM);
while (!done) {
memset(host_test_mem, 0xaa, TEST_MEM_PAGES * 4096);
- _vcpu_run(vm, VCPU_ID);
- TEST_ASSERT(run->exit_reason == KVM_EXIT_IO,
- "Unexpected exit reason: %u (%s),\n",
- run->exit_reason,
- exit_reason_str(run->exit_reason));
+ vcpu_run(vcpu);
+ TEST_ASSERT_KVM_EXIT_REASON(vcpu, KVM_EXIT_IO);
- switch (get_ucall(vm, VCPU_ID, &uc)) {
+ switch (get_ucall(vcpu, &uc)) {
case UCALL_ABORT:
- TEST_FAIL("%s at %s:%ld", (const char *)uc.args[0],
- __FILE__, uc.args[1]);
+ REPORT_GUEST_ASSERT(uc);
/* NOT REACHED */
case UCALL_SYNC:
/*
@@ -135,17 +128,17 @@ int main(int argc, char *argv[])
*/
kvm_vm_get_dirty_log(vm, TEST_MEM_SLOT_INDEX, bmap);
if (uc.args[1]) {
- TEST_ASSERT(test_bit(0, bmap), "Page 0 incorrectly reported clean\n");
- TEST_ASSERT(host_test_mem[0] == 1, "Page 0 not written by guest\n");
+ TEST_ASSERT(test_bit(0, bmap), "Page 0 incorrectly reported clean");
+ TEST_ASSERT(host_test_mem[0] == 1, "Page 0 not written by guest");
} else {
- TEST_ASSERT(!test_bit(0, bmap), "Page 0 incorrectly reported dirty\n");
- TEST_ASSERT(host_test_mem[0] == 0xaaaaaaaaaaaaaaaaULL, "Page 0 written by guest\n");
+ TEST_ASSERT(!test_bit(0, bmap), "Page 0 incorrectly reported dirty");
+ TEST_ASSERT(host_test_mem[0] == 0xaaaaaaaaaaaaaaaaULL, "Page 0 written by guest");
}
- TEST_ASSERT(!test_bit(1, bmap), "Page 1 incorrectly reported dirty\n");
- TEST_ASSERT(host_test_mem[4096 / 8] == 0xaaaaaaaaaaaaaaaaULL, "Page 1 written by guest\n");
- TEST_ASSERT(!test_bit(2, bmap), "Page 2 incorrectly reported dirty\n");
- TEST_ASSERT(host_test_mem[8192 / 8] == 0xaaaaaaaaaaaaaaaaULL, "Page 2 written by guest\n");
+ TEST_ASSERT(!test_bit(1, bmap), "Page 1 incorrectly reported dirty");
+ TEST_ASSERT(host_test_mem[4096 / 8] == 0xaaaaaaaaaaaaaaaaULL, "Page 1 written by guest");
+ TEST_ASSERT(!test_bit(2, bmap), "Page 2 incorrectly reported dirty");
+ TEST_ASSERT(host_test_mem[8192 / 8] == 0xaaaaaaaaaaaaaaaaULL, "Page 2 written by guest");
break;
case UCALL_DONE:
done = true;
diff --git a/tools/testing/selftests/kvm/x86_64/vmx_exception_with_invalid_guest_state.c b/tools/testing/selftests/kvm/x86_64/vmx_exception_with_invalid_guest_state.c
new file mode 100644
index 000000000000..fad3634fd9eb
--- /dev/null
+++ b/tools/testing/selftests/kvm/x86_64/vmx_exception_with_invalid_guest_state.c
@@ -0,0 +1,145 @@
+// SPDX-License-Identifier: GPL-2.0-only
+#include "test_util.h"
+#include "kvm_util.h"
+#include "processor.h"
+
+#include <signal.h>
+#include <string.h>
+#include <sys/ioctl.h>
+#include <sys/time.h>
+
+#include "kselftest.h"
+
+static void guest_ud_handler(struct ex_regs *regs)
+{
+ /* Loop on the ud2 until guest state is made invalid. */
+}
+
+static void guest_code(void)
+{
+ asm volatile("ud2");
+}
+
+static void __run_vcpu_with_invalid_state(struct kvm_vcpu *vcpu)
+{
+ struct kvm_run *run = vcpu->run;
+
+ vcpu_run(vcpu);
+
+ TEST_ASSERT_KVM_EXIT_REASON(vcpu, KVM_EXIT_INTERNAL_ERROR);
+ TEST_ASSERT(run->emulation_failure.suberror == KVM_INTERNAL_ERROR_EMULATION,
+ "Expected emulation failure, got %d",
+ run->emulation_failure.suberror);
+}
+
+static void run_vcpu_with_invalid_state(struct kvm_vcpu *vcpu)
+{
+ /*
+ * Always run twice to verify KVM handles the case where _KVM_ queues
+ * an exception with invalid state and then exits to userspace, i.e.
+ * that KVM doesn't explode if userspace ignores the initial error.
+ */
+ __run_vcpu_with_invalid_state(vcpu);
+ __run_vcpu_with_invalid_state(vcpu);
+}
+
+static void set_timer(void)
+{
+ struct itimerval timer;
+
+ timer.it_value.tv_sec = 0;
+ timer.it_value.tv_usec = 200;
+ timer.it_interval = timer.it_value;
+ TEST_ASSERT_EQ(setitimer(ITIMER_REAL, &timer, NULL), 0);
+}
+
+static void set_or_clear_invalid_guest_state(struct kvm_vcpu *vcpu, bool set)
+{
+ static struct kvm_sregs sregs;
+
+ if (!sregs.cr0)
+ vcpu_sregs_get(vcpu, &sregs);
+ sregs.tr.unusable = !!set;
+ vcpu_sregs_set(vcpu, &sregs);
+}
+
+static void set_invalid_guest_state(struct kvm_vcpu *vcpu)
+{
+ set_or_clear_invalid_guest_state(vcpu, true);
+}
+
+static void clear_invalid_guest_state(struct kvm_vcpu *vcpu)
+{
+ set_or_clear_invalid_guest_state(vcpu, false);
+}
+
+static struct kvm_vcpu *get_set_sigalrm_vcpu(struct kvm_vcpu *__vcpu)
+{
+ static struct kvm_vcpu *vcpu = NULL;
+
+ if (__vcpu)
+ vcpu = __vcpu;
+ return vcpu;
+}
+
+static void sigalrm_handler(int sig)
+{
+ struct kvm_vcpu *vcpu = get_set_sigalrm_vcpu(NULL);
+ struct kvm_vcpu_events events;
+
+ TEST_ASSERT(sig == SIGALRM, "Unexpected signal = %d", sig);
+
+ vcpu_events_get(vcpu, &events);
+
+ /*
+	 * If an exception is pending, attempt KVM_RUN with invalid guest state,
+ * otherwise rearm the timer and keep doing so until the timer fires
+ * between KVM queueing an exception and re-entering the guest.
+ */
+ if (events.exception.pending) {
+ set_invalid_guest_state(vcpu);
+ run_vcpu_with_invalid_state(vcpu);
+ } else {
+ set_timer();
+ }
+}
+
+int main(int argc, char *argv[])
+{
+ struct kvm_vcpu *vcpu;
+ struct kvm_vm *vm;
+
+ TEST_REQUIRE(host_cpu_is_intel);
+ TEST_REQUIRE(!vm_is_unrestricted_guest(NULL));
+
+ vm = vm_create_with_one_vcpu(&vcpu, guest_code);
+ get_set_sigalrm_vcpu(vcpu);
+
+ vm_init_descriptor_tables(vm);
+ vcpu_init_descriptor_tables(vcpu);
+
+ vm_install_exception_handler(vm, UD_VECTOR, guest_ud_handler);
+
+ /*
+	 * Stuff invalid guest state by making TR unusable.  Without
+	 * unrestricted guest, KVM must emulate the guest and can't fully do
+	 * so, thus the next KVM_RUN should exit with an emulation error.
+ */
+ set_invalid_guest_state(vcpu);
+ run_vcpu_with_invalid_state(vcpu);
+
+ /*
+ * Verify KVM also handles the case where userspace gains control while
+ * an exception is pending and stuffs invalid state. Run with valid
+ * guest state and a timer firing every 200us, and attempt to enter the
+ * guest with invalid state when the handler interrupts KVM with an
+ * exception pending.
+ */
+ clear_invalid_guest_state(vcpu);
+ TEST_ASSERT(signal(SIGALRM, sigalrm_handler) != SIG_ERR,
+ "Failed to register SIGALRM handler, errno = %d (%s)",
+ errno, strerror(errno));
+
+ set_timer();
+ run_vcpu_with_invalid_state(vcpu);
+}
diff --git a/tools/testing/selftests/kvm/x86_64/vmx_invalid_nested_guest_state.c b/tools/testing/selftests/kvm/x86_64/vmx_invalid_nested_guest_state.c
new file mode 100644
index 000000000000..a100ee5f0009
--- /dev/null
+++ b/tools/testing/selftests/kvm/x86_64/vmx_invalid_nested_guest_state.c
@@ -0,0 +1,103 @@
+// SPDX-License-Identifier: GPL-2.0-only
+#include "test_util.h"
+#include "kvm_util.h"
+#include "processor.h"
+#include "vmx.h"
+
+#include <string.h>
+#include <sys/ioctl.h>
+
+#include "kselftest.h"
+
+#define ARBITRARY_IO_PORT 0x2000
+
+static struct kvm_vm *vm;
+
+static void l2_guest_code(void)
+{
+ /*
+ * Generate an exit to L0 userspace, i.e. main(), via I/O to an
+ * arbitrary port.
+ */
+ asm volatile("inb %%dx, %%al"
+ : : [port] "d" (ARBITRARY_IO_PORT) : "rax");
+}
+
+static void l1_guest_code(struct vmx_pages *vmx_pages)
+{
+#define L2_GUEST_STACK_SIZE 64
+ unsigned long l2_guest_stack[L2_GUEST_STACK_SIZE];
+
+ GUEST_ASSERT(prepare_for_vmx_operation(vmx_pages));
+ GUEST_ASSERT(load_vmcs(vmx_pages));
+
+ /* Prepare the VMCS for L2 execution. */
+ prepare_vmcs(vmx_pages, l2_guest_code,
+ &l2_guest_stack[L2_GUEST_STACK_SIZE]);
+
+ /*
+ * L2 must be run without unrestricted guest, verify that the selftests
+ * library hasn't enabled it. Because KVM selftests jump directly to
+ * 64-bit mode, unrestricted guest support isn't required.
+ */
+ GUEST_ASSERT(!(vmreadz(CPU_BASED_VM_EXEC_CONTROL) & CPU_BASED_ACTIVATE_SECONDARY_CONTROLS) ||
+ !(vmreadz(SECONDARY_VM_EXEC_CONTROL) & SECONDARY_EXEC_UNRESTRICTED_GUEST));
+
+ GUEST_ASSERT(!vmlaunch());
+
+ /* L2 should triple fault after main() stuffs invalid guest state. */
+ GUEST_ASSERT(vmreadz(VM_EXIT_REASON) == EXIT_REASON_TRIPLE_FAULT);
+ GUEST_DONE();
+}
+
+int main(int argc, char *argv[])
+{
+ vm_vaddr_t vmx_pages_gva;
+ struct kvm_sregs sregs;
+ struct kvm_vcpu *vcpu;
+ struct kvm_run *run;
+ struct ucall uc;
+
+ TEST_REQUIRE(kvm_cpu_has(X86_FEATURE_VMX));
+
+ vm = vm_create_with_one_vcpu(&vcpu, l1_guest_code);
+
+ /* Allocate VMX pages and shared descriptors (vmx_pages). */
+ vcpu_alloc_vmx(vm, &vmx_pages_gva);
+ vcpu_args_set(vcpu, 1, vmx_pages_gva);
+
+ vcpu_run(vcpu);
+
+ run = vcpu->run;
+
+ /*
+ * The first exit to L0 userspace should be an I/O access from L2.
+ * Running L1 should launch L2 without triggering an exit to userspace.
+ */
+ TEST_ASSERT_KVM_EXIT_REASON(vcpu, KVM_EXIT_IO);
+
+ TEST_ASSERT(run->io.port == ARBITRARY_IO_PORT,
+ "Expected IN from port %d from L2, got port %d",
+ ARBITRARY_IO_PORT, run->io.port);
+
+ /*
+	 * Stuff invalid guest state for L2 by making TR unusable. The next
+ * KVM_RUN should induce a TRIPLE_FAULT in L2 as KVM doesn't support
+ * emulating invalid guest state for L2.
+ */
+ memset(&sregs, 0, sizeof(sregs));
+ vcpu_sregs_get(vcpu, &sregs);
+ sregs.tr.unusable = 1;
+ vcpu_sregs_set(vcpu, &sregs);
+
+ vcpu_run(vcpu);
+
+ switch (get_ucall(vcpu, &uc)) {
+ case UCALL_DONE:
+ break;
+ case UCALL_ABORT:
+ REPORT_GUEST_ASSERT(uc);
+ default:
+ TEST_FAIL("Unexpected ucall: %lu", uc.cmd);
+ }
+}
diff --git a/tools/testing/selftests/kvm/x86_64/vmx_msrs_test.c b/tools/testing/selftests/kvm/x86_64/vmx_msrs_test.c
new file mode 100644
index 000000000000..90720b6205f4
--- /dev/null
+++ b/tools/testing/selftests/kvm/x86_64/vmx_msrs_test.c
@@ -0,0 +1,131 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * VMX control MSR test
+ *
+ * Copyright (C) 2022 Google LLC.
+ *
+ * Tests for KVM ownership of bits in the VMX entry/exit control MSRs. Checks
+ * that KVM will set owned bits where appropriate, and will not if
+ * KVM_X86_QUIRK_TWEAK_VMX_CTRL_MSRS is disabled.
+ */
+#include <linux/bitmap.h>
+#include "kvm_util.h"
+#include "vmx.h"
+
+static void vmx_fixed1_msr_test(struct kvm_vcpu *vcpu, uint32_t msr_index,
+ uint64_t mask)
+{
+ uint64_t val = vcpu_get_msr(vcpu, msr_index);
+ uint64_t bit;
+
+ mask &= val;
+
+ for_each_set_bit(bit, &mask, 64) {
+ vcpu_set_msr(vcpu, msr_index, val & ~BIT_ULL(bit));
+ vcpu_set_msr(vcpu, msr_index, val);
+ }
+}
+
+static void vmx_fixed0_msr_test(struct kvm_vcpu *vcpu, uint32_t msr_index,
+ uint64_t mask)
+{
+ uint64_t val = vcpu_get_msr(vcpu, msr_index);
+ uint64_t bit;
+
+ mask = ~mask | val;
+
+ for_each_clear_bit(bit, &mask, 64) {
+ vcpu_set_msr(vcpu, msr_index, val | BIT_ULL(bit));
+ vcpu_set_msr(vcpu, msr_index, val);
+ }
+}
+
+static void vmx_fixed0and1_msr_test(struct kvm_vcpu *vcpu, uint32_t msr_index)
+{
+ vmx_fixed0_msr_test(vcpu, msr_index, GENMASK_ULL(31, 0));
+ vmx_fixed1_msr_test(vcpu, msr_index, GENMASK_ULL(63, 32));
+}
+
+static void vmx_save_restore_msrs_test(struct kvm_vcpu *vcpu)
+{
+ vcpu_set_msr(vcpu, MSR_IA32_VMX_VMCS_ENUM, 0);
+ vcpu_set_msr(vcpu, MSR_IA32_VMX_VMCS_ENUM, -1ull);
+
+ vmx_fixed1_msr_test(vcpu, MSR_IA32_VMX_BASIC,
+ BIT_ULL(49) | BIT_ULL(54) | BIT_ULL(55));
+
+ vmx_fixed1_msr_test(vcpu, MSR_IA32_VMX_MISC,
+ BIT_ULL(5) | GENMASK_ULL(8, 6) | BIT_ULL(14) |
+ BIT_ULL(15) | BIT_ULL(28) | BIT_ULL(29) | BIT_ULL(30));
+
+ vmx_fixed0and1_msr_test(vcpu, MSR_IA32_VMX_PROCBASED_CTLS2);
+ vmx_fixed1_msr_test(vcpu, MSR_IA32_VMX_EPT_VPID_CAP, -1ull);
+ vmx_fixed0and1_msr_test(vcpu, MSR_IA32_VMX_TRUE_PINBASED_CTLS);
+ vmx_fixed0and1_msr_test(vcpu, MSR_IA32_VMX_TRUE_PROCBASED_CTLS);
+ vmx_fixed0and1_msr_test(vcpu, MSR_IA32_VMX_TRUE_EXIT_CTLS);
+ vmx_fixed0and1_msr_test(vcpu, MSR_IA32_VMX_TRUE_ENTRY_CTLS);
+ vmx_fixed1_msr_test(vcpu, MSR_IA32_VMX_VMFUNC, -1ull);
+}
+
+static void __ia32_feature_control_msr_test(struct kvm_vcpu *vcpu,
+ uint64_t msr_bit,
+ struct kvm_x86_cpu_feature feature)
+{
+ uint64_t val;
+
+ vcpu_clear_cpuid_feature(vcpu, feature);
+
+ val = vcpu_get_msr(vcpu, MSR_IA32_FEAT_CTL);
+ vcpu_set_msr(vcpu, MSR_IA32_FEAT_CTL, val | msr_bit | FEAT_CTL_LOCKED);
+ vcpu_set_msr(vcpu, MSR_IA32_FEAT_CTL, (val & ~msr_bit) | FEAT_CTL_LOCKED);
+ vcpu_set_msr(vcpu, MSR_IA32_FEAT_CTL, val | msr_bit | FEAT_CTL_LOCKED);
+ vcpu_set_msr(vcpu, MSR_IA32_FEAT_CTL, (val & ~msr_bit) | FEAT_CTL_LOCKED);
+ vcpu_set_msr(vcpu, MSR_IA32_FEAT_CTL, val);
+
+ if (!kvm_cpu_has(feature))
+ return;
+
+ vcpu_set_cpuid_feature(vcpu, feature);
+}
+
+static void ia32_feature_control_msr_test(struct kvm_vcpu *vcpu)
+{
+ uint64_t supported_bits = FEAT_CTL_LOCKED |
+ FEAT_CTL_VMX_ENABLED_INSIDE_SMX |
+ FEAT_CTL_VMX_ENABLED_OUTSIDE_SMX |
+ FEAT_CTL_SGX_LC_ENABLED |
+ FEAT_CTL_SGX_ENABLED |
+ FEAT_CTL_LMCE_ENABLED;
+ int bit, r;
+
+ __ia32_feature_control_msr_test(vcpu, FEAT_CTL_VMX_ENABLED_INSIDE_SMX, X86_FEATURE_SMX);
+ __ia32_feature_control_msr_test(vcpu, FEAT_CTL_VMX_ENABLED_INSIDE_SMX, X86_FEATURE_VMX);
+ __ia32_feature_control_msr_test(vcpu, FEAT_CTL_VMX_ENABLED_OUTSIDE_SMX, X86_FEATURE_VMX);
+ __ia32_feature_control_msr_test(vcpu, FEAT_CTL_SGX_LC_ENABLED, X86_FEATURE_SGX_LC);
+ __ia32_feature_control_msr_test(vcpu, FEAT_CTL_SGX_LC_ENABLED, X86_FEATURE_SGX);
+ __ia32_feature_control_msr_test(vcpu, FEAT_CTL_SGX_ENABLED, X86_FEATURE_SGX);
+ __ia32_feature_control_msr_test(vcpu, FEAT_CTL_LMCE_ENABLED, X86_FEATURE_MCE);
+
+ for_each_clear_bit(bit, &supported_bits, 64) {
+ r = _vcpu_set_msr(vcpu, MSR_IA32_FEAT_CTL, BIT(bit));
+ TEST_ASSERT(r == 0,
+ "Setting reserved bit %d in IA32_FEATURE_CONTROL should fail", bit);
+ }
+}
+
+int main(void)
+{
+ struct kvm_vcpu *vcpu;
+ struct kvm_vm *vm;
+
+ TEST_REQUIRE(kvm_has_cap(KVM_CAP_DISABLE_QUIRKS2));
+ TEST_REQUIRE(kvm_cpu_has(X86_FEATURE_VMX));
+
+ /* No need to actually do KVM_RUN, thus no guest code. */
+ vm = vm_create_with_one_vcpu(&vcpu, NULL);
+
+ vmx_save_restore_msrs_test(vcpu);
+ ia32_feature_control_msr_test(vcpu);
+
+ kvm_vm_free(vm);
+}
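
Aside (not part of the patch): the vmx_fixed0and1_msr_test() calls above follow
the VMX control-MSR encoding in which bits 31:0 report the allowed-0 settings (a
set bit means the corresponding control must be 1) and bits 63:32 report the
allowed-1 settings (a clear bit means the control must be 0).  A small sketch of
how a desired 32-bit control word would be validated against such an MSR:

/* Sketch only: check a desired control value against a VMX capability MSR. */
static bool vmx_ctrl_is_valid(uint64_t cap_msr, uint32_t want)
{
	uint32_t must_be_one = (uint32_t)cap_msr;		/* allowed-0 settings */
	uint32_t may_be_one = (uint32_t)(cap_msr >> 32);	/* allowed-1 settings */

	return !(must_be_one & ~want) && !(want & ~may_be_one);
}
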
diff --git a/tools/testing/selftests/kvm/x86_64/vmx_nested_tsc_scaling_test.c b/tools/testing/selftests/kvm/x86_64/vmx_nested_tsc_scaling_test.c
new file mode 100644
index 000000000000..1759fa5cb3f2
--- /dev/null
+++ b/tools/testing/selftests/kvm/x86_64/vmx_nested_tsc_scaling_test.c
@@ -0,0 +1,206 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * vmx_nested_tsc_scaling_test
+ *
+ * Copyright 2021 Amazon.com, Inc. or its affiliates. All Rights Reserved.
+ *
+ * This test case verifies that nested TSC scaling behaves as expected when
+ * both L1 and L2 are scaled using different ratios. For this test we scale
+ * L1 down and scale L2 up.
+ */
+
+#include <time.h>
+
+#include "kvm_util.h"
+#include "vmx.h"
+#include "kselftest.h"
+
+/* L2 is scaled up (from L1's perspective) by this factor */
+#define L2_SCALE_FACTOR 4ULL
+
+#define TSC_OFFSET_L2 ((uint64_t) -33125236320908)
+#define TSC_MULTIPLIER_L2 (L2_SCALE_FACTOR << 48)
+
+#define L2_GUEST_STACK_SIZE 64
+
+enum { USLEEP, UCHECK_L1, UCHECK_L2 };
+#define GUEST_SLEEP(sec) ucall(UCALL_SYNC, 2, USLEEP, sec)
+#define GUEST_CHECK(level, freq) ucall(UCALL_SYNC, 2, level, freq)
+
+
+/*
+ * This function checks whether the "actual" TSC frequency of a guest matches
+ * its expected frequency. In order to account for delays in taking the TSC
+ * measurements, a difference of 1% between the actual and the expected value
+ * is tolerated.
+ */
+static void compare_tsc_freq(uint64_t actual, uint64_t expected)
+{
+ uint64_t tolerance, thresh_low, thresh_high;
+
+ tolerance = expected / 100;
+ thresh_low = expected - tolerance;
+ thresh_high = expected + tolerance;
+
+ TEST_ASSERT(thresh_low < actual,
+ "TSC freq is expected to be between %"PRIu64" and %"PRIu64
+ " but it actually is %"PRIu64,
+ thresh_low, thresh_high, actual);
+ TEST_ASSERT(thresh_high > actual,
+ "TSC freq is expected to be between %"PRIu64" and %"PRIu64
+ " but it actually is %"PRIu64,
+ thresh_low, thresh_high, actual);
+}
+
+static void check_tsc_freq(int level)
+{
+ uint64_t tsc_start, tsc_end, tsc_freq;
+
+ /*
+ * Reading the TSC twice with about a second's difference should give
+ * us an approximation of the TSC frequency from the guest's
+ * perspective. Now, this won't be completely accurate, but it should
+ * be good enough for the purposes of this test.
+ */
+ tsc_start = rdmsr(MSR_IA32_TSC);
+ GUEST_SLEEP(1);
+ tsc_end = rdmsr(MSR_IA32_TSC);
+
+ tsc_freq = tsc_end - tsc_start;
+
+ GUEST_CHECK(level, tsc_freq);
+}
+
+static void l2_guest_code(void)
+{
+ check_tsc_freq(UCHECK_L2);
+
+ /* exit to L1 */
+ __asm__ __volatile__("vmcall");
+}
+
+static void l1_guest_code(struct vmx_pages *vmx_pages)
+{
+ unsigned long l2_guest_stack[L2_GUEST_STACK_SIZE];
+ uint32_t control;
+
+ /* check that L1's frequency looks alright before launching L2 */
+ check_tsc_freq(UCHECK_L1);
+
+ GUEST_ASSERT(prepare_for_vmx_operation(vmx_pages));
+ GUEST_ASSERT(load_vmcs(vmx_pages));
+
+ /* prepare the VMCS for L2 execution */
+ prepare_vmcs(vmx_pages, l2_guest_code, &l2_guest_stack[L2_GUEST_STACK_SIZE]);
+
+ /* enable TSC offsetting and TSC scaling for L2 */
+ control = vmreadz(CPU_BASED_VM_EXEC_CONTROL);
+ control |= CPU_BASED_USE_MSR_BITMAPS | CPU_BASED_USE_TSC_OFFSETTING;
+ vmwrite(CPU_BASED_VM_EXEC_CONTROL, control);
+
+ control = vmreadz(SECONDARY_VM_EXEC_CONTROL);
+ control |= SECONDARY_EXEC_TSC_SCALING;
+ vmwrite(SECONDARY_VM_EXEC_CONTROL, control);
+
+ vmwrite(TSC_OFFSET, TSC_OFFSET_L2);
+ vmwrite(TSC_MULTIPLIER, TSC_MULTIPLIER_L2);
+ vmwrite(TSC_MULTIPLIER_HIGH, TSC_MULTIPLIER_L2 >> 32);
+
+ /* launch L2 */
+ GUEST_ASSERT(!vmlaunch());
+ GUEST_ASSERT(vmreadz(VM_EXIT_REASON) == EXIT_REASON_VMCALL);
+
+ /* check that L1's frequency still looks good */
+ check_tsc_freq(UCHECK_L1);
+
+ GUEST_DONE();
+}
+
+int main(int argc, char *argv[])
+{
+ struct kvm_vcpu *vcpu;
+ struct kvm_vm *vm;
+ vm_vaddr_t vmx_pages_gva;
+
+ uint64_t tsc_start, tsc_end;
+ uint64_t tsc_khz;
+ uint64_t l1_scale_factor;
+ uint64_t l0_tsc_freq = 0;
+ uint64_t l1_tsc_freq = 0;
+ uint64_t l2_tsc_freq = 0;
+
+ TEST_REQUIRE(kvm_cpu_has(X86_FEATURE_VMX));
+ TEST_REQUIRE(kvm_has_cap(KVM_CAP_TSC_CONTROL));
+ TEST_REQUIRE(sys_clocksource_is_based_on_tsc());
+
+ /*
+ * We set L1's scale factor to be a random number from 2 to 10.
+ * Ideally we would do the same for L2's factor but that one is
+ * referenced by both main() and l1_guest_code() and using a global
+ * variable does not work.
+ */
+ srand(time(NULL));
+ l1_scale_factor = (rand() % 9) + 2;
+ printf("L1's scale down factor is: %"PRIu64"\n", l1_scale_factor);
+ printf("L2's scale up factor is: %llu\n", L2_SCALE_FACTOR);
+
+ tsc_start = rdtsc();
+ sleep(1);
+ tsc_end = rdtsc();
+
+ l0_tsc_freq = tsc_end - tsc_start;
+ printf("real TSC frequency is around: %"PRIu64"\n", l0_tsc_freq);
+
+ vm = vm_create_with_one_vcpu(&vcpu, l1_guest_code);
+ vcpu_alloc_vmx(vm, &vmx_pages_gva);
+ vcpu_args_set(vcpu, 1, vmx_pages_gva);
+
+ tsc_khz = __vcpu_ioctl(vcpu, KVM_GET_TSC_KHZ, NULL);
+ TEST_ASSERT(tsc_khz != -1, "vcpu ioctl KVM_GET_TSC_KHZ failed");
+
+ /* scale down L1's TSC frequency */
+ vcpu_ioctl(vcpu, KVM_SET_TSC_KHZ, (void *) (tsc_khz / l1_scale_factor));
+
+ for (;;) {
+ struct ucall uc;
+
+ vcpu_run(vcpu);
+ TEST_ASSERT_KVM_EXIT_REASON(vcpu, KVM_EXIT_IO);
+
+ switch (get_ucall(vcpu, &uc)) {
+ case UCALL_ABORT:
+ REPORT_GUEST_ASSERT(uc);
+ case UCALL_SYNC:
+ switch (uc.args[0]) {
+ case USLEEP:
+ sleep(uc.args[1]);
+ break;
+ case UCHECK_L1:
+ l1_tsc_freq = uc.args[1];
+ printf("L1's TSC frequency is around: %"PRIu64
+ "\n", l1_tsc_freq);
+
+ compare_tsc_freq(l1_tsc_freq,
+ l0_tsc_freq / l1_scale_factor);
+ break;
+ case UCHECK_L2:
+ l2_tsc_freq = uc.args[1];
+ printf("L2's TSC frequency is around: %"PRIu64
+ "\n", l2_tsc_freq);
+
+ compare_tsc_freq(l2_tsc_freq,
+ l1_tsc_freq * L2_SCALE_FACTOR);
+ break;
+ }
+ break;
+ case UCALL_DONE:
+ goto done;
+ default:
+ TEST_FAIL("Unknown ucall %lu", uc.cmd);
+ }
+ }
+
+done:
+ kvm_vm_free(vm);
+ return 0;
+}
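
Aside (not part of the patch): TSC_MULTIPLIER_L2 above encodes the scale factor
in the architectural fixed-point format with 48 fractional bits, i.e.
L2_SCALE_FACTOR << 48 scales L2's TSC by exactly L2_SCALE_FACTOR relative to L1,
which is what the UCHECK_L2 comparison expects.  A sketch of the scaling step,
using a 128-bit intermediate to avoid overflow; the constant TSC_OFFSET_L2 shifts
the raw readings but cancels out of the frequency deltas taken by
check_tsc_freq():

/* Sketch only: apply a TSC multiplier that has 48 fractional bits. */
static uint64_t scale_tsc(uint64_t tsc, uint64_t multiplier)
{
	return (uint64_t)(((unsigned __int128)tsc * multiplier) >> 48);
}
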
diff --git a/tools/testing/selftests/kvm/x86_64/vmx_pmu_caps_test.c b/tools/testing/selftests/kvm/x86_64/vmx_pmu_caps_test.c
new file mode 100644
index 000000000000..ea0cb3cae0f7
--- /dev/null
+++ b/tools/testing/selftests/kvm/x86_64/vmx_pmu_caps_test.c
@@ -0,0 +1,228 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Test for VMX-pmu perf capability msr
+ *
+ * Copyright (C) 2021 Intel Corporation
+ *
+ * Test the effect of various CPUID settings on the
+ * MSR_IA32_PERF_CAPABILITIES MSR.  Check that what we write with
+ * KVM_SET_MSR is _not_ modified by the guest, that it can be
+ * retrieved with KVM_GET_MSR, and that invalid LBR formats are
+ * rejected.
+ */
+#define _GNU_SOURCE /* for program_invocation_short_name */
+#include <sys/ioctl.h>
+
+#include <linux/bitmap.h>
+
+#include "kvm_test_harness.h"
+#include "kvm_util.h"
+#include "vmx.h"
+
+static union perf_capabilities {
+ struct {
+ u64 lbr_format:6;
+ u64 pebs_trap:1;
+ u64 pebs_arch_reg:1;
+ u64 pebs_format:4;
+ u64 smm_freeze:1;
+ u64 full_width_write:1;
+ u64 pebs_baseline:1;
+ u64 perf_metrics:1;
+ u64 pebs_output_pt_available:1;
+ u64 anythread_deprecated:1;
+ };
+ u64 capabilities;
+} host_cap;
+
+/*
+ * The LBR format and most PEBS features are immutable, all other features are
+ * fungible (if supported by the host and KVM).
+ */
+static const union perf_capabilities immutable_caps = {
+ .lbr_format = -1,
+ .pebs_trap = 1,
+ .pebs_arch_reg = 1,
+ .pebs_format = -1,
+ .pebs_baseline = 1,
+};
+
+static const union perf_capabilities format_caps = {
+ .lbr_format = -1,
+ .pebs_format = -1,
+};
+
+static void guest_test_perf_capabilities_gp(uint64_t val)
+{
+ uint8_t vector = wrmsr_safe(MSR_IA32_PERF_CAPABILITIES, val);
+
+ __GUEST_ASSERT(vector == GP_VECTOR,
+ "Expected #GP for value '0x%lx', got vector '0x%x'",
+ val, vector);
+}
+
+static void guest_code(uint64_t current_val)
+{
+ int i;
+
+ guest_test_perf_capabilities_gp(current_val);
+ guest_test_perf_capabilities_gp(0);
+
+ for (i = 0; i < 64; i++)
+ guest_test_perf_capabilities_gp(current_val ^ BIT_ULL(i));
+
+ GUEST_DONE();
+}
+
+KVM_ONE_VCPU_TEST_SUITE(vmx_pmu_caps);
+
+/*
+ * Verify that guest WRMSRs to PERF_CAPABILITIES #GP regardless of the value
+ * written, that the guest always sees the userspace controlled value, and that
+ * PERF_CAPABILITIES is immutable after KVM_RUN.
+ */
+KVM_ONE_VCPU_TEST(vmx_pmu_caps, guest_wrmsr_perf_capabilities, guest_code)
+{
+ struct ucall uc;
+ int r, i;
+
+ vm_init_descriptor_tables(vcpu->vm);
+ vcpu_init_descriptor_tables(vcpu);
+
+ vcpu_set_msr(vcpu, MSR_IA32_PERF_CAPABILITIES, host_cap.capabilities);
+
+ vcpu_args_set(vcpu, 1, host_cap.capabilities);
+ vcpu_run(vcpu);
+
+ switch (get_ucall(vcpu, &uc)) {
+ case UCALL_ABORT:
+ REPORT_GUEST_ASSERT(uc);
+ break;
+ case UCALL_DONE:
+ break;
+ default:
+ TEST_FAIL("Unexpected ucall: %lu", uc.cmd);
+ }
+
+ TEST_ASSERT_EQ(vcpu_get_msr(vcpu, MSR_IA32_PERF_CAPABILITIES),
+ host_cap.capabilities);
+
+ vcpu_set_msr(vcpu, MSR_IA32_PERF_CAPABILITIES, host_cap.capabilities);
+
+ r = _vcpu_set_msr(vcpu, MSR_IA32_PERF_CAPABILITIES, 0);
+ TEST_ASSERT(!r, "Post-KVM_RUN write '0' didn't fail");
+
+ for (i = 0; i < 64; i++) {
+ r = _vcpu_set_msr(vcpu, MSR_IA32_PERF_CAPABILITIES,
+ host_cap.capabilities ^ BIT_ULL(i));
+		TEST_ASSERT(!r, "Post-KVM_RUN write '0x%llx' didn't fail",
+ host_cap.capabilities ^ BIT_ULL(i));
+ }
+}
+
+/*
+ * Verify KVM allows writing PERF_CAPABILITIES with all KVM-supported features
+ * enabled, as well as '0' (to disable all features).
+ */
+KVM_ONE_VCPU_TEST(vmx_pmu_caps, basic_perf_capabilities, guest_code)
+{
+ vcpu_set_msr(vcpu, MSR_IA32_PERF_CAPABILITIES, 0);
+ vcpu_set_msr(vcpu, MSR_IA32_PERF_CAPABILITIES, host_cap.capabilities);
+}
+
+KVM_ONE_VCPU_TEST(vmx_pmu_caps, fungible_perf_capabilities, guest_code)
+{
+ const uint64_t fungible_caps = host_cap.capabilities & ~immutable_caps.capabilities;
+ int bit;
+
+ for_each_set_bit(bit, &fungible_caps, 64) {
+ vcpu_set_msr(vcpu, MSR_IA32_PERF_CAPABILITIES, BIT_ULL(bit));
+ vcpu_set_msr(vcpu, MSR_IA32_PERF_CAPABILITIES,
+ host_cap.capabilities & ~BIT_ULL(bit));
+ }
+ vcpu_set_msr(vcpu, MSR_IA32_PERF_CAPABILITIES, host_cap.capabilities);
+}
+
+/*
+ * Verify KVM rejects attempts to set unsupported and/or immutable features in
+ * PERF_CAPABILITIES. Note, LBR format and PEBS format need to be validated
+ * separately as they are multi-bit values, e.g. toggling or setting a single
+ * bit can generate a false positive without dedicated safeguards.
+ */
+KVM_ONE_VCPU_TEST(vmx_pmu_caps, immutable_perf_capabilities, guest_code)
+{
+ const uint64_t reserved_caps = (~host_cap.capabilities |
+ immutable_caps.capabilities) &
+ ~format_caps.capabilities;
+ union perf_capabilities val = host_cap;
+ int r, bit;
+
+ for_each_set_bit(bit, &reserved_caps, 64) {
+ r = _vcpu_set_msr(vcpu, MSR_IA32_PERF_CAPABILITIES,
+ host_cap.capabilities ^ BIT_ULL(bit));
+ TEST_ASSERT(!r, "%s immutable feature 0x%llx (bit %d) didn't fail",
+ host_cap.capabilities & BIT_ULL(bit) ? "Setting" : "Clearing",
+ BIT_ULL(bit), bit);
+ }
+
+ /*
+ * KVM only supports the host's native LBR format, as well as '0' (to
+ * disable LBR support). Verify KVM rejects all other LBR formats.
+ */
+ for (val.lbr_format = 1; val.lbr_format; val.lbr_format++) {
+ if (val.lbr_format == host_cap.lbr_format)
+ continue;
+
+ r = _vcpu_set_msr(vcpu, MSR_IA32_PERF_CAPABILITIES, val.capabilities);
+ TEST_ASSERT(!r, "Bad LBR FMT = 0x%x didn't fail, host = 0x%x",
+ val.lbr_format, host_cap.lbr_format);
+ }
+
+ /* Ditto for the PEBS format. */
+ for (val.pebs_format = 1; val.pebs_format; val.pebs_format++) {
+ if (val.pebs_format == host_cap.pebs_format)
+ continue;
+
+ r = _vcpu_set_msr(vcpu, MSR_IA32_PERF_CAPABILITIES, val.capabilities);
+ TEST_ASSERT(!r, "Bad PEBS FMT = 0x%x didn't fail, host = 0x%x",
+ val.pebs_format, host_cap.pebs_format);
+ }
+}
+
+/*
+ * Test that LBR MSRs are writable when LBRs are enabled, and then verify that
+ * disabling the vPMU via CPUID also disables LBR support. Set bits 2:0 of
+ * LBR_TOS as those bits are writable across all uarch implementations (arch
+ * LBRs will need to poke a different MSR).
+ */
+KVM_ONE_VCPU_TEST(vmx_pmu_caps, lbr_perf_capabilities, guest_code)
+{
+ int r;
+
+ if (!host_cap.lbr_format)
+ return;
+
+ vcpu_set_msr(vcpu, MSR_IA32_PERF_CAPABILITIES, host_cap.capabilities);
+ vcpu_set_msr(vcpu, MSR_LBR_TOS, 7);
+
+ vcpu_clear_cpuid_entry(vcpu, X86_PROPERTY_PMU_VERSION.function);
+
+ r = _vcpu_set_msr(vcpu, MSR_LBR_TOS, 7);
+ TEST_ASSERT(!r, "Writing LBR_TOS should fail after disabling vPMU");
+}
+
+int main(int argc, char *argv[])
+{
+ TEST_REQUIRE(kvm_is_pmu_enabled());
+ TEST_REQUIRE(kvm_cpu_has(X86_FEATURE_PDCM));
+
+ TEST_REQUIRE(kvm_cpu_has_p(X86_PROPERTY_PMU_VERSION));
+ TEST_REQUIRE(kvm_cpu_property(X86_PROPERTY_PMU_VERSION) > 0);
+
+ host_cap.capabilities = kvm_get_feature_msr(MSR_IA32_PERF_CAPABILITIES);
+
+ TEST_ASSERT(host_cap.full_width_write,
+ "Full-width writes should always be supported");
+
+ return test_harness_run(argc, argv);
+}
diff --git a/tools/testing/selftests/kvm/x86_64/vmx_preemption_timer_test.c b/tools/testing/selftests/kvm/x86_64/vmx_preemption_timer_test.c
index a7737af1224f..affc32800158 100644
--- a/tools/testing/selftests/kvm/x86_64/vmx_preemption_timer_test.c
+++ b/tools/testing/selftests/kvm/x86_64/vmx_preemption_timer_test.c
@@ -22,7 +22,6 @@
#include "processor.h"
#include "vmx.h"
-#define VCPU_ID 5
#define PREEMPTION_TIMER_VALUE 100000000ull
#define PREEMPTION_TIMER_VALUE_THRESHOLD1 80000000ull
@@ -158,7 +157,7 @@ int main(int argc, char *argv[])
struct kvm_regs regs1, regs2;
struct kvm_vm *vm;
- struct kvm_run *run;
+ struct kvm_vcpu *vcpu;
struct kvm_x86_state *state;
struct ucall uc;
int stage;
@@ -167,34 +166,25 @@ int main(int argc, char *argv[])
* AMD currently does not implement any VMX features, so for now we
* just early out.
*/
- nested_vmx_check_supported();
+ TEST_REQUIRE(kvm_cpu_has(X86_FEATURE_VMX));
+
+ TEST_REQUIRE(kvm_has_cap(KVM_CAP_NESTED_STATE));
/* Create VM */
- vm = vm_create_default(VCPU_ID, 0, guest_code);
- vcpu_set_cpuid(vm, VCPU_ID, kvm_get_supported_cpuid());
- run = vcpu_state(vm, VCPU_ID);
-
- vcpu_regs_get(vm, VCPU_ID, &regs1);
-
- if (kvm_check_cap(KVM_CAP_NESTED_STATE)) {
- vcpu_alloc_vmx(vm, &vmx_pages_gva);
- vcpu_args_set(vm, VCPU_ID, 1, vmx_pages_gva);
- } else {
- pr_info("will skip vmx preemption timer checks\n");
- goto done;
- }
+ vm = vm_create_with_one_vcpu(&vcpu, guest_code);
+
+ vcpu_regs_get(vcpu, &regs1);
+
+ vcpu_alloc_vmx(vm, &vmx_pages_gva);
+ vcpu_args_set(vcpu, 1, vmx_pages_gva);
for (stage = 1;; stage++) {
- _vcpu_run(vm, VCPU_ID);
- TEST_ASSERT(run->exit_reason == KVM_EXIT_IO,
- "Stage %d: unexpected exit reason: %u (%s),\n",
- stage, run->exit_reason,
- exit_reason_str(run->exit_reason));
+ vcpu_run(vcpu);
+ TEST_ASSERT_KVM_EXIT_REASON(vcpu, KVM_EXIT_IO);
- switch (get_ucall(vm, VCPU_ID, &uc)) {
+ switch (get_ucall(vcpu, &uc)) {
case UCALL_ABORT:
- TEST_FAIL("%s at %s:%ld", (const char *)uc.args[0],
- __FILE__, uc.args[1]);
+ REPORT_GUEST_ASSERT(uc);
/* NOT REACHED */
case UCALL_SYNC:
break;
@@ -233,22 +223,19 @@ int main(int argc, char *argv[])
stage, uc.args[4], uc.args[5]);
}
- state = vcpu_save_state(vm, VCPU_ID);
+ state = vcpu_save_state(vcpu);
memset(&regs1, 0, sizeof(regs1));
- vcpu_regs_get(vm, VCPU_ID, &regs1);
+ vcpu_regs_get(vcpu, &regs1);
kvm_vm_release(vm);
/* Restore state in a new VM. */
- kvm_vm_restart(vm, O_RDWR);
- vm_vcpu_add(vm, VCPU_ID);
- vcpu_set_cpuid(vm, VCPU_ID, kvm_get_supported_cpuid());
- vcpu_load_state(vm, VCPU_ID, state);
- run = vcpu_state(vm, VCPU_ID);
- free(state);
+ vcpu = vm_recreate_with_one_vcpu(vm);
+ vcpu_load_state(vcpu, state);
+ kvm_x86_state_cleanup(state);
memset(&regs2, 0, sizeof(regs2));
- vcpu_regs_get(vm, VCPU_ID, &regs2);
+ vcpu_regs_get(vcpu, &regs2);
TEST_ASSERT(!memcmp(&regs1, &regs2, sizeof(regs2)),
"Unexpected register values after vcpu_load_state; rdi: %lx rsi: %lx",
(ulong) regs2.rdi, (ulong) regs2.rsi);
diff --git a/tools/testing/selftests/kvm/x86_64/vmx_set_nested_state_test.c b/tools/testing/selftests/kvm/x86_64/vmx_set_nested_state_test.c
index 54cdefdfb49d..67a62a5a8895 100644
--- a/tools/testing/selftests/kvm/x86_64/vmx_set_nested_state_test.c
+++ b/tools/testing/selftests/kvm/x86_64/vmx_set_nested_state_test.c
@@ -23,38 +23,37 @@
* changes this should be updated.
*/
#define VMCS12_REVISION 0x11e57ed0
-#define VCPU_ID 5
bool have_evmcs;
-void test_nested_state(struct kvm_vm *vm, struct kvm_nested_state *state)
+void test_nested_state(struct kvm_vcpu *vcpu, struct kvm_nested_state *state)
{
- vcpu_nested_state_set(vm, VCPU_ID, state, false);
+ vcpu_nested_state_set(vcpu, state);
}
-void test_nested_state_expect_errno(struct kvm_vm *vm,
+void test_nested_state_expect_errno(struct kvm_vcpu *vcpu,
struct kvm_nested_state *state,
int expected_errno)
{
int rv;
- rv = vcpu_nested_state_set(vm, VCPU_ID, state, true);
+ rv = __vcpu_nested_state_set(vcpu, state);
TEST_ASSERT(rv == -1 && errno == expected_errno,
"Expected %s (%d) from vcpu_nested_state_set but got rv: %i errno: %s (%d)",
strerror(expected_errno), expected_errno, rv, strerror(errno),
errno);
}
-void test_nested_state_expect_einval(struct kvm_vm *vm,
+void test_nested_state_expect_einval(struct kvm_vcpu *vcpu,
struct kvm_nested_state *state)
{
- test_nested_state_expect_errno(vm, state, EINVAL);
+ test_nested_state_expect_errno(vcpu, state, EINVAL);
}
-void test_nested_state_expect_efault(struct kvm_vm *vm,
+void test_nested_state_expect_efault(struct kvm_vcpu *vcpu,
struct kvm_nested_state *state)
{
- test_nested_state_expect_errno(vm, state, EFAULT);
+ test_nested_state_expect_errno(vcpu, state, EFAULT);
}
void set_revision_id_for_vmcs12(struct kvm_nested_state *state,
@@ -76,10 +75,8 @@ void set_default_state(struct kvm_nested_state *state)
void set_default_vmx_state(struct kvm_nested_state *state, int size)
{
memset(state, 0, size);
- state->flags = KVM_STATE_NESTED_GUEST_MODE |
- KVM_STATE_NESTED_RUN_PENDING;
if (have_evmcs)
- state->flags |= KVM_STATE_NESTED_EVMCS;
+ state->flags = KVM_STATE_NESTED_EVMCS;
state->format = 0;
state->size = size;
state->hdr.vmx.vmxon_pa = 0x1000;
@@ -88,7 +85,7 @@ void set_default_vmx_state(struct kvm_nested_state *state, int size)
set_revision_id_for_vmcs12(state, VMCS12_REVISION);
}
-void test_vmx_nested_state(struct kvm_vm *vm)
+void test_vmx_nested_state(struct kvm_vcpu *vcpu)
{
/* Add a page for VMCS12. */
const int state_sz = sizeof(struct kvm_nested_state) + getpagesize();
@@ -98,14 +95,14 @@ void test_vmx_nested_state(struct kvm_vm *vm)
/* The format must be set to 0. 0 for VMX, 1 for SVM. */
set_default_vmx_state(state, state_sz);
state->format = 1;
- test_nested_state_expect_einval(vm, state);
+ test_nested_state_expect_einval(vcpu, state);
/*
* We cannot virtualize anything if the guest does not have VMX
* enabled.
*/
set_default_vmx_state(state, state_sz);
- test_nested_state_expect_einval(vm, state);
+ test_nested_state_expect_einval(vcpu, state);
/*
* We cannot virtualize anything if the guest does not have VMX
@@ -114,50 +111,59 @@ void test_vmx_nested_state(struct kvm_vm *vm)
*/
set_default_vmx_state(state, state_sz);
state->hdr.vmx.vmxon_pa = -1ull;
- test_nested_state_expect_einval(vm, state);
+ test_nested_state_expect_einval(vcpu, state);
state->hdr.vmx.vmcs12_pa = -1ull;
state->flags = KVM_STATE_NESTED_EVMCS;
- test_nested_state_expect_einval(vm, state);
+ test_nested_state_expect_einval(vcpu, state);
state->flags = 0;
- test_nested_state(vm, state);
+ test_nested_state(vcpu, state);
/* Enable VMX in the guest CPUID. */
- vcpu_set_cpuid(vm, VCPU_ID, kvm_get_supported_cpuid());
+ vcpu_set_cpuid_feature(vcpu, X86_FEATURE_VMX);
/*
* Setting vmxon_pa == -1ull and vmcs_pa == -1ull exits early without
- * setting the nested state but flags other than eVMCS must be clear.
- * The eVMCS flag can be set if the enlightened VMCS capability has
- * been enabled.
+ * setting the nested state. When the eVMCS flag is not set, the
+ * expected return value is '0'.
*/
set_default_vmx_state(state, state_sz);
+ state->flags = 0;
state->hdr.vmx.vmxon_pa = -1ull;
state->hdr.vmx.vmcs12_pa = -1ull;
- test_nested_state_expect_einval(vm, state);
+ test_nested_state(vcpu, state);
- state->flags &= KVM_STATE_NESTED_EVMCS;
+ /*
+ * When eVMCS is supported, the eVMCS flag can only be set if the
+ * enlightened VMCS capability has been enabled.
+ */
if (have_evmcs) {
- test_nested_state_expect_einval(vm, state);
- vcpu_enable_evmcs(vm, VCPU_ID);
+ state->flags = KVM_STATE_NESTED_EVMCS;
+ test_nested_state_expect_einval(vcpu, state);
+ vcpu_enable_evmcs(vcpu);
+ test_nested_state(vcpu, state);
}
- test_nested_state(vm, state);
/* It is invalid to have vmxon_pa == -1ull and SMM flags non-zero. */
state->hdr.vmx.smm.flags = 1;
- test_nested_state_expect_einval(vm, state);
+ test_nested_state_expect_einval(vcpu, state);
+
+ /* Invalid flags are rejected. */
+ set_default_vmx_state(state, state_sz);
+ state->hdr.vmx.flags = ~0;
+ test_nested_state_expect_einval(vcpu, state);
/* It is invalid to have vmxon_pa == -1ull and vmcs_pa != -1ull. */
set_default_vmx_state(state, state_sz);
state->hdr.vmx.vmxon_pa = -1ull;
state->flags = 0;
- test_nested_state_expect_einval(vm, state);
+ test_nested_state_expect_einval(vcpu, state);
/* It is invalid to have vmxon_pa set to a non-page aligned address. */
set_default_vmx_state(state, state_sz);
state->hdr.vmx.vmxon_pa = 1;
- test_nested_state_expect_einval(vm, state);
+ test_nested_state_expect_einval(vcpu, state);
/*
* It is invalid to have KVM_STATE_NESTED_SMM_GUEST_MODE and
@@ -167,7 +173,7 @@ void test_vmx_nested_state(struct kvm_vm *vm)
state->flags = KVM_STATE_NESTED_GUEST_MODE |
KVM_STATE_NESTED_RUN_PENDING;
state->hdr.vmx.smm.flags = KVM_STATE_NESTED_SMM_GUEST_MODE;
- test_nested_state_expect_einval(vm, state);
+ test_nested_state_expect_einval(vcpu, state);
/*
* It is invalid to have any of the SMM flags set besides:
@@ -177,29 +183,50 @@ void test_vmx_nested_state(struct kvm_vm *vm)
set_default_vmx_state(state, state_sz);
state->hdr.vmx.smm.flags = ~(KVM_STATE_NESTED_SMM_GUEST_MODE |
KVM_STATE_NESTED_SMM_VMXON);
- test_nested_state_expect_einval(vm, state);
+ test_nested_state_expect_einval(vcpu, state);
/* Outside SMM, SMM flags must be zero. */
set_default_vmx_state(state, state_sz);
state->flags = 0;
state->hdr.vmx.smm.flags = KVM_STATE_NESTED_SMM_GUEST_MODE;
- test_nested_state_expect_einval(vm, state);
+ test_nested_state_expect_einval(vcpu, state);
+
+ /*
+ * Size must be large enough to fit kvm_nested_state and vmcs12
+ * if VMCS12 physical address is set
+ */
+ set_default_vmx_state(state, state_sz);
+ state->size = sizeof(*state);
+ state->flags = 0;
+ test_nested_state_expect_einval(vcpu, state);
+
+ set_default_vmx_state(state, state_sz);
+ state->size = sizeof(*state);
+ state->flags = 0;
+ state->hdr.vmx.vmcs12_pa = -1;
+ test_nested_state(vcpu, state);
+
+ /*
+ * KVM_SET_NESTED_STATE succeeds with invalid VMCS
+ * contents but L2 not running.
+ */
+ set_default_vmx_state(state, state_sz);
+ state->flags = 0;
+ test_nested_state(vcpu, state);
- /* Size must be large enough to fit kvm_nested_state and vmcs12. */
+ /* Invalid flags are rejected, even if no VMCS loaded. */
set_default_vmx_state(state, state_sz);
state->size = sizeof(*state);
- test_nested_state(vm, state);
+ state->flags = 0;
+ state->hdr.vmx.vmcs12_pa = -1;
+ state->hdr.vmx.flags = ~0;
+ test_nested_state_expect_einval(vcpu, state);
/* vmxon_pa cannot be the same address as vmcs_pa. */
set_default_vmx_state(state, state_sz);
state->hdr.vmx.vmxon_pa = 0;
state->hdr.vmx.vmcs12_pa = 0;
- test_nested_state_expect_einval(vm, state);
-
- /* The revision id for vmcs12 must be VMCS12_REVISION. */
- set_default_vmx_state(state, state_sz);
- set_revision_id_for_vmcs12(state, 0);
- test_nested_state_expect_einval(vm, state);
+ test_nested_state_expect_einval(vcpu, state);
/*
* Test that if we leave nesting the state reflects that when we get
@@ -209,8 +236,8 @@ void test_vmx_nested_state(struct kvm_vm *vm)
state->hdr.vmx.vmxon_pa = -1ull;
state->hdr.vmx.vmcs12_pa = -1ull;
state->flags = 0;
- test_nested_state(vm, state);
- vcpu_nested_state_get(vm, VCPU_ID, state);
+ test_nested_state(vcpu, state);
+ vcpu_nested_state_get(vcpu, state);
TEST_ASSERT(state->size >= sizeof(*state) && state->size <= state_sz,
"Size must be between %ld and %d. The size returned was %d.",
sizeof(*state), state_sz, state->size);
@@ -224,29 +251,32 @@ int main(int argc, char *argv[])
{
struct kvm_vm *vm;
struct kvm_nested_state state;
+ struct kvm_vcpu *vcpu;
have_evmcs = kvm_check_cap(KVM_CAP_HYPERV_ENLIGHTENED_VMCS);
- if (!kvm_check_cap(KVM_CAP_NESTED_STATE)) {
- print_skip("KVM_CAP_NESTED_STATE not available");
- exit(KSFT_SKIP);
- }
+ TEST_REQUIRE(kvm_has_cap(KVM_CAP_NESTED_STATE));
/*
* AMD currently does not implement set_nested_state, so for now we
* just early out.
*/
- nested_vmx_check_supported();
+ TEST_REQUIRE(kvm_cpu_has(X86_FEATURE_VMX));
- vm = vm_create_default(VCPU_ID, 0, 0);
+ vm = vm_create_with_one_vcpu(&vcpu, NULL);
+
+ /*
+ * First run tests with VMX disabled to check error handling.
+ */
+ vcpu_clear_cpuid_feature(vcpu, X86_FEATURE_VMX);
/* Passing a NULL kvm_nested_state causes a EFAULT. */
- test_nested_state_expect_efault(vm, NULL);
+ test_nested_state_expect_efault(vcpu, NULL);
/* 'size' cannot be smaller than sizeof(kvm_nested_state). */
set_default_state(&state);
state.size = 0;
- test_nested_state_expect_einval(vm, &state);
+ test_nested_state_expect_einval(vcpu, &state);
/*
* Setting the flags 0xf fails the flags check. The only flags that
@@ -257,7 +287,7 @@ int main(int argc, char *argv[])
*/
set_default_state(&state);
state.flags = 0xf;
- test_nested_state_expect_einval(vm, &state);
+ test_nested_state_expect_einval(vcpu, &state);
/*
* If KVM_STATE_NESTED_RUN_PENDING is set then
@@ -265,9 +295,9 @@ int main(int argc, char *argv[])
*/
set_default_state(&state);
state.flags = KVM_STATE_NESTED_RUN_PENDING;
- test_nested_state_expect_einval(vm, &state);
+ test_nested_state_expect_einval(vcpu, &state);
- test_vmx_nested_state(vm);
+ test_vmx_nested_state(vcpu);
kvm_vm_free(vm);
return 0;
diff --git a/tools/testing/selftests/kvm/x86_64/vmx_tsc_adjust_test.c b/tools/testing/selftests/kvm/x86_64/vmx_tsc_adjust_test.c
index fbe8417cbc2c..2ceb5c78c442 100644
--- a/tools/testing/selftests/kvm/x86_64/vmx_tsc_adjust_test.c
+++ b/tools/testing/selftests/kvm/x86_64/vmx_tsc_adjust_test.c
@@ -32,9 +32,6 @@
#define MSR_IA32_TSC_ADJUST 0x3b
#endif
-#define PAGE_SIZE 4096
-#define VCPU_ID 5
-
#define TSC_ADJUST_VALUE (1ll << 32)
#define TSC_OFFSET_VALUE -(1ll << 48)
@@ -52,11 +49,6 @@ enum {
NUM_VMX_PAGES,
};
-struct kvm_single_msr {
- struct kvm_msrs header;
- struct kvm_msr_entry entry;
-} __attribute__((packed));
-
/* The virtual machine object. */
static struct kvm_vm *vm;
@@ -128,29 +120,25 @@ static void report(int64_t val)
int main(int argc, char *argv[])
{
vm_vaddr_t vmx_pages_gva;
+ struct kvm_vcpu *vcpu;
- nested_vmx_check_supported();
+ TEST_REQUIRE(kvm_cpu_has(X86_FEATURE_VMX));
- vm = vm_create_default(VCPU_ID, 0, (void *) l1_guest_code);
- vcpu_set_cpuid(vm, VCPU_ID, kvm_get_supported_cpuid());
+ vm = vm_create_with_one_vcpu(&vcpu, (void *) l1_guest_code);
/* Allocate VMX pages and shared descriptors (vmx_pages). */
vcpu_alloc_vmx(vm, &vmx_pages_gva);
- vcpu_args_set(vm, VCPU_ID, 1, vmx_pages_gva);
+ vcpu_args_set(vcpu, 1, vmx_pages_gva);
for (;;) {
- volatile struct kvm_run *run = vcpu_state(vm, VCPU_ID);
struct ucall uc;
- vcpu_run(vm, VCPU_ID);
- TEST_ASSERT(run->exit_reason == KVM_EXIT_IO,
- "Got exit_reason other than KVM_EXIT_IO: %u (%s)\n",
- run->exit_reason,
- exit_reason_str(run->exit_reason));
+ vcpu_run(vcpu);
+ TEST_ASSERT_KVM_EXIT_REASON(vcpu, KVM_EXIT_IO);
- switch (get_ucall(vm, VCPU_ID, &uc)) {
+ switch (get_ucall(vcpu, &uc)) {
case UCALL_ABORT:
- TEST_FAIL("%s", (const char *)uc.args[0]);
+ REPORT_GUEST_ASSERT(uc);
/* NOT REACHED */
case UCALL_SYNC:
report(uc.args[1]);
@@ -162,7 +150,7 @@ int main(int argc, char *argv[])
}
}
- kvm_vm_free(vm);
done:
+ kvm_vm_free(vm);
return 0;
}
diff --git a/tools/testing/selftests/kvm/x86_64/xapic_ipi_test.c b/tools/testing/selftests/kvm/x86_64/xapic_ipi_test.c
new file mode 100644
index 000000000000..725c206ba0b9
--- /dev/null
+++ b/tools/testing/selftests/kvm/x86_64/xapic_ipi_test.c
@@ -0,0 +1,491 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * xapic_ipi_test
+ *
+ * Copyright (C) 2020, Google LLC.
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2.
+ *
+ * Test that when the APIC is in xAPIC mode, a vCPU can send an IPI to wake
+ * another vCPU that is halted when KVM's backing page for the APIC access
+ * address has been moved by mm.
+ *
+ * The test starts two vCPUs: one that sends IPIs and one that continually
+ * executes HLT. The sender checks that the halter has woken from the HLT and
+ * has reentered HLT before sending the next IPI. While the vCPUs are running,
+ * the host continually calls migrate_pages to move all of the process' pages
+ * amongst the available numa nodes on the machine.
+ *
+ * Migration is a command line option; on non-numa machines the test will
+ * exit with an error.  It is still useful on non-numa machines for testing IPIs.
+ */
+
+#define _GNU_SOURCE /* for program_invocation_short_name */
+#include <getopt.h>
+#include <pthread.h>
+#include <inttypes.h>
+#include <string.h>
+#include <time.h>
+
+#include "kvm_util.h"
+#include "numaif.h"
+#include "processor.h"
+#include "test_util.h"
+#include "vmx.h"
+
+/* Default running time for the test */
+#define DEFAULT_RUN_SECS 3
+
+/* Default delay between migrate_pages calls (microseconds) */
+#define DEFAULT_DELAY_USECS 500000
+
+/*
+ * Vector for IPI from sender vCPU to halting vCPU.
+ * Value is arbitrary and was chosen for the alternating bit pattern. Any
+ * value should work.
+ */
+#define IPI_VECTOR 0xa5
+
+/*
+ * Incremented in the IPI handler. Provides evidence to the sender that the IPI
+ * arrived at the destination
+ */
+static volatile uint64_t ipis_rcvd;
+
+/* Data struct shared between host main thread and vCPUs */
+struct test_data_page {
+ uint32_t halter_apic_id;
+ volatile uint64_t hlt_count;
+ volatile uint64_t wake_count;
+ uint64_t ipis_sent;
+ uint64_t migrations_attempted;
+ uint64_t migrations_completed;
+ uint32_t icr;
+ uint32_t icr2;
+ uint32_t halter_tpr;
+ uint32_t halter_ppr;
+
+ /*
+ * Record local version register as a cross-check that APIC access
+ * worked. Value should match what KVM reports (APIC_VERSION in
+ * arch/x86/kvm/lapic.c). If test is failing, check that values match
+ * to determine whether APIC access exits are working.
+ */
+ uint32_t halter_lvr;
+};
+
+struct thread_params {
+ struct test_data_page *data;
+ struct kvm_vcpu *vcpu;
+ uint64_t *pipis_rcvd; /* host address of ipis_rcvd global */
+};
+
+void verify_apic_base_addr(void)
+{
+ uint64_t msr = rdmsr(MSR_IA32_APICBASE);
+ uint64_t base = GET_APIC_BASE(msr);
+
+ GUEST_ASSERT(base == APIC_DEFAULT_GPA);
+}
+
+static void halter_guest_code(struct test_data_page *data)
+{
+ verify_apic_base_addr();
+ xapic_enable();
+
+ data->halter_apic_id = GET_APIC_ID_FIELD(xapic_read_reg(APIC_ID));
+ data->halter_lvr = xapic_read_reg(APIC_LVR);
+
+ /*
+ * Loop forever HLTing and recording halts & wakes. Disable interrupts
+ * each time around to minimize window between signaling the pending
+ * halt to the sender vCPU and executing the halt. No need to disable on
+ * first run as this vCPU executes first and the host waits for it to
+ * signal going into first halt before starting the sender vCPU. Record
+ * TPR and PPR for diagnostic purposes in case the test fails.
+ */
+ for (;;) {
+ data->halter_tpr = xapic_read_reg(APIC_TASKPRI);
+ data->halter_ppr = xapic_read_reg(APIC_PROCPRI);
+ data->hlt_count++;
+ asm volatile("sti; hlt; cli");
+ data->wake_count++;
+ }
+}
+
+/*
+ * Runs on halter vCPU when IPI arrives. Write an arbitrary non-zero value to
+ * enable diagnosing errant writes to the APIC access address backing page in
+ * case of test failure.
+ */
+static void guest_ipi_handler(struct ex_regs *regs)
+{
+ ipis_rcvd++;
+ xapic_write_reg(APIC_EOI, 77);
+}
+
+static void sender_guest_code(struct test_data_page *data)
+{
+ uint64_t last_wake_count;
+ uint64_t last_hlt_count;
+ uint64_t last_ipis_rcvd_count;
+ uint32_t icr_val;
+ uint32_t icr2_val;
+ uint64_t tsc_start;
+
+ verify_apic_base_addr();
+ xapic_enable();
+
+ /*
+ * Init interrupt command register for sending IPIs
+ *
+ * Delivery mode=fixed, per SDM:
+ * "Delivers the interrupt specified in the vector field to the target
+ * processor."
+ *
+ * Destination mode=physical i.e. specify target by its local APIC
+ * ID. This vCPU assumes that the halter vCPU has already started and
+ * set data->halter_apic_id.
+ */
+ icr_val = (APIC_DEST_PHYSICAL | APIC_DM_FIXED | IPI_VECTOR);
+ icr2_val = SET_APIC_DEST_FIELD(data->halter_apic_id);
+ data->icr = icr_val;
+ data->icr2 = icr2_val;
+
+ last_wake_count = data->wake_count;
+ last_hlt_count = data->hlt_count;
+ last_ipis_rcvd_count = ipis_rcvd;
+ for (;;) {
+ /*
+ * Send IPI to halter vCPU.
+ * First IPI can be sent unconditionally because halter vCPU
+ * starts earlier.
+ */
+ xapic_write_reg(APIC_ICR2, icr2_val);
+ xapic_write_reg(APIC_ICR, icr_val);
+ data->ipis_sent++;
+
+ /*
+ * Wait up to ~1 sec for halter to indicate that it has:
+ * 1. Received the IPI
+ * 2. Woken up from the halt
+ * 3. Gone back into halt
+		 * Current CPUs typically run at 2.x GHz which is ~2
+ * billion ticks per second.
+ */
+ tsc_start = rdtsc();
+ while (rdtsc() - tsc_start < 2000000000) {
+ if ((ipis_rcvd != last_ipis_rcvd_count) &&
+ (data->wake_count != last_wake_count) &&
+ (data->hlt_count != last_hlt_count))
+ break;
+ }
+
+ GUEST_ASSERT((ipis_rcvd != last_ipis_rcvd_count) &&
+ (data->wake_count != last_wake_count) &&
+ (data->hlt_count != last_hlt_count));
+
+ last_wake_count = data->wake_count;
+ last_hlt_count = data->hlt_count;
+ last_ipis_rcvd_count = ipis_rcvd;
+ }
+}
+
+static void *vcpu_thread(void *arg)
+{
+ struct thread_params *params = (struct thread_params *)arg;
+ struct kvm_vcpu *vcpu = params->vcpu;
+ struct ucall uc;
+ int old;
+ int r;
+
+ r = pthread_setcanceltype(PTHREAD_CANCEL_ASYNCHRONOUS, &old);
+ TEST_ASSERT(r == 0,
+ "pthread_setcanceltype failed on vcpu_id=%u with errno=%d",
+ vcpu->id, r);
+
+ fprintf(stderr, "vCPU thread running vCPU %u\n", vcpu->id);
+ vcpu_run(vcpu);
+
+ TEST_ASSERT_KVM_EXIT_REASON(vcpu, KVM_EXIT_IO);
+
+ if (get_ucall(vcpu, &uc) == UCALL_ABORT) {
+ TEST_ASSERT(false,
+ "vCPU %u exited with error: %s.\n"
+ "Sending vCPU sent %lu IPIs to halting vCPU\n"
+ "Halting vCPU halted %lu times, woke %lu times, received %lu IPIs.\n"
+ "Halter TPR=%#x PPR=%#x LVR=%#x\n"
+ "Migrations attempted: %lu\n"
+ "Migrations completed: %lu",
+ vcpu->id, (const char *)uc.args[0],
+ params->data->ipis_sent, params->data->hlt_count,
+ params->data->wake_count,
+ *params->pipis_rcvd, params->data->halter_tpr,
+ params->data->halter_ppr, params->data->halter_lvr,
+ params->data->migrations_attempted,
+ params->data->migrations_completed);
+ }
+
+ return NULL;
+}
+
+static void cancel_join_vcpu_thread(pthread_t thread, struct kvm_vcpu *vcpu)
+{
+ void *retval;
+ int r;
+
+ r = pthread_cancel(thread);
+ TEST_ASSERT(r == 0,
+ "pthread_cancel on vcpu_id=%d failed with errno=%d",
+ vcpu->id, r);
+
+ r = pthread_join(thread, &retval);
+ TEST_ASSERT(r == 0,
+ "pthread_join on vcpu_id=%d failed with errno=%d",
+ vcpu->id, r);
+ TEST_ASSERT(retval == PTHREAD_CANCELED,
+ "expected retval=%p, got %p", PTHREAD_CANCELED,
+ retval);
+}
+
+void do_migrations(struct test_data_page *data, int run_secs, int delay_usecs,
+ uint64_t *pipis_rcvd)
+{
+ long pages_not_moved;
+ unsigned long nodemask = 0;
+ unsigned long nodemasks[sizeof(nodemask) * 8];
+ int nodes = 0;
+ time_t start_time, last_update, now;
+ time_t interval_secs = 1;
+ int i, r;
+ int from, to;
+ unsigned long bit;
+ uint64_t hlt_count;
+ uint64_t wake_count;
+ uint64_t ipis_sent;
+
+ fprintf(stderr, "Calling migrate_pages every %d microseconds\n",
+ delay_usecs);
+
+ /* Get set of first 64 numa nodes available */
+ r = get_mempolicy(NULL, &nodemask, sizeof(nodemask) * 8,
+ 0, MPOL_F_MEMS_ALLOWED);
+ TEST_ASSERT(r == 0, "get_mempolicy failed errno=%d", errno);
+
+ fprintf(stderr, "Numa nodes found amongst first %lu possible nodes "
+ "(each 1-bit indicates node is present): %#lx\n",
+ sizeof(nodemask) * 8, nodemask);
+
+ /* Init array of masks containing a single-bit in each, one for each
+ * available node. migrate_pages called below requires specifying nodes
+ * as bit masks.
+ */
+ for (i = 0, bit = 1; i < sizeof(nodemask) * 8; i++, bit <<= 1) {
+ if (nodemask & bit) {
+ nodemasks[nodes] = nodemask & bit;
+ nodes++;
+ }
+ }
+
+ TEST_ASSERT(nodes > 1,
+ "Did not find at least 2 numa nodes. Can't do migration");
+
+ fprintf(stderr, "Migrating amongst %d nodes found\n", nodes);
+
+ from = 0;
+ to = 1;
+ start_time = time(NULL);
+ last_update = start_time;
+
+ ipis_sent = data->ipis_sent;
+ hlt_count = data->hlt_count;
+ wake_count = data->wake_count;
+
+ while ((int)(time(NULL) - start_time) < run_secs) {
+ data->migrations_attempted++;
+
+ /*
+ * migrate_pages with PID=0 will migrate all pages of this
+ * process between the nodes specified as bitmasks. The page
+ * backing the APIC access address belongs to this process
+ * because it is allocated by KVM in the context of the
+		 * KVM_CREATE_VCPU ioctl.  If that assumption ever changes, this
+		 * test may break or give a false positive signal.
+ */
+ pages_not_moved = migrate_pages(0, sizeof(nodemasks[from]),
+ &nodemasks[from],
+ &nodemasks[to]);
+ if (pages_not_moved < 0)
+ fprintf(stderr,
+ "migrate_pages failed, errno=%d\n", errno);
+ else if (pages_not_moved > 0)
+ fprintf(stderr,
+ "migrate_pages could not move %ld pages\n",
+ pages_not_moved);
+ else
+ data->migrations_completed++;
+
+ from = to;
+ to++;
+ if (to == nodes)
+ to = 0;
+
+ now = time(NULL);
+ if (((now - start_time) % interval_secs == 0) &&
+ (now != last_update)) {
+ last_update = now;
+ fprintf(stderr,
+ "%lu seconds: Migrations attempted=%lu completed=%lu, "
+ "IPIs sent=%lu received=%lu, HLTs=%lu wakes=%lu\n",
+ now - start_time, data->migrations_attempted,
+ data->migrations_completed,
+ data->ipis_sent, *pipis_rcvd,
+ data->hlt_count, data->wake_count);
+
+ TEST_ASSERT(ipis_sent != data->ipis_sent &&
+ hlt_count != data->hlt_count &&
+ wake_count != data->wake_count,
+ "IPI, HLT and wake count have not increased "
+ "in the last %lu seconds. "
+ "HLTer is likely hung.", interval_secs);
+
+ ipis_sent = data->ipis_sent;
+ hlt_count = data->hlt_count;
+ wake_count = data->wake_count;
+ }
+ usleep(delay_usecs);
+ }
+}
+
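The loop above leans on a few migrate_pages() details: pid 0 means the calling process, nodes are passed as single-bit masks, and the return value counts pages that could not be moved. As a standalone refresher, here is a minimal sketch (the helper name and node numbers are illustrative assumptions, not part of the patch); link with -lnuma:

#include <numaif.h>	/* migrate_pages() */
#include <stdio.h>

static void migrate_self_node0_to_node1(void)
{
	unsigned long from = 1UL << 0;	/* single-bit mask: node 0 */
	unsigned long to   = 1UL << 1;	/* single-bit mask: node 1 */
	long not_moved;

	/* pid 0 == the calling process; maxnode is the width of the masks in bits. */
	not_moved = migrate_pages(0, sizeof(from) * 8, &from, &to);
	if (not_moved < 0)
		perror("migrate_pages");
	else if (not_moved > 0)
		fprintf(stderr, "%ld pages could not be moved\n", not_moved);
}
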
+void get_cmdline_args(int argc, char *argv[], int *run_secs,
+ bool *migrate, int *delay_usecs)
+{
+ for (;;) {
+ int opt = getopt(argc, argv, "s:d:m");
+
+ if (opt == -1)
+ break;
+ switch (opt) {
+ case 's':
+ *run_secs = parse_size(optarg);
+ break;
+ case 'm':
+ *migrate = true;
+ break;
+ case 'd':
+ *delay_usecs = parse_size(optarg);
+ break;
+ default:
+ TEST_ASSERT(false,
+ "Usage: -s <runtime seconds>. Default is %d seconds.\n"
+ "-m adds calls to migrate_pages while vCPUs are running."
+ " Default is no migrations.\n"
+ "-d <delay microseconds> - delay between migrate_pages() calls."
+ " Default is %d microseconds.",
+ DEFAULT_RUN_SECS, DEFAULT_DELAY_USECS);
+ }
+ }
+}
+
+int main(int argc, char *argv[])
+{
+ int r;
+ int wait_secs;
+ const int max_halter_wait = 10;
+ int run_secs = 0;
+ int delay_usecs = 0;
+ struct test_data_page *data;
+ vm_vaddr_t test_data_page_vaddr;
+ bool migrate = false;
+ pthread_t threads[2];
+ struct thread_params params[2];
+ struct kvm_vm *vm;
+ uint64_t *pipis_rcvd;
+
+ get_cmdline_args(argc, argv, &run_secs, &migrate, &delay_usecs);
+ if (run_secs <= 0)
+ run_secs = DEFAULT_RUN_SECS;
+ if (delay_usecs <= 0)
+ delay_usecs = DEFAULT_DELAY_USECS;
+
+ vm = vm_create_with_one_vcpu(&params[0].vcpu, halter_guest_code);
+
+ vm_init_descriptor_tables(vm);
+ vcpu_init_descriptor_tables(params[0].vcpu);
+ vm_install_exception_handler(vm, IPI_VECTOR, guest_ipi_handler);
+
+ virt_pg_map(vm, APIC_DEFAULT_GPA, APIC_DEFAULT_GPA);
+
+ params[1].vcpu = vm_vcpu_add(vm, 1, sender_guest_code);
+
+ test_data_page_vaddr = vm_vaddr_alloc_page(vm);
+ data = addr_gva2hva(vm, test_data_page_vaddr);
+ memset(data, 0, sizeof(*data));
+ params[0].data = data;
+ params[1].data = data;
+
+ vcpu_args_set(params[0].vcpu, 1, test_data_page_vaddr);
+ vcpu_args_set(params[1].vcpu, 1, test_data_page_vaddr);
+
+ pipis_rcvd = (uint64_t *)addr_gva2hva(vm, (uint64_t)&ipis_rcvd);
+ params[0].pipis_rcvd = pipis_rcvd;
+ params[1].pipis_rcvd = pipis_rcvd;
+
+ /* Start halter vCPU thread and wait for it to execute first HLT. */
+ r = pthread_create(&threads[0], NULL, vcpu_thread, &params[0]);
+ TEST_ASSERT(r == 0,
+ "pthread_create halter failed errno=%d", errno);
+ fprintf(stderr, "Halter vCPU thread started\n");
+
+ wait_secs = 0;
+ while ((wait_secs < max_halter_wait) && !data->hlt_count) {
+ sleep(1);
+ wait_secs++;
+ }
+
+ TEST_ASSERT(data->hlt_count,
+ "Halter vCPU did not execute first HLT within %d seconds",
+ max_halter_wait);
+
+ fprintf(stderr,
+ "Halter vCPU thread reported its APIC ID: %u after %d seconds.\n",
+ data->halter_apic_id, wait_secs);
+
+ r = pthread_create(&threads[1], NULL, vcpu_thread, &params[1]);
+ TEST_ASSERT(r == 0, "pthread_create sender failed errno=%d", errno);
+
+ fprintf(stderr,
+ "IPI sender vCPU thread started. Letting vCPUs run for %d seconds.\n",
+ run_secs);
+
+ if (!migrate)
+ sleep(run_secs);
+ else
+ do_migrations(data, run_secs, delay_usecs, pipis_rcvd);
+
+ /*
+ * Cancel threads and wait for them to stop.
+ */
+ cancel_join_vcpu_thread(threads[0], params[0].vcpu);
+ cancel_join_vcpu_thread(threads[1], params[1].vcpu);
+
+ fprintf(stderr,
+ "Test successful after running for %d seconds.\n"
+ "Sending vCPU sent %lu IPIs to halting vCPU\n"
+ "Halting vCPU halted %lu times, woke %lu times, received %lu IPIs.\n"
+ "Halter APIC ID=%#x\n"
+ "Sender ICR value=%#x ICR2 value=%#x\n"
+ "Halter TPR=%#x PPR=%#x LVR=%#x\n"
+ "Migrations attempted: %lu\n"
+ "Migrations completed: %lu\n",
+ run_secs, data->ipis_sent,
+ data->hlt_count, data->wake_count, *pipis_rcvd,
+ data->halter_apic_id,
+ data->icr, data->icr2,
+ data->halter_tpr, data->halter_ppr, data->halter_lvr,
+ data->migrations_attempted, data->migrations_completed);
+
+ kvm_vm_free(vm);
+
+ return 0;
+}
diff --git a/tools/testing/selftests/kvm/x86_64/xapic_state_test.c b/tools/testing/selftests/kvm/x86_64/xapic_state_test.c
new file mode 100644
index 000000000000..ab75b873a4ad
--- /dev/null
+++ b/tools/testing/selftests/kvm/x86_64/xapic_state_test.c
@@ -0,0 +1,215 @@
+// SPDX-License-Identifier: GPL-2.0-only
+#define _GNU_SOURCE /* for program_invocation_short_name */
+#include <fcntl.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <sys/ioctl.h>
+
+#include "apic.h"
+#include "kvm_util.h"
+#include "processor.h"
+#include "test_util.h"
+
+struct xapic_vcpu {
+ struct kvm_vcpu *vcpu;
+ bool is_x2apic;
+};
+
+static void xapic_guest_code(void)
+{
+ asm volatile("cli");
+
+ xapic_enable();
+
+ while (1) {
+ uint64_t val = (u64)xapic_read_reg(APIC_IRR) |
+ (u64)xapic_read_reg(APIC_IRR + 0x10) << 32;
+
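+		/* Writing ICR (low dword) sends the IPI, so set ICR2 first. */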
+ xapic_write_reg(APIC_ICR2, val >> 32);
+ xapic_write_reg(APIC_ICR, val);
+ GUEST_SYNC(val);
+ }
+}
+
+static void x2apic_guest_code(void)
+{
+ asm volatile("cli");
+
+ x2apic_enable();
+
+ do {
+ uint64_t val = x2apic_read_reg(APIC_IRR) |
+ x2apic_read_reg(APIC_IRR + 0x10) << 32;
+
+ x2apic_write_reg(APIC_ICR, val);
+ GUEST_SYNC(val);
+ } while (1);
+}
+
+static void ____test_icr(struct xapic_vcpu *x, uint64_t val)
+{
+ struct kvm_vcpu *vcpu = x->vcpu;
+ struct kvm_lapic_state xapic;
+ struct ucall uc;
+ uint64_t icr;
+
+ /*
+	 * Tell the guest what ICR value to write.  Use the IRR to pass info;
+ * all bits are valid and should not be modified by KVM (ignoring the
+ * fact that vectors 0-15 are technically illegal).
+ */
+ vcpu_ioctl(vcpu, KVM_GET_LAPIC, &xapic);
+ *((u32 *)&xapic.regs[APIC_IRR]) = val;
+ *((u32 *)&xapic.regs[APIC_IRR + 0x10]) = val >> 32;
+ vcpu_ioctl(vcpu, KVM_SET_LAPIC, &xapic);
+
+ vcpu_run(vcpu);
+ TEST_ASSERT_EQ(get_ucall(vcpu, &uc), UCALL_SYNC);
+ TEST_ASSERT_EQ(uc.args[1], val);
+
+ vcpu_ioctl(vcpu, KVM_GET_LAPIC, &xapic);
+ icr = (u64)(*((u32 *)&xapic.regs[APIC_ICR])) |
+ (u64)(*((u32 *)&xapic.regs[APIC_ICR2])) << 32;
+ if (!x->is_x2apic) {
+ val &= (-1u | (0xffull << (32 + 24)));
+ TEST_ASSERT_EQ(icr, val & ~APIC_ICR_BUSY);
+ } else {
+ TEST_ASSERT_EQ(icr & ~APIC_ICR_BUSY, val & ~APIC_ICR_BUSY);
+ }
+}
+
+#define X2APIC_RSVED_BITS_MASK (GENMASK_ULL(31,20) | \
+ GENMASK_ULL(17,16) | \
+ GENMASK_ULL(13,13))
+
+static void __test_icr(struct xapic_vcpu *x, uint64_t val)
+{
+ if (x->is_x2apic) {
+		/*
+		 * Hardware requires reserved bits 31:20, 17:16 and 13 to be
+		 * zero when the vICR register is written, otherwise it raises
+		 * a #GP exception.  Mask those bits out of the value before
+		 * writing it.
+		 */
+ val &= ~X2APIC_RSVED_BITS_MASK;
+ }
+ ____test_icr(x, val | APIC_ICR_BUSY);
+ ____test_icr(x, val & ~(u64)APIC_ICR_BUSY);
+}
+
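For reference, the GENMASK_ULL() terms above are 0xfff00000, 0x00030000 and 0x00002000, so X2APIC_RSVED_BITS_MASK works out to 0xfff32000. A compile-time sanity check along these lines (a sketch, not part of the patch) could sit next to the #define:

/* Sketch: the reserved-bit mask should cover exactly bits 31:20, 17:16 and 13. */
_Static_assert(X2APIC_RSVED_BITS_MASK == 0xfff32000ULL,
	       "unexpected x2APIC reserved-bit mask");
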
+static void test_icr(struct xapic_vcpu *x)
+{
+ struct kvm_vcpu *vcpu = x->vcpu;
+ uint64_t icr, i, j;
+
+ icr = APIC_DEST_SELF | APIC_INT_ASSERT | APIC_DM_FIXED;
+ for (i = 0; i <= 0xff; i++)
+ __test_icr(x, icr | i);
+
+ icr = APIC_INT_ASSERT | APIC_DM_FIXED;
+ for (i = 0; i <= 0xff; i++)
+ __test_icr(x, icr | i);
+
+ /*
+ * Send all flavors of IPIs to non-existent vCPUs. TODO: use number of
+ * vCPUs, not vcpu.id + 1. Arbitrarily use vector 0xff.
+ */
+ icr = APIC_INT_ASSERT | 0xff;
+ for (i = 0; i < 0xff; i++) {
+ if (i == vcpu->id)
+ continue;
+ for (j = 0; j < 8; j++)
+ __test_icr(x, i << (32 + 24) | icr | (j << 8));
+ }
+
+ /* And again with a shorthand destination for all types of IPIs. */
+ icr = APIC_DEST_ALLBUT | APIC_INT_ASSERT;
+ for (i = 0; i < 8; i++)
+ __test_icr(x, icr | (i << 8));
+
+	/* And a few garbage values; just make sure they're IRQs (blocked). */
+ __test_icr(x, 0xa5a5a5a5a5a5a5a5 & ~APIC_DM_FIXED_MASK);
+ __test_icr(x, 0x5a5a5a5a5a5a5a5a & ~APIC_DM_FIXED_MASK);
+ __test_icr(x, -1ull & ~APIC_DM_FIXED_MASK);
+}
+
+static void __test_apic_id(struct kvm_vcpu *vcpu, uint64_t apic_base)
+{
+ uint32_t apic_id, expected;
+ struct kvm_lapic_state xapic;
+
+ vcpu_set_msr(vcpu, MSR_IA32_APICBASE, apic_base);
+
+ vcpu_ioctl(vcpu, KVM_GET_LAPIC, &xapic);
+
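+	/*
+	 * xAPIC keeps the ID in bits 31:24 of the APIC_ID register; x2APIC
+	 * (with 32-bit IDs enabled) uses the full 32 bits.
+	 */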
+ expected = apic_base & X2APIC_ENABLE ? vcpu->id : vcpu->id << 24;
+ apic_id = *((u32 *)&xapic.regs[APIC_ID]);
+
+ TEST_ASSERT(apic_id == expected,
+ "APIC_ID not set back to %s format; wanted = %x, got = %x",
+ (apic_base & X2APIC_ENABLE) ? "x2APIC" : "xAPIC",
+ expected, apic_id);
+}
+
+/*
+ * Verify that KVM switches the APIC_ID between xAPIC and x2APIC when userspace
+ * stuffs MSR_IA32_APICBASE. Setting the APIC_ID when x2APIC is enabled and
+ * when the APIC transitions from DISABLED to ENABLED is architectural behavior
+ * (on Intel), whereas the x2APIC => xAPIC transition behavior is KVM ABI, since
+ * attempting to transition from x2APIC to xAPIC without disabling the APIC is
+ * architecturally disallowed.
+ */
+static void test_apic_id(void)
+{
+ const uint32_t NR_VCPUS = 3;
+ struct kvm_vcpu *vcpus[NR_VCPUS];
+ uint64_t apic_base;
+ struct kvm_vm *vm;
+ int i;
+
+ vm = vm_create_with_vcpus(NR_VCPUS, NULL, vcpus);
+ vm_enable_cap(vm, KVM_CAP_X2APIC_API, KVM_X2APIC_API_USE_32BIT_IDS);
+
+ for (i = 0; i < NR_VCPUS; i++) {
+ apic_base = vcpu_get_msr(vcpus[i], MSR_IA32_APICBASE);
+
+ TEST_ASSERT(apic_base & MSR_IA32_APICBASE_ENABLE,
+ "APIC not in ENABLED state at vCPU RESET");
+ TEST_ASSERT(!(apic_base & X2APIC_ENABLE),
+ "APIC not in xAPIC mode at vCPU RESET");
+
+ __test_apic_id(vcpus[i], apic_base);
+ __test_apic_id(vcpus[i], apic_base | X2APIC_ENABLE);
+ __test_apic_id(vcpus[i], apic_base);
+ }
+
+ kvm_vm_free(vm);
+}
+
+int main(int argc, char *argv[])
+{
+ struct xapic_vcpu x = {
+ .vcpu = NULL,
+ .is_x2apic = true,
+ };
+ struct kvm_vm *vm;
+
+ vm = vm_create_with_one_vcpu(&x.vcpu, x2apic_guest_code);
+ test_icr(&x);
+ kvm_vm_free(vm);
+
+ /*
+ * Use a second VM for the xAPIC test so that x2APIC can be hidden from
+ * the guest in order to test AVIC. KVM disallows changing CPUID after
+ * KVM_RUN and AVIC is disabled if _any_ vCPU is allowed to use x2APIC.
+ */
+ vm = vm_create_with_one_vcpu(&x.vcpu, xapic_guest_code);
+ x.is_x2apic = false;
+
+ vcpu_clear_cpuid_feature(x.vcpu, X86_FEATURE_X2APIC);
+
+ virt_pg_map(vm, APIC_DEFAULT_GPA, APIC_DEFAULT_GPA);
+ test_icr(&x);
+ kvm_vm_free(vm);
+
+ test_apic_id();
+}
diff --git a/tools/testing/selftests/kvm/x86_64/xcr0_cpuid_test.c b/tools/testing/selftests/kvm/x86_64/xcr0_cpuid_test.c
new file mode 100644
index 000000000000..25a0b0db5c3c
--- /dev/null
+++ b/tools/testing/selftests/kvm/x86_64/xcr0_cpuid_test.c
@@ -0,0 +1,137 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * XCR0 cpuid test
+ *
+ * Copyright (C) 2022, Google LLC.
+ */
+#include <fcntl.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <sys/ioctl.h>
+
+#include "test_util.h"
+
+#include "kvm_util.h"
+#include "processor.h"
+
+/*
+ * Assert that architectural dependency rules are satisfied, e.g. that AVX is
+ * supported if and only if SSE is supported.
+ */
+#define ASSERT_XFEATURE_DEPENDENCIES(supported_xcr0, xfeatures, dependencies) \
+do { \
+ uint64_t __supported = (supported_xcr0) & ((xfeatures) | (dependencies)); \
+ \
+ __GUEST_ASSERT((__supported & (xfeatures)) != (xfeatures) || \
+ __supported == ((xfeatures) | (dependencies)), \
+ "supported = 0x%lx, xfeatures = 0x%llx, dependencies = 0x%llx", \
+ __supported, (xfeatures), (dependencies)); \
+} while (0)
+
+/*
+ * Assert that KVM reports a sane, usable as-is XCR0. Architecturally, a CPU
+ * isn't strictly required to _support_ all XFeatures related to a feature, but
+ * at the same time XSETBV will #GP if bundled XFeatures aren't enabled and
+ * disabled coherently.  E.g. a CPU can technically enumerate support for
+ * XTILE_CFG but not XTILE_DATA, but attempting to enable XTILE_CFG without
+ * XTILE_DATA will #GP.
+ */
+#define ASSERT_ALL_OR_NONE_XFEATURE(supported_xcr0, xfeatures) \
+do { \
+ uint64_t __supported = (supported_xcr0) & (xfeatures); \
+ \
+ __GUEST_ASSERT(!__supported || __supported == (xfeatures), \
+ "supported = 0x%lx, xfeatures = 0x%llx", \
+ __supported, (xfeatures)); \
+} while (0)
+
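To make the first macro concrete, here is a worked example (not from the patch) for the AVX check in guest_code() below, where xfeatures = XFEATURE_MASK_YMM and dependencies = XFEATURE_MASK_SSE:

/*
 * supported_xcr0 contains ...   __supported    assertion
 * neither SSE nor YMM           0              passes (YMM not claimed)
 * SSE only                      SSE            passes (YMM not claimed)
 * SSE and YMM                   SSE | YMM      passes (dependency satisfied)
 * YMM only                      YMM            fails  (AVX without SSE)
 */
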
+static void guest_code(void)
+{
+ uint64_t xcr0_reset;
+ uint64_t supported_xcr0;
+ int i, vector;
+
+ set_cr4(get_cr4() | X86_CR4_OSXSAVE);
+
+ xcr0_reset = xgetbv(0);
+ supported_xcr0 = this_cpu_supported_xcr0();
+
+ GUEST_ASSERT(xcr0_reset == XFEATURE_MASK_FP);
+
+ /* Check AVX */
+ ASSERT_XFEATURE_DEPENDENCIES(supported_xcr0,
+ XFEATURE_MASK_YMM,
+ XFEATURE_MASK_SSE);
+
+ /* Check MPX */
+ ASSERT_ALL_OR_NONE_XFEATURE(supported_xcr0,
+ XFEATURE_MASK_BNDREGS | XFEATURE_MASK_BNDCSR);
+
+ /* Check AVX-512 */
+ ASSERT_XFEATURE_DEPENDENCIES(supported_xcr0,
+ XFEATURE_MASK_AVX512,
+ XFEATURE_MASK_SSE | XFEATURE_MASK_YMM);
+ ASSERT_ALL_OR_NONE_XFEATURE(supported_xcr0,
+ XFEATURE_MASK_AVX512);
+
+ /* Check AMX */
+ ASSERT_ALL_OR_NONE_XFEATURE(supported_xcr0,
+ XFEATURE_MASK_XTILE);
+
+ vector = xsetbv_safe(0, supported_xcr0);
+ __GUEST_ASSERT(!vector,
+ "Expected success on XSETBV(0x%lx), got vector '0x%x'",
+ supported_xcr0, vector);
+
+ for (i = 0; i < 64; i++) {
+ if (supported_xcr0 & BIT_ULL(i))
+ continue;
+
+ vector = xsetbv_safe(0, supported_xcr0 | BIT_ULL(i));
+ __GUEST_ASSERT(vector == GP_VECTOR,
+ "Expected #GP on XSETBV(0x%llx), supported XCR0 = %lx, got vector '0x%x'",
+ BIT_ULL(i), supported_xcr0, vector);
+ }
+
+ GUEST_DONE();
+}
+
+int main(int argc, char *argv[])
+{
+ struct kvm_vcpu *vcpu;
+ struct kvm_run *run;
+ struct kvm_vm *vm;
+ struct ucall uc;
+
+ TEST_REQUIRE(kvm_cpu_has(X86_FEATURE_XSAVE));
+
+ vm = vm_create_with_one_vcpu(&vcpu, guest_code);
+ run = vcpu->run;
+
+ vm_init_descriptor_tables(vm);
+ vcpu_init_descriptor_tables(vcpu);
+
+ while (1) {
+ vcpu_run(vcpu);
+
+ TEST_ASSERT(run->exit_reason == KVM_EXIT_IO,
+ "Unexpected exit reason: %u (%s),",
+ run->exit_reason,
+ exit_reason_str(run->exit_reason));
+
+ switch (get_ucall(vcpu, &uc)) {
+ case UCALL_ABORT:
+ REPORT_GUEST_ASSERT(uc);
+ break;
+ case UCALL_DONE:
+ goto done;
+ default:
+ TEST_FAIL("Unknown ucall %lu", uc.cmd);
+ }
+ }
+
+done:
+ kvm_vm_free(vm);
+ return 0;
+}
diff --git a/tools/testing/selftests/kvm/x86_64/xen_shinfo_test.c b/tools/testing/selftests/kvm/x86_64/xen_shinfo_test.c
new file mode 100644
index 000000000000..d2ea0435f4f7
--- /dev/null
+++ b/tools/testing/selftests/kvm/x86_64/xen_shinfo_test.c
@@ -0,0 +1,1156 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Copyright © 2021 Amazon.com, Inc. or its affiliates.
+ */
+
+#include "test_util.h"
+#include "kvm_util.h"
+#include "processor.h"
+
+#include <stdint.h>
+#include <time.h>
+#include <sched.h>
+#include <signal.h>
+#include <pthread.h>
+
+#include <sys/eventfd.h>
+
+#define SHINFO_REGION_GVA 0xc0000000ULL
+#define SHINFO_REGION_GPA 0xc0000000ULL
+#define SHINFO_REGION_SLOT 10
+
+#define DUMMY_REGION_GPA (SHINFO_REGION_GPA + (3 * PAGE_SIZE))
+#define DUMMY_REGION_SLOT 11
+
+#define DUMMY_REGION_GPA_2 (SHINFO_REGION_GPA + (4 * PAGE_SIZE))
+#define DUMMY_REGION_SLOT_2 12
+
+#define SHINFO_ADDR (SHINFO_REGION_GPA)
+#define VCPU_INFO_ADDR (SHINFO_REGION_GPA + 0x40)
+#define PVTIME_ADDR (SHINFO_REGION_GPA + PAGE_SIZE)
+#define RUNSTATE_ADDR (SHINFO_REGION_GPA + PAGE_SIZE + PAGE_SIZE - 15)
+
+#define SHINFO_VADDR (SHINFO_REGION_GVA)
+#define VCPU_INFO_VADDR (SHINFO_REGION_GVA + 0x40)
+#define RUNSTATE_VADDR (SHINFO_REGION_GVA + PAGE_SIZE + PAGE_SIZE - 15)
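+/* Note: "- 15" places the runstate area across a page boundary on purpose. */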
+
+#define EVTCHN_VECTOR 0x10
+
+#define EVTCHN_TEST1 15
+#define EVTCHN_TEST2 66
+#define EVTCHN_TIMER 13
+
+enum {
+ TEST_INJECT_VECTOR = 0,
+ TEST_RUNSTATE_runnable,
+ TEST_RUNSTATE_blocked,
+ TEST_RUNSTATE_offline,
+ TEST_RUNSTATE_ADJUST,
+ TEST_RUNSTATE_DATA,
+ TEST_STEAL_TIME,
+ TEST_EVTCHN_MASKED,
+ TEST_EVTCHN_UNMASKED,
+ TEST_EVTCHN_SLOWPATH,
+ TEST_EVTCHN_SEND_IOCTL,
+ TEST_EVTCHN_HCALL,
+ TEST_EVTCHN_HCALL_SLOWPATH,
+ TEST_EVTCHN_HCALL_EVENTFD,
+ TEST_TIMER_SETUP,
+ TEST_TIMER_WAIT,
+ TEST_TIMER_RESTORE,
+ TEST_POLL_READY,
+ TEST_POLL_TIMEOUT,
+ TEST_POLL_MASKED,
+ TEST_POLL_WAKE,
+ SET_VCPU_INFO,
+ TEST_TIMER_PAST,
+ TEST_LOCKING_SEND_RACE,
+ TEST_LOCKING_POLL_RACE,
+ TEST_LOCKING_POLL_TIMEOUT,
+ TEST_DONE,
+
+ TEST_GUEST_SAW_IRQ,
+};
+
+#define XEN_HYPERCALL_MSR 0x40000000
+
+#define MIN_STEAL_TIME 50000
+
+#define SHINFO_RACE_TIMEOUT 2 /* seconds */
+
+#define __HYPERVISOR_set_timer_op 15
+#define __HYPERVISOR_sched_op 29
+#define __HYPERVISOR_event_channel_op 32
+
+#define SCHEDOP_poll 3
+
+#define EVTCHNOP_send 4
+
+#define EVTCHNSTAT_interdomain 2
+
+struct evtchn_send {
+ u32 port;
+};
+
+struct sched_poll {
+ u32 *ports;
+ unsigned int nr_ports;
+ u64 timeout;
+};
+
+struct pvclock_vcpu_time_info {
+ u32 version;
+ u32 pad0;
+ u64 tsc_timestamp;
+ u64 system_time;
+ u32 tsc_to_system_mul;
+ s8 tsc_shift;
+ u8 flags;
+ u8 pad[2];
+} __attribute__((__packed__)); /* 32 bytes */
+
+struct pvclock_wall_clock {
+ u32 version;
+ u32 sec;
+ u32 nsec;
+} __attribute__((__packed__));
+
+struct vcpu_runstate_info {
+ uint32_t state;
+ uint64_t state_entry_time;
+ uint64_t time[5]; /* Extra field for overrun check */
+};
+
+struct compat_vcpu_runstate_info {
+ uint32_t state;
+ uint64_t state_entry_time;
+ uint64_t time[5];
+} __attribute__((__packed__));
+
+struct arch_vcpu_info {
+ unsigned long cr2;
+ unsigned long pad; /* sizeof(vcpu_info_t) == 64 */
+};
+
+struct vcpu_info {
+ uint8_t evtchn_upcall_pending;
+ uint8_t evtchn_upcall_mask;
+ unsigned long evtchn_pending_sel;
+ struct arch_vcpu_info arch;
+ struct pvclock_vcpu_time_info time;
+}; /* 64 bytes (x86) */
+
+struct shared_info {
+ struct vcpu_info vcpu_info[32];
+ unsigned long evtchn_pending[64];
+ unsigned long evtchn_mask[64];
+ struct pvclock_wall_clock wc;
+ uint32_t wc_sec_hi;
+ /* arch_shared_info here */
+};
+
+#define RUNSTATE_running 0
+#define RUNSTATE_runnable 1
+#define RUNSTATE_blocked 2
+#define RUNSTATE_offline 3
+
+static const char *runstate_names[] = {
+ "running",
+ "runnable",
+ "blocked",
+ "offline"
+};
+
+struct {
+ struct kvm_irq_routing info;
+ struct kvm_irq_routing_entry entries[2];
+} irq_routes;
+
+static volatile bool guest_saw_irq;
+
+static void evtchn_handler(struct ex_regs *regs)
+{
+ struct vcpu_info *vi = (void *)VCPU_INFO_VADDR;
+ vi->evtchn_upcall_pending = 0;
+ vi->evtchn_pending_sel = 0;
+ guest_saw_irq = true;
+
+ GUEST_SYNC(TEST_GUEST_SAW_IRQ);
+}
+
+static void guest_wait_for_irq(void)
+{
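+	/* "rep nop" is PAUSE; spin until the IRQ handler flips guest_saw_irq. */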
+ while (!guest_saw_irq)
+ __asm__ __volatile__ ("rep nop" : : : "memory");
+ guest_saw_irq = false;
+}
+
+static void guest_code(void)
+{
+ struct vcpu_runstate_info *rs = (void *)RUNSTATE_VADDR;
+ int i;
+
+ __asm__ __volatile__(
+ "sti\n"
+ "nop\n"
+ );
+
+ /* Trigger an interrupt injection */
+ GUEST_SYNC(TEST_INJECT_VECTOR);
+
+ guest_wait_for_irq();
+
+ /* Test having the host set runstates manually */
+ GUEST_SYNC(TEST_RUNSTATE_runnable);
+ GUEST_ASSERT(rs->time[RUNSTATE_runnable] != 0);
+ GUEST_ASSERT(rs->state == 0);
+
+ GUEST_SYNC(TEST_RUNSTATE_blocked);
+ GUEST_ASSERT(rs->time[RUNSTATE_blocked] != 0);
+ GUEST_ASSERT(rs->state == 0);
+
+ GUEST_SYNC(TEST_RUNSTATE_offline);
+ GUEST_ASSERT(rs->time[RUNSTATE_offline] != 0);
+ GUEST_ASSERT(rs->state == 0);
+
+ /* Test runstate time adjust */
+ GUEST_SYNC(TEST_RUNSTATE_ADJUST);
+ GUEST_ASSERT(rs->time[RUNSTATE_blocked] == 0x5a);
+ GUEST_ASSERT(rs->time[RUNSTATE_offline] == 0x6b6b);
+
+ /* Test runstate time set */
+ GUEST_SYNC(TEST_RUNSTATE_DATA);
+ GUEST_ASSERT(rs->state_entry_time >= 0x8000);
+ GUEST_ASSERT(rs->time[RUNSTATE_runnable] == 0);
+ GUEST_ASSERT(rs->time[RUNSTATE_blocked] == 0x6b6b);
+ GUEST_ASSERT(rs->time[RUNSTATE_offline] == 0x5a);
+
+ /* sched_yield() should result in some 'runnable' time */
+ GUEST_SYNC(TEST_STEAL_TIME);
+ GUEST_ASSERT(rs->time[RUNSTATE_runnable] >= MIN_STEAL_TIME);
+
+ /* Attempt to deliver a *masked* interrupt */
+ GUEST_SYNC(TEST_EVTCHN_MASKED);
+
+ /* Wait until we see the bit set */
+ struct shared_info *si = (void *)SHINFO_VADDR;
+ while (!si->evtchn_pending[0])
+ __asm__ __volatile__ ("rep nop" : : : "memory");
+
+ /* Now deliver an *unmasked* interrupt */
+ GUEST_SYNC(TEST_EVTCHN_UNMASKED);
+
+ guest_wait_for_irq();
+
+ /* Change memslots and deliver an interrupt */
+ GUEST_SYNC(TEST_EVTCHN_SLOWPATH);
+
+ guest_wait_for_irq();
+
+ /* Deliver event channel with KVM_XEN_HVM_EVTCHN_SEND */
+ GUEST_SYNC(TEST_EVTCHN_SEND_IOCTL);
+
+ guest_wait_for_irq();
+
+ GUEST_SYNC(TEST_EVTCHN_HCALL);
+
+	/*
+	 * Our turn.  Deliver an event channel (to ourselves) with the
+	 * EVTCHNOP_send hypercall.
+	 */
+ struct evtchn_send s = { .port = 127 };
+ xen_hypercall(__HYPERVISOR_event_channel_op, EVTCHNOP_send, &s);
+
+ guest_wait_for_irq();
+
+ GUEST_SYNC(TEST_EVTCHN_HCALL_SLOWPATH);
+
+ /*
+ * Same again, but this time the host has messed with memslots so it
+ * should take the slow path in kvm_xen_set_evtchn().
+ */
+ xen_hypercall(__HYPERVISOR_event_channel_op, EVTCHNOP_send, &s);
+
+ guest_wait_for_irq();
+
+ GUEST_SYNC(TEST_EVTCHN_HCALL_EVENTFD);
+
+	/*
+	 * Deliver an "outbound" event channel to an eventfd which happens to
+	 * be one of our own irqfds.
+	 */
+ s.port = 197;
+ xen_hypercall(__HYPERVISOR_event_channel_op, EVTCHNOP_send, &s);
+
+ guest_wait_for_irq();
+
+ GUEST_SYNC(TEST_TIMER_SETUP);
+
+ /* Set a timer 100ms in the future. */
+ xen_hypercall(__HYPERVISOR_set_timer_op,
+ rs->state_entry_time + 100000000, NULL);
+
+ GUEST_SYNC(TEST_TIMER_WAIT);
+
+ /* Now wait for the timer */
+ guest_wait_for_irq();
+
+ GUEST_SYNC(TEST_TIMER_RESTORE);
+
+ /* The host has 'restored' the timer. Just wait for it. */
+ guest_wait_for_irq();
+
+ GUEST_SYNC(TEST_POLL_READY);
+
+ /* Poll for an event channel port which is already set */
+ u32 ports[1] = { EVTCHN_TIMER };
+ struct sched_poll p = {
+ .ports = ports,
+ .nr_ports = 1,
+ .timeout = 0,
+ };
+
+ xen_hypercall(__HYPERVISOR_sched_op, SCHEDOP_poll, &p);
+
+ GUEST_SYNC(TEST_POLL_TIMEOUT);
+
+ /* Poll for an unset port and wait for the timeout. */
+ p.timeout = 100000000;
+ xen_hypercall(__HYPERVISOR_sched_op, SCHEDOP_poll, &p);
+
+ GUEST_SYNC(TEST_POLL_MASKED);
+
+ /* A timer will wake the masked port we're waiting on, while we poll */
+ p.timeout = 0;
+ xen_hypercall(__HYPERVISOR_sched_op, SCHEDOP_poll, &p);
+
+ GUEST_SYNC(TEST_POLL_WAKE);
+
+	/*
+	 * Set the vcpu_info to point at exactly the place it already is, to
+	 * make sure the attribute is functional.
+	 */
+ GUEST_SYNC(SET_VCPU_INFO);
+
+	/*
+	 * A timer will wake an *unmasked* port, which should wake us with an
+	 * actual interrupt, while we're polling on a different port.
+	 */
+ ports[0]++;
+ p.timeout = 0;
+ xen_hypercall(__HYPERVISOR_sched_op, SCHEDOP_poll, &p);
+
+ guest_wait_for_irq();
+
+ GUEST_SYNC(TEST_TIMER_PAST);
+
+ /* Timer should have fired already */
+ guest_wait_for_irq();
+
+ GUEST_SYNC(TEST_LOCKING_SEND_RACE);
+ /* Racing host ioctls */
+
+ guest_wait_for_irq();
+
+ GUEST_SYNC(TEST_LOCKING_POLL_RACE);
+ /* Racing vmcall against host ioctl */
+
+ ports[0] = 0;
+
+ p = (struct sched_poll) {
+ .ports = ports,
+ .nr_ports = 1,
+ .timeout = 0
+ };
+
+wait_for_timer:
+ /*
+ * Poll for a timer wake event while the worker thread is mucking with
+ * the shared info. KVM XEN drops timer IRQs if the shared info is
+ * invalid when the timer expires. Arbitrarily poll 100 times before
+ * giving up and asking the VMM to re-arm the timer. 100 polls should
+ * consume enough time to beat on KVM without taking too long if the
+ * timer IRQ is dropped due to an invalid event channel.
+ */
+ for (i = 0; i < 100 && !guest_saw_irq; i++)
+ __xen_hypercall(__HYPERVISOR_sched_op, SCHEDOP_poll, &p);
+
+ /*
+ * Re-send the timer IRQ if it was (likely) dropped due to the timer
+ * expiring while the event channel was invalid.
+ */
+ if (!guest_saw_irq) {
+ GUEST_SYNC(TEST_LOCKING_POLL_TIMEOUT);
+ goto wait_for_timer;
+ }
+ guest_saw_irq = false;
+
+ GUEST_SYNC(TEST_DONE);
+}
+
+static int cmp_timespec(struct timespec *a, struct timespec *b)
+{
+ if (a->tv_sec > b->tv_sec)
+ return 1;
+ else if (a->tv_sec < b->tv_sec)
+ return -1;
+ else if (a->tv_nsec > b->tv_nsec)
+ return 1;
+ else if (a->tv_nsec < b->tv_nsec)
+ return -1;
+ else
+ return 0;
+}
+
+static struct shared_info *shinfo;
+static struct vcpu_info *vinfo;
+static struct kvm_vcpu *vcpu;
+
+static void handle_alrm(int sig)
+{
+ if (vinfo)
+ printf("evtchn_upcall_pending 0x%x\n", vinfo->evtchn_upcall_pending);
+ vcpu_dump(stdout, vcpu, 0);
+ TEST_FAIL("IRQ delivery timed out");
+}
+
+static void *juggle_shinfo_state(void *arg)
+{
+ struct kvm_vm *vm = (struct kvm_vm *)arg;
+
+ struct kvm_xen_hvm_attr cache_activate_gfn = {
+ .type = KVM_XEN_ATTR_TYPE_SHARED_INFO,
+ .u.shared_info.gfn = SHINFO_REGION_GPA / PAGE_SIZE
+ };
+
+ struct kvm_xen_hvm_attr cache_deactivate_gfn = {
+ .type = KVM_XEN_ATTR_TYPE_SHARED_INFO,
+ .u.shared_info.gfn = KVM_XEN_INVALID_GFN
+ };
+
+ struct kvm_xen_hvm_attr cache_activate_hva = {
+ .type = KVM_XEN_ATTR_TYPE_SHARED_INFO_HVA,
+ .u.shared_info.hva = (unsigned long)shinfo
+ };
+
+ struct kvm_xen_hvm_attr cache_deactivate_hva = {
+ .type = KVM_XEN_ATTR_TYPE_SHARED_INFO,
+ .u.shared_info.hva = 0
+ };
+
+ int xen_caps = kvm_check_cap(KVM_CAP_XEN_HVM);
+
+ for (;;) {
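+		/*
+		 * pthread_testcancel() provides cancellation points for the
+		 * deferred pthread_cancel() issued by the main thread.
+		 */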
+ __vm_ioctl(vm, KVM_XEN_HVM_SET_ATTR, &cache_activate_gfn);
+ pthread_testcancel();
+ __vm_ioctl(vm, KVM_XEN_HVM_SET_ATTR, &cache_deactivate_gfn);
+
+ if (xen_caps & KVM_XEN_HVM_CONFIG_SHARED_INFO_HVA) {
+ __vm_ioctl(vm, KVM_XEN_HVM_SET_ATTR, &cache_activate_hva);
+ pthread_testcancel();
+ __vm_ioctl(vm, KVM_XEN_HVM_SET_ATTR, &cache_deactivate_hva);
+ }
+ }
+
+ return NULL;
+}
+
+int main(int argc, char *argv[])
+{
+ struct timespec min_ts, max_ts, vm_ts;
+ struct kvm_xen_hvm_attr evt_reset;
+ struct kvm_vm *vm;
+ pthread_t thread;
+ bool verbose;
+ int ret;
+
+ verbose = argc > 1 && (!strncmp(argv[1], "-v", 3) ||
+ !strncmp(argv[1], "--verbose", 10));
+
+ int xen_caps = kvm_check_cap(KVM_CAP_XEN_HVM);
+ TEST_REQUIRE(xen_caps & KVM_XEN_HVM_CONFIG_SHARED_INFO);
+
+ bool do_runstate_tests = !!(xen_caps & KVM_XEN_HVM_CONFIG_RUNSTATE);
+ bool do_runstate_flag = !!(xen_caps & KVM_XEN_HVM_CONFIG_RUNSTATE_UPDATE_FLAG);
+ bool do_eventfd_tests = !!(xen_caps & KVM_XEN_HVM_CONFIG_EVTCHN_2LEVEL);
+ bool do_evtchn_tests = do_eventfd_tests && !!(xen_caps & KVM_XEN_HVM_CONFIG_EVTCHN_SEND);
+ bool has_shinfo_hva = !!(xen_caps & KVM_XEN_HVM_CONFIG_SHARED_INFO_HVA);
+
+ clock_gettime(CLOCK_REALTIME, &min_ts);
+
+ vm = vm_create_with_one_vcpu(&vcpu, guest_code);
+
+ /* Map a region for the shared_info page */
+ vm_userspace_mem_region_add(vm, VM_MEM_SRC_ANONYMOUS,
+ SHINFO_REGION_GPA, SHINFO_REGION_SLOT, 3, 0);
+ virt_map(vm, SHINFO_REGION_GVA, SHINFO_REGION_GPA, 3);
+
+ shinfo = addr_gpa2hva(vm, SHINFO_VADDR);
+
+ int zero_fd = open("/dev/zero", O_RDONLY);
+ TEST_ASSERT(zero_fd != -1, "Failed to open /dev/zero");
+
+ struct kvm_xen_hvm_config hvmc = {
+ .flags = KVM_XEN_HVM_CONFIG_INTERCEPT_HCALL,
+ .msr = XEN_HYPERCALL_MSR,
+ };
+
+	/*
+	 * Let the kernel know that we *will* use it for sending all event
+	 * channels, which lets it intercept SCHEDOP_poll.
+	 */
+ if (do_evtchn_tests)
+ hvmc.flags |= KVM_XEN_HVM_CONFIG_EVTCHN_SEND;
+
+ vm_ioctl(vm, KVM_XEN_HVM_CONFIG, &hvmc);
+
+ struct kvm_xen_hvm_attr lm = {
+ .type = KVM_XEN_ATTR_TYPE_LONG_MODE,
+ .u.long_mode = 1,
+ };
+ vm_ioctl(vm, KVM_XEN_HVM_SET_ATTR, &lm);
+
+ if (do_runstate_flag) {
+ struct kvm_xen_hvm_attr ruf = {
+ .type = KVM_XEN_ATTR_TYPE_RUNSTATE_UPDATE_FLAG,
+ .u.runstate_update_flag = 1,
+ };
+ vm_ioctl(vm, KVM_XEN_HVM_SET_ATTR, &ruf);
+
+ ruf.u.runstate_update_flag = 0;
+ vm_ioctl(vm, KVM_XEN_HVM_GET_ATTR, &ruf);
+ TEST_ASSERT(ruf.u.runstate_update_flag == 1,
+ "Failed to read back RUNSTATE_UPDATE_FLAG attr");
+ }
+
+ struct kvm_xen_hvm_attr ha = {};
+
+ if (has_shinfo_hva) {
+ ha.type = KVM_XEN_ATTR_TYPE_SHARED_INFO_HVA;
+ ha.u.shared_info.hva = (unsigned long)shinfo;
+ } else {
+ ha.type = KVM_XEN_ATTR_TYPE_SHARED_INFO;
+ ha.u.shared_info.gfn = SHINFO_ADDR / PAGE_SIZE;
+ }
+
+ vm_ioctl(vm, KVM_XEN_HVM_SET_ATTR, &ha);
+
+ /*
+ * Test what happens when the HVA of the shinfo page is remapped after
+ * the kernel has a reference to it. But make sure we copy the clock
+ * info over since that's only set at setup time, and we test it later.
+ */
+ struct pvclock_wall_clock wc_copy = shinfo->wc;
+ void *m = mmap(shinfo, PAGE_SIZE, PROT_READ|PROT_WRITE, MAP_FIXED|MAP_PRIVATE, zero_fd, 0);
+ TEST_ASSERT(m == shinfo, "Failed to map /dev/zero over shared info");
+ shinfo->wc = wc_copy;
+
+ struct kvm_xen_vcpu_attr vi = {
+ .type = KVM_XEN_VCPU_ATTR_TYPE_VCPU_INFO,
+ .u.gpa = VCPU_INFO_ADDR,
+ };
+ vcpu_ioctl(vcpu, KVM_XEN_VCPU_SET_ATTR, &vi);
+
+ struct kvm_xen_vcpu_attr pvclock = {
+ .type = KVM_XEN_VCPU_ATTR_TYPE_VCPU_TIME_INFO,
+ .u.gpa = PVTIME_ADDR,
+ };
+ vcpu_ioctl(vcpu, KVM_XEN_VCPU_SET_ATTR, &pvclock);
+
+ struct kvm_xen_hvm_attr vec = {
+ .type = KVM_XEN_ATTR_TYPE_UPCALL_VECTOR,
+ .u.vector = EVTCHN_VECTOR,
+ };
+ vm_ioctl(vm, KVM_XEN_HVM_SET_ATTR, &vec);
+
+ vm_init_descriptor_tables(vm);
+ vcpu_init_descriptor_tables(vcpu);
+ vm_install_exception_handler(vm, EVTCHN_VECTOR, evtchn_handler);
+
+ if (do_runstate_tests) {
+ struct kvm_xen_vcpu_attr st = {
+ .type = KVM_XEN_VCPU_ATTR_TYPE_RUNSTATE_ADDR,
+ .u.gpa = RUNSTATE_ADDR,
+ };
+ vcpu_ioctl(vcpu, KVM_XEN_VCPU_SET_ATTR, &st);
+ }
+
+ int irq_fd[2] = { -1, -1 };
+
+ if (do_eventfd_tests) {
+ irq_fd[0] = eventfd(0, 0);
+ irq_fd[1] = eventfd(0, 0);
+
+ /* Unexpected, but not a KVM failure */
+ if (irq_fd[0] == -1 || irq_fd[1] == -1)
+ do_evtchn_tests = do_eventfd_tests = false;
+ }
+
+ if (do_eventfd_tests) {
+ irq_routes.info.nr = 2;
+
+ irq_routes.entries[0].gsi = 32;
+ irq_routes.entries[0].type = KVM_IRQ_ROUTING_XEN_EVTCHN;
+ irq_routes.entries[0].u.xen_evtchn.port = EVTCHN_TEST1;
+ irq_routes.entries[0].u.xen_evtchn.vcpu = vcpu->id;
+ irq_routes.entries[0].u.xen_evtchn.priority = KVM_IRQ_ROUTING_XEN_EVTCHN_PRIO_2LEVEL;
+
+ irq_routes.entries[1].gsi = 33;
+ irq_routes.entries[1].type = KVM_IRQ_ROUTING_XEN_EVTCHN;
+ irq_routes.entries[1].u.xen_evtchn.port = EVTCHN_TEST2;
+ irq_routes.entries[1].u.xen_evtchn.vcpu = vcpu->id;
+ irq_routes.entries[1].u.xen_evtchn.priority = KVM_IRQ_ROUTING_XEN_EVTCHN_PRIO_2LEVEL;
+
+ vm_ioctl(vm, KVM_SET_GSI_ROUTING, &irq_routes.info);
+
+ struct kvm_irqfd ifd = { };
+
+ ifd.fd = irq_fd[0];
+ ifd.gsi = 32;
+ vm_ioctl(vm, KVM_IRQFD, &ifd);
+
+ ifd.fd = irq_fd[1];
+ ifd.gsi = 33;
+ vm_ioctl(vm, KVM_IRQFD, &ifd);
+
+ struct sigaction sa = { };
+ sa.sa_handler = handle_alrm;
+ sigaction(SIGALRM, &sa, NULL);
+ }
+
+ struct kvm_xen_vcpu_attr tmr = {
+ .type = KVM_XEN_VCPU_ATTR_TYPE_TIMER,
+ .u.timer.port = EVTCHN_TIMER,
+ .u.timer.priority = KVM_IRQ_ROUTING_XEN_EVTCHN_PRIO_2LEVEL,
+ .u.timer.expires_ns = 0
+ };
+
+ if (do_evtchn_tests) {
+ struct kvm_xen_hvm_attr inj = {
+ .type = KVM_XEN_ATTR_TYPE_EVTCHN,
+ .u.evtchn.send_port = 127,
+ .u.evtchn.type = EVTCHNSTAT_interdomain,
+ .u.evtchn.flags = 0,
+ .u.evtchn.deliver.port.port = EVTCHN_TEST1,
+ .u.evtchn.deliver.port.vcpu = vcpu->id + 1,
+ .u.evtchn.deliver.port.priority = KVM_IRQ_ROUTING_XEN_EVTCHN_PRIO_2LEVEL,
+ };
+ vm_ioctl(vm, KVM_XEN_HVM_SET_ATTR, &inj);
+
+ /* Test migration to a different vCPU */
+ inj.u.evtchn.flags = KVM_XEN_EVTCHN_UPDATE;
+ inj.u.evtchn.deliver.port.vcpu = vcpu->id;
+ vm_ioctl(vm, KVM_XEN_HVM_SET_ATTR, &inj);
+
+ inj.u.evtchn.send_port = 197;
+ inj.u.evtchn.deliver.eventfd.port = 0;
+ inj.u.evtchn.deliver.eventfd.fd = irq_fd[1];
+ inj.u.evtchn.flags = 0;
+ vm_ioctl(vm, KVM_XEN_HVM_SET_ATTR, &inj);
+
+ vcpu_ioctl(vcpu, KVM_XEN_VCPU_SET_ATTR, &tmr);
+ }
+ vinfo = addr_gpa2hva(vm, VCPU_INFO_VADDR);
+ vinfo->evtchn_upcall_pending = 0;
+
+ struct vcpu_runstate_info *rs = addr_gpa2hva(vm, RUNSTATE_ADDR);
+ rs->state = 0x5a;
+
+ bool evtchn_irq_expected = false;
+
+ for (;;) {
+ struct ucall uc;
+
+ vcpu_run(vcpu);
+ TEST_ASSERT_KVM_EXIT_REASON(vcpu, KVM_EXIT_IO);
+
+ switch (get_ucall(vcpu, &uc)) {
+ case UCALL_ABORT:
+ REPORT_GUEST_ASSERT(uc);
+ /* NOT REACHED */
+ case UCALL_SYNC: {
+ struct kvm_xen_vcpu_attr rst;
+ long rundelay;
+
+ if (do_runstate_tests)
+ TEST_ASSERT(rs->state_entry_time == rs->time[0] +
+ rs->time[1] + rs->time[2] + rs->time[3],
+ "runstate times don't add up");
+
+ switch (uc.args[1]) {
+ case TEST_INJECT_VECTOR:
+ if (verbose)
+ printf("Delivering evtchn upcall\n");
+ evtchn_irq_expected = true;
+ vinfo->evtchn_upcall_pending = 1;
+ break;
+
+ case TEST_RUNSTATE_runnable...TEST_RUNSTATE_offline:
+ TEST_ASSERT(!evtchn_irq_expected, "Event channel IRQ not seen");
+ if (!do_runstate_tests)
+ goto done;
+ if (verbose)
+ printf("Testing runstate %s\n", runstate_names[uc.args[1]]);
+ rst.type = KVM_XEN_VCPU_ATTR_TYPE_RUNSTATE_CURRENT;
+ rst.u.runstate.state = uc.args[1] + RUNSTATE_runnable -
+ TEST_RUNSTATE_runnable;
+ vcpu_ioctl(vcpu, KVM_XEN_VCPU_SET_ATTR, &rst);
+ break;
+
+ case TEST_RUNSTATE_ADJUST:
+ if (verbose)
+ printf("Testing RUNSTATE_ADJUST\n");
+ rst.type = KVM_XEN_VCPU_ATTR_TYPE_RUNSTATE_ADJUST;
+ memset(&rst.u, 0, sizeof(rst.u));
+ rst.u.runstate.state = (uint64_t)-1;
+ rst.u.runstate.time_blocked =
+ 0x5a - rs->time[RUNSTATE_blocked];
+ rst.u.runstate.time_offline =
+ 0x6b6b - rs->time[RUNSTATE_offline];
+ rst.u.runstate.time_runnable = -rst.u.runstate.time_blocked -
+ rst.u.runstate.time_offline;
+ vcpu_ioctl(vcpu, KVM_XEN_VCPU_SET_ATTR, &rst);
+ break;
+
+ case TEST_RUNSTATE_DATA:
+ if (verbose)
+ printf("Testing RUNSTATE_DATA\n");
+ rst.type = KVM_XEN_VCPU_ATTR_TYPE_RUNSTATE_DATA;
+ memset(&rst.u, 0, sizeof(rst.u));
+ rst.u.runstate.state = RUNSTATE_running;
+ rst.u.runstate.state_entry_time = 0x6b6b + 0x5a;
+ rst.u.runstate.time_blocked = 0x6b6b;
+ rst.u.runstate.time_offline = 0x5a;
+ vcpu_ioctl(vcpu, KVM_XEN_VCPU_SET_ATTR, &rst);
+ break;
+
+ case TEST_STEAL_TIME:
+ if (verbose)
+ printf("Testing steal time\n");
+ /* Yield until scheduler delay exceeds target */
+ rundelay = get_run_delay() + MIN_STEAL_TIME;
+ do {
+ sched_yield();
+ } while (get_run_delay() < rundelay);
+ break;
+
+ case TEST_EVTCHN_MASKED:
+ if (!do_eventfd_tests)
+ goto done;
+ if (verbose)
+ printf("Testing masked event channel\n");
+ shinfo->evtchn_mask[0] = 1UL << EVTCHN_TEST1;
+ eventfd_write(irq_fd[0], 1UL);
+ alarm(1);
+ break;
+
+ case TEST_EVTCHN_UNMASKED:
+ if (verbose)
+ printf("Testing unmasked event channel\n");
+ /* Unmask that, but deliver the other one */
+ shinfo->evtchn_pending[0] = 0;
+ shinfo->evtchn_mask[0] = 0;
+ eventfd_write(irq_fd[1], 1UL);
+ evtchn_irq_expected = true;
+ alarm(1);
+ break;
+
+ case TEST_EVTCHN_SLOWPATH:
+ TEST_ASSERT(!evtchn_irq_expected,
+ "Expected event channel IRQ but it didn't happen");
+ shinfo->evtchn_pending[1] = 0;
+ if (verbose)
+ printf("Testing event channel after memslot change\n");
+ vm_userspace_mem_region_add(vm, VM_MEM_SRC_ANONYMOUS,
+ DUMMY_REGION_GPA, DUMMY_REGION_SLOT, 1, 0);
+ eventfd_write(irq_fd[0], 1UL);
+ evtchn_irq_expected = true;
+ alarm(1);
+ break;
+
+ case TEST_EVTCHN_SEND_IOCTL:
+ TEST_ASSERT(!evtchn_irq_expected,
+ "Expected event channel IRQ but it didn't happen");
+ if (!do_evtchn_tests)
+ goto done;
+
+ shinfo->evtchn_pending[0] = 0;
+ if (verbose)
+ printf("Testing injection with KVM_XEN_HVM_EVTCHN_SEND\n");
+
+ struct kvm_irq_routing_xen_evtchn e;
+ e.port = EVTCHN_TEST2;
+ e.vcpu = vcpu->id;
+ e.priority = KVM_IRQ_ROUTING_XEN_EVTCHN_PRIO_2LEVEL;
+
+ vm_ioctl(vm, KVM_XEN_HVM_EVTCHN_SEND, &e);
+ evtchn_irq_expected = true;
+ alarm(1);
+ break;
+
+ case TEST_EVTCHN_HCALL:
+ TEST_ASSERT(!evtchn_irq_expected,
+ "Expected event channel IRQ but it didn't happen");
+ shinfo->evtchn_pending[1] = 0;
+
+ if (verbose)
+ printf("Testing guest EVTCHNOP_send direct to evtchn\n");
+ evtchn_irq_expected = true;
+ alarm(1);
+ break;
+
+ case TEST_EVTCHN_HCALL_SLOWPATH:
+ TEST_ASSERT(!evtchn_irq_expected,
+ "Expected event channel IRQ but it didn't happen");
+ shinfo->evtchn_pending[0] = 0;
+
+ if (verbose)
+ printf("Testing guest EVTCHNOP_send direct to evtchn after memslot change\n");
+ vm_userspace_mem_region_add(vm, VM_MEM_SRC_ANONYMOUS,
+ DUMMY_REGION_GPA_2, DUMMY_REGION_SLOT_2, 1, 0);
+ evtchn_irq_expected = true;
+ alarm(1);
+ break;
+
+ case TEST_EVTCHN_HCALL_EVENTFD:
+ TEST_ASSERT(!evtchn_irq_expected,
+ "Expected event channel IRQ but it didn't happen");
+ shinfo->evtchn_pending[0] = 0;
+
+ if (verbose)
+ printf("Testing guest EVTCHNOP_send to eventfd\n");
+ evtchn_irq_expected = true;
+ alarm(1);
+ break;
+
+ case TEST_TIMER_SETUP:
+ TEST_ASSERT(!evtchn_irq_expected,
+ "Expected event channel IRQ but it didn't happen");
+ shinfo->evtchn_pending[1] = 0;
+
+ if (verbose)
+ printf("Testing guest oneshot timer\n");
+ break;
+
+ case TEST_TIMER_WAIT:
+ memset(&tmr, 0, sizeof(tmr));
+ tmr.type = KVM_XEN_VCPU_ATTR_TYPE_TIMER;
+ vcpu_ioctl(vcpu, KVM_XEN_VCPU_GET_ATTR, &tmr);
+ TEST_ASSERT(tmr.u.timer.port == EVTCHN_TIMER,
+ "Timer port not returned");
+ TEST_ASSERT(tmr.u.timer.priority == KVM_IRQ_ROUTING_XEN_EVTCHN_PRIO_2LEVEL,
+ "Timer priority not returned");
+ TEST_ASSERT(tmr.u.timer.expires_ns > rs->state_entry_time,
+ "Timer expiry not returned");
+ evtchn_irq_expected = true;
+ alarm(1);
+ break;
+
+ case TEST_TIMER_RESTORE:
+ TEST_ASSERT(!evtchn_irq_expected,
+ "Expected event channel IRQ but it didn't happen");
+ shinfo->evtchn_pending[0] = 0;
+
+ if (verbose)
+ printf("Testing restored oneshot timer\n");
+
+ tmr.u.timer.expires_ns = rs->state_entry_time + 100000000;
+ vcpu_ioctl(vcpu, KVM_XEN_VCPU_SET_ATTR, &tmr);
+ evtchn_irq_expected = true;
+ alarm(1);
+ break;
+
+ case TEST_POLL_READY:
+ TEST_ASSERT(!evtchn_irq_expected,
+ "Expected event channel IRQ but it didn't happen");
+
+ if (verbose)
+ printf("Testing SCHEDOP_poll with already pending event\n");
+ shinfo->evtchn_pending[0] = shinfo->evtchn_mask[0] = 1UL << EVTCHN_TIMER;
+ alarm(1);
+ break;
+
+ case TEST_POLL_TIMEOUT:
+ if (verbose)
+ printf("Testing SCHEDOP_poll timeout\n");
+ shinfo->evtchn_pending[0] = 0;
+ alarm(1);
+ break;
+
+ case TEST_POLL_MASKED:
+ if (verbose)
+ printf("Testing SCHEDOP_poll wake on masked event\n");
+
+ tmr.u.timer.expires_ns = rs->state_entry_time + 100000000;
+ vcpu_ioctl(vcpu, KVM_XEN_VCPU_SET_ATTR, &tmr);
+ alarm(1);
+ break;
+
+ case TEST_POLL_WAKE:
+ shinfo->evtchn_pending[0] = shinfo->evtchn_mask[0] = 0;
+ if (verbose)
+ printf("Testing SCHEDOP_poll wake on unmasked event\n");
+
+ evtchn_irq_expected = true;
+ tmr.u.timer.expires_ns = rs->state_entry_time + 100000000;
+ vcpu_ioctl(vcpu, KVM_XEN_VCPU_SET_ATTR, &tmr);
+
+ /* Read it back and check the pending time is reported correctly */
+ tmr.u.timer.expires_ns = 0;
+ vcpu_ioctl(vcpu, KVM_XEN_VCPU_GET_ATTR, &tmr);
+ TEST_ASSERT(tmr.u.timer.expires_ns == rs->state_entry_time + 100000000,
+ "Timer not reported pending");
+ alarm(1);
+ break;
+
+ case SET_VCPU_INFO:
+ if (has_shinfo_hva) {
+ struct kvm_xen_vcpu_attr vih = {
+ .type = KVM_XEN_VCPU_ATTR_TYPE_VCPU_INFO_HVA,
+ .u.hva = (unsigned long)vinfo
+ };
+ vcpu_ioctl(vcpu, KVM_XEN_VCPU_SET_ATTR, &vih);
+ }
+ break;
+
+ case TEST_TIMER_PAST:
+ TEST_ASSERT(!evtchn_irq_expected,
+ "Expected event channel IRQ but it didn't happen");
+ /* Read timer and check it is no longer pending */
+ vcpu_ioctl(vcpu, KVM_XEN_VCPU_GET_ATTR, &tmr);
+ TEST_ASSERT(!tmr.u.timer.expires_ns, "Timer still reported pending");
+
+ shinfo->evtchn_pending[0] = 0;
+ if (verbose)
+ printf("Testing timer in the past\n");
+
+ evtchn_irq_expected = true;
+ tmr.u.timer.expires_ns = rs->state_entry_time - 100000000ULL;
+ vcpu_ioctl(vcpu, KVM_XEN_VCPU_SET_ATTR, &tmr);
+ alarm(1);
+ break;
+
+ case TEST_LOCKING_SEND_RACE:
+ TEST_ASSERT(!evtchn_irq_expected,
+ "Expected event channel IRQ but it didn't happen");
+ alarm(0);
+
+ if (verbose)
+ printf("Testing shinfo lock corruption (KVM_XEN_HVM_EVTCHN_SEND)\n");
+
+ ret = pthread_create(&thread, NULL, &juggle_shinfo_state, (void *)vm);
+ TEST_ASSERT(ret == 0, "pthread_create() failed: %s", strerror(ret));
+
+ struct kvm_irq_routing_xen_evtchn uxe = {
+ .port = 1,
+ .vcpu = vcpu->id,
+ .priority = KVM_IRQ_ROUTING_XEN_EVTCHN_PRIO_2LEVEL
+ };
+
+ evtchn_irq_expected = true;
+ for (time_t t = time(NULL) + SHINFO_RACE_TIMEOUT; time(NULL) < t;)
+ __vm_ioctl(vm, KVM_XEN_HVM_EVTCHN_SEND, &uxe);
+ break;
+
+ case TEST_LOCKING_POLL_RACE:
+ TEST_ASSERT(!evtchn_irq_expected,
+ "Expected event channel IRQ but it didn't happen");
+
+ if (verbose)
+ printf("Testing shinfo lock corruption (SCHEDOP_poll)\n");
+
+ shinfo->evtchn_pending[0] = 1;
+
+ evtchn_irq_expected = true;
+ tmr.u.timer.expires_ns = rs->state_entry_time +
+ SHINFO_RACE_TIMEOUT * 1000000000ULL;
+ vcpu_ioctl(vcpu, KVM_XEN_VCPU_SET_ATTR, &tmr);
+ break;
+
+ case TEST_LOCKING_POLL_TIMEOUT:
+ /*
+ * Optional and possibly repeated sync point.
+ * Injecting the timer IRQ may fail if the
+ * shinfo is invalid when the timer expires.
+ * If the timer has expired but the IRQ hasn't
+ * been delivered, rearm the timer and retry.
+ */
+ vcpu_ioctl(vcpu, KVM_XEN_VCPU_GET_ATTR, &tmr);
+
+ /* Resume the guest if the timer is still pending. */
+ if (tmr.u.timer.expires_ns)
+ break;
+
+ /* All done if the IRQ was delivered. */
+ if (!evtchn_irq_expected)
+ break;
+
+ tmr.u.timer.expires_ns = rs->state_entry_time +
+ SHINFO_RACE_TIMEOUT * 1000000000ULL;
+ vcpu_ioctl(vcpu, KVM_XEN_VCPU_SET_ATTR, &tmr);
+ break;
+ case TEST_DONE:
+ TEST_ASSERT(!evtchn_irq_expected,
+ "Expected event channel IRQ but it didn't happen");
+
+ ret = pthread_cancel(thread);
+ TEST_ASSERT(ret == 0, "pthread_cancel() failed: %s", strerror(ret));
+
+ ret = pthread_join(thread, 0);
+ TEST_ASSERT(ret == 0, "pthread_join() failed: %s", strerror(ret));
+ goto done;
+
+ case TEST_GUEST_SAW_IRQ:
+ TEST_ASSERT(evtchn_irq_expected, "Unexpected event channel IRQ");
+ evtchn_irq_expected = false;
+ break;
+ }
+ break;
+ }
+ case UCALL_DONE:
+ goto done;
+ default:
+ TEST_FAIL("Unknown ucall 0x%lx.", uc.cmd);
+ }
+ }
+
+ done:
+ evt_reset.type = KVM_XEN_ATTR_TYPE_EVTCHN;
+ evt_reset.u.evtchn.flags = KVM_XEN_EVTCHN_RESET;
+ vm_ioctl(vm, KVM_XEN_HVM_SET_ATTR, &evt_reset);
+
+ alarm(0);
+ clock_gettime(CLOCK_REALTIME, &max_ts);
+
+ /*
+ * Just a *really* basic check that things are being put in the
+ * right place. The actual calculations are much the same for
+ * Xen as they are for the KVM variants, so no need to check.
+ */
+ struct pvclock_wall_clock *wc;
+ struct pvclock_vcpu_time_info *ti, *ti2;
+
+ wc = addr_gpa2hva(vm, SHINFO_REGION_GPA + 0xc00);
+ ti = addr_gpa2hva(vm, SHINFO_REGION_GPA + 0x40 + 0x20);
+ ti2 = addr_gpa2hva(vm, PVTIME_ADDR);
+
+ if (verbose) {
+ printf("Wall clock (v %d) %d.%09d\n", wc->version, wc->sec, wc->nsec);
+ printf("Time info 1: v %u tsc %" PRIu64 " time %" PRIu64 " mul %u shift %u flags %x\n",
+ ti->version, ti->tsc_timestamp, ti->system_time, ti->tsc_to_system_mul,
+ ti->tsc_shift, ti->flags);
+ printf("Time info 2: v %u tsc %" PRIu64 " time %" PRIu64 " mul %u shift %u flags %x\n",
+ ti2->version, ti2->tsc_timestamp, ti2->system_time, ti2->tsc_to_system_mul,
+ ti2->tsc_shift, ti2->flags);
+ }
+
+ vm_ts.tv_sec = wc->sec;
+ vm_ts.tv_nsec = wc->nsec;
+ TEST_ASSERT(wc->version && !(wc->version & 1),
+ "Bad wallclock version %x", wc->version);
+ TEST_ASSERT(cmp_timespec(&min_ts, &vm_ts) <= 0, "VM time too old");
+ TEST_ASSERT(cmp_timespec(&max_ts, &vm_ts) >= 0, "VM time too new");
+
+ TEST_ASSERT(ti->version && !(ti->version & 1),
+ "Bad time_info version %x", ti->version);
+ TEST_ASSERT(ti2->version && !(ti2->version & 1),
+		    "Bad time_info version %x", ti2->version);
+
+ if (do_runstate_tests) {
+ /*
+ * Fetch runstate and check sanity. Strictly speaking in the
+ * general case we might not expect the numbers to be identical
+ * but in this case we know we aren't running the vCPU any more.
+ */
+ struct kvm_xen_vcpu_attr rst = {
+ .type = KVM_XEN_VCPU_ATTR_TYPE_RUNSTATE_DATA,
+ };
+ vcpu_ioctl(vcpu, KVM_XEN_VCPU_GET_ATTR, &rst);
+
+ if (verbose) {
+ printf("Runstate: %s(%d), entry %" PRIu64 " ns\n",
+ rs->state <= RUNSTATE_offline ? runstate_names[rs->state] : "unknown",
+ rs->state, rs->state_entry_time);
+ for (int i = RUNSTATE_running; i <= RUNSTATE_offline; i++) {
+ printf("State %s: %" PRIu64 " ns\n",
+ runstate_names[i], rs->time[i]);
+ }
+ }
+
+ /*
+ * Exercise runstate info at all points across the page boundary, in
+ * 32-bit and 64-bit mode. In particular, test the case where it is
+ * configured in 32-bit mode and then switched to 64-bit mode while
+ * active, which takes it onto the second page.
+ */
+ unsigned long runstate_addr;
+ struct compat_vcpu_runstate_info *crs;
+ for (runstate_addr = SHINFO_REGION_GPA + PAGE_SIZE + PAGE_SIZE - sizeof(*rs) - 4;
+ runstate_addr < SHINFO_REGION_GPA + PAGE_SIZE + PAGE_SIZE + 4; runstate_addr++) {
+
+ rs = addr_gpa2hva(vm, runstate_addr);
+ crs = (void *)rs;
+
+ memset(rs, 0xa5, sizeof(*rs));
+
+ /* Set to compatibility mode */
+ lm.u.long_mode = 0;
+ vm_ioctl(vm, KVM_XEN_HVM_SET_ATTR, &lm);
+
+ /* Set runstate to new address (kernel will write it) */
+ struct kvm_xen_vcpu_attr st = {
+ .type = KVM_XEN_VCPU_ATTR_TYPE_RUNSTATE_ADDR,
+ .u.gpa = runstate_addr,
+ };
+ vcpu_ioctl(vcpu, KVM_XEN_VCPU_SET_ATTR, &st);
+
+ if (verbose)
+ printf("Compatibility runstate at %08lx\n", runstate_addr);
+
+ TEST_ASSERT(crs->state == rst.u.runstate.state, "Runstate mismatch");
+ TEST_ASSERT(crs->state_entry_time == rst.u.runstate.state_entry_time,
+ "State entry time mismatch");
+ TEST_ASSERT(crs->time[RUNSTATE_running] == rst.u.runstate.time_running,
+ "Running time mismatch");
+ TEST_ASSERT(crs->time[RUNSTATE_runnable] == rst.u.runstate.time_runnable,
+ "Runnable time mismatch");
+ TEST_ASSERT(crs->time[RUNSTATE_blocked] == rst.u.runstate.time_blocked,
+ "Blocked time mismatch");
+ TEST_ASSERT(crs->time[RUNSTATE_offline] == rst.u.runstate.time_offline,
+ "Offline time mismatch");
+ TEST_ASSERT(crs->time[RUNSTATE_offline + 1] == 0xa5a5a5a5a5a5a5a5ULL,
+ "Structure overrun");
+ TEST_ASSERT(crs->state_entry_time == crs->time[0] +
+ crs->time[1] + crs->time[2] + crs->time[3],
+ "runstate times don't add up");
+
+
+ /* Now switch to 64-bit mode */
+ lm.u.long_mode = 1;
+ vm_ioctl(vm, KVM_XEN_HVM_SET_ATTR, &lm);
+
+ memset(rs, 0xa5, sizeof(*rs));
+
+ /* Don't change the address, just trigger a write */
+ struct kvm_xen_vcpu_attr adj = {
+ .type = KVM_XEN_VCPU_ATTR_TYPE_RUNSTATE_ADJUST,
+ .u.runstate.state = (uint64_t)-1
+ };
+ vcpu_ioctl(vcpu, KVM_XEN_VCPU_SET_ATTR, &adj);
+
+ if (verbose)
+ printf("64-bit runstate at %08lx\n", runstate_addr);
+
+ TEST_ASSERT(rs->state == rst.u.runstate.state, "Runstate mismatch");
+ TEST_ASSERT(rs->state_entry_time == rst.u.runstate.state_entry_time,
+ "State entry time mismatch");
+ TEST_ASSERT(rs->time[RUNSTATE_running] == rst.u.runstate.time_running,
+ "Running time mismatch");
+ TEST_ASSERT(rs->time[RUNSTATE_runnable] == rst.u.runstate.time_runnable,
+ "Runnable time mismatch");
+ TEST_ASSERT(rs->time[RUNSTATE_blocked] == rst.u.runstate.time_blocked,
+ "Blocked time mismatch");
+ TEST_ASSERT(rs->time[RUNSTATE_offline] == rst.u.runstate.time_offline,
+ "Offline time mismatch");
+ TEST_ASSERT(rs->time[RUNSTATE_offline + 1] == 0xa5a5a5a5a5a5a5a5ULL,
+ "Structure overrun");
+
+ TEST_ASSERT(rs->state_entry_time == rs->time[0] +
+ rs->time[1] + rs->time[2] + rs->time[3],
+ "runstate times don't add up");
+ }
+ }
+
+ kvm_vm_free(vm);
+ return 0;
+}
diff --git a/tools/testing/selftests/kvm/x86_64/xen_vmcall_test.c b/tools/testing/selftests/kvm/x86_64/xen_vmcall_test.c
new file mode 100644
index 000000000000..e149d0574961
--- /dev/null
+++ b/tools/testing/selftests/kvm/x86_64/xen_vmcall_test.c
@@ -0,0 +1,142 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * xen_vmcall_test
+ *
+ * Copyright © 2020 Amazon.com, Inc. or its affiliates.
+ *
+ * Userspace hypercall testing
+ */
+
+#include "test_util.h"
+#include "kvm_util.h"
+#include "processor.h"
+
+#define HCALL_REGION_GPA 0xc0000000ULL
+#define HCALL_REGION_SLOT 10
+
+#define INPUTVALUE 17
+#define ARGVALUE(x) (0xdeadbeef5a5a0000UL + x)
+#define RETVALUE 0xcafef00dfbfbffffUL
+
+#define XEN_HYPERCALL_MSR 0x40000200
+#define HV_GUEST_OS_ID_MSR 0x40000000
+#define HV_HYPERCALL_MSR 0x40000001
+
+#define HVCALL_SIGNAL_EVENT 0x005d
+#define HV_STATUS_INVALID_ALIGNMENT 4
+
+static void guest_code(void)
+{
+ unsigned long rax = INPUTVALUE;
+ unsigned long rdi = ARGVALUE(1);
+ unsigned long rsi = ARGVALUE(2);
+ unsigned long rdx = ARGVALUE(3);
+ unsigned long rcx;
+ register unsigned long r10 __asm__("r10") = ARGVALUE(4);
+ register unsigned long r8 __asm__("r8") = ARGVALUE(5);
+ register unsigned long r9 __asm__("r9") = ARGVALUE(6);
+
+ /* First a direct invocation of 'vmcall' */
+ __asm__ __volatile__("vmcall" :
+ "=a"(rax) :
+ "a"(rax), "D"(rdi), "S"(rsi), "d"(rdx),
+ "r"(r10), "r"(r8), "r"(r9));
+ GUEST_ASSERT(rax == RETVALUE);
+
+ /* Fill in the Xen hypercall page */
+ __asm__ __volatile__("wrmsr" : : "c" (XEN_HYPERCALL_MSR),
+ "a" (HCALL_REGION_GPA & 0xffffffff),
+ "d" (HCALL_REGION_GPA >> 32));
+
+ /* Set Hyper-V Guest OS ID */
+ __asm__ __volatile__("wrmsr" : : "c" (HV_GUEST_OS_ID_MSR),
+ "a" (0x5a), "d" (0));
+
+ /* Hyper-V hypercall page */
+ u64 msrval = HCALL_REGION_GPA + PAGE_SIZE + 1;
+ __asm__ __volatile__("wrmsr" : : "c" (HV_HYPERCALL_MSR),
+ "a" (msrval & 0xffffffff),
+ "d" (msrval >> 32));
+
+ /* Invoke a Xen hypercall */
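+	/* Each hypercall gets a 32-byte stub in the page, hence nr * 32. */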
+ __asm__ __volatile__("call *%1" : "=a"(rax) :
+ "r"(HCALL_REGION_GPA + INPUTVALUE * 32),
+ "a"(rax), "D"(rdi), "S"(rsi), "d"(rdx),
+ "r"(r10), "r"(r8), "r"(r9));
+ GUEST_ASSERT(rax == RETVALUE);
+
+ /* Invoke a Hyper-V hypercall */
+ rax = 0;
+ rcx = HVCALL_SIGNAL_EVENT; /* code */
+ rdx = 0x5a5a5a5a; /* ingpa (badly aligned) */
+ __asm__ __volatile__("call *%1" : "=a"(rax) :
+ "r"(HCALL_REGION_GPA + PAGE_SIZE),
+ "a"(rax), "c"(rcx), "d"(rdx),
+ "r"(r8));
+ GUEST_ASSERT(rax == HV_STATUS_INVALID_ALIGNMENT);
+
+ GUEST_DONE();
+}
+
+int main(int argc, char *argv[])
+{
+ unsigned int xen_caps;
+ struct kvm_vcpu *vcpu;
+ struct kvm_vm *vm;
+
+ xen_caps = kvm_check_cap(KVM_CAP_XEN_HVM);
+ TEST_REQUIRE(xen_caps & KVM_XEN_HVM_CONFIG_INTERCEPT_HCALL);
+
+ vm = vm_create_with_one_vcpu(&vcpu, guest_code);
+ vcpu_set_hv_cpuid(vcpu);
+
+ struct kvm_xen_hvm_config hvmc = {
+ .flags = KVM_XEN_HVM_CONFIG_INTERCEPT_HCALL,
+ .msr = XEN_HYPERCALL_MSR,
+ };
+ vm_ioctl(vm, KVM_XEN_HVM_CONFIG, &hvmc);
+
+ /* Map a region for the hypercall pages */
+ vm_userspace_mem_region_add(vm, VM_MEM_SRC_ANONYMOUS,
+ HCALL_REGION_GPA, HCALL_REGION_SLOT, 2, 0);
+ virt_map(vm, HCALL_REGION_GPA, HCALL_REGION_GPA, 2);
+
+ for (;;) {
+ volatile struct kvm_run *run = vcpu->run;
+ struct ucall uc;
+
+ vcpu_run(vcpu);
+
+ if (run->exit_reason == KVM_EXIT_XEN) {
+ TEST_ASSERT_EQ(run->xen.type, KVM_EXIT_XEN_HCALL);
+ TEST_ASSERT_EQ(run->xen.u.hcall.cpl, 0);
+ TEST_ASSERT_EQ(run->xen.u.hcall.longmode, 1);
+ TEST_ASSERT_EQ(run->xen.u.hcall.input, INPUTVALUE);
+ TEST_ASSERT_EQ(run->xen.u.hcall.params[0], ARGVALUE(1));
+ TEST_ASSERT_EQ(run->xen.u.hcall.params[1], ARGVALUE(2));
+ TEST_ASSERT_EQ(run->xen.u.hcall.params[2], ARGVALUE(3));
+ TEST_ASSERT_EQ(run->xen.u.hcall.params[3], ARGVALUE(4));
+ TEST_ASSERT_EQ(run->xen.u.hcall.params[4], ARGVALUE(5));
+ TEST_ASSERT_EQ(run->xen.u.hcall.params[5], ARGVALUE(6));
+ run->xen.u.hcall.result = RETVALUE;
+ continue;
+ }
+
+ TEST_ASSERT_KVM_EXIT_REASON(vcpu, KVM_EXIT_IO);
+
+ switch (get_ucall(vcpu, &uc)) {
+ case UCALL_ABORT:
+ REPORT_GUEST_ASSERT(uc);
+ /* NOT REACHED */
+ case UCALL_SYNC:
+ break;
+ case UCALL_DONE:
+ goto done;
+ default:
+ TEST_FAIL("Unknown ucall 0x%lx.", uc.cmd);
+ }
+ }
+done:
+ kvm_vm_free(vm);
+ return 0;
+}
diff --git a/tools/testing/selftests/kvm/x86_64/xss_msr_test.c b/tools/testing/selftests/kvm/x86_64/xss_msr_test.c
index 3529376747c2..167c97abff1b 100644
--- a/tools/testing/selftests/kvm/x86_64/xss_msr_test.c
+++ b/tools/testing/selftests/kvm/x86_64/xss_msr_test.c
@@ -12,64 +12,44 @@
#include "kvm_util.h"
#include "vmx.h"
-#define VCPU_ID 1
#define MSR_BITS 64
-#define X86_FEATURE_XSAVES (1<<3)
-
-bool is_supported_msr(u32 msr_index)
-{
- struct kvm_msr_list *list;
- bool found = false;
- int i;
-
- list = kvm_get_msr_index_list();
- for (i = 0; i < list->nmsrs; ++i) {
- if (list->indices[i] == msr_index) {
- found = true;
- break;
- }
- }
-
- free(list);
- return found;
-}
-
int main(int argc, char *argv[])
{
- struct kvm_cpuid_entry2 *entry;
- bool xss_supported = false;
+ bool xss_in_msr_list;
struct kvm_vm *vm;
+ struct kvm_vcpu *vcpu;
uint64_t xss_val;
int i, r;
/* Create VM */
- vm = vm_create_default(VCPU_ID, 0, 0);
+ vm = vm_create_with_one_vcpu(&vcpu, NULL);
- if (kvm_get_cpuid_max_basic() >= 0xd) {
- entry = kvm_get_supported_cpuid_index(0xd, 1);
- xss_supported = entry && !!(entry->eax & X86_FEATURE_XSAVES);
- }
- if (!xss_supported) {
- print_skip("IA32_XSS is not supported by the vCPU");
- exit(KSFT_SKIP);
- }
+ TEST_REQUIRE(kvm_cpu_has(X86_FEATURE_XSAVES));
- xss_val = vcpu_get_msr(vm, VCPU_ID, MSR_IA32_XSS);
+ xss_val = vcpu_get_msr(vcpu, MSR_IA32_XSS);
TEST_ASSERT(xss_val == 0,
- "MSR_IA32_XSS should be initialized to zero\n");
+ "MSR_IA32_XSS should be initialized to zero");
+
+ vcpu_set_msr(vcpu, MSR_IA32_XSS, xss_val);
- vcpu_set_msr(vm, VCPU_ID, MSR_IA32_XSS, xss_val);
/*
* At present, KVM only supports a guest IA32_XSS value of 0. Verify
* that trying to set the guest IA32_XSS to an unsupported value fails.
* Also, in the future when a non-zero value succeeds check that
- * IA32_XSS is in the KVM_GET_MSR_INDEX_LIST.
+ * IA32_XSS is in the list of MSRs to save/restore.
*/
+ xss_in_msr_list = kvm_msr_is_in_save_restore_list(MSR_IA32_XSS);
for (i = 0; i < MSR_BITS; ++i) {
- r = _vcpu_set_msr(vm, VCPU_ID, MSR_IA32_XSS, 1ull << i);
- TEST_ASSERT(r == 0 || is_supported_msr(MSR_IA32_XSS),
- "IA32_XSS was able to be set, but was not found in KVM_GET_MSR_INDEX_LIST.\n");
+ r = _vcpu_set_msr(vcpu, MSR_IA32_XSS, 1ull << i);
+
+ /*
+ * Setting a list of MSRs returns the entry that "faulted", or
+ * the last entry +1 if all MSRs were successfully written.
+ */
+ TEST_ASSERT(!r || r == 1, KVM_IOCTL_ERROR(KVM_SET_MSRS, r));
+ TEST_ASSERT(r != 1 || xss_in_msr_list,
+ "IA32_XSS was able to be set, but was not in save/restore list");
}
kvm_vm_free(vm);